Java - OnlyLady Spider(HttpClient 4.5 )
发布日期:2021-06-30 19:51:24
浏览次数:2
分类:技术文章
本文共 9120 字,大约阅读时间需要 30 分钟。
pom
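<project>
  <modelVersion>4.0.0</modelVersion>

  <groupId>peerslee</groupId>
  <artifactId>Onlylady</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>

  <name>Onlylady</name>
  <url>http://maven.apache.org</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>

  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>3.8.1</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpclient</artifactId>
      <version>4.5</version>
    </dependency>
    <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.10.2</version>
    </dependency>
    <dependency>
      <groupId>org.mongodb</groupId>
      <artifactId>mongo-java-driver</artifactId>
      <version>3.4.2</version>
    </dependency>
  </dependencies>
</project>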
Spider
package peerslee.Onlylady;

import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Crawler for OnlyLady pages, built on Apache HttpClient 4.5 and jsoup.
 *
 * NOTE(review): only the beginning of this class survives in this copy of the
 * article; everything after {@link #get_doc(String)} is missing.
 */
public class Spider {
    /** Upper bound on fetch attempts per URL, so a dead URL cannot retry forever. */
    private static final int MAX_FETCH_ATTEMPTS = 10;

    private static HttpClient client = HttpClients.createDefault();
    private static MongoUtil util = new MongoUtil();

    Pattern pattern = null;
    Matcher matcher = null;
    // Fetch at most 500 comment pages per post; keep the total under ~3000
    // comments, otherwise a single inserted document overflows the size limit.
    Integer Max_Value = 500;

    /**
     * Downloads {@code url} and parses the response body into a jsoup document.
     *
     * A failed attempt (timeout, I/O error, missing entity, ...) prints the
     * same marker line as before and is retried, up to
     * {@code MAX_FETCH_ATTEMPTS} times. The retry is a loop rather than
     * recursion, so a permanently failing URL no longer ends in a
     * {@link StackOverflowError}.
     *
     * @param url address of the page to fetch
     * @return the parsed page
     * @throws IllegalStateException if every attempt failed; the last failure
     *         is attached as the cause
     */
    Document get_doc(String url) {
        RequestConfig requestConfig = RequestConfig.custom()
                .setConnectTimeout(5000)           // connect timeout, in milliseconds
                .setConnectionRequestTimeout(5000) // timeout for leasing a connection from the connection manager, in milliseconds
                .setSocketTimeout(5000)            // timeout while waiting for response data, in milliseconds
                .build();

        Exception lastFailure = null;
        for (int attempt = 1; attempt <= MAX_FETCH_ATTEMPTS; attempt++) {
            HttpGet get = new HttpGet(url);
            get.setConfig(requestConfig);
            try {
                HttpResponse response = client.execute(get);
                HttpEntity entity = response.getEntity();
                // NOTE(review): with no charset in the Content-Type header,
                // EntityUtils falls back to ISO-8859-1 - confirm this matches
                // the encoding the target site actually serves.
                String html = EntityUtils.toString(entity);
                return Jsoup.parse(html);
            } catch (Exception e) {
                lastFailure = e;
                System.out.println("------超时------");
            } finally {
                // Hand the connection back to the pool even when reading failed.
                get.releaseConnection();
            }
        }
        throw new IllegalStateException(
                "Failed to fetch " + url + " after " + MAX_FETCH_ATTEMPTS + " attempts", lastFailure);
    }

    // Comments: List...
    // NOTE(review): the source text breaks off here; the rest of the class
    // (comment extraction onwards) and its closing brace are missing.
util
package peerslee.Onlylady;import com.mongodb.MongoClient;import com.mongodb.client.MongoCollection;import com.mongodb.client.MongoDatabase;import org.bson.Document;import java.util.ArrayList;import java.util.List;import java.util.Map;/** * Created by PeersLee on 2017/1/30. */public class MongoUtil { private MongoClient client = null; private String dbName = null; public MongoUtil() { dbName = "Onlylady"; this.client = new MongoClient("127.0.0.1", 27017); } //插入(去重复) public void insertCol(String colName, Mapmsg) { try { MongoDatabase db = client.getDatabase(dbName); MongoCollection col = db.getCollection(colName); Document doc = new Document(); for(Map.Entry entry : msg.entrySet()) { doc.append(entry.getKey(), entry.getValue()); } List docs = new ArrayList (); docs.add(doc); col.insertMany(docs); System.out.println("Doc insert" + colName + " ok..."); } catch (Exception e) { System.out.println(e.getClass().getName() + ":" + e.getMessage()); } }}
代理
// Configure a proxy: requests made through this client are routed via ip:port.
HttpHost proxy = new HttpHost(ip, port);
DefaultProxyRoutePlanner routePlanner = new DefaultProxyRoutePlanner(proxy);

// Build the client once through HttpClientBuilder. The snippet previously
// created a default client first and then replaced it, leaving that first
// instance unused and never closed.
HttpClient client = HttpClients.custom()
        .setRoutePlanner(routePlanner)
        .setConnectionTimeToLive(2, TimeUnit.SECONDS)
        .build();
转载地址:https://lipenglin.blog.csdn.net/article/details/78222090 如侵犯您的版权,请留言回复原文章的地址,我们会给您删除此文章,给您带来不便请您谅解!
发表评论
最新留言
能坚持,总会有不一样的收获!
[***.219.124.196]2024年05月03日 23时54分55秒
关于作者
喝酒易醉,品茶养心,人生如梦,品茶悟道,何以解忧?唯有杜康!
-- 愿君每日到此一游!
推荐文章
论DEV-C++怎样才能做窗口
2019-04-30
Failed to connect to github.com port 443: Operation timed out和弹出无法打开"GoogleSoftwareUpdate.bundle"
2019-04-30
2021.5.19 JS高级第二天
2019-04-30
2021.5.20 JS高级第三天
2019-04-30
2021.5.21 Jquery
2019-04-30
2021.5.22 Jquery
2019-04-30
2021.5.25 JSON
2019-04-30
2021.5.25 Flex
2019-04-30
2021.5.28 AJAX
2019-04-30
正则表达式
2019-04-30
过滤器Filter
2019-04-30
2021.6.1 Array补充
2019-04-30
【收藏】Markdown笔记
2019-04-30
离散数学-图论
2019-04-30
啊沙发沙发
2019-04-30
2021-07-02
2019-04-30
「收藏」HTML笔记-持续更新中
2019-04-30
数据结构期末复习------排序汇总
2019-04-30
数据结构期末复习------查找汇总
2019-04-30
青云QingMR集群创建、HDFS节点与Client节点管理教程
2019-04-30