基于Jsoup爬虫Demo
发布日期:2021-11-11 12:48:07 浏览次数:2 分类:技术文章

本文共 5316 字,大约阅读时间需要 17 分钟。

 今天写了一个简单的爬虫,跟大家分享一下;后续会再分享一些更复杂的爬虫。话不多说,直接上代码。如果有疑问,可以直接在评论区留言。

package com.analysis;import java.io.IOException;import java.util.ArrayList;import java.util.HashMap;import java.util.List;import java.util.Map;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;import com.dao.FriendLinkDao;public class SnatchSHUJUJU {	public static Document getDocument (String url){        try {       	 //5000是设置连接超时时间,单位ms            return Jsoup.connect(url).timeout(5000).get();        } catch (IOException e) {            e.printStackTrace();        }        return null;    }	public static List
getEveryOtherUrl(){ List
urlList = new ArrayList<>(); String host = "http://www.shujuju.cn"; String url = "http://www.shujuju.cn/navigation/navigationPage"; Document document = getDocument(url); Elements elements1 = document.select("[class=more fr]"); Elements elements2 = elements1.select("a[href]"); for(Element element : elements2){ String string = host+element.attr("href"); urlList.add(string); } return urlList; } public static List
getDetailUrl(List
list){ List
mapList = new ArrayList<>(); for(String url:list){ Document document = getDocument(url); Elements elements1 = document.select("[class=nav-sort-info]"); String channelName = elements1.get(0).select("h4").text(); System.out.println("channelName:"+channelName); Elements elements2 = elements1.select("[class=nav-sort-body clearfix]").select("a"); for(Element element : elements2){ Map
map = new HashMap<>(); String linkUrl = element.attr("href"); String name = element.text(); System.out.println("linkUrl:"+linkUrl); System.out.println("name:"+name); map.put("channelName", channelName); map.put("linkUrl", linkUrl); map.put("name", name); mapList.add(map); } } return mapList; } public static void main(String[] args) { List
list = getDetailUrl(getEveryOtherUrl()); FriendLinkDao friendDao = new FriendLinkDao(); for(Map map:list){ String channelName = map.get("channelName").toString(); Integer channelId = friendDao.getChannelId(channelName); if(channelId != -1){ System.out.println("channelId: " + channelId); map.put("channelId", channelId); map.put("stat", "1"); friendDao.insertFriendLink(map); }else { friendDao.insertChannelName(channelName, 1); channelId = friendDao.getChannelId(channelName); System.out.println("channelId: " + channelId); map.put("channelId", channelId); map.put("stat", "1"); friendDao.insertFriendLink(map); } } }}
package com.dao;import java.sql.Connection;import java.sql.PreparedStatement;import java.sql.ResultSet;import java.sql.SQLException;import java.sql.Timestamp;import java.text.SimpleDateFormat;import java.util.HashSet;import java.util.Date;import java.util.Map;import java.util.Set;import com.util.ConnectUtil;public class FriendLinkDao {	public Connection conn = ConnectUtil.getConn();	public Integer getChannelId(String channelName) {		Integer id = -1;		try {			String sql = "SELECT id FROM t_zsff_friend_link_channel WHERE channel_name = ?";			PreparedStatement ptmt = conn.prepareStatement(sql);			ptmt.setString(1, channelName);			ResultSet rs = ptmt.executeQuery();			while (rs.next()) {				id = rs.getInt("id");			}			return id;		} catch (SQLException e) {			// TODO Auto-generated catch block			e.printStackTrace();			return id; // 返回-1,数据库插入异常		}	}				public void insertChannelName(String channelName,Integer pid) {		String sql = "INSERT INTO t_zsff_friend_link_channel (channel_name, pid) VALUES (?, ?)";		try {			PreparedStatement ptmt = conn.prepareStatement(sql);			ptmt.setObject(1, channelName);			ptmt.setObject(2, pid);			ptmt.executeUpdate();		} catch (Exception e) {			e.printStackTrace();// TODO: handle exception		}	}			public void insertFriendLink(Map map) {		String sql = "INSERT INTO t_zsff_friend_link (name, channel_id, link_url, stat) VALUES (?, ?, ?, ?)";		try {			PreparedStatement ptmt = conn.prepareStatement(sql);			ptmt.setObject(1, map.get("name"));			ptmt.setObject(2, map.get("channelId"));			ptmt.setObject(3, map.get("linkUrl"));			ptmt.setObject(4, map.get("stat"));			ptmt.executeUpdate();		} catch (Exception e) {			e.printStackTrace();// TODO: handle exception		}	}}

package com.util;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;

/**
 * Hands out the JDBC connection to the MySQL {@code test} database.
 *
 * <p>The connection is cached in a static field and reused while it is still
 * open, so repeated calls do not leak connections. Access to the cached
 * reference is not synchronized.
 */
public class ConnectUtil {

    /** Most recently opened (or explicitly set) connection; may be null. */
    private static Connection conn;

    /**
     * Returns the cached connection, opening a new one when none is cached or
     * the cached one has been closed.
     *
     * @return the connection; if the driver cannot be loaded or connecting
     *         fails, the stack trace is printed and the cached reference is
     *         returned unchanged (possibly {@code null} or a closed connection)
     */
    public static Connection getConn() {
        try {
            // Reuse the existing connection instead of opening one per call.
            if (conn != null && !conn.isClosed()) {
                return conn;
            }
            // 1. Load the MySQL JDBC driver class.
            Class.forName("com.mysql.jdbc.Driver");
            // 2. Database location and the name of the database to access.
            // NOTE(review): 127.0.0.7 is a loopback address but unusual;
            // confirm it is not a typo for 127.0.0.1.
            String url = "jdbc:mysql://127.0.0.7:3306/test?characterEncoding=UTF-8";
            // 3. Database user name and password.
            String username = "root";
            String password = "root";
            // 4. Connect through the driver manager.
            conn = DriverManager.getConnection(url, username, password);
        } catch (ClassNotFoundException | SQLException e) {
            e.printStackTrace();
        }
        return conn;
    }

    /**
     * Replaces the cached connection.
     *
     * @param conn1 connection that later {@link #getConn()} calls return while
     *              it stays open
     */
    public void setConn(Connection conn1) {
        conn = conn1;
    }
}
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.13.1</version>
</dependency>

转载地址:https://blog.csdn.net/Carson073/article/details/83867414 如侵犯您的版权,请留言回复原文章的地址,我们会给您删除此文章,给您带来不便请您谅解!

上一篇:ZooKeeper如何判断子服务器是否正常
下一篇:SSM和SSH2的区别

发表评论

最新留言

很好
[***.229.124.182]2024年04月17日 11时54分26秒