Java jsoup 网络爬虫学习例子（四）：抓取网页链接并插入 MySQL 数据库
?
// ----- GetLink.java -----
package com.iteye.injavawetrust.jsoup;

import java.io.IOException;
import java.util.Iterator;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Fetches a single web page and stores every link-like reference found on it
 * (anchors, images, select options and link tags) through {@link DBUtil}.
 *
 * @author InJavaWeTrust
 */
public class GetLink {

    private final JsoupUtil ju = JsoupUtil.getInstance();
    private final DBUtil du = DBUtil.getInstance();

    /**
     * Downloads the page at {@code url} and inserts one row per matching element,
     * in this order: {@code a[href]}, {@code img[src]}, {@code option[value]},
     * {@code link[href]}.
     *
     * <p>Attribute values are stored exactly as they appear in the page; relative
     * URLs are not resolved. An {@link IOException} raised while fetching the page
     * is printed to standard error and nothing is stored.
     *
     * @param url address of the page to crawl
     */
    public void getLink(String url) {
        try {
            // Give up if the server does not answer within 5000 ms.
            Document document = Jsoup.connect(url).timeout(5000).get();

            Elements anchors = document.select("a[href]");
            for (Element anchor : anchors) {
                save(anchor.text(), anchor.attr("href"));
            }

            // Images have no text content, so the alt attribute serves as the name.
            Elements images = document.select("img[src]");
            for (Element image : images) {
                save(image.attr("alt"), image.attr("src"));
            }

            Elements options = document.select("option[value]");
            for (Element option : options) {
                save(option.text(), option.attr("value"));
            }

            Elements headLinks = document.select("link[href]");
            for (Element headLink : headLinks) {
                save(headLink.text(), headLink.attr("href"));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Builds a fresh {@link Link} with a new id and inserts it.
     *
     * <p>A new object is created for every row so that no value can leak from one
     * row into the next.
     *
     * @param urlName display name of the link
     * @param url     target of the link
     */
    private void save(String urlName, String url) {
        Link link = new Link();
        link.setId(ju.getUUID());
        link.setUrlName(urlName);
        link.setUrl(url);
        du.insert(ju.getInsertSql(link));
    }

    /**
     * Crawls the sample page configured in {@link Constants#URL}.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        new GetLink().getLink(Constants.URL);
    }
}

// ----- Link.java -----
package com.iteye.injavawetrust.jsoup;

import java.io.Serializable;
import java.util.Date;

/**
 *
 * @author InJavaWeTrust
 *
 */
public class Link implements Serializable {

    private static final long serialVersionUID = 1165098694307553167L;

    /** Row identifier. */
    private String id;

    /** Display name of the link. */
    private String urlName;

    /** Target of the link. */
    private String url;

    /** Date the row was inserted into the database. */
    private Date date;

    public String getId() {
        return id;
    }

    public void setId(String id) {
        this.id = id;
    }

    public String getUrlName() {
        return urlName;
    }

    public void setUrlName(String urlName) {
        this.urlName = urlName;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public Date getDate() {
        return date;
    }

    public void setDate(Date date) {
        this.date = date;
    }
}

// ----- DBUtil.java -----
package com.iteye.injavawetrust.jsoup;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;

/**
 * Singleton helper that opens JDBC connections with the settings in
 * {@link Constants} and executes insert statements.
 *
 * <p>The helper keeps no connection state: every call works on its own
 * connection and statement.
 *
 * @author InJavaWeTrust
 */
public class DBUtil {

    private static final DBUtil instance = new DBUtil();

    private DBUtil() {
    }

    /**
     * @return the single shared instance
     */
    public static DBUtil getInstance() {
        return instance;
    }

    /**
     * Opens a new database connection.
     *
     * <p>Failures are printed to standard error. If the driver class cannot be
     * loaded, the connection is still attempted, because
     * {@link DriverManager} may already know a suitable driver.
     *
     * @return a new open connection, or {@code null} if it could not be opened
     */
    public Connection connection() {
        try {
            Class.forName(Constants.DRIVER);
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }
        try {
            return DriverManager.getConnection(Constants.DBURL, Constants.USER, Constants.PASSWORD);
        } catch (SQLException e) {
            e.printStackTrace();
            // Never hand back a connection left over from an earlier call.
            return null;
        }
    }

    /**
     * Closes the given resources in the order result set, statement, connection.
     *
     * <p>Each argument may be {@code null}. Closing is best effort: an error while
     * closing one resource is ignored so that the remaining ones are still closed.
     *
     * @param rs   result set to close, or {@code null}
     * @param st   statement to close, or {@code null}
     * @param conn connection to close, or {@code null}
     */
    public void release(ResultSet rs, Statement st, Connection conn) {
        if (rs != null) {
            try {
                rs.close();
            } catch (Exception ignored) {
                // best effort
            }
        }
        if (st != null) {
            try {
                st.close();
            } catch (Exception ignored) {
                // best effort
            }
        }
        if (conn != null) {
            try {
                conn.close();
            } catch (Exception ignored) {
                // best effort
            }
        }
    }

    /**
     * Executes one SQL statement on a fresh connection.
     *
     * <p>Errors are printed to standard error and not rethrown, so one failed row
     * does not abort the caller's loop. The connection and statement are always
     * closed, including when execution fails.
     *
     * @param sql complete SQL statement to execute
     */
    public void insert(String sql) {
        Connection connection = connection();
        if (connection == null) {
            // connection() has already reported the failure.
            return;
        }
        Statement statement = null;
        try {
            statement = connection.createStatement();
            statement.execute(sql);
        } catch (Exception e) {
            // Deliberately broad: a single bad row must not stop the crawl.
            e.printStackTrace();
        } finally {
            release(null, statement, connection);
        }
    }
}

// ----- Constants.java -----
package com.iteye.injavawetrust.jsoup;

/**
 * Configuration values used by the crawler example.
 *
 * @author InJavaWeTrust
 */
public class Constants {

    /** MySQL JDBC driver class name. */
    public static final String DRIVER = "com.mysql.jdbc.Driver";

    /** JDBC connection URL. */
    public static final String DBURL = "jdbc:mysql://localhost:3306/jsoupdb?useUnicode=true&characterEncoding=utf-8";

    /** Database user name. */
    public static final String USER = "root";

    /** Database password. NOTE(review): sample credentials; replace outside a local demo. */
    public static final String PASSWORD = "root";

    /** An arbitrarily chosen page used as the crawl target. */
    public static final String URL = "http://www.hrbhuade.net/html/main/index.htm";
}

// ----- JsoupUtil.java -----
package com.iteye.injavawetrust.jsoup;

import java.util.UUID;

/**
 * Singleton helper that generates row ids and builds insert statements for
 * {@link Link} objects.
 *
 * @author InJavaWeTrust
 */
public class JsoupUtil {

    private static final JsoupUtil instance = new JsoupUtil();

    private JsoupUtil() {
    }

    /**
     * @return the single shared instance
     */
    public static JsoupUtil getInstance() {
        return instance;
    }

    /**
     * Generates a random UUID with the dashes removed.
     *
     * @return 32-character UUID string
     */
    public String getUUID() {
        return UUID.randomUUID().toString().replace("-", "");
    }

    /**
     * Builds the insert statement for one link row.
     *
     * <p>The id, name and url are written as escaped SQL string literals, so that
     * quotes or backslashes in scraped text can neither break the statement nor
     * inject SQL. The date column is filled by the database with {@code NOW()}.
     *
     * @param link link whose id, name and url are stored
     * @return complete insert statement
     */
    public String getInsertSql(Link link) {
        return "insert into link (id, urlname, url, date) values ("
                + quote(link.getId()) + ","
                + quote(link.getUrlName()) + ","
                + quote(link.getUrl()) + ",NOW())";
    }

    /**
     * Renders a value as a MySQL string literal.
     *
     * <p>Backslashes are doubled first, then single quotes, so the backslashes
     * added by escaping are not escaped a second time.
     *
     * @param value text to quote, may be {@code null}
     * @return the quoted literal, or {@code NULL} when {@code value} is {@code null}
     */
    private static String quote(String value) {
        if (value == null) {
            return "NULL";
        }
        return "'" + value.replace("\\", "\\\\").replace("'", "''") + "'";
    }
}
?
link 表
?
-- Schema of the `link` table filled by the crawler.
-- WARNING: drops any existing `link` table together with its data.
DROP TABLE IF EXISTS `link`;

CREATE TABLE `link` (
  `id` varchar(32) NOT NULL,            -- 32-character UUID without dashes
  `urlname` varchar(200) DEFAULT NULL,  -- link text / alt text; NOTE(review): longer text may still not fit
  `url` varchar(2048) DEFAULT NULL,     -- widened from 200: real-world URLs are often longer than 200 characters
  `date` datetime DEFAULT NULL,         -- insertion time
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
?
?
?