1. This class parses the site's content. The key is the CSS selector, for example: "div#page>div#content>div#local>div#recommend>ul>li>a".
You can find the right path by inspecting the page's HTML structure with Firefox's Firebug add-on; different pages need different selector paths.
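As a minimal sketch of how such a selector maps onto nested markup (the HTML fragment below is made up for illustration, and the snippet assumes the same org.jsoup imports as the class that follows):
String demoHtml = "<div id=\"page\"><div id=\"content\"><div id=\"local\"><div id=\"recommend\">"
        + "<ul><li><a href=\"/news/1\">First item</a></li></ul></div></div></div></div>";
Document demoDoc = Jsoup.parse(demoHtml);
Elements demoLinks = demoDoc.select("div#page>div#content>div#local>div#recommend>ul>li>a");
System.out.println(demoLinks.first().text()); // prints "First item"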
class="java" name="code">
package zy.crawl.hupu;
import java.io.IOException;
import zy.crawl.common.*;
import java.util.ArrayList;
import java.util.List;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.params.ConnRoutePNames;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class CrawlHupu
{
private List<NewsInfo> newsList = new ArrayList<>();// stores the crawled news items
public String GetHtml(String url) // sets up the HTTP connection and returns the page source; essentially boilerplate usage
{
String html = null;
HttpClient httpClient = new DefaultHttpClient();
//set a proxy here if your network requires one
// HttpHost proxy = new HttpHost("10.68.120.11", 3128);
// httpClient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxy);
//configuration timeout
httpClient.getParams().setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, 20000);
HttpGet httpGet = new HttpGet(url);
try
{
HttpResponse httpResponse = httpClient.execute(httpGet);
int resStatu = httpResponse.getStatusLine().getStatusCode();
if(resStatu == HttpStatus.SC_OK)
{
HttpEntity entity = httpResponse.getEntity();
if(entity != null)
{
html = EntityUtils.toString(entity);
}
}
}
catch (Exception e)
{
System.out.println("Connect " + url+" error");
e.printStackTrace();
}
finally
{
httpClient.getConnectionManager().shutdown();
}
return html;
}
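// Note: DefaultHttpClient and the *PNames parameter classes used above are deprecated
// as of HttpClient 4.3. A minimal alternative sketch, assuming HttpClient 4.3+ is on the
// classpath (not necessarily the version used when this was written):
//
//   RequestConfig config = RequestConfig.custom().setConnectTimeout(20000).build();
//   CloseableHttpClient client = HttpClientBuilder.create()
//           .setDefaultRequestConfig(config)
//           .build();
//
// (needs org.apache.http.client.config.RequestConfig,
//  org.apache.http.impl.client.HttpClientBuilder and CloseableHttpClient)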
public void ParseHtmlForNewsList()
{
String html = GetHtml("http://qczx.qc1818.com/");
//for the hupu voice page, the first CSS class can be dropped for now so the space in the class attribute does not need special handling
//String cssQueryHupu = "div.content>div.row>div.column>div.row>div.column>div.uibox>div.uibox-con>ul.ui-list>li>a";
String cssQueryHupu = "div#mainbody>div.cjkx_mtsd>div.cjkx>ul.list_left>li>a";// selector for the title link of each news item
// String cssQueryHuxiu = "div.container-hx>div.row-fluid-wrap-hx>"
// + "div.center-container-hx>div.clearfix>div.center-ctr-wrap>div.center-ctr-box>div.article-list>div.article-box>div.article-box-ctt>h4>a";
//
// String cssQueryIteye = "div#page>div#content>div#local>div#recommend>ul>li>a";
if(html != null && !html.isEmpty()) // GetHtml may return null on failure
{
Document doc = Jsoup.parse(html,"http://qczx.qc1818.com/");
Elements linkElements = doc.select(cssQueryHupu);
/*
* Example element: <a class="button read" href="http://book.zongheng.com/showchapter/48552.html">Click to read</a>
* Testing showed that a class attribute containing a space can be handled by chaining two selects:
* Elements indexEs = doc.select(".button").select(".read"); this successfully grabbed all of that book's chapters and links (see the sketch after this class).
*/
//Elements linkElements = doc.select("div.hp-wrap").select("div.index-wrap>div.col-B>div.voice-main>div.public>div#J_public_item>ul>li>dl.item-bd>dt>span>a");
for(Element ele:linkElements)
{
NewsInfo newsTemp = new NewsInfo(ele.text(), ele.absUrl("href"));
ParseHtmlForNewsContent(newsTemp.getHtmlAddr(), newsTemp);
newsList.add(newsTemp);
//String href = ele.attr("abs:href"); // this also yields the absolute URL
//for test
System.out.println(newsTemp.getTitle()+" "+newsTemp.getHtmlAddr());
if(newsTemp.getImageAddrList() != null)
System.out.println(newsTemp.getImageAddrList().get(0));
System.out.println(newsTemp.getContent());
}//System.out.println(newsList.get(0).getContent());
}
}
public void ParseHtmlForNewsContent(String contentHtmlAddr, NewsInfo newsTemp)// fetches and parses the article body via the title link obtained above
{
String html = GetHtml(contentHtmlAddr);
String cssQueryphoto = "asdfas"; // placeholder selector that matches nothing; replace with a real image selector when needed
String cssQueryContent = //"div#pageMain>div.pageMainLeft>div.detailWrap>div.detailTitle"+
//+"div#pageMain>div.pageMainLeft>div.detailWrap>div.detailIntr"
"div#pageMain>div.pageMainLeft>div.detailWrap>div.detail";
//String cssQueryContent = "div.content>div.row>div.column>div#articlewrap.area";
// String cssQueryphoto = "div.hp-wrap>div.voice-main>div.voice-item>ul>li>div.voice-read-detailed>div.voice-photoVideo>"
// + "div.voice-photo>div.small-img>img";
if(html != null && !html.isEmpty())
{
Document doc = Jsoup.parse(html);
Elements contentElements = doc.select(cssQueryContent);
Elements imgElements = doc.select(cssQueryphoto);
for(Element ele:contentElements)
{
newsTemp.setContent(ele.html());
}
List<String> tempImgList = new ArrayList<>(); // create the list once, outside the loop, so all image URLs are kept
for(Element ele:imgElements)
{
tempImgList.add(ele.attr("src"));
}
if(!tempImgList.isEmpty())
newsTemp.setImageAddrList(tempImgList);
}
}
public static void main(String[] args)
{
CrawlHupu crawlHupu = new CrawlHupu();
crawlHupu.ParseHtmlForNewsList();
}
}
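As a small, self-contained sketch of the chained-select trick mentioned in the comments above (the HTML fragment here is made up for illustration; only the selector idea comes from the original note):
Document demo = Jsoup.parse("<a class=\"button read\" href=\"http://book.zongheng.com/showchapter/48552.html\">Click to read</a>");
Elements chapterLinks = demo.select(".button").select(".read"); // chaining two selects avoids the space problem
System.out.println(chapterLinks.size()); // 1
Jsoup also accepts the compound form doc.select(".button.read"), which matches elements carrying both classes.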
2. This is the class that holds the extracted information. Not much to explain.
package zy.crawl.common;
import java.util.List;
public class NewsInfo
{
private String title;
private String htmlAddr;
private String content;
private List<String> imageAddrList;
public NewsInfo(String title, String htmlAddr)
{
super();
this.title = title;
this.htmlAddr = htmlAddr;
}
public NewsInfo(String content, List<String> imageAddrList)
{
super();
this.content = content;
this.imageAddrList = imageAddrList;
}
public String getTitle()
{
return title;
}
public void setTitle(String title)
{
this.title = title;
}
public String getHtmlAddr()
{
return htmlAddr;
}
public void setHtmlAddr(String htmlAddr)
{
this.htmlAddr = htmlAddr;
}
public String getContent()
{
return content;
}
public void setContent(String content)
{
this.content = content;
}
public List<String> getImageAddrList()
{
return imageAddrList;
}
public void setImageAddrList(List<String> imageAddrList)
{
this.imageAddrList = imageAddrList;
}
}