search-demo托管于github
?
search-demo演示了如何利用Java来调用百度搜索和谷歌搜索,更多细节请到github上查看search-demo
?
自己没搜索引擎,又想要大规模的数据源,怎么办?可以对百度搜索和谷歌搜索善加利用,以小搏大,站在巨人的肩膀上。有很多的应用场景可以很巧妙地借助百度搜索和谷歌搜索来实现,比如网站的新闻采集,比如技术、品牌的新闻跟踪,比如知识库的收集,比如人机问答系统等,我之前做的一个准确率达百分之九十几的人机问答系统的数据源,其中一部分就是充分利用了百度搜索和谷歌搜索。在此演示的技术的基础上,可以容易地扩展到其他的搜索引擎,可以借鉴使用的NekoHTML+XPath技术,轻松获取页面的自定义的内容。
class="java" name="code">package org.apdplat.demo.search;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.cyberneko.html.parsers.DOMParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import com.sun.org.apache.xpath.internal.XPathAPI;
import javax.xml.transform.TransformerException;
import org.w3c.dom.DOMException;
import org.xml.sax.SAXException;
public class BaiduSearcher {
private static final Logger LOG = LoggerFactory.getLogger(BaiduSearcher.class);
public static List<String> parse(String url, String xpathExpression) {
InputStream in = null;
try {
in = new URL(url).openStream();
return parse(in, xpathExpression);
} catch (Exception e) {
LOG.error("错误", e);
} finally {
if (in != null) {
try {
in.close();
} catch (IOException e) {
LOG.error("错误", e);
}
}
}
return null;
}
public static List<String> parse(InputStream in, String xpathExpression) {
return parse(in, xpathExpression, "UTF-8");
}
public static List<Map<String, String>> parseMore(InputStream in, String xpathExpression) {
return parseMore(in, xpathExpression, "UTF-8");
}
public static List<Map<String, String>> parseMore(InputStream in, String xpathExpression, String encoding) {
DOMParser parser = new DOMParser();
List<Map<String, String>> list = new ArrayList<>();
try {
// 设置网页的默认编码
parser.setProperty(
"http://cyberneko.org/html/properties/default-encoding",
encoding);
/*
* The Xerces HTML DOM implementation does not support namespaces
* and cannot represent XHTML documents with namespace information.
* Therefore, in order to use the default HTML DOM implementation
* with NekoHTML's DOMParser to parse XHTML documents, you must turn
* off namespace processing.
*/
parser.setFeature("http://xml.org/sax/features/namespaces", false);
parser.parse(new InputSource(new BufferedReader(new InputStreamReader(in, encoding))));
Document doc = parser.getDocument();
NodeList products = XPathAPI.selectNodeList(doc, xpathExpression.toUpperCase());
for (int i = 0; i < products.getLength(); i++) {
Node node = products.item(i);
String title = node.getTextContent();
Map<String, String> map = new HashMap<>();
map.put("title", title);
try {
String href = node.getAttributes().getNamedItem("href").getTextContent();
map.put("href", href);
} catch (Exception e) {
LOG.error("提取链接失败",e);
}
list.add(map);
}
} catch (SAXException | IOException | TransformerException | DOMException e) {
LOG.error("错误", e);
}
return list;
}
public static List<String> parse(InputStream in, String xpathExpression, String encoding) {
DOMParser parser = new DOMParser();
List<String> list = new ArrayList<>();
try {
// 设置网页的默认编码
parser.setProperty(
"http://cyberneko.org/html/properties/default-encoding",
encoding);
/*
* The Xerces HTML DOM implementation does not support namespaces
* and cannot represent XHTML documents with namespace information.
* Therefore, in order to use the default HTML DOM implementation
* with NekoHTML's DOMParser to parse XHTML documents, you must turn
* off namespace processing.
*/
parser.setFeature("http://xml.org/sax/features/namespaces", false);
parser.parse(new InputSource(new BufferedReader(new InputStreamReader(in, encoding))));
Document doc = parser.getDocument();
NodeList products = XPathAPI.selectNodeList(doc, xpathExpression.toUpperCase());
for (int i = 0; i < products.getLength(); i++) {
Node node = products.item(i);
list.add(node.getTextContent());
}
} catch (SAXException | IOException | TransformerException | DOMException e) {
LOG.error("错误", e);
}
return list;
}
public static List<Webpage> searchBaidu(String url) {
InputStream in = null;
try {
in = new URL(url).openStream();
return searchBaidu(in);
} catch (Exception e) {
LOG.error("错误", e);
} finally {
if (in != null) {
try {
in.close();
} catch (IOException e) {
LOG.error("错误", e);
}
}
}
return null;
}
public static List<Webpage> searchBaidu(InputStream in) {
//保证只读一次
byte[] datas = Tools.readAll(in);
if (LOG.isDebugEnabled()) {
try {
LOG.debug("内容:" + new String(datas, "UTF-8"));
} catch (UnsupportedEncodingException e) {
LOG.error("错误", e);
}
}
in = new ByteArrayInputStream(datas);
String totalXpathExpression = "//html/body/div/div/div/div[3]/p/span";
List<String> totals = parse(in, totalXpathExpression);
int total;
int len = 10;
if (totals != null && totals.size() == 1) {
String str = totals.get(0);
int start = 10;
if (str.indexOf("约") != -1) {
start = 11;
}
total = Integer.parseInt(str.substring(start).replace(",", "").replace("个", ""));
LOG.info("搜索结果数:" + total);
} else {
return null;
}
if (total < 1) {
return null;
}
if (total < 10) {
len = total;
}
List<Webpage> webpages = new ArrayList<>();
for (int i = 0; i < len; i++) {
String content = "";
String url = "";
String titleXpathExpression = "//html/body/div/div/div/div[3]/div[2]/table[" + (i + 1) + "]/tbody/tr/td/h3/a";
String contentXpathExpression = "//html/body/div/div/div/div[3]/div[2]/table[" + (i + 1) + "]/tbody/tr/td/div[1]";
LOG.debug("titleXpathExpression:" + titleXpathExpression);
LOG.debug("contentXpathExpression:" + contentXpathExpression);
//重新构造输入流
in = new ByteArrayInputStream(datas);
List<String> titles = parse(in, titleXpathExpression);
//重新构造输入流
in = new ByteArrayInputStream(datas);
List<Map<String, String>> titleWithHrefs = parseMore(in, titleXpathExpression);
for (Map<String, String> titleWithHref : titleWithHrefs) {
String title = titleWithHref.get("title");
String href = titleWithHref.get("href");
LOG.debug(title + " " + titleWithHref.get("href"));
if (href != null) {
content = Tools.getHTMLContent(href);
url = href;
} else {
LOG.info("页面正确提取失败");
}
}
//重新构造输入流
in = new ByteArrayInputStream(datas);
List<String> summaries = parse(in, contentXpathExpression);
//处理百度知道1
if (titles != null && titles.size() == 1 && (summaries == null || summaries.isEmpty())) {
//重新构造输入流
in = new ByteArrayInputStream(datas);
String baiduZhidao1XpathExpression = "//html/body/div/div/div/div[3]/div[2]/table[" + (i + 1) + "]/tbody/tr/td/font[2]/div/div/p[2]";
LOG.debug("baiduZhidao1XpathExpression:" + baiduZhidao1XpathExpression);
summaries = parse(in, baiduZhidao1XpathExpression);
}
//处理百度知道2
if (titles != null && titles.size() == 1 && (summaries == null || summaries.isEmpty())) {
//重新构造输入流
in = new ByteArrayInputStream(datas);
String baiduZhidao2XpathExpression = "//html/body/div/div/div/div[3]/div[2]/table[" + (i + 1) + "]/tbody/tr/td/font[2]";
LOG.debug("baiduZhidao2XpathExpression:" + baiduZhidao2XpathExpression);
summaries = parse(in, baiduZhidao2XpathExpression);
}
//处理百度文库
if (titles != null && titles.size() == 1 && (summaries == null || summaries.isEmpty())) {
//重新构造输入流
in = new ByteArrayInputStream(datas);
String baiduWenkuXpathExpression = "//html/body/div/div/div/div[3]/div[2]/table[" + (i + 1) + "]/tbody/tr/td/font[1]";
LOG.debug("baiduWenkuXpathExpression:" + baiduWenkuXpathExpression);
summaries = parse(in, baiduWenkuXpathExpression);
}
if (titles != null && titles.size() == 1 && summaries != null && summaries.size() == 1) {
Webpage webpage = new Webpage();
webpage.setTitle(titles.get(0));
webpage.setUrl(url);
webpage.setSummary(summaries.get(0));
webpage.setContent(content);
webpages.add(webpage);
} else {
LOG.error("获取搜索结果列表项出错:" + titles + " - " + summaries);
}
}
if (webpages.isEmpty()) {
return null;
}
return webpages;
}
public static void main(String[] args) {
String url = "http://www.baidu.com/s?pn=0&wd=杨尚川";
List<Webpage> webpages = searchBaidu(url);
if (webpages != null) {
int i = 1;
for (Webpage webpage : webpages) {
LOG.info("搜索结果 " + (i++) + " :");
LOG.info("标题:" + webpage.getTitle());
LOG.info("URL:" + webpage.getUrl());
LOG.info("摘要:" + webpage.getSummary());
LOG.info("正文:" + webpage.getContent());
LOG.info("");
}
} else {
LOG.error("没有搜索到结果");
}
}
}
?
?