search-demo托管于GitHub。
?
search-demo演示了如何利用Java来调用百度搜索和谷歌搜索,更多细节请到GitHub上查看search-demo。
?
自己没有搜索引擎,又想要大规模的数据源,怎么办?可以对百度搜索和谷歌搜索善加利用,以小搏大,站在巨人的肩膀上。有很多应用场景可以很巧妙地借助百度搜索和谷歌搜索来实现,比如网站的新闻采集、技术和品牌的新闻跟踪、知识库的收集、人机问答系统等。我之前做过一个准确率达百分之九十几的人机问答系统,其数据源的一部分就充分利用了百度搜索和谷歌搜索。在此处演示的技术的基础上,可以很容易地扩展到其他搜索引擎;也可以借鉴这里所使用的NekoHTML+XPath技术,轻松获取页面中自定义的内容。
?
class="java">package org.apdplat.demo.search; import java.io.IOException; import java.io.InputStream; import java.io.UnsupportedEncodingException; import java.net.URLEncoder; import java.util.ArrayList; import java.util.List; import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler; import org.apache.commons.httpclient.HttpClient; import org.apache.commons.httpclient.HttpStatus; import org.apache.commons.httpclient.methods.GetMethod; import org.apache.commons.httpclient.params.HttpMethodParams; import org.json.JSONArray; import org.json.JSONException; import org.json.JSONObject; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class GoogleSearcher { private static final Logger LOG = LoggerFactory.getLogger(GoogleSearcher.class); public static List<Webpage> searchGoogle(String url) { List<Webpage> webpages = new ArrayList<>(); try { HttpClient httpClient = new HttpClient(); GetMethod getMethod = new GetMethod(url); httpClient.executeMethod(getMethod); getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER, new DefaultHttpMethodRetryHandler()); int statusCode = httpClient.executeMethod(getMethod); if (statusCode != HttpStatus.SC_OK) { LOG.error("搜索失败: " + getMethod.getStatusLine()); return null; } InputStream in = getMethod.getResponseBodyAsStream(); byte[] responseBody = Tools.readAll(in); String response = new String(responseBody, "UTF-8"); LOG.debug("搜索返回数据:" + response); JSONObject json = new JSONObject(response); String totalResult = json.getJSONObject("responseData").getJSONObject("cursor").getString("estimatedResultCount"); int totalResultCount = Integer.parseInt(totalResult); LOG.info("搜索返回记录数: " + totalResultCount); JSONArray results = json.getJSONObject("responseData").getJSONArray("results"); LOG.debug("搜索结果:"); for (int i = 0; i < results.length(); i++) { Webpage webpage = new Webpage(); JSONObject result = results.getJSONObject(i); //提取标题 String title = result.getString("titleNoFormatting"); LOG.debug("标题:" + title); 
webpage.setTitle(title); //提取摘要 String summary = result.get("content").toString(); summary = summary.replaceAll("<b>", ""); summary = summary.replaceAll("</b>", ""); summary = summary.replaceAll("\\.\\.\\.", ""); LOG.debug("摘要:" + summary); webpage.setSummary(summary); //从URL中提取正文 String _url = result.get("url").toString(); webpage.setUrl(_url); String content = Tools.getHTMLContent(_url); LOG.debug("正文:" + content); webpage.setContent(content); webpages.add(webpage); } } catch (IOException | JSONException | NumberFormatException e) { LOG.error("执行搜索失败:", e); } return webpages; } public static void main(String args[]) { String query = "杨尚川"; try { query = URLEncoder.encode(query, "UTF-8"); } catch (UnsupportedEncodingException e) { LOG.error("url构造失败", e); return; } String url = "http://ajax.googleapis.com/ajax/services/search/web?start=0&rsz=large&v=1.0&q=" + query; List<Webpage> webpages = searchGoogle(url); if (webpages != null) { int i = 1; for (Webpage webpage : webpages) { LOG.info("搜索结果 " + (i++) + " :"); LOG.info("标题:" + webpage.getTitle()); LOG.info("URL:" + webpage.getUrl()); LOG.info("摘要:" + webpage.getSummary()); LOG.info("正文:" + webpage.getContent()); LOG.info(""); } } else { LOG.error("没有搜索到结果"); } } }
?
?