使用 Apache HttpClient 工具模拟百度蜘蛛或浏览器抓取和解压gzip网页_JAVA_编程开发_程序员俱乐部

中国优秀的程序员网站程序员频道CXYCLUB技术地图
热搜:
更多>>
 
您所在的位置: 程序员俱乐部 > 编程开发 > JAVA > 使用 Apache HttpClient 工具模拟百度蜘蛛或浏览器抓取和解压gzip网页

使用 Apache HttpClient 工具模拟百度蜘蛛或浏览器抓取和解压gzip网页

 2012/5/10 10:42:04  ialy_2000  程序员俱乐部  我要评论(0)
  • 摘要:package httpclient; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.util.zip.GZIPInputStream; import org.apache.commons.httpclient.HttpClient; import org.apache.commons.httpclient.HttpException; import org…
  • 标签:使用 浏览 工具 client 百度 浏览器 Apache 网页 HTTP

package httpclient;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Locale;
import java.util.zip.GZIPInputStream;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.methods.GetMethod;

/**

?* 使用 Apache HttpClient 工具模拟百度蜘蛛或浏览器抓取和解压gzip网页

?* @author Ivan

?*

?*/

?

public class HttpClientTest {

?

/**

* @param args

* @throws IOException

* @throws HttpException

*/

public static void main(String[] args) throws HttpException, IOException {

?

HttpClient httpclient = new HttpClient();// 创建一个客户端,类似打开一个浏览器

// httpclient.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);

// httpclient.getParams().setParameter("http.protocol.single-cookie-header",true);

?

GetMethod getMethod = new GetMethod("http://www.iteye.com");//http://itindex.net

?

// google

// getMethod.setRequestHeader("Host", "laohuang.iteye.com");

// getMethod.setRequestHeader("Connection", "Keep-Alive");

// getMethod.setRequestHeader("Accept", "*/*");

// getMethod.setRequestHeader("From", "goolebot@googlebot.com");

// getMethod.setRequestHeader("User-Agent",

// "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)");

// getMethod.setRequestHeader("Accept-Encoding", "gzip, deflate");

?

// baidu

getMethod.setRequestHeader("Host", " www.iteye?.com");//itindex.net

getMethod.setRequestHeader("Connection", "Keep-Alive");

getMethod.setRequestHeader("Accept", "*/*");

getMethod.setRequestHeader("From", "goolebot@googlebot.com");

getMethod

.setRequestHeader(

"User-Agent",

"Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)");

getMethod.setRequestHeader("Accept-Encoding", "gzip");

?

?

int statusCode = httpclient.executeMethod(getMethod);

?

?

System.out.println(getMethod.getResponseCharSet());

System.out.println(getMethod.getResponseHeader("Content-Encoding"));

System.out.println(getBodyAsString(getMethod,getMethod.getResponseCharSet()));

?

?

?

}

?

public static String getBodyAsString(GetMethod getHC, String charset)

throws IOException {

String acceptEncoding = "";

if (getHC.getResponseHeader("Content-Encoding") != null)

acceptEncoding = getHC.getResponseHeader("Content-Encoding")

.getValue();

StringBuffer sb = new StringBuffer();

?

if (acceptEncoding.toLowerCase().indexOf("gzip") > -1) {

// 建立gzip解压工作流

InputStream is = getHC.getResponseBodyAsStream();

GZIPInputStream gzin = new GZIPInputStream(is);

InputStreamReader isr = new InputStreamReader(gzin, charset); // 设置读取流的编码格式,自定义编码

java.io.BufferedReader br = new java.io.BufferedReader(isr);

String tempbf;

while ((tempbf = br.readLine()) != null) {

sb.append(tempbf);

sb.append("\r\n");

}

isr.close();

gzin.close();

} else {

InputStreamReader isr = new InputStreamReader(getHC

.getResponseBodyAsStream(), charset); // 设置读取流的编码格式,自定义编码

java.io.BufferedReader br = new java.io.BufferedReader(isr);

String tempbf;

while ((tempbf = br.readLine()) != null) {

sb.append(tempbf);

sb.append("\r\n");

}

isr.close();

}

getHC.abort();

getHC.releaseConnection();

return sb.toString();

}

?

}

Via http://itindex.net

发表评论
用户名: 匿名