使用httpClient和httpParser获取指定网址的title_JAVA_编程开发_程序员俱乐部

中国优秀的程序员网站程序员频道CXYCLUB技术地图
热搜:
更多>>
 
您所在的位置: 程序员俱乐部 > 编程开发 > JAVA > 使用httpClient和httpParser获取指定网址的title

使用HttpClient和HtmlParser获取指定网址的title

 2013/10/23 18:55:40  ldz0414  程序员俱乐部  我要评论(0)
  • 摘要:package com.xinhuanet.cloudDesk.controller; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.httpclient.HttpClient; import org.apache.commons.httpclient.HttpConnectionManager; import org.apache.commons.httpclient…
  • 标签:网址 使用 client HTTP
class="java" name="code">
package com.xinhuanet.cloudDesk.controller;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpConnectionManager;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
import org.htmlparser.Parser;
import org.htmlparser.visitors.HtmlPage;

/**
 * Small command-line demo: downloads one hard-coded URL through a hard-coded
 * HTTP proxy using Commons HttpClient, works out the page's character set,
 * prints the decoded HTML and finally prints the page title extracted with
 * HtmlParser.
 */
public class R {
	/**
	 * Socket and connection timeout, in milliseconds. The value is kept from
	 * the original code; NOTE(review): this is roughly 5.5 hours, 20000
	 * (20 s) may have been intended - confirm before changing.
	 */
	private static final int TIMEOUT_MILLIS = 20000000;

	/**
	 * Charset used only to turn the raw bytes into text for charset sniffing.
	 * ISO-8859-1 maps every byte to a character, so the ASCII
	 * {@code <meta ... charset=...>} declaration survives regardless of the
	 * page's real encoding or the platform default charset.
	 */
	private static final String SNIFF_CHARSET = "ISO-8859-1";

	/**
	 * Matches a charset declaration inside a single {@code <meta>} tag.
	 * Case-insensitive because HTML tag and attribute names are; {@code [^>]}
	 * keeps the match from running past the end of the tag.
	 */
	private static final Pattern META_CHARSET = Pattern.compile(
			"<meta[^>]+?charset=[^\\w]?([-\\w]+)", Pattern.CASE_INSENSITIVE);

	/**
	 * Fetches the page, prints the detected charset, the decoded body and the
	 * page title to standard output.
	 *
	 * @param args ignored
	 * @throws Exception on any network, decoding or parsing failure
	 */
	public static void main(String[] args) throws Exception {

		HttpClient httpClient = new HttpClient();
		httpClient.getHostConfiguration().setProxy("202.84.17.41", 8080);

		HttpConnectionManager httpConnManager = httpClient
				.getHttpConnectionManager();

		if (httpConnManager != null) {
			HttpConnectionManagerParams mgrParams = new HttpConnectionManagerParams();
			mgrParams.setSoTimeout(TIMEOUT_MILLIS);
			mgrParams.setTcpNoDelay(true);
			mgrParams.setConnectionTimeout(TIMEOUT_MILLIS);
			mgrParams.setLinger(0);
			mgrParams.setStaleCheckingEnabled(false);
			httpConnManager.setParams(mgrParams);
		}

		String url = "http://www.poetry4cn.com";
		GetMethod methodGet = new GetMethod(url);
		String charset;
		String responseGet;
		try {
			httpClient.executeMethod(methodGet);

			// Read the body once; a response without a body is treated as empty.
			byte[] body = methodGet.getResponseBody();
			if (body == null) {
				body = new byte[0];
			}

			charset = getCharSet(new String(body, SNIFF_CHARSET));
			if (charset == null) {
				// No <meta> declaration in the page: fall back to the charset
				// HttpClient derives from the Content-Type response header,
				// instead of passing null to new String(...) (which throws).
				charset = methodGet.getResponseCharSet();
			}
			System.out.println("getCharSet:" + charset);
			responseGet = new String(body, charset);
		} finally {
			// Always hand the connection back to the connection manager.
			methodGet.releaseConnection();
		}
		System.out.println(responseGet);

		Parser myParser = Parser.createParser(responseGet, charset);
		HtmlPage visitor = new HtmlPage(myParser);
		myParser.visitAllNodesWith(visitor);
		String textInPage = visitor.getTitle();
		System.out.println("title:" + textInPage);

	}

	/**
	 * Extracts the charset name declared in an HTML {@code <meta>} tag, e.g.
	 * {@code <meta charset="utf-8">} or
	 * {@code <meta http-equiv="Content-Type" content="text/html; charset=gb2312">}.
	 *
	 * @param content HTML text to search; may be {@code null}
	 * @return the declared charset name exactly as written in the page, or
	 *         {@code null} if {@code content} is {@code null} or contains no
	 *         such declaration
	 */
	public static String getCharSet(String content) {
		if (content == null) {
			return null;
		}
		Matcher matcher = META_CHARSET.matcher(content);
		if (matcher.find()) {
			return matcher.group(1);
		}
		return null;
	}

}
发表评论
用户名: 匿名