java抓取页面_JAVA_编程开发_程序员俱乐部

中国优秀的程序员网站程序员频道CXYCLUB技术地图
热搜:
更多>>
 
您所在的位置: 程序员俱乐部 > 编程开发 > JAVA > java抓取页面

java抓取页面

 2014/4/4 18:49:54  策码奔腾  程序员俱乐部  我要评论(0)
  • 摘要:package load; import java.io.BufferedInputStream; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.FileWriter; import java.io.IOException; import java.io
  • 标签:Java
package load;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.Writer;
import java.net.MalformedURLException;
import java.net.URL;

import org.dom4j.Node;
import org.dom4j.NodeFilter;
import org.dom4j.io.OutputFormat;
import org.junit.Test;
import org.w3c.dom.NodeList;

/**
 * Small page-grabbing utility: downloads a web page, saves it to a local file and
 * detects the character encoding the page declares about itself.
 *
 * <p>The page URL and the output file are fixed; see {@link #PAGE_URL} and
 * {@link #OUTPUT_PATH}.
 */
public class loadPage {

    /** Page downloaded by {@link #writread(String)}. */
    private static final String PAGE_URL = "http://www.baidu.com/";

    /** File that {@link #writread(String)} saves the downloaded page to. */
    private static final String OUTPUT_PATH = "D:\\test.html";

    /** Encoding used for the first download, before the page's own declaration is known. */
    private static final String DEFAULT_CHARSET = "utf-8";

    /** Keyword that introduces an encoding declaration inside the page. */
    private static final String CHARSET_KEYWORD = "charset";

    /**
     * Downloads the page once as UTF-8, reads the charset the page declares and, when
     * that charset differs from UTF-8, downloads the page again with it.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        loadPage loader = new loadPage();
        String charset = loader.chaset(loader.writread(DEFAULT_CHARSET));
        // An empty result means nothing was declared; re-fetching with "" could only fail.
        // A utf-8 result means the first download was already decoded correctly.
        if (!charset.isEmpty() && !DEFAULT_CHARSET.equalsIgnoreCase(charset)) {
            loader.writread(charset);
        }
    }

    /**
     * Streams http://saite.com/ to standard output, character by character.
     *
     * <p>The bytes are decoded as UTF-8 instead of the platform default so the output
     * does not depend on the machine the program runs on.
     *
     * @throws IOException if the URL cannot be opened or read
     */
    public void read() throws IOException {
        URL url = new URL("http://saite.com/");
        try (BufferedInputStream in = new BufferedInputStream(url.openStream());
                Reader reader = new InputStreamReader(in, "UTF-8")) {
            int c;
            while ((c = reader.read()) != -1) {
                // Cast to char: printing the int itself would emit decimal code points.
                System.out.print((char) c);
            }
        }
    }

    /**
     * Downloads {@link #PAGE_URL}, decodes it with the given charset, saves it to
     * {@link #OUTPUT_PATH} in the same charset and returns the decoded text.
     *
     * <p>This is best effort: any failure is reported on standard output and the text
     * read so far (possibly empty) is returned instead of throwing.
     *
     * @param set name of the charset used both to decode the page and to encode the file
     * @return the page text that was read, never {@code null}
     */
    public String writread(String set) {
        StringBuilder page = new StringBuilder();
        try {
            URL url = new URL(PAGE_URL);
            // Resources are opened in this order on purpose: an unsupported charset fails
            // on the reader, before the output file is created or truncated.
            try (BufferedInputStream in = new BufferedInputStream(url.openStream());
                    Reader reader = new InputStreamReader(in, set);
                    FileOutputStream fileOut = new FileOutputStream(OUTPUT_PATH);
                    Writer writer = new OutputStreamWriter(fileOut, set)) {
                char[] chunk = new char[4096];
                int count;
                while ((count = reader.read(chunk)) != -1) {
                    writer.write(chunk, 0, count);
                    page.append(chunk, 0, count);
                }
            }
            System.out.println("抓取成功!已存到指定目录");
            System.out.println(page);
        } catch (IOException | RuntimeException e) {
            // Runtime exceptions are included so a null charset name is still reported
            // as a failed download rather than crashing the caller.
            System.out.println("抓取失败!");
            e.printStackTrace();
        }
        return page.toString();
    }

    /**
     * Extracts the charset name declared in a page, e.g. {@code gb2312} from
     * {@code content="text/html;charset = gb2312">}.
     *
     * <p>The search is case-insensitive and accepts whitespace around {@code =} as well
     * as a quoted value ({@code <meta charset="utf-8">}). An occurrence of the word
     * "charset" that is not followed by {@code =} and a name is skipped and the search
     * continues. The name found is also printed to standard output prefixed with
     * {@code ---}.
     *
     * @param string page text to search; may be {@code null}
     * @return the declared charset name without surrounding whitespace or quotes, or an
     *     empty string when the text declares none
     */
    public String chaset(String string) {
        if (string == null) {
            return "";
        }
        int length = string.length();
        int lastStart = length - CHARSET_KEYWORD.length();
        for (int i = 0; i <= lastStart; i++) {
            if (!string.regionMatches(true, i, CHARSET_KEYWORD, 0, CHARSET_KEYWORD.length())) {
                continue;
            }
            int pos = skipWhitespace(string, i + CHARSET_KEYWORD.length());
            if (pos >= length || string.charAt(pos) != '=') {
                continue;
            }
            pos = skipWhitespace(string, pos + 1);
            if (pos < length && (string.charAt(pos) == '"' || string.charAt(pos) == '\'')) {
                pos = skipWhitespace(string, pos + 1);
            }
            int end = pos;
            while (end < length && isCharsetNameChar(string.charAt(end))) {
                end++;
            }
            if (end > pos) {
                String name = string.substring(pos, end);
                System.out.println("---" + name);
                return name;
            }
        }
        return "";
    }

    /** Returns the index of the first non-whitespace character at or after {@code from}. */
    private static int skipWhitespace(String text, int from) {
        int pos = from;
        while (pos < text.length() && Character.isWhitespace(text.charAt(pos))) {
            pos++;
        }
        return pos;
    }

    /** Tells whether {@code c} may appear in a charset name (ASCII letters, digits, {@code - + . : _}). */
    private static boolean isCharsetNameChar(char c) {
        return (c >= 'a' && c <= 'z')
                || (c >= 'A' && c <= 'Z')
                || (c >= '0' && c <= '9')
                || c == '-' || c == '+' || c == '.' || c == ':' || c == '_';
    }

}
发表评论
用户名: 匿名