一个朋友问我能不能帮他做个小程序。抓取58上面包含"维修"的数据,比如公司名称,电话号码等等
打开58,收索"维修"
单击 房屋维修,进入一个列表页面,
随便单击一个,进入详细页面
需要请求58服务器3次。然后匹配html元素获取自己需要的信息,数据匹配自然少不了正则表达式,用过的都知道,
对于我来说,写正则表达式是非常头疼的事情,所以可以选择第三方库:比如HtmlAgilityPack,Jumony等等,我这里选择的是Jumony
博客园有对Jumony入门的文章: http://www.cnblogs.com/Ivony/archive/2010/12/19/jumony-guide-1.html
jumony直接安装在项目中:
首先:选择需要添加的项目,单击引用,然后选择管理NuGet程序包,在必要的情况下,需要升级NuGet
其次:收索Jumony安装即可
先看看我实现的效果图:因为公司比较忙,只能晚上回家写写,问题也是非常多。所有先记录这两天实现的效果
左侧显示的是在首页匹配后的关键字。然后通过多线程月抓取每个列表页面的信息。
看看我主窗体的布局
显示数据的DatatGridView是动态创建的。
来看看核心代码:模拟请求58服务器,就要去观察58的请求与响应,可以通过Fiddler2和Firebug抓包观察
我根据我项目的需求封装了一个HttpWebHelper类,
1 /// <summary> 2 /// 封装Http类 3 /// </summary> 4 class HttpWebHelper 5 { 6 /// <summary> 7 /// 显示验证码页面容器 8 /// </summary> 9 public static WebBrowser webBrowser { get; set; } 10 11 /// <summary> 12 /// 验证码需要的唯一id 13 /// </summary> 14 public static string uuid { get; set; } 15 16 /// <summary> 17 /// 验证码是否通过 18 /// </summary> 19 public static bool isPass { get; set; } 20 21 /// <summary> 22 /// 首页关键字对应的url 23 /// </summary> 24 public static Dictionary<string, string> list; 25 26 /// <summary> 27 /// 抓取页面前缀 28 /// </summary> 29 public static string prefix; 30 /// <summary> 31 /// 显示验证码页面 32 /// </summary> 33 public string codeUrl { get; set; } 34 /// <summary> 35 /// 抓取首页 36 /// </summary> 37 public string dataUrl { get; set; } 38 /// <summary> 39 /// 验证码提交页面 40 /// </summary> 41 public static string verCode { get; set; } 42 43 /// <summary> 44 /// 页面请求方式 45 /// </summary> 46 public string Method { get; set; } 47 /// <summary> 48 /// RefererHTTP 表头值 49 /// </summary> 50 public string Referer { get; set; } 51 /// <summary> 52 /// 主机 53 /// </summary> 54 public string Host { get; set; } 55 /// <summary> 56 /// cookie 57 /// </summary> 58 public CookieContainer cookie { get; set; } 59 /// <summary> 60 /// 61 /// </summary> 62 public string Accept { get; set; } 63 public string UserAgent { get; set; } 64 public string ContentType { get; set; } 65 public string Accept_Language { get; set; } 66 public Encoding encoding { get; set; } 67 68 public Image PictureBox { get; set; } 69 70 71 72 73 public HttpWebHelper() 74 { 75 this.codeUrl = "http://support.58.com/firewall/valid/3071088800.do"; 76 //this.verCode = "http://support.58.com/firewall/valid/3071088800.do"; 77 78 Method = "post"; 79 Referer = "http://support.58.com/firewall/valid/3071088800.do"; 80 Host = "support.58.com"; 81 Accept_Language = "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3"; 82 Accept = "*/*"; 83 ContentType = "application/x-www-form-urlencoded; charset=UTF-8"; 84 UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:37.0) Gecko/20100101 Firefox/37.0"; 85 encoding = Encoding.UTF8; 86 87 88 } 89 90 public HttpWebHelper(WebBrowser webBrowser, string uuid, string dataUrl) 91 { 92 //this.codeUrl = "http://support.58.com/firewall/valid/3071088800.do"; 93 //HttpWebHelper.webBrowser = webBrowser; 94 //this.uuid = uuid; 95 this.dataUrl = dataUrl; 96 } 97 98 /// <summary> 99 /// 验证 验证码,验证码和页面生成的一个id值同时post到服务器 100 /// </summary> 101 /// <param name="code">验证码</param> 102 public void postVerCode(string code, string uuid) 103 { 104 try 105 { 106 //HtmlElement d = webBrowser.Document.GetElementById("uuid"); 107 108 //获取页面uid。 109 /* 110 * 验证方式:验证码和页面生成的一个id值 111 */ 112 //string y = webBrowser.Document.GetElementById("uuid").GetAttribute("value"); 113 114 // string postUrl = "http://support.58.com/firewall/valid/3071088800.do"; 115 HttpWebHelper h = new HttpWebHelper(); 116 117 HttpWebRequest request = (HttpWebRequest)WebRequest.Create(verCode); 118 request.Method = Method; 119 request.Referer = Referer; 120 request.Headers.Add("X-Requested-With", "XMLHttpRequest"); 121 request.Host = Host; 122 CookieContainer cookie = new CookieContainer(); 123 request.CookieContainer = cookie; 124 request.Accept = Accept; 125 request.ContentType = ContentType; 126 request.Headers.Add("Accept-Language", Accept_Language); 127 request.UserAgent = UserAgent; 128 string parameter = string.Format("inputcode={0}&namespace=infodetailweb&uuid={1}", HttpUtility.UrlEncode(code), uuid); 129 130 byte[] buffer = Encoding.Default.GetBytes(parameter); 131 132 string result = string.Empty; 133 Stream reqStr = request.GetRequestStream(); 134 reqStr.Write(buffer, 0, buffer.Length); 135 using (HttpWebResponse response1 = (HttpWebResponse)request.GetResponse()) 136 { 137 138 using (StreamReader reader = new StreamReader(response1.GetResponseStream(), encoding)) 139 { 140 result = reader.ReadToEnd().Trim(); 141 } 142 } 143 HttpWebHelper.isPass = (result == "1" ? true : false); 144 } 145 catch (Exception ex) 146 { 147 MessageBox.Show(ex.StackTrace); 148 } 149 } 150 151 /// <summary> 152 /// WebClient简单下载页面 153 /// </summary> 154 /// <param name="url">下载html的页面</param> 155 /// <returns></returns> 156 public string webClient(string url) 157 { 158 string html = string.Empty; 159 try 160 { 161 //WebClient client = new WebClient(); 162 //client.Encoding = encoding; 163 //string html = client.DownloadString(url); 164 HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url); 165 166 request.Method = "get"; 167 //request.Timeout = 300; 168 using (HttpWebResponse response1 = (HttpWebResponse)request.GetResponse()) 169 { 170 using (StreamReader reader = new StreamReader(response1.GetResponseStream(), encoding)) 171 { 172 html = reader.ReadToEnd().Trim(); 173 } 174 } 175 } 176 catch (Exception ex) 177 { 178 MessageBox.Show(ex.StackTrace); 179 } 180 return html; 181 } 182 }
模拟请求类有了。接下来就是在返回的htlm中抓取关键字,这里是匹配包含 "维修" 的a 标签
我封装了一个方法,根据url和关键字抓取数据后。直接给窗体的控件listBoxMenu绑定数据
/// <summary> /// /// </summary> /// <param name="url">首页抓取</param> /// <param name="keyword">首页关键字</param> private void ProcessDownload(string url, string keyword) { this.Invoke( new Action(() => { richTextBoxInfo.AppendText(url + "开始下载中......\n"); }) ); //抓取关键字对应的url WebClient client = new WebClient(); string html = client.DownloadString(url); IHtmlDocument document = new JumonyParser().Parse(html); IEnumerable<IHtmlElement> result = document.Find("a").Where(t => t.InnerText().Contains(keyword)); Dictionary<string, string> dir = new Dictionary<string, string>(); foreach (var item in result) { var href = item.Attribute("href").Value(); var text = item.InnerText(); if (!dir.ContainsKey(href)) dir.Add(text, href); } //左边菜单栏赋值 this.Invoke(new Action(() => { foreach (var item in dir) { listBoxMenu.Items.Add(item.Key); } })); //共享数据 HttpWebHelper.list = dir; HttpWebHelper.prefix = url; //开启多线程下载。 //foreach (var item in dir) //{ // Thread thread = new Thread(() => { DownloadHtml(item.Key); }); // thread.Name = item.Key; //线程取名字 //} try { foreach (var item in dir) { //ThreadPool.QueueUserWorkItem(new WaitCallback(DownloadHtml), item.Key); Thread thread = new Thread(ThreadDownload); thread.Name = item.Key; thread.Start(item.Key + "," + item.Value); } } catch (Exception ex) { MessageBox.Show(ex.StackTrace); } }
这个void ProcessDownload(string url, string keyword)有几点注意。这个方法是异步调用的。所以在这里给窗体的控件赋值,就属于跨线程操作UI,因为UI是在主线程中创建和绘制的
有关跨线程问题可以看此篇博文:http://www.cnblogs.com/nsky/p/4436309.html
可以看到里面是有用到线程池的 :ThreadPool,后来被我注释了。因为我需要给线程命名。但线程池我没找到此方法。是不是没有呢?
在ProcessDownload方法里面。当首页关键字匹配后,根据匹配的个数,开启多线程执行详细页面抓取,首页的关键字我保存在了字典里面
Dictionary<string, string> dir = new Dictionary<string, string>(); 分别用关键字和关键字对应的url来存取key-value。在HttpWebHelper类中。我也定义了static
try { foreach (var item in dir) { //ThreadPool.QueueUserWorkItem(new WaitCallback(DownloadHtml), item.Key); Thread thread = new Thread(ThreadDownload); thread.Name = item.Key; thread.Start(item.Key + "," + item.Value); } } catch (Exception ex) { MessageBox.Show(ex.StackTrace); }
这里把key-value传值给ThreadDownload。
了解多线程可以看博文:http://www.cnblogs.com/nsky/p/4425286.html
首页抓取关键字的方法有了。那还缺一个什么方法?还需要一个抓取显示列表的页面,这里取名为:ThreadDownload方法
1 /// <summary> 2 /// 3 /// </summary> 4 /// <param name="title">当前抓取的关键字</param> 5 private void ThreadDownload(object obj) 6 { 7 //因为58有采集频率限制。所以改成同步 8 Monitor.Enter(this); 9 10 string[] ob = obj.ToString().Split(','); 11 this.Invoke( 12 new Action(() => { richTextBoxInfo.AppendText(string.Format("正在抓取:{0}\n", ob[0])); }) 13 ); 14 Dictionary<string, string> list = HttpWebHelper.list; 15 string prefix = HttpWebHelper.prefix; 16 17 18 HttpWebHelper client = new HttpWebHelper(); 19 client.encoding = Encoding.UTF8; 20 //client.webClient(prefix); 21 22 23 DataTable dt = new DataTable(); 24 dt.Columns.Add("公司名字", typeof(string)); 25 dt.Columns.Add("联系人", typeof(string)); 26 dt.Columns.Add("联系电话", typeof(string)); 27 28 //遍历每个信息对象的url 如:家庭维修==》 www.baidu.com 29 //foreach (var item in list) 30 //{ 31 //获取列表 32 string fullurl = string.Format("{0}{1}", prefix, ob[1]); 33 string html = client.webClient(fullurl); 34 35 IHtmlDocument document = new JumonyParser().Parse(html); 36 IEnumerable<IHtmlElement> result = document.Find("table[id=jingzhun]"); 37 38 var items = result.Find("tr"); 39 40 foreach (var o in items) 41 { 42 if (o.Find("a").Count() > 0) 43 { 44 /* 45 * 执行该url的时候。服务器判断了请求的频繁度,需要输入验证码。 46 * 输入验证码成功后。会执行该url 即下面的referer 47 */ 48 //列表中找到a标签转到详细页面 49 string referer = o.FindFirst("a").Attribute("href").Value(); 50 51 52 //http://support.58.com/firewall/valid/1032910901.do?namespace=infodetailweb&url=http://sz.58.com/qichejx/19720429696131x.shtml 53 54 //等待5秒,防止抓取频率过高 时间根据当前的环境来定 55 Thread.Sleep(5000); 56 57 58 59 string n = Thread.CurrentThread.Name; 60 string i = Thread.CurrentThread.ManagedThreadId.ToString(); 61 62 //抓取详细页面。这里如果过于频繁,会跳到输入验证码页面 63 string sonHtml = client.webClient(referer); 64 65 //Monitor.Enter(this); 66 67 if (sonHtml.Contains("验证码")) 68 { 69 70 HttpWebRequest request = (HttpWebRequest)WebRequest.Create(referer); 71 request.Method = "get"; 72 string responseUrl = string.Empty; 73 string rediect = string.Empty; 74 using (HttpWebResponse response1 = (HttpWebResponse)request.GetResponse()) 75 { 76 //"http://support.58.com/firewall/valid/1903444021.do?namespace=infodetailweb&url=http://sz.58.com/shoujiweixiu/21147587557513x.shtml" 77 responseUrl = response1.ResponseUri.ToString(); 78 79 //获取绝对路径 "/firewall/valid/1032910901.do" 80 string absolutePath = response1.ResponseUri.AbsolutePath; 81 82 //ResponseUri.Authority "support.58.com" 83 HttpWebHelper.verCode = "http://" + response1.ResponseUri.Authority + absolutePath; 84 85 //获取?后面的字符串 86 string query = response1.ResponseUri.Query; 87 88 //验证码成功后,重定向的url 89 rediect = query.Substring(query.LastIndexOf("=") + 1); 90 } 91 //response1.ResponseUri.GetComponents(UriComponents.Query, UriFormat.UriEscaped); 92 //HttpWebHelper http = new HttpWebHelper(); 93 //HttpWebHelper.webBrowser = new WebBrowser(); 94 //HttpWebHelper.webBrowser.Url = new Uri(http.codeUrl); 95 96 //http.webBrowser.Navigate(http.codeUrl); 97 //HttpWebHelper.webBrowser.DocumentCompleted += new WebBrowserDocumentCompletedEventHandler(webBrowser_DocumentCompleted); 98 //HttpWebHelper.webBrowser.NewWindow += new CancelEventHandler(webBrowser_NewWindow); 99 //http://blog.csdn.net/jinjazz/article/details/1916883 100 //while (waitHandle.WaitOne(10, false) == false) { Application.DoEvents(); } 101 102 //Thread thread = new Thread(() => 103 //{ 104 // showCode code = new showCode(); 105 // code.codeHandler = new HttpWebHelper().postVerCode; 106 // //code.p = h.PictureBox; 107 // if (code.ShowDialog() == DialogResult.OK) 108 // { 109 // code.Hide(); 110 // } 111 //}); 112 113 this.Invoke(new Action(() => 114 { 115 116 showCode code = new showCode(); 117 code.codeHandler = new HttpWebHelper().postVerCode; 118 code.showCodeUrl = responseUrl; 119 //code.p = h.PictureBox; 120 //this.dia 121 if (code.ShowDialog() == DialogResult.OK) 122 { 123 code.Hide(); 124 if (HttpWebHelper.isPass) 125 { 126 sonHtml = client.webClient(rediect); 127 128 getTable(sonHtml, ref dt); 129 } 130 } 131 //waitHandle.Set(); 132 133 //waitHandle.WaitOne(); 134 })); 135 //waitHandle.WaitOne(); 136 } 137 else 138 getTable(sonHtml, ref dt); 139 140 //获取当前线程 141 Thread th = Thread.CurrentThread; 142 string name = th.Name; 143 144 this.Invoke(new Action(() => 145 { 146 //MessageBox.Show(name.ToString()); 147 148 149 //创建tab选项卡,如果不存在 150 if (!tabControlWarp.TabPages.ContainsKey(name)) 151 tabControlWarp.TabPages.Add(name, name); 152 153 //动态创建选项卡中显示的数据,和一些属性设置 154 DataGridView view = new DataGridView(); 155 view.AllowUserToAddRows = false; 156 view.AllowUserToDeleteRows = false; 157 view.AllowUserToResizeColumns = false; 158 view.AllowUserToResizeRows = false; 159 view.AutoSizeColumnsMode = DataGridViewAutoSizeColumnsMode.Fill; 160 view.ColumnHeadersHeightSizeMode = DataGridViewColumnHeadersHeightSizeMode.AutoSize; 161 view.MultiSelect = false; 162 view.ReadOnly = true; 163 view.RowHeadersVisible = false; 164 view.BackgroundColor = Color.White; 165 view.ScrollBars = ScrollBars.Vertical; 166 view.SelectionMode = DataGridViewSelectionMode.FullRowSelect; 167 view.Dock = DockStyle.Fill; 168 view.DataSource = dt; 169 //把DataGridView添加到当前选项卡 170 tabControlWarp.TabPages[name].Controls.Add(view); 171 172 //刷新窗体,否则DataGridView数据没有变化 173 this.Refresh(); 174 })); 175 } 176 } 177 //当前线程执行完毕,把当前的数据导出为excel 178 ExcelRender.ExcelRender.RenderToExcel(dt, ob[0] + ".xls"); 179 Monitor.Exit(this); 180 }
这个地方有一个难点就是,如果你采集的频率过高,58会跳转到一个验证码登录页面。这里本来是用多线程执行异步任务,
但:比如同时在执行采集 "手机维修"和"电脑维修"的时候。只要"手机维修"雨打验证码的时候,显然"电脑维修"也会遇到。会有很多不确定的因素,
因为是多线程异步操作,当我弹窗让用户输入验证码的代码,同样会执行多次。
所以找了采取了线程同步 。我用了 Monitor.Enter(this);实现同步。当然你可以用更简单的lock关键字可以实现同样的效果。
说到验证码。58算是下了大功夫,都知道58信息量的巨大。采集的人肯定多。58验证码的机制是。当跳转到验证码登录页面,
页面会生成唯一一个uuid,和一个验证码post到服务器的url和显示验证码有相关联的信息,下面会说明
从图片中可以看出来,显示验证码中的url和post到服务器中的url都包含 1032910901。这是重点,当你提交验证码的时候,服务器会验证 这个 数字 和uuid如果不匹配则验证错误。
那我这里是怎么显示验证码的呢?
首先我是用最普通也是最大众的方式。
用HttpWebRequest读取,其实当HttpWebRequest读取的时候,服务器的验证码已经变了。
当跳转到验证码登录页面。服务器就已经记住了uuid,url中的数字 和验证码,当你用HttpWebRequest去获取验证码肯定
和之前的验证码不同。
除了这种方式,网上也提到了好几种方式,这里验证成功后,有一个回调方法
可以通过HttpWebResponse获取响应请求的url。比如
1 HttpWebRequest request = (HttpWebRequest)WebRequest.Create(referer); 2 request.Method = "get"; 3 string responseUrl = string.Empty; 4 string rediect = string.Empty; 5 using (HttpWebResponse response1 = (HttpWebResponse)request.GetResponse()) 6 { 7 //"http://support.58.com/firewall/valid/1903444021.do?namespace=infodetailweb&url=http://sz.58.com/shoujiweixiu/21147587557513x.shtml" 8 responseUrl = response1.ResponseUri.ToString(); 9 10 //获取绝对路径 "/firewall/valid/1032910901.do" 11 string absolutePath = response1.ResponseUri.AbsolutePath; 12 13 //ResponseUri.Authority "support.58.com" 14 HttpWebHelper.verCode = "http://" + response1.ResponseUri.Authority + absolutePath; 15 16 //获取?后面的字符串 17 string query = response1.ResponseUri.Query; 18 19 //验证码成功后,重定向的url 20 rediect = query.Substring(query.LastIndexOf("=") + 1); 21 }
第一种:页面在WebBrowser中打开。读取验证码图片流。保存在剪切板中
1 /// <summary> 2 /// 返回指定WebBrowser中图片<IMG></IMG>中的图内容 3 /// </summary> 4 /// <param name="WebCtl">WebBrowser控件</param> 5 /// <param name="ImgeTag">IMG元素</param> 6 /// <returns>IMG对象</returns> 7 private Image GetWebImage(WebBrowser WebCtl, HtmlElement ImgeTag) 8 { 9 10 /* 11 * 这种方法有时候会因为剪切板没有头像而报异常, 12 * 初步判断是页面(我这里是js对图片赋值)图片没有加载完成,而没获取到图片 13 * System.Threading.Thread.Sleep(8000);测试通过。但每次时间是不确定的。 14 */ 15 16 HTMLDocument doc = (HTMLDocument)WebCtl.Document.DomDocument; 17 HTMLBody body = (HTMLBody)doc.body; 18 IHTMLControlRange rang = (IHTMLControlRange)body.createControlRange(); 19 IHTMLControlElement Img = (IHTMLControlElement)ImgeTag.DomElement; //图片地址 20 Image oldImage = Clipboard.GetImage(); 21 rang.add(Img); 22 rang.execCommand("Copy", false, null); //拷贝到内存 23 Image numImage = Clipboard.GetImage(); //如果为null则保存 24 25 //判断剪切板是否有图片 26 //https://msdn.microsoft.com/zh-cn/library/system.windows.forms.clipboard.getimage.aspx 27 if (Clipboard.ContainsImage()) 28 { } 29 30 31 try 32 { 33 Clipboard.SetImage(oldImage); 34 } 35 catch (Exception ex) 36 { 37 MessageBox.Show(ex.Message); 38 } 39 return numImage; 40 }
调用代码:
1 //找到图片 2 HtmlElement ImgeTag = webBrowser1.Document.GetElementById("imgCode"); 3 4 Image numPic = GetWebImage(webBrowser1, ImgeTag); // 得到验证码图片 5 pictureBox1.Image = numPic; //图片赋值
HTMLDocument需要添加引用:F:\Program Files (x86)\Microsoft Visual Studio 12.0\Visual Studio Tools for Office\PIA\Common\Microsoft.mshtml.dll
引入命名空间:using mshtml;
显然。页面必须加载完成后才能获取到图片。即在事件中webBrowser1_DocumentCompleted获取。但它却不能判断js脚本什么时候完成。
如果是多线程异步任务,还需要webBrowser1_DocumentCompleted执行后,在执行后面的方法,因为webBrowser1_DocumentCompleted本身就是异步的
此时的解决方案是 利用AutoResetEvent阻止线程,等当前线程执行完毕
AutoResetEvent waitHandle = new AutoResetEvent(false); while (waitHandle.WaitOne(10, false) == false) { Application.DoEvents(); }
第二种:抓图。根据图片的高宽来剪切
首先动态创建WebBrowser,并注册事件
WebBrowser we = new WebBrowser(); we.Url = new Uri("http://support.58.com/firewall/valid/3071088800.do"); we.DocumentCompleted += new WebBrowserDocumentCompletedEventHandler(we_DocumentCompleted);
1 void we_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e) 2 { 3 4 //HtmlElement d = webBrowser1.Document.GetElementById("uuid"); 5 6 //string y = webBrowser1.Document.GetElementById("uuid").GetAttribute("value"); 7 8 9 10 //var wb = new WebBrowser(); 11 12 HtmlElementCollection docs = we.Document.All; 13 foreach (HtmlElement item in docs) 14 { 15 string ii = item.Id; 16 17 if (item.Id == "uuid") 18 { 19 string c = item.GetAttribute("value"); 20 } 21 else if (item.Id == "imgCode") 22 { 23 HtmlElement img = item.Document.GetElementById("imgCode"); 24 item.Style = "position: absolute; z-index: 9999; top: 0px; left: 0px"; 25 26 //抓图 27 var b = new Bitmap(item.ClientRectangle.Width, item.ClientRectangle.Height); 28 we.DrawToBitmap(b, new Rectangle(new Point(), item.ClientRectangle.Size)); 29 pictureBox1.Image = b; 30 break; 31 32 } 33 } 34 }
第二种有个注意的地方:WebBrowser必须动态创建但不能依附于窗体上,即不将WebBrowser加载到窗体,否则截取后的图片是显示白色的。我也不知道什么原因
第3种:是根据第二种演化而来的,也是我当前用的。感觉有些投机取巧
你可以到显示验证码页面查看验证码图片的大小,也就是高度和宽度,然后新建一个显示验证码的窗体,我这里取名为showCode
在showCode上放一个webBrowser,高度和宽度设置为验证码图片的高度和宽度。比如:
AllowWebBrowserDrop=false //控件不能拖动
ScrollBarsEnabled = false //取消滚动条
size = 120,40 验证码图片的高度
然后找到webbrowser中的图片。设置样式。使其显示在最右上角
img.Style = "position: absolute; z-index: 9999; top: 0px; left: 0px";
窗体布局:
核心代码
1 public partial class showCode : Form 2 { 3 public Image p { get; set; } 4 public string showCodeUrl { get; set; } //显示验证码页面 5 public delegate void delegateCode(string code, string uuid); 6 public delegateCode codeHandler; 7 8 9 public showCode() 10 { 11 InitializeComponent(); 12 //InitializeEvents(); 13 } 14 /// <summary> 15 /// 初始化 16 /// </summary> 17 //private void InitializeEvents() 18 //{ 19 // this.webBrowser.DocumentCompleted += new WebBrowserDocumentCompletedEventHandler(webBrowser_DocumentCompleted); 20 //} 21 22 void webBrowser_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e) 23 { 24 WebBrowser bro = (WebBrowser)sender; 25 26 HtmlElement img = bro.Document.GetElementById("imgCode"); 27 28 bro.Document.GetElementById("uuid").GetAttribute("value"); 29 30 img.Style = "position: absolute; z-index: 9999; top: 0px; left: 0px"; //使其显示在最右上角 31 img.SetAttribute("onclick", "javascript:void(0)"); //取消单击图片刷新验证码操作 32 } 33 private void btnOk_Click(object sender, EventArgs e) 34 { 35 string code = textCode.Text; 36 if (string.IsNullOrEmpty(code)) 37 { 38 MessageBox.Show("请输入验证码", "验证码", MessageBoxButtons.OK, MessageBoxIcon.Information); 39 textCode.Focus(); 40 return; 41 } 42 if (codeHandler != null) 43 { 44 string uuid = webBrowser.Document.GetElementById("uuid").GetAttribute("value"); 45 46 this.DialogResult = DialogResult.OK; 47 codeHandler(code, uuid); 48 } 49 } 50 51 private void showCode_Load(object sender, EventArgs e) 52 { 53 //pictureBoxCode.Image = p; 54 webBrowser.Url = new Uri(showCodeUrl); 55 this.webBrowser.DocumentCompleted += new WebBrowserDocumentCompletedEventHandler(webBrowser_DocumentCompleted); 56 } 57 }
我这里定义了一个委托。利用回调机制,把验证码和uuid传给主窗体,这里显示验证码的url由主窗体传进来。
当遇到验证码的时候,就会弹窗,如果能做到自动识别就更好了。
好了。现在回到之前的问题上。现在需要抓取详细页面的数据,上面说了ThreadDownload只是抓取列表页面。
现在定义一个方法DataTable getTable(string document, ref DataTable dt),这里的dt是ref类型。是之前需要用的。好像现在已经用不上了。大家可以根据自己的要求修改
getTable方法是接收传来的详细页面。然后匹配信息:比如:用户名,手机号码,公司名称
1 private DataTable getTable(string document, ref DataTable dt) 2 { 3 try 4 { 5 //if (IsDisposed) return null; 6 //this.Invoke( 7 // new Action(() => { richTextBoxInfo.AppendText("正在下载\n"); }) 8 // ); 9 10 IHtmlDocument hd = new JumonyParser().Parse(document); 11 //string company = hd.FindFirst("div[class=su_tit]").InnerText(); 12 13 string company = "未知"; 14 string phone = "未知"; 15 string linkman = "未知"; 16 17 //判断是个人还是企业 18 var su = hd.Find("ul[class=suUl]"); 19 20 //顶部html包含联系人。电话 21 IHtmlDocument top = new JumonyParser().Parse(hd.FindFirst("ul[class=suUl]").InnerHtml()); 22 23 if (su.Count() > 0) 24 { 25 if (top.Find("div[class=su_tit]").Count() > 0) 26 { 27 string txt = top.FindFirst("div[class=su_tit]").InnerText(); 28 if (txt.Contains("公司名称")) 29 { 30 if (top.Find("div[class=su_con]").Count() > 0) 31 //company = top.FindFirst("div[class=su_con]").FindFirst("a").InnerText(); 32 company = top.FindFirst("div[class=su_con]").InnerText().Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries)[0]; 33 if (top.Find("li:nth-child(1)").Count() > 0) 34 linkman = top.FindFirst("li:nth-child(2)").FindFirst("div[class=su_con]").FindFirst("a").InnerText(); 35 if (top.Find("span[class=l_phone]").Count() > 0) 36 phone = top.FindFirst("span[class=l_phone]").InnerText(); 37 } 38 else if (txt.Contains("联系人")) 39 { 40 if (top.Find("li:nth-child(1)").Count() > 0) 41 linkman = top.FindFirst("li:nth-child(1)").FindFirst("div[class=su_con]").InnerText(); 42 if (top.Find("li:nth-child(2)").Count() > 0) 43 phone = top.FindFirst("li:nth-child(2)").FindFirst("span[id=t_phone]").InnerText(); 44 } 45 } 46 } 47 48 DataRow row = dt.NewRow(); 49 row["公司名字"] = company; 50 row["联系电话"] = phone; 51 row["联系人"] = linkman; 52 53 dt.Rows.Add(row); 54 55 56 return dt; 57 } 58 catch (Exception) 59 { 60 61 return null; 62 } 63 }
来看看入口函数,开启异步调用。显然是不让窗体假死
/// <summary> /// 开始抓取 /// </summary> /// <param name="sender"></param> /// <param name="e"></param> /// void btnStart_Click(object sender, EventArgs e) { btnStart.Enabled = false; //Dictionary<string, string> result = new Dictionary<string, string>(); //string url = "http://sz.58.com/"; //string keyword = "维修"; string url = textBoxUrl.Text; string keyword = textBoxKeyword.Text; if (string.IsNullOrEmpty(url)) { MessageBox.Show("请输入要抓取的网址", "网址", MessageBoxButtons.OK, MessageBoxIcon.Information); textBoxUrl.Focus(); return; } else if (string.IsNullOrEmpty(keyword)) { MessageBox.Show("请输入要抓取的关键字", "关键字", MessageBoxButtons.OK, MessageBoxIcon.Information); textBoxKeyword.Focus(); return; } //string prefix = "http://sz.58.com"; // 声明一个异步委托去处理下载操作 Action downloadAction = new Action(() => { ProcessDownload(url, keyword); }); //Action<string, string> an = new Action<string, string>(ProcessDownload); //声明一个下载完成后的回调函数 AsyncCallback callback = new AsyncCallback((asyncResult) => { this.Invoke( new Action(() => { richTextBoxInfo.AppendText("首页关键字匹配完成,显示在左侧列表中.....\n"); }) ); }); downloadAction.BeginInvoke(callback, null); }
其余代码
/// <summary> /// 窗体关闭提醒 /// </summary> /// <param name="sender"></param> /// <param name="e"></param> void Main_FormClosing(object sender, FormClosingEventArgs e) { if (MessageBox.Show("是否退出当前程序", "关闭", MessageBoxButtons.YesNo, MessageBoxIcon.Question) == DialogResult.No) e.Cancel = true; else Environment.Exit(0); //强制退出所以线程 } /// <summary> /// 单击左边菜单栏 /// </summary> /// <param name="sender"></param> /// <param name="e"></param> void listBoxMenu_MouseClick(object sender, MouseEventArgs e) { string txt = listBoxMenu.Text; if (tabControlWarp.TabPages.ContainsKey(txt) && !string.IsNullOrEmpty(txt)) { //tabControlWarp.TabPages.Add(txt, txt); //创建选项卡 tabControlWarp.SelectedTab = tabControlWarp.TabPages[txt];//并且选中 } //else tabControlWarp.SelectedTab = tabControlWarp.TabPages[txt]; }
项目中用到了NPOI导出excel,这里附上相关帮助类
1 public class ExcelRender 2 { 3 /// <summary> 4 /// 根据Excel列类型获取列的值 5 /// </summary> 6 /// <param name="cell">Excel列</param> 7 /// <returns></returns> 8 private static string GetCellValue(ICell cell) 9 { 10 if (cell == null) 11 return string.Empty; 12 switch (cell.CellType) 13 { 14 case CellType.BLANK: 15 return string.Empty; 16 case CellType.BOOLEAN: 17 return cell.BooleanCellValue.ToString(); 18 case CellType.ERROR: 19 return cell.ErrorCellValue.ToString(); 20 case CellType.NUMERIC: 21 case CellType.Unknown: 22 default: 23 return cell.ToString();//This is a trick to get the correct value of the cell. NumericCellValue will return a numeric value no matter the cell value is a date or a number 24 case CellType.STRING: 25 return cell.StringCellValue; 26 case CellType.FORMULA: 27 try 28 { 29 HSSFFormulaEvaluator e = new HSSFFormulaEvaluator(cell.Sheet.Workbook); 30 e.EvaluateInCell(cell); 31 return cell.ToString(); 32 } 33 catch 34 { 35 return cell.NumericCellValue.ToString(); 36 } 37 } 38 } 39 40 /// <summary> 41 /// 自动设置Excel列宽 42 /// </summary> 43 /// <param name="sheet">Excel表</param> 44 private static void AutoSizeColumns(ISheet sheet) 45 { 46 47 if (sheet.PhysicalNumberOfRows > 0) 48 { 49 IRow headerRow = sheet.GetRow(0); 50 51 for (int i = 0, l = headerRow.LastCellNum; i < l; i++) 52 { 53 sheet.AutoSizeColumn(i); 54 } 55 } 56 } 57 58 /// <summary> 59 /// 保存Excel文档流到文件 60 /// </summary> 61 /// <param name="ms">Excel文档流</param> 62 /// <param name="fileName">文件名</param> 63 private static void SaveToFile(MemoryStream ms, string fileName) 64 { 65 using (FileStream fs = new FileStream(fileName, FileMode.Create, FileAccess.Write)) 66 { 67 byte[] data = ms.ToArray(); 68 69 fs.Write(data, 0, data.Length); 70 fs.Flush(); 71 72 data = null; 73 } 74 } 75 76 /// <summary> 77 /// 输出文件到浏览器 78 /// </summary> 79 /// <param name="ms">Excel文档流</param> 80 /// <param name="context">HTTP上下文</param> 81 /// <param name="fileName">文件名</param> 82 private static void RenderToBrowser(MemoryStream ms, HttpContext context, string fileName) 83 { 84 if (context.Request.Browser.Browser == "IE") 85 fileName = HttpUtility.UrlEncode(fileName); 86 context.Response.AddHeader("Content-Disposition", "attachment;fileName=" + fileName); 87 context.Response.BinaryWrite(ms.ToArray()); 88 } 89 90 /// <summary> 91 /// DataReader转换成Excel文档流 92 /// </summary> 93 /// <param name="reader"></param> 94 /// <returns></returns> 95 public static MemoryStream RenderToExcel(IDataReader reader) 96 { 97 MemoryStream ms = new MemoryStream(); 98 99 using (reader) 100 { 101 using (IWorkbook workbook = new HSSFWorkbook()) 102 { 103 using (ISheet sheet = workbook.CreateSheet()) 104 { 105 IRow headerRow = sheet.CreateRow(0); 106 int cellCount = reader.FieldCount; 107 108 // handling header. 109 for (int i = 0; i < cellCount; i++) 110 { 111 headerRow.CreateCell(i).SetCellValue(reader.GetName(i)); 112 } 113 114 // handling value. 115 int rowIndex = 1; 116 while (reader.Read()) 117 { 118 IRow dataRow = sheet.CreateRow(rowIndex); 119 120 for (int i = 0; i < cellCount; i++) 121 { 122 dataRow.CreateCell(i).SetCellValue(reader[i].ToString()); 123 } 124 125 rowIndex++; 126 } 127 128 AutoSizeColumns(sheet); 129 130 workbook.Write(ms); 131 ms.Flush(); 132 ms.Position = 0; 133 } 134 } 135 } 136 return ms; 137 } 138 139 /// <summary> 140 /// DataReader转换成Excel文档流,并保存到文件 141 /// </summary> 142 /// <param name="reader"></param> 143 /// <param name="fileName">保存的路径</param> 144 public static void RenderToExcel(IDataReader reader, string fileName) 145 { 146 using (MemoryStream ms = RenderToExcel(reader)) 147 { 148 SaveToFile(ms, fileName); 149 } 150 } 151 152 /// <summary> 153 /// DataReader转换成Excel文档流,并输出到客户端 154 /// </summary> 155 /// <param name="reader"></param> 156 /// <param name="context">HTTP上下文</param> 157 /// <param name="fileName">输出的文件名</param> 158 public static void RenderToExcel(IDataReader reader, HttpContext context, string fileName) 159 { 160 using (MemoryStream ms = RenderToExcel(reader)) 161 { 162 RenderToBrowser(ms, context, fileName); 163 } 164 } 165 166 /// <summary> 167 /// DataTable转换成Excel文档流 168 /// </summary> 169 /// <param name="table"></param> 170 /// <returns></returns> 171 public static MemoryStream RenderToExcel(DataTable table) 172 { 173 MemoryStream ms = new MemoryStream(); 174 175 using (table) 176 { 177 using (IWorkbook workbook = new HSSFWorkbook()) 178 { 179 using (ISheet sheet = workbook.CreateSheet()) 180 { 181 IRow headerRow = sheet.CreateRow(0); 182 183 // handling header. 184 foreach (DataColumn column in table.Columns) 185 headerRow.CreateCell(column.Ordinal).SetCellValue(column.Caption);//If Caption not set, returns the ColumnName value 186 187 // handling value. 188 int rowIndex = 1; 189 190 foreach (DataRow row in table.Rows) 191 { 192 IRow dataRow = sheet.CreateRow(rowIndex); 193 194 foreach (DataColumn column in table.Columns) 195 { 196 dataRow.CreateCell(column.Ordinal).SetCellValue(row[column].ToString()); 197 } 198 199 rowIndex++; 200 } 201 AutoSizeColumns(sheet); 202 203 workbook.Write(ms); 204 ms.Flush(); 205 ms.Position = 0; 206 } 207 } 208 } 209 return ms; 210 } 211 212 /// <summary> 213 /// DataTable转换成Excel文档流,并保存到文件 214 /// </summary> 215 /// <param name="table"></param> 216 /// <param name="fileName">保存的路径</param> 217 public static void RenderToExcel(DataTable table, string fileName) 218 { 219 using (MemoryStream ms = RenderToExcel(table)) 220 { 221 SaveToFile(ms, fileName); 222 } 223 } 224 225 /// <summary> 226 /// DataTable转换成Excel文档流,并输出到客户端 227 /// </summary> 228 /// <param name="table"></param> 229 /// <param name="response"></param> 230 /// <param name="fileName">输出的文件名</param> 231 public static void RenderToExcel(DataTable table, HttpContext context, string fileName) 232 { 233 using (MemoryStream ms = RenderToExcel(table)) 234 { 235 RenderToBrowser(ms, context, fileName); 236 } 237 } 238 239 /// <summary> 240 /// Excel文档流是否有数据 241 /// </summary> 242 /// <param name="excelFileStream">Excel文档流</param> 243 /// <returns></returns> 244 public static bool HasData(Stream excelFileStream) 245 { 246 return HasData(excelFileStream, 0); 247 } 248 249 /// <summary> 250 /// Excel文档流是否有数据 251 /// </summary> 252 /// <param name="excelFileStream">Excel文档流</param> 253 /// <param name="sheetIndex">表索引号,如第一个表为0</param> 254 /// <returns></returns> 255 public static bool HasData(Stream excelFileStream, int sheetIndex) 256 { 257 using (excelFileStream) 258 { 259 using (IWorkbook workbook = new HSSFWorkbook(excelFileStream)) 260 { 261 if (workbook.NumberOfSheets > 0) 262 { 263 if (sheetIndex < workbook.NumberOfSheets) 264 { 265 using (ISheet sheet = workbook.GetSheetAt(sheetIndex)) 266 { 267 return sheet.PhysicalNumberOfRows > 0; 268 } 269 } 270 } 271 } 272 } 273 return false; 274 } 275 276 /// <summary> 277 /// Excel文档流转换成DataTable 278 /// 第一行必须为标题行 279 /// </summary> 280 /// <param name="excelFileStream">Excel文档流</param> 281 /// <param name="sheetName">表名称</param> 282 /// <returns></returns> 283 public static DataTable RenderFromExcel(Stream excelFileStream, string sheetName) 284 { 285 return RenderFromExcel(excelFileStream, sheetName, 0); 286 } 287 288 /// <summary> 289 /// Excel文档流转换成DataTable 290 /// </summary> 291 /// <param name="excelFileStream">Excel文档流</param> 292 /// <param name="sheetName">表名称</param> 293 /// <param name="headerRowIndex">标题行索引号,如第一行为0</param> 294 /// <returns></returns> 295 public static DataTable RenderFromExcel(Stream excelFileStream, string sheetName, int headerRowIndex) 296 { 297 DataTable table = null; 298 299 using (excelFileStream) 300 { 301 using (IWorkbook workbook = new HSSFWorkbook(excelFileStream)) 302 { 303 using (ISheet sheet = workbook.GetSheet(sheetName)) 304 { 305 table = RenderFromExcel(sheet, headerRowIndex); 306 } 307 } 308 } 309 return table; 310 } 311 312 /// <summary> 313 /// Excel文档流转换成DataTable 314 /// 默认转换Excel的第一个表 315 /// 第一行必须为标题行 316 /// </summary> 317 /// <param name="excelFileStream">Excel文档流</param> 318 /// <returns></returns> 319 public static DataTable RenderFromExcel(Stream excelFileStream) 320 { 321 return RenderFromExcel(excelFileStream, 0, 0); 322 } 323 324 /// <summary> 325 /// Excel文档流转换成DataTable 326 /// 第一行必须为标题行 327 /// </summary> 328 /// <param name="excelFileStream">Excel文档流</param> 329 /// <param name="sheetIndex">表索引号,如第一个表为0</param> 330 /// <returns></returns> 331 public static DataTable RenderFromExcel(Stream excelFileStream, int sheetIndex) 332 { 333 return RenderFromExcel(excelFileStream, sheetIndex, 0); 334 } 335 336 /// <summary> 337 /// Excel文档流转换成DataTable 338 /// </summary> 339 /// <param name="excelFileStream">Excel文档流</param> 340 /// <param name="sheetIndex">表索引号,如第一个表为0</param> 341 /// <param name="headerRowIndex">标题行索引号,如第一行为0</param> 342 /// <returns></returns> 343 public static DataTable RenderFromExcel(Stream excelFileStream, int sheetIndex, int headerRowIndex) 344 { 345 DataTable table = null; 346 347 using (excelFileStream) 348 { 349 using (IWorkbook workbook = new HSSFWorkbook(excelFileStream)) 350 { 351 using (ISheet sheet = workbook.GetSheetAt(sheetIndex)) 352 { 353 table = RenderFromExcel(sheet, headerRowIndex); 354 } 355 } 356 } 357 return table; 358 } 359 360 /// <summary> 361 /// Excel表格转换成DataTable 362 /// </summary> 363 /// <param name="sheet">表格</param> 364 /// <param name="headerRowIndex">标题行索引号,如第一行为0</param> 365 /// <returns></returns> 366 private static DataTable RenderFromExcel(ISheet sheet, int headerRowIndex) 367 { 368 DataTable table = new DataTable(); 369 370 IRow headerRow = sheet.GetRow(headerRowIndex); 371 int cellCount = headerRow.LastCellNum;//LastCellNum = PhysicalNumberOfCells 372 int rowCount = sheet.LastRowNum;//LastRowNum = PhysicalNumberOfRows - 1 373 374 //handling header. 375 for (int i = headerRow.FirstCellNum; i < cellCount; i++) 376 { 377 DataColumn column = new DataColumn(headerRow.GetCell(i).StringCellValue); 378 table.Columns.Add(column); 379 } 380 381 for (int i = (sheet.FirstRowNum + 1); i <= rowCount; i++) 382 { 383 IRow row = sheet.GetRow(i); 384 DataRow dataRow = table.NewRow(); 385 386 if (row != null) 387 { 388 for (int j = row.FirstCellNum; j < cellCount; j++) 389 { 390 if (row.GetCell(j) != null) 391 dataRow[j] = GetCellValue(row.GetCell(j)); 392 } 393 } 394 395 table.Rows.Add(dataRow); 396 } 397 398 return table; 399 } 400 401 /// <summary> 402 /// Excel文档导入到数据库 403 /// 默认取Excel的第一个表 404 /// 第一行必须为标题行 405 /// </summary> 406 /// <param name="excelFileStream">Excel文档流</param> 407 /// <param name="insertSql">插入语句</param> 408 /// <param name="dbAction">更新到数据库的方法</param> 409 /// <returns></returns> 410 public static int RenderToDb(Stream excelFileStream, string insertSql, DBAction dbAction) 411 { 412 return RenderToDb(excelFileStream, insertSql, dbAction, 0, 0); 413 } 414 415 public delegate int DBAction(string sql, params IDataParameter[] parameters); 416 417 /// <summary> 418 /// Excel文档导入到数据库 419 /// </summary> 420 /// <param name="excelFileStream">Excel文档流</param> 421 /// <param name="insertSql">插入语句</param> 422 /// <param name="dbAction">更新到数据库的方法</param> 423 /// <param name="sheetIndex">表索引号,如第一个表为0</param> 424 /// <param name="headerRowIndex">标题行索引号,如第一行为0</param> 425 /// <returns></returns> 426 public static int RenderToDb(Stream excelFileStream, string insertSql, DBAction dbAction, int sheetIndex, int headerRowIndex) 427 { 428 int rowAffected = 0; 429 using (excelFileStream) 430 { 431 using (IWorkbook workbook = new HSSFWorkbook(excelFileStream)) 432 { 433 using (ISheet sheet = workbook.GetSheetAt(sheetIndex)) 434 { 435 StringBuilder builder = new StringBuilder(); 436 437 IRow headerRow = sheet.GetRow(headerRowIndex); 438 int cellCount = headerRow.LastCellNum;//LastCellNum = PhysicalNumberOfCells 439 int rowCount = sheet.LastRowNum;//LastRowNum = PhysicalNumberOfRows - 1 440 441 for (int i = (sheet.FirstRowNum + 1); i <= rowCount; i++) 442 { 443 IRow row = sheet.GetRow(i); 444 if (row != null) 445 { 446 builder.Append(insertSql); 447 builder.Append(" values ("); 448 for (int j = row.FirstCellNum; j < cellCount; j++) 449 { 450 builder.AppendFormat("'{0}',", GetCellValue(row.GetCell(j)).Replace("'", "''")); 451 } 452 builder.Length = builder.Length - 1; 453 builder.Append(");"); 454 } 455 456 if ((i % 50 == 0 || i == rowCount) && builder.Length > 0) 457 { 458 //每50条记录一次批量插入到数据库 459 rowAffected += dbAction(builder.ToString()); 460 builder.Length = 0; 461 } 462 } 463 } 464 } 465 } 466 return rowAffected; 467 } 468 }View Code
代码没什么高级的地方。关键是看逻辑是否清晰,我这里优化的还很多。数据采集无非就是异步委托,多线程同步等等。就看你怎么灵活运用。