前几天看到一篇博文:C# 爬虫 抓取小说
下面使用HtmlAgilityPack来改写原博主的代码
在使用 HtmlAgilityPack 之前,建议先熟悉一下 XPath 语法(原文此处的“点我”是一个指向 XPath 教程的链接)。
代码如下:
using System;
using System.IO;
using System.Text;
using HtmlAgilityPack;

namespace HtmlAgilityPackDemo
{
    /// <summary>
    /// Console demo: loads a novel's index page, follows every chapter link found in its
    /// table, and appends each chapter's title and text to a local UTF-8 text file.
    /// </summary>
    class Program
    {
        /// <summary>Index page that lists the chapter links.</summary>
        private const string IndexUrl = "http://www.23us.so/files/article/html/13/13655/index.html";

        /// <summary>Output file; chapters are appended to it.</summary>
        private const string OutputPath = "无疆.txt";

        /// <summary>
        /// Entry point. Downloads the index page, then each linked chapter, and writes
        /// "title + content" for every chapter to <see cref="OutputPath"/>.
        /// Any failure is printed to the console; the program then waits for a key press.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            HtmlWeb htmlWeb = new HtmlWeb();
            try
            {
                // Load inside the try block so a network failure is reported instead of crashing.
                HtmlDocument document = htmlWeb.Load(IndexUrl);

                // A leading "//" in XPath selects matching nodes at any depth in the document.
                HtmlNodeCollection chapterLinks = document.DocumentNode.SelectNodes(@"//table/tr/td/a[@href]");
                if (chapterLinks == null)
                {
                    // SelectNodes returns null (not an empty collection) when nothing matches.
                    Console.WriteLine("No chapter links found at " + IndexUrl);
                }
                else
                {
                    // using guarantees both the writer and the underlying stream are flushed and closed.
                    using (FileStream fs = new FileStream(OutputPath, FileMode.Append, FileAccess.Write))
                    using (StreamWriter writer = new StreamWriter(fs, Encoding.UTF8))
                    {
                        foreach (HtmlNode link in chapterLinks)
                        {
                            string chapterUrl = link.Attributes["href"].Value;

                            // Download each chapter page once and query it for both title and content.
                            HtmlNode chapterRoot = htmlWeb.Load(chapterUrl).DocumentNode;
                            HtmlNode titleNode = chapterRoot.SelectSingleNode(@"//h1");                  // chapter title
                            HtmlNode contentNode = chapterRoot.SelectSingleNode(@"//dd[@id='contents']"); // chapter body

                            if (titleNode == null || contentNode == null)
                            {
                                // Not a chapter page (or layout changed): skip it rather than abort the whole run.
                                Console.WriteLine("Skipped (title or content not found): " + chapterUrl);
                                continue;
                            }

                            // InnerHtml is raw markup, so non-breaking spaces appear as the literal
                            // entity "&nbsp;"; strip them and turn <br> tags into line breaks.
                            // NOTE(review): the published listing showed Replace(" ", ""), presumably
                            // the entity decoded by the page extraction - confirm against the original post.
                            string content = contentNode.InnerHtml
                                .Replace("&nbsp;", "")
                                .Replace("<br>", "\r\n");

                            writer.WriteLine("\r\n" + titleNode.InnerText + "\r\n" + content);
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                // Top-level handler for a console demo: report the failure and fall through to the prompt.
                Console.WriteLine(ex.ToString());
            }

            Console.WriteLine("ok");
            Console.ReadKey(true);
        }
    }
}
实现效果和原博主一样!
代码仅供参考!!!