这是一个用 ASP.NET 开发的采集器,采集功能不是很强,是我去年写的,还不完善,但是能够采集和下载,采集方法是用正则表达式。
代码就不详细讲解了,需要学习采集的朋友自己可以研究
代码去年写的,不喜勿喷
不说废话:先上下载地址:懒人图库采集
源码压缩包好像加密了;如果提示需要密码,请输入解压密码:www.qhttl.com。主页效果:
采集结果实体:
/// <summary>
/// One collected item: its descriptive fields plus the remote locations and
/// (optionally) the downloaded bytes of its archive and preview image.
/// </summary>
public class JsModel
{
    /// <summary>Display name of the item.</summary>
    public string Name { get; set; }

    /// <summary>Free-text remark attached to the item.</summary>
    public string Remark { get; set; }

    /// <summary>Code associated with the item.</summary>
    public string Code { get; set; }

    /// <summary>Category tag of the item.</summary>
    public string Type { get; set; }

    /// <summary>Raw bytes of the item's file.</summary>
    public byte[] FileData { get; set; }

    /// <summary>Location of the item's file.</summary>
    public string File { get; set; }

    /// <summary>Location of the item's preview image.</summary>
    public string Image { get; set; }

    /// <summary>Raw bytes of the item's preview image.</summary>
    public byte[] ImageData { get; set; }

    /// <summary>Numeric identifier of the item.</summary>
    public int ID { get; set; }

    /// <summary>Flag indicating whether the item has a file to download.</summary>
    public bool IsDownFile { get; set; }
}
js采集源码:
代码统一实现Thief接口即可,主要采集函数是Get()
/// <summary>
/// Collector for the "js" section of lanrentuku.com. Walks detail pages by
/// increasing page index, extracts name / download link / preview image (or a
/// remark when no download link exists) with regular expressions, and queues
/// the results in <c>Jses</c> and <c>MsgList</c>.
/// </summary>
public class JsLr : Thief
{
    /// <summary>
    /// Format string for a detail page URL; <c>{0}</c> receives the page index.
    /// Read from the configuration path "config/lanren/js".
    /// </summary>
    protected string Link { get { return base.getAttr("config/lanren/js"); } }

    /// <summary>
    /// Value passed to <c>GetHttpString</c> as its second argument
    /// (presumably the page encoding name - confirm against the base class).
    /// </summary>
    protected string Encod { get { return base.GetNodeText("config/lanren/js/Encoder"); } }

    /// <summary>Comma-separated page indexes from configuration that must be skipped.</summary>
    protected string[] ErrorPage { get { return base.GetNodeText("config/lanren/js/errorPage").Split(','); } }

    private int PageIndex;
    private static JsLr this_JsLr;

    /// <summary>Returns the shared instance, creating it on first use.</summary>
    public static JsLr Create()
    {
        if (this_JsLr == null)
        {
            this_JsLr = new JsLr();
        }
        return this_JsLr;
    }

    private JsLr()
    {
        Jses = new Queue<Model.JsModel>();
        _dir = Context.Server.MapPath("/Download/");
        PageIndex = 1;
        IsComplate = false;
    }

    /// <summary>Returns true when the current page index is listed in <see cref="ErrorPage"/>.</summary>
    private bool isErrorPage()
    {
        string current = PageIndex.ToString();
        foreach (string s in ErrorPage)
        {
            if (s == current)
            {
                return true;
            }
        }
        return false;
    }

    /// <summary>
    /// Starts collecting: on a new thread when <c>IsAsync</c> is set, otherwise
    /// on the calling thread.
    /// </summary>
    public override void Get()
    {
        if (IsAsync)
        {
            GetJsAsync = new Thread(GetJsAll);
            GetJsAsync.Start();
        }
        else
        {
            GetJsAll();
        }
    }

    /// <summary>
    /// Collects pages one after another until a page is reached that does not
    /// look like a detail page (no <c> class="zuozhe"</c> marker), then marks
    /// the collector as complete.
    /// </summary>
    /// <remarks>
    /// Implemented as a loop. The earlier recursive form used one stack frame
    /// per page (unbounded stack growth on long crawls) and, after skipping an
    /// error page, fell through and fetched a page a second time once the
    /// nested call returned.
    /// </remarks>
    private void GetJsAll()
    {
        while (!IsComplate)
        {
            if (isErrorPage())
            {
                // Configured as broken: skip without fetching.
                PageIndex++;
                continue;
            }

            string html = GetHttpString(string.Format(Link, PageIndex), Encod);
            if (string.IsNullOrEmpty(html))
            {
                // Nothing came back; treat it like a non-detail page and stop
                // instead of failing on a null reference below.
                break;
            }

            // Flatten the markup so the patterns below can match across what
            // were originally separate lines.
            html = html.Replace("\n", "").Replace("\r", "").Replace("\t", "");
            if (!html.Contains(" class=\"zuozhe\""))
            {
                break;
            }

            CollectPage(html);
            PageIndex++;
        }

        this.IsComplate = true;
        if (IsAsync)
        {
            // NOTE(review): Thread.Abort is unsupported on .NET Core / .NET 5+.
            // It is kept here because code outside this class may inspect the
            // worker's ThreadState - confirm before replacing with a plain return.
            Thread.CurrentThread.Abort();
        }
    }

    /// <summary>
    /// Extracts one item from a flattened detail page and enqueues it, unless
    /// the name is empty or the item already exists under the download directory.
    /// </summary>
    /// <param name="html">Detail page markup with line breaks and tabs removed.</param>
    private void CollectPage(string html)
    {
        Model.JsModel model = new Model.JsModel();
        model.Type = "js";
        // Slashes are replaced because the name is used as a file name.
        model.Name = DelHtml(ReplaceEx(html, "<div[^<>]*title.*?<h1>(?<name>.*?)</h1>", "name").Replace("/", "_").Replace("\\", "_"));

        if (string.IsNullOrEmpty(model.Name)
            || File.Exists(_dir + "js/" + model.Name + ".zip")
            || File.Exists(_dir + "js/" + "Html/" + model.Name + ".html"))
        {
            return;
        }

        // Locate the download link of the archive.
        model.File = ReplaceEx(html, "<a[^<>]*bt-green.*?href.*?[\"'](?<url>.*?)[\"'].*?>.*?</a>", "url");
        if (!string.IsNullOrEmpty(model.File) && model.File != "#")
        {
            model.IsDownFile = true;
            model.File = "http://www.lanrentuku.com" + model.File;
            model.Image = ReplaceEx(html, "<div.*?content-js.*?[^<>]*class=['\"]l['\"].*?img.*?src.*?[\"'](?<image>.*?)[\"'].*?>", "image");
        }
        else
        {
            // No archive to download: keep the textarea content as the remark,
            // rebranded.
            model.Remark = ReplaceEx(html, "<div[^<>]*content-tx.*?textarea.*?>(?<remark>.*?)</textarea>", "remark").Replace("懒人图库", "程序吧").Replace("lanrentuku.com", "qhttl.cn");
        }

        model.ID = PageIndex;
        lock (Jses)
        {
            Jses.Enqueue(model);
        }
        lock (MsgList)
        {
            MsgList.Enqueue(new Model.MsgModel() { Type = "js", IsDown = false, Model = model });
        }

        // Short pause between collected items.
        Thread.Sleep(50);
    }

    /// <summary>
    /// Starts downloading queued items: on a new thread when <c>IsAsync</c> is
    /// set, otherwise on the calling thread.
    /// </summary>
    public override void DownLoad()
    {
        if (IsAsync)
        {
            DownAsync = new Thread(DownFile);
            DownAsync.Start();
        }
        else
        {
            DownFile();
        }
    }
}
素材采集:
/// <summary>
/// Collector for the image-material sections of lanrentuku.com. For every
/// category in <c>TypeSc</c> it walks the list pages, follows each entry to
/// its detail page, and queues the extracted items in <c>Jses</c> and
/// <c>MsgList</c>.
/// </summary>
public class ScLr : Thief
{
    /// <summary>
    /// Format string for a list page URL; <c>{0}</c> receives the page index
    /// and <c>{1}</c> the current category. Read from "config/lanren/sc".
    /// </summary>
    protected string Link { get { return base.getAttr("config/lanren/sc"); } }

    /// <summary>
    /// Value passed to <c>GetHttpString</c> as its second argument
    /// (presumably the page encoding name - confirm against the base class).
    /// </summary>
    protected string Encod { get { return base.GetNodeText("config/lanren/sc/Encoder"); } }

    /// <summary>Comma-separated page indexes from configuration that must be skipped.</summary>
    protected string[] ErrorPage { get { return base.GetNodeText("config/lanren/sc/errorPage").Split(','); } }

    /// <summary>Categories that are collected, in this order.</summary>
    private string[] TypeSc = new string[] { "bg", "gif", "psd", "qq", "png", "vector" };

    /// <summary>Category currently being collected.</summary>
    private string CurrType { get; set; }

    private int PageIndex;

    /// <summary>Running counter used as the ID of collected items.</summary>
    private int Count;

    private static ScLr this_scLr;

    /// <summary>Returns the shared instance, creating it on first use.</summary>
    public static ScLr Create()
    {
        if (this_scLr == null)
        {
            this_scLr = new ScLr();
        }
        return this_scLr;
    }

    private ScLr()
    {
        Jses = new Queue<Model.JsModel>();
        _dir = Context.Server.MapPath("/Download/");
        PageIndex = 1;
        IsComplate = false;
    }

    /// <summary>
    /// Starts collecting all categories: on a new thread when <c>IsAsync</c>
    /// is set, otherwise on the calling thread.
    /// </summary>
    public override void Get()
    {
        if (IsAsync)
        {
            GetJsAsync = new Thread(GetScByType);
            GetJsAsync.Start();
        }
        else
        {
            // The synchronous path must also go through GetScByType: calling
            // GetScAll directly left CurrType unset and never marked the
            // collector as complete.
            GetScByType();
        }
    }

    /// <summary>
    /// Starts downloading queued items: on a new thread when <c>IsAsync</c> is
    /// set, otherwise on the calling thread.
    /// </summary>
    public override void DownLoad()
    {
        if (IsAsync)
        {
            DownAsync = new Thread(DownFile);
            DownAsync.Start();
        }
        else
        {
            DownFile();
        }
    }

    /// <summary>Returns true when the current page index is listed in <see cref="ErrorPage"/>.</summary>
    private bool isErrorPage()
    {
        string current = PageIndex.ToString();
        foreach (string s in ErrorPage)
        {
            if (s == current)
            {
                return true;
            }
        }
        return false;
    }

    /// <summary>
    /// Collects every category in turn, restarting at page 1 for each, then
    /// marks the collector as complete.
    /// </summary>
    private void GetScByType()
    {
        if (!IsComplate)
        {
            foreach (string s in TypeSc)
            {
                CurrType = s;
                PageIndex = 1;
                GetScAll();
            }
        }

        this.IsComplate = true;
        if (IsAsync)
        {
            // NOTE(review): Thread.Abort is unsupported on .NET Core / .NET 5+.
            // It is kept here because code outside this class may inspect the
            // worker's ThreadState - confirm before replacing with a plain return.
            Thread.CurrentThread.Abort();
        }
    }

    /// <summary>
    /// Collects the list pages of the current category, starting at the
    /// current page index, until a page yields no entries.
    /// </summary>
    /// <remarks>
    /// Implemented as a loop. The earlier recursive form used one stack frame
    /// per page and, after skipping an error page, fell through and fetched a
    /// page a second time once the nested call returned.
    /// </remarks>
    private void GetScAll()
    {
        while (true)
        {
            if (isErrorPage())
            {
                // Configured as broken: skip without fetching.
                PageIndex++;
                continue;
            }

            string html = GetHttpString(string.Format(Link, PageIndex, CurrType), Encod);
            if (string.IsNullOrEmpty(html))
            {
                // Nothing came back; an empty page has no entries, so stop.
                return;
            }

            // Flatten the markup so the patterns below can match across what
            // were originally separate lines.
            html = html.Replace("\n", "").Replace("\r", "").Replace("\t", "");

            string ddes = ReplaceEx(html, "<div[^<>]*?list-pic.*?</div>", 0);
            string[] dds = ReplaceEx(ddes, "<dd.*?/dd>");
            if (dds == null || dds.Length == 0)
            {
                return;
            }

            foreach (string s in dds)
            {
                CollectEntry(s);
            }

            PageIndex++;
        }
    }

    /// <summary>
    /// Builds one item from a list entry and, unless its name is empty or its
    /// archive already exists on disk, completes it from the detail page and
    /// enqueues it.
    /// </summary>
    /// <param name="s">Markup of a single <c>dd</c> entry of the list page.</param>
    private void CollectEntry(string s)
    {
        Model.JsModel model = new Model.JsModel();
        model.ID = ++Count;
        model.Type = CurrType;
        // Slashes are replaced because the name is used as a file name.
        model.Name = ReplaceEx(s, "<a.*>(?<name>[^<>]*)</a>", "name").Replace("/", "_").Replace("\\", "_");
        model.Image = ReplaceEx(s, "<a.*src.*?[\"'](?<src>.*?)[\"'].*?</a>", "src");
        if (!model.Image.Contains("http://img.lanrentuku.com"))
        {
            model.Image = "http://img.lanrentuku.com" + model.Image;
        }

        if (string.IsNullOrEmpty(model.Name) || File.Exists(_dir + CurrType + "/" + model.Name + ".zip"))
        {
            return;
        }

        // Fetch the detail page only for entries that are actually collected.
        string link = ReplaceEx(s, "<a[^<>]*href.*?[\"'](?<href>.*?)[\"'].*?</a>", "href");
        // NOTE(review): unlike the list page, the detail markup is not
        // flattened before matching - confirm ReplaceEx matches across lines.
        string detailHtml = GetHttpString("http://www.lanrentuku.com" + link, Encod);

        // Locate the download link of the archive.
        model.File = ReplaceEx(detailHtml, "<a[^<>]*bt-green.*?href.*?[\"'](?<url>.*?)[\"'].*?>.*?</a>", "url");
        if (!string.IsNullOrEmpty(model.File) && model.File != "#")
        {
            model.IsDownFile = true;
        }
        else
        {
            // No archive to download: keep the textarea content as the remark,
            // rebranded.
            model.Remark = ReplaceEx(detailHtml, "<div[^<>]*content-tx.*?textarea.*?>(?<remark>.*?)</textarea>", "remark").Replace("懒人图库", "程序吧").Replace("lanrentuku.com", "qhttl.cn");
        }

        lock (Jses)
        {
            Jses.Enqueue(model);
        }
        lock (MsgList)
        {
            MsgList.Enqueue(new Model.MsgModel() { Type = CurrType, IsDown = false, Model = model });
        }
    }
}