多线程 Q群 号码爬虫_PHP_编程开发_程序员俱乐部

中国优秀的程序员网站程序员频道CXYCLUB技术地图
热搜:
更多>>
 
您所在的位置: 程序员俱乐部 > 编程开发 > PHP > 多线程 Q群 号码爬虫

多线程 Q群 号码爬虫

 2014/7/20 18:37:51  netkiller.github.com  程序员俱乐部  我要评论(0)
  • 摘要:通过空间历史浏览,爬出查看你空间的人(一般限制20人,除非开通黄钻),然后再爬出这20人的浏览记录,依次向下爬,你可以控制爬行深度。这里仅仅给出爬虫代码片段,你可以进一步优化,将Q群分类存储。通过Q群相互浏览关系,可以通过绘图工具绘制好友网络。等等。欢迎跟过讨论,请加Q群注明“读者”代码涉及pthreads如果不清楚请阅读:《PHP高级编程之多线程》http://netkiller.github.io/journal/thread.php.html标签:pthreadsPHP代码片段(1
  • 标签:多线程 线程
通过空间历史浏览,爬出查看你空间的人(一般限制20人,除非开通黄钻),然后再爬出这20人的浏览记录,依次向下爬,你可以控制爬行深度。
这里仅仅给出爬虫代码片段,你可以进一步优化,将Q群分类存储。通过Q群相互浏览关系,可以通过绘图工具绘制好友网络,等等。
欢迎跟过讨论,请加Q群注明“读者”

代码涉及 pthreads,如果不清楚请阅读:《PHP 高级编程之多线程》
http://netkiller.github.io/journal/thread.php.html 标签: pthreads PHP

代码片段(1)[全屏查看所有代码]

1. [代码][PHP]代码

<?php
/*
 * Homepage: http://netkiller.github.io
 * Author: Neo <netkiller@msn.com>
 */

// The crawler is built on the pthreads extension (Worker, Stackable, Pool);
// stop early with a readable message when it is missing.
if (!extension_loaded('pthreads')) {
    die('Please install pthreads');
}

include_once('Snoopy.class.php');

/**
 * Pool worker thread.
 *
 * Holds the (currently disabled) database handle that Crawler tasks can
 * obtain through getInstance().
 */
class CrawlerWorker extends Worker
{
    /** Database handle; stays null while the PDO setup in run() is commented out. */
    protected static $dbh;

    public function __construct()
    {
    }

    /**
     * Worker start-up hook. The database connection below is disabled;
     * uncomment and adjust it to persist crawl results.
     */
    public function run()
    {
        /*
        $dbhost = 'db.example.com';   // database server
        $dbuser = 'example.com';      // database user name
        $dbpw   = 'password';         // database password
        $dbname = 'example';          // database name

        self::$dbh = new PDO("mysql:host=$dbhost;port=3306;dbname=$dbname", $dbuser, $dbpw, array(
            PDO::MYSQL_ATTR_INIT_COMMAND => 'SET NAMES \'UTF8\'',
            PDO::MYSQL_ATTR_COMPRESS => true,
            PDO::ATTR_PERSISTENT => true
            )
        );
        */
    }

    /**
     * Returns the database handle set up in run(), or null when none was created.
     *
     * Public because Crawler::run() calls it through $this->worker; a
     * protected method cannot be called from that unrelated class.
     */
    public function getInstance()
    {
        return self::$dbh;
    }
}

/**
 * Crawl task submitted to the pool: starting from one QQ number, fetches
 * the Qzone visitor list and follows the visitors down to $depth levels.
 */
class Crawler extends Stackable
{
    /** Level at which recursion() stops descending. */
    public $depth = 3;

    /** Seed QQ number this task starts from. */
    public $qq;

    /**
     * @param string|int $qq seed QQ number
     */
    public function __construct($qq)
    {
        $this->qq = $qq;
    }

    /**
     * Task entry point: crawls outward from the seed QQ number.
     */
    public function run()
    {
        try {
            // Handle for persisting results; null while the worker's PDO setup is disabled.
            $dbh = $this->worker->getInstance();
            $this->recursion(array($this->qq));
        } catch (PDOException $e) {
            // Record which seed failed and why, so the task can be retried later.
            $error = sprintf("%s,%s\n", $this->qq, $e->getMessage());
            file_put_contents("mobile_error.log", $error, FILE_APPEND);
        }
        //printf("runtime: %s, %s\n", date('Y-m-d H:i:s'), $this->worker->getThreadId());
    }

    /**
     * Fetches and prints the visitor list of every number in $qqs, then
     * descends into each list until $level reaches $this->depth.
     *
     * The level is passed as an argument rather than kept in shared static
     * state, so every branch and every task counts its own depth.
     *
     * @param array $qqs   QQ numbers to visit at this level
     * @param int   $level current level; callers normally omit it
     */
    public function recursion($qqs, $level = 1)
    {
        printf("Level: %s\n", $level);

        // Random pause of 10 ms to 1 s between levels to throttle requests.
        usleep(mt_rand(10000, 1000000));

        if ($level >= $this->depth) {
            return;
        }

        foreach ($qqs as $uin) {
            $lst = $this->qzone($uin);
            print_r($lst);
            $this->recursion($lst, $level + 1);
        }
    }

    /**
     * Requests the visitor list of one QQ number from m.qzone.com.
     *
     * NOTE(review): g_tk, t and sid in the URL are hard-coded session
     * values; they presumably expire and must be refreshed - confirm.
     *
     * @param string|int $qq QQ number whose visitors are requested
     * @return array uin values from the response; empty when the request
     *               fails or the response has no data->list array
     */
    public function qzone($qq)
    {
        $url = 'http://m.qzone.com/mqz_get_visitor?g_tk=1191852101&res_mode=0&res_uin='.$qq.'&offset=0&count=100&page=1&format=json&t=1401762986882&sid=dODKVcYv6azjN87cxXQ5mao1xgakYjHg18c8aa5e0201%3D%3D';
        $snoopy = new Snoopy;

        // need a proxy?
        //$snoopy->proxy_host = "my.proxy.host";
        //$snoopy->proxy_port = "8080";

        // set browser and referer:
        $snoopy->agent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)";
        $snoopy->referer = "http://m.qzone.com/";

        // set some cookies:
        //$snoopy->cookies["SessionID"] = '238472834723489';
        //$snoopy->cookies["favoriteColor"] = "blue";

        // set a raw header:
        $snoopy->rawheaders["Pragma"] = "no-cache";

        // set some internal variables:
        $snoopy->maxredirs = 2;
        $snoopy->offsiteok = false;
        $snoopy->expandlinks = false;

        // set username and password (optional)
        //$snoopy->user = "joe";
        //$snoopy->pass = "bloe";

        // fetch the visitor list as text:
        if (!$snoopy->fetchtext($url)) {
            print "Snoopy: error while fetching document: ".$snoopy->error."\n";
            // Always hand back an array so the caller can iterate safely.
            return array();
        }

        $results = array();
        $tmp = json_decode($snoopy->results);

        // Only walk the list when the response really carries data->list.
        if (is_object($tmp) && isset($tmp->data->list) && is_array($tmp->data->list)) {
            foreach ($tmp->data->list as $lst) {
                $results[] = $lst->uin;
            }
        }

        return $results;
    }
}

$pool = new Pool(100, \CrawlerWorker::class, []);

#foreach (range(1000, 100000) as $number) {
#   $pool->submit(new Crawler($number));
#}

$pool->submit(new Crawler('13721218'));
$pool->submit(new Crawler('291379'));
//$pool->submit(new Crawler('xxx'));
//$pool->submit(new Crawler('xxx'));
//$pool->submit(new Crawler('xxx'));
// and so on
//$pool->submit(new Crawler('nnn'));

$pool->shutdown();
?>
发表评论
用户名: 匿名