使用Google Analytics跟踪搜索引擎的抓取记录_PHP_编程开发_程序员俱乐部

中国优秀的程序员网站程序员频道CXYCLUB技术地图
热搜:
更多>>
 
您所在的位置: 程序员俱乐部 > 编程开发 > PHP > 使用Google Analytics跟踪搜索引擎的抓取记录

使用Google Analytics跟踪搜索引擎的抓取记录

 2013/9/21 23:08:45  wbj0110  程序员俱乐部  我要评论(0)
  • 摘要:<?php/**Name:TrackingRobotsWithGoogleAnalytics*Author:biaodianfu*URI;http://www.biaodianfu.com/tracking-robots-with-google-analytics.html*/$utmac='UA-16811947-5';//输入GoolgleAnalytics配置生成的跟踪ID$domain='biaodianfu.com'
  • 标签:使用 Google 使用google 索引 搜索引擎
<?php
/*
 * Name: Tracking Robots With Google Analytics
 * Author: biaodianfu
 * URI: http://www.biaodianfu.com/tracking-robots-with-google-analytics.html
 *
 * Configuration for the robot tracker. The values below are read by the
 * top-level script further down when it builds the tracking request URL.
 */
$utmac = 'UA-16811947-5'; // Tracking ID generated in the Google Analytics settings (sent as utmac)
$domain = 'biaodianfu.com'; // Domain name of the site being tracked (input to domainHash())
$utmGifLocation = "http://www.google-analytics.com/__utm.gif"; // URL the tracking request is sent to
$utmv = "4.8.9"; // Google Analytics tracking version (sent as utmwv)
$title = ""; // Site/page title (sent as utmdt), e.g. the output of wp_title()

/*
 * Robots - references for the crawler user-agent strings matched below:
 *   Google: http://www.google.com/support/webmasters/bin/answer.py?hl=cn&answer=1061943
 *   Baidu:  http://tieba.baidu.com/club/9374916/p/10669831
 *   Yahoo:  http://en.wikipedia.org/wiki/Yahoo!_Slurp
 *   Bing:   http://www.bing.com/community/site_blogs/b/webmaster/archive/2009/07/17/new-bot-work-continues-at-bing.aspx
 *   SOSO:   http://help.soso.com/webspider.htm
 *
 * Map of regular-expression fragment => robot label. The consumer wraps each
 * key in '#...#i', so keys must not contain '#' and are matched
 * case-insensitively against the User-Agent header.
 *
 * Entries are ordered from most specific to most generic within each vendor
 * (e.g. 'Yahoo! Slurp China' before the generic Yahoo pattern) so that a
 * consumer which stops at the first match reports the most specific label.
 * The catch-all 'Unknown Robot' pattern must stay last.
 */
$bots = array(
    // Google
    'compatible; Googlebot/([0-9.]{1,10})?' => 'Google',
    'Googlebot/([0-9.]{1,10})?' => 'Google',
    'Googl(e|ebot)(-News)/([0-9.]{1,10})' => 'Google News',
    'Googl(e|ebot)(-News)/' => 'Google News',
    'Googl(e|ebot)(-Image)/([0-9.]{1,10})' => 'Google Image',
    'Googl(e|ebot)(-Image)/' => 'Google Image',
    'Googl(e|ebot)(-Video)/([0-9.]{1,10})' => 'Google Video',
    'Googl(e|ebot)(-Video)/' => 'Google Video',
    'Googl(e|ebot)(-Sitemaps)/([0-9.]{1,10})?' => 'Google-Sitemaps',
    'Googl(e|ebot)(-Sitemaps)' => 'Google-Sitemaps',
    'compatible; Googlebot-Mobile/([0-9.]{1,10})?' => 'Google Mobile',
    'Googl(e|ebot)(-Mobile)/([0-9.]{1,10})?' => 'Google Mobile',
    'compatible; Mediapartners-Google/([0-9.]{1,10})?' => 'Google Mediapartners',
    'Mediapartners-Google[ /]([0-9.]{1,10})' => 'Google Mediapartners',
    'Mediapartners-Google' => 'Google Mediapartners',
    '^AdsBot-Google' => 'Google-AdsBot',
    '^Feedfetcher-Google' => 'Google-Feedfetcher',

    // Baidu (the mobile gateway must precede the generic 'Baiduspider' pattern,
    // which would otherwise match it first under case-insensitive matching)
    'baiduspider-mobile-gate' => 'Baidu Mobile',
    'Baidu-Transcoder' => 'Baidu Mobile',
    'compatible; Baiduspider/([0-9.]{1,10})?' => 'Baidu',
    'Baiduspider' => 'Baidu',
    'BaiduCustomer' => 'Baidu Customer',
    'Baidu-Thumbnail' => 'Baidu Thumbnail',

    // Yahoo (specific crawlers first; the generic pattern also matches any 'Yahoo-' prefix)
    'Yahoo! Slurp China' => 'Yahoo China',
    'YahooFeedSeeker' => 'Yahoo Feed',
    'Yahoo-Blogs' => 'Yahoo Blog',
    'Yahoo ContentMatch Crawler' => 'Yahoo Ads',
    // No trailing space here: the crawler identifies itself as "Yahoo-MMCrawler/<version>".
    'Yahoo-MMCrawler' => 'Yahoo Image',
    'Yahoo(! ([a-z]{1,3} )?Slurp|-)' => 'Yahoo',

    // Microsoft
    'MSN(BOT|PTC)[ /]([0-9.]{1,10})' => 'MSN',
    'MS Search ([0-9.]{1,10}) Robot' => 'MSN',
    'MSNBOT_Mobile' => 'MSN Mobile',
    'MSMOBOT' => 'MSN Mobile',
    'MSNBOT-(MEDIA|PRODUCTS|ACADEMIC|NEWSBLOGS)[ /]([0-9.]{1,10})' => 'MS Live Search',
    'bingbot' => 'Bing',

    // Chinese search engines
    'Sosospider' => 'SoSo',
    'Sosoblogspider' => 'SoSo Blog',
    'Sosoimagespider' => 'SoSo IMAGE',
    'Sogou web spider[ /]([0-9.]{1,10})' => 'Sogou',
    'Sogou-Test-Spider[ /]([0-9.]{1,10})' => 'Sogou',
    'Sogou web robot' => 'Sogou',
    'Sogou orion spider[ /]([0-9.]{1,10})' => 'Sogou',
    'YodaoBot[ /]([0-9.]{1,10})' => 'Youdao',
    'YodaoBot-Image[ /]([0-9.]{1,10})' => 'Youdao Image',
    'YodaoBot-Reader[ /]([0-9.]{1,10})' => 'Youdao Reader',
    'QihooBot[ /]([0-9.]{1,10})' => 'Qihoo',
    'gougou' => 'GouGou',

    // Catch-all: anything that calls itself a robot/spider/crawler ("msiecrawler" excluded).
    '(robot|spider|harvest|bot|(?<!msie)crawler)' => 'Unknown Robot'
);

/*
 * Map of regular-expression fragment => operating-system label.
 *
 * The consumer wraps each key in '#...#msi' and matches it against the
 * User-Agent header; a User-Agent that names a desktop operating system is
 * taken to be a regular browser rather than a crawler. Only the keys are
 * used for matching; the labels are informational.
 */
$os = array(
    // Requires at least "win"; the previous 'wi(n|ndows)?' made the group
    // optional and therefore matched any "wi" (e.g. "Twitterbot").
    'win(dows)?' => 'windows',
    'linux[ /\-]([a-z0-9._]{1,10})' => 'linux',
    'linux' => 'linux',
    'Mac[ _]?OS[ _]?X[ /]([0-9.]{1,10})' => 'macosx',
    'Mac[ _]?OS[ _]?X' => 'macosx',
    'Mac 10\.([0-9.]{1,10})' => 'macosx',
    'Mac(_Power|intosh.+P)PC' => 'macppc',
    'beos[ a-z]*([0-9.]{1,10})' => 'beos',
    'beos' => 'beos',
    'fedora' => 'fedora',
    'free[ \-]?bsd[ /]([a-z0-9._]{1,10})' => 'freebsd',
    'free[ \-]?bsd' => 'freebsd',
    'open[ \-]?bsd[ /]([a-z0-9._]{1,10})' => 'openbsd',
    'open[ \-]?bsd' => 'openbsd',
    'PCLinuxOS[ /]?([0-9.]{1,10})' => 'pclinux',
    'ubuntu' => 'ubuntu'
);

/**
 * Compute the integer hash of a domain name.
 *
 * The result is used as the leading component of the __utma/__utmb/__utmc/
 * __utmz/__utmv values placed in the utmcc parameter of the tracking request.
 * NOTE(review): this appears to be a port of the domain hash from Google
 * Analytics' legacy tracking script - confirm before changing the constants.
 *
 * The string is walked from its last byte to its first. For each byte the
 * running value is shifted left by 6 and masked to 28 bits, the byte value
 * and the byte value shifted left by 14 are added, and if any of bits 21-27
 * are set they are folded back into the low bits with XOR.
 *
 * @param string $domain Domain name to hash.
 * @return int 1 when $domain is empty (or otherwise falsy), else the hash.
 */
function domainHash($domain) {
    if (!$domain) {
        return 1;
    }

    $h = 0;
    for ($i = strlen($domain) - 1; $i >= 0; $i--) {
        $c = ord($domain[$i]);
        $h = (($h << 6) & 0xfffffff) + $c + ($c << 14);
        $g = $h & 0xfe00000;
        if ($g != 0) {
            $h = $h ^ ($g >> 21);
        }
    }

    return $h;
}

/**
 * Send a GET request to the given tracking URL and discard the response body.
 *
 * Uses cURL when curl_exec() is available, otherwise falls back to
 * file_get_contents() with an HTTP stream context. In both cases the
 * visitor's User-Agent and Accept-Language headers are forwarded when the
 * current request supplied them, and short timeouts are applied so that a
 * slow tracking endpoint cannot stall the page being served.
 *
 * @param string $utmUrl Fully built tracking URL.
 * @return bool True when the request completed, false when it failed or no
 *              transport is available. Callers may ignore the result.
 */
function httpRequest($utmUrl) {
    // Crawlers frequently omit these headers, so never read them unguarded.
    $userAgent = isset($_SERVER['HTTP_USER_AGENT']) ? $_SERVER['HTTP_USER_AGENT'] : '';
    $acceptLanguage = isset($_SERVER['HTTP_ACCEPT_LANGUAGE']) ? $_SERVER['HTTP_ACCEPT_LANGUAGE'] : '';

    $connectTimeout = 2; // seconds
    $totalTimeout = 5;   // seconds

    if (function_exists('curl_exec')) {
        $ch = curl_init();
        if ($ch === false) {
            return false;
        }
        curl_setopt($ch, CURLOPT_URL, $utmUrl);
        curl_setopt($ch, CURLOPT_HEADER, 1);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $connectTimeout);
        curl_setopt($ch, CURLOPT_TIMEOUT, $totalTimeout);
        if ($userAgent !== '') {
            curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
        }
        if ($acceptLanguage !== '') {
            curl_setopt($ch, CURLOPT_HTTPHEADER, array('Accept-Language: ' . $acceptLanguage));
        }
        $data = curl_exec($ch);
        curl_close($ch);
        return $data !== false;
    }

    if (function_exists('file_get_contents')) {
        $http = array(
            'method' => 'GET',
            'timeout' => $totalTimeout,
        );
        if ($userAgent !== '') {
            $http['user_agent'] = $userAgent;
        }
        if ($acceptLanguage !== '') {
            // The header name is "Accept-Language" (it was misspelled "Accepts-Language").
            $http['header'] = 'Accept-Language: ' . $acceptLanguage;
        }
        $data = file_get_contents($utmUrl, false, stream_context_create(array('http' => $http)));
        return $data !== false;
    }

    return false;
}

/*
 * Report the current request to Google Analytics when it looks like a
 * search-engine crawler visit:
 *   - the request has no Referer and has a User-Agent,
 *   - the User-Agent matches none of the $os patterns (so it does not look
 *     like a regular desktop browser), and
 *   - the User-Agent matches one of the $bots patterns.
 *
 * Exactly one hit is sent per request, labelled with the first matching
 * $bots entry. (Previously the bot loop was nested inside the OS loop, so a
 * single crawler request produced one hit for every OS pattern that failed
 * to match multiplied by every bot pattern that matched.)
 */
$userAgent = isset($_SERVER['HTTP_USER_AGENT']) ? $_SERVER['HTTP_USER_AGENT'] : '';

if (empty($_SERVER['HTTP_REFERER']) && $userAgent) {
    // Does the User-Agent name a known operating system?
    $hasOsToken = false;
    foreach ($os as $patternos => $o) {
        if (preg_match('#' . $patternos . '#msi', $userAgent) === 1) {
            $hasOsToken = true;
            break;
        }
    }

    if (!$hasOsToken) {
        foreach ($bots as $patternbots => $bot) {
            if (preg_match('#' . $patternbots . '#i', $userAgent) !== 1) {
                continue;
            }

            // Robot label with whitespace collapsed to '-', used as the campaign source.
            $botname = preg_replace("/\\s{1,}/i", '-', $bot);

            // Values reused several times in the cookie string below.
            $hash = domainHash($domain);
            $now = time();

            $requestUri = isset($_SERVER['REQUEST_URI']) ? $_SERVER['REQUEST_URI'] : '/';
            $serverName = isset($_SERVER['SERVER_NAME']) ? $_SERVER['SERVER_NAME'] : $domain;
            $remoteAddr = isset($_SERVER['REMOTE_ADDR']) ? $_SERVER['REMOTE_ADDR'] : '';

            // gethostbyaddr() returns false on malformed input; fall back to the raw address.
            $robotHost = ($remoteAddr !== '') ? gethostbyaddr($remoteAddr) : false;
            if ($robotHost === false) {
                $robotHost = $remoteAddr;
            }

            // The utmcc value is a pre-encoded cookie string ('%3D' is '=',
            // '%3B%2B' is ';+', '%7C' is '|'); every dynamic piece inserted
            // into it must be percent-encoded as well.
            $utmUrl = $utmGifLocation . "?" .
                "utmwv=" . $utmv .
                "&utmn=" . rand(0, 0x7fffffff) .
                "&utmhn=" . urlencode($serverName) .
                "&utmdt=" . urlencode($title) .
                "&utmr=-" .
                "&utmp=" . urlencode($requestUri) .
                "&utmac=" . $utmac .
                "&utmcc=" .
                    '__utma%3D' . $hash . '.' . rand(0, 0x7fffffff) . '.' . $now . '.' . $now . '.' . $now . '.1%3B%2B' .
                    '__utmb%3D' . $hash . '%3B%2B' .
                    '__utmc%3D' . $hash . '%3B%2B' .
                    '__utmz%3D' . $hash . '.' . $now . '.1.1.utmccn%3D(organic)%7Cutmcsr%3D' . rawurlencode($botname) .
                        '%7Cutmctr%3D' . rawurlencode($requestUri) . '%7Cutmcmd%3Dorganic%3B%2B' .
                    '__utmv%3D' . $hash . '.Robot%20hostname%3A%20' . rawurlencode($robotHost) . '%3B';

            httpRequest($utmUrl);
            break; // one hit per request
        }
    }
}
?>

本方法适合使用虚拟主机的朋友。如果您自己有服务器,建议还是开启服务器日志并使用 AWStats 进行分析,因为这样才能真正了解蜘蛛的抓取情况,特别是对服务器状态码的分析统计。

以上代码参考了一个法文网站。由于原代码比较老(2008年的),其中收录的搜索引擎 User-Agent 也不太适合中国网站,而且百度近期修改了 User-Agent,所以我自己对代码做了修改。本代码还未经过测试,如果发现问题请及时联系。

发表评论
用户名: 匿名