之前,本博客有发过一篇文章,写的是用php爬某博客的文章(当然,在项目中,还是用火车头这样专业工具去爬咯~~~但是在学习的过程中,还是要多重复造轮子的),当时爬那300多页文章,共3k篇左右文章(包括图片),单线程需要好几个小时,当时也是用面向过程方式写的。。。对于多线程的研究,始于项目测QPS的需要。其实,在小公司真的很苦逼,没有专业的运维团队,服务器环境搭建部署,都要我这个后端程序员完成。环境采用nginx软负载均衡内网的apache webserver,推广运营预估了用户量,然后就开始测试当前服务器配置的QPS,一开始用apache的ab测,发现有太多局限性,后来,我们的资深Android客户端工程师帮我用java写了个多线程压力测试,于是,我就决定去找找php有没有多线程呢~~~
php作为一个解释型语言,zend引擎没有内置的多线程模块,那可以用c/cpp开发扩展来实现吗?果然,就找到了pthreads这个扩展。这个扩展支持php5.3以后的版本;因为php7对zval有过改动,要支持php7应该需要改写,所以应该是不支持的吧。pthreads扩展装起来…
都是爬的同一个网站(www.heu8.com),同样的页数(包括下载图片,写入mysql数据库)。多线程花了75秒,单线程花了201秒。(地址soft.feehi.com)
上代码(多线程版爬虫,oop封装,heu8.php为执行脚本,spider.php为爬虫类,pthread.php为多线程类):
heu8.php
<?php
/**
 * heu8.php - command-line entry point of the multi-threaded crawler.
 *
 * Usage: php heu8.php [start] [length]
 *
 * Configures a \fee\Spider instance (database target, replacement table,
 * image hosts, regular expressions) and starts the crawl.
 */
require_once "spider.php";

// true: also echo every log line to the console.
$spider = new \fee\Spider(true);

// Spider::run() walks the listing pages downwards: it begins at page $start
// and keeps going while the page number is greater than $length.
$start  = isset($argv[1]) ? (int) $argv[1] : 2; // first (highest) page number
$length = isset($argv[2]) ? (int) $argv[2] : 0; // exclusive lower bound of the page range

// Database connection settings and the destination table.
$spider->db = [
    'dsn'   => "mysql:host=localhost;dbname=spider",
    'user'  => 'root',
    'pass'  => '',
    'table' => 'heu8_thread',
];

// Base URL of the listing pages; the page number is appended to it.
$url = "http://www.heu8.com/page/";

// After an article is fetched, every key found in its body is replaced by the value.
$spider->setReplace([
    'http://www.heu8.com/wp-content/themes/BYMT/images/images_error.jpg' => 'http://img.feehi.com/public/loadImgError.jpg',
]);

// Host part of the stored thumbnail URLs (the images are later served from Qiniu); empty by default.
$spider->thumbImgUrl = 'http://7othwv.com1.z0.glb.clouddn.com/soft/';
// Host part of the stored in-article image URLs (also served from Qiniu); empty by default.
$spider->imgUrl = 'http://7othwv.com1.z0.glb.clouddn.com/soft/';

// Regular expressions, in order: article links on a listing page, thumbnail URLs
// on a listing page, article title, article body, image URLs inside the body.
// "/" is the delimiter, so the "/" of closing tags has to be escaped.
$pattern = [
    'list'        => '/<h2 class="post-title">\s*<a href="(.*)"/U',
    'list_img'    => '/<div class="post-thumbnail">.*<a href=".*" rel="bookmark" title=".*">.*<img src="(.*)"/isU',
    'title'       => '/<h2 class="post-title">(.*)<\/h2>/U',
    'content'     => '/<div class="post-content">(.*)<\/div>/isU',
    'content_img' => '/<img.*src="(.*)"/isU',
];

// Start crawling.
$spider->run($url, $pattern, $start, $length);

echo "抓取完成,下载图片/日志文件,请在脚本同级目录查找\r\n";
spider.php
<?php
namespace fee;

set_time_limit(0); // lift the execution time limit
date_default_timezone_set('PRC');
// NOTE(review): the post lists this file as "pthread.php"; on a case-sensitive
// filesystem the name used here must match the file on disk - confirm.
include "Pthread.php";

/**
 * Crawls listing pages one after another, downloads the thumbnails found on
 * each page and starts one Pthread worker per article link.
 */
class Spider
{
    private $db = false;        // database connection settings; per the original note, a text record is written when this is not configured
    private $logTxt;            // log file name, defaults to the time the script was started
    private $echoLog;           // whether log lines are also echoed to the console (off by default)
    private $replaceFind = [];  // strings to search for in article bodies
    private $replaceArray = []; // replacement for the entry of $replaceFind at the same index
    private $imgUrl = '';       // prefix of stored in-article image URLs
    private $thumbImgUrl = '';  // prefix of stored thumbnail URLs

    /**
     * @param bool   $echoLog echo log lines to the console as well
     * @param string $logTxt  log file name; empty means "<current date and time>.txt"
     */
    public function __construct($echoLog = false, $logTxt = '')
    {
        $this->echoLog = $echoLog;
        $this->logTxt  = ($logTxt == '') ? date('Y-m-d H-i-s') . '.txt' : $logTxt;
    }

    /** Lets callers assign the private settings (db, imgUrl, thumbImgUrl, ...) from outside. */
    public function __set($k, $v)
    {
        $this->$k = $v;
    }

    /** Lets callers (including the worker threads) read the private settings. */
    public function __get($k)
    {
        return $this->$k;
    }

    /**
     * Registers the search => replacement pairs applied to article bodies.
     *
     * @param array $replace map of search string => replacement string
     */
    public function setReplace($replace)
    {
        foreach ($replace as $find => $substitute) {
            $this->replaceFind[]  = $find;
            $this->replaceArray[] = $substitute;
        }
    }

    /**
     * Crawls the listing pages from $start downwards while the page number is
     * greater than $length, then waits for every worker thread to finish.
     *
     * @param string $url     listing URL prefix; the page number is appended
     * @param array  $pattern regexes keyed by list, list_img, title, content, content_img
     * @param int    $start   first (highest) page number
     * @param int    $length  exclusive lower bound of the page range
     */
    public function run($url, $pattern = [], $start = 2, $length = 0)
    {
        $this->log("抓取开始");
        $startTime    = microtime(true);
        $articleCount = 0;
        // Every started worker is kept here, so workers of earlier pages are
        // awaited as well and the list exists even when no page is crawled.
        $threads = [];

        for ($i = $start; $i > $length; $i--) {
            $this->log("正在分析第{$i}页");
            $content = file_get_contents($url . $i);
            if ($content === false) {
                // The listing page could not be fetched; nothing to analyse.
                $this->log("第{$i}页抓取失败,已跳过");
                continue;
            }

            preg_match_all($pattern['list'], $content, $matches);
            $this->log("第{$i}页找到" . count($matches[1]) . "篇文章");
            $articleCount += count($matches[1]);

            // Thumbnails are downloaded in this thread, in reversed page order
            // so their indexes line up with the reversed article list below.
            preg_match_all($pattern['list_img'], $content, $matchesThumb);
            $thumbPic = [];
            foreach (array_reverse($matchesThumb[1]) as $thumbK => $thumbV) {
                $this->log("正在下载第{$i}页的第" . ($thumbK + 1) . "张缩略图");
                $thumbPic[] = $this->downloadThumb($thumbV);
            }

            // Create all workers of this page first, then start them.
            $pagePool = [];
            foreach (array_reverse($matches[1]) as $k => $v) {
                $pagePool[] = new Pthread($v /* url */, $i /* page */, $k /* article index */, $pattern, $this, $thumbPic);
            }
            foreach ($pagePool as $worker) {
                $worker->start();
                $threads[] = $worker;
            }
        }

        // join() blocks until the worker has ended, however it ended, so no
        // CPU is burnt polling a flag and a worker that stops early cannot
        // hang the crawl.
        foreach ($threads as $worker) {
            $worker->join();
        }

        $endTime    = microtime(true);
        $intvalTime = $endTime - $startTime;
        $totalPage  = $start - $length;
        $this->log("共分析{$totalPage}页,抓取{$articleCount}篇文章,耗时{$intvalTime}s");
    }

    /**
     * Appends a time-stamped line to the log file and, when enabled, echoes it
     * to the console converted to GB2312.
     *
     * @param string $log message text (UTF-8)
     */
    public function log($log)
    {
        $stamp = date('Y-m-d H:i:s');
        file_put_contents($this->logTxt, "{$stamp} {$log}\r\n", FILE_APPEND);

        if ($this->echoLog) {
            $consoleLog = iconv('utf-8', 'GB2312', $log);
            if ($consoleLog === false) {
                // Conversion failed: show the unconverted text rather than nothing.
                $consoleLog = $log;
            }
            echo "{$stamp} {$consoleLog}\r\n";
        }
    }

    /**
     * Downloads one thumbnail into thumb/<date>/ next to this file.
     *
     * The URL is returned even when the download fails, so the thumbnail list
     * keeps one entry per article.
     *
     * @param string $thumbUrl remote thumbnail URL
     * @return string thumbImgUrl prefix + relative path of the stored file
     */
    private function downloadThumb($thumbUrl)
    {
        $dataThumb = file_get_contents($thumbUrl);
        $infoThumb = pathinfo($thumbUrl);
        $filePath  = 'thumb/' . date('Y-m-d') . '/';
        $directory = dirname(__FILE__) . '/' . $filePath;
        if (!is_dir($directory)) {
            mkdir($directory, 0777, true);
        }

        // Random prefix keeps files with the same base name apart.
        $fileName = rand(0, 10000) . '_' . urlencode($infoThumb['basename']);

        // Best effort, as before: a failed download is logged, not fatal.
        // file_put_contents() also closes the file, unlike the bare fopen().
        if ($dataThumb === false || @file_put_contents($directory . $fileName, $dataThumb) === false) {
            $this->log("缩略图下载失败:{$thumbUrl}");
        }

        return $this->thumbImgUrl . $filePath . $fileName;
    }
}
pthread.php
<?php
namespace fee;

/**
 * Worker thread that fetches a single article, downloads the images it
 * contains and stores the result.
 *
 * Requires the pthreads extension, which provides the global \Thread class.
 */
class Pthread extends \Thread
{
    public $url;       // article URL
    public $page;      // number of the listing page the article was found on
    public $which;     // article index on that page; becomes the 1-based number once run() starts
    private $db;       // unused; kept for compatibility
    public $enddd = 0; // set to 1 when run() is done
    public $pattern;   // regexes keyed by title, content, content_img
    public $spider;    // the Spider that created this worker (settings and logging)
    public $thumbPic;  // thumbnail URLs of the listing page, indexed like the articles

    /**
     * @param string $url      article URL
     * @param int    $page     listing page number
     * @param int    $which    0-based index of the article on the listing page
     * @param array  $pattern  regexes keyed by title, content, content_img
     * @param Spider $spider   crawler providing db, imgUrl, replaceFind, replaceArray and log()
     * @param array  $thumbPic thumbnail URLs of the listing page
     */
    public function __construct($url, $page, $which, $pattern, $spider, $thumbPic)
    {
        $this->url      = $url;
        $this->page     = $page;
        $this->which    = $which;
        $this->pattern  = $pattern;
        $this->spider   = $spider;
        $this->thumbPic = $thumbPic;
    }

    /**
     * Thread body: skips URLs that are already stored, otherwise fetches the
     * article, localises its images, applies the replacement table and writes
     * the record to the database - or to spider.txt when no database is configured.
     */
    public function run()
    {
        $config = $this->spider->db;
        $db     = false;
        $table  = '';

        if (is_array($config)) {
            try {
                $db = new \PDO($config['dsn'], $config['user'], $config['pass']);
                $db->query("set names utf8");
            } catch (\Exception $e) {
                echo $e->getMessage();
                $this->enddd = 1; // mark as finished so a caller polling the flag does not wait forever
                return;
            }
            $table = $config['table'];

            try {
                // NOTE(review): this increments checked_times on every row of the table,
                // not only on the row of this URL - confirm that is intended.
                $db->query("update {$table} set checked_times=checked_times+1");

                // The URL comes from a scraped page, so it is bound, not interpolated.
                $lookup = $db->prepare("select * from {$table} where url = ?");
                $row    = ($lookup !== false && $lookup->execute([$this->url]))
                    ? $lookup->fetch(\PDO::FETCH_ASSOC)
                    : false;
            } catch (\PDOException $e) {
                $this->spider->log("{$this->url}查重失败:" . $e->getMessage());
                $row = false;
            }

            if (is_array($row)) {
                $this->spider->log("{$this->url}在" . date('Y-m-d H:i:s', $row['created_at']) . '已经抓取过了,本次未抓取。');
                $this->enddd = 1;
                return;
            }
        }

        // Keep the 0-based index for the thumbnail lookup. $which itself turns
        // into the 1-based article number used in the log lines; using that
        // number as the index picked the thumbnail of the next article.
        $index       = $this->which;
        $this->which = $index + 1;
        $this->spider->log("正在分析{$this->page}页第" . ($this->which) . "篇文章…");

        $content2 = file_get_contents($this->url);
        if ($content2 === false) {
            $this->spider->log("{$this->url}抓取失败");
            $this->enddd = 1;
            return;
        }

        preg_match($this->pattern['title'], $content2, $titleMatch);
        $title = isset($titleMatch[1]) ? $titleMatch[1] : '';
        $this->spider->log("抓取 {$this->which} 篇文章 {$this->url}成功。标题:{$title}…");

        preg_match($this->pattern['content'], $content2, $bodyMatch);
        $article = isset($bodyMatch[1]) ? $bodyMatch[1] : '';

        // Download every image of the body and point the body at the stored copy.
        preg_match_all($this->pattern['content_img'], $article, $pics);
        $picTotal = count($pics[1]);
        foreach ($pics[1] as $picIndex => $picUrl) {
            $this->spider->log('本页包含' . $picTotal . "张图,正在下载第" . ($picIndex + 1) . "张…");
            $article = str_replace($picUrl, $this->saveImage($picUrl), $article);
        }

        // Read the list through __get first: isset() on the private property from
        // here is always false because Spider defines no __isset, which would
        // silently skip every replacement.
        $replaceFind = $this->spider->replaceFind;
        if (!empty($replaceFind)) {
            $article = str_replace($replaceFind, $this->spider->replaceArray, $article);
        }

        $time  = time();
        $thumb = isset($this->thumbPic[$index]) ? $this->thumbPic[$index] : '';

        if ($db !== false) {
            try {
                // Bound parameters: title and body are scraped text and must not be
                // spliced into the statement. No addslashes() is needed any more.
                $insert = $db->prepare(
                    "insert into {$table}(title,content,thumb,created_at,url) values(?,?,?,?,?)"
                );
                $stored = $insert !== false
                    && $insert->execute([$title, $article, $thumb, $time, $this->url]);
            } catch (\PDOException $e) {
                $stored = false;
            }

            if ($stored) {
                $this->spider->log("{$this->url}入库成功…");
            } else {
                $this->spider->log("{$this->url}入库失败…");
            }
        } else {
            // No database configured: append the record to a text file instead.
            file_put_contents(
                'spider.txt',
                $title . ' ' . addslashes($article) . ' ' . $thumb . ' ' . $time . ' ' . $this->url,
                FILE_APPEND
            );
        }

        $this->enddd = 1;
    }

    /**
     * Downloads one in-article image into uploads/<date>/ next to this file.
     *
     * @param string $picUrl image URL as it appears in the article body
     * @return string the Spider's imgUrl prefix + "<date>/<stored file name>"
     */
    private function saveImage($picUrl)
    {
        $data      = file_get_contents($picUrl);
        $info      = pathinfo($picUrl);
        $filePath  = 'uploads/' . date('Y-m-d') . '/';
        $directory = dirname(__FILE__) . '/' . $filePath;

        // Several workers may try to create the directory at the same moment;
        // a failed mkdir() is fine as long as the directory exists afterwards.
        if (!is_dir($directory) && !@mkdir($directory, 0777, true) && !is_dir($directory)) {
            $this->spider->log("无法创建目录:{$directory}");
        }

        // Random prefix keeps files with the same base name apart.
        $fileName = rand(0, 10000) . '_' . urlencode($info['basename']);

        // Best effort: a failed download is logged, the article is still stored.
        // file_put_contents() also closes the file, unlike the bare fopen().
        if ($data === false || @file_put_contents($directory . $fileName, $data) === false) {
            $this->spider->log("图片下载失败:{$picUrl}");
        }

        // The public URL omits the leading "uploads/" segment.
        $filePathNoUpload = str_replace('uploads/', '', $filePath);

        return $this->spider->imgUrl . $filePathNoUpload . $fileName;
    }
}
当然这个多线程爬虫还没有很完善,分析列表页后,会创建该列表页文章数目的线程爬取,而在爬列表页缩略图的时候依然是单线程。不过,如果分析列表页就用多线程,站长估计会误认为是攻击,而封我ip了,还有一个没有解决的问题是,多线程爬文章的顺序打乱了,先返回结果的线程先入库。