之前,本博客发过一篇文章,写的是用PHP爬某博客的文章(当然,在实际项目中,还是会用火车头这样的专业工具去爬;但在学习的过程中,还是要多重复造轮子的)。当时爬那300多页、共3k篇左右的文章(包括图片),单线程需要好几个小时,而且当时也是用面向过程的方式写的。对于多线程的研究,始于项目测QPS的需要。其实,在小公司真的很苦逼,没有专业的运维团队,服务器环境的搭建和部署,都要我这个后端程序员完成。环境采用nginx做软负载均衡,后端是内网的Apache web server。推广运营预估了用户量,然后我就开始测试当前服务器配置的QPS。一开始用Apache的ab测,发现有太多局限性;后来,我们的资深Android客户端同事帮我用Java写了个多线程压力测试工具。于是,我就决定去找找PHP有没有多线程。
PHP作为一门解释型语言,Zend引擎没有内置的多线程模块,那么能不能用C/C++开发的扩展来实现呢?果然,找到了pthreads这个扩展(下文简称pthread)。这个扩展支持PHP 5.3以后的版本;因为PHP 7对zval有过改动,要支持PHP 7应该需要改写,所以当前版本应该是不支持的吧。pthread扩展装起来……
下面对比多线程版和单线程版:两者爬的都是同一个网站(www.heu8.com)、同样的页数(包括下载图片、写入MySQL数据库)。多线程花了75秒,单线程花了201秒。(地址:soft.feehi.com)

上代码(多线程版爬虫,oop封装,heu8.php为执行脚本,spider.php为爬虫类,pthread.php为多线程类):
heu8.php
<?php
// Driver script for the multi-threaded crawler.
// Usage: php heu8.php [start] [length]
//   start  - list page number to begin with (default 2)
//   length - exclusive lower bound: pages are crawled from `start` down to
//            `length + 1`, so (start - length) pages are processed (default 0)
require_once "spider.php";

$spider = new \fee\Spider( true );

$start = isset( $argv[1] ) ? (int)$argv[1] : 2;// first (highest) page number to crawl
$length = isset( $argv[2] ) ? (int)$argv[2] : 0;// crawl stops before reaching this page number

// Database settings; articles are inserted into `table`.
$spider->db = [
    'dsn' => "mysql:host=localhost;dbname=spider",
    'user' => 'root',
    'pass' => '',
    'table' => 'heu8_thread',
];

$url = "http://www.heu8.com/page/";

// Post-processing of article bodies: every key found in the text is replaced by its value.
$spider->setReplace( [
    'http://www.heu8.com/wp-content/themes/BYMT/images/images_error.jpg' => 'http://img.feehi.com/public/loadImgError.jpg',
] );

// Host prefix for stored thumbnail URLs (images are hosted on Qiniu); empty by default.
$spider->thumbImgUrl = 'http://7othwv.com1.z0.glb.clouddn.com/soft/';
// Host prefix for stored in-article image URLs (images are hosted on Qiniu); empty by default.
$spider->imgUrl = 'http://7othwv.com1.z0.glb.clouddn.com/soft/';

// In order: article links on a list page, thumbnail URLs on a list page,
// article title, article body, image URLs inside the article body.
$pattern = [
    'list' => '/<h2 class="post-title">\s*<a href="(.*)"/U',
    'list_img' => '/<div class="post-thumbnail">.*<a href=".*" rel="bookmark" title=".*">.*<img src="(.*)"/isU',
    'title' => '/<h2 class="post-title">(.*)<\/h2>/U',
    'content' => '/<div class="post-content">(.*)<\/div>/isU',
    'content_img' => '/<img.*src="(.*)"/isU',
];

$spider->run( $url , $pattern , $start , $length );// start crawling
echo "抓取完成,下载图片/日志文件,请在脚本同级目录查找\r\n";
spider.php
<?php
namespace fee;
set_time_limit(0);// no execution time limit: a crawl can run for a long time
date_default_timezone_set('PRC');
// The worker class lives in pthread.php (lower-case). The path is anchored to this
// file's directory so the include works regardless of the current working directory,
// and require_once fails loudly instead of continuing without the Pthread class.
require_once dirname(__FILE__).'/pthread.php';
/**
 * Crawls paginated list pages, downloads their thumbnails and hands every
 * article URL to a Pthread worker thread.
 *
 * All properties are private but exposed through __get/__set/__isset so the
 * driver script and the worker threads can configure and read them.
 */
class Spider{
    // Database settings (keys: dsn, user, pass, table); false means "not configured".
    private $db = false;
    // Log file name; defaults to the time the script was started.
    private $logTxt;
    // Whether log lines are also echoed to the console; off by default.
    private $echoLog;
    // Search strings and their replacements, kept as two parallel lists for str_replace().
    private $replaceFind = [];
    private $replaceArray = [];
    // Prefix put in front of stored in-article image paths.
    private $imgUrl = '';
    // Prefix put in front of stored thumbnail paths.
    private $thumbImgUrl = '';

    /**
     * @param bool   $echoLog Echo log lines to the console as well as to the log file.
     * @param string $logTxt  Log file name; empty means "<current date and time>.txt".
     */
    public function __construct( $echoLog = false , $logTxt='' ){
        $this->echoLog = $echoLog;
        if( (string)$logTxt === '' ){
            $this->logTxt = date('Y-m-d H-i-s').'.txt';
        }else{
            $this->logTxt = $logTxt;
        }
    }

    /** Allows outside code to assign the private configuration properties. */
    public function __set( $k , $v ){
        $this->$k = $v;
    }

    /** Allows outside code to read the private configuration properties. */
    public function __get( $k ){
        return $this->$k;
    }

    /**
     * Makes isset()/empty() work on the private properties from outside;
     * without this method isset($spider->replaceFind[0]) is always false.
     */
    public function __isset( $k ){
        return isset( $this->$k );
    }

    /**
     * Registers text replacements applied to article bodies.
     *
     * @param array $replace Map of search string => replacement string.
     */
    public function setReplace( $replace ){
        foreach( $replace as $find => $replacement ){
            $this->replaceFind[] = $find;
            $this->replaceArray[] = $replacement;
        }
    }

    /**
     * Crawls list pages $start, $start-1, ... down to $length+1.
     *
     * For every list page the thumbnails are downloaded in this thread, then one
     * Pthread worker per article link is started. Workers of all pages are joined
     * before the summary line is logged.
     *
     * @param string $url     List page URL prefix; the page number is appended to it.
     * @param array  $pattern Regular expressions keyed 'list' and 'list_img' (used here)
     *                        plus 'title', 'content', 'content_img' (used by the workers).
     * @param int    $start   First (highest) page number to crawl.
     * @param int    $length  Exclusive lower bound of the page numbers.
     */
    public function run( $url , $pattern=[] , $start=2 , $length=0 ){
        $this->log( "抓取开始" );
        $startTime = microtime( true );
        $articleCount = 0;
        // Workers of every page are collected here so that all of them are awaited,
        // not only the ones started for the last page.
        $pool = [];
        for( $i = $start ; $i > $length ; $i-- ){
            $this->log( "正在分析第{$i}页" );
            $content = file_get_contents( $url.$i );
            if( $content === false ){
                $this->log( "第{$i}页下载失败,已跳过" );
                continue;
            }
            preg_match_all( $pattern['list'] , $content , $matches );
            $links = empty( $matches[1] ) ? [] : $matches[1];
            $this->log( "第{$i}页找到".count( $links )."篇文章" );
            $articleCount += count( $links );

            preg_match_all( $pattern['list_img'] , $content , $matchesThumb );
            $thumbUrls = empty( $matchesThumb[1] ) ? [] : $matchesThumb[1];
            // Links and thumbnails are both reversed so equal indexes still belong together.
            $thumbPic = $this->downloadThumbs( array_reverse( $thumbUrls ) , $i );

            $pageWorkers = [];
            foreach( array_reverse( $links ) as $k => $v ){
                $pageWorkers[] = new Pthread( $v/**url**/ , $i/**page**/ , $k/**article index**/ , $pattern , $this , $thumbPic );
            }
            foreach( $pageWorkers as $worker ){
                $worker->start();
                $pool[] = $worker;
            }
        }
        // Block until every worker has finished instead of spinning on a flag.
        foreach( $pool as $worker ){
            $worker->join();
        }
        $endTime = microtime( true );
        $intvalTime = $endTime - $startTime;
        $totalPage = max( 0 , $start - $length );
        $this->log( "共分析{$totalPage}页,抓取{$articleCount}篇文章,耗时{$intvalTime}s" );
    }

    /**
     * Downloads the thumbnails of one list page into thumb/<date>/ next to this file.
     *
     * @param array $thumbUrls Thumbnail URLs, in the same order as the article links.
     * @param int   $page      Page number, used in log messages only.
     * @return array Public thumbnail URL per input index; '' where the download failed,
     *               so indexes stay aligned with the article links.
     */
    private function downloadThumbs( $thumbUrls , $page ){
        $thumbPic = [];
        $filePath = 'thumb/'.date('Y-m-d').'/';
        $dir = dirname(__FILE__).'/'.$filePath;
        foreach( $thumbUrls as $index => $thumbUrl ){
            $this->log( "正在下载第{$page}页的第".($index+1)."张缩略图" );
            $thumbPic[$index] = '';
            $data = file_get_contents( $thumbUrl );
            if( $data === false ){
                $this->log( "缩略图{$thumbUrl}下载失败" );
                continue;
            }
            // The second is_dir() covers another thread creating the directory in between.
            if( !is_dir( $dir ) && !mkdir( $dir , 0777 , true ) && !is_dir( $dir ) ){
                $this->log( "无法创建目录{$dir}" );
                continue;
            }
            $info = pathinfo( $thumbUrl );
            // Random prefix lowers the chance of two images with the same name colliding.
            $name = rand(0,10000).'_'.urlencode( $info['basename'] );
            if( file_put_contents( $dir.$name , $data ) === false ){
                $this->log( "缩略图{$thumbUrl}保存失败" );
                continue;
            }
            $thumbPic[$index] = $this->thumbImgUrl.$filePath.$name;
        }
        return $thumbPic;
    }

    /**
     * Appends a timestamped line to the log file and optionally echoes it.
     *
     * @param string $log UTF-8 message text.
     */
    public function log( $log ){
        $stamp = date('Y-m-d H:i:s');
        // LOCK_EX keeps lines written by concurrent worker threads from interleaving.
        file_put_contents( $this->logTxt , "{$stamp} {$log}\r\n" , FILE_APPEND | LOCK_EX );
        if( !$this->echoLog ){
            return;
        }
        // Console output is converted to GB2312; fall back to the raw text if that fails.
        $console = iconv( 'utf-8' , 'GB2312' , $log );
        if( $console === false ){
            $console = $log;
        }
        echo "{$stamp} {$console}\r\n";
    }
}
?>
pthread.php
<?php
// pthread.php: declares the Pthread worker thread class in the `fee` namespace.
namespace fee;
/**
 * Worker thread that fetches one article page, downloads its images, applies the
 * spider's text replacements and stores the result in the database (or in
 * spider.txt when the spider has no database configured).
 */
class Pthread extends \Thread{
    // URL of the article page to fetch.
    public $url;
    // Number of the list page the article was found on (used in log messages).
    public $page;
    // Zero-based position of the article on its list page; also its key in $thumbPic.
    public $which;
    // Set to 1 when run() has finished, on every exit path.
    public $enddd = 0;
    // Regular expressions keyed 'title', 'content' and 'content_img'.
    public $pattern;
    // The Spider that created this worker; provides configuration and logging.
    public $spider;
    // Thumbnail URLs of the list page, indexed like $which.
    public $thumbPic;

    public function __construct( $url , $page , $which , $pattern , $spider , $thumbPic){
        $this->url = $url;
        $this->page = $page;
        $this->which = $which;
        $this->pattern = $pattern;
        $this->spider = $spider;
        $this->thumbPic = $thumbPic;
    }

    /**
     * Thread entry point. Failures are logged rather than terminating the process,
     * and $enddd is always set so that code polling the flag cannot wait forever.
     */
    public function run( ){
        try{
            $this->crawl();
        }catch( \Exception $e ){
            $this->spider->log( "{$this->url}抓取失败:".$e->getMessage() );
        }
        $this->enddd = 1;
    }

    /**
     * Fetches, post-processes and stores the article.
     *
     * @throws \PDOException when the configured database cannot be reached or queried.
     */
    private function crawl(){
        $config = $this->spider->db;
        $db = false;
        $table = '';
        if( is_array( $config ) ){
            $db = new \PDO( $config['dsn'] , $config['user'] , $config['pass'] , [ \PDO::ATTR_ERRMODE => \PDO::ERRMODE_EXCEPTION ] );
            $db->exec( "set names utf8" );
            // The table name comes from the spider's own configuration and cannot be
            // bound as a parameter; every scraped value below is bound instead.
            $table = $config['table'];
            // NOTE(review): no WHERE clause, so this increments checked_times on every
            // row for every article processed - confirm that this is intended.
            $db->exec( "update {$table} set checked_times=checked_times+1" );
            $select = $db->prepare( "select * from {$table} where url=?" );
            $select->execute( [ $this->url ] );
            $row = $select->fetch( \PDO::FETCH_ASSOC );
            if( is_array( $row ) ){
                $this->spider->log( "{$this->url}在".date('Y-m-d H:i:s',$row['created_at']).'已经抓取过了,本次未抓取。' );
                return;
            }
        }

        // One-based number for log messages; $this->which stays zero-based because it
        // is also the thumbnail index.
        $number = $this->which + 1;
        $this->spider->log( "正在分析{$this->page}页第".$number."篇文章…" );
        $html = file_get_contents( $this->url );
        if( $html === false ){
            $this->spider->log( "{$this->url}下载失败,已跳过。" );
            return;
        }
        if( !preg_match( $this->pattern['title'] , $html , $found ) ){
            $this->spider->log( "{$this->url}未匹配到标题,已跳过。" );
            return;
        }
        $title = $found[1];
        $this->spider->log( "抓取 {$number} 篇文章 {$this->url}成功。标题:{$title}…" );
        if( !preg_match( $this->pattern['content'] , $html , $found ) ){
            $this->spider->log( "{$this->url}未匹配到正文,已跳过。" );
            return;
        }
        $article = $this->downloadImages( $found[1] );

        // Read into a local first: isset()/empty() on the spider's magic properties
        // is unreliable when the Spider class defines no __isset().
        $find = $this->spider->replaceFind;
        if( is_array( $find ) && count( $find ) > 0 ){
            $article = str_replace( $find , $this->spider->replaceArray , $article );
        }

        $time = time();
        $thumb = isset( $this->thumbPic[$this->which] ) ? $this->thumbPic[$this->which] : '';
        if( $db !== false ){
            try{
                $insert = $db->prepare( "insert into {$table}(title,content,thumb,created_at,url) values(?,?,?,?,?)" );
                $insert->execute( [ $title , $article , $thumb , $time , $this->url ] );
                $this->spider->log( "{$this->url}入库成功…" );
            }catch( \PDOException $e ){
                $this->spider->log( "{$this->url}入库失败…" );
            }
        }else{
            // No database configured: append the record to a text file instead.
            file_put_contents( 'spider.txt' , $title.' '.$article.' '.$thumb.' '.$time.' '.$this->url."\r\n" , FILE_APPEND | LOCK_EX );
        }
    }

    /**
     * Downloads every image referenced in the article body into uploads/<date>/ next
     * to this file and rewrites the references to the spider's image URL prefix.
     * Images that cannot be downloaded or saved keep their original URL.
     *
     * @param string $article Article body HTML.
     * @return string Article body with rewritten image URLs.
     */
    private function downloadImages( $article ){
        preg_match_all( $this->pattern['content_img'] , $article , $pics );
        $sources = empty( $pics[1] ) ? [] : $pics[1];
        $total = count( $sources );
        $filePath = 'uploads/'.date('Y-m-d').'/';
        $dir = dirname(__FILE__).'/'.$filePath;
        // The stored URL omits the leading "uploads/" directory.
        $publicPath = str_replace( 'uploads/' , '' , $filePath );
        foreach( $sources as $n => $src ){
            $this->spider->log( '本页包含'.$total."张图,正在下载第".($n+1)."张…" );
            $data = file_get_contents( $src );
            if( $data === false ){
                $this->spider->log( "图片{$src}下载失败" );
                continue;
            }
            // The second is_dir() covers another thread creating the directory in between.
            if( !is_dir( $dir ) && !mkdir( $dir , 0777 , true ) && !is_dir( $dir ) ){
                $this->spider->log( "无法创建目录{$dir}" );
                continue;
            }
            $info = pathinfo( $src );
            $name = rand(0,10000).'_'.urlencode( $info['basename'] );
            if( file_put_contents( $dir.$name , $data ) === false ){
                $this->spider->log( "图片{$src}保存失败" );
                continue;
            }
            $article = str_replace( $src , $this->spider->imgUrl.$publicPath.$name , $article );
        }
        return $article;
    }
}
?>
当然,这个多线程爬虫还不是很完善:分析完列表页后,会按该列表页的文章数目创建相应数量的线程去爬取文章,而爬列表页缩略图的时候依然是单线程。不过,如果连分析列表页也用多线程,站长估计会误认为是攻击,而封我的IP。还有一个没有解决的问题是,多线程爬取打乱了文章的顺序——先返回结果的线程先入库。