本采集类基于 curl_multi 实现并发采集(即常说的“多线程”采集)。我写它是为了采集电商网站的产品列表;采集规则可以自定义,因此非常适合用来制作采集系统。
list_page=$list_page;
$this->from=$from;
if($this->from==1)
{
if(empty($list_page_rule))
{
die('请设置详细页网址规则');
}
$this->list_page_rule=$list_page_rule;
$this->getListUrls();
}
$this->mp=new MultiHttpRequest();
}
/**
 * Expand every list-page URL template into concrete page URLs.
 *
 * A template carries a page-range placeholder of the form "[start-end]",
 * e.g. "http://example.com/list?page=[1-3]". One URL is produced per page
 * value, with the whole bracketed placeholder replaced by that value.
 * A placeholder holding a single value ("[5]") yields exactly that page, and
 * a link without any placeholder is appended unchanged.
 *
 * URLs are appended to $this->list_urls, so repeated calls accumulate.
 *
 * @return array The accumulated list of page URLs.
 */
public function getListUrls()
{
    foreach ($this->list_page as $link) {
        // No "[...]" placeholder: nothing to expand, keep the link as-is
        // instead of relying on undefined-index fallbacks.
        if (!preg_match('/\[(.*)\]/i', $link, $placeholder)) {
            $this->list_urls[] = $link;
            continue;
        }

        $bounds = explode('-', $placeholder[1]);
        // A missing upper bound means "just this one page".
        $last = isset($bounds[1]) ? $bounds[1] : $bounds[0];

        for ($page = $bounds[0]; $page <= $last; $page++) {
            $this->list_urls[] = preg_replace('/\[(.*)\]/i', (string) $page, $link);
        }
    }

    return $this->list_urls;
}
/**
 * Download all list pages, pull the detail-page URLs out of each one with
 * the list rule, and pass every page's URLs to parseResult().
 *
 * The list rule is turned into a case-insensitive regex; its first capture
 * group is taken as the detail URL. Captured values that do not start with
 * "http://" or "https://" are treated as relative and get $this->base_url
 * prepended.
 *
 * Pages that yield no detail URL are skipped. The value returned by
 * parseResult() is not used here.
 *
 * @return void
 */
public function parse()
{
    $this->mp->set_urls($this->list_urls);
    $contents = $this->mp->start();
    if (!is_array($contents)) {
        // start() returns false when there are no URLs to download.
        return;
    }

    // The rule does not change between pages, so build the regex once.
    $listPattern = '/' . str_replace('/', '\/', addslashes($this->list_page_rule)) . '/i';

    foreach ($contents as $content) {
        $content = $this->_prefilter($content);
        preg_match_all($listPattern, $content, $pregArr);

        $detail_urls = array();
        if (!empty($pregArr[1])) {
            foreach ($pregArr[1] as $detail_value) {
                // Only a scheme prefix marks an absolute URL; this also
                // recognises https://, which a plain "http://" search misses.
                if (!preg_match('#^https?://#i', $detail_value)) {
                    $detail_value = $this->base_url . $detail_value;
                }
                $detail_urls[] = $detail_value;
            }
        }

        if (count($detail_urls) > 0) {
            // Results are processed once per list page.
            $this->parseResult($detail_urls);
        }
    }
}
/**
 * Download the given detail pages and extract one record per page using the
 * rules in $this->detail_rule (field name => pattern).
 *
 * Rule handling:
 *  - A rule containing "(.*?)" is treated as a literal snippet: it is passed
 *    through addslashes() and "/", "{", "}", "[", "]", "|" are escaped.
 *    Any other rule is used as a regex body with only "/" escaped.
 *  - The first "__name__" token in a rule is replaced by the value already
 *    extracted for field "name" on the same page (empty string if none).
 *  - All patterns are matched case-insensitively; a field receives the first
 *    match of capture group 1, or null when nothing matched.
 *  - The field "haoma" is special-cased: when its pattern holds exactly 3 or
 *    5 "(.*?)" groups, the first matches of those groups are joined with "|".
 *
 * Every record also carries the page address under the key "url".
 *
 * @param array $urls Detail-page URLs; keys are preserved by the downloader
 *                    and used to pair each page with its URL.
 * @return array A single record when exactly one page was processed,
 *               otherwise a list of records (empty when nothing was fetched).
 */
public function parseResult($urls=array())
{
    $this->mp->set_urls($urls);
    $contents = $this->mp->start();
    if (!is_array($contents)) {
        // start() returns false when there is nothing to download.
        $contents = array();
    }

    $result = array();
    foreach ($contents as $k => $content) {
        $content = $this->_prefilter($content);
        $goods = array();
        $goods['url'] = $urls[$k];

        // Values extracted so far on this page, for "__name__" substitution.
        // A dedicated map replaces variable-variables, which cannot be
        // dereferenced as `$$matches[1]` on PHP 7+ and could overwrite
        // locals such as $content or $result.
        $extracted = array();

        foreach ($this->detail_rule as $key => $val) {
            if (strpos($val, "(.*?)") !== false) {
                $pattern = str_replace('/', '\/', addslashes($val));
                $pattern = str_replace(array('{','}','[',']','|'), array('\{','\}','\[','\]','\|'), $pattern);
            } else {
                $pattern = str_replace('/', '\/', $val);
            }

            if (preg_match('/__(\w+)__/i', $pattern, $matches)) {
                $substitute = isset($extracted[$matches[1]]) ? $extracted[$matches[1]] : '';
                $pattern = str_replace($matches[0], $substitute, $pattern);
            }

            $pattern = '/' . $pattern . '/i';
            preg_match_all($pattern, $content, $detailArr);

            $groupCount = substr_count($pattern, '(.*?)');
            // Strict comparison: on PHP < 8 the integer key 0 is loosely
            // equal to 'haoma'.
            if ($key === 'haoma' && ($groupCount === 3 || $groupCount === 5)) {
                $parts = array();
                for ($group = 1; $group <= $groupCount; $group++) {
                    $parts[] = isset($detailArr[$group][0]) ? $detailArr[$group][0] : null;
                }
                $goods[$key] = implode("|", $parts);
            } else {
                $goods[$key] = isset($detailArr[1][0]) ? $detailArr[1][0] : null;
            }

            $extracted[$key] = $goods[$key];
        }

        $result[] = $goods;
    }

    if (count($result) == 1) {
        return $result[0];
    }

    return $result;
}
/**
 * Lightly normalise fetched content by stripping comments and surplus
 * whitespace, so the extraction regexes are easier to match.
 *
 * @param string $output Raw page content.
 * @return string The filtered content.
 */
private function _prefilter($output)
{
    // Drop "// ...;" fragments that end at a line break.
    $output = preg_replace("/\/\/[\S\f\t\v ]*?;[\r|\n]/", "", $output);
    // Drop HTML comments.
    $output = preg_replace("/\<\!\-\-[\s\S]*?\-\-\>/", "", $output);
    // Remove whitespace between adjacent tags ("> <" becomes "><").
    $output = preg_replace("/\>[\s]+\</", "><", $output);
    // Remove whitespace after ";" and around braces.
    $output = preg_replace("/;[\s]+/", ";", $output);
    $output = preg_replace("/[\s]+\}/", "}", $output);
    $output = preg_replace("/}[\s]+/", "}", $output);
    $output = preg_replace("/\{[\s]+/", "{", $output);
    // Collapse any run of whitespace to a single whitespace character.
    $output = preg_replace("/([\s]){2,}/", "$1", $output);
    // Remove whitespace around "=" when it is present on both sides.
    $output = preg_replace("/[\s]+\=[\s]+/", "=", $output);
    return $output;
}
/**
 * Install the extraction rules used for detail pages.
 *
 * @param mixed $rule Stored unchanged in $this->detail_rule.
 * @return void
 */
public function setDetailRule($rule)
{
    $this->detail_rule = $rule;
}
}
/**
 * Fetches a batch of URLs concurrently through the cURL multi interface.
 *
 * Typical use: set_urls([...])->start(), which returns the response bodies
 * keyed like the input URLs.
 */
class MultiHttpRequest
{
    /** @var array|false URLs to fetch; their keys are reused for the results. */
    public $urls = array();

    /** @var int|bool Value handed to CURLOPT_HEADER for every request. */
    public $curlopt_header = 0;

    /**
     * @var string Upper-cased HTTP method name.
     *             NOTE(review): stored only; add_handle() does not apply it.
     */
    public $method = "GET";

    /**
     * @param array|false $urls Initial URL list (false means "none yet").
     */
    public function __construct($urls = false)
    {
        $this->urls = $urls;
    }

    /**
     * Replace the URL list.
     *
     * @param array $urls
     * @return $this
     */
    public function set_urls($urls)
    {
        $this->urls = $urls;
        return $this;
    }

    /**
     * Choose whether response headers are included in the returned content.
     *
     * @param int|bool $b Passed through to CURLOPT_HEADER.
     * @return $this
     */
    public function is_return_header($b)
    {
        $this->curlopt_header = $b;
        return $this;
    }

    /**
     * Record the HTTP method (upper-cased).
     *
     * @param string $m
     * @return $this
     */
    public function set_method($m)
    {
        // Assign the declared property; the previous misspelling created a
        // stray dynamic property and left $method untouched.
        $this->method = strtoupper($m);
        return $this;
    }

    /**
     * Download all configured URLs in parallel.
     *
     * Bodies that is_utf8() rejects are passed through iconv() to UTF-8.
     *
     * @return array|false Response bodies keyed like $this->urls, or false
     *                     when $this->urls is not a non-empty array.
     */
    public function start()
    {
        if (!is_array($this->urls) || count($this->urls) == 0) {
            return false;
        }

        $curl = $text = array();
        $handle = curl_multi_init();
        foreach ($this->urls as $k => $v) {
            $curl[$k] = $this->add_handle($handle, $v);
        }

        $this->exec_handle($handle);

        foreach ($this->urls as $k => $v) {
            $text[$k] = curl_multi_getcontent($curl[$k]);
            if (!is_utf8($text[$k])) {
                // Empty source charset: presumably resolved by iconv to its
                // default/locale charset; TODO confirm for the target sites.
                $text[$k] = iconv("", "UTF-8", $text[$k]);
            }
            curl_multi_remove_handle($handle, $curl[$k]);
        }
        curl_multi_close($handle);

        return $text;
    }

    /**
     * Create an easy handle for one URL and attach it to the multi handle.
     *
     * @param resource|object $handle cURL multi handle.
     * @param string          $url
     * @return resource|object The new easy handle.
     */
    private function add_handle($handle, $url)
    {
        $useragent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0";
        $curl = curl_init();
        curl_setopt($curl, CURLOPT_URL, $url);
        curl_setopt($curl, CURLOPT_USERAGENT, $useragent);
        curl_setopt($curl, CURLOPT_HEADER, $this->curlopt_header);
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
        curl_multi_add_handle($handle, $curl);
        return $curl;
    }

    /**
     * Drive the multi handle until every transfer has finished.
     *
     * @param resource|object $handle cURL multi handle.
     * @return void
     */
    private function exec_handle($handle)
    {
        $running = null;
        do {
            $status = curl_multi_exec($handle, $running);
            if ($status === CURLM_CALL_MULTI_PERFORM) {
                // Older libcurl: asks to be called again immediately.
                continue;
            }
            if ($status !== CURLM_OK) {
                break;
            }
            // Wait for socket activity instead of spinning at 100% CPU.
            // A -1 result means select could not wait, so back off briefly.
            if ($running > 0 && curl_multi_select($handle, 1.0) === -1) {
                usleep(1000);
            }
        } while ($running > 0);
    }

    /**
     * Fetch a single URL synchronously.
     *
     * @param string $url
     * @return string|bool The response body, or false on failure.
     */
    public function get_content($url)
    {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
        return curl_exec($ch);
    }
}
/**
 * Check whether a byte string has the structure of UTF-8.
 *
 * Each non-ASCII lead byte must be followed by the right number of
 * continuation bytes (0x80-0xBF). The check is structural only: the legacy
 * 5- and 6-byte forms are accepted and overlong encodings are not rejected.
 *
 * @param string $str Bytes to inspect.
 * @return bool True when every multi-byte sequence is well formed (an empty
 *              string counts as valid).
 */
function is_utf8($str)
{
    $len = strlen($str);
    for ($i = 0; $i < $len; $i++) {
        $c = ord($str[$i]);
        // 0x80 and above is non-ASCII. 0x80 itself is a continuation byte
        // and must not be accepted as a stand-alone character.
        if ($c >= 128) {
            if ($c >= 254) {
                return false;
            } elseif ($c >= 252) {
                $bits = 6;
            } elseif ($c >= 248) {
                $bits = 5;
            } elseif ($c >= 240) {
                $bits = 4;
            } elseif ($c >= 224) {
                $bits = 3;
            } elseif ($c >= 192) {
                $bits = 2;
            } else {
                // 0x80-0xBF: continuation byte in lead position.
                return false;
            }

            // The whole sequence must fit inside the string.
            if (($i + $bits) > $len) {
                return false;
            }

            while ($bits > 1) {
                $i++;
                $b = ord($str[$i]);
                if ($b < 128 || $b > 191) {
                    return false;
                }
                $bits--;
            }
        }
    }
    return true;
}
正文完