一个很棒的采集类,可以多线程采集。用来做采集系统。

1,166次阅读
没有评论

本采集类可以实现多线程采集。我写这个类是用来采集电商网站的产品列表的,它支持自定义采集规则,因此非常适合用来制作采集系统。

list_page=$list_page; 
        $this->from=$from;
        if($this->from==1)
        {
            if(empty($list_page_rule))
            {
                die('请设置详细页网址规则');
            }
            $this->list_page_rule=$list_page_rule;
            $this->getListUrls();
        }
        $this->mp=new MultiHttpRequest();  
    }
    /**
     * Expands every list-page URL template in $this->list_page into concrete
     * page URLs and appends them to $this->list_urls.
     *
     * A template carries a page range in square brackets, for example
     * "http://example.com/list_[1-5].html" produces one URL per value from the
     * text before the dash up to the text after it. A template with a single
     * value ("[5]") produces exactly that page, and a link without brackets is
     * taken as an already concrete URL.
     *
     * @return array every URL accumulated in $this->list_urls so far
     */
    public function getListUrls()
    {
        // Accept a single template string as well as a list of templates.
        foreach ((array) $this->list_page as $link) {
            if (!preg_match('/\[(.*)\]/i', $link, $range)) {
                // No "[start-end]" placeholder: nothing to expand.
                $this->list_urls[] = $link;
                continue;
            }

            $bounds = explode('-', $range[1]);
            $first = $bounds[0];
            // "[5]" has no upper bound; treat it as the one-page range 5-5
            // instead of reading an undefined offset.
            $last = isset($bounds[1]) ? $bounds[1] : $first;

            // The bounds are deliberately left as captured (not cast to int) so
            // the loop compares and increments exactly as PHP does for the raw
            // template text.
            for ($page = $first; $page <= $last; $page++) {
                $this->list_urls[] = preg_replace('/\[(.*)\]/i', (string) $page, $link);
            }
        }
        return $this->list_urls;
    }
    /**
     * Downloads every URL in $this->list_urls, extracts the detail-page links
     * from each list page with $this->list_page_rule (first capture group) and
     * passes the links of each page to parseResult().
     *
     * Links that do not start with http:// or https:// are treated as relative
     * and get $this->base_url prepended.
     *
     * NOTE(review): the value returned by parseResult() is discarded here, as
     * in the code this replaces; presumably a subclass overrides parseResult()
     * to persist the data - confirm against callers.
     *
     * @return void
     */
    public function parse()
    {
        $this->mp->set_urls($this->list_urls);
        $contents = $this->mp->start();
        if (!is_array($contents)) {
            // start() returns false when there is nothing to download.
            return;
        }

        // The rule does not change between pages, so build the regex once.
        $rule = '/'.str_replace('/', '\/', addslashes($this->list_page_rule)).'/i';

        foreach ($contents as $content) {
            $content = $this->_prefilter($content);
            if (!preg_match_all($rule, $content, $pregArr) || empty($pregArr[1])) {
                // No match, a broken rule, or a rule without a capture group:
                // this page contributes no detail links.
                continue;
            }

            $detail_urls = array();
            foreach ($pregArr[1] as $detail_value) {
                // Only a leading scheme marks an absolute URL. Checking for
                // "http://" anywhere in the string misclassifies https links
                // and relative links that carry a URL in their query string.
                if (!preg_match('#^https?://#i', $detail_value)) {
                    $detail_value = $this->base_url.$detail_value;
                }
                $detail_urls[] = $detail_value;
            }

            // One list page is processed per parseResult() call.
            $this->parseResult($detail_urls);
        }
    }
    /**
     * Downloads the given detail pages and extracts one record per page using
     * the rules in $this->detail_rule (field name => pattern).
     *
     * Rule handling:
     *  - A rule containing "(.*?)" is treated as literal markup with capture
     *    slots: it is escaped with addslashes() and the characters / { } [ ] |
     *    are backslash-escaped. Any other rule is used as a regex, with only
     *    "/" escaped.
     *  - A "__name__" placeholder in a rule is replaced by the value already
     *    extracted on the same page for the rule keyed "name" (empty string
     *    when there is none). The value is regex-quoted before insertion.
     *  - The field "haoma" joins its first 3 or 5 captures with "|" when the
     *    rule has exactly 3 or 5 "(.*?)" slots; every other field takes the
     *    first capture of the first match, or null when nothing matched.
     *
     * @param array $urls detail-page URLs; keys are carried over to the download result
     * @return array the single record when exactly one page was processed,
     *               otherwise a list of records (empty when nothing was downloaded)
     */
    public function parseResult($urls=array())
    {
        $this->mp->set_urls($urls);
        $contents = $this->mp->start();
        $result = array();
        if (!is_array($contents)) {
            // start() returns false for an empty URL list.
            return $result;
        }

        foreach ($contents as $k => $content) {
            $content = $this->_prefilter($content);
            $goods = array();
            $goods['url'] = $urls[$k];

            // Values extracted so far on this page, keyed by rule name. They
            // feed the __name__ placeholders. A plain array is used rather than
            // variable variables because "$$matches[1]" is parsed as
            // "${$matches}[1]" since PHP 7, and "$$key = ..." can overwrite
            // working locals such as $content or $pattern.
            $extracted = array();

            foreach ((array) $this->detail_rule as $key => $val) {
                if (strpos($val, "(.*?)") !== false) {
                    $pattern = str_replace('/', '\/', addslashes($val));
                    $pattern = str_replace(array('{','}','[',']','|'), array('\{','\}','\[','\]','\|'), $pattern);
                } else {
                    $pattern = str_replace('/', '\/', $val);
                }

                if (preg_match('/__(\w+)__/i', $pattern, $matches)) {
                    $substitute = isset($extracted[$matches[1]]) ? (string) $extracted[$matches[1]] : '';
                    // The substitute is page text, not regex syntax.
                    $pattern = str_replace($matches[0], preg_quote($substitute, '/'), $pattern);
                }

                $pattern = '/'.$pattern.'/i';
                $detailArr = array();
                preg_match_all($pattern, $content, $detailArr);

                // First match of capture groups 1..5, null where absent.
                $capture = array();
                for ($g = 1; $g <= 5; $g++) {
                    $capture[$g] = isset($detailArr[$g][0]) ? $detailArr[$g][0] : null;
                }

                $slots = substr_count($pattern, '(.*?)');
                if ($key == 'haoma' && $slots == 3) {
                    $goods[$key] = implode("|", array($capture[1], $capture[2], $capture[3]));
                } elseif ($key == 'haoma' && $slots == 5) {
                    $goods[$key] = implode("|", array($capture[1], $capture[2], $capture[3], $capture[4], $capture[5]));
                } else {
                    $goods[$key] = $capture[1];
                }

                $extracted[$key] = $goods[$key];
            }
            $result[] = $goods;
        }

        // A single page yields the bare record rather than a one-element list.
        if (count($result) == 1) {
            return $result[0];
        }
        return $result;
    }
    /**
     * Applies a light clean-up to fetched markup so the extraction patterns
     * match more reliably: drops "//...;" line comments and HTML comments, and
     * collapses whitespace between tags, after ";", around braces and "=".
     *
     * @param string $output raw page content
     * @return string the filtered content
     */
    private function _prefilter($output)
    {
        // A failed download may hand over null/false; work on a string.
        $output = (string) $output;
        $output = preg_replace("/\/\/[\S\f\t\v ]*?;[\r|\n]/", "", $output);
        $output = preg_replace("/\<\!\-\-[\s\S]*?\-\-\>/", "", $output);
        // Remove whitespace between adjacent tags. This call needs a terminated
        // pattern and a replacement; with only two arguments preg_replace()
        // fails and the page content is lost.
        $output = preg_replace("/\>[\s]+\</", "><", $output);
        $output = preg_replace("/;[\s]+/", ";", $output);
        $output = preg_replace("/[\s]+\}/", "}", $output);
        $output = preg_replace("/}[\s]+/", "}", $output);
        $output = preg_replace("/\{[\s]+/", "{", $output);
        // Squeeze runs of whitespace down to their last character.
        $output = preg_replace("/([\s]){2,}/", "$1", $output);
        $output = preg_replace("/[\s]+\=[\s]+/", "=", $output);
        return $output;
    }
    /**
     * Stores the extraction rules applied to detail pages.
     *
     * @param array $rule rules keyed by field name; parseResult() iterates them
     *                    as name => pattern
     */
    public function setDetailRule($rule) {
        $this->detail_rule = $rule;
    }
}
/**
 * Small wrapper around PHP's curl_multi API that downloads a set of URLs in
 * parallel and returns their bodies keyed like the input list.
 */
class MultiHttpRequest
{

    /** URLs to download; array keys are preserved in the result of start(). */
    public $urls = array();

    /** Value handed to CURLOPT_HEADER (truthy: response headers are included in the output). */
    public $curlopt_header = 0;

    /** HTTP method, upper-case. */
    public $method = "GET";

    /**
     * @param array|false $urls initial URL list, or false for none
     */
    public function __construct($urls = false)
    {
        $this->urls = $urls;
    }

    /**
     * Replaces the URL list.
     *
     * @param array $urls
     * @return $this
     */
    public function set_urls($urls)
    {
        $this->urls = $urls;
        return $this;
    }

    /**
     * Chooses whether response headers are included in the returned content.
     *
     * @param int|bool $b value for CURLOPT_HEADER
     * @return $this
     */
    public function is_return_header($b)
    {
        $this->curlopt_header = $b;
        return $this;
    }

    /**
     * Sets the HTTP method used for subsequent requests.
     *
     * @param string $m method name in any letter case
     * @return $this
     */
    public function set_method($m)
    {
        // Must target the declared $method property; a misspelled property
        // name would silently leave the method at its default.
        $this->method = strtoupper($m);
        return $this;
    }

    /**
     * Downloads all configured URLs concurrently.
     *
     * Bodies that is_utf8() rejects are passed through iconv() towards UTF-8;
     * if that conversion fails the body is returned unconverted.
     *
     * @return array|false bodies keyed like $this->urls, or false when the
     *                     URL list is empty or not an array
     */
    public function start()
    {
        if (! is_array($this->urls) || count($this->urls) == 0) {
            return false;
        }
        $curl = $text = array();
        $handle = curl_multi_init();
        foreach ($this->urls as $k => $v) {
            $curl[$k] = $this->add_handle($handle, $v);
        }

        $this->exec_handle($handle);
        foreach ($this->urls as $k => $v) {
            $body = curl_multi_getcontent($curl[$k]);
            if (is_string($body) && $body !== '' && ! is_utf8($body)) {
                $converted = iconv("", "UTF-8", $body);
                // iconv() returns false on undecodable input; keep the raw
                // bytes rather than replacing the page with false.
                if ($converted !== false) {
                    $body = $converted;
                }
            }
            $text[$k] = $body;
            curl_multi_remove_handle($handle, $curl[$k]);
        }
        curl_multi_close($handle);

        return $text;
    }

    /**
     * Creates an easy handle for one URL and registers it on the multi handle.
     *
     * @param resource|object $handle curl multi handle
     * @param string $url
     * @return resource|object the easy handle
     */
    private function add_handle($handle, $url)
    {
        $useragent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0";
        $curl = curl_init();
        curl_setopt($curl, CURLOPT_URL, $url);
        curl_setopt($curl, CURLOPT_USERAGENT, $useragent);
        curl_setopt($curl, CURLOPT_HEADER, $this->curlopt_header);
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
        // GET is curl's default; only a different method needs to be set.
        $verb = strtoupper((string) $this->method);
        if ($verb !== '' && $verb !== 'GET') {
            curl_setopt($curl, CURLOPT_CUSTOMREQUEST, $verb);
        }
        curl_multi_add_handle($handle, $curl);
        return $curl;
    }

    /**
     * Drives the multi handle until every transfer has finished.
     *
     * @param resource|object $handle curl multi handle
     */
    private function exec_handle($handle)
    {
        $running = null;
        do {
            $status = curl_multi_exec($handle, $running);
            if ($running > 0 && $status === CURLM_OK) {
                // Block until there is socket activity instead of spinning on
                // curl_multi_exec(). select may report -1 when it cannot wait,
                // in which case back off briefly.
                if (curl_multi_select($handle, 1.0) === -1) {
                    usleep(1000);
                }
            }
        } while ($running > 0 && $status === CURLM_OK);
    }

    /**
     * Downloads a single URL synchronously (10 second connect timeout).
     *
     * @param string $url
     * @return string|false the response body, or false on failure
     */
    public function get_content($url)
    {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
        return curl_exec($ch);
    }
}

/**
 * Reports whether a byte string is well-formed UTF-8.
 *
 * Validation is delegated to PCRE's UTF-8 mode: matching the empty pattern
 * with the "u" modifier succeeds only when the whole subject is valid UTF-8.
 * This rejects stray continuation bytes (including 0x80), truncated
 * sequences, overlong encodings and the obsolete 5/6-byte forms.
 *
 * @param string $str bytes to check
 * @return bool true for valid UTF-8 (the empty string counts as valid)
 */
function is_utf8($str)
{
    $str = (string) $str;
    if ($str === '') {
        return true;
    }
    // preg_match() returns false (not 0) when the subject is malformed UTF-8.
    return preg_match('//u', $str) === 1;
}
正文完
 

公众号