• 欢迎访问搞代码网站,推荐使用最新版火狐浏览器和Chrome浏览器访问本网站!
  • 如果您觉得本站非常有看点,那么赶紧使用Ctrl+D 收藏搞代码吧

自己写的一个php基于phpQuery的通用采集_php

php 搞代码 3年前 (2018-06-21) 142次浏览 已收录 0个评论

还是小菜,第一次分享代码哈,这是自己以前写的一个php的采集类,自己一直在用,自我感觉很简单很强大,只要懂一点点选择器的知识就可以采集任何页面了,也支持https页面,做简单的采集足够用了。

 <?php
/**
 * Generic list-scraping class built on top of phpQuery.
 * Version V1.3
 * Author: JAE
 * Blog: http://blog.jaekj.com
 */
require_once '../phpQuery/phpQuery/phpQuery.php';

class QueryList
{
    /** @var string URL of the page that was fetched. */
    private $pageURL;

    /** @var array Selector rules: array("name" => array("selector", "type"), ...). */
    private $regArr = array();

    /** @var array Extracted rows; each row maps a rule name to its extracted value. */
    public $jsonArr = array();

    /** @var string Optional block selector used to split the page into rows. */
    private $regRange;

    /** @var string Raw body returned by cURL ('' when the request failed). */
    private $html;

    /**
     * Fetches the page and, when rules are supplied, runs the extraction immediately.
     *
     * Rule format: array("name" => array("selector", "type"), ...), where "type"
     * is "text", "html", or the name of an attribute to read.
     *
     * Block selector: when given, the page is first split into blocks matching
     * this selector and every rule is then evaluated inside each block, so one
     * block yields one row of $jsonArr.
     *
     * @param string $pageURL  Address of the page to fetch.
     * @param array  $regArr   Selector rules (see format above).
     * @param string $regRange Optional block selector.
     */
    public function __construct($pageURL, $regArr = array(), $regRange = '')
    {
        $this->pageURL = $pageURL;

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $this->pageURL);
        // Peer and host verification are switched off so that https:// pages can
        // be fetched even when the certificate cannot be validated.
        // NOTE(review): this also disables protection against tampered responses.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        $body = curl_exec($ch);
        $error = ($body === false) ? curl_error($ch) : '';
        curl_close($ch);

        if ($body === false) {
            // Report the failure instead of feeding `false` to the HTML parser;
            // the object stays usable and simply yields no rows.
            trigger_error('QueryList: failed to fetch ' . $this->pageURL . ': ' . $error, E_USER_WARNING);
            $body = '';
        }
        $this->html = $body;

        if (!empty($regArr)) {
            $this->regArr = $regArr;
            $this->regRange = $regRange;
            $this->getList();
        }
    }

    /**
     * Runs a new set of rules against the page that was already fetched,
     * discarding any previously extracted rows.
     *
     * @param array  $regArr   Selector rules, same format as in the constructor.
     * @param string $regRange Optional block selector.
     */
    public function setQuery($regArr, $regRange = '')
    {
        $this->jsonArr = array();
        $this->regArr = $regArr;
        $this->regRange = $regRange;
        $this->getList();
    }

    /**
     * Applies the current rules to the fetched HTML and fills $jsonArr.
     *
     * With a block selector, row N holds the values found inside the N-th block.
     * Without one, the N-th match of each rule's selector goes into row N.
     */
    private function getList()
    {
        // Nothing to parse (failed or empty response) or nothing to look for.
        if (!is_string($this->html) || $this->html === '' || empty($this->regArr)) {
            return;
        }

        $hobj = phpQuery::newDocumentHTML($this->html);

        if (!empty($this->regRange)) {
            $robj = pq($hobj)->find($this->regRange);
            $i = 0;
            foreach ($robj as $item) {
                foreach ($this->regArr as $key => $reg_value) {
                    $iobj = pq($item)->find($reg_value[0]);
                    $this->jsonArr[$i][$key] = $this->extractValue($iobj, $reg_value[1]);
                }
                $i++;
            }
            return;
        }

        foreach ($this->regArr as $key => $reg_value) {
            $lobj = pq($hobj)->find($reg_value[0]);
            $i = 0;
            foreach ($lobj as $item) {
                $this->jsonArr[$i++][$key] = $this->extractValue($item, $reg_value[1]);
            }
        }
    }

    /**
     * Reads one value from a matched node according to the rule type.
     *
     * @param mixed  $node Node or phpQuery selection to read from.
     * @param string $type "text", "html", or an attribute name.
     * @return mixed Trimmed text/HTML, or whatever attr() returns for the attribute.
     */
    private function extractValue($node, $type)
    {
        switch ($type) {
            case 'text':
                return trim(pq($node)->text());
            case 'html':
                return trim(pq($node)->html());
            default:
                return pq($node)->attr($type);
        }
    }

    /**
     * Returns the extracted rows encoded as JSON.
     *
     * @return string
     */
    public function getJSON()
    {
        return json_encode($this->jsonArr);
    }
}

 

 
  <?php
require 'Query/QueryList.class.php';

// Scrape the OSChina code-sharing list: title, link and author of each entry.
$url = "http://www.oschina.net/code/list";
$reg = array(
    "title"  => array(".code_title a:eq(0)", "text"),
    "url"    => array(".code_title a:eq(0)", "href"),
    "author" => array("img", "title"),
);
$rang = ".code_list li";
$hj = new QueryList($url, $reg, $rang);
$arr = $hj->jsonArr;
print_r($arr);

// To also collect the avatars of the "TOP40 active contributors" box on the
// right-hand side of the same page and get them as JSON, reuse the object:
$reg = array("portrait" => array(".hot_top img", "src"));
$hj->setQuery($reg);
$json = $hj->getJSON();
echo $json . "<hr/>";

// Scrape the title and body of an OSChina content page.
$url = "http://www.oschina.net/code/snippet_186288_23816";
$reg = array(
    "title" => array(".QTitle h1", "text"),
    "con"   => array(".Content", "html"),
);
$hj = new QueryList($url, $reg);
$arr = $hj->jsonArr;
print_r($arr);

// These examples should be enough to show how the class is used for scraping.
  <?php
/**
 * Home-made Baidu and Google search API.
 * Version V2.0
 * Author: JAE
 * Blog: http://blog.jaekj.com
 */
require_once 'QueryList_class.php';

class Searcher
{
    /** @var string Search engine name: 'baidu' or 'google'. */
    private $searcher;

    /** @var string Search keyword. */
    private $key;

    /** @var int Number of results requested per page. */
    private $num;

    /** @var int Zero-based page index (the constructor receives a one-based page). */
    private $page;

    /** @var array|null Selector rules for one search result. */
    private $regArr;

    /** @var string|null Block selector matching one search result. */
    private $regRange;

    /** @var array|null Selector rule for the "total results" text. */
    private $regZnum;

    /** @var array Result envelope: num, page, zNum, zPage, s, other, data. */
    public $jsonArr;

    /**
     * Configures the selectors for the chosen engine and runs the search.
     *
     * @param string $searcher Search engine, 'baidu' or 'google'.
     * @param string $key      Search keyword.
     * @param int    $num      Number of results per page.
     * @param int    $page     One-based page number.
     */
    public function __construct($searcher, $key, $num, $page)
    {
        if ($searcher == 'baidu') {
            $this->regArr = array("title"=>array("h3.t a,#ting_singlesong_box a","text"),"tCon"=>array("div.c-abstract,font:slice(0,2),div#weibo,table tr:eq(0),div.c-abstract-size p:eq(0),div.vd_sitcom_new_tinfo","text"),"url"=>array("h3.t a,#ting_singlesong_box a","href"));
            $this->regRange = 'table.result,table.result-op';
            $this->regZnum = array("zNum"=>array("span.nums","text"));
        } elseif ($searcher == 'google') {
            $this->regArr = array("title"=>array("h3.r a","text"),"tCon"=>array("span.st","text"),"url"=>array("h3.r a","href"));
            $this->regRange = 'li.g';
            $this->regZnum = array("zNum"=>array("div#resultStats","text"));
        }

        $this->searcher = $searcher;
        $this->key = $key;
        $this->num = $num;
        $this->page = $page - 1;
        $this->getList();
    }

    /**
     * Fetches one result page, resolves each result URL, reads the total
     * result count and stores the final envelope in $jsonArr.
     */
    private function getList()
    {
        $s = urlencode($this->key);
        $num = $this->num;
        $start = $this->num * $this->page;

        if ($this->searcher == 'baidu') {
            $url = "http://www.baidu.com/s?pn=$start&rn=$num&wd=$s";
            $reg_znum = '/[\d,]+/';
        } elseif ($this->searcher == 'google') {
            $url = "https://www.google.com.hk/search?filter=0&lr=&newwindow=1&safe=images&hl=en&as_qdr=all&num=$num&start=$start&q=$s";
            $reg_znum = '/([\d,]+) result(s)?/';
        } else {
            // Unknown engine: there is no URL or selector set to use, so return
            // an empty envelope instead of issuing a request with undefined values.
            $this->jsonArr = $this->buildResult(array(), 0);
            return;
        }

        $searcherObj = new QueryList($url, $this->regArr, $this->regRange);
        foreach ($searcherObj->jsonArr as $i => $row) {
            // A result without a link has nothing to resolve.
            if (empty($row['url'])) {
                continue;
            }
            if ($this->searcher == 'baidu') {
                $searcherObj->jsonArr[$i]['url'] = $this->getBaiduRealURL($row['url']);
            } else {
                $searcherObj->jsonArr[$i]['url'] = $this->getGoogleRealURL($row['url']);
            }
        }
        $data = $searcherObj->jsonArr;

        // Read the total number of results from the same page.
        $searcherObj->setQuery($this->regZnum);
        $zText = isset($searcherObj->jsonArr[0]['zNum']) ? (string) $searcherObj->jsonArr[0]['zNum'] : '';
        $zNum = 0;
        if (preg_match($reg_znum, $zText, $arr)) {
            // Prefer the captured digit group when the pattern has one.
            $digits = isset($arr[1]) ? $arr[1] : $arr[0];
            $zNum = (int) str_replace(',', '', $digits);
        }

        $this->jsonArr = $this->buildResult($data, $zNum);
    }

    /**
     * Builds the result envelope exposed through $jsonArr / getJSON().
     *
     * @param array $data Result rows (title, tCon, url).
     * @param int   $zNum Total number of results reported by the engine.
     * @return array
     */
    private function buildResult($data, $zNum)
    {
        // Total page count; guarded so a page size of 0 cannot divide by zero.
        $zPage = ($this->num > 0) ? ceil($zNum / $this->num) : 0;

        return array('num'=>$this->num,'page'=>((int)$this->page+1),'zNum'=>$zNum,'zPage'=>$zPage,"s"=>"$this->key",'other'=>array('author'=>'JAE','QQ'=>'734708094','blog'=>'http://blog.jaekj.com'),'data'=>$data);
    }

    /**
     * Returns the result envelope encoded as JSON.
     *
     * @return string
     */
    public function getJSON()
    {
        return json_encode($this->jsonArr);
    }

    /**
     * Resolves a Baidu redirect link to the address it points to.
     *
     * @param string $url Link taken from a Baidu result.
     * @return string The first Location header on a 301/302 response, otherwise $url.
     */
    private function getBaiduRealURL($url)
    {
        $header = get_headers($url, 1);
        if ($header === false || !isset($header[0]) || !isset($header['Location'])) {
            return $url;
        }

        $isRedirect = strpos($header[0], '301') !== false || strpos($header[0], '302') !== false;
        if (!$isRedirect) {
            return $url;
        }

        // Several Location headers arrive as an array; the first one is used.
        if (is_array($header['Location'])) {
            return $header['Location'][0];
        }
        return $header['Location'];
    }

    /**
     * Extracts the target address from the q= parameter of a Google result link.
     *
     * @param string $url Link taken from a Google result.
     * @return string Decoded q= value when present, otherwise $url.
     */
    private function getGoogleRealURL($url)
    {
        $reg_url = '/q=(.+)&/U';
        return preg_match($reg_url, $url, $arr) ? urldecode($arr[1]) : $url;
    }
}

// $hj = new Searcher('google','oschina',20,2);
// print_r( $hj->jsonArr);
// Demo address:
// http://blog.jaekj.com//jae/demo/searcher/Searcher_class.php?searcher=baidu&s=jaekj&num=20&page=1

欢迎大家阅读《自己写的一个php基于phpQuery的通用采集_php》,跪求各位点评,若觉得好的话请收藏本文,by 搞代码


喜欢 (0)
[搞代码]
分享 (0)
发表我的评论
取消评论

表情 贴图 加粗 删除线 居中 斜体 签到

Hi,您需要填写昵称和邮箱!

  • 昵称 (必填)
  • 邮箱 (必填)
  • 网址