php基于curl实现多线程批量抓取

#   php基于curl实现多线程批量抓取



<?php

/*

curl 多线程抓取

*/

 /** 

   * curl 多线程 

   * 

   * @param array $array 并行网址 

   * @param int $timeout 超时时间

   * @return array 

   */

 function Curl_http($array,$timeout){

 $res = array();

 $mh = curl_multi_init();//创建多个curl语柄

 $startime = getmicrotime();

 foreach($array as $k=>$url){

  $conn[$k]=curl_init($url);

 

    curl_setopt($conn[$k], CURLOPT_TIMEOUT, $timeout);//设置超时时间

    curl_setopt($conn[$k], CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; MSIE 5.01; Windows NT 5.0)');

    curl_setopt($conn[$k], CURLOPT_MAXREDIRS, 7);//HTTp定向级别

    curl_setopt($conn[$k], CURLOPT_HEADER, 0);//这里不要header,加块效率

    curl_setopt($conn[$k], CURLOPT_FOLLOWLOCATION, 1); // 302 redirect

    curl_setopt($conn[$k],CURLOPT_RETURNTRANSFER,1);

    curl_multi_add_handle ($mh,$conn[$k]);

 }

 //防止死循环耗死cpu 这段是根据网上的写法

 do {

  $mrc = curl_multi_exec($mh,$active);//当无数据,active=true

 } while ($mrc == CURLM_CALL_MULTI_PERFORM);//当正在接受数据时

 while ($active and $mrc == CURLM_OK) {//当无数据时或请求暂停时,active=true

  if (curl_multi_select($mh) != -1) {

  do {

   $mrc = curl_multi_exec($mh, $active);

  } while ($mrc == CURLM_CALL_MULTI_PERFORM);

  }

 }

 

 foreach ($array as $k => $url) {

   curl_error($conn[$k]);

    $res[$k]=curl_multi_getcontent($conn[$k]);//获得返回信息

    $header[$k]=curl_getinfo($conn[$k]);//返回头信息

    curl_close($conn[$k]);//关闭语柄

    curl_multi_remove_handle($mh , $conn[$k]);  //释放资源 

 }

 

 curl_multi_close($mh);

 $endtime = getmicrotime();

 $diff_time = $endtime - $startime;

 

 return array('diff_time'=>$diff_time,

   'return'=>$res,

   'header'=>$header

   );

 

 }

 //计算当前时间

 function getmicrotime() {

   list($usec, $sec) = explode(" ",microtime());

   return ((float)$usec + (float)$sec);

 }

 

 //测试一下,curl 三个网址

 $array = array(

  "http://www.weibo.com/",

  "http://www.renren.com/",

  "http://www.qq.com/"

  );

 $data = Curl_http($array,'10');//调用

 var_dump($data);//输出

//如果POST的数据大于1024字节,curl并不会直接就发起POST请求

//发送请求时,header中包含一个空的Expect。curl_setopt($ch, CURLOPT_HTTPHEADER, array("Expect:"));

?>


# xiaoxiao [ 2019-08-30 ]

# Address in this article

# http://www.s7smile.com/php/47

# s7smile.com