抓取网页,先查看页面源代码。如果直接可以从html代码里获取我一般用phpquery 否则就另行正则匹配,或者找有没有api

最近抓的2个网站内容的代码

 

列表页抓取:第一种使用phpquery插件,可以快速获取,第二种它是api,所以直接获取

load_third("phpQuery.php");

 
/*********www.sosobtc.com***********/
// Scrape the sosobtc news list page: collect detail URL, title, cover image,
// publish time and source label for each entry, then insert into the DB.
// NOTE(review): the four selector loops are assumed to yield the same number
// of items in the same order — confirm against the page markup.
$doc = phpQuery::newDocumentFile('https://www.sosobtc.com/news/all'); // news list URL
$data = array();

// Collect detail-page URLs (hrefs on the page are site-relative).
foreach (pq('.news-list .news-thumbnail a') as $key => $value) {
    $href = $value->getAttribute('href');
    $data[$key]['source_url'] = "https://www.sosobtc.com" . $href;
}

// Collect titles.
foreach (pq('.news-list .news-title h3') as $key => $value) {
    $data[$key]['title'] = pq($value)->text();
}

// Collect the cover image URL and the publish time.
// The first share link ends with "...&<param>=<image-url>"; the image URL in
// turn is assumed to carry a date string as its 6th "/"-separated segment.
foreach (pq('.news-list .share-box ul') as $key => $value) {
    $share = pq($value)->find('li')->eq(0)->find('a')->attr('href');
    $tail  = strrchr($share, "&");      // last "&param=value" chunk of the share link
    $parts = explode("=", $tail);
    $pic   = isset($parts[1]) ? $parts[1] : '';
    $data[$key]['pic'] = $pic;

    $seg = explode("/", $pic);
    // NOTE(review): assumes segment 5 of the image path is a parseable date —
    // fall back to "now" when it is missing or unparseable.
    $ts = isset($seg[5]) ? strtotime($seg[5]) : false;
    $data[$key]['add_time'] = ($ts !== false) ? $ts : time();
}

// Collect the original source label.
foreach (pq('.category') as $key => $value) {
    $data[$key]['source'] = pq($value)->text();
}

foreach ($data as $v) {
    // Bug fix: add_time was assigned twice in the original (time() then
    // immediately overwritten) — keep only the parsed publish time.
    $adddata = array(
        'title'      => $v['title'],
        'source_url' => $v['source_url'],
        'add_time'   => $v['add_time'],
        'pic'        => $v['pic'],
        'source'     => $v['source'],
    );
    $result = News::add($adddata);
    if (empty($result['insert_id'])) {
        // Bug fix: var_dump() returns null, so the original logged nothing
        // useful — var_export(..., true) captures the payload as a string.
        file_put_contents(
            "/data/log/fail_spider.log",
            var_export($result, true) . "," . $v['source_url'] . "," . $v['pic'] . "\r\n",
            FILE_APPEND
        );
    }
}

/*********www.sosobtc.com***********/

/*********www.36kr.com/***********/
// Pull the latest articles from the 36kr column API (JSON) and insert them.
$result = file_get_contents("http://36kr.com/api/search-column/208?per_page=20&page=1");

if (!$result) {
    die;
}

$result = json_decode($result, true);

// Robustness: empty() also covers a failed json_decode (null), where the
// original count() call would warn/throw on non-arrays.
if (empty($result['data']['items'])) {
    die;
}

foreach ($result['data']['items'] as $k => $v) {
    $sdata = array();
    $sdata['add_time'] = strtotime($v['published_at']);
    $sdata['title']    = $v['title'];
    $sdata['pic']      = $v['template_info']['template_cover'][0];
    // user_info is itself a JSON string nested inside the JSON payload.
    $info = json_decode($v['user_info'], true);
    $sdata['source']     = isset($info['name']) ? $info['name'] : '';
    $sdata['source_url'] = "http://36kr.com/p/" . $v['id'] . ".html";

    $re = News::add($sdata);
    if (empty($re['insert_id'])) {
        // Bug fixes: var_dump() returns null (use var_export), and the API
        // item $v has no 'source_url'/'pic' keys — those live in $sdata.
        file_put_contents(
            "/data/log/fail_spider.log",
            var_export($re, true) . "," . $sdata['source_url'] . "," . $sdata['pic'] . "\r\n",
            FILE_APPEND
        );
    }
}

/*********www.36kr.com/***********/

先获取列表内容,再根据列表中每条对应的目标地址,挨个抓取详情。

详情页面抓取:

load_third("phpQuery.php");

/**
 * Download a remote file and store it under the local upimg directory,
 * bucketed by the 6th "/"-separated segment of the source URL.
 *
 * @param string $url remote file URL
 * @return string|false site-relative path of the saved file, or false when
 *                      the transfer or the local write fails
 */
function download($url)
{
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 30);
    $file = curl_exec($ch);
    curl_close($ch);

    // Bug fix: the original ignored transfer failures and wrote an
    // empty/garbage file; bail out instead.
    if ($file === false) {
        return false;
    }

    $filename = pathinfo($url, PATHINFO_BASENAME);

    // NOTE: the web server user needs write permission here, otherwise
    // creating the bucket directory fails.
    $path   = '/data/xxxxx.com/phone/wwwroot/upimg/';
    $dirarr = explode("/", $url);
    // Guard against short URLs where segment 5 does not exist.
    $bucket = isset($dirarr[5]) ? $dirarr[5] : date("Ymd");
    $path  .= $bucket . "/";

    if (!is_dir($path)) {
        mkdir($path, 0755, true); // recursive, in case parent dirs are missing
    }

    // Bug fix: the original opened the file in append mode ('a'), so
    // re-downloading the same image concatenated two copies and corrupted it.
    if (file_put_contents($path . $filename, $file) === false) {
        return false;
    }

    return "/" . $bucket . "/" . $filename;
}

/**
 * Download a remote file and store it under the local upimg directory,
 * bucketed by the current date (Ymd), with a forced ".jpg" extension.
 *
 * @param string $url remote file URL
 * @return string|false site-relative path of the saved file, or false when
 *                      the transfer or the local write fails
 */
function download2($url)
{
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 30);
    $file = curl_exec($ch);
    curl_close($ch);

    // Bug fix: the original ignored transfer failures and wrote an
    // empty/garbage file; bail out instead.
    if ($file === false) {
        return false;
    }

    // 36kr cover URLs have no extension, so one is appended explicitly.
    $filename = pathinfo($url, PATHINFO_BASENAME) . ".jpg";

    // NOTE: the web server user needs write permission here, otherwise
    // creating the date directory fails.
    $bucket = date("Ymd");
    $path   = '/data/xxxxx.com/phone/wwwroot/upimg/' . $bucket . "/";

    if (!is_dir($path)) {
        mkdir($path, 0755, true); // recursive, in case parent dirs are missing
    }

    // Bug fix: the original opened the file in append mode ('a'), so
    // re-downloading the same image concatenated two copies and corrupted it.
    if (file_put_contents($path . $filename, $file) === false) {
        return false;
    }

    return "/" . $bucket . "/" . $filename;
}



// Fetch rows still awaiting their detail content, download each cover image
// locally, scrape the article body from the source site, and mark the row
// as done (status = 1).
$result = News::getdown();

if (count($result) == 0) {
    exit(2);
}

foreach ($result as $v) {
    $data = array();

    // Bug fix: strpos() returns 0 (falsy) when the needle sits at position 0,
    // so the original truthiness checks could silently skip matching rows —
    // compare against false explicitly.
    if (strpos($v['source_url'], 'sosobtc') !== false) {

        $path = download($v['pic']); // save cover image locally

        phpQuery::newDocumentFile($v['source_url']); // load the detail page
        $content = pq(".article-main")->html();

        $data['pic']     = $path;
        $data['content'] = addslashes(trim($content));
        $data['status']  = 1;

        // Renamed from $result to avoid shadowing the outer result set.
        $modified = News::modify($v['id'], $data);
        if (!$modified) {
            // Bug fix: var_dump() returns null — var_export keeps the payload.
            file_put_contents(
                "/data/log/fail_spiderdown.log",
                $v['id'] . "|" . var_export($modified, true) . "|" . json_encode($data) . "\r\n",
                FILE_APPEND
            );
        }
    } else if (strpos($v['source_url'], '36kr') !== false) {

        $path = download2($v['pic']); // save cover image locally

        // 36kr renders articles from inline JS state, so the props JSON is
        // pulled out of the raw page source with a regex.
        $html = file_get_contents($v['source_url']);
        $content = '';
        if ($html && preg_match("/<script>var props=(.*),locationnal={/", $html, $match)) {
            $info = json_decode($match[1], true);
            if (isset($info['detailArticle|post']['content'])) {
                $content = $info['detailArticle|post']['content'];
            }
        }

        $data['pic']     = $path;
        $data['content'] = $content;
        $data['status']  = 1;

        // Bug fix: News::modify() was called twice in the original,
        // duplicating the write — call it exactly once.
        $modified = News::modify($v['id'], $data);
        if (!$modified) {
            file_put_contents(
                "/data/log/fail_spiderdown.log",
                $v['id'] . "|" . var_export($modified, true) . "|" . json_encode($data) . "\r\n",
                FILE_APPEND
            );
        }
    }
}

第一种还是用phpquery抓取。第二种查看源代码,它的数据是由js懒加载渲染的,所以我直接用php正则匹配出需要的数据。其中我把二者的封面图都下载到了本地,本地的upimg目录主要是要给写权限,否则创建日期目录可能会失败。还有一点,我对source_url(也就是目标网址)的mysql字段做了唯一索引,这样我每天定时跑这2个脚本,可以抓到最新数据,同时又不会抓到重复数据。

版权声明:本文为phpjinggege原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接:https://www.cnblogs.com/phpjinggege/p/8615800.html