php 网页内容抓取
抓取网页,先查看页面源代码。如果可以直接从 HTML 代码里获取,我一般用 phpQuery;否则就另行正则匹配,或者找有没有 API 可用。
最近抓的2个网站内容的代码
列表页抓取:第一种使用phpquery插件,可以快速获取,第二种它是api,所以直接获取
load_third("phpQuery.php"); /*********www.sosobtc.com***********/ /**/ $re = phpQuery::newDocumentFile(\'https://www.sosobtc.com/news/all\'); //设置好抓取的新闻列表网址 $data = array(); // 获取列表地址 foreach(pq(\'.news-list .news-thumbnail a\') as $key=>$value) { $href = $value->getAttribute(\'href\'); $data[$key][\'source_url\'] = "https://www.sosobtc.com".$href; } // 获取标题 foreach(pq(\'.news-list .news-title h3\') as $key=>$value) { $title = pq($value)->text(); $data[$key][\'title\'] = $title; } // 获取封面图地址 foreach(pq(\'.news-list .share-box ul\') as $key=>$value) { $re = pq($value)->find(\'li\')->eq(0)->find(\'a\')->attr(\'href\'); $str = strrchr($re,"&"); $arr= explode("=",$str); $data[$key][\'pic\'] = $arr[1]; $str2 = explode("/",$arr[1]); $data[$key][\'add_time\'] = strtotime($str2[5]); } //获取信息初始来源 foreach(pq(\'.category\') as $key=>$value) { $source = pq($value)->text(); $data[$key][\'source\'] = $source; } // exit; foreach($data as $v){ $adddata[\'title\'] = $v[\'title\']; $adddata[\'source_url\'] = $v[\'source_url\']; $adddata[\'add_time\'] = time(); $adddata[\'add_time\'] = $v[\'add_time\']; $adddata[\'pic\'] = $v[\'pic\']; $adddata[\'source\'] = $v[\'source\']; // $adddata[\'stype\'] = 1; $result = News::add($adddata); if(!$result[\'insert_id\']){ file_put_contents("/data/log/fail_spider.log",var_dump($result).",".$v[\'source_url\'].",".$v[\'pic\']."\r\n",FILE_APPEND); } } /*********www.sosobtc.com***********/ /*********www.36kr.com/***********/ $result = file_get_contents("http://36kr.com/api/search-column/208?per_page=20&page=1"); if(!$result){ die; } $result = json_decode($result,true); if(count($result[\'data\'][\'items\'])==0){ die; } foreach($result[\'data\'][\'items\'] as $k=>$v){ $sdata[\'add_time\'] = strtotime($v[\'published_at\']); $sdata[\'title\'] = $v[\'title\']; $sdata[\'pic\'] = $v[\'template_info\'][\'template_cover\'][0]; $info = json_decode($v[\'user_info\'],true); $sdata[\'source\'] = $info[\'name\']; $sdata[\'source_url\'] = 
"http://36kr.com/p/".$v[\'id\'].".html"; $re = News::add($sdata); if(!$re[\'insert_id\']){ file_put_contents("/data/log/fail_spider.log",var_dump($re).",".$v[\'source_url\'].",".$v[\'pic\']."\r\n",FILE_APPEND); } } /*********www.36kr.com/***********/
先获取列表内容,再根据列表中每条记录对应的目标地址,挨个抓取详情。
详情页面抓取:
load_third("phpQuery.php"); function download($url) { $ch = curl_init(); curl_setopt($ch, CURLOPT_URL, $url); curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 30); $file = curl_exec($ch); curl_close($ch); $filename = pathinfo($url, PATHINFO_BASENAME); $path = \'/data/xxxxx.com/phone/wwwroot/upimg/\';//**************注意权限问题 $dirarr = explode("/",$url); $path .= $dirarr[5]."/"; if (!is_dir($path)) mkdir($path); $resource = fopen($path . $filename, \'a\'); fwrite($resource, $file); fclose($resource); return "/".$dirarr[5]."/".$filename; } function download2($url) { $ch = curl_init(); curl_setopt($ch, CURLOPT_URL, $url); curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 30); $file = curl_exec($ch); curl_close($ch); $filename = pathinfo($url, PATHINFO_BASENAME).".jpg"; $path = \'/data/xxxxx.com/phone/wwwroot/upimg/\';//**************注意权限问题 $path .= date("Ymd")."/"; if (!is_dir($path)) mkdir($path); $resource = fopen($path . 
$filename, \'a\'); fwrite($resource, $file); fclose($resource); return "/".date("Ymd")."/".$filename; } $result = News::getdown(); if(count($result)==0){ exit(2); } foreach($result as $v) { if(strpos($v[\'source_url\'],\'sosobtc\')){ $path = download($v[\'pic\']);//下载图片到本地 $re = phpQuery::newDocumentFile($v[\'source_url\']); //设置好抓取的新闻列表网址 $content = pq(".article-main")->html(); // $id = $v[\'id\']; $data[\'pic\'] = $path; $data[\'content\'] = addslashes(trim($content)); $data[\'status\'] = 1; $result = News::modify($v[\'id\'],$data); if(!$result){ file_put_contents("/data/log/fail_spiderdown.log",$v[\'id\']."|".var_dump($result)."|".json_encode($data)."\r\n",FILE_APPEND); } }else if(strpos($v[\'source_url\'],\'36kr\')){ // echo $v[\'id\']."\r\n"; $path = download2($v[\'pic\']);//下载图片到本地 $re = file_get_contents($v[\'source_url\']); //设置好抓取的新闻列表网址 preg_match("/<script>var props=(.*),locationnal={/",$re,$match); $info = json_decode($match[1],true); $content = $info[\'detailArticle|post\'][\'content\']; $data[\'pic\'] = $path; $data[\'content\'] = $content; $data[\'status\'] = 1; $result = News::modify($v[\'id\'],$data); // print_r($data); // break; $result = News::modify($v[\'id\'],$data); if(!$result){ file_put_contents("/data/log/fail_spiderdown.log",$v[\'id\']."|".var_dump($result)."|".json_encode($data)."\r\n",FILE_APPEND); } } }
第一种还是用 phpQuery 抓取。第二种查看源代码发现数据是 JS 懒加载的,所以我直接用 PHP 正则匹配出需要的数据。其中我把两者的封面图都下载到了本地;本地的 upimg 目录主要是要给写权限,否则创建日期目录可能会失败。还有一点:我对 source_url(也就是目标网址)这个 MySQL 字段做了唯一索引,这样两个脚本每天定时跑,既能抓到最新数据,又不会写入重复数据。