【Python爬虫基础】抓取知乎页面所有图片
抓取地址所有图片
#! /usr/bin/env python
"""Download every large ("_b") image found in the answers of one Zhihu question.

The script reads the answer count from the question page, pages through the
answer-list endpoint PAGE_SIZE answers at a time, and stores each matching
image in the local ``images`` directory.
"""
import json
import os
import re
from os.path import basename

try:  # Python 2
    import urllib2
    from urlparse import urlsplit
except ImportError:  # Python 3
    import urllib.request as urllib2
    from urllib.parse import urlsplit

QUESTION_ID = 37787176
QUESTION_URL = 'https://www.zhihu.com/question/37787176'
ANSWER_LIST_URL = "http://www.zhihu.com/node/QuestionAnswerListV2"
IMAGE_DIR = 'images'
PAGE_SIZE = 50
TIMEOUT = 10  # seconds; without it a stalled connection hangs the script forever

ANSWER_COUNT_RE = re.compile(r'h3 data-num="(\d+)"')
# [^>] / [^"] keep the match inside a single tag and a single attribute value;
# a plain ".*?" can run across quotes and capture text spanning several tags.
BIG_IMAGE_RE = re.compile(r'img [^>]*?src="([^"]*_b[^"]*)"')


def parse_answer_count(html):
    """Return the answer count advertised in *html*, or 0 when it is absent."""
    found = ANSWER_COUNT_RE.search(html)
    return int(found.group(1)) if found else 0


def extract_big_image_urls(answer_list):
    """Return the src of every <img> whose URL contains "_b".

    *answer_list* is an iterable of HTML fragments; they are joined before
    matching, exactly as the endpoint's "msg" list is meant to be consumed.
    """
    return BIG_IMAGE_RE.findall(''.join(answer_list))


def file_name_from_url(img_url):
    """Return the last path component of *img_url* (query string excluded)."""
    return basename(urlsplit(img_url)[2])


def save_image(img_url, directory=IMAGE_DIR):
    """Download *img_url* into *directory*.

    Returns True on success and False on failure. Failures are reported and
    skipped so one broken image does not abort the whole run; unlike a bare
    ``except`` this still lets Ctrl-C (KeyboardInterrupt) stop the script.
    """
    file_name = file_name_from_url(img_url)
    if not file_name:
        print("skip %s: no file name in URL" % img_url)
        return False
    try:
        img_data = urllib2.urlopen(img_url, timeout=TIMEOUT).read()
        print(file_name)
        # "with" closes the file even when the write fails part-way.
        with open(os.path.join(directory, file_name), 'wb') as output:
            output.write(img_data)
    except Exception as err:
        print("skip %s: %s" % (img_url, err))
        return False
    return True


def main():
    """Run the scrape: fetch the question page, then every page of answers."""
    # Third-party; imported here so the pure helpers above stay importable
    # on machines that do not have requests installed.
    import requests

    if not os.path.exists(IMAGE_DIR):
        os.mkdir(IMAGE_DIR)
    print("start>>>>>>>")

    raw_page = urllib2.urlopen(QUESTION_URL, timeout=TIMEOUT).read()
    total = parse_answer_count(raw_page.decode('utf-8', 'ignore'))
    if not total:
        print("no answer count found on " + QUESTION_URL)

    header = {
        'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0",
        'Host': "www.zhihu.com",
        'Referer': QUESTION_URL,
    }
    for offset in range(0, total, PAGE_SIZE):
        params = json.dumps({
            'url_token': QUESTION_ID,
            'pagesize': PAGE_SIZE,
            'offset': offset,
        })
        data = {
            '_xsrf': '',
            'method': 'next',
            'params': params,
        }
        response = requests.post(ANSWER_LIST_URL, data=data, headers=header,
                                 timeout=TIMEOUT)
        answer_list = response.json()["msg"]
        for img_url in extract_big_image_urls(answer_list):
            save_image(img_url)

    print("end>>>>>>>")


if __name__ == "__main__":
    main()
正则抓取网页title
#!/usr/bin/python
# coding:utf-8
"""Fetch a web page and print its <title>, followed by the full HTML."""
import re  # regular-expression module

try:  # Python 2
    import urllib2  # not used below; kept because the original script imports it
except ImportError:  # Python 3
    import urllib.request as urllib2

# IGNORECASE accepts <TITLE>; DOTALL lets the title text span several lines.
TITLE_RE = re.compile(r'<title[^>]*>(.*?)</title>', re.IGNORECASE | re.DOTALL)


class PageClass(object):
    """Small helper for downloading a page with httplib2."""

    def get_page(self, url, headers):
        """Return the body of *url* as text.

        *headers* is a dict of request headers sent with the GET request.
        The body is decoded as UTF-8; undecodable bytes are replaced rather
        than raising, so a page in another encoding does not crash the run.
        """
        # Third-party; imported here so this module can be imported (and
        # extract_title used) without httplib2 being installed.
        import httplib2

        http = httplib2.Http()
        _response, content = http.request(url, 'GET', headers=headers)
        return content.decode('utf-8', 'replace')


def extract_title(html):
    """Return the text inside the first <title> element of *html*, or None.

    Uses ``search`` rather than ``match``: ``match`` only succeeds when the
    pattern starts at position 0, which is never the case for a real HTML
    document, so the title was never found.
    """
    found = TITLE_RE.search(html)
    if found is None:
        return None
    return found.group(1).strip()


def main():
    """Download the configured page and return its HTML text."""
    headers = {"cookie": 'your cookie'}
    url = 'http://v.ktgj.com'
    page = PageClass()
    content = page.get_page(url, headers)
    return content


if __name__ == "__main__":
    htmltext = main()
    title = extract_title(htmltext)
    if title is not None:
        print(title)
    print(htmltext)
下载网页图片
#! /usr/bin/env python
"""Download every image referenced by an <img> tag on one web page.

Usage: run with no arguments to use DEFAULT_URL, or pass the page URL as the
first command-line argument. Images are written to the local ``images``
directory.
"""
import datetime
import json
import os
import re
import sys
from os.path import basename

try:  # Python 2
    import urllib2
    from urlparse import urljoin, urlsplit
except ImportError:  # Python 3
    import urllib.request as urllib2
    from urllib.parse import urljoin, urlsplit

DEFAULT_URL = "http://www.ssff66.com/se/jingpintaotu/519271.html"
IMAGE_DIR = 'images'
TIMEOUT = 5  # seconds, applied to the page request and to every image download

# [^>] keeps the match inside one <img ...> tag; a plain ".*?" can run on
# into a later tag and pick up e.g. a <script src="..."> value.
IMG_SRC_RE = re.compile(r'<img\s[^>]*?src="([^"]*)"')


def extract_image_urls(html, base_url):
    """Return absolute URLs for every non-empty <img src="..."> in *html*.

    Relative and protocol-relative src values are resolved against
    *base_url*; passing them to urlopen unresolved always fails.
    """
    return [urljoin(base_url, src) for src in IMG_SRC_RE.findall(html) if src]


def file_name_from_url(img_url):
    """Return the last path component of *img_url* (query string excluded)."""
    return basename(urlsplit(img_url)[2])


def save_image(img_url, directory=IMAGE_DIR):
    """Download *img_url* into *directory*; return True on success.

    Any failure is reported with its actual text and the image is skipped,
    so one broken link does not stop the rest of the downloads.
    """
    file_name = file_name_from_url(img_url)
    if not file_name:
        print("error : no file name in " + img_url)
        return False
    try:
        img_data = urllib2.urlopen(img_url, timeout=TIMEOUT).read()
        print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + " " + file_name)
        # "with" closes the file even when the write fails part-way.
        with open(os.path.join(directory, file_name), 'wb') as output:
            output.write(img_data)
    except Exception as err:
        # Format the exception itself: err.message is often empty
        # (e.g. for URLError) and does not exist on Python 3.
        print("error : %s" % err)
        return False
    return True


def main(argv=None):
    """Fetch the page and download each image it references."""
    # Third-party; imported here so the pure helpers above stay importable
    # on machines that do not have requests installed.
    import requests

    args = sys.argv[1:] if argv is None else argv
    url = args[0] if args else DEFAULT_URL

    if not os.path.exists(IMAGE_DIR):
        os.mkdir(IMAGE_DIR)
    print("start>>>>>>>>>>>>>>>>>>>>>>>")

    response = requests.get(url, timeout=TIMEOUT)
    for img_url in extract_image_urls(response.text, url):
        save_image(img_url)

    print("end>>>>>>>>>>>>>>>>>>>>>>>")


if __name__ == "__main__":
    main()
版权声明:本文为jhli原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。