[Crawler] Scraping beauty images from huaban.com with Python + urllib2 + BeautifulSoup

Scrape images from the huaban.com beauty board: request the listing page, collect the /pins/&lt;id&gt;/ links from it, then visit each pin page and download its image.
```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Python 2: urllib2 fetches the pages, BeautifulSoup parses them,
# requests downloads the image files.
import re
import urllib2

import requests
from bs4 import BeautifulSoup

url = 'http://huaban.com/favorite/beauty/'


def requestMain():
    # Send browser-like headers so the site serves the normal page.
    request = urllib2.Request(url)
    request.headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/58.0.3029.110 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;"
                  "q=0.9,image/webp,*/*;q=0.8"
    }
    html_doc = urllib2.urlopen(request)
    print html_doc.getcode()  # 200 on success
    return html_doc


def getPins():
    html_doc = requestMain().read()
    soup = BeautifulSoup(html_doc, 'html.parser', from_encoding='utf-8')
    # Every pin detail page is linked as /pins/<id>/.
    pins = soup.find_all('a', href=re.compile(r"/pins/\d+/"))
    huaban = 'http://huaban.com'
    i = 0
    for pin in pins:
        pin_url = huaban + pin['href']
        resp = urllib2.urlopen(urllib2.Request(pin_url))
        pin_soup = BeautifulSoup(resp, 'html.parser', from_encoding='utf-8')
        div_tags = pin_soup.find_all('div', class_="image-holder")
        i = i + 1
        print i
        for tag in div_tags:
            img = tag.find('img')
            # src is protocol-relative (//img.hb.aicdn.com/...), so prepend a scheme.
            link = 'http:' + img.get('src')
            print link
            a = requests.get(link)
            imgname = i
            # Alternatively: imgname = link.split('/')[-1]
            with open(r'C:\Users\wuzhi_000\Desktop\Python\py_scrapy\image\%s.jpg' % imgname, 'wb') as pic:
                pic.write(a.content)


if __name__ == '__main__':
    getPins()  # getPins() returns None, so there is nothing worth printing
```
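The script above targets Python 2 (urllib2 and print statements). If you are on Python 3, urllib2's functionality now lives in urllib.request; a minimal sketch of the same request step under that assumption follows (request_main is just an illustrative name):

```python
# Python 3 sketch of requestMain() above; same URL and headers.
from urllib.request import Request, urlopen

url = 'http://huaban.com/favorite/beauty/'

def request_main():
    req = Request(url, headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/58.0.3029.110 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;"
                  "q=0.9,image/webp,*/*;q=0.8",
    })
    resp = urlopen(req)
    print(resp.getcode())  # 200 on success
    return resp
```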
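One caveat: if the listing page happens to contain several anchors pointing at the same /pins/&lt;id&gt;/ page (for example an image link and a title link on the same card), the loop above fetches that detail page more than once. A small sketch that deduplicates the links while preserving order (pin_urls is a hypothetical helper, not part of the original script):

```python
import re
from bs4 import BeautifulSoup

def pin_urls(html_doc):
    # Collect each pin link once, in first-seen order, so every
    # detail page is fetched a single time.
    soup = BeautifulSoup(html_doc, 'html.parser')
    seen, urls = set(), []
    for a in soup.find_all('a', href=re.compile(r"/pins/\d+/")):
        href = a['href']
        if href not in seen:
            seen.add(href)
            urls.append('http://huaban.com' + href)
    return urls
```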
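Naming the saved files with a running counter works, and the commented-out link.split('/')[-1] hints at the alternative: use the image URL's own last path segment as the name. A sketch of that variant which also streams the download, so a large image is written in chunks instead of being held in memory whole (save_image and save_dir are illustrative names, not part of the original script):

```python
import os
import requests

def save_image(link, save_dir):
    # Name the file after the last path segment of the image URL,
    # e.g. http://img.hb.aicdn.com/<key> -> <key>.jpg
    imgname = link.split('/')[-1]
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)
    resp = requests.get(link, stream=True, timeout=10)
    resp.raise_for_status()  # fail loudly on 4xx/5xx instead of saving an error page
    path = os.path.join(save_dir, '%s.jpg' % imgname)
    with open(path, 'wb') as pic:
        # Write in 8 KB chunks rather than buffering the whole body.
        for chunk in resp.iter_content(8192):
            pic.write(chunk)
    return path
```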
Copyright notice: this is an original article by wuzhiyi, released under the CC 4.0 BY-SA license. Please include a link to the original article and this notice when reposting.