Scraping something a bit racy with Python requests ~~ [ituba.cc] (sexy, uncensored, wet-look, busty, nude girls (✿◕‿◕✿)) Author: qq3461896724
On to the code:
# First create the folder D:/Dituba, then run. It uses requests; if you don't have it, type pip install requests in cmd to install it~ (If you can't read code, skip ahead below; there's also a small folder-creation snippet right after this note.
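If you'd rather not create the folder by hand, a one-off sketch does the same thing from Python (the path matches the one hard-coded in the spider; exist_ok just makes it safe to run again):

import os

# Create the download root the spider writes into; does nothing if it already exists
os.makedirs('D:/Dituba', exist_ok=True)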
Note: after launch, the program spends a few minutes collecting the full list of image URLs before any downloading starts.
(For the run output, see below.
# -*- coding:utf-8 -*-
import requests
from lxml import etree
import os
import time


class PictureSpider:
    # Crawl every picture on the site, grouped by category
    def __init__(self):
        self.url = "https://www.ituba.cc/"
        self.category = {}

    def get_category(self):
        # Build {section name: {category name: category URL}} from the site's nav bar
        response = requests.get(self.url)
        html = etree.HTML(response.content.decode())
        # Mind the tag selection range: skip the first <li> (the "home" entry)
        general_classification = html.xpath("//div[@class = 'l tionav fs14 b']/ul/li")[1:]
        for i in general_classification:
            name = i.xpath("./h2/a/text()")[0]
            category_names = i.xpath("./div/h3/a/text()")
            category_urls = i.xpath("./div/h3/a/@href")
            category_urls = ['http://www.ituba.cc/' + i for i in category_urls]
            url_data = {m: n for m, n in zip(category_names, category_urls)}
            self.category[name] = url_data

    def get_page_url(self):
        # Read each category's page count, then build the URL list for every page
        for i in self.category:
            for j in self.category[i]:
                url = self.category[i][j]
                response = requests.get(url)
                html = etree.HTML(response.content.decode())
                page_1 = url + "index.html"
                page_format = page_1.replace('index', 'p{}')
                # int() instead of eval(): the page count is plain digit text
                pages = int(html.xpath("//div[@class = 'pages']/ul/a/text()")[-2])
                page_url_list = [page_format.format(i) for i in range(2, pages + 1)]
                page_url_list.insert(0, page_1)
                self.category[i][j] = page_url_list

    def get_picture_url(self):
        # Collect {picture name: full-size image URL} across every page of every category
        for i in self.category:
            for j in self.category[i]:
                name_url = {}
                for page_url in self.category[i][j]:
                    response = requests.get(page_url)
                    html = etree.HTML(response.content.decode())
                    picture_urls = html.xpath("//div[@id = 'NewList']//img/@src")
                    # Swap the thumbnail host for the full-size image host
                    picture_urls = [i.replace('//t1.ituba.cc/', 'http://222.186.12.239:20011/') for i in picture_urls]
                    picture_names = html.xpath("//div[@id = 'NewList']//a[@class = 'PicTxt']/text()")
                    name_url.update({name: url for name, url in zip(picture_names, picture_urls)})
                self.category[i][j] = name_url
                # print(self.category)

    def get_content(self, url):
        response = requests.get(url)
        return response.content

    def save_picture(self, file_name, content):
        with open(file_name, 'wb') as f:
            f.write(content)

    def run(self):
        self.get_category()
        self.get_page_url()
        self.get_picture_url()
        print(self.category)
        for category1 in self.category:
            file_category1 = 'D:/Dituba' + '/' + category1
            # makedirs with exist_ok won't crash if the folder is already there
            os.makedirs(file_category1, exist_ok=True)
            for category2 in self.category[category1]:
                file_category2 = file_category1 + '/' + category2
                os.makedirs(file_category2, exist_ok=True)
                for picture_name in self.category[category1][category2]:
                    try:
                        content = self.get_content(self.category[category1][category2][picture_name])
                    except requests.RequestException:
                        continue  # skip pictures whose download fails
                    try:
                        file_name = file_category2 + '/' + picture_name + '.jpg'
                        self.save_picture(file_name, content)
                    except OSError:
                        continue  # skip names that are not valid file names
                print("{} downloaded".format(file_category2))
                time.sleep(5)


# Note: the program needs a few minutes to collect all the image URLs before downloading begins
if __name__ == '__main__':
    spider = PictureSpider()
    spider.run()
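One fragile spot worth flagging: every requests.get above runs with no timeout and the default requests User-Agent, so a single stalled page can hang the whole crawl, and some hosts reject non-browser clients. A minimal hardened fetch helper, assuming a generic browser UA string and an arbitrary 10-second timeout (both my choices, not part of the original code), might look like:

import requests
from lxml import etree

HEADERS = {'User-Agent': 'Mozilla/5.0'}  # assumed generic UA; any common browser value works

def get_html(url):
    # The timeout stops one dead page from hanging the run (10 s is an arbitrary pick)
    response = requests.get(url, headers=HEADERS, timeout=10)
    response.raise_for_status()  # surface HTTP errors instead of parsing an error page
    return etree.HTML(response.content.decode())

Swapping this in for the repeated requests.get(...) / etree.HTML(...) pairs in each method leaves the parsing logic untouched.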
And that's only a small part of it.
The whole set is about 1.49 GB and a complete run takes around 2 hours... but collecting the links only takes 20-30 minutes, and images start coming in after that.
Or you can add me on QQ (3461896724) and I'll send you the zipped images directly, takes just 2 minutes~ (there's also a QQ group, 754827260, for batch resources~
If you like it, then... give it a like.
Click "Home" to see more.