Here's the code:

#First create the folder D:/Dituba, then run it. It uses requests (the code also imports lxml); if you don't have them, run pip install requests in cmd to get them~ (If you don't follow the code, just skip ahead to the text below.)
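If you'd rather create the folder from code instead of by hand, a minimal one-off sketch (assuming the same D:/Dituba path used below) looks like this:

import os
os.makedirs('D:/Dituba', exist_ok=True)  # creates the folder if it is missing; does nothing if it already exists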

Note: after the program starts, it needs a few minutes to collect the full list of image URLs before any downloading begins.
(For the results of a run, see further down.)
# -*- coding:utf-8 -*-
import requests
from lxml import etree
import os
import time


class PictureSpider:  # crawl all of the site's pictures, grouped by category
    def __init__(self):
        self.url = "https://www.ituba.cc/"
        self.category = {}

    def get_category(self):
        response = requests.get(self.url)
        html = etree.HTML(response.content.decode())
        general_classification = html.xpath("//div[@class = 'l tionav fs14 b']/ul/li")[1:]  # mind the scope of the tag selection: skip the first <li>
        for i in general_classification:
            name = i.xpath("./h2/a/text()")[0]
            category_names = i.xpath("./div/h3/a/text()")
            category_urls = i.xpath("./div/h3/a/@href")
            category_urls = ['http://www.ituba.cc/' + href for href in category_urls]  # the hrefs are relative, prepend the site root
            url_data = {m: n for m, n in zip(category_names, category_urls)}
            self.category[name] = url_data

    def get_page_url(self):
        for i in self.category:
            for j in self.category[i]:
                url = self.category[i][j]
                response = requests.get(url)
                html = etree.HTML(response.content.decode())
                # read the total page count and build the URL of every listing page
                page_1 = url + "index.html"
                page_format = page_1.replace('index', 'p{}')
                pages = int(html.xpath("//div[@class = 'pages']/ul/a/text()")[-2])  # int() instead of eval()
                page_url_list = [page_format.format(page) for page in range(2, pages + 1)]
                page_url_list.insert(0, page_1)
                self.category[i][j] = page_url_list

    def get_picture_url(self):
        for i in self.category:
            for j in self.category[i]:
                name_url = {}
                for page_url in self.category[i][j]:
                    response = requests.get(page_url)
                    html = etree.HTML(response.content.decode())
                    picture_urls = html.xpath("//div[@id = 'NewList']//img/@src")
                    picture_urls = [src.replace('//t1.ituba.cc/', 'http://222.186.12.239:20011/') for src in picture_urls]  # swap the protocol-relative CDN prefix for an absolute host
                    picture_names = html.xpath("//div[@id = 'NewList']//a[@class = 'PicTxt']/text()")
                    name_url.update({name: url for name, url in zip(picture_names, picture_urls)})
                self.category[i][j] = name_url
                # print(self.category)

    def get_content(self, url):
        response = requests.get(url)
        content = response.content
        return content

    def save_picture(self, file_name, content):
        with open(file_name, 'wb') as f:  # the with block closes the file automatically
            f.write(content)

    def run(self):
        self.get_category()
        self.get_page_url()
        self.get_picture_url()
        print(self.category)
        for category1 in self.category:
            file_category1 = 'D:/Dituba' + '/' + category1
            os.makedirs(file_category1, exist_ok=True)  # don't crash if the folder already exists
            for category2 in self.category[category1]:
                file_category2 = file_category1 + '/' + category2
                os.makedirs(file_category2, exist_ok=True)
                for picture_name in self.category[category1][category2]:
                    try:
                        content = self.get_content(self.category[category1][category2][picture_name])
                    except Exception:  # skip pictures that fail to download
                        continue
                    else:
                        try:
                            file_name = file_category2 + '/' + picture_name + '.jpg'
                            self.save_picture(file_name, content)
                        except Exception:  # skip pictures that fail to save (e.g. illegal characters in the name)
                            continue
            print("{} download finished".format(file_category1))
            time.sleep(5)


# Note: after starting, the program spends a few minutes collecting every image URL before any downloading begins
if __name__ == '__main__':
    spider = PictureSpider()
    spider.run()

(Screenshots of the downloaded results were shown here; they are only a small part of the full set.)

The full set is about 1.49 GB, and a complete run takes about 2 hours... but the links are all collected within 20-30 minutes, and images start appearing from then on.
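If you only want that link-collection phase and not the 2-hour download, a rough sketch like this reuses the class's own methods and writes the collected URL dictionary to a JSON file (the output filename is just an illustration):

import json

spider = PictureSpider()
spider.get_category()     # top-level categories and their sub-categories
spider.get_page_url()     # every listing page per sub-category
spider.get_picture_url()  # picture name -> picture URL for every listing page
with open('picture_urls.json', 'w', encoding='utf-8') as f:
    json.dump(spider.category, f, ensure_ascii=False, indent=2)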

But you can also add my QQ 3461896724 and I'll send you the zipped images, it only takes 2 minutes~ (Or join the QQ group 754827260, where the resources are shared in bulk~)

If you think it's good, then... give it a like.

Check the "Home" page for more.

Copyright notice: this is an original article by hopecloud, released under the CC 4.0 BY-SA license. Please include a link to the original and this notice when reposting.
Original link: https://www.cnblogs.com/hopecloud/p/sexygo.html