Web Scraping: Crawling Amazon
Crawl the books matching the parameters passed to the mazon function and save each result as JSON.
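The flow, roughly: request the search results page for a keyword, read the total page count from the pagination bar, walk every result page, follow each product link, and write one JSON file per product. Calls look like this (a sketch; 'stripbooks' is an illustrative category alias for the i= query parameter, not a value confirmed for amazon.cn):

mazon('python')                 # search all categories
mazon('python', 'stripbooks')   # hypothetical category alias, sent as &i=stripbooks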
import requests
import re
import random
import json
from bs4 import BeautifulSoup
# Browser User-Agent string; Amazon tends to block requests that look like bots.
user_agent = ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
              'Chrome/53.0.2785.104 Safari/537.36 Core/1.53.4295.400 QQBrowser/9.7.12661.400')
# Request headers, including a session cookie copied from a logged-in browser.
headers = {'User-Agent': user_agent, 'Host': 'www.amazon.cn',
           'Cookie': 'x-wl-uid=1+EeiKz9a/J/y3g6XfXTnSbHAItJEus3oQ6Gz+T/haur7dZfkNIgoxzMGwviB+42iWIyk9LR+iHQ=;'
                     ' session-id=457-2693740-8878563; ubid-acbcn=459-5133849-3255047; lc-acbcn=zh_CN; i18n-prefs=CNY; '
                     'session-token="8n/Oi/dUCiI9zc/0zDLjB9FQRC6sce2+Tl7F0oXncOcIYDK4SEJ7eek/Vs3UfwsRchW459OZni0AFjMW+'
                     '9xMMBPSLM8MxLNDPP1/13unryj8aiRIZAE1WAn6GaeAgauNsijuBKKUwwLh8Dba7hYEjwlI1J6xlW0LKkkyVuApjRXnOsvdYr'
                     'X8IURVpOxDBnuAF9r7O71d/NPkIQsHy7YCCw=="; session-id-time=2082787201l;'
                     ' csm-hit=tb:s-85XYJNXFEJ5NBKR0JE6H|1566558845671&t:1566558845672&adb:adblk_no'}
def mazon(text, category=''):
    # Narrow the search with Amazon's "i=" category parameter when one is given.
    if category != '':
        category = '&i=' + category
    cookies = dict(useid='123456', token='funkystyle')  # dummy cookies merged into the request
    responsts = requests.get(f'https://www.amazon.cn/s?k={text}{category}&__mk_zh_CN=亚马逊网站&ref=nb_sb_noss',
                             headers=headers, cookies=cookies)
    responsts.encoding = responsts.apparent_encoding
    index = responsts.text
    buti = BeautifulSoup(index, 'html.parser')
    # print(buti.prettify())
    if responsts.status_code == 200:
        # The last disabled <li> of the pagination bar holds the total page count.
        page = re.findall(r'class="a-disabled">(\d+)</li>', index)
        last_page = int(page[-1]) if page else 1  # no pagination bar means a single results page
        for i in range(1, last_page + 1):
            rand = random.randint(1560000000, 1570000000)  # fake the qid timestamp parameter
            url = f'https://www.amazon.cn/s?k={text}{category}&page={i}&__mk_zh_CN=亚马逊网站&qid={rand}&ref=sr_pg_{i}'
            responst = requests.get(url, headers=headers)
            responst.encoding = responst.apparent_encoding
            if responst.status_code != 200:
                print(f'Request for page {i} failed')
                break
            content = responst.text
            # Collect every product-detail link on this results page.
            goodslist = re.findall('<a class="a-link-normal a-text-normal" target="_blank" href="(.*?)ref=', content)
            for j, goods in enumerate(goodslist):
                goodsurl = f'https://www.amazon.cn/{goods.lstrip("/")}'  # the captured href already starts with "/"
                res = requests.get(goodsurl, headers=headers)
                res.encoding = res.apparent_encoding
                if res.status_code != 200:
                    print(f'Request for item {j + 1} on page {i} failed')
                    continue
                cont = res.text
                title = re.findall('<span id="ebooksProductTitle" class="a-size-extra-large">(.*?)</span>', cont, re.S)
                title = re.sub(r'\s+|&.*?;', '', '-'.join(title))  # strip whitespace and HTML entities
                author = re.findall('<span class="author notFaded".*?href=.*?>(.*?)</a>', cont, re.S)
                price = re.findall('<span class="a-size-base a-color-price a-color-price">(.*?)</span>', cont, re.S)
                price = re.findall(r'\S+', '-'.join(price))
                dic_infor = {'title': title, 'author': author, 'price': price}
                # One JSON file per product, named by page number and position on the page.
                with open(f'page{i}_item{j + 1}.json', 'wt', encoding='utf8') as fa:
                    json.dump(dic_infor, fa, ensure_ascii=False)
                    fa.flush()
                # with open(f'{i}.txt', 'wt', encoding='utf8') as fw:
                #     fw.write()
    else:
        print('Failed to fetch the first results page!')
        # responsts.raise_for_status()
mazon('python')
# with open('page2_item1.json', 'rt', encoding='utf8') as fr:
#     data = json.load(fr)
#     print(data)
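To sanity-check the output, a minimal sketch that loads every per-product file back in one go (assuming the page*_item*.json files written above sit in the working directory):

import glob
import json

# Print one summary line per scraped product.
for path in sorted(glob.glob('page*_item*.json')):
    with open(path, 'rt', encoding='utf8') as fr:
        info = json.load(fr)
    print(path, info['title'], info['price'])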