Scrape listing data for Beijing from Xiaozhu, a short-term rental site, and store the results in TXT, JSON, and CSV formats.

Reference URL: http://bj.xiaozhu.com/

The fields to scrape are the listing title, address, price, host name, host gender, and the link to the host's avatar image. The same records are written to all three formats (TXT, JSON, and CSV).
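For reference, each listing is parsed into one dict of six fields, so a stored record looks like this (all values below are made-up placeholders, not real Xiaozhu data):

{'title': 'Sample listing title', 'address': 'Sample address, Chaoyang District, Beijing', 'price': '398', 'name': 'SampleHost', 'sex': 'female', 'img': 'http://example.com/avatar.jpg'}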

import csv
import time
import json
import requests
from bs4 import BeautifulSoup
from requests import RequestException
    
    
def get_one_page(url):
    """Fetch a page and return its HTML text, or None on any failure."""
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'}
        response = requests.get(url, headers=headers)
        # response.encoding = response.apparent_encoding  # optional fix for mis-detected encodings
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

def get_detailurl(text):
    """Collect the detail-page URL of every listing on a search-results page."""
    detailurl = []
    soup = BeautifulSoup(text, 'lxml')
    # Each result div is assumed to carry its detail-page URL in a custom
    # 'detailurl' attribute, so the link can be read straight off the tag.
    result_btm_con = soup.find_all(name='div', class_='result_btm_con lodgeunitname')
    for item in result_btm_con:
        detailurl.append(item['detailurl'])
    return detailurl
    
def parse_one_page(text):
    soup = BeautifulSoup(text, 'lxml')  # parse the detail page with the lxml parser
    title = soup.select('.pho_info > h4 > em')
    address = soup.select('.pho_info > p')
    price = soup.find_all(name='span', class_='detail_avgprice')
    name = soup.find_all(name='a', class_='lorder_name')
    # The host's gender is only exposed through the icon class on the page;
    # which class maps to which gender is an assumption about Xiaozhu's markup.
    sex = soup.find_all(name='div', class_='member_ico')
    sex1 = soup.find_all(name='div', class_='member_ico1')
    if sex:
        ssex = 'male'      # assumed: 'member_ico' marks male hosts
    elif sex1:
        ssex = 'female'    # assumed: 'member_ico1' marks female hosts
    else:
        ssex = ''
    img = soup.select('.member_pic img')
    yield {
        'title': title[0].string,
        'address': address[0]['title'],
        'price': price[0].string,
        'name': name[0]['title'],
        'sex': ssex,
        'img': img[0]['src']
    }
        
def write_to_file(content):
    with open('xiaozhu.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
        # dumps serializes the dict to a JSON string; ensure_ascii=False keeps Chinese text readable

def write_to_json(content):
    # called once with the full result list, so the file holds a single JSON array
    with open('xiaozhu.json', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
        
def write_to_csv(content):
    # newline='' prevents blank rows on Windows; the header is only written
    # once because main calls this a single time with the full result list
    with open('xiaozhu.csv', 'a', encoding='utf-8', newline='') as f:
        fieldnames = ['title', 'address', 'price', 'name', 'sex', 'img']
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(content)

if __name__ == '__main__':
    url = 'http://bj.xiaozhu.com/search-duanzufang-p{}-0/'
    urls = [url.format(page) for page in range(1, 3)]  # the first two result pages
    content = []
    for url in urls:
        text1 = get_one_page(url)
        detailurl = get_detailurl(text1)
        for i in detailurl:
            time.sleep(1)  # throttle detail-page requests so the site is not hammered
            text2 = get_one_page(i)
            for item in parse_one_page(text2):
                print(item)
                write_to_file(item)
                content.append(item)
    write_to_csv(content)
    write_to_json(content)
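To sanity-check that all three outputs round-trip, here is a minimal read-back sketch (not part of the script above; it assumes the file names used there and a single run, since the writers open their files in append mode):

import csv
import json

with open('xiaozhu.txt', encoding='utf-8') as f:
    txt_records = [json.loads(line) for line in f]   # one JSON object per line

with open('xiaozhu.json', encoding='utf-8') as f:
    json_records = json.load(f)                      # one JSON array for the whole run

with open('xiaozhu.csv', encoding='utf-8', newline='') as f:
    csv_records = list(csv.DictReader(f))            # the header row becomes the dict keys

print(len(txt_records), len(json_records), len(csv_records))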
