目标:爬取网页上的内容,实现翻页,并将结果存储为 CSV 文件。

import csv
import os
import time
from concurrent.futures.thread import ThreadPoolExecutor
from json import loads
from multiprocessing import Queue
from re import findall
from threading import Thread

import pandas as pd
import pymysql
import requests
# Fetch the content of a single result page.
def get_one_page(page, city_code='000000'):
    """Fetch one page of 51job search results for the keyword "数据分析".

    Args:
        page: Page number substituted into the search URL.
        city_code: Code placed in the first comma-separated field of the URL
            path (presumably the 51job job-area code -- confirm against the
            site). The default '000000' reproduces the URL that was
            previously hard-coded, so existing callers are unaffected.

    Returns:
        The ``engine_search_result`` value from the JSON object embedded in
        the page, or None when the request fails, the HTTP status is not 200,
        or the embedded JSON cannot be located or parsed.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 90.0.4430.212 Safari / 537.36'
    }
    url = (
        f'https://search.51job.com/list/{city_code},000000,0000,00,9,99,数据分析,2,{page}.html'
        '?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99'
        '&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
    )

    try:
        # Without a timeout requests may block forever on a stalled connection.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        print('请求失败!')
        return None

    if response.status_code != 200:
        print('请求失败!')
        return None

    # The result list is embedded in the HTML as a JSON object assigned to
    # window.__SEARCH_RESULT__ inside a <script> tag.
    matches = findall(r'window.__SEARCH_RESULT__\s*=\s*(\{.+?\})</script>', response.text)
    if not matches:
        # Page layout changed or the request was served a different page;
        # report it instead of raising IndexError on matches[0].
        print('解析失败!')
        return None

    try:
        return loads(matches[0])['engine_search_result']
    except (ValueError, KeyError):
        # ValueError covers malformed JSON (json.JSONDecodeError subclasses it).
        print('解析失败!')
        return None
# How many pages to fetch at most.
MAX_PAGES = 10

start_page = 1
ts = []  # one entry per successfully fetched page, as returned by get_one_page
for _ in range(MAX_PAGES):
    result = get_one_page(start_page)
    if not result:
        # A falsy result (None or an empty result) ends the crawl early.
        print('没有更多数据')
        break
    ts.append(result)
    start_page += 1
# data_1 = get_one_page(1)  # experiment: keep the content of a single page
data_1 = []  # flat list holding the records of every fetched page

# Take whatever each page actually contains rather than assuming exactly 50
# records per page: a short (e.g. last) page used to raise IndexError, and any
# records beyond the 50th were silently dropped.
for page_records in ts:
    data_1.extend(page_records)
# The fields to store for every job record, in output column order.
jobs = [
    [
        job.get('job_name'),
        job.get('providesalary_text'),
        job.get('company_name'),
        job.get('companytype_text'),
        job.get('workarea_text'),
        # attribute_text is a sequence of strings that is collapsed into one
        # '-'-separated cell; a placeholder list is used when the key is absent.
        '-'.join(job.get('attribute_text', ['-', '-', '-', '-', '-'])),
        job.get('jobwelf'),
    ]
    for job in data_1
]
# Column headers, in the same order as the values collected in each row of jobs.
# NOTE(review): 'workarea_tex' looks like a typo for 'workarea_text'; it is kept
# as-is so the header of the generated CSV does not change -- confirm before
# renaming.
name = [
    'job_name',
    'providesalary_text',
    'company_name',
    'companytype_text',
    'workarea_tex',
    'attribute_text',
    'jobwelf',
]
test = pd.DataFrame(columns=name, data=jobs)
test.to_csv("testcsv.csv")  # save in CSV format
test.info()  # print a summary of the frame (columns, non-null counts, dtypes)
运行结果(test.info() 的输出):
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   job_name            500 non-null    object
 1   providesalary_text  500 non-null    object
 2   company_name        500 non-null    object
 3   companytype_text    500 non-null    object
 4   workarea_tex        500 non-null    object
 5   attribute_text      500 non-null    object
 6   jobwelf             500 non-null    object
dtypes: object(7)
memory usage: 27.5+ KB


重要参考:https://gitee.com/wenhaha8/job51_analysis

版权声明:本文为Cookie-Jing原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接:https://www.cnblogs.com/Cookie-Jing/p/15149865.html