爬取51job招聘信息(一)
目标,将网页上的内容爬取下来,并实现翻页,存储为csv。
import csv
import os
import time
from concurrent.futures.thread import ThreadPoolExecutor
from json import loads
from multiprocessing import Queue
from re import findall
from threading import Thread

import pandas as pd
import pymysql
import requests
# Fetch the content of one result page.
def get_one_page(page, city_code='000000'):
    """Fetch one page of 51job search results for the keyword "数据分析".

    Args:
        page: Page number substituted into the search URL.
        city_code: Value placed in the first comma-separated field of the
            URL path. The default '000000' is the value that used to be
            hard-coded there. NOTE(review): presumably this field is the
            job-area code -- confirm against the site before relying on it.

    Returns:
        The value stored under 'engine_search_result' in the JSON object
        embedded in the page, or None when the response status is not 200
        or the embedded JSON object cannot be located.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 90.0.4430.212 Safari / 537.36'
    }
    # The query string previously contained "°reefrom" -- an HTML-entity
    # mangling of "&degreefrom" -- which fused two parameters together.
    url = (
        f'https://search.51job.com/list/{city_code},000000,0000,00,9,99,'
        f'数据分析,2,{page}.html'
        '?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99'
        '&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
    )
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code != 200:
        print('请求失败!')
        return None

    # The result data is embedded in the HTML as a JavaScript assignment.
    matches = findall(
        r'window.__SEARCH_RESULT__\s*=\s*(\{.+?\})</script>',
        response.text,
    )
    if not matches:
        # Return None instead of raising IndexError so the caller's
        # "no more data" check can end the crawl cleanly.
        print('页面中未找到搜索结果!')
        return None
    return loads(matches[0])['engine_search_result']
# How many pages do we need? Fetch up to 10, starting from page 1.
start_page = 1
ts = []  # one entry per successfully fetched page
while len(ts) < 10:
    result = get_one_page(start_page)
    if not result:
        # An empty or missing result ends the crawl early.
        print('没有更多数据')
        break
    ts.append(result)
    start_page += 1
# data_1 = get_one_page(1)  # try saving a single page first
# Flatten the per-page results in `ts` into one list of job records.
# Each page is iterated directly instead of being indexed 0..49, so a page
# holding fewer than 50 postings no longer raises IndexError.
data_1 = [job for page in ts for job in page]
# The fields to keep for each posting, one row per job record.
jobs = []
for posting in data_1:
    # 'attribute_text' is joined into a single '-'-separated string; a
    # record without that key falls back to five '-' placeholders.
    attributes = posting.get('attribute_text', ['-', '-', '-', '-', '-'])
    row = [
        posting.get(key)
        for key in (
            'job_name',
            'providesalary_text',
            'company_name',
            'companytype_text',
            'workarea_text',
        )
    ]
    row.append('-'.join(attributes))
    row.append(posting.get('jobwelf'))
    jobs.append(row)
# Column headers, in the same order as the fields of each row in `jobs`.
# `pd` is pandas, imported at the top of the file as `import pandas as pd`
# (that import was previously missing, so this cell raised NameError).
name = [
    'job_name',
    'providesalary_text',
    'company_name',
    'companytype_text',
    # NOTE(review): spelled without the final "t" (the source key is
    # 'workarea_text'); left unchanged because this is the header written
    # to testcsv.csv and readers of that file may depend on it.
    'workarea_tex',
    'attribute_text',
    'jobwelf',
]
test = pd.DataFrame(columns=name, data=jobs)
test.to_csv("testcsv.csv")  # save in CSV format
# Print a summary of the frame (column names, non-null counts, dtypes).
test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   job_name            500 non-null    object
 1   providesalary_text  500 non-null    object
 2   company_name        500 non-null    object
 3   companytype_text    500 non-null    object
 4   workarea_tex        500 non-null    object
 5   attribute_text      500 non-null    object
 6   jobwelf             500 non-null    object
dtypes: object(7)
memory usage: 27.5+ KB
重要参考:https://gitee.com/wenhaha8/job51_analysis
版权声明:本文为Cookie-Jing原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。