# [Python crawler] Qidian Chinese Network (起点中文网) novel ranking scraper
import requests
import re
import time
import json

from requests.exceptions import RequestException

# Compiled once at module level (the original recompiled/rebuilt the pattern
# object on every page).  Captures (rank, title, author) from each
# <li data-rid...> entry.  Raw strings avoid invalid-escape warnings for
# \s/\S; the original class "[\s\S.]" contained a redundant "." (already
# covered by \s\S) and is simplified to "[\s\S]" -- same match behavior.
# NOTE(review): regex-scraping HTML is brittle; this depends on Qidian's
# markup staying unchanged.
RANK_PATTERN = re.compile(
    r'<li data-rid.*?>[\s\S]*?<span class=.*?>(.*?)<cite>[\s\S]*?'
    r'<h4><a.*?data-bid=.*?>(.*?)</a></h4>[\s\S]*?<p class="author">'
    r'[\s\S]*?<img.*?data-eid=.*?>(.*?)</a>',
    re.S,
)


def get_html_page(url):
    """GET *url* with a desktop User-Agent and return the body text.

    Returns None on a non-200 status or on any requests-level failure
    (connection error, timeout, ...), matching the original contract.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36',
        }
        # timeout added so a stalled connection cannot hang the crawl forever;
        # a Timeout is a RequestException subclass, so it is caught below.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def get_parse_page(html):
    """Yield {'rank', 'title', 'author'} dicts parsed from a ranking page.

    A None/empty *html* (failed download) yields nothing instead of
    crashing inside re.findall, as the original did.
    """
    if not html:
        return
    for rank, title, author in RANK_PATTERN.findall(html):
        yield {
            'rank': rank,
            'title': title,
            'author': author,
        }


def write_to_file(content):
    """Append *content* to result.txt as one JSON object per line."""
    with open('result.txt', 'a', encoding='utf-8') as f:
        # ensure_ascii=False keeps Chinese titles human-readable in the file
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(page):
    """Download monthly-ticket ranking page *page* and persist every entry."""
    url = 'https://www.qidian.com/rank/yuepiao?page=' + str(page)
    html = get_html_page(url)
    for entry in get_parse_page(html):
        write_to_file(entry)


if __name__ == '__main__':
    # Pages 1..5; sleep between requests to be polite to the server.
    for page in range(1, 6):
        main(page)
        time.sleep(1)
# 版权声明: 本文为 lightmonster 原创文章, 遵循 CC 4.0 BY-SA 版权协议, 转载请附上原文出处链接和本声明。
# (Copyright notice: original article by lightmonster, CC 4.0 BY-SA; keep attribution when reposting.)