Crawling WeChat Official Account Articles
Sogou's WeChat search (weixin.sogou.com) indexes WeChat Official Accounts and their articles. Sogou blocks IPs that request too frequently, so we crawl it through a pool of rotating proxies.
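To see why the spider below disables redirects: Sogou serves normal results with HTTP 200, but once it flags an IP it answers with a 302 redirect to an anti-spider page. A minimal probe (the query URL matches the one spider.py builds; how quickly the 302 appears depends on your IP's recent request history):

import requests

# Probe Sogou's WeChat search without following redirects.
# 200 = normal results page; 302 = anti-spider redirect, time to switch proxy.
url = 'http://weixin.sogou.com/weixin?type=2&query=python'
response = requests.get(url, allow_redirects=False)
print(response.status_code)
if response.status_code == 302:
    print('Blocked, redirecting to:', response.headers.get('Location'))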
spider.py
from urllib.parse import urlencode

import pymongo
import requests
from lxml.etree import XMLSyntaxError
from requests.exceptions import ConnectionError
from pyquery import PyQuery as pq
from config import *

client = pymongo.MongoClient(MONGO_URI)
db = client[MONGO_DB]

base_url = 'http://weixin.sogou.com/weixin?'

headers = {
    # Cookie copied from a logged-in browser session; replace with your own
    'Cookie': 'IPLOC=CN1100; SUID=194E796A2E08990A000000005B114E85; SUV=1527860869604056; ABTEST=1|1527860872|v1; SNUID=9FCBFCEF8680EB12510E6A9C86088B29; weixinIndexVisited=1; JSESSIONID=aaaqa95rD87Zu9-CJwlnw; sct=5; ppinf=5|1527862844|1529072444|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZToyNzolRTclOEUlOEIlRTclOTAlQjMlRTYlOUQlQjB8Y3J0OjEwOjE1Mjc4NjI4NDR8cmVmbmljazoyNzolRTclOEUlOEIlRTclOTAlQjMlRTYlOUQlQjB8dXNlcmlkOjQ0Om85dDJsdUh5bE5VSDJEVWNuSHBDWnVOVG9sN2tAd2VpeGluLnNvaHUuY29tfA; pprdig=EZE8CVVtoUTqmCoJj6bEWwKngY4di5UpGDFImTA9-1qrMK_tIJEtUyGR9_0Jcv5Xw1EuqLO9BNFvAKQv5DOQvmCWh-jxudk7SGv89NuhCLow7dxPysoOtLSI-keSaKVLKT82Vhg7rDBg0SlQ3y2uiG53lBUWL0wLVw4D_f_7MLg; sgid=17-35315605-AVsRVjwpV4ichpAzPibp6olGY; ppmdig=1527862844000000243bdb95cb03e086685bb1de06087c32',
    'Host': 'weixin.sogou.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.7 Safari/537.36'
}

proxy = None


def get_proxy():
    """Fetch one proxy (an "ip:port" string) from the local proxy pool."""
    try:
        response = requests.get(PROXY_POOL_URL)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        return None


def get_html(url, count=1):
    """Request a Sogou page, switching to a fresh proxy whenever Sogou
    answers with its anti-spider 302 redirect."""
    print('Crawling', url)
    print('Trying Count', count)
    global proxy
    if count >= MAX_COUNT:
        print('Tried Too Many Counts')
        return None
    try:
        if proxy:
            proxies = {
                'http': 'http://' + proxy
            }
            # allow_redirects=False keeps the anti-spider 302 visible
            response = requests.get(url, allow_redirects=False, headers=headers, proxies=proxies)
        else:
            response = requests.get(url, allow_redirects=False, headers=headers)
        if response.status_code == 200:
            return response.text
        if response.status_code == 302:
            # 302 means Sogou has flagged the current IP; get a new proxy
            print('302')
            proxy = get_proxy()
            if proxy:
                print('Using Proxy', proxy)
                # count the proxy switch as a retry, otherwise the
                # recursion never terminates when every proxy is blocked
                count += 1
                return get_html(url, count)
            print('Get Proxy Failed')
            return None
    except ConnectionError as e:
        print('Error Occurred', e.args)
        proxy = get_proxy()
        count += 1
        return get_html(url, count)


def get_index(keyword, page):
    """Fetch one page of Sogou search results for the keyword."""
    data = {
        'query': keyword,
        'type': 2,  # type=2 searches articles (type=1 searches accounts)
        'page': page
    }
    queries = urlencode(data)
    url = base_url + queries
    html = get_html(url)
    return html


def parse_index(html):
    """Yield the article URLs from one search-results page."""
    doc = pq(html)
    items = doc('.news-box .news-list li .txt-box h3 a').items()
    for item in items:
        yield item.attr('href')


def get_detail(url):
    """Fetch an article page; mp.weixin.qq.com does not need the proxy."""
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        return None


def parse_detail(html):
    """Extract title, body text, publish date and account info."""
    try:
        doc = pq(html)
        title = doc('.rich_media_title').text()
        content = doc('.rich_media_content').text()
        date = doc('#publish_time').text()
        nickname = doc('#js_profile_qrcode > div > strong').text()
        wechat = doc('#js_profile_qrcode > div > p:nth-child(3) > span').text()
        return {
            'title': title,
            'content': content,
            'date': date,
            'nickname': nickname,
            'wechat': wechat
        }
    except XMLSyntaxError:
        return None


def save_to_mongo(data):
    """Upsert by title so re-crawled articles are deduplicated."""
    if db['articles'].update_one({'title': data['title']}, {'$set': data}, upsert=True):
        print('Saved to Mongo', data['title'])
    else:
        print('Saved to Mongo Failed', data['title'])


def main():
    for page in range(1, 101):
        html = get_index(KEYWORD, page)
        if html:
            article_urls = parse_index(html)
            for article_url in article_urls:
                article_html = get_detail(article_url)
                if article_html:
                    article_data = parse_detail(article_html)
                    print(article_data)
                    if article_data:
                        save_to_mongo(article_data)


if __name__ == '__main__':
    main()
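Once the spider has run, a quick way to check what landed in MongoDB (a sketch assuming the defaults from config.py below: database weixin, collection articles):

import pymongo

# Connect with the same defaults spider.py uses via config.py.
client = pymongo.MongoClient('localhost')
collection = client['weixin']['articles']
print(collection.count_documents({}))  # how many articles were upserted
for article in collection.find().limit(3):
    print(article['title'], '|', article['nickname'], '|', article['date'])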
config.py
PROXY_POOL_URL = 'http://127.0.0.1:5555/random'
KEYWORD = 'python'
MONGO_URI = 'localhost'
MONGO_DB = 'weixin'
MAX_COUNT = 5
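PROXY_POOL_URL points at a locally running proxy pool; get_proxy() assumes its /random endpoint returns a bare "ip:port" string, which get_html() then prefixes with "http://". A quick sanity check before starting a crawl (the exact response format is an assumption carried over from how get_proxy() uses it):

import requests

# Ask the pool for one proxy; expected to be plain "ip:port" text.
resp = requests.get('http://127.0.0.1:5555/random')
if resp.status_code == 200:
    print('Proxy from pool:', resp.text)  # e.g. "58.62.115.3:4226" (format assumed)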