Sogou's WeChat search (weixin.sogou.com) indexes WeChat official accounts (微信公众平台) and their articles. This spider queries that index for a keyword, follows each search result to the article page, and stores the parsed articles in MongoDB. Sogou answers frequent requests from one IP with a 302 redirect, so the crawler switches to proxies drawn from a local proxy pool once that happens.
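For reference, the search URL that get_index() below assembles is just a base endpoint plus three query parameters. A minimal sketch (type=2 appears to select article search, as opposed to account search):

from urllib.parse import urlencode

# Page 1 of an article search for 'python' on Sogou's WeChat index.
params = {'query': 'python', 'type': 2, 'page': 1}
print('http://weixin.sogou.com/weixin?' + urlencode(params))
# -> http://weixin.sogou.com/weixin?query=python&type=2&page=1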


spider.py

from urllib.parse import urlencode

import pymongo
import requests
from lxml.etree import XMLSyntaxError
from requests.exceptions import ConnectionError
from pyquery import PyQuery as pq

from config import *

client = pymongo.MongoClient(MONGO_URI)
db = client[MONGO_DB]

base_url = 'http://weixin.sogou.com/weixin?'

# Headers copied from a logged-in browser session. The Cookie is tied to that
# session and will expire, so substitute your own before running.
headers = {
    'Cookie': 'IPLOC=CN1100; SUID=194E796A2E08990A000000005B114E85; SUV=1527860869604056; ABTEST=1|1527860872|v1; SNUID=9FCBFCEF8680EB12510E6A9C86088B29; weixinIndexVisited=1; JSESSIONID=aaaqa95rD87Zu9-CJwlnw; sct=5; ppinf=5|1527862844|1529072444|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZToyNzolRTclOEUlOEIlRTclOTAlQjMlRTYlOUQlQjB8Y3J0OjEwOjE1Mjc4NjI4NDR8cmVmbmljazoyNzolRTclOEUlOEIlRTclOTAlQjMlRTYlOUQlQjB8dXNlcmlkOjQ0Om85dDJsdUh5bE5VSDJEVWNuSHBDWnVOVG9sN2tAd2VpeGluLnNvaHUuY29tfA; pprdig=EZE8CVVtoUTqmCoJj6bEWwKngY4di5UpGDFImTA9-1qrMK_tIJEtUyGR9_0Jcv5Xw1EuqLO9BNFvAKQv5DOQvmCWh-jxudk7SGv89NuhCLow7dxPysoOtLSI-keSaKVLKT82Vhg7rDBg0SlQ3y2uiG53lBUWL0wLVw4D_f_7MLg; sgid=17-35315605-AVsRVjwpV4ichpAzPibp6olGY; ppmdig=1527862844000000243bdb95cb03e086685bb1de06087c32',
    'Host': 'weixin.sogou.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.7 Safari/537.36'
}

# Currently active proxy as 'host:port'; None means connect directly.
proxy = None


def get_proxy():
    """Fetch one proxy ('host:port' plain text) from the local proxy pool."""
    try:
        response = requests.get(PROXY_POOL_URL)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        return None


def get_html(url, count=1):
    """Request url directly, switching to a pool proxy after a 302 redirect."""
    print('Crawling', url)
    print('Trying Count', count)
    global proxy
    if count >= MAX_COUNT:
        print('Tried Too Many Counts')
        return None
    try:
        if proxy:
            proxies = {
                'http': 'http://' + proxy
            }
            response = requests.get(url, allow_redirects=False, headers=headers, proxies=proxies)
        else:
            response = requests.get(url, allow_redirects=False, headers=headers)
        if response.status_code == 200:
            return response.text
        if response.status_code == 302:
            # A 302 means Sogou is throttling this IP, so fetch a fresh proxy
            # and retry. Counting the retry against MAX_COUNT keeps a run of
            # bad proxies from recursing forever.
            print('302')
            proxy = get_proxy()
            if proxy:
                print('Using Proxy', proxy)
                count += 1
                return get_html(url, count)
            else:
                print('Get Proxy Failed')
                return None
        return None
    except ConnectionError as e:
        print('Error Occurred', e.args)
        proxy = get_proxy()
        count += 1
        return get_html(url, count)


def get_index(keyword, page):
    """Fetch one page of search results; type=2 requests article search."""
    data = {
        'query': keyword,
        'type': 2,
        'page': page
    }
    queries = urlencode(data)
    url = base_url + queries
    html = get_html(url)
    return html


def parse_index(html):
    """Yield the article URL of every result on a search-results page."""
    doc = pq(html)
    items = doc('.news-box .news-list li .txt-box h3 a').items()
    for item in items:
        yield item.attr('href')


def get_detail(url):
    """Download the article page itself; a plain request suffices here."""
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        return None


def parse_detail(html):
    """Extract title, body text, publish date and account info from an article page."""
    try:
        doc = pq(html)
        title = doc('.rich_media_title').text()
        content = doc('.rich_media_content').text()
        date = doc('#publish_time').text()
        nickname = doc('#js_profile_qrcode > div > strong').text()
        wechat = doc('#js_profile_qrcode > div > p:nth-child(3) > span').text()
        return {
            'title': title,
            'content': content,
            'date': date,
            'nickname': nickname,
            'wechat': wechat
        }
    except XMLSyntaxError:
        # Some pages cannot be parsed; skip them.
        return None


def save_to_mongo(data):
    """Upsert by title so a re-crawled article overwrites its old record."""
    # upsert=True inserts the record if no article with this title exists yet.
    if db['articles'].update_one({'title': data['title']}, {'$set': data}, upsert=True):
        print('Saved to Mongo', data['title'])
    else:
        print('Saved to Mongo Failed', data['title'])


def main():
    for page in range(1, 101):
        html = get_index(KEYWORD, page)
        if html:
            article_urls = parse_index(html)
            for article_url in article_urls:
                article_html = get_detail(article_url)
                if article_html:
                    article_data = parse_detail(article_html)
                    print(article_data)
                    if article_data:
                        save_to_mongo(article_data)


if __name__ == '__main__':
    main()
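After a run you can spot-check what was stored; a minimal sketch, assuming the MongoDB defaults from config.py below (local server, database weixin):

import pymongo

db = pymongo.MongoClient('localhost')['weixin']
print(db['articles'].count_documents({}))    # number of articles saved so far
for doc in db['articles'].find().limit(3):   # peek at a few records
    print(doc['date'], doc['title'])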


config.py
PROXY_POOL_URL = 'http://127.0.0.1:5555/random'  # proxy pool endpoint, one proxy per request
KEYWORD = 'python'                               # search keyword
MONGO_URI = 'localhost'
MONGO_DB = 'weixin'
MAX_COUNT = 5                                    # max attempts per URL in get_html()
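PROXY_POOL_URL is expected to return a single plain-text host:port proxy per request; that is how get_proxy() consumes it and how get_html() prefixes it with 'http://'. A quick sanity check, assuming the proxy pool service is already running locally:

import requests

# get_proxy() uses the response body as-is, so it should be a bare host:port string.
print('Pool returned:', requests.get('http://127.0.0.1:5555/random').text)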


Copyright notice: this is an original article by wanglinjie, released under the CC 4.0 BY-SA license. Please include the original source link and this notice when reposting.
Original post: https://www.cnblogs.com/wanglinjie/p/9231559.html