Python crawler: scraping WeChat Official Account articles
Currently stuck on the video part. Videos in Official Account articles are hosted on Tencent Video, and both the play and download addresses are encrypted. The vid and vkey have already been extracted, but every request the crawler makes still comes back 403 or 405; unsolved so far.
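One cheap check before anything else is whether the 403 comes from missing browser headers rather than from the vkey itself. Below is a minimal probe sketch, assuming the CDN checks the Referer and the vkey has not yet expired; the Referer value and the helper name are assumptions, not verified facts:

import requests

def probe_video_url(full_url):
    # hypothetical helper: full_url is the vkey-signed mp4 address built in get_video() below
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        'Referer': 'https://v.qq.com/',  # assumption: the CDN may require the player page as referer
        'Range': 'bytes=0-1023',         # ask for a small slice; a 206 would mean the URL itself is fine
    }
    resp = requests.get(full_url, headers=headers, stream=True, timeout=10)
    print(resp.status_code, resp.headers.get('Content-Type'))
    return resp.status_code in (200, 206)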
Idea to try:
Use Selenium: after the page loads, note how long the pre-roll ad runs; once the ad has finished, click the video and pull the loaded video address out of the page for download. Will test tomorrow to see whether it works.
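A minimal sketch of that Selenium idea, assuming the article embeds the Tencent player in an iframe and exposes a <video> tag once the pre-roll ad ends; every selector below is an assumption to verify against the live page:

from selenium import webdriver
import time

def grab_video_src(article_url, ad_seconds=20):
    # hypothetical helper; ad_seconds approximates the pre-roll length
    driver = webdriver.Chrome(executable_path=r'/Applications/Google Chrome.app/chromedriver')
    try:
        driver.get(article_url)
        time.sleep(3)  # let the article render
        iframe = driver.find_element_by_css_selector('iframe.video_iframe')  # assumed selector for the v.qq.com embed
        driver.switch_to.frame(iframe)
        driver.find_element_by_css_selector('.txp_btn_play').click()  # assumed class of the play button
        time.sleep(ad_seconds)  # sit out the ad
        video = driver.find_element_by_tag_name('video')
        return video.get_attribute('src')  # the address the player actually loaded
    finally:
        driver.quit()

If the player streams through MediaSource, the src attribute may come back as a blob: URL that cannot be fetched directly; capturing the browser's network traffic would then be the fallback.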
import requests, pymysql
import json, jsonpath, random, re, time, datetime, os, imghdr
from lxml import etree
from selenium import webdriver
from urllib import request
import ssl

ssl._create_default_https_context = ssl._create_unverified_context

'''
Note: refreshing too often, or paging through the article list too
frequently, will get the account temporarily banned.
'''

# --------------------
user_info = {'username': '####@163.com', 'password': '####'}
base_url = 'https://mp.weixin.qq.com/'
base_headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}
query_list = [  # official accounts to crawl
    {'fakeid': 'MzIzOTQ0MTUwMA==', 'nickname': 'Sir电影'},
    {'fakeid': 'MzIxODc5MzM4NQ==', 'nickname': '鱼Sir电影'},
]
table = 'p_weixin'  # database table name
key = 'title,author,js_name,publish_time,images,vedios'  # note: total_data also carries js_content (7 fields vs 6 columns); align before enabling Mydb()
# --------------------


def get_cookie():  # log in and collect the session cookies
    driver = webdriver.Chrome(executable_path=r'/Applications/Google Chrome.app/chromedriver')
    driver.get(base_url)
    time.sleep(2)  # let the page render
    driver.find_element_by_name('account').clear()
    driver.find_element_by_name('account').send_keys(user_info['username'])
    driver.find_element_by_name('password').clear()
    driver.find_element_by_name('password').send_keys(user_info['password'])
    driver.find_element_by_class_name('icon_checkbox').click()
    driver.find_element_by_class_name('btn_login').click()
    time.sleep(25)  # wait for the QR code to be scanned on the phone
    cookies = {}  # cookie store
    for i in driver.get_cookies():
        cookies[i['name']] = i['value']
    return cookies


def get_info():  # fetch account name, page count, token, fakeid, etc.
    cookies = get_cookie()
    res_token = requests.get(base_url, cookies=cookies, headers=base_headers)
    token = re.search(r'token=(\d+)', res_token.url).group(1)  # was findall(), which returns a list, not the token string
    for query in query_list:  # the list controls how many accounts get crawled
        url = 'https://mp.weixin.qq.com/cgi-bin/appmsg'  # article-list endpoint
        fakeid = query['fakeid']
        appmsg = {
            'token': token,
            'lang': 'zh_CN',
            'f': 'json',
            'ajax': '1',
            'random': random.random(),
            'action': 'list_ex',
            'begin': '0',
            'count': '5',
            'fakeid': fakeid,
            'type': '9',
        }
        res_cnt = requests.get(url, params=appmsg, cookies=cookies)  # first call only reads the total article count
        res_cnt = json.loads(res_cnt.text)
        for cnt in range(0, res_cnt['app_msg_cnt'], 5):  # walk every page, 5 articles per page
            appmsg['begin'] = cnt  # current page offset
            response = requests.get(url, params=appmsg, cookies=cookies)
            data_list = json.loads(response.text)
            for data in data_list['app_msg_list']:  # extract from the current page
                yield [data['title'], data['link']]
            time.sleep(random.randrange(30, 41))  # spacing between pages to avoid the ban
        print('Account: %s, %s articles in total' % (query['nickname'], res_cnt['app_msg_cnt']))  # should be reported separately, no need to store


def get_news(url=None):  # fetch one article (video, audio, images); could run in coroutines, and should rotate agents
    print('-' * 40)
    if url is None:  # hard-coded test article used when running this function standalone
        url = 'https://mp.weixin.qq.com/s?src=3&timestamp=1533397256&ver=1&signature=RbnX4tUBODpql9qsvp4jJRDrtHc-LSXXm9gSM*BNY*PTRKHJ2bUyeKkGPlpKGGsnKl4IyaxubTPPWv6jQzhm52M7qFY5*BJ8dEugb4XPUcLRSs8U-4Bb9ab9mso2NWDq0*RwRzZ2*zZ6r1YyQtNjpg=='
    res = request.Request(url, headers=base_headers)
    response = request.urlopen(res)
    re_data = response.read().decode()
    data = etree.HTML(re_data)
    title = get_try(data, '//h2[@id="activity-name"]/text()')  # article title
    author = get_try(data, '//div[@id="meta_content"]//span[@class="rich_media_meta rich_media_meta_text"]//text()')  # author
    js_name = get_try(data, '//div[@id="meta_content"]//span[@class="rich_media_meta rich_media_meta_text"]//text()')  # account name (same XPath as author for now; selector still to be fixed)
    publish_time = re.compile(r'var publish_time.*?"(.*?)"').findall(re_data)[0]  # publish time: "today", "yesterday", "2 days ago", "1 week ago", etc.
    images_list = []  # images
    vedio_list = []   # audio and video
    # body text plus image / video / audio addresses
    js_content = data.xpath('//div[@id="js_content"]//p//text()|//div[@id="js_content"]//p//img/@data-src|//div[@id="js_content"]//p//iframe/@data-src|//mpvoice')
    for i in range(len(js_content)):
        if js_content[i] == '。' or js_content[i] == ',':
            js_content[i] = ''
        elif isinstance(js_content[i], etree._Element):  # audio: the <mpvoice> element itself
            res = js_content[i].xpath('./@voice_encode_fileid')[0]  # relative lookup; '//' here would always hit the first mpvoice on the page
            js_content[i] = 'https://res.wx.qq.com/voice/getvoice?mediaid={}'.format(res)
            vedio_list.append(js_content[i])
        elif 'pic' in js_content[i]:  # image
            images_list.append(js_content[i])
        elif 'v.qq' in js_content[i]:  # video: build the getinfo JSON address that carries the play URL
            vedio_json = 'https://h5vv.video.qq.com/getinfo?callback=txplayerJsonpCallBack_getinfo_24936&otype=json&vid={}'
            url = vedio_json.format(js_content[i].split('vid=')[-1].split('&')[0])
            js_content[i] = url
            vedio_list.append(js_content[i])
        else:
            js_content[i] = '<p>%s</p>' % js_content[i]
    get_video(vedio_list)  # resolve download paths from the audio/video JSON addresses
    print('-' * 30)
    total_data = {
        'title': title,
        'author': author,
        'js_name': js_name,
        'publish_time': publish_time,
        'js_content': js_content,
        'images': images_list,
        'vedios': vedio_list
    }
    # Down(total_data)  # download step


def get_try(data, fangfa):  # wrap xpath + try/except in one place
    try:
        return data.xpath(fangfa)[0].strip()
    except Exception:
        return '暂无'  # "not available"


def get_video(url_list):  # audio downloads directly; video goes through the getinfo JSON first
    print('audio/video path list', url_list)
    for base_url in url_list:
        if 'voice' in base_url:
            pass
            # voice_name = base_url.split('=')[-1][-10:]
            # request.urlretrieve(base_url, './' + voice_name + '.mp3')
            # mp3 downloads fine (not encrypted); move into Down() for unified handling later
        else:
            print('video JSON address', base_url)
            res = request.Request(base_url, headers=base_headers)
            response = request.urlopen(res)
            video_json = re.compile(r'txplayerJsonpCallBack_getinfo_24936\((.*)\)', re.S).search(response.read().decode()).group(1)
            video_data = json.loads(video_json)
            title = jsonpath.jsonpath(video_data, '$..vl.vi..ti')[0]
            vid = jsonpath.jsonpath(video_data, '$..vl.vi..lnk')[0]
            vkey = jsonpath.jsonpath(video_data, '$..vl.vi..fvkey')[0]
            fn = jsonpath.jsonpath(video_data, '$..vl.vi..fn')[0]
            url_list = jsonpath.jsonpath(video_data, '$..vl.vi..ul.ui')[0]
            full_url = 'http://ugcsjy.qq.com/' + vid + '.p712.1.mp4?vkey=' + vkey
            print('download path', full_url)
            try:
                base_headers['Host'] = 'ugcbsy.qq.com'  # note: does not match the ugcsjy host in full_url; the mismatch may itself cause the 403
                v_response = requests.get(full_url, headers=base_headers)
                print(base_headers)
                print(v_response.status_code)
            except Exception as e:
                print('download failed for this path', e)


def Down(data):  # download, then rename; folders should eventually be organised by date
    # checks existence, downloads, fixes the file type
    # video paths are still broken; sort out audio and image names/paths first
    for i in [data['images'], data['vedios']]:
        for img in i:
            img_name = img.split('/')[-2][-10:]  # local file name after download
            down_path = '../download/公众号_图片音视频/{}'.format(img_name)  # download path
            path = os.listdir('../download/公众号_图片音视频/')
            path = ','.join(path)  # join existing names so re.search can match in one pass
            if re.search(img_name + '.*', path):  # match by prefix: the local copy may already have an extension appended
                print('file already exists', '-', img_name)
            else:
                request.urlretrieve(img, down_path)  # download
                end_name = imghdr.what(down_path)  # file extension; imghdr only recognises images and returns None for video
                if end_name:
                    os.rename(down_path, down_path + '.' + end_name)
                print('downloaded', '-', down_path)


def Mydb(data):  # store in the database after downloading
    db = pymysql.connect('127.0.0.1', 'root', '123456', 'PaChong', charset='utf8')
    cursor = db.cursor()
    value = ','.join(['%s'] * len(data))
    sql = 'insert into {}({}) VALUES({})'.format(table, key, value)
    cursor.execute(sql, list(data.values()))
    db.commit()  # without commit the insert is never persisted
    db.close()


def main():  # entry point: run from here
    start = datetime.datetime.now()  # start time
    for info in get_info():  # yields [title, link] with the account token, fakeid, etc. resolved
        get_news(info[-1])   # fetch the article itself
    end = datetime.datetime.now()  # end time
    print('-' * 30)
    print('total time', end - start)
    print('-' * 30)


if __name__ == '__main__':
    # main()
    get_news()  # debugging: parse only the hard-coded test article