使用Python爬取mobi格式电纸书
最近做了个微信推送kindle电子书的公众号:kindle免费书库
不过目前电子书不算非常多,所以需要使用爬虫来获取足够书籍。
于是,写了以下这个爬虫,来爬取kindle114的电子书。
值得注意的地方:
当爬取数过大时,由于对方开启了防抓取,会返回一个javascript而非原始的html,所以我使用
的PyV8来执行这段js从而拿到真正的地址。
目前存在的问题:
正则式写得还不够好,毕竟是第一次正式写爬虫:)
无法下载需要购买的附件
爬虫为单线程,爬完整个网站速度慢。我有试过转成多进程,但是貌似由于不能同时登录,大多数
爬虫进程都无法正常爬取@@
# -*- coding: utf-8 -*- import urllib2 import re import requests import os import hashlib def fuckJS(js): import PyV8 import re #去掉<script>标签 js=js[31:-9] for st in [\'window\',\'location\',"\'assign\'","\'href\'","\'replace\'"]: equal=re.findall(\'[_A-Za-z0-9 =]+%s;\'%st,js)#找到变量赋值等式 if equal==[]:#有可能没有 continue else: equal=equal[0] var=equal.split(\'=\')[0].strip()#找出变量名 #把等式干掉 js=js.replace(equal,\'\') #把变量替换成它真正的意思 js=js.replace(var,st) #把[\'xx\'] 替换成 .xx js=js.replace("[\'%s\']"%st.strip("\'"),\'.%s\'%st.strip("\'")) #将 window.href= 后的内容踢掉,因为当PyV8只输出最后一个等式的值 if re.findall(\'window\.href=.+\',js)!=[]: js=js.replace(re.findall(\'window\.href=.+\',js)[0],\'\') #删掉location.xxx= js=js.replace(\'location.href=\',\'\').replace(\'location.replace\',\'\').replace(\'location.assign\',\'\') #交给你了-v- ctxt2 = PyV8.JSContext() ctxt2.enter() #print ctxt2.eval(js) trueAddr = ctxt2.eval(js) print trueAddr return trueAddr def downloadMobi(name, url): #去掉windows下不合法的文件名 unlawName = \'<>/\\|:""*?\' for i in unlawName: name = name.replace(i, \'\') #正则表达式写的不够好导致的问题@@ if name.count(\' img src=templateyeei_dream1cssyeeidigest_1.gif class=vm alt= title= \') > 0: name = name.split(\'  \')[0]+\'.mobi\' #避免重复下载 if os.path.exists(\'D:\Kindle114SpiderDownload\\\' + name): print \'already have\', name return url = url.split(\' \')[0] s = requests.session() username = \'你的用户名\' password = \'你的密码\' passwordMd5 = hashlib.md5(password).hexdigest() data = {\'formhash\': \'23cd6c29\', \'referer\': \'\',\'username\': username, \'password\': passwordMd5, \'questionid\':\'0\', \'answer\':\'\'} res=s.post(\'http://www.kindle114.com/member.php?mod=logging&action=login&loginsubmit=yes&loginhash=LYn7n&inajax=1\',data) #res = s.get(\'http://www.kindle114.com/forum.php?mod=attachment&aid=MTQ2NTB8ZjhkNjY3NmF8MTQxNjg5OTYxOXw0NDIxfDczNjI%3D\') try: res = s.get(url, timeout = 200) except: print \'time out for \', name #print \'content[:50]\' #print res.content[:50] if res.content.count(\'<!DOCTYPE html\') > 
0: print \'!!!!!!!!!!!!!!!!!not a mobi, this file need gold coin!!!!!!!!!!!!!!!\' return try: with open(\'D:\\Kindle114SpiderDownload\\\' + name, "wb") as code: code.write(res.content) except: print \'!!!!!!!!!!!!!!!!!!!!!遇到不合法文件名!!!!!!!!!!!!!!!!!!\', name def spiderThread(url, threadName): req = urllib2.urlopen(url, timeout = 10) text = req.read() if text.count(\'<!DOCTYPE html\') == 0: js = text trueURL = \'http://www.kindle114.com/\' + fuckJS(js) print \'trueURL\', trueURL req = urllib2.urlopen(trueURL) text = req.read() #href = \'<a href="(.*?)" onmouseover="showMenu({\\'ctrlid\\':this.id,\\'pos\\':\\'12\\'})" id=.*?target="_blank">(.*?)</a>\' href = \'<a href="(.*?)".*?target="_blank">(.*?)</a>\' href_re = re.compile(href) href_info = href_re.findall(text) bookSum = 0 for i in href_info: if i[1].count(\'.mobi\') > 0: bookSum+=1 if bookSum == 0: print \'!!!bookSum = 0!!!!\', text[:100] if bookSum == 1: print \'only one book in this thread\' bookFileName = threadName + \'.mobi\' for i in href_info: if i[1].count(\'.mobi\') > 0: link = i[0].replace(\'amp;\',\'\') break print link, bookFileName downloadMobi(bookFileName, link) else: print str(bookSum), \'in this thread\' for i in href_info: if i[1].count(\'.mobi\') > 0: link = i[0].replace(\'amp;\',\'\') bookFileName = i[1] print link, bookFileName downloadMobi(bookFileName, link) for pageNum in range(1, 125): url = \'http://www.kindle114.com/forum.php?mod=forumdisplay&fid=2&filter=sortid&sortid=1&searchsort=1&geshi=1&page=\' + str(pageNum) print \'=============url\', url,\'===============\' try: req = urllib2.urlopen(url, timeout = 10) except: print \'page time out\', url text = req.read() href = \'<h4><a href="(.*?)" target="_blank" class="xst">(.*?)<span class="xi1">\' href_re = re.compile(href) href_info = href_re.findall(text) for i in href_info: print i[0], i[1] url = \'http://www.kindle114.com/\'+i[0] threadName = i[1] try: spiderThread(url, threadName) except Exception , e: print \'!!!!!!!!!!!!! 
Error with \',threadName, url,\'!!!!!!!!!!!!!!!!\' print e raw_input(\'finish all!!!\')