Python 学习(1) 简单的小爬虫
最近抽空学了两天的Python,基础知识都看完了,正好想申请个联通日租卡,就花了2小时写了个小爬虫,爬一下联通日租卡的申请页面,看有没有好记一点的手机号~ 人工挑眼都挑花了。
用的IDE是PyCharm,首先下载一些需要用到的包和模块: requests 和 beautifulsoup4。 不过发现请求的url返回的是json数据,就没用beautifulsoup4而是直接用正则提取手机号了。
注释写得还是很详细的,打分的方法非常简陋,一般这种便宜的套餐也没啥好号。今天太晚了,就这样吧,哪天有空了可以再增加些打分的规则。 代码在下面: 困,睡觉去~
import time
import random
import socket
import re
import http.client
import sys

URL = 'https://m.10010.com/NumApp/NumberCenter/qryNum?callback=jsonp_queryMoreNums&provinceCode=76&cityCode=760&monthFeeLimit=0&groupKey=41242783&searchCategory=3&net=01&amounts=200&codeTypeCode=&searchValue=&qryType=02&goodsNet=4&_=1513948237449'

# An 11-digit number starting with 1. The look-arounds stop the pattern from
# matching inside a longer run of digits (e.g. a 13-digit timestamp).
PHONE_PATTERN = re.compile(r'(?<!\d)1\d{10}(?!\d)')


def get_content(url):
    """Fetch *url* and return the response body as text.

    Network-level failures are retried indefinitely, with a random pause of
    5-19 seconds between attempts. The body is decoded as UTF-8.
    """
    # Imported here so the scoring helpers in this file can be imported
    # without the third-party ``requests`` package being installed.
    import requests

    # Request headers sent with every attempt.
    header = {
        'Accept': 'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'
    }
    # Timeout in seconds, picked once per call (80-179 inclusive).
    timeout = random.randint(80, 179)
    while True:
        try:
            rep = requests.get(url, headers=header, timeout=timeout)
        except requests.RequestException:
            # Only request failures are retried; KeyboardInterrupt and
            # programming errors propagate instead of looping forever.
            time.sleep(random.randint(5, 19))
            continue
        rep.encoding = 'utf-8'
        return rep.text


def grade(phone):
    """Score an 11-digit phone number string by how memorable it looks.

    Tiers, checked from best to worst (``1--`` stands for the first three
    digits, which are ignored):

    * 100 -- ``1--abcdabcd``: digits 4-7 equal the last four digits.
    * 90  -- ``1--abcddcba``: the last four digits mirror digits 4-7.
    * 80  -- last four digits are ``abab``.
    * 70  -- last four digits are ``abba``.
    * 50  -- ``1--ab--ab--``: digits 4-5 equal digits 8-9.
    * 0   -- none of the above.

    More rules can be added here later.
    """
    middle = phone[3:7]
    tail = phone[7:]
    tail_head = phone[7:9]
    tail_end = phone[9:]

    if middle == tail:
        return 100
    # NOTE(review): the original had a 95-point tier here whose condition
    # (phone[3:5] == phone[7:9] and phone[5:7] == phone[9:]) is logically the
    # same as the 100-point check above, so it could never fire. Removed.
    if middle == tail[::-1]:
        # Reverse the 4-digit tail itself; slicing with [7::-1] walks back to
        # index 0 and yields 8 characters, which can never equal `middle`.
        return 90
    if tail_head == tail_end:
        # NOTE(review): the original comment called this "aabb", but the
        # comparison matches "abab" (e.g. 1212).
        return 80
    if tail_head == tail_end[::-1]:
        return 70
    if phone[3:5] == tail_head:
        return 50
    return 0


def save_resule(result):
    """Append every entry of *result* to ``result.txt``, one per line.

    Nothing is written (and the file is not created) when *result* is empty.
    """
    if not result:
        return
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.writelines(x + '\n' for x in result)


def main():
    """Query the number list repeatedly and save the numbers that score > 0.

    The number of queries defaults to 5 and can be overridden by passing a
    single integer command-line argument.
    """
    loop = 5
    if len(sys.argv) == 2:
        loop = int(sys.argv[1])

    result = []   # accepted numbers, in discovery order
    seen = set()  # same numbers, for O(1) duplicate checks

    for _ in range(loop):
        html = get_content(URL)  # JSONP text returned by the endpoint
        for phone in PHONE_PATTERN.findall(html):
            level = grade(phone)
            if level > 0 and phone not in seen:
                seen.add(phone)
                result.append(phone)
                print(level, '-', phone)
        # Pause 1-4 seconds so the server is not hit too frequently.
        time.sleep(random.randint(1, 4))

    save_resule(result)


if __name__ == '__main__':
    main()