Scraping Meituan Data with Python and Saving It to Excel
0. The program targets the food (美食) listings on Meituan and collects them sorted by rating.
The fields to scrape and save are:
商家名 (merchant name), 类型 (category), 地理位置 (location), 评论人数 (number of reviews), 均价 (average price), 最低价格 (minimum price)
1. First, write the page-fetching function. It uses urllib.request to download the page source; the implementation is as follows:
def getHtml(url):
    # spoof a desktop-browser User-Agent so the request is served normally
    headers = ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11')
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    htmldata = opener.open(url).read()
    htmldata = htmldata.decode('utf-8')
    return htmldata
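As a quick check, the function can be pointed at the city-list page that the later steps use; this is a minimal sketch assuming the getHtml definition above (only urllib.request needs to be imported):

import urllib.request  # used inside getHtml above

# fetch the page that lists all launched cities; the same URL is used in step 2
html = getHtml('http://www.meituan.com/index/changecity/initiative')
print(len(html), html[:100])  # rough sanity check that HTML came back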
2. Parse the page source to get the URLs of the cities where Meituan has launched:

class GetCityUrl(HTMLParser):
    # a city link is an <a> tag carrying the attribute pair ('gaevent', 'changecity/build')
    part = ('gaevent', 'changecity/build')
    urldic = {}

    def handle_starttag(self, tag, attrs):
        if tag == 'a' and (self.part in attrs):
            for att, value in attrs:
                if att == 'href':
                    # map the city URL to its "food, sorted by rating" listing URL
                    self.urldic[value] = value + '/category/meishi/all/rating'

    def getUrl(self):
        return self.urldic
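Fed the city-list page, the parser returns a dictionary that maps each city URL to its "food, sorted by rating" listing URL; a minimal usage sketch assuming getHtml and GetCityUrl as defined above (this mirrors how the appendix wires them together):

par = GetCityUrl()
par.feed(getHtml('http://www.meituan.com/index/changecity/initiative'))
urldic = par.getUrl()
# key: city URL; value: that city's food listing sorted by rating
for city, listing in urldic.items():
    print(city, '->', listing)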
3. Get the pagination URLs:

class GetPages(HTMLParser):
    pagelist = list()
    temphref = str()
    flg = 0
    initurl = str()

    def setInitUrl(self, url):
        self.initurl = url

    def handle_starttag(self, tag, attrs):
        # every <a> whose href contains 'page' is a pagination link relative to the city URL
        if tag == 'a':
            for attr, value in attrs:
                if attr == 'href' and ('page' in value):
                    self.temphref = self.initurl + value
                    if self.temphref not in self.pagelist:
                        self.pagelist.append(self.temphref)

    def getList(self):
        return self.pagelist
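For a single city, the first listing page is fetched and fed to GetPages, which collects every link whose href contains 'page'; a sketch assuming the definitions above, with cityurl standing for one key of the urldic built in step 2:

cityurl = next(iter(urldic))           # one city URL from GetCityUrl
data = getHtml(urldic.get(cityurl))    # that city's first listing page
getpage = GetPages()
getpage.setInitUrl(cityurl)            # pagination hrefs are relative, so prefix the city URL
getpage.feed(data)
pageurllist = getpage.getList()
for pageurl in pageurllist:
    print(pageurl)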
4. Parse the page source and extract the useful information:

class MyHTMLParser(HTMLParser):
    tempstr = str()
    divsum = int()

    def handle_starttag(self, tag, attrs):
        # a div whose class contains 'poi-tile-nodeal' starts a new merchant card
        if tag == 'div':
            for attr, value in attrs:
                if attr == 'class' and value.find('poi-tile-nodeal') != -1:
                    self.tempstr = ''
                    self.divsum = 0

    def handle_data(self, data):
        if not data.isspace():
            data = data.replace('Â·', '·')  # fix a double-encoded middle dot found in the page source
            if data == '¥':
                if '¥' not in self.tempstr:
                    self.tempstr += '无' + '\t'  # no price recorded yet: insert a placeholder field
                self.tempstr += data
            elif data == 'Â¥':  # double-encoded form of the ¥ sign, normalized to '¥'
                if '¥' not in self.tempstr:
                    self.tempstr += '无' + '\t'
                self.tempstr += '¥'
            elif data == '人评价':
                # merge the review count with its '人评价' suffix into a single field
                self.tempstr = self.tempstr[0:-1] + data + '\t'
            elif data == '人均 ':
                self.tempstr += '人均'
            elif data[0] == '起':
                # join '起' onto the preceding minimum-price field
                self.tempstr = self.tempstr[0:-1] + '起'
            else:
                self.tempstr += data + '\t'

    def handle_endtag(self, tag):
        if tag == 'div':
            self.divsum += 1
            if self.divsum == 6:  # the sixth closing div ends one merchant card
                if self.tempstr.find('¥') != -1:
                    if len(re.split(r'\t', self.tempstr)) == 5:
                        # only five fields: the location is missing, insert a placeholder
                        teststr = str()
                        flg = 0
                        for stmp in re.split(r'\t', self.tempstr):
                            if flg == 2:
                                teststr += '无位置信息' + '\t'
                            teststr += stmp + '\t'
                            flg += 1
                        self.tempstr = teststr
                    if len(re.split(r'\t', self.tempstr)) == 6:
                        arraystr.append(self.tempstr)
                self.divsum = 0
                self.tempstr = ''
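Every complete merchant card is appended to the module-level list arraystr as one tab-separated record, so parsing a city boils down to feeding the first page and then each pagination page; a sketch continuing from the one above and assuming the definitions in this post:

arraystr = list()        # module-level list that MyHTMLParser appends to
par = MyHTMLParser()
par.feed(data)                        # the first listing page fetched above
for pageurl in pageurllist:           # then every pagination page of the same city
    par.feed(getHtml(pageurl))
print(len(arraystr), 'records collected')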
5. Store the information in Excel:

def SaveExcel(listdata):
    head = ['商家名', '类型', '地理位置', '评论人数', '均价', '最低价格']
    wbk = xlwt.Workbook()
    sheet1 = wbk.add_sheet("sheet1")
    # write the header row
    ii = 0
    for testhand in head:
        sheet1.write(0, ii, testhand)
        ii += 1
    # write one row per collected record, splitting on the tab separators
    i = 1
    j = 0
    for stt in listdata:
        j = 0
        lis = re.split(r'\t', stt)
        for ls in lis:
            sheet1.write(i, j, ls)
            j = j + 1
        i += 1
    wbk.save('test.xls')
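Once the crawl has filled arraystr, a single call writes the header row plus one row per record; this mirrors the end of the appendix (which saves to e:/test3.xls instead of test.xls):

import xlwt  # used inside SaveExcel above

SaveExcel(arraystr)  # header row + one row per tab-separated record, saved as test.xls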
Below is the data in the resulting Excel file:

Appendix: the complete code:
# encoding: utf-8
'''
Created on 2016-07-22
python version 3.5
@author: baalhuo
'''
from html.parser import HTMLParser
import re
import urllib.request
import xlwt
import time

# holds the collected merchant records
arraystr = list()

# parse the page source and extract the useful information
class MyHTMLParser(HTMLParser):
    tempstr = str()
    divsum = int()

    def handle_starttag(self, tag, attrs):
        # a div whose class contains 'poi-tile-nodeal' starts a new merchant card
        if tag == 'div':
            for attr, value in attrs:
                if attr == 'class' and value.find('poi-tile-nodeal') != -1:
                    self.tempstr = ''
                    self.divsum = 0

    def handle_data(self, data):
        if not data.isspace():
            data = data.replace('Â·', '·')  # fix a double-encoded middle dot found in the page source
            if data == '¥':
                if '¥' not in self.tempstr:
                    self.tempstr += '无' + '\t'  # no price recorded yet: insert a placeholder field
                self.tempstr += data
            elif data == 'Â¥':  # double-encoded form of the ¥ sign, normalized to '¥'
                if '¥' not in self.tempstr:
                    self.tempstr += '无' + '\t'
                self.tempstr += '¥'
            elif data == '人评价':
                # merge the review count with its '人评价' suffix into a single field
                self.tempstr = self.tempstr[0:-1] + data + '\t'
            elif data == '人均 ':
                self.tempstr += '人均'
            elif data[0] == '起':
                # join '起' onto the preceding minimum-price field
                self.tempstr = self.tempstr[0:-1] + '起'
            else:
                self.tempstr += data + '\t'

    def handle_endtag(self, tag):
        if tag == 'div':
            self.divsum += 1
            if self.divsum == 6:  # the sixth closing div ends one merchant card
                if self.tempstr.find('¥') != -1:
                    if len(re.split(r'\t', self.tempstr)) == 5:
                        # only five fields: the location is missing, insert a placeholder
                        teststr = str()
                        flg = 0
                        for stmp in re.split(r'\t', self.tempstr):
                            if flg == 2:
                                teststr += '无位置信息' + '\t'
                            teststr += stmp + '\t'
                            flg += 1
                        self.tempstr = teststr
                    if len(re.split(r'\t', self.tempstr)) == 6:
                        arraystr.append(self.tempstr)
                self.divsum = 0
                self.tempstr = ''

# get the URLs of the cities where Meituan has launched (844 cities/regions at the time of writing)
class GetCityUrl(HTMLParser):
    part = ('gaevent', 'changecity/build')
    urldic = {}

    def handle_starttag(self, tag, attrs):
        if tag == 'a' and (self.part in attrs):
            for att, value in attrs:
                if att == 'href':
                    # map the city URL to its "food, sorted by rating" listing URL
                    self.urldic[value] = value + '/category/meishi/all/rating'

    def getUrl(self):
        return self.urldic

# get the pagination URLs
class GetPages(HTMLParser):
    pagelist = list()
    temphref = str()
    flg = 0
    initurl = str()

    def setInitUrl(self, url):
        self.initurl = url

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for attr, value in attrs:
                if attr == 'href' and ('page' in value):
                    self.temphref = self.initurl + value
                    if self.temphref not in self.pagelist:
                        self.pagelist.append(self.temphref)

    def getList(self):
        return self.pagelist

# fetch the page source
def getHtml(url):
    headers = ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11')
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    htmldata = opener.open(url).read()
    htmldata = htmldata.decode('utf-8')
    return htmldata

# save the collected records to Excel
def SaveExcel(listdata):
    head = ['商家名', '类型', '地理位置', '评论人数', '均价', '最低价格']
    wbk = xlwt.Workbook()
    sheet1 = wbk.add_sheet("sheet1")
    ii = 0
    for testhand in head:
        sheet1.write(0, ii, testhand)
        ii += 1
    i = 1
    j = 0
    for stt in listdata:
        j = 0
        lis = re.split(r'\t', stt)
        for ls in lis:
            sheet1.write(i, j, ls)
            j = j + 1
        i += 1
    wbk.save('e:/test3.xls')

par = GetCityUrl()
par.feed(getHtml('http://www.meituan.com/index/changecity/initiative'))
urldic = par.getUrl()

par = MyHTMLParser()
print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time())))
ffwait = 1
for url in urldic:
    data = getHtml(urldic.get(url))
    getpage = GetPages()
    getpage.setInitUrl(url)
    getpage.feed(data)
    pageurllist = getpage.getList()
    par.feed(data)
    for urltemp in pageurllist:
        par.feed(getHtml(urltemp))
    arraystr.append('切换地区 ')  # marker record: moving on to the next city
    if ffwait == 4:  # only the first 4 cities are crawled here
        break
    ffwait += 1
SaveExcel(arraystr)
print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time())))
print('Done')
Learned it, and wrote it down to remember it.