爬取国家统计局2020年行政区划数据
使用python获取统计局2020年行政区划数据。
参考:https://blog.csdn.net/qlx119/article/details/105289974
在MySQL中创建tab_citys数据表:
DROP TABLE IF EXISTS `tab_citys`; CREATE TABLE `tab_citys` ( `id` int(11) NOT NULL AUTO_INCREMENT, `parent_id` int(11) DEFAULT NULL, `city_name_zh` varchar(20) NOT NULL, `city_name_en` varchar(20) DEFAULT NULL, `city_level` int(11) NOT NULL, `city_code` char(12) NOT NULL, PRIMARY KEY (`id`) ) ENGINE=InnoDB AUTO_INCREMENT=742037 DEFAULT CHARSET=utf8;
创建xzqh.py的python脚本:
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# Purpose: crawl the 2020 administrative-division code tables
# (province/city/county/town/village) from stats.gov.cn into MySQL.
# Version: v1.2
import importlib
import os
import sys

import lxml.etree as etree
import pymysql
import requests

importlib.reload(sys)


class chinese_city():
    """Crawler for the 2020 administrative-division pages on stats.gov.cn.

    Every parsed row is inserted into the MySQL table ``tab_citys``
    (see the DDL that accompanies this script); the auto-increment id
    returned by the insert is used as the parent id for the next level.
    """

    def __init__(self):
        # Entry page listing all provinces; `base` is the prefix used to
        # resolve the relative province links on that page.
        self.baseUrl = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html'
        self.base = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/'
        self.conn = pymysql.connect(host="localhost", port=3306, user="root",
                                    passwd="root", db="xzqh", charset='utf8')
        self.cur = self.conn.cursor()
        # XPath row selector per administrative level (1=province .. 5=village).
        self.trdic = {
            1: '//tr[@class="provincetr"]',
            2: '//tr[@class="citytr"]',
            3: '//tr[@class="countytr"]',
            4: '//tr[@class="towntr"]',
            5: '//tr[@class="villagetr"]',
        }

    def __del__(self):
        # Best-effort release of cursor and connection.
        if self.cur:
            self.cur.close()
        if self.conn:
            self.conn.close()

    def crawl_page(self, url):
        """Fetch `url` (up to 3 attempts) and return its text decoded as GBK.

        Returns None when every attempt fails, so callers must check
        before parsing.
        """
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        }
        for _ in range(3):
            try:
                html = requests.get(url, headers=headers, timeout=20)
                html.encoding = 'gbk'  # the site serves GBK-encoded pages
                return html.text
            except requests.exceptions.RequestException:
                print('超时' + url)
        return None

    def parseProvince(self):
        """Parse the province index page; insert each province, return the rows."""
        html = self.crawl_page(self.baseUrl)
        if html is None:  # fetch failed after retries — nothing to parse
            return []
        tree = etree.HTML(html, parser=etree.HTMLParser(encoding='gbk'))
        values = []
        for node in tree.xpath('//tr[@class="provincetr"]'):
            for item in node.xpath('./td'):
                nexturl = item.xpath('./a/@href')
                province = item.xpath('./a/text()')
                value = {
                    'url': self.base + "".join(nexturl),
                    'name': "".join(province),
                    'code': 0,   # the index page shows no code for provinces
                    'pid': 0,    # provinces are top-level
                    'level': 1,
                }
                # Use the DB auto-increment id as this row's id so children
                # can reference it as their pid.
                value['id'] = self.insert_to_db(value)
                values.append(value)
                print(value)
        return values

    def parse(self, trid, pid, url):
        """Parse one level-`trid` page (2=city, 3=county, 4=town) under parent `pid`.

        Returns the list of inserted rows, or None when `url` is empty or
        the page could not be fetched.
        """
        if url.strip() == '':
            return None
        html = self.crawl_page(url)
        if html is None:  # fetch failed after retries
            return None
        tree = etree.HTML(html, parser=etree.HTMLParser(encoding='gbk'))

        nodes = tree.xpath(self.trdic.get(trid))
        # Some county-level cities have no county layer and list towns directly.
        if trid == 3 and len(nodes) == 0:
            nodes = tree.xpath(self.trdic.get(4))
            print('有镇的市:' + url)

        # Relative links on the page resolve against the page's directory.
        base_url = url.replace(os.path.basename(url), '')
        values = []
        for node in nodes:
            nexturl = node.xpath('./td[1]/a/@href')
            # Leaf rows (no deeper page) carry code/name as plain text, not links.
            code = node.xpath('./td[1]/a/text()') or node.xpath('./td[1]/text()')
            name = node.xpath('./td[2]/a/text()') or node.xpath('./td[2]/text()')
            value = {
                'code': "".join(code),
                'url': base_url + "".join(nexturl) if nexturl else '',
                'name': "".join(name),
                'pid': pid,
                'level': trid,
            }
            value['id'] = self.insert_to_db(value)
            values.append(value)
            print(value)
        return values

    def parseVillager(self, trid, pid, url):
        """Parse a village (level 5) page; insert each row and return the rows."""
        if url.strip() == '':  # towns without a deeper page (guard missing before)
            return None
        html = self.crawl_page(url)
        if html is None:  # fetch failed after retries
            return None
        tree = etree.HTML(html, parser=etree.HTMLParser(encoding='gbk'))
        values = []
        for node in tree.xpath(self.trdic.get(trid)):
            code = node.xpath('./td[1]/text()')
            # td[2] is the urban/rural classification code; the name is in td[3].
            name = node.xpath('./td[3]/text()')
            value = {
                'code': "".join(code),
                'url': "".join(node.xpath('./td[1]/a/@href')),
                'name': "".join(name),
                'pid': pid,
                'level': trid,
            }
            value['id'] = self.insert_to_db(value)
            # Fixed: the original appended each row to `values` twice.
            values.append(value)
            print(value)
        return values

    def insert_to_db(self, taobao):
        """Insert one division row into tab_citys; return its auto id (0 on error)."""
        lastid = 0
        try:
            sql = 'INSERT INTO tab_citys values(%s,%s,%s,%s,%s, %s)'
            param = (0, taobao.get("pid"), taobao.get("name"), '',
                     taobao.get("level"), taobao.get("code"))
            self.cur.execute(sql, param)
            lastid = self.cur.lastrowid
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return lastid

    def parseChineseCity(self):
        """Crawl the full hierarchy: province -> city -> county -> town -> village."""
        for province in self.parseProvince():
            citys = self.parse(2, province['id'], province['url'])
            if citys is None:
                continue
            for city in citys:
                countys = self.parse(3, city['id'], city['url'])
                # The levels below fetch town and village data; remove this
                # section if only province/city/county data is needed.
                if countys is None:
                    continue
                for county in countys:
                    towns = self.parse(4, county['id'], county['url'])
                    if towns is None:
                        continue
                    for town in towns:
                        self.parseVillager(5, town['id'], town['url'])


if __name__ == '__main__':
    chinese_city = chinese_city()
    chinese_city.parseChineseCity()
如果提示缺少相应的库,可以使用pip进行安装:
pip install pymysql
pip install lxml
运行脚本:
python ./xzqh.py
祝您成功!
版权声明:本文为liongis原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。