爬取国家统计局2020年行政区划数据
使用python获取统计局2020年行政区划数据。
参考:https://blog.csdn.net/qlx119/article/details/105289974
在MySQL中创建tab_citys数据表:
DROP TABLE IF EXISTS `tab_citys`; CREATE TABLE `tab_citys` ( `id` int(11) NOT NULL AUTO_INCREMENT, `parent_id` int(11) DEFAULT NULL, `city_name_zh` varchar(20) NOT NULL, `city_name_en` varchar(20) DEFAULT NULL, `city_level` int(11) NOT NULL, `city_code` char(12) NOT NULL, PRIMARY KEY (`id`) ) ENGINE=InnoDB AUTO_INCREMENT=742037 DEFAULT CHARSET=utf8;
创建xzqh.py的python脚本:
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# Purpose: crawl the 2020 administrative-division code tables
# (province/city/county/town/village) from stats.gov.cn into MySQL.
# Version: v1.2
import importlib
import os
import sys

import lxml.etree as etree
import pymysql
import requests

importlib.reload(sys)


class chinese_city():
    """Crawler for the 2020 administrative-division pages on stats.gov.cn.

    Every parsed row is inserted into the MySQL table ``tab_citys``
    (see the DDL that accompanies this script); the auto-increment id
    returned by the insert is used as the parent id for the next level.
    """

    def __init__(self):
        # Entry page listing all provinces; `base` is the prefix used to
        # resolve the relative province links on that page.
        self.baseUrl = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html'
        self.base = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/'
        self.conn = pymysql.connect(host="localhost", port=3306, user="root",
                                    passwd="root", db="xzqh", charset='utf8')
        self.cur = self.conn.cursor()
        # XPath row selector per administrative level (1=province .. 5=village).
        self.trdic = {
            1: '//tr[@class="provincetr"]',
            2: '//tr[@class="citytr"]',
            3: '//tr[@class="countytr"]',
            4: '//tr[@class="towntr"]',
            5: '//tr[@class="villagetr"]',
        }

    def __del__(self):
        # Best-effort release of cursor and connection.
        if self.cur:
            self.cur.close()
        if self.conn:
            self.conn.close()

    def crawl_page(self, url):
        """Fetch `url` (up to 3 attempts) and return its text decoded as GBK.

        Returns None when every attempt fails, so callers must check
        before parsing.
        """
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        }
        for _ in range(3):
            try:
                html = requests.get(url, headers=headers, timeout=20)
                html.encoding = 'gbk'  # the site serves GBK-encoded pages
                return html.text
            except requests.exceptions.RequestException:
                print('超时' + url)
        return None

    def parseProvince(self):
        """Parse the province index page; insert each province, return the rows."""
        html = self.crawl_page(self.baseUrl)
        if html is None:  # fetch failed after retries — nothing to parse
            return []
        tree = etree.HTML(html, parser=etree.HTMLParser(encoding='gbk'))
        values = []
        for node in tree.xpath('//tr[@class="provincetr"]'):
            for item in node.xpath('./td'):
                nexturl = item.xpath('./a/@href')
                province = item.xpath('./a/text()')
                value = {
                    'url': self.base + "".join(nexturl),
                    'name': "".join(province),
                    'code': 0,   # the index page shows no code for provinces
                    'pid': 0,    # provinces are top-level
                    'level': 1,
                }
                # Use the DB auto-increment id as this row's id so children
                # can reference it as their pid.
                value['id'] = self.insert_to_db(value)
                values.append(value)
                print(value)
        return values

    def parse(self, trid, pid, url):
        """Parse one level-`trid` page (2=city, 3=county, 4=town) under parent `pid`.

        Returns the list of inserted rows, or None when `url` is empty or
        the page could not be fetched.
        """
        if url.strip() == '':
            return None
        html = self.crawl_page(url)
        if html is None:  # fetch failed after retries
            return None
        tree = etree.HTML(html, parser=etree.HTMLParser(encoding='gbk'))

        nodes = tree.xpath(self.trdic.get(trid))
        # Some county-level cities have no county layer and list towns directly.
        if trid == 3 and len(nodes) == 0:
            nodes = tree.xpath(self.trdic.get(4))
            print('有镇的市:' + url)

        # Relative links on the page resolve against the page's directory.
        base_url = url.replace(os.path.basename(url), '')
        values = []
        for node in nodes:
            nexturl = node.xpath('./td[1]/a/@href')
            # Leaf rows (no deeper page) carry code/name as plain text, not links.
            code = node.xpath('./td[1]/a/text()') or node.xpath('./td[1]/text()')
            name = node.xpath('./td[2]/a/text()') or node.xpath('./td[2]/text()')
            value = {
                'code': "".join(code),
                'url': base_url + "".join(nexturl) if nexturl else '',
                'name': "".join(name),
                'pid': pid,
                'level': trid,
            }
            value['id'] = self.insert_to_db(value)
            values.append(value)
            print(value)
        return values

    def parseVillager(self, trid, pid, url):
        """Parse a village (level 5) page; insert each row and return the rows."""
        if url.strip() == '':  # towns without a deeper page (guard missing before)
            return None
        html = self.crawl_page(url)
        if html is None:  # fetch failed after retries
            return None
        tree = etree.HTML(html, parser=etree.HTMLParser(encoding='gbk'))
        values = []
        for node in tree.xpath(self.trdic.get(trid)):
            code = node.xpath('./td[1]/text()')
            # td[2] is the urban/rural classification code; the name is in td[3].
            name = node.xpath('./td[3]/text()')
            value = {
                'code': "".join(code),
                'url': "".join(node.xpath('./td[1]/a/@href')),
                'name': "".join(name),
                'pid': pid,
                'level': trid,
            }
            value['id'] = self.insert_to_db(value)
            # Fixed: the original appended each row to `values` twice.
            values.append(value)
            print(value)
        return values

    def insert_to_db(self, taobao):
        """Insert one division row into tab_citys; return its auto id (0 on error)."""
        lastid = 0
        try:
            sql = 'INSERT INTO tab_citys values(%s,%s,%s,%s,%s, %s)'
            param = (0, taobao.get("pid"), taobao.get("name"), '',
                     taobao.get("level"), taobao.get("code"))
            self.cur.execute(sql, param)
            lastid = self.cur.lastrowid
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return lastid

    def parseChineseCity(self):
        """Crawl the full hierarchy: province -> city -> county -> town -> village."""
        for province in self.parseProvince():
            citys = self.parse(2, province['id'], province['url'])
            if citys is None:
                continue
            for city in citys:
                countys = self.parse(3, city['id'], city['url'])
                # The levels below fetch town and village data; remove this
                # section if only province/city/county data is needed.
                if countys is None:
                    continue
                for county in countys:
                    towns = self.parse(4, county['id'], county['url'])
                    if towns is None:
                        continue
                    for town in towns:
                        self.parseVillager(5, town['id'], town['url'])


if __name__ == '__main__':
    chinese_city = chinese_city()
    chinese_city.parseChineseCity()
如果提示缺少相应的库,可以使用pip进行安装:
pip install pymysql
pip install lxml
运行脚本:
python ./xzqh.py
祝您成功!
版权声明:本文为liongis原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。