一、网址:
全国行政区划信息查询平台(http://xzqh.mca.gov.cn/map)
二、分析并搭建框架
检查网页源码:
检查网页源码可以发现:所有省级信息都放在 JavaScript 中的一段 JSON 里,页面加载时会读取这段 JSON 数据,并填充到页面的 option 中。
1、第一步:使用正则表达式抓取 JSON 数据并解析,组成一个 province 字典(区划代码 → 省名):
# Build the province lookup table from the JSON embedded in the page.
def get_province(self):
    """Return a dict mapping division code -> province name.

    The page assigns its province data to a JavaScript variable
    (``var json = [...];``) inside a ``<script>`` tag.  This method finds
    that tag in ``self.soup``, extracts the text between ``var json =``
    and the first ``;``, and parses it as JSON.  Each entry is expected to
    carry the keys ``'quHuaDaiMa'`` (division code) and ``'shengji'``
    (province name).

    Raises:
        ValueError: if no script tag containing the assignment is found.
        json.JSONDecodeError: if the captured text is not valid JSON.
        KeyError: if an entry lacks one of the expected keys.
    """
    pattern = re.compile(r"var json =(.*?);", re.MULTILINE | re.DOTALL)
    # ``string=`` is the current name of the filter formerly spelled ``text=``.
    script = self.soup.find("script", string=pattern)
    if script is None:
        raise ValueError("no <script> tag containing 'var json = ...;' was found")
    match = pattern.search(script.text)
    if match is None:
        raise ValueError("the 'var json = ...;' assignment could not be extracted")
    return {
        entry['quHuaDaiMa']: entry['shengji']
        for entry in json.loads(match.group(1))
    }
2、第二步:检查该网站实现级联查询的方式,找出查询市区的方式
根据这段源码可看出,在选择省级之后,网页会调用 selectJson 接口发起一个 POST 请求,上图可以看到请求的 body 和 header 等信息。
代码:
# Fetch the prefecture-level cities of one province.
def get_city(self, shengji, timeout=10):
    """Return a dict mapping division code -> city name for one province.

    Sends ``shengji=<province name>`` as a form-encoded POST body to the
    site's ``selectJson`` endpoint and parses the JSON response.  Each
    entry is expected to carry the keys ``'quHuaDaiMa'`` and ``'diji'``.

    Args:
        shengji: province name exactly as returned by ``get_province``.
        timeout: seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole crawl.

    Raises:
        requests.RequestException: on timeout, connection failure, or a
            non-2xx HTTP status.
    """
    # The body is sent as raw UTF-8 bytes (not percent-encoded), matching
    # the charset declared in the Content-Type header below.
    body = ("shengji=" + shengji).encode('UTF-8')
    headers = {
        'Content-Type': "application/x-www-form-urlencoded; charset=utf-8",
        'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, "
                      "like Gecko) Chrome/77.0.3865.120 Safari/537.36",
    }
    response = requests.post('http://xzqh.mca.gov.cn/selectJson', data=body,
                             headers=headers, timeout=timeout)
    # Fail on an HTTP error instead of trying to JSON-decode an error page.
    response.raise_for_status()
    return {
        entry['quHuaDaiMa']: entry['diji']
        for entry in json.loads(response.content)
    }
# Fetch the county-level districts of one city.
def get_area(self, shengji, diji, timeout=10):
    """Return a dict mapping division code -> district name for one city.

    Sends ``shengji=<province>&diji=<city>`` as a form-encoded POST body to
    the site's ``selectJson`` endpoint and parses the JSON response.  Each
    entry is expected to carry the keys ``'quHuaDaiMa'`` and ``'xianji'``.

    Args:
        shengji: province name exactly as returned by ``get_province``.
        diji: city name exactly as returned by ``get_city``.
        timeout: seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole crawl.

    Raises:
        requests.RequestException: on timeout, connection failure, or a
            non-2xx HTTP status.
    """
    # The body is sent as raw UTF-8 bytes (not percent-encoded), matching
    # the charset declared in the Content-Type header below.
    body = ("shengji=" + shengji + "&diji=" + diji).encode('UTF-8')
    headers = {
        'Content-Type': "application/x-www-form-urlencoded; charset=utf-8",
        'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, "
                      "like Gecko) Chrome/77.0.3865.120 Safari/537.36",
    }
    response = requests.post('http://xzqh.mca.gov.cn/selectJson', data=body,
                             headers=headers, timeout=timeout)
    # Fail on an HTTP error instead of trying to JSON-decode an error page.
    response.raise_for_status()
    return {
        entry['quHuaDaiMa']: entry['xianji']
        for entry in json.loads(response.content)
    }
3、第三步:main函数(遍历所有省市区+数据入库)
数据库表结构如下:
三、全部代码:
import requests
from bs4 import BeautifulSoup
import pymysql
import re
import json
class allAreaDataNew(object):
    """Scrape China's province/city/district list and load it into MySQL.

    Instantiating the class runs the whole job: connect to MySQL, crawl
    every province -> city -> district from xzqh.mca.gov.cn, then replace
    the contents of the ``area_config`` table in a single transaction.

    Each inserted row is ``(null, name, level, code, parent_code)`` where
    ``level`` is one of ``'PROVINCE'``, ``'CITY'`` or ``'DISTRICT'``
    (column names are not visible here -- presumably an auto-increment id
    followed by those four fields; verify against the table definition).
    """

    base_url = 'http://xzqh.mca.gov.cn/map'
    select_url = 'http://xzqh.mca.gov.cn/selectJson'
    # Seconds to wait for any HTTP response before giving up.
    request_timeout = 10
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'}
    # NOTE: the landing page is downloaded once, when the class body is
    # executed (i.e. at import time), and shared by all instances.
    wb_data = requests.get(base_url, headers=headers, timeout=request_timeout)
    wb_data.encoding = 'GBK'
    soup = BeautifulSoup(wb_data.text, 'lxml')

    # Pseudo-cities that group counties administered directly by the
    # province / autonomous region; they have no prefecture of their own.
    DIRECT_COUNTY_UNITS = ('省直辖县级行政单位', '自治区直辖县级行政单位')

    # Parameterized insert: values are bound by the driver, never spliced
    # into the SQL text.
    INSERT_SQL = "insert into area_config values (null,%s,%s,%s,%s)"

    def __init__(self):
        # Fill in your own database settings.  Keyword arguments are used
        # because PyMySQL 1.0+ no longer accepts positional ones.
        self.db = pymysql.connect(host="***", user="***", password="***",
                                  database="***", charset="utf8mb4")  # MySQL database
        try:
            self.main()
        finally:
            # Release the connection even if the crawl or the load fails.
            self.db.close()

    # Entry point
    def main(self):
        """Crawl every province, city and district, then load them into MySQL."""
        rows = []
        province_dict = self.get_province()
        for province_code, province in province_dict.items():
            city_dict = self.get_city(province)
            rows.append((province, 'PROVINCE', province_code, 0))
            print(
                province_code + "----------------------------------省------------------------------------------" + province + "\n")
            for city_code, city in city_dict.items():
                area_dict = self.get_area(province, city)
                print(city_code + "*******************市****************" + city + "\n")
                # Directly-administered county groups are stored under the
                # province code, and their districts hang off the province
                # rather than off a city.
                if city in self.DIRECT_COUNTY_UNITS:
                    effective_city_code = province_code
                else:
                    effective_city_code = city_code
                rows.append((city, 'CITY', effective_city_code, province_code))
                for area_code, area in area_dict.items():
                    print(area_code + "-区-" + area + "\n")
                    rows.append((area, 'DISTRICT', area_code, effective_city_code))
        # Drop exact duplicates while keeping the crawl order, so rows are
        # inserted province -> city -> district deterministically.
        sql_list = [(self.INSERT_SQL, row) for row in dict.fromkeys(rows)]
        print(str(sql_list))
        # Load everything in one transaction
        empty_sql = "delete from area_config"
        self.connect_mysql(empty_sql, sql_list)

    # Provinces
    def get_province(self):
        """Return a dict mapping division code -> province name.

        The data is read from the ``var json = [...];`` assignment inside
        a ``<script>`` tag of the already-downloaded page (``self.soup``).

        Raises:
            ValueError: if no script tag containing the assignment is found.
        """
        pattern = re.compile(r"var json =(.*?);", re.MULTILINE | re.DOTALL)
        # ``string=`` is the current name of the filter formerly spelled ``text=``.
        script = self.soup.find("script", string=pattern)
        if script is None:
            raise ValueError("no <script> tag containing 'var json = ...;' was found")
        match = pattern.search(script.text)
        if match is None:
            raise ValueError("the 'var json = ...;' assignment could not be extracted")
        return {
            entry['quHuaDaiMa']: entry['shengji']
            for entry in json.loads(match.group(1))
        }

    def _select_json(self, form_body):
        """POST *form_body* to the selectJson endpoint and return the parsed JSON."""
        request_headers = dict(self.headers)
        request_headers['Content-Type'] = "application/x-www-form-urlencoded; charset=utf-8"
        # The body is sent as raw UTF-8 bytes (not percent-encoded),
        # matching the charset declared in the Content-Type header.
        response = requests.post(self.select_url, data=form_body.encode('UTF-8'),
                                 headers=request_headers, timeout=self.request_timeout)
        # Fail on an HTTP error instead of JSON-decoding an error page.
        response.raise_for_status()
        return json.loads(response.content)

    # Cities
    def get_city(self, shengji):
        """Return a dict mapping division code -> city name for province *shengji*."""
        return {
            entry['quHuaDaiMa']: entry['diji']
            for entry in self._select_json("shengji=" + shengji)
        }

    # Districts
    def get_area(self, shengji, diji):
        """Return a dict mapping division code -> district name for city *diji* of province *shengji*."""
        return {
            entry['quHuaDaiMa']: entry['xianji']
            for entry in self._select_json("shengji=" + shengji + "&diji=" + diji)
        }

    def connect_mysql(self, empty_sql, sql_list):
        """Run *empty_sql* followed by every statement in *sql_list* as one transaction.

        Args:
            empty_sql: statement executed first (used to clear the table).
            sql_list: iterable of statements; each item is either a plain
                SQL string or a ``(sql, params)`` pair whose parameters
                are bound by the driver.

        On any failure the error is printed and the transaction is rolled
        back; the exception is not re-raised.
        """
        cursor = self.db.cursor()
        try:
            cursor.execute(empty_sql)
            for sql in sql_list:
                if isinstance(sql, str):
                    cursor.execute(sql)
                else:
                    statement, params = sql
                    cursor.execute(statement, params)
            # Commit only when every statement succeeded.
            self.db.commit()
            print('=================================更新所有数据完成!=================================')
        except Exception as e:
            print('=================================更新失败!=================================')
            print(e)
            self.db.rollback()
        finally:
            cursor.close()
if __name__ == '__main__':
    # Instantiating the class runs the whole crawl-and-load job.
    allAreaDataNew()
代码执行成功后就可以查到中国所有省市区啦!
特殊情况:“省直辖县级行政单位”和“自治区直辖县级行政单位”
注意:部分省(自治区)下有特殊的“省直辖县级行政单位”或“自治区直辖县级行政单位”,它们没有对应的地级市;代码中把这一条记录的区划代码记为所属省的代码,并把其下属的县级单位直接挂在省级区划代码之下。