一、数据来源
国家统计局2023年统计用区划代码和城乡划分代码数据:https://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2023/index.html
二、区划编码现成文件
1、获取方式:
- csdn:资源绑定
- v:JFAN0329
三、Python 部分代码分析
import time
import requests
from bs4 import BeautifulSoup
import re
import xlsxwriter
# Root of the 2023 division-code pages; hrefs scraped from the index page are
# appended to it to form the province page URLs.
_BASE_URL = 'https://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2023/'

# Browser-like User-Agent sent with every request.
_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
}

# Raw string so the backslashes of the Windows path are never read as escapes.
_DEFAULT_OUTPUT_PATH = r'F:\Study_file\python\getqgqhbm\ssxqybm.xlsx'


def _fetch_soup(url, headers, retry_delay=2, timeout=30):
    """Download *url* and return it parsed with BeautifulSoup.

    The request is repeated every *retry_delay* seconds until the server
    answers with HTTP 200. Network errors and timeouts are treated like a
    non-200 answer and retried as well.

    Args:
        url: Address of the page to download.
        headers: HTTP headers sent with the request.
        retry_delay: Seconds to sleep between two attempts.
        timeout: Per-request timeout in seconds passed to ``requests.get``.

    Returns:
        The parsed page as a ``BeautifulSoup`` object.
    """
    while True:
        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            print(f"request failed for {url}: {exc}")
            resp = None
        if resp is not None and resp.status_code == 200:
            # 'zh-CN' is a language tag, not a codec name; requests silently
            # fell back to UTF-8 for it, so name the codec explicitly.
            resp.encoding = 'utf-8'
            return BeautifulSoup(resp.text, "html.parser")
        time.sleep(retry_delay)


def mainClass(output_path=_DEFAULT_OUTPUT_PATH):
    """Scrape province and city division codes into an xlsx workbook.

    Reads the index page for province links, follows each link to the
    province page, and writes one worksheet row per province and per city:

        column 0: running id (starts at 1)
        column 1: division code
        column 2: name
        column 3: id of the parent row (0 for provinces)
        column 4: level (1 for province rows, 2 for city rows)

    Args:
        output_path: Where the workbook is written. Defaults to the path the
            script originally hard-coded.
    """
    index_page = _fetch_soup(_BASE_URL + 'index.html', _HEADERS)
    # Passing attrs as a dict is equivalent to find_all("tr", class_="provincetr").
    province_rows = index_page.find_all("tr", attrs={"class": "provincetr"})

    row = 0       # zero-based worksheet row to write next
    next_id = 1   # id given to the next written row; always row + 1

    # The context manager closes (and thereby saves) the workbook even when
    # scraping aborts with an exception.
    with xlsxwriter.Workbook(output_path) as workbook:
        worksheet = workbook.add_worksheet()

        for province_row in province_rows:
            for cell in province_row.find_all("td"):
                anchors = cell.find_all("a")
                if not anchors:
                    # Cell without a link: nothing to follow.
                    continue
                province_href = anchors[0].get("href")
                province_name = anchors[0].text
                if not province_href:
                    continue

                # Province-level row; its code column is filled in later
                # from the first city code found on the province page.
                province_id = next_id
                province_row_index = row
                worksheet.write(row, 0, province_id)
                worksheet.write(row, 2, province_name)
                worksheet.write(row, 3, 0)
                worksheet.write(row, 4, 1)
                next_id += 1
                row += 1
                print(f"一级url:{province_href}")
                print(f"一级名称:{province_name}")

                # Retries now target the province page itself; the original
                # retry loop re-requested the index page by mistake.
                province_page = _fetch_soup(_BASE_URL + province_href, _HEADERS)
                province_code_written = False

                for table in province_page.find_all("table", attrs={"class": "citytable"}):
                    for city_row in table.find_all("tr", attrs={"class": "citytr"}):
                        links = city_row.find_all("a")
                        if len(links) < 2:
                            # Expect one link for the code and one for the name.
                            continue
                        city_href = links[0].get("href")
                        city_code = links[0].text
                        city_name = links[1].text
                        if city_href != '' and city_name != '':
                            worksheet.write(row, 0, next_id)
                            worksheet.write(row, 1, city_code)
                            worksheet.write(row, 2, city_name)
                            worksheet.write(row, 3, province_id)
                            worksheet.write(row, 4, 2)
                            next_id += 1
                            row += 1
                            print(f"二级url:{city_href}")
                            print(f"二级编码:{city_code}")
                            print(f"二级名称:{city_name}")

                            if not province_code_written:
                                # Province code = first two characters of a
                                # city code followed by zeros.
                                # NOTE(review): this yields 11 characters;
                                # confirm the width matches the city codes.
                                province_code = city_code[0:2] + "000000000"
                                worksheet.write(province_row_index, 1, province_code)
                                province_code_written = True
# Start the scraper only when this file is run as a script, not when imported.
if __name__ == "__main__":
    mainClass()