国家统计局行政区划获取及入库ES实践

我们先看下最终效果：
（此处原为最终效果截图，图片未随正文保留）

1. ES索引新建

PUT administrative_division
{
  "mappings": {
    "properties": {
      "province": {
        "type": "keyword"
      },
      "province_code": {
        "type": "keyword"
      },
      "city": {
        "type": "keyword"
      },
      "city_code": {
        "type": "keyword"
      },
      "district": {
        "type": "keyword"
      },
      "district_code": {
        "type": "keyword"
      },
      "town": {
        "type": "keyword"
      },
      "town_code": {
        "type": "keyword"
      },
      "committee": {
        "type": "keyword"
      },
      "committee_code": {
        "type": "keyword"
      },
      "type_code": {
        "type": "keyword"
      }
    }
  },
  "settings": {
    "number_of_replicas": 0,
    "number_of_shards": 1
  }
}

2. 代码编写

此处代码参考了网上一位大神的实现，个人认为较为简洁，所以直接拿来使用，只是把存储部分改成了写入 ES。

from lxml import etree
import requests
import time
import random

"""
国家统计局行政区划获取
"""
from elasticsearch import helpers, Elasticsearch


def init_es_client(es_host):
    """Build an Elasticsearch client that talks to a single host.

    Args:
        es_host: URL of the Elasticsearch node to connect to.

    Returns:
        An ``Elasticsearch`` client created with ``verify_certs=False``.
    """
    return Elasticsearch(hosts=[es_host], verify_certs=False)


# Module-wide Elasticsearch client used by the bulk-indexing code below.
es_client = init_es_client('http://127.0.0.1:9200')

# Buffer of pending bulk actions and running total of actions sent so far.
actions = []
count = 0


def get_html(url, timeout=10):
    """Download *url* and parse the response body into an lxml HTML tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the crawl forever.

    Returns:
        The parsed document root, or ``None`` when the request fails, the
        server answers with an error status, or the body is empty. Callers
        are expected to check for ``None`` and skip the page.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as exc:
        print("request failed", url, exc)
        return None
    # The pages are decoded as UTF-8 regardless of what the headers claim.
    response.encoding = "utf8"
    text = response.text
    if not text.strip():
        # Nothing to parse; report it the same way as a failed download.
        return None
    return etree.HTML(text)


# Crawl the division-code pages level by level (province -> city -> county
# -> town -> village committee) and bulk-index one document per committee.
base_url = "http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2023/"
url = base_url + "index.html"

# Number of buffered actions that triggers a bulk request.
BULK_SIZE = 10


def _linked_rows(html, row_class):
    """Yield ``(code, name, href)`` for each ``<tr class=row_class>`` row.

    Values are read row by row so code, name and link always belong to the
    same table row; rows missing any of the three are skipped.
    """
    for row in html.xpath('//tr[@class="%s"]' % row_class):
        code = row.xpath('td[1]/a/text()')
        name = row.xpath('td[2]/a/text()')
        href = row.xpath('td[1]/a/@href')
        if code and name and href:
            yield code[0], name[0], href[0]


def _village_rows(html):
    """Yield ``(code, type_code, name)`` for each ``villagetr`` row."""
    for row in html.xpath('//tr[@class="villagetr"]'):
        code = row.xpath('td[1]/text()')
        type_code = row.xpath('td[2]/text()')
        name = row.xpath('td[3]/text()')
        if code and type_code and name:
            yield code[0], type_code[0], name[0]


province_html = get_html(url)
if province_html is None:
    raise RuntimeError("province index page could not be loaded: " + url)
# Limit both queries to the province rows so links and names stay aligned.
province_code = province_html.xpath('//tr[@class="provincetr"]/td/a/@href')
province_name = province_html.xpath('//tr[@class="provincetr"]/td/a/text()')
# Maps the two-digit province code (href without ".html") to its name.
province = dict(zip([p.split(".")[0] for p in province_code], province_name))

actions = list()
for p_key, p_name in province.items():
    url_city = base_url + p_key + ".html"
    # Random pause between requests to avoid hammering the server.
    time.sleep(random.randint(0, 3))
    city_html = get_html(url_city)
    if city_html is None:
        print("city_html is None", url_city)
        continue
    for c_code, c_name, c_href in _linked_rows(city_html, "citytr"):
        county_url = base_url + c_href
        time.sleep(random.randint(0, 3))
        county_html = get_html(county_url)
        if county_html is None:
            print("county_html is None", county_url)
            continue
        # NOTE(review): a city page that links straight to town pages (no
        # "countytr" rows) yields nothing here -- confirm whether such
        # pages exist and need separate handling.
        for d_code, d_name, d_href in _linked_rows(county_html, "countytr"):
            # County links are relative to the province directory;
            # base_url already ends with "/", so no extra slash is added.
            town_url = base_url + c_href.split('/')[0] + "/" + d_href
            time.sleep(random.randint(0, 3))
            town_html = get_html(town_url)
            if town_html is None:
                print("town_html is None", town_url)
                continue
            for t_code, t_name, t_href in _linked_rows(town_html, "towntr"):
                # File name without extension is the town code; its first
                # four digits give the province and city directories.
                # split(".") is used because rstrip(".html") strips a
                # character set, not a suffix.
                code_ = t_href.rsplit("/", 1)[-1].split(".")[0]
                village_url = base_url + code_[0:2] + "/" + code_[2:4] + "/" + t_href
                time.sleep(random.randint(0, 3))
                village_html = get_html(village_url)
                if village_html is None:
                    print("village_html is None", village_url)
                    continue
                for v_code, type_code, v_name in _village_rows(village_html):
                    info = {
                        # Name and code were swapped in the earlier version.
                        'province': p_name,
                        # Pad to 12 digits like the other level codes.
                        'province_code': str(p_key).ljust(12, '0'),
                        'city_code': c_code,
                        'city': c_name,
                        'district_code': d_code,
                        'district': d_name,
                        'town_code': t_code,
                        'town': t_name,
                        'type_code': type_code,
                        'committee_code': v_code,
                        'committee': v_name,
                    }
                    actions.append({
                        "_op_type": "index",
                        "_index": "administrative_division",
                        # Committee code doubles as the document id, so a
                        # re-run overwrites instead of duplicating.
                        "_id": v_code,
                        "_source": info,
                    })
                    if len(actions) >= BULK_SIZE:
                        helpers.bulk(es_client, actions)
                        count += len(actions)
                        print(count)
                        actions.clear()
# Flush whatever is left in the buffer after the crawl finishes.
if actions:
    helpers.bulk(es_client, actions)
    count += len(actions)
    print(count)
    actions.clear()