
Latest national administrative division codes and urban-rural classification codes (2020-02-05) data

The company I work for now mainly serves government clients, so our projects frequently need a list of administrative divisions with lookup support. Everything I could find online was either incomplete or outdated, and a recent project happened to need the data as well, so I wrote a crawler to pull it straight from the National Bureau of Statistics website.

import copy
import json
import multiprocessing
import os
import random
import re
import traceback
from time import sleep

import requests
import pymysql
from bs4 import BeautifulSoup

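# A pool of desktop and mobile User-Agent strings; one is picked at random for each request.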
agent_list = [
    "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; U; Android 4.0.2; en-us; Galaxy Nexus Build/ICL53F) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/601.7.7 (KHTML, like Gecko) Version/9.1.2 Safari/601.7.7",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.105 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36",
]

def patch_href(base_url: str, href: str):
    # Replace the last path segment of base_url with the relative href found on the page.
    return re.sub(r'/[a-z0-9]+\.html$', "/%s" % href, base_url)

def connect():
    return pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='', db='area', charset='utf8')

def html_root(url):
    # Map a page URL to its local cache path under /data/tjsj.
    return url.replace("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019", "/data/tjsj")

def save_html(url, content):
    # Cache the raw page locally so it can be inspected or re-parsed later.
    path = html_root(url)
    directory = os.path.dirname(path)
    if not os.path.isdir(directory):
        # makedirs rather than mkdir: intermediate directories may not exist yet,
        # and exist_ok avoids races between the worker processes.
        os.makedirs(directory, 0o755, exist_ok=True)
    try:
        with open(path,'w+',encoding='gbk') as f:
            f.write(content)
    except BaseException as e:
        print(" write html to file failed (%s) (%s)" % (repr(e),url))

def get_free_proxy(proxies: list, num):
    url = ''  # proxy-pool API endpoint; expected to return JSON with a "data" list of {"IP", "Port"} entries
    result = requests.get(url, headers={'User-Agent': get_agent()})
    try:
        data = json.loads(result.text)
        for i in data["data"]:
            if len(proxies) > 0:
                proxies.pop(0)
            proxies.append("http://%s:%s" % (i["IP"],i["Port"]))
        return True
    except BaseException as e:
        print(repr(e))
        return False

def flush_http_proxy():
    while not get_free_proxy(Area.proxies,8):
        sleep(2)

def get_http_proxy():
    if len(Area.proxies) > 0:
        return Area.proxies[0]
    else:
        flush_http_proxy()
        return get_http_proxy()

def get_agent():
    return random.choice(agent_list)

def fails():
    # Back off after a failed request; after more than 3 failures in a row,
    # drop the current proxy and start over.
    if Area.fails > 3:
        Area.fails = 0
        Area.proxies.pop(0)
    else:
        sleep(Area.fails * 1)
    Area.fails += 1

def success():
    # Decay the failure counter after a successful request.
    if Area.fails > 1:
        Area.fails -= 1

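# Drop and recreate the target table (any existing data is discarded).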
def create_table():
    cur = connect().cursor()
    cur.execute("DROP TABLE IF EXISTS `%s`" % Area.table_name)
    cur.execute("""
    CREATE TABLE `%s` (
      `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
      `level` int(10) DEFAULT NULL,
      `code` varchar(12) DEFAULT NULL,
      `short_code` varchar (12) DEFAULT NULL,
      `parent` varchar(12) DEFAULT NULL,
      `path` json DEFAULT NULL,
      `name` varchar(32) DEFAULT NULL,
      `merger_name` varchar(255) DEFAULT NULL,
      PRIMARY KEY (`id`)
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
    """ % Area.table_name)
    cur.connection.commit()

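# One node of the division tree (province, city, county, town or village), holding its
# 12-digit code, parent code, path from the root and the URL of its detail page.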
class Area(object):
    table_name = "cn_area"
    fails = 0
    proxies = []
    verify_string = "统计用区划代码"
    connect = connect()

    def __init__(self, short_code: str, path: list, level: int, merger_name: str, name: str, href: str):
        self.short_code: str = short_code
        self.href: str = href
        # Pad the short code on the right to the full 12-digit division code.
        self.code: str = short_code + "000000000000"[len(short_code):]
        # The parent is the last code on the path, or "0" for provinces.
        if len(path) > 0:
            self.parent: str = path[-1]
        else:
            self.parent: str = "0"
        self.path: list = path
        path.append(self.code)
        self.level: int = level
        self.name: str = name
        if merger_name != "":
            self.merger_name: str = ("%s,%s" % (merger_name, name))
        else:
            self.merger_name: str = name

    def save(self):
        # Insert this node into the table; values are interpolated directly into the SQL string.
        cur = Area.connect.cursor()
        sql = (
            'INSERT INTO `%s` (`short_code`,`code`, `level`,`parent`, `name`, `merger_name`,`path`) VALUES("%s","%s", %s, "%s", "%s","%s",\'%s\')' % (
            Area.table_name,self.short_code, self.code, self.level, self.parent, self.name, self.merger_name,
            json.dumps(self.path)))
        cur.execute(sql)
        cur.connection.commit()

    def pull(self):
        # Persist this node, then fetch its page and recurse into its children.
        self.view()
        self.save()
        if self.href != "":
            content = html_get(self.href)

            # Retry until the response actually contains the expected marker text;
            # bad proxies tend to return error pages or garbage instead.
            while (not isinstance(content, str)) or content.find(Area.verify_string) < 0:
                fails()
                content = html_get(self.href)

            success()
            save_html(self.href,content)
            soup = BeautifulSoup(content, 'lxml')
            data = soup.select("tr.citytr, tr.countytr, tr.towntr, tr.villagetr")
            for area in data:
                td = area.find_all("td")
                full_code: str = td[0].get_text()
                level = self.level + 1
                # Levels 1-3 (province/city/county) contribute 2 digits each to the code;
                # levels 4-5 (town/village) contribute 3 digits each.
                if level > 3:
                    code: str = full_code[:3 * 2 + (level - 3) * 3]
                else:
                    code: str = full_code[:level * 2]
                if td[0].a is not None and td[0].a["href"] is not None:
                    href = patch_href(self.href, td[0].a["href"])
                else:
                    href = ""
                if len(td) == 2:  # city/county/town rows: code + name
                    name: str = td[1].get_text()
                elif len(td) == 3:  # village rows: code + urban-rural classification code + name
                    name: str = td[2].get_text()
                else:
                    raise BaseException("unexpected html content")
                this = Area(short_code=code,path=copy.deepcopy(self.path),level=level,merger_name=self.merger_name,name=name,href=href)
                this.pull()

    def view(self):
        print(self.__dict__)

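# Fetch a page with a random User-Agent through the current HTTP proxy; the site serves
# GBK-encoded pages. Returns None when the request raises an exception.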
def html_get(url):
    try:
        print("%s => %s" % ("pull", url))
        body = requests.get(url,
            timeout= 3,
            headers={'User-Agent': get_agent()},
            proxies={'http': get_http_proxy()}
        )
        body.encoding = 'GBK'
        print(body.text)
    except BaseException as e:
        print(repr(e))
        fails()
    else:
        return body.text
# Entry point for each worker process.
def fetch(province: dict):
    Area.connect = connect()
    area = Area(
        short_code=province["short_code"],
        path=province["path"],
        level=province["level"],
        merger_name=province["merger_name"],
        name=province["name"],
        href=province["href"]
    )
    try:
        area.pull()
    except BaseException as e:
        print(repr(e))
        traceback.print_exc(file=open("error.log",'a+'))


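# Parse the province index page and dispatch one worker process per province.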
def distribute(url: str):
    content = html_get(url)
    while (not isinstance(content, str)) or content.find(Area.verify_string) < 0:
        fails()
        content = html_get(url)
    success()
    save_html(url,content)
    soup = BeautifulSoup(content, 'lxml')
    data = soup.select("tr.provincetr > td")
    pool = multiprocessing.Pool(64)
    for row in data:
        if row.get_text():
            # Skip cells without a link; every real province cell links to its detail page.
            if row.a is not None:
                href = patch_href(url, row.a["href"])
                area = {
                    "short_code": re.split('[./]', href)[-2],
                    "path": [],
                    "level": 1,
                    "merger_name": "",
                    "name": row.get_text(),
                    "href": href,
                }
                pool.apply_async(fetch, (area,))
    pool.close()
    pool.join()

if __name__ == '__main__':
    create_table()
    flush_http_proxy()
    baseUrl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/index.html"
    distribute(baseUrl)

The complete database file (excluding Hong Kong, Macao and Taiwan) has already been pulled and is available below. With a stable HTTP proxy the whole crawl takes roughly 20 minutes and yields 704,750 records in total. (I can't resist a small complaint: the official national publication site even contains mojibake, which I ended up fixing by hand.)
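For the "list and lookup" use that motivated the crawler, a minimal sketch of querying the resulting cn_area table could look like the following. It assumes the same local MySQL credentials as connect(); the helper name children_of and the example code 440000000000 (Guangdong) are only illustrative.

import pymysql

def children_of(parent_code: str):
    # Look up the direct children of a division by its 12-digit code ("0" lists the provinces).
    conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                           passwd='', db='area', charset='utf8')
    try:
        with conn.cursor() as cur:
            cur.execute(
                "SELECT `code`, `name`, `level` FROM `cn_area` "
                "WHERE `parent` = %s ORDER BY `code`",
                (parent_code,))
            return cur.fetchall()
    finally:
        conn.close()

# Example: list the prefecture-level cities under Guangdong (440000000000).
for code, name, level in children_of("440000000000"):
    print(code, name, level)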