最新全国区划代码和城乡划分代码(2020-02-05) 数据

由于现在这个公司主要服务于政府,项目上经常都会用到行政区域的列表和查找,在网上找了很多,都是残缺或者过时的,刚好最近的一个项目也需要,所以就写了一个爬虫。从国家统计局上爬下来。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
import copy
import json
import multiprocessing
import os
import random
import re
import traceback
from time import sleep
 
import requests
import pymysql
from bs4 import BeautifulSoup
 
# Pool of desktop/mobile User-Agent strings; get_agent() picks one at random
# per request so the crawler is harder to fingerprint/rate-limit.
agent_list = [
    "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; U; Android 4.0.2; en-us; Galaxy Nexus Build/ICL53F) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/601.7.7 (KHTML, like Gecko) Version/9.1.2 Safari/601.7.7",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.105 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36",
]
 
def patch_href(base_url: str, href: str) -> str:
    """Replace the trailing '<page>.html' segment of *base_url* with *href*.

    e.g. patch_href(".../2019/index.html", "11.html") -> ".../2019/11.html"
    If *base_url* does not end in a matching page name, it is returned
    unchanged.
    """
    # Raw string: '\.' in a normal string literal is an invalid escape
    # sequence (DeprecationWarning, SyntaxWarning on newer Pythons).
    return re.sub(r'/[a-z0-9]+\.html$', "/%s" % href, base_url)
 
def connect():
    """Open a fresh MySQL connection to the local 'area' database."""
    return pymysql.connect(
        host='127.0.0.1',
        port=3306,
        user='root',
        passwd='',
        db='area',
        charset='utf8',
    )
 
def html_root(url):
    """Map a stats.gov.cn page URL to its local cache path under /data/tjsj."""
    remote_prefix = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019"
    local_prefix = "/data/tjsj"
    return url.replace(remote_prefix, local_prefix)
 
def save_html(url, content):
    """Cache a downloaded page on disk at the path derived by html_root().

    Caching is best-effort: write failures are logged and swallowed so they
    never abort the crawl.
    """
    path = html_root(url)
    directory = os.path.dirname(path)
    # makedirs creates intermediate directories and, with exist_ok, tolerates
    # concurrent creation by other worker processes — os.mkdir raised
    # FileNotFoundError for nested paths and FileExistsError on races.
    os.makedirs(directory, mode=0o755, exist_ok=True)
    try:
        # Pages are served as GBK; keep the same encoding on disk.
        with open(path, 'w+', encoding='gbk') as f:
            f.write(content)
    except Exception as e:
        # Narrowed from BaseException so Ctrl-C is not swallowed here.
        print(" write html to file failed (%s) (%s)" % (repr(e), url))
 
def get_free_proxy(proxies:list,num):
    """Refresh *proxies* in place from a free-proxy JSON API.

    Returns True when the response parsed and proxies were appended,
    False otherwise.
    NOTE(review): `num` is accepted but never used — presumably the intended
    page size / count for the proxy API; confirm against the provider docs.
    """
    # NOTE(review): placeholder — a proxy-service endpoint must be filled in
    # before this can work; requests.get('') raises immediately as written.
    url = ''
    result = requests.get(url,headers={'User-Agent': get_agent()})
    try:
        # Expected payload shape: {"data": [{"IP": ..., "Port": ...}, ...]}
        # (inferred from the field accesses below — verify with the API).
        data = json.loads(result.text)
        for i in data["data"]:
            # Drop the oldest entry before appending, keeping the list short;
            # get_http_proxy() always uses index 0.
            if len(proxies) > 0:
                proxies.pop(0)
            proxies.append("http://%s:%s" % (i["IP"],i["Port"]))
        return True
    except BaseException as e:
        # Any parse/shape error counts as a failed refresh.
        print(repr(e))
        return False
 
def flush_http_proxy():
    """Block until the shared proxy list has been refreshed successfully."""
    refreshed = get_free_proxy(Area.proxies, 8)
    while not refreshed:
        sleep(2)
        refreshed = get_free_proxy(Area.proxies, 8)
 
def get_http_proxy():
    """Return the active proxy, refreshing the shared list first if empty."""
    # Loop instead of recursing: keep refreshing until at least one
    # proxy is available, then hand back the head of the list.
    while not Area.proxies:
        flush_http_proxy()
    return Area.proxies[0]
 
def get_agent():
    """Pick a random User-Agent string for per-request rotation."""
    index = random.randrange(len(agent_list))
    return agent_list[index]
 
def fails():
    """Record a failed request: back off, rotating the proxy after 3 strikes."""
    if Area.fails > 3:
        # Too many consecutive failures — discard the current proxy
        # and restart the streak counter.
        Area.fails = 0
        Area.proxies.pop(0)
    else:
        # Linear back-off proportional to the current failure streak.
        sleep(Area.fails)
    Area.fails += 1
 
def success():
    """Record a successful request by easing off the failure streak."""
    if Area.fails >= 2:
        Area.fails -= 1
 
def create_table():
    """(Re)create the target table, dropping any previous crawl's data.

    The connection is explicitly committed and closed — the original leaked
    an open connection per call.
    """
    conn = connect()
    try:
        with conn.cursor() as cur:
            # table_name is a trusted class constant, so %-interpolating it
            # into DDL is acceptable here (identifiers cannot be bound as
            # pymysql query parameters).
            cur.execute("DROP TABLE IF EXISTS `%s`" % Area.table_name)
            cur.execute("""
    CREATE TABLE `%s` (
      `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
      `level` int(10) DEFAULT NULL,
      `code` varchar(12) DEFAULT NULL,
      `short_code` varchar (12) DEFAULT NULL,
      `parent` varchar(12) DEFAULT NULL,
      `path` json DEFAULT NULL,
      `name` varchar(32) DEFAULT NULL,
      `merger_name` varchar(255) DEFAULT NULL,
      PRIMARY KEY (`id`)
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
    """ % Area.table_name)
        conn.commit()
    finally:
        conn.close()
 
class Area(object):
    """One administrative-region node; pull() recurses into its children.

    Fixes vs. the original: the two `if len(td) == 2# ...` lines were
    missing their colons (the comment was fused onto the condition), which
    made the file a SyntaxError.
    """

    table_name = "cn_area"
    # Consecutive-failure counter shared by the fails()/success() helpers.
    fails = 0
    # Rotating list of HTTP proxy URLs; index 0 is the active proxy.
    proxies = []
    # Marker text used to detect a genuine (non-blocked) stats.gov.cn page.
    verify_string = "统计用区划代码"
    # Shared connection, created at class-definition time; each worker
    # process replaces it in fetch() since connections don't survive fork().
    connect = connect()

    def __init__(self, short_code: str, path: list, level: int, merger_name: str, name: str, href: str):
        self.short_code = short_code
        self.href = href
        # Pad the short code out to the full 12-digit region code.
        self.code = short_code + "000000000000"[len(short_code):]
        # The parent is the last code already on the path ("0" for provinces).
        self.parent = path[-1] if path else "0"
        self.path = path
        # Append our own code so children constructed from a deep copy of
        # self.path inherit the full ancestry including this node.
        path.append(self.code)
        self.level = level
        self.name = name
        # merger_name is the comma-joined ancestry of names, e.g. "河北省,石家庄市".
        if merger_name != "":
            self.merger_name = "%s,%s" % (merger_name, name)
        else:
            self.merger_name = name

    def save(self):
        """Insert this node into MySQL.

        NOTE(review): values are string-interpolated rather than bound as
        parameters; this is only tolerable because the data comes from
        stats.gov.cn, not user input.
        """
        cur = Area.connect.cursor()
        sql = (
            'INSERT INTO `%s` (`short_code`,`code`, `level`,`parent`, `name`, `merger_name`,`path`) VALUES("%s","%s", %s, "%s", "%s","%s",\'%s\')' % (
            Area.table_name,self.short_code, self.code, self.level, self.parent, self.name, self.merger_name,
            json.dumps(self.path)))
        cur.execute(sql)
        cur.connection.commit()

    def pull(self):
        """Persist this node, then fetch its page and recurse into children."""
        self.view()
        self.save()
        if self.href != "":
            content = html_get(self.href)

            # Retry (rotating proxies via fails()) until a real page —
            # one containing the verification marker — comes back.
            while (not isinstance(content, str)) or content.find(Area.verify_string) < 0:
                fails()
                content = html_get(self.href)

            success()
            save_html(self.href, content)
            soup = BeautifulSoup(content, 'lxml')
            # One selector covers every level below province.
            data = soup.select("tr.citytr, tr.countytr, tr.towntr, tr.villagetr")
            for area in data:
                td = area.find_all("td")
                full_code: str = td[0].get_text()
                level = self.level + 1
                # Code prefix lengths: 2/4/6 digits for levels 1-3, then
                # 9 (town) and 12 (village) digits for the deeper levels.
                if level > 3:
                    code = full_code[:3 * 2 + (level - 3) * 3]
                else:
                    code = full_code[:level * 2]
                # Leaf rows have no link; an empty href stops the recursion.
                if td[0].a is not None and td[0].a["href"] is not None:
                    href = patch_href(self.href, td[0].a["href"])
                else:
                    href = ""
                if len(td) == 2:  # city/town rows: [code, name]
                    name = td[1].get_text()
                elif len(td) == 3:  # village rows: [code, urban-rural code, name]
                    name = td[2].get_text()
                else:
                    raise ValueError("unexpect html content")
                this = Area(short_code=code, path=copy.deepcopy(self.path), level=level,
                            merger_name=self.merger_name, name=name, href=href)
                this.pull()

    def view(self):
        """Debug print of this node's fields."""
        print(self.__dict__)
 
def html_get(url):
    """GET *url* through the current proxy and decode it as GBK.

    Returns the page text, or None when the request failed (callers retry
    on a non-str result). On failure the back-off counter is bumped.
    """
    try:
        print("%s => %s" % ("pull", url))
        body = requests.get(url,
            timeout= 3,
            headers={'User-Agent': get_agent()},
            proxies={'http': get_http_proxy()}
        )
        # stats.gov.cn serves GBK but does not always declare it.
        body.encoding = 'GBK'
        # (Removed leftover debug print of the full body — it dumped every
        # page, ~700k of them, to stdout.)
    except Exception as e:
        # Narrowed from BaseException so Ctrl-C still interrupts the crawl.
        print(repr(e))
        fails()
    else:
        return body.text
# Multiprocessing entry point: one worker crawls one province subtree.
def fetch(province: dict):
    """Crawl one province described by the *province* dict.

    Expected keys: short_code, path, level, merger_name, name, href.
    Each worker opens its own MySQL connection — the one created at class
    definition time cannot be shared across fork().
    """
    Area.connect = connect()
    area = Area(
        short_code=province["short_code"],
        path=province["path"],
        level=province["level"],
        merger_name=province["merger_name"],
        name=province["name"],
        href=province["href"]
    )
    try:
        area.pull()
    except BaseException as e:
        print(repr(e))
        # Close the log file deterministically instead of leaking one
        # handle per failed province (the original never closed it).
        with open("error.log", 'a+') as log:
            traceback.print_exc(file=log)
 
 
def distribute(url: str):
    """Fetch the province index page and fan out one worker per province."""
    content = html_get(url)
    # Retry until a genuine page (not a proxy/block page) is returned.
    while (not isinstance(content, str)) or content.find(Area.verify_string) < 0:
        fails()
        content = html_get(url)
    success()
    save_html(url, content)
    soup = BeautifulSoup(content, 'lxml')
    data = soup.select("tr.provincetr > td")
    pool = multiprocessing.Pool(64)
    for row in data:
        # Skip empty cells and provinces without a detail link: with no
        # href there is no code to derive — the original fell through to
        # re.split('', ...)[-2] and raised IndexError on href == "".
        if not row.get_text() or row.a is None or row.a.get("href") is None:
            continue
        href = patch_href(url, row.a["href"])
        area = {
            # e.g. ".../11.html" -> "11" (the 2-digit province code).
            "short_code": re.split('[./]', href)[-2],
            "path": [],
            "level": 1,
            "merger_name": "",
            "name": row.get_text(),
            "href": href,
        }
        pool.apply_async(fetch, (area,))
    pool.close()
    pool.join()
 
if __name__ == '__main__':
    # Entry index of the 2019 national region-code dataset; matches the
    # remote prefix hard-coded in html_root(). The original referenced an
    # undefined name `baseUrl` and crashed with NameError on launch.
    base_url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/index.html"
    create_table()
    flush_http_proxy()
    distribute(base_url)

下面已经拉取完整的数据库文件(不含港澳台),在http代理稳定的情况下大致需要20分钟即可拉取共704750条数据(不得不吐槽一下,国家的发布网站,竟然还有乱码问题,还是我自己手动改的。)

前方是何方。

说起来我的职业生涯可真是忐忑。
出来想写程序,稀里糊涂的做了运维。
又被忽悠进了所谓的大公司修了1年电脑。
出来再入职了,有机会学习C++,学习编程的知识了。又莫名其妙的去维护PHP的代码。
而且这PHP代码还很烂。
再过半年就25周岁,嗯。(=_=)
20岁说的话,就当放屁了吧。
已不再拿年轻当懒惰的借口了。
特么的,不年轻了我就不懒惰似的。

Bash Parameter Expansion

今天重新温故了一遍shell
Bash Reference Manual
在阅读Parameter Expansion部分发现了新大陆.

${parameter:-word}
如果parameter为空或者未定义,值替换为word;否则为parameter的值.

1
2
3
#var1=""  var2="world"
#echo ${var1:-hello} ${var2:-guest}
hello world

${parameter:=word}
如果parameter为空或者未定义,word赋值给parameter;否则为parameter的值.

1
2
3
4
5
#var1=""  var2="world"
#echo ${var1:=hello} ${var2:=guest}
hello world
#echo ${var1} ${var2}
hello world

${parameter:?word}
如果parameter为空或者未定义,word作为标准错误输出,并且如果shell非交互式,退出shell;否则为parameter的值.

1
2
#echo ${var:?这个变量存在吗?}
-bash: var: 这个变量存在吗?

${parameter:+word}
如果parameter为空或者未定义,值为空;否则为word的值.(和${parameter:-word}相反)

1
2
3
#var1=""  var2="world"
#echo ${var1:+hello} ${var2:+guest}
guest

${parameter:offset}
${parameter:offset:length}
这是子字符串的扩展;
如果parameter是一个变量,
值为参数(从左到右)第offset的开始保留length个字符;
如果没有指定length,值为参数(从左到右)从第offset个字符开始直到末尾的所有字符;
如果offset小于0,则从尾部开始(offset为负数时,一定要加空格);
如果length小于0,则表示为从尾部开始的偏移量.
如果parameter是一个数组(带下标@或者*),
数组成员作为一个元素偏移,length必须大于0.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
*strings为变量
#strings=goodluck
#echo ${strings:4:4}
luck
#echo ${strings: -6:2}
od
#echo ${strings: -6:-1}
odluc
*strings为数组,length必须大于0
#strings=(this is a arrays)
#echo ${strings[@]:2:1}
a
#echo ${strings[@]: -1:1}
arrays
#echo ${strings[@]: -1: -1}
-bash:  -1: substring expression < 0

${!prefix*}
${!prefix@}
列出所有开头为prefix的变量名,以IFS的第一个字符作为分隔符.
如果使用”@“,并且在双引号内,则每个变量名为一个单独的单词.

1
2
3
4
5
6
7
8
9
10
#declare websit=blog.weskiller.com wad= wait=3
#echo ${!w@}
wad wait websit
#echo ${!wa*}
wad wait
#for i in "${!wa@}";do echo $i;done
wad
wait
#for i in "${!wa*}";do echo $i;done
wad wait

${!name[@]}
${!name[*]}
如果name是一个数组变量,列出数组下标或者数组键值(如果是键值数组),
如果使用”@“,并且在双引号内,则每个键值或者下标为一个单独的单词.;
如果name是一个变量,则值为0;
如果name不是一个数组或者变量,值为空.

1
2
3
4
5
6
7
8
9
10
11
#declare -A arrays=([a]=1 [b]=2 [c]=3 [d]=4)
#echo ${!arrays[@]}
a b c d
#unset arrays
#declare -a arrays=(1 2 3 4 5)
#echo ${!arrays[*]}
0 1 2 3 4
#unset arrays
#declare arrays=this
#echo ${!arrays[@]}
0

${#parameter}
值为parameter字符的长度;
如果parameter是一个数组(带下标@或者*),则值表示数组包含元素的个数.

1
2
3
4
5
6
#strings=abcd
#echo ${#strings}
4
#strings=(a b c d e)
#echo ${#strings[*]}
5

${parameter#word}
${parameter##word}
“*”表示为通配符,表示任意字符任意长度,
“?”表示为通配符,表示一个任意字符,
“[..]”表示匹配[]内包含的单个字符.
详细资料请移步查看手册Filename-Expansion
word作为文件名扩展模式,从parameter的值头部开始匹配,删除匹配最短的部分(#),或者删除最长匹配的部分(##).
如果parameter是一个数组(带下标@或者*),
则对每个元素进行匹配,结果为删除后的值.

1
2
3
4
5
#strings="hello world"
#echo ${strings#h*l}
lo world
#echo ${strings##h*l}
d


${parameter%word}
${parameter%%word}
${parameter#word},${parameter##word}相同,但是从尾部开始匹配.

1
2
3
4
5
#strings="hello world"
# echo ${strings%l*d}
hello wor
# echo ${strings%%l*d}
he

${parameter/pattern/string}
pattern匹配parameter的值,替换为string,/替换第一次匹配,
如果pattern开头为/,则替换所有匹配;
pattern作为文件名扩展模式.

1
2
3
4
5
#strings=abcdea
#echo ${strings/a/0}
0bcdea
#echo ${strings//a/0}
0bcde0

${parameter^pattern}
${parameter^^pattern}
改变parameter的值,转换小写字母为大写,第一个字符为大写(^),或者全部转换为大写(^^).

1
2
3
4
5
#shell=bash
#echo ${shell^}
Bash
#echo ${shell^^}
BASH

${parameter,pattern}
${parameter,,pattern}
改变parameter的值,转换大写字母为小写,第一个字符为小写(,),或者全部转换为小写(,,).

1
2
3
4
5
#shell=BASH
#echo ${shell,}
bASH
#echo ${shell,,}
bash

陆续更新