The company I work for mainly serves government clients, so our projects constantly need lists and lookups of administrative divisions. Everything I found online was either incomplete or outdated, and a recent project happened to need this data, so I wrote a crawler to pull it down from the National Bureau of Statistics website.
import copy
import json
import multiprocessing
import os
import random
import re
import traceback
from time import sleep

import requests
import pymysql
from bs4 import BeautifulSoup

# Pool of User-Agent strings to rotate between requests.
agent_list = [
    "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; U; Android 4.0.2; en-us; Galaxy Nexus Build/ICL53F) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/601.7.7 (KHTML, like Gecko) Version/9.1.2 Safari/601.7.7",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.105 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36",
]

# Entry page of the NBS division-code site. The definition of baseUrl was
# missing from the original post; the URL pattern below is an assumption --
# point the year segment at whichever release you want to crawl.
baseUrl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/index.html"


def patch_href(base_url: str, href: str):
    # Replace the last path segment of base_url with the relative href.
    return re.sub(r'/[a-z0-9]+\.html$', "/%s" % href, base_url)


def connect():
    return pymysql.connect(host='127.0.0.1', port=3306, user='root',
                           passwd='', db='area', charset='utf8')


def html_root(url):
    # NOTE: the body of this helper was lost in the original post; mapping
    # the URL path onto a local ./html cache directory is an assumption.
    return "./html/%s" % url.split("/", 3)[-1]


def save_html(url, content):
    # Cache the raw page on disk so a failed run can be inspected or replayed.
    path = html_root(url)
    directory = os.path.dirname(path)
    if not os.path.isdir(directory):
        # makedirs instead of mkdir: the cached pages nest several levels deep.
        os.makedirs(directory, 0o755, exist_ok=True)
    try:
        with open(path, 'w+', encoding='gbk') as f:
            f.write(content)
    except BaseException as e:
        print("write html to file failed (%s) (%s)" % (repr(e), url))


def get_free_proxy(proxies: list, num):
    # NOTE: the proxy-pool API URL was blanked out in the original post; it is
    # expected to return JSON like {"data": [<proxy address>, ...]}.
    url = ''
    result = requests.get(url, headers={'User-Agent': get_agent()})
    try:
        data = json.loads(result.text)
        for i in data["data"]:
            # The statement storing each fetched proxy was lost in the
            # original formatting; appending it verbatim is an assumption.
            proxies.append(i)
        if len(proxies) > 0:
            return True
    except BaseException as e:
        print(repr(e))
    return False


def flush_http_proxy():
    while not get_free_proxy(Area.proxies, 8):
        sleep(2)


def get_http_proxy():
    if len(Area.proxies) > 0:
        return Area.proxies[0]
    else:
        flush_http_proxy()
        return get_http_proxy()


def get_agent():
    return random.choice(agent_list)


def fails():
    # After several consecutive failures discard the current proxy;
    # otherwise back off a little longer before each retry.
    if Area.fails > 3:
        Area.fails = 0
        if Area.proxies:
            Area.proxies.pop(0)
    else:
        sleep(Area.fails * 1)
        Area.fails += 1


def success():
    if Area.fails > 1:
        Area.fails -= 1


def create_table():
    cur = connect().cursor()
    cur.execute("DROP TABLE IF EXISTS `%s`" % Area.table_name)
    cur.execute("""
        CREATE TABLE `%s` (
          `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
          `level` int(10) DEFAULT NULL,
          `code` varchar(12) DEFAULT NULL,
          `short_code` varchar(12) DEFAULT NULL,
          `parent` varchar(12) DEFAULT NULL,
          `path` json DEFAULT NULL,
          `name` varchar(32) DEFAULT NULL,
          `merger_name` varchar(255) DEFAULT NULL,
          PRIMARY KEY (`id`)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
    """ % Area.table_name)
    cur.connection.commit()


class Area(object):
    table_name = "cn_area"
    fails = 0
    proxies = []
    # Marker text used to verify that a page was fetched completely.
    verify_string = "统计用区划代码"
    connect = connect()

    def __init__(self, short_code: str, path: [str], level: int,
                 merger_name: str, name: str, href: str):
        self.short_code: str = short_code
        self.href: str = href
        # Pad the short code out to the full 12-digit division code.
        self.code: str = short_code + "000000000000"[len(short_code):]
        if len(path) > 0:
            self.parent: str = path[-1]
        else:
            self.parent: str = "0"
        self.path: [] = path
        path.append(self.code)
        self.level: int = level
        self.name: str = name
        if merger_name != "":
            self.merger_name: str = "%s,%s" % (merger_name, name)
        else:
            self.merger_name: str = name

    def save(self):
        cur = Area.connect.cursor()
        sql = ('INSERT INTO `%s` (`short_code`,`code`,`level`,`parent`,`name`,`merger_name`,`path`) '
               'VALUES("%s","%s",%s,"%s","%s","%s",\'%s\')' % (
                   Area.table_name, self.short_code, self.code, self.level,
                   self.parent, self.name, self.merger_name,
                   json.dumps(self.path)))
        cur.execute(sql)
        cur.connection.commit()

    def pull(self):
        self.view()
        self.save()
        if self.href != "":
            content = html_get(self.href)
            # Retry (rotating proxies) until the page passes verification.
            while (not isinstance(content, str)) or content.find(Area.verify_string) < 0:
                fails()
                content = html_get(self.href)
            success()
            save_html(self.href, content)
            soup = BeautifulSoup(content, 'lxml')
            data = soup.select("tr.citytr, tr.countytr, tr.towntr, tr.villagetr")
            for area in data:
                td = area.find_all("td")
                full_code: str = td[0].get_text()
                level = self.level + 1
                # Levels 1-3 use 2 digits each; levels 4-5 use 3 digits each.
                if level > 3:
                    code: str = full_code[:3 * 2 + (level - 3) * 3]
                else:
                    code: str = full_code[:level * 2]
                if td[0].a is not None and td[0].a["href"] is not None:
                    href = patch_href(self.href, td[0].a["href"])
                else:
                    href = ""
                if len(td) == 2:    # city/county/town rows: code, name
                    name: str = td[1].get_text()
                elif len(td) == 3:  # village rows: code, urban-rural class, name
                    name: str = td[2].get_text()
                else:
                    raise BaseException("unexpected html content")
                this = Area(short_code=code, path=copy.deepcopy(self.path),
                            level=level, merger_name=self.merger_name,
                            name=name, href=href)
                this.pull()

    def view(self):
        print(self.__dict__)


def html_get(url):
    try:
        print("%s => %s" % ("pull", url))
        body = requests.get(url, timeout=3,
                            headers={'User-Agent': get_agent()},
                            proxies={'http': get_http_proxy()})
        body.encoding = 'GBK'
        print(body.text)
    except BaseException as e:
        print(repr(e))
        fails()
    else:
        return body.text


# Entry point for each worker process.
def fetch(province: dict):
    Area.connect = connect()  # each process needs its own DB connection
    area = Area(
        short_code=province["short_code"],
        path=province["path"],
        level=province["level"],
        merger_name=province["merger_name"],
        name=province["name"],
        href=province["href"],
    )
    try:
        area.pull()
    except BaseException as e:
        print(repr(e))
        traceback.print_exc(file=open("error.log", 'a+'))


def distribute(url: str):
    content = html_get(url)
    while (not isinstance(content, str)) or content.find(Area.verify_string) < 0:
        fails()
        content = html_get(url)
    success()
    save_html(url, content)
    soup = BeautifulSoup(content, 'lxml')
    data = soup.select("tr.provincetr > td")
    pool = multiprocessing.Pool(64)
    for row in data:
        if row.get_text():
            if row.a is not None and row.a["href"] is not None:
                href = patch_href(url, row.a["href"])
            else:
                href = ""
            area = {
                "short_code": re.split('[./]', href)[-2],
                "path": copy.deepcopy([]),
                "level": 1,
                "merger_name": "",
                "name": row.get_text(),
                "href": href,
            }
            pool.apply_async(fetch, (area,))
    pool.close()
    pool.join()


if __name__ == '__main__':
    create_table()
    flush_http_proxy()
    distribute(baseUrl)
Below is the fully pulled database file (excluding Hong Kong, Macao, and Taiwan). With a stable HTTP proxy pool it takes roughly 20 minutes to fetch all 704,750 records. (I have to grumble: even the state's official publication site contains mojibake, which I ended up fixing by hand.)
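Once the data is imported, the `parent` column and the JSON `path` column make lookups cheap in both directions: children by parent code, and the full ancestor chain in a single row. A minimal lookup sketch, assuming the same connection parameters as the script's connect() helper (the division codes are illustrative):

import json
import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       passwd='', db='area', charset='utf8')
cur = conn.cursor()

# Direct children of a division (110000000000 would be Beijing).
cur.execute("SELECT `code`, `name` FROM `cn_area` WHERE `parent` = %s",
            ('110000000000',))
for code, name in cur.fetchall():
    print(code, name)

# Ancestor chain of one area, recovered from its JSON `path` column;
# `merger_name` already holds the human-readable version of the same chain.
cur.execute("SELECT `path`, `merger_name` FROM `cn_area` WHERE `code` = %s",
            ('110101000000',))
row = cur.fetchone()
if row:
    print(json.loads(row[0]), row[1])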