SSL: 1
Warning: Cannot modify header information - headers already sent by (output started at /www/blog/wp-includes/load.php:1646) in /www/blog/wp-includes/rest-api/class-wp-rest-server.php on line 1893
Warning: Cannot modify header information - headers already sent by (output started at /www/blog/wp-includes/load.php:1646) in /www/blog/wp-includes/rest-api/class-wp-rest-server.php on line 1893
Warning: Cannot modify header information - headers already sent by (output started at /www/blog/wp-includes/load.php:1646) in /www/blog/wp-includes/rest-api/class-wp-rest-server.php on line 1893
Warning: Cannot modify header information - headers already sent by (output started at /www/blog/wp-includes/load.php:1646) in /www/blog/wp-includes/rest-api/class-wp-rest-server.php on line 1893
Warning: Cannot modify header information - headers already sent by (output started at /www/blog/wp-includes/load.php:1646) in /www/blog/wp-includes/rest-api/class-wp-rest-server.php on line 1893
Warning: Cannot modify header information - headers already sent by (output started at /www/blog/wp-includes/load.php:1646) in /www/blog/wp-includes/rest-api/class-wp-rest-server.php on line 1893
Warning: Cannot modify header information - headers already sent by (output started at /www/blog/wp-includes/load.php:1646) in /www/blog/wp-includes/rest-api/class-wp-rest-server.php on line 1893
Warning: Cannot modify header information - headers already sent by (output started at /www/blog/wp-includes/load.php:1646) in /www/blog/wp-includes/rest-api/class-wp-rest-server.php on line 1893
{"id":683,"date":"2020-02-28T15:57:20","date_gmt":"2020-02-28T07:57:20","guid":{"rendered":"https:\/\/blog.weskiller.com\/?p=683"},"modified":"2020-03-01T03:48:59","modified_gmt":"2020-02-29T19:48:59","slug":"%e6%9c%80%e6%96%b0%e5%85%a8%e5%9b%bd%e5%8c%ba%e5%88%92%e4%bb%a3%e7%a0%81%e5%92%8c%e5%9f%8e%e4%b9%a1%e5%88%92%e5%88%86%e4%bb%a3%e7%a0%812020-02-05-%e6%95%b0%e6%8d%ae","status":"publish","type":"post","link":"https:\/\/blog.gamein.vip\/pull-cn-area.html","title":{"rendered":"\u6700\u65b0\u5168\u56fd\u533a\u5212\u4ee3\u7801\u548c\u57ce\u4e61\u5212\u5206\u4ee3\u7801(2020-02-05) \u6570\u636e"},"content":{"rendered":"\n
\u7531\u4e8e\u73b0\u5728\u8fd9\u4e2a\u516c\u53f8\u4e3b\u8981\u670d\u52a1\u4e8e\u653f\u5e9c\uff0c\u9879\u76ee\u4e0a\u7ecf\u5e38\u90fd\u4f1a\u7528\u5230\u884c\u653f\u533a\u57df\u7684\u5217\u8868\u548c\u67e5\u627e\uff0c\u5728\u7f51\u4e0a\u627e\u4e86\u5f88\u591a\uff0c\u90fd\u662f\u6b8b\u7f3a\u6216\u8005\u8fc7\u65f6\u7684\uff0c\u521a\u597d\u6700\u8fd1\u7684\u4e00\u4e2a\u9879\u76ee\u4e5f\u9700\u8981\uff0c\u6240\u4ee5\u5c31\u5199\u4e86\u4e00\u4e2a\u722c\u866b\u3002\u4ece\u56fd\u5bb6\u7edf\u8ba1\u5c40\u4e0a\u722c\u4e0b\u6765\u3002<\/p>\n\n\n
\nimport copy\nimport json\nimport multiprocessing\nimport os\nimport random\nimport re\nimport traceback\nfrom time import sleep\n\nimport requests\nimport pymysql\nfrom bs4 import BeautifulSoup\n\nagent_list = [\n "Mozilla\/5.0 (Linux; Android 6.0; Nexus 5 Build\/MRA58N) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/75.0.3770.142 Mobile Safari\/537.36",\n "Mozilla\/5.0 (Linux; U; Android 4.0.2; en-us; Galaxy Nexus Build\/ICL53F) AppleWebKit\/534.30 (KHTML, like Gecko) Version\/4.0 Mobile Safari\/534.30",\n "Mozilla\/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit\/601.7.7 (KHTML, like Gecko) Version\/9.1.2 Safari\/601.7.7",\n "Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/60.0.3112.113 Safari\/537.36",\n "Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/74.0.3729.169 Safari\/537.36",\n "Mozilla\/5.0 (Windows NT 10.0; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/51.0.2704.103 Safari\/537.36",\n "Mozilla\/5.0 (Windows NT 10.0; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/52.0.2743.116 Safari\/537.36",\n "Mozilla\/5.0 (Windows NT 10.0; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/74.0.3729.169 Safari\/537.36",\n "Mozilla\/5.0 (Windows NT 5.2) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/58.0.3029.110 Safari\/537.36 SE 2.X MetaSr 1.0",\n "Mozilla\/5.0 (Windows NT 6.1) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/41.0.2228.0 Safari\/537.36",\n "Mozilla\/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/49.0.2623.105 Safari\/537.36",\n "Mozilla\/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/69.0.3497.92 Safari\/537.36",\n "Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/51.0.2704.103 Safari\/537.36",\n "Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/52.0.2743.116 Safari\/537.36",\n "Mozilla\/5.0 (X11; Linux x86_64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/56.0.2924.87 Safari\/537.36",\n "Mozilla\/5.0 (X11; Linux x86_64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/63.0.3239.108 Safari\/537.36",\n]\n\ndef patch_href(base_url: str, href: str):\n return re.sub('\/[a-z0-9]+\\.html$', "\/%s" % href, base_url)\n\ndef connect():\n return pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='', db='area', charset='utf8')\n\ndef html_root(url):\n return url.replace("http:\/\/www.stats.gov.cn\/tjsj\/tjbz\/tjyqhdmhcxhfdm\/2019","\/data\/tjsj")\n\ndef save_html(url,content):\n path = html_root(url)\n directory = os.path.dirname(path)\n if not os.path.isdir(directory):\n os.mkdir(directory, 0o755)\n try:\n with open(path,'w+',encoding='gbk') as f:\n f.write(content)\n except BaseException as e:\n print(" write html to file failed (%s) (%s)" % (repr(e),url))\n\ndef get_free_proxy(proxies:list,num):\n url = ''\n result = requests.get(url,headers={'User-Agent': get_agent()})\n try:\n data = json.loads(result.text)\n for i in data["data"]:\n if len(proxies) > 0:\n proxies.pop(0)\n proxies.append("http:\/\/%s:%s" % (i["IP"],i["Port"]))\n return True\n except BaseException as e:\n print(repr(e))\n return False\n\ndef flush_http_proxy():\n while not get_free_proxy(Area.proxies,8):\n sleep(2)\n\ndef get_http_proxy():\n if len(Area.proxies) > 0:\n return Area.proxies[0]\n else:\n flush_http_proxy()\n return get_http_proxy()\n\ndef get_agent():\n return random.choice(agent_list)\n\ndef fails():\n if Area.fails > 3:\n Area.fails = 0\n Area.proxies.pop(0)\n else:\n sleep(Area.fails * 1)\n Area.fails += 1\n\ndef success():\n if Area.fails > 1:\n Area.fails -= 1\n\ndef create_table():\n cur = connect().cursor()\n cur.execute("DROP TABLE IF EXISTS `%s`" % Area.table_name)\n cur.execute("""\n CREATE TABLE `%s` (\n `id` int(10) unsigned NOT NULL AUTO_INCREMENT,\n `level` int(10) DEFAULT NULL,\n `code` varchar(12) DEFAULT NULL,\n `short_code` varchar (12) DEFAULT NULL,\n `parent` varchar(12) DEFAULT NULL,\n `path` json DEFAULT NULL,\n `name` varchar(32) DEFAULT NULL,\n `merger_name` varchar(255) DEFAULT NULL,\n PRIMARY KEY (`id`)\n ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;\n """ % Area.table_name)\n cur.connection.commit()\n\nclass Area(object):\n table_name = "cn_area"\n fails = 0\n proxies = []\n verify_string = "\u7edf\u8ba1\u7528\u533a\u5212\u4ee3\u7801"\n connect = connect()\n\n def __init__(self, short_code: str, path: [str], level: int, merger_name: str, name: str, href: str):\n self.short_code: str = short_code\n self.href: str = href\n self.code: str = short_code + "000000000000"[len(short_code):]\n if len(path) > 0:\n self.parent: str = path[-1]\n else:\n self.parent: str = "0"\n self.path: [] = path\n path.append(self.code)\n self.level: int = level\n self.name: str = name\n if merger_name != "":\n self.merger_name: str = ("%s,%s" % (merger_name, name))\n else:\n self.merger_name: str = name\n\n def save(self):\n cur = Area.connect.cursor()\n sql = (\n 'INSERT INTO `%s` (`short_code`,`code`, `level`,`parent`, `name`, `merger_name`,`path`) VALUES("%s","%s", %s, "%s", "%s","%s",\\'%s\\')' % (\n Area.table_name,self.short_code, self.code, self.level, self.parent, self.name, self.merger_name,\n json.dumps(self.path)))\n cur.execute(sql)\n cur.connection.commit()\n\n def pull(self):\n self.view()\n self.save()\n if self.href != "":\n content = html_get(self.href)\n\n while (not isinstance(content,str)) or content.find(Area.verify_string) < 0:\n fails()\n content = html_get(self.href)\n\n success()\n save_html(self.href,content)\n soup = BeautifulSoup(content, 'lxml')\n data = soup.select("tr.citytr, tr.countytr, tr.towntr, tr.villagetr")\n for area in data:\n td = area.find_all("td")\n full_code: str = td[0].get_text()\n level = self.level + 1\n if level > 3:\n code: str = full_code[:3 *2 + (level - 3)*3]\n else:\n code: str = full_code[:level * 2]\n if td[0].a is not None and td[0].a["href"] is not None:\n href = patch_href(self.href, td[0].a["href"])\n else:\n href = ""\n if len(td) == 2: # \u57ce\/\u9547\n name: str = td[1].get_text()\n elif len(td) == 3: # \u57ce\/\u4e61\n name: str = td[2].get_text()\n else:\n raise BaseException("unexpect html content")\n this = Area(short_code=code,path=copy.deepcopy(self.path),level=level,merger_name=self.merger_name,name=name,href=href)\n this.pull()\n\n def view(self):\n print(self.__dict__)\n\ndef html_get(url):\n try:\n print("%s => %s" % ("pull", url))\n body = requests.get(url,\n timeout= 3,\n headers={'User-Agent': get_agent()},\n proxies={'http': get_http_proxy()}\n )\n body.encoding = 'GBK'\n print(body.text)\n except BaseException as e:\n print(repr(e))\n fails()\n else:\n return body.text\n#\u591a\u8fdb\u7a0b\u6267\u884c\u5165\u53e3\ndef fetch(province :map):\n Area.connect = connect()\n area = Area(\n short_code=province["short_code"],\n path=province["path"],\n level=province["level"],\n merger_name=province["merger_name"],\n name=province["name"],\n href=province["href"]\n )\n try:\n area.pull()\n except BaseException as e:\n print(repr(e))\n traceback.print_exc(file=open("error.log",'a+'))\n\n\ndef distribute(url: str):\n content = html_get(url)\n while (not isinstance(content, str)) or content.find(Area.verify_string) < 0:\n fails()\n content = html_get(url)\n success()\n save_html(url,content)\n soup = BeautifulSoup(content, 'lxml')\n data = soup.select("tr.provincetr > td")\n pool = multiprocessing.Pool(64)\n for row in data:\n if row.get_text():\n if row.a is not None:\n if row.a is not None and row.a["href"] is not None:\n href = patch_href(url, row.a["href"])\n else:\n href = ""\n area = {\n "short_code":re.split('[.\/]', href)[-2],\n "path":copy.deepcopy([]),\n "level":1,\n "merger_name":"",\n "name":row.get_text(),\n "href":href,\n }\n pool.apply_async(fetch,(area,))\n pool.close()\n pool.join()\n\nif __name__ == '__main__':\n create_table()\n flush_http_proxy()\n baseUrl = "http:\/\/www.stats.gov.cn\/tjsj\/tjbz\/tjyqhdmhcxhfdm\/2019\/index.html"\n distribute(baseUrl)\n<\/pre><\/div>\n\n\n\u4e0b\u9762\u5df2\u7ecf\u62c9\u53d6\u5b8c\u6574\u7684\u6570\u636e\u5e93\u6587\u4ef6\uff08\u4e0d\u542b\u6e2f\u6fb3\u53f0\uff09\uff0c\u5728http\u4ee3\u7406\u7a33\u5b9a\u7684\u60c5\u51b5\u4e0b\u5927\u81f4\u9700\u898120\u5206\u949f\u5373\u53ef\u62c9\u53d6\u5171704750\u6761\u6570\u636e\uff08\u4e0d\u5f97\u5410\u69fd\u4e0b\uff0c\u56fd\u5bb6\u7684\u53d1\u5e03\u7f51\u7ad9\uff0c\u7adf\u7136\u8fd8\u6709\u4e71\u7801\u95ee\u9898\uff0c\u8fd8\u662f\u6211\u81ea\u5df1\u624b\u52a8\u6539\u7684\u3002\uff09<\/p>\n\n\n\n
cn_area_2020.sql<\/a>\u4e0b\u8f7d<\/a><\/div>\n","protected":false},"excerpt":{"rendered":"\u7531\u4e8e\u73b0\u5728\u8fd9\u4e2a\u516c\u53f8\u4e3b\u8981\u670d\u52a1\u4e8e\u653f\u5e9c\uff0c\u9879\u76ee\u4e0a\u7ecf\u5e38\u90fd\u4f1a\u7528\u5230\u884c\u653f\u533a\u57df\u7684\u5217\u8868\u548c\u67e5\u627e\uff0c\u5728\u7f51\u4e0a\u627e\u4e86\u5f88\u591a\uff0c\u90fd\u662f\u6b8b\u7f3a\u6216\u8005\u8fc7\u65f6\u7684\uff0c\u521a […]<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"closed","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[39],"tags":[40,42],"class_list":["post-683","post","type-post","status-publish","format-standard","hentry","category-python","tag-40","tag-42"],"_links":{"self":[{"href":"https:\/\/blog.gamein.vip\/wp-json\/wp\/v2\/posts\/683","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/blog.gamein.vip\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/blog.gamein.vip\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/blog.gamein.vip\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/blog.gamein.vip\/wp-json\/wp\/v2\/comments?post=683"}],"version-history":[{"count":14,"href":"https:\/\/blog.gamein.vip\/wp-json\/wp\/v2\/posts\/683\/revisions"}],"predecessor-version":[{"id":704,"href":"https:\/\/blog.gamein.vip\/wp-json\/wp\/v2\/posts\/683\/revisions\/704"}],"wp:attachment":[{"href":"https:\/\/blog.gamein.vip\/wp-json\/wp\/v2\/media?parent=683"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/blog.gamein.vip\/wp-json\/wp\/v2\/categories?post=683"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/blog.gamein.vip\/wp-json\/wp\/v2\/tags?post=683"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}