SSL: 1
Warning: Cannot modify header information - headers already sent by (output started at /www/blog/wp-includes/load.php:1646) in /www/blog/wp-includes/rest-api/class-wp-rest-server.php on line 1893

Warning: Cannot modify header information - headers already sent by (output started at /www/blog/wp-includes/load.php:1646) in /www/blog/wp-includes/rest-api/class-wp-rest-server.php on line 1893

Warning: Cannot modify header information - headers already sent by (output started at /www/blog/wp-includes/load.php:1646) in /www/blog/wp-includes/rest-api/class-wp-rest-server.php on line 1893

Warning: Cannot modify header information - headers already sent by (output started at /www/blog/wp-includes/load.php:1646) in /www/blog/wp-includes/rest-api/class-wp-rest-server.php on line 1893

Warning: Cannot modify header information - headers already sent by (output started at /www/blog/wp-includes/load.php:1646) in /www/blog/wp-includes/rest-api/class-wp-rest-server.php on line 1893

Warning: Cannot modify header information - headers already sent by (output started at /www/blog/wp-includes/load.php:1646) in /www/blog/wp-includes/rest-api/class-wp-rest-server.php on line 1893

Warning: Cannot modify header information - headers already sent by (output started at /www/blog/wp-includes/load.php:1646) in /www/blog/wp-includes/rest-api/class-wp-rest-server.php on line 1893

Warning: Cannot modify header information - headers already sent by (output started at /www/blog/wp-includes/load.php:1646) in /www/blog/wp-includes/rest-api/class-wp-rest-server.php on line 1893
{"id":683,"date":"2020-02-28T15:57:20","date_gmt":"2020-02-28T07:57:20","guid":{"rendered":"https:\/\/blog.weskiller.com\/?p=683"},"modified":"2020-03-01T03:48:59","modified_gmt":"2020-02-29T19:48:59","slug":"%e6%9c%80%e6%96%b0%e5%85%a8%e5%9b%bd%e5%8c%ba%e5%88%92%e4%bb%a3%e7%a0%81%e5%92%8c%e5%9f%8e%e4%b9%a1%e5%88%92%e5%88%86%e4%bb%a3%e7%a0%812020-02-05-%e6%95%b0%e6%8d%ae","status":"publish","type":"post","link":"https:\/\/blog.gamein.vip\/pull-cn-area.html","title":{"rendered":"\u6700\u65b0\u5168\u56fd\u533a\u5212\u4ee3\u7801\u548c\u57ce\u4e61\u5212\u5206\u4ee3\u7801(2020-02-05) \u6570\u636e"},"content":{"rendered":"\n

\u7531\u4e8e\u73b0\u5728\u8fd9\u4e2a\u516c\u53f8\u4e3b\u8981\u670d\u52a1\u4e8e\u653f\u5e9c\uff0c\u9879\u76ee\u4e0a\u7ecf\u5e38\u90fd\u4f1a\u7528\u5230\u884c\u653f\u533a\u57df\u7684\u5217\u8868\u548c\u67e5\u627e\uff0c\u5728\u7f51\u4e0a\u627e\u4e86\u5f88\u591a\uff0c\u90fd\u662f\u6b8b\u7f3a\u6216\u8005\u8fc7\u65f6\u7684\uff0c\u521a\u597d\u6700\u8fd1\u7684\u4e00\u4e2a\u9879\u76ee\u4e5f\u9700\u8981\uff0c\u6240\u4ee5\u5c31\u5199\u4e86\u4e00\u4e2a\u722c\u866b\u3002\u4ece\u56fd\u5bb6\u7edf\u8ba1\u5c40\u4e0a\u722c\u4e0b\u6765\u3002<\/p>\n\n\n

\nimport copy\nimport json\nimport multiprocessing\nimport os\nimport random\nimport re\nimport traceback\nfrom time import sleep\n\nimport requests\nimport pymysql\nfrom bs4 import BeautifulSoup\n\nagent_list = [\n    "Mozilla\/5.0 (Linux; Android 6.0; Nexus 5 Build\/MRA58N) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/75.0.3770.142 Mobile Safari\/537.36",\n    "Mozilla\/5.0 (Linux; U; Android 4.0.2; en-us; Galaxy Nexus Build\/ICL53F) AppleWebKit\/534.30 (KHTML, like Gecko) Version\/4.0 Mobile Safari\/534.30",\n    "Mozilla\/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit\/601.7.7 (KHTML, like Gecko) Version\/9.1.2 Safari\/601.7.7",\n    "Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/60.0.3112.113 Safari\/537.36",\n    "Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/74.0.3729.169 Safari\/537.36",\n    "Mozilla\/5.0 (Windows NT 10.0; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/51.0.2704.103 Safari\/537.36",\n    "Mozilla\/5.0 (Windows NT 10.0; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/52.0.2743.116 Safari\/537.36",\n    "Mozilla\/5.0 (Windows NT 10.0; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/74.0.3729.169 Safari\/537.36",\n    "Mozilla\/5.0 (Windows NT 5.2) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/58.0.3029.110 Safari\/537.36 SE 2.X MetaSr 1.0",\n    "Mozilla\/5.0 (Windows NT 6.1) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/41.0.2228.0 Safari\/537.36",\n    "Mozilla\/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/49.0.2623.105 Safari\/537.36",\n    "Mozilla\/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/69.0.3497.92 Safari\/537.36",\n    "Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/51.0.2704.103 Safari\/537.36",\n    "Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/52.0.2743.116 Safari\/537.36",\n    "Mozilla\/5.0 (X11; Linux x86_64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/56.0.2924.87 Safari\/537.36",\n    "Mozilla\/5.0 (X11; Linux x86_64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/63.0.3239.108 Safari\/537.36",\n]\n\ndef patch_href(base_url: str, href: str):\n    return re.sub('\/[a-z0-9]+\\.html$', "\/%s" % href, base_url)\n\ndef connect():\n    return pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='', db='area', charset='utf8')\n\ndef html_root(url):\n    return url.replace("http:\/\/www.stats.gov.cn\/tjsj\/tjbz\/tjyqhdmhcxhfdm\/2019","\/data\/tjsj")\n\ndef save_html(url,content):\n    path = html_root(url)\n    directory = os.path.dirname(path)\n    if not os.path.isdir(directory):\n        os.mkdir(directory, 0o755)\n    try:\n        with open(path,'w+',encoding='gbk') as f:\n            f.write(content)\n    except BaseException as e:\n        print(" write html to file failed (%s) (%s)" % (repr(e),url))\n\ndef get_free_proxy(proxies:list,num):\n    url = ''\n    result = requests.get(url,headers={'User-Agent': get_agent()})\n    try:\n        data = json.loads(result.text)\n        for i in data["data"]:\n            if len(proxies) > 0:\n                proxies.pop(0)\n            proxies.append("http:\/\/%s:%s" % (i["IP"],i["Port"]))\n        return True\n    except BaseException as e:\n        print(repr(e))\n        return False\n\ndef flush_http_proxy():\n    while not get_free_proxy(Area.proxies,8):\n        sleep(2)\n\ndef get_http_proxy():\n    if len(Area.proxies) > 0:\n        return Area.proxies[0]\n    else:\n        flush_http_proxy()\n        return get_http_proxy()\n\ndef get_agent():\n    return random.choice(agent_list)\n\ndef fails():\n    if Area.fails > 3:\n        Area.fails = 0\n        Area.proxies.pop(0)\n    else:\n        sleep(Area.fails * 1)\n    Area.fails += 1\n\ndef success():\n    if Area.fails > 1:\n       Area.fails -= 1\n\ndef create_table():\n    cur = connect().cursor()\n    cur.execute("DROP TABLE IF EXISTS `%s`" % Area.table_name)\n    cur.execute("""\n    CREATE TABLE `%s` (\n      `id` int(10) unsigned NOT NULL AUTO_INCREMENT,\n      `level` int(10) DEFAULT NULL,\n      `code` varchar(12) DEFAULT NULL,\n      `short_code` varchar (12) DEFAULT NULL,\n      `parent` varchar(12) DEFAULT NULL,\n      `path` json DEFAULT NULL,\n      `name` varchar(32) DEFAULT NULL,\n      `merger_name` varchar(255) DEFAULT NULL,\n      PRIMARY KEY (`id`)\n    ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;\n    """ % Area.table_name)\n    cur.connection.commit()\n\nclass Area(object):\n    table_name = "cn_area"\n    fails = 0\n    proxies = []\n    verify_string = "\u7edf\u8ba1\u7528\u533a\u5212\u4ee3\u7801"\n    connect = connect()\n\n    def __init__(self, short_code: str, path: [str], level: int, merger_name: str, name: str, href: str):\n        self.short_code: str = short_code\n        self.href: str = href\n        self.code: str = short_code + "000000000000"[len(short_code):]\n        if len(path) > 0:\n            self.parent: str = path[-1]\n        else:\n            self.parent: str = "0"\n        self.path: [] = path\n        path.append(self.code)\n        self.level: int = level\n        self.name: str = name\n        if merger_name != "":\n            self.merger_name: str = ("%s,%s" % (merger_name, name))\n        else:\n            self.merger_name: str = name\n\n    def save(self):\n        cur = Area.connect.cursor()\n        sql = (\n            'INSERT INTO `%s` (`short_code`,`code`, `level`,`parent`, `name`, `merger_name`,`path`) VALUES("%s","%s", %s, "%s", "%s","%s",\\'%s\\')' % (\n            Area.table_name,self.short_code, self.code, self.level, self.parent, self.name, self.merger_name,\n            json.dumps(self.path)))\n        cur.execute(sql)\n        cur.connection.commit()\n\n    def pull(self):\n        self.view()\n        self.save()\n        if self.href != "":\n            content = html_get(self.href)\n\n            while (not isinstance(content,str)) or content.find(Area.verify_string) < 0:\n                fails()\n                content = html_get(self.href)\n\n            success()\n            save_html(self.href,content)\n            soup = BeautifulSoup(content, 'lxml')\n            data = soup.select("tr.citytr, tr.countytr, tr.towntr, tr.villagetr")\n            for area in data:\n                td = area.find_all("td")\n                full_code: str = td[0].get_text()\n                level = self.level + 1\n                if level > 3:\n                    code: str = full_code[:3 *2 + (level - 3)*3]\n                else:\n                    code: str = full_code[:level * 2]\n                if td[0].a is not None and td[0].a["href"] is not None:\n                    href = patch_href(self.href, td[0].a["href"])\n                else:\n                    href = ""\n                if len(td) == 2:  # \u57ce\/\u9547\n                    name: str = td[1].get_text()\n                elif len(td) == 3:  # \u57ce\/\u4e61\n                    name: str = td[2].get_text()\n                else:\n                    raise BaseException("unexpect html content")\n                this = Area(short_code=code,path=copy.deepcopy(self.path),level=level,merger_name=self.merger_name,name=name,href=href)\n                this.pull()\n\n    def view(self):\n        print(self.__dict__)\n\ndef html_get(url):\n    try:\n        print("%s => %s" % ("pull", url))\n        body = requests.get(url,\n            timeout= 3,\n            headers={'User-Agent': get_agent()},\n            proxies={'http': get_http_proxy()}\n        )\n        body.encoding = 'GBK'\n        print(body.text)\n    except BaseException as e:\n        print(repr(e))\n        fails()\n    else:\n        return body.text\n#\u591a\u8fdb\u7a0b\u6267\u884c\u5165\u53e3\ndef fetch(province :map):\n    Area.connect = connect()\n    area = Area(\n        short_code=province["short_code"],\n        path=province["path"],\n        level=province["level"],\n        merger_name=province["merger_name"],\n        name=province["name"],\n        href=province["href"]\n    )\n    try:\n        area.pull()\n    except BaseException as e:\n        print(repr(e))\n        traceback.print_exc(file=open("error.log",'a+'))\n\n\ndef distribute(url: str):\n    content = html_get(url)\n    while (not isinstance(content, str)) or content.find(Area.verify_string) < 0:\n        fails()\n        content = html_get(url)\n    success()\n    save_html(url,content)\n    soup = BeautifulSoup(content, 'lxml')\n    data = soup.select("tr.provincetr > td")\n    pool = multiprocessing.Pool(64)\n    for row in data:\n        if row.get_text():\n            if row.a is not None:\n                if row.a is not None and row.a["href"] is not None:\n                    href = patch_href(url, row.a["href"])\n                else:\n                    href = ""\n                area = {\n                    "short_code":re.split('[.\/]', href)[-2],\n                    "path":copy.deepcopy([]),\n                    "level":1,\n                    "merger_name":"",\n                    "name":row.get_text(),\n                    "href":href,\n                }\n                pool.apply_async(fetch,(area,))\n    pool.close()\n    pool.join()\n\nif __name__ == '__main__':\n    create_table()\n    flush_http_proxy()\n    baseUrl = "http:\/\/www.stats.gov.cn\/tjsj\/tjbz\/tjyqhdmhcxhfdm\/2019\/index.html"\n    distribute(baseUrl)\n<\/pre><\/div>\n\n\n

\u4e0b\u9762\u5df2\u7ecf\u62c9\u53d6\u5b8c\u6574\u7684\u6570\u636e\u5e93\u6587\u4ef6\uff08\u4e0d\u542b\u6e2f\u6fb3\u53f0\uff09\uff0c\u5728http\u4ee3\u7406\u7a33\u5b9a\u7684\u60c5\u51b5\u4e0b\u5927\u81f4\u9700\u898120\u5206\u949f\u5373\u53ef\u62c9\u53d6\u5171704750\u6761\u6570\u636e\uff08\u4e0d\u5f97\u5410\u69fd\u4e0b\uff0c\u56fd\u5bb6\u7684\u53d1\u5e03\u7f51\u7ad9\uff0c\u7adf\u7136\u8fd8\u6709\u4e71\u7801\u95ee\u9898\uff0c\u8fd8\u662f\u6211\u81ea\u5df1\u624b\u52a8\u6539\u7684\u3002\uff09<\/p>\n\n\n\n

cn_area_2020.sql<\/a>\u4e0b\u8f7d<\/a><\/div>\n","protected":false},"excerpt":{"rendered":"

\u7531\u4e8e\u73b0\u5728\u8fd9\u4e2a\u516c\u53f8\u4e3b\u8981\u670d\u52a1\u4e8e\u653f\u5e9c\uff0c\u9879\u76ee\u4e0a\u7ecf\u5e38\u90fd\u4f1a\u7528\u5230\u884c\u653f\u533a\u57df\u7684\u5217\u8868\u548c\u67e5\u627e\uff0c\u5728\u7f51\u4e0a\u627e\u4e86\u5f88\u591a\uff0c\u90fd\u662f\u6b8b\u7f3a\u6216\u8005\u8fc7\u65f6\u7684\uff0c\u521a […]<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"closed","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[39],"tags":[40,42],"class_list":["post-683","post","type-post","status-publish","format-standard","hentry","category-python","tag-40","tag-42"],"_links":{"self":[{"href":"https:\/\/blog.gamein.vip\/wp-json\/wp\/v2\/posts\/683","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/blog.gamein.vip\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/blog.gamein.vip\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/blog.gamein.vip\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/blog.gamein.vip\/wp-json\/wp\/v2\/comments?post=683"}],"version-history":[{"count":14,"href":"https:\/\/blog.gamein.vip\/wp-json\/wp\/v2\/posts\/683\/revisions"}],"predecessor-version":[{"id":704,"href":"https:\/\/blog.gamein.vip\/wp-json\/wp\/v2\/posts\/683\/revisions\/704"}],"wp:attachment":[{"href":"https:\/\/blog.gamein.vip\/wp-json\/wp\/v2\/media?parent=683"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/blog.gamein.vip\/wp-json\/wp\/v2\/categories?post=683"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/blog.gamein.vip\/wp-json\/wp\/v2\/tags?post=683"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}