Last active
January 2, 2016 08:39
-
-
Save conanca/8277721 to your computer and use it in GitHub Desktop.
抓取国家统计局网站上最新的县及县以上行政区划代码,并保存成 json 格式的js文件(供前端用)和SQL语句(供后端用)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python
# -*- coding: utf-8 -*-
'''Scrape the latest county-level-and-above administrative division codes from
the National Bureau of Statistics website, and save them both as a JSON-format
.js file (for the front end) and as SQL statements (for the back end).
by Conanca
'''
import urllib2,json | |
# Index page that lists the published administrative-division-code articles.
url_prefix = 'http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/'
# Name of the JavaScript variable emitted into the .js output file.
var_text = 'xzqh'
# Keys used in every division dict: 'C' = code, 'N' = name, 'S' = sub-divisions.
code_text = 'C'
name_text = 'N'
sub_text = 'S'
# Output file paths (front-end js and back-end SQL).
jsfile_path = 'xzqh.js'
sqlfile_path = 'xzqh.sql'
def set_proxy(proxy):
    '''Install a global HTTP proxy used by all subsequent urllib2 requests.'''
    handler = urllib2.ProxyHandler({'http': proxy})
    opener = urllib2.build_opener(handler)
    urllib2.install_opener(opener)
def get_latest_page():
    '''Return the URL of the most recent division-code publication page.

    Finds the first relative link of the form './20xx....html' on the index
    page and joins it onto url_prefix.
    '''
    index_html = urllib2.urlopen(url_prefix).read()
    start = index_html.find("./20") + 2   # skip the leading './'
    end = index_html.find(".html") + 5    # keep the '.html' suffix
    latest_url = url_prefix + index_html[start:end]
    print('latest page:' + latest_url)
    return latest_url
def crawl_page(xzqhdm_url):
    '''Fetch the publication page and return the markup inside its table body.

    The slice starts 9 characters past the match (i.e. past '<TBODY>' plus the
    two characters that follow it) and stops at '</TBODY></TABLE>'.
    '''
    print('crawling...')
    page = urllib2.urlopen(xzqhdm_url).read()
    begin = page.find('<TBODY>') + 9
    finish = page.find("</TBODY></TABLE>")
    return page[begin:finish]
def creat_item(item_str):
    '''Parse one table-row fragment into a division dict {code_text: ..., name_text: ...}.

    The code sits between the 'lang=EN-US>' span opening and its closing
    markup; the name follows the Tahoma style attribute with the trailing
    row-closing markup stripped off.
    '''
    code_begin = item_str.index('lang=EN-US>') + 11
    code_end = item_str.index('<o:p></o:p></SPAN></P></TD>')
    code = item_str[code_begin:code_end]
    name_begin = item_str.index('mso-bidi-font-family: Tahoma">') + 30
    name = item_str[name_begin:].replace("</SPAN></P></TD></TR>", "")
    item = {code_text: code, name_text: name}
    print(item)
    return item
def convert(content):
    '''Convert the crawled table markup into a nested province/city/district list.

    Returns a list of province dicts; each province may hold its cities under
    sub_text, and each city its districts.  Rows are classified by the inline
    style of the fragment: provinces are flush-left, cities indented 12pt,
    districts 24pt (or 23.95pt margin).  Taiwan / Hong Kong / Macau are
    appended by hand because the table does not list them.
    '''
    print('converting...')
    # Rows are separated by this exact run of closing markup.
    item_arr = content.split('<SPAN lang=EN-US><o:p></o:p></SPAN></SPAN></P></TD></TR>')
    p_list = []
    current_p = {}
    current_p_sub = []
    current_c = {}
    current_c_sub = []
    current_d = {}
    for item_str in item_arr:
        if item_str.find('TEXT-ALIGN: left; MARGIN: 0cm 0cm 0pt; mso-pagination: widow-orphan') >= 0:
            # Province row: parse it and start a fresh child list for its cities.
            current_p = creat_item(item_str)
            current_p_sub = []
            if len(current_p) != 0:
                # Attach the (still empty) city list and register the province.
                current_p[sub_text] = current_p_sub
                p_list.append(current_p)
        elif item_str.find('TEXT-INDENT: 12pt;') >= 0:
            # City row: parse it and start a fresh child list for its districts.
            current_c = creat_item(item_str)
            current_c_sub = []
            if len(current_c) != 0:
                # Attach the district list and hook the city under the current province.
                current_c[sub_text] = current_c_sub
                current_p_sub.append(current_c)
        elif (item_str.find('TEXT-INDENT: 24pt;') >= 0 or item_str.find('TEXT-ALIGN: left; MARGIN: 0cm 0cm 0pt 23.95pt;') >= 0):
            # District row: hook it under the current city.
            current_d = creat_item(item_str)
            current_c_sub.append(current_d)
        else:
            # Usually the empty trailing fragment produced by the final separator.
            print('invalid item string:' + item_str)  # typo fixed: was 'invaild'
    # Not present in the scraped table; append them manually (no sub-divisions).
    p_list.append({code_text: '710000', name_text: '台湾省'})
    p_list.append({code_text: '810000', name_text: '香港特别行政区'})
    p_list.append({code_text: '820000', name_text: '澳门特别行政区'})
    return p_list
def to_sql(p_list):
    '''Render the nested division list as one CREATE TABLE + multi-row INSERT script.

    TYPE column: 'p' province, 'c' city, 'd' district; PARENT_CODE links each
    child row to its parent's CODE (NULL for provinces).  The trailing ',\\n'
    of the last value row is replaced with ';'.

    NOTE(review): values are interpolated directly into the SQL text.  The data
    comes from the scraped statistics page, but this must not be reused on
    untrusted input without parameterization/escaping.
    '''
    # Build the pieces in a list and join once, instead of quadratic '+='.
    parts = ['CREATE TABLE T_XZQH(CODE CHAR(6) PRIMARY KEY, NAME VARCHAR(30), PARENT_CODE CHAR(6), TYPE CHAR(1));\nINSERT INTO T_XZQH VALUES\n']
    for p in p_list:
        parts.append("('" + p[code_text] + "','" + p[name_text] + "',NULL,'p'),\n")
        # dict.has_key() was removed in Python 3; 'in' behaves the same on both.
        if sub_text in p:
            for c in p[sub_text]:
                parts.append("('" + c[code_text] + "','" + c[name_text] + "','" + p[code_text] + "','c'),\n")
                if sub_text in c:
                    for d in c[sub_text]:
                        parts.append("('" + d[code_text] + "','" + d[name_text] + "','" + c[code_text] + "','d'),\n")
    sql = ''.join(parts)
    return sql[:-2] + ";"
def write_to(content, file_path):
    '''Write the string to file_path, truncating any existing file.

    Uses a 'with' block so the handle is closed even if write() raises
    (the original leaked the handle on a write error).

    NOTE(review): under Python 2, writing a unicode string here (e.g. the
    json.dumps output with ensure_ascii=False) relies on the default codec —
    confirm non-ASCII content does not need an explicit .encode('utf-8').
    '''
    print('writing...')
    with open(file_path, 'w') as f:
        f.write(content)
    print('done!')
if __name__ == '__main__':
    # set_proxy('http://192.168.2.59:8080')
    latest_url = get_latest_page()
    raw_table = crawl_page(latest_url)
    p_list = convert(raw_table)
    # Front-end output: a js file assigning the compact JSON to a variable.
    js_body = 'var ' + var_text + ' = ' + json.dumps(p_list, ensure_ascii=False, separators=(',', ':'))
    write_to(js_body, jsfile_path)
    # Back-end output: the DDL + INSERT script.
    write_to(to_sql(p_list), sqlfile_path)
    print('finish!')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment