Last active
December 7, 2017 10:20
-
-
Save zii/231bbc2c5f8ac0562972d16ea7defbbc to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#coding: utf-8 | |
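# TODO: the occupation still needs confirming; search with the occupation name included first
# TODO: avatar
# TODO: if there is no description, build one from the job and other fields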
from scrapy import Selector | |
import requests | |
from lorm import Struct | |
# host:port of the proxy that google_search() uses for both http and https
# requests; a falsy value (e.g. '' or None) makes it send requests directly.
PROXY_ADDR = '127.0.0.1:5555'
def google_search(keyword, hl=None):
    """
    Fetch the Google search result page for a keyword.

    The request goes through PROXY_ADDR (for both http and https) when that
    module-level setting is truthy, and uses a 10 second timeout.

    :param keyword: search keyword
    :param hl: language, en/zh; omitted from the query when falsy
    :return: (html, error) -- (response body, None) on HTTP 200, otherwise
        (None, HTTP status code)
    """
    url = u"https://www.google.com/search"
    proxies = None
    if PROXY_ADDR:
        proxies = {'http': PROXY_ADDR, 'https': PROXY_ADDR}
    headers = {}
    #headers['user-agent'] = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    params = {
        'q': keyword,
        'ie': 'UTF-8',
    }
    if hl:
        params['hl'] = hl
    r = requests.get(url, params=params, timeout=10, proxies=proxies,
                     headers=headers)
    if r.status_code != 200:
        # requests.Response has no `status` attribute; the numeric code
        # lives in `status_code`.
        return None, r.status_code
    return r.content, None
def google_kg(keyword, hl=None):
    """
    Grab the Google knowledge graph panel for a keyword.

    Runs google_search() and parses the `#rhs_block` element of the result
    page, printing the name, job, description, key/value facts and any
    table-based entries it finds.

    :param keyword: keyword
    :param hl: language, en/zh
    :return: (data, error) -- (Struct, None) on success; (None, error) when
        the search returned no html or the page has no `#rhs_block`
    """
    html, error = google_search(keyword, hl)
    if not html:
        return None, error
    sel = Selector(text=html, type='html')
    rhs_block = sel.css('#rhs_block')
    if not rhs_block:
        return None, 'not find #rhs_block'

    name = ''.join(rhs_block.css('div._B5d::text').extract())
    print('name: %s' % name)
    job = rhs_block.css('div._zdb::text').extract_first()
    print('job: %s' % job)
    desc = ''.join(rhs_block.css('div._tXc span::text').extract())
    print('desc: %s' % desc)

    keys = rhs_block.css('span._gS')
    vals = rhs_block.css('span._tA')
    # Pair labels with values positionally; zip stops at the shorter list, so
    # a label without a matching value no longer raises IndexError.
    for key_sel, val_sel in zip(keys, vals):
        key = key_sel.css('::text').extract_first()
        val = ''.join(val_sel.css('::text').extract())
        print('key: %s' % key)
        print('val: %s' % val)

    # Second approach (there may be several profiles, which need confirming;
    # "Actress" can also be used to determine gender).
    for table in rhs_block.css('table'):
        title = ''.join(table.css('td div._fce::text').extract())
        print('title: %s' % title)
        for el in table.css('td div._Vbe'):
            info = ''.join(el.css('::text').extract())
            print('info: %s' % info)

    # NOTE: the extracted fields are only printed for now; the returned
    # Struct carries none of them.
    data = Struct()
    return data, None
if __name__ == '__main__':
    # Debug helper: dump a raw result page to disk for offline inspection.
    # html, error = google_search(u'Zoë Cooper', 'zh')
    # with open('5.html', 'wb') as f:
    #     f.write(html)
    result, failure = google_kg(u"沈炜竣", 'zh')
    # Show the parsed data when it is truthy, otherwise the error.
    print(result or failure)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment