magicshui · December 10, 2015 02:38 · magicshui · Feb 5, 2013
diff --git a/pickcode.py b/pickcode.py
 #coding: utf-8
 import requests
 import re
 from requests import *
 import pyquery
 from pyquery import PyQuery as pq
 from lxml import etree
 import urllib
 import sys
 # Python 2 only: sys is reloaded and utf8 is then installed as the
 # interpreter-wide default string encoding.
 # NOTE(review): presumably the reload is what makes setdefaultencoding
 # callable here -- confirm before porting; neither call exists in Python 3.
 reload(sys)
 sys.setdefaultencoding('utf8') 
 # Pages that the driver loop at the bottom of the script saves one per file.
 urls = [
 	"http://www.infoq.com/cn/articles/interview-sam-haskins",
 	"http://www.infoq.com/cn/news/2013/03/MySQL-Reference-Architectures",
 	"http://www.infoq.com/cn/news/2013/03/58com-opensourced-argo",
 	"http://www.infoq.com/cn/news/2013/03/tencent-devops",
 	"http://www.infoq.com/cn/news/2013/03/ruby-to-go",
 	"http://www.infoq.com/cn/news/2013/03/linkedin-databus",
 	"http://www.infoq.com/cn/articles/full-function-team-about-data",
 	"http://www.infoq.com/cn/news/2012/08/facebook-tools-culture",
 	"http://www.infoq.com/cn/articles/Automated-Builds",
 	"http://www.infoq.com/cn/news/2013/03/k4w-sdk1.7-release",
 	"http://www.infoq.com/cn/news/2013/03/Microsoft-Kinect-Samples",
 	"http://www.infoq.com/cn/news/2013/03/k4w-roadshow-beijing",
 	"http://www.infoq.com/cn/articles/java-memory-model-7",
 	"http://www.infoq.com/cn/articles/webkit-for-developers",
 	"http://www.infoq.com/cn/articles/java-threadPool",
 	"http://www.infoq.com/cn/articles/teamtoy2",
 ]

 # Pages whose short descriptions are concatenated into one extra file.
 prod_urls = [
 	"http://www.infoq.com/cn/articles/java7-nio2",
 	"http://www.infoq.com/cn/news/2013/03/spring-for-apache-hadoop-1.0",
 	"http://www.infoq.com/cn/news/2013/03/VS2012_CTP4",
 	"http://www.infoq.com/cn/news/2013/03/jquery-github-plugin-repo",
 	"http://www.infoq.com/cn/news/2013/03/Microsoft-Azure-Amazon-Android",
 	"http://www.infoq.com/cn/news/2013/03/node.js-0.10-released",
 	"http://www.infoq.com/cn/news/2013/03/lienzo",
 	"http://www.infoq.com/cn/news/2013/03/eclipse-rap-2-released",
 	"http://www.infoq.com/cn/news/2013/03/jquery-mobile-1.3.0",
 	"http://www.infoq.com/cn/news/2013/03/red-gate-simply-web-dev",
 	"http://www.infoq.com/cn/news/2013/03/google-go-1-1",
 	"http://www.infoq.com/cn/news/2013/03/anime-scrum-primer",
 	"http://www.infoq.com/cn/news/2013/03/code-review-visual-studio-2012",
 	"http://www.infoq.com/cn/news/2013/03/hudson-eclipse",
 	"http://www.infoq.com/cn/news/2013/03/chartjs-v.0.1-released",
 	"http://www.infoq.com/cn/news/2013/03/Grunt-0.4.0-Released",
 	"http://www.infoq.com/cn/news/2013/03/Lingual",
 	"http://www.infoq.com/cn/news/2013/03/gcc48_released",
 	"http://www.infoq.com/cn/news/2013/03/nuodb",
 ]

 # Output page skeleton; the {name} placeholders are filled via str.format().
 html_="""<html>
 	<body><div>{head}{title}{author}{content}<p><strong>原文链接：<a href="{org}">{org}</a></strong></p>{likes}</div></body>
 	</html>"""
 def get_rec(title,path,ids):
 	"""Request related links for a page and render them as HTML anchors.

 	``ids``, ``title`` and ``path`` are POSTed (with language 'zh') to the
 	recommendationlinks endpoint.  The response body is parsed as JSON and
 	every entry becomes one ``<a>`` built from its 'url' and 'title' keys;
 	the anchors are joined with ``<br/>``.
 	"""
 	payload={
 		"topicIds":ids,
 		"title":title,
 		"contentPath":path,
 		"language":'zh',
 	}
 	response=requests.post('http://www.infoq.com/api/recommendationlinks.action',payload)
 	import json
 	anchors=[]
 	for entry in json.loads(response.content):
 		anchors.append('<a href="%s">%s</a>'%(entry['url'],entry['title']))
 	return "<br/>".join(anchors)

 def get_article_content(url):
 	"""Download an article page and return it wrapped in the html_ template.

 	The page's ``.box-content-5`` element is used as the body after a fixed
 	list of child selectors has been removed from it and ``/resource``
 	image sources have been made absolute.  Related links from get_rec()
 	are appended when the page source contains a topicIds entry and the
 	request succeeds; otherwise that slot is left empty.  The template's
 	author slot is passed as an empty string.

 	:param url: absolute URL of the article page.
 	:returns: the filled-in html_ template as a string.
 	"""
 	data=requests.get(url)
 	d = pq(data.content)
 	pattern_pre='{"topicIds" : "(.*)", "title"'
 	match_pre = re.findall(pattern_pre, data.content)
 	title=d('title').text()
 	print(title)
 	box=d('.box-content-5')
 	for selector in ('a[rel="permalink"]','script','.h1-r','.comments-header',
 			'span','.forum-list-tree','.content-sidebar-wide','.comments-sort',
 			'.addthis_toolbox','.comments','.tags2','.box-bottom'):
 		box=box.remove(selector)
 	content=box.outerHtml().replace('src="/resource','src="http://www.infoq.com/resource')
 	# get_rec is meant to receive a site-relative path (e.g. /news/2013/03/...).
 	# The URLs used by this script carry the www. host, which the previous
 	# 'http://infoq.com/cn' replacement never matched, so accept both forms.
 	path=re.sub(r'^http://(?:www\.)?infoq\.com/cn','',url)
 	likes=''
 	if match_pre:
 		try:
 			likes=get_rec(title,path,match_pre[0])
 		except Exception:
 			# Related links are optional: keep the page without them, but do
 			# not swallow KeyboardInterrupt/SystemExit as a bare except would.
 			likes=''
 	return html_.format(head='',title=title,author='',content=content,likes=likes,org=url)

 def get_news_content(url):
 	"""Download a news page and return it wrapped in the html_ template.

 	The body is the page's ``#newsContent`` element with the permalink,
 	``.h1-r`` and ``#relatedContent`` nodes removed, ``/resource`` image
 	sources made absolute and ``&#13;`` entities stripped.  The author
 	block is taken from ``.box-content-5>.info`` when present.  Related
 	links from get_rec() are appended under a heading when the page source
 	contains a topicIds entry and the request succeeds.

 	:param url: absolute URL of the news page.
 	:returns: the filled-in html_ template as a string.
 	"""
 	data=requests.get(url)
 	d = pq(data.content)
 	pattern_pre='{"topicIds" : "(.*)", "title"'
 	match_pre = re.findall(pattern_pre, data.content)
 	title=d('title').text()
 	print(title)
 	# outerHtml() gives None for an empty selection; fall back to no author
 	# block instead of failing on None.replace().
 	author=d('.box-content-5>.info').outerHtml() or ''
 	author=author.replace('href="/cn/author','href="http://infoq.com/cn/author')
 	news=d('#newsContent').remove('a[rel="permalink"]').remove('.h1-r').remove('#relatedContent')
 	content=news.outerHtml().replace('src="/resource','src="http://www.infoq.com/resource').replace('&#13;','')
 	# get_rec is meant to receive a site-relative path (e.g. /news/2013/03/...).
 	# The URLs used by this script carry the www. host, which the previous
 	# 'http://infoq.com/cn' replacement never matched, so accept both forms.
 	path=re.sub(r'^http://(?:www\.)?infoq\.com/cn','',url)
 	likes=''
 	if match_pre:
 		try:
 			likes="<strong>相关内容</strong><br/>"+get_rec(title,path,match_pre[0])
 		except Exception:
 			# Related links are optional: keep the page without them, but do
 			# not swallow KeyboardInterrupt/SystemExit as a bare except would.
 			likes=''
 	return html_.format(head='',title=title,author=author,content=content,likes=likes,org=url)
 def get_prod_content(url):
 	"""Download a page and return a short summary wrapped in the html_ template.

 	The body is the value of the page's ``<meta name="description">`` tag
 	and the author block is ``.box-content-5>.info``; either one is
 	rendered as an empty string when the page does not provide it.

 	:param url: absolute URL of the page.
 	:returns: the filled-in html_ template as a string.
 	"""
 	data=requests.get(url)
 	d = pq(data.content)
 	# attr() gives None when the tag is missing, which str.format would
 	# render as the literal text "None".
 	description=d('meta[name="description"]').attr('content') or ''

 	title=d('title').text()
 	print(title)
 	# outerHtml() gives None for an empty selection; avoid None.replace().
 	author=d('.box-content-5>.info').outerHtml() or ''
 	author=author.replace('href="/cn/author','href="http://infoq.com/cn/author')

 	return html_.format(head='',title=title,author=author,content=description,likes="",org=url)

 # Save every page in `urls` to its own numbered file.  The extractor is
 # chosen from the URL; a URL matching neither pattern leaves an empty file.
 i=0
 for i,x in enumerate(urls):
 	with open("/Users/Arthur/Desktop/one/arch/%d.html"%i,'w+') as f:
 		if x.find('/news/')>0:
 			f.write(get_news_content(x))
 		if x.find('/article')>0:
 			f.write(get_article_content(x))
 # The combined summary file takes the next free number after the pages.
 i=len(urls)
 with open("/Users/Arthur/Desktop/one/arch/%d.html"%i,'w+') as f:
 	f.write(''.join(get_prod_content(x) for x in prod_urls))
 	i+=1

 #print get_rec(title="从小型网站到超大规模网站的MySQL参考架构",path="/news/2013/03/MySQL-Reference-Architectures")
 #get_interview_content('http://www.infoq.com/interviews/george-dinwiddie-three-amigos')
	#coding: utf-8
	import requests
	import re
	from requests import *
	import pyquery
	from pyquery import PyQuery as pq
	from lxml import etree
	import urllib
	import sys
	# Python 2 only: sys is reloaded and utf8 is then installed as the
	# interpreter-wide default string encoding.
	# NOTE(review): presumably the reload is what makes setdefaultencoding
	# callable here -- confirm before porting; neither call exists in Python 3.
	reload(sys)
	sys.setdefaultencoding('utf8')
	# Pages that the driver loop at the bottom of the script saves one per file.
	urls = [
		"http://www.infoq.com/cn/articles/interview-sam-haskins",
		"http://www.infoq.com/cn/news/2013/03/MySQL-Reference-Architectures",
		"http://www.infoq.com/cn/news/2013/03/58com-opensourced-argo",
		"http://www.infoq.com/cn/news/2013/03/tencent-devops",
		"http://www.infoq.com/cn/news/2013/03/ruby-to-go",
		"http://www.infoq.com/cn/news/2013/03/linkedin-databus",
		"http://www.infoq.com/cn/articles/full-function-team-about-data",
		"http://www.infoq.com/cn/news/2012/08/facebook-tools-culture",
		"http://www.infoq.com/cn/articles/Automated-Builds",
		"http://www.infoq.com/cn/news/2013/03/k4w-sdk1.7-release",
		"http://www.infoq.com/cn/news/2013/03/Microsoft-Kinect-Samples",
		"http://www.infoq.com/cn/news/2013/03/k4w-roadshow-beijing",
		"http://www.infoq.com/cn/articles/java-memory-model-7",
		"http://www.infoq.com/cn/articles/webkit-for-developers",
		"http://www.infoq.com/cn/articles/java-threadPool",
		"http://www.infoq.com/cn/articles/teamtoy2",
	]

	# Pages whose short descriptions are concatenated into one extra file.
	prod_urls = [
		"http://www.infoq.com/cn/articles/java7-nio2",
		"http://www.infoq.com/cn/news/2013/03/spring-for-apache-hadoop-1.0",
		"http://www.infoq.com/cn/news/2013/03/VS2012_CTP4",
		"http://www.infoq.com/cn/news/2013/03/jquery-github-plugin-repo",
		"http://www.infoq.com/cn/news/2013/03/Microsoft-Azure-Amazon-Android",
		"http://www.infoq.com/cn/news/2013/03/node.js-0.10-released",
		"http://www.infoq.com/cn/news/2013/03/lienzo",
		"http://www.infoq.com/cn/news/2013/03/eclipse-rap-2-released",
		"http://www.infoq.com/cn/news/2013/03/jquery-mobile-1.3.0",
		"http://www.infoq.com/cn/news/2013/03/red-gate-simply-web-dev",
		"http://www.infoq.com/cn/news/2013/03/google-go-1-1",
		"http://www.infoq.com/cn/news/2013/03/anime-scrum-primer",
		"http://www.infoq.com/cn/news/2013/03/code-review-visual-studio-2012",
		"http://www.infoq.com/cn/news/2013/03/hudson-eclipse",
		"http://www.infoq.com/cn/news/2013/03/chartjs-v.0.1-released",
		"http://www.infoq.com/cn/news/2013/03/Grunt-0.4.0-Released",
		"http://www.infoq.com/cn/news/2013/03/Lingual",
		"http://www.infoq.com/cn/news/2013/03/gcc48_released",
		"http://www.infoq.com/cn/news/2013/03/nuodb",
	]

	# Output page skeleton; the {name} placeholders are filled via str.format().
	html_="""<html>
	<body><div>{head}{title}{author}{content}<p><strong>原文链接：<a href="{org}">{org}</a></strong></p>{likes}</div></body>
	</html>"""
	def get_rec(title,path,ids):
		"""Request related links for a page and render them as HTML anchors.

		``ids``, ``title`` and ``path`` are POSTed (with language 'zh') to the
		recommendationlinks endpoint.  The response body is parsed as JSON and
		every entry becomes one ``<a>`` built from its 'url' and 'title' keys;
		the anchors are joined with ``<br/>``.
		"""
		# The function body is indented under the def again; the flattened
		# layout of this copy was not valid Python.
		data=requests.post('http://www.infoq.com/api/recommendationlinks.action',{"topicIds":ids,
			"title":title,"contentPath":path,"language":'zh'
			})
		import json
		_d=json.loads(data.content)
		return "<br/>".join('<a href="%s">%s</a>'%(x['url'],x['title']) for x in _d)

	def get_article_content(url):
		"""Download an article page and return it wrapped in the html_ template.

		The page's ``.box-content-5`` element is used as the body after a fixed
		list of child selectors has been removed from it and ``/resource``
		image sources have been made absolute.  Related links from get_rec()
		are appended when the page source contains a topicIds entry and the
		request succeeds; otherwise that slot is left empty.  The template's
		author slot is passed as an empty string.

		:param url: absolute URL of the article page.
		:returns: the filled-in html_ template as a string.
		"""
		data=requests.get(url)
		d = pq(data.content)
		pattern_pre='{"topicIds" : "(.*)", "title"'
		match_pre = re.findall(pattern_pre, data.content)
		title=d('title').text()
		print(title)
		box=d('.box-content-5')
		for selector in ('a[rel="permalink"]','script','.h1-r','.comments-header',
				'span','.forum-list-tree','.content-sidebar-wide','.comments-sort',
				'.addthis_toolbox','.comments','.tags2','.box-bottom'):
			box=box.remove(selector)
		content=box.outerHtml().replace('src="/resource','src="http://www.infoq.com/resource')
		# get_rec is meant to receive a site-relative path (e.g. /news/2013/03/...).
		# The URLs used by this script carry the www. host, which the previous
		# 'http://infoq.com/cn' replacement never matched, so accept both forms.
		path=re.sub(r'^http://(?:www\.)?infoq\.com/cn','',url)
		likes=''
		if match_pre:
			try:
				likes=get_rec(title,path,match_pre[0])
			except Exception:
				# Related links are optional: keep the page without them, but do
				# not swallow KeyboardInterrupt/SystemExit as a bare except would.
				likes=''
		return html_.format(head='',title=title,author='',content=content,likes=likes,org=url)

	def get_news_content(url):
		"""Download a news page and return it wrapped in the html_ template.

		The body is the page's ``#newsContent`` element with the permalink,
		``.h1-r`` and ``#relatedContent`` nodes removed, ``/resource`` image
		sources made absolute and ``&#13;`` entities stripped.  The author
		block is taken from ``.box-content-5>.info`` when present.  Related
		links from get_rec() are appended under a heading when the page source
		contains a topicIds entry and the request succeeds.

		:param url: absolute URL of the news page.
		:returns: the filled-in html_ template as a string.
		"""
		data=requests.get(url)
		d = pq(data.content)
		pattern_pre='{"topicIds" : "(.*)", "title"'
		match_pre = re.findall(pattern_pre, data.content)
		title=d('title').text()
		print(title)
		# outerHtml() gives None for an empty selection; fall back to no author
		# block instead of failing on None.replace().
		author=d('.box-content-5>.info').outerHtml() or ''
		author=author.replace('href="/cn/author','href="http://infoq.com/cn/author')
		news=d('#newsContent').remove('a[rel="permalink"]').remove('.h1-r').remove('#relatedContent')
		# The entity to strip is the carriage-return reference '&#13;'.  This
		# copy of the script showed it as a bare whitespace character, which
		# would have removed every space from the content.
		content=news.outerHtml().replace('src="/resource','src="http://www.infoq.com/resource').replace('&#13;','')
		# get_rec is meant to receive a site-relative path (e.g. /news/2013/03/...).
		# The URLs used by this script carry the www. host, which the previous
		# 'http://infoq.com/cn' replacement never matched, so accept both forms.
		path=re.sub(r'^http://(?:www\.)?infoq\.com/cn','',url)
		likes=''
		if match_pre:
			try:
				likes="<strong>相关内容</strong><br/>"+get_rec(title,path,match_pre[0])
			except Exception:
				# Related links are optional: keep the page without them, but do
				# not swallow KeyboardInterrupt/SystemExit as a bare except would.
				likes=''
		return html_.format(head='',title=title,author=author,content=content,likes=likes,org=url)
	def get_prod_content(url):
		"""Download a page and return a short summary wrapped in the html_ template.

		The body is the value of the page's ``<meta name="description">`` tag
		and the author block is ``.box-content-5>.info``; either one is
		rendered as an empty string when the page does not provide it.

		:param url: absolute URL of the page.
		:returns: the filled-in html_ template as a string.
		"""
		data=requests.get(url)
		d = pq(data.content)
		# attr() gives None when the tag is missing, which str.format would
		# render as the literal text "None".
		description=d('meta[name="description"]').attr('content') or ''

		title=d('title').text()
		print(title)
		# outerHtml() gives None for an empty selection; avoid None.replace().
		author=d('.box-content-5>.info').outerHtml() or ''
		author=author.replace('href="/cn/author','href="http://infoq.com/cn/author')

		return html_.format(head='',title=title,author=author,content=description,likes="",org=url)

	# Save every page in `urls` to its own numbered file.  The extractor is
	# chosen from the URL; a URL matching neither pattern leaves an empty file.
	# Nesting is restored here; the flattened layout was not valid Python.
	i=0
	for i,x in enumerate(urls):
		with open("/Users/Arthur/Desktop/one/arch/%d.html"%i,'w+') as f:
			if x.find('/news/')>0:
				f.write(get_news_content(x))
			if x.find('/article')>0:
				f.write(get_article_content(x))
	# The combined summary file takes the next free number after the pages.
	i=len(urls)
	with open("/Users/Arthur/Desktop/one/arch/%d.html"%i,'w+') as f:
		f.write(''.join(get_prod_content(x) for x in prod_urls))
		i+=1

	#print get_rec(title="从小型网站到超大规模网站的MySQL参考架构",path="/news/2013/03/MySQL-Reference-Architectures")
	#get_interview_content('http://www.infoq.com/interviews/george-dinwiddie-three-amigos')