Skip to content

Instantly share code, notes, and snippets.

@magicshui
Last active December 10, 2015 02:38
Show Gist options
  • Save magicshui/4368772 to your computer and use it in GitHub Desktop.
Save magicshui/4368772 to your computer and use it in GitHub Desktop.
提取网页中的code渲染成图片
#coding: utf-8
import requests
import re
from requests import *
import pyquery
from pyquery import PyQuery as pq
from lxml import etree
import urllib
import sys
# Python 2 only idiom: reload the sys module so that setdefaultencoding can be
# called, then make UTF-8 the process-wide default encoding. The rest of the
# script mixes byte-string literals containing Chinese text with unicode values
# returned by the HTML/JSON parsers and relies on this setting.
# NOTE(review): neither `reload` as a builtin nor `sys.setdefaultencoding`
# exists on Python 3.
reload(sys)
sys.setdefaultencoding('utf8')
# InfoQ (Chinese edition) article and news pages. The driver loop at the bottom
# of the file writes one HTML file per entry, choosing the extractor from the
# "/news/" or "/article" fragment of the URL.
urls = [
    "http://www.infoq.com/cn/articles/interview-sam-haskins",
    "http://www.infoq.com/cn/news/2013/03/MySQL-Reference-Architectures",
    "http://www.infoq.com/cn/news/2013/03/58com-opensourced-argo",
    "http://www.infoq.com/cn/news/2013/03/tencent-devops",
    "http://www.infoq.com/cn/news/2013/03/ruby-to-go",
    "http://www.infoq.com/cn/news/2013/03/linkedin-databus",
    "http://www.infoq.com/cn/articles/full-function-team-about-data",
    "http://www.infoq.com/cn/news/2012/08/facebook-tools-culture",
    "http://www.infoq.com/cn/articles/Automated-Builds",
    "http://www.infoq.com/cn/news/2013/03/k4w-sdk1.7-release",
    "http://www.infoq.com/cn/news/2013/03/Microsoft-Kinect-Samples",
    "http://www.infoq.com/cn/news/2013/03/k4w-roadshow-beijing",
    "http://www.infoq.com/cn/articles/java-memory-model-7",
    "http://www.infoq.com/cn/articles/webkit-for-developers",
    "http://www.infoq.com/cn/articles/java-threadPool",
    "http://www.infoq.com/cn/articles/teamtoy2",
]
# Pages that are only summarised (title, author block and meta description).
# The driver loop at the bottom of the file concatenates all of them into a
# single HTML file.
prod_urls = [
    "http://www.infoq.com/cn/articles/java7-nio2",
    "http://www.infoq.com/cn/news/2013/03/spring-for-apache-hadoop-1.0",
    "http://www.infoq.com/cn/news/2013/03/VS2012_CTP4",
    "http://www.infoq.com/cn/news/2013/03/jquery-github-plugin-repo",
    "http://www.infoq.com/cn/news/2013/03/Microsoft-Azure-Amazon-Android",
    "http://www.infoq.com/cn/news/2013/03/node.js-0.10-released",
    "http://www.infoq.com/cn/news/2013/03/lienzo",
    "http://www.infoq.com/cn/news/2013/03/eclipse-rap-2-released",
    "http://www.infoq.com/cn/news/2013/03/jquery-mobile-1.3.0",
    "http://www.infoq.com/cn/news/2013/03/red-gate-simply-web-dev",
    "http://www.infoq.com/cn/news/2013/03/google-go-1-1",
    "http://www.infoq.com/cn/news/2013/03/anime-scrum-primer",
    "http://www.infoq.com/cn/news/2013/03/code-review-visual-studio-2012",
    "http://www.infoq.com/cn/news/2013/03/hudson-eclipse",
    "http://www.infoq.com/cn/news/2013/03/chartjs-v.0.1-released",
    "http://www.infoq.com/cn/news/2013/03/Grunt-0.4.0-Released",
    "http://www.infoq.com/cn/news/2013/03/Lingual",
    "http://www.infoq.com/cn/news/2013/03/gcc48_released",
    "http://www.infoq.com/cn/news/2013/03/nuodb",
]
# Page template filled in with str.format by the get_*_content functions.
# Placeholders: head, title, author, content, org (original URL, used both as
# link target and link text) and likes (related-links HTML).
html_ = (
    '<html>\n'
    '<body><div>{head}{title}{author}{content}'
    '<p><strong>原文链接:<a href="{org}">{org}</a></strong></p>'
    '{likes}</div></body>\n'
    '</html>'
)
def get_rec(title, path, ids, timeout=10):
    """Fetch InfoQ's recommended links for a page and render them as HTML.

    POSTs the page title, content path and topic ids to the InfoQ
    recommendation endpoint and converts the JSON reply into anchors.

    Args:
        title: Sent as the ``title`` form field.
        path: Sent as the ``contentPath`` form field.
        ids: Sent as the ``topicIds`` form field.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the script forever.

    Returns:
        One ``<a href="...">...</a>`` per entry of the reply, joined with
        ``<br/>``. An empty reply list yields an empty string.

    Raises:
        requests.exceptions.RequestException: On network failure or timeout.
        ValueError: If the response body is not valid JSON.
        KeyError: If an entry lacks the ``url`` or ``title`` key.
    """
    import json  # kept function-local, as in the original

    response = requests.post(
        'http://www.infoq.com/api/recommendationlinks.action',
        {
            "topicIds": ids,
            "title": title,
            "contentPath": path,
            "language": 'zh',
        },
        timeout=timeout
    )
    # The code iterates the decoded reply and reads 'url' and 'title' from
    # each element, i.e. it expects a JSON array of objects.
    entries = json.loads(response.content)
    return "<br/>".join(
        '<a href="%s">%s</a>' % (entry['url'], entry['title'])
        for entry in entries
    )
def get_article_content(url):
    """Download an InfoQ article page and return it as a standalone HTML page.

    The article container (``.box-content-5``) is stripped of navigation,
    comment and sharing widgets, site-relative image sources are made
    absolute, and the result is placed into the ``html_`` template together
    with the page title, a link back to ``url`` and the recommended links.

    Args:
        url: Absolute URL of the article page.

    Returns:
        The filled-in ``html_`` template. The author slot is left empty here;
        the author block is not removed from the article container.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched
            within the timeout.
        AttributeError: If the page has no ``.box-content-5`` element
            (NOTE(review): pyquery's outerHtml() is expected to return None
            for an empty selection - confirm against the installed version).
    """
    # A timeout keeps one stalled download from hanging the whole run.
    response = requests.get(url, timeout=30)
    doc = pq(response.content)

    # Non-greedy capture: stop at the first '", "title"' after topicIds
    # instead of the last one on the line.
    topic_ids = re.findall(r'\{"topicIds" : "(.*?)", "title"', response.content)

    title = doc('title').text()
    print(title)

    # Elements that do not belong in the exported article, removed in the
    # same order as before. remove() mutates the selection in place.
    unwanted = (
        'a[rel="permalink"]',
        'script',
        '.h1-r',
        '.comments-header',
        'span',
        '.forum-list-tree',
        '.content-sidebar-wide',
        '.comments-sort',
        '.addthis_toolbox',
        '.comments',
        '.tags2',
        '.box-bottom',
    )
    body = doc('.box-content-5')
    for selector in unwanted:
        body = body.remove(selector)
    content = body.outerHtml().replace(
        'src="/resource', 'src="http://www.infoq.com/resource')

    # The recommendation call wants a site-relative path (the commented-out
    # example at the end of the file passes "/news/2013/03/..."). The URLs in
    # this file use the "www." host, which the old replace() never matched.
    path = url
    for prefix in ('http://www.infoq.com/cn', 'http://infoq.com/cn'):
        if path.startswith(prefix):
            path = path[len(prefix):]
            break

    # Recommendations are best effort: any failure (no topic ids found,
    # network error, unexpected JSON) simply leaves the section empty.
    try:
        likes = get_rec(title, path, topic_ids[0])
    except Exception:
        likes = ""

    return html_.format(head='', title=title, author='', content=content,
                        likes=likes, org=url)
def get_news_content(url):
    """Download an InfoQ news page and return it as a standalone HTML page.

    Extracts the author block and the ``#newsContent`` element, rewrites
    site-relative author links and image sources to absolute URLs, and fills
    the ``html_`` template with them plus a "related content" section built
    from the recommendation service.

    Args:
        url: Absolute URL of the news page.

    Returns:
        The filled-in ``html_`` template.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched
            within the timeout.
        AttributeError: If the page has no ``#newsContent`` element
            (NOTE(review): pyquery's outerHtml() is expected to return None
            for an empty selection - confirm against the installed version).
    """
    # A timeout keeps one stalled download from hanging the whole run.
    response = requests.get(url, timeout=30)
    doc = pq(response.content)

    # Non-greedy capture: stop at the first '", "title"' after topicIds
    # instead of the last one on the line.
    topic_ids = re.findall(r'\{"topicIds" : "(.*?)", "title"', response.content)

    title = doc('title').text()
    print(title)

    # The author block is optional: fall back to an empty string instead of
    # failing on None.replace(...) when the selector matches nothing.
    author_html = doc('.box-content-5>.info').outerHtml() or ''
    author = author_html.replace(
        'href="/cn/author', 'href="http://infoq.com/cn/author')

    # Must run after the author lookup: remove() mutates the document.
    body = doc('#newsContent')
    for selector in ('a[rel="permalink"]', '.h1-r', '#relatedContent'):
        body = body.remove(selector)
    content = body.outerHtml()
    content = content.replace(
        'src="/resource', 'src="http://www.infoq.com/resource')
    content = content.replace('&#13;', '')

    # The recommendation call wants a site-relative path (the commented-out
    # example at the end of the file passes "/news/2013/03/..."). The URLs in
    # this file use the "www." host, which the old replace() never matched.
    path = url
    for prefix in ('http://www.infoq.com/cn', 'http://infoq.com/cn'):
        if path.startswith(prefix):
            path = path[len(prefix):]
            break

    # Recommendations are best effort: any failure (no topic ids found,
    # network error, unexpected JSON) simply leaves the section empty.
    try:
        likes = get_rec(title, path, topic_ids[0])
        likes = "<strong>相关内容</strong><br/>" + likes
    except Exception:
        likes = ""

    return html_.format(head='', title=title, author=author, content=content,
                        likes=likes, org=url)
def get_prod_content(url):
    """Download an InfoQ page and return a short summary of it as HTML.

    Only the page title, the author block and the ``description`` meta tag
    are used; the article body itself is not extracted.

    Args:
        url: Absolute URL of the page.

    Returns:
        The ``html_`` template filled with title, author block, the meta
        description as content, and a link back to ``url``. The related-links
        slot is left empty.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched
            within the timeout.
    """
    # A timeout keeps one stalled download from hanging the whole run.
    response = requests.get(url, timeout=30)
    doc = pq(response.content)

    # attr() yields None when the meta tag is absent; format() would then
    # embed the literal text "None" in the page.
    description = doc('meta[name="description"]').attr('content') or ''

    title = doc('title').text()
    print(title)

    # The author block is optional: fall back to an empty string instead of
    # failing on None.replace(...) when the selector matches nothing.
    author_html = doc('.box-content-5>.info').outerHtml() or ''
    author = author_html.replace(
        'href="/cn/author', 'href="http://infoq.com/cn/author')

    return html_.format(head='', title=title, author=author,
                        content=description, likes="", org=url)
# Driver: write one numbered HTML file per entry of `urls`, then a single
# extra file (next number) holding the summaries of all `prod_urls`.
for i, x in enumerate(urls):
    # find() > 0 means "present, and not at position 0"; the two checks are
    # independent, so a URL matching both gets both renderings written.
    is_news = x.find('/news/') > 0
    is_article = x.find('/article') > 0
    with open("/Users/Arthur/Desktop/one/arch/%d.html" % i, 'w+') as f:
        if is_news:
            f.write(get_news_content(x))
        if is_article:
            f.write(get_article_content(x))

# The combined product file takes the index right after the last article.
i = len(urls)
with open("/Users/Arthur/Desktop/one/arch/%d.html" % i, 'w+') as f:
    z = ''.join([get_prod_content(x) for x in prod_urls])
    f.write(z)
i += 1
#print get_rec(title="从小型网站到超大规模网站的MySQL参考架构",path="/news/2013/03/MySQL-Reference-Architectures")
#get_interview_content('http://www.infoq.com/interviews/george-dinwiddie-three-amigos')
@magicshui
Copy link
Author

原有的re的正则方式会内存泄露……

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment