Last active
August 29, 2015 14:13
-
-
Save advancedxy/923cb0c71399d96565c7 to your computer and use it in GitHub Desktop.
Code accompanying a post on advancedxy.com
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python2 | |
#coding=utf-8 | |
import os | |
import urllib2 | |
import zipfile | |
import zlib | |
from urlparse import urlparse | |
from time import strftime,localtime | |
def initsites(filename):
    """Parse a site listing into a list of groups (lists of strings).

    The file is a sequence of groups separated by one or more blank lines.
    Every non-blank line becomes one entry of the current group, after two
    decorations are removed:

    * a line starting with '====' loses its first and last four characters
      (the '====name====' header form);
    * a line starting with 'getting into ' loses that prefix.

    filename -- path of the listing to read.

    Returns a list of lists of strings, in file order.  A final group that
    is not followed by a blank line is still returned.
    Raises IOError if the file cannot be opened.
    """
    # read() + splitlines() accepts \n, \r\n and \r line endings without the
    # deprecated 'U' open mode, and never eats a real character when the
    # last line has no trailing newline.
    with open(filename, 'r') as handle:
        lines = handle.read().splitlines()

    groups = []
    current = []
    for line in lines:
        if line == '':
            # A blank line closes the group in progress; extra blank lines
            # (including leading ones) are ignored.
            if current:
                groups.append(current)
                current = []
            continue
        if line[0:4] == '====':
            line = line[4:-4]
        if line[0:13] == 'getting into ':
            line = line[13:]
        current.append(line)
    # Keep the last group even when the file does not end with a blank line.
    if current:
        groups.append(current)
    return groups
def dividesites(sites,f1='/media/virtual/srtp/2items',f2='/media/virtual/srtp/3items',f3='/media/virtual/srtp/4items',f4='/media/virtual/srtp/5items',f5="/media/virtual/srtp/moreitems"):
    """Split site groups into five files according to their length.

    sites -- iterable of groups (lists of strings).
    f1..f5 -- output paths for groups of length 2, 3, 4, 5 and 6-or-more.

    Each group is written as one entry per line followed by a blank line.
    Groups with fewer than two entries are printed with a warning and not
    written anywhere.  All five files are created (truncated) even when
    nothing is written to them.
    Raises IOError if one of the output files cannot be opened.
    """
    handles = []
    try:
        # Open one by one and remember what succeeded, so a failure part-way
        # through closes the earlier files and reports the real I/O error
        # instead of tripping over a name that was never bound.
        for path in (f1, f2, f3, f4, f5):
            handles.append(open(path, 'w'))
        for group in sites:
            text = ''.join([entry + '\n' for entry in group]) + '\n'
            size = len(group)
            if size < 2:
                print(group)
                print('There is something wrong with your sites, please check it!')
                continue
            # Lengths 2..5 map to handles 0..3; anything longer shares the last.
            handles[min(size, 6) - 2].write(text)
    finally:
        for handle in handles:
            handle.close()
def mkdir(dirname):
    """Make sure dirname is a directory.

    A regular file in the way is deleted and replaced by a directory; an
    existing directory (or any other existing non-file path) is left alone.
    """
    if os.path.isfile(dirname):
        # a plain file is occupying the name: clear it out first
        os.remove(dirname)
    elif os.path.exists(dirname):
        return
    os.mkdir(dirname)
def downallxml(sites):
    """Download today's API document of every site into a per-site directory.

    sites -- iterable of groups; entry 1 is the site URL (its host name is
             used as the directory name) and entry 3 is the URL to download.

    For each site the directory is created under the current working
    directory and the document is saved there as '<yymmdd>_all'.  A file
    that already exists is not downloaded again.  A site without an entry 3,
    or whose download fails, is reported on stdout and skipped; no file is
    created for it, so a later run on the same day will retry it.
    """
    for site in sites:
        if len(site) < 4:
            # No API URL was recorded for this site: nothing to download.
            print("no api url recorded for "+str(site)+", skipped")
            continue
        dirname = urlparse(site[1]).netloc
        print(dirname)
        url = site[3]
        print(url)
        currentdir = os.getcwd()
        mkdir(dirname)
        os.chdir(dirname)
        try:
            print(os.getcwd())
            filename = strftime("%y%m%d",localtime())+'_all'
            if not os.path.exists(filename):
                xml = None
                try:
                    sock = urllib2.urlopen(url)
                    try:
                        xml = sock.read()
                    finally:
                        sock.close()
                except Exception:
                    # Best effort: one unreachable site must not stop the
                    # remaining downloads.
                    print("we have some problem with "+url+"! please check it !")
                # Write only after a successful download so a failure does
                # not leave an empty file that blocks the retry.
                if xml is not None:
                    with open(filename,'w') as out:
                        out.write(xml)
        finally:
            # Always return to the starting directory, even on error, so the
            # next site's directory is not created inside this one.
            os.chdir(currentdir)
def zipfiles(sites):
    """Pack today's downloaded documents into '<yymmdd>.zip'.

    sites -- iterable of groups; entry 1 is the site URL whose host name is
             the directory holding that site's '<yymmdd>_all' file.

    The archive is created in the current working directory with deflate
    compression.  A site whose file is missing is reported on stdout and
    skipped.
    """
    # One stamp for the archive name and every member, so a run that crosses
    # midnight cannot look for files under a different date.
    stamp = strftime("%y%m%d",localtime())
    archive = zipfile.ZipFile(stamp+'.zip','w',zipfile.ZIP_DEFLATED)
    try:
        for site in sites:
            path = os.path.join(urlparse(site[1]).netloc, stamp+'_all')
            if not os.path.isfile(path):
                print("no downloaded file at "+path+", skipped")
                continue
            archive.write(path)
    finally:
        # Closing writes the central directory; without it an error would
        # leave an unreadable archive behind.
        archive.close()
if __name__ == '__main__':
    # The listing named 'api' is looked up in the current working directory;
    # every group read from it is downloaded and then archived.
    groups = initsites('api')
    for step in (downallxml, zipfiles):
        step(groups)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2 | |
#coding=utf-8 | |
import os | |
import urllib2 | |
from pyquery import PyQuery as pyq | |
from urlparse import urlparse,urljoin | |
from downxml import initsites as init | |
from downxml import dividesites as divide | |
# Location of the site listing.
file = '/media/virtual/srtp/sites.xml'

# Keywords looked for in link text by analyse(), tried in this order.
pattern = [
    'baidu',
    'hao123',
    '123',
    '百度',
    'api',
]
def analyse(addr):
    """Collect the links on a page whose surrounding text matches a keyword.

    addr -- URL of the page to inspect.

    The keywords in the module-level ``pattern`` list are tried in order.
    For the first keyword found in the text of any element that directly
    contains a link, the href of the first link inside each matching element
    is collected; later keywords are then not tried.  Links are returned
    without duplicates, in the order they were found.  An href is resolved
    against the scheme and host of addr when the result stays on that host,
    otherwise it is returned as found.

    Returns a list of URLs (also printed); the list is empty when the page
    cannot be loaded.
    """
    try:
        # Fetch once; a page that cannot be loaded contributes no links
        # rather than aborting the whole run.
        site = pyq(addr)
    except Exception:
        print("we have some problem with "+addr+"! please check it !")
        return []
    parsed = urlparse(addr)
    baseurl = parsed.scheme+"://"+parsed.netloc
    # The candidate elements do not depend on the keyword: query them once.
    parents = site.find("a").parent()
    hrefs = []
    for keyword in pattern:
        matched = False
        for node in parents:
            wrapped = pyq(node)
            if keyword in wrapped.text().lower().encode("utf-8"):
                matched = True
                href = wrapped.find("a").attr("href")
                # A link without href carries no URL; skip it instead of
                # letting it turn into the bare base URL below.
                if href is not None and href not in hrefs:
                    hrefs.append(href)
        if matched:
            break
    urls = []
    for href in hrefs:
        joined = urljoin(baseurl,href)
        urls.append(joined if baseurl in joined else href)
    print(urls)
    return urls
if __name__ == '__main__':
    # Sort the scraped listing into per-length files, then extend every
    # three-entry group with the links analyse() finds on its third entry.
    sites = init('/media/virtual/srtp/sites.xml')
    divide(sites)
    sites = init('/media/virtual/srtp/3items')
    # The with-block closes the output file even if analysing a site raises.
    with open("/media/virtual/srtp/api",'w') as api:
        for group in sites:
            group.extend(analyse(group[2]))
            # one entry per line, blank line after each group
            api.write(''.join([entry+'\n' for entry in group])+'\n')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python2 | |
import codecs | |
from pyquery import PyQuery as pyq | |
from urlparse import urljoin | |
# Scrape the site index and write one group per site to the listing file:
# a '====name====' header, the site URL, then every link on the site's front
# page whose text contains "api", followed by a blank line.
sock = pyq(url='http://tuan.baidu.com/allsite.php')
site = sock('.site-span')
filepath = "/media/virtual/srtp/sites.xml"
# 'w' creates the file when it is missing and truncates it otherwise, so a
# shorter listing never keeps the tail of an older, longer one.
with open(filepath,'w') as xml:
    for i in site:
        link = pyq(i).find('a').eq(0)
        s1 = '===='+link.text()+'===='
        urls = link.attr("href")
        if not urls:
            # An entry without a link cannot be visited; leave it out.
            continue
        print(s1)
        if urls[-1] == '/':
            urls = urls[:-1]
        xml.write(s1.encode("utf-8")+'\n')
        xml.write(urls.encode("utf-8")+'\n')
        print("getting into %s" %(urls,))
        try:
            anchors = pyq(url=urls,parser='html').find('a')
        except Exception:
            # Best effort: a site that cannot be loaded or parsed keeps its
            # header and URL but gets no API links.
            anchors = []
        for j in anchors:
            anchor = pyq(j)
            href = anchor.attr("href")
            if href is None or "api" not in anchor.text().lower():
                continue
            # urljoin leaves absolute URLs untouched and resolves relative
            # ones against the site URL.
            href = urljoin(urls,href)
            print(href)
            xml.write(href.encode("utf-8")+'\n')
        print('\n')
        xml.write('\n')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment