Last active
August 29, 2015 14:05
-
-
Save toddlerya/35ba2b49042fbfe5c406 to your computer and use it in GitHub Desktop.
A Python spider that downloads the pictures of Taobao models.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#-*- coding: utf-8 -*- | |
#Author: toddlerya | |
#History: 2014/8/10 | |
import urllib,re | |
# ``raw_input`` only exists on Python 2; fall back to ``input`` so the prompt
# also works when the script is run under Python 3.
try:
    _read_line = raw_input
except NameError:
    _read_line = input

# Ask which listing page to scrape.  Surrounding whitespace is stripped so a
# stray space typed at the prompt does not end up inside the query string.
num = _read_line("你要下载哪一页的淘宝小妹?\n输入一个页码: ").strip()
# Listing endpoint; the page number is appended as the ``page`` parameter.
web = "http://mm.taobao.com/json/request_top_list.htm?type=0&page="
url = web + str(num)
def getModelHomePage(url):
    """Fetch the listing page at *url* and return a model home page link.

    The page is scanned for ``href="`` attributes whose target ends in
    ``com/<digits>.htm``; the ``href="`` prefix is removed from the result.

    NOTE(review): the listing this was reviewed from had lost its
    indentation.  This assumes the original ``return`` followed the loop, so
    the LAST matching link is returned -- confirm against the real file.

    Args:
        url: Address of the listing page to download.

    Returns:
        str: The last matching link on the page, without the ``href="`` prefix.

    Raises:
        ValueError: If the page contains no matching link (previously this
            surfaced as an unbound local variable error).
    """
    # Function-scope imports keep the block usable on Python 2 and Python 3.
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2
    from contextlib import closing

    # Close the response deterministically instead of leaking the socket.
    with closing(urlopen(url)) as response:
        page = response.read()
    # Python 3 hands back bytes; latin-1 maps every byte to one character, so
    # decoding cannot fail and the str pattern below can be applied.
    if not isinstance(page, str):
        page = page.decode("latin-1")

    prefix = 'href="'
    links = re.findall(r'href=".*?com/\d+\.htm', page)
    if not links:
        raise ValueError("no model home page link found at %s" % url)
    return links[-1][len(prefix):]
def getImgUrl():
    """Collect the JPEG ``src`` matches from the selected model's home page.

    The home page address is obtained from ``getModelHomePage`` using the
    module-level ``url``; that page is downloaded and scanned with the
    pattern ``src="\\w+?.*?\\.jpg``.

    Returns:
        list of str: Every match, each still starting with the ``src="``
        text (the caller strips it).
    """
    # Function-scope imports keep the block usable on Python 2 and Python 3.
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2
    from contextlib import closing

    home_url = getModelHomePage(url)
    # Close the response deterministically instead of leaking the socket.
    with closing(urlopen(home_url)) as response:
        home = response.read()
    # Python 3 hands back bytes; latin-1 decoding cannot fail and lets the
    # str pattern below be applied.
    if not isinstance(home, str):
        home = home.decode("latin-1")
    return re.findall(r'src="\w+?.*?\.jpg', home)
def getImage():
    """Download every image reported by ``getImgUrl`` into the ``pic`` folder.

    Each match has its ``src="`` prefix removed and is saved as
    ``mm.jpg<n>.jpg`` inside ``pic``, where ``n`` counts up from 0.  A
    progress line is printed after each download.

    Returns:
        None
    """
    # Function-scope imports keep the block usable on Python 2 and Python 3.
    import os
    try:
        from urllib.request import urlretrieve  # Python 3
    except ImportError:
        from urllib import urlretrieve  # Python 2

    prefix = 'src="'
    target_dir = "pic"
    # The download target is opened for writing directly, so the directory
    # has to exist beforehand; create it instead of failing on a fresh run.
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)

    for n, match in enumerate(getImgUrl()):
        image = match[len(prefix):]
        # os.path.join yields the same "pic\\..." path on Windows as before
        # while also producing a real sub-directory path elsewhere.
        urlretrieve(image, os.path.join(target_dir, "mm.jpg" + str(n) + ".jpg"))
        print("正在下载第%s张" % n)
# Script entry point: download all images, then report completion
# (the message reads "Download finished!").
getImage()
print("下载完毕!")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment