Created
May 9, 2015 04:08
-
-
Save demonkit/69f387745a908287419d to your computer and use it in GitHub Desktop.
Get images from http://www.douban.com/photos/album/145486923/.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
Get images from http://www.douban.com/photos/album/145486923/. | |
""" | |
__author__ = 'demonkit' | |
import re | |
import threading | |
import time | |
from Queue import Empty, Queue | |
import requests | |
from lxml import etree | |
# Module-level progress flags, both initially False.
# PUBLISH_FINISHED is consulted by ImgSaver when it finds the queue empty;
# CONSUME_FINISHED is polled by the script entry point before exiting.
PUBLISH_FINISHED = CONSUME_FINISHED = False
class Pulibsher(object):
    """Producer-side wrapper that pushes items onto a shared queue."""

    def __init__(self, queue):
        """Remember the queue that items will be published to."""
        self.queue = queue

    def put(self, item):
        """Append ``item`` to the underlying queue."""
        target = self.queue
        target.put(item)
class Consumer(object):
    """Consumer-side wrapper that pulls items off a shared queue."""

    def __init__(self, queue):
        """Remember the queue that items will be taken from."""
        self.queue = queue

    def get(self, block=True):
        """Return the next item from the queue.

        ``block`` is forwarded to the queue's own ``get``; whatever that
        call raises on an empty queue propagates to the caller.
        """
        item = self.queue.get(block=block)
        return item
class ImgMaker(Pulibsher, threading.Thread):
    """Producer thread: scan the album listing pages and queue photo ids.

    Each listing page is fetched from ``KICK_START_URL`` with a ``start``
    offset, the anchors selected by ``XPATH_PATTERN`` are inspected, and the
    numeric id captured by ``IMG_RE_PATTERN`` is put on the queue.
    """

    KICK_START_URL = "http://www.douban.com/photos/album/145486923/"
    param = 'start'
    XPATH_PATTERN = '//*[@id="content"]/div[3]/div[1]/div[2]/div/a'
    # Raw string so that ``\d`` reaches the regex engine untouched.
    IMG_RE_PATTERN = re.compile(r'http://www.douban.com/photos/photo/(\d+)/')
    PIC_NUM_PER_PAGE = 18

    def __init__(self, queue, page_no):
        """Create the producer.

        Args:
            queue: queue that receives the photo ids (as strings).
            page_no: index of the last listing page; offsets
                ``0, 18, ..., page_no * 18`` are requested.
        """
        threading.Thread.__init__(self)
        Pulibsher.__init__(self, queue)
        self.name = self.__class__.__name__
        self.page_no = page_no

    def run(self):
        """Fetch every listing page and publish the photo ids found on it.

        The module-level ``PUBLISH_FINISHED`` flag is set to True when the
        method leaves, whether normally or through an exception.
        """
        # Without this declaration the assignment below would only create a
        # local variable and the module flag would stay False forever.
        global PUBLISH_FINISHED
        try:
            last_offset = self.page_no * self.PIC_NUM_PER_PAGE
            for page in range(0, last_offset + 1, self.PIC_NUM_PER_PAGE):
                url = self.KICK_START_URL + "?%s=%s" % (self.param, str(page))
                content = requests.get(url).text
                tree = etree.HTML(content)
                for ele in tree.xpath(self.XPATH_PATTERN):
                    img_url = ele.get('href')
                    if img_url is None:
                        continue
                    matcher = self.IMG_RE_PATTERN.match(img_url)
                    if matcher:
                        self.put(matcher.group(1))
        finally:
            # Always signal completion so that a failed request cannot leave
            # the consumer waiting on a flag that never flips.
            PUBLISH_FINISHED = True
class ImgSaver(Consumer, threading.Thread):
    """Consumer thread: download each queued photo id into ``FOLDER``."""

    REAL_IMG_URL_PATTERN = 'http://img3.douban.com/view/photo/photo/public/p%s.jpg'
    FOLDER = 'emoji'

    def __init__(self, queue):
        """Create the consumer reading photo ids from ``queue``."""
        threading.Thread.__init__(self)
        Consumer.__init__(self, queue)
        self.name = self.__class__.__name__

    def run(self):
        """Drain the queue, saving every image as ``FOLDER/<id>.jpg``.

        The loop ends once the queue is empty and ``PUBLISH_FINISHED`` was
        already True before that emptiness was observed.  The module-level
        ``CONSUME_FINISHED`` flag is set to True when the method leaves,
        whether normally or through an exception.
        """
        import os

        # Without this declaration the assignment in ``finally`` would only
        # create a local variable and the module flag would never change.
        global CONSUME_FINISHED
        try:
            # open() below fails if the target directory does not exist.
            if not os.path.isdir(self.FOLDER):
                os.makedirs(self.FOLDER)
            while True:
                # Sample the flag BEFORE polling the queue: if publishing was
                # already over and the queue is empty, nothing more can
                # arrive.  Checking afterwards could miss an item queued
                # between the failed get() and the flag check.
                publishing_done = PUBLISH_FINISHED
                try:
                    img_no = self.get(block=False)
                except Empty:
                    if publishing_done:
                        break
                    time.sleep(1)
                    continue
                real_img_url = self.REAL_IMG_URL_PATTERN % img_no
                resp = requests.get(real_img_url, stream=True)
                with open("%s/%s.jpg" % (self.FOLDER, img_no), 'wb') as fout:
                    fout.write(resp.content)
        finally:
            CONSUME_FINISHED = True
if __name__ == '__main__':
    # One producer scanning listing pages 0..21 and one consumer downloading
    # the images, connected by a shared queue of photo ids.
    queue = Queue()
    publisher = ImgMaker(queue, 21)
    consumer = ImgSaver(queue)
    publisher.start()
    consumer.start()
    # Wait on the consumer thread itself rather than on a module flag, so the
    # script also exits if the thread dies from an unhandled exception.  The
    # short join timeout keeps the main thread responsive to Ctrl-C.
    while consumer.is_alive():
        consumer.join(1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment