Skip to content

Instantly share code, notes, and snippets.

@thoslin
Created October 21, 2015 03:19
Show Gist options
  • Save thoslin/4cb3918e7762de33cdc2 to your computer and use it in GitHub Desktop.
Save thoslin/4cb3918e7762de33cdc2 to your computer and use it in GitHub Desktop.
pyspider douban group
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2015-10-21 10:27:30
# Project: douban_zufang
import re
from pyspider.libs.base_handler import *
class Handler(BaseHandler):
    """pyspider handler that crawls the Douban "shanghaizufang" group.

    Flow: ``on_start`` queues the first discussion-list page, ``index_page``
    queues every topic link found on a list page plus the next list page,
    and ``detail_page`` extracts the fields of a single topic.
    """

    crawl_config = {
    }

    # Topic URLs look like http(s)://www.douban.com/group/topic/<digits>/.
    # Raw string so that ``\d`` and ``\.`` reach the regex engine instead of
    # being treated as (invalid) string escapes; compiled once rather than
    # on every link. ``https`` is accepted as well because the link selector
    # in ``index_page`` matches both schemes.
    _TOPIC_URL_RE = re.compile(r"https?://www\.douban\.com/group/topic/\d+/")

    @every(minutes=30)
    def on_start(self):
        """Queue the first page of the group's discussion list."""
        self.crawl(
            'http://www.douban.com/group/shanghaizufang/discussion?start=0',
            callback=self.index_page,
        )

    # 365 * 24 * 60 * 60 is one year expressed in seconds.
    # NOTE(review): this ``age`` presumably also applies to the list page
    # queued by ``on_start`` -- confirm against pyspider's scheduler.
    @config(age=365 * 24 * 60 * 60)
    def index_page(self, response):
        """Queue topic pages linked from a list page, then the next page.

        Args:
            response: pyspider response for a discussion-list page.
        """
        for each in response.doc('a[href^="http"]').items():
            href = each.attr.href
            # Anchored at the start only (re.match), so trailing query
            # strings or fragments after the topic id are still accepted.
            if href and self._TOPIC_URL_RE.match(href):
                self.crawl(href, callback=self.detail_page)

        # The last list page has no ".next a" element, in which case the
        # attribute lookup yields None; do not hand that to crawl().
        next_href = response.doc(".next a").attr.href
        if next_href:
            self.crawl(next_href, callback=self.index_page)

    @config(priority=2)
    def detail_page(self, response):
        """Extract the result record for one topic page.

        Args:
            response: pyspider response for a topic page.

        Returns:
            dict with the page ``url``, the ``title`` text, the text of the
            ``.color-green`` element as ``datetime``, and the text of
            ``#link-report .topic-content`` as ``content``.
        """
        return {
            "url": response.url,
            "title": response.doc('title').text(),
            "datetime": response.doc('.color-green').text(),
            "content": response.doc('#link-report .topic-content').text(),
        }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment