Wikipedia zh-TW Article Names Spider
import scrapy


class ArticlesListSpider(scrapy.Spider):
    """Walk Special:AllPages on Chinese Wikipedia and yield article names."""

    name = "articles_list"
    page_counter = 0

    def start_requests(self):
        urls = [
            # Percent-encoded form of Special:所有页面 ("All pages").
            r'https://zh.wikipedia.org/wiki/Special:%E6%89%80%E6%9C%89%E9%A1%B5%E9%9D%A2'
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        names = set()
        next_page = None
        # Collect every article name in the current chunk of the listing.
        for unordered_list in response.css('ul.mw-allpages-chunk'):
            for list_item in unordered_list.css('li'):
                names.add(list_item.css('::text').extract_first())
        self.page_counter += 1
        # Yield a list rather than a set so feed exporters (e.g. JSON)
        # can serialize the item.
        yield {self.page_counter: list(names)}
        # Follow the navigation link whose text contains "下一页(" ("next page").
        nav = response.css('div.mw-allpages-nav a')
        for link in nav:
            if '下一页(' in link.css('::text').extract_first(default=''):
                next_page = link.css('::attr(href)').extract_first()
                self.log('next page text is: ' + link.css('::text').extract_first())
                break
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
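Running `scrapy crawl articles_list -o names.json` from the project root starts the crawl and exports the yielded items. The sketch below does the same thing from Python; it assumes it is run inside the wiki_articles_list project so that the project settings are found and the name 'articles_list' resolves to the spider above.

# A minimal sketch of launching the crawl programmatically (assumes the
# wiki_articles_list project layout; otherwise pass the spider class directly).
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('articles_list')  # look the spider up by its name
process.start()                 # blocks until the crawl finishes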
# -*- coding: utf-8 -*-

# Scrapy settings for wiki_articles_list project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'wiki_articles_list'

SPIDER_MODULES = ['wiki_articles_list.spiders']
NEWSPIDER_MODULE = 'wiki_articles_list.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'wiki_articles_list (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Export feeds as UTF-8 so Chinese article names stay human-readable
FEED_EXPORT_ENCODING = 'utf-8'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'wiki_articles_list.middlewares.WikiArticlesListSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'wiki_articles_list.middlewares.WikiArticlesListDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'wiki_articles_list.pipelines.WikiArticlesListPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
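With ROBOTSTXT_OBEY off and no delay configured, the spider hits Wikipedia as fast as the default 16 concurrent requests allow. Below is a hedged sketch of politer values one might uncomment or add; the numbers and the contact URL are illustrative, not part of the original gist.

# Illustrative politeness settings (not in the original gist).
DOWNLOAD_DELAY = 1             # pause between requests to the same domain
AUTOTHROTTLE_ENABLED = True    # adapt the delay to observed server latency
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
USER_AGENT = 'wiki_articles_list (+https://example.com/contact)'  # placeholder URL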