Created
October 6, 2013 19:00
-
-
Save iainelder/6857708 to your computer and use it in GitHub Desktop.
requests_cache is a library that caches responses returned by the requests library. This gist shows you how to dump the cache content to the shell. You can take this as a base for saving responses to a file for archiving. Using a cache allows you to implement the archiving logic separately from the fetching logic.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Demo script: install a requests_cache cache behind `requests`, fill it with
# a few httpbin.org responses, then dump the cached entries to the shell in
# three different ways.
from os.path import basename
from pprint import pprint

try:
    from urllib.parse import urlparse  # Python 3 location
except ImportError:
    from urlparse import urlparse  # Python 2 location

import requests
import requests_cache

# By default requests uses urllib3
# <class 'requests.packages.urllib3.response.HTTPResponse'>
resp = requests.get('http://httpbin.org/user-agent')
print(type(resp.raw))

# requests_cache monkey-patches requests with a global cache
# Default backend is sqlite. Also supports memory, mongodb, and redis.
requests_cache.install_cache('demo_cache')

# Now requests is backed by a '_Store' object.
# <class 'requests_cache.backends.base._Store'>
resp = requests.get('http://httpbin.org/user-agent')
print(type(resp.raw))

# /delay/1 takes 1 second to respond
# Without a cache this would take 10 seconds
# With a cache it takes 1 (warm) or 2 (cold)
# Existing code can use the cache without modification.
# A plain loop is used because only the side effect (filling the cache)
# matters; the responses themselves are discarded.
for _ in range(10):
    requests.get('http://httpbin.org/delay/1')

# Populate the cache with more cool stuff
requests.get('http://httpbin.org/user-agent')
requests.get('http://httpbin.org/cookies')

# Process responses asynchronously using the cache API.
cache = requests_cache.core.get_cache()

# Dump cache for known URL (GET request)
# Depends on undocumented function _url_to_key
response, timestamp = cache.get_response_and_time(
    cache._url_to_key('http://httpbin.org/user-agent'))

# get_response_and_time returns a normal requests Response object
pprint({'timestamp': timestamp,
        'url': response.url,
        'filename': basename(urlparse(response.url).path),
        'content': response.content,
        'type': type(response)})

# Dump complete cache (not just GET requests)
# cache.responses.values() yields (_Store entry, timestamp) pairs
# restore_response returns a requests Response
pprint([{'timestamp': timestamp,
         'url': entry.url,
         'filename': basename(urlparse(entry.url).path),
         'content': cache.restore_response(entry).content,
         'type': type(entry)
         } for entry, timestamp in cache.responses.values()])

# Another way to dump the cache, using keys.
# Requires two list comprehensions to call get_response_and_time
pprint([{'timestamp': timestamp,
         'url': response.url,
         'filename': basename(urlparse(response.url).path),
         'content': response.content,
         'type': type(response)
         }
        for response, timestamp in [cache.get_response_and_time(key)
                                    for key in cache.responses.keys()]
        ])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment