Template for making multithreaded calls with the requests library.
from threading import Thread
from Queue import Queue

import requests


def worker():
    while True:
        url = input_q.get()  # grab a task (a url to scrape) from the input_q
        res = requests.get(url)
        output_q.put((url, res.content))  # save just the content or the entire response object to process later
        input_q.task_done()  # tells the input_q that the task we grabbed above (the url) has been processed

input_q = Queue()
output_q = Queue()
# Set the number of workers. You might want to do some timing experiments with this number. Just limit the number of
# urls you put in the input_q below to 1000 and time this script (or dump it all in a cell and time the cell)
# to see what the best number is. I usually find somewhere between 50-100 is good, but you can go higher.
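# A minimal sketch of that timing experiment, assuming a hypothetical shorter list
# named test_urls (not defined in this gist): substitute it for list_of_urls below
# and wrap the run in time.time() calls, e.g.
#
#     import time
#     start = time.time()
#     # ... put test_urls on input_q and call input_q.join() as below ...
#     print 'Elapsed with %d workers: %.1fs' % (num_of_worker_threads, time.time() - start)
#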
num_of_worker_threads = 50
for i in xrange(num_of_worker_threads):
    t = Thread(target=worker)
    t.daemon = True  # Note that these daemon threads will continue to exist until the Python interpreter exits.
                     # If you are running this in a notebook, you have to restart the kernel to get rid of them
                     # (the second version below handles shutdown differently and is better suited to notebooks).
    t.start()

# Load your list of urls here
for url in list_of_urls:
    input_q.put(url)

input_q.join() # block the program until all tasks are done
results = [output_q.get(False) for _ in xrange(len(list_of_urls))]
# Results is now a list of tuples of (url, res.content) and you can save them however you want.
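
# For example, a minimal sketch of one way to save the results: write each page's content
# to its own numbered file (the naming scheme is just an illustration, not part of the
# original gist).
for i, (url, content) in enumerate(results):
    with open('page_%05d.html' % i, 'wb') as f:  # one file per scraped url
        f.write(content)
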
"""This version is better suited for Jupyter notebooks and long running scripts
because it does not use daemon threads and kills the threads when it is done.
"""
from threading import Thread
from Queue import Queue

import requests


def worker():
    while True:
        # Variables to help with retries
        retry_count = 0
        status_code = 0
        url = input_q.get()  # grab a task (a url to scrape) from the input_q
        if url is None:  # Did we get an end condition (a sentinel)?
            break
        while retry_count <= 5 and status_code != 200:  # retry until we get a good response, giving up after 5 retries
            retry_count += 1
            res = requests.get(url)
            status_code = res.status_code
        output_q.put((url, res.content))  # save just the content or the entire response object to process later
        input_q.task_done()  # tells the input_q that the task we grabbed above (the url) has been processed

input_q = Queue()
output_q = Queue()
# Set the number of workers. You might want to do some timing experiments with this number. Just limit the number of
# urls you put in the input_q below to 1000 and time this script (or dump it all in a cell and time the cell)
# to see what the best number is. I usually find somewhere between 50-100 is good, but you can go higher.
num_of_worker_threads = 50
# Load your list of urls here
for url in list_of_urls:
    input_q.put(url)

for i in xrange(num_of_worker_threads):
    t = Thread(target=worker)
    t.start()  # no daemon=True this time

input_q.join() # block the program until all tasks are done
# When the program gets here, all the tasks have been processed, so let's shut down the threads.
for _ in xrange(num_of_worker_threads):
    input_q.put(None)  # add an end condition (a None sentinel) for each thread we created above

# Combine the results
results = [output_q.get(False) for _ in xrange(len(list_of_urls))]
# Results is now a list of tuples of (url, res.content) and you can save them however you want.
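
# If you also want to wait for the worker threads themselves to exit (not just for the
# tasks to finish), keep references to them when you create them and join them after the
# sentinels are queued. A sketch, using a hypothetical `threads` list that this gist does
# not define:
#
#     threads = []
#     for i in xrange(num_of_worker_threads):
#         t = Thread(target=worker)
#         t.start()
#         threads.append(t)
#     ...
#     for t in threads:
#         t.join()  # each worker returns after it pulls a None sentinel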