Template for making multithreaded calls with the requests library.
from threading import Thread
from Queue import Queue

import requests


def worker():
    while True:
        url = input_q.get()  # grab a task (a url to scrape) from the input_q
        res = requests.get(url)
        output_q.put((url, res.content))  # save just the content or the entire response object to process later
        input_q.task_done()  # tells the input_q that the task we grabbed above (the url) has been processed

input_q = Queue()
output_q = Queue()
# Set the number of workers. You might want to do some timing experiments with this number. Just limit the number of
# urls you put in the input_q below to 1000 and time this script (or dump it all in a cell and time the cell)
# to see what the best number is. I usually find somewhere between 50-100 is good, but you can go higher.
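# A minimal sketch of that timing experiment, assuming a hypothetical shorter list
# named test_urls (not defined in this gist): substitute it for list_of_urls below
# and wrap the run in time.time() calls, e.g.
#
#     import time
#     start = time.time()
#     # ... put test_urls on input_q and call input_q.join() as below ...
#     print 'Elapsed with %d workers: %.1fs' % (num_of_worker_threads, time.time() - start)
#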
num_of_worker_threads = 50
for i in xrange(num_of_worker_threads):
    t = Thread(target=worker)
    t.daemon = True  # Note that these daemon threads will continue to exist until the Python interpreter exits.
                     # If you are running this in a notebook, you have to restart the kernel to get rid of them
                     # (the second version below handles shutdown differently and is better suited to notebooks).
    t.start()

# Load your list of urls here
for url in list_of_urls:
    input_q.put(url)

input_q.join() # block the program until all tasks are done
results = [output_q.get(False) for _ in xrange(len(list_of_urls))]
# Results is now a list of tuples of (url, res.content) and you can save them however you want.
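
# For example, a minimal sketch of one way to save the results: write each page's content
# to its own numbered file (the naming scheme is just an illustration, not part of the
# original gist).
for i, (url, content) in enumerate(results):
    with open('page_%05d.html' % i, 'wb') as f:  # one file per scraped url
        f.write(content)
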
"""This version is better suited for Jupyter notebooks and long running scripts
because it does not use daemon threads and kills the threads when it is done.
"""
from threading import Thread
from Queue import Queue

import requests


def worker():
    while True:
        # Variables to help with retries
        retry_count = 0
        status_code = 0
        url = input_q.get()  # grab a task (a url to scrape) from the input_q
        if url is None:  # Did we get an end condition (a sentinel)?
            break
        while retry_count <= 5 and status_code != 200:  # retry until we get a good response, giving up after 5 retries
            retry_count += 1
            res = requests.get(url)
            status_code = res.status_code
        output_q.put((url, res.content))  # save just the content or the entire response object to process later
        input_q.task_done()  # tells the input_q that the task we grabbed above (the url) has been processed

input_q = Queue()
output_q = Queue()
# Set the number of workers. You might want to do some timing experiments with this number. Just limit the number of
# urls you put in the input_q below to 1000 and time this script (or dump it all in a cell and time the cell)
# to see what the best number is. I usually find somewhere between 50-100 is good, but you can go higher.
num_of_worker_threads = 50
# Load your list of urls here
for url in list_of_urls:
    input_q.put(url)

for i in xrange(num_of_worker_threads):
    t = Thread(target=worker)
    t.start()  # no daemon=True this time

input_q.join() # block the program until all tasks are done
# When the program gets here, all the tasks have been processed, so let's shut down the threads.
for _ in xrange(num_of_worker_threads):
    input_q.put(None)  # add an end condition (a None sentinel) for each thread we created above

# Combine the results
results = [output_q.get(False) for _ in xrange(len(list_of_urls))]
# Results is now a list of tuples of (url, res.content) and you can save them however you want.
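
# If you also want to wait for the worker threads themselves to exit (not just for the
# tasks to finish), keep references to them when you create them and join them after the
# sentinels are queued. A sketch, using a hypothetical `threads` list that this gist does
# not define:
#
#     threads = []
#     for i in xrange(num_of_worker_threads):
#         t = Thread(target=worker)
#         t.start()
#         threads.append(t)
#     ...
#     for t in threads:
#         t.join()  # each worker returns after it pulls a None sentinel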