
@rande
Created August 5, 2010 13:19
Create a cache from a file containing URLs (multi-threaded Python script)
import threading, time, urllib
import io


class UrlDownloadThread(threading.Thread):
    """Fetch a single URL in its own thread to warm the cache."""

    def __init__(self, url):
        self.url = url
        self.error = None
        threading.Thread.__init__(self)

    def run(self):
        response = urllib.urlopen(self.url)
        if response.getcode() != 200:
            self.error = 'http code is not 200 : %s' % self.url
        response.close()


class CacheCrawler:
    def __init__(self, number, filename):
        self.number = number  # size of the thread pool
        self.pool = {}
        self.stream = io.open(filename)
        self.count = 0        # urls handed out so far

    def start(self):
        print "starting threads"
        ## init threads: fill the pool with one download thread per slot
        for x in range(0, self.number):
            url = self.get_next_url()
            if url:
                self.pool[x] = UrlDownloadThread(url)
                self.pool[x].start()
        ## loop across threads and restart finished ones with a new url
        try:
            while True:
                time.sleep(0.1)
                for x in self.pool:
                    if not isinstance(self.pool[x], UrlDownloadThread):
                        # slot was never filled, nothing to do
                        continue
                    if self.pool[x].isAlive():
                        # thread is still downloading, nothing to do
                        continue
                    url = self.get_next_url()
                    if not url:
                        raise StopIteration()
                    if self.count % 100 == 0:
                        print "urls downloaded : %s" % self.count
                    self.pool[x] = UrlDownloadThread(url)
                    self.pool[x].start()
        except StopIteration:
            print 'no more url to fetch'

    def get_next_url(self):
        line = self.stream.readline()
        if line:
            self.count += 1
            return line.strip()  # drop the trailing newline before urlopen


CacheCrawler(5, "urls/recipes.txt").start()
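
Since get_next_url() reads the stream one line at a time, the input file is assumed to hold one URL per line. A hypothetical urls/recipes.txt (the path used in the script above) might look like:

http://example.com/recipes/1
http://example.com/recipes/2
http://example.com/recipes/3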