cfilipov · November 6, 2017 04:11
diff --git a/crawl_dijkstra_archive.py b/crawl_dijkstra_archive.py
 from bs4 import BeautifulSoup
 import urllib2

 def _ewd_filename(ewdnum, title):
 	"""Build the local PDF filename for one archive entry.

 	The digits of ``ewdnum`` are zero-padded to four places and followed by
 	any letters it contains (e.g. "28a" becomes "0028a"). The result is
 	prefixed with "EWD", joined to ``title`` with a space, stripped of every
 	character that is not a letter, digit, space, parenthesis or hyphen, and
 	given a ".pdf" extension.
 	"""
 	digits = "".join([c for c in ewdnum if c.isdigit()])
 	letters = "".join([c for c in ewdnum if c.isalpha()])
 	# When ewdnum is purely numeric, letters is empty, so one expression
 	# covers both the plain and the suffixed case.
 	ewdname = "EWD" + digits.zfill(4) + letters + " " + title
 	kept = [c for c in ewdname if c.isalpha() or c.isdigit() or c in " ()-"]
 	return "".join(kept) + ".pdf"

 def crawl(url):
 	"""Download every PDF linked from one index page into the current directory.

 	The page at ``url`` is fetched and parsed; each row after the first in its
 	fourth table is expected to hold the EWD number in the bold text of its
 	first cell and a link to the PDF in its third cell. The link's href is
 	resolved against the module-level ``baseUrl``.

 	Raises IndexError if the page or a row does not have that layout, and
 	whatever urllib2 / file I/O raise on network or disk errors.
 	"""
 	response = urllib2.urlopen(url)
 	try:
 		html = response.read()
 	finally:
 		response.close()
 	soup = BeautifulSoup(html)
 	rows = soup.findAll("table")[3].findAll("tr")
 	# The first row is skipped (presumably a header row -- confirm against the page).
 	for row in rows[1:]:
 		cols = row.findAll("td")
 		ahref = cols[2].findAll("a")[0]
 		filename = _ewd_filename(cols[0].b.string, ahref.contents[0])
 		pdfUrl = baseUrl + ahref["href"]
 		pdfFile = urllib2.urlopen(pdfUrl)
 		# Single-argument parenthesised form prints identically on Python 2 and 3.
 		print("Saving: " + filename)
 		# Read the whole download before creating the file, so a failed
 		# transfer does not leave an empty or truncated PDF behind.
 		try:
 			pdfData = pdfFile.read()
 		finally:
 			pdfFile.close()
 		with open(filename, "wb") as output:
 			output.write(pdfData)

 # Root URL of the archive; crawl() also reads this global to resolve PDF links.
 baseUrl = "http://www.cs.utexas.edu/~EWD/"
 startPage = "index00xx.html"

 # Fetch and parse the start page.
 html = urllib2.urlopen(baseUrl + startPage).read()
 soup = BeautifulSoup(html)

 # Walk down to the third <p> inside the second row of the second table and
 # collect every element there that carries an href attribute.
 secondTable = soup.findAll("table")[1]
 secondRow = secondTable.findAll("tr")[1]
 thirdPara = secondRow.findAll("p")[2]
 links = thirdPara.findAll(href=True)

 # Each href is appended to baseUrl and the resulting page is crawled in turn.
 for indexLink in links:
 	crawl(baseUrl + indexLink["href"])
	from bs4 import BeautifulSoup
	import urllib2

	def _ewd_filename(ewdnum, title):
		"""Build the local PDF filename for one archive entry.

		The digits of ``ewdnum`` are zero-padded to four places and followed by
		any letters it contains (e.g. "28a" becomes "0028a"). The result is
		prefixed with "EWD", joined to ``title`` with a space, stripped of every
		character that is not a letter, digit, space, parenthesis or hyphen, and
		given a ".pdf" extension.
		"""
		digits = "".join([c for c in ewdnum if c.isdigit()])
		letters = "".join([c for c in ewdnum if c.isalpha()])
		# When ewdnum is purely numeric, letters is empty, so one expression
		# covers both the plain and the suffixed case.
		ewdname = "EWD" + digits.zfill(4) + letters + " " + title
		kept = [c for c in ewdname if c.isalpha() or c.isdigit() or c in " ()-"]
		return "".join(kept) + ".pdf"

	def crawl(url):
		"""Download every PDF linked from one index page into the current directory.

		The page at ``url`` is fetched and parsed; each row after the first in its
		fourth table is expected to hold the EWD number in the bold text of its
		first cell and a link to the PDF in its third cell. The link's href is
		resolved against the module-level ``baseUrl``.

		Raises IndexError if the page or a row does not have that layout, and
		whatever urllib2 / file I/O raise on network or disk errors.
		"""
		response = urllib2.urlopen(url)
		try:
			html = response.read()
		finally:
			response.close()
		soup = BeautifulSoup(html)
		rows = soup.findAll("table")[3].findAll("tr")
		# The first row is skipped (presumably a header row -- confirm against the page).
		for row in rows[1:]:
			cols = row.findAll("td")
			ahref = cols[2].findAll("a")[0]
			filename = _ewd_filename(cols[0].b.string, ahref.contents[0])
			pdfUrl = baseUrl + ahref["href"]
			pdfFile = urllib2.urlopen(pdfUrl)
			# Single-argument parenthesised form prints identically on Python 2 and 3.
			print("Saving: " + filename)
			# Read the whole download before creating the file, so a failed
			# transfer does not leave an empty or truncated PDF behind.
			try:
				pdfData = pdfFile.read()
			finally:
				pdfFile.close()
			with open(filename, "wb") as output:
				output.write(pdfData)

	# Root URL of the archive; crawl() also reads this global to resolve PDF links.
	baseUrl = "http://www.cs.utexas.edu/~EWD/"
	startPage = "index00xx.html"

	# Fetch and parse the start page.
	html = urllib2.urlopen(baseUrl + startPage).read()
	soup = BeautifulSoup(html)

	# Walk down to the third <p> inside the second row of the second table and
	# collect every element there that carries an href attribute.
	secondTable = soup.findAll("table")[1]
	secondRow = secondTable.findAll("tr")[1]
	thirdPara = secondRow.findAll("p")[2]
	links = thirdPara.findAll(href=True)

	# Each href is appended to baseUrl and the resulting page is crawled in turn.
	for indexLink in links:
		crawl(baseUrl + indexLink["href"])