Crawling & Scraping LOL!!
Created November 13, 2010 13:44
# Goddam I love crawling and scraping.
# I used this script to scrape a particular Twitter user directory, querying it to acquire
# a list of Twitter users from Ireland. Great fun.
# It was a small crawl (~1200 pages?) so I don't think they'd get het up about it.
# Their search results in this case came via POST. If they hadn't, I could have used Python's
# urllib2 library instead, which lets you pass GET parameters to a search query.
# ...Looking at this now, it was a ridiculously simple crawl... but I can't seem to find
# anything slightly tougher :(
#
# Pseudocode:
#   if readlines()[n] contains <div class="result_thumbnail">:
#       scrape readlines()[n+1]
#       dismiss the first y chars
#       dismiss everything after the "
#       what remains == the Twitter screen_name
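The comments above mention that this directory's results came via POST, and that GET parameters would otherwise have done the job. A minimal sketch of the difference using today's urllib names (Python 3's urllib.parse/urllib.request; in 2010 this was urllib/urllib2). The parameter names are hypothetical, since the real directory's fields aren't shown in the gist:

```python
import urllib.parse, urllib.request

# Hypothetical query parameters -- the real directory's fields aren't in the gist
params = urllib.parse.urlencode({"country": "ireland", "page": 2})

# GET: the parameters ride in the URL itself
get_url = "http://SPOILER.com/twitter/spoiler/?" + params

# POST: passing a data argument makes urlopen issue a POST instead of a GET
req = urllib.request.Request("http://SPOILER.com/twitter/spoiler/",
                             data=params.encode())

print(get_url)           # http://SPOILER.com/twitter/spoiler/?country=ireland&page=2
print(req.get_method())  # POST
```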
import urllib, time  # os and string were imported originally but never used

filename = "F:\\somedir\\_IREUSERS.txt"

for x in range(1000):  # 1000 pages of results
    if x == 0:
        thispagestr = "http://SPOILER.com/twitter/spoiler/"
    else:
        thispagestr = "http://SPOILER.com/twitter/spoiler/" + str(x + 1)
    sock = urllib.urlopen(thispagestr)  # Python 2; urllib.request.urlopen in Python 3
    htmlsource = sock.readlines()
    sock.close()
    file1 = open(filename, 'a')
    for n in range(len(htmlsource)):
        if htmlsource[n].rfind('<div class="result_thumbnail">') != -1:
            line = htmlsource[n + 1]    # the screen_name sits on the next line
            line1 = line[31:]           # drop the fixed-length markup prefix
            lastchar = line1.find('"')  # the name ends at the closing quote
            name = line1[:lastchar]
            file1.write(name + "\n")    # newline so names don't run together
    file1.close()
    time.sleep(30)  # pause between pages so as not to hammer the server
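Slicing a fixed 31-character prefix off the following line works but breaks the moment the markup shifts. A more robust sketch captures the screen_name with a regex in one pass; the HTML snippet here is hypothetical, since the real page's markup isn't in the gist:

```python
import re

# Hypothetical page snippet -- the real directory's markup isn't in the gist
html = '''<div class="result_thumbnail">
<a href="http://twitter.com/someuser">profile</a>'''

# Match the anchor that follows each thumbnail div and capture the screen_name,
# i.e. everything between the last slash of the profile URL and the closing quote
names = re.findall(
    r'<div class="result_thumbnail">\s*<a href="http://twitter\.com/([^"]+)"',
    html)
print(names)  # ['someuser']
```

This also tolerates extra whitespace between the div and the anchor, which the fixed-offset version would not.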