# URL Harvester written by Andy Bricker
# http://andybricker.com
# andy at andybricker.com

# Requirements:
# Python 2.7 (has not been tested on later versions)
# Beautiful Soup library for Python (http://www.crummy.com/software/BeautifulSoup/)

# Usage:
# python urlHarvest.py books stores -n 50 -l myLogFile.txt

# Google dorks are supported:
# python urlHarvest.py inurl:.com.eu/foobar.php intext:I like computers -n 50 -l /home/me/logs/myLogFile.txt

# The script will crawl Google, collecting the specified number of results for a given
# search. It then builds a URL array while preventing duplicate entries. Finally, a
# line-by-line logfile is generated containing the results.

# Like the script? Donate
# LiteCoin: LcFU5upJyS7FsEeB5sb25vFTS69dH6fugr
# DogeCoin: D7SPH1LYJn9Co4GCZePH3JvzR5RkZEPi5M

from optparse import OptionParser

options = OptionParser(usage='%prog search [options]',
                       description='Python URL Harvester by Andy Bricker. http://AndyBricker.Com')
options.add_option('-n', '--number', type='int', default=5,
                   help='Number of search results to parse (default: 5)')
options.add_option('-l', '--log_file', type='string', default='urlHarvest.txt',
                   help='Name of the output logfile. Paths accepted. (default: urlHarvest.txt)')


def addLog(target, opts):
    # Append one harvested domain per line to the logfile.
    log_file = open(opts.log_file, "a")
    log_file.write(target + '\n')
    log_file.close()


def main():
    print ""
    print "======================================================="
    print "Checking arguments."
    opts, args = options.parse_args()

    if len(args) < 1:
        options.print_help()
        exit()

    domainList = []
    print "Beginning Google Search of " + str(opts.number) + " records. Please be patient."

    # Check Google against our search to build the URL list.
    from google import search
    from urlparse import urlparse

    for url in search(args[0], stop=opts.number):
        # Reduce each result URL to its domain (netloc) portion.
        parsed_uri = urlparse(url)
        domain = '{uri.netloc}'.format(uri=parsed_uri)
        domainList.append(domain)

    print "Search Complete, filtering results."
    # Drop duplicate domains.
    domainList = list(set(domainList))

    print "Building log file."
    for target in domainList:
        addLog(target, opts)

    print "Harvest complete. Log data written to " + opts.log_file
    print ""
    print "======================================================="


if __name__ == '__main__':
    main()
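The heart of the script is the loop in main(): each search result is reduced to its domain (netloc) and duplicates are dropped with set() before logging. A minimal sketch of that step, assuming Python 2.7 and using made-up example URLs:

# Sketch of the domain extraction and de-duplication the script performs.
# The URLs below are illustrative only.
from urlparse import urlparse

urls = [
    'http://example.com/books/page1.html',
    'http://example.com/books/page2.html',
    'http://shop.example.org/stores',
]
domains = list(set('{uri.netloc}'.format(uri=urlparse(u)) for u in urls))
print domains  # e.g. ['example.com', 'shop.example.org'] (the duplicate domain is removed)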
@demogorgonz
pip install google
Checking arguments.
Beginning Google Search of 500 records. Please be patient.
Traceback (most recent call last):
File "urlHarvest.py", line 69, in <module>
main()
File "urlHarvest.py", line 51, in main
for url in search(args[0], stop=opts.number):
File "/usr/local/lib/python2.7/dist-packages/google/__init__.py", line 269, in search
html = get_page(url)
File "/usr/local/lib/python2.7/dist-packages/google/__init__.py", line 89, in get_page
response = urlopen(request)
File "/usr/lib/python2.7/urllib2.py", line 127, in urlopen
return _opener.open(url, data, timeout)
File "/usr/lib/python2.7/urllib2.py", line 410, in open
response = meth(req, response)
File "/usr/lib/python2.7/urllib2.py", line 523, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib/python2.7/urllib2.py", line 442, in error
result = self._call_chain(*args)
File "/usr/lib/python2.7/urllib2.py", line 382, in _call_chain
result = func(*args)
File "/usr/lib/python2.7/urllib2.py", line 629, in http_error_302
return self.parent.open(new, timeout=req.timeout)
File "/usr/lib/python2.7/urllib2.py", line 410, in open
response = meth(req, response)
File "/usr/lib/python2.7/urllib2.py", line 523, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib/python2.7/urllib2.py", line 448, in error
return self._call_chain(*args)
File "/usr/lib/python2.7/urllib2.py", line 382, in _call_chain
result = func(*args)
File "/usr/lib/python2.7/urllib2.py", line 531, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 503: Service Unavailable
Checking arguments.
Beginning Google Search of 50 records. Please be patient.
Traceback (most recent call last):
File "urlHarvest.py", line 69, in <module>
main()
File "urlHarvest.py", line 50, in main
from google import search
ImportError: No module named google
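Both failures above are environment issues rather than bugs in the script: the ImportError means the google package from PyPI is not present (the pip install google line above addresses that), and the HTTP 503 typically means Google is throttling the scraper when many results are pulled in one run. One possible workaround, assuming the installed version of the google package supports the pause argument to search(), is to request fewer results and wait longer between result pages:

# Hypothetical mitigation for the 503 above: smaller runs, longer pauses.
# The pause value is a guess; Google may still throttle aggressive scraping.
from google import search

for url in search('books stores', stop=50, pause=10.0):
    print url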