scrapes a website for all the URLs below it
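With the defaults in main() below, running the script crawls https://neurodata.io/data/ to a depth of 2 and writes a tab-separated links_data.txt with one row per link found: link, status, depth, and head (the page the link was found on).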
''' scrapes a website for urls '''

import requests
from bs4 import BeautifulSoup


class URLTest:
    ''' record of one tested link: the URL, its HTTP status (or a note such
    as 'not followed'), the crawl depth, and the page it was found on '''

    def __init__(self, link, status_code, current_depth, head):
        self.link = link
        self.status_code = status_code
        self.current_depth = current_depth
        self.head = head


def write_to_csv(filename, link, status, cur_depth, head):
    ''' actually tab separated '''
    with open(filename, 'a') as f:
        f.write('{}\t{}\t{}\t{}\n'.format(link, status, cur_depth, head))
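
# note: write_to_csv opens the output file in append mode, so main()
# truncates the file first by writing the header row in 'w' mode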


def url_search(link, depth_limit=3, current_depth=0, head='',
               search_internal=(), except_strings=()):
    '''
    recursively searches for links in an html tree

    head is the page the link was found on; search_internal lists substrings
    a link must contain to be followed (empty follows everything);
    except_strings lists substrings that cause a link to be skipped entirely
    '''
    print(link, current_depth)
    # excluding links with particular strings in them
    if except_strings:
        ee = [e for e in except_strings if e in link]
        if ee:
            return
    # if this link was already tested and followed, reuse that status result
    u_prev = [u for u in URLS_TESTED if u.link == link]
    u_followed = [u for u in u_prev if u.status_code != 'not followed']
    if u_followed:
        depths = [u.current_depth for u in u_followed]
        s_codes = [u.status_code for u in u_followed]
        min_depth = min(depths)
        idx = depths.index(min_depth)
        # only re-crawl when we are now shallower than every previous visit,
        # since a shallower start can reach links the depth limit cut off
        if current_depth >= min_depth:
            URLS_TESTED.append(
                URLTest(link, s_codes[idx], current_depth, head))
            # URLS_TESTED.append(
            #     URLTest(link, 'prev_found', current_depth, head))
            return
    # if search_internal was given, only follow links matching one of its
    # entries; record the rest as 'not followed' without fetching them
    if search_internal:
        ee = [e for e in search_internal if e in link]
        if not ee:
            URLS_TESTED.append(
                URLTest(link, 'not followed', current_depth, head))
            return
    try:
        r = requests.get(link, timeout=2)
        if r.status_code != 200:
            # retry with a trailing slash, which fixes some 404s
            r = requests.get(link + '/', timeout=2)
    except Exception as e:
        URLS_TESTED.append(
            URLTest(link, 'Error: ' + str(e), current_depth, head))
        print(e)
        return
    URLS_TESTED.append(
        URLTest(link, r.status_code, current_depth, head))
    if r.status_code != 200:
        return
    if current_depth + 1 > depth_limit:
        return
    # now going to iterate through its links:
    soup = BeautifulSoup(r.text, "lxml")
    atags = soup.find_all('a')
    links = set(a.get('href') for a in atags)
    # skip anchors without an href attribute (e.g. onhover links)
    links = [x for x in links if x is not None]
    for href in links:
        # make relative links absolute using the current page's
        # scheme (link_array[0]) and host (link_array[2])
        if 'http' not in href:
            href = href.lstrip('/')
            link_array = link.split('/')
            href = '{}//{}/{}'.format(link_array[0], link_array[2], href)
        url_search(href, depth_limit=depth_limit,
                   current_depth=current_depth + 1, head=link,
                   search_internal=search_internal,
                   except_strings=except_strings)
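
# note: the relative-link handling above treats any href not containing
# 'http' as relative; urllib.parse.urljoin(link, href) would handle
# protocol-relative or otherwise odd hrefs more robustly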


# global list of URLTest records accumulated during the crawl
URLS_TESTED = []


def main():
    depth_limit = 2
    url_type = 'data'
    base_url = 'https://neurodata.io/{}/'.format(url_type)
    # tab delimited text file (some of the links have commas)
    filename = 'links_{}.txt'.format(url_type).replace('/', '_')
    # skip links that contain any of these strings
    except_strings = ['mendeley', 'mailto', '.pdf', '.tar.gz', '.zip']
    # search_internal = ['neurodata.io', 'openconnecto.me', 'github.com']
    search_internal = []  # an empty list will search all links
    # print the links found on the base page
    r = requests.get(base_url)
    soup = BeautifulSoup(r.content, "lxml")
    for link in soup.find_all('a'):
        print(link.get('href'))
    print('')
    print('')
    # iterate through all the links, recording the good and bad ones
    url_search(base_url, depth_limit=depth_limit,
               search_internal=search_internal, except_strings=except_strings)
    URLS_TESTED.sort(key=lambda x: (
        x.current_depth, x.head, str(x.status_code), x.link))
    with open(filename, 'w') as f:
        f.write('link\tstatus\tdepth\thead\n')
    for u in URLS_TESTED:
        write_to_csv(filename, u.link, u.status_code, u.current_depth, u.head)


if __name__ == '__main__':
    main()
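
To point the crawler elsewhere, edit the variables at the top of main(): base_url sets the starting page, depth_limit caps the recursion, and populating search_internal (e.g. ['neurodata.io']) keeps the crawl on-site while still recording, but not fetching, outbound links.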