Wayback news scraper gist
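# The loops below expect `url_list` to already hold Wayback Machine paths of the
# form '<timestamp>/<original-url>'. The gist builds it elsewhere; as a minimal
# sketch (not part of the original code, and assuming the public Wayback CDX API
# with an illustrative target site, date range, and limit), it could be built like this:
import requests as rq  # also imported below; repeated so this sketch stands alone

cdx = rq.get('http://web.archive.org/cdx/search/cdx',
             params={'url': 'nbcnews.com', 'output': 'json',
                     'from': '20200101', 'to': '20201231',
                     'filter': 'statuscode:200', 'collapse': 'digest',
                     'limit': 300})
rows = cdx.json()
# The first row of the JSON response is the header; timestamp and original URL
# are columns 1 and 2
url_list = ['{}/{}'.format(r[1], r[2]) for r in rows[1:]]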
import requests as rq
from bs4 import BeautifulSoup as bs
from time import time, sleep
from random import randint
from warnings import warn

## Creating a loop to scrape from all pages
news_title = []
news_source = []
news_link = []
#pages = [str(i) for i in range(1,371)]
reqs = 0
start_time = time()
for url in url_list:
    full_url = 'https://web.archive.org/web/' + url
    # Open the archived page; requests raises RequestException on failure
    try:
        pg = rq.get(full_url).text
    except rq.exceptions.RequestException as e:
        print('Error: {}'.format(e))
        continue
    # Throttle requests to be polite to the Wayback Machine
    sleep(randint(10, 20))
    reqs += 1
    # Calculate elapsed time and request frequency
    elapsed_time = time() - start_time
    print('Request: {}; Frequency: {} requests/s'.format(reqs, reqs / elapsed_time))
    # Break once the expected number of pages is reached
    if reqs > len(url_list):
        warn('No. of requests was greater than expected')
        break
    # Parse the HTML with BeautifulSoup
    soup = bs(pg, 'html.parser')
    # Find all news containers
    articles = soup.find_all('article')
    # Walk through the news containers to pull out the info
    for article in articles:
        try:
            # find_all returns a list (never None); the title link lives in the
            # second <h2> of each container
            h2s = article.find_all('h2')
            if len(h2s) > 1 and h2s[1].a is not None:
                title = h2s[1].a.text    # news title
                link = h2s[1].a['href']  # individual news article link
            else:
                title = 'N/A'
                link = 'N/A'
            source = 'NBC News'
            # Append the data elements to the lists
            news_title.append(title)
            news_source.append(source)
            news_link.append(link)
        except Exception as e:
            print(e)
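# Not in the original gist: a quick sanity check that the three parallel lists
# stayed aligned before starting the (slow) summary pass below.
assert len(news_title) == len(news_source) == len(news_link)
print('Scraped {} headlines; first few:'.format(len(news_title)))
for t, l in list(zip(news_title, news_link))[:5]:
    print('- {} -> {}'.format(t, l))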
## Creating a loop to scrape summary from links
news_summary = []
summ_link = []
reqs = 0
start_time = time()
for url in news_link:
    # Open the article page
    try:
        pg = rq.get(url).text
    except rq.exceptions.RequestException as e:
        print('Error: {}'.format(e))
        continue
    # Throttle requests
    sleep(randint(10, 20))
    reqs += 1
    # Calculate elapsed time and request frequency
    elapsed_time = time() - start_time
    print('Request: {}; Frequency: {} requests/s'.format(reqs, reqs / elapsed_time))
    # Break once the expected number of links is reached
    if reqs > len(news_link):
        warn('No. of requests was greater than expected')
        break
    # Parse the HTML with BeautifulSoup
    soup = bs(pg, 'html.parser')
    # Find the article container (class name is specific to this NBC News layout)
    article = soup.find('div', attrs={'class': 'article container___2EGEI'})
    try:
        # soup.find returns None when the container is missing, so guard both levels
        if article is not None and article.div is not None:
            summ = article.div.text
        else:
            summ = 'N/A'
        news_summary.append(summ)
        summ_link.append(url)
    except Exception as e:
        print(e)
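# Also not in the original gist: a common next step is joining the scraped pieces
# into one table. This sketch assumes pandas is available; the output filename is
# illustrative. Summaries are matched back to headlines via the article link.
import pandas as pd

df = pd.DataFrame({'title': news_title, 'source': news_source, 'link': news_link})
summ_df = pd.DataFrame({'link': summ_link, 'summary': news_summary})
df = df.merge(summ_df, on='link', how='left')
df.to_csv('nbc_news_scrape.csv', index=False)
print(df.head())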