Skip to content

Instantly share code, notes, and snippets.

@warborn
Created March 7, 2018 22:04
Show Gist options
  • Save warborn/be67af3438e328c30c233583bd3ff237 to your computer and use it in GitHub Desktop.
Save warborn/be67af3438e328c30c233583bd3ff237 to your computer and use it in GitHub Desktop.
Web crawler that repeatedly visits the first link of each Wikipedia article until it reaches a target article
import time
import urllib
import urllib.parse

import requests
from bs4 import BeautifulSoup
start_url = "https://en.wikipedia.org/wiki/Special:Random"
target_url = "https://en.wikipedia.org/wiki/Philosophy"
def continue_crawl(search_history, target_url, max_steps = 25):
current_url = search_history[-1]
if current_url == target_url:
print("We've found the target article!")
return False
if len(search_history) > max_steps:
print("The search has gone on suspiciously long, aborting search!")
return False
if current_url in search_history[:-1]:
print("We've arrived at an article we've already seen, aborting search!")
print("The article was %s" % current_url)
return False
return True
def find_first_link(url):
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
article_link = None
content_div = soup.find(id="mw-content-text").find(class_="mw-parser-output")
for element in content_div.find_all('p', recursive=False):
link = element.find('a', recursive=False)
if link:
article_link = link.get('href')
break
if not article_link:
return
first_link = urllib.parse.urljoin('https://en.wikipedia.org/', article_link)
return first_link
article_chain = [start_url]
while continue_crawl(article_chain, target_url):
print(article_chain[-1])
# download html of last article in article_chain
# find the first link in that html
first_link = find_first_link(article_chain[-1])
# add the first link to article_chain
article_chain.append(first_link)
# delay for about two seconds
time.sleep(2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment