Skip to content

Instantly share code, notes, and snippets.

@warborn
Created March 7, 2018 22:04
Show Gist options
  • Save warborn/be67af3438e328c30c233583bd3ff237 to your computer and use it in GitHub Desktop.
Save warborn/be67af3438e328c30c233583bd3ff237 to your computer and use it in GitHub Desktop.
Web crawler that repeatedly visits the first link of each Wikipedia article until it reaches a target article
import time
import urllib
import urllib.parse

import requests
from bs4 import BeautifulSoup
start_url = "https://en.wikipedia.org/wiki/Special:Random"
target_url = "https://en.wikipedia.org/wiki/Philosophy"
def continue_crawl(search_history, target_url, max_steps = 25):
current_url = search_history[-1]
if current_url == target_url:
print("We've found the target article!")
return False
if len(search_history) > max_steps:
print("The search has gone on suspiciously long, aborting search!")
return False
if current_url in search_history[:-1]:
print("We've arrived at an article we've already seen, aborting search!")
print("The article was %s" % current_url)
return False
return True
def find_first_link(url):
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
article_link = None
content_div = soup.find(id="mw-content-text").find(class_="mw-parser-output")
for element in content_div.find_all('p', recursive=False):
link = element.find('a', recursive=False)
if link:
article_link = link.get('href')
break
if not article_link:
return
first_link = urllib.parse.urljoin('https://en.wikipedia.org/', article_link)
return first_link
article_chain = [start_url]
while continue_crawl(article_chain, target_url):
print(article_chain[-1])
# download html of last article in article_chain
# find the first link in that html
first_link = find_first_link(article_chain[-1])
# add the first link to article_chain
article_chain.append(first_link)
# delay for about two seconds
time.sleep(2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment