Skip to content

Instantly share code, notes, and snippets.

@ebarns
Created November 13, 2019 02:42
Show Gist options
  • Save ebarns/5699cef3eeb9a6660e0f2c1d8028ae29 to your computer and use it in GitHub Desktop.
Save ebarns/5699cef3eeb9a6660e0f2c1d8028ae29 to your computer and use it in GitHub Desktop.
bootyful soupy
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
def get_section_links(soup):
    """Return hrefs of NYT "/section/" pages found on the parsed home page.

    Args:
        soup: a BeautifulSoup document for the nytimes.com home page.

    Returns:
        list[str]: hrefs (typically relative paths like "/section/world").

    Only the first 20 anchor tags are inspected, matching the original
    sampling behavior. Anchors with no href are treated as empty strings
    instead of raising TypeError on the `in` test.
    """
    sections = []
    for link in soup.find_all('a')[:20]:
        # .get('href') returns None for anchors without an href attribute;
        # normalize to "" so the substring test below cannot crash.
        href = link.get('href') or ""
        print(link.text, href)
        # The home page groups most article listings under "/section/..." paths.
        if "/section/" in href:
            sections.append(href)
    return sections
def add_article_hrefs(articles, soup, year="2019"):
    """Add article hrefs found in *soup* to the *articles* set, in place.

    Args:
        articles: set[str] accumulator, mutated in place.
        soup: a BeautifulSoup document for a section page.
        year: year string used to identify article paths; defaults to
            "2019" so existing callers keep the original behavior.

    An article link is recognized by containing "/<year>/" and ".html".
    Only the first 20 anchors are scanned, matching get_section_links.
    """
    token = f"/{year}/"
    for lnk in soup.find_all('a')[:20]:
        href = lnk.get('href', "")
        if href and token in href and ".html" in href:
            articles.add(href)
def main():
    """Scrape the NYT home page's section pages and print absolute article URLs."""
    base_url = "https://www.nytimes.com"
    r = requests.get(base_url)
    # Pass an explicit parser: bare BeautifulSoup(markup) emits a
    # GuessedAtParserWarning and may behave differently across machines.
    soup = BeautifulSoup(r.text, "html.parser")
    articles = set()
    for section_url in get_section_links(soup):
        # Section hrefs are typically relative paths ("/section/world");
        # requests.get would raise MissingSchema on those. urljoin makes
        # them absolute while leaving already-absolute URLs intact.
        section_page = requests.get(urljoin(base_url, section_url))
        soup = BeautifulSoup(section_page.text, "html.parser")
        add_article_hrefs(articles, soup)
    # Article hrefs are site-relative, so prefix the host to make them clickable.
    article_urls = [f"{base_url}{article_url}" for article_url in articles]
    print(article_urls)
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment