Created November 13, 2019 02:59
from bs4 import BeautifulSoup
import requests


def get_section_links(soup):
    """Collect "/section/..." links from the home page."""
    sections = []
    for link in soup.find_all('a')[:20]:  # cap the links scanned to keep the crawl small
        href = link.get('href', '')  # some <a> tags have no href attribute
        print(link.text, href)
        # I noticed the home page has a lot of "/section/" paths, which seems
        # to be where most of the articles live.
        if "/section/" in href:
            sections.append(href)
    return sections
def add_article_hrefs(articles, soup):
    """Add hrefs that look like 2019 article pages to the given set."""
    for lnk in soup.find_all('a')[:20]:  # same cap as above
        href = lnk.get('href', "")
        if len(href) > 0 and "/2019/" in href and ".html" in href:
            articles.add(href)
def main():
    r = requests.get("https://www.nytimes.com")
    soup = BeautifulSoup(r.text, "html.parser")
    articles = set()
    for section_url in get_section_links(soup):
        # Section links can be relative ("/section/world"), so resolve them
        # against the site root before fetching.
        if section_url.startswith("/"):
            section_url = f"https://www.nytimes.com{section_url}"
        section_soup = BeautifulSoup(requests.get(section_url).text, "html.parser")
        add_article_hrefs(articles, section_soup)
    # Article hrefs may already be absolute; only prefix the relative ones.
    article_urls = [
        article_url if article_url.startswith("http") else f"https://www.nytimes.com{article_url}"
        for article_url in articles
    ]
    bodies = []
    for article in article_urls:
        try:
            r = requests.get(article)
            soup = BeautifulSoup(r.text, "html.parser")
            # Keep only <section> elements that carry a "name" attribute,
            # which is where the article text lives.
            article_content = "".join(s.text for s in soup.find_all("section") if "name" in s.attrs)
            print("content: ", article_content)
            bodies.append(article_content)
        except Exception as e:
            print(e)
            continue
    print(article_urls)
    print(bodies)


if __name__ == "__main__":
    main()
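One optional hardening tweak, not part of the original gist: bare requests.get() calls send a generic User-Agent and wait indefinitely on a stalled connection, and nytimes.com may serve different markup to non-browser clients. A minimal sketch using a shared requests.Session; the User-Agent string below is a placeholder, not a required value:

import requests

# A single Session reuses connections across the many section/article fetches
# and lets the headers be set once. (Placeholder User-Agent string.)
session = requests.Session()
session.headers.update({"User-Agent": "Mozilla/5.0 (article-scraper demo)"})

# timeout= keeps one slow response from hanging the whole crawl.
response = session.get("https://www.nytimes.com", timeout=10)
print(response.status_code)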