Created November 13, 2019 02:59
from bs4 import BeautifulSoup
import requests


def get_section_links(soup):
    """Collect "/section/..." links from the home page."""
    sections = []
    for link in soup.find_all('a')[:20]:  # cap the links scanned to keep the crawl small
        href = link.get('href', '')  # some <a> tags have no href attribute
        print(link.text, href)
        # I noticed the home page has a lot of "/section/" paths, which seems
        # to be where most of the articles live.
        if "/section/" in href:
            sections.append(href)
    return sections
def add_article_hrefs(articles, soup):
    """Add hrefs that look like 2019 article pages to the given set."""
    for lnk in soup.find_all('a')[:20]:  # same cap as above
        href = lnk.get('href', "")
        if len(href) > 0 and "/2019/" in href and ".html" in href:
            articles.add(href)
def main():
    r = requests.get("https://www.nytimes.com")
    soup = BeautifulSoup(r.text, "html.parser")
    articles = set()
    for section_url in get_section_links(soup):
        # Section links can be relative ("/section/world"), so resolve them
        # against the site root before fetching.
        if section_url.startswith("/"):
            section_url = f"https://www.nytimes.com{section_url}"
        section_soup = BeautifulSoup(requests.get(section_url).text, "html.parser")
        add_article_hrefs(articles, section_soup)
    # Article hrefs may already be absolute; only prefix the relative ones.
    article_urls = [
        article_url if article_url.startswith("http") else f"https://www.nytimes.com{article_url}"
        for article_url in articles
    ]
    bodies = []
    for article in article_urls:
        try:
            r = requests.get(article)
            soup = BeautifulSoup(r.text, "html.parser")
            # Keep only <section> elements that carry a "name" attribute,
            # which is where the article text lives.
            article_content = "".join(s.text for s in soup.find_all("section") if "name" in s.attrs)
            print("content: ", article_content)
            bodies.append(article_content)
        except Exception as e:
            print(e)
            continue
    print(article_urls)
    print(bodies)


if __name__ == "__main__":
    main()
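One optional hardening tweak, not part of the original gist: bare requests.get() calls send a generic User-Agent and wait indefinitely on a stalled connection, and nytimes.com may serve different markup to non-browser clients. A minimal sketch using a shared requests.Session; the User-Agent string below is a placeholder, not a required value:

import requests

# A single Session reuses connections across the many section/article fetches
# and lets the headers be set once. (Placeholder User-Agent string.)
session = requests.Session()
session.headers.update({"User-Agent": "Mozilla/5.0 (article-scraper demo)"})

# timeout= keeps one slow response from hanging the whole crawl.
response = session.get("https://www.nytimes.com", timeout=10)
print(response.status_code)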