Skip to content

Instantly share code, notes, and snippets.

@ebarns
Created November 13, 2019 02:42
Show Gist options
  • Save ebarns/5699cef3eeb9a6660e0f2c1d8028ae29 to your computer and use it in GitHub Desktop.
Save ebarns/5699cef3eeb9a6660e0f2c1d8028ae29 to your computer and use it in GitHub Desktop.
bootyful soupy
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
def get_section_links(soup):
    """Return hrefs of NYT "/section/" pages found on the parsed home page.

    Args:
        soup: a BeautifulSoup document for the nytimes.com home page.

    Returns:
        list[str]: hrefs (typically relative paths like "/section/world").

    Only the first 20 anchor tags are inspected, matching the original
    sampling behavior. Anchors with no href are treated as empty strings
    instead of raising TypeError on the `in` test.
    """
    sections = []
    for link in soup.find_all('a')[:20]:
        # .get('href') returns None for anchors without an href attribute;
        # normalize to "" so the substring test below cannot crash.
        href = link.get('href') or ""
        print(link.text, href)
        # The home page groups most article listings under "/section/..." paths.
        if "/section/" in href:
            sections.append(href)
    return sections
def add_article_hrefs(articles, soup, year="2019"):
    """Add article hrefs found in *soup* to the *articles* set, in place.

    Args:
        articles: set[str] accumulator, mutated in place.
        soup: a BeautifulSoup document for a section page.
        year: year string used to identify article paths; defaults to
            "2019" so existing callers keep the original behavior.

    An article link is recognized by containing "/<year>/" and ".html".
    Only the first 20 anchors are scanned, matching get_section_links.
    """
    token = f"/{year}/"
    for lnk in soup.find_all('a')[:20]:
        href = lnk.get('href', "")
        if href and token in href and ".html" in href:
            articles.add(href)
def main():
    """Scrape the NYT home page's section pages and print absolute article URLs."""
    base_url = "https://www.nytimes.com"
    r = requests.get(base_url)
    # Pass an explicit parser: bare BeautifulSoup(markup) emits a
    # GuessedAtParserWarning and may behave differently across machines.
    soup = BeautifulSoup(r.text, "html.parser")
    articles = set()
    for section_url in get_section_links(soup):
        # Section hrefs are typically relative paths ("/section/world");
        # requests.get would raise MissingSchema on those. urljoin makes
        # them absolute while leaving already-absolute URLs intact.
        section_page = requests.get(urljoin(base_url, section_url))
        soup = BeautifulSoup(section_page.text, "html.parser")
        add_article_hrefs(articles, soup)
    # Article hrefs are site-relative, so prefix the host to make them clickable.
    article_urls = [f"{base_url}{article_url}" for article_url in articles]
    print(article_urls)
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment