Skip to content

Instantly share code, notes, and snippets.

@triangletodd
Created August 14, 2024 02:15
Show Gist options
  • Save triangletodd/2f9606764fb6e4a04d8bfafddb59cfb8 to your computer and use it in GitHub Desktop.
Save triangletodd/2f9606764fb6e4a04d8bfafddb59cfb8 to your computer and use it in GitHub Desktop.
Scrape GoodReads Quotes
#!/usr/bin/env python3
import json
import requests
from bs4 import BeautifulSoup
# Delimiters found in the text of a "quoteText" div, which is expected to read
# <opening quote mark><quote><closing quote mark> <bar> <author>, <title>.
# They are spelled as escapes on purpose: pasted literally they are easily
# corrupted by a wrong file encoding (U+2015 read as CP437 becomes a
# three-character garbage sequence that never matches the decoded page text).
ATTRIBUTION_SEPARATOR = "\u2015"  # HORIZONTAL BAR between quote and author
CURLY_QUOTES = str.maketrans("", "", "\u201c\u201d")  # deletes both curly double quotes

BASE_URL = "https://www.goodreads.com/quotes"
FIRST_PAGE = 1
LAST_PAGE = 100
QUOTE_DIV_CLASS = "quoteText"
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection hangs forever
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
    ),
}


def parse_quote_text(text):
    """Split the text of one quote div into quote, author and title.

    Args:
        text: Text content of a ``quoteText`` div.

    Returns:
        A dict with the keys ``"quote"``, ``"author"`` and ``"title"``
        (``title`` is ``""`` when the attribution contains no comma), or
        ``None`` when ``text`` contains no attribution separator.
    """
    parts = text.split(ATTRIBUTION_SEPARATOR)
    if len(parts) < 2:
        return None

    quote = parts[0].strip().translate(CURLY_QUOTES)
    # Only the first comma separates author from title; titles may contain commas.
    author, _, title = parts[1].partition(",")
    return {
        "quote": quote,
        "author": author.strip(),
        "title": title.strip(),
    }


def fetch_page_quotes(page):
    """Download one listing page and return the quotes parsed from it.

    Args:
        page: 1-based page number of the quotes listing.

    Returns:
        A list of quote dicts as produced by ``parse_quote_text``; empty when
        the server answers with an HTTP error status.

    Raises:
        requests.RequestException: On connection failures or timeouts.
    """
    import sys  # local import: only needed for the warning below

    url = f"{BASE_URL}?page={page}"
    response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    if not response.ok:
        # An error page holds no quotes; report it instead of parsing it silently.
        print(
            f"Skipping page {page}: HTTP {response.status_code}",
            file=sys.stderr,
        )
        return []

    soup = BeautifulSoup(response.content, "html.parser")
    parsed = (
        parse_quote_text(div.get_text(strip=True))
        for div in soup.find_all("div", class_=QUOTE_DIV_CLASS)
    )
    return [entry for entry in parsed if entry is not None]


def main():
    """Scrape every listing page and print all quotes as a JSON array."""
    quotes = []
    for page in range(FIRST_PAGE, LAST_PAGE + 1):
        quotes.extend(fetch_page_quotes(page))
    print(json.dumps(quotes, indent=4))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment