Last active
March 15, 2023 13:51
-
-
Save MarioVilas/de6f088b6fe11f765971411ab226d84f to your computer and use it in GitHub Desktop.
Goodreads quote downloader script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# Fortune cookie generator based on Goodreads quotes | |
# -------------------------------------------------- | |
# | |
# Use the following commands to download quotes from a specific author (for example, Mark Twain): | |
# | |
# python3 goodreads-quote-downloader.py -e3 https://www.goodreads.com/author/quotes/1244.Mark_Twain > marktwain | |
# strfile -c % marktwain marktwain.dat | |
# sudo cp marktwain marktwain.dat /usr/share/games/fortunes/ | |
import argparse | |
import os | |
import os.path | |
import sys | |
from http.cookiejar import LWPCookieJar | |
from urllib.request import Request, urlopen | |
from urllib.parse import unquote | |
# Exported symbols. | |
__all__ = ['GoodreadsQuoteDownloader'] | |
# We need to do this very early so the following code can show the help message on error.
ARGUMENT_PARSER = argparse.ArgumentParser()
# nargs="?" makes the positional URL optional; without it argparse requires the
# argument and the default documented in the help text can never take effect.
ARGUMENT_PARSER.add_argument("url", metavar="URL", nargs="?", default="https://www.goodreads.com/quotes", help="Link to quotes page (default: https://www.goodreads.com/quotes)")
ARGUMENT_PARSER.add_argument("--start", "-s", metavar="N", type=int, default=0, help="start page (default: 0)")
# An end page of 0 means "no limit".
ARGUMENT_PARSER.add_argument("--end", "-e", metavar="N", type=int, default=0, help="end page (default: no limit)")
ARGUMENT_PARSER.add_argument("--pause", "-p", metavar="N", type=int, default=0, help="pause between HTTP requests in seconds (default: 0)")
ARGUMENT_PARSER.add_argument("--user-agent", metavar="STR", help="custom User-agent header for HTTP requests")
# BeautifulSoup is the only third-party dependency. If it cannot be imported,
# report the problem through the argument parser (which prints usage and exits).
try:
    from bs4 import BeautifulSoup
except ImportError:
    ARGUMENT_PARSER.error(
        "missing dependency: BeautifulSoup\n\n"
        "Run the following command to install:\n"
        "\tpip3 install bs4"
    )
else:
    # Flag recording that the import succeeded.
    is_bs4 = True
class GoodreadsQuoteDownloader:
    """Download quotes from Goodreads quote listing pages.

    Cookies are kept in an LWP-format ``.goodreads-cookie`` file inside the
    home folder; they are sent with every request and saved after every
    response.
    """

    # Value of the User-Agent header. A falsy value disables the header.
    USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)'

    def __init__(self, home_folder=None):
        """Create the downloader and load any previously saved cookies.

        :param home_folder: folder where the cookie file is stored. When not
            given, the HOME, USERPROFILE and USERHOME environment variables
            are tried in that order, falling back to the current folder.
        """
        if not home_folder:
            home_folder = (
                os.getenv('HOME')
                or os.getenv('USERPROFILE')  # Standard variable on Windows.
                or os.getenv('USERHOME')     # Kept for backward compatibility.
                or '.'                       # Use the current folder on error.
            )
        self.cookie_jar = LWPCookieJar(os.path.join(home_folder, '.goodreads-cookie'))
        try:
            self.cookie_jar.load()
        except Exception:
            # Best effort: a missing or unreadable cookie file is not fatal,
            # we simply start with an empty cookie jar.
            pass

    def get_quotes_from_page(self, url, page=0):
        """Fetch one page of quotes and return the quote texts.

        :param url: link to the quotes page; any query string is discarded.
        :param page: page number; values above 0 are sent as ``?page=N``,
            otherwise no page parameter is sent.
        :returns: list of strings, one per ``div`` with class ``quoteText``.
        :raises: whatever ``urlopen`` raises when the request fails.
        """

        # Prepare the URL.
        url = url.partition("?")[0]
        if page > 0:
            url = url + "?page=" + str(page)

        # Prepare the HTTP request.
        request = Request(url)
        if self.USER_AGENT:
            request.add_header('User-Agent', self.USER_AGENT)

        # Get cookies from the cookie jar.
        self.cookie_jar.add_cookie_header(request)

        # Make the HTTP request and get the HTML response. The context manager
        # closes the response even if reading or cookie extraction fails.
        with urlopen(request) as response:
            # Store cookies in the cookie jar.
            self.cookie_jar.extract_cookies(response, request)
            html = response.read()

        try:
            self.cookie_jar.save()
        except Exception:
            # Best effort: failing to persist cookies must not lose the page.
            pass

        # Parse the HTML response and extract the quotes.
        # NOTE(review): as rendered here, replace(" ", " ") looks like a no-op;
        # the raw file may contain a non-breaking or doubled space - confirm.
        # NOTE(review): unquote() percent-decodes the text, which could alter a
        # quote containing a literal "%XX" sequence - confirm it is intended.
        soup = BeautifulSoup(html, 'html.parser')
        return [
            unquote(div.get_text()).replace(" ", " ").replace("\n\n", "\n").strip()
            for div in soup.find_all("div", class_="quoteText")
        ]

    def iter_quotes(self, url, start=0, end=0, pause=0):
        """Yield quotes page by page until a page has none or ``end`` is passed.

        :param url: link to the quotes page.
        :param start: first page to download; values below 1 mean page 1.
        :param end: last page to download (inclusive); 0 means no limit.
        :param pause: seconds to wait between consecutive HTTP requests.
        """
        import time

        # A request without a page parameter and a request for page 1 return
        # the same listing, so starting at 0 would yield the first page twice.
        page = max(start, 1)
        first_request = True
        while end == 0 or page <= end:
            # Only wait between requests, never before the first one.
            if not first_request and pause > 0:
                time.sleep(pause)
            first_request = False

            quotes = self.get_quotes_from_page(url, page)
            if not quotes:
                break
            for q in quotes:
                yield q
            page += 1
def main():
    """Command line entry point: print the downloaded quotes to stdout.

    Quotes are written one after another with a line containing only "%"
    between consecutive quotes (nothing before the first or after the last).
    """
    options = ARGUMENT_PARSER.parse_args()

    downloader = GoodreadsQuoteDownloader()
    if options.user_agent:
        # Override the default User-Agent for this instance only.
        downloader.USER_AGENT = options.user_agent

    quotes = downloader.iter_quotes(
        options.url,
        start=options.start,
        end=options.end,
        pause=options.pause,
    )
    for index, text in enumerate(quotes):
        # Every quote except the first is preceded by the separator line.
        if index:
            print("%")
        print(text)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment