Script to filter down Goodreads Listopia items and order them based on what I'm most likely to want to read
"""Filter down a Goodreads Listopia list to books I'm most likely to like.
Fetches the raw HTML with Selenium (requires chromedriver), extracts book data
into a pandas DataFrame using BeautifulSoup, does some massaging, then prints
it out nicely.
TODO:
- more data points (year, contents of top upvoted reviews, shelves,
page count, read status, etc.)
- support multi-page Listopia lists
"""
import argparse
import logging
import os
import re
import sys
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from tabulate import tabulate
logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
def make_book(book) -> dict:
"""Extract key fields from HTML for a book in a Goodreads webpage table
to a dict.
Works for Listopia lists at e.g. https://www.goodreads.com/list/show/*
"""
# title & author
title = book.find("a", attrs={"class": "bookTitle"}).find("span").getText()
author = book.find("a", attrs={"class": "authorName"}).find("span").getText()
# id (from which url can easily be built)
href = book.find("a", attrs={"class": "bookTitle"})["href"]
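    # hrefs look like "/book/show/12345.Some_Title" or "/book/show/12345-some-title",
    # so strip the path and both separators to recover the bare numeric id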
id_ = os.path.basename(href).split(".", 1)[0]
id_ = id_.split("-", 1)[0]
# avg & total rating/s
    rating_detail = book.find("span", attrs={"class": "minirating"}).getText()
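    # rating_detail reads e.g. "really liked it 4.25 avg rating — 1,234 ratings";
    # the leading phrase isn't always present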
    # leading phrase is optional so bare "X.XX avg rating ..." strings still match
    RATING_REGEX = r"^.*?(\d\.\d\d) avg rating — ([\d,]+) ratings$"
m = re.match(RATING_REGEX, rating_detail)
if m is None:
logger.warn(f"Failed to pull ratings from detail: {rating_detail}")
avg_rating = np.nan
total_ratings = np.nan
else:
avg_rating = float(m.group(1))
total_ratings = int(m.group(2).replace(",", ""))
# listopia score
score_detail = book.find("a", onclick=re.compile("score_explanation")).getText()
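    # score_detail reads e.g. "score: 12,345"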
SCORE_REGEX = r"^score: ([\d,]+)$"
m = re.match(SCORE_REGEX, score_detail)
if m is None:
logger.warn(f"Failed to pull scoring from detail: {score_detail}")
score = np.nan
else:
score = int(m.group(1).replace(",", ""))
# final dict
return {
"title": title,
"author": author,
"id": id_,
"avg_rating": avg_rating,
"total_ratings": total_ratings,
"score": score,
"url": f"https://www.goodreads.com/book/show/{id_}",
}
def load_to_df(url):
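    # assumes chromedriver is installed at /usr/bin/chromedriver; adjust the
    # path for your system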
driver = webdriver.Chrome("/usr/bin/chromedriver")
driver.get(url)
html_source = driver.page_source
    # quit() (unlike close()) also shuts down the chromedriver process
    driver.quit()
soup = BeautifulSoup(html_source, "html.parser")
books = soup.find_all("tr", attrs={"itemtype": "http://schema.org/Book"})
book_dicts = list(map(make_book, books))
    df = pd.DataFrame(book_dicts)
return df.sort_values(by=["total_ratings", "avg_rating"], ascending=False)
def find_my_favorites(df):
df2 = df.sort_values(by=["total_ratings", "avg_rating", "score"], ascending=False)
    # .copy() avoids pandas' chained-assignment warning on the agg_score assignment below
    df3 = df2[["title", "id", "total_ratings", "avg_rating", "score"]].copy()
# ignore anything below a 3.7, since historically I haven't liked those much anyways
df3 = df3[df3["avg_rating"] > 3.7]
# ignore anything read less than some number of times
# since it could be an indicator that the book was so bad, people just won't finish it
df3 = df3[df3["total_ratings"] > 500]
# TODO: filter out books I've already read
# create an aggregate score from total/avg rating
#
# weigh total ratings down b/c sometimes things get a lot of ratings due to
# hype/cult status etc which doesn't correspond to my enjoyment.
# typically I don't enjoy books with that many more ratings over ~2k all that much more
#
# weigh average rating way up
#
# the results for a couple sample lists have some of my favorites from the list within the
# top 15 results, which seems legit
#
# TODO: incorporate year since I often don't like very old books
# TODO: incorporate page count, since I'm more likely to pick up a short
# book to get closer to my reading challenge for the year
# TODO: incorporate tags and shelves, b/c some genres I really don't care for
df3["agg_score"] = 1000 * (np.log2(df3["total_ratings"]) + (np.e ** df3["avg_rating"])) + df3['score']
df3 = df3.sort_values(by=["agg_score"], ascending=False)
return df3.reset_index(drop=True)
def get_parser():
parser = argparse.ArgumentParser(
description="Search a Goodreads Listopia list for books I may want to read"
)
parser.add_argument(
"--refresh", action="store_true", help="Whether to reuse cached files on disk"
)
parser.add_argument(
"-n", type=int, help="How many of the top recommendations to print", default=30
)
parser.add_argument(
"--url",
type=str,
help="URL of Listopia list on Goodreads to filter down",
required=True,
)
return parser
def main(limit, url, refresh):
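    # Listopia URLs end in a slug like "1.Best_Books_Ever"; reuse it as the
    # cache filename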
title = os.path.basename(url).split(".", 1)[1]
cache_file = f"{title}.csv"
if not os.path.exists(cache_file) or refresh:
logger.info(f"Fetching contents for first time '{cache_file}'")
df = load_to_df(url)
df.to_csv(cache_file, index=False)
else:
logger.info(f"Reusing local '{cache_file}'")
df = pd.read_csv(cache_file)
df2 = find_my_favorites(df)
print(tabulate(df2.head(limit), headers="keys", tablefmt="psql"))
if __name__ == "__main__":
parser = get_parser()
args = parser.parse_args()
main(limit=args.n, refresh=args.refresh, url=args.url)