Script to filter down Goodreads Listopia items and order them based on what I'm most likely to want to read
"""Filter down a Goodreads Listopia list to books I'm most likely to like.
Fetches the raw HTML with Selenium (requires chromedriver), extracts book data
into a pandas DataFrame using BeautifulSoup, does some massaging, then prints
it out nicely.
TODO:
- more data points (year, contents of top upvoted reviews, shelves,
page count, read status, etc.)
- support multi-page Listopia lists
"""
import argparse
import logging
import os
import re
import sys
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from tabulate import tabulate
logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
def make_book(book) -> dict:
"""Extract key fields from HTML for a book in a Goodreads webpage table
to a dict.
Works for Listopia lists at e.g. https://www.goodreads.com/list/show/*
"""
# title & author
title = book.find("a", attrs={"class": "bookTitle"}).find("span").getText()
author = book.find("a", attrs={"class": "authorName"}).find("span").getText()
# id (from which url can easily be built)
href = book.find("a", attrs={"class": "bookTitle"})["href"]
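    # hrefs look like "/book/show/12345.Some_Title" or "/book/show/12345-some-title",
    # so strip the path and both separators to recover the bare numeric id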
id_ = os.path.basename(href).split(".", 1)[0]
id_ = id_.split("-", 1)[0]
# avg & total rating/s
    rating_detail = book.find("span", attrs={"class": "minirating"}).getText()
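    # rating_detail reads e.g. "really liked it 4.25 avg rating — 1,234 ratings";
    # the leading phrase isn't always present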
    # leading phrase is optional so bare "X.XX avg rating ..." strings still match
    RATING_REGEX = r"^.*?(\d\.\d\d) avg rating — ([\d,]+) ratings$"
m = re.match(RATING_REGEX, rating_detail)
if m is None:
logger.warn(f"Failed to pull ratings from detail: {rating_detail}")
avg_rating = np.nan
total_ratings = np.nan
else:
avg_rating = float(m.group(1))
total_ratings = int(m.group(2).replace(",", ""))
# listopia score
score_detail = book.find("a", onclick=re.compile("score_explanation")).getText()
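    # score_detail reads e.g. "score: 12,345"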
SCORE_REGEX = r"^score: ([\d,]+)$"
m = re.match(SCORE_REGEX, score_detail)
if m is None:
logger.warn(f"Failed to pull scoring from detail: {score_detail}")
score = np.nan
else:
score = int(m.group(1).replace(",", ""))
# final dict
return {
"title": title,
"author": author,
"id": id_,
"avg_rating": avg_rating,
"total_ratings": total_ratings,
"score": score,
"url": f"https://www.goodreads.com/book/show/{id_}",
}
def load_to_df(url):
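    # assumes chromedriver is installed at /usr/bin/chromedriver; adjust the
    # path for your system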
driver = webdriver.Chrome("/usr/bin/chromedriver")
driver.get(url)
html_source = driver.page_source
    # quit() (unlike close()) also shuts down the chromedriver process
    driver.quit()
soup = BeautifulSoup(html_source, "html.parser")
books = soup.find_all("tr", attrs={"itemtype": "http://schema.org/Book"})
book_dicts = list(map(make_book, books))
    df = pd.DataFrame(book_dicts)
return df.sort_values(by=["total_ratings", "avg_rating"], ascending=False)
def find_my_favorites(df):
df2 = df.sort_values(by=["total_ratings", "avg_rating", "score"], ascending=False)
    # .copy() avoids pandas' chained-assignment warning on the agg_score assignment below
    df3 = df2[["title", "id", "total_ratings", "avg_rating", "score"]].copy()
# ignore anything below a 3.7, since historically I haven't liked those much anyways
df3 = df3[df3["avg_rating"] > 3.7]
# ignore anything read less than some number of times
# since it could be an indicator that the book was so bad, people just won't finish it
df3 = df3[df3["total_ratings"] > 500]
# TODO: filter out books I've already read
# create an aggregate score from total/avg rating
#
# weigh total ratings down b/c sometimes things get a lot of ratings due to
# hype/cult status etc which doesn't correspond to my enjoyment.
# typically I don't enjoy books with that many more ratings over ~2k all that much more
#
# weigh average rating way up
#
# the results for a couple sample lists have some of my favorites from the list within the
# top 15 results, which seems legit
#
# TODO: incorporate year since I often don't like very old books
# TODO: incorporate page count, since I'm more likely to pick up a short
# book to get closer to my reading challenge for the year
# TODO: incorporate tags and shelves, b/c some genres I really don't care for
df3["agg_score"] = 1000 * (np.log2(df3["total_ratings"]) + (np.e ** df3["avg_rating"])) + df3['score']
df3 = df3.sort_values(by=["agg_score"], ascending=False)
return df3.reset_index(drop=True)
def get_parser():
parser = argparse.ArgumentParser(
description="Search a Goodreads Listopia list for books I may want to read"
)
parser.add_argument(
"--refresh", action="store_true", help="Whether to reuse cached files on disk"
)
parser.add_argument(
"-n", type=int, help="How many of the top recommendations to print", default=30
)
parser.add_argument(
"--url",
type=str,
help="URL of Listopia list on Goodreads to filter down",
required=True,
)
return parser
def main(limit, url, refresh):
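    # Listopia URLs end in a slug like "1.Best_Books_Ever"; reuse it as the
    # cache filename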
title = os.path.basename(url).split(".", 1)[1]
cache_file = f"{title}.csv"
if not os.path.exists(cache_file) or refresh:
logger.info(f"Fetching contents for first time '{cache_file}'")
df = load_to_df(url)
df.to_csv(cache_file, index=False)
else:
logger.info(f"Reusing local '{cache_file}'")
df = pd.read_csv(cache_file)
df2 = find_my_favorites(df)
print(tabulate(df2.head(limit), headers="keys", tablefmt="psql"))
if __name__ == "__main__":
parser = get_parser()
args = parser.parse_args()
main(limit=args.n, refresh=args.refresh, url=args.url)