@me-suzy
Created September 14, 2024 07:38
download google books 3
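The script below automates a Google Books preview: it drives a headless Chrome session through selenium-wire, captures the network requests that fetch each page image, and then re-downloads those images using the session cookie. Judging by the imports, it needs the selenium-wire, progressbar2, and regex packages installed, plus a ChromeDriver build matching the local Chrome; the chromedriver_path near the bottom is hardcoded and will likely need adjusting for your machine.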
import os
import urllib.request  # urllib.request must be imported explicitly in Python 3
import traceback
import regex as re
from time import sleep
from seleniumwire import webdriver  # pip install selenium-wire
from progressbar import progressbar as bar  # pip install progressbar2
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
print("""
Google Books Downloader by @aprikyan, 2020.
. . . . . . . . . . .
""")
def get_book_url():
    url = input("""
Step 1: Paste the URL of the book preview to be downloaded.
(e.g. https://books.google.com/books?id=buc0AAAAMAAJ&printsec=frontcover&sa=X&ved=2ahUKEwj-y8T4r5vrAhWKLewKHaIQBnYQ6AEwAXoECAQQAg#v=onepage&f=false)
Your input: """)
    if re.findall(r"id=[A-Za-z0-9]+", url):
        id_part = re.findall(r"id=[A-Za-z0-9]+", url)[-1]
    else:
        print("Invalid input. Please try again.")
        return get_book_url()
    return (f"https://books.google.com/books?{id_part}&pg=1&hl=en#v=onepage&q&f=false",
            f"https://books.google.com/books?{id_part}&pg=1&hl=en&f=false&output=embed&source=gbs_embed")
def get_book_data(url):
    driver.get(url)
    driver.refresh()
    sleep(3)
    # Selenium 4 removed find_element_by_class_name; use find_element(By.CLASS_NAME, ...)
    title = driver.find_element(By.CLASS_NAME, "gb-volume-title").text
    author = driver.find_element(By.CLASS_NAME, "addmd").text
    return f"{title} (b{author[1:]})"  # turns "By Author" into "by Author"
def capture_requests(url):
    driver.get(url)
    driver.refresh()
    sleep(5)
    checkpoint = ""
    # Keep paging with the space bar until the visible page image stops changing,
    # so selenium-wire records the image request for every reachable page.
    while checkpoint != driver.find_element(By.CLASS_NAME, "pageImageDisplay"):
        checkpoint = driver.find_element(By.CLASS_NAME, "pageImageDisplay")
        checkpoint.click()
        for i in range(25):
            html = driver.find_element(By.TAG_NAME, "body")
            html.click()
            html.send_keys(Keys.SPACE)
            sleep(2)
    return str(driver.requests)
def extract_urls(requests):
    urls = re.findall(r"url='(https:\/\/[^']+content[^']+pg=[A-Z]+([0-9]+)[^']+)(&w=[0-9]+)'", requests)
    return {int(url[1]): url[0] + "&w=69420" for url in urls}
def save_backup():
    save = input("""
Would you like to save a backup file (type Yes or No)?
Your input: """).upper()
    if save == "YES":
        with open(f"Backup of {book_data}.txt", "w") as f:
            f.write(str(all_pages))
        print(f"Successfully backed up the book in \"Backup of {book_data}.txt\"!")
def select_pages(user_input, all_pages):
    ranges = user_input.replace(" ", "").split(",")
    page_numbers = []
    if "all" in ranges:
        return all_pages
    while "odd" in ranges:
        page_numbers.extend([i for i in all_pages.items() if i[0] % 2])
        ranges.remove("odd")
    while "even" in ranges:
        page_numbers.extend([i for i in all_pages.items() if i[0] % 2 == 0])
        ranges.remove("even")
    for segment in ranges:
        if "-" in segment:
            a, b = segment.split("-")
            page_numbers.extend([i for i in all_pages.items() if int(a) <= i[0] <= int(b)])
        elif int(segment) in all_pages.keys():
            page_numbers.append((int(segment), all_pages[int(segment)]))
    return dict(set(page_numbers))
def get_cookie(url):
    cookies = []
    driver.get(url)
    driver.refresh()
    for request in driver.requests:
        if request.headers:
            if "Cookie" in request.headers.keys():
                cookies.append(request.headers["Cookie"])
    return cookies[0]
def download_imgs(pages, cookie, directory):
    proxy = urllib.request.ProxyHandler({})
    opener = urllib.request.build_opener(proxy)
    opener.addheaders = [("user-agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4)"),
                         ("cookie", cookie)]
    urllib.request.install_opener(opener)
    for number, url in bar(pages.items()):
        urllib.request.urlretrieve(url, os.path.join(directory, f"page{number}.png"))
def step1():
    global book_data, all_pages
    from_url = input("""
Would you like to download a book from URL? Type No if you have a backup, otherwise type Yes.
Your input: """).upper()
    if from_url == "YES":
        data_url, pages_url = get_book_url()
        book_data = get_book_data(data_url)
        reqs = capture_requests(pages_url)
        all_pages = extract_urls(reqs)
        save_backup()
    elif from_url == "NO":
        backup = input("Enter the location of the backup file. ")
        try:
            # A backup file stores repr(all_pages), so eval() restores the dict.
            book_data = os.path.basename(backup)[10:-4]
            all_pages = eval(open(backup).read())
        except:
            step1()
def step2():
    global selected_pages, cookie
    selection = input("""
Step 2: Specify the pages to be downloaded, as a comma-separated list of page numbers and ranges; the keywords all, odd, and even are also accepted.
(e.g. 1, 10-50, odd, 603)
Your input: """)
    try:
        selected_pages = select_pages(selection, all_pages)
    except:
        step2()
    cookie = get_cookie(list(all_pages.items())[0][1])
def step3():
    main_directory = input("Step 3: Specify the download location (optional): ")
    try:
        new_directory = os.path.join(main_directory, book_data)
        if not os.path.exists(new_directory):
            os.mkdir(new_directory)
    except:
        # Invalid location: retry from the top instead of falling through
        # with new_directory undefined.
        return step3()
    download_imgs(selected_pages, cookie, new_directory)
if __name__ == "__main__":
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--log-level=3")
    chrome_options.add_experimental_option("excludeSwitches", ["enable-logging"])
    chrome_options.add_experimental_option("prefs", {"safebrowsing.enabled": True})
    # Provide the path to your chromedriver via Service (Selenium 4 style)
    chromedriver_path = r"e:\google-books-downloader-master\chromedriver.exe"
    service = Service(executable_path=chromedriver_path)
    driver = webdriver.Chrome(service=service, options=chrome_options)
    try:
        step1()
        step2()
        step3()
    except Exception:
        with open("google-books-downloader crash.log", "w") as log:
            log.write(traceback.format_exc())
        print("Something went wrong, check the log file.")