Created
September 14, 2024 07:38
-
-
Save me-suzy/c53302d1e57f34d84604a2c3b45aaced to your computer and use it in GitHub Desktop.
download google books 3
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import ast
import os
import traceback
import urllib
from time import sleep

import regex as re
from progressbar import progressbar as bar
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service  # <-- Import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from seleniumwire import webdriver  # pip install selenium-wire
# Show the tool's banner once, before any interactive prompt appears.
print(
    "\n"
    "Google Books Downloader by @aprikyan, 2020.\n"
    ". . . . . . . . . . .\n"
)
def get_book_url():
    """Prompt for a Google Books preview URL and derive the two working URLs.

    The prompt is repeated until the pasted text contains an ``id=<volume id>``
    parameter. When several ``id=`` parameters are present the last one is used.

    Returns:
        tuple[str, str]: ``(data_url, pages_url)`` -- the regular "onepage"
        view of the volume and its embeddable viewer URL, both built from the
        extracted volume id.
    """
    prompt = """
Step 1: Paste the URL of the book preview to be downloaded.
(e.g. https://books.google.com/books?id=buc0AAAAMAAJ&printsec=frontcover&sa=X&ved=2ahUKEwj-y8T4r5vrAhWKLewKHaIQBnYQ6AEwAXoECAQQAg#v=onepage&f=false)
Your input: """
    # The look-behind keeps parameters that merely END in "id" (vid=, sid=, ...)
    # from being mistaken for the volume id; "-" and "_" are accepted so ids
    # containing those characters are not cut short.
    id_pattern = r"(?<![A-Za-z0-9_])id=([A-Za-z0-9_-]+)"

    # Loop instead of recursing so repeated bad input cannot grow the stack.
    while True:
        book_ids = re.findall(id_pattern, input(prompt))
        if book_ids:
            break
        print("Invalid input. Please try again.")

    id_part = f"id={book_ids[-1]}"
    return (f"https://books.google.com/books?{id_part}&pg=1&hl=en#v=onepage&q&f=false",
            f"https://books.google.com/books?{id_part}&pg=1&hl=en&f=false&output=embed&source=gbs_embed")
def get_book_data(url):
    """Load the book page in the shared ``driver`` and build a display label.

    Args:
        url: Address of the book page to open.

    Returns:
        str: ``"<title> (b<author text minus its first character>)"``, where
        the title is the text of the ``gb-volume-title`` element and the
        author text comes from the ``addmd`` element.
    """
    driver.get(url)
    driver.refresh()
    sleep(3)  # fixed pause before reading the page elements

    # The find_element_by_* helpers were removed in Selenium 4.3; the
    # find_element(By..., value) form works on every Selenium 4 release.
    title = driver.find_element(By.CLASS_NAME, "gb-volume-title").text
    author = driver.find_element(By.CLASS_NAME, "addmd").text

    # NOTE(review): swapping the first character for "b" presumably turns a
    # leading "By ..." into "by ..." -- confirm against the live page markup.
    return f"{title} (b{author[1:]})"
def capture_requests(url):
    """Page through the viewer and return everything the browser requested.

    The viewer is opened in the shared ``driver``; the function then keeps
    clicking the element found by the ``pageImageDisplay`` class and sending
    SPACE to the page body until that lookup returns the same element twice
    in a row.

    Args:
        url: Address of the viewer page.

    Returns:
        str: ``str(driver.requests)`` -- the textual form of the request list
        recorded by selenium-wire, meant to be parsed by ``extract_urls``.
    """
    driver.get(url)
    driver.refresh()
    sleep(5)  # fixed pause before interacting with the viewer

    # The find_element_by_* helpers were removed in Selenium 4.3; use the
    # By-based locator API instead.
    checkpoint = ""
    while checkpoint != driver.find_element(By.CLASS_NAME, "pageImageDisplay"):
        checkpoint = driver.find_element(By.CLASS_NAME, "pageImageDisplay")
        checkpoint.click()
        for _ in range(25):
            # The body is looked up again on every pass, as in the original.
            html = driver.find_element(By.TAG_NAME, "body")
            html.click()
            html.send_keys(Keys.SPACE)
        sleep(2)  # pause before checking the element again
    return str(driver.requests)
def extract_urls(requests):
    """Pull page-image URLs out of the textual request log.

    Args:
        requests: String in which each request appears as ``url='...'``.

    Returns:
        dict[int, str]: Maps the digits following the capital-letter prefix of
        the ``pg=`` parameter to the matched URL, with its trailing
        ``&w=<digits>`` replaced by ``&w=69420``. When the same number occurs
        more than once, the last occurrence wins.
    """
    page_url_pattern = r"url='(https:\/\/[^']+content[^']+pg=[A-Z]+([0-9]+)[^']+)(&w=[0-9]+)'"
    found = {}
    for hit in re.finditer(page_url_pattern, requests):
        base_url, page_number = hit.group(1), hit.group(2)
        found[int(page_number)] = base_url + "&w=69420"
    return found
def save_backup():
    """Offer to dump the captured page map to a text file.

    Reads the module-level ``book_data`` and ``all_pages``. If the user
    answers "yes" (any letter case), ``str(all_pages)`` is written to
    ``Backup of <book_data>.txt`` in the current working directory and a
    confirmation is printed; any other answer does nothing.
    """
    answer = input("""
Would you like to save a backup file (type Yes or No)?
Your input: """)
    if answer.upper() != "YES":
        return

    backup_name = f"Backup of {book_data}.txt"
    with open(backup_name, "w") as backup_file:
        backup_file.write(str(all_pages))
    print(f"Successfully backed up the book in \"{backup_name}\"!")
def select_pages(user_input, all_pages):
    """Resolve a page-selection string against the available pages.

    ``user_input`` is a comma-separated list whose items may be ``all``,
    ``odd``, ``even`` (keywords are case-insensitive), a single page number,
    or an inclusive ``first-last`` range. Spaces are ignored. Numbers that do
    not exist in ``all_pages`` are silently skipped.

    Args:
        user_input: The selection text typed by the user.
        all_pages: Mapping of page number to page URL.

    Returns:
        dict: ``all_pages`` itself when ``all`` is requested; otherwise a new
        dict holding the selected entries in ascending page order.

    Raises:
        ValueError: If an item is neither a keyword, an integer, nor a
            two-sided integer range (this includes empty items).
    """
    segments = user_input.replace(" ", "").lower().split(",")
    if "all" in segments:
        return all_pages

    chosen = set()  # page numbers; a set removes overlaps between items
    for segment in segments:
        if segment == "odd":
            chosen.update(number for number in all_pages if number % 2)
        elif segment == "even":
            chosen.update(number for number in all_pages if number % 2 == 0)
        elif "-" in segment:
            # Parse the bounds once, up front, so a malformed range is
            # rejected even when no page would have matched it.
            first, last = (int(bound) for bound in segment.split("-"))
            chosen.update(number for number in all_pages if first <= number <= last)
        elif int(segment) in all_pages:
            chosen.add(int(segment))

    # Sorting gives a deterministic, ascending download order.
    return {number: all_pages[number] for number in sorted(chosen)}
def get_cookie(url):
    """Open ``url`` in the shared ``driver`` and return a captured Cookie header.

    Every request recorded on ``driver.requests`` is inspected in order and
    the value of the first ``Cookie`` header found is returned.

    Args:
        url: Address to load before inspecting the recorded requests.

    Returns:
        The value of the first ``Cookie`` request header encountered.

    Raises:
        IndexError: If none of the recorded requests carried a ``Cookie``
            header (same exception type as before, now with a message).
    """
    driver.get(url)
    driver.refresh()
    for request in driver.requests:
        headers = request.headers
        if not headers:
            continue
        # NOTE(review): .get() is used rather than a membership test on
        # .keys() so that a case-insensitive header container can also match
        # a lower-case "cookie" name -- confirm the selenium-wire header type.
        cookie = headers.get("Cookie")
        if cookie is not None:
            return cookie  # stop at the first hit instead of collecting all
    raise IndexError("no captured request carried a Cookie header")
def download_imgs(pages, cookie, directory):
    """Fetch every selected page image into ``directory``.

    A urllib opener with an empty proxy mapping, a fixed user-agent and the
    supplied cookie is installed globally, then each URL is retrieved to
    ``page<number>.png`` while a progress bar tracks the loop.

    Args:
        pages: Mapping of page number to image URL.
        cookie: Value sent as the ``cookie`` request header.
        directory: Existing folder that receives the files.
    """
    # NOTE(review): the file only does ``import urllib``; this relies on the
    # ``urllib.request`` submodule having been loaded by another import.
    request_headers = [
        ("user-agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4)"),
        ("cookie", cookie),
    ]
    opener = urllib.request.build_opener(urllib.request.ProxyHandler({}))
    opener.addheaders = request_headers
    urllib.request.install_opener(opener)

    for page in bar(pages.items()):
        number, url = page
        target = os.path.join(directory, f"page{number}.png")
        urllib.request.urlretrieve(url, target)
def step1():
    """Populate the globals ``book_data`` and ``all_pages``.

    The user chooses between capturing a book from its URL ("yes") and
    loading a backup file written earlier ("no"). The question is repeated
    until one of the two paths completes, so the globals are always set when
    this function returns.

    For a backup, ``book_data`` is the file name without its first 10
    characters (the ``"Backup of "`` prefix used by ``save_backup``) and its
    last 4 characters (the extension); ``all_pages`` is the dict literal
    stored in the file.
    """
    global book_data, all_pages
    # Loop instead of recursing so repeated failures cannot grow the stack.
    while True:
        from_url = input("""
Would you like to download a book from URL? Type No if you have a backup, otherwise type Yes.
Your input: """).upper()

        if from_url == "YES":
            data_url, pages_url = get_book_url()
            book_data = get_book_data(data_url)
            reqs = capture_requests(pages_url)
            all_pages = extract_urls(reqs)
            save_backup()
            return

        if from_url == "NO":
            backup = input("Enter the location of the backup file.")
            try:
                with open(backup) as backup_file:
                    # literal_eval only accepts Python literals, so a
                    # tampered backup cannot execute code the way eval could.
                    pages = ast.literal_eval(backup_file.read())
            except (OSError, ValueError, SyntaxError, TypeError):
                print("Could not read the backup file. Please try again.")
                continue
            if not isinstance(pages, dict):
                print("Could not read the backup file. Please try again.")
                continue
            book_data = os.path.basename(backup)[10:-4]
            all_pages = pages
            return

        # Neither answer recognised: ask again rather than returning with
        # the globals unset.
        print("Invalid input. Please try again.")
def step2():
    """Ask which pages to download and fetch the cookie for the download.

    Sets the globals ``selected_pages`` (the result of ``select_pages`` on
    the user's input and ``all_pages``) and ``cookie`` (obtained by passing
    the first URL in ``all_pages`` to ``get_cookie``). The prompt is repeated
    while the selection cannot be parsed.

    Raises:
        IndexError: If ``all_pages`` is empty, so there is no URL to load.
    """
    global selected_pages, cookie
    prompt = """
Step 2: Specify the pages to be downloaded. Use the format:
(e.g. 1, 10-50, odd, 603)
Your input: """

    # Retry in a loop: the previous recursive retry fell through afterwards
    # and fetched the cookie once more for every failed attempt.
    while True:
        try:
            selected_pages = select_pages(input(prompt), all_pages)
        except ValueError:
            # Only parse errors trigger a retry; Ctrl-C and genuine bugs
            # are no longer swallowed by a bare except.
            print("Invalid input. Please try again.")
        else:
            break

    cookie = get_cookie(list(all_pages.values())[0])
def step3():
    """Ask where to save the book, create its folder and start the download.

    The target folder is ``<answer>/<book_data>``; an empty answer makes it
    relative to the current working directory. The prompt is repeated until
    the folder exists or can be created, after which ``download_imgs`` runs
    exactly once with the globals ``selected_pages`` and ``cookie``.
    """
    # Retry in a loop: the previous recursive retry fell through afterwards
    # and started a second download against the directory that had failed.
    while True:
        main_directory = input("Step 3: Specify the download location (optional):")
        new_directory = os.path.join(main_directory, book_data)
        try:
            if not os.path.exists(new_directory):
                os.mkdir(new_directory)
        except OSError:
            print("Could not create the download folder. Please try again.")
        else:
            break

    download_imgs(selected_pages, cookie, new_directory)
if __name__ == "__main__":
    # Script entry point: start a headless Chrome through selenium-wire,
    # run the three interactive steps, and always shut the browser down.
    # ``driver`` is assigned at module level, so the functions above see it
    # as a global without a ``global`` statement here.
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--log-level=3")
    chrome_options.add_experimental_option("excludeSwitches", ["enable-logging"])
    chrome_options.add_experimental_option("prefs", {"safebrowsing.enabled": True})

    # Provide the path to your chromedriver using Service.
    # The CHROMEDRIVER_PATH environment variable overrides the built-in
    # default so the script can run on machines with a different layout.
    chromedriver_path = os.environ.get(
        "CHROMEDRIVER_PATH",
        r"e:\google-books-downloader-master\chromedriver.exe",
    )
    service = Service(executable_path=chromedriver_path)  # Correct usage of Service
    driver = webdriver.Chrome(service=service, options=chrome_options)  # No executable_path here

    try:
        step1()
        step2()
        step3()
    except Exception:
        # Top-level boundary: keep the traceback for the user to inspect.
        with open("google-books-downloader crash.log", "w") as log:
            log.write(traceback.format_exc())
        print("Something went wrong, check the log file.")
    finally:
        # Without this the browser and chromedriver processes stay alive
        # after the script ends.
        driver.quit()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment