Created
January 15, 2024 14:05
-
-
Save yshalsager/7d94f672ab7063267a69ea4135655ae8 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# pip install httpx parsel
"""Collect direct PDF links from book pages that embed a PDF viewer iframe.

Reads one book page URL per line from ``books_list.txt``, fetches each page,
takes the ``file=`` parameter from the first iframe's ``src`` and writes the
resulting PDF URLs (one per line) to ``download_urls.txt``.
"""
from pathlib import Path

from httpx import get
from parsel import Selector

# Skip blank lines so an empty entry is never passed to the HTTP client.
books_urls = [
    line.strip()
    for line in Path('books_list.txt').read_text(encoding='utf-8').splitlines()
    if line.strip()
]
download_urls = Path('download_urls.txt')
# Start every run from a clean output file.
download_urls.unlink(missing_ok=True)

# Open the output once so the handle is closed (and flushed) deterministically
# instead of leaking one unclosed file object per book.
with download_urls.open(mode='a', encoding='utf-8') as output:
    for book_url in books_urls:
        page = Selector(text=get(book_url).text)
        # <iframe class="wonderplugin-pdf-iframe" src="https://www.albabtainlibrary.org/wp-content/plugins/wonderplugin-pdf-embed/pdfjslight/web/viewer.html?v=2&disabledownload=1&disableprint=1&disabletext=1&disabledoc=1&disableopenfile=1&disabletoolbar=1&disablerightclick=1&file=https://www.albabtainlibrary.org/wp-content/uploads/2023/08/Ibn-tamiya-2-final.pdf" width="100%" height="600px" style="border:0;"></iframe>
        # Get the iframe src
        iframe_src = page.css('iframe::attr(src)').get("")
        if 'file=' not in iframe_src:
            # No iframe, or an iframe without a file parameter: there is no
            # PDF link to record, so do not write a bogus/empty line.
            print(f'No embedded PDF found on {book_url}')
            continue
        # Get the pdf file url (everything after the last "file=")
        pdf_url = iframe_src.split('file=')[-1]
        print(pdf_url)
        output.write(pdf_url + '\n')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment