Created
April 30, 2020 03:43
-
-
Save ToniRV/726b55de83f6c3666460bc039e3cf78c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
import os
import xml.etree.ElementTree as ET

import requests
from lxml import html
# Folder that receives the downloaded PDFs.
DIRECTORY = '/home/tonirv/Downloads/SpringerBooks/'

# Prefix -> namespace URI map handed to the ElementTree queries below.
ns = {"office": "urn:oasis:names:tc:opendocument:xmlns:office:1.0",
      "draw": "urn:oasis:names:tc:opendocument:xmlns:drawing:1.0",
      "text": "urn:oasis:names:tc:opendocument:xmlns:text:1.0"}


def iter_books(root):
    """Yield ``(name, url)`` pairs found in the parsed catalogue.

    For every ``draw:text-box`` element under *root*, the children of its
    first ``text:p`` descendant are scanned.  The text of each child that has
    an attribute whose value is ``'T2'`` is appended to the running name
    (followed by ``'_'``); the text of a child that has an attribute whose
    value is ``'T3'`` is taken as the URL.  A pair is emitted as soon as a
    non-empty URL is seen, after which the running name starts over.

    Text boxes without a ``text:p`` and ``'T3'`` children without text are
    skipped instead of raising.
    """
    name = ''
    for entry in root.findall(".//draw:text-box", ns):
        paragraph = entry.find(".//text:p", ns)
        if paragraph is None:
            # find() returns None when there is no match; nothing to scan.
            continue
        for span in paragraph:
            url = ''
            for value in span.attrib.values():
                if value == 'T2' and span.text is not None:
                    name += str(span.text) + '_'
                if value == 'T3':
                    url = span.text
            # Truthiness test: rejects both '' and None (a 'T3' child that
            # carries no text), which an identity check against '' did not.
            if url:
                yield name, url
                name = ''


def pdf_path(name, directory=DIRECTORY):
    """Return the path of the PDF for *name* inside *directory*.

    Any ``'/'`` in *name* is replaced by ``'_'`` so that the name cannot be
    interpreted as a sub-directory.
    """
    return os.path.join(directory, name.replace('/', '_') + '.pdf')


def download_book(name, url):
    """Fetch the page at *url* and save the PDF(s) it links to.

    The page is searched for an ``<a>`` element titled
    "Download this book in PDF format"; every link reported by that element
    is requested from link.springer.com and written to ``pdf_path(name)``.
    A page without such an element, or a PDF request that returns an HTTP
    error, is reported and skipped.
    """
    print("DOWNLOADING: ", name)
    print("from: ", url)
    r = requests.get(url)
    print(r.status_code)
    page = html.fromstring(r.text)
    link_elements = page.xpath('//a[@title="Download this book in PDF format"]')
    if not link_elements:
        print("No PDF download link found for: ", name)
        return
    for (_element, _attribute, link, _pos) in link_elements[0].iterlinks():
        download_url = 'http://link.springer.com/%s' % link
        print("Download from: %s. Saving to: %s" % (download_url, name))
        pdf = requests.get(download_url)
        if not pdf.ok:
            # Do not store an HTTP error body under a .pdf name.
            print("Download failed with status: ", pdf.status_code)
            continue
        # The context manager closes the file even if the write fails.
        with open(pdf_path(name), 'wb') as pdf_file:
            pdf_file.write(pdf.content)


def main():
    """Parse ``Springer_Ebooks.xml`` and download every book listed in it."""
    root = ET.parse('Springer_Ebooks.xml').getroot()
    # open() does not create missing directories, so make sure it exists.
    os.makedirs(DIRECTORY, exist_ok=True)
    for name, url in iter_books(root):
        download_book(name, url)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment