-
-
Save bielfrontera/88e55eb65e3f97ff3e60015a4aca2e1c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
import os | |
import shutil | |
import requests | |
import re | |
# Topic names iterated by the script entry point; each one is passed to
# download_free_ebooks().
TOPIC_LIST = [
    'programming',
    'web-platform',
    'security',
    'iot',
    'data',
    'business',
    'webops-perf',
]
def download_free_ebooks(ebook_topic, ebook_formats=('epub', 'mobi', 'pdf')):
    """Download every free e-book listed for a topic, in each requested format.

    A directory named ``ebook_topic`` is created if it does not exist yet,
    the list of book slugs is obtained from ``get_free_ebook_list`` and each
    (book, format) pair is handed to ``download_ebook``.

    Args:
        ebook_topic: Topic name; also used as the output directory name.
        ebook_formats: Iterable of file extensions to fetch for each book.
            The default is an immutable tuple so it cannot be altered
            between calls.
    """
    # Materialise once: a one-shot iterable (e.g. a generator) would
    # otherwise be exhausted after the first book.
    formats = tuple(ebook_formats)

    if not os.path.exists(ebook_topic):
        os.mkdir(ebook_topic)

    for ebook in get_free_ebook_list(ebook_topic):
        for ebook_format in formats:
            download_ebook(ebook_topic, ebook, ebook_format)
def download_ebook(ebook_topic, ebook_slug, ebook_format):
    """Fetch one e-book file and write it to disk.

    The source URL comes from ``get_ebook_url`` and the destination path
    from ``get_ebook_filename``. Nothing is written when the response is
    not OK (``r.ok`` is false).

    Args:
        ebook_topic: Topic name the book belongs to.
        ebook_slug: Slug identifying the book.
        ebook_format: File extension to download.
    """
    ebook_url = get_ebook_url(ebook_topic, ebook_slug, ebook_format)
    ebook_filename = get_ebook_filename(ebook_topic, ebook_slug, ebook_format)
    r = requests.get(ebook_url, stream=True)
    try:
        if r.ok:
            # The raw stream bypasses requests' content decoding; without
            # this flag a gzip/deflate-encoded body is saved still compressed.
            r.raw.decode_content = True
            with open(ebook_filename, 'wb') as out_file:
                shutil.copyfileobj(r.raw, out_file)
    finally:
        # A streamed response keeps its connection until it is closed,
        # including on the non-OK path where the body is never read.
        r.close()
def get_free_ebook_list(ebook_topic):
    """Return the e-book slugs found on a topic's free-books index page.

    The index page is requested with ``requests.get``; when the response is
    not OK an empty list is returned, otherwise the page body is passed to
    ``get_ebook_list_from_content``.
    """
    response = requests.get(
        'http://www.oreilly.com/{topic}/free/'.format(topic=ebook_topic)
    )
    if not response.ok:
        return []
    return get_ebook_list_from_content(ebook_topic, response.content)
def get_ebook_list_from_content(ebook_topic, html_content):
    """Extract e-book slugs from the HTML of a topic's free-books index page.

    A slug is the part of a
    ``http://www.oreilly.com/<topic>/free/<slug>.csp`` link between
    ``free/`` and ``.csp``.

    Args:
        ebook_topic: Topic name the links must belong to.
        html_content: Page body, as text or as bytes (bytes are decoded as
            UTF-8, with undecodable bytes replaced).

    Returns:
        List of unique slugs, in order of first appearance.
    """
    # A str pattern cannot be applied to bytes on Python 3, so normalise
    # the input to text first.
    if isinstance(html_content, bytes):
        html_content = html_content.decode('utf-8', 'replace')

    # Dots and the topic are escaped so they only match literally.
    pattern = r'http://www\.oreilly\.com/{topic}/free/[\'"]?([^\'" >]+)\.csp'.format(
        topic=re.escape(ebook_topic)
    )

    # The same book may be linked several times on the page; keep the first
    # occurrence only so it is not downloaded repeatedly.
    book_slugs = []
    seen = set()
    for slug in re.findall(pattern, html_content):
        if slug not in seen:
            seen.add(slug)
            book_slugs.append(slug)
    return book_slugs
def get_ebook_url(ebook_topic, ebook_slug, ebook_format):
    """Build the download URL for one e-book file.

    The topic, slug and format are substituted into the fixed
    ``.../{topic}/free/files/{slug}.{ext}`` template and the resulting
    string is returned.
    """
    url_template = "http://www.oreilly.com/{topic}/free/files/{slug}.{ext}"
    return url_template.format(topic=ebook_topic, slug=ebook_slug, ext=ebook_format)
def get_ebook_filename(ebook_topic, ebook_slug, ebook_format):
    """Build the relative path under which one e-book file is saved.

    The result has the form ``<topic>/<slug>.<format>``.
    """
    path_template = "{topic}/{slug}.{ext}"
    return path_template.format(topic=ebook_topic, slug=ebook_slug, ext=ebook_format)
if __name__ == "__main__":
    # Script entry point: process every topic in TOPIC_LIST in turn,
    # using the default set of formats.
    for topic in TOPIC_LIST:
        download_free_ebooks(topic)
Si necesitáis usar un proxy, tenéis que cambiar "requests" por "sesion" en las llamadas a get() y, arriba del todo, poner:
sesion = requests.Session()
proxy = "_proxy_server:puerto_proxy"
sesion.proxies = {"http": proxy, "https": proxy}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
No funciona con Python 3; he tenido que cambiar la línea 50:
book_slugs = re.findall(pattern, html_content.decode("utf-8"))