Last active
December 21, 2020 21:59
-
-
Save Airtnp/f6d4ce2ee116dab9ffdfbd3f11c3e762 to your computer and use it in GitHub Desktop.
PDF_download.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import re
import socket
import sys
from urllib.parse import urljoin
from urllib.request import urlretrieve

import requests
from bs4 import BeautifulSoup

socket.setdefaulttimeout(150)
class DownloadError(Exception):
    """Raised by the progress callback to abort a transfer.

    It must derive from ``Exception``: raising an instance of a plain class
    fails with ``TypeError: exceptions must derive from BaseException``.
    """
def show_block_fn(fn):
    """Build a progress callback for ``urlretrieve`` that prints a text bar.

    Args:
        fn: Label printed in front of the bar (the name of the file being
            saved).

    Returns:
        A ``show_block(a, b, c)`` callable, where ``a`` is the number of
        blocks transferred so far, ``b`` the block size in bytes and ``c``
        the total size in bytes.

    Raises:
        DownloadError: Raised by the returned callback when the computed
            percentage is negative (for example a negative total size),
            which aborts the transfer.
    """
    def show_block(a, b, c):
        if c == 0:
            # An empty file is complete as soon as the transfer starts;
            # dividing by the zero total would raise ZeroDivisionError.
            per = 100.0
        else:
            # The last block is usually only partly filled, so cap at 100.
            per = min(100.0 * a * b / c, 100.0)
        if per < 0:
            raise DownloadError()
        filled = int(per / 2.5)  # 40-cell bar: one cell per 2.5 percent
        bl = "[" + "*" * filled + "-" * (40 - filled) + "]"
        print("\t%s:%s %.2f%%" % (fn, bl, per))
    return show_block
def download_file(url, idx, local_filename, folder):
    """Download ``url`` into ``folder`` as ``local_filename``, printing progress.

    Args:
        url: Address of the file to fetch.
        idx: Running number of the download; accepted for interface
            compatibility and not used here.
        local_filename: Target file name; ``%20`` sequences are replaced
            by spaces.
        folder: Directory the file is written into.

    Returns:
        None. A failed transfer is reported on stdout instead of raising.
    """
    local_filename = local_filename.replace("%20", " ")
    target = os.path.join(folder, local_filename)
    try:
        # urlretrieve performs the whole transfer itself; no separate
        # request is opened, so there is no extra connection to leak.
        urlretrieve(url, target, show_block_fn(local_filename))
    except Exception:
        # Best effort: report the failure and let the caller continue.
        # Catching Exception rather than everything keeps Ctrl-C working.
        print("Error: " + url)
    return None
def _build_suffix_pattern(suffixes):
    """Compile a regex matching ``<name><suffix>`` at the end of a URL or just before its query/fragment."""
    alternatives = "|".join(re.escape(s) for s in suffixes)
    # The lookahead stops '.ppt' from matching inside '.pptx' and keeps host
    # names such as 'site.zip/...' from being mistaken for files. '%' is part
    # of the name so percent-encoded names are captured whole.
    return re.compile(
        r"([-_.%\w]+)(" + alternatives + r")(?=$|[?&#])", re.IGNORECASE
    )


def _resolve_link(page_url, href, download_prefix):
    """Turn an ``href`` found on ``page_url`` into a URL that can be fetched."""
    if download_prefix:
        return download_prefix + '/' + href
    # urljoin leaves absolute URLs alone and resolves relative, rooted
    # ('/x') and protocol-relative ('//host/x') references.
    return urljoin(page_url, href)


def download_pdf(root_link, folder, download_prefix = None):
    """Download every document linked from the page at ``root_link``.

    The page is fetched and each ``<a href>`` is inspected. Links whose
    target ends in one of the known document/archive suffixes (optionally
    followed by a query string or fragment) are saved into ``folder``.

    Args:
        root_link: URL of the page listing the files.
        folder: Directory to save into; created if it does not exist.
        download_prefix: When given, every href is appended to this prefix
            (joined with ``/``) instead of being resolved against the page
            URL.

    Returns:
        None. Progress and failures are printed to stdout.

    Raises:
        OSError: If ``folder`` cannot be created for a reason other than
            already existing.
    """
    suffix = ['.pdf', '.ppt', '.pptx', '.doc', '.docx', '.tar.gz', '.zip', '.rar']
    try:
        os.mkdir(folder)
    except FileExistsError:
        print("Folder already exists: {}".format(folder))

    # requests applies no timeout unless one is passed, so hand it the
    # process-wide socket default explicitly.
    r = requests.get(root_link, timeout=socket.getdefaulttimeout())
    if r.status_code != 200:
        print("A errors occurs.")
        return

    soup = BeautifulSoup(r.text, 'lxml')
    pattern = _build_suffix_pattern(suffix)
    idx = 1
    for link in soup.find_all('a'):
        href = link.get('href')
        if href is None:
            continue
        # r.url is the address after redirects, i.e. the base a browser
        # would resolve the page's relative links against.
        new_link = _resolve_link(r.url, href, download_prefix)
        m = pattern.search(new_link)
        if m is None:
            continue
        filename = m.group(1) + m.group(2)
        print("\nDownloading: " + new_link + " -> " + filename)
        try:
            download_file(new_link, str(idx), filename, folder)
        except Exception:
            print("Failed to download url {}".format(new_link))
        idx += 1
    print("All download finished")
if __name__ == "__main__":
    # Command line: <page_url> <folder> [download_prefix]
    if len(sys.argv) < 3:
        # Without this guard a missing argument surfaces as a bare IndexError.
        sys.exit(
            "usage: {} <page_url> <folder> [download_prefix]".format(sys.argv[0])
        )
    # Arguments past the optional prefix are ignored, as before.
    download_pdf(*sys.argv[1:4])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment