Created
February 28, 2021 20:35
-
-
Save rickerp/a5674c9af97ac2d62b8b8489edfbea28 to your computer and use it in GitHub Desktop.
Python script to recursively download files from an HTML file server
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
import re | |
import requests | |
import os | |
import argparse | |
# Command-line interface: two required positionals plus two optional flags.
ap = argparse.ArgumentParser()
ap.add_argument(
    'URL',
    help='URL to download files',
)
ap.add_argument(
    'regex',
    help='Regex to detect the subdirectories in html',
)
ap.add_argument(
    "-d",
    "--directory",
    default=os.getcwd() + os.sep,  # current directory, with trailing separator
    help="Directory",
)
ap.add_argument(
    '-a',
    '--auth',
    nargs=2,  # exactly two values follow the flag
    help="Authentication credentials if needed for request",
)
# Expose the parsed namespace as a plain dict keyed by argument name.
args = vars(ap.parse_args())
def download_file(url, path=None, request_params=None):
    """Stream the resource at ``url`` into a local file and return its path.

    Args:
        url: Address of the file to fetch.
        path: Destination file path. When falsy, the last ``/``-separated
            segment of ``url`` is used as the file name.
        request_params: Optional mapping of extra keyword arguments that are
            forwarded to ``requests.get`` (for example ``{'auth': (...)}``).

    Returns:
        The path of the file that was written.

    Raises:
        requests.HTTPError: If ``raise_for_status`` reports an error status.
    """
    # A ``None`` default avoids sharing one mutable dict between calls.
    if request_params is None:
        request_params = {}
    local_filename = path if path else url.split('/')[-1]
    # stream=True lets the body be written chunk by chunk instead of being
    # loaded into memory as a whole.
    with requests.get(url, stream=True, **request_params) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    return local_filename
def getfiles(url, reg, folder=os.getcwd() + '/', auth=None):
    """Recursively download everything listed by an HTML index page.

    The page at ``url`` is fetched and ``reg`` is applied to its text with
    ``re.findall``. Matches ending in ``/`` are treated as subdirectories:
    a local directory is created and the function recurses into it. Every
    other match is downloaded with ``download_file``.

    Args:
        url: Address of the index page; entry names are appended to it as-is.
        reg: Regular expression whose matches are the entry names.
        folder: Local directory that receives the files. A trailing path
            separator is added when it is missing.
        auth: Optional pair of credentials passed to ``requests`` as ``auth``.
            ``None`` (the default) sends the requests without credentials.

    Raises:
        requests.HTTPError: If the index page or a file request reports an
            error status.
    """
    # tuple(None) raises TypeError, so only convert when credentials exist.
    credentials = tuple(auth) if auth else None
    # Local paths are built by plain concatenation below, so make sure the
    # folder ends with a separator before entry names are appended.
    if folder and not folder.endswith(('/', os.sep)):
        folder += os.sep

    print(f"Requesting {url}")
    response = requests.get(url, auth=credentials)
    # Fail loudly on an error page instead of parsing it as a listing.
    response.raise_for_status()
    # The first match is skipped (presumably the parent-directory link --
    # TODO confirm against the target server's listing format).
    entries = re.findall(reg, response.text)[1:]
    for e in entries:
        # endswith() is also safe for an empty match, unlike e[-1].
        if e.endswith('/'):
            # exist_ok lets an interrupted run be restarted in the same folder.
            os.makedirs(folder + e, exist_ok=True)
            getfiles(url + e, reg, folder + e, auth)
        else:
            print(f"Requesting {url + e}")
            download_file(url + e, folder + e, {'auth': credentials})
# Script entry: start the recursive download with the parsed CLI arguments.
getfiles(
    url=args['URL'],
    reg=args['regex'],
    folder=args['directory'],
    auth=args['auth'],
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment