Last active
February 22, 2019 15:04
-
-
Save DakuTree/94d53646613c8087decb8f3a19297984 to your computer and use it in GitHub Desktop.
Python3 - Multi-connection downloading via Range header (requests, concurrent.futures)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# Example of using concurrent.futures with the Range HTTP header for faster downloading. | |
# This requires that the server you are downloading from supports the Range header.
# Couldn't find any clear solution for how to do this, so I wrote one up myself. | |
# | |
# Here we are using equal sized chunks representative of `(chunkSize, _) = divmod(contentLength, CONNECTIONS)`
# In my own testing: | |
# - Sizes <= ~10MB are slower than a single connection.
# - Size > 10MB~ is faster than a single connection. | |
# | |
# I'm assuming we could use variable connections & chunkSize to improve speeds in some cases, but that is beyond my current knowledge. Feel free to improve where needed. | |
from concurrent import futures | |
import requests | |
import time | |
from math import floor | |
URL = "http://ipv4.download.thinkbroadband.com/100MB.zip" # https://www.thinkbroadband.com/download for test files | |
CONNECTIONS = 8 | |
DATACHUNKS = {} | |
def main(): | |
head = requests.head(URL) | |
contentLength = int(head.headers['Content-Length']) | |
(chunkSize, _) = divmod(contentLength, CONNECTIONS) | |
with futures.ThreadPoolExecutor(max_workers=CONNECTIONS) as e: | |
downloads = [] | |
for i in range(1, CONNECTIONS): | |
time.sleep(0.20) # Delay between requests | |
# There is probably a nicer way to doing all this logic with Python, but I couldn't find any.. | |
(chunkStart, chunkEnd) = (((chunkSize * i) - chunkSize), (chunkSize * i)) | |
if not i == 1: | |
chunkStart += i-1 | |
chunkEnd += i-1 | |
downloads.append(e.submit(downloadChunk, URL, chunkStart, chunkEnd)) | |
downloads.append(e.submit(downloadChunk, URL, (((CONNECTIONS - 1) * chunkSize) + (CONNECTIONS - 1)), contentLength)) | |
futures.wait(downloads) | |
print("All chunks downloaded, dumping to file.") | |
filename = URL.split('/')[-1] # NOTE: This will not work with URLs using a Content-Disposition filename header | |
with open(filename, 'wb') as f: | |
f.write(b"".join(v for (k,v) in sorted(DATACHUNKS.items()))) | |
def downloadChunk(url, bytes_start, bytes_end): | |
print(f"Downloading chunk {bytes_start} - {bytes_end}") | |
try: | |
r = requests.get(url, headers={"range" : f"bytes={bytes_start}-{bytes_end}"}) | |
r.raise_for_status() | |
DATACHUNKS[bytes_start] = r.content | |
except requests.exceptions.RequestException as e: | |
print(e) | |
# sys.exit(1) | |
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment