Last active
July 16, 2020 17:21
-
-
Save martjanz/0ddb6d184079c096f1a777de77b31be4 to your computer and use it in GitHub Desktop.
Heinrich - Sanguinetti photo archive downloader
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Heinrich - Sanguinetti Archive photo downloader | |
Downloads the photo archive from the Endangered Archives Programme of the British Public Library. | |
This script assumes that photos are numbered consecutively inside and between folders. There are | |
some (few) exceptions, a manual review after the run to check if all files were downloaded will be needed. | |
Sample image URL: http://images.eap.bl.uk/EAP755/EAP755_1_1_295/2987.jp2/full/1287,/0/default.jpg | |
""" | |
import os
import time
import urllib.error
import urllib.request
BASE_URL = 'http://images.eap.bl.uk/EAP755/EAP755_1_1_'
MAX_RESOLUTION = 10000  # requested width, placed in the "{resolution}," URL segment
MIN_FOLDER = 1
MAX_FOLDER = 300  # inclusive: the last folder number that is visited
INITIAL_IMAGE = 1
OUTPUT_ROOT = 'photos'  # local directory that receives one sub-directory per folder
REQUEST_DELAY = 1  # seconds to wait after each download / end-of-folder probe


def _image_url(folder, image):
    """Return the download URL of image number `image` inside `folder`."""
    return '{base}{folder}/{photo}.jp2/full/{resolution},/0/default.jpg'.format(
        base=BASE_URL, folder=folder, photo=image, resolution=MAX_RESOLUTION)


def _fetch_image(folder, image):
    """Return the image bytes, or None when the server answers with an HTTP error.

    NOTE(review): every HTTP error status (not only 404) is treated as
    "image not present", exactly as the original loop did. Other failures
    (e.g. urllib.error.URLError on a network outage) still propagate.
    """
    try:
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(_image_url(folder, image)) as response:
            return response.read()
    except urllib.error.HTTPError:
        return None


def _download_folder(folder, image, root, fetch, delay):
    """Download the images of one folder, starting at number `image`.

    Images already present on disk are skipped. The folder is considered
    finished when two consecutive image numbers are both unavailable; a
    single missing number is treated as a gap in the numbering.

    Returns the image number at which the next folder should start
    (numbering continues across folders).
    """
    path = '{root}/EAP755_1_1_{folder}'.format(root=root, folder=folder)

    while True:
        image_path = '{path}/{image}.jpg'.format(path=path, image=image)
        if os.path.isfile(image_path):
            image += 1
            continue

        print('Checking image {} from EAP755_1_1_{}'.format(image, folder))
        data = fetch(folder, image)

        if data is None:
            # Check for a skipped image number before giving up on this folder.
            next_path = '{path}/{image}.jpg'.format(path=path, image=image + 1)
            if os.path.isfile(next_path):
                # The image after the gap was saved by a previous run;
                # no need to download it again.
                image += 1
                continue

            data = fetch(folder, image + 1)
            if data is None:
                time.sleep(delay)
                return image

            image += 1
            # The file name must follow the image that was actually fetched;
            # otherwise image N+1 would be stored as N.jpg.
            image_path = next_path
            print('Skipped image number, continuing on current folder.')

        print('Downloading image {} from EAP755_1_1_{}'.format(image, folder))
        os.makedirs(path, exist_ok=True)
        # The payload is fully read before the file is created, so a failed
        # transfer cannot leave an empty file that later runs would skip.
        with open(image_path, 'wb') as out_file:
            out_file.write(data)

        image += 1
        time.sleep(delay)


def main(min_folder=MIN_FOLDER, max_folder=MAX_FOLDER, first_image=INITIAL_IMAGE,
         root=OUTPUT_ROOT, fetch=None, delay=REQUEST_DELAY):
    """Download every folder from `min_folder` to `max_folder`, both inclusive.

    Args:
        min_folder: first folder number to visit.
        max_folder: last folder number to visit (inclusive).
        first_image: image number expected at the start of `min_folder`.
        root: local directory under which 'EAP755_1_1_<folder>' directories
            are created.
        fetch: callable `(folder, image) -> bytes or None` used to retrieve an
            image; defaults to an HTTP download from BASE_URL.
        delay: seconds to sleep after each download and at the end of a folder.
    """
    if fetch is None:
        fetch = _fetch_image

    image = first_image
    for folder in range(min_folder, max_folder + 1):
        print('Now on folder {}...'.format(folder))
        image = _download_folder(folder, image, root, fetch, delay)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment