Created
January 25, 2021 10:25
-
-
Save irv/156f1e8fb80b8a3af57626761ed3a628 to your computer and use it in GitHub Desktop.
Create heatmap from IIIF Image requests (incomplete)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from io import BytesIO | |
import re | |
import asyncio | |
import numpy as np | |
from aiohttp import ClientSession, ClientPayloadError | |
import requests | |
import itertools, operator | |
from PIL import Image | |
import cv2 | |
import backoff | |
def create_image_matrix(width, height, regions):
    """Accumulate weighted request regions into a per-pixel matrix.

    Args:
        width: Image width in pixels (number of matrix columns).
        height: Image height in pixels (number of matrix rows).
        regions: Iterable of dicts with integer pixel keys ``x``, ``y``,
            ``w``, ``h`` and a ``count`` key giving how many times that
            region was requested.

    Returns:
        A float numpy array of shape ``(height, width)`` in which every pixel
        covered by a region has been incremented by ``0.1 * count``.
        Regions are clipped to the image bounds; a region lying entirely
        outside the image contributes nothing.
    """
    affected = np.zeros((height, width))
    for r in regions:
        # Clip the rectangle to the image so oversized or out-of-range
        # regions cannot produce negative dimensions.
        x0 = min(max(r['x'], 0), width)
        y0 = min(max(r['y'], 0), height)
        x1 = min(max(r['x'] + r['w'], x0), width)
        y1 = min(max(r['y'] + r['h'], y0), height)
        # Add in place on the covered slice instead of building a padded
        # full-size array for every region.
        affected[y0:y1, x0:x1] += 0.1 * r['count']
    return affected
async def main(loop):
    """Write one heatmap PNG per image from IIIF requests found in ``access.log``.

    Scans ``access.log`` for IIIF Image API request paths, keeps the first
    200 matches, fetches each referenced image's ``info.json`` to learn its
    dimensions, accumulates the requested regions into a per-pixel matrix
    and writes ``<identifier>.png`` coloured with OpenCV's JET colour map.

    Args:
        loop: Unused; kept so callers that pass the event loop keep working.
    """
    # Example request paths the pattern is meant to pick out of a log line:
    #   /image/iiif/ark:/81055/vdc_100049124055.0x000010/2048,0,1408,2048/704,/0/default.jpg
    #   /image/iiif/ark:/81055/vdc_100049124055.0x000010/pct:10,10,5,2/704,/0/default.jpg
    #
    # Raw strings avoid invalid-escape warnings. The pixel region uses
    # literal commas throughout, and each pct number is a single decimal
    # without a nested quantifier, which avoids excessive backtracking on
    # long digit runs.
    pct_number = r"(\d+\.?\d*|\.\d+)"
    img_api_regex = re.compile(
        r"\/(.*)\/((\d+,\d+,\d+,\d+)|(pct:" + ",".join([pct_number] * 4) + r"))"
        r"\/.*\/\d+\/default\.(jpg|gif|png|tif|jp2|webm|webp)"
    )

    max_requests = 200
    parsed_requests = []
    filename = "access.log"
    with open(filename, "r") as f:
        for line in f:
            matches = img_api_regex.search(line)
            if matches:
                parsed_requests.append(parse_request(matches.group(0)))
                # Only the first `max_requests` matches are used, so stop
                # reading instead of parsing the rest of the log.
                if len(parsed_requests) >= max_requests:
                    break

    uri_base = "https://api.bl.uk/image/iiif/ark:/81055"
    image_identifiers = {item[0] for item in parsed_requests}
    image_urls = [get_info_url(uri_base, i) for i in image_identifiers]
    image_info = await get_info_jsons(image_urls)

    # Group by identifier with a dict: itertools.groupby only merges
    # adjacent items, and a log interleaves requests for different images.
    requests_by_image = {}
    for identifier, region, is_pct in parsed_requests:
        requests_by_image.setdefault(identifier, []).append((region, is_pct))

    for identifier, img_width, img_height in image_info:
        # Count identical regions wherever they occur in the log.
        region_counts = {}
        for region, is_pct in requests_by_image.get(identifier, []):
            x, y, w, h = region['x'], region['y'], region['w'], region['h']
            if is_pct:
                # pct: regions are percentages of the full image, so x/w
                # scale with the width and y/h with the height.
                x = int((x / 100) * img_width)
                y = int((y / 100) * img_height)
                w = int((w / 100) * img_width)
                h = int((h / 100) * img_height)
            key = (x, y, w, h)
            region_counts[key] = region_counts.get(key, 0) + 1
        bucketed_requests = [
            {'x': x, 'y': y, 'w': w, 'h': h, 'count': count}
            for (x, y, w, h), count in region_counts.items()
        ]
        arr = create_image_matrix(img_width, img_height, bucketed_requests)

        # Normalise to 0..255. A flat matrix has no range to stretch, so
        # map it to zeros rather than dividing by zero.
        lowest = arr.min()
        span = arr.max() - lowest
        if span == 0:
            new_arr = np.zeros(arr.shape, dtype='uint8')
        else:
            new_arr = np.around((arr - lowest) * (1 / span * 255)).astype('uint8')

        rgb_img = cv2.applyColorMap(new_arr, cv2.COLORMAP_JET)
        # NOTE(review): OpenCV documents applyColorMap output as BGR, so this
        # RGB->BGR conversion probably swaps red and blue in the written
        # file. Left unchanged; confirm which orientation is wanted.
        bgr_img = cv2.cvtColor(rgb_img, cv2.COLOR_RGB2BGR)
        # Write each image as soon as it is ready so only one full-size
        # matrix is alive at a time.
        cv2.imwrite("%s.png" % identifier, bgr_img)
async def get_info_jsons(image_urls):
    """Fetch every ``info.json`` URL concurrently and extract image sizes.

    Args:
        image_urls: Iterable of ``info.json`` URLs.

    Returns:
        A list of ``(identifier, width, height)`` tuples, where the
        identifier is the last path segment of the document's ``@id``.
        Empty responses (``fetch`` returns ``None`` after it reports a
        payload error) are skipped instead of raising ``TypeError``.
    """
    # Caps the number of fetches in flight at once.
    sem = asyncio.Semaphore(1000)
    # NOTE(review): conn_timeout/read_timeout appear to be deprecated in
    # newer aiohttp releases in favour of a ClientTimeout passed as
    # ``timeout=`` -- confirm against the installed version.
    async with ClientSession(conn_timeout=10000, read_timeout=10000) as session:
        # Pass the semaphore and the shared session to every GET request.
        tasks = [
            asyncio.ensure_future(bound_fetch(sem, url, session))
            for url in image_urls
        ]
        responses = await asyncio.gather(*tasks)
    return [
        (info_json['@id'].split('/')[-1], info_json['width'], info_json['height'])
        for info_json in responses
        if info_json
    ]
def parse_request(path):
    """Split an IIIF Image API request path into identifier and region.

    The last five ``/``-separated segments of ``path`` are read as
    ``identifier/region/size/rotation/quality.format``; only the first two
    are used.

    Args:
        path: Request path such as
            ``/image/iiif/ark:/81055/vdc_1.0x000010/2048,0,1408,2048/704,/0/default.jpg``.

    Returns:
        A tuple ``(identifier, region, pct)``. ``region`` maps ``"x"``,
        ``"y"``, ``"w"`` and ``"h"`` to numbers and ``pct`` is True when the
        region carried the ``pct:`` prefix. Pixel regions are parsed as
        ints. Percentage values are ints when written without a decimal
        point and floats otherwise, so ``pct:10.5,...`` no longer fails.

    Raises:
        ValueError: If a region value is not numeric.
    """
    keys = ["x", "y", "w", "h"]
    parts = path.split('/')[-5:]
    identifier = parts[0]
    region_string = parts[1]
    pct = region_string.startswith("pct:")
    if pct:
        region_string = region_string[len("pct:"):]
    # Only as many values as there are keys are ever used.
    raw_values = region_string.split(',')[:len(keys)]
    if pct:
        region = [float(v) if "." in v else int(v) for v in raw_values]
    else:
        region = [int(v) for v in raw_values]
    return (identifier, dict(zip(keys, region)), pct)
def get_info_url(uri_base, identifier):
    """Return the ``info.json`` URL for ``identifier`` beneath ``uri_base``."""
    segments = (uri_base, identifier, "info.json")
    return "/".join(segments)
async def bound_fetch(sem, url, session):
    """Run ``fetch(url, session)`` while holding the semaphore ``sem``.

    Returns whatever ``fetch`` returns.
    """
    async with sem:
        payload = await fetch(url, session)
    return payload
# Running request counter, incremented before each progress line is printed.
COUNTER = 1


# The wait generator must accept ``max_value``. ``backoff.full_jitter`` is a
# jitter function taking a single ``value`` argument, so passing it here with
# ``max_value=13`` fails as soon as the wrapped function is called.
# ``backoff.fibo`` accepts ``max_value``, and full jitter is already
# backoff's default jitter.
@backoff.on_predicate(backoff.fibo, max_value=13)
@backoff.on_exception(backoff.expo,
                      ClientPayloadError,
                      max_time=60)
async def fetch(url, session):
    """GET ``url`` with ``session`` and return the decoded JSON body.

    Prints one progress line per request with the running counter, the
    response's ``DATE`` and ``DELAY`` headers and the final URL.

    Args:
        url: URL to request.
        session: Object whose ``get(url)`` is an async context manager
            yielding the response (an aiohttp ``ClientSession`` in this file).

    Returns:
        The parsed JSON document, or ``None`` when reading the body raised
        ``ClientPayloadError``; in that case an ``ERROR`` line is printed.
    """
    global COUNTER
    async with session.get(url) as response:
        delay = response.headers.get("DELAY")
        date = response.headers.get("DATE")
        COUNTER += 1
        print("{}. {}:{} with delay {}".format(str(COUNTER), date, response.url, delay))
        try:
            return await response.json()
        except ClientPayloadError:
            # Best effort: report the failure and hand back an empty result.
            print("ERROR: {}".format(url))
            return None
if __name__ == '__main__':
    # Create the loop explicitly: asyncio.get_event_loop() is deprecated
    # when no loop is running. Close it afterwards so its resources are
    # released.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        loop.run_until_complete(main(loop))
    finally:
        loop.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment