Created
May 19, 2024 13:16
-
-
Save benwbrum/5f5151583883ba24fef06f23ed647421 to your computer and use it in GitHub Desktop.
Sample script to convert a PBCore XML file into a IIIF manifest
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'json'
require 'nokogiri'
# Converts a PBCore XML description document into a IIIF Presentation 3
# manifest serialized as JSON.
class PBCoreToIIIF
  # Prefix joined with the instantiationLocation text to form the media URL.
  MEDIA_BASE_URL = 'https://aapb.example.com/'

  # PBCore instantiationMediaType values whose IIIF content-resource type is
  # spelled differently; any other value is passed through unchanged.
  IIIF_BODY_TYPES = {
    'Moving Image' => 'Video',
    'Static Image' => 'Image'
  }.freeze

  # Reads a PBCore XML file and returns a IIIF v3 manifest describing the
  # first instantiation that has an instantiationDigital child.
  #
  # @param xml_file [String] path to the PBCore XML file
  # @return [String] the manifest as a JSON document
  # @raise [ArgumentError] if no pbcoreInstantiation has an
  #   instantiationDigital child
  def self.generate_manifest(xml_file)
    doc = Nokogiri::XML(File.read(xml_file))

    # First we need to read the important metadata from the PBCore document
    title = doc.search('pbcoreTitle').map(&:text).join(' -- ')
    description = doc.search('pbcoreDescription').text

    # find a pbcoreInstantiation element with an instantiationDigital child
    instantiation = doc.search('pbcoreInstantiation').find do |candidate|
      candidate.search('instantiationDigital').any?
    end
    unless instantiation
      raise ArgumentError, "#{xml_file} has no pbcoreInstantiation with an instantiationDigital child"
    end

    build_manifest(title, description, collect_metadata(doc), media_body(instantiation)).to_json
  end

  # Gathers identifiers, contributors and annotations into a label => values
  # hash. Values are accumulated in arrays so that repeated labels (for
  # example two contributors sharing a role) do not overwrite each other.
  def self.collect_metadata(doc)
    metadata = Hash.new { |hash, label| hash[label] = [] }

    # identifiers: the source attribute is the label, the text is the value
    doc.search('pbcoreIdentifier').each do |identifier|
      metadata[label_or(identifier['source'], 'Identifier')] << identifier.text
    end

    # contributors: the contributorRole child is the label, the contributor child is the value
    doc.search('pbcoreContributor').each do |contributor|
      role = contributor.search('contributorRole').text
      metadata[label_or(role, 'Contributor')] << contributor.search('contributor').text.strip
    end

    # annotations: the annotationType attribute is the label, the text is the value
    doc.search('pbcoreAnnotation').each do |annotation|
      metadata[label_or(annotation['annotationType'], 'Annotation')] << annotation.text
    end

    metadata
  end

  # Returns the stripped label, or the fallback when the label is nil or blank,
  # so a missing attribute never becomes a null label in the manifest.
  def self.label_or(value, fallback)
    label = value.to_s.strip
    label.empty? ? fallback : label
  end

  # Builds the painting annotation body for the chosen instantiation.
  def self.media_body(instantiation)
    media_type = instantiation.search('instantiationMediaType').text.strip
    {
      'id' => MEDIA_BASE_URL + instantiation.search('instantiationLocation').text.strip,
      'type' => IIIF_BODY_TYPES.fetch(media_type, media_type),
      'format' => instantiation.search('instantiationDigital').text.strip,
      'duration' => duration_in_seconds(instantiation.search('instantiationDuration').text)
    }
  end

  # Converts a raw duration such as HH:MM:SS.SSS to seconds by folding the
  # colon-separated fields base 60; an empty string yields 0.
  def self.duration_in_seconds(duration_raw)
    duration_raw.split(':').map(&:to_f).inject(0) { |sum, field| sum * 60 + field }
  end

  # Stubs out the hash that becomes the IIIF v3 manifest.
  def self.build_manifest(title, description, metadata, body)
    canvas_id = 'https://example.com/iiif/canvas/1'
    # convert the metadata into label-value pairs
    i18n_metadata = metadata.map do |label, values|
      { 'label' => { 'en' => [label] }, 'value' => { 'en' => values } }
    end

    {
      '@context' => 'http://iiif.io/api/presentation/3/context.json',
      'id' => 'https://example.com/iiif/manifest',
      'type' => 'Manifest',
      'label' => { 'en' => [title] },
      'metadata' => i18n_metadata,
      'homepage' => [{
        'id' => 'https://example.com',
        'type' => 'Text',
        'label' => { 'en' => ['This should be a link to the catalog page with an appropriate label here'] },
        'format' => 'text/html'
      }],
      'summary' => { 'en' => [description] },
      'items' => [
        {
          'id' => canvas_id,
          'type' => 'Canvas',
          'label' => { 'en' => ["I'm not sure if these files have individual labels, or if there is ever more than one"] },
          'duration' => body['duration'],
          # height and width would be required for video content
          'items' => [
            {
              'id' => 'https://example.com/iiif/annotationpage/1',
              'type' => 'AnnotationPage',
              'items' => [
                {
                  'id' => 'https://example.com/iiif/annotation/1',
                  'type' => 'Annotation',
                  'motivation' => 'painting',
                  'body' => body,
                  'target' => canvas_id
                }
              ]
            }
          ]
        }
      ]
    }
  end

  private_class_method :collect_metadata, :label_or, :media_body, :duration_in_seconds, :build_manifest
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This looks great, @benwbrum! We also have a PBCore gem that we have used for parsing and using PBCore XML in Ruby, which might trim this down even further.
If there's not a lot of logic/variation on the output here, I think this will be pretty straightforward like you said.