Skip to content

Instantly share code, notes, and snippets.

@adjam
Last active July 3, 2019 16:00
Show Gist options
  • Save adjam/052cb2e4791e7caf2ba78c0dc7994d62 to your computer and use it in GitHub Desktop.
Save adjam/052cb2e4791e7caf2ba78c0dc7994d62 to your computer and use it in GitHub Desktop.
Example for merging item records spread out over multiple MARCXML records into a single record.
#!/usr/bin/env python
# Merges items spread over multiple MARC XML records
# into a single record. Some ILSes will not export MARC21
# that is "too large", and will output the same bibliographic record
# multiple times.
# usage: first, convert your marc21 to MARCXML (see marcrenaissance.sh gist)
# ensure that all records with the same control number come out in a 'clump',
# e.g. by sorting on control number
# reads from STDIN and outputs to STDOUT, so, e.g.
# ./item_joiner.py < multirecord.xml > joined.xml
# reads from 'multirecord.xml' and outputs to `joined.xml`.
# Check the id_expr and item_expr to make sure they're appropriate to your
# records
from lxml import etree
import sys
# Namespace URI of the MARCXML ("MARC21 slim") elements this script handles
nsuri = "http://www.loc.gov/MARC21/slim"
# Prefixed namespace map ('marc:') for the XPath expressions below
mns = {'marc': nsuri}
# Default (unprefixed) namespace map, used as the nsmap of the output
# <collection> element
nonens = {None: nsuri}
# NCSU keeps local control number in the 918$a; adjust as appropriate
id_expr = "marc:datafield[@tag='918']/marc:subfield[@code='a']"
# As above, you may need to adjust if you use the 949, e.g.
item_expr = "marc:datafield[@tag='999']"
def serialize(rec):
    """Return the element ``rec`` serialized as UTF-8 encoded XML."""
    xml_bytes = etree.tostring(rec, encoding="utf-8")
    return xml_bytes
def extract_id(rec):
    """Return the local control number of a MARCXML record.

    The number is the text of the first node matched by ``id_expr``.

    :param rec: a ``marc:record`` element
    :returns: the text of the first matching subfield, or ``None`` when the
        record has no matching subfield (or the subfield is empty)
    """
    matches = rec.xpath(id_expr, namespaces=mns)
    # An XPath node-set result is a (possibly empty) list; indexing an empty
    # one would raise IndexError before the caller could report the record
    # as missing its ID.
    if not matches:
        return None
    return matches[0].text
def merge_items(to_rec, from_rec):
    """Move all item fields of ``from_rec`` onto the end of ``to_rec``.

    Item fields are the elements matched by ``item_expr``. lxml elements
    have a single parent, so appending them to ``to_rec`` removes them from
    ``from_rec``. A ``from_rec`` without item fields leaves ``to_rec``
    unchanged.

    :param to_rec: the ``marc:record`` element that accumulates the items
    :param from_rec: a duplicate ``marc:record`` element whose items are
        moved
    """
    # XPath has no notion of a default namespace: lxml raises TypeError for
    # a {None: uri} map, so the prefixed map must be used for evaluation.
    items = from_rec.xpath(item_expr, namespaces=mns)
    to_rec.extend(items)
def main(infile, outfile):
    """Stream MARCXML from ``infile`` and write merged records to ``outfile``.

    Consecutive records sharing the same control number (see ``extract_id``)
    are collapsed into the first record of the run, with the item fields of
    the later duplicates appended to it (see ``merge_items``). The input
    must therefore already be grouped by control number. Output is a single
    ``collection`` element in the MARCXML namespace.

    :param infile: file name or binary file-like object to read MARCXML from
    :param outfile: file name or binary file-like object to write to
    :raises ValueError: if a record has no control number
    """
    current_id = None
    current_rec = None
    record_tag = "{%s}record" % nsuri
    with etree.xmlfile(outfile, encoding="utf-8", buffered=True) as xf:
        # The element() context manager does not yield a writer object, so
        # every write goes through ``xf`` itself.
        with xf.element("{%s}collection" % nsuri, nsmap=nonens):
            for _evt, rec in etree.iterparse(infile, events=('end',),
                                             tag=record_tag):
                this_id = extract_id(rec)
                if this_id is None:
                    raise ValueError("missing catkey on record")
                if this_id != current_id:
                    # A new run starts: flush the record accumulated so far.
                    if current_rec is not None:
                        xf.write(current_rec)
                        # Already serialized; drop its content so memory
                        # stays bounded on large inputs.
                        current_rec.clear()
                    current_rec = rec
                    current_id = this_id
                else:
                    merge_items(current_rec, rec)
                    # The duplicate's items were moved; the rest is unused.
                    rec.clear()
            # Flush the final run; empty input leaves nothing to write.
            if current_rec is not None:
                xf.write(current_rec)
if __name__ == '__main__':
    # lxml reads and writes bytes. On Python 3 the byte streams behind the
    # text-mode standard streams are exposed as ``.buffer``; where that
    # attribute is absent (Python 2) the streams themselves are used.
    main(getattr(sys.stdin, 'buffer', sys.stdin),
         getattr(sys.stdout, 'buffer', sys.stdout))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment