Created
April 24, 2012 16:54
-
-
Save hubgit/2481431 to your computer and use it in GitHub Desktop.
Convert Harvard Library Bibliographic Dataset (MARC21) to MODS XML
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * Convert MARC21 records to individual MODS XML files.
 *
 * Reads every *.mrc file under /marc/, transforms each record with the
 * Library of Congress MARC21slim -> MODS 3.4 stylesheet, and writes the result
 * to /mods/<id[0..2]>/<id[3..5]>/<id>.xml, where <id> is the text of the
 * record's mods:recordInfo/mods:recordIdentifier element.
 *
 * Records that produce no identifier, or an identifier that is not safe to use
 * as a path component, are skipped. Every 1000th processed record prints the
 * path being written as a progress indicator.
 */
require 'File/MARC.php';

// The stylesheet is fetched over HTTP on every run; fail early if unavailable.
$xsl = new DOMDocument;
if (!$xsl->load('http://www.loc.gov/standards/mods/v3/MARC21slim2MODS3-4.xsl')) {
    error_log('Unable to load the MARC21slim2MODS stylesheet');
    exit(1);
}

$xsltproc = new XSLTProcessor;
$xsltproc->importStylesheet($xsl);

// A single DOMDocument is reused to parse the MARCXML of each record in turn.
$marcxml = new DOMDocument;
$marcxml->preserveWhiteSpace = false;

$i = 0;

// glob() returns false on error; treat that as "nothing to do".
$marcFiles = glob('/marc/*.mrc') ?: array();

foreach ($marcFiles as $marcFile) {
    $items = new File_MARC($marcFile);

    while ($record = $items->next()) {
        $marcxml->loadXML($record->toXML(), LIBXML_NOCDATA);

        $mods = $xsltproc->transformToDoc($marcxml);
        if (!$mods) {
            // transformToDoc() returns false on failure; skip instead of crashing.
            error_log("Transformation failed for a record in $marcFile");
            continue;
        }

        $xpath = new DOMXPath($mods);
        $xpath->registerNamespace('mods', 'http://www.loc.gov/mods/v3');

        // With no context node given, PHP evaluates this relative to the root element.
        $nodes = $xpath->query('mods:mods/mods:recordInfo/mods:recordIdentifier');
        if (!$nodes->length) continue; // no identifier

        $id = $nodes->item(0)->textContent;

        // The identifier becomes part of a filesystem path: reject values that
        // are empty, contain a directory separator or NUL byte, or start with
        // a dot (which could resolve to a parent directory).
        if ($id === '' || $id[0] === '.' || strpbrk($id, "/\\\0") !== false) {
            error_log("Skipping record with unusable identifier in $marcFile");
            continue;
        }

        // Shard output into two directory levels taken from the identifier prefix.
        $path = sprintf('/mods/%s/%s/%s.xml', substr($id, 0, 3), substr($id, 3, 3), $id);

        if (($i++ % 1000) === 0) print "$path\n";

        // The second is_dir() check tolerates another process creating the
        // directory between the first check and mkdir().
        $dir = dirname($path);
        if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
            error_log("Unable to create directory $dir");
            continue;
        }

        $mods->formatOutput = true;

        // Only the first child of the root element is serialised -- presumably
        // the single <mods> record inside a collection wrapper; TODO confirm.
        if (file_put_contents($path, $mods->saveXML($mods->documentElement->firstChild)) === false) {
            error_log("Unable to write $path");
        }
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * Build one batch document from the MODS files in /mods/000/000/.
 *
 * Each *.xml file is transformed with mods-to-cloudsearch.xsl and the results
 * are concatenated between <batch> and </batch> in /mods/cloud-000-000.xml.
 */
$xsl = new DOMDocument;
if (!$xsl->load('mods-to-cloudsearch.xsl')) {
    error_log('Unable to load mods-to-cloudsearch.xsl');
    exit(1);
}

$xsltproc = new XSLTProcessor;
$xsltproc->importStylesheet($xsl);

// A single DOMDocument is reused to parse each MODS file in turn.
$mods = new DOMDocument;
$mods->preserveWhiteSpace = false;

// glob() returns false on error; treat that as an empty batch.
$files = glob('/mods/000/000/*.xml') ?: array();

$output = fopen('/mods/cloud-000-000.xml', 'w');
if ($output === false) {
    error_log('Unable to open /mods/cloud-000-000.xml for writing');
    exit(1);
}

fwrite($output, "<batch>\n");

foreach ($files as $file) {
    if (!$mods->load($file, LIBXML_NOCDATA)) {
        error_log("Skipping unreadable MODS file: $file");
        continue;
    }

    // transformToXML() does not return a string on failure; write nothing then.
    $xml = $xsltproc->transformToXML($mods);
    if (is_string($xml)) {
        fwrite($output, $xml);
    }
}

fwrite($output, "</batch>\n");

// Close explicitly so buffered data is flushed before the script ends.
fclose($output);
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Transforms one MODS record into an <add> element carrying the record
  identifier as its id, one "title" field per titleInfo/title, and one
  "subject" field per topic of each subject whose authority is "lcsh".
-->
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
  <!-- No XML declaration: the caller concatenates many results into one file. -->
  <xsl:output method="xml" encoding="utf-8" omit-xml-declaration="yes" standalone="no" indent="yes"/>

  <!--
    Matches a root <mods> element in NO namespace.
    NOTE(review): MODS normally lives in http://www.loc.gov/mods/v3; this only
    matches if the input files carry no namespace declaration. Confirm against
    the generated files.
  -->
  <xsl:template match="/mods">
    <add id="{recordInfo/recordIdentifier}" version="1" lang="en">
      <xsl:for-each select="titleInfo/title">
        <field name="title"><xsl:value-of select="."/></field>
      </xsl:for-each>

      <xsl:for-each select="subject[@authority='lcsh']/topic">
        <field name="subject"><xsl:value-of select="."/></field>
      </xsl:for-each>
    </add>
  </xsl:template>
</xsl:stylesheet>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment