Created
December 18, 2023 00:11
-
-
Save edwintorok/27b90e6f5f8f3b3e9f89372f05df1b6c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
from docx import Document | |
from docx.oxml import ns | |
from docx.oxml.table import CT_TblWidth | |
from docx.oxml.shared import OxmlElement | |
from docx.oxml.ns import qn | |
# From python-docx _tag_seq
#
# Each inner tuple lists child element tags in the order in which they must
# appear under one kind of parent element.  The parent named in each comment
# below is inferred from the tag names (TODO confirm against python-docx).
#
# Every tag must appear at most once per tuple: the tuples are later turned
# into {tag: position} dicts, and a repeated tag would silently keep only its
# last position.
tag_seqs = (
    # presumably children of w:sectPr
    (
        "w:footnotePr",
        "w:endnotePr",
        "w:type",
        "w:pgSz",
        "w:pgMar",
        "w:paperSrc",
        "w:pgBorders",
        "w:lnNumType",
        "w:pgNumType",
        "w:cols",
        "w:formProt",
        "w:vAlign",
        "w:noEndnote",
        "w:titlePg",
        "w:textDirection",
        "w:bidi",
        "w:rtlGutter",
        "w:docGrid",
        "w:printerSettings",
        "w:sectPrChange",
    ),
    # presumably children of w:settings
    (
        "w:writeProtection",
        "w:view",
        "w:zoom",
        "w:removePersonalInformation",
        "w:removeDateAndTime",
        "w:doNotDisplayPageBoundaries",
        "w:displayBackgroundShape",
        "w:printPostScriptOverText",
        "w:printFractionalCharacterWidth",
        "w:printFormsData",
        "w:embedTrueTypeFonts",
        "w:embedSystemFonts",
        "w:saveSubsetFonts",
        "w:saveFormsData",
        "w:mirrorMargins",
        "w:alignBordersAndEdges",
        "w:bordersDoNotSurroundHeader",
        "w:bordersDoNotSurroundFooter",
        "w:gutterAtTop",
        "w:hideSpellingErrors",
        "w:hideGrammaticalErrors",
        "w:activeWritingStyle",
        "w:proofState",
        "w:formsDesign",
        "w:attachedTemplate",
        "w:linkStyles",
        "w:stylePaneFormatFilter",
        "w:stylePaneSortMethod",
        "w:documentType",
        "w:mailMerge",
        "w:revisionView",
        "w:trackRevisions",
        "w:doNotTrackMoves",
        "w:doNotTrackFormatting",
        "w:documentProtection",
        "w:autoFormatOverride",
        "w:styleLockTheme",
        "w:styleLockQFSet",
        "w:defaultTabStop",
        "w:autoHyphenation",
        "w:consecutiveHyphenLimit",
        "w:hyphenationZone",
        "w:doNotHyphenateCaps",
        "w:showEnvelope",
        "w:summaryLength",
        "w:clickAndTypeStyle",
        "w:defaultTableStyle",
        "w:evenAndOddHeaders",
        "w:bookFoldRevPrinting",
        "w:bookFoldPrinting",
        "w:bookFoldPrintingSheets",
        "w:drawingGridHorizontalSpacing",
        "w:drawingGridVerticalSpacing",
        "w:displayHorizontalDrawingGridEvery",
        "w:displayVerticalDrawingGridEvery",
        "w:doNotUseMarginsForDrawingGridOrigin",
        "w:drawingGridHorizontalOrigin",
        "w:drawingGridVerticalOrigin",
        "w:doNotShadeFormData",
        "w:noPunctuationKerning",
        "w:characterSpacingControl",
        "w:printTwoOnOne",
        "w:strictFirstAndLastChars",
        "w:noLineBreaksAfter",
        "w:noLineBreaksBefore",
        "w:savePreviewPicture",
        "w:doNotValidateAgainstSchema",
        "w:saveInvalidXml",
        "w:ignoreMixedContent",
        "w:alwaysShowPlaceholderText",
        "w:doNotDemarcateInvalidXml",
        "w:saveXmlDataOnly",
        "w:useXSLTWhenSaving",
        "w:saveThroughXslt",
        "w:showXMLTags",
        "w:alwaysMergeEmptyNamespace",
        "w:updateFields",
        "w:hdrShapeDefaults",
        "w:footnotePr",
        "w:endnotePr",
        "w:compat",
        "w:docVars",
        "w:rsids",
        "m:mathPr",
        "w:attachedSchema",
        "w:themeFontLang",
        "w:clrSchemeMapping",
        "w:doNotIncludeSubdocsInStats",
        "w:doNotAutoCompressPictures",
        "w:forceUpgrade",
        "w:captions",
        "w:readModeInkLockDown",
        "w:smartTagType",
        "sl:schemaLibrary",
        "w:shapeDefaults",
        "w:doNotEmbedSmartTags",
        "w:decimalSymbol",
        "w:listSeparator",
    ),
    # presumably children of w:style
    (
        "w:name",
        "w:aliases",
        "w:basedOn",
        "w:next",
        "w:link",
        "w:autoRedefine",
        "w:hidden",
        "w:uiPriority",
        "w:semiHidden",
        "w:unhideWhenUsed",
        "w:qFormat",
        "w:locked",
        "w:personal",
        "w:personalCompose",
        "w:personalReply",
        "w:rsid",
        "w:pPr",
        "w:rPr",
        "w:tblPr",
        "w:trPr",
        "w:tcPr",
        "w:tblStylePr",
    ),
    # presumably children of w:styles
    ("w:docDefaults", "w:latentStyles", "w:style"),
    # presumably children of w:tblPr
    (
        "w:tblStyle",
        "w:tblpPr",
        "w:tblOverlap",
        "w:bidiVisual",
        "w:tblStyleRowBandSize",
        "w:tblStyleColBandSize",
        "w:tblW",
        "w:jc",
        "w:tblCellSpacing",
        "w:tblInd",
        "w:tblBorders",
        "w:shd",
        "w:tblLayout",
        "w:tblCellMar",
        "w:tblLook",
        "w:tblCaption",
        "w:tblDescription",
        "w:tblPrChange",
    ),
    # presumably children of w:tcPr.  This sequence and the next one were
    # previously fused into a single tuple, which listed "w:cnfStyle" twice
    # and therefore ranked it after "w:tcW" once converted to a dict.
    (
        "w:cnfStyle",
        "w:tcW",
        "w:gridSpan",
        "w:hMerge",
        "w:vMerge",
        "w:tcBorders",
        "w:shd",
        "w:noWrap",
        "w:tcMar",
        "w:textDirection",
        "w:tcFitText",
        "w:vAlign",
        "w:hideMark",
        "w:headers",
        "w:cellIns",
        "w:cellDel",
        "w:cellMerge",
        "w:tcPrChange",
    ),
    # presumably children of w:trPr
    (
        "w:cnfStyle",
        "w:divId",
        "w:gridBefore",
        "w:gridAfter",
        "w:wBefore",
        "w:wAfter",
        "w:cantSplit",
        "w:trHeight",
        "w:tblHeader",
        "w:tblCellSpacing",
        "w:jc",
        "w:hidden",
        "w:ins",
        "w:del",
        "w:trPrChange",
    ),
    # presumably children of w:rPr
    (
        "w:rStyle",
        "w:rFonts",
        "w:b",
        "w:bCs",
        "w:i",
        "w:iCs",
        "w:caps",
        "w:smallCaps",
        "w:strike",
        "w:dstrike",
        "w:outline",
        "w:shadow",
        "w:emboss",
        "w:imprint",
        "w:noProof",
        "w:snapToGrid",
        "w:vanish",
        "w:webHidden",
        "w:color",
        "w:spacing",
        "w:w",
        "w:kern",
        "w:position",
        "w:sz",
        "w:szCs",
        "w:highlight",
        "w:u",
        "w:effect",
        "w:bdr",
        "w:shd",
        "w:fitText",
        "w:vertAlign",
        "w:rtl",
        "w:cs",
        "w:em",
        "w:lang",
        "w:eastAsianLayout",
        "w:specVanish",
        "w:oMath",
    ),
    # presumably children of w:pPr
    (
        "w:pStyle",
        "w:keepNext",
        "w:keepLines",
        "w:pageBreakBefore",
        "w:framePr",
        "w:widowControl",
        "w:numPr",
        "w:suppressLineNumbers",
        "w:pBdr",
        "w:shd",
        "w:tabs",
        "w:suppressAutoHyphens",
        "w:kinsoku",
        "w:wordWrap",
        "w:overflowPunct",
        "w:topLinePunct",
        "w:autoSpaceDE",
        "w:autoSpaceDN",
        "w:bidi",
        "w:adjustRightInd",
        "w:snapToGrid",
        "w:spacing",
        "w:ind",
        "w:contextualSpacing",
        "w:mirrorIndents",
        "w:suppressOverlap",
        "w:jc",
        "w:textDirection",
        "w:textAlignment",
        "w:textboxTightWrap",
        "w:outlineLvl",
        "w:divId",
        "w:cnfStyle",
        "w:rPr",
        "w:sectPr",
        "w:pPrChange",
    ),
    # presumably the edge children of border / margin containers
    (
        "w:top",
        "w:start",
        "w:left",
        "w:bottom",
        "w:end",
        "w:right",
        "w:insideH",
        "w:insideV",
    ),
)
# Order in which child elements must appear in the document settings part.
# Kept as a separate name because the settings tree is sorted on its own,
# independently of the generic per-element sequences.
#
# The former "w:unbounded" entry was dropped: no such settings element exists
# (it looks like a stray ``maxOccurs="unbounded"``), and removing it leaves the
# relative order of every real tag unchanged.
settings_tag_seq = (
    "w:writeProtection",
    "w:view",
    "w:zoom",
    "w:removePersonalInformation",
    "w:removeDateAndTime",
    "w:doNotDisplayPageBoundaries",
    "w:displayBackgroundShape",
    "w:printPostScriptOverText",
    "w:printFractionalCharacterWidth",
    "w:printFormsData",
    "w:embedTrueTypeFonts",
    "w:embedSystemFonts",
    "w:saveSubsetFonts",
    "w:saveFormsData",
    "w:mirrorMargins",
    "w:alignBordersAndEdges",
    "w:bordersDoNotSurroundHeader",
    "w:bordersDoNotSurroundFooter",
    "w:gutterAtTop",
    "w:hideSpellingErrors",
    "w:hideGrammaticalErrors",
    "w:activeWritingStyle",
    "w:proofState",
    "w:formsDesign",
    "w:attachedTemplate",
    "w:linkStyles",
    "w:stylePaneFormatFilter",
    "w:stylePaneSortMethod",
    "w:documentType",
    "w:mailMerge",
    "w:revisionView",
    "w:trackRevisions",
    "w:doNotTrackMoves",
    "w:doNotTrackFormatting",
    "w:documentProtection",
    "w:autoFormatOverride",
    "w:styleLockTheme",
    "w:styleLockQFSet",
    "w:defaultTabStop",
    "w:autoHyphenation",
    "w:consecutiveHyphenLimit",
    "w:hyphenationZone",
    "w:doNotHyphenateCaps",
    "w:showEnvelope",
    "w:summaryLength",
    "w:clickAndTypeStyle",
    "w:defaultTableStyle",
    "w:evenAndOddHeaders",
    "w:bookFoldRevPrinting",
    "w:bookFoldPrinting",
    "w:bookFoldPrintingSheets",
    "w:drawingGridHorizontalSpacing",
    "w:drawingGridVerticalSpacing",
    "w:displayHorizontalDrawingGridEvery",
    "w:displayVerticalDrawingGridEvery",
    "w:doNotUseMarginsForDrawingGridOrigin",
    "w:drawingGridHorizontalOrigin",
    "w:drawingGridVerticalOrigin",
    "w:doNotShadeFormData",
    "w:noPunctuationKerning",
    "w:characterSpacingControl",
    "w:printTwoOnOne",
    "w:strictFirstAndLastChars",
    "w:noLineBreaksAfter",
    "w:noLineBreaksBefore",
    "w:savePreviewPicture",
    "w:doNotValidateAgainstSchema",
    "w:saveInvalidXml",
    "w:ignoreMixedContent",
    "w:alwaysShowPlaceholderText",
    "w:doNotDemarcateInvalidXml",
    "w:saveXmlDataOnly",
    "w:useXSLTWhenSaving",
    "w:saveThroughXslt",
    "w:showXMLTags",
    "w:alwaysMergeEmptyNamespace",
    "w:updateFields",
    "w:hdrShapeDefaults",
    "w:footnotePr",
    "w:endnotePr",
    "w:compat",
    "w:docVars",
    "w:rsids",
    "m:mathPr",
    "w:attachedSchema",
    "w:themeFontLang",
    "w:clrSchemeMapping",
    "w:doNotIncludeSubdocsInStats",
    "w:doNotAutoCompressPictures",
    "w:forceUpgrade",
    "w:captions",
    "w:readModeInkLockDown",
    "w:smartTagType",
    "sl:schemaLibrary",
    "w:shapeDefaults",
    "w:doNotEmbedSmartTags",
    "w:decimalSymbol",
    "w:listSeparator",
)
import sys

# Replace each tag tuple with a {qualified tag: position} lookup so that
# elements can be sorted by the position of their ``.tag``.
settings_tag_seq = {qn(s): i for (i, s) in enumerate(settings_tag_seq)}
tag_seqs = [{qn(s): i for (i, s) in enumerate(tag_seq)} for tag_seq in tag_seqs]

# Load reference document (path given as the first command-line argument)
document = Document(sys.argv[1])
styles = document.styles

# Fix validation error: duplicate style
for p in document.paragraphs:
    if p.text.strip() == "Abstract":
        # Clear the style first, then assign the intended one.
        p.style = None
        p.style = styles["Abstract"]

# Fix validation error: character content in style '>'
atitle = styles["Abstract Title"]
rPr = atitle.element.rPr
# The style may carry no run properties at all; nothing to clean up then.
if rPr is not None:
    for x in rPr:
        # Drop stray text that follows each child element.
        x.tail = None

# Fix validation error: 0.0 instead of 0
wtag = ns.qn("w:tblW")  # loop-invariant, so resolved once
for t in document.tables:
    tblW = t._tblPr.find(wtag)
    if tblW is not None:
        # NOTE(review): this calls the element class with the found element
        # as its argument rather than using the found element directly; the
        # resulting object is what receives ``w = 0``.  Confirm this has the
        # intended effect on the table's own w:tblW.
        tblW = CT_TblWidth(tblW)
        tblW.w = 0

for s in styles:
    # Not every style kind exposes ``font`` (presumably numbering styles do
    # not); skip those instead of failing with AttributeError.
    font = getattr(s, "font", None)
    if font is None:
        continue
    # there might be some asciiTheme attributes that prevent style inheritance from working
    # explicitly wipe font name to make inheritance work
    # (the assignment looks redundant but is kept for the setter's side effect)
    if font.name is None:
        font.name = None
def sort_by_tag_seq(el, seqs=None) -> None:
    """Reorder the children of *el* in place to follow a known tag sequence.

    The first mapping in *seqs* whose keys cover every child tag of *el* is
    used as the sort order.  If no mapping covers all child tags, *el* is left
    untouched.  Children with equal tags keep their relative order.

    Args:
        el: A mutable sequence of objects with a ``tag`` attribute, such as an
            XML element whose items are its children.
        seqs: Iterable of ``{tag: position}`` mappings to try, in order.
            Defaults to the module-level ``tag_seqs``.

    NOTE(review): the sequence is chosen from the child tags alone, not from
    the parent's tag, so a small set of children that is valid under several
    sequences is sorted by whichever sequence comes first in *seqs*.
    """
    if seqs is None:
        seqs = tag_seqs

    children = list(el)
    # Zero or one child is always in order; avoid rewriting the element.
    if len(children) < 2:
        return

    tags = {child.tag for child in children}
    for order in seqs:
        # dict key views are set-like, so no per-call frozenset is needed.
        if tags <= order.keys():
            el[:] = sorted(children, key=lambda child: order[child.tag])
            return
# Put the children of each settings element into schema order, but only for
# elements whose children are all known settings tags.
settings_set = frozenset(settings_tag_seq.keys())
for el in document.settings._element.iter():
    tags = frozenset(x.tag for x in el)
    if tags.issubset(settings_set):
        el[:] = sorted(el, key=lambda x: settings_tag_seq[x.tag])

# TODO: upstream
# Left-pad short w:nsid values with zeros to eight characters.
val_attr = qn("w:val")
for w_nsid in document.part.numbering_part._element.xpath("./w:abstractNum/w:nsid"):
    old = w_nsid.get(val_attr)
    # A w:nsid without a w:val attribute has nothing to pad.
    if old is not None and len(old) < 8:
        w_nsid.set(val_attr, old.rjust(8, "0"))

# quarto w:pPr duplicate, todo fix
for wp in document._element.xpath("./w:body/w:tbl/w:tr/w:tc/w:p"):
    pprs = wp.xpath("./w:pPr")
    # A cell paragraph may have no w:pPr at all; there is nothing to merge.
    if not pprs:
        continue
    first = pprs[0]
    for other in pprs[1:]:
        # Iterate over a copy: appending a child to ``first`` moves it out of
        # ``other``.
        for child in other[:]:
            first.append(child)
        wp.remove(other)
    # remove duplicate w:jc
    for dup in first.xpath("./w:jc")[1:]:
        first.remove(dup)

# TODO: add missing tblGrid in quarto
for tbl in document._element.xpath("./w:body/w:tbl"):
    if not tbl.xpath("./w:tblGrid"):
        tblGrid = OxmlElement("w:tblGrid")
        tbl.insert_element_before(tblGrid, "w:tr")
        # NOTE(review): a single grid column of fixed width is added no matter
        # how many columns the table has -- confirm this is sufficient.
        gridCol = OxmlElement("w:gridCol")
        tblGrid.append(gridCol)
        gridCol.set(qn("w:w"), "7920")

# TODO: quarto or pandoc bug? most likely quarto...
# Finally put the children of every element in the body and in the styles
# part into a known schema order.
for el in document._element.iter():
    sort_by_tag_seq(el)
for el in document.styles._element.iter():
    sort_by_tag_seq(el)

# Write the repaired document to the path given as the second argument.
document.save(sys.argv[2])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment