Created
February 27, 2011 03:37
-
-
Save richlowe/845888 to your computer and use it in GitHub Desktop.
roff(1) Line Mangler And Organizer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# roff(1) Line Mangler And Organizer | |
# | |
# Reflow a roff (source) document without damaging it: | |
# | |
# - Comments are left intact | |
# - Commands are left intact | |
# - Known "Verbatim blocks" are left intact | |
# - tbl(1) Tables | |
# - Unflowed sections | |
import sys | |
import textwrap | |
# Verbatim blocks: maps the command that opens a block to the command that
# closes it.
VERBATIM_BLOCKS = {
    '.TS': '.TE',  # Table Start / Table End
    '.nf': '.fi',  # No-fill / Fill
}

# Line tokens
#
# COMMENT  - a roff comment line, starting with .\" or '\"
#
# TEXT     - plain text
#
# COMMAND  - a roff command (must begin a line)
#
# VERBATIM - a line inside a verbatim block (see VERBATIM_BLOCKS)
#
# BLANK    - a blank line.  Blank lines in roff input are significant: if
#            they were treated as TEXT they could be flowed out of
#            existence, yet they almost always exist in the input for their
#            effect on the output.  They are therefore handled like a
#            command ("insert a blank line"), which gives the right
#            behaviour.
COMMENT = 0
TEXT = 1
COMMAND = 2
VERBATIM = 3
BLANK = 4
def tokenize(inp):
    """'tokenize' an nroff page on INP.

    INP is any iterable of lines (for example an open file).  Lines are
    concatenated as-is, so they are expected to carry their own line
    terminators.

    Returns a list of lists [TOKEN, 'text']; runs of consecutive lines with
    the same token are packed together into a single entry.
    """
    ret = []
    lasttok = None

    # Stack of verbatim block ending commands in the order we need to see them
    # to leave the block.  Also treated as bool to indicate lines should be
    # passed verbatim.
    inverb = []

    for line in inp:
        if line.startswith(('.\\"', '\'\\"')):
            # This has to be part of the same if/elif chain as the command
            # test below: a comment also begins with a control character, so
            # an independent check would reclassify it as COMMAND.
            tok = COMMENT
        elif not line or line.isspace():
            tok = BLANK
        elif line[0] in (".", "'"):
            tok = COMMAND

            command = line.split()[0]

            if command in VERBATIM_BLOCKS:
                # command starts a verbatim block
                inverb.append(VERBATIM_BLOCKS[command])
            elif inverb and command == inverb[-1]:
                # command ends the innermost open verbatim block
                inverb.pop()
        else:
            tok = VERBATIM if inverb else TEXT

        if tok == lasttok:
            ret[-1][1] += line
        else:
            ret.append([tok, line])
            lasttok = tok

    return ret
Wrap = textwrap.TextWrapper(width=79, expand_tabs=False,
                            replace_whitespace=False,
                            drop_whitespace=True,
                            fix_sentence_endings=False,
                            break_on_hyphens=False)


def _join_source_lines(text):
    """Join the input lines of TEXT into one long line ready for wrapping.

    Wrap is configured with replace_whitespace=False (so tabs survive), which
    also means it would keep newlines *inside* the wrapped lines.  The lines
    are therefore joined here first.  A line that ends a sentence is joined
    with two spaces, since roff recognises a sentence end at the end of an
    input line or before two spaces; all other lines are joined with one.
    """
    pieces = []
    for src in text.split('\n'):
        src = src.rstrip()
        if not src:
            continue
        # Closing quotes/brackets may follow the sentence-ending punctuation.
        ends_sentence = src.rstrip('"\')]*').endswith(('.', '?', '!'))
        pieces.append(src + ('  ' if ends_sentence else ' '))
    return ''.join(pieces)


def flow_paragraph(text):
    """Re-wrap the paragraph TEXT and return the new lines as a list.

    TEXT may span several newline-separated input lines.  The returned lines
    carry no trailing newline.  An empty (or all-whitespace) TEXT yields an
    empty list.
    """
    flowed = []

    for line in Wrap.wrap(_join_source_lines(text)):
        # We can't allow a non-COMMAND line to start with a period or a
        # single quote; if wrapping produced one, pull the last word of the
        # previous line down to prevent it.  This may push the line past the
        # wrap width, which is harmless.  The very first line has nothing
        # before it to borrow from and is left alone.
        while flowed and line[:1] in (".", "'"):
            head, _, word = flowed[-1].rpartition(' ')
            line = "%s %s" % (word, line)
            head = head.rstrip()
            if head:
                flowed[-1] = head
            else:
                # The previous line was a single word: drop it entirely
                # rather than leave an empty line, which roff would render
                # as a blank line.
                flowed.pop()
        flowed.append(line)

    return flowed
def reflow(lines, outp=None):
    """Reflow an nroff document, in LINES writing a new document to OUTP
    (default: sys.stdout).

    LINES is an iterable of (TOKEN, text) pairs.  COMMENT, COMMAND, VERBATIM
    and BLANK text is written through unchanged; TEXT is re-wrapped with
    flow_paragraph().

    Raises ValueError on a token value that is none of the above.
    """
    # Resolve the default at call time so a sys.stdout replaced after this
    # module was imported is honoured.
    if outp is None:
        outp = sys.stdout

    for tok, text in lines:
        if tok in (COMMENT, COMMAND, VERBATIM, BLANK):
            outp.write(text)
        elif tok == TEXT:
            flowed = flow_paragraph(text)
            # Writing the terminator for an empty paragraph would insert a
            # blank line into the output.
            if flowed:
                outp.write('\n'.join(flowed) + '\n')
        else:
            raise ValueError("Unknown token value `%s'" % (tok,))
def main(argv):
    """Command-line entry point: rofflmao <infile> <outfile>.

    ARGV is the full argument vector, program name included.  Returns the
    process exit status: 0 on success, 2 on a usage error, 1 when input and
    output name the same file.
    """
    import os  # local: only the entry point needs it

    if len(argv) != 3:
        sys.stderr.write("Usage: rofflmao <infile> <outfile>\n")
        return 2

    infile, outfile = argv[1:3]

    # Opening the output truncates it, so the input must not be the same
    # file.  Compare resolved paths rather than the raw strings so that
    # spellings such as "page.1" and "./page.1" are caught as well.
    if os.path.realpath(infile) == os.path.realpath(outfile):
        sys.stderr.write("Input and output must differ\n")
        return 1

    with open(infile, 'r') as f:
        with open(outfile, 'w') as n:
            reflow(tokenize(f), outp=n)

    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment