Created
August 21, 2022 01:37
-
-
Save charles-l/12a3eb7b364acf322ec607cb610221e2 to your computer and use it in GitHub Desktop.
Python wordwrap port of http://blogs.perl.org/users/damian_conway/2019/08/greed-is-good-balance-is-better-beauty-is-best.html
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Python implementation of the wordwrap algorithm from | |
# http://blogs.perl.org/users/damian_conway/2019/08/greed-is-good-balance-is-better-beauty-is-best.html | |
import math | |
import re | |
def cost(lines, width): | |
# NOTE: not skipping the last line in the uniformity cost since | |
# if there's no way of splitting in greedy_wrap, the last line ends up | |
# with everything on it. | |
uniformity = sum(abs((width - len(l)))**3 for l in lines) | |
compactness = len(lines)**3 | |
widow_count = sum(1 for line in lines if | |
re.search(r'[.!?,:;]\s+\S+\s*$', line)) | |
orphan_count = sum(1 for line in lines if | |
re.search('^\S+[.!?,:;](\s|$)', line)) | |
return uniformity * compactness * (1 + 10 * (widow_count + orphan_count)) | |
def greedy_wrap(text, width=80, minwidth=None, minbreak=5): | |
lines = [] | |
if minwidth is None: | |
minwidth = math.floor(0.8 * width) | |
while text: | |
if (g := re.match(fr'.{{1,{width}}}$', text)): | |
lines.append(g.group(0)) | |
text = text[g.end():] | |
elif (g := re.match(fr'.{{{minwidth},{width}}} ', text)): | |
lines.append(g.group(0)) | |
text = text[g.end():] | |
elif ((g := re.match(fr'.{{{minbreak},{width-1}}}', text)) and | |
re.match(fr'\S{{{minbreak}}}', text[g.end():])): | |
lines.append(g.group(0) + '-') | |
text = text[g.end():] | |
else: | |
# raku's comb method somehow never runs across this case in the | |
# examples. I'm not sure if it's doing some crazy backtracking or | |
# if I ported the patterns wrong, but I'm just punting on it. | |
lines.append(text) | |
break | |
return '\n'.join(lines) | |
def iterative_wrap(text, width=80): | |
best_wrapping = text | |
lowest_cost = float('inf') | |
prev_max_width = float('inf') | |
for next_width in range(width, math.floor(0.9*width), -1): | |
if next_width > prev_max_width: | |
continue | |
wrapping = greedy_wrap(text, next_width) | |
wrap_cost = cost(wrapping.split('\n'), next_width) | |
if wrap_cost < lowest_cost: | |
best_wrapping = wrapping | |
lowest_cost = wrap_cost | |
prev_max_width = max(len(line) for line in wrapping.split('\n')) - 1 | |
return best_wrapping | |
print(iterative_wrap('''\ | |
Look you, I shall have to be terminating my interdisciplinary | |
investigation of consanguineous antidisestablishmentarianism | |
in Llanfairpwllgwyngyllgogerychwyrndrobwllllantysiliogogogoch. | |
For I've just been electrophotomicrographically diagnosed with | |
pseudopneumonoultramicroscopicsilicovolcanoconiosis, isn't it?'''.replace('\n', | |
' '), | |
45)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment