Skip to content

Instantly share code, notes, and snippets.

@arccoder
Last active January 23, 2021 21:18
Show Gist options
  • Save arccoder/c74a2a010974b87b6031cfd73a748169 to your computer and use it in GitHub Desktop.
Save arccoder/c74a2a010974b87b6031cfd73a748169 to your computer and use it in GitHub Desktop.
Redact word documents using Python
> python redact.py -h
usage: redact.py [-h] -i INPUT -o OUTPUT -p PATTERNS [-r REPLACE] [-c COLOR]
optional arguments:
-h, --help show this help message and exit
-i INPUT, --input INPUT
Path to the document to be redacted
-o OUTPUT, --output OUTPUT
Path to save the redacted document
-p PATTERNS, --patterns PATTERNS
List of pattern to redact. Comma separated.
-r REPLACE, --replace REPLACE
String to replace all patterns
-c COLOR, --color COLOR
Color to redact. Black is default. Options: white, yellow
def process_matches(match_pairs: list, run_text: str):
    """
    Turn regex match spans into split positions and per-segment highlight flags.

    The run-text is cut into consecutive segments that alternate between
    text that matched a pattern and text that did not. Adjacent matches
    (one ending exactly where the next starts) produce two consecutive
    highlighted segments.

    Zero-width spans (start == end) cover no text, so they are ignored;
    keeping them would yield duplicate split positions and empty
    "highlighted" segments.

    Args:
        match_pairs: List of match (start, end) pairs from a regex finditer,
            in the order finditer produced them.
        run_text: The text the pairs were matched against.

    Returns: Tuple of (boolean list of highlights, number list of positions).
        Segment i spans positions[i]..positions[i + 1] and is highlighted
        when highlights[i] is True, so len(positions) == len(highlights) + 1.
        Both lists are empty when there is nothing to highlight.
    """
    # Whether each segment contains a pattern
    highlights = []
    # Indices that delimit the segments
    matches = []

    # Drop zero-width spans: they select no text to redact
    spans = [(start, end) for start, end in match_pairs if end > start]
    if not spans:
        return highlights, matches

    # Segments always start at the beginning of the run-text
    matches.append(0)
    cursor = 0
    for start, end in spans:
        # Text between the previous match and this one is not highlighted
        if start != cursor:
            highlights.append(False)
            matches.append(start)
        # The match itself is highlighted
        highlights.append(True)
        matches.append(end)
        cursor = end

    # Trailing text after the last match is not highlighted
    if cursor != len(run_text):
        highlights.append(False)
        matches.append(len(run_text))

    return highlights, matches
def redact_colors(color: str = None):
    """
    Look up the font color and highlight color used to redact text.

    Args:
        color: Name of the redaction color. 'white' and 'yellow' are
            recognized; any other value (including None) selects black.

    Returns: Tuple with redaction colors for (text, background)
    """
    palette = {
        'white': (RGBColor(255, 255, 255), WD_COLOR_INDEX.WHITE),
        'yellow': (RGBColor(255, 255, 0), WD_COLOR_INDEX.YELLOW),
    }
    # Black is the fallback for unknown or missing color names
    fallback = (RGBColor(0, 0, 0), WD_COLOR_INDEX.BLACK)
    if color in palette:
        return palette[color]
    return fallback
def redact_document(input_path: str, output_path: str, pattern: list, color: str = None):
    """
    Redacts {pattern} in the {input_path} document and saves it as {output_path}.

    Matching text is kept in the document but hidden by setting both its
    highlight color and its font color to the redaction color. Patterns are
    combined with '|' into one regular expression and matched against the
    text of each run of each paragraph in doc.paragraphs separately, so a
    pattern that spans two runs is not matched.

    Args:
        input_path: Path to the document to be redacted
        output_path: Path to save the redacted document
        pattern: List of patterns (regular expressions) to redact
        color: Color to redact. Will be used for both the text and background.

    Raises:
        ValueError: If the highlight flags and split positions computed for
            a run are inconsistent with each other.
    """
    # Get the text color and text-background color for redaction
    txt_color, background_color = redact_colors(color)
    # Open the input document
    doc = Document(input_path)
    # Build the combined expression once instead of once per run
    regex = re.compile('|'.join(pattern))

    for para in doc.paragraphs:
        # Visit the runs last-to-first, re-reading para.runs by index each
        # time; presumably so that runs added by `split_run_by` do not shift
        # the indices still to be visited.
        for run_index in range(len(para.runs) - 1, -1, -1):
            run = para.runs[run_index]
            # Start and end indices of the patterns in the run-text.
            # Zero-width matches select no text, so they are skipped.
            match_pairs = [(match.start(), match.end())
                           for match in regex.finditer(run.text)
                           if match.end() > match.start()]
            # Get the locations in the format required for `split_run_by`
            highlights, matches = process_matches(match_pairs, run.text)

            # Nothing to redact in this run
            if not highlights or not matches:
                continue

            # N segments need exactly N + 1 boundaries
            if len(highlights) != len(matches) - 1:
                raise ValueError('Calculation error within matches and highlights')

            if len(matches) == 2:
                # A pattern is the only text in the run: redact it as a whole
                run.font.highlight_color = background_color
                run.font.color.rgb = txt_color
                continue

            # Split the run at the inner boundaries and redact the pieces
            # that correspond to a pattern
            new_runs = split_run_by(para, run, matches[1:-1])
            for highlight, piece in zip(highlights, new_runs):
                if highlight:
                    # Highlight the background color
                    piece.font.highlight_color = background_color
                    # Match the text color to the background color
                    piece.font.color.rgb = txt_color

    # Save the redacted document to the output path
    doc.save(output_path)
def redact_document_with_replace(input_path: str, output_path: str, pattern: list,
                                 replace_with: str = '##########', color: str = None):
    """
    Redacts {pattern} in the {input_path} document and saves it as {output_path}.

    Every piece of matching text is overwritten with {replace_with} and then
    hidden by setting both its highlight color and its font color to the
    redaction color. Patterns are combined with '|' into one regular
    expression and matched against the text of each run of each paragraph in
    doc.paragraphs separately, so a pattern that spans two runs is not
    matched.

    Args:
        input_path (str): Path to the document to be redacted
        output_path (str): Path to save the redacted document
        pattern (list): List of patterns (regular expressions) to redact
        replace_with (str): String to replace all patterns
        color (str): Color to redact. Will be used for both the text and background.

    Raises:
        ValueError: If the highlight flags and split positions computed for
            a run are inconsistent with each other.
    """
    # Get the text color and text-background color for redaction
    txt_color, background_color = redact_colors(color)
    # Open the input document
    doc = Document(input_path)
    # Build the combined expression once instead of once per run
    regex = re.compile('|'.join(pattern))

    for para in doc.paragraphs:
        # Visit the runs last-to-first, re-reading para.runs by index each
        # time; presumably so that runs added by `split_run_by` do not shift
        # the indices still to be visited.
        for run_index in range(len(para.runs) - 1, -1, -1):
            run = para.runs[run_index]
            # Start and end indices of the patterns in the run-text.
            # Zero-width matches select no text; skipping them keeps the
            # replacement string from being inserted where nothing matched.
            match_pairs = [(match.start(), match.end())
                           for match in regex.finditer(run.text)
                           if match.end() > match.start()]
            # Get the locations in the format required for `split_run_by`
            highlights, matches = process_matches(match_pairs, run.text)

            # Nothing to redact in this run
            if not highlights or not matches:
                continue

            # N segments need exactly N + 1 boundaries
            if len(highlights) != len(matches) - 1:
                raise ValueError('Calculation error within matches and highlights')

            if len(matches) == 2:
                # A pattern is the only text in the run: replace and redact
                # the run as a whole
                run.text = replace_with
                run.font.highlight_color = background_color
                run.font.color.rgb = txt_color
                continue

            # Split the run at the inner boundaries, then replace and redact
            # the pieces that correspond to a pattern
            new_runs = split_run_by(para, run, matches[1:-1])
            for highlight, piece in zip(highlights, new_runs):
                if highlight:
                    # Replace the matching pattern
                    piece.text = replace_with
                    # Highlight the background color
                    piece.font.highlight_color = background_color
                    # Match the text color to the background color
                    piece.font.color.rgb = txt_color

    # Save the redacted document to the output path
    doc.save(output_path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment