Skip to content

Instantly share code, notes, and snippets.

@dmyersturnbull
Created September 12, 2024 23:01
Show Gist options
  • Save dmyersturnbull/f0d167b5018dbf1961ede6a78b359bdb to your computer and use it in GitHub Desktop.
Save dmyersturnbull/f0d167b5018dbf1961ede6a78b359bdb to your computer and use it in GitHub Desktop.
Order-independent diff tool
# SPDX-FileCopyrightText: Copyright 2020-2024, Contributors
# SPDX-PackageHomePage: https://gist.github.com/dmyersturnbull
# SPDX-License-Identifier: Apache-2.0
"""
Tool to diff two order-independent sets of lines.
"""
import math
import sys
from argparse import ArgumentParser, Namespace
from dataclasses import dataclass, KW_ONLY
from pathlib import Path
from typing import Iterator, Self
try:
import colorama
except ImportError:
colorama = None
@dataclass(frozen=True, slots=True)
class DiffResult:
"""The difference between two sets of lines."""
added: list[str]
removed: list[str]
retained: list[str]
def summary(self: Self) -> Iterator[str]:
title_label = "SUMMARY"
added_label = "Added: "
removed_label = "Removed: "
retained_label = "Retained: "
max_value = max(self.n_added, self.n_removed, self.n_retained)
title_width = len(title_label)
label_width = max(len(added_label), len(removed_label), len(retained_label))
value_width = len(str(max_value))
half_fill = "=" * int(math.ceil(title_width / 2 + value_width / 2))
title_fill = "=" * title_width
yield from [
f"{half_fill}{title_label}{half_fill}",
f"{added_label:<{label_width}}{self.n_added:>{value_width}}",
f"{removed_label:<{label_width}}{self.n_removed:>{value_width}}",
f"{retained_label:<{label_width}}{self.n_retained:>{value_width}}",
f"{half_fill*2}{title_fill}",
]
def formatted_output(
self: Self,
*,
sort: bool = False,
show_retained: bool = False,
color: bool = False,
) -> Iterator[str]:
"""Generates lines that describe the changes."""
# we'll move the [+-=] to the front after optionally sorting
# that's faster than a bespoke sorted(combined, key=...)
combined = (
*(line + "+" for line in self.added),
*(line + "-" for line in self.removed),
*(line + "." for line in self.retained if show_retained),
)
colorize = {
"+": f"{colorama.Style.BRIGHT}{colorama.Fore.GREEN}+ " if color else "+ ",
"-": f"{colorama.Style.BRIGHT}{colorama.Fore.RED}+ " if color else "- ",
".": f"{colorama.Style.NORMAL}{colorama.Fore.BLACK}. " if color else ". ",
}
if sort:
combined = sorted(combined)
yield from (colorize[line[-1]] + line[:-1] for line in combined)
@property
def n_total(self: Self) -> int:
return len(self.added) + len(self.removed) + len(self.retained)
@property
def n_modified(self: Self) -> int:
return len(self.added) + len(self.removed)
@property
def n_added(self: Self) -> int:
return len(self.added)
@property
def n_removed(self: Self) -> int:
return len(self.removed)
@property
def n_retained(self: Self) -> int:
return len(self.retained)
@dataclass(frozen=True, slots=True)
class Reader:
""""""
strip_whitespace: bool = False
def __call__(self: Self, file: Path) -> list[str]:
"""Read a file and return its lines as a set."""
lines = file.read_text(encoding="utf-8").splitlines()
if self.strip_whitespace:
return [line.strip() for line in lines]
return [line for line in lines if line] # always remove blank lines
@dataclass(frozen=True, slots=True)
class Differ:
    """Callable that compares two line lists while ignoring line order."""

    def __call__(self: Self, lines1: list[str], lines2: list[str]) -> DiffResult:
        """Classify every line as added, removed, or retained.

        Args:
            lines1: Lines of the first ("before") input.
            lines2: Lines of the second ("after") input.

        Returns:
            A ``DiffResult`` where ``added`` holds the lines of ``lines2`` that
            are absent from ``lines1`` (in ``lines2`` order), while ``removed``
            and ``retained`` split ``lines1`` (in ``lines1`` order) by whether
            each line is absent from or present in ``lines2``.
        """
        # Membership lookups go through sets; the lists keep the original order.
        in_first = set(lines1)
        in_second = set(lines2)

        added: list[str] = []
        for candidate in lines2:
            if candidate not in in_first:
                added.append(candidate)

        removed: list[str] = []
        retained: list[str] = []
        for candidate in lines1:
            if candidate in in_second:
                retained.append(candidate)
            else:
                removed.append(candidate)

        return DiffResult(added, removed, retained)
@dataclass(frozen=True, slots=True, order=True)
class Args:
""""""
file1: Path
file2: Path
_: KW_ONLY
pretty: bool
retained: bool
sort: bool
error: bool
@dataclass(frozen=True, slots=True)
class Parser:
    """Callable that turns an argv-style list into an ``Args`` instance."""

    def __call__(self: Self, args: list[str]) -> Args:
        """Parse command-line arguments.

        Args:
            args: The full argument vector; the first element (the program
                name) is skipped before parsing.

        Returns:
            The parsed arguments.
        """
        parser = ArgumentParser(
            description="""
            Compares two files as unordered sets of lines.
            The diff is asymmetric: Lines in `file2` but not `file1` are considered added.
            Although the diff is order-independent,
            lines are printed in the order they appear in `file1` (or `file2` for added lines).
            Empty lines are ignored.
            Use `--pretty` for colored output.
            """
        )
        parser.add_argument("file1", type=Path, help="Path to the first file.")
        parser.add_argument("file2", type=Path, help="Path to the second file.")
        parser.add_argument(
            "-p", "--pretty", action="store_true", help="Color output and write a summary."
        )
        parser.add_argument(
            "-r", "--retained", action="store_true", help="Output lines that are retained."
        )
        # Fixed spelling in the help text: "lexigraphically" -> "lexicographically".
        parser.add_argument(
            "-s", "--sort", action="store_true", help="Sort the lines lexicographically."
        )
        parser.add_argument(
            "-e", "--error", action="store_true", help="Exit 1 if any lines were added or removed."
        )
        ns = parser.parse_args(args[1:])
        return Args(
            ns.file1,
            ns.file2,
            pretty=ns.pretty,
            retained=ns.retained,
            sort=ns.sort,
            error=ns.error,
        )
@dataclass(frozen=True, slots=True, kw_only=True)
class Printer:
""""""
sort: bool
pretty: bool
show_retained: bool
def __call__(self: Self, diff: DiffResult) -> Iterator[str]:
yield from (
*self._summary(diff),
*self._header(diff),
*self._data(diff),
*self._footer(diff),
)
def _summary(self: Self, diff: DiffResult) -> Iterator[str]:
if self.pretty:
yield ""
yield from diff.summary()
def _header(self: Self, diff: DiffResult) -> Iterator[str]:
if self.pretty:
yield ""
if self.pretty and diff.n_total == 0:
yield "[[ no data to diff ]]"
elif self.pretty and diff.n_modified == 0 and not self.show_retained:
yield "[[ no changes in diff ]]"
elif self.pretty:
yield "[[ start of diff ]]"
def _footer(self: Self, diff: DiffResult) -> Iterator[str]:
if self.pretty and (diff.n_total == 0 or diff.n_modified == 0 and self.show_retained):
yield "[[ end of diff ]]"
if self.pretty:
yield ""
def _data(self: Self, diff: DiffResult) -> Iterator[str]:
yield from diff.formatted_output(sort=self.sort, show_retained=self.show_retained, color=self.pretty)
@dataclass(frozen=True, slots=True)
class Program:
    """Command-line front end of the diffset program."""

    def run(self: Self, cli_args: list[str]) -> int:
        """Parse the arguments, diff the two files, and print the result.

        Args:
            cli_args: The full argument vector, program name included.

        Returns:
            The process exit status: 1 if ``--pretty`` was requested but
            colorama is unavailable, 1 if ``--error`` was given and any line
            was added or removed, and 0 otherwise.
        """
        args = Parser()(cli_args)

        # Pretty output needs colorama, which is an optional import.
        if args.pretty and colorama is None:
            print("Cannot import colorama. Install with `pip install colorama`.", file=sys.stderr)
            return 1

        before = Reader()(args.file1)
        after = Reader()(args.file2)
        diff = Differ()(before, after)

        render = Printer(sort=args.sort, pretty=args.pretty, show_retained=args.retained)
        for rendered in render(diff):
            print(rendered)

        has_changes = diff.n_modified != 0
        return 1 if args.error and has_changes else 0
# Script entry point: the value returned by Program.run becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(Program().run(sys.argv))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment