Skip to content

Instantly share code, notes, and snippets.

@memononen
Created December 7, 2021 19:33
Show Gist options
  • Save memononen/2c83d183c2749e5f4a493ce7ddb73f4d to your computer and use it in GitHub Desktop.
Save memononen/2c83d183c2749e5f4a493ce7ddb73f4d to your computer and use it in GitHub Desktop.
3-way merge based on O(NP) Myers diff and diff3, merging per item
#include <stdio.h>
#include <vector>
#include <span>
#include <algorithm>
// Based on
// "An O(NP) Sequence Comparison Algorithm" by Sun Wu, Udi Manber and Gene Myers
// - https://publications.mpi-cbg.de/Wu_1990_6334.pdf
// - Good article visualizing Myer's older algorithm: https://epxx.co/artigos/diff_en.html
//
// "A Formal Investigation of Diff3"
// - https://www.cis.upenn.edu/~bcpierce/papers/diff3-short.pdf
enum class EditType { Insert, Delete, Substitute, Match };
// Describes how one sequence is changed to another.
struct Edit {
Edit() = default;
Edit(EditType _type, int _base, int _change) : type(_type), base(_base), change(_change) {}
EditType type = EditType::Match; // type of edit
int base = INT32_MAX; // index in string a
int change = INT32_MAX; // index in string b
};
struct Diag {
Diag() = default;
Diag(int _x, int _y, int _len, int _next) : x(_x), y(_y), length(_len), next(_next) {}
int x = 0; // start position
int y = 0;
int length = 0; // diagonal length
int next = -1; // next diagonal towards the start
};
struct Point {
Point() = default;
Point(int _y, int _score, int _diag) : y(_y), score(_score), diag(_diag) {}
int y = 0; // furthest y
int score = 0;
int diag = -1; // nearest diagonal
};
enum class MergeType { Conflict, Base, Left, Right };
// Describes a change, type defines from which sequence should be used for merged.
struct Merge {
Merge() = default;
Merge(MergeType _type, EditType _edit, const int _base, const int _left, const int _right)
: type(_type), edit(_edit), base(_base), left(_left), right(_right) {}
MergeType type = MergeType::Base;
EditType edit = EditType::Match;
int base = 0;
int left = 0;
int right = 0;
};
static void snake(const int k, std::span<const char> left, std::span<const char> right,
std::span<Point> fp, const int fp0, std::vector<Diag>& diags)
{
constexpr int insRemScore = 1;
constexpr int subsScore = 4; // Making substitute cheaper than match favors longer diagonals and reduces small edits (chaffs).
constexpr int matchScore = 2;
const Point& belowPt = fp[fp0 + k-1];
const Point& rightPt = fp[fp0 + k+1];
const Point& prevPt = fp[fp0 + k];
int prevDiag = prevPt.diag;
int y = prevPt.y+1;
int score = prevPt.score + subsScore;
if ((rightPt.score + insRemScore) > score)
{
prevDiag = rightPt.diag;
y = rightPt.y;
score = rightPt.score + insRemScore;
}
if ((belowPt.score + insRemScore) > score)
{
prevDiag = belowPt.diag;
y = belowPt.y+1;
score = belowPt.score + insRemScore;
}
int x = y - k;
int length = 0;
const int N = left.size();
const int M = right.size();
while (x < N && y < M && left[x] == right[y])
{
x++; y++; length++; score += matchScore;
}
if (length > 0)
{
diags.emplace_back(x - length, y - length, length, prevDiag);
fp[fp0 + k] = Point(y, score, diags.size() - 1);
}
else
{
fp[fp0 + k] = Point(y, score, prevDiag);
}
}
// Returns shortest(ish) edit script between left and right.
std::vector<Edit> diff(std::span<const char> left, std::span<const char> right)
{
bool reverse = false;
if (left.size() > right.size())
{
std::swap(left, right);
reverse = true;
}
const int N = left.size();
const int M = right.size();
std::vector<Point> fp;
std::vector<Diag> diags;
const int delta = M - N;
const int fp0 = N + 1; // zero offset for furthest point indexing, indexing can go negative.
fp.resize((N+1) + (M+1) + 1);
// All paths will lead to empty diagonal at zero.
diags.push_back(Diag(0, 0, 0, -1));
std::fill(fp.begin(), fp.end(), Point(-1,0,0));
// Calculate common diagonal sequences
for (int p = 0; fp[fp0 + delta].y != M; p++)
{
for (int k = -p; k <= delta-1; k++)
snake(k, left, right, fp, fp0, diags);
for (int k = delta+p; k >= delta+1; k--)
snake(k, left, right, fp, fp0, diags);
snake(delta, left, right, fp, fp0, diags);
}
// Backtrace shortest edit script
std::vector<Edit> diff;
Diag prevDiag(N, M, 0, -1);
for (int i = fp[fp0 + delta].diag; i != -1; i = diags[i].next)
{
const Diag& diag = diags[i];
// The path between the diagonals is handled with substitutes, inserts, and deletes.
int numDel = prevDiag.x - (diag.x + diag.length);
int numIns = prevDiag.y - (diag.y + diag.length);
int numSubs = std::min(numDel, numIns);
numDel -= numSubs;
numIns -= numSubs;
int numMatch = diag.length;
int x = prevDiag.x, y = prevDiag.y;
if (reverse)
{
std::swap(numDel, numIns);
std::swap(x, y);
}
// Store edit for each item in sequence, walking backwards.
for (int j = 0; j < numIns; j++)
{
y--;
diff.emplace_back(EditType::Insert, x, y);
}
for (int j = 0; j < numDel; j++)
{
x--;
diff.emplace_back(EditType::Delete, x, y);
}
for (int j = 0; j < numSubs; j++)
{
x--; y--;
diff.emplace_back(EditType::Substitute, x, y);
}
for (int j = 0; j < numMatch; j++)
{
x--; y--;
diff.emplace_back(EditType::Match, x, y);
}
prevDiag = diag;
}
// Backtrace left the sequence in reverse, flip it.
std::reverse(diff.begin(), diff.end());
return diff;
}
// Returns true if index is valid for specific diff.
static bool isValidIndex(const int index, std::span<const Edit> diff)
{
return index < (int)diff.size();
}
// Returns index in base seq, or "infinity" if the index is out of bounds (i.e. iterator has finished).
static int getBase(const int index, std::span<const Edit> diff)
{
return index < (int)diff.size() ? diff[index].base : INT32_MAX;
}
// Merges diffs into operations that can be used to alter each of the sequences to get the combined result.
std::vector<Merge> merge3(std::span<const char> base, std::span<const char> left, std::span<const char> right,
std::span<const Edit> leftDiff, std::span<const Edit> rightDiff)
{
std::vector<Merge> res;
int indexLeft = 0;
int indexRight = 0;
int lastLeftChange = 0;
int lastRightChange = 0;
while (isValidIndex(indexLeft, leftDiff) || isValidIndex(indexRight, rightDiff))
{
const int leftBasePos = getBase(indexLeft, leftDiff);
const int rightBasePos = getBase(indexRight, rightDiff);
const int basePos = std::min(leftBasePos, rightBasePos);
Edit leftEdit(EditType::Match, basePos, lastLeftChange);
if (isValidIndex(indexLeft, leftDiff) && leftDiff[indexLeft].base == basePos)
{
leftEdit = leftDiff[indexLeft];
indexLeft++;
}
Edit rightEdit(EditType::Match, basePos, lastRightChange);
if (isValidIndex(indexRight, rightDiff) && rightDiff[indexRight].base == basePos)
{
rightEdit = rightDiff[indexRight];
indexRight++;
}
// Merge thruth table. There likely is a simple logic that would handle it without the convoluted ifs.
//
// L / R | ins del subs match
// ------+-----------------------------
// ins | X SL LR IL
// del | SR D IR DR
// subs | RL IL X SL
// match | IR DL SR MB
if (leftEdit.type == EditType::Insert && rightEdit.type == EditType::Insert)
{
// X - Conflict
res.emplace_back(MergeType::Conflict, EditType::Insert, basePos, leftEdit.change, rightEdit.change);
}
else if (leftEdit.type == EditType::Substitute && rightEdit.type == EditType::Substitute)
{
// X - Conflict
res.emplace_back(MergeType::Conflict, EditType::Substitute, basePos, leftEdit.change, rightEdit.change);
}
else if ((leftEdit.type == EditType::Insert && rightEdit.type == EditType::Delete) ||
(leftEdit.type == EditType::Substitute && rightEdit.type == EditType::Match))
{
// SL - substitute left
res.emplace_back(MergeType::Left, EditType::Substitute, basePos, leftEdit.change, rightEdit.change);
}
else if ((leftEdit.type == EditType::Delete && rightEdit.type == EditType::Insert) ||
(leftEdit.type == EditType::Match && rightEdit.type == EditType::Substitute))
{
// SR - substitute right
res.emplace_back(MergeType::Right, EditType::Substitute, basePos, leftEdit.change, rightEdit.change);
}
else if ((leftEdit.type == EditType::Insert && rightEdit.type == EditType::Match) ||
(leftEdit.type == EditType::Substitute && rightEdit.type == EditType::Delete))
{
// IL - insert left
res.emplace_back(MergeType::Left, EditType::Insert, basePos, leftEdit.change, rightEdit.change);
}
else if ((leftEdit.type == EditType::Match && rightEdit.type == EditType::Insert) ||
(leftEdit.type == EditType::Delete && rightEdit.type == EditType::Substitute))
{
// IR - insert right
res.emplace_back(MergeType::Right, EditType::Insert, basePos, leftEdit.change, rightEdit.change);
}
else if (leftEdit.type == EditType::Match && rightEdit.type == EditType::Match)
{
// MB - match base
res.emplace_back(MergeType::Base, EditType::Match, basePos, leftEdit.change, rightEdit.change);
}
else if (leftEdit.type == EditType::Match && rightEdit.type == EditType::Delete)
{
// DL - delete left
res.emplace_back(MergeType::Left, EditType::Delete, basePos, leftEdit.change, rightEdit.change);
}
else if (leftEdit.type == EditType::Delete && rightEdit.type == EditType::Match)
{
// DR - delete right
res.emplace_back(MergeType::Right, EditType::Delete, basePos, leftEdit.change, rightEdit.change);
}
else if (leftEdit.type == EditType::Insert && rightEdit.type == EditType::Substitute)
{
// LR - left and right
res.emplace_back(MergeType::Left, EditType::Insert, basePos, leftEdit.change, rightEdit.change);
res.emplace_back(MergeType::Right, EditType::Substitute, basePos, leftEdit.change, rightEdit.change);
}
else if (leftEdit.type == EditType::Substitute && rightEdit.type == EditType::Insert)
{
// RL - right and left
res.emplace_back(MergeType::Right, EditType::Insert, basePos, leftEdit.change, rightEdit.change);
res.emplace_back(MergeType::Left, EditType::Substitute, basePos, leftEdit.change, rightEdit.change);
}
else if (leftEdit.type == EditType::Delete && rightEdit.type == EditType::Delete)
{
// D - delete
res.emplace_back(MergeType::Left, EditType::Delete, basePos, leftEdit.change, rightEdit.change);
res.emplace_back(MergeType::Right, EditType::Delete, basePos, leftEdit.change, rightEdit.change);
}
lastLeftChange = leftEdit.change;
lastRightChange = rightEdit.change;
}
return res;
}
// Merge changes to right in-situ
void resolveIntoRight(std::span<Merge> merge, std::span<char> base, std::span<char> left, std::vector<char>& right)
{
int offset = 0;
for (const Merge& m : merge)
{
if (m.type == MergeType::Conflict)
{
// Conflicting insert or replaces, arbitrarily keep right (could be any)
}
else if (m.type == MergeType::Left && m.edit == EditType::Substitute)
{
// Substitute from left
right[m.right + offset] = left[m.left];
}
else if (m.type == MergeType::Left && m.edit == EditType::Insert)
{
// Insert from left
right.insert(right.begin() + m.right + offset, left[m.left]);
offset++;
}
else if (m.type == MergeType::Right && m.edit == EditType::Delete)
{
// Right delete.
right.erase(right.begin() + m.right + offset);
offset--;
}
else if (m.type == MergeType::Right && (m.edit == EditType::Substitute || m.edit == EditType::Insert))
{
// Right, keep.
}
else if (m.type == MergeType::Base && m.edit == EditType::Match)
{
// Match with base, keep
}
}
}
void printDiff(std::span<char> left, std::span<char> right, std::span<Edit> diff)
{
/*
for (const Edit& ed : diff) {
switch (ed.type)
{
case EditType::Match:
printf("\u001b[0m");
printf("%c %c", left[ed.base], right[ed.change]);
printf("\u001b[0m %d, %d\n", ed.base, ed.change);
break;
case EditType::Substitute:
printf("\u001b[46m%c %c", left[ed.base], right[ed.change]);
printf("\u001b[0m %d, %d\n", ed.base, ed.change);
break;
case EditType::Insert:
printf(" \u001b[42m%c", right[ed.change]);
printf("\u001b[0m %d, %d\n", ed.base, ed.change);
break;
case EditType::Delete:
printf("\u001b[0m%c", left[ed.base]);
printf("\u001b[0m %d, %d\n", ed.base, ed.change);
break;
}
}
printf("\u001b[0m\n");*/
printf(" Base: ");
for (const Edit& ed : diff) {
switch (ed.type)
{
case EditType::Match:
printf("\u001b[0m");
printf("%c", left[ed.base]);
break;
case EditType::Substitute:
printf("\u001b[0m");
printf("%c", left[ed.base]);
break;
case EditType::Insert:
printf("\u001b[0m");
printf(" ");
break;
case EditType::Delete:
printf("\u001b[41m");
printf("%c", left[ed.base]);
break;
default:
break;
}
}
printf("\u001b[0m\n");
// Change
printf(" Change: ");
for (const Edit& ed : diff) {
switch (ed.type)
{
case EditType::Match:
printf("\u001b[0m");
printf("%c", right[ed.change]);
break;
case EditType::Substitute:
printf("\u001b[43m");
printf("%c", right[ed.change]);
break;
case EditType::Insert:
printf("\u001b[42m");
printf("%c", right[ed.change]);
break;
case EditType::Delete:
printf("\u001b[0m");
printf(" ");
break;
default:
break;
}
}
printf("\u001b[0m\n");
}
void printMerge(std::span<char> base, std::span<char> left, std::span<char> right, std::span<Merge> merge)
{
/* char mergeMarker[] = "!BLR";
char editMarker[] = "?+-x=";
printf("Diff\n");
for (const Merge& m : merge)
{
printf("%c%c %d, %d, %d %c %c %c\n",
mergeMarker[(int)m.type], editMarker[(int)m.edit],
m.base, m.left, m.right,
base[m.base], left[m.left], right[m.right]);
}
printf("-\n");*/
printf(" ");
for (const Merge& m : merge)
{
if (m.type == MergeType::Conflict)
{
// Conflict, arbitrarily choose right
printf("\u001b[41m%c", right[m.right]);
}
else if (m.type == MergeType::Left && (m.edit == EditType::Substitute || m.edit == EditType::Insert))
{
// Left
printf("\u001b[44m%c", left[m.left]);
}
else if (m.type == MergeType::Right && (m.edit == EditType::Substitute || m.edit == EditType::Insert))
{
// Right
printf("\u001b[46m%c", right[m.right]);
}
else if (m.type == MergeType::Base && m.edit == EditType::Match)
{
// Match
printf("\u001b[0m%c", base[m.base]);
}
}
printf("\u001b[0m\n");
}
void printv(std::span<char> arr)
{
printf(" ");
for (const char c : arr)
printf("%c", c);
printf("\n");
}
std::vector<char> makev(const char* s)
{
std::vector<char> str;
while (*s)
{
str.push_back(*s);
s++;
}
return str;
}
int main()
{
std::vector<char> base = makev("the quick fox jumps ovre some lazy dog");
std::vector<char> left = makev("the abcck brown fox jumped ovre a dog");
std::vector<char> right = makev("the qzick brown fox jumps over some record dog");
std::vector<Edit> leftDiff = diff(base, left);
std::vector<Edit> rightDiff = diff(base, right);
std::vector<Merge> merged = merge3(base, left, right, leftDiff, rightDiff);
printf("\nDiff Base > \u001b[44mLeft\u001b[0m\n");
printDiff(base, left, leftDiff);
printf("\nDiff Base > \u001b[46mRight\u001b[0m\n");
printDiff(base, right, rightDiff);
printf("\nMerged\n");
printMerge(base, left, right, merged);
resolveIntoRight(merged, base, left, right);
printv(right);
printf("\n");
return 0;
}
@memononen
Copy link
Author

mikko@diff % g++ -std=c++2a -g3 diff3.cpp -o diff3
mikko@diff % ./diff3                              

Diff Base > Left
  Base:   the quick       fox jumps  ovre some lazy dog
  Change: the abcck brown fox jumped ovre       a   dog

Diff Base > Right
  Base:   the quick       fox jumps ovre some lazy   dog
  Change: the qzick brown fox jumps over some record dog

Merged
  the azcck brown fox jumped over record dog
  the azcck brown fox jumped over record dog

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment