Skip to content

Instantly share code, notes, and snippets.

@memononen
Last active November 20, 2021 19:01
Show Gist options
  • Save memononen/ff90a46ff55c20fc4655952c899239ff to your computer and use it in GitHub Desktop.
Save memononen/ff90a46ff55c20fc4655952c899239ff to your computer and use it in GitHub Desktop.
O(NP) diff with backtracking
#include <stdio.h>
#include <vector>
#include <algorithm>
// Based on "An O(NP) Sequence Comparison Algorithm" by Sun Wu, Udi Manber and Gene Myers
struct Edit {
enum Type { Insert, Delete, Common };
Edit() = default;
Edit(Type _type, int _a, int _b, int _n) : type(_type), a(_a), b(_b), n(_n) {}
Type type = Insert; // type of edit
int a = 0; // index in string a
int b = 0; // index in string b
int n = 0; // number of characters
};
struct Diag {
Diag() = default;
Diag(int _x, int _y, int _len, int _next) : x(_x), y(_y), len(_len), next(_next) {}
int x = 0; // start position
int y = 0;
int len = 0; // length
int next = -1; // next diagonal towards the start
};
struct Pt {
Pt() = default;
Pt(int _y, int _diag) : y(_y), diag(_diag) {}
int y = 0; // furthest y
int diag = -1; // nearest diagonal
};
static void snake(const int k,
const char* a, const char* b, const int m, const int n,
std::vector<Pt>& fp, const int fp0, std::vector<Diag>& diags)
{
const Pt& pa = fp[fp0 + k-1];
const Pt& pb = fp[fp0 + k+1];
const bool below = (pa.y+1) > pb.y;
const int diag = below ? pa.diag : pb.diag;
int y = below ? (pa.y+1) : pb.y;
int x = y - k;
int len = 0;
while (x < m && y < n && a[x] == b[y]) {
x++; y++; len++;
}
if (len > 0) {
diags.push_back(Diag(x-len, y-len, len, diag));
fp[fp0 + k] = Pt(y, diags.size() - 1);
} else {
fp[fp0 + k] = Pt(y, diag);
}
};
std::vector<Edit> diff(const char* a, const char* b)
{
int m = strlen(a);
int n = strlen(b);
bool reverse = false;
if (m > n) {
std::swap(a, b);
std::swap(m, n);
reverse = true;
}
std::vector<Pt> fp;
std::vector<Diag> diags;
const int delta = n - m;
const int size = (m+1) + (n+1) + 1;
const int fp0 = m+1; // zero offset for furthest point indexing, it can go negative.
fp.resize(size);
// All paths will lead to empty diagonal at zero.
diags.push_back(Diag(0, 0, 0, -1));
std::fill(fp.begin(), fp.end(), Pt(-1,0));
// Calculate common diagonal sequences
for (int p = 0; fp[fp0 + delta].y != n; p++) {
for (int k = -p; k <= delta-1; k++)
snake(k, a,b,m,n,fp,fp0,diags);
for (int k = delta+p; k >= delta+1; k--)
snake(k, a,b,m,n,fp,fp0,diags);
snake(delta, a,b,m,n,fp,fp0,diags);
}
// Backtrace shortest edit script
std::vector<Edit> ses;
Diag pdiag(m, n, 0, -1);
for (int d = fp[fp0 + delta].diag; d != -1; d = diags[d].next) {
const Diag& diag = diags[d];
// The path between the diagonals is handled with inserts and deletes.
int ndel = pdiag.x - (diag.x+diag.len);
int nins = pdiag.y - (diag.y+diag.len);
int ncom = diag.len;
int x = diag.x, y = diag.y;
if (reverse) {
std::swap(ndel, nins);
std::swap(x, y);
}
if (nins > 0)
ses.push_back(Edit(Edit::Insert, x+ncom+ndel, y+ncom, nins));
if (ndel > 0)
ses.push_back(Edit(Edit::Delete, x+ncom, y+ncom, ndel));
if (ncom > 0)
ses.push_back(Edit(Edit::Common, x, y, ncom));
pdiag = diag;
}
// Backtrace left the sequence in reverse, flip it.
std::reverse(ses.begin(), ses.end());
return ses;
}
int main()
{
const char* b = "Lorem ipsum dolor sit amet";
const char* a = "A quick brown fox jumps over the lazy dog";
std::vector<Edit> ses = diff(a, b);
printf("A: ");
for (const Edit& ed : ses) {
switch (ed.type)
{
case Edit::Common:
for (int k = 0; k < ed.n; k++)
printf("%c", a[ed.a+k]);
break;
case Edit::Insert:
for (int k = 0; k < ed.n; k++)
printf(" ");
break;
case Edit::Delete:
printf("\u001b[31m");
for (int k = 0; k < ed.n; k++)
printf("%c", a[ed.a+k]);
printf("\u001b[0m");
break;
}
}
printf("\u001b[0m\n");
printf("B: ");
for (const Edit& ed : ses) {
switch (ed.type)
{
case Edit::Common:
for (int k = 0; k < ed.n; k++)
printf("%c", b[ed.b+k]);
break;
case Edit::Insert:
printf("\u001b[32m");
for (int k = 0; k < ed.n; k++)
printf("%c", b[ed.b+k]);
printf("\u001b[0m");
break;
case Edit::Delete:
for (int k = 0; k < ed.n; k++)
printf(" ");
break;
}
}
printf("\u001b[0m\n");
return 0;
}
@memononen
Copy link
Author

A: A quick br own fox ju  m  ps    ove  r   the lazy dog   
B:           Lo         rem ipsum do  lor sit    a      met

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment