Skip to content

Instantly share code, notes, and snippets.

@otheus
Last active February 10, 2023 21:33
Show Gist options
  • Save otheus/92162e3a764d2697c3272b98b2663a94 to your computer and use it in GitHub Desktop.
Save otheus/92162e3a764d2697c3272b98b2663a94 to your computer and use it in GitHub Desktop.
#!/bin/awk -f
## Awk script to compare to SQL (postgres) dumps for which each line of input is a row
## and has been preprocessed by
## paste -d $'\a' file1 file2
## The BEL symbol is used by this program to quickly split the input
##
## Sometimes, numbers differ by some kind of rounding error / floating-point implementation
## Ignore that error by subtracting the two values and seeing if they are < maxdiff,
## maxdiff = 1 / (10 ^ (length-after-decimal-point(shortest-value))
## Consider:
## 4.2 vs 4.19998
## The shortest number is 4.2, its length is
## Notes:
## d is the global *d*iff counter
## p is the *p*osition / field that first had a difference
## i is a loop variable,usually current field
## L is the array of fields from the current line of the *L*eft-file
## R is " " " " " " " " " " " *R*ight-file
## clhs is the number of fields in L
## crhs is the number of fields in R
BEGIN {
FS="\a";
DECIMAL_SEP=".";
FIELD_SEP="\t"; # "\t" for postgresql; for mysql, maybe ", ";
MAX_DIFFS=10;
DEBUG=0;
# Efficiently fill out our table of maximum tolerances of values
Maxdiffs[1] = 0.1;
for (i=2; i<31; ++i)
Maxdiffs[i] = Maxdiffs[i-1] / 10;
p=-1;
}
# if -v start=...., skip until that line
NR < (0 + start) { next }
# When pairs don't match, investigate further...
("_" $1) != ("_" $2) {
if (DEBUG>1) print "Line",NR ": Input lines differed somehow. Investigating...";
p=0; # p is field# where difference was found; 0 means whole line
# split each half into tab-delimited fields
clhs=split($1,L,FIELD_SEP);
crhs=split($2,R,FIELD_SEP);
if (clhs == crhs) {
if (DEBUG>1) print "Line",NR ": Same number of tokens in each line, delimited by '" FIELD_SEP "'";
## compare field by field
p = -1; # if we don't set p in the loop below, no real differences
# Compare each field, until a difference is found
for (i=1; i<=clhs && p<0; ++i) {
# Hint: force this compare to be string-based
if (("_" L[i]) != ("_" R[i])) {
if (DEBUG>1) print "Line",NR ": Field",i,"differs somehow";
## They differ... but are they numbers?
if ( \
L[i] ~ /^-?[0-9]+\.[0-9]+([eE][-+][0-9]+)?$/ && \
R[i] ~ /^-?[0-9]+\.[0-9]+([eE][-+][0-9]+)?$/ \
) {
# both fields are floating-point numbers, compare loosely
# strip exponent part
sub(/[eE].*/,"",L[i]);sub(/[eE].*/,"",R[i]);
# determine precision of shortest value
precision=( \
length(L[i]) < length(R[i]) ? \
length(L[i]) - index(L[i],DECIMAL_SEP) : \
length(R[i]) - index(R[i],DECIMAL_SEP) \
);
# look up the maxdiff from our table
maxdiff=Maxdiffs[precision];
diff=(L[1] - R[1]);
if (diff > maxdiff || diff < -maxdiff) {
if (DEBUG) print "Line",NR ": Numbers differed at",i,"between",L[i],"and",R[i],"differing more than",maxdiff;
p=i;
}
else {
if (DEBUG) print "Line",NR ": Numbers differed at",i,"between",L[i],"and",R[i],"but differed less than",maxdiff;
}
}
else {
if (DEBUG) print "Line",NR ": Strings or ints differed at",i,"between",L[i],"and",R[i];
p=i;
}
}
else {
if (DEBUG) print "Line",NR ": No differences found";
}
}
}
# else, field count is different, so whole line is.
else {
if (DEBUG) print "Line",NR ": Number of fields in line differ";
}
}
p>=0 {
++d; # bump total diffs count
# Output a little header for each non-matching records
print "Line",NR,"diffs found so far:",d,(p ? "here at field: " p : "" );
# Output the lines that didnt match
print $1; print $2; print "";
p=-1;
}
# Progress counter
NR % 100000 == 0 { print "Line",NR }
d > MAX_DIFFS { exit(1);}
## NOTE: THE WHITESPACES SHOULD BE TABS
75747358 1 53 2011-03-29 23:00:00+00 7.428
75747359 1 53 2011-03-29 23:30:00+00 5.757
75747360 1 53 2011-03-30 00:00:00+00 6.739
75747361 1 53 2011-03-30 00:30:00+00 6.109
75747362 1 53 2011-03-30 01:00:00+00 6.736 extra-field
75747363 1 53 2011-03-30 01:30:00+00 7.576
75747364 1 53 2011-03-30 02:00:00+00 6.789
75747365 1 53 2011-03-30 02:30:00+00 6.386e+2
75747366 1 53 2011-03-30 03:00:00+00 6.016E-2
75747367 1 53 2011-03-30 03:30:00+00 6.336
75747368 1 53 2011-03-30 04:00:00+00 6.1
75747374 1 53 2011-03-30 07:00:00+00 5.9412
75747375 1 53 2011-03-30 07:30:00+00 6.137803249
## NOTE: THE WHITESPACES SHOULD BE TABS
75747358 1 53 2011-03-28 23:00:00+00 7.428
75747359 1 53 2011-03-29 23:30:00+01 5.757
75747360 1 54 2011-03-30 00:00:00+00 6.74
75747361 1 53 2011-03-30 00:30:00+00 6.109
75747362 1 53 2011-03-30 01:00:00+00 6.73599999999999977
75747363 1 53 2011-03-30 01:30:00+00 7.576e+10
75747364 1 53 2011-03-30 02:00:00+00 6.789e-10
75747365 1 53 2011-03-30 02:30:00+00 6.38600000000000012e+2
75747366 1 53 2011-03-30 03:00:00+00 6.01600000000000001E-2
75747367 1 53 2011-03-30 03:30:00+00 6.3360000000000003
75747368 1 53 2011-03-30 04:00:00+00 6.0999999999999993
75747374 1 53 2011-03-30 07:00:00+00 5.94099999999999984
75747375 1 53 2011-03-30 07:30:00+00 6.13780324900000007
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment