Skip to content

Instantly share code, notes, and snippets.

@ethanresnick
Last active January 23, 2019 19:27
Show Gist options
  • Save ethanresnick/e86661f15c4ea780dc6fbeb44dee9b74 to your computer and use it in GitHub Desktop.
Save ethanresnick/e86661f15c4ea780dc6fbeb44dee9b74 to your computer and use it in GitHub Desktop.
Fix "JSON" that contains invalid strings
// Matches two double quotes and any characters between them, without stopping
// at backslash-escaped double quotes that appear in the middle. This is a lot
// like JS's string literal syntax, except that it will match some characters
// between the double quotes that JS would require be backslash escaped --
// most notably, the newline, which must be \n in string literals.
// Note: we use [^] instead of . below to match any character because JS's
// dot never matches line terminators, even in the presence of the
// multiline flag.
const STRING_LITERAL_LIKE = /"(?:[^"\\]|\\[^])*"/g;

// Matches a raw ASCII control character (U+0000 through U+001F).
const CONTROL_CHAR = /[\u0000-\u001F]/;

// Matches one complete, valid JSON escape sequence: a backslash followed by
// one of b t n f r \ / " or by `u` and exactly four hexadecimal digits.
const VALID_ESCAPE = /\\(?:[btnfr\\/"]|u[0-9a-fA-F]{4})/;

// Matches a backslash that does NOT start a valid JSON escape sequence.
// See test file for description of how some errant backslashes
// can't be matched because they accidentally escape string close
// quotes (directly or indirectly).
// Note: the \u escape takes hexadecimal digits, so [0-9a-fA-F] is required
// here; \d would wrongly flag sequences such as \u00af as errant.
const ERRANT_BACKSLASH = /\\(?![btnfr\\/"]|u[0-9a-fA-F]{4})/;

// Matches raw ascii control characters and errant backslashes.
const INVALID_JSON_STRING_CHARACTER = new RegExp(
  `(?:${CONTROL_CHAR.source})|(?:${ERRANT_BACKSLASH.source})`,
  "g"
);

// Scanner used by fixJSON. It matches either a whole valid escape sequence
// (captured in group 1) or a single invalid character. Consuming valid
// escapes as a unit matters: in `\\x` the second backslash belongs to the
// `\\` escape and must not be re-examined as an errant backslash before `x`.
const JSON_STRING_TOKEN = new RegExp(
  `(${VALID_ESCAPE.source})|${INVALID_JSON_STRING_CHARACTER.source}`,
  "g"
);

// The characters that JSON strings use, after a backslash, to represent a
// character that must be backslash escaped, keyed by their ascii code.
const JSON_CHAR_NAMES = {
  // control character codes to replace with special letters
  8: 'b',
  9: 't',
  10: 'n',
  12: 'f',
  13: 'r',
  // characters that are allowed to or must have a leading backslash,
  // but should be passed through as is after the backslash.
  92: '\\',
  47: '/'
};

/**
 * Takes a single character string where the character is an ASCII control
 * character or a backslash and returns the escape sequence used for
 * representing that character in a JSON string.
 *
 * @param {string} char - The character to escape; only its first UTF-16
 *   code unit is examined.
 * @returns {string} A short escape (e.g. `\n`, `\\`) when JSON_CHAR_NAMES
 *   has one for the character, otherwise a `\uXXXX` escape with four
 *   zero-padded lowercase hex digits.
 */
function escapeJSONStringChar(char) {
  const code = char.charCodeAt(0);
  const charName = JSON_CHAR_NAMES[code];
  if (charName !== undefined) {
    return "\\" + charName;
  }
  // Left-pad the hex code to the four digits that \u requires.
  return "\\u" + ("0000" + code.toString(16)).slice(-4);
}

/**
 * Takes a string that's almost valid JSON except that the control characters
 * and backslashes inside of string literals haven't been properly escaped,
 * and returns a true JSON string formed by escaping those characters.
 * Text outside of double-quoted literals, and escape sequences that are
 * already valid, are returned unchanged.
 *
 * @param {string} jsonLike - The almost-JSON text to repair.
 * @returns {string} The text with every string literal's raw control
 *   characters and errant backslashes escaped.
 */
function fixJSON(jsonLike) {
  return jsonLike.replace(STRING_LITERAL_LIKE, (literal) =>
    literal.replace(JSON_STRING_TOKEN, (token, validEscape) =>
      // Group 1 is only set when a complete valid escape was matched.
      (validEscape === undefined ? escapeJSONStringChar(token) : token)
    )
  );
}
const jsc = require("jsverify");
const R = require("ramda");
const { fixJSON } = require("./index");
// The generator for improper JSON strings that we'll
// feed into our program. Candidates rejected by isHandleableInput
// are filtered out.
const borkedJSONStringLiteral = jsc.suchthat(
  jsc.string.smap(
    // This is likely what the website's doing to build their "JSON":
    // wrap the raw text in double quotes and backslash-escape the double
    // quotes inside it. A regex with the g flag is needed because
    // String.prototype.replace with a string pattern only replaces the
    // first occurrence, which would leave later quotes unescaped.
    x => `"${x.replace(/"/g, '\\"')}"`,
    // Reverse mapping handed to smap: strip the surrounding quotes and
    // undo the quote escaping, again for every occurrence.
    x => x.substring(1, x.length - 1).replace(/\\"/g, '"')
  ),
  isHandleableInput
);
/**
 * A function used by our generator above to skip generated strings
 * that our code can't handle. As described below, there are a few
 * cases that are deeply ambiguous and where we're ok if our code
 * blows up, so we want to filter out inputs that match those cases.
 *
 * @param {string} potentialString - Candidate borked string literal,
 *   including its surrounding double quotes.
 * @returns {boolean} false for the ambiguous cases described in the body,
 *   true otherwise.
 */
function isHandleableInput(potentialString) {
  // If our generated borked string ends with BACKSLASH DQUOTE,
  // (because the string we tried to wrap in quotes ended with a
  // literal backslash), that results in an unterminated string
  // literal that we can't fix (because we won't know irl where
  // the string really should end), so we don't handle that case.
  if (potentialString.endsWith('\\"')) {
    return false;
  }

  // We do the same if the result ends with two quotes, and the
  // second to last quote isn't escaped (imagine a literal
  // BACKSLASH DQUOTE string that goes through our smap above
  // and becomes DQUOTE BACKSLASH BACKSLASH DQUOTE DQUOTE).
  if (potentialString.endsWith('""')) {
    // Count the run of backslashes immediately before the last two quotes
    // by walking backwards from the character preceding them.
    let backslashCount = 0;
    for (
      let i = potentialString.length - 3;
      i >= 0 && potentialString[i] === '\\';
      i--
    ) {
      backslashCount++;
    }
    // An even count (including zero, so the bare literal "" is rejected
    // too) means the second to last quote isn't escaped.
    return backslashCount % 2 !== 0;
  }

  return true;
}
// Test suite checking that whatever fixJSON returns can be parsed by
// JSON.parse, first for a hand-written input and then for generated ones.
describe("Fix strings", () => {
  // Property checked for each generated literal: true when the fixed
  // output parses; otherwise logs the input and the fixed output and
  // reports failure.
  const producesParseableOutput = (str) => {
    try {
      JSON.parse(fixJSON(str));
    } catch (e) {
      console.log(str, fixJSON(str));
      return false;
    }
    return true;
  };

  it("should generate parseable output (manual cases)", () => {
    // An errant backslash followed by a raw newline; JSON.parse throws
    // (failing the test) if the result is not valid JSON.
    JSON.parse(fixJSON("\"\\\n\""));
  });

  // A regular function (not an arrow) so that `this` is the test context
  // whose timeout we disable.
  it("should generate parseable output (auto cases)", function() {
    this.timeout(Infinity);
    const property = jsc.forall(borkedJSONStringLiteral, producesParseableOutput);
    jsc.assert(property, { tests: 8000 });
  });
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment