Last active
December 22, 2022 14:03
-
-
Save Miha-x64/4ed50f1d5593e45a452efbf456aa1db4 to your computer and use it in GitHub Desktop.
HTML/XML escaping utils, answering https://stackoverflow.com/a/61215915/3050249
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.IOException; | |
import java.io.UncheckedIOException; | |
/**
 * XML escaping utils.
 *
 * <p>Copies a {@link CharSequence} into an {@link Appendable}, replacing the characters that are
 * dangerous in the chosen context with character/entity references:
 * <ul>
 *   <li>tag body: {@code &} and {@code <};</li>
 *   <li>double-quoted attribute value: {@code &}, {@code <} and {@code "};</li>
 *   <li>single-quoted attribute value: {@code &}, {@code <} and {@code '}.</li>
 * </ul>
 * All other characters (including {@code >}) are copied unchanged.
 *
 * Source: https://gist.github.com/Miha-x64/4ed50f1d5593e45a452efbf456aa1db4
 */
public final class Xml {

    private Xml() {}

    // Each constant below is a bit set over the char codes 0..63:
    // bit N is set when the character with code N must be escaped in that context.
    private static final long TEXT_ESCAPE = 1L << '&' | 1L << '<';
    private static final long DOUBLE_QUOTED_ATTR_ESCAPE = TEXT_ESCAPE | 1L << '"';
    private static final long SINGLE_QUOTED_ATTR_ESCAPE = TEXT_ESCAPE | 1L << '\'';

    // Union of every character this class can escape; used to compute a character's ordinal
    // (number of escapable characters with a smaller code) which indexes REPL_SLICES.
    private static final long ESCAPES = DOUBLE_QUOTED_ATTR_ESCAPE | SINGLE_QUOTED_ATTR_ESCAPE;

    // 'quot' and 'apos' are 1 char longer than '#34' and '#39' which I've decided to use.
    // Replacements are concatenated in ascending order of the escaped char code:
    // '"' (34), '&' (38), '\'' (39), '<' (60).
    private static final String REPLACEMENTS = "&#34;&amp;&#39;&lt;";

    // These 5-bit numbers packed into a single int are indices within REPLACEMENTS which is
    // a 'flat' String[]: slice k spans [boundary k, boundary k+1).
    private static final int REPL_SLICES =
            /* [0, 5, 10, 15, 19) */ (5 << 5) | (10 << 10) | (15 << 15) | (19 << 20);

    /**
     * Appends {@code content} to {@code appendable}, escaping {@code &} and {@code <}.
     *
     * @param appendable destination to write to
     * @param content    raw text to escape
     * @param <A>        concrete type of the destination, returned for call chaining
     * @return the same {@code appendable} instance
     * @throws UncheckedIOException if the destination throws {@link IOException}
     * @throws NullPointerException if {@code content} is {@code null}
     */
    public static <A extends Appendable> A appendEscapedForTagBody(A appendable, CharSequence content) {
        appendEscaped(appendable, content, TEXT_ESCAPE);
        return appendable;
    }

    /**
     * Appends {@code content} to {@code appendable}, escaping {@code &}, {@code <} and {@code "}.
     *
     * @param appendable destination to write to
     * @param content    raw attribute value to escape
     * @param <A>        concrete type of the destination, returned for call chaining
     * @return the same {@code appendable} instance
     * @throws UncheckedIOException if the destination throws {@link IOException}
     * @throws NullPointerException if {@code content} is {@code null}
     */
    public static <A extends Appendable> A appendEscapedForDoubleQuotedAttrValue(A appendable, CharSequence content) {
        appendEscaped(appendable, content, DOUBLE_QUOTED_ATTR_ESCAPE);
        return appendable;
    }

    /**
     * Appends {@code content} to {@code appendable}, escaping {@code &}, {@code <} and {@code '}.
     *
     * @param appendable destination to write to
     * @param content    raw attribute value to escape
     * @param <A>        concrete type of the destination, returned for call chaining
     * @return the same {@code appendable} instance
     * @throws UncheckedIOException if the destination throws {@link IOException}
     * @throws NullPointerException if {@code content} is {@code null}
     */
    public static <A extends Appendable> A appendEscapedForSingleQuotedAttrValue(A appendable, CharSequence content) {
        appendEscaped(appendable, content, SINGLE_QUOTED_ATTR_ESCAPE);
        return appendable;
    }

    /**
     * Copies {@code content} into {@code builder}, substituting every character whose bit is set
     * in {@code escapes} with its slice of {@link #REPLACEMENTS}. Unescaped runs are appended
     * in bulk rather than char by char.
     */
    private static void appendEscaped(Appendable builder, CharSequence content, long escapes) {
        try {
            int startIdx = 0; // start of the pending run of characters that need no escaping
            final int len = content.length();
            for (int i = 0; i < len; i++) {
                char c = content.charAt(i);
                // Java shifts longs by the 6 least significant bits of the distance only,
                // e. g. << 0b110111111 is the same as << 0b111111, so bigger characters
                // must be filtered out before testing the bit set.
                if ((c & 63) != c) {
                    continue;
                }
                long one = 1L << c;
                if ((one & escapes) == 0) {
                    continue; // not dangerous in this context
                }
                // Ordinal of c among all escapable characters = number of set bits below it.
                int index = Long.bitCount(ESCAPES & (one - 1));
                int replFrom = (REPL_SLICES >>> (5 * index)) & 31;
                int replTo = (REPL_SLICES >>> (5 * (index + 1))) & 31;
                builder
                        .append(content, startIdx, i /* exclusive */)
                        .append(REPLACEMENTS, replFrom, replTo);
                startIdx = i + 1;
            }
            builder.append(content, startIdx, len);
        } catch (IOException e) {
            // typically, our Appendable is StringBuilder which does not throw; also,
            // there's no way to declare 'if A#append() throws E, then appendEscaped() throws E, too'
            throw new UncheckedIOException(e);
        }
    }

    /**
     * Self-check: builds a small HTML document through the three public methods, compares it
     * with the expected markup and prints it.
     *
     * @param args ignored
     * @throws AssertionError if the produced markup differs from the expected one
     */
    public static void main(String[] args) {
        StringBuilder sb =
                new StringBuilder("<!DOCTYPE html>\n<html lang=\"en\">\n<head><title>Test</title></head>\n<body>\n\n");
        appendEscapedForDoubleQuotedAttrValue(sb.append("<p title=\""), "<\"I'm double-quoted!\">").append("\">");
        appendEscapedForTagBody(sb, "<\"Hello!\">").append("</p>\n");
        appendEscapedForSingleQuotedAttrValue(sb.append("<p title='"), "<\"I'm single-quoted!\">").append("'>");
        appendEscapedForTagBody(sb, "<\"Goodbye!\">").append("</p>\n\n</body>\n</html>");
        String escaped = sb.toString();
        String expected = "<!DOCTYPE html>\n<html lang=\"en\">\n<head><title>Test</title></head>\n<body>\n\n" +
                "<p title=\"&lt;&#34;I'm double-quoted!&#34;>\">&lt;\"Hello!\"></p>\n" +
                "<p title='&lt;\"I&#39;m single-quoted!\">'>&lt;\"Goodbye!\"></p>\n" +
                "\n</body>\n</html>";
        if (!expected.equals(escaped)) {
            throw new AssertionError("expected:<" + expected + "> but was:<" + escaped + ">");
        }
        System.out.println(escaped);
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment