Created
November 18, 2013 14:50
-
-
Save jokecamp/7529013 to your computer and use it in GitHub Desktop.
Various ways of sanitizing XML input
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// http://seattlesoftware.wordpress.com/2008/09/11/hexadecimal-value-0-is-an-invalid-character/
/// http://stackoverflow.com/questions/157646/best-way-to-encode-text-data-for-xml/732135#732135
public string Clean(string text) | |
{ | |
return new string(text.Where(XmlConvert.IsXmlChar).ToArray()); | |
} | |
public static string CleanInvalidXmlChars(string text) | |
{ | |
// From xml spec valid chars: | |
// #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] | |
// any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. | |
string re = @"[^\x09\x0A\x0D\x20-\xD7FF\xE000-\xFFFD\x10000-x10FFFF]"; | |
return Regex.Replace(text, re, ""); | |
} | |
private string SanitizeXmlString(string xml) | |
{ | |
if (xml == null) | |
{ | |
throw new ArgumentNullException("xml"); | |
} | |
var buffer = new StringBuilder(xml.Length); | |
foreach (char c in xml) | |
{ | |
if (IsLegalXmlChar(c)) | |
{ | |
buffer.Append(c); | |
} | |
} | |
return buffer.ToString(); | |
} | |
private bool IsLegalXmlChar(int character) | |
{ | |
return | |
( | |
character == 0x9 /* == '\t' == 9 */ || | |
character == 0xA /* == '\n' == 10 */ || | |
character == 0xD /* == '\r' == 13 */ || | |
(character >= 0x20 && character <= 0xD7FF) || | |
(character >= 0xE000 && character <= 0xFFFD) || | |
(character >= 0x10000 && character <= 0x10FFFF) | |
); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment