In C#, can I convert a string value to a string literal, the way I would see it in code? I would like to replace tabs, newlines, etc. with their escape sequences.
If this code:
Console.WriteLine(someString);
produces:
Hello
World!
I want this code:
Console.WriteLine(ToLiteral(someString));
to produce:
\tHello\r\n\tWorld!\r\n
A long time ago, I found this:
private static string ToLiteral(string input)
{
using (var writer = new StringWriter())
{
using (var provider = CodeDomProvider.CreateProvider("CSharp"))
{
provider.GenerateCodeFromExpression(new CodePrimitiveExpression(input), writer, null);
return writer.ToString();
}
}
}
This code:
var input = "\tHello\r\n\tWorld!";
Console.WriteLine(input);
Console.WriteLine(ToLiteral(input));
Produces:
Hello
World!
"\tHello\r\n\tWorld!"
These days, Graham discovered you can use Roslyn's Microsoft.CodeAnalysis.CSharp package on NuGet:
private static string ToLiteral(string valueTextForCompiler)
{
return Microsoft.CodeAnalysis.CSharp.SymbolDisplay.FormatLiteral(valueTextForCompiler, false);
}
Use Regex.Escape(String):
Regex.Escape escapes a minimal set of characters (, *, +, ?, |, {, [,
(,), ^, $,., #, and white space) by replacing them with their escape
codes.
There's a method for this in Roslyn's Microsoft.CodeAnalysis.CSharp package on NuGet:
private static string ToLiteral(string valueTextForCompiler)
{
return Microsoft.CodeAnalysis.CSharp.SymbolDisplay.FormatLiteral(valueTextForCompiler, false);
}
Obviously, this didn't exist at the time of the original question, but it might help people who end up here from Google Search.
This is a fully working implementation, including escaping of Unicode and ASCII non-printable characters. It does not insert "+" signs like Hallgrim's answer.
static string ToLiteral(string input) {
StringBuilder literal = new StringBuilder(input.Length + 2);
literal.Append("\"");
foreach (var c in input) {
switch (c) {
case '\"': literal.Append("\\\""); break;
case '\\': literal.Append(#"\\"); break;
case '\0': literal.Append(#"\0"); break;
case '\a': literal.Append(#"\a"); break;
case '\b': literal.Append(#"\b"); break;
case '\f': literal.Append(#"\f"); break;
case '\n': literal.Append(#"\n"); break;
case '\r': literal.Append(#"\r"); break;
case '\t': literal.Append(#"\t"); break;
case '\v': literal.Append(#"\v"); break;
default:
// ASCII printable character
if (c >= 0x20 && c <= 0x7e) {
literal.Append(c);
// As UTF16 escaped character
} else {
literal.Append(#"\u");
literal.Append(((int)c).ToString("x4"));
}
break;
}
}
literal.Append("\"");
return literal.ToString();
}
Note that this also escapes all Unicode characters. If your environment supports them, you could change that part to escape only control characters:
// UTF16 control characters
} else if (Char.GetUnicodeCategory(c) == UnicodeCategory.Control) {
literal.Append(#"\u");
literal.Append(((int)c).ToString("x4"));
} else {
literal.Append(c);
}
A more structured approach, including all escape sequences for strings and chars, is:
It doesn't replace Unicode characters with their literal equivalent. It doesn't cook eggs, either.
public class ReplaceString
{
static readonly IDictionary<string, string> m_replaceDict
= new Dictionary<string, string>();
const string ms_regexEscapes = #"[\a\b\f\n\r\t\v\\""]";
public static string StringLiteral(string i_string)
{
return Regex.Replace(i_string, ms_regexEscapes, match);
}
public static string CharLiteral(char c)
{
return c == '\'' ? #"'\''" : string.Format("'{0}'", c);
}
private static string match(Match m)
{
string match = m.ToString();
if (m_replaceDict.ContainsKey(match))
{
return m_replaceDict[match];
}
throw new NotSupportedException();
}
static ReplaceString()
{
m_replaceDict.Add("\a", #"\a");
m_replaceDict.Add("\b", #"\b");
m_replaceDict.Add("\f", #"\f");
m_replaceDict.Add("\n", #"\n");
m_replaceDict.Add("\r", #"\r");
m_replaceDict.Add("\t", #"\t");
m_replaceDict.Add("\v", #"\v");
m_replaceDict.Add("\\", #"\\");
m_replaceDict.Add("\0", #"\0");
//The SO parser gets fooled by the verbatim version
//of the string to replace - #"\"""
//so use the 'regular' version
m_replaceDict.Add("\"", "\\\"");
}
static void Main(string[] args){
string s = "here's a \"\n\tstring\" to test";
Console.WriteLine(ReplaceString.StringLiteral(s));
Console.WriteLine(ReplaceString.CharLiteral('c'));
Console.WriteLine(ReplaceString.CharLiteral('\''));
}
}
Try:
var t = HttpUtility.JavaScriptStringEncode(s);
public static class StringHelpers
{
private static Dictionary<string, string> escapeMapping = new Dictionary<string, string>()
{
{"\"", #"\\\"""},
{"\\\\", #"\\"},
{"\a", #"\a"},
{"\b", #"\b"},
{"\f", #"\f"},
{"\n", #"\n"},
{"\r", #"\r"},
{"\t", #"\t"},
{"\v", #"\v"},
{"\0", #"\0"},
};
private static Regex escapeRegex = new Regex(string.Join("|", escapeMapping.Keys.ToArray()));
public static string Escape(this string s)
{
return escapeRegex.Replace(s, EscapeMatchEval);
}
private static string EscapeMatchEval(Match m)
{
if (escapeMapping.ContainsKey(m.Value))
{
return escapeMapping[m.Value];
}
return escapeMapping[Regex.Escape(m.Value)];
}
}
Hallgrim's answer is excellent, but the "+", newline and indent additions were breaking functionality for me. An easy way around it is:
private static string ToLiteral(string input)
{
using (var writer = new StringWriter())
{
using (var provider = CodeDomProvider.CreateProvider("CSharp"))
{
provider.GenerateCodeFromExpression(new CodePrimitiveExpression(input), writer, new CodeGeneratorOptions {IndentString = "\t"});
var literal = writer.ToString();
literal = literal.Replace(string.Format("\" +{0}\t\"", Environment.NewLine), "");
return literal;
}
}
}
Here is a little improvement for Smilediver's answer. It will not escape all no-ASCII characters, but only these are really needed.
using System;
using System.Globalization;
using System.Text;
public static class CodeHelper
{
public static string ToLiteral(this string input)
{
var literal = new StringBuilder(input.Length + 2);
literal.Append("\"");
foreach (var c in input)
{
switch (c)
{
case '\'': literal.Append(#"\'"); break;
case '\"': literal.Append("\\\""); break;
case '\\': literal.Append(#"\\"); break;
case '\0': literal.Append(#"\0"); break;
case '\a': literal.Append(#"\a"); break;
case '\b': literal.Append(#"\b"); break;
case '\f': literal.Append(#"\f"); break;
case '\n': literal.Append(#"\n"); break;
case '\r': literal.Append(#"\r"); break;
case '\t': literal.Append(#"\t"); break;
case '\v': literal.Append(#"\v"); break;
default:
if (Char.GetUnicodeCategory(c) != UnicodeCategory.Control)
{
literal.Append(c);
}
else
{
literal.Append(#"\u");
literal.Append(((ushort)c).ToString("x4"));
}
break;
}
}
literal.Append("\"");
return literal.ToString();
}
}
Interesting question.
If you can't find a better method, you can always replace.
In case you're opting for it, you could use this C# Escape Sequence List:
\' - single quote, needed for character literals
\" - double quote, needed for string literals
\ - backslash
\0 - Unicode character 0
\a - Alert (character 7)
\b - Backspace (character 8)
\f - Form feed (character 12)
\n - New line (character 10)
\r - Carriage return (character 13)
\t - Horizontal tab (character 9)
\v - Vertical quote (character 11)
\uxxxx - Unicode escape sequence for character with hex value xxxx
\xn[n][n][n] - Unicode escape sequence for character with hex value nnnn (variable length version of \uxxxx)
\Uxxxxxxxx - Unicode escape sequence for character with hex value xxxxxxxx (for generating surrogates)
This list can be found in the C# Frequently Asked Questions
What character escape sequences are available?
If JSON conventions are enough for the unescaped strings you want to get escaped and you already use Json.NET (Newtonsoft.Json) in your project (it has a pretty large overhead), you may use this package like the following:
using System;
using Newtonsoft.Json;
public class Program
{
public static void Main()
{
Console.WriteLine(ToLiteral(#"abc\n123"));
}
private static string ToLiteral(string input)
{
return JsonConvert.DeserializeObject<string>("\"" + input + "\"");
}
}
public static class StringEscape
{
static char[] toEscape = "\0\x1\x2\x3\x4\x5\x6\a\b\t\n\v\f\r\xe\xf\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\"\\".ToCharArray();
static string[] literals = #"\0,\x0001,\x0002,\x0003,\x0004,\x0005,\x0006,\a,\b,\t,\n,\v,\f,\r,\x000e,\x000f,\x0010,\x0011,\x0012,\x0013,\x0014,\x0015,\x0016,\x0017,\x0018,\x0019,\x001a,\x001b,\x001c,\x001d,\x001e,\x001f".Split(new char[] { ',' });
public static string Escape(this string input)
{
int i = input.IndexOfAny(toEscape);
if (i < 0) return input;
var sb = new System.Text.StringBuilder(input.Length + 5);
int j = 0;
do
{
sb.Append(input, j, i - j);
var c = input[i];
if (c < 0x20) sb.Append(literals[c]); else sb.Append(#"\").Append(c);
} while ((i = input.IndexOfAny(toEscape, j = ++i)) > 0);
return sb.Append(input, j, input.Length - j).ToString();
}
}
My attempt at adding ToVerbatim to Hallgrim's accepted answer:
private static string ToLiteral(string input)
{
using (var writer = new StringWriter())
{
using (var provider = CodeDomProvider.CreateProvider("CSharp"))
{
provider.GenerateCodeFromExpression(new CodePrimitiveExpression(input), writer, new CodeGeneratorOptions { IndentString = "\t" });
var literal = writer.ToString();
literal = literal.Replace(string.Format("\" +{0}\t\"", Environment.NewLine), "");
return literal;
}
}
}
private static string ToVerbatim(string input)
{
string literal = ToLiteral(input);
string verbatim = "#" + literal.Replace(#"\r\n", Environment.NewLine);
return verbatim;
}
Hallgrim's answer was excellent. Here's a small tweak in case you need to parse out additional white space characters and linebreaks with a C# regular expression. I needed this in the case of a serialized JSON value for insertion into Google Sheets and ran into trouble as the code was inserting tabs, +, spaces, etc.
provider.GenerateCodeFromExpression(new CodePrimitiveExpression(input), writer, null);
var literal = writer.ToString();
var r2 = new Regex(#"\"" \+.\n[\s]+\""", RegexOptions.ECMAScript);
literal = r2.Replace(literal, "");
return literal;
I submit my own implementation, which handles null values and should be more performant on account of using array lookup tables, manual hex conversion, and avoiding switch statements.
using System;
using System.Text;
using System.Linq;
public static class StringLiteralEncoding {
private static readonly char[] HEX_DIGIT_LOWER = "0123456789abcdef".ToCharArray();
private static readonly char[] LITERALENCODE_ESCAPE_CHARS;
static StringLiteralEncoding() {
// Per http://msdn.microsoft.com/en-us/library/h21280bw.aspx
var escapes = new string[] { "\aa", "\bb", "\ff", "\nn", "\rr", "\tt", "\vv", "\"\"", "\\\\", "??", "\00" };
LITERALENCODE_ESCAPE_CHARS = new char[escapes.Max(e => e[0]) + 1];
foreach(var escape in escapes)
LITERALENCODE_ESCAPE_CHARS[escape[0]] = escape[1];
}
/// <summary>
/// Convert the string to the equivalent C# string literal, enclosing the string in double quotes and inserting
/// escape sequences as necessary.
/// </summary>
/// <param name="s">The string to be converted to a C# string literal.</param>
/// <returns><paramref name="s"/> represented as a C# string literal.</returns>
public static string Encode(string s) {
if(null == s) return "null";
var sb = new StringBuilder(s.Length + 2).Append('"');
for(var rp = 0; rp < s.Length; rp++) {
var c = s[rp];
if(c < LITERALENCODE_ESCAPE_CHARS.Length && '\0' != LITERALENCODE_ESCAPE_CHARS[c])
sb.Append('\\').Append(LITERALENCODE_ESCAPE_CHARS[c]);
else if('~' >= c && c >= ' ')
sb.Append(c);
else
sb.Append(#"\x")
.Append(HEX_DIGIT_LOWER[c >> 12 & 0x0F])
.Append(HEX_DIGIT_LOWER[c >> 8 & 0x0F])
.Append(HEX_DIGIT_LOWER[c >> 4 & 0x0F])
.Append(HEX_DIGIT_LOWER[c & 0x0F]);
}
return sb.Append('"').ToString();
}
}
Code:
string someString1 = "\tHello\r\n\tWorld!\r\n";
string someString2 = #"\tHello\r\n\tWorld!\r\n";
Console.WriteLine(someString1);
Console.WriteLine(someString2);
Output:
Hello
World!
\tHello\r\n\tWorld!\r\n
Related
I have a string that contains invalid XML characters. How can I escape (or remove) invalid XML characters before I parse the string?
As the way to remove invalid XML characters I suggest you to use XmlConvert.IsXmlChar method. It was added since .NET Framework 4 and is presented in Silverlight too. Here is the small sample:
void Main() {
string content = "\v\f\0";
Console.WriteLine(IsValidXmlString(content)); // False
content = RemoveInvalidXmlChars(content);
Console.WriteLine(IsValidXmlString(content)); // True
}
static string RemoveInvalidXmlChars(string text) {
var validXmlChars = text.Where(ch => XmlConvert.IsXmlChar(ch)).ToArray();
return new string(validXmlChars);
}
static bool IsValidXmlString(string text) {
try {
XmlConvert.VerifyXmlChars(text);
return true;
} catch {
return false;
}
}
And as the way to escape invalid XML characters I suggest you to use XmlConvert.EncodeName method. Here is the small sample:
void Main() {
const string content = "\v\f\0";
Console.WriteLine(IsValidXmlString(content)); // False
string encoded = XmlConvert.EncodeName(content);
Console.WriteLine(IsValidXmlString(encoded)); // True
string decoded = XmlConvert.DecodeName(encoded);
Console.WriteLine(content == decoded); // True
}
static bool IsValidXmlString(string text) {
try {
XmlConvert.VerifyXmlChars(text);
return true;
} catch {
return false;
}
}
Update:
It should be mentioned that the encoding operation produces a string with a length which is greater or equal than a length of a source string. It might be important when you store a encoded string in a database in a string column with length limitation and validate source string length in your app to fit data column limitation.
Use SecurityElement.Escape
using System;
using System.Security;
class Sample {
static void Main() {
string text = "Escape characters : < > & \" \'";
string xmlText = SecurityElement.Escape(text);
//output:
//Escape characters : < > & " '
Console.WriteLine(xmlText);
}
}
If you are writing xml, just use the classes provided by the framework to create the xml. You won't have to bother with escaping or anything.
Console.Write(new XElement("Data", "< > &"));
Will output
<Data>< > &</Data>
If you need to read an XML file that is malformed, do not use regular expression. Instead, use the Html Agility Pack.
The RemoveInvalidXmlChars method provided by Irishman does not support surrogate characters. To test it, use the following example:
static void Main()
{
const string content = "\v\U00010330";
string newContent = RemoveInvalidXmlChars(content);
Console.WriteLine(newContent);
}
This returns an empty string but it shouldn't! It should return "\U00010330" because the character U+10330 is a valid XML character.
To support surrogate characters, I suggest using the following method:
public static string RemoveInvalidXmlChars(string text)
{
if (string.IsNullOrEmpty(text))
return text;
int length = text.Length;
StringBuilder stringBuilder = new StringBuilder(length);
for (int i = 0; i < length; ++i)
{
if (XmlConvert.IsXmlChar(text[i]))
{
stringBuilder.Append(text[i]);
}
else if (i + 1 < length && XmlConvert.IsXmlSurrogatePair(text[i + 1], text[i]))
{
stringBuilder.Append(text[i]);
stringBuilder.Append(text[i + 1]);
++i;
}
}
return stringBuilder.ToString();
}
Here is an optimized version of the above method RemoveInvalidXmlChars which doesn't create a new array on every call, thus stressing the GC unnecessarily:
public static string RemoveInvalidXmlChars(string text)
{
if (text == null)
return text;
if (text.Length == 0)
return text;
// a bit complicated, but avoids memory usage if not necessary
StringBuilder result = null;
for (int i = 0; i < text.Length; i++)
{
var ch = text[i];
if (XmlConvert.IsXmlChar(ch))
{
result?.Append(ch);
}
else if (result == null)
{
result = new StringBuilder();
result.Append(text.Substring(0, i));
}
}
if (result == null)
return text; // no invalid xml chars detected - return original text
else
return result.ToString();
}
// Replace invalid characters with empty strings.
Regex.Replace(inputString, #"[^\w\.#-]", "");
The regular expression pattern [^\w.#-] matches any character that is not a word character, a period, an # symbol, or a hyphen. A word character is any letter, decimal digit, or punctuation connector such as an underscore. Any character that matches this pattern is replaced by String.Empty, which is the string defined by the replacement pattern. To allow additional characters in user input, add those characters to the character class in the regular expression pattern. For example, the regular expression pattern [^\w.#-\%] also allows a percentage symbol and a backslash in an input string.
Regex.Replace(inputString, #"[!##$%_]", "");
Refer this too :
Removing Invalid Characters from XML Name Tag - RegEx C#
Here is a function to remove the characters from a specified XML string:
using System;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
namespace XMLUtils
{
class Standards
{
/// <summary>
/// Strips non-printable ascii characters
/// Refer to http://www.w3.org/TR/xml11/#charsets for XML 1.1
/// Refer to http://www.w3.org/TR/2006/REC-xml-20060816/#charsets for XML 1.0
/// </summary>
/// <param name="content">contents</param>
/// <param name="XMLVersion">XML Specification to use. Can be 1.0 or 1.1</param>
private void StripIllegalXMLChars(string tmpContents, string XMLVersion)
{
string pattern = String.Empty;
switch (XMLVersion)
{
case "1.0":
pattern = #"#x((10?|[2-F])FFF[EF]|FDD[0-9A-F]|7F|8[0-46-9A-F]9[0-9A-F])";
break;
case "1.1":
pattern = #"#x((10?|[2-F])FFF[EF]|FDD[0-9A-F]|[19][0-9A-F]|7F|8[0-46-9A-F]|0?[1-8BCEF])";
break;
default:
throw new Exception("Error: Invalid XML Version!");
}
Regex regex = new Regex(pattern, RegexOptions.IgnoreCase);
if (regex.IsMatch(tmpContents))
{
tmpContents = regex.Replace(tmpContents, String.Empty);
}
tmpContents = string.Empty;
}
}
}
If you are only escaping invalid XML characters for a string that is used inside of an XML tag you could do something simple like this.
This works when you aren't using an XML library.
public string EscapeXMLCharacters (string target)
{
return
target
.Replace("&", "&")
.Replace("<", "<")
.Replace(">", ">")
.Replace("\"", """)
.Replace("'", "'");
}
you could then call it like so:
public string GetXMLBody(string content)
{
return #"<input>" + EscapeXMLCharacters(content) + "</input>";
}
string XMLWriteStringWithoutIllegalCharacters(string UnfilteredString)
{
if (UnfilteredString == null)
return string.Empty;
return XmlConvert.EncodeName(UnfilteredString);
}
string XMLReadStringWithoutIllegalCharacters(string FilteredString)
{
if (UnfilteredString == null)
return string.Empty;
return XmlConvert.DecodeName(UnfilteredString);
}
This simple method replace the invalid characters with the same value but accepted in the XML context.
To write string use XMLWriteStringWithoutIllegalCharacters(string UnfilteredString).
To read string use XMLReadStringWithoutIllegalCharacters(string FilteredString).
In C#, can I convert a string value to a string literal, the way I would see it in code? I would like to replace tabs, newlines, etc. with their escape sequences.
If this code:
Console.WriteLine(someString);
produces:
Hello
World!
I want this code:
Console.WriteLine(ToLiteral(someString));
to produce:
\tHello\r\n\tWorld!\r\n
A long time ago, I found this:
private static string ToLiteral(string input)
{
using (var writer = new StringWriter())
{
using (var provider = CodeDomProvider.CreateProvider("CSharp"))
{
provider.GenerateCodeFromExpression(new CodePrimitiveExpression(input), writer, null);
return writer.ToString();
}
}
}
This code:
var input = "\tHello\r\n\tWorld!";
Console.WriteLine(input);
Console.WriteLine(ToLiteral(input));
Produces:
Hello
World!
"\tHello\r\n\tWorld!"
These days, Graham discovered you can use Roslyn's Microsoft.CodeAnalysis.CSharp package on NuGet:
private static string ToLiteral(string valueTextForCompiler)
{
return Microsoft.CodeAnalysis.CSharp.SymbolDisplay.FormatLiteral(valueTextForCompiler, false);
}
Use Regex.Escape(String):
Regex.Escape escapes a minimal set of characters (, *, +, ?, |, {, [,
(,), ^, $,., #, and white space) by replacing them with their escape
codes.
There's a method for this in Roslyn's Microsoft.CodeAnalysis.CSharp package on NuGet:
private static string ToLiteral(string valueTextForCompiler)
{
return Microsoft.CodeAnalysis.CSharp.SymbolDisplay.FormatLiteral(valueTextForCompiler, false);
}
Obviously, this didn't exist at the time of the original question, but it might help people who end up here from Google Search.
This is a fully working implementation, including escaping of Unicode and ASCII non-printable characters. It does not insert "+" signs like Hallgrim's answer.
static string ToLiteral(string input) {
StringBuilder literal = new StringBuilder(input.Length + 2);
literal.Append("\"");
foreach (var c in input) {
switch (c) {
case '\"': literal.Append("\\\""); break;
case '\\': literal.Append(#"\\"); break;
case '\0': literal.Append(#"\0"); break;
case '\a': literal.Append(#"\a"); break;
case '\b': literal.Append(#"\b"); break;
case '\f': literal.Append(#"\f"); break;
case '\n': literal.Append(#"\n"); break;
case '\r': literal.Append(#"\r"); break;
case '\t': literal.Append(#"\t"); break;
case '\v': literal.Append(#"\v"); break;
default:
// ASCII printable character
if (c >= 0x20 && c <= 0x7e) {
literal.Append(c);
// As UTF16 escaped character
} else {
literal.Append(#"\u");
literal.Append(((int)c).ToString("x4"));
}
break;
}
}
literal.Append("\"");
return literal.ToString();
}
Note that this also escapes all Unicode characters. If your environment supports them, you could change that part to escape only control characters:
// UTF16 control characters
} else if (Char.GetUnicodeCategory(c) == UnicodeCategory.Control) {
literal.Append(#"\u");
literal.Append(((int)c).ToString("x4"));
} else {
literal.Append(c);
}
A more structured approach, including all escape sequences for strings and chars, is:
It doesn't replace Unicode characters with their literal equivalent. It doesn't cook eggs, either.
public class ReplaceString
{
static readonly IDictionary<string, string> m_replaceDict
= new Dictionary<string, string>();
const string ms_regexEscapes = #"[\a\b\f\n\r\t\v\\""]";
public static string StringLiteral(string i_string)
{
return Regex.Replace(i_string, ms_regexEscapes, match);
}
public static string CharLiteral(char c)
{
return c == '\'' ? #"'\''" : string.Format("'{0}'", c);
}
private static string match(Match m)
{
string match = m.ToString();
if (m_replaceDict.ContainsKey(match))
{
return m_replaceDict[match];
}
throw new NotSupportedException();
}
static ReplaceString()
{
m_replaceDict.Add("\a", #"\a");
m_replaceDict.Add("\b", #"\b");
m_replaceDict.Add("\f", #"\f");
m_replaceDict.Add("\n", #"\n");
m_replaceDict.Add("\r", #"\r");
m_replaceDict.Add("\t", #"\t");
m_replaceDict.Add("\v", #"\v");
m_replaceDict.Add("\\", #"\\");
m_replaceDict.Add("\0", #"\0");
//The SO parser gets fooled by the verbatim version
//of the string to replace - #"\"""
//so use the 'regular' version
m_replaceDict.Add("\"", "\\\"");
}
static void Main(string[] args){
string s = "here's a \"\n\tstring\" to test";
Console.WriteLine(ReplaceString.StringLiteral(s));
Console.WriteLine(ReplaceString.CharLiteral('c'));
Console.WriteLine(ReplaceString.CharLiteral('\''));
}
}
Try:
var t = HttpUtility.JavaScriptStringEncode(s);
public static class StringHelpers
{
private static Dictionary<string, string> escapeMapping = new Dictionary<string, string>()
{
{"\"", #"\\\"""},
{"\\\\", #"\\"},
{"\a", #"\a"},
{"\b", #"\b"},
{"\f", #"\f"},
{"\n", #"\n"},
{"\r", #"\r"},
{"\t", #"\t"},
{"\v", #"\v"},
{"\0", #"\0"},
};
private static Regex escapeRegex = new Regex(string.Join("|", escapeMapping.Keys.ToArray()));
public static string Escape(this string s)
{
return escapeRegex.Replace(s, EscapeMatchEval);
}
private static string EscapeMatchEval(Match m)
{
if (escapeMapping.ContainsKey(m.Value))
{
return escapeMapping[m.Value];
}
return escapeMapping[Regex.Escape(m.Value)];
}
}
Hallgrim's answer is excellent, but the "+", newline and indent additions were breaking functionality for me. An easy way around it is:
private static string ToLiteral(string input)
{
using (var writer = new StringWriter())
{
using (var provider = CodeDomProvider.CreateProvider("CSharp"))
{
provider.GenerateCodeFromExpression(new CodePrimitiveExpression(input), writer, new CodeGeneratorOptions {IndentString = "\t"});
var literal = writer.ToString();
literal = literal.Replace(string.Format("\" +{0}\t\"", Environment.NewLine), "");
return literal;
}
}
}
Here is a little improvement for Smilediver's answer. It will not escape all no-ASCII characters, but only these are really needed.
using System;
using System.Globalization;
using System.Text;
public static class CodeHelper
{
public static string ToLiteral(this string input)
{
var literal = new StringBuilder(input.Length + 2);
literal.Append("\"");
foreach (var c in input)
{
switch (c)
{
case '\'': literal.Append(#"\'"); break;
case '\"': literal.Append("\\\""); break;
case '\\': literal.Append(#"\\"); break;
case '\0': literal.Append(#"\0"); break;
case '\a': literal.Append(#"\a"); break;
case '\b': literal.Append(#"\b"); break;
case '\f': literal.Append(#"\f"); break;
case '\n': literal.Append(#"\n"); break;
case '\r': literal.Append(#"\r"); break;
case '\t': literal.Append(#"\t"); break;
case '\v': literal.Append(#"\v"); break;
default:
if (Char.GetUnicodeCategory(c) != UnicodeCategory.Control)
{
literal.Append(c);
}
else
{
literal.Append(#"\u");
literal.Append(((ushort)c).ToString("x4"));
}
break;
}
}
literal.Append("\"");
return literal.ToString();
}
}
Interesting question.
If you can't find a better method, you can always replace.
In case you're opting for it, you could use this C# Escape Sequence List:
\' - single quote, needed for character literals
\" - double quote, needed for string literals
\ - backslash
\0 - Unicode character 0
\a - Alert (character 7)
\b - Backspace (character 8)
\f - Form feed (character 12)
\n - New line (character 10)
\r - Carriage return (character 13)
\t - Horizontal tab (character 9)
\v - Vertical quote (character 11)
\uxxxx - Unicode escape sequence for character with hex value xxxx
\xn[n][n][n] - Unicode escape sequence for character with hex value nnnn (variable length version of \uxxxx)
\Uxxxxxxxx - Unicode escape sequence for character with hex value xxxxxxxx (for generating surrogates)
This list can be found in the C# Frequently Asked Questions
What character escape sequences are available?
If JSON conventions are enough for the unescaped strings you want to get escaped and you already use Json.NET (Newtonsoft.Json) in your project (it has a pretty large overhead), you may use this package like the following:
using System;
using Newtonsoft.Json;
public class Program
{
public static void Main()
{
Console.WriteLine(ToLiteral(#"abc\n123"));
}
private static string ToLiteral(string input)
{
return JsonConvert.DeserializeObject<string>("\"" + input + "\"");
}
}
public static class StringEscape
{
static char[] toEscape = "\0\x1\x2\x3\x4\x5\x6\a\b\t\n\v\f\r\xe\xf\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\"\\".ToCharArray();
static string[] literals = #"\0,\x0001,\x0002,\x0003,\x0004,\x0005,\x0006,\a,\b,\t,\n,\v,\f,\r,\x000e,\x000f,\x0010,\x0011,\x0012,\x0013,\x0014,\x0015,\x0016,\x0017,\x0018,\x0019,\x001a,\x001b,\x001c,\x001d,\x001e,\x001f".Split(new char[] { ',' });
public static string Escape(this string input)
{
int i = input.IndexOfAny(toEscape);
if (i < 0) return input;
var sb = new System.Text.StringBuilder(input.Length + 5);
int j = 0;
do
{
sb.Append(input, j, i - j);
var c = input[i];
if (c < 0x20) sb.Append(literals[c]); else sb.Append(#"\").Append(c);
} while ((i = input.IndexOfAny(toEscape, j = ++i)) > 0);
return sb.Append(input, j, input.Length - j).ToString();
}
}
My attempt at adding ToVerbatim to Hallgrim's accepted answer:
private static string ToLiteral(string input)
{
using (var writer = new StringWriter())
{
using (var provider = CodeDomProvider.CreateProvider("CSharp"))
{
provider.GenerateCodeFromExpression(new CodePrimitiveExpression(input), writer, new CodeGeneratorOptions { IndentString = "\t" });
var literal = writer.ToString();
literal = literal.Replace(string.Format("\" +{0}\t\"", Environment.NewLine), "");
return literal;
}
}
}
private static string ToVerbatim(string input)
{
string literal = ToLiteral(input);
string verbatim = "#" + literal.Replace(#"\r\n", Environment.NewLine);
return verbatim;
}
Hallgrim's answer was excellent. Here's a small tweak in case you need to parse out additional white space characters and linebreaks with a C# regular expression. I needed this in the case of a serialized JSON value for insertion into Google Sheets and ran into trouble as the code was inserting tabs, +, spaces, etc.
provider.GenerateCodeFromExpression(new CodePrimitiveExpression(input), writer, null);
var literal = writer.ToString();
var r2 = new Regex(#"\"" \+.\n[\s]+\""", RegexOptions.ECMAScript);
literal = r2.Replace(literal, "");
return literal;
I submit my own implementation, which handles null values and should be more performant on account of using array lookup tables, manual hex conversion, and avoiding switch statements.
using System;
using System.Text;
using System.Linq;
public static class StringLiteralEncoding {
private static readonly char[] HEX_DIGIT_LOWER = "0123456789abcdef".ToCharArray();
private static readonly char[] LITERALENCODE_ESCAPE_CHARS;
static StringLiteralEncoding() {
// Per http://msdn.microsoft.com/en-us/library/h21280bw.aspx
var escapes = new string[] { "\aa", "\bb", "\ff", "\nn", "\rr", "\tt", "\vv", "\"\"", "\\\\", "??", "\00" };
LITERALENCODE_ESCAPE_CHARS = new char[escapes.Max(e => e[0]) + 1];
foreach(var escape in escapes)
LITERALENCODE_ESCAPE_CHARS[escape[0]] = escape[1];
}
/// <summary>
/// Convert the string to the equivalent C# string literal, enclosing the string in double quotes and inserting
/// escape sequences as necessary.
/// </summary>
/// <param name="s">The string to be converted to a C# string literal.</param>
/// <returns><paramref name="s"/> represented as a C# string literal.</returns>
public static string Encode(string s) {
if(null == s) return "null";
var sb = new StringBuilder(s.Length + 2).Append('"');
for(var rp = 0; rp < s.Length; rp++) {
var c = s[rp];
if(c < LITERALENCODE_ESCAPE_CHARS.Length && '\0' != LITERALENCODE_ESCAPE_CHARS[c])
sb.Append('\\').Append(LITERALENCODE_ESCAPE_CHARS[c]);
else if('~' >= c && c >= ' ')
sb.Append(c);
else
sb.Append(#"\x")
.Append(HEX_DIGIT_LOWER[c >> 12 & 0x0F])
.Append(HEX_DIGIT_LOWER[c >> 8 & 0x0F])
.Append(HEX_DIGIT_LOWER[c >> 4 & 0x0F])
.Append(HEX_DIGIT_LOWER[c & 0x0F]);
}
return sb.Append('"').ToString();
}
}
Code:
string someString1 = "\tHello\r\n\tWorld!\r\n";
string someString2 = #"\tHello\r\n\tWorld!\r\n";
Console.WriteLine(someString1);
Console.WriteLine(someString2);
Output:
Hello
World!
\tHello\r\n\tWorld!\r\n
I'm using Azure storage tables and I have data going in to the RowKey that has slashes in it. According to this MSDN page, the following characters are disallowed in both the PartitionKey and RowKey:
The forward slash (/) character
The backslash () character
The number sign (#) character
The question mark (?) character
Control characters from U+0000 to U+001F, including:
The horizontal tab (\t) character
The linefeed (\n) character
The carriage return (\r) character
Control characters from U+007F to U+009F
I've seen some people use URL encoding to get around this. Unfortunately there's a few glitches that can arise from this, such as being able to insert but unable to delete certain entities. I've also seen some people use base64 encoding, however this also can contain disallowed characters.
How can I encode my RowKey efficiently without running in to disallowed characters, or rolling my own encoding?
Updated 18-Aug-2020 for (new?) issue with '+' character in Azure Search. See comments from #mladenb below for background. Of note, the documentation page referenced does not exclude the '+' character.
When a URL is Base64 encoded, the only character that is invalid in an Azure Table Storage key column is the forward slash ('/'). To address this, simply replace the forward slash character with another character that is both (1) valid in an Azure Table Storage key column and (2) not a Base64 character. The most common example I have found (which is cited in other answers) is to replace the forward slash ('/') with the underscore ('_').
private static String EncodeUrlInKey(String url)
{
var keyBytes = System.Text.Encoding.UTF8.GetBytes(url);
var base64 = System.Convert.ToBase64String(keyBytes);
return base64.Replace('/','_').Replace('+','-');
}
When decoding, simply undo the replaced character (first!) and then Base64 decode the resulting string. That's all there is to it.
private static String DecodeUrlInKey(String encodedKey)
{
var base64 = encodedKey.Replace('-','+').Replace('_', '/');
byte[] bytes = System.Convert.FromBase64String(base64);
return System.Text.Encoding.UTF8.GetString(bytes);
}
Some people have suggested that other Base64 characters also need encoding. According to the Azure Table Storage docs this is not the case.
I ran into the same need.
I wasn't satisfied with Base64 encoding because it turns a human-readable string into an unrecognizable string, and will inflate the size of strings regardless of whether they follow the rules (a loss when the great majority of characters are not illegal characters that need to be escaped).
Here's a coder/decoder using '!' as an escape character in much the same way one would traditionally use the backslash character.
public static class TableKeyEncoding
{
// https://msdn.microsoft.com/library/azure/dd179338.aspx
//
// The following characters are not allowed in values for the PartitionKey and RowKey properties:
// The forward slash(/) character
// The backslash(\) character
// The number sign(#) character
// The question mark (?) character
// Control characters from U+0000 to U+001F, including:
// The horizontal tab(\t) character
// The linefeed(\n) character
// The carriage return (\r) character
// Control characters from U+007F to U+009F
public static string Encode(string unsafeForUseAsAKey)
{
StringBuilder safe = new StringBuilder();
foreach (char c in unsafeForUseAsAKey)
{
switch (c)
{
case '/':
safe.Append("!f");
break;
case '\\':
safe.Append("!b");
break;
case '#':
safe.Append("!p");
break;
case '?':
safe.Append("!q");
break;
case '\t':
safe.Append("!t");
break;
case '\n':
safe.Append("!n");
break;
case '\r':
safe.Append("!r");
break;
case '!':
safe.Append("!!");
break;
default:
if (c <= 0x1f || (c >= 0x7f && c <= 0x9f))
{
int charCode = c;
safe.Append("!x" + charCode.ToString("x2"));
}
else
{
safe.Append(c);
}
break;
}
}
return safe.ToString();
}
public static string Decode(string key)
{
StringBuilder decoded = new StringBuilder();
int i = 0;
while (i < key.Length)
{
char c = key[i++];
if (c != '!' || i == key.Length)
{
// There's no escape character ('!'), or the escape should be ignored because it's the end of the array
decoded.Append(c);
}
else
{
char escapeCode = key[i++];
switch (escapeCode)
{
case 'f':
decoded.Append('/');
break;
case 'b':
decoded.Append('\\');
break;
case 'p':
decoded.Append('#');
break;
case 'q':
decoded.Append('?');
break;
case 't':
decoded.Append('\t');
break;
case 'n':
decoded.Append("\n");
break;
case 'r':
decoded.Append("\r");
break;
case '!':
decoded.Append('!');
break;
case 'x':
if (i + 2 <= key.Length)
{
string charCodeString = key.Substring(i, 2);
int charCode;
if (int.TryParse(charCodeString, NumberStyles.HexNumber, NumberFormatInfo.InvariantInfo, out charCode))
{
decoded.Append((char)charCode);
}
i += 2;
}
break;
default:
decoded.Append('!');
break;
}
}
}
return decoded.ToString();
}
}
Since one should use extreme caution when writing your own encoder, I have written some unit tests for it as well.
using Xunit;
namespace xUnit_Tests
{
public class TableKeyEncodingTests
{
const char Unicode0X1A = (char) 0x1a;
public void RoundTripTest(string unencoded, string encoded)
{
Assert.Equal(encoded, TableKeyEncoding.Encode(unencoded));
Assert.Equal(unencoded, TableKeyEncoding.Decode(encoded));
}
[Fact]
public void RoundTrips()
{
RoundTripTest("!\n", "!!!n");
RoundTripTest("left" + Unicode0X1A + "right", "left!x1aright");
}
// The following characters are not allowed in values for the PartitionKey and RowKey properties:
// The forward slash(/) character
// The backslash(\) character
// The number sign(#) character
// The question mark (?) character
// Control characters from U+0000 to U+001F, including:
// The horizontal tab(\t) character
// The linefeed(\n) character
// The carriage return (\r) character
// Control characters from U+007F to U+009F
[Fact]
void EncodesAllForbiddenCharacters()
{
List<char> forbiddenCharacters = "\\/#?\t\n\r".ToCharArray().ToList();
forbiddenCharacters.AddRange(Enumerable.Range(0x00, 1+(0x1f-0x00)).Select(i => (char)i));
forbiddenCharacters.AddRange(Enumerable.Range(0x7f, 1+(0x9f-0x7f)).Select(i => (char)i));
string allForbiddenCharacters = String.Join("", forbiddenCharacters);
string allForbiddenCharactersEncoded = TableKeyEncoding.Encode(allForbiddenCharacters);
// Make sure decoding is same as encoding
Assert.Equal(allForbiddenCharacters, TableKeyEncoding.Decode(allForbiddenCharactersEncoded));
// Ensure encoding does not contain any forbidden characters
Assert.Equal(0, allForbiddenCharacters.Count( c => allForbiddenCharactersEncoded.Contains(c) ));
}
}
}
How about URL encode/decode functions. It takes care of '/', '?' and '#' characters.
string url = "http://www.google.com/search?q=Example";
string key = HttpUtility.UrlEncode(url);
string urlBack = HttpUtility.UrlDecode(key);
see these links
https://www.rfc-editor.org/rfc/rfc4648#page-7
Code for decoding/encoding a modified base64 URL (see also second answer: https://stackoverflow.com/a/1789179/1094268)
I had the problem myself. These are my own functions I use for this now. I use the trick in the second answer I mentioned, as well as changing up the + and / which are incompatible with azure keys that may still appear.
private static String EncodeSafeBase64(String toEncode)
{
if (toEncode == null)
throw new ArgumentNullException("toEncode");
String base64String = Convert.ToBase64String(Encoding.UTF8.GetBytes(toEncode));
StringBuilder safe = new StringBuilder();
foreach (Char c in base64String)
{
switch (c)
{
case '+':
safe.Append('-');
break;
case '/':
safe.Append('_');
break;
default:
safe.Append(c);
break;
}
}
return safe.ToString();
}
private static String DecodeSafeBase64(String toDecode)
{
if (toDecode == null)
throw new ArgumentNullException("toDecode");
StringBuilder deSafe = new StringBuilder();
foreach (Char c in toDecode)
{
switch (c)
{
case '-':
deSafe.Append('+');
break;
case '_':
deSafe.Append('/');
break;
default:
deSafe.Append(c);
break;
}
}
return Encoding.UTF8.GetString(Convert.FromBase64String(deSafe.ToString()));
}
If it is just the slashes, you can simply replace them on writing to the table with another character, say, '|' and re-replace them on reading.
What I have seen is that although alot of non-alphanumeric characters are technically allowed it doesn't really work very well as partition and row key.
I looked at the answears already given here and other places and wrote this:
https://github.com/JohanNorberg/AlphaNumeric
Two alpha-numeric encoders.
If you need to escape a string that is mostly alphanumeric you can use this:
AlphaNumeric.English.Encode(str);
If you need to escape a string that is mostly not alphanumeric you can use this:
AlphaNumeric.Data.EncodeString(str);
Encoding data:
var base64 = Convert.ToBase64String(bytes);
var alphaNumericEncodedString = base64
.Replace("0", "01")
.Replace("+", "02")
.Replace("/", "03")
.Replace("=", "04");
But, if you want to use for example an email adress as a rowkey you would only want to escape the '#' and '.'. This code will do that:
char[] validChars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ3456789".ToCharArray();
char[] allChars = rawString.ToCharArray();
StringBuilder builder = new StringBuilder(rawString.Length * 2);
for(int i = 0; i < allChars.Length; i++)
{
int c = allChars[i];
if((c >= 51 && c <= 57) || (c >= 65 && c <= 90) || (c >= 97 && c <= 122))
{
builder.Append(allChars[i]);
}
else
{
int index = builder.Length;
int count = 0;
do
{
builder.Append(validChars[c % 59]);
c /= 59;
count++;
} while (c > 0);
if (count == 1) builder.Insert(index, '0');
else if (count == 2) builder.Insert(index, '1');
else if (count == 3) builder.Insert(index, '2');
else throw new Exception("Base59 has invalid count, method must be wrong Count is: " + count);
}
}
return builder.ToString();
I have a string that contains invalid XML characters. How can I escape (or remove) invalid XML characters before I parse the string?
As the way to remove invalid XML characters I suggest you to use XmlConvert.IsXmlChar method. It was added since .NET Framework 4 and is presented in Silverlight too. Here is the small sample:
void Main() {
string content = "\v\f\0";
Console.WriteLine(IsValidXmlString(content)); // False
content = RemoveInvalidXmlChars(content);
Console.WriteLine(IsValidXmlString(content)); // True
}
static string RemoveInvalidXmlChars(string text) {
var validXmlChars = text.Where(ch => XmlConvert.IsXmlChar(ch)).ToArray();
return new string(validXmlChars);
}
static bool IsValidXmlString(string text) {
try {
XmlConvert.VerifyXmlChars(text);
return true;
} catch {
return false;
}
}
And as the way to escape invalid XML characters I suggest you to use XmlConvert.EncodeName method. Here is the small sample:
void Main() {
const string content = "\v\f\0";
Console.WriteLine(IsValidXmlString(content)); // False
string encoded = XmlConvert.EncodeName(content);
Console.WriteLine(IsValidXmlString(encoded)); // True
string decoded = XmlConvert.DecodeName(encoded);
Console.WriteLine(content == decoded); // True
}
static bool IsValidXmlString(string text) {
try {
XmlConvert.VerifyXmlChars(text);
return true;
} catch {
return false;
}
}
Update:
It should be mentioned that the encoding operation produces a string with a length which is greater or equal than a length of a source string. It might be important when you store a encoded string in a database in a string column with length limitation and validate source string length in your app to fit data column limitation.
Use SecurityElement.Escape
using System;
using System.Security;
class Sample {
static void Main() {
string text = "Escape characters : < > & \" \'";
string xmlText = SecurityElement.Escape(text);
//output:
//Escape characters : < > & " '
Console.WriteLine(xmlText);
}
}
If you are writing xml, just use the classes provided by the framework to create the xml. You won't have to bother with escaping or anything.
Console.Write(new XElement("Data", "< > &"));
Will output
<Data>< > &</Data>
If you need to read an XML file that is malformed, do not use regular expression. Instead, use the Html Agility Pack.
The RemoveInvalidXmlChars method provided by Irishman does not support surrogate characters. To test it, use the following example:
static void Main()
{
const string content = "\v\U00010330";
string newContent = RemoveInvalidXmlChars(content);
Console.WriteLine(newContent);
}
This returns an empty string but it shouldn't! It should return "\U00010330" because the character U+10330 is a valid XML character.
To support surrogate characters, I suggest using the following method:
public static string RemoveInvalidXmlChars(string text)
{
if (string.IsNullOrEmpty(text))
return text;
int length = text.Length;
StringBuilder stringBuilder = new StringBuilder(length);
for (int i = 0; i < length; ++i)
{
if (XmlConvert.IsXmlChar(text[i]))
{
stringBuilder.Append(text[i]);
}
else if (i + 1 < length && XmlConvert.IsXmlSurrogatePair(text[i + 1], text[i]))
{
stringBuilder.Append(text[i]);
stringBuilder.Append(text[i + 1]);
++i;
}
}
return stringBuilder.ToString();
}
Here is an optimized version of the above method RemoveInvalidXmlChars which doesn't create a new array on every call, thus stressing the GC unnecessarily:
public static string RemoveInvalidXmlChars(string text)
{
if (text == null)
return text;
if (text.Length == 0)
return text;
// a bit complicated, but avoids memory usage if not necessary
StringBuilder result = null;
for (int i = 0; i < text.Length; i++)
{
var ch = text[i];
if (XmlConvert.IsXmlChar(ch))
{
result?.Append(ch);
}
else if (result == null)
{
result = new StringBuilder();
result.Append(text.Substring(0, i));
}
}
if (result == null)
return text; // no invalid xml chars detected - return original text
else
return result.ToString();
}
// Replace invalid characters with empty strings.
Regex.Replace(inputString, #"[^\w\.#-]", "");
The regular expression pattern [^\w.#-] matches any character that is not a word character, a period, an # symbol, or a hyphen. A word character is any letter, decimal digit, or punctuation connector such as an underscore. Any character that matches this pattern is replaced by String.Empty, which is the string defined by the replacement pattern. To allow additional characters in user input, add those characters to the character class in the regular expression pattern. For example, the regular expression pattern [^\w.#-\%] also allows a percentage symbol and a backslash in an input string.
Regex.Replace(inputString, #"[!##$%_]", "");
Refer this too :
Removing Invalid Characters from XML Name Tag - RegEx C#
Here is a function to remove the characters from a specified XML string:
using System;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
namespace XMLUtils
{
class Standards
{
/// <summary>
/// Strips non-printable ascii characters
/// Refer to http://www.w3.org/TR/xml11/#charsets for XML 1.1
/// Refer to http://www.w3.org/TR/2006/REC-xml-20060816/#charsets for XML 1.0
/// </summary>
/// <param name="content">contents</param>
/// <param name="XMLVersion">XML Specification to use. Can be 1.0 or 1.1</param>
private void StripIllegalXMLChars(string tmpContents, string XMLVersion)
{
string pattern = String.Empty;
switch (XMLVersion)
{
case "1.0":
pattern = #"#x((10?|[2-F])FFF[EF]|FDD[0-9A-F]|7F|8[0-46-9A-F]9[0-9A-F])";
break;
case "1.1":
pattern = #"#x((10?|[2-F])FFF[EF]|FDD[0-9A-F]|[19][0-9A-F]|7F|8[0-46-9A-F]|0?[1-8BCEF])";
break;
default:
throw new Exception("Error: Invalid XML Version!");
}
Regex regex = new Regex(pattern, RegexOptions.IgnoreCase);
if (regex.IsMatch(tmpContents))
{
tmpContents = regex.Replace(tmpContents, String.Empty);
}
tmpContents = string.Empty;
}
}
}
If you are only escaping invalid XML characters for a string that is used inside of an XML tag you could do something simple like this.
This works when you aren't using an XML library.
public string EscapeXMLCharacters (string target)
{
return
target
.Replace("&", "&")
.Replace("<", "<")
.Replace(">", ">")
.Replace("\"", """)
.Replace("'", "'");
}
you could then call it like so:
public string GetXMLBody(string content)
{
return #"<input>" + EscapeXMLCharacters(content) + "</input>";
}
string XMLWriteStringWithoutIllegalCharacters(string UnfilteredString)
{
if (UnfilteredString == null)
return string.Empty;
return XmlConvert.EncodeName(UnfilteredString);
}
string XMLReadStringWithoutIllegalCharacters(string FilteredString)
{
if (UnfilteredString == null)
return string.Empty;
return XmlConvert.DecodeName(UnfilteredString);
}
This simple method replace the invalid characters with the same value but accepted in the XML context.
To write string use XMLWriteStringWithoutIllegalCharacters(string UnfilteredString).
To read string use XMLReadStringWithoutIllegalCharacters(string FilteredString).
What is the best way to convert from Pascal Case (upper Camel Case) to a sentence.
For example starting with
"AwaitingFeedback"
and converting that to
"Awaiting feedback"
C# preferable but I could convert it from Java or similar.
public static string ToSentenceCase(this string str)
{
return Regex.Replace(str, "[a-z][A-Z]", m => m.Value[0] + " " + char.ToLower(m.Value[1]));
}
In versions of visual studio after 2015, you can do
public static string ToSentenceCase(this string str)
{
return Regex.Replace(str, "[a-z][A-Z]", m => $"{m.Value[0]} {char.ToLower(m.Value[1])}");
}
Based on: Converting Pascal case to sentences using regular expression
I will prefer to use Humanizer for this. Humanizer is a Portable Class Library that meets all your .NET needs for manipulating and displaying strings, enums, dates, times, timespans, numbers and quantities.
Short Answer
"AwaitingFeedback".Humanize() => Awaiting feedback
Long and Descriptive Answer
Humanizer can do a lot more work other examples are:
"PascalCaseInputStringIsTurnedIntoSentence".Humanize() => "Pascal case input string is turned into sentence"
"Underscored_input_string_is_turned_into_sentence".Humanize() => "Underscored input string is turned into sentence"
"Can_return_title_Case".Humanize(LetterCasing.Title) => "Can Return Title Case"
"CanReturnLowerCase".Humanize(LetterCasing.LowerCase) => "can return lower case"
Complete code is :
using Humanizer;
using static System.Console;
namespace HumanizerConsoleApp
{
class Program
{
static void Main(string[] args)
{
WriteLine("AwaitingFeedback".Humanize());
WriteLine("PascalCaseInputStringIsTurnedIntoSentence".Humanize());
WriteLine("Underscored_input_string_is_turned_into_sentence".Humanize());
WriteLine("Can_return_title_Case".Humanize(LetterCasing.Title));
WriteLine("CanReturnLowerCase".Humanize(LetterCasing.LowerCase));
}
}
}
Output
Awaiting feedback
Pascal case input string is turned into sentence
Underscored input string is turned into sentence Can Return Title Case
can return lower case
If you prefer to write your own C# code you can achieve this by writing some C# code stuff as answered by others already.
Here you go...
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
namespace CamelCaseToString
{
class Program
{
static void Main(string[] args)
{
Console.WriteLine(CamelCaseToString("ThisIsYourMasterCallingYou"));
}
private static string CamelCaseToString(string str)
{
if (str == null || str.Length == 0)
return null;
StringBuilder retVal = new StringBuilder(32);
retVal.Append(char.ToUpper(str[0]));
for (int i = 1; i < str.Length; i++ )
{
if (char.IsLower(str[i]))
{
retVal.Append(str[i]);
}
else
{
retVal.Append(" ");
retVal.Append(char.ToLower(str[i]));
}
}
return retVal.ToString();
}
}
}
This works for me:
Regex.Replace(strIn, "([A-Z]{1,2}|[0-9]+)", " $1").TrimStart()
This is just like #SSTA, but is more efficient than calling TrimStart.
Regex.Replace("ThisIsMyCapsDelimitedString", "(\\B[A-Z])", " $1")
Found this in the MvcContrib source, doesn't seem to be mentioned here yet.
return Regex.Replace(input, "([A-Z])", " $1", RegexOptions.Compiled).Trim();
Just because everyone has been using Regex (except this guy), here's an implementation with StringBuilder that was about 5x faster in my tests. Includes checking for numbers too.
"SomeBunchOfCamelCase2".FromCamelCaseToSentence == "Some Bunch Of Camel Case 2"
public static string FromCamelCaseToSentence(this string input) {
if(string.IsNullOrEmpty(input)) return input;
var sb = new StringBuilder();
// start with the first character -- consistent camelcase and pascal case
sb.Append(char.ToUpper(input[0]));
// march through the rest of it
for(var i = 1; i < input.Length; i++) {
// any time we hit an uppercase OR number, it's a new word
if(char.IsUpper(input[i]) || char.IsDigit(input[i])) sb.Append(' ');
// add regularly
sb.Append(input[i]);
}
return sb.ToString();
}
Here's a basic way of doing it that I came up with using Regex
public static string CamelCaseToSentence(this string value)
{
var sb = new StringBuilder();
var firstWord = true;
foreach (var match in Regex.Matches(value, "([A-Z][a-z]+)|[0-9]+"))
{
if (firstWord)
{
sb.Append(match.ToString());
firstWord = false;
}
else
{
sb.Append(" ");
sb.Append(match.ToString().ToLower());
}
}
return sb.ToString();
}
It will also split off numbers which I didn't specify but would be useful.
string camel = "MyCamelCaseString";
string s = Regex.Replace(camel, "([A-Z])", " $1").ToLower().Trim();
Console.WriteLine(s.Substring(0,1).ToUpper() + s.Substring(1));
Edit: didn't notice your casing requirements, modifed accordingly. You could use a matchevaluator to do the casing, but I think a substring is easier. You could also wrap it in a 2nd regex replace where you change the first character
"^\w"
to upper
\U (i think)
I'd use a regex, inserting a space before each upper case character, then lowering all the string.
string spacedString = System.Text.RegularExpressions.Regex.Replace(yourString, "\B([A-Z])", " \k");
spacedString = spacedString.ToLower();
It is easy to do in JavaScript (or PHP, etc.) where you can define a function in the replace call:
var camel = "AwaitingFeedbackDearMaster";
var sentence = camel.replace(/([A-Z].)/g, function (c) { return ' ' + c.toLowerCase(); });
alert(sentence);
Although I haven't solved the initial cap problem... :-)
Now, for the Java solution:
String ToSentence(String camel)
{
if (camel == null) return ""; // Or null...
String[] words = camel.split("(?=[A-Z])");
if (words == null) return "";
if (words.length == 1) return words[0];
StringBuilder sentence = new StringBuilder(camel.length());
if (words[0].length() > 0) // Just in case of camelCase instead of CamelCase
{
sentence.append(words[0] + " " + words[1].toLowerCase());
}
else
{
sentence.append(words[1]);
}
for (int i = 2; i < words.length; i++)
{
sentence.append(" " + words[i].toLowerCase());
}
return sentence.toString();
}
System.out.println(ToSentence("AwaitingAFeedbackDearMaster"));
System.out.println(ToSentence(null));
System.out.println(ToSentence(""));
System.out.println(ToSentence("A"));
System.out.println(ToSentence("Aaagh!"));
System.out.println(ToSentence("stackoverflow"));
System.out.println(ToSentence("disableGPS"));
System.out.println(ToSentence("Ahh89Boo"));
System.out.println(ToSentence("ABC"));
Note the trick to split the sentence without loosing any character...
Pseudo-code:
NewString = "";
Loop through every char of the string (skip the first one)
If char is upper-case ('A'-'Z')
NewString = NewString + ' ' + lowercase(char)
Else
NewString = NewString + char
Better ways can perhaps be done by using regex or by string replacement routines (replace 'X' with ' x')
An xquery solution that works for both UpperCamel and lowerCamel case:
To output sentence case (only the first character of the first word is capitalized):
declare function content:sentenceCase($string)
{
let $firstCharacter := substring($string, 1, 1)
let $remainingCharacters := substring-after($string, $firstCharacter)
return
concat(upper-case($firstCharacter),lower-case(replace($remainingCharacters, '([A-Z])', ' $1')))
};
To output title case (first character of each word capitalized):
declare function content:titleCase($string)
{
let $firstCharacter := substring($string, 1, 1)
let $remainingCharacters := substring-after($string, $firstCharacter)
return
concat(upper-case($firstCharacter),replace($remainingCharacters, '([A-Z])', ' $1'))
};
Found myself doing something similar, and I appreciate having a point-of-departure with this discussion. This is my solution, placed as an extension method to the string class in the context of a console application.
using System;
using System.Text;
namespace ConsoleApplication1
{
class Program
{
static void Main(string[] args)
{
string piratese = "avastTharMatey";
string ivyese = "CheerioPipPip";
Console.WriteLine("{0}\n{1}\n", piratese.CamelCaseToString(), ivyese.CamelCaseToString());
Console.WriteLine("For Pete\'s sake, man, hit ENTER!");
string strExit = Console.ReadLine();
}
}
public static class StringExtension
{
public static string CamelCaseToString(this string str)
{
StringBuilder retVal = new StringBuilder(32);
if (!string.IsNullOrEmpty(str))
{
string strTrimmed = str.Trim();
if (!string.IsNullOrEmpty(strTrimmed))
{
retVal.Append(char.ToUpper(strTrimmed[0]));
if (strTrimmed.Length > 1)
{
for (int i = 1; i < strTrimmed.Length; i++)
{
if (char.IsUpper(strTrimmed[i])) retVal.Append(" ");
retVal.Append(char.ToLower(strTrimmed[i]));
}
}
}
}
return retVal.ToString();
}
}
}
Most of the preceding answers split acronyms and numbers, adding a space in front of each character. I wanted acronyms and numbers to be kept together so I have a simple state machine that emits a space every time the input transitions from one state to the other.
/// <summary>
/// Add a space before any capitalized letter (but not for a run of capitals or numbers)
/// </summary>
internal static string FromCamelCaseToSentence(string input)
{
if (string.IsNullOrEmpty(input)) return String.Empty;
var sb = new StringBuilder();
bool upper = true;
for (var i = 0; i < input.Length; i++)
{
bool isUpperOrDigit = char.IsUpper(input[i]) || char.IsDigit(input[i]);
// any time we transition to upper or digits, it's a new word
if (!upper && isUpperOrDigit)
{
sb.Append(' ');
}
sb.Append(input[i]);
upper = isUpperOrDigit;
}
return sb.ToString();
}
And here's some tests:
[TestCase(null, ExpectedResult = "")]
[TestCase("", ExpectedResult = "")]
[TestCase("ABC", ExpectedResult = "ABC")]
[TestCase("abc", ExpectedResult = "abc")]
[TestCase("camelCase", ExpectedResult = "camel Case")]
[TestCase("PascalCase", ExpectedResult = "Pascal Case")]
[TestCase("Pascal123", ExpectedResult = "Pascal 123")]
[TestCase("CustomerID", ExpectedResult = "Customer ID")]
[TestCase("CustomABC123", ExpectedResult = "Custom ABC123")]
public string CanSplitCamelCase(string input)
{
return FromCamelCaseToSentence(input);
}
Mostly already answered here
Small chage to the accepted answer, to convert the second and subsequent Capitalised letters to lower case, so change
if (char.IsUpper(text[i]))
newText.Append(' ');
newText.Append(text[i]);
to
if (char.IsUpper(text[i]))
{
newText.Append(' ');
newText.Append(char.ToLower(text[i]));
}
else
newText.Append(text[i]);
Here is my implementation. This is the fastest that I got while avoiding creating spaces for abbreviations.
public static string PascalCaseToSentence(string input)
{
if (string.IsNullOrEmpty(input) || input.Length < 2)
return input;
var sb = new char[input.Length + ((input.Length + 1) / 2)];
var len = 0;
var lastIsLower = false;
for (int i = 0; i < input.Length; i++)
{
var current = input[i];
if (current < 97)
{
if (lastIsLower)
{
sb[len] = ' ';
len++;
}
lastIsLower = false;
}
else
{
lastIsLower = true;
}
sb[len] = current;
len++;
}
return new string(sb, 0, len);
}