C# StreamReader detect encoding of an XML File - c#
In my XML file I have data like this:
<Data>
<Field>
<Name>BarcodeCapture_0</Name>
<Type>SimpleIndex</Type>
<DataType>DataMatrix</DataType>
<Value>DEA"¡CV°)Ñ võ Fƒ´ 20100410050</Value>
</Field>
</Data>
I'm using a class that extends StreamReader; I override the reading methods to filter out characters that XML does not accept (for example, control characters).
This is the class
/// <summary>
/// A <see cref="StreamReader"/> whose read methods silently drop every
/// character that <see cref="IsLegalXmlChar(int)"/> rejects, so that the text
/// handed to an XML parser contains only characters legal in XML 1.0.
/// </summary>
/// <remarks>
/// NOTE(review): the base reader returns one UTF-16 code unit per call and the
/// XML 1.0 ranges tested below exclude 0xD800-0xDFFF, so both halves of a
/// surrogate pair appear to be dropped as well - confirm that losing
/// supplementary-plane characters is acceptable for your input.
/// </remarks>
public class CustomStreamReader : StreamReader
{
    /// <summary>Value returned by the single-character read methods at end of input.</summary>
    private const int EOF = -1;

    /// <summary>Creates a filtering reader over <paramref name="stream"/>.</summary>
    public CustomStreamReader(Stream stream) : base(stream)
    {
    }

    /// <summary>Creates a filtering reader over the file at <paramref name="path"/>.</summary>
    public CustomStreamReader(string path) : base(path)
    {
    }

    /// <summary>
    /// Creates a filtering reader over the file at <paramref name="path"/>,
    /// decoding it with <paramref name="encoding"/>.
    /// </summary>
    public CustomStreamReader(string path, Encoding encoding) : base(path, encoding)
    {
    }

    /// <summary>
    /// Get whether an integer represents a legal XML 1.0 or 1.1 character. See
    /// the specification at w3.org for these characters.
    /// </summary>
    /// <param name="xmlVersion">
    /// The version number as a string. Use "1.0" for XML 1.0 character
    /// validation, and use "1.1" for XML 1.1 character validation.
    /// </param>
    /// <param name="character">The code point (or code unit) to test.</param>
    /// <returns><c>true</c> when the value is allowed by the chosen version.</returns>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="xmlVersion"/> is neither "1.0" nor "1.1".
    /// </exception>
    public static bool IsLegalXmlChar(string xmlVersion, int character)
    {
        switch (xmlVersion)
        {
            case "1.1": // http://www.w3.org/TR/xml11/#charsets
                {
                    return
                    !(
                        character <= 0x8 ||
                        character == 0xB ||
                        character == 0xC ||
                        (character >= 0xE && character <= 0x1F) ||
                        (character >= 0x7F && character <= 0x84) ||
                        (character >= 0x86 && character <= 0x9F) ||
                        character > 0x10FFFF
                    );
                }
            case "1.0": // http://www.w3.org/TR/REC-xml/#charsets
                {
                    return
                    (
                        character == 0x9 /* == '\t' == 9   */ ||
                        character == 0xA /* == '\n' == 10  */ ||
                        character == 0xD /* == '\r' == 13  */ ||
                        (character >= 0x20 && character <= 0xD7FF) ||
                        (character >= 0xE000 && character <= 0xFFFD) ||
                        (character >= 0x10000 && character <= 0x10FFFF)
                    );
                }
            default:
                {
                    // The format argument must be supplied; without it string.Format
                    // itself throws FormatException and hides the real error.
                    throw new ArgumentOutOfRangeException
                        ("xmlVersion", string.Format("'{0}' is not a valid XML version.", xmlVersion));
                }
        }
    }

    /// <summary>
    /// Get whether an integer represents a legal XML 1.0 character. See the
    /// specification at w3.org for these characters.
    /// </summary>
    public static bool IsLegalXmlChar(int character)
    {
        return CustomStreamReader.IsLegalXmlChar("1.0", character);
    }

    /// <summary>
    /// Reads the next character that is legal in XML 1.0, discarding any
    /// prohibited characters that precede it.
    /// </summary>
    /// <returns>The character, or -1 at end of input.</returns>
    public override int Read()
    {
        // Read each character, skipping over characters that XML has prohibited
        int nextCharacter;
        do
        {
            // Read a character
            if ((nextCharacter = base.Read()) == EOF)
            {
                // If the character denotes the end of the file, stop reading
                break;
            }
        }
        // Skip the character if it's prohibited, and try the next
        while (!CustomStreamReader.IsLegalXmlChar(nextCharacter));
        return nextCharacter;
    }

    /// <summary>
    /// Returns the next legal XML 1.0 character without consuming it. Any
    /// prohibited characters in front of it ARE consumed from the underlying
    /// reader as a side effect.
    /// </summary>
    /// <returns>The character, or -1 at end of input.</returns>
    public override int Peek()
    {
        // Return the next legal XML character without reading it
        int nextCharacter;
        do
        {
            // See what the next character is
            nextCharacter = base.Peek();
        }
        while
        (
            // If it's prohibited XML, skip over the character in the stream
            // and try the next. EOF (-1) is also "not legal", but the Read()
            // that follows returns EOF and terminates the loop.
            !CustomStreamReader.IsLegalXmlChar(nextCharacter) &&
            (nextCharacter = base.Read()) != EOF
        );
        return nextCharacter;
    }

    // The following methods are re-implemented on top of the filtering Read()
    // above (modelled on TextReader) so that every read path is filtered.

    /// <summary>
    /// Reads up to <paramref name="count"/> legal characters into
    /// <paramref name="buffer"/> starting at <paramref name="index"/>.
    /// </summary>
    /// <returns>The number of characters stored; 0 at end of input or when count is 0.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="buffer"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException">index or count is negative.</exception>
    /// <exception cref="ArgumentException">The range does not fit in the buffer.</exception>
    public override int Read(char[] buffer, int index, int count)
    {
        if (buffer == null)
        {
            throw new ArgumentNullException("buffer");
        }
        if (index < 0)
        {
            throw new ArgumentOutOfRangeException("index");
        }
        if (count < 0)
        {
            throw new ArgumentOutOfRangeException("count");
        }
        if ((buffer.Length - index) < count)
        {
            throw new ArgumentException();
        }

        // Test before reading: a request for zero characters must not consume
        // (or store) anything.
        int num = 0;
        while (num < count)
        {
            int num2 = this.Read();
            if (num2 == EOF)
            {
                break;
            }
            buffer[index + num++] = (char)num2;
        }
        return num;
    }

    /// <summary>
    /// Keeps calling <see cref="Read(char[], int, int)"/> until
    /// <paramref name="count"/> characters were stored or input is exhausted.
    /// </summary>
    /// <returns>The total number of characters stored.</returns>
    public override int ReadBlock(char[] buffer, int index, int count)
    {
        int num;
        int num2 = 0;
        do
        {
            num2 += num = this.Read(buffer, index + num2, count - num2);
        }
        while ((num > 0) && (num2 < count));
        return num2;
    }

    /// <summary>
    /// Reads one line of legal characters, terminated by CR, LF or CR LF.
    /// </summary>
    /// <returns>The line without its terminator, or null at end of input.</returns>
    public override string ReadLine()
    {
        StringBuilder builder = new StringBuilder();
        while (true)
        {
            int num = this.Read();
            switch (num)
            {
                case EOF:
                    if (builder.Length > 0)
                    {
                        return builder.ToString();
                    }
                    return null;

                case 13: // '\r'
                case 10: // '\n'
                    // Swallow the LF of a CR LF pair so it is not seen as an empty line.
                    if ((num == 13) && (this.Peek() == 10))
                    {
                        this.Read();
                    }
                    return builder.ToString();
            }
            builder.Append((char)num);
        }
    }

    /// <summary>Reads all remaining legal characters as a single string.</summary>
    public override string ReadToEnd()
    {
        int num;
        char[] buffer = new char[0x1000];
        StringBuilder builder = new StringBuilder(0x1000);
        while ((num = this.Read(buffer, 0, buffer.Length)) != 0)
        {
            builder.Append(buffer, 0, num);
        }
        return builder.ToString();
    }
}
In XML deserialization side :
// Deserialize a ScanTransaction from the file, reading it through the
// filtering CustomStreamReader (decoded as UTF-8).
XmlSerializer s = new XmlSerializer(typeof(ScanTransaction));
ScanTransaction result;
// Dispose the reader when done so the underlying file handle is released.
using (CustomStreamReader fStream_scanTransaction_XML = new CustomStreamReader(scanTransactionFilePath, Encoding.UTF8))
{
    // The cast needs its opening parenthesis; without it the statement does not compile.
    result = (ScanTransaction)s.Deserialize(fStream_scanTransaction_XML);
}
The problem is that StreamReader cannot detect the encoding, so it is not deleting this character and the XML deserialization fails.
Try:
// Open the file as UTF-8 text, wrap it in an XmlTextReader, and let the
// serializer read a ScanTransaction from that XML reader. Both readers are
// disposed when their blocks exit.
using (StreamReader fileReader = new StreamReader("XMLFile1.xml", Encoding.UTF8))
{
    using (XmlTextReader xmlReader = new XmlTextReader(fileReader))
    {
        var serializer = new XmlSerializer(typeof(ScanTransaction));
        object deserialized = serializer.Deserialize(xmlReader);
        ScanTransaction result = (ScanTransaction)deserialized;
    }
}
You don't even need a "special" StreamReader. The XmlTextReader doesn't check for illegal characters (you can control this with the Normalization boolean property, but it is false by default, so there is no check for illegal characters).
Related
Given a formattable string, is there a clean way to get the formatting string for the given arguments?
Suppose I had a FormattableString like so: var now = DateTime.Now; FormattableString str = $"Today's date is: {now:yyyy-MM-dd} and some numbers: {new[]{1,2,3}}"; I'm trying to take a formattable string and transform it to be used in another component. In my case, essentially splitting the format string at the values so I can control how they're concatenated back, but I need the format string for the argument. There are some limited methods on the FormattableString that allows me to get the argument values (GetArguments()/GetArgument()) and the original format string (Format), but there is none for accessing the argument formatting strings. var format = str.Format; // "Today's date is: {0:yyyy-MM-dd} and some numbers: {1}" var arguments = str.GetArguments(); // { now, new[]{1,2,3} } // no simple way to get the "yyyy-MM-dd" part for arg 0 My workarounds I'm looking at are to preformat the value so the formatted value is set as the argument or parsing out the format string which would not be ideal. void DumpLine(FormattableString format) { var values = format.GetArguments().Prepend(null); var parts = Regex.Split(format.Format, @"\{[^}]+\}"); Util.HorizontalRun(false, values.Zip(parts).SelectMany(x => new[] { x.First, x.Second }).Skip(1)).Dump(); } // usage DumpLine($"Today's date is: {now.ToString("yyyy-MM-dd")} and some numbers: {new[]{1,2,3}}"); Is this supported, perhaps through a helper class or am I out of luck? Thank you mariusz96 for pointing out the new InterpolatedStringHandler functionality. It does for me exactly what I needed it for. It's even flexible enough for me to add additional parameters should I need it in the future. 
This is what I ended up with: public static class UtilEx { public static object Interpolate(IFormatProvider provider, [InterpolatedStringHandlerArgument(nameof(provider))] InterpolateFormatHandler handler) => Interpolate(handler); public static object Interpolate(InterpolateFormatHandler handler) => Util.HorizontalRun(false, handler.Items); [InterpolatedStringHandler] public ref struct InterpolateFormatHandler { private readonly IFormatProvider? provider; public InterpolateFormatHandler(int _literalLength, int _formattedCount, IFormatProvider? provider = default) => this.provider = provider; public List<object?> Items { get; } = new(); public void AppendLiteral(string s) => Items.Add(s); public void AppendFormatted<T>(T t, int alignment, string format) => Items.Add(string.Format(provider, $"{{0,{alignment}:{format}}}", t)); public void AppendFormatted<T>(T t, int alignment) => Items.Add(string.Format(provider, $"{{0,{alignment}}}", t)); public void AppendFormatted<T>(T t, string format) => Items.Add(string.Format(provider, $"{{0:{format}}}", t)); public void AppendFormatted<T>(T t) => Items.Add(t); } }
This is possible with a custom InterpolatedStringHandler. It has overloads that take alignment and format.
none for accessing the argument formatting strings.. Is this supported, perhaps through a helper class or am I out of luck? Interesting question. I'm tempted to say "no" - though I don't have any references/sources to cite I've always regarded FormattableString as a helper for interpolation that enables some parts of a program to know the difference between contexts where it receives a string, and contexts where it receives a formattable string - in essence, to know that something was once a string with format placeholders is helpful in cases like an SQL ORM running a raw command, and wanting to parameterize it. If it receives a FormattableString, it can parameterize the arguments and know where to insert them by parsing the format. If it straight received a formatted string it wouldn't be able to do that, so a FormattableString allows us to keep the format string and related arguments separated until the last moment. When the compiler is turning an interpolated string into a formattable one, it has a relatively easy task. Take a look at what happens in this simple example, run through sharplab's compile/decompile cycle: It's essentially just collecting variables mentioned inside a string, numbering them and swapping the interp out for a standard numerical placeholder format string. It doesn't need to touch the formatting specifiers when it converts {b:0000} -> {0:0000} So the actual formats embedded in the placeholders aren't separated out at this stage; they're parsed out later. 
If we take a look at this internal method of a stringbuilder, which is what string.Format (eventually) defers to (which is what formattable string defers to), we can see it operating in statemachine parser style, hunting for non escaped {, parsing the numeric holder number, and then pulling the format and padding specifiers out by reference to commas and colons: //from .net framework reference source internal StringBuilder AppendFormatHelper(IFormatProvider provider, String format, ParamsArray args) { if (format == null) { throw new ArgumentNullException("format"); } Contract.Ensures(Contract.Result<StringBuilder>() != null); Contract.EndContractBlock(); int pos = 0; int len = format.Length; char ch = '\x0'; ICustomFormatter cf = null; if (provider != null) { cf = (ICustomFormatter)provider.GetFormat(typeof(ICustomFormatter)); } while (true) { int p = pos; int i = pos; while (pos < len) { ch = format[pos]; pos++; if (ch == '}') { if (pos < len && format[pos] == '}') // Treat as escape character for }} pos++; else FormatError(); } if (ch == '{') { if (pos < len && format[pos] == '{') // Treat as escape character for {{ pos++; else { pos--; break; } } Append(ch); } if (pos == len) break; pos++; if (pos == len || (ch = format[pos]) < '0' || ch > '9') FormatError(); int index = 0; do { index = index * 10 + ch - '0'; pos++; if (pos == len) FormatError(); ch = format[pos]; } while (ch >= '0' && ch <= '9' && index < 1000000); if (index >= args.Length) throw new FormatException(Environment.GetResourceString("Format_IndexOutOfRange")); while (pos < len && (ch = format[pos]) == ' ') pos++; bool leftJustify = false; int width = 0; if (ch == ',') { pos++; while (pos < len && format[pos] == ' ') pos++; if (pos == len) FormatError(); ch = format[pos]; if (ch == '-') { leftJustify = true; pos++; if (pos == len) FormatError(); ch = format[pos]; } if (ch < '0' || ch > '9') FormatError(); do { width = width * 10 + ch - '0'; pos++; if (pos == len) FormatError(); ch = format[pos]; } 
while (ch >= '0' && ch <= '9' && width < 1000000); } while (pos < len && (ch = format[pos]) == ' ') pos++; Object arg = args[index]; StringBuilder fmt = null; if (ch == ':') { pos++; p = pos; i = pos; while (true) { if (pos == len) FormatError(); ch = format[pos]; pos++; if (ch == '{') { if (pos < len && format[pos] == '{') // Treat as escape character for {{ pos++; else FormatError(); } else if (ch == '}') { if (pos < len && format[pos] == '}') // Treat as escape character for }} pos++; else { pos--; break; } } if (fmt == null) { fmt = new StringBuilder(); } fmt.Append(ch); } } if (ch != '}') FormatError(); pos++; String sFmt = null; String s = null; if (cf != null) { if (fmt != null) { sFmt = fmt.ToString(); } s = cf.Format(sFmt, arg, provider); } if (s == null) { IFormattable formattableArg = arg as IFormattable; #if FEATURE_LEGACYNETCF if(CompatibilitySwitches.IsAppEarlierThanWindowsPhone8) { // TimeSpan does not implement IFormattable in Mango if(arg is TimeSpan) { formattableArg = null; } } #endif if (formattableArg != null) { if (sFmt == null && fmt != null) { sFmt = fmt.ToString(); } s = formattableArg.ToString(sFmt, provider); } else if (arg != null) { s = arg.ToString(); } } if (s == null) s = String.Empty; int pad = width - s.Length; if (!leftJustify && pad > 0) Append(' ', pad); Append(s); if (leftJustify && pad > 0) Append(' ', pad); } return this; } https://referencesource.microsoft.com/#mscorlib/system/text/stringbuilder.cs,2c3b4c2e7c43f5a4 All in, I'd say if you want those format arguments you'll need to pull them from the Format yourself; it's what the framework does. The code that does it is above, and could be tarted up/slimmed down but if you adopt the same approach with a helper as the framework does it should be similarly reliable and consistent in its behavior ps; here's the source for the same method from .net core; essentially the same, but more nicely commented ;) internal StringBuilder AppendFormatHelper(IFormatProvider? 
provider, string format, ParamsArray args) { if (format == null) { throw new ArgumentNullException(nameof(format)); } int pos = 0; int len = format.Length; char ch = '\x0'; ICustomFormatter? cf = null; if (provider != null) { cf = (ICustomFormatter?)provider.GetFormat(typeof(ICustomFormatter)); } while (true) { while (pos < len) { ch = format[pos]; pos++; // Is it a closing brace? if (ch == '}') { // Check next character (if there is one) to see if it is escaped. eg }} if (pos < len && format[pos] == '}') { pos++; } else { // Otherwise treat it as an error (Mismatched closing brace) FormatError(); } } // Is it an opening brace? else if (ch == '{') { // Check next character (if there is one) to see if it is escaped. eg {{ if (pos < len && format[pos] == '{') { pos++; } else { // Otherwise treat it as the opening brace of an Argument Hole. pos--; break; } } // If it's neither then treat the character as just text. Append(ch); } // // Start of parsing of Argument Hole. // Argument Hole ::= { Index (, WS* Alignment WS*)? (: Formatting)? } // if (pos == len) { break; } // // Start of parsing required Index parameter. // Index ::= ('0'-'9')+ WS* // pos++; // If reached end of text then error (Unexpected end of text) // or character is not a digit then error (Unexpected Character) if (pos == len || (ch = format[pos]) < '0' || ch > '9') FormatError(); int index = 0; do { index = index * 10 + ch - '0'; pos++; // If reached end of text then error (Unexpected end of text) if (pos == len) { FormatError(); } ch = format[pos]; // so long as character is digit and value of the index is less than 1000000 ( index limit ) } while (ch >= '0' && ch <= '9' && index < IndexLimit); // If value of index is not within the range of the arguments passed in then error (Index out of range) if (index >= args.Length) { throw new FormatException(SR.Format_IndexOutOfRange); } // Consume optional whitespace. while (pos < len && (ch = format[pos]) == ' ') pos++; // End of parsing index parameter. 
// // Start of parsing of optional Alignment // Alignment ::= comma WS* minus? ('0'-'9')+ WS* // bool leftJustify = false; int width = 0; // Is the character a comma, which indicates the start of alignment parameter. if (ch == ',') { pos++; // Consume Optional whitespace while (pos < len && format[pos] == ' ') pos++; // If reached the end of the text then error (Unexpected end of text) if (pos == len) { FormatError(); } // Is there a minus sign? ch = format[pos]; if (ch == '-') { // Yes, then alignment is left justified. leftJustify = true; pos++; // If reached end of text then error (Unexpected end of text) if (pos == len) { FormatError(); } ch = format[pos]; } // If current character is not a digit then error (Unexpected character) if (ch < '0' || ch > '9') { FormatError(); } // Parse alignment digits. do { width = width * 10 + ch - '0'; pos++; // If reached end of text then error. (Unexpected end of text) if (pos == len) { FormatError(); } ch = format[pos]; // So long a current character is a digit and the value of width is less than 100000 ( width limit ) } while (ch >= '0' && ch <= '9' && width < WidthLimit); // end of parsing Argument Alignment } // Consume optional whitespace while (pos < len && (ch = format[pos]) == ' ') pos++; // // Start of parsing of optional formatting parameter. // object? arg = args[index]; ReadOnlySpan<char> itemFormatSpan = default; // used if itemFormat is null // Is current character a colon? which indicates start of formatting parameter. if (ch == ':') { pos++; int startPos = pos; while (true) { // If reached end of text then error. 
(Unexpected end of text) if (pos == len) { FormatError(); } ch = format[pos]; if (ch == '}') { // Argument hole closed break; } else if (ch == '{') { // Braces inside the argument hole are not supported FormatError(); } pos++; } if (pos > startPos) { itemFormatSpan = format.AsSpan(startPos, pos - startPos); } } else if (ch != '}') { // Unexpected character FormatError(); } // Construct the output for this arg hole. pos++; string? s = null; string? itemFormat = null; if (cf != null) { if (itemFormatSpan.Length != 0) { itemFormat = new string(itemFormatSpan); } s = cf.Format(itemFormat, arg, provider); } if (s == null) { // If arg is ISpanFormattable and the beginning doesn't need padding, // try formatting it into the remaining current chunk. if (arg is ISpanFormattable spanFormattableArg && (leftJustify || width == 0) && spanFormattableArg.TryFormat(RemainingCurrentChunk, out int charsWritten, itemFormatSpan, provider)) { if ((uint)charsWritten > (uint)RemainingCurrentChunk.Length) { // Untrusted ISpanFormattable implementations might return an erroneous charsWritten value, // and m_ChunkLength might end up being used in Unsafe code, so fail if we get back an // out-of-range charsWritten value. FormatError(); } m_ChunkLength += charsWritten; // Pad the end, if needed. int padding = width - charsWritten; if (leftJustify && padding > 0) { Append(' ', padding); } // Continue to parse other characters. continue; } // Otherwise, fallback to trying IFormattable or calling ToString. if (arg is IFormattable formattableArg) { if (itemFormatSpan.Length != 0) { itemFormat ??= new string(itemFormatSpan); } s = formattableArg.ToString(itemFormat, provider); } else if (arg != null) { s = arg.ToString(); } } // Append it to the final output of the Format String. if (s == null) { s = string.Empty; } int pad = width - s.Length; if (!leftJustify && pad > 0) { Append(' ', pad); } Append(s); if (leftJustify && pad > 0) { Append(' ', pad); } // Continue to parse other characters. 
} return this; } https://github.com/dotnet/runtime/blob/main/src/libraries/System.Private.CoreLib/src/System/Text/StringBuilder.cs
Accidentally splitting unicode chars when truncating strings
I'm saving some strings from a third party into my database (postgres). Sometimes these strings are too long and need to be truncated to fit into the column in my table. On some random occasions I accidentally truncate the string right where there is a Unicode character, which gives me a "broken" string that I cannot save into the database. I get the following error: Unable to translate Unicode character \uD83D at index XXX to specified code page. I've created a minimal example to show you what I mean. Here I have a string that contains a Unicode character ("Small blue diamond" 🔹 U+1F539). Depending on where I truncate, it gives me a valid string or not. var myString = @"This is a string before an emoji:🔹 This is after the emoji."; var brokenString = myString.Substring(0, 34); // Gives: "This is a string before an emoji:☐" var test3 = myString.Substring(0, 35); // Gives: "This is a string before an emoji:🔹" Is there a way for me to truncate the string without accidentally breaking any Unicode chars?
A Unicode character may be represented with several chars, that is the problem with string.Substring you are having. You may convert your string to a StringInfo object and then use SubstringByTextElements() method to get the substring based on the Unicode character count, not a char count. See a C# demo: Console.WriteLine("🔹".Length); // => 2 Console.WriteLine(new StringInfo("🔹").LengthInTextElements); // => 1 var myString = @"This is a string before an emoji:🔹This is after the emoji."; var teMyString = new StringInfo(myString); Console.WriteLine(teMyString.SubstringByTextElements(0, 33)); // => "This is a string before an emoji:" Console.WriteLine(teMyString.SubstringByTextElements(0, 34)); // => This is a string before an emoji:🔹 Console.WriteLine(teMyString.SubstringByTextElements(0, 35)); // => This is a string before an emoji:🔹T
I ended up using a modification of xanatos answer here. The difference is that this version will strip the last grapheme, if adding it would give a string longer than length. public static string UnicodeSafeSubstring(this string str, int startIndex, int length) { if (str == null) { throw new ArgumentNullException(nameof(str)); } if (startIndex < 0 || startIndex > str.Length) { throw new ArgumentOutOfRangeException(nameof(startIndex)); } if (length < 0) { throw new ArgumentOutOfRangeException(nameof(length)); } if (startIndex + length > str.Length) { throw new ArgumentOutOfRangeException(nameof(length)); } if (length == 0) { return string.Empty; } var stringBuilder = new StringBuilder(length); var enumerator = StringInfo.GetTextElementEnumerator(str, startIndex); while (enumerator.MoveNext()) { var grapheme = enumerator.GetTextElement(); startIndex += grapheme.Length; if (startIndex > str.Length) { break; } // Skip initial Low Surrogates/Combining Marks if (stringBuilder.Length == 0) { if (char.IsLowSurrogate(grapheme[0])) { continue; } var cat = char.GetUnicodeCategory(grapheme, 0); if (cat == UnicodeCategory.NonSpacingMark || cat == UnicodeCategory.SpacingCombiningMark || cat == UnicodeCategory.EnclosingMark) { continue; } } // Do not append the grapheme if the resulting string would be longer than the required length if (stringBuilder.Length + grapheme.Length <= length) { stringBuilder.Append(grapheme); } if (stringBuilder.Length >= length) { break; } } return stringBuilder.ToString(); } }
Here is an example for truncate (startIndex = 0): string truncatedStr = (str.Length > maxLength) ? str.Substring(0, maxLength - (char.IsLowSurrogate(str[maxLength]) ? 1 : 0)) : str;
Better truncate by the number of bytes not string length public static string TruncateByBytes(this string text, int maxBytes) { if (string.IsNullOrEmpty(text) || Encoding.UTF8.GetByteCount(text) <= maxBytes) { return text; } var enumerator = StringInfo.GetTextElementEnumerator(text); var newStr = string.Empty; do { enumerator.MoveNext(); if (Encoding.UTF8.GetByteCount(newStr + enumerator.Current) <= maxBytes) { newStr += enumerator.Current; } else { break; } } while (true); return newStr; }
Convert to UNICODE in C#
public string DecodeFromUtf8(string utf8String) { // copy the string as UTF-8 bytes. byte[] utf8Bytes = new byte[utf8String.Length]; for (int i = 0; i < utf8String.Length; ++i) { //Debug.Assert( 0 <= utf8String[i] && utf8String[i] <= 255, //"the char must be in byte's range"); utf8Bytes[i] = (byte)utf8String[i]; } return Encoding.UTF8.GetString(utf8Bytes, 0, utf8Bytes.Length); } this code doesn't work for me do you have any good ideas? i need the unicode array for russian fonts like this public static readonly ReadOnlyCollection<char> Unicodes = Array.AsReadOnly(new char[] { '\u0000', '\u0001', '\u0002', '\u0003', '\u0004', '\u0005', '\u0006', '\u0007', '\u0008', '\u0009', '\u000A', '\u000B', '\u000C', '\u000D', '\u000E', '\u000F', '\u0010', '\u0011', '\u0012', '\u0013', '\u0014', '\u0015', '\u0016', '\u0017', '\u0018', '\u0019', '\u001A', '\u001B', '\u001C', '\u001D', '\u001E', '\u001F', '\u0020', '\u0021', '\u0022', '\u0023', '\u0024', '\u0025', '\u0026', '\u0027', '\u0028', '\u0029', '\u002A', '\u002B', '\u002C', '\u002D', '\u002E', '\u002F', '\u0030', '\u0031', '\u0032', '\u0033', '\u0034', '\u0035', '\u0036', '\u0037', '\u0038', '\u0039', '\u003A', '\u003B', '\u003C', '\u003D', '\u003E', '\u003F', '\u0040', '\u0041', '\u0042', '\u0043', '\u0044', '\u0045', '\u0046', '\u0047', '\u0048', '\u0049', '\u004A', '\u004B', '\u004C', '\u004D', '\u004E', '\u004F', '\u0050', '\u0051', '\u0052', '\u0053', '\u0054', '\u0055', '\u0056', '\u0057', '\u0058', '\u0059', '\u005A', '\u005B', '\u005C', '\u005D', '\u005E', '\u005F', '\u0060', '\u0061', '\u0062', '\u0063', '\u0064', '\u0065', '\u0066', '\u0067', '\u0068', '\u0069', '\u006A', '\u006B', '\u006C', '\u006D', '\u006E', '\u006F', '\u0070', '\u0071', '\u0072', '\u0073', '\u0074', '\u0075', '\u0076', '\u0077', '\u0078', '\u0079', '\u007A', '\u007B', '\u007C', '\u007D', '\u007E', '\u007F', '\uFFFD', '\uFFFD', '\uFFFD', '\uFFFD', '\uFFFD', '\uFFFD', '\uFFFD', '\uFFFD', '\uFFFD', '\uFFFD', '\uFFFD', '\uFFFD', '\uFFFD', 
'\uFFFD', '\uFFFD', '\uFFFD', '\uFFFD', '\uFFFD', '\uFFFD', '\uFFFD', '\uFFFD', '\uFFFD', '\uFFFD', '\uFFFD', '\uFFFD', '\uFFFD', '\uFFFD', '\uFFFD', '\uFFFD', '\uFFFD', '\uFFFD', '\uFFFD', '\u00A0', '\u058E', '\u0587', '\u0589', '\u0029', '\u0028', '\u00BB', '\u00AB', '\u2015', '\u00B7', '\u055D', '\u002C', '\u2010', '\u058A', '\u2026', '\u055C', '\u055B', '\u055E', '\u0531', '\u0561', '\u0532', '\u0562', '\u0533', '\u0563', '\u0534', '\u0564', '\u0535', '\u0565', '\u0536', '\u0566', '\u0537', '\u0567', '\u0538', '\u0568', '\u0539', '\u0569', '\u053A', '\u056A', '\u053B', '\u056B', '\u053C', '\u056C', '\u053D', '\u056D', '\u053E', '\u056E', '\u053F', '\u056F', '\u0540', '\u0570', '\u0541', '\u0571', '\u0542', '\u0572', '\u0543', '\u0573', '\u0544', '\u0574', '\u0545', '\u0575', '\u0546', '\u0576', '\u0547', '\u0577', '\u0548', '\u0578', '\u0549', '\u0579', '\u054A', '\u057A', '\u054B', '\u057B', '\u054C', '\u057C', '\u054D', '\u057D', '\u054E', '\u057E', '\u054F', '\u057F', '\u0550', '\u0580', '\u0551', '\u0581', '\u0552', '\u0582', '\u0553', '\u0583', '\u0554', '\u0584', '\u0555', '\u0585', '\u0556', '\u0586', '\u055A', '\uFFFD' });
// Your string seems to be ArmSCII-8. Adapted from an old encoder/decoder I had
// written for VISCII:
namespace Utilities
{
    using System;
    using System.Collections.ObjectModel;
    using System.Linq;
    using System.Runtime.CompilerServices;
    using System.Text;

    /// <summary>
    /// ArmSCII8 (https://en.wikipedia.org/wiki/ArmSCII#ArmSCII-8)
    /// encoding for C#.
    /// Fast table-based implementation not based on MappedEncoding.
    /// The one-shot methods of this class always call the shared
    /// <see cref="ArmSCII8Encoder"/> with flush = true and the shared
    /// <see cref="ArmSCII8Decoder"/> keeps no per-call state, which is what
    /// the thread-safety notes on those two classes rely on.
    /// </summary>
    public class ArmSCII8EncodingSimple : Encoding
    {
        // Taken from https://en.wikipedia.org/wiki/ArmSCII#ArmSCII-8 .
        // Includes parts of the ISO-8859-1 in the ranges 00–1F and 7F–9F.
        // Doesn't define anything for FF (stored as '\0').
        public static readonly ReadOnlyCollection<char> Unicodes = Array.AsReadOnly(BuildUnicodeTable());

        private ArmSCII8Decoder decoder;
        private ArmSCII8Encoder encoder;

        /// <summary>
        /// Shared decoder used by the one-shot GetCharCount/GetChars methods.
        /// Created lazily; in the worst case two instances are created at
        /// the same time, which is harmless because the decoder keeps no
        /// state between calls.
        /// </summary>
        protected ArmSCII8Decoder Decoder
        {
            get
            {
                ArmSCII8Decoder current = decoder;

                // Lazy creation of the decoder
                if (object.ReferenceEquals(current, null))
                {
                    current = decoder = new ArmSCII8Decoder();
                }

                // If the fallback has changed since the last call, update it
                ApplyFallback(current, DecoderFallback);
                return current;
            }
        }

        /// <summary>
        /// Shared encoder used by the one-shot GetByteCount/GetBytes methods.
        /// Created lazily; in the worst case two instances are created at
        /// the same time, which is harmless because this class only ever
        /// calls it with flush = true.
        /// </summary>
        protected ArmSCII8Encoder Encoder
        {
            get
            {
                ArmSCII8Encoder current = encoder;

                // Lazy creation of the encoder
                if (object.ReferenceEquals(current, null))
                {
                    current = encoder = new ArmSCII8Encoder();
                }

                // If the fallback has changed since the last call, update it
                ApplyFallback(current, EncoderFallback);
                return current;
            }
        }

        public override string BodyName
        {
            get { return "x-armscii-8-simple"; }
        }

        public override string EncodingName
        {
            get { return BodyName; }
        }

        public override bool IsSingleByte
        {
            get { return true; }
        }

        public override object Clone()
        {
            var clone = (ArmSCII8EncodingSimple)base.Clone();

            // Reset the cached encoder and decoder of the clone, otherwise
            // they would be shared between the two instances.
            clone.decoder = null;
            clone.encoder = null;
            return clone;
        }

        /// <summary>
        /// Returns a new decoder that honours this encoding's
        /// <see cref="Encoding.DecoderFallback"/> (e.g. when used by a StreamReader).
        /// </summary>
        public override Decoder GetDecoder()
        {
            var fresh = new ArmSCII8Decoder();
            ApplyFallback(fresh, DecoderFallback);
            return fresh;
        }

        /// <summary>
        /// Returns a new encoder that honours this encoding's
        /// <see cref="Encoding.EncoderFallback"/> (e.g. when used by a StreamWriter).
        /// </summary>
        public override Encoder GetEncoder()
        {
            var fresh = new ArmSCII8Encoder();
            ApplyFallback(fresh, EncoderFallback);
            return fresh;
        }

        public override int GetByteCount(char[] chars, int index, int count)
        {
            return Encoder.GetByteCount(chars, index, count, true);
        }

        public override int GetBytes(char[] chars, int charIndex, int charCount, byte[] bytes, int byteIndex)
        {
            return Encoder.GetBytes(chars, charIndex, charCount, bytes, byteIndex, true);
        }

        public override int GetCharCount(byte[] bytes, int index, int count)
        {
            return Decoder.GetCharCount(bytes, index, count, true);
        }

        public override int GetChars(byte[] bytes, int byteIndex, int byteCount, char[] chars, int charIndex)
        {
            return Decoder.GetChars(bytes, byteIndex, byteCount, chars, charIndex, true);
        }

        public override int GetMaxByteCount(int charCount)
        {
            return charCount;
        }

        public override int GetMaxCharCount(int byteCount)
        {
            return byteCount;
        }

        // Builds the 256-entry byte -> char table.
        private static char[] BuildUnicodeTable()
        {
            var table = new char[256];

            // 00–A0: same code points as ISO-8859-1 / Unicode.
            for (int i = 0; i <= 0xA0; i++)
            {
                table[i] = (char)i;
            }

            // A1–B1: punctuation.
            char[] punctuation =
            {
                '\u058E', '\u0587', '\u0589', '\u0029', '\u0028', '\u00BB', '\u00AB', '\u2015',
                '\u00B7', '\u055D', '\u002C', '\u2010', '\u058A', '\u2026', '\u055C', '\u055B',
                '\u055E',
            };
            punctuation.CopyTo(table, 0xA1);

            // B2–FD: the 38 Armenian letters, each capital (U+0531..U+0556)
            // immediately followed by its small form (U+0561..U+0586).
            for (int letter = 0; letter < 38; letter++)
            {
                table[0xB2 + (2 * letter)] = (char)(0x0531 + letter);
                table[0xB3 + (2 * letter)] = (char)(0x0561 + letter);
            }

            table[0xFE] = '\u055A';
            table[0xFF] = '\0'; // undefined
            return table;
        }

        // Copies a non-null fallback onto the decoder when it differs from the current one.
        private static void ApplyFallback(ArmSCII8Decoder target, DecoderFallback fallback)
        {
            if (!object.ReferenceEquals(fallback, null) && !object.ReferenceEquals(fallback, target.Fallback))
            {
                target.Fallback = fallback;
            }
        }

        // Copies a non-null fallback onto the encoder when it differs from the current one.
        private static void ApplyFallback(ArmSCII8Encoder target, EncoderFallback fallback)
        {
            if (!object.ReferenceEquals(fallback, null) && !object.ReferenceEquals(fallback, target.Fallback))
            {
                target.Fallback = fallback;
            }
        }
    }

    /// <summary>
    /// Table-based ArmSCII-8 decoder. It keeps no state between calls: each
    /// call creates its own fallback buffer instead of using the instance
    /// FallbackBuffer.
    /// </summary>
    public class ArmSCII8Decoder : Decoder
    {
        private static readonly char[] Unicodes = ArmSCII8EncodingSimple.Unicodes.ToArray();

        /// <summary>Counts the chars that decoding the given byte range produces.</summary>
        /// <exception cref="ArgumentNullException"><paramref name="bytes"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException">The range is outside <paramref name="bytes"/>.</exception>
        public override int GetCharCount(byte[] bytes, int index, int count)
        {
            if (bytes == null)
            {
                throw new ArgumentNullException("bytes");
            }

            if (index < 0 || index > bytes.Length)
            {
                throw new ArgumentOutOfRangeException("index");
            }

            if (count < 0)
            {
                throw new ArgumentOutOfRangeException("count");
            }

            // Written as a subtraction so that index + count cannot overflow.
            if (count > bytes.Length - index)
            {
                throw new ArgumentOutOfRangeException("bytes");
            }

            // The fallbackBuffer is created on-demand. The instance
            // FallbackBuffer isn't used because it wouldn't be thread safe.
            DecoderFallbackBuffer fallbackBuffer = null;

            int total = 0;
            int end = index + count;

            for (; index < end; index++)
            {
                byte b = bytes[index];

                if (IsMapped(b))
                {
                    total++;
                    continue;
                }

                if (fallbackBuffer == null)
                {
                    fallbackBuffer = CreateFallbackBuffer();
                }

                if (fallbackBuffer.Fallback(new[] { b }, index))
                {
                    HandleFallbackCount(fallbackBuffer, ref total);
                }
            }

            return total;
        }

        /// <summary>Decodes the given byte range into <paramref name="chars"/>.</summary>
        /// <returns>The number of chars written.</returns>
        /// <exception cref="ArgumentNullException">An array argument is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException">An index or count is out of range.</exception>
        /// <exception cref="ArgumentException"><paramref name="chars"/> is too small.</exception>
        public override int GetChars(byte[] bytes, int byteIndex, int byteCount, char[] chars, int charIndex)
        {
            if (bytes == null)
            {
                throw new ArgumentNullException("bytes");
            }

            if (byteIndex < 0 || byteIndex > bytes.Length)
            {
                throw new ArgumentOutOfRangeException("byteIndex");
            }

            if (byteCount < 0)
            {
                throw new ArgumentOutOfRangeException("byteCount");
            }

            // Written as a subtraction so that byteIndex + byteCount cannot overflow.
            if (byteCount > bytes.Length - byteIndex)
            {
                throw new ArgumentOutOfRangeException("bytes");
            }

            if (chars == null)
            {
                throw new ArgumentNullException("chars");
            }

            if (charIndex < 0 || charIndex > chars.Length)
            {
                throw new ArgumentOutOfRangeException("charIndex");
            }

            // The fallbackBuffer is created on-demand. The instance
            // FallbackBuffer isn't used because it wouldn't be thread safe.
            DecoderFallbackBuffer fallbackBuffer = null;

            int end = byteIndex + byteCount;
            int writeIndex = charIndex;

            for (; byteIndex < end; byteIndex++)
            {
                byte b = bytes[byteIndex];

                // Bytes 0..127 are identical in Unicode and ArmSCII8, so the
                // table lookup also covers them.
                if (IsMapped(b))
                {
                    WriteChar(chars, writeIndex, Unicodes[b]);
                    writeIndex++;
                    continue;
                }

                if (fallbackBuffer == null)
                {
                    fallbackBuffer = CreateFallbackBuffer();
                }

                if (fallbackBuffer.Fallback(new[] { b }, byteIndex))
                {
                    HandleFallbackWrite(fallbackBuffer, chars, ref writeIndex);
                }
            }

            return writeIndex - charIndex;
        }

        /// <summary>Drains the fallback buffer, adding one to <paramref name="count"/> per char.</summary>
        protected static void HandleFallbackCount(DecoderFallbackBuffer fallbackBuffer, ref int count)
        {
            while (fallbackBuffer.Remaining > 0)
            {
                fallbackBuffer.GetNextChar();
                count++;
            }
        }

        /// <summary>Drains the fallback buffer into <paramref name="chars"/>.</summary>
        protected static void HandleFallbackWrite(DecoderFallbackBuffer fallbackBuffer, char[] chars, ref int charIndex)
        {
            while (fallbackBuffer.Remaining > 0)
            {
                char ch = fallbackBuffer.GetNextChar();
                WriteChar(chars, charIndex, ch);
                charIndex++;
            }
        }

        /// <summary>Bounds-checked write of one char into the output buffer.</summary>
        /// <exception cref="ArgumentException">The output buffer is full.</exception>
        // Remove the next line if using .NET < 4.5
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        protected static void WriteChar(char[] chars, int charIndex, char ch)
        {
            if (charIndex >= chars.Length)
            {
                throw new ArgumentException("The output char buffer is too small to contain the decoded characters.", "chars");
            }

            chars[charIndex] = ch;
        }

        // '\0' in the table means "undefined", except for byte 0 itself.
        private static bool IsMapped(byte b)
        {
            return Unicodes[b] != '\0' || b == 0;
        }

        // Per-call fallback buffer; uses the replacement fallback when none is set.
        private DecoderFallbackBuffer CreateFallbackBuffer()
        {
            return (Fallback ?? DecoderFallback.ReplacementFallback).CreateFallbackBuffer();
        }
    }

    /// <summary>
    /// Table-based ArmSCII-8 encoder. The only state kept between calls is
    /// a pending high surrogate, and that is only ever stored by GetBytes
    /// when flush = false. An instance that is always called with
    /// flush = true therefore never writes to its own state.
    /// </summary>
    public class ArmSCII8Encoder : Encoder
    {
        // Reverse table: char -> byte, indexed by code unit (0 = not encodable, except for '\0').
        private static readonly byte[] ArmSCII8s;

        // Pending high surrogate carried over from a GetBytes call made with
        // flush = false. Read on every call, written only when needed.
        protected char HighSurrogate { get; set; }

        static ArmSCII8Encoder()
        {
            ArmSCII8s = new byte[1 + ArmSCII8EncodingSimple.Unicodes.Max()];

            for (int i = 0; i < ArmSCII8EncodingSimple.Unicodes.Count; i++)
            {
                char ch = ArmSCII8EncodingSimple.Unicodes[i];

                // The first byte that maps to a char wins (e.g. ')' stays 0x29
                // even though 0xA4 decodes to it too); undefined slots are skipped.
                if (i == 0 || (ArmSCII8s[ch] == 0 && ch != '\0'))
                {
                    ArmSCII8s[ch] = (byte)i;
                }
            }
        }

        /// <summary>Counts the bytes that encoding the given char range produces. Does not change the encoder state.</summary>
        /// <exception cref="ArgumentNullException"><paramref name="chars"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException">The range is outside <paramref name="chars"/>.</exception>
        /// <exception cref="EncoderFallbackException">The fallback produced a char that cannot be encoded.</exception>
        public override int GetByteCount(char[] chars, int index, int count, bool flush)
        {
            if (chars == null)
            {
                throw new ArgumentNullException("chars");
            }

            if (index < 0 || index > chars.Length)
            {
                throw new ArgumentOutOfRangeException("index");
            }

            if (count < 0)
            {
                throw new ArgumentOutOfRangeException("count");
            }

            // Written as a subtraction so that index + count cannot overflow.
            if (count > chars.Length - index)
            {
                throw new ArgumentOutOfRangeException("chars");
            }

            char ignoredPending;
            return EncodeCore(chars, index, count, null, 0, flush, out ignoredPending);
        }

        /// <summary>Encodes the given char range into <paramref name="bytes"/>.</summary>
        /// <returns>The number of bytes written.</returns>
        /// <exception cref="ArgumentNullException">An array argument is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException">An index or count is out of range.</exception>
        /// <exception cref="ArgumentException"><paramref name="bytes"/> is too small.</exception>
        /// <exception cref="EncoderFallbackException">The fallback produced a char that cannot be encoded.</exception>
        public override int GetBytes(char[] chars, int charIndex, int charCount, byte[] bytes, int byteIndex, bool flush)
        {
            if (chars == null)
            {
                throw new ArgumentNullException("chars");
            }

            if (charIndex < 0 || charIndex > chars.Length)
            {
                throw new ArgumentOutOfRangeException("charIndex");
            }

            if (charCount < 0)
            {
                throw new ArgumentOutOfRangeException("charCount");
            }

            // Written as a subtraction so that charIndex + charCount cannot overflow.
            if (charCount > chars.Length - charIndex)
            {
                throw new ArgumentOutOfRangeException("chars");
            }

            if (bytes == null)
            {
                throw new ArgumentNullException("bytes");
            }

            if (byteIndex < 0 || byteIndex > bytes.Length)
            {
                throw new ArgumentOutOfRangeException("byteIndex");
            }

            char pending;
            int written = EncodeCore(chars, charIndex, charCount, bytes, byteIndex, flush, out pending);

            if (!flush)
            {
                HighSurrogate = pending;
            }
            else if (HighSurrogate != '\0')
            {
                // A surrogate left over from an earlier flush = false call has
                // now been consumed; clear it so that the next call does not
                // process it again. Nothing is written when there was no
                // pending state, so "always flush = true" usage stays write-free.
                HighSurrogate = '\0';
            }

            return written;
        }

        /// <summary>Drains the fallback buffer, adding one to <paramref name="count"/> per encodable char.</summary>
        /// <exception cref="EncoderFallbackException">A fallback char is not encodable.</exception>
        protected static void HandleFallbackCount(EncoderFallbackBuffer fallbackBuffer, ref int count)
        {
            while (fallbackBuffer.Remaining > 0)
            {
                byte ignored;

                if (!TryEncode(fallbackBuffer.GetNextChar(), out ignored))
                {
                    throw new EncoderFallbackException();
                }

                count++;
            }
        }

        /// <summary>Drains the fallback buffer, encoding each char into <paramref name="bytes"/>.</summary>
        /// <exception cref="EncoderFallbackException">A fallback char is not encodable.</exception>
        protected static void HandleFallbackWrite(EncoderFallbackBuffer fallbackBuffer, byte[] bytes, ref int byteIndex)
        {
            while (fallbackBuffer.Remaining > 0)
            {
                byte b;

                if (!TryEncode(fallbackBuffer.GetNextChar(), out b))
                {
                    throw new EncoderFallbackException();
                }

                WriteByte(bytes, byteIndex, b);
                byteIndex++;
            }
        }

        /// <summary>Bounds-checked write of one byte into the output buffer.</summary>
        /// <exception cref="ArgumentException">The output buffer is full.</exception>
        // Remove the next line if using .NET < 4.5
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        protected static void WriteByte(byte[] bytes, int byteIndex, byte b)
        {
            if (byteIndex >= bytes.Length)
            {
                throw new ArgumentException("The output byte buffer is too small to contain the encoded bytes.", "bytes");
            }

            bytes[byteIndex] = b;
        }

        // Looks up the ArmSCII-8 byte for a char; false when the char has no mapping.
        private static bool TryEncode(char ch, out byte b)
        {
            if (ch >= ArmSCII8s.Length)
            {
                b = 0;
                return false;
            }

            b = ArmSCII8s[ch];
            return b != 0 || ch == '\0';
        }

        // Per-call fallback buffer; uses the replacement fallback when none is set.
        private EncoderFallbackBuffer CreateFallbackBuffer()
        {
            return (Fallback ?? EncoderFallback.ReplacementFallback).CreateFallbackBuffer();
        }

        // Shared state machine behind GetByteCount and GetBytes. When bytes is
        // null nothing is stored and only the byte count is computed. Never
        // writes HighSurrogate; the surrogate still pending at the end is
        // returned through pendingHighSurrogate.
        private int EncodeCore(char[] chars, int charIndex, int charCount, byte[] bytes, int byteIndex, bool flush, out char pendingHighSurrogate)
        {
            // The fallbackBuffer is created on-demand. The instance
            // FallbackBuffer isn't used because it wouldn't be thread safe.
            EncoderFallbackBuffer fallbackBuffer = null;

            char highSurrogate = HighSurrogate;
            int end = charIndex + charCount;
            int position = byteIndex;

            for (; charIndex < end; charIndex++)
            {
                char ch = chars[charIndex];

                if (highSurrogate != '\0')
                {
                    if (fallbackBuffer == null)
                    {
                        fallbackBuffer = CreateFallbackBuffer();
                    }

                    char high = highSurrogate;
                    highSurrogate = '\0';

                    // A complete high/low pair is passed to the fallback together.
                    if (char.IsLowSurrogate(ch))
                    {
                        if (fallbackBuffer.Fallback(high, ch, charIndex - 1))
                        {
                            Drain(fallbackBuffer, bytes, ref position);
                        }

                        continue;
                    }

                    // A lone high surrogate goes to the fallback by itself,
                    // then the current char is handled normally below.
                    if (fallbackBuffer.Fallback(high, charIndex - 1))
                    {
                        Drain(fallbackBuffer, bytes, ref position);
                    }
                }

                byte b;

                if (TryEncode(ch, out b))
                {
                    if (bytes != null)
                    {
                        WriteByte(bytes, position, b);
                    }

                    position++;
                }
                else if (char.IsHighSurrogate(ch))
                {
                    // Held back until the next char is known.
                    highSurrogate = ch;
                }
                else
                {
                    if (fallbackBuffer == null)
                    {
                        fallbackBuffer = CreateFallbackBuffer();
                    }

                    if (fallbackBuffer.Fallback(ch, charIndex))
                    {
                        Drain(fallbackBuffer, bytes, ref position);
                    }
                }
            }

            // On flush a dangling high surrogate cannot be completed any more.
            if (flush && highSurrogate != '\0')
            {
                if (fallbackBuffer == null)
                {
                    fallbackBuffer = CreateFallbackBuffer();
                }

                if (fallbackBuffer.Fallback(highSurrogate, charIndex - 1))
                {
                    Drain(fallbackBuffer, bytes, ref position);
                }

                highSurrogate = '\0';
            }

            pendingHighSurrogate = highSurrogate;
            return position - byteIndex;
        }

        // Counts (bytes == null) or writes the chars waiting in the fallback buffer.
        private static void Drain(EncoderFallbackBuffer fallbackBuffer, byte[] bytes, ref int position)
        {
            if (bytes == null)
            {
                HandleFallbackCount(fallbackBuffer, ref position);
            }
            else
            {
                HandleFallbackWrite(fallbackBuffer, bytes, ref position);
            }
        }
    }

    // Then use it like:
    public static class ArmSCII8Example
    {
        public static string DecodeFromArmSCII8(string str)
        {
            // Get the original bytes back: ISO-8859-1 maps every char
            // U+0000..U+00FF to the byte with the same value.
            byte[] bytes = Encoding.GetEncoding("iso-8859-1").GetBytes(str);
            return new ArmSCII8EncodingSimple().GetString(bytes);
        }

        // and
        public static string Example()
        {
            string str = DecodeFromArmSCII8("سñïÇñáëÛ³Ý ²ÉÇݳ Ø.");
            return str;
        }
    }

    // The Encoding.GetEncoding("iso-8859-1").GetBytes(str) returns the original
    // byte[] array. Note that if you have the original byte[] you can directly
    // use the ArmSCII8EncodingSimple.
}
Unable to find csv file on ASP.NET
// Anyone who knows how to solve this error please help me. Below is my code
// for reading the csv file. When I tried to upload it showed me
// *Server Error in '/' Application. Could not find file 'C:/...csv *
// Im a beginner in c#.

// ReadCSV
string filename = FileUpload1.PostedFile.FileName;
using (CsvFileReader reader = new CsvFileReader(filename))
{
    CsvRow row = new CsvRow();
    while (reader.ReadRow(row))
    {
        // Echo every field of the row to the console and to the text box.
        foreach (string field in row)
        {
            Console.Write(field);
            Console.Write(" ");
            TextBox1.Text += field;
        }

        Console.WriteLine();
    }
}

// CSVClass
/// <summary>
/// StreamReader that splits each line it reads into comma-separated fields.
/// </summary>
public class CsvFileReader : StreamReader
{
    public CsvFileReader(Stream stream)
        : base(stream)
    {
    }

    public CsvFileReader(string filename)
        : base(filename)
    {
    }

    /// <summary>
    /// Reads the next line and stores its fields in <paramref name="row"/>.
    /// </summary>
    /// <param name="row">Receives the raw line (LineText) and the parsed fields.</param>
    /// <returns>
    /// True if at least one field was read; false at the end of the input
    /// or on an empty line.
    /// </returns>
    public bool ReadRow(CsvRow row)
    {
        row.LineText = ReadLine();
        string line = row.LineText;
        if (String.IsNullOrEmpty(line))
        {
            return false;
        }

        int cursor = 0;
        int fieldCount = 0;

        while (cursor < line.Length)
        {
            string field = line[cursor] == '"'
                ? TakeQuotedField(line, ref cursor)
                : TakePlainField(line, ref cursor);

            // Reuse an existing slot of the row when there is one.
            if (fieldCount < row.Count)
            {
                row[fieldCount] = field;
            }
            else
            {
                row.Add(field);
            }

            fieldCount++;

            // Continue just after the next comma, or stop at the end of the line.
            int comma = line.IndexOf(',', cursor);
            cursor = comma < 0 ? line.Length : comma + 1;
        }

        // Drop fields left over from a previous, longer row.
        for (int i = row.Count - 1; i >= fieldCount; i--)
        {
            row.RemoveAt(i);
        }

        return row.Count > 0;
    }

    // Reads a field that starts with a quote at line[cursor]. Leaves cursor on
    // the closing quote, or at the end of the line when there is none.
    private static string TakeQuotedField(string line, ref int cursor)
    {
        cursor++; // skip the opening quote
        int start = cursor;

        while (cursor < line.Length)
        {
            if (line[cursor] == '"')
            {
                bool doubled = cursor + 1 < line.Length && line[cursor + 1] == '"';
                if (!doubled)
                {
                    break; // closing quote
                }

                cursor++; // step over the first quote of the pair
            }

            cursor++;
        }

        // A doubled quote inside the field stands for a single quote.
        return line.Substring(start, cursor - start).Replace("\"\"", "\"");
    }

    // Reads an unquoted field; leaves cursor on the comma that ends it, or at
    // the end of the line.
    private static string TakePlainField(string line, ref int cursor)
    {
        int end = line.IndexOf(',', cursor);
        if (end < 0)
        {
            end = line.Length;
        }

        string field = line.Substring(cursor, end - cursor);
        cursor = end;
        return field;
    }
}
FileUpload1.PostedFile.FileName is the filename from your client/browser - it does not contain a path... You either use FileUpload1.PostedFile.InputStream to access it using (CsvFileReader reader = new CsvFileReader(FileUpload1.PostedFile.InputStream)) OR you first save it to disk (anywhere you have needed permissions) via FileUpload1.PostedFile.SaveAs and then access that file.
You need to save the file that is being uploaded to disk first. Something like this: string fileSavePath = Server.MapPath("/files/" + FileUpload1.PostedFile.FileName); FileUpload1.SaveAs(fileSavePath); using (CsvFileReader reader = new CsvFileReader(fileSavePath)) .... I haven't tested this code, but it should give you a starting point.
Truncate string on whole words in .NET C#
I am trying to truncate some long text in C#, but I don't want my string to be cut off part way through a word. Does anyone have a function that I can use to truncate my string at the end of a word? E.g: "This was a long string..." Not: "This was a long st..."
// Try the following. It is pretty rudimentary. Just finds the first space
// starting at the desired length.

/// <summary>
/// Cuts <paramref name="value"/> at the first space found at or after
/// position <paramref name="length"/>.
/// </summary>
/// <returns>
/// The text before that space, or <paramref name="value"/> unchanged when it
/// is null, shorter than <paramref name="length"/>, or has no such space.
/// </returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="length"/> is negative.</exception>
public static string TruncateAtWord(this string value, int length)
{
    if (value == null || value.Length < length)
    {
        return value;
    }

    // Search once; the char overload is an ordinal, culture-independent search.
    int spaceIndex = value.IndexOf(' ', length);
    return spaceIndex < 0 ? value : value.Substring(0, spaceIndex);
}
// Thanks for your answer Dave. I've tweaked the function a bit and this is
// what I'm using ... unless there are any more comments ;)

/// <summary>
/// Shortens <paramref name="input"/> to the last space at or before position
/// <paramref name="length"/> and appends an ellipsis character.
/// </summary>
/// <returns>
/// <paramref name="input"/> unchanged when it is null or not longer than
/// <paramref name="length"/>; otherwise the shortened text followed by "…".
/// When there is no usable space the text is cut at <paramref name="length"/>.
/// </returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="length"/> is negative.</exception>
public static string TruncateAtWord(this string input, int length)
{
    // "<=": a text that exactly fits the limit must not be shortened.
    if (input == null || input.Length <= length)
    {
        return input;
    }

    int lastSpace = input.LastIndexOf(" ", length, StringComparison.Ordinal);
    int cut = lastSpace > 0 ? lastSpace : length;
    return input.Substring(0, cut).Trim() + "…";
}
// My contribution:

/// <summary>
/// Trims <paramref name="text"/> and, when it is longer than
/// <paramref name="maxCharacters"/>, cuts it at the last space that still
/// leaves room for <paramref name="trailingStringIfTextCut"/>.
/// </summary>
/// <param name="text">Text to shorten; null is returned as null.</param>
/// <param name="maxCharacters">Maximum length of the result, trailing string included.</param>
/// <param name="trailingStringIfTextCut">
/// Appended when the text was cut. A value starting with "&amp;" (an HTML
/// entity such as "&amp;hellip;") is counted as one character. Null is
/// treated as an empty string.
/// </param>
/// <returns>
/// The trimmed text when it fits, the shortened text plus the trailing
/// string when it was cut, or an empty string when no space is available to
/// cut at.
/// </returns>
public static string TruncateAtWord(string text, int maxCharacters, string trailingStringIfTextCut = "…")
{
    if (text == null)
    {
        return null;
    }

    text = text.Trim();
    if (text.Length <= maxCharacters)
    {
        return text;
    }

    string trailing = trailingStringIfTextCut ?? string.Empty;
    int trailLength = trailing.StartsWith("&", StringComparison.Ordinal) ? 1 : trailing.Length;

    // Room left for the text itself once the trailing string is accounted for.
    int budget = Math.Max(maxCharacters - trailLength, 0);

    int pos = text.LastIndexOf(' ', budget);
    return pos >= 0 ? text.Substring(0, pos) + trailing : string.Empty;
}

// This is what I use in my projects, with optional trailing. Text will never
// exceed the maxCharacters + trailing text length.
If you are using windows forms, in the Graphics.DrawString method, there is an option in StringFormat to specify if the string should be truncated, if it does not fit into the area specified. This will handle adding the ellipsis as necessary. http://msdn.microsoft.com/en-us/library/system.drawing.stringtrimming.aspx
// I took your approach a little further:

/// <summary>
/// Shortens <paramref name="value"/> at a word boundary so that the result,
/// including a trailing "...", is at most <paramref name="length"/>
/// characters long.
/// </summary>
/// <returns>
/// <paramref name="value"/> unchanged when it is null or its trimmed form
/// fits; otherwise the shortened, trimmed text followed by "...". When no
/// word boundary fits the text is cut mid-word, and when
/// <paramref name="length"/> is too small for the ellipsis the first
/// <paramref name="length"/> characters are returned without one.
/// </returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="length"/> is negative.</exception>
public string TruncateAtWord(string value, int length)
{
    const string Ellipsis = "...";

    if (value == null)
    {
        return value;
    }

    // All indexes below refer to the trimmed text, so it is also the text
    // that gets cut (leading blanks would otherwise shift the cut position).
    string trimmed = value.Trim();
    if (trimmed.Length <= length)
    {
        return value;
    }

    if (length < 0)
    {
        throw new ArgumentOutOfRangeException("length");
    }

    if (length < Ellipsis.Length)
    {
        return trimmed.Substring(0, length);
    }

    // Walk back one word at a time until the text plus the ellipsis fits.
    int index = trimmed.LastIndexOf(' ');
    while (index + Ellipsis.Length > length)
    {
        index = trimmed.Substring(0, index).Trim().LastIndexOf(' ');
    }

    if (index > 0)
    {
        return trimmed.Substring(0, index) + Ellipsis;
    }

    return trimmed.Substring(0, length - Ellipsis.Length) + Ellipsis;
}

// I'm using this to truncate tweets.
This solution works too (takes first 10 words from myString): String.Join(" ", myString.Split(' ').Take(10))
// Taking into account more than just a blank space separator (e.g. words can
// be separated by periods followed by newlines, followed by tabs, etc.), and
// several other edge cases, here is an appropriate extension method:

/// <summary>
/// Returns at most the first <paramref name="maxWords"/> words of
/// <paramref name="input"/>, followed by <paramref name="truncateWith"/> when
/// words were left out.
/// </summary>
/// <param name="input">Text to shorten.</param>
/// <param name="maxWords">Number of words to keep.</param>
/// <param name="truncateWith">Marker appended when the text was cut; null means none.</param>
/// <param name="additionalSeparators">
/// Characters that separate words in addition to white space; null means none.
/// </param>
/// <returns>
/// The shortened text, or an empty string when <paramref name="input"/> is
/// null or empty or <paramref name="maxWords"/> is not positive.
/// </returns>
public static string GetMaxWords(this string input, int maxWords, string truncateWith = "...", string additionalSeparators = ",-_:")
{
    if (string.IsNullOrEmpty(input) || maxWords <= 0)
    {
        return string.Empty;
    }

    string suffix = truncateWith ?? string.Empty;
    string extraSeparators = additionalSeparators ?? string.Empty;

    // char.IsWhiteSpace covers everything char.IsSeparator does, plus tabs
    // and line breaks.
    bool IsSeparator(char c) => char.IsWhiteSpace(c) || extraSeparators.IndexOf(c) >= 0;

    var result = new System.Text.StringBuilder(input.Length + suffix.Length);
    int words = 1;
    result.Append(input[0]);

    for (int i = 1; i < input.Length; i++)
    {
        // A word ends where a separator follows a non-separator.
        bool wordEnded = IsSeparator(input[i]) && !IsSeparator(input[i - 1]);
        if (wordEnded)
        {
            if (words == maxWords)
            {
                result.Append(suffix);
                break;
            }

            words++;
        }

        result.Append(input[i]);
    }

    return result.ToString();
}
// Simplified, added a truncation-character option and made it an extension.

/// <summary>
/// Shortens <paramref name="value"/> at the last space or comma so that the
/// result, including a trailing "...", is at most
/// <paramref name="maxLength"/> characters long.
/// </summary>
/// <returns>
/// <paramref name="value"/> unchanged when it is null or its trimmed form
/// fits; otherwise the shortened, trimmed text followed by "...". When no
/// break character fits the text is cut mid-word, and when
/// <paramref name="maxLength"/> is too small for the ellipsis the first
/// <paramref name="maxLength"/> characters are returned without one.
/// </returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="maxLength"/> is negative.</exception>
public static string TruncateAtWord(this string value, int maxLength)
{
    const string Ellipsis = "...";
    char[] truncateChars = { ' ', ',' };

    if (value == null)
    {
        return value;
    }

    // All indexes below refer to the trimmed text, so it is also the text
    // that gets cut (leading blanks would otherwise shift the cut position).
    string trimmed = value.Trim();
    if (trimmed.Length <= maxLength)
    {
        return value;
    }

    if (maxLength < 0)
    {
        throw new ArgumentOutOfRangeException("maxLength");
    }

    if (maxLength < Ellipsis.Length)
    {
        return trimmed.Substring(0, maxLength);
    }

    // Walk back one break character at a time until text plus ellipsis fits.
    int index = trimmed.LastIndexOfAny(truncateChars);
    while (index + Ellipsis.Length > maxLength)
    {
        index = trimmed.Substring(0, index).Trim().LastIndexOfAny(truncateChars);
    }

    if (index > 0)
    {
        return trimmed.Substring(0, index) + Ellipsis;
    }

    return trimmed.Substring(0, maxLength - Ellipsis.Length) + Ellipsis;
}
// Heres what i came up with. This is to get the rest of the sentence also in
// chunks.

/// <summary>
/// Splits <paramref name="originalString"/> at word boundaries into chunks
/// of at most <paramref name="length"/> characters. Every chunk except the
/// last ends with "..." (counted in its length); a word that is too long
/// for one chunk is cut mid-word.
/// </summary>
/// <returns>
/// The chunks in order. The input is returned as the only element when it is
/// null, when its trimmed form already fits, or when
/// <paramref name="length"/> is too small to hold the ellipsis plus at least
/// one character.
/// </returns>
public static List<string> SplitTheSentenceAtWord(this string originalString, int length)
{
    const string Ellipsis = "...";
    var chunks = new List<string>();

    // With length <= 3 a chunk could not carry any text next to the
    // ellipsis, so splitting would never make progress.
    if (originalString == null
        || length <= Ellipsis.Length
        || originalString.Trim().Length <= length)
    {
        chunks.Add(originalString);
        return chunks;
    }

    // Leading blanks are dropped so that positions found in the trimmed
    // text are valid positions in the text being cut.
    string remaining = originalString.TrimStart();

    while (remaining.TrimEnd().Length > length)
    {
        // Walk back one word at a time until the text plus the ellipsis fits.
        int cut = remaining.TrimEnd().LastIndexOf(' ');
        while (cut + Ellipsis.Length > length)
        {
            cut = remaining.Substring(0, cut).Trim().LastIndexOf(' ');
        }

        if (cut <= 0)
        {
            // No word boundary fits: cut inside the word.
            cut = length - Ellipsis.Length;
        }

        chunks.Add(remaining.Substring(0, cut) + Ellipsis);
        remaining = remaining.Substring(cut).TrimStart();
    }

    chunks.Add(remaining);
    return chunks;
}
// I use this

/// <summary>
/// Cuts <paramref name="content"/> at the first space found at or after
/// position <paramref name="length"/> and appends "...".
/// </summary>
/// <returns>
/// The shortened text followed by "...", or <paramref name="content"/>
/// unchanged when it is null, when <paramref name="length"/> lies outside
/// the text, or when there is no space at or after that position.
/// </returns>
public string Truncate(string content, int length)
{
    // The cases that used to be handled by catching exceptions.
    if (content == null || length < 0 || length > content.Length)
    {
        return content;
    }

    int spaceIndex = content.IndexOf(' ', length);
    if (spaceIndex < 0)
    {
        return content;
    }

    return content.Substring(0, spaceIndex) + "...";
}