I'm trying to convert some strings that are in French Canadian and basically, I'd like to be able to take out the French accent marks in the letters while keeping the letter. (E.g. convert é to e, so crème brûlée would become creme brulee)
What is the best method for achieving this?
I've not used this method, but Michael Kaplan describes a method for doing so in his blog post (with a confusing title) that talks about stripping diacritics: "Stripping is an interesting job (aka On the meaning of meaningless, aka All Mn characters are non-spacing, but some are more non-spacing than others)"
/// <summary>
/// Strips combining accent marks from <paramref name="text"/> while keeping the base letters.
/// </summary>
/// <param name="text">The text to process.</param>
/// <returns>The text without non-spacing marks, recomposed to normalization form C.</returns>
static string RemoveDiacritics(string text)
{
    // Form D separates each accented letter into its base letter plus combining marks.
    string decomposed = text.Normalize(NormalizationForm.FormD);
    var kept = new StringBuilder(capacity: decomposed.Length);

    foreach (char ch in decomposed)
    {
        if (CharUnicodeInfo.GetUnicodeCategory(ch) == UnicodeCategory.NonSpacingMark)
        {
            // A combining mark: leave it out of the result.
            continue;
        }

        kept.Append(ch);
    }

    string stripped = kept.ToString();
    return stripped.Normalize(NormalizationForm.FormC);
}
Note that this is a followup to his earlier post: Stripping diacritics....
The approach uses String.Normalize to split the input string into constituent glyphs (basically separating the "base" characters from the diacritics) and then scans the result and retains only the base characters. It's just a little complicated, but really you're looking at a complicated problem.
Of course, if you're limiting yourself to French, you could probably get away with the simple table-based approach in How to remove accents and tilde in a C++ std::string, as recommended by @David Dibben.
this did the trick for me...
// Input text whose accents should be removed.
// NOTE(review): never assigned in this snippet — the caller must give it a value before the GetBytes call below.
string accentedStr;
// Receives the bytes produced by the ISO-8859-8 encoder.
byte[] tempBytes;
// Encode through the "ISO-8859-8" code page. Presumably the encoder substitutes unaccented
// base letters for accented characters it cannot represent — TODO confirm on the target runtime.
tempBytes = System.Text.Encoding.GetEncoding("ISO-8859-8").GetBytes(accentedStr);
// Decode the resulting bytes back into a string using UTF-8.
string asciiStr = System.Text.Encoding.UTF8.GetString(tempBytes);
quick&short!
I needed something that converts all major Unicode characters, and the voted answer left a few out, so I've ported CodeIgniter's convert_accented_characters($str) to C# in a form that is easily customisable:
using System;
using System.Text;
using System.Collections.Generic;
public static class Strings
{
static Dictionary<string, string> foreign_characters = new Dictionary<string, string>
{
{ "äæǽ", "ae" },
{ "öœ", "oe" },
{ "ü", "ue" },
{ "Ä", "Ae" },
{ "Ü", "Ue" },
{ "Ö", "Oe" },
{ "ÀÁÂÃÄÅǺĀĂĄǍΑΆẢẠẦẪẨẬẰẮẴẲẶА", "A" },
{ "àáâãåǻāăąǎªαάảạầấẫẩậằắẵẳặа", "a" },
{ "Б", "B" },
{ "б", "b" },
{ "ÇĆĈĊČ", "C" },
{ "çćĉċč", "c" },
{ "Д", "D" },
{ "д", "d" },
{ "ÐĎĐΔ", "Dj" },
{ "ðďđδ", "dj" },
{ "ÈÉÊËĒĔĖĘĚΕΈẼẺẸỀẾỄỂỆЕЭ", "E" },
{ "èéêëēĕėęěέεẽẻẹềếễểệеэ", "e" },
{ "Ф", "F" },
{ "ф", "f" },
{ "ĜĞĠĢΓГҐ", "G" },
{ "ĝğġģγгґ", "g" },
{ "ĤĦ", "H" },
{ "ĥħ", "h" },
{ "ÌÍÎÏĨĪĬǏĮİΗΉΊΙΪỈỊИЫ", "I" },
{ "ìíîïĩīĭǐįıηήίιϊỉịиыї", "i" },
{ "Ĵ", "J" },
{ "ĵ", "j" },
{ "ĶΚК", "K" },
{ "ķκк", "k" },
{ "ĹĻĽĿŁΛЛ", "L" },
{ "ĺļľŀłλл", "l" },
{ "М", "M" },
{ "м", "m" },
{ "ÑŃŅŇΝН", "N" },
{ "ñńņňʼnνн", "n" },
{ "ÒÓÔÕŌŎǑŐƠØǾΟΌΩΏỎỌỒỐỖỔỘỜỚỠỞỢО", "O" },
{ "òóôõōŏǒőơøǿºοόωώỏọồốỗổộờớỡởợо", "o" },
{ "П", "P" },
{ "п", "p" },
{ "ŔŖŘΡР", "R" },
{ "ŕŗřρр", "r" },
{ "ŚŜŞȘŠΣС", "S" },
{ "śŝşșšſσςс", "s" },
{ "ȚŢŤŦτТ", "T" },
{ "țţťŧт", "t" },
{ "ÙÚÛŨŪŬŮŰŲƯǓǕǗǙǛŨỦỤỪỨỮỬỰУ", "U" },
{ "ùúûũūŭůűųưǔǖǘǚǜυύϋủụừứữửựу", "u" },
{ "ÝŸŶΥΎΫỲỸỶỴЙ", "Y" },
{ "ýÿŷỳỹỷỵй", "y" },
{ "В", "V" },
{ "в", "v" },
{ "Ŵ", "W" },
{ "ŵ", "w" },
{ "ŹŻŽΖЗ", "Z" },
{ "źżžζз", "z" },
{ "ÆǼ", "AE" },
{ "ß", "ss" },
{ "IJ", "IJ" },
{ "ij", "ij" },
{ "Œ", "OE" },
{ "ƒ", "f" },
{ "ξ", "ks" },
{ "π", "p" },
{ "β", "v" },
{ "μ", "m" },
{ "ψ", "ps" },
{ "Ё", "Yo" },
{ "ё", "yo" },
{ "Є", "Ye" },
{ "є", "ye" },
{ "Ї", "Yi" },
{ "Ж", "Zh" },
{ "ж", "zh" },
{ "Х", "Kh" },
{ "х", "kh" },
{ "Ц", "Ts" },
{ "ц", "ts" },
{ "Ч", "Ch" },
{ "ч", "ch" },
{ "Ш", "Sh" },
{ "ш", "sh" },
{ "Щ", "Shch" },
{ "щ", "shch" },
{ "ЪъЬь", "" },
{ "Ю", "Yu" },
{ "ю", "yu" },
{ "Я", "Ya" },
{ "я", "ya" },
};
/// <summary>
/// Maps a single character to the first character of its replacement in <c>foreign_characters</c>.
/// </summary>
/// <param name="c">The character to convert.</param>
/// <returns>
/// The first character of the matching replacement string, or <paramref name="c"/> itself when
/// no entry lists it or when the matching entry maps to an empty string.
/// </returns>
public static char RemoveDiacritics(this char c)
{
    foreach (KeyValuePair<string, string> entry in foreign_characters)
    {
        if (entry.Key.IndexOf(c) == -1)
        {
            continue;
        }

        // Some entries map to "" (e.g. the hard/soft signs). A char cannot be "nothing",
        // so return the input unchanged instead of indexing into an empty string.
        return entry.Value.Length > 0 ? entry.Value[0] : c;
    }

    return c;
}
/// <summary>
/// Replaces every character of <paramref name="s"/> that is listed in <c>foreign_characters</c>
/// with its replacement string; all other characters are copied through unchanged.
/// </summary>
/// <param name="s">The text to convert.</param>
/// <returns>The converted text.</returns>
public static string RemoveDiacritics(this string s)
{
    var builder = new StringBuilder(s.Length);

    foreach (char c in s)
    {
        // null means "no entry lists this character"; an empty string is a valid
        // replacement meaning "drop the character", so the two must be told apart.
        string replacement = null;

        foreach (KeyValuePair<string, string> entry in foreign_characters)
        {
            if (entry.Key.IndexOf(c) != -1)
            {
                replacement = entry.Value;
                break;
            }
        }

        if (replacement == null)
        {
            builder.Append(c);
        }
        else
        {
            builder.Append(replacement);
        }
    }

    return builder.ToString();
}
}
Usage
// for strings
"crème brûlée".RemoveDiacritics (); // creme brulee
// for chars
"Ã"[0].RemoveDiacritics (); // A
In case someone is interested, I was looking for something similar and ended writing the following:
/// <summary>
/// Builds a URL-friendly token from <paramref name="name"/>: letters and digits are kept
/// (without their accents), spaces, connectors and dashes become single underscores, and
/// every other character is dropped.
/// </summary>
/// <param name="name">The text to convert.</param>
/// <returns>The converted text with no leading, trailing or repeated underscores.</returns>
public static string NormalizeStringForUrl(string name)
{
    var collected = new StringBuilder();

    // After decomposition the accents are separate characters that fall into neither
    // group below, so they are discarded.
    foreach (char ch in name.Normalize(NormalizationForm.FormD))
    {
        UnicodeCategory category = CharUnicodeInfo.GetUnicodeCategory(ch);

        if (category == UnicodeCategory.LowercaseLetter
            || category == UnicodeCategory.UppercaseLetter
            || category == UnicodeCategory.DecimalDigitNumber)
        {
            collected.Append(ch);
        }
        else if (category == UnicodeCategory.SpaceSeparator
            || category == UnicodeCategory.ConnectorPunctuation
            || category == UnicodeCategory.DashPunctuation)
        {
            collected.Append('_');
        }
    }

    // Splitting with RemoveEmptyEntries and re-joining collapses runs of underscores
    // and trims them from both ends.
    string[] segments = collected.ToString().Split(new char[] { '_' }, StringSplitOptions.RemoveEmptyEntries);
    return String.Join("_", segments);
}
The accepted answer is totally correct, but nowadays it should be updated to use the Rune class instead of CharUnicodeInfo, as C# and .NET have changed the way strings are analysed in recent versions (the Rune class was added in .NET Core 3.0).
The following code for .NET 5+ is now recommended, as it goes further for non-Latin characters:
/// <summary>
/// Strips non-spacing marks from <paramref name="text"/>, examining it one
/// <see cref="Rune"/> at a time.
/// </summary>
/// <param name="text">The text to process.</param>
/// <returns>The text without non-spacing marks, recomposed to normalization form C.</returns>
static string RemoveDiacritics(string text)
{
    var output = new StringBuilder();

    foreach (Rune rune in text.Normalize(NormalizationForm.FormD).EnumerateRunes())
    {
        if (Rune.GetUnicodeCategory(rune) == UnicodeCategory.NonSpacingMark)
        {
            continue;
        }

        output.Append(rune);
    }

    string stripped = output.ToString();
    return stripped.Normalize(NormalizationForm.FormC);
}
I often use an extension method based on another version I found here
(see Replacing characters in C# (ascii))
A quick explanation:
Normalizing to form D splits characters like è into an e and a non-spacing `
From this, the non-spacing characters are removed
The result is normalized back to form C (I'm not sure if this is necessary)
Code:
using System.Linq;
using System.Text;
using System.Globalization;
// namespace here
/// <summary>
/// Two equivalent extension methods that strip non-spacing marks from a string.
/// </summary>
public static class Utility
{
    /// <summary>
    /// Removes non-spacing marks from <paramref name="str"/>.
    /// </summary>
    /// <param name="str">The text to process; may be null.</param>
    /// <returns>Null for null input; otherwise the stripped text in normalization form C.</returns>
    public static string RemoveDiacritics(this string str)
    {
        if (str == null)
        {
            return null;
        }

        string decomposed = str.Normalize(NormalizationForm.FormD);
        char[] kept = decomposed
            .Where(ch => CharUnicodeInfo.GetUnicodeCategory(ch) != UnicodeCategory.NonSpacingMark)
            .ToArray();

        return new string(kept).Normalize(NormalizationForm.FormC);
    }

    /// <summary>
    /// Same contract as <see cref="RemoveDiacritics"/>, written as an explicit loop.
    /// </summary>
    /// <param name="str">The text to process; may be null.</param>
    /// <returns>Null for null input; otherwise the stripped text in normalization form C.</returns>
    public static string RemoveDiacritics2(this string str)
    {
        if (str == null)
        {
            return null;
        }

        var kept = new StringBuilder();
        foreach (char ch in str.Normalize(NormalizationForm.FormD))
        {
            if (CharUnicodeInfo.GetUnicodeCategory(ch) != UnicodeCategory.NonSpacingMark)
            {
                kept.Append(ch);
            }
        }

        return kept.ToString().Normalize(NormalizationForm.FormC);
    }
}
The CodePage of Greek (ISO) can do it
The information about this codepage is into System.Text.Encoding.GetEncodings(). Learn about in: https://msdn.microsoft.com/pt-br/library/system.text.encodinginfo.getencoding(v=vs.110).aspx
Greek (ISO) has codepage 28597 and name iso-8859-7.
Go to the code... \o/
string text = "Você está numa situação lamentável";
string textEncode = System.Web.HttpUtility.UrlEncode(text, Encoding.GetEncoding("iso-8859-7"));
//result: "Voce+esta+numa+situacao+lamentavel"
string textDecode = System.Web.HttpUtility.UrlDecode(textEncode);
//result: "Voce esta numa situacao lamentavel"
So, write this function...
/// <summary>
/// Round-trips <paramref name="text"/> through URL encoding with the "iso-8859-7" encoding
/// and then URL-decodes the result.
/// </summary>
/// <param name="text">The text to process.</param>
/// <returns>The URL-decoded form of the iso-8859-7 URL-encoded text.</returns>
public string RemoveAcentuation(string text)
{
    // NOTE(review): the accent removal presumably comes from the encoder substituting
    // base letters for characters iso-8859-7 cannot represent — confirm on the target runtime.
    Encoding greekIso = Encoding.GetEncoding("iso-8859-7");
    string urlEncoded = System.Web.HttpUtility.UrlEncode(text, greekIso);
    return System.Web.HttpUtility.UrlDecode(urlEncoded);
}
Note that... Encoding.GetEncoding("iso-8859-7") is equivalent to Encoding.GetEncoding(28597) because first is the name, and second the codepage of Encoding.
TL;DR - C# string extension method
I think the best solution to preserve the meaning of the string is to convert the characters instead of stripping them, which is well illustrated in the example crème brûlée to crme brle vs. creme brulee.
I checked out Alexander's comment above and saw the Lucene.Net code is Apache 2.0 licensed, so I've modified the class into a simple string extension method. You can use it like this:
var originalString = "crème brûlée";
var maxLength = originalString.Length; // limit output length as necessary
var foldedString = originalString.FoldToASCII(maxLength);
// "creme brulee"
The function is too long to post in a StackOverflow answer (~139k characters of 30k allowed lol) so I made a gist and attributed the authors:
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// This class converts alphabetic, numeric, and symbolic Unicode characters
/// which are not in the first 127 ASCII characters (the "Basic Latin" Unicode
/// block) into their ASCII equivalents, if one exists.
/// <para/>
/// Characters from the following Unicode blocks are converted; however, only
/// those characters with reasonable ASCII alternatives are converted:
///
/// <ul>
/// <item><description>C1 Controls and Latin-1 Supplement: http://www.unicode.org/charts/PDF/U0080.pdf</description></item>
/// <item><description>Latin Extended-A: http://www.unicode.org/charts/PDF/U0100.pdf</description></item>
/// <item><description>Latin Extended-B: http://www.unicode.org/charts/PDF/U0180.pdf</description></item>
/// <item><description>Latin Extended Additional: http://www.unicode.org/charts/PDF/U1E00.pdf</description></item>
/// <item><description>Latin Extended-C: http://www.unicode.org/charts/PDF/U2C60.pdf</description></item>
/// <item><description>Latin Extended-D: http://www.unicode.org/charts/PDF/UA720.pdf</description></item>
/// <item><description>IPA Extensions: http://www.unicode.org/charts/PDF/U0250.pdf</description></item>
/// <item><description>Phonetic Extensions: http://www.unicode.org/charts/PDF/U1D00.pdf</description></item>
/// <item><description>Phonetic Extensions Supplement: http://www.unicode.org/charts/PDF/U1D80.pdf</description></item>
/// <item><description>General Punctuation: http://www.unicode.org/charts/PDF/U2000.pdf</description></item>
/// <item><description>Superscripts and Subscripts: http://www.unicode.org/charts/PDF/U2070.pdf</description></item>
/// <item><description>Enclosed Alphanumerics: http://www.unicode.org/charts/PDF/U2460.pdf</description></item>
/// <item><description>Dingbats: http://www.unicode.org/charts/PDF/U2700.pdf</description></item>
/// <item><description>Supplemental Punctuation: http://www.unicode.org/charts/PDF/U2E00.pdf</description></item>
/// <item><description>Alphabetic Presentation Forms: http://www.unicode.org/charts/PDF/UFB00.pdf</description></item>
/// <item><description>Halfwidth and Fullwidth Forms: http://www.unicode.org/charts/PDF/UFF00.pdf</description></item>
/// </ul>
/// <para/>
/// See: http://en.wikipedia.org/wiki/Latin_characters_in_Unicode
/// <para/>
/// For example, 'à' will be replaced by 'a'.
/// </summary>
public static partial class StringExtensions
{
/// <summary>
/// Converts characters above ASCII to their ASCII equivalents. For example,
/// accents are removed from accented characters.
/// </summary>
/// <remarks>
/// The implementation is not included here; the method body only points to the gist that holds it.
/// </remarks>
/// <param name="input"> The string of characters to fold </param>
/// <param name="length"> Optional length of the folded return string; defaults to <c>null</c>.
/// NOTE(review): the exact handling of this value lives in the linked gist — confirm there. </param>
/// <returns> The folded string </returns>
public static string FoldToASCII(this string input, int? length = null)
{
// See https://gist.github.com/andyraddatz/e6a396fb91856174d4e3f1bf2e10951c
}
}
Hope that helps someone else, this is the most robust solution I've found!
It's funny that such a question can get so many answers, and yet none fit my requirements :) There are so many languages around that a fully language-agnostic solution is AFAIK not really possible, as others have mentioned that FormC or FormD give issues.
Since the original question was related to French, the simplest working answer is indeed
/// <summary>
/// Encodes <paramref name="str"/> with code page 1251 and decodes the resulting bytes as ASCII.
/// </summary>
/// <param name="str">The text to convert.</param>
/// <returns>The ASCII decoding of the code-page-1251 bytes.</returns>
public static string ConvertWesternEuropeanToASCII(this string str)
{
    // NOTE(review): 1251 is, as far as I know, the Windows Cyrillic code page rather than a
    // Western European one; the accent stripping presumably relies on the encoder substituting
    // base letters for characters that page lacks — confirm before changing the number.
    byte[] codePageBytes = Encoding.GetEncoding(1251).GetBytes(str);
    return Encoding.ASCII.GetString(codePageBytes);
}
1251 should be replaced by the encoding code of the input language.
This, however, only replaces one character with one character. Since I am also working with German as input, I wrote a manual conversion
/// <summary>
/// Expands German umlauts and the sharp s into their two-letter ASCII spellings
/// (ä→ae, ö→oe, ü→ue, Ä→Ae, Ö→Oe, Ü→Ue, ß→ss); every other character is copied unchanged.
/// </summary>
/// <param name="str">The text to convert.</param>
/// <returns>The converted text.</returns>
public static string LatinizeGermanCharacters(this string str)
{
    // Position i in `specials` corresponds to expansions[i].
    // Order: ä ö ü Ä Ö Ü ß
    const string specials = "\u00E4\u00F6\u00FC\u00C4\u00D6\u00DC\u00DF";
    string[] expansions = { "ae", "oe", "ue", "Ae", "Oe", "Ue", "ss" };

    StringBuilder result = new StringBuilder(str.Length);
    foreach (char ch in str)
    {
        int slot = specials.IndexOf(ch);
        if (slot < 0)
        {
            result.Append(ch);
        }
        else
        {
            result.Append(expansions[slot]);
        }
    }

    return result.ToString();
}
It might not deliver the best performance, but at least it is very easy to read and extend.
Regex is a NO GO, much slower than any char/string stuff.
I also have a very simple method to remove space:
/// <summary>
/// Removes every space character (" ") from <paramref name="str"/>.
/// </summary>
/// <param name="str">The text to process.</param>
/// <returns>The text without spaces.</returns>
public static string RemoveSpace(this string str) => str.Replace(" ", string.Empty);
Eventually, I am using a combination of all 3 above extensions:
/// <summary>
/// Runs <c>LatinizeGermanCharacters</c> and then <c>ConvertWesternEuropeanToASCII</c> on
/// <paramref name="str"/>, optionally stripping spaces from the result.
/// </summary>
/// <param name="str">The text to convert.</param>
/// <param name="keepSpace">When false (the default), spaces are removed with <c>RemoveSpace</c>.</param>
/// <returns>The converted text.</returns>
public static string LatinizeAndConvertToASCII(this string str, bool keepSpace = false)
{
    string ascii = str.LatinizeGermanCharacters().ConvertWesternEuropeanToASCII();
    if (keepSpace)
    {
        return ascii;
    }

    return ascii.RemoveSpace();
}
And a small unit test to that (not exhaustive) which pass successfully.
/// <summary>
/// Checks that a mixed French/German sample is converted to the expected ASCII text
/// with spaces removed (the default for <c>LatinizeAndConvertToASCII</c>).
/// </summary>
[TestMethod()]
public void LatinizeAndConvertToASCIITest()
{
    const string sample = "Bonjour ça va? C'est l'été! Ich möchte ä Ä á à â ê é è ë Ë É ï Ï î í ì ó ò ô ö Ö Ü ü ù ú û Û ý Ý ç Ç ñ Ñ";
    const string wanted = "Bonjourcava?C'estl'ete!IchmoechteaeAeaaaeeeeEEiIiiiooooeOeUeueuuuUyYcCnN";

    Assert.AreEqual(wanted, sample.LatinizeAndConvertToASCII());
}
Same as accepted answer but faster, using Span instead of StringBuilder.
Requires .NET Core 3.1 or newer .NET.
/// <summary>
/// Strips non-spacing marks from <paramref name="text"/> using a span buffer instead of a
/// <see cref="StringBuilder"/>.
/// </summary>
/// <param name="text">The text to process.</param>
/// <returns>The text without non-spacing marks, recomposed to normalization form C.</returns>
static string RemoveDiacritics(string text)
{
    ReadOnlySpan<char> normalizedString = text.Normalize(NormalizationForm.FormD);

    // Size the buffer from the decomposed text, not the input: decomposition can yield more
    // non-mark characters than the input had chars, which would overflow a text.Length buffer.
    // Below 1000 chars the buffer is stack-allocated; larger inputs use the heap.
    Span<char> buffer = normalizedString.Length < 1000
        ? stackalloc char[normalizedString.Length]
        : new char[normalizedString.Length];

    int written = 0;
    foreach (char c in normalizedString)
    {
        if (CharUnicodeInfo.GetUnicodeCategory(c) != UnicodeCategory.NonSpacingMark)
        {
            buffer[written++] = c;
        }
    }

    // Only the first `written` chars hold output; using the whole buffer would append
    // stray '\0' characters whenever marks were removed.
    return new string(buffer.Slice(0, written)).Normalize(NormalizationForm.FormC);
}
Also this is extensible for additional character replacements e.g. for polish Ł.
span[i++] = c switch
{
'Ł' => 'L',
'ł' => 'l',
_ => c
};
A small note: stack allocation (stackalloc) is faster than heap allocation (new), and it creates less work for the garbage collector. 1000 is a threshold to avoid allocating large structures on the stack, which may cause a StackOverflowException. While 1000 is a pretty safe value, in most cases 10000 or even 100000 would also work (100k chars allocates up to 200 kB on the stack, while the default stack size is 1 MB). However, 100k looks a bit dangerous to me.
THIS IS THE VB VERSION (Works with GREEK) :
Imports System.Text
Imports System.Globalization
''' <summary>
''' Strips non-spacing marks from <paramref name="s"/> after decomposing it to form D.
''' The result is not recomposed to form C.
''' </summary>
''' <param name="s">The text to process.</param>
''' <returns>The text without non-spacing marks.</returns>
Public Function RemoveDiacritics(ByVal s As String)
    Dim kept As New StringBuilder
    For Each ch As Char In s.Normalize(NormalizationForm.FormD)
        ' Combining marks are left out; everything else is copied.
        If CharUnicodeInfo.GetUnicodeCategory(ch) <> UnicodeCategory.NonSpacingMark Then
            kept.Append(ch)
        End If
    Next
    Return kept.ToString()
End Function
This is how I replace diacritic characters with non-diacritic ones in all my .NET programs
C#:
//Transforms the culture of a letter to its equivalent representation in the 0-127 ascii table, such as the letter 'é' is substituted by an 'e'
/// <summary>
/// Strips non-spacing marks from <paramref name="s"/> and lower-cases the result.
/// </summary>
/// <param name="s">The text to process.</param>
/// <returns>The lower-cased text without non-spacing marks (left in decomposed form).</returns>
public string RemoveDiacritics(string s)
{
    var kept = new StringBuilder();

    foreach (char ch in s.Normalize(NormalizationForm.FormD))
    {
        if (CharUnicodeInfo.GetUnicodeCategory(ch) == UnicodeCategory.NonSpacingMark)
        {
            continue;
        }

        kept.Append(ch);
    }

    // Note: besides removing accents, the result is also converted to lower case.
    return kept.ToString().ToLower();
}
VB .NET:
'Transforms the culture of a letter to its equivalent representation in the 0-127 ascii table, such as the letter "é" is substituted by an "e"'
''' <summary>
''' Strips non-spacing marks from <paramref name="s"/> and lower-cases the result.
''' </summary>
''' <param name="s">The text to process.</param>
''' <returns>The lower-cased text without non-spacing marks.</returns>
Public Function RemoveDiacritics(ByVal s As String) As String
    Dim kept As New StringBuilder
    For Each ch As Char In s.Normalize(NormalizationForm.FormD)
        If CharUnicodeInfo.GetUnicodeCategory(ch) <> UnicodeCategory.NonSpacingMark Then
            kept.Append(ch)
        End If
    Next
    ' Besides removing accents, the result is also converted to lower case.
    Return kept.ToString().ToLower()
End Function
Popping this Library here if you haven't already considered it. Looks like there are a full range of unit tests with it.
https://github.com/thomasgalliker/Diacritics.NET
For simply removing French Canadian accent marks as the original question asked, here's an alternate method that uses a regular expression instead of hardcoded conversions and For/Next loops. Depending on your needs, it could be condensed into a single line of code; however, I added it to an extensions class for easier reusability.
Visual Basic
Imports System.Text
Imports System.Text.RegularExpressions
''' <summary>
''' Holds a regex-based helper that strips non-spacing marks from text.
''' </summary>
Public MustInherit Class StringExtension
    ''' <summary>
    ''' Decomposes <paramref name="Text"/> to form D and deletes every character matching \p{Mn}.
    ''' </summary>
    ''' <param name="Text">The text to process.</param>
    ''' <returns>The decomposed text with all \p{Mn} matches removed.</returns>
    Public Shared Function RemoveDiacritics(Text As String) As String
        Dim nonSpacingMarks As New Regex("\p{Mn}", RegexOptions.Compiled)
        Dim decomposed As String = Text.Normalize(NormalizationForm.FormD)
        Return nonSpacingMarks.Replace(decomposed, String.Empty)
    End Function
End Class
Implementation
Private Shared Sub DoStuff()
MsgBox(StringExtension.RemoveDiacritics(inputString))
End Sub
c#
using System.Text;
using System.Text.RegularExpressions;
namespace YourApplication
{
    /// <summary>
    /// Holds a regex-based helper that strips non-spacing marks from text.
    /// </summary>
    public abstract class StringExtension
    {
        // Matches every character in Unicode category Mn (non-spacing mark).
        // Built once and reused, since the pattern never changes.
        private static readonly Regex NonSpacingMarks = new Regex(@"\p{Mn}", RegexOptions.Compiled);

        /// <summary>
        /// Decomposes <paramref name="Text"/> to form D and deletes every non-spacing mark.
        /// </summary>
        /// <param name="Text">The text to process.</param>
        /// <returns>The decomposed text with all non-spacing marks removed.</returns>
        public static string RemoveDiacritics(string Text)
        {
            return NonSpacingMarks.Replace(Text.Normalize(NormalizationForm.FormD), string.Empty);
        }
    }
}
Implementation
private static void DoStuff()
{
MessageBox.Show(StringExtension.RemoveDiacritics(inputString));
}
Input: äáčďěéíľľňôóřŕšťúůýž ÄÁČĎĚÉÍĽĽŇÔÓŘŔŠŤÚŮÝŽ ÖÜË łŁđĐ ţŢşŞçÇ øı
Output: aacdeeillnoorrstuuyz AACDEEILLNOORRSTUUYZ OUE łŁđĐ tTsScC øı
I included characters that wouldn't be converted to help visualize what happens when unexpected input is received.
If you need it to also convert other types of characters such as the Polish ł and Ł, then depending on your needs, consider incorporating this answer (.NET Core friendly) that uses CodePagesEncodingProvider into your solution.
Try HelperSharp package.
There is a method RemoveAccents:
/// <summary>
/// Encodes <paramref name="source"/> with code page 1251, decodes the bytes as ASCII, and
/// replaces every match of the pattern <c>[^a-zA-Z0-9]=-_/</c> with a single space.
/// </summary>
/// <param name="source">The text to process.</param>
/// <returns>The processed text.</returns>
public static string RemoveAccents(this string source)
{
    // 8-bit characters, then 7-bit characters.
    byte[] codePageBytes = Encoding.GetEncoding(1251).GetBytes(source);
    string sevenBit = Encoding.ASCII.GetString(codePageBytes);

    // NOTE(review): as written, the pattern matches one non-alphanumeric character followed
    // by the literal text "=-_/", so it rarely matches anything. A single character class
    // such as [^a-zA-Z0-9=\-_/] may have been intended — confirm before changing.
    return Regex.Replace(sevenBit, "[^a-zA-Z0-9]=-_/", " ");
}
you can use string extension from MMLib.Extensions nuget package:
using MMLib.RapidPrototyping.Generators;
/// <summary>
/// Example: the <c>RemoveDiacritics</c> extension turns "aácčeéií" into "aacceeii".
/// </summary>
public void ExtensionsExample()
{
    string stripped = "aácčeéií".RemoveDiacritics();
    Assert.AreEqual("aacceeii", stripped);
}
Nuget page: https://www.nuget.org/packages/MMLib.Extensions/
Codeplex project site https://mmlib.codeplex.com/
Imports System.Text
Imports System.Globalization
''' <summary>
''' Strips non-spacing marks from <paramref name="x"/> after decomposing it to form D.
''' </summary>
''' <param name="x">The text to process.</param>
''' <returns>The text without non-spacing marks.</returns>
Public Function DECODE(ByVal x As String) As String
    Dim kept As New StringBuilder
    For Each ch As Char In x.Normalize(NormalizationForm.FormD)
        If CharUnicodeInfo.GetUnicodeCategory(ch) <> UnicodeCategory.NonSpacingMark Then
            kept.Append(ch)
        End If
    Next
    Return kept.ToString()
End Function
What this person said:
Encoding.ASCII.GetString(Encoding.GetEncoding(1251).GetBytes(text));
It actually splits the likes of å, which is one character (character code 00E5, not 0061 plus the modifier 030A, which would look the same), into a plus some kind of modifier, and then the ASCII conversion removes the modifier, leaving only the a.
I really like the concise and functional code provided by azrafe7.
So, I have changed it a little bit to convert it to an extension method:
/// <summary>
/// String extension that removes diacritics through an encoding round trip.
/// </summary>
public static class StringExtensions
{
    /// <summary>
    /// Encodes <paramref name="text"/> with the "ISO-8859-8" encoding and decodes the bytes as ASCII.
    /// </summary>
    /// <param name="text">The text to process; may be null or empty.</param>
    /// <returns>An empty string for null or empty input; otherwise the ASCII decoding of the encoded bytes.</returns>
    public static string RemoveDiacritics(this string text)
    {
        if (string.IsNullOrEmpty(text))
        {
            return string.Empty;
        }

        // NOTE(review): the accent removal presumably relies on the ISO-8859-8 encoder
        // substituting base letters for accented ones it cannot represent — confirm on the target runtime.
        Encoding singleByteEncoding = Encoding.GetEncoding("ISO-8859-8");
        byte[] encoded = singleByteEncoding.GetBytes(text);
        return Encoding.ASCII.GetString(encoded);
    }
}
For anyone who finds Lucene.Net as an overkill for removing diacritics, I managed to find this small library, that utilize ASCII transliteration for you.
https://github.com/anyascii/anyascii
This code worked for me:
var updatedText = text.Normalize(NormalizationForm.FormD)
.Where(c => CharUnicodeInfo.GetUnicodeCategory(c) != UnicodeCategory.NonSpacingMark)
.ToArray();
However, please don't do this with names. It's not only an insult to people with umlauts/accents in their name, it can also be dangerously wrong in certain situations (see below). There are alternative writings instead of just removing the accent.
Furthermore, it's simply wrong and dangerous, e.g. if the user has to provide his name exactly how it occurs on the passport.
For example my name is written Zuberbühler and in the machine readable part of my passport you will find Zuberbuehler. By removing the umlaut, the name will not match with either part. This can lead to issues for the users.
You should rather disallow umlauts/accent in an input form for names so the user can write his name correctly without its umlaut or accent.
Practical example, if the web service to apply for ESTA (https://www.application-esta.co.uk/special-characters-and) would use above code instead of transforming umlauts correctly, the ESTA application would either be refused or the traveller will have problems with the American Border Control when entering the States.
Another example would be flight tickets. Assuming you have a flight ticket booking web application, the user provides his name with an accent and your implementation is just removing the accents and then using the airline's web service to book the ticket! Your customer may not be allowed to board since the name does not match to any part of his/her passport.
Not having enough reputations, apparently I can not comment on Alexander's excellent link. - Lucene appears to be the only solution working in reasonably generic cases.
For those wanting a simple copy-paste solution, here it is, leveraging code in Lucene:
string testbed = "ÁÂÄÅÇÉÍÎÓÖØÚÜÞàáâãäåæçèéêëìíîïðñóôöøúüāăčĐęğıŁłńŌōřŞşšźžșțệủ";
Console.WriteLine(Lucene.latinizeLucene(testbed));
AAAACEIIOOOUUTHaaaaaaaeceeeeiiiidnoooouuaacDegiLlnOorSsszzsteu
//////////
public static class Lucene
{
// source: https://raw.githubusercontent.com/apache/lucenenet/master/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/ASCIIFoldingFilter.cs
// idea: https://stackoverflow.com/questions/249087/how-do-i-remove-diacritics-accents-from-a-string-in-net (scroll down, search for lucene by Alexander)
/// <summary>
/// Folds <paramref name="arg"/> by passing its characters through <c>latinizeLuceneImpl</c>.
/// </summary>
/// <param name="arg">The text to fold.</param>
/// <returns>The first <c>n</c> characters of the output buffer, where <c>n</c> is the length returned by <c>latinizeLuceneImpl</c>.</returns>
public static string latinizeLucene(string arg)
{
    char[] source = arg.ToCharArray();

    // latinizeLuceneImpl can expand one char up to four chars - e.g. Þ to TH, or æ to ae, or in fact ⑽ to (10)
    char[] folded = new String(' ', arg.Length * 4).ToCharArray();

    int foldedLength = Lucene.latinizeLuceneImpl(source, 0, ref folded, 0, arg.Length);

    // Only the first foldedLength characters were written by the fold.
    return new string(folded, 0, foldedLength);
}
/// <summary>
/// Converts characters above ASCII to their ASCII equivalents. For example,
/// accents are removed from accented characters.
/// <para/>
/// @lucene.internal
/// </summary>
/// <param name="input"> The characters to fold </param>
/// <param name="inputPos"> Index of the first character to fold </param>
/// <param name="output"> The result of the folding. Should be of size >= <c>length * 4</c>. </param>
/// <param name="outputPos"> Index of output where to put the result of the folding </param>
/// <param name="length"> The number of characters to fold </param>
/// <returns> length of output </returns>
private static int latinizeLuceneImpl(char[] input, int inputPos, ref char[] output, int outputPos, int length)
{
int end = inputPos + length;
for (int pos = inputPos; pos < end; ++pos)
{
char c = input[pos];
// Quick test: if it's not in range then just keep current character
if (c < '\u0080')
{
output[outputPos++] = c;
}
else
{
switch (c)
{
case '\u00C0': // À [LATIN CAPITAL LETTER A WITH GRAVE]
case '\u00C1': // Á [LATIN CAPITAL LETTER A WITH ACUTE]
case '\u00C2': // Â [LATIN CAPITAL LETTER A WITH CIRCUMFLEX]
case '\u00C3': // Ã [LATIN CAPITAL LETTER A WITH TILDE]
case '\u00C4': // Ä [LATIN CAPITAL LETTER A WITH DIAERESIS]
case '\u00C5': // Å [LATIN CAPITAL LETTER A WITH RING ABOVE]
case '\u0100': // Ā [LATIN CAPITAL LETTER A WITH MACRON]
case '\u0102': // Ă [LATIN CAPITAL LETTER A WITH BREVE]
case '\u0104': // Ą [LATIN CAPITAL LETTER A WITH OGONEK]
case '\u018F': // Ə http://en.wikipedia.org/wiki/Schwa [LATIN CAPITAL LETTER SCHWA]
case '\u01CD': // Ǎ [LATIN CAPITAL LETTER A WITH CARON]
case '\u01DE': // Ǟ [LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON]
case '\u01E0': // Ǡ [LATIN CAPITAL LETTER A WITH DOT ABOVE AND MACRON]
case '\u01FA': // Ǻ [LATIN CAPITAL LETTER A WITH RING ABOVE AND ACUTE]
case '\u0200': // Ȁ [LATIN CAPITAL LETTER A WITH DOUBLE GRAVE]
case '\u0202': // Ȃ [LATIN CAPITAL LETTER A WITH INVERTED BREVE]
case '\u0226': // Ȧ [LATIN CAPITAL LETTER A WITH DOT ABOVE]
case '\u023A': // Ⱥ [LATIN CAPITAL LETTER A WITH STROKE]
case '\u1D00': // ᴀ [LATIN LETTER SMALL CAPITAL A]
case '\u1E00': // Ḁ [LATIN CAPITAL LETTER A WITH RING BELOW]
case '\u1EA0': // Ạ [LATIN CAPITAL LETTER A WITH DOT BELOW]
case '\u1EA2': // Ả [LATIN CAPITAL LETTER A WITH HOOK ABOVE]
case '\u1EA4': // Ấ [LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND ACUTE]
case '\u1EA6': // Ầ [LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND GRAVE]
case '\u1EA8': // Ẩ [LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE]
case '\u1EAA': // Ẫ [LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND TILDE]
case '\u1EAC': // Ậ [LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND DOT BELOW]
case '\u1EAE': // Ắ [LATIN CAPITAL LETTER A WITH BREVE AND ACUTE]
case '\u1EB0': // Ằ [LATIN CAPITAL LETTER A WITH BREVE AND GRAVE]
case '\u1EB2': // Ẳ [LATIN CAPITAL LETTER A WITH BREVE AND HOOK ABOVE]
case '\u1EB4': // Ẵ [LATIN CAPITAL LETTER A WITH BREVE AND TILDE]
case '\u1EB6': // Ặ [LATIN CAPITAL LETTER A WITH BREVE AND DOT BELOW]
case '\u24B6': // Ⓐ [CIRCLED LATIN CAPITAL LETTER A]
case '\uFF21': // A [FULLWIDTH LATIN CAPITAL LETTER A]
output[outputPos++] = 'A';
break;
case '\u00E0': // à [LATIN SMALL LETTER A WITH GRAVE]
case '\u00E1': // á [LATIN SMALL LETTER A WITH ACUTE]
case '\u00E2': // â [LATIN SMALL LETTER A WITH CIRCUMFLEX]
case '\u00E3': // ã [LATIN SMALL LETTER A WITH TILDE]
case '\u00E4': // ä [LATIN SMALL LETTER A WITH DIAERESIS]
case '\u00E5': // å [LATIN SMALL LETTER A WITH RING ABOVE]
case '\u0101': // ā [LATIN SMALL LETTER A WITH MACRON]
case '\u0103': // ă [LATIN SMALL LETTER A WITH BREVE]
case '\u0105': // ą [LATIN SMALL LETTER A WITH OGONEK]
case '\u01CE': // ǎ [LATIN SMALL LETTER A WITH CARON]
case '\u01DF': // ǟ [LATIN SMALL LETTER A WITH DIAERESIS AND MACRON]
case '\u01E1': // ǡ [LATIN SMALL LETTER A WITH DOT ABOVE AND MACRON]
case '\u01FB': // ǻ [LATIN SMALL LETTER A WITH RING ABOVE AND ACUTE]
case '\u0201': // ȁ [LATIN SMALL LETTER A WITH DOUBLE GRAVE]
case '\u0203': // ȃ [LATIN SMALL LETTER A WITH INVERTED BREVE]
case '\u0227': // ȧ [LATIN SMALL LETTER A WITH DOT ABOVE]
case '\u0250': // ɐ [LATIN SMALL LETTER TURNED A]
case '\u0259': // ə [LATIN SMALL LETTER SCHWA]
case '\u025A': // ɚ [LATIN SMALL LETTER SCHWA WITH HOOK]
case '\u1D8F': // ᶏ [LATIN SMALL LETTER A WITH RETROFLEX HOOK]
case '\u1D95': // ᶕ [LATIN SMALL LETTER SCHWA WITH RETROFLEX HOOK]
case '\u1E01': // ạ [LATIN SMALL LETTER A WITH RING BELOW]
case '\u1E9A': // ả [LATIN SMALL LETTER A WITH RIGHT HALF RING]
case '\u1EA1': // ạ [LATIN SMALL LETTER A WITH DOT BELOW]
case '\u1EA3': // ả [LATIN SMALL LETTER A WITH HOOK ABOVE]
case '\u1EA5': // ấ [LATIN SMALL LETTER A WITH CIRCUMFLEX AND ACUTE]
case '\u1EA7': // ầ [LATIN SMALL LETTER A WITH CIRCUMFLEX AND GRAVE]
case '\u1EA9': // ẩ [LATIN SMALL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE]
case '\u1EAB': // ẫ [LATIN SMALL LETTER A WITH CIRCUMFLEX AND TILDE]
case '\u1EAD': // ậ [LATIN SMALL LETTER A WITH CIRCUMFLEX AND DOT BELOW]
case '\u1EAF': // ắ [LATIN SMALL LETTER A WITH BREVE AND ACUTE]
case '\u1EB1': // ằ [LATIN SMALL LETTER A WITH BREVE AND GRAVE]
case '\u1EB3': // ẳ [LATIN SMALL LETTER A WITH BREVE AND HOOK ABOVE]
case '\u1EB5': // ẵ [LATIN SMALL LETTER A WITH BREVE AND TILDE]
case '\u1EB7': // ặ [LATIN SMALL LETTER A WITH BREVE AND DOT BELOW]
case '\u2090': // ₐ [LATIN SUBSCRIPT SMALL LETTER A]
case '\u2094': // ₔ [LATIN SUBSCRIPT SMALL LETTER SCHWA]
case '\u24D0': // ⓐ [CIRCLED LATIN SMALL LETTER A]
case '\u2C65': // ⱥ [LATIN SMALL LETTER A WITH STROKE]
case '\u2C6F': // Ɐ [LATIN CAPITAL LETTER TURNED A]
case '\uFF41': // a [FULLWIDTH LATIN SMALL LETTER A]
output[outputPos++] = 'a';
break;
case '\uA732': // Ꜳ [LATIN CAPITAL LETTER AA]
output[outputPos++] = 'A';
output[outputPos++] = 'A';
break;
case '\u00C6': // Æ [LATIN CAPITAL LETTER AE]
case '\u01E2': // Ǣ [LATIN CAPITAL LETTER AE WITH MACRON]
case '\u01FC': // Ǽ [LATIN CAPITAL LETTER AE WITH ACUTE]
case '\u1D01': // ᴁ [LATIN LETTER SMALL CAPITAL AE]
output[outputPos++] = 'A';
output[outputPos++] = 'E';
break;
case '\uA734': // Ꜵ [LATIN CAPITAL LETTER AO]
output[outputPos++] = 'A';
output[outputPos++] = 'O';
break;
case '\uA736': // Ꜷ [LATIN CAPITAL LETTER AU]
output[outputPos++] = 'A';
output[outputPos++] = 'U';
break;
// etc. etc. etc.
// see link above for complete source code
//
// unfortunately, postings are limited, as in
// "Body is limited to 30000 characters; you entered 136098."
[...]
case '\u2053': // ⁓ [SWUNG DASH]
case '\uFF5E': // ~ [FULLWIDTH TILDE]
output[outputPos++] = '~';
break;
default:
output[outputPos++] = c;
break;
}
}
}
return outputPos;
}
}
Related
Closed. This question needs details or clarity. It is not currently accepting answers.
Want to improve this question? Add details and clarify the problem by editing this post.
Closed 10 months ago.
Improve this question
I want to replace a Unicode decimal character reference with the normal or special character it represents,
for example &#201; to É.
I was able to solve it with this: HttpUtility.HtmlDecode("PRUEBA05 JIM&#201;NEZ")
It sounds like you want to replace accented characters such as ö or é with their unaccented ASCII equivalents, or decompose ligatures such as æ into their constituent letters (ae).
To do that, you need to play with Unicode normalization forms to decompose things, strip out the bits you don't want, and put it all back together.
You can read up on Unicode normalization forms at
Unicode® Standard Annex #15: Unicode Normalization Forms
And about C#/.Net's support at
String.Normalize()
Here are a couple of extension methods I wrote some time back to do exactly what it sounds like you want to do.
Note that this was written to handle the sort of text that our application encounters. If you need to go further afield in Unicode than Western European texts, you'll probably need to make some changes.
using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Text;
using Common.Utilities.Private;
namespace Common.Utilities
{
/// <summary>
/// String helpers for producing kebab-case text and for stripping diacritical marks.
/// </summary>
public static class ExtensionMethods
{
    /// <summary>
    /// Converts <paramref name="s"/> to lower-case kebab-case.
    /// </summary>
    /// <param name="s">The text to convert; may be null.</param>
    /// <param name="allowAccentedCharacters">
    /// When false (the default) the result is additionally passed through
    /// <see cref="RemoveDiacritics"/>; when true accented letters are kept.
    /// </param>
    /// <returns>The converted text, or null when <paramref name="s"/> is null.</returns>
    public static string ToKebabCase( this string s, bool allowAccentedCharacters = false )
    {
        if ( s == null )
        {
            return null;
        }

        // NOTE(review): Skewer is declared outside this excerpt; presumably it inserts the
        // '-' separators between words -- confirm against its definition.
        string transformed = Skewer(s);

        if ( !allowAccentedCharacters )
        {
            transformed = RemoveDiacritics(transformed);
        }

        // Lower-case with the invariant culture so the result does not depend on the current
        // thread's culture (e.g. under tr-TR, ToLower() turns 'I' into the dotless 'ı').
        return transformed == null ? null : transformed.ToLowerInvariant();
    }

    /// <summary>
    /// Removes diacritical marks from <paramref name="s"/> while keeping the base letters,
    /// e.g. "crème brûlée" becomes "creme brulee".
    /// </summary>
    /// <param name="s">The text to process; may be null.</param>
    /// <returns>
    /// The text without non-spacing marks, recomposed to Normalization Form C,
    /// or null when <paramref name="s"/> is null.
    /// </returns>
    public static string RemoveDiacritics( string s )
    {
        if ( s == null )
        {
            return null;
        }

        // Canonical decomposition (Form D) splits each precomposed accented letter into its
        // base letter followed by one or more combining marks.
        //
        // Letters such as 'Æ', 'Ø' or 'ß' have no decomposition at all, so normalization alone
        // leaves them untouched; those are mapped explicitly by TransliterateSpecials() below.
        string canonicalDecomposition = s.Normalize( NormalizationForm.FormD );

        char[] buf = canonicalDecomposition
            // strip out the combining diacritical marks
            .Where( c => char.GetUnicodeCategory(c) != UnicodeCategory.NonSpacingMark )
            // replace the special letters that do not decompose
            .TransliterateSpecials()
            .ToArray();

        // put it back together using canonical composition
        return new string(buf).Normalize( NormalizationForm.FormC );
    }
}
/// <summary>
/// Private LINQ-style extensions, kept in their own namespace so we don't add them to every string and other IEnumerable of char that's out there.
/// </summary>
namespace Private
{
/// <summary>
/// Sequence helpers for transliterating special Latin letters to basic ASCII.
/// </summary>
public static class Extensions
{
    /// <summary>
    /// Transliterates certain special characters (see the mapping table below) to their
    /// common basic ASCII equivalents. Characters that are not in the table pass through
    /// unchanged. The sequence is produced lazily.
    /// </summary>
    /// <param name="chars">The characters to process; null is treated as an empty sequence.</param>
    /// <returns>The transliterated characters; one input character may yield several.</returns>
    public static IEnumerable<char> TransliterateSpecials( this IEnumerable<char> chars )
    {
        foreach ( char ch in chars ?? Enumerable.Empty<char>() )
        {
            string replacement;
            if ( specialAccentedChars.TryGetValue( ch , out replacement ) )
            {
                // A single letter can expand to more than one character (e.g. 'Æ' -> "AE").
                foreach ( char r in replacement )
                {
                    yield return r;
                }
            }
            else
            {
                yield return ch;
            }
        }
    }

    /// <summary>
    /// Derived from analysis of the Unicode Character Database v12.1.0.
    ///
    /// These are the "accented" letters in the blocks 'Latin-1 Supplement' and 'Latin Extended-A'
    /// that have no further decomposition defined even though there are common transliterations
    /// and substitutions for these letters (see the table below).
    ///
    /// Why 'Latin-1 Supplement' and 'Latin Extended-A', you might ask? These blocks:
    ///
    /// * Basic Latin (ASCII)
    /// * Latin-1 Supplement
    /// * Latin Extended-A
    ///
    /// cover pretty much any text we're likely to see, that is, most Western European scripts.
    /// Latin Extended-B, -C, etc. are deliberately out of scope for this table.
    ///
    /// Basic Latin (ASCII) covers code points 0x0000 - 0x007F and doesn't have any special letters to worry about.
    ///
    /// Latin-1 Supplement covers the remainder of the 8-bit range (0x0080 - 0x00FF: most common accented characters), and
    /// Latin Extended-A brings in most letters found in Eastern European languages (Latvian, Lithuanian, Croatian, etc.)
    ///
    /// Note that we are extremely unlikely to ever encounter some of these letters:
    ///
    /// * The letter 'Kra', for instance, "is a glyph formerly used to write the Kalaallisut language of Greenland and is
    ///   now only found in Nunatsiavummiutut, a distinct Inuktitut dialect."
    ///
    /// * And the letter 'Eng'? It represents the sound 'ng' as in "singing". It's used by the Washo language, spoken by the Washoe,
    ///   a [small] Native American tribe on the California/Nevada border.
    ///
    /// Each replacement keeps the case of the letter it replaces.
    /// </summary>
    private static readonly Dictionary<char, string> specialAccentedChars = new Dictionary<char, string>
    {
        { '\u00C6' , "AE" } , // LATIN CAPITAL LETTER AE
        { '\u00D0' , "TH" } , // LATIN CAPITAL LETTER ETH
        { '\u00D8' , "O"  } , // LATIN CAPITAL LETTER O WITH STROKE
        { '\u00DE' , "TH" } , // LATIN CAPITAL LETTER THORN
        { '\u00DF' , "ss" } , // LATIN SMALL LETTER SHARP S
        { '\u00E6' , "ae" } , // LATIN SMALL LETTER AE
        { '\u00F0' , "th" } , // LATIN SMALL LETTER ETH
        { '\u00F8' , "o"  } , // LATIN SMALL LETTER O WITH STROKE
        { '\u00FE' , "th" } , // LATIN SMALL LETTER THORN
        { '\u0110' , "D"  } , // LATIN CAPITAL LETTER D WITH STROKE
        { '\u0111' , "d"  } , // LATIN SMALL LETTER D WITH STROKE
        { '\u0126' , "H"  } , // LATIN CAPITAL LETTER H WITH STROKE
        { '\u0127' , "h"  } , // LATIN SMALL LETTER H WITH STROKE
        { '\u0131' , "i"  } , // LATIN SMALL LETTER DOTLESS I
        { '\u0138' , "q"  } , // LATIN SMALL LETTER KRA
        { '\u0141' , "L"  } , // LATIN CAPITAL LETTER L WITH STROKE
        { '\u0142' , "l"  } , // LATIN SMALL LETTER L WITH STROKE
        { '\u014A' , "N"  } , // LATIN CAPITAL LETTER ENG
        { '\u014B' , "n"  } , // LATIN SMALL LETTER ENG
        { '\u0152' , "OE" } , // LATIN CAPITAL LIGATURE OE
        { '\u0153' , "oe" } , // LATIN SMALL LIGATURE OE
        { '\u0166' , "T"  } , // LATIN CAPITAL LETTER T WITH STROKE
        { '\u0167' , "t"  } , // LATIN SMALL LETTER T WITH STROKE
    };
}
}
}
This is a C# question. I want to make my program accept nothing but uppercase letters. I have managed to make it reject lowercase letters but I don't know how to make it reject numbers and other characters. Thank you for your help!
#region Question3
/* Write an application named EnterUppercaseLetters that asks the user
* to type an uppercase letter from the keyboard. If the character entered
* is an uppercase letter, display OK; if it isn't an uppercase letter,
* display an error message. The program continues until the user types an exclamation point.
*/
/// <summary>
/// Repeatedly prompts for an uppercase letter. Prints "<letter> OK" when the line entered
/// is exactly one uppercase letter and "ERROR!" for anything else (lowercase letters,
/// digits, punctuation, empty lines, or more than one character).
/// The loop ends when the user enters "!" or when the input stream is closed.
/// </summary>
static void EnterUppercaseLetters()
{
    while (true)
    {
        Console.Write("Enter an uppercase letter: ");
        string input = Console.ReadLine();

        // ReadLine returns null once the input stream is exhausted; treat that like "!"
        // instead of crashing or prompting forever.
        if (input == null || input == "!")
        {
            break;
        }

        // char.IsUpper is true only for uppercase letters. Comparing against
        // Char.ToUpper(letter) would also accept digits and symbols, because
        // ToUpper leaves them unchanged.
        if (input.Length == 1 && char.IsUpper(input[0]))
        {
            Console.WriteLine($"{input[0]} OK");
        }
        else
        {
            Console.WriteLine("ERROR!");
        }
    }

    Console.WriteLine();
}
#endregion
I think you can use a regex for that.
Regex.IsMatch(input, @"^[A-Z]+$");
This returns true only when the whole string consists of capital letters A–Z, so a false result means the input contains some other character.
@Moe - what you've got is good ... but maybe you'd prefer not entering the whole line. You can try something like this instead:
/// <summary>
/// Reads single keystrokes until '!' is pressed and reports, for each one,
/// whether it is an uppercase letter.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    Console.WriteLine("Enter an UPPER CASE character, or '!' to exit: ");

    char ch;
    // KeyChar is a property of the ConsoleKeyInfo returned by ReadKey, not a method.
    while ((ch = Console.ReadKey().KeyChar) != '!')
    {
        if (char.IsUpper(ch))
        {
            Console.WriteLine("Input character is {0}: OK", ch);
        }
        else
        {
            Console.WriteLine("Input character is {0}: ERROR!", ch);
        }
    }
}
This will respond immediately when you enter ANY keystroke. It will exit immediately when you type "!". And it will print "OK" (for an upper case character) or "ERROR!" otherwise.
Relevant documentation (for our peripatetic friend Jeppe Stig Nielsen):
Console.ReadKey()
Char.IsUpper()
One possibility is:
char.GetUnicodeCategory(letter) != UnicodeCategory.UppercaseLetter
Note that Convert.ToChar will throw an unhandled exception if the user types an empty string or a string with length exceeding one. You may consider checking the string length, or using Console.ReadKey instead of Console.ReadLine.
We have a use case where we need to validate a Uid with the following constraint:
A Uid of length 5 in the form annnn, where a is any letter other than A, B, D and E, and n is any digit from 0 to 9.
So I have created a new method which validates this, and it is working as expected.
/// <summary>
/// Validates a Uid of length 5 in the form annnn, where 'a' is any ASCII letter other than
/// the capital letters A, B, D and E, and each 'n' is a digit from 0 to 9.
/// </summary>
/// <param name="value">The candidate Uid; may be null.</param>
/// <returns>True when <paramref name="value"/> matches the format; otherwise false.</returns>
private static bool IsValidUid( string value )
{
    if (value == null || value.Length != 5)
    {
        return false;
    }

    char firstChar = value[0];

    // The first position must actually be a letter; without this check "12345" would pass.
    bool isAsciiLetter = (firstChar >= 'a' && firstChar <= 'z')
                      || (firstChar >= 'A' && firstChar <= 'Z');
    if (!isAsciiLetter)
    {
        return false;
    }

    switch (firstChar)
    {
        case 'A':
        case 'B':
        case 'D':
        case 'E':
            return false;
    }

    // Check every remaining character individually. int.TryParse is too lenient here:
    // it also accepts a sign and surrounding whitespace, e.g. "-123" or " 123".
    for (int i = 1; i < value.Length; i++)
    {
        if (value[i] < '0' || value[i] > '9')
        {
            return false;
        }
    }

    return true;
}
But it could be effectively done via regular expression. Since I am bad with it, I need help to create one for me! Thanks in advance!!
where ‘a’ is any letter other than A, B, D and E.
So in this case you need to specify the set of letters that are allowed
[a-zF-ZC] => meaning all letters are in the range a-z and F-Z should be matched, also C should be matched
And 'n' is any number from 0 to 9.
\d matches a digit or you can use [0-9]
of lenght 5 in form annnnn
\d{4} - the quantifier {4} specifies how many times the preceding element (here a digit) must be matched
One last thing has to be done. You need to specify, that a is supposed to be at the beginning of the string and that the string ends after the 5-th character:
^ denotes the start of a string
$ denotes the end of a string.
So combined it would look like this:
string pattern = @"^[a-zF-ZC]\d{4}$";
or using the range for the digits:
string pattern = @"^[a-zF-ZC][0-9]{4}$";
Here is an overview of the regex patterns with examples and explanations
Here is a site where you can test and try out your regex pattern. Fiddle around with it to get to know it. It helps
EDIT:
The [...] denotes a set of characters that will be matched. A range is denoted using the - sign, as in a-z, A-F or 5-9. Any other character can simply be put into the set:
[abcdefgxyz] will match only those specified letters!
The set and the ranges can be combined
[CF-SX-Z] will match C or all capital letters between F and S and between X and Z. I hope it get more clear now
I have a class like:
/// <summary>
/// Sample type used to illustrate how property names are camel-cased during JSON serialization.
/// </summary>
public class MyClass
{
    /// <summary>Numeric identifier.</summary>
    public int ID { get; set; }

    /// <summary>Name whose first two letters are both upper-case.</summary>
    public string CName { get; set; }

    /// <summary>Name made of two Pascal-cased words.</summary>
    public string FirstName { get; set; }
}
When I use Newtonsoft's camel-case naming to convert this class to JSON, I get something like this:
{
id:1,
cname:xxx,
firstName:xxx
}
Why isn't ID converted to iD and CName to cName? What is the exact rule for camel case? I expected:
{
iD:1,
cName:xxx,
firstName:xxx
}
The first word of camelcase is all lowercase. Hence, ID becomes id, and CName becomes cname. After that, each additional word has only the first letter capitalized, hence name becomes Name. That is to say that Newtonsoft treats ID and CName as single words, not multiple words.
This is the method used to convert characters to camelcase in Newtonsoft. As you can see, it contains little logic for parsing a string into individual words. The code simply assumes that the first word in uppercase ends (1) after the second letter and (2) when the code finds either a space or an uppercase letter followed by a lowercase letter.
/// <summary>
/// Camel-cases <paramref name="s"/> by lower-casing its leading run of upper-case characters.
/// </summary>
/// <param name="s">The text to convert; may be null or empty.</param>
/// <returns>
/// <paramref name="s"/> itself when it is null, empty, or does not start with an upper-case
/// character; otherwise a new string with the leading upper-case run lower-cased.
/// </returns>
public static string ToCamelCase(string s)
{
    if (string.IsNullOrEmpty(s))
    {
        return s;
    }

    if (!char.IsUpper(s[0]))
    {
        return s;
    }

    char[] buffer = s.ToCharArray();
    int last = buffer.Length - 1;

    for (int pos = 0; pos <= last; pos++)
    {
        // Only the first character is upper-case: it has been lowered already, so stop.
        if (pos == 1 && !char.IsUpper(buffer[pos]))
        {
            break;
        }

        // True when the character after this one exists and is not upper-case,
        // i.e. the leading upper-case run ends here.
        bool runEndsHere = pos > 0 && pos < last && !char.IsUpper(buffer[pos + 1]);

        if (!runEndsHere)
        {
            // NOTE(review): ToLower is a helper declared outside this excerpt.
            buffer[pos] = ToLower(buffer[pos]);
            continue;
        }

        // A separator (such as a space) is not upper-case either, but in that case the
        // current character still belongs to the first word and must be lowered, so that
        // 'FOO bar' is rewritten as 'foo bar' and not as 'foO bar'.
        if (char.IsSeparator(buffer[pos + 1]))
        {
            buffer[pos] = ToLower(buffer[pos]);
        }

        break;
    }

    return new string(buffer);
}
Is there some class in C# that would behave like string and will allow me to store custom metadata (tags) for some characters/words?
For example, for the string "Example string" I might want to record that the capital letter E is a capital letter, and I might want to tag the type of some letters (say, vowels). Then I might want to call .Replace or .Trim on the string, and the result should still contain the 'tags' for the unchanged letters in the sequence.
Is something like that possible in C#/.NET without having to write all the logic myself?
The answer to "is there anything built-in" is no.
However, you can create your own class to deal with whatever metadata info structure you want and expose the string value as an inner property. You also can use it to override the ToString() method. This way you'll be able to pass around your object and still work with its string equivalent.
By overloading operators you'll also be able to do casting and comparisons with regular string instances.
just for the fun of it!
to run this quickly:
Download the always amazing LinqPad
Open it and select Language C# Program
paste the code below
press Alt + X
/// <summary>
/// LINQPad entry point: classifies every character of a sample string and prints the result.
/// </summary>
void Main()
{
    List<LetterExplanation> explained = Explain("Example, string");
    OutputExplanation(explained);
}
/// <summary>
/// Builds one "The letter X is ..." line per entry and sends the combined text to Dump().
/// </summary>
/// <param name="input">The classified characters, in the order they should be printed.</param>
private void OutputExplanation(List<LetterExplanation> input)
{
    var report = new StringBuilder();

    for (int i = 0; i < input.Count; i++)
    {
        LetterExplanation entry = input[i];
        report.Append(string.Format("The letter {0} is {1}\n", entry.Letter, entry.Type));
    }

    string text = report.ToString();
    text.Dump();
}
/// <summary>
/// Classifies every character of <paramref name="input"/>.
/// </summary>
/// <remarks>
/// Every entry carries <c>Character</c>. In addition:
/// digits get <c>Number</c>; letters get <c>Vowel</c> or <c>Consonant</c> plus
/// <c>Uppercase</c> or <c>Lowercase</c>; everything else (whitespace, punctuation,
/// symbols) gets <c>Special</c>.
/// </remarks>
/// <param name="input">The text to classify; null is treated like an empty string.</param>
/// <returns>One entry per character of <paramref name="input"/>, in order.</returns>
private List<LetterExplanation> Explain(string input)
{
    var explanations = new List<LetterExplanation>();
    if (string.IsNullOrEmpty(input))
    {
        return explanations;
    }

    foreach (char c in input)
    {
        LetterType type = LetterType.Character;

        if (char.IsDigit(c))
        {
            // digits are neither vowels nor consonants and have no case
            type |= LetterType.Number;
        }
        else if (!char.IsLetter(c))
        {
            // whitespace, punctuation and any other symbol
            type |= LetterType.Special;
        }
        else
        {
            // Ordinal lookup: a culture-sensitive IndexOf treats ignorable characters as a
            // match at index 0, which would wrongly report them as vowels.
            type |= "aeiouAEIOU".IndexOf(c) >= 0 ? LetterType.Vowel : LetterType.Consonant;

            // letters that are not upper-case (including caseless ones) count as lower-case
            type |= char.IsUpper(c) ? LetterType.Uppercase : LetterType.Lowercase;
        }

        explanations.Add(new LetterExplanation() { Letter = c, Type = type });
    }

    return explanations;
}
/// <summary>
/// Bit flags describing a single character; several flags may be combined.
/// </summary>
[Flags]
public enum LetterType
{
    Vowel     = 0x01,
    Consonant = 0x02,
    Uppercase = 0x04,
    Lowercase = 0x08,
    Number    = 0x10,
    Special   = 0x20,
    Character = 0x40
}
/// <summary>
/// Pairs a single character with the flags that describe it.
/// </summary>
public class LetterExplanation
{
    /// <summary>The character being described.</summary>
    public char Letter
    {
        get;
        set;
    }

    /// <summary>The combination of flags assigned to <see cref="Letter"/>.</summary>
    public LetterType Type
    {
        get;
        set;
    }
}
you will have an output of:
The letter E is Vowel, Uppercase, Character
The letter x is Consonant, Lowercase, Character
The letter a is Vowel, Lowercase, Character
The letter m is Consonant, Lowercase, Character
The letter p is Consonant, Lowercase, Character
The letter l is Consonant, Lowercase, Character
The letter e is Vowel, Lowercase, Character
The letter , is Special, Character
The letter is Special, Character
The letter s is Consonant, Lowercase, Character
The letter t is Consonant, Lowercase, Character
The letter r is Consonant, Lowercase, Character
The letter i is Vowel, Lowercase, Character
The letter n is Consonant, Lowercase, Character
The letter g is Consonant, Lowercase, Character