I would like to write C# code that parses nested parentheses into array elements, but only at the first level. An example will make this clearer:
I want this string:
"(example (to (parsing nested paren) but) (first lvl only))"
to be parsed into:
["example", "(to (parsing nested paren) but)", "(first lvl only)"]
I was thinking about using regex but can't figure out how to properly use them without implementing this behaviour from scratch.
In the case of malformed inputs I would like to return an empty array, or an array ["error"]
I developed a parser for your example. I also checked some other examples which you can see in the code.
using System;
using System.Collections;
using System.Collections.Generic;
// Console driver for Parser: parses one sample string and prints the
// resulting elements as "[a , b , c]".
public class Program
{
    // Other inputs to try, with the output annotated by the original author:
    //   "(first)(second)(third)"                   => [first , second , third]
    //   "(first(second)third)"                     => [first , (second) , third]
    //   "(first(second)(third)fourth)"             => [first , (second) , (third) , fourth]
    //   "(first((second)(third))fourth)"           => [first , ((second)(third)) , fourth]
    //   "just Text"                                => [ERROR]
    //   "start with Text (first , second)"         => [ERROR]
    //   "(first , second) end with text"           => [ERROR]
    //   ""                                         => [ERROR]
    //   "("                                        => [ERROR]
    //   "(first()(second)(third))fourth)"          => [ERROR]
    //   "(((extra close pareanthese))))"           => [ERROR]
    public static void Main()
    {
        // => [example , (to (parsing nested paren) but) , (first lvl only)]
        string input = "(example (to (parsing nested paren) but) (first lvl only))";
        showRes(Parser.parse(input));
    }

    // Writes the elements of res to the console, separated by " , " and
    // wrapped in square brackets.
    static void showRes(ArrayList res)
    {
        object[] items = res.ToArray();
        string joined = string.Join(" , ", items);
        Console.WriteLine("[{0}]", joined);
    }
}
// Splits a parenthesised expression into its first-level elements.
// Nested groups are returned verbatim (including their own parentheses);
// element text is not trimmed. Any malformed input yields the single-element
// list ["ERROR"].
public class Parser
{
    // Transition table, built once instead of once per token.
    // Key: type of the current token. Value: set of token types that are
    // allowed to come immediately before it.
    static readonly Dictionary<TokenType, TokenType> s_rules = getRules();

    static Dictionary<TokenType, TokenType> getRules()
    {
        var rules = new Dictionary<TokenType, TokenType>();
        rules.Add(TokenType.OPEN_PARENTHESE, TokenType.START | TokenType.OPEN_PARENTHESE | TokenType.CLOSE_PARENTHESE | TokenType.SIMPLE_TEXT);
        rules.Add(TokenType.CLOSE_PARENTHESE, TokenType.SIMPLE_TEXT | TokenType.CLOSE_PARENTHESE);
        rules.Add(TokenType.SIMPLE_TEXT, TokenType.SIMPLE_TEXT | TokenType.CLOSE_PARENTHESE | TokenType.OPEN_PARENTHESE);
        rules.Add(TokenType.END, TokenType.CLOSE_PARENTHESE);
        return rules;
    }

    // True when cur is allowed to follow prev according to the transition table.
    static bool isValid(Token prev, Token cur)
    {
        TokenType allowedPredecessors;
        return s_rules.TryGetValue(cur.type, out allowedPredecessors)
            && (prev.type & allowedPredecessors) == prev.type;
    }

    // Parses sourceText and returns its first-level elements as strings.
    // Returns ["ERROR"] when the input is null, unbalanced, contains an empty
    // "()" group, or has text outside the outermost parentheses.
    public static ArrayList parse(string sourceText)
    {
        // A null string is malformed input like any other: report it instead
        // of letting the lexer fail with a NullReferenceException.
        if (sourceText == null)
        {
            return Parser.Error();
        }

        ArrayList result = new ArrayList();
        int openParenthesesCount = 0;
        Lexer lexer = new Lexer(sourceText);
        Token prevToken = lexer.getStartToken();
        Token currentToken = lexer.readNextToken();
        string tmpText = "";

        while (currentToken.type != TokenType.END)
        {
            if (currentToken.type == TokenType.OPEN_PARENTHESE)
            {
                openParenthesesCount++;
                // The outermost "(" is a delimiter only; deeper ones belong
                // to the element being collected.
                if (openParenthesesCount > 1)
                {
                    tmpText += currentToken.token;
                }
            }
            else if (currentToken.type == TokenType.CLOSE_PARENTHESE)
            {
                openParenthesesCount--;
                if (openParenthesesCount < 0)
                {
                    return Parser.Error();
                }
                // The ")" matching the outermost "(" is likewise not part of
                // any element.
                if (openParenthesesCount > 0)
                {
                    tmpText += currentToken.token;
                }
            }
            else if (currentToken.type == TokenType.SIMPLE_TEXT)
            {
                tmpText += currentToken.token;
            }

            if (!Parser.isValid(prevToken, currentToken))
            {
                return Parser.Error();
            }

            // Back at depth 1 with non-blank text: one element is complete.
            // Blank text is kept so it prefixes the next element, as before.
            if (openParenthesesCount == 1 && tmpText.Trim() != "")
            {
                result.Add(tmpText);
                tmpText = "";
            }

            prevToken = currentToken;
            currentToken = lexer.readNextToken();
        }

        if (openParenthesesCount != 0)
        {
            return Parser.Error();
        }
        // Validates the final transition into END (input must end with ")").
        if (!Parser.isValid(prevToken, currentToken))
        {
            return Parser.Error();
        }
        if (tmpText.Trim() != "")
        {
            result.Add(tmpText);
        }
        return result;
    }

    // Builds the error result: a list holding only the string "ERROR".
    static ArrayList Error()
    {
        var er = new ArrayList();
        er.Add("ERROR");
        return er;
    }
}
// Splits the input into three kinds of tokens: "(", ")" and runs of any
// other characters (SIMPLE_TEXT). Tokens are produced one at a time by
// readNextToken until END is returned.
class Lexer
{
    // Characters that terminate a SIMPLE_TEXT run.
    static readonly char[] s_parentheses = { '(', ')' };

    readonly string _txt;
    int _index;

    public Lexer(string text)
    {
        this._index = 0;
        // Treat null like empty input so the first token is END rather than
        // a NullReferenceException.
        this._txt = text ?? string.Empty;
    }

    // Synthetic token that precedes the first real token (position -1).
    public Token getStartToken()
    {
        return new Token(-1, TokenType.START, "");
    }

    // Returns the token starting at the current index and advances past it.
    // Once the input is exhausted, every call returns an END token.
    public Token readNextToken()
    {
        if (this._index >= this._txt.Length)
        {
            return new Token(-1, TokenType.END, "");
        }

        int start = this._index;
        char current = this._txt[start];
        Token t;
        if (current == '(')
        {
            t = new Token(start, TokenType.OPEN_PARENTHESE, "(");
        }
        else if (current == ')')
        {
            t = new Token(start, TokenType.CLOSE_PARENTHESE, ")");
        }
        else
        {
            t = new Token(start, TokenType.SIMPLE_TEXT, this._readText());
        }

        this._index += t.token.Length;
        return t;
    }

    // Returns the text from the current index up to (not including) the next
    // parenthesis or the end of the input. Taken as a single substring rather
    // than appended one character at a time.
    private string _readText()
    {
        int end = this._txt.IndexOfAny(s_parentheses, this._index);
        if (end < 0)
        {
            end = this._txt.Length;
        }
        return this._txt.Substring(this._index, end - this._index);
    }
}
// Immutable lexer token: where it starts, what kind it is, and its raw text.
class Token
{
    public Token(int position, TokenType type, string token)
    {
        this.token = token;
        this.type = type;
        this.position = position;
    }

    // Index of the token's first character in the source; the lexer passes
    // -1 for the synthetic START and END tokens.
    public int position { get; private set; }

    // Kind of token.
    public TokenType type { get; private set; }

    // Raw text of the token (empty for START and END).
    public string token { get; private set; }
}
// Token kinds. Declared as bit flags so that a set of kinds can be stored in
// a single value (the parser's rule table combines them with '|').
[Flags]
enum TokenType
{
    START = 1 << 0,
    OPEN_PARENTHESE = 1 << 1,
    SIMPLE_TEXT = 1 << 2,
    CLOSE_PARENTHESE = 1 << 3,
    END = 1 << 4
}
well, regex will do the job:
// Matches exactly one shape: "(word (word (word) word) (word))".
// Verbatim strings use the '@' prefix so the backslashes reach the regex engine.
var text = @"(example (to (parsing nested paren) but) (first lvl only))";
var pattern = @"\(([\w\s]+) (\([\w\s]+ \([\w\s]+\) [\w\s]+\)) (\([\w\s]+\))\)*";
try
{
    Regex r = new Regex(pattern, RegexOptions.IgnoreCase);
    Match m = r.Match(text);
    // Regex.Match does not throw when nothing matches; it returns a failed
    // Match whose groups are empty strings, so failure is checked explicitly.
    if (!m.Success)
    {
        return new string[]{"error"};
    }
    string group_1 = m.Groups[1].Value; //example
    string group_2 = m.Groups[2].Value; //(to (parsing nested paren) but)
    string group_3 = m.Groups[3].Value; //(first lvl only)
    return new string[]{group_1,group_2,group_3};
}
catch (ArgumentException)
{
    // Thrown by the Regex constructor when the pattern itself is invalid.
    return new string[]{"error"};
}
hopefully this helps, tested here in dotnetfiddle
Edit:
this might get you started into building the right expression according to whatever patterns you are falling into and maybe build a recursive function to parse the rest into the desired output :)
RegEx is not recursive. You either count bracket level, or recurse.
A non-recursive parser loop that I tested against the example you show is:
// Splits s at the first nesting level and returns the terms formatted as
// ["term1","term2",...]. A term ends at every '(' that opens level 2 and at
// every ')' that closes level 1; terms are not trimmed.
string SplitFirstLevel(string s)
{
    var terms = new List<string>();
    int termStart = 0;
    int depth = 0;

    for (int pos = 0; pos < s.Length; pos++)
    {
        char ch = s[pos];
        if (ch == '(')
        {
            depth++;
            if (depth == 1)
            {
                // Content of the outer group starts after its '('.
                termStart = pos + 1;
            }
            else if (depth == 2)
            {
                // A nested group begins: emit what came before it, and let the
                // next term start at this '(' so the group keeps its brackets.
                terms.Add("\"" + s.Substring(termStart, pos - termStart) + "\"");
                termStart = pos;
            }
        }
        else if (ch == ')')
        {
            depth--;
            if (depth == 0)
            {
                // Outer group closed: emit the pending term.
                terms.Add("\"" + s.Substring(termStart, pos - termStart) + "\"");
            }
        }
    }

    return "[" + String.Join(",", terms) + "]";
}
Note: after some more testing, I see your specification is unclear. How to delimit orphaned level 1 terms, that is terms without bracketing ?
For example, my parser translates
(example (to (parsing nested paren) but) (first lvl only))
to:
["example ","(to (parsing nested paren) but) ","(first lvl only)"]
and
(example (to (parsing nested paren)) but (first lvl only))
to:
["example ","(to (parsing nested paren)) but ","(first lvl only)"]
In either case, "example" gets a separate term, while "but" is grouped with the first term. In the first example this is logical, it is in the bracketing, but it may be unwanted behaviour in the second case, where "but" should be separated, like "example", which also has no bracketing (?)
I have a list of user names in a comma-delimited string. Given one user name, I want to find the name that follows it in the list.
For Example:
var s0 = "abc,deF,ghi,jkl";
var s1 = "abc";
var s2 = "def";
var s3 = "ghi";
var s4 = "jkl";
Result should be:
NextInString(s0,s1 ) == "def"
NextInString(s0,s2 ) == "ghi"
NextInString(s0,s3 ) == "jkl"
NextInString(s0,s4 ) == "jkl"
Here is what I have:
// Returns the name that follows userName in the comma-separated listOfNames.
// - If userName is the last entry, that last entry itself is returned.
// - If either argument is null/empty, or userName is not in the list,
//   string.Empty is returned (previously an unknown name threw
//   IndexOutOfRangeException).
// An exact match is preferred; if there is none, the name is matched ignoring
// case, so "def" finds "deF".
string NextInString(string listOfNames, string userName)
{
    if (string.IsNullOrEmpty(listOfNames) || string.IsNullOrEmpty(userName))
        return string.Empty;

    string[] names = listOfNames.Split(',');

    int index = Array.IndexOf(names, userName);
    if (index < 0)
    {
        // Fall back to a case-insensitive search only when no exact match
        // exists, so inputs that already worked keep returning the same entry.
        index = Array.FindIndex(
            names,
            n => string.Equals(n, userName, StringComparison.OrdinalIgnoreCase));
    }
    if (index < 0)
        return string.Empty;

    // The last name has no successor; it is returned unchanged.
    return index == names.Length - 1 ? names[index] : names[index + 1];
}
My question is, is there a better/easier way to approach this?
If you take the extra step to ensure your string list is trimmed, you can just use the IndexOf() method of List<T>:
// Sample input; the entries are separated by a comma followed by a space.
string csv = "test1, test2, test3, test4";
// Split on commas and trim every entry so that an exact lookup such as
// names.IndexOf("test2") succeeds.
List<string> names = new List<string>();
foreach (string entry in csv.Split(','))
{
    names.Add(entry.Trim());
}
Then your NextInString() method (I think this is a poorly named method) would look like this:
// Returns the entry that follows userName in names, or the literal
// "No result" when userName is missing or is the last entry.
private static string NextInString(List<string> names, string userName)
{
    int position = names.IndexOf(userName);
    bool found = position != -1;
    bool isLast = position == names.Count - 1;

    return found && !isLast ? names[position + 1] : "No result";
}
I made a fiddle here
You can use Linq like this:
string userName = "abc";
string listOfNames = "abc,xyz,123";

// Pair every name with its position so the successor can be looked up by index.
var names = listOfNames.Split(',').Select((n, i) => new {name = n, index = i}).ToArray();

// First entry equal to userName, or null when there is none.
var firstMatch = names.FirstOrDefault(n => n.name == userName);

// Empty when the name is unknown or has no successor; otherwise the next name.
string result;
if (firstMatch == null || firstMatch.index == names.Length - 1)
{
    result = string.Empty;
}
else
{
    result = names[firstMatch.index + 1].name;
}
Here is the LINQ approach:
// Returns the name that follows userName in the comma-separated listOfNames.
// When userName is the last entry, or does not occur at all, the last entry
// is returned. Empty arguments yield string.Empty.
string NextInString(string listOfNames, string userName)
{
    if (listOfNames == string.Empty || userName == string.Empty)
    {
        return string.Empty;
    }

    string[] names = listOfNames.Split(',');

    // Drop everything before the match, then the match itself; what remains
    // starts with the successor (if any).
    string successor = names
        .SkipWhile(candidate => candidate != userName)
        .Skip(1)
        .FirstOrDefault();

    if (successor != null)
    {
        return successor;
    }
    return names.Last();
}
You can make a nice little extension method to do this after the string is split, like so:
static class IListExtensions
{
    // Returns the element that immediately follows the first occurrence of
    // targetItem in list.
    // Throws ArgumentException when targetItem is not in the list; previously
    // IndexOf's -1 made this silently return the first element.
    // When targetItem is the last element there is no successor and the
    // list's own indexer throws its out-of-range exception, as before.
    public static T FindItemAfter<T>(this IList<T> list, T targetItem)
    {
        int index = list.IndexOf(targetItem);
        if (index < 0)
        {
            throw new ArgumentException("The target item was not found in the list.", "targetItem");
        }
        return list[index + 1];
    }
}
You can use it like this:
// Demo: prints the successors of "cat" and "dog", then waits for Enter.
static void Main(string[] args)
{
    string[] list = "cat,dog,rat".Split(',');

    string afterCat = list.FindItemAfter("cat");
    Console.WriteLine(afterCat);

    string afterDog = list.FindItemAfter("dog");
    Console.WriteLine(afterDog);

    // Keeps the console window open until the user presses Enter.
    Console.ReadLine();
}
It returns:
dog
rat
This overload will allow you to specify a default value that gets returned if the requested item isn't found, or the next item would be outside the list.
// Returns the element that follows targetItem in list, or defaultValue when
// targetItem is not found or is the last element.
public static T FindItemAfter<T>(this IList<T> list, T targetItem, T defaultValue)
{
    int position = list.IndexOf(targetItem);
    bool hasSuccessor = position != -1 && position < list.Count - 1;

    return hasSuccessor ? list[position + 1] : defaultValue;
}
How about something like this?
var s = listOfNames.Split(',');
// Arrays expose their size as Length (there is no 'count' member).
// Only entries that have a successor are examined; the last entry never does.
for (var i = 0; i < s.Length - 1; i++)
{
    if (s[i] == userName)
    {
        return s[i + 1];
    }
}
// Reached when userName is the last entry or does not occur at all. Returning
// here also gives the enclosing method a return on every code path.
return string.Format("No user after {0} was found", userName);
I am trying to validate the command line arguments and print an error message if there is some error.
My problem is that if the number of command line parameters is increased (currently, I only have 3), then my code would turn into spaghetti code. How can I reduce the cyclomatic complexity of the given code?
// Reads the values of the -r, -n and -p options. Each option must be present
// and be followed by a value that does not itself start with "-"; otherwise
// the command line is reported as inconsistent and the corresponding ArgN
// keeps the placeholder "Undefined".
var isCmdLineWrong = false;
var Arg1 = "Undefined";
var Arg2 = "Undefined";
var Arg3 = "Undefined";
var commandArguments = Environment.GetCommandLineArgs();

// Returns the value that follows 'option', or null when the option is
// missing, is the last argument, or is followed by another option.
Func<string, string> valueOf = option =>
{
    // Array.IndexOf is used because string[] has no instance IndexOf method.
    int at = Array.IndexOf(commandArguments, option);
    bool hasValue = at >= 0
        && at + 1 < commandArguments.Length // guards "app.exe -r" (no value)
        && !commandArguments[at + 1].StartsWith("-", StringComparison.Ordinal);
    return hasValue ? commandArguments[at + 1] : null;
};

var rValue = valueOf("-r");
if (rValue != null)
    Arg1 = rValue;
else
{
    isCmdLineWrong = true;
}
var nValue = valueOf("-n");
if (nValue != null)
    Arg2 = nValue;
else
{
    isCmdLineWrong = true;
}
var pValue = valueOf("-p");
if (pValue != null)
    Arg3 = pValue;
else
{
    isCmdLineWrong = true;
}
if (isCmdLineWrong) Console.WriteLine("Parameters structure is inconsistent");
I suggest extracting CommandLine class:
// Reads the values of the -r, -n and -p options from the process command
// line once, when the class is first used.
public static class CommandLine {
    // Returns the argument that follows 'value' on the command line, or null
    // when 'value' is absent or is the last argument.
    private static String FindValue(string value) {
        var commandArguments = Environment.GetCommandLineArgs();
        // Array.IndexOf is used because string[] has no instance IndexOf method.
        int index = Array.IndexOf(commandArguments, value);
        if (index < 0)
            return null;
        else if (index >= commandArguments.Length - 1)
            return null; // cmd like "myRoutine.exe -p"
        else
            return commandArguments[index + 1];
    }

    static CommandLine() {
        Arg1 = FindValue("-r");
        Arg2 = FindValue("-n");
        Arg3 = FindValue("-p");
    }

    // Value of -r, or null when it was not supplied.
    public static String Arg1 { get; private set; }
    // Value of -n, or null when it was not supplied.
    public static String Arg2 { get; private set; }
    // Value of -p, or null when it was not supplied.
    public static String Arg3 { get; private set; }

    // True only when all three options were supplied with a value.
    public static bool IsValid {
        get {
            return Arg1 != null && Arg2 != null && Arg3 != null;
        }
    }
}
Having this class written you can put
if (!CommandLine.IsValid) {
Console.WriteLine("Parameters structure is inconsistent");
return;
}
if (CommandLine.Arg1 == "quit") {
...
}
This question is a simple example of how to reuse code.
Look for code which appears to have been copied/pasted,
Put it in a function,
Any differences between copies, pass them in as parameters,
Replace the copies with function calls.
The result is
// Returns this option's value from args, or null on error.
// "Error" means: args or option is null, the option is not present, it is the
// last argument (no value follows), or the following argument starts with "-"
// and is therefore another option rather than a value.
public string OptionValue(string[] args, string option)
{
    if (args == null || option == null)
        return null;

    // Array.IndexOf is used because string[] has no instance IndexOf method.
    int index = Array.IndexOf(args, option);

    // Explicit bounds check replaces the former catch-all around the indexer.
    if (index < 0 || index + 1 >= args.Length)
        return null; // null meaning "undefined"

    string value = args[index + 1];
    if (value == null || value.StartsWith("-", StringComparison.Ordinal))
        return null;

    return value;
}
// And now your code
string[] args = Environment.GetCommandLineArgs();
string Arg1 = OptionValue(args, "-r");
string Arg2 = OptionValue(args, "-n");
string Arg3 = OptionValue(args, "-p");
bool isCmdLineWrong = (Arg1 == null ||
Arg2 == null ||
Arg3 == null);
Of course, all this rewriting could have been avoided if you didn't copy/paste code to start with.
Probably the most important thing to observe in your code is that you are doing the exact same thing several times, though with different inputs "-r" and Arg1, "-n" and Arg2, "-p" and Arg3. That is, you have the following code fragment appear three times (minus my reformatting):
if (commandArguments.Contains(…) &&
arguments[commandArguments.IndexOf(…) + 1].StartsWith("-") == false)
{
… = commandArguments[commandArguments.IndexOf(…) + 1];
}
else
{
isCmdLineWrong = true;
}
The Don't Repeat Yourself (DRY) principle tries to warn us from writing copy-and-paste-style repetitious code, and your original code is a pretty clear violation of it.
I suggest that you extract the common code and put it in a separate method. For example:
// Looks up the value of the option 'name' in commandArguments.
// Returns true and sets 'value' when the option is present and followed by an
// argument that does not start with "-"; otherwise returns false and sets
// 'value' to null.
// The first parameter is string[] (the type Environment.GetCommandLineArgs
// returns); the body indexes it as an array of arguments.
static bool TryGetArg(string[] commandArguments, string name, out string value)
{
    // Debug.Assert(name.StartsWith("-"));
    // Array.IndexOf is used because string[] has no instance IndexOf method.
    int index = Array.IndexOf(commandArguments, name);

    bool hasValue = index >= 0
        && index + 1 < commandArguments.Length // option is not the last argument
        && !commandArguments[index + 1].StartsWith("-", StringComparison.Ordinal);

    if (hasValue)
    {
        value = commandArguments[index + 1];
        return true;
    }
    else
    {
        value = null;
        return false;
    }
}
Now you replace your repeated if else with the following:
// GetCommandLineArgs returns string[], so the variable must be an array too.
string[] commandArguments = Environment.GetCommandLineArgs();
string arg1 = null;
string arg2 = null;
string arg3 = null;
// '&&' short-circuits: once one option is missing, the remaining lookups are
// skipped and their variables stay null.
bool isCmdLineOk = TryGetArg(commandArguments, "-r", out arg1) &&
                   TryGetArg(commandArguments, "-n", out arg2) &&
                   TryGetArg(commandArguments, "-p", out arg3);
if (isCmdLineOk)
{
    // do something with `arg1`, `arg2`, `arg3`.
}
else
{
    // not all of `arg1`, `arg2`, `arg3` could be set to a value.
    Console.WriteLine("Parameters structure is inconsistent");
}
This question already has answers here:
How to determine if a string is a valid variable name?
(5 answers)
Closed 7 years ago.
In Java, there are methods called isJavaIdentifierStart and isJavaIdentifierPart on the Character class that may be used to tell if a string is a valid Java identifier, like so:
// Returns true when s is non-empty, its first character may start a Java
// identifier and every following character may be part of one.
public boolean isJavaIdentifier(String s) {
    if (s.length() == 0) {
        return false;
    }
    if (!Character.isJavaIdentifierStart(s.charAt(0))) {
        return false;
    }
    int pos = 1;
    while (pos < s.length()) {
        char ch = s.charAt(pos);
        if (!Character.isJavaIdentifierPart(ch)) {
            return false;
        }
        pos++;
    }
    return true;
}
Is there something like this for C#?
Yes:
// using System.CodeDom.Compiler;
CodeDomProvider provider = CodeDomProvider.CreateProvider("C#");
if (provider.IsValidIdentifier (YOUR_VARIABLE_NAME)) {
// Valid
} else {
// Not valid
}
From here: How to determine if a string is a valid variable name?
I would be wary of the other solutions offered here. Calling CodeDomProvider.CreateProvider requires finding and parsing the Machine.Config file, as well as your app.config file. That's likely to be several times slower than the time required to just check the string yourself.
Instead I would advocate you make one of the following changes:
Cache the provider in a static variable.
This will cause you to take the hit of creating it only once, but it will slow down type loading.
Create the provider directly, by creating a Microsoft.CSharp.CSharpCodeProvider instance yourself.
This will skip the config file parsing all together.
Write the code to implement the check yourself.
If you do this, you get the greatest control over how it's implemented, which can help you optimize performance if you need to. See section 2.2.4 of the C# language spec for the complete lexical grammar for C# identifiers.
With Roslyn being open source, code analysis tools are right at your fingertips, and they're written for performance. (Right now they're in pre-release).
However, I can't speak to the performance cost of loading the assembly.
Install the tools using nuget:
Install-Package Microsoft.CodeAnalysis -Pre
Ask your question:
var isValid = Microsoft.CodeAnalysis.CSharp.SyntaxFacts.IsValidIdentifier("I'mNotValid");
Console.WriteLine(isValid); // False
Basically something like:
// Character classes follow the C# identifier grammar: an identifier starts
// with a letter-like character or '_' and continues with letter-like,
// combining, digit, connector or formatting characters.
// Verbatim strings use the '@' prefix so "\p" reaches the regex engine.
const string start = @"(\p{Lu}|\p{Ll}|\p{Lt}|\p{Lm}|\p{Lo}|\p{Nl}|_)";
const string extend = @"(\p{Mn}|\p{Mc}|\p{Nd}|\p{Pc}|\p{Cf})";
// Anchored with ^ and \z so the WHOLE string must be an identifier; without
// anchors IsMatch succeeds as soon as any substring matches (e.g. "1abc").
Regex ident = new Regex(string.Format(@"^{0}({0}|{1})*\z", start, extend));
s = s.Normalize();
return ident.IsMatch(s);
Necromancing here.
In .NET Core/DNX, you can do it with Roslyn-SyntaxFacts
Microsoft.CodeAnalysis.CSharp.SyntaxFacts.IsReservedKeyword(
Microsoft.CodeAnalysis.CSharp.SyntaxFacts.GetKeywordKind("protected")
);
// Emits one public field declaration per column, followed by the SQL type
// as a trailing comment.
foreach (ColumnDefinition cl in tableColumns)
{
    sb.Append(" public ");
    sb.Append(cl.DOTNET_TYPE);
    sb.Append(" ");
    // for keywords
    //if (!Microsoft.CodeAnalysis.CSharp.SyntaxFacts.IsValidIdentifier(cl.COLUMN_NAME))
    if (Microsoft.CodeAnalysis.CSharp.SyntaxFacts.IsReservedKeyword(
        Microsoft.CodeAnalysis.CSharp.SyntaxFacts.GetKeywordKind(cl.COLUMN_NAME)
    ))
        // '@' is the C# prefix that lets a reserved word be used as an identifier.
        sb.Append("@");
    sb.Append(cl.COLUMN_NAME);
    sb.Append("; // ");
    sb.AppendLine(cl.SQL_TYPE);
} // Next cl
Or in the old variant with Codedom - After a look in the mono sourcecode:
CodeDomProvider.cs
// Delegates the identifier check to the provider's code generator.
// Throws the exception produced by GetNotImplemented when the provider
// supplies no generator.
public virtual bool IsValidIdentifier (string value)
{
    ICodeGenerator cg = CreateGenerator ();
    if (cg == null)
        throw GetNotImplemented ();
    return cg.IsValidIdentifier (value);
}
Then CSharpCodeProvider.cs
// Creates the C# code generator. When built with NET_2_0 defined and provider
// options are present, they are passed on to the generator.
public override ICodeGenerator CreateGenerator()
{
#if NET_2_0
    if (providerOptions != null && providerOptions.Count > 0)
        return new Mono.CSharp.CSharpCodeGenerator (providerOptions);
#endif
    return new Mono.CSharp.CSharpCodeGenerator();
}
Then CSharpCodeGenerator.cs
// Returns true when identifier is non-empty, is not listed in the keyword
// table, starts with an identifier-start character and continues with
// identifier-part characters only.
protected override bool IsValidIdentifier (string identifier)
{
    if (identifier == null || identifier.Length == 0) {
        return false;
    }

    // The keyword table is created on first use.
    if (keywordsTable == null) {
        FillKeywordTable ();
    }

    bool isKeyword = keywordsTable.Contains (identifier);
    if (isKeyword) {
        return false;
    }

    if (!is_identifier_start_character (identifier [0])) {
        return false;
    }

    int pos = 1;
    while (pos < identifier.Length) {
        if (!is_identifier_part_character (identifier [pos])) {
            return false;
        }
        pos++;
    }
    return true;
}
// Lookup table of the words below; stays null until FillKeywordTable creates it.
private static System.Collections.Hashtable keywordsTable;
// Words that IsValidIdentifier rejects as identifiers. Also used as the lock
// object inside FillKeywordTable.
private static string[] keywords = new string[] {
"abstract","event","new","struct","as","explicit","null","switch","base","extern",
"this","false","operator","throw","break","finally","out","true",
"fixed","override","try","case","params","typeof","catch","for",
"private","foreach","protected","checked","goto","public",
"unchecked","class","if","readonly","unsafe","const","implicit","ref",
"continue","in","return","using","virtual","default",
"interface","sealed","volatile","delegate","internal","do","is",
"sizeof","while","lock","stackalloc","else","static","enum",
"namespace",
"object","bool","byte","float","uint","char","ulong","ushort",
"decimal","int","sbyte","short","double","long","string","void",
"partial", "yield", "where"
};
// Creates the keyword lookup table if it does not exist yet.
// The table is filled through a local variable and assigned to the shared
// field only when complete: IsValidIdentifier tests the field without taking
// the lock, so assigning the empty table first would let another thread
// query it while it is still being filled.
static void FillKeywordTable ()
{
    lock (keywords) {
        if (keywordsTable == null) {
            Hashtable table = new Hashtable ();
            foreach (string keyword in keywords) {
                table.Add (keyword, keyword);
            }
            // Publish last, once every keyword has been added.
            keywordsTable = table;
        }
    }
}
// Returns true when c may be the first character of an identifier:
// an ASCII letter, '_', '@', or any character Char.IsLetter accepts.
static bool is_identifier_start_character (char c)
{
    // '@' (not '#') is the verbatim-identifier prefix of C#, e.g. @class.
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c == '@' || Char.IsLetter (c);
}
// Returns true when c may appear after the first character of an identifier:
// an ASCII letter, an ASCII digit, '_', or any character Char.IsLetter accepts.
static bool is_identifier_part_character (char c)
{
    if (c == '_')
        return true;
    if (c >= '0' && c <= '9')
        return true;
    if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
        return true;
    return Char.IsLetter (c);
}
You get this code:
// Returns true when identifier is non-empty, is not listed in the keyword
// table, starts with an identifier-start character and continues with
// identifier-part characters only.
public static bool IsValidIdentifier (string identifier)
{
    if (identifier == null || identifier.Length == 0)
    {
        return false;
    }

    // The keyword table is created on first use.
    if (keywordsTable == null)
    {
        FillKeywordTable();
    }

    bool isKeyword = keywordsTable.Contains(identifier);
    if (isKeyword)
    {
        return false;
    }

    if (!is_identifier_start_character(identifier[0]))
    {
        return false;
    }

    int pos = 1;
    while (pos < identifier.Length)
    {
        if (!is_identifier_part_character(identifier[pos]))
        {
            return false;
        }
        pos++;
    }
    return true;
}
// Returns true when c may be the first character of an identifier:
// an ASCII letter, '_', '@', or any character char.IsLetter accepts.
internal static bool is_identifier_start_character(char c)
{
    // '@' (not '#') is the verbatim-identifier prefix of C#, e.g. @class.
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c == '@' || char.IsLetter(c);
}
// Returns true when c may appear after the first character of an identifier:
// an ASCII letter, an ASCII digit, '_', or any character char.IsLetter accepts.
internal static bool is_identifier_part_character(char c)
{
    if (c == '_')
    {
        return true;
    }
    if (c >= '0' && c <= '9')
    {
        return true;
    }
    if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
    {
        return true;
    }
    return char.IsLetter(c);
}
// Lookup table of the words below; stays null until FillKeywordTable creates it.
private static System.Collections.Hashtable keywordsTable;
// Words that IsValidIdentifier rejects as identifiers. Also used as the lock
// object inside FillKeywordTable.
private static string[] keywords = new string[] {
"abstract","event","new","struct","as","explicit","null","switch","base","extern",
"this","false","operator","throw","break","finally","out","true",
"fixed","override","try","case","params","typeof","catch","for",
"private","foreach","protected","checked","goto","public",
"unchecked","class","if","readonly","unsafe","const","implicit","ref",
"continue","in","return","using","virtual","default",
"interface","sealed","volatile","delegate","internal","do","is",
"sizeof","while","lock","stackalloc","else","static","enum",
"namespace",
"object","bool","byte","float","uint","char","ulong","ushort",
"decimal","int","sbyte","short","double","long","string","void",
"partial", "yield", "where"
};
// Creates the keyword lookup table if it does not exist yet.
// The table is filled through a local variable and assigned to the shared
// field only when complete: IsValidIdentifier tests the field without taking
// the lock, so assigning the empty table first would let another thread
// query it while it is still being filled.
internal static void FillKeywordTable()
{
    lock (keywords)
    {
        if (keywordsTable == null)
        {
            var table = new System.Collections.Hashtable();
            foreach (string keyword in keywords)
            {
                table.Add(keyword, keyword);
            }
            // Publish last, once every keyword has been added.
            keywordsTable = table;
        }
    }
}
Recently, I wrote an extension method that validates a string as a valid C# identifier.
You can find a gist with the implementation here: https://gist.github.com/FabienDehopre/5245476
It's based on the MSDN documentation of Identifier (http://msdn.microsoft.com/en-us/library/aa664670(v=vs.71).aspx)
// Returns true when identifier is a valid C# identifier:
//  - it matches the identifier grammar and is not a reserved keyword, or
//  - it is '@' followed by text matching the grammar (the '@' prefix allows
//    keywords to be used as identifiers, e.g. "@class").
// Null and empty strings are never valid. The input is Unicode-normalized
// before it is checked.
public static bool IsValidIdentifier(this string identifier)
{
    if (String.IsNullOrEmpty(identifier)) return false;
    // C# keywords: http://msdn.microsoft.com/en-us/library/x53a06bb(v=vs.71).aspx
    var keywords = new[]
    {
        "abstract", "event", "new", "struct",
        "as", "explicit", "null", "switch",
        "base", "extern", "object", "this",
        "bool", "false", "operator", "throw",
        "break", "finally", "out", "true",
        "byte", "fixed", "override", "try",
        "case", "float", "params", "typeof",
        "catch", "for", "private", "uint",
        "char", "foreach", "protected", "ulong",
        "checked", "goto", "public", "unchecked",
        "class", "if", "readonly", "unsafe",
        "const", "implicit", "ref", "ushort",
        "continue", "in", "return", "using",
        "decimal", "int", "sbyte", "virtual",
        "default", "interface", "sealed", "volatile",
        "delegate", "internal", "short", "void",
        "do", "is", "sizeof", "while",
        "double", "lock", "stackalloc",
        "else", "long", "static",
        "enum", "namespace", "string"
    };
    // definition of a valid C# identifier: http://msdn.microsoft.com/en-us/library/aa664670(v=vs.71).aspx
    // Verbatim strings use the '@' prefix so "\p" reaches the regex engine.
    const string formattingCharacter = @"\p{Cf}";
    const string connectingCharacter = @"\p{Pc}";
    const string decimalDigitCharacter = @"\p{Nd}";
    const string combiningCharacter = @"\p{Mn}|\p{Mc}";
    const string letterCharacter = @"\p{Lu}|\p{Ll}|\p{Lt}|\p{Lm}|\p{Lo}|\p{Nl}";
    const string identifierPartCharacter = letterCharacter + "|" +
                                           decimalDigitCharacter + "|" +
                                           connectingCharacter + "|" +
                                           combiningCharacter + "|" +
                                           formattingCharacter;
    const string identifierStartCharacter = "(" + letterCharacter + "|_)";
    // A single '*' over the part characters. The earlier "((...)+)*" form
    // nested two quantifiers, which can backtrack exponentially on long
    // inputs that fail to match.
    const string identifierOrKeyword = identifierStartCharacter + "(" +
                                       identifierPartCharacter + ")*";
    // \z anchors at the very end of the input; '$' would also accept a
    // trailing newline.
    const string validIdentifierPattern = "^" + identifierOrKeyword + @"\z";

    var normalizedIdentifier = identifier.Normalize();

    // The static Regex.IsMatch overload is used so the parsed pattern comes
    // from the Regex cache instead of being rebuilt on every call.
    // 1. check that the identifier matches the grammar and is not a C# keyword
    if (Regex.IsMatch(normalizedIdentifier, validIdentifierPattern) &&
        Array.IndexOf(keywords, normalizedIdentifier) < 0)
    {
        return true;
    }
    // 2. check if the identifier starts with @
    if (normalizedIdentifier.StartsWith("@", StringComparison.Ordinal) &&
        Regex.IsMatch(normalizedIdentifier.Substring(1), validIdentifierPattern))
    {
        return true;
    }
    // 3. it's not a valid identifier
    return false;
}
The now-released Roslyn project provides Microsoft.CodeAnalysis.CSharp.SyntaxFacts, with SyntaxFacts.IsIdentifierStartCharacter(char) and SyntaxFacts.IsIdentifierPartCharacter(char) methods just like Java.
Here it is in use, in a simple function I use to turn noun phrases (eg "Start Date") into C# identifiers (eg "StartDate"). N.B I'm using Humanizer to do the camel-case conversion, and Roslyn to check whether a character is valid.
// Turns a noun phrase such as "Start Date" into a C# identifier ("StartDate").
// Characters that cannot be part of an identifier are dropped, a leading '_'
// is added when the first character cannot start an identifier, and a result
// that is a C# keyword is prefixed with '@'.
public static string Identifier(string name)
{
    Check.IsNotNullOrWhitespace(name, nameof(name));
    // trim off leading and trailing whitespace
    name = name.Trim();
    // should deal with spaces => camel casing;
    name = name.Dehumanize();
    var sb = new StringBuilder();
    // The length check keeps name[0] from throwing if nothing is left after
    // the conversion above; the result is then just "_".
    if (name.Length == 0 || !SyntaxFacts.IsIdentifierStartCharacter(name[0]))
    {
        // the first character cannot start an identifier: lead with '_'
        sb.Append("_");
    }
    foreach(var ch in name)
    {
        if (SyntaxFacts.IsIdentifierPartCharacter(ch))
        {
            sb.Append(ch);
        }
    }
    var result = sb.ToString();
    if (SyntaxFacts.GetKeywordKind(result) != SyntaxKind.None)
    {
        // '@' is the C# prefix that lets a keyword be used as an identifier.
        result = "@" + result;
    }
    return result;
}
Tests;
// Parameterized NUnit test: every TestCase supplies a phrase and the
// identifier that CSharp.Identifier is expected to produce for it.
[Test]
[TestCase("Start Date", "StartDate")]
[TestCase("Bad*chars", "BadChars")]
[TestCase(" leading ws", "LeadingWs")]
[TestCase("trailing ws ", "TrailingWs")]
[TestCase("class", "Class")]
[TestCase("int", "Int")]
public void CSharp_GeneratesDecentIdentifiers(string input, string expected)
{
    string actual = CSharp.Identifier(input);

    Assert.AreEqual(expected, actual);
}
This can be done using reflection - see How to determine if a string is a valid variable name?