How to choose an XML Node? (Using LINQ, XPath, anything is fine) - c#

I have an XML like below :
<Decide Name="MemoryCheck" CommonUnit="MB">
<Decision CellColor="Red" Status="Critical" Exp="&lt;=100" />
<Decision CellColor="Yellow" Status="Warning" Exp="&lt;=200 &amp; &gt;100"/>
<Decision CellColor="Green" Status="OK" Exp="&gt;200" />
</Decide>
For Input 50 MB, Output returned should be "Critical-Red"
For Input 142 MB, Output returned should be "Warning-Yellow"
For Input 212 MB, Output returned should be "OK-Green"
How do I go about this using C#?
Xml Name is "Decide.xml" and Code I have now :
// Load the decision table and take the first <Decide> element.
XmlDocument xmldecide = new XmlDocument();
xmldecide.Load("C:\\Decide.xml");
// The method is GetElementsByTagName (capital "B"); C# member names are case-sensitive.
XmlNodeList decidelist = xmldecide.GetElementsByTagName("Decide");
XmlNode xdecide = decidelist[0];
// The three sample inputs below reuse one variable; a local can only be declared once.
string input = "50"; // Unit in MB
// Now I have to display the desired O/P "Critical-Red"
input = "142"; // Unit in MB
// Now I have to display the desired O/P "Warning-Yellow"
input = "212"; // Unit in MB
// Now I have to display the desired O/P "OK-Green"

Just a suggestion - If you have control of that xml you should consider creating a min and max attribute. Having to parse out conditional and integer information from a single attribute is ugly. That said, assuming you can't change the xml, here's a solution. It assumes the conditionals in the attribute are always in a similar format.
/// <summary>
/// Returns the result string of the first Decision element whose "Exp" attribute
/// is satisfied by <paramref name="size"/>.
/// Three expression shapes are recognised: "&lt;=max", "&gt;min" and "&lt;=max &amp; &gt;min".
/// </summary>
/// <param name="decisionDocument">Document containing the Decision elements.</param>
/// <param name="size">Value to test against each expression.</param>
/// <returns>
/// The string built by CreateResultString for the matching element, or
/// "No condition was met!" when no element matches.
/// </returns>
public static string AlertLevel(this XDocument decisionDocument, int size)
{
    foreach (var item in decisionDocument.Descendants("Decision"))
    {
        var expAttribute = item.Attribute("Exp");
        if (expAttribute == null) continue;

        string expression = expAttribute.Value;
        bool hasLowerBound = expression.Contains(">");
        bool hasUpperBound = expression.Contains("<=");
        int minValue;
        int maxValue;
        bool matched = false;

        if (hasLowerBound && hasUpperBound)
        {
            // Range form "<=max & >min": after stripping the operators the upper bound
            // comes first and the lower bound second. int.TryParse tolerates the
            // surrounding white space left by the split.
            string[] bounds = expression
                .Replace("<=", string.Empty)
                .Replace(">", string.Empty)
                .Split('&');
            // The upper bound is inclusive ("<="), the lower bound exclusive (">").
            matched = bounds.Length == 2
                && int.TryParse(bounds[0], out maxValue)
                && int.TryParse(bounds[1], out minValue)
                && minValue < size
                && size <= maxValue;
        }
        else if (hasLowerBound)
        {
            // ">min": strictly greater than the bound.
            matched = int.TryParse(expression.Replace(">", string.Empty), out minValue)
                && size > minValue;
        }
        else if (hasUpperBound)
        {
            // "<=max": the bound itself is included.
            matched = int.TryParse(expression.Replace("<=", string.Empty), out maxValue)
                && size <= maxValue;
        }

        if (matched)
        {
            return CreateResultString(item);
        }
    }
    return "No condition was met!";
}
/// <summary>
/// Builds "Status-CellColor" from the element's attributes. A missing "Status"
/// attribute yields the placeholder "Status"; a missing "CellColor" yields "-Color".
/// </summary>
private static string CreateResultString(XElement item)
{
    XAttribute status = item.Attribute("Status");
    XAttribute color = item.Attribute("CellColor");

    string statusPart = "Status";
    if (status != null)
    {
        statusPart = status.Value;
    }

    string colorPart = "-Color";
    if (color != null)
    {
        colorPart = "-" + color.Value;
    }

    return statusPart + colorPart;
}
usage
// Load the decision table once, then print the alert level for each sample size.
var xmlDecide = XDocument.Load("Decide.xml");
foreach (var sizeInMb in new[] { 50, 142, 212 })
{
    Console.WriteLine(sizeInMb + "MB: " + xmlDecide.AlertLevel(sizeInMb));
}
EDIT: You can use the same code for use with XmlDocument instead of XDocument. Just change "Attribute" to "Attributes.GetNamedItem" and "Descendants" to "GetElementsByTagName"

This is complicated.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using System.Xml;
namespace ConsoleApplication1
{
class Program
{
    /// <summary>
    /// Parses the sample decision table into Decision objects (stored in the static
    /// Decision.decisions list) and looks up the decision for a memory size of 212.
    /// </summary>
    static void Main(string[] args)
    {
        // '<' and '&' are not allowed literally inside XML attribute values; they must be
        // written as entities, otherwise LoadXml throws an XmlException. The parsed
        // attribute values are still "<=100", "<=200 & >100" and ">200".
        string XML =
            "<Decide Name=\"MemoryCheck\" CommonUnit=\"MB\">" +
            "<Decision CellColor=\"Red\" Status=\"Critical\" Exp=\"&lt;=100\" />" +
            "<Decision CellColor=\"Yellow\" Status=\"Warning\" Exp=\"&lt;=200 &amp; &gt;100\"/>" +
            "<Decision CellColor=\"Green\" Status=\"OK\" Exp=\"&gt;200\" />" +
            "</Decide>";
        XmlDocument doc = new XmlDocument();
        doc.LoadXml(XML);
        XmlNodeList memoryCheck = doc.GetElementsByTagName("Decision");
        foreach (XmlNode decision in memoryCheck)
        {
            Decision newDecision = new Decision();
            Decision.decisions.Add(newDecision);
            newDecision.Cellcolor = decision.Attributes.GetNamedItem("CellColor").Value;
            newDecision.status = decision.Attributes.GetNamedItem("Status").Value;
            // Defaults when the expression names no bound: lower bound 0, no upper bound.
            newDecision.low = 0;
            newDecision.high = null;
            string exps = decision.Attributes.GetNamedItem("Exp").Value;
            // Space-separated terms; the "&" joiner matches neither prefix and is ignored.
            string[] expsArray = exps.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
            foreach (string exp in expsArray)
            {
                if (exp.StartsWith("<="))
                {
                    newDecision.high = int.Parse(exp.Substring(exp.IndexOf("=") + 1));
                }
                if (exp.StartsWith(">"))
                {
                    newDecision.low = int.Parse(exp.Substring(exp.IndexOf(">") + 1));
                }
            }
        }
        Decision result = Decision.GetBySize(212);
    }
}
/// <summary>
/// One row of the decision table: a colour and status with an optional numeric range.
/// </summary>
public class Decision
{
    // Table shared by all callers; GetBySize scans it in insertion order.
    public static List<Decision> decisions = new List<Decision>();

    public string Cellcolor { get; set; }
    public string status { get; set; }
    public int? low { get; set; }
    public int? high { get; set; }

    /// <summary>
    /// Returns the first decision whose range contains <paramref name="memory"/>:
    /// memory must be at least <c>low</c> and, when <c>high</c> is set, at most <c>high</c>.
    /// Returns null when no decision matches.
    /// </summary>
    public static Decision GetBySize(int memory)
    {
        foreach (Decision candidate in decisions)
        {
            // A null low makes the lifted comparison false, so such entries never match.
            bool aboveLow = memory >= candidate.low;
            if (!aboveLow)
            {
                continue;
            }

            if (candidate.high == null || memory <= candidate.high)
            {
                return candidate;
            }
        }
        return null;
    }
}
}

Related

How to parse nested parenthesis only in first level in C#

I would like to write C# code that parses nested parenthesis to array elements, but only on first level. An example is needed for sure:
I want this string:
"(example (to (parsing nested paren) but) (first lvl only))"
to be parsed into:
["example", "(to (parsing nested paren) but)", "(first lvl only)"]
I was thinking about using regex but can't figure out how to properly use them without implementing this behaviour from scratch.
In the case of malformed inputs I would like to return an empty array, or an array ["error"]
I developed a parser for your example. I also checked some other examples which you can see in the code.
using System;
using System.Collections;
using System.Collections.Generic;
public class Program
{
    /// <summary>
    /// Demo entry point: parses one sample string and prints the resulting terms.
    /// The commented-out lines are alternative inputs with their expected output.
    /// </summary>
    public static void Main()
    {
        string str = "(example (to (parsing nested paren) but) (first lvl only))"; // => [example , (to (parsing nested paren) but) , (first lvl only)]
        //string str = "(first)(second)(third)"; // => [first , second , third]
        //string str = "(first(second)third)"; // => [first , (second) , third]
        //string str = "(first(second)(third)fourth)"; // => [first , (second) , (third) , fourth]
        //string str = "(first((second)(third))fourth)"; // => [first , ((second)(third)) , fourth]
        //string str = "just Text"; // => [ERROR]
        //string str = "start with Text (first , second)"; // => [ERROR]
        //string str = "(first , second) end with text"; // => [ERROR]
        //string str = ""; // => [ERROR]
        //string str = "("; // => [ERROR]
        //string str = "(first()(second)(third))fourth)"; // => [ERROR]
        //string str = "(((extra close pareanthese))))"; // => [ERROR]
        showRes(Parser.parse(str));
    }

    /// <summary>Prints the items as "[a , b , c]".</summary>
    static void showRes(ArrayList res)
    {
        object[] items = res.ToArray();
        Console.WriteLine("[" + string.Join(" , ", items) + "]");
    }
}
/// <summary>
/// Splits a parenthesised string into its first-level terms. Nested groups are kept
/// verbatim (including their parentheses) as a single term.
/// </summary>
public class Parser
{
    // Maps a token type to the set of token types allowed immediately before it.
    // Built once; it is only read afterwards.
    static readonly Dictionary<TokenType, TokenType> rules = getRules();

    static Dictionary<TokenType, TokenType> getRules()
    {
        var rules = new Dictionary<TokenType, TokenType>();
        rules.Add(TokenType.OPEN_PARENTHESE, TokenType.START | TokenType.OPEN_PARENTHESE | TokenType.CLOSE_PARENTHESE | TokenType.SIMPLE_TEXT);
        rules.Add(TokenType.CLOSE_PARENTHESE, TokenType.SIMPLE_TEXT | TokenType.CLOSE_PARENTHESE);
        rules.Add(TokenType.SIMPLE_TEXT, TokenType.SIMPLE_TEXT | TokenType.CLOSE_PARENTHESE | TokenType.OPEN_PARENTHESE);
        rules.Add(TokenType.END, TokenType.CLOSE_PARENTHESE);
        return rules;
    }

    /// <summary>True when <paramref name="cur"/> is allowed to follow <paramref name="prev"/>.</summary>
    static bool isValid(Token prev, Token cur)
    {
        TokenType allowedPredecessors;
        return rules.TryGetValue(cur.type, out allowedPredecessors)
            && (prev.type & allowedPredecessors) == prev.type;
    }

    /// <summary>
    /// Parses <paramref name="sourceText"/> into its first-level terms.
    /// </summary>
    /// <returns>
    /// The list of terms, or a single-element list containing "ERROR" when the input is
    /// null, has unbalanced parentheses, or contains a token sequence the rules reject.
    /// </returns>
    public static ArrayList parse(string sourceText)
    {
        // Treat a missing input like any other malformed input.
        if (sourceText == null)
        {
            return Parser.Error();
        }

        ArrayList result = new ArrayList();
        int openParenthesesCount = 0;
        Lexer lexer = new Lexer(sourceText);
        Token prevToken = lexer.getStartToken();
        Token currentToken = lexer.readNextToken();
        string tmpText = "";
        while (currentToken.type != TokenType.END)
        {
            if (currentToken.type == TokenType.OPEN_PARENTHESE)
            {
                openParenthesesCount++;
                // Only parentheses below the first level belong to the term's text.
                if (openParenthesesCount > 1)
                {
                    tmpText += currentToken.token;
                }
            }
            else if (currentToken.type == TokenType.CLOSE_PARENTHESE)
            {
                openParenthesesCount--;
                if (openParenthesesCount < 0)
                {
                    return Parser.Error();
                }
                if (openParenthesesCount > 0)
                {
                    tmpText += currentToken.token;
                }
            }
            else if (currentToken.type == TokenType.SIMPLE_TEXT)
            {
                tmpText += currentToken.token;
            }
            if (!Parser.isValid(prevToken, currentToken))
            {
                return Parser.Error();
            }
            // Back at the first level with pending text: that text is a complete term.
            if (openParenthesesCount == 1 && tmpText.Trim() != "")
            {
                result.Add(tmpText);
                tmpText = "";
            }
            prevToken = currentToken;
            currentToken = lexer.readNextToken();
        }
        if (openParenthesesCount != 0)
        {
            return Parser.Error();
        }
        // The END token must also be allowed after the last real token.
        if (!Parser.isValid(prevToken, currentToken))
        {
            return Parser.Error();
        }
        if (tmpText.Trim() != "")
        {
            result.Add(tmpText);
        }
        return result;
    }

    /// <summary>Builds the error result: a list containing only "ERROR".</summary>
    static ArrayList Error()
    {
        var er = new ArrayList();
        er.Add("ERROR");
        return er;
    }
}
/// <summary>
/// Splits the input into three kinds of tokens: "(", ")" and runs of any other characters.
/// </summary>
class Lexer
{
    string _txt;
    int _index;

    public Lexer(string text)
    {
        this._index = 0;
        // A null input is tokenised like an empty one instead of failing on the first read.
        this._txt = text ?? string.Empty;
    }

    /// <summary>Returns the synthetic START token that precedes the first real token.</summary>
    public Token getStartToken()
    {
        return new Token(-1, TokenType.START, "");
    }

    /// <summary>
    /// Reads the token at the current position and advances past it.
    /// Returns an END token once the input is exhausted.
    /// </summary>
    public Token readNextToken()
    {
        if (this._index >= this._txt.Length)
        {
            return new Token(-1, TokenType.END, "");
        }

        Token t;
        string txt;
        switch (this._txt[this._index])
        {
            case '(':
                txt = "(";
                t = new Token(this._index, TokenType.OPEN_PARENTHESE, txt);
                break;
            case ')':
                txt = ")";
                t = new Token(this._index, TokenType.CLOSE_PARENTHESE, txt);
                break;
            default:
                txt = this._readText();
                t = new Token(this._index, TokenType.SIMPLE_TEXT, txt);
                break;
        }
        this._index += txt.Length;
        return t;
    }

    /// <summary>
    /// Returns the text from the current position up to the next parenthesis or the end
    /// of the input, without advancing the position.
    /// </summary>
    private string _readText()
    {
        int end = this._index;
        while (end < this._txt.Length && this._txt[end] != '(' && this._txt[end] != ')')
        {
            end++;
        }
        // One Substring call instead of appending character by character.
        return this._txt.Substring(this._index, end - this._index);
    }
}
/// <summary>
/// Immutable lexer token: its position in the input, its type and its text.
/// </summary>
class Token
{
    public int position { get; private set; }
    public TokenType type { get; private set; }
    public string token { get; private set; }

    public Token(int position, TokenType type, string token)
    {
        this.token = token;
        this.type = type;
        this.position = position;
    }
}
// Token kinds. Each value is a distinct bit so that sets of kinds can be combined with '|'.
[Flags]
enum TokenType
{
    START = 1 << 0,
    OPEN_PARENTHESE = 1 << 1,
    SIMPLE_TEXT = 1 << 2,
    CLOSE_PARENTHESE = 1 << 3,
    END = 1 << 4
}
well, regex will do the job:
// Verbatim string literals start with @" so that backslashes need no escaping.
var text = @"(example (to (parsing nested paren) but) (first lvl only))";
// Matches exactly this shape: a word group, a group nested two levels deep, a flat group.
var pattern = @"\(([\w\s]+) (\([\w\s]+ \([\w\s]+\) [\w\s]+\)) (\([\w\s]+\))\)*";
Regex r = new Regex(pattern, RegexOptions.IgnoreCase);
Match m = r.Match(text);
// A failed match does not throw; it returns an unsuccessful Match whose groups are
// empty, so the outcome has to be checked explicitly.
if (!m.Success)
{
    return new string[] { "error" };
}
string group_1 = m.Groups[1].Value; //example
string group_2 = m.Groups[2].Value; //(to (parsing nested paren) but)
string group_3 = m.Groups[3].Value; //(first lvl only)
return new string[] { group_1, group_2, group_3 };
hopefully this helps, tested here in dotnetfiddle
Edit:
this might get you started into building the right expression according to whatever patterns you are falling into and maybe build a recursive function to parse the rest into the desired output :)
RegEx is not recursive. You either count bracket level, or recurse.
A non-recursive parser loop that I tested against the example you showed is:
// Splits the contents of the outermost parentheses into first-level terms and returns
// them as a bracketed, comma-separated list of quoted strings.
// A term ends where a second-level "(" opens or where the outermost ")" closes.
string SplitFirstLevel(string s)
{
    var terms = new List<string>();
    int termStart = 0;
    int depth = 0;
    int pos = 0;
    foreach (char c in s)
    {
        switch (c)
        {
            case '(':
                depth++;
                if (depth == 1)
                {
                    // Skip the outermost opening parenthesis itself.
                    termStart = pos + 1;
                }
                else if (depth == 2)
                {
                    // A nested group starts: emit everything collected so far.
                    terms.Add("\"" + s.Substring(termStart, pos - termStart) + "\"");
                    termStart = pos;
                }
                break;
            case ')':
                depth--;
                if (depth == 0)
                {
                    terms.Add("\"" + s.Substring(termStart, pos - termStart) + "\"");
                }
                break;
        }
        pos++;
    }
    return "[" + String.Join(",", terms) + "]";
}
Note: after some more testing, I see your specification is unclear. How to delimit orphaned level 1 terms, that is terms without bracketing ?
For example, my parser translates
(example (to (parsing nested paren) but) (first lvl only))
to:
["example ","(to (parsing nested paren) but) ","(first lvl only)"]
and
(example (to (parsing nested paren)) but (first lvl only))
to:
["example ","(to (parsing nested paren)) but ","(first lvl only)"]
In either case, "example" gets a separate term, while "but" is grouped with the first term. In the first example this is logical, it is in the bracketing, but it may be unwanted behaviour in the second case, where "but" should be separated, like "example", which also has no bracketing (?)

Swiftly search for multiple partial strings in a huge string

I need to check whether all parts of a string like
A=1&AW=43&KO=96&R=7&WW=15&ZJ=80
are in a big string like:
A=1&AG=77&AW=43&.....&KF=11&KO=96&.....&QW=55&R=7&....&WV=1&WW=15&....ZJ=80&
My code splits the first string on & and uses Contains. But the duration is too long, as the big string is up to 800000 characters.
Is there a better/faster method for this?
public partial class UserDefinedFunctions
{
    /// <summary>
    /// Checks an equipment string against a comparison expression.
    /// <paramref name="comp"/> is a '!'-separated list of alternatives; each alternative
    /// is a '&amp;'-separated list of terms. A plain term must occur in
    /// <paramref name="equip"/>; a term prefixed with '~' must not occur. A term counts
    /// as occurring only when it is delimited by '&amp;' on both sides in "&amp;" + equip,
    /// so the equipment string is expected to end with '&amp;'.
    /// </summary>
    /// <returns>1 when <paramref name="comp"/> is NULL or any alternative is fully satisfied; otherwise 0.</returns>
    [Microsoft.SqlServer.Server.SqlFunction]
    public static SqlInt32 EquipmentCompare(SqlString equip, SqlString comp)
    {
        if (comp.IsNull)
        {
            return 1;
        }

        // Index the (potentially very large) equipment string once instead of scanning
        // it for every term. Only segments enclosed by '&' on both sides count, so the
        // first and last pieces of the split are skipped.
        string[] pieces = ("&" + equip.ToString()).Split('&');
        var present = new System.Collections.Generic.HashSet<string>(System.StringComparer.Ordinal);
        for (int k = 1; k < pieces.Length - 1; k++)
        {
            present.Add(pieces[k]);
        }

        foreach (string alternative in comp.ToString().Split('!'))
        {
            // Every alternative is judged on its own; the outcome of an earlier,
            // failed alternative must not leak into this one.
            bool satisfied = true;
            foreach (string term in alternative.Split('&'))
            {
                bool negated = term.Length > 0 && term[0] == '~';
                string key = negated ? term.Substring(1) : term;
                // Fails when a required term is missing or a forbidden term is present.
                if (present.Contains(key) == negated)
                {
                    satisfied = false;
                    break;
                }
            }
            if (satisfied)
            {
                return 1;
            }
        }
        return 0;
    }
}
I think you may speed up your code by using HashSet. Try this:
var str1 = "A=1&AW=43&KO=96&R=7&WW=15&ZJ=80";
var str2 = "A=1&AG=77&AW=43&.....&KF=11&KO=96&.....&QW=55&R=7&....&WV=1&WW=15&....ZJ=80&";
// Split the big string once into a set so that each membership test is a hash lookup.
var largeStringSet = new HashSet<string>(str2.Split('&'));
// True only when every '&'-separated part of str1 is an exact entry of the set.
var allPartsIncluded = true;
foreach (var part in str1.Split('&'))
{
    if (!largeStringSet.Contains(part))
    {
        allPartsIncluded = false;
        break;
    }
}

Compare text files in C# and remove duplicate lines

1.txt:
Origination,destination,datetime,price
YYZ,YTC,2016-04-01 12:30,$550
YYZ,YTC,2016-04-01 12:30,$550
LKC,LKP,2016-04-01 12:30,$550
2.txt:
Origination|destination|datetime|price
YYZ|YTC|2016-04-01 12:30|$550
AMV|YRk|2016-06-01 12:30|$630
LKC|LKP|2016-12-01 12:30|$990
I have two text files with ',' and '|' as separators, and I want to create a console app in C# which reads these two files when I pass an origination and destination location from command prompt.
While searching, I want to ignore duplicate lines, and I want to display the results in order by price.
The output should be { origination } -> { destination } -> datetime -> price
I need help with how to do this.
Here's a simple solution that works for your example files. It doesn't have any error checking for if the file is in a bad format.
using System;
using System.Collections.Generic;
class Program
{
    // One data row of an input file.
    class entry
    {
        public string origin;
        public string destination;
        public DateTime time;
        public double price;
    }

    /// <summary>
    /// Reads the comma-separated file args[0] and the pipe-separated file args[1],
    /// sorts all rows by price, removes exact duplicates and prints the result.
    /// </summary>
    static void Main(string[] args)
    {
        List<entry> data = new List<entry>();
        //parse the input files and add the data to a list
        ParseFile(data, args[0], ',');
        ParseFile(data, args[1], '|');
        //sort the list (by price first)
        data.Sort(CompareEntries);
        //remove duplicates (list must be sorted so that equal rows are adjacent)
        int i = 1;
        while (i < data.Count)
        {
            if (CompareEntries(data[i], data[i - 1]) == 0)
                data.RemoveAt(i);
            else
                i++;
        }
        //print the results
        for (i = 0; i < data.Count; i++)
            Console.WriteLine("{0}->{1}->{2:yyyy-MM-dd HH:mm}->${3}",
                data[i].origin, data[i].destination, data[i].time, data[i].price);
        Console.ReadLine();
    }

    // Orders by price, then origin, destination and time; returns 0 only when all
    // four fields are equal. Strings are compared ordinally.
    private static int CompareEntries(entry a, entry b)
    {
        int order = a.price.CompareTo(b.price);
        if (order == 0) order = string.CompareOrdinal(a.origin, b.origin);
        if (order == 0) order = string.CompareOrdinal(a.destination, b.destination);
        if (order == 0) order = DateTime.Compare(a.time, b.time);
        return order;
    }

    /// <summary>
    /// Appends every well-formed row of <paramref name="filename"/> to <paramref name="data"/>.
    /// Rows that do not have four fields, or whose date or price cannot be parsed
    /// (such as the header row), are skipped.
    /// </summary>
    private static void ParseFile(List<entry> data, string filename, char separator)
    {
        // StreamReader(path) opens the file read-only.
        using (System.IO.StreamReader reader = new System.IO.StreamReader(filename))
        {
            string raw;
            while ((raw = reader.ReadLine()) != null)
            {
                string[] line = raw.Split(separator);
                if (line.Length != 4)
                    continue;

                DateTime time;
                double price;
                // Drop everything up to and including the currency sign.
                string priceText = line[3].Substring(line[3].IndexOf('$') + 1);
                // The files are machine-written, so parse independently of the user's culture.
                bool parsed =
                    DateTime.TryParse(line[2], System.Globalization.CultureInfo.InvariantCulture,
                        System.Globalization.DateTimeStyles.None, out time)
                    && double.TryParse(priceText,
                        System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
                        System.Globalization.CultureInfo.InvariantCulture, out price);
                if (!parsed)
                    continue; // header row or malformed data

                entry newitem = new entry();
                newitem.origin = line[0];
                newitem.destination = line[1];
                newitem.time = time;
                newitem.price = price;
                data.Add(newitem);
            }
        }
    }
}
I'm not 100% clear on what the output of your program is supposed to be, so I'll leave that part of the implementation up to you. My strategy was to use a constructor method that takes a string (that you will read from a file) and a delimiter (since it varies) and use that to create objects which you can manipulate (e.g. add to hash sets, etc).
PriceObject.cs
using System;
using System.Globalization;
namespace ConsoleApplication1
{
/// <summary>
/// One price row (origination, destination, time, price) with value equality, so that
/// duplicates collapse when instances are stored in a hash set.
/// </summary>
class PriceObject
{
    public string origination { get; set; }
    public string destination { get; set; }
    public DateTime time { get; set; }
    public decimal price { get; set; }

    /// <summary>
    /// Parses a line of the form "origin{d}destination{d}yyyy-MM-dd HH:mm{d}$price",
    /// where {d} is <paramref name="delimiter"/>.
    /// </summary>
    public PriceObject(string inputLine, char delimiter)
    {
        string[] parsed = inputLine.Split(new char[] { delimiter }, 4);
        origination = parsed[0];
        destination = parsed[1];
        time = DateTime.ParseExact(parsed[2], "yyyy-MM-dd HH:mm", CultureInfo.InvariantCulture);
        // NumberStyles.Currency with en-US accepts the leading '$'.
        price = Decimal.Parse(parsed[3], NumberStyles.Currency, new CultureInfo("en-US"));
    }

    /// <summary>True when <paramref name="obj"/> is a PriceObject with the same four values.</summary>
    public override bool Equals(object obj)
    {
        var item = obj as PriceObject;
        // null or an object of another type is never equal.
        if (item == null)
        {
            return false;
        }
        return origination.Equals(item.origination) &&
            destination.Equals(item.destination) &&
            time.Equals(item.time) &&
            price.Equals(item.price);
    }

    /// <summary>Hash code combining the same four values that Equals compares.</summary>
    public override int GetHashCode()
    {
        unchecked
        {
            var result = 17;
            result = (result * 23) + origination.GetHashCode();
            result = (result * 23) + destination.GetHashCode();
            result = (result * 23) + time.GetHashCode();
            result = (result * 23) + price.GetHashCode();
            return result;
        }
    }
}
}
Program.cs
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
namespace ConsoleApplication1
{
class Program
{
    /// <summary>
    /// Reads the comma-separated file args[0] and the pipe-separated file args[1],
    /// merges them without duplicates and prints the rows ordered by descending price.
    /// </summary>
    static void Main(string[] args)
    {
        HashSet<PriceObject> list1 = ReadPriceObjects(args[0], ',');
        HashSet<PriceObject> list2 = ReadPriceObjects(args[1], '|');
        // merge the two hash sets, order by price
        list1.UnionWith(list2);
        // OrderByDescending does not sort in place; it returns a new sequence,
        // so the sorted result is what has to be kept.
        List<PriceObject> output = list1.OrderByDescending(x => x.price).ToList();
        // display output here, e.g. define your own ToString method, etc
        foreach (var item in output)
        {
            Console.WriteLine(item.ToString());
        }
        Console.ReadLine();
    }

    // Reads one file into a set, skipping the header row and empty lines.
    private static HashSet<PriceObject> ReadPriceObjects(string path, char delimiter)
    {
        HashSet<PriceObject> items = new HashSet<PriceObject>();
        using (StreamReader reader = File.OpenText(path))
        {
            reader.ReadLine(); // this will remove the header row
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                if (String.IsNullOrEmpty(line))
                    continue;
                items.Add(new PriceObject(line, delimiter));
            }
        }
        return items;
    }
}
}

Removing Invalid Characters From XML File Before Deserialization

I have some XML I am receiving from a server that sometimes has some invalid characters that I would like to remove before deserialization. I have no control over the XML file I receive so I need to check for the invalid characters myself.
Sample XML.....
<PrintStatus>N</PrintStatus>
<CustomerPO> >>>> pearl <<<<< </CustomerPO>
<Description>PO# pearl</Description>
<BranchID>4</BranchID>
<PostDate>
<Date>01/13/2015</Date>
</PostDate>
<ShipDate>
<Date>01/13/2015</Date>
</ShipDate>
As you can see, the customer po section has the invalid characters I need to remove. This sometimes occurs only in certain elements that include user typed data.
Here is my Response code.....
//configure http request
HttpWebRequest httpRequest = WebRequest.Create(url) as HttpWebRequest;
httpRequest.Method = "POST";
//prepare correct encoding for XML serialization
UTF8Encoding encoding = new UTF8Encoding();
//use Xml property to obtain serialized XML data
//convert into bytes using encoding specified above and get length
byte[] bodyBytes = encoding.GetBytes(Xml);
httpRequest.ContentLength = bodyBytes.Length;
//fill the request stream with the serialized XML data; the using block closes it even if Write throws
using (Stream httpRequestBodyStream = httpRequest.GetRequestStream())
{
    httpRequestBodyStream.Write(bodyBytes, 0, bodyBytes.Length);
}
//get http response and extract its body; response and reader are disposed when the block exits
string httpResponseBody;
using (HttpWebResponse httpResponse = httpRequest.GetResponse() as HttpWebResponse)
using (StreamReader httpResponseStream = new StreamReader(httpResponse.GetResponseStream(), System.Text.Encoding.ASCII))
{
    httpResponseBody = httpResponseStream.ReadToEnd();
}
//ignore everything that isn't XML by removing headers
httpResponseBody = httpResponseBody.Substring(httpResponseBody.IndexOf("<?xml"));
//deserialize XML into MyResponseClass
XmlSerializer serializer = new XmlSerializer(typeof(MyResponseClass));
using (StringReader responseReader = new StringReader(httpResponseBody))
{
    //return MyResponseClass result
    return serializer.Deserialize(responseReader) as MyResponseClass;
}
Does anyone happen to have any suggestions to check the XML? Should I just check the elements I am concerned with right before the xml string gets deserialized? Or is there a better way?
A general fix for your problem would be to recursively descend the XML, parsing as you go and comparing to the schema for that node. At any point if the input differs from the input expected from the schema, or is malformed in some way, allow an error handler to run to fix the input stream, rolling back to the most recent good state and proceeding forward with the fixed input.
The .Net XmlTextReader class is not flexible enough to do this. However, if you know in advance from the schema that certain XML elements cannot have children, then the following will read an XML input stream and, upon encountering an element whose fully qualified name matches one of the known leaf-node names, "escape" the text of that node:
// Outcome of an attempt to repair an XML string.
public enum XmlDoctorStatus
{
    // The XML was read to the end without any change being made.
    NoFixNeeded = 0,
    // At least one text node was escaped.
    FixMade = 1,
    // The XML could not be read and could not be repaired.
    FixFailed = 2
}
/// <summary>
/// Repairs XML in which the text of known leaf ("childless") elements contains
/// unescaped markup characters. The XML is read repeatedly; each pass escapes the text
/// of the first offending leaf element it reaches and restarts, until a pass completes
/// without making a change or fails.
/// </summary>
public class XmlDoctor
{
    // Describes one fix: the XML before and after, and where the fixed text started.
    internal class XmlFixData
    {
        public string InitialXml { get; private set; }
        public string FixedXml { get; private set; }
        public int LineNumber { get; private set; }
        public int LinePosition { get; private set; }
        public XmlFixData(string initialXml, string fixedXml, int lineNumber, int linePosition)
        {
            this.InitialXml = initialXml;
            this.FixedXml = fixedXml;
            this.LineNumber = lineNumber;
            this.LinePosition = linePosition;
        }
        // True when this fix lies strictly after the other one in the document.
        public bool ComesAfter(XmlFixData other)
        {
            if (LineNumber > other.LineNumber)
                return true;
            if (LineNumber == other.LineNumber && LinePosition > other.LinePosition)
                return true;
            return false;
        }
    }

    // Thrown by HandleText to abort the current pass once a fix has been made.
    internal class XmlFixedException : Exception
    {
        public XmlFixData XmlFixData { get; private set; }
        public XmlFixedException(XmlFixData data)
        {
            this.XmlFixData = data;
        }
    }

    readonly HashSet<XName> childlessNodes;
    public string OriginalXml { get; private set; }

    /// <param name="xml">The XML text to repair; must not be null.</param>
    /// <param name="childlessNodes">Names of the elements whose text content should be escaped.</param>
    public XmlDoctor(string xml, IEnumerable<XName> childlessNodes)
    {
        if (xml == null)
            throw new ArgumentNullException("xml");
        this.OriginalXml = xml;
        this.childlessNodes = new HashSet<XName>(childlessNodes);
    }

    // State of the pass in progress: the start offset of each line in passXml,
    // the normalised XML being read, and a guard against nested passes.
    List<int> indices = null;
    string passXml = string.Empty;
    bool inPass = false;

    void InitializePass(string xml)
    {
        if (inPass)
            throw new InvalidOperationException("nested pass");
        ClearElementData();
        TextHelper.NormalizeLines(xml, out passXml, out indices);
        inPass = true;
    }

    void EndPass()
    {
        inPass = false;
        indices = null;
        passXml = string.Empty;
        ClearElementData();
    }

    static int LineNumber(XmlReader reader)
    {
        return ((IXmlLineInfo)reader).LineNumber;
    }

    static int LinePosition(XmlReader reader)
    {
        return ((IXmlLineInfo)reader).LinePosition;
    }

    // Taken from https://stackoverflow.com/questions/1132494/string-escape-into-xml
    /// <summary>
    /// Replaces the five XML markup characters with their predefined entities.
    /// A character that already starts one of those entities (the '&amp;' of an existing
    /// entity) is left alone, so escaping an already escaped string changes nothing.
    /// </summary>
    public static string XmlEscape(string escaped)
    {
        var replacements = new KeyValuePair<string, string>[]
        {
            new KeyValuePair<string,string>("&", "&amp;"),
            new KeyValuePair<string,string>("\"", "&quot;"),
            new KeyValuePair<string,string>("'", "&apos;"),
            new KeyValuePair<string,string>("<", "&lt;"),
            new KeyValuePair<string,string>(">", "&gt;"),
        };
        foreach (var pair in replacements)
            // Work from the end so that earlier indexes stay valid while the string grows.
            foreach (var index in escaped.IndexesOf(pair.Key, 0).Reverse())
                if (!replacements.Any(other => string.Compare(other.Value, 0, escaped, index, other.Value.Length, StringComparison.Ordinal) == 0))
                {
                    escaped = escaped.Substring(0, index) + pair.Value + escaped.Substring(index + 1, escaped.Length - index - 1);
                }
        return escaped;
    }

    // Dispatches on the node type; only element starts, element ends and text are tracked.
    void HandleNode(XmlReader reader)
    {
        // Adapted from http://blogs.msdn.com/b/mfussell/archive/2005/02/12/371546.aspx
        if (reader == null)
        {
            throw new ArgumentNullException("reader");
        }
        switch (reader.NodeType)
        {
            case XmlNodeType.Element:
                HandleStartElement(reader);
                if (reader.IsEmptyElement)
                {
                    HandleEndElement(reader);
                }
                break;
            case XmlNodeType.Text:
                HandleText(reader);
                break;
            case XmlNodeType.EndElement:
                HandleEndElement(reader);
                break;
            default:
                // Whitespace, CDATA, entity references, declarations, processing
                // instructions, document types and comments need no handling.
                break;
        }
    }

    // If the current element is one of the childless nodes, escapes its raw text in
    // passXml. When that changes the text, throws XmlFixedException carrying the fixed XML.
    private void HandleText(XmlReader reader)
    {
        if (string.IsNullOrEmpty(currentElementLocalName) || string.IsNullOrEmpty(currentElementName))
            return;
        var name = XName.Get(currentElementLocalName, currentElementNameSpace);
        if (!childlessNodes.Contains(name))
            return;
        var lineIndex = LineNumber(reader) - 1;
        var charIndex = LinePosition(reader) - 1;
        if (lineIndex < 0 || charIndex < 0)
            return;
        int startIndex = indices[lineIndex] + charIndex;
        // Scan forward in the input string until we find either the beginning of a CDATA section or the end of this element.
        // Patterns to match: </Name
        //
        string pattern1 = "</" + currentElementName;
        var index1 = FindElementEnd(passXml, startIndex, pattern1);
        if (index1 < 0)
            return; // BAD XML.
        string pattern2 = "<![CDATA[";
        var index2 = passXml.IndexOf(pattern2, startIndex, StringComparison.Ordinal);
        int endIndex = (index2 < 0 ? index1 : Math.Min(index1, index2));
        var text = passXml.Substring(startIndex, endIndex - startIndex);
        var escapeText = XmlEscape(text);
        if (escapeText != text)
        {
            // Escaping must be stable; otherwise repeated passes would never converge.
            if (escapeText != XmlEscape(escapeText))
            {
                Debug.Assert(escapeText == XmlEscape(escapeText));
                throw new InvalidOperationException("Escaping error");
            }
            string fixedXml = passXml.Substring(0, startIndex) + escapeText + passXml.Substring(endIndex, passXml.Length - endIndex);
            throw new XmlFixedException(new XmlFixData(passXml, fixedXml, lineIndex + 1, charIndex + 1));
        }
    }

    static bool IsXmlSpace(char ch)
    {
        // http://www.w3.org/TR/2000/REC-xml-20001006#NT-S
        // [3] S ::= (#x20 | #x9 | #xD | #xA)+
        return ch == '\u0020' || ch == '\u0009' || ch == '\u000D' || ch == '\u000A';
    }

    // Returns the index of the first occurrence of tagEnd at or after charPos that is
    // followed by optional XML white space and '>', or -1 when there is none.
    private static int FindElementEnd(string passXml, int charPos, string tagEnd)
    {
        while (true)
        {
            var index = passXml.IndexOf(tagEnd, charPos, StringComparison.Ordinal);
            if (index < 0)
                return index;
            int endPos = index + tagEnd.Length;
            if (endPos >= passXml.Length)
                return -1; // Bad xml?
            // Now we must have zero or more white space characters and a ">"
            while (endPos < passXml.Length && IsXmlSpace(passXml[endPos]))
                endPos++;
            if (endPos >= passXml.Length)
                return -1; // BAD XML;
            if (passXml[endPos] == '>')
                return index;
            // Spurious ending (e.g. a longer tag name); resume the search after it.
            charPos = endPos;
        }
    }

    // Name of the element most recently opened and not yet closed.
    string currentElementName = string.Empty;
    string currentElementNameSpace = string.Empty;
    string currentElementLocalName = string.Empty;

    private void HandleStartElement(XmlReader reader)
    {
        currentElementName = reader.Name;
        currentElementLocalName = reader.LocalName;
        currentElementNameSpace = reader.NamespaceURI;
    }

    private void HandleEndElement(XmlReader reader)
    {
        ClearElementData();
    }

    private void ClearElementData()
    {
        currentElementName = string.Empty;
        currentElementNameSpace = string.Empty;
        currentElementLocalName = string.Empty;
    }

    /// <summary>
    /// Runs fix passes until the XML reads cleanly or a pass fails.
    /// </summary>
    /// <param name="newXml">The fixed XML on FixMade; otherwise the original XML.</param>
    /// <returns>NoFixNeeded, FixMade or FixFailed.</returns>
    public XmlDoctorStatus TryFix(out string newXml)
    {
        XmlFixData data = null;
        while (true)
        {
            XmlFixData newData;
            var status = TryFixOnePass((data == null ? OriginalXml : data.FixedXml), out newData);
            switch (status)
            {
                case XmlDoctorStatus.FixFailed:
                    Debug.WriteLine("Could not fix XML");
                    newXml = OriginalXml;
                    return XmlDoctorStatus.FixFailed;
                case XmlDoctorStatus.FixMade:
                    // Each fix must lie after the previous one; otherwise no progress is being made.
                    if (data != null && !newData.ComesAfter(data))
                    {
                        Debug.WriteLine("Warning -- possible infinite loop detected, aborting fix");
                        newXml = OriginalXml;
                        return XmlDoctorStatus.FixFailed;
                    }
                    data = newData;
                    break; // Try to fix more
                case XmlDoctorStatus.NoFixNeeded:
                    if (data == null)
                    {
                        newXml = OriginalXml;
                        return XmlDoctorStatus.NoFixNeeded;
                    }
                    else
                    {
                        newXml = data.FixedXml;
                        return XmlDoctorStatus.FixMade;
                    }
            }
        }
    }

    // Reads the XML once. Returns FixMade (with the fix in data) as soon as a text node
    // was escaped, FixFailed when reading threw any other exception, and NoFixNeeded
    // when the whole document was read without a change.
    XmlDoctorStatus TryFixOnePass(string xml, out XmlFixData data)
    {
        try
        {
            InitializePass(xml);
            using (var textReader = new StringReader(passXml))
            using (XmlReader reader = XmlReader.Create(textReader))
            {
                while (reader.Read())
                {
                    HandleNode(reader);
                }
            }
        }
        catch (XmlFixedException ex)
        {
            // Success - a fix was made.
            data = ex.XmlFixData;
            return XmlDoctorStatus.FixMade;
        }
        catch (Exception ex)
        {
            // Failure - the file was not fixed and could not be parsed.
            Debug.WriteLine("Fix Failed: " + ex.ToString());
            data = null;
            return XmlDoctorStatus.FixFailed;
        }
        finally
        {
            EndPass();
        }
        // No fix needed.
        data = null;
        return XmlDoctorStatus.NoFixNeeded;
    }
}
public static class TextHelper
{
    /// <summary>
    /// Rewrites <paramref name="text"/> so that every line ends with
    /// Environment.NewLine and records where each line starts.
    /// </summary>
    /// <param name="text">Text to normalise.</param>
    /// <param name="newText">The normalised text.</param>
    /// <param name="lineIndices">For each line, the index of its first character in <paramref name="newText"/>.</param>
    public static void NormalizeLines(string text, out string newText, out List<int> lineIndices)
    {
        var sb = new StringBuilder();
        var indices = new List<int>();
        using (var sr = new StringReader(text))
        {
            string line;
            while ((line = sr.ReadLine()) != null)
            {
                indices.Add(sb.Length);
                sb.AppendLine(line);
            }
        }
        lineIndices = indices;
        newText = sb.ToString();
    }

    /// <summary>
    /// Lazily yields the index of every non-overlapping occurrence of
    /// <paramref name="value"/> in <paramref name="str"/>, starting at <paramref name="startAt"/>.
    /// Yields nothing when <paramref name="str"/> is null or <paramref name="value"/> is null or empty.
    /// </summary>
    public static IEnumerable<int> IndexesOf(this string str, string value, int startAt)
    {
        // An empty value would match at every position without ever advancing.
        if (str == null || string.IsNullOrEmpty(value))
            yield break;
        for (int index = startAt, valueLength = value.Length; ; index += valueLength)
        {
            // Ordinal: the search is for exact characters, independent of the current culture.
            index = str.IndexOf(value, index, StringComparison.Ordinal);
            if (index == -1)
                break;
            yield return index;
        }
    }
}
Then use it like:
public static class TestXmlDoctor
{
    /// <summary>
    /// Runs the fixer on a sample document whose CustomerPO element contains raw
    /// '&gt;' and '&lt;' characters. Any exception is written to the debug output.
    /// </summary>
    public static void TestFix()
    {
        // Verbatim string literal (@"..."): spans several lines, quotes are doubled.
        string xml1 = @"<?xml version=""1.0"" encoding=""UTF-8""?>
<MainClass>
<PrintStatus>N</PrintStatus>
<CustomerPO> >>>> pearl <<<<< </CustomerPO>
<Description>PO# pearl</Description>
<BranchID>4</BranchID>
<PostDate>
<Date>01/13/2015</Date>
</PostDate>
<ShipDate>
<Date>01/13/2015</Date>
</ShipDate>
</MainClass>
";
        XName[] childlessNodes1 = new XName[]
        {
            XName.Get("CustomerPO", string.Empty),
        };
        try
        {
            TestFix(xml1, childlessNodes1);
        }
        catch (Exception ex)
        {
            Debug.WriteLine(ex);
        }
    }

    /// <summary>
    /// Tries to fix <paramref name="xml"/> and reports the outcome on the debug output.
    /// </summary>
    /// <returns>The fixed XML when a fix was made; otherwise the input unchanged.</returns>
    public static string TestFix(string xml, IEnumerable<XName> childlessNodes)
    {
        string fixedXml;
        var status = (new XmlDoctor(xml, childlessNodes).TryFix(out fixedXml));
        switch (status)
        {
            case XmlDoctorStatus.NoFixNeeded:
                return xml;
            case XmlDoctorStatus.FixFailed:
                Debug.WriteLine("Failed to fix xml");
                return xml;
            case XmlDoctorStatus.FixMade:
                Debug.WriteLine("Fixed XML, new XML is as follows:");
                Debug.WriteLine(fixedXml);
                Debug.WriteLine(string.Empty);
                return fixedXml;
            default:
                Debug.Assert(false, "Unknown fix status " + status.ToString());
                return xml;
        }
    }
}
With this, your XML fragment can be parsed, and becomes:
<?xml version="1.0" encoding="UTF-8"?>
<MainClass>
<PrintStatus>N</PrintStatus>
<CustomerPO> &gt;&gt;&gt;&gt; pearl &lt;&lt;&lt;&lt;&lt; </CustomerPO>
<Description>PO# pearl</Description>
<BranchID>4</BranchID>
<PostDate>
<Date>01/13/2015</Date>
</PostDate>
<ShipDate>
<Date>01/13/2015</Date>
</ShipDate>
</MainClass>

Asserting Elements Parsed from XML

I have a class that parses an XML document in C# using XElement.
I parse the XML for example:
// Projects every <topLevel> child of the root into an Element. The query is lazy:
// parsing of the individual values happens when the sequence is enumerated.
IEnumerable<Element> elements = XElement.Parse(xml)
    .Elements("topLevel")
    .Select(topLevelElement => new Element()
    {
        LongElement = Int64.Parse(topLevelElement.Element("long").Value),
        StringElement = topLevelElement.Element("string").Value,
        DateTimeElement = DateTime.Parse(topLevelElement.Element("datetime").Value)
    });
What would be the best way to assert that the elements were properly parsed? I would like to check that LongElement, StringElement, and DateTimeElement are not null after parsing, but if there is a better way to go about this, I am open to it.
If you are unsure of the values that may be returned by the elements, you should really be using TryParse e.g.
// TryParse reports failure through its return value instead of throwing.
string s = "3";
int i = 0;
bool parsed = Int32.TryParse(s, out i);
if (!parsed)
{
    // Invalid integer.
}
else
{
    // Valid integer, now stored in i.
}
Both your data types DateTime and Int32 have TryParse as an available method. As for a string, you can just do a trivial == null or String.IsNullOrEmpty
I would use functions from within Linq. These allow you to either throw an exception or set required defaults if you want your application to be not so strict ;)
Anyways, you get more control:
// Same projection as before, but the long value goes through ConvertToInt so that
// conversion policy (default value, exception, ...) lives in one place.
var topLevelElements = XElement.Parse(xml).Elements("topLevel");
var elements = topLevelElements.Select(topLevelElement => new Element()
{
    LongElement = ConvertToInt(topLevelElement.Element("long").Value),
    StringElement = topLevelElement.Element("string").Value,
    DateTimeElement = DateTime.Parse(topLevelElement.Element("datetime").Value)
});
Here ConvertToInt can do whatever you want, for example:
/// <summary>
/// Converts <paramref name="value"/> to an <see cref="int"/> without throwing.
/// </summary>
/// <param name="value">A boxed int, or any object whose text form is an integer
/// (for example the string taken from <c>XElement.Value</c>). May be null.</param>
/// <returns>The converted value, or 0 when <paramref name="value"/> is null, is not
/// numeric text, or does not fit in an <see cref="int"/>.</returns>
public int ConvertToInt(object value)
{
    // Already an int: just unbox it.
    if (value is int)
        return (int)value;

    // The callers shown pass element text (a string), so parse the text form.
    // Invariant culture: XML content is machine-readable, not user-facing.
    int parsed;
    if (value != null &&
        int.TryParse(value.ToString(),
                     System.Globalization.NumberStyles.Integer,
                     System.Globalization.CultureInfo.InvariantCulture,
                     out parsed))
        return parsed;

    // Not convertible: fall back to the default. Throw here instead if the
    // application should be strict about bad input.
    return default(int);
}
This is also a more reusable layout.
I would store the parse states in the element as a KeyValuePair:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Xml.Linq;
namespace ConsoleApplication1
{
    /// <summary>
    /// Demo: parses each &lt;topLevel&gt; element into an <see cref="Element"/> whose
    /// properties pair a parse-state code with the parsed (or default) value, then
    /// prints the state, value and type of every property.
    /// </summary>
    internal class Program
    {
        // Parse-state codes stored in the Key of each KeyValuePair. Codes 0-3 index
        // into the "states" display table in Main.
        private const byte StateNull = 0;      // element is missing
        private const byte StateEmpty = 1;     // element is present but has no text
        private const byte StateNoParse = 2;   // text is present but could not be parsed
        private const byte StateValue = 3;     // text was parsed successfully
        private const byte StateUnknown = 255; // property name not recognised

        private static void Main(string[] args)
        {
            var states = new string[] { "null", "empty", "noparse", "value" };
            var xml = "<root>";
            xml += "<topLevel><long>-13451245234</long><string>hello world</string><datetime>1/1/2012 8:00AM</datetime></topLevel>";
            xml += "<topLevel><long>4563264643</long><string>lipsum</string><datetime></datetime></topLevel>";
            xml += "<topLevel><string>hello world</string><datetime>1/1/2012 8:00AM</datetime></topLevel>";
            xml += "</root>";
            IEnumerable<Element> elements =
                from topLevelElement in XElement.Parse(xml).Elements("topLevel")
                select new Element
                {
                    LongElement = ParseValue(topLevelElement, "long"),
                    DateTimeElement = ParseValue(topLevelElement, "datetime"),
                    StringElement = ParseValue(topLevelElement, "string"),
                };
            var idx = 0;
            // Plain foreach: the loop exists only for its console output.
            foreach (var e in elements)
            {
                Console.WriteLine("---- ELEMENT #{0} -----", idx++);
                Console.WriteLine("[long] State: {0}\tValue:{1}\tType:{2}", states[e.LongElement.Key], e.LongElement.Value, (e.LongElement.Value).GetType());
                Console.WriteLine("[datetime] State: {0}\tValue:{1}\tType:{2}", states[e.DateTimeElement.Key], e.DateTimeElement.Value, (e.DateTimeElement.Value).GetType());
                // The string default is null (missing/empty element); avoid calling GetType() on it.
                Console.WriteLine("[string] State: {0}\tValue:{1}\tType:{2}", states[e.StringElement.Key], e.StringElement.Value, e.StringElement.Value == null ? null : (e.StringElement.Value).GetType());
            }
        }

        /// <summary>
        /// Reads the child element <paramref name="propname"/> of <paramref name="parent"/>
        /// and returns a KeyValuePair of (state code, value) typed for that property.
        /// </summary>
        /// <param name="parent">The element whose child is read.</param>
        /// <param name="propname">"long", "datetime" or "string".</param>
        /// <returns>A KeyValuePair whose value is the parsed value on success, or the
        /// property's default when the child is missing, empty or unparseable.
        /// Null for an unrecognised <paramref name="propname"/>.</returns>
        private static dynamic ParseValue(XElement parent, String propname)
        {
            var prop = parent.Element(propname);
            if (prop == null)
                return GetKVP(propname, StateNull, GetDefaultValue(propname));
            if (string.IsNullOrEmpty(prop.Value))
                return GetKVP(propname, StateEmpty, GetDefaultValue(propname));

            switch (propname)
            {
                case "string":
                    return GetKVP(propname, StateValue, prop.Value);
                case "long":
                    Int64 longvalue;
                    if (Int64.TryParse(prop.Value, out longvalue))
                        return GetKVP(propname, StateValue, longvalue);
                    break;
                case "datetime":
                    DateTime datetimevalue;
                    if (DateTime.TryParse(prop.Value, out datetimevalue))
                        return GetKVP(propname, StateValue, datetimevalue);
                    break;
                default:
                    return GetKVP(propname, StateUnknown, GetDefaultValue(propname));
            }

            // Parse failed. Pair the state with the property's default value: passing
            // null here would make GetKVP unbox null to Int64/DateTime and throw.
            return GetKVP(propname, StateNoParse, GetDefaultValue(propname));
        }

        /// <summary>
        /// Builds the strongly typed KeyValuePair for <paramref name="propname"/>.
        /// </summary>
        /// <param name="propname">"long", "datetime" or "string".</param>
        /// <param name="state">The parse-state code to store as the key.</param>
        /// <param name="val">The value to store; must be a boxed Int64/DateTime for the
        /// "long"/"datetime" properties (null is only valid for "string").</param>
        /// <returns>The typed pair, or null for an unrecognised property name.</returns>
        private static dynamic GetKVP(string propname, byte state, object val)
        {
            if (propname == "long") return new KeyValuePair<byte, Int64>(state, (Int64)val);
            if (propname == "datetime") return new KeyValuePair<byte, DateTime>(state, (DateTime)val);
            if (propname == "string") return new KeyValuePair<byte, String>(state, (String)val);
            return null;
        }

        /// <summary>
        /// Returns the placeholder value used when a property has no parsed value:
        /// long.MinValue, DateTime.MinValue, or null for "string" and unknown names.
        /// </summary>
        private static dynamic GetDefaultValue(string propname)
        {
            if (propname == "long") return long.MinValue;
            if (propname == "datetime") return DateTime.MinValue;
            if (propname == "string") return null;
            return null;
        }

        #region Nested type: Element
        /// <summary>One parsed &lt;topLevel&gt; element.</summary>
        public struct Element
        {
            // Key = parse state stored as byte: 0 = null (missing), 1 = empty,
            // 2 = noparse, 3 = value. Value = parsed value, or the default otherwise.
            public KeyValuePair<byte,Int64> LongElement { get; set; }
            public KeyValuePair<byte,String> StringElement { get; set; }
            public KeyValuePair<byte,DateTime> DateTimeElement { get; set; }
        }
        #endregion
    }
}

Categories