I'm using HTML Agility Pack to web scrape to datatable. However the website have multiple same column name which it was not able to add on for the second table.
The error will be prompt out like this as the "2020" had been added before
My code as below :
public void WebDataScrap()
{
try
{
//Get the content of the URL from the Web
const string url = "https://www.wsj.com/market-data/quotes/MY/XKLS/0146/financials/annual/cash-flow";
var web = new HtmlWeb();
var doc = web.Load(url);
const string classValue = "cr_dataTable"; //cr_datatable
//var nodes = doc.DocumentNode.SelectNodes($"//table[#class='{classValue}']") ?? Enumerable.Empty<HtmlNode>();
var resultDataset = new DataSet();
foreach (HtmlNode table in doc.DocumentNode.SelectNodes($"//table[#class='{classValue}']") ?? Enumerable.Empty<HtmlNode>())
{
var resultTable = new DataTable(table.Id);
foreach (HtmlNode row in table.SelectNodes("//tr"))
{
var headerCells = row.SelectNodes("th");
if (headerCells != null)
{
foreach (HtmlNode cell in headerCells)
{
resultTable.Columns.Add(cell.InnerText);
}
}
var dataCells = row.SelectNodes("td");
if (dataCells != null)
{
var dataRow = resultTable.NewRow();
for (int i = 0; i < dataCells.Count; i++)
{
dataRow[i] = dataCells[i].InnerText;
}
resultTable.Rows.Add(dataRow);
}
}
}
}
catch (Exception ex)
{
MessageBox.Show(ex.ToString());
}
}
The URL i trying to web scrape : https://www.wsj.com/market-data/quotes/MY/XKLS/0146/financials/annual/cash-flow
I did try to do looping to skip if it was having the same name but it will prompt that the column unable to find when I try to debug.
Is there any solution that can help to solve this? In the end I will need to export the datatable to csv/excel file.
Thanks
I think you want to do this instead:
foreach (HtmlNode table in doc.DocumentNode.SelectNodes($"//table[#class='{classValue}']") ?? Enumerable.Empty<HtmlNode>())
{
var resultTable = new DataTable(table.Id);
// select all the headers and add them to the table
var headerCells = table.SelectNodes("thead/tr/th");
if (headerCells != null)
{
foreach (HtmlNode cell in headerCells)
{
resultTable.Columns.Add(cell.InnerText);
}
}
// select all the rows and add them to the table
foreach (HtmlNode row in table.SelectNodes("tbody/tr"))
{
var dataCells = row.SelectNodes("td");
if (dataCells != null)
{
var dataRow = resultTable.NewRow();
for (int i = 0; i < dataCells.Count; i++)
{
dataRow[i] = dataCells[i].InnerText;
}
resultTable.Rows.Add(dataRow);
}
}
}
The header section and the data section each have their own loop rather than the header section being nested in the data loop. We're also being more explicit about where we want data from: the header should come from thead/tr/th and the data should come from tbody/tr.
I am very new at C#. In my project I need to create a csv file which will get data from a xml data. Now, I can get data from XML, and print in looger for some particulaer attributes from xml. But I am not sure how can I store my Data into CSV file for that particular attribues.
Here is my XML file that I need to create a CSV file.
<?xml version="1.0" encoding="utf-8"?>
<tlp:WorkUnits xmlns:tlp="http://www.timelog.com/XML/Schema/tlp/v4_4"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.timelog.com/XML/Schema/tlp/v4_4 http://www.timelog.com/api/xsd/WorkUnitsRaw.xsd">
<tlp:WorkUnit ID="130">
<tlp:EmployeeID>3</tlp:EmployeeID>
<tlp:AllocationID>114</tlp:AllocationID>
<tlp:TaskID>239</tlp:TaskID>
<tlp:ProjectID>26</tlp:ProjectID>
<tlp:ProjectName>LIK Template</tlp:ProjectName>
<tlp:CustomerId>343</tlp:CustomerId>
<tlp:CustomerName>Lekt Corp Inc.</tlp:CustomerName>
<tlp:IsBillable>1</tlp:IsBillable>
<tlp:ApprovedStatus>0</tlp:ApprovedStatus>
<tlp:LastModifiedBy>AL</tlp:LastModifiedBy>
</tlp:WorkUnit>
And my Code where I am getting this value in logger.But I am not sure how can I create a csv file that stores that value in order.
Edited
namespace TimeLog.ApiConsoleApp
{
/// <summary>
/// Template class for consuming the reporting API
/// </summary>
public class ConsumeReportingApi
{
private static readonly ILog Logger = LogManager.GetLogger(typeof(ConsumeReportingApi));
public static void Consume()
{
if (ServiceHandler.Instance.TryAuthenticate())
{
if (Logger.IsInfoEnabled)
{
Logger.Info("Successfully authenticated on reporting API");
}
var customersRaw = ServiceHandler.Instance.Client.GetWorkUnitsRaw(ServiceHandler.Instance.SiteCode,
ServiceHandler.Instance.ApiId,
ServiceHandler.Instance.ApiPassword,
WorkUnit.All,
Employee.All,
Allocation.All,
Task.All,
Project.All,
Department.All,
DateTime.Now.AddDays(-5).ToString(),
DateTime.Now.ToString()
);
if (customersRaw.OwnerDocument != null)
{
var namespaceManager = new XmlNamespaceManager(customersRaw.OwnerDocument.NameTable);
namespaceManager.AddNamespace("tlp", "http://www.timelog.com/XML/Schema/tlp/v4_4");
var workUnit = customersRaw.SelectNodes("tlp:WorkUnit", namespaceManager);
var output = new StringBuilder();
output.AppendLine("AllocationID,ApprovedStatus,CustomerId,CustomerName,EmployeeID");
if (workUnit != null)
{
foreach (XmlNode customer in workUnit)
{
var unit = new WorkUnit();
var childNodes = customer.SelectNodes("./*");
if (childNodes != null)
{
foreach (XmlNode childNode in childNodes)
{
if (childNode.Name == "tlp:EmployeeID")
{
unit.EmployeeID = Int32.Parse(childNode.InnerText);
}
if (childNode.Name == "tlp:EmployeeFirstName")
{
unit.EmployeeFirstName = childNode.InnerText;
}
if (childNode.Name == "tlp:EmployeeLastName")
{
unit.EmployeeLastName = childNode.InnerText;
}
if (childNode.Name == "tlp:AllocationID")
{
unit.AllocationID = Int32.Parse(childNode.InnerText);
}
if (childNode.Name == "tlp:TaskName")
{
unit.TaskName = childNode.InnerText;
}
}
}
output.AppendLine($"{unit.EmployeeID},{unit.EmployeeFirstName},{unit.EmployeeLastName},{unit.AllocationID},{unit.TaskName}");
//Console.WriteLine("---");
}
Console.WriteLine(output.ToString());
File.WriteAllText("c:\\...\\WorkUnits.csv", output.ToString());
}
}
else
{
if (Logger.IsWarnEnabled)
{
Logger.Warn("Failed to authenticate to reporting API");
}
}
}
}
}
}
You want to write the columns in the correct order to the CSV (of course), so you need to process them in the correct order. Two options:
intermediate class
Create a new class (let's call it WorkUnit) with properties for each of the columns that you want to write to the CSV. Create a new instance for every <tlp:WorkUnit> node in your XML and fill the properties when you encounter the correct subnodes. When you have processed the entire WorkUnit node, write out the properties in the correct order.
var output = new StringBuilder();
foreach (XmlNode customer in workUnit)
{
// fresh instance of the class that holds all columns (so all properties are cleared)
var unit = new WorkUnit();
var childNodes = customer.SelectNodes("./*");
if (childNodes != null)
{
foreach (XmlNode childNode in childNodes)
{
if(childNode.Name== "tlp:EmployeeID")
{
// employeeID node found, now write to the corresponding property:
unit.EmployeeId = childNode.InnerText;
}
// etc for the other XML nodes you are interested in
}
// all nodes have been processed for this one WorkUnit node
// so write a line to the CSV
output.AppendLine($"{unit.EmployeeId},{unit.AllocationId}, etc");
}
read in correct order
Instead of using foreach to loop through all subnodes in whatever order they appear, search for specific subnodes in the order you want. Then you can write out the CSV in the same order. Note that even when you don't find some subnode, you still need to write out the separator.
var output = new StringBuilder();
foreach (XmlNode customer in workUnit)
{
// search for value for first column (EmployeeID)
var node = workUnit.SelectSingleNode("tlp:EmployeeID");
if (node != null)
{
output.Append(node.InnerText).Append(',');
}
else
{
output.Append(','); // no content, but we still need a separator
}
// etc for the other columns
And of course watch out for string values that contain the separator.
Assuming that you put your XML data into List
StringBuilder str = new StringBuilder();
foreach (var fin list.ToList())
{
str.Append(fin.listfield.ToString() + ",");
}
to create a new line:
str.Replace(",", Environment.NewLine, str.Length - 1, 1);
to save:
string filename=(DirectoryPat/filename.csv");
File.WriteAllText(Filename, str.ToString());
Try this:
var output = new StringBuilder();
output.AppendLine("AllocationID,ApprovedStatus,CustomerId,CustomerName,EmployeeID");
if (workUnit != null)
{
foreach (XmlNode customer in workUnit)
{
var unit = new WorkUnit();
var childNodes = customer.SelectNodes("./*");
if (childNodes != null)
{
for (int i = 0; i<childNodes.Count; ++i)
{
XmlNode childNode = childNodes[i];
if (childNode.Name == "tlp:EmployeeID")
{
unit.EmployeeID = Int32.Parse(childNode.InnerText);
}
if (childNode.Name == "tlp:EmployeeFirstName")
{
unit.EmployeeFirstName = childNode.InnerText;
}
if (childNode.Name == "tlp:EmployeeLastName")
{
unit.EmployeeLastName = childNode.InnerText;
}
if (childNode.Name == "tlp:AllocationID")
{
unit.AllocationID = Int32.Parse(childNode.InnerText);
}
if (childNode.Name == "tlp:TaskName")
{
unit.TaskName = childNode.InnerText;
}
output.Append(childNode.InnerText);
if (i<childNodes.Count - 1)
output.Append(",");
}
output.Append(Environment.NewLine);
}
}
Console.WriteLine(output.ToString());
File.WriteAllText("c:\\Users\\mnowshin\\projects\\WorkUnits.csv", output.ToString());
}
You can use this sequence:
a. Deserialize (i.e. convert from XML to C# objects) your XML.
b. Write a simple loop to write the data to a file.
The advantages of this sequence:
You can use a list of your data/objects "readable" that you can add any other access code to it.
If you XML schema changed at any time, you can maintain the code very easily.
The solution
a. Desrialize:
Copy you XML file contents. Note You should modify your XML input before coping it.. You should double the WorkUnit node, in order to tell Visual Studio that you would have a list of this node nested inside WorkUnits node.
From Visual Studio Menus select Edit -> Paste Special -> Paste XML as Classes.
Use the deserialize code.
var workUnitsNode = customersRaw.SelectSingleNode("tlp:WorkUnits", namespaceManager);
XmlSerializer ser = new XmlSerializer(typeof(WorkUnits));
WorkUnits workUnits = (WorkUnits)ser.Deserialize(workUnitsNode);
b. Write the csv file
StringBuilder csvContent = new StringBuilder();
// add the header line
csvContent.AppendLine("AllocationID,ApprovedStatus,CustomerId,CustomerName,EmployeeID");
foreach (var unit in workUnits.WorkUnit)
{
csvContent.AppendFormat(
"{0}, {1}, {2}, {3}, {4}",
new object[]
{
unit.AllocationID,
unit.ApprovedStatus,
unit.CustomerId,
unit.CustomerName,
unit.EmployeeID
// you get the idea
});
csvContent.AppendLine();
}
File.WriteAllText(#"G:\Projects\StackOverFlow\WpfApp1\WorkUnits.csv", csvContent.ToString());
You can use Cinchoo ETL - if you have room to use open source library
using (var csvWriter = new ChoCSVWriter("sample1.csv").WithFirstLineHeader())
{
using (var xmlReader = new ChoXmlReader("sample1.xml"))
csvWriter.Write(xmlReader);
}
Output:
ID,tlp_EmployeeID,tlp_AllocationID,tlp_TaskID,tlp_ProjectID,tlp_ProjectName,tlp_CustomerId,tlp_CustomerName,tlp_IsBillable,tlp_ApprovedStatus,tlp_LastModifiedBy
130,3,114,239,26,LIK Template,343,Lekt Corp Inc.,1,0,AL
Disclaimer: I'm the author of this library.
By the beginning of this week Iwas having a problem with TreeView not displaying children. Everything got worked out through recursiveness. However, a new and unexpected problem arose: the methods i'm using are getting duplicate nodes on some specific DataTables.
Having this DataTable of two columns:
ParentOT ChildOT
20120601 20120602
20120601 20120603
20120601 20120604
20120601 20120611
20120601 20120612
20120602 20120605
20120602 20120606
20120602 20120607
20120602 20120608
20120602 20120610
20120603 20120607
20120603 20120608
20120603 20120609
If I try to display its Treeview I get the right treeview, but five times consecutively (the times the parent appears as parent in parentOT records).
The Methods are these:
private TreeView cargarOtPadres(TreeView trv, int otPadre, DataTable datos)
{
if (datos.Rows.Count > 0)
{
foreach (DataRow dr in datos.Select("OTPadre="+ otPadre))
{
TreeNode nodoPadre = new TreeNode();
nodoPadre.Text = dr["OTPadre"].ToString();
trv.Nodes.Add(nodoPadre);
cargarSubOts(ref nodoPadre, int.Parse(dr["OTPadre"].ToString()), datos);
}
}
return trv;
}
private void cargarSubOts(ref TreeNode nodoPadre, int otPadre, DataTable datos)
{
DataRow[] otHijas = datos.Select("OTPadre=" + otPadre);
foreach (DataRow drow in otHijas)
{
TreeNode hija = new TreeNode();
hija.Text = drow["OTHija"].ToString();
nodoPadre.Nodes.Add(hija);
cargarSubOts(ref hija, int.Parse(drow["OTHija"].ToString()), datos);
}
}
With Tables with just 1 great parent appearing 1 time only, it works great. How can i prevent the TreeView from duplicating??
I'll leave the answer for the sake of completion. This solution came courtesy of #King King
public static class TreeViewExtension
{
public static void LoadFromDataTable(this TreeView tv, DataTable dt)
{
var parentNodes = dt.AsEnumerable()
.GroupBy(row => (string)row[0])
.ToDictionary(g => g.Key, value => value.Select(x => (string)x[1]));
Stack<KeyValuePair<TreeNode, IEnumerable<string>>> lookIn = new Stack<KeyValuePair<TreeNode, IEnumerable<string>>>();
HashSet<string> removedKeys = new HashSet<string>();
foreach (var node in parentNodes)
{
if (removedKeys.Contains(node.Key)) continue;
TreeNode tNode = new TreeNode(node.Key);
lookIn.Push(new KeyValuePair<TreeNode, IEnumerable<string>>(tNode, node.Value));
while (lookIn.Count > 0)
{
var nodes = lookIn.Pop();
foreach (var n in nodes.Value)
{
IEnumerable<string> children;
TreeNode childNode = new TreeNode(n);
nodes.Key.Nodes.Add(childNode);
if (parentNodes.TryGetValue(n, out children))
{
lookIn.Push(new KeyValuePair<TreeNode, IEnumerable<string>>(childNode, children));
removedKeys.Add(n);
}
}
}
tv.Nodes.Add(tNode);
}
}
}
You create this class
And you use afterwards like this.
treeView1.LoadFromDataTable(DataTable);
Be sure to use it with a String type DataTable. If you have a int type Table, you can do something like this:
DataTable stringDataTable = intDataTable.Clone();
stringDataTable.Columns[0].DataType = typeof(string);
stringDataTable.Columns[1].DataType = typeof(string);
foreach (DataRow dr in intDataTable.Rows)
{
stringDataTable.ImportRow(dr);
}
treeView1.LoadFromDataTable(stringDataTable);
I have an log file like this..
This is the segment 1
============================
<MAINELEMENT><ELEMENT1>10-10-2013 10:10:22.444</ELEMENT1><ELEMENT2>1111</ELEMENT2>
<ELEMENT3>Message 1</ELEMENT3></MAINELEMENT>
<MAINELEMENT><ELEMENT1>10-10-2013 10:10:22.555</ELEMENT1><ELEMENT2>1111</ELEMENT2>
<ELEMENT3>Message 2</ELEMENT3></MAINELEMENT>
This is the segment 2
============================
<MAINELEMENT><ELEMENT1>10-11-2012 10:10:22.444</ELEMENT1><ELEMENT2>2222</ELEMENT2>
<ELEMENT3>Message 1</ELEMENT3></MAINELEMENT>
<MAINELEMENT><ELEMENT1>10-11-2012 10:10:22.555</ELEMENT1><ELEMENT2>2222</ELEMENT2>
<ELEMENT3>Message 2</ELEMENT3></MAINELEMENT>
How can I read this into DataTable excluding the data This is the segment 1 and This is the segment 2 and ====== lines completely.
I would like to have the Datatable as with Columns as "ELEMENT1", "ELEMENT2", "ELEMENT3" and fill the details with the content between those tags in the order of print of line.
It should not change the sequence of the order of records in the table while inserting.
HtmlAgilityPack seems to be a good tool for what you need:
using HtmlAgilityPack;
class Program
{
static void Main(string[] args)
{
var doc = new HtmlDocument();
doc.Load("log.txt");
var dt = new DataTable();
bool hasColumns = false;
foreach (HtmlNode row in doc
.DocumentNode
.SelectNodes("//mainelement"))
{
if (!hasColumns)
{
hasColumns = true;
foreach (var column in row.ChildNodes
.Where(node => node.GetType() == typeof(HtmlNode)))
{
dt.Columns.Add(column.Name);
}
}
dt.Rows.Add(row.ChildNodes
.Where(node => node.GetType() == typeof(HtmlNode))
.Select(node => node.InnerText).ToArray());
}
}
}
could do this, where stringData is the data from the file you have
var array = stringData.Split(new[] { "============================" }, StringSplitOptions.RemoveEmptyEntries);
var document = new XDocument(new XElement("Root"));
foreach (var item in array)
{
if(!item.Contains("<"))
continue;
var subDocument = XDocument.Parse("<Root>" + item.Substring(0, item.LastIndexOf('>') + 1) + "</Root>");
foreach (var element in subDocument.Root.Descendants("MAINELEMENT"))
{
document.Root.Add(element);
}
}
var table = new DataTable();
table.Columns.Add("ELEMENT1");
table.Columns.Add("ELEMENT2");
table.Columns.Add("ELEMENT3");
var rows =
document.Descendants("MAINELEMENT").Select(el =>
{
var row = table.NewRow();
row["ELEMENT1"] = el.Element("ELEMENT1").Value;
row["ELEMENT2"] = el.Element("ELEMENT2").Value;
row["ELEMENT3"] = el.Element("ELEMENT3").Value;
return row;
});
foreach (var row in rows)
{
table.Rows.Add(row);
}
foreach (DataRow dataRow in table.Rows)
{
Console.WriteLine("{0},{1},{2}", dataRow["ELEMENT1"], dataRow["ELEMENT2"], dataRow["ELEMENT3"]);
}
I'm not so sure where you problem is.
You can use XElement for reading the xml and manually creating DataTable.
For Reading the XML See Xml Parsing using XElement
Then you can create dynamically the datatable.
Heres an example of creating a datatable in code
https://sites.google.com/site/bhargavaclub/datatablec
But why do you want to use a DataTable ? There are a lot of downsides...
I'm trying to get this xml info into a table.
I've tried reading the xml into a dataset...
string myXMLfile = #"..\..\..\BR7.xml";
//http://tatts.com/pagedata/racing/2011/10/5/BR7.xml
//http://tatts.com/racing/2011/10/5/BR/7
DataSet ds = new DataSet();
try
{
ds.ReadXml(myXMLfile);
for (int i = 0; i < ds.Tables.Count; i++)
{
listBox1.Items.Add(ds.Tables[i].TableName);
}
dgvRunner.DataSource = ds;
dgvRunner.DataMember = "Runner";
dgvWinOdds.DataSource = ds;
dgvWinOdds.DataMember = "WinOdds";
dgvPlaceOdds.DataSource = ds;
dgvPlaceOdds.DataMember = "PlaceOdds";
dgvFixedOdds.DataSource = ds;
dgvFixedOdds.DataMember = "FixedOdds";
but I get four separate tables. Runner, WinOdds, PlaceOdds, and fixedOdds
How do I get all the information for a Runner into a single table?
Here's some of the xml...
-<Runner RunnerNo="1" Rtng="93" LastResult="0X1" Form="W" Weight="57.0" Handicap="0" Barrier="10" RiderChanged="N" Rider="P SCHMIDT(A)" Scratched="N" RunnerName="PREACHER BOY">
<WinOdds CalcTime="2011-10-05T16:51:07" LastCalcTime="2011-10-05T16:46:32" Short="N" Lastodds="11.50" Odds="10.70"/>
<PlaceOdds Short="N" Lastodds="3.50" Odds="3.30"/>
-<FixedOdds RaceDayDate="2011-10-05T00:00:00" MeetingCode="BR" RaceNo="07" RunnerNo="01" LateScratching="0" Status="w" OfferName="PREACHER BOY" RetailPlaceOdds="3.3500" RetailWinOdds="12.0000" PlaceOdds="3.3500" WinOdds="12.0000" OfferId="981020"><Book SubEventId="863449" BookStatus="F"/>
</FixedOdds>
</Runner>
You should have the information about RunnerNo in every table (it is missing on WinOdd and PlaceOdds) so that you can relate your four datatables. You can define a the RunnerNo as Unique
After that you and use only one gridview and assing that relation between the four datatables as the gridview's DataMember.
here is a sample of how a relation should look like
I would propose an approach of moving all the attributes of the Runner children attributes to the Runner node attributes collection. This takes the following assumptions:
Each nested element in the Runner nodes has maximum 1 nested element inside it (i.e. there is only one Book element inside the FixedOdds element)
The attributes will be renamed by prefixing them with the name of their originating node (the CalcTime attribute in the WinOdds element will be copied in the Runner attribute's collection with the name WinOddsCalcTime)
You can keep or delete the children nodes (I chose to delete them in the code sample)
Here's the code:
string myXMLfile = #"xml.xml";
DataSet ds = new DataSet();
try
{
XmlDocument doc = new XmlDocument();
doc.Load(myXMLfile);
var runners = doc.SelectNodes("/Runner");
foreach (XmlNode runner in runners)
{
foreach (XmlNode child in runner.ChildNodes)
{
for (int i = 0; i < child.Attributes.Count; i++)
{
var at =doc.CreateAttribute(child.Name + child.Attributes[i].Name);
at.Value=child.Attributes[i].Value;
runner.Attributes.Append(at);
}
if (child.Name == "FixedOdds")
{
foreach (XmlNode book in child.ChildNodes)
{
for (int i = 0; i < book.Attributes.Count; i++)
{
var at = doc.CreateAttribute(book.Name + book.Attributes[i].Name);
at.Value = book.Attributes[i].Value;
runner.Attributes.Append(at);
}
}
}
// delete the attributes and the children nodes
child.RemoveAll();
}
// delete the child noeds
while (runner.ChildNodes.Count > 0)
{
runner.RemoveChild(runner.ChildNodes[0]);
}
}
doc.Save("xml1.xml");
ds.ReadXml("xml1.xml");
for (int i = 0; i < ds.Tables.Count; i++)
{
listBox1.Items.Add(ds.Tables[i].TableName);
}
dgvRunner.DataSource = ds;
dgvRunner.DataMember = "Runner";
//dgvWinOdds.DataSource = ds;
//dgvWinOdds.DataMember = "WinOdds";
//dgvPlaceOdds.DataSource = ds;
//dgvPlaceOdds.DataMember = "PlaceOdds";
//dgvFixedOdds.DataSource = ds;
//dgvFixedOdds.DataMember = "FixedOdds";
}
catch (Exception)
{ }
}
}