C# HtmlAgilityPack Select table from specific h2 - c#

I have some html:
<h2>Results</h2>
<div class="box">
<table class="tFormat">
<th>Head</th>
<tr>1</tr>
</table>
</div>
<h2>Grades</h2>
<div class="box">
<table class="tFormat">
<th>Head</th>
<tr>1</tr>
</table>
</div>
I was wondering how I would get only the table under the "Results" heading.
I've tried:
// Asker's attempt: find every <h2>, and for the one whose text is "Results",
// print the text of the tables found from it.
var nodes = doc.DocumentNode.SelectNodes("//h2");
foreach (var o in nodes)
{
if (o.InnerText.Equals("Results"))
{
// NOTE: an XPath starting with "//" is evaluated from the document root, not
// from o, so this matches every <table> in the document. The tables are also
// siblings' children of the <h2>, not descendants of it.
foreach (var c in o.SelectNodes("//table"))
{
Console.WriteLine(c.InnerText);
}
}
}
It works but it also gets the table under Grades h2

Note that the div is not hierarchically inside the header, so it doesn't make sense to look for it there.
This can work for you - it finds the next element after the title:
if (o.InnerText.Equals("Results"))
{
    // Walk forward through the heading's siblings, skipping anything that is
    // not an element (for example whitespace text nodes), and stop at the
    // first element or at the end of the sibling list.
    var nextDiv = o.NextSibling;
    for (; nextDiv != null; nextDiv = nextDiv.NextSibling)
    {
        if (nextDiv.NodeType == HtmlNodeType.Element)
        {
            break;
        }
    }
    // nextDiv should be correct here.
}
You can also write a more specific xpath to find just that div:
// Selects the first <div> sibling that follows the <h2> whose text is exactly "Results".
doc.DocumentNode.SelectNodes("//h2[text()='Results']/following-sibling::div[1]");

// Print only the table(s) that belong to the "Results" heading.
var nodes = doc.DocumentNode.SelectNodes("//h2");

// SelectNodes returns null (not an empty collection) when nothing matches.
var o = nodes == null
    ? null
    : nodes.FirstOrDefault(h => h.InnerText.Equals("Results"));

if (o != null)
{
    // The table is not a descendant of the <h2>; it lives in the <div> that
    // follows it. A relative path (no leading "//") keeps the search scoped to
    // that first following <div> instead of the whole document.
    var tables = o.SelectNodes("following-sibling::div[1]//table");
    if (tables != null)
    {
        foreach (var c in tables)
        {
            Console.WriteLine(c.InnerText);
        }
    }
}

Related

HtmlAgilityPack filtering HTML based on a query

I have a block of two HTML elements which look like this:
<div class="a-row">
<a class="a-size-small a-link-normal a-text-normal" href="/Chemical-Guys-CWS-107-Extreme-Synthetic/dp/B003U4P3U0/ref=sr_1_1_sns?s=automotive&ie=UTF8&qid=1504525216&sr=1-1">
<span aria-label="$19.51" class="a-color-base sx-zero-spacing">
<span class="sx-price sx-price-large">
<sup class="sx-price-currency">$</sup>
<span class="sx-price-whole">19</span>
<sup class="sx-price-fractional">51</sup>
</span>
</span>
<span class="a-letter-space"></span>Subscribe & Save
</a>
</div>
And next block of HTML:
<div class="a-row a-spacing-none">
<a class="a-link-normal a-text-normal" href="https://rads.stackoverflow.com/amzn/click/com/B003U4P3U0" rel="nofollow noreferrer">
<span aria-label="$22.95" class="a-color-base sx-zero-spacing">
<span class="sx-price sx-price-large">
<sup class="sx-price-currency">$</sup>
<span class="sx-price-whole">22</span>
<sup class="sx-price-fractional">95</sup>
</span>
</span>
</a>
<span class="a-letter-space"></span>
<i class="a-icon a-icon-prime a-icon-small s-align-text-bottom" aria-label="Prime">
<span class="a-icon-alt">Prime</span>
</i>
</div>
Both of these blocks are quite similar in structure, but the trick is that I want to extract the price from the block that also contains, next to the link, an <i> element with the attribute aria-label="Prime".
This is how I currently extract the price but it's not good:
// Takes the first matching price span in the whole document (position 0),
// regardless of whether it belongs to the "Prime" block.
var span = htmlDoc.DocumentNode.SelectSingleNode("//span[@class='a-color-base sx-zero-spacing']");
if (span != null)
{
    price = span.Attributes["aria-label"].Value;
}
This basically selects HTML element at position 0, since there are more than one element. But the trick here is that I would like to select that span element which contains the prime value , just like the 2nd piece of HTML I've shown...
In case the 2nd element with such values doesn't exists I would just simply use this first method I wrote up there...
Can someone help me out with this ? =)
I've also tried something like this:
// NOTE: the inner queries start with "//", so they are evaluated from the
// document root rather than from x; every <a> therefore sees the same first
// <i> and the same first <span>.
var pr = htmlDoc.DocumentNode.SelectNodes("//a[@class='a-link-normal a-text-normal']")
    .Where(x => x.SelectSingleNode("//i[@class='a-icon a-icon-prime a-icon-small s-align-text-bottom']") != null)
    .Select(x => x.SelectSingleNode("//span[@class='a-color-base sx-zero-spacing']").Attributes["aria-label"].Value);
But it's still returning first element xD
New version guys:
var pr = htmlDoc.DocumentNode.SelectNodes("//a[@class='a-link-normal a-text-normal']");
string prrrrrr = "";
for (int i = 0; i < pr.Count; i++)
{
    // NOTE: both inner queries start with "//", so they search the whole
    // document rather than the i-th <a>; the result is then indexed by i.
    if (pr.ElementAt(i).SelectNodes("//i[@class='a-icon a-icon-prime a-icon-small s-align-text-bottom']").ElementAt(i) != null)
    {
        prrrrrr = pr.ElementAt(i).SelectNodes("//span[@class='a-color-base sx-zero-spacing']").ElementAt(i).Attributes["aria-label"].Value;
    }
}
So the idea is that I take out all "a" elements from the HTML file and create a HTML Node collection of a's, and then loop through them and see which one indeed contains the element that I'm looking for and then match it...?
The problem here is that this if statement always passes:
if (pr.ElementAt(i).SelectNodes("//i[@class='a-icon a-icon-prime a-icon-small s-align-text-bottom']").ElementAt(i) != null)
How can I loop through each individual element in node collection ?
I think you should start at the div level, with the class a-row. Then loop and check whether each div contains an <i> element whose aria-label attribute equals 'Prime'. Finally, get the span with the a-color-base sx-zero-spacing class and read the value of its aria-label attribute, like this:
// Every <div> whose class attribute starts with "a-row" (covers both
// "a-row" and "a-row a-spacing-none").
HtmlNodeCollection nodes = htmlDoc.DocumentNode.SelectNodes("//div[starts-with(@class,'a-row')]");

// SelectNodes returns null (not an empty collection) when nothing matches.
if (nodes != null)
{
    foreach (HtmlNode node in nodes)
    {
        // Relative path without a leading slash: only a direct <i> child of
        // this div that is marked aria-label="Prime".
        HtmlNode i = node.SelectSingleNode("i[@aria-label='Prime']");
        if (i == null)
        {
            continue;
        }

        // ".//" keeps the descendant search inside the current div.
        HtmlNode span = node.SelectSingleNode(".//span[@class='a-color-base sx-zero-spacing']");
        if (span != null)
        {
            // GetAttributeValue avoids a NullReferenceException if the
            // attribute is missing; use currentValue from here on.
            string currentValue = span.GetAttributeValue("aria-label", string.Empty);
        }
    }
}

html agility pack getting same output twice c#

<div class="header">
<span id="content">test1</span>
</div>
<div class="header">
<span id="content">test2</span>
</div>
var web = new HtmlWeb();
var doc = web.Load(url);
var value = doc.DocumentNode.SelectNodes("//div[@class='header']");
foreach (var v in value)
{
    // NOTE: the leading "//" makes this query start from the document root
    // instead of from v, so every iteration finds the same first span.
    var name = v.SelectSingleNode("//span[@id='content']");
    Console.WriteLine(name.OuterHtml);
}
the code above gives me as output twice <span id="content">test1</span>instead of <span id="content">test2</span> as second output. So it gets the correct number of nodes but not the correct output.
Starting an XPath expression with // or / makes the query run from the document root, even when you call it on the current node.
Please see my fix in your code.
var value = doc.DocumentNode.SelectNodes("//div[@class='header']");

// SelectNodes returns null (not an empty collection) when nothing matches.
if (value != null)
{
    foreach (var v in value)
    {
        // No leading slash: the path is relative to v, so only the span that
        // is a direct child of this particular div is returned.
        var name = v.SelectSingleNode("span[@id='content']");
        if (name != null)
        {
            Console.WriteLine(name.OuterHtml);
        }
    }
}
See this fiddle. https://dotnetfiddle.net/nih2lw
A side note, id attribute should always be unique in the document. Use class instead.

Html Agility Pack parsing table into object

So I have HTML like this:
<tr class="row1">
<td class="id">123</td>
<td class="date">2014-08-08</td>
<td class="time">12:31:25</td>
<td class="notes">something here</td>
</tr>
<tr class="row0">
<td class="id">432</td>
<td class="date">2015-02-09</td>
<td class="time">12:22:21</td>
<td class="notes">something here</td>
</tr>
And it continues like that for each customer row. I want to parse contents of each table row to an object. I've tried few methods but I can't seem to get it work right.
This is what I have currently
HtmlDocument doc = new HtmlDocument();
doc.LoadHtml(html);
foreach (HtmlNode row in doc.DocumentNode.SelectNodes("//table[@id='customerlist']//tr"))
{
    Customer cust = new Customer();
    // NOTE: "//td" is evaluated from the document root, so this inner loop
    // visits every <td> in the document for each row, not just this row's cells.
    foreach (HtmlNode info in row.SelectNodes("//td"))
    {
        if (info.GetAttributeValue("class", String.Empty) == "id")
        {
            cust.ID = info.InnerText;
        }
        if (info.GetAttributeValue("class", String.Empty) == "date")
        {
            cust.DateAdded = info.InnerText;
        }
        if (info.GetAttributeValue("class", String.Empty) == "time")
        {
            cust.TimeAdded = info.InnerText;
        }
        if (info.GetAttributeValue("class", String.Empty) == "notes")
        {
            cust.Notes = info.InnerText;
        }
    }
    Console.WriteLine(cust.ID + " " + cust.TimeAdded + " " + cust.DateAdded + " " + cust.Notes);
}
It works to the point that it prints info of the last row of the table on each loop. I'm just missing something very simple but cannot see what.
Also is my way of creating the object fine, or should I use a constructor and create the object from variables? E.g.
string Notes = String.Empty;
if (info.GetAttributeValue("class", String.Empty) == "notes")
{
Notes = info.InnerText;
}
..
Customer cust = new Customer(id, other_variables, Notes, etc);
Your XPath query is wrong. You need to use td instead of //td:
foreach (HtmlNode info in row.SelectNodes("td"))
Passing //td to SelectNodes() will match all <td> elements in the document, hence your inner loop runs 8 times instead of 4 times, and the last 4 times always overrides the values previously set in your Customer object.
See XPath Examples

Taking different Table of Elements with HtmlAgilityPack

I have this loop structure several times.
Table 1
<table>
<tbody>
<tr>
<th>titulo</th>
</tr>
</tbody>
</table>
Table 2
<table>
<tbody>
<tr>
<th>Texto</th>
<th>Texto</th>
<th>Texto</th>
<th>Texto</th>
</tr>
</tbody>
</table>
This pattern is repeated several times.
How do I load them into an array or a list so that I can get the values of each?
Short Demo using a Console App:
/// <summary>
/// Console demo: loads "Demo.html", groups the header cells of every table,
/// and prints each table's header texts followed by a separator line.
/// </summary>
class Program
{
    /// <summary>
    /// Entry point. Reads the document from disk and writes the header text
    /// of each table to the console.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        HtmlDocument doc = new HtmlDocument();
        doc.Load("Demo.html");

        // SelectNodes returns null (not an empty collection) when nothing
        // matches, so fall back to an empty sequence before using LINQ.
        var tables = doc.DocumentNode.SelectNodes("//table") ?? Enumerable.Empty<HtmlNode>();

        var result = tables.Select(table => new //create anonymous type
        {
            Table = table,
            // the th subnodes; a table without header cells yields an empty list
            HeaderNodes = (table.SelectNodes("./tbody/tr/th") ?? Enumerable.Empty<HtmlNode>()).ToList()
        });

        foreach (var table in result)
        {
            foreach (HtmlNode headerNode in table.HeaderNodes)
            {
                Console.WriteLine( headerNode.InnerText);
            }
            Console.WriteLine("--------------------------");
        }
    }
}
Output:
titulo
--------------------------
Texto
Texto
Texto
Texto
--------------------------

Splitting HTML string into two parts with HtmlAgilityPack

I'm looking for the best way to split an HTML document over some tag in C# using HtmlAgilityPack. I want to preserve the intended markup as I'm doing the split. Here is an example.
If the document is like this:
<p>
<div>
<p>
Stuff
</p>
<p>
<ul>
<li>Bullet 1</li>
<li><a href="#">link</a></li>
<li>Bullet 3</li>
</ul>
</p>
<span>Footer</span>
</div>
</p>
Once it's split, it should look like this:
Part 1
<p>
<div>
<p>
Stuff
</p>
<p>
<ul>
<li>Bullet 1</li>
</ul>
</p>
</div>
</p>
Part 2
<p>
<div>
<p>
<ul>
<li>Bullet 3</li>
</ul>
</p>
<span>Footer</span>
</div>
</p>
What would be the best way of doing something like that?
Definitely not by regex. (Note: this was originally a tag on the question—now removed.) I'm usually not one to jump on The Pony is Coming bandwagon, but this is one case in which regular expressions would be particularly bad.
First, I would write a recursive function that removes all siblings of a node that follow that node—call it RemoveSiblingsAfter(node)—and then calls itself on its parent, so that all siblings following the parent are removed as well (and all siblings following the grandparent, and so on). You can use an XPath to find the node(s) on which you want to split, e.g. doc.DocumentNode.SelectNodes("//a[@href='#']"), and call the function on that node. When done, you'd remove the splitting node itself, and that's it. You'd repeat these steps for a copy of the original document, except you'd implement RemoveSiblingsBefore(node) to remove siblings that precede a node.
In your example, RemoveSiblingsBefore would act as follows:
<a href="#"> has no siblings, so recurse on parent, <li>.
<li> has a preceding sibling—<li>Bullet 1</li>—so remove, and recurse on parent, <ul>.
<ul> has no siblings, so recurse on parent, <p>.
<p> has a preceding sibling—<p>Stuff</p>—so remove, and recurse on parent, <div>.
and so on.
Here is what I came up with. This does the split and removes the "empty" elements of the element where the split happens.
/// <summary>
/// Loads "HtmlDoc.html" and splits it around the first anchor that has an
/// href attribute, producing the markup before and after that anchor.
/// </summary>
private static void SplitDocument()
{
    var doc = new HtmlDocument();
    doc.Load("HtmlDoc.html");

    // "@href" selects anchors that carry an href attribute.
    var links = doc.DocumentNode.SelectNodes("//a[@href]");

    // SelectNodes returns null when nothing matches; without a split point
    // there is nothing to do.
    if (links == null)
    {
        return;
    }

    var firstPart = GetFirstPart(doc.DocumentNode, links[0]).DocumentNode.InnerHtml;
    var secondPart = GetSecondPart(links[0]).DocumentNode.InnerHtml;
}
/// <summary>
/// Copies the tree rooted at <paramref name="currNode"/> into a new document
/// in document order, stopping when <paramref name="link"/> is reached. The
/// copy of the link, together with any ancestors that contain nothing else,
/// is removed from the result.
/// </summary>
/// <param name="currNode">Root of the tree to copy.</param>
/// <param name="link">Node at which copying stops.</param>
/// <returns>The new document holding the copied nodes.</returns>
private static HtmlDocument GetFirstPart(HtmlNode currNode, HtmlNode link)
{
    var newDoc = new HtmlDocument();

    // Each entry pairs a source node with the already-copied parent that its
    // shallow clone must be attached to.
    var pending = new Stack<Tuple<HtmlNode, HtmlNode>>();
    pending.Push(Tuple.Create(currNode, newDoc.DocumentNode));

    while (pending.Count > 0)
    {
        var entry = pending.Pop();
        HtmlNode source = entry.Item1;
        HtmlNode targetParent = entry.Item2;

        HtmlNode copy = source.CloneNode(false);
        targetParent.AppendChild(copy);

        if (source == link)
        {
            // Reached the split point: drop its copy (and ancestors that hold
            // only it), then stop copying.
            var nodeToRemove = NodeAndEmptyAncestors(copy);
            nodeToRemove.ParentNode.RemoveChild(nodeToRemove);
            break;
        }

        // Push children last-to-first so they are popped first-to-last.
        var index = source.ChildNodes.Count;
        while (index-- > 0)
        {
            pending.Push(Tuple.Create(source.ChildNodes[index], copy));
        }
    }

    return newDoc;
}
/// <summary>
/// Builds a new document that contains shallow copies of the link's ancestors
/// and, at each ancestor level, copies of the siblings that come after the
/// link (or after the ancestor on the path to the link). The copy of the link
/// itself, plus any ancestors left holding only it, is removed at the end.
/// </summary>
/// <param name="link">Node at which the original document is split.</param>
/// <returns>The new document holding the content that follows the link.</returns>
private static HtmlDocument GetSecondPart(HtmlNode link)
{
var nodeStack = new Stack<HtmlNode>();
var newDoc = new HtmlDocument();
var currNode = link;
// Collect shallow clones of every ancestor, nearest parent first; the stack
// therefore yields the outermost ancestor first when popped.
while (currNode.ParentNode != null)
{
currNode = currNode.ParentNode;
nodeStack.Push(currNode.CloneNode(false));
}
// Rebuild the ancestor chain in the new document, outermost to innermost.
var parent = newDoc.DocumentNode;
while (nodeStack.Count > 0)
{
var node = nodeStack.Pop();
parent.AppendChild(node);
parent = node;
}
// Attach a shallow copy of the link as a placeholder under the innermost ancestor.
var newLink = link.CloneNode(false);
parent.AppendChild(newLink);
// Walk up the original tree and the copied chain in lockstep: currNode moves
// through the originals, newParent through the corresponding copies.
currNode = link;
var newParent = newLink.ParentNode;
while (currNode.ParentNode != null)
{
var foundNode = false;
foreach (var child in currNode.ParentNode.ChildNodes)
{
// Only children that come after currNode are copied; the flag is set after
// the append check so currNode itself is skipped.
// NOTE(review): Clone() presumably makes a deep copy of the sibling — confirm.
if (foundNode) newParent.AppendChild(child.Clone());
if (child == currNode) foundNode = true;
}
currNode = currNode.ParentNode;
newParent = newParent.ParentNode;
}
// Remove the placeholder link and any ancestors that contain nothing but it.
var nodeToRemove = NodeAndEmptyAncestors(newLink);
nodeToRemove.ParentNode.RemoveChild(nodeToRemove);
return newDoc;
}
/// <summary>
/// Climbs from <paramref name="node"/> while the current node is the only
/// child of its parent, and returns the highest node reached.
/// </summary>
/// <param name="node">Node to start from.</param>
/// <returns>
/// The starting node, or the topmost ancestor reached through a chain of
/// single-child parents.
/// </returns>
private static HtmlNode NodeAndEmptyAncestors(HtmlNode node)
{
    var topmost = node;
    for (var parent = topmost.ParentNode;
         parent != null && parent.ChildNodes.Count == 1;
         parent = topmost.ParentNode)
    {
        topmost = parent;
    }
    return topmost;
}

Categories