I have an HTML document that contains multiple divs.
Example:
<div class="element">
<div class="title">
<a href="127.0.0.1" title="Test">Test</a>
</div>
</div>
Now I'm using this code to extract the title element.
// Collect the value of every "title" attribute found anywhere in the document.
List<string> items = new List<string>();
// XPath addresses attributes with '@': "//*[@title]" matches any element that has a title attribute.
var nodes = Web.DocumentNode.SelectNodes("//*[@title]");
// SelectNodes returns null (not an empty collection) when nothing matches.
if (nodes != null)
{
    foreach (var node in nodes)
    {
        foreach (var attribute in node.Attributes)
        {
            if (attribute.Name == "title")
            {
                items.Add(attribute.Value);
            }
        }
    }
}
I don't know how to adapt my code to extract the href and the title element
at the same time.
Each div should be an object with the included a tags as properties.
/// <summary>
/// Simple data holder pairing a display text with a link target.
/// </summary>
public class CheckBoxListItem
{
    /// <summary>Gets or sets the text of the item.</summary>
    public string Text { get; set; }

    /// <summary>Gets or sets the link target of the item.</summary>
    public string Href { get; set; }
}
You can use the following xpath query to retrieve only a tags with a title and href :
//a[@title and @href]
Then you can use your code like this:
// Build one CheckBoxListItem per <a> element that carries both a title and an href.
List<CheckBoxListItem> items = new List<CheckBoxListItem>();
// '@' is the XPath attribute prefix; the predicate keeps only anchors having both attributes.
var nodes = Web.DocumentNode.SelectNodes("//a[@title and @href]");
// SelectNodes returns null (not an empty collection) when nothing matches.
if (nodes != null)
{
    foreach (var node in nodes)
    {
        // Both attributes are guaranteed to exist by the XPath predicate above.
        items.Add(new CheckBoxListItem()
        {
            Text = node.Attributes["title"].Value,
            Href = node.Attributes["href"].Value
        });
    }
}
I very often use ScrapySharp's package together with HtmlAgilityPack for css selection.
(add a using statement for ScrapySharp.Extensions so you can use the CssSelect method).
using HtmlAgilityPack;
using ScrapySharp.Extensions;
In your case, I would do:
// Load the document from the given address and parse it.
HtmlWeb w = new HtmlWeb();
var htmlDoc = w.Load("myUrl");

// Every element matching the CSS selector ".title".
var titles = htmlDoc.DocumentNode.CssSelect(".title");
foreach (var title in titles)
{
    // First <a> beneath the current element, or null when there is none.
    var anchor = title.CssSelect("a").FirstOrDefault();

    // href stays empty when the element contains no anchor.
    string href = anchor == null
        ? string.Empty
        : anchor.GetAttributeValue("href");
}
Related
I am using HTMLAgilityPack and I'm trying to scrape the link http://www.hundsun.co.jp/ which is under the data-preconnect-urls. How can I get that?
<h3>
<a style="display:none" href="/aclk?sa=L&ai=DChcSEwimnPnc5OvQAhWRl70KHcxqCEAYABAA&ei=9hZNWLqlCIyY8gXA04vACg&sig=AOD64_3SZuXd57_-qOs8nnhn8rqw8GlIgw&q=&sqi=2&ved=0ahUKEwi6-PTc5OvQAhUMjLwKHcDpAqgQ0QwIGA&adurl=" id="s0p1c0"></a>
ブリッジSE募集中 - hundsun.co.jp
You can do it like this:
using System;
using HtmlAgilityPack;
using System.Xml;
/// <summary>
/// Demo program: parses a hard-coded HTML string and prints the value of the
/// data-preconnect-urls attribute of every anchor that has one.
/// </summary>
public class Program
{
    public static void Main()
    {
        // NOTE(review): this sample markup does not contain a data-preconnect-urls attribute,
        // so with this exact input the "no links" message below is what gets printed.
        string html = "<html><body><h3><a style=\"display:none\" href=\"/aclk?sa=L&ai=DChcSEwimnPnc5OvQAhWRl70KHcxqCEAYABAA&ei=9hZNWLqlCIyY8gXA04vACg&sig=AOD64_3SZuXd57_-qOs8nnhn8rqw8GlIgw&q=&sqi=2&ved=0ahUKEwi6-PTc5OvQAhUMjLwKHcDpAqgQ0QwIGA&adurl=\" id=\"s0p1c0\"></a>ブリッジSE募集中 - hundsun.co.jp</h3></body></html>";
        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(html);

        // '@' is the XPath attribute prefix: select anchors that carry the attribute.
        var links = doc.DocumentNode.SelectNodes("//a[@data-preconnect-urls]");
        // SelectNodes returns null when nothing matches.
        if (links == null)
        {
            Console.WriteLine("no links contain attribute data-preconnect-urls");
            return;
        }

        foreach (var htmlNode in links)
        {
            // The attribute is guaranteed to exist by the XPath predicate above.
            var attr = htmlNode.Attributes["data-preconnect-urls"];
            Console.WriteLine(attr.Value);
        }
    }
}
you can try it out here:
https://dotnetfiddle.net/gMTFV3
I have a file that contains text. I need to search it for a string and extract the href on that line.
file.txt is the file; it contains a basic WordPress homepage.
finally I want the http://example.com like link.
I tried several ways like
// Build the date-suffixed search word, e.g. "candy-22-8-2014".
DateTime dateTime = DateTime.UtcNow.Date;
string stringpart = dateTime.ToString("-dd-M-yyyy");
string finalword = "candy" + stringpart;
List<List<string>> groups = new List<List<string>>();
List<string> current = null;
// '@' makes this a verbatim string literal.
foreach (var line in File.ReadAllLines(@"E:/file.txt"))
{
    // A line containing the date marker opens a new group...
    if (line.Contains("-22-8-2014") && current == null)
    {
        current = new List<string>();
    }
    // ...and a later line containing "candy" closes it (that closing line is not stored).
    else if (line.Contains("candy") && current != null)
    {
        groups.Add(current);
        current = null;
    }

    // While a group is open, every line (including the opening one) is collected.
    if (current != null)
    {
        current.Add(line);
    }
}

// Print the collected lines; writing the list object itself would only print its type name.
foreach (List<string> group in groups)
{
    foreach (string groupLine in group)
    {
        Console.WriteLine(groupLine);
    }
}
Console.ReadLine();
}
To do this correctly, you must parse this html-file. Use something like CSquery, HTML Agility Pack, or SgmlReader.
Solution of your problem with CSQuery:
/// <summary>
/// Returns the href attribute value of every anchor element that has one
/// in the given HTML file.
/// </summary>
/// <param name="htmlFile">Path of the HTML file to parse.</param>
/// <returns>The href values of the matching anchors.</returns>
public IEnumerable<string> ExtractLinks(string htmlFile)
{
    var document = CQ.CreateFromFile(htmlFile);
    // CSS selector: <a> elements that carry an href attribute.
    var anchorsWithHref = document.Select("a[href]");
    return anchorsWithHref.Select(anchor => anchor.GetAttribute("href"));
}
In case you decided to use HtmlAgilityPack, this should be easy :
var doc = new HtmlDocument();
// load your HTML file into the HtmlDocument
doc.Load("path_to_your_html.html");
// select all <a> tags that carry an href attribute ('@' is the XPath attribute prefix)
var links = doc.DocumentNode.SelectNodes("//a[@href]");
// SelectNodes returns null when nothing matches, so guard before iterating
if (links != null)
{
    foreach (HtmlNode link in links)
    {
        // print the value of the href attribute ("" is the fallback if it is missing)
        Console.WriteLine(link.GetAttributeValue("href", ""));
    }
}
I am trying to replace some text in an HTML string using HtmlAgilityPack, inserting ASP.NET user controls, but the output HTML comes out in lower case. Any idea how to keep the original case in the output?
Code :
/// <summary>
/// Replaces every &lt;stpagetitle&gt; element in the given HTML with a
/// PageTitleDisplay widget tag whose PageId comes from CreateUpdateContentPageInDb.
/// </summary>
/// <param name="htmlstring">The HTML markup to process.</param>
/// <param name="themeSlug">Theme identifier passed through to CreateUpdateContentPageInDb.</param>
/// <returns>The serialized document after all replacements.</returns>
public static string ConvertPageTitlesToCMSTitle(string htmlstring, string themeSlug)
{
    var htmlDoc = new HtmlAgilityPack.HtmlDocument()
    {
        OptionOutputOriginalCase = true,
        OptionWriteEmptyNodes = true
    };
    htmlDoc.LoadHtml(htmlstring);

    var stPageTitleTags = htmlDoc.DocumentNode.SelectNodes("//stpagetitle");
    // SelectNodes returns null when the markup has no <stpagetitle>; nothing to convert then.
    if (stPageTitleTags == null)
    {
        return htmlDoc.DocumentNode.OuterHtml;
    }

    foreach (var stPageTitleTag in stPageTitleTags)
    {
        // The tag's text content, stripped of markup and trimmed, is used as the page title.
        var pageTitle = Strings.StripHTML(stPageTitleTag.InnerText);
        pageTitle = pageTitle.Trim();
        var pageId = CreateUpdateContentPageInDb(pageTitle, themeSlug, null, null);
        var widgetControl = string.Format("<widget:PageTitleDisplay runat=\"server\" PageId=\"{0}\" Editable=\"True\" />", pageId);
        // Textual replacement of the tag's markup; assigning InnerHtml makes the document re-parse the result.
        htmlDoc.DocumentNode.InnerHtml = htmlDoc.DocumentNode.InnerHtml.Replace(stPageTitleTag.OuterHtml, widgetControl);
    }

    return htmlDoc.DocumentNode.OuterHtml;
}
As a workaround you could create a text node instead of HTML node. See:
foreach (var stPageTitleTag in stPageTitleTags)
{
    var pageTitle = Strings.StripHTML(stPageTitleTag.InnerText);
    pageTitle = pageTitle.Trim();
    var pageId = CreateUpdateContentPageInDb(pageTitle, themeSlug, null, null);
    var widgetControl = string.Format("<widget:PageTitleDisplay runat=\"server\" PageId=\"{0}\" Editable=\"True\" />", pageId);
    // creating a text node, so the widget markup is written out verbatim (case preserved)
    var widget = htmlDoc.CreateTextNode(widgetControl);
    // replacing the <stpagetitle> node with the new one; ReplaceChild must be called on the
    // parent, because a node is not a child of itself
    stPageTitleTag.ParentNode.ReplaceChild(widget, stPageTitleTag);
}
This should get the output you want.
I've seen a few related questions out here, but they don’t exactly talk about the same problem I am facing.
I want to use the HTML Agility Pack to remove unwanted tags from my HTML without losing the content within the tags.
So for instance, in my scenario, I would like to preserve the tags "b", "i" and "u".
And for an input like:
<p>my paragraph <div>and my <b>div</b></div> are <i>italic</i> and <b>bold</b></p>
The resulting HTML should be:
my paragraph and my <b>div</b> are <i>italic</i> and <b>bold</b>
I tried using HtmlNode's Remove method, but it removes my content too. Any suggestions?
I wrote an algorithm based on Oded's suggestions. Here it is. Works like a charm.
It removes all tags except strong, em, u and raw text nodes.
/// <summary>
/// Removes every element except strong, em and u from the given HTML while
/// keeping the content of the removed elements in place.
/// </summary>
/// <param name="data">The HTML fragment to clean.</param>
/// <returns>
/// The cleaned markup; an empty string for null or empty input; the input unchanged
/// when it contains no top-level element or text node.
/// </returns>
internal static string RemoveUnwantedTags(string data)
{
    if (string.IsNullOrEmpty(data)) return string.Empty;

    var document = new HtmlDocument();
    document.LoadHtml(data);

    var acceptableTags = new String[] { "strong", "em", "u" };

    // SelectNodes returns null when nothing matches; Queue's constructor would throw on null.
    var topLevelNodes = document.DocumentNode.SelectNodes("./*|./text()");
    if (topLevelNodes == null) return data;

    var nodes = new Queue<HtmlNode>(topLevelNodes);
    while (nodes.Count > 0)
    {
        var node = nodes.Dequeue();
        var parentNode = node.ParentNode;

        if (!acceptableTags.Contains(node.Name) && node.Name != "#text")
        {
            var childNodes = node.SelectNodes("./*|./text()");
            if (childNodes != null)
            {
                foreach (var child in childNodes)
                {
                    // Children are queued for inspection and hoisted in front of the
                    // element that is about to be removed, preserving their order.
                    nodes.Enqueue(child);
                    parentNode.InsertBefore(child, node);
                }
            }

            parentNode.RemoveChild(node);
        }
    }

    return document.DocumentNode.InnerHtml;
}
How to recursively remove a given list of unwanted html tags from an html string
I took @mathias's answer and improved his extension method so that you can supply the tags to exclude as a List<string> (e.g. {"a","p","hr"}). I also fixed the logic so that it works properly when applied recursively:
/// <summary>
/// Strips the listed tags from an HTML string while keeping their content in place.
/// </summary>
/// <param name="html">The HTML fragment to clean.</param>
/// <param name="unwantedTags">Names of the tags to remove, compared against each node's name.</param>
/// <returns>
/// The cleaned markup, or the input unchanged when it is null/empty or has no
/// top-level element or text node.
/// </returns>
public static string RemoveUnwantedHtmlTags(this string html, List<string> unwantedTags)
{
    if (String.IsNullOrEmpty(html))
    {
        return html;
    }

    var document = new HtmlDocument();
    document.LoadHtml(html);

    HtmlNodeCollection topLevel = document.DocumentNode.SelectNodes("./*|./text()");
    if (topLevel == null || topLevel.Count == 0)
    {
        return html;
    }

    // Breadth-first walk: every node is visited, whether or not it gets removed.
    var pending = new Queue<HtmlNode>(topLevel);
    while (pending.Count > 0)
    {
        HtmlNode current = pending.Dequeue();
        HtmlNode parent = current.ParentNode;
        HtmlNodeCollection children = current.SelectNodes("./*|./text()");
        bool isUnwanted = unwantedTags.Any(name => name == current.Name);

        if (children != null)
        {
            foreach (HtmlNode child in children)
            {
                pending.Enqueue(child);
                if (isUnwanted)
                {
                    // Hoist the child in front of the element that is about to be removed.
                    parent.InsertBefore(child, current);
                }
            }
        }

        if (isUnwanted)
        {
            parent.RemoveChild(current);
        }
    }

    return document.DocumentNode.InnerHtml;
}
Try the following, you might find it a bit neater than the other proposed solutions:
/// <summary>
/// Removes every node matched by <paramref name="xPath"/> below <paramref name="rootNode"/>,
/// keeping each removed node's children in its place.
/// </summary>
/// <param name="rootNode">Node the XPath expression is evaluated against.</param>
/// <param name="xPath">XPath expression selecting the nodes to remove.</param>
/// <returns>The number of matched nodes; 0 when nothing matched.</returns>
public static int RemoveNodesButKeepChildren(this HtmlNode rootNode, string xPath)
{
    HtmlNodeCollection matches = rootNode.SelectNodes(xPath);
    if (matches == null)
    {
        return 0;
    }

    foreach (HtmlNode match in matches)
    {
        match.RemoveButKeepChildren();
    }

    return matches.Count;
}
/// <summary>
/// Removes <paramref name="node"/> from its parent after inserting each of its
/// children in front of it, so the children take its place.
/// </summary>
/// <param name="node">The node to remove.</param>
public static void RemoveButKeepChildren(this HtmlNode node)
{
    HtmlNode parent = node.ParentNode;
    foreach (HtmlNode child in node.ChildNodes)
    {
        parent.InsertBefore(child, node);
    }

    node.Remove();
}
/// <summary>
/// Runs the sample from the question: removes the div and p elements and
/// checks that the remaining markup equals the expected string.
/// </summary>
/// <returns>True when the resulting markup equals the expected output.</returns>
public static bool TestYourSpecificExample()
{
    const string input = "<p>my paragraph <div>and my <b>div</b></div> are <i>italic</i> and <b>bold</b></p>";
    const string expected = "my paragraph and my <b>div</b> are <i>italic</i> and <b>bold</b>";

    var document = new HtmlDocument();
    document.LoadHtml(input);

    HtmlNode root = document.DocumentNode;
    root.RemoveNodesButKeepChildren("//div");
    root.RemoveNodesButKeepChildren("//p");

    return root.InnerHtml == expected;
}
Before removing a node, get its parent and its InnerText, then remove the node and re-assign the InnerText to the parent.
var parent = node.ParentNode;
// Take the text of the node being removed; the parent's InnerText would also
// contain the text of all siblings and duplicate it.
var innerText = node.InnerText;
// Swap the node for a plain text node in place, so the text keeps its position
// instead of being appended after the remaining siblings.
parent.ReplaceChild(doc.CreateTextNode(innerText), node);
If you do not want to use Html Agility Pack and still want to remove unwanted HTML tags, then you can do it as shown below.
/// <summary>
/// Strips all tags from an HTML string, decodes HTML entities and collapses
/// every run of whitespace into a single space.
/// </summary>
/// <param name="strHtml">The HTML markup to convert to plain text.</param>
/// <returns>The text content with tags removed and whitespace normalized.</returns>
public static string RemoveHtmlTags(string strHtml)
{
    // Non-greedy match of anything between '<' and '>', including newlines inside a tag.
    string strText = Regex.Replace(strHtml, "<(.|\n)*?>", String.Empty);
    strText = HttpUtility.HtmlDecode(strText);
    // Verbatim literal ('@') so that \s reaches the regex engine unescaped.
    strText = Regex.Replace(strText, @"\s+", " ");
    return strText;
}
I want to get the contents of an ordered list from an HTML page using HtmlAgilityPack in C#. I have tried the following code, but it is not working. Can anyone help? I want to pass in HTML text and get the contents of the first ordered list found in it.
/// <summary>
/// Tells whether the given node is an element whose name is "ol" (ignoring case).
/// </summary>
/// <param name="node">The node to inspect.</param>
/// <returns>True for an ol element; false for anything else.</returns>
private bool isOrderedList(HtmlNode node)
{
    return node.NodeType == HtmlNodeType.Element
        && node.Name.ToLower() == "ol";
}
/// <summary>
/// Returns the inner markup of the first ordered list found in the given HTML.
/// </summary>
/// <param name="htmlText">The HTML markup to search.</param>
/// <returns>
/// The content of the first ol element in document order, or an empty string
/// when the input is null/empty or contains no ordered list.
/// </returns>
public string GetOlList(string htmlText)
{
    if (string.IsNullOrEmpty(htmlText))
    {
        return "";
    }

    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(htmlText);

    // Search the parsed tree directly. Re-serializing a child with WriteTo() and
    // re-parsing it yields the same markup again, so that recursion never ends.
    HtmlNode firstList = doc.DocumentNode.SelectSingleNode("//ol");
    if (firstList == null)
    {
        return "";
    }

    return firstList.WriteContentTo();
}
The following code worked for me
/// <summary>
/// Concatenates the outer markup of every ordered list found in the given HTML.
/// </summary>
/// <param name="html">The HTML markup to search.</param>
/// <returns>The markup of all ol elements, or an empty string when there are none.</returns>
public static string GetComments(string html)
{
    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(html);

    // SelectNodes returns null (not an empty collection) when nothing matches.
    HtmlNodeCollection lists = doc.DocumentNode.SelectNodes("//ol");
    if (lists == null)
    {
        return "";
    }

    string s = "";
    foreach (HtmlNode node in lists)
    {
        s += node.OuterHtml;
    }

    return s;
}
How about:
// SelectSingleNode returns an HtmlNode (or null when nothing matches), so no cast is needed.
HtmlNode el = doc.DocumentNode
    .SelectSingleNode("//ol");
if (el != null)
{
    // Markup of the first <ol>, including the <ol> tag itself.
    string s = el.OuterHtml;
}
(untested, from memory)