How can I convert an HtmlAttribute to a string?

How can I convert an HtmlAttribute to a string? - c#

var web = new HtmlWeb();
var doc = web.Load(page);
var Articles = doc.DocumentNode.SelectNodes("//*[#class = 'b-product-grid-tile js-tile-container']");
var href = doc.DocumentNode.SelectNodes("//a[#href]");
foreach (HtmlNode link in href)
{
HtmlAttribute att = link.Attributes["href"];
_entries.Add(new EntryModel { Link = att });
// att.ToString(); <----- Want to convert the HtmlAttribute to a string.
}
Full Code of my Scraper:
EntryModel List:
Main Window:
The links I need:

The easiest would be using the System.Web.Mvc.TagBuilder class:
var attributes = new { #class = "myClass", id = "elId" };
var tag = new TagBuilder("href");
tag.MergeAttributes(new RouteValueDictionary(attributes));
return tag.ToString();

Related

I wan to extract all the links inside the div tag with class span12 and <p>Automotive Brands</p>

HtmlWeb web = new HtmlWeb();
ServicePointManager.SecurityProtocol = SecurityProtocolType.Tls12;
HtmlDocument doc = web.Load("https://www.fcagroup.com/en-US/group/regions/Pages/northamerica.aspx");
var foundAppropriateMetaTag = false;
var divs = doc.DocumentNode.SelectNodes("//div[contains(#class,'span12')]");
var linksOnPage = from lnks in divs.Descendants()
where lnks.Name == "a" &&
lnks.Attributes["href"] != null &&
lnks.InnerText.Trim().Length > 0
select new
{
Url = lnks.Attributes["href"].Value,
Text = lnks.InnerText,
};
I have tried above but it is extracting all the links from all the divs with class span12 but I only want links inside div that contains Automotive Brands PTag. Help me to achieve that.

Below
var divs = doc.DocumentNode.SelectNodes("//div[contains(#class,'span12')]");
you can add the following code.
var autoNodes = new List<HtmlNode>();
foreach (var div in divs)
{
if (div.ChildNodes.Any(c => c.InnerText.Contains("Automotive Brands")))
{
autoNodes.Add(div);
}
}
var links = new List<KeyValuePair<string, string>>();
foreach (var node in autoNodes)
{
var nodeLinks = node.Descendants().Where(c => c.Name.Equals("a")
&& c.Attributes["href"].Value.Contains("brands")
&& !string.IsNullOrEmpty(c.InnerText.Trim()));
links.AddRange(nodeLinks.Select(nl =>
new KeyValuePair<string, string>(nl.Attributes["href"].Value, nl.InnerText)));
}
This is the readable version.
You can transform it to
var autoNodes = divs.Where(div => div.ChildNodes.Any(c => c.InnerText.Contains("Automotive Brands"))).ToList();
if you like.

Html Agility Pack get contents from table

I need to get the location, address, and phone number from "http://anytimefitness.com/find-gym/list/AL" So far I have this...
HtmlDocument htmlDoc = new HtmlDocument();
htmlDoc.OptionFixNestedTags = true;
htmlDoc.LoadHtml(stateURLs[0].ToString());
var BlankNode =
htmlDoc.DocumentNode.SelectNodes("/div[#class='segmentwhite']/table[#style='width: 100%;']//tr[#class='']");
var GrayNode =
htmlDoc.DocumentNode.SelectNodes("/div[#class='segmentwhite']/table[#style='width: 100%;']//tr[#class='gray_bk']");
I have looked around stackoverflow for a while but none of the present post regarding htmlagilitypack has really helped. I have also have been using http://www.w3schools.com/xpath/xpath_syntax.asp

Since <div> you're after is not direct child of root node, you need to use // instead of /. Then you can combine XPath for BlankNode and GrayNode using or operator, for example :
var htmlweb = new HtmlWeb();
HtmlDocument htmlDoc = htmlweb.Load("http://anytimefitness.com/find-gym/list/AL");
htmlDoc.OptionFixNestedTags = true;
var AllNode =
htmlDoc.DocumentNode.SelectNodes("//div[#class='segmentwhite']/table//tr[#class='' or #class='gray_bk']");
foreach (HtmlNode node in AllNode)
{
var location = node.SelectSingleNode("./td[2]").InnerText;
var address = node.SelectSingleNode("./td[3]").InnerText;
var phone = node.SelectSingleNode("./td[4]").InnerText;
//do something with above informations
}

Here's an example I tested in LinqPad.
string url = #"http://anytimefitness.com/find-gym/list/AL";
var client = new System.Net.WebClient();
var data = client.DownloadData(url);
var html = Encoding.UTF8.GetString(data);
var htmlDoc = new HtmlAgilityPack.HtmlDocument();
htmlDoc.OptionFixNestedTags = true;
htmlDoc.LoadHtml(html);
var gyms = htmlDoc.DocumentNode.SelectNodes("//tbody/tr[#class='' or #class='gray_bk']");
foreach (var gym in gyms) {
var city = gym.SelectSingleNode("./td[2]").InnerText;
var address = gym.SelectSingleNode("./td[3]").InnerText;
var phone = gym.SelectSingleNode("./td[4]").InnerText;
}
Since the HtmlAgilityPack also supports Linq, you could also do something like:
string [] classes = {"", "gray_bk"};
var gyms = htmlDoc
.DocumentNode
.Descendants("tr")
.Where(t => classes.Contains(t.Attributes["class"].Value))
.ToList();
gyms.ForEach(gym => {
var city = gym.SelectSingleNode("./td[2]").InnerText;
var address = gym.SelectSingleNode("./td[3]").InnerText;
var phone = gym.SelectSingleNode("./td[4]").InnerText;
});

OptionOutputOriginalCase not working in HtmlAgilityPack

I am trying to replace some text using HtmlAgilityPack in Html string and placing ASP.net user controls but I am getting lower case in output html. Any Idea how to get original case output.
Code :
public static string ConvertPageTitlesToCMSTitle(string htmlstring, string themeSlug)
{
var htmlDoc = new HtmlAgilityPack.HtmlDocument()
{
OptionOutputOriginalCase = true,
OptionWriteEmptyNodes = true
};
htmlDoc.LoadHtml(htmlstring);
var stPageTitleTags = htmlDoc.DocumentNode.SelectNodes("//stpagetitle");
foreach (var stPageTitleTag in stPageTitleTags)
{
var pageTitle = Strings.StripHTML(stPageTitleTag.InnerText);
pageTitle = pageTitle.Trim();
var pageId = CreateUpdateContentPageInDb(pageTitle, themeSlug, null, null);
var widgetControl = string.Format("<widget:PageTitleDisplay runat=\"server\" PageId=\"{0}\" Editable=\"True\" />", pageId);
htmlDoc.DocumentNode.InnerHtml = htmlDoc.DocumentNode.InnerHtml.Replace(stPageTitleTag.OuterHtml, widgetControl);
}
return htmlDoc.DocumentNode.OuterHtml;
}

As a workaround you could create a text node instead of HTML node. See:
foreach (var stPageTitleTag in stPageTitleTags)
{
var pageTitle = Strings.StripHTML(stPageTitleTag.InnerText);
pageTitle = pageTitle.Trim();
var pageId = CreateUpdateContentPageInDb(pageTitle, themeSlug, null, null);
var widgetControl = string.Format("<widget:PageTitleDisplay runat=\"server\" PageId=\"{0}\" Editable=\"True\" />", pageId);
// creating a text node
var widget = htmlDoc.CreateTextNode(widgetControl);
// replacing <sppagetitle> node with the new one
stPageTitleTag.ReplaceChild(widget, stPageTitleTag);
}
This should get the output you want.

extracting values of text from html source file

in this code var TempTxt holds An Html Body Content
as string
how can i extract element <table> or <td> inner text/ html using lambada syntax ?
public string ExtractPageValue(IWebDriver DDriver, string url="")
{
if(string.IsNullOrEmpty(url))
url = #"http://www.boi.org.il/he/Markets/ExchangeRates/Pages/Default.aspx";
var service = InternetExplorerDriverService.CreateDefaultService(directory);
service.LogFile = directory + #"\seleniumlog.txt";
service.LoggingLevel = InternetExplorerDriverLogLevel.Trace;
var options = new InternetExplorerOptions();
options.IntroduceInstabilityByIgnoringProtectedModeSettings = true;
DDriver = new InternetExplorerDriver(service, options, TimeSpan.FromSeconds(60));
DDriver.Navigate().GoToUrl(url);
var TempTxt = DDriver.PageSource;
return "";//Math.Round(Convert.ToDouble( TempTxt.Split(' ')[10]),2).ToString();
}

If you are open to try HtmlAgilityPack
HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
doc.LoadHtml(html);
var table = doc.DocumentNode.SelectNodes("//table/tr")
.Select(tr => tr.Elements("td").Select(td => td.InnerText).ToList())
.ToList();

how to get xml nth element name in wpf

public partial class XML_3 : Window
{
public XML_3()
{
this.InitializeComponent();
XmlDocument doc = new XmlDocument();
doc.Load("D:/sample.xml");
XmlNodeList student_list = doc.GetElementsByTagName("Student");
foreach (XmlNode node in student_list)
{
XmlElement student = (XmlElement)node;
int element_count = student.ChildNodes.Count;
}
}
}
In above code.I can get the count of element except root element(Student). now the count is 3.
But i have to get 2ed element name(Kavi),it's attribute element name(ID) and it's child element name(FName,MName).
what should i do to get those stuff.
Please help me...

Use XDocument (why?):
var doc = XDocument.Parse(xml); // OR Load(...)
var nodeCount = doc.Elements().Count();
var secondNode = doc.Elements().Skip(1).First();
var studentName = secondNode.Name;
var studentId = secondNode.Attribute("ID").Value;
or (for your code):
var secondNode = student.ChildNodes[1] as XmlElement;
var studentName = secondNode.LocalName;
var studentId = secondNode.Attributes["ID"];
Added:
var secondNode = student.ChildNodes[1];
var fName =
secondNode.ChildNodes.Cast<XmlElement>().FirstOrDefault(x => x.LocalName == "FName").InnerText;
var mName =
secondNode.ChildNodes.Cast<XmlElement>().FirstOrDefault(x => x.LocalName == "MName").InnerText;
var studentId = secondNode.Attributes["ID"].Value;

We Keep Coding

C# (C-Sharp) is a programming language developed by Microsoft that runs on the .NET Framework.

How can I convert an HtmlAttribute to a string? - c#

The easiest would be using the System.Web.Mvc.TagBuilder class: var attributes = new { #class = "myClass", id = "elId" }; var tag = new TagBuilder("href"); tag.MergeAttributes(new RouteValueDictionary(attributes)); return tag.ToString();

Related

I wan to extract all the links inside the div tag with class span12 and <p>Automotive Brands</p>

Html Agility Pack get contents from table

OptionOutputOriginalCase not working in HtmlAgilityPack

extracting values of text from html source file

how to get xml nth element name in wpf

Categories

Resources