var web = new HtmlWeb();
var doc = web.Load(page);
var Articles = doc.DocumentNode.SelectNodes("//*[#class = 'b-product-grid-tile js-tile-container']");
var href = doc.DocumentNode.SelectNodes("//a[#href]");
foreach (HtmlNode link in href)
{
HtmlAttribute att = link.Attributes["href"];
_entries.Add(new EntryModel { Link = att });
// att.ToString(); <----- Want to convert the HtmlAttribute to a string.
}
Full Code of my Scraper:
EntryModel List:
Main Window:
The links I need:
The easiest would be using the System.Web.Mvc.TagBuilder class:
var attributes = new { #class = "myClass", id = "elId" };
var tag = new TagBuilder("href");
tag.MergeAttributes(new RouteValueDictionary(attributes));
return tag.ToString();
Related
HtmlWeb web = new HtmlWeb();
ServicePointManager.SecurityProtocol = SecurityProtocolType.Tls12;
HtmlDocument doc = web.Load("https://www.fcagroup.com/en-US/group/regions/Pages/northamerica.aspx");
var foundAppropriateMetaTag = false;
var divs = doc.DocumentNode.SelectNodes("//div[contains(#class,'span12')]");
var linksOnPage = from lnks in divs.Descendants()
where lnks.Name == "a" &&
lnks.Attributes["href"] != null &&
lnks.InnerText.Trim().Length > 0
select new
{
Url = lnks.Attributes["href"].Value,
Text = lnks.InnerText,
};
I have tried above but it is extracting all the links from all the divs with class span12 but I only want links inside div that contains Automotive Brands PTag. Help me to achieve that.
Below
var divs = doc.DocumentNode.SelectNodes("//div[contains(#class,'span12')]");
you can add the following code.
var autoNodes = new List<HtmlNode>();
foreach (var div in divs)
{
if (div.ChildNodes.Any(c => c.InnerText.Contains("Automotive Brands")))
{
autoNodes.Add(div);
}
}
var links = new List<KeyValuePair<string, string>>();
foreach (var node in autoNodes)
{
var nodeLinks = node.Descendants().Where(c => c.Name.Equals("a")
&& c.Attributes["href"].Value.Contains("brands")
&& !string.IsNullOrEmpty(c.InnerText.Trim()));
links.AddRange(nodeLinks.Select(nl =>
new KeyValuePair<string, string>(nl.Attributes["href"].Value, nl.InnerText)));
}
This is the readable version.
You can transform it to
var autoNodes = divs.Where(div => div.ChildNodes.Any(c => c.InnerText.Contains("Automotive Brands"))).ToList();
if you like.
I need to get the location, address, and phone number from "http://anytimefitness.com/find-gym/list/AL" So far I have this...
HtmlDocument htmlDoc = new HtmlDocument();
htmlDoc.OptionFixNestedTags = true;
htmlDoc.LoadHtml(stateURLs[0].ToString());
var BlankNode =
htmlDoc.DocumentNode.SelectNodes("/div[#class='segmentwhite']/table[#style='width: 100%;']//tr[#class='']");
var GrayNode =
htmlDoc.DocumentNode.SelectNodes("/div[#class='segmentwhite']/table[#style='width: 100%;']//tr[#class='gray_bk']");
I have looked around stackoverflow for a while but none of the present post regarding htmlagilitypack has really helped. I have also have been using http://www.w3schools.com/xpath/xpath_syntax.asp
Since <div> you're after is not direct child of root node, you need to use // instead of /. Then you can combine XPath for BlankNode and GrayNode using or operator, for example :
var htmlweb = new HtmlWeb();
HtmlDocument htmlDoc = htmlweb.Load("http://anytimefitness.com/find-gym/list/AL");
htmlDoc.OptionFixNestedTags = true;
var AllNode =
htmlDoc.DocumentNode.SelectNodes("//div[#class='segmentwhite']/table//tr[#class='' or #class='gray_bk']");
foreach (HtmlNode node in AllNode)
{
var location = node.SelectSingleNode("./td[2]").InnerText;
var address = node.SelectSingleNode("./td[3]").InnerText;
var phone = node.SelectSingleNode("./td[4]").InnerText;
//do something with above informations
}
Here's an example I tested in LinqPad.
string url = #"http://anytimefitness.com/find-gym/list/AL";
var client = new System.Net.WebClient();
var data = client.DownloadData(url);
var html = Encoding.UTF8.GetString(data);
var htmlDoc = new HtmlAgilityPack.HtmlDocument();
htmlDoc.OptionFixNestedTags = true;
htmlDoc.LoadHtml(html);
var gyms = htmlDoc.DocumentNode.SelectNodes("//tbody/tr[#class='' or #class='gray_bk']");
foreach (var gym in gyms) {
var city = gym.SelectSingleNode("./td[2]").InnerText;
var address = gym.SelectSingleNode("./td[3]").InnerText;
var phone = gym.SelectSingleNode("./td[4]").InnerText;
}
Since the HtmlAgilityPack also supports Linq, you could also do something like:
string [] classes = {"", "gray_bk"};
var gyms = htmlDoc
.DocumentNode
.Descendants("tr")
.Where(t => classes.Contains(t.Attributes["class"].Value))
.ToList();
gyms.ForEach(gym => {
var city = gym.SelectSingleNode("./td[2]").InnerText;
var address = gym.SelectSingleNode("./td[3]").InnerText;
var phone = gym.SelectSingleNode("./td[4]").InnerText;
});
I am trying to replace some text using HtmlAgilityPack in Html string and placing ASP.net user controls but I am getting lower case in output html. Any Idea how to get original case output.
Code :
public static string ConvertPageTitlesToCMSTitle(string htmlstring, string themeSlug)
{
var htmlDoc = new HtmlAgilityPack.HtmlDocument()
{
OptionOutputOriginalCase = true,
OptionWriteEmptyNodes = true
};
htmlDoc.LoadHtml(htmlstring);
var stPageTitleTags = htmlDoc.DocumentNode.SelectNodes("//stpagetitle");
foreach (var stPageTitleTag in stPageTitleTags)
{
var pageTitle = Strings.StripHTML(stPageTitleTag.InnerText);
pageTitle = pageTitle.Trim();
var pageId = CreateUpdateContentPageInDb(pageTitle, themeSlug, null, null);
var widgetControl = string.Format("<widget:PageTitleDisplay runat=\"server\" PageId=\"{0}\" Editable=\"True\" />", pageId);
htmlDoc.DocumentNode.InnerHtml = htmlDoc.DocumentNode.InnerHtml.Replace(stPageTitleTag.OuterHtml, widgetControl);
}
return htmlDoc.DocumentNode.OuterHtml;
}
As a workaround you could create a text node instead of HTML node. See:
foreach (var stPageTitleTag in stPageTitleTags)
{
var pageTitle = Strings.StripHTML(stPageTitleTag.InnerText);
pageTitle = pageTitle.Trim();
var pageId = CreateUpdateContentPageInDb(pageTitle, themeSlug, null, null);
var widgetControl = string.Format("<widget:PageTitleDisplay runat=\"server\" PageId=\"{0}\" Editable=\"True\" />", pageId);
// creating a text node
var widget = htmlDoc.CreateTextNode(widgetControl);
// replacing <sppagetitle> node with the new one
stPageTitleTag.ReplaceChild(widget, stPageTitleTag);
}
This should get the output you want.
in this code var TempTxt holds An Html Body Content
as string
how can i extract element <table> or <td> inner text/ html using lambada syntax ?
public string ExtractPageValue(IWebDriver DDriver, string url="")
{
if(string.IsNullOrEmpty(url))
url = #"http://www.boi.org.il/he/Markets/ExchangeRates/Pages/Default.aspx";
var service = InternetExplorerDriverService.CreateDefaultService(directory);
service.LogFile = directory + #"\seleniumlog.txt";
service.LoggingLevel = InternetExplorerDriverLogLevel.Trace;
var options = new InternetExplorerOptions();
options.IntroduceInstabilityByIgnoringProtectedModeSettings = true;
DDriver = new InternetExplorerDriver(service, options, TimeSpan.FromSeconds(60));
DDriver.Navigate().GoToUrl(url);
var TempTxt = DDriver.PageSource;
return "";//Math.Round(Convert.ToDouble( TempTxt.Split(' ')[10]),2).ToString();
}
If you are open to try HtmlAgilityPack
HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
doc.LoadHtml(html);
var table = doc.DocumentNode.SelectNodes("//table/tr")
.Select(tr => tr.Elements("td").Select(td => td.InnerText).ToList())
.ToList();
public partial class XML_3 : Window
{
public XML_3()
{
this.InitializeComponent();
XmlDocument doc = new XmlDocument();
doc.Load("D:/sample.xml");
XmlNodeList student_list = doc.GetElementsByTagName("Student");
foreach (XmlNode node in student_list)
{
XmlElement student = (XmlElement)node;
int element_count = student.ChildNodes.Count;
}
}
}
In above code.I can get the count of element except root element(Student). now the count is 3.
But i have to get 2ed element name(Kavi),it's attribute element name(ID) and it's child element name(FName,MName).
what should i do to get those stuff.
Please help me...
Use XDocument (why?):
var doc = XDocument.Parse(xml); // OR Load(...)
var nodeCount = doc.Elements().Count();
var secondNode = doc.Elements().Skip(1).First();
var studentName = secondNode.Name;
var studentId = secondNode.Attribute("ID").Value;
or (for your code):
var secondNode = student.ChildNodes[1] as XmlElement;
var studentName = secondNode.LocalName;
var studentId = secondNode.Attributes["ID"];
Added:
var secondNode = student.ChildNodes[1];
var fName =
secondNode.ChildNodes.Cast<XmlElement>().FirstOrDefault(x => x.LocalName == "FName").InnerText;
var mName =
secondNode.ChildNodes.Cast<XmlElement>().FirstOrDefault(x => x.LocalName == "MName").InnerText;
var studentId = secondNode.Attributes["ID"].Value;