// Load the page and record the href attribute of every anchor that has one.
var web = new HtmlWeb();
var doc = web.Load(page);
// XPath attribute tests are written with '@' (e.g. @class, @href).
var Articles = doc.DocumentNode.SelectNodes("//*[@class = 'b-product-grid-tile js-tile-container']");
var href = doc.DocumentNode.SelectNodes("//a[@href]");
// SelectNodes yields null rather than an empty collection when nothing matches,
// so guard before enumerating.
if (href != null)
{
    foreach (HtmlNode link in href)
    {
        HtmlAttribute att = link.Attributes["href"];
        _entries.Add(new EntryModel { Link = att });
        // att.ToString(); <----- Want to convert the HtmlAttribute to a string.
        // NOTE(review): att.Value holds the attribute's text; use it if EntryModel.Link
        // is meant to be a string - confirm against the EntryModel declaration.
    }
}
Full Code of my Scraper:
EntryModel List:
Main Window:
The links I need:
The easiest would be using the System.Web.Mvc.TagBuilder class:
// 'class' is a C# keyword, so the anonymous-type member has to be escaped as @class.
var attributes = new { @class = "myClass", id = "elId" };
var tag = new TagBuilder("href");
// Copy every member of the anonymous object onto the tag as an HTML attribute.
tag.MergeAttributes(new RouteValueDictionary(attributes));
return tag.ToString();
In C#, via GeckoFx, I have not found a method that returns all attributes of an element.
To do this, I made a JavaScript function. Here is my code
GeckoWebBrowser GeckoBrowser = ....;
GeckoNode NodeElement = ....; // HTML element where to find all HTML attributes
string JSresult = "";
// Verbatim string literal (@"...") so the script can span several lines.
// The script collects every attribute of the element it is evaluated against
// into an object and returns that object serialised as JSON.
string JStext = @"
function getElementAttributes(element)
{
var AttributesAssocArray = {};
for (var index = 0; index < element.attributes.length; ++index) { AttributesAssocArray[element.attributes[index].name] = element.attributes[index].value; };
return JSON.stringify(AttributesAssocArray);
}
getElementAttributes(this);
";
// Evaluate the script with NodeElement's DOM object as the context object; the result is written to JSresult.
using (AutoJSContext JScontext = new AutoJSContext(GeckoBrowser.Window.JSContext))
{
    JScontext.EvaluateScript(JStext, (nsISupports)NodeElement.DomObject, out JSresult);
}
Do you have any other suggestions for achieving this in C# (without JavaScript)?
The property GeckoElement.Attributes allows access to an element's attributes.
So for example (this is untested and uncompiled code):
/// <summary>
/// Concatenates every attribute of <paramref name="element"/> into one string,
/// each rendered as " name = 'value' ".
/// </summary>
/// <param name="element">The element whose attributes are listed.</param>
/// <returns>The concatenated attribute list; empty when the element has no attributes.</returns>
public string GetElementAttributes(GeckoElement element)
{
    var buffer = new StringBuilder();
    foreach (var attribute in element.Attributes)
    {
        // AppendFormat writes the formatted fragment straight into the builder.
        buffer.AppendFormat(" {0} = '{1}' ", attribute.NodeName, attribute.NodeValue);
    }

    return buffer.ToString();
}
I have a large HTML-encoded string and I want to decode only specific whitelisted HTML tags.
Is there a way to do this in C#? WebUtility.HtmlDecode() decodes everything.
`I am looking for an implementation of DecodeSpecificTags() that will pass the test below.
[Test]
public void DecodeSpecificTags_SimpleInput_True()
{
    // The input is fully HTML-encoded; only the white-listed tags (strong, br)
    // are expected to come back decoded, the span must stay encoded.
    string input = "&lt;span&gt;i am &lt;strong color=blue&gt;very&lt;/strong&gt; big &lt;br&gt;man.&lt;/span&gt;";
    string output = "&lt;span&gt;i am <strong color=blue>very</strong> big <br>man.&lt;/span&gt;";
    List<string> whiteList = new List<string>() { "strong", "br" };
    // AreEqual reports both strings on failure, unlike IsTrue(a == b).
    Assert.AreEqual(output, DecodeSpecificTags(whiteList, input));
}`
You could do something like this
/// <summary>
/// Decodes only the white-listed tags inside an HTML-encoded string and leaves
/// every other encoded tag untouched.
/// </summary>
/// <param name="whiteListedTagNames">Tag names (for example "strong", "br") whose opening and closing tags are decoded. Null or empty entries are ignored.</param>
/// <param name="encodedInput">The HTML-encoded text.</param>
/// <returns>
/// The text with the white-listed tags decoded; <paramref name="encodedInput"/> unchanged
/// when it is null or empty or when no white list is supplied.
/// </returns>
public string DecodeSpecificTags(List<string> whiteListedTagNames,string encodedInput)
{
    if (string.IsNullOrEmpty(encodedInput) || whiteListedTagNames == null)
    {
        return encodedInput;
    }

    foreach (string tagName in whiteListedTagNames)
    {
        if (string.IsNullOrEmpty(tagName))
        {
            continue;
        }

        // Matches an encoded opening or closing tag: "&lt;", optional "/", the tag name as a
        // whole word (so "b" does not match "br"), then lazily everything up to the next "&gt;".
        string pattern = @"&lt;\s*/?\s*" + Regex.Escape(tagName) + @"\b.*?&gt;";

        // Decode just the matched tag, including any encoded attribute text it carries.
        encodedInput = Regex.Replace(
            encodedInput,
            pattern,
            match => System.Net.WebUtility.HtmlDecode(match.Value),
            RegexOptions.IgnoreCase | RegexOptions.Singleline);
    }

    return encodedInput;
}
A better approach could be to use an HTML parser such as HtmlAgilityPack, CsQuery or NSoup to find the specific elements and decode them in a loop.
check this for links and examples of parsers
Check this; I did it using CsQuery:
// Demonstration script: keep the <strong> and <br> tags as markup while the rest of the
// document is HTML-encoded, by swapping those elements for placeholders and back.
// NOTE(review): input and output are identical here; the entity encoding that should
// distinguish them was presumably lost when the snippet was copied - confirm.
string input = "<span>i am <strong color=blue>very</strong> big <br>man.</span>";
string output = "<span>i am <strong color=blue>very</strong> big <br>man.</span>";
// Fully decode the expected text so it can be parsed as HTML.
var decoded = HttpUtility.HtmlDecode(output);
var encoded =input ; // HttpUtility.HtmlEncode(decoded);
Console.WriteLine(encoded);
Console.WriteLine(decoded);
// Parse the decoded markup and select the elements that must stay as real tags.
var doc=CsQuery.CQ.CreateDocument(decoded);
var paras=doc.Select("strong").Union(doc.Select ("br")) ;
// Placeholder key -> original element markup, in the order the elements were visited.
var tags=new List<KeyValuePair<string, string>>();
var counter=0;
foreach (var element in paras)
{
// Dump() is not defined in this snippet - presumably the LINQPad output helper; verify.
HttpUtility.HtmlEncode(element.OuterHTML).Dump();
// Placeholder of the form ---N--- that takes the element's place in the document.
var key ="---" + counter + "---";
var value= HttpUtility.HtmlDecode(element.OuterHTML);
var pair= new KeyValuePair<String,String>(key,value);
// Replace the element in the document with its placeholder before recording the pair.
element.OuterHTML = key ;
tags.Add(pair);
counter++;
}
// Encode the whole body; at this point the selected elements are plain placeholder text.
var finalstring= HttpUtility.HtmlEncode(doc.Document.Body.InnerHTML);
finalstring.Dump();
// Put the saved markup back in place of each placeholder.
foreach (var element in tags)
{
finalstring=finalstring.Replace(element.Key,element.Value);
}
Console.WriteLine(finalstring);
Or you could use HtmlAgilityPack with a black list or a white list, depending on your requirements. I'm using the black-list approach.
My black-listed tags are stored in a text file, for example "script|img"
/// <summary>
/// HTML-decodes <paramref name="content"/> and then re-encodes the markup of every
/// element whose tag name is black-listed.
/// </summary>
/// <param name="content">The HTML-encoded text; returned as-is when null or empty.</param>
/// <param name="blackListedTags">Tag names whose elements must stay encoded; compared in lower case.</param>
/// <returns>The decoded text with the black-listed elements encoded again.</returns>
public static string DecodeSpecificTags(this string content, List<string> blackListedTags)
{
    if (string.IsNullOrEmpty(content))
    {
        return content;
    }

    var lowerCasedTags = blackListedTags.Select(t => t.ToLowerInvariant()).ToList();

    // Decode everything first, then parse the decoded text to locate the black-listed elements.
    var result = HttpUtility.HtmlDecode(content);
    var document = new HtmlDocument();
    document.LoadHtml(result);

    foreach (var tag in lowerCasedTags)
    {
        foreach (var node in document.DocumentNode.Descendants(tag))
        {
            // Swap the node's serialised markup for its encoded form wherever it occurs in the text.
            var markup = node.WriteTo();
            result = result.Replace(markup, HttpUtility.HtmlEncode(markup));
        }
    }

    return result;
}
I am trying to clear the HTML encoding from my RSS feed. I cannot work out how to change the code below so that it takes out the HTML encoding.
// Parse the downloaded feed and project every <item> element onto a feed item view model.
var rssFeed = XElement.Parse(e.Result);
var currentFeed = this.DataContext as app.ViewModels.FeedViewModel;
var items = rssFeed.Descendants("item").Select(item => new ATP_Tennis_App.ViewModels.FeedItemViewModel()
{
    Title = item.Element("title").Value,
    DatePublished = DateTime.Parse(item.Element("pubDate").Value),
    Url = item.Element("link").Value,
    Description = item.Element("description").Value
});

// Add the projected items to the feed held by the data context.
foreach (var feedItem in items)
{
    currentFeed.Items.Add(feedItem);
}
Just use the following code:
// Sample markup to strip.
string withHtml = "<p>hello <b>there</b></p>";
// Remove every span that runs from '<' to the nearest following '>'.
string withoutHtml = new Regex("<.+?>").Replace(withHtml, string.Empty);
This will clean the html leaving only the text, so "hello there"
So, you can just copy and use this function:
/// <summary>
/// Removes every HTML tag from <paramref name="html"/>, leaving only the text between tags.
/// </summary>
/// <param name="html">The markup to clean.</param>
/// <returns>The text without tags; <paramref name="html"/> itself when it is null or empty.</returns>
string RemoveHtmlTags(string html) {
    if (string.IsNullOrEmpty(html)) {
        return html;
    }

    // Singleline lets '.' match newlines, so a tag whose attributes wrap onto
    // another line is removed as well.
    return Regex.Replace(html, "<.+?>", string.Empty, RegexOptions.Singleline);
}
Your code will look something like this:
// Parse the downloaded feed and project every <item> element onto a feed item view model,
// stripping HTML tags from the title and the description.
var rssFeed = XElement.Parse(e.Result);
var currentFeed = this.DataContext as app.ViewModels.FeedViewModel;
var items = from item in rssFeed.Descendants("item")
            select new ATP_Tennis_App.ViewModels.FeedItemViewModel()
            {
                Title = RemoveHtmlTags(item.Element("title").Value),
                DatePublished = DateTime.Parse(item.Element("pubDate").Value),
                Url = item.Element("link").Value,
                // Same helper as for the title; the helper is named RemoveHtmlTags.
                Description = RemoveHtmlTags(item.Element("description").Value)
            };
You can use this code sample, it works fine on my side
/// <summary>
/// Replaces every HTML tag in <paramref name="value"/> with a single space and then
/// HTML-decodes the remaining text.
/// </summary>
/// <param name="value">The markup to clean.</param>
/// <returns>The decoded text with a space where each tag used to be.</returns>
public static string RemoveHTMLTags(string value)
{
    // Strip the tags first so that decoded entities such as &lt; are not mistaken for tags.
    return HttpUtility.HtmlDecode(Regex.Replace(value, "<[^>]*>", " "));
}
I hope, this code helps you.
Use the following class utility:
HttpUtility.HtmlDecode(string);
Please don't refer to this answer anymore.
I have recently decided to write a generic Table HTML helper to generate tables for my models and other objects. I have used reflection to make it more generic, taking an IEnumerable argument as the table data and a Dictionary for the bound columns (property name to column header).
I want to use reflection or some other method to get each property's [DisplayName()] attribute from the model's metadata so that it does not need to be specified in a dictionary. However, all the methods I have tried seem to return null, so I have removed them from my code.
/// <summary>
/// Renders an HTML table for a sequence of objects, with a header row, a footer holding
/// either a pager and filter box or a "nothing to display" message, and - when there is
/// data - the scripts that enable sorting, paging and filtering.
/// </summary>
/// <param name="htmlHelper">The helper being extended; supplies the request context used to resolve image URLs.</param>
/// <param name="boundColumns">Maps property names (keys) to the header text (values) of the columns to render.</param>
/// <param name="objectData">The rows to render. The type of the first element decides which properties are read. Null is treated as an empty sequence.</param>
/// <param name="tagId">Id of the table element. When null or empty, "table-N" is used, N being the number of bound columns.</param>
/// <param name="className">Optional CSS class(es) placed in front of the "tablesorter" class.</param>
/// <param name="controllerName">Controller name handed to Table_ActionLinks for the per-row links.</param>
/// <param name="idProperty">Name of the property that holds a row's id. When null or empty no "Action" column is rendered.</param>
/// <returns>The table markup, followed by the sort and search scripts when there is data.</returns>
/// <exception cref="ArgumentNullException"><paramref name="htmlHelper"/> or <paramref name="boundColumns"/> is null.</exception>
/// <exception cref="ArgumentException"><paramref name="idProperty"/> does not name a public property of the row type.</exception>
public static MvcHtmlString Table(this HtmlHelper htmlHelper, Dictionary<string, string> boundColumns, IEnumerable<object> objectData, string tagId, string className, string controllerName, string idProperty)
{
    if (htmlHelper == null)
    {
        throw new ArgumentNullException("htmlHelper");
    }
    if (boundColumns == null)
    {
        throw new ArgumentNullException("boundColumns");
    }

    // Materialise once: the sequence is checked for emptiness, inspected for its
    // element type and enumerated again for the rows.
    List<object> rows = objectData == null ? new List<object>() : objectData.ToList();

    bool hasAction = !String.IsNullOrEmpty(idProperty);
    bool hasData = rows.Count > 0;
    // One id shared by the table element, the pager/filter controls and the scripts,
    // so the jQuery selectors still match when the caller passes no tagId.
    string tableId = String.IsNullOrEmpty(tagId) ? String.Format("table-{0}", boundColumns.Count) : tagId;
    string columnSpan = (hasAction ? (boundColumns.Count + 1) : boundColumns.Count).ToString();
    UrlHelper urlHelper = new UrlHelper(htmlHelper.ViewContext.RequestContext);

    Type objectDataType = hasData ? rows[0].GetType() : null;
    // Properties are resolved in header order so every cell sits under its own header;
    // a column whose property does not exist is kept as null and rendered as "N/A".
    List<PropertyInfo> columnProperties = new List<PropertyInfo>();
    PropertyInfo idPropertyInfo = null;
    if (hasData)
    {
        PropertyInfo[] availableProperties = objectDataType.GetProperties();
        foreach (string columnKey in boundColumns.Keys)
        {
            string propertyName = columnKey;
            columnProperties.Add(availableProperties.FirstOrDefault(p => p.Name == propertyName));
        }

        if (hasAction)
        {
            idPropertyInfo = objectDataType.GetProperty(idProperty);
            if (idPropertyInfo == null)
            {
                throw new ArgumentException(String.Format("Type '{0}' has no public property named '{1}'.", objectDataType.FullName, idProperty), "idProperty");
            }
        }
    }

    // NOTE(review): header text and cell values are written as raw HTML (InnerHtml);
    // encode them if the data can contain untrusted input.

    // Thead
    TagBuilder theadtr = new TagBuilder("tr");
    foreach (string col in boundColumns.Values)
    {
        Table_AppendChild(theadtr, new TagBuilder("th") { InnerHtml = col });
    }
    if (hasAction)
    {
        Table_AppendChild(theadtr, new TagBuilder("th") { InnerHtml = "Action" });
    }
    TagBuilder thead = new TagBuilder("thead") { InnerHtml = theadtr.ToString() };

    // Tfoot: a warning when there is no data, otherwise the pager and the filter.
    TagBuilder tfoot = new TagBuilder("tfoot");
    string footerContent = hasData
        ? String.Format("{0}\n{1}", Table_Pager(urlHelper, tableId), Table_Filter(tableId))
        : "There is currently nothing to display.";
    tfoot.InnerHtml = Table_FooterRow(footerContent, columnSpan);

    // Tbody
    TagBuilder tbody = new TagBuilder("tbody");
    foreach (object row in rows)
    {
        TagBuilder tbodytr = new TagBuilder("tr");
        foreach (PropertyInfo property in columnProperties)
        {
            string val = "N/A";
            object pval = property == null ? null : property.GetValue(row, null);
            if (pval != null)
            {
                val = pval.ToString();
            }
            Table_AppendChild(tbodytr, new TagBuilder("td") { InnerHtml = val });
        }
        if (hasAction)
        {
            string id = idPropertyInfo.GetValue(row, null).ToString();
            Table_AppendChild(tbodytr, new TagBuilder("td") { InnerHtml = Table_ActionLinks(htmlHelper, controllerName, id) });
        }
        Table_AppendChild(tbody, tbodytr);
    }

    // Table
    TagBuilder table = new TagBuilder("table") { InnerHtml = String.Format("{0}\n{1}\n{2}", thead.ToString(), tfoot.ToString(), tbody.ToString()) };
    table.MergeAttribute("id", tableId);
    table.MergeAttribute("summary", "Generic data list");
    if (!String.IsNullOrEmpty(className))
    {
        table.MergeAttribute("class", String.Format("{0} {1}", className, "tablesorter"));
    }
    else
    {
        table.MergeAttribute("class", "tablesorter");
    }

    if (!hasData)
    {
        return new MvcHtmlString(table.ToString());
    }

    // Enable Sorting/Searching
    TagBuilder sortscript = new TagBuilder("script") { InnerHtml = String.Format("$(document).ready(function(){{$(\"#{0}\").tablesorter().tablesorterPager({{container:$(\"#{1}\")}});}});", tableId, String.Format("{0}-pager", tableId)) };
    TagBuilder searchscript = new TagBuilder("script") { InnerHtml = String.Format("$(document).ready(function(){{$(\"#{0}\").keyup(function(){{$.uiTableFilter($(\"#{1}\"), this.value);}})}});", String.Format("{0}-filter-field", tableId), tableId) };
    sortscript.MergeAttribute("type", "text/javascript");
    searchscript.MergeAttribute("type", "text/javascript");
    return new MvcHtmlString(String.Format("{0}\n{1}\n{2}", table.ToString(), sortscript.ToString(), searchscript.ToString()));
}

/// <summary>Appends a child's markup to a parent tag's inner HTML, preceded by a newline.</summary>
private static void Table_AppendChild(TagBuilder parent, TagBuilder child)
{
    parent.InnerHtml = String.Format("{0}\n{1}", parent.InnerHtml, child.ToString());
}

/// <summary>Wraps footer content in a centred cell spanning all columns and returns the enclosing row's markup.</summary>
private static string Table_FooterRow(string contentHtml, string columnSpan)
{
    TagBuilder tfoottd = new TagBuilder("td") { InnerHtml = contentHtml };
    tfoottd.MergeAttribute("colspan", columnSpan);
    tfoottd.MergeAttribute("style", "text-align:center");
    return (new TagBuilder("tr") { InnerHtml = tfoottd.ToString() }).ToString();
}

/// <summary>Builds one pager navigation image; the CSS class doubles as the suffix of the element id.</summary>
private static string Table_PagerImage(UrlHelper urlHelper, string tableId, string cssClass, string altText, string imageFile)
{
    TagBuilder image = new TagBuilder("img");
    image.MergeAttribute("id", String.Format("{0}-page-{1}", tableId, cssClass));
    image.MergeAttribute("class", cssClass);
    image.MergeAttribute("alt", altText);
    image.MergeAttribute("src", urlHelper.Content("~/Content/Style/Tables/Themes/Blue/" + imageFile));
    image.MergeAttribute("style", "cursor:pointer; vertical-align:middle;");
    return image.ToString();
}

/// <summary>Builds the pager container: first/prev/next/last images, the status field and the page-size select.</summary>
private static string Table_Pager(UrlHelper urlHelper, string tableId)
{
    // The display field for the pager status.
    TagBuilder pagedisplay = new TagBuilder("input");
    pagedisplay.MergeAttribute("id", String.Format("{0}-page-display", tableId));
    pagedisplay.MergeAttribute("type", "text");
    pagedisplay.MergeAttribute("class", "pagedisplay");
    pagedisplay.MergeAttribute("disabled", "disabled");
    pagedisplay.MergeAttribute("style", "width:12%;");

    // The select for changing page size: 10 to 100 in steps of 10, with 10 preselected.
    TagBuilder pagesize = new TagBuilder("select");
    pagesize.MergeAttribute("id", String.Format("{0}-page-size", tableId));
    pagesize.MergeAttribute("class", "pagesize");
    pagesize.MergeAttribute("style", "width:12%;");
    for (int i = 10; i <= 100; i += 10)
    {
        TagBuilder option = new TagBuilder("option") { InnerHtml = i.ToString() };
        if (i == 10)
        {
            option.MergeAttribute("selected", "selected");
        }
        option.MergeAttribute("value", i.ToString());
        Table_AppendChild(pagesize, option);
    }

    string controls = String.Format(
        "{0}\n{1}\n{2}\n{3}\n{4}\n{5}",
        Table_PagerImage(urlHelper, tableId, "first", "First Page", "resultset_first.png"),
        Table_PagerImage(urlHelper, tableId, "prev", "Previous Page", "resultset_previous.png"),
        Table_PagerImage(urlHelper, tableId, "next", "Next Page", "resultset_next.png"),
        Table_PagerImage(urlHelper, tableId, "last", "Last Page", "resultset_last.png"),
        pagedisplay.ToString(),
        pagesize.ToString());

    // The pager container.
    TagBuilder pagediv = new TagBuilder("div") { InnerHtml = (new TagBuilder("form") { InnerHtml = controls }).ToString() };
    pagediv.MergeAttribute("id", String.Format("{0}-pager", tableId));
    pagediv.MergeAttribute("style", "float:left; width:50%;");
    return pagediv.ToString();
}

/// <summary>Builds the filter container holding the search text field.</summary>
private static string Table_Filter(string tableId)
{
    // Filter Text Field
    TagBuilder filterfield = new TagBuilder("input");
    filterfield.MergeAttribute("id", String.Format("{0}-filter-field", tableId));
    filterfield.MergeAttribute("type", "text");
    filterfield.MergeAttribute("style", "width:30%;");

    // The filter container.
    TagBuilder filterdiv = new TagBuilder("div") { InnerHtml = (new TagBuilder("form") { InnerHtml = String.Format("Search: {0}", filterfield.ToString()) }).ToString() };
    filterdiv.MergeAttribute("id", String.Format("{0}-filter", tableId));
    filterdiv.MergeAttribute("style", "float:right; width:50%;");
    return filterdiv.ToString();
}
So basically I am looking to use as much reflection as possible to eliminate as many arguments to this method as possible.
Thanks,
Alex.
I'm not sure why your Display attribute retrieval isn't working. Here's what I use. It's from a method which retrieves the Display attribute from enum field values, but it's the same basic pattern that I use to retrieve any attribute from an object:
/// <summary>
/// Returns the display name declared through a <see cref="DisplayAttribute"/> on an enum value.
/// </summary>
/// <typeparam name="T">The type of <paramref name="toCheck"/>; must be an enum type.</typeparam>
/// <param name="toCheck">The enum value to look up.</param>
/// <returns>
/// Null when <typeparamref name="T"/> is not an enum; the attribute's display name when the
/// value maps to exactly one member carrying exactly one <see cref="DisplayAttribute"/>;
/// otherwise the value's own string representation.
/// </returns>
public static string GetDisplayName<T>( T toCheck )
{
    Type enumType = typeof(T);
    if( !enumType.IsEnum ) return null;

    string valueText = toCheck.ToString();

    // Undefined or combined values do not resolve to a single member; fall back to the text.
    MemberInfo[] members = enumType.GetMember(valueText);
    if( ( members == null ) || ( members.Length != 1 ) ) return valueText;

    DisplayAttribute[] attrs = (DisplayAttribute[]) members[0].GetCustomAttributes(typeof(DisplayAttribute), false);
    // GetName() resolves the localised text when ResourceType is set; the Name property
    // would return only the raw resource key in that case.
    if( ( attrs != null ) && ( attrs.Length == 1 ) ) return attrs[0].GetName();

    return valueText;
}