enter image description here
I want to pull the areas within the pictures
Iam pulled 2 and 3 think
Uri url = new Uri("http://www.milliyet.com.tr/sondakika/");
WebClient client = new WebClient();
client.Encoding = System.Text.Encoding.UTF8;
var html = client.DownloadString(url);
HtmlAgilityPack.HtmlDocument dokuman = new HtmlAgilityPack.HtmlDocument();
dokuman.LoadHtml(html);
HtmlNodeCollection basliklar = dokuman.DocumentNode.SelectNodes("//div[contains(#class,'kategoriList3')]//a");
foreach (var baslik in basliklar)
{
try
{
datacıktı.Rows.Add();
datacıktı.Rows[sayac].Cells[0].Value = baslik.Attributes["href"].Value.ToString();
datacıktı.Rows[sayac].Cells[1].Value = baslik.InnerText;
sayac++;
}
catch
{
continue;
}
}
This code can help you
Uri url = new Uri("http://www.milliyet.com.tr/sondakika/");
WebClient client = new WebClient();
client.Encoding = System.Text.Encoding.UTF8;
var html = client.DownloadString(url);
HtmlAgilityPack.HtmlDocument dokuman = new HtmlAgilityPack.HtmlDocument();
dokuman.LoadHtml(html);
IEnumerable<HtmlNode> htmlNodes = dokuman.DocumentNode.Descendants("ul").Where(d => d.Attributes.Contains("class") && d.Attributes["class"].Value.Contains("sonDK"));
foreach (HtmlNode htmlNode in htmlNodes)
{
IEnumerable<HtmlNode> liList = htmlNode.Descendants("li").Where(l => (l.Attributes.Contains("class") && l.Attributes["class"].Value.Contains("title")) == false);
foreach (HtmlNode liNode in liList)
{
Console.WriteLine("strong:" + liNode.FirstChild.InnerText + "- link:" + liNode.LastChild.Attributes["href"].Value);
}
}
Related
I am trying to get all links in my txt file to extract them using the Html Agility Pack but when extracting I get an error:
Can you explain why?
Code:
string[] lines = File.ReadAllLines("links");
foreach (string line in lines)
{
HtmlWeb hw = new HtmlWeb();
HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
doc = hw.Load(line.ToString());
foreach (HtmlNode link in doc.DocumentNode.SelectNodes("//a[#href]"))
{
string hrefValue = link.GetAttributeValue("href", string.Empty);
if (!hrefValue.ToString().StartsWith("http://") && !hrefValue.ToString().StartsWith("https://"))
continue;
if (!crawlListbox.Items.Contains(hrefValue))
{
crawlListbox.Items.Add(hrefValue);
}
}
}
Screenshot of the code and error message+variable values So, the goal is to take a word and get the part of speech of the word from its google definition.
I've tried a few different approaches but I'm getting a null reference error every time. Is my code failing to access the webpage? Is it a firewall issue, a logic issue, an {insert-issue-here} problem? I really wish i had a vague idea of what is wrong.
Thanks for your time.
Addendum: I've tried "//[#id=\"source - luna\"]//div" and "//[#id=\"source - luna\"]/div1" as XPath values.
//attempt 1////////////////////////////////////////////////////////////////////////
var term = "Hello";
HttpWebRequest request = (HttpWebRequest)WebRequest.Create("http://www.urbandictionary.com/define.php?term=" + term);
HttpWebResponse response = (HttpWebResponse)request.GetResponse();
StreamReader stream = new StreamReader(response.GetResponseStream());
string final_response = stream.ReadToEnd();
MessageBox.Show(final_response); //doesn't execute
//attempt 2////////////////////////////////////////////////////////////////////////
var url = "https://www.google.co.za/search?q=define+position";
var content = new System.Net.WebClient().DownloadString(url);
var webGet = new HtmlWeb();
var doc = new HtmlAgilityPack.HtmlDocument();
doc.LoadHtml(content);
//doc is null at runtime
HtmlNode ourNode = doc.DocumentNode.SelectSingleNode("//*[#id=\"uid_0\"]/div[1]/div/div[1]/div[2]/div[2]/div[1]/i/span");
if (ourNode != null)
{
richTextBox1.AppendText(ourNode.InnerText);
}
else
richTextBox1.AppendText("null");
//attempt 3////////////////////////////////////////////////////////////////////////
var webGet = new HtmlWeb();
var doc = webGet.Load("https://www.google.co.za/search?q=define+position");
//doc is null at runtime
HtmlNode ourNode = doc.DocumentNode.SelectSingleNode("//*[#id=\"uid_0\"]/div[1]/div/div[1]/div[2]/div[2]/div[1]/i/span");
if (ourNode != null)
{
richTextBox1.AppendText(ourNode.InnerText);
}
else
richTextBox1.AppendText("null");
//attempt 4////////////////////////////////////////////////////////////////////////
string Url = "http://www.metacritic.com/game/pc/halo-spartan-assault";
HtmlWeb web = new HtmlWeb();
HtmlAgilityPack.HtmlDocument doc = web.Load(Url);
//doc is null at runtime
string metascore = doc.DocumentNode.SelectNodes("//*[#id=\"main\"]/div[3]/div/div[2]/div[1]/div[1]/div/div/div[2]/a/span[1]")[0].InnerText;
string userscore = doc.DocumentNode.SelectNodes("//*[#id=\"main\"]/div[3]/div/div[2]/div[1]/div[2]/div[1]/div/div[2]/a/span[1]")[0].InnerText;
string summary = doc.DocumentNode.SelectNodes("//*[#id=\"main\"]/div[3]/div/div[2]/div[2]/div[1]/ul/li/span[2]/span/span[1]")[0].InnerText;
richTextBox1.AppendText(metascore + " " + userscore + " " + summary);
//attempt 5////////////////////////////////////////////////////////////////////////
HtmlWeb web = new HtmlWeb();
HtmlAgilityPack.HtmlDocument html = web.Load("https://www.google.co.za/search?q=define+position");
//html is null
var div = html.DocumentNode.SelectNodes("//*[#id=\"uid_0\"]/div[1]/div/div[1]/div[2]/div[2]/div[1]/i/span");
richTextBox1.AppendText(Convert.ToString(div));
You are getting null because your XPATHs aren't correct or it couldn't find any node based on those XPATHs. What are you trying to achieve here?
I get from url html page.
in page I get table with hot opened <tr> tag
<table class="transparent">
<tr><td>Sąrašo eil. Nr.:</td><td>B-FA001</td></tr>
<td>Įrašymo į Sąrašą data:</td><td>2006-11-13</td></tr>
</table>
how to fix to
<table class="transparent">
<tr><td>Sąrašo eil. Nr.:</td><td>B-FA001</td></tr>
<tr><td>Įrašymo į Sąrašą data:</td><td>2006-11-13</td></tr>
</table>
I tried to do
private HtmlDocument GetHtmlDocument(string link)
{
string url = "http://195.182.67.7/paslaugos/administratoriai/bankroto-administratoriai/" + link;
var web = new HtmlWeb { AutoDetectEncoding = false, OverrideEncoding = Encoding.UTF8 };
var doc = web.Load(url);
doc.OptionFixNestedTags = true;
doc.OptionAutoCloseOnEnd = true;
doc.OptionCheckSyntax = true;
// build a list of nodes ordered by stream position
NodePositions pos = new NodePositions(doc);
// browse all tags detected as not opened
foreach (HtmlParseError error in doc.ParseErrors.Where(e => e.Code == HtmlParseErrorCode.TagNotOpened))
{
// find the text node just before this error
var last = pos.Nodes.OfType<HtmlTextNode>().LastOrDefault(n => n.StreamPosition < error.StreamPosition);
if (last != null)
{
// fix the text; reintroduce the broken tag
last.Text = error.SourceText.Replace("/", "") + last.Text + error.SourceText;
}
}
doc.Save(Console.Out);
return doc;
}
but not fix
for this particular problem you could do simple regex replacing:
string wrong = "<table class=\"transparent\"><tr><td>Sąrašo eil. Nr.:</td><td>B-FA001</td></tr><td>Įrašymo į Sąrašą data:</td><td>2006-11-13</td></tr></table>";
Regex reg = new Regex(#"(?<!(?:<tr>)|(?:</td>))<td>");
string right = reg.Replace(wrong, "<tr><td>");
Console.WriteLine(right);
string searchString = textBox1.Text.Replace(" ", "%20");
string url = "http://sometorrentsearchurl.com/search/" + searchString + "/0/99/401";
HttpWebRequest oReq = (HttpWebRequest)WebRequest.Create(url);
HttpWebResponse resp = (HttpWebResponse)oReq.GetResponse();
var doc = new HtmlAgilityPack.HtmlDocument();
doc.Load(resp.GetResponseStream());
foreach (HtmlNode torrent in doc.DocumentNode.SelectNodes("//tr"))
{
foreach (HtmlNode title in torrent.SelectNodes(".//a[#class='detLink']"))
{
Label tTitle = new Label();
tTitle.Text = title.InnerText;
tTitle.Location = new Point(133, tHeightLoc);
tTitle.BackColor = Color.Transparent;
tTitle.ForeColor = Color.White;
tTitle.AutoSize = false;
tTitle.Font = new Font("Arial", 10);
tTitle.Size = new Size(347, 25);
tTitle.TextAlign = ContentAlignment.MiddleLeft;
tTitle.Anchor = (AnchorStyles.Top | AnchorStyles.Left | AnchorStyles.Right);
panel2.Controls.Add(tTitle);
tHeightLoc += 45;
}
}
I am trying to get the list of torrents from a site and for every html th tag found I want to create some controls in my form with values taken from other children html tags, but this line returns an error foreach (HtmlNode title in torrent.SelectNodes(".//a[#class='detLink']"))
I want to know how to fix it because is the first time that I am using Html Agility Pack.
The problem was here torrent.SelectNodes(".//a[#class='detLink']")), it was a null selection an I fixed it like this torrent.SelectNodes("//a[#class='detLink']"))
I am trying to parse a webpage. But it is giving an error. Please help me. Thanks.
Here's the code:
static void myMain()
{
using (var client = new WebClient())
{
string data = client.DownloadString("http://www.google.com");
HtmlDocument doc = new HtmlDocument();
doc.LoadHtml(data);
var nodes = doc.DocumentNode.SelectNodes("//a[#href]");
foreach (HtmlNode link in nodes)
{
HtmlAttribute att = link.Attributes["href"];
Console.WriteLine(att.Value);
}
}
}
It is giving error that The type 'System.Windows.Form.HtmlDocument' has no constructors defined. I have included HAP.
Thanks
Change
HtmlDocument doc = new HtmlDocument();
to
HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
Because you don't want work with System.Windows.Form.HtmlDocument