I need to save several links from a site, but when it reaches around 64k links it throws an OutOfMemoryException.
Here is my code; if someone could help me, it would be wonderful.
Note: if you want to test (you will have to edit a bit, of course, but not that much), the url it receives is:
http://santacatarina.entrei.net/busca/listar_empresas.php?filter={0}&pagina={1}
The code:
namespace WebCrawler.SantaCatarina
{
class SCLinkFinder : ILinkFinder
{
private readonly Queue<char> _alfabeto;
private int _paginaAtual;
private char _letraAtual;
public SCLinkFinder()
{
_alfabeto = new Queue<char>();
foreach (var c in "1ABCDEFGHIJKLMNOPQRSTUVWXYZ")
    _alfabeto.Enqueue(c);
_paginaAtual = 1;
_letraAtual = _alfabeto.Dequeue();
}
public string[] Find(string url)
{
List<string> _empresas = new List<string>();
if (!_alfabeto.Any() && _letraAtual == ' ')
{
return _empresas.ToArray();
}
var webGet = new HtmlWeb();
var formattedUrl = String.Format(url, _letraAtual, _paginaAtual++);
var document = webGet.Load(formattedUrl);
var nodes = document.DocumentNode.SelectNodes("//div[@id='conteudo']/div[@class='gratuito']/p/a");
foreach (var node in nodes)
{
var href = node.GetAttributeValue("href", "");
_empresas.Add(href);
}
var elUrlProximaPagina = document.DocumentNode.SelectSingleNode("//div[@id='principal']/div[@id='conteudo']/div[@class='paginacao']/a[contains(@class,'nextPage')]");
if (elUrlProximaPagina == null)
{
_letraAtual = _alfabeto.Any() ? _alfabeto.Dequeue() : ' ';
_paginaAtual = 1;
}
Console.WriteLine(_letraAtual);
Console.WriteLine(_paginaAtual);
DadoPo.SalvarUrl();
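// Note: this method calls itself once per page; deep recursion like this can
// exhaust the stack and keeps intermediate results alive longer than a loop would.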
return Find(url);
}
}
}
OK, now the error is at another place; it is throwing an OutOfMemoryException at:
var document = webGet.Load(formattedUrl);
Persist the content of _empresas to disk (database or physical file) after every N items (1,000, for example) scraped from the website, and then clear _empresas for the next batch.
What you are doing now is pretty much using all the memory the CLR allows for your process.
namespace WebCrawler.SantaCatarina
{
class SCLinkFinder : ILinkFinder
{
private readonly Queue<char> _alfabeto;
private int _paginaAtual;
private char _letraAtual;
public SCLinkFinder()
{
_alfabeto = new Queue<char>();
foreach (var c in "1ABCDEFGHIJKLMNOPQRSTUVWXYZ")
    _alfabeto.Enqueue(c);
_paginaAtual = 1;
_letraAtual = _alfabeto.Dequeue();
}
public string[] Find(string url)
{
List<string> _empresas = new List<string>();
if (!_alfabeto.Any() && _letraAtual == ' ')
{
return _empresas.ToArray();
}
var webGet = new HtmlWeb();
var formattedUrl = String.Format(url, _letraAtual, _paginaAtual++);
var document = webGet.Load(formattedUrl);
var nodes = document.DocumentNode.SelectNodes("//div[@id='conteudo']/div[@class='gratuito']/p/a");
foreach (var node in nodes)
{
var href = node.GetAttributeValue("href", "");
_empresas.Add(href);
}
var elUrlProximaPagina = document.DocumentNode.SelectSingleNode("//div[@id='principal']/div[@id='conteudo']/div[@class='paginacao']/a[contains(@class,'nextPage')]");
if (elUrlProximaPagina == null)
{
_letraAtual = _alfabeto.Any() ? _alfabeto.Dequeue() : ' ';
_paginaAtual = 1;
}
Console.WriteLine(_letraAtual);
Console.WriteLine(_paginaAtual);
// Your code to read _empresas and persist it to a database (or file)
return Find(url);
}
}
}
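A minimal sketch of the "persist every N links, then clear" idea (here _empresas is promoted to a field so it survives across pages; the file name links.txt and the batch size of 1,000 are assumptions):
private const int BatchSize = 1000;
private readonly List<string> _empresas = new List<string>();
private void AddLink(string href)
{
    _empresas.Add(href);
    if (_empresas.Count >= BatchSize)
        Flush();
}
private void Flush()
{
    // Append the batch to disk, then release the memory it was holding.
    System.IO.File.AppendAllLines("links.txt", _empresas);
    _empresas.Clear();
}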
I have a problem. I have 2 Android.Support.V4.App.Fragments.
In the first Fragment I use this code:
AgentSpinnerAdapter = new ArrayAdapter<string>(Context, Android.Resource.Layout.SimpleSpinnerDropDownItem);
AgentSpinner.Adapter = AgentSpinnerAdapter;
foreach (string[] str in NamesArray)
{
string AgentId = str[0];
string Owner = str[1];
string Exchange = str[2];
string Remark = str[3];
AgentSpinnerAdapter.Add("Agent " + AgentId + " - " + Owner + " - " + Remark);
}
In the second Fragment I call this line:
dbValue = Fragment1.AgentSpinnerAdapter.GetItem(0);
But it says that AgentSpinnerAdapter is a null reference, which is weird, because it gets filled. I have set AgentSpinnerAdapter to public static. Also, in my MainActivity I first create Fragment1 and then Fragment2, like this:
Fragment1 = Fragment1.NewInstance();
Fragment2 = Fragment2.NewInstance();
What am I doing wrong?
UPDATE
Here is the full Fragment1.cs method
public void LoadAgentSpinner()
{
string json = "";
try
{
string html = string.Empty;
string url = "https://www.efy.nl/app/getagents.php";
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
IgnoreBadCertificates();
using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
using (Stream stream = response.GetResponseStream())
using (StreamReader reader = new StreamReader(stream))
{
json = reader.ReadToEnd();
}
}
catch (Exception ex1)
{
try
{
WebClient client = new WebClient();
NameValueCollection fields = new NameValueCollection();
fields.Add("error", ex1.GetBaseException().ToString());
string url = "https://www.mywebsite.com";
IgnoreBadCertificates();
byte[] respBytes = client.UploadValues(url, fields);
string resp = client.Encoding.GetString(respBytes);
SelectedQuantity.Text = "";
SelectedLimit.Text = "";
}
catch (Exception ex2)
{
string exFullName = (ex2.GetType().FullName);
string ExceptionString = (ex2.GetBaseException().ToString());
}
}
//Parse json content
var jObject = JObject.Parse(json);
//Create Array from everything inside Node:"Agents"
var agentPropery = jObject["Agents"] as JArray;
//Create List to save agent data
agentList = new List<agent>();
//Find every value in Array: agentPropery
foreach (var property in agentPropery)
{
//Deserialize each array element into a list of agent objects
var propertyList = JsonConvert.DeserializeObject<List<agent>>(property.ToString());
//Add them all to the list
agentList.AddRange(propertyList);
}
//Get the relevant fields for every agent and convert them to an array
string[][] NamesArray = agentList.OrderBy(i => i.AgentId)
.Select(i => new string[] { i.AgentId.ToString(), i.Owner, i.Exchange, i.Remark })
.Distinct()
.ToArray();
AgentSpinnerAdapter = new ArrayAdapter<string>(Context, Android.Resource.Layout.SimpleSpinnerDropDownItem);
AgentSpinner.Adapter = AgentSpinnerAdapter;
foreach (string[] str in NamesArray)
{
string AgentId = str[0];
string Owner = str[1];
string Exchange = str[2];
string Remark = str[3];
AgentSpinnerAdapter.Add("Agent " + AgentId + " - " + Owner + " - " + Remark); // format your string here
}
if(MainActivity.db.CheckExistTableSettings("Default Agent") == true)
{
string Value = MainActivity.db.SelectValueFromTableSettings("Default Agent");
int spinnerPosition = AgentSpinnerAdapter.GetPosition(Value);
AgentSpinner.SetSelection(spinnerPosition);
}
else
{
AgentSpinner.SetSelection(0);
}
}
In a few of my applications it's necessary to access the other fragments from my main Activity, so we do the following:
public class MainActivity : AppCompatActivity, BottomNavigationView.IOnNavigationItemSelectedListener
{
public static Dictionary<string, Fragment> FragmentList { get; set; }
private Fragment currentFragment = null;
private BottomNavigationView navigation;
protected override void OnCreate(Bundle savedInstanceState)
{
base.OnCreate(savedInstanceState);
SetContentView(Resource.Layout.layout_mainactivity);
// create our fragments and initialise them early.
if (FragmentList == null)
{
FragmentList = new Dictionary<string, Fragment>
{
{ "main", MainFragment.NewInstance() },
{ "bugreport", BugReportFragment.NewInstance() },
{ "settings", SettingsFragment.NewInstance() }
};
}
navigation = FindViewById<BottomNavigationView>(Resource.Id.bottom_nav);
navigation.SetOnNavigationItemSelectedListener(this);
navigation.SelectedItemId = Resource.Id.navigation_main;
}
public bool OnNavigationItemSelected(IMenuItem item)
{
if (!popAction)
{
navigationResourceStack.Push(item.ItemId);
}
switch (item.ItemId)
{
case Resource.Id.navigation_main:
currentFragment = FragmentList["main"];
break;
case Resource.Id.navigation_settings:
currentFragment = FragmentList["settings"];
break;
case Resource.Id.navigation_bugreport:
currentFragment = FragmentList["bugreport"];
break;
}
if (currentFragment == null)
{
return false;
}
else
{
FragmentManager.BeginTransaction().Replace(Resource.Id.frame_content, currentFragment).Commit();
return true;
}
}
}
What this means is you could do something like MainActivity.FragmentList["main"] and then call any public method on the actual initialized fragment, because a reference to it is stored within the dictionary.
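For example, a hypothetical call site (the cast to MainFragment is needed because the dictionary stores the base Fragment type; RefreshData is a made-up method name, not from the code above):
// Hypothetical usage: fetch the initialized fragment, cast it back to its
// concrete type, and call any public method defined on it.
var main = (MainFragment)MainActivity.FragmentList["main"];
main.RefreshData(); // RefreshData is a placeholder for your own public method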
I am trying to get all of a user's media from the Instagram API and store it in a database, but I don't know how to do that. I have written the code, but it only adds one media item to the database. If anyone has an idea of how to do this, please let me know. My code is listed below.
This is my C# method :
public void makePostFromInstagram()
{
var serializer1 = new System.Web.Script.Serialization.JavaScriptSerializer();
var nodes1 = serializer1.Deserialize<dynamic>(GetData(strInstagramUserId));
foreach (var date in nodes1)
{
if (date.Key == "data")
{
string theKey = date.Key;
var thisNode = date.Value;
int userCount = 0;
foreach (var post in thisNode)
{
if (thisNode[userCount]["username"] == strInstagramUserId)
{
id = thisNode[userCount]["id"].ToString();
}
userCount++;
}
}
}
var serializer = new System.Web.Script.Serialization.JavaScriptSerializer();
Dictionary<string, object> csObj = serializer.Deserialize<Dictionary<string, object>>(GetRecentPost(id, accessToken));
int length = ((ArrayList)csObj["data"]).Count;
var nodes = serializer.Deserialize<dynamic>(GetRecentPost(id, accessToken));
foreach (var date in nodes)
{
if (date.Key == "data")
{
string theKey = date.Key;
var thisNode = date.Value;
foreach (var post in thisNode)
{
UsersOnInstagram objUserInsta = new UsersOnInstagram();
string result = null;
//here only one image gets added; I want to insert multiple media items here
if (post["type"] == "image")
result = UsersOnInstagram.addInstagramPost(strPtId, HttpUtility.UrlEncode(post["caption"]["text"]), post["images"]["standard_resolution"]["url"], UnixTimeStampToDateTime(Convert.ToDouble(post["created_time"])), null, post["type"]);
else if (post["type"] == "video")
result = objUserInsta.addInstagramPost(HttpUtility.UrlEncode(post["caption"]["text"]), strPtId, post["images"]["standard_resolution"]["url"], UnixTimeStampToDateTime(Convert.ToDouble(post["created_time"])), post["videos"]["standard_resolution"]["url"], post["type"]);
}
}
}
Response.End();
}
This is my API method:
public static string GetRecentPost(string instagramaccessid, string instagramaccesstoken)
{
Double MAX_TIMESTAMP = DateTimeToUnixTimestamp(DateTime.Today.AddDays(-1));
Double MIN_TIMESTAMP = DateTimeToUnixTimestamp(DateTime.Today.AddDays(-2));
string url = "https://api.instagram.com/v1/users/" + instagramaccessid + "/media/recent?access_token=" + instagramaccesstoken + "&min_timestamp=" + MIN_TIMESTAMP + "&max_timestamp=" + MAX_TIMESTAMP;
var webClient = new System.Net.WebClient();
string d = webClient.DownloadString(url);
return d;
}
If anyone knows how to do this, please let me know.
I've got a List of Document
public class Document
{
public string[] fullFilePath;
public bool isPatch;
public string destPath;
public Document() { }
public Document(string[] fullFilePath, bool isPatch, string destPath)
{
this.fullFilePath = fullFilePath;
this.isPatch = isPatch;
this.destPath = destPath;
}
}
The fullFilePath should be a List or an Array of paths.
For example:
Document 1
---> C:\1.pdf
---> C:\2.pdf
Document 2
---> C:\1.pdf
---> C:\2.pdf
---> C:\3.pdf
etc.
My problem: if I use a string array, all Documents end up with "null" in their fullFilePath.
If I use a List for fullFilePath, all Documents end up with the same entries as the last Document.
Here is how the List is filled:
int docCount = -1;
int i = 0;
List<Document> Documents = new List<Document>();
string[] sourceFiles = new string[1];
foreach (string file in filesCollected)
{
string bc;
string bcValue;
if (Settings.Default.barcodeEngine == "Leadtools")
{
bc = BarcodeReader.ReadBarcodeSymbology(file);
bcValue = "PatchCode";
}
else
{
bc = BarcodeReader.ReadBacrodes(file);
bcValue = "009";
}
if (bc == bcValue)
{
if(Documents.Count > 0)
{
Array.Clear(sourceFiles, 0, sourceFiles.Length);
Array.Resize<string>(ref sourceFiles, 1);
i = 0;
}
sourceFiles[i] = file ;
i++;
Array.Resize<string>(ref sourceFiles, i + 1);
Documents.Add(new Document(sourceFiles, true,""));
docCount++;
}
else
{
if (Documents.Count > 0)
{
sourceFiles[i] = file;
i++;
Array.Resize<string>(ref sourceFiles, i + 1);
Documents[docCount].fullFilePath = sourceFiles;
}
}
}
You are using the same instance of the array for every document. The instance is updated with a new list of files on every inner loop iteration, but an array is a reference to an area of memory (an oversimplification, I know, but enough for the purposes of this answer), and if you change the content of that area of memory you change it for every document.
You need to create a new instance of the source-files array for every new document you add to your documents list. Moreover, when you are not certain of the number of elements you want in the array, it is a lot better to use a generic List and remove all the code that handles resizing the array.
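To see the aliasing in isolation, here is a minimal stand-alone sketch (the names are illustrative only):
var shared = new[] { "1.pdf" };     // one array instance...
var doc1Files = shared;             // ...referenced by "document 1"
var doc2Files = shared;             // ...and by "document 2"
shared[0] = "other.pdf";            // a single write...
Console.WriteLine(doc1Files[0]);    // ...is visible through every reference: "other.pdf"
Console.WriteLine(doc2Files[0]);    // "other.pdf" as well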
First, change the class definition:
public class Document
{
public List<string> fullFilePath;
public bool isPatch;
public string destPath;
public Document() { }
public Document(List<string> fullFilePath, bool isPatch, string destPath)
{
this.fullFilePath = fullFilePath;
this.isPatch = isPatch;
this.destPath = destPath;
}
}
And now change your inner loop to
foreach (string file in filesCollected)
{
string bc;
string bcValue;
....
if (bc == bcValue)
{
List<string> files = new List<string>();
files.Add(file);
Documents.Add(new Document(files, true, ""));
docCount++;
}
else
Documents[docCount].fullFilePath.Add(file);
}
Notice that when you need to add a new Document, I build a new List<string>, add the current file, and pass everything to the constructor (in reality this should be moved directly inside the constructor of the Document class). When you want to add just a new file, you can add it directly to the public fullFilePath field.
If the handling of the files is moved inside the Document class, it could be rewritten as:
public class Document
{
public List<string> fullFilePath;
public bool isPatch;
public string destPath;
public Document()
{
// Every constructor initializes the List internally
fullFilePath = new List<string>();
}
public Document(string aFile, bool isPatch, string destPath)
{
// Every constructor initializes the List internally
fullFilePath = new List<string>();
this.fullFilePath.Add(aFile);
this.isPatch = isPatch;
this.destPath = destPath;
}
public void AddFile(string aFile)
{
this.fullFilePath.Add(aFile);
}
}
Of course, now in your calling code you pass only the new file, or call AddFile, without needing to check whether the list is initialized.
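With that version of the class, the calling loop might shrink to something like this (a sketch; it indexes the last document directly instead of keeping the docCount variable):
if (bc == bcValue)
{
    // the constructor seeds the internal list with the first file
    Documents.Add(new Document(file, true, ""));
}
else
{
    // append the file to the most recently added document
    Documents[Documents.Count - 1].AddFile(file);
}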
The issue should be here:
string[] sourceFiles = new string[1];
If you move this line of code into your foreach, it should solve the problem, because at the moment you always use the same variable, and therefore the same reference, for every document:
int docCount = -1;
int i = 0;
List<Document> Documents = new List<Document>();
foreach (string file in filesCollected)
{
string[] sourceFiles = new string[1];
string bc;
string bcValue;
if (Settings.Default.barcodeEngine == "Leadtools")
{
bc = BarcodeReader.ReadBarcodeSymbology(file);
bcValue = "PatchCode";
}
else
{
bc = BarcodeReader.ReadBacrodes(file);
bcValue = "009";
}
if (bc == bcValue)
{
if(Documents.Count > 0)
{
Array.Clear(sourceFiles, 0, sourceFiles.Length);
Array.Resize<string>(ref sourceFiles, 1);
i = 0;
}
sourceFiles[i] = file ;
i++;
Array.Resize<string>(ref sourceFiles, i + 1);
Documents.Add(new Document(sourceFiles, true,""));
docCount++;
}
else
{
if (Documents.Count > 0)
{
sourceFiles[i] = file;
i++;
Array.Resize<string>(ref sourceFiles, i + 1);
Documents[docCount].fullFilePath = sourceFiles;
}
}
}
OK, I am having a major problem at the moment: my software is using an extremely high amount of RAM. I am using a lot of HtmlAgilityPack.HtmlDocument objects with big page sources.
However, all of the objects are used inside static functions, and HtmlAgilityPack.HtmlDocument isn't IDisposable.
So do I need to set every variable explicitly to null, even if they are inside static functions?
For example, do I need to set the variables to null at the end of the function below?
The variable I am asking about is lstDrwList. Or, since it is a local, will it get collected automatically?
Should I call the garbage collector explicitly?
This is a C# .NET 4.5 WPF application.
private static void func_CheckWaitingToProcessPages(Object state)
{
ParallelOptions myOptions = new ParallelOptions();
myOptions.MaxDegreeOfParallelism = PublicSettings.ir_How_Many_Tasks_For_Per_Pages_Process;
List<DataRow> lstDrwList = new List<DataRow>();
using (DataTable dtMyTable = DbConnection.db_Select_DataTable(srSelectTopProcessPagesQuery))
{
foreach (DataRow drw in dtMyTable.Rows)
{
lstDrwList.Add(drw);
}
}
Parallel.ForEach(lstDrwList, myOptions, drw =>
{
process_Given_Page(drw);
});
}
The problem is found; the issue is how to fix it. This is what happens in just 10 seconds (measured with the Visual Studio profiler). Here is the full class that causes this huge memory leak:
using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.Data;
using System.Linq;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
namespace doktora_tez_projesi_crawler_program
{
public static class PagesProcessor
{
private static Timer _timer;
private static int howManySeconds = 10;
public static void func_StartCrawlingWaitingUrls()
{
PublicStaticFunctions.AddMsgToEvents("Checking waiting to process crawled urls process started every " + howManySeconds + " seconds!");
_timer = new Timer(func_CheckWaitingToProcessPages, null, PublicSettings.irTimers_Delayed_Start_MiliSeconds, howManySeconds * 1000);
}
private static string srSelectTopProcessPagesQuery = " select top 100 cl_IdUrl,cl_RooSiteId,cl_CrawlSource,cl_CrawlOrgUrl from tblCrawlUrls " +
" where cl_PageProcessed=0 and cl_TotalCrawlTimes > 0 " +
" order by cl_LastProcessDate asc";
private static void func_CheckWaitingToProcessPages(Object state)
{
ParallelOptions myOptions = new ParallelOptions();
myOptions.MaxDegreeOfParallelism = PublicSettings.ir_How_Many_Tasks_For_Per_Pages_Process;
List<DataRow> lstDrwList = new List<DataRow>();
using (DataTable dtMyTable = DbConnection.db_Select_DataTable(srSelectTopProcessPagesQuery))
{
foreach (DataRow drw in dtMyTable.Rows)
{
lstDrwList.Add(drw);
}
}
Parallel.ForEach(lstDrwList, myOptions, drw =>
{
process_Given_Page(drw);
});
}
private class csProductFeatures
{
public string srProductRootSiteId = "null", srProductTitle = "null", srProductCode = "null", srProductImageLink = "null";
public string srProductDetailedExplanation = "null", srProductFeatures = "null", srCrawledOrgUrl = "null", srProductIdCode = "null";
public bool blPossibleProductPage = false, blFreeCargo = false, blProductPage = true;
public List<string> lstProductCategories = new List<string>();
public int irProductPrice = 0;
public List<csProductComments> lstProductComments = new List<csProductComments>();
public List<KeyValuePair<string, string>> lstProductFeatures = new List<KeyValuePair<string, string>>();
}
private class csProductComments
{
public string srCommentTitle = "null", srCommentPros = "null", srCommentCons = "null";
public int irCommentScore = 0; //0 = negative 5=full star
}
private static void process_Given_Page(DataRow drw)
{
csProductFeatures temp_ProductFeatures = new csProductFeatures();
temp_ProductFeatures.srProductRootSiteId = drw["cl_RooSiteId"].ToString();
temp_ProductFeatures.srCrawledOrgUrl = drw["cl_CrawlOrgUrl"].ToString();
HtmlDocument hdMyDoc = new HtmlDocument();//nulled
hdMyDoc.LoadHtml(drw["cl_CrawlSource"].ToString());
bool blBreakLoop = false;
foreach (var vrVariable in PublicVariables.dicRootSites[temp_ProductFeatures.srProductRootSiteId].lstRootSiteIdentifiers)
{
if (vrVariable.srHtmlObjectType != "link")
{
HtmlNodeCollection hdNodes;
if (vrVariable.blSelectMultipleNodes == false)
hdNodes = hdMyDoc.DocumentNode.SelectNodes(string.Format("//{0}[@{1}='{2}']", vrVariable.srHtmlObjectType,
vrVariable.srHtmlObjectTypeIdentifier, vrVariable.srHtmlObjectTypeName));
else
hdNodes = hdMyDoc.DocumentNode.SelectNodes(string.Format("//{0}[@{1}='{2}']//{3}", vrVariable.srHtmlObjectType,
vrVariable.srHtmlObjectTypeIdentifier, vrVariable.srHtmlObjectTypeName, vrVariable.srHtmlSubIdentifierType));
if (hdNodes == null && vrVariable.srIndetifierType == "ProductTitle")
{
blBreakLoop = true;
temp_ProductFeatures.blProductPage = false;
continue;
}
if (blBreakLoop == true)
break;
if (hdNodes == null)
continue;
string sr_Node_Required_Val = "null";
if (hdNodes[0].InnerText != null)
sr_Node_Required_Val = hdNodes[0].InnerText;
string srLinkVal = "null";
if (vrVariable.srHtmlObjectType == "a" && hdNodes[0].Attributes != null)
{
if (hdNodes[0].Attributes["href"] != null)
{
srLinkVal = PublicStaticFunctions.Return_Absolute_Url(hdNodes[0].Attributes["href"].Value, temp_ProductFeatures.srCrawledOrgUrl);
}
}
if (vrVariable.blGetValue == true)
{
if (hdNodes[0].Attributes != null)
if (hdNodes[0].Attributes["value"] != null)
sr_Node_Required_Val = hdNodes[0].Attributes["value"].Value;
}
sr_Node_Required_Val = sr_Node_Required_Val.Trim();
switch (vrVariable.srIndetifierType)
{
case "ProductPage":
temp_ProductFeatures.blPossibleProductPage = true;
break;
case "ProductTitle":
temp_ProductFeatures.srProductTitle = sr_Node_Required_Val;
break;
case "ProductCode":
temp_ProductFeatures.srProductCode = sr_Node_Required_Val;
break;
case "ProductCargo":
temp_ProductFeatures.blFreeCargo = true;
break;
case "ProductCategories":
temp_ProductFeatures.lstProductCategories = func_Return_Product_Categories(hdNodes);
break;
case "ProductPrice":
temp_ProductFeatures.irProductPrice = func_Return_Product_Price(sr_Node_Required_Val, temp_ProductFeatures.srProductRootSiteId);
break;
case "ProductImage":
temp_ProductFeatures.srProductImageLink = srLinkVal;
break;
case "ProductIdCode":
temp_ProductFeatures.srProductIdCode = sr_Node_Required_Val;
break;
}
}
if (vrVariable.srHtmlObjectType == "link")
{
string srLinkToFetch = vrVariable.srHtmlObjectTypeIdentifier;
if (vrVariable.blUsesProductIdCode == true)
{
srLinkToFetch = string.Format(srLinkToFetch, temp_ProductFeatures.srProductIdCode);
}
string srFetchResult = CrawlGivenUrl.func_fetch_Page(srLinkToFetch);
string srResultToAssign = "null";
if (srFetchResult == PublicSettings.srCrawlFailedMessage)
{
srResultToAssign = srFetchResult;
}
else
{
HtmlDocument temp_HdDocument = new HtmlDocument();//nulled
temp_HdDocument.LoadHtml(srFetchResult);
if (temp_HdDocument.DocumentNode != null)
if (temp_HdDocument.DocumentNode.InnerText != null)
srResultToAssign = temp_HdDocument.DocumentNode.InnerText;
temp_HdDocument = null;
}
switch (vrVariable.srIndetifierType)
{
case "ProductExplanation":
temp_ProductFeatures.srProductDetailedExplanation = srResultToAssign;
break;
case "ProductFeatures":
temp_ProductFeatures.lstProductFeatures = func_Return_Product_Features(temp_ProductFeatures.srProductRootSiteId, srFetchResult, temp_ProductFeatures.srCrawledOrgUrl);
break;
}
}
}
if (temp_ProductFeatures.blProductPage == true)
{
string asdas = "";
}
hdMyDoc = null;
}
private static List<string> func_Return_Product_Categories(HtmlNodeCollection hdNodeCollection)
{
List<string> lstCategories = new List<string> { };
foreach (HtmlNode hdNode in hdNodeCollection)
{
if (hdNode.InnerText != null)
{
lstCategories.Add(hdNode.InnerText);
}
}
return lstCategories;
}
private static int func_Return_Product_Price(string srPriceText, string srRootSiteId)
{
int irPrice = 0;
srPriceText = srPriceText.Replace(PublicVariables.dicRootSites[srRootSiteId].srPriceDelimeter, "");
if (srPriceText.Contains(PublicVariables.dicRootSites[srRootSiteId].srPriceIgnoreDelimeter) == true)
{
srPriceText = srPriceText.Substring(0, srPriceText.IndexOf(PublicVariables.dicRootSites[srRootSiteId].srPriceIgnoreDelimeter));
}
Int32.TryParse(srPriceText, out irPrice);
return irPrice;
}
private static List<KeyValuePair<string, string>> func_Return_Product_Features(string srRootSiteId, string srPageSource, string srCrawlUrl)
{
List<KeyValuePair<string, string>> lstFoundFeatures = new List<KeyValuePair<string, string>>();
if (srPageSource == PublicSettings.srCrawlFailedMessage)
return lstFoundFeatures;
HtmlDocument temp_HdDocument = new HtmlDocument();//nulled
temp_HdDocument.LoadHtml(srPageSource);
List<string> lstFeatureTitles = new List<string>();
List<string> lstFeatureDescriptions = new List<string>();
foreach (var vrVariable in PublicVariables.dicRootSites[srRootSiteId].lstRootSitesFeaturesIdentifiers)
{
if (vrVariable.blPerFeatureIdentifier == true)
{
HtmlNodeCollection hdNodes = temp_HdDocument.DocumentNode.SelectNodes(string.Format("//{0}[@{1}='{2}']", vrVariable.srHtmlObjectType,
vrVariable.srHtmlObjectIdentifier, vrVariable.srHtmlObjectIdentifierName));
if (hdNodes != null)
foreach (var vrNewVariable in PublicVariables.dicRootSites[srRootSiteId].lstRootSitesFeaturesIdentifiers)
{
if (vrNewVariable.blPerFeatureIdentifier == false)
{
foreach (HtmlNode hdTempNode in hdNodes)
{
var vrTempNewNode = hdTempNode.SelectSingleNode(string.Format("//{0}[@{1}='{2}']", vrVariable.srHtmlObjectType,
vrVariable.srHtmlObjectIdentifier, vrVariable.srHtmlObjectIdentifierName));
if (vrTempNewNode != null)
if (vrTempNewNode.InnerText != null)
{
string srNodeFeature = vrTempNewNode.InnerText.Trim();
switch (vrVariable.srWhichFeatureIdentifier)
{
case "FeatureTitle":
lstFeatureTitles.Add(srNodeFeature);
break;
case "FeatureDescription":
lstFeatureDescriptions.Add(srNodeFeature);
break;
}
}
}
}
}
break;
}
}
temp_HdDocument = null;
if (lstFeatureDescriptions.Count != lstFeatureTitles.Count)
{
ErrorLogger.LogError("found features count not equal to features description count crawled url: " + srCrawlUrl);
return lstFoundFeatures;
}
for (int i = 0; i < lstFeatureDescriptions.Count; i++)
{
KeyValuePair<string, string> myKeyValPair = new KeyValuePair<string, string>(lstFeatureTitles[i], lstFeatureDescriptions[i]);
lstFoundFeatures.Add(myKeyValPair);
}
return lstFoundFeatures;
}
}
}
No, you don't need to set the variables to null, in either static or instance methods. Local variables inside a method (even a static method) live in that method's stack frame, so they go out of scope when the method returns, and the objects they referenced become eligible for garbage collection. And generally, calling the garbage collector explicitly isn't good practice.
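A minimal sketch that demonstrates this with a WeakReference (an illustration, not HtmlAgilityPack-specific; note that in Debug builds the JIT can extend local lifetimes, so run it in Release to see the expected result):
using System;

static class GcScopeDemo
{
    static WeakReference AllocateLocally()
    {
        var buffer = new byte[1024 * 1024];  // local inside a static method
        return new WeakReference(buffer);    // track the object without keeping it alive
        // no "buffer = null" needed: the local goes out of scope right here
    }

    static void Main()
    {
        var wr = AllocateLocally();
        GC.Collect();
        GC.WaitForPendingFinalizers();
        GC.Collect();
        Console.WriteLine("Still alive: " + wr.IsAlive); // typically False
    }
}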
Is there a way to extract plain text from a PDF file with PDFsharp?
I don't want to use iTextSharp because of its license.
I took Sergio's answer and made some extension methods. I also changed the accumulation of strings into an iterator.
using System.Collections.Generic;
using PdfSharp.Pdf;
using PdfSharp.Pdf.Content;
using PdfSharp.Pdf.Content.Objects;

public static class PdfSharpExtensions
{
public static IEnumerable<string> ExtractText(this PdfPage page)
{
var content = ContentReader.ReadContent(page);
var text = content.ExtractText();
return text;
}
public static IEnumerable<string> ExtractText(this CObject cObject)
{
if (cObject is COperator)
{
var cOperator = cObject as COperator;
if (cOperator.OpCode.Name == OpCodeName.Tj.ToString() ||
cOperator.OpCode.Name == OpCodeName.TJ.ToString())
{
foreach (var cOperand in cOperator.Operands)
foreach (var txt in ExtractText(cOperand))
yield return txt;
}
}
else if (cObject is CSequence)
{
var cSequence = cObject as CSequence;
foreach (var element in cSequence)
foreach (var txt in ExtractText(element))
yield return txt;
}
else if (cObject is CString)
{
var cString = cObject as CString;
yield return cString.Value;
}
}
}
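For reference, a usage sketch of the extension above might look like this ("file.pdf" is a placeholder path, and the ReadOnly open mode is an assumption):
// Open an existing PDF read-only and stream the text fragments of every page.
using (var document = PdfSharp.Pdf.IO.PdfReader.Open("file.pdf", PdfSharp.Pdf.IO.PdfDocumentOpenMode.ReadOnly))
{
    foreach (PdfPage page in document.Pages)
        foreach (var fragment in page.ExtractText())
            Console.Write(fragment);
}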
I have implemented it in a way somewhat similar to how David did it.
Here is my code:
...
{
// ....
var page = document.Pages[1];
CObject content = ContentReader.ReadContent(page);
var extractedText = ExtractText(content);
// ...
}
private IEnumerable<string> ExtractText(CObject cObject)
{
var textList = new List<string>();
if (cObject is COperator)
{
var cOperator = cObject as COperator;
if (cOperator.OpCode.Name == OpCodeName.Tj.ToString() ||
cOperator.OpCode.Name == OpCodeName.TJ.ToString())
{
foreach (var cOperand in cOperator.Operands)
{
textList.AddRange(ExtractText(cOperand));
}
}
}
else if (cObject is CSequence)
{
var cSequence = cObject as CSequence;
foreach (var element in cSequence)
{
textList.AddRange(ExtractText(element));
}
}
else if (cObject is CString)
{
var cString = cObject as CString;
textList.Add(cString.Value);
}
return textList;
}
PDFsharp provides all the tools needed to extract text from a PDF. Use the ContentReader class to access the commands within each page's content stream and extract the strings from the TJ/Tj operators.
I've uploaded a simple implementation to github.