I am trying to use the iText7 library, but for some reason I cannot split the pages into a list of strings (one string per page).
Instead, I am getting a list of pages like this:
1,1+2,1+2+3,1+2+3+4
/// <summary>Text extracted from the PDF; one entry is appended per processed page.</summary>
public List<string> PdfPages;

/// <summary>Extraction strategy handed to iText when a page is read.</summary>
private ITextExtractionStrategy _Strategy;

/// <summary>
/// Creates the extractor and immediately extracts the text of <paramref name="pdf"/>
/// into <see cref="PdfPages"/>.
/// </summary>
/// <param name="pdf">Uploaded PDF file to read.</param>
/// <param name="strategy">
/// Strategy to use; when omitted (null) a <c>SimpleTextExtractionStrategy</c> is created.
/// </param>
public PdfExtractor(IFormFile pdf, ITextExtractionStrategy? strategy = default)
{
    if (strategy == null)
    {
        strategy = new SimpleTextExtractionStrategy();
    }

    PdfPages = new List<string>();
    _Strategy = strategy;

    // Extraction happens eagerly, so PdfPages is populated once construction finishes.
    ExtractTextFromPages(pdf);
}
/// <summary>
/// Reads every page of <paramref name="pdf"/> and appends the text extracted from each
/// page to <see cref="PdfPages"/>.
/// </summary>
/// <param name="pdf">Uploaded PDF file whose read stream is parsed with iText.</param>
private void ExtractTextFromPages(IFormFile pdf)
{
    using (var stream = pdf.OpenReadStream())
    using (var reader = new PdfReader(stream))
    using (var pdfDoc = new PdfDocument(reader))
    {
        // iText page numbers are 1-based and GetNumberOfPages() is the number of the
        // last page, so the upper bound must be inclusive or the final page is skipped.
        int pageCount = pdfDoc.GetNumberOfPages();
        for (int index = 1; index <= pageCount; index++)
        {
            // NOTE(review): the single _Strategy instance is shared by all pages. The
            // surrounding discussion reports that this makes each entry also contain the
            // text of the preceding pages; a fresh strategy per page avoids that.
            string pageText = PdfTextExtractor.GetTextFromPage(pdfDoc.GetPage(index), _Strategy);
            PdfPages.Add(pageText);
        }
    }
}
Does anyone know how to correct that?
The problem, as @mkl mentioned in the comment below, was that I did not create a new ITextExtractionStrategy object for each page. Once I did that, everything worked like a charm, without the need to save files anywhere.
// Extract the text of each page separately. The using statements close the document,
// the reader and the stream even when extraction throws.
using (var stream = pdf.OpenReadStream())
using (var reader = new PdfReader(stream))
using (var pdfDoc = new PdfDocument(reader))
{
    // Pages are numbered 1..GetNumberOfPages() inclusive; a strict '<' bound would
    // silently drop the last page.
    int pageCount = pdfDoc.GetNumberOfPages();
    for (int index = 1; index <= pageCount; index++)
    {
        // A new strategy per page keeps text from earlier pages out of this page's result.
        ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
        string pageText = PdfTextExtractor.GetTextFromPage(pdfDoc.GetPage(index), strategy);
        PdfPages.Add(pageText);
    }
}
Related
Here is a PDF sample with texts on a layer. If I turn off the layer, all the text belonging to this layer becomes invisible as well.
I need to get all the texts from a specific layer. Does anybody know how to achieve this?
Here is my sample PDF file: https://drive.google.com/file/d/1TcRyE8MQRhw-j89BbovV7fFIwZ0yks0N/view?usp=sharing
My code can get all the texts, but I don't know how to get only the texts that belong to a specific layer.
/// <summary>
/// Stamps <paramref name="inPutPDF"/> into <paramref name="outPutPDF"/>, adding a JavaScript
/// push button over every extracted text chunk for which GetTextContained returns a
/// non-empty name when checked against a viewport's hyperlink keys.
/// </summary>
/// <param name="inPutPDF">Path of the PDF to read.</param>
/// <param name="outPutPDF">Path of the stamped PDF to create.</param>
/// <param name="ViewportInfos">Viewports whose Hyperlinks keys are matched against the text.</param>
public CreateHyperLinkButton(string inPutPDF, string outPutPDF, List<ViewPortInfo> ViewportInfos)
{
    using (FileStream pdf = new FileStream(outPutPDF, FileMode.Create))
    {
        using (PdfReader pdfReader = new iTextSharp.text.pdf.PdfReader(inPutPDF))
        {
            using (PdfStamper pdfStamper = new iTextSharp.text.pdf.PdfStamper(pdfReader, pdf))
            {
                // Every text chunk found in the input document.
                List<TextRenderInfo> textChunks = GetAllTextInfor(inPutPDF, pdfReader);
                foreach (TextRenderInfo chunk in textChunks)
                {
                    string candidate = chunk.GetText().Trim();

                    // Look the chunk up in each viewport's hyperlink names.
                    foreach (var viewport in ViewportInfos)
                    {
                        string linkName = GetTextContained(viewport.Hyperlinks.Keys.ToList(), candidate);
                        if (string.IsNullOrEmpty(linkName))
                        {
                            continue;
                        }

                        iTextSharp.text.Rectangle bounds = GetRectOfText(chunk);
                        var button = new iTextSharp.text.pdf.PushbuttonField(pdfStamper.Writer, bounds, linkName);
                        iTextSharp.text.pdf.PdfAnnotation annotation = button.Field;

                        // Releasing the mouse button over the field runs mapView(<name>).
                        string script = "mapView('" + linkName + "');";
                        annotation.SetAdditionalActions(
                            iTextSharp.text.pdf.PdfName.U,
                            iTextSharp.text.pdf.PdfAction.JavaScript(script, pdfStamper.Writer));

                        // The annotation is always added to page 1.
                        pdfStamper.AddAnnotation(annotation, 1);
                    }
                }
                pdfStamper.Close();
            }
            pdfReader.Close();
        }
        pdf.Close();
    }
}
/// <summary>
/// Runs the text extractor over every page of <paramref name="pdfReader"/> with a single
/// collecting strategy and returns the render infos it gathered.
/// </summary>
/// <param name="inPutPDF">Not used by this method.</param>
/// <param name="pdfReader">Open reader for the document to scan.</param>
/// <returns>The strategy's <c>textList</c> after all pages were processed.</returns>
private static List<TextRenderInfo> GetAllTextInfor(string inPutPDF, PdfReader pdfReader)
{
    var collector = new TextExtractionStrategy();

    // Page numbers passed to the extractor run from 1 to NumberOfPages inclusive.
    int pageNumber = 1;
    while (pageNumber <= pdfReader.NumberOfPages)
    {
        PdfTextExtractor.GetTextFromPage(pdfReader, pageNumber, collector);
        pageNumber++;
    }

    return collector.textList;
}
/// <summary>
/// Extraction strategy that records every <c>TextRenderInfo</c> it is given instead of
/// building a text string.
/// </summary>
public class TextExtractionStrategy : ITextExtractionStrategy
{
    /// <summary>Render infos received so far, in the order RenderText was called.</summary>
    public List<TextRenderInfo> textList = new List<TextRenderInfo>();

    /// <summary>No action is taken when a text block begins.</summary>
    public void BeginTextBlock() { }

    /// <summary>No action is taken when a text block ends.</summary>
    public void EndTextBlock() { }

    /// <summary>Always returns an empty string; results are read from <see cref="textList"/>.</summary>
    public string GetResultantText() => "";

    /// <summary>Images are ignored.</summary>
    public void RenderImage(ImageRenderInfo renderInfo) { }

    /// <summary>Stores <paramref name="renderInfo"/> in <see cref="textList"/>.</summary>
    public void RenderText(TextRenderInfo renderInfo) => textList.Add(renderInfo);
}
You could use IronPDF for this purpose. Parse/open the PDF as described in the docs on their site and examine it in the debugger; then you can develop some code to retrieve the text from that layer only.
Creating a PDF document from the stream of a HTTP request.
// Controller that converts a URL to PDF with an HtmlToPdf converter and writes the
// result to the HTTP response.
public class HomeController : Controller {
// Creates the converter and applies the page options from InitializeConverter.
public HomeController() {
converter = new HtmlToPdf();
InitializeConverter();
}
// Converts a fixed local certificate URL and sends the PDF to the client.
public void Index() {
ConvertHtmlToPdf(new Uri("http://localhost:52328/CertificateOfOrigin?noCertificate=2691"));
}
// Converts the page at toConvert to PDF, tries to add a page-numbering footer, and
// streams the document to the response as "certificate-of-origin.pdf".
// Throws ArgumentNullException when toConvert is null.
public void ConvertHtmlToPdf(Uri toConvert) {
if(toConvert == null) throw new ArgumentNullException(nameof(toConvert));
using(var stream =new MemoryStream()) {
var doc = converter.ConvertUrl(toConvert.AbsoluteUri);
// The doc.AddTemplate returns a PdfTemplate and should be assigned to doc.Footer
doc.Footer = doc.AddTemplate(doc.Pages[0].ClientRectangle.Width, 100);
var pageNumbering = new PdfTextElement(20, 50, "Page {page_number} of {total_pages}", doc.Fonts[0], Color.Black);
// Once the template is defined, it is added to the doc Footer. But...
doc.Footer.Add(pageNumbering); // Reported to throw a null-reference exception: doc.Footer is still null here.
// NOTE(review): `template` is not declared anywhere in the visible code - confirm what was intended.
doc.Footer = template;
doc.Save(stream);
doc.Close();
// The saved bytes are copied into a second stream, which is then written to the response.
using(var ms = new MemoryStream(stream.ToArray())) {
Response.AddHeader("content-disposition", "filename=certificate-of-origin.pdf");
Response.ContentType = "application/pdf";
ms.CopyTo(Response.OutputStream);
Response.End();
Response.Close();
}
}
}
// Sets all four page margins to 0 and the page size to Letter.
private void InitializeConverter() {
converter.Options.MarginBottom = 0;
converter.Options.MarginLeft = 0;
converter.Options.MarginRight = 0;
converter.Options.MarginTop = 0;
converter.Options.PdfPageSize = PdfPageSize.Letter;
}
// Converter shared by all conversions performed by this controller instance.
private readonly HtmlToPdf converter;
}
I put a breakpoint and quick watched the return of doc.AddTemplate method call and it returns an actual PdfTemplate no problem!
Other than that, everything works fine. Document is generated no problem, except when I uncomment the page numbering because the doc.Footer remains null despite its assignment.
Could it be a bug? Idk.
You need to either set the header/footer content before the conversion, like here:
https://selectpdf.com/demo-mvc/HtmlToPdfHeadersAndFooters
using System;
using System.Web.Mvc;
namespace SelectPdf.Samples.Controllers
{
    /// <summary>
    /// Sample controller that converts a URL to PDF with an HTML header and footer,
    /// optional page numbering, and a custom header on page 3.
    /// </summary>
    public class HtmlToPdfHeadersAndFootersController : Controller
    {
        /// <summary>Header/footer height used when the posted value is missing or not a number.</summary>
        private const int DefaultSectionHeight = 50;

        // GET: HtmlToPdfHeadersAndFooters
        public ActionResult Index()
        {
            return View();
        }

        /// <summary>
        /// Builds the PDF from the posted form values and returns it as "Document.pdf".
        /// </summary>
        /// <param name="collection">
        /// Posted form; reads the Chk*/Txt* fields for header, footer, page numbering and URL.
        /// </param>
        /// <returns>A file result containing the generated PDF bytes.</returns>
        [HttpPost]
        public ActionResult SubmitAction(FormCollection collection)
        {
            // get parameters
            string headerUrl = Server.MapPath("~/files/header.html");
            string footerUrl = Server.MapPath("~/files/footer.html");

            bool showHeaderOnFirstPage = collection["ChkHeaderFirstPage"] == "on";
            bool showHeaderOnOddPages = collection["ChkHeaderOddPages"] == "on";
            bool showHeaderOnEvenPages = collection["ChkHeaderEvenPages"] == "on";

            // TryParse replaces the former catch-all around Convert.ToInt32: a bad or
            // missing value falls back to the default without hiding unrelated exceptions.
            int headerHeight = ParseHeightOrDefault(collection["TxtHeaderHeight"], DefaultSectionHeight);

            bool showFooterOnFirstPage = collection["ChkFooterFirstPage"] == "on";
            bool showFooterOnOddPages = collection["ChkFooterOddPages"] == "on";
            bool showFooterOnEvenPages = collection["ChkFooterEvenPages"] == "on";

            int footerHeight = ParseHeightOrDefault(collection["TxtFooterHeight"], DefaultSectionHeight);

            // instantiate a html to pdf converter object
            HtmlToPdf converter = new HtmlToPdf();

            // header settings
            converter.Options.DisplayHeader = showHeaderOnFirstPage ||
                showHeaderOnOddPages || showHeaderOnEvenPages;
            converter.Header.DisplayOnFirstPage = showHeaderOnFirstPage;
            converter.Header.DisplayOnOddPages = showHeaderOnOddPages;
            converter.Header.DisplayOnEvenPages = showHeaderOnEvenPages;
            converter.Header.Height = headerHeight;

            PdfHtmlSection headerHtml = new PdfHtmlSection(headerUrl);
            headerHtml.AutoFitHeight = HtmlToPdfPageFitMode.AutoFit;
            converter.Header.Add(headerHtml);

            // footer settings
            converter.Options.DisplayFooter = showFooterOnFirstPage ||
                showFooterOnOddPages || showFooterOnEvenPages;
            converter.Footer.DisplayOnFirstPage = showFooterOnFirstPage;
            converter.Footer.DisplayOnOddPages = showFooterOnOddPages;
            converter.Footer.DisplayOnEvenPages = showFooterOnEvenPages;
            converter.Footer.Height = footerHeight;

            PdfHtmlSection footerHtml = new PdfHtmlSection(footerUrl);
            footerHtml.AutoFitHeight = HtmlToPdfPageFitMode.AutoFit;
            converter.Footer.Add(footerHtml);

            // add page numbering element to the footer
            if (collection["ChkPageNumbering"] == "on")
            {
                // page numbers can be added using a PdfTextSection object
                PdfTextSection text = new PdfTextSection(0, 10,
                    "Page: {page_number} of {total_pages}  ",
                    new System.Drawing.Font("Arial", 8));
                text.HorizontalAlign = PdfTextHorizontalAlign.Right;
                converter.Footer.Add(text);
            }

            // create a new pdf document converting an url
            PdfDocument doc = converter.ConvertUrl(collection["TxtUrl"]);

            byte[] pdf;
            try
            {
                // custom header on page 3
                if (doc.Pages.Count >= 3)
                {
                    PdfPage page = doc.Pages[2];

                    PdfTemplate customHeader = doc.AddTemplate(
                        page.PageSize.Width, headerHeight);
                    PdfHtmlElement customHtml = new PdfHtmlElement(
                        "<div><b>This is the custom header that will " +
                        "appear only on page 3!</b></div>",
                        string.Empty);
                    customHeader.Add(customHtml);

                    page.CustomHeader = customHeader;
                }

                // save pdf document
                pdf = doc.Save();
            }
            finally
            {
                // close pdf document even when building or saving it fails
                doc.Close();
            }

            // return resulted pdf document
            FileResult fileResult = new FileContentResult(pdf, "application/pdf");
            fileResult.FileDownloadName = "Document.pdf";
            return fileResult;
        }

        /// <summary>
        /// Parses <paramref name="rawValue"/> as an integer, returning
        /// <paramref name="fallback"/> when it is null, empty or not a valid number.
        /// </summary>
        private static int ParseHeightOrDefault(string rawValue, int fallback)
        {
            int parsed;
            return int.TryParse(rawValue, out parsed) ? parsed : fallback;
        }
    }
}
Or use this approach, to add headers/footers to an already generated pdf:
https://selectpdf.com/demo-mvc/ExistingPdfHeadersAndFooters
using System.Web.Mvc;
using System.Drawing;
namespace SelectPdf.Samples.Controllers
{
    /// <summary>
    /// Sample controller that adds an image header and a page-numbering footer to an
    /// existing PDF file after resizing its content to make room for them.
    /// </summary>
    public class ExistingPdfHeadersAndFootersController : Controller
    {
        // GET: ExistingPdfHeadersAndFooters
        public ActionResult Index()
        {
            return View();
        }

        /// <summary>
        /// Loads the sample PDF, adds the header and footer templates and returns the
        /// result as "Document.pdf".
        /// </summary>
        /// <param name="collection">Posted form; not read by this action.</param>
        /// <returns>A file result containing the modified PDF bytes.</returns>
        [HttpPost]
        public ActionResult SubmitAction(FormCollection collection)
        {
            // input files used by the sample
            string filePdf = Server.MapPath("~/files/selectpdf.pdf");
            string imgFile = Server.MapPath("~/files/logo.png");

            // load the document through the resize manager and widen the
            // top (90) and bottom (40) margins
            var resizer = new PdfResizeManager();
            resizer.Load(filePdf);
            resizer.PageMargins = new PdfMargins(0, 0, 90, 40);

            // the resized document receives the header and footer
            PdfDocument doc = resizer.GetDocument();

            // header: 90 points high, holding the logo image
            PdfTemplate header = doc.AddTemplate(doc.Pages[0].ClientRectangle.Width, 90);
            var logo = new PdfImageElement(10, 10, imgFile);
            header.Add(logo);

            // footer: 40 points high, placed at the bottom of the client rectangle
            var footerBounds = new RectangleF(
                0,
                doc.Pages[0].ClientRectangle.Height - 40,
                doc.Pages[0].ClientRectangle.Width,
                40);
            PdfTemplate footer = doc.AddTemplate(footerBounds);

            // footer text in 12-point Helvetica, blue
            PdfFont footerFont = doc.AddFont(PdfStandardFont.Helvetica);
            footerFont.Size = 12;

            var pageLabel = new PdfTextElement(
                10,
                10,
                "Generated by SelectPdf. Page number {page_number} of {total_pages}.",
                footerFont);
            pageLabel.ForeColor = System.Drawing.Color.Blue;
            footer.Add(pageLabel);

            // serialize first, then release the resize manager
            byte[] pdf = doc.Save();
            resizer.Close();

            // hand the bytes back as a download
            return new FileContentResult(pdf, "application/pdf")
            {
                FileDownloadName = "Document.pdf"
            };
        }
    }
}
The best approach is the first, so try to move your footer setting before the conversion.
I'm trying to convert a file from XLS to XLSX using NPOI. As I'm not aware of an explicit conversion, I wrote this first implementation going through the rows and cells and copying from one to another:
// Converts the first sheet of the .xls workbook at xlsPath into a new .xlsx file and
// returns the path of the new file.
public string ConvertToXlsx(string xlsPath)
{
// NOTE(review): this FileStream is never disposed.
var oldWorkbook = new HSSFWorkbook(new FileStream(xlsPath, FileMode.Open));
var oldWorkSheet = oldWorkbook.GetSheetAt(0);
// NOTE(review): Replace rewrites every "xls" occurrence in the path, not just the extension.
var newExcelPath = xlsPath.Replace("xls", "xlsx");
using (var fileStream = new FileStream(newExcelPath, FileMode.Create))
{
var newWorkBook = new XSSFWorkbook();
// The sheet is constructed directly and then added to the workbook; the
// NullReferenceException discussed with this snippet is raised later, in CreateCell.
var newWorkSheet = new XSSFSheet();
newWorkBook.Add(newWorkSheet);
foreach (HSSFRow oldRow in oldWorkSheet)
{
var newRow = newWorkSheet.CreateRow(oldRow.RowNum);
for (int ii = oldRow.FirstCellNum; ii < oldRow.LastCellNum; ii++)
{
var newCell = newRow.CreateCell(ii);
// NOTE(review): this only rebinds the local variable; nothing is written into the new cell.
newCell = oldRow.Cells[ii];
}
}
newWorkBook.Write(fileStream);
}
return newExcelPath;
}
Yet, on the line `var newCell = newRow.CreateCell(ii);`, NPOI throws a NullReferenceException with the following stack trace:
at NPOI.XSSF.UserModel.XSSFCell..ctor(XSSFRow row, CT_Cell cell)
at NPOI.XSSF.UserModel.XSSFRow.CreateCell(Int32 columnIndex, CellType type)
at NPOI.XSSF.UserModel.XSSFRow.CreateCell(Int32 columnIndex)
at Ing2Ynab.Excel.IngExcelConverter.ConvertToXlsx(String xlsPath)
I don't understand why this is happening, as XSSFRow should be in charge of creating the CT_Cell that gets passed on to the XSSFCell constructor, from what I could read in NPOI's code.
Has anyone else tried to do this and/or has fixed it?
Thanks.
It looks like you have to explicitly call the workbook's CreateSheet() method instead of calling .Add(). Additionally, your loop seems to be prone to out-of-range exceptions, so keep an eye out for that.
/// <summary>
/// Copies the cell values of the first sheet of the .xls workbook at
/// <paramref name="xlsPath"/> into a new .xlsx workbook written next to it.
/// Only values and formulas are copied; cell styles are not.
/// </summary>
/// <param name="xlsPath">Path of the source .xls file.</param>
/// <returns>Path of the created .xlsx file.</returns>
public string ConvertToXlsx(string xlsPath)
{
    // Only the extension is changed; Replace("xls", "xlsx") would also rewrite any
    // "xls" elsewhere in the path and leave an upper-case ".XLS" untouched.
    var newExcelPath = Path.ChangeExtension(xlsPath, ".xlsx");

    HSSFWorkbook oldWorkbook;
    using (var input = new FileStream(xlsPath, FileMode.Open, FileAccess.Read))
    {
        // The workbook is loaded inside the using block so the file handle is released.
        oldWorkbook = new HSSFWorkbook(input);
    }

    var oldWorkSheet = oldWorkbook.GetSheetAt(0);
    var newWorkBook = new XSSFWorkbook();
    // Sheets must be created through the workbook so their internal state is initialised.
    var newWorkSheet = newWorkBook.CreateSheet("Sheet1");

    foreach (HSSFRow oldRow in oldWorkSheet)
    {
        var newRow = newWorkSheet.CreateRow(oldRow.RowNum);
        for (int ii = oldRow.FirstCellNum; ii < oldRow.LastCellNum; ii++)
        {
            // GetCell looks up by column index and returns null for gaps, whereas
            // Cells[ii] indexes the list of existing cells and can go out of range.
            var oldCell = oldRow.GetCell(ii);
            if (oldCell == null)
            {
                continue;
            }

            CopyCellValue(oldCell, newRow.CreateCell(ii));
        }
    }

    using (var fileStream = new FileStream(newExcelPath, FileMode.Create))
    {
        newWorkBook.Write(fileStream);
    }

    return newExcelPath;
}

/// <summary>Writes the value (or formula) of <paramref name="source"/> into <paramref name="target"/>.</summary>
private static void CopyCellValue(NPOI.SS.UserModel.ICell source, NPOI.SS.UserModel.ICell target)
{
    switch (source.CellType)
    {
        case NPOI.SS.UserModel.CellType.String:
            target.SetCellValue(source.StringCellValue);
            break;
        case NPOI.SS.UserModel.CellType.Numeric:
            target.SetCellValue(source.NumericCellValue);
            break;
        case NPOI.SS.UserModel.CellType.Boolean:
            target.SetCellValue(source.BooleanCellValue);
            break;
        case NPOI.SS.UserModel.CellType.Formula:
            target.SetCellFormula(source.CellFormula);
            break;
        case NPOI.SS.UserModel.CellType.Error:
            target.SetCellErrorValue(source.ErrorCellValue);
            break;
        default:
            // Blank (or unknown) cells: the freshly created cell is left empty.
            break;
    }
}
I have a WCF service, consumed through ksoap2, that returns Dictionary<ArrayList, List<byte[]>>. Now, on the Android side, I want to fill my Dictionary<String[], ArrayList<Object>> diction; from the WCF response. I am new to WCF and Android/Java, and I have no idea how to do this. Please provide a good example of filling a Dictionary from a WCF response.
Thanks in advance
This is my wcf code
/// <summary>
/// Loads every file in the fixed image folder, re-encodes each one as JPEG and returns
/// a dictionary with a single entry: the list of file paths mapped to the list of JPEG
/// byte arrays (same order).
/// </summary>
/// <returns>Dictionary holding one (paths, images) pair.</returns>
public Dictionary<ArrayList, List<byte[]>> getImages()
{
    Dictionary<ArrayList, List<byte[]>> image_Name = new Dictionary<ArrayList, List<byte[]>>();

    // Verbatim string literal (@"...") so the backslashes in the path are not escapes.
    DirectoryInfo directoryInfo = new DirectoryInfo(@"C:\Users\Yakhtar\Desktop\abc");

    arr1 = new ArrayList();
    foreach (FileInfo fi in directoryInfo.GetFiles())
    {
        arr1.Add(fi.FullName);
    }

    list = new List<byte[]>();
    for (int i = 0; i < arr1.Count; i++)
    {
        // Both the image and the buffer hold unmanaged or pooled resources; dispose them
        // after each file instead of leaking one pair per image.
        using (img = Image.FromFile(arr1[i].ToString()))
        using (ms = new MemoryStream())
        {
            img.Save(ms, System.Drawing.Imaging.ImageFormat.Jpeg);
            list.Add(ms.ToArray());
        }
    }

    image_Name.Add(arr1, list);
    return image_Name;
}
Well, I am not sure about that, but have you thought about JSON parsing instead of ksoap2?
Here is a tutorial on how to work with an array of complex objects with KSOAP. I found this out through countless hours of debugging. Hope this helps.
Also try this:
// Read the SOAP response and collect the "countryName" of every nested SoapObject.
SoapObject countryDetails = (SoapObject)envelope.getResponse();
System.out.println(countryDetails.toString());

final int propertyCount = countryDetails.getPropertyCount();
ArrayList list = new ArrayList(propertyCount);
lv_arr = new String[propertyCount];

for (int index = 0; index < propertyCount; index++) {
    Object property = countryDetails.getProperty(index);
    // Properties that are not SoapObjects are skipped.
    if (!(property instanceof SoapObject)) {
        continue;
    }
    SoapObject countryObj = (SoapObject) property;
    list.add(countryObj.getProperty("countryName").toString());
}
Do something like this..
// Fragment of getImages: builds the list of JPEG-encoded images for the paths in arr1.
list = new List<byte[]>();
for (int i = 0; i < arr1.Count; i++)
{
// Load the file at arr1[i] and re-encode it as JPEG into a fresh in-memory stream.
img = Image.FromFile(arr1[i].ToString());
ms = new MemoryStream();
img.Save(ms, System.Drawing.Imaging.ImageFormat.Jpeg);
list.Add(ms.ToArray());
}
// Single dictionary entry: the path list maps to the image list.
image_Name.Add(arr1, list);
//image_Name[arr1 as ArrayList] = [list as byte[]];
return image_Name;
}
I'm trying to extract some keywords from a text. It works quite well, but I need to remove plurals.
As I'm already using Lucene for search purposes, I'm trying to use it to extract keywords from the indexed terms.
First, I index the document in a RAMDirectory index:
// Index the text as a single document in an in-memory directory, using the
// language-specific custom analyzer.
RAMDirectory idx = new RAMDirectory();
var analyzer = new CustomStandardAnalyzer(
    StopWords.Get(this.Language),
    Lucene.Net.Util.Version.LUCENE_30,
    this.Language);

using (var writer = new IndexWriter(idx, analyzer, IndexWriter.MaxFieldLength.LIMITED))
{
    var document = createDocument(this._text);
    writer.AddDocument(document);
    writer.Optimize();
}
Then, I extract the keywords:
// Collect (frequency, term) pairs from the term vector of field "text" in document 0.
var list = new List<KeyValuePair<int, string>>();
using (var reader = IndexReader.Open(directory, true))
{
    var termVector = reader.GetTermFreqVector(0, "text");
    if (termVector != null)
    {
        string[] termTexts = termVector.GetTerms();
        int[] counts = termVector.GetTermFrequencies();

        // The two arrays are read with the same position: counts[n] goes with termTexts[n].
        int position = 0;
        foreach (string termText in termTexts)
        {
            list.Add(new KeyValuePair<int, string>(counts[position], termText));
            position++;
        }
    }
}
In the list of terms, I can have terms like "president" and "presidents".
How could I remove them?
My CustomStandardAnalyzer uses this:
/// <summary>
/// Builds the analysis chain: standard tokenizer, lower-casing, stop-word removal,
/// Snowball stemming, then ASCII folding.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed (not used to vary the chain).</param>
/// <param name="reader">Source of the text to tokenize.</param>
/// <returns>The filtered token stream.</returns>
public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
{
    //create the tokenizer
    TokenStream result = new StandardTokenizer(this.version, reader);

    //add in filters
    // Snowball stemmers expect lower-case input, so lower-casing must come first.
    result = new LowerCaseFilter(result);
    // Stop words are removed before stemming; once stemmed, words such as "because"
    // ("becaus") no longer match the stop list.
    result = new StopFilter(true, result, this.stopWords ?? StopWords.English);
    result = new Lucene.Net.Analysis.Snowball.SnowballFilter(result, this.getStemmer());
    // Folding stays after stemming so the stemmer still sees the original accented forms.
    result = new ASCIIFoldingFilter(result);
    return result;
}
So I already use the SnowballFilter (with the correct language specific stemmer).
How could I remove plurals?
My output from the following program is:
text:and
text:presid
text:some
text:text
text:with
/// <summary>
/// Small console program that indexes one sentence with a stemming analyzer and prints
/// the indexed terms, showing that "president" and "presidents" collapse to one term.
/// </summary>
class Program
{
    /// <summary>Analyzer: standard tokenizer, lower-casing, stop filter, English Snowball stemmer, ASCII folding.</summary>
    private class CustomStandardAnalyzer : Analyzer
    {
        public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
        {
            //create the tokenizer
            TokenStream result = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_30, reader);

            //add in filters
            // Snowball stemmers expect lower-case input, so lower-casing runs first and
            // stop words (none in this demo) are removed before stemming.
            result = new LowerCaseFilter(result);
            result = new StopFilter(true, result, new HashSet<string>());
            result = new Lucene.Net.Analysis.Snowball.SnowballFilter(result, new EnglishStemmer());
            result = new ASCIIFoldingFilter(result);
            return result;
        }
    }

    /// <summary>Creates a document with a single analyzed, stored "text" field holding <paramref name="text"/>.</summary>
    private static Document createDocument(string text)
    {
        Document d = new Document();
        Field f = new Field("text", "", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
        f.SetValue(text);
        d.Add(f);
        return d;
    }

    /// <summary>Indexes the sample sentence in memory and writes every indexed term to the console.</summary>
    static void Main(string[] args)
    {
        RAMDirectory idx = new RAMDirectory();
        using (IndexWriter writer =
            new IndexWriter(
                idx,
                new CustomStandardAnalyzer(),
                IndexWriter.MaxFieldLength.LIMITED))
        {
            writer.AddDocument(createDocument("some text with president and presidents"));
            writer.Commit();
        }

        using (var reader = IndexReader.Open(idx, true))
        // The term enumeration is disposable as well; release it together with the reader.
        using (var terms = reader.Terms(new Term("text", "")))
        {
            // The enumeration starts positioned on the first term at or after ("text", "").
            if (terms.Term != null)
            {
                do
                {
                    Console.WriteLine(terms.Term);
                }
                while (terms.Next());
            }
        }
        Console.ReadLine();
    }
}