In the following code I create a pdf dynamically using ITextSharp.
I want the 2nd table to be split across pages when there is not enough room left on the page.
How can this be accomplished? I tried it with the newPage method on the PDF stamper, but no new page was created...
(not all codepaths included for readability)
/// <summary>
/// Stamps a title, the print date, the selected filter and two result tables onto the
/// template PDF and returns the finished document as a byte array.
/// </summary>
/// <remarks>
/// All content is queued in a single ColumnText. The column is rendered repeatedly:
/// whenever it reports that content is left over, a new page is inserted after the last
/// one and rendering continues there, so a table that does not fit is split across pages.
/// </remarks>
/// <param name="emptyPdfFile">Path of the template PDF whose first page receives the content.</param>
/// <param name="currentDateTime">Timestamp printed as "Afdrukdatum".</param>
/// <param name="organisationalUnit">Optional organisational unit filter; null or empty means no filter.</param>
/// <param name="roleId">Optional role filter.</param>
/// <param name="fromDate">Optional lower date bound of the filter.</param>
/// <param name="toDate">Optional upper date bound of the filter.</param>
/// <returns>The bytes of the generated PDF.</returns>
private byte[] DoGenerateStatisticsPerOrganisationalUnitPdf(
    string emptyPdfFile,
    DateTime currentDateTime,
    string organisationalUnit,
    int? roleId,
    DateTime? fromDate,
    DateTime? toDate)
{
    // Closed explicitly in the finally block below so the template file handle is
    // released even when rendering fails.
    var pdfReader = new ITextSharp.pdf.PdfReader(emptyPdfFile);
    try
    {
        using (MemoryStream memoryStream = new MemoryStream())
        using (ITextSharp.pdf.PdfStamper pdfStamper = new ITextSharp.pdf.PdfStamper(pdfReader, memoryStream))
        {
            // Start writing on top of the first page of the template.
            var pageNumber = 1;
            var pdfContentByte = pdfStamper.GetOverContent(pageNumber);

            // One rectangular column that receives every element below.
            var ct = new ITextSharp.pdf.ColumnText(pdfContentByte);
            ct.SetSimpleColumn(
                PageStartX,
                PageStartY,
                PageEndX,
                PageEndY);

            var paragraph = new iTextSharp.text.Paragraph(new ITextSharp.Chunk("Statistieken Profchecks", titleFont));
            ct.AddElement(paragraph);

            // Add printed date time
            var dateTimeText = string.Format(
                CultureInfo.CurrentCulture,
                "Afdrukdatum: {0}",
                currentDateTime.ToString(DateFormat, CultureInfo.CurrentCulture));
            paragraph = new iTextSharp.text.Paragraph(new ITextSharp.Chunk(dateTimeText, rowFont));
            ct.AddElement(paragraph);

            // Add selected filter: one bullet line per filter value that is actually set.
            var filterItems = string.Empty;
            if (!string.IsNullOrEmpty(organisationalUnit))
            {
                filterItems += "\n" + string.Format(CultureInfo.CurrentCulture, " ° Organisatie: {0}", organisationalUnit);
            }

            if (roleId.HasValue)
            {
                filterItems += "\n" + string.Format(CultureInfo.CurrentCulture, " ° Rol: {0}", roleService.GetById(roleId.Value).Name);
            }

            if (fromDate.HasValue)
            {
                filterItems += "\n" + string.Format(CultureInfo.CurrentCulture, " ° Datum van: {0}", fromDate.Value.ToString(DateFormat, CultureInfo.CurrentCulture));
            }

            if (toDate.HasValue)
            {
                filterItems += "\n" + string.Format(CultureInfo.CurrentCulture, " ° Datum t/m: {0}", toDate.Value.ToString(DateFormat, CultureInfo.CurrentCulture));
            }

            var filterText = string.Format(
                CultureInfo.CurrentCulture,
                "Geselecteerde filter: {0}",
                filterItems.Length > 0 ? filterItems : "(geen filter)");
            paragraph = new iTextSharp.text.Paragraph(new ITextSharp.Chunk(filterText, rowFont));
            ct.AddElement(paragraph);

            paragraph = new iTextSharp.text.Paragraph(new ITextSharp.Chunk("\nResultaten per game", titleFont));
            ct.AddElement(paragraph);

            // Table: Results per game
            var table = CreateTable(new string[] { "Game", "Unieke spelers", "Resultaat" });
            var gameResultList = statisticsService.GetOrganisationalUnitStatistics(1, 20, organisationalUnit, roleId, fromDate, toDate);
            foreach (var gameResultItem in gameResultList)
            {
                table.AddCell(new iTextSharp.text.Phrase(gameResultItem.Game, rowFont));
                table.AddCell(new iTextSharp.text.Phrase(gameResultItem.NumberOfUsers.ToString(CultureInfo.CurrentCulture), rowFont));
                var percentage = gameResultItem.AveragePercentage.HasValue ? string.Format(CultureInfo.CurrentCulture, "{0}%", gameResultItem.AveragePercentage) : "?";
                table.AddCell(new iTextSharp.text.Phrase(percentage, rowFont));
            }

            table.CompleteRow();
            ct.AddElement(table);

            paragraph = new iTextSharp.text.Paragraph(new ITextSharp.Chunk("\nResultaten per kenniscategorie", titleFont));
            ct.AddElement(paragraph);

            // Table: Results per knowledgecategory
            table = CreateTable(new string[] { "Kenniscategorie", "Gemiddeld", "Laagste", "Hoogste", "Standaard deviatie" });
            var knowledgeCategoryResultList = statisticsService.GetGlobalKnowledgeCategoryResultStatistics(
                organisationalUnit,
                roleId,
                fromDate,
                toDate);
            foreach (var knowledgeCategoryResultItem in knowledgeCategoryResultList)
            {
                table.AddCell(new iTextSharp.text.Phrase(knowledgeCategoryResultItem.KnowledgeCategory.Name, rowFont));
                table.AddCell(new iTextSharp.text.Phrase(
                    knowledgeCategoryResultItem.Statistics.Average.ToString(CultureInfo.CurrentCulture),
                    rowFont));
                table.AddCell(new iTextSharp.text.Phrase(
                    knowledgeCategoryResultItem.Statistics.Minimum.ToString(CultureInfo.CurrentCulture),
                    rowFont));
                table.AddCell(new iTextSharp.text.Phrase(
                    knowledgeCategoryResultItem.Statistics.Maximum.ToString(CultureInfo.CurrentCulture),
                    rowFont));
                table.AddCell(new iTextSharp.text.Phrase(
                    knowledgeCategoryResultItem.Statistics.StDev.HasValue ? knowledgeCategoryResultItem.Statistics.StDev.Value.ToString(
                        CultureInfo.CurrentCulture) : "?",
                    rowFont));
            }

            table.CompleteRow();
            ct.AddElement(table);

            // Render. Go() only fills the current column; whatever does not fit stays queued
            // inside the ColumnText, so keep inserting pages until nothing is left over.
            // NOTE(review): inserted pages are blank and sized like page 1; they do not repeat
            // any artwork of the template page - confirm that this is acceptable.
            // NOTE(review): a single table row taller than the column could presumably keep
            // this loop running - confirm rows always fit within one page.
            var status = ct.Go();
            while (ITextSharp.pdf.ColumnText.HasMoreText(status))
            {
                pageNumber++;
                pdfStamper.InsertPage(pageNumber, pdfReader.GetPageSizeWithRotation(1));
                ct.Canvas = pdfStamper.GetOverContent(pageNumber);
                ct.SetSimpleColumn(
                    PageStartX,
                    PageStartY,
                    PageEndX,
                    PageEndY);
                status = ct.Go();
            }

            pdfStamper.FormFlattening = true;
            pdfStamper.FreeTextFlattening = true;

            // Close stamper explicitly, otherwise the pdf gets corrupted (don't wait until the Dispose is called in the using-clause)
            pdfStamper.Close();

            // Always call ToArray, to get all the bytes returned.
            return memoryStream.ToArray();
        }
    }
    finally
    {
        pdfReader.Close();
    }
}
I see you take an existing PDF file (referred to as "emptyPdfFile"), add content to that PDF (2 tables), and want to add pages as necessary. So I assume you actually want to create a PDF from scratch.
In that case it's most likely easier to use PdfWriter and add your tables using Document.Add(). Tables will be split and pages will be added automatically when the end of the current page is reached.
A simple example of adding a table with Document.Add() can be found here in the MyFirstTable example (that's iText code in Java, check the C# port for iTextSharp code).
If you do want to follow the approach of your example code, using PdfReader, PdfStamper and ColumnText:
ColumnText.Go() adds content to the defined area until that area is full. Any remaining content stays in the ColumnText object. So if you want to split the content over multiple areas, you have to loop and call ColumnText.Go() until all content is consumed.
Here's an example of the ColumnText.Go() looping: ColumnTable (Again, you may want to check the C# port).
In that example the tables are laid out in 2 columns on each page, but the approach stays the same for 1 table per page.
Note that Document.NewPage() is used in the example to add an extra page. You'll have to replace this call with PdfStamper.InsertPage() in your case.
Related
I have a situation where I replicate a PDF from an existing blank PDF 'certificate'. The replica PDF contains some personal info that is added to the 'blueprint' copy.
After mailing this to the user, I want to delete it again so as not to keep a copy on the server.
No matter what I do, however, I get the error message
The process cannot access the file 'certificate#672712.pdf' because it is being used by another process.
It is pretty clear what the message means; what I want to know is how I should alter the code for the certificate creation part so that the file is 'released' afterwards.
This is the code that I use.
// Stamps the user's name onto a copy of the blueprint PDF and returns the reference
// number that identifies the generated certificate file.
try
{
    // Sample the clock once so that every date part below refers to the same instant.
    DateTime now = DateTime.Now;
    int day = now.Day;

    // English ordinal suffix; 11, 12 and 13 take "th" despite ending in 1, 2 and 3.
    string suffix = (day % 10 == 1 && day != 11) ? "st of"
        : (day % 10 == 2 && day != 12) ? "nd of"
        : (day % 10 == 3 && day != 13) ? "rd of"
        : "th of";
    string dateToday = string.Format("{0:dd}{1} {0:MMMM yyyy}", now, (suffix));

    Random generator = new Random();
    string referenceNumber = "#" + generator.Next(0, 1000000).ToString("D6");
    string bluePrint = HttpContext.Current.Server.MapPath("~/Certificates/blue_print.pdf");

    // A caller-supplied certificate number replaces the random reference.
    if (certificateNumber != "")
    {
        referenceNumber = certificateNumber;
    }

    string certificate = HttpContext.Current.Server.MapPath("~/Certificates/certificate_" + referenceNumber + ".pdf");

    PdfReader reader = new PdfReader(bluePrint);
    try
    {
        // The output stream gets its own using block: if the PdfStamper constructor
        // throws, the stream is still disposed and the certificate file is not left locked.
        using (FileStream output = new FileStream(certificate, FileMode.Create))
        using (PdfStamper stamper = new PdfStamper(reader, output))
        {
            reader.SelectPages("1");
            var pageSize = reader.GetPageSize(1);
            PdfContentByte pbover = stamper.GetOverContent(1);
            BaseFont bf = BaseFont.CreateFont(BaseFont.HELVETICA, BaseFont.CP1252, BaseFont.NOT_EMBEDDED);
            pbover.SetColorFill(BaseColor.DARK_GRAY);
            Font font = new Font(bf);
            font.Size = 32;
            ColumnText.ShowTextAligned(pbover, Element.ALIGN_LEFT, new Phrase(UserName, font), 90, 305, 0);
        }
    }
    finally
    {
        // Release the blueprint even when stamping fails.
        reader.Close();
    }

    return referenceNumber;
}
catch (Exception exe)
{
    // NOTE(review): the error text is returned through the same channel as the reference
    // number, so a caller cannot tell a failure from a valid reference - confirm whether
    // callers rely on this before changing it.
    return exe.Message;
}
In my controller method I call the methods in this sequence.
// Step 1: create the certificate PDF and remember its reference number.
string recipientName = String.Format("{0} {1}", firstName, lastName);
var reference = UtilityService.GenerateCertificate(recipientName, "");

// Step 2: mail the certificate to the user.
string mailResult = await MailService.MailTheCertificate(
    user.Email,
    "Well done",
    "Well done on completing the course",
    reference);

// Step 3: remove the generated file from the server again.
// (This delete is where the "file is being used by another process" error surfaces.)
string certificatePath = HttpContext.Current.Server.MapPath("~/Certificates/certificate_" + reference + ".pdf");
FileInfo certificateFile = new FileInfo(certificatePath);
certificateFile.Delete();

return Ok("success");
I would like to update text content within a FreeText annotation when I copy the annotation from one PDF document to another, but for some reason the text does not update in the final PDF using the approach shown below. The annotation object updates, but the final result within the PDF does not reflect the updated content for the FreeText annotation type. Strangely, Ink type annotations do get updated with the revised content, as it shows up in the form of a sticky note looking comment overlaid on top of the Ink annotation itself.
Here's a quick snippet of the code I've used (if needed I can add more):
// Copies every annotation into the master document and prefixes its /Contents text
// with "COMMENT: ".
// NOTE(review): according to the surrounding discussion, FreeText annotations keep
// showing their old text after this change; updating /Contents alone does not appear
// to refresh what the viewer renders for that annotation type.
foreach (var anno in annots)
{
    var copiedObject = anno.GetPdfObject().CopyTo(masterPdfDoc);
    PdfAnnotation ano = PdfAnnotation.MakeAnnotation(copiedObject);

    // An annotation may have no /Contents entry; treat that as empty text
    // instead of dereferencing null.
    PdfString existingContents = ano.GetContents();
    string contents = existingContents == null ? string.Empty : existingContents.ToString();

    ano.SetContents(new PdfString("COMMENT: " + contents));
    masterDocPage.AddAnnotation(ano);
}
Would greatly appreciate any advice provided. Thanks
The following code snippet copies and modifies the text content of FreeText annotations from 1 PDF (i.e. annots) and saves the modified annotations into a new PDF. A good chunk of the code is similar to the answer of this post but was updated for iText7.
// Copies every annotation into the master document and rewrites the text inside each
// of its appearance streams so that it is prefixed with "COMMENT: ".
//
// ISO-8859-1 maps each byte value 0-255 to the character with the same code, so decoding
// a stream to a string and encoding it back reproduces every byte exactly. ASCII would
// replace every byte above 0x7F with '?' and silently damage the stream.
Encoding byteEncoding = Encoding.GetEncoding("ISO-8859-1");

foreach (var anno in annots)
{
    var copiedObject = anno.GetPdfObject().CopyTo(masterPdfDoc);
    PdfAnnotation ano = PdfAnnotation.MakeAnnotation(copiedObject);

    var apDict = ano.GetAppearanceDictionary();
    if (apDict == null)
    {
        // Nothing to rewrite; the annotation is not added to the master page.
        Console.WriteLine("No appearances.");
        continue;
    }

    foreach (PdfName key in apDict.KeySet())
    {
        Console.WriteLine("Appearance: {0}", key);

        PdfStream value = apDict.GetAsStream(key);
        if (value == null)
        {
            // Entry is not a stream; leave it untouched.
            continue;
        }

        var text = ExtractAnnotationText(value);
        Console.WriteLine("Extracted Text: {0}", text);

        if (!string.IsNullOrEmpty(text))
        {
            // NOTE(review): this is a plain substring replacement on the raw stream; it
            // only takes effect when the extracted text occurs literally in the stream
            // bytes - confirm for the fonts/encodings in use.
            var valueString = byteEncoding.GetString(value.GetBytes());
            value.SetData(byteEncoding.GetBytes(valueString.Replace(text, "COMMENT: " + text)));
        }
    }

    masterDocPage.AddAnnotation(ano);
}
/// <summary>
/// Runs the content of a form XObject stream through a location-based text extraction
/// strategy and returns the text that strategy collected.
/// </summary>
/// <param name="xObject">Stream whose bytes are parsed; its /Resources dictionary is used while parsing.</param>
/// <returns>The resultant text reported by the extraction strategy.</returns>
public static String ExtractAnnotationText(PdfStream xObject)
{
    PdfDictionary resourceDictionary = xObject.GetAsDictionary(PdfName.Resources);
    PdfResources streamResources = new PdfResources(resourceDictionary);

    ITextExtractionStrategy textCollector = new LocationTextExtractionStrategy();
    new PdfCanvasProcessor(textCollector).ProcessContent(xObject.GetBytes(), streamResources);

    return textCollector.GetResultantText();
}
I have a big problem and I hope you can help me out.
I have a PDF file which I split into several files.
Problem: There is a table of contents (with "go to page X" links), but in the new PDF files it doesn't work anymore.
I can't find a solution to copy this table of contents to the new PDF files.
For your understanding:
I have three pdf files which are equal.
PDF:
Page 1:
Text
Page 2:
Contenttable:
Content x Page 3 (Goto Page 3)
Content x Page 4 ""
content x Page 5 ""
...
...
I put all these PDF files together into one big PDF. (The table of contents still works.)
And now I want to split it back into the single files. (The table of contents no longer works.)
My idea was to export the table of contents from the original PDF (where it still works) and to import it into the new ones.
But I don't know how.
Second idea: search for keywords in the PDF and turn each keyword into a "go to local page" action.
But I have found no solution for how to do this.
In the end, I want to repair the table of contents in each single PDF file.
Any ideas?
code so far:
// Copies pages startpage..endpage of the PDF at `path` into a new file whose folder and
// name are derived from words found in the text of the first page of the range.
// NOTE(review): the surrounding text reports that table-of-contents links stop working
// in the split files; this block copies pages only and does not address that.
PdfReader reader = new PdfReader(path);
try
{
    Seitenanzahl = reader.NumberOfPages;

    // The name parts are taken from fixed word positions on the first page of the range.
    // NOTE(review): assumes that page yields at least 8 space-separated tokens - an
    // IndexOutOfRangeException is thrown otherwise; confirm against the real documents.
    tmp = PdfTextExtractor.GetTextFromPage(reader, startpage);
    string[] strsplit = tmp.Split(' ');
    vorname = strsplit[4].Replace("\n", "").Replace(" ", "");
    nachname = strsplit[5].Replace("\n", "").Replace(" ", "");
    ort = strsplit[6].Replace("\n", "").Replace(" ", "");
    perso = strsplit[7].Replace("\n", "").Replace(" ", "");

    // CreateDirectory does nothing when the directory already exists, so no
    // Directory.Exists check is needed.
    Directory.CreateDirectory("test/" + ort);

    outputpdfpath = "test/" + ort + "/" + "p_" + ort + "_" + perso + "_" + vorname.ToLower() + "_" + nachname.ToLower() + "_hj1_booklet_2017" + ".pdf";

    Document sourceDocument = new Document(reader.GetPageSizeWithRotation(startpage));

    // The using block releases the output file even if copying a page fails.
    using (FileStream output = new FileStream(outputpdfpath, FileMode.Create, FileAccess.ReadWrite))
    {
        PdfCopy pdfCopyProvider = new PdfCopy(sourceDocument, output);
        sourceDocument.Open();

        for (int i = startpage; i <= endpage; i++)
        {
            PdfImportedPage importedPage = pdfCopyProvider.GetImportedPage(reader, i);
            pdfCopyProvider.AddPage(importedPage);
        }

        sourceDocument.Close();
    }
}
finally
{
    reader.Close();
}
}
I have a text field in my database that contains text with many lines.
When generating an MS Word document using OpenXML and bookmarks, the text becomes one single line.
I've noticed that at each line break the bookmark value shows the characters "\r\n".
Looking for a solution, I've found some answers which helped me, but I'm still having a problem.
I've used the run.Append(new Break()); solution, but the replaced text shows the name of the bookmark as well.
For example:
bookmark test = "Big text here in first paragraph\r\nSecond paragraph".
It is shown in MS Word document like:
testBig text here in first paragraph
Second paragraph
Can anyone, please, help me to eliminate the bookmark name?
Here is my code:
/// <summary>
/// Copies the template document and replaces each bookmark listed in <c>bookmarks</c>
/// with its value, turning line breaks in the value into Word line breaks.
/// </summary>
/// <param name="originalPath">Path of the template document.</param>
/// <param name="copyPath">Path the template is copied to; an existing file is overwritten. The copy is the file that gets modified.</param>
/// <param name="fileType">Not used by this method.</param>
public void UpdateBookmarksVistoria(string originalPath, string copyPath, string fileType)
{
    // Make a copy of the template file.
    File.Copy(originalPath, copyPath, true);

    // Open the copy as an Open XML package and work on the main document part.
    using (WordprocessingDocument wordPackage = WordprocessingDocument.Open(copyPath, true))
    {
        MainDocumentPart part = wordPackage.MainDocumentPart;

        foreach (KeyValuePair<string, string> bookmark in bookmarks)
        {
            BookmarkStart bk = part.Document.Body
                .Descendants<BookmarkStart>()
                .SingleOrDefault(bm => bm.Name == bookmark.Key);
            if (bk == null)
            {
                continue;
            }

            Run bookmarkText = bk.NextSibling<Run>();
            if (bookmarkText != null)
            {
                // The run after the bookmark still holds the placeholder text (the
                // bookmark name in the reported case). Remove it first; otherwise the
                // new text is merely appended after it.
                bookmarkText.RemoveAllChildren<Text>();
                AppendTextWithLineBreaks(bookmarkText, bookmark.Value);
            }
            else
            {
                // No run follows the bookmark: add a new run at the end of the
                // bookmark's parent element.
                Run run = new Run(new RunProperties());
                AppendTextWithLineBreaks(run, bookmark.Value);
                bk.Parent.Append(run);
            }

            bk.Remove(); // we don't want the bookmark anymore
        }

        // Write the changes back to the document part.
        part.Document.Save();
    }
}

/// <summary>
/// Appends <paramref name="value"/> to <paramref name="run"/>, emitting one Text element
/// per line and a Break element between consecutive lines.
/// </summary>
private static void AppendTextWithLineBreaks(Run run, string value)
{
    // Accept Windows, Unix and old Mac line endings; "\r\n" is listed first so it is
    // consumed as a single separator.
    string[] lines = (value ?? string.Empty).Split(new[] { "\r\n", "\n", "\r" }, StringSplitOptions.None);
    for (int i = 0; i < lines.Length; i++)
    {
        if (i > 0)
        {
            run.Append(new Break());
        }

        run.Append(new Text(lines[i]));
    }
}
How can I read PDF content with iTextSharp using the PdfReader class? My PDF may include plain text or images of text.
using System.IO;
using System.Text;
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
/// <summary>
/// Extracts the text of every page of a PDF file and returns it as one string.
/// </summary>
/// <param name="fileName">Path of the PDF file.</param>
/// <returns>The concatenated page text, or an empty string when the file does not exist.</returns>
public string ReadPdfFile(string fileName)
{
    if (!File.Exists(fileName))
    {
        return string.Empty;
    }

    StringBuilder text = new StringBuilder();
    PdfReader pdfReader = new PdfReader(fileName);
    try
    {
        for (int page = 1; page <= pdfReader.NumberOfPages; page++)
        {
            ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();

            // The extracted value is already a .NET string. Passing it through
            // Encoding.Default bytes and back can only lose characters that the default
            // code page cannot represent, so it is appended as-is.
            text.Append(PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy));
        }
    }
    finally
    {
        // Release the file even when extraction of a page fails.
        pdfReader.Close();
    }

    return text.ToString();
}
LGPL / FOSS iTextSharp 4.x
// Reads the raw content stream of one page and decodes it to a string.
var pdfReader = new PdfReader(path); //other filestream etc
byte[] pageContent = pdfReader.GetPageContent(pageNum); //not zero based
// Decoding the bytes with the default encoding yields the same string as converting
// them to UTF-8 first and then decoding the UTF-8 bytes.
string textFromPage = Encoding.Default.GetString(pageContent);
None of the other answers were useful to me, they all seem to target the AGPL v5 of iTextSharp. I could never find any reference to SimpleTextExtractionStrategy or LocationTextExtractionStrategy in the FOSS version.
Something else that might be very useful in conjunction with this:
// Matches a parenthesised string operand followed directly by the Tj operator,
// e.g. "(Hello)Tj". '.' does not match a newline, so a match never spans lines.
const string PdfTableFormat = @"\(.*\)Tj";
Regex PdfTableRegex = new Regex(PdfTableFormat, RegexOptions.Compiled);

/// <summary>
/// Returns the text operand of every "(...)Tj" occurrence found in raw page content.
/// </summary>
/// <param name="rawPdfContent">Decoded content stream of a page.</param>
/// <returns>One entry per match, with escaped parentheses unescaped and surrounding whitespace trimmed.</returns>
List<string> ExtractPdfContent(string rawPdfContent)
{
    var matches = PdfTableRegex.Matches(rawPdfContent);
    var list = matches.Cast<Match>()
        // Drop the leading "(" (1 char) and the trailing ")Tj" (3 chars).
        .Select(m => m.Value
            .Substring(1, m.Value.Length - 4)
            .Replace(@"\)", ")") //unencode parens
            .Replace(@"\(", "(")
            .Trim()
        )
        .ToList();
    return list;
}
This will extract the text-only data from the PDF. If the text displayed is Foo(bar), it will be encoded in the PDF as (Foo\(bar\))Tj, and this method returns Foo(bar) as expected. This method will strip out lots of additional information, such as location coordinates, from the raw PDF content.
Here is a VB.NET solution based on ShravankumarKumar's solution.
This will ONLY give you the text. The images are a different story.
''' <summary>
''' Extracts the text of every page of a PDF file and returns it as one string.
''' </summary>
''' <param name="PdfFileName">Path of the PDF file to read.</param>
''' <returns>The concatenated text of all pages.</returns>
Public Shared Function GetTextFromPDF(PdfFileName As String) As String
    Dim oReader As New iTextSharp.text.pdf.PdfReader(PdfFileName)
    ' A StringBuilder avoids re-copying the accumulated text for every page.
    Dim sOut As New System.Text.StringBuilder()
    Try
        For i = 1 To oReader.NumberOfPages
            Dim its As New iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy
            sOut.Append(iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(oReader, i, its))
        Next
    Finally
        ' Release the file even when extraction of a page fails.
        oReader.Close()
    End Try
    Return sOut.ToString()
End Function
In my case, I just wanted the text from a specific area of the PDF document so I used a rectangle around the area and extracted the text from it. In the sample below the coordinates are for the entire page. I don't have PDF authoring tools so when it came time to narrow down the rectangle to the specific location I took a few guesses at the coordinates until the area was found.
// Extracts the text that lies inside a rectangle on page 1.
Rectangle _pdfRect = new Rectangle(0f, 0f, 612f, 792f); // Entire page - PDF coordinate system 0,0 is bottom left corner. 72 points / inch
RenderFilter _renderfilter = new RegionTextRenderFilter(_pdfRect);
// Pass the filter declared above; the listener forwards only text inside the region
// to the wrapped strategy.
ITextExtractionStrategy _strategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), _renderfilter);
string _text = PdfTextExtractor.GetTextFromPage(_pdfReader, 1, _strategy);
As noted by the above comments the resulting text doesn't maintain any of the formatting found in the PDF document, however, I was happy that it did preserve the carriage returns. In my case, there were enough constants in the text that I was able to extract the values that I required.
Here is an improved version of ShravankumarKumar's answer. I created special classes for the pages so you can access words in the PDF based on the text row and the word's position in that row.
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
// Collect one PdfPage per page of the document.
var pages = new List<PdfPage>();

// Read the text of each page. NOTE: path can also be replaced with a byte array
using (PdfReader reader = new PdfReader(path))
{
    int pageNumber = 1;
    while (pageNumber <= reader.NumberOfPages)
    {
        var page = new PdfPage();
        page.content = PdfTextExtractor.GetTextFromPage(reader, pageNumber);
        pages.Add(page);
        pageNumber++;
    }
}

// Break each page into rows (split on newline) and each row into words (split on space).
foreach (PdfPage page in pages)
{
    var pageRows = new List<PdfRow>();
    foreach (string line in page.content.Split('\n'))
    {
        var row = new PdfRow();
        row.content = line;
        row.words = new List<string>(line.Split(' '));
        pageRows.Add(row);
    }

    page.rows = pageRows;
}
The custom classes
/// <summary>
/// Text of a single PDF page together with its breakdown into rows.
/// </summary>
class PdfPage
{
/// <summary>Full text of the page.</summary>
public string content { get; set; }
/// <summary>Rows of the page; not populated by this class itself.</summary>
public List<PdfRow> rows { get; set; }
}
/// <summary>
/// A single row of page text together with the words it contains.
/// </summary>
class PdfRow
{
/// <summary>Text of the row.</summary>
public string content { get; set; }
/// <summary>Words of the row; not populated by this class itself.</summary>
public List<string> words { get; set; }
}
Now you can get a word by row and word index.
string myWord = pages[0].rows[12].words[4];
Or use Linq to find the rows containing a specific word.
//find the rows in a specific page containing a word
var myRows = pages[0].rows.Where(x => x.words.Any(y => y == "myWord1")).ToList();
//find the rows in all pages containing a word
//(separate variable name so both examples can live in the same scope)
var myRowsInAllPages = pages.SelectMany(r => r.rows).Where(x => x.words.Any(y => y == "myWord2")).ToList();
''' <summary>
''' Writes the complete content of a text file into a new PDF as a single paragraph.
''' </summary>
''' <param name="sTxtfile">Path of the text file to read.</param>
''' <param name="sPDFSourcefile">Path of the PDF file to create; an existing file is overwritten.</param>
Public Sub PDFTxtToPdf(ByVal sTxtfile As String, ByVal sPDFSourcefile As String)
    ' Read the whole text first so the reader is disposed before the PDF is written.
    Dim sContent As String
    Using sr As StreamReader = New StreamReader(sTxtfile)
        sContent = sr.ReadToEnd()
    End Using

    ' The Using block releases the output file even if building the document fails.
    Using fs As New FileStream(sPDFSourcefile, FileMode.Create)
        Dim doc As New Document()
        PdfWriter.GetInstance(doc, fs)
        doc.Open()
        doc.Add(New Paragraph(sContent))
        doc.Close()
    End Using
End Sub