This question already has an answer here:
Replace text in PDF file using iTextSharp(not AcroFields) [closed]
(1 answer)
Closed 6 years ago.
I' ve been searching the Internet for 2 Weeks and found some interesting solutions for my Problem, but nothing seems to give me the answer.
My goal is to do the folowing:
I want to find a Text in a static PDF-File and replace this text with another text.
I would like to keep the design of the content. Is it really that hard?
I found a way but I lost the whole information:
using (PdfReader reader = new PdfReader(path))
{
StringBuilder text = new StringBuilder();
for (int i = 1; i <= reader.NumberOfPages; i++)
{
text.Append(PdfTextExtractor.GetTextFromPage(reader, i));
text.Replace(txt_SuchenNach.Text, txt_ErsetzenMit.Text);
}
return text.ToString();
}
The second try I had was way better, but needs fields where I can change the text inside:
string fileNameExisting =path;
string fileNameNew = #"C:\TEST.pdf";
using (FileStream existingFileStream = new FileStream(fileNameExisting, FileMode.Open))
using (FileStream newFileStream = new FileStream(fileNameNew, FileMode.Create))
{
// PDF öffnen
PdfReader pdfReader = new PdfReader(existingFileStream);
PdfStamper stamper = new PdfStamper(pdfReader, newFileStream);
var form = stamper.AcroFields;
var fieldKeys = form.Fields.Keys;
foreach (string fieldKey in fieldKeys)
{
var value = pdfReader.AcroFields.GetField(fieldKey);
form.SetField(fieldKey, value.Replace(txt_SuchenNach.Text, txt_ErsetzenMit.Text));
}
// Textfeld unbearbeitbar machen (sieht aus wie normaler text)
stamper.FormFlattening = true;
stamper.Close();
pdfReader.Close();
}
This keeps the formatation of the rest of text and does only change my searched text. I need a solution for text which is NOT in a Textfield.
thanks for all your answers and your help.
The general issue is that text objects may use embedded fonts with specific glyphs assigned to specific letters. I.e. if you have a text object with some text like "abcdef" then the embedded font may contain glyphs for these ("abcdef" letters) only but not for other letters. So if you replace "abcdef" with "xyz" then the PDF will not display these "xyz" as no glyphs are available for these letters to be displayed.
So I would consider the following workflow:
Iterate through all the text objects;
Add new text objects created from scratch on top of PDF file and set the same properties (font, position, etc) but with a different text; This step could require you to have the same fonts installed on your as were used in the original PDF but you may check for installed fonts and use another font for a new text object. This way iTextSharp or another PDF tool will embed a new font object for a new text object.
Remove original text object once you have created a duplicated text object;
Process every text object with the workflow described above;
Save the modified PDF document into a new file.
I have worked on the same requirement and I am able to achieve this by the following steps.
Step1: Locating Source Pdf File and Destination file Path
Step2: Read Source Pdf file and Searching for the location of string that we want to replace
Step3: Replacing the string with new one.
using iTextSharp.text;
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
using PDFExtraction;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
namespace PDFReplaceTextUsingItextSharp
{
public partial class ExtractPdf : System.Web.UI.Page
{
static iTextSharp.text.pdf.PdfStamper stamper = null;
protected void Page_Load(object sender, EventArgs e)
{
}
protected void Replace_Click(object sender, EventArgs e)
{
string ReplacingVariable = txtReplace.Text;
string sourceFile = "Source File Path";
string descFile = "Destination File Path";
PdfReader pReader = new PdfReader(sourceFile);
stamper = new iTextSharp.text.pdf.PdfStamper(pReader, new System.IO.FileStream(descFile, System.IO.FileMode.Create));
PDFTextGetter("ExistingVariableinPDF", ReplacingVariable , StringComparison.CurrentCultureIgnoreCase, sourceFile, descFile);
stamper.Close();
pReader.Close();
}
/// <summary>
/// This method is used to search for the location words in pdf and update it with the words given from replacingText variable
/// </summary>
/// <param name="pSearch">Searchable String</param>
/// <param name="replacingText">Replacing String</param>
/// <param name="SC">Case Ignorance</param>
/// <param name="SourceFile">Path of the source file</param>
/// <param name="DestinationFile">Path of the destination file</param>
public static void PDFTextGetter(string pSearch, string replacingText, StringComparison SC, string SourceFile, string DestinationFile)
{
try
{
iTextSharp.text.pdf.PdfContentByte cb = null;
iTextSharp.text.pdf.PdfContentByte cb2 = null;
iTextSharp.text.pdf.PdfWriter writer = null;
iTextSharp.text.pdf.BaseFont bf = null;
if (System.IO.File.Exists(SourceFile))
{
PdfReader pReader = new PdfReader(SourceFile);
for (int page = 1; page <= pReader.NumberOfPages; page++)
{
myLocationTextExtractionStrategy strategy = new myLocationTextExtractionStrategy();
cb = stamper.GetOverContent(page);
cb2 = stamper.GetOverContent(page);
//Send some data contained in PdfContentByte, looks like the first is always cero for me and the second 100,
//but i'm not sure if this could change in some cases
strategy.UndercontentCharacterSpacing = (int)cb.CharacterSpacing;
strategy.UndercontentHorizontalScaling = (int)cb.HorizontalScaling;
//It's not really needed to get the text back, but we have to call this line ALWAYS,
//because it triggers the process that will get all chunks from PDF into our strategy Object
string currentText = PdfTextExtractor.GetTextFromPage(pReader, page, strategy);
//The real getter process starts in the following line
List<iTextSharp.text.Rectangle> MatchesFound = strategy.GetTextLocations(pSearch, SC);
//Set the fill color of the shapes, I don't use a border because it would make the rect bigger
//but maybe using a thin border could be a solution if you see the currect rect is not big enough to cover all the text it should cover
cb.SetColorFill(BaseColor.WHITE);
//MatchesFound contains all text with locations, so do whatever you want with it, this highlights them using PINK color:
foreach (iTextSharp.text.Rectangle rect in MatchesFound)
{
//width
cb.Rectangle(rect.Left, rect.Bottom, 60, rect.Height);
cb.Fill();
cb2.SetColorFill(BaseColor.BLACK);
bf = BaseFont.CreateFont(BaseFont.HELVETICA_BOLD, BaseFont.CP1252, BaseFont.NOT_EMBEDDED);
cb2.SetFontAndSize(bf, 9);
cb2.BeginText();
cb2.ShowTextAligned(0, replacingText, rect.Left, rect.Bottom, 0);
cb2.EndText();
cb2.Fill();
}
}
}
}
catch (Exception ex)
{
}
}
}
}
Related
I'm using IText 7 to convert my HTML file into PDF and auto download the file when users click a button.
Currently I'm planning to insert a barcode using IText7 into the PDF file, however I encountered some error. Btw, it works fine without the 'barcode code'.
This is the error : 'iText.Kernel.PdfException: 'Pdf indirect object belongs to other PDF document. Copy object to current pdf document.''
How can I add a barcode at the end of my pdf file?
public MemoryStream GetCovidFormPdfByAccessionNumber(string htmlFile, string accessionNumber)
{
var workStream = new MemoryStream();
using (var pdfWriter = new PdfWriter(workStream))
{
pdfWriter.SetCloseStream(false);
var pdfDoc = new PdfDocument(pdfWriter);
using (var document = HtmlConverter.ConvertToDocument(htmlFile, pdfWriter))
{
document.Add(CreateBarcode(accessionNumber, pdfDoc));
}
}
workStream.Position = 0;
return workStream;
}
private static Image CreateBarcode(string code, PdfDocument pdfDoc)
{
Barcode39 barcode = new Barcode39(pdfDoc);
barcode.SetCode(code);
//Create barcode object to put it to the cell as image
PdfFormXObject barcodeObject = barcode.CreateFormXObject(ColorConstants.BLACK, ColorConstants.BLACK, pdfDoc);
var image = new Image(barcodeObject);
image.SetWidth(250);
return image;
}
I have a MS word file which has some sentences and I need to insert some images in between the lines. When I am using the AddPicture method in Microsoft.Office.Interop.Word I am able to insert the image but not at a particular position.
I did not find any method to insert other than AddPicture to insert an image into existing word file. I am trying to insert an image after a particular line after apple there should an image of apple.
Here I am creating a paragraph and trying to add the image. This is my initial file:
This contains paragraphs containing the words apple, mango, and grape.
This is the output of my code (below)
The image should be inserted after the apple line
Required output:
Required Output
using System;
using System.Collections.Generic;
using System.Reflection;
using System.Reflection.Metadata;
using Word =Microsoft.Office.Interop.Word;
using System.IO;
namespace ConsoleApp2
{
class Program
{
static void Main(string[] args)
{
Word.Application ap = new Word.Application();
Word.Document document = ap.Documents.Open(#"C:\Users\ermcnnj\Desktop\Doc1.docx");
//document.InlineShapes.AddPicture(#"C:\Users\ermcnnj\Desktop\apple.png");
String read = string.Empty;
List<string> data = new List<string>();
for (int i = 0; i < document.Paragraphs.Count; i++)
{
string temp = document.Paragraphs[i + 1].Range.Text.Trim();
if (temp != string.Empty && temp.Contains("Apple"))
{
var pPicture = document.Paragraphs.Add();
pPicture.Format.SpaceAfter = 10f;
document.InlineShapes.AddPicture(#"C:\Users\ermcnnj\Desktop\apple.png", Range: pPicture.Range);
}
}
}
}
}
The above is the code I am using.
The following code snippet illustrates how this can be done. Note that. for the sake of clarity, it's simplified to set only the text to be found - there are a lot of additional properties that might need to be specified; read up on the Find functionality in Word's Language Reference.
If a search term is found, the Range associated with Find changes to the found term and further action can be taken. In this case, a new (empty) paragraph is inserted after the found term. (The question specifies that the term is the entire content of a paragraph, so that's what this example assumes!) The Range is then moved to this new paragraph and the InlineShape inserted.
Note how the graphic is assigned to an InlineShape object. If anything needs to be done to this object, work with the object variable ils.
Word.Application ap = new Word.Application();
Word.Document document = ap.Documents.Open(#"C:\Users\ermcnnj\Desktop\Doc1.docx");
Word.Range rng = document.Content;
Word.Find wdFind = rng.Find;
wdFind.Text = "apple";
bool found = wdFind.Execute();
if (found)
{
rng.InsertAfter("\n");
rng.MoveStart(Word.WdUnits.wdParagraph, 1);
Word.InlineShape ils = rng.InlineShapes.AddPicture(#"C:\Test\avatar.jpg", false, true, rng);
}
I am using following code to convert html to pdf.But arabic is not showing when pdf generated.What is the problem with the following code.
//Create a byte array that will eventually hold our final PDF
Byte[] bytes;
using (var ms = new MemoryStream())
{
FontFactory.Register(Server.MapPath("~/fonts/TRADBDO.TTF"));
using (var doc = new Document())
{
using (var writer = PdfWriter.GetInstance(doc, ms))
{
doc.Open();
var example_html = #"<p>This <em>is البرامج الدراسية المطروحة البرامج الدراسية المطروحةالبرامج الدراسية المطروحةالبرامج الدراسية المطروحةالبرامج الدراسية المطروحةالبرامج الدراسية المطروحة</em>55555<span class=""headline"" style=""text-decoration: underline;"">some</span> <strong>sample <em> text</em></strong><span style=""color: red;"">!!!</span></p>";
var example_css = #".headline{font-size:200%}";
FontFactory.Register(Server.MapPath("~/fonts/TRADBDO.TTF"));
using (var msCss = new MemoryStream(System.Text.Encoding.UTF8.GetBytes(example_css)))
{
using (var msHtml = new MemoryStream(System.Text.Encoding.UTF8.GetBytes(example_html)))
{
iTextSharp.tool.xml.XMLWorkerHelper.GetInstance().ParseXHtml(writer, doc, msHtml, msCss);
}
}
doc.Close();
}
}
bytes = ms.ToArray();
}
var testFile = Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.Desktop), "test_123_345.pdf");
System.IO.File.WriteAllBytes(testFile, bytes);
ParseXHtml() takes a parameter that lets you manually handle fonts which is probably the easiest thing to do. If you subclass FontFactoryImp you can then override the GetFont method and specify your own font. The below code does that without much logic and pretty much says "I don't care what font was specified in the HTML, always use this one" which should probably work for you.
public class FontOverrider : FontFactoryImp {
private readonly BaseFont baseFont;
/// <summary>
/// Create a new font factory that always uses the provided font.
/// </summary>
/// <param name="fullPathToFontFileToUse">The full path to the font file to use.</param>
/// <param name="encoding">The type of encoding to use. Default BaseFont.IDENTITY_H. See <see cref="http://api.itextpdf.com/itext/com/itextpdf/text/pdf/BaseFont.html#createFont(java.lang.String, java.lang.String, boolean)"/> for details.</param>
/// <param name="embedded">Whether or not to embed the entire font. Default True. See <see cref="http://api.itextpdf.com/itext/com/itextpdf/text/pdf/BaseFont.html#createFont(java.lang.String, java.lang.String, boolean)"/> for details.</param>
public FontOverrider( string fullPathToFontFileToUse, string encoding = BaseFont.IDENTITY_H, bool embedded = BaseFont.EMBEDDED ) {
//If you are using this class then this font is required and a missing font should be a fatal error
if (!System.IO.File.Exists(fullPathToFontFileToUse)) {
throw new System.IO.FileNotFoundException("Could not find the supplied font file", fullPathToFontFileToUse);
}
//Create our embedded base font
baseFont = BaseFont.CreateFont(fullPathToFontFileToUse, encoding, embedded);
}
public override iTextSharp.text.Font GetFont(string fontname, string encoding, bool embedded, float size, int style, BaseColor color, bool cached) {
return new iTextSharp.text.Font(baseFont, size, style, color);
}
}
To use it you just need to change your ParseXHtml() call to this:
iTextSharp.tool.xml.XMLWorkerHelper.GetInstance().ParseXHtml(writer, doc, msHtml, msCss, System.Text.Encoding.UTF8, new FontOverrider(myFont));
where myFont is your Server.MapPath() (or whatever) to your full font.
Just a note, I looked online for the font you mentioned and the free version that I found cannot be legally embedded in a PDF. If I try to use it I actually get a message telling me exactly that. This code assumes you've taken care of licensing agreements and have a validly licensed font for embedding. For my sample purposes I just used Arial Unicode MS.
I've seen many posts that have helped me get to where I am, I'm new to programming. My intention is to get the files within the directory "sourceDir" and look for a Regex Match. When it finds a Match, I want to create a new file with the Match as the name. If the code finds another file with the same Match (the file already exists) then create a new page within that document.
Right now the code works, however instead of adding a new page, it overwrites the first page of the document. NOTE: Every document in the directory is only one page!
string sourceDir = #"C:\Users\bob\Desktop\results\";
string destDir = #"C:\Users\bob\Desktop\results\final\";
string[] files = Directory.GetFiles(sourceDir);
foreach (string file in files)
{
using (var pdfReader = new PdfReader(file.ToString()))
{
for (int page = 1; page <= pdfReader.NumberOfPages; page++)
{
var text = new StringBuilder();
ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
var currentText =
PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy);
currentText = Encoding.UTF8.GetString(Encoding.Convert(Encoding.Default, Encoding.UTF8, Encoding.Default.GetBytes(currentText)));
text.Append(currentText);
Regex reg = new Regex(#"ABCDEFG");
MatchCollection matches = reg.Matches(currentText);
foreach (Match m in matches)
{
string newFile = destDir + m.ToString() + ".pdf";
if (!File.Exists(newFile))
{
using (PdfReader reader = new PdfReader(File.ReadAllBytes(file)))
{
using (Document doc = new Document(reader.GetPageSizeWithRotation(page)))
{
using (PdfCopy copy = new PdfCopy(doc, new FileStream(newFile, FileMode.Create)))
{
var importedPage = copy.GetImportedPage(reader, page);
doc.Open();
copy.AddPage(importedPage);
doc.Close();
}
}
}
}
else
{
using (PdfReader reader = new PdfReader(File.ReadAllBytes(newFile)))
{
using (Document doc = new Document(reader.GetPageSizeWithRotation(page)))
{
using (PdfCopy copy = new PdfCopy(doc, new FileStream(newFile, FileMode.OpenOrCreate)))
{
var importedPage = copy.GetImportedPage(reader, page);
doc.Open();
copy.AddPage(importedPage);
doc.Close();
}
}
}
}
}
}
}
}
Bruno did a great job explaining the problem and how to fix it but since you've said that you are new to programming and you've further posted a very similar and related question I'm going to go a little deeper to hopefully help you.
First, let's write down the knowns:
There's a directory full of PDFs
Each PDF has only a single page
Then the objectives:
Extract the text of each PDF
Compare the extracted text with a pattern
If there's a match, then using the match for a file name do one of:
If a file exists append the source PDF to it
If there isn't a match, create a new file with the PDF
There's a couple of things that you need to know before proceeding. You tried to work in "append mode" by using FileMode.OpenOrCreate. It was a good guess but incorrect. The PDF format has both an beginning and an end, so "start here" and "end here". When you attempt to append another PDF (or anything for that matter) to an existing file you are just writing past the "end here" section. At best, that's junk data that gets ignored but more likely you'll end up with a corrupt PDF. The same is true of almost any file format. Two XML files concatenated is invalid because an XML document can only have one root element.
Second but related, iText/iTextSharp cannot edit existing files. This is very important. It can, however, create brand new files that happen to have the exact or possibly modified versions of other files. I don't know if I can stress how important this is.
Third, you are using a line that get's copied over and over again but is very wrong and actually can corrupt your data. For why it is bad, read this.
currentText = Encoding.UTF8.GetString(Encoding.Convert(Encoding.Default, Encoding.UTF8, Encoding.Default.GetBytes(currentText)));
Fourth, you are using RegEx which is an overly complicated way to perform a search. Maybe the code that you posted was just a sample but if it wasn't I would recommend just using currentText.Contains("") or if you need to ignore case currentText.IndexOf( "", StringComparison.InvariantCultureIgnoreCase ). For the benefit of the doubt, the code below assumes you have a more complex RegEx.
With all that, below is a full working example that should walk you through everything. Since we don't have access to your PDFs, the second section actually creates 100 sample PDFs with our search terms occasionally added to them. Your real code obviously wouldn't do this but we need common ground to work with you on. The third section is the search and merge feature that you are trying to do. Hopefully the comments in the code explain everything.
/**
* Step 1 - Variable Setup
*/
//This is the folder that we'll be basing all other directory paths on
var workingFolder = Environment.GetFolderPath(Environment.SpecialFolder.Desktop);
//This folder will hold our PDFs with text that we're searching for
var folderPathContainingPdfsToSearch = Path.Combine(workingFolder, "Pdfs");
var folderPathContainingPdfsCombined = Path.Combine(workingFolder, "Pdfs Combined");
//Create our directories if they don't already exist
System.IO.Directory.CreateDirectory(folderPathContainingPdfsToSearch);
System.IO.Directory.CreateDirectory(folderPathContainingPdfsCombined);
var searchText1 = "ABC";
var searchText2 = "DEF";
/**
* Step 2 - Create sample PDFs
*/
//Create 100 sample PDFs
for (var i = 0; i < 100; i++) {
using (var fs = new FileStream(Path.Combine(folderPathContainingPdfsToSearch, i.ToString() + ".pdf"), FileMode.Create, FileAccess.Write, FileShare.None)) {
using (var doc = new Document()) {
using (var writer = PdfWriter.GetInstance(doc, fs)) {
doc.Open();
//Add a title so we know what page we're on when we combine
doc.Add(new Paragraph(String.Format("This is page {0}", i)));
//Add various strings every once in a while.
//(Yes, I know this isn't evenly distributed but I haven't
// had enough coffee yet.)
if (i % 10 == 3) {
doc.Add(new Paragraph(searchText1));
} else if (i % 10 == 6) {
doc.Add(new Paragraph(searchText2));
} else if (i % 10 == 9) {
doc.Add(new Paragraph(searchText1 + searchText2));
} else {
doc.Add(new Paragraph("Blah blah blah"));
}
doc.Close();
}
}
}
}
/**
* Step 3 - Search and merge
*/
//We'll search for two different strings just to add some spice
var reg = new Regex("(" + searchText1 + "|" + searchText2 + ")");
//Loop through each file in the directory
foreach (var filePath in Directory.EnumerateFiles(folderPathContainingPdfsToSearch, "*.pdf")) {
using (var pdfReader = new PdfReader(filePath)) {
for (var page = 1; page <= pdfReader.NumberOfPages; page++) {
//Get the text from the page
var currentText = PdfTextExtractor.GetTextFromPage(pdfReader, page, new SimpleTextExtractionStrategy());
currentText.IndexOf( "", StringComparison.InvariantCultureIgnoreCase )
//DO NOT DO THIS EVER!! See this for why https://stackoverflow.com/a/10191879/231316
//currentText = Encoding.UTF8.GetString(Encoding.Convert(Encoding.Default, Encoding.UTF8, Encoding.Default.GetBytes(currentText)));
//Match our pattern against the extracted text
var matches = reg.Matches(currentText);
//Bail early if we can
if (matches.Count == 0) {
continue;
}
//Loop through each match
foreach (var m in matches) {
//This is the file path that we want to target
var destFile = Path.Combine(folderPathContainingPdfsCombined, m.ToString() + ".pdf");
//If the file doesn't already exist then just copy the file and move on
if (!File.Exists(destFile)) {
System.IO.File.Copy(filePath, destFile);
continue;
}
//The file exists so we're going to "append" the page
//However, writing to the end of file in Append mode doesn't work,
//that would be like "add a file to a zip" by concatenating two
//two files. In this case, we're actually creating a brand new file
//that "happens" to contain the original file and the matched file.
//Instead of writing to disk for this new file we're going to keep it
//in memory, delete the original file and write our new file
//back onto the old file
using (var ms = new MemoryStream()) {
//Use a wrapper helper provided by iText
var cc = new PdfConcatenate(ms);
//Open for writing
cc.Open();
//Import the existing file
using (var subReader = new PdfReader(destFile)) {
cc.AddPages(subReader);
}
//Import the matched file
//The OP stated a guarantee of only 1 page so we don't
//have to mess around with specify which page to import.
//Also, PdfConcatenate closes the supplied PdfReader so
//just use the variable pdfReader.
using (var subReader = new PdfReader(filePath)) {
cc.AddPages(subReader);
}
//Close for writing
cc.Close();
//Erase our exisiting file
File.Delete(destFile);
//Write our new file
File.WriteAllBytes(destFile, ms.ToArray());
}
}
}
}
}
I'll write this in pseudo code.
You do something like this:
// loop over different single-page documents
for () {
// introduce a condition
if (condition == met) {
// create single-page PDF
new Document();
new PdfCopy();
document.Open();
copy.add(singlePage);
document.Close();
}
}
This means that you are creating a single-page PDF every time the condition is met. Incidentally, you're overwriting existing files many times.
What you should do, is something like this:
// Create a document with as many pages as times a condition is met
new Document();
new PdfCopy();
document.Open();
// loop over different single-page documents
for () {
// introduce a condition
if (condition == met) {
copy.addPage(singlePage);
}
}
document.Close();
Now you are possibly adding more than one page to the new document you are creating with PdfCopy. Be careful: an exception can be thrown if the condition is never met.
I'm using iTextSharp to generate pdf-a documents from images. So far I've not been successful.
Edit: I'm using iTextSharp to generate the PDF
All I try is to make a pdf-a document (1a or 1b, whatever suits), with some images. This is the code I've come up so far, but I keep getting errors when I try to validate them with pdf-tools or validatepdfa.
This are the errors I get from pdf-tools (using PDF/A-1b validation):
Edit: MarkInfo and Color Space arn't yet working. The rest is okay
Validating file "0.pdf" for conformance level pdfa-1a
The key MarkInfo is required but missing.
A device-specific color space (DeviceRGB) without an appropriate output intent is used.
The document does not conform to the requested standard.
The document contains device-specific color spaces.
The document doesn't provide appropriate logical structure information.
Done.
Main flow
var output = new MemoryStream();
using (var iccProfileStream = new FileStream("ToPdfConverter/ColorProfiles/sRGB_v4_ICC_preference_displayclass.icc", FileMode.Open))
{
var document = new Document(new Rectangle(PageSize.A4.Width, PageSize.A4.Height), 0f, 0f, 0f, 0f);
var pdfWriter = PdfWriter.GetInstance(document, output);
pdfWriter.PDFXConformance = PdfWriter.PDFA1A;
document.Open();
var pdfDictionary = new PdfDictionary(PdfName.OUTPUTINTENT);
pdfDictionary.Put(PdfName.OUTPUTCONDITION, new PdfString("sRGB IEC61966-2.1"));
pdfDictionary.Put(PdfName.INFO, new PdfString("sRGB IEC61966-2.1"));
pdfDictionary.Put(PdfName.S, PdfName.GTS_PDFA1);
var iccProfile = ICC_Profile.GetInstance(iccProfileStream);
var pdfIccBased = new PdfICCBased(iccProfile);
pdfIccBased.Remove(PdfName.ALTERNATE);
pdfDictionary.Put(PdfName.DESTOUTPUTPROFILE, pdfWriter.AddToBody(pdfIccBased).IndirectReference);
pdfWriter.ExtraCatalog.Put(PdfName.OUTPUTINTENT, new PdfArray(pdfDictionary));
var image = PrepareImage(imageBytes);
document.Open();
document.Add(image);
pdfWriter.CreateXmpMetadata();
pdfWriter.CloseStream = false;
document.Close();
}
return output.GetBuffer();
This is prepareImage()
It's used to flatten the image to bmp, so I don't need to bother about alpha channels.
private Image PrepareImage(Stream stream)
{
Bitmap bmp = new Bitmap(System.Drawing.Image.FromStream(stream));
var file = new MemoryStream();
bmp.Save(file, ImageFormat.Bmp);
var image = Image.GetInstance(file.GetBuffer());
if (image.Height > PageSize.A4.Height || image.Width > PageSize.A4.Width)
{
image.ScaleToFit(PageSize.A4.Width, PageSize.A4.Height);
}
return image;
}
Can anyone help me into a direction to fix the errors?
Specifically the device-specific color spaces
Edit: More explanation: What I'm trying to achieve is, converting scanned images to PDF/A for long-term data storage
Edit: added some files I'm using to test with
PDFs and Pictures.rar (3.9 MB)
https://mega.co.nz/#!n8pClYgL!NJOJqSO3EuVrqLVyh3c43yW-u_U35NqeB0svc6giaSQ
OK, I checked one of your files in callas pdfToolbox and it says: "Device color space used but no PDF/A output intent". Which I took as a sign that you do something wrong while writing an output intent to the document. I then converted that document to PDF/A-1b with the same tool and the difference is obvious.
Perhaps there are other errors you need to fix, but the first error here is that you put a key in the catalog dict for the PDF file that is named "OutputIntent". That's wrong: page 75 of the PDF Specification states that the key should be named "OutputIntents".
Like I said, perhaps there are other problems with your file beyond this, but the wrong name for the key causes PDF/A validators not to find the Output Intent you try to put in the file...
First of all, pdfx IS NOT pdfa.
Second, you're using wrong PdfWriter. It should be PdfAWriter.
I do not have solution for image problem unfortunatelly, but I have for 1 and 2.
Regards
using System;
using Microsoft.VisualStudio.TestTools.UnitTesting;
using System.Text;
using System.IO;
using iTextSharp.text;
using iTextSharp.text.pdf;
using iTextSharp.text.html.simpleparser;
using iTextSharp.tool.xml;
using System.Drawing;
using System.Drawing.Imaging;
namespace Tests
{
/*
* References:
* UTF-8 encoding http://stackoverflow.com/questions/4902033/itextsharp-5-polish-character
* PDFA http://www.codeproject.com/Questions/661704/Create-pdf-A-using-itextsharp
* Images http://stackoverflow.com/questions/15896581/make-a-pdf-conforming-pdf-a-with-only-images-using-itextsharp
*/
[TestClass]
public class UnitTest1
{
/*
* IMPORTANT: Restrictions with html usage of tags and attributes
* 1. Dont use * <head> <title>Sklep</title> </head>, because title is rendered to the page
*/
// Test cases
static string contents = "<html><body style=\"font-family:arial unicode ms;font-size: 8px;\"><p style=\"text-align: center;\"> Davčna številka dolžnika: 74605968<br /> </p><table> <tr> <td><b>\u0160t. sklepa: 88711501</b></td> <td style=\"text-align: right;\">Davčna številka dolžnika: 74605968</td> </tr> </table> <br/><img src=\"http://img.rtvslo.si/_static/images/rtvslo_mmc_logo.png\" /></body></html>";
//static string contents = "<html><body style=\"font-family:arial unicode ms;font-size: 8px;\"><p style=\"text-align: center;\"> Davčna številka dolžnika: 74605968<br /> </p><table> <tr> <td><b>\u0160t. sklepa: 88711501</b></td> <td style=\"text-align: right;\">Davčna številka dolžnika: 74605968</td> </tr> </table> <br/></body></html>";
//[TestMethod]
public void CreatePdfHtml()
{
createPDF(contents, true);
}
private void createPDF(string html, bool isPdfa)
{
TextReader reader = new StringReader(html);
Document document = new Document(PageSize.A4, 30, 30, 30, 30);
HTMLWorker worker = new HTMLWorker(document);
PdfWriter writer;
if (isPdfa)
{
//set conformity level
writer = PdfAWriter.GetInstance(document, new FileStream(#"c:\temp\testA.pdf", FileMode.Create), PdfAConformanceLevel.PDF_A_1B);
//set pdf version
writer.SetPdfVersion(PdfAWriter.PDF_VERSION_1_4);
// Create XMP metadata. It's a PDF/A requirement.
writer.CreateXmpMetadata();
}
else
{
writer = PdfWriter.GetInstance(document, new FileStream(#"c:\temp\test.pdf", FileMode.Create));
}
document.Open();
if (isPdfa) // document should be opend, or it will fail
{
// Set output intent for uncalibrated color space. PDF/A requirement.
ICC_Profile icc = ICC_Profile.GetInstance(Environment.GetEnvironmentVariable("SystemRoot") + #"\System32\spool\drivers\color\sRGB Color Space Profile.icm");
writer.SetOutputIntents("Custom", "", "http://www.color.org", "sRGB IEC61966-2.1", icc);
}
//register font used in html
FontFactory.Register(Environment.GetEnvironmentVariable("SystemRoot") + "\\Fonts\\ARIALUNI.TTF", "arial unicode ms");
//adding custom style attributes to html specific tasks. Can be used instead of css
//this one is a must fopr display of utf8 language specific characters (čćžđpš)
iTextSharp.text.html.simpleparser.StyleSheet ST = new iTextSharp.text.html.simpleparser.StyleSheet();
ST.LoadTagStyle("body", "encoding", "Identity-H");
worker.SetStyleSheet(ST);
worker.StartDocument();
worker.Parse(reader);
worker.EndDocument();
worker.Close();
document.Close();
}
}
}