Extract image from pdf using Itext - c#

I have been using ITEXT functions to read simple text from the pdf file but is it possible to read image from the PDF file using ITEXT in C#

you can try something like this...
using iTextSharp.text;
using iTextSharp.text.pdf;
public static void ExtractImagesFromPDF(string sourcePdf, string outputPath)
{
// NOTE: This will only get the first image it finds per page.
PdfReader pdf = new PdfReader(sourcePdf);
RandomAccessFileOrArray raf = new iTextSharp.text.pdf.RandomAccessFileOrArray(sourcePdf);
try
{
for (int pageNumber = 1; pageNumber <= pdf.NumberOfPages; pageNumber++)
{
PdfDictionary pg = pdf.GetPageN(pageNumber);
PdfDictionary res =
(PdfDictionary)PdfReader.GetPdfObject(pg.Get(PdfName.RESOURCES));
PdfDictionary xobj =
(PdfDictionary)PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT));
if (xobj != null)
{
foreach (PdfName name in xobj.Keys)
{
PdfObject obj = xobj.Get(name);
if (obj.IsIndirect())
{
PdfDictionary tg = (PdfDictionary)PdfReader.GetPdfObject(obj);
PdfName type =
(PdfName)PdfReader.GetPdfObject(tg.Get(PdfName.SUBTYPE));
if (PdfName.IMAGE.Equals(type))
{
int XrefIndex = Convert.ToInt32(((PRIndirectReference)obj).Number.ToString(System.Globalization.CultureInfo.InvariantCulture));
PdfObject pdfObj = pdf.GetPdfObject(XrefIndex);
PdfStream pdfStrem = (PdfStream)pdfObj;
byte[] bytes = PdfReader.GetStreamBytesRaw((PRStream)pdfStrem);
if ((bytes != null))
{
using (System.IO.MemoryStream memStream = new System.IO.MemoryStream(bytes))
{
memStream.Position = 0;
System.Drawing.Image img = System.Drawing.Image.FromStream(memStream);
// must save the file while stream is open.
if (!Directory.Exists(outputPath))
Directory.CreateDirectory(outputPath);
string path = Path.Combine(outputPath, String.Format(#"{0}.jpg", pageNumber));
System.Drawing.Imaging.EncoderParameters parms = new System.Drawing.Imaging.EncoderParameters(1);
parms.Param[0] = new System.Drawing.Imaging.EncoderParameter(System.Drawing.Imaging.Encoder.Compression, 0);
// GetImageEncoder is found below this method
System.Drawing.Imaging.ImageCodecInfo jpegEncoder = GetImageEncoder("JPEG");
img.Save(path, jpegEncoder, parms);
break;
}
}
}
}
}
}
}
}
catch
{
throw;
}
finally
{
pdf.Close();
}
}
#endregion
#region GetImageEncoder
public static System.Drawing.Imaging.ImageCodecInfo GetImageEncoder(string imageType)
{
imageType = imageType.ToUpperInvariant();
foreach (ImageCodecInfo info in ImageCodecInfo.GetImageEncoders())
{
if (info.FormatDescription == imageType)
{
return info;
}
}
return null;
}
#endregion
I hope it will helps you....

Hi this is not C# but my code in Java I hope you can use this to extract images in C#
public ByteArrayOutputStream extractImages(byte[] pdf) throws IOException{
PdfReader reader = new PdfReader(pdf);
PdfReaderContentParser parser = new PdfReaderContentParser(reader);
ByteArrayOutputStream baos = new ByteArrayOutputStream();
ZipOutputStream zip = new ZipOutputStream(baos);
MyImageRenderer listener = new MyImageRenderer(zip);
for(int i=1;i<=reader.getNumberOfPages();i++){
parser.processContent(i, listener);
}
zip=listener.getZip();
zip.close();
return baos;
}
MyImageRenderer is a class that implements the RendererListener interface here's the method I wrote for rendering the images.
public void renderImage(ImageRenderInfo renderInfo) {
try {
PdfImageObject image = renderInfo.getImage();
if (image == null)
return;
ZipEntry entry = new ZipEntry(String.format(img, renderInfo
.getRef().getNumber(), image.getFileType()));
System.out.println(image.getFileType());
zip.putNextEntry(entry);
zip.write(image.getImageAsBytes());
zip.closeEntry();
} catch (IOException ioex) {
ioex.printStackTrace();
}
}
I know this code is in Java but it's to give you a general idea

Related

How do I delete/replace an image in a PDF file without breaking the file, using iTextsharp and C#

I'm trying to insert an image with id into a PDF document and allow to replace it later with another image.
My process is as follows:
Get an image from the client (with a unique ID).
Try to find an existing image with the same ID, in the PDF document.
If I find an existing image, try to delete it and put the new image instead, or try to replace the existing image with the new one. (tried both).
If I don't find an existing image, insert the image in a position I choose.
I use code from Bruno Lowagie book:
Replacing
Adding image
The problem is that whenever I delete an existing image or replace it my document gets corrupted. What am I doing wrong?
This is the code:
public static bool PdfInsertSignature(string path, string fileName, string signatureName, byte[] imageBytes)
{
bool resultOk = true;
string tmpFilename = string.Concat("tmp_", Guid.NewGuid().ToString(), ".pdf");
// get file, copy to new file with signature
using (Stream inputPdfStream = new FileStream(path + fileName, FileMode.Open, FileAccess.Read, FileShare.Read))
using (Stream outputPdfStream = new FileStream(path + tmpFilename, FileMode.Create, FileAccess.Write, FileShare.None))
{
using (var reader = new PdfReader(inputPdfStream))
using (PdfStamper stamper = new PdfStamper(reader, outputPdfStream, '\0', false))
{
var img = System.Drawing.Image.FromStream(new MemoryStream(imageBytes));
Image image = Image.GetInstance(img, BaseColor.WHITE);
img.Dispose();
var positions = stamper.AcroFields.GetFieldPositions(signatureName)[0];
if (positions != null)
{
//DeleteExistingSignatureImage(reader, stamper, signatureName);
image.SetAbsolutePosition(positions.position.Left + 20, positions.position.Top - 15);
image.ScalePercent(0.2f * 100);
image.BorderWidth = 0;
PdfImage pdfImg = new PdfImage(image, "", null);
pdfImg.Put(new PdfName("ITXT_SigImageId"), new PdfName(signatureName + "_img"));
if (!ReplaceImage(reader, stamper, signatureName, pdfImg))
{
PdfIndirectObject objRef = stamper.Writer.AddToBody(pdfImg);
image.DirectReference = objRef.IndirectReference;
PdfContentByte pdfContentByte = stamper.GetOverContent(positions.page);
pdfContentByte.AddImage(image);
}
}
else
{
resultOk = false;
logger.Error($"No matching Signature found for signatureName: {signatureName} in fileName: {fileName}.");
}
}
}
if (resultOk)
{
// delete old file and rename new file to old file's name
File.Delete(path + fileName);
File.Move(path + tmpFilename, path + fileName);
}
else
{
File.Delete(path + tmpFilename);
}
return resultOk;
}
private static bool ReplaceImage(PdfReader reader, PdfStamper stamper, string signatureName, PdfStream newImgStream)
{
PdfName key = new PdfName("ITXT_SigImageId");
PdfName value = new PdfName(signatureName + "_img");
PdfObject obj;
PRStream stream;
for (int i = 1; i < reader.XrefSize; i++)
{
obj = reader.GetPdfObject(i);
if (obj == null || !obj.IsStream())
{
continue;
}
stream = (PRStream)obj;
PdfObject pdfSubtype = stream.Get(PdfName.SUBTYPE);
if (pdfSubtype != null && pdfSubtype.ToString().Equals(PdfName.IMAGE.ToString()))
{
var streamVal = stream.Get(key);
if (streamVal != null && value.Equals(streamVal))
{
stream.Clear();
var ms = new MemoryStream();
stream.WriteContent(ms);
stream.SetData(ms.ToArray(), false);
foreach (PdfName name in newImgStream.Keys)
{
stream.Put(name, stream.Get(name));
}
return true;
}
}
}
return false;
}
private static void DeleteExistingSignatureImage(PdfReader reader, PdfStamper stamper, string signatureName)
{
PdfName key = new PdfName("ITXT_SigImageId");
PdfName value = new PdfName(signatureName + "_img");
PdfObject obj;
PRStream stream;
for (int i = 1; i < reader.XrefSize; i++)
{
obj = reader.GetPdfObject(i);
if (obj == null || !obj.IsStream())
{
continue;
}
stream = (PRStream)obj;
PdfObject pdfSubtype = stream.Get(PdfName.SUBTYPE);
if (pdfSubtype != null && pdfSubtype.ToString().Equals(PdfName.IMAGE.ToString()))
{
var streamVal = stream.Get(key);
if (streamVal != null && value.Equals(streamVal))
{
stream.Clear();
PdfReader.KillIndirect(stream);
//PdfReader.KillIndirect(obj);
//reader.RemoveUnusedObjects();
}
}
}
}
The purpose of signing a PDF file is to prevent further changes without notice.
You need to sign the document after you swap the image, or it will be corrupted.
Just do make it easier to find:
This is Solution provided by amira.
This is the code i've used to replace a 'ButtonField' on my PDF template with a signature image :
string TempStampPath = Server.MapPath(TempPath + "BookingConfirmation.pdf");
PdfReader pdfReader = new PdfReader(TempStampPath);
PdfStamper pdfStamper = new PdfStamper(pdfReader, new FileStream(LocalFileName, FileMode.Create));
AcroFields pdfFormFields = pdfStamper.AcroFields;
try
{
pdfFormFields.SetField("NameSurname", NameSurname);
pdfFormFields.SetField("IdNumber", IDNumber);
pdfFormFields.SetField("CourseName", CourseName);
pdfFormFields.SetField("Location", Venue);
pdfFormFields.SetField("DateCompleted", CourseDate);
pdfFormFields.SetField("FacilitatorName", Facilitator);
try
{
iTextSharp.text.Image signature = iTextSharp.text.Image.GetInstance(image, System.Drawing.Imaging.ImageFormat.Png);
PushbuttonField ad = pdfStamper.AcroFields.GetNewPushbuttonFromField("btnFacilitatorSignature");
ad.Layout = PushbuttonField.LAYOUT_ICON_ONLY;
ad.ProportionalIcon = true;
ad.Image = signature;
ad.BackgroundColor = iTextSharp.text.BaseColor.WHITE;
pdfFormFields.ReplacePushbuttonField("btnFacilitatorSignature", ad.Field);
}
catch (Exception ex)
{ }
pdfStamper.FormFlattening = true;
pdfStamper.Close();
pdfStamper.Dispose();
pdfReader.Close();
}
catch (Exception ex)
{
pdfStamper.Close();
pdfStamper.Dispose();
pdfReader.Close();
}

ITextsharp: Error reading a pdf file in Byte[] content (PdfReader)

I'm trying to merge several PDFs into a single file through a list that contains their content in byte[]. When opening a document from the Byte[] list with PdfReader, the program launches the following exception: "the document has no pages". When I review the contents of the Byte[] list there are complete, but the exception is always launched.
I try to download the content of that single page separately and the generated document launches error when opening it. The division of the pdf does well because it generates each document in physical and makes it perfect for each page of the PDF.
I appreciate your help or opinions in this situation.
This is the code I use to split and merge documents:
public List<byte[]> SplitPDF(byte[] contentPdf)
{
try
{
var listBythe = new List<byte[]>();
PdfImportedPage page = null;
PdfCopy PdfCopy = null;
PdfReader reader = new PdfReader(contentPdf);
for (int numPage = 1; numPage <= reader.NumberOfPages; numPage++)
{
Document doc = new Document(PageSize.LETTER);
var mStream = new MemoryStream();
PdfCopy = new PdfCopy(doc, mStream);
doc.Open();
page = PdfCopy.GetImportedPage(reader, numPage);
PdfCopy.AddPage(page);
listBythe.Add(mStream.ToArray());
doc.Close();
}
MergePdfToPage(listBythe);
return listBythe;
}
catch (Exception ex)
{
throw ex;
}
}
private byte[] MergePdfToPage(List<byte[]>contentPage)
{
byte[] docPdfByte = null;
var ms = new MemoryStream();
using (Document doc = new Document(PageSize.LETTER))
{
PdfCopy copy = new PdfCopy(doc, ms);
doc.Open();
var num = doc.PageNumber;
foreach (var file in contentPage.ToArray())
{
using (var reader = new PdfReader(file))
{
copy.AddDocument(reader);
}
}
doc.Close();
docPdfByte = ms.ToArray();
}
return docPdfByte;
In your loop you do
Document doc = new Document(PageSize.LETTER);
var mStream = new MemoryStream();
PdfCopy = new PdfCopy(doc, mStream);
doc.Open();
page = PdfCopy.GetImportedPage(reader, numPage);
PdfCopy.AddPage(page);
listBythe.Add(mStream.ToArray());
doc.Close();
In particular you retrieve the mStream bytes before closing doc. But before doc is closed, the pdf is incomplete in mStream!
To get a complete pdf from mStream, please change the order of instructions an do
Document doc = new Document(PageSize.LETTER);
var mStream = new MemoryStream();
PdfCopy = new PdfCopy(doc, mStream);
doc.Open();
page = PdfCopy.GetImportedPage(reader, numPage);
PdfCopy.AddPage(page);
doc.Close();
listBythe.Add(mStream.ToArray());
instead.
I created something for you, hopefully it will work as well as it did for me.
Class :
public class PDFFactory
{
public PDFFactory()
{
PdfDocument = new Document(iTextSharp.text.PageSize.A4, 65, 65, 60, 60);
}
private Document _pdfDocument;
public Document PdfDocument
{
get
{
return _pdfDocument;
}
set
{
_pdfDocument = value;
}
}
private MemoryStream _pdfMemoryStream;
public MemoryStream PDFMemoryStream
{
get
{
return _pdfMemoryStream;
}
set
{
_pdfMemoryStream = value;
}
}
private string _pdfBase64;
public string PDFBase64
{
get
{
if (this.DocumentClosed)
return _pdfBase64;
else
return null;
}
set
{
_pdfBase64 = value;
}
}
private byte[] _pdfBytes;
public byte[] PDFBytes
{
get
{
if (this.DocumentClosed)
return _pdfBytes;
else
return null;
}
set
{
_pdfBytes = value;
}
}
public byte[] GetPDFBytes()
{
PDFDocument.Close();
return PDFMemoryStream.GetBuffer();
}
public void closeDocument()
{
PDFDocument.Close();
PDFBase64 = Convert.ToBase64String(this.PDFMemoryStream.GetBuffer());
PDFBytes = this.PDFMemoryStream.GetBuffer();
}
}
Service:
public byte[] ()
{
PDFFactory pdf_1 = new PDFFactory();
PDFFactory pdf_2 = new PDFFactory();
List<byte[]> sourceFiles = new List<byte[]>();
sourceFiles.Add(pdf_1.GetPDFBytes);
sourceFiles.Add(pdf_2.GetPDFBytes);
PDFFactory pdfFinal = new PDFFactory();
for (int fileCounter = 0; fileCounter <= sourceFiles.Count - 1; fileCounter += 1)
{
PdfReader reader2 = new PdfReader(sourceFiles[fileCounter]);
int numberOfPages = reader2.NumberOfPages;
for (int currentPageIndex = 1; currentPageIndex <= numberOfPages; currentPageIndex++)
{
// Determine page size for the current page
pdfFinal.PDFDocument.SetPageSize(reader2.GetPageSizeWithRotation(currentPageIndex));
// Create page
pdfFinal.PDFDocument.NewPage();
PdfImportedPage importedPage = pdfFinal.PDFWriter.GetImportedPage(reader2, currentPageIndex);
// Determine page orientation
int pageOrientation = reader2.GetPageRotation(currentPageIndex);
if ((pageOrientation == 90) || (pageOrientation == 270))
pdfFinal.PDFWriter.DirectContent.AddTemplate(importedPage, 0, -1.0F, 1.0F, 0, 0, reader2.GetPageSizeWithRotation(currentPageIndex).Height);
else
pdfFinal.PDFWriter.DirectContent.AddTemplate(importedPage, 1.0F, 0, 0, 1.0F, 0, 0);
}
}
pdfFinal.closeDocument();
return pdfFinal.PDFBytes;
}
Let me know if it helped.

Image extract from pdf using itextsharp in c#

Hello i am using
public void ExtractImagesFromPDF(string sourcePdf, string outputPath)
{
// NOTE: This will only get the first image it finds per page.
PdfReader pdf = new PdfReader(sourcePdf);
RandomAccessFileOrArray raf = new iTextSharp.text.pdf.RandomAccessFileOrArray(sourcePdf);
try
{
for (int pageNumber = 1; pageNumber <= pdf.NumberOfPages; pageNumber++)
{
Response.Write("Page " + pageNumber.ToString());
PdfDictionary pg = pdf.GetPageN(pageNumber);
// recursively search pages, forms and groups for images.
PdfObject obj = FindImageInPDFDictionary(pg);
if (obj != null)
{
int XrefIndex = Convert.ToInt32(((PRIndirectReference)obj).Number.ToString(System.Globalization.CultureInfo.InvariantCulture));
PdfObject pdfObj = pdf.GetPdfObject(XrefIndex);
PdfStream pdfStrem = (PdfStream)pdfObj;
bytes = PdfReader.GetStreamBytesRaw((PRStream)pdfStrem);
if ((bytes != null))
{
using (System.IO.MemoryStream memStream = new System.IO.MemoryStream(bytes))
{
memStream.Seek(0, SeekOrigin.Begin);
memStream.Position = 0;
System.Drawing.Image img = System.Drawing.Image.FromStream(memStream);
// must save the file while stream is open.
if (!Directory.Exists(outputPath))
Directory.CreateDirectory(outputPath);
string path = System.IO.Path.Combine(outputPath, String.Format(#"{0}.jpg", pageNumber));
System.Drawing.Imaging.EncoderParameters parms = new System.Drawing.Imaging.EncoderParameters(1);
parms.Param[0] = new System.Drawing.Imaging.EncoderParameter(System.Drawing.Imaging.Encoder.Compression, 0);
System.Drawing.Imaging.ImageCodecInfo jpegEncoder = GetEncoder(System.Drawing.Imaging.ImageFormat.Jpeg);
img.Save(path, jpegEncoder, parms);
}
}
}
else
{
}
}
}
catch
{
throw;
}
finally
{
pdf.Close();
raf.Close();
}
}
code but does not extract my pdf file , parameter is not valid error and i am using third party dll use for extract image so where get all images from pdf but i want to use itextsharp dll , please check and help me.

Image is not created from stream

i am trying to extract image from pdf using this code
#region ExtractImagesFromPDF
public static void ExtractImagesFromPDF(string sourcePdf, string outputPath)
{
// NOTE: This will only get the first image it finds per page.
PdfReader pdf = new PdfReader(sourcePdf);
RandomAccessFileOrArray raf = new iTextSharp.text.pdf.RandomAccessFileOrArray(sourcePdf);
try
{
for (int pageNumber = 1; pageNumber <= pdf.NumberOfPages; pageNumber++)
{
PdfDictionary pg = pdf.GetPageN(pageNumber);
PdfDictionary res =
(PdfDictionary)PdfReader.GetPdfObject(pg.Get(PdfName.RESOURCES));
PdfDictionary xobj =
(PdfDictionary)PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT));
if (xobj != null)
{
foreach (PdfName name in xobj.Keys)
{
PdfObject obj = xobj.Get(name);
if (obj.IsIndirect())
{
PdfDictionary tg = (PdfDictionary)PdfReader.GetPdfObject(obj);
PdfName type =
(PdfName)PdfReader.GetPdfObject(tg.Get(PdfName.SUBTYPE));
if (PdfName.IMAGE.Equals(type))
{
int XrefIndex = Convert.ToInt32(((PRIndirectReference)obj).Number.ToString(System.Globalization.CultureInfo.InvariantCulture));
PdfObject pdfObj = pdf.GetPdfObject(XrefIndex);
PdfStream pdfStrem = (PdfStream)pdfObj;
byte[] bytes = PdfReader.GetStreamBytesRaw((PRStream)pdfStrem);
if ((bytes != null))
{
using (System.IO.MemoryStream memStream = new System.IO.MemoryStream(bytes))
{
memStream.Position = 0;
System.Drawing.Image img = System.Drawing.Image.FromStream(memStream);
// must save the file while stream is open.
if (!Directory.Exists(outputPath))
Directory.CreateDirectory(outputPath);
string path = Path.Combine(outputPath, String.Format(#"{0}.jpg", pageNumber));
System.Drawing.Imaging.EncoderParameters parms = new System.Drawing.Imaging.EncoderParameters(1);
parms.Param[0] = new System.Drawing.Imaging.EncoderParameter(System.Drawing.Imaging.Encoder.Compression, 0);
// GetImageEncoder is found below this method
System.Drawing.Imaging.ImageCodecInfo jpegEncoder = GetImageEncoder("JPEG");
img.Save(path, jpegEncoder, parms);
break;
}
}
}
}
}
}
}
}
catch
{
throw;
}
finally
{
pdf.Close();
}
}
#endregion
its all going right but the line
System.Drawing.Image img = System.Drawing.Image.FromStream(memStream);
giving an error “Parameter not valid”
i cant getting whats the problem is the stream is not a image stream then why iTextSharp reading it as image.
please anyone help me out

Extract Image from a particular page in PDF

I want to extract an Image from a PDF file. I tried with the following code and it extracted a jpeg Image perfectly from the PDF. The problem is how to extract image from a particular page e.g. Page 1 or from some other page. I don't want to read the whole PDF to search for the Image.
Any suggestions?
Code to extract Image:
private void List<System.Drawing.Image> ExtractImages(String PDFSourcePath)
{
List<System.Drawing.Image> ImgList = new List<System.Drawing.Image>();
iTextSharp.text.pdf.RandomAccessFileOrArray RAFObj = null;
iTextSharp.text.pdf.PdfReader PDFReaderObj = null;
iTextSharp.text.pdf.PdfObject PDFObj = null;
iTextSharp.text.pdf.PdfStream PDFStremObj = null;
try
{
RAFObj = new iTextSharp.text.pdf.RandomAccessFileOrArray(PDFSourcePath);
PDFReaderObj = new iTextSharp.text.pdf.PdfReader(RAFObj, null);
for (int i = 0; i <= PDFReaderObj.XrefSize - 1; i++)
{
PDFObj = PDFReaderObj.GetPdfObject(i);
if ((PDFObj != null) && PDFObj.IsStream())
{
PDFStremObj = (iTextSharp.text.pdf.PdfStream)PDFObj;
iTextSharp.text.pdf.PdfObject subtype = PDFStremObj.Get(iTextSharp.text.pdf.PdfName.SUBTYPE);
if ((subtype != null) && subtype.ToString() == iTextSharp.text.pdf.PdfName.IMAGE.ToString())
{
byte[] bytes = iTextSharp.text.pdf.PdfReader.GetStreamBytesRaw((iTextSharp.text.pdf.PRStream)PDFStremObj);
if ((bytes != null))
{
try
{
System.IO.MemoryStream MS = new System.IO.MemoryStream(bytes);
MS.Position = 0;
System.Drawing.Image ImgPDF = System.Drawing.Image.FromStream(MS);
pictureBox1.Image = ImgPDF;
MS.Close();
MS.Flush();
}
catch (Exception)
{
}
}
}
}
}
PDFReaderObj.Close();
}
catch (Exception ex)
{
throw new Exception(ex.Message);
}
}
I don't have iTextSharp 4.0 available currently so this code targets 5.2 but it should work just fine for the older one, too. This code is an almost direct lift from this post here, so please see that post as well as responses for further questions. As I said in the comments above, your code is looking at all of the images from the document-perspective while the code that I linked to goes page-by-page.
Please read all of the comments in the other post, especially this one which explains that this ONLY works for JPG images. There's a lot of different types of images that PDF supports so unless you know that you're only dealing with JPGs you'll need to add a bunch of more code. See this post and this post for some hints.
string testFile = System.IO.Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.Desktop), "Doc1.pdf");
string outputPath = Environment.GetFolderPath(Environment.SpecialFolder.Desktop);
int pageNum = 1;
PdfReader pdf = new PdfReader(testFile);
PdfDictionary pg = pdf.GetPageN(pageNum);
PdfDictionary res = (PdfDictionary)PdfReader.GetPdfObject(pg.Get(PdfName.RESOURCES));
PdfDictionary xobj = (PdfDictionary)PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT));
if (xobj == null) { return; }
foreach (PdfName name in xobj.Keys) {
PdfObject obj = xobj.Get(name);
if (!obj.IsIndirect()) { continue; }
PdfDictionary tg = (PdfDictionary)PdfReader.GetPdfObject(obj);
PdfName type = (PdfName)PdfReader.GetPdfObject(tg.Get(PdfName.SUBTYPE));
if (!type.Equals(PdfName.IMAGE)) { continue; }
int XrefIndex = Convert.ToInt32(((PRIndirectReference)obj).Number.ToString(System.Globalization.CultureInfo.InvariantCulture));
PdfObject pdfObj = pdf.GetPdfObject(XrefIndex);
PdfStream pdfStrem = (PdfStream)pdfObj;
byte[] bytes = PdfReader.GetStreamBytesRaw((PRStream)pdfStrem);
if (bytes == null) { continue; }
using (System.IO.MemoryStream memStream = new System.IO.MemoryStream(bytes)) {
memStream.Position = 0;
System.Drawing.Image img = System.Drawing.Image.FromStream(memStream);
if (!Directory.Exists(outputPath))
Directory.CreateDirectory(outputPath);
string path = Path.Combine(outputPath, String.Format(#"{0}.jpg", pageNum));
System.Drawing.Imaging.EncoderParameters parms = new System.Drawing.Imaging.EncoderParameters(1);
parms.Param[0] = new System.Drawing.Imaging.EncoderParameter(System.Drawing.Imaging.Encoder.Compression, 0);
var jpegEncoder = ImageCodecInfo.GetImageEncoders().ToList().Find(x => x.FormatID == ImageFormat.Jpeg.Guid);
img.Save(path, jpegEncoder, parms);
}
}
The following is the code which I am using to extract images from PDF. It works completely fine for me.
// Required: iTextSharp.dll
using System.Drawing;
using System.Drawing.Imaging;
using System.IO;
using iTextSharp.text.pdf.parser;
using Dotnet = System.Drawing.Image;
using iTextSharp.text.pdf;
namespace PDF_Parsing {
partial class ExtractPdfImage
{
string imgPath = #"c:\extractedImg.png";
private void ExtractImage(string pdfFile)
{
const int pageNumber = 1;
PdfReader pdf = new PdfReader(pdfFile);
PdfDictionary pg = pdf.GetPageN(pageNumber);
PdfDictionary res = (PdfDictionary)PdfReader.GetPdfObject(pg.Get(PdfName.RESOURCES));
PdfDictionary xobj = (PdfDictionary)PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT));
foreach (PdfName name in xobj.Keys)
{
PdfObject obj = xobj.Get(name);
if (obj.IsIndirect())
{
PdfDictionary tg = (PdfDictionary)PdfReader.GetPdfObject(obj);
string width = tg.Get(PdfName.WIDTH).ToString();
string height = tg.Get(PdfName.HEIGHT).ToString();
ImageRenderInfo imgRI = ImageRenderInfo.CreateForXObject(new Matrix(float.Parse(width), float.Parse(height)),
(PRIndirectReference)obj, tg);
RenderImage(imgRI);
}
}
}
private void RenderImage(ImageRenderInfo renderInfo)
{
PdfImageObject image = renderInfo.GetImage();
using (Dotnet dotnetImg = image.GetDrawingImage())
{
if (dotnetImg != null)
{
using (MemoryStream ms = new MemoryStream())
{
dotnetImg.Save(ms, ImageFormat.Tiff);
Bitmap d = new Bitmap(dotnetImg);
d.Save(imgPath);
}
}
}
}
}
}
The following code works fine to extract image from particular page.
using System.Drawing;
using System.Drawing.Imaging;
using System.IO;
using iTextSharp.text.pdf.parser;
using Dotnet = System.Drawing.Image;
using iTextSharp.text.pdf;
namespace PDF_Parsing
{
partial class PDF_ImgExtraction
{
string imgPath;
private void ExtractImage(string pdfFile)
{
const int pageNumber = 1;//Page number to extract the image from
PdfReader pdf = new PdfReader(pdfFile);
PdfDictionary pg = pdf.GetPageN(pageNumber);
PdfDictionary res = (PdfDictionary)PdfReader.GetPdfObject(pg.Get(PdfName.RESOURCES));
PdfDictionary xobj = (PdfDictionary)PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT));
foreach (PdfName name in xobj.Keys)
{
PdfObject obj = xobj.Get(name);
if (obj.IsIndirect())
{
PdfDictionary tg = (PdfDictionary)PdfReader.GetPdfObject(obj);
string width = tg.Get(PdfName.WIDTH).ToString();
string height = tg.Get(PdfName.HEIGHT).ToString();
ImageRenderInfo imgRI = ImageRenderInfo.CreateForXObject(new Matrix(float.Parse(width), float.Parse(height)), (PRIndirectReference)obj, tg);
RenderImage(imgRI);
}
}
}
private void RenderImage(ImageRenderInfo renderInfo)
{
PdfImageObject image = renderInfo.GetImage();
using (Dotnet dotnetImg = image.GetDrawingImage())
{
if (dotnetImg != null)
{
using (MemoryStream ms = new MemoryStream())
{
dotnetImg.Save(ms, ImageFormat.Tiff);
Bitmap d = new Bitmap(dotnetImg);
d.Save(imgPath);
}
}
}
}
}
}

Categories