I am working with a SQL DB that stores Excel files (along with other file types such as PDF) as binary data. I use the following code to extract these files onto the file system.
The Problem:
PDF files come out just fine. But for Excel, the files get created and when I try to open them, they crash or just give me garbage text.
I am using this code from the previous guy who wrote this app for retrieving files. This code uses OpenMcdf which I don't fully understand because I couldn't find useful online documentation for it.
//execution starts here
public override void SaveToDisk()
{
byte[] keys = { (byte)0xd0, (byte)0xcf };
//Searches through m_RawOleObject for combination of 'keys'
int offset = Utils.SearchBytes(m_RawOleObject, keys); //returns '60' in case of Excel and '66' in case of Pdf
//m_RawOleOjbect contains the data from the sqlDataReader (the binary data from the column.)
m_RawOleObject = strip(m_RawOleObject, offset);
MemoryStream ms = new MemoryStream(m_RawOleObject);
CompoundFile cf = new CompoundFile(ms);
GetStorageByName(cf.RootStorage, m_StorageName);
if (Storage != null)
{
if (Storage is CFStream)
{
m_RawOleObject = (Storage as CFStream).GetData();
}
m_filename = System.IO.Path.Combine(STOREPATH, Utils.CombineFilenameWithExtension(Filename, m_extension));
WriteToFile(m_filename, m_RawOleObject);
}
}
protected void WriteToFile(string fn, byte[] obj)
{
fn = GetNextAvailableFilename(fn, 0);
FileStream fs = new FileStream(fn, FileMode.Create);
BinaryWriter writer = new BinaryWriter(fs);
writer.Write(obj);
writer.Close();
fs.Close();
fs.Dispose();
}
protected void GetStorageByName(CFStorage cfs, string name)
{
VisitedEntryAction va = delegate(CFItem target)
{
if (target is CFStorage)
{
GetStorageByName((CFStorage)target, name);
}
else
{
if (target.Name == name)
Storage = target;
}
};
//Visit NON-recursively (first level only)
cfs.VisitEntries(va, false);
}
Any ideas what's happening here? why are the excel corrupted? I couldn't find a lot online despite hours of search!
any ideas, suggestions, or solutions will be appreciated.
Thanks
Change your SaveToDisk logic as follows:
public override void SaveToDisk()
{
byte[] keys = { (byte)0xd0, (byte)0xcf, (byte)0x11, (byte)0xe0, (byte)0xa1, (byte)0xb1, (byte)0x1a, (byte)0xe1 };
int offset = Utils.SearchBytes(m_RawOleObject, keys);
using (MemoryStream ms = new MemoryStream(strip(m_RawOleObject, offset)))
{
CompoundFile cf = new CompoundFile(ms, UpdateMode.ReadOnly, true, true);
m_filename = GetNextAvailableFilename(System.IO.Path.Combine(STOREPATH, Utils.CombineFilenameWithExtension(Filename, m_extension)), 0);
using (var fs = new FileStream(m_filename, FileMode.Create))
{
cf.Save(fs);
cf.Close();
}
}
//Workbook would be saved as hidden in previous step
Microsoft.Office.Interop.Excel.Application xlApp = null;
Microsoft.Office.Interop.Excel.Workbook xlWb = null;
try
{
xlApp = new Microsoft.Office.Interop.Excel.Application();
xlWb = xlApp.Workbooks.Open(m_filename);
xlWb.CheckCompatibility = false;
foreach (Window wn in xlApp.Windows)
{
wn.Visible = true;
}
xlWb.Save();
xlWb.Close();
}
catch (Exception e)
{
//TODO: Log error and continue
}
finally
{
if (xlWb != null)
Marshal.ReleaseComObject(xlWb);
if (xlApp != null)
Marshal.ReleaseComObject(xlApp);
xlApp = null;
}
}
Related
Due to some issues by running .xlsm files over the network it was decided not to use VBA anymore and to develop standalone apps that will edit regular excel files.
Since I have a some C# and Visual Studio knowledge I decided to use those tools. Since Iterop.Excel is really slow I decided to use SpreadsheetLight.
Everything went smooth during while reading and analyzing data but after I added some records and save the file the file become corrupted: when trying to open with excel I got the following message:
"We found A problem with some content. Do you want us to recover as much as we can? If you trust the source of this workbook, click yes". After click yes got the message that it cannot be recovered because is corrupt.
Even if I don't add any records and just save the file got corrupted.
The thing is that the file opens without any issues in OpenOffice, all the records are there.
Any help will be appreciated!
Below the class that implements the r/w of the excel file:
class SPREADSHEET_TOOLS
{
public string file_name;
public SLDocument doc;
public List<string> sheets;
MemoryStream ms;
public SPREADSHEET_TOOLS()
{
}
public bool init(string _file_name)
{
this.file_name = _file_name;
ms = new MemoryStream();
try
{
FileStream stream = File.Open(this.file_name, FileMode.Open);
this.doc = new SLDocument(stream);
this.sheets = doc.GetSheetNames();
stream.Close();
}
catch (IOException)
{
MessageBox.Show("Fisierul este deschis de un alt utilizator. Nu poate fi accesat!!!!");
return false;
}
return true;
}
public List<string>getUniqeRowValues(string sheet,int row)
{
List<string> values = new List<string>();
if (this.sheets.Contains(sheet))
{
this.doc.SelectWorksheet(sheet);
while (this.doc.GetCellValueAsString(row, 1) != "")
{
if (values.Count == 0)
{
values.Add(this.doc.GetCellValueAsString(row, 1));
}
else
{
if (!values.Contains(this.doc.GetCellValueAsString(row, 1)))
{
values.Add(this.doc.GetCellValueAsString(row, 1));
}
}
row++;
}
}
return values;
}
public List<string>getChildValues(string sheet, string parent, int row, int column_parent, int column_child)
{
List<string> values = new List<string>();
if (this.sheets.Contains(sheet))
{
this.doc.SelectWorksheet(sheet);
while (this.doc.GetCellValueAsString(row, column_parent) != "")
{
if (this.doc.GetCellValueAsString(row, column_parent) == parent)
{
values.Add(this.doc.GetCellValueAsString(row, column_child));
}
row++;
}
}
return values;
}
public int getLastRow(string sheet)
{
int row=0;
if (this.sheets.Contains(sheet))
{
this.doc.SelectWorksheet(sheet);
row = 1;
while (this.doc.GetCellValueAsString(row, 1) != "")
{
row++;
}
}
return row;
}
public bool writeRow(string[] data, string sheet,int row)
{
if (this.sheets.Contains(sheet))
{
this.doc.SelectWorksheet(sheet);
for (int i=0; i < data.Length; i++)
{
InlineString str = new InlineString();
//bool a = this.doc.SetCellValue(row,i+1,data[i]);
}
//this.doc.SaveAs(this.ms);
foreach (string s in this.sheets)
{
this.doc.SelectWorksheet(s);
}
this.doc.DocumentProperties.Creator = "CP";
this.doc.SaveAs("E:\\C-SHARP\\PONTAJ\\PONTAJ\\BUBU.XLSX");
MessageBox.Show("Saved!");
return true;
}
return false;
}
}
I also faced the same problem, Excel file gets corrupted after downloading.
So I have done some fixes and update SpreadSheetLight code to .NET 6.
You can download source code from here: https://github.com/bhavinvachhani403/SpreadSheetLight_Net6.0
I hope this will helps you to solve your problem.
I had the same error and I solved it by changing the version of DocumentFormat.OpenXml to version 2.5
We have a problem where our industrial equipments software's .XML settings files become blank, yet they still have the correct number of bytes.
I have a feeling it might be caused by the way the customers are shutting down the PC as it tends to happen after they've down a shutdown, isolate, and boot. The way I save the files is,
Serialize to %temp% file
Validate that the newly created file starts with <?xml
If the /backup folders version of the file is older than a day, copy the existing file to the /backup folder
Copy new file to overwrite existing file.
I thought maybe it's related to encoding, disk caching, Windows Update, or Windows Recovery.
Looking for ideas as I've spent two years chasing down why this is happening.
As per request, here is the code.
public static bool SerializeObjXml(object Object2Serialize, string FilePath, Type type, bool gzip = false)
{
if (!Path.IsPathRooted(FilePath))
FilePath = Path.Combine(ApplicationDir, FilePath);
bool isSuccess = false;
var tmpFile = Path.GetTempFileName();
try
{
for (int i = 0; i < 3; i++)
{
try
{
Directory.CreateDirectory(Path.GetDirectoryName(FilePath));
if (gzip)
{
using (var ms = new MemoryStream())
{
XmlSerializer bf = new XmlSerializer(type);
bf.Serialize(ms, Object2Serialize);
ms.Position = 0;
using (var fileStream = new BinaryWriter(File.Open(tmpFile, FileMode.Create)))
{
using (GZipStream gzipStream = new GZipStream(fileStream.BaseStream, CompressionMode.Compress))
{
byte[] buffer = new byte[4096];
int numRead;
while ((numRead = ms.Read(buffer, 0, buffer.Length)) != 0)
{
gzipStream.Write(buffer, 0, numRead);
}
}
}
}
if (!FileChecker.isGZip(tmpFile))
throw new XmlException("Failed to write valid XML file " + FilePath);
}
else
{
using (var fs = new StreamWriter(File.Open(tmpFile, FileMode.Create), Encoding.UTF8))
{
XmlSerializer bf = new XmlSerializer(type);
bf.Serialize(fs, Object2Serialize);
}
if (!FileChecker.isXML(tmpFile))
throw new XmlException("Failed to write valid XML file " + FilePath);
}
isSuccess = true;
return true;
}
catch (XmlException)
{
return false;
}
catch (System.IO.DriveNotFoundException) { continue; }
catch (System.IO.DirectoryNotFoundException) { continue; }
catch (System.IO.FileNotFoundException) { continue; }
catch (System.IO.IOException) { continue; }
}
}
finally
{
if (isSuccess)
{
lock (FilePath)
{
try
{
//Delete existing .bak file
if (File.Exists(FilePath + ".bak"))
{
File.SetAttributes(FilePath + ".bak", FileAttributes.Normal);
File.Delete(FilePath + ".bak");
}
}
catch { }
try
{
//Make copy of file as .bak
if (File.Exists(FilePath))
{
File.SetAttributes(FilePath, FileAttributes.Normal);
File.Copy(FilePath, FilePath + ".bak", true);
}
}
catch { }
try
{
//Copy the temp file to the target
File.Copy(tmpFile, FilePath, true);
//Delete .bak file if no error
if (File.Exists(FilePath + ".bak"))
File.Delete(FilePath + ".bak");
}
catch { }
}
}
try
{
//Delete the %temp% file
if (File.Exists(tmpFile))
File.Delete(tmpFile);
}
catch { }
}
return false;
}
public static class FileChecker
{
const string gzipSig = "1F-8B-08";
static string xmlSig = "EF-BB-BF";// <?x";
public static bool isGZip(string filepath)
{
return FileChecker.CheckSignature(filepath, (3, gzipSig)) != null;
}
public static bool isXML(string filepath)
{
return FileChecker.CheckSignature(filepath, (3, xmlSig)) != null;
}
public static bool isGZipOrXML(string filepath, out bool isGZip, out bool isXML)
{
var sig = FileChecker.CheckSignature(filepath, (3, gzipSig), (3, xmlSig));
isXML = (sig == xmlSig);
isGZip = (sig == gzipSig);
return isXML || isGZip;
}
public static string CheckSignature(string filepath, params (int signatureSize, string expectedSignature)[] pairs)
{
if (String.IsNullOrEmpty(filepath))
throw new ArgumentException("Must specify a filepath");
if (String.IsNullOrEmpty(pairs[0].expectedSignature))
throw new ArgumentException("Must specify a value for the expected file signature");
int signatureSize = 0;
foreach (var pair in pairs)
if (pair.signatureSize > signatureSize)
signatureSize = pair.signatureSize;
using (FileStream fs = new FileStream(filepath, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
{
if (fs.Length < signatureSize)
return null;
byte[] signature = new byte[signatureSize];
int bytesRequired = signatureSize;
int index = 0;
while (bytesRequired > 0)
{
int bytesRead = fs.Read(signature, index, bytesRequired);
bytesRequired -= bytesRead;
index += bytesRead;
}
foreach (var pair in pairs)
{
string actualSignature = BitConverter.ToString(signature, 0, pair.signatureSize);
if (actualSignature == pair.expectedSignature)
return actualSignature;
}
}
return null;
}
}
Using the operating system's move or copy file to overwrite an existing file is an atomic operation meaning the it wholly succeeds or doesn't and doesn't overlap other file operations.
Therefore what you have should work if that is how you are achieving step 4.
Copy new file to overwrite existing file.
If instead you are blanking out the existing file and re-writing the data I suspect that could be the the point of failure..
The issues while file space is being allocated the write is not occurring during shutdown, which leaves you when a file with bytes allocated without the data being flushed to disk.
During the OS shutdown, likely a ThreadAbortException is raised which triggers your finally block.
You can attempt to reproduce by calling Process.Start("shutdown", "-a") before your return statement but after you have set success = true.
I would suggest simplifying your code and have everything run inside of your try {} statement. This removes the possibility of having a state where success = true before your attempted your write to disk, which is then triggered in a finally statement trigged by a windows shutdown.
public static bool SerializeObjXml(
object Object2Serialize,
string FilePath,
Type type,
bool gzip = false)
{
if (!Path.IsPathRooted(FilePath))
FilePath = Path.Combine(ApplicationDir, FilePath);
Directory.CreateDirectory(FilePath);
for (int i = 0; i < 3; i++)
{
try
{
var tempFi = SerializeToXmlFile(Object2Serialize, type, gzip);
var fi = new FileInfo(FilePath);
if (fi.Exists)
fi.CopyTo(fi.FullName + ".bak", true);
tempFi.CopyTo(fi.FullName, true);
tempFi.Delete();
return true;
}
catch (Exception ex)
{
string message = $"[{DateTime.Now}] Error serializing file {FilePath}. {ex}";
File.WriteAllText(FilePath + ".log", message);
}
}
return false;
}
As a side note, you can simply use [Stream.CopyTo][1] and write directly to your temp file, without the need for intermediary streams or for manual buffer/byte read/write operations:
private static FileInfo SerializeToXmlFile(
object Object2Serialize,
Type type,
bool gzip)
{
var tmpFile = Path.GetTempFileName();
var tempFi = new FileInfo(tmpFile);
if (!gzip)
{
using (var fs = File.Open(tmpFile, FileMode.Create))
(new XmlSerializer(type)).Serialize(fs, Object2Serialize);
if (!FileChecker.isXML(tmpFile))
throw new Exception($"Failed to write valid XML file: {tmpFile}");
}
else
{
using (var fs = File.Open(tmpFile, FileMode.CreateNew))
using (var gz = new GZipStream(fs, CompressionMode.Compress))
(new XmlSerializer(type)).Serialize(fs, Object2Serialize);
if (!FileChecker.isGZip(tmpFile))
throw new Exception($"Failed to write valid XML gz file: {tmpFile}");
}
return tempFi;
}
I am using iTextSharp c# to extract images and its name from catalog pdf. I Am able to extract images from pdf, but struggling with extracting its corresponding image name as per the attached screenshot and save the file with that name. Please find the code below and let me know your suggestions.
Sample PDF: https://docdro.id/PwBsNR9
Code:
private static List<System.Drawing.Image> ExtractImages(String PDFSourcePath)
{
List<System.Drawing.Image> ImgList = new List<System.Drawing.Image>();
iTextSharp.text.pdf.RandomAccessFileOrArray RAFObj = null;
iTextSharp.text.pdf.PdfReader PDFReaderObj = null;
iTextSharp.text.pdf.PdfObject PDFObj = null;
iTextSharp.text.pdf.PdfStream PDFStremObj = null;
try
{
RAFObj = new iTextSharp.text.pdf.RandomAccessFileOrArray(PDFSourcePath);
PDFReaderObj = new iTextSharp.text.pdf.PdfReader(RAFObj, null);
for (int i = 0; i <= PDFReaderObj.XrefSize - 1; i++)
{
PDFObj = PDFReaderObj.GetPdfObject(i);
if ((PDFObj != null) && PDFObj.IsStream())
{
PDFStremObj = (iTextSharp.text.pdf.PdfStream)PDFObj;
iTextSharp.text.pdf.PdfObject subtype = PDFStremObj.Get(iTextSharp.text.pdf.PdfName.SUBTYPE);
if ((subtype != null) && subtype.ToString() == iTextSharp.text.pdf.PdfName.IMAGE.ToString())
{
}
if ((subtype != null) && subtype.ToString() == iTextSharp.text.pdf.PdfName.IMAGE.ToString())
{
try
{
iTextSharp.text.pdf.parser.PdfImageObject PdfImageObj =
new iTextSharp.text.pdf.parser.PdfImageObject((iTextSharp.text.pdf.PRStream)PDFStremObj);
System.Drawing.Image ImgPDF = PdfImageObj.GetDrawingImage();
ImgList.Add(ImgPDF);
}
catch (Exception)
{
}
}
}
}
PDFReaderObj.Close();
}
catch (Exception ex)
{
throw new Exception(ex.Message);
}
return ImgList;
}
Unfortunately the example PDF is not tagged. Thus, one has to otherwise try and associate title text and image, either by analyzing the location in respect to each other or by exploiting a pattern in the content streams.
In the case at hand analyzing the location in respect to each other is feasible as the title always is (at least partially) drawn on the matching image or is the text right beneath it. Thus, one could in a first pass extract the text with position from a page and in a second one the images, at the same time looking for a title in the previously extracted text in the image area or right beneath. Alternatively one could first extract images with position and size and then extract the text in these areas.
But there also is a certain pattern in the content streams: The titel is always drawn in a single text drawing instruction right after the corresponding image is drawn. Thus, one can also go ahead and in one pass extract images and the next drawn text as associated title.
Either approach can be implemented using the iText parser API. For example in case of the latter approach as follows: first, one implements a render listener that behaves as described, i.e. saves images and the following text:
internal class ImageWithTitleRenderListener : IRenderListener
{
int imageNumber = 0;
String format;
bool expectingTitle = false;
public ImageWithTitleRenderListener(String format)
{
this.format = format;
}
public void BeginTextBlock()
{ }
public void EndTextBlock()
{ }
public void RenderText(TextRenderInfo renderInfo)
{
if (expectingTitle)
{
expectingTitle = false;
File.WriteAllText(string.Format(format, imageNumber, "txt"), renderInfo.GetText());
}
}
public void RenderImage(ImageRenderInfo renderInfo)
{
imageNumber++;
expectingTitle = true;
PdfImageObject imageObject = renderInfo.GetImage();
if (imageObject == null)
{
Console.WriteLine("Image {0} could not be read.", imageNumber);
}
else
{
File.WriteAllBytes(string.Format(format, imageNumber, imageObject.GetFileType()), imageObject.GetImageAsBytes());
}
}
}
Then one parses the document pages using that render listener:
using (PdfReader reader = new PdfReader(#"EVERMOTION ARCHMODELS VOL.78.pdf"))
{
PdfReaderContentParser parser = new PdfReaderContentParser(reader);
ImageWithTitleRenderListener listener = new ImageWithTitleRenderListener(#"EVERMOTION ARCHMODELS VOL.78-{0:D3}.{1}");
for (var i = 1; i <= reader.NumberOfPages; i++)
{
parser.ProcessContent(i, listener);
}
}
I hope this would help.
I am doing this type of thing but if this would help.
// existing pdf path
PdfReader reader = new PdfReader(path);
PRStream pst;
PdfImageObject pio;
PdfObject po;
// number of objects in pdf document
int n = reader.XrefSize;
//FileStream fs = null;
// set image file location
//String path = "E:/";
for (int i = 0; i < n; i++)
{
// get the object at the index i in the objects collection
po = reader.GetPdfObject(i);
// object not found so continue
if (po == null || !po.IsStream())
continue;
//cast object to stream
pst = (PRStream)po;
//get the object type
PdfObject type = pst.Get(PdfName.SUBTYPE);
//check if the object is the image type object
if (type != null && type.ToString().Equals(PdfName.IMAGE.ToString()))
{
//get the image
pio = new PdfImageObject(pst);
// fs = new FileStream(path + "image" + i + ".jpg", FileMode.Create);
//read bytes of image in to an array
byte[] imgdata = pio.GetImageAsBytes();
try
{
Stream stream = new MemoryStream(imgdata);
FileStream fs = stream as FileStream;
if (fs != null) Console.WriteLine(fs.Name);
}
catch
{
}
}
}
Now you can save your stream.
public void SaveStreamToFile(string fileFullPath, Stream stream)
{
if (stream.Length == 0) return;
// Create a FileStream object to write a stream to a file
using (FileStream fileStream = System.IO.File.Create(fileFullPath, (int)stream.Length))
{
// Fill the bytes[] array with the stream data
byte[] bytesInStream = new byte[stream.Length];
stream.Read(bytesInStream, 0, (int)bytesInStream.Length);
// Use FileStream object to write to the specified file
fileStream.Write(bytesInStream, 0, bytesInStream.Length);
}
}
My name's Lucas and I'm learning about WPF/C#.
I would like to join several images in a single file, as well as a game that use multiple textures that are all in one file, but I have no idea how to do. Could anyone help me at least to think how to do?
//Convert Image to Byte[]
public byte[] getByteFromImage()
{
byte[] imageArray = File.ReadAllBytes(op.FileName);
return imageArray;
}
//Convert Byte[] to Image
public void getImageFromByte()
{
FileStream f = new FileStream("escudos.bcf", FileMode.Open);
BinaryReader b = new BinaryReader(f);
Int64 c = f.Length+1;
MemoryStream ms = new MemoryStream(b.ReadBytes(int.Parse(c.ToString())));
Image image = new Image();
image.Source = BitmapFrame.Create(ms, BitmapCreateOptions.None,
BitmapCacheOption.OnLoad);
imgPatch2.Source = image.Source;
f.Dispose();
}
//Create Binary File
public void save(byte[] img)
{
FileStream f;
if (!File.Exists("escudos.bcf"))
{
f = new FileStream("escudos.bcf", FileMode.Create);
}
else
{
f = new FileStream("escudos.bcf", FileMode.Append);
}
BinaryWriter b = new BinaryWriter(f);
b.Write(img);
b.Close();
f.Dispose();
}
I thought in doing so, create a file and store it in binary images.
Until I got that part, but as this file will have multiple images in binary, I do not know how to pick just one binary image.
public void xmlCreate(string name, Int64 ini, Int64 fin)
{
if (!File.Exists("Escudos.xml"))
{
XmlTextWriter w = new XmlTextWriter("Escudos.xml", System.Text.Encoding.UTF8);
w.Formatting = Formatting.Indented;
w.WriteStartDocument();
w.WriteStartElement("Time");
w.WriteStartElement(name);
w.WriteElementString("Inicio", ini.ToString());
w.WriteElementString("Fim", fin.ToString());
w.WriteEndElement();
w.WriteEndDocument();
w.Close();
}
else
{
XDocument doc = XDocument.Load("Escudos.xml");
doc.Root.Add(new XElement(name));
doc.Root.Element(name).Add(new XElement("Inicio", ini.ToString()));
doc.Root.Element(name).Add(new XElement("Fim", fin.ToString()));
doc.Save("Escudos.xml");
}
}
Now I have created an xml file to store the start and end of the bytes. I can add only when I create a new xml file, I can not get a xml created and add new bytes. When I go to load the xml file gives an error message.
" An unhandled exception of type 'System.Xml.XmlException' occurred in System.Xml.dll
Additional information: '>' is an unexpected token. The expected token is '='. Line 3, position 15. "
UPDATE
when I'm reading the bytes to form an image, always the same way, even I adding different images. I'll add code below
//Add Image
private void btAddImage_Click(object sender, RoutedEventArgs e)
{
OpenFileDialog op = new OpenFileDialog();
op.Title = "Selecione a Imagem";
op.Filter = "All supported graphics|*.jpg;*.jpeg;*.png|" +
"JPEG (*.jpg;*.jpeg)|*.jpg;*.jpeg|" +
"Portable Network Graphic (*.png)|*.png";
if (op.ShowDialog() == true)
{
imgPatch.Source = new BitmapImage(new Uri(op.FileName));
txtName.Focus();
}
}
//Convert Image
private void btConvertImage_Click(object sender, RoutedEventArgs e)
{
if (String.IsNullOrEmpty(txtName.Text))
{
txtName.Focus();
MessageBox.Show("Preencha o Nome", "Error");
}
else
{
save(ConvertFileToByteArray(op.FileName), txtName.Text);
}
}
//Image to Byte Array
private static byte[] ConvertFileToByteArray(String FilePath)
{
return File.ReadAllBytes(FilePath);
}
//Save Binary File and XML File
public void save(byte[] img, string nome)
{
FileStream f;
long ini, fin = img.Length;
if (!File.Exists("Escudos.bcf"))
{
f = new FileStream("Escudos.bcf", FileMode.Create);
ini = 0;
}
else
{
f = new FileStream("Escudos.bcf", FileMode.Append);
ini = f.Length + 1;
bin = new TestBinarySegment();
}
bin.LoadAddSave("Escudos.xml", "Brasileiro", nome, ini, fin);
BinaryWriter b = new BinaryWriter(f);
b.Write(img);
b.Close();
f.Dispose();
}
//Load Image from Byte
private void btLoad_Click(object sender, RoutedEventArgs e)
{
getImageFromByte();
}
//Byte to Image
public void getImageFromByte(int start, int length)
{
using (FileStream fs = new FileStream("Escudos.bcf", FileMode.Open))
{
byte[] iba = new byte[fs.Length+1];
fs.Read(iba, start, length);
Image image = new Image();
image.Source = BitmapFrame.Create(fs, BitmapCreateOptions.None,
BitmapCacheOption.OnLoad);
imgPatch2.Source = image.Source;
}
}
Thanks
You have a binary file where its contents are segments and each segment contains the binary information needed to create an image. You need to store the starting index and length of each segment so that you can retrieve it. One way to do this is with an xml file.
To begin, create a container class for the segments. It looks like this...
public class BinarySegment
{
private const string FileName = "SegmentData.xml";
private static XmlSerializer serializer = new XmlSerializer(typeof(List<BinarySegment>));
public string SegmentName { get; set; }
public long SegmentStartIndex { get; set; }
public long SegmentLength { get; set; }
public static List<BinarySegment> LoadFromFile()
{
if (!File.Exists(FileName))
{
throw new Exception("File must be created first");
}
try
{
using (StreamReader sr = new StreamReader(FileName))
{
return serializer.Deserialize(sr) as List<BinarySegment>;
}
}
catch
{
throw new Exception("File as become corrupted");
}
}
public static void Save(List<BinarySegment> list)
{
try
{
using (StreamWriter sw = new StreamWriter(FileName))
{
serializer.Serialize(sw, list);
}
}
catch
{
throw;
}
}
}
There is one instance of this class for each image in your binary file. It will read/write a List of BinarySegments.
To test the class, create a test class like this...
public class TestBinarySegment
{
public TestBinarySegment()
{
List<BinarySegment> myBinarySegments = new List<BinarySegment>();
myBinarySegments.Add(new BinarySegment{SegmentName = "Segment1", SegmentStartIndex = 0, SegmentLength = 1111});
myBinarySegments.Add(new BinarySegment { SegmentName = "Segment2", SegmentStartIndex = 1111, SegmentLength = 1111 });
myBinarySegments.Add(new BinarySegment { SegmentName = "Segment3", SegmentStartIndex = 2222, SegmentLength = 1111 });
BinarySegment.Save(myBinarySegments);
}
public void LoadAddSave()
{
List<BinarySegment> myBinarySegments = BinarySegment.LoadFromFile();
myBinarySegments.Add(new BinarySegment { SegmentName = "Segment4", SegmentStartIndex = 333330, SegmentLength = 1111 });
BinarySegment.Save(myBinarySegments);
}
}
This class shows how to create the list and to save it. It also shows how to add new segments and resave the xml file.
When the test is run, you get a file like this...
<?xml version="1.0" encoding="utf-8"?>
<ArrayOfBinarySegment xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema">
<BinarySegment>
<SegmentName>Segment1</SegmentName>
<SegmentStartIndex>0</SegmentStartIndex>
<SegmentLength>1111</SegmentLength>
</BinarySegment>
<BinarySegment>
<SegmentName>Segment2</SegmentName>
<SegmentStartIndex>1111</SegmentStartIndex>
<SegmentLength>1111</SegmentLength>
</BinarySegment>
<BinarySegment>
<SegmentName>Segment3</SegmentName>
<SegmentStartIndex>2222</SegmentStartIndex>
<SegmentLength>1111</SegmentLength>
</BinarySegment>
<BinarySegment>
<SegmentName>Segment4</SegmentName>
<SegmentStartIndex>333330</SegmentStartIndex>
<SegmentLength>1111</SegmentLength>
</BinarySegment>
</ArrayOfBinarySegment>
To run the test, use code like this...
TestBinarySegment test = new TestBinarySegment();
test.LoadAddSave();
This shows how to use xml serialization to create a file and add new segments to it. You will need to test this concept and then integrate it into your project.
I am trying to extract all the images from a pdf using itextsharp but can't seem to overcome this one hurdle.
The error occures on the line System.Drawing.Image ImgPDF = System.Drawing.Image.FromStream(MS); giving an error of "Parameter is not valid".
I think it works when the image is a bitmap but not of any other format.
I have this following code - sorry for the length;
private void Form1_Load(object sender, EventArgs e)
{
FileStream fs = File.OpenRead(#"reader.pdf");
byte[] data = new byte[fs.Length];
fs.Read(data, 0, (int)fs.Length);
List<System.Drawing.Image> ImgList = new List<System.Drawing.Image>();
iTextSharp.text.pdf.RandomAccessFileOrArray RAFObj = null;
iTextSharp.text.pdf.PdfReader PDFReaderObj = null;
iTextSharp.text.pdf.PdfObject PDFObj = null;
iTextSharp.text.pdf.PdfStream PDFStremObj = null;
try
{
RAFObj = new iTextSharp.text.pdf.RandomAccessFileOrArray(data);
PDFReaderObj = new iTextSharp.text.pdf.PdfReader(RAFObj, null);
for (int i = 0; i <= PDFReaderObj.XrefSize - 1; i++)
{
PDFObj = PDFReaderObj.GetPdfObject(i);
if ((PDFObj != null) && PDFObj.IsStream())
{
PDFStremObj = (iTextSharp.text.pdf.PdfStream)PDFObj;
iTextSharp.text.pdf.PdfObject subtype = PDFStremObj.Get(iTextSharp.text.pdf.PdfName.SUBTYPE);
if ((subtype != null) && subtype.ToString() == iTextSharp.text.pdf.PdfName.IMAGE.ToString())
{
byte[] bytes = iTextSharp.text.pdf.PdfReader.GetStreamBytesRaw((iTextSharp.text.pdf.PRStream)PDFStremObj);
if ((bytes != null))
{
try
{
System.IO.MemoryStream MS = new System.IO.MemoryStream(bytes);
MS.Position = 0;
System.Drawing.Image ImgPDF = System.Drawing.Image.FromStream(MS);
ImgList.Add(ImgPDF);
}
catch (Exception)
{
}
}
}
}
}
PDFReaderObj.Close();
}
catch (Exception ex)
{
throw new Exception(ex.Message);
}
} //Form1_Load
Resolved...
Even I got the same exception of "Parameter is not valid" and after so much of
work with the help of the link provided by der_chirurg
(http://kuujinbo.info/iTextSharp/CCITTFaxDecodeExtract.aspx ) I resolved it
and following is the code:
using System.Drawing;
using System.Drawing.Imaging;
using System.IO;
using iTextSharp.text.pdf.parser;
using Dotnet = System.Drawing.Image;
using iTextSharp.text.pdf;
namespace PDF_Parsing
{
partial class PDF_ImgExtraction
{
string imgPath;
private void ExtractImage(string pdfFile)
{
PdfReader pdfReader = new PdfReader(files[fileIndex]);
for (int pageNumber = 1; pageNumber <= pdfReader.NumberOfPages; pageNumber++)
{
PdfReader pdf = new PdfReader(pdfFile);
PdfDictionary pg = pdf.GetPageN(pageNumber);
PdfDictionary res = (PdfDictionary)PdfReader.GetPdfObject(pg.Get(PdfName.RESOURCES));
PdfDictionary xobj = (PdfDictionary)PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT));
foreach (PdfName name in xobj.Keys)
{
PdfObject obj = xobj.Get(name);
if (obj.IsIndirect())
{
PdfDictionary tg = (PdfDictionary)PdfReader.GetPdfObject(obj);
string width = tg.Get(PdfName.WIDTH).ToString();
string height = tg.Get(PdfName.HEIGHT).ToString();
ImageRenderInfo imgRI = ImageRenderInfo.CreateForXObject(new Matrix(float.Parse(width), float.Parse(height)), (PRIndirectReference)obj, tg);
RenderImage(imgRI);
}
}
}
}
private void RenderImage(ImageRenderInfo renderInfo)
{
PdfImageObject image = renderInfo.GetImage();
using (Dotnet dotnetImg = image.GetDrawingImage())
{
if (dotnetImg != null)
{
using (MemoryStream ms = new MemoryStream())
{
dotnetImg.Save(ms, ImageFormat.Tiff);
Bitmap d = new Bitmap(dotnetImg);
d.Save(imgPath);
}
}
}
}
}
}
You need to check the stream's /Filter to see what image format a given image uses. It may be a standard image format:
DCTDecode (jpeg)
JPXDecode (jpeg 2000)
JBIG2Decode (jbig is a B&W only format)
CCITTFaxDecode (fax format, PDF supports group 3 and 4)
Other than that, you'll need to get the raw bytes (as you are), and build an image using the image stream's width, height, bits per component, number of color components (could be CMYK, indexed, RGB, or Something Weird), and a few others, as defined in section 8.9 of the ISO PDF SPECIFICATION (available for free).
So in some cases your code will work, but in others, it'll fail with the exception you mentioned.
PS: When you have an exception, PLEASE include the stack trace every single time. Pretty please with sugar on top?
Works for me like this, using these two methods:
public static List<System.Drawing.Image> ExtractImagesFromPDF(byte[] bytes)
{
var imgs = new List<System.Drawing.Image>();
var pdf = new PdfReader(bytes);
try
{
for (int pageNumber = 1; pageNumber <= pdf.NumberOfPages; pageNumber++)
{
PdfDictionary pg = pdf.GetPageN(pageNumber);
List<PdfObject> objs = FindImageInPDFDictionary(pg);
foreach (var obj in objs)
{
if (obj != null)
{
int XrefIndex = Convert.ToInt32(((PRIndirectReference)obj).Number.ToString(System.Globalization.CultureInfo.InvariantCulture));
PdfObject pdfObj = pdf.GetPdfObject(XrefIndex);
PdfStream pdfStrem = (PdfStream)pdfObj;
var pdfImage = new PdfImageObject((PRStream)pdfStrem);
var img = pdfImage.GetDrawingImage();
imgs.Add(img);
}
}
}
}
finally
{
pdf.Close();
}
return imgs;
}
private static List<PdfObject> FindImageInPDFDictionary(PdfDictionary pg)
{
var res = (PdfDictionary)PdfReader.GetPdfObject(pg.Get(PdfName.RESOURCES));
var xobj = (PdfDictionary)PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT));
var pdfObgs = new List<PdfObject>();
if (xobj != null)
{
foreach (PdfName name in xobj.Keys)
{
PdfObject obj = xobj.Get(name);
if (obj.IsIndirect())
{
var tg = (PdfDictionary)PdfReader.GetPdfObject(obj);
var type = (PdfName)PdfReader.GetPdfObject(tg.Get(PdfName.SUBTYPE));
if (PdfName.IMAGE.Equals(type)) // image at the root of the pdf
{
pdfObgs.Add(obj);
}
else if (PdfName.FORM.Equals(type)) // image inside a form
{
FindImageInPDFDictionary(tg).ForEach(o => pdfObgs.Add(o));
}
else if (PdfName.GROUP.Equals(type)) // image inside a group
{
FindImageInPDFDictionary(tg).ForEach(o => pdfObgs.Add(o));
}
}
}
}
return pdfObgs;
}
In newer version of iTextSharp, the 1st parameter of ImageRenderInfo.CreateForXObject is not Matrix anymore but GraphicsState. #der_chirurg's approach should work. I tested myself with the information from the following link and it worked beautifully:
http://www.thevalvepage.com/swmonkey/2014/11/26/extract-images-from-pdf-files-using-itextsharp/
To extract all Images on all Pages, it is not necessary to implement different filters. iTextSharp has an Image Renderer, which saves all Images in their original image type.
Just do the following found here: http://kuujinbo.info/iTextSharp/CCITTFaxDecodeExtract.aspx You don't need to implement HttpHandler...
I added library on github which, extract images in PDF and compress them.
Could be useful, when you are going to start play with very powerful library ITextSharp.
Here the link: https://github.com/rock-walker/PdfCompression
This works for me and I think it's a simple solution:
Write a custom RenderListener and implement its RenderImage method, something like this
public void RenderImage(ImageRenderInfo info)
{
PdfImageObject image = info.GetImage();
Parser.Matrix matrix = info.GetImageCTM();
var fileType = image.GetFileType();
ImageFormat format;
switch (fileType)
{//you may add more types here
case "jpg":
case "jpeg":
format = ImageFormat.Jpeg;
break;
case "pnt":
format = ImageFormat.Png;
break;
case "bmp":
format = ImageFormat.Bmp;
break;
case "tiff":
format = ImageFormat.Tiff;
break;
case "gif":
format = ImageFormat.Gif;
break;
default:
format = ImageFormat.Jpeg;
break;
}
var pic = image.GetDrawingImage();
var x = matrix[Parser.Matrix.I31];
var y = matrix[Parser.Matrix.I32];
var width = matrix[Parser.Matrix.I11];
var height = matrix[Parser.Matrix.I22];
if (x < <some value> && y < <some value>)
{
return;//ignore these images
}
pic.Save(<path and name>, format);
}
I have used this library in the past without any problems.
http://www.winnovative-software.com/PdfImgExtractor.aspx
private void btnExtractImages_Click(object sender, EventArgs e)
{
if (pdfFileTextBox.Text.Trim().Equals(String.Empty))
{
MessageBox.Show("Please choose a source PDF file", "Choose PDF file", MessageBoxButtons.OK);
return;
}
// the source pdf file
string pdfFileName = pdfFileTextBox.Text.Trim();
// start page number
int startPageNumber = int.Parse(textBoxStartPage.Text.Trim());
// end page number
// when it is 0 the extraction will continue up to the end of document
int endPageNumber = 0;
if (textBoxEndPage.Text.Trim() != String.Empty)
endPageNumber = int.Parse(textBoxEndPage.Text.Trim());
// create the PDF images extractor object
PdfImagesExtractor pdfImagesExtractor = new PdfImagesExtractor();
pdfImagesExtractor.LicenseKey = "31FAUEJHUEBQRl5AUENBXkFCXklJSUlQQA==";
// the demo output directory
string outputDirectory = Path.Combine(Application.StartupPath, #"DemoFiles\Output");
Cursor = Cursors.WaitCursor;
// set the handler to be called when an image was extracted
pdfImagesExtractor.ImageExtractedEvent += pdfImagesExtractor_ImageExtractedEvent;
try
{
// start images counting
imageIndex = 0;
// call the images extractor to raise the ImageExtractedEvent event when an images is extracted from a PDF page
// the pdfImagesExtractor_ImageExtractedEvent handler below will be executed for each extracted image
pdfImagesExtractor.ExtractImagesInEvent(pdfFileName, startPageNumber, endPageNumber);
// Alternatively you can use the ExtractImages() and ExtractImagesToFile() methods
// to extracted the images from a PDF document in memory or to image files in a directory
// uncomment the line below to extract the images to an array of ExtractedImage objects
//ExtractedImage[] pdfPageImages = pdfImagesExtractor.ExtractImages(pdfFileName, startPageNumber, endPageNumber);
// uncomment the lines below to extract the images to image files in a directory
//string outputDirectory = System.IO.Path.Combine(Application.StartupPath, #"DemoFiles\Output");
//pdfImagesExtractor.ExtractImagesToFile(pdfFileName, startPageNumber, endPageNumber, outputDirectory, "pdfimage");
}
catch (Exception ex)
{
// The extraction failed
MessageBox.Show(String.Format("An error occurred. {0}", ex.Message), "Error");
return;
}
finally
{
// uninstall the event handler
pdfImagesExtractor.ImageExtractedEvent -= pdfImagesExtractor_ImageExtractedEvent;
Cursor = Cursors.Arrow;
}
try
{
System.Diagnostics.Process.Start(outputDirectory);
}
catch (Exception ex)
{
MessageBox.Show(string.Format("Cannot open output folder. {0}", ex.Message));
return;
}
}
/// <summary>
/// The ImageExtractedEvent event handler called after an image was extracted from a PDF page.
/// The event is raised when the ExtractImagesInEvent() method is used
/// </summary>
/// <param name="args">The handler argument containing the extracted image and the PDF page number</param>
void pdfImagesExtractor_ImageExtractedEvent(ImageExtractedEventArgs args)
{
// get the image object and page number from even handler argument
Image pdfPageImageObj = args.ExtractedImage.ImageObject;
int pageNumber = args.ExtractedImage.PageNumber;
// save the extracted image to a PNG file
string outputPageImage = Path.Combine(Application.StartupPath, #"DemoFiles\Output",
"pdfimage_" + pageNumber.ToString() + "_" + imageIndex++ + ".png");
pdfPageImageObj.Save(outputPageImage, ImageFormat.Png);
args.ExtractedImage.Dispose();
}