Related
This question already has answers here:
What is recursion and when should I use it?
(40 answers)
Closed 4 years ago.
I coming back to C# programming after some years of HTML/ASP.
I came across these lines and cannot find what it does.
It is a method in a class:
private string PeekNext()
{
if (pos < 0)
// pos < 0 indicates that there are no more tokens
return null;
if (pos < tokens.Length)
{
if (tokens[pos].Length == 0)
{
++pos;
return PeekNext();
}
return tokens[pos];
}
string line = reader.ReadLine();
if (line == null)
{
// There is no more data to read
pos = -1;
return null;
}
// Split the line that was read on white space characters
tokens = line.Split(null);
pos = 0;
return PeekNext();
}
Is it calling itself until some of the other Returns happens?
What is happening here, never seen a method returning itself!?
What is Returned, empty string or what...?
Or maybe I just missed it before.
Maybe simple but puzzles me.
private string PeekNext()
{
if (pos < 0)
// pos < 0 indicates that there are no more tokens
return null;
if (pos < tokens.Length)
{
if (tokens[pos].Length == 0)
{
++pos;
return PeekNext();
}
return tokens[pos];
}
string line = reader.ReadLine();
if (line == null)
{
// There is no more data to read
pos = -1;
return null;
}
// Split the line that was read on white space characters
tokens = line.Split(null);
pos = 0;
return PeekNext();
Notwithstanding the fact that the method depends on external (class) variables, and should probably be refactored to take its dependencies as parameters, a non-recursive version could look as follows:
private string PeekNext()
{
while (pos >= 0)
{
if (pos < tokens.Length)
{
if (tokens[pos].Length == 0)
{
++pos;
continue;
}
return tokens[pos];
}
string line = reader.ReadLine();
if (line == null)
{
// There is no more data to read
pos = -1;
return null;
}
// Split the line that was read on white space characters
tokens = line.Split(null);
pos = 0;
}
// pos < 0 indicates that there are no more tokens
return null;
}
I need to extract some data from a PDF file.
I'm using the iTextSharp to do that.
I'm using this code which I founded on the net:
using System;
using System.IO;
using iTextSharp.text.pdf;
namespace PdfToText
{
/// <summary>
/// Parses a PDF file and extracts the text from it.
/// </summary>
public class PDFParser
{
/// BT = Beginning of a text object operator
/// ET = End of a text object operator
/// Td move to the start of next line
/// 5 Ts = superscript
/// -5 Ts = subscript
#region Fields
#region _numberOfCharsToKeep
/// <summary>
/// The number of characters to keep, when extracting text.
/// </summary>
private static int _numberOfCharsToKeep = 15;
#endregion
#endregion
#region ExtractText
/// <summary>
/// Extracts a text from a PDF file.
/// </summary>
/// <param name="inFileName">the full path to the pdf file.</param>
/// <param name="outFileName">the output file name.</param>
/// <returns>the extracted text</returns>
public bool ExtractText(string inFileName, string outFileName)
{
StreamWriter outFile = null;
try
{
outFileName = String.Empty;
outFileName = Path.GetDirectoryName(System.AppDomain.CurrentDomain.BaseDirectory);
//string currentDirectory = Directory.GetCurrentDirectory();
//string filePath = System.IO.Path.Combine(currentDirectory, "Data", "myfile.txt");
// extract the text
//string test = "";
outFileName += #"\test.txt";
// Create a reader for the given PDF file
PdfReader reader = new PdfReader(inFileName);
//outFile = File.CreateText(outFileName);
outFile = new StreamWriter(outFileName, true, System.Text.Encoding.UTF8);
Console.Write("Processing: ");
int totalLen = 68;
float charUnit = ((float)totalLen) / (float)reader.NumberOfPages;
int totalWritten = 0;
float curUnit = 0;
for (int page = 1; page <= reader.NumberOfPages; page++)
{
outFile.Write(ExtractTextFromPDFBytes(reader.GetPageContent(page)) + " ");
// Write the progress.
if (charUnit >= 1.0f)
{
for (int i = 0; i < (int)charUnit; i++)
{
Console.Write("#");
totalWritten++;
}
}
else
{
curUnit += charUnit;
if (curUnit >= 1.0f)
{
for (int i = 0; i < (int)curUnit; i++)
{
Console.Write("#");
totalWritten++;
}
curUnit = 0;
}
}
}
if (totalWritten < totalLen)
{
for (int i = 0; i < (totalLen - totalWritten); i++)
{
Console.Write("#");
}
}
return true;
}
catch(Exception ex)
{
return false;
}
finally
{
if (outFile != null) outFile.Close();
}
}
#endregion
#region ExtractTextFromPDFBytes
/// <summary>
/// This method processes an uncompressed Adobe (text) object
/// and extracts text.
/// </summary>
/// <param name="input">uncompressed</param>
/// <returns></returns>
private string ExtractTextFromPDFBytes(byte[] input)
{
if (input == null || input.Length == 0) return "";
try
{
string resultString = "";
// Flag showing if we are we currently inside a text object
bool inTextObject = false;
// Flag showing if the next character is literal
// e.g. '\\' to get a '\' character or '\(' to get '('
bool nextLiteral = false;
// () Bracket nesting level. Text appears inside ()
int bracketDepth = 0;
// Keep previous chars to get extract numbers etc.:
char[] previousCharacters = new char[_numberOfCharsToKeep];
for (int j = 0; j < _numberOfCharsToKeep; j++) previousCharacters[j] = ' ';
for (int i = 0; i < input.Length; i++)
{
char c = (char)input[i];
if (inTextObject)
{
// Position the text
if (bracketDepth == 0)
{
if (CheckToken(new string[] { "TD", "Td" }, previousCharacters))
{
resultString += "\n\r";
}
else
{
if (CheckToken(new string[] { "'", "T*", "\"" }, previousCharacters))
{
resultString += "\n";
}
else
{
if (CheckToken(new string[] { "Tj" }, previousCharacters))
{
resultString += " ";
}
}
}
}
// End of a text object, also go to a new line.
if (bracketDepth == 0 &&
CheckToken(new string[] { "ET" }, previousCharacters))
{
inTextObject = false;
resultString += " ";
}
else
{
// Start outputting text
if ((c == '(') && (bracketDepth == 0) && (!nextLiteral))
{
bracketDepth = 1;
}
else
{
// Stop outputting text
if ((c == ')') && (bracketDepth == 1) && (!nextLiteral))
{
bracketDepth = 0;
}
else
{
// Just a normal text character:
if (bracketDepth == 1)
{
// Only print out next character no matter what.
// Do not interpret.
if (c == '\\' && !nextLiteral)
{
nextLiteral = true;
}
else
{
if (((c >= ' ') && (c <= '~')) ||
((c >= 128) && (c < 255)))
{
resultString += c.ToString();
}
nextLiteral = false;
}
}
}
}
}
}
// Store the recent characters for
// when we have to go back for a checking
for (int j = 0; j < _numberOfCharsToKeep - 1; j++)
{
previousCharacters[j] = previousCharacters[j + 1];
}
previousCharacters[_numberOfCharsToKeep - 1] = c;
// Start of a text object
if (!inTextObject && CheckToken(new string[] { "BT" }, previousCharacters))
{
inTextObject = true;
}
}
return resultString;
}
catch
{
return "";
}
}
#endregion
#region CheckToken
/// <summary>
/// Check if a certain 2 character token just came along (e.g. BT)
/// </summary>
/// <param name="search">the searched token</param>
/// <param name="recent">the recent character array</param>
/// <returns></returns>
private bool CheckToken(string[] tokens, char[] recent)
{
foreach (string token in tokens)
{
if ((recent[_numberOfCharsToKeep - 3] == token[0]) &&
(recent[_numberOfCharsToKeep - 2] == token[1]) &&
((recent[_numberOfCharsToKeep - 1] == ' ') ||
(recent[_numberOfCharsToKeep - 1] == 0x0d) ||
(recent[_numberOfCharsToKeep - 1] == 0x0a)) &&
((recent[_numberOfCharsToKeep - 4] == ' ') ||
(recent[_numberOfCharsToKeep - 4] == 0x0d) ||
(recent[_numberOfCharsToKeep - 4] == 0x0a))
)
{
return true;
}
}
return false;
}
#endregion
}
}
I'm using this way:
PDFParser pdfParser = new PDFParser();
pdfParser.ExtractText(pdfFile,Path.GetFileNameWithoutExtension(pdfFile) + ".txt");
So the pdf content is written in a txt file.
It works good for certain pdf-s, but for a pdf file that I really need to use, the txt file remains always empty. I didn't get errors, but for some reason it's not writing anything, although as you can see in this screenshot it recognize the pdf,that it has 2 pages...
This is the pdf that I need but the txt always remains empty.(the black lines are added by me, so there are not present when I want to write in the txt)
And this is another pdf. For this the program works ok, and it is written is a txt file. It is much bigger than the other pdf, and still for this I can extract the texts and for the other I can't.
Do you have any idea what can be the problem?
Too long for comment and maybe an answer that you do not like to get:
In PDFs the "Text your see" aka how does a font look and "What the glyphs mean" aka what glyph is mapped to which utf8-letter are separate things.
They are stored in different parts of the pdf - it is utterly possible that a pdf looks totally fine, but if you try to extract text it will give you nothing from it because it only contains the shape of your textglyphs but not theire "meaning".
Try to open the pdf and Select + Copy the text you are after, if you paste that into an editor and noting is there, your pdf lacks the information "what utf8-letter is displayed by this glyph".
OR:
It also might be that your pdf only containts the image of a text - a photo so to say. You can read it, iTextSharp sees only a "picture" - no text.
Those are possible 'why's that would answer your question. As to how to fix it:
There are several questions about corrupt PDFs on SO:
How to repair a PDF file and embed missing fonts
Embedded fonts in PDF: copy and paste problems (this answer)
Copy and Paste relates to text parsing, so the might help you out on how to fix it.
Your edit shows details about your parsing, why don't you leverage iTextSharp for that?
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
public static string ExtractTextFromPdf(string path)
{
using (PdfReader reader = new PdfReader(path))
{
StringBuilder text = new StringBuilder();
for (int i = 1; i <= reader.NumberOfPages; i++)
{
text.Append(PdfTextExtractor.GetTextFromPage(reader, i));
}
return text.ToString();
}
from: http://www.squarepdf.net/parsing-pdf-files-using-itextsharp
or like here: parse-pdf-with-itextsharp-and-then-extract-specific-text-to-the-screen ?
I am trying to count the number of rows in a text file (to compare to a control file) before performing a complex SSIS insert package.
Currently I am using a StreamReader and it is breaking a line with a {LF} embedded into a new line, whereas SSIS is using {CR}{LF} (correctly), so the counts are not tallying up.
Does anyone know an alternate method of doing this where I can count the number of lines in the file based on {CR}{LF} Line breaks only?
Thanks in advance
Iterate through the file and count number of CRLFs.
Pretty straightforward implementation:
public int CountLines(Stream stream, Encoding encoding)
{
int cur, prev = -1, lines = 0;
using (var sr = new StreamReader(stream, encoding, false, 4096, true))
{
while ((cur = sr.Read()) != -1)
{
if (prev == '\r' && cur == '\n')
lines++;
prev = cur;
}
}
//Empty stream will result in 0 lines, any content would result in at least one line
if (prev != -1)
lines++;
return lines;
}
Example usage:
using(var s = File.OpenRead(#"<your_file_path>"))
Console.WriteLine("Found {0} lines", CountLines(s, Encoding.Default));
Actually it's a find substring in string task. More generic algorithms can be used.
{CR}{LF} is the desired. Can't really say which is correct.
Since ReadLine strips off the end of line you don't know
Use StreamReader.Read Method () and look for 13 followed by 10
It return Int
Here's a pretty lazy way... this will read the entire file into memory.
var cnt = File.ReadAllText("yourfile.txt")
.Split(new[] { "\r\n" }, StringSplitOptions.None)
.Length;
Here is an extension-method that reads the lines with line-seperator {Cr}{Lf} only, and not {LF}. You could do a count on it.
var count= new StreamReader(#"D:\Test.txt").ReadLinesCrLf().Count()
But could also use it for reading files, sometimes usefull since the normal StreamReader.ReadLine breaks on both {Cr}{Lf} and {LF}. Can be used on any TextReader and works streaming (file size is not an issue).
public static IEnumerable<string> ReadLinesCrLf(this TextReader reader, int bufferSize = 4096)
{
StringBuilder lineBuffer = null;
//read buffer
char[] buffer = new char[bufferSize];
int charsRead;
var previousIsLf = false;
while ((charsRead = reader.Read(buffer, 0, bufferSize)) != 0)
{
int bufferIndex = 0;
int writeIdx = 0;
do
{
var currentChar = buffer[bufferIndex];
switch (currentChar)
{
case '\n':
if (previousIsLf)
{
if (lineBuffer == null)
{
//return from current buffer writeIdx could be higher than 0 when multiple rows are in the buffer
yield return new string(buffer, writeIdx, bufferIndex - writeIdx - 1);
//shift write index to next character that will be read
writeIdx = bufferIndex + 1;
}
else
{
Debug.Assert(writeIdx == 0, $"Write index should be 0, when linebuffer != null");
lineBuffer.Append(buffer, writeIdx, bufferIndex - writeIdx);
Debug.Assert(lineBuffer.ToString().Last() == '\r',$"Last character in linebuffer should be a carriage return now");
lineBuffer.Length--;
//shift write index to next character that will be read
writeIdx = bufferIndex + 1;
yield return lineBuffer.ToString();
lineBuffer = null;
}
}
previousIsLf = false;
break;
case '\r':
previousIsLf = true;
break;
default:
previousIsLf = false;
break;
}
bufferIndex++;
} while (bufferIndex < charsRead);
if (writeIdx < bufferIndex)
{
if (lineBuffer == null) lineBuffer = new StringBuilder();
lineBuffer.Append(buffer, writeIdx, bufferIndex - writeIdx);
}
}
//return last row
if (lineBuffer != null && lineBuffer.Length > 0) yield return lineBuffer.ToString();
}
This question already has answers here:
Get last 10 lines of very large text file > 10GB
(21 answers)
Closed 1 year ago.
need a snippet of code which would read out last "n lines" of a log file. I came up with the following code from the net.I am kinda new to C sharp. Since the log file might be
quite large, I want to avoid overhead of reading the entire file.Can someone suggest any performance enhancement. I do not really want to read each character and change position.
var reader = new StreamReader(filePath, Encoding.ASCII);
reader.BaseStream.Seek(0, SeekOrigin.End);
var count = 0;
while (count <= tailCount)
{
if (reader.BaseStream.Position <= 0) break;
reader.BaseStream.Position--;
int c = reader.Read();
if (reader.BaseStream.Position <= 0) break;
reader.BaseStream.Position--;
if (c == '\n')
{
++count;
}
}
var str = reader.ReadToEnd();
Your code will perform very poorly, since you aren't allowing any caching to happen.
In addition, it will not work at all for Unicode.
I wrote the following implementation:
///<summary>Returns the end of a text reader.</summary>
///<param name="reader">The reader to read from.</param>
///<param name="lineCount">The number of lines to return.</param>
///<returns>The last lneCount lines from the reader.</returns>
public static string[] Tail(this TextReader reader, int lineCount) {
var buffer = new List<string>(lineCount);
string line;
for (int i = 0; i < lineCount; i++) {
line = reader.ReadLine();
if (line == null) return buffer.ToArray();
buffer.Add(line);
}
int lastLine = lineCount - 1; //The index of the last line read from the buffer. Everything > this index was read earlier than everything <= this indes
while (null != (line = reader.ReadLine())) {
lastLine++;
if (lastLine == lineCount) lastLine = 0;
buffer[lastLine] = line;
}
if (lastLine == lineCount - 1) return buffer.ToArray();
var retVal = new string[lineCount];
buffer.CopyTo(lastLine + 1, retVal, 0, lineCount - lastLine - 1);
buffer.CopyTo(0, retVal, lineCount - lastLine - 1, lastLine + 1);
return retVal;
}
Had trouble with your code. This is my version. Since its' a log file, something might be writing to it, so it's best making sure you're not locking it.
You go to the end. Start reading backwards until you reach n lines. Then read everything from there on.
int n = 5; //or any arbitrary number
int count = 0;
string content;
byte[] buffer = new byte[1];
using (FileStream fs = new FileStream("text.txt", FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
{
// read to the end.
fs.Seek(0, SeekOrigin.End);
// read backwards 'n' lines
while (count < n)
{
fs.Seek(-1, SeekOrigin.Current);
fs.Read(buffer, 0, 1);
if (buffer[0] == '\n')
{
count++;
}
fs.Seek(-1, SeekOrigin.Current); // fs.Read(...) advances the position, so we need to go back again
}
fs.Seek(1, SeekOrigin.Current); // go past the last '\n'
// read the last n lines
using (StreamReader sr = new StreamReader(fs))
{
content = sr.ReadToEnd();
}
}
A friend of mine uses this method (BackwardReader can be found here):
public static IList<string> GetLogTail(string logname, string numrows)
{
int lineCnt = 1;
List<string> lines = new List<string>();
int maxLines;
if (!int.TryParse(numrows, out maxLines))
{
maxLines = 100;
}
string logFile = HttpContext.Current.Server.MapPath("~/" + logname);
BackwardReader br = new BackwardReader(logFile);
while (!br.SOF)
{
string line = br.Readline();
lines.Add(line + System.Environment.NewLine);
if (lineCnt == maxLines) break;
lineCnt++;
}
lines.Reverse();
return lines;
}
Does your log have lines of similar length? If yes, then you can calculate average length of the line, then do the following:
seek to end_of_file - lines_needed*avg_line_length (previous_point)
read everything up to the end
if you grabbed enough lines, that's fine. If no, seek to previous_point - lines_needed*avg_line_length
read everything up to previous_point
goto 3
memory-mapped file is also a good method -- map the tail of file, calculate lines, map the previous block, calculate lines etc. until you get the number of lines needed
Here is my answer:-
private string StatisticsFile = #"c:\yourfilename.txt";
// Read last lines of a file....
public IList<string> ReadLastLines(int nFromLine, int nNoLines, out bool bMore)
{
// Initialise more
bMore = false;
try
{
char[] buffer = null;
//lock (strMessages) Lock something if you need to....
{
if (File.Exists(StatisticsFile))
{
// Open file
using (StreamReader sr = new StreamReader(StatisticsFile))
{
long FileLength = sr.BaseStream.Length;
int c, linescount = 0;
long pos = FileLength - 1;
long PreviousReturn = FileLength;
// Process file
while (pos >= 0 && linescount < nFromLine + nNoLines) // Until found correct place
{
// Read a character from the end
c = BufferedGetCharBackwards(sr, pos);
if (c == Convert.ToInt32('\n'))
{
// Found return character
if (++linescount == nFromLine)
// Found last place
PreviousReturn = pos + 1; // Read to here
}
// Previous char
pos--;
}
pos++;
// Create buffer
buffer = new char[PreviousReturn - pos];
sr.DiscardBufferedData();
// Read all our chars
sr.BaseStream.Seek(pos, SeekOrigin.Begin);
sr.Read(buffer, (int)0, (int)(PreviousReturn - pos));
sr.Close();
// Store if more lines available
if (pos > 0)
// Is there more?
bMore = true;
}
if (buffer != null)
{
// Get data
string strResult = new string(buffer);
strResult = strResult.Replace("\r", "");
// Store in List
List<string> strSort = new List<string>(strResult.Split('\n'));
// Reverse order
strSort.Reverse();
return strSort;
}
}
}
}
catch (Exception ex)
{
System.Diagnostics.Debug.WriteLine("ReadLastLines Exception:" + ex.ToString());
}
// Lets return a list with no entries
return new List<string>();
}
const int CACHE_BUFFER_SIZE = 1024;
private long ncachestartbuffer = -1;
private char[] cachebuffer = null;
// Cache the file....
private int BufferedGetCharBackwards(StreamReader sr, long iPosFromBegin)
{
// Check for error
if (iPosFromBegin < 0 || iPosFromBegin >= sr.BaseStream.Length)
return -1;
// See if we have the character already
if (ncachestartbuffer >= 0 && ncachestartbuffer <= iPosFromBegin && ncachestartbuffer + cachebuffer.Length > iPosFromBegin)
{
return cachebuffer[iPosFromBegin - ncachestartbuffer];
}
// Load into cache
ncachestartbuffer = (int)Math.Max(0, iPosFromBegin - CACHE_BUFFER_SIZE + 1);
int nLength = (int)Math.Min(CACHE_BUFFER_SIZE, sr.BaseStream.Length - ncachestartbuffer);
cachebuffer = new char[nLength];
sr.DiscardBufferedData();
sr.BaseStream.Seek(ncachestartbuffer, SeekOrigin.Begin);
sr.Read(cachebuffer, (int)0, (int)nLength);
return BufferedGetCharBackwards(sr, iPosFromBegin);
}
Note:-
Call ReadLastLines with nLineFrom starting at 0 for the last line and nNoLines as the number of lines to read back from.
It reverses the list so the 1st one is the last line in the file.
bMore returns true if there are more lines to read.
It caches the data in 1024 char chunks - so it is fast, you may want to increase this size for very large files.
Enjoy!
This is in no way optimal but for quick and dirty checks with small log files I've been using something like this:
List<string> mostRecentLines = File.ReadLines(filePath)
// .Where(....)
// .Distinct()
.Reverse()
.Take(10)
.ToList()
Something that you can now do very easily in C# 4.0 (and with just a tiny bit of effort in earlier versions) is use memory mapped files for this type of operation. Its ideal for large files because you can map just a portion of the file, then access it as virtual memory.
There is a good example here.
As #EugeneMayevski stated above, if you just need an approximate number of lines returned, each line has roughly the same line length and you're more concerned with performance especially for large files, this is a better implementation:
internal static StringBuilder ReadApproxLastNLines(string filePath, int approxLinesToRead, int approxLengthPerLine)
{
//If each line is more or less of the same length and you don't really care if you get back exactly the last n
using (FileStream fs = new FileStream(filePath, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
{
var totalCharsToRead = approxLengthPerLine * approxLinesToRead;
var buffer = new byte[1];
//read approx chars to read backwards from end
fs.Seek(totalCharsToRead > fs.Length ? -fs.Length : -totalCharsToRead, SeekOrigin.End);
while (buffer[0] != '\n' && fs.Position > 0) //find new line char
{
fs.Read(buffer, 0, 1);
}
var returnStringBuilder = new StringBuilder();
using (StreamReader sr = new StreamReader(fs))
{
returnStringBuilder.Append(sr.ReadToEnd());
}
return returnStringBuilder;
}
}
Most log files have a DateTime stamp. Although can be improved, the code below works well if you want the log messages from the last N days.
/// <summary>
/// Returns list of entries from the last N days.
/// </summary>
/// <param name="N"></param>
/// <param name="cSEP">field separator, default is TAB</param>
/// <param name="indexOfDateColumn">default is 0; change if it is not the first item in each line</param>
/// <param name="bFileHasHeaderRow"> if true, it will not include the header row</param>
/// <returns></returns>
public List<string> ReadMessagesFromLastNDays(int N, char cSEP ='\t', int indexOfDateColumn = 0, bool bFileHasHeaderRow = true)
{
List<string> listRet = new List<string>();
//--- replace msFileName with the name (incl. path if appropriate)
string[] lines = File.ReadAllLines(msFileName);
if (lines.Length > 0)
{
DateTime dtm = DateTime.Now.AddDays(-N);
string sCheckDate = GetTimeStamp(dtm);
//--- process lines in reverse
int iMin = bFileHasHeaderRow ? 1 : 0;
for (int i = lines.Length - 1; i >= iMin; i--) //skip the header in line 0, if any
{
if (lines[i].Length > 0) //skip empty lines
{
string[] s = lines[i].Split(cSEP);
//--- s[indexOfDateColumn] contains the DateTime stamp in the log file
if (string.Compare(s[indexOfDateColumn], sCheckDate) >= 0)
{
//--- insert at top of list or they'd be in reverse chronological order
listRet.Insert(0, s[1]);
}
else
{
break; //out of loop
}
}
}
}
return listRet;
}
/// <summary>
/// Returns DateTime Stamp as formatted in the log file
/// </summary>
/// <param name="dtm">DateTime value</param>
/// <returns></returns>
private string GetTimeStamp(DateTime dtm)
{
// adjust format string to match what you use
return dtm.ToString("u");
}
As it currently stands, this question is not a good fit for our Q&A format. We expect answers to be supported by facts, references, or expertise, but this question will likely solicit debate, arguments, polling, or extended discussion. If you feel that this question can be improved and possibly reopened, visit the help center for guidance.
Closed 10 years ago.
Is there an open source library that will help me with reading/parsing PDF documents in .NET/C#?
Since this question was last answered in 2008, iTextSharp has improved their api dramatically. If you download the latest version of their api from http://sourceforge.net/projects/itextsharp/, you can use the following snippet of code to extract all text from a pdf into a string.
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
namespace PdfParser
{
public static class PdfTextExtractor
{
public static string pdfText(string path)
{
PdfReader reader = new PdfReader(path);
string text = string.Empty;
for(int page = 1; page <= reader.NumberOfPages; page++)
{
text += PdfTextExtractor.GetTextFromPage(reader,page);
}
reader.Close();
return text;
}
}
}
iTextSharp is the best bet. Used it to make a spider for lucene.Net so that it could crawl PDF.
using System;
using System.IO;
using iTextSharp.text.pdf;
using System.Text.RegularExpressions;
namespace Spider.Utils
{
/// <summary>
/// Parses a PDF file and extracts the text from it.
/// </summary>
public class PDFParser
{
/// BT = Beginning of a text object operator
/// ET = End of a text object operator
/// Td move to the start of next line
/// 5 Ts = superscript
/// -5 Ts = subscript
#region Fields
#region _numberOfCharsToKeep
/// <summary>
/// The number of characters to keep, when extracting text.
/// </summary>
private static int _numberOfCharsToKeep = 15;
#endregion
#endregion
#region ExtractText
/// <summary>
/// Extracts a text from a PDF file.
/// </summary>
/// <param name="inFileName">the full path to the pdf file.</param>
/// <param name="outFileName">the output file name.</param>
/// <returns>the extracted text</returns>
public bool ExtractText(string inFileName, string outFileName)
{
StreamWriter outFile = null;
try
{
// Create a reader for the given PDF file
PdfReader reader = new PdfReader(inFileName);
//outFile = File.CreateText(outFileName);
outFile = new StreamWriter(outFileName, false, System.Text.Encoding.UTF8);
Console.Write("Processing: ");
int totalLen = 68;
float charUnit = ((float)totalLen) / (float)reader.NumberOfPages;
int totalWritten = 0;
float curUnit = 0;
for (int page = 1; page <= reader.NumberOfPages; page++)
{
outFile.Write(ExtractTextFromPDFBytes(reader.GetPageContent(page)) + " ");
// Write the progress.
if (charUnit >= 1.0f)
{
for (int i = 0; i < (int)charUnit; i++)
{
Console.Write("#");
totalWritten++;
}
}
else
{
curUnit += charUnit;
if (curUnit >= 1.0f)
{
for (int i = 0; i < (int)curUnit; i++)
{
Console.Write("#");
totalWritten++;
}
curUnit = 0;
}
}
}
if (totalWritten < totalLen)
{
for (int i = 0; i < (totalLen - totalWritten); i++)
{
Console.Write("#");
}
}
return true;
}
catch
{
return false;
}
finally
{
if (outFile != null) outFile.Close();
}
}
#endregion
#region ExtractTextFromPDFBytes
/// <summary>
/// This method processes an uncompressed Adobe (text) object
/// and extracts text.
/// </summary>
/// <param name="input">uncompressed</param>
/// <returns></returns>
public string ExtractTextFromPDFBytes(byte[] input)
{
if (input == null || input.Length == 0) return "";
try
{
string resultString = "";
// Flag showing if we are we currently inside a text object
bool inTextObject = false;
// Flag showing if the next character is literal
// e.g. '\\' to get a '\' character or '\(' to get '('
bool nextLiteral = false;
// () Bracket nesting level. Text appears inside ()
int bracketDepth = 0;
// Keep previous chars to get extract numbers etc.:
char[] previousCharacters = new char[_numberOfCharsToKeep];
for (int j = 0; j < _numberOfCharsToKeep; j++) previousCharacters[j] = ' ';
for (int i = 0; i < input.Length; i++)
{
char c = (char)input[i];
if (input[i] == 213)
c = "'".ToCharArray()[0];
if (inTextObject)
{
// Position the text
if (bracketDepth == 0)
{
if (CheckToken(new string[] { "TD", "Td" }, previousCharacters))
{
resultString += "\n\r";
}
else
{
if (CheckToken(new string[] { "'", "T*", "\"" }, previousCharacters))
{
resultString += "\n";
}
else
{
if (CheckToken(new string[] { "Tj" }, previousCharacters))
{
resultString += " ";
}
}
}
}
// End of a text object, also go to a new line.
if (bracketDepth == 0 &&
CheckToken(new string[] { "ET" }, previousCharacters))
{
inTextObject = false;
resultString += " ";
}
else
{
// Start outputting text
if ((c == '(') && (bracketDepth == 0) && (!nextLiteral))
{
bracketDepth = 1;
}
else
{
// Stop outputting text
if ((c == ')') && (bracketDepth == 1) && (!nextLiteral))
{
bracketDepth = 0;
}
else
{
// Just a normal text character:
if (bracketDepth == 1)
{
// Only print out next character no matter what.
// Do not interpret.
if (c == '\\' && !nextLiteral)
{
resultString += c.ToString();
nextLiteral = true;
}
else
{
if (((c >= ' ') && (c <= '~')) ||
((c >= 128) && (c < 255)))
{
resultString += c.ToString();
}
nextLiteral = false;
}
}
}
}
}
}
// Store the recent characters for
// when we have to go back for a checking
for (int j = 0; j < _numberOfCharsToKeep - 1; j++)
{
previousCharacters[j] = previousCharacters[j + 1];
}
previousCharacters[_numberOfCharsToKeep - 1] = c;
// Start of a text object
if (!inTextObject && CheckToken(new string[] { "BT" }, previousCharacters))
{
inTextObject = true;
}
}
return CleanupContent(resultString);
}
catch
{
return "";
}
}
private string CleanupContent(string text)
{
string[] patterns = { #"\\\(", #"\\\)", #"\\226", #"\\222", #"\\223", #"\\224", #"\\340", #"\\342", #"\\344", #"\\300", #"\\302", #"\\304", #"\\351", #"\\350", #"\\352", #"\\353", #"\\311", #"\\310", #"\\312", #"\\313", #"\\362", #"\\364", #"\\366", #"\\322", #"\\324", #"\\326", #"\\354", #"\\356", #"\\357", #"\\314", #"\\316", #"\\317", #"\\347", #"\\307", #"\\371", #"\\373", #"\\374", #"\\331", #"\\333", #"\\334", #"\\256", #"\\231", #"\\253", #"\\273", #"\\251", #"\\221"};
string[] replace = { "(", ")", "-", "'", "\"", "\"", "à", "â", "ä", "À", "Â", "Ä", "é", "è", "ê", "ë", "É", "È", "Ê", "Ë", "ò", "ô", "ö", "Ò", "Ô", "Ö", "ì", "î", "ï", "Ì", "Î", "Ï", "ç", "Ç", "ù", "û", "ü", "Ù", "Û", "Ü", "®", "™", "«", "»", "©", "'" };
for (int i = 0; i < patterns.Length; i++)
{
string regExPattern = patterns[i];
Regex regex = new Regex(regExPattern, RegexOptions.IgnoreCase);
text = regex.Replace(text, replace[i]);
}
return text;
}
#endregion
#region CheckToken
/// <summary>
/// Check if a certain 2 character token just came along (e.g. BT)
/// </summary>
/// <param name="tokens">the searched token</param>
/// <param name="recent">the recent character array</param>
/// <returns></returns>
private bool CheckToken(string[] tokens, char[] recent)
{
foreach (string token in tokens)
{
if ((recent[_numberOfCharsToKeep - 3] == token[0]) &&
(recent[_numberOfCharsToKeep - 2] == token[1]) &&
((recent[_numberOfCharsToKeep - 1] == ' ') ||
(recent[_numberOfCharsToKeep - 1] == 0x0d) ||
(recent[_numberOfCharsToKeep - 1] == 0x0a)) &&
((recent[_numberOfCharsToKeep - 4] == ' ') ||
(recent[_numberOfCharsToKeep - 4] == 0x0d) ||
(recent[_numberOfCharsToKeep - 4] == 0x0a))
)
{
return true;
}
}
return false;
}
#endregion
}
}
PDFClown might help, but I would not recommend it for a big or heavy use application.
public string ReadPdfFile(object Filename, DataTable ReadLibray)
{
PdfReader reader2 = new PdfReader((string)Filename);
string strText = string.Empty;
for (int page = 1; page <= reader2.NumberOfPages; page++)
{
ITextExtractionStrategy its = new iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy();
PdfReader reader = new PdfReader((string)Filename);
String s = PdfTextExtractor.GetTextFromPage(reader, page, its);
s = Encoding.UTF8.GetString(ASCIIEncoding.Convert(Encoding.Default, Encoding.UTF8, Encoding.Default.GetBytes(s)));
strText = strText + s;
reader.Close();
}
return strText;
}
iText is the best library I know. Originally written in Java, there is a .NET port as well.
See http://www.ujihara.jp/iTextdotNET/en/
itext?
http://www.itextpdf.com/terms-of-use/index.php
Guide
http://www.vogella.com/articles/JavaPDF/article.html
You could look into this:
http://www.codeproject.com/KB/showcase/pdfrasterizer.aspx
It's not completely free, but it looks very nice.
Alex
http://www.c-sharpcorner.com/UploadFile/psingh/PDFFileGenerator12062005235236PM/PDFFileGenerator.aspx is open source and may be a good starting point for you.
aspose pdf works pretty well. then again, you have to pay for it
Have a look at Docotic.Pdf library. It does not require you to make source code of your application open (like iTextSharp with viral AGPL 3 license, for example).
Docotic.Pdf can be used to read PDF files and extract text with or without formatting. Please have a look at the article that shows how to extract text from PDFs.
Disclaimer: I work for Bit Miracle, vendor of the library.
There is also LibHaru
http://libharu.org/wiki/Main_Page