c# Registry to XML Invalid character issue - c#

I have a problem when trying to create an XML file from registry. On my laptop(W7 64b) it is working fine, the xml file is generated but on another computer (Xp 32b) an exception is thrown : System.ArgumentException '.', hexadecimal values 0x00, is an invalid character. I have read few useful things about it but I don't know how to solve in this case, here is the code :
try
{
string regPath = "SOFTWARE\\IPS";
XElement xRegRoot = new XElement("Root", new XAttribute("Registry", regPath));
ReadRegistry(regPath, xRegRoot);
string xmlStringReg = xRegRoot.ToString();
XmlDocument docR = new XmlDocument();
docR.LoadXml(xmlStringReg);
docR.Save(AppDomain.CurrentDomain.BaseDirectory + "\\_RegistryList.xml");
}
catch (System.Exception ex)
{
Console.WriteLine(ex.ToString());
LogToFile(ex.ToString());
}
private static void ReadRegistry(string keyPath, XElement xRegRoot)
{
string[] subKeys=null;
RegistryKey HKLM = Registry.LocalMachine;
RegistryKey RegKey = HKLM.OpenSubKey(keyPath);
try
{
subKeys = RegKey.GetSubKeyNames();
foreach (string subKey in subKeys)
{
string fullPath = keyPath + "\\" + subKey;
Console.WriteLine("\r\nKey Name | " + fullPath);
LogToFile("Key Name | " + fullPath);
XElement xregkey = new XElement("RegKeyName", new XAttribute("FullName", fullPath), new XAttribute("Name", subKey));
xRegRoot.Add(xregkey);
ReadRegistry(fullPath, xRegRoot);
}
string[] subVals = RegKey.GetValueNames();
foreach (string val in subVals)
{
string keyName = val;
string keyType = RegKey.GetValueKind(val).ToString();
string keyValue = RegKey.GetValue(val).ToString();
Console.WriteLine("Key Value | " + keyType + " | " + keyName + " | " + keyValue);
LogToFile("Key " + keyType + " | " + keyName + " | " + keyValue);
XElement xregvalue = new XElement("RegKeyValue", new XAttribute("keyType", keyType), new XAttribute("keyName", keyName), new XAttribute("keyValue", keyValue));
xRegRoot.Add(xregvalue);
}
}
catch (System.Exception ex)
{
Console.WriteLine(ex.ToString());
LogToFile(ex.ToString());
}
}
Thanks in advance.

I did some experiments:
new XElement("foo\x00bar") throws on construction.
new XAttribute("foo\x00bar", "baz") throws on construction.
new XText("foo\x00bar") throws only when calling .ToString().
new XAttribute("foo", "bar\x00baz") is equivalent to new XAttribute("foo", new XText("bar\x00baz")), so it won't throw on construction.
I did not manage to make any of the registry-methods return a string with null-characters, but you should be able to find where this is returned yourself.

You can read more about it here: http://seattlesoftware.wordpress.com/2008/09/11/hexadecimal-value-0-is-an-invalid-character/
And more about it here: XElement & UTF-8 Issue
A valid list of xml chars are here
http://en.wikipedia.org/wiki/Valid_characters_in_XML
But essentially you can fix it by removing illegal chars before serialising
/// <summary>
/// Remove illegal XML characters from a string.
/// </summary>
public string SanitizeXmlString(string xml)
{
if (string.IsNullOrEmpty(value))
{
return value;
}
StringBuilder buffer = new StringBuilder(xml.Length);
foreach (char c in xml)
{
if (IsLegalXmlChar(c))
{
buffer.Append(c);
}
}
return buffer.ToString();
}
/// <summary>
/// Whether a given character is allowed by XML 1.0.
/// </summary>
public bool IsLegalXmlChar(int character)
{
return
(
character == 0x9 /* == '\t' == 9 */ ||
character == 0xA /* == '\n' == 10 */ ||
character == 0xD /* == '\r' == 13 */ ||
(character >= 0x20 && character <= 0xD7FF ) ||
(character >= 0xE000 && character <= 0xFFFD ) ||
(character >= 0x10000 && character <= 0x10FFFF)
);
}

Here are a couple little improvements that a) compile, and b) handle surrogate pairs:
/// <summary>
/// Remove illegal XML characters from a string.
/// </summary>
public static string SanitizeString(string s)
{
if (string.IsNullOrEmpty(s))
{
return s;
}
StringBuilder buffer = new StringBuilder(s.Length);
for (int i = 0; i < s.Length; i++)
{
int code;
try
{
code = Char.ConvertToUtf32(s, i);
}
catch (ArgumentException)
{
continue;
}
if (IsLegalXmlChar(code))
buffer.Append(Char.ConvertFromUtf32(code));
if (Char.IsSurrogatePair(s, i))
i++;
}
return buffer.ToString();
}
/// <summary>
/// Whether a given character is allowed by XML 1.0.
/// </summary>
private static bool IsLegalXmlChar(int codePoint)
{
return (codePoint == 0x9 ||
codePoint == 0xA ||
codePoint == 0xD ||
(codePoint >= 0x20 && codePoint <= 0xD7FF) ||
(codePoint >= 0xE000 && codePoint <= 0xFFFD) ||
(codePoint >= 0x10000/* && character <= 0x10FFFF*/) //it's impossible to get a code point bigger than 0x10FFFF because Char.ConvertToUtf32 would have thrown an exception
);
}

Related

How to make a function that checks the validity of brackets in a string?

I have coded a function that check if brackets in a certain string are valid and returns true if it is and false if it isn't.
For example:
str1: { [ a + b ] - ] ( c - d } ] = false.
str2: { [ a + b ] - ( c - d ) } = true.
When I run the program it doesn't give any output, just a blank output.
What do I need to change?
public static Boolean BracketCheck(string str)
{
Stack<char> stk = new Stack<char>();
Stack<char> aid = new Stack<char>();
Stack<char> temp = new Stack<char>();
while (str != "")
{
char ch = str[0];
if(ch == '(' || ch == '{' || ch == '[' || ch == ')' || ch == '}' || ch == ']')
{
stk.Push(ch);
}
if(str.Length != 1)
str = str.Substring(1, str.Length - 1);
}
stk = Opposite(stk);
char first = stk.Pop();
char last;
while (!stk.IsEmpty() && !aid.IsEmpty())
{
while (!stk.IsEmpty())
{
aid.Push(stk.Top());
last = stk.Pop();
if (stk.IsEmpty())
if (int.Parse(first + "") + 1 != int.Parse(last + "") || int.Parse(first + "") + 2 != int.Parse(last + ""))
{
return false;
}
}
first = aid.Pop();
while (!aid.IsEmpty())
{
aid.Push(aid.Top());
last = aid.Pop();
if (aid.IsEmpty())
if (int.Parse(first + "") + 1 != int.Parse(last + "") || int.Parse(first + "") + 2 != int.Parse(last + ""))
{
return false;
}
}
first = stk.Pop();
}
return true;
}
public static Stack<char> Opposite(Stack<char> stk)
{
Stack<char> temp = new Stack<char>();
while (stk.IsEmpty())
{
temp.Push(stk.Pop());
}
return temp;
}
You are on the right way (Stack) but it should be just one, not three. To check brackets validity only:
public static Boolean BracketCheck(string str) {
if (string.IsNullOrEmpty(str))
return true;
Stack<char> expected = new Stack<char>();
foreach (char c in str) {
if (c == '(')
expected.Push(')');
else if (c == '[')
expected.Push(']');
else if (c == '{')
expected.Push('}');
else if (c == ')' || c == ']' || c == '}') {
if (expected.Count == 0 || expected.Pop() != c)
return false;
}
}
return expected.Count == 0;
}
If you want to validate the string as a formula, e.g. (3 +) 5 has valid brackets, but is invalid formula, have a look at shunting yard algorithm
You created aid and did nothing with it before the line while (!stk.IsEmpty() && !aid.IsEmpty()) so aid is empty and nothing in that loop ever runs.
There's also a bunch of things that might be better asked on the code review site; for example you don't need to remove characters from a string to iterate over the characters in it or convert chars to strings to integers to compare them.
Essentially what you want to do is create a stack, iterate over the string, any opening bracket push to the stack, any closing bracket pop the stack and check the opening bracket matches, and if at the end of the string the stack is empty then it is valid. You don't need all the reversing and creating second stack stuff.
this works for me do you see any issuse or points to improve on?
public static Boolean BracketCheck(string str)
{
Stack<char> stk = new Stack<char>();
foreach(char c in str)
{
if (c == '(' || c == '[' || c == '{')
{
stk.Push(c);
}
else if (c == ')' || c == ']' || c == '}')
{
if (stk.Top() == (int)c - 1 || stk.Top() == (int)c - 2)
{
stk.Pop();
}
}
}
return stk.IsEmpty();
}

Problems to extract text from PDF for certain pdfs only C#

I need to extract some data from a PDF file.
I'm using the iTextSharp to do that.
I'm using this code which I founded on the net:
using System;
using System.IO;
using iTextSharp.text.pdf;
namespace PdfToText
{
/// <summary>
/// Parses a PDF file and extracts the text from it.
/// </summary>
public class PDFParser
{
/// BT = Beginning of a text object operator
/// ET = End of a text object operator
/// Td move to the start of next line
/// 5 Ts = superscript
/// -5 Ts = subscript
#region Fields
#region _numberOfCharsToKeep
/// <summary>
/// The number of characters to keep, when extracting text.
/// </summary>
private static int _numberOfCharsToKeep = 15;
#endregion
#endregion
#region ExtractText
/// <summary>
/// Extracts a text from a PDF file.
/// </summary>
/// <param name="inFileName">the full path to the pdf file.</param>
/// <param name="outFileName">the output file name.</param>
/// <returns>the extracted text</returns>
public bool ExtractText(string inFileName, string outFileName)
{
StreamWriter outFile = null;
try
{
outFileName = String.Empty;
outFileName = Path.GetDirectoryName(System.AppDomain.CurrentDomain.BaseDirectory);
//string currentDirectory = Directory.GetCurrentDirectory();
//string filePath = System.IO.Path.Combine(currentDirectory, "Data", "myfile.txt");
// extract the text
//string test = "";
outFileName += #"\test.txt";
// Create a reader for the given PDF file
PdfReader reader = new PdfReader(inFileName);
//outFile = File.CreateText(outFileName);
outFile = new StreamWriter(outFileName, true, System.Text.Encoding.UTF8);
Console.Write("Processing: ");
int totalLen = 68;
float charUnit = ((float)totalLen) / (float)reader.NumberOfPages;
int totalWritten = 0;
float curUnit = 0;
for (int page = 1; page <= reader.NumberOfPages; page++)
{
outFile.Write(ExtractTextFromPDFBytes(reader.GetPageContent(page)) + " ");
// Write the progress.
if (charUnit >= 1.0f)
{
for (int i = 0; i < (int)charUnit; i++)
{
Console.Write("#");
totalWritten++;
}
}
else
{
curUnit += charUnit;
if (curUnit >= 1.0f)
{
for (int i = 0; i < (int)curUnit; i++)
{
Console.Write("#");
totalWritten++;
}
curUnit = 0;
}
}
}
if (totalWritten < totalLen)
{
for (int i = 0; i < (totalLen - totalWritten); i++)
{
Console.Write("#");
}
}
return true;
}
catch(Exception ex)
{
return false;
}
finally
{
if (outFile != null) outFile.Close();
}
}
#endregion
#region ExtractTextFromPDFBytes
/// <summary>
/// This method processes an uncompressed Adobe (text) object
/// and extracts text.
/// </summary>
/// <param name="input">uncompressed</param>
/// <returns></returns>
private string ExtractTextFromPDFBytes(byte[] input)
{
if (input == null || input.Length == 0) return "";
try
{
string resultString = "";
// Flag showing if we are we currently inside a text object
bool inTextObject = false;
// Flag showing if the next character is literal
// e.g. '\\' to get a '\' character or '\(' to get '('
bool nextLiteral = false;
// () Bracket nesting level. Text appears inside ()
int bracketDepth = 0;
// Keep previous chars to get extract numbers etc.:
char[] previousCharacters = new char[_numberOfCharsToKeep];
for (int j = 0; j < _numberOfCharsToKeep; j++) previousCharacters[j] = ' ';
for (int i = 0; i < input.Length; i++)
{
char c = (char)input[i];
if (inTextObject)
{
// Position the text
if (bracketDepth == 0)
{
if (CheckToken(new string[] { "TD", "Td" }, previousCharacters))
{
resultString += "\n\r";
}
else
{
if (CheckToken(new string[] { "'", "T*", "\"" }, previousCharacters))
{
resultString += "\n";
}
else
{
if (CheckToken(new string[] { "Tj" }, previousCharacters))
{
resultString += " ";
}
}
}
}
// End of a text object, also go to a new line.
if (bracketDepth == 0 &&
CheckToken(new string[] { "ET" }, previousCharacters))
{
inTextObject = false;
resultString += " ";
}
else
{
// Start outputting text
if ((c == '(') && (bracketDepth == 0) && (!nextLiteral))
{
bracketDepth = 1;
}
else
{
// Stop outputting text
if ((c == ')') && (bracketDepth == 1) && (!nextLiteral))
{
bracketDepth = 0;
}
else
{
// Just a normal text character:
if (bracketDepth == 1)
{
// Only print out next character no matter what.
// Do not interpret.
if (c == '\\' && !nextLiteral)
{
nextLiteral = true;
}
else
{
if (((c >= ' ') && (c <= '~')) ||
((c >= 128) && (c < 255)))
{
resultString += c.ToString();
}
nextLiteral = false;
}
}
}
}
}
}
// Store the recent characters for
// when we have to go back for a checking
for (int j = 0; j < _numberOfCharsToKeep - 1; j++)
{
previousCharacters[j] = previousCharacters[j + 1];
}
previousCharacters[_numberOfCharsToKeep - 1] = c;
// Start of a text object
if (!inTextObject && CheckToken(new string[] { "BT" }, previousCharacters))
{
inTextObject = true;
}
}
return resultString;
}
catch
{
return "";
}
}
#endregion
#region CheckToken
/// <summary>
/// Check if a certain 2 character token just came along (e.g. BT)
/// </summary>
/// <param name="search">the searched token</param>
/// <param name="recent">the recent character array</param>
/// <returns></returns>
private bool CheckToken(string[] tokens, char[] recent)
{
foreach (string token in tokens)
{
if ((recent[_numberOfCharsToKeep - 3] == token[0]) &&
(recent[_numberOfCharsToKeep - 2] == token[1]) &&
((recent[_numberOfCharsToKeep - 1] == ' ') ||
(recent[_numberOfCharsToKeep - 1] == 0x0d) ||
(recent[_numberOfCharsToKeep - 1] == 0x0a)) &&
((recent[_numberOfCharsToKeep - 4] == ' ') ||
(recent[_numberOfCharsToKeep - 4] == 0x0d) ||
(recent[_numberOfCharsToKeep - 4] == 0x0a))
)
{
return true;
}
}
return false;
}
#endregion
}
}
I'm using this way:
PDFParser pdfParser = new PDFParser();
pdfParser.ExtractText(pdfFile,Path.GetFileNameWithoutExtension(pdfFile) + ".txt");
So the pdf content is written in a txt file.
It works good for certain pdf-s, but for a pdf file that I really need to use, the txt file remains always empty. I didn't get errors, but for some reason it's not writing anything, although as you can see in this screenshot it recognize the pdf,that it has 2 pages...
This is the pdf that I need but the txt always remains empty.(the black lines are added by me, so there are not present when I want to write in the txt)
And this is another pdf. For this the program works ok, and it is written is a txt file. It is much bigger than the other pdf, and still for this I can extract the texts and for the other I can't.
Do you have any idea what can be the problem?
Too long for comment and maybe an answer that you do not like to get:
In PDFs the "Text your see" aka how does a font look and "What the glyphs mean" aka what glyph is mapped to which utf8-letter are separate things.
They are stored in different parts of the pdf - it is utterly possible that a pdf looks totally fine, but if you try to extract text it will give you nothing from it because it only contains the shape of your textglyphs but not theire "meaning".
Try to open the pdf and Select + Copy the text you are after, if you paste that into an editor and noting is there, your pdf lacks the information "what utf8-letter is displayed by this glyph".
OR:
It also might be that your pdf only containts the image of a text - a photo so to say. You can read it, iTextSharp sees only a "picture" - no text.
Those are possible 'why's that would answer your question. As to how to fix it:
There are several questions about corrupt PDFs on SO:
How to repair a PDF file and embed missing fonts
Embedded fonts in PDF: copy and paste problems (this answer)
Copy and Paste relates to text parsing, so the might help you out on how to fix it.
Your edit shows details about your parsing, why don't you leverage iTextSharp for that?
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
public static string ExtractTextFromPdf(string path)
{
using (PdfReader reader = new PdfReader(path))
{
StringBuilder text = new StringBuilder();
for (int i = 1; i <= reader.NumberOfPages; i++)
{
text.Append(PdfTextExtractor.GetTextFromPage(reader, i));
}
return text.ToString();
}
from: http://www.squarepdf.net/parsing-pdf-files-using-itextsharp
or like here: parse-pdf-with-itextsharp-and-then-extract-specific-text-to-the-screen ?

C# StreamReader detect  encoding of an XML File

In my xml file i have data like this :
<Data>
<Field>
<Name>BarcodeCapture_0</Name>
<Type>SimpleIndex</Type>
<DataType>DataMatrix</DataType>
<Value>DEA"¡CV°)Ñ võ Fƒ´ 20100410050</Value>
</Field>
</Data>
Im using a class that extend from StreamReader, i override reading methodes to prevent inaccepted characters like  character.
This is the class
public class CustomStreamReader : StreamReader
{
private const int EOF = -1;
public CustomStreamReader(Stream stream) : base(stream)
{
}
public CustomStreamReader(string path) : base(path)
{
}
public CustomStreamReader(string path, Encoding encoding) : base(path, encoding)
{
}
/// <summary>
/// Get whether an integer represents a legal XML 1.0 or 1.1 character. See
/// the specification at w3.org for these characters.
/// </summary>
/// <param name="xmlVersion">
/// The version number as a string. Use "1.0" for XML 1.0 character
/// validation, and use "1.1" for XML 1.1 character validation.
/// </param>
public static bool IsLegalXmlChar(string xmlVersion, int character)
{
switch (xmlVersion)
{
case "1.1": // http://www.w3.org/TR/xml11/#charsets
{
return
!(
character <= 0x8 ||
character == 0xB ||
character == 0xC ||
(character >= 0xE && character <= 0x1F) ||
(character >= 0x7F && character <= 0x84) ||
(character >= 0x86 && character <= 0x9F) ||
character > 0x10FFFF
);
}
case "1.0": // http://www.w3.org/TR/REC-xml/#charsets
{
return
(
character == 0x9 /* == '\t' == 9 */ ||
character == 0xA /* == '\n' == 10 */ ||
character == 0xD /* == '\r' == 13 */ ||
(character >= 0x20 && character <= 0xD7FF) ||
(character >= 0xE000 && character <= 0xFFFD) ||
(character >= 0x10000 && character <= 0x10FFFF)
);
}
default:
{
throw new ArgumentOutOfRangeException
("xmlVersion", string.Format("'{0}' is not a valid XML version."));
}
}
}
/// <summary>
/// Get whether an integer represents a legal XML 1.0 character. See the
/// specification at w3.org for these characters.
/// </summary>
public static bool IsLegalXmlChar(int character)
{
return CustomStreamReader.IsLegalXmlChar("1.0", character);
}
public override int Read()
{
// Read each character, skipping over characters that XML has prohibited
int nextCharacter;
do
{
// Read a character
if ((nextCharacter = base.Read()) == EOF)
{
// If the character denotes the end of the file, stop reading
break;
}
}
// Skip the character if it's prohibited, and try the next
while (!CustomStreamReader.IsLegalXmlChar(nextCharacter));
return nextCharacter;
}
public override int Peek()
{
// Return the next legl XML character without reading it
int nextCharacter;
do
{
// See what the next character is
nextCharacter = base.Peek();
}
while
(
// If it's prohibited XML, skip over the character in the stream
// and try the next.
!CustomStreamReader.IsLegalXmlChar(nextCharacter) &&
(nextCharacter = base.Read()) != EOF
);
return nextCharacter;
} // method
// The following methods are exact copies of the methods in TextReader,
// extracting by disassembling it in Refelctor
public override int Read(char[] buffer, int index, int count)
{
if (buffer == null)
{
throw new ArgumentNullException("buffer");
}
if (index < 0)
{
throw new ArgumentOutOfRangeException("index");
}
if (count < 0)
{
throw new ArgumentOutOfRangeException("count");
}
if ((buffer.Length - index) < count)
{
throw new ArgumentException();
}
int num = 0;
do
{
int num2 = this.Read();
if (num2 == -1)
{
return num;
}
buffer[index + num++] = (char)num2;
}
while (num < count);
return num;
}
public override int ReadBlock(char[] buffer, int index, int count)
{
int num;
int num2 = 0;
do
{
num2 += num = this.Read(buffer, index + num2, count - num2);
}
while ((num > 0) && (num2 < count));
return num2;
}
public override string ReadLine()
{
StringBuilder builder = new StringBuilder();
while (true)
{
int num = this.Read();
switch (num)
{
case -1:
if (builder.Length > 0)
{
return builder.ToString();
}
return null;
case 13:
case 10:
if ((num == 13) && (this.Peek() == 10))
{
this.Read();
}
return builder.ToString();
}
builder.Append((char)num);
}
}
public override string ReadToEnd()
{
int num;
char[] buffer = new char[0x1000];
StringBuilder builder = new StringBuilder(0x1000);
while ((num = this.Read(buffer, 0, buffer.Length)) != 0)
{
builder.Append(buffer, 0, num);
}
return builder.ToString();
}
}
In XML deserialization side :
CustomStreamReader fStream_scanTransaction_XML = new CustomStreamReader(scanTransactionFilePath, Encoding.UTF8);
XmlSerializer s = new XmlSerializer(typeof(ScanTransaction));
ScanTransaction result = ScanTransaction)s.Deserialize(fStream_scanTransaction_XML);
The problem is that StreamReader cannot detect  encoding, so it not deleting this character and the Xml deserialization faild.
Try:
using (var sr = new StreamReader("XMLFile1.xml", Encoding.UTF8))
using (var xtr = new XmlTextReader(sr))
{
XmlSerializer s = new XmlSerializer(typeof(ScanTransaction));
ScanTransaction result = (ScanTransaction)s.Deserialize(xtr);
}
You don't even neeed a "special" StreamReader. The XmlTextReader doesn't check for illegal characters (you can control this with the Normalize boolean property, but it is by default false, so no check on illegal characters)

Application that indents an unindented code in C#

My application should read a C# code sample that is unindented, then indent the code programatically. The way I am doing it may not be correct but still could achieve partial results.
I could set white spaces when a { is found then continue with the same amount of space for rest of the lines being read. When another { is found again add spaces and continue with this new space for rest of lines. For that this is what I did:
private void btn_format_Click(object sender, EventArgs e)
{
string lineInfo = "";
string fl = "";
string ctab= char.ConvertFromUtf32(32)+char.ConvertFromUtf32(32)+char.ConvertFromUtf32(32);
foreach (string line in txt_codepage.Lines) // text_codepage is a textbox with code
{
if (line.Contains("{"))
{
string l = line.Replace("{", ctab+"{");
lineInfo = lineInfo + (l + "\n");
fl = fl + ctab;
ctab = ctab + ctab;
}
else
{
lineInfo = lineInfo + (char.ConvertFromUtf32(32)+fl+ line + "\n");
}
I could achieve the proper indentation that I want till here. Now when I find a } I should do the reverse process but unfortunately that is not possible with strings. The reverse process that I meant is this:
if (line.Contains("}"))
{
string l = line.Replace(ctab + "}", "}");
lineInfo = lineInfo + (l + "\n");
fl = fl - ctab;
ctab = ctab - ctab;
}
else
{
lineInfo = lineInfo - (char.ConvertFromUtf32(32) + fl + line + "\n");
}
}
MessageBox.Show(lineInfo.ToString());
I know the above part of the code is a complete blunder but let me know how to achieve it in correct way
If you want parse string, you should use StringBuilder instead string concatenations (concatenations is to slow). I wrote some code, to demonstrate how you can parse CS or other code. It is not a full example, just a basic concepts.
If you want learn more about parsers you can read Compilers: Principles, Techniques, and Tools.
public static string IndentCSharpCode(string code)
{
const string INDENT_STEP = " ";
if (string.IsNullOrWhiteSpace(code))
{
return code;
}
var result = new StringBuilder();
var indent = string.Empty;
var lineContent = false;
var stringDefinition = false;
for (var i = 0; i < code.Length; i++)
{
var ch = code[i];
if (ch == '"' && !stringDefinition)
{
result.Append(ch);
stringDefinition = true;
continue;
}
if (ch == '"' && stringDefinition)
{
result.Append(ch);
stringDefinition = false;
continue;
}
if (stringDefinition)
{
result.Append(ch);
continue;
}
if (ch == '{' && !stringDefinition)
{
if (lineContent)
{
result.AppendLine();
}
result.Append(indent).Append("{");
if (lineContent)
{
result.AppendLine();
}
indent += INDENT_STEP;
lineContent = false;
continue;
}
if (ch == '}' && !stringDefinition)
{
if (indent.Length != 0)
{
indent = indent.Substring(0, indent.Length - INDENT_STEP.Length);
}
if (lineContent)
{
result.AppendLine();
}
result.Append(indent).Append("}");
if (lineContent)
{
result.AppendLine();
}
lineContent = false;
continue;
}
if (ch == '\r')
{
continue;
}
if ((ch == ' ' || ch == '\t') && !lineContent)
{
continue;
}
if (ch == '\n')
{
lineContent = false;
result.AppendLine();
continue;
}
if (!lineContent)
{
result.Append(indent);
lineContent = true;
}
result.Append(ch);
}
return result.ToString();
}
You can go and check out codemaid, an open source VS add in for cleaning code
Remove all of the whitespace from the line using String.Trim() and then add just the tabs you want. Also, your code would be much more readable if you could avoid char.ConvertFromUtf32(32) - why write that instead of " " or ' '?

Reading PDF documents in .Net [closed]

As it currently stands, this question is not a good fit for our Q&A format. We expect answers to be supported by facts, references, or expertise, but this question will likely solicit debate, arguments, polling, or extended discussion. If you feel that this question can be improved and possibly reopened, visit the help center for guidance.
Closed 10 years ago.
Is there an open source library that will help me with reading/parsing PDF documents in .NET/C#?
Since this question was last answered in 2008, iTextSharp has improved their api dramatically. If you download the latest version of their api from http://sourceforge.net/projects/itextsharp/, you can use the following snippet of code to extract all text from a pdf into a string.
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
namespace PdfParser
{
public static class PdfTextExtractor
{
public static string pdfText(string path)
{
PdfReader reader = new PdfReader(path);
string text = string.Empty;
for(int page = 1; page <= reader.NumberOfPages; page++)
{
text += PdfTextExtractor.GetTextFromPage(reader,page);
}
reader.Close();
return text;
}
}
}
iTextSharp is the best bet. Used it to make a spider for lucene.Net so that it could crawl PDF.
using System;
using System.IO;
using iTextSharp.text.pdf;
using System.Text.RegularExpressions;
namespace Spider.Utils
{
/// <summary>
/// Parses a PDF file and extracts the text from it.
/// </summary>
public class PDFParser
{
/// BT = Beginning of a text object operator
/// ET = End of a text object operator
/// Td move to the start of next line
/// 5 Ts = superscript
/// -5 Ts = subscript
#region Fields
#region _numberOfCharsToKeep
/// <summary>
/// The number of characters to keep, when extracting text.
/// </summary>
private static int _numberOfCharsToKeep = 15;
#endregion
#endregion
#region ExtractText
/// <summary>
/// Extracts a text from a PDF file.
/// </summary>
/// <param name="inFileName">the full path to the pdf file.</param>
/// <param name="outFileName">the output file name.</param>
/// <returns>the extracted text</returns>
public bool ExtractText(string inFileName, string outFileName)
{
StreamWriter outFile = null;
try
{
// Create a reader for the given PDF file
PdfReader reader = new PdfReader(inFileName);
//outFile = File.CreateText(outFileName);
outFile = new StreamWriter(outFileName, false, System.Text.Encoding.UTF8);
Console.Write("Processing: ");
int totalLen = 68;
float charUnit = ((float)totalLen) / (float)reader.NumberOfPages;
int totalWritten = 0;
float curUnit = 0;
for (int page = 1; page <= reader.NumberOfPages; page++)
{
outFile.Write(ExtractTextFromPDFBytes(reader.GetPageContent(page)) + " ");
// Write the progress.
if (charUnit >= 1.0f)
{
for (int i = 0; i < (int)charUnit; i++)
{
Console.Write("#");
totalWritten++;
}
}
else
{
curUnit += charUnit;
if (curUnit >= 1.0f)
{
for (int i = 0; i < (int)curUnit; i++)
{
Console.Write("#");
totalWritten++;
}
curUnit = 0;
}
}
}
if (totalWritten < totalLen)
{
for (int i = 0; i < (totalLen - totalWritten); i++)
{
Console.Write("#");
}
}
return true;
}
catch
{
return false;
}
finally
{
if (outFile != null) outFile.Close();
}
}
#endregion
#region ExtractTextFromPDFBytes
/// <summary>
/// This method processes an uncompressed Adobe (text) object
/// and extracts text.
/// </summary>
/// <param name="input">uncompressed</param>
/// <returns></returns>
public string ExtractTextFromPDFBytes(byte[] input)
{
if (input == null || input.Length == 0) return "";
try
{
string resultString = "";
// Flag showing if we are we currently inside a text object
bool inTextObject = false;
// Flag showing if the next character is literal
// e.g. '\\' to get a '\' character or '\(' to get '('
bool nextLiteral = false;
// () Bracket nesting level. Text appears inside ()
int bracketDepth = 0;
// Keep previous chars to get extract numbers etc.:
char[] previousCharacters = new char[_numberOfCharsToKeep];
for (int j = 0; j < _numberOfCharsToKeep; j++) previousCharacters[j] = ' ';
for (int i = 0; i < input.Length; i++)
{
char c = (char)input[i];
if (input[i] == 213)
c = "'".ToCharArray()[0];
if (inTextObject)
{
// Position the text
if (bracketDepth == 0)
{
if (CheckToken(new string[] { "TD", "Td" }, previousCharacters))
{
resultString += "\n\r";
}
else
{
if (CheckToken(new string[] { "'", "T*", "\"" }, previousCharacters))
{
resultString += "\n";
}
else
{
if (CheckToken(new string[] { "Tj" }, previousCharacters))
{
resultString += " ";
}
}
}
}
// End of a text object, also go to a new line.
if (bracketDepth == 0 &&
CheckToken(new string[] { "ET" }, previousCharacters))
{
inTextObject = false;
resultString += " ";
}
else
{
// Start outputting text
if ((c == '(') && (bracketDepth == 0) && (!nextLiteral))
{
bracketDepth = 1;
}
else
{
// Stop outputting text
if ((c == ')') && (bracketDepth == 1) && (!nextLiteral))
{
bracketDepth = 0;
}
else
{
// Just a normal text character:
if (bracketDepth == 1)
{
// Only print out next character no matter what.
// Do not interpret.
if (c == '\\' && !nextLiteral)
{
resultString += c.ToString();
nextLiteral = true;
}
else
{
if (((c >= ' ') && (c <= '~')) ||
((c >= 128) && (c < 255)))
{
resultString += c.ToString();
}
nextLiteral = false;
}
}
}
}
}
}
// Store the recent characters for
// when we have to go back for a checking
for (int j = 0; j < _numberOfCharsToKeep - 1; j++)
{
previousCharacters[j] = previousCharacters[j + 1];
}
previousCharacters[_numberOfCharsToKeep - 1] = c;
// Start of a text object
if (!inTextObject && CheckToken(new string[] { "BT" }, previousCharacters))
{
inTextObject = true;
}
}
return CleanupContent(resultString);
}
catch
{
return "";
}
}
private string CleanupContent(string text)
{
string[] patterns = { #"\\\(", #"\\\)", #"\\226", #"\\222", #"\\223", #"\\224", #"\\340", #"\\342", #"\\344", #"\\300", #"\\302", #"\\304", #"\\351", #"\\350", #"\\352", #"\\353", #"\\311", #"\\310", #"\\312", #"\\313", #"\\362", #"\\364", #"\\366", #"\\322", #"\\324", #"\\326", #"\\354", #"\\356", #"\\357", #"\\314", #"\\316", #"\\317", #"\\347", #"\\307", #"\\371", #"\\373", #"\\374", #"\\331", #"\\333", #"\\334", #"\\256", #"\\231", #"\\253", #"\\273", #"\\251", #"\\221"};
string[] replace = { "(", ")", "-", "'", "\"", "\"", "à", "â", "ä", "À", "Â", "Ä", "é", "è", "ê", "ë", "É", "È", "Ê", "Ë", "ò", "ô", "ö", "Ò", "Ô", "Ö", "ì", "î", "ï", "Ì", "Î", "Ï", "ç", "Ç", "ù", "û", "ü", "Ù", "Û", "Ü", "®", "™", "«", "»", "©", "'" };
for (int i = 0; i < patterns.Length; i++)
{
string regExPattern = patterns[i];
Regex regex = new Regex(regExPattern, RegexOptions.IgnoreCase);
text = regex.Replace(text, replace[i]);
}
return text;
}
#endregion
#region CheckToken
/// <summary>
/// Check if a certain 2 character token just came along (e.g. BT)
/// </summary>
/// <param name="tokens">the searched token</param>
/// <param name="recent">the recent character array</param>
/// <returns></returns>
private bool CheckToken(string[] tokens, char[] recent)
{
foreach (string token in tokens)
{
if ((recent[_numberOfCharsToKeep - 3] == token[0]) &&
(recent[_numberOfCharsToKeep - 2] == token[1]) &&
((recent[_numberOfCharsToKeep - 1] == ' ') ||
(recent[_numberOfCharsToKeep - 1] == 0x0d) ||
(recent[_numberOfCharsToKeep - 1] == 0x0a)) &&
((recent[_numberOfCharsToKeep - 4] == ' ') ||
(recent[_numberOfCharsToKeep - 4] == 0x0d) ||
(recent[_numberOfCharsToKeep - 4] == 0x0a))
)
{
return true;
}
}
return false;
}
#endregion
}
}
PDFClown might help, but I would not recommend it for a big or heavy use application.
public string ReadPdfFile(object Filename, DataTable ReadLibray)
{
PdfReader reader2 = new PdfReader((string)Filename);
string strText = string.Empty;
for (int page = 1; page <= reader2.NumberOfPages; page++)
{
ITextExtractionStrategy its = new iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy();
PdfReader reader = new PdfReader((string)Filename);
String s = PdfTextExtractor.GetTextFromPage(reader, page, its);
s = Encoding.UTF8.GetString(ASCIIEncoding.Convert(Encoding.Default, Encoding.UTF8, Encoding.Default.GetBytes(s)));
strText = strText + s;
reader.Close();
}
return strText;
}
iText is the best library I know. Originally written in Java, there is a .NET port as well.
See http://www.ujihara.jp/iTextdotNET/en/
itext?
http://www.itextpdf.com/terms-of-use/index.php
Guide
http://www.vogella.com/articles/JavaPDF/article.html
You could look into this:
http://www.codeproject.com/KB/showcase/pdfrasterizer.aspx
It's not completely free, but it looks very nice.
Alex
http://www.c-sharpcorner.com/UploadFile/psingh/PDFFileGenerator12062005235236PM/PDFFileGenerator.aspx is open source and may be a good starting point for you.
aspose pdf works pretty well. then again, you have to pay for it
Have a look at Docotic.Pdf library. It does not require you to make source code of your application open (like iTextSharp with viral AGPL 3 license, for example).
Docotic.Pdf can be used to read PDF files and extract text with or without formatting. Please have a look at the article that shows how to extract text from PDFs.
Disclaimer: I work for Bit Miracle, vendor of the library.
There is also LibHaru
http://libharu.org/wiki/Main_Page

Categories