In my project I need to read a PDF document. This PDF contains Ukrainian and Russian characters. The PdfReader reads all the characters in this PDF, but the Cyrillic characters are missing from the output. I tried converting the encoding, but it did not help. What can I do about these characters?
/// <summary>
/// Extracts the text of every page of a PDF file and returns it as one string.
/// </summary>
/// <param name="filePath">Path of the PDF file to read.</param>
/// <returns>
/// The text of all pages, each page preceded by a line break, or an empty
/// string when <paramref name="filePath"/> does not exist.
/// </returns>
public static string GetText(string filePath)
{
    StringBuilder text = new StringBuilder();
    if (!File.Exists(filePath))
    {
        return text.ToString();
    }

    PdfReader pdfReader = new PdfReader(filePath);
    try
    {
        // Page numbers are 1-based and the bound is inclusive; the previous
        // "i < NumberOfPages" condition silently skipped the last page.
        for (int i = 1; i <= pdfReader.NumberOfPages; i++)
        {
            // A fresh strategy per page so text from one page does not carry over.
            ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
            string thePage = PdfTextExtractor.GetTextFromPage(pdfReader, i, strategy);
            text.Append(System.Environment.NewLine);

            // The extracted string is appended as-is. Round-tripping it through
            // Encoding.Default replaces every character the ANSI code page cannot
            // represent (e.g. Cyrillic on a Western code page) with '?'.
            text.Append(thePage);
        }
    }
    finally
    {
        // Close the reader even when extraction throws.
        pdfReader.Close();
    }

    return text.ToString();
}
iTextSharp is an outdated product that is no longer supported, so there are probably problems with its text extraction. Here is a simple example of how text extraction works in iText 7 (the code is in Java, but it is the same for C#).
// Extract the text of every page of "test.pdf" with iText 7 and print it.
String filePath = "test.pdf";
StringBuilder text = new StringBuilder();
PdfReader pdfReader = new PdfReader(filePath);
// Close the PdfDocument rather than only the reader: the document owns the
// reader, and try-with-resources also releases it when extraction throws.
try (PdfDocument pdfDocument = new PdfDocument(pdfReader)) {
    // Page numbers are 1-based and the upper bound is inclusive.
    for (int i = 1; i <= pdfDocument.getNumberOfPages(); i++) {
        PdfPage page = pdfDocument.getPage(i);
        // A fresh strategy per page so text from one page does not carry over.
        ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
        String thePage = PdfTextExtractor.getTextFromPage(page, strategy);
        text.append(thePage);
    }
}
System.out.print(text);
The code is about the same as in your example, but the text is extracted correctly.
Related
I am going to convert a PDF to a text file using iText, but the Euro currency symbol is missing from the final result.
/// <summary>
/// Extracts the text of every page of the PDF at <c>SourceFileName</c> and
/// writes it to <c>DestinationFileName</c> using <see cref="Encoding.Unicode"/>.
/// </summary>
public void TextExtraction()
{
    StringBuilder allTextBuilder = new StringBuilder();
    using (PdfReader pdfReader = new PdfReader(SourceFileName))
    using (PdfDocument pdfDocument = new PdfDocument(pdfReader))
    {
        // Page numbers are 1-based and the upper bound is inclusive.
        for (int page = 1; page <= pdfDocument.GetNumberOfPages(); page++)
        {
            // A fresh strategy per page so text from one page does not carry over.
            ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
            string currentPageText = PdfTextExtractor.GetTextFromPage(pdfDocument.GetPage(page), strategy);

            // Append, not AppendFormat: the page text is data, not a composite
            // format string. AppendFormat throws FormatException as soon as the
            // text contains a '{' or '}' that is not a valid format item.
            allTextBuilder.Append(currentPageText);
        }
    }

    File.WriteAllText(DestinationFileName, allTextBuilder.ToString(), Encoding.Unicode);
}
I wonder if someone has any solution for me?
I am using iText to extract text from a PDF file. I am able to see all the text values, but the structure is broken. Could you help me extract the text exactly as it is laid out in the PDF file? I've tried some online tools, and they do the extraction correctly; what library are they using?
// Extract the text of every page of the selected PDF and show it in the
// invoice text box. Nothing is read when the selected file does not exist.
StringBuilder text = new StringBuilder();
if (File.Exists(ofd.FileName))
{
    PdfReader pdfReader = new PdfReader(ofd.FileName);
    try
    {
        // Page numbers are 1-based and the upper bound is inclusive.
        for (int page = 1; page <= pdfReader.NumberOfPages; page++)
        {
            // A fresh strategy per page so text from one page does not carry over.
            ITextExtractionStrategy strategy = new LocationTextExtractionStrategy();
            string currentText = PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy);

            // The extracted string is appended as-is. Round-tripping it through
            // Encoding.Default replaces every character the ANSI code page
            // cannot represent with '?'.
            text.Append(currentText);
        }
    }
    finally
    {
        // Close the reader even when extraction throws.
        pdfReader.Close();
    }
}
rtxtboxInvoice.Text = text.ToString();
I am trying to create a PDF with PdfStamper, which is working fine. I have created a big multiline text box in the template, where I need to print a huge string. Within that string I need to make a few parts bold and the rest in a normal font. I have tried many ways, but could not get the font in bold. I need to make the heading strings bold.
// Open the group's template and a stamper that writes the result into memory.
var pdfTemplatePath = Server.MapPath("PDF/" + group + "/template.pdf");
var pdfReader = new PdfReader(pdfTemplatePath);
var myMemoryStream = new MemoryStream();
var pdfStamper = new PdfStamper(pdfReader, myMemoryStream);

// Fill the "Name" form field of the template.
var pdfFormFields = pdfStamper.AcroFields;
pdfFormFields.SetField("Name", "abc");

// Build the text as heading/body pairs. Note that the concatenated string
// contains no HTML tags at all.
string a = "Some text";
string b = "Some large text";
var sb = new StringBuilder();
sb.Append("Heading1").Append(a).Append("Heading2").Append(b);
string htmlText = sb.ToString();

// Parse the text with HTMLWorker and add every resulting element to the phrase.
List<IElement> htmlarraylist = iTextSharp.text.html.simpleparser.HTMLWorker.ParseToList(new StringReader(htmlText), null);
foreach (IElement element in htmlarraylist)
{
    phrase.Add(element);
}
I am using the iTextSharp library in my project. How can I get the decoration or style of a line in a PDF (something that distinguishes my text from the rest)?
/// <summary>
/// Extracts the text of every page of a PDF file and returns it as one string.
/// </summary>
/// <param name="fileName">Path of the PDF file to read.</param>
/// <returns>
/// The concatenated text of all pages, or an empty string when
/// <paramref name="fileName"/> does not exist.
/// </returns>
public static string ReadPdfFile(string fileName)
{
    StringBuilder text = new StringBuilder();
    if (!File.Exists(fileName))
    {
        return text.ToString();
    }

    PdfReader pdfReader = new PdfReader(fileName);
    try
    {
        // The former GetNamedDestinationFromStrings() call was dropped: its
        // result was discarded and it plays no part in text extraction.

        // Page numbers are 1-based and the upper bound is inclusive.
        for (int page = 1; page <= pdfReader.NumberOfPages; page++)
        {
            // A fresh strategy per page so text from one page does not carry over.
            ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
            string currentText = PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy);

            // The extracted string is appended as-is. Round-tripping it through
            // Encoding.Default replaces every character the ANSI code page
            // cannot represent with '?'.
            text.Append(currentText);
        }
    }
    finally
    {
        // Close the reader even when extraction throws.
        pdfReader.Close();
    }

    return text.ToString();
}
How could I find every SOH character, which looks like a box in the PDF, and place a checkbox form field on top of it? This question was close: "Extract text and text rectangle coordinates from a Pdf file using itextsharp", but I cannot get it to work. Below is some code showing what I am trying to do. It would also be best if I could avoid adding a form field where one already exists.
// Extract the text of every page, stamp a square checkbox field over each
// text location the strategy reports, and return the extracted text.
StringBuilder text = new StringBuilder();
if (File.Exists(filePath))
{
    using (PdfReader pdfReader = new PdfReader(filePath))
    // Verbatim string literal: the original '#' prefix is not valid C#.
    using (FileStream fileOut = new FileStream(@"C:\Projects\document.pdf", FileMode.Create, FileAccess.Write))
    using (PdfStamper stamp = new PdfStamper(pdfReader, fileOut))
    {
        // Counts inserted checkboxes across the whole document so that every
        // field gets a unique name. Previously the counter was reset per page
        // and never incremented, so all fields were named "CheckBoxInserted0".
        int count = 0;

        // Page numbers are 1-based and the upper bound is inclusive.
        for (int page = 1; page <= pdfReader.NumberOfPages; page++)
        {
            PdfHelper.LocationTextExtractionStrategyEx strategy = new PdfHelper.LocationTextExtractionStrategyEx();
            string currentText = PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy);

            // The extracted string is appended as-is. Round-tripping it through
            // Encoding.Default replaces every character the ANSI code page
            // cannot represent with '?'.
            text.Append(currentText);

            // NOTE(review): this stamps a checkbox over EVERY reported location,
            // not only SOH characters. Filtering needs the text of each entry,
            // which is not visible from here - confirm what TextLocationInfo exposes.
            foreach (var strat in strategy.TextLocationInfo)
            {
                RadioCheckField checkbox = new RadioCheckField(stamp.Writer, new iTextSharp.text.Rectangle(strat.TopLeft[0], strat.BottomRight[1], (strat.TopLeft[0] + 5), (strat.BottomRight[1] - 5)), ("CheckBoxInserted" + count), "On");
                checkbox.CheckType = RadioCheckField.TYPE_SQUARE;
                stamp.AddAnnotation(checkbox.CheckField, page);
                count++;
            }
        }

        // Fixed-position checkbox on page 1, now added once. Inside the page
        // loop it was added again for every page of the document, producing
        // duplicate fields with the same name.
        RadioCheckField checkField = new RadioCheckField(stamp.Writer, new iTextSharp.text.Rectangle(450, 690, 460, 680), "checkboxname", "On");
        checkField.CheckType = RadioCheckField.TYPE_SQUARE;
        stamp.AddAnnotation(checkField.CheckField, 1);
    }
}
return text.ToString();