C# Extract Text from .XPS Document

C# Extract Text from .XPS Document - c#

I have been using another Stack Overflow answer as a reference for this problem, but I have run into an error: the compiler reports that FixedDocumentSequence could not be found. I have already added references to PresentationCore, PresentationFramework, WindowsBase and ReachFramework, and I am not sure whether FixedDocumentSequence requires yet another reference.
Here is my code:
/// <summary>
/// Extracts the text of every page of an XPS file.
/// </summary>
/// <param name="fileName">Path of the .xps file to read.</param>
/// <returns>
/// The concatenated <c>UnicodeString</c> attribute values of all <c>Glyphs</c>
/// elements, in document and page order; an empty string when the package
/// exposes no fixed document sequence.
/// </returns>
public string convertXPS(string fileName)
{
    XpsDocument xpsDocument = new XpsDocument(fileName, System.IO.FileAccess.Read);
    try
    {
        StringBuilder fullText = new StringBuilder();
        IXpsFixedDocumentSequenceReader sequenceReader = xpsDocument.FixedDocumentSequenceReader;
        if (sequenceReader == null)
        {
            return fullText.ToString();
        }

        // Walk the readers directly instead of using FixedDocumentSequence.DocumentPaginator:
        // the paginator's page count covers the whole sequence, so using it to index the
        // pages of the first document only goes out of range for multi-document files.
        foreach (IXpsFixedDocumentReader document in sequenceReader.FixedDocuments)
        {
            foreach (IXpsFixedPageReader page in document.FixedPages)
            {
                System.Xml.XmlReader pageContentReader = page.XmlReader;
                if (pageContentReader == null)
                {
                    continue;
                }

                while (pageContentReader.Read())
                {
                    if (pageContentReader.Name != "Glyphs" || !pageContentReader.HasAttributes)
                    {
                        continue;
                    }

                    // The visible text of a Glyphs element is stored in its UnicodeString attribute.
                    string text = pageContentReader.GetAttribute("UnicodeString");
                    if (text != null)
                    {
                        fullText.Append(text);
                    }
                }
            }
        }

        return fullText.ToString();
    }
    finally
    {
        // Release the underlying package even when reading fails.
        xpsDocument.Close();
    }
}

/// <summary>
/// Reads all text from a sample XPS file and writes it to the console.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
[STAThread]
static void Main(string[] args)
{
    try
    {
        // Verbatim string literals start with '@'; '#' does not compile.
        XpsDocument xpsDocument = new XpsDocument(@"C:\Users\admin-\Desktop\testing.xps", System.IO.FileAccess.Read);
        try
        {
            StringBuilder fullText = new StringBuilder();
            IXpsFixedDocumentSequenceReader sequenceReader = xpsDocument.FixedDocumentSequenceReader;
            if (sequenceReader != null)
            {
                // Iterate every document and page through the readers; indexing the first
                // document with the sequence-wide page count fails for multi-document files.
                foreach (IXpsFixedDocumentReader document in sequenceReader.FixedDocuments)
                {
                    foreach (IXpsFixedPageReader page in document.FixedPages)
                    {
                        System.Xml.XmlReader pageContentReader = page.XmlReader;
                        if (pageContentReader == null)
                        {
                            continue;
                        }

                        while (pageContentReader.Read())
                        {
                            if (pageContentReader.Name != "Glyphs" || !pageContentReader.HasAttributes)
                            {
                                continue;
                            }

                            string text = pageContentReader.GetAttribute("UnicodeString");
                            if (text != null)
                            {
                                fullText.Append(text);
                            }
                        }
                    }
                }
            }

            // The extracted text was previously discarded; show it so the run has a visible result.
            Console.WriteLine(fullText.ToString());
        }
        finally
        {
            xpsDocument.Close();
        }
    }
    catch (Exception e)
    {
        // Report the failure instead of swallowing it silently.
        Console.Error.WriteLine(e);
    }
}
I don't think the code needs much change. Try adding the [STAThread] attribute to Main, which is what allowed me to read the XPS file. I used only the references mentioned above, and I got the same error that you did before resolving it, so you are about 90% of the way to the result.
Also check which assembly reference is needed for the System.Windows.Documents namespace (a using System.Windows.Documents; directive), since that is where FixedDocumentSequence is declared.

Related

Get string data Xpath

I need help getting data from a web site. I use GeckoFX in my application, and after the page has loaded I want to retrieve the text found at a given XPath location:
// Evaluate the XPath against the loaded page, then enumerate the nodes it matched.
var document = geckoWebBrowser1.Document;
XPathResult xpathResult = document.EvaluateXPath("/html/body/table[3]/tbody/tr[1]/td[2]/a[1]");
IEnumerable<GeckoNode> foundNodes = xpathResult.GetNodes();
How to download data as text?

It looks like you are struggling to retrieve the text from the GeckoFX objects.
Here are a few calls and operations that should get you started:
// Get nodes by XPath. XPath attribute tests use '@' ("[@id]"); "[#id]" is not valid XPath.
XPathResult xpathResult = _browser.Document.EvaluateXPath("//*[@id]/div/p[2]");
var foundNodes = xpathResult.GetNodes();
foreach (var node in foundNodes)
{
    var x = node.TextContent; // text contained by this node (including children)

    // Cast to GeckoHtmlElement to access inner/outer HTML; the 'as' cast yields null
    // for nodes that are not HTML elements, so check before dereferencing.
    GeckoHtmlElement element = node as GeckoHtmlElement;
    if (element != null)
    {
        string inner = element.InnerHtml;
        string outer = element.OuterHtml;
    }

    // Iterate child nodes.
    foreach (var child in node.ChildNodes)
    {
    }
}

// Get an element by id.
GeckoHtmlElement htmlElementById = _browser.Document.GetHtmlElementById("mw-content-text");

// Get elements by tag name and read an attribute from each.
GeckoElementCollection byTag = _browser.Document.GetElementsByTagName("input");
foreach (var ele in byTag)
{
    var y = ele.GetAttribute("value");
}

// Get elements by class name.
var byClass = _browser.Document.GetElementsByClassName("input");
foreach (var node in byClass)
{
    //...
}

// Cast to a more specific element type to reach its members.
var username = ((GeckoInputElement)_browser.Document.GetHtmlElementById("yourUsername")).Value;

// Create a typed wrapper from an existing DOM object.
var button = new GeckoButtonElement(_browser.Document.GetElementById("myBtn").DomObject);

/// <summary>
/// Locates an element in the current browser document and extracts data from it.
/// </summary>
/// <param name="xpath">Locator passed to <c>GetElement</c> (an XPath or an element id).</param>
/// <param name="type">
/// What to extract: <c>"html"</c> for the outer HTML, <c>"text"</c> for the text content
/// (or the value of a text area), <c>"value"</c> for an input's value; any other string is
/// treated as an attribute name.
/// </param>
/// <returns>The extracted string, or an empty string when no browser or element is available.</returns>
public string extract(string xpath, string type)
{
    string result = string.Empty;
    GeckoWebBrowser wb = geckoWebBrowser1;//(GeckoWebBrowser)GetCurrentWB();
    if (wb == null)
    {
        return result;
    }

    // A single null check replaces the former dangling "if (elm != null)" whose body had
    // been commented out, which left it silently guarding the following statement.
    GeckoHtmlElement elm = GetElement(wb, xpath);
    if (elm == null)
    {
        return result;
    }

    switch (type)
    {
        case "html":
            result = elm.OuterHtml;
            break;
        case "text":
            // Test the type itself instead of comparing the runtime type's name as a string.
            GeckoTextAreaElement textArea = elm as GeckoTextAreaElement;
            if (textArea != null)
            {
                result = textArea.Value;
            }
            else
            {
                // Guard against a null TextContent before trimming.
                result = (elm.TextContent ?? string.Empty).Trim();
            }
            break;
        case "value":
            result = ((GeckoInputElement)elm).Value;
            break;
        default:
            result = extractData(elm, type);
            break;
    }

    return result;
}
/// <summary>
/// Reads an attribute from an element and returns it trimmed.
/// </summary>
/// <param name="ele">Element to read from; may be null.</param>
/// <param name="attribute">Name of the attribute to read.</param>
/// <returns>The trimmed attribute value, or an empty string when the element or attribute is missing.</returns>
private string extractData(GeckoHtmlElement ele, string attribute)
{
    if (ele == null)
    {
        return string.Empty;
    }

    var raw = ele.GetAttribute(attribute);
    return raw != null ? raw.Trim() : string.Empty;
}
/// <summary>
/// Returns the first control hosted on the currently selected tab.
/// </summary>
/// <returns>The first child control of the selected tab, or null when no tab is selected or the tab is empty.</returns>
private object GetCurrentWB()
{
    if (tabControl1.SelectedTab == null)
    {
        return null;
    }

    if (tabControl1.SelectedTab.Controls.Count <= 0)
    {
        return null;
    }

    Control first = tabControl1.SelectedTab.Controls[0];
    return first;
}
/// <summary>
/// Finds an element in the browser's document by XPath or by id.
/// </summary>
/// <param name="wb">Browser whose document is searched.</param>
/// <param name="xpath">
/// An XPath expression when it starts with <c>/</c>; otherwise an element id.
/// </param>
/// <returns>The first matching HTML element, or null when nothing matches.</returns>
private GeckoHtmlElement GetElement(GeckoWebBrowser wb, string xpath)
{
    if (string.IsNullOrEmpty(xpath))
    {
        return null;
    }

    // Anything that does not look like a path is treated as an element id.
    if (!xpath.StartsWith("/", System.StringComparison.Ordinal))
    {
        return wb.Document.GetElementById(xpath) as GeckoHtmlElement;
    }

    // XPath attribute tests are written with '@' ("@class"); the former "#class" /
    // "#data-type" checks could never match a real expression.
    if (xpath.Contains("@class") || xpath.Contains("@data-type"))
    {
        // Resolve the expression with HtmlAgilityPack against the page's HTML, then
        // re-evaluate the matched node's own XPath in the browser document.
        var html = GetHtmlFromGeckoDocument(wb.Document);
        HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
        doc.LoadHtml(html);
        var node = doc.DocumentNode.SelectSingleNode(xpath);
        if (node == null)
        {
            return null;
        }

        var currentXpath = "/" + node.XPath;
        return wb.Document.EvaluateXPath(currentXpath).GetNodes().FirstOrDefault() as GeckoHtmlElement;
    }

    // 'as' returns null for a non-element match instead of throwing InvalidCastException.
    return wb.Document.EvaluateXPath(xpath).GetNodes().FirstOrDefault() as GeckoHtmlElement;
}
/// <summary>
/// Returns the inner HTML of the document's root element.
/// </summary>
/// <param name="doc">Document to read.</param>
/// <returns>The root element's inner HTML, or an empty string when the root is not a <c>GeckoHtmlElement</c>.</returns>
private string GetHtmlFromGeckoDocument(GeckoDocument doc)
{
    var root = doc.DocumentElement as GeckoHtmlElement;
    if (root == null)
    {
        return string.Empty;
    }

    return root.InnerHtml;
}
/// <summary>
/// Click handler: extracts the text of a fixed table link from the page and shows it in a message box.
/// </summary>
private void button5_Click(object sender, EventArgs e)
{
    const string linkXPath = "/html/body/table[3]/tbody/tr[1]/td[2]/a[2]";
    MessageBox.Show(extract(linkXPath, "text"));
}
I am also including the code I ended up using; it is a little longer, but it works, and someone else may find it useful. The author of the code is Đinh Công Thắng (Web Automation App).
Regards

C# List embedded files inside a PDF file and get a file stream to one of the embedded files

I can create a PDF with embedded files using LaTeX for example:
% Load the embedfile package, which provides the \embedfile command.
\usepackage{embedfile}
% Embed a data file in the generated PDF.
\embedfile{abc.data}
\embedfile{def.data}
Using the Acrobat Reader I'm able to extract again the two data files.
But how can I do that from C#?
How can I list the files which are embedded inside a PDF, similiar to a directory list?
How can I get a file stream (readonly) to one of the embedded files inside a PDF?

Using the iTextSharp-LGPL PDF library I was able to solve it.
List embedded file names
/// <summary>
/// Lists the names of the files embedded in a PDF.
/// </summary>
/// <param name="pdfFileName">Path of the PDF file to inspect.</param>
/// <returns>
/// The embedded file names found in the catalog's <c>Names/EmbeddedFiles/Names</c> array;
/// an empty array when the PDF has no such array.
/// </returns>
public static string[] ListEmbeddedFileNames(string pdfFileName)
{
    var reader = new iTextSharp.text.pdf.PdfReader(pdfFileName);
    try
    {
        var root = reader.Catalog;
        var names = root == null ? null : root.GetAsDict(iTextSharp.text.pdf.PdfName.NAMES);
        var embeddedFiles = names == null ? null : names.GetAsDict(iTextSharp.text.pdf.PdfName.EMBEDDEDFILES);
        var namesArray = embeddedFiles == null ? null : embeddedFiles.GetAsArray(iTextSharp.text.pdf.PdfName.NAMES);
        if (namesArray == null)
        {
            return new string[0];
        }

        // A name tree's Names array is a flat list of (name, file specification) pairs,
        // so there are Size / 2 files and the names sit at the even indices.
        int pairCount = namesArray.Size / 2;
        var fileNames = new string[pairCount];
        for (int i = 0; i < pairCount; i++)
        {
            fileNames[i] = namesArray[2 * i].ToString();
        }

        return fileNames;
    }
    finally
    {
        // Close the reader even when parsing throws.
        reader.Close();
    }
}
Get embedded file stream
/// <summary>
/// Opens an in-memory stream over one of the files embedded in a PDF.
/// </summary>
/// <param name="pdfFileName">Path of the PDF file.</param>
/// <param name="embeddedFileName">Name of the embedded file to open.</param>
/// <returns>A <see cref="MemoryStream"/> over the file's bytes, or null when the file is not found.</returns>
public static Stream GetEmbeddedFileStream(string pdfFileName, string embeddedFileName)
{
    byte[] content = GetEmbeddedFileData(pdfFileName, embeddedFileName);
    return content != null ? new MemoryStream(content) : null;
}
and
/// <summary>
/// Reads the bytes of one of the files embedded in a PDF.
/// </summary>
/// <param name="pdfFileName">Path of the PDF file.</param>
/// <param name="embeddedFileName">Name of the embedded file to read.</param>
/// <returns>The embedded file's bytes, or null when no embedded file has that name.</returns>
public static byte[] GetEmbeddedFileData(string pdfFileName, string embeddedFileName)
{
    var reader = new iTextSharp.text.pdf.PdfReader(pdfFileName);
    try
    {
        var root = reader.Catalog;
        var names = root == null ? null : root.GetAsDict(iTextSharp.text.pdf.PdfName.NAMES);
        var embeddedFiles = names == null ? null : names.GetAsDict(iTextSharp.text.pdf.PdfName.EMBEDDEDFILES);
        var namesArray = embeddedFiles == null ? null : embeddedFiles.GetAsArray(iTextSharp.text.pdf.PdfName.NAMES);
        if (namesArray == null)
        {
            return null;
        }

        // The Names array holds (name, file specification) pairs; the file specification
        // dictionaries are at the odd indices. Stepping by two replaces the former
        // double increment, which could index past the end of an odd-length array.
        int n = namesArray.Size;
        for (int i = 1; i < n; i += 2)
        {
            var fileSpec = namesArray.GetAsDict(i);
            if (fileSpec == null)
            {
                continue;
            }

            var embedded = fileSpec.GetAsDict(iTextSharp.text.pdf.PdfName.EF);
            if (embedded == null)
            {
                continue;
            }

            foreach (iTextSharp.text.pdf.PdfName key in embedded.Keys)
            {
                // The same key is looked up in the file specification to obtain the file name;
                // skip keys the specification does not carry as a string.
                var nameEntry = fileSpec.GetAsString(key);
                if (nameEntry == null || nameEntry.ToString() != embeddedFileName)
                {
                    continue;
                }

                var stream = iTextSharp.text.pdf.PdfReader.GetPdfObject(embedded.GetAsIndirectObject(key)) as iTextSharp.text.pdf.PRStream;
                if (stream != null)
                {
                    return iTextSharp.text.pdf.PdfReader.GetStreamBytes(stream);
                }
            }
        }

        return null;
    }
    finally
    {
        // Close the reader even when parsing throws.
        reader.Close();
    }
}
I've checked my solution by computing MD5 checksums over the embedded files.

Extract texts from xps document to textbox

I keep running into this code while researching. However, when I copy it into my form, the documentViewerElement part produces the error "The name 'documentViewerElement' does not exist in the current context".
// Open the XPS package read-only and obtain the reader for its fixed document sequence.
XpsDocument _xpsDocument=new XpsDocument("/path",System.IO.FileAccess.Read);
IXpsFixedDocumentSequenceReader fixedDocSeqReader
=_xpsDocument.FixedDocumentSequenceReader;
// Only the first fixed document of the sequence is read.
IXpsFixedDocumentReader _document = fixedDocSeqReader.FixedDocuments[0];
// NOTE(review): documentViewerElement is not declared anywhere in this snippet; it is
// used here only to supply the index of the page to read.
IXpsFixedPageReader _page
= _document.FixedPages[documentViewerElement.MasterPageNumber];
StringBuilder _currentText = new StringBuilder();
System.Xml.XmlReader _pageContentReader = _page.XmlReader;
if (_pageContentReader != null)
{
// Scan the page markup node by node.
while (_pageContentReader.Read())
{
if (_pageContentReader.Name == "Glyphs")
{
if (_pageContentReader.HasAttributes)
{
// Collect the UnicodeString attribute of each Glyphs node that has one.
if (_pageContentReader.GetAttribute("UnicodeString") != null )
{
_currentText.
Append(_pageContentReader.
GetAttribute("UnicodeString"));
}
}
}
}
}
// All collected strings, concatenated without separators.
string _fullPageText = _currentText.ToString();
I'm hoping to get all the texts from an xps document and put it on a rich text box.

documentViewerElement is not defined hence your error.
In the following line:
IXpsFixedPageReader _page
= _document.FixedPages[documentViewerElement.MasterPageNumber];
documentViewerElement.MasterPageNumber is just the page number, so change it to the xps page you want to read, e.g.
IXpsFixedPageReader _page
= _document.FixedPages[0];
To read the text from the entire XPS file, you could try the following. It is pretty much the same as your code, except that it loops over the pages (taken from here).
/// <summary>
/// Reads the text of every page of an XPS file.
/// </summary>
/// <param name="fileName">Path of the .xps file to read.</param>
/// <returns>
/// The concatenated <c>UnicodeString</c> attribute values of all <c>Glyphs</c>
/// elements, in document and page order; an empty string when the package
/// exposes no fixed document sequence.
/// </returns>
private string ReadXpsFile(string fileName)
{
    XpsDocument xpsDocument = new XpsDocument(fileName, System.IO.FileAccess.Read);
    try
    {
        StringBuilder fullText = new StringBuilder();
        IXpsFixedDocumentSequenceReader sequenceReader = xpsDocument.FixedDocumentSequenceReader;
        if (sequenceReader == null)
        {
            return fullText.ToString();
        }

        // Enumerate documents and pages through the readers. The previous loop bounded
        // the first document's page index by the page count of the whole sequence,
        // which goes out of range when the file contains more than one document.
        foreach (IXpsFixedDocumentReader document in sequenceReader.FixedDocuments)
        {
            foreach (IXpsFixedPageReader page in document.FixedPages)
            {
                System.Xml.XmlReader pageContentReader = page.XmlReader;
                if (pageContentReader == null)
                {
                    continue;
                }

                while (pageContentReader.Read())
                {
                    if (pageContentReader.Name != "Glyphs" || !pageContentReader.HasAttributes)
                    {
                        continue;
                    }

                    string text = pageContentReader.GetAttribute("UnicodeString");
                    if (text != null)
                    {
                        fullText.Append(text);
                    }
                }
            }
        }

        return fullText.ToString();
    }
    finally
    {
        // Release the underlying package even when reading fails.
        xpsDocument.Close();
    }
}

Control validation against XmlSchemaSimpleTypeRestriction.Facets

I have a desktop application with a System.Windows.Forms form containing some TextBox controls. I need to validate the control values against the restrictions of an XML schema.
For each TextBox I can retrieve the relevant XmlSchemaSimpleTypeRestriction from its type and then use a method as follows to validate its value:
/// <summary>
/// Checks a string against the length, pattern and enumeration facets of a
/// simple-type restriction.
/// </summary>
/// <param name="restriction">Restriction whose facets are applied.</param>
/// <param name="value">Value to validate.</param>
/// <returns>
/// <c>true</c> when the value satisfies every length, minLength, maxLength and pattern
/// facet and, if enumeration facets are present, matches at least one of them
/// (compared case-insensitively); otherwise <c>false</c>. Other facet kinds are ignored.
/// </returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="restriction"/> or <paramref name="value"/> is null.
/// </exception>
public static bool Validate(XmlSchemaSimpleTypeRestriction restriction, string value)
{
    if (restriction == null)
    {
        throw new ArgumentNullException("restriction");
    }
    if (value == null)
    {
        throw new ArgumentNullException("value");
    }

    bool isENum = false;
    bool isValidEnum = false;
    foreach (var item in restriction.Facets)
    {
        XmlSchemaLengthFacet lengthFacet = item as XmlSchemaLengthFacet;
        if (lengthFacet != null)
        {
            int length = Int32.Parse(lengthFacet.Value, System.Globalization.CultureInfo.InvariantCulture);
            if (value.Length != length)
            {
                return false;
            }
        }

        XmlSchemaMinLengthFacet minLengthFacet = item as XmlSchemaMinLengthFacet;
        if (minLengthFacet != null)
        {
            int length = Int32.Parse(minLengthFacet.Value, System.Globalization.CultureInfo.InvariantCulture);
            if (value.Length < length)
            {
                return false;
            }
        }

        XmlSchemaMaxLengthFacet maxLengthFacet = item as XmlSchemaMaxLengthFacet;
        if (maxLengthFacet != null)
        {
            int length = Int32.Parse(maxLengthFacet.Value, System.Globalization.CultureInfo.InvariantCulture);
            if (value.Length > length)
            {
                return false;
            }
        }

        XmlSchemaPatternFacet patternFacet = item as XmlSchemaPatternFacet;
        if (patternFacet != null)
        {
            // A schema pattern must match the whole value, so anchor it explicitly.
            // NOTE(review): XSD regex syntax is not identical to .NET's; patterns that use
            // XSD-only escapes may need translation.
            Regex re = new Regex(@"\A(?:" + patternFacet.Value + @")\z");
            if (!re.IsMatch(value))
            {
                return false;
            }
        }

        // This branch previously tested patternFacet, so enumeration facets were never
        // evaluated (and a pattern facet dereferenced a null enumFacet).
        XmlSchemaEnumerationFacet enumFacet = item as XmlSchemaEnumerationFacet;
        if (enumFacet != null)
        {
            isENum = true;
            if (StringComparer.InvariantCultureIgnoreCase.Compare(value, enumFacet.Value) == 0)
            {
                isValidEnum = true;
            }
        }
    }

    // Decide only after all facets were seen: the value has to match just one of the
    // enumeration facets, and returning inside the loop skipped every facet after the first.
    if (isENum && !isValidEnum)
    {
        return false;
    }

    return true;
}
I am going to use this method in the Validating event of the controls. Is there any simpler way of doing this?

OK, it's a little more complicated than I initially thought. Basically, you need to create an XmlSchema that expects a single element with the provided restriction. Then you create an XML element containing the provided value and validate it against the schema using an XmlReader:
/// <summary>
/// Validates a string against a simple-type restriction by letting the schema
/// validator do the work.
/// </summary>
/// <remarks>
/// Builds a schema with a single <c>value</c> element whose type is the given
/// restriction, wraps <paramref name="value"/> in such an element and reads it
/// with a validating <see cref="XmlReader"/>.
/// </remarks>
/// <param name="restriction">Restriction the value has to satisfy.</param>
/// <param name="value">Value to validate.</param>
/// <returns><c>true</c> when reading completes without a validation error; otherwise <c>false</c>.</returns>
public static bool Validate(XmlSchemaSimpleTypeRestriction restriction, string value)
{
    var schema = new XmlSchema();
    schema.Items.Add(new XmlSchemaElement
    {
        Name = "value",
        SchemaType = new XmlSchemaSimpleType { Content = restriction }
    });

    var schemaSet = new XmlSchemaSet();
    schemaSet.Add(schema);

    var readerSettings = new XmlReaderSettings
    {
        ValidationType = ValidationType.Schema,
        ValidationFlags = XmlSchemaValidationFlags.ReportValidationWarnings,
        Schemas = schemaSet
    };

    // XElement escapes the value, so arbitrary text yields well-formed XML.
    string xml = new XElement("value", value).ToString();
    try
    {
        // Dispose the reader (and the StringReader it wraps) once validation is done.
        using (var reader = XmlReader.Create(new StringReader(xml), readerSettings))
        {
            // Reading to the end drives validation; with no ValidationEventHandler
            // registered, a validation error surfaces as an exception.
            while (reader.Read())
            {
            }
        }
        return true;
    }
    catch (XmlSchemaValidationException)
    {
        return false;
    }
}
I tested it with this code:
/// <summary>
/// Smoke test: validates "str" against a string restriction with minLength 3 and prints the result.
/// </summary>
static void Main(string[] args)
{
    var minLength = new XmlSchemaMinLengthFacet { Value = "3" };
    var stringRestriction = new XmlSchemaSimpleTypeRestriction
    {
        BaseTypeName = new XmlQualifiedName("string", "http://www.w3.org/2001/XMLSchema")
    };
    stringRestriction.Facets.Add(minLength);

    bool isValid = Validate(stringRestriction, "str");
    Console.WriteLine(isValid);
}

Extract text from a XPS Document [closed]

Closed. This question needs to be more focused. It is not currently accepting answers.
Want to improve this question? Update the question so it focuses on one problem only by editing this post.
Closed 4 years ago.
Improve this question
i need to extract the text of a specific page from a XPS document.
The extracted text should be written in a string. I need this to read out the extracted text using Microsofts SpeechLib.
Please examples only in C#.
Thanks

Add References to ReachFramework and WindowsBase and the following using statement:
using System.Windows.Xps.Packaging;
Then use this code:
// Extracts the text of a single page of an XPS document.
// The page index comes from documentViewerElement.MasterPageNumber, which must be
// supplied by the surrounding code (it is not declared in this snippet).
XpsDocument _xpsDocument = new XpsDocument("/path", System.IO.FileAccess.Read);
StringBuilder _currentText = new StringBuilder();
try
{
    IXpsFixedDocumentSequenceReader fixedDocSeqReader = _xpsDocument.FixedDocumentSequenceReader;
    // Only the first fixed document of the sequence is read.
    IXpsFixedDocumentReader _document = fixedDocSeqReader.FixedDocuments[0];
    IXpsFixedPageReader _page = _document.FixedPages[documentViewerElement.MasterPageNumber];
    System.Xml.XmlReader _pageContentReader = _page.XmlReader;
    if (_pageContentReader != null)
    {
        while (_pageContentReader.Read())
        {
            if (_pageContentReader.Name != "Glyphs" || !_pageContentReader.HasAttributes)
            {
                continue;
            }

            // The visible text of a Glyphs element is stored in its UnicodeString attribute.
            string text = _pageContentReader.GetAttribute("UnicodeString");
            if (text != null)
            {
                _currentText.Append(text);
            }
        }
    }
}
finally
{
    // The document was previously never closed, which kept the file open.
    _xpsDocument.Close();
}
string _fullPageText = _currentText.ToString();
Text exists in Glyphs -> UnicodeString string attribute. You have to use XMLReader for fixed page.

Here is a method that returns the text from all pages (a modified version of Amir's code; I hope that's OK):
/// <summary>
/// Get all text strings from an XPS file.
/// Returns a list of lists (one for each page) containing the text strings.
/// </summary>
/// <param name="xpsFilePath">Path of the .xps file to read.</param>
/// <returns>
/// One list per page, in document and page order, holding the <c>UnicodeString</c>
/// values of that page's <c>Glyphs</c> elements; pages without a content reader are
/// skipped. Returns null when the package exposes no fixed document sequence.
/// </returns>
private static List<List<string>> ExtractTextFromXps(string xpsFilePath)
{
    const string UnicodeString = "UnicodeString";
    const string GlyphsString = "Glyphs";

    var xpsDocument = new XpsDocument(xpsFilePath, FileAccess.Read);
    try
    {
        var fixedDocSeqReader = xpsDocument.FixedDocumentSequenceReader;
        if (fixedDocSeqReader == null)
            return null;

        var textLists = new List<List<string>>();
        foreach (IXpsFixedDocumentReader fixedDocumentReader in fixedDocSeqReader.FixedDocuments)
        {
            foreach (IXpsFixedPageReader pageReader in fixedDocumentReader.FixedPages)
            {
                var pageContentReader = pageReader.XmlReader;
                if (pageContentReader == null)
                    continue;

                var texts = new List<string>();
                while (pageContentReader.Read())
                {
                    if (pageContentReader.Name != GlyphsString)
                        continue;
                    if (!pageContentReader.HasAttributes)
                        continue;

                    // Read the attribute once instead of once to test and once to add.
                    string text = pageContentReader.GetAttribute(UnicodeString);
                    if (text != null)
                        texts.Add(text);
                }
                textLists.Add(texts);
            }
        }
        return textLists;
    }
    finally
    {
        // Close on every path; the early "return null" and any exception used to leave
        // the document open.
        xpsDocument.Close();
    }
}
Usage:
// Print the text of every page, one "== Page N ==" section per page.
// Verbatim string literals start with '@'; '#' does not compile.
var txtLists = ExtractTextFromXps(@"C:\myfile.xps");
int pageIdx = 0;
foreach (List<string> txtList in txtLists)
{
    pageIdx++;
    Console.WriteLine("== Page {0} ==", pageIdx);
    foreach (string txt in txtList)
        Console.WriteLine(" "+txt);
    Console.WriteLine();
}

/// <summary>
/// Reads the text of every page of an XPS file.
/// </summary>
/// <param name="fileName">Path of the .xps file to read.</param>
/// <returns>
/// The concatenated <c>UnicodeString</c> attribute values of all <c>Glyphs</c>
/// elements, in document and page order; an empty string when the package
/// exposes no fixed document sequence.
/// </returns>
private string ReadXpsFile(string fileName)
{
    XpsDocument xpsDocument = new XpsDocument(fileName, System.IO.FileAccess.Read);
    try
    {
        StringBuilder fullText = new StringBuilder();
        IXpsFixedDocumentSequenceReader sequenceReader = xpsDocument.FixedDocumentSequenceReader;
        if (sequenceReader == null)
        {
            return fullText.ToString();
        }

        // Enumerate documents and pages through the readers. The previous loop bounded
        // the first document's page index by the page count of the whole sequence,
        // which goes out of range when the file contains more than one document.
        foreach (IXpsFixedDocumentReader document in sequenceReader.FixedDocuments)
        {
            foreach (IXpsFixedPageReader page in document.FixedPages)
            {
                System.Xml.XmlReader pageContentReader = page.XmlReader;
                if (pageContentReader == null)
                {
                    continue;
                }

                while (pageContentReader.Read())
                {
                    if (pageContentReader.Name != "Glyphs" || !pageContentReader.HasAttributes)
                    {
                        continue;
                    }

                    string text = pageContentReader.GetAttribute("UnicodeString");
                    if (text != null)
                    {
                        fullText.Append(text);
                    }
                }
            }
        }

        return fullText.ToString();
    }
    finally
    {
        // Release the underlying package even when reading fails.
        xpsDocument.Close();
    }
}

Full Code of Class:
using System.Collections.Generic;
using System.Drawing;
using System.Windows.Forms;
using System.Windows.Xps.Packaging;
namespace XPS_Data_Transfer
{
    /// <summary>
    /// Reads the text strings of a single page of an XPS file.
    /// </summary>
    internal static class XpsDataReader
    {
        /// <summary>
        /// Returns the text strings found on one page of an XPS file.
        /// </summary>
        /// <param name="address">Path of the .xps file.</param>
        /// <param name="pageNumber">1-based page number, counted across all documents in the file.</param>
        /// <returns>
        /// The <c>UnicodeString</c> value of each <c>Glyphs</c> element on the page, passed through
        /// <c>Dashboard.CleanReversedPersianText</c>; null when the file has no fixed document
        /// sequence or the page has no content reader.
        /// </returns>
        /// <exception cref="System.ArgumentOutOfRangeException">
        /// <paramref name="pageNumber"/> does not refer to an existing page.
        /// </exception>
        public static List<string> ReadXps(string address, int pageNumber)
        {
            const string uniStr = "UnicodeString";
            const string glyphs = "Glyphs";

            var xpsDocument = new XpsDocument(address, System.IO.FileAccess.Read);
            try
            {
                var fixedDocSeqReader = xpsDocument.FixedDocumentSequenceReader;
                if (fixedDocSeqReader == null) return null;

                var page = FindPage(fixedDocSeqReader, pageNumber);
                var pageContentReader = page.XmlReader;
                if (pageContentReader == null) return null;

                var currentText = new List<string>();
                while (pageContentReader.Read())
                {
                    if (pageContentReader.Name != glyphs) continue;
                    if (!pageContentReader.HasAttributes) continue;

                    var text = pageContentReader.GetAttribute(uniStr);
                    if (text != null)
                        currentText.Add(Dashboard.CleanReversedPersianText(text));
                }
                return currentText;
            }
            finally
            {
                // Close the package on every path; it was previously left open.
                xpsDocument.Close();
            }
        }

        // Locates the page with the given 1-based number, counting pages across all documents.
        // The earlier code used the number to pick a *document* and always read its first page,
        // which is only right when every document holds exactly one page.
        private static IXpsFixedPageReader FindPage(IXpsFixedDocumentSequenceReader sequenceReader, int pageNumber)
        {
            int remaining = pageNumber - 1;
            if (remaining >= 0)
            {
                foreach (var document in sequenceReader.FixedDocuments)
                {
                    var pages = document.FixedPages;
                    if (remaining < pages.Count) return pages[remaining];
                    remaining -= pages.Count;
                }
            }
            throw new System.ArgumentOutOfRangeException("pageNumber");
        }
    }
}
This returns a list of the text strings found on the requested page of the given file.

We Keep Coding

C# (C-Sharp) is a programming language developed by Microsoft that runs on the .NET Framework.

C# Extract Text from .XPS Document - c#

Related

Get string data Xpath

C# List embedded files inside a PDF file and get a file stream to one of the embedded files

Extract texts from xps document to textbox

Control validation against XmlSchemaSimpleTypeRestriction.Facets

Extract text from a XPS Document [closed]

Categories

Resources