Following this actual solution I am trying to get all the words inside a TextChunk and each of its coordinates (actual page, top, bottom, left, right).
Since a TextChunk could be a phrase, a word or whatever, I tried to do this manually, counting on the last word's rectangle and cutting it each time. I noticed this manual method could be so buggy (I would need to manually count on special characters and so on), so I asked myself if ITextSharp provides any easier way to perform this.
My Chunk and LocationTextExtractionStragy inherited classes are the following:
public class Chunk
{
public Guid Id { get; set; }
public Rectangle Rect { get; set; }
public TextRenderInfo Render { get; set; }
public BaseFont BF { get; set; }
public string Text { get; set; }
public int FontSize { get; set; }
public Chunk(Rectangle rect, TextRenderInfo renderInfo)
{
this.Rect = rect;
this.Render = renderInfo;
this.Text = Render.GetText();
Initialize();
}
public Chunk(Rectangle rect, TextRenderInfo renderInfo, string text)
{
this.Rect = rect;
this.Render = renderInfo;
this.Text = text;
Initialize();
}
private void Initialize()
{
this.Id = Guid.NewGuid();
this.BF = Render.GetFont();
this.FontSize = ObtainFontSize();
}
private int ObtainFontSize()
{
return Convert.ToInt32(this.Render.GetSingleSpaceWidth() * 12 / this.BF.GetWidthPoint(" ", 12));
}
}
public class LocationTextExtractionPersonalizada : LocationTextExtractionStrategy
{
//Save each coordinate
public List<Chunk> ChunksInPage = new List<Chunk>();
//Automatically called on each chunk on PDF
public override void RenderText(TextRenderInfo renderInfo)
{
base.RenderText(renderInfo);
if (string.IsNullOrWhiteSpace(renderInfo.GetText())
|| renderInfo == null)
return;
//Get chunk Vectors
var bottomLeft = renderInfo.GetDescentLine().GetStartPoint();
var topRight = renderInfo.GetAscentLine().GetEndPoint();
//Create Rectangle based on previous Vectors
var rect = new Rectangle(
bottomLeft[Vector.I1],
bottomLeft[Vector.I2],
topRight[Vector.I1],
topRight[Vector.I2]);
if (rect == null)
return;
//Add each chunk with its coordinates
ChunksInPage.Add(new Chunk(rect, renderInfo));
}
}
So once I get the file and so on, I proceed this way:
private void ProcessContent()
{
for (int page= 1; page <= pdfReader.NumberOfPages; page++)
{
var strategy = new LocationTextExtractionPersonalizada();
var currentPageText = PdfTextExtractor.GetTextFromPage(
pdfReader,
page,
strategy);
//Here is where I want to get each word with its coordinates
var chunksWords= ChunkRawToWord(strategy.ChunksInPage);
}
}
private List<Chunk> ChunkRawToWord(IList<Chunk> chunks)
{
if (chunks == null || chunks[0] == null)
return null;
var words = new List<Chunk>();
//Poor RegEx pattern to get the word and its wathever
string pattern = #"[#&\w+]*(-*\/*\s*\:*\;*\,*\.*\(*\)*\%*\>*\<*)?";
var something = chunks[0].Render.GetCharacterRenderInfos();
for (int i = 0; i < chunks.Count; i++)
{
var wordsInChunk = Regex.Matches(
chunks[i].Text,
pattern,
RegexOptions.IgnoreCase);
var rectangleChunk = new Rectangle(chunks[i].Rect);
for (int j = 0; j < wordsInChunk.Count; j++)
{
if (string.IsNullOrWhiteSpace(wordsInChunk[j].Value))
continue;
var word = new Chunk(
rectangleChunk,
chunks[i].Render,
wordsInChunk[j].ToString());
if (j == 0)
{
word.Rect.Right = word.BF.GetWidthPoint(word.Text, word.FontSize);
words.Add(word);
continue;
}
if (words.Count <= 0)
continue;
word.Rect.Left = words[j - 1].Rect.Right;
word.Rect.Right = words[j - 1].Rect.Right + word.BF.GetWidthPoint(word.Text, word.FontSize);
words.Add(word);
}
}
return words;
}
Afterwards, I wrote a comment on Mkl's solution, being replied with "use getCharacterRenderInfos()", which I use and I get every single character into a TextRenderInfo's List.
I'm sorry but I'm starting to mix concepts, ways to find out how to apply that solution and blowing my mind.
I would really appreciate a hand here.
You can use the method TextRenderInfo.GetCharacterRenderInfos() to get a collection of TextRenderInfo for each and every char in your chunk. Then you can could regroup the individual characters into words and calculate the rectangle that contains the word using the coordinates of the first and last TextRenderInfo in that word.
In your custom text extraction strategy:
var _separators = new[] { "-", "(", ")", "/", " ", ":", ";", ",", "."};
protected virtual void ParseRenderInfo(TextRenderInfo currentInfo)
{
var resultInfo = new List<TextRenderInfo>();
var chars = currentInfo.GetCharacterRenderInfos();
foreach (var charRenderInfo in chars)
{
resultInfo.Add(charRenderInfo);
var currentChar = charRenderInfo.GetText();
if (_separators.Contains(currentChar))
{
ProcessWord(currentInfo, resultInfo);
resultInfo.Clear();
}
}
ProcessWord(currentInfo, resultInfo);
}
private void ProcessWord(TextRenderInfo charChunk, List<TextRenderInfo> wordChunks)
{
var firstRender = wordChunks.FirstOrDefault();
var lastRender = wordChunks.LastOrDefault();
if (firstRender == null || lastRender == null)
{
return;
}
var startCoords = firstRender.GetDescentLine().GetStartPoint();
var endCoords = lastRender.GetAscentLine().GetEndPoint();
var wordText = string.Join("", wordChunks.Select(x => x.GetText()));
var wordLocation = new LocationTextExtractionStrategy.TextChunkLocationDefaultImp(startCoords, endCoords, charChunk.GetSingleSpaceWidth());
_chunks.Add(new CustomTextChunk(wordText, wordLocation));
}
I'm trying to parse an incoming stream of bytes that represents messages.
I need to split the stream and create a message structure for each part.
A message always starts with a 0x81 (BOM) and ends with a 0x82 (EOM).
start: 0x81
header: 3 bytes
data: arbitrary length
stop: 0x82
The data part is escaped using an escape byte 0x1B (ESC): Anytime a byte in the data part contains one of the control bytes {ESC, BOM, EOM}, it is prefixed with ESC.
The header part is not escaped, and may contain control bytes.
I would like to code this in a functional reactive style using Rx.Net, by consuming an IObservable<byte> and transforming it into an IObservable<Message>.
What is the most idiomatic way to do this?
Some examples:
[81 01 02 03 82] single message
[81 82 81 82 82] single message, header = [82 81 82]
[81 01 02 1B 82] single message, header = [01 02 1B].
[81 01 02 03 1B 82 82] single message, header = [01 02 03], (unescaped) data = [82]
[81 01 02 03 1B 1B 82 82] single message + dangling [82] which should be ignored.
header = [01 02 03], (unescaped) data = [1B]
Here's a state machine drawing for this:
If you are looking for something the is "more functional" then this may help, however the answer by #Evk pass these tests too.
Firstly can I suggest, to take the leg work out of providing a verifiable answer, could you provide a test suite to implement on complex questions like this.
Something like this would have been very helpful.
var scheduler = new TestScheduler();
var source = scheduler.CreateColdObservable<byte>(
ReactiveTest.OnNext<byte>(01,0x81), //BOM m1
ReactiveTest.OnNext<byte>(02,0x01),
ReactiveTest.OnNext<byte>(03,0x02),
ReactiveTest.OnNext<byte>(04,0x03),
ReactiveTest.OnNext<byte>(05,0x82), //EOM m1
ReactiveTest.OnNext<byte>(06,0x81), //BOM m2
ReactiveTest.OnNext<byte>(07,0x82),
ReactiveTest.OnNext<byte>(08,0x81),
ReactiveTest.OnNext<byte>(09,0x82),
ReactiveTest.OnNext<byte>(10,0x82), //EOM m2
ReactiveTest.OnNext<byte>(11,0x81), //BOM m3
ReactiveTest.OnNext<byte>(12,0x01),
ReactiveTest.OnNext<byte>(13,0x02),
ReactiveTest.OnNext<byte>(14,0x1B),
ReactiveTest.OnNext<byte>(15,0x82), //EOM m3
ReactiveTest.OnNext<byte>(16,0x81), //BOM m4
ReactiveTest.OnNext<byte>(17,0x01),
ReactiveTest.OnNext<byte>(18,0x02),
ReactiveTest.OnNext<byte>(19,0x03),
ReactiveTest.OnNext<byte>(20,0x1B), //Control character
ReactiveTest.OnNext<byte>(21,0x82), //Data
ReactiveTest.OnNext<byte>(22,0x82), //EOM m4
ReactiveTest.OnNext<byte>(23,0x81), //BOM m5
ReactiveTest.OnNext<byte>(24,0x01),
ReactiveTest.OnNext<byte>(25,0x02),
ReactiveTest.OnNext<byte>(26,0x03),
ReactiveTest.OnNext<byte>(27,0x1B), //Control character
ReactiveTest.OnNext<byte>(28,0x1B), //Data
ReactiveTest.OnNext<byte>(29,0x82), //EOM m5
ReactiveTest.OnNext<byte>(30,0x82));//Ignored (expected 0x81)
var observer = scheduler.CreateObserver<Message>();
//CurrentAnswer(source)
MyAnswer(source)
.Subscribe(observer);
scheduler.Start();
ReactiveAssert.AreElementsEqual(
new[] {
ReactiveTest.OnNext(05, new Message(){Header=new byte[]{0x01, 0x02, 0x03}, Data=new byte[0]{}}),
ReactiveTest.OnNext(10, new Message(){Header=new byte[]{0x82, 0x81, 0x82}, Data=new byte[0]{}}),
ReactiveTest.OnNext(15, new Message(){Header=new byte[]{0x01, 0x02, 0x1B}, Data=new byte[0]{}}),
ReactiveTest.OnNext(22, new Message(){Header=new byte[]{0x01, 0x02, 0x03}, Data=new byte[]{ 0x82}}),
ReactiveTest.OnNext(29, new Message(){Header=new byte[]{0x01, 0x02, 0x03}, Data=new byte[]{ 0x1B}}),
},
observer.Messages);
I have also written a version of Message that allows me to verify the code
public class Message
{
public static readonly byte BOM = 0x81;
public static readonly byte EOM = 0x82;
public static readonly byte Control = 0x1B;
public byte[] Header { get; set; }
public byte[] Data { get; set; }
public static Message Create(byte[] bytes)
{
if(bytes==null)
throw new ArgumentNullException(nameof(bytes));
if(bytes.Length<3)
throw new ArgumentException("bytes<3").Dump();
var header = new byte[3];
Array.Copy(bytes, header, 3);
var body = new List<byte>();
var escapeNext = false;
for (int i = 3; i < bytes.Length; i++)
{
var b = bytes[i];
if (b == Control && !escapeNext)
{
escapeNext = true;
}
else
{
body.Add(b);
escapeNext = false;
}
}
var msg = new Message { Header = header, Data = body.ToArray()};
return msg;
}
public override string ToString()
{
return string.Format("Message(Header=[{0}], Data=[{1}])", ByteArrayString(Header), ByteArrayString(Data));
}
private static string ByteArrayString(byte[] bytes)
{
return string.Join(",", bytes.Select(b => b.ToString("X")));
}
public override bool Equals(object obj)
{
var other = obj as Message;
if(obj==null)
return false;
return Equals(other);
}
protected bool Equals(Message other)
{
return IsSequenceEqual(Header, other.Header)
&& IsSequenceEqual(Data, other.Data);
}
private bool IsSequenceEqual<T>(IEnumerable<T> expected, IEnumerable<T> other)
{
if(expected==null && other==null)
return true;
if(expected==null || other==null)
return false;
return expected.SequenceEqual(other);
}
public override int GetHashCode()
{
unchecked
{
return ((Header != null ? Header.GetHashCode() : 0) * 397) ^ (Data != null ? Data.GetHashCode() : 0);
}
}
}
Now that I have all the plumbing, I can focus on the actual problem.
public static IObservable<Message> MyAnswer(IObservable<byte> source)
{
return source.Publish(s =>
{
return
Observable.Defer(()=>
//Start consuming once we see a BOM
s.SkipWhile(b => b != Message.BOM)
.Scan(new Accumulator(), (acc, cur)=>acc.Accumulate(cur))
)
.TakeWhile(acc=>!acc.IsEndOfMessage())
.Where(acc=>!acc.IsBeginingOfMessage())
.Select(acc=>acc.Value())
.ToArray()
.Where(buffer=>buffer.Any())
.Select(buffer => Message.Create(buffer))
.Repeat();
});
}
public class Accumulator
{
private int _index = 0;
private byte _current =0;
private bool _isCurrentEscaped = false;
private bool _isNextEscaped = false;
public Accumulator Accumulate(byte b)
{
_index++;
_current = b;
_isCurrentEscaped = _isNextEscaped;
_isNextEscaped = (!IsHeader() && !_isCurrentEscaped && b==Message.Control);
return this;
}
public byte Value()
{
return _current;
}
private bool IsHeader()
{
return _index < 5;
}
public bool IsBeginingOfMessage()
{
return _index == 1 && _current == Message.BOM;
}
public bool IsEndOfMessage()
{
return !IsHeader()
&& _current == Message.EOM
&& !_isCurrentEscaped;
}
}
For completeness, here is the guts of #Evk's answer so you easily swap in and out implementations.
public static IObservable<Message> CurrentAnswer(IObservable<byte> source)
{
return Observable.Create<Message>(o =>
{
// some crude parsing code for the sake of example
bool nextIsEscaped = false;
bool readingHeader = false;
bool readingBody = false;
List<byte> body = new List<byte>();
List<byte> header = new List<byte>();
return source.Subscribe(b =>
{
if (b == 0x81 && !nextIsEscaped && !readingHeader)
{
// start
readingHeader = true;
readingBody = false;
nextIsEscaped = false;
}
else if (b == 0x82 && !nextIsEscaped && !readingHeader)
{
// end
readingHeader = false;
readingBody = false;
if (header.Count > 0 || body.Count > 0)
{
o.OnNext(new Message()
{
Header = header.ToArray(),
Data = body.ToArray()
});
header.Clear();
body.Clear();
}
nextIsEscaped = false;
}
else if (b == 0x1B && !nextIsEscaped && !readingHeader)
{
nextIsEscaped = true;
}
else
{
if (readingHeader)
{
header.Add(b);
if (header.Count == 3)
{
readingHeader = false;
readingBody = true;
}
}
else if (readingBody)
body.Add(b);
nextIsEscaped = false;
}
});
});
}
You can just use basic building blocks: Observable.Create and Subscribe. First let's grab some code which will help us to read stream as observable of byte[] (there are many different examples of how to do that):
static class Extensions {
public static IObservable<byte[]> AsyncRead(this Stream stream, int bufferSize) {
var buffer = new byte[bufferSize];
var asyncRead = Observable.FromAsyncPattern<byte[], int, int, int>(
stream.BeginRead,
stream.EndRead);
return Observable.While(
() => stream.CanRead,
Observable.Defer(() => asyncRead(buffer, 0, bufferSize))
.Select(readBytes => buffer.Take(readBytes).ToArray()));
}
}
Then define message class:
class Message {
public byte[] Header { get; set; }
public byte[] Body { get; set; }
}
And then put that into small console app:
static void Main(string[] args) {
// original stream
var stream = new MemoryStream(new byte[] { 0x81, 0x01,0x02,0x03,0x1B,0x1B,0x82,0x82});
// your initial IObservable<byte[]>
IObservable<byte[]> bytes = stream.AsyncRead(128); // or any other buffer size
// your IObservable<Message>
var observable = Observable.Create<Message>(observer => {
// some crude parsing code for the sake of example
bool nextIsEscaped = false;
bool readingHeader = false;
bool readingBody = false;
List<byte> body = new List<byte>();
List<byte> header = new List<byte>();
return bytes.Subscribe(buffer => {
foreach (var b in buffer) {
if (b == 0x81 && !nextIsEscaped && !readingHeader) {
// start
readingHeader = true;
readingBody = false;
nextIsEscaped = false;
}
else if (b == 0x82 && !nextIsEscaped && !readingHeader) {
// end
readingHeader = false;
readingBody = false;
if (header.Count > 0 || body.Count > 0) {
observer.OnNext(new Message() {
Header = header.ToArray(),
Body = body.ToArray()
});
header.Clear();
body.Clear();
}
nextIsEscaped = false;
}
else if (b == 0x1B && !nextIsEscaped && !readingHeader) {
nextIsEscaped = true;
}
else {
if (readingHeader) {
header.Add(b);
if (header.Count == 3) {
readingHeader = false;
readingBody = true;
}
}
else if (readingBody)
body.Add(b);
nextIsEscaped = false;
}
}
});
});
observable.Subscribe(msg =>
{
Console.WriteLine("Header: " + BitConverter.ToString(msg.Header));
Console.WriteLine("Body: " + BitConverter.ToString(msg.Body));
});
Console.ReadKey();
}
I have some XML I am receiving from a server that sometimes has some invalid characters that I would like to remove before deserialization. I have no control over the XML file I receive so I need to check for the invalid characters myself.
Sample XML.....
<PrintStatus>N</PrintStatus>
<CustomerPO> >>>> pearl <<<<< </CustomerPO>
<Description>PO# pearl</Description>
<BranchID>4</BranchID>
<PostDate>
<Date>01/13/2015</Date>
</PostDate>
<ShipDate>
<Date>01/13/2015</Date>
</ShipDate>
As you can see, the customer po section has the invalid characters I need to remove. This sometimes occurs only in certain elements that include user typed data.
Here is my Response code.....
//configure http request
HttpWebRequest httpRequest = WebRequest.Create(url) as HttpWebRequest;
httpRequest.Method = "POST";
//prepare correct encoding for XML serialization
UTF8Encoding encoding = new UTF8Encoding();
//use Xml property to obtain serialized XML data
//convert into bytes using encoding specified above and get length
byte[] bodyBytes = encoding.GetBytes(Xml);
httpRequest.ContentLength = bodyBytes.Length;
//get http request stream for putting XML data into
Stream httpRequestBodyStream = httpRequest.GetRequestStream();
//fill stream with serialized XML data
httpRequestBodyStream.Write(bodyBytes, 0, bodyBytes.Length);
httpRequestBodyStream.Close();
//get http response
HttpWebResponse httpResponse = httpRequest.GetResponse() as HttpWebResponse;
StreamReader httpResponseStream = new StreamReader(httpResponse.GetResponseStream(), System.Text.Encoding.ASCII);
//extract XML from response
string httpResponseBody = httpResponseStream.ReadToEnd();
httpResponseStream.Close();
//ignore everything that isn't XML by removing headers
httpResponseBody = httpResponseBody.Substring(httpResponseBody.IndexOf("<?xml"));
//deserialize XML into ProductInquiryResponse
XmlSerializer serializer = new XmlSerializer(typeof(MyResponseClass));
StringReader responseReader = new StringReader(httpResponseBody);
//return MyResponseClass result
return serializer.Deserialize(responseReader) as MyResponseClass;
Does anyone happen to have any suggestions to check the XML? Should I just check the elements I am concerned with right before the xml string gets deserialized? Or is there a better way?
A general fix for your problem would be to recursively descend the XML, parsing as you go and comparing to the schema for that node. At any point if the input differs from the input expected from the schema, or is malformed in some way, allow an error handler to run to fix the input stream, rolling back to the most recent good state and proceeding forward with the fixed input.
The .Net XmlTextReader class is not flexible enough to do this. However, if you know in advance that from the schema that certain XML Elements cannot have children, then the following will read an XML input stream, and upon encountering an element whose fully qualified name matches the known names of leaf nodes, and "escape" the text of all such nodes:
public enum XmlDoctorStatus
{
NoFixNeeded,
FixMade,
FixFailed
}
public class XmlDoctor
{
internal class XmlFixData
{
public string InitialXml { get; private set; }
public string FixedXml { get; private set; }
public int LineNumber { get; private set; }
public int LinePosition { get; private set; }
public XmlFixData(string initialXml, string fixedXml, int lineNumber, int linePosition)
{
this.InitialXml = initialXml;
this.FixedXml = fixedXml;
this.LineNumber = lineNumber;
this.LinePosition = linePosition;
}
public bool ComesAfter(XmlFixData other)
{
if (LineNumber > other.LineNumber)
return true;
if (LineNumber == other.LineNumber && LinePosition > other.LinePosition)
return true;
return false;
}
}
internal class XmlFixedException : Exception
{
public XmlFixData XmlFixData { get; private set; }
public XmlFixedException(XmlFixData data)
{
this.XmlFixData = data;
}
}
readonly HashSet<XName> childlessNodes;
public string OriginalXml { get; private set; }
public XmlDoctor(string xml, IEnumerable<XName> childlessNodes)
{
if (xml == null)
throw new ArgumentNullException();
this.OriginalXml = xml;
this.childlessNodes = new HashSet<XName>(childlessNodes);
}
List<int> indices = null;
string passXml = string.Empty;
bool inPass = false;
void InitializePass(string xml)
{
if (inPass)
throw new Exception("nested pass");
ClearElementData();
TextHelper.NormalizeLines(xml, out passXml, out indices);
inPass = true;
}
void EndPass()
{
inPass = false;
indices = null;
passXml = string.Empty;
ClearElementData();
}
static int LineNumber(XmlReader reader)
{
return ((IXmlLineInfo)reader).LineNumber;
}
static int LinePosition(XmlReader reader)
{
return ((IXmlLineInfo)reader).LinePosition;
}
// Taken from https://stackoverflow.com/questions/1132494/string-escape-into-xml
public static string XmlEscape(string escaped)
{
var replacements = new KeyValuePair<string, string>[]
{
new KeyValuePair<string,string>("&", "&"),
new KeyValuePair<string,string>("\"", """),
new KeyValuePair<string,string>("'", "'"),
new KeyValuePair<string,string>("<", "<"),
new KeyValuePair<string,string>(">", ">"),
};
foreach (var pair in replacements)
foreach (var index in escaped.IndexesOf(pair.Key, 0).Reverse())
if (!replacements.Any(other => string.Compare(other.Value, 0, escaped, index, other.Value.Length, StringComparison.Ordinal) == 0))
{
escaped = escaped.Substring(0, index) + pair.Value + escaped.Substring(index + 1, escaped.Length - index - 1);
}
return escaped;
}
void HandleNode(XmlReader reader)
{
// Adapted from http://blogs.msdn.com/b/mfussell/archive/2005/02/12/371546.aspx
if (reader == null)
{
throw new ArgumentNullException("reader");
}
switch (reader.NodeType)
{
case XmlNodeType.Element:
HandleStartElement(reader);
if (reader.IsEmptyElement)
{
HandleEndElement(reader);
}
break;
case XmlNodeType.Text:
HandleText(reader);
break;
case XmlNodeType.Whitespace:
case XmlNodeType.SignificantWhitespace:
break;
case XmlNodeType.CDATA:
break;
case XmlNodeType.EntityReference:
break;
case XmlNodeType.XmlDeclaration:
case XmlNodeType.ProcessingInstruction:
break;
case XmlNodeType.DocumentType:
break;
case XmlNodeType.Comment:
break;
case XmlNodeType.EndElement:
HandleEndElement(reader);
break;
}
}
private void HandleText(XmlReader reader)
{
if (string.IsNullOrEmpty(currentElementLocalName) || string.IsNullOrEmpty(currentElementName))
return;
var name = XName.Get(currentElementLocalName, currentElementNameSpace);
if (!childlessNodes.Contains(name))
return;
var lineIndex = LineNumber(reader) - 1;
var charIndex = LinePosition(reader) - 1;
if (lineIndex < 0 || charIndex < 0)
return;
int startIndex = indices[lineIndex] + charIndex;
// Scan forward in the input string until we find either the beginning of a CDATA section or the end of this element.
// Patterns to match: </Name
//
string pattern1 = "</" + currentElementName;
var index1 = FindElementEnd(passXml, startIndex, pattern1);
if (index1 < 0)
return; // BAD XML.
string pattern2 = "<![CDATA[";
var index2 = passXml.IndexOf(pattern2, startIndex);
int endIndex = (index2 < 0 ? index1 : Math.Min(index1, index2));
var text = passXml.Substring(startIndex, endIndex - startIndex);
var escapeText = XmlEscape(text);
if (escapeText != text)
{
if (escapeText != XmlEscape(escapeText))
{
Debug.Assert(escapeText == XmlEscape(escapeText));
throw new InvalidOperationException("Escaping error");
}
string fixedXml = passXml.Substring(0, startIndex) + escapeText + passXml.Substring(endIndex, passXml.Length - endIndex);
throw new XmlFixedException(new XmlFixData(passXml, fixedXml, lineIndex + 1, charIndex + 1));
}
}
static bool IsXmlSpace(char ch)
{
// http://www.w3.org/TR/2000/REC-xml-20001006#NT-S
// [3] S ::= (#x20 | #x9 | #xD | #xA)+
return ch == '\u0020' || ch == '\u0009' || ch == '\u000D' || ch == '\u000A';
}
private static int FindElementEnd(string passXml, int charPos, string tagEnd)
{
while (true)
{
var index = passXml.IndexOf(tagEnd, charPos);
if (index < 0)
return index;
int endPos = index + tagEnd.Length;
if (index + tagEnd.Length >= passXml.Length)
return -1; // Bad xml?
// Now we must have zero or more white space characters and a ">"
while (endPos < passXml.Length && IsXmlSpace(passXml[endPos]))
endPos++;
if (endPos >= passXml.Length)
return -1; // BAD XML;
if (passXml[endPos] == '>')
return index;
index = endPos;
// Spurious ending, keep searching.
}
}
string currentElementName = string.Empty;
string currentElementNameSpace = string.Empty;
string currentElementLocalName = string.Empty;
private void HandleStartElement(XmlReader reader)
{
currentElementName = reader.Name;
currentElementLocalName = reader.LocalName;
currentElementNameSpace = reader.NamespaceURI;
}
private void HandleEndElement(XmlReader reader)
{
ClearElementData();
}
private void ClearElementData()
{
currentElementName = string.Empty;
currentElementNameSpace = string.Empty;
currentElementLocalName = string.Empty;
}
public XmlDoctorStatus TryFix(out string newXml)
{
XmlFixData data = null;
while (true)
{
XmlFixData newData;
var status = TryFixOnePass((data == null ? OriginalXml : data.FixedXml), out newData);
switch (status)
{
case XmlDoctorStatus.FixFailed:
Debug.WriteLine("Could not fix XML");
newXml = OriginalXml;
return XmlDoctorStatus.FixFailed;
case XmlDoctorStatus.FixMade:
if (data != null && !newData.ComesAfter(data))
{
Debug.WriteLine("Warning -- possible infinite loop detected, aborting fix");
newXml = OriginalXml;
return XmlDoctorStatus.FixFailed;
}
data = newData;
break; // Try to fix more
case XmlDoctorStatus.NoFixNeeded:
if (data == null)
{
newXml = OriginalXml;
return XmlDoctorStatus.NoFixNeeded;
}
else
{
newXml = data.FixedXml;
return XmlDoctorStatus.FixMade;
}
}
}
}
XmlDoctorStatus TryFixOnePass(string xml, out XmlFixData data)
{
try
{
InitializePass(xml);
using (var textReader = new StringReader(passXml))
using (XmlReader reader = XmlReader.Create(textReader))
{
while (true)
{
bool read = reader.Read();
if (!read)
break;
HandleNode(reader);
}
}
}
catch (XmlFixedException ex)
{
// Success - a fix was made.
data = ex.XmlFixData;
return XmlDoctorStatus.FixMade;
}
catch (Exception ex)
{
// Failure - the file was not fixed and could not be parsed.
Debug.WriteLine("Fix Failed: " + ex.ToString());
data = null;
return XmlDoctorStatus.FixFailed;
}
finally
{
EndPass();
}
// No fix needed.
data = null;
return XmlDoctorStatus.NoFixNeeded;
}
}
public static class TextHelper
{
public static void NormalizeLines(string text, out string newText, out List<int> lineIndices)
{
var sb = new StringBuilder();
var indices = new List<int>();
using (var sr = new StringReader(text))
{
string line;
while ((line = sr.ReadLine()) != null)
{
indices.Add(sb.Length);
sb.AppendLine(line);
}
}
lineIndices = indices;
newText = sb.ToString();
}
public static IEnumerable<int> IndexesOf(this string str, string value, int startAt)
{
if (str == null)
yield break;
for (int index = startAt, valueLength = value.Length; ; index += valueLength)
{
index = str.IndexOf(value, index);
if (index == -1)
break;
yield return index;
}
}
}
Then use it like:
public static class TestXmlDoctor
{
public static void TestFix()
{
string xml1 = #"<?xml version=""1.0"" encoding=""UTF-8""?>
<MainClass>
<PrintStatus>N</PrintStatus>
<CustomerPO> >>>> pearl <<<<< </CustomerPO>
<Description>PO# pearl</Description>
<BranchID>4</BranchID>
<PostDate>
<Date>01/13/2015</Date>
</PostDate>
<ShipDate>
<Date>01/13/2015</Date>
</ShipDate>
</MainClass>
";
XName[] childlessNodes1 = new XName[]
{
XName.Get("CustomerPO", string.Empty),
};
try
{
TestFix(xml1, childlessNodes1);
}
catch (Exception ex)
{
Debug.WriteLine(ex);
}
}
public static string TestFix(string xml, IEnumerable<XName> childlessNodes)
{
string fixedXml;
var status = (new XmlDoctor(xml, childlessNodes).TryFix(out fixedXml));
switch (status)
{
case XmlDoctorStatus.NoFixNeeded:
return xml;
case XmlDoctorStatus.FixFailed:
Debug.WriteLine("Failed to fix xml");
return xml;
case XmlDoctorStatus.FixMade:
Debug.WriteLine("Fixed XML, new XML is as follows:");
Debug.WriteLine(fixedXml);
Debug.WriteLine(string.Empty);
return fixedXml;
default:
Debug.Assert(false, "Unknown fix status " + status.ToString());
return xml;
}
}
}
This with this, your XML fragment can be parsed, and becomes:
<?xml version="1.0" encoding="UTF-8"?>
<MainClass>
<PrintStatus>N</PrintStatus>
<CustomerPO> >>>> pearl <<<<< </CustomerPO>
<Description>PO# pearl</Description>
<BranchID>4</BranchID>
<PostDate>
<Date>01/13/2015</Date>
</PostDate>
<ShipDate>
<Date>01/13/2015</Date>
</ShipDate>
</MainClass>
Is there an easy way to extract text from an Rtf string without using RichTextBox?
Example:
{\rtf1\ansi\ansicpg1252\uc1\htmautsp\deff2{\fonttbl{\f0\fcharset0 Times New Roman;}{\f2\fcharset0 Segoe UI;}}{\colortbl\red0\green0\blue0;\red255\green255\blue255;}\loch\hich\dbch\pard\plain\ltrpar\itap0{\lang1033\fs18\f2\cf0 \cf0\ql{\f2 {\lang2070\ltrch foo}\li0\ri0\sa0\sb0\fi0\ql\par}
{\f2 {\lang2070\ltrch bar }\li0\ri0\sa0\sb0\fi0\ql\par}
}
}
should return:
foo
bar
How to do it in pure C# without any references to other libraries:
This guy wrote a class that strips RTF to plain text just as OP requested.
Here is the source
This is his code:
/// <summary>
/// Rich Text Stripper
/// </summary>
/// <remarks>
/// Translated from Python located at:
/// http://stackoverflow.com/a/188877/448
/// </remarks>
public static class RichTextStripper
{
private class StackEntry
{
public int NumberOfCharactersToSkip { get; set; }
public bool Ignorable { get; set; }
public StackEntry(int numberOfCharactersToSkip, bool ignorable)
{
NumberOfCharactersToSkip = numberOfCharactersToSkip;
Ignorable = ignorable;
}
}
private static readonly Regex _rtfRegex = new Regex(#"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)", RegexOptions.Singleline | RegexOptions.IgnoreCase);
private static readonly List<string> destinations = new List<string>
{
"aftncn","aftnsep","aftnsepc","annotation","atnauthor","atndate","atnicn","atnid",
"atnparent","atnref","atntime","atrfend","atrfstart","author","background",
"bkmkend","bkmkstart","blipuid","buptim","category","colorschememapping",
"colortbl","comment","company","creatim","datafield","datastore","defchp","defpap",
"do","doccomm","docvar","dptxbxtext","ebcend","ebcstart","factoidname","falt",
"fchars","ffdeftext","ffentrymcr","ffexitmcr","ffformat","ffhelptext","ffl",
"ffname","ffstattext","field","file","filetbl","fldinst","fldrslt","fldtype",
"fname","fontemb","fontfile","fonttbl","footer","footerf","footerl","footerr",
"footnote","formfield","ftncn","ftnsep","ftnsepc","g","generator","gridtbl",
"header","headerf","headerl","headerr","hl","hlfr","hlinkbase","hlloc","hlsrc",
"hsv","htmltag","info","keycode","keywords","latentstyles","lchars","levelnumbers",
"leveltext","lfolevel","linkval","list","listlevel","listname","listoverride",
"listoverridetable","listpicture","liststylename","listtable","listtext",
"lsdlockedexcept","macc","maccPr","mailmerge","maln","malnScr","manager","margPr",
"mbar","mbarPr","mbaseJc","mbegChr","mborderBox","mborderBoxPr","mbox","mboxPr",
"mchr","mcount","mctrlPr","md","mdeg","mdegHide","mden","mdiff","mdPr","me",
"mendChr","meqArr","meqArrPr","mf","mfName","mfPr","mfunc","mfuncPr","mgroupChr",
"mgroupChrPr","mgrow","mhideBot","mhideLeft","mhideRight","mhideTop","mhtmltag",
"mlim","mlimloc","mlimlow","mlimlowPr","mlimupp","mlimuppPr","mm","mmaddfieldname",
"mmath","mmathPict","mmathPr","mmaxdist","mmc","mmcJc","mmconnectstr",
"mmconnectstrdata","mmcPr","mmcs","mmdatasource","mmheadersource","mmmailsubject",
"mmodso","mmodsofilter","mmodsofldmpdata","mmodsomappedname","mmodsoname",
"mmodsorecipdata","mmodsosort","mmodsosrc","mmodsotable","mmodsoudl",
"mmodsoudldata","mmodsouniquetag","mmPr","mmquery","mmr","mnary","mnaryPr",
"mnoBreak","mnum","mobjDist","moMath","moMathPara","moMathParaPr","mopEmu",
"mphant","mphantPr","mplcHide","mpos","mr","mrad","mradPr","mrPr","msepChr",
"mshow","mshp","msPre","msPrePr","msSub","msSubPr","msSubSup","msSubSupPr","msSup",
"msSupPr","mstrikeBLTR","mstrikeH","mstrikeTLBR","mstrikeV","msub","msubHide",
"msup","msupHide","mtransp","mtype","mvertJc","mvfmf","mvfml","mvtof","mvtol",
"mzeroAsc","mzeroDesc","mzeroWid","nesttableprops","nextfile","nonesttables",
"objalias","objclass","objdata","object","objname","objsect","objtime","oldcprops",
"oldpprops","oldsprops","oldtprops","oleclsid","operator","panose","password",
"passwordhash","pgp","pgptbl","picprop","pict","pn","pnseclvl","pntext","pntxta",
"pntxtb","printim","private","propname","protend","protstart","protusertbl","pxe",
"result","revtbl","revtim","rsidtbl","rxe","shp","shpgrp","shpinst",
"shppict","shprslt","shptxt","sn","sp","staticval","stylesheet","subject","sv",
"svb","tc","template","themedata","title","txe","ud","upr","userprops",
"wgrffmtfilter","windowcaption","writereservation","writereservhash","xe","xform",
"xmlattrname","xmlattrvalue","xmlclose","xmlname","xmlnstbl",
"xmlopen"
};
private static readonly Dictionary<string, string> specialCharacters = new Dictionary<string, string>
{
{ "par", "\n" },
{ "sect", "\n\n" },
{ "page", "\n\n" },
{ "line", "\n" },
{ "tab", "\t" },
{ "emdash", "\u2014" },
{ "endash", "\u2013" },
{ "emspace", "\u2003" },
{ "enspace", "\u2002" },
{ "qmspace", "\u2005" },
{ "bullet", "\u2022" },
{ "lquote", "\u2018" },
{ "rquote", "\u2019" },
{ "ldblquote", "\u201C" },
{ "rdblquote", "\u201D" },
};
/// <summary>
/// Strip RTF Tags from RTF Text
/// </summary>
/// <param name="inputRtf">RTF formatted text</param>
/// <returns>Plain text from RTF</returns>
public static string StripRichTextFormat(string inputRtf)
{
if (inputRtf == null)
{
return null;
}
string returnString;
var stack = new Stack<StackEntry>();
bool ignorable = false; // Whether this group (and all inside it) are "ignorable".
int ucskip = 1; // Number of ASCII characters to skip after a unicode character.
int curskip = 0; // Number of ASCII characters left to skip
var outList = new List<string>(); // Output buffer.
MatchCollection matches = _rtfRegex.Matches(inputRtf);
if (matches.Count > 0)
{
foreach (Match match in matches)
{
string word = match.Groups[1].Value;
string arg = match.Groups[2].Value;
string hex = match.Groups[3].Value;
string character = match.Groups[4].Value;
string brace = match.Groups[5].Value;
string tchar = match.Groups[6].Value;
if (!String.IsNullOrEmpty(brace))
{
curskip = 0;
if (brace == "{")
{
// Push state
stack.Push(new StackEntry(ucskip, ignorable));
}
else if (brace == "}")
{
// Pop state
StackEntry entry = stack.Pop();
ucskip = entry.NumberOfCharactersToSkip;
ignorable = entry.Ignorable;
}
}
else if (!String.IsNullOrEmpty(character)) // \x (not a letter)
{
curskip = 0;
if (character == "~")
{
if (!ignorable)
{
outList.Add("\xA0");
}
}
else if ("{}\\".Contains(character))
{
if (!ignorable)
{
outList.Add(character);
}
}
else if (character == "*")
{
ignorable = true;
}
}
else if (!String.IsNullOrEmpty(word)) // \foo
{
curskip = 0;
if (destinations.Contains(word))
{
ignorable = true;
}
else if (ignorable)
{
}
else if (specialCharacters.ContainsKey(word))
{
outList.Add(specialCharacters[word]);
}
else if (word == "uc")
{
ucskip = Int32.Parse(arg);
}
else if (word == "u")
{
int c = Int32.Parse(arg);
if (c < 0)
{
c += 0x10000;
}
outList.Add(Char.ConvertFromUtf32(c));
curskip = ucskip;
}
}
else if (!String.IsNullOrEmpty(hex)) // \'xx
{
if (curskip > 0)
{
curskip -= 1;
}
else if (!ignorable)
{
int c = Int32.Parse(hex, System.Globalization.NumberStyles.HexNumber);
outList.Add(Char.ConvertFromUtf32(c));
}
}
else if (!String.IsNullOrEmpty(tchar))
{
if (curskip > 0)
{
curskip -= 1;
}
else if (!ignorable)
{
outList.Add(tchar);
}
}
}
}
else
{
// Didn't match the regex
returnString = inputRtf;
}
returnString = String.Join(String.Empty, outList.ToArray());
return returnString;
}
}
EDIT 1:
In the meantime we had this code running for tests and adapted version in production. The new version does some additional safety checks & handles new lines better.
public static string StripRichTextFormat(string inputRtf)
{
if (inputRtf == null)
{
return null;
}
string returnString;
var stack = new Stack<StackEntry>();
bool ignorable = false; // Whether this group (and all inside it) are "ignorable".
int ucskip = 1; // Number of ASCII characters to skip after a unicode character.
int curskip = 0; // Number of ASCII characters left to skip
var outList = new List<string>(); // Output buffer.
MatchCollection matches = _rtfRegex.Matches(inputRtf);
if (matches.Count > 0)
{
foreach (Match match in matches)
{
string word = match.Groups[1].Value;
string arg = match.Groups[2].Value;
string hex = match.Groups[3].Value;
string character = match.Groups[4].Value;
string brace = match.Groups[5].Value;
string tchar = match.Groups[6].Value;
if (!String.IsNullOrEmpty(brace))
{
curskip = 0;
if (brace == "{")
{
// Push state
stack.Push(new StackEntry(ucskip, ignorable));
}
else if (brace == "}")
{
// Pop state
StackEntry entry = stack.Pop();
ucskip = entry.NumberOfCharactersToSkip;
ignorable = entry.Ignorable;
}
}
else if (!String.IsNullOrEmpty(character)) // \x (not a letter)
{
curskip = 0;
if (character == "~")
{
if (!ignorable)
{
outList.Add("\xA0");
}
}
else if ("{}\\".Contains(character))
{
if (!ignorable)
{
outList.Add(character);
}
}
else if (character == "*")
{
ignorable = true;
}
}
else if (!String.IsNullOrEmpty(word)) // \foo
{
curskip = 0;
if (destinations.Contains(word))
{
ignorable = true;
}
else if (ignorable)
{
}
else if (specialCharacters.ContainsKey(word))
{
outList.Add(specialCharacters[word]);
}
else if (word == "uc")
{
ucskip = Int32.Parse(arg);
}
else if (word == "u")
{
int c = Int32.Parse(arg);
if (c < 0)
{
c += 0x10000;
}
//Ein gültiger UTF32-Wert ist zwischen 0x000000 und 0x10ffff (einschließlich) und sollte keine Ersatzcodepunktwerte (0x00d800 ~ 0x00dfff)
if (c >= 0x000000 && c <= 0x10ffff && (c < 0x00d800 || c > 0x00dfff))
outList.Add(Char.ConvertFromUtf32(c));
else outList.Add("?");
curskip = ucskip;
}
}
else if (!String.IsNullOrEmpty(hex)) // \'xx
{
if (curskip > 0)
{
curskip -= 1;
}
else if (!ignorable)
{
int c = Int32.Parse(hex, System.Globalization.NumberStyles.HexNumber);
outList.Add(Char.ConvertFromUtf32(c));
}
}
else if (!String.IsNullOrEmpty(tchar))
{
if (curskip > 0)
{
curskip -= 1;
}
else if (!ignorable)
{
outList.Add(tchar);
}
}
}
}
else
{
// Didn't match the regex
returnString = inputRtf;
}
returnString = String.Join(String.Empty, outList.ToArray());
return returnString;
}
There's a simple article on MSDN to achieve what you want: http://msdn.microsoft.com/en-us/library/cc488002.aspx
class ConvertFromRTF
{
static void Main()
{
string path = #"test.rtf";
//Create the RichTextBox. (Requires a reference to System.Windows.Forms.dll.)
System.Windows.Forms.RichTextBox rtBox = new System.Windows.Forms.RichTextBox();
// Get the contents of the RTF file. Note that when it is
// stored in the string, it is encoded as UTF-16.
string s = System.IO.File.ReadAllText(path);
// Display the RTF text.
System.Windows.Forms.MessageBox.Show(s);
// Convert the RTF to plain text.
rtBox.Rtf = s;
string plainText = rtBox.Text;
// Display plain text output in MessageBox because console
// cannot display Greek letters.
System.Windows.Forms.MessageBox.Show(plainText);
// Output plain text to file, encoded as UTF-8.
System.IO.File.WriteAllText(#"output.txt", plainText);
}
}
Can't agree with using RichTextBox or any other controls in such kind of tasks. Here is another one approach:
public string RtfToPlainText(string rtf)
{
var flowDocument = new FlowDocument();
var textRange = new TextRange(flowDocument.ContentStart, flowDocument.ContentEnd);
using (var stream = new MemoryStream(Encoding.UTF8.GetBytes(rtf ?? string.Empty)))
{
textRange.Load(stream, DataFormats.Rtf);
}
return textRange.Text;
}