I am unable to find the solution that how to read Memo type field from DBF file.
As per my current RnD, I found out that the "Memo" data is stored in a separate file with the same name as the DBF file, but with a ".fpt" file extension. The "Memo" data file consists of a series of blocks, each of which contains a fixed number of bytes (typically 512 bytes), and each block is identified by a block number.
The "Memo" column in the DBF file contains a 10-byte pointer that identifies the block number of the first block of data for the corresponding record. If the data for a particular record exceeds the size of one block, additional blocks are allocated as needed and linked together using the pointers stored in the first block.
Moreover I used the source code provided here
using System;
using System.Collections.Generic;
using System.Data;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Reflection;
using System.Runtime.InteropServices;
using System.Text;
namespace System.IO
{
/// <summary>
/// This class reads a dbf files
/// </summary>
public class DBFReader : IDisposable
{
private BinaryReader reader;
private Encoding encoding;
public DBFReader(Stream stream, Encoding encoding)
{
this.encoding = encoding;
this.reader = new BinaryReader(stream, encoding);
ReadHeader();
}
public DBFReader(string filename, Encoding encoding)
{
if (File.Exists(filename) == false)
throw new FileNotFoundException();
this.encoding = encoding;
var bs = new BufferedStream(File.OpenRead(filename));
this.reader = new BinaryReader(bs, encoding);
ReadHeader();
}
private void ReadHeader()
{
byte[] buffer = reader.ReadBytes(Marshal.SizeOf(typeof(DBFHeader)));
// Marshall the header into a DBFHeader structure
GCHandle handle = GCHandle.Alloc(buffer, GCHandleType.Pinned);
this.header = (DBFHeader)Marshal.PtrToStructure(handle.AddrOfPinnedObject(), typeof(DBFHeader));
handle.Free();
fields = new List<DBFFieldDescriptor>();
while (reader.PeekChar() != 13)
{
buffer = reader.ReadBytes(Marshal.SizeOf(typeof(DBFFieldDescriptor)));
handle = GCHandle.Alloc(buffer, GCHandleType.Pinned);
var fieldDescriptor = (DBFFieldDescriptor)Marshal.PtrToStructure(handle.AddrOfPinnedObject(), typeof(DBFFieldDescriptor));
if ((fieldDescriptor.Flags & DBFFieldFlags.System) != DBFFieldFlags.System )
{
fields.Add(fieldDescriptor);
}
handle.Free();
}
byte headerTerminator = reader.ReadByte();
byte[] backlink = reader.ReadBytes(263);
}
private void ReadRecords()
{
records = new List<Dictionary<DBFFieldDescriptor, object>>();
// Skip back to the end of the header.
reader.BaseStream.Seek(header.HeaderLenght, SeekOrigin.Begin);
for (int i = 0; i < header.NumberOfRecords; i++)
{
if (reader.PeekChar() == '*') // DELETED
{
continue;
}
var record = new Dictionary<DBFFieldDescriptor, object>();
var row = reader.ReadBytes(header.RecordLenght);
foreach (var field in fields)
{
byte[] buffer = new byte[field.FieldLength];
Array.Copy(row, field.Address, buffer, 0, field.FieldLength);
string text = (encoding.GetString(buffer) ?? String.Empty).Trim();
switch ((DBFFieldType)field.FieldType)
{
case DBFFieldType.Character:
record[field] = text;
break;
case DBFFieldType.Currency:
if (String.IsNullOrWhiteSpace(text))
{
if ((field.Flags & DBFFieldFlags.AllowNullValues) == DBFFieldFlags.AllowNullValues)
{
record[field] = null;
}
else
{
record[field] = 0.0m;
}
}
else
{
record[field] = Convert.ToDecimal(text);
}
break;
case DBFFieldType.Numeric:
case DBFFieldType.Float:
if (String.IsNullOrWhiteSpace(text))
{
if ((field.Flags & DBFFieldFlags.AllowNullValues) == DBFFieldFlags.AllowNullValues)
{
record[field] = null;
}
else
{
record[field] = 0.0f;
}
}
else
{
record[field] = Convert.ToSingle(text);
}
break;
case DBFFieldType.Date:
if (String.IsNullOrWhiteSpace(text))
{
if ((field.Flags & DBFFieldFlags.AllowNullValues) == DBFFieldFlags.AllowNullValues)
{
record[field] = null;
}
else
{
record[field] = DateTime.MinValue;
}
}
else
{
record[field] = DateTime.ParseExact(text, "yyyyMMdd", CultureInfo.InvariantCulture);
}
break;
case DBFFieldType.DateTime:
if (String.IsNullOrWhiteSpace(text) || BitConverter.ToInt64(buffer, 0) == 0)
{
if ((field.Flags & DBFFieldFlags.AllowNullValues) == DBFFieldFlags.AllowNullValues)
{
record[field] = null;
}
else
{
record[field] = DateTime.MinValue;
}
}
else
{
record[field] = JulianToDateTime(BitConverter.ToInt64(buffer, 0));
}
break;
case DBFFieldType.Double:
if (String.IsNullOrWhiteSpace(text))
{
if ((field.Flags & DBFFieldFlags.AllowNullValues) == DBFFieldFlags.AllowNullValues)
{
record[field] = null;
}
else
{
record[field] = 0.0;
}
}
else
{
record[field] = Convert.ToDouble(text);
}
break;
case DBFFieldType.Integer:
if (String.IsNullOrWhiteSpace(text))
{
if ((field.Flags & DBFFieldFlags.AllowNullValues) == DBFFieldFlags.AllowNullValues)
{
record[field] = null;
}
else
{
record[field] = 0;
}
}
else
{
record[field] = BitConverter.ToInt32(buffer, 0);
}
break;
case DBFFieldType.Logical:
if (String.IsNullOrWhiteSpace(text))
{
if ((field.Flags & DBFFieldFlags.AllowNullValues) == DBFFieldFlags.AllowNullValues)
{
record[field] = null;
}
else
{
record[field] = false;
}
}
else
{
record[field] = (buffer[0] == 'Y' || buffer[0] == 'T');
}
break;
case DBFFieldType.Memo:
case DBFFieldType.General:
case DBFFieldType.Picture:
default:
record[field] = buffer;
break;
}
}
records.Add(record);
}
}
public DataTable ReadToDataTable()
{
ReadRecords();
var table = new DataTable();
// Columns
foreach (var field in fields)
{
var colType = ToDbType(field.FieldType);
var column = new DataColumn(field.FieldName, colType ?? typeof(String));
table.Columns.Add(column);
}
// Rows
foreach (var record in records)
{
var row = table.NewRow();
foreach (var column in record.Keys)
{
row[column.FieldName] = record[column] ?? DBNull.Value;
}
table.Rows.Add(row);
}
return table;
}
public IEnumerable<Dictionary<string, object>> ReadToDictionary()
{
ReadRecords();
return records.Select(record => record.ToDictionary(r => r.Key.FieldName, r => r.Value)).ToList();
}
public IEnumerable<T> ReadToObject<T>()
where T : new()
{
ReadRecords();
var type = typeof(T);
var list = new List<T>();
foreach (var record in records)
{
T item = new T();
foreach (var pair in record.Select(s => new { Key = s.Key.FieldName, Value = s.Value }))
{
var property = type.GetProperty(pair.Key, BindingFlags.IgnoreCase | BindingFlags.Public | BindingFlags.Instance);
if (property != null)
{
if (property.PropertyType == pair.Value.GetType())
{
property.SetValue(item, pair.Value, null);
}
else
{
if (pair.Value != DBNull.Value)
{
property.SetValue(item, System.Convert.ChangeType(pair.Value, property.PropertyType), null);
}
}
}
}
list.Add(item);
}
return list;
}
private DBFHeader header;
private List<DBFFieldDescriptor> fields = new List<DBFFieldDescriptor>();
private List<Dictionary<DBFFieldDescriptor, object>> records = new List<Dictionary<DBFFieldDescriptor,object>>();
#region IDisposable
public void Dispose()
{
Dispose(true);
GC.SuppressFinalize(this);
}
protected void Dispose(bool disposing)
{
if (disposing == false) return;
if (reader != null)
{
reader.Close();
reader.Dispose();
reader = null;
}
}
~DBFReader()
{
Dispose(false);
}
#endregion
/// <summary>
/// Convert a Julian Date as long to a .NET DateTime structure
/// Implemented from pseudo code at http://en.wikipedia.org/wiki/Julian_day
/// </summary>
/// <param name="julianDateAsLong">Julian Date to convert (days since 01/01/4713 BC)</param>
/// <returns>DateTime</returns>
private static DateTime JulianToDateTime(long julianDateAsLong)
{
if (julianDateAsLong == 0) return DateTime.MinValue;
double p = Convert.ToDouble(julianDateAsLong);
double s1 = p + 68569;
double n = Math.Floor(4 * s1 / 146097);
double s2 = s1 - Math.Floor(((146097 * n) + 3) / 4);
double i = Math.Floor(4000 * (s2 + 1) / 1461001);
double s3 = s2 - Math.Floor(1461 * i / 4) + 31;
double q = Math.Floor(80 * s3 / 2447);
double d = s3 - Math.Floor(2447 * q / 80);
double s4 = Math.Floor(q / 11);
double m = q + 2 - (12 * s4);
double j = (100 * (n - 49)) + i + s4;
return new DateTime(Convert.ToInt32(j), Convert.ToInt32(m), Convert.ToInt32(d));
}
/// <summary>
/// This is the file header for a DBF. We do this special layout with everything
/// packed so we can read straight from disk into the structure to populate it
/// </summary>
[StructLayout(LayoutKind.Sequential, CharSet = CharSet.Ansi, Pack = 1)]
private struct DBFHeader
{
/// <summary>The version.</summary>
public readonly DBFVersion Version;
/// <summary>The update year.</summary>
public readonly byte UpdateYear;
/// <summary>The update month.</summary>
public readonly byte UpdateMonth;
/// <summary>The update day.</summary>
public readonly byte UpdateDay;
/// <summary>The number of records.</summary>
public readonly int NumberOfRecords;
/// <summary>The length of the header.</summary>
public readonly short HeaderLenght;
/// <summary>The length of the bytes records.</summary>
public readonly short RecordLenght;
[MarshalAs(UnmanagedType.ByValArray, SizeConst = 16)]
public readonly byte[] Reserved;
/// <summary>Table Flags</summary>
public readonly DBFTableFlags TableFlags;
/// <summary>Code Page Mark</summary>
public readonly byte CodePage;
/// <summary>Reserved, contains 0x00</summary>
public readonly short EndOfHeader;
}
public enum DBFVersion : byte
{
Unknown = 0,
FoxBase = 0x02,
FoxBaseDBase3NoMemo = 0x03,
VisualFoxPro = 0x30,
VisualFoxProWithAutoIncrement = 0x31,
dBase4SQLTableNoMemo = 0x43,
dBase4SQLSystemNoMemo = 0x63,
FoxBaseDBase3WithMemo = 0x83,
dBase4WithMemo = 0x8B,
dBase4SQLTableWithMemo = 0xCB,
FoxPro2WithMemo = 0xF5,
FoxBASE = 0xFB
}
[Flags]
public enum DBFTableFlags : byte
{
None = 0x00,
HasStructuralCDX = 0x01,
HasMemoField = 0x02,
IsDBC = 0x04
}
/// <summary>
/// This is the field descriptor structure. There will be one of these for each column in the table.
/// </summary>
[StructLayout(LayoutKind.Sequential, CharSet = CharSet.Ansi, Pack = 1)]
private struct DBFFieldDescriptor
{
/// <summary>The field name.</summary>
[MarshalAs(UnmanagedType.ByValTStr, SizeConst = 11)]
public readonly string FieldName;
/// <summary>The field type.</summary>
public readonly char FieldType;
/// <summary>The field address.</summary>
public readonly int Address;
/// <summary>The field length in bytes.</summary>
public readonly byte FieldLength;
/// <summary>The field precision.</summary>
public readonly byte DecimalCount;
/// <summary>Field Flags</summary>
public readonly DBFFieldFlags Flags;
/// <summary>AutoIncrement next value</summary>
public readonly int AutoIncrementNextValue;
/// <summary>AutoIncrement step value</summary>
public readonly byte AutoIncrementStepValue;
/// <summary>Reserved</summary>
[MarshalAs(UnmanagedType.ByValArray, SizeConst = 8)]
public readonly byte[] Reserved;
public override string ToString()
{
return String.Format("{0} {1}", FieldName, FieldType);
}
}
[Flags]
public enum DBFFieldFlags : byte
{
None = 0x00,
System = 0x01,
AllowNullValues = 0x02,
Binary = 0x04,
AutoIncrementing = 0x0C
}
public enum DBFFieldType : int
{
Character = 'C',
Currency = 'Y',
Numeric = 'N',
Float = 'F',
Date = 'D',
DateTime = 'T',
Double = 'B',
Integer = 'I',
Logical = 'L',
Memo = 'M',
General = 'G',
Picture = 'P'
}
public static Type ToDbType(char type)
{
switch ((DBFFieldType)type)
{
case DBFFieldType.Float:
return typeof(float);
case DBFFieldType.Integer:
return typeof(int);
case DBFFieldType.Currency:
return typeof(decimal);
case DBFFieldType.Character:
case DBFFieldType.Memo:
return typeof(string);
case DBFFieldType.Date:
case DBFFieldType.DateTime:
return typeof(DateTime);
case DBFFieldType.Logical:
return typeof(bool);
case DBFFieldType.General:
case DBFFieldType.Picture:
return typeof(byte[]);
default:
return null;
}
}
}
}
What is your question?
(Would be a mess as comment)
Your information about FPT file structure is not entirely correct and full details exist in the VFP help file.
Do you have to read it low level? If you have to, basically what you do is:
Collect memo block numbers from the DBF itself (block numbers are stored in corresponding field in DBF),
For a memo entry, go to block number in FPT (depends on block size)
From the block header read the size and get the bytes that matches the stored size.
While doing that, always read as binary, because unlike C strings, VFP string values are not ASCIIZ values, they can even contain '\x0'.
If it is not a must to read that way then the easy way is to simply read it using VFPOLEDB OleDb driver. ie:
void Main()
{
string cn = #"Provider=VFPOLEDB;Data source=C:\PROGRAM FILES (X86)\MICROSOFT VISUAL FOXPRO 9\SAMPLES\DATA;";
string query = "select emp_id, first_name, last_name, notes from Employee";
DataTable t = new DataTable();
new OleDbDataAdapter(query, cn).Fill(t);
foreach (var row in t.AsEnumerable())
{
Console.WriteLine($"{row["emp_id"]}, {row["first_name"]}, {row["last_name"]}, ({row["notes"]})");
}
}
Note that VFPOLEDB driver is 32 bits, you would need to target x86 platform. There are 64 bits drivers from Sybase ADS as they say (have never used).
PS: Also search for Tom Brother's Linq To VFp, Linq To EF VFP drivers.
PS2: I quickly glanced the source code you provided, and beware that is for older VFP versions and is not really correct (ie: Assumes Memo is a string which is not correct - if you accept them string in C# you are likely going to lose many data).
I have some XML I am receiving from a server that sometimes has some invalid characters that I would like to remove before deserialization. I have no control over the XML file I receive so I need to check for the invalid characters myself.
Sample XML.....
<PrintStatus>N</PrintStatus>
<CustomerPO> >>>> pearl <<<<< </CustomerPO>
<Description>PO# pearl</Description>
<BranchID>4</BranchID>
<PostDate>
<Date>01/13/2015</Date>
</PostDate>
<ShipDate>
<Date>01/13/2015</Date>
</ShipDate>
As you can see, the customer po section has the invalid characters I need to remove. This sometimes occurs only in certain elements that include user typed data.
Here is my Response code.....
//configure http request
HttpWebRequest httpRequest = WebRequest.Create(url) as HttpWebRequest;
httpRequest.Method = "POST";
//prepare correct encoding for XML serialization
UTF8Encoding encoding = new UTF8Encoding();
//use Xml property to obtain serialized XML data
//convert into bytes using encoding specified above and get length
byte[] bodyBytes = encoding.GetBytes(Xml);
httpRequest.ContentLength = bodyBytes.Length;
//get http request stream for putting XML data into
Stream httpRequestBodyStream = httpRequest.GetRequestStream();
//fill stream with serialized XML data
httpRequestBodyStream.Write(bodyBytes, 0, bodyBytes.Length);
httpRequestBodyStream.Close();
//get http response
HttpWebResponse httpResponse = httpRequest.GetResponse() as HttpWebResponse;
StreamReader httpResponseStream = new StreamReader(httpResponse.GetResponseStream(), System.Text.Encoding.ASCII);
//extract XML from response
string httpResponseBody = httpResponseStream.ReadToEnd();
httpResponseStream.Close();
//ignore everything that isn't XML by removing headers
httpResponseBody = httpResponseBody.Substring(httpResponseBody.IndexOf("<?xml"));
//deserialize XML into ProductInquiryResponse
XmlSerializer serializer = new XmlSerializer(typeof(MyResponseClass));
StringReader responseReader = new StringReader(httpResponseBody);
//return MyResponseClass result
return serializer.Deserialize(responseReader) as MyResponseClass;
Does anyone happen to have any suggestions to check the XML? Should I just check the elements I am concerned with right before the xml string gets deserialized? Or is there a better way?
A general fix for your problem would be to recursively descend the XML, parsing as you go and comparing to the schema for that node. At any point if the input differs from the input expected from the schema, or is malformed in some way, allow an error handler to run to fix the input stream, rolling back to the most recent good state and proceeding forward with the fixed input.
The .Net XmlTextReader class is not flexible enough to do this. However, if you know in advance that from the schema that certain XML Elements cannot have children, then the following will read an XML input stream, and upon encountering an element whose fully qualified name matches the known names of leaf nodes, and "escape" the text of all such nodes:
public enum XmlDoctorStatus
{
NoFixNeeded,
FixMade,
FixFailed
}
public class XmlDoctor
{
internal class XmlFixData
{
public string InitialXml { get; private set; }
public string FixedXml { get; private set; }
public int LineNumber { get; private set; }
public int LinePosition { get; private set; }
public XmlFixData(string initialXml, string fixedXml, int lineNumber, int linePosition)
{
this.InitialXml = initialXml;
this.FixedXml = fixedXml;
this.LineNumber = lineNumber;
this.LinePosition = linePosition;
}
public bool ComesAfter(XmlFixData other)
{
if (LineNumber > other.LineNumber)
return true;
if (LineNumber == other.LineNumber && LinePosition > other.LinePosition)
return true;
return false;
}
}
internal class XmlFixedException : Exception
{
public XmlFixData XmlFixData { get; private set; }
public XmlFixedException(XmlFixData data)
{
this.XmlFixData = data;
}
}
readonly HashSet<XName> childlessNodes;
public string OriginalXml { get; private set; }
public XmlDoctor(string xml, IEnumerable<XName> childlessNodes)
{
if (xml == null)
throw new ArgumentNullException();
this.OriginalXml = xml;
this.childlessNodes = new HashSet<XName>(childlessNodes);
}
List<int> indices = null;
string passXml = string.Empty;
bool inPass = false;
void InitializePass(string xml)
{
if (inPass)
throw new Exception("nested pass");
ClearElementData();
TextHelper.NormalizeLines(xml, out passXml, out indices);
inPass = true;
}
void EndPass()
{
inPass = false;
indices = null;
passXml = string.Empty;
ClearElementData();
}
static int LineNumber(XmlReader reader)
{
return ((IXmlLineInfo)reader).LineNumber;
}
static int LinePosition(XmlReader reader)
{
return ((IXmlLineInfo)reader).LinePosition;
}
// Taken from https://stackoverflow.com/questions/1132494/string-escape-into-xml
public static string XmlEscape(string escaped)
{
var replacements = new KeyValuePair<string, string>[]
{
new KeyValuePair<string,string>("&", "&"),
new KeyValuePair<string,string>("\"", """),
new KeyValuePair<string,string>("'", "'"),
new KeyValuePair<string,string>("<", "<"),
new KeyValuePair<string,string>(">", ">"),
};
foreach (var pair in replacements)
foreach (var index in escaped.IndexesOf(pair.Key, 0).Reverse())
if (!replacements.Any(other => string.Compare(other.Value, 0, escaped, index, other.Value.Length, StringComparison.Ordinal) == 0))
{
escaped = escaped.Substring(0, index) + pair.Value + escaped.Substring(index + 1, escaped.Length - index - 1);
}
return escaped;
}
void HandleNode(XmlReader reader)
{
// Adapted from http://blogs.msdn.com/b/mfussell/archive/2005/02/12/371546.aspx
if (reader == null)
{
throw new ArgumentNullException("reader");
}
switch (reader.NodeType)
{
case XmlNodeType.Element:
HandleStartElement(reader);
if (reader.IsEmptyElement)
{
HandleEndElement(reader);
}
break;
case XmlNodeType.Text:
HandleText(reader);
break;
case XmlNodeType.Whitespace:
case XmlNodeType.SignificantWhitespace:
break;
case XmlNodeType.CDATA:
break;
case XmlNodeType.EntityReference:
break;
case XmlNodeType.XmlDeclaration:
case XmlNodeType.ProcessingInstruction:
break;
case XmlNodeType.DocumentType:
break;
case XmlNodeType.Comment:
break;
case XmlNodeType.EndElement:
HandleEndElement(reader);
break;
}
}
private void HandleText(XmlReader reader)
{
if (string.IsNullOrEmpty(currentElementLocalName) || string.IsNullOrEmpty(currentElementName))
return;
var name = XName.Get(currentElementLocalName, currentElementNameSpace);
if (!childlessNodes.Contains(name))
return;
var lineIndex = LineNumber(reader) - 1;
var charIndex = LinePosition(reader) - 1;
if (lineIndex < 0 || charIndex < 0)
return;
int startIndex = indices[lineIndex] + charIndex;
// Scan forward in the input string until we find either the beginning of a CDATA section or the end of this element.
// Patterns to match: </Name
//
string pattern1 = "</" + currentElementName;
var index1 = FindElementEnd(passXml, startIndex, pattern1);
if (index1 < 0)
return; // BAD XML.
string pattern2 = "<![CDATA[";
var index2 = passXml.IndexOf(pattern2, startIndex);
int endIndex = (index2 < 0 ? index1 : Math.Min(index1, index2));
var text = passXml.Substring(startIndex, endIndex - startIndex);
var escapeText = XmlEscape(text);
if (escapeText != text)
{
if (escapeText != XmlEscape(escapeText))
{
Debug.Assert(escapeText == XmlEscape(escapeText));
throw new InvalidOperationException("Escaping error");
}
string fixedXml = passXml.Substring(0, startIndex) + escapeText + passXml.Substring(endIndex, passXml.Length - endIndex);
throw new XmlFixedException(new XmlFixData(passXml, fixedXml, lineIndex + 1, charIndex + 1));
}
}
static bool IsXmlSpace(char ch)
{
// http://www.w3.org/TR/2000/REC-xml-20001006#NT-S
// [3] S ::= (#x20 | #x9 | #xD | #xA)+
return ch == '\u0020' || ch == '\u0009' || ch == '\u000D' || ch == '\u000A';
}
private static int FindElementEnd(string passXml, int charPos, string tagEnd)
{
while (true)
{
var index = passXml.IndexOf(tagEnd, charPos);
if (index < 0)
return index;
int endPos = index + tagEnd.Length;
if (index + tagEnd.Length >= passXml.Length)
return -1; // Bad xml?
// Now we must have zero or more white space characters and a ">"
while (endPos < passXml.Length && IsXmlSpace(passXml[endPos]))
endPos++;
if (endPos >= passXml.Length)
return -1; // BAD XML;
if (passXml[endPos] == '>')
return index;
index = endPos;
// Spurious ending, keep searching.
}
}
string currentElementName = string.Empty;
string currentElementNameSpace = string.Empty;
string currentElementLocalName = string.Empty;
private void HandleStartElement(XmlReader reader)
{
currentElementName = reader.Name;
currentElementLocalName = reader.LocalName;
currentElementNameSpace = reader.NamespaceURI;
}
private void HandleEndElement(XmlReader reader)
{
ClearElementData();
}
private void ClearElementData()
{
currentElementName = string.Empty;
currentElementNameSpace = string.Empty;
currentElementLocalName = string.Empty;
}
public XmlDoctorStatus TryFix(out string newXml)
{
XmlFixData data = null;
while (true)
{
XmlFixData newData;
var status = TryFixOnePass((data == null ? OriginalXml : data.FixedXml), out newData);
switch (status)
{
case XmlDoctorStatus.FixFailed:
Debug.WriteLine("Could not fix XML");
newXml = OriginalXml;
return XmlDoctorStatus.FixFailed;
case XmlDoctorStatus.FixMade:
if (data != null && !newData.ComesAfter(data))
{
Debug.WriteLine("Warning -- possible infinite loop detected, aborting fix");
newXml = OriginalXml;
return XmlDoctorStatus.FixFailed;
}
data = newData;
break; // Try to fix more
case XmlDoctorStatus.NoFixNeeded:
if (data == null)
{
newXml = OriginalXml;
return XmlDoctorStatus.NoFixNeeded;
}
else
{
newXml = data.FixedXml;
return XmlDoctorStatus.FixMade;
}
}
}
}
XmlDoctorStatus TryFixOnePass(string xml, out XmlFixData data)
{
try
{
InitializePass(xml);
using (var textReader = new StringReader(passXml))
using (XmlReader reader = XmlReader.Create(textReader))
{
while (true)
{
bool read = reader.Read();
if (!read)
break;
HandleNode(reader);
}
}
}
catch (XmlFixedException ex)
{
// Success - a fix was made.
data = ex.XmlFixData;
return XmlDoctorStatus.FixMade;
}
catch (Exception ex)
{
// Failure - the file was not fixed and could not be parsed.
Debug.WriteLine("Fix Failed: " + ex.ToString());
data = null;
return XmlDoctorStatus.FixFailed;
}
finally
{
EndPass();
}
// No fix needed.
data = null;
return XmlDoctorStatus.NoFixNeeded;
}
}
public static class TextHelper
{
public static void NormalizeLines(string text, out string newText, out List<int> lineIndices)
{
var sb = new StringBuilder();
var indices = new List<int>();
using (var sr = new StringReader(text))
{
string line;
while ((line = sr.ReadLine()) != null)
{
indices.Add(sb.Length);
sb.AppendLine(line);
}
}
lineIndices = indices;
newText = sb.ToString();
}
public static IEnumerable<int> IndexesOf(this string str, string value, int startAt)
{
if (str == null)
yield break;
for (int index = startAt, valueLength = value.Length; ; index += valueLength)
{
index = str.IndexOf(value, index);
if (index == -1)
break;
yield return index;
}
}
}
Then use it like:
public static class TestXmlDoctor
{
public static void TestFix()
{
string xml1 = #"<?xml version=""1.0"" encoding=""UTF-8""?>
<MainClass>
<PrintStatus>N</PrintStatus>
<CustomerPO> >>>> pearl <<<<< </CustomerPO>
<Description>PO# pearl</Description>
<BranchID>4</BranchID>
<PostDate>
<Date>01/13/2015</Date>
</PostDate>
<ShipDate>
<Date>01/13/2015</Date>
</ShipDate>
</MainClass>
";
XName[] childlessNodes1 = new XName[]
{
XName.Get("CustomerPO", string.Empty),
};
try
{
TestFix(xml1, childlessNodes1);
}
catch (Exception ex)
{
Debug.WriteLine(ex);
}
}
public static string TestFix(string xml, IEnumerable<XName> childlessNodes)
{
string fixedXml;
var status = (new XmlDoctor(xml, childlessNodes).TryFix(out fixedXml));
switch (status)
{
case XmlDoctorStatus.NoFixNeeded:
return xml;
case XmlDoctorStatus.FixFailed:
Debug.WriteLine("Failed to fix xml");
return xml;
case XmlDoctorStatus.FixMade:
Debug.WriteLine("Fixed XML, new XML is as follows:");
Debug.WriteLine(fixedXml);
Debug.WriteLine(string.Empty);
return fixedXml;
default:
Debug.Assert(false, "Unknown fix status " + status.ToString());
return xml;
}
}
}
This with this, your XML fragment can be parsed, and becomes:
<?xml version="1.0" encoding="UTF-8"?>
<MainClass>
<PrintStatus>N</PrintStatus>
<CustomerPO> >>>> pearl <<<<< </CustomerPO>
<Description>PO# pearl</Description>
<BranchID>4</BranchID>
<PostDate>
<Date>01/13/2015</Date>
</PostDate>
<ShipDate>
<Date>01/13/2015</Date>
</ShipDate>
</MainClass>
I have rtf documents that include an embedded object (an image). I need to extract this as an Image object (or any other usable format). I have checked out this CodeProject article but the default apps don't render it correctly (They render the 'default image' image, not the image itself), so I moved on.
Here is a sample of the RTF Code (I had to shorten it because of size):
{\rtf1\ansi\deff0{\fonttbl{\f0\fnil\fcharset0 MS Sans Serif;}}
\viewkind4\uc1\pard\lang1033\f0\fs18{\object\objemb{\*\objclass Package}\objw855\objh810{\*\objdata
01050000
02000000
08000000
5061636b61676500
00000000
00000000
1f900000
02007369675f5f2e6a706700433a5c55736572735c726563657074696f6e5c4465736b746f705c
5369676e6174757265735c7369675f5f2e6a7067000000030034000000433a5c55736572735c52
45434550547e315c417070446174615c4c6f63616c5c54656d705c7369675f5f20283132292e6a
706700c18e0000ffd8ffe000104a46494600010101004800470000ffdb00430001010101010101
010101010101010101010101010101010101010101010101010101010101010101010101010101
010101010101010101010101010101010101ffdb00430101010101010101010101010101010101
010101010101010101010101010101010101010101010101010101010101010101010101010101
010101010101010101ffc0001108012c03e803012200021101031101ffc4001f00010002030002
0301000000000000000000090a07080b050602030401ffc4003f10000006030001040201030301
04070900000203040506010708090a11121314152116172223314118192532591a24576598d6d8
2933384651788497b7ffc4001a010101000301010000000000000000000000030204050106ffc4
002b11010003010100020103030402030000000002030401051112130614211522230731415124
32536162ffda000c03010002110311003f00bfc000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000000000000000000000000000000
...
005c0072006500630065007000740069006f006e005c004400650073006b0074006f0070005c00
5300690067006e006100740075007200650073005c007300690067005f005f002e006a00700067
00
01050000
00000000
}{\result{\pict\wmetafile8\picw2010\pich1905\picwgoal855\pichgoal810
0100090000033b0700000200210600000000050000000b0200000000050000000c02350038001c
000000fb02f4ff000000000000900100000001000000005365676f65205549000e0a52104c2308
00dd1900d894ef758001f3758d0e664a040000002d010000050000000902000000000500000001
02ffffff00a5000000410bc600880020002000000000002000200000000c002800000020000000
400000000100010000000000000100000000000000000000000000000000000000000000ffffff
...
0021001c001c000000fb021000070000000000bc02000000000102022253797374656d00008d0e
664a00000a0022008a0100000000ffffffff8cdd1900040000002d010100030000000000
}}}\par
}
Here is a piece of code that can extract all objects ('Package' class objects) from an RTF stream:
public static void ExtractPackageObjects(string filePath)
{
using (StreamReader sr = new StreamReader(filePath))
{
RtfReader reader = new RtfReader(sr);
IEnumerator<RtfObject> enumerator = reader.Read().GetEnumerator();
while(enumerator.MoveNext())
{
if (enumerator.Current.Text == "object")
{
if (RtfReader.MoveToNextControlWord(enumerator, "objclass"))
{
string className = RtfReader.GetNextText(enumerator);
if (className == "Package")
{
if (RtfReader.MoveToNextControlWord(enumerator, "objdata"))
{
byte[] data = RtfReader.GetNextTextAsByteArray(enumerator);
using (MemoryStream packageData = new MemoryStream())
{
RtfReader.ExtractObjectData(new MemoryStream(data), packageData);
packageData.Position = 0;
PackagedObject po = PackagedObject.Extract(packageData);
File.WriteAllBytes(po.DisplayName, po.Data);
}
}
}
}
}
}
}
}
And here are the utility classes that this code uses. There is a simple stream-based RTF parser that allows to get to the interesting control words.
There is also a utility to extract data from a serialized Object Packager instance. Object Packager is an almost 20-years ago OLE1.0 thing and the serialized binary format is not documented (to my knowledge), but it's understandable.
This works fine on your provided sample, but you may have to adapt things around.
public class RtfReader
{
public RtfReader(TextReader reader)
{
if (reader == null)
throw new ArgumentNullException("reader");
Reader = reader;
}
public TextReader Reader { get; private set; }
public IEnumerable<RtfObject> Read()
{
StringBuilder controlWord = new StringBuilder();
StringBuilder text = new StringBuilder();
Stack<RtfParseState> stack = new Stack<RtfParseState>();
RtfParseState state = RtfParseState.Group;
do
{
int i = Reader.Read();
if (i < 0)
{
if (!string.IsNullOrWhiteSpace(controlWord.ToString()))
yield return new RtfControlWord(controlWord.ToString());
if (!string.IsNullOrWhiteSpace(text.ToString()))
yield return new RtfText(text.ToString());
yield break;
}
char c = (char)i;
// noise chars
if ((c == '\r') ||
(c == '\n'))
continue;
switch (state)
{
case RtfParseState.Group:
if (c == '{')
{
stack.Push(state);
break;
}
if (c == '\\')
{
state = RtfParseState.ControlWord;
break;
}
break;
case RtfParseState.ControlWord:
if (c == '\\')
{
// another controlWord
if (!string.IsNullOrWhiteSpace(controlWord.ToString()))
{
yield return new RtfControlWord(controlWord.ToString());
controlWord.Clear();
}
break;
}
if (c == '{')
{
// a new group
state = RtfParseState.Group;
if (!string.IsNullOrWhiteSpace(controlWord.ToString()))
{
yield return new RtfControlWord(controlWord.ToString());
controlWord.Clear();
}
break;
}
if (c == '}')
{
// close group
state = stack.Count > 0 ? stack.Pop() : RtfParseState.Group;
if (!string.IsNullOrWhiteSpace(controlWord.ToString()))
{
yield return new RtfControlWord(controlWord.ToString());
controlWord.Clear();
}
break;
}
if (!Char.IsLetterOrDigit(c))
{
state = RtfParseState.Text;
text.Append(c);
if (!string.IsNullOrWhiteSpace(controlWord.ToString()))
{
yield return new RtfControlWord(controlWord.ToString());
controlWord.Clear();
}
break;
}
controlWord.Append(c);
break;
case RtfParseState.Text:
if (c == '\\')
{
state = RtfParseState.EscapedText;
break;
}
if (c == '{')
{
if (!string.IsNullOrWhiteSpace(text.ToString()))
{
yield return new RtfText(text.ToString());
text.Clear();
}
// a new group
state = RtfParseState.Group;
break;
}
if (c == '}')
{
if (!string.IsNullOrWhiteSpace(text.ToString()))
{
yield return new RtfText(text.ToString());
text.Clear();
}
// close group
state = stack.Count > 0 ? stack.Pop() : RtfParseState.Group;
break;
}
text.Append(c);
break;
case RtfParseState.EscapedText:
if ((c == '\\') || (c == '}') || (c == '{'))
{
state = RtfParseState.Text;
text.Append(c);
break;
}
// ansi character escape
if (c == '\'')
{
text.Append(FromHexa((char)Reader.Read(), (char)Reader.Read()));
break;
}
if (!string.IsNullOrWhiteSpace(text.ToString()))
{
yield return new RtfText(text.ToString());
text.Clear();
}
// in fact, it's a normal controlWord
controlWord.Append(c);
state = RtfParseState.ControlWord;
break;
}
}
while (true);
}
public static bool MoveToNextControlWord(IEnumerator<RtfObject> enumerator, string word)
{
if (enumerator == null)
throw new ArgumentNullException("enumerator");
while (enumerator.MoveNext())
{
if (enumerator.Current.Text == word)
return true;
}
return false;
}
public static string GetNextText(IEnumerator<RtfObject> enumerator)
{
if (enumerator == null)
throw new ArgumentNullException("enumerator");
while (enumerator.MoveNext())
{
RtfText text = enumerator.Current as RtfText;
if (text != null)
return text.Text;
}
return null;
}
public static byte[] GetNextTextAsByteArray(IEnumerator<RtfObject> enumerator)
{
if (enumerator == null)
throw new ArgumentNullException("enumerator");
while (enumerator.MoveNext())
{
RtfText text = enumerator.Current as RtfText;
if (text != null)
{
List<byte> bytes = new List<byte>();
for (int i = 0; i < text.Text.Length; i += 2)
{
bytes.Add((byte)FromHexa(text.Text[i], text.Text[i + 1]));
}
return bytes.ToArray();
}
}
return null;
}
// Extracts an EmbeddedObject/ObjectHeader from a stream
// see [MS -OLEDS]: Object Linking and Embedding (OLE) Data Structures for more information
// chapter 2.2: OLE1.0 Format Structures
public static void ExtractObjectData(Stream inputStream, Stream outputStream)
{
if (inputStream == null)
throw new ArgumentNullException("inputStream");
if (outputStream == null)
throw new ArgumentNullException("outputStream");
BinaryReader reader = new BinaryReader(inputStream);
reader.ReadInt32(); // OLEVersion
int formatId = reader.ReadInt32(); // FormatID
if (formatId != 2) // see 2.2.4 Object Header. 2 means EmbeddedObject
throw new NotSupportedException();
ReadLengthPrefixedAnsiString(reader); // className
ReadLengthPrefixedAnsiString(reader); // topicName
ReadLengthPrefixedAnsiString(reader); // itemName
int nativeDataSize = reader.ReadInt32();
byte[] bytes = reader.ReadBytes(nativeDataSize);
outputStream.Write(bytes, 0, bytes.Length);
}
// see chapter 2.1.4 LengthPrefixedAnsiString
private static string ReadLengthPrefixedAnsiString(BinaryReader reader)
{
int length = reader.ReadInt32();
if (length == 0)
return string.Empty;
byte[] bytes = reader.ReadBytes(length);
return Encoding.Default.GetString(bytes, 0, length - 1);
}
private enum RtfParseState
{
ControlWord,
Text,
EscapedText,
Group
}
private static char FromHexa(char hi, char lo)
{
return (char)byte.Parse(hi.ToString() + lo, NumberStyles.HexNumber);
}
}
// Utility class to parse an OLE1.0 OLEOBJECT
public class PackagedObject
{
private PackagedObject()
{
}
public string DisplayName { get; private set; }
public string IconFilePath { get; private set; }
public int IconIndex { get; private set; }
public string FilePath { get; private set; }
public byte[] Data { get; private set; }
private static string ReadAnsiString(BinaryReader reader)
{
StringBuilder sb = new StringBuilder();
do
{
byte b = reader.ReadByte();
if (b == 0)
return sb.ToString();
sb.Append((char)b);
}
while (true);
}
public static PackagedObject Extract(Stream inputStream)
{
if (inputStream == null)
throw new ArgumentNullException("inputStream");
BinaryReader reader = new BinaryReader(inputStream);
reader.ReadUInt16(); // sig
PackagedObject po = new PackagedObject();
po.DisplayName = ReadAnsiString(reader);
po.IconFilePath = ReadAnsiString(reader);
po.IconIndex = reader.ReadUInt16();
int type = reader.ReadUInt16();
if (type != 3) // 3 is file, 1 is link
throw new NotSupportedException();
reader.ReadInt32(); // nextsize
po.FilePath = ReadAnsiString(reader);
int dataSize = reader.ReadInt32();
po.Data = reader.ReadBytes(dataSize);
// note after that, there may be unicode + long path info
return po;
}
}
public class RtfObject
{
public RtfObject(string text)
{
if (text == null)
throw new ArgumentNullException("text");
Text = text.Trim();
}
public string Text { get; private set; }
}
public class RtfText : RtfObject
{
public RtfText(string text)
: base(text)
{
}
}
public class RtfControlWord : RtfObject
{
public RtfControlWord(string name)
: base(name)
{
}
}
OK, this should work for you. To demonstrate my solution, I created a WinForms project with a PictureBox whose paint event handler was mapped to the following function:
private void rtfImage_Paint(object sender, PaintEventArgs e)
{
string rtfStr = System.IO.File.ReadAllText("MySampleFile.rtf");
string imageDataHex = ExtractImgHex(rtfStr);
byte[] imageBuffer = ToBinary(imageDataHex);
Image image;
using (MemoryStream stream = new MemoryStream(imageBuffer))
{
image = Image.FromStream(stream);
}
Rectangle rect = new Rectangle(0, 0, 100, 100);
e.Graphics.DrawImage(image, rect);
}
This code relies the on the System.Drawing.Image.FromStream() method, along with two "helper" functions:
A string extractor:
string ExtractImgHex(string s)
{
// I'm sure you could use regex here, but this works.
// This assumes one picture per file; loops required otherwise
int pictTagIdx = s.IndexOf("{\\pict\\");
int startIndex = s.IndexOf(" ", pictTagIdx)+1;
int endIndex = s.IndexOf("}", startIndex);
return s.Substring(startIndex, endIndex - startIndex);
}
... and a binary converter:
public static byte[] ToBinary(string imageDataHex)
{
//this function taken entirely from:
// http://www.codeproject.com/Articles/27431/Writing-Your-Own-RTF-Converter
if (imageDataHex == null)
{
throw new ArgumentNullException("imageDataHex");
}
int hexDigits = imageDataHex.Length;
int dataSize = hexDigits / 2;
byte[] imageDataBinary = new byte[dataSize];
StringBuilder hex = new StringBuilder(2);
int dataPos = 0;
for (int i = 0; i < hexDigits; i++)
{
char c = imageDataHex[i];
if (char.IsWhiteSpace(c))
{
continue;
}
hex.Append(imageDataHex[i]);
if (hex.Length == 2)
{
imageDataBinary[dataPos] = byte.Parse(hex.ToString(), System.Globalization.NumberStyles.HexNumber);
dataPos++;
hex.Remove(0, 2);
}
}
return imageDataBinary;
}
Below code can extract all type of embedded objects. including image/docs/mails etc with original file name. And save them in a local path.
string MyDir = #"E:\temp\";
Document doc = new Document(MyDir + "Requirement#4.rtf");
NodeCollection nodeColl = doc.GetChildNodes(NodeType.Shape, true);
foreach (var node in nodeColl)
{
Shape shape1 = (Shape)node;
if (shape1.OleFormat != null)
{
shape1.OleFormat.Save(MyDir + shape1.OleFormat.SuggestedFileName + shape1.OleFormat.SuggestedExtension);
}
}
Is there an easy way to extract text from an Rtf string without using RichTextBox?
Example:
{\rtf1\ansi\ansicpg1252\uc1\htmautsp\deff2{\fonttbl{\f0\fcharset0 Times New Roman;}{\f2\fcharset0 Segoe UI;}}{\colortbl\red0\green0\blue0;\red255\green255\blue255;}\loch\hich\dbch\pard\plain\ltrpar\itap0{\lang1033\fs18\f2\cf0 \cf0\ql{\f2 {\lang2070\ltrch foo}\li0\ri0\sa0\sb0\fi0\ql\par}
{\f2 {\lang2070\ltrch bar }\li0\ri0\sa0\sb0\fi0\ql\par}
}
}
should return:
foo
bar
How to do it in pure C# without any references to other libraries:
This guy wrote a class that strips RTF to plain text just as OP requested.
Here is the source
This is his code:
/// <summary>
/// Rich Text Stripper
/// </summary>
/// <remarks>
/// Translated from Python located at:
/// http://stackoverflow.com/a/188877/448
/// </remarks>
public static class RichTextStripper
{
private class StackEntry
{
public int NumberOfCharactersToSkip { get; set; }
public bool Ignorable { get; set; }
public StackEntry(int numberOfCharactersToSkip, bool ignorable)
{
NumberOfCharactersToSkip = numberOfCharactersToSkip;
Ignorable = ignorable;
}
}
private static readonly Regex _rtfRegex = new Regex(#"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)", RegexOptions.Singleline | RegexOptions.IgnoreCase);
private static readonly List<string> destinations = new List<string>
{
"aftncn","aftnsep","aftnsepc","annotation","atnauthor","atndate","atnicn","atnid",
"atnparent","atnref","atntime","atrfend","atrfstart","author","background",
"bkmkend","bkmkstart","blipuid","buptim","category","colorschememapping",
"colortbl","comment","company","creatim","datafield","datastore","defchp","defpap",
"do","doccomm","docvar","dptxbxtext","ebcend","ebcstart","factoidname","falt",
"fchars","ffdeftext","ffentrymcr","ffexitmcr","ffformat","ffhelptext","ffl",
"ffname","ffstattext","field","file","filetbl","fldinst","fldrslt","fldtype",
"fname","fontemb","fontfile","fonttbl","footer","footerf","footerl","footerr",
"footnote","formfield","ftncn","ftnsep","ftnsepc","g","generator","gridtbl",
"header","headerf","headerl","headerr","hl","hlfr","hlinkbase","hlloc","hlsrc",
"hsv","htmltag","info","keycode","keywords","latentstyles","lchars","levelnumbers",
"leveltext","lfolevel","linkval","list","listlevel","listname","listoverride",
"listoverridetable","listpicture","liststylename","listtable","listtext",
"lsdlockedexcept","macc","maccPr","mailmerge","maln","malnScr","manager","margPr",
"mbar","mbarPr","mbaseJc","mbegChr","mborderBox","mborderBoxPr","mbox","mboxPr",
"mchr","mcount","mctrlPr","md","mdeg","mdegHide","mden","mdiff","mdPr","me",
"mendChr","meqArr","meqArrPr","mf","mfName","mfPr","mfunc","mfuncPr","mgroupChr",
"mgroupChrPr","mgrow","mhideBot","mhideLeft","mhideRight","mhideTop","mhtmltag",
"mlim","mlimloc","mlimlow","mlimlowPr","mlimupp","mlimuppPr","mm","mmaddfieldname",
"mmath","mmathPict","mmathPr","mmaxdist","mmc","mmcJc","mmconnectstr",
"mmconnectstrdata","mmcPr","mmcs","mmdatasource","mmheadersource","mmmailsubject",
"mmodso","mmodsofilter","mmodsofldmpdata","mmodsomappedname","mmodsoname",
"mmodsorecipdata","mmodsosort","mmodsosrc","mmodsotable","mmodsoudl",
"mmodsoudldata","mmodsouniquetag","mmPr","mmquery","mmr","mnary","mnaryPr",
"mnoBreak","mnum","mobjDist","moMath","moMathPara","moMathParaPr","mopEmu",
"mphant","mphantPr","mplcHide","mpos","mr","mrad","mradPr","mrPr","msepChr",
"mshow","mshp","msPre","msPrePr","msSub","msSubPr","msSubSup","msSubSupPr","msSup",
"msSupPr","mstrikeBLTR","mstrikeH","mstrikeTLBR","mstrikeV","msub","msubHide",
"msup","msupHide","mtransp","mtype","mvertJc","mvfmf","mvfml","mvtof","mvtol",
"mzeroAsc","mzeroDesc","mzeroWid","nesttableprops","nextfile","nonesttables",
"objalias","objclass","objdata","object","objname","objsect","objtime","oldcprops",
"oldpprops","oldsprops","oldtprops","oleclsid","operator","panose","password",
"passwordhash","pgp","pgptbl","picprop","pict","pn","pnseclvl","pntext","pntxta",
"pntxtb","printim","private","propname","protend","protstart","protusertbl","pxe",
"result","revtbl","revtim","rsidtbl","rxe","shp","shpgrp","shpinst",
"shppict","shprslt","shptxt","sn","sp","staticval","stylesheet","subject","sv",
"svb","tc","template","themedata","title","txe","ud","upr","userprops",
"wgrffmtfilter","windowcaption","writereservation","writereservhash","xe","xform",
"xmlattrname","xmlattrvalue","xmlclose","xmlname","xmlnstbl",
"xmlopen"
};
private static readonly Dictionary<string, string> specialCharacters = new Dictionary<string, string>
{
{ "par", "\n" },
{ "sect", "\n\n" },
{ "page", "\n\n" },
{ "line", "\n" },
{ "tab", "\t" },
{ "emdash", "\u2014" },
{ "endash", "\u2013" },
{ "emspace", "\u2003" },
{ "enspace", "\u2002" },
{ "qmspace", "\u2005" },
{ "bullet", "\u2022" },
{ "lquote", "\u2018" },
{ "rquote", "\u2019" },
{ "ldblquote", "\u201C" },
{ "rdblquote", "\u201D" },
};
/// <summary>
/// Strip RTF Tags from RTF Text
/// </summary>
/// <param name="inputRtf">RTF formatted text</param>
/// <returns>Plain text from RTF</returns>
public static string StripRichTextFormat(string inputRtf)
{
if (inputRtf == null)
{
return null;
}
string returnString;
var stack = new Stack<StackEntry>();
bool ignorable = false; // Whether this group (and all inside it) are "ignorable".
int ucskip = 1; // Number of ASCII characters to skip after a unicode character.
int curskip = 0; // Number of ASCII characters left to skip
var outList = new List<string>(); // Output buffer.
MatchCollection matches = _rtfRegex.Matches(inputRtf);
if (matches.Count > 0)
{
foreach (Match match in matches)
{
string word = match.Groups[1].Value;
string arg = match.Groups[2].Value;
string hex = match.Groups[3].Value;
string character = match.Groups[4].Value;
string brace = match.Groups[5].Value;
string tchar = match.Groups[6].Value;
if (!String.IsNullOrEmpty(brace))
{
curskip = 0;
if (brace == "{")
{
// Push state
stack.Push(new StackEntry(ucskip, ignorable));
}
else if (brace == "}")
{
// Pop state
StackEntry entry = stack.Pop();
ucskip = entry.NumberOfCharactersToSkip;
ignorable = entry.Ignorable;
}
}
else if (!String.IsNullOrEmpty(character)) // \x (not a letter)
{
curskip = 0;
if (character == "~")
{
if (!ignorable)
{
outList.Add("\xA0");
}
}
else if ("{}\\".Contains(character))
{
if (!ignorable)
{
outList.Add(character);
}
}
else if (character == "*")
{
ignorable = true;
}
}
else if (!String.IsNullOrEmpty(word)) // \foo
{
curskip = 0;
if (destinations.Contains(word))
{
ignorable = true;
}
else if (ignorable)
{
}
else if (specialCharacters.ContainsKey(word))
{
outList.Add(specialCharacters[word]);
}
else if (word == "uc")
{
ucskip = Int32.Parse(arg);
}
else if (word == "u")
{
int c = Int32.Parse(arg);
if (c < 0)
{
c += 0x10000;
}
outList.Add(Char.ConvertFromUtf32(c));
curskip = ucskip;
}
}
else if (!String.IsNullOrEmpty(hex)) // \'xx
{
if (curskip > 0)
{
curskip -= 1;
}
else if (!ignorable)
{
int c = Int32.Parse(hex, System.Globalization.NumberStyles.HexNumber);
outList.Add(Char.ConvertFromUtf32(c));
}
}
else if (!String.IsNullOrEmpty(tchar))
{
if (curskip > 0)
{
curskip -= 1;
}
else if (!ignorable)
{
outList.Add(tchar);
}
}
}
}
else
{
// Didn't match the regex
returnString = inputRtf;
}
returnString = String.Join(String.Empty, outList.ToArray());
return returnString;
}
}
EDIT 1:
In the meantime we had this code running for tests and adapted version in production. The new version does some additional safety checks & handles new lines better.
public static string StripRichTextFormat(string inputRtf)
{
if (inputRtf == null)
{
return null;
}
string returnString;
var stack = new Stack<StackEntry>();
bool ignorable = false; // Whether this group (and all inside it) are "ignorable".
int ucskip = 1; // Number of ASCII characters to skip after a unicode character.
int curskip = 0; // Number of ASCII characters left to skip
var outList = new List<string>(); // Output buffer.
MatchCollection matches = _rtfRegex.Matches(inputRtf);
if (matches.Count > 0)
{
foreach (Match match in matches)
{
string word = match.Groups[1].Value;
string arg = match.Groups[2].Value;
string hex = match.Groups[3].Value;
string character = match.Groups[4].Value;
string brace = match.Groups[5].Value;
string tchar = match.Groups[6].Value;
if (!String.IsNullOrEmpty(brace))
{
curskip = 0;
if (brace == "{")
{
// Push state
stack.Push(new StackEntry(ucskip, ignorable));
}
else if (brace == "}")
{
// Pop state
StackEntry entry = stack.Pop();
ucskip = entry.NumberOfCharactersToSkip;
ignorable = entry.Ignorable;
}
}
else if (!String.IsNullOrEmpty(character)) // \x (not a letter)
{
curskip = 0;
if (character == "~")
{
if (!ignorable)
{
outList.Add("\xA0");
}
}
else if ("{}\\".Contains(character))
{
if (!ignorable)
{
outList.Add(character);
}
}
else if (character == "*")
{
ignorable = true;
}
}
else if (!String.IsNullOrEmpty(word)) // \foo
{
curskip = 0;
if (destinations.Contains(word))
{
ignorable = true;
}
else if (ignorable)
{
}
else if (specialCharacters.ContainsKey(word))
{
outList.Add(specialCharacters[word]);
}
else if (word == "uc")
{
ucskip = Int32.Parse(arg);
}
else if (word == "u")
{
int c = Int32.Parse(arg);
if (c < 0)
{
c += 0x10000;
}
//Ein gültiger UTF32-Wert ist zwischen 0x000000 und 0x10ffff (einschließlich) und sollte keine Ersatzcodepunktwerte (0x00d800 ~ 0x00dfff)
if (c >= 0x000000 && c <= 0x10ffff && (c < 0x00d800 || c > 0x00dfff))
outList.Add(Char.ConvertFromUtf32(c));
else outList.Add("?");
curskip = ucskip;
}
}
else if (!String.IsNullOrEmpty(hex)) // \'xx
{
if (curskip > 0)
{
curskip -= 1;
}
else if (!ignorable)
{
int c = Int32.Parse(hex, System.Globalization.NumberStyles.HexNumber);
outList.Add(Char.ConvertFromUtf32(c));
}
}
else if (!String.IsNullOrEmpty(tchar))
{
if (curskip > 0)
{
curskip -= 1;
}
else if (!ignorable)
{
outList.Add(tchar);
}
}
}
}
else
{
// Didn't match the regex
returnString = inputRtf;
}
returnString = String.Join(String.Empty, outList.ToArray());
return returnString;
}
There's a simple article on MSDN to achieve what you want: http://msdn.microsoft.com/en-us/library/cc488002.aspx
class ConvertFromRTF
{
static void Main()
{
string path = #"test.rtf";
//Create the RichTextBox. (Requires a reference to System.Windows.Forms.dll.)
System.Windows.Forms.RichTextBox rtBox = new System.Windows.Forms.RichTextBox();
// Get the contents of the RTF file. Note that when it is
// stored in the string, it is encoded as UTF-16.
string s = System.IO.File.ReadAllText(path);
// Display the RTF text.
System.Windows.Forms.MessageBox.Show(s);
// Convert the RTF to plain text.
rtBox.Rtf = s;
string plainText = rtBox.Text;
// Display plain text output in MessageBox because console
// cannot display Greek letters.
System.Windows.Forms.MessageBox.Show(plainText);
// Output plain text to file, encoded as UTF-8.
System.IO.File.WriteAllText(#"output.txt", plainText);
}
}
Can't agree with using RichTextBox or any other controls in such kind of tasks. Here is another one approach:
public string RtfToPlainText(string rtf)
{
var flowDocument = new FlowDocument();
var textRange = new TextRange(flowDocument.ContentStart, flowDocument.ContentEnd);
using (var stream = new MemoryStream(Encoding.UTF8.GetBytes(rtf ?? string.Empty)))
{
textRange.Load(stream, DataFormats.Rtf);
}
return textRange.Text;
}