how to save xls file as xlsx file using NPOI c#? - c#

I'm using NPOI to open XLS file, then add some modifications to the XLS file.
at the end i want to save it as XLSX file.
i'm using this code to save it as XLS file:
using (var fs = new FileStream(Name, FileMode.Create, FileAccess.Write))
{
wb.Write(fs);
}
Is it possible to save this XLS file as XLSX file using NPOI in C#?
Thanks in advance for your response

It's an old question, but I've encountered with the similar problem. This code is using NPOI 2.2.1 from https://npoi.codeplex.com/releases. This code is based on code from Convert xlsx file to xls using NPOI in c# question, but it converts xls->xlsx, not xlsx->xls. Plus it converts styles and fixes bug with ranges. This code works for simple workbooks without charts and other complex stuff.
using NPOI.XSSF.UserModel;
using NPOI.SS.UserModel;
using NPOI.SS.Util;
using NPOI.HSSF.UserModel;
using System.Collections.Generic;
using System.Linq;
using System.IO;
namespace XlsToXlsxConverter
{
public static class XLSToXLSXConverter
{
public static byte[] Convert(Stream sourceStream)
{
var source = new HSSFWorkbook(sourceStream);
var destination = new XSSFWorkbook();
for (int i = 0; i < source.NumberOfSheets; i++)
{
var xssfSheet = (XSSFSheet)destination.CreateSheet(source.GetSheetAt(i).SheetName);
var hssfSheet = (HSSFSheet)source.GetSheetAt(i);
CopyStyles(hssfSheet, xssfSheet);
CopySheets(hssfSheet, xssfSheet);
}
using (var ms = new MemoryStream())
{
destination.Write(ms);
return ms.ToArray();
}
}
private static void CopyStyles(HSSFSheet from, XSSFSheet to)
{
for (short i = 0; i <= from.Workbook.NumberOfFonts; i++)
{
CopyFont(to.Workbook.CreateFont(), from.Workbook.GetFontAt(i));
}
for (short i = 0; i < from.Workbook.NumCellStyles; i++)
{
CopyStyle(to.Workbook.CreateCellStyle(), from.Workbook.GetCellStyleAt(i), to.Workbook, from.Workbook);
}
}
private static void CopyFont(IFont toFront, IFont fontFrom)
{
toFront.Boldweight = fontFrom.Boldweight;
toFront.Charset = fontFrom.Charset;
toFront.Color = fontFrom.Color;
toFront.FontHeightInPoints = fontFrom.FontHeightInPoints;
toFront.FontName = fontFrom.FontName;
toFront.IsBold = fontFrom.IsBold;
toFront.IsItalic = fontFrom.IsItalic;
toFront.IsStrikeout = fontFrom.IsStrikeout;
//toFront.Underline = fontFrom.Underline; <- bug in npoi setter
}
private static void CopyStyle(ICellStyle toCellStyle, ICellStyle fromCellStyle, IWorkbook toWorkbook, IWorkbook fromWorkbook)
{
toCellStyle.Alignment = fromCellStyle.Alignment;
toCellStyle.BorderBottom = fromCellStyle.BorderBottom;
toCellStyle.BorderDiagonal = fromCellStyle.BorderDiagonal;
toCellStyle.BorderDiagonalColor = fromCellStyle.BorderDiagonalColor;
toCellStyle.BorderDiagonalLineStyle = fromCellStyle.BorderDiagonalLineStyle;
toCellStyle.BorderLeft = fromCellStyle.BorderLeft;
toCellStyle.BorderRight = fromCellStyle.BorderRight;
toCellStyle.BorderTop = fromCellStyle.BorderTop;
toCellStyle.BottomBorderColor = fromCellStyle.BottomBorderColor;
toCellStyle.DataFormat = fromCellStyle.DataFormat;
toCellStyle.FillBackgroundColor = fromCellStyle.FillBackgroundColor;
toCellStyle.FillForegroundColor = fromCellStyle.FillForegroundColor;
toCellStyle.FillPattern = fromCellStyle.FillPattern;
toCellStyle.Indention = fromCellStyle.Indention;
toCellStyle.IsHidden = fromCellStyle.IsHidden;
toCellStyle.IsLocked = fromCellStyle.IsLocked;
toCellStyle.LeftBorderColor = fromCellStyle.LeftBorderColor;
toCellStyle.RightBorderColor = fromCellStyle.RightBorderColor;
toCellStyle.Rotation = fromCellStyle.Rotation;
toCellStyle.ShrinkToFit = fromCellStyle.ShrinkToFit;
toCellStyle.TopBorderColor = fromCellStyle.TopBorderColor;
toCellStyle.VerticalAlignment = fromCellStyle.VerticalAlignment;
toCellStyle.WrapText = fromCellStyle.WrapText;
toCellStyle.SetFont(toWorkbook.GetFontAt((short)(fromCellStyle.GetFont(fromWorkbook).Index + 1 )));
}
private static void CopySheets(HSSFSheet source, XSSFSheet destination)
{
var maxColumnNum = 0;
var mergedRegions = new List<CellRangeAddress>();
var styleMap = new Dictionary<int, HSSFCellStyle>();
for (int i = source.FirstRowNum; i <= source.LastRowNum; i++)
{
var srcRow = (HSSFRow)source.GetRow(i);
var destRow = (XSSFRow)destination.CreateRow(i);
if (srcRow != null)
{
CopyRow(source, destination, srcRow, destRow, mergedRegions);
if (srcRow.LastCellNum > maxColumnNum)
{
maxColumnNum = srcRow.LastCellNum;
}
}
}
for (int i = 0; i <= maxColumnNum; i++)
{
destination.SetColumnWidth(i, source.GetColumnWidth(i));
}
}
private static void CopyRow(HSSFSheet srcSheet, XSSFSheet destSheet, HSSFRow srcRow, XSSFRow destRow, List<CellRangeAddress> mergedRegions)
{
destRow.Height = srcRow.Height;
for (int j = srcRow.FirstCellNum; srcRow.LastCellNum >= 0 && j <= srcRow.LastCellNum; j++)
{
var oldCell = (HSSFCell)srcRow.GetCell(j);
var newCell = (XSSFCell)destRow.GetCell(j);
if (oldCell != null)
{
if (newCell == null)
{
newCell = (XSSFCell)destRow.CreateCell(j);
}
CopyCell(oldCell, newCell);
var mergedRegion = GetMergedRegion(srcSheet, srcRow.RowNum,
(short)oldCell.ColumnIndex);
if (mergedRegion != null)
{
var newMergedRegion = new CellRangeAddress(mergedRegion.FirstRow,
mergedRegion.LastRow, mergedRegion.FirstColumn, mergedRegion.LastColumn);
if (IsNewMergedRegion(newMergedRegion, mergedRegions))
{
mergedRegions.Add(newMergedRegion);
destSheet.AddMergedRegion(newMergedRegion);
}
}
}
}
}
private static void CopyCell(HSSFCell oldCell, XSSFCell newCell)
{
CopyCellStyle(oldCell, newCell);
CopyCellValue(oldCell, newCell);
}
private static void CopyCellValue(HSSFCell oldCell, XSSFCell newCell)
{
switch (oldCell.CellType)
{
case CellType.String:
newCell.SetCellValue(oldCell.StringCellValue);
break;
case CellType.Numeric:
newCell.SetCellValue(oldCell.NumericCellValue);
break;
case CellType.Blank:
newCell.SetCellType(CellType.Blank);
break;
case CellType.Boolean:
newCell.SetCellValue(oldCell.BooleanCellValue);
break;
case CellType.Error:
newCell.SetCellErrorValue(oldCell.ErrorCellValue);
break;
case CellType.Formula:
newCell.SetCellFormula(oldCell.CellFormula);
break;
default:
break;
}
}
private static void CopyCellStyle(HSSFCell oldCell, XSSFCell newCell)
{
if (oldCell.CellStyle == null) return;
newCell.CellStyle = newCell.Sheet.Workbook.GetCellStyleAt((short)(oldCell.CellStyle.Index + 1));
}
private static CellRangeAddress GetMergedRegion(HSSFSheet sheet, int rowNum, short cellNum)
{
for (var i = 0; i < sheet.NumMergedRegions; i++)
{
var merged = sheet.GetMergedRegion(i);
if (merged.IsInRange(rowNum, cellNum))
{
return merged;
}
}
return null;
}
private static bool IsNewMergedRegion(CellRangeAddress newMergedRegion,
List<CellRangeAddress> mergedRegions)
{
return !mergedRegions.Any(r =>
r.FirstColumn == newMergedRegion.FirstColumn &&
r.LastColumn == newMergedRegion.LastColumn &&
r.FirstRow == newMergedRegion.FirstRow &&
r.LastRow == newMergedRegion.LastRow);
}
}
}

It is possible in general, but not quite easy, rather it is significantly complicated task.
XLS file format is handled by HSSFWorkbook class (and according HSSFSheet and so on).
XLSX file format is handled by XSSFWorkbook class (and XSSFSheet and so on).
So in order to save your file as XLSX after you opened and modified it using NPOI, you need to create new XSSFWorkbook, then for each worksheet of your source file you need to create appropriate XSSFSheet, copy data to it from your original worksheet and so on until you will get full copy of your data, and then save workbook to file.

Related

Parsing CSV File with double quotes [duplicate]

Is there a default/official/recommended way to parse CSV files in C#? I don't want to roll my own parser.
Also, I've seen instances of people using ODBC/OLE DB to read CSV via the Text driver, and a lot of people discourage this due to its "drawbacks." What are these drawbacks?
Ideally, I'm looking for a way through which I can read the CSV by column name, using the first record as the header / field names. Some of the answers given are correct but work to basically deserialize the file into classes.
A CSV parser is now a part of .NET Framework.
Add a reference to Microsoft.VisualBasic.dll (works fine in C#, don't mind the name)
using (TextFieldParser parser = new TextFieldParser(#"c:\temp\test.csv"))
{
parser.TextFieldType = FieldType.Delimited;
parser.SetDelimiters(",");
while (!parser.EndOfData)
{
//Process row
string[] fields = parser.ReadFields();
foreach (string field in fields)
{
//TODO: Process field
}
}
}
The docs are here - TextFieldParser Class
P.S. If you need a CSV exporter, try CsvExport (discl: I'm one of the contributors)
CsvHelper (a library I maintain) will read a CSV file into custom objects.
using (var reader = new StreamReader("path\\to\\file.csv"))
using (var csv = new CsvReader(reader, CultureInfo.InvariantCulture))
{
var records = csv.GetRecords<Foo>();
}
Sometimes you don't own the objects you're trying to read into. In this case, you can use fluent mapping because you can't put attributes on the class.
public sealed class MyCustomObjectMap : CsvClassMap<MyCustomObject>
{
public MyCustomObjectMap()
{
Map( m => m.Property1 ).Name( "Column Name" );
Map( m => m.Property2 ).Index( 4 );
Map( m => m.Property3 ).Ignore();
Map( m => m.Property4 ).TypeConverter<MySpecialTypeConverter>();
}
}
Let a library handle all the nitty-gritty details for you! :-)
Check out FileHelpers and stay DRY - Don't Repeat Yourself - no need to re-invent the wheel a gazillionth time....
You basically just need to define that shape of your data - the fields in your individual line in the CSV - by means of a public class (and so well-thought out attributes like default values, replacements for NULL values and so forth), point the FileHelpers engine at a file, and bingo - you get back all the entries from that file. One simple operation - great performance!
In a business application, i use the Open Source project on codeproject.com, CSVReader.
It works well, and has good performance. There is some benchmarking on the link i provided.
A simple example, copied from the project page:
using (CsvReader csv = new CsvReader(new StreamReader("data.csv"), true))
{
int fieldCount = csv.FieldCount;
string[] headers = csv.GetFieldHeaders();
while (csv.ReadNextRecord())
{
for (int i = 0; i < fieldCount; i++)
Console.Write(string.Format("{0} = {1};", headers[i], csv[i]));
Console.WriteLine();
}
}
As you can see, it's very easy to work with.
I know its a bit late but just found a library Microsoft.VisualBasic.FileIO which has TextFieldParser class to process csv files.
Here is a helper class I use often, in case any one ever comes back to this thread (I wanted to share it).
I use this for the simplicity of porting it into projects ready to use:
public class CSVHelper : List<string[]>
{
protected string csv = string.Empty;
protected string separator = ",";
public CSVHelper(string csv, string separator = "\",\"")
{
this.csv = csv;
this.separator = separator;
foreach (string line in Regex.Split(csv, System.Environment.NewLine).ToList().Where(s => !string.IsNullOrEmpty(s)))
{
string[] values = Regex.Split(line, separator);
for (int i = 0; i < values.Length; i++)
{
//Trim values
values[i] = values[i].Trim('\"');
}
this.Add(values);
}
}
}
And use it like:
public List<Person> GetPeople(string csvContent)
{
List<Person> people = new List<Person>();
CSVHelper csv = new CSVHelper(csvContent);
foreach(string[] line in csv)
{
Person person = new Person();
person.Name = line[0];
person.TelephoneNo = line[1];
people.Add(person);
}
return people;
}
[Updated csv helper: bug fixed where the last new line character created a new line]
If you need only reading csv files then I recommend this library: A Fast CSV Reader
If you also need to generate csv files then use this one: FileHelpers
Both of them are free and opensource.
This solution is using the official Microsoft.VisualBasic assembly to parse CSV.
Advantages:
delimiter escaping
ignores Header
trim spaces
ignore comments
Code:
using Microsoft.VisualBasic.FileIO;
public static List<List<string>> ParseCSV (string csv)
{
List<List<string>> result = new List<List<string>>();
// To use the TextFieldParser a reference to the Microsoft.VisualBasic assembly has to be added to the project.
using (TextFieldParser parser = new TextFieldParser(new StringReader(csv)))
{
parser.CommentTokens = new string[] { "#" };
parser.SetDelimiters(new string[] { ";" });
parser.HasFieldsEnclosedInQuotes = true;
// Skip over header line.
//parser.ReadLine();
while (!parser.EndOfData)
{
var values = new List<string>();
var readFields = parser.ReadFields();
if (readFields != null)
values.AddRange(readFields);
result.Add(values);
}
}
return result;
}
I have written TinyCsvParser for .NET, which is one of the fastest .NET parsers around and highly configurable to parse almost any CSV format.
It is released under MIT License:
https://github.com/bytefish/TinyCsvParser
You can use NuGet to install it. Run the following command in the Package Manager Console.
PM> Install-Package TinyCsvParser
Usage
Imagine we have list of Persons in a CSV file persons.csv with their first name, last name and birthdate.
FirstName;LastName;BirthDate
Philipp;Wagner;1986/05/12
Max;Musterman;2014/01/02
The corresponding domain model in our system might look like this.
private class Person
{
public string FirstName { get; set; }
public string LastName { get; set; }
public DateTime BirthDate { get; set; }
}
When using TinyCsvParser you have to define the mapping between the columns in the CSV data and the property in you domain model.
private class CsvPersonMapping : CsvMapping<Person>
{
public CsvPersonMapping()
: base()
{
MapProperty(0, x => x.FirstName);
MapProperty(1, x => x.LastName);
MapProperty(2, x => x.BirthDate);
}
}
And then we can use the mapping to parse the CSV data with a CsvParser.
namespace TinyCsvParser.Test
{
[TestFixture]
public class TinyCsvParserTest
{
[Test]
public void TinyCsvTest()
{
CsvParserOptions csvParserOptions = new CsvParserOptions(true, new[] { ';' });
CsvPersonMapping csvMapper = new CsvPersonMapping();
CsvParser<Person> csvParser = new CsvParser<Person>(csvParserOptions, csvMapper);
var result = csvParser
.ReadFromFile(#"persons.csv", Encoding.ASCII)
.ToList();
Assert.AreEqual(2, result.Count);
Assert.IsTrue(result.All(x => x.IsValid));
Assert.AreEqual("Philipp", result[0].Result.FirstName);
Assert.AreEqual("Wagner", result[0].Result.LastName);
Assert.AreEqual(1986, result[0].Result.BirthDate.Year);
Assert.AreEqual(5, result[0].Result.BirthDate.Month);
Assert.AreEqual(12, result[0].Result.BirthDate.Day);
Assert.AreEqual("Max", result[1].Result.FirstName);
Assert.AreEqual("Mustermann", result[1].Result.LastName);
Assert.AreEqual(2014, result[1].Result.BirthDate.Year);
Assert.AreEqual(1, result[1].Result.BirthDate.Month);
Assert.AreEqual(1, result[1].Result.BirthDate.Day);
}
}
}
User Guide
A full User Guide is available at:
http://bytefish.github.io/TinyCsvParser/
Here is a short and simple solution.
using (TextFieldParser parser = new TextFieldParser(outputLocation))
{
parser.TextFieldType = FieldType.Delimited;
parser.SetDelimiters(",");
string[] headers = parser.ReadLine().Split(',');
foreach (string header in headers)
{
dataTable.Columns.Add(header);
}
while (!parser.EndOfData)
{
string[] fields = parser.ReadFields();
dataTable.Rows.Add(fields);
}
}
Here is my KISS implementation...
using System;
using System.Collections.Generic;
using System.Text;
class CsvParser
{
public static List<string> Parse(string line)
{
const char escapeChar = '"';
const char splitChar = ',';
bool inEscape = false;
bool priorEscape = false;
List<string> result = new List<string>();
StringBuilder sb = new StringBuilder();
for (int i = 0; i < line.Length; i++)
{
char c = line[i];
switch (c)
{
case escapeChar:
if (!inEscape)
inEscape = true;
else
{
if (!priorEscape)
{
if (i + 1 < line.Length && line[i + 1] == escapeChar)
priorEscape = true;
else
inEscape = false;
}
else
{
sb.Append(c);
priorEscape = false;
}
}
break;
case splitChar:
if (inEscape) //if in escape
sb.Append(c);
else
{
result.Add(sb.ToString());
sb.Length = 0;
}
break;
default:
sb.Append(c);
break;
}
}
if (sb.Length > 0)
result.Add(sb.ToString());
return result;
}
}
Some time ago I had wrote simple class for CSV read/write based on Microsoft.VisualBasic library. Using this simple class you will be able to work with CSV like with 2 dimensions array. You can find my class by the following link: https://github.com/ukushu/DataExporter
Simple example of usage:
Csv csv = new Csv("\t");//delimiter symbol
csv.FileOpen("c:\\file1.csv");
var row1Cell6Value = csv.Rows[0][5];
csv.AddRow("asdf","asdffffff","5")
csv.FileSave("c:\\file2.csv");
For reading header only you need is to read csv.Rows[0] cells :)
This code reads csv to DataTable:
public static DataTable ReadCsv(string path)
{
DataTable result = new DataTable("SomeData");
using (TextFieldParser parser = new TextFieldParser(path))
{
parser.TextFieldType = FieldType.Delimited;
parser.SetDelimiters(",");
bool isFirstRow = true;
//IList<string> headers = new List<string>();
while (!parser.EndOfData)
{
string[] fields = parser.ReadFields();
if (isFirstRow)
{
foreach (string field in fields)
{
result.Columns.Add(new DataColumn(field, typeof(string)));
}
isFirstRow = false;
}
else
{
int i = 0;
DataRow row = result.NewRow();
foreach (string field in fields)
{
row[i++] = field;
}
result.Rows.Add(row);
}
}
}
return result;
}
Single source file solution for straightforward parsing needs, useful. Deals with all the nasty edge cases. Such as new line normalization and handling new lines in quoted string literals. Your welcome!
If you CSV file has a header you just read out the column names (and compute column indexes) from the first row. Simple as that.
Note that Dump is a LINQPad method, you might want to remove that if you are not using LINQPad.
void Main()
{
var file1 = "a,b,c\r\nx,y,z";
CSV.ParseText(file1).Dump();
var file2 = "a,\"b\",c\r\nx,\"y,z\"";
CSV.ParseText(file2).Dump();
var file3 = "a,\"b\",c\r\nx,\"y\r\nz\"";
CSV.ParseText(file3).Dump();
var file4 = "\"\"\"\"";
CSV.ParseText(file4).Dump();
}
static class CSV
{
public struct Record
{
public readonly string[] Row;
public string this[int index] => Row[index];
public Record(string[] row)
{
Row = row;
}
}
public static List<Record> ParseText(string text)
{
return Parse(new StringReader(text));
}
public static List<Record> ParseFile(string fn)
{
using (var reader = File.OpenText(fn))
{
return Parse(reader);
}
}
public static List<Record> Parse(TextReader reader)
{
var data = new List<Record>();
var col = new StringBuilder();
var row = new List<string>();
for (; ; )
{
var ln = reader.ReadLine();
if (ln == null) break;
if (Tokenize(ln, col, row))
{
data.Add(new Record(row.ToArray()));
row.Clear();
}
}
return data;
}
public static bool Tokenize(string s, StringBuilder col, List<string> row)
{
int i = 0;
if (col.Length > 0)
{
col.AppendLine(); // continuation
if (!TokenizeQuote(s, ref i, col, row))
{
return false;
}
}
while (i < s.Length)
{
var ch = s[i];
if (ch == ',')
{
row.Add(col.ToString().Trim());
col.Length = 0;
i++;
}
else if (ch == '"')
{
i++;
if (!TokenizeQuote(s, ref i, col, row))
{
return false;
}
}
else
{
col.Append(ch);
i++;
}
}
if (col.Length > 0)
{
row.Add(col.ToString().Trim());
col.Length = 0;
}
return true;
}
public static bool TokenizeQuote(string s, ref int i, StringBuilder col, List<string> row)
{
while (i < s.Length)
{
var ch = s[i];
if (ch == '"')
{
// escape sequence
if (i + 1 < s.Length && s[i + 1] == '"')
{
col.Append('"');
i++;
i++;
continue;
}
i++;
return true;
}
else
{
col.Append(ch);
i++;
}
}
return false;
}
}
Another one to this list, Cinchoo ETL - an open source library to read and write multiple file formats (CSV, flat file, Xml, JSON etc)
Sample below shows how to read CSV file quickly (No POCO object required)
string csv = #"Id, Name
1, Carl
2, Tom
3, Mark";
using (var p = ChoCSVReader.LoadText(csv)
.WithFirstLineHeader()
)
{
foreach (var rec in p)
{
Console.WriteLine($"Id: {rec.Id}");
Console.WriteLine($"Name: {rec.Name}");
}
}
Sample below shows how to read CSV file using POCO object
public partial class EmployeeRec
{
public int Id { get; set; }
public string Name { get; set; }
}
static void CSVTest()
{
string csv = #"Id, Name
1, Carl
2, Tom
3, Mark";
using (var p = ChoCSVReader<EmployeeRec>.LoadText(csv)
.WithFirstLineHeader()
)
{
foreach (var rec in p)
{
Console.WriteLine($"Id: {rec.Id}");
Console.WriteLine($"Name: {rec.Name}");
}
}
}
Please check out articles at CodeProject on how to use it.
This parser supports nested commas and quotes in a column:
static class CSVParser
{
public static string[] ParseLine(string line)
{
List<string> cols = new List<string>();
string value = null;
for(int i = 0; i < line.Length; i++)
{
switch(line[i])
{
case ',':
cols.Add(value);
value = null;
if(i == line.Length - 1)
{// It ends with comma
cols.Add(null);
}
break;
case '"':
cols.Add(ParseEnclosedColumn(line, ref i));
i++;
break;
default:
value += line[i];
if (i == line.Length - 1)
{// Last character
cols.Add(value);
}
break;
}
}
return cols.ToArray();
}//ParseLine
static string ParseEnclosedColumn(string line, ref int index)
{// Example: "b"",bb"
string value = null;
int numberQuotes = 1;
int index2 = index;
for (int i = index + 1; i < line.Length; i++)
{
index2 = i;
switch (line[i])
{
case '"':
numberQuotes++;
if (numberQuotes % 2 == 0)
{
if (i < line.Length - 1 && line[i + 1] == ',')
{
index = i;
return value;
}
}
else if (i > index + 1 && line[i - 1] == '"')
{
value += '"';
}
break;
default:
value += line[i];
break;
}
}
index = index2;
return value;
}//ParseEnclosedColumn
}//class CSVParser
Based on unlimit's post on How to properly split a CSV using C# split() function? :
string[] tokens = System.Text.RegularExpressions.Regex.Split(paramString, ",");
NOTE: this doesn't handle escaped / nested commas, etc., and therefore is only suitable for certain simple CSV lists.
If anyone wants a snippet they can plop into their code without having to bind a library or download a package. Here is a version I wrote:
public static string FormatCSV(List<string> parts)
{
string result = "";
foreach (string s in parts)
{
if (result.Length > 0)
{
result += ",";
if (s.Length == 0)
continue;
}
if (s.Length > 0)
{
result += "\"" + s.Replace("\"", "\"\"") + "\"";
}
else
{
// cannot output double quotes since its considered an escape for a quote
result += ",";
}
}
return result;
}
enum CSVMode
{
CLOSED = 0,
OPENED_RAW = 1,
OPENED_QUOTE = 2
}
public static List<string> ParseCSV(string input)
{
List<string> results;
CSVMode mode;
char[] letters;
string content;
mode = CSVMode.CLOSED;
content = "";
results = new List<string>();
letters = input.ToCharArray();
for (int i = 0; i < letters.Length; i++)
{
char letter = letters[i];
char nextLetter = '\0';
if (i < letters.Length - 1)
nextLetter = letters[i + 1];
// If its a quote character
if (letter == '"')
{
// If that next letter is a quote
if (nextLetter == '"' && mode == CSVMode.OPENED_QUOTE)
{
// Then this quote is escaped and should be added to the content
content += letter;
// Skip the escape character
i++;
continue;
}
else
{
// otherwise its not an escaped quote and is an opening or closing one
// Character is skipped
// If it was open, then close it
if (mode == CSVMode.OPENED_QUOTE)
{
results.Add(content);
// reset the content
content = "";
mode = CSVMode.CLOSED;
// If there is a next letter available
if (nextLetter != '\0')
{
// If it is a comma
if (nextLetter == ',')
{
i++;
continue;
}
else
{
throw new Exception("Expected comma. Found: " + nextLetter);
}
}
}
else if (mode == CSVMode.OPENED_RAW)
{
// If it was opened raw, then just add the quote
content += letter;
}
else if (mode == CSVMode.CLOSED)
{
// Otherwise open it as a quote
mode = CSVMode.OPENED_QUOTE;
}
}
}
// If its a comma seperator
else if (letter == ',')
{
// If in quote mode
if (mode == CSVMode.OPENED_QUOTE)
{
// Just read it
content += letter;
}
// If raw, then close the content
else if (mode == CSVMode.OPENED_RAW)
{
results.Add(content);
content = "";
mode = CSVMode.CLOSED;
}
// If it was closed, then open it raw
else if (mode == CSVMode.CLOSED)
{
mode = CSVMode.OPENED_RAW;
results.Add(content);
content = "";
}
}
else
{
// If opened quote, just read it
if (mode == CSVMode.OPENED_QUOTE)
{
content += letter;
}
// If opened raw, then read it
else if (mode == CSVMode.OPENED_RAW)
{
content += letter;
}
// It closed, then open raw
else if (mode == CSVMode.CLOSED)
{
mode = CSVMode.OPENED_RAW;
content += letter;
}
}
}
// If it was still reading when the buffer finished
if (mode != CSVMode.CLOSED)
{
results.Add(content);
}
return results;
}
For smaller input CSV data LINQ is fully enough.
For example for the following CSV file content:
schema_name,description,utype
"IX_HE","High-Energy data","x"
"III_spectro","Spectrosopic data","d"
"VI_misc","Miscellaneous","f"
"vcds1","Catalogs only available in CDS","d"
"J_other","Publications from other journals","b"
when we read the whole content into single string called data, then
using System;
using System.IO;
using System.Linq;
var data = File.ReadAllText(Path2CSV);
// helper split characters
var newline = Environment.NewLine.ToCharArray();
var comma = ",".ToCharArray();
var quote = "\"".ToCharArray();
// split input string data to lines
var lines = data.Split(newline);
// first line is header, take the header fields
foreach (var col in lines.First().Split(comma)) {
// do something with "col"
}
// we skip the first line, all the rest are real data lines/fields
foreach (var line in lines.Skip(1)) {
// first we split the data line by comma character
// next we remove double qoutes from each splitted element using Trim()
// finally we make an array
var fields = line.Split(comma)
.Select(_ => { _ = _.Trim(quote); return _; })
.ToArray();
// do something with the "fields" array
}

Get bookmark coordinates in PDF

I'm working in placing signature image to PDF at exact place. In PDF I add with report tool bookmark so I could know where to place signature image. Problem is I don't know how to get bookmark coordinates. Is it with ITextsharp possible?
------>EDIT
Picture of bookmark
----->EDIT
Solution is:
PdfReader pdfReader = new PdfReader(GlobalVars.PdfFile);
IList<Dictionary<string, object>> bookmarks = SimpleBookmark.GetBookmark(pdfReader);
string BookmarkID = "";
for (int j = 0; j < bookmarks.Count; j++)
{
//MessageBox.Show(bookmarks[i].Values.ToArray().GetValue(0).ToString());
string s = bookmarks[j].Values.ToArray().GetValue(0).ToString();
if (bookmarks[j].Values.ToArray().GetValue(0).ToString() == "##PODPIS##")
{
BookmarkID = bookmarks[j].Values.ToArray().GetValue(1).ToString();
}
}
var map = SimpleNamedDestination.GetNamedDestination(pdfReader, true);
foreach (KeyValuePair<string, string> entry in map)
{
if (entry.Key.ToString() == BookmarkID)
{
string[] LocationArray = entry.Value.Split(null);
GlobalVars.SignatuePageNumber = Convert.ToInt32(LocationArray[0]);
GlobalVars.SignatureX = float.Parse(LocationArray[2], CultureInfo.InvariantCulture.NumberFormat);
GlobalVars.SignatureY = float.Parse(LocationArray[3], CultureInfo.InvariantCulture.NumberFormat);
}
}
Solution is:
PdfReader pdfReader = new PdfReader(GlobalVars.PdfFile);
IList<Dictionary<string, object>> bookmarks = SimpleBookmark.GetBookmark(pdfReader);
string BookmarkID = "";
for (int j = 0; j < bookmarks.Count; j++)
{
//MessageBox.Show(bookmarks[i].Values.ToArray().GetValue(0).ToString());
string s = bookmarks[j].Values.ToArray().GetValue(0).ToString();
if (bookmarks[j].Values.ToArray().GetValue(0).ToString() == "##PODPIS##")
{
BookmarkID = bookmarks[j].Values.ToArray().GetValue(1).ToString();
}
}
var map = SimpleNamedDestination.GetNamedDestination(pdfReader, true);
foreach (KeyValuePair<string, string> entry in map)
{
if (entry.Key.ToString() == BookmarkID)
{
string[] LocationArray = entry.Value.Split(null);
GlobalVars.SignatuePageNumber = Convert.ToInt32(LocationArray[0]);
GlobalVars.SignatureX = float.Parse(LocationArray[2], CultureInfo.InvariantCulture.NumberFormat);
GlobalVars.SignatureY = float.Parse(LocationArray[3], CultureInfo.InvariantCulture.NumberFormat);
}
}

Get the positions of unique elements in a string[]

I have an xml file that I am accessing to create a report of time spent on a project. I'm returning the unique dates to a label created dynamically on a winform and would like to compile the time spent on a project for each unique date. I have been able to return all of the projects under each date or only one project. Currently I'm stuck on only returning one project. Can anyone please help me?? This is what the data should look like if it's correct.
04/11/15
26820 2.25
27111 8.00
04/12/15
26820 8.00
04/13/15
01det 4.33
26820 1.33
27225 4.25
etc.
This is how I'm retrieving the data
string[] weekDateString = elementDateWeekstring();
string[] uniqueDates = null;
string[] weeklyJobNumber = elementJobNumWeek();
string[] weeklyTicks = elementTicksWeek();
This is how I'm getting the unique dates.
IEnumerable<string> distinctWeekDateIE = weekDateString.Distinct();
foreach (string d in distinctWeekDateIE)
{
uniqueDates = distinctWeekDateIE.ToArray();
}
And this is how I'm creating the labels.
try
{
int dateCount;
dateCount = uniqueDates.Length;
Label[] lblDate = new Label[dateCount];
int htDate = 1;
int padDate = 10;
for (int i = 0; i < dateCount; i++ )
{
lblDate[i] = new Label();
lblDate[i].Name = uniqueDates[i].Trim('\r');
lblDate[i].Text = uniqueDates[i];
lblDate[i].TabIndex = i;
lblDate[i].Bounds = new Rectangle(18, 275 + padDate + htDate, 75, 22);
targetForm.Controls.Add(lblDate[i]);
htDate += 22;
foreach (string x in uniqueDates)
{
int[] posJobNumber;
posJobNumber = weekDateString.Select((b, a) => b == uniqueDates[i].ToString() ? a : -1).Where(a => a != -1).ToArray();
for (int pjn = 0; pjn < posJobNumber.Length; pjn++)
{
if (x.Equals(lblDate[i].Text))
{
Label lblJobNum = new Label();
int htJobNum = 1;
int padJobNum = 10;
lblJobNum.Name = weeklyJobNumber[i];
lblJobNum.Text = weeklyJobNumber[i];
lblJobNum.Bounds = new Rectangle(100, 295 + padJobNum + htJobNum, 75, 22);
targetForm.Controls.Add(lblJobNum);
htJobNum += 22;
htDate += 22;
padJobNum += 22;
}
}
}
}
}
I've been stuck on this for about 3 months. Is there anyone that can describe to me why I'm not able to properly retrieve the job numbers that are associated with a particular date. I don't believe that these are specifically being returned as dates. Just a string that looks like a date.
I really appreciate any help I can get. I'm just completely baffled. Thank you for any responses in advance. I truly appreciate the assistance.
EDIT: #Sayka - Here is the xml sample.
<?xml version="1.0" encoding="utf-8"?>
<Form1>
<Name Key="4/21/2014 6:51:17 AM">
<Date>4/21/2014</Date>
<JobNum>26820</JobNum>
<RevNum>00000</RevNum>
<Task>Modeling Secondary</Task>
<Start>06:51 AM</Start>
<End>04:27 PM</End>
<TotalTime>345945089017</TotalTime>
</Name>
<Name Key="4/22/2014 5:44:22 AM">
<Date>4/22/2014</Date>
<JobNum>26820</JobNum>
<RevNum>00000</RevNum>
<Task>Modeling Secondary</Task>
<Start>05:44 AM</Start>
<End>06:56 AM</End>
<TotalTime>43514201221</TotalTime>
</Name>
<Name Key="4/22/2014 6:57:02 AM">
<Date>4/22/2014</Date>
<JobNum>02e-n-g</JobNum>
<RevNum>00000</RevNum>
<Task>NET Eng</Task>
<Start>06:57 AM</Start>
<End>07:16 AM</End>
<TotalTime>11706118875</TotalTime>
</Name>
....
</Form1>
This is how I'm getting the information out of the xml file and returning a string[].
public static string[] elementDateWeekstring()
{
//string datetxtWeek = "";
XmlDocument xmldoc = new XmlDocument();
fileExistsWeek(xmldoc);
XmlNodeList nodeDate = xmldoc.GetElementsByTagName("Date");
int countTicks = 0;
string[] dateTxtWeek = new string[nodeDate.Count];
for (int i = 0; i < nodeDate.Count; i++)
{
dateTxtWeek[i] = nodeDate[i].InnerText;
countTicks++;
}
return dateTxtWeek;
}
Job number and Ticks are returned in a similar fashion. I've been able to reuse these snippets throught out the code. This is a one dimensional xml file?? It will always return a position for a jobnumber that equates to a date or Ticks. I will never have more or less of any one element.
You can use Linq-to-XML to parse the XML file, and then use Linq-to-objects to group (and order) the data by job date and order each group by job name.
The code to parse the XML file is like so:
var doc = XDocument.Load(filename);
var jobs = doc.Descendants("Name");
// Extract the date, job number, and total time from each "Name" element.:
var data = jobs.Select(job => new
{
Date = (DateTime)job.Element("Date"),
Number = (string)job.Element("JobNum"),
Duration = TimeSpan.FromTicks((long)job.Element("TotalTime"))
});
The code to group and order the jobs by date and order the groups by job name is:
var result =
data.GroupBy(job => job.Date).OrderBy(g => g.Key)
.Select(g => new
{
Date = g.Key,
Jobs = g.OrderBy(item => item.Number)
});
Then you can access the data by iterating over each group in result and then iterate over each job in the group, like so:
foreach (var jobsOnDate in result)
{
Console.WriteLine("{0:d}", jobsOnDate.Date);
foreach (var job in jobsOnDate.Jobs)
Console.WriteLine(" {0} {1:hh\\:mm}", job.Number, job.Duration);
}
Putting this all together in a sample compilable console application (substitute the filename for the XML file as appropriate):
using System;
using System.Linq;
using System.Xml.Linq;
namespace ConsoleApplication2
{
class Program
{
private static void Main()
{
string filename = #"d:\test\test.xml"; // Substitute your own filename here.
// Open XML file and get a collection of each "Name" element.
var doc = XDocument.Load(filename);
var jobs = doc.Descendants("Name");
// Extract the date, job number, and total time from each "Name" element.:
var data = jobs.Select(job => new
{
Date = (DateTime)job.Element("Date"),
Number = (string)job.Element("JobNum"),
Duration = TimeSpan.FromTicks((long)job.Element("TotalTime"))
});
// Group the jobs by date, and order the groups by job name:
var result =
data.GroupBy(job => job.Date).OrderBy(g => g.Key)
.Select(g => new
{
Date = g.Key,
Jobs = g.OrderBy(item => item.Number)
});
// Print out the results:
foreach (var jobsOnDate in result)
{
Console.WriteLine("{0:d}", jobsOnDate.Date);
foreach (var job in jobsOnDate.Jobs)
Console.WriteLine(" {0} {1:hh\\:mm}", job.Number, job.Duration);
}
}
}
}
The output is like this
Create a new project
Set form size bigger.
Apply these codes.
Set the location for your XML file.
Namespaces
using System.Xml;
using System.IO;
Form Code
public partial class Form1 : Form
{
const string XML_FILE_NAME = "D:\\emps.txt";
public Form1()
{
InitializeComponent();
}
private void Form1_Load(object sender, EventArgs e)
{
prepareDataGrid();
List<JOBS> jobsList = prepareXML(XML_FILE_NAME);
for (int i = 0; i < jobsList.Count; i++)
{
addDateRow(jobsList[i].jobDate.ToString("M'/'d'/'yyyy"));
for (int j = 0; j < jobsList[i].jobDetailsList.Count; j++)
dgv.Rows.Add(new string[] {
jobsList[i].jobDetailsList[j].JobNumber,
jobsList[i].jobDetailsList[j].JobHours
});
}
}
DataGridView dgv;
void prepareDataGrid()
{
dgv = new DataGridView();
dgv.BackgroundColor = Color.White;
dgv.GridColor = Color.White;
dgv.DefaultCellStyle.SelectionBackColor = Color.White;
dgv.DefaultCellStyle.SelectionForeColor = Color.Black;
dgv.DefaultCellStyle.ForeColor = Color.Black;
dgv.DefaultCellStyle.BackColor = Color.White;
dgv.DefaultCellStyle.Alignment = DataGridViewContentAlignment.MiddleRight;
dgv.Width = 600;
dgv.Dock = DockStyle.Left;
this.BackColor = Color.White;
dgv.Columns.Add("Col1", "Col1");
dgv.Columns.Add("Col2", "Col2");
dgv.Columns[0].Width = 110;
dgv.Columns[1].Width = 40;
dgv.DefaultCellStyle.Font = new System.Drawing.Font("Segoe UI", 10);
dgv.RowHeadersVisible = dgv.ColumnHeadersVisible = false;
dgv.AllowUserToAddRows =
dgv.AllowUserToDeleteRows =
dgv.AllowUserToOrderColumns =
dgv.AllowUserToResizeColumns =
dgv.AllowUserToResizeRows =
!(dgv.ReadOnly = true);
Controls.Add(dgv);
}
void addJobRow(string jobNum, string jobHours)
{
dgv.Rows.Add(new string[] {jobNum, jobHours });
}
void addDateRow(string date)
{
dgv.Rows.Add(new string[] { date, ""});
dgv.Rows[dgv.Rows.Count - 1].DefaultCellStyle.SelectionForeColor =
dgv.Rows[dgv.Rows.Count - 1].DefaultCellStyle.ForeColor = Color.Firebrick;
dgv.Rows[dgv.Rows.Count - 1].DefaultCellStyle.Font = new Font("Segoe UI Light", 13.5F);
dgv.Rows[dgv.Rows.Count - 1].DefaultCellStyle.Alignment = DataGridViewContentAlignment.MiddleLeft;
dgv.Rows[dgv.Rows.Count - 1].Height = 25;
}
List<JOBS> prepareXML(string fileName)
{
string xmlContent = "";
using (FileStream fs = new FileStream(fileName, FileMode.Open, FileAccess.Read))
using (StreamReader sr = new StreamReader(fs)) xmlContent = sr.ReadToEnd();
XmlDocument doc = new XmlDocument();
doc.LoadXml(xmlContent);
List<JOBS> jobsList = new List<JOBS>();
XmlNode form1Node = doc.ChildNodes[1];
for (int i = 0; i < form1Node.ChildNodes.Count; i++)
{
XmlNode dateNode = form1Node.ChildNodes[i].ChildNodes[0].ChildNodes[0],
jobNumNode = form1Node.ChildNodes[i].ChildNodes[1].ChildNodes[0],
timeTicksNode = form1Node.ChildNodes[i].ChildNodes[6].ChildNodes[0];
bool foundDate = false;
for (int j = 0; j < jobsList.Count; j++) if (jobsList[j].compareDate(dateNode.Value))
{
jobsList[j].addJob(jobNumNode.Value, Math.Round(TimeSpan.FromTicks(
(long)Convert.ToDouble(timeTicksNode.Value)).TotalHours, 2).ToString());
foundDate = true;
break;
}
if (!foundDate)
{
JOBS job = new JOBS(dateNode.Value);
string jbnum = jobNumNode.Value;
string tbtck = timeTicksNode.Value;
long tktk = Convert.ToInt64(tbtck);
double tkdb = TimeSpan.FromTicks(tktk).TotalHours;
job.addJob(jobNumNode.Value, Math.Round(TimeSpan.FromTicks(
Convert.ToInt64(timeTicksNode.Value)).TotalHours, 2).ToString());
jobsList.Add(job);
}
}
jobsList.OrderByDescending(x => x.jobDate);
return jobsList;
}
class JOBS
{
public DateTime jobDate;
public List<JobDetails> jobDetailsList = new List<JobDetails>();
public void addJob(string jobNumber, string jobHours)
{
jobDetailsList.Add(new JobDetails() { JobHours = jobHours, JobNumber = jobNumber });
}
public JOBS(string dateString)
{
jobDate = getDateFromString(dateString);
}
public JOBS() { }
public bool compareDate(string dateString)
{
return getDateFromString(dateString) == jobDate;
}
private DateTime getDateFromString(string dateString)
{
string[] vals = dateString.Split('/');
return new DateTime(Convert.ToInt32(vals[2]), Convert.ToInt32(vals[0]), Convert.ToInt32(vals[1]));
}
}
class JobDetails
{
public string JobNumber { get; set; }
public string JobHours { get; set; }
}
}

Performance problems with Linq To CSV

I have found a library which connects csv-files with linq. I understood the principles and my code works well. But i have some problems with big csv files.
http://www.codeproject.com/Articles/25133/LINQ-to-CSV-library
Now i want to access specific single items from my _dataTable object. I get them like this:
public class CsvFile
{
private IEnumerable<DataRow> _dataTable = cc.Read<DataRow>(_filePath, _inputFileDescription);
public string GetItem(int row, int column)
{
return _dataTable.ElementAt<DataRow>(row).ElementAt<DataRowItem>(column).Value;
}
}
When i now call the method like this in a loop:
CsvFile file1 = new CsvFile("C:\\dev_csvcompare\\Master.csv", ';', true);
for(int i = 0; i < 10000; i++)
{
string dummy = file1.GetItem(1, i); //Does not make sense, my loop is a bit more complicated
}
it gets very slow, because the IEnumerable opens the stream every call.
In the documentation(link) under "Deferred Reading" they say i can access the ienumerable "_dataTable" with a foreach loop (this does work fine), but this is in my case no option because i want access to specific items in the csv.
Are there possibilities to keep the filestream open so that the performace increases?
EDIT (My code, maybe a lot of nosense, im not so experienced with .net, c# and oop):
public void Compare(int key1, int key2, int col1, int col2)
{
string lastKeyCol1 = null;
string lastKeyCol2 = null;
List<string> sortedKeyColFile1 = new List<string>();
List<string> sortedKeyColFile2 = new List<string>();
int file1counter = 0;
int file2counter = 0;
int cnt = 0;
sortedKeyColFile1 = _file1.GetCol(key1);
sortedKeyColFile1.Sort();
sortedKeyColFile2 = _file2.GetCol(key2);
sortedKeyColFile2.Sort();
while ((file1counter < sortedKeyColFile1.Count) || (file2counter < sortedKeyColFile2.Count))
{
_outputList.Add(new OutputValues(key1, key2, col1, col2));
//Keys are in both files
if (sortedKeyColFile1[file1counter] == sortedKeyColFile2[file2counter])
{
if (lastKeyCol1 == sortedKeyColFile1[file1counter])
{
//Keys are redundant
_outputList[cnt].RedundantKeyF1 = true;
}
if (lastKeyCol2 == sortedKeyColFile2[file2counter])
{
//Keys are redundant
_outputList[cnt].RedundantKeyF2 = true;
}
lastKeyCol1 = sortedKeyColFile1[file1counter];
lastKeyCol2 = sortedKeyColFile2[file2counter];
_outputList[cnt].ValF1 = _file1.GetItem(file1counter, col1);
_outputList[cnt].ValF2 = _file2.GetItem(file2counter, col2);
_outputList[cnt].LineNumF1 = file1counter;
_outputList[cnt].LineNumF2 = file2counter;
//compare the values (because keys do match at this place)
_outputList[cnt].CompareResult = CompareString(_file1.GetItem(file1counter, col1), _file2.GetItem(file2counter, col2));
if (file1counter < sortedKeyColFile1.Count)
{
file1counter++;
}
if (file2counter < sortedKeyColFile2.Count)
{
file2counter++;
}
}
//Key sortedKeyColFile2[file2counter] is not in file 1
else if (file2counter < sortedKeyColFile2.Count && 0 < (string.Compare(sortedKeyColFile1[file1counter], sortedKeyColFile2[file2counter])))
{
_outputList[cnt].LineNumF2 = file2counter;
if (lastKeyCol2 == sortedKeyColFile2[file2counter])
{
//Keys are redundant
_outputList[cnt].RedundantKeyF2 = true;
}
lastKeyCol2 = sortedKeyColFile2[file2counter];
file2counter++;
}
//Key sortedKeyColFile1[file1counter] is not in file 2
else if (file1counter < sortedKeyColFile1.Count)
{
_outputList[cnt].LineNumF1 = file1counter;
if (lastKeyCol1 == sortedKeyColFile1[file1counter])
{
//Keys are redundant
_outputList[cnt].RedundantKeyF1 = true;
}
lastKeyCol1 = sortedKeyColFile1[file1counter];
file1counter++;
}
cnt++;
}
}
//And here the important part of the csv-file class, maybe not so interesting
public class CsvFile
{
private string _filePath = null;
private char _separator = ',';
private bool _hasHeader = true;
private CsvContext _cc = null;
private CsvFileDescription _inputFileDescription = null;
private List<string> _headers = null;
private IEnumerable<DataRow> _dataTable = null;
/// <summary>
/// Constructor for a new CsvFile object.
/// The Constructor initiates the Object and read the values out of the File
/// </summary>
/// <param name="filePath">Full path of the csv-file</param>
/// <param name="separator">Seperator of the csv-file, eg: ';' or ',' or '\t'</param>
/// <param name="hasHeader">Is true if the first col of the csv-file contains a headers</param>
public CsvFile(string filePath, char separator, bool hasHeader = true)
{
//Throws an exception if something is wrong with the file
File.OpenRead(filePath);
_filePath = filePath;
_separator = separator;
_hasHeader = hasHeader;
_cc = new CsvContext();
_inputFileDescription = new CsvFileDescription
{
SeparatorChar = separator,
FirstLineHasColumnNames = hasHeader
};
_dataTable = _cc.Read<DataRow>(_filePath, _inputFileDescription);
if (hasHeader)
{
ParseHeaders();
}
}
public List<string> GetCol(int col)
{
List<string> column = new List<string>();
int cnt = 0;
foreach(DataRow x in _dataTable)
{
column.Add(x[col].Value);
cnt++;
}
return column;
}
private void ParseHeaders()
{
System.IO.StreamReader file = new System.IO.StreamReader(_filePath);
if (!file.EndOfStream)
{
//_headers = file.ReadLine().Split(_separator);
_headers = new List<string> (file.ReadLine().Split(_separator));
}
file.Close();
}
}
Try this:
public class CsvFile
{
private IEnumerable<DataRow> rows = cc.Read<DataRow>(_filePath, _inputFileDescription);
//...
public IEnumerable<DataRow> Rows { get { return rows; } }
}
And then:
CsvFile file1 = new CsvFile("C:\\dev_csvcompare\\Master.csv", ';', true);
foreach(DataRow row in file1.Rows)
{
string dummy = row[1];
}

Parsing CSV files in C#, with header

Is there a default/official/recommended way to parse CSV files in C#? I don't want to roll my own parser.
Also, I've seen instances of people using ODBC/OLE DB to read CSV via the Text driver, and a lot of people discourage this due to its "drawbacks." What are these drawbacks?
Ideally, I'm looking for a way through which I can read the CSV by column name, using the first record as the header / field names. Some of the answers given are correct but work to basically deserialize the file into classes.
A CSV parser is now a part of .NET Framework.
Add a reference to Microsoft.VisualBasic.dll (works fine in C#, don't mind the name)
using (TextFieldParser parser = new TextFieldParser(#"c:\temp\test.csv"))
{
parser.TextFieldType = FieldType.Delimited;
parser.SetDelimiters(",");
while (!parser.EndOfData)
{
//Process row
string[] fields = parser.ReadFields();
foreach (string field in fields)
{
//TODO: Process field
}
}
}
The docs are here - TextFieldParser Class
P.S. If you need a CSV exporter, try CsvExport (discl: I'm one of the contributors)
CsvHelper (a library I maintain) will read a CSV file into custom objects.
using (var reader = new StreamReader("path\\to\\file.csv"))
using (var csv = new CsvReader(reader, CultureInfo.InvariantCulture))
{
var records = csv.GetRecords<Foo>();
}
Sometimes you don't own the objects you're trying to read into. In this case, you can use fluent mapping because you can't put attributes on the class.
public sealed class MyCustomObjectMap : CsvClassMap<MyCustomObject>
{
public MyCustomObjectMap()
{
Map( m => m.Property1 ).Name( "Column Name" );
Map( m => m.Property2 ).Index( 4 );
Map( m => m.Property3 ).Ignore();
Map( m => m.Property4 ).TypeConverter<MySpecialTypeConverter>();
}
}
Let a library handle all the nitty-gritty details for you! :-)
Check out FileHelpers and stay DRY - Don't Repeat Yourself - no need to re-invent the wheel a gazillionth time....
You basically just need to define that shape of your data - the fields in your individual line in the CSV - by means of a public class (and so well-thought out attributes like default values, replacements for NULL values and so forth), point the FileHelpers engine at a file, and bingo - you get back all the entries from that file. One simple operation - great performance!
In a business application, i use the Open Source project on codeproject.com, CSVReader.
It works well, and has good performance. There is some benchmarking on the link i provided.
A simple example, copied from the project page:
using (CsvReader csv = new CsvReader(new StreamReader("data.csv"), true))
{
int fieldCount = csv.FieldCount;
string[] headers = csv.GetFieldHeaders();
while (csv.ReadNextRecord())
{
for (int i = 0; i < fieldCount; i++)
Console.Write(string.Format("{0} = {1};", headers[i], csv[i]));
Console.WriteLine();
}
}
As you can see, it's very easy to work with.
I know its a bit late but just found a library Microsoft.VisualBasic.FileIO which has TextFieldParser class to process csv files.
Here is a helper class I use often, in case any one ever comes back to this thread (I wanted to share it).
I use this for the simplicity of porting it into projects ready to use:
public class CSVHelper : List<string[]>
{
protected string csv = string.Empty;
protected string separator = ",";
public CSVHelper(string csv, string separator = "\",\"")
{
this.csv = csv;
this.separator = separator;
foreach (string line in Regex.Split(csv, System.Environment.NewLine).ToList().Where(s => !string.IsNullOrEmpty(s)))
{
string[] values = Regex.Split(line, separator);
for (int i = 0; i < values.Length; i++)
{
//Trim values
values[i] = values[i].Trim('\"');
}
this.Add(values);
}
}
}
And use it like:
public List<Person> GetPeople(string csvContent)
{
List<Person> people = new List<Person>();
CSVHelper csv = new CSVHelper(csvContent);
foreach(string[] line in csv)
{
Person person = new Person();
person.Name = line[0];
person.TelephoneNo = line[1];
people.Add(person);
}
return people;
}
[Updated csv helper: bug fixed where the last new line character created a new line]
If you need only reading csv files then I recommend this library: A Fast CSV Reader
If you also need to generate csv files then use this one: FileHelpers
Both of them are free and opensource.
This solution is using the official Microsoft.VisualBasic assembly to parse CSV.
Advantages:
delimiter escaping
ignores Header
trim spaces
ignore comments
Code:
using Microsoft.VisualBasic.FileIO;
public static List<List<string>> ParseCSV (string csv)
{
List<List<string>> result = new List<List<string>>();
// To use the TextFieldParser a reference to the Microsoft.VisualBasic assembly has to be added to the project.
using (TextFieldParser parser = new TextFieldParser(new StringReader(csv)))
{
parser.CommentTokens = new string[] { "#" };
parser.SetDelimiters(new string[] { ";" });
parser.HasFieldsEnclosedInQuotes = true;
// Skip over header line.
//parser.ReadLine();
while (!parser.EndOfData)
{
var values = new List<string>();
var readFields = parser.ReadFields();
if (readFields != null)
values.AddRange(readFields);
result.Add(values);
}
}
return result;
}
I have written TinyCsvParser for .NET, which is one of the fastest .NET parsers around and highly configurable to parse almost any CSV format.
It is released under MIT License:
https://github.com/bytefish/TinyCsvParser
You can use NuGet to install it. Run the following command in the Package Manager Console.
PM> Install-Package TinyCsvParser
Usage
Imagine we have list of Persons in a CSV file persons.csv with their first name, last name and birthdate.
FirstName;LastName;BirthDate
Philipp;Wagner;1986/05/12
Max;Musterman;2014/01/02
The corresponding domain model in our system might look like this.
private class Person
{
public string FirstName { get; set; }
public string LastName { get; set; }
public DateTime BirthDate { get; set; }
}
When using TinyCsvParser you have to define the mapping between the columns in the CSV data and the property in you domain model.
private class CsvPersonMapping : CsvMapping<Person>
{
public CsvPersonMapping()
: base()
{
MapProperty(0, x => x.FirstName);
MapProperty(1, x => x.LastName);
MapProperty(2, x => x.BirthDate);
}
}
And then we can use the mapping to parse the CSV data with a CsvParser.
namespace TinyCsvParser.Test
{
[TestFixture]
public class TinyCsvParserTest
{
[Test]
public void TinyCsvTest()
{
CsvParserOptions csvParserOptions = new CsvParserOptions(true, new[] { ';' });
CsvPersonMapping csvMapper = new CsvPersonMapping();
CsvParser<Person> csvParser = new CsvParser<Person>(csvParserOptions, csvMapper);
var result = csvParser
.ReadFromFile(#"persons.csv", Encoding.ASCII)
.ToList();
Assert.AreEqual(2, result.Count);
Assert.IsTrue(result.All(x => x.IsValid));
Assert.AreEqual("Philipp", result[0].Result.FirstName);
Assert.AreEqual("Wagner", result[0].Result.LastName);
Assert.AreEqual(1986, result[0].Result.BirthDate.Year);
Assert.AreEqual(5, result[0].Result.BirthDate.Month);
Assert.AreEqual(12, result[0].Result.BirthDate.Day);
Assert.AreEqual("Max", result[1].Result.FirstName);
Assert.AreEqual("Mustermann", result[1].Result.LastName);
Assert.AreEqual(2014, result[1].Result.BirthDate.Year);
Assert.AreEqual(1, result[1].Result.BirthDate.Month);
Assert.AreEqual(1, result[1].Result.BirthDate.Day);
}
}
}
User Guide
A full User Guide is available at:
http://bytefish.github.io/TinyCsvParser/
Here is a short and simple solution.
using (TextFieldParser parser = new TextFieldParser(outputLocation))
{
parser.TextFieldType = FieldType.Delimited;
parser.SetDelimiters(",");
string[] headers = parser.ReadLine().Split(',');
foreach (string header in headers)
{
dataTable.Columns.Add(header);
}
while (!parser.EndOfData)
{
string[] fields = parser.ReadFields();
dataTable.Rows.Add(fields);
}
}
Here is my KISS implementation...
using System;
using System.Collections.Generic;
using System.Text;
class CsvParser
{
public static List<string> Parse(string line)
{
const char escapeChar = '"';
const char splitChar = ',';
bool inEscape = false;
bool priorEscape = false;
List<string> result = new List<string>();
StringBuilder sb = new StringBuilder();
for (int i = 0; i < line.Length; i++)
{
char c = line[i];
switch (c)
{
case escapeChar:
if (!inEscape)
inEscape = true;
else
{
if (!priorEscape)
{
if (i + 1 < line.Length && line[i + 1] == escapeChar)
priorEscape = true;
else
inEscape = false;
}
else
{
sb.Append(c);
priorEscape = false;
}
}
break;
case splitChar:
if (inEscape) //if in escape
sb.Append(c);
else
{
result.Add(sb.ToString());
sb.Length = 0;
}
break;
default:
sb.Append(c);
break;
}
}
if (sb.Length > 0)
result.Add(sb.ToString());
return result;
}
}
Some time ago I had wrote simple class for CSV read/write based on Microsoft.VisualBasic library. Using this simple class you will be able to work with CSV like with 2 dimensions array. You can find my class by the following link: https://github.com/ukushu/DataExporter
Simple example of usage:
Csv csv = new Csv("\t");//delimiter symbol
csv.FileOpen("c:\\file1.csv");
var row1Cell6Value = csv.Rows[0][5];
csv.AddRow("asdf","asdffffff","5")
csv.FileSave("c:\\file2.csv");
For reading header only you need is to read csv.Rows[0] cells :)
This code reads csv to DataTable:
public static DataTable ReadCsv(string path)
{
DataTable result = new DataTable("SomeData");
using (TextFieldParser parser = new TextFieldParser(path))
{
parser.TextFieldType = FieldType.Delimited;
parser.SetDelimiters(",");
bool isFirstRow = true;
//IList<string> headers = new List<string>();
while (!parser.EndOfData)
{
string[] fields = parser.ReadFields();
if (isFirstRow)
{
foreach (string field in fields)
{
result.Columns.Add(new DataColumn(field, typeof(string)));
}
isFirstRow = false;
}
else
{
int i = 0;
DataRow row = result.NewRow();
foreach (string field in fields)
{
row[i++] = field;
}
result.Rows.Add(row);
}
}
}
return result;
}
Single source file solution for straightforward parsing needs, useful. Deals with all the nasty edge cases. Such as new line normalization and handling new lines in quoted string literals. Your welcome!
If you CSV file has a header you just read out the column names (and compute column indexes) from the first row. Simple as that.
Note that Dump is a LINQPad method, you might want to remove that if you are not using LINQPad.
void Main()
{
var file1 = "a,b,c\r\nx,y,z";
CSV.ParseText(file1).Dump();
var file2 = "a,\"b\",c\r\nx,\"y,z\"";
CSV.ParseText(file2).Dump();
var file3 = "a,\"b\",c\r\nx,\"y\r\nz\"";
CSV.ParseText(file3).Dump();
var file4 = "\"\"\"\"";
CSV.ParseText(file4).Dump();
}
static class CSV
{
public struct Record
{
public readonly string[] Row;
public string this[int index] => Row[index];
public Record(string[] row)
{
Row = row;
}
}
public static List<Record> ParseText(string text)
{
return Parse(new StringReader(text));
}
public static List<Record> ParseFile(string fn)
{
using (var reader = File.OpenText(fn))
{
return Parse(reader);
}
}
public static List<Record> Parse(TextReader reader)
{
var data = new List<Record>();
var col = new StringBuilder();
var row = new List<string>();
for (; ; )
{
var ln = reader.ReadLine();
if (ln == null) break;
if (Tokenize(ln, col, row))
{
data.Add(new Record(row.ToArray()));
row.Clear();
}
}
return data;
}
public static bool Tokenize(string s, StringBuilder col, List<string> row)
{
int i = 0;
if (col.Length > 0)
{
col.AppendLine(); // continuation
if (!TokenizeQuote(s, ref i, col, row))
{
return false;
}
}
while (i < s.Length)
{
var ch = s[i];
if (ch == ',')
{
row.Add(col.ToString().Trim());
col.Length = 0;
i++;
}
else if (ch == '"')
{
i++;
if (!TokenizeQuote(s, ref i, col, row))
{
return false;
}
}
else
{
col.Append(ch);
i++;
}
}
if (col.Length > 0)
{
row.Add(col.ToString().Trim());
col.Length = 0;
}
return true;
}
public static bool TokenizeQuote(string s, ref int i, StringBuilder col, List<string> row)
{
while (i < s.Length)
{
var ch = s[i];
if (ch == '"')
{
// escape sequence
if (i + 1 < s.Length && s[i + 1] == '"')
{
col.Append('"');
i++;
i++;
continue;
}
i++;
return true;
}
else
{
col.Append(ch);
i++;
}
}
return false;
}
}
Another one to this list, Cinchoo ETL - an open source library to read and write multiple file formats (CSV, flat file, Xml, JSON etc)
Sample below shows how to read CSV file quickly (No POCO object required)
string csv = #"Id, Name
1, Carl
2, Tom
3, Mark";
using (var p = ChoCSVReader.LoadText(csv)
.WithFirstLineHeader()
)
{
foreach (var rec in p)
{
Console.WriteLine($"Id: {rec.Id}");
Console.WriteLine($"Name: {rec.Name}");
}
}
Sample below shows how to read CSV file using POCO object
public partial class EmployeeRec
{
public int Id { get; set; }
public string Name { get; set; }
}
static void CSVTest()
{
string csv = #"Id, Name
1, Carl
2, Tom
3, Mark";
using (var p = ChoCSVReader<EmployeeRec>.LoadText(csv)
.WithFirstLineHeader()
)
{
foreach (var rec in p)
{
Console.WriteLine($"Id: {rec.Id}");
Console.WriteLine($"Name: {rec.Name}");
}
}
}
Please check out articles at CodeProject on how to use it.
This parser supports nested commas and quotes in a column:
static class CSVParser
{
public static string[] ParseLine(string line)
{
List<string> cols = new List<string>();
string value = null;
for(int i = 0; i < line.Length; i++)
{
switch(line[i])
{
case ',':
cols.Add(value);
value = null;
if(i == line.Length - 1)
{// It ends with comma
cols.Add(null);
}
break;
case '"':
cols.Add(ParseEnclosedColumn(line, ref i));
i++;
break;
default:
value += line[i];
if (i == line.Length - 1)
{// Last character
cols.Add(value);
}
break;
}
}
return cols.ToArray();
}//ParseLine
static string ParseEnclosedColumn(string line, ref int index)
{// Example: "b"",bb"
string value = null;
int numberQuotes = 1;
int index2 = index;
for (int i = index + 1; i < line.Length; i++)
{
index2 = i;
switch (line[i])
{
case '"':
numberQuotes++;
if (numberQuotes % 2 == 0)
{
if (i < line.Length - 1 && line[i + 1] == ',')
{
index = i;
return value;
}
}
else if (i > index + 1 && line[i - 1] == '"')
{
value += '"';
}
break;
default:
value += line[i];
break;
}
}
index = index2;
return value;
}//ParseEnclosedColumn
}//class CSVParser
Based on unlimit's post on How to properly split a CSV using C# split() function? :
string[] tokens = System.Text.RegularExpressions.Regex.Split(paramString, ",");
NOTE: this doesn't handle escaped / nested commas, etc., and therefore is only suitable for certain simple CSV lists.
If anyone wants a snippet they can plop into their code without having to bind a library or download a package. Here is a version I wrote:
public static string FormatCSV(List<string> parts)
{
string result = "";
foreach (string s in parts)
{
if (result.Length > 0)
{
result += ",";
if (s.Length == 0)
continue;
}
if (s.Length > 0)
{
result += "\"" + s.Replace("\"", "\"\"") + "\"";
}
else
{
// cannot output double quotes since its considered an escape for a quote
result += ",";
}
}
return result;
}
enum CSVMode
{
CLOSED = 0,
OPENED_RAW = 1,
OPENED_QUOTE = 2
}
public static List<string> ParseCSV(string input)
{
List<string> results;
CSVMode mode;
char[] letters;
string content;
mode = CSVMode.CLOSED;
content = "";
results = new List<string>();
letters = input.ToCharArray();
for (int i = 0; i < letters.Length; i++)
{
char letter = letters[i];
char nextLetter = '\0';
if (i < letters.Length - 1)
nextLetter = letters[i + 1];
// If its a quote character
if (letter == '"')
{
// If that next letter is a quote
if (nextLetter == '"' && mode == CSVMode.OPENED_QUOTE)
{
// Then this quote is escaped and should be added to the content
content += letter;
// Skip the escape character
i++;
continue;
}
else
{
// otherwise its not an escaped quote and is an opening or closing one
// Character is skipped
// If it was open, then close it
if (mode == CSVMode.OPENED_QUOTE)
{
results.Add(content);
// reset the content
content = "";
mode = CSVMode.CLOSED;
// If there is a next letter available
if (nextLetter != '\0')
{
// If it is a comma
if (nextLetter == ',')
{
i++;
continue;
}
else
{
throw new Exception("Expected comma. Found: " + nextLetter);
}
}
}
else if (mode == CSVMode.OPENED_RAW)
{
// If it was opened raw, then just add the quote
content += letter;
}
else if (mode == CSVMode.CLOSED)
{
// Otherwise open it as a quote
mode = CSVMode.OPENED_QUOTE;
}
}
}
// If its a comma seperator
else if (letter == ',')
{
// If in quote mode
if (mode == CSVMode.OPENED_QUOTE)
{
// Just read it
content += letter;
}
// If raw, then close the content
else if (mode == CSVMode.OPENED_RAW)
{
results.Add(content);
content = "";
mode = CSVMode.CLOSED;
}
// If it was closed, then open it raw
else if (mode == CSVMode.CLOSED)
{
mode = CSVMode.OPENED_RAW;
results.Add(content);
content = "";
}
}
else
{
// If opened quote, just read it
if (mode == CSVMode.OPENED_QUOTE)
{
content += letter;
}
// If opened raw, then read it
else if (mode == CSVMode.OPENED_RAW)
{
content += letter;
}
// It closed, then open raw
else if (mode == CSVMode.CLOSED)
{
mode = CSVMode.OPENED_RAW;
content += letter;
}
}
}
// If it was still reading when the buffer finished
if (mode != CSVMode.CLOSED)
{
results.Add(content);
}
return results;
}
For smaller input CSV data LINQ is fully enough.
For example for the following CSV file content:
schema_name,description,utype
"IX_HE","High-Energy data","x"
"III_spectro","Spectrosopic data","d"
"VI_misc","Miscellaneous","f"
"vcds1","Catalogs only available in CDS","d"
"J_other","Publications from other journals","b"
when we read the whole content into single string called data, then
using System;
using System.IO;
using System.Linq;
var data = File.ReadAllText(Path2CSV);
// helper split characters
var newline = Environment.NewLine.ToCharArray();
var comma = ",".ToCharArray();
var quote = "\"".ToCharArray();
// split input string data to lines
var lines = data.Split(newline);
// first line is header, take the header fields
foreach (var col in lines.First().Split(comma)) {
// do something with "col"
}
// we skip the first line, all the rest are real data lines/fields
foreach (var line in lines.Skip(1)) {
// first we split the data line by comma character
// next we remove double qoutes from each splitted element using Trim()
// finally we make an array
var fields = line.Split(comma)
.Select(_ => { _ = _.Trim(quote); return _; })
.ToArray();
// do something with the "fields" array
}

Categories