How to speed up this parsing loop over a text file - C#

I'm a C# beginner, so I might have missed some easy tips.
Here's my question:
I have a loop that reads some data from a text file.
I have 160 files, each with 14000 lines of 7 values.
I'm only interested in some of these lines, depending on whether their time corresponds to a value in an existing column of a DataTable.
If I find a corresponding line, I update the matching row in the DataTable.
The current code takes about 6 minutes per file... That's really too long.
Any idea how to speed up this loop?
I already tried to reduce the loop with a while loop...
File sample :
Sample of the file :
Col1 Col2 Col3 Col4 Col5 Col6
15943100 1188 1 2,49 2,28 75,3
15943200 1188 1 2,49 2,28 75,3
15943300 1188 1 2,50 2,28 75,3
15943400 1188 1 2,50 2,28 75,3
Here's the code : (Edited from Aluan Haddad answer but not faster)
// For every input file: read the acquisition start time, then copy the three
// measured values of each whole-second sample into the matching row of Data.
foreach (string FileName in fileFastPath)
{
    // Kept as-is only when the file contains no "Acquisition depuis" line.
    var start = DateTimeOffset.UtcNow;
    var allLines = File.ReadAllLines(FileName);
    foreach (var line in allLines.Where(line => line.Contains("Acquisition depuis")))
    {
        // The start time is the second tab-separated field of that line.
        DateTimeOffset.TryParse(line.Split('\t')[1], out start);
    }

    // Out-targets shared by the TryParse filters; the query is consumed item by
    // item, so each tuple is built right after its own three parses succeed.
    double x = 0, y = 0, z = 0;
    foreach (var (sp1, sp2, vear, milliseconds) in from line in allLines
                                                    where !line.Contains("Acquisition depuis")
                                                    select line.Split('\t') into values
                                                    // values[5] is read below, so six fields are required
                                                    // (the previous ">= 5" let five-field lines through and threw).
                                                    where values.Length >= 6
                                                    where double.TryParse(values[3], out x)
                                                    where double.TryParse(values[4], out y)
                                                    where double.TryParse(values[5], out z)
                                                    let milliseconds = double.Parse(values[0])
                                                    // keep only samples that fall on a whole second
                                                    where milliseconds % 1000 == 0
                                                    select (x, y, z, milliseconds))
    {
        var updatedStart = start + TimeSpan.FromMilliseconds(milliseconds);
        var existingValue = Data
            .Select($"Time = #{updatedStart: yyyy-MM-dd HH:mm:ss.fff}#")
            .FirstOrDefault(existing => existing != null);
        if (existingValue != null)
        {
            existingValue["SP1 Bar"] = sp1;
            existingValue["SP2 Bar"] = sp2;
            existingValue["VEAR_POS %"] = vear;
        }
    }
}

Try cleaning up your code so you can see the forest for the trees. As you do so, there are many optimizations that reveal themselves thanks to clearer program structure.
// For each input file: find the acquisition start time, then copy the parsed
// values of every data line into the row of Data whose Time matches.
foreach (var fileName in fileFastPath)
{
// Initial value only; overwritten by every "Acquisition depuis" line found below.
var start = DateTimeOffset.UtcNow;
var allLines = File.ReadAllLines(fileName);
foreach (var line in allLines.Where(line => line.Contains("Acquisition depuis")))
{
// The TryParse result is ignored, so a failed parse is not reported.
DateTimeOffset.TryParse(line.Split('\t')[1], out start);
}
// Shared out-targets for the TryParse filters in the query below.
double x = 0, y = 0, z = 0;
// Deferred query: nothing is parsed until the foreach below enumerates it.
var lineValues = from line in allLines
where !line.Contains("Acquisition depuis")
select line.Split('\t') into values
// NOTE(review): values[3], values[4] and values[5] are indexed without a length
// check, so a line with fewer than six tab-separated fields would throw here.
where double.TryParse(values[3], out x)
where double.TryParse(values[4], out y)
where double.TryParse(values[5], out z)
// Unlike the three fields above, values[0] is parsed with Parse, which throws on invalid input.
let milliseconds = double.Parse(values[0])
select (x, y, z, milliseconds);
foreach (var (sp1, sp2, vear, milliseconds) in lineValues)
{
// Sample time = acquisition start + the millisecond offset from the first field.
var updatedStart = start + TimeSpan.FromMilliseconds(milliseconds);
// Data.Select is evaluated once per parsed line.
// NOTE(review): this per-line lookup is presumably the hot spot - confirm with a profiler.
var existingValue = Data
.Select($"Time = #{updatedStart: yyyy-MM-dd HH:mm:ss.fff}#")
.FirstOrDefault(existing => existing != null);
// Lines without a matching row are skipped silently.
if (existingValue != null)
{
existingValue["SP1 Bar"] = sp1;
existingValue["SP2 Bar"] = sp2;
existingValue["VEAR_POS %"] = vear;
}
}
}

Related

Getting the sum per hour from 2 datatable

I'm writing a txt file from 2 data table.
Following is the 2 data table.
dt1
Transaction No. Time Amount Date
1 10:00:00 200.00 03/05/2020
2 10:30:11 250.00 03/05/2020
3 11:05:22 140.00 03/05/2020
4 11:45:33 230.00 03/05/2020
5 12:15:10 220.00 03/05/2020
dt2
Transaction No. Added Amount Date
1 40.00 03/05/2020
2 25.00 03/05/2020
3 40.00 03/05/2020
4 30.00 03/05/2020
5 30.00 03/05/2020
following is my code
// Appends one "date,HH:00,amount," record per hour to fileName:
// hours 06..23 first, then 00..05.
using (StreamWriter sw = File.AppendText(fileName))
{
// First pass: hours 06 to 23.
for (int a = 6; a <= 23; a++)
{
// Two-digit hour label, e.g. "06".
string aa = a.ToString().PadLeft(2, '0');
// Always reads row 0 of dt1; the loop variable a is not used to pick a row.
double salex = double.Parse(dt1.Rows[0]["Amount"].ToString());
// salex is a double compared with a string; Double.Equals(object) returns false
// for a non-double argument, so the else branch is the one that runs.
if (salex.Equals(""))
{
salex = 0;
}
else
{
salex = double.Parse(dt1.Rows[0]["Amount"].ToString());
}
// Same pattern for dt2: always row 0.
double vatx = double.Parse(dt2.Rows[0]["Added Amount"].ToString());
if (vatx.Equals(""))
{
vatx = 0;
}
else
{
vatx = double.Parse(dt2.Rows[0]["Added Amount"].ToString());
}
// Both operands are negated before being added.
double dailysaleHRLY = -salex + -vatx;
// Written with Write (not WriteLine), so no line break is emitted between records.
sw.Write(dtpDate.Value.ToString("MM/dd/yyyy") + ",");
sw.Write(aa + ":00" + ",");
sw.Write(dailysaleHRLY.ToString("0.00") + ",");
}
// Second pass: hours 00 to 05, with a body identical to the first pass.
for (int a = 0; a <= 5; a++)
{
string aa = a.ToString().PadLeft(2, '0');
double salex = double.Parse(dt1.Rows[0]["Amount"].ToString());
if (salex.Equals(""))
{
salex = 0;
}
else
{
salex = double.Parse(dt1.Rows[0]["Amount"].ToString());
}
double vatx = double.Parse(dt2.Rows[0]["Added Amount"].ToString());
if (vatx.Equals(""))
{
vatx = 0;
}
else
{
vatx = double.Parse(dt2.Rows[0]["Added Amount"].ToString());
}
double dailysaleHRLY = -salex + -vatx;
sw.Write(dtpDate.Value.ToString("MM/dd/yyyy") + ",");
sw.Write(aa + ":00" + ",");
sw.Write(dailysaleHRLY.ToString("0.00") + ",");
}
// Shown while the writer is still open; the file is closed when the using block ends.
MessageBox.Show("Txt File succesfully created!", "SYSTEM", MessageBoxButtons.OK, MessageBoxIcon.Information);
}
This is the output of my code.
Date, Time, Sum
03/05/2020,06:00,515.00
03/05/2020,07:00,515.00
03/05/2020,08:00,515.00
03/05/2020,09:00,515.00
03/05/2020,10:00,515.00
03/05/2020,11:00,515.00
03/05/2020,12:00,515.00
03/05/2020,13:00,515.00
03/05/2020,14:00,515.00
03/05/2020,15:00,515.00
03/05/2020,16:00,515.00
03/05/2020,17:00,515.00
03/05/2020,18:00,515.00
03/05/2020,19:00,515.00
03/05/2020,20:00,515.00
03/05/2020,21:00,515.00
03/05/2020,22:00,515.00
03/05/2020,23:00,515.00
03/05/2020,00:00,515.00
03/05/2020,01:00,515.00
03/05/2020,02:00,515.00
03/05/2020,03:00,515.00
03/05/2020,04:00,515.00
03/05/2020,05:00,515.00
I just want to get the sum of Amount and Added Amount base on hour. Like this.
Date, Time, Sum
03/05/2020,06:00,0.00
03/05/2020,07:00,0.00
03/05/2020,08:00,0.00
03/05/2020,09:00,0.00
03/05/2020,10:00,515.00
03/05/2020,11:00,440.00
03/05/2020,12:00,250.00
03/05/2020,13:00,0.00
03/05/2020,14:00,0.00
03/05/2020,15:00,0.00
03/05/2020,16:00,0.00
03/05/2020,17:00,0.00
03/05/2020,18:00,0.00
03/05/2020,19:00,0.00
03/05/2020,20:00,0.00
03/05/2020,21:00,0.00
03/05/2020,22:00,0.00
03/05/2020,23:00,0.00
03/05/2020,00:00,0.00
03/05/2020,01:00,0.00
03/05/2020,02:00,0.00
03/05/2020,03:00,0.00
03/05/2020,04:00,0.00
03/05/2020,05:00,0.00
Assuming that you have two DataTable-s and you have them filled with the mentioned data.
// dt1 holds the transactions: number, time of day, amount and date.
var dt1 = new DataTable();
dt1.Columns.Add("Transaction No.", typeof(int));
dt1.Columns.Add("Time", typeof(DateTime));
dt1.Columns.Add("Amount", typeof(decimal));
dt1.Columns.Add("Date", typeof(DateTime));

// dt2 holds the added amount per transaction number.
var dt2 = new DataTable();
dt2.Columns.Add("Transaction No.", typeof(int));
dt2.Columns.Add("Added Amount", typeof(decimal));
dt2.Columns.Add("Date", typeof(DateTime));
Note: The double types have been replaced with decimal types since its the right type to be used when dealing with money.
As I understand the problem, you want to group the rows of dt1 by hour part of the Time field, sum the Amount, and add to the sum the Added Amount from dt2 rows where their Transaction No. equals to any Transaction No. of the grouped rows of dt1.
This will do:
// Total "Added Amount" (dt2 column 1) per transaction number, computed once
// instead of rescanning dt2 for every dt1 row.
var addedByTransaction = dt2.AsEnumerable()
    .GroupBy(x => x.Field<int>(0))
    .ToDictionary(t => t.Key, t => t.Sum(x => x.Field<decimal>(1)));
// The output is a data file, so format it independently of the machine's culture
// (with the current culture, '/' may be rendered as another date separator).
var invariant = System.Globalization.CultureInfo.InvariantCulture;
// Group the transactions by the hour part of their Time (dt1 column 1).
var group = dt1.AsEnumerable().GroupBy(x => x.Field<DateTime>(1).Hour);
var sb = new StringBuilder();
sb.Append("Date,");
sb.Append("Time,".PadLeft(12, ' '));
sb.AppendLine("Sum".PadLeft(5, ' '));
//if PadLeft is not required in the output, then just:
//sb.AppendLine($"Date, Time, Sum");
foreach (var g in group)
{
    var sum = 0M;
    foreach (var r in g)
    {
        // A transaction without any dt2 row contributes an added amount of 0.
        addedByTransaction.TryGetValue(r.Field<int>(0), out var added);
        sum += r.Field<decimal>(2) + added;
    }
    // One line per hour that has transactions: date of the group's first row, hour, total.
    sb.AppendLine(string.Format(invariant, "{0:MM/dd/yyyy}, {1:00}:00, {2:0.00}",
        g.First().Field<DateTime>(3), g.Key, sum));
}
Note: You can use the fields names instead of their indexes.
The output is:
Date, Time, Sum
03/05/2020, 10:00, 515.00
03/05/2020, 11:00, 440.00
03/05/2020, 12:00, 250.00
I don't know whether the DataTable-s already contain the required data to generate the output mentioned in the last quote block or you want to append the rest before writing to the text file. In case of the second scenario, you can do something like:
// Total "Added Amount" (dt2 column 1) per transaction number, computed once
// instead of rescanning dt2 for every dt1 row.
var addedByTransaction = dt2.AsEnumerable()
    .GroupBy(x => x.Field<int>(0))
    .ToDictionary(t => t.Key, t => t.Sum(x => x.Field<decimal>(1)));
// The output is a data file, so format it independently of the machine's culture.
var invariant = System.Globalization.CultureInfo.InvariantCulture;
// Materialized lookup keyed by hour; a deferred GroupBy would regroup dt1 on each of the 24 probes.
var group = dt1.AsEnumerable().ToLookup(x => x.Field<DateTime>(1).Hour);
var sb = new StringBuilder();
sb.AppendLine($"Date, Time, Sum");
// Date printed for hours without transactions: the Date (column 3) of the first dt1 row.
// First() throws InvalidOperationException when dt1 has no rows, as the original expression did.
var firstDate = dt1.AsEnumerable().First().Field<DateTime>(3);
for (var i = 0; i < 24; i++)
{
    var date = firstDate;
    var sum = 0M;
    if (group.Contains(i))
    {
        var g = group[i];
        date = g.First().Field<DateTime>(3);
        foreach (var r in g)
        {
            // A transaction without any dt2 row contributes an added amount of 0.
            addedByTransaction.TryGetValue(r.Field<int>(0), out var added);
            sum += r.Field<decimal>(2) + added;
        }
    }
    // Every hour gets a line; hours without transactions print 0.00.
    sb.AppendLine(string.Format(invariant, "{0:MM/dd/yyyy}, {1:00}:00, {2:0.00}", date, i, sum));
}
If you need to preserve the same order of the hours:
// Visit the 24 hours starting at 06:00 and wrapping past midnight: 06..23, then 00..05.
for (var ii = 6; ii < 30; ii++)
{
    // ii runs 6..29; the remainder maps 24..29 back to 0..5 and leaves 6..23 unchanged.
    var i = ii % 24;
    var g = group.FirstOrDefault(x => x.Key == i);
    if (g != null)
    {
        //The same...
    }
}
Finally, to create or overwrite the text file (fileName):
File.WriteAllText(fileName, sb.ToString());
Or to append the output:
File.AppendAllText(fileName, sb.ToString());

c# csv count a specified data in file or in datagridview

I have a CSV file and would like to count how many times the second column contains 111.
The CSV file has 46 columns separated by `;`.
"first col" "second col" "....."
abc 111 a
abc 112 b
abc 113 c
abc 111 d
abc 112 e
abc 113 f
I would like to count the occurrences of 111.
First I filled the DataGridView from a DataTable.
// Bind the grid to the table, then load the ';'-separated file into it:
// the first line supplies the column names, every later line becomes a row.
dgv.DataSource = dgv_table;
string[] raw_text = File.ReadAllLines("d:\\"+lb_csv.Text);
string[] data_col = null;
int x = 0;
foreach (string text_line in raw_text)
{
    // MessageBox.Show(text_line);
    data_col = text_line.Split(';');
    if (x == 0)
    {
        //header
        foreach (string column_name in data_col)
        {
            dgv_table.Columns.Add(column_name);
        }
        // x only marks that the header line has been consumed.
        x++;
    }
    else
    {
        //data
        dgv_table.Rows.Add(data_col);
    }
}
I found a lot of solutions for counting a specific value (111) in the second column,
but every time I ran into problems.
// Counts grid rows via the cell in the column named "second col" and shows the count in lb_qty.
// NOTE(review): Equals(111) has no receiver, so it is invoked on the enclosing object, not on the cell value s.
int xCount = dgv.Rows.Cast<DataGridViewRow>().Select(row => row.Cells["second col"].Value).Where(s => s !=null && Equals(111)).Count();
this.lb_qty.Text = xCount.ToString();
But it gives error for row.Cells["second col"].Value
An unhandled exception of type 'System.ArgumentException' occurred in System.Windows.Forms.dll
Additional information: Column named second col cannot be found.
Can someone help me how to solve this problem and get the needed result?
I would suggest you to skip using DataGridView and use counter variable in your loop, like Arkadiusz suggested.
If you still want to work with DataTable, count values like this:
int xCount = dgv_table.Rows.Cast<DataRow>().Count(r => r["second col"] != null && r["second col"].ToString() == "111");
I would try to read the file into a DataTable and use it as DataSource for the DataGridView.
DataTable d_Table = new DataTable();
//fill the DataTable
this.dgv_table.DataSource = d_Table;
To count the rows wich contains 111 in the second column, you can select the DataTable like this:
DataTable d_Table = new DataTable();
//fill the DataTable
DataRow[] rowCount = d_Table.Select("secondCol = '111'");
this.lb_qty.Text = rowCount.Length.ToString();
Or you can do it in a foreach-loop:
// Count the grid rows whose "secondCol" cell shows 111 and display the total.
int count = 0;
foreach(DataGridViewRow dgr in this.dgv_table.Rows)
{
    // A cell's Value can be null (for example in the empty row for new records);
    // Convert.ToString maps null to "" instead of throwing like Value.ToString() would.
    if(Convert.ToString(dgr.Cells["secondCol"].Value) == "111") count++;
}
this.lb_qty.Text = count.ToString();
You can use this method to load the CSV into a list of arrays (`List<string[]>`):
/// <summary>
/// Reads a text file and splits every line on ','.
/// </summary>
/// <param name="filename">Path of the file to read.</param>
/// <returns>
/// One string array per line, in file order. If reading fails, the error is
/// written to the console and whatever was collected so far is returned.
/// </returns>
public static List<string[]> readCSV(String filename)
{
    List<string[]> result = new List<string[]>();
    try
    {
        string[] line = File.ReadAllLines(filename);
        foreach (string l in line)
        {
            // Split the current line; the original referenced an undefined name here.
            string[] value = l.Split(',');
            result.Add(value);
        }
    }
    catch (Exception e)
    {
        // Best effort: report the problem and return what has been read.
        Console.WriteLine("Error: '{0}'", e);
    }
    return result;
}
Every array represents one line (row) of the file, so you can find the frequency of any value in a given column using LINQ or a simple loop:
// tmp is the List<string[]> returned by readCSV: one array per line of the file.
// Take the second field of every line (skipping lines that have none) and
// print how often each distinct value occurs.
foreach (var item in tmp.Where(row => row.Length > 1).Select(row => row[1]).GroupBy(c => c))
{
    Console.WriteLine("{0} : {1}", item.Key, item.Count());
}
/// <summary>
/// Counts the lines of a ';'-separated file whose field at a given position
/// equals a given value exactly.
/// </summary>
/// <param name="input">Path of the file to read.</param>
/// <param name="searchedValue">Value the field must equal (ordinal, case-sensitive).</param>
/// <param name="ColumnNumber">Zero-based index of the field to compare.</param>
/// <param name="skipFirstLine">True to ignore the first line (e.g. a header).</param>
/// <returns>The number of matching lines.</returns>
int CountValues(string input, string searchedValue, int ColumnNumber, bool skipFirstLine = false)
{
    int numberOfSearchedValue = 0;
    string line;
    using (StreamReader reader = new StreamReader(input))
    {
        if (skipFirstLine)
        {
            // Consume the header so it is not counted.
            reader.ReadLine();
        }
        while ((line = reader.ReadLine()) != null)
        {
            string[] fields = line.Split(';');
            // Blank or short lines have no such field; treat them as "no match"
            // instead of failing with IndexOutOfRangeException.
            if (fields.Length > ColumnNumber && fields[ColumnNumber] == searchedValue)
            {
                numberOfSearchedValue++;
            }
        }
    }
    return numberOfSearchedValue;
}
Edit:
StreamReader.ReadLine() reads the current line and advances to the next one, so the extra call made when skipFirstLine is true simply skips the header line. When there are no more lines it returns null, which is the loop's ending condition. The rest of the code is readable, I think
:)
Didn't test that so be careful :)
It might be necessary to use Trim() or ToUpper() in some places (as is usual when searching).

Showing 0 as index for output

In the following code, 0 is shown as the index for every value. The output format must be like:
2352:0.45678
9878:0.23423
..........
But after running, the result is like:
0:0.45678
0:0.23423
.........
After showing above result it runs again and shows
0:0
0:0
// Scores every index 0..1143599 with dotproduct, then shows the n highest scores,
// each prefixed with an id looked up from the fileitemori reader.
var results = new List<float>(1143600);
for (int z = 0; z < 1143600; z++)
{
results.Add(dotproduct(useridseq, z));//for multiply two vectors
}
var sb1 = new StringBuilder();
// Pair each score with its position in results, sort by score descending, keep the first n.
foreach (var resultwithindex in results.Select((r, index) => new { result = r, Index = index }).OrderByDescending(r => r.result).Take(n))
{
int indexx = resultwithindex.Index;
for (int yyyy = 0; yyyy <itemori.Length; yyyy++)
{
// NOTE(review): every call consumes the next line of fileitemori and nothing in this
// block rewinds the reader, so after the first result the reader may already be at
// end-of-file - confirm against where fileitemori is opened.
lineitemori5 = fileitemori.ReadLine();
if (!string.IsNullOrEmpty(lineitemori5))
{
string[] values5 = lineitemori5.Split('\t');
int itemidori5;
// The loop variable valuewoow is not used; the body repeats the same
// values5[0] / values5[1] check once per field of the line.
foreach (string valuewoow in values5)
{
itemidori5 = Convert.ToInt32(values5[0]);
if (indexx == itemidori5)
{
itemidseq5 = Convert.ToInt32(values5[1]);
}
}
}
}
// itemidseq5 is only assigned in the match branch above; when no line matched
// it still holds whatever value it had before this iteration.
sb1.AppendFormat("{0}: {1}", itemidseq5, resultwithindex.result);
sb1.AppendLine();
}
MessageBox.Show(sb1.ToString());
}
PS: I have two data sets. I worked with one of them to calculate the results, but the index attached to each result is not the real one; to get the real index I must look it up in the other data set.
Any idea?
Thanks in advance

How to make the custom parser for text file

I set up four columns in a DataTable and I want these columns to be filled with values from a text file. I used a regex to remove a particular line from the text file.
My objective is to show the text file in a grid using the DataTable, so first I am trying to create the DataTable and remove the line (shown in the program) using a regex.
Here I post my full code.
// "class" is a C# keyword, so the namespace name needs the verbatim-identifier prefix.
namespace @class
{
    /// <summary>Form whose button loads the fare search text file into a DataTable.</summary>
    public partial class Form1 : Form
    {
        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Reads C:\FareSearchRegex.txt line by line, blanks out the recommendation
        /// banner text and adds each resulting line as a row (first column) of a new table.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            DataTable dt = new DataTable();
            dt.Columns.Add("PTC");
            dt.Columns.Add("CUR");
            dt.Columns.Add("TAX");
            dt.Columns.Add("FARE BASIS");

            // The banner is literal text; it is escaped because '(', ')' and '.'
            // are regex metacharacters and the raw pattern would never match it.
            var pattern = Regex.Escape("---------- RECOMMENDATION 1 OF 3 IN GROUP 1 (USD 168.90)----------");

            // using closes the file even if reading throws.
            using (StreamReader sreader = File.OpenText(@"C:\FareSearchRegex.txt"))
            {
                string line;
                while ((line = sreader.ReadLine()) != null)
                {
                    // Add the cleaned text; previously the Replace result was discarded.
                    var result = Regex.Replace(line, pattern, " ");
                    dt.Rows.Add(result);
                }
            }
        }
    }

    /// <summary>Holder for the PTC, CUR and TAX values.</summary>
    class Class1
    {
        string PTC;
        string CUR;
        float TAX;

        public string gsPTC
        {
            get { return PTC; }
            set { PTC = value; }
        }

        public string gsCUR
        {
            get { return CUR; }
            set { CUR = value; }
        }

        public float gsTAX
        {
            get { return TAX; }
            set { TAX = value; }
        }
    }
}
If your format is strict(e.g. always 4 columns) and you want to remove only this complete line i don't see any reason to use regex:
// Stream the file, drop the recommendation banner line, and pair every remaining
// line's comma-separated fields with a freshly added row of dt.
// The path is a verbatim string (@"..."), so the backslash needs no escaping.
var rows = File.ReadLines(@"C:\FareSearchRegex.txt")
    .Where(l => l != "---------- RECOMMENDATION 1 OF 3 IN GROUP 1 (USD 168.90)----------")
    .Select(l => new { line = l, items = l.Split(','), row = dt.Rows.Add() });
// The query is lazy: rows are added to dt only while this loop enumerates it.
foreach (var x in rows)
{
    x.row.ItemArray = x.items;
}
(assumed that the fields are separated by comma)
Edit: This works with your pastebin:
// Add to dt the line that directly follows each header line, without its third token.
string header = " PTC CUR TAX FARE BASIS";
bool takeNextLine = false;
// The path is a verbatim string (@"..."), so the backslash needs no escaping.
foreach (string line in File.ReadLines(@"C:\FareSearchRegex.txt"))
{
    if (line.StartsWith(header))
    {
        takeNextLine = true;
    }
    else if (takeNextLine)
    {
        // Split on spaces and ignore the empty entries produced by runs of spaces.
        var tokens = line.Split(new[] { " " }, StringSplitOptions.RemoveEmptyEntries);
        // The token at index 2 is dropped; all others are kept in order.
        dt.Rows.Add().ItemArray = tokens.Where((t, i) => i != 2).ToArray();
        takeNextLine = false;
    }
}
(since you have an empty column which you want to exclude from the result i've used the clumsy and possibly error-prone(?) query Where((t, i) => i != 2))
To parse the file you'll need to:
Split the text of the file into data chunks. A chunk, in your case can be identified by the header PTC CUR TAX FARE BASIS and by the TOTAL line. To split the text you'll need to tokenize the input as follows> (i) define a regular expression to match the headers, (ii) define a regular expression to match the Total lines (footers); Using (i) and (ii) you can join them by the order of appearance index and determine the total size of each chunk (see the line with (x,y)=>new{StartIndex = x.Match.Index, EndIndex = y.Match.Index + y.Match.Length}) below). Use String.Substring method to separate the chunks.
Extract the data from each individual chunk. Knowing that data is split by lines you just have to iterate through all lines in a chunk (ignoring header and footer) and process each line.
This code should help:
// Parse the fare file: cut the text into header..TOTAL chunks, then extract
// PTC, currency and tax from every data line of each chunk.
// All patterns and the path are verbatim strings (@"...").
string file = @"C:\FareSearchRegex.txt";
string text = File.ReadAllText(file);
var headerRegex = new Regex(@"^(\)>)?\s+PTC\s+CUR\s+TAX\s+FARE BASIS$", RegexOptions.IgnoreCase | RegexOptions.Multiline);
var totalRegex = new Regex(@"^\s+TOTAL[\w\s.]+?$",RegexOptions.IgnoreCase | RegexOptions.Multiline);
var lineRegex = new Regex(@"^(?<Num>\d+)?\s+(?<PTC>[A-Z]+)\s+\d+\s(?<Cur>[A-Z]{3})\s+[\d.]+\s+(?<Tax>[\d.]+)",RegexOptions.IgnoreCase | RegexOptions.Multiline);
// Pair the i-th header match with the i-th TOTAL match to get each chunk's extent.
var dataIndices =
    headerRegex.Matches(text).Cast<Match>()
        .Select((m, index) => new{ Index = index, Match = m })
        .Join(totalRegex.Matches(text).Cast<Match>().Select((m, index) => new{ Index = index, Match = m }),
            x => x.Index,
            x => x.Index,
            (x, y) => new{ StartIndex = x.Match.Index, EndIndex = y.Match.Index + y.Match.Length });
// Cut the chunk texts out of the file.
var items = dataIndices
    .Aggregate(new List<string>(), (list, x) =>
    {
        var item = text.Substring(x.StartIndex, x.EndIndex - x.StartIndex);
        list.Add(item);
        return list;
    });
var result = items.SelectMany(x =>
{
    var lines = x.Split(new string[]{Environment.NewLine, "\r", "\n"}, StringSplitOptions.RemoveEmptyEntries);
    return lines.Skip(1) //Skip header
        .Take(lines.Length - 2) // Ignore footer
        .Select(line => lineRegex.Match(line))
        // A line that does not match has an empty Tax group, which
        // Convert.ToDouble would reject with a FormatException; skip it.
        .Where(match => match.Success)
        .Select(match => new
        {
            Ptc = match.Groups["PTC"].Value,
            Cur = match.Groups["Cur"].Value,
            // The pattern only accepts digits and '.', so parse with the invariant culture.
            Tax = Convert.ToDouble(match.Groups["Tax"].Value, System.Globalization.CultureInfo.InvariantCulture)
        });
});

C# Processing Fixed Width Files - Solution Not Working

I have implemented Cuong's solution here:
C# Processing Fixed Width Files
Here is my code:
// Convert a fixed-width file to CSV, deriving the column widths from the first line.
// @fileFull is the identifier fileFull written with the verbatim-identifier prefix.
var lines = File.ReadAllLines(@fileFull);
// GroupBy(c => c) puts every occurrence of a character into one group, adjacent or
// not, so each width is the total count of that character in the header line.
var widthList = lines.First().GroupBy(c => c)
                             .Select(g => g.Count())
                             .ToList();
// Turn the widths into (start index, width) pairs.
var list = new List<KeyValuePair<int, int>>();
int startIndex = 0;
for (int i = 0; i < widthList.Count; i++)
{
    var pair = new KeyValuePair<int, int>(startIndex, widthList[i]);
    list.Add(pair);
    startIndex += widthList[i];
}
// Cut every line (the header line included) at those positions and join with commas.
var csvLines = lines.Select(line => string.Join(",",
    list.Select(pair => line.Substring(pair.Key, pair.Value))));
File.WriteAllLines(filePath + "\\" + fileName + ".csv", csvLines);
Here `fileFull` holds the full path and name of the input file.
The issue I have is the first line of the input file also contains digits. So it could be AAAAAABBC111111111DD2EEEEEE etc. For some reason the output from Cuong's code gives me CSV headings like 1111RRRR and 222223333.
Does anyone know why this is and how I would fix it?
Header row example:
AAAAAAAAAAAAAAAABBBBBBBBBBCCCCCCCCDEFCCCCCCCCCGGGGGGGGHHHHHHHHIJJJJJJJJKKKKLLLLMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOPPPPQQQQ1111RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR222222222333333333444444444555555555666666666777777777888888888999999999S00001111TTTTTTTTTTTTUVWXYZ!"£$$$$$$%&
Converted header row:
AAAAAAAAAAAAAAAA BBBBBBBBBB CCCCCCCCDEFCCCCCC C C C GGGGGGGG HHHHHHHH I JJJJJJJJ KKKK LLLL MMMMMMMMMMMMMMMMMMMMMMMMMMMMMM NNNNNNNNNNNNNNNNNNNNNNNNNNNNNN OOOOOOOOOOOOOOOOOOOOOOOOOOOOOO PPPP QQQQ 1111RRRR RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR2222 222223333 333334444 444445555 555556666 666667777 777778888 888889999 99999S000 0 1111 TTTTTTTTTTTT U V W X Y Z ! ",�,$$$$$$,%,&,"
Jodrell - I implemented your suggestion but the header output is like:
BBBBBBBBBBCCCCCC CCCCCCCCD DEFCCCC GGGGGGGG HHHHHHH IJJJJJJ KKKKLLL LLL MMM NNNNNNNNNNNNNNNNNNNNNNNNNNNNN OOOOOOOOOOOOOOOOOOOOOOOOOOOOO PPPPQQQQ1111RRRRRRRRRRRRRRRRR QQQ 111 RRR 33333333 44444444 55555555 66666666 77777777 88888888 99999999 S0000111 111 TTT UVWXYZ!"�$$ %&
As Jodrell already mentioned, your code doesn't work because it assumes that the character representing each column header is distinct. Changing the code that parses the header widths fixes it.
Replace:
var widthList = lines.First().GroupBy(c => c)
.Select(g => g.Count())
.ToList();
With:
// One width per run of identical consecutive characters in the header line.
var widthList = new List<int>();
var header = lines.First().ToArray();
for (int runStart = 0; runStart < header.Length; )
{
    // Advance runEnd to the first character that differs from the run's character.
    int runEnd = runStart + 1;
    while (runEnd < header.Length && header[runEnd] == header[runStart])
    {
        runEnd++;
    }
    widthList.Add(runEnd - runStart);
    runStart = runEnd;
}
Parsed header columns:
AAAAAAAAAAAAAAAA BBBBBBBBBB CCCCCCCC D E F CCCCCCCCC GGGGGGGG HHHHHHHH I JJJJJJJJ KKKK LLLL MMMMMMMMMMMMMMMMMMMMMMMMMMMMMM NNNNNNNNNNNNNNNNNNNNNNNNNNNNNN OOOOOOOOOOOOOOOOOOOOOOOOOOOOOO PPPP QQQQ 1111 RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR 222222222 333333333 444444444 555555555 666666666 777777777 888888888 999999999 S 0000 1111 TTTTTTTTTTTT U V W X Y Z ! " £ $$$$$$ % &
EDIT
Because the problem annoyed me I wrote some code that handles " and ,. This code replaces the header row with comma delimited alternating zeros and ones. Any commas or double quotes in the body are appropriately escaped.
/// <summary>
/// Converts a fixed-width text file to CSV. Column boundaries are the runs of
/// identical consecutive characters in the first line. The output is written
/// next to the source with the extension ".csv"; its header row consists of
/// alternating runs of '0' and '1', one run per column.
/// </summary>
/// <param name="sourceFile">Path of the fixed-width file.</param>
/// <exception cref="ArgumentNullException"><paramref name="sourceFile"/> is null.</exception>
/// <exception cref="IOException">The destination file already exists.</exception>
static void FixedToCsv(string sourceFile)
{
    if (sourceFile == null)
    {
        throw new ArgumentNullException(nameof(sourceFile));
    }

    var dir = Path.GetDirectoryName(sourceFile);
    var destFile = string.Format(
        "{0}{1}",
        Path.GetFileNameWithoutExtension(sourceFile),
        ".csv");
    if (dir != null)
    {
        destFile = Path.Combine(dir, destFile);
    }

    if (File.Exists(destFile))
    {
        throw new IOException("Destination file already exists: " + destFile);
    }

    using (var input = File.OpenText(sourceFile))
    {
        // Without a header there are no columns; leave without creating any output.
        var header = input.ReadLine();
        if (string.IsNullOrEmpty(header))
        {
            return;
        }

        // Key = start index of the column, Value = its width.
        var blocks = new List<KeyValuePair<int, int>>();
        var headerLine = new StringBuilder();
        var even = false;
        var blockStart = 0;
        for (var i = 1; i <= header.Length; i++)
        {
            // A run ends at the end of the line or where the character changes.
            if (i == header.Length || header[i] != header[blockStart])
            {
                blocks.Add(new KeyValuePair<int, int>(blockStart, i - blockStart));
                if (blockStart > 0)
                {
                    headerLine.Append(',');
                }

                headerLine.Append(even ? '1' : '0', i - blockStart);
                even = !even;
                blockStart = i;
            }
        }

        // CreateNew never overwrites; UTF-8 without a byte order mark.
        using (var output = new StreamWriter(
            new FileStream(destFile, FileMode.CreateNew, FileAccess.Write),
            new UTF8Encoding(false)))
        {
            output.WriteLine(headerLine.ToString());

            // Process body
            var outputLine = new StringBuilder();
            string inputLine;
            while ((inputLine = input.ReadLine()) != null)
            {
                for (var b = 0; b < blocks.Count; b++)
                {
                    // Clamp to the line length so short lines yield short or empty fields
                    // instead of an ArgumentOutOfRangeException.
                    var start = Math.Min(blocks[b].Key, inputLine.Length);
                    var length = Math.Min(blocks[b].Value, inputLine.Length - start);
                    var field = inputLine.Substring(start, length);

                    // Quote fields containing a comma or a double quote; double the quotes.
                    if (field.IndexOf(',') >= 0 || field.IndexOf('"') >= 0)
                    {
                        field = string.Format("\"{0}\"", field.Replace("\"", "\"\""));
                    }

                    if (b > 0)
                    {
                        outputLine.Append(',');
                    }

                    outputLine.Append(field);
                }

                output.WriteLine(outputLine.ToString());
                outputLine.Clear();
            }
        }
    }
}
1 is repeated in your header row, so your two fours get counted as one eight and everything goes wrong from there.
(There is a block of four 1s after the Qs and another block of four 1s after the 0s)
Essentially, your header row is invalid or, at least, doesn't work with the proposed solution.
Okay, you could do something like this.
/// <summary>
/// Converts a fixed-width text file to CSV. Column boundaries are the runs of
/// identical consecutive characters in the first line; every line (the first
/// one included) is cut at those boundaries and joined with commas. The result
/// is written next to the source file with the extension ".csv".
/// </summary>
/// <param name="fullFile">Full path of the fixed-width file.</param>
public void FixedToCsv(string fullFile)
{
    var lines = File.ReadAllLines(fullFile);
    if (lines.Length == 0)
    {
        // Nothing to convert.
        return;
    }

    var firstLine = lines[0];

    // Key = start index of the column, Value = its width.
    var widths = new List<KeyValuePair<int, int>>();
    var runStart = 0;
    for (var i = 1; i <= firstLine.Length; i++)
    {
        // A run ends at the end of the line or where the character changes;
        // checking the end of the line makes sure the last column is recorded too.
        if (i == firstLine.Length || firstLine[i] != firstLine[runStart])
        {
            widths.Add(new KeyValuePair<int, int>(runStart, i - runStart));
            runStart = i;
        }
    }

    var csvLines = lines.Select(line => string.Join(",",
        widths.Select(pair =>
        {
            // Clamp to the line length so lines shorter than the header do not throw.
            var start = Math.Min(pair.Key, line.Length);
            var length = Math.Min(pair.Value, line.Length - start);
            return line.Substring(start, length);
        })));

    // Derive the output location from the input path.
    var filePath = Path.GetDirectoryName(fullFile) ?? string.Empty;
    var fileName = Path.GetFileNameWithoutExtension(fullFile);
    File.WriteAllLines(Path.Combine(filePath, fileName + ".csv"), csvLines);
}

Categories