As input, I have a set of Excel files, each with several worksheets inside. I need to export a single CSV file for each worksheet. Below is my code, which works but is very slow. It builds upon the solutions proposed in this previous post. Please consider that I have to run this on rather big .xlsx files (approx. 300 MB).
QUESTION: Is there any way to improve this?
void Main()
{
string folder = @"\\PATH_TO_FOLDER\";
var files = Directory.GetFiles(folder, "*.xlsx", SearchOption.TopDirectoryOnly);
foreach (string file in files)
{
ConvertToCsv(file, Directory.GetParent(file) + @"\output\");
}
}
public static void ConvertToCsv(string file, string targetFolder)
{
FileInfo finfo = new FileInfo(file);
ExcelPackage package = new ExcelPackage(finfo);
// if targetFolder doesn't exist, create it
if (!Directory.Exists(targetFolder)) {
Directory.CreateDirectory(targetFolder);
}
var worksheets = package.Workbook.Worksheets;
int sheetcount = 0;
foreach (ExcelWorksheet worksheet in worksheets)
{
sheetcount++;
var maxColumnNumber = worksheet.Dimension.End.Column;
var currentRow = new List<string>(maxColumnNumber);
var totalRowCount = worksheet.Dimension.End.Row+1;
var currentRowNum = 1;
//No need for a memory buffer, writing directly to a file
//var memory = new MemoryStream();
string file_name = targetFolder + Path.GetFileNameWithoutExtension(file) + "_" + sheetcount + ".csv";
using (var writer = new StreamWriter(file_name, false, Encoding.UTF8))
{
//the rest of the code remains the same
for (int i = 1; i < totalRowCount; i++)
{
i.Dump(); // LINQPad progress output; note that this runs once per row
// build up the line, using semicolons as separators
string line = "";
for (int j = 1; j < worksheet.Dimension.End.Column+1; j++)
{
if (worksheet.Cells[i, j].Value != null)
{
string cell = worksheet.Cells[i, j].Value.ToString() + ";";
line += cell;
}
}
// write line
writer.WriteLine(line);
}
}
}
}
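Much of the time here likely goes into the per-cell Cells[i, j].Value lookups, the line += string concatenation, and the Dump() call running once per row; the ExcelPackage is also never disposed. Below is a leaner sketch, assuming the same EPPlus API as above (ConvertToCsvFast is a hypothetical name); note that, unlike the loop above, it emits a separator even for empty cells, so columns stay aligned:
public static void ConvertToCsvFast(string file, string targetFolder)
{
    Directory.CreateDirectory(targetFolder); // no-op if the folder already exists
    using (var package = new ExcelPackage(new FileInfo(file)))
    {
        int sheetcount = 0;
        foreach (ExcelWorksheet worksheet in package.Workbook.Worksheets)
        {
            sheetcount++;
            if (worksheet.Dimension == null) continue; // skip empty sheets
            int lastRow = worksheet.Dimension.End.Row;
            int lastCol = worksheet.Dimension.End.Column;
            string csvPath = Path.Combine(targetFolder, Path.GetFileNameWithoutExtension(file) + "_" + sheetcount + ".csv");
            var sb = new StringBuilder();
            using (var writer = new StreamWriter(csvPath, false, Encoding.UTF8))
            {
                for (int row = 1; row <= lastRow; row++)
                {
                    sb.Clear();
                    for (int col = 1; col <= lastCol; col++)
                    {
                        object value = worksheet.Cells[row, col].Value;
                        if (value != null)
                        {
                            sb.Append(value);
                        }
                        sb.Append(';'); // always emit the separator so empty cells keep their column
                    }
                    writer.WriteLine(sb.ToString());
                }
            }
        }
    }
}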
Related
I am trying to insert data into a CSV file. I tried using an XLWorkbook reference to access and insert the data, but I know that XLWorkbook only supports the extensions .xlsx, .xlsm, .xltx and .xltm.
I am trying to find something similar to what I am trying to achieve, through which I can insert data into a specified column in a CSV file. I have used XLWorkbook for some other purpose, but I am not aware of what I can use when I have to work with CSV.
//Accessing the csv file where I am trying to insert data.
string rootPath = Path.GetDirectoryName(Assembly.GetExecutingAssembly().CodeBase);
string filelocation = @"\csv\TestData.csv";
string location = rootPath + filelocation;
XLWorkbook workbook = new XLWorkbook(location);
IXLWorksheet worksheet = workbook.Worksheet("Sheet1");
//Insert data after first row as first row contains column header
int lastrow = worksheet.LastRowUsed().RowNumber() + 1;
//through previous function I am trying get data from database and insert those data into csv cells
worksheet.Cell(String.Format("B{0}", lastrow)).Value = dummydata.FirstName;
worksheet.Cell(String.Format("C{0}", lastrow)).Value = dummydata.LastName;
worksheet.Cell(String.Format("D{0}", lastrow)).Value = dummydata.Address1;
worksheet.Cell(String.Format("E{0}", lastrow)).Value = dummydata.Address2;
worksheet.Cell(String.Format("F{0}", lastrow)).Value = dummydata.City;
worksheet.Cell(String.Format("G{0}", lastrow)).Value = dummydata.StateProvinceCode;
worksheet.Cell(String.Format("H{0}", lastrow)).Value = dummydata.ZipCode;
worksheet.Cell(String.Format("I{0}", lastrow)).Value = dummydata.Country;
worksheet.Cell(String.Format("J{0}", lastrow)).Value = dummydata.HomePhone;
worksheet.Cell(String.Format("L{0}", lastrow)).Value = dummydata.HomePhone;
worksheet.Cell(String.Format("M{0}", lastrow)).Value = dummydata.CellPhone;
worksheet.Cell(String.Format("T{0}", lastrow)).Value = dummydata.Email;
worksheet.Cell(String.Format("U{0}", lastrow)).Value = dummydata.Country;
//After inserting save the file
workbook.Save();
You can simply copy and use this code as is. It should resolve your issues.
Here's the class I developed to replace and/or add csv cells:
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
namespace CSVManager
{
public class CSVWorker
{
private string m_FileName = string.Empty;
public CSVWorker(string fileName)
{
m_FileName = fileName;
}
public void AddCells(int row, int column, string newValue)
{
var encoding = Encoding.GetEncoding("iso-8859-1");
var csvLines = File.ReadAllLines(m_FileName, encoding);
if (row < csvLines.Length)
{
ReplaceCells(row, column, newValue);
}
else
{
using (FileStream stream = new FileStream(m_FileName, FileMode.Create))
{
using (StreamWriter writer = new StreamWriter(stream, encoding))
{
foreach (var line in csvLines)
{
writer.WriteLine(line);
}
int blankLines = row - csvLines.Length; // row is 0-based (as in ReplaceCells), so pad up to index row
for (int i = 0; i < blankLines; i++)
{
writer.WriteLine("");
}
string blankCols = string.Empty;
for (int i = 0; i < column; i++) // column commas put newValue into 0-based column, as in ReplaceCells
{
blankCols += ',';
}
writer.WriteLine(blankCols + newValue);
}
}
}
}
public void ReplaceCells(int row, int column, string newValue)
{
var encoding = Encoding.GetEncoding("iso-8859-1");
var csvLines = File.ReadAllLines(m_FileName, encoding);
for (int i = 0; i < csvLines.Length; i++)
{
List<string> values = csvLines[i].Split(',').ToList();
if (i == row)
{
if (column < values.Count)
{
values[column] = newValue;
}
else
{
// use List<T>.Add here: LINQ's Append() returns a new sequence without
// growing values, which would make this loop run forever
while (values.Count < column)
{
values.Add(string.Empty);
}
values.Add(newValue);
}
using (FileStream stream = new FileStream(m_FileName, FileMode.Create))
{
using (StreamWriter writer = new StreamWriter(stream, encoding))
{
for (int currentLine = 0; currentLine < csvLines.Length; ++currentLine)
{
if (currentLine == i)
{
writer.WriteLine(string.Join(",", values));
}
else
{
writer.WriteLine(csvLines[currentLine]);
}
}
writer.Close();
}
stream.Close();
break;
}
}
}
}
}
}
Here's how I used it:
namespace CSVManager
{
class Program
{
static void Main(string[] args)
{
string fileName = @"C:\Users\mklig\Documents\TestCsv.csv";
CSVWorker csvWorker = new CSVWorker(fileName);
int row = 4;
int col = 4;
string newVal = "success";
//csvWorker.ReplaceCells(row, col, newVal);
csvWorker.AddCells(row, col, newVal);
}
}
}
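One caveat before reusing this class: it splits and joins rows on bare commas, so a quoted field that itself contains a comma is torn apart. A quick illustration:
var cells = "1,\"Smith, John\",NY".Split(',');
// cells.Length is 4, not 3: the quoted name has become two cells
If the data can contain quoted commas or embedded line breaks, a CSV parsing library is the safer route.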
I have a CSV file with 2 million rows and a file size of 2 GB. A couple of free-text columns contain stray CRLFs, and these cause the file to fail to load into the SQL Server table. I get an error that the last column does not end with ".
I have the following code, but it gives an OutOfMemoryException when reading from fileName. The line is:
var lines = File.ReadAllLines(fileName);
How can I fix it? Ideally, I would like to split the file into two files, one with the good rows and one with the bad rows. Or delete the rows that do not end with " followed by CRLF.
int goodRow = 0;
int badRow = 0;
String badRowFileName = fileName.Substring(0, fileName.Length - 4) + "BadRow.csv";
String goodRowFileName = fileName.Substring(0, fileName.Length - 4) + "GoodRow.csv";
var charGood = "\"\"";
String lineOut = string.Empty;
String str = string.Empty;
var lines = File.ReadAllLines(fileName);
StringBuilder sbGood = new StringBuilder();
StringBuilder sbBad = new StringBuilder();
foreach (string line in lines)
{
if (line.Contains(charGood))
{
goodRow++;
sbGood.AppendLine(line);
}
else
{
badRow++;
sbBad.AppendLine(line);
}
}
if (badRow > 0)
{
File.WriteAllText(badRowFileName, sbBad.ToString());
}
if (goodRow > 0)
{
File.WriteAllText(goodRowFileName, sbGood.ToString());
}
sbGood.Clear();
sbBad.Clear();
msg = msg + "Good Rows - " + goodRow.ToString() + " Bad Rows - " + badRow.ToString() + " Done.";
You can translate that code like this to be much more efficient:
int goodRow = 0, badRow = 0;
String badRowFileName = fileName.Substring(0, fileName.Length - 4) + "BadRow.csv";
String goodRowFileName = fileName.Substring(0, fileName.Length - 4) + "GoodRow.csv";
var charGood = "\"\"";
var lines = File.ReadLines(fileName); // lazily enumerates the file; IEnumerable<string> is not IDisposable, so no using block here
using (var swGood = new StreamWriter(goodRowFileName))
using (var swBad = new StreamWriter(badRowFileName))
{
foreach (string line in lines)
{
if (line.Contains(charGood))
{
goodRow++;
swGood.WriteLine(line);
}
else
{
badRow++;
swBad.WriteLine(line);
}
}
}
msg += $"Good Rows: {goodRow,9} Bad Rows: {badRow,9} Done.";
But I'd also look at using a real csv parser for this. There are plenty on NuGet. That might even let you clean up the data on the fly.
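As an illustration, here is a sketch using CsvHelper (one of the NuGet options; csv.Parser.Record is the API of recent versions, and cleanFileName is a hypothetical output path). Because the parser understands quoted fields, a CRLF inside a free-text column no longer breaks the record, and the embedded line breaks can be stripped on the fly:
using (var reader = new StreamReader(fileName))
using (var csv = new CsvHelper.CsvReader(reader, System.Globalization.CultureInfo.InvariantCulture))
using (var writer = new StreamWriter(cleanFileName))
using (var csvOut = new CsvHelper.CsvWriter(writer, System.Globalization.CultureInfo.InvariantCulture))
{
    while (csv.Read())
    {
        // one logical record, even if its free-text fields contained CRLFs
        foreach (string field in csv.Parser.Record)
        {
            // replace embedded line breaks so SQL Server sees one physical line per row
            csvOut.WriteField(field.Replace("\r", " ").Replace("\n", " "));
        }
        csvOut.NextRecord();
    }
}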
I would not suggest reading the entire file into memory, then processing the file, then writing all modified contents out to the new file.
Instead, use file streams:
using (var rdr = new StreamReader(fileName))
using (var wrtrGood = new StreamWriter(goodRowFileName))
using (var wrtrBad = new StreamWriter(badRowFileName))
{
string line = null;
while ((line = rdr.ReadLine()) != null)
{
if (line.Contains(charGood))
{
goodRow++;
wrtrGood.WriteLine(line);
}
else
{
badRow++;
wrtrBad.WriteLine(line);
}
}
}
I am writing a PDF to Word converter which works perfectly fine for me, but I want to be able to convert more than one file.
What happens now is that it only reads the first file and does the convert process for that one.
public static void PdfToImage()
{
try
{
Application application = null;
application = new Application();
var doc = application.Documents.Add();
string path = @"C:\Users\Test\Desktop\pdfToWord\";
foreach (string file in Directory.EnumerateFiles(path, "*.pdf"))
{
using (var document = PdfiumViewer.PdfDocument.Load(file))
{
int pagecount = document.PageCount;
for (int index = 0; index < pagecount; index++)
{
var image = document.Render(index, 200, 200, true);
image.Save(@"C:\Users\chnikos\Desktop\pdfToWord\output" + index.ToString("000") + ".png", ImageFormat.Png);
application.Selection.InlineShapes.AddPicture(@"C:\Users\chnikos\Desktop\pdfToWord\output" + index.ToString("000") + ".png");
}
string getFileName = file.Substring(file.LastIndexOf("\\"));
string getFileWithoutExtras = Regex.Replace(getFileName, @"\\", "");
string getFileWithoutExtension = Regex.Replace(getFileWithoutExtras, @"\.pdf", ""); // escape the dot so the regex strips exactly ".pdf"
string fileName = @"C:\Users\Test\Desktop\pdfToWord\" + getFileWithoutExtension;
doc.PageSetup.PaperSize = WdPaperSize.wdPaperA4;
foreach (Microsoft.Office.Interop.Word.InlineShape inline in doc.InlineShapes)
{
if (inline.Height > inline.Width)
{
inline.ScaleWidth = 250;
inline.ScaleHeight = 250;
}
}
doc.PageSetup.TopMargin = 28.29f;
doc.PageSetup.LeftMargin = 28.29f;
doc.PageSetup.RightMargin = 30.29f;
doc.PageSetup.BottomMargin = 28.29f;
application.ActiveDocument.SaveAs(fileName, WdSaveFormat.wdFormatDocument);
doc.Close();
}
}
}
catch (Exception ex)
{
// the catch block was not included in the post; this minimal one lets the method compile
Console.WriteLine(ex.Message);
}
}
I thought that with my foreach this problem should not occur. And yes, there is more than one PDF in the folder.
The line
var doc = application.Documents.Add();
is outside the foreach loop. So you only create a single word document for all your *.pdf files.
Move the above line inside the foreach loop to add a new word document for each *.pdf file.
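A minimal sketch of the restructure, reusing the interop calls from the question (fileName is computed per file exactly as there):
foreach (string file in Directory.EnumerateFiles(path, "*.pdf"))
{
    var doc = application.Documents.Add(); // one fresh Word document per PDF
    using (var document = PdfiumViewer.PdfDocument.Load(file))
    {
        // ... render the pages and add the images to doc, as in the question ...
    }
    doc.SaveAs(fileName, WdSaveFormat.wdFormatDocument);
    doc.Close(); // close this document before moving on to the next PDF
}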
I am having the hardest time figuring out how to do this.
I have a listbox with a lot of data in it. I want to take this listbox and then have a button to save it.
The button will choose the directory to put the files in. Afterwards, the program should start saving these values into text files with the naming scheme Seed1.txt, Seed2.txt, etc.
The thing is, I would like to put only 100 items into each text file that is generated until the list is done.
For saving the path I have:
Stream s;
string folderPath = string.Empty;
using (FolderBrowserDialog fdb = new FolderBrowserDialog())
{
if (fdb.ShowDialog() == DialogResult.OK)
{
folderPath = fdb.SelectedPath;
MessageBox.Show(folderPath);
}
}
For saving everything in one shot, I believe this will work:
int total = list_failed.Items.Count;
for (int i = 0; i < list_failed.Items.Count; i++)
{
StreamWriter text = new StreamWriter(s);
text.Write(list_failed.Items[i]);
s.Close();
}
I'm not sure about the rest though. Something like this for the filenames, perhaps:
string filename;
int i = 0;
do
{
filename = "Seed" + ++i + ".txt";
} while (files.Contains(filename));
Here's a working example that you can use.
string pathname = Server.MapPath("/");
int counter = 1;
string file = String.Empty;
List<string> list = new List<string>();
//Add the list items
for (int i = 0; i <= 1234; i++)
{
list.Add(String.Format("item {0}", i));
}
//write to file
for (int i = 0; i < list.Count; i++)
{
//generate a dynamic filename with path
file = String.Format("{0}Seed{1}.txt", pathname, counter);
//the using statement closes the StreamWriter when it completes the process
using (StreamWriter text = new StreamWriter(file, true))
{
//WriteLine, so each item ends up on its own line
text.WriteLine(list[i]);
}
//move on to the next file once 100 lines have been written
if ((i + 1) % 100 == 0) counter++;
}
string folderPath;
const int ITEMS_PER_FILE=100;
void AskUserForFolder()
{
folderPath = string.Empty;
using (FolderBrowserDialog fdb = new FolderBrowserDialog())
{
if (fdb.ShowDialog() == DialogResult.OK)
{
folderPath = fdb.SelectedPath;
// MessageBox.Show(folderPath);
}
}
}
void SaveItems(ListBox listBox, int seed)
{
int total = listBox.Items.Count;
//round up so that a final partial batch still gets its own file
int fileCount = (total + ITEMS_PER_FILE - 1) / ITEMS_PER_FILE;
for (int f = 0; f < fileCount; ++f)
{
//GetFilePath already returns a full path, so it is not prefixed with folderPath again
using (StreamWriter sw = new StreamWriter(GetFilePath(folderPath, "Seed", "txt", ref seed)))
{
for (int i = 0; i < ITEMS_PER_FILE && i + (ITEMS_PER_FILE * f) < total; i++)
{
sw.WriteLine(listBox.Items[i + (ITEMS_PER_FILE * f)]);
}
}
}
}
//I'm not sure about the rest though. Something like this for the filenames perhaps
/// <summary>
/// Gets a filename that has not been used before by incrementing a number at the end of the filename
/// </summary>
/// <param name="seed">seed is passed by reference and acts as a starting point to iterate through the list.
/// By passing it in as a reference we save ourselves from having to iterate unnecessarily from the start each time.
/// </param>
/// <returns>the path of the file</returns>
string GetFilePath(string folderPath, string fileName, string extension, ref int seed)
{
FileInfo fi = new FileInfo(string.Format("{0}\\{1}{2}.{3}", folderPath, fileName, seed,extension));
while (fi.Exists)
{
fi = new FileInfo(string.Format("{0}\\{1}{2}.{3}", folderPath, fileName, ++seed,extension));
}
return fi.FullName;
}
Try this to iterate over ListBox items and put them in files with up to 100 items:
private void writeItemsToFile(ListBox lb)
{
string path = #"c:\test\";
string filename = "seed";
int itemCounter = 0;
int fileCounter = 1;
StreamWriter sw = new StreamWriter(File.OpenWrite(System.IO.Path.Combine(path,string.Format(filename+"{0}.txt",fileCounter))));
foreach (var s in lb.Items)
{
if (itemCounter >= 100) // roll over after 100 items, not 101
{
fileCounter++;
itemCounter = 0;
sw.Flush();
sw.Close();
sw.Dispose();
sw = null;
sw = new StreamWriter(File.OpenWrite(System.IO.Path.Combine(path,string.Format(filename+"{0}.txt",fileCounter))));
}
sw.WriteLine(s.ToString());
itemCounter++;
}
if (sw != null)
{
sw.Flush();
sw.Dispose();
}
}
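If the manual Flush/Close/Dispose bookkeeping feels fragile, the same 100-items-per-file split can be expressed with one using block per file. A sketch under the same assumptions as above (hard-coded path, seedN.txt naming; needs System.Linq):
private void writeItemsToFileBatched(ListBox lb)
{
    string path = @"c:\test\";
    var items = lb.Items.Cast<object>().Select(o => o.ToString()).ToList();
    int fileCounter = 1;
    for (int start = 0; start < items.Count; start += 100, fileCounter++)
    {
        // the writer lives exactly as long as its file's batch of items
        using (var sw = new StreamWriter(Path.Combine(path, string.Format("seed{0}.txt", fileCounter))))
        {
            foreach (string item in items.Skip(start).Take(100))
            {
                sw.WriteLine(item);
            }
        }
    }
}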
Morning,
I'm trying to split a large text file (15,000,000 rows) using StreamReader/StreamWriter. Is there a quicker way?
I tested it with 130,000 rows and it took 2 min 40 sec, which implies 15,000,000 rows will take approximately 5 hours. That seems a bit excessive.
//Perform split.
public void SplitFiles(int[] newFiles, string filePath, int processorCount)
{
using (StreamReader Reader = new StreamReader(filePath))
{
for (int i = 0; i < newFiles.Length; i++)
{
string extension = System.IO.Path.GetExtension(filePath);
string temp = filePath.Substring(0, filePath.Length - extension.Length)
+ i.ToString();
string FilePath = temp + extension;
if (!File.Exists(FilePath))
{
for (int x = 0; x < newFiles[i]; x++)
{
DataWriter(Reader.ReadLine(), FilePath);
}
}
else
{
return;
}
}
}
}
public void DataWriter(string rowData, string filePath)
{
bool appendData = true;
using (StreamWriter sr = new StreamWriter(filePath, appendData))
{
sr.WriteLine(rowData);
}
}
Thanks for your help.
You haven't made it very clear, but I'm assuming that the value of each element of the newFiles array is the number of lines to copy from the original into that file. Note that currently you don't detect the situation where there's either extra data at the end of the input file, or it's shorter than expected. I suspect you want something like this:
public void SplitFiles(int[] newFiles, string inputFile)
{
string baseName = Path.GetFileNameWithoutExtension(inputFile);
string extension = Path.GetExtension(inputFile);
using (TextReader reader = File.OpenText(inputFile))
{
for (int i = 0; i < newFiles.Length; i++)
{
string outputFile = baseName + i + extension;
if (File.Exists(outputFile))
{
// Better than silently returning, I'd suggest...
throw new IOException("File already exists: " + outputFile);
}
int linesToCopy = newFiles[i];
using (TextWriter writer = File.CreateText(outputFile))
{
for (int j = 0; j < linesToCopy; j++)
{
string line = reader.ReadLine();
if (line == null)
{
return; // Premature end of input
}
writer.WriteLine(line);
}
}
}
}
}
Note that this still won't detect if there's any unconsumed input... it's not clear what you want to do in that situation.
One option for code clarity is to extract the middle of this into a separate method:
public void SplitFiles(int[] newFiles, string inputFile)
{
string baseName = Path.GetFileNameWithoutExtension(inputFile);
string extension = Path.GetExtension(inputFile);
using (TextReader reader = File.OpenText(inputFile))
{
for (int i = 0; i < newFiles.Length; i++)
{
string outputFile = baseName + i + extension;
// Could put this into the CopyLines method if you wanted
if (File.Exists(outputFile))
{
// Better than silently returning, I'd suggest...
throw new IOException("File already exists: " + outputFile);
}
CopyLines(reader, outputFile, newFiles[i]);
}
}
}
private static void CopyLines(TextReader reader, string outputFile, int count)
{
using (TextWriter writer = File.CreateText(outputFile))
{
for (int i = 0; i < count; i++)
{
string line = reader.ReadLine();
if (line == null)
{
return; // Premature end of input
}
writer.WriteLine(line);
}
}
}
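For completeness, a hypothetical call: splitting data.csv into three pieces of one million lines each (producing data0.csv, data1.csv and data2.csv next to the input):
SplitFiles(new[] { 1000000, 1000000, 1000000 }, "data.csv");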
There are utilities for splitting files that may outperform your solution - e.g. search for "split file by line".
If they don't suit, there are solutions for loading all the source file into memory and then writing out the files but that probably isn't appropriate given the size of the source file.
In terms of improving your code, a minor improvement would be the generation of the destination file path (and also clarifying the confusion between the source filePath you use and the destination files). You don't need to re-establish the source file's extension on every pass through the loop.
The second improvement (and probably the more significant one, as highlighted by commenters) is how you write out the destination files. Each destination file takes a different number of lines from the source (the value in the corresponding newFiles entry), so for each entry, read all the source lines relevant to the next destination file, then write that destination out once, rather than repeatedly opening the destination file. You could gather the lines in a StringBuilder/List etc., or alternatively write them directly out to the destination file (but only opening it once).
public void SplitFiles(int[] newFiles, string sourceFilePath, int processorCount)
{
string sourceDirectory = System.IO.Path.GetDirectoryName(sourceFilePath);
string sourceFileName = System.IO.Path.GetFileNameWithoutExtension(sourceFilePath);
string extension = System.IO.Path.GetExtension(sourceFilePath);
using (StreamReader Reader = new StreamReader(sourceFilePath))
{
for (int i = 0; i < newFiles.Length; i++)
{
string destinationFileNameWithExtension = string.Format("{0}{1}{2}", sourceFileName, i, extension);
string destinationFilePath = System.IO.Path.Combine(sourceDirectory, destinationFileNameWithExtension);
if (!File.Exists(destinationFilePath))
{
// Read all the lines relevant to this destination file
// and temporarily store them in memory
StringBuilder destinationText = new StringBuilder();
for (int x = 0; x < newFiles[i]; x++)
{
destinationText.AppendLine(Reader.ReadLine()); // AppendLine, so the copied rows keep their line breaks
}
DataWriter(destinationFilePath, destinationText.ToString());
}
else
{
return;
}
}
}
}
private static void DataWriter(string destinationFilePath, string content)
{
using (StreamWriter writer = new StreamWriter(destinationFilePath))
{
writer.Write(content);
}
}
I've recently had to do this for several hundred files under 2 GB each (up to 1.92 GB), and the fastest method I found (if you have the memory available) is StringBuilder. All the other methods I tried were painfully slow.
Please note that this is memory dependent. Adjust "CurrentPosition = 130000" accordingly.
string CurrentLine = String.Empty;
int CurrentPosition = 0;
int CurrentSplit = 0;
foreach (string file in Directory.GetFiles(@"C:\FilesToSplit"))
{
StringBuilder sb = new StringBuilder();
using (StreamReader sr = new StreamReader(file))
{
while ((CurrentLine = sr.ReadLine()) != null)
{
if (CurrentPosition == 130000) // Or whatever you want to split by.
{
using (StreamWriter sw = new StreamWriter(@"C:\FilesToSplit\SplitFiles\" + Path.GetFileNameWithoutExtension(file) + "-" + CurrentSplit + Path.GetExtension(file))) // GetExtension already includes the dot
{
// Append this line too, so we don't lose it (AppendLine keeps the line break).
sb.AppendLine(CurrentLine);
// Write the StringBuilder contents
sw.Write(sb.ToString());
// Clear the StringBuilder buffer, so it doesn't get too big. You can adjust this based on your computer's available memory.
sb.Clear();
// Increment the CurrentSplit number.
CurrentSplit++;
// Reset the current line position. We've found 130,001 lines of text.
CurrentPosition = 0;
}
}
else
{
sb.AppendLine(CurrentLine);
CurrentPosition++;
}
}
}
// Write out any lines left over after the last full split, so the tail of the file isn't lost.
if (sb.Length > 0)
{
using (StreamWriter sw = new StreamWriter(@"C:\FilesToSplit\SplitFiles\" + Path.GetFileNameWithoutExtension(file) + "-" + CurrentSplit + Path.GetExtension(file)))
{
sw.Write(sb.ToString());
}
sb.Clear();
}
// Reset the counters at the end of each file, otherwise the numbering quickly goes out of order.
CurrentPosition = 0;
CurrentSplit = 0;
}
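For comparison, the same split can be done without the StringBuilder buffer by streaming with File.ReadLines, which enumerates the file lazily instead of loading it into memory. A sketch, assuming the same folders and the 130,000-line split size:
int splitSize = 130000;
foreach (string file in Directory.GetFiles(@"C:\FilesToSplit"))
{
    int part = 0;
    int lineNumber = 0;
    StreamWriter sw = null;
    try
    {
        foreach (string line in File.ReadLines(file)) // lazy: one line in memory at a time
        {
            if (lineNumber % splitSize == 0)
            {
                if (sw != null) sw.Dispose();
                sw = new StreamWriter(Path.Combine(@"C:\FilesToSplit\SplitFiles", Path.GetFileNameWithoutExtension(file) + "-" + part + Path.GetExtension(file)));
                part++;
            }
            sw.WriteLine(line);
            lineNumber++;
        }
    }
    finally
    {
        if (sw != null) sw.Dispose();
    }
}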