I am inputting a text file into a DataTable and then using SqlBulkCopy to copy to a Database. While BulkCopy is fast, inserting 50000+ lines into DataTable is not (around 5 mins). How do I make it efficient?
Can I insert data into the DataTable quickly?
If not, is there a way to save the inserted data permanently into the DataTable so I don't have to insert it every time I run the program?
for (; i < fares.Length; )
{
k = i;
Console.WriteLine("Inserting " + k + " out of " + (fares.Length));
for (; i <= (k + 3); i++)
{
if (i % 4 == 0)
{
for (int j = 0; j < fares.Length - 1; j++)
{
{
int space = fares[i].IndexOf(" ");
startStation = fares[i].Substring(0, space);
endStation = fares[i].Substring(space + 1, fares[i].Length - space - 1);
}
}
}
else if (i % 4 == 1)
{
valueFare = fares[i];
}
else if (i % 4 == 2)
{
standardFare = fares[i];
}
else if (i % 4 == 3)
{
time = int.Parse(fares[i]);
}
}
faresDT.Rows.Add(startStation, endStation, valueFare, standardFare, time);
If what you want is to optimize your load to the database, I suggest that you get rid of the DataTable completely. By making use of Marc Gravell's FastMember (and anyone who's using SqlBulkCopy should be using FastMember IMHO) you can get a DataReader directly from any IEnumerable.
I would use some variation of the below code whenever writing from a file directly to a database. The below code will stream the contents of the file directly to the SqlBulkCopy operation thru the clever use of yield returns and lazy load of IEnumerable.
using System;
using System.Collections.Generic;
using System.Data.SqlClient;
using System.IO;
using System.Text;
using FastMember;
namespace BulkCopyTest
{
public class Program
{
public static void Main(string[] args)
{
const string filePath = "SOME FILE THAT YOU WANT TO LOAD TO A DB";
WriteData(GetData<dynamic>(filePath));
}
private static void WriteData<T>(IEnumerable<T> data)
{
using (var bcp = new SqlBulkCopy(GetConnection(), SqlBulkCopyOptions.TableLock, null))
using (var reader = ObjectReader.Create(data))
{
SetColumnMappings<T>(bcp.ColumnMappings);
bcp.BulkCopyTimeout = 300;
bcp.BatchSize = 150000;
bcp.DestinationTableName = ""; //TODO: Set correct TableName
bcp.WriteToServer(reader);
}
}
private static void SetColumnMappings<T>(SqlBulkCopyColumnMappingCollection mappings)
{
//Setup your column mappings
}
private static IEnumerable<T> GetData<T>(string filePath)
{
using (var fileStream = File.OpenRead(filePath))
using (var reader = new StreamReader(fileStream, Encoding.UTF8))
{
string line;
while ((line = reader.ReadLine()) != null)
{
//TODO: Add actual parsing logic and whatever else is needed to create an instance of T
yield return Activator.CreateInstance<T>();
}
}
}
private static SqlConnection GetConnection()
{
return new SqlConnection(new SqlConnectionStringBuilder
{
//TODO: Set Connection information here
}.ConnectionString);
}
}
}
In this case I think you should take advantage of the BeginLoadData, LoadDataRow and EndLoadData methods provided in the DataTable class, you could use them like this:
try
{
faresDT.BeginLoadData();
// Your for loop...
{
// Logic defining the value of startStation, endStation, valueFare, standardFare and time removed for briefness.
faresDT.LoadDataRow(new object[] {startStation, endStation, valueFare, standardFare, time}, true);
}
}
finally
{
faresDT.EndLoadData();
}
What BeginLoadData() does is turning off some processing that happens every time you add a row, and only does it once when you are done loading data by calling EndLoadData().
You can find more details about these APIs here:
https://learn.microsoft.com/en-us/dotnet/api/system.data.datatable.loaddatarow?view=netframework-4.7.2
Related
I have created a list in c#, now I need to save the list in text file with the index for each item in the list? please explain with a simple example.
Try this Code: i hope you will get the basic idea from it.
namespace ConsoleApp
{
class Program
{
static void Main(string[] args)
{
List<string> _names = new List<string>()
{
"Rehan",
"Hamza",
"Adil",
"Arif",
"Hamid",
"Hadeed"
};
using (StreamWriter outputFile = new StreamWriter(#"E:\test.txt")
{
foreach (string line in _names)
outputFile.WriteLine(line);
}
}
}
}
OR you should try for loop as well.
namespace ConsoleApp
{
class Program
{
static void Main(string[] args)
{
List<string> _names = new List<string>()
{
"Rehan",
"Hamza",
"Adil",
"Arif",
"Hamid",
"Hadeed"
};
using (StreamWriter outputFile = new StreamWriter(#"E:\test.txt")
{
for (int index = 0; index < _names.Count; index++)
outputFile.WriteLine("Index : " + index + " - " + _names[index]);
}
}
}
}
According to your Comment Below:
How to save a list data into a SQL Server table. You can follow the
same principle from the above code:
Code:
namespace ConsoleApp
{
class Program
{
static void Main(string[] args)
{
// Table
// -------------
// | ID | Name |
// -------------
// Please Not that: ID Column in a database should not be identity Colomn because in this example i am going to add data to ID Column explicity...
// List of name that we are going to save in Database.
List<string> _names = new List<string>()
{
"Rehan",
"Hamza",
"Adil",
"Arif",
"Hamid",
"Hadeed"
};
SqlConnection connection = new SqlConnection("Connection string goes here...");
connection.Open();
for (int index = 0; index < _names.Count; index++)
{
SqlCommand command = new SqlCommand("INSERT INTO tbl_names (id,name) VALUES ('"+index+"', '"+_names[index]+"')",connection);
command.ExecuteNonQuery();
}
connection.Close();
}
}
}
Note: by using this syntax new SqlCommand("INSERT INTO tbl_names... there is a Chance of SQL Injection so by avoiding that you can use Store Procedure instead....
I have a working solution for uploading a CSV file. Currently, I use the IFormCollection for a user to upload multiple CSV files from a view.
The CSV files are saved as a temp file as follows:
List<string> fileLocations = new List<string>();
foreach (var formFile in files)
{
filePath = Path.GetTempFileName();
if (formFile.Length > 0)
{
using (var stream = new FileStream(filePath, FileMode.Create))
{
await formFile.CopyToAsync(stream);
}
}
fileLocations.Add(filePath);
}
I send the list of file locations to another method (just below). I loop through the file locations and stream the data from the temp files, I then use a data table and SqlBulkCopyto insert the data. I currently upload between 50 and 200 files at a time and each file is around 330KB. To insert a hundred, it takes around 6 minutes, which is around 30-35MB.
public void SplitCsvData(string fileLocation, Guid uid)
{
MetaDataModel MetaDatas;
List<RawDataModel> RawDatas;
var reader = new StreamReader(File.OpenRead(fileLocation));
List<string> listRows = new List<string>();
while (!reader.EndOfStream)
{
listRows.Add(reader.ReadLine());
}
var metaData = new List<string>();
var rawData = new List<string>();
foreach (var row in listRows)
{
var rowName = row.Split(',')[0];
bool parsed = int.TryParse(rowName, out int result);
if (parsed == false)
{
metaData.Add(row);
}
else
{
rawData.Add(row);
}
}
//Assigns the vertical header name and value to the object by splitting string
RawDatas = GetRawData.SplitRawData(rawData);
SaveRawData(RawDatas);
MetaDatas = GetMetaData.SplitRawData(rawData);
SaveRawData(RawDatas);
}
This code then passes the object to the to create the datatable and insert the data.
private DataTable CreateRawDataTable
{
get
{
var dt = new DataTable();
dt.Columns.Add("Id", typeof(int));
dt.Columns.Add("SerialNumber", typeof(string));
dt.Columns.Add("ReadingNumber", typeof(int));
dt.Columns.Add("ReadingDate", typeof(string));
dt.Columns.Add("ReadingTime", typeof(string));
dt.Columns.Add("RunTime", typeof(string));
dt.Columns.Add("Temperature", typeof(double));
dt.Columns.Add("ProjectGuid", typeof(Guid));
dt.Columns.Add("CombineDateTime", typeof(string));
return dt;
}
}
public void SaveRawData(List<RawDataModel> data)
{
DataTable dt = CreateRawDataTable;
var count = data.Count;
for (var i = 1; i < count; i++)
{
DataRow row = dt.NewRow();
row["Id"] = data[i].Id;
row["ProjectGuid"] = data[i].ProjectGuid;
row["SerialNumber"] = data[i].SerialNumber;
row["ReadingNumber"] = data[i].ReadingNumber;
row["ReadingDate"] = data[i].ReadingDate;
row["ReadingTime"] = data[i].ReadingTime;
row["CombineDateTime"] = data[i].CombineDateTime;
row["RunTime"] = data[i].RunTime;
row["Temperature"] = data[i].Temperature;
dt.Rows.Add(row);
}
using (var conn = new SqlConnection(connectionString))
{
conn.Open();
using (SqlTransaction tr = conn.BeginTransaction())
{
using (var sqlBulk = new SqlBulkCopy(conn, SqlBulkCopyOptions.Default, tr))
{
sqlBulk.BatchSize = 1000;
sqlBulk.DestinationTableName = "RawData";
sqlBulk.WriteToServer(dt);
}
tr.Commit();
}
}
}
Is there another way to do this or a better way to improve performance so that the time to upload is reduced as it can take a long time and I am seeing an ever increasing use of memory to around 500MB.
TIA
You can improve performance by removing the DataTable and reading from the input stream directly.
SqlBulkCopy has a WriteToServer overload that accepts an IDataReader instead of an entire DataTable.
CsvHelper can CSV files using a StreamReader as an input. It provides CsvDataReader as an IDataReader implementation on top of the CSV data. This allows reading directly from the input stream and writing to SqlBulkCopy.
The following method will read from an IFormFile, parse the stream using CsvHelper and use the CSV's fields to configure a SqlBulkCopy instance :
public async Task ToTable(IFormFile file, string table)
{
using (var stream = file.OpenReadStream())
using (var tx = new StreamReader(stream))
using (var reader = new CsvReader(tx))
using (var rd = new CsvDataReader(reader))
{
var headers = reader.Context.HeaderRecord;
var bcp = new SqlBulkCopy(_connection)
{
DestinationTableName = table
};
//Assume the file headers and table fields have the same names
foreach(var header in headers)
{
bcp.ColumnMappings.Add(header, header);
}
await bcp.WriteToServerAsync(rd);
}
}
This way nothing is ever written to a temp table or cached in memory. The uploaded files are parsed and written to the database directly.
In addition to #Panagiotis's answer, why don't you interleave your file processing with the file upload? Wrap up your file processing logic in an async method and change the loop to a Parallel.Foreach and process each file as it arrives instead of waiting for all of them?
private static readonly object listLock = new Object(); // only once at class level
List<string> fileLocations = new List<string>();
Parallel.ForEach(files, (formFile) =>
{
filePath = Path.GetTempFileName();
if (formFile.Length > 0)
{
using (var stream = new FileStream(filePath, FileMode.Create))
{
await formFile.CopyToAsync(stream);
}
await ProcessFileInToDbAsync(filePath);
}
// Added lock for thread safety of the List
lock (listLock)
{
fileLocations.Add(filePath);
}
});
Thanks to #Panagiotis Kanavos, I was able to work out what to do. Firstly, the way I was calling the methods, was leaving them in memory. The CSV file I have is in two parts, vertical metadata and then the usual horizontal information. So I needed to split them into two. Saving them as tmp files was also causing an overhead. It has gone from taking 5-6 minutes to now taking a minute, which for a 100 files containing 8,500 rows isn't bad I suppose.
Calling the method:
public async Task<IActionResult> UploadCsvFiles(ICollection<IFormFile> files, IFormCollection fc)
{
foreach (var f in files)
{
var getData = new GetData(_configuration);
await getData.SplitCsvData(f, uid);
}
return whatever;
}
This is the method doing the splitting:
public async Task SplitCsvData(IFormFile file, string uid)
{
var data = string.Empty;
var m = new List<string>();
var r = new List<string>();
var records = new List<string>();
using (var stream = file.OpenReadStream())
using (var reader = new StreamReader(stream))
{
while (!reader.EndOfStream)
{
var line = reader.ReadLine();
var header = line.Split(',')[0].ToString();
bool parsed = int.TryParse(header, out int result);
if (!parsed)
{
m.Add(line);
}
else
{
r.Add(line);
}
}
}
//TODO: Validation
//This splits the list into the Meta data model. This is just a single object, with static fields.
var metaData = SplitCsvMetaData.SplitMetaData(m, uid);
DataTable dtm = CreateMetaData(metaData);
var serialNumber = metaData.LoggerId;
await SaveMetaData("MetaData", dtm);
//
var lrd = new List<RawDataModel>();
foreach (string row in r)
{
lrd.Add(new RawDataModel
{
Id = 0,
SerialNumber = serialNumber,
ReadingNumber = Convert.ToInt32(row.Split(',')[0]),
ReadingDate = Convert.ToDateTime(row.Split(',')[1]).ToString("yyyy-MM-dd"),
ReadingTime = Convert.ToDateTime(row.Split(',')[2]).ToString("HH:mm:ss"),
RunTime = row.Split(',')[3].ToString(),
Temperature = Convert.ToDouble(row.Split(',')[4]),
ProjectGuid = uid.ToString(),
CombineDateTime = Convert.ToDateTime(row.Split(',')[1] + " " + row.Split(',')[2]).ToString("yyyy-MM-dd HH:mm:ss")
});
}
await SaveRawData("RawData", lrd);
}
I then use a data table for the metadata (which takes 20 seconds for a 100 files) as I map the field names to the columns.
public async Task SaveMetaData(string table, DataTable dt)
{
using (SqlBulkCopy sqlBulk = new SqlBulkCopy(_configuration.GetConnectionString("DefaultConnection"), SqlBulkCopyOptions.Default))
{
sqlBulk.DestinationTableName = table;
await sqlBulk.WriteToServerAsync(dt);
}
}
I then use FastMember for the large data parts for the raw data, which is more like a traditional CSV.
public async Task SaveRawData(string table, IEnumerable<LogTagRawDataModel> lrd)
{
using (SqlBulkCopy sqlBulk = new SqlBulkCopy(_configuration.GetConnectionString("DefaultConnection"), SqlBulkCopyOptions.Default))
using (var reader = ObjectReader.Create(lrd, "Id","SerialNumber", "ReadingNumber", "ReadingDate", "ReadingTime", "RunTime", "Temperature", "ProjectGuid", "CombineDateTime"))
{
sqlBulk.DestinationTableName = table;
await sqlBulk.WriteToServerAsync(reader);
}
}
I am sure this can be improved on, but for now, this works really well.
Below is my class:
MsSql.cs:
public class MSSqlBLL
{
public static long RowsCopied { get; set; }
public long BulkCopy()
{
using (SqlBulkCopy bulkCopy = new SqlBulkCopy(conn))
{
bulkCopy.DestinationTableName = "dbo.Table1";
bulkCopy.BatchSize = 100;
bulkCopy.SqlRowsCopied +=
new SqlRowsCopiedEventHandler(OnSqlRowsCopied);
bulkCopy.NotifyAfter = 100;
try
{
bulkCopy.WriteToServer(reader);
}
return RowsCopied;
}
}
private static void OnSqlRowsCopied(object sender, SqlRowsCopiedEventArgs e)
{
RowsCopied = RowsCopied + e.RowsCopied;
}
}
I am calling BulkCopy function from this class and i want to get currently processed record in my affected records variable.
For eg :For each iteration of loop i would like to get affected records in my affectedRows variable.
public class MySqlBLL
{
public void GetTotalRows()
{
int totalRecords = 500;
var table = "Table1";
for (int i = 0; i < totalRecords / 100; i++) //
{
query = "SELECT * FROM " + table + " LIMIT " + 0 + "," + 100;
var reader = Execute(conn, query);
long affectedRecords = msSql.BulkCopy();
reader.Close();
}
}
}
In the above method i am sending chunk by chunk data to BulkCopy method to perform bulk copy but for each bulk copy i would like to get number of records that are processed by bulk copy but the problem is i am getting 0 in affectedRecords variable.
I want to get access current rows processed by sql bulk copy.
The RowsCopied property is only updated after 100 records are copied (as set using NotifyAfter). If you place
Console.WriteLine("Copied {0} so far...", e.RowsCopied);
in OnSqlRowsCopied event handler you will get ongoing progress in case of Console app.
But in your case you can simply select count(*) from source table to show the count.
-Source
EDIT: I am using SharpDevelop
I am new to C# so the answer may be an easy one...I have some code (below) and the WHILE loop runs just fine. The problem is that once the processing in the WHILE loop has finished, no more code is executed. If I put a breakpoint on my 'cn.Open(); line and run the program, I never hit that breakpoint. If I put a breakpoint on the curly bracket '}' just above the 'cn.Open();' line, the code will stop each time I hit that breakpoint. I am not sure how to get my additional code to run.
void MainFormLoad(object sender, EventArgs e)
{
DataTable dt = new DataTable();
string line = null;
int i = 0;
SqlConnection cn = new SqlConnection("Integrated Security=SSPI;Persist Security Info=False;Initial Catalog=Sandbox;Data Source=test");
StreamReader sr = File.OpenText(#"C:\Users\rl\Desktop\TEST_I~1.CSV");
while ((line = sr.ReadLine()) != null)
{
string[] data = line.Split(',');
if (data.Length > 0)
{
if (i == 0)
{
foreach (var item in data)
{
dt.Columns.Add(item.ToString());
}
i++;
}
DataRow row = dt.NewRow();
row.ItemArray = data;
dt.Rows.Add(row);
}
}
cn.Open();
SqlBulkCopy copy = new SqlBulkCopy(cn);
{
// copy.ColumnMappings.Add(0, 0);
// copy.ColumnMappings.Add(1, 1);
// copy.ColumnMappings.Add(2, 2);
// copy.ColumnMappings.Add(3, 3);
// copy.ColumnMappings.Add(4, 4);
copy.DestinationTableName = "Member2";
copy.WriteToServer(dt);
}
You have a few items you may want to address. These may or may not be related to whatever issue you're having debugging with #develop.
Declaring things long before you use them (style guidelines)
Not disposing of things that implement IDisposable (use using statements!)
Inner scope block; the copy variable is being used in it's own scope for no apparently good reason (I may be wrong, but it could be what's throwing #develop's debugger for a loop)
Instead, your code should be closer to this:
void MainFormLoad(object sender, EventArgs e)
{
var dt = new DataTable();
// You may want to pass other parameters to OpenText for read mode, etc.
using (var sr = File.OpenText(#"C:\Users\rl\Desktop\TEST_I~1.CSV"))
{
var first = true;
string line = null;
while ((line = sr.ReadLine()) != null)
{
string[] data = line.Split(',');
if (data.Length > 0)
{
if (first)
{
foreach (var item in data)
{
dt.Columns.Add(item.ToString());
}
first = false;
// Don't add the first row's data in the table (headers?)
continue;
}
var row = dt.NewRow();
row.ItemArray = data;
dt.Rows.Add(row);
}
}
}
using (var cn = new SqlConnection("<connection string>"))
{
cn.Open();
using (var copy = new SqlBulkCopy(cn))
{
// copy.ColumnMappings.Add(0, 0);
// copy.ColumnMappings.Add(1, 1);
// copy.ColumnMappings.Add(2, 2);
// copy.ColumnMappings.Add(3, 3);
// copy.ColumnMappings.Add(4, 4);
copy.DestinationTableName = "Member2";
copy.WriteToServer(dt);
}
}
}
The code is a bit odd but it looks like it should work. There's probably a file lock either making you run against old builds or hang on the .csv open line.
Cory's suggestions for tidying the code are rather good.
I think you have an infinite loop going on because your while check isn't quite right. You're asking if line = sr.ReadLine() is null, not if line is null. The result of setting line to the result of the read function will never return null.
Regarding this previous stackoverflow question:
DRY CLR table-valued functions
It seems it only runs in single thread mode. To test this I modified the code slightly to prepend the Name field with the current thread number. All of the returned results had the same thread number assigned. Is this action by design? Is there anyway to get it to multithread? Thanks.
private class ResultRow
// This class holds a row which we want to return.
{
public SqlInt32 CustId;
public SqlString Name;
public ResultRow(SqlInt32 custId_, SqlString name_)
{
int mythread = Thread.CurrentThread.ManagedThreadId;
CustId = custId_;
Name = "[" + mythread.ToString() + "] " + name_;
}
}
EDITED per Marc's question:
Here's the full piece of code. It returns 3470 records in 7 seconds.
using System;
using System.Data;
using System.Data.SqlClient;
using System.Data.SqlTypes;
using Microsoft.SqlServer.Server;
using System.Runtime.InteropServices;
using System.Text;
using System.Collections.Generic;
using System.Collections;
using System.Threading;
namespace CS_CLR_TVF
{
public partial class UserDefinedFunctions
{
// This class holds a row which we want to return.
private class ResultRow
{
public SqlString fldProductName;
public ResultRow(SqlString product_)
{
int mythread = Thread.CurrentThread.ManagedThreadId;
fldProductName = "[" + mythread.ToString() + "] " + product_;
}
}
[SqlFunction(DataAccess = DataAccessKind.Read, FillRowMethodName = "Test_FillRow", TableDefinition = "fldProductName nvarchar(1024)")]
public static IEnumerable xudf_cs_tvf(SqlString strSearchClue)
{
ArrayList results = new ArrayList();
using (SqlConnection connection = new SqlConnection("context connection=true"))
{
connection.Open();
string s1;
using (SqlCommand select = new SqlCommand("SELECT fldProductName FROM tblProducts", connection))
{
using (SqlDataReader reader = select.ExecuteReader())
{
while (reader.Read())
{
s1 = reader.GetSqlString(0).ToString();
// do a substring compare, if "match" grab the row
int idx = s1.IndexOf(strSearchClue.ToString());
if (idx > -1) results.Add(new ResultRow(reader.GetSqlString(0)));
}
}
}
}
return results;
}
// This function takes a row and tells SQL Server what variables we want to
// return from it and what types it contains.
public static void Test_FillRow(object resultsObj, out SqlString fldProductName)
{
ResultRow selectResults = (ResultRow)resultsObj;
fldProductName = selectResults.fldProductName;
}
}
}
Pretty straight forward internal select statement:
SELECT fldProductName FROM tblProducts
.
.
.
.
Here's a version implemented as a scalar UDF and it does do multithreading. It returns 3470 records in <1 second.
[Microsoft.SqlServer.Server.SqlFunction]
public static long xudf_csfake(SqlString strSearchClue, SqlString strStringtoSearch)
{
string s1 = strStringtoSearch.ToString();
// do a substring compare, if "match" grab the row
int idx = s1.IndexOf(strSearchClue.ToString());
if (idx > -1) return 1;
return 0;
}
Here is it's external select statement:
SELECT fldProductName FROM tblProducts WHERE (dbo.xudf_csfake('METAL' ,fldProductName) = 1)
So I seem to getting the opposite of what the article indicates.