C# Constructing a Dynamic Query From DataTable

I'm trying to generate a dynamic LINQ query based on a DataTable returned to me. The column names in the DataTable will change, but I will know which ones I want to total and which ones I want to group by.
I can get this to work with loops, writing the output to a variable and then recasting the parts back into a DataTable, but I'm hoping there is a more elegant way of doing this.
// C#
DataTable dt = new DataTable();
dt.Columns.Add("DynamicData1");
dt.Columns.Add("DynamicData2");
dt.Columns.Add("DynamicCount");
In this case the columns are LastName, FirstName, Age. I want to total ages by the LastName, FirstName columns (yes, both in the group by). So one of my parameters would specify GroupBy = LastName, FirstName and another TotalBy = Age. The next query may return different column names.
// Sample rows:
dt.Rows.Add("Smith", "John", 10);
dt.Rows.Add("Smith", "John", 11);
dt.Rows.Add("Smith", "Sarah", 8);
Given these different potential column names, I'm looking to generate a LINQ query that creates a generic group-by-and-total output.
Result:
LastName, FirstName, AgeTotal
Smith, John = 21
Smith, Sarah = 8
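In other words, the kind of generic call I'm after would look roughly like this (GroupAndTotal and its parameters are just an illustration of what I want, not existing code):
// Illustration only: the generic call I'm hoping to end up with.
string[] groupBy = { "LastName", "FirstName" };
string totalBy = "Age";
DataTable result = GroupAndTotal(dt, groupBy, totalBy); // hypothetical method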

If you use a simple converter to make the DataTable LINQ-friendly you can achieve that easily.
Here is some quick data generation I did for the sample:
// create dummy table
var dt = new DataTable();
dt.Columns.Add("LastName", typeof(string));
dt.Columns.Add("FirstName", typeof(string));
dt.Columns.Add("Age", typeof(int));
// action to create easily the records
var addData = new Action<string, string, int>((ln, fn, age) =>
{
var dr = dt.NewRow();
dr["LastName"] = ln;
dr["FirstName"] = fn;
dr["Age"] = age;
dt.Rows.Add(dr);
});
// add 3 datarows records
addData("Smith", "John", 10);
addData("Smith", "John", 11);
addData("Smith", "Sarah", 8);
This is how to use my simple transformation classes:
// create a linq version of the table
var lqTable = new LinqTable(dt);
// make the group by query
var groupByNames = lqTable.Rows.GroupBy(row => row["LastName"].ToString() + "-" + row["FirstName"].ToString()).ToList();
// for each group create a brand new linqRow
var linqRows = groupByNames.Select(grp =>
{
//get all items. so we can use first item for last and first name and sum the age easily at the same time
var items = grp.ToList();
// return a new linq row
return new LinqRow()
{
Fields = new List<LinqField>()
{
new LinqField("LastName",items[0]["LastName"].ToString()),
new LinqField("FirstName",items[0]["FirstName"].ToString()),
new LinqField("Age",items.Sum(item => Convert.ToInt32(item["Age"]))),
}
};
}).ToList();
// create a new LinqTable, since it handles the DataTable format and transforms it directly
var finalTable = new LinqTable() { Rows = linqRows }.AsDataTable();
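Since the column names change from query to query, here is a hedged sketch of the same idea (my own generalization, reusing the LinqTable / LinqRow / LinqField classes shown below) where the group-by and total column names are passed in as parameters:
// Sketch only: same grouping as above, but the grouped and totalled columns are dynamic.
// groupBy / totalBy are assumed parameters, e.g. { "LastName", "FirstName" } and "Age".
static DataTable GroupAndTotal(DataTable source, string[] groupBy, string totalBy)
{
    var lqTable = new LinqTable(source);
    var rows = lqTable.Rows
        .GroupBy(row => string.Join("-", groupBy.Select(col => row[col].ToString())))
        .Select(grp =>
        {
            var items = grp.ToList();
            // copy the group-by values from the first row of the group
            var fields = groupBy
                .Select(col => new LinqField(col, items[0][col].ToString()))
                .ToList();
            // append the total column
            fields.Add(new LinqField(totalBy, items.Sum(item => Convert.ToInt32(item[totalBy]))));
            return new LinqRow { Fields = fields };
        })
        .ToList();
    return new LinqTable { Rows = rows }.AsDataTable();
}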
And finally, here are the custom classes that are used:
public class LinqTable
{
public LinqTable()
{
}
public LinqTable(DataTable sourceTable)
{
LoadFromTable(sourceTable);
}
public List<LinqRow> Rows = new List<LinqRow>();
public List<string> Columns
{
get
{
var columns = new List<string>();
if (Rows != null && Rows.Count > 0)
{
Rows[0].Fields.ForEach(field => columns.Add(field.Name));
}
return columns;
}
}
public void LoadFromTable(DataTable sourceTable)
{
sourceTable.Rows.Cast<DataRow>().ToList().ForEach(row => Rows.Add(new LinqRow(row)));
}
public DataTable AsDataTable()
{
var dt = new DataTable("data");
if (Rows != null && Rows.Count > 0)
{
Rows[0].Fields.ForEach(field =>
{
dt.Columns.Add(field.Name, field.DataType);
});
Rows.ForEach(row =>
{
var dr = dt.NewRow();
row.Fields.ForEach(field => dr[field.Name] = field.Value);
dt.Rows.Add(dr);
});
}
return dt;
}
}
public class LinqRow
{
public List<LinqField> Fields = new List<LinqField>();
public LinqRow()
{
}
public LinqRow(DataRow sourceRow)
{
sourceRow.Table.Columns.Cast<DataColumn>().ToList().ForEach(col => Fields.Add(new LinqField(col.ColumnName, sourceRow[col], col.DataType)));
}
public object this[int index]
{
get
{
return Fields[index].Value;
}
set
{
Fields[index].Value = value;
}
}
public object this[string name]
{
get
{
return Fields.Find(f => f.Name == name).Value;
}
set
{
var fieldIndex = Fields.FindIndex(f => f.Name == name);
if (fieldIndex >= 0)
{
Fields[fieldIndex].Value = value;
}
}
}
public DataTable AsSingleRowDataTable()
{
var dt = new DataTable("data");
if (Fields != null && Fields.Count > 0)
{
Fields.ForEach(field =>
{
dt.Columns.Add(field.Name, field.DataType);
});
var dr = dt.NewRow();
Fields.ForEach(field => dr[field.Name] = field.Value);
dt.Rows.Add(dr);
}
return dt;
}
}
public class LinqField
{
public Type DataType;
public object Value;
public string Name;
public LinqField(string name, object value, Type dataType)
{
DataType = dataType;
Value = value;
Name = name;
}
public LinqField(string name, object value)
{
DataType = value.GetType();
Value = value;
Name = name;
}
public override string ToString()
{
return Value.ToString();
}
}

I think I'd just use a dictionary:
public Dictionary<string, int> GroupTot(DataTable dt, string[] groupBy, string tot){
var d = new Dictionary<string, int>();
foreach(DataRow ro in dt.Rows){
string key = "";
foreach(string col in groupBy)
key += (string)ro[col] + '\n';
if(!d.ContainsKey(key))
d[key] = 0;
d[key]+= (int)ro[tot];
}
return d;
}
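A quick usage sketch (mine, using the dummy table built in the first answer) to show the shape of the result:
// Usage sketch: keys are the group-by values joined by '\n', values are the totals.
var totals = GroupTot(dt, new[] { "LastName", "FirstName" }, "Age");
foreach (var kvp in totals)
    Console.WriteLine("{0} = {1}", kvp.Key.TrimEnd('\n').Replace("\n", ", "), kvp.Value);
// e.g. Smith, John = 21
//      Smith, Sarah = 8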
If you want the total on each row, we could get cute and create a column that is an array of one int instead of an int:
public void GroupTot(DataTable dt, string[] groupBy, string tot){
var d = new Dictionary<string, int>();
var dc = dt.Columns.Add("Total_" + tot, typeof(int[]));
foreach(DataRow ro in dt.Rows){
string key = "";
foreach(string col in groupBy)
key += (string)ro[col] + '\n'; //build a grouping key from first and last name
if(!d.ContainsKey(key)) //have we seen this name pair before?
d[key] = new int[1]; //no we haven't, ensure we have a tracker for our total, for this first+last name
d[key][0] += (int)ro[tot]; //add the total
ro[dc] = d[key]; //link the row to the total tracker
}
}
At the end of the operation every row will have an array of int in the "Total_age" column that represents the total for that first+last name. The reason I used int[] rather than int is that int is a value type, whereas int[] is a reference type. As the table is iterated, each row gets assigned a reference to an int[], so rows with the same first+last name end up with their int[] references pointing to the same object in memory; incrementing a later one therefore increments all the earlier ones too (all "John Smith" rows' total column holds a reference to the same int[]). If we'd made the column an int type, every row would point to a different counter, because every time we say ro[dc] = d[key] it would copy the current value of d[key] into ro[dc]. Any reference type would do for this trick to work, but value types wouldn't. If you wanted your column to be a value type, you'd have to iterate the table again, or keep two dictionaries, one that maps DataRow -> total, and iterate the keys, assigning the totals back into the rows.
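For completeness, a hedged sketch of that last option (my own code, not part of the answer): iterate the table twice so the total column can stay a plain int.
// Sketch: first pass accumulates totals per group key, second pass writes them back,
// so the "Total_" column can be a value type (int) instead of an int[].
public void GroupTotTwoPass(DataTable dt, string[] groupBy, string tot)
{
    var totals = new Dictionary<string, int>();
    var dc = dt.Columns.Add("Total_" + tot, typeof(int));
    foreach (DataRow ro in dt.Rows)
    {
        string key = string.Join("\n", groupBy.Select(col => (string)ro[col]));
        totals.TryGetValue(key, out int current); // current stays 0 for a new key
        totals[key] = current + (int)ro[tot];
    }
    foreach (DataRow ro in dt.Rows)
    {
        string key = string.Join("\n", groupBy.Select(col => (string)ro[col]));
        ro[dc] = totals[key];
    }
}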


Output from the list without repetitions

I am having a problem with the output in a DataGridView. I have a list of values (for example 1/3/5) and I need to display the names of objects that have at least one match with one of the digits. However, some objects have several matches, and because of this they are displayed more than once; this is what I need to fix.
To select each element that I need to search for, I use a foreach. And in order to track the elements that have already been displayed, I created two lists. In pruv I put all the elements that come in, and in pruv2 I put the elements that were not previously displayed. To compare the elements of these lists, I wrote a for loop. But at the moment the displayed objects still appear several more times.
public static DataTable table = new DataTable();
public static DataTable tages = new DataTable();
**********************************************************************************************
DB.conetc();
table = new DataTable();
string sql = "SELECT history FROM users WHERE login= '" + nameuser + "';";
DB.usradapt(sql, 2);
string[] histor = table.Rows[0]["history"].ToString().Split('/');
List<string> pruv = new List<string>();
List<string> pruv2 = new List<string>();
pruv2.Add("0");
table = new DataTable();
foreach (var word in histor)
{
int count = 0;
sql = "SELECT NameFilm FROM films WHERE tags LIKE'" + $"%{word}%'";
DB.usradapt(sql, 3);
pruv.Add(tages.Rows[count]["NameFilm"].ToString());
for (int i = 0; i < pruv.Count; i++)
{
for (int j = 0; j < pruv2.Count; j++)
{
if (pruv[i] != pruv2[j])
{
pruv2.Add(pruv[i]);
DB.usradapt(sql, 2);
}
}
}
count++;
}
dgvResult.DataSource = table;
dataGridView1.DataSource = tages; //I display pruv content to understand what's inside
DB.connection.Close();
**********************************************************************************************
public class DB
{
public static SQLiteConnection connection = new SQLiteConnection();
public static void usradapt(string sql, int f)
{
SQLiteDataAdapter adapter = new SQLiteDataAdapter(sql, DB.connection);
if (f == 1) { adapter.Fill(FormLogin.table); }
else if (f == 2) { adapter.Fill(FormPage.table); }
else if (f == 3) { adapter.Fill(FormPage.tages); }
}
}
}
Users have a history that records tags when searching for a movie (screenshot omitted). Each movie has its own tags (screenshot omitted).
The table on the left of the screen should display the names of films that have at least one of the history tags. In theory, they should be displayed as in the table on the right, but in such a way that the same movie does not appear multiple times.
You can try writing a ProductA class and then using it instead of the for loops:
public class ProductA : IEquatable<ProductA>
{
public string Name { get; set; }
public bool Equals(ProductA other)
{
if (other is null)
return false;
return this.Name == other.Name;
}
public override bool Equals(object obj) => Equals(obj as ProductA);
public override int GetHashCode() => (Name).GetHashCode();
}
for (int i = 0; i < pruv.Count; i++)
{
ProductA[] storeA = { new ProductA { Name = pruv[count] } };
ProductA[] storeB = { new ProductA { Name = pruv2[i] } };
bool equalAB = storeA.SequenceEqual(storeB);
if (!equalAB)
{
DB.usradapt(sql, 2);
pruv.Add(pruv2[i]);
}
}
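Alternatively, once ProductA implements IEquatable<ProductA>, a hedged sketch (mine, not part of this answer) is to let LINQ's Distinct remove the repeats in one go:
// Sketch: wrap the collected film names in ProductA and let Distinct()
// (which uses the Equals/GetHashCode defined above) drop the duplicates.
var uniqueFilms = pruv
    .Select(name => new ProductA { Name = name })
    .Distinct()
    .ToList();
foreach (var film in uniqueFilms)
{
    // add film.Name to whatever collection is bound to the grid
}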

Insert Columns and Rows into spreadsheet from IQueryable object

How do I insert the columns and rows of data from an IQueryable object? What I have so far is listed below. It seems I can get the column names into the spreadsheet, but I'm not sure how to insert the values using the method I have written.
private IQueryable<ShippingRequest> GetRecordsFromDatabase()
{
var CurrentUserId = (int)Session["UserId"];
var results = db.ShippingRequests
.Where(r => r.UserId == CurrentUserId);
return results;
}
//Create the WorkSheet
ExcelWorksheet worksheet = excelPackage.Workbook.Worksheets.Add("FedEx Rates");
//get columns of table
var columnNames = typeof(ShippingRequest).GetProperties()
.Select(x => x.Name)
.ToArray();
int i = 0;
//Adding column name to worksheet
foreach (var col in columnNames)
{
i++;
worksheet.Cells[1, i].Value = col;
}
//Adding records to worksheet
int j;
for (i = 0; i<columnNames.Length; i++)
{
foreach (var item in db)
{
worksheet.Cells[i + 2, j + 1].Value = ???; //Not sure how to get this value
}
}
So you fetched some data as a sequence, and you want every element of this sequence to be added as one row to your table. The columns are all readable public properties of ShippingRequests.
Let's create a generic solution that will add any sequence of columns and show any sequence of objects of some class.
Quite often, the names of the columns don't have to match the names of all your properties one-to-one. Sometimes you want to show only some properties. Sometimes you want different column names or different values. Maybe you don't want to write your data to an Excel sheet at all, but to a different kind of table?
A reusable class to define columns from some table class could be something like:
class Column<TSource>
{
public int Index {get; set;}
public string Name {get; set;}
public Func<TSource, object> PropertyValueSelector {get; set;}
public object GetValue(TSource source)
{
return this.PropertyValueSelector(source);
}
... // possible other properties, like: IsVisible, IsSortable, DisplayFormat?
}
Apparently, you want to create a sequence of columns for your ShippingRequests containing every public property of ShippingRequest. The name of the column is the identifier of the property. The index is not important.
The following function will create your sequence of Columns:
public static IEnumerable<Column<TSource>> CreateColumns<TSource>()
    where TSource : class
{
    return typeof(TSource).GetProperties()
        .Where(property => property.CanRead) // they must be at least readable
        .Select((propertyInfo, index) => new Column<TSource>
        {
            Index = index + 1, // worksheet cell indexes are 1-based
            Name = propertyInfo.Name,
            PropertyValueSelector = source => propertyInfo.GetValue(source),
        });
}
Once we've got our data and our columns, we can fill the worksheet:
static void Fill<TSource>(this ExcelWorksheet workSheet,
    IEnumerable<Column<TSource>> columns,
    IEnumerable<TSource> sourceData)
{
    // TODO: clear worksheet?
    // Add column names to worksheet
    foreach (var column in columns)
    {
        workSheet.Cells[1, column.Index].Value = column.Name;
    }
    // add the source data
    int nextRowIndex = 2;
    foreach (var rowData in sourceData)
    {
        AddRow(workSheet, nextRowIndex, columns, rowData);
        ++nextRowIndex;
    }
}
static void AddRow<TSource>(this ExcelWorksheet workSheet,
    int rowIndex,
    IEnumerable<Column<TSource>> columns,
    TSource rowData)
{
    foreach (var column in columns)
    {
        var value = column.GetValue(rowData);
        workSheet.Cells[rowIndex, column.Index].Value = value;
    }
}
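Since Fill and AddRow are extension methods ("this ExcelWorksheet ..."), remember that they must live in a non-nested static class, for example (the class name is my own):
// Sketch: container for the extension methods above.
public static class ExcelWorksheetExtensions
{
    // Fill<TSource> and AddRow<TSource> from above go here, marked static.
}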
Now that you've got this, your code will be easy:
var workSheet = ...
var columns = ...
var data = ...
workSheet.Fill(columns, data);
In your case:
var worksheet = excelPackage.Workbook.Worksheets.Add("FedEx Rates");
var columns = CreateColumns<ShippingRequest>().ToList();
var shippingRequests = GetRecordsFromDatabase();
worksheet.Fill(columns, shippingRequests);
// Bam! Done!
The nice thing is that you can use the code to fill worksheets with data from any class.
For example, I have a class Student and I want to show some columns of the 100 youngest students.
// I only want to show the following columns of students:
var studentColumns = new List<Column<Student>>
{
    new Column<Student> { Index = 1, Name = "Id", PropertyValueSelector = student => student.Id },
    new Column<Student> { Index = 3, Name = "Birthday", PropertyValueSelector = student => student.BirthDay },
    new Column<Student> { Index = 2, Name = "Student Name", PropertyValueSelector = student =>
        String.Format("{0} {1} {2}", student.FirstName,
                                     student.MiddleName,
                                     student.FamilyName) },
};
// I only want 100 youngest students:
var studentsToDisplay = GetStudents()
.OrderByDescending(student => student.BirthDay)
.Take(100)
.ToList();
// filling the worksheet is only two lines:
var worksheet = excelPackage.Workbook.Worksheets.Add("Young Students");
worksheet.Fill(studentColumns, studentsToDisplay);

C# - Looking for the list of duplicated rows (need optimization)

Please, I would like to optimize this code in C#, if possible.
When there are less than 1000 lines, it's fine. But when we have at least 10000, it starts to take some time...
Here is a little benchmark:
5000 lines => ~2s
15000 lines => ~20s
25000 lines => ~50s
Indeed, I'm looking for duplicated lines.
The SequenceEqual method used to check the key values may be the problem (in my "benchmark", I have 4 fields considered as "keyFields"...).
Here is the code:
private List<DataRow> GetDuplicateKeys(DataTable table, List<string> keyFields)
{
Dictionary<List<object>, int> keys = new Dictionary<List<object>, int>(); // List of key values + their index in table
List<List<object>> duplicatedKeys = new List<List<object>>(); // List of duplicated keys values
List<DataRow> duplicatedRows = new List<DataRow>(); // Rows that are duplicated
foreach (DataRow row in table.Rows)
{
// Find keys fields values for the row
List<object> rowKeys = new List<object>();
keyFields.ForEach(keyField => rowKeys.Add(row[keyField]));
// Check if those keys are already defined
bool alreadyDefined = false;
foreach (List<object> keyValue in keys.Keys)
{
if (rowKeys.SequenceEqual(keyValue))
{
alreadyDefined = true;
break;
}
}
if (alreadyDefined)
{
duplicatedRows.Add(row);
// If first duplicate for this key, add the first occurence of this key
if (!duplicatedKeys.Contains(rowKeys))
{
duplicatedKeys.Add(rowKeys);
int i = keys[keys.Keys.First(key => key.SequenceEqual(rowKeys))];
duplicatedRows.Add(table.Rows[i]);
}
}
else
{
keys.Add(rowKeys, table.Rows.IndexOf(row));
}
}
return duplicatedRows;
}
Any ideas?
I think this is the fastest and shortest way to find duplicate rows:
For 100,000 rows it executes in about 250 ms.
Main and test data:
static void Main(string[] args)
{
var dt = new DataTable();
dt.Columns.Add("Id");
dt.Columns.Add("Value1");
dt.Columns.Add("Value2");
var rnd = new Random(DateTime.Now.Millisecond);
for (int i = 0; i < 100000; i++)
{
var dr = dt.NewRow();
dr[0] = rnd.Next(1, 1000);
dr[1] = rnd.Next(1, 1000);
dr[2] = rnd.Next(1, 1000);
dt.Rows.Add(dr);
}
Stopwatch sw = new Stopwatch();
sw.Start();
var duplicates = GetDuplicateRows(dt, "Id", "Value1", "Value2");
sw.Stop();
Console.WriteLine(
"Found {0} duplicates in {1} miliseconds.",
duplicates.Count,
sw.ElapsedMilliseconds);
Console.ReadKey();
}
GetDuplicateRows with LINQ:
private static List<DataRow> GetDuplicateRows(DataTable table, params string[] keys)
{
var duplicates =
table
.AsEnumerable()
.GroupBy(dr => String.Join("-", keys.Select(k => dr[k])), (groupKey, groupRows) => new { Key = groupKey, Rows = groupRows })
.Where(g => g.Rows.Count() > 1)
.SelectMany(g => g.Rows)
.ToList();
return duplicates;
}
Explanation (for those who are new to LINQ):
The trickiest part is the GroupBy, I guess. Its first parameter is a key selector: for each DataRow I create a group key from the values of the specified key columns, joined into a string like 1-1-2. The second parameter then projects the group key and the group's rows into a new anonymous object. Finally I keep only groups with more than 1 row and flatten them back into a list with SelectMany.
Try this. Use more LINQ, which improves performance; also try PLINQ if possible.
Regards
private List<DataRow> GetDuplicateKeys(DataTable table, List<string> keyFields)
{
Dictionary<List<object>, int> keys = new Dictionary<List<object>, int>(); // List of key values + their index in table
List<List<object>> duplicatedKeys = new List<List<object>>(); // List of duplicated keys values
List<DataRow> duplicatedRows = new List<DataRow>(); // Rows that are duplicated
foreach (DataRow row in table.Rows)
{
// Find keys fields values for the row
List<object> rowKeys = new List<object>();
keyFields.ForEach(keyField => rowKeys.Add(row[keyField]));
// Check if those keys are already defined
bool alreadyDefined = false;
foreach (List<object> keyValue in keys.Keys)
{
if (rowKeys.SequenceEqual(keyValue))
{
alreadyDefined = true;
break;
}
}
if (alreadyDefined)
{
duplicatedRows.Add(row);
// If first duplicate for this key, add the first occurence of this key
if (!duplicatedKeys.Contains(rowKeys))
{
duplicatedKeys.Add(rowKeys);
int i = keys[keys.Keys.First(key => key.SequenceEqual(rowKeys))];
duplicatedRows.Add(table.Rows[i]);
}
}
else
{
keys.Add(rowKeys, table.Rows.IndexOf(row));
}
}
return duplicatedRows;
}
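For reference, a hedged PLINQ sketch (my own, combining this suggestion with the GroupBy approach from the first answer) could look like this:
// Sketch: parallel variant of the GroupBy-based duplicate search.
// Worth trying only on large tables; for small ones the parallel overhead dominates.
private static List<DataRow> GetDuplicateRowsParallel(DataTable table, params string[] keys)
{
    return table
        .AsEnumerable()
        .AsParallel()
        .GroupBy(dr => string.Join("-", keys.Select(k => dr[k])))
        .Where(g => g.Count() > 1)
        .SelectMany(g => g)
        .ToList();
}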

Is there a code pattern for mapping a CSV with random column order to defined properties?

I have a CSV that is delivered to my application from various sources. The CSV will always have the same number of columns, and the header values for the columns will always be the same.
However, the columns may not always be in the same order.
Day 1 CSV may look like this
ID,FirstName,LastName,Email
1,Johh,Lennon,jlennon#applerecords.com
2,Paul,McCartney,macca#applerecords.com
Day 2 CSV may look like this
Email,FirstName,ID,LastName
resident1#friarpark.com,George,3,Harrison
ringo#allstarrband.com,Ringo,4,Starr
I want to read in the header row for each file and have a simple mechanism for associating each "column" of data with the associated property I have defined in my class.
I know I can use selection statements to figure it out, but that seems like a "bad" way to handle it.
Is there a simple way to map "columns" to properties using a dictionary or class at runtime?
Use a Dictionary to map column heading text to column position.
Hard-code mapping of column heading text to object property.
Example:
// Parse first line of text to add column heading strings and positions to your dictionary
...
// Parse data row into an array, indexed by column position
...
// Assign data to object properties
x.ID = row[myDictionary["ID"]];
x.FirstName = row[myDictionary["FirstName"]];
...
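A minimal sketch of that idea (the Person class, the file name, and the lines variable are my own illustration):
// Sketch: build the heading -> position map from the header row,
// then read each data row by column name.
string[] lines = File.ReadAllLines("input.csv"); // illustrative path
string[] headers = lines[0].Split(',');
var myDictionary = new Dictionary<string, int>();
for (int i = 0; i < headers.Length; i++)
    myDictionary[headers[i]] = i;
var people = new List<Person>(); // hypothetical class with ID/FirstName/LastName/Email
foreach (string line in lines.Skip(1))
{
    string[] row = line.Split(',');
    people.Add(new Person
    {
        ID = row[myDictionary["ID"]],
        FirstName = row[myDictionary["FirstName"]],
        LastName = row[myDictionary["LastName"]],
        Email = row[myDictionary["Email"]]
    });
}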
You don't need a design pattern for this purpose.
http://www.codeproject.com/Articles/9258/A-Fast-CSV-Reader
I have used this reader; it is pretty good, and it lets you access fields by name, e.g. row["firstname"] or row["id"], which you can parse to create your objects.
I have parsed both CSV files using Microsoft.VisualBasic.FileIO.TextFieldParser and populated a DataTable after parsing both CSV files:
DataTable dt;
private void button1_Click(object sender, EventArgs e)
{
dt = new DataTable();
ParseCSVFile("day1.csv");
ParseCSVFile("day2.csv");
dataGridView1.DataSource = dt;
}
private void ParseCSVFile(string sFileName)
{
var dIndex = new Dictionary<string, int>();
using (TextFieldParser csvReader = new TextFieldParser(sFileName))
{
csvReader.Delimiters = new string[] { "," };
var colFields = csvReader.ReadFields();
for (int i = 0; i < colFields.Length; i++)
{
string sColField = colFields[i];
if (sColField != string.Empty)
{
dIndex.Add(sColField, i);
if (!dt.Columns.Contains(sColField))
dt.Columns.Add(sColField);
}
}
while (!csvReader.EndOfData)
{
string[] fieldData = csvReader.ReadFields();
if (fieldData.Length > 0)
{
DataRow dr = dt.NewRow();
foreach (var kvp in dIndex)
{
int iVal = kvp.Value;
if (iVal < fieldData.Length)
dr[kvp.Key] = fieldData[iVal];
}
dt.Rows.Add(dr);
}
}
}
}
day1.csv and day2.csv are as mentioned in the question.
Here is what the output dataGridView1 looks like (screenshot omitted).
Here is a simple generic method that will take a CSV file (broken into string[]) and create from it a list of objects. The assumption is that the object properties will have the same name as the headers. If this is not the case you might look into the DataMemberAttribute property and modify accordingly.
private static List<T> ProcessCSVFile<T>(string[] lines)
{
List<T> list = new List<T>();
Type type = typeof(T);
string[] headerArray = lines[0].Split(new char[] { ',' });
PropertyInfo[] properties = new PropertyInfo[headerArray.Length];
for (int prop = 0; prop < properties.Length; prop++)
{
properties[prop] = type.GetProperty(headerArray[prop]);
}
for (int count = 1; count < lines.Length; count++)
{
string[] valueArray = lines[count].Split(new char[] { ',' });
T t = Activator.CreateInstance<T>();
list.Add(t);
for (int value = 0; value < valueArray.Length; value++)
{
properties[value].SetValue(t, valueArray[value], null);
}
}
return list;
}
Now, in order to use it just pass your file formatted as an array of strings. Let's say the class you want to read into looks like this:
class Music
{
public string ID { get; set; }
public string FirstName { get; set; }
public string LastName { get; set; }
public string Email { get; set; }
}
So you can call this:
List<Music> newlist = ProcessCSVFile<Music>(list.ToArray());
...and everything gets done with one call.
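For completeness, the lines array can come straight from disk (usage sketch; the file name is just an example):
// Usage sketch: read one of the CSVs and map it onto the Music class.
string[] lines = File.ReadAllLines("day2.csv");
List<Music> records = ProcessCSVFile<Music>(lines);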

More efficient way of assigning values in DataTable?

I have a DataTable with two columns: JobDetailID and CalculatedID. JobDetailID is not always unique. I want one/the first instance of CalculatedID for a given JobDetailID to be JobDetailID + "A", and when there are multiple rows with the same JobDetailID, I want successive rows to be JobDetailID + "B", "C", etc. There aren't more than four or five rows with the same JobDetailID.
I currently have it implemented as follows, but it's unacceptably slow:
private void AddCalculatedID(DataTable data)
{
var calculatedIDColumn = new DataColumn { DataType = typeof(string), ColumnName = "CalculatedID" };
data.Columns.Add(calculatedIDColumn);
data.Columns["CalculatedID"].SetOrdinal(0);
var enumerableData = data.AsEnumerable();
foreach (DataRow row in data.Rows)
{
var jobDetailID = row["JobDetailID"].ToString();
// Give calculated ID of JobDetailID + A, B, C, etc. for multiple rows with same JobDetailID
int x = 65; // ASCII value for A
string calculatedID = jobDetailID + (char)x;
while (string.IsNullOrEmpty(row["CalculatedID"].ToString()))
{
if ((enumerableData
.Any(r => r.Field<string>("CalculatedID") == calculatedID)))
{
calculatedID = jobDetailID + (char)x;
x++;
}
else
{
row["CalculatedID"] = calculatedID;
break;
}
}
}
}
Assuming I need to adhere to this format of output, how might I improve this performance?
It would be better to add the code for generating the CalculatedID in the place where you are getting the data, but if that is not an option, you might want to avoid scanning the entire table for every row. You could use a Dictionary for the used keys, like this:
private void AddCalculatedID(DataTable data)
{
var calculatedIDColumn = new DataColumn { DataType = typeof(string), ColumnName = "CalculatedID" };
data.Columns.Add(calculatedIDColumn);
data.Columns["CalculatedID"].SetOrdinal(0);
Dictionary<string, char> usedKeys = new Dictionary<string, char>();
foreach (DataRow row in data.Rows)
{
    string jobDetailID = row["JobDetailID"].ToString();
    char suffix;
    if (!usedKeys.ContainsKey(jobDetailID))
    {
        suffix = 'A'; // first occurrence of this JobDetailID
    }
    else
    {
        suffix = (char)(usedKeys[jobDetailID] + 1); // next letter: B, C, ...
    }
    usedKeys[jobDetailID] = suffix;
    row["CalculatedID"] = jobDetailID + suffix;
}
}
This will essentially trade memory for speed, as it will cache all used JobDetailID's along with the last char used for the generated key. If you have lots and lots of these JobDetailID, this might get a bit memory intensive, but I doubt that you'll have problems unless you have millions of rows to process.
If I understand your idea about setting CalculatedID for the rows, then the following algorithm will do the trick, and its complexity is linear. The most important part is data.Select("", "JobDetailID"), which returns the rows sorted by JobDetailID.
I didn't compile it myself, so there could be syntax errors.
private void AddCalculatedID(DataTable data)
{
var calculatedIDColumn = new DataColumn { DataType = typeof(string), ColumnName = "CalculatedID" };
data.Columns.Add(calculatedIDColumn);
data.Columns["CalculatedID"].SetOrdinal(0);
int jobDetailID = -1;
int letter = 65;
foreach (DataRow row in data.Select("", "JobDetailID"))
{
    if ((int)row["JobDetailID"] != jobDetailID)
    {
        // new JobDetailID: remember it and start again from 'A'
        jobDetailID = (int)row["JobDetailID"];
        letter = 65;
    }
    row["CalculatedID"] = row["JobDetailID"].ToString() + (char)letter;
    letter++;
}
}
You tagged this as LINQ, but you are using iterative methods. Probably the best way to do this would be to use a combination of both, iterating over each "grouping" and assigning the calculated ID for each row in the grouping.
foreach (var groupRows in data.AsEnumerable().GroupBy(d => d["JobDetailID"].ToString()))
{
if(string.IsNullOrEmpty(groupRows.Key))
continue;
// We now have each "grouping" of duplicate JobDetailIDs.
int x = 65; // ASCII value for A
foreach (var duplicate in groupRows)
{
string calcID = groupRows.Key + ((char)x++);
duplicate["CalculatedID"] = calcID;
//Can also do this and achieve same results.
//duplicate["CalculatedID"] = groupRows.Key + ((char)x++);
}
}
First thing you do is group on the column that's going to have duplicates. You're going to iterate over each of these groupings, and reset the suffix value for every grouping. For every row in the grouping, you're going to get the calculated ID (incrementing the suffix value at the same time) and assign the ID back to the duplicate row. As a side note, we're altering the items we're enumerating here, which is normally a bad thing. However, we're changing data that isn't associated with our enumeration declaration (GroupBy), so it will not alter the behavior of our enumeration.
This method gets the job done in a single pass. You can optimize it further if, for example, "JobDetailID" is an integer instead of a string, or if the DataTable is always receiving the data sorted by "JobDetailID" (you could get rid of the dictionary), but here's a draft:
private static void AddCalculatedID(DataTable data)
{
data.BeginLoadData();
try
{
var calculatedIDColumn = new DataColumn { DataType = typeof(string), ColumnName = "CalculatedID" };
data.Columns.Add(calculatedIDColumn);
data.Columns["CalculatedID"].SetOrdinal(0);
var jobDetails = new Dictionary<string, int>(data.Rows.Count);
foreach (DataRow row in data.Rows)
{
var jobDetailID = row["JobDetailID"].ToString();
int lastSuffix;
if (jobDetails.TryGetValue(jobDetailID, out lastSuffix))
{
lastSuffix++;
}
else
{
// ASCII value for A
lastSuffix = 65;
}
row["CalculatedID"] = jobDetailID + (char)lastSuffix;
jobDetails[jobDetailID] = lastSuffix;
}
}
finally
{
data.EndLoadData();
}
}
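A quick usage sketch (mine) against a small table:
// Usage sketch: three rows, two of them sharing a JobDetailID.
var data = new DataTable();
data.Columns.Add("JobDetailID", typeof(string));
data.Rows.Add("1001");
data.Rows.Add("1001");
data.Rows.Add("1002");
AddCalculatedID(data);
// the CalculatedID column now holds 1001A, 1001B, 1002A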
