Parallel execution issue - C#

I need a little education here with regard to the execution of parallel tasks.
I have created a small fiddle:
https://dotnetfiddle.net/JO2a4m
What I am trying to do is send a few accounts in batches to another method, creating a unit of work (a task) for each batch. But when I execute the tasks, only the last task that was added gets executed. This is something I am trying to wrap my head around.
Code:
using System;
using System.Collections.Generic;
using System.Threading.Tasks;

public class Program
{
    public static void Main()
    {
        var accounts = GenerateAccount();
        var accountsProcess = new List<Account>();
        var taskList = new List<Task>();
        var batch = 4;
        var count = 0;
        foreach (var account in accounts)
        {
            if (count == batch)
            {
                taskList.Add(new Task(() => ProcessAccount(accountsProcess)));
                count = 0;
                accountsProcess.Clear();
            }
            count++;
            accountsProcess.Add(account);
        }
        Parallel.ForEach(taskList, t =>
        {
            t.Start();
        });
        Task.WaitAll(taskList.ToArray());
        if (accountsProcess.Count > 0)
            ProcessAccount(accountsProcess);
    }

    public static List<Account> GenerateAccount()
    {
        var accounts = new List<Account>();
        var first = "First";
        var second = "Second";
        for (int i = 0; i <= 1000; i++)
        {
            var account = new Account();
            account.first = first + i;
            account.second = second + i;
            accounts.Add(account);
        }
        return accounts;
    }

    public static void ProcessAccount(List<Account> accounts)
    {
        Console.WriteLine(accounts.Count);
        foreach (var account in accounts)
        {
            Console.WriteLine(account.first + account.second);
        }
    }
}

foreach (var account in accounts)
{
    if (count == batch)
    {
        taskList.Add(new Task(() => ProcessAccount(accountsProcess)));
        count = 0;
        accountsProcess.Clear();
    }
    count++;
    accountsProcess.Add(account);
}
The issue is that all of the Tasks are sharing the same List<Account> object.
I would suggest changing the code to:
foreach (var account in accounts)
{
    if (count == batch)
    {
        var bob = accountsProcess;
        taskList.Add(new Task(() => ProcessAccount(bob)));
        count = 0;
        accountsProcess = new List<Account>();
    }
    count++;
    accountsProcess.Add(account);
}
By using bob and assigning a new List to accountsProcess we ensure each Task gets its own List - rather than sharing a single List.
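The root cause is that a lambda captures the variable itself, not a snapshot of its value. A minimal stand-alone sketch of that behavior (illustration only, not part of the fix):
using System;
using System.Collections.Generic;
using System.Threading.Tasks;

public class ClosureDemo
{
    public static void Main()
    {
        // Both tasks close over the same 'shared' variable, so both
        // observe whatever state the list holds when they finally run.
        var shared = new List<int> { 1 };
        var t1 = new Task(() => Console.WriteLine(shared.Count));

        shared.Clear();
        shared.Add(2);
        shared.Add(3);
        var t2 = new Task(() => Console.WriteLine(shared.Count));

        t1.Start();
        t2.Start();
        Task.WaitAll(t1, t2); // prints 2 twice: both tasks see the final list
    }
}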
Also, consider using MoreLINQ's Batch rather than rolling your own.
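With Batch, the whole batching loop collapses to something like the sketch below (assuming the MoreLinq NuGet package; Batch splits the source into chunks of the given size, and ToList() materializes each chunk so every Task owns its own list):
using System.Linq;
using MoreLinq;

var taskList = accounts
    .Batch(4)                            // chunks of up to 4 accounts
    .Select(chunk => chunk.ToList())     // give each task its own list
    .Select(batchList => new Task(() => ProcessAccount(batchList)))
    .ToList();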

Related

Distributing jobs fairly on accounts - distribution algorithm

I'm working on a project that publishes to multiple websites using multiple accounts. Each account can publish to specific websites, as shown in the 'Dict' below. The problem is that I'm trying to distribute the publishing jobs fairly across the accounts while keeping a distance between the jobs of any account that has more than one job.
using System;
using System.Collections.Generic;
using System.Linq;

public class Program
{
    public struct Job
    {
        public string Site { get; set; }
        public string Account { get; set; }
    }

    private static readonly Dictionary<string, List<string>> Dict = new Dictionary<string, List<string>>();
    private static List<string> _accounts;

    public static void Main()
    {
        var sites = new List<string> { "Site-A", "Site-B", "Site-C", "Site-D", "Site-E" };
        _accounts = new List<string> { "Account-A", "Account-B", "Account-C", "Account-D", "Account-E" };
        // Permissions dictionary: specifies the accounts that have permission to publish on a particular site
        Dict.Add("Site-A", new List<string> { "Account-A", "Account-C" });
        Dict.Add("Site-B", new List<string> { "Account-A", "Account-E" });
        Dict.Add("Site-C", new List<string> { "Account-C", "Account-D" });
        Dict.Add("Site-D", new List<string> { "Account-A" });
        Dict.Add("Site-E", new List<string> { "Account-A" });
        var jobs = new List<Job>();
        foreach (var site in sites)
        {
            var job = new Job();
            // Get an account that has permission to publish on 'site',
            // checking against the permissions dictionary Dict
            var account = GetAccountCanPost(Dict, site, _accounts);
            job.Site = site;
            job.Account = account;
            jobs.Add(job);
        }
        var jobsCountForEachAccountDict = CalculateJobsCountForEachAccounts(jobs);
        //////#### Now we need to reorder the jobs and swap them here before sending them to processing ####//////
        foreach (var job in jobs)
        {
            Console.WriteLine(job.Account + " publish on " + job.Site);
        }
    }

    public static Dictionary<string, int> CalculateJobsCountForEachAccounts(List<Job> jobs)
    {
        var dict = new Dictionary<string, int>();
        foreach (var job in jobs)
        {
            if (dict.ContainsKey(job.Account))
                dict[job.Account]++;
            else
                dict.Add(job.Account, 1);
        }
        return dict;
    }

    public static string GetAccountCanPost(Dictionary<string, List<string>> dict, string targetSite, List<string> accounts)
    {
        var accountIdsAssoc = GetAccountsIdsAssociatedWithCommunity(dict, targetSite);
        var selectedId = PickRandom(accountIdsAssoc, new Random());
        var account = accounts.FirstOrDefault(s => s == selectedId);
        return account;
    }

    private static List<string> GetAccountsIdsAssociatedWithCommunity(Dictionary<string, List<string>> communitiesAccountsAssociationsDict, string communityId)
    {
        if (communitiesAccountsAssociationsDict.ContainsKey(communityId))
            return communitiesAccountsAssociationsDict[communityId];
        return null;
    }

    private static T PickRandom<T>(IList<T> list, Random random)
    {
        var index = random.Next(0, list.Count);
        return list[index];
    }
}
When the jobs are created, the result is something similar to this (before re-adjusting the job distribution):
> Account-A Publish on Site-A
> Account-E Publish on Site-B
> Account-D Publish on Site-C
> Account-A Publish on Site-D
> Account-A Publish on Site-E
The publishing jobs created above are not fairly distributed across the accounts: as you can see, 'Account-A' has 3 jobs assigned while there are other accounts that can publish to those sites, as defined in 'Dict'. It should look something like:
> Account-C Publish on Site-A
> Account-E Publish on Site-B
> Account-A Publish on Site-D
> Account-D Publish on Site-C
> Account-A Publish on Site-E
In the output above the jobs are distributed fairly across the accounts, and there is also a distance between the jobs of any account that has more than one job.
An example of the distance between jobs:
> Account-A Publish on Site-A
> Account-E Publish on Site-B
> Account-D Publish on Site-C
> Account-A Publish on Site-D
> Account-A Publish on Site-E
Jobs 4 and 5 are both processed by Account-A. An account should not process two jobs back to back, so one of them should be swapped with another job.
It will be highly appreciated if you could help. I need an algorithm that does the job distribution and produces similar output. Performance is not important.
Thank you.
It can be improved, but this will do what you need:
using System;
using System.Collections.Generic;
using System.Linq;

namespace ConsoleApp1
{
    public struct Job
    {
        public string Site { get; set; }
        public string Account { get; set; }
    }

    public class Program
    {
        public static void Main(string[] args)
        {
            Dictionary<string, List<string>> permissionsDict = new Dictionary<string, List<string>>();
            permissionsDict.Add("Site-A", new List<string> { "Account-A", "Account-C" });
            permissionsDict.Add("Site-B", new List<string> { "Account-A", "Account-E" });
            permissionsDict.Add("Site-C", new List<string> { "Account-C", "Account-D" });
            permissionsDict.Add("Site-D", new List<string> { "Account-A" });
            permissionsDict.Add("Site-E", new List<string> { "Account-A" });

            // get the responsibility rate for each account
            Dictionary<string, int> responsibilitiesRate = GetResponsibilitiesRate(permissionsDict);

            List<Job> jobs = new List<Job>();
            // build the jobs list
            foreach (var permission in permissionsDict)
            {
                var job = new Job();
                job.Site = permission.Key;
                // for the current site, pick the account with the lowest responsibility rate
                int minResponsibilities = permission.Value.Min(x => responsibilitiesRate[x]);
                string account = permission.Value.First(x => responsibilitiesRate[x] == minResponsibilities);
                responsibilitiesRate[account]++;
                job.Account = account;
                jobs.Add(job);
            }

            // order jobs, making sure there is distance between the jobs of accounts with more than 1 job
            jobs = RandomOrderResponsibilities(jobs);
            foreach (var job in jobs)
            {
                Console.WriteLine(job.Account + " publish on " + job.Site);
            }
            Console.ReadLine();
        }

        private static Dictionary<string, int> GetResponsibilitiesRate(Dictionary<string, List<string>> dict)
        {
            Dictionary<string, int> responsibilitiesCount = new Dictionary<string, int>();
            foreach (var kvp in dict)
            {
                foreach (var account in kvp.Value)
                {
                    if (responsibilitiesCount.ContainsKey(account))
                    {
                        responsibilitiesCount[account]++;
                    }
                    else
                    {
                        responsibilitiesCount.Add(account, 1);
                    }
                }
            }
            return responsibilitiesCount.OrderBy(x => x.Value).ToDictionary(x => x.Key, x => x.Value);
        }

        private static List<Job> RandomOrderResponsibilities(List<Job> jobs)
        {
            bool couldComplete;
            var maxIterations = 1000;
            var iterationCount = 0;
            do
            {
                // assume success until a conflict cannot be resolved
                couldComplete = true;
                // shuffle
                jobs = jobs.OrderBy(a => Guid.NewGuid()).ToList();
                for (int i = 1; i < jobs.Count; i++)
                {
                    if (jobs[i].Account == jobs[i - 1].Account)
                    {
                        couldComplete = false;
                        for (int j = i + 1; j < jobs.Count; j++)
                        {
                            if (jobs[j].Account != jobs[i].Account)
                            {
                                // swap
                                var temp = jobs[i];
                                jobs[i] = jobs[j];
                                jobs[j] = temp;
                                couldComplete = true;
                                break;
                            }
                        }
                    }
                }
                iterationCount++;
            } while (!couldComplete && iterationCount < maxIterations);
            return jobs;
        }
    }
}
Output (random solution):
Account-A publish on Site-D
Account-C publish on Site-A
Account-A publish on Site-E
Account-D publish on Site-C
Account-E publish on Site-B

Starting tasks inside another task is duplicating my WebRequests

I use the code below to check some PDF files online and return a string accordingly.
The problem is: when I added the second Task.Factory.StartNew(), it started duplicating all requests, while still returning only one answer (as it should).
I need this to be as fast as possible, so I can't waste time sending two requests to the server.
public static void Main(string[] args)
{
    var listT = new List<string>()
    {
        "24006025062"
    };
    var task = listT.Select(x => Task.Factory.StartNew(() => TesteTask(x)));
    Task.WaitAll(task.ToArray(), TimeSpan.FromSeconds(120));
    List<string> results = new List<string>();
    foreach (var result in task)
    {
        results.Add(result.Result);
    }
}
private static string TesteTask(string codCart)
{
    var teste = new Consulta();
    var retorno = string.Empty;
    var session = teste.GetCaptcha();
    for (int i = 0; i < 10; i++)
    {
        session.CaptchaResolvida = QuebraCaptcha(session.CaptchaCodificada).CaptchaResolvida;
        if (session.CaptchaResolvida.Length > 0)
        {
            var links = teste.Consulta(codCart, session).Retorno;
            if (links.Any())
            {
                var tasks = links.Select(x => Task.Factory.StartNew(() => Executa(teste, session, x)));
                Task.WaitAll(tasks.ToArray(), TimeSpan.FromSeconds(120));
                var modelList = from Result in tasks select Result.Result;
                retorno = teste.FinalizaProcesso(modelList.ToList());
                break;
            }
        }
    }
    return retorno;
}

private static string Executa(Consulta teste, Model<Request> session, string link)
{
    var retorno = string.Empty;
    for (int i = 0; i < 10; i++)
    {
        var CaptchaResolvida = QuebraCaptcha(teste.GetCaptchaPdf(session)).CaptchaResolvida;
        if (CaptchaResolvida != null && CaptchaResolvida != string.Empty)
        {
            var status = teste.BaixaPdf(link, CaptchaResolvida, session);
            if (status != string.Empty)
            {
                retorno = status;
                break;
            }
        }
    }
    return retorno;
}
PS: This is my first post on Stack Overflow; if I'm not clear enough, please let me know!
You are getting this behavior because you are iterating twice over the IEnumerable returned by Select. Try this:
public static void Main(string[] args)
{
    var listT = new List<string>()
    {
        "24006025062"
    };
    var task = listT
        .Select(x => Task.Factory.StartNew(() => TesteTask(x)))
        .ToArray(); // materialize once so the tasks are created only once
    Task.WaitAll(task, TimeSpan.FromSeconds(120));
    List<string> results = new List<string>();
    foreach (var result in task)
    {
        results.Add(result.Result);
    }
}
By moving the ToArray() to just after the Select(), the result sequence is created only once instead of twice.
Hope it helps!
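As a side note, a sketch of the same gather-the-results flow using Task.WhenAll instead of blocking with WaitAll (assuming C# 7.1+ so Main can be async; TesteTask is unchanged):
public static async Task Main(string[] args)
{
    var listT = new List<string> { "24006025062" };

    // materialize immediately so the tasks are created exactly once
    var tasks = listT.Select(x => Task.Run(() => TesteTask(x))).ToArray();

    // await all results without blocking the calling thread
    string[] results = await Task.WhenAll(tasks);
}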

Use Task.Run instead of Delegate.BeginInvoke

I have recently upgraded my projects to ASP.NET 4.5 and I have been waiting a long time to use 4.5's asynchronous capabilities. After reading the documentation I'm not sure whether I can improve my code at all.
I want to execute a task asynchronously and then forget about it. The way I'm currently doing this is by creating delegates and then using BeginInvoke.
Here's one of the filters in my project which creates an audit in our database every time a user accesses a resource that must be audited:
public override void OnActionExecuting(ActionExecutingContext filterContext)
{
    var request = filterContext.HttpContext.Request;
    var id = WebSecurity.CurrentUserId;
    var invoker = new MethodInvoker(delegate
    {
        var audit = new Audit
        {
            Id = Guid.NewGuid(),
            IPAddress = request.UserHostAddress,
            UserId = id,
            Resource = request.RawUrl,
            Timestamp = DateTime.UtcNow
        };
        var database = (new NinjectBinder()).Kernel.Get<IDatabaseWorker>();
        database.Audits.InsertOrUpdate(audit);
        database.Save();
    });
    invoker.BeginInvoke(StopAsynchronousMethod, invoker);
    base.OnActionExecuting(filterContext);
}
But in order to finish this asynchronous task, I always need to define a callback, which looks like this:
public void StopAsynchronousMethod(IAsyncResult result)
{
    var state = (MethodInvoker)result.AsyncState;
    try
    {
        state.EndInvoke(result);
    }
    catch (Exception e)
    {
        var username = WebSecurity.CurrentUserName;
        Debugging.DispatchExceptionEmail(e, username);
    }
}
I would rather not use the callback at all due to the fact that I do not need a result from the task that I am invoking asynchronously.
How can I improve this code with Task.Run() (or async and await)?
If I understood your requirements correctly, you want to kick off a task and then forget about it. When the task completes, and if an exception occurred, you want to log it.
I'd use Task.Run to create a task, followed by ContinueWith to attach a continuation task. This continuation task will log any exception that was thrown from the parent task. Also, use TaskContinuationOptions.OnlyOnFaulted to make sure the continuation only runs if an exception occurred.
Task.Run(() =>
{
    var audit = new Audit
    {
        Id = Guid.NewGuid(),
        IPAddress = request.UserHostAddress,
        UserId = id,
        Resource = request.RawUrl,
        Timestamp = DateTime.UtcNow
    };
    var database = (new NinjectBinder()).Kernel.Get<IDatabaseWorker>();
    database.Audits.InsertOrUpdate(audit);
    database.Save();
}).ContinueWith(task =>
{
    task.Exception.Handle(ex =>
    {
        var username = WebSecurity.CurrentUserName;
        Debugging.DispatchExceptionEmail(ex, username);
        return true; // tell Handle the exception has been dealt with
    });
}, TaskContinuationOptions.OnlyOnFaulted);
As a side-note, background tasks and fire-and-forget scenarios in ASP.NET are highly discouraged. See The Dangers of Implementing Recurring Background Tasks In ASP.NET
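On ASP.NET 4.5.2 or later, one mitigation along those lines is to register the work with the runtime via HostingEnvironment.QueueBackgroundWorkItem; a minimal sketch (the body would be the same audit-saving code as above):
using System.Web.Hosting;

// QueueBackgroundWorkItem tracks the work item so ASP.NET can delay
// app-domain shutdown briefly until registered items complete.
HostingEnvironment.QueueBackgroundWorkItem(cancellationToken =>
{
    // same audit-saving body as in the Task.Run example above
});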
It may sound a bit out of scope, but if you just want to fire and forget after you launch it, why not use the ThreadPool directly?
Something like:
ThreadPool.QueueUserWorkItem(
    x =>
    {
        try
        {
            // Do something
            ...
        }
        catch (Exception e)
        {
            // Log something
            ...
        }
    });
I had to do some performance benchmarking for different async call methods, and I found that (not surprisingly) ThreadPool works much better, but also that BeginInvoke is actually not that bad (I am on .NET 4.5). That's what I found out with the code at the end of the post. I did not find something like this online, so I took the time to check it myself. Each call is not exactly equal, but all are more or less functionally equivalent in terms of what they do:
ThreadPool: 70.80ms
Task: 90.88ms
BeginInvoke: 121.88ms
Thread: 4657.52ms
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Threading;
using System.Threading.Tasks;

public class Program
{
    public delegate void ThisDoesSomething();

    // Perform a very simple operation to see the overhead of
    // different async call types.
    public static void Main(string[] args)
    {
        const int repetitions = 25;
        const int calls = 1000;
        var results = new List<Tuple<string, double>>();
        Console.WriteLine(
            "{0} parallel calls, {1} repetitions for better statistics\n",
            calls,
            repetitions);

        // Threads
        Console.Write("Running Threads");
        results.Add(new Tuple<string, double>("Threads", RunOnThreads(repetitions, calls)));
        Console.WriteLine();

        // BeginInvoke
        Console.Write("Running BeginInvoke");
        results.Add(new Tuple<string, double>("BeginInvoke", RunOnBeginInvoke(repetitions, calls)));
        Console.WriteLine();

        // Tasks
        Console.Write("Running Tasks");
        results.Add(new Tuple<string, double>("Tasks", RunOnTasks(repetitions, calls)));
        Console.WriteLine();

        // Thread pool
        Console.Write("Running Thread pool");
        results.Add(new Tuple<string, double>("ThreadPool", RunOnThreadPool(repetitions, calls)));
        Console.WriteLine();
        Console.WriteLine();

        // Show results
        results = results.OrderBy(rs => rs.Item2).ToList();
        foreach (var result in results)
        {
            Console.WriteLine(
                "{0}: Done in {1}ms avg",
                result.Item1,
                (result.Item2 / repetitions).ToString("0.00"));
        }
        Console.WriteLine("Press a key to exit");
        Console.ReadKey();
    }

    /// <summary>
    /// The do stuff.
    /// </summary>
    public static void DoStuff()
    {
        Console.Write("*");
    }

    public static double RunOnThreads(int repetitions, int calls)
    {
        var totalMs = 0.0;
        for (var j = 0; j < repetitions; j++)
        {
            Console.Write(".");
            var toProcess = calls;
            var stopwatch = new Stopwatch();
            var resetEvent = new ManualResetEvent(false);
            var threadList = new List<Thread>();
            for (var i = 0; i < calls; i++)
            {
                threadList.Add(new Thread(() =>
                {
                    // Do something
                    DoStuff();
                    // Safely decrement the counter
                    if (Interlocked.Decrement(ref toProcess) == 0)
                    {
                        resetEvent.Set();
                    }
                }));
            }
            stopwatch.Start();
            foreach (var thread in threadList)
            {
                thread.Start();
            }
            resetEvent.WaitOne();
            stopwatch.Stop();
            totalMs += stopwatch.ElapsedMilliseconds;
        }
        return totalMs;
    }

    public static double RunOnThreadPool(int repetitions, int calls)
    {
        var totalMs = 0.0;
        for (var j = 0; j < repetitions; j++)
        {
            Console.Write(".");
            var toProcess = calls;
            var resetEvent = new ManualResetEvent(false);
            var stopwatch = new Stopwatch();
            var list = new List<int>();
            for (var i = 0; i < calls; i++)
            {
                list.Add(i);
            }
            stopwatch.Start();
            for (var i = 0; i < calls; i++)
            {
                ThreadPool.QueueUserWorkItem(
                    x =>
                    {
                        // Do something
                        DoStuff();
                        // Safely decrement the counter
                        if (Interlocked.Decrement(ref toProcess) == 0)
                        {
                            resetEvent.Set();
                        }
                    },
                    list[i]);
            }
            resetEvent.WaitOne();
            stopwatch.Stop();
            totalMs += stopwatch.ElapsedMilliseconds;
        }
        return totalMs;
    }

    public static double RunOnBeginInvoke(int repetitions, int calls)
    {
        var totalMs = 0.0;
        for (var j = 0; j < repetitions; j++)
        {
            Console.Write(".");
            var beginInvokeStopwatch = new Stopwatch();
            var delegateList = new List<ThisDoesSomething>();
            var resultsList = new List<IAsyncResult>();
            for (var i = 0; i < calls; i++)
            {
                delegateList.Add(DoStuff);
            }
            beginInvokeStopwatch.Start();
            foreach (var delegateToCall in delegateList)
            {
                resultsList.Add(delegateToCall.BeginInvoke(null, null));
            }
            // We lose a bit of accuracy, but if the loop is big enough,
            // it should not really matter
            while (resultsList.Any(rs => !rs.IsCompleted))
            {
                Thread.Sleep(10);
            }
            beginInvokeStopwatch.Stop();
            totalMs += beginInvokeStopwatch.ElapsedMilliseconds;
        }
        return totalMs;
    }

    public static double RunOnTasks(int repetitions, int calls)
    {
        var totalMs = 0.0;
        for (var j = 0; j < repetitions; j++)
        {
            Console.Write(".");
            var resultsList = new List<Task>();
            var stopwatch = new Stopwatch();
            stopwatch.Start();
            for (var i = 0; i < calls; i++)
            {
                resultsList.Add(Task.Factory.StartNew(DoStuff));
            }
            // We lose a bit of accuracy, but if the loop is big enough,
            // it should not really matter
            while (resultsList.Any(task => !task.IsCompleted))
            {
                Thread.Sleep(10);
            }
            stopwatch.Stop();
            totalMs += stopwatch.ElapsedMilliseconds;
        }
        return totalMs;
    }
}
Here's one of the filters in my project which creates an audit in our database every time a user accesses a resource that must be audited
Auditing is certainly not something I would call "fire and forget". Remember, on ASP.NET, "fire and forget" means "I don't care whether this code actually executes or not". So, if your desired semantics are that audits may occasionally be missing, then (and only then) you can use fire and forget for your audits.
If you want to ensure your audits are all correct, then either wait for the audit save to complete before sending the response, or queue the audit information to reliable storage (e.g., Azure queue or MSMQ) and have an independent backend (e.g., Azure worker role or Win32 service) process the audits in that queue.
But if you want to live dangerously (accepting that occasionally audits may be missing), you can mitigate the problems by registering the work with the ASP.NET runtime. Using the BackgroundTaskManager from my blog:
public override void OnActionExecuting(ActionExecutingContext filterContext)
{
    var request = filterContext.HttpContext.Request;
    var id = WebSecurity.CurrentUserId;
    BackgroundTaskManager.Run(() =>
    {
        try
        {
            var audit = new Audit
            {
                Id = Guid.NewGuid(),
                IPAddress = request.UserHostAddress,
                UserId = id,
                Resource = request.RawUrl,
                Timestamp = DateTime.UtcNow
            };
            var database = (new NinjectBinder()).Kernel.Get<IDatabaseWorker>();
            database.Audits.InsertOrUpdate(audit);
            database.Save();
        }
        catch (Exception e)
        {
            var username = WebSecurity.CurrentUserName;
            Debugging.DispatchExceptionEmail(e, username);
        }
    });
    base.OnActionExecuting(filterContext);
}

Task fired again after WaitAll

Using HttpClient.GetAsync or any of its async methods, or any BCL async method, inside a LINQ Select may result in the delegate being fired twice.
Here is a unit test case:
[TestMethod]
public void TestTwiceShoot()
{
    List<string> items = new List<string>();
    items.Add("1");
    int k = 0;
    var tasks = items.Select(d =>
    {
        k++;
        var client = new System.Net.Http.HttpClient();
        return client.GetAsync(new Uri("http://testdevserver.ibs.local:8020/prestashop/api/products/1"));
    });
    Task.WaitAll(tasks.ToArray());
    foreach (var r in tasks)
    {
    }
    Assert.AreEqual(1, k);
}
The test fails, since k is 2. Somehow the program runs the delegate that fires GetAsync twice. Why?
If I remove the foreach (var r in tasks), the test passes. Why?
[TestMethod]
public void TestTwiceShoot()
{
    List<string> items = new List<string>();
    items.Add("1");
    int k = 0;
    var tasks = items.Select(d =>
    {
        k++;
        var client = new System.Net.Http.HttpClient();
        return client.GetAsync(new Uri("http://testdevserver.ibs.local:8020/prestashop/api/products/1"));
    });
    Task.WaitAll(tasks.ToArray());
    Assert.AreEqual(1, k);
}
If I use foreach instead of items.Select, the test passes. Why?
[TestMethod]
public void TestTwiceShoot()
{
    List<string> items = new List<string>();
    items.Add("1");
    int k = 0;
    var tasks = new List<Task<System.Net.Http.HttpResponseMessage>>();
    foreach (var item in items)
    {
        k++;
        var client = new System.Net.Http.HttpClient();
        tasks.Add(client.GetAsync(new Uri("http://testdevserver.ibs.local:8020/prestashop/api/products/1")));
    }
    Task.WaitAll(tasks.ToArray());
    foreach (var r in tasks)
    {
    }
    Assert.AreEqual(1, k);
}
Apparently the enumerable returned by items.Select does not play well with the Task objects it returns: as soon as I walk the enumerator again, the delegate gets fired again.
This test passes:
[TestMethod]
public void TestTwiceShoot()
{
    List<string> items = new List<string>();
    items.Add("1");
    int k = 0;
    var tasks = items.Select(d =>
    {
        k++;
        var client = new System.Net.Http.HttpClient();
        return client.GetAsync(new Uri("http://testdevserver.ibs.local:8020/prestashop/api/products/1"));
    });
    var tasksArray = tasks.ToArray();
    Task.WaitAll(tasksArray);
    foreach (var r in tasksArray)
    {
    }
    Assert.AreEqual(1, k);
}
Scott mentioned that the Select may run again when walking the enumerator; however, this test passes:
[TestMethod]
public void TestTwiceShoot()
{
    List<string> items = new List<string>();
    items.Add("1");
    int k = 0;
    var tasks = items.Select(d =>
    {
        k++;
        return int.Parse(d);
    });
    foreach (var r in tasks)
    {
    }
    Assert.AreEqual(1, k);
}
I guess LINQ's Select has some special treatment for Task.
After all, what's the right way to fire multiple async methods in LINQ and then examine the results after WaitAll?
It is because tasks is an IEnumerable<Task>, and each time you enumerate through it the .Select() operation is re-run. Currently you run through the list twice: once when you call .ToArray() and once when you pass it in to the foreach.
To fix the problem, use the .ToArray() like you are, but move it up earlier:
var tasks = items.Select(d =>
{
    k++;
    var client = new System.Net.Http.HttpClient();
    return client.GetAsync(new Uri("http://testdevserver.ibs.local:8020/prestashop/api/products/1"));
}).ToArray(); // this makes tasks a "Task[]" instead of an IEnumerable<Task>

Task.WaitAll(tasks);
foreach (var r in tasks)
{
}
Things like what happened to you are why Microsoft recommends that LINQ statements not have any side effects (like incrementing k): it is hard to tell how many times the statement will be run, especially if the resulting IEnumerable<T> leaves your scope of control by being returned as a result or passed in to another function.
I think the problem is my misconception about how enumeration works. These tests pass:
[TestMethod]
public void TestTwiceShoot()
{
    List<string> items = new List<string>();
    items.Add("1");
    int k = 0;
    var tasks = items.Select(d =>
    {
        k++;
        return int.Parse(d);
    });
    foreach (var r in tasks)
    {
    }
    foreach (var r in tasks)
    {
    }
    Assert.AreEqual(2, k);
}

[TestMethod]
public void TestTwiceShoot2()
{
    List<string> items = new List<string>();
    items.Add("1");
    int k = 0;
    var tasks = items.Where(d =>
    {
        k++;
        return true;
    });
    foreach (var r in tasks)
    {
    }
    foreach (var r in tasks)
    {
    }
    Assert.AreEqual(2, k);
}
I had thought the LINQ statement returns an IEnumerable object which stores the results of the delegate. However, it obviously stores only references to the delegates, so each walk of the enumerator triggers them again. Therefore, it is good to use ToArray() or ToList() to get a materialized list of results, like this one:
[TestMethod]
public void TestTwiceShoot2()
{
    List<string> items = new List<string>();
    items.Add("1");
    int k = 0;
    var tasks = items.Where(d =>
    {
        k++;
        return true;
    }).ToList();
    foreach (var r in tasks)
    {
    }
    foreach (var r in tasks)
    {
    }
    Assert.AreEqual(1, k);
}

Producer Consumer model using TPL, Tasks in .NET 4.0

I have a fairly large XML file (around 1-2 GB).
The requirement is to persist the XML data into a database.
Currently this is achieved in 3 steps:
Read the large file with as small a memory footprint as possible
Create entities from the XML data
Store the data from the created entities into the database using SqlBulkCopy
To achieve better performance I want to create a producer-consumer model, where the producer creates a set of entities, say a batch of 10K, and adds it to a queue, and the consumer takes a batch of entities from the queue and persists them to the database using SqlBulkCopy.
Thanks,
Gokul
void Main()
{
    int iCount = 0;
    string fileName = @"C:\Data\CatalogIndex.xml";
    DateTime startTime = DateTime.Now;
    Console.WriteLine("Start Time: {0}", startTime);
    FileInfo fi = new FileInfo(fileName);
    Console.WriteLine("File Size:{0} MB", fi.Length / 1048576.0);
    /* I want to change this loop into a producer-consumer pattern here
       to process the data in parallel */
    foreach (var element in StreamElements(fileName, "title"))
    {
        iCount++;
    }
    Console.WriteLine("Count: {0}", iCount);
    Console.WriteLine("End Time: {0}, Time Taken:{1}", DateTime.Now, DateTime.Now - startTime);
}

private static IEnumerable<XElement> StreamElements(string fileName, string elementName)
{
    using (var rdr = XmlReader.Create(fileName))
    {
        rdr.MoveToContent();
        while (!rdr.EOF)
        {
            if ((rdr.NodeType == XmlNodeType.Element) && (rdr.Name == elementName))
            {
                var e = XElement.ReadFrom(rdr) as XElement;
                yield return e;
            }
            else
            {
                rdr.Read();
            }
        }
        rdr.Close();
    }
}
Is this what you are trying to do?
void Main()
{
    const int inputCollectionBufferSize = 1024;
    const int bulkInsertBufferCapacity = 100;
    const int bulkInsertConcurrency = 4;

    BlockingCollection<object> inputCollection = new BlockingCollection<object>(inputCollectionBufferSize);

    Task loadTask = Task.Factory.StartNew(() =>
    {
        foreach (object nextItem in ReadAllElements(...))
        {
            // this will potentially block if there are already enough items
            inputCollection.Add(nextItem);
        }
        // mark this collection as done
        inputCollection.CompleteAdding();
    });

    Action parseAction = () =>
    {
        List<object> bulkInsertBuffer = new List<object>(bulkInsertBufferCapacity);
        foreach (object nextItem in inputCollection.GetConsumingEnumerable())
        {
            if (bulkInsertBuffer.Count == bulkInsertBufferCapacity)
            {
                CommitBuffer(bulkInsertBuffer);
                bulkInsertBuffer.Clear();
            }
            bulkInsertBuffer.Add(nextItem);
        }
        // commit whatever is left once the producer is done
        if (bulkInsertBuffer.Count > 0)
        {
            CommitBuffer(bulkInsertBuffer);
        }
    };

    List<Task> parseTasks = new List<Task>(bulkInsertConcurrency);
    for (int i = 0; i < bulkInsertConcurrency; i++)
    {
        parseTasks.Add(Task.Factory.StartNew(parseAction));
    }

    // wait before exiting
    loadTask.Wait();
    Task.WaitAll(parseTasks.ToArray());
}
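For completeness, a minimal sketch of what a CommitBuffer backed by SqlBulkCopy could look like; the connection string, destination table, and single Title column are placeholders, and the cast assumes the queued objects are the XElement instances streamed above:
using System.Collections.Generic;
using System.Data;
using System.Data.SqlClient;
using System.Xml.Linq;

static void CommitBuffer(List<object> buffer)
{
    // Flatten the buffered elements into a DataTable whose columns
    // match the destination table (placeholder schema).
    var table = new DataTable();
    table.Columns.Add("Title", typeof(string));
    foreach (var item in buffer)
    {
        table.Rows.Add(((XElement)item).Value);
    }

    using (var bulk = new SqlBulkCopy("<connection string>"))
    {
        bulk.DestinationTableName = "dbo.CatalogTitles"; // placeholder table name
        bulk.WriteToServer(table);
    }
}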
