Read and Write as Parallel Tasks - c#

Reading and writing run as two parallel tasks, as shown below:
Task[] tasks = new Task[2];
var entityCollection = new BlockingCollection<Dictionary<String, object>>();
tasks[0] = Task.Factory.StartNew(() => ReadData(entityCollection), TaskCreationOptions.LongRunning);
tasks[1] = Task.Factory.StartNew(() => WriteJsontoFile(JSONFileName, entityCollection), TaskCreationOptions.LongRunning);
Task.WaitAll(tasks);
Read Task:
private void ReadData(BlockingCollection<Dictionary<String, object>> collection)
{
do
{
// Data is continuously read into 'entities' (this part works fine)
// and then added to the BlockingCollection consumed by the Write task.
entities.ToList().ForEach(e => collection.Add(e));
} while (true); // NOTE: this loop never exits, so the call below is unreachable
collection.CompleteAdding(); // never runs until the loop is given an exit condition
}
Write Task:
private void WriteJsontoFile(String JsonFileName, BlockingCollection<Dictionary<String, object>> source)
{
using (StreamWriter sw = new StreamWriter(JsonFileName, true))
{
Parallel.ForEach(source.GetConsumingPartitioner(), (line) => ser.Serialize(sw, line));
}
}
GetConsumingPartitioner() related code:
public static class BlockingCollection
{
public static Partitioner<T> GetConsumingPartitioner<T>(
this BlockingCollection<T> collection)
{
return new BlockingCollectionPartitioner<T>(collection);
}
}
class BlockingCollectionPartitioner<T> : Partitioner<T>
{
private BlockingCollection<T> _collection;
internal BlockingCollectionPartitioner(BlockingCollection<T> collection)
{
if (collection == null)
throw new ArgumentNullException("collection");
_collection = collection;
}
public override bool SupportsDynamicPartitions
{
get { return true; }
}
public override IList<IEnumerator<T>> GetPartitions(int partitionCount)
{
if (partitionCount < 1)
throw new ArgumentOutOfRangeException("partitionCount");
var dynamicPartitioner = GetDynamicPartitions();
return Enumerable.Range(0, partitionCount).Select(_ =>
dynamicPartitioner.GetEnumerator()).ToArray();
}
public override IEnumerable<T> GetDynamicPartitions()
{
return _collection.GetConsumingEnumerable();
}
}
I am getting the exception below inside the Write task:
Count cannot be less than zero.\r\nParameter name: count

That is not the standard syntax for consuming a BlockingCollection. From the BlockingCollection&lt;T&gt; class documentation:
// Consume the BlockingCollection
while (true) Console.WriteLine(bc.Take());
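Beyond the syntax, the likely culprit is the Write task itself: StreamWriter is not thread-safe, so serializing to a shared writer from multiple Parallel.ForEach threads can corrupt its internal buffer and surface as exactly this kind of ArgumentOutOfRangeException. A minimal single-consumer sketch (assuming ser is a serializer with a Serialize(TextWriter, object) method, as in the question):
private void WriteJsonToFile(string jsonFileName, BlockingCollection<Dictionary<string, object>> source)
{
    using (StreamWriter sw = new StreamWriter(jsonFileName, true))
    {
        // GetConsumingEnumerable blocks until items are available and only
        // completes after CompleteAdding() has been called by the producer.
        foreach (var entity in source.GetConsumingEnumerable())
        {
            ser.Serialize(sw, entity); // one consumer thread: no concurrent access to sw
        }
    }
}
Note this only completes if ReadData eventually calls CompleteAdding(), which the read loop above currently never reaches.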


Is it better to block on an event in CPU-bound multithreaded method than making it async?

I have a method that will spawn lots of CPU-bound workers with Task.Run(). Each worker may in turn spawn more workers, but I'm guaranteed that eventually, all workers will stop executing. My first thought was writing my method like this:
public Result OrchestrateWorkers(WorkItem[] workitems)
{
this.countdown = new CountdownEvent(0);
this.results = new ConcurrentQueue<WorkerResult>();
foreach (var workItem in workitems)
{
SpawnWorker(workItem);
}
this.countdown.Wait(); // until all spawned workers have completed.
return ComputeTotalResult(this.results);
}
The public SpawnWorker method is used to start a worker, and to keep track of when they complete by enqueueing the worker's result and decrementing the countdown.
public void SpawnWorker(WorkItem workItem)
{
this.countdown.AddCount();
Task.Run(() => {
// Worker is passed an instance of this class
// so it can call SpawnWorker if it needs to.
var worker = new Worker(workItem, this);
var result = worker.DoWork();
this.results.Enqueue(result);
countdown.Signal();
});
}
Each worker can call SpawnWorker as much as they like, but they're guaranteed to terminate at some point.
In this design, the thread that calls OrchestrateWorkers will block until all the workers have completed. My thinking is that it's a shame that there's a blocked thread; it would be nice if it could be doing work as well.
Would it be better to rearchitect the solution to something like this?
public Task<Result> OrchestrateWorkersAsync(WorkItem[] workitems)
{
if (this.tcs is not null) throw new InvalidOperationException("Already running!");
this.tcs = new TaskCompletionSource<Result>();
this.countdown = 0; // just a normal integer.
this.results = new ConcurrentQueue<WorkerResult>();
foreach (var workItem in workitems)
{
SpawnWorker(workItem);
}
return tcs.Task;
}
public void SpawnWorker(WorkItem workItem)
{
Interlocked.Increment(ref this.countdown);
Task.Run(() => {
var worker = new Worker(workItem, this);
var result = worker.DoWork();
this.results.Enqueue(result);
if (Interlocked.Decrement(ref countdown) == 0)
{
this.tcs.SetResult(this.ComputeTotalResult(this.results));
}
});
}
EDIT: I've added a more fully fleshed-out sample below. It should be compilable and runnable. I'm seeing a ~10% performance improvement on my 8-core system, but I want to make sure this is the "canonical" way to orchestrate a swarm of spawning tasks.
using System.Collections.Concurrent;
using System.Diagnostics;
using System;
using System.Collections.Generic;
using System.Threading;
using System.Threading.Tasks;
using System.Linq;
public class Program
{
const int ITERATIONS = 2500000;
const int WORKERS = 200;
public static async Task Main()
{
var o = new Orchestrator<int, int>();
var oo = new OrchestratorAsync<int, int>();
var array = Enumerable.Range(0, WORKERS);
var result = Time(() => o.OrchestrateWorkers(array, DoWork));
Console.Error.WriteLine("Sync spawned {0} workers", result.Count());
var resultAsync = await TimeAsync(() => oo.OrchestrateWorkersAsync(array, DoWorkAsync));
Console.Error.WriteLine("Async spawned {0} workers", resultAsync.Count());
}
static async Task<T> TimeAsync<T>(Func<Task<T>> work)
{
var sw = new Stopwatch();
sw.Start();
var result = await work();
sw.Stop();
Console.WriteLine("Total async time: {0}", sw.ElapsedMilliseconds);
return result;
}
static T Time<T>(Func<T> work)
{
var sw = new Stopwatch();
sw.Start();
var result = work();
sw.Stop();
Console.WriteLine("Total time: {0}", sw.ElapsedMilliseconds);
return result;
}
static int DoWork(int x, Orchestrator<int, int> arg2)
{
var rnd = new Random();
int n = 0;
for (int i = 0; i < ITERATIONS; ++i)
{
n += rnd.Next();
}
if (x >= 0)
{
arg2.SpawnWorker(-1, DoWork);
arg2.SpawnWorker(-1, DoWork);
}
return n;
}
static int DoWorkAsync(int x, OrchestratorAsync<int, int> arg2)
{
var rnd = new Random();
int n = 0;
for (int i = 0; i < ITERATIONS; ++i)
{
n += rnd.Next();
}
if (x >= 0)
{
arg2.SpawnWorker(-1, DoWorkAsync);
arg2.SpawnWorker(-1, DoWorkAsync);
}
return n;
}
public class Orchestrator<TWorkItem, TResult>
{
private ConcurrentQueue<TResult> results;
private CountdownEvent countdownEvent;
public Orchestrator()
{
this.results = new();
this.countdownEvent = new(1);
}
public IEnumerable<TResult> OrchestrateWorkers(
IEnumerable<TWorkItem> workItems,
Func<TWorkItem, Orchestrator<TWorkItem, TResult>, TResult> worker)
{
foreach (var workItem in workItems)
{
SpawnWorker(workItem, worker);
}
countdownEvent.Signal();
countdownEvent.Wait();
return results;
}
public void SpawnWorker(
TWorkItem workItem,
Func<TWorkItem, Orchestrator<TWorkItem, TResult>, TResult> worker)
{
this.countdownEvent.AddCount(1);
Task.Run(() =>
{
var result = worker(workItem, this);
this.results.Enqueue(result);
countdownEvent.Signal();
});
}
}
public class OrchestratorAsync<TWorkItem, TResult>
{
private ConcurrentQueue<TResult> results;
private volatile int countdown;
private TaskCompletionSource<IEnumerable<TResult>> tcs;
public OrchestratorAsync()
{
this.results = new();
this.countdown = 0;
this.tcs = new TaskCompletionSource<IEnumerable<TResult>>();
}
public Task<IEnumerable<TResult>> OrchestrateWorkersAsync(
IEnumerable<TWorkItem> workItems,
Func<TWorkItem, OrchestratorAsync<TWorkItem, TResult>, TResult> worker)
{
this.countdown = 0; // just a normal integer.
foreach (var workItem in workItems)
{
SpawnWorker(workItem, worker);
}
return tcs.Task;
}
public void SpawnWorker(TWorkItem workItem,
Func<TWorkItem, OrchestratorAsync<TWorkItem, TResult>, TResult> worker)
{
Interlocked.Increment(ref this.countdown);
Task.Run(() =>
{
var result = worker(workItem, this);
this.results.Enqueue(result);
if (Interlocked.Decrement(ref countdown) == 0)
{
this.tcs.SetResult(this.results);
}
});
}
}
}
There's one big problem with the code as-written: the tasks fired off by Task.Run are discarded. This means there's no way to detect if anything goes wrong (i.e., an exception). It also means that there's not an easy way to aggregate results during execution, which is a common requirement; this lack of natural result handling is making the code collect results "out of band" in a separate collection.
These are flags that this code is asking for an adjustment to its structure. This is actual parallel code (i.e., not asynchronous), so parallel patterns are appropriate. You don't know how many tasks you need initially, so basic Data/Task Parallelism (such as a Parallel or PLINQ approach) won't suffice. At this point you need Dynamic Task Parallelism, which is the most complex kind of parallelism. The TPL does support it, but your code has to use the lower-level APIs to get it done.
Since you have dynamically-added work and since your structure is generally tree-shaped (each work can add other work), you can introduce an artificial root and then use child tasks. This will give you two - and possibly three - benefits:
1. All exceptions are no longer ignored. Child task exceptions are propagated up to their parents, all the way to the root.
2. You know when all the tasks are complete. Since parent tasks only complete when all their children complete, there's no need for a countdown event or any other orchestrating synchronization primitive; your code just has to wait on the root task, and all the work is done when that task completes.
3. If it is possible/desirable to reduce results as you go (a common requirement), then the child tasks can return the results and you will end up with the already-reduced results as the result of your root task.
Example code (ignoring (3) since it's not clear whether results can be reduced):
public class OrchestratorParentChild<TWorkItem, TResult>
{
private readonly ConcurrentQueue<TResult> results = new();
public IEnumerable<TResult> OrchestrateWorkers(
IEnumerable<TWorkItem> workItems,
Func<TWorkItem, OrchestratorParentChild<TWorkItem, TResult>, TResult> worker)
{
var rootTask = Task.Factory.StartNew(
() =>
{
foreach (var workItem in workItems)
SpawnWorker(workItem, worker);
},
default,
TaskCreationOptions.None,
TaskScheduler.Default);
rootTask.Wait();
return results;
}
public void SpawnWorker(
TWorkItem workItem,
Func<TWorkItem, OrchestratorParentChild<TWorkItem, TResult>, TResult> worker)
{
_ = Task.Factory.StartNew(
() => results.Enqueue(worker(workItem, this)),
default,
TaskCreationOptions.AttachedToParent,
TaskScheduler.Default);
}
}
Note that an "orchestrator" isn't normally used. Code using the Dynamic Task Parallelism pattern usually just calls StartNew directly instead of calling some orchestrator "spawn work" method.
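A minimal sketch of that more direct style, with hypothetical Process, DoCpuBoundWork, and DiscoverMoreWork methods standing in for the worker logic:
var root = Task.Factory.StartNew(
    () => Process(rootItem),
    CancellationToken.None,
    TaskCreationOptions.None,
    TaskScheduler.Default);
root.Wait(); // completes only after every attached descendant completes

static void Process(WorkItem item)
{
    DoCpuBoundWork(item); // hypothetical CPU-bound body
    foreach (var child in DiscoverMoreWork(item)) // hypothetical: work found while processing
    {
        Task.Factory.StartNew(
            () => Process(child),
            CancellationToken.None,
            TaskCreationOptions.AttachedToParent, // ties the child's lifetime to its parent
            TaskScheduler.Default);
    }
}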
In case you're wondering how this may look with results, here's one possibility:
public class OrchestratorParentChild<TWorkItem, TResult>
{
public TResult OrchestrateWorkers(
IEnumerable<TWorkItem> workItems,
Func<TWorkItem, OrchestratorParentChild<TWorkItem, TResult>, Func<IEnumerable<TResult>, TResult>, TResult> worker,
Func<IEnumerable<TResult>, TResult> resultReducer)
{
var rootTask = Task.Factory.StartNew(
() =>
{
var childTasks = workItems.Select(x => SpawnWorker(x, worker, resultReducer)).ToArray();
Task.WaitAll(childTasks);
return resultReducer(childTasks.Select(x => x.Result));
},
default,
TaskCreationOptions.None,
TaskScheduler.Default);
return rootTask.Result;
}
public Task<TResult> SpawnWorker(
TWorkItem workItem,
Func<TWorkItem, OrchestratorParentChild<TWorkItem, TResult>, Func<IEnumerable<TResult>, TResult>, TResult> worker,
Func<IEnumerable<TResult>, TResult> resultReducer)
{
return Task.Factory.StartNew(
() => worker(workItem, this, resultReducer),
default,
TaskCreationOptions.AttachedToParent,
TaskScheduler.Default);
}
}
As a final note, I rarely plug my book on this site, but you may find it helpful. Also look for a copy of "Parallel Programming with Microsoft® .NET: Design Patterns for Decomposition and Coordination on Multicore Architectures" if you can find it; it's a bit out of date in some places but still good overall if you want to do TPL programming.

Short circuit yield return & cleanup/dispose

Take this pseudo example code:
static System.Runtime.InteropServices.ComTypes.IEnumString GetUnmanagedObject() => null;
static IEnumerable<string> ProduceStrings()
{
System.Runtime.InteropServices.ComTypes.IEnumString obj = GetUnmanagedObject();
var result = new string[1];
var pFetched = Marshal.AllocHGlobal(sizeof(int));
while(obj.Next(1, result, pFetched) == 0)
{
yield return result[0];
}
Marshal.ReleaseComObject(obj);
}
static void Consumer()
{
foreach (var item in ProduceStrings())
{
if (item.StartsWith("foo"))
return;
}
}
The question is: if I decide not to enumerate all values, how can I inform the producer to do cleanup?
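One thing worth knowing up front: foreach disposes the enumerator it creates, and in a compiler-generated iterator, Dispose() executes any pending finally blocks. So the producer can be fixed with nothing more than try/finally around the yield loop; a minimal sketch, reusing the question's hypothetical GetUnmanagedObject:
static IEnumerable<string> ProduceStrings()
{
    System.Runtime.InteropServices.ComTypes.IEnumString obj = GetUnmanagedObject();
    var result = new string[1];
    var pFetched = Marshal.AllocHGlobal(sizeof(int));
    try
    {
        while (obj.Next(1, result, pFetched) == 0)
        {
            yield return result[0];
        }
    }
    finally
    {
        // Runs on normal completion and when the consumer breaks out of its
        // foreach early, because foreach calls Dispose() on the enumerator.
        Marshal.ReleaseComObject(obj);
        Marshal.FreeHGlobal(pFetched);
    }
}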
Even if you are after a solution using yield return, it might be useful to see how this can be accomplished with an explicit IEnumerator<string> implementation.
IEnumerator<T> derives from IDisposable, and the Dispose() method will be called when the foreach is left (the foreach statement has disposed enumerators since C# 1.2).
static IEnumerable<string> ProduceStrings()
{
return new ProduceStringsImpl();
}
This is the class implementing IEnumerable<string>
class ProduceStringsImpl : IEnumerable<string>
{
public IEnumerator<string> GetEnumerator()
{
return new EnumProduceStrings();
}
IEnumerator IEnumerable.GetEnumerator()
{
return GetEnumerator();
}
}
And here we have the core of the solution, the IEnumerator<string> implementation:
class EnumProduceStrings : IEnumerator<string>
{
private System.Runtime.InteropServices.ComTypes.IEnumString _obj;
private string[] _result;
private IntPtr _pFetched;
public EnumProduceStrings()
{
_obj = GetUnmanagedObject();
_result = new string[1];
_pFetched = Marshal.AllocHGlobal(sizeof(int));
}
public bool MoveNext()
{
return _obj.Next(1, _result, _pFetched) == 0;
}
public string Current => _result[0];
void IEnumerator.Reset() => throw new NotImplementedException();
object IEnumerator.Current => Current;
public void Dispose()
{
Marshal.ReleaseComObject(_obj);
Marshal.FreeHGlobal(_pFetched);
}
}
I knew I could! Despite the guard, Cancel is only ever called once in all circumstances.
You can instead encapsulate the logic with a type like IterationResult<T> and provide a Cleanup method on it, but it's essentially the same idea.
public class IterationCanceller
{
Action m_OnCancel;
public bool Cancelled { get; private set; }
public IterationCanceller(Action onCancel)
{
m_OnCancel = onCancel;
}
public void Cancel()
{
if (!Cancelled)
{
Cancelled = true;
m_OnCancel();
}
}
}
static IEnumerable<(string Result, IterationCanceller Canceller)> ProduceStrings()
{
var pUnmanaged = Marshal.AllocHGlobal(sizeof(int));
IterationCanceller canceller = new IterationCanceller(() =>
{
Marshal.FreeHGlobal(pUnmanaged);
});
for (int i = 0; i < 2; i++) // also try i < 0, 1
{
yield return (i.ToString(), canceller);
}
canceller.Cancel();
}
static void Consumer()
{
foreach (var (item, canceller) in ProduceStrings())
{
if(item.StartsWith("1")) // also try consuming all values
{
canceller.Cancel();
break;
}
}
}

C# How to partition parallel foreach loop to iterate my list

I am new to the programming world; I am doing my graduation and also learning .NET.
I want to iterate my list with a parallel foreach, but I want to use a partitioner there. My knowledge is limited, so my code does not compile.
This is the way I did it first, which works:
Parallel.ForEach(MyBroker, broker =>
{
mybrow = new WeightageRowNumber();
mybrow.RowNumber = Interlocked.Increment(ref rowNumber);
lock (_lock)
{
Mylist.Add(mybrow);
}
});
Now I want to use a partitioner, so I changed my code this way, but now it does not compile. Here is the code:
Parallel.ForEach(MyBroker, broker,
(j, loop, subtotal) =>
{
mybrow = new WeightageRowNumber();
mybrow.RowNumber = Interlocked.Increment(ref rowNumber);
lock (_lock)
{
Mylist.Add(mybrow);
}
return brokerRowWeightageRowNumber.RowNumber;
},
(finalResult) =>
var rownum= Interlocked.Increment(ref finalResult);
console.writeline(rownum);
);
Please look at my second set of code and show me how to restructure it to use a partitioner with Parallel.ForEach to iterate my list.
Please guide me. Thanks.
The Parallel.ForEach method has 20 overloads - perhaps try a different overload?
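For instance, here is a minimal sketch of the localInit/localFinally overload your second snippet seems to be aiming for, keeping your own MyBroker, Mylist, _lock, and rowNumber names (assumed to be fields):
Parallel.ForEach(
    MyBroker,
    () => 0,                          // localInit: per-partition subtotal
    (broker, loopState, subtotal) =>  // body: runs once per item
    {
        var mybrow = new WeightageRowNumber();
        mybrow.RowNumber = Interlocked.Increment(ref rowNumber);
        lock (_lock)
        {
            Mylist.Add(mybrow);
        }
        return subtotal + 1;          // carry the partition-local count forward
    },
    subtotal =>                       // localFinally: runs once per partition
    {
        Console.WriteLine("Partition handled {0} brokers", subtotal);
    });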
Without your dependencies included I can't give a 1-to-1 example for your implementation, but here is an in-depth example (reformatted from an online sample) that you can copy into your IDE and set debug breakpoints in (if that's useful). Unfortunately, building an instantiable implementation of OrderablePartitioner appears non-trivial, so apologies for all the boilerplate code:
using System;
using System.Collections.Generic;
using System.Threading.Tasks;
using System.Threading;
using System.Collections.Concurrent;
using System.Collections;
using System.Linq;
// Simple partitioner that will extract one (index,item) pair at a time,
// in a thread-safe fashion, from the underlying collection.
class SingleElementOrderablePartitioner<T> : OrderablePartitioner<T>
{
// The collection being wrapped by this Partitioner
IEnumerable<T> m_referenceEnumerable;
// Class used to wrap m_index for the purpose of sharing access to it
// between an InternalEnumerable and multiple InternalEnumerators
private class Shared<U>
{
internal U Value;
public Shared(U item)
{
Value = item;
}
}
// Internal class that serves as a shared enumerable for the
// underlying collection.
private class InternalEnumerable : IEnumerable<KeyValuePair<long, T>>, IDisposable
{
IEnumerator<T> m_reader;
bool m_disposed = false;
Shared<long> m_index = null;
// These two are used to implement Dispose() when static partitioning is being performed
int m_activeEnumerators;
bool m_downcountEnumerators;
// "downcountEnumerators" will be true for static partitioning, false for
// dynamic partitioning.
public InternalEnumerable(IEnumerator<T> reader, bool downcountEnumerators)
{
m_reader = reader;
m_index = new Shared<long>(0);
m_activeEnumerators = 0;
m_downcountEnumerators = downcountEnumerators;
}
public IEnumerator<KeyValuePair<long, T>> GetEnumerator()
{
if (m_disposed)
throw new ObjectDisposedException("InternalEnumerable: Can't call GetEnumerator() after disposing");
// For static partitioning, keep track of the number of active enumerators.
if (m_downcountEnumerators) Interlocked.Increment(ref m_activeEnumerators);
return new InternalEnumerator(m_reader, this, m_index);
}
IEnumerator<KeyValuePair<long, T>> IEnumerable<KeyValuePair<long, T>>.GetEnumerator()
{
return this.GetEnumerator();
}
public void Dispose()
{
if (!m_disposed)
{
// Only dispose the source enumerator if you are doing dynamic partitioning
if (!m_downcountEnumerators)
{
m_reader.Dispose();
}
m_disposed = true;
}
}
// Called from Dispose() method of spawned InternalEnumerator. During
// static partitioning, the source enumerator will be automatically
// disposed once all requested InternalEnumerators have been disposed.
public void DisposeEnumerator()
{
if (m_downcountEnumerators)
{
if (Interlocked.Decrement(ref m_activeEnumerators) == 0)
{
m_reader.Dispose();
}
}
}
IEnumerator IEnumerable.GetEnumerator()
{
throw new NotImplementedException();
}
}
// Internal class that serves as a shared enumerator for
// the underlying collection.
private class InternalEnumerator : IEnumerator<KeyValuePair<long, T>>
{
KeyValuePair<long, T> m_current;
IEnumerator<T> m_source;
InternalEnumerable m_controllingEnumerable;
Shared<long> m_index = null;
bool m_disposed = false;
public InternalEnumerator(IEnumerator<T> source, InternalEnumerable controllingEnumerable, Shared<long> index)
{
m_source = source;
m_current = default(KeyValuePair<long, T>);
m_controllingEnumerable = controllingEnumerable;
m_index = index;
}
object IEnumerator.Current
{
get { return m_current; }
}
KeyValuePair<long, T> IEnumerator<KeyValuePair<long, T>>.Current
{
get { return m_current; }
}
void IEnumerator.Reset()
{
throw new NotSupportedException("Reset() not supported");
}
// This method is the crux of this class. Under lock, it calls
// MoveNext() on the underlying enumerator, grabs Current and index,
// and increments the index.
bool IEnumerator.MoveNext()
{
bool rval = false;
lock (m_source)
{
rval = m_source.MoveNext();
if (rval)
{
m_current = new KeyValuePair<long, T>(m_index.Value, m_source.Current);
m_index.Value = m_index.Value + 1;
}
else m_current = default(KeyValuePair<long, T>);
}
return rval;
}
void IDisposable.Dispose()
{
if (!m_disposed)
{
// Delegate to parent enumerable's DisposeEnumerator() method
m_controllingEnumerable.DisposeEnumerator();
m_disposed = true;
}
}
}
// Constructor just grabs the collection to wrap
public SingleElementOrderablePartitioner(IEnumerable<T> enumerable)
: base(true, true, true)
{
// Verify that the source IEnumerable is not null
if (enumerable == null)
throw new ArgumentNullException("enumerable");
m_referenceEnumerable = enumerable;
}
// Produces a list of "numPartitions" IEnumerators that can each be
// used to traverse the underlying collection in a thread-safe manner.
// This will return a static number of enumerators, as opposed to
// GetOrderableDynamicPartitions(), the result of which can be used to produce
// any number of enumerators.
public override IList<IEnumerator<KeyValuePair<long, T>>> GetOrderablePartitions(int numPartitions)
{
if (numPartitions < 1)
throw new ArgumentOutOfRangeException("numPartitions");
List<IEnumerator<KeyValuePair<long, T>>> list = new List<IEnumerator<KeyValuePair<long, T>>>(numPartitions);
// Since we are doing static partitioning, create an InternalEnumerable with reference
// counting of spawned InternalEnumerators turned on. Once all of the spawned enumerators
// are disposed, dynamicPartitions will be disposed.
var dynamicPartitions = new InternalEnumerable(m_referenceEnumerable.GetEnumerator(), true);
for (int i = 0; i < numPartitions; i++)
list.Add(dynamicPartitions.GetEnumerator());
return list;
}
// Returns an instance of our internal Enumerable class. GetEnumerator()
// can then be called on that (multiple times) to produce shared enumerators.
public override IEnumerable<KeyValuePair<long, T>> GetOrderableDynamicPartitions()
{
// Since we are doing dynamic partitioning, create an InternalEnumerable with reference
// counting of spawned InternalEnumerators turned off. This returned InternalEnumerable
// will need to be explicitly disposed.
return new InternalEnumerable(m_referenceEnumerable.GetEnumerator(), false);
}
// Must be set to true if GetDynamicPartitions() is supported.
public override bool SupportsDynamicPartitions
{
get { return true; }
}
}
Here are examples of how to structure Parallel.ForEach using the above OrderablePartitioner. See how the partitioning logic can be refactored entirely out of the ForEach implementation?
public class Program
{
static void Main(string[] args)
{
//
// First a fairly simple visual test
//
var someCollection = new string[] { "four", "score", "and", "twenty", "years", "ago" };
var someOrderablePartitioner = new SingleElementOrderablePartitioner<string>(someCollection);
Parallel.ForEach(someOrderablePartitioner, (item, state, index) =>
{
Console.WriteLine("ForEach: item = {0}, index = {1}, thread id = {2}", item, index, Thread.CurrentThread.ManagedThreadId);
});
//
// Now a more rigorous test of dynamic partitioning (used by Parallel.ForEach)
//
List<int> src = Enumerable.Range(0, 100000).ToList();
SingleElementOrderablePartitioner<int> myOP = new SingleElementOrderablePartitioner<int>(src);
int counter = 0;
bool mismatch = false;
Parallel.ForEach(myOP, (item, state, index) =>
{
if (item != index) mismatch = true;
Interlocked.Increment(ref counter);
});
if (mismatch) Console.WriteLine("OrderablePartitioner Test: index mismatch detected");
Console.WriteLine("OrderablePartitioner test: counter = {0}, should be 100000", counter);
}
}
Also, this link might be useful: "How to: Write a Simple Parallel.ForEach Loop".

c# multi threading process large file lines in batches of 100 [closed]

I have a file with 500,000,000 lines.
The lines are strings of at most 10 characters.
How can I process this file using multithreading, in batches of 100?
Using MoreLinq's Batch method, this creates a sequence of IEnumerable<string> batches of 100 lines each, and spins up a new task for every batch.
This is a basic implementation. It might be wise to use a semaphore to only run a certain number of tasks at any given time (a sketch follows the extension code below), and also to measure what overhead File.ReadAllLines has with 500,000,000 lines.
public class FileProcessor
{
public async Task ProcessFile()
{
List<Task> tasks = new List<Task>();
var lines = File.ReadAllLines("File.txt").Batch(100);
foreach (IEnumerable<string> linesBatch in lines)
{
IEnumerable<string> localLinesBatch = linesBatch;
Task task = Task.Factory.StartNew(() =>
{
// Perform operation on localLinesBatch
});
tasks.Add(task);
}
await Task.WhenAll(tasks);
}
}
public static class LinqExtensions
{
public static IEnumerable<IEnumerable<TSource>> Batch<TSource>(
this IEnumerable<TSource> source, int size)
{
TSource[] bucket = null;
var count = 0;
foreach (var item in source)
{
if (bucket == null)
bucket = new TSource[size];
bucket[count++] = item;
if (count != size)
continue;
yield return bucket;
bucket = null;
count = 0;
}
if (bucket != null && count > 0)
yield return bucket.Take(count);
}
}
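A minimal sketch of that semaphore throttling, with a hypothetical ProcessBatch method, a cap of 8 concurrent tasks, and File.ReadLines in place of File.ReadAllLines so the 500,000,000 lines are streamed rather than held in memory:
public async Task ProcessFileThrottled()
{
    var throttle = new SemaphoreSlim(8); // at most 8 batches in flight at once
    var tasks = new List<Task>();
    foreach (IEnumerable<string> batch in File.ReadLines("File.txt").Batch(100))
    {
        await throttle.WaitAsync();
        tasks.Add(Task.Run(() =>
        {
            try
            {
                ProcessBatch(batch); // hypothetical per-batch operation
            }
            finally
            {
                throttle.Release();
            }
        }));
    }
    await Task.WhenAll(tasks);
}
As an aside, if you are on .NET 6 or later, LINQ ships an equivalent Chunk operator, so no extension method is needed:
// Enumerable.Chunk is built in since .NET 6; yields string[] batches of up to 100 lines
var batches = File.ReadLines("File.txt").Chunk(100);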
Using additional libraries is not required if you use Parallel.ForEach from built-in TPL and write a couple of enumerators (listed below). Your code can look like this:
using (var input = new StreamReader(File.OpenRead(@"c:\path\to\my\file.txt")))
{
Parallel.ForEach(
input.ReadLines().TakeChunks(100),
new ParallelOptions() { MaxDegreeOfParallelism = 8 /* ideally the number of CPU cores */ },
batchOfLines => {
DoMyProcessing(batchOfLines);
});
}
For this to work, you need a couple of extension methods on IEnumerable<T> and a couple of enumerators, defined as follows:
public static class EnumerableExtensions
{
public static IEnumerable<string> ReadLines(this StreamReader input)
{
return new LineReadingEnumerable(input);
}
public static IEnumerable<IReadOnlyList<T>> TakeChunks<T>(this IEnumerable<T> source, int length)
{
return new ChunkingEnumerable<T>(source, length);
}
public class LineReadingEnumerable : IEnumerable<string>
{
private readonly StreamReader _input;
public LineReadingEnumerable(StreamReader input)
{
_input = input;
}
public IEnumerator<string> GetEnumerator()
{
return new LineReadingEnumerator(_input);
}
IEnumerator IEnumerable.GetEnumerator()
{
return GetEnumerator();
}
}
public class LineReadingEnumerator : IEnumerator<string>
{
private readonly StreamReader _input;
private string _current;
public LineReadingEnumerator(StreamReader input)
{
_input = input;
}
public void Dispose()
{
_input.Dispose();
}
public bool MoveNext()
{
_current = _input.ReadLine();
return (_current != null);
}
public void Reset()
{
throw new NotSupportedException();
}
public string Current
{
get { return _current; }
}
object IEnumerator.Current
{
get { return _current; }
}
}
public class ChunkingEnumerable<T> : IEnumerable<IReadOnlyList<T>>
{
private readonly IEnumerable<T> _inner;
private readonly int _length;
public ChunkingEnumerable(IEnumerable<T> inner, int length)
{
_inner = inner;
_length = length;
}
public IEnumerator<IReadOnlyList<T>> GetEnumerator()
{
return new ChunkingEnumerator<T>(_inner.GetEnumerator(), _length);
}
IEnumerator IEnumerable.GetEnumerator()
{
return this.GetEnumerator();
}
}
public class ChunkingEnumerator<T> : IEnumerator<IReadOnlyList<T>>
{
private readonly IEnumerator<T> _inner;
private readonly int _length;
private IReadOnlyList<T> _current;
private bool _endOfInner;
public ChunkingEnumerator(IEnumerator<T> inner, int length)
{
_inner = inner;
_length = length;
}
public void Dispose()
{
_inner.Dispose();
_current = null;
}
public bool MoveNext()
{
var currentBuffer = new List<T>();
while (currentBuffer.Count < _length && !_endOfInner)
{
if (!_inner.MoveNext())
{
_endOfInner = true;
break;
}
currentBuffer.Add(_inner.Current);
}
if (currentBuffer.Count > 0)
{
_current = currentBuffer;
return true;
}
_current = null;
return false;
}
public void Reset()
{
_inner.Reset();
_current = null;
_endOfInner = false;
}
public IReadOnlyList<T> Current
{
get
{
if (_current != null)
{
return _current;
}
throw new InvalidOperationException();
}
}
object IEnumerator.Current
{
get
{
return this.Current;
}
}
}
}

.net Rx: in-order batch-processing of messages

I am attempting to implement an asynchronous workflow using Rx and I seem to be doing it completely wrong.
What I would like to do is this:
From an undefined asynchronous stream of un-parsed message strings (i.e. an IObservable<string>)
parse the message strings asynchronously, but preserve their order. (IObservable<Message>)
Batch up parsed Messages in groups of 100 or so (IObservable<IEnumerable<Message>>)
Send each batch, when complete, to the UI thread to be processed. Batches must arrive in the same order they were started.
I can't seem to get the order-preservation, and also Rx doesn't appear to be doing things asynchronously when I expected them to.
I made an attempt at order preservation by using an IEnumerable instead of an IObservable, and then calling the .AsParallel().AsOrdered() operators on it. Here is the code. See notes below for the issues I'm having:
private IObservable<IEnumerable<Message>> messageSource;
public IObservable<IEnumerable<Message>> MessageSource { get { return messageSource; } }
/// <summary>
/// Sub-classes of MessageProviderBase provide this IEnumerable to
/// generate unparsed message strings synchronously
/// </summary>
protected abstract IEnumerable<string> UnparsedMessages { get; }
public MessageProviderBase()
{
// individual parsed messages as a PLINQ query
var parsedMessages = from unparsedMessage in UnparsedMessages.AsParallel().AsOrdered()
select ParseMessage(unparsedMessage);
// convert the above PLINQ query to an observable, buffering up to 100 messages at a time
var batchedMessages
= parsedMessages.ToObservable().BufferWithTimeOrCount(TimeSpan.FromMilliseconds(200), 100);
// ISSUE #1:
// batchedMessages seems to call OnNext before all of the messages in its buffer are parsed.
// If you convert the IObservable<Message> it generates to an enumerable, it blocks
// when you try to enumerate it.
// Convert each batch to an IEnumerable
// ISSUE #2: Even if the following Rx query were to run asynchronously (it doesn't now, see the above comment),
// it could still deliver messages out of order. Only, instead of delivering individual
// messages out of order, the message batches themselves could arrive out of order.
messageSource = from messageBatch in batchedMessages
select messageBatch.ToEnumerable().ToList();
}
My answer below is somewhat based on Enigmativity's code, but fixes a number of race conditions related to completion and also adds support for cancellation and custom schedulers (which would make unit testing it significantly easier).
public static IObservable<U> Fork<T, U>(this IObservable<T> source,
Func<T, U> selector)
{
return source.Fork<T, U>(selector, Scheduler.TaskPool);
}
public static IObservable<U> Fork<T, U>(this IObservable<T> source,
Func<T, U> selector, IScheduler scheduler)
{
return Observable.CreateWithDisposable<U>(observer =>
{
var runningTasks = new CompositeDisposable();
var lockGate = new object();
var queue = new Queue<ForkTask<U>>();
var completing = false;
var subscription = new MutableDisposable();
Action<Exception> onError = ex =>
{
lock(lockGate)
{
queue.Clear();
observer.OnError(ex);
}
};
Action dequeue = () =>
{
lock (lockGate)
{
var error = false;
while (queue.Count > 0 && queue.Peek().Completed)
{
var task = queue.Dequeue();
observer.OnNext(task.Value);
}
if (completing && queue.Count == 0)
{
observer.OnCompleted();
}
}
};
Action onCompleted = () =>
{
lock (lockGate)
{
completing = true;
dequeue();
}
};
Action<T> enqueue = t =>
{
var cancellation = new MutableDisposable();
var task = new ForkTask<U>();
lock(lockGate)
{
runningTasks.Add(cancellation);
queue.Enqueue(task);
}
cancellation.Disposable = scheduler.Schedule(() =>
{
try
{
task.Value = selector(t);
lock(lockGate)
{
task.Completed = true;
runningTasks.Remove(cancellation);
dequeue();
}
}
catch(Exception ex)
{
onError(ex);
}
});
};
return new CompositeDisposable(runningTasks,
source.AsObservable().Subscribe(
t => { enqueue(t); },
x => { onError(x); },
() => { onCompleted(); }
));
});
}
private class ForkTask<T>
{
public T Value = default(T);
public bool Completed = false;
}
Here is a sample that randomizes the task execution time to test it:
AutoResetEvent are = new AutoResetEvent(false);
Random rand = new Random();
Observable.Range(0, 5)
.Fork(i =>
{
int delay = rand.Next(50, 500);
Thread.Sleep(delay);
return i + 1;
})
.Subscribe(
i => Console.WriteLine(i),
() => are.Set()
);
are.WaitOne();
Console.ReadLine();
Given you have:
IObservable<string> UnparsedMessages = ...;
Func<string, Message> ParseMessage = ...;
Then you could use a SelectAsync extension method like so:
IObservable<Message> ParsedMessages = UnparsedMessages.SelectAsync(ParseMessage);
The SelectAsync extension method processes each unparsed message asynchronously and ensures that the results come back in the order they arrived.
Let me know if this does what you need.
Here's the code:
public static IObservable<U> SelectAsync<T, U>(this IObservable<T> source,
Func<T, U> selector)
{
var subject = new Subject<U>();
var queue = new Queue<System.Threading.Tasks.Task<U>>();
var completing = false;
var subscription = (IDisposable)null;
Action<Exception> onError = ex =>
{
queue.Clear();
subject.OnError(ex);
subscription.Dispose();
};
Action dequeue = () =>
{
lock (queue)
{
var error = false;
while (queue.Count > 0 && queue.Peek().IsCompleted)
{
var task = queue.Dequeue();
if (task.Exception != null)
{
error = true;
onError(task.Exception);
break;
}
else
{
subject.OnNext(task.Result);
}
}
if (!error && completing && queue.Count == 0)
{
subject.OnCompleted();
subscription.Dispose();
}
}
};
Action<T> enqueue = t =>
{
if (!completing)
{
var task = new System.Threading.Tasks.Task<U>(() => selector(t));
queue.Enqueue(task);
task.ContinueWith(tu => dequeue());
task.Start();
}
};
subscription = source.Subscribe(
t => { lock(queue) enqueue(t); },
x => { lock(queue) onError(x); },
() => { lock(queue) completing = true; });
return subject.AsObservable();
}
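For reference, on modern Rx.NET the whole ordered pipeline from the question (parallel parse, batches of 100, delivery to the UI thread) can be sketched with built-in operators; unparsed and Parse stand in for the question's names, and a UI SynchronizationContext is assumed to be current. Task.Run starts each parse as soon as its string arrives, while Concat re-emits the results strictly in arrival order:
IObservable<IList<Message>> batches = unparsed
    .Select(s => Task.Run(() => Parse(s)).ToObservable()) // begin parsing eagerly, in arrival order
    .Concat()                                             // yield parsed messages in the original order
    .Buffer(TimeSpan.FromMilliseconds(200), 100)          // batches of up to 100, or every 200 ms
    .ObserveOn(SynchronizationContext.Current);           // hand each batch to the UI thread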
I ended up needing to revisit this for work and wrote a more robust version of this code (based also on Richard's answer).
The key advantage to this code is the absence of any explicit queue. I'm purely using task continuations to put the results back in order. Works like a treat!
public static IObservable<U> ForkSelect<T, U>(this IObservable<T> source, Func<T, U> selector)
{
return source.ForkSelect<T, U>(t => Task<U>.Factory.StartNew(() => selector(t)));
}
public static IObservable<U> ForkSelect<T, U>(this IObservable<T> source, Func<T, Task<U>> selector)
{
if (source == null) throw new ArgumentNullException("source");
if (selector == null) throw new ArgumentNullException("selector");
return Observable.CreateWithDisposable<U>(observer =>
{
var gate = new object();
var onNextTask = Task.Factory.StartNew(() => { });
var sourceCompleted = false;
var taskErrored = false;
Action<Exception> onError = ex =>
{
sourceCompleted = true;
onNextTask = onNextTask.ContinueWith(t => observer.OnError(ex));
};
Action onCompleted = () =>
{
sourceCompleted = true;
onNextTask = onNextTask.ContinueWith(t => observer.OnCompleted());
};
Action<T> onNext = t =>
{
var task = selector(t);
onNextTask = Task.Factory.ContinueWhenAll(new[] { onNextTask, task }, ts =>
{
if (!taskErrored)
{
if (task.IsFaulted)
{
taskErrored = true;
observer.OnError(task.Exception);
}
else
{
observer.OnNext(task.Result);
}
}
});
};
var subscription = source
.AsObservable()
.Subscribe(
t => { if (!sourceCompleted) lock (gate) onNext(t); },
ex => { if (!sourceCompleted) lock (gate) onError(ex); },
() => { if (!sourceCompleted) lock (gate) onCompleted(); });
var @return = new CompositeDisposable(subscription);
return @return;
});
}
And the SelectMany overloads to allow LINQ to be used are:
public static IObservable<U> SelectMany<T, U>(this IObservable<T> source, Func<T, Task<U>> selector)
{
return source.ForkSelect<T, U>(selector);
}
public static IObservable<V> SelectMany<T, U, V>(this IObservable<T> source, Func<T, Task<U>> taskSelector, Func<T, U, V> resultSelector)
{
if (source == null) throw new ArgumentNullException("source");
if (taskSelector == null) throw new ArgumentNullException("taskSelector");
if (resultSelector == null) throw new ArgumentNullException("resultSelector");
return source.Zip(source.ForkSelect<T, U>(taskSelector), (t, u) => resultSelector(t, u));
}
So these methods can now be used like this:
var observableOfU = observableOfT.ForkSelect(funcOfT2U);
Or:
var observableOfU = observableOfT.ForkSelect(funcOfT2TaskOfU);
Or:
var observableOfU =
from t in observableOfT
from u in funcOfT2TaskOfU(t)
select u;
Enjoy!
