TPL Dataflow: process the N latest messages - C#

I'm trying to create some sort of queue that will process the N latest messages received. Right now I have this:
private static void SetupMessaging()
{
    _messagingBroadcastBlock = new BroadcastBlock<string>(msg => msg, new ExecutionDataflowBlockOptions
    {
        //BoundedCapacity = 1,
        EnsureOrdered = true,
        MaxDegreeOfParallelism = 1,
        MaxMessagesPerTask = 1
    });

    _messagingActionBlock = new ActionBlock<string>(msg =>
    {
        Console.WriteLine(msg);
        Thread.Sleep(5000);
    }, new ExecutionDataflowBlockOptions
    {
        BoundedCapacity = 2,
        EnsureOrdered = true,
        MaxDegreeOfParallelism = 1,
        MaxMessagesPerTask = 1
    });

    _messagingBroadcastBlock.LinkTo(_messagingActionBlock, new DataflowLinkOptions { PropagateCompletion = true });
    _messagingBroadcastBlock.LinkTo(DataflowBlock.NullTarget<string>());
}
The problem is that if I post 1,2,3,4,5 to it, I get 1,2,5, but I'd like it to be 1,4,5. Any suggestions are welcome.
UPD 1
I was able to make the following solution work:
class FixedCapacityActionBlock<T>
{
    private readonly ActionBlock<CancellableMessage<T>> _actionBlock;

    private readonly ConcurrentQueue<CancellableMessage<T>> _inputCollection = new ConcurrentQueue<CancellableMessage<T>>();

    private readonly int _maxQueueSize;

    private readonly object _syncRoot = new object();

    public FixedCapacityActionBlock(Action<T> act, ExecutionDataflowBlockOptions opt)
    {
        var options = new ExecutionDataflowBlockOptions
        {
            EnsureOrdered = opt.EnsureOrdered,
            CancellationToken = opt.CancellationToken,
            MaxDegreeOfParallelism = opt.MaxDegreeOfParallelism,
            MaxMessagesPerTask = opt.MaxMessagesPerTask,
            NameFormat = opt.NameFormat,
            SingleProducerConstrained = opt.SingleProducerConstrained,
            TaskScheduler = opt.TaskScheduler,
            //we intentionally ignore this value
            //BoundedCapacity = opt.BoundedCapacity
        };
        _actionBlock = new ActionBlock<CancellableMessage<T>>(cmsg =>
        {
            if (cmsg.CancellationTokenSource.IsCancellationRequested)
            {
                return;
            }
            act(cmsg.Message);
        }, options);
        _maxQueueSize = opt.BoundedCapacity;
    }

    public bool Post(T msg)
    {
        var fullMsg = new CancellableMessage<T>(msg);
        //what if next task starts here?
        lock (_syncRoot)
        {
            _inputCollection.Enqueue(fullMsg);
            var itemsToDrop = _inputCollection.Skip(1).Except(_inputCollection.Skip(_inputCollection.Count - _maxQueueSize + 1));
            foreach (var item in itemsToDrop)
            {
                item.CancellationTokenSource.Cancel();
                CancellableMessage<T> temp;
                _inputCollection.TryDequeue(out temp);
            }
            return _actionBlock.Post(fullMsg);
        }
    }
}
And
class CancellableMessage<T> : IDisposable
{
    public CancellationTokenSource CancellationTokenSource { get; set; }

    public T Message { get; set; }

    public CancellableMessage(T msg)
    {
        CancellationTokenSource = new CancellationTokenSource();
        Message = msg;
    }

    public void Dispose()
    {
        CancellationTokenSource?.Dispose();
    }
}
While this works and actually does the job, the implementation looks dirty and is possibly not thread-safe.

Here is a TransformBlock and an ActionBlock implementation that drops the oldest messages in its queue whenever newer messages are received and the BoundedCapacity limit has been reached. It behaves quite similarly to a Channel configured with BoundedChannelFullMode.DropOldest.
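For comparison, here is a minimal sketch of that Channel behavior (assuming the five-second consumer from the question; the channel API is standard System.Threading.Channels):
// using System.Threading.Channels;
var channel = Channel.CreateBounded<string>(new BoundedChannelOptions(2)
{
    FullMode = BoundedChannelFullMode.DropOldest // evict the oldest buffered item when full
});

// Producer: TryWrite never blocks; older buffered items are dropped at capacity.
for (int i = 1; i <= 5; i++) channel.Writer.TryWrite(i.ToString());
channel.Writer.Complete();

// Consumer: processes whatever survived the drops (here: "4" and "5").
await foreach (var msg in channel.Reader.ReadAllAsync())
{
    Console.WriteLine(msg);
    await Task.Delay(5000);
}
The dataflow-based implementation follows: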
public static IPropagatorBlock<TInput, TOutput>
    CreateTransformBlockDropOldest<TInput, TOutput>(
    Func<TInput, Task<TOutput>> transform,
    ExecutionDataflowBlockOptions dataflowBlockOptions = null,
    IProgress<TInput> droppedMessages = null)
{
    if (transform == null) throw new ArgumentNullException(nameof(transform));
    dataflowBlockOptions = dataflowBlockOptions ?? new ExecutionDataflowBlockOptions();

    var boundedCapacity = dataflowBlockOptions.BoundedCapacity;
    var cancellationToken = dataflowBlockOptions.CancellationToken;

    var queue = new Queue<TInput>(Math.Max(0, boundedCapacity));

    var outputBlock = new BufferBlock<TOutput>(new DataflowBlockOptions()
    {
        BoundedCapacity = boundedCapacity,
        CancellationToken = cancellationToken
    });

    if (boundedCapacity != DataflowBlockOptions.Unbounded)
        dataflowBlockOptions.BoundedCapacity = checked(boundedCapacity * 2);
    // After testing, at least boundedCapacity + 1 is required.
    // Make it double to be sure that all non-dropped messages will be processed.
    var transformBlock = new ActionBlock<object>(async _ =>
    {
        TInput item;
        lock (queue)
        {
            if (queue.Count == 0) return;
            item = queue.Dequeue();
        }
        var result = await transform(item).ConfigureAwait(false);
        await outputBlock.SendAsync(result, cancellationToken).ConfigureAwait(false);
    }, dataflowBlockOptions);
    dataflowBlockOptions.BoundedCapacity = boundedCapacity; // Restore initial value

    var inputBlock = new ActionBlock<TInput>(item =>
    {
        var droppedEntry = (Exists: false, Item: (TInput)default);
        lock (queue)
        {
            transformBlock.Post(null);
            if (queue.Count == boundedCapacity) droppedEntry = (true, queue.Dequeue());
            queue.Enqueue(item);
        }
        if (droppedEntry.Exists) droppedMessages?.Report(droppedEntry.Item);
    }, new ExecutionDataflowBlockOptions()
    {
        CancellationToken = cancellationToken
    });

    PropagateCompletion(inputBlock, transformBlock);
    PropagateFailure(transformBlock, inputBlock);
    PropagateCompletion(transformBlock, outputBlock);
    _ = transformBlock.Completion.ContinueWith(_ => { lock (queue) queue.Clear(); },
        TaskScheduler.Default);

    return DataflowBlock.Encapsulate(inputBlock, outputBlock);

    async void PropagateCompletion(IDataflowBlock source, IDataflowBlock target)
    {
        try { await source.Completion.ConfigureAwait(false); } catch { }
        var exception = source.Completion.IsFaulted ? source.Completion.Exception : null;
        if (exception != null) target.Fault(exception); else target.Complete();
    }

    async void PropagateFailure(IDataflowBlock source, IDataflowBlock target)
    {
        try { await source.Completion.ConfigureAwait(false); } catch { }
        if (source.Completion.IsFaulted) target.Fault(source.Completion.Exception);
    }
}

// Overload with synchronous lambda
public static IPropagatorBlock<TInput, TOutput>
    CreateTransformBlockDropOldest<TInput, TOutput>(
    Func<TInput, TOutput> transform,
    ExecutionDataflowBlockOptions dataflowBlockOptions = null,
    IProgress<TInput> droppedMessages = null)
{
    return CreateTransformBlockDropOldest(item => Task.FromResult(transform(item)),
        dataflowBlockOptions, droppedMessages);
}

// ActionBlock equivalent
public static ITargetBlock<TInput>
    CreateActionBlockDropOldest<TInput>(
    Func<TInput, Task> action,
    ExecutionDataflowBlockOptions dataflowBlockOptions = null,
    IProgress<TInput> droppedMessages = null)
{
    if (action == null) throw new ArgumentNullException(nameof(action));
    var block = CreateTransformBlockDropOldest<TInput, object>(
        async item => { await action(item).ConfigureAwait(false); return null; },
        dataflowBlockOptions, droppedMessages);
    block.LinkTo(DataflowBlock.NullTarget<object>());
    return block;
}

// ActionBlock equivalent with synchronous lambda
public static ITargetBlock<TInput>
    CreateActionBlockDropOldest<TInput>(
    Action<TInput> action,
    ExecutionDataflowBlockOptions dataflowBlockOptions = null,
    IProgress<TInput> droppedMessages = null)
{
    return CreateActionBlockDropOldest(
        item => { action(item); return Task.CompletedTask; },
        dataflowBlockOptions, droppedMessages);
}
The idea is to store the queued items in an auxiliary Queue, and to pass dummy (null) values to an internal ActionBlock<object>. The block ignores the items passed as arguments and instead takes an item from the queue, if there is one. A lock is used to ensure that all non-dropped items in the queue will eventually be processed (unless, of course, an exception occurs).
There is also an extra feature: an optional IProgress<TInput> droppedMessages argument allows receiving a notification every time a message is dropped.
Usage example:
_messagingActionBlock = CreateActionBlockDropOldest<string>(msg =>
{
    Console.WriteLine($"Processing: {msg}");
    Thread.Sleep(5000);
}, new ExecutionDataflowBlockOptions
{
    BoundedCapacity = 2,
}, new Progress<string>(msg =>
{
    Console.WriteLine($"Message dropped: {msg}");
}));

TPL Dataflow doesn't fit the "last N messages" scenario well, as it's meant to be a queue or a pipeline (FIFO), not a stack (LIFO). Do you really need to do this with a dataflow library?
It's much easier with a ConcurrentStack<T>: you just introduce one producer task, which pushes to the stack, and one consumer task, which takes messages from the stack while the number of handled ones is less than N (more about Producer-Consumer).
If you need TPL Dataflow, you can use it in the consumer task to start handling the latest messages, but not in the producer, as that's really not the way it was meant to be used. Moreover, there are some other libraries with an event-based architecture which may fit your problem more naturally.
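For illustration, here is a minimal sketch of that stack-based approach (the names, the fixed N, and the polling loop are assumptions, not part of the original answer):
var stack = new ConcurrentStack<string>();

// Producer: just pushes; the newest message is always on top.
void Produce(string msg) => stack.Push(msg);

// Consumer: periodically handles at most the N latest messages and discards the rest.
async Task ConsumeLatestAsync(int n, CancellationToken token)
{
    var latest = new string[n];
    while (!token.IsCancellationRequested)
    {
        int taken = stack.TryPopRange(latest, 0, n); // pops up to n items, newest first
        for (int i = 0; i < taken; i++)
            Console.WriteLine(latest[i]);            // handle the message
        stack.Clear();                               // drop anything older
        await Task.Delay(100, token);                // arbitrary poll interval
    }
}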

Related

TPL Dataflow, how to discard previous (first) messages if BoundedCapacity is full [duplicate]


Replace buffered value with latest in TPL Dataflow

I need help with making a TPL Dataflow pipeline update an input buffer with the latest value.
I am subscribed to a live stream of elements, which are posted one by one onto a dataflow pipeline. Each element takes some time to process - significantly more time than it takes to produce one (i.e. fast producer, slow consumer).
However, if there are multiple elements in the input queue with the same identity, only the most recent one needs processing. The intermediate ones can be discarded. This is the part I am having trouble figuring out.
Here is an example of what I am trying to achieve:
public record Bid(int Id, int Value);

async Task Main()
{
    // This block is just here to log that an input is received.
    var startBlock = new TransformBlock<Bid, Bid>(d =>
    {
        Console.WriteLine("Input: {0} ({1})", d.Id, d.Value);
        return d;
    });

    //TODO: Check for duplicate identity (Bid.Id) and replace the
    // current element with the most recent one.
    var updateWithMostRecentBlock = new TransformBlock<Bid, Bid>(d => d);

    var processBlock = new TransformBlock<Bid, Bid>(async d =>
    {
        Console.WriteLine("Processing: {0} ({1})", d.Id, d.Value);
        await Task.Delay(1000);
        return d;
    });

    var finishBlock = new ActionBlock<Bid>(d =>
    {
        Console.WriteLine("Done: {0} ({1})", d.Id, d.Value);
    });

    var propagateCompletion = new DataflowLinkOptions { PropagateCompletion = true };
    startBlock.LinkTo(updateWithMostRecentBlock, propagateCompletion);
    updateWithMostRecentBlock.LinkTo(processBlock, propagateCompletion);
    processBlock.LinkTo(finishBlock, propagateCompletion);

    var data = new[]
    {
        new Bid(1, 0), // Processed immediately
        new Bid(1, 1), // Replaced with (1,2)
        new Bid(2, 0), // Replaced with (2,1)
        new Bid(1, 2), // Queued
        new Bid(2, 1)  // Queued
    };

    foreach (var d in data)
        startBlock.Post(d);

    startBlock.Complete();
    await finishBlock.Completion;
}
When processBlock is ready to receive the next element, I want updateWithMostRecentBlock to provide only the most relevant element.
Actual output:
Input: 1 (0)
Input: 1 (1)
Input: 2 (0)
Input: 1 (2)
Input: 2 (1)
Processing: 1 (0)
Processing: 1 (1)
Done: 1 (0)
Processing: 2 (0)
Done: 1 (1)
Processing: 1 (2)
Done: 2 (0)
Processing: 2 (1)
Done: 1 (2)
Done: 2 (1)
Expected output:
Input: 1 (0) // Immediately processed
Input: 1 (1) // Replaced by (1,2)
Input: 2 (0) // Replaced by (2,1)
Input: 1 (2) // Queued
Input: 2 (1) // Queued
Processing: 1 (0)
Done: 1 (0)
Processing: 1 (2)
Done: 1 (2)
Processing: 2 (1)
Done: 2 (1)
Hint:
Stephen Toub has an elegant solution to the exact opposite of what I'm trying to achieve. His solution rejects all incoming elements and retains the oldest one.
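(As an aside on that hint: the built-in block with the opposite, "newest wins" behavior is BroadcastBlock<T>. A minimal sketch, ignoring the per-Id grouping this question needs:)
// A BroadcastBlock<T> overwrites its buffered value with the latest message received,
// so a linked block with BoundedCapacity = 1 only ever consumes the freshest element
// that was available at the moment it became free.
var latestOnly = new BroadcastBlock<Bid>(b => b);
latestOnly.LinkTo(processBlock); // processBlock configured with BoundedCapacity = 1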
I'm sorry for answering my own question, but @TheGeneral put me on the right track with his hint about bounded capacity.
I had to configure the processBlock with a bounded capacity of 1:
var processBlock = new TransformBlock<Bid, Bid>(
    async d =>
    {
        Console.WriteLine("Processing: {0} ({1})", d.Id, d.Value);
        await Task.Delay(1000);
        return d;
    },
    new ExecutionDataflowBlockOptions
    {
        BoundedCapacity = 1
    });
Then I replaced the updateWithMostRecentBlock with a custom block that has this implementation:
public class DiscardAndReplaceDuplicatesBlock<TValue, TKey> : IPropagatorBlock<TValue, TValue>
    where TKey : IEquatable<TKey>
{
    private readonly ITargetBlock<TValue> _target;
    private readonly IReceivableSourceBlock<TValue> _source;

    public DiscardAndReplaceDuplicatesBlock(Func<TValue, TKey> keyAccessor)
    {
        var buffer = new ConcurrentDictionary<TKey, (TValue Value, Task Task, CancellationTokenSource Token)>();

        var outgoing = new BufferBlock<TValue>(new ExecutionDataflowBlockOptions
        {
            BoundedCapacity = 1,
            MaxMessagesPerTask = 1
        });

        var incoming = new ActionBlock<TValue>(value =>
        {
            var key = keyAccessor(value);
            var cts = new CancellationTokenSource();
            var isQueued = buffer.TryGetValue(key, out var previous);
            if (isQueued)
            {
                buffer.TryRemove(key, out var current);
                Console.WriteLine("Remove: {0}", current.Value);
                if (!previous.Task.IsCompleted)
                {
                    previous.Token.Cancel();
                    previous.Token.Dispose();
                    Console.WriteLine("Cancel: {0}", current.Value);
                }
            }

            var task = outgoing.SendAsync(value, cts.Token);
            if (task.IsCompleted)
            {
                cts.Dispose();
                Console.WriteLine("Sent: {0}", value);
                return;
            }

            buffer.AddOrUpdate(key, (value, task, cts), (k, t) => (value, task, cts));
            Console.WriteLine("Buffered: {0}", value);
        });

        incoming.Completion.ContinueWith(
            async t =>
            {
                if (t.IsFaulted)
                {
                    ((ITargetBlock<TValue>)outgoing).Fault(t.Exception.InnerException);
                }
                else
                {
                    await WaitForBufferToCompleteAsync().ConfigureAwait(false);
                    outgoing.Complete();
                }
            },
            default,
            TaskContinuationOptions.ExecuteSynchronously,
            TaskScheduler.Default);

        Task WaitForBufferToCompleteAsync()
        {
            if (!buffer.Any())
                return Task.CompletedTask;

            var buffered = buffer.Where(kvp => !kvp.Value.Task.IsCompleted);
            var tasks = buffered.Select(b => b.Value.Task);
            return Task.WhenAll(tasks);
        }

        _target = incoming;
        _source = outgoing;
    }

    public Task Completion =>
        _source.Completion;

    public void Complete() =>
        _target.Complete();

    public void Fault(Exception exception) =>
        _target.Fault(exception);

    public IDisposable LinkTo(ITargetBlock<TValue> target, DataflowLinkOptions linkOptions) =>
        _source.LinkTo(target, linkOptions);

    public TValue ConsumeMessage(DataflowMessageHeader messageHeader, ITargetBlock<TValue> target, out bool messageConsumed) =>
        _source.ConsumeMessage(messageHeader, target, out messageConsumed);

    public DataflowMessageStatus OfferMessage(DataflowMessageHeader messageHeader, TValue messageValue, ISourceBlock<TValue>? source, bool consumeToAccept) =>
        _target.OfferMessage(messageHeader, messageValue, source, consumeToAccept);

    public bool ReserveMessage(DataflowMessageHeader messageHeader, ITargetBlock<TValue> target) =>
        _source.ReserveMessage(messageHeader, target);

    public void ReleaseReservation(DataflowMessageHeader messageHeader, ITargetBlock<TValue> target) =>
        _source.ReleaseReservation(messageHeader, target);
}
It is not very pretty, and it is not production-tested, but it seems to work. In order to actually replace an already dispatched element, I had to retain the cancellation token used, so I could cancel an outdated but unprocessed element. I'm not sure this is the best idea, so any critique is welcome!
One note, though: this will also process element (1,1), because after (1,0) has been dispatched to the processBlock, element (1,1) is successfully sent to the custom block's output buffer. I don't think this can be avoided.
Here is my take on this problem:
public static IPropagatorBlock<TInput, TOutput>
    CreateTransformBlockDropOldestByKey<TInput, TOutput, TKey>(
    Func<TInput, Task<TOutput>> transform,
    Func<TInput, TKey> keySelector,
    ExecutionDataflowBlockOptions dataflowBlockOptions = null,
    IEqualityComparer<TKey> keyComparer = null,
    IProgress<TInput> droppedItems = null)
{
    if (transform == null) throw new ArgumentNullException(nameof(transform));
    if (keySelector == null) throw new ArgumentNullException(nameof(keySelector));
    dataflowBlockOptions = dataflowBlockOptions ?? new ExecutionDataflowBlockOptions();
    keyComparer = keyComparer ?? EqualityComparer<TKey>.Default;

    var dictionary = new Dictionary<TKey, TInput>(keyComparer);

    var outputBlock = new TransformManyBlock<TKey, TOutput>(async key =>
    {
        bool removed; TInput removedItem;
        lock (dictionary) removed = dictionary.Remove(key, out removedItem);
        if (!removed) return Enumerable.Empty<TOutput>();
        return new[] { await transform(removedItem).ConfigureAwait(false) };
    }, dataflowBlockOptions);

    var inputBlock = new ActionBlock<TInput>(item =>
    {
        var key = keySelector(item);
        bool dropped; TInput droppedItem;
        lock (dictionary)
        {
            dropped = dictionary.TryGetValue(key, out droppedItem);
            dictionary[key] = item;
        }
        if (dropped) droppedItems?.Report(droppedItem);
        return outputBlock.SendAsync(key);
    }, new ExecutionDataflowBlockOptions()
    {
        BoundedCapacity = 1,
        CancellationToken = dataflowBlockOptions.CancellationToken,
        TaskScheduler = dataflowBlockOptions.TaskScheduler,
    });

    PropagateCompletion(inputBlock, outputBlock);
    PropagateFailure(outputBlock, inputBlock);
    return DataflowBlock.Encapsulate(inputBlock, outputBlock);

    async void PropagateCompletion(IDataflowBlock source, IDataflowBlock target)
    {
        try { await source.Completion.ConfigureAwait(false); } catch { }
        var ex = source.Completion.IsFaulted ? source.Completion.Exception : null;
        if (ex != null) target.Fault(ex); else target.Complete();
    }

    async void PropagateFailure(IDataflowBlock source, IDataflowBlock target)
    {
        try { await source.Completion.ConfigureAwait(false); } catch { }
        if (source.Completion.IsFaulted) target.Fault(source.Completion.Exception);
    }
}
Usage example:
var droppedItems = new Progress<Bid>(b =>
{
    Console.WriteLine($"Dropped: {b.Id} ({b.Value})");
});

var processBlock = CreateTransformBlockDropOldestByKey<Bid, Bid, int>(async b =>
{
    Console.WriteLine($"Processing: {b.Id} ({b.Value})");
    await Task.Delay(1000);
    return b;
}, b => b.Id, droppedItems: droppedItems);
The reason that the two internal blocks, the inputBlock and the outputBlock, are not linked together directly is that otherwise a fault in the outputBlock could potentially leave the inputBlock hanging in a non-completed state forever. It is important that if one of the two blocks fails, the other fails too, so that any pending SendAsync operations toward the inputBlock are canceled. The blocks are linked together indirectly, by using the PropagateCompletion and PropagateFailure methods.
When configuring the processBlock with a BoundedCapacity, take into account that the block's input queue may contain keys whose items have already been dropped, so setting this option to a slightly higher value is advised.
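For example, a configuration along these lines (the slack value is an illustrative assumption):
var processBlock = CreateTransformBlockDropOldestByKey<Bid, Bid, int>(async b =>
{
    await Task.Delay(1000);
    return b;
}, b => b.Id, new ExecutionDataflowBlockOptions
{
    BoundedCapacity = 10 // some slack above the expected number of distinct live keys
}, droppedItems: droppedItems);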

Shared context between tests

I have the following code:
public class Batcher<TPayload> : IBatcher<TPayload>
{
    private static readonly BufferBlock<BatchElement<TPayload>> BufferBlock = new BufferBlock<BatchElement<TPayload>>(new DataflowBlockOptions
    {
        EnsureOrdered = true
    });

    private readonly TransformBlock<BatchElement<TPayload>, BatchElement<TPayload>> BufferInterceptor;
    private readonly TransformBlock<BatchElement<TPayload>, BatchElement<TPayload>> TimeoutInterceptor;

    public Batcher(int size, int interval, IMagicService magicService, ILogger<Batcher<TPayload>> logger)
    {
        BufferInterceptor =
            new TransformBlock<BatchElement<TPayload>, BatchElement<TPayload>>(x =>
            {
                logger.LogInformation($"Get a message with value: {x}");
                return x;
            });
        TimeoutInterceptor =
            new TransformBlock<BatchElement<TPayload>, BatchElement<TPayload>>(x =>
            {
                logger.LogInformation($"Move out from transformation block with a value: {x}");
                return x;
            });

        var batchBlock = new BatchBlock<BatchElement<TPayload>>(size, new GroupingDataflowBlockOptions()
        {
            EnsureOrdered = true
        });

        var timer = new Timer(async _ =>
        {
            try
            {
                batchBlock.TriggerBatch();
                var data = await batchBlock.ReceiveAsync();
                if (!data.Any() && data.SomeLogic())
                    return;
                foreach (var batchElement in data)
                    await magicService.PushMessageAsync(batchElement.Payload);
            }
            catch (Exception e)
            {
                logger.LogError($"Error occurs while trying to invoke action on batch", e);
            }
        }, null, 0, 500);

        var timeoutBlock = new TransformBlock<BatchElement<TPayload>, BatchElement<TPayload>>(v =>
        {
            timer.Change(interval, Timeout.Infinite);
            return v;
        });

        TimeoutInterceptor.LinkTo(batchBlock);
        timeoutBlock.LinkTo(TimeoutInterceptor);
        BufferInterceptor.LinkTo(timeoutBlock);
        BufferBlock.LinkTo(BufferInterceptor);
    }

    public async Task<Result<Unit>> SendAsync(BatchElement<TPayload> msg, CancellationToken token = new CancellationToken())
    {
        try
        {
            var result = await BufferBlock.SendAsync(msg, token);
            return result
                ? ResultFactory.CreateSuccess()
                : ResultFactory.CreateFailure<Unit>("Message was refused by queue");
        }
        catch (Exception e)
        {
            return ResultFactory.CreateFailure<Unit>(e.Message);
        }
    }
}
Its responsibility is to evaluate data somehow every X milliseconds. I'm trying to write unit tests for it to make sure that everything works fine. The tests are here:
public class BatcherTests
{
    public Batcher<int> Initialize(Dictionary<DateTime, int> output)
    {
        var busMock = new Mock<IMagicService>();
        busMock.Setup(x => x.PushMessageAsync(It.IsAny<int>()))
            .Callback<int>((data) =>
            {
                output.Add(DateTime.Now, data);
            }).Returns(Task.CompletedTask);
        var loggerMock = new Mock<ILogger<Batcher<int>>>();

        return new Batcher<int>(
            2,
            5000,
            busMock.Object,
            loggerMock.Object
        );
    }

    [Fact]
    public async Task Batcher_ShouldRemoveDuplicatedMessages()
    {
        var output = new Dictionary<DateTime, int>();
        var batcher = Initialize(output);

        var first = await batcher.SendAsync(new MockEvent { Payload = 1 });
        var second = await batcher.SendAsync(new MockEvent { Payload = 1 });

        (first.IsSuccess && second.IsSuccess).ShouldBeTrue();

        while (output.Count != 2)
        {
        }

        output.Count.ShouldBe(2);
        output.First().Value.ShouldBe(1);
        output.Last().Value.ShouldBe(1);
        output.Clear();
    }

    [Fact]
    public async Task Batcher_WhenSizeIsSetTo2AndWeSend3Items_ReturnTwoBatchedItemsWithDateIntervalPlusMinus5000msAndAllSendRequestsEndsWithSuccess()
    {
        var output = new Dictionary<DateTime, int>();
        var batcher = Initialize(output);

        var first = await batcher.SendAsync(new MockEvent { Payload = 1 });
        var second = await batcher.SendAsync(new MockEvent { Payload = 1 });
        var third = await batcher.SendAsync(new MockEvent { Payload = 1 });

        (first.IsSuccess && second.IsSuccess && third.IsSuccess).ShouldBeTrue();

        while (output.Count != 2) //never ends because there are already two elements in output dictionary
        {
        }

        output.Count.ShouldBe(2);
        output.First().Value.ShouldBe(2);
        output.Last().Value.ShouldBe(1);

        var interval = (output.Last().Key - output.First().Key).TotalSeconds;
        (interval >= 4.5d && interval <= 5.5d).ShouldBeTrue();
        output.Clear();
    }
}
But the strange thing is that when I run them separately, they end with a success status; when I run them all together, one of them gets stuck. This is because the dictionary passed to the logic method already has 2 elements inside when the test starts. I don't see a possibility of shared context here, since the stub class is created at the beginning of each test case, and the same goes for the dictionary. Is there something that I'm missing? I also tried splitting the test cases into separate classes, but the same behavior occurs.
There is shared state, but it is not in the test (directly).
Your BufferBlock is declared as static in the class Batcher<TPayload>. There is your shared state.
private static readonly BufferBlock<BatchElement<TPayload>> BufferBlock = new BufferBlock<BatchElement<TPayload>>(new DataflowBlockOptions
{
    EnsureOrdered = true
});
When multiple tests are executed, that shared static block is linked to the other blocks multiple times.
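A minimal sketch of the fix, assuming nothing else relies on the block being shared across instances: drop the static modifier, so each Batcher owns its own pipeline.
// One BufferBlock per Batcher instance; tests no longer observe each other's messages.
private readonly BufferBlock<BatchElement<TPayload>> BufferBlock =
    new BufferBlock<BatchElement<TPayload>>(new DataflowBlockOptions
    {
        EnsureOrdered = true
    });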

Async Producer / Consumer with throttled duration and batched consumption

I am trying to build a service that provides a queue for many asynchronous clients to make requests and await a response. I need to be able to throttle the queue processing by X requests per Y duration. For example: 50 web requests per second. It is for a 3rd party REST Service where I can only issue X requests per second.
Many SO questions led me down the path of using TPL Dataflow: I've used a TransformBlock to provide my custom throttling, and then X number of ActionBlocks to complete the tasks in parallel. The implementation of the action seems a bit clunky, so I'm wondering if there is a better way to pass tasks into the pipeline that notify the callers once completed.
I'm wondering if there is a better or more optimal/simpler way to do what I want. Are there any glaring issues with my implementation? I know it is missing cancellation and exception handling, and I'll be doing this next, but your comments are most welcome.
I've extended Stephen Cleary's example for my Dataflow pipeline and used
svick's concept of a time-throttled TransformBlock. I am wondering if what I've built could be easily achieved with a pure SemaphoreSlim design; it's the time-based throttling with max operations that I think will complicate things.
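For what it's worth, the pure-SemaphoreSlim idea could be sketched roughly like this (illustrative only; none of these names are from the original post):
// At most maxPerInterval operations may start per sliding interval. Each caller takes
// a slot; the slot is returned interval later (not when the work finishes), which
// gives the time-based throttling described above.
int maxPerInterval = 50;
TimeSpan interval = TimeSpan.FromSeconds(1);
var slots = new SemaphoreSlim(maxPerInterval, maxPerInterval);

async Task<TResult> ThrottledAsync<TResult>(Func<Task<TResult>> action)
{
    await slots.WaitAsync();
    _ = Task.Delay(interval).ContinueWith(_ => slots.Release()); // fire-and-forget release
    return await action();
}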
Here is the latest implementation: an async FIFO queue where I can pass in custom actions.
public class ThrottledProducerConsumer<T>
{
    private class TimerState<T1>
    {
        public SemaphoreSlim Sem;
        public T1 Value;
    }

    private BufferBlock<T> _queue;
    private IPropagatorBlock<T, T> _throttleBlock;
    private List<Task> _consumers;

    private static IPropagatorBlock<T1, T1> CreateThrottleBlock<T1>(TimeSpan Interval, Int32 MaxPerInterval)
    {
        SemaphoreSlim _sem = new SemaphoreSlim(MaxPerInterval);
        return new TransformBlock<T1, T1>(async (x) =>
        {
            var sw = new Stopwatch();
            sw.Start();
            //Console.WriteLine($"Current count: {_sem.CurrentCount}");
            await _sem.WaitAsync();
            sw.Stop();
            var now = DateTime.UtcNow;
            var releaseTime = now.Add(Interval) - now;
            //-- Using timer as opposed to Task.Delay as I do not want to await or wait for it to complete
            var tm = new Timer((s) =>
            {
                var state = (TimerState<T1>)s;
                //Console.WriteLine($"RELEASE: {state.Value} was released {DateTime.UtcNow:mm:ss:ff} Reset Sem");
                state.Sem.Release();
            }, new TimerState<T1> { Sem = _sem, Value = x }, (int)Interval.TotalMilliseconds, -1);
            /*
            Task.Delay(delay).ContinueWith((t) =>
            {
                Console.WriteLine($"RELEASE(FAKE): {x} was released {DateTime.UtcNow:mm:ss:ff} Reset Sem");
                //_sem.Release();
            });
            */
            //Console.WriteLine($"{x} was transformed in {sw.ElapsedMilliseconds}ms. Will release {now.Add(Interval):mm:ss:ff}");
            return x;
        },
        //new ExecutionDataflowBlockOptions { BoundedCapacity = 1 });
        new ExecutionDataflowBlockOptions { BoundedCapacity = 5, MaxDegreeOfParallelism = 10 });
    }

    public ThrottledProducerConsumer(TimeSpan Interval, int MaxPerInterval, Int32 QueueBoundedMax = 5, Action<T> ConsumerAction = null, Int32 MaxConsumers = 1)
    {
        var consumerOptions = new ExecutionDataflowBlockOptions { BoundedCapacity = 1, };
        var linkOptions = new DataflowLinkOptions { PropagateCompletion = true, };
        //-- Create the Queue
        _queue = new BufferBlock<T>(new DataflowBlockOptions { BoundedCapacity = QueueBoundedMax, });
        //-- Create and link the throttle block
        _throttleBlock = CreateThrottleBlock<T>(Interval, MaxPerInterval);
        _queue.LinkTo(_throttleBlock, linkOptions);
        //-- Create and link the consumer(s) to the throttle block
        var consumerAction = (ConsumerAction != null) ? ConsumerAction : new Action<T>(ConsumeItem);
        _consumers = new List<Task>();
        for (int i = 0; i < MaxConsumers; i++)
        {
            var consumer = new ActionBlock<T>(consumerAction, consumerOptions);
            _throttleBlock.LinkTo(consumer, linkOptions);
            _consumers.Add(consumer.Completion);
        }
        //-- TODO: Add some cancellation tokens to shut this thing down
    }

    /// <summary>
    /// Default Consumer Action, just prints to console
    /// </summary>
    /// <param name="ItemToConsume"></param>
    private void ConsumeItem(T ItemToConsume)
    {
        Console.WriteLine($"Consumed {ItemToConsume} at {DateTime.UtcNow}");
    }

    public async Task EnqueueAsync(T ItemToEnqueue)
    {
        await this._queue.SendAsync(ItemToEnqueue);
    }

    public async Task EnqueueItemsAsync(IEnumerable<T> ItemsToEnqueue)
    {
        foreach (var item in ItemsToEnqueue)
        {
            await this._queue.SendAsync(item);
        }
    }

    public async Task CompleteAsync()
    {
        this._queue.Complete();
        await Task.WhenAll(_consumers);
        Console.WriteLine($"All consumers completed {DateTime.UtcNow}");
    }
}
The test method
public class WorkItem<T>
{
    public TaskCompletionSource<T> tcs;
    //public T response;
    public string url;

    public WorkItem(string Url)
    {
        tcs = new TaskCompletionSource<T>();
        url = Url;
    }

    public override string ToString()
    {
        return $"{url}";
    }
}

public static void TestQueue()
{
    Console.WriteLine("Created the queue");

    var defaultAction = new Action<WorkItem<String>>(async i =>
    {
        var taskItem = ((WorkItem<String>)i);
        Console.WriteLine($"Consuming: {taskItem.url} {DateTime.UtcNow:mm:ss:ff}");
        //-- Assume calling another async method e.g. await httpClient.DownloadStringTaskAsync(url);
        await Task.Delay(5000);
        taskItem.tcs.SetResult($"{taskItem.url}");
        //Console.WriteLine($"Consumed: {taskItem.url} {DateTime.UtcNow}");
    });

    var queue = new ThrottledProducerConsumer<WorkItem<String>>(TimeSpan.FromMilliseconds(2000), 5, 2, defaultAction);
    var results = new List<Task>();
    foreach (var no in Enumerable.Range(0, 20))
    {
        var workItem = new WorkItem<String>($"http://someurl{no}.com");
        results.Add(queue.EnqueueAsync(workItem));
        results.Add(workItem.tcs.Task);
        results.Add(workItem.tcs.Task.ContinueWith(response =>
        {
            Console.WriteLine($"Received: {response.Result} {DateTime.UtcNow:mm:ss:ff}");
        }));
    }

    Task.WhenAll(results).Wait();
    Console.WriteLine("All Work Items Have Been Processed");
}
Since asking, I have created a ThrottledProducerConsumer class based on TPL Dataflow. It was tested over a number of days, which included concurrent producers that were queued and completed in order, approximately 281k items, without any problems; however, there may be bugs I've not discovered.
I am using a BufferBlock as an asynchronous queue, which is linked to:
A TransformBlock, which provides the throttling and blocking I need. It is used in conjunction with a SemaphoreSlim to control the max requests. As each item passes through the block, it acquires the semaphore and schedules a task to run X duration later that releases the semaphore by one. This way I have a sliding window of X requests per duration; exactly what I wanted. Because of TPL I am also leveraging parallelism in the connected:
ActionBlock(s), which are responsible for performing the task I need.
The classes are generic, so they might be useful to others who need something similar. I have not written cancellation or error handling, but thought I should just mark this as answered to move it along. I would be quite happy to see some alternatives and feedback, rather than marking mine as the accepted answer. Thanks for reading.
NOTE: I removed the Timer from the original implementation, as it was doing weird stuff causing the semaphore to release more than the maximum; I am assuming it was a synchronization context error. It occurred when I started running concurrent requests. I worked around it by using Task.Delay to schedule the release of a semaphore lock.
Throttled Producer Consumer
public class ThrottledProducerConsumer<T>
{
    private BufferBlock<T> _queue;
    private IPropagatorBlock<T, T> _throttleBlock;
    private List<Task> _consumers;

    private static IPropagatorBlock<T1, T1> CreateThrottleBlock<T1>(TimeSpan Interval,
        Int32 MaxPerInterval, Int32 BlockBoundedMax = 2, Int32 BlockMaxDegreeOfParallelism = 2)
    {
        SemaphoreSlim _sem = new SemaphoreSlim(MaxPerInterval, MaxPerInterval);
        return new TransformBlock<T1, T1>(async (x) =>
        {
            //Log($"Transform blk: {x} {DateTime.UtcNow:mm:ss:ff} Semaphore Count: {_sem.CurrentCount}");
            var sw = new Stopwatch();
            sw.Start();
            //Console.WriteLine($"Current count: {_sem.CurrentCount}");
            await _sem.WaitAsync();
            sw.Stop();
            var delayTask = Task.Delay(Interval).ContinueWith((t) =>
            {
                //Log($"Pre-RELEASE: {x} {DateTime.UtcNow:mm:ss:ff} Semaphore Count {_sem.CurrentCount}");
                _sem.Release();
                //Log($"PostRELEASE: {x} {DateTime.UtcNow:mm:ss:ff} Semaphore Count {_sem.CurrentCount}");
            });
            //},TaskScheduler.FromCurrentSynchronizationContext());
            //Log($"Transformed: {x} in queue {sw.ElapsedMilliseconds}ms. {DateTime.Now:mm:ss:ff} will release {DateTime.Now.Add(Interval):mm:ss:ff} Semaphore Count {_sem.CurrentCount}");
            return x;
        },
        //-- Might be better to keep Bounded Capacity in sync with the semaphore
        new ExecutionDataflowBlockOptions
        {
            BoundedCapacity = BlockBoundedMax,
            MaxDegreeOfParallelism = BlockMaxDegreeOfParallelism
        });
    }

    public ThrottledProducerConsumer(TimeSpan Interval, int MaxPerInterval,
        Int32 QueueBoundedMax = 5, Action<T> ConsumerAction = null, Int32 MaxConsumers = 1,
        Int32 MaxThrottleBuffer = 20, Int32 MaxDegreeOfParallelism = 10)
    {
        //-- Probably best to link MaxPerInterval and MaxThrottleBuffer
        //   and MaxConsumers with MaxDegreeOfParallelism
        var consumerOptions = new ExecutionDataflowBlockOptions { BoundedCapacity = 1, };
        var linkOptions = new DataflowLinkOptions { PropagateCompletion = true, };
        //-- Create the Queue
        _queue = new BufferBlock<T>(new DataflowBlockOptions { BoundedCapacity = QueueBoundedMax, });
        //-- Create and link the throttle block
        _throttleBlock = CreateThrottleBlock<T>(Interval, MaxPerInterval);
        _queue.LinkTo(_throttleBlock, linkOptions);
        //-- Create and link the consumer(s) to the throttle block
        var consumerAction = (ConsumerAction != null) ? ConsumerAction : new Action<T>(ConsumeItem);
        _consumers = new List<Task>();
        for (int i = 0; i < MaxConsumers; i++)
        {
            var consumer = new ActionBlock<T>(consumerAction, consumerOptions);
            _throttleBlock.LinkTo(consumer, linkOptions);
            _consumers.Add(consumer.Completion);
        }
        //-- TODO: Add some cancellation tokens to shut this thing down
    }

    /// <summary>
    /// Default Consumer Action, just prints to console
    /// </summary>
    /// <param name="ItemToConsume"></param>
    private void ConsumeItem(T ItemToConsume)
    {
        Log($"Consumed {ItemToConsume} at {DateTime.UtcNow}");
    }

    public async Task EnqueueAsync(T ItemToEnqueue)
    {
        await this._queue.SendAsync(ItemToEnqueue);
    }

    public async Task EnqueueItemsAsync(IEnumerable<T> ItemsToEnqueue)
    {
        foreach (var item in ItemsToEnqueue)
        {
            await this._queue.SendAsync(item);
        }
    }

    public async Task CompleteAsync()
    {
        this._queue.Complete();
        await Task.WhenAll(_consumers);
        Console.WriteLine($"All consumers completed {DateTime.UtcNow}");
    }

    private static void Log(String messageToLog)
    {
        System.Diagnostics.Trace.WriteLine(messageToLog);
        Console.WriteLine(messageToLog);
    }
}
- Example Usage -
A Generic WorkItem
public class WorkItem<Toutput, Tinput>
{
    private TaskCompletionSource<Toutput> _tcs;
    public Task<Toutput> Task { get { return _tcs.Task; } }
    public Tinput InputData { get; private set; }
    public Toutput OutputData { get; private set; }

    public WorkItem(Tinput inputData)
    {
        _tcs = new TaskCompletionSource<Toutput>();
        InputData = inputData;
    }

    public void Complete(Toutput result)
    {
        _tcs.SetResult(result);
    }

    public void Failed(Exception ex)
    {
        _tcs.SetException(ex);
    }

    public override string ToString()
    {
        return InputData.ToString();
    }
}
Creating the action block executed in the pipeline
private Action<WorkItem<Location, PointToLocation>> CreateProcessingAction()
{
    return new Action<WorkItem<Location, PointToLocation>>(async i =>
    {
        var sw = new Stopwatch();
        sw.Start();
        var taskItem = ((WorkItem<Location, PointToLocation>)i);
        var inputData = taskItem.InputData;
        //Log($"Consuming: {inputData.Latitude},{inputData.Longitude} {DateTime.UtcNow:mm:ss:ff}");
        //-- Assume calling another async method e.g. await httpClient.DownloadStringTaskAsync(url);
        await Task.Delay(500);
        sw.Stop();
        Location outData = new Location()
        {
            Latitude = inputData.Latitude,
            Longitude = inputData.Longitude,
            StreetAddress = $"Consumed: {inputData.Latitude},{inputData.Longitude} Duration(ms): {sw.ElapsedMilliseconds}"
        };
        taskItem.Complete(outData);
        //Console.WriteLine($"Consumed: {taskItem.url} {DateTime.UtcNow}");
    });
}
Test Method
You'll need to provide your own implementations for PointToLocation and Location; this is just an example of how you'd use it with your own classes.
int startRange = 0;
int nextRange = 1000;
ThrottledProducerConsumer<WorkItem<Location, PointToLocation>> tpc;

private void cmdTestPipeline_Click(object sender, EventArgs e)
{
    Log($"Pipeline test started {DateTime.Now:HH:mm:ss:ff}");
    if (tpc == null)
    {
        tpc = new ThrottledProducerConsumer<WorkItem<Location, PointToLocation>>(
            //1010, 2, 20000,
            TimeSpan.FromMilliseconds(1010), 45, 100000,
            CreateProcessingAction(),
            2, 45, 10);
    }

    var workItems = new List<WorkItem<Models.Location, PointToLocation>>();
    foreach (var i in Enumerable.Range(startRange, nextRange))
    {
        var ptToLoc = new PointToLocation() { Latitude = i + 101, Longitude = i + 100 };
        var wrkItem = new WorkItem<Location, PointToLocation>(ptToLoc);
        workItems.Add(wrkItem);
        wrkItem.Task.ContinueWith(t =>
        {
            var loc = t.Result;
            string line = $"[Simulated:{DateTime.Now:HH:mm:ss:ff}] - {loc.StreetAddress}";
            //txtResponse.Text = String.Concat(txtResponse.Text, line, System.Environment.NewLine);
            //var lines = txtResponse.Text.Split(new string[] { System.Environment.NewLine },
            //    StringSplitOptions.RemoveEmptyEntries).LongCount();
            //lblLines.Text = lines.ToString();
            //Log(line);
        });
        //}, TaskScheduler.FromCurrentSynchronizationContext());
    }
    startRange += nextRange;
    tpc.EnqueueItemsAsync(workItems);
    Log($"Pipeline test completed {DateTime.Now:HH:mm:ss:ff}");
}

BatchBlock produces batch with elements sent after TriggerBatch()

I have a Dataflow pipeline consisting of several blocks.
When elements are flowing through my processing pipeline, I want to group them by field A. To do this, I have a BatchBlock with a high BoundedCapacity. In it I store my elements until I decide that they should be released, at which point I invoke the TriggerBatch() method.
private void Forward(TStronglyTyped data)
{
    if (ShouldCreateNewGroup(data))
    {
        GroupingBlock.TriggerBatch();
    }

    GroupingBlock.SendAsync(data).Wait(SendTimeout);
}
This is how it looks.
The problem is that the batch produced sometimes contains the next posted element, which shouldn't be there.
To illustrate:
BatchBlock.InputQueue = {A,A,A}
NextElement = B //we should trigger a Batch!
BatchBlock.TriggerBatch()
BatchBlock.SendAsync(B);
At this point I expect my batch to be {A,A,A}, but it is {A,A,A,B}.
It is as if TriggerBatch() were asynchronous, and SendAsync was in fact executed before the batch was actually made.
How can I solve this?
I obviously don't want to put Task.Wait(x) in there (I tried, and it works, but then performance is poor, of course).
I also encountered this issue by trying to call TriggerBatch in the wrong place. As mentioned, the SlidingWindow example using DataflowBlock.Encapsulate is the answer here, but it took some time to adapt so I thought I'd share my completed block.
My ConditionalBatchBlock creates batches up to a maximum size, possibly sooner if a certain condition is met. In my specific scenario I needed to create batches of 100, but always create new batches when certain changes in the data were detected.
public static IPropagatorBlock<T, T[]> CreateConditionalBatchBlock<T>(int batchSize, Func<Queue<T>, T, bool> condition)
{
    var queue = new Queue<T>();

    var source = new BufferBlock<T[]>();

    var target = new ActionBlock<T>(async item =>
    {
        // start a new batch if required by the condition
        if (condition(queue, item))
        {
            await source.SendAsync(queue.ToArray());
            queue.Clear();
        }

        queue.Enqueue(item);

        // always send a batch when the max size has been reached
        if (queue.Count == batchSize)
        {
            await source.SendAsync(queue.ToArray());
            queue.Clear();
        }
    });

    // send any remaining items
    target.Completion.ContinueWith(async t =>
    {
        if (queue.Any())
            await source.SendAsync(queue.ToArray());
        source.Complete();
    });

    return DataflowBlock.Encapsulate(target, source);
}
The condition parameter may be simpler in your case. I needed to look at the queue as well as the current item to make the determination whether to create a new batch.
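For instance, a condition that starts a new batch whenever the incoming item differs from the last queued one could look like this (illustrative only):
Func<Queue<T>, T, bool> condition = (queue, currentItem) =>
    queue.Count > 0 && !queue.Last().Equals(currentItem); // new batch on value change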
I used it like this:
public async Task RunExampleAsync<T>()
{
    var conditionalBatchBlock = CreateConditionalBatchBlock<T>(100, (queue, currentItem) => ShouldCreateNewBatch(queue, currentItem));
    var actionBlock = new ActionBlock<T[]>(async x => await PerformActionAsync(x));

    conditionalBatchBlock.LinkTo(actionBlock, new DataflowLinkOptions { PropagateCompletion = true });

    await ReadDataAsync<T>(conditionalBatchBlock);
    await actionBlock.Completion;
}
Here is a specialized version of Loren Paulsen's CreateConditionalBatchBlock method. This one accepts a Func<TItem, TKey> keySelector argument, and emits a new batch every time an item with a different key is received.
public static IPropagatorBlock<TItem, TItem[]> CreateConditionalBatchBlock<TItem, TKey>(
    Func<TItem, TKey> keySelector,
    DataflowBlockOptions dataflowBlockOptions = null,
    int maxBatchSize = DataflowBlockOptions.Unbounded,
    IEqualityComparer<TKey> keyComparer = null)
{
    if (keySelector == null) throw new ArgumentNullException(nameof(keySelector));
    if (maxBatchSize < 1 && maxBatchSize != DataflowBlockOptions.Unbounded)
        throw new ArgumentOutOfRangeException(nameof(maxBatchSize));
    keyComparer = keyComparer ?? EqualityComparer<TKey>.Default;

    var options = new ExecutionDataflowBlockOptions();
    if (dataflowBlockOptions != null)
    {
        options.BoundedCapacity = dataflowBlockOptions.BoundedCapacity;
        options.CancellationToken = dataflowBlockOptions.CancellationToken;
        options.MaxMessagesPerTask = dataflowBlockOptions.MaxMessagesPerTask;
        options.TaskScheduler = dataflowBlockOptions.TaskScheduler;
    }

    var output = new BufferBlock<TItem[]>(options);

    var queue = new Queue<TItem>(); // Synchronization is not needed
    TKey previousKey = default;

    var input = new ActionBlock<TItem>(async item =>
    {
        var key = keySelector(item);
        if (queue.Count > 0 && !keyComparer.Equals(key, previousKey))
        {
            await output.SendAsync(queue.ToArray()).ConfigureAwait(false);
            queue.Clear();
        }
        queue.Enqueue(item);
        previousKey = key;
        if (queue.Count == maxBatchSize)
        {
            await output.SendAsync(queue.ToArray()).ConfigureAwait(false);
            queue.Clear();
        }
    }, options);

    _ = input.Completion.ContinueWith(async t =>
    {
        if (queue.Count > 0)
        {
            await output.SendAsync(queue.ToArray()).ConfigureAwait(false);
            queue.Clear();
        }
        if (t.IsFaulted)
        {
            ((IDataflowBlock)output).Fault(t.Exception.InnerException);
        }
        else
        {
            output.Complete();
        }
    }, TaskScheduler.Default);

    return DataflowBlock.Encapsulate(input, output);
}
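A hypothetical usage example (SensorReading is an illustrative type, not from the original answer):
public record SensorReading(int SensorId, double Value);

// Batches consecutive readings that share a SensorId, emitting at most 100 per batch.
var batchBlock = CreateConditionalBatchBlock<SensorReading, int>(r => r.SensorId, maxBatchSize: 100);
var consumer = new ActionBlock<SensorReading[]>(batch =>
    Console.WriteLine($"Batch: sensor {batch[0].SensorId}, {batch.Length} item(s)"));
batchBlock.LinkTo(consumer, new DataflowLinkOptions { PropagateCompletion = true });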
