I need help with making a TPL Dataflow pipeline update an input buffer with the latest value.
I am subscribed to a live stream of elements, which are posted one by one onto a dataflow pipeline. Each element is processed, which takes some time - it takes significantly more time to process one element than what it takes to produce it (i.e. fast producer, slow consumer).
However, if there are multiple elements on the input queue with the same identity, only the most recent one needs processing. The intermediate ones can be discarded. This is the part I am having trouble figuring out.
Here is an example of what I am trying to achieve:
/// <summary>A bid identified by <c>Id</c> that carries a <c>Value</c>.</summary>
public record Bid(int Id, int Value);

/// <summary>
/// Demo pipeline: startBlock (logs input) -> updateWithMostRecentBlock (pass-through placeholder)
/// -> processBlock (one second per element) -> finishBlock (logs completion).
/// </summary>
async Task Main()
{
    var linkOptions = new DataflowLinkOptions { PropagateCompletion = true };

    // Entry point of the pipeline; only logs that an element arrived.
    var startBlock = new TransformBlock<Bid, Bid>(bid =>
    {
        Console.WriteLine("Input: {0} ({1})", bid.Id, bid.Value);
        return bid;
    });

    //TODO: Check for duplicate identity (Bid.Id) and replace the
    // current element with the most recent one.
    var updateWithMostRecentBlock = new TransformBlock<Bid, Bid>(bid => bid);

    // Simulated slow consumer.
    var processBlock = new TransformBlock<Bid, Bid>(async bid =>
    {
        Console.WriteLine("Processing: {0} ({1})", bid.Id, bid.Value);
        await Task.Delay(1000);
        return bid;
    });

    var finishBlock = new ActionBlock<Bid>(
        bid => Console.WriteLine("Done: {0} ({1})", bid.Id, bid.Value));

    startBlock.LinkTo(updateWithMostRecentBlock, linkOptions);
    updateWithMostRecentBlock.LinkTo(processBlock, linkOptions);
    processBlock.LinkTo(finishBlock, linkOptions);

    Bid[] bids =
    {
        new Bid(1, 0), // Processed immediately
        new Bid(1, 1), // Replaced with (1,2)
        new Bid(2, 0), // Replaced with (2,1)
        new Bid(1, 2), // Queued
        new Bid(2, 1)  // Queued
    };

    foreach (var bid in bids)
    {
        startBlock.Post(bid);
    }

    startBlock.Complete();
    await finishBlock.Completion;
}
When processBlock is ready to receive the next element, I want updateWithMostRecentBlock to provide only the most relevant element.
Actual output:
Input: 1 (0)
Input: 1 (1)
Input: 2 (0)
Input: 1 (2)
Input: 2 (1)
Processing: 1 (0)
Processing: 1 (1)
Done: 1 (0)
Processing: 2 (0)
Done: 1 (1)
Processing: 1 (2)
Done: 2 (0)
Processing: 2 (1)
Done: 1 (2)
Done: 2 (1)
Expected output:
Input: 1 (0) // Immediately processed
Input: 1 (1) // Replaced by (1,2)
Input: 2 (0) // Replaced by (2,1)
Input: 1 (2) // Queued
Input: 2 (1) // Queued
Processing: 1 (0)
Done: 1 (0)
Processing: 1 (2)
Done: 1 (2)
Processing: 2 (1)
Done: 2 (1)
Hint:
Stephen Toub has an elegant solution to the exact opposite of what I'm trying to achieve. His solution rejects all incoming elements and retains the oldest one.
I'm sorry for answering my own question, but #TheGeneral brought me on the right track with his hint about bounded capacity.
I had to configure the processBlock to set bounded capacity to 1:
// processBlock is created with BoundedCapacity = 1.
var processOptions = new ExecutionDataflowBlockOptions { BoundedCapacity = 1 };
var processBlock = new TransformBlock<Bid, Bid>(async d =>
{
    Console.WriteLine("Processing: {0} ({1})", d.Id, d.Value);
    await Task.Delay(1000);
    return d;
}, processOptions);
Then I replaced the updateWithMostRecentBlock with a custom block that has this implementation:
/// <summary>
/// Propagator block that forwards values into a single-slot output buffer. While a value is
/// still waiting for that slot, a newer value with the same key cancels and replaces it.
/// </summary>
/// <typeparam name="TValue">Type of the values flowing through the block.</typeparam>
/// <typeparam name="TKey">Type of the identity extracted from each value.</typeparam>
public class DiscardAndReplaceDuplicatesBlock<TValue, TKey> : IPropagatorBlock<TValue, TValue>
    where TKey : IEquatable<TKey>
{
    private readonly ITargetBlock<TValue> _target;
    private readonly IReceivableSourceBlock<TValue> _source;

    /// <summary>Creates the block.</summary>
    /// <param name="keyAccessor">Extracts the identity used to detect duplicates.</param>
    public DiscardAndReplaceDuplicatesBlock(Func<TValue, TKey> keyAccessor)
    {
        // Most recent send per key, together with the token source that can cancel it.
        var buffer = new ConcurrentDictionary<TKey, (TValue Value, Task Task, CancellationTokenSource Token)>();

        var outgoing = new BufferBlock<TValue>(new ExecutionDataflowBlockOptions
        {
            BoundedCapacity = 1,
            MaxMessagesPerTask = 1
        });

        var incoming = new ActionBlock<TValue>(value =>
        {
            var key = keyAccessor(value);
            var cts = new CancellationTokenSource();

            // A single TryRemove both detects and removes the previous entry for this key.
            if (buffer.TryRemove(key, out var previous))
            {
                Console.WriteLine("Remove: {0}", previous.Value);
                if (!previous.Task.IsCompleted)
                {
                    // Still waiting for the output slot: withdraw it in favour of the newer value.
                    previous.Token.Cancel();
                    Console.WriteLine("Cancel: {0}", previous.Value);
                }

                // Dispose in both cases; the token source of an already completed send
                // used to be left undisposed.
                previous.Token.Dispose();
            }

            var task = outgoing.SendAsync(value, cts.Token);
            if (task.IsCompleted)
            {
                cts.Dispose();
                Console.WriteLine("Sent: {0}", value);
                return;
            }

            buffer.AddOrUpdate(key, (value, task, cts), (k, t) => (value, task, cts));
            Console.WriteLine("Buffered: {0}", value);
        });

        // Propagate completion/fault from the input side to the output side, but only
        // after every still-pending send has finished.
        incoming.Completion.ContinueWith(
            async t =>
            {
                if (t.IsFaulted)
                {
                    ((ITargetBlock<TValue>)outgoing).Fault(t.Exception.InnerException);
                }
                else
                {
                    await WaitForBufferToCompleteAsync().ConfigureAwait(false);
                    outgoing.Complete();
                }
            },
            default,
            TaskContinuationOptions.ExecuteSynchronously,
            TaskScheduler.Default);

        Task WaitForBufferToCompleteAsync()
        {
            if (!buffer.Any())
                return Task.CompletedTask;
            var buffered = buffer.Where(kvp => !kvp.Value.Task.IsCompleted);
            var tasks = buffered.Select(b => b.Value.Task);
            return Task.WhenAll(tasks);
        }

        _target = incoming;
        _source = outgoing;
    }

    /// <summary>Completes when the output buffer has completed.</summary>
    public Task Completion =>
        _source.Completion;

    /// <summary>Signals that no more values will be offered to the input side.</summary>
    public void Complete() =>
        _target.Complete();

    /// <summary>Faults the input side with <paramref name="exception"/>.</summary>
    public void Fault(Exception exception) =>
        _target.Fault(exception);

    public IDisposable LinkTo(ITargetBlock<TValue> target, DataflowLinkOptions linkOptions) =>
        _source.LinkTo(target, linkOptions);

    public TValue ConsumeMessage(DataflowMessageHeader messageHeader, ITargetBlock<TValue> target, out bool messageConsumed) =>
        _source.ConsumeMessage(messageHeader, target, out messageConsumed);

    public DataflowMessageStatus OfferMessage(DataflowMessageHeader messageHeader, TValue messageValue, ISourceBlock<TValue>? source, bool consumeToAccept) =>
        _target.OfferMessage(messageHeader, messageValue, source, consumeToAccept);

    public bool ReserveMessage(DataflowMessageHeader messageHeader, ITargetBlock<TValue> target) =>
        _source.ReserveMessage(messageHeader, target);

    public void ReleaseReservation(DataflowMessageHeader messageHeader, ITargetBlock<TValue> target) =>
        _source.ReleaseReservation(messageHeader, target);
}
It is not very pretty, and it is not production tested, but it seems to work. In order to actually replace an already dispatched element, I had to retain the cancellation token used so I could cancel an outdated but unprocessed element. I'm not sure this is the best idea so any critique is welcome!
One note, though: This will also process element (1,1) because after (1,0) has been dispatched to the processBlock, element (1,1) is successfully sent to the custom block's output buffer. I don't think this can be avoided.
Here is my take on this problem:
/// <summary>
/// Creates a propagator block that transforms items and keeps only the most recent
/// not-yet-processed item per key; an older pending item with the same key is replaced
/// and reported through <paramref name="droppedItems"/>.
/// </summary>
/// <param name="transform">Asynchronous transformation applied to each surviving item.</param>
/// <param name="keySelector">Extracts the key that identifies duplicates.</param>
/// <param name="dataflowBlockOptions">Options for the transforming block; a default instance is used when null.</param>
/// <param name="keyComparer">Key comparer; <c>EqualityComparer&lt;TKey&gt;.Default</c> when null.</param>
/// <param name="droppedItems">Optional receiver of replaced items.</param>
/// <exception cref="ArgumentNullException"><paramref name="transform"/> or <paramref name="keySelector"/> is null.</exception>
public static IPropagatorBlock<TInput, TOutput>
    CreateTransformBlockDropOldestByKey<TInput, TOutput, TKey>(
        Func<TInput, Task<TOutput>> transform,
        Func<TInput, TKey> keySelector,
        ExecutionDataflowBlockOptions dataflowBlockOptions = null,
        IEqualityComparer<TKey> keyComparer = null,
        IProgress<TInput> droppedItems = null)
{
    if (transform == null) throw new ArgumentNullException(nameof(transform));
    if (keySelector == null) throw new ArgumentNullException(nameof(keySelector));
    dataflowBlockOptions ??= new ExecutionDataflowBlockOptions();
    keyComparer ??= EqualityComparer<TKey>.Default;

    // Latest pending item per key; every access is guarded by a lock on the dictionary itself.
    var latestByKey = new Dictionary<TKey, TInput>(keyComparer);

    // Receives keys. A key whose item was already taken (by an earlier copy of the
    // same key) produces no output.
    var outputBlock = new TransformManyBlock<TKey, TOutput>(async key =>
    {
        TInput pending;
        bool found;
        lock (latestByKey)
        {
            found = latestByKey.Remove(key, out pending);
        }

        if (found)
        {
            var result = await transform(pending).ConfigureAwait(false);
            return new[] { result };
        }

        return Enumerable.Empty<TOutput>();
    }, dataflowBlockOptions);

    var inputOptions = new ExecutionDataflowBlockOptions()
    {
        BoundedCapacity = 1,
        CancellationToken = dataflowBlockOptions.CancellationToken,
        TaskScheduler = dataflowBlockOptions.TaskScheduler,
    };

    // Stores the newest item for its key, reports the item it replaced (if any),
    // then forwards the key to the output block.
    var inputBlock = new ActionBlock<TInput>(item =>
    {
        var key = keySelector(item);
        TInput replaced;
        bool hadPrevious;
        lock (latestByKey)
        {
            hadPrevious = latestByKey.TryGetValue(key, out replaced);
            latestByKey[key] = item;
        }

        if (hadPrevious)
        {
            droppedItems?.Report(replaced);
        }

        return outputBlock.SendAsync(key);
    }, inputOptions);

    PropagateCompletion(inputBlock, outputBlock);
    PropagateFailure(outputBlock, inputBlock);
    return DataflowBlock.Encapsulate(inputBlock, outputBlock);

    // Completes or faults the target once the source has finished.
    async void PropagateCompletion(IDataflowBlock source, IDataflowBlock target)
    {
        try { await source.Completion.ConfigureAwait(false); } catch { }
        if (source.Completion.IsFaulted)
        {
            target.Fault(source.Completion.Exception);
        }
        else
        {
            target.Complete();
        }
    }

    // Faults the target only when the source has faulted; a normal completion is not propagated.
    async void PropagateFailure(IDataflowBlock source, IDataflowBlock target)
    {
        try { await source.Completion.ConfigureAwait(false); } catch { }
        if (!source.Completion.IsFaulted) return;
        target.Fault(source.Completion.Exception);
    }
}
Usage example:
// Prints every bid that was replaced by a newer bid with the same Id.
var droppedItems = new Progress<Bid>(
    b => Console.WriteLine($"Dropped: {b.Id} ({b.Value})"));

var processBlock = CreateTransformBlockDropOldestByKey<Bid, Bid, int>(
    transform: async b =>
    {
        Console.WriteLine($"Processing: {b.Id} ({b.Value})");
        await Task.Delay(1000);
        return b;
    },
    keySelector: b => b.Id,
    droppedItems: droppedItems);
The reason the two internal blocks, the inputBlock and the outputBlock, are not linked together directly is that a fault in the outputBlock could otherwise leave the inputBlock hanging in an incomplete state forever. It is important that if one of the two blocks fails, the other fails too, so that any pending SendAsync operation towards the inputBlock is canceled. The blocks are linked together indirectly, by using the PropagateCompletion and PropagateFailure methods.
Configuring the processBlock with a BoundedCapacity should take into account that the block may contain in its input queue keys that may have been dropped, so setting this configuration to a slightly higher value is advised.
Related
I'm trying to create some sort of queue that will process the N latest messages received. Right now I have this:
/// <summary>
/// Wires a BroadcastBlock to a bounded, single-threaded ActionBlock, with a null target
/// linked as the second consumer of the broadcast.
/// </summary>
private static void SetupMessaging()
{
    var broadcastOptions = new ExecutionDataflowBlockOptions
    {
        //BoundedCapacity = 1,
        EnsureOrdered = true,
        MaxDegreeOfParallelism = 1,
        MaxMessagesPerTask = 1
    };
    var consumerOptions = new ExecutionDataflowBlockOptions
    {
        BoundedCapacity = 2,
        EnsureOrdered = true,
        MaxDegreeOfParallelism = 1,
        MaxMessagesPerTask = 1
    };
    var linkOptions = new DataflowLinkOptions { PropagateCompletion = true };

    _messagingBroadcastBlock = new BroadcastBlock<string>(msg => msg, broadcastOptions);

    // Slow consumer: prints the message, then blocks for five seconds.
    _messagingActionBlock = new ActionBlock<string>(msg =>
    {
        Console.WriteLine(msg);
        Thread.Sleep(5000);
    }, consumerOptions);

    _messagingBroadcastBlock.LinkTo(_messagingActionBlock, linkOptions);
    _messagingBroadcastBlock.LinkTo(DataflowBlock.NullTarget<string>());
}
The problem is if I post 1,2,3,4,5 to it I will get 1,2,5 but i'd like it to be 1,4,5. Any suggestions are welcome.
UPD 1
I was able to make the following solution work
/// <summary>
/// Wraps an <c>ActionBlock</c> so that at most <c>BoundedCapacity</c> messages wait for
/// processing. When a newer message arrives while the waiting list is full, the oldest
/// waiting message is cancelled and skipped. A message that has already started
/// processing is never cancelled.
/// </summary>
class FixedCapacityActionBlock<T>
{
    private readonly ActionBlock<CancellableMessage<T>> _actionBlock;

    // Messages posted but not yet picked up by the action, oldest first. Guarded by _syncRoot.
    private readonly LinkedList<CancellableMessage<T>> _pending = new LinkedList<CancellableMessage<T>>();
    private readonly int _maxQueueSize;
    private readonly object _syncRoot = new object();

    /// <param name="act">Action executed for every message that was not dropped.</param>
    /// <param name="opt">
    /// Options copied to the inner block. <c>BoundedCapacity</c> is not passed on; it is used
    /// as the maximum number of waiting messages (<c>Unbounded</c> disables dropping).
    /// </param>
    public FixedCapacityActionBlock(Action<T> act, ExecutionDataflowBlockOptions opt)
    {
        if (act == null) throw new ArgumentNullException(nameof(act));
        if (opt == null) throw new ArgumentNullException(nameof(opt));

        var options = new ExecutionDataflowBlockOptions
        {
            EnsureOrdered = opt.EnsureOrdered,
            CancellationToken = opt.CancellationToken,
            MaxDegreeOfParallelism = opt.MaxDegreeOfParallelism,
            MaxMessagesPerTask = opt.MaxMessagesPerTask,
            NameFormat = opt.NameFormat,
            SingleProducerConstrained = opt.SingleProducerConstrained,
            TaskScheduler = opt.TaskScheduler,
            //we intentionally ignore this value
            //BoundedCapacity = opt.BoundedCapacity
        };
        _maxQueueSize = opt.BoundedCapacity;

        _actionBlock = new ActionBlock<CancellableMessage<T>>(cmsg =>
        {
            // The message is finished either way (skipped or processed), so release its token source.
            using (cmsg)
            {
                lock (_syncRoot)
                {
                    // Dropped while waiting: Post already removed it from _pending.
                    if (cmsg.CancellationTokenSource.IsCancellationRequested)
                    {
                        return;
                    }

                    // Claim the message under the lock so that Post can no longer drop it.
                    _pending.Remove(cmsg);
                }

                act(cmsg.Message);
            }
        }, options);
    }

    /// <summary>
    /// Queues <paramref name="msg"/>, dropping the oldest waiting messages when more than
    /// the configured number would be waiting.
    /// </summary>
    /// <returns>False when the inner block declined the message.</returns>
    public bool Post(T msg)
    {
        var fullMsg = new CancellableMessage<T>(msg);
        lock (_syncRoot)
        {
            // Register before posting so the action always finds the message in _pending.
            var node = _pending.AddLast(fullMsg);
            if (!_actionBlock.Post(fullMsg))
            {
                _pending.Remove(node);
                fullMsg.Dispose();
                return false;
            }

            if (_maxQueueSize != DataflowBlockOptions.Unbounded)
            {
                while (_pending.Count > _maxQueueSize)
                {
                    // Cancel exactly the message that is removed from the bookkeeping.
                    _pending.First.Value.CancellationTokenSource.Cancel();
                    _pending.RemoveFirst();
                }
            }

            return true;
        }
    }
}
And
/// <summary>
/// Pairs a message with its own <c>CancellationTokenSource</c>.
/// </summary>
class CancellableMessage<T> : IDisposable
{
    /// <summary>Wraps <paramref name="msg"/> together with a fresh token source.</summary>
    public CancellableMessage(T msg)
    {
        Message = msg;
        CancellationTokenSource = new CancellationTokenSource();
    }

    /// <summary>The wrapped payload.</summary>
    public T Message { get; set; }

    /// <summary>Token source associated with this message.</summary>
    public CancellationTokenSource CancellationTokenSource { get; set; }

    /// <summary>Disposes the token source, if one is set.</summary>
    public void Dispose() => CancellationTokenSource?.Dispose();
}
While this works and actually does the job this implementation looks dirty, also possibly not thread safe.
Here is a TransformBlock and ActionBlock implementation that drops the oldest messages in its queue, whenever newer messages are received and the BoundedCapacity limit has been reached. It behaves quite similar to a Channel configured with BoundedChannelFullMode.DropOldest.
/// <summary>
/// Builds a propagator block that runs <paramref name="transform"/> on posted items and,
/// when the internal queue already holds BoundedCapacity items, discards the oldest queued
/// item to make room for the newest one.
/// </summary>
/// <param name="transform">Asynchronous transformation applied to each non-dropped item.</param>
/// <param name="dataflowBlockOptions">Options for the internal processing block; a new
/// ExecutionDataflowBlockOptions is used when null. Its BoundedCapacity is temporarily
/// doubled and then restored inside this method.</param>
/// <param name="droppedMessages">Optional receiver that is notified of every dropped item.</param>
/// <returns>The input and output blocks encapsulated as a single propagator.</returns>
/// <exception cref="ArgumentNullException"><paramref name="transform"/> is null.</exception>
public static IPropagatorBlock<TInput, TOutput>
CreateTransformBlockDropOldest<TInput, TOutput>(
Func<TInput, Task<TOutput>> transform,
ExecutionDataflowBlockOptions dataflowBlockOptions = null,
IProgress<TInput> droppedMessages = null)
{
if (transform == null) throw new ArgumentNullException(nameof(transform));
dataflowBlockOptions = dataflowBlockOptions ?? new ExecutionDataflowBlockOptions();
var boundedCapacity = dataflowBlockOptions.BoundedCapacity;
var cancellationToken = dataflowBlockOptions.CancellationToken;
// Items waiting to be transformed; every access is guarded by a lock on the queue itself.
var queue = new Queue<TInput>(Math.Max(0, boundedCapacity));
// Holds transformed results, bounded by the caller's original capacity.
var outputBlock = new BufferBlock<TOutput>(new DataflowBlockOptions()
{
BoundedCapacity = boundedCapacity,
CancellationToken = cancellationToken
});
// Temporarily mutate the caller's options (restored right after transformBlock is created).
// checked(...) makes an overflow of the doubled capacity throw instead of wrapping.
if (boundedCapacity != DataflowBlockOptions.Unbounded)
dataflowBlockOptions.BoundedCapacity = checked(boundedCapacity * 2);
// After testing, at least boundedCapacity + 1 is required.
// Make it double to be sure that all non-dropped messages will be processed.
// The posted value is only a token: the argument is ignored and the real item is taken from the queue.
var transformBlock = new ActionBlock<object>(async _ =>
{
TInput item;
lock (queue)
{
// The item this token was posted for may have been dropped in the meantime.
if (queue.Count == 0) return;
item = queue.Dequeue();
}
var result = await transform(item).ConfigureAwait(false);
await outputBlock.SendAsync(result, cancellationToken).ConfigureAwait(false);
}, dataflowBlockOptions);
dataflowBlockOptions.BoundedCapacity = boundedCapacity; // Restore initial value
// Accepts items from the outside; it inherits only the cancellation token, not the bounded capacity.
var inputBlock = new ActionBlock<TInput>(item =>
{
var droppedEntry = (Exists: false, Item: (TInput)default);
lock (queue)
{
// Post the token first, then make room by evicting the oldest queued item if the queue is full.
transformBlock.Post(null);
if (queue.Count == boundedCapacity) droppedEntry = (true, queue.Dequeue());
queue.Enqueue(item);
}
// The notification is raised outside the lock.
if (droppedEntry.Exists) droppedMessages?.Report(droppedEntry.Item);
}, new ExecutionDataflowBlockOptions()
{
CancellationToken = cancellationToken
});
// inputBlock -> transformBlock -> outputBlock for completion; a transformBlock fault is also pushed back to inputBlock.
PropagateCompletion(inputBlock, transformBlock);
PropagateFailure(transformBlock, inputBlock);
PropagateCompletion(transformBlock, outputBlock);
// Discard whatever is still queued once the transform block has finished.
_ = transformBlock.Completion.ContinueWith(_ => { lock (queue) queue.Clear(); },
TaskScheduler.Default);
return DataflowBlock.Encapsulate(inputBlock, outputBlock);
// Waits for the source to finish (ignoring its exception), then faults or completes the target accordingly.
async void PropagateCompletion(IDataflowBlock source, IDataflowBlock target)
{
try { await source.Completion.ConfigureAwait(false); } catch { }
var exception = source.Completion.IsFaulted ? source.Completion.Exception : null;
if (exception != null) target.Fault(exception); else target.Complete();
}
// Like PropagateCompletion, but acts only when the source has faulted.
async void PropagateFailure(IDataflowBlock source, IDataflowBlock target)
{
try { await source.Completion.ConfigureAwait(false); } catch { }
if (source.Completion.IsFaulted) target.Fault(source.Completion.Exception);
}
}
/// <summary>
/// Overload with synchronous lambda: wraps <paramref name="transform"/> in a completed
/// task and delegates to the asynchronous overload.
/// </summary>
public static IPropagatorBlock<TInput, TOutput>
    CreateTransformBlockDropOldest<TInput, TOutput>(
        Func<TInput, TOutput> transform,
        ExecutionDataflowBlockOptions dataflowBlockOptions = null,
        IProgress<TInput> droppedMessages = null)
{
    Func<TInput, Task<TOutput>> asyncTransform = item => Task.FromResult(transform(item));
    return CreateTransformBlockDropOldest<TInput, TOutput>(
        asyncTransform, dataflowBlockOptions, droppedMessages);
}
/// <summary>
/// ActionBlock equivalent: runs <paramref name="action"/> through the drop-oldest transform
/// block and discards the (null) results by linking the output to a null target.
/// </summary>
/// <exception cref="ArgumentNullException"><paramref name="action"/> is null.</exception>
public static ITargetBlock<TInput>
    CreateActionBlockDropOldest<TInput>(
        Func<TInput, Task> action,
        ExecutionDataflowBlockOptions dataflowBlockOptions = null,
        IProgress<TInput> droppedMessages = null)
{
    if (action == null) throw new ArgumentNullException(nameof(action));

    // Adapts the result-less action to the transform signature by yielding a null result.
    Func<TInput, Task<object>> runAndDiscard = async item =>
    {
        await action(item).ConfigureAwait(false);
        return null;
    };

    var propagator = CreateTransformBlockDropOldest<TInput, object>(
        runAndDiscard, dataflowBlockOptions, droppedMessages);
    propagator.LinkTo(DataflowBlock.NullTarget<object>());
    return propagator;
}
/// <summary>
/// ActionBlock equivalent with synchronous lambda: runs <paramref name="action"/> inline
/// and hands a completed task to the asynchronous overload.
/// </summary>
public static ITargetBlock<TInput>
    CreateActionBlockDropOldest<TInput>(
        Action<TInput> action,
        ExecutionDataflowBlockOptions dataflowBlockOptions = null,
        IProgress<TInput> droppedMessages = null)
{
    Func<TInput, Task> asyncAction = item =>
    {
        action(item);
        return Task.CompletedTask;
    };
    return CreateActionBlockDropOldest(asyncAction, dataflowBlockOptions, droppedMessages);
}
The idea is to store the queued items in an auxiliary Queue, and pass dummy (null) values to an internal ActionBlock<object>. The block ignores the items passed as arguments, and instead takes an item from the queue, if there is one. A lock is used to ensure that all non-dropped items in the queue will eventually be processed (unless of course an exception occurs).
There is also an extra feature. An optional IProgress<TInput> droppedMessages argument allows to receive notifications every time a message is dropped.
Usage example:
// At most two messages wait in the queue; older ones are dropped and reported.
var options = new ExecutionDataflowBlockOptions { BoundedCapacity = 2 };
var dropped = new Progress<string>(
    msg => Console.WriteLine($"Message dropped: {msg}"));

_messagingActionBlock = CreateActionBlockDropOldest<string>(msg =>
{
    Console.WriteLine($"Processing: {msg}");
    Thread.Sleep(5000);
}, options, dropped);
TPL Dataflow doesn't fit the "last N messages" scenario well, as it's meant to be a queue or pipeline (FIFO), not a stack (LIFO). Do you really need to do this with a dataflow library?
It's much easier with ConcurrentStack<T>, you just introduce one producer task, which posts to the stack, and one consumer task, which gets messages from stack while number of handled ones are lesser than N (More about Producer-Consumer).
If you need TPL Dataflow, you can use it in consumer task, to start handling the last messages, but not in producer, as it's really not the way it was meant to be used. Moreover, there are some other libraries with event-based architecture, which may fit more naturally for your problem.
I'm using reactive extensions to collate data into buffers of 100ms:
// Exclude items whose source is "FOO", then collect the rest into 100 ms batches.
var batches = this.dataService
    .Where(x => !string.Equals("FOO", x.Key.Source))
    .Buffer(TimeSpan.FromMilliseconds(100));

// Observe on the dispatcher service and skip empty batches.
this.subscription = batches
    .ObserveOn(this.dispatcherService)
    .Where(x => x.Count != 0)
    .Subscribe(this.OnBufferReceived);
This works fine. However, I want slightly different behavior than that provided by the Buffer operation. Essentially, I want to reset the timer if another data item is received. Only when no data has been received for the entire 100ms do I want to handle it. This opens up the possibility of never handling the data, so I should also be able to specify a maximum count. I would imagine something along the lines of:
.SlidingBuffer(TimeSpan.FromMilliseconds(100), 10000)
I've had a look around and haven't been able to find anything like this in Rx? Can anyone confirm/deny this?
This is possible by combining the built-in Window and Throttle methods of Observable. First, let's solve the simpler problem where we ignore the maximum count condition:
/// <summary>
/// Groups elements into lists; a list is closed when the Throttle of the stream with
/// <paramref name="delay"/> produces a value.
/// </summary>
public static IObservable<IList<T>> BufferUntilInactive<T>(this IObservable<T> stream, TimeSpan delay)
{
    var quietPeriodElapsed = stream.Throttle(delay);
    var windows = stream.Window(() => quietPeriodElapsed);
    return windows.SelectMany(w => w.ToList());
}
The powerful Window method did the heavy lifting. Now it's easy enough to see how to add a maximum count:
/// <summary>
/// Groups elements into lists; a list is closed when the Throttle of the stream with
/// <paramref name="delay"/> produces a value or, when <paramref name="max"/> is given,
/// when the element count reaches <paramref name="max"/>.
/// </summary>
public static IObservable<IList<T>> BufferUntilInactive<T>(this IObservable<T> stream, TimeSpan delay, Int32? max=null)
{
    var windowClosings = stream.Throttle(delay);
    if (max.HasValue)
    {
        // Emits once the element index reaches the maximum count.
        windowClosings = windowClosings.Merge(stream.Where((_, position) => position + 1 >= max));
    }

    var windows = stream.Window(() => windowClosings);
    return windows.SelectMany(w => w.ToList());
}
I'll write a post explaining this on my blog. https://gist.github.com/2244036
Documentation for the Window method:
http://leecampbell.blogspot.co.uk/2011/03/rx-part-9join-window-buffer-and-group.html
http://enumeratethis.com/2011/07/26/financial-charts-reactive-extensions/
I wrote an extension to do most of what you're after - BufferWithInactivity.
Here it is:
/// <summary>
/// Buffers elements of <paramref name="source"/> and emits the buffer either when it reaches
/// <paramref name="maximumBufferSize"/> elements or when <paramref name="inactivity"/> has
/// elapsed since the last element was added.
/// </summary>
/// <param name="source">Sequence to buffer.</param>
/// <param name="inactivity">Delay scheduled after each element before the buffer is emitted.</param>
/// <param name="maximumBufferSize">Element count at which the buffer is emitted immediately.</param>
public static IObservable<IEnumerable<T>> BufferWithInactivity<T>(
this IObservable<T> source,
TimeSpan inactivity,
int maximumBufferSize)
{
return Observable.Create<IEnumerable<T>>(o =>
{
// gate guards buffer, o and the timer slot across the source and timer callbacks.
var gate = new object();
var buffer = new List<T>();
// Holds the currently scheduled inactivity timer.
var mutable = new SerialDisposable();
var subscription = (IDisposable)null;
// The same scheduler is used for ObserveOn and for the inactivity timer.
var scheduler = Scheduler.ThreadPool;
// Emits a snapshot of the buffer and starts a new one. Callers hold the gate.
// There is no emptiness check, so an empty array is emitted when nothing is buffered.
Action dump = () =>
{
var bts = buffer.ToArray();
buffer = new List<T>();
if (o != null)
{
o.OnNext(bts);
}
};
// Releases the source subscription (if already assigned) and the timer slot.
Action dispose = () =>
{
if (subscription != null)
{
subscription.Dispose();
}
mutable.Dispose();
};
// Shared termination path: stop everything, emit what is buffered, then forward the terminal signal.
Action<Action<IObserver<IEnumerable<T>>>> onErrorOrCompleted =
onAction =>
{
lock (gate)
{
dispose();
dump();
if (o != null)
{
onAction(o);
}
}
};
Action<Exception> onError = ex =>
onErrorOrCompleted(x => x.OnError(ex));
Action onCompleted = () => onErrorOrCompleted(x => x.OnCompleted());
Action<T> onNext = t =>
{
lock (gate)
{
buffer.Add(t);
if (buffer.Count == maximumBufferSize)
{
// Buffer is full: emit now and replace the pending timer with an empty disposable.
dump();
mutable.Disposable = Disposable.Empty;
}
else
{
// Schedule a new inactivity timer in the slot.
// NOTE(review): relies on SerialDisposable disposing the previously assigned timer on assignment - confirm.
mutable.Disposable = scheduler.Schedule(inactivity, () =>
{
lock (gate)
{
dump();
}
});
}
}
};
subscription =
source
.ObserveOn(scheduler)
.Subscribe(onNext, onError, onCompleted);
// Unsubscribe: drop the observer reference so later dumps emit nothing, then release resources.
return () =>
{
lock (gate)
{
o = null;
dispose();
}
};
});
}
With Rx Extensions 2.0, you can meet both requirements with a new Buffer overload that accepts a timeout and a size:
// Exclude items whose source is "FOO", then buffer with a 100 ms time span and a count of 1.
var batches = this.dataService
    .Where(x => !string.Equals("FOO", x.Key.Source))
    .Buffer(TimeSpan.FromMilliseconds(100), 1);

// Observe on the dispatcher service and skip empty batches.
this.subscription = batches
    .ObserveOn(this.dispatcherService)
    .Where(x => x.Count != 0)
    .Subscribe(this.OnBufferReceived);
See https://msdn.microsoft.com/en-us/library/hh229200(v=vs.103).aspx for the documentation.
I guess this can be implemented on top of Buffer method as shown below:
/// <summary>
/// Accumulates the time-based buffers of <paramref name="obs"/> and emits the accumulated
/// items when a whole <paramref name="span"/> passes without activity, when at least
/// <paramref name="max"/> items have been collected, or when the source completes.
/// </summary>
/// <param name="obs">Source sequence.</param>
/// <param name="span">Length of each underlying time buffer.</param>
/// <param name="max">Item count at which the accumulated items are emitted.</param>
public static IObservable<IList<T>> SlidingBuffer<T>(this IObservable<T> obs, TimeSpan span, int max)
{
    return Observable.CreateWithDisposable<IList<T>>(cl =>
    {
        var acc = new List<T>();

        // Emit a copy: the accumulator is cleared and reused right after, which would
        // otherwise empty the very list instance the subscriber has just received.
        Action flush = () =>
        {
            cl.OnNext(new List<T>(acc));
            acc.Clear();
        };

        return obs.Buffer(span)
            .Subscribe(next =>
            {
                if (next.Count == 0) //no activity in time span
                {
                    flush();
                }
                else
                {
                    acc.AddRange(next);
                    if (acc.Count >= max) //max items collected
                    {
                        flush();
                    }
                }
            }, err => cl.OnError(err), () => { flush(); cl.OnCompleted(); });
    });
}
NOTE: I haven't tested it, but I hope it gives you the idea.
Colonel Panic's solution is almost perfect. The only thing that is missing is a Publish component, in order to make the solution work with cold sequences too.
/// <summary>
/// Projects each element of an observable sequence into a buffer that's sent out
/// when either a given inactivity timespan has elapsed, or it's full,
/// using the specified scheduler to run timers.
/// </summary>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="maxCount"/> is less than 1.</exception>
public static IObservable<IList<T>> BufferUntilInactive<T>(
    this IObservable<T> source, TimeSpan dueTime, int maxCount,
    IScheduler scheduler = default)
{
    if (maxCount < 1) throw new ArgumentOutOfRangeException(nameof(maxCount));
    scheduler ??= Scheduler.Default;

    return source.Publish(shared =>
    {
        // A window closes on whichever comes first: the inactivity signal or the maxCount-th element.
        var inactivityBoundary = shared.Throttle(dueTime, scheduler);
        var capacityBoundary = shared.Skip(maxCount - 1);
        var windowClosings = inactivityBoundary.Merge(capacityBoundary);

        var windows = shared.Window(() => windowClosings);
        return windows.SelectMany(w => w.ToList());
    });
}
Beyond adding the Publish, I've also replaced the original .Where((_, index) => index + 1 >= maxCount) with the equivalent but shorter .Skip(maxCount - 1). For completeness there is also an IScheduler parameter, which configures the scheduler where the timer is run.
Given a high-frequency observable stream of data, i want to only emit an item every XX seconds.
This is usually done in RX by using .Sample(TimeSpan.FromSeconds(XX))
However... I want the time-interval to vary based on some property on the data.
Let's say my data is:
// Sample data type from the question; "..." stands for members the question omits.
class Position
{
...
// Value compared against 100 to pick the sampling interval (units are not stated).
public int Speed;
}
If Speed is less than 100, I want to emit data every 5 seconds. If Speed is higher than 100, it should be every 2 seconds.
Is that possible with off-the-shelf Sample() or do I need to build something myself?
Here is a low level implementation, utilizing the System.Reactive.Concurrency.Scheduler.SchedulePeriodic extension method as a timer.
/// <summary>
/// Samples the source with a period chosen per element: the timer is restarted whenever
/// <paramref name="intervalSelector"/> returns an interval different from the current one,
/// and each tick emits the latest element that has not been emitted yet.
/// </summary>
/// <param name="source">Sequence to sample.</param>
/// <param name="intervalSelector">Returns the sampling interval for an element.</param>
/// <param name="scheduler">Scheduler that runs the timer; <c>Scheduler.Default</c> when null.</param>
/// <exception cref="ArgumentNullException"><paramref name="source"/> or <paramref name="intervalSelector"/> is null.</exception>
public static IObservable<TSource> Sample<TSource>(this IObservable<TSource> source,
    Func<TSource, TimeSpan> intervalSelector, IScheduler scheduler = null)
{
    if (source == null) throw new ArgumentNullException(nameof(source));
    if (intervalSelector == null)
        throw new ArgumentNullException(nameof(intervalSelector));
    scheduler = scheduler ?? Scheduler.Default;
    return Observable.Create<TSource>(observer =>
    {
        TimeSpan currentInterval = Timeout.InfiniteTimeSpan;
        // Slot for the active periodic timer. It is returned as part of the subscription so
        // that unsubscribing also stops the timer; previously the timer was only stopped on
        // OnError/OnCompleted and kept ticking after an early unsubscribe.
        var timer = new SerialDisposable();
        TSource latestItem = default;
        bool latestEmitted = true;
        object locker = new object();

        Action periodicAction = () =>
        {
            TSource itemToEmit;
            lock (locker)
            {
                // Nothing new has arrived since the previous tick.
                if (latestEmitted) return;
                itemToEmit = latestItem;
                latestItem = default;
                latestEmitted = true;
            }
            // Emit outside the lock.
            observer.OnNext(itemToEmit);
        };

        var subscription = source.Subscribe(onNext: item =>
        {
            lock (locker)
            {
                latestItem = item;
                latestEmitted = false;
            }
            var newInterval = intervalSelector(item);
            if (newInterval != currentInterval)
            {
                // Assigning the slot replaces (and disposes) the previous timer.
                timer.Disposable = scheduler.SchedulePeriodic(newInterval, periodicAction);
                currentInterval = newInterval;
            }
        }, onError: ex =>
        {
            timer.Dispose();
            observer.OnError(ex);
        }, onCompleted: () =>
        {
            timer.Dispose();
            observer.OnCompleted();
        });

        return new CompositeDisposable(subscription, timer);
    });
}
Usage example:
observable.Sample(x => TimeSpan.FromSeconds(x.Speed < 100 ? 5.0 : 2.0));
The timer is restarted every time the intervalSelector callback returns a different interval. In the extreme case that the interval is changed with every new item, then this custom operator will behave more like the built-in Throttle than the built-in Sample.
Unlike Sample, Throttle's period is a sliding window. Each time Throttle receives a value, the window is reset. (citation)
Let me know if this works:
// Maps each position to its sampling interval and creates a new Sample stream whenever the
// interval changes. DistinctUntilChanged (rather than Distinct) is required so that going
// back to a previously used interval produces a new Sample stream again.
// The result is a sequence of sequences; the consumer must flatten it (e.g. with Switch())
// to receive positions.
var query =
    source
        .Publish(ss =>
            ss
                .Select(s => s.Speed < 100 ? 5.0 : 2.0)
                .DistinctUntilChanged()
                .Select(x => ss.Sample(TimeSpan.FromSeconds(x))));
I'm trying to create some sort of queue that will process the N latest messages received. Right now I have this:
/// <summary>
/// Wires a BroadcastBlock to a bounded, single-threaded ActionBlock, with a null target
/// linked as the second consumer of the broadcast.
/// </summary>
private static void SetupMessaging()
{
    var broadcastOptions = new ExecutionDataflowBlockOptions
    {
        //BoundedCapacity = 1,
        EnsureOrdered = true,
        MaxDegreeOfParallelism = 1,
        MaxMessagesPerTask = 1
    };
    var consumerOptions = new ExecutionDataflowBlockOptions
    {
        BoundedCapacity = 2,
        EnsureOrdered = true,
        MaxDegreeOfParallelism = 1,
        MaxMessagesPerTask = 1
    };
    var linkOptions = new DataflowLinkOptions { PropagateCompletion = true };

    _messagingBroadcastBlock = new BroadcastBlock<string>(msg => msg, broadcastOptions);

    // Slow consumer: prints the message, then blocks for five seconds.
    _messagingActionBlock = new ActionBlock<string>(msg =>
    {
        Console.WriteLine(msg);
        Thread.Sleep(5000);
    }, consumerOptions);

    _messagingBroadcastBlock.LinkTo(_messagingActionBlock, linkOptions);
    _messagingBroadcastBlock.LinkTo(DataflowBlock.NullTarget<string>());
}
The problem is if I post 1,2,3,4,5 to it I will get 1,2,5 but i'd like it to be 1,4,5. Any suggestions are welcome.
UPD 1
I was able to make the following solution work
/// <summary>
/// Wraps an <c>ActionBlock</c> so that at most <c>BoundedCapacity</c> messages wait for
/// processing. When a newer message arrives while the waiting list is full, the oldest
/// waiting message is cancelled and skipped. A message that has already started
/// processing is never cancelled.
/// </summary>
class FixedCapacityActionBlock<T>
{
    private readonly ActionBlock<CancellableMessage<T>> _actionBlock;

    // Messages posted but not yet picked up by the action, oldest first. Guarded by _syncRoot.
    private readonly LinkedList<CancellableMessage<T>> _pending = new LinkedList<CancellableMessage<T>>();
    private readonly int _maxQueueSize;
    private readonly object _syncRoot = new object();

    /// <param name="act">Action executed for every message that was not dropped.</param>
    /// <param name="opt">
    /// Options copied to the inner block. <c>BoundedCapacity</c> is not passed on; it is used
    /// as the maximum number of waiting messages (<c>Unbounded</c> disables dropping).
    /// </param>
    public FixedCapacityActionBlock(Action<T> act, ExecutionDataflowBlockOptions opt)
    {
        if (act == null) throw new ArgumentNullException(nameof(act));
        if (opt == null) throw new ArgumentNullException(nameof(opt));

        var options = new ExecutionDataflowBlockOptions
        {
            EnsureOrdered = opt.EnsureOrdered,
            CancellationToken = opt.CancellationToken,
            MaxDegreeOfParallelism = opt.MaxDegreeOfParallelism,
            MaxMessagesPerTask = opt.MaxMessagesPerTask,
            NameFormat = opt.NameFormat,
            SingleProducerConstrained = opt.SingleProducerConstrained,
            TaskScheduler = opt.TaskScheduler,
            //we intentionally ignore this value
            //BoundedCapacity = opt.BoundedCapacity
        };
        _maxQueueSize = opt.BoundedCapacity;

        _actionBlock = new ActionBlock<CancellableMessage<T>>(cmsg =>
        {
            // The message is finished either way (skipped or processed), so release its token source.
            using (cmsg)
            {
                lock (_syncRoot)
                {
                    // Dropped while waiting: Post already removed it from _pending.
                    if (cmsg.CancellationTokenSource.IsCancellationRequested)
                    {
                        return;
                    }

                    // Claim the message under the lock so that Post can no longer drop it.
                    _pending.Remove(cmsg);
                }

                act(cmsg.Message);
            }
        }, options);
    }

    /// <summary>
    /// Queues <paramref name="msg"/>, dropping the oldest waiting messages when more than
    /// the configured number would be waiting.
    /// </summary>
    /// <returns>False when the inner block declined the message.</returns>
    public bool Post(T msg)
    {
        var fullMsg = new CancellableMessage<T>(msg);
        lock (_syncRoot)
        {
            // Register before posting so the action always finds the message in _pending.
            var node = _pending.AddLast(fullMsg);
            if (!_actionBlock.Post(fullMsg))
            {
                _pending.Remove(node);
                fullMsg.Dispose();
                return false;
            }

            if (_maxQueueSize != DataflowBlockOptions.Unbounded)
            {
                while (_pending.Count > _maxQueueSize)
                {
                    // Cancel exactly the message that is removed from the bookkeeping.
                    _pending.First.Value.CancellationTokenSource.Cancel();
                    _pending.RemoveFirst();
                }
            }

            return true;
        }
    }
}
And
/// <summary>
/// Pairs a message with its own <c>CancellationTokenSource</c>.
/// </summary>
class CancellableMessage<T> : IDisposable
{
    /// <summary>Wraps <paramref name="msg"/> together with a fresh token source.</summary>
    public CancellableMessage(T msg)
    {
        Message = msg;
        CancellationTokenSource = new CancellationTokenSource();
    }

    /// <summary>The wrapped payload.</summary>
    public T Message { get; set; }

    /// <summary>Token source associated with this message.</summary>
    public CancellationTokenSource CancellationTokenSource { get; set; }

    /// <summary>Disposes the token source, if one is set.</summary>
    public void Dispose() => CancellationTokenSource?.Dispose();
}
While this works and actually does the job this implementation looks dirty, also possibly not thread safe.
Here is a TransformBlock and ActionBlock implementation that drops the oldest messages in its queue, whenever newer messages are received and the BoundedCapacity limit has been reached. It behaves quite similar to a Channel configured with BoundedChannelFullMode.DropOldest.
/// <summary>
/// Builds a propagator block that runs <paramref name="transform"/> on posted items and,
/// when the internal queue already holds BoundedCapacity items, discards the oldest queued
/// item to make room for the newest one.
/// </summary>
/// <param name="transform">Asynchronous transformation applied to each non-dropped item.</param>
/// <param name="dataflowBlockOptions">Options for the internal processing block; a new
/// ExecutionDataflowBlockOptions is used when null. Its BoundedCapacity is temporarily
/// doubled and then restored inside this method.</param>
/// <param name="droppedMessages">Optional receiver that is notified of every dropped item.</param>
/// <returns>The input and output blocks encapsulated as a single propagator.</returns>
/// <exception cref="ArgumentNullException"><paramref name="transform"/> is null.</exception>
public static IPropagatorBlock<TInput, TOutput>
CreateTransformBlockDropOldest<TInput, TOutput>(
Func<TInput, Task<TOutput>> transform,
ExecutionDataflowBlockOptions dataflowBlockOptions = null,
IProgress<TInput> droppedMessages = null)
{
if (transform == null) throw new ArgumentNullException(nameof(transform));
dataflowBlockOptions = dataflowBlockOptions ?? new ExecutionDataflowBlockOptions();
var boundedCapacity = dataflowBlockOptions.BoundedCapacity;
var cancellationToken = dataflowBlockOptions.CancellationToken;
// Items waiting to be transformed; every access is guarded by a lock on the queue itself.
var queue = new Queue<TInput>(Math.Max(0, boundedCapacity));
// Holds transformed results, bounded by the caller's original capacity.
var outputBlock = new BufferBlock<TOutput>(new DataflowBlockOptions()
{
BoundedCapacity = boundedCapacity,
CancellationToken = cancellationToken
});
// Temporarily mutate the caller's options (restored right after transformBlock is created).
// checked(...) makes an overflow of the doubled capacity throw instead of wrapping.
if (boundedCapacity != DataflowBlockOptions.Unbounded)
dataflowBlockOptions.BoundedCapacity = checked(boundedCapacity * 2);
// After testing, at least boundedCapacity + 1 is required.
// Make it double to be sure that all non-dropped messages will be processed.
// The posted value is only a token: the argument is ignored and the real item is taken from the queue.
var transformBlock = new ActionBlock<object>(async _ =>
{
TInput item;
lock (queue)
{
// The item this token was posted for may have been dropped in the meantime.
if (queue.Count == 0) return;
item = queue.Dequeue();
}
var result = await transform(item).ConfigureAwait(false);
await outputBlock.SendAsync(result, cancellationToken).ConfigureAwait(false);
}, dataflowBlockOptions);
dataflowBlockOptions.BoundedCapacity = boundedCapacity; // Restore initial value
// Accepts items from the outside; it inherits only the cancellation token, not the bounded capacity.
var inputBlock = new ActionBlock<TInput>(item =>
{
var droppedEntry = (Exists: false, Item: (TInput)default);
lock (queue)
{
// Post the token first, then make room by evicting the oldest queued item if the queue is full.
transformBlock.Post(null);
if (queue.Count == boundedCapacity) droppedEntry = (true, queue.Dequeue());
queue.Enqueue(item);
}
// The notification is raised outside the lock.
if (droppedEntry.Exists) droppedMessages?.Report(droppedEntry.Item);
}, new ExecutionDataflowBlockOptions()
{
CancellationToken = cancellationToken
});
// inputBlock -> transformBlock -> outputBlock for completion; a transformBlock fault is also pushed back to inputBlock.
PropagateCompletion(inputBlock, transformBlock);
PropagateFailure(transformBlock, inputBlock);
PropagateCompletion(transformBlock, outputBlock);
// Discard whatever is still queued once the transform block has finished.
_ = transformBlock.Completion.ContinueWith(_ => { lock (queue) queue.Clear(); },
TaskScheduler.Default);
return DataflowBlock.Encapsulate(inputBlock, outputBlock);
// Waits for the source to finish (ignoring its exception), then faults or completes the target accordingly.
async void PropagateCompletion(IDataflowBlock source, IDataflowBlock target)
{
try { await source.Completion.ConfigureAwait(false); } catch { }
var exception = source.Completion.IsFaulted ? source.Completion.Exception : null;
if (exception != null) target.Fault(exception); else target.Complete();
}
// Like PropagateCompletion, but acts only when the source has faulted.
async void PropagateFailure(IDataflowBlock source, IDataflowBlock target)
{
try { await source.Completion.ConfigureAwait(false); } catch { }
if (source.Completion.IsFaulted) target.Fault(source.Completion.Exception);
}
}
/// <summary>
/// Overload of <c>CreateTransformBlockDropOldest</c> that accepts a synchronous
/// transform. The delegate is wrapped with <see cref="Task.FromResult{TResult}(TResult)"/>
/// and forwarded to the asynchronous overload.
/// </summary>
/// <param name="transform">Synchronous function applied to each non-dropped item.</param>
/// <param name="dataflowBlockOptions">Options forwarded unchanged to the asynchronous overload.</param>
/// <param name="droppedMessages">Optional sink forwarded unchanged; it is notified for each dropped item.</param>
/// <returns>The propagator block created by the asynchronous overload.</returns>
/// <exception cref="ArgumentNullException"><paramref name="transform"/> is null.</exception>
public static IPropagatorBlock<TInput, TOutput>
    CreateTransformBlockDropOldest<TInput, TOutput>(
        Func<TInput, TOutput> transform,
        ExecutionDataflowBlockOptions dataflowBlockOptions = null,
        IProgress<TInput> droppedMessages = null)
{
    // The wrapper lambda below is never null, so a null transform would otherwise
    // slip through and only fail when the first item is processed.
    if (transform == null) throw new ArgumentNullException(nameof(transform));
    return CreateTransformBlockDropOldest(item => Task.FromResult(transform(item)),
        dataflowBlockOptions, droppedMessages);
}
/// <summary>
/// Target-only variant of the drop-oldest block: runs an asynchronous
/// <paramref name="action"/> per item and produces no usable output.
/// </summary>
/// <param name="action">Asynchronous callback invoked for each non-dropped item.</param>
/// <param name="dataflowBlockOptions">Options passed through to <c>CreateTransformBlockDropOldest</c>.</param>
/// <param name="droppedMessages">Optional sink passed through to <c>CreateTransformBlockDropOldest</c>.</param>
/// <returns>The underlying propagator block, exposed as a target.</returns>
/// <exception cref="ArgumentNullException"><paramref name="action"/> is null.</exception>
public static ITargetBlock<TInput>
    CreateActionBlockDropOldest<TInput>(
        Func<TInput, Task> action,
        ExecutionDataflowBlockOptions dataflowBlockOptions = null,
        IProgress<TInput> droppedMessages = null)
{
    if (action == null)
    {
        throw new ArgumentNullException(nameof(action));
    }

    // Reuse the transform flavour with a throw-away object result per item.
    var propagator = CreateTransformBlockDropOldest<TInput, object>(
        async input =>
        {
            await action(input).ConfigureAwait(false);
            return null;
        },
        dataflowBlockOptions,
        droppedMessages);

    // Send the placeholder results to a null target so they are consumed.
    var sink = DataflowBlock.NullTarget<object>();
    propagator.LinkTo(sink);
    return propagator;
}
/// <summary>
/// Overload of <c>CreateActionBlockDropOldest</c> that accepts a synchronous
/// <paramref name="action"/>. The delegate is wrapped in a lambda that returns
/// <see cref="Task.CompletedTask"/> and forwarded to the asynchronous overload.
/// </summary>
/// <param name="action">Synchronous callback invoked for each non-dropped item.</param>
/// <param name="dataflowBlockOptions">Options forwarded unchanged to the asynchronous overload.</param>
/// <param name="droppedMessages">Optional sink forwarded unchanged to the asynchronous overload.</param>
/// <returns>The target block created by the asynchronous overload.</returns>
/// <exception cref="ArgumentNullException"><paramref name="action"/> is null.</exception>
public static ITargetBlock<TInput>
    CreateActionBlockDropOldest<TInput>(
        Action<TInput> action,
        ExecutionDataflowBlockOptions dataflowBlockOptions = null,
        IProgress<TInput> droppedMessages = null)
{
    // The wrapper lambda is never null, so the callee's own null check cannot
    // catch a null action; validate here instead.
    if (action == null) throw new ArgumentNullException(nameof(action));
    return CreateActionBlockDropOldest(
        item => { action(item); return Task.CompletedTask; },
        dataflowBlockOptions, droppedMessages);
}
The idea is to store the queued items in an auxiliary Queue, and to pass dummy (null) values to an internal ActionBlock<object>. That block ignores the values passed as arguments and instead takes an item from the queue, if there is one. A lock is used to ensure that all non-dropped items in the queue will eventually be processed (unless, of course, an exception occurs).
There is also an extra feature: an optional IProgress<TInput> droppedMessages argument allows you to receive a notification every time a message is dropped.
Usage example:
// Block options: a bounded capacity of 2.
var executionOptions = new ExecutionDataflowBlockOptions { BoundedCapacity = 2 };

// Receives every message the block reports as dropped.
var dropReporter = new Progress<string>(dropped =>
    Console.WriteLine($"Message dropped: {dropped}"));

// The handler is deliberately slow (5 seconds per message).
_messagingActionBlock = CreateActionBlockDropOldest<string>(message =>
{
    Console.WriteLine($"Processing: {message}");
    Thread.Sleep(5000);
}, executionOptions, dropReporter);
TPL Dataflow doesn't fit well with a "last N messages" requirement, as it is meant to be a queue or pipeline (FIFO), not a stack (LIFO). Do you really need to do this with a dataflow library?
It's much easier with ConcurrentStack<T>: you introduce one producer task, which pushes messages onto the stack, and one consumer task, which pops messages from the stack while the number of handled messages is less than N (see more about the Producer-Consumer pattern).
If you need TPL Dataflow, you can use it in the consumer task to start handling the latest messages, but not in the producer, as that is really not the way it was meant to be used. Moreover, there are other libraries with an event-based architecture that may be a more natural fit for your problem.
I have a Dataflow pipeline consisting of several blocks.
When elements are flowing through my processing pipeline, I want to group them by field A. To do this I have a BatchBlock with high BoundedCapacity. In it I store my elements until I decide that they should be released. So I invoke TriggerBatch() method.
// Forwards one element to GroupingBlock. When the element should start a new
// group, a batch is triggered before the element is sent.
private void Forward(TStronglyTyped data)
{
    bool startsNewGroup = ShouldCreateNewGroup(data);
    if (startsNewGroup)
    {
        GroupingBlock.TriggerBatch();
    }

    // Blocks for up to SendTimeout; the result of Wait is not inspected.
    var pendingSend = GroupingBlock.SendAsync(data);
    pendingSend.Wait(SendTimeout);
}
This is how it looks.
The problem is that the batch produced sometimes contains the next posted element, which shouldn't be there.
To illustrate:
BatchBlock.InputQueue = {A,A,A}
NextElement = B //we should trigger a Batch!
BatchBlock.TriggerBatch()
BatchBlock.SendAsync(B);
At this point I expect my batch to be {A,A,A}, but it is {A,A,A,B}.
It is as if TriggerBatch() were asynchronous, and SendAsync were in fact executed before the batch was actually made.
How can I solve this?
I obviously don't want to put Task.Wait(x) in there (I tried, and it works, but then performance is poor, of course).
I also encountered this issue by trying to call TriggerBatch in the wrong place. As mentioned, the SlidingWindow example using DataflowBlock.Encapsulate is the answer here, but it took some time to adapt so I thought I'd share my completed block.
My ConditionalBatchBlock creates batches up to a maximum size, possibly sooner if a certain condition is met. In my specific scenario I needed to create batches of 100, but always create new batches when certain changes in the data were detected.
/// <summary>
/// Creates a propagator block that groups incoming items into arrays. A batch is
/// emitted when <paramref name="condition"/> returns true for the incoming item
/// (before that item is queued), when the queue reaches
/// <paramref name="batchSize"/> items, and once more for any remaining items
/// after the input side completes.
/// </summary>
/// <param name="batchSize">
/// Maximum number of items per batch. Values below 1 never match the queue
/// count, so they disable size-based batching.
/// </param>
/// <param name="condition">
/// Called for every item with the currently queued items and the incoming item;
/// returning true closes the current batch before the item is queued.
/// </param>
/// <returns>A block that accepts <typeparamref name="T"/> items and offers <c>T[]</c> batches.</returns>
/// <exception cref="ArgumentNullException"><paramref name="condition"/> is null.</exception>
public static IPropagatorBlock<T, T[]> CreateConditionalBatchBlock<T>(int batchSize, Func<Queue<T>, T, bool> condition)
{
    if (condition == null) throw new ArgumentNullException(nameof(condition));

    var queue = new Queue<T>();
    var source = new BufferBlock<T[]>();

    // Emits the queued items as one batch; an empty queue produces no batch.
    async Task FlushAsync()
    {
        if (queue.Count == 0) return;
        await source.SendAsync(queue.ToArray()).ConfigureAwait(false);
        queue.Clear();
    }

    var target = new ActionBlock<T>(async item =>
    {
        // Start a new batch if required by the condition.
        if (condition(queue, item))
        {
            await FlushAsync().ConfigureAwait(false);
        }
        queue.Enqueue(item);
        // Always send a batch when the max size has been reached.
        if (queue.Count == batchSize)
        {
            await FlushAsync().ConfigureAwait(false);
        }
    });

    // When the input side finishes, either forward its failure or flush the
    // remaining items and complete the output.
    _ = target.Completion.ContinueWith(async t =>
    {
        if (t.IsFaulted)
        {
            // Pass a single inner exception directly to avoid an extra AggregateException layer.
            Exception error = t.Exception.InnerExceptions.Count == 1
                ? t.Exception.InnerException
                : t.Exception;
            ((IDataflowBlock)source).Fault(error);
            return;
        }
        await FlushAsync().ConfigureAwait(false);
        source.Complete();
    }, TaskScheduler.Default);

    return DataflowBlock.Encapsulate(target, source);
}
The condition parameter may be simpler in your case. I needed to look at the queue as well as the current item to make the determination whether to create a new batch.
I used it like this:
// Example wiring: a conditional batcher (max size 100) feeding an ActionBlock
// that handles each batch, with completion propagated along the link.
public async Task RunExampleAsync<T>()
{
    var batcher = CreateConditionalBatchBlock<T>(
        100,
        (pendingItems, incoming) => ShouldCreateNewBatch(pendingItems, incoming));
    var batchHandler = new ActionBlock<T[]>(async batch => await PerformActionAsync(batch));

    var linkOptions = new DataflowLinkOptions { PropagateCompletion = true };
    batcher.LinkTo(batchHandler, linkOptions);

    // NOTE(review): ReadDataAsync presumably feeds the batcher and completes it; confirm in its definition.
    await ReadDataAsync<T>(batcher);
    await batchHandler.Completion;
}
Here is a specialized version of Loren Paulsen's CreateConditionalBatchBlock method. This one accepts a Func<TItem, TKey> keySelector argument, and emits a new batch every time an item with different key is received.
/// <summary>
/// Creates a propagator block that groups consecutive items sharing the same key
/// into arrays. A batch is emitted whenever an item with a different key arrives,
/// whenever the pending batch reaches <paramref name="maxBatchSize"/> items, and
/// once more for leftover items after the input side completes.
/// </summary>
/// <param name="keySelector">Extracts the grouping key from each item.</param>
/// <param name="dataflowBlockOptions">
/// Optional options; BoundedCapacity, CancellationToken, MaxMessagesPerTask and
/// TaskScheduler are copied and applied to both internal blocks.
/// </param>
/// <param name="maxBatchSize">Maximum batch length, or <c>DataflowBlockOptions.Unbounded</c> for no limit.</param>
/// <param name="keyComparer">Key equality comparer; defaults to <c>EqualityComparer&lt;TKey&gt;.Default</c>.</param>
/// <exception cref="ArgumentNullException"><paramref name="keySelector"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="maxBatchSize"/> is below 1 and is not <c>DataflowBlockOptions.Unbounded</c>.
/// </exception>
public static IPropagatorBlock<TItem, TItem[]> CreateConditionalBatchBlock<TItem, TKey>(
    Func<TItem, TKey> keySelector,
    DataflowBlockOptions dataflowBlockOptions = null,
    int maxBatchSize = DataflowBlockOptions.Unbounded,
    IEqualityComparer<TKey> keyComparer = null)
{
    if (keySelector == null)
    {
        throw new ArgumentNullException(nameof(keySelector));
    }
    if (maxBatchSize != DataflowBlockOptions.Unbounded && maxBatchSize < 1)
    {
        throw new ArgumentOutOfRangeException(nameof(maxBatchSize));
    }
    if (keyComparer == null)
    {
        keyComparer = EqualityComparer<TKey>.Default;
    }

    // Copy the supported settings into options usable by both internal blocks.
    var blockOptions = new ExecutionDataflowBlockOptions();
    if (dataflowBlockOptions != null)
    {
        blockOptions.BoundedCapacity = dataflowBlockOptions.BoundedCapacity;
        blockOptions.CancellationToken = dataflowBlockOptions.CancellationToken;
        blockOptions.MaxMessagesPerTask = dataflowBlockOptions.MaxMessagesPerTask;
        blockOptions.TaskScheduler = dataflowBlockOptions.TaskScheduler;
    }

    var batches = new BufferBlock<TItem[]>(blockOptions);
    var pending = new Queue<TItem>(); // Used without locking
    TKey lastKey = default;

    // Sends the pending items downstream as one array, then empties the queue.
    async Task FlushAsync()
    {
        await batches.SendAsync(pending.ToArray()).ConfigureAwait(false);
        pending.Clear();
    }

    var receiver = new ActionBlock<TItem>(async item =>
    {
        var currentKey = keySelector(item);
        bool keyChanged = pending.Count > 0 && !keyComparer.Equals(currentKey, lastKey);
        if (keyChanged)
        {
            await FlushAsync().ConfigureAwait(false);
        }

        pending.Enqueue(item);
        lastKey = currentKey;

        bool batchIsFull = pending.Count == maxBatchSize;
        if (batchIsFull)
        {
            await FlushAsync().ConfigureAwait(false);
        }
    }, blockOptions);

    // After the input side finishes: emit leftovers, then fault or complete the output.
    _ = receiver.Completion.ContinueWith(async antecedent =>
    {
        if (pending.Count > 0)
        {
            await FlushAsync().ConfigureAwait(false);
        }

        if (!antecedent.IsFaulted)
        {
            batches.Complete();
        }
        else
        {
            ((IDataflowBlock)batches).Fault(antecedent.Exception.InnerException);
        }
    }, TaskScheduler.Default);

    return DataflowBlock.Encapsulate(receiver, batches);
}