Calculate max on a sliding window for TimeSeries - C#

Input:
public class MyObject
{
public double Value { get; set; }
public DateTime Date { get; set; }
}
Method to generate test objects:
public static MyObject[] GetTestObjects()
{
var rnd = new Random();
var date = new DateTime(2021, 1, 1, 0, 0, 0);
var result = new List<MyObject>();
for (int i = 0; i < 50000; i++)
{
//this is to simulate real data having gaps
if (rnd.Next(100) < 25)
{
continue;
}
var myObject = new MyObject()
{
Value = rnd.NextDouble(),
Date = date.AddMinutes(15 * i)
};
result.Add(myObject);
}
return result.ToArray();
}
Given this, I need to calculate the maximum Value over the previous 12 months for each MyObject. I could just do this in parallel, but maybe there is a more optimized solution?
Sorry for being unclear, this is what I use right now to get what I want:
public MyObject[] BruteForceBackward(MyObject[] testData)
{
return testData.AsParallel().Select(point =>
{
var max = testData.Where(x => x.Date <= point.Date && x.Date >= point.Date.AddYears(-1)).Max(x => x.Value);
return new MyObject() { Date = point.Date, Value = point.Value / max };
}).OrderBy(r => r.Date).ToArray();
}
This works, but it is slow and eats processor resources (imagine you have 100k objects). I believe there must be something better.

I had a similar project where I had to calculate such stuff on tons of sensor data.
You can now find a slightly more refined version in my GitHub repository, which should be ready to use (.NET):
https://github.com/forReason/Statistics-Helper-Library
In general you want to reduce the number of loops going over all your data. At best, you want to touch each element only a single time.
Process Array (equiv. of BruteForceBackward)
public static MyObject[] FlowThroughForward(ref MyObject[] testData)
{
// generate return array
MyObject[] returnData = new MyObject[testData.Length];
// keep track to minimize processing
double currentMaximum = 0;
List<MyObject> maximumValues = new List<MyObject>();
// go through the elements
for (int i = 0; i < testData.Length; i++)
{
// calculate the oldest date to keep in tracking list
DateTime targetDate = testData[i].Date.AddYears(-1);
// maximum logic
if (testData[i].Value >= currentMaximum)
{
// new maximum found, clear tracking list
// this is the best case scenario
maximumValues.Clear();
currentMaximum = testData[i].Value;
}
else
{
// unfortunately, no new maximum was found
// go backwards the maximum tracking list and check for smaller values
// clear the list of all smaller values. The list should therefore always
// be in descending order
for (int b = maximumValues.Count - 1; b >= 0; b--)
{
if (maximumValues[b].Value <= testData[i].Value)
{
// a lower value has been found. We have a newer, higher value
// clear this waste value from the tracking list
maximumValues.RemoveAt(b);
}
else
{
// there are no more lower values.
// stop looking for smaller values to save time
break;
}
}
}
// append new value to tracking list, no matter if higher or lower
// all future values might be lower
maximumValues.Add(testData[i]);
// check if the oldest value is too old to be kept in the tracking list
while (maximumValues[0].Date < targetDate)
{
// oldest value is to be removed
maximumValues.RemoveAt(0);
// update maximum
currentMaximum = maximumValues[0].Value;
}
// add object to result list
returnData[i] = new MyObject() { Date = testData[i].Date, Value = testData[i].Value / currentMaximum };
}
return returnData;
}
Real Time Data or Streamed Data
Note: If you have really large lists, you might get memory issues with your approach to pass a full array. In this case: pass one value at a time, pass them from oldest value to newest value. Store the values back one at a time.
This function can also be used on real-time data.
The test method is included in the code below.
static void Main(string[] args)
{
int length = 50000;
Stopwatch stopWatch1 = new Stopwatch();
stopWatch1.Start();
var result = new List<MyObject>();
var date = new DateTime(2021, 1, 1, 0, 0, 0);
for (int i = 0; i < length; i++)
{
//this is to simulate real data having gaps
if (rnd.Next(100) < 25)
{
continue;
}
// create a fresh instance each iteration; the sliding-window list stores references
var myObject = new MyObject()
{
Value = rnd.NextDouble(),
Date = date.AddMinutes(15 * i)
};
result.Add(CalculateNextObject(ref myObject));
}
stopWatch1.Stop();
Console.WriteLine("test code executed in " + stopWatch1.ElapsedMilliseconds + " ms");
Thread.Sleep(1000000);
}
private static Random rnd = new Random();
private static double currentMaximum = 0;
private static List<MyObject> maximumValues = new List<MyObject>();
public static MyObject CalculateNextObject(ref MyObject input)
{
// calculate the oldest date to keep in tracking list
DateTime targetDate = input.Date.AddYears(-1);
// maximum logic
if (input.Value >= currentMaximum)
{
// new maximum found, clear tracking list
// this is the best case scenario
maximumValues.Clear();
currentMaximum = input.Value;
}
else
{
// unfortunately, no new maximum was found
// go backwards the maximum tracking list and check for smaller values
// clear the list of all smaller values. The list should therefore always
// be in descending order
for (int b = maximumValues.Count - 1; b >= 0; b--)
{
if (maximumValues[b].Value <= input.Value)
{
// a lower value has been found. We have a newer, higher value
// clear this waste value from the tracking list
maximumValues.RemoveAt(b);
}
else
{
// there are no more lower values.
// stop looking for smaller values to save time
break;
}
}
}
// append new value to tracking list, no matter if higher or lower
// all future values might be lower
maximumValues.Add(input);
// check if the oldest value is too old to be kept in the tracking list
while (maximumValues[0].Date < targetDate)
{
// oldest value is to be removed
maximumValues.RemoveAt(0);
// update maximum
currentMaximum = maximumValues[0].Value;
}
// add object to result list
MyObject returnData = new MyObject() { Date = input.Date, Value = input.Value / currentMaximum };
return returnData;
}
Test Method
static void Main(string[] args)
{
MyObject[] testData = GetTestObjects();
Stopwatch stopWatch1 = new Stopwatch();
Stopwatch stopWatch2 = new Stopwatch();
stopWatch1.Start();
MyObject[] testresults1 = BruteForceBackward(testData);
stopWatch1.Stop();
Console.WriteLine("BruteForceBackward executed in " + stopWatch1.ElapsedMilliseconds + " ms");
stopWatch2.Start();
MyObject[] testresults2 = FlowThroughForward(ref testData);
stopWatch2.Stop();
Console.WriteLine("FlowThroughForward executed in " + stopWatch2.ElapsedMilliseconds + " ms");
Console.WriteLine();
Console.WriteLine("Comparing some random test results: ");
var rnd = new Random();
for (int i = 0; i < 10; i++)
{
int index = rnd.Next(0, testData.Length);
Console.WriteLine("Index: " + index + " brute: " + testresults1[index].Value + " flow: " + testresults2[index].Value);
}
Thread.Sleep(1000000);
}
Test result
Tests were performed on a machine with 32 cores, so in theory the multithreaded approach should be at an advantage, but you'll see ;)
Function             Time      Time %
BruteForceBackward   5334 ms   99.9%
FlowThroughForward   5 ms      0.094%
Performance improvement factor: ~1000x
console output with data validation:
BruteForceBackward executed in 5264 ms
FlowThroughForward executed in 5 ms
Comparing some random test results:
Index: 25291 brute: 0.989688139105413 flow: 0.989688139105413
Index: 11945 brute: 0.59670821976193 flow: 0.59670821976193
Index: 30282 brute: 0.413238225210297 flow: 0.413238225210297
Index: 33898 brute: 0.38258761939139 flow: 0.38258761939139
Index: 8824 brute: 0.833512217105447 flow: 0.833512217105447
Index: 22092 brute: 0.648052464067263 flow: 0.648052464067263
Index: 24633 brute: 0.35859417692481 flow: 0.35859417692481
Index: 24061 brute: 0.540642018793402 flow: 0.540642018793402
Index: 34219 brute: 0.498785766613022 flow: 0.498785766613022
Index: 2396 brute: 0.151471808392111 flow: 0.151471808392111
CPU usage was a lot higher for BruteForceBackward due to parallelisation.
The worst-case scenario is long periods of decreasing values. The code can still be vastly optimized, but I guess this should be sufficient. For further optimisation, one might look at reducing the element shifting caused by removing/adding entries in maximumValues.
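One way to attack that last point (a sketch, assuming the same MyObject type; not part of the code above) is to replace the List with a real double-ended queue, so removals at both ends stop shifting the remaining elements. LinkedList<MyObject> works as a stand-in; the window logic is unchanged:
public static MyObject[] FlowThroughForwardDeque(MyObject[] testData)
{
    MyObject[] returnData = new MyObject[testData.Length];
    // front holds the current maximum; values are kept in descending order
    var window = new LinkedList<MyObject>();
    for (int i = 0; i < testData.Length; i++)
    {
        DateTime targetDate = testData[i].Date.AddYears(-1);
        // drop smaller values from the back, O(1) per removal
        while (window.Count > 0 && window.Last.Value.Value <= testData[i].Value)
            window.RemoveLast();
        window.AddLast(testData[i]);
        // drop expired values from the front, O(1) per removal
        while (window.First.Value.Date < targetDate)
            window.RemoveFirst();
        returnData[i] = new MyObject() { Date = testData[i].Date, Value = testData[i].Value / window.First.Value.Value };
    }
    return returnData;
}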

An interesting and challenging problem. I put together a solution using a dynamic programming approach (first learned in my CS algorithms class back in '78). First, a tree is constructed containing pre-calculated local max values over recursively defined ranges. Once constructed, the max value for an arbitrary range can be efficiently calculated mostly from the pre-calculated values. Only at the fringes of the range does the calculation drop down to the element level.
It is not as fast as julian bechtold's FlowThroughForward method, but random access to ranges may be a plus.
Code to add to Main:
Console.WriteLine();
Stopwatch stopWatch3 = new Stopwatch();
stopWatch3.Start();
MyObject[] testresults3 = RangeTreeCalculation(ref testData, 10);
stopWatch3.Stop();
Console.WriteLine($"RangeTreeCalculation executed in {stopWatch3.ElapsedMilliseconds} ms");
... test comparison
Console.WriteLine($"Index: {index} brute: {testresults1[index].Value} flow: {testresults2[index].Value} rangeTree: {testresults3[index].Value}");
Test function:
public static MyObject[] RangeTreeCalculation(ref MyObject[] testDataArray, int partitionThreshold)
{
// For this implementation, we convert the array to a List, because we need a
// reference type collection that can be shared by all nodes of the tree.
List<MyObject> testDataList = testDataArray.ToList();
// Construct a tree containing recursive collections of pre-calculated values
var rangeTree = new RangeTree(testDataList, partitionThreshold);
MyObject[] result = new MyObject[testDataList.Count];
Parallel.ForEach(testDataList, (item, state, i) =>
{
var max = rangeTree.MaxForDateRange(item.Date.AddYears(-1), item.Date);
result[i] = new MyObject() { Date = item.Date, Value = item.Value / max };
});
return result;
}
Supporting class:
// Class used to divide and conquer using dynamic programming.
public class RangeTree
{
public List<MyObject> Data; // This reference is shared by all members of the tree
public int Start { get; } // Index of first element covered by this node.
public int Count { get; } // Number of elements covered by this node.
public DateTime FirstDateTime { get; }
public DateTime LastDateTime { get; }
public double MaxValue { get; } // Pre-calculated max for all elements covered by this node.
List<RangeTree> ChildRanges { get; }
// Top level node constructor
public RangeTree(List<MyObject> data, int partitionThreshold)
: this(data, 0, data.Count, partitionThreshold)
{
}
// Child node constructor, which covers a recursively decreasing range of elements.
public RangeTree(List<MyObject> data, int start, int count, int partitionThreshold)
{
Data = data;
Start = start;
Count = count;
FirstDateTime = Data[Start].Date;
LastDateTime = Data[Start + Count - 1].Date;
if (count <= partitionThreshold)
{
// If the range is smaller than the threshold, just calculate the local max
// directly from the items. No child ranges are defined.
MaxValue = Enumerable.Range(Start, Count).Select(i => Data[i].Value).Max();
}
else
{
// We still have a significant range. Decide how to further divide them up into sub-ranges.
// (There may be room for improvement here to better balance the tree.)
int partitionSize = (count - 1) / partitionThreshold + 1;
int partitionCount = (count - 1) / partitionSize + 1;
if (count < partitionThreshold * partitionThreshold)
{
// When one away from leaf nodes, prefer fewer full leaf nodes over more
// less populated leaf nodes.
partitionCount = (count - 1) / partitionThreshold + 1;
partitionSize = (count - 1) / partitionCount + 1;
}
ChildRanges = Enumerable.Range(0, partitionCount)
.Select(partitionNum => new {
ChildStart = Start + partitionNum * partitionSize,
ChildCount = Math.Min(partitionSize, Count - partitionNum * partitionSize)
})
.Where(part => part.ChildCount > 0) // Defensive
.Select(part => new RangeTree(Data, part.ChildStart, part.ChildCount, partitionThreshold))
.ToList();
// Now is the dynamic programming part:
// Calculate the local max as the max of all child max values.
MaxValue = ChildRanges.Max(child => child.MaxValue);
}
}
// Get the max value for a given range of dates within this rangeTree node.
// This uses the precalculated values as much as possible.
// Only at the fringes of the date range do we calculate at the element level.
public double MaxForDateRange(DateTime fromDate, DateTime thruDate)
{
double calculatedMax = Double.MinValue;
if (fromDate > this.LastDateTime || thruDate < this.FirstDateTime)
{
// Entire range is excluded. Nothing of interest here folks.
calculatedMax = Double.MinValue;
}
else if (fromDate <= this.FirstDateTime && thruDate >= this.LastDateTime)
{
// Entire range is included. Use the already-calculated max.
calculatedMax = this.MaxValue;
}
else if (ChildRanges != null)
{
// We have child ranges. Recurse and accumulate.
// Possible optimization: Calculate max for middle ranges first, and only bother
// with extreme partial ranges if their local max values exceed the preliminary result.
for (int i = 0; i < ChildRanges.Count; ++i)
{
double childMax = ChildRanges[i].MaxForDateRange(fromDate, thruDate);
if (childMax > calculatedMax)
{
calculatedMax = childMax;
}
}
}
else
{
// Leaf range. Loop through just this limited range of elements, checking individually for
// date in range and accumulating the result.
for (int i = 0; i < this.Count; ++i)
{
var element = Data[this.Start + i];
if (fromDate <= element.Date && element.Date <= thruDate && element.Value > calculatedMax)
{
calculatedMax = element.Value;
}
}
}
return calculatedMax;
}
}
There's plenty of room for improvement, such as parameterizing the types and generalizing the functionality to support more than just Max(Value), but the framework is there.
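As a sketch of that parameterization (hypothetical names, not from the answer above), the aggregate can be injected as delegates so the same tree shape could pre-compute Min, Sum, or anything associative instead of just Max:
using System;
using System.Collections.Generic;
using System.Linq;

public static class RangeAggregate
{
    // 'selector' extracts the aggregated value from an item; 'combine' folds two
    // aggregates into one (Math.Max recovers the behavior of the RangeTree above).
    public static TValue Aggregate<TItem, TValue>(IEnumerable<TItem> items,
        Func<TItem, TValue> selector, Func<TValue, TValue, TValue> combine)
    {
        return items.Select(selector).Aggregate(combine);
    }
}
// e.g. RangeAggregate.Aggregate(testDataList, x => x.Value, Math.Max);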

Assuming you meant you need the maximum Value for each of the last 12 months from result, then you can use LINQ:
var beginDateTime = DateTime.Now.AddMonths(-12);
var ans = result.Where(r => r.Date >= beginDateTime).GroupBy(r => r.Date.Month).Select(mg => mg.MaxBy(r => r.Value)).ToList();
Running some timings, I get that putting AsParallel after result changes the run time from around 16 ms (first run) to around 32 ms, so it is actually slower. It is about the same after the Where, and about 23 ms after the GroupBy (processing the 12 groups in parallel). On my PC at least, there isn't enough data or complex enough work for parallelism to pay off, and the GroupBy isn't the most efficient approach anyway.
Using an array and testing each element, I get the results in about 1.2ms:
var maxMOs = new MyObject[12];
foreach (var r in result.Where(r => r.Date >= beginDateTime)) {
var monthIndex = r.Date.Month-1;
if (maxMOs[monthIndex] == null || r.Value > maxMOs[monthIndex].Value)
maxMOs[monthIndex] = r;
}
Note that the results are not chronological; you could offset monthIndex by today's month to order the results if desired.
var maxMOs = new MyObject[12];
var offset = DateTime.Now.Month-11;
foreach (var r in result.Where(r => r.Date >= beginDateTime)) {
var monthIndex = (r.Date.Month-offset) % 12; // wrap months that belong to the previous year
if (maxMOs[monthIndex] == null || r.Value > maxMOs[monthIndex].Value)
maxMOs[monthIndex] = r;
}
A micro-optimization (mostly useful on repeat runnings) is to invert the test and use the null-propagating operator:
if (!(r.Value <= maxMOs[monthIndex]?.Value))
This saves about 0.2ms on the first run but up to 0.5ms on subsequent runs.
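For context, the inverted test works because a comparison against null evaluates to false, so the null check and the value check collapse into one expression. A sketch combining it with the offset version above:
var maxMOs = new MyObject[12];
var offset = DateTime.Now.Month-11;
foreach (var r in result.Where(r => r.Date >= beginDateTime)) {
    var monthIndex = (r.Date.Month-offset) % 12;
    if (!(r.Value <= maxMOs[monthIndex]?.Value))
        maxMOs[monthIndex] = r;
}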

Here is a solution similar to julian bechtold's answer. The difference is that the maximum (and all related variables) is kept hidden away from the main implementation, in a separate class whose sole purpose is to keep track of the maximum over the past year. The algorithm is the same; I just use a few LINQ expressions here and there.
We keep track of the maximum in the following class:
public class MaxSlidingWindow
{
private readonly List<MyObject> _maximumValues;
private double _max;
public MaxSlidingWindow()
{
_maximumValues = new List<MyObject>();
_max = double.NegativeInfinity;
}
public double Max => _max;
public void Add(MyObject myObject)
{
if (myObject.Value >= _max)
{
_maximumValues.Clear();
_max = myObject.Value;
}
else
{
RemoveValuesSmallerThan(myObject.Value);
}
_maximumValues.Add(myObject);
RemoveObservationsBefore(myObject.Date.AddYears(-1));
_max = _maximumValues[0].Value;
}
private void RemoveObservationsBefore(DateTime targetDate)
{
var toRemoveFromFront = 0;
while (toRemoveFromFront < _maximumValues.Count && _maximumValues[toRemoveFromFront].Date < targetDate)
{
toRemoveFromFront++;
}
_maximumValues.RemoveRange(0, toRemoveFromFront);
}
private void RemoveValuesSmallerThan(double targetValue)
{
var maxEntry = _maximumValues.Count - 1;
var toRemoveFromBack = 0;
while (toRemoveFromBack <= maxEntry && _maximumValues[maxEntry - toRemoveFromBack].Value <= targetValue)
{
toRemoveFromBack++;
}
_maximumValues.RemoveRange(maxEntry - toRemoveFromBack + 1, toRemoveFromBack);
}
}
It can be used as follows:
public static MyObject[] GetTestObjects_MaxSlidingWindow()
{
var rnd = new Random();
var date = new DateTime(2021, 1, 1, 0, 0, 0);
var result = new List<MyObject>();
var maxSlidingWindow = new MaxSlidingWindow();
for (int i = 0; i < 50000; i++)
{
//this is to simulate real data having gaps
if (rnd.Next(100) < 25)
{
continue;
}
var myObject = new MyObject()
{
Value = rnd.NextDouble(),
Date = date.AddMinutes(15 * i)
};
maxSlidingWindow.Add(myObject);
var max = maxSlidingWindow.Max;
result.Add(new MyObject { Date = myObject.Date, Value = myObject.Value / max });
}
return result.ToArray();
}
See the relative timings below; the above solution is slightly faster (timed over 10 million runs), but barely noticeably so:
[Relative timings chart]

Related

Sorting an array of enumeration

Hello, I have an array of enumerations, and I'm trying to sort that array by enumeration value and get the array indexes of the top values.
private enum Values
{
LOW,
MEDIUM,
HIGH
}
private Values[] Settings = new Values[10];
Settings[0] = Values.LOW;
Settings[1] = Values.HIGH;
Settings[2] = Values.MEDIUM;
Settings[3] = Values.LOW;
Settings[4] = Values.LOW;
Settings[5] = Values.LOW;
Settings[6] = Values.LOW;
Settings[7] = Values.LOW;
Settings[8] = Values.MEDIUM;
Settings[9] = Values.MEDIUM;
Basically now, with what I have above, I need to sort the settings array by enumeration value and get the array indexes of the top (let's say 3) items.
So I'd get back values 1, 2, 8
The platform I'm using does not support LINQ so those handy functions are not available.
I've been trying to wrap my head around this but it would help to have another pair of eyes.
thanks.
Implement a wrapper reference type:
class ValueWrapper : IComparable<ValueWrapper>
{
public Values Value { get; set; }
public int Index { get; set; }
public int CompareTo(ValueWrapper other)
{
return this.Value.CompareTo(other.Value) * -1; // Negating since you want reversed order
}
}
Usage -
ValueWrapper[] WrappedSettings = new ValueWrapper[10];
for(int i = 0; i < WrappedSettings.Length; i++)
{
WrappedSettings[i] = new ValueWrapper { Value = Settings[i], Index = i };
}
Array.Sort(WrappedSettings);
WrappedSettings will be sorted as you specified, preserving the indexes they were in the original array.
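To then read out the requested top three original indexes (a small addition, sticking to plain loops since LINQ is unavailable):
int[] topThree = new int[3];
for (int i = 0; i < 3; i++)
{
    topThree[i] = WrappedSettings[i].Index;
}
// topThree holds index 1 (HIGH) first; note Array.Sort is not stable, so the
// two MEDIUM slots may come out as any two of 2, 8, 9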
how about this:
Values first = Values.LOW, second = Values.LOW, third = Values.LOW;
int firstI = -1, secondI = -1, thirdI = -1;
for(int i = 0;i < Settings.Length;i++)
{
if(Settings[i] > first || firstI == -1)
{
third = second;
thirdI = secondI;
second = first;
secondI = firstI;
first = Settings[i];
firstI = i;
}
else if(Settings[i] > second || secondI == -1)
{
third = second;
thirdI = secondI;
second = Settings[i];
secondI = i;
}
else if(Settings[i] > third || thirdI == -1)
{
third = Settings[i];
thirdI = i;
}
}
So, since you say you work in an environment where LINQ is not available, I assume that other things like generics, nullables and so on may not be available either. Here is a very low-tech solution.
Basic idea:
For every possible enum value, from highest to lowest, go through the list. If we find that value, output it and remember how many we have output. If we reach 3, stop the algorithm.
So, we first look for HIGH in the list, then for MEDIUM and so on.
class Program
{
private enum Values
{
LOW,
MEDIUM,
HIGH
}
static void Main(string[] args)
{
// Sample data
Values[] settings = new Values[10];
settings[0] = Values.LOW;
settings[1] = Values.HIGH;
settings[2] = Values.MEDIUM;
settings[3] = Values.LOW;
settings[4] = Values.LOW;
settings[5] = Values.LOW;
settings[6] = Values.LOW;
settings[7] = Values.LOW;
settings[8] = Values.MEDIUM;
settings[9] = Values.MEDIUM;
// Get Values of the enum type
// This list is sorted ascending by value but may contain duplicates
Array enumValues = Enum.GetValues(typeof(Values));
// Number of results found so far
int numberFound = 0;
// The enum value we used during the last outer loop, so
// we skip duplicate enum values
int lastValue = -1;
// For each enum value starting with the highest to the lowest
for (int i= enumValues.Length -1; i >= 0; i--)
{
// Get this enum value
int enumValue = (int)enumValues.GetValue(i);
// Check whether we had the same value in the previous loop
// If yes, skip it.
if(enumValue == lastValue)
{
continue;
}
lastValue = enumValue;
// For each entry in the list where we are searching
for(int j=0; j< settings.Length; j++)
{
// Check to see whether it is the currently searched value
if (enumValue == (int)settings[j])
{
// if yes, then output it.
Console.WriteLine(j);
numberFound++;
// Stop after 3 found entries
if (numberFound == 3)
{
goto finished;
}
}
}
}
finished:
Console.ReadLine();
}
}
Output is as requested: 1, 2, 8
I'm not sure if this is exactly what you want, because it doesn't sort the original array, but one way to get the indexes of the top three values is to store the indexes of the top values in another array. Then we can loop through the original array and, for each item, see if it's larger than any of the items at the indexes we've stored so far. If it is, swap it into that slot.
For example:
// Start the topIndexes array with all invalid indexes
var topIndexes = new[] {-1, -1, -1};
for (var settingIndex = 0; settingIndex < Settings.Length; settingIndex++)
{
var setting = Settings[settingIndex];
var topIndexLessThanSetting = -1;
// Find the smallest topIndex setting that's less than this setting
for (int topIndex = 0; topIndex < topIndexes.Length; topIndex++)
{
if (topIndexes[topIndex] == -1)
{
topIndexLessThanSetting = topIndex;
break;
}
if (setting <= Settings[topIndexes[topIndex]]) continue;
if (topIndexLessThanSetting == -1 ||
Settings[topIndexes[topIndex]] < Settings[topIndexes[topIndexLessThanSetting]])
{
topIndexLessThanSetting = topIndex;
}
}
// only replace a slot when the current setting actually beats a stored value
if (topIndexLessThanSetting != -1)
topIndexes[topIndexLessThanSetting] = settingIndex;
}
// topIndexes = { 1, 2, 8 }

C# Increase Array Find Loop Performance

I've got a DataPoint[] file = new DataPoint[2592000] array. This array is filled with timestamps and random values; creating them costs me about 2 s. But in another function, prepareData(), I'm preparing 240 values for another array, TempBuffer.
In the prepareData() function I'm searching for matching values in the file array. If I can't find any, I take the timestamp and set the value to 0; otherwise I take the found value with the same timestamp.
The function looks like this:
public void prepareData()
{
stopWatch.Reset();
stopWatch.Start();
Int32 unixTimestamp = (Int32)(DateTime.UtcNow.Subtract(new DateTime(1970, 1, 1))).TotalSeconds;
for (double i = unixTimestamp; unixTimestamp - 240 < i; i--)
{
bool exists = true;
if (exists != (Array.Exists(file, element => element.XValue == i)))
{
TempBuffer = TempBuffer.Skip(1).Concat(new DataPoint[] { new DataPoint(UnixTODateTime(i).ToOADate(), 0) }).ToArray();
}
else
{
DataPoint point = Array.Find(file, element => element.XValue == i);
TempBuffer = TempBuffer.Skip(1).Concat(new DataPoint[] { new DataPoint(UnixTODateTime(i).ToOADate(), point.YValues) }).ToArray();
}
}
stopWatch.Stop();
TimeSpan ts = stopWatch.Elapsed;
}
Now the problem is, with this amount of data in the file (2,592,000) the function needs about 40 seconds! With smaller amounts like 10,000 it's no problem and works fine and fast. But as soon as I set the file size to my preferred 2,592,000 points, the CPU is pushed to 99% and the function takes way too long.
TempBuffer Sample Value:
X = Unix timestamp converted to DateTime, then converted to an OADate
{X=43285.611087963, Y=23}
File Sample Value:
X = Unixtimestamp
{X=1530698090, Y=24}
It's important that the TempBuffer values are converted to OADates, since the data inside the TempBuffer array is displayed in an MSChart.
Is there a way to improve my code so I've got better performance?
This is the most performant approach for your task (this is just a template, not the final code):
public void prepareData()
{
// it will be initialized with null values
var tempbuffer = new DataPoint[240];
var timestamp = (int)(DateTime.UtcNow.Subtract(new DateTime(1970, 1, 1))).TotalSeconds;
var oldest = timestamp - 240 + 1;
// fill tempbuffer with existing DataPoints
for (int i = 0; i < file.Length; i++)
{
if (file[i].XValue <= timestamp && file[i].XValue > timestamp - 240)
{
tempbuffer[(int)file[i].XValue - oldest] = new DataPoint(file[i].XValue, file[i].YValues);
}
}
// fill null values in tempbuffer with 'empty' DataPoints
for (int i = 0; i < tempbuffer.Length; i++)
{
tempbuffer[i] = tempbuffer[i] ?? new DataPoint(oldest + i, 0);
}
}
I get about 10 ms with this.
Update from comments:
If you want to fetch multiple DataPoints per slot and combine them with some function (e.g. average), then:
public void prepareData()
{
// use array of lists of YValues
var tempbuffer = new List<double>[240];
// initialize it
for (int i = 0; i < tempbuffer.Length; i++)
{
tempbuffer[i] = new List<double>(); // consider setting an initial capacity for better performance
}
var timestamp = (int)(DateTime.UtcNow.Subtract(new DateTime(1970, 1, 1))).TotalSeconds;
var oldest = timestamp - 240 + 1;
// fill tempbuffer with existing DataPoint's YValues
for (int i = 0; i < file.Length; i++)
{
if (file[i].XValue <= timestamp && file[i].XValue > timestamp - 240)
{
tempbuffer[(int)file[i].XValue - oldest].Add(file[i].YValues);
}
}
// get result
var result = new DataPoint[tempbuffer.Length];
for (int i = 0; i < result.Length; i++)
{
result[i] = new DataPoint(oldest + i, tempbuffer[i].Count == 0 ? 0 : tempbuffer[i].Average());
}
}
You haven't given us a complete picture of your code; ideally I would like sample data and the full class definitions. But given the limited information available, I think you'll find something like this works:
public void prepareData()
{
Int32 unixTimestamp = (Int32)(DateTime.UtcNow.Subtract(new DateTime(1970, 1, 1))).TotalSeconds;
var map = file.ToLookup(x => x.XValue);
TempBuffer =
Enumerable
.Range(0, 240)
.Select(x => unixTimestamp - x)
.SelectMany(x =>
map[x]
.Concat(new[] { new DataPoint(UnixTODateTime(x).ToOADate(), 0) }).Take(1))
.ToArray();
}
Array.Exists() and Array.Find() are O(N) operations, and you are performing them M (240) times.
Try LINQ Join instead:
DataPoint[] dataPoints; // your "file" variable
var seekedTimestamps = Enumerable.Range(0, 240).Select(i => unixTimestamp - i);
var matchingDataPoints = dataPoints.Join(seekedTimestamps, dp => dp.XValue, sts => sts, (dp, sts) => dp);
var missingTimestamps = seekedTimestamps.Except(matchingDataPoints.Select(mdp => mdp.XValue));
// do your logic with found and missing here
// ...
LINQ Join uses hashing (on the selected "keys") and is close to O(n)
Alternatively, assuming Timestamps in input are unique and you plan to do multiple operations on the input, construct a Dictionary<int (Timestamp), DataPoint> (expensive), which will give you O(1) retrieval of a wanted data point: var dataPoint = dict[wantedTimestamp];
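A sketch of that dictionary variant (assuming XValue stores whole-second Unix timestamps, as in the question, and that they are unique):
var byTimestamp = file.ToDictionary(dp => (int)dp.XValue);
var tempBuffer = new DataPoint[240];
for (int i = 0; i < 240; i++)
{
    int ts = unixTimestamp - i;
    // O(1) lookup per wanted timestamp; fall back to a zero point when missing
    tempBuffer[i] = byTimestamp.TryGetValue(ts, out var match)
        ? new DataPoint(UnixTODateTime(ts).ToOADate(), match.YValues)
        : new DataPoint(UnixTODateTime(ts).ToOADate(), 0);
}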
If DataPoint is unique (no 2 instances with identical values) you should switch the file array to a dictionary. The dictionary lookup is way faster than iterating over potentially all members of the array.
Of course, you then need to implement GetHashCode and Equals or define a unique key for each DataPoint.

LINQ to calculate a moving average of a SortedList<DateTime,double>

I have a time series in the form of a SortedList<DateTime,double>. I would like to calculate a moving average of this series. I can do this using simple for loops. I was wondering if there is a better way to do this using LINQ.
my version:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
namespace ConsoleApplication1
{
class Program
{
static void Main(string[] args)
{
var mySeries = new SortedList<DateTime, double>();
mySeries.Add(new DateTime(2011, 01, 1), 10);
mySeries.Add(new DateTime(2011, 01, 2), 25);
mySeries.Add(new DateTime(2011, 01, 3), 30);
mySeries.Add(new DateTime(2011, 01, 4), 45);
mySeries.Add(new DateTime(2011, 01, 5), 50);
mySeries.Add(new DateTime(2011, 01, 6), 65);
var calcs = new calculations();
var avg = calcs.MovingAverage(mySeries, 3);
foreach (var item in avg)
{
Console.WriteLine("{0} {1}", item.Key, item.Value);
}
}
}
class calculations
{
public SortedList<DateTime, double> MovingAverage(SortedList<DateTime, double> series, int period)
{
var result = new SortedList<DateTime, double>();
for (int i = 0; i < series.Count(); i++)
{
if (i >= period - 1)
{
double total = 0;
for (int x = i; x > (i - period); x--)
total += series.Values[x];
double average = total / period;
result.Add(series.Keys[i], average);
}
}
return result;
}
}
}
In order to achieve asymptotic performance of O(n) (as the hand-coded solution does), you could use the Aggregate function like this:
series.Skip(period-1).Aggregate(
new {
Result = new SortedList<DateTime, double>(),
Working = new List<double>(series.Take(period-1).Select(item => item.Value))
},
(list, item)=>{
list.Working.Add(item.Value);
list.Result.Add(item.Key, list.Working.Average());
list.Working.RemoveAt(0);
return list;
}
).Result;
The accumulated value (implemented as an anonymous type) contains two fields: Result contains the result list built up so far. Working contains the last period-1 elements. The aggregate function adds the current value to the Working list, builds the current average, adds it to the result, and then removes the first (i.e. oldest) value from the working list.
The "seed" (i.e. the starting value for the accumulation) is built by putting the first period-1 elements into Working and initializing Result to an empty list.
Consequently the aggregation starts with element period (by skipping (period-1) elements at the beginning).
In functional programming this is a typical usage pattern for the aggregate (or fold) function, btw.
Two remarks:
The solution is not "functionally" clean in that the same list objects (Working and Result) are reused in every step. I'm not sure whether that might cause problems if some future compiler tries to parallelize the Aggregate function automatically (on the other hand, I'm also not sure whether that's possible at all...). A purely functional solution should "create" new lists at every step.
Also note that C# lacks powerful list expressions. In some hypothetical Python-C#-mixed pseudocode one could write the aggregation function like
(list, item)=>
new {
Result = list.Result + [(item.Key, (list.Working+[item.Value]).Average())],
Working=list.Working[1::]+[item.Value]
}
which would be a bit more elegant in my humble opinion :)
For the most efficient way possible to compute a Moving Average with LINQ, you shouldn't use LINQ!
Instead I propose creating a helper class which computes a moving average in the most efficient way possible (using a circular buffer and causal moving average filter), then an extension method to make it accessible to LINQ.
First up, the moving average
public class MovingAverage
{
private readonly int _length;
private int _circIndex = -1;
private bool _filled;
private double _current = double.NaN;
private readonly double _oneOverLength;
private readonly double[] _circularBuffer;
private double _total;
public MovingAverage(int length)
{
_length = length;
_oneOverLength = 1.0 / length;
_circularBuffer = new double[length];
}
// Replaces the value at the current buffer position (requires at least one prior Push)
public MovingAverage Update(double value)
{
double lostValue = _circularBuffer[_circIndex];
_circularBuffer[_circIndex] = value;
// Maintain totals for Push function
_total += value;
_total -= lostValue;
// If not yet filled, just return. Current value should be double.NaN
if (!_filled)
{
_current = double.NaN;
return this;
}
// Compute the average
double average = 0.0;
for (int i = 0; i < _circularBuffer.Length; i++)
{
average += _circularBuffer[i];
}
_current = average * _oneOverLength;
return this;
}
public MovingAverage Push(double value)
{
// Apply the circular buffer
if (++_circIndex == _length)
{
_circIndex = 0;
}
double lostValue = _circularBuffer[_circIndex];
_circularBuffer[_circIndex] = value;
// Compute the average
_total += value;
_total -= lostValue;
// If not yet filled, just return. Current value should be double.NaN
if (!_filled && _circIndex != _length - 1)
{
_current = double.NaN;
return this;
}
else
{
// Set a flag to indicate this is the first time the buffer has been filled
_filled = true;
}
_current = _total * _oneOverLength;
return this;
}
public int Length { get { return _length; } }
public double Current { get { return _current; } }
}
This class provides a very fast and lightweight implementation of a MovingAverage filter. It creates a circular buffer of Length N and computes one add, one subtract and one multiply per data-point appended, as opposed to the N multiply-adds per point for the brute force implementation.
Next, to LINQ-ify it!
internal static class MovingAverageExtensions
{
public static IEnumerable<double> MovingAverage<T>(this IEnumerable<T> inputStream, Func<T, double> selector, int period)
{
var ma = new MovingAverage(period);
foreach (var item in inputStream)
{
ma.Push(selector(item));
yield return ma.Current;
}
}
public static IEnumerable<double> MovingAverage(this IEnumerable<double> inputStream, int period)
{
var ma = new MovingAverage(period);
foreach (var item in inputStream)
{
ma.Push(item);
yield return ma.Current;
}
}
}
The above extension methods wrap the MovingAverage class and allow insertion into an IEnumerable stream.
Now to use it!
int period = 50;
// Simply filtering a list of doubles
IEnumerable<double> inputDoubles;
IEnumerable<double> outputDoubles = inputDoubles.MovingAverage(period);
// Or, use a selector to filter T into a list of doubles
IEnumerable<Point> inputPoints; // assuming you have initialised this
IEnumerable<double> smoothedYValues = inputPoints.MovingAverage(pt => pt.Y, period);
You already have an answer showing how you can use LINQ, but frankly I wouldn't use LINQ here: it will most likely perform poorly compared to your current solution, and your existing code is already clear.
However, instead of recalculating the total of the previous period elements on every step, you can keep a running total and adjust it on each iteration. That is, change this:
total = 0;
for (int x = i; x > (i - period); x--)
total += series.Values[x];
to this:
if (i >= period) {
total -= series.Values[i - period];
}
total += series.Values[i];
This will mean that your code will take the same amount of time to execute regardless of the size of period.
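Putting that together, the whole method might look like this (a sketch of the same running-total idea):
public SortedList<DateTime, double> MovingAverage(SortedList<DateTime, double> series, int period)
{
    var result = new SortedList<DateTime, double>();
    double total = 0;
    for (int i = 0; i < series.Count; i++)
    {
        // drop the value that just left the window
        if (i >= period)
            total -= series.Values[i - period];
        total += series.Values[i];
        // emit an average once the first full window is available
        if (i >= period - 1)
            result.Add(series.Keys[i], total / period);
    }
    return result;
}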
This block
double total = 0;
for (int x = i; x > (i - period); x--)
total += series.Values[x];
double average = total / period;
can be rewritten as:
double average = series.Values.Skip(i - period + 1).Take(period).Sum() / period;
Your method may look like:
series.Skip(period - 1)
.Select((item, index) =>
new
{
item.Key,
Value = series.Values.Skip(index).Take(period).Sum() / period
});
As you can see, LINQ is very expressive. I recommend starting with a tutorial like Introducing LINQ and 101 LINQ Samples.
To do this in a more functional way, you'd need a Scan method, which exists in Rx but not in LINQ.
Let's look at how it would work if we had a Scan method:
var delta = 3;
var series = new [] {1.1, 2.5, 3.8, 4.8, 5.9, 6.1, 7.6};
var seed = series.Take(delta).Average();
var smas = series
.Skip(delta)
.Zip(series, Tuple.Create)
.Scan(seed, (sma, values)=>sma - (values.Item2/delta) + (values.Item1/delta));
smas = Enumerable.Repeat(0.0, delta-1).Concat(new[]{seed}).Concat(smas);
And here's the scan method, taken and adjusted from here:
public static IEnumerable<TAccumulate> Scan<TSource, TAccumulate>(
this IEnumerable<TSource> source,
TAccumulate seed,
Func<TAccumulate, TSource, TAccumulate> accumulator
)
{
if (source == null) throw new ArgumentNullException("source");
if (seed == null) throw new ArgumentNullException("seed");
if (accumulator == null) throw new ArgumentNullException("accumulator");
using (var i = source.GetEnumerator())
{
if (!i.MoveNext())
{
throw new InvalidOperationException("Sequence contains no elements");
}
var acc = accumulator(seed, i.Current);
while (i.MoveNext())
{
yield return acc;
acc = accumulator(acc, i.Current);
}
yield return acc;
}
}
This should have better performance than the brute force method since we are using a running total to calculate the SMA.
What's going on here?
To start, we need to calculate the first period, which we call seed here. Then, every subsequent value is calculated from the accumulated seed value. To do that we need the old value (that is, the one at t-delta) and the newest value, for which we zip the series together with itself, once from the beginning and once shifted by delta.
At the end we do some cleanup by prepending delta-1 zeroes plus the initial seed value.
Another option is to use MoreLINQ's Windowed method, which simplifies the code significantly:
var averaged = mySeries.Windowed(period).Select(window => window.Average(keyValuePair => keyValuePair.Value));
I use this code to calculate SMA:
private void calculateSimpleMA(decimal[] values, out decimal[] buffer)
{
int period = values.Count(); // gets Period (assuming Period=Values-Array-Size)
buffer = new decimal[period]; // initializes buffer array
var sma = SMA(period); // gets SMA function
for (int i = 0; i < period; i++)
buffer[i] = sma(values[i]); // fills buffer with SMA calculation
}
static Func<decimal, decimal> SMA(int p)
{
Queue<decimal> s = new Queue<decimal>(p);
return (x) =>
{
if (s.Count >= p)
{
s.Dequeue();
}
s.Enqueue(x);
return s.Average();
};
}
Here is an extension method:
public static IEnumerable<double> MovingAverage(this IEnumerable<double> source, int period)
{
if (source is null)
{
throw new ArgumentNullException(nameof(source));
}
if (period < 1)
{
throw new ArgumentOutOfRangeException(nameof(period));
}
return Core();
IEnumerable<double> Core()
{
var sum = 0.0;
var buffer = new double[period];
var n = 0;
foreach (var x in source)
{
n++;
sum += x;
var index = n % period;
if (n >= period)
{
sum -= buffer[index];
yield return sum / period;
}
buffer[index] = x;
}
}
}
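For example, applied to the values of the question's series (a usage sketch; the first average lines up with the third key):
var mySeries = new SortedList<DateTime, double>
{
    { new DateTime(2011, 1, 1), 10 },
    { new DateTime(2011, 1, 2), 25 },
    { new DateTime(2011, 1, 3), 30 },
    { new DateTime(2011, 1, 4), 45 },
    { new DateTime(2011, 1, 5), 50 },
    { new DateTime(2011, 1, 6), 65 },
};
// pair each window average with the key that closes the window
var averaged = mySeries.Values.MovingAverage(3)
    .Zip(mySeries.Keys.Skip(2), (avg, key) => new { key, avg });
foreach (var item in averaged)
    Console.WriteLine("{0} {1:F2}", item.key, item.avg); // 21.67, 33.33, 41.67, 53.33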

I need to add ranges, and if a range is missed while adding, I should display a message

EXAMPLE: if I add a range {1,10}
and another range {15,20}
I should get a message saying the ranges from 11 to 14 are missing.
How are you adding the ranges?
If the ranges are added in order, you can just keep the end of the last range and compare to the start of the range that you add.
If you can add the ranges in any order, you have to do the check when all ranges are added. Sort the ranges on the range start, then loop through the ranges and compare the end of a range with the start of the next range.
update
Example of the second case. First a class to hold ranges:
public class Range {
public int From { get; private set; }
public int To { get; private set; }
public Range(int from, int to) {
From = from;
To = to;
}
}
Create a list of ranges:
List<Range> ranges = new List<Range>();
ranges.Add(new Range(15, 20));
ranges.Add(new Range(1, 10));
Test the ranges:
ranges = ranges.OrderBy(r => r.From).ToList();
for (int i = 1; i < ranges.Count; i++) {
int to = ranges[i - 1].To;
int from = ranges[i].From;
int diff = to.CompareTo(from - 1);
if (diff < 0) {
Response.Write("Range from " + (to + 1).ToString() + " to " + (from - 1).ToString() + " is missing<br/>");
} else if (diff > 0) {
Response.Write("Range ending at " + to.ToString() + " overlaps range starting at " + from.ToString() + "<br/>");
}
}
Note: The code checks for both gaps in the ranges and overlapping ranges. If overlapping ranges is not a problem, just remove the part that checks for diff > 0.
It isn't clear what your code is trying to do, but here's the basic approach you should probably take:
1. Create a Range type that has a Start and an End.
2. Order the collection by the Start property (possibly using Enumerable.OrderBy).
3. Traverse the pairs of ranges in the ordered collection (possibly using Enumerable.Zip to zip the collection with itself offset by one) and check whether they are adjacent. If not, yield the missing range.
4. Format an error message using the missing ranges collected from (3).
e.g.
var ordered = ranges.OrderBy(range => range.Start);
var pairs = ordered.Zip(ordered.Skip(1), (a, b) => new { a, b });
var missing = from pair in pairs
where pair.a.End + 1 < pair.b.Start
select new Range(pair.a.End + 1, pair.b.Start - 1);
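And for step (4), formatting the message the question asks for (assuming the Range type from step (1) exposes Start and End):
foreach (var gap in missing)
{
    Console.WriteLine("The ranges from " + gap.Start + " to " + gap.End + " are missing.");
}
// with {1,10} and {15,20} this prints: The ranges from 11 to 14 are missing.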
An input-tolerant solution, and the programming kata used to build it.
public class Range
{
public int End;
public int Start;
public Range(int start, int end)
{
Start = start;
End = end;
}
}
public class RangeGapFinder
{
public Range FindGap(Range range1, Range range2)
{
if (range1 == null)
{
throw new ArgumentNullException("range1", "range1 cannot be null");
}
if (range2 == null)
{
throw new ArgumentNullException("range2", "range2 cannot be null");
}
if (range2.Start < range1.Start)
{
return FindGap(range2, range1);
}
if (range1.End < range1.Start)
{
range1 = new Range(range1.End, range1.Start);
}
if (range2.End < range2.Start)
{
range2 = new Range(range2.End, range2.Start);
}
if (range1.End + 1 >= range2.Start)
{
return null; // no gap
}
return new Range(range1.End + 1, range2.Start - 1);
}
}
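Usage against the example from the question (a quick check, not part of the original kata):
var finder = new RangeGapFinder();
Range gap = finder.FindGap(new Range(1, 10), new Range(15, 20));
if (gap != null)
{
    Console.WriteLine("The ranges from " + gap.Start + " to " + gap.End + " are missing.");
}
// prints: The ranges from 11 to 14 are missing.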

Getting Random Durations within a range in C#

For a random event generator I'm writing I need a simple algorithm to generate random ranges.
So, for example:
I may say I want 10 random intervals, between 1/1 and 1/7, with no overlap, in the states (1,2,3) where state 1 events add up to 1 day, state 2 events add up to 2 days and state 3 events add up to the rest.
Or in code:
struct Interval
{
public DateTime Date;
public long Duration;
public int State;
}
struct StateSummary
{
public int State;
public long TotalSeconds;
}
public Interval[] GetRandomIntervals(DateTime start, DateTime end, StateSummary[] sums, int totalEvents)
{
// insert your cool algorithm here
}
I'm working on this now, but in case someone beats me to a solution (or knows of an elegant pre-existing algorithm) I'm posting this on SO.
First use DateTime.Subtract to determine how many minutes/seconds/whatever lie between your min and max dates. Then use the Random class to get a random number of minutes/seconds/whatever in that range. Then use the result of that to construct another TimeSpan instance and add that to your min DateTime.
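A sketch of that idea at second resolution, reusing start and end from the question's signature (this alone picks one random point; it doesn't yet enforce the no-overlap or per-state totals from the question):
Random rnd = new Random();
TimeSpan window = end.Subtract(start);
// random whole-second offset within [0, window)
int offsetSeconds = rnd.Next((int)window.TotalSeconds);
DateTime randomPoint = start.Add(TimeSpan.FromSeconds(offsetSeconds));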
Here's an implementation that compiles and works, although it's still somewhat rough. It requires that the input state array properly account for the entire time range of interest (end - start), but it would be trivial to add a bit of code that would make the final state fill up the time not accounted for in the first N-1 states. I also modified your structure definitions to use ints instead of longs for the durations, just to simplify things a bit.
For clarity (and laziness) I omitted all error checking. It works fine for the inputs like the ones you described, but it's by no means bulletproof.
public static Interval[] GetRandomIntervals( DateTime start, DateTime end,
StateSummary[] states, int totalIntervals )
{
Random r = new Random();
// stores the number of intervals to generate for each state
int[] intervalCounts = new int[states.Length];
int intervalsTemp = totalIntervals;
// assign at least one interval for each of the states
for( int i = 0; i < states.Length; i++ )
intervalCounts[i] = 1;
intervalsTemp -= states.Length;
// assign remaining intervals randomly to the various states
while( intervalsTemp > 0 )
{
int iState = r.Next( states.Length );
intervalCounts[iState] += 1;
intervalsTemp -= 1;
}
// make a scratch copy of the state array
StateSummary[] statesTemp = (StateSummary[])states.Clone();
List<Interval> result = new List<Interval>();
DateTime next = start;
while( result.Count < totalIntervals )
{
// figure out which state this interval will go in (this could
// be made more efficient, but it works just fine)
int iState = r.Next( states.Length );
if( intervalCounts[iState] < 1 )
continue;
intervalCounts[iState] -= 1;
// determine how long the interval should be
int length;
if( intervalCounts[iState] == 0 )
{
// last one for this state, use up all remaining time
length = statesTemp[iState].TotalSeconds;
}
else
{
// use up at least one second of the remaining time, but
// leave some time for the remaining intervals
int maxLength = statesTemp[iState].TotalSeconds -
intervalCounts[iState];
length = r.Next( 1, maxLength + 1 );
}
// keep track of how much time is left to assign for this state
statesTemp[iState].TotalSeconds -= length;
// add a new interval
Interval interval = new Interval();
interval.State = states[iState].State;
interval.Date = next;
interval.Duration = length;
result.Add( interval );
// update the start time for the next interval
next += new TimeSpan( 0, 0, length );
}
return result.ToArray();
}
Here is my current implementation, which seems to work OK and accounts for all time. This would be so much cleaner if I didn't have to target .NET 1.1:
public class Interval
{
public Interval(int state)
{
this.State = state;
this.Duration = -1;
this.Date = DateTime.MinValue;
}
public DateTime Date;
public long Duration;
public int State;
}
class StateSummary
{
public StateSummary(StateEnum state, long totalSeconds)
{
State = (int)state;
TotalSeconds = totalSeconds;
}
public int State;
public long TotalSeconds;
}
Interval[] GetRandomIntervals(DateTime start, DateTime end, StateSummary[] sums, int totalEvents)
{
Random r = new Random();
ArrayList intervals = new ArrayList();
for (int i=0; i < sums.Length; i++)
{
intervals.Add(new Interval(sums[i].State));
}
for (int i=0; i < totalEvents - sums.Length; i++)
{
intervals.Add(new Interval(sums[r.Next(0,sums.Length)].State));
}
Hashtable eventCounts = new Hashtable();
foreach (Interval interval in intervals)
{
if (eventCounts[interval.State] == null)
{
eventCounts[interval.State] = 1;
}
else
{
eventCounts[interval.State] = ((int)eventCounts[interval.State]) + 1;
}
}
foreach(StateSummary sum in sums)
{
long avgDuration = sum.TotalSeconds / (int)eventCounts[sum.State];
foreach (Interval interval in intervals)
{
if (interval.State == sum.State)
{
long offset = ((long)(r.NextDouble() * avgDuration)) - (avgDuration / 2);
interval.Duration = avgDuration + offset;
}
}
}
// cap the durations.
Hashtable eventTotals = new Hashtable();
foreach (Interval interval in intervals)
{
if (eventTotals[interval.State] == null)
{
eventTotals[interval.State] = interval.Duration;
}
else
{
eventTotals[interval.State] = ((long)eventTotals[interval.State]) + interval.Duration;
}
}
foreach(StateSummary sum in sums)
{
long diff = sum.TotalSeconds - (long)eventTotals[sum.State];
if (diff != 0)
{
long diffPerInterval = diff / (int)eventCounts[sum.State];
long mod = diff % (int)eventCounts[sum.State];
bool first = true;
foreach (Interval interval in intervals)
{
if (interval.State == sum.State)
{
interval.Duration += diffPerInterval;
if (first)
{
interval.Duration += mod;
first = false;
}
}
}
}
}
intervals = new ArrayList(Shuffle(intervals)); // Shuffle returns a new collection
DateTime d = start;
foreach (Interval interval in intervals)
{
interval.Date = d;
d = d.AddSeconds(interval.Duration);
}
return (Interval[])intervals.ToArray(typeof(Interval));
}
public static ICollection Shuffle(ICollection c)
{
Random rng = new Random();
object[] a = new object[c.Count];
c.CopyTo(a, 0);
byte[] b = new byte[a.Length];
rng.NextBytes(b);
Array.Sort(b, a);
return new ArrayList(a);
}
