Input:
public class MyObject
{
public double Value { get; set; }
public DateTime Date { get; set; }
}
Method to generate test objects:
public static MyObject[] GetTestObjects()
{
var rnd = new Random();
var date = new DateTime(2021, 1, 1, 0, 0, 0);
var result = new List<MyObject>();
for (int i = 0; i < 50000; i++)
{
//this is to simulate real data having gaps
if (rnd.Next(100) < 25)
{
continue;
}
var myObject = new MyObject()
{
Value = rnd.NextDouble(),
Date = date.AddMinutes(15 * i)
};
result.Add(myObject);
}
return result.ToArray();
}
Given this, I need to calculate the maximum Value over the previous 12 months for each MyObject. I could just do this in parallel, but maybe there is a more optimized solution?
Sorry for being unclear, this is what I use right now to get what I want:
public MyObject[] BruteForceBackward(MyObject[] testData)
{
return testData.AsParallel().Select(point =>
{
var max = testData.Where(x => x.Date <= point.Date && x.Date >= point.Date.AddYears(-1)).Max(x => x.Value);
return new MyObject() { Date = point.Date, Value = point.Value / max };
}).OrderBy(r => r.Date).ToArray();
}
This works, but it is slow and eats processor resources (imagine you have 100k objects). I believe there must be something better.
I had a similar project where I had to calculate such stuff on tons of sensor data.
You can now find a little more refined version in my Github repository, which should be ready to use (.Net):
https://github.com/forReason/Statistics-Helper-Library
In general you want to reduce the number of loops going over all your data. At best, you want to touch each element only once.
Process Array (equivalent of BruteForceBackward)
public static MyObject[] FlowThroughForward(ref MyObject[] testData)
{
// generate return array
MyObject[] returnData = new MyObject[testData.Length];
// keep track to minimize processing
double currentMaximum = 0;
List<MyObject> maximumValues = new List<MyObject>();
// go through the elements
for (int i = 0; i < testData.Length; i++)
{
// calculate the oldest date to keep in tracking list
DateTime targetDate = testData[i].Date.AddYears(-1);
// maximum logic
if (testData[i].Value >= currentMaximum)
{
// new maximum found, clear tracking list
// this is the best case scenario
maximumValues.Clear();
currentMaximum = testData[i].Value;
}
else
{
// unfortunately, no new maximum was found
// go backwards the maximum tracking list and check for smaller values
// clear the list of all smaller values. The list should therefore always
// be in descending order
for (int b = maximumValues.Count - 1; b >= 0; b--)
{
if (maximumValues[b].Value <= testData[i].Value)
{
// a lower value has been found. We have a newer, higher value
// clear this waste value from the tracking list
maximumValues.RemoveAt(b);
}
else
{
// there are no more lower values.
// stop looking for smaller values to save time
break;
}
}
}
// append new value to tracking list, no matter if higher or lower
// all future values might be lower
maximumValues.Add(testData[i]);
// check if the oldest value is too old to be kept in the tracking list
while (maximumValues[0].Date < targetDate)
{
// oldest value is to be removed
maximumValues.RemoveAt(0);
// update maximum
currentMaximum = maximumValues[0].Value;
}
// add object to result list
returnData[i] = new MyObject() { Date = testData[i].Date, Value = testData[i].Value / currentMaximum };
}
return returnData;
}
Real Time Data or Streamed Data
Note: If you have really large lists, you might run into memory issues with the approach of passing a full array. In that case: pass one value at a time, from the oldest to the newest, and store the results back one at a time.
This function can also be used on real-time data.
The test method is included in the code below.
static void Main(string[] args)
{
int length = 50000;
Stopwatch stopWatch1 = new Stopwatch();
stopWatch1.Start();
var myObject = new MyObject();
var result = new List<MyObject>();
var date = new DateTime(2021, 1, 1, 0, 0, 0);
for (int i = 0; i < length; i++)
{
//this is to simulate real data having gaps
if (rnd.Next(100) < 25)
{
continue;
}
myObject.Value = rnd.NextDouble();
myObject.Date = date.AddMinutes(15 * i);
result.Add(CalculateNextObject(ref myObject));
}
stopWatch1.Stop();
Console.WriteLine("test code executed in " + stopWatch1.ElapsedMilliseconds + " ms");
Thread.Sleep(1000000);
}
private static Random rnd = new Random();
private static double currentMaximum = 0;
private static List<MyObject> maximumValues = new List<MyObject>();
public static MyObject CalculateNextObject(ref MyObject input)
{
// calculate the oldest date to keep in tracking list
DateTime targetDate = input.Date.AddYears(-1);
// maximum logic
if (input.Value >= currentMaximum)
{
// new maximum found, clear tracking list
// this is the best case scenario
maximumValues.Clear();
currentMaximum = input.Value;
}
else
{
// unfortunately, no new maximum was found
// go backwards the maximum tracking list and check for smaller values
// clear the list of all smaller values. The list should therefore always
// be in descending order
for (int b = maximumValues.Count - 1; b >= 0; b--)
{
if (maximumValues[b].Value <= input.Value)
{
// a lower value has been found. We have a newer, higher value
// clear this waste value from the tracking list
maximumValues.RemoveAt(b);
}
else
{
// there are no more lower values.
// stop looking for smaller values to save time
break;
}
}
}
// append new value to tracking list, no matter if higher or lower
// all future values might be lower
maximumValues.Add(input);
// check if the oldest value is too old to be kept in the tracking list
while (maximumValues[0].Date < targetDate)
{
// oldest value is to be removed
maximumValues.RemoveAt(0);
// update maximum
currentMaximum = maximumValues[0].Value;
}
// add object to result list
MyObject returnData = new MyObject() { Date = input.Date, Value = input.Value / currentMaximum };
return returnData;
}
Test Method
static void Main(string[] args)
{
MyObject[] testData = GetTestObjects();
Stopwatch stopWatch1 = new Stopwatch();
Stopwatch stopWatch2 = new Stopwatch();
stopWatch1.Start();
MyObject[] testresults1 = BruteForceBackward(testData);
stopWatch1.Stop();
Console.WriteLine("BruteForceBackward executed in " + stopWatch1.ElapsedMilliseconds + " ms");
stopWatch2.Start();
MyObject[] testresults2 = FlowThroughForward(ref testData);
stopWatch2.Stop();
Console.WriteLine("FlowThroughForward executed in " + stopWatch2.ElapsedMilliseconds + " ms");
Console.WriteLine();
Console.WriteLine("Comparing some random test results: ");
var rnd = new Random();
for (int i = 0; i < 10; i++)
{
int index = rnd.Next(0, testData.Length);
Console.WriteLine("Index: " + index + " brute: " + testresults1[index].Value + " flow: " + testresults2[index].Value);
}
Thread.Sleep(1000000);
}
Test result
Tests were performed on a machine with 32 cores, so in theory the multithreaded approach should be at an advantage, but you'll see ;)
Function              Function Time    Time %
BruteForceBackward    5334 ms          99.9%
FlowThroughForward    5 ms             0.094%
Performance improvement factor: roughly 1000x.
console output with data validation:
BruteForceBackward executed in 5264 ms
FlowThroughForward executed in 5 ms
Comparing some random test results:
Index: 25291 brute: 0.989688139105413 flow: 0.989688139105413
Index: 11945 brute: 0.59670821976193 flow: 0.59670821976193
Index: 30282 brute: 0.413238225210297 flow: 0.413238225210297
Index: 33898 brute: 0.38258761939139 flow: 0.38258761939139
Index: 8824 brute: 0.833512217105447 flow: 0.833512217105447
Index: 22092 brute: 0.648052464067263 flow: 0.648052464067263
Index: 24633 brute: 0.35859417692481 flow: 0.35859417692481
Index: 24061 brute: 0.540642018793402 flow: 0.540642018793402
Index: 34219 brute: 0.498785766613022 flow: 0.498785766613022
Index: 2396 brute: 0.151471808392111 flow: 0.151471808392111
CPU usage was a lot higher for BruteForceBackward due to the parallelisation.
The worst-case scenario is long periods of decreasing values. The code can still be vastly optimized, but I guess this should be sufficient. For further optimisation, one might look at reducing the list shuffles when removing/adding elements to maximumValues, as sketched below.
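One possible way to cut down on those shuffles, shown here as an assumption rather than part of the answer (the SlidingYearMax name is hypothetical; it only relies on the MyObject class from the question), is to keep the tracking window in a LinkedList so removals at both ends are O(1):
public class SlidingYearMax
{
    // Same monotonic idea as maximumValues above, but a LinkedList avoids the
    // element shifting that List<T>.RemoveAt(0) performs on every expiry.
    private readonly LinkedList<MyObject> _window = new LinkedList<MyObject>();

    public void Push(MyObject item)
    {
        // values smaller than or equal to the new one can never be the maximum again
        while (_window.Count > 0 && _window.Last.Value.Value <= item.Value)
            _window.RemoveLast();
        _window.AddLast(item);

        // drop entries older than one year relative to the newest item
        DateTime oldest = item.Date.AddYears(-1);
        while (_window.First.Value.Date < oldest)
            _window.RemoveFirst();
    }

    // the front node always holds the current maximum of the window
    public double Max => _window.First.Value.Value;
}
Whether this actually beats the List-based version depends on allocation behaviour (LinkedList allocates a node per element), so an array-backed deque would be another option to benchmark.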
An interesting and challenging problem. I put together a solution using a dynamic programming approach (first learned in my CS algorithms class back in '78). First, a tree is constructed containing pre-calculated local max values over recursively defined ranges. Once constructed, the max value for an arbitrary range can be efficiently calculated mostly using the pre-calculated values. Only at the fringes of the range does the calculation drop down to the element level.
It is not as fast as julian bechtold's FlowThroughForward method, but random access to ranges may be a plus.
Code to add to Main:
Console.WriteLine();
Stopwatch stopWatch3 = new Stopwatch();
stopWatch3.Start();
MyObject[] testresults3 = RangeTreeCalculation(ref testData, 10);
stopWatch3.Stop();
Console.WriteLine($"RangeTreeCalculation executed in {stopWatch3.ElapsedMilliseconds} ms");
... test comparison
Console.WriteLine($"Index: {index} brute: {testresults1[index].Value} flow: {testresults2[index].Value} rangeTree: {testresults3[index].Value}");
Test function:
public static MyObject[] RangeTreeCalculation(ref MyObject[] testDataArray, int partitionThreshold)
{
// For this implementation, we need to convert the array to a List, because we need a
// reference type object that can be shared.
List<MyObject> testDataList = testDataArray.ToList();
// Construct a tree containing recursive collections of pre-calculated values
var rangeTree = new RangeTree(testDataList, partitionThreshold);
MyObject[] result = new MyObject[testDataList.Count];
Parallel.ForEach(testDataList, (item, state, i) =>
{
var max = rangeTree.MaxForDateRange(item.Date.AddYears(-1), item.Date);
result[i] = new MyObject() { Date = item.Date, Value = item.Value / max };
});
return result;
}
Supporting class:
// Class used to divide and conquer using dynamic programming.
public class RangeTree
{
public List<MyObject> Data; // This reference is shared by all members of the tree
public int Start { get; } // Index of first element covered by this node.
public int Count { get; } // Number of elements covered by this node.
public DateTime FirstDateTime { get; }
public DateTime LastDateTime { get; }
public double MaxValue { get; } // Pre-calculated max for all elements covered by this node.
List<RangeTree> ChildRanges { get; }
// Top level node constructor
public RangeTree(List<MyObject> data, int partitionThreshold)
: this(data, 0, data.Count, partitionThreshold)
{
}
// Child node constructor, which covers a recursively decreasing range of elements.
public RangeTree(List<MyObject> data, int start, int count, int partitionThreshold)
{
Data = data;
Start = start;
Count = count;
FirstDateTime = Data[Start].Date;
LastDateTime = Data[Start + Count - 1].Date;
if (count <= partitionThreshold)
{
// If the range is smaller than the threshold, just calculate the local max
// directly from the items. No child ranges are defined.
MaxValue = Enumerable.Range(Start, Count).Select(i => Data[i].Value).Max();
}
else
{
// We still have a significant range. Decide how to further divide them up into sub-ranges.
// (There may be room for improvement here to better balance the tree.)
int partitionSize = (count - 1) / partitionThreshold + 1;
int partitionCount = (count - 1) / partitionSize + 1;
if (count < partitionThreshold * partitionThreshold)
{
// When one level above the leaf nodes, prefer fewer, fuller leaf nodes over
// more sparsely populated leaf nodes.
partitionCount = (count - 1) / partitionThreshold + 1;
partitionSize = (count - 1) / partitionCount + 1;
}
ChildRanges = Enumerable.Range(0, partitionCount)
.Select(partitionNum => new {
ChildStart = Start + partitionNum * partitionSize,
ChildCount = Math.Min(partitionSize, Count - partitionNum * partitionSize)
})
.Where(part => part.ChildCount > 0) // Defensive
.Select(part => new RangeTree(Data, part.ChildStart, part.ChildCount, partitionThreshold))
.ToList();
// Now is the dynamic programming part:
// Calculate the local max as the max of all child max values.
MaxValue = ChildRanges.Max(child => child.MaxValue);
}
}
// Get the max value for a given range of dates within this rangeTree node.
// This uses the precalculated values as much as possible.
// Only at the fringes of the date range do we calculate at the element level.
public double MaxForDateRange(DateTime fromDate, DateTime thruDate)
{
double calculatedMax = Double.MinValue;
if (fromDate > this.LastDateTime || thruDate < this.FirstDateTime)
{
// Entire range is excluded. Nothing of interest here folks.
calculatedMax = Double.MinValue;
}
else if (fromDate <= this.FirstDateTime && thruDate >= this.LastDateTime)
{
// Entire range is included. Use the already-calculated max.
calculatedMax = this.MaxValue;
}
else if (ChildRanges != null)
{
// We have child ranges. Recurse and accumulate.
// Possible optimization: Calculate max for middle ranges first, and only bother
// with extreme partial ranges if their local max values exceed the preliminary result.
for (int i = 0; i < ChildRanges.Count; ++i)
{
double childMax = ChildRanges[i].MaxForDateRange(fromDate, thruDate);
if (childMax > calculatedMax)
{
calculatedMax = childMax;
}
}
}
else
{
// Leaf range. Loop through just this limited range of elements, checking individually for
// date in range and accumulating the result.
for (int i = 0; i < this.Count; ++i)
{
var element = Data[this.Start + i];
if (fromDate <= element.Date && element.Date <= thruDate && element.Value > calculatedMax)
{
calculatedMax = element.Value;
}
}
}
return calculatedMax;
}
}
There's plenty of room for improvement, such as parameterizing the types and generalizing the functionality to support more than just Max(Value), but the framework is there.
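As a rough sketch of that generalization (this is my own illustration, not part of the answer; RangeAggregateTree and the combine delegate are hypothetical names), the aggregation could be injected as a delegate so the same tree structure can answer Max, Min or Sum queries:
public class RangeAggregateTree
{
    public double AggregateValue { get; }

    public RangeAggregateTree(IReadOnlyList<MyObject> data, int start, int count,
                              Func<double, double, double> combine, double identity)
    {
        // Leaf-level aggregation only; a full version would partition into child
        // nodes exactly like RangeTree above and combine their AggregateValue
        // properties with the same delegate.
        double acc = identity;
        for (int i = start; i < start + count; i++)
            acc = combine(acc, data[i].Value);
        AggregateValue = acc;
    }
}
For example, new RangeAggregateTree(data, 0, data.Count, Math.Max, double.MinValue) would mirror the existing Max behaviour, while (a, b) => a + b with identity 0.0 would give a range-sum tree.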
Assuming you meant you need the maximum Value for each of the last 12 months from result, then you can use LINQ:
var beginDateTime = DateTime.Now.AddMonths(-12);
var ans = result.Where(r => r.Date >= beginDateTime).GroupBy(r => r.Date.Month).Select(mg => mg.MaxBy(r => r.Value)).ToList();
Running some timing, I get that putting AsParallel after result changes the run time from around 16ms (first run) to around 32ms, so it is actually slower. It is about the same after the Where and about 23ms after the GroupBy (processing the 12 groups in parallel). On my PC at least, there isn't enough data or complex operations for parallelism, but the GroupBy isn't the most efficient.
Using an array and testing each element, I get the results in about 1.2ms:
var maxMOs = new MyObject[12];
foreach (var r in result.Where(r => r.Date >= beginDateTime)) {
var monthIndex = r.Date.Month-1;
if (maxMOs[monthIndex] == null || r.Value > maxMOs[monthIndex].Value)
maxMOs[monthIndex] = r;
}
Note that the results are not chronological; you could offset monthIndex by today's month to order the results if desired.
var maxMOs = new MyObject[12];
var offset = DateTime.Now.Month + 1; // month number of the oldest month in the 12-month window
foreach (var r in result.Where(r => r.Date >= beginDateTime)) {
var monthIndex = (r.Date.Month - offset + 12) % 12;
if (maxMOs[monthIndex] == null || r.Value > maxMOs[monthIndex].Value)
maxMOs[monthIndex] = r;
}
A micro-optimization (mostly useful on repeat runs) is to invert the test and use the null-propagating operator:
if (!(r.Value <= maxMOs[monthIndex]?.Value))
This saves about 0.2ms on the first run but up to 0.5ms on subsequent runs.
Here is a solution similar to julian bechtold's answer. The difference is that the maximum (and all related variables) is kept hidden away from the main implementation, in a separate class whose sole purpose is to keep track of the maximum over the past year. The algorithm is the same; I just use a few LINQ expressions here and there.
We keep track of the maximum in the following class:
public class MaxSlidingWindow
{
private readonly List<MyObject> _maximumValues;
private double _max;
public MaxSlidingWindow()
{
_maximumValues = new List<MyObject>();
_max = double.NegativeInfinity;
}
public double Max => _max;
public void Add(MyObject myObject)
{
if (myObject.Value >= _max)
{
_maximumValues.Clear();
_max = myObject.Value;
}
else
{
RemoveValuesSmallerThan(myObject.Value);
}
_maximumValues.Add(myObject);
RemoveObservationsBefore(myObject.Date.AddYears(-1));
_max = _maximumValues[0].Value;
}
private void RemoveObservationsBefore(DateTime targetDate)
{
var toRemoveFromFront = 0;
while (toRemoveFromFront <= _maximumValues.Count - 1 && _maximumValues[toRemoveFromFront].Date < targetDate)
{
toRemoveFromFront++;
}
_maximumValues.RemoveRange(0, toRemoveFromFront);
}
private void RemoveValuesSmallerThan(double targetValue)
{
var maxEntry = _maximumValues.Count - 1;
var toRemoveFromBack = 0;
while (toRemoveFromBack <= maxEntry && _maximumValues[maxEntry - toRemoveFromBack].Value <= targetValue)
{
toRemoveFromBack++;
}
_maximumValues.RemoveRange(maxEntry - toRemoveFromBack + 1, toRemoveFromBack);
}
}
It can be used as follows:
public static MyObject[] GetTestObjects_MaxSlidingWindow()
{
var rnd = new Random();
var date = new DateTime(2021, 1, 1, 0, 0, 0);
var result = new List<MyObject>();
var maxSlidingWindow = new MaxSlidingWindow();
for (int i = 0; i < 50000; i++)
{
//this is to simulate real data having gaps
if (rnd.Next(100) < 25)
{
continue;
}
var myObject = new MyObject()
{
Value = rnd.NextDouble(),
Date = date.AddMinutes(15 * i)
};
maxSlidingWindow.Add(myObject);
var max = maxSlidingWindow.Max;
result.Add(new MyObject { Date = myObject.Date, Value = myObject.Value / max });
}
return result.ToArray();
}
See the relative timings below - the above solution is slightly faster (timed over 10 million runs), but barely noticeably so:
Relative timings
I have a stream of data (integers) with a given (constant) frequency. From time to time I need to compute different averages (predefined). I am looking for a solution to do it fast and efficiently.
Assumptions:
Sampling rate is constant (predefined) and might be somewhere between 125-500 SPS
The averages I need to compute are predefined, and there might be one average or many (for example only the last 200 ms average, or the last 250 ms and the last 500 ms). There might be many averages, but they are predefined!
At any time I need to be able to compute the current average (in real time)
What I have right now:
I assume that in a particular timeframe there will always be the same amount of data. So at a frequency of 100 SPS I assume that one second contains exactly 100 values
A queue with constant length is created (something like a buffer)
For EVERY defined average, a Sum variable is created
Every time a new sample arrives I place it in the queue.
Every time I have a new sample in the queue I add its value to every Sum variable I have and also remove the value of the element which is out of the window (based on its position in the queue)
Once I need to compute an average I just take the particular Sum variable and divide it by the number of elements this Sum should contain
To give you better insight, here is the code I have right now:
public class Buffer<T> : LinkedList<T>
{
private readonly int capacity;
public bool IsFull => Count >= capacity;
public Buffer(int capacity)
{
this.capacity = capacity;
}
public void Enqueue(T item)
{
if (Count == capacity)
{
RemoveFirst();
}
AddLast(item);
}
}
public class MovingAverage
{
private readonly Buffer<float> Buffer;
private static readonly object bufferLock = new object();
public Dictionary<string, float> Sums { get; private set; }
public Dictionary<string, int> Counts { get; private set; }
public MovingAverage(List<int> sampleCounts, List<string> names)
{
if (sampleCounts.Count != names.Count)
{
throw new ArgumentException("Wrong Moving Averages parameters");
}
Buffer = new Buffer<float>(sampleCounts.Max());
Sums = new Dictionary<string, float>();
Counts = new Dictionary<string, int>();
for (int i = 0; i < names.Count; i++)
{
Sums[names[i]] = 0;
Counts[names[i]] = sampleCounts[i];
}
}
public void ProcessAveraging(float val)
{
lock (bufferLock)
{
if (float.IsNaN(val))
{
val = 0;
}
foreach (var keyVal in Counts.OrderBy(a => a.Value))
{
Sums[keyVal.Key] += val;
if (Buffer.Count >= keyVal.Value)
{
Sums[keyVal.Key] -= Buffer.ElementAt(Buffer.Count - keyVal.Value);
}
}
Buffer.Enqueue(val);
}
}
public float GetLastAverage(string averageName)
{
lock (bufferLock)
{
if (Buffer.Count >= Counts[averageName])
{
return Sums[averageName] / Counts[averageName];
}
else
{
return Sums[averageName] / Buffer.Count;
}
}
}
}
That works really nicely and is fast enough, but in the real world having 100 SPS doesn't really mean you will always have 100 samples in 1 second. Sometimes it's 100, sometimes 99, sometimes 101. Computing these averages is critical for my system, and 1 sample more or less could change a lot. That's why I need a real timer telling me whether a sample is already out of the moving-average window or not.
The idea of adding a timestamp to every sample seems promising.
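A rough sketch of that idea, using nothing beyond what is described above (the TimedAverage name and the use of a Queue are my own illustration): store (timestamp, value) pairs and evict by age rather than by sample count.
public class TimedAverage
{
    private readonly TimeSpan _window;
    private readonly Queue<(DateTime Time, float Value)> _samples = new Queue<(DateTime, float)>();
    private double _sum;

    public TimedAverage(TimeSpan window) => _window = window;

    public void Add(float value, DateTime timestamp)
    {
        _samples.Enqueue((timestamp, value));
        _sum += value;

        // drop everything that has fallen out of the time window
        while (_samples.Count > 0 && timestamp - _samples.Peek().Time > _window)
            _sum -= _samples.Dequeue().Value;
    }

    public float Average => _samples.Count == 0 ? 0f : (float)(_sum / _samples.Count);
}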
Plenty of answers here.. Might as well add another one :)
This one might need some minor debugging for "off by one" etc - I didn't have a real dataset to work with so perhaps treat it as pseudocode
It's like yours: there's a circular buffer - give it enough capacity to hold N samples, where N is enough to cover the moving averages you want to inspect. At 100 SPS and a 250 ms window I think you'll need at least 25, but we aren't short on space so you could make it more.
struct Cirray
{
long _head;
TimedFloat[] _data;
public Cirray(int capacity)
{
_head = 0;
_data = new TimedFloat[capacity];
}
public void Add(float f)
{
_data[_head++%_data.Length] = new TimedFloat() { F = f };
}
public IEnumerable<float> GetAverages(int[] forDeltas)
{
double sum = 0;
long start = _head - 1;
long now = _data[start % _data.Length].T;
int whichDelta = 0;
for (long idx = start; idx >= 0 && whichDelta < forDeltas.Length; idx--)
{
if (_data[idx % _data.Length].T < now - forDeltas[whichDelta])
{
yield return (float)(sum / (start - idx));
whichDelta++;
}
sum += _data[idx % _data.Length].F;
}
}
}
struct TimedFloat
{
[DllImport("Kernel32.dll", CallingConvention = CallingConvention.Winapi)]
private static extern void GetSystemTimePreciseAsFileTime(out long filetime);
private float _f;
public float F { get => _f;
set {
_f = value;
GetSystemTimePreciseAsFileTime(out long x);
T = DateTime.FromFileTimeUtc(x).Ticks;
}
}
public long T;
}
The normal DateTime.UtcNow isn't very precise - about 16 ms - so it's probably no good for timestamping data like this if you're saying that even one sample could throw it off. Instead we can get the ticks equivalent of the high-resolution timer, if your system supports it (if not, you might have to change system, or abuse a Stopwatch class into giving a higher-resolution supplement), and we timestamp every data item.
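For reference, the Stopwatch fallback mentioned above could look roughly like this (a sketch and an assumption on my part; Stopwatch lives in System.Diagnostics, and these ticks are relative to an arbitrary start rather than wall-clock time, which is fine for measuring window ages):
static class HighResClock
{
    // Converts Stopwatch's raw ticks into DateTime-style 100 ns ticks so they
    // could be stored in TimedFloat.T instead of GetSystemTimePreciseAsFileTime.
    public static long NowTicks()
        => (long)(Stopwatch.GetTimestamp() * (double)TimeSpan.TicksPerSecond / Stopwatch.Frequency);
}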
I thought about going to the complexity of maintaining N constantly moving pointers to various tail ends of the data and incrementing/decrementing N sums - it could still be done (and you clearly know how), but your question reads like you'd probably call for the averages infrequently enough that an N sums/counts solution would spend more time maintaining the counts than it would to just run through 250 or 500 floats every now and then and add them up. GetAverages therefore takes an array of ticks (10 thousand per ms) for the ranges you want the data over, e.g. new[] { 50 * 10000, 100 * 10000, 150 * 10000, 200 * 10000, 250 * 10000 } for 50 ms to 250 ms in steps of 50. It starts at the current head and sums backwards until it is about to cross a time boundary (and this might be the off-by-one bit), whereupon it yields the average for that timespan, then resumes summing and counting (the count given by the start minus the current index) for the next timespan. I think I understood correctly that you want e.g. the "average over the last 50ms" and "average over the last 100ms", not "average for the recent 50ms" and "average for the 50ms before recent".
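For illustration only, a call following that description might look like this (the capacity and deltas are just the example values from the paragraph above):
// Hypothetical usage of the Cirray struct above.
var buffer = new Cirray(1000);
// ... buffer.Add(sample); for each incoming sample ...
int[] deltas = { 50 * 10000, 100 * 10000, 150 * 10000, 200 * 10000, 250 * 10000 }; // 50..250 ms in ticks
foreach (float avg in buffer.GetAverages(deltas))
    Console.WriteLine(avg);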
Edit:
Thought about it some more and did this:
struct Cirray
{
long _head;
TimedFloat[] _data;
RunningAverage[] _ravgs;
public Cirray(int capacity)
{
_head = 0;
_data = new TimedFloat[capacity];
_ravgs = null; // struct constructors must assign every field
}
public Cirray(int capacity, int[] deltas) : this(capacity)
{
_ravgs = new RunningAverage[deltas.Length];
for (int i = 0; i < deltas.Length; i++)
_ravgs[i] = new RunningAverage() { OverMilliseconds = deltas[i] };
}
public void Add(float f)
{
//in c# every assignment returns the assigned value; capture it for use later
var addedTF = (_data[_head++ % _data.Length] = new TimedFloat() { F = f });
if (_ravgs == null)
return;
foreach (var ra in _ravgs)
{
//add the new tf to each RA
ra.Count++;
ra.Total += addedTF.F;
//move the end pointer in the RA circularly up the array, subtracting/uncounting as we go
var boundary = addedTF.T - ra.OverMilliseconds * TimeSpan.TicksPerMillisecond; // deltas are in ms, T is in 100 ns ticks
while (_data[ra.EndPointer].T < boundary) //while the sample is timed before the boundary, move the end pointer forward
{
ra.Count--;
ra.Total -= _data[ra.EndPointer].F;
ra.EndPointer = (ra.EndPointer + 1) % _data.Length; //circular indexing
}
}
}
public IEnumerable<float> GetAverages(int[] forDeltas)
{
double sum = 0;
long start = _head - 1;
long now = _data[start % _data.Length].T;
int whichDelta = 0;
for (long idx = start; idx >= 0 && whichDelta < forDeltas.Length; idx--)
{
if (_data[idx % _data.Length].T < now - forDeltas[whichDelta])
{
yield return (float)(sum / (start - idx));
whichDelta++;
}
sum += _data[idx % _data.Length].F;
}
}
public IEnumerable<float> GetAverages() //from the built ins
{
foreach (var ra in _ravgs)
{
if (ra.Count == 0)
yield return 0;
else
yield return (float)(ra.Total / ra.Count);
}
}
}
Absolutely haven't tested it, but it embodies my thinking in the comments
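One thing to note: the RunningAverage type used in Add is never shown in the answer. A minimal sketch of what it presumably looks like, inferred purely from how its members are used above (so treat the exact shape as an assumption):
// Assumed shape of the helper referenced above; it must be a class (reference
// type) so the mutations inside the foreach loop stick.
class RunningAverage
{
    public int OverMilliseconds;   // length of the window this average covers
    public double Total;           // running sum of the samples inside the window
    public int Count;              // number of samples inside the window
    public long EndPointer;        // circular index of the oldest sample still counted
}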
Instead of using a linked list I would fall back to some built-in functions such as Array.Copy. In this answer I include a possible rewrite of your buffer class, taking over the idea of keeping a sum at every position.
This buffer keeps track of all the sums, but in order to do that it needs to add the new value to every item. Depending on how frequently you need the average, it might be better to sum up only when you need it and just keep the individual values.
In any case, I just wanted to point out how you could do it with Array.Copy.
public class BufferSum
{
private readonly int _capacity;
private readonly int _last;
private float[] _items;
public int Count { get; private set; }
public bool IsFull => Count >= _capacity;
public BufferSum(int capacity)
{
_capacity = capacity;
_last = capacity - 1;
_items = new float[_capacity];
}
public void Enqueue(float item)
{
if (Count == _capacity)
{
Array.Copy(_items, 1, _items, 0, _last);
_items[_last] = 0;
}
else
{
Count++;
}
for (var i = 0; i < Count; i ++)
{
_items[i] += item;
}
}
public float Average => _items[0] / Count;
public float AverageAt(int ms, int fps)
{
var pos = ms * fps / 1000; // number of samples covering the requested time span (multiply before dividing to avoid integer truncation)
return _items[Count - pos] / pos;
}
}
Additionally, be careful with the lock statement; that will take a lot of time too.
Make an array of size 500, int counter c.
For every sample:
summ -= A[c % 500] //remove old value
summ += sample
A[c % 500] = sample //replace it with new value
c++
if needed, calculate
average = summ / 500
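In C#, that pseudocode could be wrapped up roughly like this (the class and member names are mine, purely illustrative):
public class RingSum
{
    private readonly double[] _ring = new double[500]; // window of the last 500 samples
    private double _sum;
    private int _c;

    public void Add(double sample)
    {
        _sum -= _ring[_c % 500];  // remove the value falling out of the window
        _sum += sample;           // add the new value
        _ring[_c % 500] = sample; // overwrite the slot
        _c++;
    }

    // only meaningful once at least 500 samples have been added
    public double Average => _sum / 500;
}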
You always want to remove the oldest element on one side of your sequence and add a new element at the other side of the sequence: you need a queue instead of a stack.
I think a round list will be faster: as long as you have not reached the maximum size, just add the elements; once you've reached the maximum size, replace the oldest element.
This seems like a nice reusable class. Later we'll add the moving average part.
class RoundArray<T>
{
public RoundArray(int maxSize)
{
this.maxSize = maxSize;
this.roundArray = new List<T>(maxSize);
}
private readonly int maxSize;
private readonly List<T> roundArray;
public int indexOldestItem = 0;
public void Add(T item)
{
// if list not full, just add
if (this.roundArray.Count < this.maxSize)
this.roundArray.Add(item);
else
{
// list is full, replace the oldest item:
this.roundArray[this.indexOldestItem] = item;
this.indexOldestItem = (this.indexOldestItem + 1) % this.maxSize;
}
}
public int Count => this.roundArray.Count;
public T Oldest => this.roundArray[this.indexOldestItem];
}
To make this class useful, add methods to enumerate the data, starting at the oldest or the newest, consider to add other useful reusable methods. Maybe you should implement IReadOnlyCollection<T>. Maybe some private fields should have public properties.
Your moving average calculator will use this RoundArray. Whenever an item is added, and your roundArray is not full yet, the item is added to the sum and to the round array.
If the roundArray is full, then the item replaces the oldest item. You subtract the value of the OldestItem from the Sum, and add the new Item to the Sum.
class MovingAverageCalculator
{
public MovingAverageCalculator(int maxSize)
{
this.maxSize = maxSize;
this.roundArray = new RoundArray<int>(maxSize);
}
private readonly int maxSize;
private readonly RoundArray<int> roundArray;
private int sum = 0;
public int Count => this.roundArray.Count;
public int Average => this.sum / this.Count;
public void Add(int value)
{
if (this.Count == this.maxSize)
{
// replace: remove the oldest value from the sum and add the new one
this.sum += value - this.roundArray.Oldest;
}
else
{
// still building: just add the new value to the sum
this.sum += value;
}
this.roundArray.Add(value);
}
}
Cumulative sums.
Compute a series of cumulative sums[1] for every block of ~1000 or so elements. (It could be less; however, 500 or 1000 is not that much of a difference, and this will be more comfortable.) You want to hold every block as long as at least one element inside is still relevant. Then it can be recycled.[2]
When you need your current sum and you are within one block, your desired sum is: block[max_index] - block[last_relevant_number].
For the case when you are at the borderline of two blocks b1, b2 in this order, your desired sum is:
b1[b1.length - 1] - b1[last_relevant_number] + b2[max_index]
And we are done. The main advantage of this approach is that you don't need to know beforehand how many elements you want to keep, and you can compute the result on the go (a short sketch follows the footnotes).
You also don't need to handle the removal of the elements as you will naturally overwrite them when you recycle the segment - keeping the indices is all you need.
Example: let us have a constant time series ts = [1,1,1, .... 1]. The cumulative sums of the series will be cumsum = [1,2,3 ... n]. The sum from the i-th to the j-th (inclusive) element of ts will be cumsum[j] - cumsum[i - 1] = j - i + 1. For i = 5, j = 6 it will be 6 - 4 = 2, which is correct.
[1] For array [1,2,3,4,5] these would be [1,3,6,10,15] - just for the sake of completeness.
[2] Since you mentioned ~500 elements, two blocks should be enough.
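A short C# illustration of the cumulative-sum trick, matching the worked example above (block handling and recycling are omitted; this only shows how a window sum becomes a difference of two precomputed values):
// sum of ts[i..j] (1-based, inclusive) == cumsum[j] - cumsum[i - 1]
double[] ts = { 1, 1, 1, 1, 1, 1, 1 };
double[] cumsum = new double[ts.Length + 1]; // cumsum[0] = 0 makes the i = 1 case uniform
for (int k = 0; k < ts.Length; k++)
    cumsum[k + 1] = cumsum[k] + ts[k];

int i = 5, j = 6;
double windowSum = cumsum[j] - cumsum[i - 1]; // = 6 - 4 = 2, as in the example
// at a block boundary the same difference is simply split across the two blocks:
// b1[b1.Length - 1] - b1[last_relevant_number] + b2[max_index]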
I have a time series in the form of a SortedList<DateTime, double>. I would like to calculate a moving average of this series. I can do this using simple for loops. I was wondering if there is a better way to do this using LINQ.
my version:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
namespace ConsoleApplication1
{
class Program
{
static void Main(string[] args)
{
var mySeries = new SortedList<DateTime, double>();
mySeries.Add(new DateTime(2011, 01, 1), 10);
mySeries.Add(new DateTime(2011, 01, 2), 25);
mySeries.Add(new DateTime(2011, 01, 3), 30);
mySeries.Add(new DateTime(2011, 01, 4), 45);
mySeries.Add(new DateTime(2011, 01, 5), 50);
mySeries.Add(new DateTime(2011, 01, 6), 65);
var calcs = new calculations();
var avg = calcs.MovingAverage(mySeries, 3);
foreach (var item in avg)
{
Console.WriteLine("{0} {1}", item.Key, item.Value);
}
}
}
class calculations
{
public SortedList<DateTime, double> MovingAverage(SortedList<DateTime, double> series, int period)
{
var result = new SortedList<DateTime, double>();
for (int i = 0; i < series.Count(); i++)
{
if (i >= period - 1)
{
double total = 0;
for (int x = i; x > (i - period); x--)
total += series.Values[x];
double average = total / period;
result.Add(series.Keys[i], average);
}
}
return result;
}
}
}
In order to achieve asymptotic performance of O(n) (as the hand-coded solution does), you could use the Aggregate function like in
series.Skip(period-1).Aggregate(
new {
Result = new SortedList<DateTime, double>(),
Working = new List<double>(series.Take(period-1).Select(item => item.Value))
},
(list, item)=>{
list.Working.Add(item.Value);
list.Result.Add(item.Key, list.Working.Average());
list.Working.RemoveAt(0);
return list;
}
).Result;
The accumulated value (implemented as an anonymous type) contains two fields: Result contains the result list built up so far. Working contains the last period-1 elements. The aggregate function adds the current value to the Working list, builds the current average and adds it to the result, and then removes the first (i.e. oldest) value from the working list.
The "seed" (i.e. the starting value for the accumulation) is built by putting the first period-1 elements into Working and initializing Result to an empty list.
Consequently the aggregation starts with element period (by skipping (period-1) elements at the beginning).
In functional programming this is a typical usage pattern for the aggregate (or fold) function, btw.
Two remarks:
The solution is not "functionally" clean in that the same list objects (Working and Result) are reused in every step. I'm not sure if that might cause problems if some future compilers try to parallelize the Aggregate function automatically (on the other hand I'm also not sure if that's possible after all...). A purely functional solution should "create" new lists at every step.
Also note that C# lacks powerful list expressions. In some hypothetical Python-C#-mixed pseudocode one could write the aggregation function like
(list, item)=>
new {
Result = list.Result + [(item.Key, (list.Working+[item.Value]).Average())],
Working=list.Working[1::]+[item.Value]
}
which would be a bit more elegant in my humble opinion :)
For the most efficient way possible to compute a Moving Average with LINQ, you shouldn't use LINQ!
Instead I propose creating a helper class which computes a moving average in the most efficient way possible (using a circular buffer and causal moving average filter), then an extension method to make it accessible to LINQ.
First up, the moving average
public class MovingAverage
{
private readonly int _length;
private int _circIndex = -1;
private bool _filled;
private double _current = double.NaN;
private readonly double _oneOverLength;
private readonly double[] _circularBuffer;
private double _total;
public MovingAverage(int length)
{
_length = length;
_oneOverLength = 1.0 / length;
_circularBuffer = new double[length];
}
public MovingAverage Update(double value)
{
double lostValue = _circularBuffer[_circIndex];
_circularBuffer[_circIndex] = value;
// Maintain totals for Push function
_total += value;
_total -= lostValue;
// If not yet filled, just return. Current value should be double.NaN
if (!_filled)
{
_current = double.NaN;
return this;
}
// Compute the average
double average = 0.0;
for (int i = 0; i < _circularBuffer.Length; i++)
{
average += _circularBuffer[i];
}
_current = average * _oneOverLength;
return this;
}
public MovingAverage Push(double value)
{
// Apply the circular buffer
if (++_circIndex == _length)
{
_circIndex = 0;
}
double lostValue = _circularBuffer[_circIndex];
_circularBuffer[_circIndex] = value;
// Compute the average
_total += value;
_total -= lostValue;
// If not yet filled, just return. Current value should be double.NaN
if (!_filled && _circIndex != _length - 1)
{
_current = double.NaN;
return this;
}
else
{
// Set a flag to indicate this is the first time the buffer has been filled
_filled = true;
}
_current = _total * _oneOverLength;
return this;
}
public int Length { get { return _length; } }
public double Current { get { return _current; } }
}
This class provides a very fast and lightweight implementation of a MovingAverage filter. It creates a circular buffer of Length N and computes one add, one subtract and one multiply per data-point appended, as opposed to the N multiply-adds per point for the brute force implementation.
Next, to LINQ-ify it!
internal static class MovingAverageExtensions
{
public static IEnumerable<double> MovingAverage<T>(this IEnumerable<T> inputStream, Func<T, double> selector, int period)
{
var ma = new MovingAverage(period);
foreach (var item in inputStream)
{
ma.Push(selector(item));
yield return ma.Current;
}
}
public static IEnumerable<double> MovingAverage(this IEnumerable<double> inputStream, int period)
{
var ma = new MovingAverage(period);
foreach (var item in inputStream)
{
ma.Push(item);
yield return ma.Current;
}
}
}
The above extension methods wrap the MovingAverage class and allow insertion into an IEnumerable stream.
Now to use it!
int period = 50;
// Simply filtering a list of doubles
IEnumerable<double> inputDoubles;
IEnumerable<double> outputDoubles = inputDoubles.MovingAverage(period);
// Or, use a selector to filter T into a list of doubles
IEnumerable<Point> inputPoints; // assuming you have initialised this
IEnumerable<double> smoothedYValues = inputPoints.MovingAverage(pt => pt.Y, period);
You already have an answer showing you how you can use LINQ, but frankly I wouldn't use LINQ here, as it will most likely perform poorly compared to your current solution, and your existing code is already clear.
However instead of calculating the total of the previous period elements on every step, you can keep a running total and adjust it on each iteration. That is, change this:
total = 0;
for (int x = i; x > (i - period); x--)
total += series.Values[x];
to this:
if (i >= period) {
total -= series.Values[i - period];
}
total += series.Values[i];
This will mean that your code will take the same amount of time to execute regardless of the size of period. The complete method with this change applied is sketched below.
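For completeness, here is what the whole method might look like with that change (a sketch that keeps the structure of the original MovingAverage method):
public SortedList<DateTime, double> MovingAverage(SortedList<DateTime, double> series, int period)
{
    var result = new SortedList<DateTime, double>();
    double total = 0; // running total maintained across iterations
    for (int i = 0; i < series.Count; i++)
    {
        if (i >= period)
        {
            total -= series.Values[i - period]; // drop the value that left the window
        }
        total += series.Values[i];              // add the newly entered value
        if (i >= period - 1)
        {
            result.Add(series.Keys[i], total / period);
        }
    }
    return result;
}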
This block
double total = 0;
for (int x = i; x > (i - period); x--)
total += series.Values[x];
double average = total / period;
can be rewritten as:
double average = series.Values.Skip(i - period + 1).Take(period).Sum() / period;
Your method may look like:
series.Skip(period - 1)
.Select((item, index) =>
new
{
item.Key,
Value = series.Values.Skip(index).Take(period).Sum() / period
});
As you can see, LINQ is very expressive. I recommend starting with a tutorial like Introducing LINQ and 101 LINQ Samples.
To do this in a more functional way, you'd need a Scan method which exists in Rx but not in LINQ.
Let's look at how it would look if we had a Scan method:
var delta = 3;
var series = new [] {1.1, 2.5, 3.8, 4.8, 5.9, 6.1, 7.6};
var seed = series.Take(delta).Average();
var smas = series
.Skip(delta)
.Zip(series, Tuple.Create)
.Scan(seed, (sma, values)=>sma - (values.Item2/delta) + (values.Item1/delta));
smas = Enumerable.Repeat(0.0, delta-1).Concat(new[]{seed}).Concat(smas);
And here's the scan method, taken and adjusted from here:
public static IEnumerable<TAccumulate> Scan<TSource, TAccumulate>(
this IEnumerable<TSource> source,
TAccumulate seed,
Func<TAccumulate, TSource, TAccumulate> accumulator
)
{
if (source == null) throw new ArgumentNullException("source");
if (seed == null) throw new ArgumentNullException("seed");
if (accumulator == null) throw new ArgumentNullException("accumulator");
using (var i = source.GetEnumerator())
{
if (!i.MoveNext())
{
throw new InvalidOperationException("Sequence contains no elements");
}
var acc = accumulator(seed, i.Current);
while (i.MoveNext())
{
yield return acc;
acc = accumulator(acc, i.Current);
}
yield return acc;
}
}
This should have better performance than the brute force method since we are using a running total to calculate the SMA.
What's going on here?
To start we need to calculate the first period which we call seed here. Then, every subsequent value we calculate from the accumulated seed value. To do that we need the old value (that is t-delta) and the newest value for which we zip together the series, once from the beginning and once shifted by the delta.
At the end we do some cleanup by adding zeroes for the length of the first period and adding the initial seed value.
Another option is to use MoreLINQ's Windowed method, which simplifies the code significantly:
var averaged = mySeries.Windowed(period).Select(window => window.Average(keyValuePair => keyValuePair.Value));
I use this code to calculate SMA:
private void calculateSimpleMA(decimal[] values, out decimal[] buffer)
{
int period = values.Count(); // gets Period (assuming Period=Values-Array-Size)
buffer = new decimal[period]; // initializes buffer array
var sma = SMA(period); // gets SMA function
for (int i = 0; i < period; i++)
buffer[i] = sma(values[i]); // fills buffer with SMA calculation
}
static Func<decimal, decimal> SMA(int p)
{
Queue<decimal> s = new Queue<decimal>(p);
return (x) =>
{
if (s.Count >= p)
{
s.Dequeue();
}
s.Enqueue(x);
return s.Average();
};
}
Here is an extension method:
public static IEnumerable<double> MovingAverage(this IEnumerable<double> source, int period)
{
if (source is null)
{
throw new ArgumentNullException(nameof(source));
}
if (period < 1)
{
throw new ArgumentOutOfRangeException(nameof(period));
}
return Core();
IEnumerable<double> Core()
{
var sum = 0.0;
var buffer = new double[period];
var n = 0;
foreach (var x in source)
{
n++;
sum += x;
var index = n % period;
if (n >= period)
{
sum -= buffer[index];
yield return sum / period;
}
buffer[index] = x;
}
}
}
I need to generate bins for the purposes of calculating a histogram. Language is C#. Basically I need to take in an array of decimal numbers and generate a histogram plot out of those.
Haven't been able to find a decent library to do this outright so now I'm just looking for either a library or an algorithm to help me do the binning of the data.
So...
Are there any C# libraries out there that will take in an array of decimal data and output a binned histogram?
Is there a generic algorithm for building the bins to be used in generating a histogram?
Here is a simple bucket function I use. Sadly, .NET generics doesn't support a numerical type constraint, so you will have to implement a different version of the following function for decimal, int, double, etc. (a generic-math alternative for newer .NET is sketched after the code).
public static List<int> Bucketize(this IEnumerable<decimal> source, int totalBuckets)
{
var min = source.Min();
var max = source.Max();
var buckets = new List<int>(new int[totalBuckets]); // pre-filled with zeros so the indexer below works
var bucketSize = (max - min) / totalBuckets;
foreach (var value in source)
{
int bucketIndex = 0;
if (bucketSize > 0)
{
bucketIndex = (int)((value - min) / bucketSize);
if (bucketIndex == totalBuckets)
{
bucketIndex--;
}
}
buckets[bucketIndex]++;
}
return buckets;
}
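As an aside, if you can target .NET 7 or later, generic math (System.Numerics.INumber<T>) removes the need for one copy per numeric type. This is an assumption about your target framework rather than part of the answer above; note the values are converted to double internally, which can lose precision for very large decimal inputs:
using System.Collections.Generic;
using System.Linq;
using System.Numerics;

public static class HistogramExtensions
{
    public static List<int> Bucketize<T>(this IEnumerable<T> source, int totalBuckets)
        where T : INumber<T>
    {
        // convert once so the bucket arithmetic is identical for int, double, decimal, ...
        var values = source.Select(v => double.CreateChecked(v)).ToList();

        var min = values.Min();
        var max = values.Max();
        var buckets = new List<int>(new int[totalBuckets]);
        var bucketSize = (max - min) / totalBuckets;

        foreach (var value in values)
        {
            int bucketIndex = 0;
            if (bucketSize > 0.0)
            {
                bucketIndex = (int)((value - min) / bucketSize);
                if (bucketIndex == totalBuckets)
                {
                    bucketIndex--; // the max value lands in the last bucket
                }
            }
            buckets[bucketIndex]++;
        }
        return buckets;
    }
}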
I got odd results using @JakePearson's accepted answer. It has to do with an edge case.
Here is the code I used to test his method. I changed the extension method ever so slightly, returning an int[] and accepting double instead of decimal.
public partial class Form1 : Form
{
public Form1()
{
InitializeComponent();
Random rand = new Random(1325165);
int maxValue = 100;
int numberOfBuckets = 100;
List<double> values = new List<double>();
for (int i = 0; i < 10000000; i++)
{
double value = rand.NextDouble() * (maxValue+1);
values.Add(value);
}
int[] bins = values.Bucketize(numberOfBuckets);
PointPairList points = new PointPairList();
for (int i = 0; i < numberOfBuckets; i++)
{
points.Add(i, bins[i]);
}
zedGraphControl1.GraphPane.AddBar("Random Points", points,Color.Black);
zedGraphControl1.GraphPane.YAxis.Title.Text = "Count";
zedGraphControl1.GraphPane.XAxis.Title.Text = "Value";
zedGraphControl1.AxisChange();
zedGraphControl1.Refresh();
}
}
public static class Extension
{
public static int[] Bucketize(this IEnumerable<double> source, int totalBuckets)
{
var min = source.Min();
var max = source.Max();
var buckets = new int[totalBuckets];
var bucketSize = (max - min) / totalBuckets;
foreach (var value in source)
{
int bucketIndex = 0;
if (bucketSize > 0.0)
{
bucketIndex = (int)((value - min) / bucketSize);
if (bucketIndex == totalBuckets)
{
bucketIndex--;
}
}
buckets[bucketIndex]++;
}
return buckets;
}
}
Everything works well when using 10,000,000 random double values between 0 and 100 (exclusive). Each bucket has roughly the same number of values, which makes sense given that Random.NextDouble returns a uniform distribution.
But when I changed the value generation line from
double value = rand.NextDouble() * (maxValue+1);
to
double value = rand.Next(0, maxValue + 1);
and you get the following result, which double counts the last bucket.
It appears that when a value is the same as one of the boundaries of a bucket, the code as written puts the value in the incorrect bucket. This artifact doesn't show up with random double values, as the chance of a random number being exactly equal to a bucket boundary is rare and wouldn't be obvious.
The way I corrected this is to define what side of the bucket boundary is inclusive vs. exclusive.
Think of
0< x <=1 1< x <=2 ... 99< x <=100
vs.
0<= x <1 1<= x <2 ... 99<= x <100
You cannot have both boundaries inclusive, as the method wouldn't know which bucket to put it in if you have a value that is exactly equal to a boundary.
public enum BucketizeDirectionEnum
{
LowerBoundInclusive,
UpperBoundInclusive
}
public static int[] Bucketize(this IList<double> source, int totalBuckets, BucketizeDirectionEnum inclusivity = BucketizeDirectionEnum.UpperBoundInclusive)
{
var min = source.Min();
var max = source.Max();
var buckets = new int[totalBuckets];
var bucketSize = (max - min) / totalBuckets;
if (inclusivity == BucketizeDirectionEnum.LowerBoundInclusive)
{
foreach (var value in source)
{
int bucketIndex = (int)((value - min) / bucketSize);
if (bucketIndex == totalBuckets)
continue;
buckets[bucketIndex]++;
}
}
else
{
foreach (var value in source)
{
int bucketIndex = (int)Math.Ceiling((value - min) / bucketSize) - 1;
if (bucketIndex < 0)
continue;
buckets[bucketIndex]++;
}
}
return buckets;
}
The only issue now is if the input dataset has a lot of min and max values, the binning method will exclude many of those values and the resulting graph will misrepresent the dataset.
I need to know if a number, compared to a set of numbers, is outside of 1 standard deviation from the mean, etc.
While the sum of squares algorithm works fine most of the time, it can cause big trouble if you are dealing with very large numbers. You basically may end up with a negative variance...
Plus, don't ever, ever compute a^2 as pow(a, 2); a * a is almost certainly faster.
By far the best way of computing a standard deviation is Welford's method. My C# is very rusty, but it could look something like:
public static double StandardDeviation(List<double> valueList)
{
double M = 0.0;
double S = 0.0;
int k = 1;
foreach (double value in valueList)
{
double tmpM = M;
M += (value - tmpM) / k;
S += (value - tmpM) * (value - M);
k++;
}
return Math.Sqrt(S / (k-2));
}
If you have the whole population (as opposed to a sample population), then use return Math.Sqrt(S / (k-1));.
EDIT: I've updated the code according to Jason's remarks...
EDIT: I've also updated the code according to Alex's remarks...
10 times faster solution than Jaime's, but be aware that,
as Jaime pointed out:
"While the sum of squares algorithm works fine most of the time, it
can cause big trouble if you are dealing with very large numbers. You
basically may end up with a negative variance"
If you think you are dealing with very large numbers or a very large quantity of numbers, you should calculate using both methods, if the results are equal, you know for sure that you can use "my" method for your case.
public static double StandardDeviation(double[] data)
{
double stdDev = 0;
double sumAll = 0;
double sumAllQ = 0;
//Sum of x and sum of x²
for (int i = 0; i < data.Length; i++)
{
double x = data[i];
sumAll += x;
sumAllQ += x * x;
}
//Mean (not used here)
//double mean = 0;
//mean = sumAll / (double)data.Length;
//Standard deviation
stdDev = System.Math.Sqrt(
(sumAllQ -
(sumAll * sumAll) / data.Length) *
(1.0d / (data.Length - 1))
);
return stdDev;
}
The accepted answer by Jaime is great, except you need to divide by k-2 in the last line (you need to divide by "number_of_elements-1").
Better yet, start k at 0:
public static double StandardDeviation(List<double> valueList)
{
double M = 0.0;
double S = 0.0;
int k = 0;
foreach (double value in valueList)
{
k++;
double tmpM = M;
M += (value - tmpM) / k;
S += (value - tmpM) * (value - M);
}
return Math.Sqrt(S / (k-1));
}
The Math.NET library provides this for you out of the box.
PM> Install-Package MathNet.Numerics
var populationStdDev = new List<double> { 1d, 2d, 3d, 4d, 5d }.PopulationStandardDeviation();
var sampleStdDev = new List<double> { 2d, 3d, 4d }.StandardDeviation();
See PopulationStandardDeviation for more information.
Code snippet:
public static double StandardDeviation(List<double> valueList)
{
if (valueList.Count < 2) return 0.0;
double sumOfSquares = 0.0;
double average = valueList.Average(); //.NET 3.0
foreach (double value in valueList)
{
sumOfSquares += Math.Pow((value - average), 2);
}
return Math.Sqrt(sumOfSquares / (valueList.Count - 1));
}
You can avoid making two passes over the data by accumulating the mean and mean-square (a C# version is sketched after the pseudocode and note below)
cnt = 0
mean = 0
meansqr = 0
loop over array
cnt++
mean += value
meansqr += value*value
mean /= cnt
meansqr /= cnt
and forming
sigma = sqrt(meansqr - mean^2)
A factor of cnt/(cnt-1) is often appropriate as well.
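In C#, the pseudocode above could look roughly like this (a sketch; the cnt/(cnt-1) sample correction from the note is applied at the end):
public static double StandardDeviationOnePass(IEnumerable<double> values)
{
    long cnt = 0;
    double mean = 0.0, meansqr = 0.0;
    foreach (var value in values)
    {
        cnt++;
        mean += value;
        meansqr += value * value;
    }
    if (cnt < 2) return 0.0;                   // not enough data for a meaningful deviation
    mean /= cnt;
    meansqr /= cnt;
    double variance = meansqr - mean * mean;   // population variance
    variance *= cnt / (double)(cnt - 1);       // sample correction factor cnt/(cnt-1)
    return Math.Sqrt(variance);
}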
BTW -- the first pass over the data in Demi's and McWafflestix's answers is hidden in the calls to Average. That kind of thing is certainly trivial on a small list, but if the list exceeds the size of the cache, or even the working set, this gets to be a big deal.
I found that Rob's helpful answer didn't quite match what I was seeing using Excel. To match Excel, I passed the average of valueList into the StandardDeviation calculation.
Here are my two cents... clearly you could calculate the moving average (ma) from valueList inside the function - but I happen to have it already before needing the standard deviation.
public double StandardDeviation(List<double> valueList, double ma)
{
double xMinusMovAvg = 0.0;
double Sigma = 0.0;
int k = valueList.Count;
foreach (double value in valueList){
xMinusMovAvg = value - ma;
Sigma = Sigma + (xMinusMovAvg * xMinusMovAvg);
}
return Math.Sqrt(Sigma / (k - 1));
}
With Extension methods.
using System;
using System.Collections.Generic;
namespace SampleApp
{
internal class Program
{
private static void Main()
{
List<double> data = new List<double> {1, 2, 3, 4, 5, 6};
double mean = data.Mean();
double variance = data.Variance();
double sd = data.StandardDeviation();
Console.WriteLine("Mean: {0}, Variance: {1}, SD: {2}", mean, variance, sd);
Console.WriteLine("Press any key to continue...");
Console.ReadKey();
}
}
public static class MyListExtensions
{
public static double Mean(this List<double> values)
{
return values.Count == 0 ? 0 : values.Mean(0, values.Count);
}
public static double Mean(this List<double> values, int start, int end)
{
double s = 0;
for (int i = start; i < end; i++)
{
s += values[i];
}
return s / (end - start);
}
public static double Variance(this List<double> values)
{
return values.Variance(values.Mean(), 0, values.Count);
}
public static double Variance(this List<double> values, double mean)
{
return values.Variance(mean, 0, values.Count);
}
public static double Variance(this List<double> values, double mean, int start, int end)
{
double variance = 0;
for (int i = start; i < end; i++)
{
variance += Math.Pow((values[i] - mean), 2);
}
int n = end - start;
if (start > 0) n -= 1;
return variance / (n);
}
public static double StandardDeviation(this List<double> values)
{
return values.Count == 0 ? 0 : values.StandardDeviation(0, values.Count);
}
public static double StandardDeviation(this List<double> values, int start, int end)
{
double mean = values.Mean(start, end);
double variance = values.Variance(mean, start, end);
return Math.Sqrt(variance);
}
}
}
/// <summary>
/// Calculates standard deviation, same as MATLAB std(X,0) function
/// <seealso cref="http://www.mathworks.co.uk/help/techdoc/ref/std.html"/>
/// </summary>
/// <param name="values">enumerable data</param>
/// <returns>Standard deviation</returns>
public static double GetStandardDeviation(this IEnumerable<double> values)
{
//validation
if (values == null)
throw new ArgumentNullException();
int length = values.Count();
//saves from division by 0
if (length == 0 || length == 1)
return 0;
double sum = 0.0, sum2 = 0.0;
for (int i = 0; i < length; i++)
{
double item = values.ElementAt(i);
sum += item;
sum2 += item * item;
}
return Math.Sqrt((sum2 - sum * sum / length) / (length - 1));
}
The trouble with all the other answers is that they assume you have your
data in a big array. If your data is coming in on the fly, this would be
a better approach. This class works regardless of how or if you store your data. It also gives you the choice of Welford's method (called "Waldorf" in the code below) or the sum-of-squares method. Both methods work using a single pass.
public final class StatMeasure {
private StatMeasure() {}
public interface Stats1D {
/** Add a value to the population */
void addValue(double value);
/** Get the mean of all the added values */
double getMean();
/** Get the standard deviation from a sample of the population. */
double getStDevSample();
/** Gets the standard deviation for the entire population. */
double getStDevPopulation();
}
private static class WaldorfPopulation implements Stats1D {
private double mean = 0.0;
private double sSum = 0.0;
private int count = 0;
@Override
public void addValue(double value) {
double tmpMean = mean;
double delta = value - tmpMean;
mean += delta / ++count;
sSum += delta * (value - mean);
}
@Override
public double getMean() { return mean; }
@Override
public double getStDevSample() { return Math.sqrt(sSum / (count - 1)); }
@Override
public double getStDevPopulation() { return Math.sqrt(sSum / (count)); }
}
private static class StandardPopulation implements Stats1D {
private double sum = 0.0;
private double sumOfSquares = 0.0;
private int count = 0;
@Override
public void addValue(double value) {
sum += value;
sumOfSquares += value * value;
count++;
}
@Override
public double getMean() { return sum / count; }
@Override
public double getStDevSample() {
return (float) Math.sqrt((sumOfSquares - ((sum * sum) / count)) / (count - 1));
}
@Override
public double getStDevPopulation() {
return (float) Math.sqrt((sumOfSquares - ((sum * sum) / count)) / count);
}
}
/**
* Returns a way to measure a population of data using Waldorf's method.
* This method is better if your population or values are so large that
* the sum of x-squared may overflow. It's also probably faster if you
* need to recalculate the mean and standard deviation continuously,
* for example, if you are continually updating a graphic of the data as
* it flows in.
*
* @return A Stats1D object that uses Waldorf's method.
*/
public static Stats1D getWaldorfStats() { return new WaldorfPopulation(); }
/**
* Return a way to measure the population of data using the sum-of-squares
* method. This is probably faster than Waldorf's method, but runs the
* risk of data overflow.
*
* @return A Stats1D object that uses the sum-of-squares method
*/
public static Stats1D getSumOfSquaresStats() { return new StandardPopulation(); }
}
We may be able to use the statistics module in Python. It has stdev() and pstdev() functions to calculate the standard deviation of a sample and a population respectively.
details here: https://www.geeksforgeeks.org/python-statistics-stdev/
import statistics as st
print(st.pstdev(dataframe['column name']))
This is the population standard deviation:
private double calculateStdDev(List<double> values)
{
double average = values.Average();
return Math.Sqrt((values.Select(val => (val - average) * (val - average)).Sum()) / values.Count);
}
For the sample standard deviation, just change [values.Count] to [values.Count - 1] in the above code.
Make sure you don't have only 1 data point in your set.
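For reference, the sample variant with that single change (plus a guard for fewer than two points) might look like this sketch:
private double calculateSampleStdDev(List<double> values)
{
    if (values.Count < 2) return 0.0; // the sample standard deviation needs at least two points
    double average = values.Average();
    return Math.Sqrt(values.Select(val => (val - average) * (val - average)).Sum() / (values.Count - 1));
}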