Comparing the multithreaded performance of various test cases - c#

Recently I have been doing some tests with C# multi-threaded performance, below are the tests and the result:
Note that each test runs three times over an array of 1,000,000 elements: the first run is on the main thread, the second uses Parallel.For, and the third uses a custom parallel function that starts as many tasks as Environment.ProcessorCount and distributes the load across the tasks as evenly as possible. I ran the tests on an i7-10700, which has 8 cores and 16 threads. I'm using .NET 6 and Visual Studio 2022.
My test cases:
Simple Write: write the index value for every element in the array
Write Random: write a random value for every element in the array
Date struct: create a Date object for every element in the array (where Date is struct)
Date class: create a Date object for every element in the array (where Date is class)
Lookup: run multiple lookups to fetch the value that will be written to the array element
Min: find the min value in the array
Min with wrapper: find the min value in the array, wrap the min values, so they don't share the cache line
My measured times:
routine
single-thread
multi-thread (parallel for)
multi-thread (custom)
Simple Write
6.4779 ms
10.5307 ms
2.751 ms
Write Random
8.2883 ms
20.1144 ms
5.7795 ms
Date struct
19.0835 ms
76.5366 ms
11.0556 ms
Date class
67.922 ms
101.3405 ms
71.2441 ms
Lookup
4.4336 ms
2.293 ms
1.3446 ms
Min
3.4373 ms
6.9232 ms
1.2026 ms
Min with wrapper
3.5319 ms
5.834 ms
1.3308 ms
I attached the code to my test cases, my questions are:
why is the Parallel.For run so poorly?
In tests one and two, I only achieved around 2X the performance, is there a better way to achieve more?
this is just an observation: from tests three and four we can see that object allocation on the heap doesn't work well with multi-threading.
test number five is where multi-threading shines: we get around 4x the performance of the single thread, though I'm not sure why this is the case.
shouldn't I be reaching 16x performance, am I doing anything wrong, is there anything I should be aware of?
using System.Diagnostics;
public static class Ext
{
    /// <summary>
    /// Populates every slot of <paramref name="array"/> by invoking
    /// <paramref name="cons"/> once per element, front to back, and returns
    /// the same array instance so calls can be chained fluently.
    /// </summary>
    public static T[] Fill<T>(this T[] array, Func<T> cons)
    {
        int count = array.Length;
        int slot = 0;
        while (slot < count)
        {
            array[slot] = cons();
            slot++;
        }
        return array;
    }
}
public static class App
{
    /// <summary>Entry point: runs every benchmark scenario once, then waits for a key press.</summary>
    public static void Main()
    {
        Simple_Write_Test();
        Rand_Test();
        DateStruct_Test();
        DateClass_Test();
        Lookup_Test();
        Min_Test();
        Min_Wrapper_Test();
        Console.ReadKey();
    }

    // Number of elements processed by every scenario.
    const int size = 1000000;

    /// <summary>Writes the loop index into three int arrays (pure memory-write throughput).</summary>
    public static void Simple_Write_Test()
    {
        RunTest("Simple Write", size, () =>
        {
            return (new int[size], new int[size], new int[size]);
        },
        (data, i) =>
        {
            data.Item1[i] = i;
            data.Item2[i] = i;
            data.Item3[i] = i;
        },
        (data, i) =>
        {
            data.Item1[i] = i;
            data.Item2[i] = i;
            data.Item3[i] = i;
        },
        (data, i, id) =>
        {
            data.Item1[i] = i;
            data.Item2[i] = i;
            data.Item3[i] = i;
        });
    }

    /// <summary>
    /// Writes random values. The Parallel.For variant serializes on a lock because
    /// Random is not thread-safe; the custom variant gives each task its own
    /// Random instance (indexed by task id) so no synchronization is needed.
    /// </summary>
    public static void Rand_Test()
    {
        var rand_lock = new object();
        RunTest("Write Random", size, () =>
        {
            return (new int[size],
                new Random[Environment.ProcessorCount].Fill(() => new Random()));
        },
        (data, i) =>
        {
            var rand = data.Item2[0];
            data.Item1[i] = rand.Next();
        },
        (data, i) =>
        {
            int value;
            lock (rand_lock)
            {
                value = data.Item2[0].Next();
            }
            data.Item1[i] = value;
        },
        (data, i, id) =>
        {
            var rand = data.Item2[id];
            data.Item1[i] = rand.Next();
        });
    }

    // Value-type date: array element assignment copies 12 bytes in place, no heap allocation.
    struct DateStruct
    {
        public int year;
        public int month;
        public int day;
        public DateStruct(int year, int month, int day)
        {
            this.year = year;
            this.month = month;
            this.day = day;
        }
    }

    /// <summary>Fills an array of struct dates from per-task Random instances.</summary>
    public static void DateStruct_Test()
    {
        var rand_lock = new object();
        RunTest("Date struct", size, () =>
        {
            return (new DateStruct[size], new Random[Environment.ProcessorCount].Fill(() => new Random()));
        },
        (data, i) =>
        {
            var rand = data.Item2[0];
            data.Item1[i] = new DateStruct(rand.Next(), rand.Next(), rand.Next());
        },
        (data, i) =>
        {
            DateStruct value;
            lock (rand_lock)
            {
                var rand = data.Item2[0];
                value = new DateStruct(rand.Next(), rand.Next(), rand.Next());
            }
            data.Item1[i] = value;
        },
        (data, i, id) =>
        {
            var rand = data.Item2[id];
            data.Item1[i] = new DateStruct(rand.Next(), rand.Next(), rand.Next());
        });
    }

    // Reference-type date: each element requires a heap allocation.
    class DateClass
    {
        public int year;
        public int month;
        public int day;
        public DateClass(int year, int month, int day)
        {
            this.year = year;
            this.month = month;
            this.day = day;
        }
    }

    /// <summary>Same as DateStruct_Test but allocating one object per element on the heap.</summary>
    public static void DateClass_Test()
    {
        var rand_lock = new object();
        RunTest("Date Class", size, () =>
        {
            return (new DateClass[size], new Random[Environment.ProcessorCount].Fill(() => new Random()));
        },
        (data, i) =>
        {
            var rand = data.Item2[0];
            data.Item1[i] = new DateClass(rand.Next(), rand.Next(), rand.Next());
        },
        (data, i) =>
        {
            DateClass value;
            lock (rand_lock)
            {
                var rand = data.Item2[0];
                value = new DateClass(rand.Next(), rand.Next(), rand.Next());
            }
            data.Item1[i] = value;
        },
        (data, i, id) =>
        {
            var rand = data.Item2[id];
            data.Item1[i] = new DateClass(rand.Next(), rand.Next(), rand.Next());
        });
    }

    /// <summary>
    /// Chained table lookups (Item1 indexes Item2 indexes Item3). Read-only shared
    /// data, so all three variants use the same body — no locking required.
    /// </summary>
    public static void Lookup_Test()
    {
        RunTest("Lookup", size, () =>
        {
            var rand = new Random();
            return (new int[size].Fill(() => rand.Next() % 100), new int[100].Fill(() => rand.Next() % 1000), new int[1000].Fill(() => rand.Next()));
        },
        (data, i) =>
        {
            data.Item1[i] = data.Item3[data.Item2[data.Item1[i]]];
        },
        (data, i) =>
        {
            data.Item1[i] = data.Item3[data.Item2[data.Item1[i]]];
        },
        (data, i, id) =>
        {
            data.Item1[i] = data.Item3[data.Item2[data.Item1[i]]];
        });
    }

    /// <summary>
    /// Finds the minimum element. The custom variant keeps one running minimum per
    /// task id; adjacent ints in the minValues array share cache lines, which is
    /// exactly what Min_Wrapper_Test tries to avoid.
    /// </summary>
    public static void Min_Test()
    {
        object min_lock = new object();
        RunTest("Min", size, () =>
        {
            var rand = new Random();
            var array = new int[size].Fill(() => rand.Next());
            var minValues = new int[Environment.ProcessorCount].Fill(() => int.MaxValue);
            var minIndices = new int[Environment.ProcessorCount].Fill(() => -1);
            return (array, minValues, minIndices);
        },
        (data, i) =>
        {
            var array = data.Item1;
            if (array[i] < data.Item2[0])
            {
                data.Item2[0] = array[i];
                data.Item3[0] = i;
            }
        },
        (data, i) =>
        {
            var array = data.Item1;
            // NOTE(review): the check happens outside the lock, so between the read
            // and the locked write another thread may have stored a smaller value;
            // this variant can report a non-minimal result.
            if (array[i] < data.Item2[0])
            {
                lock (min_lock)
                {
                    data.Item2[0] = array[i];
                    data.Item3[0] = i;
                }
            }
        },
        (data, i, id) =>
        {
            var array = data.Item1;
            if (array[i] < data.Item2[id])
            {
                data.Item2[id] = array[i];
                data.Item3[id] = i;
            }
        });
    }

    // Boxes a value in its own heap object so per-task minima land on distinct
    // cache lines instead of sharing one array segment.
    class Wrapper<T>
    {
        public T value;
        public Wrapper(T value)
        {
            this.value = value;
        }
        public static implicit operator Wrapper<T>(T value) => new Wrapper<T>(value);
    }

    /// <summary>Same as Min_Test but with each per-task minimum wrapped in its own object.</summary>
    public static void Min_Wrapper_Test()
    {
        object min_lock = new object();
        //wrap the values, so they don't share the cache line
        RunTest("Min with wrapper", size, () =>
        {
            var rand = new Random();
            var array = new int[size].Fill(() => rand.Next());
            var minValues = new Wrapper<int>[Environment.ProcessorCount].Fill(() => int.MaxValue);
            var minIndices = new Wrapper<int>[Environment.ProcessorCount].Fill(() => -1);
            return (array, minValues, minIndices);
        },
        (data, i) =>
        {
            var array = data.Item1;
            if (array[i] < data.Item2[0].value)
            {
                data.Item2[0] = array[i];
                data.Item3[0] = i;
            }
        },
        (data, i) =>
        {
            var array = data.Item1;
            if (array[i] < data.Item2[0].value)
            {
                lock (min_lock)
                {
                    data.Item2[0] = array[i];
                    data.Item3[0] = i;
                }
            }
        },
        (data, i, id) =>
        {
            var array = data.Item1;
            if (array[i] < data.Item2[id].value)
            {
                data.Item2[id] = array[i];
                data.Item3[id] = i;
            }
        });
    }

    /// <summary>
    /// Runs one scenario three ways — single-threaded, via Parallel.For, and via
    /// the custom partitioned <see cref="For"/> — timing each. The data is
    /// re-initialized before every variant so variants do not see each other's writes.
    /// </summary>
    public static void RunTest<T>(string name, int size,
        Func<T> DataIntializer,
        Action<T, int> excute,
        Action<T, int> parallelForExecute,
        Action<T, int, int> parallelExecute)
    {
        Log(name);
        T data = DataIntializer();
        var watch = Stopwatch.StartNew();
        for (int i = 0; i < size; i++)
            excute(data, i);
        // BUG FIX: ElapsedTicks / 10000f assumed Stopwatch.Frequency is exactly
        // 10 MHz, which is platform-dependent; Elapsed.TotalMilliseconds is correct
        // regardless of the timer frequency.
        Log($"single thread time {watch.Elapsed.TotalMilliseconds} ms");
        data = DataIntializer();
        watch.Restart();
        // BUG FIX: this loop previously invoked `excute` (the single-threaded,
        // lock-free delegate), so `parallelForExecute` was never used and the
        // Parallel.For timings measured a data race instead of the locked variant.
        Parallel.For(0, size, (i) =>
        {
            parallelForExecute(data, i);
        });
        Log($"multi thread (parallel for) time {watch.Elapsed.TotalMilliseconds} ms");
        data = DataIntializer();
        watch.Restart();
        For(size, (i, id) =>
        {
            parallelExecute(data, i, id);
        });
        Log($"multi thread (custom) time {watch.Elapsed.TotalMilliseconds} ms\n");
    }

    /// <summary>
    /// Minimal partitioned parallel-for: splits [0, size) into ProcessorCount
    /// contiguous ranges (remainder spread one extra element per leading task) and
    /// runs each range on its own task. The partition index is passed to the body
    /// as <c>id</c> so callers can keep per-task state without sharing.
    /// </summary>
    public static void For(int size, Action<int, int> excute)
    {
        Task[] tasks = new Task[Environment.ProcessorCount];
        int seg = size / Environment.ProcessorCount;
        int r = size - seg * Environment.ProcessorCount;
        int last = 0;
        for (int p = 0; p < tasks.Length; p++)
        {
            int start = last;
            int end = last + seg + (r-- > 0 ? 1 : 0);
            int id = p;
            last = end;
            tasks[p] = Task.Run(() =>
            {
                for (int i = start; i < end; i++)
                    excute(i, id);
            });
        }
        Task.WaitAll(tasks);
    }

    /// <summary>Writes one line to the console.</summary>
    public static void Log(object text)
    {
        Console.WriteLine(text);
    }
}

why is the Parallel.For run so poorly?
Because the work in each iteration is far too small, the overhead becomes dominant. What you should be doing is splitting the work into decently sized chunks, sort of like what you are doing in your custom "parallel for".
The body of your parallel loop should probably take on the order of microseconds. But there is a balance here: if you make your chunks too small you will suffer due to overheads starting to dominate; if you make your chunks too large you might not be able to use all cores efficiently. See also how to speed up small loop bodies
If you just want to limit the number of cores used you should be using the Parallel.For overload that takes a ParallelOption: new ParallelOptions(){MaxDegreeOfParallelism = Environment.ProcessorCount}.
In tests one and two, I only achieved around 2X the performance, is there a better way to achieve more?
My guess is that you are bottle necking on the delegate invocation. Writing to memory takes on the order of cycles, doing a method call is kind of expensive on these timescales. In some cases methods can be inlined to avoid this penalty, but I do not think this can be done with delegates in the current .Net runtime.
this is just an observation: from tests three and four we can see that object allocation on the heap doesn't work well with multi-threading.
That is not the observation I would make. My conclusion is that you should avoid using contested locks, and if you need something like a Random you should be using the Parallel.For overload that creates thread-local objects, so each thread gets its own object.
Heap allocations on multiple threads should perform fairly well, I believe each thread has a local segment it can allocate from, but I'm really not an expert on the inner details of the memory allocator. But I can say with confidence that you should not do high frequency allocations if you are writing high performance code, regardless of how many threads you are using.
shouldn't I be reaching 16x performance, am I doing anything wrong, is there anything I should be aware of?
You only have 8 real cores, the extra "threads" can help in some very specific circumstances, but typically something like 20% in the best case. Reaching even 8 times scaling should be considered good.
Keep in mind that cores are not the only resource that matters, there is also memory bandwidth, caches, cache coherency and many other things that might limit performance.
Note that you should probably be using Benchmark.Net to avoid common pitfalls when writing benchmarks.

Short Answer
The short answer is that the overhead of creating, managing and switching between the tasks is greater than any benefit in running your operation on multiple threads.
Simply splitting up any operation into tasks that run in parallel will not make it faster. The operation needs to be one that would benefit from being broken up. Generally this will be an operation that is sufficiently long or computationally intensive.
Secondly the way that an operation is split up is important. Splitting up an operation into 100 tasks may result in an overhead to creating and processing tasks that is greater than the benefit of using them but splitting the operation into 10 tasks might provide a performance benefit.
Using an analogy to explain the situation better: getting 1000 people (together at the same time) to complete a 100 piece puzzle is likely to be worse than having just 1 person complete it. But having 4 or 10 people complete the puzzle is likely to be better than 1.
Further Information
Note that the Parallel.For implementation and the custom parallel implementation are not equivalent.
The custom parallel implementation runs a range of iterations (within the for loop) per task but the Parallel.For implementation runs a single iteration per task (in practice Parallel.For may not use one Task for every iteration but that is the extreme scenario).
This means that the Parallel.For implementation will use many more tasks which will cause a lot more overhead for the creation and processing of the tasks.
As #Wyck mentioned in a comment, range partitioning can be used with Parallel.For implementation to mitigate this issue.
Whenever using tasks for performance you need to weigh up whether the overhead of using multiple tasks will be less than any reduction in duration achieved by splitting up an operation.
Sometimes you might need to adjust the number of times you split up an operation like in this example - there is a benefit, if you split it up into 8/16 Tasks but not (up to) 1000000 tasks.

Related

Thread local BigInteger variable in nested Parallel.For is not processed for aggregation with standard patterns?

I tried to refactor a nested sequential for loop into a nested Parallel.For loop.
But following the recommended parallel patterns and locks, the overall result was too low compared with the sequential result.
The problem was caused by a wrong or inconsistent use of BigInteger calculation methods.
For BigInteger you need to use ++-operator or BigInteger methods like BigInteger.Add().
My sources:
How to: Write a Parallel.For Loop with Thread-Local Variables
Threading in C# - Parallel Programming - The Parallel Class - For and ForEach
Please find sample code below:
internal static class Program
{
    // Guards aggregation of per-partition subtotals into the shared total.
    static Object lockObj = new Object();

    static void Main()
    {
        //target result: 575
        NestedLoopAggregationTest();
        return;
    }

    /// <summary>
    /// Counts the iterations of a triply-nested loop both sequentially and with
    /// nested Parallel.For loops using the thread-local-subtotal pattern, then
    /// displays the two totals so they can be compared (both should be 575).
    /// </summary>
    private static void NestedLoopAggregationTest()
    {
        BigInteger totalSequential = 0;
        BigInteger totalRecomandedPattern = 0;
        // (removed unused `totalAntiPattern` local from the original)
        const int iEnd1 = 5;
        const int iEnd2 = 10;
        const int iEnd3 = 15;
        for (int iCn1 = 1; iCn1 <= iEnd1; iCn1++)
        {
            for (int iCn2 = 1; iCn2 <= iEnd2; iCn2++)
            {
                for (int iCn3 = iCn2 - 1; iCn3 <= iEnd3; iCn3++)
                {
                    totalSequential++;
                }
            }
        }
        Parallel.For(1, iEnd1 + 1, (iCn1) =>
        {
            Parallel.For(1, iEnd2 + 1, (iCn2) =>
            {
                // Thread-local subtotal pattern: each partition accumulates into
                // `subtotal` without locking; the final delegate merges under lockObj.
                Parallel.For<BigInteger>(iCn2 - 1, iEnd3 + 1, () => 0, (iCn3, state, subtotal) =>
                {
                    //Solution:
                    //for BigInteger use ++-operator or BigInteger.Add()
                    subtotal = BigInteger.Add(subtotal, 1);
                    return subtotal;
                },
                (subtotal) =>
                {
                    lock (lockObj)
                    {
                        totalRecomandedPattern = BigInteger.Add(totalRecomandedPattern, subtotal);
                    }
                }
                );
            });
        });
        // BUG FIX: the original listing was truncated mid-statement here; the
        // MessageBox call is completed to show both totals.
        MessageBox.Show(totalSequential.ToString() + Environment.NewLine + totalRecomandedPattern.ToString());
    }
}
Your current parallel implementation requires a lock every time subtotal is modified in the inner loop. This modified approach is faster than both your serial and parallel implementaions because it avoids a lock in the innermost loop:
// Modified pattern: keep the innermost loop sequential and accumulate into a
// loop-body-local subtotal, so lockObj is taken only once per (iCn1, iCn2) pair
// instead of once per partition of the innermost range.
Parallel.For(1, iEnd1 + 1, (iCn1) =>
{
    Parallel.For(1, iEnd2 + 1, (iCn2) =>
    {
        BigInteger subtotal = 0;
        for (var iCnt3 = iCn2 - 1; iCnt3 < iEnd3 + 1; iCnt3++)
        {
            //Solution:
            //for BigInteger use ++-operator or BigInteger.Add()
            subtotal = BigInteger.Add(subtotal, 1);
        }
        // Single locked merge of this body's subtotal into the shared total.
        lock (lockObj)
        {
            totalRecomandedPatternModified = BigInteger.Add(totalRecomandedPatternModified, subtotal);
        }
    });
});
I increased each of the endpoints by a factor of 10 so the runtime is long enough to be measured on my hardware, then got the following average times:
Serial: 9ms
Parallel: 11ms
Modified: 2ms

Calculate max on a sliding window for TimeSeries

Input:
// One time-stamped sample. Downstream code normalizes Value by the maximum
// Value observed over the preceding 12 months.
public class MyObject
{
    // Sample magnitude (test data uses rnd.NextDouble(), i.e. [0, 1)).
    public double Value { get; set; }
    // Timestamp of the sample; test data is generated at 15-minute steps.
    public DateTime Date { get; set; }
}
Method to generate test objects:
/// <summary>
/// Builds a chronologically ordered test series: up to 50,000 samples at
/// 15-minute steps starting 2021-01-01, with roughly 25% of the slots skipped
/// to simulate gaps in real data.
/// </summary>
public static MyObject[] GetTestObjects()
{
    var random = new Random();
    var start = new DateTime(2021, 1, 1, 0, 0, 0);
    var samples = new List<MyObject>();
    for (int step = 0; step < 50000; step++)
    {
        //this is to simulate real data having gaps
        if (random.Next(100) < 25)
        {
            continue;
        }
        samples.Add(new MyObject()
        {
            Value = random.NextDouble(),
            Date = start.AddMinutes(15 * step)
        });
    }
    return samples.ToArray();
}
Given this I require to calculate maximum Value for previous 12 month for each myObject. I could just think of doing this InParallel, but maybe there is an optimized solution?
Sorry for being unclear, this is what I use right now to get what I want:
/// <summary>
/// For every sample, scans the entire series for the maximum Value in the
/// trailing 12-month window and returns the sample's Value divided by that
/// maximum, sorted by date. O(n^2) overall; parallelized per sample.
/// </summary>
public MyObject[] BruteForceBackward(MyObject[] testData)
{
    var normalized = testData.AsParallel().Select(point =>
    {
        var windowStart = point.Date.AddYears(-1);
        var windowMax = testData
            .Where(x => x.Date <= point.Date && x.Date >= windowStart)
            .Max(x => x.Value);
        return new MyObject() { Date = point.Date, Value = point.Value / windowMax };
    });
    return normalized.OrderBy(r => r.Date).ToArray();
}
This works but it is slow and eats processor resources (imagine, you have 100k objects), I believe there must be something better
I had a similar project where I had to calculate such statistics on tons of sensor data.
You can now find a little more refined version in my Github repository, which should be ready to use (.Net):
https://github.com/forReason/Statistics-Helper-Library
In general you want to reduce the amount of loops going over all your data. At best, you want to touch each element only one single time.
Process Array (equiv. of BruteForceBackwards)
// Single forward pass over the series: maintains a monotonically decreasing
// list of maximum candidates so each element is added and removed at most once
// (amortized O(n) overall). Returns each sample's Value divided by the maximum
// Value in its trailing 12-month window.
// NOTE(review): assumes testData is sorted by Date ascending — confirm at the caller.
public static MyObject[] FlowThroughForward(ref MyObject[] testData)
{
    // generate return array
    MyObject[] returnData = new MyObject[testData.Length];
    // keep track to minimize processing
    double currentMaximum = 0;
    List<MyObject> maximumValues = new List<MyObject>();
    // go through the elements
    for (int i = 0; i < testData.Length; i++)
    {
        // calculate the oldest date to keep in tracking list
        DateTime targetDate = testData[i].Date.AddYears(-1);
        // maximum logic
        if (testData[i].Value >= currentMaximum)
        {
            // new maximum found, clear tracking list
            // this is the best case scenario
            maximumValues.Clear();
            currentMaximum = testData[i].Value;
        }
        else
        {
            // unfortunately, no new maximum was found
            // go backwards the maximum tracking list and check for smaller values
            // clear the list of all smaller values. The list should therefore always
            // be in descending order
            for (int b = maximumValues.Count - 1; b >= 0; b--)
            {
                if (maximumValues[b].Value <= testData[i].Value)
                {
                    // a lower value has been found. We have a newer, higher value
                    // clear this waste value from the tracking list
                    maximumValues.RemoveAt(b);
                }
                else
                {
                    // there are no more lower values.
                    // stop looking for smaller values to save time
                    break;
                }
            }
        }
        // append new value to tracking list, no matter if higher or lower
        // all future values might be lower
        maximumValues.Add(testData[i]);
        // check if the oldest value is too old to be kept in the tracking list
        // (the just-added element is always in-window, so this loop terminates
        // before the list can become empty)
        while (maximumValues[0].Date < targetDate)
        {
            // oldest value is to be removed
            maximumValues.RemoveAt(0);
            // update maximum
            currentMaximum = maximumValues[0].Value;
        }
        // add object to result list
        returnData[i] = new MyObject() { Date = testData[i].Date, Value = testData[i].Value / currentMaximum }; ;
    }
    return returnData;
}
Real Time Data or Streamed Data
Note: If you have really large lists, you might get memory issues with your approach to pass a full array. In this case: pass one value at a time, pass them from oldest value to newest value. Store the values back one at a time.
This Function can also be used on real time data.
The test method is included in code.
// Drives the streaming variant: generates samples one at a time (with ~25%
// gaps, mirroring GetTestObjects) and feeds each into CalculateNextObject,
// timing the whole run.
// NOTE(review): a single MyObject instance is mutated and reused every
// iteration; this works only because CalculateNextObject returns a fresh object.
static void Main(string[] args)
{
    int length = 50000;
    Stopwatch stopWatch1 = new Stopwatch();
    stopWatch1.Start();
    var myObject = new MyObject();
    var result = new List<MyObject>();
    var date = new DateTime(2021, 1, 1, 0, 0, 0);
    for (int i = 0; i < length; i++)
    {
        //this is to simulate real data having gaps
        if (rnd.Next(100) < 25)
        {
            continue;
        }
        myObject.Value = rnd.NextDouble();
        myObject.Date = date.AddMinutes(15 * i);
        result.Add(CalculateNextObject(ref myObject));
    }
    stopWatch1.Stop();
    Console.WriteLine("test code executed in " + stopWatch1.ElapsedMilliseconds + " ms");
    // keep the console window open
    Thread.Sleep(1000000);
}
// Shared state for the streaming variant. Because these are static,
// CalculateNextObject is neither reentrant nor thread-safe.
private static Random rnd = new Random();
private static double currentMaximum = 0;
private static List<MyObject> maximumValues = new List<MyObject>();

// Streaming equivalent of FlowThroughForward: accepts one sample at a time
// (oldest to newest) and returns that sample normalized by the maximum Value
// seen over the trailing 12 months. Same monotonic-candidate-list algorithm.
public static MyObject CalculateNextObject(ref MyObject input)
{
    // calculate the oldest date to keep in tracking list
    DateTime targetDate = input.Date.AddYears(-1);
    // maximum logic
    if (input.Value >= currentMaximum)
    {
        // new maximum found, clear tracking list
        // this is the best case scenario
        maximumValues.Clear();
        currentMaximum = input.Value;
    }
    else
    {
        // unfortunately, no new maximum was found
        // go backwards the maximum tracking list and check for smaller values
        // clear the list of all smaller values. The list should therefore always
        // be in descending order
        for (int b = maximumValues.Count - 1; b >= 0; b--)
        {
            if (maximumValues[b].Value <= input.Value)
            {
                // a lower value has been found. We have a newer, higher value
                // clear this waste value from the tracking list
                maximumValues.RemoveAt(b);
            }
            else
            {
                // there are no more lower values.
                // stop looking for smaller values to save time
                break;
            }
        }
    }
    // append new value to tracking list, no matter if higher or lower
    // all future values might be lower
    maximumValues.Add(input);
    // check if the oldest value is too old to be kept in the tracking list
    while (maximumValues[0].Date < targetDate)
    {
        // oldest value is to be removed
        maximumValues.RemoveAt(0);
        // update maximum
        currentMaximum = maximumValues[0].Value;
    }
    // add object to result list
    MyObject returnData = new MyObject() { Date = input.Date, Value = input.Value / currentMaximum };
    return returnData;
}
Test Method
// Benchmark driver: times BruteForceBackward against FlowThroughForward on the
// same generated series, then spot-checks ten random indices to confirm both
// implementations agree.
static void Main(string[] args)
{
    MyObject[] testData = GetTestObjects();
    Stopwatch stopWatch1 = new Stopwatch();
    Stopwatch stopWatch2 = new Stopwatch();
    stopWatch1.Start();
    MyObject[] testresults1 = BruteForceBackward(testData);
    stopWatch1.Stop();
    Console.WriteLine("BruteForceBackward executed in " + stopWatch1.ElapsedMilliseconds + " ms");
    stopWatch2.Start();
    MyObject[] testresults2 = FlowThroughForward(ref testData);
    stopWatch2.Stop();
    Console.WriteLine("FlowThroughForward executed in " + stopWatch2.ElapsedMilliseconds + " ms");
    Console.WriteLine();
    Console.WriteLine("Comparing some random test results: ");
    var rnd = new Random();
    for (int i = 0; i < 10; i++)
    {
        int index = rnd.Next(0, testData.Length);
        Console.WriteLine("Index: " + index + " brute: " + testresults1[index].Value + " flow: " + testresults2[index].Value);
    }
    // keep the console window open
    Thread.Sleep(1000000);
}
Test result
Tests were performed on a machine with 32 cores, so in theory the multithreaded approach should be at an advantage, but you'll see ;)
Function
Function Time
time %
BruteForceBackward
5334 ms
99.9%
FlowThroughForward
5 ms
0.094%
Performance improvement factor: ~time/1000
console output with data validation:
BruteForceBackward executed in 5264 ms
FlowThroughForward executed in 5 ms
Comparing some random test results:
Index: 25291 brute: 0.989688139105413 flow: 0.989688139105413
Index: 11945 brute: 0.59670821976193 flow: 0.59670821976193
Index: 30282 brute: 0.413238225210297 flow: 0.413238225210297
Index: 33898 brute: 0.38258761939139 flow: 0.38258761939139
Index: 8824 brute: 0.833512217105447 flow: 0.833512217105447
Index: 22092 brute: 0.648052464067263 flow: 0.648052464067263
Index: 24633 brute: 0.35859417692481 flow: 0.35859417692481
Index: 24061 brute: 0.540642018793402 flow: 0.540642018793402
Index: 34219 brute: 0.498785766613022 flow: 0.498785766613022
Index: 2396 brute: 0.151471808392111 flow: 0.151471808392111
Cpu usage was a lot higher on Bruteforce backwards due to parallelisation.
The worst case scenario are long periods of decreasing values. The code can still be vastly optimized but I guess this should be sufficient. For further optimisation, one might look to reduce the list shuffles when removing/adding elements to maximumValues.
An interesting and challenging problem. I put together a solution using a dynamic programming approach (first learned back in CS algorithms class back in '78). First, a tree is constructed containing pre-calculated local max values over recursively defined ranges. Once constructed, the max value for an arbitrary range can be efficiently calculated mostly using the pre-calculated values. Only at the fringes of the range does the calculation drop down to the element level.
It is not as fast as julian bechtold's FlowThroughForward method, but random access to ranges may be a plus.
Code to add to Main:
Console.WriteLine();
Stopwatch stopWatch3 = new Stopwatch();
stopWatch3.Start();
MyObject[] testresults3 = RangeTreeCalculation(ref testData, 10);
stopWatch3.Stop();
Console.WriteLine($"RangeTreeCalculation executed in {stopWatch3.ElapsedMilliseconds} ms");
... test comparison
Console.WriteLine($"Index: {index} brute: {testresults1[index].Value} flow: {testresults2[index].Value} rangeTree: {testresults3[index].Value}");
Test function:
// Normalizes each sample by its trailing 12-month max using a pre-built tree of
// range maxima; the per-sample queries run in parallel against the shared tree.
public static MyObject[] RangeTreeCalculation(ref MyObject[] testDataArray, int partitionThreshold)
{
    // Materialize the data as a List<MyObject> that every tree node shares by
    // reference. (NOTE(review): arrays are reference types too, so this is for
    // the List-based RangeTree API rather than strictly required for sharing.)
    List<MyObject> testDataList = testDataArray.ToList();
    // Construct a tree containing recursive collections of pre-calculated values
    var rangeTree = new RangeTree(testDataList, partitionThreshold);
    MyObject[] result = new MyObject[testDataList.Count];
    // `i` is the long element index supplied by Parallel.ForEach.
    Parallel.ForEach(testDataList, (item, state, i) =>
    {
        var max = rangeTree.MaxForDateRange(item.Date.AddYears(-1), item.Date);
        result[i] = new MyObject() { Date = item.Date, Value = item.Value / max };
    });
    return result;
}
Supporting class:
// Class used to divide and conquer using dynamic programming.
public class RangeTree
{
public List<MyObject> Data; // This reference is shared by all members of the tree
public int Start { get; } // Index of first element covered by this node.
public int Count { get; } // Number of elements covered by this node.
public DateTime FirstDateTime { get; }
public DateTime LastDateTime { get; }
public double MaxValue { get; } // Pre-calculated max for all elements covered by this node.
List<RangeTree> ChildRanges { get; }
// Top level node constructor
public RangeTree(List<MyObject> data, int partitionThreshold)
: this(data, 0, data.Count, partitionThreshold)
{
}
// Child node constructor, which covers an recursively decreasing range of element.
public RangeTree(List<MyObject> data, int start, int count, int partitionThreshold)
{
Data = data;
Start = start;
Count = count;
FirstDateTime = Data[Start].Date;
LastDateTime = Data[Start + Count - 1].Date;
if (count <= partitionThreshold)
{
// If the range is smaller than the threshold, just calculate the local max
// directly from the items. No child ranges are defined.
MaxValue = Enumerable.Range(Start, Count).Select(i => Data[i].Value).Max();
}
else
{
// We still have a significant range. Decide how to further divide them up into sub-ranges.
// (There may be room for improvement here to better balance the tree.)
int partitionSize = (count - 1) / partitionThreshold + 1;
int partitionCount = (count - 1) / partitionSize + 1;
if (count < partitionThreshold * partitionThreshold)
{
// When one away from leaf nodes, prefer fewer full leaf nodes over more
// less populated leaf nodes.
partitionCount = (count - 1) / partitionThreshold + 1;
partitionSize = (count - 1) / partitionCount + 1;
}
ChildRanges = Enumerable.Range(0, partitionCount)
.Select(partitionNum => new {
ChildStart = Start + partitionNum * partitionSize,
ChildCount = Math.Min(partitionSize, Count - partitionNum * partitionSize)
})
.Where(part => part.ChildCount > 0) // Defensive
.Select(part => new RangeTree(Data, part.ChildStart, part.ChildCount, partitionThreshold))
.ToList();
// Now is the dynamic programming part:
// Calculate the local max as the max of all child max values.
MaxValue = ChildRanges.Max(chile => chile.MaxValue);
}
}
// Get the max value for a given range of dates withing this rangeTree node.
// This used the precalculated values as much as possible.
// Only at the fringes of the date range to we calculate at the element level.
public double MaxForDateRange(DateTime fromDate, DateTime thruDate)
{
double calculatedMax = Double.MinValue;
if (fromDate > this.LastDateTime || thruDate < this.FirstDateTime)
{
// Entire range is excluded. Nothing of interest here folks.
calculatedMax = Double.MinValue;
}
else if (fromDate <= this.FirstDateTime && thruDate >= this.LastDateTime)
{
// Entire range is included. Use the already-calculated max.
calculatedMax = this.MaxValue;
}
else if (ChildRanges != null)
{
// We have child ranges. Recurse and accumulate.
// Possible optimization: Calculate max for middle ranges first, and only bother
// with extreme partial ranges if their local max values exceed the preliminary result.
for (int i = 0; i < ChildRanges.Count; ++i)
{
double childMax = ChildRanges[i].MaxForDateRange(fromDate, thruDate);
if (childMax > calculatedMax)
{
calculatedMax = childMax;
}
}
}
else
{
// Leaf range. Loop through just this limited range of notes, checking individually for
// date in range and accumulating the result.
for (int i = 0; i < this.Count; ++i)
{
var element = Data[this.Start + i];
if (fromDate <= element.Date && element.Date <= thruDate && element.Value > calculatedMax)
{
calculatedMax = element.Value;
}
}
}
return calculatedMax;
}
}
There's plenty of room for improvement, such as parameterizing the types and generalizing the functionality to support more than just Max(Value), but the framework is there.
Assuming you meant you need the maximum Value for each of the last 12 months from result, then you can use LINQ:
var beginDateTime = DateTime.Now.AddMonths(-12);
var ans = result.Where(r => r.Date >= beginDateTime).GroupBy(r => r.Date.Month).Select(mg => mg.MaxBy(r => r.Value)).ToList();
Running some timing, I get that putting AsParallel after result changes the run time from around 16ms (first run) to around 32ms, so it is actually slower. It is about the same after the Where and about 23ms after the GroupBy (processing the 12 groups in parallel). On my PC at least, there isn't enough data or complex operations for parallelism, but the GroupBy isn't the most efficient.
Using an array and testing each element, I get the results in about 1.2ms:
var maxMOs = new MyObject[12];
foreach (var r in result.Where(r => r.Date >= beginDateTime)) {
var monthIndex = r.Date.Month-1;
if (maxMOs[monthIndex] == null || r.Value > maxMOs[monthIndex].Value)
maxMOs[monthIndex] = r;
}
Note that the results are not chronological; you could offset monthIndex by today's month to order the results if desired.
var maxMOs = new MyObject[12];
var offset = DateTime.Now.Month-11;
foreach (var r in result.Where(r => r.Date >= beginDateTime)) {
var monthIndex = r.Date.Month-offset;
if (maxMOs[monthIndex] == null || r.Value > maxMOs[monthIndex].Value)
maxMOs[monthIndex] = r;
}
A micro-optimization (mostly useful on repeat runnings) is to invert the test and use the null-propagating operator:
if (!(r.Value <= maxMOs[monthIndex]?.Value))
This saves about 0.2ms on the first run but up to 0.5ms on subsequent runs.
Here is a solution similar to julian bechtold's answer. Difference is that the maximum (and all related variables) are kept hidden away from the main implementation, in a separate class whose purpose is solely to keep track of the maximum over the past year. Algorithm is the same, I just use a few Linq expressions here and there.
We keep track of the maximum in the following class:
/// <summary>
/// Tracks the maximum Value over a sliding one-year window of MyObject samples.
/// Samples must be added in non-decreasing Date order. Internally keeps a
/// monotonically decreasing list of maximum candidates, so each sample is added
/// and removed at most once (amortized O(1) per <see cref="Add"/>).
/// </summary>
public class MaxSlidingWindow
{
    private readonly List<MyObject> _maximumValues;
    private double _max;

    public MaxSlidingWindow()
    {
        _maximumValues = new List<MyObject>();
        _max = double.NegativeInfinity;
    }

    /// <summary>Current maximum over the past year of samples added so far.</summary>
    public double Max => _max;

    /// <summary>Adds the next (newer) sample and updates <see cref="Max"/>.</summary>
    public void Add(MyObject myObject)
    {
        if (myObject.Value >= _max)
        {
            // New overall maximum: every older candidate is now irrelevant.
            _maximumValues.Clear();
            _max = myObject.Value;
        }
        else
        {
            // Drop candidates the new sample supersedes (older and not larger).
            RemoveValuesSmallerThan(myObject.Value);
        }
        _maximumValues.Add(myObject);
        RemoveObservationsBefore(myObject.Date.AddYears(-1));
        // The list is never empty here: the just-added sample is always in-window.
        _max = _maximumValues[0].Value;
    }

    // Removes candidates older than targetDate from the front of the list.
    private void RemoveObservationsBefore(DateTime targetDate)
    {
        var toRemoveFromFront = 0;
        // BUG FIX: the original condition read the element before checking bounds
        // and referenced `maximumValues3`, an undefined identifier. Bounds-check
        // first against _maximumValues.
        while (toRemoveFromFront <= _maximumValues.Count - 1 && _maximumValues[toRemoveFromFront].Date < targetDate)
        {
            toRemoveFromFront++;
        }
        _maximumValues.RemoveRange(0, toRemoveFromFront);
    }

    // Removes candidates with Value <= targetValue from the back of the list,
    // preserving the descending order invariant.
    private void RemoveValuesSmallerThan(double targetValue)
    {
        var maxEntry = _maximumValues.Count - 1;
        var toRemoveFromBack = 0;
        while (toRemoveFromBack <= maxEntry && _maximumValues[maxEntry - toRemoveFromBack].Value <= targetValue)
        {
            toRemoveFromBack++;
        }
        _maximumValues.RemoveRange(maxEntry - toRemoveFromBack + 1, toRemoveFromBack);
    }
}
It can be used as follows:
/// <summary>
/// Generates test data at 15-minute spacing (with random gaps) and
/// normalizes each sample by the trailing-year maximum.
/// </summary>
public static MyObject[] GetTestObjects_MaxSlidingWindow()
{
    var rnd = new Random();
    var firstStamp = new DateTime(2021, 1, 1, 0, 0, 0);
    var normalized = new List<MyObject>();
    var window = new MaxSlidingWindow();

    for (int i = 0; i < 50000; i++)
    {
        // Skip roughly a quarter of the slots to simulate gaps in real data.
        if (rnd.Next(100) < 25) continue;

        var sample = new MyObject()
        {
            Value = rnd.NextDouble(),
            Date = firstStamp.AddMinutes(15 * i)
        };
        window.Add(sample);

        // Divide by the current sliding-window maximum.
        normalized.Add(new MyObject { Date = sample.Date, Value = sample.Value / window.Max });
    }
    return normalized.ToArray();
}
See the relative timings below - above solution is slightly faster (timed over 10 million runs), but barely noticeable:
Relative timings

Multiple thread accessing and editing the same double array

I need to iterate through every double in an array to do the "Laplacian Smoothing", "mixing values" with neighbour doubles.
I'll keep stored values in a temp clone array update the original at the end.
Pseudo code:
// NOTE(review): pseudo code quoted from the question — it does not compile
// as written: `A.Clone as double[]` is missing the call parentheses (it
// would need to be `A.Clone() as double[]`), and the i±1 accesses run out
// of bounds at the array edges, as the author acknowledges below.
double[] A = new double[1000];
// Filling A with values...
double[] B = A.Clone as double[];
for(int loops=0;loops<10;loops++){ // start of the loop
for(int i=0;i<1000;i++){ // iterating through all doubles in the array
// Parallel.For(0, 1000, (i) => {
double v= A[i];
B[i]-=v;
B[i+1]+=v/2;
B[i-1]+=v/2;
// here i'm going out of array bounds, i know. Pseudo code, not relevant.
}
// });
}
A = B.Clone as double[];
With for it works correctly. "Smoothing" the values in the array.
With Parallel.For() I have some access sync problems: threads are colliding and some values are actually not stored correctly. Threads access and edit the array at the same index many times.
(I haven't tested this in a linear array, i'm actually working on a multidimensional array[x,y,z] ..)
How can I solve this?
I was thinking of making a separate array for each thread and doing the sum later... but I would need to know the thread index, and I haven't found that anywhere on the web. (I'm still interested in whether a "thread index" exists, even with a totally different solution...)
I'll accept any solution.
You probably need one of the more advanced overloads of the Parallel.For method:
public static ParallelLoopResult For<TLocal>(int fromInclusive, int toExclusive,
ParallelOptions parallelOptions, Func<TLocal> localInit,
Func<int, ParallelLoopState, TLocal, TLocal> body,
Action<TLocal> localFinally);
Executes a for loop with thread-local data in which iterations may run in parallel, loop options can be configured, and the state of the loop can be monitored and manipulated.
This looks quite intimidating with all the various lambdas it expects. The idea is to have each thread work with local data, and finally merge the data
at the end. Here is how you could use this method to solve your problem:
// Thread-local accumulation pattern: each worker thread gets its own
// scratch array (localInit), adds its contributions there without any
// contention (body), and merges the scratch into the shared B under a
// lock exactly once when the thread finishes (localFinally).
double[] A = new double[1000];
double[] B = (double[])A.Clone();
object locker = new object();
var parallelOptions = new ParallelOptions()
{
MaxDegreeOfParallelism = Environment.ProcessorCount
};
Parallel.For(0, A.Length, parallelOptions,
localInit: () => new double[A.Length], // create temp array per thread
body: (i, state, temp) =>
{
double v = A[i];
temp[i] -= v;
// NOTE(review): i == 0 and i == A.Length - 1 still index out of bounds,
// mirroring the pseudo code in the question above.
temp[i + 1] += v / 2;
temp[i - 1] += v / 2;
return temp; // return a reference to the same temp array
}, localFinally: (localB) =>
{
// Can be called in parallel with other threads, so we need to lock
lock (locker)
{
for (int i = 0; i < localB.Length; i++)
{
B[i] += localB[i];
}
}
});
I should mention that the workload of the above example is too granular, so I wouldn't expect large improvements in performance from the parallelization. Hopefully your actual workload is more chunky. If for example you have two nested loops, parallelizing only the outer loop will work greatly because the inner loop will provide the much needed chunkiness.
Alternative solution: Instead of creating auxiliary arrays per thread, you could just update directly the B array, and use locks only when processing an index in the dangerous zone near the boundaries of the partitions:
// Update B directly, taking the shared lock only for indexes close enough
// to a partition boundary to touch another partition's cells.
// FIX: each iteration writes B[i - 1], B[i] and B[i + 1], so an unlocked
// iteration must stay at least TWO cells away from each boundary. The
// original one-cell zone let an unlocked i == range.Item1 + 1 write
// B[range.Item1] concurrently with the previous partition's (locked) last
// iteration writing the same cell — an unsynchronized data race.
Parallel.ForEach(Partitioner.Create(0, A.Length), parallelOptions, range =>
{
    bool lockTaken = false;
    try
    {
        for (int i = range.Item1; i < range.Item2; i++)
        {
            // Two-cell danger zone at each end of the partition.
            bool shouldLock = i < range.Item1 + 2 || i >= range.Item2 - 2;
            if (shouldLock) Monitor.Enter(locker, ref lockTaken);
            double v = A[i];
            B[i] -= v;
            B[i + 1] += v / 2;
            B[i - 1] += v / 2;
            if (shouldLock) { Monitor.Exit(locker); lockTaken = false; }
        }
    }
    finally
    {
        // Releases the lock if an exception escaped between Enter and Exit.
        if (lockTaken) Monitor.Exit(locker);
    }
});
Ok, it appears that modulus can solve pretty much all my problems.
Here a really simplified version of the working code:
(the big script is 3d and unfinished... )
// Script entry point (looks like a Grasshopper-style RunScript — confirm).
// While Go is true it runs 100 smoothing iterations per solution and asks
// the component to recompute itself, which keeps the loop going; otherwise
// it resets the field A to two impulse spikes.
private void RunScript(bool Go, ref object Results)
{
if(Go){
LaplacianSmooth(100);
// Needed to restart "RunScript" over and over
this.Component.ExpireSolution(true);
}
else{
// Reset state: zeroed array with two large point impulses.
A = new double[count];
A[100] = 10000;
A[500] = 10000;
}
Results = A;
}
// <Custom additional code>
public static int T = Environment.ProcessorCount; // columns in B: one accumulator column per logical processor
public static int count = 1000; // number of cells being smoothed
public double[] A = new double[count]; // main value array
public double[,] B = new double[count, T]; // per-column scratch sums, folded back into A each loop
/// <summary>
/// Runs <paramref name="loops"/> iterations of Laplacian smoothing over the
/// field A, scattering per-index contributions into the columns of B so
/// parallel iterations never write the same cell, then folding B back into A.
/// </summary>
public void LaplacianSmooth(int loops){
    for(int loop = 0;loop < loops;loop++){
        B = new double[count, T];
        // Copying values to first column of temp multidimensional-array
        Parallel.For(0, count, new ParallelOptions { MaxDegreeOfParallelism = T }, i => {
            B[i, 0] = A[i];
        });
        // Applying Laplacian smoothing: each index i writes rows i-1, i, i+1
        // of column (i % T), so iterations less than 3 apart land in
        // different columns and never collide.
        Parallel.For(0, count, new ParallelOptions { MaxDegreeOfParallelism = T }, i => {
            // FIX: the column index must be taken modulo T, the actual number
            // of columns allocated in B. The original "i % 16" threw
            // IndexOutOfRangeException on machines with fewer than 16 logical
            // processors and only worked by accident on 16-thread CPUs.
            // NOTE(review): with T <= 2 iterations two apart could still share
            // a column — confirm if such machines must be supported.
            int t = i % T;
            // Wrapped next and previous element indexes
            int n = (i + 1) % count;
            int p = (i + count - 1) % count;
            double v = A[i] * 0.5;
            B[i, t] -= v;
            B[p, t] += v / 2;
            B[n, t] += v / 2;
        });
        // Copying values back to main array: sum all T columns per row.
        Parallel.For(0, count, new ParallelOptions { MaxDegreeOfParallelism = T }, i => {
            double val = 0;
            for(int t = 0;t < T;t++){
                val += B[i, t];
            }
            A[i] = val;
        });
    }
}
There are no "collisions" with the threads, as confirmed by the result of "Mass Addition" (a sum) that is constant at 20000.
Thanks everyone for the tips!

Aggregation of parallel for does not capture all iterations

I have code that works great using a simple For loop, but I'm trying to speed it up. I'm trying to adapt the code to use multiple cores and landed on Parallel For.
At a high level, I'm collecting the results from CalcRoutine for several thousand accounts and storing the results in an array with 6 elements. I'm then re-running this process 1,000 times. The order of the elements within each 6 element array is important, but the order for the final 1,000 iterations of these 6 element arrays is not important. When I run the code using a For loop, I get a 6,000 element long list. However, when I try the Parallel For version, I'm getting something closer to 600. I've confirmed that the line "return localResults" gets called 1,000 times, but for some reason not all 6 element arrays get added to the list TotalResults. Any insight as to why this isn't working would be greatly appreciated.
object locker = new object();
// NOTE(review): this is the broken version being asked about. The body
// delegate ignores the incoming thread-local state `localResults` (the
// value returned by the previous iteration on the same task) and simply
// overwrites it, so localFinally only ever receives each task's LAST
// 6-element result — hence ~600 instead of 6,000 values in TotalResults.
Parallel.For(0, iScenarios, () => new double[6], (int k, ParallelLoopState state, double[] localResults) =>
{
List<double> CalcResults = new List<double>();
for (int n = iStart; n < iEnd; n++)
{
CalcResults.AddRange(CalcRoutine(n, k));
}
// Overwrites (rather than extends) the per-task state — the bug.
localResults = this.SumOfResults(CalcResults);
return localResults;
},
(double[] localResults) =>
{
// localFinally runs once per task, not once per iteration.
lock (locker)
{
TotalResults.AddRange(localResults);
}
});
EDIT: Here's the "non parallel" version:
// Sequential reference version: scenario k produces one 6-element summary,
// so TotalResults ends up with 6 * iScenarios entries in scenario order.
for (int k = 0; k < iScenarios; k++)
{
CalcResults.Clear();
for (int n = iStart; n < iEnd; n++)
{
CalcResults.AddRange(CalcRoutine(n, k));
}
TotalResults.AddRange(SumOfResults(CalcResults));
}
The output for 1 scenario is a list of 6 doubles, 2 scenarios is a list of 12 doubles, ... n scenarios 6n doubles.
Also per one of the questions, I checked the number of times "TotalResults.AddRange..." gets called, and it's not the full 1,000 times. Why wouldn't this be called each time? With the lock, shouldn't each thread wait for this section to become available?
Check the documentation for Parallel.For
These initial states are passed to the first body invocations on each task. Then, every subsequent body invocation returns a possibly modified state value that is passed to the next body invocation. Finally, the last body invocation on each task returns a state value that is passed to the localFinally delegate
But your body delegate is ignoring the incoming value of localResults which the previous iteration within this task returned. Having the loop state being an array makes it tricky to write a correct version. This will work but looks messy:
//EDIT - Create an array of length 0 here V for input to first iteration
//EDIT - Create an array of length 0 here V for input to first iteration
// Corrected version: the per-task state starts as an empty array and each
// body invocation APPENDS its 6 results to the state it received, so the
// final localFinally flush contains every iteration's results rather than
// only the last one.
Parallel.For(0, iScenarios, () => new double[0],
(int k, ParallelLoopState state, double[] localResults) =>
{
List<double> CalcResults = new List<double>();
for (int n = iStart; n < iEnd; n++)
{
CalcResults.AddRange(CalcRoutine(n, k));
}
// Concatenate onto the incoming per-task state instead of overwriting it.
localResults = localResults.Concat(
this.SumOfResults(CalcResults)
).ToArray();
return localResults;
},
(double[] localResults) =>
{
lock (locker)
{
TotalResults.AddRange(localResults);
}
});
(Assuming Linq's enumerable extensions are in scope, for Concat)
I'd suggest using a different data structure (e.g. a List<double> rather than double[]) for the state that more naturally allows more elements to be added to it - but that would mean changing SumOfResults that you've not shown. Or just keep it all a bit more abstract:
// Same fix with IEnumerable<double> as the state type: each body invocation
// lazily chains its results onto the state from the previous iteration, and
// localFinally materializes the whole chain into TotalResults under a lock.
Parallel.For(0, iScenarios, Enumerable.Empty<double>(),
(int k, ParallelLoopState state, IEnumerable<double> localResults) =>
{
List<double> CalcResults = new List<double>();
for (int n = iStart; n < iEnd; n++)
{
CalcResults.AddRange(CalcRoutine(n, k));
}
// Chain this iteration's results onto the incoming per-task state.
return localResults.Concat(this.SumOfResults(CalcResults));
},
(IEnumerable<double> localResults) =>
{
lock (locker)
{
TotalResults.AddRange(localResults);
}
});
(If it had worked the way you seem to have assumed, why would they have you provide two separate delegates, if all it did, on the return from body, was to immediately invoke localFinally with the return value?)
Try this:
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
class Program
{
    static void Main(string[] args)
    {
        var iScenarios = 6;
        var iStart = 0;
        var iEnd = 1000;
        var totalResults = new List<double>();

        // Each scenario is independent; only the final AddRange needs
        // synchronization, done by locking the shared result list.
        Parallel.For(0, iScenarios, scenario =>
        {
            var perScenario = new List<double>();
            for (int n = iStart; n < iEnd; n++)
            {
                perScenario.AddRange(CalcRoutine(n, scenario));
            }
            lock (totalResults)
            {
                totalResults.AddRange(perScenario);
            }
        });
    }

    // Stand-in for the real per-account calculation.
    static IEnumerable<double> CalcRoutine(int a, int b)
    {
        yield return 0;
    }

    // Stand-in aggregation helper.
    static double[] SumOfResults(IEnumerable<double> source)
    {
        return source.ToArray();
    }
}

LINQ to calculate a moving average of a SortedList<dateTime,double>

I have a time series in the form of a SortedList<dateTime,double>. I would like to calculate a moving average of this series. I can do this using simple for loops. I was wondering if there is a better way to do this using linq.
my version:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
namespace ConsoleApplication1
{
class Program
{
    static void Main(string[] args)
    {
        // Small daily series to smooth with a 3-sample moving average.
        var mySeries = new SortedList<DateTime, double>
        {
            { new DateTime(2011, 01, 1), 10 },
            { new DateTime(2011, 01, 2), 25 },
            { new DateTime(2011, 01, 3), 30 },
            { new DateTime(2011, 01, 4), 45 },
            { new DateTime(2011, 01, 5), 50 },
            { new DateTime(2011, 01, 6), 65 },
        };

        var calcs = new calculations();
        foreach (var item in calcs.MovingAverage(mySeries, 3))
        {
            Console.WriteLine("{0} {1}", item.Key, item.Value);
        }
    }
}
class calculations
{
    /// <summary>
    /// Computes the simple moving average of <paramref name="series"/> over
    /// <paramref name="period"/> samples. The result starts at the first date
    /// with a complete window, so it contains Count - period + 1 entries.
    /// </summary>
    public SortedList<DateTime, double> MovingAverage(SortedList<DateTime, double> series, int period)
    {
        var result = new SortedList<DateTime, double>();
        // Keep a running window total instead of re-summing `period` values
        // per step: O(n) overall instead of O(n * period).
        double total = 0;
        for (int i = 0; i < series.Count; i++) // Count property, not LINQ Count()
        {
            total += series.Values[i];
            if (i >= period)
            {
                // Drop the value that slid out of the window.
                total -= series.Values[i - period];
            }
            if (i >= period - 1)
            {
                result.Add(series.Keys[i], total / period);
            }
        }
        return result;
    }
}
}
In order to achieve an asymptotical performance of O(n) (as the hand-coded solution does), you could use the Aggregate function like in
// Running-window aggregation: Result accumulates the averages while Working
// holds the most recent (period - 1) values between steps, giving O(n)
// overall. FIX: the original snippet was missing the `new` keyword in front
// of List<double>(...), which does not compile.
series.Skip(period-1).Aggregate(
    new {
        Result = new SortedList<DateTime, double>(),
        Working = new List<double>(series.Take(period-1).Select(item => item.Value))
    },
    (list, item)=>{
        list.Working.Add(item.Value);
        list.Result.Add(item.Key, list.Working.Average());
        list.Working.RemoveAt(0);
        return list;
    }
).Result;
The accumulated value (implemented as anonymous type) contains two fields: Result contains the result list build up so far. Working contains the last period-1 elements. The aggregate function adds the current value to the Working list, builds the current average and adds it to the result and then removes the first (i.e. oldest) value from the working list.
The "seed" (i.e. the starting value for the accumulation) is build by putting the first period-1 elements into Working and initializing Result to an empty list.
Consequently, the aggregation starts with element period (by skipping (period-1) elements at the beginning)
In functional programming this is a typical usage pattern for the aggregate (or fold) function, btw.
Two remarks:
The solution is not "functionally" clean in that the same list objects (Working and Result) are reused in every step. I'm not sure if that might cause problems if some future compilers try to parallellize the Aggregate function automatically (on the other hand I'm also not sure, if that's possible after all...). A purely functional solution should "create" new lists at every step.
Also note that C# lacks powerful list expressions. In some hypothetical Python-C#-mixed pseudocode one could write the aggregation function like
(list, item)=>
new {
Result = list.Result + [(item.Key, (list.Working+[item.Value]).Average())],
Working=list.Working[1::]+[item.Value]
}
which would be a bit more elegant in my humble opinion :)
For the most efficient way possible to compute a Moving Average with LINQ, you shouldn't use LINQ!
Instead I propose creating a helper class which computes a moving average in the most efficient way possible (using a circular buffer and causal moving average filter), then an extension method to make it accessible to LINQ.
First up, the moving average
/// <summary>
/// Fast causal simple-moving-average filter over a fixed-length circular
/// buffer: one add, one subtract and one multiply per pushed sample.
/// <see cref="Current"/> is double.NaN until <see cref="Length"/> samples
/// have been pushed.
/// </summary>
public class MovingAverage
{
    private readonly int _length;
    private int _circIndex = -1;          // slot of the most recently pushed sample
    private bool _filled;                 // true once the buffer holds _length samples
    private double _current = double.NaN; // latest filter output
    private readonly double _oneOverLength;
    private readonly double[] _circularBuffer;
    private double _total;                // running sum of the buffer contents

    /// <param name="length">Window size in samples; must be at least 1.</param>
    public MovingAverage(int length)
    {
        // FIX: length <= 0 previously produced a filter that crashed (or
        // returned Infinity) on the first Push; fail fast instead.
        if (length < 1)
            throw new System.ArgumentOutOfRangeException(nameof(length));
        _length = length;
        _oneOverLength = 1.0 / length;
        _circularBuffer = new double[length];
    }

    /// <summary>
    /// Replaces the most recently pushed sample with <paramref name="value"/>
    /// and recomputes the average. Returns this instance for chaining.
    /// </summary>
    public MovingAverage Update(double value)
    {
        // FIX: the original indexed _circularBuffer[-1] when Update was
        // called before any Push; fail fast with a meaningful exception.
        if (_circIndex < 0)
            throw new System.InvalidOperationException("Update called before any sample was pushed.");

        double lostValue = _circularBuffer[_circIndex];
        _circularBuffer[_circIndex] = value;

        // Maintain totals for Push function
        _total += value;
        _total -= lostValue;

        // If not yet filled, just return. Current value should be double.NaN
        if (!_filled)
        {
            _current = double.NaN;
            return this;
        }

        // Recompute from the buffer (O(length)) rather than reuse _total,
        // matching the original behavior.
        double average = 0.0;
        for (int i = 0; i < _circularBuffer.Length; i++)
        {
            average += _circularBuffer[i];
        }
        _current = average * _oneOverLength;
        return this;
    }

    /// <summary>
    /// Appends a sample to the window in O(1). Returns this instance for chaining.
    /// </summary>
    public MovingAverage Push(double value)
    {
        // Apply the circular buffer
        if (++_circIndex == _length)
        {
            _circIndex = 0;
        }
        double lostValue = _circularBuffer[_circIndex];
        _circularBuffer[_circIndex] = value;

        // One add and one subtract instead of summing the whole window.
        _total += value;
        _total -= lostValue;

        // If not yet filled, just return. Current value should be double.NaN
        if (!_filled && _circIndex != _length - 1)
        {
            _current = double.NaN;
            return this;
        }
        else
        {
            // Set a flag to indicate this is the first time the buffer has been filled
            _filled = true;
        }
        _current = _total * _oneOverLength;
        return this;
    }

    /// <summary>Window size in samples.</summary>
    public int Length { get { return _length; } }

    /// <summary>Latest output; NaN during the warm-up period.</summary>
    public double Current { get { return _current; } }
}
This class provides a very fast and lightweight implementation of a MovingAverage filter. It creates a circular buffer of Length N and computes one add, one subtract and one multiply per data-point appended, as opposed to the N multiply-adds per point for the brute force implementation.
Next, to LINQ-ify it!
internal static class MovingAverageExtensions
{
    /// <summary>
    /// Streams the moving average of the values obtained by applying
    /// <paramref name="selector"/> to each element of the sequence.
    /// </summary>
    public static IEnumerable<double> MovingAverage<T>(this IEnumerable<T> inputStream, Func<T, double> selector, int period)
    {
        var filter = new MovingAverage(period);
        foreach (var element in inputStream)
        {
            filter.Push(selector(element));
            yield return filter.Current;
        }
    }

    /// <summary>Streams the moving average of a sequence of doubles.</summary>
    public static IEnumerable<double> MovingAverage(this IEnumerable<double> inputStream, int period)
    {
        // Delegates to the selector overload with the identity projection.
        return inputStream.MovingAverage(x => x, period);
    }
}
The above extension methods wrap the MovingAverage class and allow insertion into an IEnumerable stream.
Now to use it!
// NOTE(review): illustrative only — inputDoubles/inputPoints are declared
// but never assigned, so this fragment will not compile as-is.
int period = 50;
// Simply filtering a list of doubles
IEnumerable<double> inputDoubles;
IEnumerable<double> outputDoubles = inputDoubles.MovingAverage(period);
// Or, use a selector to filter T into a list of doubles
IEnumerable<Point> inputPoints; // assuming you have initialised this
IEnumerable<double> smoothedYValues = inputPoints.MovingAverage(pt => pt.Y, period);
You already have an answer showing you how you can use LINQ but frankly I wouldn't use LINQ here as it will most likely perform poorly compared to your current solution and your existing code already is clear.
However instead of calculating the total of the previous period elements on every step, you can keep a running total and adjust it on each iteration. That is, change this:
total = 0;
for (int x = i; x > (i - period); x--)
total += series.Values[x];
to this:
if (i >= period) {
total -= series.Values[i - period];
}
total += series.Values[i];
This will mean that your code will take the same amount of time to execute regardless of the size of period.
This block
double total = 0;
for (int x = i; x > (i - period); x--)
total += series.Values[x];
double average = total / period;
can be rewritten as:
double average = series.Values.Skip(i - period + 1).Take(period).Sum() / period;
Your method may look like:
// Pairs each window-ending key with the average of the `period` values that
// end at it. FIX: an anonymous-type member built from an arbitrary
// expression must be named explicitly ("Name = expr"); the original
// unnamed member was an invalid anonymous type member declarator and did
// not compile.
series.Skip(period - 1)
      .Select((item, index) =>
          new
          {
              item.Key,
              Average = series.Values.Skip(index).Take(period).Sum() / period
          });
As you can see, linq is very expressive. I recommend to start with some tutorial like Introducing LINQ and 101 LINQ Samples.
To do this in a more functional way, you'd need a Scan method which exists in Rx but not in LINQ.
Let's look how it would look like if we'd have a scan method
var delta = 3;
var series = new [] {1.1, 2.5, 3.8, 4.8, 5.9, 6.1, 7.6};
// Seed: plain average of the first window of `delta` values.
var seed = series.Take(delta).Average();
// Pair each incoming value (Item1) with the value leaving the window
// (Item2), then roll the average forward: sma - old/delta + new/delta.
var smas = series
.Skip(delta)
.Zip(series, Tuple.Create)
.Scan(seed, (sma, values)=>sma - (values.Item2/delta) + (values.Item1/delta));
// Pad the warm-up region with zeroes and prepend the seed itself.
smas = Enumerable.Repeat(0.0, delta-1).Concat(new[]{seed}).Concat(smas);
And here's the scan method, taken and adjusted from here:
/// <summary>
/// Like Aggregate, but yields every intermediate accumulator value (a
/// running fold, as found in Rx). The seed itself is not yielded.
/// </summary>
/// <exception cref="ArgumentNullException">An argument is null.</exception>
/// <exception cref="InvalidOperationException">The source is empty (thrown on enumeration).</exception>
public static IEnumerable<TAccumulate> Scan<TSource, TAccumulate>(
this IEnumerable<TSource> source,
TAccumulate seed,
Func<TAccumulate, TSource, TAccumulate> accumulator
)
{
    // FIX: argument checks inside an iterator body are deferred until the
    // first enumeration; validate eagerly and defer only the iteration
    // (same split-validation pattern as the MovingAverage extension below).
    if (source == null) throw new ArgumentNullException(nameof(source));
    if (seed == null) throw new ArgumentNullException(nameof(seed));
    if (accumulator == null) throw new ArgumentNullException(nameof(accumulator));
    return Core();

    IEnumerable<TAccumulate> Core()
    {
        using (var i = source.GetEnumerator())
        {
            if (!i.MoveNext())
            {
                throw new InvalidOperationException("Sequence contains no elements");
            }
            var acc = accumulator(seed, i.Current);
            // Yield one step behind MoveNext so the final accumulated value
            // is emitted after the source is exhausted.
            while (i.MoveNext())
            {
                yield return acc;
                acc = accumulator(acc, i.Current);
            }
            yield return acc;
        }
    }
}
This should have better performance than the brute force method since we are using a running total to calculate the SMA.
What's going on here?
To start we need to calculate the first period which we call seed here. Then, every subsequent value we calculate from the accumulated seed value. To do that we need the old value (that is t-delta) and the newest value for which we zip together the series, once from the beginning and once shifted by the delta.
At the end we do some cleanup by adding zeroes for the length of the first period and adding the initial seed value.
Another option is to use MoreLINQ's Windowed method, which simplifies the code significantly:
var averaged = mySeries.Windowed(period).Select(window => window.Average(keyValuePair => keyValuePair.Value));
I use this code to calculate SMA:
// Fills `buffer` with the running SMA of `values`.
// NOTE(review): the window length is taken from the input array's size, as
// the original comment states — confirm that is really the intended period.
private void calculateSimpleMA(decimal[] values, out decimal[] buffer)
{
    int period = values.Length;      // array Length instead of LINQ Count() (CA1829)
    buffer = new decimal[period];    // one output slot per input sample
    var sma = SMA(period);           // stateful SMA closure
    for (int i = 0; i < period; i++)
        buffer[i] = sma(values[i]);  // fills buffer with SMA calculation
}
// Returns a stateful closure that yields the simple moving average of the
// last (up to) p values it has been fed. Maintains a running decimal sum so
// each call is O(1) instead of re-averaging the queue (decimal arithmetic
// is exact, so the results are identical to Queue.Average()).
static Func<decimal, decimal> SMA(int p)
{
    Queue<decimal> window = new Queue<decimal>(p);
    decimal sum = 0m;
    return (x) =>
    {
        if (window.Count >= p)
        {
            // Window full: the oldest value slides out of the sum.
            sum -= window.Dequeue();
        }
        window.Enqueue(x);
        sum += x;
        return sum / window.Count;
    };
}
Here is an extension method:
/// <summary>
/// Streams the simple moving average of <paramref name="source"/> over
/// <paramref name="period"/> samples, emitting nothing until a full window
/// has been seen. Arguments are validated eagerly; iteration is deferred.
/// </summary>
public static IEnumerable<double> MovingAverage(this IEnumerable<double> source, int period)
{
    if (source is null)
    {
        throw new ArgumentNullException(nameof(source));
    }
    if (period < 1)
    {
        throw new ArgumentOutOfRangeException(nameof(period));
    }
    return Core();

    IEnumerable<double> Core()
    {
        var runningSum = 0.0;
        var window = new double[period];
        var seen = 0;
        foreach (var sample in source)
        {
            seen++;
            runningSum += sample;
            var slot = seen % period;
            if (seen >= period)
            {
                // Drop the sample falling out of the window, then emit.
                runningSum -= window[slot];
                yield return runningSum / period;
            }
            window[slot] = sample;
        }
    }
}

Categories