Related
I'm trying to use the Parallel library for my code and I'm facing a strange issue.
I made a short program to demonstrate the behavior. In short, I make 2 loops (one inside another). The first loop generates a random array of 200 integers and the second loop adds all the arrays in a big list.
The issue is, in the end, I don't get a multiple of 200 integers, instead I see some runs doesn't wait for the random array to fully be loaded.
It's difficult to explain so here the sample code:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
namespace TestParallel
{
class Program
{
static int RecommendedDegreesOfParallelism = 8;
static int DefaultMaxPageSize = 200;
static void Main(string[] args)
{
int maxPage = 50;
List<int> lstData = new List<int>();
Parallel.For(0, RecommendedDegreesOfParallelism, new ParallelOptions() { MaxDegreeOfParallelism = RecommendedDegreesOfParallelism },
(index) =>
{
int cptItems = 0;
int cptPage = 1 - RecommendedDegreesOfParallelism + index;
int idx = index;
do
{
cptPage += RecommendedDegreesOfParallelism;
if (cptPage > maxPage) break;
int Min = 0;
int Max = 20;
Random randNum = new Random();
int[] test2 = Enumerable
.Repeat(0, DefaultMaxPageSize)
.Select(i => randNum.Next(Min, Max))
.ToArray();
var lstItems = new List<int>();
lstItems.AddRange(test2);
var lstRes = new List<int>();
lstItems.AsParallel().WithDegreeOfParallelism(8).ForAll((item) =>
{
lstRes.Add(item);
});
Console.WriteLine($"{Task.CurrentId} = {lstRes.Count}");
lstData.AddRange(lstRes);
cptItems = lstRes.Count;
} while (cptItems == DefaultMaxPageSize);
}
);
Console.WriteLine($"END: {lstData.Count}");
Console.ReadKey();
}
}
}
And here is an execution log :
4 = 200
1 = 200
2 = 200
3 = 200
6 = 200
5 = 200
7 = 200
8 = 200
1 = 200
6 = 194
2 = 191
5 = 200
7 = 200
8 = 200
4 = 200
5 = 200
3 = 182
4 = 176
8 = 150
7 = 200
5 = 147
1 = 200
7 = 189
1 = 200
1 = 198
END: 4827
We can see some loops return less than 200 items.
How is it possible?
This here is not threadsafe:
lstItems.AsParallel().WithDegreeOfParallelism(8).ForAll((item) =>
{
lstRes.Add(item);
});
From the documentation for List<T>:
It is safe to perform multiple read operations on a List, but
issues can occur if the collection is modified while it's being read.
To ensure thread safety, lock the collection during a read or write
operation. To enable a collection to be accessed by multiple threads
for reading and writing, you must implement your own synchronization.
It doesn't explicitly mention it, but .Add() can also fail when called simultaneously by multiple threads.
The solution would be to lock the calls to List<T>.Add() in the loop above, but if you do that it will likely make it slower than just adding the items in a loop in a single thread.
var locker = new object();
lstItems.AsParallel().WithDegreeOfParallelism(8).ForAll((item) =>
{
lock (locker)
{
lstRes.Add(item);
}
});
Input:
public class MyObject
{
public double Value { get; set; }
public DateTime Date { get; set; }
}
Method to generate test objects:
public static MyObject[] GetTestObjects()
{
var rnd = new Random();
var date = new DateTime(2021, 1, 1, 0, 0, 0);
var result = new List<MyObject>();
for (int i = 0; i < 50000; i++)
{
//this is to simulate real data having gaps
if (rnd.Next(100) < 25)
{
continue;
}
var myObject = new MyObject()
{
Value = rnd.NextDouble(),
Date = date.AddMinutes(15 * i)
};
result.Add(myObject);
}
return result.ToArray();
}
Given this I require to calculate maximum Value for previous 12 month for each myObject. I could just think of doing this InParallel, but maybe there is an optimized solution?
Sorry for being unclear, this is what I use right now to get what I want:
public MyObject[] BruteForceBackward(MyObject[] testData)
{
return testData.AsParallel().Select(point =>
{
var max = testData.Where(x => x.Date <= point.Date && x.Date >= point.Date.AddYears(-1)).Max(x => x.Value);
return new MyObject() { Date = point.Date, Value = point.Value / max };
}).OrderBy(r => r.Date).ToArray();
}
This works but it is slow and eats processor resources (imagine, you have 100k objects), I believe there must be something better
I had a simillar project where i had to calculate such stuff on tons of sensor data.
You can now find a little more refined version in my Github repository, which should be ready to use (.Net):
https://github.com/forReason/Statistics-Helper-Library
In general you want to reduce the amount of loops going over all your data. At best, you want to touch each element only one single time.
Process Array (equiv. of BruteForceBackwards)
public static MyObject[] FlowThroughForward(ref MyObject[] testData)
{
// generate return array
MyObject[] returnData = new MyObject[testData.Length];
// keep track to minimize processing
double currentMaximum = 0;
List<MyObject> maximumValues = new List<MyObject>();
// go through the elements
for (int i = 0; i < testData.Length; i++)
{
// calculate the oldest date to keep in tracking list
DateTime targetDate = testData[i].Date.AddYears(-1);
// maximum logic
if (testData[i].Value >= currentMaximum)
{
// new maximum found, clear tracking list
// this is the best case scenario
maximumValues.Clear();
currentMaximum = testData[i].Value;
}
else
{
// unfortunately, no new maximum was found
// go backwards the maximum tracking list and check for smaller values
// clear the list of all smaller values. The list should therefore always
// be in descending order
for (int b = maximumValues.Count - 1; b >= 0; b--)
{
if (maximumValues[b].Value <= testData[i].Value)
{
// a lower value has been found. We have a newer, higher value
// clear this waste value from the tracking list
maximumValues.RemoveAt(b);
}
else
{
// there are no more lower values.
// stop looking for smaller values to save time
break;
}
}
}
// append new value to tracking list, no matter if higher or lower
// all future values might be lower
maximumValues.Add(testData[i]);
// check if the oldest value is too old to be kept in the tracking list
while (maximumValues[0].Date < targetDate)
{
// oldest value is to be removed
maximumValues.RemoveAt(0);
// update maximum
currentMaximum = maximumValues[0].Value;
}
// add object to result list
returnData[i] = new MyObject() { Date = testData[i].Date, Value = testData[i].Value / currentMaximum }; ;
}
return returnData;
}
Real Time Data or Streamed Data
Note: If you have really large lists, you might get memory issues with your approach to pass a full array. In this case: pass one value at a time, pass them from oldest value to newest value. Store the values back one at a time.
This Function can also be used on real time data.
The test method is included in code.
static void Main(string[] args)
{
int length = 50000;
Stopwatch stopWatch1 = new Stopwatch();
stopWatch1.Start();
var myObject = new MyObject();
var result = new List<MyObject>();
var date = new DateTime(2021, 1, 1, 0, 0, 0);
for (int i = 0; i < length; i++)
{
//this is to simulate real data having gaps
if (rnd.Next(100) < 25)
{
continue;
}
myObject.Value = rnd.NextDouble();
myObject.Date = date.AddMinutes(15 * i);
result.Add(CalculateNextObject(ref myObject));
}
stopWatch1.Stop();
Console.WriteLine("test code executed in " + stopWatch1.ElapsedMilliseconds + " ms");
Thread.Sleep(1000000);
}
private static Random rnd = new Random();
private static double currentMaximum = 0;
private static List<MyObject> maximumValues = new List<MyObject>();
public static MyObject CalculateNextObject(ref MyObject input)
{
// calculate the oldest date to keep in tracking list
DateTime targetDate = input.Date.AddYears(-1);
// maximum logic
if (input.Value >= currentMaximum)
{
// new maximum found, clear tracking list
// this is the best case scenario
maximumValues.Clear();
currentMaximum = input.Value;
}
else
{
// unfortunately, no new maximum was found
// go backwards the maximum tracking list and check for smaller values
// clear the list of all smaller values. The list should therefore always
// be in descending order
for (int b = maximumValues.Count - 1; b >= 0; b--)
{
if (maximumValues[b].Value <= input.Value)
{
// a lower value has been found. We have a newer, higher value
// clear this waste value from the tracking list
maximumValues.RemoveAt(b);
}
else
{
// there are no more lower values.
// stop looking for smaller values to save time
break;
}
}
}
// append new value to tracking list, no matter if higher or lower
// all future values might be lower
maximumValues.Add(input);
// check if the oldest value is too old to be kept in the tracking list
while (maximumValues[0].Date < targetDate)
{
// oldest value is to be removed
maximumValues.RemoveAt(0);
// update maximum
currentMaximum = maximumValues[0].Value;
}
// add object to result list
MyObject returnData = new MyObject() { Date = input.Date, Value = input.Value / currentMaximum };
return returnData;
}
Test Method
static void Main(string[] args)
{
MyObject[] testData = GetTestObjects();
Stopwatch stopWatch1 = new Stopwatch();
Stopwatch stopWatch2 = new Stopwatch();
stopWatch1.Start();
MyObject[] testresults1 = BruteForceBackward(testData);
stopWatch1.Stop();
Console.WriteLine("BruteForceBackward executed in " + stopWatch1.ElapsedMilliseconds + " ms");
stopWatch2.Start();
MyObject[] testresults2 = FlowThroughForward(ref testData);
stopWatch2.Stop();
Console.WriteLine("FlowThroughForward executed in " + stopWatch2.ElapsedMilliseconds + " ms");
Console.WriteLine();
Console.WriteLine("Comparing some random test results: ");
var rnd = new Random();
for (int i = 0; i < 10; i++)
{
int index = rnd.Next(0, testData.Length);
Console.WriteLine("Index: " + index + " brute: " + testresults1[index].Value + " flow: " + testresults2[index].Value);
}
Thread.Sleep(1000000);
}
Test result
Tests were performed on a machine with 32 cores, so in teory multithreaded aproach should be at advantage but youll see ;)
Function
Function Time
time %
BruteForceBackward
5334 ms
99.9%
FlowThroughForward
5 ms
0.094%
Performance improvement factor: ~time/1000
console output with data validation:
BruteForceBackward executed in 5264 ms
FlowThroughForward executed in 5 ms
Comparing some random test results:
Index: 25291 brute: 0.989688139105413 flow: 0.989688139105413
Index: 11945 brute: 0.59670821976193 flow: 0.59670821976193
Index: 30282 brute: 0.413238225210297 flow: 0.413238225210297
Index: 33898 brute: 0.38258761939139 flow: 0.38258761939139
Index: 8824 brute: 0.833512217105447 flow: 0.833512217105447
Index: 22092 brute: 0.648052464067263 flow: 0.648052464067263
Index: 24633 brute: 0.35859417692481 flow: 0.35859417692481
Index: 24061 brute: 0.540642018793402 flow: 0.540642018793402
Index: 34219 brute: 0.498785766613022 flow: 0.498785766613022
Index: 2396 brute: 0.151471808392111 flow: 0.151471808392111
Cpu usage was a lot higher on Bruteforce backwards due to parallelisation.
The worst case scenario are long periods of decreasing values. The code can still be vastly optimized but I guess this should be sufficient. For further optimisation, one might look to reduce the list shuffles when removing/adding elements to maximumValues.
An interesting and challenging problem. I put together a solution using a dynamic programming approach (first learned back in CS algorithms class back in '78). First, a tree is constructed containing pre-calculated local max values over recursively defined ranges. Once constructed, the max value for an arbitrary range can be efficiently calculated mostly using the pre-calculated values. Only at the fringes of the range does the calculation drop down to the element level.
It is not as fast as julian bechtold's FlowThroughForward method, but random access to ranges may be a plus.
Code to add to Main:
Console.WriteLine();
Stopwatch stopWatch3 = new Stopwatch();
stopWatch3.Start();
MyObject[] testresults3 = RangeTreeCalculation(ref testData, 10);
stopWatch3.Stop();
Console.WriteLine($"RangeTreeCalculation executed in {stopWatch3.ElapsedMilliseconds} ms");
... test comparison
Console.WriteLine($"Index: {index} brute: {testresults1[index].Value} flow: {testresults2[index].Value} rangeTree: {testresults3[index].Value}");
Test function:
public static MyObject[] RangeTreeCalculation(ref MyObject[] testDataArray, int partitionThreshold)
{
// For this implementation, we need to convert the Array to an ArrayList, because we need a
// reference type object that can be shared.
List<MyObject> testDataList = testDataArray.ToList();
// Construct a tree containing recursive collections of pre-calculated values
var rangeTree = new RangeTree(testDataList, partitionThreshold);
MyObject[] result = new MyObject[testDataList.Count];
Parallel.ForEach(testDataList, (item, state, i) =>
{
var max = rangeTree.MaxForDateRange(item.Date.AddYears(-1), item.Date);
result[i] = new MyObject() { Date = item.Date, Value = item.Value / max };
});
return result;
}
Supporting class:
// Class used to divide and conquer using dynamic programming.
public class RangeTree
{
public List<MyObject> Data; // This reference is shared by all members of the tree
public int Start { get; } // Index of first element covered by this node.
public int Count { get; } // Number of elements covered by this node.
public DateTime FirstDateTime { get; }
public DateTime LastDateTime { get; }
public double MaxValue { get; } // Pre-calculated max for all elements covered by this node.
List<RangeTree> ChildRanges { get; }
// Top level node constructor
public RangeTree(List<MyObject> data, int partitionThreshold)
: this(data, 0, data.Count, partitionThreshold)
{
}
// Child node constructor, which covers an recursively decreasing range of element.
public RangeTree(List<MyObject> data, int start, int count, int partitionThreshold)
{
Data = data;
Start = start;
Count = count;
FirstDateTime = Data[Start].Date;
LastDateTime = Data[Start + Count - 1].Date;
if (count <= partitionThreshold)
{
// If the range is smaller than the threshold, just calculate the local max
// directly from the items. No child ranges are defined.
MaxValue = Enumerable.Range(Start, Count).Select(i => Data[i].Value).Max();
}
else
{
// We still have a significant range. Decide how to further divide them up into sub-ranges.
// (There may be room for improvement here to better balance the tree.)
int partitionSize = (count - 1) / partitionThreshold + 1;
int partitionCount = (count - 1) / partitionSize + 1;
if (count < partitionThreshold * partitionThreshold)
{
// When one away from leaf nodes, prefer fewer full leaf nodes over more
// less populated leaf nodes.
partitionCount = (count - 1) / partitionThreshold + 1;
partitionSize = (count - 1) / partitionCount + 1;
}
ChildRanges = Enumerable.Range(0, partitionCount)
.Select(partitionNum => new {
ChildStart = Start + partitionNum * partitionSize,
ChildCount = Math.Min(partitionSize, Count - partitionNum * partitionSize)
})
.Where(part => part.ChildCount > 0) // Defensive
.Select(part => new RangeTree(Data, part.ChildStart, part.ChildCount, partitionThreshold))
.ToList();
// Now is the dynamic programming part:
// Calculate the local max as the max of all child max values.
MaxValue = ChildRanges.Max(chile => chile.MaxValue);
}
}
// Get the max value for a given range of dates withing this rangeTree node.
// This used the precalculated values as much as possible.
// Only at the fringes of the date range to we calculate at the element level.
public double MaxForDateRange(DateTime fromDate, DateTime thruDate)
{
double calculatedMax = Double.MinValue;
if (fromDate > this.LastDateTime || thruDate < this.FirstDateTime)
{
// Entire range is excluded. Nothing of interest here folks.
calculatedMax = Double.MinValue;
}
else if (fromDate <= this.FirstDateTime && thruDate >= this.LastDateTime)
{
// Entire range is included. Use the already-calculated max.
calculatedMax = this.MaxValue;
}
else if (ChildRanges != null)
{
// We have child ranges. Recurse and accumulate.
// Possible optimization: Calculate max for middle ranges first, and only bother
// with extreme partial ranges if their local max values exceed the preliminary result.
for (int i = 0; i < ChildRanges.Count; ++i)
{
double childMax = ChildRanges[i].MaxForDateRange(fromDate, thruDate);
if (childMax > calculatedMax)
{
calculatedMax = childMax;
}
}
}
else
{
// Leaf range. Loop through just this limited range of notes, checking individually for
// date in range and accumulating the result.
for (int i = 0; i < this.Count; ++i)
{
var element = Data[this.Start + i];
if (fromDate <= element.Date && element.Date <= thruDate && element.Value > calculatedMax)
{
calculatedMax = element.Value;
}
}
}
return calculatedMax;
}
}
There's plenty of room for improvement, such as parameterizing the types and generalizing the functionality to support more than just Max(Value), but the framework is there.
Assuming you meant you need the maximum Value for each of the last 12 months from result, then you can use LINQ:
var beginDateTime = DateTime.Now.AddMonths(-12);
var ans = result.Where(r => r.Date >= beginDateTime).GroupBy(r => r.Date.Month).Select(mg => mg.MaxBy(r => r.Value)).ToList();
Running some timing, I get that putting AsParallel after result changes the run time from around 16ms (first run) to around 32ms, so it is actually slower. It is about the same after the Where and about 23ms after the GroupBy (processing the 12 groups in parallel). On my PC at least, there isn't enough data or complex operations for parallelism, but the GroupBy isn't the most efficient.
Using an array and testing each element, I get the results in about 1.2ms:
var maxMOs = new MyObject[12];
foreach (var r in result.Where(r => r.Date >= beginDateTime)) {
var monthIndex = r.Date.Month-1;
if (maxMOs[monthIndex] == null || r.Value > maxMOs[monthIndex].Value)
maxMOs[monthIndex] = r;
}
Note that the results are not chronological; you could offset monthIndex by today's month to order the results if desired.
var maxMOs = new MyObject[12];
var offset = DateTime.Now.Month-11;
foreach (var r in result.Where(r => r.Date >= beginDateTime)) {
var monthIndex = r.Date.Month-offset;
if (maxMOs[monthIndex] == null || r.Value > maxMOs[monthIndex].Value)
maxMOs[monthIndex] = r;
}
A micro-optimization (mostly useful on repeat runnings) is to invert the test and use the null-propagating operator:
if (!(r.Value <= maxMOs[monthIndex]?.Value))
This saves about 0.2ms on the first run but up to 0.5ms on subsequent runs.
Here is a solution similar to julian bechtold's answer. Difference is that the maximum (and all related variables) are kept hidden away from the main implementation, in a separate class whose purpose is solely to keep track of the maximum over the past year. Algorithm is the same, I just use a few Linq expressions here and there.
We keep track of the maximum in the following class:
public class MaxSlidingWindow
{
private readonly List<MyObject> _maximumValues;
private double _max;
public MaxSlidingWindow()
{
_maximumValues = new List<MyObject>();
_max = double.NegativeInfinity;
}
public double Max => _max;
public void Add(MyObject myObject)
{
if (myObject.Value >= _max)
{
_maximumValues.Clear();
_max = myObject.Value;
}
else
{
RemoveValuesSmallerThan(myObject.Value);
}
_maximumValues.Add(myObject);
RemoveObservationsBefore(myObject.Date.AddYears(-1));
_max = _maximumValues[0].Value;
}
private void RemoveObservationsBefore(DateTime targetDate)
{
var toRemoveFromFront = 0;
while (_maximumValues[toRemoveFromFront].Date < targetDate && toRemoveFromFront <= maximumValues3.Count -1)
{
toRemoveFromFront++;
}
_maximumValues.RemoveRange(0, toRemoveFromFront);
}
private void RemoveValuesSmallerThan(double targetValue)
{
var maxEntry = _maximumValues.Count - 1;
var toRemoveFromBack = 0;
while (toRemoveFromBack <= maxEntry && _maximumValues[maxEntry - toRemoveFromBack].Value <= targetValue)
{
toRemoveFromBack++;
}
_maximumValues.RemoveRange(maxEntry - toRemoveFromBack + 1, toRemoveFromBack);
}
}
It can be used as follows:
public static MyObject[] GetTestObjects_MaxSlidingWindow()
{
var rnd = new Random();
var date = new DateTime(2021, 1, 1, 0, 0, 0);
var result = new List<MyObject>();
var maxSlidingWindow = new MaxSlidingWindow();
for (int i = 0; i < 50000; i++)
{
//this is to simulate real data having gaps
if (rnd.Next(100) < 25)
{
continue;
}
var myObject = new MyObject()
{
Value = rnd.NextDouble(),
Date = date.AddMinutes(15 * i)
};
maxSlidingWindow.Add(myObject);
var max = maxSlidingWindow.Max;
result.Add(new MyObject { Date = myObject.Date, Value = myObject.Value / max });
}
return result.ToArray();
}
See the relative timings below - above solution is slightly faster (timed over 10 million runs), but barely noticeable:
Relative timings
Closed. This question needs details or clarity. It is not currently accepting answers.
Want to improve this question? Add details and clarify the problem by editing this post.
Closed 8 years ago.
Improve this question
Every dates in the array represent a User interaction.
If there is a gap of > of 20 seconds between these dates I count a new User (lets call this group of dates a session).
Following my script I can count Users correctly.
But I cannot calculate the average time for session.
Could you point me out what am I doing wrong, of any suggestion for a better code is very welcome. Thanks.
namespace test_code
{
class Program
{
static void Main(string[] args)
{
Console.Clear();
DateTime[] dates = {
new DateTime(2014,01,01,0,0,0),
new DateTime(2014,01,01,0,0,5),
new DateTime(2014,01,01,0,0,10), // 15 s USR 1 session
new DateTime(2014,01,01,0,5,0),
new DateTime(2014,01,01,0,5,5), // 05 s USR 2
new DateTime(2014,01,01,0,10,0),
new DateTime(2014,01,01,0,10,1),
new DateTime(2014,01,01,0,10,2), // 03 s USR 3
new DateTime(2014,01,01,1,0,0),
new DateTime(2014,01,01,2,0,0),
new DateTime(2014,01,01,2,0,20) // 20 s USR 4
};
int users = dates.Length > 0 ? 1 : 0;
int gapS = 20; // Gap between date in seconds
double totalTimeGaps = 0;
double totalTime = 0;
for (int i = 0, n = i + 1; i < dates.Length - 1; i++, n++)
{
totalTime += (dates[n] - dates[i]).TotalSeconds;
if ((dates[n] - dates[i]).TotalSeconds > gapS)
{
users++;
totalTimeGaps += (dates[n] - dates[i]).TotalSeconds; // Does not count properly
Console.WriteLine(totalTimeGaps);
}
}
Console.WriteLine();
Console.WriteLine();
Console.WriteLine("Total Users: " + users);
Console.WriteLine("Total Time Avg. Session : " + (totalTime - totalTimeGaps)/users);
Console.ReadKey();
Console.Clear();
}
}
}
EDIT: Last version of working script
using System;
public class Program
{
public static void Main(string[] args)
{
DateTime[] dates = {
new DateTime(2014,01,01,0,0,0),
new DateTime(2014,01,01,0,0,5),
new DateTime(2014,01,01,0,0,10), // 10 s USR 1
new DateTime(2014,01,01,0,5,0),
new DateTime(2014,01,01,0,5,5), // 05 s USR 2
new DateTime(2014,01,01,0,10,0),
new DateTime(2014,01,01,0,10,1),
new DateTime(2014,01,01,0,10,2), // 02 s USR 3
new DateTime(2014,01,01,1,0,0), // 00 s USR 4
new DateTime(2014,01,01,2,0,0),
new DateTime(2014,01,01,2,0,20) // 20 s USR 5
};
int users = dates.Length > 0 ? 1 : 0;
int gapS = 20; // Gap between date in seconds
double totalTimeGaps = 0;
double totalTime = 0;
for (int i = 0, n = i + 1; i < dates.Length - 1; i++, n++)
{
double range = (dates[n] - dates[i]).TotalSeconds;
totalTime += range;
if (range > gapS)
{
users++;
totalTimeGaps += range;
}
}
Console.WriteLine();
Console.WriteLine();
Console.WriteLine("Total Users: " + users);
Console.WriteLine("Total Time Avg. Session : " + (totalTime - totalTimeGaps)/users);
Console.WriteLine("Total Time App." + (totalTime - totalTimeGaps));
}
}
The results I get are exactly as I'd expect when I run your program:
Total Users: 5
Total Time Avg. Session : 7.4
There are five users whose duration is 10s, 5s, 2s, 0s, 20s.
The total time is thus 37 seconds over 5 users which averages to 7.4s/user.
Your problem seems to be not in the code but in your own understanding of the data. You seem to have failed to note that what you record as user 4 is in fact two users and you are miscounting some of the timings (seemingly when there are three times involved).
Your code is correct according to the specs you gave us, you are just validating your code incorrectly.
Here is another approach using one List<DateTime> for every session-group:
List<List<DateTime>> sessions = new List<List<DateTime>> { new List<DateTime>() };
foreach(DateTime dt in dates)
{
if(!sessions.Last().Any())
sessions.Last().Add(dt);
else
{
TimeSpan diff = dt - sessions.Last().Last();
if (diff.TotalSeconds > 20)
sessions.Add(new List<DateTime> { dt });
else
sessions.Last().Add(dt);
}
}
You can use List<double> for every average:
List<double> secondsPerSession = new List<double>();
foreach (var session in sessions)
{
if (session.Count == 1)
secondsPerSession.Add(0);
else
{
double average = session.Skip(1)
.Average(d => (d - session[0]).TotalSeconds);
secondsPerSession.Add(average);
}
}
Output:
for (int i = 0; i < sessions.Count; i++)
Console.WriteLine("{0} [{1}]",
string.Join(",", sessions[i]), secondsPerSession[i]);
The problem is with your condition in you cycle. You count only the total gaps between two users and not the total time what a user has spent in your system. It should look something like this:
for (int i = 0, n = i + 1; i < dates.Length - 1; i++, n++)
{
totalTime += (dates[n] - dates[i]).TotalSeconds;
if ((dates[n] - dates[i]).TotalSeconds > gapS)
{
users++;
Console.WriteLine(totalTimeGaps);
}
else
{
totalTimeGaps += (dates[n] - dates[i]).TotalSeconds; // Does not count properly
}
}
Also if you want to get the average at the end of the cycle you should divide the total amount of time with the total number of users counter.
I have a method which should generate a unique 10-character timestamp in Base36, with microsecond resolution. However, it is failing uniqueness tests. How can this be?
private static string _lastValue = string.Empty;
private static readonly DateTime _epoch = DateTime.SpecifyKind(new DateTime(1970,1,1), DateTimeKind.Utc);
private static readonly DateTime _lastInitialized = DateTime.Now;
private static readonly Stopwatch _sw = Stopwatch.StartNew();
public static TimeSpan EpochToStopwatchStart()
{
return _lastInitialized.Subtract(_epoch);
}
public static string GetBase36Timestamp()
{
string result;
do
{
// _sw is a running Stopwatch; Microseconds = ticks / 10
long microseconds = EpochToStopwatchStart().Add(_sw.Elapsed).Ticks / 10L;
result = MicrosecondsToBase36(microseconds);
}
// MicrosecondsToBase36 encodes the Int64 value; the while() loop compares to a
// tracking field to ensure the encoded value changes from the previous one:
while (result == _lastValue);
_lastValue = result;
return result;
}
I know I discard some resolution, but this requires 10 characters in Base36, and the method checks the encoded value anyway. Unexpected dupes happen during a single run. To simplify the problem I run the tests single-threaded. I expect either the answer will be quite interesting, or I will be quite embarrassed for some very silly oversight in the question.
What happens if you add Thread.Sleep(1); in your do/while loop? You are quite possibly getting more than one microsecond generated per iteration.
Analysis
Creating a multi-threaded performance test reveals that despite the while loop, the function is able to exit at a rate of greater than once per microsecond:
static void Main(string[] args)
{
List<string> timeStamps = null; ;
int calls = 1000000;
int maxThreads = 5;
for (int threadCount = 1; threadCount <= maxThreads; threadCount++)
{
timeStamps = new List<string>(calls * maxThreads);
var userThread = new ThreadStart(() =>
{
for (int n = 0; n < calls; n++)
{
timeStamps.Add(TimeStampClass.GetBase36Timestamp());
}
});
Thread[] threads = new Thread[threadCount];
var stopwatch = Stopwatch.StartNew();
for (int j = 0; j < threadCount; j++)
{
threads[j] = new Thread(userThread);
threads[j].Start();
}
for (int j = 0; j < threadCount; j++)
{
threads[j].Join();
}
stopwatch.Stop();
Console.WriteLine("threadCount = {0}\n ------------------", threadCount);
Console.WriteLine("{0} calls in {1} milliseconds", timeStamps.Count, stopwatch.ElapsedMilliseconds);
Console.WriteLine("{0} ticks per call", (double)stopwatch.Elapsed.Ticks / (double)timeStamps.Count);
Console.WriteLine();
}
The resulting output is:
threadCount = 1
------------------
1000000 calls in 1080 milliseconds
10.802299 ticks per call
threadCount = 2
------------------
1985807 calls in 1379 milliseconds
6.94705779564681 ticks per call
threadCount = 3
------------------
2893411 calls in 1731 milliseconds
5.98568471606695 ticks per call
threadCount = 4
------------------
3715722 calls in 2096 milliseconds
5.64319478152564 ticks per call
threadCount = 5
------------------
4611970 calls in 2395 milliseconds
5.19515413153164 ticks per call
Solution for multi-threaded environment:
Surround your while loop with a lock on _lastValue:
public static string GetBase36Timestamp()
{
string result;
lock (_lastValue)
{
do
{
// _sw is a running Stopwatch; Microseconds = ticks / 10
long microseconds = EpochToStopwatchStart().Add(_sw.Elapsed).Ticks / 10L;
result = MicrosecondsToBase36(microseconds);
} while (result == _lastValue);
}
return result;
}
I think you need to use System.Threading.Interlocked.CompareExchange() to do a thread-safe compare-and-swap as an atomic operation. See Interlocked Operations for details. In a nutshell, you ...
Get a copy of the state you want to change as a local variable.
Perform your computation to get the new state
Execute Interlocked.CompareExchange(), which returns the current old-value.
if your local copy of old-value is different than the return value, the swap failed: repeat the above.
otherwise, you're good to go.
Here's a simplified example, riffing off your work:
class TimeStamp
{
static readonly DateTime unixEpoch = new DateTime(1970,1,1,0,0,0,DateTimeKind.Utc) ;
static readonly long BaseMicroseconds = (DateTime.UtcNow-unixEpoch).Ticks / 10L ;
static readonly Stopwatch Stopwatch = Stopwatch.StartNew() ;
static long State = TimeSpan.MinValue.Ticks ;
private long OffsetInMicroseconds ;
private TimeStamp()
{
long oldState ;
long newState ;
do
{
oldState = State ;
newState = Stopwatch.Elapsed.Ticks / 10L ;
} while ( oldState == newState
|| oldState != Interlocked.CompareExchange( ref State , newState , oldState )
) ;
this.OffsetInMicroseconds = newState ;
return ;
}
public static TimeStamp GetNext()
{
return new TimeStamp() ;
}
public override string ToString()
{
long v = BaseMicroseconds + this.OffsetInMicroseconds ;
string s = v.ToString() ; // conversion to Base 36 not implemented ;
return s ;
}
}
I'm trying to determine the optimal solution for this tough problem. I've got a length (let's say 11). So it's a one dimensional space 0-10. Now I've got these intervals with same length (let's assume 2 in this example). Now they're randomly distributed (overlapping, or not). Let me draw an example:
Situation:
|00|01|02|03|04|05|06|07|08|09|10| <- Space (length = 11)
|-----|
|-----|
|-----|
|-----|
|-----|
|-----| <- single interval of length = 2
Now the solution needs to find the maximal number of intervals that can fit at once without overlap.
The solution is: 4 intervals
There are three results of 4 intervals:
|00|01|02|03|04|05|06|07|08|09|10|
|-----| |-----|-----| |-----| <- result 1
|-----| |-----| |-----| |-----| <- result 2
|-----| |-----|-----| |-----| <- result 3
But there are also two more constraints as well.
If there are more results (of best solution, in this case = 4), then the one with the least number of gaps.
If there are more results still the one with the highest minimal length of all its spaces. For example the one with spaces (of length) 2 & 3 has minimal length of space = 2, that is better than 1 & 4 where the minimal length of space is only 1.
So the result 2 has 4 "continual" chunks, the other two have only 3 so the refinement is:
|00|01|02|03|04|05|06|07|08|09|10|
|-----| |-----------| |-----| <- result 1
|-----| |-----------| |-----| <- result 3
Those two got same space distributions between them, so let's take first one.
The result for the input set is:
Interval count : 4
Optimal solution: |-----| |-----------| |-----|
The algorithm has to work universally for all the space length (not only 11), all interval lengths (interval length is always <= space length) and any number of intervals.
Update:
Problematic scenario:
|00|01|02|03|04|05|06|07|08|09|
|-----|
|-----|
|-----|
|-----|
|-----|
This is a simple dynamic programming problem.
Let the total length be N and the length of a task be L.
Let F(T) be maximum number of tasks that can be selected from the sub interval (T, N), then at each unit time T, there are 3 possibilities:
There is no task that starts at T.
There is a task that starts at T, but we do not include it in the result set.
There is a task that starts at T, and we do include it in the result set.
Case 1 is simple, we just have F(T) = F(T + 1).
In case 2/3, notice that selecting a task that start a T means we must reject all tasks that start while this task is running, i.e. between T and T + L. So we get F(T) = max(F(T + 1), F(T + L) + 1).
Finally, F(N) = 0. So you just start from F(N) and work your way back to F(0).
EDIT: This will give you the maximum number of intervals, but not the set that fulfils your 2 constraints. Your explanation of the constraints is unclear to me, so I'm not sure how to help you there. In particular, I can't tell what constraint 1 means since all the solutions to your example set are apparently equal.
EDIT 2: Some further explanation as requested:
Consider your posted example, we have N = 11 and L = 2. There are tasks that start at T = 0, 3, 4, 5, 6, 9. Starting from F(11) = 0 and working backwards:
F(11) = 0
F(10) = F(11) = 0 (Since no task starts at T = 10)
F(9) = max(F(10), F(11) + 1) = 1
...
Eventually we get to F(0) = 4:
T |00|01|02|03|04|05|06|07|08|09|10|
F(T)| 4| 3| 3| 3| 3| 2| 2| 1| 1| 1| 0|
EDIT 3: Well I was curious enough about this that I wrote a solution, so may as well post it. This will give you the set that has the most tasks, with the least number of gaps, and the smallest minimum gap. The output for the examples in the question is:
(0, 2) -> (4, 6) -> (6, 8) -> (9, 11)
(0, 2) -> (4, 6) -> (8, 10)
Obviously, I make no guarantees about correctness! :)
private class Task
{
public int Start { get; set; }
public int Length { get; set; }
public int End { get { return Start + Length; } }
public override string ToString()
{
return string.Format("({0:d}, {1:d})", Start, End);
}
}
private class CacheEntry : IComparable
{
public int Tasks { get; set; }
public int Gaps { get; set; }
public int MinGap { get; set; }
public Task Task { get; set; }
public Task NextTask { get; set; }
public int CompareTo(object obj)
{
var other = obj as CacheEntry;
if (Tasks != other.Tasks)
return Tasks - other.Tasks; // More tasks is better
if (Gaps != other.Gaps)
return other.Gaps = Gaps; // Less gaps is better
return MinGap - other.MinGap; // Larger minimum gap is better
}
}
private static IList<Task> F(IList<Task> tasks)
{
var end = tasks.Max(x => x.End);
var tasksByTime = tasks.ToLookup(x => x.Start);
var cache = new List<CacheEntry>[end + 1];
cache[end] = new List<CacheEntry> { new CacheEntry { Tasks = 0, Gaps = 0, MinGap = end + 1 } };
for (int t = end - 1; t >= 0; t--)
{
if (!tasksByTime.Contains(t))
{
cache[t] = cache[t + 1];
continue;
}
foreach (var task in tasksByTime[t])
{
var oldCEs = cache[t + task.Length];
var firstOldCE = oldCEs.First();
var lastOldCE = oldCEs.Last();
var newCE = new CacheEntry
{
Tasks = firstOldCE.Tasks + 1,
Task = task,
Gaps = firstOldCE.Gaps,
MinGap = firstOldCE.MinGap
};
// If there is a task that starts at time T + L, then that will always
// be the best option for us, as it will have one less Gap than the others
if (firstOldCE.Task == null || firstOldCE.Task.Start == task.End)
{
newCE.NextTask = firstOldCE.Task;
}
// Otherwise we want the one that maximises MinGap.
else
{
var ce = oldCEs.OrderBy(x => Math.Min(x.Task.Start - newCE.Task.End, x.MinGap)).Last();
newCE.NextTask = ce.Task;
newCE.Gaps++;
newCE.MinGap = Math.Min(ce.MinGap, ce.Task.Start - task.End);
}
var toComp = cache[t] ?? cache[t + 1];
if (newCE.CompareTo(toComp.First()) < 0)
{
cache[t] = toComp;
}
else
{
var ceList = new List<CacheEntry> { newCE };
// We need to keep track of all subsolutions X that start on the interval [T, T+L] that
// have an equal number of tasks and gaps, but a possibly a smaller MinGap. This is
// because an earlier task may have an even smaller gap to this task.
int idx = newCE.Task.Start + 1;
while (idx < newCE.Task.End)
{
toComp = cache[idx];
if
(
newCE.Tasks == toComp.First().Tasks &&
newCE.Gaps == toComp.First().Gaps &&
newCE.MinGap >= toComp.First().MinGap
)
{
ceList.AddRange(toComp);
idx += toComp.First().Task.End;
}
else
idx++;
}
cache[t] = ceList;
}
}
}
var rv = new List<Task>();
var curr = cache[0].First();
while (true)
{
rv.Add(curr.Task);
if (curr.NextTask == null) break;
curr = cache[curr.NextTask.Start].First();
}
return rv;
}
public static void Main()
{
IList<Task> tasks, sol;
tasks = new List<Task>
{
new Task { Start = 0, Length = 2 },
new Task { Start = 3, Length = 2 },
new Task { Start = 4, Length = 2 },
new Task { Start = 5, Length = 2 },
new Task { Start = 6, Length = 2 },
new Task { Start = 9, Length = 2 },
};
sol = F(tasks);
foreach (var task in sol)
Console.Out.WriteLine(task);
Console.Out.WriteLine();
tasks = new List<Task>
{
new Task { Start = 0, Length = 2 },
new Task { Start = 3, Length = 2 },
new Task { Start = 4, Length = 2 },
new Task { Start = 8, Length = 2 },
};
sol = F(tasks);
foreach (var task in sol)
Console.Out.WriteLine(task);
Console.Out.WriteLine();
tasks = new List<Task>
{
new Task { Start = 0, Length = 5 },
new Task { Start = 6, Length = 5 },
new Task { Start = 7, Length = 3 },
new Task { Start = 8, Length = 9 },
new Task { Start = 19, Length = 1 },
};
sol = F(tasks);
foreach (var task in sol)
Console.Out.WriteLine(task);
Console.Out.WriteLine();
Console.In.ReadLine();
}