I'm using Task Parallel Library (TPL ) for calculating Fibonacci number.
Program is given below:
public static int Fib(int n)
{
if (n <= 1)
{
return n;
}
Task<int> task = Task.Factory.StartNew<int>(() => Fib(n - 1));
var p = Fib(n - 2);
return task.Result + p;
}
public static void Main(string[] args)
{
Stopwatch watch = new Stopwatch();
watch.Start();
Console.WriteLine("Answer: " + Fib(44));
watch.Stop();
Console.WriteLine("Time: " + watch.ElapsedMilliseconds);
}
}
Unfortunately this program takes a very long time to complete.
But serial version of this program ( as given below ) takes less than 30 seconds
to calculate 44th Fibonacci number.
public class FibTester
{
public static int Fib(int n)
{
if (n <= 1)
{
return n;
}
var q = Fib(n - 1);
var p = Fib(n - 2);
return p + q;
}
public static void Main(string[] args)
{
Stopwatch watch = new Stopwatch();
watch.Start();
Console.WriteLine("Answer: " + Fib(44));
watch.Stop();
Console.WriteLine("Time: " + watch.ElapsedMilliseconds);
}
}
I think issue in parallel version is, it creates a thread for each Fib(n - 1)
request. Is there any way to control number of thread created in TPL?
This is a perfect example of how not to multithread!
You are creating a new task for each iteration of a recursive function. So each task creates a new task, waits for that task to finish and then adds the numbers from the result.
Each thread has two jobs : 1 - to create a new thread, 2 - to add two numbers.
The overhead cost for creating each thread is going to far outweigh the cost of adding two numbers together.
To answer your question about limiting the number of threads created, the TPL uses the ThreadPool. You can limit the number of threads using ThreadPool.SetMaxThreads.
I think it is pretty clear that fibonacci cannot be parallelized unless you know some pairs of adjacent fibonacci numbers ahead of time
Just go for the iterative code.
Whatever you do, don't spawn a Task/Thread on each iteration/recursion! The overhead will kill the performance. That is a big fat Anti Pattern even if parallellization applies.
Just for fun :)
using System;
using System.Linq;
using System.Threading.Tasks;
public class Program
{
static readonly double sqrt5 = Math.Sqrt(5);
static readonly double p1 = (1 + sqrt5) / 2;
static readonly double p2 = -1 * (p1 - 1);
static ulong Fib1(int n) // surprisingly slightly slower than Fib2
{
double n1 = Math.Pow(p1, n+1);
double n2 = Math.Pow(p2, n+1);
return (ulong)((n1-n2)/sqrt5);
}
static ulong Fib2(int n) // 40x faster than Fib3
{
double n1 = 1.0;
double n2 = 1.0;
for (int i=0; i<n+1; i++)
{
n1*=p1;
n2*=p2;
}
return (ulong)((n1-n2)/sqrt5);
}
static ulong Fib3(int n) // that's fast! Done in 1.32s
{
double n1 = 1.0;
double n2 = 1.0;
Parallel.For(0,n+1,(x)=> {
n1 *= p1;
n2 *= p2;
});
return (ulong)((n1-n2)/sqrt5);
}
public static void Main(string[] args)
{
for (int j=0; j<100000; j++)
for (int i=0; i<90; i++)
Fib1(i);
for (int i=0; i<90; i++)
Console.WriteLine(Fib1(i));
}
}
Your program is very inneficient, because same calculation are repeated (Fib(n-1) actually recalculate the Fib number for all numbers < n -2, which has be done yet).
You should try this :
class Program
{
static void Main(string[] args)
{
var sw = new Stopwatch();
sw.Start();
foreach (var nbr in Fibo().Take(5000))
{
Console.Write(nbr.ToString() + " ");
}
sw.Stop();
Console.WriteLine();
Console.WriteLine("Ellapsed : " + sw.Elapsed.ToString());
Console.ReadLine();
}
static IEnumerable<long> Fibo()
{
long a = 0;
long b = 1;
long t;
while (true)
{
t = a + b;
yield return t;
a = b;
b = t;
}
}
}
44th find in 5ms.
The slowest part of the code is the Console.Write in the loop.
Related
I tryed to refactor a nested sequential for loop into a nested Parallel.For loop.
But following the recommended parallel patterns and locks, the overall result was too low compared with the sequential result.
The problem was caused by a wrong or inconsistent use of BigInteger calculation methods.
For BigInteger you need to use ++-operator or BigInteger methods like BigInteger.Add().
My sources:
How to: Write a Parallel.For Loop with Thread-Local Variables
Threading in C# - Parallel Programming - The Parallel Class - For and ForEach
Please find sample code below:
internal static class Program
{
static Object lockObj = new Object();
static void Main()
{
//target result: 575
NestedLoopAggregationTest();
return;
}
private static void NestedLoopAggregationTest()
{
BigInteger totalSequential = 0;
BigInteger totalRecomandedPattern = 0;
BigInteger totalAntiPattern = 0;
const int iEnd1 = 5;
const int iEnd2 = 10;
const int iEnd3 = 15;
for (int iCn1 = 1; iCn1 <= iEnd1; iCn1++)
{
for (int iCn2 = 1; iCn2 <= iEnd2; iCn2++)
{
for (int iCn3 = iCn2 - 1; iCn3 <= iEnd3; iCn3++)
{
totalSequential++;
}
}
}
Parallel.For(1, iEnd1 + 1, (iCn1) =>
{
Parallel.For(1, iEnd2 + 1, (iCn2) =>
{
Parallel.For<BigInteger>(iCn2 - 1, iEnd3 + 1, () => 0, (iCn3, state, subtotal) =>
{
//Solution:
//for BigInteger use ++-operator or BigInteger.Add()
subtotal = BigInteger.Add(subtotal, 1);
return subtotal;
},
(subtotal) =>
{
lock (lockObj)
{
totalRecomandedPattern = BigInteger.Add(totalRecomandedPattern, subtotal);
}
}
);
});
});
MessageBox.Show(totalSequential.ToString() + Environment.NewLine + totalRecomandedPattern.ToString() +
}
}
Your current parallel implementation requires a lock every time subtotal is modified in the inner loop. This modified approach is faster than both your serial and parallel implementaions because it avoids a lock in the innermost loop:
Parallel.For(1, iEnd1 + 1, (iCn1) =>
{
Parallel.For(1, iEnd2 + 1, (iCn2) =>
{
BigInteger subtotal = 0;
for (var iCnt3 = iCn2 - 1; iCnt3 < iEnd3 + 1; iCnt3++)
{
//Solution:
//for BigInteger use ++-operator or BigInteger.Add()
subtotal = BigInteger.Add(subtotal, 1);
}
lock (lockObj)
{
totalRecomandedPatternModified = BigInteger.Add(totalRecomandedPatternModified, subtotal);
}
});
});
I increased each of the endpoints by a factor of 10 so the runtime is long enough to be measured on my hardware, then got the following average times:
Serial: 9ms
Parallel: 11ms
Modified: 2ms
An exception occurs when I try to find the 100,000th prime number using Alea GPU. The algorithm works fine if I try to find a smaller prime number e.g. the 10,000th prime number.
I am using Alea v3.0.4, NVIDIA GTX 970, Cuda 9.2 drivers.
I am new to GPU programming. Any help would be greatly appreciated.
long[] primeNumber = new long[1]; // nth prime number to find
int n = 100000; // find the 100,000th prime number
var worker = Gpu.Default; // GTX 970 CUDA v9.2 drivers
long count = 0;
worker.LongFor(count, n, x =>
{
long a = 2;
while (count < n)
{
long b = 2;
long prime = 1;
while (b * b <= a)
{
if (a % b == 0)
{
prime = 0;
break;
}
b++;
}
if (prime > 0)
{
count++;
}
a++;
}
primeNumber[0] = (a - 1);
}
);
Here are the exception details:
System.Exception occurred HResult=0x80131500 Message=[CUDAError]
CUDA_ERROR_LAUNCH_FAILED Source=Alea StackTrace: at
Alea.CUDAInterop.cuSafeCall#2939.Invoke(String message) at
Alea.CUDAInterop.cuSafeCall(cudaError_enum result) at
A.cf5aded17df9f7cc4c132234dda010fa7.Copy#918-22.Invoke(Unit _arg9)
at Alea.Memory.Copy(FSharpOption1 streamOpt, Memory src, IntPtr
srcOffset, Memory dst, IntPtr dstOffset, FSharpOption1 lengthOpt)
at
Alea.ImplicitMemoryTrackerEntry.cdd2cd00c052408bcdbf03958f14266ca(FSharpFunc2
c600c458623dca7db199a0e417603dff4, Object
cd5116337150ebaa6de788dacd82516fa) at
Alea.ImplicitMemoryTrackerEntry.c6a75c171c9cccafb084beba315394985(FSharpFunc2
c600c458623dca7db199a0e417603dff4, Object
cd5116337150ebaa6de788dacd82516fa) at
Alea.ImplicitMemoryTracker.HostReadWriteBarrier(Object instance) at
Alea.GlobalImplicitMemoryTracker.HostReadWriteBarrier(Object instance)
at A.cf5aded17df9f7cc4c132234dda010fa7.clo#2359-624.Invoke(Object
arg00) at
Microsoft.FSharp.Collections.SeqModule.Iterate[T](FSharpFunc2 action,
IEnumerable1 source) at Alea.Kernel.LaunchRaw(LaunchParam lp,
FSharpOption1 instanceOpt, FSharpList1 args) at
Alea.Parallel.Device.DeviceFor.For(Gpu gpu, Int64 fromInclusive, Int64
toExclusive, Action1 op) at Alea.Parallel.GpuExtension.LongFor(Gpu
gpu, Int64 fromInclusive, Int64 toExclusive, Action1 op) at
TestingGPU.Program.Execute(Int32 t) in
C:\Users..\source\repos\TestingGPU\TestingGPU\Program.cs:line 148
at TestingGPU.Program.Main(String[] args)
Working Solution:
static void Main(string[] args)
{
var devices = Device.Devices;
foreach (var device in devices)
{
Console.WriteLine(device.ToString());
}
while (true)
{
Console.WriteLine("Enter a number to check if it is a prime number:");
string line = Console.ReadLine();
long checkIfPrime = Convert.ToInt64(line);
Stopwatch sw = new Stopwatch();
sw.Start();
bool GPUisPrime = GPUIsItPrime(checkIfPrime+1);
sw.Stop();
Stopwatch sw2 = new Stopwatch();
sw2.Start();
bool CPUisPrime = CPUIsItPrime(checkIfPrime+1);
sw2.Stop();
Console.WriteLine($"GPU: is {checkIfPrime} prime? {GPUisPrime} Time Elapsed: {sw.ElapsedMilliseconds.ToString()}");
Console.WriteLine($"CPU: is {checkIfPrime} prime? {CPUisPrime} Time Elapsed: {sw2.ElapsedMilliseconds.ToString()}");
}
}
[GpuManaged]
private static bool GPUIsItPrime(long n)
{
//Sieve of Eratosthenes Algorithm
bool[] isComposite = new bool[n];
var worker = Gpu.Default;
worker.LongFor(2, n, i =>
{
if (!(isComposite[i]))
{
for (long j = 2; (j * i) < isComposite.Length; j++)
{
isComposite[j * i] = true;
}
}
});
return !isComposite[n-1];
}
private static bool CPUIsItPrime(long n)
{
//Sieve of Eratosthenes Algorithm
bool[] isComposite = new bool[n];
for (int i = 2; i < n; i++)
{
if (!isComposite[i])
{
for (long j = 2; (j * i) < n; j++)
{
isComposite[j * i] = true;
}
}
}
return !isComposite[n-1];
}
Your code doesn't look right. Given a parallel for-loop method here (LongFor), Alea will spawn "n" threads, with an index "x" used to identify what the thread number is. So, for example a simple example like For(0, n, x => a[x] = x); uses "x" to initialize a[] with { 0, 1, 2, ...., n - 1}. But, your kernel code does not use "x" anywhere in the code. Consequently, you run the same code "n" times with absolutely no difference. Why then run on a GPU? What I think you want is to do is to compute in thread "x" whether "x" is prime. With result in hand, set bool prime[x] = true or false. Then, afterwards, in the kernel after all that, add a sync call, followed with a test using a single thread (e.g., x == 0) to go through prime[] and pick the largest prime from the array. Otherwise, there's a lot of collisions for 'primeNumber[0] = (a - 1);' by n-threads on the GPU. I can't imagine how you would ever get the right result. Finally, you probably want to make sure using some Alea call that prime[] is never copied to/from the GPU. But, I don't know how you do that in Alea. The compiler might be smart enough to know that prime[] is only used in the kernel code.
does anyone know a way to calculate Beta (beta coefficient) for a portfolio or stock vs. a benchmark, such as an index like S&P in c#?
I already have 2 arrays of type double that would be required for such a calculation but I can't find any sleek way to do this.
StatisticFormula.BetaFunction Method (Double, Double) exists but this accepts one value for each param, not an array - which statistically makes no sense.
thanks in advance
I'm not aware of any good C# Finance/Statistics packages, so I wrote the method directly and borrowed from this stats package: https://www.codeproject.com/Articles/42492/Using-LINQ-to-Calculate-Basic-Statistics
using System;
using System.Collections.Generic;
using System.Linq;
namespace ConsoleApplication1
{
static class Program
{
static void Main(string[] args)
{
double[] closingPriceStock = { 39.32, 39.45, 39.27, 38.73, 37.99, 38.38, 39.53, 40.55, 40.78, 41.3, 41.35, 41.25, 41.1, 41.26, 41.48, 41.68, 41.77, 41.92, 42.12, 41.85, 41.54 };
double[] closingPriceMarket = { 1972.18, 1988.87, 1987.66, 1940.51, 1867.61, 1893.21, 1970.89, 2035.73, 2079.61, 2096.92, 2102.44, 2091.54, 2083.39, 2086.05, 2084.07, 2104.18, 2077.57, 2083.56, 2099.84, 2093.32, 2098.04 };
double[] closingPriceStockDailyChange = new double[closingPriceStock.Length - 1];
double[] closingPriceMarketDailyChange = new double[closingPriceMarket.Length - 1];
for (int i = 0; i < closingPriceStockDailyChange.Length; i++)
{
closingPriceStockDailyChange[i] = (closingPriceStock[i + 1] - closingPriceStock[i]) * 100 / closingPriceStock[i];
closingPriceMarketDailyChange[i] = (closingPriceMarket[i + 1] - closingPriceMarket[i]) * 100 / closingPriceMarket[i];
}
double beta = Covariance(closingPriceStockDailyChange, closingPriceMarketDailyChange) / Variance(closingPriceMarketDailyChange);
Console.WriteLine(beta);
Console.Read();
}
public static double Variance(this IEnumerable<double> source)
{
int n = 0;
double mean = 0;
double M2 = 0;
foreach (double x in source)
{
n = n + 1;
double delta = x - mean;
mean = mean + delta / n;
M2 += delta * (x - mean);
}
return M2 / (n - 1);
}
public static double Covariance(this IEnumerable<double> source, IEnumerable<double> other)
{
int len = source.Count();
double avgSource = source.Average();
double avgOther = other.Average();
double covariance = 0;
for (int i = 0; i < len; i++)
covariance += (source.ElementAt(i) - avgSource) * (other.ElementAt(i) - avgOther);
return covariance / len;
}
}
}
This would have to be refactored to calculate beta in a function, you can import the linked package to avoid the static methods I included, etc., but this is just a toy example.
How can I make the for loop of this function to use the GPU with OpenCL?
public static double[] Calculate(double[] num, int period)
{
var final = new double[num.Length];
double sum = num[0];
double coeff = 2.0 / (1.0 + period);
for (int i = 0; i < num.Length; i++)
{
sum += coeff * (num[i] - sum);
final[i] = sum;
}
return final;
}
Your problem as written does not fit well with something that would work on a GPU. You cannot parallelize (in a way that improves performance) the operation on a single array because the value of the nth element depends on elements 1 to n. However, you can utilize the GPU to process multiple arrays, where each GPU core operates on a separate array.
The full code for the solution is at the end of the answer, but the results of the test, to calculate on 10,000 arrays each of which has 10,000 elements, generates the following (on a GTX1080M and an i7 7700k with 32GB RAM):
Task Generating Data: 1096.4583ms
Task CPU Single Thread: 596.2624ms
Task CPU Parallel: 179.1717ms
GPU CPU->GPU: 89ms
GPU Execute: 86ms
GPU GPU->CPU: 29ms
Task Running GPU: 921.4781ms
Finished
In this test, we measure the speed at which we can generate results into a managed C# array using the CPU with one thread, the CPU with all threads, and finally the GPU using all cores. We validate that the results from each test are identical, using the function AreTheSame.
The fastest time is processing the arrays on the CPU using all threads (Task CPU Parallel: 179ms).
The GPU is actually the slowest (Task Running GPU: 922ms), but this is because of the time taken to reformat the C# arrays in a way that they can be transferred onto the GPU.
If this bottleneck were removed (which is quite possible, depending on your use case), the GPU could potentially be the fastest. If the data were already formatted in a manner that can be immediately be transferred onto the GPU, the total processing time for the GPU would be 204ms (CPU->GPU: 89ms + Execute: 86ms + GPU->CPU: 29 ms = 204ms). This is still slower than the parallel CPU option, but on a different sort of data set, it might be faster.
To get the data back from the GPU (the most important part of actually using the GPU), we use the function ComputeCommandQueue.Read. This transfers the altered array on the GPU back to the CPU.
To run the following code, reference the Cloo Nuget Package (I used 0.9.1). And make sure to compile on x64 (you will need the memory). You may need to update your graphics card driver too if it fails to find an OpenCL device.
class Program
{
static string CalculateKernel
{
get
{
return #"
kernel void Calc(global int* offsets, global int* lengths, global double* doubles, double periodFactor)
{
int id = get_global_id(0);
int start = offsets[id];
int length = lengths[id];
int end = start + length;
double sum = doubles[start];
for(int i = start; i < end; i++)
{
sum = sum + periodFactor * ( doubles[i] - sum );
doubles[i] = sum;
}
}";
}
}
public static double[] Calculate(double[] num, int period)
{
var final = new double[num.Length];
double sum = num[0];
double coeff = 2.0 / (1.0 + period);
for (int i = 0; i < num.Length; i++)
{
sum += coeff * (num[i] - sum);
final[i] = sum;
}
return final;
}
static void Main(string[] args)
{
int maxElements = 10000;
int numArrays = 10000;
int computeCores = 2048;
double[][] sets = new double[numArrays][];
using (Timer("Generating Data"))
{
Random elementRand = new Random(1);
for (int i = 0; i < numArrays; i++)
{
sets[i] = GetRandomDoubles(elementRand.Next((int)(maxElements * 0.9), maxElements), randomSeed: i);
}
}
int period = 14;
double[][] singleResults;
using (Timer("CPU Single Thread"))
{
singleResults = CalculateCPU(sets, period);
}
double[][] parallelResults;
using (Timer("CPU Parallel"))
{
parallelResults = CalculateCPUParallel(sets, period);
}
if (!AreTheSame(singleResults, parallelResults)) throw new Exception();
double[][] gpuResults;
using (Timer("Running GPU"))
{
gpuResults = CalculateGPU(computeCores, sets, period);
}
if (!AreTheSame(singleResults, gpuResults)) throw new Exception();
Console.WriteLine("Finished");
Console.ReadKey();
}
public static bool AreTheSame(double[][] a1, double[][] a2)
{
if (a1.Length != a2.Length) return false;
for (int i = 0; i < a1.Length; i++)
{
var ar1 = a1[i];
var ar2 = a2[i];
if (ar1.Length != ar2.Length) return false;
for (int j = 0; j < ar1.Length; j++)
if (Math.Abs(ar1[j] - ar2[j]) > 0.0000001) return false;
}
return true;
}
public static double[][] CalculateGPU(int partitionSize, double[][] sets, int period)
{
ComputeContextPropertyList cpl = new ComputeContextPropertyList(ComputePlatform.Platforms[0]);
ComputeContext context = new ComputeContext(ComputeDeviceTypes.Gpu, cpl, null, IntPtr.Zero);
ComputeProgram program = new ComputeProgram(context, new string[] { CalculateKernel });
program.Build(null, null, null, IntPtr.Zero);
ComputeCommandQueue commands = new ComputeCommandQueue(context, context.Devices[0], ComputeCommandQueueFlags.None);
ComputeEventList events = new ComputeEventList();
ComputeKernel kernel = program.CreateKernel("Calc");
double[][] results = new double[sets.Length][];
double periodFactor = 2d / (1d + period);
Stopwatch sendStopWatch = new Stopwatch();
Stopwatch executeStopWatch = new Stopwatch();
Stopwatch recieveStopWatch = new Stopwatch();
int offset = 0;
while (true)
{
int first = offset;
int last = Math.Min(offset + partitionSize, sets.Length);
int length = last - first;
var merged = Merge(sets, first, length);
sendStopWatch.Start();
ComputeBuffer<int> offsetBuffer = new ComputeBuffer<int>(
context,
ComputeMemoryFlags.ReadWrite | ComputeMemoryFlags.UseHostPointer,
merged.Offsets);
ComputeBuffer<int> lengthsBuffer = new ComputeBuffer<int>(
context,
ComputeMemoryFlags.ReadWrite | ComputeMemoryFlags.UseHostPointer,
merged.Lengths);
ComputeBuffer<double> doublesBuffer = new ComputeBuffer<double>(
context,
ComputeMemoryFlags.ReadWrite | ComputeMemoryFlags.UseHostPointer,
merged.Doubles);
kernel.SetMemoryArgument(0, offsetBuffer);
kernel.SetMemoryArgument(1, lengthsBuffer);
kernel.SetMemoryArgument(2, doublesBuffer);
kernel.SetValueArgument(3, periodFactor);
sendStopWatch.Stop();
executeStopWatch.Start();
commands.Execute(kernel, null, new long[] { merged.Lengths.Length }, null, events);
executeStopWatch.Stop();
using (var pin = Pinned(merged.Doubles))
{
recieveStopWatch.Start();
commands.Read(doublesBuffer, false, 0, merged.Doubles.Length, pin.Address, events);
commands.Finish();
recieveStopWatch.Stop();
}
for (int i = 0; i < merged.Lengths.Length; i++)
{
int len = merged.Lengths[i];
int off = merged.Offsets[i];
var res = new double[len];
Array.Copy(merged.Doubles,off,res,0,len);
results[first + i] = res;
}
offset += partitionSize;
if (offset >= sets.Length) break;
}
Console.WriteLine("GPU CPU->GPU: " + recieveStopWatch.ElapsedMilliseconds + "ms");
Console.WriteLine("GPU Execute: " + executeStopWatch.ElapsedMilliseconds + "ms");
Console.WriteLine("GPU GPU->CPU: " + sendStopWatch.ElapsedMilliseconds + "ms");
return results;
}
public static PinnedHandle Pinned(object obj) => new PinnedHandle(obj);
public class PinnedHandle : IDisposable
{
public IntPtr Address => handle.AddrOfPinnedObject();
private GCHandle handle;
public PinnedHandle(object val)
{
handle = GCHandle.Alloc(val, GCHandleType.Pinned);
}
public void Dispose()
{
handle.Free();
}
}
public class MergedResults
{
public double[] Doubles { get; set; }
public int[] Lengths { get; set; }
public int[] Offsets { get; set; }
}
public static MergedResults Merge(double[][] sets, int offset, int length)
{
List<int> lengths = new List<int>(length);
List<int> offsets = new List<int>(length);
for (int i = 0; i < length; i++)
{
var arr = sets[i + offset];
lengths.Add(arr.Length);
}
var totalLength = lengths.Sum();
double[] doubles = new double[totalLength];
int dataOffset = 0;
for (int i = 0; i < length; i++)
{
var arr = sets[i + offset];
Array.Copy(arr, 0, doubles, dataOffset, arr.Length);
offsets.Add(dataOffset);
dataOffset += arr.Length;
}
return new MergedResults()
{
Doubles = doubles,
Lengths = lengths.ToArray(),
Offsets = offsets.ToArray(),
};
}
public static IDisposable Timer(string name)
{
return new SWTimer(name);
}
public class SWTimer : IDisposable
{
private Stopwatch _sw;
private string _name;
public SWTimer(string name)
{
_name = name;
_sw = Stopwatch.StartNew();
}
public void Dispose()
{
_sw.Stop();
Console.WriteLine("Task " + _name + ": " + _sw.Elapsed.TotalMilliseconds + "ms");
}
}
public static double[][] CalculateCPU(double[][] arrays, int period)
{
double[][] results = new double[arrays.Length][];
for (var index = 0; index < arrays.Length; index++)
{
var arr = arrays[index];
results[index] = Calculate(arr, period);
}
return results;
}
public static double[][] CalculateCPUParallel(double[][] arrays, int period)
{
double[][] results = new double[arrays.Length][];
Parallel.For(0, arrays.Length, i =>
{
var arr = arrays[i];
results[i] = Calculate(arr, period);
});
return results;
}
static double[] GetRandomDoubles(int num, int randomSeed)
{
Random r = new Random(randomSeed);
var res = new double[num];
for (int i = 0; i < num; i++)
res[i] = r.NextDouble() * 0.9 + 0.05;
return res;
}
}
as commenter Cory stated refer to this link for setup.
How to use your GPU in .NET
Here is how you would use this project:
Add the Nuget Package Cloo
Add reference to OpenCLlib.dll
Download OpenCLLib.zip
Add using OpenCL
static void Main(string[] args)
{
int[] Primes = { 1,2,3,4,5,6,7 };
EasyCL cl = new EasyCL();
cl.Accelerator = AcceleratorDevice.GPU;
cl.LoadKernel(IsPrime);
cl.Invoke("GetIfPrime", 0, Primes.Length, Primes, 1.0);
}
static string IsPrime
{
get
{
return #"
kernel void GetIfPrime(global int* num, int period)
{
int index = get_global_id(0);
int sum = (2.0 / (1.0 + period)) * (num[index] - num[0]);
printf("" %d \n"",sum);
}";
}
}
for (int i = 0; i < num.Length; i++)
{
sum += coeff * (num[i] - sum);
final[i] = sum;
}
means first element is multiplied by coeff 1 time and subtracted from 2nd element. First element also multiplied by square of coeff and this time added to 3rd element. Then first element multiplied by cube of coeff and subtracted from 4th element.
This is going like this:
-e0*c*c*c + e1*c*c - e2*c = f3
e0*c*c*c*c - e1*c*c*c + e2*c*c - e3*c = f4
-e0*c*c*c*c*c + e1*c*c*c*c - e2*c*c*c + e3*c*c - e4*c =f5
For all elements, scan through for all smaller id elements and compute this:
if difference of id values(lets call it k) of elements is odd, take subtraction, if not then take addition. Before addition or subtraction, multiply that value by k-th power of coeff. Lastly, multiply the current num value by coefficient and add it to current cell. Current cell value is final(i).
This is O(N*N) and looks like an all-pairs compute kernel. An example using an open-source C# OpenCL project:
ClNumberCruncher cruncher = new ClNumberCruncher(ClPlatforms.all().gpus(), #"
__kernel void foo(__global double * num, __global double * final, __global int *parameters)
{
int threadId = get_global_id(0);
int period = parameters[0];
double coeff = 2.0 / (1.0 + period);
double sumOfElements = 0.0;
for(int i=0;i<threadId;i++)
{
// negativity of coeff is to select addition or subtraction for different powers of coeff
double powKofCoeff = pow(-coeff,threadId-i);
sumOfElements += powKofCoeff * num[i];
}
final[threadId] = sumOfElements + num[threadId] * coeff;
}
");
cruncher.performanceFeed = true; // getting benchmark feedback on console
double[] numArray = new double[10000];
double[] finalArray = new double[10000];
int[] parameters = new int[10];
int period = 15;
parameters[0] = period;
ClArray<double> numGpuArray = numArray;
numGpuArray.readOnly = true; // gpus read this from host
ClArray<double> finalGpuArray = finalArray; // finalArray will have results
finalGpuArray.writeOnly = true; // gpus write this to host
ClArray<int> parametersGpu = parameters;
parametersGpu.readOnly = true;
// calculate kernels with exact same ordering of parameters
// num(double),final(double),parameters(int)
// finalGpuArray points to __global double * final
numGpuArray.nextParam(finalGpuArray, parametersGpu).compute(cruncher, 1, "foo", 10000, 100);
// first compute always lags because of compiling the kernel so here are repeated computes to get actual performance
numGpuArray.nextParam(finalGpuArray, parametersGpu).compute(cruncher, 1, "foo", 10000, 100);
numGpuArray.nextParam(finalGpuArray, parametersGpu).compute(cruncher, 1, "foo", 10000, 100);
Results are on finalArray array for 10000 elements, using 100 workitems per workitem-group.
GPGPU part takes 82ms on a rx550 gpu which has very low ratio of 64bit-to-32bit compute performance(because consumer gaming cards are not good at double precision for new series). An Nvidia Tesla or an Amd Vega would easily compute this kernel without crippled performance. Fx8150(8 cores) completes in 683ms. If you need to specifically select only an integrated-GPU and its CPU, you can use
ClPlatforms.all().gpus().devicesWithHostMemorySharing() + ClPlatforms.all().cpus() when creating ClNumberCruncher instance.
binaries of api:
https://www.codeproject.com/Articles/1181213/Easy-OpenCL-Multiple-Device-Load-Balancing-and-Pip
or source code to compile on your pc:
https://github.com/tugrul512bit/Cekirdekler
if you have multiple gpus, it uses them without any extra code. Including a cpu to the computations would pull gpu effectiveness down in this sample for first iteration (repeatations complete in 76ms with cpu+gpu) so its better to use 2-3 GPU instead of CPU+GPU.
I didn't check numerical stability(you should use Kahan-Summation when adding millions or more values into same variable but I didn't use it for readability and don't have an idea about if 64-bit values need this too like 32-bit ones) or any value correctness, you should do it. Also foo kernel is not optimized. It makes %50 of core times idle so it should be better scheduled like this:
thread-0: compute element 0 and element N-1
thread-1: compute element 1 and element N-2
thread-m: compute element N/2-1 and element N/2
so all workitems get similar amount of work. On top of this, using 100 for workgroup size is not optimal. It should be something like 128,256,512 or 1024(for Nvidia) but this means array size should also be an integer multiple of this too. Then it would need extra control logic in the kernel to not go out of array borders. For even more performance, for loop could have multiple partial sums to do a "loop unrolling".
Firstly, it's not about an array with subsequences that may be in some order before we start sort, it's an about array of special structure.
I'm writing now a simple method that sorts data. Until now, I used Array.Sort, but PLINQ's OrderBy outperform standard Array.Sort on large arrays.
So i decide to write my own implementation of multithreading sort. Idea was simple: split an array on partitions, parallel sort each partition, then merge all results in one array.
Now i'm done with partitioning and sorting:
public class PartitionSorter
{
public static void Sort(int[] arr)
{
var ranges = Range.FromArray(arr);
var allDone = new ManualResetEventSlim(false, ranges.Length*2);
int completed = 0;
foreach (var range in ranges)
{
ThreadPool.QueueUserWorkItem(r =>
{
var rr = (Range) r;
Array.Sort(arr, rr.StartIndex, rr.Length);
if (Interlocked.Increment(ref completed) == ranges.Length)
allDone.Set();
}, range);
}
allDone.Wait();
}
}
public class Range
{
public int StartIndex { get; }
public int Length { get; }
public Range(int startIndex, int endIndex)
{
StartIndex = startIndex;
Length = endIndex;
}
public static Range[] FromArray<T>(T[] source)
{
int processorCount = Environment.ProcessorCount;
int partitionLength = (int) (source.Length/(double) processorCount);
var result = new Range[processorCount];
int start = 0;
for (int i = 0; i < result.Length - 1; i++)
{
result[i] = new Range(start, partitionLength);
start += partitionLength;
}
result[result.Length - 1] = new Range(start, source.Length - start);
return result;
}
}
As result I get an array with special structure, for example
[1 3 5 | 2 4 7 | 6 8 9]
Now how can I use this information and finish sorting? Insertion sorts and others doesn't use information that data in blocks is already sorted, and we just need to merge them together. I tried to apply some algorithms from Merge sort, but failed.
I've done some testing with a parallel Quicksort implementation.
I tested the following code with a RELEASE build on Windows x64 10, compiled with C#6 (Visual Studio 2015), .Net 4.61, and run outside any debugger.
My processor is quad core with hyperthreading (which is certainly going to help any parallel implementation!)
The array size is 20,000,000 (so a fairly large array).
I got these results:
LINQ OrderBy() took 00:00:14.1328090
PLINQ OrderBy() took 00:00:04.4484305
Array.Sort() took 00:00:02.3695607
Sequential took 00:00:02.7274400
Parallel took 00:00:00.7874578
PLINQ OrderBy() is much faster than LINQ OrderBy(), but slower than Array.Sort().
QuicksortSequential() is around the same speed as Array.Sort()
But the interesting thing here is that QuicksortParallelOptimised() is noticeably faster on my system - so it's definitely an efficient way of sorting if you have enough processor cores.
Here's the full compilable console app. Remember to run it in RELEASE mode - if you run it in DEBUG mode the timing results will be woefully incorrect.
using System;
using System.Diagnostics;
using System.Linq;
using System.Threading.Tasks;
namespace Demo
{
class Program
{
static void Main()
{
int n = 20000000;
int[] a = new int[n];
var rng = new Random(937525);
for (int i = 0; i < n; ++i)
a[i] = rng.Next();
var b = a.ToArray();
var d = a.ToArray();
var sw = new Stopwatch();
sw.Restart();
var c = a.OrderBy(x => x).ToArray(); // Need ToArray(), otherwise it does nothing.
Console.WriteLine("LINQ OrderBy() took " + sw.Elapsed);
sw.Restart();
var e = a.AsParallel().OrderBy(x => x).ToArray(); // Need ToArray(), otherwise it does nothing.
Console.WriteLine("PLINQ OrderBy() took " + sw.Elapsed);
sw.Restart();
Array.Sort(d);
Console.WriteLine("Array.Sort() took " + sw.Elapsed);
sw.Restart();
QuicksortSequential(a, 0, a.Length-1);
Console.WriteLine("Sequential took " + sw.Elapsed);
sw.Restart();
QuicksortParallelOptimised(b, 0, b.Length-1);
Console.WriteLine("Parallel took " + sw.Elapsed);
// Verify that our sort implementation is actually correct!
Trace.Assert(a.SequenceEqual(c));
Trace.Assert(b.SequenceEqual(c));
}
static void QuicksortSequential<T>(T[] arr, int left, int right)
where T : IComparable<T>
{
if (right > left)
{
int pivot = Partition(arr, left, right);
QuicksortSequential(arr, left, pivot - 1);
QuicksortSequential(arr, pivot + 1, right);
}
}
static void QuicksortParallelOptimised<T>(T[] arr, int left, int right)
where T : IComparable<T>
{
const int SEQUENTIAL_THRESHOLD = 2048;
if (right > left)
{
if (right - left < SEQUENTIAL_THRESHOLD)
{
QuicksortSequential(arr, left, right);
}
else
{
int pivot = Partition(arr, left, right);
Parallel.Invoke(
() => QuicksortParallelOptimised(arr, left, pivot - 1),
() => QuicksortParallelOptimised(arr, pivot + 1, right));
}
}
}
static int Partition<T>(T[] arr, int low, int high) where T : IComparable<T>
{
int pivotPos = (high + low) / 2;
T pivot = arr[pivotPos];
Swap(arr, low, pivotPos);
int left = low;
for (int i = low + 1; i <= high; i++)
{
if (arr[i].CompareTo(pivot) < 0)
{
left++;
Swap(arr, i, left);
}
}
Swap(arr, low, left);
return left;
}
static void Swap<T>(T[] arr, int i, int j)
{
T tmp = arr[i];
arr[i] = arr[j];
arr[j] = tmp;
}
}
}