I have a database (RavenDB) which needs to be able to handle 300 queries (full-text search) every 10 seconds. To increase performance I split the database up so that I have multiple document stores.
my Code:
// Fires 300 queries (11 waves x 7 parallel lanes x 5 stores, capped by the
// counter) and reports the average per-query latency plus total wall time.
var watch = Stopwatch.StartNew();
int taskcnt = 0;
int sum = 0;
for (int i = 0; i < 11; i++)
{
    Parallel.For(0, 7, new Action<int>((x) =>
    {
        for (int documentStore = 0; documentStore < 5; documentStore++)
        {
            var stopWatch = Stopwatch.StartNew();
            Task<IList<eBayItem>> task = new Task<IList<eBayItem>>(Database.ExecuteQuery, new Filter()
            {
                Store = "test" + documentStore,
                MaxPrice = 600,
                MinPrice = 200,
                BIN = true,
                Keywords = new List<string>() { "Canon", "MP", "Black" },
                ExcludedKeywords = new List<string>() { "G1", "T3" }
            });
            task.ContinueWith((list) =>
            {
                stopWatch.Stop();
                // FIX: Elapsed.Milliseconds is only the 0-999 ms component of the
                // TimeSpan; TotalMilliseconds is the real duration. Also, sum and
                // taskcnt are shared across continuation threads, so the updates
                // must be atomic (+= and ++ are read-modify-write races).
                Interlocked.Add(ref sum, (int)stopWatch.Elapsed.TotalMilliseconds);
                if (Interlocked.Increment(ref taskcnt) == 300)
                {
                    watch.Stop();
                    Console.WriteLine("Average time: " + (sum / (float)300).ToString());
                    // FIX: Elapsed.ToString() already renders hh:mm:ss.fffffff;
                    // the old "ms" suffix was a wrong unit label.
                    Console.WriteLine("Total time: " + watch.Elapsed.ToString());
                }
            });
            task.Start();
        }
    }));
    Thread.Sleep(1000);
}
Average query time: 514,13 ms
Total time: 00:01:29.9108016
The code where I query ravenDB:
/// <summary>
/// Runs one full-text shard query described by the boxed Filter argument.
/// Returns the matching items, or null when the store suffix is out of range.
/// </summary>
/// <param name="Filter">Boxed <c>Filter</c> instance (Task state must be object).</param>
public static IList<eBayItem> ExecuteQuery(object Filter)
{
    Filter filter = (Filter)Filter;

    // Store names are "test0".."test4"; reject anything whose trailing digit
    // points past the last shard.
    if (int.Parse(filter.Store.ToCharArray().Last().ToString()) > 4)
    {
        Console.WriteLine(filter.Store);
        return null;
    }

    using (var session = Shards[filter.Store].OpenSession())
    {
        var query = session.Query<eBayItem, eBayItemIndexer>()
            .Where(y => y.Price <= filter.MaxPrice && y.Price >= filter.MinPrice);

        // Every required keyword must match the title (AND semantics).
        foreach (var term in filter.Keywords)
        {
            query = query.Search(xx => xx.Title, term, options: SearchOptions.And);
        }

        // Each excluded keyword is negated; iterating an empty list is a no-op,
        // so the original Count > 0 guard is unnecessary.
        foreach (var exterm in filter.ExcludedKeywords)
        {
            query = query.Search(it => it.Title, exterm, options: SearchOptions.Not);
        }

        return query.ToList<eBayItem>();
    }
}
And the initialization of RavenDB:
// Shard name ("test0".."test4") -> its embedded store instance.
static Dictionary<string, EmbeddableDocumentStore> Shards = new Dictionary<string, EmbeddableDocumentStore>();

/// <summary>
/// Registers the five embedded shards, initializes each one, and deploys the
/// eBayItem indexes into every store.
/// </summary>
public static void Connect()
{
    // NOTE(review): shard "test0" maps to "test.db" (no digit) while the others
    // use "testN.db" — confirm this asymmetry is intentional.
    Shards.Add("test0", new EmbeddableDocumentStore() { DataDirectory = "test.db" });
    Shards.Add("test1", new EmbeddableDocumentStore() { DataDirectory = "test1.db" });
    Shards.Add("test2", new EmbeddableDocumentStore() { DataDirectory = "test2.db" });
    Shards.Add("test3", new EmbeddableDocumentStore() { DataDirectory = "test3.db" });
    Shards.Add("test4", new EmbeddableDocumentStore() { DataDirectory = "test4.db" });

    // Iterate the values directly instead of re-indexing by key.
    foreach (EmbeddableDocumentStore store in Shards.Values)
    {
        store.Initialize();
        IndexCreation.CreateIndexes(typeof(eBayItemIndexer).Assembly, store);
    }
}
How can I optimize my code so my total time is lower ? Is it good to divide my database up in 5 different ones ?
EDIT: The program now has only 1 documentStore instead of 5 (as suggested by Ayende Rahien).
Also this is the Query on its own:
Price_Range:[* TO Dx600] AND Price_Range:[Dx200 TO NULL] AND Title:(Canon) AND Title:(MP) AND Title:(Black) -Title:(G1) -Title:(T3)
No, this isn't good.
Use a single embedded RavenDB. If you need sharding, that involves multiple machines.
In general, RavenDB queries take a few ms each. You need to show what your queries look like (you can call ToString() on them to see that).
Having shards of RavenDB in this manner means that all of them are fighting for CPU and IO
I know this is an old post but this was the top search result I got.
I had the same problem that my queries were taking 500ms. It now takes 100ms by applying the following search practices: http://ravendb.net/docs/article-page/2.5/csharp/client-api/querying/static-indexes/searching
Related
Closed. This question needs to be more focused. It is not currently accepting answers.
Want to improve this question? Update the question so it focuses on one problem only by editing this post.
Closed 3 years ago.
Improve this question
I'm trying to understand the C# LINQ implementation and how its performance compares against for and foreach loops.
Everywhere I see posts about how much better (in terms of performance) it is to use a for-loop implementation over a LINQ one. Example1, Example2, Example3
However, I'm trying to come up with my own POC to see if I can optimize the GroupBy and Where operations, and I see the opposite. Can you tell me if my implementations can be optimized further?
//Where Implementation (Main Call)
var students = createStudentList();
// Time the LINQ version. Where() is deferred, so the empty foreach below is
// what actually executes the filter inside the timed region.
var stopwatch1 = new Stopwatch();
stopwatch1.Start();
var y = students.Where(s=> s.age == 32);
foreach(var entry in y){} // forces enumeration of the deferred query
stopwatch1.Stop();
Console.WriteLine("1) TICKS ELAPSED WHERE: " + stopwatch1.ElapsedTicks);
Console.WriteLine("1) MILLISECONDS WHERE: " + stopwatch1.ElapsedMilliseconds);
// Time the hand-written loop; WhereManual returns an already-materialized
// list, so this foreach only iterates it.
var stopwatch2 = new Stopwatch();
stopwatch2.Start();
var y2 = WhereManual(students);
foreach(var entry in y2){}
stopwatch2.Stop();
Console.WriteLine("2) TICKS ELAPSED FOR: " + stopwatch2.ElapsedTicks);
Console.WriteLine("2) MILLISECONDS FOR: " + stopwatch2.ElapsedMilliseconds);
/// <summary>
/// Manual equivalent of <c>students.Where(s => s.age == 32).ToList()</c>,
/// used as the for-loop contender in the benchmark.
/// </summary>
public List<Student> WhereManual(List<Student> students){
    var filteredList = new List<Student>();
    // FIX: use the List.Count property, not the Count() extension method —
    // Count() dispatches through IEnumerable on every loop iteration and was
    // skewing the benchmark against the manual loop.
    for(var i = 0; i < students.Count; i++){
        var student = students[i];
        if(student.age == 32){
            filteredList.Add(student);
        }
    }
    return filteredList;
}
Output:
1) TICKS ELAPSED WHERE: 389478
1) MILLISECONDS WHERE: 38
2) TICKS ELAPSED FOR: 654023
2) MILLISECONDS FOR: 65
And for the GroupBy I have
//GroupBy Implementation (Main Call)
var students = createStudentList();
var stopwatch1 = new Stopwatch();
stopwatch1.Start();
var y = students.GroupBy(s => s.age);
foreach(var entry in y){}
stopwatch1.Stop();
Console.WriteLine("1) TICKS ELAPSED GROUPBY: " + stopwatch1.ElapsedTicks);
Console.WriteLine("1) MILLISECONDS GROUPBY: " + stopwatch1.ElapsedMilliseconds);
var stopwatch2 = new Stopwatch();
stopwatch2.Start();
var y2 = dictOperation(students);
foreach(var entry in y2){}
stopwatch2.Stop();
Console.WriteLine("2) TICKS ELAPSED FOR: " + stopwatch2.ElapsedTicks);
Console.WriteLine("2) MILLISECONDS FOR: " + stopwatch2.ElapsedMilliseconds);
/// <summary>
/// Single-lookup fetch: returns the bucket of students for the given age,
/// or null when no students of that age exist.
/// </summary>
public List<Student> GetStudent(Dictionary<int, List<Student>> dict, int age){
    List<Student> found;
    if (dict.TryGetValue(age, out found))
    {
        return found;
    }
    return null;
}
/// <summary>
/// Manual equivalent of <c>students.GroupBy(s => s.age)</c>, materialized as
/// an age -> students dictionary.
/// </summary>
public Dictionary<int, List<Student>> dictOperation(List<Student> students){
    var dict = new Dictionary<int, List<Student>>();
    // FIX: use the List.Count property, not the Count() extension method —
    // the extension call is interface-dispatched on every iteration.
    for(var i = 0; i < students.Count; i++){
        var student = students[i];
        var studentAge = student.age;
        var dictStudent = GetStudent(dict, studentAge);
        if(dictStudent == null)
        {
            // First student of this age: start a new bucket.
            dict.Add(studentAge, new List<Student>(){student});
        }
        else
        {
            dictStudent.Add(student);
        }
    }
    return dict;
}
And this is the output:
1) TICKS ELAPSED GROUPBY: 865702
1) MILLISECONDS GROUPBY: 86
2) TICKS ELAPSED FOR: 1364863
2) MILLISECONDS FOR: 1.36
Not much of an answer, but since I played with it a little I may as well share.
I did not spend much time looking at the GroupBy comparison because the types used are different enough that they may be the bottleneck, and I'm not familiar enough with IGrouping to create a new test right now.
I found that if you use the List.Count property instead of the List.Count() extension method, it saved enough time (iterating over 1000000 items) to make the manual code faster than Linq. Additionally, a few more milliseconds were saved by removing the assignment var student = students[i];:
public class Student { public string Name { get; set; } public int Age { get; set; } }
/// <summary>
/// Benchmark harness comparing LINQ Where against two manual loop variants
/// over a one-million-element student list, averaged over 100 runs.
/// </summary>
public class Program
{
    // Shared fixture: one million students with ages 0..999999.
    public static List<Student> Students = new List<Student>();

    public static void CreateStudents()
    {
        for (var i = 0; i < 1000000; i++)
        {
            Students.Add(new Student {Name = $"Student{i}", Age = i});
        }
    }

    // The asker's original loop, kept verbatim (including the slow Count()
    // extension call) so it can be measured against the improved version.
    public static List<Student> WhereManualOriginal(List<Student> students)
    {
        var filteredList = new List<Student>();
        for (var i = 0; i < students.Count(); i++)
        {
            var student = students[i];
            if (student.Age == 32)
            {
                filteredList.Add(student);
            }
        }
        return filteredList;
    }

    // Improved loop: List.Count property and direct indexing, no temp local.
    public static List<Student> WhereManualNew(List<Student> students)
    {
        var filteredList = new List<Student>();
        for (var i = 0; i < students.Count; i++)
        {
            if (students[i].Age == 32)
            {
                filteredList.Add(students[i]);
            }
        }
        return filteredList;
    }

    public static long LinqWhere()
    {
        var sw = Stopwatch.StartNew();
        var items = Students.Where(s => s.Age == 32);
        foreach (var item in items) { } // forces the deferred query to run
        sw.Stop();
        return sw.ElapsedTicks;
    }

    public static long ManualWhere()
    {
        var sw = Stopwatch.StartNew();
        var items = WhereManualOriginal(Students);
        foreach (var item in items) { }
        sw.Stop();
        return sw.ElapsedTicks;
    }

    public static long NewManualWhere()
    {
        var sw = Stopwatch.StartNew();
        var items = WhereManualNew(Students);
        foreach (var item in items) { }
        sw.Stop();
        return sw.ElapsedTicks;
    }

    public static void Main()
    {
        // Warmup stuff
        CreateStudents();
        WhereManualOriginal(Students);
        WhereManualNew(Students);
        Students.Where(s => s.Age == 32).ToList();

        var linqResults = new List<long>();
        var manualResults = new List<long>();
        var newManualResults = new List<long>();
        for (int i = 0; i < 100; i++)
        {
            newManualResults.Add(NewManualWhere());
            manualResults.Add(ManualWhere());
            linqResults.Add(LinqWhere());
        }
        Console.WriteLine("Linq where ......... " + linqResults.Average());
        Console.WriteLine("Manual where ....... " + manualResults.Average());
        Console.WriteLine("New Manual where ... " + newManualResults.Average());

        // FIX: GetKeyFromUser was never defined anywhere in the listing, so the
        // class did not compile; print the prompt and wait with Console directly.
        Console.WriteLine("\nDone! Press any key to exit...");
        Console.ReadKey();
    }
}
Output
I have to find each subset in a big enough list (500/1000 items, both positive and negative decimals) which sums to 0. I'm not an expert, so I read many articles and solutions and then wrote my code. The data comes from an Excel worksheet and I would like to mark the found sums there.
Code works in this way:
Initally I find all pair that sum to 0
Then I put the remaining sums into a list and take the combinations of up to 20 items, because I know it is not possible for a bigger combination to sum to 0
In these combinations I search if one combinations sums to 0 and save it in result list, else save sum in dictionary as key and then I'll search if dictionary contains next sums (so I check pairs of these subsets)
I keep track of the index so I can reach and modify the cells
Finding solutions is fast enough, but elaborating the results in Excel becomes really slow. I don't care about finding all solutions, but I want to find as many as possible in a short time.
What do you think about this solution? How can I improve the speed? How can I easily skip the sums that are already taken? And how can I mark the cells quickly in my worksheet, because right now that is the bottleneck of the program?
I hope it is enough clear :) Thanks to everybody for any help
Here my code of the combination's part:
// Parallel lists: the decimal cell values and their "row:col" addresses.
List<decimal> listDecimal = new List<decimal>();
List<string> listRange = new List<string>();
List<decimal> resDecimal = new List<decimal>();
// Zero-sum subsets found so far (values) and their cell addresses (ranges).
List<IEnumerable<decimal>> resDecimal2 = new List<IEnumerable<decimal>>();
List<IEnumerable<string>> resIndex = new List<IEnumerable<string>>();
// Maps a subset's sum -> index of the subset that produced it, so a later
// subset with the opposite sum can be paired into a zero-sum combination.
Dictionary<decimal, int> dicSumma = new Dictionary<decimal, int>();
// Extract value and address of each remaining cell from the worksheet model.
foreach (TarkistaSummat.CellsRemain el in list)
{
decimal sumDec = Convert.ToDecimal(el.Summa.Value);
listDecimal.Add(sumDec);
string row = el.Summa.Cells.Row.ToString();
string col = el.Summa.Cells.Column.ToString();
// Address encoded as "row:col"; split back apart when coloring cells below.
string range = el.Summa.Cells.Row.ToString() + ":" + el.Summa.Cells.Column.ToString();
listRange.Add(range);
}
// Seed with the empty subset so each element can start a new combination.
var subsets = new List<IEnumerable<decimal>> { new List<decimal>() };
var subsetsIndex = new List<IEnumerable<string>> { new List<string>() };
for (int i = 0; i < list.Count; i++)
{
if (i > 20)
{
// NOTE(review): List.GetRange's second argument is a COUNT, not an end
// index — GetRange(i, i + 20) takes i+20 items starting at i, which can
// also throw once i + (i+20) exceeds the list size. Confirm the intent
// was a 20-item window.
List<IEnumerable<decimal>> parSubsets = subsets.GetRange(i, i + 20);
List<IEnumerable<string>> parSubsetsIndex = subsetsIndex.GetRange(i, i + 20);
// Extend each windowed subset with the current element (deferred Concat).
var Z = parSubsets.Select(x => x.Concat(new[] { listDecimal[i] }));
//var Zfound = Z.Select(x => x).Where(w => w.Sum() ==0);
subsets.AddRange(Z.ToList());
var Zr = parSubsetsIndex.Select(x => x.Concat(new[] { listRange[i] }));
subsetsIndex.AddRange(Zr.ToList());
}
else
{
// First 21 elements: extend every subset built so far.
var T = subsets.Select(y => y.Concat(new[] { listDecimal[i] }));
//var Tfound = T.Select(x => x).Where(w => w.Sum() == 0);
//resDecimal2.AddRange(Tfound);
//var TnotFound = T.Except(Tfound);
subsets.AddRange(T.ToList());
var Tr = subsetsIndex.Select(y => y.Concat(new[] { listRange[i] }));
subsetsIndex.AddRange(Tr.ToList());
}
// NOTE(review): this loop redeclares `i` while the outer `for (int i...)` is
// still in scope — C# rejects that (error CS0136). One of the two loop
// variables needs a different name for this fragment to compile.
// Scan all subsets: record direct zero sums, otherwise try to pair the sum
// with a previously-seen opposite sum from dicSumma.
for (int i = 0; i < subsets.Count; i++)
{
decimal sumDec = subsets[i].Sum();
if (sumDec == 0m)
{
resDecimal2.Add(subsets[i]);
resIndex.Add(subsetsIndex[i]);
continue;
}
else
{
// A stored subset with sum -sumDec completes this one to zero.
if(dicSumma.ContainsKey(sumDec * -1))
{
dicSumma.TryGetValue(sumDec * -1, out int index);
IEnumerable<decimal> addComb = subsets[i].Union(subsets[index]);
resDecimal2.Add(addComb);
var indexComb = subsetsIndex[i].Union(subsetsIndex[index]);
resIndex.Add(indexComb);
}
else
{
// First subset seen with this sum; remember it for later pairing.
if(!dicSumma.ContainsKey(sumDec))
{
dicSumma.Add(sumDec, i);
}
}
}
}
// NOTE(review): `i` is redeclared again here — same CS0136 issue as above.
// Color every cell that participates in a zero-sum combination.
for (int i = 0; i < resIndex.Count; i++)
{
//List<Range> ranges = new List<Range>();
foreach(string el in resIndex[i])
{
string[] split = el.Split(':');
// Per-cell Excel interop writes are the reported bottleneck; batching
// these into a single Range update would likely be much faster.
Range cell = actSheet.Cells[Convert.ToInt32(split[0]), Convert.ToInt32(split[1])];
cell.Interior.ColorIndex = 6;
}
}
}
I have looked into this Q/A; though it is working to some extent, it is not working as expected. I want it to happen sequentially. How do I do that?
Thanks in advance.
You can use Enumerable.Zip to combine the agents and accounts together (after repeating the list of agents to match or exceed the number of accounts). Then GroupBy agent.
// Repeat the agent list enough times to cover every account (integer division
// rounds down, hence the +1).
var repeatCount = lstAccounts.Count / lstAgents.Count + 1;
var agents = Enumerable.Range(0, repeatCount).SelectMany(_ => lstAgents);
// agents = { "Agent1", "Agent2", "Agent3", "Agent1", "Agent2", "Agent3" }
// lstAccounts = { "1001" , "1002" , "1003" , "1004" , "1005" }
// Pair each account with the agent at the same position, then collect the
// accounts per agent.
var result = agents
    .Zip(lstAccounts, (agent, account) => new { Agent = agent, Account = account })
    .GroupBy(x => x.Agent)
    .Select(g => new { Agent = g.Key, Accounts = g.Select(x => x.Account).ToList() })
    .ToList();
It might not be the fastest way to do it, but it's short and readable.
Edit
Another way (probably nicer) to achieve the same result is to start by mapping each account to an index of agent using index % lstAgents.Count.
// Map account position -> agent slot via index % agentCount, then gather the
// accounts per slot. The GroupBy element selector projects straight to the
// account so the final Select only has to materialize the group.
var result = lstAccounts
    .Select((acc, index) => new { AgentIndex = index % lstAgents.Count, Account = acc })
    .GroupBy(x => x.AgentIndex, x => x.Account)
    .Select(g => new { Agent = lstAgents[g.Key], Accounts = g.ToList() })
    .ToList();
The algorithm is very similar to the one proposed by varocarbas, but expressed in a functional (not imperative) way.
I think that conventional loops are the best approach here: easy-to-build, clear and very scalable-/modifiable-friendly. For example:
// Round-robin distribution: account i is assigned to agent (i % agentCount).
// A single pass over the accounts replaces the original nested while/for;
// the produced dictionary contents and insertion order are identical.
Dictionary<string, List<string>> results = new Dictionary<string, List<string>>();
for (int i = 0; i < lstAccounts.Count; i++)
{
    string curAgent = lstAgents[i % lstAgents.Count];
    List<string> accounts;
    if (!results.TryGetValue(curAgent, out accounts))
    {
        accounts = new List<string>();
        results.Add(curAgent, accounts);
    }
    accounts.Add(lstAccounts[i]);
}
Additionally, note that this approach is quite fast. As a reference: around 4-5 times faster (results after a simplistic test with one of the provided inputs and a Stopwatch) than the alternative proposed by Jakub in his answer.
You can try this approach with a LINQ extension. The Split extension method will split the accounts list into "n" parts (the number of agents) so that you can assign each part to an agent.
/// <summary>
/// Demo: splits five accounts across three agents using the Split extension
/// and prints each agent's chunk.
/// </summary>
class Program
{
    static void Main(string[] args)
    {
        List<string> lstAgents = new List<string>() { "Agent1", "Agent2","Agent3" };
        List<string> lstAccounts = new List<string>() { "1001", "1002" ,"1003", "1004", "1005" };

        // One part per agent, assigned round-robin by Split.
        var op = lstAccounts.Split(lstAgents.Count);
        int i = 0;
        foreach (var accounts in op)
        {
            //Get agent
            // FIX: the original format string had no {0} placeholder, so the
            // agent name passed as an argument was silently dropped.
            Console.WriteLine("Account(s) for Agent: {0}", lstAgents[i]);
            foreach (var acc in accounts)
            {
                Console.WriteLine(acc);
            }
            Console.WriteLine(Environment.NewLine);
            i++;
        }
        Console.ReadKey();
    }
}
static class LinqExtensions
{
    /// <summary>
    /// Splits a sequence into <paramref name="parts"/> interleaved
    /// sub-sequences: element i goes into part (i % parts), round-robin.
    /// </summary>
    public static IEnumerable<IEnumerable<T>> Split<T>(this IEnumerable<T> list, int parts)
    {
        // FIX: the original captured a mutable counter (`group item by i++ % parts`)
        // in a deferred query; re-enumerating the result kept incrementing the
        // counter and produced wrong groups. The indexed Select overload is
        // pure, so every enumeration yields the same partition.
        return list
            .Select((item, index) => new { item, index })
            .GroupBy(x => x.index % parts, x => x.item)
            .Select(part => part.AsEnumerable());
    }
}
I run a build system. Datawise the simplified description would be that I have Configurations and each config has 0..n Builds.
Now builds produce artifacts and some of these are stored on server. What I am doing is writing kind of a rule, that sums all the bytes produced per configuration builds and checks if these are too much.
The code for the routine at the moment is following:
// For each recently-updated configuration, estimates the disk usage of its
// retained builds (re-applying the cleanup policy manually) and emits a
// Notification for every configuration above the 1 GB threshold.
private void CalculateExtendedDiskUsage(IEnumerable<Configuration> allConfigurations)
{
var sw = new Stopwatch();
sw.Start();
// Lets take only confs that have been updated within last 7 days
var items = allConfigurations.AsParallel().Where(x =>
x.artifact_cleanup_type != null && x.build_cleanup_type != null &&
x.updated_date > DateTime.UtcNow.AddDays(-7)
).ToList();
using (var ctx = new LocalEntities())
{
Debug.WriteLine("Context: " + sw.Elapsed);
// IQueryable root; each loop iteration composes a new deferred query on it
// that only runs at the .ToList() below (one DB round-trip per config).
var allBuilds = ctx.Builds;
var ruleResult = new List<Notification>();
foreach (var configuration in items)
{
// all builds for current configuration
var configurationBuilds = allBuilds.Where(x => x.configuration_id == configuration.configuration_id)
.OrderByDescending(z => z.build_date);
Debug.WriteLine("Filter conf builds: " + sw.Elapsed);
// Since I don't know which builds/artifacts have been cleaned up, calculate it manually
if (configuration.build_cleanup_count != null)
{
// NOTE(review): buildCleanupCount is hard-coded to "30" — the value of
// configuration.build_cleanup_count is checked for null but never read.
// Confirm whether the configured count/days should be used instead.
var buildCleanupCount = "30"; // default
if (configuration.build_cleanup_type.Equals("ReserveBuildsByDays"))
{
var buildLastCleanupDate = DateTime.UtcNow.AddDays(-int.Parse(buildCleanupCount));
configurationBuilds = configurationBuilds.Where(x => x.build_date > buildLastCleanupDate)
.OrderByDescending(z => z.build_date);
}
if (configuration.build_cleanup_type.Equals("ReserveBuildsByCount"))
{
var buildLastCleanupCount = int.Parse(buildCleanupCount);
configurationBuilds =
configurationBuilds.Take(buildLastCleanupCount).OrderByDescending(z => z.build_date);
}
}
if (configuration.artifact_cleanup_count != null)
{
// skipped, similar to previous block
}
Debug.WriteLine("Done cleanup: " + sw.Elapsed);
const int maxDiscAllocationPerConfiguration = 1000000000; // 1GB
// Sum all disc usage per configuration
// The GroupBy key always has one value here (builds were already filtered to
// one configuration_id), so this group-and-threshold runs per configuration.
var confDiscSizePerConfiguration = configurationBuilds
.GroupBy(c => new {c.configuration_id})
.Where(c => (c.Sum(z => z.artifact_dir_size) > maxDiscAllocationPerConfiguration))
.Select(groupedBuilds =>
new
{
configurationId = groupedBuilds.FirstOrDefault().configuration_id,
configurationPath = groupedBuilds.FirstOrDefault().configuration_path,
Total = groupedBuilds.Sum(c => c.artifact_dir_size),
Average = groupedBuilds.Average(c => c.artifact_dir_size)
}).ToList();
Debug.WriteLine("Done db query: " + sw.Elapsed);
ruleResult.AddRange(confDiscSizePerConfiguration.Select(iter => new Notification
{
ConfigurationId = iter.configurationId,
CreatedDate = DateTime.UtcNow,
RuleType = (int) RulesEnum.TooMuchDisc,
ConfigrationPath = iter.configurationPath
}));
Debug.WriteLine("Finished loop: " + sw.Elapsed);
}
// find owners and insert...
}
}
This does exactly what I want, but I am thinking if I could make it any faster. Currenly I see:
Context: 00:00:00.0609067
// first round
Filter conf builds: 00:00:00.0636291
Done cleanup: 00:00:00.0644505
Done db query: 00:00:00.3050122
Finished loop: 00:00:00.3062711
// avg round
Filter conf builds: 00:00:00.0001707
Done cleanup: 00:00:00.0006343
Done db query: 00:00:00.0760567
Finished loop: 00:00:00.0773370
The SQL generated by .ToList() looks very messy. (Everything that is used in WHERE is covered with an index in DB)
I am testing with 200 configurations, so this adds up to 00:00:18.6326722. I have a total of ~8k items that need to get processed daily (so the whole routine takes more than 10 minutes to complete).
I have been randomly googling around the internet, and it seems to me that Entity Framework is not very good with parallel processing. Knowing that, I still decided to give this async/await approach a try (first time I tried it, so sorry for any nonsense).
Basically if I move all the processing out of scope like:
foreach (var configuration in items)
{
var confDiscSizePerConfiguration = await GetData(configuration, allBuilds);
ruleResult.AddRange(confDiscSizePerConfiguration.Select(iter => new Notification
{
... skiped
}
And:
// Async variant of the per-configuration disk-usage query: composes the same
// filtered/grouped query and awaits its execution via ToListAsync.
// NOTE(review): maxDiscAllocationPerConfiguration (and the skipped cleanup
// filters) come from the enclosing scope not shown here — confirm they are
// fields/constants visible to this method.
private async Task<List<Tmp>> GetData(Configuration configuration, IQueryable<Build> allBuilds)
{
var configurationBuilds = allBuilds.Where(x => x.configuration_id == configuration.configuration_id)
.OrderByDescending(z => z.build_date);
//..skipped
// Deferred until ToListAsync: the whole group/filter/project runs as one
// database query per configuration.
var confDiscSizePerConfiguration = configurationBuilds
.GroupBy(c => new {c.configuration_id})
.Where(c => (c.Sum(z => z.artifact_dir_size) > maxDiscAllocationPerConfiguration))
.Select(groupedBuilds =>
new Tmp
{
ConfigurationId = groupedBuilds.FirstOrDefault().configuration_id,
ConfigurationPath = groupedBuilds.FirstOrDefault().configuration_path,
Total = groupedBuilds.Sum(c => c.artifact_dir_size),
Average = groupedBuilds.Average(c => c.artifact_dir_size)
}).ToListAsync();
return await confDiscSizePerConfiguration;
}
This, for some reason, drops the execution time for 200 items from 18 -> 13 sec. Anyway, from what I understand, since I am awaiting each .ToListAsync(), it is still processed in sequence, is that correct?
So the "can't process in parallel" claim starts coming out when I replace the foreach (var configuration in items) with Parallel.ForEach(items, async configuration =>. Doing this change results in:
A second operation started on this context before a previous
asynchronous operation completed. Use 'await' to ensure that any
asynchronous operations have completed before calling another method
on this context. Any instance members are not guaranteed to be thread
safe.
It was a bit confusing to me at first, as I await practically everywhere the compiler allows it, but possibly the data gets seeded too fast.
I tried to overcome this by being less greedy and added the new ParallelOptions {MaxDegreeOfParallelism = 4} to that parallel loop, peasant assumption was that default connection pool size is 100, all I want to use is 4, should be plenty. But it still fails.
I have also tried to create new DbContexts inside the GetData method, but it still fails. If I remember correctly (can't test now), I got
Underlying connection failed to open
What possibilities there are to make this routine go faster?
Before going in parallel, it is worth to optimize query itself. Here are some suggestions that might improve your times:
1) Use Key when working with GroupBy. This might solve issue of complex & nested SQL query as in that way you instruct Linq to use the same keys defined in GROUP BY and not to create sub-select.
// Grouping by id AND path puts both columns into the SQL GROUP BY, so the
// Select can read them from the group Key instead of forcing a sub-select
// (the FirstOrDefault() calls in the original query).
var confDiscSizePerConfiguration = configurationBuilds
.GroupBy(c => new { ConfigurationId = c.configuration_id, ConfigurationPath = c.configuration_path})
.Where(c => (c.Sum(z => z.artifact_dir_size) > maxDiscAllocationPerConfiguration))
.Select(groupedBuilds =>
new
{
configurationId = groupedBuilds.Key.ConfigurationId,
configurationPath = groupedBuilds.Key.ConfigurationPath,
Total = groupedBuilds.Sum(c => c.artifact_dir_size),
Average = groupedBuilds.Average(c => c.artifact_dir_size)
})
.ToList();
2) It seems that you are bitten by N+1 problem. In simple words - you execute one SQL query to get all configurations and N another ones to get build information. In total that would be ~8k small queries where 2 bigger queries would suffice. If used memory is not a constraint, fetch all build data in memory and optimize for fast lookup using ToLookup.
var allBuilds = ctx.Builds.ToLookup(x=>x.configuration_id);
Later you can lookup builds by:
var configurationBuilds = allBuilds[configuration.configuration_id].OrderByDescending(z => z.build_date);
3) You are doing OrderBy on configurationBuilds multiple times. Filtering does not affect record order, so you can safely remove extra calls to OrderBy:
...
configurationBuilds = configurationBuilds.Where(x => x.build_date > buildLastCleanupDate);
...
configurationBuilds = configurationBuilds.Take(buildLastCleanupCount);
...
4) There is no point to do GroupBy as builds are already filtered for a single configuration.
UPDATE:
I took it one step further and created code that would retrieve same results as your provided code with a single request. It should be more performant and use less memory.
/// <summary>
/// Single-round-trip version: the database evaluates both cleanup policies
/// and the size threshold, returning only the offending configurations.
/// NOTE(review): buildLastCleanupDate, buildCleanupCount and
/// maxDiscAllocationPerConfiguration must be fields/constants in scope, and
/// DateTime.UtcNow inside the Select may need to be assigned after
/// materialization depending on the EF provider — confirm both.
/// </summary>
private void CalculateExtendedDiskUsage()
{
    using (var ctx = new LocalEntities())
    {
        var ruleResult = ctx.Configurations
            .Where(x => x.build_cleanup_count != null &&
                (
                    (x.build_cleanup_type == "ReserveBuildsByDays" && ctx.Builds.Where(y => y.configuration_id == x.configuration_id).Where(y => y.build_date > buildLastCleanupDate).Sum(y => y.artifact_dir_size) > maxDiscAllocationPerConfiguration) ||
                    (x.build_cleanup_type == "ReserveBuildsByCount" && ctx.Builds.Where(y => y.configuration_id == x.configuration_id).OrderByDescending(y => y.build_date).Take(buildCleanupCount).Sum(y => y.artifact_dir_size) > maxDiscAllocationPerConfiguration)
                )
            )
            .Select(x => new Notification
            {
                ConfigurationId = x.configuration_id,
                // FIX: the original listing was missing the comma after this
                // initializer, which made the whole method a syntax error.
                ConfigrationPath = x.configuration_path,
                CreatedDate = DateTime.UtcNow,
                RuleType = (int)RulesEnum.TooMuchDisc,
            })
            .ToList();
    }
}
First, create a new context in every Parallel.ForEach iteration if you're going to go that route. But you need to write a query that gets all the needed data in one trip. To speed up EF you can also disable change tracking or proxies on the context when you're reading data.
There are a lot of places for optimizations...
There are places where you should put .ToArray() to avoid asking the server multiple times...
I did a lot of refactor, but I'm unable to check, due lack of more information.
Maybe this can lead you to a better solution...
// Thin entry point: opens the context and delegates the rule evaluation.
// NOTE(review): the parameter lost its generic argument in the listing —
// GetRulesResult expects IEnumerable<Configuration>, so this likely should be
// IEnumerable<Configuration> too. Likewise ctx.Builds is passed where
// GetRulesResult declares ICollection<Configuration>; confirm the element type.
private void CalculateExtendedDiskUsage(IEnumerable allConfigurations)
{
var sw = new Stopwatch();
sw.Start();
using (var ctx = new LocalEntities())
{
Debug.WriteLine("Context: " + sw.Elapsed);
var allBuilds = ctx.Builds;
var ruleResult = GetRulesResult(sw, allConfigurations, allBuilds); // Clean Code!!!
// find owners and insert...
}
}
// Filters to configurations updated in the last 7 days and flattens the
// notifications produced for each one. ToArray materializes everything before
// the context is disposed by the caller.
// NOTE(review): allBuilds is declared ICollection<Configuration> but the call
// site passes ctx.Builds — the element type looks like it should be Build.
private static IEnumerable<Notification> GetRulesResult(Stopwatch sw, IEnumerable<Configuration> allConfigurations, ICollection<Configuration> allBuilds)
{
// Lets take only confs that have been updated within last 7 days
var ruleResult = allConfigurations
.AsParallel() // Check if you really need this right here...
.Where(IsConfigElegible) // Clean Code!!!
.SelectMany(x => CreateNotifications(sw, allBuilds, x))
.ToArray();
Debug.WriteLine("Finished loop: " + sw.Elapsed);
return ruleResult;
}
// A configuration is eligible when both cleanup policies are configured and
// it was updated within the last 7 days.
private static bool IsConfigElegible(Configuration x)
{
    if (x.artifact_cleanup_type == null || x.build_cleanup_type == null)
    {
        return false;
    }

    return x.updated_date > DateTime.UtcNow.AddDays(-7);
}
// Builds the over-quota notifications for one configuration: filter its
// builds, re-apply the cleanup policies, sum artifact sizes, and project each
// offending group to a Notification.
// NOTE(review): the returned IEnumerable is lazy — nothing here executes until
// the caller enumerates it (GetRulesResult's ToArray does that).
private static IEnumerable<Notification> CreateNotifications(Stopwatch sw, IEnumerable<Configuration> allBuilds, Configuration configuration)
{
// all builds for current configuration
var configurationBuilds = allBuilds
.Where(x => x.configuration_id == configuration.configuration_id);
// .OrderByDescending(z => z.build_date); <<< You should order only when needed (most at the end)
Debug.WriteLine("Filter conf builds: " + sw.Elapsed);
configurationBuilds = BuildCleanup(configuration, configurationBuilds); // Clean Code!!!
configurationBuilds = ArtifactCleanup(configuration, configurationBuilds); // Clean Code!!!
Debug.WriteLine("Done cleanup: " + sw.Elapsed);
const int maxDiscAllocationPerConfiguration = 1000000000; // 1GB
// Sum all disc usage per configuration
var confDiscSizePerConfiguration = configurationBuilds
.OrderByDescending(z => z.build_date) // I think that you can put this even later (or not to have anyway)
.GroupBy(c => c.configuration_id) // No need to create a new object, just use the property
.Where(c => (c.Sum(z => z.artifact_dir_size) > maxDiscAllocationPerConfiguration))
.Select(CreateSumPerConfiguration);
Debug.WriteLine("Done db query: " + sw.Elapsed);
// Extracting to variable to be able to return it as function result
var notifications = confDiscSizePerConfiguration
.Select(CreateNotification);
return notifications;
}
// Re-applies the build retention policy. When no policy is configured the
// builds pass through untouched; otherwise both trim helpers run (each is a
// no-op unless its cleanup type matches).
private static IEnumerable<Configuration> BuildCleanup(Configuration configuration, IEnumerable<Configuration> builds)
{
    if (configuration.build_cleanup_count == null)
    {
        return builds;
    }

    const int buildCleanupCount = 30; // default retention window/size
    return GetDiscartAbove(configuration, buildCleanupCount,
        GetDiscartBelow(configuration, buildCleanupCount, builds));
}
// Placeholder for the artifact retention policy (mirrors BuildCleanup).
// Currently a pass-through in both branches.
private static IEnumerable<Configuration> ArtifactCleanup(Configuration configuration, IEnumerable<Configuration> configurationBuilds)
{
    if (configuration.artifact_cleanup_count == null)
    {
        return configurationBuilds;
    }

    // skipped, similar to previous block
    return configurationBuilds;
}
// Projects one group of builds to its disk-usage summary. Every element of
// the group shares the same configuration id/path (that is the group key),
// so any representative element supplies them.
private static SumPerConfiguration CreateSumPerConfiguration(IGrouping<object, Configuration> groupedBuilds)
{
    var representative = groupedBuilds.First();

    var summary = new SumPerConfiguration
    {
        configurationId = representative.configuration_id,
        configurationPath = representative.configuration_path,
        Total = groupedBuilds.Sum(c => c.artifact_dir_size),
        Average = groupedBuilds.Average(c => c.artifact_dir_size)
    };
    return summary;
}
// "ReserveBuildsByDays" policy: keep only builds newer than the cleanup
// horizon; any other policy passes the sequence through unchanged.
private static IEnumerable<Configuration> GetDiscartBelow(Configuration configuration,
    int buildCleanupCount,
    IEnumerable<Configuration> configurationBuilds)
{
    if (!configuration.build_cleanup_type.Equals("ReserveBuildsByDays"))
    {
        return configurationBuilds;
    }

    var cutoff = DateTime.UtcNow.AddDays(-buildCleanupCount);
    return configurationBuilds.Where(x => x.build_date > cutoff);
}
// "ReserveBuildsByCount" policy: keep only the first N builds of the
// sequence; any other policy passes the sequence through unchanged.
private static IEnumerable<Configuration> GetDiscartAbove(Configuration configuration,
    int buildLastCleanupCount,
    IEnumerable<Configuration> configurationBuilds)
{
    if (!configuration.build_cleanup_type.Equals("ReserveBuildsByCount"))
    {
        return configurationBuilds;
    }

    return configurationBuilds.Take(buildLastCleanupCount);
}
// Maps one disk-usage summary to the notification persisted for the rule.
private static Notification CreateNotification(SumPerConfiguration iter) => new Notification
{
    ConfigurationId = iter.configurationId,
    CreatedDate = DateTime.UtcNow,
    RuleType = (int)RulesEnum.TooMuchDisc,
    ConfigrationPath = iter.configurationPath
};
}
// Aggregated disk-usage summary for a single configuration, produced by
// CreateSumPerConfiguration and consumed by CreateNotification.
internal class SumPerConfiguration {
public object configurationId { get; set; } // configuration_id of the group's builds
public object configurationPath { get; set; } // I did use 'object' cause I don't know your type data
public int Total { get; set; } // Sum of artifact_dir_size over the group
public double Average { get; set; } // Average artifact_dir_size over the group
}
Suppose there are an arbitrary number of threads in my C# program. Each thread needs to look up the changeset ids for a particular path by looking up it's history. The method looks like so:
/// <summary>
/// Collects the ChangesetId of every changeset QueryHistory returns for the
/// given path (up to Int32.MaxValue entries, no recursion).
/// </summary>
/// <param name="path">Version-control path; assumed to be a file.</param>
/// <param name="tfsClient">Client used to issue the history query.</param>
public List<int> GetIdsFromHistory(string path, VersionControlServer tfsClient)
{
IEnumerable submissions = tfsClient.QueryHistory(
path,
VersionSpec.Latest,
0,
RecursionType.None, // Assume that the path is to a file, not a directory
null,
null,
null,
Int32.MaxValue,
false,
false);
List<int> ids = new List<int>();
// The result is a non-generic IEnumerable; the typed foreach variable casts
// each element to Changeset as it is enumerated.
foreach(Changeset cs in submissions)
{
ids.Add(cs.ChangesetId);
}
return ids;
}
My question is, does each thread need its own VersionControlServer instance, or will one suffice? My intuition tells me that each thread needs its own instance, since the TFS SDK uses web services and I should probably have more than one connection open if I'm really going to get parallel behavior. If I only use one connection, my intuition tells me that I'll get serial behavior even though I've got multiple threads.
If I need as many instances as there are threads, I think of using an Object-Pool pattern, but will the connections time out and close over a long period if not being used? The docs seem sparse in this regard.
It would appear that threads using the SAME client is the fastest option.
Here's the output from a test program that runs 4 tests 5 times each and returns the average result in milliseconds. Clearly using the same client across multiple threads is the fastest execution:
Parallel Pre-Alloc: Execution Time Average (ms): 1921.26044
Parallel AllocOnDemand: Execution Time Average (ms): 1391.665
Parallel-SameClient: Execution Time Average (ms): 400.5484
Serial: Execution Time Average (ms): 1472.76138
For reference, here's the test program itself (also on GitHub):
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Microsoft.TeamFoundation;
using Microsoft.TeamFoundation.Client;
using Microsoft.TeamFoundation.VersionControl.Client;
using System.Collections;
using System.Threading.Tasks;
using System.Diagnostics;
namespace QueryHistoryPerformanceTesting
{
class Program
{
static string TFS_COLLECTION = /* TFS COLLECTION URL */
static VersionControlServer GetTfsClient()
{
    // Open and authenticate a fresh connection to the project collection,
    // then resolve its version-control service. Each call produces a new,
    // independent client.
    var collection = TfsTeamProjectCollectionFactory.GetTeamProjectCollection(
        new Uri(TFS_COLLECTION),
        new UICredentialsProvider());
    collection.EnsureAuthenticated();
    return collection.GetService<VersionControlServer>();
}
// Work item for one benchmark task: the server path to query, paired with
// the client that should run the query (null means "allocate on demand").
struct ThrArg
{
    public string path { get; set; }
    public VersionControlServer tfc { get; set; }
}
// Server paths queried by every benchmark variant (elided in this listing).
static List<string> PATHS = new List<string> {
// ASSUME 21 FILE PATHS
};
// Number of repetitions averaged per benchmark variant.
static int NUM_RUNS = 5;
static void Main(string[] args)
{
    // Run each benchmark variant NUM_RUNS times and report its mean duration.
    RunBenchmark("Parallel Pre-Alloc", RunTestParallelPreAlloc);
    RunBenchmark("Parallel AllocOnDemand", RunTestParallelAllocOnDemand);
    RunBenchmark("Parallel-SameClient", RunTestParallelSameClient);
    RunBenchmark("Serial", RunTestSerial);
}

// Executes one benchmark NUM_RUNS times and prints the average elapsed
// milliseconds under the given label.
static void RunBenchmark(string label, Func<TimeSpan> benchmark)
{
    var samples = new List<double>();
    for (int run = 0; run < NUM_RUNS; run++)
    {
        samples.Add(benchmark().TotalMilliseconds);
    }
    Console.WriteLine(label + ": Execution Time Average (ms): " + samples.Average());
}
static TimeSpan RunTestParallelPreAlloc()
{
    // One dedicated TFS client per path, all created before timing starts.
    var work = PATHS.Select(p => new ThrArg { path = p, tfc = GetTfsClient() })
                    .ToList();
    return RunTestParallel(work);
}
static TimeSpan RunTestParallelAllocOnDemand()
{
    // No client up front: tfc stays null, so GetIdsFromHistory allocates one
    // per call, inside the timed region.
    var work = PATHS.Select(p => new ThrArg { path = p, tfc = null })
                    .ToList();
    return RunTestParallel(work);
}
static TimeSpan RunTestParallelSameClient()
{
    // All paths share a single client instance, created before timing starts.
    var sharedClient = GetTfsClient();
    var work = PATHS.Select(p => new ThrArg { path = p, tfc = sharedClient })
                    .ToList();
    return RunTestParallel(work);
}
// Queries every path in parallel and returns the wall-clock time for the batch.
//
// Fix: the original called allIds.AddRange(...) directly from the
// Parallel.ForEach body. List<T> is not thread-safe for concurrent writes,
// so concurrent AddRange calls can corrupt the list's internal state, lose
// ids, or throw. The history query (the expensive part) still runs in
// parallel; only the cheap list append is serialized behind a lock.
static TimeSpan RunTestParallel(List<ThrArg> args)
{
    var allIds = new List<int>();
    var gate = new object(); // guards allIds against concurrent mutation
    var stopWatch = new Stopwatch();
    stopWatch.Start();
    Parallel.ForEach(args, s =>
    {
        var ids = GetIdsFromHistory(s.path, s.tfc); // outside the lock
        lock (gate)
        {
            allIds.AddRange(ids);
        }
    });
    stopWatch.Stop();
    return stopWatch.Elapsed;
}
static TimeSpan RunTestSerial()
{
    // Baseline: one client, every path processed sequentially on this thread.
    VersionControlServer client = GetTfsClient();
    var collected = new List<int>();
    var timer = new Stopwatch();
    timer.Start();
    foreach (var path in PATHS)
    {
        collected.AddRange(GetIdsFromHistory(path, client));
    }
    timer.Stop();
    return timer.Elapsed;
}
static List<int> GetIdsFromHistory(string path, VersionControlServer tfsClient)
{
    // A null client means "allocate on demand" (AllocOnDemand variant):
    // create a fresh connection for this single query.
    tfsClient = tfsClient ?? GetTfsClient();
    // QueryHistory yields a non-generic IEnumerable of Changeset items.
    IEnumerable history = tfsClient.QueryHistory(
        path,
        VersionSpec.Latest,
        0,
        RecursionType.None, // path is assumed to name a file, not a directory
        null,
        null,
        null,
        Int32.MaxValue,
        false,
        false);
    var ids = new List<int>();
    foreach (Changeset cs in history)
    {
        ids.Add(cs.ChangesetId);
    }
    return ids;
}