Implementing Hoey Shamos algorithm with C# - c#

Okay, I am now getting the correct information from my current algorithm! However, with 700,000 polygons to check, it's just way too slow! The previous issue is fixed (My Line2D intersectsWith method was incorrect)
Now it's a matter of identifying my bottleneck! This algorithm is supposed to be O(n log n), so it should be much quicker. My intersectsLine method looks like it can't get any faster; however, I will post its code in case I'm wrong.
EDIT: Added IComparable interface
My method for reading line segment intersections. Some code has been omitted for readability.
/// <summary>
/// A 2D line segment running from (X1, Y1) to (X2, Y2).
/// </summary>
/// <remarks>
/// The coordinate members (X1, Y1, X2, Y2) and their getX1()/getY1()/getX2()/getY2()
/// accessors were omitted from the original listing and are still expected to exist.
/// </remarks>
public class Line2D : IComparable
{
    public Line2D(XYPoints p1, XYPoints p2)
    {
        // Body omitted in the original listing.
    }

    /// <summary>
    /// Tests whether this segment and <paramref name="comparedLine"/> intersect.
    /// </summary>
    /// <param name="comparedLine">The segment to test against.</param>
    /// <returns>
    /// true when the segments cross or touch; false when they are parallel
    /// (including collinear) or when one segment merely starts where the other ends.
    /// </returns>
    public bool intersectsLine(Line2D comparedLine)
    {
        // Consecutive polygon edges share a vertex; that is not a self-intersection.
        if (X2 == comparedLine.X1 && Y2 == comparedLine.Y1)
        {
            return false;
        }
        if (X1 == comparedLine.X2 && Y1 == comparedLine.Y2)
        {
            return false;
        }

        double firstLineSlopeX = X2 - X1;
        double firstLineSlopeY = Y2 - Y1;
        double secondLineSlopeX = comparedLine.X2 - comparedLine.X1;
        double secondLineSlopeY = comparedLine.Y2 - comparedLine.Y1;

        // Shared denominator of both parameters; zero means the directions are parallel.
        double denominator = -secondLineSlopeX * firstLineSlopeY + firstLineSlopeX * secondLineSlopeY;
        if (denominator == 0)
        {
            // Previously this fell out of the NaN/Infinity comparisons below; made explicit.
            return false;
        }

        double offsetX = X1 - comparedLine.X1;
        double offsetY = Y1 - comparedLine.Y1;

        // s and t are the positions of the crossing point along the two segments;
        // both must lie in [0, 1] for the crossing to be on both segments.
        double s = (-firstLineSlopeY * offsetX + firstLineSlopeX * offsetY) / denominator;
        double t = (secondLineSlopeX * offsetY - secondLineSlopeY * offsetX) / denominator;

        return s >= 0 && s <= 1 && t >= 0 && t <= 1;
    }

    /// <summary>
    /// Orders segments by Y1, breaking ties by Y2.
    /// </summary>
    /// <param name="obj">The other <see cref="Line2D"/>; null sorts before any instance.</param>
    /// <returns>-1, 0 or 1 when this segment sorts before, equal to, or after <paramref name="obj"/>.</returns>
    /// <exception cref="InvalidCastException"><paramref name="obj"/> is not a Line2D.</exception>
    int IComparable.CompareTo(object obj)
    {
        if (obj == null)
        {
            // IComparable convention: every instance is greater than null.
            return 1;
        }

        Line2D o1 = this;
        Line2D o2 = (Line2D)obj;

        if (o1.getY1() < o2.getY1())
        {
            return -1;
        }
        // Fixed: this used to compare against o2.getY2(), which made the ordering
        // inconsistent (a.CompareTo(b) and b.CompareTo(a) could disagree).
        if (o1.getY1() > o2.getY1())
        {
            return 1;
        }

        if (o1.getY2() < o2.getY2())
        {
            return -1;
        }
        if (o1.getY2() > o2.getY2())
        {
            return 1;
        }
        return 0;
    }
}
The bulk of my algorithm implementation, I realize a List isn't the fastest for an algorithm, however I need indexing!:
//Create a new list, sort by Y values.
// Sweep over the events in ascending Y order. `sweepline` holds the segments
// that are currently open and is re-sorted by Y1 after every insertion.
// Returns true as soon as two neighbouring segments intersect.
List<AlgEvent> SortedList = events.OrderBy(o => o.getY()).ToList();
List<Line2D> sweepline = new List<Line2D>();
for (var g = 0; g < SortedList.Count; g++)
{
if (SortedList[g].isStart)
{
// Start event: test the new segment against its neighbours, then insert it.
Line2D nl = SortedList[g].line;
Line2D above;
/* Start generating above */
try
{
//grab index in sweepline
// NOTE(review): nl is only added at the end of this branch, so unless an equal
// line is already present IndexOf returns -1 here and `above` is always null.
// IndexOf is also a linear scan of the list.
int index = sweepline.IndexOf(nl);
//add 1 to get above line
if (index == -1)
{
above = null;
}
else
{
above = sweepline[index + 1];
}
}
catch (ArgumentOutOfRangeException)
{
// nl was the last entry: there is no line above it.
above = null;
}
/* End generating above */
if (above != null)
{
if (above.intersectsLine(nl))
{
return true;
}
}
Line2D below;
/* Start generating below */
try
{
//grab index in sweepline
int index = sweepline.IndexOf(nl);
//subtract 1 to get below line
// NOTE(review): there is no -1 check here; a "not found" result indexes
// sweepline[-2] and relies on the exception handler to produce null.
below = sweepline[index - 1];
}
catch (ArgumentOutOfRangeException)
{
below = null;
}
/* End generating below */
if (below != null)
{
if (below.intersectsLine(nl))
{
return true;
}
}
// Insert the new segment and re-sort the whole list by Y1 (a full sort per start event).
sweepline.Add(nl);
sweepline = sweepline.OrderBy(o => o.getY1()).ToList();
}
else
{
// End event: find the neighbours of the closing segment, remove it, then
// test the two neighbours against each other because they become adjacent.
Line2D nl = SortedList[g].line;
Line2D above;
Line2D below;
/* Start generating above */
try
{
//grab index in sweepline
int index = sweepline.IndexOf(nl);
// Debug output written for every end event.
Console.Out.WriteLine("index:" + index);
//add 1 to get above line
above = sweepline[index + 1];
}
catch (ArgumentOutOfRangeException)
{
above = null;
}
/* End generating above */
/* Start generating below */
try
{
//grab index in sweepline
int index = sweepline.IndexOf(nl);
//subtract 1 to get below line
below = sweepline[index - 1];
}
catch (ArgumentOutOfRangeException)
{
below = null;
}
/* End generating below */
// The list is re-sorted before the removal.
sweepline = sweepline.OrderBy(o => o.getY1()).ToList();
sweepline.Remove(nl);
if (above != null && below != null)
{
if (above.intersectsLine(below))
{
return true;
}
}
}
// Debug output: one blank line per processed event.
Console.WriteLine("");
}
} // end numofparts for-loop
// No intersecting neighbours were found.
return false;
============================================
UPDATE: September 12th:
Implemented the TreeSet from C5, implemented IComparable to my classes, and slowed it down even more? I am still indexing it if that matters?
http://www.itu.dk/research/c5/
Code using TreeSet:
// Same sweep as the List-based version, but the open segments are kept in a
// C5 TreeSet, which stays ordered through Line2D's comparison and needs no re-sort.
// NOTE(review): this version returns false when an intersection is found,
// whereas the List-based version returns true — confirm which is intended.
TreeSet<Line2D> sweepline = new TreeSet<Line2D>();
for (var g = 0; g < SortedList.Count; g++)
{
if (SortedList[g].isStart)
{
// Start event: test the new segment against its neighbours, then insert it.
Line2D nl = SortedList[g].line;
Line2D above;
/* Start generating above */
try
{
//grab index in sweepline
// NOTE(review): nl has not been added yet (Add is at the end of this branch),
// so this lookup presumably reports "not found" — verify what index results.
int index = sweepline.IndexOf(nl);
//add 1 to get above line
above = sweepline[index + 1];
}
catch (IndexOutOfRangeException)
{
above = null;
}
/* End generating above */
if (above != null)
{
if (above.intersectsLine(nl))
{
return false;
}
}
Line2D below;
/* Start generating below */
try
{
//grab index in sweepline
int index = sweepline.IndexOf(nl);
//subtract 1 to get below line
below = sweepline[index - 1];
}
catch (IndexOutOfRangeException)
{
below = null;
}
/* End generating below */
if (below != null)
{
if (below.intersectsLine(nl))
{
return false;
}
}
sweepline.Add(nl);
//sweepline = sweepline.OrderBy(o => o.getY1()).ToList();
}
else
{
// End event: find the neighbours of the closing segment, remove it, then
// test the two neighbours against each other because they become adjacent.
Line2D nl = SortedList[g].line;
Line2D above;
Line2D below;
/* Start generating above */
try
{
//grab index in sweepline
int index = sweepline.IndexOf(nl);
//Console.Out.WriteLine("index:" + index);
//add 1 to get above line
above = sweepline[index + 1];
}
catch (IndexOutOfRangeException)
{
above = null;
}
/* End generating above */
/* Start generating below */
try
{
//grab index in sweepline
int index = sweepline.IndexOf(nl);
//subtract 1 to get below line
below = sweepline[index - 1];
}
catch (IndexOutOfRangeException)
{
below = null;
}
/* End generating below */
//sweepline = sweepline.OrderBy(o => o.getY1()).ToList();
sweepline.Remove(nl);
if (above != null && below != null)
{
if (above.intersectsLine(below))
{
return false;
}
}
}
//Console.WriteLine("");
}

First, regarding the line intersection: you do not need the actual point of intersection, only to know if they intersect. See http://www.geeksforgeeks.org/check-if-two-given-line-segments-intersect/ for an algorithm that does just that.
About the List implementation:
In your implementation using Lists, you call indexOf on the sweepline to find nl. This searches the sweepline from start to end. See List(T).IndexOf. If you were to use the BinarySearch method, that ought to speed up the search considerably.
List's documentation has a paragraph called performance considerations. They urge you to use a value type that implements IEquatable<T> and IComparable<T>. So, your Line2D should probably be a struct and implement these interfaces.
If you follow that advice, retrieval of the endpoint from the sweepline should be O(log n), which is sufficient for your purpose, and memory should be used more efficiently.
Insertion and removal are O(n) for Lists, cause the underlying array needs to be moved around in memory. A SortedSet has faster insertion and removal, but I don't quite see how to find an item's neighbors in O(log n) there. Anyone? (See also Why SortedSet<T>.GetViewBetween isn't O(log N)?)
Anyways, the C5 TreeSet should solve this.
I looked up the performance of IndexOf and [i] in the user guide and they're both listed as O(log n). So that is not supposed to be the issue. It's still probably somewhat faster, but no more than a fixed factor, to call the specific methods for finding the neighbors on the sweepline, i.e. Successor and Predecessor, which are also O(log n).
So
[...]
// Fetch the element that follows nl in the sorted sweepline and test the pair.
try
{
Line2D above = sweepline.Successor(nl);
if (above.intersectsLine(nl))
{
return false;
}
}
// No successor exists: there is nothing above nl to test, so the exception is ignored.
catch (NoSuchItemException ignore) { }
[...]
I don't like that they do not have a method that doesn't throw the exception, since throwing exceptions is very expensive. Your sweep line will be pretty full generally so my best guess is that failure to find one will be rare and calling Successor is the most efficient way. Alternatively, you could keep calling IndexOf like you do now, but check if it equals Count minus one before retrieving [index + 1], and prevent the exception from being thrown at all:
[...]
// Locate nl and test it against the next element, without relying on an
// exception to detect "no next element".
int index = sweepline.IndexOf(nl);
// A negative index means nl was not found; without this guard the check would
// silently compare nl against sweepline[0].
if (index >= 0 && index < sweepline.Count - 1)
{
    Line2D above = sweepline[index + 1];
    if (above.intersectsLine(nl))
    {
        return false;
    }
}
[...]
Chapter two of the manual describes equality and comparison for C5 collections. Here, too, at the very least you must implement IEquatable<T> and IComparable<T>!
One more thought: you report feeding the algorithm 700000 lines. Could you start with timing for example 1000, 2500, 5000, 10000 lines and seeing how the algorithm scales for cases where they do not intersect?
On how to compare the lines on the sweepline:
You need to find some sort of natural ordering for the Line2Ds on the Sweepline TreeSet, since the CompareTo method asks you to compare one Line2D to another. One of the Line2Ds already sits in the Sweepline TreeSet, the other has just been encountered and is being added.
Your sweepline runs from bottom to top, I think:
List<AlgEvent> SortedList = events.OrderBy(o => o.getY()).ToList();
So let's say segment S1 got added to the TreeSet at event 1, and we wish to compare it to S2, which is being added at event 2, right now.
The line segments could possibly intersect at some point, which would change the ordering, but the algorithm will check for this right after inserting them, in the above and below checks. Which would perhaps better be called the left and right checks, come to think of it.
Anyways.. so the easiest would be to compare the bottom endpoints of both line segments. To the left is smaller, to the right is bigger. However, we need to look at the ordering at the position of the sweepline and they may have changed positions since then, like in the picture.
So we need to compare the bottom endpoint of S2 to the red point on S1, and see if it lies to the left or to the right of that point. It lies to the left so S2 is considered smaller than S1.
Usually it's simpler than this: If all of S1 lies to the left of S2's bottom endpoint, S1 is smaller than S2. If all of S1 lies to the right of S2's bottom endpoint, S1 is larger than S2.
I think you're looking for the typesafer version of the interface:
public class Line2D : IComparable<Line2D>
Assuming two properties BottomY, the lowest of the two Y values, and BottomX, the X value of the lowest endpoint, a somewhat tested attempt:
/// <summary>
/// Orders two segments by their left-to-right position on the sweep line.
/// The segment with the higher bottom endpoint is treated as the one being
/// inserted; its bottom endpoint is compared with the point where the other
/// segment crosses the sweep line at that height.
/// </summary>
/// <param name="other">The segment to compare with; null sorts before any instance.</param>
/// <returns>
/// A negative value when this segment lies to the left, a positive value when it
/// lies to the right, and 0 for the same instance, for a bottom endpoint lying
/// exactly on <paramref name="other"/>, or for the horizontal edge case below.
/// </returns>
int IComparable<Line2D>.CompareTo(Line2D other)
{
    if (other == null)
    {
        // IComparable convention: every instance is greater than null.
        return 1;
    }
    if (ReferenceEquals(this, other))
    {
        // A sorted set can only find or remove an item whose self-comparison is 0;
        // the geometric tests below would otherwise answer -1 or 1 here.
        return 0;
    }
    if( BottomY < other.BottomY )
    {
        // The other segment starts higher, so it is the "new" one: swap roles.
        // The cast is required because CompareTo is an explicit interface implementation.
        return -((IComparable<Line2D>)other).CompareTo(this);
    }
    // we're the segment being added to the sweepline
    if( BottomX >= other.X1 && BottomX >= other.X2 )
    {
        return 1;
    }
    if( BottomX <= other.X1 && BottomX <= other.X2 )
    {
        return -1;
    }
    if( other.Y2 == other.Y1 )
    {
        // Scary edge case: horizontal line that we intersect with. Return 0?
        return 0;
    }
    // X coordinate where the other segment crosses the sweep line:
    //   redX = other.X1 + (BottomY - other.Y1) * (other.X2 - other.X1) / (other.Y2 - other.Y1)
    // Comparing BottomX with redX is done without the division by multiplying
    // both sides by (other.Y2 - other.Y1).
    double otherDeltaY = other.Y2 - other.Y1;
    int order = Comparer<Double>.Default.Compare(
        (BottomX - other.X1) * otherDeltaY,
        (BottomY - other.Y1) * (other.X2 - other.X1) );
    // Multiplying an inequality by a negative number flips it, so undo the flip
    // when the other segment is stored with its top endpoint first.
    return otherDeltaY < 0 ? -order : order;
}

[original answer]
I am not a C# user, but this should speed things up a little:
less heap trashing
do not compute the same thing twice
avoid all sub calls if you can (get functions removed)
code:
// Segment/segment intersection test with the shared subexpressions computed once.
// Returns false for segments that are chained end-to-start and when the computed
// parameters fall outside [0,1] (which includes the non-finite results produced
// by a zero denominator).
public bool intersectsLine(const Line2D &comparedLine)
{
    // one segment ends exactly where the other starts: not treated as a crossing
    if (((X2==comparedLine.X1)&&(Y2==comparedLine.Y1))
     || ((X1==comparedLine.X2)&&(Y1==comparedLine.Y2))) return false;
    // direction vectors of both segments
    double thisDx = X2 - X1;
    double thisDy = Y2 - Y1;
    double otherDx = comparedLine.X2 - comparedLine.X1;
    double otherDy = comparedLine.Y2 - comparedLine.Y1;
    // offset between the two start points
    double offsetX = X1-comparedLine.X1;
    double offsetY = Y1-comparedLine.Y1;
    // reciprocal of the common denominator, so only one division is performed
    double invDenominator = 1.0/(-(otherDx*thisDy)+(thisDx*otherDy));
    double s = (-(thisDy*offsetX)+(thisDx*offsetY))*invDenominator;
    double t = ( (otherDx*offsetY)-(otherDy*offsetX))*invDenominator;
    // collision only when both parameters lie on their segments
    return (s>=0)&&(s<=1)&&(t>=0)&&(t<=1);
}
for the rest of your code, add time measurements to find what exactly slow things down. My guess is on List management ... unnecessary reallocations can slow things considerably.
[edit1]
After some research on random line data I concluded this:
if too many lines span the entire area, then no optimization helps
the more short lines there are, the more speedup any optimization gives
brute force is T((N*N-N)/2) which is still O(N*N)
estimate around 35 hours for 700K lines to be processed (unusable)
optimized brute force with area subdivision is T((((N/M)^2)-N)/2) - optimizations ~O((N/M)^2) where
N is max of area lines number
M is number of area divisions per any axis
idea is to check only lines crossing some region (divide dataset area to M*M squares/rectangles). For 700K lines is best subdivision to 16x16 areas. Measured times:
0.540s per 32K lines
1.950s per 64K lines
7.000s per 128K lines
27.514s per 256K lines
estimated run time is 3.7 min per 700K lines (for lines at max ~10% length of whole area). I think this is better than yours 19 minutes.
another speed up is possible with use of multi CPU/core
algorithm is fully parallel-isable and for 4 CPU/Cores 3.7min/4 -> 56s or you can port it to GPU ...
My optimized brute force algorithm with area subdivision O((((N/M)^2)-N)/2) - optimizations
get the used area size (xmin,xmax,ymin,ymax) O(N)
select subdivision M
the best I try for my random datasets 32K-256K lines was M=16
cycle through all subdivision area (evenly divided dataset area)
create list of lines crossing actual subdivision area and check intersection for all lines in that list. If do not want duplicate intersections then discard all intersection outside current area
my code (I am using BDS2006 C++ and my own lists so you need to port it to be compatible with your code)
// Finds the intersection points of all line pairs in view.lin using a zonal
// brute force: the bounding area of the data is cut into M x M zones and only
// lines whose bounding rectangles overlap the same zone are tested pairwise.
// Results are appended to view.pnt (cleared first) and view.redraw is set.
//   M - number of zones per axis
void Twin_GLView2D::main_intersect(int M=16)
{
    int ia,ib,i,j,N;
    double zero=1e-6;                       // tolerance when comparing the two computed points
    glview2D::_lin *l;
    glview2D::_pnt p;
    struct _line
    {
        double bx0,by0,bx1,by1;             // bounding rectangle
        double x0,y0,dx,dy;                 // precomputed params
    } *lin,*a,*b;
    struct _siz
    {
        double bx0,bx1,by0,by1;             // zone bounding rectangle
    } sz,bz;
    List<_line*> zone;
    // load and precompute lines
    N=view.lin.num;
    lin=new _line[N];
    if (lin==NULL) return;
    for (a=lin,l=view.lin.dat,ia=0;ia<N;ia++,a++,l++)
    {
        // line ...
        if (l->p0.p[0]<=l->p1.p[0]) { a->bx0=l->p0.p[0]; a->bx1=l->p1.p[0]; }
        else                        { a->bx0=l->p1.p[0]; a->bx1=l->p0.p[0]; }
        if (l->p0.p[1]<=l->p1.p[1]) { a->by0=l->p0.p[1]; a->by1=l->p1.p[1]; }
        else                        { a->by0=l->p1.p[1]; a->by1=l->p0.p[1]; }
        a->x0=l->p0.p[0]; a->dx=l->p1.p[0]-l->p0.p[0];
        a->y0=l->p0.p[1]; a->dy=l->p1.p[1]-l->p0.p[1];
        // global image size for zone subdivision
        if (!ia)
        {
            sz.bx0=l->p0.p[0];
            sz.by0=l->p0.p[1];
            sz.bx1=sz.bx0;
            sz.by1=sz.by0;
        }
        if (sz.bx0>l->p0.p[0]) sz.bx0=l->p0.p[0];
        if (sz.bx1<l->p0.p[0]) sz.bx1=l->p0.p[0];
        if (sz.by0>l->p0.p[1]) sz.by0=l->p0.p[1];
        if (sz.by1<l->p0.p[1]) sz.by1=l->p0.p[1];
        if (sz.bx0>l->p1.p[0]) sz.bx0=l->p1.p[0];
        if (sz.bx1<l->p1.p[0]) sz.bx1=l->p1.p[0];
        if (sz.by0>l->p1.p[1]) sz.by0=l->p1.p[1];
        if (sz.by1<l->p1.p[1]) sz.by1=l->p1.p[1];
    }
    // process lines by zonal subdivision
    zone.allocate(N);
    view.pnt.num=0; view.pnt.allocate(view.lin.num);
    // from here on sz.bx1/sz.by1 hold the size of one zone, not the maximum
    sz.bx1-=sz.bx0; sz.bx1/=double(M);
    sz.by1-=sz.by0; sz.by1/=double(M);
    for (bz.by0=sz.by0,bz.by1=sz.by0+sz.by1,i=0;i<M;i++,bz.by0+=sz.by1,bz.by1+=sz.by1)
     for (bz.bx0=sz.bx0,bz.bx1=sz.bx0+sz.bx1,j=0;j<M;j++,bz.bx0+=sz.bx1,bz.bx1+=sz.bx1)
    {
        // create list of lines for actual zone only
        zone.num=0;                         // clear zone list
        for (a=lin,ia=0;ia<N;ia++,a++)
         if ((a->bx0<=bz.bx1)&&(a->bx1>=bz.bx0))
          if ((a->by0<=bz.by1)&&(a->by1>=bz.by0))
           zone.add(a);                     // add line to zone list
        // check for intersection within zone only
        // O((((N/M)^2)-N)/2) - optimizations
        // The elements are fetched inside the loop bodies: reading zone.dat[] in
        // the loop headers touched index zone.num, one past the last item.
        for (ia=0;ia<zone.num;ia++)
        {
            a=zone.dat[ia];
            for (ib=ia+1;ib<zone.num;ib++)
            {
                b=zone.dat[ib];
                // discard lines with non intersecting bound rectangles
                if (a->bx1<b->bx0) continue;
                if (a->bx0>b->bx1) continue;
                if (a->by1<b->by0) continue;
                if (a->by0>b->by1) continue;
                // 2D lines a,b intersect ?
                double x0,y0,x1,y1,t0,t1;
                // compute intersection
                // NOTE(review): divide() is a helper defined outside this listing;
                // presumably a division that tolerates a zero divisor - confirm.
                t1=divide(a->dx*(a->y0-b->y0)+a->dy*(b->x0-a->x0),(a->dx*b->dy)-(b->dx*a->dy));
                x1=b->x0+(b->dx*t1);
                y1=b->y0+(b->dy*t1);
                // solve for t0 along the larger component of a
                if (fabs(a->dx)>=fabs(a->dy)) t0=divide(b->x0-a->x0+(b->dx*t1),a->dx);
                else                          t0=divide(b->y0-a->y0+(b->dy*t1),a->dy);
                x0=a->x0+(a->dx*t0);
                y0=a->y0+(a->dy*t0);
                // check if intersection exists
                if (fabs(x1-x0)>zero) continue;
                if (fabs(y1-y0)>zero) continue;
                if ((t0<0.0)||(t0>1.0)) continue;
                if ((t1<0.0)||(t1>1.0)) continue;
                // if yes add point
                p.p[0]=x0;
                p.p[1]=y0;
                // NOTE(review): the accompanying notes describe points as double p[2];
                // confirm that _pnt really has a third component.
                p.p[2]=0.0;
                // do not add points out of zone (removes almost all duplicate points)
                if (x0<bz.bx0) continue;
                if (x0>bz.bx1) continue;
                if (y0<bz.by0) continue;
                if (y0>bz.by1) continue;
                view.pnt.add(p);
            }
        }
    }
    view.redraw=true;
    delete[] lin;                           // allocated with new[], so release with delete[]
}
Notes:
List<T> x; is the same as T x[] with 'unlimited' size
x.num; is actual size of x[] in Ts not Bytes !!! index = <0,x.num-1>
x.add(q); adds q to the list at the end
x.num=0; clears the list
x.allocate(N); allocate space for N items in list to avoid relocations
input List<> is view.lin ... contains points p0,p1 each have double p[2] ... x,y
output List<> is view.pnt ... contains double p[2] ... x,y
[Edit2]
In addition I found out that the best performance of above algorithm is when M=12+(N>>15)
M is subdivision areas count per axis
N is number of lines to check

Related

Scan Line Flood Fill Algorithm in C#

I'm trying to find a faster implementation of a flood fill algorithm for a program I'm making using C# in Unity 2020.
This is my current method, which in my program takes about 400ms to run on a 1000 x 1000 map. Instead of a target colour to replace, I am using a height map (called noiseMap in this code snippet) and all values above a threshold should be considered inside the flooded area.
/// <summary>
/// Rebuilds <c>landMasses</c> and <c>globalSet</c> by seeding a flood fill at
/// every <c>mapGen.scanStride</c>-th grid point of the map.
/// </summary>
/// <exception cref="System.InvalidOperationException">
/// <c>mapGen.scanStride</c> is zero or negative, which would make the scan loop endless.
/// </exception>
public void Flood()
{
    int stride = mapGen.scanStride;
    if (stride <= 0)
    {
        throw new System.InvalidOperationException("mapGen.scanStride must be positive.");
    }

    landMasses.Clear();
    globalSet.Clear();

    float[,] noiseMap = mapGen.noiseMap;
    int mapSize = mapGen.mapSize;
    float threshold = mapGen.threshold;

    // The sample points are unique by construction, so they are visited directly
    // instead of first being collected into a HashSet.
    for (int x = 0; x < mapSize; x += stride)
    {
        for (int y = 0; y < mapSize; y += stride)
        {
            CalculateSets(new Vector2Int(x, y), noiseMap, mapSize, threshold);
        }
    }
}
/// <summary>
/// Point overload: forwards the point's coordinates to <c>Inside(int, int)</c>.
/// </summary>
public bool Inside(Vector2Int point)
{
    int column = point.x;
    int row = point.y;
    return Inside(column, row);
}
/// <summary>
/// Reports whether the cell (x, y) lies on the map and its noise value is
/// strictly greater than <c>mapGen.threshold</c>.
/// </summary>
public bool Inside(int x, int y)
{
    bool onMap = x < mapGen.mapSize && x >= 0 && y < mapGen.mapSize && y >= 0;
    if (!onMap)
    {
        // Cells outside the map are never part of the flooded area.
        return false;
    }

    return mapGen.noiseMap[x, y] > mapGen.threshold;
}
/// <summary>
/// Flood-fills the 4-connected region around <paramref name="sample"/> and
/// registers it in <c>landMasses</c> under the next numeric key; every cell of
/// the region is also added to <c>globalSet</c>.
/// </summary>
/// <param name="sample">Seed cell; must lie on the map.</param>
/// <param name="noiseMap">Height values indexed as [x, y].</param>
/// <param name="mapSize">Width and height of the map in cells.</param>
/// <param name="threshold">A cell belongs to a region when its value is strictly greater than this.</param>
public void CalculateSets(Vector2Int sample, float[,] noiseMap, int mapSize, float threshold)
{
    // Use the same strict "> threshold" rule for the seed as for the fill itself.
    // The previous "< threshold" test let a seed exactly at the threshold through
    // and registered an empty land mass.
    if (globalSet.Contains(sample) || !(noiseMap[sample.x, sample.y] > threshold))
    {
        return;
    }

    HashSet<Vector2Int> set = new HashSet<Vector2Int>();
    Queue<Vector2Int> queue = new Queue<Vector2Int>();

    // Cells are marked when they are enqueued, so each cell enters the queue at
    // most once instead of once per neighbouring cell.
    set.Add(sample);
    queue.Enqueue(sample);

    int[] offsetX = { 0, 0, -1, 1 };
    int[] offsetY = { -1, 1, 0, 0 };

    while (queue.Count > 0)
    {
        Vector2Int n = queue.Dequeue();
        globalSet.Add(n);

        for (int k = 0; k < 4; k++)
        {
            int nx = n.x + offsetX[k];
            int ny = n.y + offsetY[k];

            // The bounds check must come before the array access.
            if (nx < 0 || ny < 0 || nx >= mapSize || ny >= mapSize)
            {
                continue;
            }
            if (!(noiseMap[nx, ny] > threshold))
            {
                continue;
            }

            Vector2Int neighbour = new Vector2Int(nx, ny);
            if (set.Add(neighbour))
            {
                queue.Enqueue(neighbour);
            }
        }
    }

    landMasses.Add(landMasses.Count.ToString(), set);
}
I've looked around at places like Wikipedia and other online forums for an implementation of the scan line flood fill, but every implementation I find has very little documentation to go along with it, or has no definitions of what their variable names represent. Regardless of this, I have tried to decipher these other implementations and have had 0 luck.
For example, on the Floodfill Wikipedia Page, there are a few different methods along with pseudocode to go along with it - but I cannot find definitions for what most of the variables mean in the later methods which are supposedly faster. Perhaps it's simple, but as someone overall new to computing algorithms I am struggling to figure it out.
So at the end of all this, I am essentially just looking for a faster way to implement something like a floodfill algorithm than what I currently have. It doesn't need to exactly fit into my program of course, even just a general C# implementation or more clarified pseudocode example with comments will be a great help.
Thank you for reading!!

Getting a List<int> from an integer which modulo result is equal to 0 without using loop [duplicate]

All numbers that divide evenly into x.
I put in 4 it returns: 4, 2, 1
edit: I know it sounds homeworky. I'm writing a little app to populate some product tables with semi random test data. Two of the properties are ItemMaximum and Item Multiplier. I need to make sure that the multiplier does not create an illogical situation where buying 1 more item would put the order over the maximum allowed. Thus the factors will give a list of valid values for my test data.
edit++:
This is what I went with after all the help from everyone. Thanks again!
edit#: I wrote 3 different versions to see which I liked better and tested them against factoring small numbers and very large numbers. I'll paste the results.
/// <summary>
/// Returns, in ascending order, every integer in 1..n that divides n evenly.
/// Tests each candidate, so the work grows linearly with n.
/// </summary>
static IEnumerable<int> GetFactors2(int n)
{
    IEnumerable<int> candidates = Enumerable.Range(1, n);
    return candidates.Where(candidate => n % candidate == 0);
}
/// <summary>
/// Yields the factors of <paramref name="x"/> in pairs (small factor, then its
/// partner), testing candidates only up to the square root of x.
/// Yields nothing when x is zero or negative.
/// </summary>
private IEnumerable<int> GetFactors3(int x)
{
    // "factor <= x / factor" is the overflow-free form of "factor * factor <= x";
    // the product wraps around for x close to int.MaxValue.
    for (int factor = 1; factor <= x / factor; factor++)
    {
        if (x % factor == 0)
        {
            yield return factor;

            int partner = x / factor;
            // The square root of a perfect square pairs with itself; emit it once.
            if (partner != factor)
            {
                yield return partner;
            }
        }
    }
}
/// <summary>
/// Yields the factors of <paramref name="x"/> in pairs (small factor, then its
/// partner), using a square-root bound computed once up front.
/// </summary>
private IEnumerable<int> GetFactors1(int x)
{
    // Round the root down and include it in the loop. The previous
    // "ceiling, exclusive" bound skipped the root of perfect squares
    // (4 was missing for 16) and produced nothing at all for 1.
    int max = (int)Math.Sqrt(x);
    for (int factor = 1; factor <= max; factor++)
    {
        if (x % factor == 0)
        {
            yield return factor;

            int partner = x / factor;
            // Do not report the square root twice.
            if (partner != factor)
            {
                yield return partner;
            }
        }
    }
}
In ticks.
When factoring the number 20, 5 times each:
GetFactors1-5,445,881
GetFactors2-4,308,234
GetFactors3-2,913,659
When factoring the number 20000, 5 times each:
GetFactors1-5,644,457
GetFactors2-12,117,938
GetFactors3-3,108,182
pseudocode:
Loop from 1 to the square root of the number, call the index "i".
if number mod i is 0, add i and number / i to the list of factors.
realocode:
/// <summary>
/// Collects the factors of <paramref name="number"/> by trial division up to
/// its square root; each hit contributes the divisor and its partner.
/// The list is not sorted.
/// </summary>
public List<int> Factor(int number)
{
    List<int> found = new List<int>();
    int limit = (int)Math.Sqrt(number); // truncated towards zero

    int candidate = 1;
    while (candidate <= limit) // inclusive: the integer square root itself is tested
    {
        if (number % candidate == 0)
        {
            found.Add(candidate);

            // Skip the partner when it equals the divisor (perfect squares).
            if (candidate != number / candidate)
            {
                found.Add(number / candidate);
            }
        }
        ++candidate;
    }

    return found;
}
As Jon Skeet mentioned, you could implement this as an IEnumerable<int> as well - use yield instead of adding to a list. The advantage with List<int> is that it could be sorted before return if required. Then again, you could get a sorted enumerator with a hybrid approach, yielding the first factor and storing the second one in each iteration of the loop, then yielding each value that was stored in reverse order.
You will also want to do something to handle the case where a negative number passed into the function.
The % (remainder) operator is the one to use here. If x % y == 0 then x is divisible by y. (Assuming 0 < y <= x)
I'd personally implement this as a method returning an IEnumerable<int> using an iterator block.
Very late, but the accepted answer (a while back) did not give the correct results.
Thanks to Merlyn, I now understand the reason for using the square as the 'max'; the corrected sample is below, although the answer from Echostorm seems more complete.
/// <summary>
/// Yields the factors of <paramref name="x"/> in pairs (small factor, then its
/// partner), testing candidates only up to the square root of x.
/// Yields nothing for 0.
/// </summary>
public static IEnumerable<uint> GetFactors(uint x)
{
    // "i <= x / i" is the overflow-free form of "i * i <= x"; the product wraps
    // to 0 at i = 65536, which kept the loop running for x close to uint.MaxValue.
    for (uint i = 1; i <= x / i; i++)
    {
        if (x % i == 0)
        {
            yield return i;

            uint partner = x / i;
            // The square root of a perfect square pairs with itself; emit it once.
            if (partner != i)
            {
                yield return partner;
            }
        }
    }
}
As extension methods:
/// <summary>
/// Reports whether <paramref name="potentialFactor"/> divides <paramref name="i"/>
/// without a remainder.
/// </summary>
public static bool Divides(this int potentialFactor, int i)
{
    int remainder = i % potentialFactor;
    return remainder == 0;
}
/// <summary>
/// Returns, in ascending order, every integer in 1..i that divides i.
/// Written for clarity: every candidate up to i is tested.
/// </summary>
public static IEnumerable<int> Factors(this int i)
{
    return Enumerable.Range(1, i)
                     .Where(potentialFactor => potentialFactor.Divides(i));
}
Here's an example of usage:
// Prints each factor of 4 on its own line, using the Factors extension method.
foreach (int i in 4.Factors())
{
Console.WriteLine(i);
}
Note that I have optimized for clarity, not for performance. For large values of i this algorithm can take a long time.
Another LINQ style and tying to keep the O(sqrt(n)) complexity
/// <summary>
/// Yields the factors of <paramref name="n"/> in pairs (small factor, then its
/// partner), testing candidates only up to the square root of n.
/// </summary>
/// <param name="n">The number to factor; must be at least 1.</param>
static IEnumerable<int> GetFactors(int n)
{
    Debug.Assert(n >= 1);

    // Stop at the truncated square root. The previous bound, Round(Sqrt(n) + 1),
    // ran past the root, and with no pair check that produced duplicates
    // (1 -> 1,1 and 4 -> 1,4,2,2).
    int limit = (int)Math.Sqrt(n);
    var pairList = from i in Enumerable.Range(1, limit)
                   where n % i == 0
                   select new { A = i, B = n / i };

    foreach (var pair in pairList)
    {
        yield return pair.A;
        // A perfect square's root pairs with itself; emit it once.
        if (pair.B != pair.A)
        {
            yield return pair.B;
        }
    }
}
Here it is again, only counting to the square root, as others mentioned. I suppose that people are attracted to that idea if you're hoping to improve performance. I'd rather write elegant code first, and optimize for performance later, after testing my software.
Still, for reference, here it is:
/// <summary>
/// True when <paramref name="i"/> is an exact multiple of <paramref name="potentialFactor"/>.
/// </summary>
public static bool Divides(this int potentialFactor, int i)
{
    int leftover = i % potentialFactor;
    return leftover == 0;
}
/// <summary>
/// Yields the factors of <paramref name="i"/> in pairs: each divisor up to the
/// square root, immediately followed by its partner when that differs.
/// The output is therefore not in ascending order.
/// </summary>
public static IEnumerable<int> Factors(this int i)
{
    int upperBound = (int)Math.Sqrt(i);
    IEnumerable<int> smallFactors = Enumerable.Range(1, upperBound)
                                              .Where(potentialFactor => potentialFactor.Divides(i));

    foreach (int small in smallFactors)
    {
        yield return small;

        // The partner equals the divisor only at the root of a perfect square.
        if (i / small != small)
        {
            yield return i / small;
        }
    }
}
Not only is the result considerably less readable, but the factors come out of order this way, too.
I did it the lazy way. I don't know much, but I've been told that simplicity can sometimes imply elegance. This is one possible way to do it:
/// <summary>
/// Yields the divisors of <paramref name="number"/> from largest to smallest:
/// every x in 1..number that divides it is mapped to number / x.
/// </summary>
public static IEnumerable<int> GetDivisors(int number)
{
    var quotients = from x in Enumerable.Range(1, number)
                    where number % x == 0
                    select number / x;

    foreach (int quotient in quotients)
    {
        yield return quotient;
    }
}
EDIT: As Kraang Prime pointed out, this function cannot exceed the limit of an integer and is (admittedly) not the most efficient way to handle this problem.
Wouldn't it also make sense to start at 2 and head towards an upper limit value that's continuously being recalculated based on the number you've just checked? See N/i (where N is the Number you're trying to find the factor of and i is the current number to check...) Ideally, instead of mod, you would use a divide function that returns N/i as well as any remainder it might have. That way you're performing one divide operation to recreate your upper bound as well as the remainder you'll check for even division.
Math.DivRem
http://msdn.microsoft.com/en-us/library/wwc1t3y1.aspx
If you use doubles, the following works: use a for loop iterating from 1 up to the number you want to factor. In each iteration, divide the number to be factored by i. If (number / i) % 1 == 0, then i is a factor, as is the quotient of number / i. Put one or both of these in a list, and you have all of the factors.
And one more solution. Not sure if it has any advantages other than being readable..:
// Returns the prime factorisation of n (with multiplicity, ascending) preceded
// by the trivial factor 1. For n <= 1 the result is just { 1 }.
List<int> GetFactors(int n)
{
    var primes = new List<int>() { 1 }; // adding trivial factor, optional
    int remaining = n;

    for (int divisor = 2; remaining > 1; )
    {
        if (remaining % divisor != 0)
        {
            // Not a divisor of what is left: move on to the next candidate.
            divisor++;
            continue;
        }

        // Keep the same divisor so repeated prime factors are all collected.
        primes.Add(divisor);
        remaining /= divisor;
    }

    // primes.Add(n); // adding trivial factor, optional
    return primes;
}
I came here just looking for a solution to this problem for myself. After examining the previous replies I figured it would be fair to toss out an answer of my own even if I might be a bit late to the party.
Apart from the number itself, the largest factor of a number is at most one half of that number. There is no need to deal with floating-point values or costly operations like a square root. Additionally, finding one factor of a number automatically finds another: just find one, and you can return both by dividing the original number by the one you found.
I doubt I'll need to use checks for my own implementation but I'm including them just for completeness (at least partially).
/// <summary>
/// Yields the positive factors of <paramref name="Num"/> in pairs (small factor,
/// then its partner), each factor exactly once.
/// </summary>
/// <param name="Num">
/// The number to factor. For 0 the result is 0 followed by 1. For a negative
/// number a leading -1 is yielded as a marker, followed by the factors of the
/// absolute value.
/// </param>
public static IEnumerable<int> Factors(int Num)
{
    int ToFactor = Num;
    if (ToFactor == 0)
    {
        // Zero has only itself and one as factors, but this can't be discovered
        // through division.
        yield return 0;
        yield return 1;   // was "return 1;", which is not valid inside an iterator
        yield break;
    }
    if (ToFactor < 0)
    {
        // Negative numbers are flagged by a leading -1 so the caller can tell the
        // series originated from a negative value; only positive factors follow.
        // int.MinValue cannot be negated in an int, so for it only -1 is produced.
        ToFactor = -ToFactor;
        yield return -1;
    }

    // Each small factor brings its partner along, so testing up to the square
    // root covers everything. The previous bound of ToFactor / 2 reported every
    // pair twice and reported nothing for 1. The division form of the bound
    // avoids both overflow and a floating-point square root.
    for (int PossibleFactor = 1; PossibleFactor <= ToFactor / PossibleFactor; PossibleFactor++)
    {
        if (ToFactor % PossibleFactor == 0)
        {
            yield return PossibleFactor;

            int Partner = ToFactor / PossibleFactor;
            if (Partner != PossibleFactor)
            {
                yield return Partner;
            }
        }
    }
}
Program to get prime factors of whole numbers in javascript code.
/**
 * Returns the prime factors of a positive integer, with multiplicity, in
 * ascending order (e.g. 20 -> [2, 2, 5]; 1 -> []). The result is also logged
 * to the console.
 *
 * @param {number} num1 - positive integer to factor
 * @returns {number[]} the prime factors
 * @throws {RangeError} when num1 is not a positive integer; such input made
 *   the division loop run forever.
 */
function getFactors(num1){
    if (typeof num1 !== 'number' || !isFinite(num1) || Math.floor(num1) !== num1 || num1 < 1) {
        throw new RangeError('getFactors expects a positive integer, got ' + num1);
    }
    var factors = [];
    var divider = 2;
    while(num1 != 1){
        if(num1 % divider == 0){
            // Stay on the same divider so repeated prime factors are all collected.
            num1 = num1 / divider;
            factors.push(divider);
        }
        else{
            divider++;
        }
    }
    console.log(factors);
    return factors;
}
getFactors(20);
In fact we don't have to check for factors not to be square root in each iteration from the accepted answer proposed by chris fixed by Jon, which could slow down the method when the integer is large by adding an unnecessary Boolean check and a division. Just keep the max as double (don't cast it to an int) and change to loop to be exclusive not inclusive.
/// <summary>
/// Collects the factors of <paramref name="number"/>. The square root is kept as
/// a double: when it is a whole number it is added once up front, and the loop
/// then stops strictly below it, so no per-iteration duplicate check is needed.
/// The list is not sorted.
/// </summary>
private static List<int> Factor(int number)
{
    List<int> result = new List<int>();
    double root = Math.Sqrt(number); // deliberately not truncated to int

    bool rootIsWhole = root % 1 == 0;
    if (rootIsWhole)
    {
        result.Add((int)root);
    }

    int candidate = 1;
    while (candidate < root) // exclusive: the root itself was handled above
    {
        if (number % candidate == 0)
        {
            result.Add(candidate);
            result.Add(number / candidate);
        }
        ++candidate;
    }

    return result;
}
Usage
// Example calls; the comment under each call lists the factors in the order returned.
Factor(16)
// 4 1 16 2 8
Factor(20)
//1 20 2 10 4 5
And this is the extension version of the method for int type:
/// <summary>
/// Extension methods for <see cref="int"/>.
/// </summary>
public static class IntExtensions
{
    /// <summary>
    /// Yields the factors of <paramref name="value"/>: first 1 and the value
    /// itself, then the square root when the value is a perfect square, then the
    /// remaining factors in pairs. The sequence is not sorted.
    /// </summary>
    /// <param name="value">The number to factor; intended for positive values.</param>
    public static IEnumerable<int> Factors(this int value)
    {
        // Return 2 obvious factors
        yield return 1;
        if (value == 1)
        {
            // For 1 the value and its square root are both the 1 already
            // returned; stopping here avoids yielding it three times.
            yield break;
        }
        yield return value;

        // Return square root if number is perfect square
        var max = Math.Sqrt(value);
        if (max % 1 == 0)
        {
            yield return (int)max;
        }

        // Return rest of the factors; the loop stops strictly below the root
        // because the root was handled above.
        for (int i = 2; i < max; i++)
        {
            if (value % i == 0)
            {
                yield return i;
                yield return value / i;
            }
        }
    }
}
Usage
// Example calls; the comment under each call lists the factors in the order yielded.
16.Factors()
// 1 16 4 2 8
20.Factors()
//1 20 2 10 4 5
Linq solution:
// Returns, in ascending order, every integer in 1..n that divides n.
// n is expected to be at least 1 (checked by a debug assertion only).
IEnumerable<int> GetFactors(int n)
{
    Debug.Assert(n >= 1);
    IEnumerable<int> candidates = Enumerable.Range(1, n);
    return candidates.Where(i => n % i == 0);
}

Calculate all possible permutations/combinations, then check if the result is equal to a value

Best way I can explain it is using an example:
You are visiting a shop with $2000, your goal is to have $0 at the end of your trip.
You do not know how many items are going to be available, nor how much they cost.
Say that there are currently 3 items costing $1000, $750, $500.
(The point is to calculate all possible solutions, not the most efficient one.)
You can spend $2000, this means:
You can buy the $1000 item 0, 1 or 2 times.
You can buy the $750 item 0, 1 or 2 times.
You can buy the $500 item 0, 1, 2, 3 or 4 times.
At the end I need to be able to have all solutions, in this case it will be
2*$1000
1*$1000 and 2*$500
2*$750 and 1*$500
4*$500
Side note: you can't have a duplicate solution (like this)
1*$1000 and 2*$500
2*$500 and 1*$1000
This is what I tried:
You first call this function using
// Read the target and the starting amount from the UI, then start the search
// at the first item with one counter slot per item.
goalmoney = Convert.ToInt32(goalMoneyTextBox.Text); // was "convert", which does not compile
totalmoney = Convert.ToInt32(totalMoneyTextBox.Text);
int[] list = new int[usingListBox.Items.Count];
// NOTE(review): currentmoney is not assigned in this snippet; presumably it is
// meant to equal totalmoney at this point - confirm.
Calculate(0, currentmoney, list);
The function:
/// <summary>
/// Recursively tries every purchase count for the item at <paramref name="level"/>
/// and all following items, incrementing <c>resultsFound</c> each time the
/// remaining money equals <c>goalmoney</c>.
/// </summary>
/// <param name="level">Index into <c>usingListBox.Items</c> of the item being varied.</param>
/// <param name="money">Money left before buying any of this item.</param>
/// <param name="list">
/// Per-item purchase counts of the combination being explored; entry
/// <paramref name="level"/> is overwritten in place. Matches are only counted,
/// not stored.
/// </param>
public void Calculate(int level, int money, int[] list)
{
    string item = usingListBox.Items[level].ToString();
    int cost = ItemDict[item];
    bool hasMoreItems = level < usingListBox.Items.Count - 1;

    for (int i = 0; i <= (totalmoney / cost); i++)
    {
        int tempmoney = money - (cost * i);

        // Once the remaining money is below the goal, buying more of this item
        // only lowers it further, so larger counts cannot reach the goal. This
        // assumes the following items also have positive costs. Stopping here
        // avoids recursing into large subtrees that cannot match.
        if (cost > 0 && tempmoney < goalmoney)
        {
            break;
        }

        // The former "templist" local was only another reference to this same
        // array, so writing to list directly has the same effect.
        list[level] = i;

        if (tempmoney == goalmoney)
        {
            resultsFound++;
        }
        else if (hasMoreItems)
        {
            Calculate(level + 1, tempmoney, list);
        }
    }
}
Your problem can be reduced to a well-known mathematical problem, the Frobenius equation, which is closely related to the well-known Coin problem. Suppose you have N items, where the i-th item costs c[i], and you need to spend exactly S dollars. You then need to find all non-negative integer solutions (or decide that there are none at all) of the equation
c[1]*n[1] + c[2]*n[2] + ... + c[N]*n[N] = S
where all n[i] are unknown variables and each n[i] is the number of bought items of i-th type.
This equation can be solved in a various ways. The following function allSolutions (I suppose it can be additionally simplified) finds all solutions of a given equation:
/**
 * Collects the count vectors n for which total(system, n) equals the requested total.
 *
 * The search works like an odometer over the solution array: the right-most count is
 * bumped until the running sum reaches or passes the target, then the counts from the
 * current position rightwards are reset to zero and the position to its left is
 * bumped instead. The search ends once position 0 itself has been pushed past the
 * target.
 *
 * Note: the right-most count is incremented before the first sum is computed, so the
 * all-zero vector is never tested.
 * NOTE(review): appears to rely on every coefficient being positive to terminate -
 * confirm before passing zero or negative values.
 *
 * @param system coefficients c[i], one per unknown
 * @param total  the target sum S
 * @return a list of copies of every count vector found, in discovery order
 */
public static List<int[]> allSolutions(int[] system, int total) {
ArrayList<int[]> all = new ArrayList<>();
int[] solution = new int[system.length];//initialized by zeros
// pointer marks the position currently being bumped; temp caches the latest sum.
int pointer = system.length - 1, temp;
out:
while (true) {
// Phase 1: raise the count at pointer until the sum reaches or passes the target.
do { //the following loop can be optimized by calculation of remainder
++solution[pointer];
} while ((temp = total(system, solution)) < total);
// An exact hit at position 0 is recorded inside the loop below instead,
// just before the search terminates.
if (temp == total && pointer != 0)
all.add(solution.clone());
// Phase 2: carry - reset positions from pointer rightwards and bump the position
// to the left, repeating while the sum still overshoots the target.
do {
if (pointer == 0) {
if (temp == total) //not lose the last solution!
all.add(solution.clone());
// Position 0 cannot carry any further: the enumeration is complete.
break out;
}
for (int i = pointer; i < system.length; ++i)
solution[i] = 0;
++solution[--pointer];
} while ((temp = total(system, solution)) > total);
// Resume bumping from the right-most position.
pointer = system.length - 1;
// The carry itself may have landed exactly on the target.
if (temp == total)
all.add(solution.clone());
}
return all;
}
/**
 * Computes the weighted sum system[0]*solution[0] + ... over every index of system.
 *
 * @param system   coefficients, one per unknown
 * @param solution counts, read at the same indices as system
 * @return the sum of the element-wise products
 */
public static int total(int[] system, int[] solution) {
    int sum = 0;
    int index = 0;
    while (index < system.length) {
        sum += system[index] * solution[index];
        index++;
    }
    return sum;
}
In the above code, system is the array of coefficients c[i] and total is S. There is one obvious restriction: system must not contain any zero elements (a zero coefficient leads to an infinite number of solutions). A slight modification of the above code removes this restriction.
Assuming you have class Product which exposes a property called Price, this is a way to do it:
/// <summary>
/// Builds product lists whose summed <c>Price</c> does not exceed
/// <paramref name="availableMoney"/>, working through the products from the most
/// expensive to the cheapest.
/// </summary>
/// <remarks>
/// One list is produced per cycle. Within a cycle each layer keeps a cursor into the
/// price-sorted products: an affordable product is added and the search descends
/// one layer; an unaffordable one advances the cursor of the current layer. When a
/// cursor runs off the end of the products the list built so far is stored and a
/// new cycle starts from the next product on the top layer.
/// </remarks>
/// <param name="availableMoney">Upper limit for the summed price of one list.</param>
/// <param name="availableProducts">Products to choose from; a product may be picked repeatedly.</param>
/// <returns>The stored product lists, in the order the cycles completed.</returns>
public List<List<Product>> GetAffordableCombinations(double availableMoney, List<Product> availableProducts)
{
    List<Product> ranked = availableProducts.OrderByDescending(p => p.Price).ToList();

    // cursors[k] is the position within 'ranked' currently examined on layer k.
    List<int> cursors = new List<int> { 0 };
    int depth = 0;

    List<List<Product>> results = new List<List<Product>>();
    List<Product> basket = new List<Product>();

    // Once the top layer has walked past the last product, every cycle is done.
    while (cursors[0] < ranked.Count)
    {
        var candidate = ranked[cursors[depth]];
        var spent = basket.Sum(p => p.Price);

        if ((spent + candidate.Price) <= availableMoney)
        {
            // Affordable: keep it and descend, creating the next layer on first
            // use so that it starts at the product just added.
            basket.Add(candidate);
            depth++;
            if (depth >= cursors.Count)
                cursors.Add(cursors[depth - 1]);
            continue;
        }

        // Too expensive: try the next (cheaper) product on this layer.
        cursors[depth]++;
        if (cursors[depth] < ranked.Count)
            continue;

        // Nothing cheaper is left on this layer, so this cycle is complete:
        // store the basket and restart from the next product on the top layer.
        results.Add(basket);
        basket = new List<Product>();
        cursors[0]++;
        depth = 0;

        // Every deeper layer restarts at the top layer's new position.
        for (int layer = 1; layer < cursors.Count; layer++)
        {
            cursors[layer] = cursors[0];
        }
    }

    return results;
}

Optimisation of route finding code

Small bit of background first. I am developing a system that generates a "route" between locations. Locations have a pre-defined list of neighbours not limited to those adjacent to it. The search can safely assume that by picking the closest neighbour (numerically) to the target destination, it is making the optimal move towards it.
I have working code as shown below:
/// <summary>
/// Builds a route by repeatedly stepping to whichever neighbour of the current
/// location is numerically closest to <paramref name="Destination"/>.
/// </summary>
/// <param name="StartPoint">ID of the location the route starts from.</param>
/// <param name="Destination">ID of the location to reach.</param>
/// <returns>
/// A route whose <c>steps</c> list holds the locations visited after the start, in
/// order. If the walk cannot continue (unknown location, no neighbours, or the walk
/// would return to a location it has already left) "No Route Found" is logged and
/// the steps gathered so far are returned.
/// </returns>
public Route GetRoute(int StartPoint, int Destination)
{
    Route returnRoute = new Route();
    returnRoute.steps = new List<int>();

    // Locations already stepped away from. The choice of next step depends only on
    // the current location, so coming back to one of these would repeat forever.
    var visited = new HashSet<int>();
    int currentPoint = StartPoint; // set the current point to the start point

    while (true)
    {
        visited.Add(currentPoint);

        // Projecting before FirstOrDefault yields null for an unknown location
        // instead of dereferencing a missing entry.
        var neighbours = locations
            .Where(l => l.LocationID == currentPoint)
            .Select(l => l.Neighbours)
            .FirstOrDefault();
        if (neighbours == null)
        {
            Debug.Log ("No Route Found");
            return returnRoute;
        }

        // Track "found" explicitly instead of using 0 as a sentinel, so that a
        // location whose ID is 0 can be selected, and start from int.MaxValue so
        // that no real distance is ever rejected.
        bool neighbourFound = false;
        int selectedNeighbour = 0;
        int distanceFromTarget = int.MaxValue;

        for (int i = 0; i < neighbours.Length; i++)
        {
            // get the current neighbour, then check proximity
            int currentNeighbour = neighbours[i];
            int tempDistance = Math.Abs( currentNeighbour - Destination );

            // if nearer than previous neighbour, set it as the chosen location
            if ( !neighbourFound || tempDistance < distanceFromTarget )
            {
                neighbourFound = true;
                distanceFromTarget = tempDistance;
                selectedNeighbour = currentNeighbour;

                // the destination itself cannot be beaten, so stop scanning
                if ( selectedNeighbour == Destination )
                    break;
            }
        } // for

        if ( !neighbourFound )
        {
            Debug.Log ("No Route Found");
            return returnRoute;
        }

        if ( selectedNeighbour == Destination )
        {
            returnRoute.steps.Add(selectedNeighbour);
            return returnRoute;
        }

        if ( visited.Contains(selectedNeighbour) )
        {
            // The greedy walk is going round in circles.
            Debug.Log ("No Route Found");
            return returnRoute;
        }

        currentPoint = selectedNeighbour;
        returnRoute.steps.Add(selectedNeighbour);
    } // while
}
My question is regarding the loop of the neighbours (int[]) variable. How can this best be optimised? I've seen some use of linq and ordering, but also comments that this approach might be inefficient. I need efficiency over neatness here.
Many thanks.

C# Finding relevant document snippets for search result display

In developing search for a site I am building, I decided to go the cheap and quick way and use Microsoft Sql Server's Full Text Search engine instead of something more robust like Lucene.Net.
One of the features I would like to have, though, is google-esque relevant document snippets. I quickly found determining "relevant" snippets is more difficult than I realized.
I want to choose snippets based on search term density in the found text. So, essentially, I need to find the most search term dense passage in the text. Where a passage is some arbitrary number of characters (say 200 -- but it really doesn't matter).
My first thought is to use .IndexOf() in a loop and build an array of term distances (subtract the index of the found term from the previously found term), then ... what? Add up any two, any three, any four, any five, sequential array elements and use the one with the smallest sum (hence, the smallest distance between search terms).
That seems messy.
Is there an established, better, or more obvious way to do this than what I have come up with?
Although it is implemented in Java, you can see one approach for that problem here:
http://rcrezende.blogspot.com/2010/08/smallest-relevant-text-snippet-for.html
I know this thread is way old, but I gave this a try last week and it was a pain in the back side. This is far from perfect, but this is what I came up with.
The snippet generator:
/// <summary>
/// Stitches together excerpts of <paramref name="StringToSnip"/> centred on the
/// places where any of the <paramref name="Keywords"/> occurs.
/// </summary>
/// <param name="StringToSnip">Text to take the excerpts from.</param>
/// <param name="Keywords">Keywords to locate (current culture, ignoring case).</param>
/// <param name="SnippetLength">Width of each excerpt; half is taken on either side of a hit.</param>
/// <returns>
/// The excerpts separated by " ... " markers. Excerpts stop being appended once the
/// result is longer than 200 characters. Empty when no keyword occurs.
/// </returns>
public static string SelectKeywordSnippets(string StringToSnip, string[] Keywords, int SnippetLength)
{
    int halfWindow = SnippetLength / 2;

    // Gather the position of every keyword occurrence, in ascending order.
    List<int> hits = new List<int>();
    hits.AddRange(Keywords.SelectMany(
        keyword => SharedTools.IndexOfAll(StringToSnip, keyword, StringComparison.CurrentCultureIgnoreCase)));
    hits.Sort();

    // Fold together hits that sit closer than half a window apart, replacing each
    // such pair by its midpoint, and repeat until a full pass changes nothing.
    if (hits.Count > 1)
    {
        bool merged;
        do
        {
            merged = false;
            for (int upper = hits.Count - 1; upper > 0; upper--)
            {
                int lower = upper - 1;
                if (hits[upper] - hits[lower] >= halfWindow)
                    continue;

                hits[lower] = (hits[upper] + hits[lower]) / 2;
                hits.RemoveAt(upper);
                merged = true;
            }
        } while (merged);
    }

    // Lead with an ellipsis when the first excerpt does not begin at the start of the text.
    string result = "";
    if (hits.Count > 0 && hits[0] - halfWindow > 0)
        result = "... ";

    foreach (int centre in hits)
    {
        int from = Math.Max(0, centre - halfWindow);
        int to = Math.Min(centre + halfWindow, StringToSnip.Length);
        int take = Math.Min(to - from, StringToSnip.Length - from);

        result += StringToSnip.Substring(from, take);
        if (to < StringToSnip.Length)
            result += " ... ";
        if (result.Length > 200)
            break;
    }

    return result;
}
The function which will find the index of all keywords in the sample text
/// <summary>
/// Finds the start index of every non-overlapping occurrence of
/// <paramref name="needle"/> in <paramref name="haystack"/>, scanning left to right.
/// </summary>
/// <param name="haystack">Text to search.</param>
/// <param name="needle">Text to look for.</param>
/// <param name="Comparison">Comparison rules passed to <c>string.IndexOf</c>.</param>
/// <returns>
/// The match positions in ascending order; empty when either string is null or
/// empty, or when there is no match.
/// </returns>
private static List<int> IndexOfAll(string haystack, string needle, StringComparison Comparison)
{
    List<int> positions = new List<int>();

    // An empty needle "matches" at every offset without ever advancing the search,
    // which previously looped forever; a null argument crashed.
    if (string.IsNullOrEmpty(haystack) || string.IsNullOrEmpty(needle))
    {
        return positions;
    }

    int length = needle.Length;
    int offset = 0;
    int pos;

    // The offset guard keeps IndexOf's start index legal even when a
    // culture-sensitive match is shorter than the needle itself.
    while (offset <= haystack.Length
        && (pos = haystack.IndexOf(needle, offset, Comparison)) != -1)
    {
        positions.Add(pos);
        // Resume after this match so occurrences never overlap.
        offset = pos + length;
    }

    return positions;
}
It's a bit clumsy in its execution. The way it works is by finding the position of all keywords in the string. Then checking that no keywords are closer to each other than the desired snippet length, so that snippets won't overlap (that's where it's a bit iffy...). And then grabs substrings of the desired length centered around the position of the keywords and stitches the whole thing together.
I know this is years late, but posting just in case it might help somebody coming across this question.
/// <summary>
/// Picks the sentences of a text that are densest in query words and marks the
/// query words inside them with [[HIGHLIGHT]] / [[ENDHIGHLIGHT]] tags.
/// </summary>
public class Highlighter
{
    // One sentence of the source text together with its score and original position.
    private class Packet
    {
        public string Sentence;
        public double Density;
        public int Offset;
    }

    /// <summary>
    /// Builds a highlighted snippet from the sentences of <paramref name="text"/>
    /// that contain the highest proportion of words from <paramref name="query"/>.
    /// </summary>
    /// <param name="text">Source text; sentences are split on '.'.</param>
    /// <param name="query">Space-separated search words, matched case-insensitively against whole space-separated tokens.</param>
    /// <param name="maxLength">Character budget for the selected sentences (highlight tags are not counted).</param>
    /// <returns>
    /// The selected sentences in their original order, joined with '.'; a gap
    /// between non-adjacent sentences shows up as "...". Empty when no sentence
    /// contains a query word.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> or <paramref name="query"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="maxLength"/> is negative.</exception>
    public static string FindSnippet(string text, string query, int maxLength)
    {
        if (text == null)
        {
            throw new ArgumentNullException("text");
        }
        if (query == null)
        {
            throw new ArgumentNullException("query");
        }
        if (maxLength < 0)
        {
            // Derives from ArgumentException, so existing catch blocks still match;
            // the parameter name is now reported as such and not as the message.
            throw new ArgumentOutOfRangeException("maxLength");
        }

        var words = query.Split(' ')
            .Where(w => !string.IsNullOrWhiteSpace(w))
            .Select(word => word.ToLower())
            .ToLookup(s => s);
        var sentences = text.Split('.');

        // The indexed Select overload supplies each sentence's position without
        // depending on a counter mutated from inside a deferred query.
        var packets = sentences
            .Select((sentence, index) => new Packet
            {
                Sentence = sentence,
                Density = ComputeDensity(words, sentence),
                Offset = index
            })
            .OrderByDescending(packet => packet.Density);

        // Keyed by original position so the chosen sentences come back in text order.
        var list = new SortedList<int, string>();
        int length = 0;
        foreach (var packet in packets)
        {
            if (length >= maxLength || packet.Density == 0)
            {
                break;
            }
            string sentence = packet.Sentence;
            // Cut the last sentence down to whatever budget is left.
            list.Add(packet.Offset, sentence.Substring(0, Math.Min(sentence.Length, maxLength - length)));
            length += packet.Sentence.Length;
        }

        var sb = new List<string>();
        int previous = -1;
        foreach (var item in list)
        {
            var offset = item.Key;
            var sentence = item.Value;
            if (previous != -1 && offset - previous != 1)
            {
                // Sentences were skipped: the extra "." entry becomes "..." once joined.
                sb.Add(".");
            }
            previous = offset;
            sb.Add(Highlight(sentence, words));
        }
        return String.Join(".", sb);
    }

    // Wraps each run of consecutive query words in [[HIGHLIGHT]] ... [[ENDHIGHLIGHT]].
    private static string Highlight(string sentence, ILookup<string, string> words)
    {
        var sb = new List<string>();
        // True while outside a highlighted run.
        var ff = true;
        foreach (var word in sentence.Split(' '))
        {
            var token = word.ToLower();
            if (ff && words.Contains(token))
            {
                sb.Add("[[HIGHLIGHT]]");
                ff = !ff;
            }
            // Empty tokens (from repeated spaces) do not end a run.
            if (!ff && !string.IsNullOrWhiteSpace(token) && !words.Contains(token))
            {
                sb.Add("[[ENDHIGHLIGHT]]");
                ff = !ff;
            }
            sb.Add(word);
        }
        if (!ff)
        {
            // The sentence ended inside a run: close it.
            sb.Add("[[ENDHIGHLIGHT]]");
        }
        return String.Join(" ", sb);
    }

    // Fraction of the space-separated tokens in the sentence that are query words.
    private static double ComputeDensity(ILookup<string, string> words, string sentence)
    {
        if (string.IsNullOrEmpty(sentence) || words.Count == 0)
        {
            return 0;
        }

        int numerator = 0;
        int denominator = 0;
        foreach (var word in sentence.Split(' ').Select(w => w.ToLower()))
        {
            if (words.Contains(word))
            {
                numerator++;
            }
            denominator++;
        }

        // Split always yields at least one token for a non-empty sentence, so the
        // denominator cannot be zero here.
        return (double)numerator / denominator;
    }
}
Example:
highlight "Optic flow is defined as the change of structured light in the image, e.g. on the retina or the camera’s sensor, due to a relative motion between the eyeball or camera and the scene. Further definitions from the literature highlight different properties of optic flow" "optic flow"
Output:
[[HIGHLIGHT]] Optic flow [[ENDHIGHLIGHT]] is defined as the change of structured
light in the image, e... Further definitions from the literature highlight diff
erent properties of [[HIGHLIGHT]] optic flow [[ENDHIGHLIGHT]]
Well, here's the hacked together version I made using the algorithm I described above. I don't think it is all that great. It uses three (count em, three!) loops an array and two lists. But, well, it is better than nothing. I also hardcoded the maximum length instead of turning it into a parameter.
/// <summary>
/// Returns an excerpt of <paramref name="infoText"/> positioned at the tightest
/// cluster of search-term occurrences.
/// </summary>
/// <param name="infoText">Text to take the excerpt from.</param>
/// <param name="searchTerms">Terms to locate (case-sensitive, current culture); null or empty terms are ignored.</param>
/// <returns>
/// Up to 250 characters starting 128 characters before the first term of the
/// tightest cluster, or the first 250 characters of the text when no term occurs.
/// An empty string when <paramref name="infoText"/> is null or empty.
/// </returns>
private static string FindRelevantSnippets(string infoText, string[] searchTerms)
{
    const int MaxSnippetLength = 250;
    const int LeadingContext = 128;
    const int TermsPerCluster = 5;

    if (string.IsNullOrEmpty(infoText))
    {
        return string.Empty;
    }

    // Collect the position of every occurrence of every term.
    List<int> termLocations = new List<int>();
    if (searchTerms != null)
    {
        foreach (string term in searchTerms)
        {
            if (string.IsNullOrEmpty(term))
            {
                continue;
            }

            int termStart = infoText.IndexOf(term, StringComparison.CurrentCulture);
            // ">= 0" so that a match at the very start of the text is kept; the
            // length guard keeps the next start index legal.
            while (termStart >= 0 && termStart < infoText.Length)
            {
                termLocations.Add(termStart);
                termStart = infoText.IndexOf(term, termStart + 1, StringComparison.CurrentCulture);
            }
        }
    }

    if (termLocations.Count == 0)
    {
        return infoText.Length > MaxSnippetLength
            ? infoText.Substring(0, MaxSnippetLength)
            : infoText;
    }

    termLocations.Sort();

    // Slide a window of (up to) five consecutive occurrences along the sorted
    // positions and keep the one covering the fewest characters. Only full windows
    // are compared, so a short run at the end of the list is not favoured merely
    // for containing fewer occurrences.
    int clusterSize = Math.Min(TermsPerCluster, termLocations.Count);
    int smallestSpan = int.MaxValue;
    int smallestSpanIndex = 0;
    for (int i = 0; i + clusterSize <= termLocations.Count; i++)
    {
        int span = termLocations[i + clusterSize - 1] - termLocations[i];
        if (span < smallestSpan)
        {
            smallestSpan = span;
            smallestSpanIndex = i;
        }
    }

    // Begin a little before the cluster for context. The length comes from the
    // snippet budget, not from the cluster span, so a single hit no longer
    // produces an empty excerpt.
    int start = Math.Max(termLocations[smallestSpanIndex] - LeadingContext, 0);
    int len = Math.Min(MaxSnippetLength, infoText.Length - start);
    return infoText.Substring(start, len);
}
Some improvements I could think of would be to return multiple "snippets" with a shorter length that add up to the longer length -- this way multiple parts of the document can be sampled.
This is a nice problem :)
I think I'd create an index vector: For each word, create an entry 1 if search term or otherwise 0. Then find the i such that sum(indexvector[i:i+maxlength]) is maximized.
This can actually be done rather efficiently. Start with the number of search terms in the first maxlength words. Then, as you move on, decrease your counter if indexvector[i]=1 (i.e. you're about to lose that search term as you increase i) and increase it if indexvector[i+maxlength+1]=1. As you go, keep track of the i with the highest counter value.
Once you got your favourite i, you can still do finetuning like see if you can reduce the actual size without compromising your counter, e.g. in order to find sentence boundaries or whatever. Or like picking the right i of a number of is with equivalent counter values.
Not sure if this is a better approach than yours - it's a different one.
You might also want to check out this paper on the topic, which comes with yet-another baseline: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.72.4357&rep=rep1&type=pdf
I took another approach, perhaps it will help someone...
First it checks whether the word appears in the text at all; in my case the check ignores case (you can of course change this yourself).
Then I create a list of Regex matches on each separators and search for the first occurrence of the word (allowing partial case insensitive matches).
From that index, I get the 10 matches in front and behind the word, which makes the snippet.
/// <summary>
/// Returns the whitespace-delimited tokens surrounding the first token of
/// <paramref name="text"/> that contains <paramref name="word"/>.
/// </summary>
/// <param name="text">Text to take the snippet from.</param>
/// <param name="word">Word to look for (invariant culture, ignoring case; partial matches count).</param>
/// <returns>
/// Up to 10 tokens before the matching token, the token itself and up to 9 tokens
/// after it, each followed by a space; an empty string when there is no match or
/// an argument is null.
/// </returns>
public static string GetSnippet(string text, string word)
{
    if (text == null || word == null)
    {
        return "";
    }
    if (text.IndexOf(word, StringComparison.InvariantCultureIgnoreCase) == -1)
    {
        return "";
    }

    // Tokenise: each match is a run of non-whitespace plus one optional trailing
    // whitespace character. The static Matches overload reuses the cached regex.
    var matches = Regex.Matches(text, @"\b(\S+)\s?", RegexOptions.Singleline);

    // Locate the first token containing the word.
    var p = -1;
    for (var i = 0; i < matches.Count; i++)
    {
        if (matches[i].Value.IndexOf(word, StringComparison.InvariantCultureIgnoreCase) != -1)
        {
            p = i;
            break;
        }
    }
    if (p == -1) return "";

    // Clamp both ends: the word may sit within 10 tokens of either end of the text.
    var first = Math.Max(p - 10, 0);
    var end = Math.Min(p + 10, matches.Count);

    var snippet = "";
    for (var x = first; x < end; x++)
    {
        snippet += matches[x].Value + " ";
    }
    return snippet;
}
If you use CONTAINSTABLE you will get a RANK back, which is in essence a density value: the higher the RANK value, the higher the density. This way, you just run a query to get the results you want and don't have to resort to massaging the data when it's returned.
Wrote a function to do this just now. You want to pass in:
Inputs:
Document text
This is the full text of the document you're taking a snippet from. Most likely you will want to strip out any BBCode/HTML from this document.
Original query
The string the user entered as their search
Snippet length
Length of the snippet you wish to display.
Return Value:
Start index of the document text to take the snippet from. To get the snippet simply do documentText.Substring(returnValue, snippetLength). This has the advantage that you know whether the snippet is taken from the start, end or middle, so you can add some decoration like ... at the snippet start/end if you wish.
Performance
A resolution set to 1 will find the best snippet but moves the window along 1 char at a time. Set this value higher to speed up execution.
Tweaks
You can work out score however you want. In this example I've done Math.pow(wordLength, 2) to favour longer words.
/// <summary>
/// Finds where to start a snippet of <paramref name="snippetLength"/> characters so
/// that it contains the best-scoring concentration of query words.
/// </summary>
/// <param name="documentText">Full document text. It is trimmed before scoring, so the returned index refers to the trimmed text.</param>
/// <param name="originalQuery">The search string; split on spaces and compared in lower case.</param>
/// <param name="snippetLength">Length of the snippet window, in characters.</param>
/// <returns>
/// Start index of the best window, or 0 when the document is blank, fits entirely
/// within one snippet, or contains no query word. Ties go to the earliest window.
/// </returns>
private static int GetSnippetStartPoint(string documentText, string originalQuery, int snippetLength)
{
    // Step between candidate windows. Higher number less accurate but faster.
    const int resolution = 5;

    if (string.IsNullOrWhiteSpace(documentText)) return 0;

    // Normalise document text
    documentText = documentText.Trim();

    // Return 0 if entire doc fits in snippet
    if (documentText.Length <= snippetLength) return 0;

    // Break query down into distinct lower-case words
    var wordsInQuery = new HashSet<string>();
    if (originalQuery != null)
    {
        foreach (var word in originalQuery.Split(' '))
        {
            var normalisedWord = word.Trim().ToLower();
            if (string.IsNullOrWhiteSpace(normalisedWord)) continue;
            wordsInQuery.Add(normalisedWord);
        }
    }

    // Without query words every window scores 0, so the first one wins.
    if (wordsInQuery.Count == 0) return 0;

    // Last position at which a full-length window still fits.
    var lastWindowStart = documentText.Length - snippetLength;
    double maxScore = 0;
    var maxWindowStart = 0;

    // Slide the window along in steps of 'resolution'.
    for (var windowStart = 0; windowStart < lastWindowStart; windowStart += resolution)
    {
        var score = ScoreSnippetWindow(documentText, windowStart, snippetLength, wordsInQuery);
        if (score > maxScore)
        {
            maxScore = score;
            maxWindowStart = windowStart;
        }
    }

    // Also score the window that ends flush with the document. The stepped scan
    // stops short of it, so query words at the very end were never considered.
    var finalScore = ScoreSnippetWindow(documentText, lastWindowStart, snippetLength, wordsInQuery);
    if (finalScore > maxScore)
    {
        maxWindowStart = lastWindowStart;
    }

    return maxWindowStart;
}

// Scores one window: each space-separated word found in the query adds the square
// of its length, so longer words weigh more. Words cut by the window edges are
// partial and therefore do not match.
private static double ScoreSnippetWindow(string documentText, int windowStart, int snippetLength, HashSet<string> wordsInQuery)
{
    double score = 0;
    var words = documentText.Substring(windowStart, snippetLength).Split(' ');
    foreach (var rawWord in words)
    {
        var word = rawWord.Trim().ToLower();
        if (wordsInQuery.Contains(word))
        {
            // Can simply replace with score += 1 for a simpler model.
            score += Math.Pow(word.Length, 2);
        }
    }
    return score;
}
Lots more you can add to this, for example instead of comparing exact words perhaps you might want to try comparing the SOUNDEX where you weight soundex matches less than exact matches.

Categories