C# Lucene.Net spellchecker - c#

I have a site that serves data to users. I want to use Lucene.Net for my autocomplete, and I want it to return results even when the query contains spelling errors. I see that Lucene.Net has a spellchecker that suggests other words, but it returns only the words, and I need the IDs in order to get more information about each item. Do I have to run another query against the regular index after I get results from the spellchecker, or is there a better way?

You will need to search for it yourself; the spellchecker cannot do it, because spellchecking works on a separate index that is not linked to the main index your suggestions were created from.
It's easy to do, though:
// Resolves spellchecker suggestions back to document ids. The spellchecker is
// given its own RAMDirectory, so each suggested word is looked up in the main
// index with an ordinary TermQuery.
RAMDirectory dir = new RAMDirectory();
using (IndexWriter iw = new IndexWriter(dir, new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30), IndexWriter.MaxFieldLength.UNLIMITED))
{
    // Main index: a single document with an analyzed "text" field and a
    // not-analyzed "id" field.
    Document d = new Document();
    Field textField = new Field("text", "", Field.Store.YES, Field.Index.ANALYZED);
    d.Add(textField);
    Field idField = new Field("id", "", Field.Store.YES, Field.Index.NOT_ANALYZED);
    d.Add(idField);
    textField.SetValue("this is a document with a some words");
    idField.SetValue("42");
    iw.AddDocument(d);
    iw.Commit();

    // The using blocks release searcher, reader and writer (in that order)
    // even if indexing the dictionary or searching throws.
    using (IndexReader reader = iw.GetReader())
    using (IndexSearcher searcher = new IndexSearcher(reader))
    {
        // Feed the terms of the "text" field into the spellchecker.
        SpellChecker.Net.Search.Spell.SpellChecker speller = new SpellChecker.Net.Search.Spell.SpellChecker(new RAMDirectory());
        speller.IndexDictionary(new LuceneDictionary(reader, "text"));

        // Ask for up to 5 alternatives for the misspelled word.
        string[] suggestions = speller.SuggestSimilar("dcument", 5);
        foreach (string suggestion in suggestions)
        {
            // Second query: find the documents that contain the suggested term.
            TopDocs docs = searcher.Search(new TermQuery(new Term("text", suggestion)), null, Int32.MaxValue);
            foreach (var doc in docs.ScoreDocs)
            {
                Console.WriteLine(searcher.Doc(doc.Doc).Get("id"));
            }
        }
    }
}

Related

Lucene.Net Query Building [duplicate]

I need help figuring out which query types to use in given situations.
I think I'm right in saying that if I stored the word "FORD" in a Lucene field and wanted to find an exact match, I would use a TermQuery?
But which query type should I use if I were looking for the word "FORD" where the contents of the field were stored as:
"FORD|HONDA|SUZUKI"
What if i was to search the contents of an entire page, looking for a phrase? such as "please help me"?
If you want to search for FORD in FORD|HONDA|SUZUKI, either index the field with Field.Index.ANALYZED, or store the values as separate fields as shown below so that you can use TermQuery:
// Indexes one test document and queries it two ways: a TermQuery against the
// NOT_ANALYZED "Model" values and a parsed phrase against the ANALYZED "Text" field.
var analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
var fs = FSDirectory.Open("test.index");

//Index a Test Document
using (IndexWriter wr = new IndexWriter(fs, analyzer, true, IndexWriter.MaxFieldLength.LIMITED))
{
    var doc = new Document();
    // Each model is added as its own "Model" field instead of one "FORD|HONDA|SUZUKI" string.
    doc.Add(new Field("Model", "FORD", Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.Add(new Field("Model", "HONDA", Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.Add(new Field("Model", "SUZUKI", Field.Store.YES, Field.Index.NOT_ANALYZED));
    // Verbatim string literal (@"..."): a doubled quote "" stands for one literal quote.
    doc.Add(new Field("Text", @"What if i was to search the contents of an entire page, looking for a phrase? such as ""please help me""?",
        Field.Store.YES, Field.Index.ANALYZED));
    wr.AddDocument(doc);
    wr.Commit();

    // Reader and searcher are released when the block exits, even on an exception.
    using (var reader = wr.GetReader())
    using (var searcher = new IndexSearcher(reader))
    {
        //Use TermQuery for "NOT_ANALYZED" fields
        var result = searcher.Search(new TermQuery(new Term("Model", "FORD")), 100);
        foreach (var item in result.ScoreDocs)
        {
            Console.WriteLine("1)" + reader.Document(item.Doc).GetField("Text").StringValue);
        }

        //Use QueryParser for "ANALYZED" fields
        var qp = new QueryParser(Lucene.Net.Util.Version.LUCENE_30, "Text", analyzer);
        // The parser receives the text "HELP ME" including its surrounding double quotes.
        result = searcher.Search(qp.Parse(@"""HELP ME"""), 100);
        foreach (var item in result.ScoreDocs)
        {
            Console.WriteLine("2)" + reader.Document(item.Doc).GetField("Text").StringValue);
        }
    }
}
TermQuery means you want to search for the term exactly as it is stored in the index, which depends on how you indexed that field (NOT_ANALYZED, or ANALYZED plus the chosen analyzer). Its most common use is with NOT_ANALYZED fields.
You can use TermQuery with ANALYZED fields too, but then you need to know how the analyzer tokenizes your input string. Below is a sample that shows how analyzers tokenize your input:
// Prints every token the chosen analyzer produces for the sample text, each
// wrapped in square brackets. Swap in one of the commented-out analyzers to compare.
// Verbatim string literal (@"..."): a doubled quote "" stands for one literal quote.
var text = @"What if i was to search the contents of an entire page, looking for a phrase? such as ""please help me""?";
var analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
//var analyzer = new WhitespaceAnalyzer();
//var analyzer = new KeywordAnalyzer();
//var analyzer = new SimpleAnalyzer();
var ts = analyzer.TokenStream("", new StringReader(text));
// termAttr is read once per IncrementToken() call to get the current token's text.
var termAttr = ts.GetAttribute<ITermAttribute>();
while (ts.IncrementToken())
{
    Console.Write("[" + termAttr.Term + "] ");
}
I would turn the problem sideways, so I put the multiple values for each field separately in the index -- this should make searching simpler. Looking at Field Having Multiple Values might be helpful.

Lucene.Net search like a071,a072,a073

Environment:
Lucene.Net 3.03
Visual Studio 2010
I've been stuck on this problem for hours at this point and I can't figure out the problem.
I built an index with a field named "Stores", whose values are formatted like this:
a075,a073,a021....
Each string represents the ID of a shop, and the IDs are separated by ",".
I would like to search for "a073" and get back the matching documents whose "Stores" field includes "a073".
Thanks in advance.
static RAMDirectory dir = new RAMDirectory();

/// <summary>
/// Writes ten sample documents into the shared in-memory directory. Each one
/// gets a fresh GUID as PROD_ID and a comma-separated Stores value of the form
/// "a075,a073,no2" followed by the loop counter (1 to 10).
/// </summary>
public void BuildIndex()
{
    var analyzer = new StandardAnalyzer(Version.LUCENE_30);
    var writer = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);

    // One Document and one pair of Field instances are reused for every
    // product; only the field values change between AddDocument calls.
    var productId = new Field("PROD_ID", "", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO);
    var stores = new Field("Stores", "", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO);
    var product = new Document();
    product.Add(productId);
    product.Add(stores);

    for (int n = 1; n <= 10; n++)
    {
        productId.SetValue(Guid.NewGuid().ToString());
        stores.SetValue("a075,a073,no2" + n);
        writer.AddDocument(product);
    }

    writer.Optimize();
    writer.Commit();
    writer.Close();
}
/// <summary>
/// Parses <paramref name="KeyWord"/> as a query against the "Stores" field and
/// writes the PROD_ID and Stores values of every hit to the response.
/// </summary>
/// <param name="KeyWord">Query text handed to the QueryParser.</param>
private void Search(string KeyWord)
{
    // Dispose the searcher when done; the original opened a new one per call
    // and never released it.
    using (IndexSearcher search = new IndexSearcher(dir, true))
    {
        QueryParser qp = new QueryParser(Version.LUCENE_30, "Stores", new StandardAnalyzer(Version.LUCENE_30));
        Query query = qp.Parse(KeyWord);
        var hits = search.Search(query, null, search.MaxDoc).ScoreDocs;
        foreach (var res in hits)
        {
            // Load the stored document once per hit instead of once per field.
            Document hit = search.Doc(res.Doc);
            Response.Write(string.Format("PROD_ID:{0} / Stores{1}"
                , hit.Get("PROD_ID")
                , hit.Get("Stores") + "<BR>"));
        }
    }
}
Try to use Lucene.Net.Search.WildcardQuery and include wildcards.
Google for Lucene regular expression search to find some code to use regular expressions in your query... there is a contrib implementation called Contrib.Regex.RegexTermEnum.
An alternative would be a multivalued field: instead of a comma-separated string, you would pass an array into it. This will be split and indexed by Lucene, and you can query it in the same manner as a normal field. In addition, you can query it multiple times, e.g. multiField:ValueA and multiField:ValueB ...

Lucene.net search by numeric value(as string)

I index documents containing text and numbers. To create the index I use
analyser = new SnowballAnalyzer(Version.LUCENE_30, "English");
I use SnowballAnalyzer because I need morphology (table - tables).
When I search for text in the index, I find the text, but I don't find numeric values. I found one solution - "Lucene - searching for a numeric value field" - but it requires creating a separate field for numeric values. I do not currently need to search a range of numeric values; I want to find a numeric value as a string.
Example - source text: "He was born 1990 years". I need to find this text with the queries "born" and "1990".
You shouldn't have to do anything special.
Here's some code that does what you seem to want to achieve.
// Indexes one sentence with the English SnowballAnalyzer and then runs a query
// that requires both the word "born" and the number "1990" in the "text" field.
RAMDirectory dir = new RAMDirectory();
// A single analyzer instance is shared by the writer and the query parser so
// that indexing and querying are guaranteed to use the same analysis.
SnowballAnalyzer analyzer = new SnowballAnalyzer(Lucene.Net.Util.Version.LUCENE_30, "English");
using (IndexWriter iw = new IndexWriter(dir, analyzer, IndexWriter.MaxFieldLength.UNLIMITED))
{
    Document d = new Document();
    Field f = new Field("text", "", Field.Store.YES, Field.Index.ANALYZED);
    d.Add(f);
    f.SetValue("He was born 1990 years");
    iw.AddDocument(d);
    iw.Commit();

    // The using blocks dispose searcher, reader and writer (in that order)
    // even when parsing or searching throws.
    using (IndexReader reader = iw.GetReader())
    using (IndexSearcher searcher = new IndexSearcher(reader))
    {
        QueryParser qp = new QueryParser(Lucene.Net.Util.Version.LUCENE_30, "text", analyzer);
        Query q = qp.Parse("+born +1990");
        TopDocs td = searcher.Search(q, null, 25);
        foreach (var sd in td.ScoreDocs)
        {
            Console.WriteLine(searcher.Doc(sd.Doc).GetField("text").StringValue);
        }
    }
}

Searching Lucene.Net index for an url field

I want to search a Lucene.net index for a stored url field. My code is given below:
// Store the lower-cased URL in a tokenized "Url" field and add the document to the index.
Field urlField = new Field("Url", url.ToLower(), Field.Store.YES, Field.Index.TOKENIZED);
document.Add(urlField);
indexWriter.AddDocument(document);
I am using the above code for writing into the index.
And the below code to search the Url in the index.
// Open the index located at Host and search the "Url" field for the document's
// URI, passed to the query parser wrapped in double quotes.
Lucene.Net.Store.Directory _directory = FSDirectory.GetDirectory(Host, false);
IndexReader reader = IndexReader.Open(_directory);
IndexSearcher indexSearcher = new IndexSearcher(reader);

KeywordAnalyzer _analyzer = new KeywordAnalyzer();
QueryParser parser = new QueryParser("Url", _analyzer);
string quotedUrl = "\"" + downloadDoc.Uri.ToString() + "\"";
Query query = parser.Parse(quotedUrl);

// Ask for at most 10 hits.
TopDocs hits = indexSearcher.Search(query, null, 10);
if (hits.totalHits > 0)
{
    //statements....
}
But whenever I search for a url for example: http://www.xyz.com/, I am not getting any hits.
I somehow figured out an alternative, but it works only when there is a single document in the index. If there are more documents, the code below does not yield the correct result. Any ideas? Please help.
While writing the index, use KeywordAnalyzer()
// Create the index writer over _directory with a KeywordAnalyzer as its analyzer.
KeywordAnalyzer _analyzer = new KeywordAnalyzer();
indexWriter = new IndexWriter(_directory, _analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
Then while searching also, use KeywordAnalyzer()
// Search the "Url" field with a KeywordAnalyzer-backed parser; the URL is
// passed to the parser wrapped in double quotes and at most one hit is requested.
IndexReader reader = IndexReader.Open(_directory);
IndexSearcher indexSearcher = new IndexSearcher(reader);

KeywordAnalyzer _analyzer = new KeywordAnalyzer();
QueryParser parser = new QueryParser("Url", _analyzer);
string quotedUrl = "\"" + url.ToString() + "\"";
Query query = parser.Parse(quotedUrl);

TopDocs hits = indexSearcher.Search(query, null, 1);
This is because the KeywordAnalyzer "Tokenizes" the entire stream as a
single token.
Please help. Its urgent.
Cheers
Sunil...
This worked for me:
// Look the lower-cased URL up as a single term in the "Url" field.
IndexReader reader = IndexReader.Open(_directory);
IndexSearcher indexSearcher = new IndexSearcher(reader);
TermQuery tq = new TermQuery(new Term("Url", downloadDoc.Uri.ToString().ToLower()));
BooleanQuery bq = new BooleanQuery();
bq.Add(tq, BooleanClause.Occur.SHOULD);
TopScoreDocCollector collector = TopScoreDocCollector.create(10, true);
// The snippet originally stopped after creating the collector, so the query
// was never executed. Run the search and read the collected results.
indexSearcher.Search(bq, collector);
TopDocs hits = collector.TopDocs();
Use StandardAnalyzer while writing into the index.
This answer helped me: Lucene search by URL
Try putting quotes around the query, e.g. like this:
"http://www.google.com/"
Using the whitespace or keyword analyzer should work.
Would anyone actually search for "http://www.Google.com"? Seems more likely that a user would search for "Google" instead.
You can always return the entire URL if there is a partial match. I think the standard analyzer should be more appropriate for searching and retrieving a URL.

Storing relational data in a Lucene.NET index

I'm currently trying to implement a Lucene.NET based search on a large database and I've hit a snag trying to do a search on what is essentially relational data.
At a high level the data I'm trying to search is grouped, each item belongs to 1 to 3 groups. I then need to be able to do a search for all items that are in a combination of groups (EG: Each item belongs to both group A and group B).
Each of these groupings have ID's and Descriptions existing from the data I'm searching, but the descriptions may be sub-strings of one another (EG: One group named "Stuff" and the other "Other stuff"), and I don't want to match the categories that have a sub-string of the one I'm looking for.
I've been considering pulling the data back without this filtering and then filtering the ID's, but I was intending to paginate the data returned from Lucene for performance reasons. I've also considered putting the ID's in space-separated and doing a text-search on the field, but that seems like a total hack...
Does anyone have any idea how to best handle this kind of search in Lucene.NET? (Just to clarify before someone says I'm using the wrong tool, this is only a subset of a larger set of filters which includes full-text searching. If you still think I'm using the wrong tool though I'd love to hear what the right one is)
I've had my share of problems with storing relational data in Lucene, but the one you have should be easy to fix.
I guess you tokenize the group fields, and that makes it possible to search for substrings in the field value. Just add the field untokenized and it should work as expected.
Please check the following small piece of code:
/// <summary>
/// Adds two documents whose untokenized "group" field holds "stuff" and
/// "other stuff", then searches for the exact term "stuff" and prints the
/// "group" value of every hit.
/// </summary>
internal class Program {
    private static void Main(string[] args) {
        var directory = new RAMDirectory();
        var writer = new IndexWriter(directory, new StandardAnalyzer());
        AddDocument(writer, "group", "stuff", Field.Index.UN_TOKENIZED);
        AddDocument(writer, "group", "other stuff", Field.Index.UN_TOKENIZED);
        writer.Close(true);

        var searcher = new IndexSearcher(directory);
        try {
            Hits hits = searcher.Search(new TermQuery(new Term("group", "stuff")));
            for (int i = 0; i < hits.Length(); i++) {
                Console.WriteLine(hits.Doc(i).GetField("group").StringValue());
            }
        } finally {
            // Close the searcher only after the hits have been printed; the
            // original never closed it.
            searcher.Close();
        }
    }

    /// <summary>
    /// Adds a document with a single stored field to the index.
    /// </summary>
    /// <param name="writer">Index writer that receives the document.</param>
    /// <param name="name">Field name.</param>
    /// <param name="value">Field value.</param>
    /// <param name="index">Indexing mode passed to the Field constructor.</param>
    private static void AddDocument(IndexWriter writer, string name, string value, Field.Index index) {
        var document = new Document();
        document.Add(new Field(name, value, Field.Store.YES, index));
        writer.AddDocument(document);
    }
}
The sample adds two untokenized documents to the index, searches for "stuff", and gets one hit. If you changed the code to add them tokenized, you would get two hits, as you see now.
The issue with using Lucene for relational data is that one might expect wildcard and range searches to always work. That is not really the case if the index is big, due to the way Lucene resolves those queries.
Another sample to illustrate the behavior:
// Builds three documents (A: both groups, B: "stuff" only, C: "other stuff"
// only) and runs three queries: each group on its own, then both groups required.
private static void Main(string[] args) {
    var directory = new RAMDirectory();
    var writer = new IndexWriter(directory, new StandardAnalyzer());

    // A belongs to both groups: the "group" field is added twice.
    var docA = new Document();
    docA.Add(new Field("name", "A", Field.Store.YES, Field.Index.UN_TOKENIZED));
    docA.Add(new Field("group", "stuff", Field.Store.YES, Field.Index.UN_TOKENIZED));
    docA.Add(new Field("group", "other stuff", Field.Store.YES, Field.Index.UN_TOKENIZED));
    writer.AddDocument(docA);

    var docB = new Document();
    docB.Add(new Field("name", "B", Field.Store.YES, Field.Index.UN_TOKENIZED));
    docB.Add(new Field("group", "stuff", Field.Store.YES, Field.Index.UN_TOKENIZED));
    writer.AddDocument(docB);

    var docC = new Document();
    docC.Add(new Field("name", "C", Field.Store.YES, Field.Index.UN_TOKENIZED));
    docC.Add(new Field("group", "other stuff", Field.Store.YES, Field.Index.UN_TOKENIZED));
    writer.AddDocument(docC);

    writer.Close(true);

    // Single-group lookups.
    SearchAndDisplay("First sample", directory, new TermQuery(new Term("group", "stuff")));
    SearchAndDisplay("Second sample", directory, new TermQuery(new Term("group", "other stuff")));

    // Both group terms are added as MUST clauses.
    var bothGroups = new BooleanQuery();
    var inStuff = new TermQuery(new Term("group", "stuff"));
    var inOtherStuff = new TermQuery(new Term("group", "other stuff"));
    bothGroups.Add(inStuff, BooleanClause.Occur.MUST);
    bothGroups.Add(inOtherStuff, BooleanClause.Occur.MUST);
    SearchAndDisplay("Third sample", directory, bothGroups);
}
// Runs the query against the directory, prints the title and then the "name"
// value of every hit.
//   title     - heading printed before the results
//   directory - index directory to search
//   query3    - query to execute (name kept from the original signature)
private static void SearchAndDisplay(string title, Directory directory, Query query3) {
    var searcher = new IndexSearcher(directory);
    try {
        Hits hits = searcher.Search(query3);
        Console.WriteLine(title);
        for (int i = 0; i < hits.Length(); i++) {
            Console.WriteLine(hits.Doc(i).GetField("name").StringValue());
        }
    } finally {
        // A new searcher is opened on every call, so close it once the hits
        // have been printed.
        searcher.Close();
    }
}

Categories