lucenenet-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Sergey Zaharov <sergozaha...@gmail.com>
Subject WhiteSpaceAnalizer help
Date Tue, 06 Jun 2017 21:03:08 GMT
Hi,

we use lucene v 3.0.3 in our project and it works fine. Now we want to
upgrade to v 4 and we faced with next problem:

according to our business process, we store in lucene index some text and
list of access codes. In lucene access codes list stored as string with
space as separator. For analizing that filed with access codes in lucene v3
we use some custom class like

public class CaseInsensitiveWhitespaceAnalyzer : Analyzer
    {
        /// <summary>
        /// </summary>
        public override TokenStream TokenStream(string fieldName,
TextReader reader)
        {
            TokenStream t = null;
            t = new WhitespaceTokenizer(reader);
            t = new LowerCaseFilter(t);

            return t;
        }
    }

Please do not pay attention to LowerCaseFilter at that moment.

In Lucene v4 exists new class WhitespaceAnalyzer which should do the same
things i hope, but unfortunately it doesnot work.

I have made small example of code. That is in the attachment. With current
code there is no result found. But if i commented each second line of
accesses like

            AddNewItem(new FullTextIndexItem
            {
                ObjectText = "111 222 333 qqq",

                Access = new List<FullTextIndexItemAccessInfo>()
                {
                    new FullTextIndexItemAccessInfo() { Key = 1037, Info =
"PW???"},
                    //new FullTextIndexItemAccessInfo() { Key = 1041, Info
= "P????"}
                }
            });

            AddNewItem(new FullTextIndexItem
            {
                ObjectText = "aaa bbb ccc qqq",
                Access = new List<FullTextIndexItemAccessInfo>()
                {
                    new FullTextIndexItemAccessInfo() { Key = 1037, Info =
"PW???"},
                    //new FullTextIndexItemAccessInfo() { Key = 1042, Info
= "PW??C"}
                }
            });


 then i got a result sucesfully. So i think that the problem somewhere in
WhiteSpaceAnalizer. Otherwise, probably, i use it n wrong way. :)

Could you please point me on my mistake or help solve that issue?

Demo solution code you can find by the link:
https://drive.google.com/open?id=0B4yKxCojuj4fYlE5Qm85cDM0dnc


or below

using System;
using System.Collections.Generic;
using System.Linq;
using Lucene.Net.Analysis.Core;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.QueryParsers.Classic;
using Lucene.Net.Search;
using Lucene.Net.Store;
using Lucene.Net.Util;

namespace Lucene4TestWSA
{
    class Program
    {
        private const string FIELD_BODY = "postBody";
        private const string FIELD_SECURITY = "Security";

        private static IndexWriter _writer;
        private static Directory _directory;
        private static WhitespaceAnalyzer _analyzer;
        private static IndexReader _indexReader;
        private static IndexSearcher _searcher;
        private static IndexWriterConfig _cfg;

        private static void AddNewItem(FullTextIndexItem item)
        {
            if (_writer == null) return;
            var doc = new Document();

           var objectText = (item.ObjectText ?? "");
            doc.Add(new StringField(FIELD_BODY, objectText, Field.Store.NO
));

            var securCodes = (item.Access == null || item.Access.All(x =>
x.Key == 0))
                ? "?"
                : string.Join(" ", item.Access.Where(x => x.Key !=
0).Select(x => x.Key.ToString() + x.Info).ToList());
            doc.Add(new StringField(FIELD_SECURITY, securCodes.ToLower(),
Field.Store.YES));

            _writer.AddDocument(doc);
        }

        static void Main(string[] args)
        {

            var dir = @"c:\TestLuceneDir";
            if (System.IO.Directory.Exists(dir))
            {
                System.IO.Directory.Delete(dir, true);
            }

            var di = System.IO.Directory.CreateDirectory(dir);
            var _directory = FSDirectory.Open(di);
            _analyzer = new WhitespaceAnalyzer(LuceneVersion.LUCENE_48);
            _cfg = new IndexWriterConfig(LuceneVersion.LUCENE_48,
_analyzer);
            var writer = new IndexWriter(_directory, _cfg);
            writer.Commit();
            writer.Dispose();
            _cfg = null;

            _indexReader = DirectoryReader.Open(_directory);
            _searcher = new IndexSearcher(_indexReader);

            var analyzer = new WhitespaceAnalyzer(LuceneVersion.LUCENE_48);
            _cfg = new IndexWriterConfig(LuceneVersion.LUCENE_48, analyzer);
            _writer = new IndexWriter(_directory, _cfg);

            AddNewItem(new FullTextIndexItem
            {
                ObjectText = "111 222 333 qqq",

                Access = new List<FullTextIndexItemAccessInfo>()
                {
                    new FullTextIndexItemAccessInfo() { Key = 1037, Info =
"PW???"},
                    //new FullTextIndexItemAccessInfo() { Key = 1041, Info
= "P????"}
                }
            });

            AddNewItem(new FullTextIndexItem
            {
                ObjectText = "aaa bbb ccc qqq",
                Access = new List<FullTextIndexItemAccessInfo>()
                {
                    new FullTextIndexItemAccessInfo() { Key = 1037, Info =
"PW???"},
                    //new FullTextIndexItemAccessInfo() { Key = 1042, Info
= "PW??C"}
                }
            });

            _writer.Commit();
            _writer.Dispose();
            _writer = null;
            _cfg = null;
            _indexReader = DirectoryReader.Open(_directory);
            _searcher = new IndexSearcher(_indexReader);

            _analyzer = new WhitespaceAnalyzer(LuceneVersion.LUCENE_48);
            var boolQry = new BooleanQuery();

            var parser = new QueryParser(LuceneVersion.LUCENE_48,
FIELD_BODY, _analyzer) { AllowLeadingWildcard = true };
            var textQry = parser.Parse("*qqq*");
            boolQry.Add(textQry, Occur.MUST);
            var an = new WhitespaceAnalyzer(LuceneVersion.LUCENE_48);
            var localParser = new QueryParser(LuceneVersion.LUCENE_48,
FIELD_SECURITY, an);

            var localQry = localParser.Parse("1037p????");

            boolQry.Add(localQry, Occur.MUST);

            var qryRes = _searcher.Search(boolQry, 1000);

            Console.WriteLine($"Result found {qryRes.TotalHits}");
            Console.ReadLine();
        }
    }

    public class FullTextIndexItemAccessInfo
    {
        public int Key { get; set; }
        public string Info { get; set; }
    }

    public class FullTextIndexItem
    {
        public string ObjectText { get; set; }
        public List<FullTextIndexItemAccessInfo> Access { get; set; }
    }
}




Many thank in advance.

-- 
Best regards, Sergey.

Mime
  • Unnamed multipart/alternative (inline, None, 0 bytes)
View raw message