lucene-java-user mailing list archives

From Ankit Murarka <ankit.mura...@rancoretech.com>
Subject Re: Boolean Query when indexing each line as a document.
Date Wed, 14 Aug 2013 14:29:08 GMT
Hello. The problem is as follows:

My documents contain information organised in lines, so I am indexing every
file line by line.
Say a document contains the line

              INSIDE POST OF SERVER\

and the index I create contains the same line

              INSIDE POST OF SERVER\

When I fire a Boolean query on the terms INSIDE and POST, both as MUST
clauses, I get no hit.
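
For reference, the Boolean query I build from two such words looks roughly
like this (Lucene 4.4 API; the complete searching code is further below,
where the field name is "contents"):

    Query query1 = new TermQuery(new Term("contents", "INSIDE"));
    Query query2 = new TermQuery(new Term("contents", "POST"));
    BooleanQuery booleanQuery = new BooleanQuery();
    booleanQuery.add(query1, BooleanClause.Occur.MUST);   // INSIDE must match
    booleanQuery.add(query2, BooleanClause.Occur.MUST);   // POST must match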

Below is the complete code I am using to create the index and to search it;
both classes are adapted from sample code found online.

/* INDEX CODE: */

package org.RunAllQueriesWithLineByLinePhrases;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.LineNumberReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.LongField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class CreateIndex {

  public static void main(String[] args) {
    String indexPath = "D:\\INDEXFORQUERY";  // where the index will be created
    String docsPath = "Indexed";             // where the source files are kept
    boolean create = true;

    final File docDir = new File(docsPath);
    if (!docDir.exists() || !docDir.canRead()) {
      System.exit(1);
    }
    try {
      Directory dir = FSDirectory.open(new File(indexPath));
      Analyzer analyzer = new CustomAnalyzerForCaseSensitive(Version.LUCENE_44);
      IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_44, analyzer);
      if (create) {
        iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
      } else {
        System.out.println("Trying to set IWC mode to UPDATE...NOT DESIRED..");
      }
      IndexWriter writer = new IndexWriter(dir, iwc);
      indexDocs(writer, docDir);
      writer.close();
    } catch (IOException e) {
      System.out.println(" caught a " + e.getClass()
          + "\n with message: " + e.getMessage());
    }
  }

  static void indexDocs(IndexWriter writer, File file) throws IOException {
    if (!file.canRead()) {
      return;
    }
    if (file.isDirectory()) {
      String[] files = file.list();
      if (files != null) {
        for (int i = 0; i < files.length; i++) {
          if (files[i] != null) {
            indexDocs(writer, new File(file, files[i]));
          }
        }
      }
    } else {
      // One Lucene document per file; each line is added as a separate
      // "contents" field instance. Note that StringField is not analyzed,
      // so each line is indexed as a single token.
      Document doc = new Document();
      doc.add(new StringField("path", file.getPath(), Field.Store.YES));
      doc.add(new LongField("modified", file.lastModified(), Field.Store.NO));

      LineNumberReader lnr = new LineNumberReader(new FileReader(file));
      try {
        String line;
        while (null != (line = lnr.readLine())) {
          doc.add(new StringField("contents", line, Field.Store.YES));
        }
      } finally {
        lnr.close();
      }

      if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
        writer.addDocument(doc);
      } else {
        writer.updateDocument(new Term("path", file.getPath()), doc);
      }
    }
  }
}
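
As a side note, here is a small sketch (not part of my application, and
assuming the CustomAnalyzerForCaseSensitive class given at the end of this
mail) of how I would print the tokens the analyzer produces for one sample
line, to compare them with the terms used in the query. It needs
java.io.StringReader, org.apache.lucene.analysis.TokenStream and
org.apache.lucene.analysis.tokenattributes.CharTermAttribute:

    // Sketch only: print the tokens the analyzer emits for a sample line.
    Analyzer analyzer = new CustomAnalyzerForCaseSensitive(Version.LUCENE_44);
    TokenStream ts = analyzer.tokenStream("contents",
        new StringReader("INSIDE POST OF SERVER\\"));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(termAtt.toString());   // one analyzed token per line
    }
    ts.end();
    ts.close();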

/* SEARCHING CODE: */

package org.RunAllQueriesWithLineByLinePhrases;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Date;
import java.util.List;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;

public class SearchFORALLQUERIES {

  public static void main(String[] args) throws Exception {

    // Hard-coded arguments, in the same style as the Lucene demo's SearchFiles.
    String[] argument = new String[20];
    argument[0] = "-index";
    argument[1] = "D:\\INDEXFORQUERY";  // location of the index
    argument[2] = "-field";
    argument[3] = "contents";           // field to search
    argument[4] = "-repeat";
    argument[5] = "2";                  // repeat count (for timing)
    argument[6] = "-raw";
    argument[7] = "-paging";
    argument[8] = "300";                // hits per page

    String index = "index";
    String field = "contents";
    String queries = null;
    int repeat = 0;
    boolean raw = false;
    String queryString = null;
    int hitsPerPage = 10;

    for (int i = 0; i < argument.length; i++) {
      if ("-index".equals(argument[i])) {
        index = argument[i + 1];
        i++;
      } else if ("-field".equals(argument[i])) {
        field = argument[i + 1];
        i++;
      } else if ("-queries".equals(argument[i])) {
        queries = argument[i + 1];
        i++;
      } else if ("-query".equals(argument[i])) {
        queryString = argument[i + 1];
        i++;
      } else if ("-repeat".equals(argument[i])) {
        repeat = Integer.parseInt(argument[i + 1]);
        i++;
      } else if ("-raw".equals(argument[i])) {
        raw = true;   // when true, also print the raw doc id and score for each hit
      } else if ("-paging".equals(argument[i])) {
        hitsPerPage = Integer.parseInt(argument[i + 1]);
        if (hitsPerPage <= 0) {
          System.err.println("There must be at least 1 hit per page.");
          System.exit(1);
        }
        i++;
      }
    }
     System.out.println("processing input");
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new 
File(index)));  //location where indexes are.
    IndexSearcher searcher = new IndexSearcher(reader);
    BufferedReader in = null;
    if (queries != null) {
      in = new BufferedReader(new InputStreamReader(new 
FileInputStream(queries), "UTF-8")); //provide query as input
    } else {
      in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
    }
    while (true) {
      if (queries == null && queryString == null) {
        // interactive mode: prompt the user for a query
        System.out.println("Enter query: ");
      }
      String line = queryString != null ? queryString : in.readLine();

      if (line == null) {
        break;
      }
      line = line.trim();
      if (line.length() == 0) {
        break;
      }

      // Split the input on whitespace and build a BooleanQuery that requires
      // both terms (MUST/MUST) in the search field.
      String[] str = line.split(" ");
      System.out.println("queries are " + str[0] + " and " + str[1]);
      Query query1 = new TermQuery(new Term(field, str[0]));
      Query query2 = new TermQuery(new Term(field, str[1]));
      BooleanQuery booleanQuery = new BooleanQuery();
      booleanQuery.add(query1, BooleanClause.Occur.MUST);
      booleanQuery.add(query2, BooleanClause.Occur.MUST);
      if (repeat > 0) {
        // repeat the search and time it as a crude benchmark (repeat == 2 here)
        Date start = new Date();
        for (int i = 0; i < repeat; i++) {
          searcher.search(booleanQuery, null, 100);
        }
        Date end = new Date();
        System.out.println("Time: " + (end.getTime() - start.getTime()) + "ms");
      }

      doPagingSearch(in, searcher, booleanQuery, hitsPerPage, raw,
          queries == null && queryString == null);

      if (queryString != null) {
        break;
      }
     }
     reader.close();
   }
  public static void doPagingSearch(BufferedReader in, IndexSearcher searcher, Query query,
                                    int hitsPerPage, boolean raw, boolean interactive)
      throws IOException {

    TopDocs results = searcher.search(query, 5 * hitsPerPage);
    ScoreDoc[] hits = results.scoreDocs;
    int numTotalHits = results.totalHits;
    System.out.println(numTotalHits + " total matching documents");

    int start = 0;
    int end = Math.min(numTotalHits, hitsPerPage);
     while (true) {
      if (end > hits.length) {
        System.out.println("Only results 1 - " + hits.length + " of "
            + numTotalHits + " total matching documents collected.");
        System.out.println("Collect more (y/n) ?");
        String line = in.readLine();
        if (line.length() == 0 || line.charAt(0) == 'n') {
          break;
        }
        hits = searcher.search(query, numTotalHits).scoreDocs;
      }
      end = Math.min(hits.length, start + hitsPerPage);

      for (int i = start; i < end; i++) {
        if (raw) {
          System.out.println("doc=" + hits[i].doc + " score=" + hits[i].score);
        }
        Document doc = searcher.doc(hits[i].doc);
        List<IndexableField> fields = doc.getFields();   // stored fields of the hit (not used further here)
        String path = doc.get("path");
        if (path != null) {
          System.out.println((i + 1) + ". " + path);
          String title = doc.get("title");
          if (title != null) {
            System.out.println("   Title: " + title);
          }
        } else {
          System.out.println((i + 1) + ". " + "No path for this document");
        }
      }
       if (!interactive || end == 0) {
         break;
       }
       if (numTotalHits >= end) {
         boolean quit = false;
         while (true) {
           System.out.print("Press ");
           if (start - hitsPerPage >= 0) {
             System.out.print("(p)revious page, ");
           }
           if (start + hitsPerPage < numTotalHits) {
             System.out.print("(n)ext page, ");
           }
           System.out.println("(q)uit or enter number to jump to a page.");
           String line = in.readLine();
           if (line.length() == 0 || line.charAt(0)=='q') {
             quit = true;
             break;
           }
           if (line.charAt(0) == 'p') {
             start = Math.max(0, start - hitsPerPage);
             break;
           } else if (line.charAt(0) == 'n') {
             if (start + hitsPerPage < numTotalHits) {
               start+=hitsPerPage;
             }
             break;
           } else {
             int page = Integer.parseInt(line);
             if ((page - 1) * hitsPerPage < numTotalHits) {
               start = (page - 1) * hitsPerPage;
               break;
             } else {
               System.out.println("No such page");
             }
           }
         }
         if (quit) break;
         end = Math.min(numTotalHits, start + hitsPerPage);
       }
     }
   }
}
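
One thing I also do while debugging (just a sanity check, not shown in the
class above) is print the query right before searching; a BooleanQuery built
as above should print itself as +contents:INSIDE +contents:POST:

    System.out.println("Searching for: " + booleanQuery.toString());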

/* CUSTOM ANALYZER CODE: */

package com.rancore.demo;

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.util.Version;

/**
 * Essentially StandardAnalyzer with the LowerCaseFilter left out,
 * so tokens keep their original case.
 */
public class CustomAnalyzerForCaseSensitive extends StopwordAnalyzerBase {

  public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
  public static final CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;

  private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;

  public CustomAnalyzerForCaseSensitive(Version matchVersion, CharArraySet stopWords) {
    super(matchVersion, stopWords);
  }

  public CustomAnalyzerForCaseSensitive(Version matchVersion) {
    this(matchVersion, STOP_WORDS_SET);
  }

  public CustomAnalyzerForCaseSensitive(Version matchVersion, Reader stopwords) throws IOException {
    this(matchVersion, loadStopwordSet(stopwords, matchVersion));
  }

  public void setMaxTokenLength(int length) {
    maxTokenLength = length;
  }

  /** @see #setMaxTokenLength */
  public int getMaxTokenLength() {
    return maxTokenLength;
  }

  @Override
  protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    final StandardTokenizer src = new StandardTokenizer(matchVersion, reader);
    src.setMaxTokenLength(maxTokenLength);
    TokenStream tok = new StandardFilter(matchVersion, src);
    // tok = new LowerCaseFilter(matchVersion, tok);   // deliberately omitted to stay case-sensitive
    tok = new StopFilter(matchVersion, tok, stopwords);
    return new TokenStreamComponents(src, tok) {
      @Override
      protected void setReader(final Reader reader) throws IOException {
        src.setMaxTokenLength(CustomAnalyzerForCaseSensitive.this.maxTokenLength);
        super.setReader(reader);
      }
    };
  }
}
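
A note on how the analyzer is wired in: it is only passed to the
IndexWriterConfig in CreateIndex above, while on the search side I build the
TermQuery terms by hand, so no analyzer runs over the query words. For
completeness, the indexing side looks like this (same as in CreateIndex,
Lucene 4.4):

    Analyzer analyzer = new CustomAnalyzerForCaseSensitive(Version.LUCENE_44);
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_44, analyzer);
    IndexWriter writer = new IndexWriter(FSDirectory.open(new File("D:\\INDEXFORQUERY")), iwc);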



I hope I have given a complete enough code sample to work with.

Please guide me. If any further information is required, please let me know.


On 8/14/2013 7:43 PM, Ian Lea wrote:
> Well, you have supplied a bit more info - good - but I still can't
> spot the problem.  Unless someone else can I suggest you post a very
> small self-contained program that demonstrates the problem.
>
>
> --
> Ian.
>
>
> On Wed, Aug 14, 2013 at 2:50 PM, Ankit Murarka
> <ankit.murarka@rancoretech.com>  wrote:
>    
>> Hello.
>>          The problem does not seem to be getting solved.
>>
>> As mentioned, I am indexing each line of each file.
>> The sample text present inside LUKE is
>>
>> <am name="notification" value="10"/>\
>> <type="DE">\
>> java.lang.Thread.run(Thread.java:619)
>> >> Size of list  array::0\
>> at java.lang.reflect.Method.invoke(Method.java:597)
>> org.com.dummy,INFO,<<  Still figuring out how to run
>> >> ,SERVER,100.100.100.100:8080,EXCEPTION,10613349
>> INSIDE POST OF Listener\
>>
>> In my Luke, I can see the text as "INSIDE POST OF Listener" .. This is
>> present in many files.
>>
>> The query is: +contents:INSIDE contents:POST    (the field name is
>> "contents"; the same analyzer is being used; this is a Boolean query)
>>
>> To test, I indexed only 20 files. In 19 files, this is present.
>>
>> The boolean query should give me a hit for this document.
>>
>> BUT IT IS RETURNING ME NO HIT..
>>
>> If I index the same files WITHOUT line by line then, it gives me proper
>> hits..
>>
>> But for me it should work on Indexes created by Line by Line parsing also.
>>
>> Please guide.
>>
>>
>>
>>
>>
>> On 8/13/2013 4:41 PM, Ian Lea wrote:
>>      
>>> remedialaction != "remedial action"?
>>>
>>> Show us your query.  Show a small self-contained sample program or
>>> test case that demonstrates the problem.  You need to give us
>>> something more to go on.
>>>
>>>
>>> --
>>> Ian.
>>>
>>>
>>> On Tue, Aug 13, 2013 at 11:13 AM, Ankit Murarka
>>> <ankit.murarka@rancoretech.com>   wrote:
>>>
>>>        
>>>> Hello,
>>>>           I am aware of that link and I have been through that link many
>>>> number of times.
>>>>
>>>> Problem I have is:
>>>>
>>>> 1. Each line is indexed. So indexed line looks something like "<attribute
>>>> name="remedial action" value="Checking"/>\"
>>>> 2. I am easily firing a phrase query on this line. It suggest me the
>>>> possible values. No problem,.
>>>> 3. If I fire a Boolean Query with "remedialaction" and "Checking" as a
>>>> must/must , then it is not providing me this document as a hit.
>>>> 4. I am using StandardAnalyzer both during the indexing and searching
>>>> time.
>>>>
>>>>
>>>> On 8/13/2013 2:31 PM, Ian Lea wrote:
>>>>
>>>>          
>>>>> Should be straightforward enough.  Work through the tips in the FAQ
>>>>> entry at
>>>>>
>>>>> http://wiki.apache.org/lucene-java/LuceneFAQ#Why_am_I_getting_no_hits_.2F_incorrect_hits.3F
>>>>> and post back if that doesn't help, with details of how you are
>>>>> analyzing the data and how you are searching.
>>>>>
>>>>>
>>>>> --
>>>>> Ian.
>>>>>
>>>>>
>>>>> On Tue, Aug 13, 2013 at 8:56 AM, Ankit Murarka
>>>>> <ankit.murarka@rancoretech.com>    wrote:
>>>>>
>>>>>
>>>>>            
>>>>>> Hello All,
>>>>>>                   I have 2 different usecases.
>>>>>> I am trying to provide both boolean query and phrase search query in the
>>>>>> application.
>>>>>>
>>>>>> In every line of the document which I am indexing I have content like:
>>>>>>
>>>>>> <attribute name="remedial action" value="Checking"/>\
>>>>>>
>>>>>> Due to the phrase search requirement, I am indexing each line of the file
>>>>>> as a new document.
>>>>>>
>>>>>> Now when I am trying to do a phrase query (Did you Mean, Infix Analyzer
>>>>>> etc,
>>>>>> or phrase suggest) this seems to work fine and provide me with desired
>>>>>> suggestions.
>>>>>>
>>>>>> Problem is :
>>>>>>
>>>>>> How do I invoke boolean query for this. I mean when I verified the
>>>>>> indexes
>>>>>> in Luke, I saw the whole line as expected is indexed.
>>>>>>
>>>>>> So, if user wish to perform a boolean query say suppose containing
>>>>>> "remedialaction" and "Checking" how do I get this document as a hit. I
>>>>>> believe since I am indexing each line, this seems to be bit tricky.
>>>>>>
>>>>>> Please guide.
>>>>>>
>>>>>> --
>>>>>> Regards
>>>>>>
>>>>>> Ankit
>>>>>>
>>>>
>>>>
>>>>


-- 
Regards

Ankit Murarka

"What lies behind us and what lies before us are tiny matters compared with what lies within
us"

