lucene-java-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "Thomas X Hoban" <tho...@caseknowledge.com>
Subject Lucene - PDFBox
Date Wed, 25 May 2005 20:58:41 GMT
    

First, I am new to Lucene.

Is there anyone out there who has had trouble getting hits when running phrase queries against
an index that contains content from PDF files?  For PDF documents, I create the document using
LucenePDFDocument.getDocument(file) and then add it to the index.  For non-pdf documents,
I create the document using FileDocument.Document(file).

For instance, I add documents with the following text:

pdf1.pdf -- "Dave has good taste"
pdf2.pdf -- "Tom has good taste"
word1.doc -- "Liz has bad taste"
word2.doc -- "Troy has bad taste"

When I search content for the following strings:

    has good taste
      get expected results with hits on pdf1.pdf, pdf2.pdf, word1.doc and word2.doc

    "has good taste"
       get unexpected result: 0 hits

    "has bad taste"
       get expected results with hits on word1.doc and word2.doc
 
It seems that searching for individual words works fine for both PDF and non-pdf files.  However,
searching on a phrase (enclosed in quotes) works on non-pdf files but not on files parsed
with the LucenePDFDocument class.

Can anyone offer advice?

Below is code for index creation.  It is the demo IndexFiles class provided with Lucene along
with some changes...

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Document;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Date;

//import javax.activation.MimetypesFileTypeMap;

import org.pdfbox.searchengine.lucene.LucenePDFDocument;


class IndexFiles {
  public static void main(String[] args) throws IOException {
    String usage = "java " + IndexFiles.class + " <root_directory>";
    if (args.length == 0) {
      System.err.println("Usage: " + usage);
      System.exit(1);
    }

    Date start = new Date();
    try {
      IndexWriter writer = new IndexWriter("index", new StandardAnalyzer(), true);
      indexDocs(writer, new File(args[0]));

      writer.optimize();
      writer.close();

      Date end = new Date();

      System.out.print(end.getTime() - start.getTime());
      System.out.println(" total milliseconds");

    } catch (IOException e) {
      System.out.println(" caught a " + e.getClass() +
       "\n with message: " + e.getMessage());
    }
  }

  public static void indexDocs(IndexWriter writer, File file)
    throws IOException {
    // do not try to index files that cannot be read

    if (file.canRead()) {
      if (file.isDirectory()) {
        String[] files = file.list();
        // an IO error could occur
        if (files != null) {
          for (int i = 0; i < files.length; i++) {
            indexDocs(writer, new File(file, files[i]));
          }
        }
      } else {
        System.out.println("adding " + file);
        try {

          Document doc = null;
          if (file.getName().indexOf(".pdf") >= 0)
              // writer.addDocument(LucenePDFDocument.getDocument(file));
              doc = LucenePDFDocument.getDocument(file);
          else
              doc = FileDocument.Document(file);

          Field field = null; 
          if (file.getPath().indexOf("case1") >=0)
              field = new Field("caseid", "1", false, true, false);
          else if (file.getPath().indexOf("case2") >=0)
              field = new Field("caseid", "2", false, true, false);
          else if (file.getPath().indexOf("case3") >=0)
              field = new Field("caseid", "3", false, true, false);
          else 
              field = new Field("caseid", "0", false, true, false);

          doc.add(field);

          writer.addDocument(doc);
        }
        // at least on windows, some temporary files raise this exception with an "access
denied" message
        // checking if the file can be read doesn't help
        catch (FileNotFoundException fnfe) {
          ;
        }
      }
    }
  }
}


Here is the SearchFiles class with some minor modifications...

import java.io.IOException;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.util.StringTokenizer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Hits;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.queryParser.ParseException;

/**
 * Interactive search console derived from the Lucene demo.
 *
 * Repeatedly prompts for a query and a list of case ids, runs the query
 * against the "contents" field of the index opened at the path "index", and
 * lists the hits one page at a time. An empty query line or end of input
 * ends the session.
 */
class SearchFiles {

  /** Number of hits listed before the user is asked whether to continue. */
  private static final int HITS_PER_PAGE = 10;

  /**
   * Builds a boolean query with one optional clause per whitespace-separated
   * case id in line, each parsed against the "caseid" field.
   *
   * @param line     whitespace-separated case ids; null yields an empty query
   * @param analyzer analyzer handed to the query parser
   * @return the combined query (empty when line holds no tokens)
   * @throws ParseException if a case id cannot be parsed as a query
   */
  private static Query getCaseQuery(String line, Analyzer analyzer)
  throws ParseException {
      BooleanQuery bq = new BooleanQuery();
      if (line == null) {
          // End of input on the "Cases:" prompt: nothing to restrict by.
          return bq;
      }

      StringTokenizer st = new StringTokenizer(line);
      while (st.hasMoreTokens()) {
          String caseId = st.nextToken();
          System.out.println("build case query for " + caseId);

          // (required = false, prohibited = false): the clause is optional.
          bq.add(QueryParser.parse(caseId, "caseid", analyzer), false, false);
      }

      return bq;
  }

  /**
   * Runs the prompt/search loop until an empty query line or end of input.
   * Any exception is reported on standard output and ends the program.
   *
   * @param args unused
   */
  public static void main(String[] args) {
    try {
      Searcher searcher = new IndexSearcher("index");
      try {
        Analyzer analyzer = new StandardAnalyzer();

        BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
        while (true) {
          System.out.print("Query: ");
          String line = in.readLine();
          // readLine() returns null at end of input; a blank line also ends the session.
          if (line == null || line.trim().length() == 0) {
            break;
          }

          System.out.print("Cases: ");
          String caseLine = in.readLine();
          Query caseQuery = getCaseQuery(caseLine, analyzer);

          Query query = QueryParser.parse(line, "contents", analyzer);
          // PhraseQuery query = new PhraseQuery();
          // query.add(new Term("contents",line));
          System.out.println("Searching for: " + query.toString("contents"));
          /*
          Restricting the search to the selected cases is currently disabled:
          BooleanQuery wholeQuery = new BooleanQuery();
          wholeQuery.add(caseQuery, true, false);
          wholeQuery.add(query,     true, false);
          Hits hits = searcher.search(wholeQuery);
          */
          Hits hits = searcher.search(query);
          System.out.println(hits.length() + " total matching documents");

          printHits(hits, in);
        }
      } finally {
        // Release the searcher on every exit path, including failures.
        searcher.close();
      }

    } catch (Exception e) {
      System.out.println(" caught a " + e.getClass() +
                         "\n with message: " + e.getMessage());
    }
  }

  /**
   * Lists hits in pages of HITS_PER_PAGE, asking between pages whether to go on.
   * Paging stops on an answer starting with 'n', an empty answer, or end of input.
   *
   * @param hits the search result to display
   * @param in   reader used for the "more (y/n) ?" prompt
   * @throws IOException if reading the answer or loading a document fails
   */
  private static void printHits(Hits hits, BufferedReader in) throws IOException {
    for (int start = 0; start < hits.length(); start += HITS_PER_PAGE) {
      int end = Math.min(hits.length(), start + HITS_PER_PAGE);
      for (int i = start; i < end; i++) {
        Document doc = hits.doc(i);
        String path = doc.get("path");
        if (path != null) {
          System.out.println(i + ". " + path);
          continue;
        }

        String url = doc.get("url");
        if (url != null) {
          System.out.println(i + ". " + url);
          System.out.println("   - " + doc.get("title"));
        } else {
          System.out.println(i + ". " + "No path nor URL for this document");
        }
      }

      if (hits.length() > end) {
        System.out.print("more (y/n) ? ");
        String answer = in.readLine();
        if (answer == null || answer.length() == 0 || answer.charAt(0) == 'n') {
          break;
        }
      }
    }
  }
}

Mime
  • Unnamed multipart/alternative (inline, None, 0 bytes)
View raw message