lucene-java-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "Yagnesh Shah" <ys...@hwwilson.com>
Subject HTML pages highlighter
Date Wed, 30 Mar 2005 17:31:31 GMT
Hello Lucene-User,
	Is any one try to do highlighting with HTML pages?

I am trying to do this using demo example by Keld H. Hansen article "Unweaving a Tangled Web
HTMLParser and Lucene" but I am getting "null" value for text at line #47 Any Idea?

      1 package org.apache.lucene.search.highlight;
      2
      3 import java.io.StringReader;
      4
      5 import org.apache.lucene.analysis.Analyzer;
      6 import org.apache.lucene.analysis.TokenStream;
      7 import org.apache.lucene.analysis.standard.StandardAnalyzer;
      8 import org.apache.lucene.queryParser.QueryParser;
      9 import org.apache.lucene.search.Hits;
     10 import org.apache.lucene.search.IndexSearcher;
     11 import org.apache.lucene.search.Query;
     12 import org.apache.lucene.search.highlight.Formatter;
     13 import org.apache.lucene.search.highlight.Highlighter;
     14 import org.apache.lucene.search.highlight.QueryScorer;
     15 import org.apache.lucene.search.highlight.SimpleFragmenter;
     16
     17 public class Searcher {
     18
     19    static Query query;
     20    static Hits hits;
     21
     22    private static final String FIELD_NAME = "contents";
     23    private static final String indexDir = "/opt/dynamo/prod/hww-doc/hww/help/index";
     24
     25    private static Analyzer analyzer = new StandardAnalyzer();
     26
     27    public static void main(String[] args) throws Exception {
     28
     29       IndexSearcher is   = new IndexSearcher(indexDir);
     30       String searchCriteria = "scholarly";
     31       query = QueryParser.parse(searchCriteria, "contents", analyzer);
     32
     33       hits  = is.search(query);
     34       System.out.println("found in: " + query +"\nhits-length:" +hits.length());
     35
     36       doStandardHighlights();
     37
     38       is.close();
     39    }
     40
     41    static void doStandardHighlights() throws Exception {
     42       Highlighter highlighter = new Highlighter(new MyBolder(), new QueryScorer(query));
     43       System.out.println("Highlighter: " + highlighter +"\nhits-length:" +hits.length());
     44       highlighter.setTextFragmenter(new SimpleFragmenter(20));
     45       for (int i = 0; i < hits.length(); i++) {
     46          System.out.println("URL " + (i + 1) + ": " + hits.doc(i).getField("path").stringValue());
     47          String text = hits.doc(i).get("FIELD_NAME");
     48          int maxNumFragmentsRequired = 2;
     49          String fragmentSeparator = "...";
     50          TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
     51
     52          String result =
     53             highlighter.getBestFragments(
     54                tokenStream,
     55                text,
     56                maxNumFragmentsRequired,
     57                fragmentSeparator);
     58          System.out.println("\tfound in: " + result);
     59       }
     60    }
     61
     62    private static class MyBolder implements Formatter {
     63         public String highlightTerm(String originalText , TokenGroup group)
     64         {
     65                 if(group.getTotalScore()<=0)
     66                 {
     67                         return originalText;
     68                 }
     69                 return "<b>" + originalText + "</b>";
     70         }
     71    }
     72
     73 }

Yagnesh N. Shah 
Senior Technology Engineer 
CS Dept., 4th Floor
H. W. Wilson 
950 University Avenue, 
Bronx NY 10452 
(718) 588 8400 x2721 
http://www.hwwilson.com

 

---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org


Mime
View raw message