lucene-java-user mailing list archives

From Ralf Bierig <ralf.bie...@gmail.com>
Subject Payload and Similarity Function: Always same value
Date Thu, 30 Oct 2014 18:03:40 GMT
I want to implement a Lucene indexer/searcher that uses the Payload feature to
add meta information to tokens. Specifically, I add weights (e.g. 0-100) to
conceptual tags in order to override the standard Lucene TF-IDF weighting. I am
puzzled by the behaviour, and I believe something is wrong with the Similarity
class that I overrode, but I cannot figure out what.
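
For illustration, each tag is indexed as text like "red|100.0". As far as I
understand it, the FloatEncoder turns everything after the '|' delimiter into a
four-byte payload that PayloadHelper can decode again at search time, roughly
like this:

     // Round trip of a single weight through the payload encoder
     // (org.apache.lucene.analysis.payloads, Lucene 4.10):
     BytesRef payload = new FloatEncoder().encode("100.0".toCharArray());
     float decoded = PayloadHelper.decodeFloat(payload.bytes, payload.offset); // 100.0f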

I attach the complete code for this example below. When I run a query with it
(e.g. "concept:red"), every payload reported is the first value that was passed
through MyPayloadSimilarity (1.0 in this example) rather than the actual values
1.0, 50.0 and 100.0. As a result, all documents get the same payload and the
same score. Given the data, picture #1 (payload 100.0 for "red") should rank
first, followed by picture #2 and then picture #3, with clearly different
scores. I can't get my head around this...

Here are the results of the run:

Query: concept:red
===>  docid: 0 payload: 1.0
===>  docid: 1 payload: 1.0
===>  docid: 2 payload: 1.0
Number of results:3
-> docid: 3.jpg score: 0.2518424
-> docid: 2.jpg score: 0.2518424
-> docid: 1.jpg score: 0.2518424

What is wrong? Did I misunderstand something about Payloads?
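
One way I can think of to rule out the analysis chain is to dump the token
stream directly. A rough sketch (assuming the standard Lucene 4.10 attribute
API, i.e. CharTermAttribute and PayloadAttribute from
org.apache.lucene.analysis.tokenattributes, run from inside PayloadShowcase
because MyPayloadAnalyzer is an inner class; IOException handling omitted):

     // Print every term of one field value together with its decoded payload.
     Analyzer analyzer = new MyPayloadAnalyzer(new FloatEncoder());
     TokenStream ts = analyzer.tokenStream("concept", "red|100.0");
     CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
     PayloadAttribute payAtt = ts.addAttribute(PayloadAttribute.class);
     ts.reset();
     while (ts.incrementToken()) {
         BytesRef p = payAtt.getPayload();
         float w = (p == null) ? -1f : PayloadHelper.decodeFloat(p.bytes, p.offset);
         System.out.println(termAtt.toString() + " -> " + w);
     }
     ts.end();
     ts.close();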

---Start Code---

import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
import org.apache.lucene.analysis.payloads.FloatEncoder;
import org.apache.lucene.analysis.payloads.PayloadEncoder;
import org.apache.lucene.analysis.payloads.PayloadHelper;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.payloads.AveragePayloadFunction;
import org.apache.lucene.search.payloads.PayloadTermQuery;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;

public class PayloadShowcase {

  public static void main(String s[]) {
      PayloadShowcase p = new PayloadShowcase();
      p.run();
  }

  public void run() {
      // Step 1: indexing
      MyPayloadIndexer indexer = new MyPayloadIndexer();
      indexer.index();
      // Step 2: searching
      MyPayloadSearcher searcher = new MyPayloadSearcher();
      searcher.search("red");
  }

public class MyPayloadAnalyzer extends Analyzer {

     private PayloadEncoder encoder;
     MyPayloadAnalyzer(PayloadEncoder encoder) {
         this.encoder = encoder;
     }

     @Override
     protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
         Tokenizer source = new WhitespaceTokenizer(reader);
         TokenStream filter = new LowerCaseFilter(source);
         filter = new DelimitedPayloadTokenFilter(filter, '|', encoder);
         return new TokenStreamComponents(source, filter);
     }
}

public class MyPayloadIndexer {

     public MyPayloadIndexer() {}

     public void index() {
         try {
             Directory dir = FSDirectory.open(new File("D:/data/indices/sandbox"));
             Analyzer analyzer = new MyPayloadAnalyzer(new FloatEncoder());
             IndexWriterConfig iwconfig = new IndexWriterConfig(Version.LUCENE_4_10_1, analyzer);
             iwconfig.setSimilarity(new MyPayloadSimilarity());
             iwconfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE);

             // load mappings and classifiers
             HashMap<String, String> mappings = this.loadDataMappings();
             HashMap<String, HashMap> cMaps = this.loadData();

             IndexWriter writer = new IndexWriter(dir, iwconfig);
             indexDocuments(writer, mappings, cMaps);
             writer.close();

         } catch (IOException e) {
             System.out.println("Exception while indexing: " + 
e.getMessage());
         }
     }

     private void indexDocuments(IndexWriter writer, HashMap<String, String> fileMappings,
                                 HashMap<String, HashMap> concepts) throws IOException {

         Set fileSet = fileMappings.keySet();
         Iterator<String> iterator = fileSet.iterator();
         while (iterator.hasNext()){
             // unique file information
             String fileID = iterator.next();
             String filePath = fileMappings.get(fileID);
             // create a new, empty document
             Document doc = new Document();
             // path of the indexed file
             Field pathField = new StringField("path", filePath, Field.Store.YES);
             doc.add(pathField);
             // lookup all concept probabilities for this fileID
             Iterator<String> conceptIterator = 
concepts.keySet().iterator();
             while (conceptIterator.hasNext()){
                 String conceptName = conceptIterator.next();
                 HashMap conceptMap = concepts.get(conceptName);
                 doc.add(new TextField("concept",
                         ("" + conceptName + "|").trim() + (conceptMap.get(fileID) + "").trim(),
                         Field.Store.YES));
             }
             writer.addDocument(doc);
         }
     }

     public HashMap<String, String> loadDataMappings(){
         HashMap<String, String> h = new HashMap<>();
         h.put("1", "1.jpg");
         h.put("2", "2.jpg");
         h.put("3", "3.jpg");
         return h;
     }

     public HashMap<String, HashMap> loadData(){
         HashMap<String, HashMap> h = new HashMap<>();
         HashMap<String, String> green = new HashMap<>();
         green.put("1", "50.0");
         green.put("2", "1.0");
         green.put("3", "100.0");
         HashMap<String, String> red = new HashMap<>();
         red.put("1", "100.0");
         red.put("2", "50.0");
         red.put("3", "1.0");
         HashMap<String, String> blue = new HashMap<>();
         blue.put("1", "1.0");
         blue.put("2", "50.0");
         blue.put("3", "100.0");
         h.put("green", green);
         h.put("red", red);
         h.put("blue", blue);
         return h;
     }
}

class MyPayloadSimilarity extends DefaultSimilarity {

     @Override
     public float scorePayload(int docID, int start, int end, BytesRef payload) {
         float pload = 1.0f;
         if (payload != null) {
             pload = PayloadHelper.decodeFloat(payload.bytes, payload.offset);
         }
         System.out.println("===>  docid: " + docID + " payload: " + pload);
         return pload;
     }
}

public class MyPayloadSearcher {

     public MyPayloadSearcher() {}

     public void search(String queryString) {
         try {
             IndexReader reader = DirectoryReader.open(
                     FSDirectory.open(new File("D:/data/indices/sandbox")));
             IndexSearcher searcher = new IndexSearcher(reader);
             searcher.setSimilarity(new MyPayloadSimilarity());
             PayloadTermQuery query = new PayloadTermQuery(new Term("concept", queryString),
                     new AveragePayloadFunction());
             System.out.println("Query: " + query.toString());
             TopDocs topDocs = searcher.search(query, 999);
             ScoreDoc[] hits = topDocs.scoreDocs;
             System.out.println("Number of results:" + hits.length);

             // output
             for (int i = 0; i < hits.length; i++) {
                 Document doc = searcher.doc(hits[i].doc);
                 System.out.println("-> docid: " + doc.get("path") + " 
score: " + hits[i].score);
             }
             reader.close();

         } catch (Exception e) {
             System.out.println("Exception while searching: " + 
e.getMessage());
         }
     }
} // end MyPayloadSearcher

} // end PayloadShowcase

---End Code---
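
One variation I am also considering, if I read the Lucene 4.10 javadocs
correctly, is to score on the payloads alone by using the three-argument
PayloadTermQuery constructor, which can disable the span (TF-IDF) part of the
score:

     // Hypothetical variation of MyPayloadSearcher.search(): only the value
     // produced by AveragePayloadFunction contributes to the score.
     PayloadTermQuery payloadOnly = new PayloadTermQuery(
             new Term("concept", queryString),
             new AveragePayloadFunction(),
             false); // includeSpanScore = false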

Any ideas? I am very grateful for any help...

---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org

