lucene-java-user mailing list archives

From Grant Ingersoll <gsing...@apache.org>
Subject Re: Boosting query - debugging
Date Thu, 07 May 2009 03:51:41 GMT
Hi Liat,

Can you post the code you are using to generate the info below?

-Grant

On May 3, 2009, at 11:43 PM, liat oren wrote:

> I looked into the output again and saw that the explain method explains a
> different result than the document I thought it did.
>
> Within the loop over the results, I replaced
> int docId = hits[j].doc;
> Document curDoc = searcher.doc(docId);
> with
> Document curDoc = searcher.doc(j);
> That way I got the right explanation for the document.
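>
> For reference, a minimal sketch (untested) of the other way to keep the two in sync: explain() takes the internal doc id, i.e. the same value that searcher.doc() takes, so the original docId could be passed to both:
>
> for (int j = 0; j < Math.min(hits.length, 10); j++)
> {
>  int docId = hits[j].doc;                       // internal doc id of this hit
>  Document curDoc = searcher.doc(docId);         // stored fields of that doc
>  Explanation exp = searcher.explain(bq, docId); // explanation for the same doc, not for the hit rank j
>  System.out.println(curDoc.get("word") + ", score: " + hits[j].score);
>  System.out.println(exp.toString());
> }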
>
> The strange things I got are:
> 1. The explanation is much shorter, as you can see below.
> 2. The score of finlin (1.6479614) is different from the one in the explanation (0.3433253).
> 3. I think it is because of the fieldNorm. Why is it different from the one of TTD? (See the note after the two outputs below.)
>
> finlin, score: 1.6479614
> 0.3433253 = (MATCH) fieldWeight(worlds:666666 in 0), product of:
>  0.70710677 = (MATCH) btq, product of:
>    0.70710677 = tf(phraseFreq=0.5)
>    1.0 = scorePayload(...)
>  0.7768564 = idf(worlds: 666666=4)
>  0.625 = fieldNorm(field=worlds, doc=0)
>
> TTD, score: 1.6479614
> 1.6479613 = (MATCH) fieldWeight(worlds:666666 in 1), product of:
>  2.1213202 = (MATCH) btq, product of:
>    0.70710677 = tf(phraseFreq=0.5)
>    3.0 = scorePayload(...)
>  0.7768564 = idf(worlds: 666666=4)
>  1.0 = fieldNorm(field=worlds, doc=1)
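>
> (A note on item 3, in case it matters: if WordsSimilarity keeps the default lengthNorm of 1/sqrt(number of terms in the field), and since norms are stored in a single byte and lose precision, the two values would be consistent with the field lengths:
>
>   TTD:    worlds = "666666"          ->  1/sqrt(1) = 1.0     ->  fieldNorm 1.0
>   finlin: worlds = "666666 222222"   ->  1/sqrt(2) ~ 0.707   ->  stored/decoded as fieldNorm 0.625
>
> so the difference could simply be that finlin's "worlds" field holds two terms while TTD's holds one.)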
>
> Thanks again,
> Liat
>
>
>
> 2009/5/3 liat oren <oren.liat@gmail.com>
>
>> Hi,
>>
>> I am trying to debug a boosting query.
>> Is there a way to see the term boosts in the documents? I can see them in the
>> spans in BoostingTermQuery, yet from there I can't tell which document I am in.
>> If I want to copy some of the documents into an index that keeps the boosting
>> - how can that be done?
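>>
>> One way to see them per document might be to walk the postings directly - TermPositions exposes both the doc id and the payload bytes - though I have not tried this; just a sketch:
>>
>> IndexReader reader = IndexReader.open("wordIndexTry");
>> TermPositions tp = reader.termPositions(new Term("worlds", "666666"));
>> while(tp.next())
>> {
>>  int docId = tp.doc();     // the document this posting belongs to
>>  tp.nextPosition();
>>  if(tp.isPayloadAvailable())
>>  {
>>   byte[] payload = tp.getPayload(new byte[tp.getPayloadLength()], 0);
>>   System.out.println("doc " + docId + ": payload byte = " + payload[0]);
>>  }
>> }
>> tp.close();
>> reader.close();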
>>
>> The problem I am facing is that I get unexpected results. If for word "a"
>> I have the worlds "1111" (boosting 3) and "2222", and for word "b" I have the
>> world "1111", then when I search for "1111" (boosting 5), word "a" gets
>> better results.
>>
>> When I debugged it, I saw that the boosting is always three, but since
>> the index contains a lot of documents, I tried to do the same on a smaller
>> index.
>>
>> I put in only two words, as you can see in the code below (I included all the
>> methods and classes needed to run this code).
>>
>> The problem I saw here is the scorePayload in the Explanation output - it takes
>> a different value from the one I indexed.
>> You can see in the output below: for TTD, 1.0 = scorePayload(...),
>> and for finlin, 3.0 = scorePayload(...),
>> while the boosting I used was the opposite - for TTD I used 3, and for
>> finlin I used 1.
>>
>> The scorePayload should be the factor I put when I indexed, right?
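>>
>> My understanding - please correct me if I am wrong - is that the number printed as scorePayload(...) is whatever the Similarity returns for the payload bytes of that position, so the indexed factor only comes through if the Similarity decodes the byte. A minimal sketch of such an override (the class name is just illustrative, and I am assuming the Lucene 2.4 signature; later versions change it):
>>
>> public class PayloadByteSimilarity extends DefaultSimilarity
>> {
>>  // called by BoostingTermQuery for each matching position that carries a payload
>>  public float scorePayload(String fieldName, byte[] payload, int offset, int length)
>>  {
>>   if(payload == null || length == 0)
>>   {
>>    return 1.0f; // no payload stored: neutral factor
>>   }
>>   return (float) payload[offset]; // the single byte written by PayloadTokenStream
>>  }
>> }
>>
>> With an override like that, the scorePayload value in the explain should be exactly the byte that was indexed for that document's position.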
>>
>> Thanks a lot,
>> Liat
>>
>> TTD, score: 1.2611988
>>
>> 0.26274973 = (MATCH) weight(worlds:666666 in 0), product of:
>>  0.99999994 = queryWeight(worlds:666666), product of:
>>    0.5945349 = idf(worlds: 666666=2)
>>    1.681987 = queryNorm
>>  0.26274976 = (MATCH) fieldWeight(worlds:666666 in 0), product of:
>>    0.70710677 = (MATCH) btq, product of:
>>      0.70710677 = tf(phraseFreq=0.5)
>>      1.0 = scorePayload(...)
>>    0.5945349 = idf(worlds: 666666=2)
>>    0.625 = fieldNorm(field=worlds, doc=0)
>> ********************************************************
>> finlin, score: 0.26274976
>>
>> 1.2611988 = (MATCH) weight(worlds:666666 in 1), product of:
>>  0.99999994 = queryWeight(worlds:666666), product of:
>>    0.5945349 = idf(worlds: 666666=2)
>>    1.681987 = queryNorm
>>  1.2611989 = (MATCH) fieldWeight(worlds:666666 in 1), product of:
>>    2.1213202 = (MATCH) btq, product of:
>>      0.70710677 = tf(phraseFreq=0.5)
>>      3.0 = scorePayload(...)
>>    0.5945349 = idf(worlds: 666666=2)
>>    1.0 = fieldNorm(field=worlds, doc=1)
>>
>> *The code*
>> // imports assumed for this snippet; FileUtil, Constants, MapUtil and WordsSimilarity are my own classes (not included here)
>> import java.io.*;
>> import java.util.*;
>> import org.apache.lucene.analysis.*;
>> import org.apache.lucene.document.*;
>> import org.apache.lucene.index.*;
>> import org.apache.lucene.search.*;
>> import org.apache.lucene.search.payloads.BoostingTermQuery;
>>
>> public class Test
>> {
>> public Test()
>> {
>> }
>> public static void main(String[] args) throws IOException, Exception
>> {
>>  Test st = new Test();
>>  st.index(); //
>>  st.testRealIndex();
>> }
>> public void index() throws IOException
>> {
>>  DoubleMap wordMap = new DoubleMap();
>>  wordMap.insert("TTD", 666666, 3);
>>  wordMap.insert("finlin", 666666, 1);
>>  wordMap.insert("finlin", 222222, 2);
>>  index(wordMap, "wordIndexTry", "", "0");
>> }
>> public synchronized void index(DoubleMap doubleMap, String dirPath,
>>   String originalPath, String includeFreq) throws IOException
>> {
>>  File f = new File(dirPath);
>>  IndexWriter writer = null;
>>  PayloadAnalyzer panalyzer = new PayloadAnalyzer();
>>  if(f.exists())
>>  {
>>   writer = new IndexWriter(dirPath, panalyzer, false);
>>  }
>>  else
>>  {
>>   writer = new IndexWriter(dirPath, panalyzer, true);
>>  }
>>  Iterator it = doubleMap.getMap().entrySet().iterator();
>>  int count = 0;
>>  int size = doubleMap.getMap().size();
>>  while(it.hasNext())
>>  {
>>   count++;
>>   Map.Entry entry = (Map.Entry) it.next();
>>   String word = entry.getKey().toString();
>>   Word w = new Word();
>>   w.word = word;
>>   Date date = new Date();
>>   System.out.println(date.toString() + " : Updating word " + word + " ( "
>>     + count + " out of " + size + ") " + " FROM " + originalPath);
>>   Map<Long, Double> innerMap = (Map<Long, Double>) entry.getValue();
>>   Map<String, Integer> scoresMap = processMap(writer, panalyzer, innerMap,
>>     entry, w, dirPath, includeFreq);
>>   index(writer, panalyzer, innerMap, scoresMap, w, dirPath, includeFreq);
>>  }
>>  System.out.println("Optimizing " + dirPath + " ...");
>>  writer.optimize();
>>  writer.close();
>> }
>> public synchronized Map<String, Integer> processMap(IndexWriter  
>> writer,
>> PayloadAnalyzer panalyzer, Map<Long, Double> innerMap, Map.Entry  
>> entry, Word
>> w, String dirPath, String includeFreq) throws IOException
>> {
>>  Map<String, Integer> scoresMap = new HashMap<String, Integer>();
>>  Iterator worldsIter = innerMap.entrySet().iterator();
>>  String worlds = "";
>>  synchronized(worldsIter)
>>  {
>>   while(worldsIter.hasNext())
>>   {
>>    Map.Entry worldsEntry = (Map.Entry) worldsIter.next();
>>    String world = worldsEntry.getKey().toString();
>>    int freq = (int) Double.parseDouble(worldsEntry.getValue().toString());
>>    scoresMap.put(world, freq);
>>    worlds += world + " ";
>>    FileUtil.writeToFile("Output\\WordWorldsFreq.txt", w.word +
>>      Constants.TAB_SEP + world + Constants.TAB_SEP + freq);
>>   }
>>  }
>>  panalyzer.setMapScores(scoresMap); //MapUtil.copyStringIntMap(scoresMap));
>>  return scoresMap;
>> }
>> public synchronized void index(IndexWriter writer, PayloadAnalyzer panalyzer,
>>   Map<Long, Double> innerMap, Map<String, Integer> scoresMap,
>>   Word w, String dirPath, String includeFreq) throws IOException
>> {
>>  System.out.println("indexing");
>>  w.worldsMap = innerMap;
>>  WordIndex wi = new WordIndex(w);
>>  wi.createDocument(includeFreq);
>>  writer.addDocument(wi.getDocument());
>> }
>> public void testRealIndex() throws IOException
>> {
>>  String word = "TTD";
>>  String worlds = "666666";
>>  DoubleMap wordsWorldsFreqMap = new DoubleMap();
>>  wordsWorldsFreqMap.insert("TTD", 666666, 1.0);
>>  BoostingBooleanQueryParser bbqp = new BoostingBooleanQueryParser();
>>  BooleanQuery bq = bbqp.parse(word, worlds, wordsWorldsFreqMap, "worlds");
>>  IndexSearcher searcher = new IndexSearcher("wordIndexTry"); //D:\\PaiDatabase\\Indexes\\WordIndex
>>  searcher.setSimilarity(new WordsSimilarity());
>>  TopDocCollector collector = new TopDocCollector(30);
>>  searcher.search(bq, collector);
>>  ScoreDoc[] hits = collector.topDocs().scoreDocs;
>>  for(int j = 0; j < Math.min(hits.length, 10); j++)
>>  {
>>   int docId = hits[j].doc;
>>   Document curDoc = searcher.doc(docId);
>>   System.out.println(curDoc.getField("word").stringValue() + ", score: " + hits[j].score);
>>   // note: this passes the hit position j to explain(), not the doc id (hits[j].doc)
>>   Explanation explanation = searcher.explain(bq, j);
>>   System.out.println(explanation.toString());
>>   String sym = curDoc.getField("word").stringValue();
>>  }
>> }
>> public abstract class Index
>> {
>>  protected Document doc = new Document();
>>  public Index()
>>  {
>>  }
>>  public Document getDocument()
>>  {
>>   return doc;
>>  }
>>  public void setDocument(Document d)
>>  {
>>   this.doc = d;
>>  }
>> }
>> public class WordIndex extends Index
>> {
>>  protected Word w;
>>  public String FIELD_WORD = "word";
>>  public String FIELD_WORLDS = "worlds";
>>  public WordIndex(Word w)
>>  {
>>   this.w = w;
>>  }
>>  public void createDocument(String includeFreq) throws java.io.FileNotFoundException
>>  {
>>   // make a new, empty document
>>   doc = new Document();
>>   doc.add(new Field(FIELD_WORD, w.word, Field.Store.YES,
>>     Field.Index.NOT_ANALYZED));
>>   doc.add(new Field(FIELD_WORLDS, String.valueOf(w.getWorldIds(includeFreq)),
>>     Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES));
>>  }
>>  public Document getDoc(String word, String indexPath) throws IOException
>>  {
>>   IndexSearcher mapSearcher = new IndexSearcher(indexPath);
>>   TermQuery mapQuery = new TermQuery(new Term(FIELD_WORD, word));
>>   Hits mapHits = mapSearcher.search(mapQuery);
>>   if(mapHits.length() != 0)
>>   {
>>    Document doc = mapHits.doc(0);
>>    return doc;
>>   }
>>   return null;
>>  }
>> }
>> public class Word
>> {
>>  public String word;
>>  public Map<Long, Double> worldsMap = new HashMap<Long, Double>();
>>  public Word()
>>  {
>>  }
>>  public String getWorldIds(String includeFreq)
>>  {
>>   String worlds = "";
>>   Iterator iter = worldsMap.entrySet().iterator();
>>   while(iter.hasNext())
>>   {
>>    Map.Entry entry = (Map.Entry) iter.next();
>>    if(includeFreq.equals("1"))
>>    {
>>     int freq = (int) Double.parseDouble(entry.getValue().toString());
>>     for(int i = 0; i < freq; i++)
>>     {
>>      worlds += entry.getKey().toString() + " ";
>>     }
>>    }
>>    else
>>    {
>>     worlds += entry.getKey().toString() + " ";
>>    }
>>   }
>>   return worlds;
>>  }
>> }
>> public class DoubleMap
>> {
>>  private Map<String, Map<Long, Double>> map;
>>  public Map<String, String> worldsListMap = new HashMap<String, String>();
>>  public List<String> entriesList = new ArrayList<String>();
>>  public DoubleMap()
>>  {
>>   map = new HashMap<String, Map<Long, Double>>();
>>  }
>>  public void insert(String word, long worldId, double beta)
>>  {
>>   if(map.get(word) != null)
>>   {
>>    Map<Long, Double> innerMap = map.get(word);
>>    if(innerMap.get(worldId) != null)
>>    {
>>     return;
>>    }
>>    innerMap.put(worldId, beta);
>>    map.put(word, innerMap);
>>   }
>>   else
>>   {
>>    Map<Long, Double> innerMap = new HashMap<Long, Double>();
>>    innerMap.put(worldId, beta);
>>    map.put(word, innerMap);
>>   }
>>  }
>>  public void insert(String word, long worldId, double beta, int size)
>>  {
>>   if(map.get(word) != null)
>>   {
>>    Map<Long, Double> innerMap = map.get(word);
>>    if(innerMap.get(worldId) != null)
>>    {
>>     return;
>>    }
>>    if(innerMap.size() == size)
>>    {
>>     System.out.println(innerMap.size());
>>     long minWorldId = getMinItem(innerMap);
>>     innerMap.remove(minWorldId);
>>    }
>>    innerMap.put(worldId, beta);
>>    map.put(word, innerMap);
>>   }
>>   else
>>   {
>>    Map<Long, Double> innerMap = new HashMap<Long, Double>();
>>    innerMap.put(worldId, beta);
>>    map.put(word, innerMap);
>>   }
>>  }
>>  private long getMinItem(Map<Long, Double> innerMap)
>>  {
>>   // smallest worldId key currently in the map
>>   long worldId = -1;
>>   for(Long key : innerMap.keySet())
>>   {
>>    if(worldId == -1 || key.longValue() < worldId)
>>    {
>>     worldId = key.longValue();
>>    }
>>   }
>>   return worldId;
>>  }
>>  public Map<String, Map<Long, Double>> getMap()
>>  {
>>   return map;
>>  }
>> }
>> public class BoostingBooleanQueryParser
>> {
>>  public BoostingBooleanQueryParser()
>>  {
>>  }
>>  public BooleanQuery parse(String word, String worlds, DoubleMap wordsWorldsFreqMap,
>>    String fieldName) throws IOException
>>  {
>>   BooleanQuery bq = new BooleanQuery();
>>   String[] splitWorlds = worlds.split(" ");
>>   for(int i = 0; i < splitWorlds.length; i++)
>>   {
>>    double freq = wordsWorldsFreqMap.getMap().get(word)
>>      .get(Long.parseLong(splitWorlds[i]));
>>    BoostingTermQuery tq = new BoostingTermQuery(new Term(fieldName, splitWorlds[i]));
>>    tq.setBoost((float) freq);
>>    bq.add(tq, BooleanClause.Occur.SHOULD);
>>   }
>>   return bq;
>>  }
>> }
>> public class PayloadAnalyzer extends Analyzer
>> {
>>  private PayloadTokenStream payToken = null;
>>  private int score;
>>  private Map<String, Integer> scoresMap = new HashMap<String, Integer>();
>>  public synchronized void setScore(int s)
>>  {
>>   score = s;
>>  }
>>  public synchronized void setMapScores(Map<String, Integer> scoresMap)
>>  {
>>   this.scoresMap = scoresMap;
>>  }
>>  public final TokenStream tokenStream(String field, Reader reader)
>>  {
>>   payToken = new PayloadTokenStream(new WhitespaceTokenizer(reader)); //new LowerCaseTokenizer(reader));
>>   payToken.setScore(score);
>>   payToken.setMapScores(scoresMap);
>>   return payToken;
>>  }
>> }
>> public class PayloadTokenStream extends TokenStream
>> {
>>  private Tokenizer tok = null;
>>  private int score;
>>  private Map<String, Integer> scoresMap = new HashMap<String, Integer>();
>>  public PayloadTokenStream(Tokenizer tokenizer)
>>  {
>>   tok = tokenizer;
>>  }
>>  public void setScore(int s)
>>  {
>>   score = s;
>>  }
>>  public synchronized void setMapScores(Map<String, Integer> scoresMap)
>>  {
>>   this.scoresMap = scoresMap;
>>  }
>>  public Token next(Token t) throws IOException
>>  {
>>   t = tok.next(t);
>>   if(t != null)
>>   {
>>    //t.setTermBuffer("can change");
>>    //byte[] bytes = ("score:" + score).getBytes();
>>    //t.setPayload(new Payload(bytes));
>>    String word = String.copyValueOf(t.termBuffer(), 0, t.termLength());
>>    if(word != null && !word.equals(""))
>>    {
>>     // look up the boost that processMap() registered for this term
>>     Integer boost = scoresMap.get(word);
>>     int score = (boost != null) ? boost.intValue() : 1; // default to 1 if the term has no entry (assumption)
>>     if(score > 127)
>>     {
>>      score = 127; // the payload here is a single byte, so cap at Byte.MAX_VALUE
>>     }
>>     t.setPayload(new Payload(new byte[] { (byte) score }));
>>    }
>>   }
>>   return t;
>>  }
>>  public void reset(Reader input) throws IOException
>>  {
>>   tok.reset(input);
>>  }
>>  public void close() throws IOException
>>  {
>>   tok.close();
>>  }
>> }
>> }
>>

--------------------------
Grant Ingersoll
http://www.lucidimagination.com/

Search the Lucene ecosystem (Lucene/Solr/Nutch/Mahout/Tika/Droids)  
using Solr/Lucene:
http://www.lucidimagination.com/search


---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org

