lucene-java-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From David Ginzburg <davidginzb...@gmail.com>
Subject localToken contains a termBuffer with 10 empty chars ('')
Date Sun, 18 Oct 2009 00:27:55 GMT
Hi,
I have written my own weighted synonym filter and tried to integrate it
inside an analyzer.
The analyzer as defined in the schema.xml is:




the field type is
<fieldType name="Company_Name" class="solr.TextField"
positionIncrementGap="100" >
      <analyzer type="index">
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>

        <filter class="DTSynonymFactory"
FreskoFunction="SimilarityProbManual.txt"
ignoreCase="true" expand="false"/>

        <!--<filter class="solr.EnglishPorterFilterFactory"
protected="protwords.txt"/>-->
        <!--<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>-->
      </analyzer>
      <analyzer type="query">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.StopFilterFactory" ignoreCase="true"
words="stopwords.txt"/>
        <!--<filter class="solr.EnglishPorterFilterFactory"
protected="protwords.txt"/>-->
        <!--<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>-->
      </analyzer>
    </fieldType>


The problem is that, in the Token next(Token reusableToken) method of
DTSynonymFilter, I always get a token whose termBuffer contains 10 empty
chars.

I have debugged and stepped into the Solr code and found that, in class
DocInverterPerField at line 134:

 Token token = stream.next(localToken);

localToken contains a termBuffer with 10 empty chars ('').

What am I doing wrong?

The java code:
*
import com.google.common.collect.**ArrayListMultimap;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import org.apache.lucene.analysis.**Token;
import org.apache.lucene.analysis.**TokenFilter;
import org.apache.lucene.analysis.**TokenStream;
import org.apache.lucene.analysis.**payloads.PayloadHelper;
import org.apache.lucene.index.**Payload;

/**
 *
 * @author david
 */
public class DTSynonymFilter extends TokenFilter {

    public DTSynonymFilter(TokenStream input, ArrayListMultimap<String,
Synonym> syns) {
        super(input);
        this.synsMap = syns;
        System.out.println("in DTSynonymFilter synsMap ");



    }
    public static final String SYNONYM = "<SYNONYM>";
    TokenFilter tf;
    private LinkedList<Token> synonymTokenQueue = new LinkedList<Token>();

    private ArrayListMultimap<String, Synonym> synsMap = null;
    private LinkedList<Token> buffer;

    private Token nextTok(Token target) throws IOException {

        if (buffer != null && !buffer.isEmpty()) {
            return buffer.removeFirst();
        } else {
            return input.next(target);
        }
    }

    private void pushTok(Token t) {
        if (buffer == null) {
            buffer = new LinkedList<Token>();

        }
        buffer.addFirst(t);
    }

    @Override
    public Token next(Token reusableToken) throws IOException {

        if (synonymTokenQueue.size() > 0) {

            return synonymTokenQueue.removeFirst(* *);

        }
        if (reusableToken == null) {
            return null;
        }

        reusableToken.setPayload(new Payload(new byte[]{(byte) 1}));

      //   System.out.println("trying to get synonyms for "+reusableToken);
      //    System.out.println(synsMap.* *get(reusableToken.term()));
        List<Synonym> syns = synsMap.get(reusableToken.**term());
       for (Synonym synonym : synsMap.get(reusableToken.**term())) {
                System.out.println(synonym);
            }
        Payload boostPayload;

        for (Synonym synonym : syns) {
            //Token(char[] startTermBuffer, int termBufferOffset, int
termBufferLength, int start, int end)
           // Token synToken = new Token(synonym.getToken().**toCharArray(),
reusableToken.startOffset(), reusableToken.endOffset(),
synonym.getToken().length(), 0);//, t.startOffset(), t.endOffset(),
SYNONYM);
            Token newTok = new Token(reusableToken.**startOffset(),
reusableToken.endOffset(), SYNONYM);
            newTok.setTermBuffer(synonym.**getToken().toCharArray(), 0,
synonym.getToken().length());
            // set the position increment to zero
            // this tells lucene the synonym is
            // in the exact same location as the originating word
            newTok.setPositionIncrement(0)**;
            boostPayload = new Payload(PayloadHelper.**
encodeFloat(synonym.getWieght(**)));
            newTok.setPayload(**boostPayload);
            synonymTokenQueue.add(newTok);

        }
        return reusableToken;








    }
}


import DTSynonymFilter;
import com.google.common.collect.**ArrayListMultimap;
import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.lucene.analysis.**Token;
import org.apache.lucene.analysis.**TokenStream;
import org.apache.solr.analysis.**BaseTokenFilterFactory;
import org.apache.solr.analysis.**TokenizerFactory;
import org.apache.solr.common.**ResourceLoader;
import org.apache.solr.common.util.**StrUtils;
import org.apache.solr.util.plugin.**ResourceLoaderAware;

/**
 *
 * @author david
 */
public class DTSynonymFactory extends BaseTokenFilterFactory implements
ResourceLoaderAware {

    boolean informed=false;
    String synonyms=null;

    /**
     * Creates the factory without any synonym data; the {@code syns} table is
     * only assigned inside {@code inform}.
     */
    public DTSynonymFactory() {
    }

    final static Logger log = Logger.getLogger(**DTSynonymFactory.class.**
getName());

    private static TokenizerFactory loadTokenizerFactory(* *ResourceLoader
loader, String cname, Map<String, String> args) {
        TokenizerFactory tokFactory = (TokenizerFactory)
loader.newInstance(cname);
        tokFactory.init(args);
        return tokFactory;
    }
    private ArrayListMultimap<String, Synonym> syns = null;

    public DTSynonymFilter create(TokenStream input) {

        Thread.dumpStack();
        try {
            Thread.sleep(5000);
        } catch (InterruptedException ex) {
            Logger.getLogger(**DTSynonymFactory.class.**getName()).log(Level.SEVERE,
null, ex);
        }
        if(syns!=null){
            System.out.println("in create() syns is "+syns+" syns size is
"+" " );
            return new DTSynonymFilter(input,syns);
        }
        else{
            System.out.println("in create() syns is "+syns+" and informed is
"+informed);
            return new DTSynonymFilter(input,null);


        }
  }
    @Override
    public void inform(ResourceLoader loader) {

         synonyms = args.get("FreskoFunction");
        System.out.println("in DTSynonymFilter.inform() synonyms file is
"+synonyms);
        boolean ignoreCase = getBoolean("ignoreCase", false);
         System.out.println("in DTSynonymFilter.inform() ignoreCase is
"+ignoreCase);
        boolean expand = getBoolean("expand", true);
        System.out.println("in DTSynonymFilter.inform() expand is "+expand);
        //String seperator =
        String tf = args.get("tokenizerFactory");

        TokenizerFactory tokFactory = null;
        if (tf != null) {
            tokFactory = loadTokenizerFactory(loader, tf, args);
        }
        if (tf != null) {
            System.out.println("**TokenizerFactory loaded ");
        }
        if (synonyms != null) {
            List<String> wlist = null;
            try {
                File synonymFile = new File(synonyms);
                if (synonymFile.exists()) {
                    wlist = loader.getLines(synonyms);
                } else {
                    List<String> files = StrUtils.splitFileNames(**
synonyms);
                    for (String file : files) {
                        wlist = loader.getLines(file.trim());
                    }
                }
            } catch (Exception e) {
                e.printStackTrace();

                throw new RuntimeException(e);

            }
            syns = ArrayListMultimap.create();
            populateSynMap("\\|", wlist);
            if(syns==null){
                System.out.println("sysns after create and populate is
null!!!!!!");
                Thread.sleep(5000);


            }
            else{
                System.out.println("after crete the size of syns is
"+syns.size());
                informed=true;
            }

        // synMap = new SynonymMap(ignoreCase);
        // parseRules(wlist, synMap, "=>", ",", expand,tokFactory);
        }
        else{
            throw new RuntimeException("Could not find synonyms");
        }
        }catch(Exception e){
           e.printStackTrace();
           throw  new RuntimeException(e);
        }
    }


        }

    }
}

* Thanks in advance

Mime
  • Unnamed multipart/alternative (inline, None, 0 bytes)
View raw message