lucene-java-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Weiwei Wang <ww.wang...@gmail.com>
Subject I need to implement a TokenFilter to break season07
Date Tue, 15 Dec 2009 09:01:43 GMT
Hi, all
     I currently need a TokenFilter to break the token "season07" into two
tokens, "season" and "07".

I tried PatternReplaceCharFilter to replace "season07" with "season 07";
however, the resulting offsets are not correct for highlighting. For this
reason, I want to implement a TokenFilter, but I do not know how to deal with
the offsets. My implementation is currently modeled on EdgeNGramTokenFilter:
/**
 * Token filter that splits each incoming token at every position where a
 * letter is immediately followed by a digit, so that {@code season07} is
 * emitted as the two tokens {@code season} and {@code 07}.
 *
 * <p>Only letter-to-digit boundaries are split; a digit followed by a letter
 * (for example {@code 07season}) is left in one piece. Tokens with no such
 * boundary pass through as a single piece, and zero-length tokens are dropped.
 *
 * <p>The offsets of each piece are computed as the upstream token's start
 * offset plus the piece's character positions within the term text.
 * NOTE(review): this assumes one term character corresponds to one character
 * of the original input; confirm that no upstream filter changes the term
 * length, otherwise highlighting offsets will drift.
 */
public final class AlphaNumberTokenFilter extends TokenFilter
{
    /** Private copy of the term being split; {@code null} means "pull the next upstream token". */
    private char[] curTermBuffer;

    /** Number of valid characters in {@link #curTermBuffer}. */
    private int curTermLength;

    /** Index within the current term where the next piece starts. */
    private int currentOffset;

    /** Start offset of the current upstream token, as reported by the offset attribute. */
    private int baseOffset;

    private final TermAttribute termAtt;

    private final OffsetAttribute offsetAtt;

    /**
     * Creates a filter that splits the tokens produced by {@code input}.
     *
     * <p>The constructor is public because the class is final: a protected
     * constructor would only be reachable from within the same package.
     *
     * @param input the token stream whose tokens are to be split
     */
    public AlphaNumberTokenFilter(TokenStream input)
    {
        super(input);
        this.termAtt = addAttribute(TermAttribute.class);
        this.offsetAtt = addAttribute(OffsetAttribute.class);
    }

    /**
     * Advances to the next piece, pulling a new token from the wrapped stream
     * once the current one has been fully emitted.
     *
     * @return {@code true} if a piece was written to the term and offset
     *         attributes, {@code false} once the wrapped stream is exhausted
     * @throws IOException if the wrapped stream fails
     */
    @Override
    public final boolean incrementToken() throws IOException
    {
        while (true)
        {
            if (curTermBuffer == null)
            {
                if (!input.incrementToken())
                {
                    return false;
                }
                // Emitting a piece overwrites the shared term attribute, so keep
                // a private copy of the valid characters of the original term.
                curTermLength = termAtt.termLength();
                curTermBuffer = new char[curTermLength];
                System.arraycopy(termAtt.termBuffer(), 0, curTermBuffer, 0, curTermLength);
                currentOffset = 0;
                baseOffset = offsetAtt.startOffset();
            }

            if (currentOffset < curTermLength)
            {
                int start = currentOffset;
                int end = findPieceEnd(start);
                offsetAtt.setOffset(baseOffset + start, baseOffset + end);
                termAtt.setTermBuffer(curTermBuffer, start, end - start);
                currentOffset = end;
                return true;
            }

            // Current term fully consumed (or empty): fetch the next upstream token.
            curTermBuffer = null;
        }
    }

    /**
     * Finds the exclusive end index of the piece that starts at {@code from}:
     * the index just after the first letter that is directly followed by a
     * digit, or the term length when no such boundary remains.
     */
    private int findPieceEnd(int from)
    {
        for (int i = from; i < curTermLength - 1; i++)
        {
            if (Character.isLetter(curTermBuffer[i]) && Character.isDigit(curTermBuffer[i + 1]))
            {
                return i + 1;
            }
        }
        return curTermLength;
    }

    /**
     * Resets the wrapped stream and discards any partially emitted term so the
     * next call to {@link #incrementToken()} starts from a fresh upstream token.
     *
     * @throws IOException if resetting the wrapped stream fails
     */
    @Override
    public void reset() throws IOException
    {
        super.reset();
        curTermBuffer = null;
    }
}

-- 
Weiwei Wang
Alex Wang
王巍巍
Room 403, Mengmin Wei Building
Computer Science Department
Gulou Campus of Nanjing University
Nanjing, P.R.China, 210093

Homepage: http://cs.nju.edu.cn/rl/weiweiwang

Mime
  • Unnamed multipart/alternative (inline, None, 0 bytes)
View raw message