lucene-java-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "Uwe Schindler" <...@thetaphi.de>
Subject RE: Converting from TermAttribute to CharTermAttribute
Date Thu, 15 Sep 2011 10:12:18 GMT
Hi,

Your incrementToken method differs between the two implementations:
The original one calls setTermBuffer(), whereas the new one only resizes the
buffer and never changes its contents.

Uwe
-----
Uwe Schindler
H.-H.-Meier-Allee 63, D-28213 Bremen
http://www.thetaphi.de
eMail: uwe@thetaphi.de


> -----Original Message-----
> From: Paul Taylor [mailto:paul_t100@fastmail.fm]
> Sent: Thursday, September 15, 2011 11:39 AM
> To: 'java-user@lucene.apache.org'
> Subject: Converting from TermAttribute to CharTermAttribute
> 
> I have updated from Lucene 3.0 to Lucene 3.1 and am now getting various
> deprecations that I'm trying to move away from.
> 
> I changed this filter class and now my tests are failing. Is anybody able
> to see what I'm missing, please?
> 
> Paul
> 
> package org.musicbrainz.search.analysis;
> 
> import org.apache.lucene.analysis.TokenFilter;
> import org.apache.lucene.analysis.TokenStream;
> import org.apache.lucene.analysis.tokenattributes.TermAttribute;
> 
> import java.io.IOException;
> 
> /**
>   * A filter that replaces accented characters by their unaccented
>   * equivalents.
>   *
>   * For every token of the wrapped stream, incrementToken() hands the term
>   * buffer to removeAccents(). When that returns true, the term is replaced
>   * with the first outputPos chars of the output scratch buffer; when it
>   * returns false, the term is left untouched.
>   *
>   * removeAccents() looks each char up in the UnaccentIndexes /
>   * UnaccentPositions / UnaccentData tables (declared elsewhere). It returns
>   * false when no char had a replacement, and true after writing the
>   * rewritten term into output and its length into outputPos.
>   */
> public class AccentFilter extends TokenFilter {
> 
>      private char[] output = new char[256]; // scratch buffer that receives the rewritten term
>      private int outputPos; // number of chars written to output by the last removeAccents() call
> 
>      private TermAttribute termAttr;
> 
>      public AccentFilter(TokenStream input) {
>          super(input);
>          termAttr = (TermAttribute) addAttribute(TermAttribute.class);
>      }
> 
>      @Override
>      public boolean incrementToken() throws IOException {
>          if (!input.incrementToken())
>              return false; // the wrapped stream produced no token
> 
>          final char[] buffer = termAttr.termBuffer();
>          final int length    = termAttr.termLength();
>          if (removeAccents(buffer, length))  {
>              termAttr.setTermBuffer(output, 0, outputPos); // replace the term with the rewritten chars
>          }
>          return true;
>      }
> 
>      protected final boolean removeAccents(char[] input, int length) { // NOTE: parameter 'input' shadows the 'input' stream used in incrementToken()
>          final int maxSizeNeeded = 2 * length; // presumably a replacement is at most 2 chars per input char - TODO confirm against UnaccentData
>          int size = output.length;
>          while (size < maxSizeNeeded)
>              size *= 2;
> 
>          int inputPos = 0; // index of the first input char not yet copied to output
>          outputPos = 0;
> 
>          for (int i = 0; i < length; i++) {
>              int c = (int) input[i];
> 
>              int block = UnaccentIndexes.indexes[c >>
UnaccentData.BLOCK_SHIFT];
>              int position = c & UnaccentData.BLOCK_MASK;
> 
>              short[] positions = UnaccentPositions.positions[block];
>              int unacPosition = positions[position];
>              int unacLength = positions[position + 1] - unacPosition; // length = next entry's offset minus this entry's offset
> 
>              if (unacLength > 0) {
>                  // allocate a new char array, if necessary (only done once a replacement is actually found)
>                  if (size != output.length)
>                      output = new char[size];
>                  // copy front of the input (the unchanged chars between the previous replacement and this one)
>                  if (inputPos < i) {
>                      System.arraycopy(input, inputPos, output, outputPos,
i - inputPos);
>                      outputPos += i - inputPos;
>                  }
>                  // copy unaccented data
>                  System.arraycopy(UnaccentData.data[block], unacPosition,
>                          output, outputPos, unacLength);
>                  outputPos += unacLength;
>                  inputPos = i + 1; // everything up to and including input[i] is now handled
>              }
>          }
> 
>          // no conversion needed: inputPos is still 0 only if no char had a replacement
>          if (inputPos == 0)
>              return false;
> 
>          // copy rest of the input (the unchanged tail after the last replacement)
>          int copyLength = length - inputPos;
>          if (copyLength > 0) {
>              System.arraycopy(input, inputPos, output, outputPos,
copyLength);
>              outputPos += copyLength;
>          }
> 
>          return true;
>      }
> 
> }
> 
> to
> 
> package org.musicbrainz.search.analysis;
> 
> import org.apache.lucene.analysis.TokenFilter;
> import org.apache.lucene.analysis.TokenStream;
> import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
> 
> import java.io.IOException;
> 
> /**
>   * A filter that replaces accented characters by their unaccented
>   * equivalents.
>   *
>   * For every token of the wrapped stream, incrementToken() hands the term
>   * buffer to removeAccents(), which returns false when no char had a
>   * replacement and true after writing the rewritten term into output and
>   * its length into outputPos.
>   *
>   * NOTE(review): when removeAccents() returns true, incrementToken() only
>   * calls resizeBuffer(outputPos); nothing in this class copies output into
>   * the term, so the token text is never actually changed. The Lucene 3.0
>   * version of this class called setTermBuffer(output, 0, outputPos) at that
>   * point; the CharTermAttribute equivalent is presumably
>   * copyBuffer(output, 0, outputPos) - confirm against the Lucene 3.1 API.
>   */
> public class AccentFilter extends TokenFilter {
> 
>      private char[] output = new char[256]; // scratch buffer that receives the rewritten term
>      private int outputPos; // number of chars written to output by the last removeAccents() call
> 
>      private CharTermAttribute termAttr;
> 
>      public AccentFilter(TokenStream input) {
>          super(input);
>          termAttr = (CharTermAttribute)
addAttribute(CharTermAttribute.class);
>      }
> 
>      @Override
>      public boolean incrementToken() throws IOException {
>          if (!input.incrementToken())
>              return false; // the wrapped stream produced no token
> 
>          final char[] buffer = termAttr.buffer();
>          final int length    = termAttr.length();
>          if (removeAccents(buffer, length))  {
>              termAttr.resizeBuffer(outputPos); // NOTE(review): only resizes the term buffer; 'output' is never copied into the term
>          }
>          return true;
>      }
> 
>      protected final boolean removeAccents(char[] input, int length) { // NOTE: parameter 'input' shadows the 'input' stream used in incrementToken()
>          final int maxSizeNeeded = 2 * length; // presumably a replacement is at most 2 chars per input char - TODO confirm against UnaccentData
>          int size = output.length;
>          while (size < maxSizeNeeded)
>              size *= 2;
> 
>          int inputPos = 0; // index of the first input char not yet copied to output
>          outputPos = 0;
> 
>          for (int i = 0; i < length; i++) {
>              int c = (int) input[i];
> 
>              int block = UnaccentIndexes.indexes[c >>
UnaccentData.BLOCK_SHIFT];
>              int position = c & UnaccentData.BLOCK_MASK;
> 
>              short[] positions = UnaccentPositions.positions[block];
>              int unacPosition = positions[position];
>              int unacLength = positions[position + 1] - unacPosition; // length = next entry's offset minus this entry's offset
> 
>              if (unacLength > 0) {
>                  // allocate a new char array, if necessary (only done once a replacement is actually found)
>                  if (size != output.length)
>                      output = new char[size];
>                  // copy front of the input (the unchanged chars between the previous replacement and this one)
>                  if (inputPos < i) {
>                      System.arraycopy(input, inputPos, output, outputPos,
i - inputPos);
>                      outputPos += i - inputPos;
>                  }
>                  // copy unaccented data
>                  System.arraycopy(UnaccentData.data[block], unacPosition,
>                          output, outputPos, unacLength);
>                  outputPos += unacLength;
>                  inputPos = i + 1; // everything up to and including input[i] is now handled
>              }
>          }
> 
>          // no conversion needed: inputPos is still 0 only if no char had a replacement
>          if (inputPos == 0)
>              return false;
> 
>          // copy rest of the input (the unchanged tail after the last replacement)
>          int copyLength = length - inputPos;
>          if (copyLength > 0) {
>              System.arraycopy(input, inputPos, output, outputPos,
copyLength);
>              outputPos += copyLength;
>          }
> 
>          return true;
>      }
> 
> }
> 
> 
> 
> 
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
> For additional commands, e-mail: java-user-help@lucene.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org


Mime
View raw message