Return-Path: Delivered-To: apmail-lucene-java-commits-archive@www.apache.org Received: (qmail 1416 invoked from network); 15 Dec 2009 13:27:53 -0000 Received: from hermes.apache.org (HELO mail.apache.org) (140.211.11.3) by minotaur.apache.org with SMTP; 15 Dec 2009 13:27:53 -0000 Received: (qmail 13811 invoked by uid 500); 15 Dec 2009 13:27:52 -0000 Delivered-To: apmail-lucene-java-commits-archive@lucene.apache.org Received: (qmail 13745 invoked by uid 500); 15 Dec 2009 13:27:52 -0000 Mailing-List: contact java-commits-help@lucene.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: java-dev@lucene.apache.org Delivered-To: mailing list java-commits@lucene.apache.org Received: (qmail 13736 invoked by uid 99); 15 Dec 2009 13:27:51 -0000 Received: from nike.apache.org (HELO nike.apache.org) (192.87.106.230) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 15 Dec 2009 13:27:51 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=10.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 15 Dec 2009 13:27:49 +0000 Received: by eris.apache.org (Postfix, from userid 65534) id 6BCDA238897F; Tue, 15 Dec 2009 13:27:28 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r890791 - in /lucene/java/trunk/contrib: CHANGES.txt analyzers/common/src/java/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilter.java analyzers/common/src/java/org/apache/lucene/analysis/payloads/IdentityEncoder.java Date: Tue, 15 Dec 2009 13:27:28 -0000 To: java-commits@lucene.apache.org From: uschindler@apache.org X-Mailer: svnmailer-1.0.8 Message-Id: <20091215132728.6BCDA238897F@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: uschindler Date: Tue Dec 15 13:27:27 2009 New Revision: 890791 URL: http://svn.apache.org/viewvc?rev=890791&view=rev Log: 
LUCENE-2157: DelimitedPayloadTokenFilter no longer copies the buffer over itself; instead it sets the length to the offset of the delimiter. Also optimizes logic and IdentityEncoder to use NIO. Modified: lucene/java/trunk/contrib/CHANGES.txt lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilter.java lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/IdentityEncoder.java Modified: lucene/java/trunk/contrib/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/CHANGES.txt?rev=890791&r1=890790&r2=890791&view=diff ============================================================================== --- lucene/java/trunk/contrib/CHANGES.txt (original) +++ lucene/java/trunk/contrib/CHANGES.txt Tue Dec 15 13:27:27 2009 @@ -65,6 +65,12 @@ into core, and moved the ICU-based collation support into contrib/icu. (Robert Muir) +Optimizations + + * LUCENE-2157: DelimitedPayloadTokenFilter no longer copies the buffer + over itself. Instead it sets only the length. This patch also optimizes + the logic of the filter and uses NIO for IdentityEncoder. (Uwe Schindler) + Test Cases * LUCENE-2115: Cutover contrib tests to use Java5 generics. 
(Kay Kay Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilter.java URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilter.java?rev=890791&r1=890790&r2=890791&view=diff ============================================================================== --- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilter.java (original) +++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilter.java Tue Dec 15 13:27:27 2009 @@ -61,26 +61,19 @@ @Override public boolean incrementToken() throws IOException { - boolean result = false; if (input.incrementToken()) { final char[] buffer = termAtt.termBuffer(); final int length = termAtt.termLength(); - //look for the delimiter - boolean seen = false; for (int i = 0; i < length; i++) { if (buffer[i] == delimiter) { - termAtt.setTermBuffer(buffer, 0, i); payAtt.setPayload(encoder.encode(buffer, i + 1, (length - (i + 1)))); - seen = true; - break;//at this point, we know the whole piece, so we can exit. 
If we don't see the delimiter, then the termAtt is the same + termAtt.setTermLength(i); // simply set a new length + return true; } } - if (seen == false) { - //no delimiter - payAtt.setPayload(null); - } - result = true; - } - return result; + // we have not seen the delimiter + payAtt.setPayload(null); + return true; + } else return false; } } Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/IdentityEncoder.java URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/IdentityEncoder.java?rev=890791&r1=890790&r2=890791&view=diff ============================================================================== --- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/IdentityEncoder.java (original) +++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/IdentityEncoder.java Tue Dec 15 13:27:27 2009 @@ -18,9 +18,9 @@ import org.apache.lucene.index.Payload; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; import java.nio.charset.Charset; -import java.util.Arrays; -import java.io.UnsupportedEncodingException; /** @@ -30,28 +30,30 @@ public class IdentityEncoder extends AbstractEncoder implements PayloadEncoder{ protected Charset charset = Charset.forName("UTF-8"); - protected String charsetName = "UTF-8"; //argh, stupid 1.4 + + /** @deprecated This field is no longer used. Use {@link #charset} instead. */ + @Deprecated + protected String charsetName = charset.name(); public IdentityEncoder() { } public IdentityEncoder(Charset charset) { this.charset = charset; + // @deprecated, remove this in 4.0: charsetName = charset.name(); } public Payload encode(char[] buffer, int offset, int length) { - //what's the most efficient way to get a byte [] from a char[] array - //Do we have to go through String? 
- String tmp = new String(buffer, offset, length); - Payload result = null;//Can we avoid allocating by knowing where using the new API? - try { - result = new Payload(tmp.getBytes(charsetName)); - } catch (UnsupportedEncodingException e) { - //should never hit this, since we get the name from the Charset + final ByteBuffer bb = charset.encode(CharBuffer.wrap(buffer, offset, length)); + if (bb.hasArray()) { + return new Payload(bb.array(), bb.arrayOffset() + bb.position(), bb.remaining()); + } else { + // normally it should always have an array, but who knows? + final byte[] b = new byte[bb.remaining()]; + bb.get(b); + return new Payload(b); } - - return result; } }