Return-Path: Delivered-To: apmail-lucene-java-commits-archive@www.apache.org Received: (qmail 35650 invoked from network); 2 Apr 2010 14:01:17 -0000 Received: from unknown (HELO mail.apache.org) (140.211.11.3) by 140.211.11.9 with SMTP; 2 Apr 2010 14:01:17 -0000 Received: (qmail 79602 invoked by uid 500); 2 Apr 2010 05:01:16 -0000 Delivered-To: apmail-lucene-java-commits-archive@lucene.apache.org Received: (qmail 79469 invoked by uid 500); 2 Apr 2010 05:01:16 -0000 Mailing-List: contact java-commits-help@lucene.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: java-dev@lucene.apache.org Delivered-To: mailing list java-commits@lucene.apache.org Received: (qmail 79462 invoked by uid 99); 2 Apr 2010 05:01:15 -0000 Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 02 Apr 2010 05:01:15 +0000 X-ASF-Spam-Status: No, hits=-1411.6 required=10.0 tests=ALL_TRUSTED,AWL X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 02 Apr 2010 05:01:14 +0000 Received: by eris.apache.org (Postfix, from userid 65534) id 7D5052388994; Fri, 2 Apr 2010 05:00:53 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r930163 - in /lucene/dev/trunk/solr: CHANGES.txt src/java/org/apache/solr/analysis/ShingleFilterFactory.java src/test/org/apache/solr/analysis/TestShingleFilterFactory.java Date: Fri, 02 Apr 2010 05:00:53 -0000 To: java-commits@lucene.apache.org From: rmuir@apache.org X-Mailer: svnmailer-1.0.8 Message-Id: <20100402050053.7D5052388994@eris.apache.org> Author: rmuir Date: Fri Apr 2 05:00:53 2010 New Revision: 930163 URL: http://svn.apache.org/viewvc?rev=930163&view=rev Log: SOLR-1740: ShingleFilterFactory improvements Modified: lucene/dev/trunk/solr/CHANGES.txt lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/ShingleFilterFactory.java lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestShingleFilterFactory.java Modified: lucene/dev/trunk/solr/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=930163&r1=930162&r2=930163&view=diff ============================================================================== --- lucene/dev/trunk/solr/CHANGES.txt (original) +++ lucene/dev/trunk/solr/CHANGES.txt Fri Apr 2 05:00:53 2010 @@ -135,6 +135,10 @@ New Features TokenFilters now support custom Attributes, and some have improved performance: especially WordDelimiterFilter and CommonGramsFilter. (rmuir, cmale, uschindler) +* SOLR-1740: ShingleFilterFactory supports the "minShingleSize" and "tokenSeparator" + parameters for controlling the minimum shingle size produced by the filter, and + the separator string that it uses, respectively. (Steven Rowe via rmuir) + Optimizations ---------------------- Modified: lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/ShingleFilterFactory.java URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/ShingleFilterFactory.java?rev=930163&r1=930162&r2=930163&view=diff ============================================================================== --- lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/ShingleFilterFactory.java (original) +++ lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/ShingleFilterFactory.java Fri Apr 2 05:00:53 2010 @@ -21,21 +21,49 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.shingle.ShingleFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.SolrException.ErrorCode; + import java.util.Map; /** Factory for {@link ShingleFilter} */ public class ShingleFilterFactory extends BaseTokenFilterFactory { + private int minShingleSize; private int maxShingleSize; private boolean outputUnigrams; + private String tokenSeparator; + public void init(Map args) { super.init(args); maxShingleSize = getInt("maxShingleSize", ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE); + if (maxShingleSize < 2) { + throw new SolrException(ErrorCode.SERVER_ERROR, + "Invalid maxShingleSize (" + maxShingleSize + + ") - must be at least 2"); + } + minShingleSize = getInt("minShingleSize", + ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE); + if (minShingleSize < 2) { + throw new SolrException(ErrorCode.SERVER_ERROR, + "Invalid minShingleSize (" + minShingleSize + + ") - must be at least 2"); + } + if (minShingleSize > maxShingleSize) { + throw new SolrException(ErrorCode.SERVER_ERROR, + "Invalid minShingleSize (" + minShingleSize + + ") - must be no greater than maxShingleSize (" + + maxShingleSize + ")"); + } outputUnigrams = getBoolean("outputUnigrams", true); + tokenSeparator = args.containsKey("tokenSeparator") + ? args.get("tokenSeparator") + : ShingleFilter.TOKEN_SEPARATOR; } public ShingleFilter create(TokenStream input) { - ShingleFilter r = new ShingleFilter(input,maxShingleSize); + ShingleFilter r = new ShingleFilter(input, minShingleSize, maxShingleSize); r.setOutputUnigrams(outputUnigrams); + r.setTokenSeparator(tokenSeparator); return r; } } Modified: lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestShingleFilterFactory.java URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestShingleFilterFactory.java?rev=930163&r1=930162&r2=930163&view=diff ============================================================================== --- lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestShingleFilterFactory.java (original) +++ lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestShingleFilterFactory.java Fri Apr 2 05:00:53 2010 @@ -70,4 +70,150 @@ public class TestShingleFilterFactory ex new String[] {"this", "this is", "this is a", "is", "is a", "is a test", "a", "a test", "test"}); } + + /** + * Test with higher min (and max) shingle size + */ + public void testMinShingleSize() throws Exception { + Reader reader = new StringReader("this is a test"); + Map args = new HashMap(); + args.put("minShingleSize", "3"); + args.put("maxShingleSize", "4"); + ShingleFilterFactory factory = new ShingleFilterFactory(); + factory.init(args); + TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader)); + assertTokenStreamContents(stream, + new String[] { "this", "this is a", "this is a test", + "is", "is a test", "a", "test" }); + } + + /** + * Test with higher min (and max) shingle size and with unigrams disabled + */ + public void testMinShingleSizeNoUnigrams() throws Exception { + Reader reader = new StringReader("this is a test"); + Map args = new HashMap(); + args.put("minShingleSize", "3"); + args.put("maxShingleSize", "4"); + args.put("outputUnigrams", "false"); + ShingleFilterFactory factory = new ShingleFilterFactory(); + factory.init(args); + TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader)); + assertTokenStreamContents(stream, + new String[] { "this is a", "this is a test", "is a test" }); + } + + /** + * Test with higher same min and max shingle size + */ + public void testEqualMinAndMaxShingleSize() throws Exception { + Reader reader = new StringReader("this is a test"); + Map args = new HashMap(); + args.put("minShingleSize", "3"); + args.put("maxShingleSize", "3"); + ShingleFilterFactory factory = new ShingleFilterFactory(); + factory.init(args); + TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader)); + assertTokenStreamContents(stream, + new String[] { "this", "this is a", "is", "is a test", "a", "test" }); + } + + /** + * Test with higher same min and max shingle size and with unigrams disabled + */ + public void testEqualMinAndMaxShingleSizeNoUnigrams() throws Exception { + Reader reader = new StringReader("this is a test"); + Map args = new HashMap(); + args.put("minShingleSize", "3"); + args.put("maxShingleSize", "3"); + args.put("outputUnigrams", "false"); + ShingleFilterFactory factory = new ShingleFilterFactory(); + factory.init(args); + TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader)); + assertTokenStreamContents(stream, + new String[] { "this is a", "is a test" }); + } + + /** + * Test with a non-default token separator + */ + public void testTokenSeparator() throws Exception { + Reader reader = new StringReader("this is a test"); + Map args = new HashMap(); + args.put("tokenSeparator", "=BLAH="); + ShingleFilterFactory factory = new ShingleFilterFactory(); + factory.init(args); + TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader)); + assertTokenStreamContents(stream, + new String[] { "this", "this=BLAH=is", "is", "is=BLAH=a", + "a", "a=BLAH=test", "test" }); + } + + /** + * Test with a non-default token separator and with unigrams disabled + */ + public void testTokenSeparatorNoUnigrams() throws Exception { + Reader reader = new StringReader("this is a test"); + Map args = new HashMap(); + args.put("tokenSeparator", "=BLAH="); + args.put("outputUnigrams", "false"); + ShingleFilterFactory factory = new ShingleFilterFactory(); + factory.init(args); + TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader)); + assertTokenStreamContents(stream, + new String[] { "this=BLAH=is", "is=BLAH=a", "a=BLAH=test" }); + } + + /** + * Test with an empty token separator + */ + public void testEmptyTokenSeparator() throws Exception { + Reader reader = new StringReader("this is a test"); + Map args = new HashMap(); + args.put("tokenSeparator", ""); + ShingleFilterFactory factory = new ShingleFilterFactory(); + factory.init(args); + TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader)); + assertTokenStreamContents(stream, + new String[] { "this", "thisis", "is", "isa", "a", "atest", "test" }); + } + + /** + * Test with higher min (and max) shingle size + * and with a non-default token separator + */ + public void testMinShingleSizeAndTokenSeparator() throws Exception { + Reader reader = new StringReader("this is a test"); + Map args = new HashMap(); + args.put("minShingleSize", "3"); + args.put("maxShingleSize", "4"); + args.put("tokenSeparator", "=BLAH="); + ShingleFilterFactory factory = new ShingleFilterFactory(); + factory.init(args); + TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader)); + assertTokenStreamContents(stream, + new String[] { "this", "this=BLAH=is=BLAH=a", + "this=BLAH=is=BLAH=a=BLAH=test", "is", + "is=BLAH=a=BLAH=test", "a", "test" }); + } + + /** + * Test with higher min (and max) shingle size + * and with a non-default token separator + * and with unigrams disabled + */ + public void testMinShingleSizeAndTokenSeparatorNoUnigrams() throws Exception { + Reader reader = new StringReader("this is a test"); + Map args = new HashMap(); + args.put("minShingleSize", "3"); + args.put("maxShingleSize", "4"); + args.put("tokenSeparator", "=BLAH="); + args.put("outputUnigrams", "false"); + ShingleFilterFactory factory = new ShingleFilterFactory(); + factory.init(args); + TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader)); + assertTokenStreamContents(stream, + new String[] { "this=BLAH=is=BLAH=a", "this=BLAH=is=BLAH=a=BLAH=test", + "is=BLAH=a=BLAH=test", }); + } }