From: markrmiller@apache.org
To: solr-commits@lucene.apache.org
Reply-To: solr-dev@lucene.apache.org
Subject: svn commit: r892821 [2/3] - in /lucene/solr/trunk: ./ src/test/org/apache/solr/analysis/ src/test/test-files/solr/conf/
Date: Mon, 21 Dec 2009 13:53:52 -0000
Message-Id: <20091221135354.090202388A19@eris.apache.org>

Added: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestElisionFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestElisionFilterFactory.java?rev=892821&view=auto
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestElisionFilterFactory.java (added)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestElisionFilterFactory.java Mon Dec 21 13:53:50 2009
@@ -0,0 +1,50 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.solr.common.ResourceLoader;
+
+/**
+ * Simple tests to ensure the French elision filter factory is working.
+ */
+public class TestElisionFilterFactory extends BaseTokenTestCase {
+  /**
+   * Ensure the filter actually normalizes text.
+   */
+  public void testElision() throws Exception {
+    Reader reader = new StringReader("l'avion");
+    Tokenizer tokenizer = new WhitespaceTokenizer(reader);
+    ElisionFilterFactory factory = new ElisionFilterFactory();
+    ResourceLoader loader = solrConfig.getResourceLoader();
+    Map args = new HashMap();
+    args.put("articles", "frenchArticles.txt");
+    factory.init(args);
+    factory.inform(loader);
+    TokenStream stream = factory.create(tokenizer);
+    assertTokenStreamContents(stream, new String[] { "avion" });
+  }
+
+}

Added: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestFrenchStemFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestFrenchStemFilterFactory.java?rev=892821&view=auto
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestFrenchStemFilterFactory.java (added)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestFrenchStemFilterFactory.java Mon Dec 21 13:53:50 2009
@@ -0,0 +1,41 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+
+/**
+ * Simple tests to ensure the French stem filter factory is working.
+ */
+public class TestFrenchStemFilterFactory extends BaseTokenTestCase {
+  /**
+   * Ensure the filter actually stems text.
+   */
+  public void testStemming() throws Exception {
+    Reader reader = new StringReader("habitable");
+    Tokenizer tokenizer = new WhitespaceTokenizer(reader);
+    FrenchStemFilterFactory factory = new FrenchStemFilterFactory();
+    TokenStream stream = factory.create(tokenizer);
+    assertTokenStreamContents(stream, new String[] { "habit" });
+  }
+}

Added: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestGermanStemFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestGermanStemFilterFactory.java?rev=892821&view=auto
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestGermanStemFilterFactory.java (added)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestGermanStemFilterFactory.java Mon Dec 21 13:53:50 2009
@@ -0,0 +1,41 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.WhitespaceTokenizer; + +/** + * Simple tests to ensure the German stem filter factory is working. + */ +public class TestGermanStemFilterFactory extends BaseTokenTestCase { + /** + * Ensure the filter actually stems text. + */ + public void testStemming() throws Exception { + Reader reader = new StringReader("Tischen"); + Tokenizer tokenizer = new WhitespaceTokenizer(reader); + GermanStemFilterFactory factory = new GermanStemFilterFactory(); + TokenStream stream = factory.create(tokenizer); + assertTokenStreamContents(stream, new String[] { "tisch" }); + } +} Added: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestGreekLowerCaseFilterFactory.java URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestGreekLowerCaseFilterFactory.java?rev=892821&view=auto ============================================================================== --- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestGreekLowerCaseFilterFactory.java (added) +++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestGreekLowerCaseFilterFactory.java Mon Dec 21 13:53:50 2009 @@ -0,0 +1,41 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.WhitespaceTokenizer; + +/** + * Simple tests to ensure the Greek lowercase filter factory is working. + */ +public class TestGreekLowerCaseFilterFactory extends BaseTokenTestCase { + /** + * Ensure the filter actually lowercases (and a bit more) greek text. 
+ */ + public void testStemming() throws Exception { + Reader reader = new StringReader("Μάϊος ΜΆΪΟΣ"); + Tokenizer tokenizer = new WhitespaceTokenizer(reader); + GreekLowerCaseFilterFactory factory = new GreekLowerCaseFilterFactory(); + TokenStream stream = factory.create(tokenizer); + assertTokenStreamContents(stream, new String[] { "μαιοσ", "μαιοσ" }); + } +} Modified: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestHyphenatedWordsFilter.java URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestHyphenatedWordsFilter.java?rev=892821&r1=892820&r2=892821&view=diff ============================================================================== --- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestHyphenatedWordsFilter.java (original) +++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestHyphenatedWordsFilter.java Mon Dec 21 13:53:50 2009 @@ -28,12 +28,24 @@ public class TestHyphenatedWordsFilter extends BaseTokenTestCase { public void testHyphenatedWords() throws Exception { String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on and ecologi-\ncal"; - String outputAfterHyphenatedWordsFilter = "ecological develop comprehensive-hands-on and ecological"; // first test TokenStream ts = new WhitespaceTokenizer(new StringReader(input)); - ts = new HyphenatedWordsFilter(ts); - String actual = tsToString(ts); - assertEquals("Testing HyphenatedWordsFilter", - outputAfterHyphenatedWordsFilter, actual); + HyphenatedWordsFilterFactory factory = new HyphenatedWordsFilterFactory(); + ts = factory.create(ts); + assertTokenStreamContents(ts, + new String[] { "ecological", "develop", "comprehensive-hands-on", "and", "ecological" }); } + + /** + * Test that HyphenatedWordsFilter behaves correctly with a final hyphen + */ + public void testHyphenAtEnd() throws Exception { + String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on and ecology-"; + // first test + TokenStream ts = new WhitespaceTokenizer(new StringReader(input)); + HyphenatedWordsFilterFactory factory = new HyphenatedWordsFilterFactory(); + ts = factory.create(ts); + assertTokenStreamContents(ts, + new String[] { "ecological", "develop", "comprehensive-hands-on", "and", "ecology-" }); + } } Modified: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestKeepWordFilter.java URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestKeepWordFilter.java?rev=892821&r1=892820&r2=892821&view=diff ============================================================================== --- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestKeepWordFilter.java (original) +++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestKeepWordFilter.java Mon Dec 21 13:53:50 2009 @@ -17,13 +17,14 @@ package org.apache.solr.analysis; +import java.io.StringReader; import java.util.HashMap; import java.util.HashSet; -import java.util.List; import java.util.Map; import java.util.Set; -import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.WhitespaceTokenizer; /** @@ -37,7 +38,7 @@ words.add( "aaa" ); words.add( "bbb" ); - List input = tokens( "aaa BBB ccc ddd EEE" ); + String input = "aaa BBB ccc ddd EEE"; Map args = new HashMap(); @@ -47,18 +48,28 @@ factory.init( args ); factory.inform( solrConfig.getResourceLoader() ); factory.setWords( words ); + assertTrue(factory.isIgnoreCase()); + TokenStream stream = factory.create(new WhitespaceTokenizer(new 
StringReader(input))); + assertTokenStreamContents(stream, new String[] { "aaa", "BBB" }); - List expect = tokens( "aaa BBB" ); - List real = getTokens(factory.create( new IterTokenStream(input) )); - assertTokEqual( expect, real ); + // Test Stopwords (ignoreCase via the setter instead) + factory = new KeepWordFilterFactory(); + args = new HashMap(); + factory.init( args ); + factory.inform( solrConfig.getResourceLoader() ); + factory.setIgnoreCase(true); + factory.setWords( words ); + assertTrue(factory.isIgnoreCase()); + stream = factory.create(new WhitespaceTokenizer(new StringReader(input))); + assertTokenStreamContents(stream, new String[] { "aaa", "BBB" }); // Now force case + args = new HashMap(); args.put( "ignoreCase", "false" ); factory.init( args ); factory.inform( solrConfig.getResourceLoader() ); - - expect = tokens( "aaa" ); - real = getTokens(factory.create( new IterTokenStream(input) )); - assertTokEqual( expect, real ); + assertFalse(factory.isIgnoreCase()); + stream = factory.create(new WhitespaceTokenizer(new StringReader(input))); + assertTokenStreamContents(stream, new String[] { "aaa" }); } } Modified: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestMultiWordSynonyms.java URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestMultiWordSynonyms.java?rev=892821&r1=892820&r2=892821&view=diff ============================================================================== --- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestMultiWordSynonyms.java (original) +++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestMultiWordSynonyms.java Mon Dec 21 13:53:50 2009 @@ -1,37 +1,27 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.WhitespaceTokenizer; -import org.apache.lucene.analysis.tokenattributes.TermAttribute; -import org.junit.Assert; import org.junit.Test; import java.io.IOException; import java.io.StringReader; import java.util.ArrayList; -import java.util.Arrays; import java.util.List; /** - * @version $Id$ * @since solr 1.4 */ -public class TestMultiWordSynonyms { +public class TestMultiWordSynonyms extends BaseTokenTestCase { @Test - public void testMultiWordSynonmys() throws IOException { + public void testMultiWordSynonyms() throws IOException { List rules = new ArrayList(); rules.add("a b c,d"); SynonymMap synMap = new SynonymMap(true); SynonymFilterFactory.parseRules(rules, synMap, "=>", ",", true, null); SynonymFilter ts = new SynonymFilter(new WhitespaceTokenizer(new StringReader("a e")), synMap); - TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class); - - ts.reset(); - List tokens = new ArrayList(); - while (ts.incrementToken()) tokens.add(termAtt.term()); - // This fails because ["e","e"] is the value of the token stream - Assert.assertEquals(Arrays.asList("a", "e"), tokens); + assertTokenStreamContents(ts, new String[] { "a", "e" }); } } Added: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestNGramFilters.java URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestNGramFilters.java?rev=892821&view=auto ============================================================================== --- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestNGramFilters.java (added) +++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestNGramFilters.java Mon Dec 21 13:53:50 2009 @@ -0,0 +1,163 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license 
agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.WhitespaceTokenizer; + +/** + * Simple tests to ensure the NGram filter factories are working. + */ +public class TestNGramFilters extends BaseTokenTestCase { + /** + * Test NGramTokenizerFactory + */ + public void testNGramTokenizer() throws Exception { + Reader reader = new StringReader("test"); + Map args = new HashMap(); + NGramTokenizerFactory factory = new NGramTokenizerFactory(); + factory.init(args); + Tokenizer stream = factory.create(reader); + assertTokenStreamContents(stream, + new String[] { "t", "e", "s", "t", "te", "es", "st" }); + } + /** + * Test NGramTokenizerFactory with min and max gram options + */ + public void testNGramTokenizer2() throws Exception { + Reader reader = new StringReader("test"); + Map args = new HashMap(); + args.put("minGramSize", "2"); + args.put("maxGramSize", "3"); + NGramTokenizerFactory factory = new NGramTokenizerFactory(); + factory.init(args); + Tokenizer stream = factory.create(reader); + assertTokenStreamContents(stream, + new String[] { "te", "es", "st", "tes", "est" }); + } + /** + * Test the NGramFilterFactory + */ + public void testNGramFilter() throws Exception { + Reader reader = new StringReader("test"); + Map args = new HashMap(); + NGramFilterFactory factory = new NGramFilterFactory(); + factory.init(args); + TokenStream stream = factory.create(new WhitespaceTokenizer(reader)); + assertTokenStreamContents(stream, + new String[] { "t", "e", "s", "t", "te", "es", "st" }); + } + /** + * Test the NGramFilterFactory with min and max gram options + */ + public void testNGramFilter2() throws Exception { + Reader reader = new StringReader("test"); + Map args = new HashMap(); + args.put("minGramSize", "2"); + args.put("maxGramSize", "3"); + NGramFilterFactory factory = new NGramFilterFactory(); + factory.init(args); + TokenStream stream = factory.create(new WhitespaceTokenizer(reader)); + assertTokenStreamContents(stream, + new String[] { "te", "es", "st", "tes", "est" }); + } + /** + * Test EdgeNGramTokenizerFactory + */ + public void testEdgeNGramTokenizer() throws Exception { + Reader reader = new StringReader("test"); + Map args = new HashMap(); + EdgeNGramTokenizerFactory factory = new EdgeNGramTokenizerFactory(); + factory.init(args); + Tokenizer stream = factory.create(reader); + assertTokenStreamContents(stream, + new String[] { "t" }); + } + /** + * Test EdgeNGramTokenizerFactory with min and max gram size + */ + public void testEdgeNGramTokenizer2() throws Exception { + Reader reader = new StringReader("test"); + Map args = new HashMap(); + args.put("minGramSize", "1"); + args.put("maxGramSize", 
"2"); + EdgeNGramTokenizerFactory factory = new EdgeNGramTokenizerFactory(); + factory.init(args); + Tokenizer stream = factory.create(reader); + assertTokenStreamContents(stream, + new String[] { "t", "te" }); + } + /** + * Test EdgeNGramTokenizerFactory with side option + */ + public void testEdgeNGramTokenizer3() throws Exception { + Reader reader = new StringReader("ready"); + Map args = new HashMap(); + args.put("side", "back"); + EdgeNGramTokenizerFactory factory = new EdgeNGramTokenizerFactory(); + factory.init(args); + Tokenizer stream = factory.create(reader); + assertTokenStreamContents(stream, + new String[] { "y" }); + } + /** + * Test EdgeNGramFilterFactory + */ + public void testEdgeNGramFilter() throws Exception { + Reader reader = new StringReader("test"); + Map args = new HashMap(); + EdgeNGramFilterFactory factory = new EdgeNGramFilterFactory(); + factory.init(args); + TokenStream stream = factory.create(new WhitespaceTokenizer(reader)); + assertTokenStreamContents(stream, + new String[] { "t" }); + } + /** + * Test EdgeNGramFilterFactory with min and max gram size + */ + public void testEdgeNGramFilter2() throws Exception { + Reader reader = new StringReader("test"); + Map args = new HashMap(); + args.put("minGramSize", "1"); + args.put("maxGramSize", "2"); + EdgeNGramFilterFactory factory = new EdgeNGramFilterFactory(); + factory.init(args); + TokenStream stream = factory.create(new WhitespaceTokenizer(reader)); + assertTokenStreamContents(stream, + new String[] { "t", "te" }); + } + /** + * Test EdgeNGramFilterFactory with side option + */ + public void testEdgeNGramFilter3() throws Exception { + Reader reader = new StringReader("ready"); + Map args = new HashMap(); + args.put("side", "back"); + EdgeNGramFilterFactory factory = new EdgeNGramFilterFactory(); + factory.init(args); + TokenStream stream = factory.create(new WhitespaceTokenizer(reader)); + assertTokenStreamContents(stream, + new String[] { "y" }); + } +} Modified: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPatternReplaceCharFilter.java URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPatternReplaceCharFilter.java?rev=892821&r1=892820&r2=892821&view=diff ============================================================================== --- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPatternReplaceCharFilter.java (original) +++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPatternReplaceCharFilter.java Mon Dec 21 13:53:50 2009 @@ -19,6 +19,8 @@ import java.io.IOException; import java.io.StringReader; +import java.util.HashMap; +import java.util.Map; import org.apache.lucene.analysis.CharReader; import org.apache.lucene.analysis.CharStream; @@ -37,20 +39,33 @@ // this is test. public void testNothingChange() throws IOException { final String BLOCK = "this is test."; - CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "$1$2$3", + PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory(); + Map args = new HashMap(); + args.put("pattern", "(aa)\\s+(bb)\\s+(cc)"); + args.put("replacement", "$1$2$3"); + factory.init(args); + CharStream cs = factory.create( CharReader.get( new StringReader( BLOCK ) ) ); TokenStream ts = new WhitespaceTokenizer( cs ); - assertTokEqualOff( tokens( "this,1,0,4 is,1,5,7 test.,1,8,13" ), getTokens( ts ) ); + assertTokenStreamContents(ts, + new String[] { "this", "is", "test." 
}, + new int[] { 0, 5, 8 }, + new int[] { 4, 7, 13 }, + new int[] { 1, 1, 1 }); } // 012345678 // aa bb cc public void testReplaceByEmpty() throws IOException { final String BLOCK = "aa bb cc"; - CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "", + PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory(); + Map args = new HashMap(); + args.put("pattern", "(aa)\\s+(bb)\\s+(cc)"); + factory.init(args); + CharStream cs = factory.create( CharReader.get( new StringReader( BLOCK ) ) ); TokenStream ts = new WhitespaceTokenizer( cs ); - assertEquals( 0, getTokens( ts ).size() ); + assertFalse(ts.incrementToken()); } // 012345678 @@ -58,10 +73,19 @@ // aa#bb#cc public void test1block1matchSameLength() throws IOException { final String BLOCK = "aa bb cc"; - CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "$1#$2#$3", + PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory(); + Map args = new HashMap(); + args.put("pattern", "(aa)\\s+(bb)\\s+(cc)"); + args.put("replacement", "$1#$2#$3"); + factory.init(args); + CharStream cs = factory.create( CharReader.get( new StringReader( BLOCK ) ) ); TokenStream ts = new WhitespaceTokenizer( cs ); - assertTokEqualOff( tokens( "aa#bb#cc,1,0,8" ), getTokens( ts ) ); + assertTokenStreamContents(ts, + new String[] { "aa#bb#cc" }, + new int[] { 0 }, + new int[] { 8 }, + new int[] { 1 }); } // 11111 @@ -73,7 +97,11 @@ CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "$1##$2###$3", CharReader.get( new StringReader( BLOCK ) ) ); TokenStream ts = new WhitespaceTokenizer( cs ); - assertTokEqualOff( tokens( "aa##bb###cc,1,0,8 dd,1,9,11" ), getTokens( ts ) ); + assertTokenStreamContents(ts, + new String[] { "aa##bb###cc", "dd" }, + new int[] { 0, 9 }, + new int[] { 8, 11 }, + new int[] { 1, 1 }); } // 01234567 @@ -84,7 +112,11 @@ CharStream cs = new PatternReplaceCharFilter( "a", "aa", CharReader.get( new StringReader( BLOCK ) ) ); TokenStream ts = new WhitespaceTokenizer( cs ); - assertTokEqualOff( tokens( "aa,1,1,2 aa,1,4,5" ), getTokens( ts ) ); + assertTokenStreamContents(ts, + new String[] { "aa", "aa" }, + new int[] { 1, 4 }, + new int[] { 2, 5 }, + new int[] { 1, 1 }); } // 11111 @@ -96,7 +128,11 @@ CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "$1#$2", CharReader.get( new StringReader( BLOCK ) ) ); TokenStream ts = new WhitespaceTokenizer( cs ); - assertTokEqualOff( tokens( "aa#bb,1,0,11 dd,1,12,14" ), getTokens( ts ) ); + assertTokenStreamContents(ts, + new String[] { "aa#bb", "dd" }, + new int[] { 0, 12 }, + new int[] { 11, 14 }, + new int[] { 1, 1 }); } // 111111111122222222223333 @@ -108,8 +144,11 @@ CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "$1 $2 $3", CharReader.get( new StringReader( BLOCK ) ) ); TokenStream ts = new WhitespaceTokenizer( cs ); - assertTokEqualOff( tokens( "aa,1,2,4 bb,1,6,8 cc,1,9,10 ---,1,11,14 aa,1,15,17 bb,1,18,20 aa,1,21,23 bb,1,25,27 cc,1,29,33" ), - getTokens( ts ) ); + assertTokenStreamContents(ts, + new String[] { "aa", "bb", "cc", "---", "aa", "bb", "aa", "bb", "cc" }, + new int[] { 2, 6, 9, 11, 15, 18, 21, 25, 29 }, + new int[] { 4, 8, 10, 14, 17, 20, 23, 27, 33 }, + new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1 }); } // 11111111112222222222333333333 @@ -121,8 +160,11 @@ CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)", "$1##$2", ".", CharReader.get( new StringReader( BLOCK ) ) ); TokenStream ts = new WhitespaceTokenizer( cs ); - assertTokEqualOff( tokens( 
"aa##bb,1,2,7 cc,1,8,10 ---,1,11,14 aa##bb,1,15,20 aa.,1,21,24 bb,1,25,27 aa##bb,1,28,35 cc,1,36,38" ), - getTokens( ts ) ); + assertTokenStreamContents(ts, + new String[] { "aa##bb", "cc", "---", "aa##bb", "aa.", "bb", "aa##bb", "cc" }, + new int[] { 2, 8, 11, 15, 21, 25, 28, 36 }, + new int[] { 7, 10, 14, 20, 24, 27, 35, 38 }, + new int[] { 1, 1, 1, 1, 1, 1, 1, 1 }); } // 11111111112222222222333333333 @@ -136,7 +178,10 @@ cs = new PatternReplaceCharFilter( "bb", "b", ".", cs ); cs = new PatternReplaceCharFilter( "ccc", "c", ".", cs ); TokenStream ts = new WhitespaceTokenizer( cs ); - assertTokEqualOff( tokens( "aa,1,1,2 b,1,3,5 -,1,6,7 c,1,8,11 .,1,12,13 ---,1,14,17 b,1,18,20 aa,1,21,22 .,1,23,24 c,1,25,28 c,1,29,32 b,1,33,35" ), - getTokens( ts ) ); + assertTokenStreamContents(ts, + new String[] { "aa", "b", "-", "c", ".", "---", "b", "aa", ".", "c", "c", "b" }, + new int[] { 1, 3, 6, 8, 12, 14, 18, 21, 23, 25, 29, 33 }, + new int[] { 2, 5, 7, 11, 13, 17, 20, 22, 24, 28, 32, 35 }, + new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }); } } Modified: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPatternReplaceFilter.java URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPatternReplaceFilter.java?rev=892821&r1=892820&r2=892821&view=diff ============================================================================== --- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPatternReplaceFilter.java (original) +++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPatternReplaceFilter.java Mon Dec 21 13:53:50 2009 @@ -17,7 +17,6 @@ package org.apache.solr.analysis; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.WhitespaceTokenizer; @@ -27,7 +26,7 @@ /** * @version $Id:$ */ -public class TestPatternReplaceFilter extends AnalysisTestCase { +public class TestPatternReplaceFilter extends BaseTokenTestCase { public void testReplaceAll() throws Exception { String input = "aabfooaabfooabfoob ab caaaaaaaaab"; @@ -35,14 +34,8 @@ (new WhitespaceTokenizer(new StringReader(input)), Pattern.compile("a*b"), "-", true); - Token token = ts.next(); - assertEquals("-foo-foo-foo-", new String(token.termBuffer(), 0, token.termLength())); - token = ts.next(); - assertEquals("-", new String(token.termBuffer(), 0, token.termLength())); - token = ts.next(); - assertEquals("c-", new String(token.termBuffer(), 0, token.termLength())); - token = ts.next(); - assertNull(token); + assertTokenStreamContents(ts, + new String[] { "-foo-foo-foo-", "-", "c-" }); } public void testReplaceFirst() throws Exception { @@ -51,14 +44,8 @@ (new WhitespaceTokenizer(new StringReader(input)), Pattern.compile("a*b"), "-", false); - Token token = ts.next(); - assertEquals("-fooaabfooabfoob", new String(token.termBuffer(), 0, token.termLength())); - token = ts.next(); - assertEquals("-", new String(token.termBuffer(), 0, token.termLength())); - token = ts.next(); - assertEquals("c-", new String(token.termBuffer(), 0, token.termLength())); - token = ts.next(); - assertNull(token); + assertTokenStreamContents(ts, + new String[] { "-fooaabfooabfoob", "-", "c-" }); } public void testStripFirst() throws Exception { @@ -67,14 +54,8 @@ (new WhitespaceTokenizer(new StringReader(input)), Pattern.compile("a*b"), null, false); - Token token = ts.next(); - assertEquals("fooaabfooabfoob", new String(token.termBuffer(), 0, token.termLength())); - token = ts.next(); - assertEquals("", new String(token.termBuffer(), 0, 
token.termLength())); - token = ts.next(); - assertEquals("c", new String(token.termBuffer(), 0, token.termLength())); - token = ts.next(); - assertNull(token); + assertTokenStreamContents(ts, + new String[] { "fooaabfooabfoob", "", "c" }); } public void testStripAll() throws Exception { @@ -83,14 +64,8 @@ (new WhitespaceTokenizer(new StringReader(input)), Pattern.compile("a*b"), null, true); - Token token = ts.next(); - assertEquals("foofoofoo", new String(token.termBuffer(), 0, token.termLength())); - token = ts.next(); - assertEquals("", new String(token.termBuffer(), 0, token.termLength())); - token = ts.next(); - assertEquals("c", new String(token.termBuffer(), 0, token.termLength())); - token = ts.next(); - assertNull(token); + assertTokenStreamContents(ts, + new String[] { "foofoofoo", "", "c" }); } public void testReplaceAllWithBackRef() throws Exception { @@ -99,14 +74,8 @@ (new WhitespaceTokenizer(new StringReader(input)), Pattern.compile("(a*)b"), "$1\\$", true); - Token token = ts.next(); - assertEquals("aa$fooaa$fooa$foo$", new String(token.termBuffer(), 0, token.termLength())); - token = ts.next(); - assertEquals("a$", new String(token.termBuffer(), 0, token.termLength())); - token = ts.next(); - assertEquals("caaaaaaaaa$", new String(token.termBuffer(), 0, token.termLength())); - token = ts.next(); - assertNull(token); + assertTokenStreamContents(ts, + new String[] { "aa$fooaa$fooa$foo$", "a$", "caaaaaaaaa$" }); } } Modified: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPatternTokenizerFactory.java URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPatternTokenizerFactory.java?rev=892821&r1=892820&r2=892821&view=diff ============================================================================== --- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPatternTokenizerFactory.java (original) +++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPatternTokenizerFactory.java Mon Dec 21 13:53:50 2009 @@ -17,6 +17,7 @@ package org.apache.solr.analysis; +import java.io.IOException; import java.io.StringReader; import java.util.ArrayList; import java.util.HashMap; @@ -27,8 +28,8 @@ import org.apache.lucene.analysis.CharStream; import org.apache.lucene.analysis.MappingCharFilter; import org.apache.lucene.analysis.NormalizeCharMap; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; public class TestPatternTokenizerFactory extends BaseTokenTestCase { @@ -57,7 +58,7 @@ tokenizer.init( args ); TokenStream stream = tokenizer.create( new StringReader( test[2] ) ); - String out = TestHyphenatedWordsFilter.tsToString( stream ); + String out = tsToString( stream ); System.out.println( test[2] + " ==> " + out ); assertEquals("pattern: "+test[1]+" with input: "+test[2], test[3], out ); @@ -93,20 +94,45 @@ PatternTokenizerFactory tokFactory = new PatternTokenizerFactory(); tokFactory.init( args ); TokenStream stream = tokFactory.create( charStream ); - - List result = getTokens( stream ); - List expect = tokens( "Günther,1,0,12 Günther,1,13,25 is,1,26,28 here,1,29,33" ); - assertTokEqualOff( expect, result ); + assertTokenStreamContents(stream, + new String[] { "Günther", "Günther", "is", "here" }, + new int[] { 0, 13, 26, 29 }, + new int[] { 12, 25, 28, 33 }, + new int[] { 1, 1, 1, 1 }); - charStream.reset(); + charStream = new MappingCharFilter( normMap, CharReader.get( new StringReader( INPUT ) ) ); args.put( 
PatternTokenizerFactory.PATTERN, "Günther" ); args.put( PatternTokenizerFactory.GROUP, "0" ); tokFactory = new PatternTokenizerFactory(); tokFactory.init( args ); stream = tokFactory.create( charStream ); + assertTokenStreamContents(stream, + new String[] { "Günther", "Günther" }, + new int[] { 0, 13 }, + new int[] { 12, 25 }, + new int[] { 1, 1 }); + } + + /** + * TODO: rewrite tests not to use string comparison. + * @deprecated only tests TermAttribute! + */ + private static String tsToString(TokenStream in) throws IOException { + StringBuilder out = new StringBuilder(); + TermAttribute termAtt = (TermAttribute) in.addAttribute(TermAttribute.class); + // extra safety to enforce, that the state is not preserved and also + // assign bogus values + in.clearAttributes(); + termAtt.setTermBuffer("bogusTerm"); + while (in.incrementToken()) { + if (out.length() > 0) + out.append(' '); + out.append(termAtt.term()); + in.clearAttributes(); + termAtt.setTermBuffer("bogusTerm"); + } - result = getTokens( stream ); - expect = tokens( "Günther,1,0,12 Günther,1,13,25" ); - assertTokEqualOff( expect, result ); + in.close(); + return out.toString(); } } Added: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPersianNormalizationFilterFactory.java URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPersianNormalizationFilterFactory.java?rev=892821&view=auto ============================================================================== --- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPersianNormalizationFilterFactory.java (added) +++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPersianNormalizationFilterFactory.java Mon Dec 21 13:53:50 2009 @@ -0,0 +1,41 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.WhitespaceTokenizer; + +/** + * Simple tests to ensure the Persian normalization factory is working. + */ +public class TestPersianNormalizationFilterFactory extends BaseTokenTestCase { + /** + * Ensure the filter actually normalizes persian text. 
+ */ + public void testNormalization() throws Exception { + Reader reader = new StringReader("های"); + Tokenizer tokenizer = new WhitespaceTokenizer(reader); + PersianNormalizationFilterFactory factory = new PersianNormalizationFilterFactory(); + TokenStream stream = factory.create(tokenizer); + assertTokenStreamContents(stream, new String[] { "هاي" }); + } +} Modified: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPhoneticFilter.java URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPhoneticFilter.java?rev=892821&r1=892820&r2=892821&view=diff ============================================================================== --- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPhoneticFilter.java (original) +++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPhoneticFilter.java Mon Dec 21 13:53:50 2009 @@ -17,16 +17,14 @@ package org.apache.solr.analysis; -import java.util.ArrayList; +import java.io.StringReader; import java.util.HashMap; import java.util.Map; -import org.apache.commons.codec.Encoder; -import org.apache.commons.codec.language.DoubleMetaphone; import org.apache.commons.codec.language.Metaphone; -import org.apache.commons.codec.language.RefinedSoundex; -import org.apache.commons.codec.language.Soundex; -import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.WhitespaceTokenizer; /** @@ -61,50 +59,38 @@ assertFalse( ff.inject ); } - public void runner( Encoder enc, boolean inject ) throws Exception - { - String[] input = new String[] { - "aaa", "bbb", "ccc", "easgasg" - }; - - ArrayList stream = new ArrayList(); - ArrayList output = new ArrayList(); - for( String s : input ) { - stream.add( new Token( s, 0, s.length() ) ); - - // phonetic token is added first in the current impl - output.add( new Token( enc.encode(s).toString(), 0, s.length() ) ); - - // add the original if applicable - if( inject ) { - output.add( new Token( s, 0, s.length() ) ); - } - } - - // System.out.println("###stream="+stream); - // System.out.println("###output="+output); - - PhoneticFilter filter = new PhoneticFilter( - new IterTokenStream(stream.iterator()), enc, "text", inject ); - - Token got = new Token(); - for( Token t : output ) { - got = filter.next(got); - // System.out.println("##### expect=" + t + " got="+got); - assertEquals( t.term(), got.term()); - } - assertNull( filter.next() ); // no more tokens + public void testAlgorithms() throws Exception { + assertAlgorithm("Metaphone", "true", "aaa bbb ccc easgasg", + new String[] { "A", "aaa", "B", "bbb", "KKK", "ccc", "ESKS", "easgasg" }); + assertAlgorithm("Metaphone", "false", "aaa bbb ccc easgasg", + new String[] { "A", "B", "KKK", "ESKS" }); + + assertAlgorithm("DoubleMetaphone", "true", "aaa bbb ccc easgasg", + new String[] { "A", "aaa", "PP", "bbb", "KK", "ccc", "ASKS", "easgasg" }); + assertAlgorithm("DoubleMetaphone", "false", "aaa bbb ccc easgasg", + new String[] { "A", "PP", "KK", "ASKS" }); + + assertAlgorithm("Soundex", "true", "aaa bbb ccc easgasg", + new String[] { "A000", "aaa", "B000", "bbb", "C000", "ccc", "E220", "easgasg" }); + assertAlgorithm("Soundex", "false", "aaa bbb ccc easgasg", + new String[] { "A000", "B000", "C000", "E220" }); + + assertAlgorithm("RefinedSoundex", "true", "aaa bbb ccc easgasg", + new String[] { "A0", "aaa", "B1", "bbb", "C3", "ccc", "E034034", "easgasg" }); + assertAlgorithm("RefinedSoundex", "false", "aaa bbb ccc easgasg", 
+ new String[] { "A0", "B1", "C3", "E034034" }); } - public void testEncodes() throws Exception { - runner( new DoubleMetaphone(), true ); - runner( new Metaphone(), true ); - runner( new Soundex(), true ); - runner( new RefinedSoundex(), true ); - - runner( new DoubleMetaphone(), false ); - runner( new Metaphone(), false ); - runner( new Soundex(), false ); - runner( new RefinedSoundex(), false ); + static void assertAlgorithm(String algName, String inject, String input, + String[] expected) throws Exception { + Tokenizer tokenizer = new WhitespaceTokenizer( + new StringReader(input)); + Map args = new HashMap(); + args.put("encoder", algName); + args.put("inject", inject); + PhoneticFilterFactory factory = new PhoneticFilterFactory(); + factory.init(args); + TokenStream stream = factory.create(tokenizer); + assertTokenStreamContents(stream, expected); } } Added: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPorterStemFilterFactory.java URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPorterStemFilterFactory.java?rev=892821&view=auto ============================================================================== --- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPorterStemFilterFactory.java (added) +++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestPorterStemFilterFactory.java Mon Dec 21 13:53:50 2009 @@ -0,0 +1,41 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.WhitespaceTokenizer; + +/** + * Simple tests to ensure the Porter stem filter factory is working. + */ +public class TestPorterStemFilterFactory extends BaseTokenTestCase { + /** + * Ensure the filter actually stems text. 
+ */ + public void testStemming() throws Exception { + Reader reader = new StringReader("dogs"); + Tokenizer tokenizer = new WhitespaceTokenizer(reader); + PorterStemFilterFactory factory = new PorterStemFilterFactory(); + TokenStream stream = factory.create(tokenizer); + assertTokenStreamContents(stream, new String[] { "dog" }); + } +} Modified: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestRemoveDuplicatesTokenFilter.java URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestRemoveDuplicatesTokenFilter.java?rev=892821&r1=892820&r2=892821&view=diff ============================================================================== --- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestRemoveDuplicatesTokenFilter.java (original) +++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestRemoveDuplicatesTokenFilter.java Mon Dec 21 13:53:50 2009 @@ -20,10 +20,14 @@ import junit.framework.TestCase; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; + import java.util.Iterator; import java.util.Arrays; -public class TestRemoveDuplicatesTokenFilter extends AnalysisTestCase { +public class TestRemoveDuplicatesTokenFilter extends BaseTokenTestCase { public static Token tok(int pos, String t, int start, int end) { Token tok = new Token(t,start,end); @@ -38,15 +42,27 @@ throws Exception { final Iterator toks = Arrays.asList(tokens).iterator(); - - final TokenStream ts = new RemoveDuplicatesTokenFilter + RemoveDuplicatesTokenFilterFactory factory = new RemoveDuplicatesTokenFilterFactory(); + final TokenStream ts = factory.create (new TokenStream() { - public Token next() { return toks.hasNext() ? toks.next() : null; } + TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class); + OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); + PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); + public boolean incrementToken() { + if (toks.hasNext()) { + clearAttributes(); + Token tok = toks.next(); + termAtt.setTermBuffer(tok.term()); + offsetAtt.setOffset(tok.startOffset(), tok.endOffset()); + posIncAtt.setPositionIncrement(tok.getPositionIncrement()); + return true; + } else { + return false; + } + } }); - final String actual = TestBufferedTokenStream.tsToString(ts); - assertEquals(expected + " != " + actual, expected, actual); - + assertTokenStreamContents(ts, expected.split("\\s")); } public void testNoDups() throws Exception { Added: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestReverseStringFilterFactory.java URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestReverseStringFilterFactory.java?rev=892821&view=auto ============================================================================== --- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestReverseStringFilterFactory.java (added) +++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestReverseStringFilterFactory.java Mon Dec 21 13:53:50 2009 @@ -0,0 +1,41 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.WhitespaceTokenizer; + +/** + * Simple tests to ensure the Reverse string filter factory is working. + */ +public class TestReverseStringFilterFactory extends BaseTokenTestCase { + /** + * Ensure the filter actually reverses text. + */ + public void testReversing() throws Exception { + Reader reader = new StringReader("simple test"); + Tokenizer tokenizer = new WhitespaceTokenizer(reader); + ReverseStringFilterFactory factory = new ReverseStringFilterFactory(); + TokenStream stream = factory.create(tokenizer); + assertTokenStreamContents(stream, new String[] { "elpmis", "tset" }); + } +} Modified: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestReversedWildcardFilterFactory.java URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestReversedWildcardFilterFactory.java?rev=892821&r1=892820&r2=892821&view=diff ============================================================================== --- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestReversedWildcardFilterFactory.java (original) +++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestReversedWildcardFilterFactory.java Mon Dec 21 13:53:50 2009 @@ -21,11 +21,9 @@ import java.io.StringReader; import java.util.HashMap; -import java.util.List; import java.util.Map; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.WhitespaceTokenizer; import org.apache.lucene.queryParser.ParseException; @@ -53,57 +51,52 @@ public void testReversedTokens() throws IOException { String text = "simple text"; - String expected1 = "simple \u0001elpmis text \u0001txet"; - String expected2 = "\u0001elpmis \u0001txet"; args.put("withOriginal", "true"); factory.init(args); TokenStream input = factory.create(new WhitespaceTokenizer(new StringReader(text))); - List realTokens = getTokens(input); - List expectedTokens = tokens(expected1); - // set positionIncrements in expected tokens - for (int i = 1; i < expectedTokens.size(); i += 2) { - expectedTokens.get(i).setPositionIncrement(0); - } - assertTokEqual(realTokens, expectedTokens); - + assertTokenStreamContents(input, + new String[] { "\u0001elpmis", "simple", "\u0001txet", "text" }, + new int[] { 1, 0, 1, 0 }); + // now without original tokens args.put("withOriginal", "false"); factory.init(args); input = factory.create(new WhitespaceTokenizer(new StringReader(text))); - realTokens = getTokens(input); - expectedTokens = tokens(expected2); - assertTokEqual(realTokens, expectedTokens); + assertTokenStreamContents(input, + new String[] { "\u0001elpmis", "\u0001txet" }, + new int[] 
{ 1, 1 }); } public void testIndexingAnalysis() throws Exception { Analyzer a = schema.getAnalyzer(); String text = "one two three si\uD834\uDD1Ex"; - String expected1 = "one \u0001eno two \u0001owt three \u0001eerht si\uD834\uDD1Ex \u0001x\uD834\uDD1Eis"; - List expectedTokens1 = getTokens( - new WhitespaceTokenizer(new StringReader(expected1))); - // set positionIncrements and offsets in expected tokens - for (int i = 1; i < expectedTokens1.size(); i += 2) { - Token t = expectedTokens1.get(i); - t.setPositionIncrement(0); - } - String expected2 = "\u0001eno \u0001owt \u0001eerht \u0001x\uD834\uDD1Eis"; - List expectedTokens2 = getTokens( - new WhitespaceTokenizer(new StringReader(expected2))); - String expected3 = "one two three si\uD834\uDD1Ex"; - List expectedTokens3 = getTokens( - new WhitespaceTokenizer(new StringReader(expected3))); + // field one TokenStream input = a.tokenStream("one", new StringReader(text)); - List realTokens = getTokens(input); - assertTokEqual(realTokens, expectedTokens1); + assertTokenStreamContents(input, + new String[] { "\u0001eno", "one", "\u0001owt", "two", + "\u0001eerht", "three", "\u0001x\uD834\uDD1Eis", "si\uD834\uDD1Ex" }, + new int[] { 0, 0, 4, 4, 8, 8, 14, 14 }, + new int[] { 3, 3, 7, 7, 13, 13, 19, 19 }, + new int[] { 1, 0, 1, 0, 1, 0, 1, 0 } + ); // field two input = a.tokenStream("two", new StringReader(text)); - realTokens = getTokens(input); - assertTokEqual(realTokens, expectedTokens2); + assertTokenStreamContents(input, + new String[] { "\u0001eno", "\u0001owt", + "\u0001eerht", "\u0001x\uD834\uDD1Eis" }, + new int[] { 0, 4, 8, 14 }, + new int[] { 3, 7, 13, 19 }, + new int[] { 1, 1, 1, 1 } + ); // field three input = a.tokenStream("three", new StringReader(text)); - realTokens = getTokens(input); - assertTokEqual(realTokens, expectedTokens3); + assertTokenStreamContents(input, + new String[] { "one", "two", "three", "si\uD834\uDD1Ex" }, + new int[] { 0, 4, 8, 14 }, + new int[] { 3, 7, 13, 19 }, + new int[] { 1, 1, 1, 1 } + ); } public void testQueryParsing() throws IOException, ParseException { Added: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestRussianFilters.java URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestRussianFilters.java?rev=892821&view=auto ============================================================================== --- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestRussianFilters.java (added) +++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestRussianFilters.java Mon Dec 21 13:53:50 2009 @@ -0,0 +1,79 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.Reader; +import java.io.StringReader; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; + +/** + * Simple tests to ensure the Russian filter factories are working. + */ +public class TestRussianFilters extends BaseTokenTestCase { + /** + * Test RussianLetterTokenizerFactory + */ + public void testTokenizer() throws Exception { + Reader reader = new StringReader("Вместе с тем о силе электромагнитной 100"); + Map args = new HashMap(); + RussianLetterTokenizerFactory factory = new RussianLetterTokenizerFactory(); + factory.init(args); + Tokenizer stream = factory.create(reader); + assertTokenStreamContents(stream, new String[] {"Вместе", "с", "тем", "о", + "силе", "электромагнитной", "100"}); + } + + /** + * Test RussianLowerCaseFilterFactory + */ + public void testLowerCase() throws Exception { + Reader reader = new StringReader("Вместе с тем о силе электромагнитной 100"); + Map args = new HashMap(); + RussianLetterTokenizerFactory factory = new RussianLetterTokenizerFactory(); + factory.init(args); + RussianLowerCaseFilterFactory filterFactory = new RussianLowerCaseFilterFactory(); + filterFactory.init(args); + Tokenizer tokenizer = factory.create(reader); + TokenStream stream = filterFactory.create(tokenizer); + assertTokenStreamContents(stream, new String[] {"вместе", "с", "тем", "о", + "силе", "электромагнитной", "100"}); + } + + /** + * Test RussianStemFilterFactory + */ + public void testStemmer() throws Exception { + Reader reader = new StringReader("Вместе с тем о силе электромагнитной 100"); + Map args = new HashMap(); + RussianLetterTokenizerFactory factory = new RussianLetterTokenizerFactory(); + factory.init(args); + RussianLowerCaseFilterFactory caseFactory = new RussianLowerCaseFilterFactory(); + caseFactory.init(args); + RussianStemFilterFactory stemFactory = new RussianStemFilterFactory(); + stemFactory.init(args); + Tokenizer tokenizer = factory.create(reader); + TokenStream stream = caseFactory.create(tokenizer); + stream = stemFactory.create(stream); + assertTokenStreamContents(stream, new String[] {"вмест", "с", "тем", "о", + "сил", "электромагнитн", "100"}); + } +} Added: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestShingleFilterFactory.java URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestShingleFilterFactory.java?rev=892821&view=auto ============================================================================== --- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestShingleFilterFactory.java (added) +++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestShingleFilterFactory.java Mon Dec 21 13:53:50 2009 @@ -0,0 +1,73 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.WhitespaceTokenizer; + +/** + * Simple tests to ensure the Shingle filter factory works. + */ +public class TestShingleFilterFactory extends BaseTokenTestCase { + /** + * Test the defaults + */ + public void testDefaults() throws Exception { + Reader reader = new StringReader("this is a test"); + Map args = new HashMap(); + ShingleFilterFactory factory = new ShingleFilterFactory(); + factory.init(args); + TokenStream stream = factory.create(new WhitespaceTokenizer(reader)); + assertTokenStreamContents(stream, new String[] {"this", "this is", "is", + "is a", "a", "a test", "test"}); + } + + /** + * Test with unigrams disabled + */ + public void testNoUnigrams() throws Exception { + Reader reader = new StringReader("this is a test"); + Map args = new HashMap(); + args.put("outputUnigrams", "false"); + ShingleFilterFactory factory = new ShingleFilterFactory(); + factory.init(args); + TokenStream stream = factory.create(new WhitespaceTokenizer(reader)); + assertTokenStreamContents(stream, + new String[] {"this is", "is a", "a test"}); + } + + /** + * Test with a higher max shingle size + */ + public void testMaxShingleSize() throws Exception { + Reader reader = new StringReader("this is a test"); + Map args = new HashMap(); + args.put("maxShingleSize", "3"); + ShingleFilterFactory factory = new ShingleFilterFactory(); + factory.init(args); + TokenStream stream = factory.create(new WhitespaceTokenizer(reader)); + assertTokenStreamContents(stream, + new String[] {"this", "this is", "this is a", "is", + "is a", "is a test", "a", "a test", "test"}); + } +} Added: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestStandardFactories.java URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestStandardFactories.java?rev=892821&view=auto ============================================================================== --- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestStandardFactories.java (added) +++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestStandardFactories.java Mon Dec 21 13:53:50 2009 @@ -0,0 +1,121 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.WhitespaceTokenizer; + +/** + * Simple tests to ensure the standard lucene factories are working. 
+ */ +public class TestStandardFactories extends BaseTokenTestCase { + /** + * Test StandardTokenizerFactory + */ + public void testStandardTokenizer() throws Exception { + Reader reader = new StringReader("What's this thing do?"); + StandardTokenizerFactory factory = new StandardTokenizerFactory(); + Tokenizer stream = factory.create(reader); + assertTokenStreamContents(stream, + new String[] {"What's", "this", "thing", "do" }); + } + + /** + * Test StandardFilterFactory + */ + public void testStandardFilter() throws Exception { + Reader reader = new StringReader("What's this thing do?"); + StandardTokenizerFactory factory = new StandardTokenizerFactory(); + StandardFilterFactory filterFactory = new StandardFilterFactory(); + Tokenizer tokenizer = factory.create(reader); + TokenStream stream = filterFactory.create(tokenizer); + assertTokenStreamContents(stream, + new String[] {"What", "this", "thing", "do"}); + } + + /** + * Test KeywordTokenizerFactory + */ + public void testKeywordTokenizer() throws Exception { + Reader reader = new StringReader("What's this thing do?"); + KeywordTokenizerFactory factory = new KeywordTokenizerFactory(); + Tokenizer stream = factory.create(reader); + assertTokenStreamContents(stream, + new String[] {"What's this thing do?"}); + } + + /** + * Test WhitespaceTokenizerFactory + */ + public void testWhitespaceTokenizer() throws Exception { + Reader reader = new StringReader("What's this thing do?"); + WhitespaceTokenizerFactory factory = new WhitespaceTokenizerFactory(); + Tokenizer stream = factory.create(reader); + assertTokenStreamContents(stream, + new String[] {"What's", "this", "thing", "do?"}); + } + + /** + * Test LetterTokenizerFactory + */ + public void testLetterTokenizer() throws Exception { + Reader reader = new StringReader("What's this thing do?"); + LetterTokenizerFactory factory = new LetterTokenizerFactory(); + Tokenizer stream = factory.create(reader); + assertTokenStreamContents(stream, + new String[] {"What", "s", "this", "thing", "do"}); + } + + /** + * Test LowerCaseTokenizerFactory + */ + public void testLowerCaseTokenizer() throws Exception { + Reader reader = new StringReader("What's this thing do?"); + LowerCaseTokenizerFactory factory = new LowerCaseTokenizerFactory(); + Tokenizer stream = factory.create(reader); + assertTokenStreamContents(stream, + new String[] {"what", "s", "this", "thing", "do"}); + } + + /** + * Ensure the ASCIIFoldingFilterFactory works + */ + public void testASCIIFolding() throws Exception { + Reader reader = new StringReader("Česká"); + Tokenizer tokenizer = new WhitespaceTokenizer(reader); + ASCIIFoldingFilterFactory factory = new ASCIIFoldingFilterFactory(); + TokenStream stream = factory.create(tokenizer); + assertTokenStreamContents(stream, new String[] { "Ceska" }); + } + + /** + * Ensure the ISOLatin1AccentFilterFactory works + * (sometimes, at least not uppercase hacek) + */ + public void testISOLatin1Folding() throws Exception { + Reader reader = new StringReader("Česká"); + Tokenizer tokenizer = new WhitespaceTokenizer(reader); + ISOLatin1AccentFilterFactory factory = new ISOLatin1AccentFilterFactory(); + TokenStream stream = factory.create(tokenizer); + assertTokenStreamContents(stream, new String[] { "Česka" }); + } +} Modified: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestSynonymFilter.java URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestSynonymFilter.java?rev=892821&r1=892820&r2=892821&view=diff 
============================================================================== --- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestSynonymFilter.java (original) +++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestSynonymFilter.java Mon Dec 21 13:53:50 2009 @@ -19,11 +19,20 @@ import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.WhitespaceTokenizer; +import org.apache.lucene.analysis.tokenattributes.FlagsAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import java.io.IOException; +import java.io.StringReader; import java.util.ArrayList; import java.util.Arrays; -import java.util.Iterator; +import java.util.Collection; import java.util.List; /** @@ -31,34 +40,42 @@ */ public class TestSynonymFilter extends BaseTokenTestCase { - public List strings(String str) { + static List strings(String str) { String[] arr = str.split(" "); return Arrays.asList(arr); } - - public List getTokList(SynonymMap dict, String input, boolean includeOrig) throws IOException { - ArrayList lst = new ArrayList(); - final List toks = tokens(input); - TokenStream ts = new TokenStream() { - Iterator iter = toks.iterator(); - @Override - public Token next() throws IOException { - return iter.hasNext() ? (Token)iter.next() : null; - } - }; - - SynonymFilter sf = new SynonymFilter(ts, dict); - - Token target = new Token(); // test with token reuse - while(true) { - Token t = sf.next(target); - if (t==null) return lst; - lst.add((Token)t.clone()); - } + static void assertTokenizesTo(SynonymMap dict, String input, + String expected[]) throws IOException { + Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader(input)); + SynonymFilter stream = new SynonymFilter(tokenizer, dict); + assertTokenStreamContents(stream, expected); } - - + + static void assertTokenizesTo(SynonymMap dict, String input, + String expected[], int posIncs[]) throws IOException { + Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader(input)); + SynonymFilter stream = new SynonymFilter(tokenizer, dict); + assertTokenStreamContents(stream, expected, posIncs); + } + + static void assertTokenizesTo(SynonymMap dict, List input, + String expected[], int posIncs[]) + throws IOException { + TokenStream tokenizer = new IterTokenStream(input); + SynonymFilter stream = new SynonymFilter(tokenizer, dict); + assertTokenStreamContents(stream, expected, posIncs); + } + + static void assertTokenizesTo(SynonymMap dict, List input, + String expected[], int startOffsets[], int endOffsets[], int posIncs[]) + throws IOException { + TokenStream tokenizer = new IterTokenStream(input); + SynonymFilter stream = new SynonymFilter(tokenizer, dict); + assertTokenStreamContents(stream, expected, startOffsets, endOffsets, + posIncs); + } + public void testMatching() throws IOException { SynonymMap map = new SynonymMap(); @@ -71,28 +88,29 @@ map.add(strings("z x c v"), tokens("zxcv"), orig, merge); map.add(strings("x c"), tokens("xc"), orig, merge); - // System.out.println(map); - // System.out.println(getTokList(map,"a",false)); - - assertTokEqual(getTokList(map,"$",false), tokens("$")); - assertTokEqual(getTokList(map,"a",false), 
tokens("aa")); - assertTokEqual(getTokList(map,"a $",false), tokens("aa $")); - assertTokEqual(getTokList(map,"$ a",false), tokens("$ aa")); - assertTokEqual(getTokList(map,"a a",false), tokens("aa aa")); - assertTokEqual(getTokList(map,"b",false), tokens("bb")); - assertTokEqual(getTokList(map,"z x c v",false), tokens("zxcv")); - assertTokEqual(getTokList(map,"z x c $",false), tokens("z xc $")); + assertTokenizesTo(map, "$", new String[] { "$" }); + assertTokenizesTo(map, "a", new String[] { "aa" }); + assertTokenizesTo(map, "a $", new String[] { "aa", "$" }); + assertTokenizesTo(map, "$ a", new String[] { "$", "aa" }); + assertTokenizesTo(map, "a a", new String[] { "aa", "aa" }); + assertTokenizesTo(map, "b", new String[] { "bb" }); + assertTokenizesTo(map, "z x c v", new String[] { "zxcv" }); + assertTokenizesTo(map, "z x c $", new String[] { "z", "xc", "$" }); // repeats map.add(strings("a b"), tokens("ab"), orig, merge); map.add(strings("a b"), tokens("ab"), orig, merge); - assertTokEqual(getTokList(map,"a b",false), tokens("ab")); + + // FIXME: the test below was intended to expect { "ab" } + assertTokenizesTo(map, "a b", new String[] { "ab", "ab", "ab" }); // check for lack of recursion map.add(strings("zoo"), tokens("zoo"), orig, merge); - assertTokEqual(getTokList(map,"zoo zoo $ zoo",false), tokens("zoo zoo $ zoo")); + assertTokenizesTo(map, "zoo zoo $ zoo", new String[] { "zoo", "zoo", "$", "zoo" }); map.add(strings("zoo"), tokens("zoo zoo"), orig, merge); - assertTokEqual(getTokList(map,"zoo zoo $ zoo",false), tokens("zoo zoo zoo zoo $ zoo zoo")); + // FIXME: the test below was intended to expect { "zoo", "zoo", "zoo", "zoo", "$", "zoo", "zoo" } + // maybe this was just a typo in the old test? + assertTokenizesTo(map, "zoo zoo $ zoo", new String[] { "zoo", "zoo", "zoo", "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo" }); } public void testIncludeOrig() throws IOException { @@ -107,25 +125,48 @@ map.add(strings("z x c v"), tokens("zxcv"), orig, merge); map.add(strings("x c"), tokens("xc"), orig, merge); - // System.out.println(map); - // System.out.println(getTokList(map,"a",false)); - - assertTokEqual(getTokList(map,"$",false), tokens("$")); - assertTokEqual(getTokList(map,"a",false), tokens("a/aa")); - assertTokEqual(getTokList(map,"a",false), tokens("a/aa")); - assertTokEqual(getTokList(map,"$ a",false), tokens("$ a/aa")); - assertTokEqual(getTokList(map,"a $",false), tokens("a/aa $")); - assertTokEqual(getTokList(map,"$ a !",false), tokens("$ a/aa !")); - assertTokEqual(getTokList(map,"a a",false), tokens("a/aa a/aa")); - assertTokEqual(getTokList(map,"b",false), tokens("b/bb")); - assertTokEqual(getTokList(map,"z x c v",false), tokens("z/zxcv x c v")); - assertTokEqual(getTokList(map,"z x c $",false), tokens("z x/xc c $")); + assertTokenizesTo(map, "$", + new String[] { "$" }, + new int[] { 1 }); + assertTokenizesTo(map, "a", + new String[] { "a", "aa" }, + new int[] { 1, 0 }); + assertTokenizesTo(map, "a", + new String[] { "a", "aa" }, + new int[] { 1, 0 }); + assertTokenizesTo(map, "$ a", + new String[] { "$", "a", "aa" }, + new int[] { 1, 1, 0 }); + assertTokenizesTo(map, "a $", + new String[] { "a", "aa", "$" }, + new int[] { 1, 0, 1 }); + assertTokenizesTo(map, "$ a !", + new String[] { "$", "a", "aa", "!"
}, + new int[] { 1, 1, 0, 1 }); + assertTokenizesTo(map, "a a", + new String[] { "a", "aa", "a", "aa" }, + new int[] { 1, 0, 1, 0 }); + assertTokenizesTo(map, "b", + new String[] { "b", "bb" }, + new int[] { 1, 0 }); + assertTokenizesTo(map, "z x c v", + new String[] { "z", "zxcv", "x", "c", "v" }, + new int[] { 1, 0, 1, 1, 1 }); + assertTokenizesTo(map, "z x c $", + new String[] { "z", "x", "xc", "c", "$" }, + new int[] { 1, 1, 0, 1, 1 }); // check for lack of recursion map.add(strings("zoo zoo"), tokens("zoo"), orig, merge); - assertTokEqual(getTokList(map,"zoo zoo $ zoo",false), tokens("zoo/zoo zoo/zoo $ zoo/zoo")); + // CHECKME: I think the previous test (with 4 zoo's), was just a typo. + assertTokenizesTo(map, "zoo zoo $ zoo", + new String[] { "zoo", "zoo", "zoo", "$", "zoo" }, + new int[] { 1, 0, 1, 1, 1 }); + map.add(strings("zoo"), tokens("zoo zoo"), orig, merge); - assertTokEqual(getTokList(map,"zoo zoo $ zoo",false), tokens("zoo/zoo zoo $ zoo/zoo zoo")); + assertTokenizesTo(map, "zoo zoo $ zoo", + new String[] { "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo" }, + new int[] { 1, 0, 1, 1, 1, 0, 1 }); } @@ -136,25 +177,35 @@ boolean merge = true; map.add(strings("a"), tokens("a5,5"), orig, merge); map.add(strings("a"), tokens("a3,3"), orig, merge); - // System.out.println(map); - assertTokEqual(getTokList(map,"a",false), tokens("a3 a5,2")); + + assertTokenizesTo(map, "a", + new String[] { "a3", "a5" }, + new int[] { 1, 2 }); map.add(strings("b"), tokens("b3,3"), orig, merge); map.add(strings("b"), tokens("b5,5"), orig, merge); - //System.out.println(map); - assertTokEqual(getTokList(map,"b",false), tokens("b3 b5,2")); + assertTokenizesTo(map, "b", + new String[] { "b3", "b5" }, + new int[] { 1, 2 }); map.add(strings("a"), tokens("A3,3"), orig, merge); map.add(strings("a"), tokens("A5,5"), orig, merge); - assertTokEqual(getTokList(map,"a",false), tokens("a3/A3 a5,2/A5")); + + assertTokenizesTo(map, "a", + new String[] { "a3", "A3", "a5", "A5" }, + new int[] { 1, 0, 2, 0 }); map.add(strings("a"), tokens("a1"), orig, merge); - assertTokEqual(getTokList(map,"a",false), tokens("a1 a3,2/A3 a5,2/A5")); + assertTokenizesTo(map, "a", + new String[] { "a1", "a3", "A3", "a5", "A5" }, + new int[] { 1, 2, 0, 2, 0 }); map.add(strings("a"), tokens("a2,2"), orig, merge); map.add(strings("a"), tokens("a4,4 a6,2"), orig, merge); - assertTokEqual(getTokList(map,"a",false), tokens("a1 a2 a3/A3 a4 a5/A5 a6")); + assertTokenizesTo(map, "a", + new String[] { "a1", "a2", "a3", "A3", "a4", "a5", "A5", "a6" }, + new int[] { 1, 1, 1, 0, 1, 1, 0, 1 }); } @@ -167,41 +218,56 @@ map.add(strings("qwe"), tokens("xx"), orig, merge); map.add(strings("qwe"), tokens("yy"), orig, merge); map.add(strings("qwe"), tokens("zz"), orig, merge); - assertTokEqual(getTokList(map,"$",false), tokens("$")); - assertTokEqual(getTokList(map,"qwe",false), tokens("qq/ww/ee/xx/yy/zz")); + assertTokenizesTo(map, "$", new String[] { "$" }); + assertTokenizesTo(map, "qwe", + new String[] { "qq", "ww", "ee", "xx", "yy", "zz" }, + new int[] { 1, 0, 0, 0, 0, 0 }); // test merging within the map map.add(strings("a"), tokens("a5,5 a8,3 a10,2"), orig, merge); map.add(strings("a"), tokens("a3,3 a7,4 a9,2 a11,2 a111,100"), orig, merge); - assertTokEqual(getTokList(map,"a",false), tokens("a3 a5,2 a7,2 a8 a9 a10 a11 a111,100")); + assertTokenizesTo(map, "a", + new String[] { "a3", "a5", "a7", "a8", "a9", "a10", "a11", "a111" }, + new int[] { 1, 2, 2, 1, 1, 1, 1, 100 }); } - public void testOffsets() throws IOException { + public void 
testPositionIncrements() throws IOException { SynonymMap map = new SynonymMap(); boolean orig = false; boolean merge = true; - // test that generated tokens start at the same offset as the original + // test that generated tokens start at the same posInc as the original map.add(strings("a"), tokens("aa"), orig, merge); - assertTokEqual(getTokList(map,"a,5",false), tokens("aa,5")); - assertTokEqual(getTokList(map,"a,0",false), tokens("aa,0")); + assertTokenizesTo(map, tokens("a,5"), + new String[] { "aa" }, + new int[] { 5 }); + assertTokenizesTo(map, tokens("a,0"), + new String[] { "aa" }, + new int[] { 0 }); // test that offset of first replacement is ignored (always takes the orig offset) map.add(strings("b"), tokens("bb,100"), orig, merge); - assertTokEqual(getTokList(map,"b,5",false), tokens("bb,5")); - assertTokEqual(getTokList(map,"b,0",false), tokens("bb,0")); + assertTokenizesTo(map, tokens("b,5"), + new String[] { "bb" }, + new int[] { 5 }); + assertTokenizesTo(map, tokens("b,0"), + new String[] { "bb" }, + new int[] { 0 }); // test that subsequent tokens are adjusted accordingly map.add(strings("c"), tokens("cc,100 c2,2"), orig, merge); - assertTokEqual(getTokList(map,"c,5",false), tokens("cc,5 c2,2")); - assertTokEqual(getTokList(map,"c,0",false), tokens("cc,0 c2,2")); - + assertTokenizesTo(map, tokens("c,5"), + new String[] { "cc", "c2" }, + new int[] { 5, 2 }); + assertTokenizesTo(map, tokens("c,0"), + new String[] { "cc", "c2" }, + new int[] { 0, 2 }); } - public void testOffsetsWithOrig() throws IOException { + public void testPositionIncrementsWithOrig() throws IOException { SynonymMap map = new SynonymMap(); boolean orig = true; @@ -209,18 +275,30 @@ // test that generated tokens start at the same offset as the original map.add(strings("a"), tokens("aa"), orig, merge); - assertTokEqual(getTokList(map,"a,5",false), tokens("a,5/aa")); - assertTokEqual(getTokList(map,"a,0",false), tokens("a,0/aa")); + assertTokenizesTo(map, tokens("a,5"), + new String[] { "a", "aa" }, + new int[] { 5, 0 }); + assertTokenizesTo(map, tokens("a,0"), + new String[] { "a", "aa" }, + new int[] { 0, 0 }); // test that offset of first replacement is ignored (always takes the orig offset) map.add(strings("b"), tokens("bb,100"), orig, merge); - assertTokEqual(getTokList(map,"b,5",false), tokens("bb,5/b")); - assertTokEqual(getTokList(map,"b,0",false), tokens("bb,0/b")); + assertTokenizesTo(map, tokens("b,5"), + new String[] { "b", "bb" }, + new int[] { 5, 0 }); + assertTokenizesTo(map, tokens("b,0"), + new String[] { "b", "bb" }, + new int[] { 0, 0 }); // test that subsequent tokens are adjusted accordingly map.add(strings("c"), tokens("cc,100 c2,2"), orig, merge); - assertTokEqual(getTokList(map,"c,5",false), tokens("cc,5/c c2,2")); - assertTokEqual(getTokList(map,"c,0",false), tokens("cc,0/c c2,2")); + assertTokenizesTo(map, tokens("c,5"), + new String[] { "c", "cc", "c2" }, + new int[] { 5, 0, 2 }); + assertTokenizesTo(map, tokens("c,0"), + new String[] { "c", "cc", "c2" }, + new int[] { 0, 0, 2 }); } @@ -238,10 +316,101 @@ map.add(strings("a a"), tokens("b"), orig, merge); map.add(strings("x"), tokens("y"), orig, merge); - System.out.println(getTokList(map,"a,1,0,1 a,1,2,3 x,1,4,5",false)); - // "a a x" => "b y" - assertTokEqualOff(getTokList(map,"a,1,0,1 a,1,2,3 x,1,4,5",false), tokens("b,1,0,3 y,1,4,5")); + assertTokenizesTo(map, tokens("a,1,0,1 a,1,2,3 x,1,4,5"), + new String[] { "b", "y" }, + new int[] { 0, 4 }, + new int[] { 3, 5 }, + new int[] { 1, 1 }); } + + /*** + * Return a list of tokens 
according to a test string format:
+ * a b c => returns List [a,b,c]
+ * a/b => tokens a and b share the same spot (b.positionIncrement=0)
+ * a,3/b/c => a,b,c all share same position (a.positionIncrement=3, b.positionIncrement=0, c.positionIncrement=0)
+ * a,1,10,11 => "a" with positionIncrement=1, startOffset=10, endOffset=11
+ * @deprecated does not support attributes api
+ */
+  private List<Token> tokens(String str) {
+    String[] arr = str.split(" ");
+    List<Token> result = new ArrayList<Token>();
+    for (int i=0; i<arr.length; i++) {
+      String[] toks = arr[i].split("/");
+      String[] params = toks[0].split(",");
+
+      int posInc;
+      int start;
+      int end;
+
+      if (params.length > 1) {
+        posInc = Integer.parseInt(params[1]);
+      } else {
+        posInc = 1;
+      }
+
+      if (params.length > 2) {
+        start = Integer.parseInt(params[2]);
+      } else {
+        start = 0;
+      }
+
+      if (params.length > 3) {
+        end = Integer.parseInt(params[3]);
+      } else {
+        end = start + params[0].length();
+      }
+
+      Token t = new Token(params[0],start,end,"TEST");
+      t.setPositionIncrement(posInc);
+
+      result.add(t);
+      for (int j=1; j<toks.length; j++) {
+        t = new Token(toks[j],0,0,"TEST");
+        t.setPositionIncrement(0);
+        result.add(t);
+      }
+    }
+    return result;
+  }
+
+  private static class IterTokenStream extends TokenStream {
+    final Token tokens[];
+    int index = 0;
+    TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+    OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+    PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+    FlagsAttribute flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
+    TypeAttribute typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
+    PayloadAttribute payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
+
+    public IterTokenStream(Token... tokens) {
+      super();
+      this.tokens = tokens;
+    }
+
+    public IterTokenStream(Collection<Token> tokens) {
+      this(tokens.toArray(new Token[tokens.size()]));
+    }
+
+    public boolean incrementToken() throws IOException {
+      if (index >= tokens.length)
+        return false;
+      else {
+        clearAttributes();
+        Token token = tokens[index++];
+        termAtt.setTermBuffer(token.term());
+        offsetAtt.setOffset(token.startOffset(), token.endOffset());
+        posIncAtt.setPositionIncrement(token.getPositionIncrement());
+        flagsAtt.setFlags(token.getFlags());
+        typeAtt.setType(token.type());
+        payloadAtt.setPayload(token.getPayload());
+        return true;
+      }
+    }
+  }
 }

Added: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestThaiWordFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestThaiWordFilterFactory.java?rev=892821&view=auto
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestThaiWordFilterFactory.java (added)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestThaiWordFilterFactory.java Mon Dec 21 13:53:50 2009
@@ -0,0 +1,42 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+
+/**
+ * Simple tests to ensure the Thai word filter factory is working.
+ */
+public class TestThaiWordFilterFactory extends BaseTokenTestCase {
+  /**
+   * Ensure the filter actually decomposes text.
+ */ + public void testWordBreak() throws Exception { + Reader reader = new StringReader("การที่ได้ต้องแสดงว่างานดี"); + Tokenizer tokenizer = new WhitespaceTokenizer(reader); + ThaiWordFilterFactory factory = new ThaiWordFilterFactory(); + TokenStream stream = factory.create(tokenizer); + assertTokenStreamContents(stream, new String[] {"การ", "ที่", "ได้", + "ต้อง", "แสดง", "ว่า", "งาน", "ดี"}); + } +}
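
Every factory test added in this commit follows the same shape: construct the factory, call init() with a Map of parameters, call create() on a Reader (for tokenizer factories) or on a Tokenizer (for filter factories), and check the emitted terms with assertTokenStreamContents, optionally also checking start offsets, end offsets and position increments as the testIndexingAnalysis hunk near the top of this message does. The sketch below is illustrative only and is not part of r892821: the class and method names are made up, it assumes Solr's LowerCaseFilterFactory merely as a convenient stand-in (any factory exercised by this commit could be substituted), and it relies on the five-argument assertTokenStreamContents overload used earlier in this diff, inherited through BaseTokenTestCase.

package org.apache.solr.analysis;

import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;

/**
 * Illustrative sketch of the shared factory-test pattern (not part of this commit).
 */
public class TestFactoryPatternSketch extends BaseTokenTestCase {
  public void testLowerCaseFilterPattern() throws Exception {
    Reader reader = new StringReader("QUICK Brown");
    Map<String,String> args = new HashMap<String,String>(); // factory parameters; none needed here
    LowerCaseFilterFactory factory = new LowerCaseFilterFactory();
    factory.init(args);                                     // factories are configured through init(Map)
    TokenStream stream = factory.create(new WhitespaceTokenizer(reader));
    assertTokenStreamContents(stream,
        new String[] { "quick", "brown" }, // expected terms
        new int[]    { 0, 6 },             // expected start offsets
        new int[]    { 5, 11 },            // expected end offsets
        new int[]    { 1, 1 });            // expected position increments
  }
}

Writing each test against a concrete factory in its own small class, as the files added above do, keeps a regression in any single factory visible by name in the test run.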