cassandra-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From m..@apache.org
Subject [1/7] cassandra git commit: SASI tokenizer for simple delimiter based entries
Date Thu, 15 Mar 2018 09:45:41 GMT
Repository: cassandra
Updated Branches:
  refs/heads/cassandra-3.11 5a662ea3f -> 4c6a10af8
  refs/heads/trunk d4dfbb5c6 -> 92cde9b27


http://git-wip-us.apache.org/repos/asf/cassandra/blob/4c6a10af/test/unit/org/apache/cassandra/index/sasi/analyzer/DelimiterAnalyzerTest.java
----------------------------------------------------------------------
diff --git a/test/unit/org/apache/cassandra/index/sasi/analyzer/DelimiterAnalyzerTest.java
b/test/unit/org/apache/cassandra/index/sasi/analyzer/DelimiterAnalyzerTest.java
new file mode 100644
index 0000000..15bbb84
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sasi/analyzer/DelimiterAnalyzerTest.java
@@ -0,0 +1,282 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sasi.analyzer;
+
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.commons.io.IOUtils;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Unit tests for {@code DelimiterAnalyzer}, the SASI analyzer configured via
+ * {@code DelimiterTokenizingOptions.DELIMITER} to split a value into tokens
+ * on a single delimiter string.
+ *
+ * Every test follows the same shape: {@code init(options, type)} to configure
+ * the delimiter, {@code reset(ByteBuffer)} to feed input, then a
+ * {@code hasNext()}/{@code next()} loop to drain tokens.
+ *
+ * NOTE(review): the tests use raw {@code HashMap} double-brace initializers
+ * and charset-less {@code String#getBytes()} (platform-default charset);
+ * typed maps and an explicit UTF-8 charset would be safer — confirm intent
+ * with the analyzer's contract.
+ */
+public class DelimiterAnalyzerTest
+{
+
+    // Tokenizing on ' ' must preserve case: the re-joined tokens equal the
+    // original string and do NOT equal its lower-cased form.
+    // NOTE(review): method name misspells "Analyzer" as "Analizer".
+    @Test
+    public void caseSensitiveAnalizer() throws Exception
+    {
+        DelimiterAnalyzer analyzer = new DelimiterAnalyzer();
+
+        analyzer.init(
+            new HashMap()
+                {{
+                    put(DelimiterTokenizingOptions.DELIMITER, " ");
+                }},
+            UTF8Type.instance);
+
+        String testString = "Nip it in the bud";
+        ByteBuffer toAnalyze = ByteBuffer.wrap(testString.getBytes());
+        analyzer.reset(toAnalyze);
+        StringBuilder output = new StringBuilder();
+        while (analyzer.hasNext())
+            output.append(ByteBufferUtil.string(analyzer.next()) + (analyzer.hasNext() ?
' ' : ""));
+
+        Assert.assertTrue(testString.equals(output.toString()));
+        Assert.assertFalse(testString.toLowerCase().equals(output.toString()));
+    }
+
+    // init() with a non-text column type (Int32Type) must be rejected with
+    // an IllegalArgumentException.
+    @Test(expected = IllegalArgumentException.class)
+    public void ensureIncompatibleInputSkipped() throws Exception
+    {
+        new DelimiterAnalyzer().init(new HashMap(), Int32Type.instance);
+    }
+
+    // Space-delimited tokenization of the bundled lorem_ipsum.txt resource;
+    // expects exactly 69 tokens.
+    @Test
+    public void testTokenizationLoremIpsum() throws Exception
+    {
+        ByteBuffer bb = ByteBuffer.wrap(IOUtils.toByteArray(
+                DelimiterAnalyzerTest.class.getClassLoader().getResourceAsStream("tokenization/lorem_ipsum.txt")));
+
+        DelimiterAnalyzer tokenizer = new DelimiterAnalyzer();
+
+        tokenizer.init(
+            new HashMap()
+                {{
+                    put(DelimiterTokenizingOptions.DELIMITER, " ");
+                }},
+            UTF8Type.instance);
+
+        List<ByteBuffer> tokens = new ArrayList<>();
+        tokenizer.reset(bb);
+        while (tokenizer.hasNext())
+            tokens.add(tokenizer.next());
+
+        assertEquals(69, tokens.size());
+
+    }
+
+    // Japanese sample split on the ideographic full stop "。" — demonstrates
+    // that a multi-byte (non-ASCII) delimiter works; expects 4 tokens.
+    @Test
+    public void testTokenizationJaJp1() throws Exception
+    {
+        ByteBuffer bb = ByteBuffer.wrap(IOUtils.toByteArray(
+                DelimiterAnalyzerTest.class.getClassLoader().getResourceAsStream("tokenization/ja_jp_1.txt")));
+
+        DelimiterAnalyzer tokenizer = new DelimiterAnalyzer();
+
+        tokenizer.init(
+            new HashMap()
+                {{
+                    put(DelimiterTokenizingOptions.DELIMITER, "。");
+                }},
+            UTF8Type.instance);
+
+        tokenizer.reset(bb);
+        List<ByteBuffer> tokens = new ArrayList<>();
+        while (tokenizer.hasNext())
+            tokens.add(tokenizer.next());
+
+        assertEquals(4, tokens.size());
+    }
+
+    // Second Japanese sample, same "。" delimiter; expects 2 tokens.
+    @Test
+    public void testTokenizationJaJp2() throws Exception
+    {
+        ByteBuffer bb = ByteBuffer.wrap(IOUtils.toByteArray(
+                DelimiterAnalyzerTest.class.getClassLoader().getResourceAsStream("tokenization/ja_jp_2.txt")));
+
+        DelimiterAnalyzer tokenizer = new DelimiterAnalyzer();
+
+        tokenizer.init(
+            new HashMap()
+                {{
+                    put(DelimiterTokenizingOptions.DELIMITER, "。");
+                }},
+            UTF8Type.instance);
+
+        tokenizer.reset(bb);
+        List<ByteBuffer> tokens = new ArrayList<>();
+        while (tokenizer.hasNext())
+            tokens.add(tokenizer.next());
+
+        assertEquals(2, tokens.size());
+    }
+
+    // Russian (Cyrillic) sample split on spaces; expects 447 tokens.
+    @Test
+    public void testTokenizationRuRu1() throws Exception
+    {
+        ByteBuffer bb = ByteBuffer.wrap(IOUtils.toByteArray(
+                DelimiterAnalyzerTest.class.getClassLoader().getResourceAsStream("tokenization/ru_ru_1.txt")));
+
+        DelimiterAnalyzer tokenizer = new DelimiterAnalyzer();
+
+        tokenizer.init(
+            new HashMap()
+                {{
+                    put(DelimiterTokenizingOptions.DELIMITER, " ");
+                }},
+            UTF8Type.instance);
+
+        List<ByteBuffer> tokens = new ArrayList<>();
+        tokenizer.reset(bb);
+        while (tokenizer.hasNext())
+            tokens.add(tokenizer.next());
+
+        assertEquals(447, tokens.size());
+    }
+
+    // Traditional Chinese sample split on spaces; expects 403 tokens.
+    @Test
+    public void testTokenizationZnTw1() throws Exception
+    {
+        ByteBuffer bb = ByteBuffer.wrap(IOUtils.toByteArray(
+                DelimiterAnalyzerTest.class.getClassLoader().getResourceAsStream("tokenization/zn_tw_1.txt")));
+
+        DelimiterAnalyzer tokenizer = new DelimiterAnalyzer();
+
+        tokenizer.init(
+            new HashMap()
+                {{
+                    put(DelimiterTokenizingOptions.DELIMITER, " ");
+                }},
+            UTF8Type.instance);
+
+        List<ByteBuffer> tokens = new ArrayList<>();
+        tokenizer.reset(bb);
+        while (tokenizer.hasNext())
+            tokens.add(tokenizer.next());
+
+        assertEquals(403, tokens.size());
+    }
+
+    // Large-input case: the full text of "Adventures of Huckleberry Finn"
+    // split on spaces; expects 104594 tokens.
+    @Test
+    public void testTokenizationAdventuresOfHuckFinn() throws Exception
+    {
+        ByteBuffer bb = ByteBuffer.wrap(IOUtils.toByteArray(
+                DelimiterAnalyzerTest.class.getClassLoader().getResourceAsStream("tokenization/adventures_of_huckleberry_finn_mark_twain.txt")));
+
+        DelimiterAnalyzer tokenizer = new DelimiterAnalyzer();
+
+        tokenizer.init(
+            new HashMap()
+                {{
+                    put(DelimiterTokenizingOptions.DELIMITER, " ");
+                }},
+            UTF8Type.instance);
+
+        List<ByteBuffer> tokens = new ArrayList<>();
+        tokenizer.reset(bb);
+        while (tokenizer.hasNext())
+            tokens.add(tokenizer.next());
+
+        assertEquals(104594, tokens.size());
+    }
+
+    // CSV resource split on ',' rather than whitespace; expects 122265 tokens.
+    @Test
+    public void testWorldCities() throws Exception
+    {
+        ByteBuffer bb = ByteBuffer.wrap(IOUtils.toByteArray(
+                DelimiterAnalyzerTest.class.getClassLoader().getResourceAsStream("tokenization/world_cities_a.csv")));
+
+        DelimiterAnalyzer tokenizer = new DelimiterAnalyzer();
+
+        tokenizer.init(
+            new HashMap()
+                {{
+                    put(DelimiterTokenizingOptions.DELIMITER, ",");
+                }},
+            UTF8Type.instance);
+
+        List<ByteBuffer> tokens = new ArrayList<>();
+        tokenizer.reset(bb);
+        while (tokenizer.hasNext())
+            tokens.add(tokenizer.next());
+
+        assertEquals(122265, tokens.size());
+    }
+
+    // Domain names / URLs split on spaces — the delimiter analyzer should not
+    // break tokens on '.', '/' etc. the way a word tokenizer would; expects
+    // 12 tokens.
+    @Test
+    public void tokenizeDomainNamesAndUrls() throws Exception
+    {
+        ByteBuffer bb = ByteBuffer.wrap(IOUtils.toByteArray(
+                DelimiterAnalyzerTest.class.getClassLoader().getResourceAsStream("tokenization/top_visited_domains.txt")));
+
+        DelimiterAnalyzer tokenizer = new DelimiterAnalyzer();
+
+        tokenizer.init(
+            new HashMap()
+                {{
+                    put(DelimiterTokenizingOptions.DELIMITER, " ");
+                }},
+            UTF8Type.instance);
+
+        tokenizer.reset(bb);
+
+        List<ByteBuffer> tokens = new ArrayList<>();
+        while (tokenizer.hasNext())
+            tokens.add(tokenizer.next());
+
+        assertEquals(12, tokens.size());
+    }
+
+    // A single analyzer instance is reset() across five different buffers;
+    // the 26 tokens collected across all inputs confirm reset() fully
+    // reinitializes internal state between values (no carry-over, no loss).
+    @Test
+    public void testReuseAndResetTokenizerInstance() throws Exception
+    {
+        List<ByteBuffer> bbToTokenize = new ArrayList<>();
+        bbToTokenize.add(ByteBuffer.wrap("Nip it in the bud".getBytes()));
+        bbToTokenize.add(ByteBuffer.wrap("I couldn’t care less".getBytes()));
+        bbToTokenize.add(ByteBuffer.wrap("One and the same".getBytes()));
+        bbToTokenize.add(ByteBuffer.wrap("The squeaky wheel gets the grease.".getBytes()));
+        bbToTokenize.add(ByteBuffer.wrap("The pen is mightier than the sword.".getBytes()));
+
+        DelimiterAnalyzer tokenizer = new DelimiterAnalyzer();
+
+        tokenizer.init(
+            new HashMap()
+                {{
+                    put(DelimiterTokenizingOptions.DELIMITER, " ");
+                }},
+            UTF8Type.instance);
+
+        List<ByteBuffer> tokens = new ArrayList<>();
+        for (ByteBuffer bb : bbToTokenize)
+        {
+            tokenizer.reset(bb);
+            while (tokenizer.hasNext())
+                tokens.add(tokenizer.next());
+        }
+        assertEquals(26, tokens.size());
+    }
+
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@cassandra.apache.org
For additional commands, e-mail: commits-help@cassandra.apache.org


Mime
View raw message