asterixdb-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From wangs...@apache.org
Subject [1/3] asterixdb git commit: ASTERIXDB-1877: Tokenizer in FullText Search fix
Date Mon, 10 Apr 2017 19:47:16 GMT
Repository: asterixdb
Updated Branches:
  refs/heads/master 90fb051a0 -> ab01c87e5


http://git-wip-us.apache.org/repos/asf/asterixdb/blob/ab01c87e/asterixdb/asterix-app/src/test/resources/runtimets/queries/fulltext/fulltext-08/fulltext-08.1.ddl.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fulltext/fulltext-08/fulltext-08.1.ddl.aql
b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fulltext/fulltext-08/fulltext-08.1.ddl.aql
new file mode 100644
index 0000000..b6732c4
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fulltext/fulltext-08/fulltext-08.1.ddl.aql
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ *  Description     : Full-text search non-index test
+ *                  : This test is intended to verify that the full-text search works as
expected.
+ *                  : The queries are plain scans over ds_tweet (no join, no full-text index),
+ *                  : so the three queries are expected to return different results.
+ *                  : query #3 - one string value in [an ordered list], without any option.
+ *                  : query #4 - two string values in [an ordered list], without any option.
+ *                  :            omitting the option is equivalent to "all", which enforces
+ *                  :            a conjunctive search.
+ *                  : query #5 - the same two values as query #4, but with the "any" option,
+ *                  :            which enforces a disjunctive search.
+ *  Expected Result : Success
+ *
+*/
+
+drop dataverse twitter if exists;
+create dataverse twitter if not exists;
+use dataverse twitter
+create type typeUser if not exists as open {
+    id: int64,
+    name: string,
+    screen_name : string,
+    lang : string,
+    location: string,
+    create_at: date,
+    description: string,
+    followers_count: int32,
+    friends_count: int32,
+    statues_count: int64
+}
+create type typePlace if not exists as open{
+    country : string,
+    country_code : string,
+    full_name : string,
+    id : string,
+    name : string,
+    place_type : string,
+    bounding_box : rectangle
+}
+create type typeGeoTag if not exists as open {
+    stateID: int32,
+    stateName: string,
+    countyID: int32,
+    countyName: string,
+    cityID: int32?,
+    cityName: string?
+}
+create type typeTweet if not exists as open{
+    create_at : datetime,
+    id: int64,
+    "text": string,
+    in_reply_to_status : int64,
+    in_reply_to_user : int64,
+    favorite_count : int64,
+    coordinate: point?,
+    retweet_count : int64,
+    lang : string,
+    is_retweet: boolean,
+    hashtags : {{ string }} ?,
+    user_mentions : {{ int64 }} ? ,
+    user : typeUser,
+    place : typePlace?,
+    geo_tag: typeGeoTag
+}
+create dataset ds_tweet(typeTweet) if not exists primary key id
+using compaction policy prefix (("max-mergable-component-size"="134217728"),("max-tolerance-component-count"="10"))
with filter on create_at;

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/ab01c87e/asterixdb/asterix-app/src/test/resources/runtimets/queries/fulltext/fulltext-08/fulltext-08.2.update.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fulltext/fulltext-08/fulltext-08.2.update.aql
b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fulltext/fulltext-08/fulltext-08.2.update.aql
new file mode 100644
index 0000000..6947e27
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fulltext/fulltext-08/fulltext-08.2.update.aql
@@ -0,0 +1,24 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+use dataverse twitter;
+
+load dataset ds_tweet
+using localfs
+(("path"="asterix_nc1://data/fulltext/cloudberry_sample_tweet.adm"),("format"="adm"));

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/ab01c87e/asterixdb/asterix-app/src/test/resources/runtimets/queries/fulltext/fulltext-08/fulltext-08.3.query.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fulltext/fulltext-08/fulltext-08.3.query.aql
b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fulltext/fulltext-08/fulltext-08.3.query.aql
new file mode 100644
index 0000000..c795d40
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fulltext/fulltext-08/fulltext-08.3.query.aql
@@ -0,0 +1,26 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+use dataverse twitter;
+
+for $t in dataset twitter.ds_tweet
+where ftcontains($t.'text', ['good'])
+order by $t.id
+return {"id":$t.id}
+

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/ab01c87e/asterixdb/asterix-app/src/test/resources/runtimets/queries/fulltext/fulltext-08/fulltext-08.4.query.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fulltext/fulltext-08/fulltext-08.4.query.aql
b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fulltext/fulltext-08/fulltext-08.4.query.aql
new file mode 100644
index 0000000..8892da0
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fulltext/fulltext-08/fulltext-08.4.query.aql
@@ -0,0 +1,26 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+use dataverse twitter;
+
+for $t in dataset twitter.ds_tweet
+where ftcontains($t.'text', ['good','difficult'])
+order by $t.id
+return {"id":$t.id}
+

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/ab01c87e/asterixdb/asterix-app/src/test/resources/runtimets/queries/fulltext/fulltext-08/fulltext-08.5.query.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fulltext/fulltext-08/fulltext-08.5.query.aql
b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fulltext/fulltext-08/fulltext-08.5.query.aql
new file mode 100644
index 0000000..b3c61b3
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fulltext/fulltext-08/fulltext-08.5.query.aql
@@ -0,0 +1,26 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+use dataverse twitter;
+
+for $t in dataset twitter.ds_tweet
+where ftcontains($t.'text', ['good','difficult'], {'mode':'any'})
+order by $t.id
+return {"id":$t.id}
+

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/ab01c87e/asterixdb/asterix-app/src/test/resources/runtimets/results/fulltext/fulltext-08/fulltext-08.3.adm
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/fulltext/fulltext-08/fulltext-08.3.adm
b/asterixdb/asterix-app/src/test/resources/runtimets/results/fulltext/fulltext-08/fulltext-08.3.adm
new file mode 100644
index 0000000..85c3c4f
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/fulltext/fulltext-08/fulltext-08.3.adm
@@ -0,0 +1,3 @@
+{ "id": 668945643054870528 }
+{ "id": 668945646725017600 }
+{ "id": 668945653892911104 }

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/ab01c87e/asterixdb/asterix-app/src/test/resources/runtimets/results/fulltext/fulltext-08/fulltext-08.4.adm
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/fulltext/fulltext-08/fulltext-08.4.adm
b/asterixdb/asterix-app/src/test/resources/runtimets/results/fulltext/fulltext-08/fulltext-08.4.adm
new file mode 100644
index 0000000..17babd8
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/fulltext/fulltext-08/fulltext-08.4.adm
@@ -0,0 +1 @@
+{ "id": 668945643054870528 }

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/ab01c87e/asterixdb/asterix-app/src/test/resources/runtimets/results/fulltext/fulltext-08/fulltext-08.5.adm
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/fulltext/fulltext-08/fulltext-08.5.adm
b/asterixdb/asterix-app/src/test/resources/runtimets/results/fulltext/fulltext-08/fulltext-08.5.adm
new file mode 100644
index 0000000..2d91ff6
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/fulltext/fulltext-08/fulltext-08.5.adm
@@ -0,0 +1,4 @@
+{ "id": 668945643054870528 }
+{ "id": 668945646725017600 }
+{ "id": 668945651263115264 }
+{ "id": 668945653892911104 }

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/ab01c87e/asterixdb/asterix-app/src/test/resources/runtimets/testsuite.xml
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite.xml b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite.xml
index 7a486b7..956ea53 100644
--- a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite.xml
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite.xml
@@ -421,6 +421,11 @@
       </compilation-unit>
     </test-case>
     <test-case FilePath="fulltext">
+      <compilation-unit name="fulltext-08">
+        <output-dir compare="Text">fulltext-08</output-dir>
+      </compilation-unit>
+    </test-case>
+    <test-case FilePath="fulltext">
       <compilation-unit name="fulltext-index-01">
         <output-dir compare="Text">fulltext-index-01</output-dir>
       </compilation-unit>

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/ab01c87e/asterixdb/asterix-doc/src/site/markdown/aql/fulltext.md
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-doc/src/site/markdown/aql/fulltext.md b/asterixdb/asterix-doc/src/site/markdown/aql/fulltext.md
index 4fe17ac..bc0b398 100644
--- a/asterixdb/asterix-doc/src/site/markdown/aql/fulltext.md
+++ b/asterixdb/asterix-doc/src/site/markdown/aql/fulltext.md
@@ -56,6 +56,14 @@ Thus, "Voice" or "voice" will be evaluated as the same word.
 
 The DDL and DML of TinySocial can be found in [ADM: Modeling Semistructed Data in AsterixDB](primer.html#ADM:_Modeling_Semistructed_Data_in_AsterixDB).
 
+The same query can also be expressed in SQL++.
+
+        use TinySocial;
+
+        select element {"id":msg.id}
+        from TweetMessages as msg
+        where TinySocial.ftcontains(msg.`message-text`, "voice", {"mode":"any"})
+
 The `Expression1` is an expression that should be evaluable as a string at runtime as in
the above example
 where `$msg.message-text` is a string field. The `Expression2` can be a string, an (un)ordered
list
 of string value(s), or an expression. In the last case, the given expression should be evaluable
@@ -103,3 +111,13 @@ or “sound is not clear. You may need to install a new system.”
 
        ... where ftcontains($msg.message-text, ["sound", "system"], {"mode":"all"})
        ... where ftcontains($msg.message-text, ["sound", "system"])
+
+
+## <a id="FulltextIndex">Creating and utilizing a Full-text index</a> <font
size="4"><a href="#toc">[Back to TOC]</a></font> ##
+
+When there is a full-text index on the field that is being searched, rather than scanning
all records,
+AsterixDB can utilize that index to expedite the execution of a FTS query. To create a full-text
index,
+you need to specify the index type as `fulltext` in your DDL statement. For instance, the
following DDL
+statement creates a full-text index on the TweetMessages.message-text attribute.
+
+    create index messageFTSIdx on TweetMessages(message-text) type fulltext;

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/ab01c87e/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/FullTextContainsEvaluator.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/FullTextContainsEvaluator.java
b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/FullTextContainsEvaluator.java
index b94821f..0a48f6f 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/FullTextContainsEvaluator.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/FullTextContainsEvaluator.java
@@ -244,7 +244,16 @@ public class FullTextContainsEvaluator implements IScalarEvaluator {
         int queryTokenCount = 0;
         int uniqueQueryTokenCount = 0;
 
+        int numBytesToStoreLength;
+
         // Reset the tokenizer for the given keywords in the given query
+        if (typeTag2 == ATypeTag.STRING) {
+            // How many bytes are required to store the length of the given token?
+            numBytesToStoreLength = UTF8StringUtil
+                    .getNumBytesToStoreLength(UTF8StringUtil.getUTFLength(queryArray, queryArrayStartOffset));
+            queryArrayStartOffset = queryArrayStartOffset + numBytesToStoreLength;
+            queryArrayLength = queryArrayLength - numBytesToStoreLength;
+        }
         tokenizerForRightArray.reset(queryArray, queryArrayStartOffset, queryArrayLength);
 
         // Create tokens from the given query predicate
@@ -256,7 +265,6 @@ public class FullTextContainsEvaluator implements IScalarEvaluator {
             // We don't store the actual value of this token since we can access it via offset
and length.
             int tokenOffset = tokenizerForRightArray.getToken().getStartOffset();
             int tokenLength = tokenizerForRightArray.getToken().getTokenLength();
-            int numBytesToStoreLength;
 
             // If a token comes from a string tokenizer, each token doesn't have the length
data
             // in the beginning. Instead, if a token comes from an (un)ordered list, each
token has
@@ -352,7 +360,14 @@ public class FullTextContainsEvaluator implements IScalarEvaluator {
 
         // The left side: field (document)
         // Resets the tokenizer for the given keywords in a document.
-        tokenizerForLeftArray.reset(arg1.getByteArray(), arg1.getStartOffset(), arg1.getLength());
+
+        // How many bytes are required to store the length of the given string?
+        int numBytesToStoreLength = UTF8StringUtil
+                .getNumBytesToStoreLength(UTF8StringUtil.getUTFLength(arg1.getByteArray(),
arg1.getStartOffset()));
+        int startOffset = arg1.getStartOffset() + numBytesToStoreLength;
+        int length = arg1.getLength() - numBytesToStoreLength;
+
+        tokenizerForLeftArray.reset(arg1.getByteArray(), startOffset, length);
 
         // Creates tokens from a field in the left side (document)
         while (tokenizerForLeftArray.hasNext()) {


Mime
View raw message