asterixdb-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From wangs...@apache.org
Subject asterixdb git commit: ASTERIXDB-1778: Optimize the edit-distance-check function
Date Sun, 05 Feb 2017 08:21:46 GMT
Repository: asterixdb
Updated Branches:
  refs/heads/master 2343e1c13 -> 56189fb11


ASTERIXDB-1778: Optimize the edit-distance-check function

 - Only calculate 2 * (threshold + 1) cells, rather than all cells per row.
 - Terminate the calculation steps early when it become obvious that
   the possible edit-distance value is greater than the given threshold.
   There is no reason to compute all cells in the 2 dimensional array.
 - Move the location of IListIterator to Hyracks since we now have
   a CharacterIterator in a String. Change the name to ISequenceIterator.
 - Add the section for the function in the manual.
 - Remove letter counting filtering method since it is only applicable for
   the string in ASCII range (0 ~ 127).

Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Reviewed-on: https://asterix-gerrit.ics.uci.edu/1481
Sonar-Qube: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Tested-by: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
BAD: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Integration-Tests: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Reviewed-by: Jianfeng Jia <jianfeng.jia@gmail.com>


Project: http://git-wip-us.apache.org/repos/asf/asterixdb/repo
Commit: http://git-wip-us.apache.org/repos/asf/asterixdb/commit/56189fb1
Tree: http://git-wip-us.apache.org/repos/asf/asterixdb/tree/56189fb1
Diff: http://git-wip-us.apache.org/repos/asf/asterixdb/diff/56189fb1

Branch: refs/heads/master
Commit: 56189fb1116a25aca29b3989814ceea00d470e39
Parents: 2343e1c
Author: Taewoo Kim <wangsaeu@yahoo.com>
Authored: Sat Feb 4 14:45:51 2017 -0800
Committer: Taewoo Kim <wangsaeu@gmail.com>
Committed: Sun Feb 5 00:20:27 2017 -0800

----------------------------------------------------------------------
 .../src/main/markdown/builtins/5_similarity.md  |  30 ++
 asterixdb/asterix-fuzzyjoin/pom.xml             |   4 +
 .../similarity/IGenericSimilarityMetric.java    |  30 +-
 .../fuzzyjoin/similarity/IListIterator.java     |  38 ---
 .../fuzzyjoin/similarity/SimilarityMetric.java  |  67 +----
 .../SimilarityMetricEditDistance.java           | 286 +++++++------------
 .../similarity/SimilarityMetricJaccard.java     |  23 +-
 .../SimilarityMetricEditDistanceTest.java       |  72 +++++
 .../common/AbstractAsterixListIterator.java     |   7 +-
 .../common/EditDistanceCheckEvaluator.java      |   8 +-
 .../common/EditDistanceEvaluator.java           |   8 +-
 .../SimilarityJaccardSortedCheckEvaluator.java  |   2 +-
 .../SimilarityJaccardSortedEvaluator.java       |   2 +-
 .../data/std/util/ISequenceIterator.java        |  40 +++
 .../std/util/UTF8StringCharByCharIterator.java  |  87 ++++++
 15 files changed, 389 insertions(+), 315 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/asterixdb/blob/56189fb1/asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md b/asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md
index 89ef0f7..cb3318f 100644
--- a/asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md
+++ b/asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md
@@ -47,6 +47,36 @@ including [edit distance](http://en.wikipedia.org/wiki/Levenshtein_distance) and
 
         2
 
+### edit_distance_check ###
+* Syntax:
+
+        edit_distance_check(expression1, expression2, threshold)
+
+* Checks whether the edit distance of `expression1` and `expression2` is within a given threshold.
+
+* Arguments:
+    * `expression1` : a `string` or a homogeneous `array` of a comparable item type.
+    * `expression2` : The same type as `expression1`.
+    * `threshold` : a `bigint` that represents the distance threshold.
+* Return Value:
+    * an `array` with two items:
+        * The first item contains a `boolean` value representing whether the edit distance of `expression1` and `expression2` is within the given threshold.
+        * The second item contains an `integer` that represents the edit distance of `expression1` and `expression2` if the first item is true.
+        * If the first item is false, then the second item is set to 2147483647.
+    * `missing` if any argument is a `missing` value,
+    * `null` if any argument is a `null` value but no argument is a `missing` value,
+    * a type error will be raised if:
+        * the first or second argument is any other non-string value,
+        * or, the third argument is any other non-bigint value.
+* Note: an [n_gram index](similarity.html#UsingIndexesToSupportSimilarityQueries) can be utilized for this function.
+* Example:
+
+        edit_distance_check("happy","hapr",2);
+
+
+* The expected result is:
+
+        [ true, 2 ]
 
 ### edit_distance_contains ###
 * Syntax:

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/56189fb1/asterixdb/asterix-fuzzyjoin/pom.xml
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/pom.xml b/asterixdb/asterix-fuzzyjoin/pom.xml
index 0539782..9485852 100644
--- a/asterixdb/asterix-fuzzyjoin/pom.xml
+++ b/asterixdb/asterix-fuzzyjoin/pom.xml
@@ -82,6 +82,10 @@
       <groupId>org.apache.hyracks</groupId>
       <artifactId>hyracks-util</artifactId>
     </dependency>
+    <dependency>
+      <groupId>org.apache.hyracks</groupId>
+      <artifactId>hyracks-data-std</artifactId>
+    </dependency>
   </dependencies>
 
 </project>

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/56189fb1/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
index ac4a3dd..b213df2 100644
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
@@ -20,13 +20,33 @@
 package org.apache.asterix.fuzzyjoin.similarity;
 
 import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.data.std.util.ISequenceIterator;
 
 public interface IGenericSimilarityMetric {
-    // returns similarity
-    public float getSimilarity(IListIterator firstList, IListIterator secondList) throws HyracksDataException;
+    /**
+     * Returns the similarity value for the given two lists.
+     *
+     * @param firstSequence
+     *            an instance of {@link org.apache.hyracks.data.std.util.ISequenceIterator}
+     * @param secondSequence
+     *            an instance of {@link org.apache.hyracks.data.std.util.ISequenceIterator}
+     * @return a float similarity value
+     * @throws HyracksDataException
+     */
+    public float computeSimilarity(ISequenceIterator firstSequence, ISequenceIterator secondSequence)
+            throws HyracksDataException;
 
-    // returns -1 if does not satisfy threshold
-    // else returns similarity
-    public float getSimilarity(IListIterator firstList, IListIterator secondList, float simThresh)
+    /**
+     * Returns the similarity value for the given two lists. If the calculated similarity value
+     * doesn't satisfy the given simThresh value based on the function's check condition, this returns -1.
+     *
+     * @param firstSequence
+     *            an instance of {@link org.apache.hyracks.data.std.util.ISequenceIterator}
+     * @param secondSequence
+     *            an instance of {@link org.apache.hyracks.data.std.util.ISequenceIterator}
+     * @return a float similarity value.
+     * @throws HyracksDataException
+     */
+    public float computeSimilarity(ISequenceIterator firstSequence, ISequenceIterator secondSequence, float simThresh)
             throws HyracksDataException;
 }

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/56189fb1/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IListIterator.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IListIterator.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IListIterator.java
deleted file mode 100644
index 6c3d22e..0000000
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IListIterator.java
+++ /dev/null
@@ -1,38 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.similarity;
-
-import org.apache.hyracks.api.exceptions.HyracksDataException;
-
-public interface IListIterator {
-    public int compare(IListIterator cmpIter) throws HyracksDataException;
-
-    public byte[] getData();
-
-    public int getPos();
-
-    public boolean hasNext();
-
-    public void next() throws HyracksDataException;
-
-    public void reset() throws HyracksDataException;
-
-    public int size();
-}

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/56189fb1/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
index d36d60d..3348d4c 100644
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
@@ -21,10 +21,12 @@ package org.apache.asterix.fuzzyjoin.similarity;
 
 import org.apache.asterix.fuzzyjoin.tokenizer.Tokenizer;
 import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.data.std.util.ISequenceIterator;
 
 public abstract class SimilarityMetric {
 
-    public static int getIntersectSize(IListIterator tokensX, IListIterator tokensY) throws HyracksDataException {
+    public static int getIntersectSize(ISequenceIterator tokensX, ISequenceIterator tokensY)
+            throws HyracksDataException {
         int intersectSize = 0;
         while (tokensX.hasNext() && tokensY.hasNext()) {
             int cmp = tokensX.compare(tokensY);
@@ -64,23 +66,6 @@ public abstract class SimilarityMetric {
     }
 
     public static int getIntersectSize(int[] tokensX, int startX, int[] tokensY, int startY) {
-        // int intersectSize = 0;
-        //
-        // while (startX < tokensX.length && startY < tokensY.length) {
-        // int tokenX = tokensX[startX];
-        // int tokenY = tokensY[startY];
-        // if (tokenX > tokenY) {
-        // startY++;
-        // } else if (tokenX < tokenY) {
-        // startX++;
-        // } else {
-        // intersectSize++;
-        // startX++;
-        // startY++;
-        // }
-        // }
-        //
-        // return intersectSize;
         return getIntersectSize(tokensX, startX, tokensX.length, tokensY, startY, tokensY.length);
     }
 
@@ -131,52 +116,6 @@ public abstract class SimilarityMetric {
         return getPartialIntersectSize(tokensX, 0, tokensX.length, tokensY, 0, tokensY.length, tokenStop);
     }
 
-    // @SuppressWarnings("unchecked")
-    // public static int getIntersectSize(DataBag tokensX, DataBag tokensY) {
-    // int intersectSize = 0;
-    //
-    // Iterator<Tuple> iteratorX = tokensX.iterator();
-    // Iterator<Tuple> iteratorY = tokensY.iterator();
-    //
-    // Tuple nextX = null;
-    // Tuple nextY = null;
-    //
-    // while ((nextX != null || iteratorX.hasNext())
-    // && (nextY != null || iteratorY.hasNext())) {
-    // if (nextX == null) {
-    // nextX = iteratorX.next();
-    // }
-    // if (nextY == null) {
-    // nextY = iteratorY.next();
-    // }
-    //
-    // int cmp = nextX.compareTo(nextY);
-    // if (cmp > 0) {
-    // nextY = null;
-    // } else if (cmp < 0) {
-    // nextX = null;
-    // } else {
-    // intersectSize++;
-    // nextX = null;
-    // nextY = null;
-    // }
-    // }
-    //
-    // return intersectSize;
-    // }
-
-    // public abstract float getSimilarity(DataBag tokensX, DataBag tokensY);
-
-    // public abstract float getSimilarity(DataBag tokensX, int lengthX,
-    // DataBag tokensY, int lengthY);
-
-    public float getSimilarity(IListIterator tokensX, IListIterator tokensY) throws HyracksDataException {
-        int intersectionSize = SimilarityMetric.getIntersectSize(tokensX, tokensY);
-        int totalSize = tokensX.size() + tokensY.size();
-
-        return (float) intersectionSize / (totalSize - intersectionSize);
-    }
-
     public abstract float getSimilarity(int[] tokensX, int startX, int lengthX, int[] tokensY, int startY, int lengthY);
 
     public abstract float getSimilarity(int[] tokensX, int[] tokensY);

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/56189fb1/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
index 9dce89e..8003767 100644
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
@@ -19,39 +19,59 @@
 
 package org.apache.asterix.fuzzyjoin.similarity;
 
-import java.util.Arrays;
-
 import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.data.std.util.ISequenceIterator;
+import org.apache.hyracks.data.std.util.UTF8StringCharByCharIterator;
 import org.apache.hyracks.util.string.UTF8StringUtil;
 
 public class SimilarityMetricEditDistance implements IGenericSimilarityMetric {
 
-    // dp implementation only needs 2 rows
+    // This Dynamic Programming implementation only needs 2 rows.
     private final int rows = 2;
     private int cols;
     private int[][] matrix;
 
-    // for letter count filtering
-    private final int[] fsLcCount = new int[128];
-    private final int[] ssLcCount = new int[128];
+    // for string edit-distance calculation
+    private final UTF8StringCharByCharIterator leftIt = new UTF8StringCharByCharIterator();
+    private final UTF8StringCharByCharIterator rightIt = new UTF8StringCharByCharIterator();
+
+    public static final int SIMILARITY_THRESHOLD_NOT_SATISFIED_VALUE = -1;
 
     public SimilarityMetricEditDistance() {
         cols = 100; // arbitrary default value
         matrix = new int[rows][cols];
     }
 
-    @Override
-    public float getSimilarity(IListIterator firstList, IListIterator secondList) throws HyracksDataException {
-        int flLen = firstList.size();
-        int slLen = secondList.size();
+    /**
+     * Gets the edit distance value for the given two sequences using a Dynamic Programming approach.
+     * If a positive simThresh value is provided, this method only calculates 2 * (simThresh + 1) cells per row,
+     * not all the cells in a row as an optimization. Refer to https://en.wikipedia.org/wiki/Wagner–Fischer_algorithm
+     * for more details. Also, as one more optimization, during the calculation steps, if this method finds out
+     * that the final edit distance value cannot be within simThresh, this method stops the calculation
+     * and immediately returns -1.
+     * If the final edit distance value is less than or equal to simThresh, then that value will be returned.
+     * If a non-positive simThresh is given, then it calculates all cells and rows and returns
+     * the final edit distance value.
+     *
+     * @return the edit distance of the two lists. -1 if a positive simThresh value is given and the edit distance
+     *         value is greater than the given simThresh.
+     */
+    private float computeActualSimilarity(ISequenceIterator firstSequence, ISequenceIterator secondSequence,
+            float simThresh) throws HyracksDataException {
+        int flLen = firstSequence.size();
+        int slLen = secondSequence.size();
+
+        // When a positive threshold is given, then we can apply two optimizations.
+        int edThresh = (int) simThresh;
+        boolean canTerminateEarly = edThresh >= 0;
 
-        // reuse existing matrix if possible
+        // Reuses the existing matrix if possible.
         if (slLen >= cols) {
             cols = slLen + 1;
             matrix = new int[rows][cols];
         }
 
-        // init matrix
+        // Inits the matrix.
         for (int i = 0; i <= slLen; i++) {
             matrix[0][i] = i;
         }
@@ -59,20 +79,54 @@ public class SimilarityMetricEditDistance implements IGenericSimilarityMetric {
         int currRow = 1;
         int prevRow = 0;
 
-        // expand dynamic programming matrix row by row
+        int from = 1;
+        int to = slLen;
+        int minDistance = -1;
+
+        // Expands the dynamic programming matrix row by row.
         for (int i = 1; i <= flLen; i++) {
             matrix[currRow][0] = i;
 
-            secondList.reset();
-            for (int j = 1; j <= slLen; j++) {
+            secondSequence.reset();
+
+            // Only calculates 2 * (simThresh + 1) cells per row as an optimization.
+            // Also keeps minDistance to see whether the possible edit distance after
+            // each row calculation is greater than the simThresh.
+            if (canTerminateEarly) {
+                minDistance = edThresh + 1;
+                from = Math.max(i - edThresh - 1, 1);
+                to = Math.min(i + edThresh + 1, slLen);
+                for (int j = 1; j < from; j++) {
+                    // Moves the pointer of the second list to the point where the calculation starts for this row.
+                    secondSequence.next();
+                }
+                if (from > 1) {
+                    // Sets the left Boundary cell value to make sure that the calculation is correct.
+                    matrix[currRow][from - 1] = edThresh + 1;
+                }
+                if (to < slLen) {
+                    // Sets the right Boundary cell value to make sure that the calculation is correct.
+                    matrix[currRow][to + 1] = edThresh + 1;
+                }
+            }
+
+            for (int j = from; j <= to; j++) {
 
                 matrix[currRow][j] = Math.min(Math.min(matrix[prevRow][j] + 1, matrix[currRow][j - 1] + 1),
-                        matrix[prevRow][j - 1] + (firstList.compare(secondList) == 0 ? 0 : 1));
+                        matrix[prevRow][j - 1] + (firstSequence.compare(secondSequence) == 0 ? 0 : 1));
 
-                secondList.next();
-            }
+                // Replaces minDistance after each cell computation if we find a smaller value than that.
+                if (canTerminateEarly && matrix[currRow][j] < minDistance) {
+                    minDistance = matrix[currRow][j];
+                }
 
-            firstList.next();
+                secondSequence.next();
+            }
+            // If the minimum distance value is greater than the given threshold, no reason to process next row.
+            if (canTerminateEarly && minDistance > edThresh) {
+                return SIMILARITY_THRESHOLD_NOT_SATISFIED_VALUE;
+            }
+            firstSequence.next();
 
             int tmp = currRow;
             currRow = prevRow;
@@ -82,8 +136,12 @@ public class SimilarityMetricEditDistance implements IGenericSimilarityMetric {
         return matrix[prevRow][slLen];
     }
 
+    /**
+     * Gets the similarity value for the given two sequences. If the value doesn't satisfy the given simThresh,
+     * this method returns -1. Else, this returns the real similarity value.
+     */
     @Override
-    public float getSimilarity(IListIterator firstList, IListIterator secondList, float simThresh)
+    public float computeSimilarity(ISequenceIterator firstList, ISequenceIterator secondList, float simThresh)
             throws HyracksDataException {
 
         int edThresh = (int) simThresh;
@@ -93,18 +151,18 @@ public class SimilarityMetricEditDistance implements IGenericSimilarityMetric {
 
         // length filter
         if (Math.abs(flLen - slLen) > edThresh) {
-            return -1;
+            return SIMILARITY_THRESHOLD_NOT_SATISFIED_VALUE;
         }
 
-        float ed = getSimilarity(firstList, secondList);
-        if (ed > edThresh) {
-            return -1;
+        float ed = computeActualSimilarity(firstList, secondList, simThresh);
+        if (ed > edThresh || ed < 0) {
+            return SIMILARITY_THRESHOLD_NOT_SATISFIED_VALUE;
         } else {
             return ed;
         }
     }
 
-    public int getSimilarityContains(IListIterator exprList, IListIterator patternList, int simThresh)
+    public int getSimilarityContains(ISequenceIterator exprList, ISequenceIterator patternList, int simThresh)
             throws HyracksDataException {
         int exprLen = exprList.size();
         int patternLen = patternList.size();
@@ -148,182 +206,50 @@ public class SimilarityMetricEditDistance implements IGenericSimilarityMetric {
         }
 
         if (minEd > simThresh) {
-            return -1;
+            return SIMILARITY_THRESHOLD_NOT_SATISFIED_VALUE;
         } else {
             return minEd;
         }
     }
 
     // faster implementation for common case of string edit distance
-    public int UTF8StringEditDistance(byte[] leftBytes, int fsStart, byte[] rightBytes, int ssStart) {
-        int fsLen = UTF8StringUtil.getStringLength(leftBytes, fsStart);
-        int ssLen = UTF8StringUtil.getStringLength(rightBytes, ssStart);
-
-        int fsUtfLen = UTF8StringUtil.getUTFLength(leftBytes, fsStart);
-        int ssUtfLen = UTF8StringUtil.getUTFLength(rightBytes, ssStart);
-        int fsMetaLen = UTF8StringUtil.getNumBytesToStoreLength(fsUtfLen);
-        int ssMetaLen = UTF8StringUtil.getNumBytesToStoreLength(ssUtfLen);
-
-        // reuse existing matrix if possible
-        if (ssLen >= cols) {
-            cols = ssLen + 1;
-            matrix = new int[rows][cols];
-        }
-
-        int fsDataStart = fsStart + fsMetaLen;
-        int ssDataStart = ssStart + ssMetaLen;
-
-        // init matrix
-        for (int i = 0; i <= ssLen; i++) {
-            matrix[0][i] = i;
-        }
-
-        int currRow = 1;
-        int prevRow = 0;
-
-        // expand dynamic programming matrix row by row
-        int fsPos = fsDataStart;
-        for (int i = 1; i <= fsLen; i++) {
-            matrix[currRow][0] = i;
-            char fsChar = Character.toLowerCase(UTF8StringUtil.charAt(leftBytes, fsPos));
-            int ssPos = ssDataStart;
-            for (int j = 1; j <= ssLen; j++) {
-                char ssChar = Character.toLowerCase(UTF8StringUtil.charAt(rightBytes, ssPos));
-
-                matrix[currRow][j] = Math.min(Math.min(matrix[prevRow][j] + 1, matrix[currRow][j - 1] + 1),
-                        matrix[prevRow][j - 1] + (fsChar == ssChar ? 0 : 1));
-
-                ssPos += UTF8StringUtil.charSize(rightBytes, ssPos);
-            }
-            fsPos += UTF8StringUtil.charSize(leftBytes, fsPos);
-            int tmp = currRow;
-            currRow = prevRow;
-            prevRow = tmp;
-        }
-        return matrix[prevRow][ssLen];
+    public int getActualUTF8StringEditDistanceVal(byte[] leftBytes, int fsStart, byte[] rightBytes, int ssStart,
+            int edThresh) throws HyracksDataException {
+        leftIt.reset(leftBytes, fsStart);
+        rightIt.reset(rightBytes, ssStart);
+        return (int) computeActualSimilarity(leftIt, rightIt, edThresh);
     }
 
-    public int UTF8StringEditDistance(byte[] bytesLeft, int fsStart, byte[] bytesRight, int ssStart, int edThresh) {
+    public int UTF8StringEditDistance(byte[] bytesLeft, int fsStart, byte[] bytesRight, int ssStart, int edThresh)
+            throws HyracksDataException {
         int fsStrLen = UTF8StringUtil.getStringLength(bytesLeft, fsStart);
         int ssStrLen = UTF8StringUtil.getStringLength(bytesRight, ssStart);
 
-        int fsUtfLen = UTF8StringUtil.getUTFLength(bytesLeft, fsStart);
-        int ssUtfLen = UTF8StringUtil.getUTFLength(bytesRight, ssStart);
-        int fsMetaLen = UTF8StringUtil.getNumBytesToStoreLength(fsUtfLen);
-        int ssMetaLen = UTF8StringUtil.getNumBytesToStoreLength(ssUtfLen);
-
         // length filter
         if (Math.abs(fsStrLen - ssStrLen) > edThresh) {
-            return -1;
-        }
-
-        // initialize letter count filtering
-        Arrays.fill(fsLcCount, 0);
-        Arrays.fill(ssLcCount, 0);
-
-        // compute letter counts for first string
-        int fsPos = fsStart + fsMetaLen;
-        int fsEnd = fsPos + fsUtfLen;;
-        while (fsPos < fsEnd) {
-            char c = Character.toLowerCase(UTF8StringUtil.charAt(bytesLeft, fsPos));
-            if (c < 128) {
-                fsLcCount[c]++;
-            }
-            fsPos += UTF8StringUtil.charSize(bytesLeft, fsPos);
-        }
-
-        // compute letter counts for second string
-        int ssPos = ssStart + ssMetaLen;
-        int ssEnd = ssPos + ssUtfLen;
-        while (ssPos < ssEnd) {
-            char c = Character.toLowerCase(UTF8StringUtil.charAt(bytesRight, ssPos));
-            if (c < 128) {
-                ssLcCount[c]++;
-            }
-            ssPos += UTF8StringUtil.charSize(bytesRight, ssPos);
-        }
-
-        // apply filter
-        int gtSum = 0;
-        int ltSum = 0;
-        for (int i = 0; i < 128; i++) {
-            if (fsLcCount[i] > ssLcCount[i]) {
-                gtSum += fsLcCount[i] - ssLcCount[i];
-                if (gtSum > edThresh) {
-                    return -1;
-                }
-            } else {
-                ltSum += ssLcCount[i] - fsLcCount[i];
-                if (ltSum > edThresh) {
-                    return -1;
-                }
-            }
+            return SIMILARITY_THRESHOLD_NOT_SATISFIED_VALUE;
         }
 
-        int ed = UTF8StringEditDistance(bytesLeft, fsStart, bytesRight, ssStart);
-        if (ed > edThresh) {
-            return -1;
+        int ed = getActualUTF8StringEditDistanceVal(bytesLeft, fsStart, bytesRight, ssStart, edThresh);
+        if (ed > edThresh || ed < 0) {
+            return SIMILARITY_THRESHOLD_NOT_SATISFIED_VALUE;
         } else {
             return ed;
         }
     }
 
     // checks whether the first string contains a similar string to the second string
-    public int UTF8StringEditDistanceContains(byte[] strBytes, int stringStart, byte[] pattenBytes, int patternStart,
-            int edThresh) {
-
-        int stringLen = UTF8StringUtil.getStringLength(strBytes, stringStart);
-        int patternLen = UTF8StringUtil.getStringLength(pattenBytes, patternStart);
-
-        int stringUTFLen = UTF8StringUtil.getUTFLength(strBytes, stringStart);
-        int stringMetaLen = UTF8StringUtil.getNumBytesToStoreLength(stringUTFLen);
-
-        int patternUTFLen = UTF8StringUtil.getUTFLength(pattenBytes, patternStart);
-        int patternMetaLen = UTF8StringUtil.getNumBytesToStoreLength(patternUTFLen);
-
-        // reuse existing matrix if possible
-        if (patternLen >= cols) {
-            cols = patternLen + 1;
-            matrix = new int[rows][cols];
-        }
-
-        int stringDataStart = stringStart + stringMetaLen;
-        int patternDataStart = patternStart + patternMetaLen;
-
-        // init matrix
-        for (int i = 0; i <= patternLen; i++) {
-            matrix[0][i] = i;
-        }
-
-        int currRow = 1;
-        int prevRow = 0;
-        int minEd = Integer.MAX_VALUE;
-        // expand dynamic programming matrix row by row
-        int stringPos = stringDataStart;
-        for (int i = 1; i <= stringLen; i++) {
-            matrix[currRow][0] = 0;
-            char stringChar = Character.toLowerCase(UTF8StringUtil.charAt(strBytes, stringPos));
-
-            int patternPos = patternDataStart;
-            for (int j = 1; j <= patternLen; j++) {
-                char patternChar = Character.toLowerCase(UTF8StringUtil.charAt(pattenBytes, patternPos));
-                matrix[currRow][j] = Math.min(Math.min(matrix[prevRow][j] + 1, matrix[currRow][j - 1] + 1),
-                        matrix[prevRow][j - 1] + (stringChar == patternChar ? 0 : 1));
-                patternPos += UTF8StringUtil.charSize(pattenBytes, patternPos);
-                if (j == patternLen && matrix[currRow][patternLen] < minEd) {
-                    minEd = matrix[currRow][patternLen];
-                }
-            }
+    public int UTF8StringEditDistanceContains(byte[] strBytes, int stringStart, byte[] patternBytes, int patternStart,
+            int edThresh) throws HyracksDataException {
+        leftIt.reset(strBytes, stringStart);
+        rightIt.reset(patternBytes, patternStart);
+        return getSimilarityContains(leftIt, rightIt, edThresh);
+    }
 
-            stringPos += UTF8StringUtil.charSize(strBytes, stringPos);
-            int tmp = currRow;
-            currRow = prevRow;
-            prevRow = tmp;
-        }
-        if (minEd > edThresh) {
-            return -1;
-        } else {
-            return minEd;
-        }
+    @Override
+    public float computeSimilarity(ISequenceIterator firstSequence, ISequenceIterator secondSequence)
+            throws HyracksDataException {
+        // Passes -1 as the simThresh to calculate the edit distance without applying any calculation optimizations.
+        return computeActualSimilarity(firstSequence, secondSequence, -1);
     }
 }

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/56189fb1/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
index f4162c7..4a31b8b 100644
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
@@ -24,6 +24,7 @@ import java.util.TreeSet;
 
 import org.apache.asterix.fuzzyjoin.tokenizer.Tokenizer;
 import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.data.std.util.ISequenceIterator;
 
 public class SimilarityMetricJaccard extends SimilarityMetric implements IGenericSimilarityMetric {
 
@@ -44,24 +45,8 @@ public class SimilarityMetricJaccard extends SimilarityMetric implements IGeneri
         return ((float) setX.size()) / (tokensX.length + tokensY.length - setX.size());
     }
 
-    // @Override
-    // public float getSimilarity(DataBag tokensX, DataBag tokensY) {
-    // return getSimilarity(tokensX, (int) tokensX.size(), tokensY,
-    // (int) tokensY.size());
-    // }
-
-    // @Override
-    // public float getSimilarity(DataBag tokensX, int lengthX, DataBag tokensY,
-    // int lengthY) {
-    // int intersectionSize = SimilarityMetric.getIntersectSize(tokensX,
-    // tokensY);
-    // int totalSize = lengthX + lengthY;
-    //
-    // return (float) intersectionSize / (totalSize - intersectionSize);
-    // }
-
     @Override
-    public float getSimilarity(IListIterator tokensX, IListIterator tokensY) throws HyracksDataException {
+    public float computeSimilarity(ISequenceIterator tokensX, ISequenceIterator tokensY) throws HyracksDataException {
         int intersectionSize = SimilarityMetric.getIntersectSize(tokensX, tokensY);
         int totalSize = tokensX.size() + tokensY.size();
 
@@ -69,7 +54,7 @@ public class SimilarityMetricJaccard extends SimilarityMetric implements IGeneri
     }
 
     @Override
-    public float getSimilarity(IListIterator firstList, IListIterator secondList, float simThresh)
+    public float computeSimilarity(ISequenceIterator firstList, ISequenceIterator secondList, float simThresh)
             throws HyracksDataException {
 
         // apply length filter
@@ -81,7 +66,7 @@ public class SimilarityMetricJaccard extends SimilarityMetric implements IGeneri
             return -1f;
         }
 
-        float jacc = getSimilarity(firstList, secondList);
+        float jacc = computeSimilarity(firstList, secondList);
         if (jacc < simThresh) {
             return -1f;
         } else {

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/56189fb1/asterixdb/asterix-fuzzyjoin/src/test/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistanceTest.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/src/test/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistanceTest.java b/asterixdb/asterix-fuzzyjoin/src/test/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistanceTest.java
new file mode 100644
index 0000000..caa80bc
--- /dev/null
+++ b/asterixdb/asterix-fuzzyjoin/src/test/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistanceTest.java
@@ -0,0 +1,72 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.asterix.fuzzyjoin.similarity;
+
+import static org.apache.hyracks.data.std.primitive.UTF8StringPointable.generateUTF8Pointable;
+import static org.junit.Assert.assertEquals;
+
+import org.apache.hyracks.data.std.primitive.UTF8StringPointable;
+import org.junit.Test;
+
+public class SimilarityMetricEditDistanceTest {
+
+    private static final SimilarityMetricEditDistance ed = new SimilarityMetricEditDistance();
+
+    @Test
+    public void test() throws Exception {
+        // For this case, the edit-distance of two strings is 3.
+        UTF8StringPointable leftStrPointable1 = generateUTF8Pointable("coupon not available in store");
+        UTF8StringPointable rightStrPointable1 = generateUTF8Pointable("coupon is available in store");
+
+        // The edit-distance between leftStrPointable1 and the following is 14.
+        UTF8StringPointable rightStrPointable2 = generateUTF8Pointable("coupon in store");
+
+        byte[] leftBytes1 = leftStrPointable1.getByteArray();
+        int leftStartOffset1 = leftStrPointable1.getStartOffset();
+        byte[] rightBytes1 = rightStrPointable1.getByteArray();
+        int rightStartOffset1 = rightStrPointable1.getStartOffset();
+        byte[] rightBytes2 = rightStrPointable2.getByteArray();
+        int rightStartOffset2 = rightStrPointable2.getStartOffset();
+
+        // Case 1 - normal - no early termination
+        int edThresh = 3;
+        int edVal = ed.UTF8StringEditDistance(leftBytes1, leftStartOffset1, rightBytes1, rightStartOffset1, edThresh);
+        assertEquals(edThresh, edVal);
+
+        // Case 2 - the length difference between two strings is greater than edThresh.
+        // Even without calculating the distance, the method should return -1.
+        edVal = ed.UTF8StringEditDistance(leftBytes1, leftStartOffset1, rightBytes2, rightStartOffset2, edThresh);
+        assertEquals(SimilarityMetricEditDistance.SIMILARITY_THRESHOLD_NOT_SATISFIED_VALUE, edVal);
+
+        // Case 3 - the edit distance is 14, but the threshold is 1.
+        // The early termination should happen and the returned value should be -1.
+        edThresh = 1;
+        edVal = ed.UTF8StringEditDistance(leftBytes1, leftStartOffset1, rightBytes2, rightStartOffset2, edThresh);
+        assertEquals(SimilarityMetricEditDistance.SIMILARITY_THRESHOLD_NOT_SATISFIED_VALUE, edVal);
+
+        // Case 4 - the edit distance is 14, but the threshold is 13.
+        // The early termination will not happen. But, the resulting edit distance is greater than the given threshold.
+        // So, the final returned value should be -1.
+        edThresh = 13;
+        edVal = ed.UTF8StringEditDistance(leftBytes1, leftStartOffset1, rightBytes2, rightStartOffset2, edThresh);
+        assertEquals(SimilarityMetricEditDistance.SIMILARITY_THRESHOLD_NOT_SATISFIED_VALUE, edVal);
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/56189fb1/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/AbstractAsterixListIterator.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/AbstractAsterixListIterator.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/AbstractAsterixListIterator.java
index b929854..2435047 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/AbstractAsterixListIterator.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/AbstractAsterixListIterator.java
@@ -20,13 +20,13 @@ package org.apache.asterix.runtime.evaluators.common;
 
 import org.apache.asterix.common.exceptions.AsterixException;
 import org.apache.asterix.formats.nontagged.BinaryComparatorFactoryProvider;
-import org.apache.asterix.fuzzyjoin.similarity.IListIterator;
 import org.apache.asterix.om.types.ATypeTag;
 import org.apache.asterix.om.types.EnumDeserializer;
 import org.apache.hyracks.api.dataflow.value.IBinaryComparator;
 import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.data.std.util.ISequenceIterator;
 
-public abstract class AbstractAsterixListIterator implements IListIterator {
+public abstract class AbstractAsterixListIterator implements ISequenceIterator {
 
     protected byte[] data;
     protected int count = 0;
@@ -42,7 +42,7 @@ public abstract class AbstractAsterixListIterator implements IListIterator {
     protected final boolean ignoreCase = true;
 
     @Override
-    public int compare(IListIterator cmpIter) throws HyracksDataException {
+    public int compare(ISequenceIterator cmpIter) throws HyracksDataException {
         return cmp.compare(data, pos, -1, cmpIter.getData(), cmpIter.getPos(), -1);
     }
 
@@ -100,6 +100,7 @@ public abstract class AbstractAsterixListIterator implements IListIterator {
         }
     }
 
+    @Override
     public void reset(byte[] data, int startOff) throws HyracksDataException {
         this.data = data;
         this.startOff = startOff;

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/56189fb1/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
index fee34b9..63e9e44 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
@@ -21,6 +21,8 @@ package org.apache.asterix.runtime.evaluators.common;
 import java.io.IOException;
 
 import org.apache.asterix.builders.OrderedListBuilder;
+import org.apache.asterix.common.exceptions.ErrorCode;
+import org.apache.asterix.common.exceptions.RuntimeDataException;
 import org.apache.asterix.formats.nontagged.SerializerDeserializerProvider;
 import org.apache.asterix.om.base.ABoolean;
 import org.apache.asterix.om.functions.BuiltinFunctions;
@@ -77,6 +79,10 @@ public class EditDistanceCheckEvaluator extends EditDistanceEvaluator {
         try {
             edThresh = ATypeHierarchy.getIntegerValue(BuiltinFunctions.EDIT_DISTANCE_CHECK.getName(), 2,
                     argPtrThreshold.getByteArray(), argPtrThreshold.getStartOffset());
+            if (edThresh < 0) {
+                throw new RuntimeDataException(ErrorCode.NEGATIVE_VALUE, BuiltinFunctions.EDIT_DISTANCE_CHECK.getName(),
+                        3, edThresh);
+            }
             editDistance = computeResult(argPtr1, argPtr2, firstTypeTag);
             writeResult(editDistance);
         } catch (IOException e) {
@@ -101,7 +107,7 @@ public class EditDistanceCheckEvaluator extends EditDistanceEvaluator {
             case ORDEREDLIST: {
                 firstOrdListIter.reset(leftBytes, leftStartOffset);
                 secondOrdListIter.reset(rightBytes, rightStartOffset);
-                return (int) ed.getSimilarity(firstOrdListIter, secondOrdListIter, edThresh);
+                return (int) ed.computeSimilarity(firstOrdListIter, secondOrdListIter, edThresh);
             }
 
             default: {

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/56189fb1/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
index c9d3731..dbe99b9 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
@@ -105,13 +105,15 @@ public class EditDistanceEvaluator implements IScalarEvaluator {
 
         switch (argType) {
             case STRING: {
-                return ed.UTF8StringEditDistance(leftBytes, leftStartOffset + typeIndicatorSize, rightBytes,
-                        rightStartOffset + typeIndicatorSize);
+                // Passes -1 as the simThresh to calculate the edit distance
+                // without applying any calculation optimizations.
+                return ed.getActualUTF8StringEditDistanceVal(leftBytes, leftStartOffset + typeIndicatorSize, rightBytes,
+                        rightStartOffset + typeIndicatorSize, -1);
             }
             case ORDEREDLIST: {
                 firstOrdListIter.reset(leftBytes, leftStartOffset);
                 secondOrdListIter.reset(rightBytes, rightStartOffset);
-                return (int) ed.getSimilarity(firstOrdListIter, secondOrdListIter);
+                return (int) ed.computeSimilarity(firstOrdListIter, secondOrdListIter);
             }
             default: {
                 throw new TypeMismatchException(BuiltinFunctions.EDIT_DISTANCE, 0, argType.serialize(),

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/56189fb1/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedCheckEvaluator.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedCheckEvaluator.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedCheckEvaluator.java
index 19e9395..0decd9e 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedCheckEvaluator.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedCheckEvaluator.java
@@ -34,6 +34,6 @@ public class SimilarityJaccardSortedCheckEvaluator extends SimilarityJaccardChec
 
     @Override
     protected float computeResult() throws HyracksDataException {
-        return jaccard.getSimilarity(firstListIter, secondListIter, jaccThresh);
+        return jaccard.computeSimilarity(firstListIter, secondListIter, jaccThresh);
     }
 }

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/56189fb1/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java
index d40cb67..1cd32c8 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java
@@ -35,6 +35,6 @@ public class SimilarityJaccardSortedEvaluator extends SimilarityJaccardEvaluator
 
     @Override
     protected float computeResult() throws HyracksDataException {
-        return jaccard.getSimilarity(firstListIter, secondListIter);
+        return jaccard.computeSimilarity(firstListIter, secondListIter);
     }
 }

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/56189fb1/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/ISequenceIterator.java
----------------------------------------------------------------------
diff --git a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/ISequenceIterator.java b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/ISequenceIterator.java
new file mode 100644
index 0000000..9441453
--- /dev/null
+++ b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/ISequenceIterator.java
@@ -0,0 +1,40 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.data.std.util;
+
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+
+public interface ISequenceIterator {
+    public int compare(ISequenceIterator cmpIter) throws HyracksDataException;
+
+    public byte[] getData();
+
+    public int getPos();
+
+    public boolean hasNext();
+
+    public void next() throws HyracksDataException;
+
+    public void reset() throws HyracksDataException;
+
+    public void reset(byte[] data, int startOff) throws HyracksDataException;
+
+    public int size();
+}

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/56189fb1/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/UTF8StringCharByCharIterator.java
----------------------------------------------------------------------
diff --git a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/UTF8StringCharByCharIterator.java b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/UTF8StringCharByCharIterator.java
new file mode 100644
index 0000000..237d291
--- /dev/null
+++ b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/UTF8StringCharByCharIterator.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.data.std.util;
+
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.util.string.UTF8StringUtil;
+
+/**
+ * An iterator class for a String. This class iterates a char by char in the given String.
+ */
+public class UTF8StringCharByCharIterator implements ISequenceIterator {
+
+    protected byte[] data;
+    protected int pos = -1;
+    protected int length = -1;
+    protected int utfByteLength = -1;
+    protected int metaLength = -1;
+    protected int startOffset = -1;
+
+    @Override
+    public int compare(ISequenceIterator cmpIter) throws HyracksDataException {
+        char thisChar = Character.toLowerCase(UTF8StringUtil.charAt(data, pos));
+        char thatChar = Character.toLowerCase(UTF8StringUtil.charAt(cmpIter.getData(), cmpIter.getPos()));
+        if (thisChar == thatChar) {
+            return 0;
+        }
+        return -1;
+    }
+
+    @Override
+    public boolean hasNext() {
+        return pos < utfByteLength;
+    }
+
+    @Override
+    public int size() {
+        return length;
+    }
+
+    @Override
+    public byte[] getData() {
+        return data;
+    }
+
+    @Override
+    public int getPos() {
+        return pos;
+    }
+
+    @Override
+    public void next() throws HyracksDataException {
+        pos += UTF8StringUtil.charSize(data, pos);
+    }
+
+    @Override
+    public void reset() throws HyracksDataException {
+        pos = startOffset + metaLength;
+    }
+
+    @Override
+    public void reset(byte[] data, int startOff) throws HyracksDataException {
+        this.data = data;
+        this.startOffset = startOff;
+        this.length = UTF8StringUtil.getStringLength(data, startOffset);
+        this.utfByteLength = UTF8StringUtil.getUTFLength(data, startOffset);
+        this.metaLength = UTF8StringUtil.getNumBytesToStoreLength(utfByteLength);
+        reset();
+    }
+
+}


Mime
View raw message