asterixdb-notifications mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "Taewoo Kim (Code Review)" <do-not-re...@asterixdb.incubator.apache.org>
Subject Change in asterixdb[master]: ASTERIXDB-1778: optimize the edit-distance-check function
Date Thu, 02 Feb 2017 23:56:50 GMT
Taewoo Kim has uploaded a new change for review.

  https://asterix-gerrit.ics.uci.edu/1481

Change subject: ASTERIXDB-1778: optimize the edit-distance-check function
......................................................................

ASTERIXDB-1778: optimize the edit-distance-check function

 - Only calculate 2 * (threshold + 1) cells, rather than all cells per row.
 - Terminate the calculation steps early when it becomes obvious that
   the possible edit-distance value is greater than the given threshold.
   There is no reason to compute all cells in the 2*2 array.

Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
---
M asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java
8 files changed, 173 insertions(+), 117 deletions(-)


  git pull ssh://asterix-gerrit.ics.uci.edu:29418/asterixdb refs/changes/81/1481/1

diff --git a/asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md b/asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md
index 89ef0f7..cb3318f 100644
--- a/asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md
+++ b/asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md
@@ -47,6 +47,36 @@
 
         2
 
+### edit_distance_check ###
+* Syntax:
+
+        edit_distance_check(expression1, expression2, threshold)
+
+* Checks whether the edit distance of `expression1` and `expression2` is within a given threshold.
+
+* Arguments:
+    * `expression1` : a `string` or a homogeneous `array` of a comparable item type.
+    * `expression2` : The same type as `expression1`.
+    * `threshold` : a `bigint` that represents the distance threshold.
+* Return Value:
+    * an `array` with two items:
+        * The first item contains a `boolean` value representing whether the edit distance
of `expression1` and `expression2` is within the given threshold.
+        * The second item contains an `integer` that represents the edit distance of `expression1`
and `expression2` if the first item is true.
+        * If the first item is false, then the second item is set to 2147483647.
+    * `missing` if any argument is a `missing` value,
+    * `null` if any argument is a `null` value but no argument is a `missing` value,
+    * a type error will be raised if:
+        * the first or second argument is any other non-string value,
+        * or, the third argument is any other non-bigint value.
+* Note: an [n_gram index](similarity.html#UsingIndexesToSupportSimilarityQueries) can be
utilized for this function.
+* Example:
+
+        edit_distance_check("happy","hapr",2);
+
+
+* The expected result is:
+
+        [ true, 2 ]
 
 ### edit_distance_contains ###
 * Syntax:
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
index ac4a3dd..751597d 100644
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
@@ -22,8 +22,11 @@
 import org.apache.hyracks.api.exceptions.HyracksDataException;
 
 public interface IGenericSimilarityMetric {
-    // returns similarity
-    public float getSimilarity(IListIterator firstList, IListIterator secondList) throws
HyracksDataException;
+    // Returns -1 if this method supports early-termination and it becomes obvious that
+    // the possible similarity value can't satisfy the given simThresh value.
+    // Else returns the calculated similarity value.
+    public float getActualSimilarityVal(IListIterator firstList, IListIterator secondList,
float simThresh)
+            throws HyracksDataException;
 
     // returns -1 if does not satisfy threshold
     // else returns similarity
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
index d36d60d..70029a3 100644
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
@@ -64,23 +64,6 @@
     }
 
     public static int getIntersectSize(int[] tokensX, int startX, int[] tokensY, int startY)
{
-        // int intersectSize = 0;
-        //
-        // while (startX < tokensX.length && startY < tokensY.length) {
-        // int tokenX = tokensX[startX];
-        // int tokenY = tokensY[startY];
-        // if (tokenX > tokenY) {
-        // startY++;
-        // } else if (tokenX < tokenY) {
-        // startX++;
-        // } else {
-        // intersectSize++;
-        // startX++;
-        // startY++;
-        // }
-        // }
-        //
-        // return intersectSize;
         return getIntersectSize(tokensX, startX, tokensX.length, tokensY, startY, tokensY.length);
     }
 
@@ -129,52 +112,6 @@
 
     public static PartialIntersect getPartialIntersectSize(int[] tokensX, int[] tokensY,
int tokenStop) {
         return getPartialIntersectSize(tokensX, 0, tokensX.length, tokensY, 0, tokensY.length,
tokenStop);
-    }
-
-    // @SuppressWarnings("unchecked")
-    // public static int getIntersectSize(DataBag tokensX, DataBag tokensY) {
-    // int intersectSize = 0;
-    //
-    // Iterator<Tuple> iteratorX = tokensX.iterator();
-    // Iterator<Tuple> iteratorY = tokensY.iterator();
-    //
-    // Tuple nextX = null;
-    // Tuple nextY = null;
-    //
-    // while ((nextX != null || iteratorX.hasNext())
-    // && (nextY != null || iteratorY.hasNext())) {
-    // if (nextX == null) {
-    // nextX = iteratorX.next();
-    // }
-    // if (nextY == null) {
-    // nextY = iteratorY.next();
-    // }
-    //
-    // int cmp = nextX.compareTo(nextY);
-    // if (cmp > 0) {
-    // nextY = null;
-    // } else if (cmp < 0) {
-    // nextX = null;
-    // } else {
-    // intersectSize++;
-    // nextX = null;
-    // nextY = null;
-    // }
-    // }
-    //
-    // return intersectSize;
-    // }
-
-    // public abstract float getSimilarity(DataBag tokensX, DataBag tokensY);
-
-    // public abstract float getSimilarity(DataBag tokensX, int lengthX,
-    // DataBag tokensY, int lengthY);
-
-    public float getSimilarity(IListIterator tokensX, IListIterator tokensY) throws HyracksDataException
{
-        int intersectionSize = SimilarityMetric.getIntersectSize(tokensX, tokensY);
-        int totalSize = tokensX.size() + tokensY.size();
-
-        return (float) intersectionSize / (totalSize - intersectionSize);
     }
 
     public abstract float getSimilarity(int[] tokensX, int startX, int lengthX, int[] tokensY,
int startY, int lengthY);
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
index 9dce89e..ba0453a 100644
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
@@ -26,32 +26,50 @@
 
 public class SimilarityMetricEditDistance implements IGenericSimilarityMetric {
 
-    // dp implementation only needs 2 rows
+    // This Dynamic Programming implementation only needs 2 rows.
     private final int rows = 2;
     private int cols;
     private int[][] matrix;
 
-    // for letter count filtering
-    private final int[] fsLcCount = new int[128];
-    private final int[] ssLcCount = new int[128];
+    // for ASCII letter count filtering
+    private final int[] letterCounts = new int[128];
 
     public SimilarityMetricEditDistance() {
         cols = 100; // arbitrary default value
         matrix = new int[rows][cols];
     }
 
+    /**
+     * Gets the edit distance value for the given two lists using a Dynamic Programming approach.
+     * If a positive simThresh value is provided, this method only calculates 2 * (simThresh
+ 1) cells per row,
+     * not entire cells as an optimization. Refer to https://en.wikipedia.org/wiki/Wagner–Fischer_algorithm
+     * for more details. Also, as one more optimization, during the calculation steps, if
this method finds out
+     * that the final edit distance value cannot be less than or equal to simThresh, this method stops
the calculation
+     * and immediately returns -1.
+     * If the final edit distance value is less than or equal to simThresh, then that value
will be returned.
+     * If a non-positive simThresh is given, then it calculates all cells and rows and returns
+     * the final edit distance value.
+     *
+     * @return the edit distance of the two lists. -1 if a positive simThresh value is given
and the edit distance
+     *         value is greater than the given simThresh.
+     */
     @Override
-    public float getSimilarity(IListIterator firstList, IListIterator secondList) throws
HyracksDataException {
+    public float getActualSimilarityVal(IListIterator firstList, IListIterator secondList,
float simThresh)
+            throws HyracksDataException {
         int flLen = firstList.size();
         int slLen = secondList.size();
 
-        // reuse existing matrix if possible
+        // When a positive threshold is given, then we can apply two optimizations.
+        int edThresh = (int) simThresh;
+        boolean canTerminateEarly = edThresh >= 0 ? true : false;
+
+        // Reuses the existing matrix if possible.
         if (slLen >= cols) {
             cols = slLen + 1;
             matrix = new int[rows][cols];
         }
 
-        // init matrix
+        // Inits the matrix.
         for (int i = 0; i <= slLen; i++) {
             matrix[0][i] = i;
         }
@@ -59,19 +77,53 @@
         int currRow = 1;
         int prevRow = 0;
 
-        // expand dynamic programming matrix row by row
+        int from = 1;
+        int to = slLen;
+        int minDistance = -1;
+
+        // Expands the dynamic programming matrix row by row.
         for (int i = 1; i <= flLen; i++) {
             matrix[currRow][0] = i;
 
             secondList.reset();
-            for (int j = 1; j <= slLen; j++) {
+
+            // Only calculates 2 * (simThresh + 1) cells per row as an optimization.
+            // Also keeps minDistance to see whether the possible edit distance after
+            // each row calculation is greater than the simThresh.
+            if (canTerminateEarly) {
+                minDistance = edThresh + 1;
+                from = Math.max(i - edThresh - 1, 1);
+                to = Math.min(i + edThresh + 1, slLen);
+                for (int j = 1; j < from; j++) {
+                    // Moves the pointer of the second list to the point where the calculation
starts for this row.
+                    secondList.next();
+                }
+                if (from > 1) {
+                    // Sets the left Boundary cell value to make sure that the calculation
is correct.
+                    matrix[currRow][from - 1] = edThresh + 1;
+                }
+                if (to < slLen) {
+                    // Sets the right Boundary cell value to make sure that the calculation
is correct.
+                    matrix[currRow][to + 1] = edThresh + 1;
+                }
+            }
+
+            for (int j = from; j <= to; j++) {
 
                 matrix[currRow][j] = Math.min(Math.min(matrix[prevRow][j] + 1, matrix[currRow][j
- 1] + 1),
                         matrix[prevRow][j - 1] + (firstList.compare(secondList) == 0 ? 0
: 1));
 
+                // Replaces minDistance after each cell computation if we find a smaller
value than that.
+                if (canTerminateEarly && matrix[currRow][j] < minDistance) {
+                    minDistance = matrix[currRow][j];
+                }
+
                 secondList.next();
             }
-
+            // If the minimum distance value is greater than the given threshold, no reason
to process next row.
+            if (canTerminateEarly && minDistance > edThresh) {
+                return -1;
+            }
             firstList.next();
 
             int tmp = currRow;
@@ -82,6 +134,9 @@
         return matrix[prevRow][slLen];
     }
 
+    /**
+     * Gets the edit distance value for the given two lists.
+     */
     @Override
     public float getSimilarity(IListIterator firstList, IListIterator secondList, float simThresh)
             throws HyracksDataException {
@@ -96,8 +151,8 @@
             return -1;
         }
 
-        float ed = getSimilarity(firstList, secondList);
-        if (ed > edThresh) {
+        float ed = getActualSimilarityVal(firstList, secondList, simThresh);
+        if (ed > edThresh || ed < 0) {
             return -1;
         } else {
             return ed;
@@ -155,7 +210,8 @@
     }
 
     // faster implementation for common case of string edit distance
-    public int UTF8StringEditDistance(byte[] leftBytes, int fsStart, byte[] rightBytes, int
ssStart) {
+    public int getActualUTF8StringEditDistanceVal(byte[] leftBytes, int fsStart, byte[] rightBytes,
int ssStart,
+            int edThresh) {
         int fsLen = UTF8StringUtil.getStringLength(leftBytes, fsStart);
         int ssLen = UTF8StringUtil.getStringLength(rightBytes, ssStart);
 
@@ -164,7 +220,10 @@
         int fsMetaLen = UTF8StringUtil.getNumBytesToStoreLength(fsUtfLen);
         int ssMetaLen = UTF8StringUtil.getNumBytesToStoreLength(ssUtfLen);
 
-        // reuse existing matrix if possible
+        // When a positive threshold is given, then we can apply two optimizations.
+        boolean canTerminateEarly = edThresh >= 0 ? true : false;
+
+        // Reuses the existing matrix if possible.
         if (ssLen >= cols) {
             cols = ssLen + 1;
             matrix = new int[rows][cols];
@@ -173,7 +232,7 @@
         int fsDataStart = fsStart + fsMetaLen;
         int ssDataStart = ssStart + ssMetaLen;
 
-        // init matrix
+        // Inits the matrix
         for (int i = 0; i <= ssLen; i++) {
             matrix[0][i] = i;
         }
@@ -181,19 +240,55 @@
         int currRow = 1;
         int prevRow = 0;
 
-        // expand dynamic programming matrix row by row
+        int from = 1;
+        int to = ssLen;
+        int minDistance = -1;
+
+        // Expands the dynamic programming matrix row by row.
         int fsPos = fsDataStart;
         for (int i = 1; i <= fsLen; i++) {
             matrix[currRow][0] = i;
             char fsChar = Character.toLowerCase(UTF8StringUtil.charAt(leftBytes, fsPos));
             int ssPos = ssDataStart;
-            for (int j = 1; j <= ssLen; j++) {
+
+            // Only calculates 2 * (simThresh + 1) cells per row as an optimization.
+            // Also keeps minDistance to see whether the possible edit distance after
+            // each row calculation is greater than the simThresh.
+            if (canTerminateEarly) {
+                minDistance = edThresh + 1;
+                from = Math.max(i - edThresh - 1, 1);
+                to = Math.min(i + edThresh + 1, ssLen);
+                for (int j = 1; j < from; j++) {
+                    // Moves the pointer of the second list to the point where the calculation
starts for this row.
+                    ssPos += UTF8StringUtil.charSize(rightBytes, ssPos);
+                }
+                if (from > 1) {
+                    // Sets the left Boundary cell value to make sure that the calculation
is correct.
+                    matrix[currRow][from - 1] = edThresh + 1;
+                }
+                if (to < ssLen) {
+                    // Sets the right Boundary cell value to make sure that the calculation
is correct.
+                    matrix[currRow][to + 1] = edThresh + 1;
+                }
+            }
+
+            for (int j = from; j <= to; j++) {
                 char ssChar = Character.toLowerCase(UTF8StringUtil.charAt(rightBytes, ssPos));
 
                 matrix[currRow][j] = Math.min(Math.min(matrix[prevRow][j] + 1, matrix[currRow][j
- 1] + 1),
                         matrix[prevRow][j - 1] + (fsChar == ssChar ? 0 : 1));
 
+                // Replaces minDistance after each cell computation if we find a smaller
value than that.
+                if (canTerminateEarly && matrix[currRow][j] < minDistance) {
+                    minDistance = matrix[currRow][j];
+                }
+
                 ssPos += UTF8StringUtil.charSize(rightBytes, ssPos);
+            }
+
+            // If the minimum distance value is greater than the given threshold, no reason
to process next row.
+            if (canTerminateEarly && minDistance > edThresh) {
+                    return -1;
             }
             fsPos += UTF8StringUtil.charSize(leftBytes, fsPos);
             int tmp = currRow;
@@ -218,8 +313,7 @@
         }
 
         // initialize letter count filtering
-        Arrays.fill(fsLcCount, 0);
-        Arrays.fill(ssLcCount, 0);
+        Arrays.fill(letterCounts, 0);
 
         // compute letter counts for first string
         int fsPos = fsStart + fsMetaLen;
@@ -227,7 +321,7 @@
         while (fsPos < fsEnd) {
             char c = Character.toLowerCase(UTF8StringUtil.charAt(bytesLeft, fsPos));
             if (c < 128) {
-                fsLcCount[c]++;
+                letterCounts[c]++;
             }
             fsPos += UTF8StringUtil.charSize(bytesLeft, fsPos);
         }
@@ -238,30 +332,30 @@
         while (ssPos < ssEnd) {
             char c = Character.toLowerCase(UTF8StringUtil.charAt(bytesRight, ssPos));
             if (c < 128) {
-                ssLcCount[c]++;
+                letterCounts[c]--;
             }
             ssPos += UTF8StringUtil.charSize(bytesRight, ssPos);
         }
 
         // apply filter
-        int gtSum = 0;
-        int ltSum = 0;
+        int secondTofirstDiffSum = 0;
+        int firstToSecondDiffSum = 0;
         for (int i = 0; i < 128; i++) {
-            if (fsLcCount[i] > ssLcCount[i]) {
-                gtSum += fsLcCount[i] - ssLcCount[i];
-                if (gtSum > edThresh) {
+            if (letterCounts[i] >= 0) {
+                secondTofirstDiffSum += letterCounts[i];
+                if (secondTofirstDiffSum > edThresh) {
                     return -1;
                 }
             } else {
-                ltSum += ssLcCount[i] - fsLcCount[i];
-                if (ltSum > edThresh) {
+                firstToSecondDiffSum += Math.abs(letterCounts[i]);
+                if (firstToSecondDiffSum > edThresh) {
                     return -1;
                 }
             }
         }
 
-        int ed = UTF8StringEditDistance(bytesLeft, fsStart, bytesRight, ssStart);
-        if (ed > edThresh) {
+        int ed = getActualUTF8StringEditDistanceVal(bytesLeft, fsStart, bytesRight, ssStart,
edThresh);
+        if (ed > edThresh || ed < 0) {
             return -1;
         } else {
             return ed;
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
index f4162c7..cafc7fb 100644
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
@@ -44,24 +44,10 @@
         return ((float) setX.size()) / (tokensX.length + tokensY.length - setX.size());
     }
 
-    // @Override
-    // public float getSimilarity(DataBag tokensX, DataBag tokensY) {
-    // return getSimilarity(tokensX, (int) tokensX.size(), tokensY,
-    // (int) tokensY.size());
-    // }
-
-    // @Override
-    // public float getSimilarity(DataBag tokensX, int lengthX, DataBag tokensY,
-    // int lengthY) {
-    // int intersectionSize = SimilarityMetric.getIntersectSize(tokensX,
-    // tokensY);
-    // int totalSize = lengthX + lengthY;
-    //
-    // return (float) intersectionSize / (totalSize - intersectionSize);
-    // }
-
+    // SimThresh value will be ignored for this method since it doesn't provide an early
termination.
     @Override
-    public float getSimilarity(IListIterator tokensX, IListIterator tokensY) throws HyracksDataException
{
+    public float getActualSimilarityVal(IListIterator tokensX, IListIterator tokensY, float
simThresh)
+            throws HyracksDataException {
         int intersectionSize = SimilarityMetric.getIntersectSize(tokensX, tokensY);
         int totalSize = tokensX.size() + tokensY.size();
 
@@ -81,7 +67,7 @@
             return -1f;
         }
 
-        float jacc = getSimilarity(firstList, secondList);
+        float jacc = getActualSimilarityVal(firstList, secondList, simThresh);
         if (jacc < simThresh) {
             return -1f;
         } else {
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
index fee34b9..3dd3516 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
@@ -21,6 +21,8 @@
 import java.io.IOException;
 
 import org.apache.asterix.builders.OrderedListBuilder;
+import org.apache.asterix.common.exceptions.ErrorCode;
+import org.apache.asterix.common.exceptions.RuntimeDataException;
 import org.apache.asterix.formats.nontagged.SerializerDeserializerProvider;
 import org.apache.asterix.om.base.ABoolean;
 import org.apache.asterix.om.functions.BuiltinFunctions;
@@ -77,6 +79,10 @@
         try {
             edThresh = ATypeHierarchy.getIntegerValue(BuiltinFunctions.EDIT_DISTANCE_CHECK.getName(),
2,
                     argPtrThreshold.getByteArray(), argPtrThreshold.getStartOffset());
+            if (edThresh < 0) {
+                throw new RuntimeDataException(ErrorCode.NEGATIVE_VALUE, BuiltinFunctions.EDIT_DISTANCE_CHECK.getName(),
+                        3, edThresh);
+            }
             editDistance = computeResult(argPtr1, argPtr2, firstTypeTag);
             writeResult(editDistance);
         } catch (IOException e) {
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
index c9d3731..92f8df3 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
@@ -105,13 +105,13 @@
 
         switch (argType) {
             case STRING: {
-                return ed.UTF8StringEditDistance(leftBytes, leftStartOffset + typeIndicatorSize,
rightBytes,
-                        rightStartOffset + typeIndicatorSize);
+                return ed.getActualUTF8StringEditDistanceVal(leftBytes, leftStartOffset +
typeIndicatorSize, rightBytes,
+                        rightStartOffset + typeIndicatorSize, -1);
             }
             case ORDEREDLIST: {
                 firstOrdListIter.reset(leftBytes, leftStartOffset);
                 secondOrdListIter.reset(rightBytes, rightStartOffset);
-                return (int) ed.getSimilarity(firstOrdListIter, secondOrdListIter);
+                return (int) ed.getActualSimilarityVal(firstOrdListIter, secondOrdListIter,
-1);
             }
             default: {
                 throw new TypeMismatchException(BuiltinFunctions.EDIT_DISTANCE, 0, argType.serialize(),
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java
b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java
index d40cb67..3a60295 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java
@@ -35,6 +35,6 @@
 
     @Override
     protected float computeResult() throws HyracksDataException {
-        return jaccard.getSimilarity(firstListIter, secondListIter);
+        return jaccard.getActualSimilarityVal(firstListIter, secondListIter, -1.0f);
     }
 }

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 1
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wangsaeu@gmail.com>

Mime
View raw message