asterixdb-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mb...@apache.org
Subject [6/7] asterixdb git commit: [ASTERIXDB-2330][*DB][RT] Add IFunctionRegistrant for dynamic function registration
Date Wed, 14 Mar 2018 04:08:19 GMT
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/CountHashedGramTokensDescriptor.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/CountHashedGramTokensDescriptor.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/CountHashedGramTokensDescriptor.java
new file mode 100644
index 0000000..dd36671
--- /dev/null
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/CountHashedGramTokensDescriptor.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.runtime.evaluators.functions;
+
+import org.apache.asterix.om.functions.BuiltinFunctions;
+import org.apache.asterix.om.functions.IFunctionDescriptor;
+import org.apache.asterix.om.functions.IFunctionDescriptorFactory;
+import org.apache.asterix.om.types.BuiltinType;
+import org.apache.asterix.runtime.evaluators.base.AbstractScalarFunctionDynamicDescriptor;
+import org.apache.asterix.runtime.evaluators.common.GramTokensEvaluator;
+import org.apache.hyracks.algebricks.common.exceptions.AlgebricksException;
+import org.apache.hyracks.algebricks.core.algebra.functions.FunctionIdentifier;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory;
+import org.apache.hyracks.api.context.IHyracksTaskContext;
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.HashedUTF8NGramTokenFactory;
+import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.ITokenFactory;
+import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.NGramUTF8StringBinaryTokenizer;
+
+public class CountHashedGramTokensDescriptor extends AbstractScalarFunctionDynamicDescriptor {
+
+    private static final long serialVersionUID = 1L;
+    public static final IFunctionDescriptorFactory FACTORY = new IFunctionDescriptorFactory() {
+        @Override
+        public IFunctionDescriptor createFunctionDescriptor() {
+            return new CountHashedGramTokensDescriptor();
+        }
+    };
+
+    @Override
+    public FunctionIdentifier getIdentifier() {
+        return BuiltinFunctions.COUNTHASHED_GRAM_TOKENS;
+    }
+
+    @Override
+    public IScalarEvaluatorFactory createEvaluatorFactory(final IScalarEvaluatorFactory[] args)
+            throws AlgebricksException {
+        return new IScalarEvaluatorFactory() {
+            private static final long serialVersionUID = 1L;
+
+            @Override
+            public IScalarEvaluator createScalarEvaluator(IHyracksTaskContext ctx) throws HyracksDataException {
+                ITokenFactory tokenFactory = new HashedUTF8NGramTokenFactory();
+                NGramUTF8StringBinaryTokenizer tokenizer =
+                        new NGramUTF8StringBinaryTokenizer(3, true, false, true, tokenFactory);
+                return new GramTokensEvaluator(args, ctx, tokenizer, BuiltinType.AINT32);
+            }
+        };
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/CountHashedWordTokensDescriptor.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/CountHashedWordTokensDescriptor.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/CountHashedWordTokensDescriptor.java
new file mode 100644
index 0000000..e12ba2e
--- /dev/null
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/CountHashedWordTokensDescriptor.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.runtime.evaluators.functions;
+
+import org.apache.asterix.om.functions.BuiltinFunctions;
+import org.apache.asterix.om.functions.IFunctionDescriptor;
+import org.apache.asterix.om.functions.IFunctionDescriptorFactory;
+import org.apache.asterix.om.types.BuiltinType;
+import org.apache.asterix.runtime.evaluators.base.AbstractScalarFunctionDynamicDescriptor;
+import org.apache.asterix.runtime.evaluators.common.WordTokensEvaluator;
+import org.apache.hyracks.algebricks.core.algebra.functions.FunctionIdentifier;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory;
+import org.apache.hyracks.api.context.IHyracksTaskContext;
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.DelimitedUTF8StringBinaryTokenizer;
+import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.HashedUTF8WordTokenFactory;
+import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizer;
+import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.ITokenFactory;
+
+public class CountHashedWordTokensDescriptor extends AbstractScalarFunctionDynamicDescriptor {
+
+    private static final long serialVersionUID = 1L;
+    public static final IFunctionDescriptorFactory FACTORY = new IFunctionDescriptorFactory() {
+        @Override
+        public IFunctionDescriptor createFunctionDescriptor() {
+            return new CountHashedWordTokensDescriptor();
+        }
+    };
+
+    @Override
+    public FunctionIdentifier getIdentifier() {
+        return BuiltinFunctions.COUNTHASHED_WORD_TOKENS;
+    }
+
+    @Override
+    public IScalarEvaluatorFactory createEvaluatorFactory(final IScalarEvaluatorFactory[] args) {
+        return new IScalarEvaluatorFactory() {
+            private static final long serialVersionUID = 1L;
+
+            @Override
+            public IScalarEvaluator createScalarEvaluator(IHyracksTaskContext ctx) throws HyracksDataException {
+                ITokenFactory tokenFactory = new HashedUTF8WordTokenFactory();
+                IBinaryTokenizer tokenizer = new DelimitedUTF8StringBinaryTokenizer(false, true, tokenFactory);
+                return new WordTokensEvaluator(args, ctx, tokenizer, BuiltinType.AINT32);
+            }
+        };
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceCheckDescriptor.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceCheckDescriptor.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceCheckDescriptor.java
new file mode 100644
index 0000000..e4b40b1
--- /dev/null
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceCheckDescriptor.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.runtime.evaluators.functions;
+
+import org.apache.asterix.om.functions.BuiltinFunctions;
+import org.apache.asterix.om.functions.IFunctionDescriptor;
+import org.apache.asterix.om.functions.IFunctionDescriptorFactory;
+import org.apache.asterix.runtime.evaluators.base.AbstractScalarFunctionDynamicDescriptor;
+import org.apache.asterix.runtime.evaluators.common.EditDistanceCheckEvaluator;
+import org.apache.hyracks.algebricks.core.algebra.functions.FunctionIdentifier;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory;
+import org.apache.hyracks.api.context.IHyracksTaskContext;
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+
+public class EditDistanceCheckDescriptor extends AbstractScalarFunctionDynamicDescriptor {
+
+    private static final long serialVersionUID = 1L;
+    public static final IFunctionDescriptorFactory FACTORY = new IFunctionDescriptorFactory() {
+        @Override
+        public IFunctionDescriptor createFunctionDescriptor() {
+            return new EditDistanceCheckDescriptor();
+        }
+    };
+
+    @Override
+    public IScalarEvaluatorFactory createEvaluatorFactory(final IScalarEvaluatorFactory[] args) {
+        return new IScalarEvaluatorFactory() {
+            private static final long serialVersionUID = 1L;
+
+            @Override
+            public IScalarEvaluator createScalarEvaluator(IHyracksTaskContext ctx) throws HyracksDataException {
+                return new EditDistanceCheckEvaluator(args, ctx);
+            }
+        };
+    }
+
+    @Override
+    public FunctionIdentifier getIdentifier() {
+        return BuiltinFunctions.EDIT_DISTANCE_CHECK;
+    }
+}

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceContainsDescriptor.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceContainsDescriptor.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceContainsDescriptor.java
new file mode 100644
index 0000000..4c7c257
--- /dev/null
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceContainsDescriptor.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.runtime.evaluators.functions;
+
+import org.apache.asterix.om.functions.BuiltinFunctions;
+import org.apache.asterix.om.functions.IFunctionDescriptor;
+import org.apache.asterix.om.functions.IFunctionDescriptorFactory;
+import org.apache.asterix.runtime.evaluators.base.AbstractScalarFunctionDynamicDescriptor;
+import org.apache.asterix.runtime.evaluators.common.EditDistanceContainsEvaluator;
+import org.apache.hyracks.algebricks.core.algebra.functions.FunctionIdentifier;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory;
+import org.apache.hyracks.api.context.IHyracksTaskContext;
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+
+public class EditDistanceContainsDescriptor extends AbstractScalarFunctionDynamicDescriptor {
+
+    private static final long serialVersionUID = 1L;
+    public static final IFunctionDescriptorFactory FACTORY = new IFunctionDescriptorFactory() {
+        @Override
+        public IFunctionDescriptor createFunctionDescriptor() {
+            return new EditDistanceContainsDescriptor();
+        }
+    };
+
+    @Override
+    public IScalarEvaluatorFactory createEvaluatorFactory(final IScalarEvaluatorFactory[] args) {
+        return new IScalarEvaluatorFactory() {
+            private static final long serialVersionUID = 1L;
+
+            @Override
+            public IScalarEvaluator createScalarEvaluator(IHyracksTaskContext ctx) throws HyracksDataException {
+                return new EditDistanceContainsEvaluator(args, ctx);
+            }
+        };
+    }
+
+    @Override
+    public FunctionIdentifier getIdentifier() {
+        return BuiltinFunctions.EDIT_DISTANCE_CONTAINS;
+    }
+}

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceDescriptor.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceDescriptor.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceDescriptor.java
new file mode 100644
index 0000000..8c6c9ed
--- /dev/null
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceDescriptor.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.runtime.evaluators.functions;
+
+import org.apache.asterix.om.functions.BuiltinFunctions;
+import org.apache.asterix.om.functions.IFunctionDescriptor;
+import org.apache.asterix.om.functions.IFunctionDescriptorFactory;
+import org.apache.asterix.runtime.evaluators.base.AbstractScalarFunctionDynamicDescriptor;
+import org.apache.asterix.runtime.evaluators.common.EditDistanceEvaluator;
+import org.apache.hyracks.algebricks.core.algebra.functions.FunctionIdentifier;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory;
+import org.apache.hyracks.api.context.IHyracksTaskContext;
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+
+public class EditDistanceDescriptor extends AbstractScalarFunctionDynamicDescriptor {
+
+    private static final long serialVersionUID = 1L;
+    public static final IFunctionDescriptorFactory FACTORY = new IFunctionDescriptorFactory() {
+        @Override
+        public IFunctionDescriptor createFunctionDescriptor() {
+            return new EditDistanceDescriptor();
+        }
+    };
+
+    @Override
+    public IScalarEvaluatorFactory createEvaluatorFactory(final IScalarEvaluatorFactory[] args) {
+        return new IScalarEvaluatorFactory() {
+            private static final long serialVersionUID = 1L;
+
+            @Override
+            public IScalarEvaluator createScalarEvaluator(IHyracksTaskContext ctx) throws HyracksDataException {
+                return new EditDistanceEvaluator(args, ctx);
+            }
+        };
+    }
+
+    @Override
+    public FunctionIdentifier getIdentifier() {
+        return BuiltinFunctions.EDIT_DISTANCE;
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceListIsFilterableDescriptor.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceListIsFilterableDescriptor.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceListIsFilterableDescriptor.java
new file mode 100644
index 0000000..0f4ebee
--- /dev/null
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceListIsFilterableDescriptor.java
@@ -0,0 +1,149 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.runtime.evaluators.functions;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.asterix.dataflow.data.nontagged.serde.AOrderedListSerializerDeserializer;
+import org.apache.asterix.dataflow.data.nontagged.serde.AUnorderedListSerializerDeserializer;
+import org.apache.asterix.formats.nontagged.SerializerDeserializerProvider;
+import org.apache.asterix.om.base.ABoolean;
+import org.apache.asterix.om.functions.BuiltinFunctions;
+import org.apache.asterix.om.functions.IFunctionDescriptor;
+import org.apache.asterix.om.functions.IFunctionDescriptorFactory;
+import org.apache.asterix.om.types.ATypeTag;
+import org.apache.asterix.om.types.BuiltinType;
+import org.apache.asterix.om.types.EnumDeserializer;
+import org.apache.asterix.om.types.hierachy.ATypeHierarchy;
+import org.apache.asterix.runtime.evaluators.base.AbstractScalarFunctionDynamicDescriptor;
+import org.apache.asterix.runtime.exceptions.TypeMismatchException;
+import org.apache.hyracks.algebricks.core.algebra.functions.FunctionIdentifier;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory;
+import org.apache.hyracks.api.context.IHyracksTaskContext;
+import org.apache.hyracks.api.dataflow.value.ISerializerDeserializer;
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.data.std.api.IPointable;
+import org.apache.hyracks.data.std.primitive.VoidPointable;
+import org.apache.hyracks.data.std.util.ArrayBackedValueStorage;
+import org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference;
+
+/**
+ * Checks whether a list with an edit distance threshold can be filtered with a lower bounding on the number
+ * of common list elements. This function returns 'true' if the lower bound on the number of common elements
+ * is positive, 'false' otherwise. For example, this function is used during an indexed nested-loop join based
+ * on edit distance. We partition the tuples of the probing dataset into those that are filterable and those
+ * that are not. Those that are filterable are forwarded to the index. The others are are fed into a (non
+ * indexed) nested-loop join.
+ */
+public class EditDistanceListIsFilterableDescriptor extends AbstractScalarFunctionDynamicDescriptor {
+
+    private static final long serialVersionUID = 1L;
+
+    public static final IFunctionDescriptorFactory FACTORY = new IFunctionDescriptorFactory() {
+        @Override
+        public IFunctionDescriptor createFunctionDescriptor() {
+            return new EditDistanceListIsFilterableDescriptor();
+        }
+    };
+
+    @Override
+    public IScalarEvaluatorFactory createEvaluatorFactory(final IScalarEvaluatorFactory[] args) {
+        return new IScalarEvaluatorFactory() {
+            private static final long serialVersionUID = 1L;
+
+            @Override
+            public IScalarEvaluator createScalarEvaluator(IHyracksTaskContext ctx) throws HyracksDataException {
+                return new EditDistanceListIsFilterableEvaluator(args, ctx);
+            }
+        };
+    }
+
+    @Override
+    public FunctionIdentifier getIdentifier() {
+        return BuiltinFunctions.EDIT_DISTANCE_LIST_IS_FILTERABLE;
+    }
+
+    private static class EditDistanceListIsFilterableEvaluator implements IScalarEvaluator {
+
+        protected final IPointable listPtr = new VoidPointable();
+        protected final IPointable edThreshPtr = new VoidPointable();
+        protected final ArrayBackedValueStorage resultStorage = new ArrayBackedValueStorage();
+        protected final DataOutput output = resultStorage.getDataOutput();
+
+        protected final IScalarEvaluator listEval;
+        protected final IScalarEvaluator edThreshEval;
+
+        @SuppressWarnings("unchecked")
+        private final ISerializerDeserializer<ABoolean> booleanSerde =
+                SerializerDeserializerProvider.INSTANCE.getSerializerDeserializer(BuiltinType.ABOOLEAN);
+
+        public EditDistanceListIsFilterableEvaluator(IScalarEvaluatorFactory[] args, IHyracksTaskContext context)
+                throws HyracksDataException {
+            listEval = args[0].createScalarEvaluator(context);
+            edThreshEval = args[1].createScalarEvaluator(context);
+        }
+
+        @Override
+        public void evaluate(IFrameTupleReference tuple, IPointable result) throws HyracksDataException {
+            resultStorage.reset();
+
+            listEval.evaluate(tuple, listPtr);
+            edThreshEval.evaluate(tuple, edThreshPtr);
+
+            // Check type and compute string length.
+            byte[] bytes = listPtr.getByteArray();
+            int offset = listPtr.getStartOffset();
+
+            ATypeTag typeTag = EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(bytes[offset]);
+            long listLen;
+            switch (typeTag) {
+                case MULTISET:
+                    listLen = AUnorderedListSerializerDeserializer.getNumberOfItems(bytes, offset);
+                    break;
+                case ARRAY:
+                    listLen = AOrderedListSerializerDeserializer.getNumberOfItems(bytes, offset);
+                    break;
+                default:
+                    throw new TypeMismatchException(BuiltinFunctions.EDIT_DISTANCE_LIST_IS_FILTERABLE, 0,
+                            ATypeTag.SERIALIZED_UNORDEREDLIST_TYPE_TAG, ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG);
+            }
+
+            // Check type and extract edit-distance threshold.
+            bytes = edThreshPtr.getByteArray();
+            offset = edThreshPtr.getStartOffset();
+            long edThresh = ATypeHierarchy.getIntegerValue(BuiltinFunctions.EDIT_DISTANCE_LIST_IS_FILTERABLE.getName(),
+                    1, bytes, offset);
+
+            // Compute result.
+            long lowerBound = listLen - edThresh;
+            try {
+                if (lowerBound <= 0) {
+                    booleanSerde.serialize(ABoolean.FALSE, output);
+                } else {
+                    booleanSerde.serialize(ABoolean.TRUE, output);
+                }
+            } catch (IOException e) {
+                throw new HyracksDataException(e);
+            }
+            result.set(resultStorage);
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceStringIsFilterableDescriptor.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceStringIsFilterableDescriptor.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceStringIsFilterableDescriptor.java
new file mode 100644
index 0000000..ddb18fc
--- /dev/null
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceStringIsFilterableDescriptor.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.runtime.evaluators.functions;
+
+import org.apache.asterix.om.functions.BuiltinFunctions;
+import org.apache.asterix.om.functions.IFunctionDescriptor;
+import org.apache.asterix.om.functions.IFunctionDescriptorFactory;
+import org.apache.asterix.runtime.evaluators.base.AbstractScalarFunctionDynamicDescriptor;
+import org.apache.hyracks.algebricks.core.algebra.functions.FunctionIdentifier;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory;
+import org.apache.hyracks.api.context.IHyracksTaskContext;
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+
+/**
+ * Checks whether a string with an edit distance threshold can be filtered with a lower bounding
+ * on number of common grams. This function returns 'true' if the lower bound on the number of
+ * common grams is positive, 'false' otherwise. For example, this function is used during an indexed
+ * nested-loop join based on edit distance. We partition the tuples of the probing dataset into those
+ * that are filterable and those that are not. Those that are filterable are forwarded to the index.
+ * The others are fed into a (non indexed) nested-loop join.
+ */
+public class EditDistanceStringIsFilterableDescriptor extends AbstractScalarFunctionDynamicDescriptor {
+
+    private static final long serialVersionUID = 1L;
+    public static final IFunctionDescriptorFactory FACTORY = new IFunctionDescriptorFactory() {
+        @Override
+        public IFunctionDescriptor createFunctionDescriptor() {
+            return new EditDistanceStringIsFilterableDescriptor();
+        }
+    };
+
+    @Override
+    public IScalarEvaluatorFactory createEvaluatorFactory(final IScalarEvaluatorFactory[] args) {
+        return new IScalarEvaluatorFactory() {
+            private static final long serialVersionUID = 1L;
+
+            @Override
+            public IScalarEvaluator createScalarEvaluator(IHyracksTaskContext ctx) throws HyracksDataException {
+                return new EditDistanceStringIsFilterableEvaluator(args, ctx);
+            }
+        };
+    }
+
+    @Override
+    public FunctionIdentifier getIdentifier() {
+        return BuiltinFunctions.EDIT_DISTANCE_STRING_IS_FILTERABLE;
+    }
+}

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceStringIsFilterableEvaluator.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceStringIsFilterableEvaluator.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceStringIsFilterableEvaluator.java
new file mode 100644
index 0000000..0509f51
--- /dev/null
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/EditDistanceStringIsFilterableEvaluator.java
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.asterix.runtime.evaluators.functions;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.asterix.dataflow.data.nontagged.serde.ABooleanSerializerDeserializer;
+import org.apache.asterix.formats.nontagged.SerializerDeserializerProvider;
+import org.apache.asterix.om.base.ABoolean;
+import org.apache.asterix.om.functions.BuiltinFunctions;
+import org.apache.asterix.om.types.ATypeTag;
+import org.apache.asterix.om.types.BuiltinType;
+import org.apache.asterix.om.types.hierachy.ATypeHierarchy;
+import org.apache.asterix.runtime.exceptions.TypeMismatchException;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory;
+import org.apache.hyracks.api.context.IHyracksTaskContext;
+import org.apache.hyracks.api.dataflow.value.ISerializerDeserializer;
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.data.std.api.IPointable;
+import org.apache.hyracks.data.std.primitive.UTF8StringPointable;
+import org.apache.hyracks.data.std.primitive.VoidPointable;
+import org.apache.hyracks.data.std.util.ArrayBackedValueStorage;
+import org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference;
+
+public class EditDistanceStringIsFilterableEvaluator implements IScalarEvaluator {
+
+    protected final ArrayBackedValueStorage resultStorage = new ArrayBackedValueStorage();
+    protected final DataOutput output = resultStorage.getDataOutput();
+    protected final IPointable stringPtr = new VoidPointable();
+    protected final IPointable edThreshPtr = new VoidPointable();
+    protected final IPointable gramLenPtr = new VoidPointable();
+    protected final IPointable usePrePostPtr = new VoidPointable();
+
+    protected final IScalarEvaluator stringEval;
+    protected final IScalarEvaluator edThreshEval;
+    protected final IScalarEvaluator gramLenEval;
+    protected final IScalarEvaluator usePrePostEval;
+
+    @SuppressWarnings("unchecked")
+    private final ISerializerDeserializer<ABoolean> booleanSerde =
+            SerializerDeserializerProvider.INSTANCE.getSerializerDeserializer(BuiltinType.ABOOLEAN);
+
+    private final UTF8StringPointable utf8Ptr = new UTF8StringPointable();
+
+    public EditDistanceStringIsFilterableEvaluator(IScalarEvaluatorFactory[] args, IHyracksTaskContext context)
+            throws HyracksDataException {
+        stringEval = args[0].createScalarEvaluator(context);
+        edThreshEval = args[1].createScalarEvaluator(context);
+        gramLenEval = args[2].createScalarEvaluator(context);
+        usePrePostEval = args[3].createScalarEvaluator(context);
+    }
+
+    @Override
+    public void evaluate(IFrameTupleReference tuple, IPointable result) throws HyracksDataException {
+        resultStorage.reset();
+
+        stringEval.evaluate(tuple, stringPtr);
+        edThreshEval.evaluate(tuple, edThreshPtr);
+        gramLenEval.evaluate(tuple, gramLenPtr);
+        usePrePostEval.evaluate(tuple, usePrePostPtr);
+
+        // Check type and compute string length.
+        byte typeTag = stringPtr.getByteArray()[stringPtr.getStartOffset()];
+        if (typeTag != ATypeTag.SERIALIZED_STRING_TYPE_TAG) {
+            throw new TypeMismatchException(BuiltinFunctions.EDIT_DISTANCE_STRING_IS_FILTERABLE, 0, typeTag,
+                    ATypeTag.SERIALIZED_STRING_TYPE_TAG);
+        }
+        utf8Ptr.set(stringPtr.getByteArray(), stringPtr.getStartOffset() + 1, stringPtr.getLength());
+        int strLen = utf8Ptr.getStringLength();
+
+        // Check type and extract edit-distance threshold.
+        long edThresh = ATypeHierarchy.getIntegerValue(BuiltinFunctions.EDIT_DISTANCE_LIST_IS_FILTERABLE.getName(), 1,
+                edThreshPtr.getByteArray(), edThreshPtr.getStartOffset());
+
+        // Check type and extract gram length.
+        long gramLen = ATypeHierarchy.getIntegerValue(BuiltinFunctions.EDIT_DISTANCE_LIST_IS_FILTERABLE.getName(), 2,
+                gramLenPtr.getByteArray(), gramLenPtr.getStartOffset());
+
+        // Check type and extract usePrePost flag.
+        typeTag = usePrePostPtr.getByteArray()[usePrePostPtr.getStartOffset()];
+        if (typeTag != ATypeTag.SERIALIZED_BOOLEAN_TYPE_TAG) {
+            throw new TypeMismatchException(BuiltinFunctions.EDIT_DISTANCE_STRING_IS_FILTERABLE, 3, typeTag,
+                    ATypeTag.SERIALIZED_BOOLEAN_TYPE_TAG);
+        }
+        boolean usePrePost = ABooleanSerializerDeserializer.getBoolean(usePrePostPtr.getByteArray(),
+                usePrePostPtr.getStartOffset() + 1);
+
+        // Compute result.
+        long numGrams = usePrePost ? strLen + gramLen - 1 : strLen - gramLen + 1;
+        long lowerBound = numGrams - edThresh * gramLen;
+        try {
+            if (lowerBound <= 0 || strLen == 0) {
+                booleanSerde.serialize(ABoolean.FALSE, output);
+            } else {
+                booleanSerde.serialize(ABoolean.TRUE, output);
+            }
+        } catch (IOException e) {
+            throw new HyracksDataException(e);
+        }
+        result.set(resultStorage);
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/GramTokensDescriptor.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/GramTokensDescriptor.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/GramTokensDescriptor.java
new file mode 100644
index 0000000..190013b
--- /dev/null
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/GramTokensDescriptor.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.runtime.evaluators.functions;
+
+import org.apache.asterix.om.functions.BuiltinFunctions;
+import org.apache.asterix.om.functions.IFunctionDescriptor;
+import org.apache.asterix.om.functions.IFunctionDescriptorFactory;
+import org.apache.asterix.om.types.BuiltinType;
+import org.apache.asterix.runtime.evaluators.base.AbstractScalarFunctionDynamicDescriptor;
+import org.apache.asterix.runtime.evaluators.common.GramTokensEvaluator;
+import org.apache.hyracks.algebricks.core.algebra.functions.FunctionIdentifier;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory;
+import org.apache.hyracks.api.context.IHyracksTaskContext;
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.ITokenFactory;
+import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.NGramUTF8StringBinaryTokenizer;
+import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.UTF8NGramTokenFactory;
+
+public class GramTokensDescriptor extends AbstractScalarFunctionDynamicDescriptor {
+
+    private static final long serialVersionUID = 1L;
+    public static final IFunctionDescriptorFactory FACTORY = new IFunctionDescriptorFactory() {
+        @Override
+        public IFunctionDescriptor createFunctionDescriptor() {
+            return new GramTokensDescriptor();
+        }
+    };
+
+    @Override
+    public FunctionIdentifier getIdentifier() {
+        return BuiltinFunctions.GRAM_TOKENS;
+    }
+
+    @Override
+    public IScalarEvaluatorFactory createEvaluatorFactory(final IScalarEvaluatorFactory[] args) {
+        return new IScalarEvaluatorFactory() {
+            private static final long serialVersionUID = 1L;
+
+            @Override
+            public IScalarEvaluator createScalarEvaluator(IHyracksTaskContext ctx) throws HyracksDataException {
+                ITokenFactory tokenFactory = new UTF8NGramTokenFactory();
+                NGramUTF8StringBinaryTokenizer tokenizer =
+                        new NGramUTF8StringBinaryTokenizer(3, true, true, true, tokenFactory);
+                return new GramTokensEvaluator(args, ctx, tokenizer, BuiltinType.ASTRING);
+            }
+        };
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/HashedGramTokensDescriptor.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/HashedGramTokensDescriptor.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/HashedGramTokensDescriptor.java
new file mode 100644
index 0000000..32dc292
--- /dev/null
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/HashedGramTokensDescriptor.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.runtime.evaluators.functions;
+
+import org.apache.asterix.om.functions.BuiltinFunctions;
+import org.apache.asterix.om.functions.IFunctionDescriptor;
+import org.apache.asterix.om.functions.IFunctionDescriptorFactory;
+import org.apache.asterix.om.types.BuiltinType;
+import org.apache.asterix.runtime.evaluators.base.AbstractScalarFunctionDynamicDescriptor;
+import org.apache.asterix.runtime.evaluators.common.GramTokensEvaluator;
+import org.apache.hyracks.algebricks.core.algebra.functions.FunctionIdentifier;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory;
+import org.apache.hyracks.api.context.IHyracksTaskContext;
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.HashedUTF8NGramTokenFactory;
+import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.ITokenFactory;
+import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.NGramUTF8StringBinaryTokenizer;
+
+public class HashedGramTokensDescriptor extends AbstractScalarFunctionDynamicDescriptor {
+
+    private static final long serialVersionUID = 1L;
+    public static final IFunctionDescriptorFactory FACTORY = new IFunctionDescriptorFactory() {
+        @Override
+        public IFunctionDescriptor createFunctionDescriptor() {
+            return new HashedGramTokensDescriptor();
+        }
+    };
+
+    @Override
+    public FunctionIdentifier getIdentifier() {
+        return BuiltinFunctions.HASHED_GRAM_TOKENS;
+    }
+
+    @Override
+    public IScalarEvaluatorFactory createEvaluatorFactory(final IScalarEvaluatorFactory[] args) {
+        return new IScalarEvaluatorFactory() {
+            private static final long serialVersionUID = 1L;
+
+            @Override
+            public IScalarEvaluator createScalarEvaluator(IHyracksTaskContext ctx) throws HyracksDataException {
+                ITokenFactory tokenFactory = new HashedUTF8NGramTokenFactory();
+                NGramUTF8StringBinaryTokenizer tokenizer =
+                        new NGramUTF8StringBinaryTokenizer(3, true, true, true, tokenFactory);
+                return new GramTokensEvaluator(args, ctx, tokenizer, BuiltinType.AINT32);
+            }
+        };
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/HashedWordTokensDescriptor.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/HashedWordTokensDescriptor.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/HashedWordTokensDescriptor.java
new file mode 100644
index 0000000..e224fce
--- /dev/null
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/HashedWordTokensDescriptor.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.runtime.evaluators.functions;
+
+import org.apache.asterix.om.functions.BuiltinFunctions;
+import org.apache.asterix.om.functions.IFunctionDescriptor;
+import org.apache.asterix.om.functions.IFunctionDescriptorFactory;
+import org.apache.asterix.om.types.BuiltinType;
+import org.apache.asterix.runtime.evaluators.base.AbstractScalarFunctionDynamicDescriptor;
+import org.apache.asterix.runtime.evaluators.common.WordTokensEvaluator;
+import org.apache.hyracks.algebricks.core.algebra.functions.FunctionIdentifier;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory;
+import org.apache.hyracks.api.context.IHyracksTaskContext;
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.DelimitedUTF8StringBinaryTokenizer;
+import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.HashedUTF8WordTokenFactory;
+import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizer;
+import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.ITokenFactory;
+
+public class HashedWordTokensDescriptor extends AbstractScalarFunctionDynamicDescriptor {
+
+    private static final long serialVersionUID = 1L;
+    public static final IFunctionDescriptorFactory FACTORY = new IFunctionDescriptorFactory() {
+        @Override
+        public IFunctionDescriptor createFunctionDescriptor() {
+            return new HashedWordTokensDescriptor();
+        }
+    };
+
+    @Override
+    public FunctionIdentifier getIdentifier() {
+        return BuiltinFunctions.HASHED_WORD_TOKENS;
+    }
+
+    @Override
+    public IScalarEvaluatorFactory createEvaluatorFactory(final IScalarEvaluatorFactory[] args) {
+        return new IScalarEvaluatorFactory() {
+            private static final long serialVersionUID = 1L;
+
+            @Override
+            public IScalarEvaluator createScalarEvaluator(IHyracksTaskContext ctx) throws HyracksDataException {
+                ITokenFactory tokenFactory = new HashedUTF8WordTokenFactory();
+                IBinaryTokenizer tokenizer = new DelimitedUTF8StringBinaryTokenizer(true, true, tokenFactory);
+                return new WordTokensEvaluator(args, ctx, tokenizer, BuiltinType.AINT32);
+            }
+        };
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/PrefixLenDescriptor.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/PrefixLenDescriptor.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/PrefixLenDescriptor.java
new file mode 100644
index 0000000..c9a865b
--- /dev/null
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/PrefixLenDescriptor.java
@@ -0,0 +1,141 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.runtime.evaluators.functions;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.asterix.common.functions.FunctionConstants;
+import org.apache.asterix.dataflow.data.nontagged.serde.ADoubleSerializerDeserializer;
+import org.apache.asterix.formats.nontagged.SerializerDeserializerProvider;
+import org.apache.asterix.fuzzyjoin.similarity.SimilarityFilters;
+import org.apache.asterix.om.base.AInt32;
+import org.apache.asterix.om.base.AMutableInt32;
+import org.apache.asterix.runtime.exceptions.TypeMismatchException;
+import org.apache.asterix.om.functions.IFunctionDescriptor;
+import org.apache.asterix.om.functions.IFunctionDescriptorFactory;
+import org.apache.asterix.om.types.ATypeTag;
+import org.apache.asterix.om.types.BuiltinType;
+import org.apache.asterix.runtime.evaluators.base.AbstractScalarFunctionDynamicDescriptor;
+import org.apache.asterix.runtime.evaluators.common.SimilarityFiltersCache;
+import org.apache.hyracks.algebricks.core.algebra.functions.FunctionIdentifier;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory;
+import org.apache.hyracks.api.context.IHyracksTaskContext;
+import org.apache.hyracks.api.dataflow.value.ISerializerDeserializer;
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.data.std.api.IPointable;
+import org.apache.hyracks.data.std.primitive.IntegerPointable;
+import org.apache.hyracks.data.std.primitive.VoidPointable;
+import org.apache.hyracks.data.std.util.ArrayBackedValueStorage;
+import org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference;
+
+public class PrefixLenDescriptor extends AbstractScalarFunctionDynamicDescriptor {
+
+    private static final long serialVersionUID = 1L;
+    private final static FunctionIdentifier FID =
+            new FunctionIdentifier(FunctionConstants.ASTERIX_NS, "prefix-len@3", 3);
+    public static final IFunctionDescriptorFactory FACTORY = new IFunctionDescriptorFactory() {
+        @Override
+        public IFunctionDescriptor createFunctionDescriptor() {
+            return new PrefixLenDescriptor();
+        }
+    };
+
+    @Override
+    public IScalarEvaluatorFactory createEvaluatorFactory(final IScalarEvaluatorFactory[] args) {
+        return new IScalarEvaluatorFactory() {
+            private static final long serialVersionUID = 1L;
+
+            @Override
+            public IScalarEvaluator createScalarEvaluator(final IHyracksTaskContext ctx) throws HyracksDataException {
+
+                return new IScalarEvaluator() {
+
+                    private final ArrayBackedValueStorage resultStorage = new ArrayBackedValueStorage();
+                    private final DataOutput out = resultStorage.getDataOutput();
+                    private final IPointable inputVal = new VoidPointable();
+                    private final IScalarEvaluator evalLen = args[0].createScalarEvaluator(ctx);
+                    private final IScalarEvaluator evalSimilarity = args[1].createScalarEvaluator(ctx);
+                    private final IScalarEvaluator evalThreshold = args[2].createScalarEvaluator(ctx);
+
+                    private final SimilarityFiltersCache similarityFiltersCache = new SimilarityFiltersCache();
+
+                    // result
+                    private final AMutableInt32 res = new AMutableInt32(0);
+                    @SuppressWarnings("unchecked")
+                    private final ISerializerDeserializer<AInt32> int32Serde =
+                            SerializerDeserializerProvider.INSTANCE.getSerializerDeserializer(BuiltinType.AINT32);
+
+                    @Override
+                    public void evaluate(IFrameTupleReference tuple, IPointable result) throws HyracksDataException {
+                        resultStorage.reset();
+                        // length
+                        evalLen.evaluate(tuple, inputVal);
+                        byte[] data = inputVal.getByteArray();
+                        int offset = inputVal.getStartOffset();
+                        if (data[offset] != ATypeTag.SERIALIZED_INT32_TYPE_TAG) {
+                            throw new TypeMismatchException(getIdentifier(), 0, data[offset],
+                                    ATypeTag.SERIALIZED_INT32_TYPE_TAG);
+                        }
+                        int length = IntegerPointable.getInteger(data, offset + 1);
+
+                        // similarity threshold
+                        evalThreshold.evaluate(tuple, inputVal);
+                        data = inputVal.getByteArray();
+                        offset = inputVal.getStartOffset();
+                        if (data[offset] != ATypeTag.SERIALIZED_DOUBLE_TYPE_TAG) {
+                            throw new TypeMismatchException(getIdentifier(), 1, data[offset],
+                                    ATypeTag.SERIALIZED_DOUBLE_TYPE_TAG);
+                        }
+                        float similarityThreshold = (float) ADoubleSerializerDeserializer.getDouble(data, offset + 1);
+
+                        // similarity name
+                        evalSimilarity.evaluate(tuple, inputVal);
+                        data = inputVal.getByteArray();
+                        offset = inputVal.getStartOffset();
+                        int len = inputVal.getLength();
+                        if (data[offset] != ATypeTag.SERIALIZED_STRING_TYPE_TAG) {
+                            throw new TypeMismatchException(getIdentifier(), 2, data[offset],
+                                    ATypeTag.SERIALIZED_STRING_TYPE_TAG);
+                        }
+                        SimilarityFilters similarityFilters =
+                                similarityFiltersCache.get(similarityThreshold, data, offset, len);
+
+                        int prefixLength = similarityFilters.getPrefixLength(length);
+                        res.setValue(prefixLength);
+
+                        try {
+                            int32Serde.serialize(res, out);
+                        } catch (IOException e) {
+                            throw new HyracksDataException(e);
+                        }
+                        result.set(resultStorage);
+                    }
+                };
+            }
+        };
+    }
+
+    @Override
+    public FunctionIdentifier getIdentifier() {
+        return FID;
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/PrefixLenJaccardDescriptor.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/PrefixLenJaccardDescriptor.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/PrefixLenJaccardDescriptor.java
new file mode 100644
index 0000000..25d4be3
--- /dev/null
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/PrefixLenJaccardDescriptor.java
@@ -0,0 +1,126 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.runtime.evaluators.functions;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.asterix.dataflow.data.nontagged.serde.AFloatSerializerDeserializer;
+import org.apache.asterix.formats.nontagged.SerializerDeserializerProvider;
+import org.apache.asterix.fuzzyjoin.similarity.SimilarityFiltersJaccard;
+import org.apache.asterix.om.base.AInt32;
+import org.apache.asterix.om.base.AMutableInt32;
+import org.apache.asterix.om.functions.BuiltinFunctions;
+import org.apache.asterix.om.functions.IFunctionDescriptor;
+import org.apache.asterix.om.functions.IFunctionDescriptorFactory;
+import org.apache.asterix.om.types.ATypeTag;
+import org.apache.asterix.om.types.BuiltinType;
+import org.apache.asterix.om.types.hierachy.ATypeHierarchy;
+import org.apache.asterix.runtime.evaluators.base.AbstractScalarFunctionDynamicDescriptor;
+import org.apache.asterix.runtime.exceptions.TypeMismatchException;
+import org.apache.hyracks.algebricks.core.algebra.functions.FunctionIdentifier;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory;
+import org.apache.hyracks.api.context.IHyracksTaskContext;
+import org.apache.hyracks.api.dataflow.value.ISerializerDeserializer;
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.data.std.api.IPointable;
+import org.apache.hyracks.data.std.primitive.VoidPointable;
+import org.apache.hyracks.data.std.util.ArrayBackedValueStorage;
+import org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference;
+
+public class PrefixLenJaccardDescriptor extends AbstractScalarFunctionDynamicDescriptor {
+
+    private static final long serialVersionUID = 1L;
+    public static final IFunctionDescriptorFactory FACTORY = new IFunctionDescriptorFactory() {
+        @Override
+        public IFunctionDescriptor createFunctionDescriptor() {
+            return new PrefixLenJaccardDescriptor();
+        }
+    };
+
+    @Override
+    public IScalarEvaluatorFactory createEvaluatorFactory(final IScalarEvaluatorFactory[] args) {
+        return new IScalarEvaluatorFactory() {
+            private static final long serialVersionUID = 1L;
+
+            @Override
+            public IScalarEvaluator createScalarEvaluator(final IHyracksTaskContext ctx) throws HyracksDataException {
+
+                return new IScalarEvaluator() {
+
+                    private final ArrayBackedValueStorage resultStorage = new ArrayBackedValueStorage();
+                    private final DataOutput out = resultStorage.getDataOutput();
+                    private final IPointable lenPtr = new VoidPointable();
+                    private final IPointable thresholdPtr = new VoidPointable();
+                    private final IScalarEvaluator evalLen = args[0].createScalarEvaluator(ctx);
+                    private final IScalarEvaluator evalThreshold = args[1].createScalarEvaluator(ctx);
+
+                    private float similarityThresholdCache;
+                    private SimilarityFiltersJaccard similarityFilters;
+
+                    // result
+                    private final AMutableInt32 res = new AMutableInt32(0);
+                    @SuppressWarnings("unchecked")
+                    private final ISerializerDeserializer<AInt32> int32Serde =
+                            SerializerDeserializerProvider.INSTANCE.getSerializerDeserializer(BuiltinType.AINT32);
+
+                    @Override
+                    public void evaluate(IFrameTupleReference tuple, IPointable result) throws HyracksDataException {
+                        resultStorage.reset();
+                        evalLen.evaluate(tuple, lenPtr);
+                        evalThreshold.evaluate(tuple, thresholdPtr);
+
+                        // length
+                        int length = ATypeHierarchy.getIntegerValue(getIdentifier().getName(), 0, lenPtr.getByteArray(),
+                                lenPtr.getStartOffset());
+                        // similarity threshold
+                        byte[] data = thresholdPtr.getByteArray();
+                        int offset = thresholdPtr.getStartOffset();
+                        if (data[offset] != ATypeTag.SERIALIZED_FLOAT_TYPE_TAG) {
+                            throw new TypeMismatchException(getIdentifier(), 1, data[offset],
+                                    ATypeTag.SERIALIZED_FLOAT_TYPE_TAG);
+                        }
+                        float similarityThreshold = AFloatSerializerDeserializer.getFloat(data, offset + 1);
+
+                        if (similarityThreshold != similarityThresholdCache || similarityFilters == null) {
+                            similarityFilters = new SimilarityFiltersJaccard(similarityThreshold);
+                        }
+
+                        int prefixLength = similarityFilters.getPrefixLength(length);
+                        res.setValue(prefixLength);
+
+                        try {
+                            int32Serde.serialize(res, out);
+                        } catch (IOException e) {
+                            throw new HyracksDataException(e);
+                        }
+                        result.set(resultStorage);
+                    }
+                };
+            }
+        };
+    }
+
+    @Override
+    public FunctionIdentifier getIdentifier() {
+        return BuiltinFunctions.PREFIX_LEN_JACCARD;
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/SimilarityDescriptor.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/SimilarityDescriptor.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/SimilarityDescriptor.java
new file mode 100644
index 0000000..8584d06
--- /dev/null
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/SimilarityDescriptor.java
@@ -0,0 +1,268 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.runtime.evaluators.functions;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.asterix.common.functions.FunctionConstants;
+import org.apache.asterix.dataflow.data.nontagged.serde.ADoubleSerializerDeserializer;
+import org.apache.asterix.dataflow.data.nontagged.serde.AOrderedListSerializerDeserializer;
+import org.apache.asterix.dataflow.data.nontagged.serde.AUnorderedListSerializerDeserializer;
+import org.apache.asterix.formats.nontagged.SerializerDeserializerProvider;
+import org.apache.asterix.fuzzyjoin.IntArray;
+import org.apache.asterix.fuzzyjoin.similarity.PartialIntersect;
+import org.apache.asterix.fuzzyjoin.similarity.SimilarityFilters;
+import org.apache.asterix.fuzzyjoin.similarity.SimilarityMetric;
+import org.apache.asterix.om.base.ADouble;
+import org.apache.asterix.om.base.AMutableDouble;
+import org.apache.asterix.om.functions.IFunctionDescriptor;
+import org.apache.asterix.om.functions.IFunctionDescriptorFactory;
+import org.apache.asterix.om.types.ATypeTag;
+import org.apache.asterix.om.types.BuiltinType;
+import org.apache.asterix.runtime.evaluators.base.AbstractScalarFunctionDynamicDescriptor;
+import org.apache.asterix.runtime.evaluators.common.SimilarityFiltersCache;
+import org.apache.asterix.runtime.exceptions.TypeMismatchException;
+import org.apache.hyracks.algebricks.core.algebra.functions.FunctionIdentifier;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory;
+import org.apache.hyracks.api.context.IHyracksTaskContext;
+import org.apache.hyracks.api.dataflow.value.ISerializerDeserializer;
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.data.std.api.IPointable;
+import org.apache.hyracks.data.std.primitive.IntegerPointable;
+import org.apache.hyracks.data.std.primitive.VoidPointable;
+import org.apache.hyracks.data.std.util.ArrayBackedValueStorage;
+import org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference;
+
+public class SimilarityDescriptor extends AbstractScalarFunctionDynamicDescriptor {
+
+    private static final long serialVersionUID = 1L;
+    private final static FunctionIdentifier FID =
+            new FunctionIdentifier(FunctionConstants.ASTERIX_NS, "similarity@7", 7);
+    public static final IFunctionDescriptorFactory FACTORY = new IFunctionDescriptorFactory() {
+        @Override
+        public IFunctionDescriptor createFunctionDescriptor() {
+            return new SimilarityDescriptor();
+        }
+    };
+
+    @Override
+    public IScalarEvaluatorFactory createEvaluatorFactory(final IScalarEvaluatorFactory[] args) {
+        return new IScalarEvaluatorFactory() {
+            private static final long serialVersionUID = 1L;
+
+            @Override
+            public IScalarEvaluator createScalarEvaluator(final IHyracksTaskContext ctx) throws HyracksDataException {
+
+                return new IScalarEvaluator() {
+
+                    private final ArrayBackedValueStorage resultStorage = new ArrayBackedValueStorage();
+                    private final DataOutput out = resultStorage.getDataOutput();
+                    private final IPointable inputVal = new VoidPointable();
+                    private final IScalarEvaluator evalLen1 = args[0].createScalarEvaluator(ctx);
+                    private final IScalarEvaluator evalTokens1 = args[1].createScalarEvaluator(ctx);
+                    private final IScalarEvaluator evalLen2 = args[2].createScalarEvaluator(ctx);
+                    private final IScalarEvaluator evalTokens2 = args[3].createScalarEvaluator(ctx);
+                    private final IScalarEvaluator evalTokenPrefix = args[4].createScalarEvaluator(ctx);
+                    private final IScalarEvaluator evalSimilarity = args[5].createScalarEvaluator(ctx);
+                    private final IScalarEvaluator evalThreshold = args[6].createScalarEvaluator(ctx);
+
+                    private final SimilarityFiltersCache similarityFiltersCache = new SimilarityFiltersCache();
+
+                    private final IntArray tokens1 = new IntArray();
+                    private final IntArray tokens2 = new IntArray();
+                    private final PartialIntersect parInter = new PartialIntersect();
+
+                    // result
+                    private final AMutableDouble res = new AMutableDouble(0);
+                    @SuppressWarnings("unchecked")
+                    private final ISerializerDeserializer<ADouble> doubleSerde =
+                            SerializerDeserializerProvider.INSTANCE.getSerializerDeserializer(BuiltinType.ADOUBLE);
+
+                    @Override
+                    public void evaluate(IFrameTupleReference tuple, IPointable result) throws HyracksDataException {
+                        resultStorage.reset();
+                        // similarity threshold
+                        evalThreshold.evaluate(tuple, inputVal);
+                        byte[] data = inputVal.getByteArray();
+                        int offset = inputVal.getStartOffset();
+
+                        if (data[offset] != ATypeTag.SERIALIZED_DOUBLE_TYPE_TAG) {
+                            throw new TypeMismatchException(getIdentifier(), 0, data[offset],
+                                    ATypeTag.SERIALIZED_DOUBLE_TYPE_TAG);
+                        }
+                        float similarityThreshold = (float) ADoubleSerializerDeserializer.getDouble(data, offset + 1);
+
+                        // similarity name
+                        evalSimilarity.evaluate(tuple, inputVal);
+                        data = inputVal.getByteArray();
+                        offset = inputVal.getStartOffset();
+                        int len = inputVal.getLength();
+                        if (data[offset] != ATypeTag.SERIALIZED_STRING_TYPE_TAG) {
+                            throw new TypeMismatchException(getIdentifier(), 1, data[offset],
+                                    ATypeTag.SERIALIZED_DOUBLE_TYPE_TAG);
+                        }
+                        SimilarityFilters similarityFilters =
+                                similarityFiltersCache.get(similarityThreshold, data, offset, len);
+
+                        evalLen1.evaluate(tuple, inputVal);
+                        data = inputVal.getByteArray();
+                        offset = inputVal.getStartOffset();
+                        if (data[offset] != ATypeTag.SERIALIZED_INT32_TYPE_TAG) {
+                            throw new TypeMismatchException(getIdentifier(), 2, data[offset],
+                                    ATypeTag.SERIALIZED_INT32_TYPE_TAG);
+                        }
+                        int length1 = IntegerPointable.getInteger(data, offset + 1);
+
+                        evalLen2.evaluate(tuple, inputVal);
+                        data = inputVal.getByteArray();
+                        offset = inputVal.getStartOffset();
+                        if (data[offset] != ATypeTag.SERIALIZED_INT32_TYPE_TAG) {
+                            throw new TypeMismatchException(getIdentifier(), 3, data[offset],
+                                    ATypeTag.SERIALIZED_INT32_TYPE_TAG);
+                        }
+                        int length2 = IntegerPointable.getInteger(data, offset + 1);
+
+                        float sim = 0;
+
+                        //
+                        // -- - length filter - --
+                        //
+                        if (similarityFilters.passLengthFilter(length1, length2)) {
+
+                            // -- - tokens1 - --
+                            int i;
+                            tokens1.reset();
+                            evalTokens1.evaluate(tuple, inputVal);
+                            byte[] serList = inputVal.getByteArray();
+                            offset = inputVal.getStartOffset();
+
+                            if (serList[offset] != ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG
+                                    && serList[offset] != ATypeTag.SERIALIZED_UNORDEREDLIST_TYPE_TAG) {
+                                throw new TypeMismatchException(getIdentifier(), 4, data[offset],
+                                        ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG,
+                                        ATypeTag.SERIALIZED_UNORDEREDLIST_TYPE_TAG);
+                            }
+
+                            int lengthTokens1;
+                            if (serList[offset] == ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG) {
+                                lengthTokens1 = AOrderedListSerializerDeserializer.getNumberOfItems(serList, offset);
+                                // read tokens
+                                for (i = 0; i < lengthTokens1; i++) {
+                                    int itemOffset =
+                                            AOrderedListSerializerDeserializer.getItemOffset(serList, offset, i);
+                                    tokens1.add(IntegerPointable.getInteger(serList, itemOffset));
+                                }
+                            } else {
+                                lengthTokens1 = AUnorderedListSerializerDeserializer.getNumberOfItems(serList, offset);
+                                // read tokens
+                                for (i = 0; i < lengthTokens1; i++) {
+                                    int itemOffset =
+                                            AUnorderedListSerializerDeserializer.getItemOffset(serList, offset, i);
+                                    tokens1.add(IntegerPointable.getInteger(serList, itemOffset));
+                                }
+                            }
+                            // pad tokens
+                            for (; i < length1; i++) {
+                                tokens1.add(Integer.MAX_VALUE);
+                            }
+
+                            // -- - tokens2 - --
+                            tokens2.reset();
+                            evalTokens2.evaluate(tuple, inputVal);
+                            serList = inputVal.getByteArray();
+                            offset = inputVal.getStartOffset();
+
+                            if (serList[offset] != ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG
+                                    && serList[offset] != ATypeTag.SERIALIZED_UNORDEREDLIST_TYPE_TAG) {
+                                throw new TypeMismatchException(getIdentifier(), 5, data[offset],
+                                        ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG,
+                                        ATypeTag.SERIALIZED_UNORDEREDLIST_TYPE_TAG);
+                            }
+
+                            int lengthTokens2;
+                            if (serList[0] == ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG) {
+                                lengthTokens2 = AOrderedListSerializerDeserializer.getNumberOfItems(serList, offset);
+                                // read tokens
+                                for (i = 0; i < lengthTokens2; i++) {
+                                    int itemOffset =
+                                            AOrderedListSerializerDeserializer.getItemOffset(serList, offset, i);
+                                    tokens2.add(IntegerPointable.getInteger(serList, itemOffset));
+                                }
+                            } else {
+                                lengthTokens2 = AUnorderedListSerializerDeserializer.getNumberOfItems(serList, offset);
+                                // read tokens
+                                for (i = 0; i < lengthTokens2; i++) {
+                                    int itemOffset =
+                                            AUnorderedListSerializerDeserializer.getItemOffset(serList, offset, i);
+                                    tokens2.add(IntegerPointable.getInteger(serList, itemOffset));
+                                }
+                            }
+                            // pad tokens
+                            for (; i < length2; i++) {
+                                tokens2.add(Integer.MAX_VALUE);
+                            }
+
+                            // -- - token prefix - --
+                            evalTokenPrefix.evaluate(tuple, inputVal);
+                            int tokenPrefix =
+                                    IntegerPointable.getInteger(inputVal.getByteArray(), inputVal.getStartOffset() + 1);
+
+                            //
+                            // -- - position filter - --
+                            //
+                            SimilarityMetric.getPartialIntersectSize(tokens1.get(), 0, tokens1.length(), tokens2.get(),
+                                    0, tokens2.length(), tokenPrefix, parInter);
+                            if (similarityFilters.passPositionFilter(parInter.intersectSize, parInter.posXStop, length1,
+                                    parInter.posYStop, length2)) {
+
+                                //
+                                // -- - suffix filter - --
+                                //
+                                if (similarityFilters.passSuffixFilter(tokens1.get(), 0, tokens1.length(),
+                                        parInter.posXStart, tokens2.get(), 0, tokens2.length(), parInter.posYStart)) {
+
+                                    sim = similarityFilters.passSimilarityFilter(tokens1.get(), 0, tokens1.length(),
+                                            parInter.posXStop + 1, tokens2.get(), 0, tokens2.length(),
+                                            parInter.posYStop + 1, parInter.intersectSize);
+                                }
+                            }
+                        }
+
+                        res.setValue(sim);
+
+                        try {
+                            doubleSerde.serialize(res, out);
+                        } catch (IOException e) {
+                            throw HyracksDataException.create(e);
+                        }
+                        result.set(resultStorage);
+                    }
+                };
+            }
+        };
+    }
+
+    @Override
+    public FunctionIdentifier getIdentifier() {
+        return FID;
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/SimilarityJaccardCheckDescriptor.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/SimilarityJaccardCheckDescriptor.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/SimilarityJaccardCheckDescriptor.java
new file mode 100644
index 0000000..9fc5b34
--- /dev/null
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/SimilarityJaccardCheckDescriptor.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.runtime.evaluators.functions;
+
+import org.apache.asterix.om.functions.BuiltinFunctions;
+import org.apache.asterix.om.functions.IFunctionDescriptor;
+import org.apache.asterix.om.functions.IFunctionDescriptorFactory;
+import org.apache.asterix.runtime.evaluators.base.AbstractScalarFunctionDynamicDescriptor;
+import org.apache.asterix.runtime.evaluators.common.SimilarityJaccardCheckEvaluator;
+import org.apache.hyracks.algebricks.core.algebra.functions.FunctionIdentifier;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory;
+import org.apache.hyracks.api.context.IHyracksTaskContext;
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+
+public class SimilarityJaccardCheckDescriptor extends AbstractScalarFunctionDynamicDescriptor {
+
+    private static final long serialVersionUID = 1L;
+    public static final IFunctionDescriptorFactory FACTORY = new IFunctionDescriptorFactory() {
+        @Override
+        public IFunctionDescriptor createFunctionDescriptor() {
+            return new SimilarityJaccardCheckDescriptor();
+        }
+    };
+
+    @Override
+    public IScalarEvaluatorFactory createEvaluatorFactory(final IScalarEvaluatorFactory[] args) {
+        return new IScalarEvaluatorFactory() {
+            private static final long serialVersionUID = 1L;
+
+            @Override
+            public IScalarEvaluator createScalarEvaluator(IHyracksTaskContext ctx) throws HyracksDataException {
+                return new SimilarityJaccardCheckEvaluator(args, ctx);
+            }
+        };
+    }
+
+    @Override
+    public FunctionIdentifier getIdentifier() {
+        return BuiltinFunctions.SIMILARITY_JACCARD_CHECK;
+    }
+}

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/SimilarityJaccardDescriptor.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/SimilarityJaccardDescriptor.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/SimilarityJaccardDescriptor.java
new file mode 100644
index 0000000..742f2ff
--- /dev/null
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/functions/SimilarityJaccardDescriptor.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.runtime.evaluators.functions;
+
+import org.apache.asterix.om.functions.BuiltinFunctions;
+import org.apache.asterix.om.functions.IFunctionDescriptor;
+import org.apache.asterix.om.functions.IFunctionDescriptorFactory;
+import org.apache.asterix.runtime.evaluators.base.AbstractScalarFunctionDynamicDescriptor;
+import org.apache.asterix.runtime.evaluators.common.SimilarityJaccardEvaluator;
+import org.apache.hyracks.algebricks.core.algebra.functions.FunctionIdentifier;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory;
+import org.apache.hyracks.api.context.IHyracksTaskContext;
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+
+public class SimilarityJaccardDescriptor extends AbstractScalarFunctionDynamicDescriptor {
+
+    private static final long serialVersionUID = 1L;
+    public static final IFunctionDescriptorFactory FACTORY = new IFunctionDescriptorFactory() {
+        @Override
+        public IFunctionDescriptor createFunctionDescriptor() {
+            return new SimilarityJaccardDescriptor();
+        }
+    };
+
+    @Override
+    public IScalarEvaluatorFactory createEvaluatorFactory(final IScalarEvaluatorFactory[] args) {
+        return new IScalarEvaluatorFactory() {
+            private static final long serialVersionUID = 1L;
+
+            @Override
+            public IScalarEvaluator createScalarEvaluator(IHyracksTaskContext ctx) throws HyracksDataException {
+                return new SimilarityJaccardEvaluator(args, ctx);
+            }
+        };
+    }
+
+    @Override
+    public FunctionIdentifier getIdentifier() {
+        return BuiltinFunctions.SIMILARITY_JACCARD;
+    }
+
+}


Mime
View raw message