asterixdb-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jianf...@apache.org
Subject [07/15] incubator-asterixdb git commit: ASTERIXDB-1102: VarSize Encoding to store length of String and ByteArray
Date Thu, 29 Oct 2015 04:44:58 GMT
http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/HashedUTF8NGramTokenFactory.java
----------------------------------------------------------------------
diff --git a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/HashedUTF8NGramTokenFactory.java b/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/HashedUTF8NGramTokenFactory.java
deleted file mode 100644
index 6472b68..0000000
--- a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/HashedUTF8NGramTokenFactory.java
+++ /dev/null
@@ -1,38 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenizer;
-
-public class HashedUTF8NGramTokenFactory extends AbstractUTF8TokenFactory {
-
-    private static final long serialVersionUID = 1L;
-
-    public HashedUTF8NGramTokenFactory() {
-        super();
-    }
-
-    public HashedUTF8NGramTokenFactory(byte tokenTypeTag, byte countTypeTag) {
-        super(tokenTypeTag, countTypeTag);
-    }
-
-    @Override
-    public IToken createToken() {
-        return new HashedUTF8NGramToken(tokenTypeTag, countTypeTag);
-    }
-}

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/HashedUTF8WordToken.java
----------------------------------------------------------------------
diff --git a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/HashedUTF8WordToken.java b/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/HashedUTF8WordToken.java
deleted file mode 100644
index 6911b25..0000000
--- a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/HashedUTF8WordToken.java
+++ /dev/null
@@ -1,84 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenizer;
-
-import java.io.DataOutput;
-import java.io.IOException;
-
-public class HashedUTF8WordToken extends UTF8WordToken {
-
-    private int hash = 0;
-
-    public HashedUTF8WordToken(byte tokenTypeTag, byte countTypeTag) {
-        super(tokenTypeTag, countTypeTag);
-    }
-
-    @Override
-    public boolean equals(Object o) {
-        if (o == null) {
-            return false;
-        }
-        if (!(o instanceof IToken)) {
-            return false;
-        }
-        IToken t = (IToken) o;
-        if (t.getTokenLength() != tokenLength) {
-            return false;
-        }
-        int offset = 0;
-        for (int i = 0; i < tokenLength; i++) {
-            if (StringUtils.charAt(t.getData(), t.getStart() + offset) != StringUtils.charAt(data, start + offset)) {
-                return false;
-            }
-            offset += StringUtils.charSize(data, start + offset);
-        }
-        return true;
-    }
-
-    @Override
-    public int hashCode() {
-        return hash;
-    }
-
-    @Override
-    public void reset(byte[] data, int start, int length, int tokenLength, int tokenCount) {
-        super.reset(data, start, length, tokenLength, tokenCount);
-
-        // pre-compute hash value using JAQL-like string hashing
-        int pos = start;
-        hash = GOLDEN_RATIO_32;
-        for (int i = 0; i < tokenLength; i++) {
-            hash ^= StringUtils.toLowerCase(StringUtils.charAt(data, pos));
-            hash *= GOLDEN_RATIO_32;
-            pos += StringUtils.charSize(data, pos);
-        }
-        hash += tokenCount;
-    }
-
-    @Override
-    public void serializeToken(DataOutput dos) throws IOException {
-        if (tokenTypeTag > 0) {
-            dos.write(tokenTypeTag);
-        }
-
-        // serialize hash value
-        dos.writeInt(hash);
-    }
-}

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/HashedUTF8WordTokenFactory.java
----------------------------------------------------------------------
diff --git a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/HashedUTF8WordTokenFactory.java b/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/HashedUTF8WordTokenFactory.java
deleted file mode 100644
index 50bc67c..0000000
--- a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/HashedUTF8WordTokenFactory.java
+++ /dev/null
@@ -1,38 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenizer;
-
-public class HashedUTF8WordTokenFactory extends AbstractUTF8TokenFactory {
-
-    private static final long serialVersionUID = 1L;
-
-    public HashedUTF8WordTokenFactory() {
-        super();
-    }
-
-    public HashedUTF8WordTokenFactory(byte tokenTypeTag, byte countTypeTag) {
-        super(tokenTypeTag, countTypeTag);
-    }
-
-    @Override
-    public IToken createToken() {
-        return new HashedUTF8WordToken(tokenTypeTag, countTypeTag);
-    }
-}

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/IBinaryTokenizer.java
----------------------------------------------------------------------
diff --git a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/IBinaryTokenizer.java b/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/IBinaryTokenizer.java
deleted file mode 100644
index 86359e1..0000000
--- a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/IBinaryTokenizer.java
+++ /dev/null
@@ -1,30 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenizer;
-
-public interface IBinaryTokenizer {
-    public IToken getToken();
-
-    public boolean hasNext();
-
-    public void next();
-
-    public void reset(byte[] data, int start, int length);
-}

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/IBinaryTokenizerFactory.java
----------------------------------------------------------------------
diff --git a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/IBinaryTokenizerFactory.java b/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/IBinaryTokenizerFactory.java
deleted file mode 100644
index f7cf4d5..0000000
--- a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/IBinaryTokenizerFactory.java
+++ /dev/null
@@ -1,26 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenizer;
-
-import java.io.Serializable;
-
-public interface IBinaryTokenizerFactory extends Serializable {
-    public IBinaryTokenizer createTokenizer();
-}

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/INGramToken.java
----------------------------------------------------------------------
diff --git a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/INGramToken.java b/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/INGramToken.java
deleted file mode 100644
index 81f7b44..0000000
--- a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/INGramToken.java
+++ /dev/null
@@ -1,28 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenizer;
-
-public interface INGramToken {
-    public int getNumPostChars();
-
-    public int getNumPreChars();
-
-    public void setNumPrePostChars(int numPreChars, int numPostChars);
-}

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/IToken.java
----------------------------------------------------------------------
diff --git a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/IToken.java b/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/IToken.java
deleted file mode 100644
index 6d7b05d..0000000
--- a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/IToken.java
+++ /dev/null
@@ -1,39 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenizer;
-
-import java.io.DataOutput;
-import java.io.IOException;
-
-public interface IToken {
-    public byte[] getData();
-
-    public int getLength();
-
-    public int getStart();
-
-    public int getTokenLength();
-
-    public void reset(byte[] data, int start, int length, int tokenLength, int tokenCount);
-
-    public void serializeToken(DataOutput dos) throws IOException;
-
-    public void serializeTokenCount(DataOutput dos) throws IOException;
-}

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/ITokenFactory.java
----------------------------------------------------------------------
diff --git a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/ITokenFactory.java b/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/ITokenFactory.java
deleted file mode 100644
index 245530f..0000000
--- a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/ITokenFactory.java
+++ /dev/null
@@ -1,26 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenizer;
-
-import java.io.Serializable;
-
-public interface ITokenFactory extends Serializable {
-    public IToken createToken();
-}

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/NGramUTF8StringBinaryTokenizer.java
----------------------------------------------------------------------
diff --git a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/NGramUTF8StringBinaryTokenizer.java b/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/NGramUTF8StringBinaryTokenizer.java
deleted file mode 100644
index 88c58b2..0000000
--- a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/NGramUTF8StringBinaryTokenizer.java
+++ /dev/null
@@ -1,116 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenizer;
-
-public class NGramUTF8StringBinaryTokenizer extends AbstractUTF8StringBinaryTokenizer {
-
-    private int gramLength;
-    private boolean usePrePost;
-
-    private int gramNum;
-    private int totalGrams;
-
-    private final INGramToken concreteToken;
-
-    public NGramUTF8StringBinaryTokenizer(int gramLength, boolean usePrePost, boolean ignoreTokenCount,
-            boolean sourceHasTypeTag, ITokenFactory tokenFactory) {
-        super(ignoreTokenCount, sourceHasTypeTag, tokenFactory);
-        this.gramLength = gramLength;
-        this.usePrePost = usePrePost;
-        concreteToken = (INGramToken) token;
-    }
-
-    @Override
-    public boolean hasNext() {
-        if (gramNum < totalGrams) {
-            return true;
-        } else {
-            return false;
-        }
-    }
-
-    @Override
-    public void next() {
-        int currentTokenStart = index;
-        int tokenCount = 1;
-        int numPreChars = 0;
-        int numPostChars = 0;
-        if (usePrePost) {
-            numPreChars = Math.max(gramLength - gramNum - 1, 0);
-            numPostChars = (gramNum > totalGrams - gramLength) ? gramLength - totalGrams + gramNum : 0;
-        }
-        gramNum++;
-
-        concreteToken.setNumPrePostChars(numPreChars, numPostChars);
-        if (numPreChars == 0) {
-            index += StringUtils.charSize(data, index);
-        }
-
-        // compute token count
-        // ignore pre and post grams for duplicate detection
-        if (!ignoreTokenCount && numPreChars == 0 && numPostChars == 0) {
-            int tmpIndex = start;
-            while (tmpIndex < currentTokenStart) {
-                tokenCount++; // assume found
-                int offset = 0;
-                for (int j = 0; j < gramLength; j++) {
-                    if (StringUtils.toLowerCase(StringUtils.charAt(data, currentTokenStart + offset)) != StringUtils
-                            .toLowerCase(StringUtils.charAt(data, tmpIndex + offset))) {
-                        tokenCount--;
-                        break;
-                    }
-                    offset += StringUtils.charSize(data, tmpIndex + offset);
-                }
-                tmpIndex += StringUtils.charSize(data, tmpIndex);
-            }
-        }
-
-        // set token
-        token.reset(data, currentTokenStart, length, gramLength, tokenCount);
-    }
-
-    @Override
-    public void reset(byte[] data, int start, int length) {
-        super.reset(data, start, length);
-        gramNum = 0;
-
-        int numChars = 0;
-        int pos = index;
-        int end = pos + utf8Length;
-        while (pos < end) {
-            numChars++;
-            pos += StringUtils.charSize(data, pos);
-        }
-
-        if (usePrePost) {
-            totalGrams = numChars + gramLength - 1;
-        } else {
-            totalGrams = numChars - gramLength + 1;
-        }
-    }
-
-    public void setGramlength(int gramLength) {
-        this.gramLength = gramLength;
-    }
-
-    public void setPrePost(boolean usePrePost) {
-        this.usePrePost = usePrePost;
-    }
-}

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/StringUtils.java
----------------------------------------------------------------------
diff --git a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/StringUtils.java b/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/StringUtils.java
deleted file mode 100644
index d3afd80..0000000
--- a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/StringUtils.java
+++ /dev/null
@@ -1,216 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenizer;
-
-import java.io.DataOutput;
-import java.io.IOException;
-
-public class StringUtils {
-    public static char charAt(byte[] b, int s) {
-        int c = b[s] & 0xff;
-        switch (c >> 4) {
-            case 0:
-            case 1:
-            case 2:
-            case 3:
-            case 4:
-            case 5:
-            case 6:
-            case 7:
-                return (char) c;
-
-            case 12:
-            case 13:
-                return (char) (((c & 0x1F) << 6) | ((b[s + 1]) & 0x3F));
-
-            case 14:
-                return (char) (((c & 0x0F) << 12) | (((b[s + 1]) & 0x3F) << 6) | (((b[s + 2]) & 0x3F) << 0));
-
-            default:
-                throw new IllegalArgumentException();
-        }
-    }
-
-    public static int charSize(byte[] b, int s) {
-        int c = b[s] & 0xff;
-        switch (c >> 4) {
-            case 0:
-            case 1:
-            case 2:
-            case 3:
-            case 4:
-            case 5:
-            case 6:
-            case 7:
-                return 1;
-
-            case 12:
-            case 13:
-                return 2;
-
-            case 14:
-                return 3;
-        }
-        throw new IllegalStateException();
-    }
-
-    public static int getModifiedUTF8Len(char c) {
-        if (c >= 0x0000 && c <= 0x007F) {
-            return 1;
-        } else if (c <= 0x07FF) {
-            return 2;
-        } else {
-            return 3;
-        }
-    }
-
-    public static int getStrLen(byte[] b, int s) {
-        int pos = s + 2;
-        int end = pos + getUTFLen(b, s);
-        int charCount = 0;
-        while (pos < end) {
-            charCount++;
-            pos += charSize(b, pos);
-        }
-        return charCount;
-    }
-
-    public static int getUTFLen(byte[] b, int s) {
-        return ((b[s] & 0xff) << 8) + ((b[s + 1] & 0xff) << 0);
-    }
-
-    public static char toLowerCase(char c) {
-        switch (c) {
-            case 'A':
-                return 'a';
-            case 'B':
-                return 'b';
-            case 'C':
-                return 'c';
-            case 'D':
-                return 'd';
-            case 'E':
-                return 'e';
-            case 'F':
-                return 'f';
-            case 'G':
-                return 'g';
-            case 'H':
-                return 'h';
-            case 'I':
-                return 'i';
-            case 'J':
-                return 'j';
-            case 'K':
-                return 'k';
-            case 'L':
-                return 'l';
-            case 'M':
-                return 'm';
-            case 'N':
-                return 'n';
-            case 'O':
-                return 'o';
-            case 'P':
-                return 'p';
-            case 'Q':
-                return 'q';
-            case 'R':
-                return 'r';
-            case 'S':
-                return 's';
-            case 'T':
-                return 't';
-            case 'U':
-                return 'u';
-            case 'V':
-                return 'v';
-            case 'W':
-                return 'w';
-            case 'X':
-                return 'x';
-            case 'Y':
-                return 'y';
-            case 'Z':
-                return 'z';
-            case 'Ä':
-                return 'ä';
-            case 'Ǟ':
-                return 'ǟ';
-            case 'Ë':
-                return 'ë';
-            case 'Ḧ':
-                return 'ḧ';
-            case 'Ï':
-                return 'ï';
-            case 'Ḯ':
-                return 'ḯ';
-            case 'Ö':
-                return 'ö';
-            case 'Ȫ':
-                return 'ȫ';
-            case 'Ṏ':
-                return 'ṏ';
-            case 'Ü':
-                return 'ü';
-            case 'Ǖ':
-                return 'ǖ';
-            case 'Ǘ':
-                return 'ǘ';
-            case 'Ǚ':
-                return 'ǚ';
-            case 'Ǜ':
-                return 'ǜ';
-            case 'Ṳ':
-                return 'ṳ';
-            case 'Ṻ':
-                return 'ṻ';
-            case 'Ẅ':
-                return 'ẅ';
-            case 'Ẍ':
-                return 'ẍ';
-            case 'Ÿ':
-                return 'ÿ';
-            default:
-                // since I probably missed some chars above
-                // use Java to convert to lower case to be safe
-                return Character.toLowerCase(c);
-        }
-    }
-
-    public static void writeCharAsModifiedUTF8(char c, DataOutput dos) throws IOException {
-
-        if (c >= 0x0000 && c <= 0x007F) {
-            dos.writeByte(c);
-        } else if (c <= 0x07FF) {
-            dos.writeByte((byte) (0xC0 | ((c >> 6) & 0x3F)));
-            dos.writeByte((byte) (0x80 | (c & 0x3F)));
-        } else {
-            dos.writeByte((byte) (0xE0 | ((c >> 12) & 0x0F)));
-            dos.writeByte((byte) (0x80 | ((c >> 6) & 0x3F)));
-            dos.writeByte((byte) (0x80 | (c & 0x3F)));
-        }
-    }
-
-    public static void writeUTF8Len(int len, DataOutput dos) throws IOException {
-        dos.write((len >>> 8) & 0xFF);
-        dos.write((len >>> 0) & 0xFF);
-    }
-}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8NGramToken.java
----------------------------------------------------------------------
diff --git a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8NGramToken.java b/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8NGramToken.java
deleted file mode 100644
index a3326c4..0000000
--- a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8NGramToken.java
+++ /dev/null
@@ -1,83 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenizer;
-
-import java.io.DataOutput;
-import java.io.IOException;
-
-public class UTF8NGramToken extends AbstractUTF8Token implements INGramToken {
-
-    public final static char PRECHAR = '#';
-
-    public final static char POSTCHAR = '$';
-
-    protected int numPreChars;
-    protected int numPostChars;
-
-    public UTF8NGramToken(byte tokenTypeTag, byte countTypeTag) {
-        super(tokenTypeTag, countTypeTag);
-    }
-
-    @Override
-    public int getNumPostChars() {
-        return numPreChars;
-    }
-
-    @Override
-    public int getNumPreChars() {
-        return numPostChars;
-    }
-
-    @Override
-    public void serializeToken(DataOutput dos) throws IOException {
-        handleTokenTypeTag(dos);
-
-        // regular chars
-        int numRegChars = tokenLength - numPreChars - numPostChars;
-
-        // assuming pre and post char need 1-byte each in utf8
-        int tokenUTF8Len = getLowerCaseUTF8Len(numRegChars) + numPreChars + numPostChars;
-
-        // write utf8 length indicator
-        StringUtils.writeUTF8Len(tokenUTF8Len, dos);
-
-        // pre chars
-        for (int i = 0; i < numPreChars; i++) {
-            StringUtils.writeCharAsModifiedUTF8(PRECHAR, dos);
-        }
-
-        int pos = start;
-        for (int i = 0; i < numRegChars; i++) {
-            char c = StringUtils.toLowerCase(StringUtils.charAt(data, pos));
-            StringUtils.writeCharAsModifiedUTF8(c, dos);
-            pos += StringUtils.charSize(data, pos);
-        }
-
-        // post chars
-        for (int i = 0; i < numPostChars; i++) {
-            StringUtils.writeCharAsModifiedUTF8(POSTCHAR, dos);
-        }
-    }
-
-    public void setNumPrePostChars(int numPreChars, int numPostChars) {
-        this.numPreChars = numPreChars;
-        this.numPostChars = numPostChars;
-    }
-}

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8NGramTokenFactory.java
----------------------------------------------------------------------
diff --git a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8NGramTokenFactory.java b/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8NGramTokenFactory.java
deleted file mode 100644
index 520aa66..0000000
--- a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8NGramTokenFactory.java
+++ /dev/null
@@ -1,39 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenizer;
-
-public class UTF8NGramTokenFactory extends AbstractUTF8TokenFactory {
-
-    private static final long serialVersionUID = 1L;
-
-    public UTF8NGramTokenFactory() {
-        super();
-    }
-
-    public UTF8NGramTokenFactory(byte tokenTypeTag, byte countTypeTag) {
-        super(tokenTypeTag, countTypeTag);
-    }
-
-    @Override
-    public IToken createToken() {
-        return new UTF8NGramToken(tokenTypeTag, countTypeTag);
-    }
-
-}

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8WordToken.java
----------------------------------------------------------------------
diff --git a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8WordToken.java b/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8WordToken.java
deleted file mode 100644
index 41a8105..0000000
--- a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8WordToken.java
+++ /dev/null
@@ -1,44 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenizer;
-
-import java.io.DataOutput;
-import java.io.IOException;
-
-public class UTF8WordToken extends AbstractUTF8Token {
-
-    public UTF8WordToken(byte tokenTypeTag, byte countTypeTag) {
-        super(tokenTypeTag, countTypeTag);
-    }
-
-    @Override
-    public void serializeToken(DataOutput dos) throws IOException {
-        handleTokenTypeTag(dos);
-
-        int tokenUTF8Len = getLowerCaseUTF8Len(tokenLength);
-        StringUtils.writeUTF8Len(tokenUTF8Len, dos);
-        int pos = start;
-        for (int i = 0; i < tokenLength; i++) {
-            char c = StringUtils.toLowerCase(StringUtils.charAt(data, pos));
-            StringUtils.writeCharAsModifiedUTF8(c, dos);
-            pos += StringUtils.charSize(data, pos);
-        }
-    }
-}

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8WordTokenFactory.java
----------------------------------------------------------------------
diff --git a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8WordTokenFactory.java b/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8WordTokenFactory.java
deleted file mode 100644
index 9d15db9..0000000
--- a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8WordTokenFactory.java
+++ /dev/null
@@ -1,39 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenizer;
-
-public class UTF8WordTokenFactory extends AbstractUTF8TokenFactory {
-
-    private static final long serialVersionUID = 1L;
-
-    public UTF8WordTokenFactory() {
-        super();
-    }
-
-    public UTF8WordTokenFactory(byte tokenTypeTag, byte countTypeTag) {
-        super(tokenTypeTag, countTypeTag);
-    }
-
-    @Override
-    public IToken createToken() {
-        return new UTF8WordToken(tokenTypeTag, countTypeTag);
-    }
-
-}

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/test/java/org/apache/asterix/fuzzyjoin/tests/NGramTokenizerTest.java
----------------------------------------------------------------------
diff --git a/asterix-fuzzyjoin/src/test/java/org/apache/asterix/fuzzyjoin/tests/NGramTokenizerTest.java b/asterix-fuzzyjoin/src/test/java/org/apache/asterix/fuzzyjoin/tests/NGramTokenizerTest.java
deleted file mode 100644
index d10aefb..0000000
--- a/asterix-fuzzyjoin/src/test/java/org/apache/asterix/fuzzyjoin/tests/NGramTokenizerTest.java
+++ /dev/null
@@ -1,239 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tests;
-
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.DataInput;
-import java.io.DataInputStream;
-import java.io.DataOutput;
-import java.io.DataOutputStream;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashMap;
-
-import org.junit.Assert;
-import org.junit.Before;
-import org.junit.Test;
-
-import org.apache.asterix.fuzzyjoin.tokenizer.AbstractUTF8Token;
-import org.apache.asterix.fuzzyjoin.tokenizer.HashedUTF8NGramTokenFactory;
-import org.apache.asterix.fuzzyjoin.tokenizer.IToken;
-import org.apache.asterix.fuzzyjoin.tokenizer.NGramUTF8StringBinaryTokenizer;
-import org.apache.asterix.fuzzyjoin.tokenizer.UTF8NGramTokenFactory;
-
-public class NGramTokenizerTest {
-
-    private char PRECHAR = '#';
-    private char POSTCHAR = '$';
-
-    private String str = "Jürgen S. Generic's Car";
-    private byte[] inputBuffer;
-
-    private int gramLength = 3;
-
-    private void getExpectedGrams(String s, int gramLength, ArrayList<String> grams, boolean prePost) {
-
-        String tmp = s.toLowerCase();
-        if (prePost) {
-            StringBuilder preBuilder = new StringBuilder();
-            for (int i = 0; i < gramLength - 1; i++) {
-                preBuilder.append(PRECHAR);
-            }
-            String pre = preBuilder.toString();
-
-            StringBuilder postBuilder = new StringBuilder();
-            for (int i = 0; i < gramLength - 1; i++) {
-                postBuilder.append(POSTCHAR);
-            }
-            String post = postBuilder.toString();
-
-            tmp = pre + s.toLowerCase() + post;
-        }
-
-        for (int i = 0; i < tmp.length() - gramLength + 1; i++) {
-            String gram = tmp.substring(i, i + gramLength);
-            grams.add(gram);
-        }
-    }
-
-    @Before
-    public void init() throws Exception {
-        // serialize string into bytes
-        ByteArrayOutputStream baos = new ByteArrayOutputStream();
-        DataOutput dos = new DataOutputStream(baos);
-        dos.writeUTF(str);
-        inputBuffer = baos.toByteArray();
-    }
-
-    void runTestNGramTokenizerWithCountedHashedUTF8Tokens(boolean prePost) throws IOException {
-        HashedUTF8NGramTokenFactory tokenFactory = new HashedUTF8NGramTokenFactory();
-        NGramUTF8StringBinaryTokenizer tokenizer = new NGramUTF8StringBinaryTokenizer(gramLength, prePost, false,
-                false, tokenFactory);
-        tokenizer.reset(inputBuffer, 0, inputBuffer.length);
-
-        ArrayList<String> expectedGrams = new ArrayList<String>();
-        getExpectedGrams(str, gramLength, expectedGrams, prePost);
-        ArrayList<Integer> expectedHashedGrams = new ArrayList<Integer>();
-        HashMap<String, Integer> gramCounts = new HashMap<String, Integer>();
-        for (String s : expectedGrams) {
-            Integer count = gramCounts.get(s);
-            if (count == null) {
-                count = 1;
-                gramCounts.put(s, count);
-            } else {
-                count++;
-            }
-
-            int hash = tokenHash(s, count);
-            expectedHashedGrams.add(hash);
-        }
-
-        int tokenCount = 0;
-
-        while (tokenizer.hasNext()) {
-            tokenizer.next();
-
-            // serialize hashed token
-            ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
-            DataOutput tokenDos = new DataOutputStream(tokenBaos);
-
-            IToken token = tokenizer.getToken();
-            token.serializeToken(tokenDos);
-
-            // deserialize token
-            ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray());
-            DataInput in = new DataInputStream(bais);
-
-            Integer hashedGram = in.readInt();
-
-            // System.out.println(hashedGram);
-
-            Assert.assertEquals(expectedHashedGrams.get(tokenCount), hashedGram);
-
-            tokenCount++;
-        }
-        // System.out.println("---------");
-    }
-
-    void runTestNGramTokenizerWithHashedUTF8Tokens(boolean prePost) throws IOException {
-        HashedUTF8NGramTokenFactory tokenFactory = new HashedUTF8NGramTokenFactory();
-        NGramUTF8StringBinaryTokenizer tokenizer = new NGramUTF8StringBinaryTokenizer(gramLength, prePost, true, false,
-                tokenFactory);
-        tokenizer.reset(inputBuffer, 0, inputBuffer.length);
-
-        ArrayList<String> expectedGrams = new ArrayList<String>();
-        getExpectedGrams(str, gramLength, expectedGrams, prePost);
-        ArrayList<Integer> expectedHashedGrams = new ArrayList<Integer>();
-        for (String s : expectedGrams) {
-            int hash = tokenHash(s, 1);
-            expectedHashedGrams.add(hash);
-        }
-
-        int tokenCount = 0;
-
-        while (tokenizer.hasNext()) {
-            tokenizer.next();
-
-            // serialize hashed token
-            ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
-            DataOutput tokenDos = new DataOutputStream(tokenBaos);
-
-            IToken token = tokenizer.getToken();
-            token.serializeToken(tokenDos);
-
-            // deserialize token
-            ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray());
-            DataInput in = new DataInputStream(bais);
-
-            Integer hashedGram = in.readInt();
-
-            // System.out.println(hashedGram);
-
-            Assert.assertEquals(expectedHashedGrams.get(tokenCount), hashedGram);
-
-            tokenCount++;
-        }
-        // System.out.println("---------");
-    }
-
-    void runTestNGramTokenizerWithUTF8Tokens(boolean prePost) throws IOException {
-        UTF8NGramTokenFactory tokenFactory = new UTF8NGramTokenFactory();
-        NGramUTF8StringBinaryTokenizer tokenizer = new NGramUTF8StringBinaryTokenizer(gramLength, prePost, true, false,
-                tokenFactory);
-        tokenizer.reset(inputBuffer, 0, inputBuffer.length);
-
-        ArrayList<String> expectedGrams = new ArrayList<String>();
-        getExpectedGrams(str, gramLength, expectedGrams, prePost);
-
-        int tokenCount = 0;
-
-        while (tokenizer.hasNext()) {
-            tokenizer.next();
-
-            // serialize hashed token
-            ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
-            DataOutput tokenDos = new DataOutputStream(tokenBaos);
-
-            IToken token = tokenizer.getToken();
-            token.serializeToken(tokenDos);
-
-            // deserialize token
-            ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray());
-            DataInput in = new DataInputStream(bais);
-
-            String strGram = in.readUTF();
-
-            // System.out.println("\"" + strGram + "\"");
-
-            Assert.assertEquals(expectedGrams.get(tokenCount), strGram);
-
-            tokenCount++;
-        }
-        // System.out.println("---------");
-    }
-
-    @Test
-    public void testNGramTokenizerWithCountedHashedUTF8Tokens() throws Exception {
-        runTestNGramTokenizerWithCountedHashedUTF8Tokens(false);
-        runTestNGramTokenizerWithCountedHashedUTF8Tokens(true);
-    }
-
-    @Test
-    public void testNGramTokenizerWithHashedUTF8Tokens() throws Exception {
-        runTestNGramTokenizerWithHashedUTF8Tokens(false);
-        runTestNGramTokenizerWithHashedUTF8Tokens(true);
-    }
-
-    @Test
-    public void testNGramTokenizerWithUTF8Tokens() throws IOException {
-        runTestNGramTokenizerWithUTF8Tokens(false);
-        runTestNGramTokenizerWithUTF8Tokens(true);
-    }
-
-    public int tokenHash(String token, int tokenCount) {
-        int h = AbstractUTF8Token.GOLDEN_RATIO_32;
-        for (int i = 0; i < token.length(); i++) {
-            h ^= token.charAt(i);
-            h *= AbstractUTF8Token.GOLDEN_RATIO_32;
-        }
-        return h + tokenCount;
-    }
-}

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/test/java/org/apache/asterix/fuzzyjoin/tests/WordTokenizerTest.java
----------------------------------------------------------------------
diff --git a/asterix-fuzzyjoin/src/test/java/org/apache/asterix/fuzzyjoin/tests/WordTokenizerTest.java b/asterix-fuzzyjoin/src/test/java/org/apache/asterix/fuzzyjoin/tests/WordTokenizerTest.java
deleted file mode 100644
index a4afe0c..0000000
--- a/asterix-fuzzyjoin/src/test/java/org/apache/asterix/fuzzyjoin/tests/WordTokenizerTest.java
+++ /dev/null
@@ -1,214 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tests;
-
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.DataInput;
-import java.io.DataInputStream;
-import java.io.DataOutput;
-import java.io.DataOutputStream;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashMap;
-
-import junit.framework.Assert;
-
-import org.junit.Before;
-import org.junit.Test;
-
-import org.apache.asterix.fuzzyjoin.tokenizer.AbstractUTF8Token;
-import org.apache.asterix.fuzzyjoin.tokenizer.DelimitedUTF8StringBinaryTokenizer;
-import org.apache.asterix.fuzzyjoin.tokenizer.HashedUTF8WordTokenFactory;
-import org.apache.asterix.fuzzyjoin.tokenizer.IToken;
-import org.apache.asterix.fuzzyjoin.tokenizer.UTF8WordTokenFactory;
-
-public class WordTokenizerTest {
-
-    private String text = "Hello World, I would like to inform you of the importance of Foo Bar. Yes, Foo Bar. Jürgen.";
-    private byte[] inputBuffer;
-
-    private ArrayList<String> expectedUTF8Tokens = new ArrayList<String>();
-    private ArrayList<Integer> expectedHashedUTF8Tokens = new ArrayList<Integer>();
-    private ArrayList<Integer> expectedCountedHashedUTF8Tokens = new ArrayList<Integer>();
-
-    @Before
-    public void init() throws IOException {
-        // serialize text into bytes
-        ByteArrayOutputStream baos = new ByteArrayOutputStream();
-        DataOutput dos = new DataOutputStream(baos);
-        dos.writeUTF(text);
-        inputBuffer = baos.toByteArray();
-
-        // init expected string tokens
-        expectedUTF8Tokens.add("hello");
-        expectedUTF8Tokens.add("world");
-        expectedUTF8Tokens.add("i");
-        expectedUTF8Tokens.add("would");
-        expectedUTF8Tokens.add("like");
-        expectedUTF8Tokens.add("to");
-        expectedUTF8Tokens.add("inform");
-        expectedUTF8Tokens.add("you");
-        expectedUTF8Tokens.add("of");
-        expectedUTF8Tokens.add("the");
-        expectedUTF8Tokens.add("importance");
-        expectedUTF8Tokens.add("of");
-        expectedUTF8Tokens.add("foo");
-        expectedUTF8Tokens.add("bar");
-        expectedUTF8Tokens.add("yes");
-        expectedUTF8Tokens.add("foo");
-        expectedUTF8Tokens.add("bar");
-        expectedUTF8Tokens.add("jürgen");
-
-        // hashed tokens ignoring token count
-        for (int i = 0; i < expectedUTF8Tokens.size(); i++) {
-            int hash = tokenHash(expectedUTF8Tokens.get(i), 1);
-            expectedHashedUTF8Tokens.add(hash);
-        }
-
-        // hashed tokens using token count
-        HashMap<String, Integer> tokenCounts = new HashMap<String, Integer>();
-        for (int i = 0; i < expectedUTF8Tokens.size(); i++) {
-            Integer count = tokenCounts.get(expectedUTF8Tokens.get(i));
-            if (count == null) {
-                count = 1;
-                tokenCounts.put(expectedUTF8Tokens.get(i), count);
-            } else {
-                count++;
-            }
-
-            int hash = tokenHash(expectedUTF8Tokens.get(i), count);
-            expectedCountedHashedUTF8Tokens.add(hash);
-        }
-    }
-
-    @Test
-    public void testWordTokenizerWithCountedHashedUTF8Tokens() throws IOException {
-
-        HashedUTF8WordTokenFactory tokenFactory = new HashedUTF8WordTokenFactory();
-        DelimitedUTF8StringBinaryTokenizer tokenizer = new DelimitedUTF8StringBinaryTokenizer(false, false,
-                tokenFactory);
-
-        tokenizer.reset(inputBuffer, 0, inputBuffer.length);
-
-        int tokenCount = 0;
-
-        while (tokenizer.hasNext()) {
-            tokenizer.next();
-
-            // serialize token
-            ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
-            DataOutput tokenDos = new DataOutputStream(tokenBaos);
-
-            IToken token = tokenizer.getToken();
-            token.serializeToken(tokenDos);
-
-            // deserialize token
-            ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray());
-            DataInput in = new DataInputStream(bais);
-
-            Integer hashedToken = in.readInt();
-
-            // System.out.println(hashedToken);
-
-            Assert.assertEquals(hashedToken, expectedCountedHashedUTF8Tokens.get(tokenCount));
-
-            tokenCount++;
-        }
-    }
-
-    @Test
-    public void testWordTokenizerWithHashedUTF8Tokens() throws IOException {
-
-        HashedUTF8WordTokenFactory tokenFactory = new HashedUTF8WordTokenFactory();
-        DelimitedUTF8StringBinaryTokenizer tokenizer = new DelimitedUTF8StringBinaryTokenizer(true, false, tokenFactory);
-
-        tokenizer.reset(inputBuffer, 0, inputBuffer.length);
-
-        int tokenCount = 0;
-
-        while (tokenizer.hasNext()) {
-            tokenizer.next();
-
-            // serialize token
-            ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
-            DataOutput tokenDos = new DataOutputStream(tokenBaos);
-
-            IToken token = tokenizer.getToken();
-            token.serializeToken(tokenDos);
-
-            // deserialize token
-            ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray());
-            DataInput in = new DataInputStream(bais);
-
-            Integer hashedToken = in.readInt();
-
-            // System.out.println(hashedToken);
-
-            Assert.assertEquals(expectedHashedUTF8Tokens.get(tokenCount), hashedToken);
-
-            tokenCount++;
-        }
-    }
-
-    @Test
-    public void testWordTokenizerWithUTF8Tokens() throws IOException {
-
-        UTF8WordTokenFactory tokenFactory = new UTF8WordTokenFactory();
-        DelimitedUTF8StringBinaryTokenizer tokenizer = new DelimitedUTF8StringBinaryTokenizer(true, false, tokenFactory);
-
-        tokenizer.reset(inputBuffer, 0, inputBuffer.length);
-
-        int tokenCount = 0;
-
-        while (tokenizer.hasNext()) {
-            tokenizer.next();
-
-            // serialize hashed token
-            ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
-            DataOutput tokenDos = new DataOutputStream(tokenBaos);
-
-            IToken token = tokenizer.getToken();
-            token.serializeToken(tokenDos);
-
-            // deserialize token
-            ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray());
-            DataInput in = new DataInputStream(bais);
-
-            String strToken = in.readUTF();
-
-            // System.out.println(strToken);
-
-            Assert.assertEquals(expectedUTF8Tokens.get(tokenCount), strToken);
-
-            tokenCount++;
-        }
-    }
-
-    // JAQL
-    public int tokenHash(String token, int tokenCount) {
-        int h = AbstractUTF8Token.GOLDEN_RATIO_32;
-        for (int i = 0; i < token.length(); i++) {
-            h ^= token.charAt(i);
-            h *= AbstractUTF8Token.GOLDEN_RATIO_32;
-        }
-        return h + tokenCount;
-    }
-}

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-metadata/src/main/java/org/apache/asterix/metadata/external/ExternalFileIndexAccessor.java
----------------------------------------------------------------------
diff --git a/asterix-metadata/src/main/java/org/apache/asterix/metadata/external/ExternalFileIndexAccessor.java b/asterix-metadata/src/main/java/org/apache/asterix/metadata/external/ExternalFileIndexAccessor.java
index b88ed3a..ac975c4 100644
--- a/asterix-metadata/src/main/java/org/apache/asterix/metadata/external/ExternalFileIndexAccessor.java
+++ b/asterix-metadata/src/main/java/org/apache/asterix/metadata/external/ExternalFileIndexAccessor.java
@@ -54,6 +54,7 @@ import org.apache.hyracks.storage.am.lsm.common.api.ILSMIndexAccessorInternal;
 @SuppressWarnings({ "rawtypes", "unchecked" })
 public class ExternalFileIndexAccessor implements Serializable {
 
+    private final FilesIndexDescription filesIndexDescription = new FilesIndexDescription();
     private static final long serialVersionUID = 1L;
     private ExternalBTreeDataflowHelper indexDataflowHelper;
     private ExternalLoopkupOperatorDiscriptor opDesc;
@@ -119,7 +120,7 @@ public class ExternalFileIndexAccessor implements Serializable {
             int recordLength = tuple.getFieldLength(FilesIndexDescription.FILE_PAYLOAD_INDEX);
             ByteArrayInputStream stream = new ByteArrayInputStream(serRecord, recordStartOffset, recordLength);
             DataInput in = new DataInputStream(stream);
-            ARecord externalFileRecord = (ARecord) FilesIndexDescription.EXTERNAL_FILE_RECORD_SERDE.deserialize(in);
+            ARecord externalFileRecord = (ARecord) filesIndexDescription.EXTERNAL_FILE_RECORD_SERDE.deserialize(in);
             setExternalFileFromARecord(externalFileRecord, file);
         } else {
             // This should never happen

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-metadata/src/main/java/org/apache/asterix/metadata/external/ExternalLoopkupOperatorDiscriptor.java
----------------------------------------------------------------------
diff --git a/asterix-metadata/src/main/java/org/apache/asterix/metadata/external/ExternalLoopkupOperatorDiscriptor.java b/asterix-metadata/src/main/java/org/apache/asterix/metadata/external/ExternalLoopkupOperatorDiscriptor.java
index 07c8e5f..a7844ce 100644
--- a/asterix-metadata/src/main/java/org/apache/asterix/metadata/external/ExternalLoopkupOperatorDiscriptor.java
+++ b/asterix-metadata/src/main/java/org/apache/asterix/metadata/external/ExternalLoopkupOperatorDiscriptor.java
@@ -40,7 +40,6 @@ import org.apache.hyracks.storage.common.IStorageManagerInterface;
  * This operator is intended for using record ids to access data in external sources
  */
 public class ExternalLoopkupOperatorDiscriptor extends AbstractTreeIndexOperatorDescriptor {
-
     private static final long serialVersionUID = 1L;
     private final IControlledAdapterFactory adapterFactory;
     private final INullWriterFactory iNullWriterFactory;
@@ -53,8 +52,8 @@ public class ExternalLoopkupOperatorDiscriptor extends AbstractTreeIndexOperator
             ISearchOperationCallbackFactory searchOpCallbackFactory, boolean retainNull,
             INullWriterFactory iNullWriterFactory) {
         super(spec, 1, 1, outRecDesc, storageManager, lcManagerProvider, fileSplitProvider,
-                FilesIndexDescription.EXTERNAL_FILE_INDEX_TYPE_TRAITS,
-                FilesIndexDescription.FILES_INDEX_COMP_FACTORIES, FilesIndexDescription.BLOOM_FILTER_FIELDS,
+                new FilesIndexDescription().EXTERNAL_FILE_INDEX_TYPE_TRAITS,
+                new FilesIndexDescription().FILES_INDEX_COMP_FACTORIES, FilesIndexDescription.BLOOM_FILTER_FIELDS,
                 externalFilesIndexDataFlowHelperFactory, null, propagateInput, retainNull, iNullWriterFactory, null,
                 searchOpCallbackFactory, null);
         this.adapterFactory = adapterFactory;

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-metadata/src/main/java/org/apache/asterix/metadata/external/FilesIndexDescription.java
----------------------------------------------------------------------
diff --git a/asterix-metadata/src/main/java/org/apache/asterix/metadata/external/FilesIndexDescription.java b/asterix-metadata/src/main/java/org/apache/asterix/metadata/external/FilesIndexDescription.java
index 0474ae5..cb4c0d2 100644
--- a/asterix-metadata/src/main/java/org/apache/asterix/metadata/external/FilesIndexDescription.java
+++ b/asterix-metadata/src/main/java/org/apache/asterix/metadata/external/FilesIndexDescription.java
@@ -41,58 +41,64 @@ public class FilesIndexDescription {
     public final static int FILE_KEY_INDEX = 0;
     public final static int FILE_KEY_SIZE = 1;
     public final static int FILE_PAYLOAD_INDEX = 1;
-    public static RecordDescriptor FILE_INDEX_RECORD_DESCRIPTOR;
-    public static RecordDescriptor FILE_BUDDY_BTREE_RECORD_DESCRIPTOR;
     public final static String[] payloadFieldNames = { "FileName", "FileSize", "FileModDate" };
     public final static IAType[] payloadFieldTypes = { BuiltinType.ASTRING, BuiltinType.AINT64, BuiltinType.ADATETIME };
-    public static ARecordType EXTERNAL_FILE_RECORD_TYPE;
-    public static ISerializerDeserializer EXTERNAL_FILE_RECORD_SERDE;
-    public static final ISerializerDeserializer[] EXTERNAL_FILE_BUDDY_BTREE_FIELDS = new ISerializerDeserializer[1];
-    public static final ITypeTraits[] EXTERNAL_FILE_BUDDY_BTREE_TYPE_TRAITS = new ITypeTraits[1];
-    public static final ISerializerDeserializer[] EXTERNAL_FILE_TUPLE_FIELDS = new ISerializerDeserializer[FILE_INDEX_TUPLE_SIZE];
-    public static final ITypeTraits[] EXTERNAL_FILE_INDEX_TYPE_TRAITS = new ITypeTraits[FILE_INDEX_TUPLE_SIZE];
-    public static final IBinaryComparatorFactory[] FILES_INDEX_COMP_FACTORIES = new IBinaryComparatorFactory[] { AqlBinaryComparatorFactoryProvider.INSTANCE
-            .getBinaryComparatorFactory(BuiltinType.AINT32, true) };
+
     public static final int[] BLOOM_FILTER_FIELDS = { 0 };
     public static final int EXTERNAL_FILE_NAME_FIELD_INDEX = 0;
     public static final int EXTERNAL_FILE_SIZE_FIELD_INDEX = 1;
     public static final int EXTERNAL_FILE_MOD_DATE_FIELD_INDEX = 2;
-    static {
-        try {
-            EXTERNAL_FILE_RECORD_TYPE = new ARecordType("ExternalFileRecordType", payloadFieldNames, payloadFieldTypes,
-                    true);
-            EXTERNAL_FILE_RECORD_SERDE = AqlSerializerDeserializerProvider.INSTANCE
-                    .getSerializerDeserializer(EXTERNAL_FILE_RECORD_TYPE);
 
-            EXTERNAL_FILE_TUPLE_FIELDS[FILE_KEY_INDEX] = AqlSerializerDeserializerProvider.INSTANCE
-                    .getSerializerDeserializer(IndexingConstants.FILE_NUMBER_FIELD_TYPE);
-            EXTERNAL_FILE_TUPLE_FIELDS[FILE_PAYLOAD_INDEX] = EXTERNAL_FILE_RECORD_SERDE;
-            EXTERNAL_FILE_BUDDY_BTREE_FIELDS[FILE_KEY_INDEX] = AqlSerializerDeserializerProvider.INSTANCE
-                    .getSerializerDeserializer(IndexingConstants.FILE_NUMBER_FIELD_TYPE);
+    public final ARecordType EXTERNAL_FILE_RECORD_TYPE;
+    public final ITypeTraits[] EXTERNAL_FILE_BUDDY_BTREE_TYPE_TRAITS = new ITypeTraits[1];
+    public final ITypeTraits[] EXTERNAL_FILE_INDEX_TYPE_TRAITS = new ITypeTraits[FILE_INDEX_TUPLE_SIZE];
 
-            EXTERNAL_FILE_INDEX_TYPE_TRAITS[FILE_KEY_INDEX] = AqlTypeTraitProvider.INSTANCE
-                    .getTypeTrait(IndexingConstants.FILE_NUMBER_FIELD_TYPE);
-            EXTERNAL_FILE_INDEX_TYPE_TRAITS[FILE_PAYLOAD_INDEX] = AqlTypeTraitProvider.INSTANCE
-                    .getTypeTrait(EXTERNAL_FILE_RECORD_TYPE);
-            EXTERNAL_FILE_BUDDY_BTREE_TYPE_TRAITS[FILE_KEY_INDEX] = AqlTypeTraitProvider.INSTANCE
-                    .getTypeTrait(IndexingConstants.FILE_NUMBER_FIELD_TYPE);
+    public final ISerializerDeserializer EXTERNAL_FILE_RECORD_SERDE;
+    public final RecordDescriptor FILE_INDEX_RECORD_DESCRIPTOR;
+    public final RecordDescriptor FILE_BUDDY_BTREE_RECORD_DESCRIPTOR;
+    public final ISerializerDeserializer[] EXTERNAL_FILE_BUDDY_BTREE_FIELDS = new ISerializerDeserializer[1];
+    public final ISerializerDeserializer[] EXTERNAL_FILE_TUPLE_FIELDS = new ISerializerDeserializer[FILE_INDEX_TUPLE_SIZE];
+    public final IBinaryComparatorFactory[] FILES_INDEX_COMP_FACTORIES = new IBinaryComparatorFactory[] {
+            AqlBinaryComparatorFactoryProvider.INSTANCE.getBinaryComparatorFactory(BuiltinType.AINT32, true) };
 
-            FILE_INDEX_RECORD_DESCRIPTOR = new RecordDescriptor(EXTERNAL_FILE_TUPLE_FIELDS,
-                    EXTERNAL_FILE_INDEX_TYPE_TRAITS);
-
-            FILE_BUDDY_BTREE_RECORD_DESCRIPTOR = new RecordDescriptor(EXTERNAL_FILE_BUDDY_BTREE_FIELDS,
-                    EXTERNAL_FILE_BUDDY_BTREE_TYPE_TRAITS);
+    public FilesIndexDescription() {
+        ARecordType type;
+        try {
+            type = new ARecordType("ExternalFileRecordType", payloadFieldNames,
+                    payloadFieldTypes, true);
         } catch (Exception e) {
             e.printStackTrace();
-            System.exit(1);
+            throw new RuntimeException(e);
         }
+        EXTERNAL_FILE_RECORD_TYPE = type;
+        EXTERNAL_FILE_INDEX_TYPE_TRAITS[FILE_KEY_INDEX] = AqlTypeTraitProvider.INSTANCE
+                .getTypeTrait(IndexingConstants.FILE_NUMBER_FIELD_TYPE);
+        EXTERNAL_FILE_INDEX_TYPE_TRAITS[FILE_PAYLOAD_INDEX] = AqlTypeTraitProvider.INSTANCE
+                .getTypeTrait(EXTERNAL_FILE_RECORD_TYPE);
+        EXTERNAL_FILE_BUDDY_BTREE_TYPE_TRAITS[FILE_KEY_INDEX] = AqlTypeTraitProvider.INSTANCE
+                .getTypeTrait(IndexingConstants.FILE_NUMBER_FIELD_TYPE);
+
+        EXTERNAL_FILE_RECORD_SERDE = AqlSerializerDeserializerProvider.INSTANCE
+                .getSerializerDeserializer(EXTERNAL_FILE_RECORD_TYPE);
+
+        EXTERNAL_FILE_TUPLE_FIELDS[FILE_KEY_INDEX] = AqlSerializerDeserializerProvider.INSTANCE
+                .getSerializerDeserializer(IndexingConstants.FILE_NUMBER_FIELD_TYPE);
+        EXTERNAL_FILE_TUPLE_FIELDS[FILE_PAYLOAD_INDEX] = EXTERNAL_FILE_RECORD_SERDE;
+        EXTERNAL_FILE_BUDDY_BTREE_FIELDS[FILE_KEY_INDEX] = AqlSerializerDeserializerProvider.INSTANCE
+                .getSerializerDeserializer(IndexingConstants.FILE_NUMBER_FIELD_TYPE);
+
+        FILE_INDEX_RECORD_DESCRIPTOR = new RecordDescriptor(EXTERNAL_FILE_TUPLE_FIELDS,
+                EXTERNAL_FILE_INDEX_TYPE_TRAITS);
+
+        FILE_BUDDY_BTREE_RECORD_DESCRIPTOR = new RecordDescriptor(EXTERNAL_FILE_BUDDY_BTREE_FIELDS,
+                EXTERNAL_FILE_BUDDY_BTREE_TYPE_TRAITS);
     }
 
     @SuppressWarnings("unchecked")
-    public static void getBuddyBTreeTupleFromFileNumber(ArrayTupleReference tuple, ArrayTupleBuilder tupleBuilder,
+    public void getBuddyBTreeTupleFromFileNumber(ArrayTupleReference tuple, ArrayTupleBuilder tupleBuilder,
             AMutableInt32 aInt32) throws IOException, AsterixException {
         tupleBuilder.reset();
-        FilesIndexDescription.FILE_BUDDY_BTREE_RECORD_DESCRIPTOR.getFields()[0].serialize(aInt32,
+        FILE_BUDDY_BTREE_RECORD_DESCRIPTOR.getFields()[0].serialize(aInt32,
                 tupleBuilder.getDataOutput());
         tupleBuilder.addFieldEndOffset();
         tuple.reset(tupleBuilder.getFieldEndOffsets(), tupleBuilder.getByteArray());

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-metadata/src/main/java/org/apache/asterix/metadata/valueextractors/DatasetNameValueExtractor.java
----------------------------------------------------------------------
diff --git a/asterix-metadata/src/main/java/org/apache/asterix/metadata/valueextractors/DatasetNameValueExtractor.java b/asterix-metadata/src/main/java/org/apache/asterix/metadata/valueextractors/DatasetNameValueExtractor.java
index a1a5b5c..e79fe1c 100644
--- a/asterix-metadata/src/main/java/org/apache/asterix/metadata/valueextractors/DatasetNameValueExtractor.java
+++ b/asterix-metadata/src/main/java/org/apache/asterix/metadata/valueextractors/DatasetNameValueExtractor.java
@@ -25,6 +25,7 @@ import java.io.DataInputStream;
 
 import org.apache.asterix.common.transactions.JobId;
 import org.apache.asterix.dataflow.data.nontagged.serde.AObjectSerializerDeserializer;
+import org.apache.asterix.dataflow.data.nontagged.serde.AStringSerializerDeserializer;
 import org.apache.asterix.metadata.MetadataException;
 import org.apache.asterix.metadata.api.IValueExtractor;
 import org.apache.asterix.om.base.AString;
@@ -36,6 +37,8 @@ import org.apache.hyracks.dataflow.common.data.accessors.ITupleReference;
  * contains a serialized representation of a Dataset metadata entity.
  */
 public class DatasetNameValueExtractor implements IValueExtractor<String> {
+    private final AObjectSerializerDeserializer aObjSerDer = new AObjectSerializerDeserializer();
+
     @Override
     public String getValue(JobId jobId, ITupleReference tuple) throws MetadataException, HyracksDataException {
         byte[] serRecord = tuple.getFieldData(2);
@@ -43,6 +46,6 @@ public class DatasetNameValueExtractor implements IValueExtractor<String> {
         int recordLength = tuple.getFieldLength(2);
         ByteArrayInputStream stream = new ByteArrayInputStream(serRecord, recordStartOffset, recordLength);
         DataInput in = new DataInputStream(stream);
-        return (((AString) AObjectSerializerDeserializer.INSTANCE.deserialize(in)).getStringValue());
+        return (((AString) aObjSerDer.deserialize(in)).getStringValue());
     }
 }

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-metadata/src/main/java/org/apache/asterix/metadata/valueextractors/DatatypeNameValueExtractor.java
----------------------------------------------------------------------
diff --git a/asterix-metadata/src/main/java/org/apache/asterix/metadata/valueextractors/DatatypeNameValueExtractor.java b/asterix-metadata/src/main/java/org/apache/asterix/metadata/valueextractors/DatatypeNameValueExtractor.java
index 9d5e8b1..9a50a31 100644
--- a/asterix-metadata/src/main/java/org/apache/asterix/metadata/valueextractors/DatatypeNameValueExtractor.java
+++ b/asterix-metadata/src/main/java/org/apache/asterix/metadata/valueextractors/DatatypeNameValueExtractor.java
@@ -40,6 +40,7 @@ import org.apache.hyracks.dataflow.common.data.accessors.ITupleReference;
 public class DatatypeNameValueExtractor implements IValueExtractor<String> {
     private final String dataverseName;
     private final MetadataNode metadataNode;
+    private final AObjectSerializerDeserializer aObjSerDer = new AObjectSerializerDeserializer();
 
     public DatatypeNameValueExtractor(String dataverseName, MetadataNode metadataNode) {
         this.dataverseName = dataverseName;
@@ -53,7 +54,7 @@ public class DatatypeNameValueExtractor implements IValueExtractor<String> {
         int recordLength = tuple.getFieldLength(2);
         ByteArrayInputStream stream = new ByteArrayInputStream(serRecord, recordStartOffset, recordLength);
         DataInput in = new DataInputStream(stream);
-        String typeName = ((AString) AObjectSerializerDeserializer.INSTANCE.deserialize(in)).getStringValue();
+        String typeName = ((AString) aObjSerDer.deserialize(in)).getStringValue();
         try {
             if (metadataNode.getDatatype(jobId, dataverseName, typeName).getIsAnonymous()) {
                 // Get index 0 because it is anonymous type, and it is used in

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-metadata/src/main/java/org/apache/asterix/metadata/valueextractors/NestedDatatypeNameValueExtractor.java
----------------------------------------------------------------------
diff --git a/asterix-metadata/src/main/java/org/apache/asterix/metadata/valueextractors/NestedDatatypeNameValueExtractor.java b/asterix-metadata/src/main/java/org/apache/asterix/metadata/valueextractors/NestedDatatypeNameValueExtractor.java
index d046650..41d92c9 100644
--- a/asterix-metadata/src/main/java/org/apache/asterix/metadata/valueextractors/NestedDatatypeNameValueExtractor.java
+++ b/asterix-metadata/src/main/java/org/apache/asterix/metadata/valueextractors/NestedDatatypeNameValueExtractor.java
@@ -43,6 +43,7 @@ public class NestedDatatypeNameValueExtractor implements IValueExtractor<String>
     public NestedDatatypeNameValueExtractor(String datatypeName) {
         this.datatypeName = datatypeName;
     }
+    private final AObjectSerializerDeserializer aObjSerDer = new AObjectSerializerDeserializer();
 
     @Override
     public String getValue(JobId jobId, ITupleReference tuple) throws MetadataException, HyracksDataException {
@@ -51,13 +52,13 @@ public class NestedDatatypeNameValueExtractor implements IValueExtractor<String>
         int recordLength = tuple.getFieldLength(2);
         ByteArrayInputStream stream = new ByteArrayInputStream(serRecord, recordStartOffset, recordLength);
         DataInput in = new DataInputStream(stream);
-        String nestedType = ((AString) AObjectSerializerDeserializer.INSTANCE.deserialize(in)).getStringValue();
+        String nestedType = ((AString) aObjSerDer.deserialize(in)).getStringValue();
         if (nestedType.equals(datatypeName)) {
             recordStartOffset = tuple.getFieldStart(1);
             recordLength = tuple.getFieldLength(1);
             stream = new ByteArrayInputStream(serRecord, recordStartOffset, recordLength);
             in = new DataInputStream(stream);
-            return ((AString) AObjectSerializerDeserializer.INSTANCE.deserialize(in)).getStringValue();
+            return ((AString) aObjSerDer.deserialize(in)).getStringValue();
         }
         return null;
     }

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-om/src/main/java/org/apache/asterix/builders/RecordBuilder.java
----------------------------------------------------------------------
diff --git a/asterix-om/src/main/java/org/apache/asterix/builders/RecordBuilder.java b/asterix-om/src/main/java/org/apache/asterix/builders/RecordBuilder.java
index d664e12..55ff32f 100644
--- a/asterix-om/src/main/java/org/apache/asterix/builders/RecordBuilder.java
+++ b/asterix-om/src/main/java/org/apache/asterix/builders/RecordBuilder.java
@@ -45,6 +45,7 @@ public class RecordBuilder implements IARecordBuilder {
     private final static int DEFAULT_NUM_OPEN_FIELDS = 10;
     private final static byte SER_NULL_TYPE_TAG = ATypeTag.NULL.serialize();
     private final static byte RECORD_TYPE_TAG = ATypeTag.RECORD.serialize();
+    private final UTF8StringSerializerDeserializer utf8SerDer = new UTF8StringSerializerDeserializer();
 
     private int openPartOffsetArraySize;
     private byte[] openPartOffsetArray;
@@ -226,9 +227,8 @@ public class RecordBuilder implements IARecordBuilder {
                 for (int i = 1; i < numberOfOpenFields; i++) {
                     if (utf8Comparator.compare(openBytes, (int) openPartOffsets[i - 1], openFieldNameLengths[i - 1],
                             openBytes, (int) openPartOffsets[i], openFieldNameLengths[i]) == 0) {
-                        String field = UTF8StringSerializerDeserializer.INSTANCE
-                                .deserialize(new DataInputStream(new ByteArrayInputStream(openBytes,
-                                        (int) openPartOffsets[i], openFieldNameLengths[i])));
+                        String field = utf8SerDer.deserialize(new DataInputStream(new ByteArrayInputStream(openBytes,
+                                (int) openPartOffsets[i], openFieldNameLengths[i])));
                         throw new AsterixException("Open fields " + (i - 1) + " and " + i
                                 + " have the same field name \"" + field + "\"");
                     }

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-om/src/main/java/org/apache/asterix/dataflow/data/common/AListElementToken.java
----------------------------------------------------------------------
diff --git a/asterix-om/src/main/java/org/apache/asterix/dataflow/data/common/AListElementToken.java b/asterix-om/src/main/java/org/apache/asterix/dataflow/data/common/AListElementToken.java
index f019f10..a3bff52 100644
--- a/asterix-om/src/main/java/org/apache/asterix/dataflow/data/common/AListElementToken.java
+++ b/asterix-om/src/main/java/org/apache/asterix/dataflow/data/common/AListElementToken.java
@@ -26,8 +26,8 @@ import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IToken;
 public class AListElementToken implements IToken {
 
     protected byte[] data;
-    protected int start;
-    protected int length;
+    protected int startOffset;
+    protected int endOffset;
     protected int tokenLength;
     protected int typeTag;
 
@@ -37,13 +37,13 @@ public class AListElementToken implements IToken {
     }
 
     @Override
-    public int getLength() {
-        return length;
+    public int getEndOffset() {
+        return endOffset;
     }
 
     @Override
-    public int getStart() {
-        return start;
+    public int getStartOffset() {
+        return startOffset;
     }
 
     @Override
@@ -52,10 +52,10 @@ public class AListElementToken implements IToken {
     }
 
     @Override
-    public void reset(byte[] data, int start, int length, int tokenLength, int tokenCount) {
+    public void reset(byte[] data, int startOffset, int endOffset, int tokenLength, int tokenCount) {
         this.data = data;
-        this.start = start;
-        this.length = length;
+        this.startOffset = startOffset;
+        this.endOffset = endOffset;
         this.tokenLength = tokenLength;
         // We abuse the last param, tokenCount, to pass the type tag.
         typeTag = tokenCount;
@@ -64,7 +64,7 @@ public class AListElementToken implements IToken {
     @Override
     public void serializeToken(GrowableArray out) throws IOException {
         out.getDataOutput().writeByte(typeTag);
-        out.getDataOutput().write(data, start, length);
+        out.getDataOutput().write(data, startOffset, endOffset - startOffset);
     }
 
     @Override

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-om/src/main/java/org/apache/asterix/dataflow/data/common/AOrderedListBinaryTokenizer.java
----------------------------------------------------------------------
diff --git a/asterix-om/src/main/java/org/apache/asterix/dataflow/data/common/AOrderedListBinaryTokenizer.java b/asterix-om/src/main/java/org/apache/asterix/dataflow/data/common/AOrderedListBinaryTokenizer.java
index 5f6e0b8..32207d3 100644
--- a/asterix-om/src/main/java/org/apache/asterix/dataflow/data/common/AOrderedListBinaryTokenizer.java
+++ b/asterix-om/src/main/java/org/apache/asterix/dataflow/data/common/AOrderedListBinaryTokenizer.java
@@ -59,9 +59,10 @@ public class AOrderedListBinaryTokenizer implements IBinaryTokenizer {
             itemOffset = getItemOffset(data, start, itemIndex);
             // Assuming homogeneous list.
             ATypeTag typeTag = EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(data[start + 1]);
+            // ? Can we handle the non-string type ?
             length = NonTaggedFormatUtil.getFieldValueLength(data, itemOffset, typeTag, false);
             // Last param is a hack to pass the type tag.
-            token.reset(data, itemOffset, length, length, data[start + 1]);
+            token.reset(data, itemOffset, itemOffset + length, length, data[start + 1]);
         } catch (AsterixException e) {
             throw new IllegalStateException(e);
         }

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/comparators/ListItemBinaryComparatorFactory.java
----------------------------------------------------------------------
diff --git a/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/comparators/ListItemBinaryComparatorFactory.java b/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/comparators/ListItemBinaryComparatorFactory.java
index 64b5610..767a343 100644
--- a/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/comparators/ListItemBinaryComparatorFactory.java
+++ b/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/comparators/ListItemBinaryComparatorFactory.java
@@ -19,7 +19,6 @@
 
 package org.apache.asterix.dataflow.data.nontagged.comparators;
 
-import org.apache.asterix.formats.nontagged.UTF8StringLowercasePointable;
 import org.apache.asterix.om.types.ATypeTag;
 import org.apache.asterix.om.types.EnumDeserializer;
 import org.apache.hyracks.api.dataflow.value.IBinaryComparator;
@@ -31,6 +30,7 @@ import org.apache.hyracks.data.std.primitive.DoublePointable;
 import org.apache.hyracks.data.std.primitive.FloatPointable;
 import org.apache.hyracks.data.std.primitive.IntegerPointable;
 import org.apache.hyracks.data.std.primitive.UTF8StringPointable;
+import org.apache.hyracks.data.std.primitive.UTF8StringLowercasePointable;
 
 public class ListItemBinaryComparatorFactory implements IBinaryComparatorFactory {
 

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/hash/ListItemBinaryHashFunctionFactory.java
----------------------------------------------------------------------
diff --git a/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/hash/ListItemBinaryHashFunctionFactory.java b/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/hash/ListItemBinaryHashFunctionFactory.java
index 6935f24..493833b 100644
--- a/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/hash/ListItemBinaryHashFunctionFactory.java
+++ b/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/hash/ListItemBinaryHashFunctionFactory.java
@@ -21,7 +21,6 @@ package org.apache.asterix.dataflow.data.nontagged.hash;
 
 import java.io.IOException;
 
-import org.apache.asterix.formats.nontagged.UTF8StringLowercasePointable;
 import org.apache.asterix.om.types.ATypeTag;
 import org.apache.asterix.om.types.EnumDeserializer;
 import org.apache.hyracks.api.dataflow.value.IBinaryHashFunction;
@@ -29,6 +28,7 @@ import org.apache.hyracks.api.dataflow.value.IBinaryHashFunctionFactory;
 import org.apache.hyracks.api.exceptions.HyracksDataException;
 import org.apache.hyracks.data.std.accessors.MurmurHash3BinaryHashFunctionFamily;
 import org.apache.hyracks.data.std.accessors.PointableBinaryHashFunctionFactory;
+import org.apache.hyracks.data.std.primitive.UTF8StringLowercasePointable;
 import org.apache.hyracks.data.std.util.GrowableArray;
 
 /**

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/printers/PrintTools.java
----------------------------------------------------------------------
diff --git a/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/printers/PrintTools.java b/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/printers/PrintTools.java
index 596e168..7d88a90 100644
--- a/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/printers/PrintTools.java
+++ b/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/printers/PrintTools.java
@@ -29,8 +29,13 @@ import java.io.IOException;
 import java.io.OutputStream;
 import java.io.PrintStream;
 
+import org.apache.hyracks.data.std.primitive.UTF8StringPointable;
+import org.apache.hyracks.util.bytes.HexPrinter;
+import org.apache.hyracks.util.string.UTF8StringUtil;
+
 public class PrintTools {
 
+
     private static final GregorianCalendarSystem gCalInstance = GregorianCalendarSystem.getInstance();
     private static long CHRONON_OF_DAY = 24 * 60 * 60 * 1000;
 
@@ -185,13 +190,13 @@ public class PrintTools {
     }
 
     public static void writeUTF8StringAsCSV(byte[] b, int s, int l, OutputStream os) throws IOException {
-        int stringLength = UTF8StringPointable.getUTFLength(b, s);
-        int position = s + 2; // skip 2 bytes containing string size
+        int stringLength = UTF8StringUtil.getUTFLength(b, s);
+        int position = s + UTF8StringUtil.getNumBytesToStoreLength(stringLength);
         int maxPosition = position + stringLength;
         os.write('"');
         while (position < maxPosition) {
-            char c = UTF8StringPointable.charAt(b, position);
-            int sz = UTF8StringPointable.charSize(b, position);
+            char c = UTF8StringUtil.charAt(b, position);
+            int sz = UTF8StringUtil.charSize(b, position);
             if (c == '"') {
                 os.write('"');
             }
@@ -202,13 +207,13 @@ public class PrintTools {
     }
 
     public static void writeUTF8StringAsJSON(byte[] b, int s, int l, OutputStream os) throws IOException {
-        int stringLength = UTF8StringPointable.getUTFLength(b, s);
-        int position = s + 2; // skip 2 bytes containing string size
-        int maxPosition = position + stringLength;
+        int utfLength = UTF8StringUtil.getUTFLength(b, s);
+        int position = s + UTF8StringUtil.getNumBytesToStoreLength(utfLength); // skip 2 bytes containing string size
+        int maxPosition = position + utfLength;
         os.write('"');
         while (position < maxPosition) {
-            char c = UTF8StringPointable.charAt(b, position);
-            int sz = UTF8StringPointable.charSize(b, position);
+            char c = UTF8StringUtil.charAt(b, position);
+            int sz = UTF8StringUtil.charSize(b, position);
             switch (c) {
                 // escape
                 case '\b':
@@ -296,27 +301,9 @@ public class PrintTools {
         os.write('u');
         os.write('0');
         os.write('0');
-        os.write(hex((c >>> 4) & 0x0f, CASE.LOWER_CASE));
-        os.write(hex(c & 0x0f, CASE.LOWER_CASE));
+        os.write(HexPrinter.hex((c >>> 4) & 0x0f, HexPrinter.CASE.LOWER_CASE));
+        os.write(HexPrinter.hex(c & 0x0f, HexPrinter.CASE.LOWER_CASE));
     }
 
-    public static Appendable printHexString(byte[] bytes, int start, int length, Appendable appendable)
-            throws IOException {
-        for (int i = 0; i < length; ++i) {
-            appendable.append((char) hex((bytes[start + i] >>> 4) & 0x0f, CASE.UPPER_CASE));
-            appendable.append((char) hex((bytes[start + i] & 0x0f), CASE.UPPER_CASE));
-        }
-        return appendable;
-    }
-
-    public static byte hex(int i, CASE c) {
-        switch (c) {
-            case LOWER_CASE:
-                return (byte) (i < 10 ? i + '0' : i + ('a' - 10));
-            case UPPER_CASE:
-                return (byte) (i < 10 ? i + '0' : i + ('A' - 10));
-        }
-        return Byte.parseByte(null);
-    }
 
 }


Mime
View raw message