Return-Path: X-Original-To: apmail-asterixdb-commits-archive@minotaur.apache.org Delivered-To: apmail-asterixdb-commits-archive@minotaur.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 7FB2518AF9 for ; Thu, 29 Oct 2015 04:45:02 +0000 (UTC) Received: (qmail 39636 invoked by uid 500); 29 Oct 2015 04:45:02 -0000 Delivered-To: apmail-asterixdb-commits-archive@asterixdb.apache.org Received: (qmail 39607 invoked by uid 500); 29 Oct 2015 04:45:02 -0000 Mailing-List: contact commits-help@asterixdb.incubator.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@asterixdb.incubator.apache.org Delivered-To: mailing list commits@asterixdb.incubator.apache.org Received: (qmail 39598 invoked by uid 99); 29 Oct 2015 04:45:02 -0000 Received: from Unknown (HELO spamd1-us-west.apache.org) (209.188.14.142) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 29 Oct 2015 04:45:02 +0000 Received: from localhost (localhost [127.0.0.1]) by spamd1-us-west.apache.org (ASF Mail Server at spamd1-us-west.apache.org) with ESMTP id B7D06C8407 for ; Thu, 29 Oct 2015 04:45:01 +0000 (UTC) X-Virus-Scanned: Debian amavisd-new at spamd1-us-west.apache.org X-Spam-Flag: NO X-Spam-Score: 1.779 X-Spam-Level: * X-Spam-Status: No, score=1.779 tagged_above=-999 required=6.31 tests=[KAM_ASCII_DIVIDERS=0.8, KAM_LAZY_DOMAIN_SECURITY=1, RCVD_IN_MSPIKE_H3=-0.01, RCVD_IN_MSPIKE_WL=-0.01, RP_MATCHES_RCVD=-0.001] autolearn=disabled Received: from mx1-us-west.apache.org ([10.40.0.8]) by localhost (spamd1-us-west.apache.org [10.40.0.7]) (amavisd-new, port 10024) with ESMTP id FxIY35KkWkqY for ; Thu, 29 Oct 2015 04:44:56 +0000 (UTC) Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by mx1-us-west.apache.org (ASF Mail Server at mx1-us-west.apache.org) with SMTP id 9559E23062 for ; Thu, 29 Oct 2015 04:44:52 +0000 (UTC) Received: (qmail 39293 invoked by uid 99); 29 Oct 2015 04:44:52 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 29 Oct 2015 04:44:52 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 2F4B6E35C7; Thu, 29 Oct 2015 04:44:52 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit From: jianfeng@apache.org To: commits@asterixdb.incubator.apache.org Date: Thu, 29 Oct 2015 04:44:58 -0000 Message-Id: <085ddac5d3a84025a815c0ace63a3314@git.apache.org> In-Reply-To: <950b1890ef09463ba47a572aa330a78e@git.apache.org> References: <950b1890ef09463ba47a572aa330a78e@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: [07/15] incubator-asterixdb git commit: ASTERIXDB-1102: VarSize Encoding to store length of String and ByteArray http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/HashedUTF8NGramTokenFactory.java ---------------------------------------------------------------------- diff --git a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/HashedUTF8NGramTokenFactory.java b/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/HashedUTF8NGramTokenFactory.java deleted file mode 100644 index 6472b68..0000000 --- a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/HashedUTF8NGramTokenFactory.java +++ /dev/null @@ -1,38 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.asterix.fuzzyjoin.tokenizer; - -public class HashedUTF8NGramTokenFactory extends AbstractUTF8TokenFactory { - - private static final long serialVersionUID = 1L; - - public HashedUTF8NGramTokenFactory() { - super(); - } - - public HashedUTF8NGramTokenFactory(byte tokenTypeTag, byte countTypeTag) { - super(tokenTypeTag, countTypeTag); - } - - @Override - public IToken createToken() { - return new HashedUTF8NGramToken(tokenTypeTag, countTypeTag); - } -} http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/HashedUTF8WordToken.java ---------------------------------------------------------------------- diff --git a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/HashedUTF8WordToken.java b/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/HashedUTF8WordToken.java deleted file mode 100644 index 6911b25..0000000 --- a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/HashedUTF8WordToken.java +++ /dev/null @@ -1,84 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.asterix.fuzzyjoin.tokenizer; - -import java.io.DataOutput; -import java.io.IOException; - -public class HashedUTF8WordToken extends UTF8WordToken { - - private int hash = 0; - - public HashedUTF8WordToken(byte tokenTypeTag, byte countTypeTag) { - super(tokenTypeTag, countTypeTag); - } - - @Override - public boolean equals(Object o) { - if (o == null) { - return false; - } - if (!(o instanceof IToken)) { - return false; - } - IToken t = (IToken) o; - if (t.getTokenLength() != tokenLength) { - return false; - } - int offset = 0; - for (int i = 0; i < tokenLength; i++) { - if (StringUtils.charAt(t.getData(), t.getStart() + offset) != StringUtils.charAt(data, start + offset)) { - return false; - } - offset += StringUtils.charSize(data, start + offset); - } - return true; - } - - @Override - public int hashCode() { - return hash; - } - - @Override - public void reset(byte[] data, int start, int length, int tokenLength, int tokenCount) { - super.reset(data, start, length, tokenLength, tokenCount); - - // pre-compute hash value using JAQL-like string hashing - int pos = start; - hash = GOLDEN_RATIO_32; - for (int i = 0; i < tokenLength; i++) { - hash ^= StringUtils.toLowerCase(StringUtils.charAt(data, pos)); - hash *= GOLDEN_RATIO_32; - pos += StringUtils.charSize(data, pos); - } - hash += tokenCount; - } - - @Override - public void serializeToken(DataOutput dos) throws IOException { - if (tokenTypeTag > 0) { - dos.write(tokenTypeTag); - } - - // serialize hash value - dos.writeInt(hash); - } -} http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/HashedUTF8WordTokenFactory.java ---------------------------------------------------------------------- diff --git a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/HashedUTF8WordTokenFactory.java b/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/HashedUTF8WordTokenFactory.java deleted file mode 100644 index 50bc67c..0000000 --- a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/HashedUTF8WordTokenFactory.java +++ /dev/null @@ -1,38 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.asterix.fuzzyjoin.tokenizer; - -public class HashedUTF8WordTokenFactory extends AbstractUTF8TokenFactory { - - private static final long serialVersionUID = 1L; - - public HashedUTF8WordTokenFactory() { - super(); - } - - public HashedUTF8WordTokenFactory(byte tokenTypeTag, byte countTypeTag) { - super(tokenTypeTag, countTypeTag); - } - - @Override - public IToken createToken() { - return new HashedUTF8WordToken(tokenTypeTag, countTypeTag); - } -} http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/IBinaryTokenizer.java ---------------------------------------------------------------------- diff --git a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/IBinaryTokenizer.java b/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/IBinaryTokenizer.java deleted file mode 100644 index 86359e1..0000000 --- a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/IBinaryTokenizer.java +++ /dev/null @@ -1,30 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.asterix.fuzzyjoin.tokenizer; - -public interface IBinaryTokenizer { - public IToken getToken(); - - public boolean hasNext(); - - public void next(); - - public void reset(byte[] data, int start, int length); -} http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/IBinaryTokenizerFactory.java ---------------------------------------------------------------------- diff --git a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/IBinaryTokenizerFactory.java b/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/IBinaryTokenizerFactory.java deleted file mode 100644 index f7cf4d5..0000000 --- a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/IBinaryTokenizerFactory.java +++ /dev/null @@ -1,26 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.asterix.fuzzyjoin.tokenizer; - -import java.io.Serializable; - -public interface IBinaryTokenizerFactory extends Serializable { - public IBinaryTokenizer createTokenizer(); -} http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/INGramToken.java ---------------------------------------------------------------------- diff --git a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/INGramToken.java b/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/INGramToken.java deleted file mode 100644 index 81f7b44..0000000 --- a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/INGramToken.java +++ /dev/null @@ -1,28 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.asterix.fuzzyjoin.tokenizer; - -public interface INGramToken { - public int getNumPostChars(); - - public int getNumPreChars(); - - public void setNumPrePostChars(int numPreChars, int numPostChars); -} http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/IToken.java ---------------------------------------------------------------------- diff --git a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/IToken.java b/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/IToken.java deleted file mode 100644 index 6d7b05d..0000000 --- a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/IToken.java +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.asterix.fuzzyjoin.tokenizer; - -import java.io.DataOutput; -import java.io.IOException; - -public interface IToken { - public byte[] getData(); - - public int getLength(); - - public int getStart(); - - public int getTokenLength(); - - public void reset(byte[] data, int start, int length, int tokenLength, int tokenCount); - - public void serializeToken(DataOutput dos) throws IOException; - - public void serializeTokenCount(DataOutput dos) throws IOException; -} http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/ITokenFactory.java ---------------------------------------------------------------------- diff --git a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/ITokenFactory.java b/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/ITokenFactory.java deleted file mode 100644 index 245530f..0000000 --- a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/ITokenFactory.java +++ /dev/null @@ -1,26 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.asterix.fuzzyjoin.tokenizer; - -import java.io.Serializable; - -public interface ITokenFactory extends Serializable { - public IToken createToken(); -} http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/NGramUTF8StringBinaryTokenizer.java ---------------------------------------------------------------------- diff --git a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/NGramUTF8StringBinaryTokenizer.java b/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/NGramUTF8StringBinaryTokenizer.java deleted file mode 100644 index 88c58b2..0000000 --- a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/NGramUTF8StringBinaryTokenizer.java +++ /dev/null @@ -1,116 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.asterix.fuzzyjoin.tokenizer; - -public class NGramUTF8StringBinaryTokenizer extends AbstractUTF8StringBinaryTokenizer { - - private int gramLength; - private boolean usePrePost; - - private int gramNum; - private int totalGrams; - - private final INGramToken concreteToken; - - public NGramUTF8StringBinaryTokenizer(int gramLength, boolean usePrePost, boolean ignoreTokenCount, - boolean sourceHasTypeTag, ITokenFactory tokenFactory) { - super(ignoreTokenCount, sourceHasTypeTag, tokenFactory); - this.gramLength = gramLength; - this.usePrePost = usePrePost; - concreteToken = (INGramToken) token; - } - - @Override - public boolean hasNext() { - if (gramNum < totalGrams) { - return true; - } else { - return false; - } - } - - @Override - public void next() { - int currentTokenStart = index; - int tokenCount = 1; - int numPreChars = 0; - int numPostChars = 0; - if (usePrePost) { - numPreChars = Math.max(gramLength - gramNum - 1, 0); - numPostChars = (gramNum > totalGrams - gramLength) ? gramLength - totalGrams + gramNum : 0; - } - gramNum++; - - concreteToken.setNumPrePostChars(numPreChars, numPostChars); - if (numPreChars == 0) { - index += StringUtils.charSize(data, index); - } - - // compute token count - // ignore pre and post grams for duplicate detection - if (!ignoreTokenCount && numPreChars == 0 && numPostChars == 0) { - int tmpIndex = start; - while (tmpIndex < currentTokenStart) { - tokenCount++; // assume found - int offset = 0; - for (int j = 0; j < gramLength; j++) { - if (StringUtils.toLowerCase(StringUtils.charAt(data, currentTokenStart + offset)) != StringUtils - .toLowerCase(StringUtils.charAt(data, tmpIndex + offset))) { - tokenCount--; - break; - } - offset += StringUtils.charSize(data, tmpIndex + offset); - } - tmpIndex += StringUtils.charSize(data, tmpIndex); - } - } - - // set token - token.reset(data, currentTokenStart, length, gramLength, tokenCount); - } - - @Override - public void reset(byte[] data, int start, int length) { - super.reset(data, start, length); - gramNum = 0; - - int numChars = 0; - int pos = index; - int end = pos + utf8Length; - while (pos < end) { - numChars++; - pos += StringUtils.charSize(data, pos); - } - - if (usePrePost) { - totalGrams = numChars + gramLength - 1; - } else { - totalGrams = numChars - gramLength + 1; - } - } - - public void setGramlength(int gramLength) { - this.gramLength = gramLength; - } - - public void setPrePost(boolean usePrePost) { - this.usePrePost = usePrePost; - } -} http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/StringUtils.java ---------------------------------------------------------------------- diff --git a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/StringUtils.java b/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/StringUtils.java deleted file mode 100644 index d3afd80..0000000 --- a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/StringUtils.java +++ /dev/null @@ -1,216 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.asterix.fuzzyjoin.tokenizer; - -import java.io.DataOutput; -import java.io.IOException; - -public class StringUtils { - public static char charAt(byte[] b, int s) { - int c = b[s] & 0xff; - switch (c >> 4) { - case 0: - case 1: - case 2: - case 3: - case 4: - case 5: - case 6: - case 7: - return (char) c; - - case 12: - case 13: - return (char) (((c & 0x1F) << 6) | ((b[s + 1]) & 0x3F)); - - case 14: - return (char) (((c & 0x0F) << 12) | (((b[s + 1]) & 0x3F) << 6) | (((b[s + 2]) & 0x3F) << 0)); - - default: - throw new IllegalArgumentException(); - } - } - - public static int charSize(byte[] b, int s) { - int c = b[s] & 0xff; - switch (c >> 4) { - case 0: - case 1: - case 2: - case 3: - case 4: - case 5: - case 6: - case 7: - return 1; - - case 12: - case 13: - return 2; - - case 14: - return 3; - } - throw new IllegalStateException(); - } - - public static int getModifiedUTF8Len(char c) { - if (c >= 0x0000 && c <= 0x007F) { - return 1; - } else if (c <= 0x07FF) { - return 2; - } else { - return 3; - } - } - - public static int getStrLen(byte[] b, int s) { - int pos = s + 2; - int end = pos + getUTFLen(b, s); - int charCount = 0; - while (pos < end) { - charCount++; - pos += charSize(b, pos); - } - return charCount; - } - - public static int getUTFLen(byte[] b, int s) { - return ((b[s] & 0xff) << 8) + ((b[s + 1] & 0xff) << 0); - } - - public static char toLowerCase(char c) { - switch (c) { - case 'A': - return 'a'; - case 'B': - return 'b'; - case 'C': - return 'c'; - case 'D': - return 'd'; - case 'E': - return 'e'; - case 'F': - return 'f'; - case 'G': - return 'g'; - case 'H': - return 'h'; - case 'I': - return 'i'; - case 'J': - return 'j'; - case 'K': - return 'k'; - case 'L': - return 'l'; - case 'M': - return 'm'; - case 'N': - return 'n'; - case 'O': - return 'o'; - case 'P': - return 'p'; - case 'Q': - return 'q'; - case 'R': - return 'r'; - case 'S': - return 's'; - case 'T': - return 't'; - case 'U': - return 'u'; - case 'V': - return 'v'; - case 'W': - return 'w'; - case 'X': - return 'x'; - case 'Y': - return 'y'; - case 'Z': - return 'z'; - case 'Ä': - return 'ä'; - case 'Ǟ': - return 'ǟ'; - case 'Ë': - return 'ë'; - case 'Ḧ': - return 'ḧ'; - case 'Ï': - return 'ï'; - case 'Ḯ': - return 'ḯ'; - case 'Ö': - return 'ö'; - case 'Ȫ': - return 'ȫ'; - case 'Ṏ': - return 'ṏ'; - case 'Ü': - return 'ü'; - case 'Ǖ': - return 'ǖ'; - case 'Ǘ': - return 'ǘ'; - case 'Ǚ': - return 'ǚ'; - case 'Ǜ': - return 'ǜ'; - case 'Ṳ': - return 'ṳ'; - case 'Ṻ': - return 'ṻ'; - case 'Ẅ': - return 'ẅ'; - case 'Ẍ': - return 'ẍ'; - case 'Ÿ': - return 'ÿ'; - default: - // since I probably missed some chars above - // use Java to convert to lower case to be safe - return Character.toLowerCase(c); - } - } - - public static void writeCharAsModifiedUTF8(char c, DataOutput dos) throws IOException { - - if (c >= 0x0000 && c <= 0x007F) { - dos.writeByte(c); - } else if (c <= 0x07FF) { - dos.writeByte((byte) (0xC0 | ((c >> 6) & 0x3F))); - dos.writeByte((byte) (0x80 | (c & 0x3F))); - } else { - dos.writeByte((byte) (0xE0 | ((c >> 12) & 0x0F))); - dos.writeByte((byte) (0x80 | ((c >> 6) & 0x3F))); - dos.writeByte((byte) (0x80 | (c & 0x3F))); - } - } - - public static void writeUTF8Len(int len, DataOutput dos) throws IOException { - dos.write((len >>> 8) & 0xFF); - dos.write((len >>> 0) & 0xFF); - } -} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8NGramToken.java ---------------------------------------------------------------------- diff --git a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8NGramToken.java b/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8NGramToken.java deleted file mode 100644 index a3326c4..0000000 --- a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8NGramToken.java +++ /dev/null @@ -1,83 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.asterix.fuzzyjoin.tokenizer; - -import java.io.DataOutput; -import java.io.IOException; - -public class UTF8NGramToken extends AbstractUTF8Token implements INGramToken { - - public final static char PRECHAR = '#'; - - public final static char POSTCHAR = '$'; - - protected int numPreChars; - protected int numPostChars; - - public UTF8NGramToken(byte tokenTypeTag, byte countTypeTag) { - super(tokenTypeTag, countTypeTag); - } - - @Override - public int getNumPostChars() { - return numPreChars; - } - - @Override - public int getNumPreChars() { - return numPostChars; - } - - @Override - public void serializeToken(DataOutput dos) throws IOException { - handleTokenTypeTag(dos); - - // regular chars - int numRegChars = tokenLength - numPreChars - numPostChars; - - // assuming pre and post char need 1-byte each in utf8 - int tokenUTF8Len = getLowerCaseUTF8Len(numRegChars) + numPreChars + numPostChars; - - // write utf8 length indicator - StringUtils.writeUTF8Len(tokenUTF8Len, dos); - - // pre chars - for (int i = 0; i < numPreChars; i++) { - StringUtils.writeCharAsModifiedUTF8(PRECHAR, dos); - } - - int pos = start; - for (int i = 0; i < numRegChars; i++) { - char c = StringUtils.toLowerCase(StringUtils.charAt(data, pos)); - StringUtils.writeCharAsModifiedUTF8(c, dos); - pos += StringUtils.charSize(data, pos); - } - - // post chars - for (int i = 0; i < numPostChars; i++) { - StringUtils.writeCharAsModifiedUTF8(POSTCHAR, dos); - } - } - - public void setNumPrePostChars(int numPreChars, int numPostChars) { - this.numPreChars = numPreChars; - this.numPostChars = numPostChars; - } -} http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8NGramTokenFactory.java ---------------------------------------------------------------------- diff --git a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8NGramTokenFactory.java b/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8NGramTokenFactory.java deleted file mode 100644 index 520aa66..0000000 --- a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8NGramTokenFactory.java +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.asterix.fuzzyjoin.tokenizer; - -public class UTF8NGramTokenFactory extends AbstractUTF8TokenFactory { - - private static final long serialVersionUID = 1L; - - public UTF8NGramTokenFactory() { - super(); - } - - public UTF8NGramTokenFactory(byte tokenTypeTag, byte countTypeTag) { - super(tokenTypeTag, countTypeTag); - } - - @Override - public IToken createToken() { - return new UTF8NGramToken(tokenTypeTag, countTypeTag); - } - -} http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8WordToken.java ---------------------------------------------------------------------- diff --git a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8WordToken.java b/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8WordToken.java deleted file mode 100644 index 41a8105..0000000 --- a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8WordToken.java +++ /dev/null @@ -1,44 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.asterix.fuzzyjoin.tokenizer; - -import java.io.DataOutput; -import java.io.IOException; - -public class UTF8WordToken extends AbstractUTF8Token { - - public UTF8WordToken(byte tokenTypeTag, byte countTypeTag) { - super(tokenTypeTag, countTypeTag); - } - - @Override - public void serializeToken(DataOutput dos) throws IOException { - handleTokenTypeTag(dos); - - int tokenUTF8Len = getLowerCaseUTF8Len(tokenLength); - StringUtils.writeUTF8Len(tokenUTF8Len, dos); - int pos = start; - for (int i = 0; i < tokenLength; i++) { - char c = StringUtils.toLowerCase(StringUtils.charAt(data, pos)); - StringUtils.writeCharAsModifiedUTF8(c, dos); - pos += StringUtils.charSize(data, pos); - } - } -} http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8WordTokenFactory.java ---------------------------------------------------------------------- diff --git a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8WordTokenFactory.java b/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8WordTokenFactory.java deleted file mode 100644 index 9d15db9..0000000 --- a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8WordTokenFactory.java +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.asterix.fuzzyjoin.tokenizer; - -public class UTF8WordTokenFactory extends AbstractUTF8TokenFactory { - - private static final long serialVersionUID = 1L; - - public UTF8WordTokenFactory() { - super(); - } - - public UTF8WordTokenFactory(byte tokenTypeTag, byte countTypeTag) { - super(tokenTypeTag, countTypeTag); - } - - @Override - public IToken createToken() { - return new UTF8WordToken(tokenTypeTag, countTypeTag); - } - -} http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/test/java/org/apache/asterix/fuzzyjoin/tests/NGramTokenizerTest.java ---------------------------------------------------------------------- diff --git a/asterix-fuzzyjoin/src/test/java/org/apache/asterix/fuzzyjoin/tests/NGramTokenizerTest.java b/asterix-fuzzyjoin/src/test/java/org/apache/asterix/fuzzyjoin/tests/NGramTokenizerTest.java deleted file mode 100644 index d10aefb..0000000 --- a/asterix-fuzzyjoin/src/test/java/org/apache/asterix/fuzzyjoin/tests/NGramTokenizerTest.java +++ /dev/null @@ -1,239 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.asterix.fuzzyjoin.tests; - -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.DataInput; -import java.io.DataInputStream; -import java.io.DataOutput; -import java.io.DataOutputStream; -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; - -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; - -import org.apache.asterix.fuzzyjoin.tokenizer.AbstractUTF8Token; -import org.apache.asterix.fuzzyjoin.tokenizer.HashedUTF8NGramTokenFactory; -import org.apache.asterix.fuzzyjoin.tokenizer.IToken; -import org.apache.asterix.fuzzyjoin.tokenizer.NGramUTF8StringBinaryTokenizer; -import org.apache.asterix.fuzzyjoin.tokenizer.UTF8NGramTokenFactory; - -public class NGramTokenizerTest { - - private char PRECHAR = '#'; - private char POSTCHAR = '$'; - - private String str = "Jürgen S. Generic's Car"; - private byte[] inputBuffer; - - private int gramLength = 3; - - private void getExpectedGrams(String s, int gramLength, ArrayList grams, boolean prePost) { - - String tmp = s.toLowerCase(); - if (prePost) { - StringBuilder preBuilder = new StringBuilder(); - for (int i = 0; i < gramLength - 1; i++) { - preBuilder.append(PRECHAR); - } - String pre = preBuilder.toString(); - - StringBuilder postBuilder = new StringBuilder(); - for (int i = 0; i < gramLength - 1; i++) { - postBuilder.append(POSTCHAR); - } - String post = postBuilder.toString(); - - tmp = pre + s.toLowerCase() + post; - } - - for (int i = 0; i < tmp.length() - gramLength + 1; i++) { - String gram = tmp.substring(i, i + gramLength); - grams.add(gram); - } - } - - @Before - public void init() throws Exception { - // serialize string into bytes - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - DataOutput dos = new DataOutputStream(baos); - dos.writeUTF(str); - inputBuffer = baos.toByteArray(); - } - - void runTestNGramTokenizerWithCountedHashedUTF8Tokens(boolean prePost) throws IOException { - HashedUTF8NGramTokenFactory tokenFactory = new HashedUTF8NGramTokenFactory(); - NGramUTF8StringBinaryTokenizer tokenizer = new NGramUTF8StringBinaryTokenizer(gramLength, prePost, false, - false, tokenFactory); - tokenizer.reset(inputBuffer, 0, inputBuffer.length); - - ArrayList expectedGrams = new ArrayList(); - getExpectedGrams(str, gramLength, expectedGrams, prePost); - ArrayList expectedHashedGrams = new ArrayList(); - HashMap gramCounts = new HashMap(); - for (String s : expectedGrams) { - Integer count = gramCounts.get(s); - if (count == null) { - count = 1; - gramCounts.put(s, count); - } else { - count++; - } - - int hash = tokenHash(s, count); - expectedHashedGrams.add(hash); - } - - int tokenCount = 0; - - while (tokenizer.hasNext()) { - tokenizer.next(); - - // serialize hashed token - ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream(); - DataOutput tokenDos = new DataOutputStream(tokenBaos); - - IToken token = tokenizer.getToken(); - token.serializeToken(tokenDos); - - // deserialize token - ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray()); - DataInput in = new DataInputStream(bais); - - Integer hashedGram = in.readInt(); - - // System.out.println(hashedGram); - - Assert.assertEquals(expectedHashedGrams.get(tokenCount), hashedGram); - - tokenCount++; - } - // System.out.println("---------"); - } - - void runTestNGramTokenizerWithHashedUTF8Tokens(boolean prePost) throws IOException { - HashedUTF8NGramTokenFactory tokenFactory = new HashedUTF8NGramTokenFactory(); - NGramUTF8StringBinaryTokenizer tokenizer = new NGramUTF8StringBinaryTokenizer(gramLength, prePost, true, false, - tokenFactory); - tokenizer.reset(inputBuffer, 0, inputBuffer.length); - - ArrayList expectedGrams = new ArrayList(); - getExpectedGrams(str, gramLength, expectedGrams, prePost); - ArrayList expectedHashedGrams = new ArrayList(); - for (String s : expectedGrams) { - int hash = tokenHash(s, 1); - expectedHashedGrams.add(hash); - } - - int tokenCount = 0; - - while (tokenizer.hasNext()) { - tokenizer.next(); - - // serialize hashed token - ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream(); - DataOutput tokenDos = new DataOutputStream(tokenBaos); - - IToken token = tokenizer.getToken(); - token.serializeToken(tokenDos); - - // deserialize token - ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray()); - DataInput in = new DataInputStream(bais); - - Integer hashedGram = in.readInt(); - - // System.out.println(hashedGram); - - Assert.assertEquals(expectedHashedGrams.get(tokenCount), hashedGram); - - tokenCount++; - } - // System.out.println("---------"); - } - - void runTestNGramTokenizerWithUTF8Tokens(boolean prePost) throws IOException { - UTF8NGramTokenFactory tokenFactory = new UTF8NGramTokenFactory(); - NGramUTF8StringBinaryTokenizer tokenizer = new NGramUTF8StringBinaryTokenizer(gramLength, prePost, true, false, - tokenFactory); - tokenizer.reset(inputBuffer, 0, inputBuffer.length); - - ArrayList expectedGrams = new ArrayList(); - getExpectedGrams(str, gramLength, expectedGrams, prePost); - - int tokenCount = 0; - - while (tokenizer.hasNext()) { - tokenizer.next(); - - // serialize hashed token - ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream(); - DataOutput tokenDos = new DataOutputStream(tokenBaos); - - IToken token = tokenizer.getToken(); - token.serializeToken(tokenDos); - - // deserialize token - ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray()); - DataInput in = new DataInputStream(bais); - - String strGram = in.readUTF(); - - // System.out.println("\"" + strGram + "\""); - - Assert.assertEquals(expectedGrams.get(tokenCount), strGram); - - tokenCount++; - } - // System.out.println("---------"); - } - - @Test - public void testNGramTokenizerWithCountedHashedUTF8Tokens() throws Exception { - runTestNGramTokenizerWithCountedHashedUTF8Tokens(false); - runTestNGramTokenizerWithCountedHashedUTF8Tokens(true); - } - - @Test - public void testNGramTokenizerWithHashedUTF8Tokens() throws Exception { - runTestNGramTokenizerWithHashedUTF8Tokens(false); - runTestNGramTokenizerWithHashedUTF8Tokens(true); - } - - @Test - public void testNGramTokenizerWithUTF8Tokens() throws IOException { - runTestNGramTokenizerWithUTF8Tokens(false); - runTestNGramTokenizerWithUTF8Tokens(true); - } - - public int tokenHash(String token, int tokenCount) { - int h = AbstractUTF8Token.GOLDEN_RATIO_32; - for (int i = 0; i < token.length(); i++) { - h ^= token.charAt(i); - h *= AbstractUTF8Token.GOLDEN_RATIO_32; - } - return h + tokenCount; - } -} http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/test/java/org/apache/asterix/fuzzyjoin/tests/WordTokenizerTest.java ---------------------------------------------------------------------- diff --git a/asterix-fuzzyjoin/src/test/java/org/apache/asterix/fuzzyjoin/tests/WordTokenizerTest.java b/asterix-fuzzyjoin/src/test/java/org/apache/asterix/fuzzyjoin/tests/WordTokenizerTest.java deleted file mode 100644 index a4afe0c..0000000 --- a/asterix-fuzzyjoin/src/test/java/org/apache/asterix/fuzzyjoin/tests/WordTokenizerTest.java +++ /dev/null @@ -1,214 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.asterix.fuzzyjoin.tests; - -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.DataInput; -import java.io.DataInputStream; -import java.io.DataOutput; -import java.io.DataOutputStream; -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; - -import junit.framework.Assert; - -import org.junit.Before; -import org.junit.Test; - -import org.apache.asterix.fuzzyjoin.tokenizer.AbstractUTF8Token; -import org.apache.asterix.fuzzyjoin.tokenizer.DelimitedUTF8StringBinaryTokenizer; -import org.apache.asterix.fuzzyjoin.tokenizer.HashedUTF8WordTokenFactory; -import org.apache.asterix.fuzzyjoin.tokenizer.IToken; -import org.apache.asterix.fuzzyjoin.tokenizer.UTF8WordTokenFactory; - -public class WordTokenizerTest { - - private String text = "Hello World, I would like to inform you of the importance of Foo Bar. Yes, Foo Bar. Jürgen."; - private byte[] inputBuffer; - - private ArrayList expectedUTF8Tokens = new ArrayList(); - private ArrayList expectedHashedUTF8Tokens = new ArrayList(); - private ArrayList expectedCountedHashedUTF8Tokens = new ArrayList(); - - @Before - public void init() throws IOException { - // serialize text into bytes - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - DataOutput dos = new DataOutputStream(baos); - dos.writeUTF(text); - inputBuffer = baos.toByteArray(); - - // init expected string tokens - expectedUTF8Tokens.add("hello"); - expectedUTF8Tokens.add("world"); - expectedUTF8Tokens.add("i"); - expectedUTF8Tokens.add("would"); - expectedUTF8Tokens.add("like"); - expectedUTF8Tokens.add("to"); - expectedUTF8Tokens.add("inform"); - expectedUTF8Tokens.add("you"); - expectedUTF8Tokens.add("of"); - expectedUTF8Tokens.add("the"); - expectedUTF8Tokens.add("importance"); - expectedUTF8Tokens.add("of"); - expectedUTF8Tokens.add("foo"); - expectedUTF8Tokens.add("bar"); - expectedUTF8Tokens.add("yes"); - expectedUTF8Tokens.add("foo"); - expectedUTF8Tokens.add("bar"); - expectedUTF8Tokens.add("jürgen"); - - // hashed tokens ignoring token count - for (int i = 0; i < expectedUTF8Tokens.size(); i++) { - int hash = tokenHash(expectedUTF8Tokens.get(i), 1); - expectedHashedUTF8Tokens.add(hash); - } - - // hashed tokens using token count - HashMap tokenCounts = new HashMap(); - for (int i = 0; i < expectedUTF8Tokens.size(); i++) { - Integer count = tokenCounts.get(expectedUTF8Tokens.get(i)); - if (count == null) { - count = 1; - tokenCounts.put(expectedUTF8Tokens.get(i), count); - } else { - count++; - } - - int hash = tokenHash(expectedUTF8Tokens.get(i), count); - expectedCountedHashedUTF8Tokens.add(hash); - } - } - - @Test - public void testWordTokenizerWithCountedHashedUTF8Tokens() throws IOException { - - HashedUTF8WordTokenFactory tokenFactory = new HashedUTF8WordTokenFactory(); - DelimitedUTF8StringBinaryTokenizer tokenizer = new DelimitedUTF8StringBinaryTokenizer(false, false, - tokenFactory); - - tokenizer.reset(inputBuffer, 0, inputBuffer.length); - - int tokenCount = 0; - - while (tokenizer.hasNext()) { - tokenizer.next(); - - // serialize token - ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream(); - DataOutput tokenDos = new DataOutputStream(tokenBaos); - - IToken token = tokenizer.getToken(); - token.serializeToken(tokenDos); - - // deserialize token - ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray()); - DataInput in = new DataInputStream(bais); - - Integer hashedToken = in.readInt(); - - // System.out.println(hashedToken); - - Assert.assertEquals(hashedToken, expectedCountedHashedUTF8Tokens.get(tokenCount)); - - tokenCount++; - } - } - - @Test - public void testWordTokenizerWithHashedUTF8Tokens() throws IOException { - - HashedUTF8WordTokenFactory tokenFactory = new HashedUTF8WordTokenFactory(); - DelimitedUTF8StringBinaryTokenizer tokenizer = new DelimitedUTF8StringBinaryTokenizer(true, false, tokenFactory); - - tokenizer.reset(inputBuffer, 0, inputBuffer.length); - - int tokenCount = 0; - - while (tokenizer.hasNext()) { - tokenizer.next(); - - // serialize token - ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream(); - DataOutput tokenDos = new DataOutputStream(tokenBaos); - - IToken token = tokenizer.getToken(); - token.serializeToken(tokenDos); - - // deserialize token - ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray()); - DataInput in = new DataInputStream(bais); - - Integer hashedToken = in.readInt(); - - // System.out.println(hashedToken); - - Assert.assertEquals(expectedHashedUTF8Tokens.get(tokenCount), hashedToken); - - tokenCount++; - } - } - - @Test - public void testWordTokenizerWithUTF8Tokens() throws IOException { - - UTF8WordTokenFactory tokenFactory = new UTF8WordTokenFactory(); - DelimitedUTF8StringBinaryTokenizer tokenizer = new DelimitedUTF8StringBinaryTokenizer(true, false, tokenFactory); - - tokenizer.reset(inputBuffer, 0, inputBuffer.length); - - int tokenCount = 0; - - while (tokenizer.hasNext()) { - tokenizer.next(); - - // serialize hashed token - ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream(); - DataOutput tokenDos = new DataOutputStream(tokenBaos); - - IToken token = tokenizer.getToken(); - token.serializeToken(tokenDos); - - // deserialize token - ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray()); - DataInput in = new DataInputStream(bais); - - String strToken = in.readUTF(); - - // System.out.println(strToken); - - Assert.assertEquals(expectedUTF8Tokens.get(tokenCount), strToken); - - tokenCount++; - } - } - - // JAQL - public int tokenHash(String token, int tokenCount) { - int h = AbstractUTF8Token.GOLDEN_RATIO_32; - for (int i = 0; i < token.length(); i++) { - h ^= token.charAt(i); - h *= AbstractUTF8Token.GOLDEN_RATIO_32; - } - return h + tokenCount; - } -} http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-metadata/src/main/java/org/apache/asterix/metadata/external/ExternalFileIndexAccessor.java ---------------------------------------------------------------------- diff --git a/asterix-metadata/src/main/java/org/apache/asterix/metadata/external/ExternalFileIndexAccessor.java b/asterix-metadata/src/main/java/org/apache/asterix/metadata/external/ExternalFileIndexAccessor.java index b88ed3a..ac975c4 100644 --- a/asterix-metadata/src/main/java/org/apache/asterix/metadata/external/ExternalFileIndexAccessor.java +++ b/asterix-metadata/src/main/java/org/apache/asterix/metadata/external/ExternalFileIndexAccessor.java @@ -54,6 +54,7 @@ import org.apache.hyracks.storage.am.lsm.common.api.ILSMIndexAccessorInternal; @SuppressWarnings({ "rawtypes", "unchecked" }) public class ExternalFileIndexAccessor implements Serializable { + private final FilesIndexDescription filesIndexDescription = new FilesIndexDescription(); private static final long serialVersionUID = 1L; private ExternalBTreeDataflowHelper indexDataflowHelper; private ExternalLoopkupOperatorDiscriptor opDesc; @@ -119,7 +120,7 @@ public class ExternalFileIndexAccessor implements Serializable { int recordLength = tuple.getFieldLength(FilesIndexDescription.FILE_PAYLOAD_INDEX); ByteArrayInputStream stream = new ByteArrayInputStream(serRecord, recordStartOffset, recordLength); DataInput in = new DataInputStream(stream); - ARecord externalFileRecord = (ARecord) FilesIndexDescription.EXTERNAL_FILE_RECORD_SERDE.deserialize(in); + ARecord externalFileRecord = (ARecord) filesIndexDescription.EXTERNAL_FILE_RECORD_SERDE.deserialize(in); setExternalFileFromARecord(externalFileRecord, file); } else { // This should never happen http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-metadata/src/main/java/org/apache/asterix/metadata/external/ExternalLoopkupOperatorDiscriptor.java ---------------------------------------------------------------------- diff --git a/asterix-metadata/src/main/java/org/apache/asterix/metadata/external/ExternalLoopkupOperatorDiscriptor.java b/asterix-metadata/src/main/java/org/apache/asterix/metadata/external/ExternalLoopkupOperatorDiscriptor.java index 07c8e5f..a7844ce 100644 --- a/asterix-metadata/src/main/java/org/apache/asterix/metadata/external/ExternalLoopkupOperatorDiscriptor.java +++ b/asterix-metadata/src/main/java/org/apache/asterix/metadata/external/ExternalLoopkupOperatorDiscriptor.java @@ -40,7 +40,6 @@ import org.apache.hyracks.storage.common.IStorageManagerInterface; * This operator is intended for using record ids to access data in external sources */ public class ExternalLoopkupOperatorDiscriptor extends AbstractTreeIndexOperatorDescriptor { - private static final long serialVersionUID = 1L; private final IControlledAdapterFactory adapterFactory; private final INullWriterFactory iNullWriterFactory; @@ -53,8 +52,8 @@ public class ExternalLoopkupOperatorDiscriptor extends AbstractTreeIndexOperator ISearchOperationCallbackFactory searchOpCallbackFactory, boolean retainNull, INullWriterFactory iNullWriterFactory) { super(spec, 1, 1, outRecDesc, storageManager, lcManagerProvider, fileSplitProvider, - FilesIndexDescription.EXTERNAL_FILE_INDEX_TYPE_TRAITS, - FilesIndexDescription.FILES_INDEX_COMP_FACTORIES, FilesIndexDescription.BLOOM_FILTER_FIELDS, + new FilesIndexDescription().EXTERNAL_FILE_INDEX_TYPE_TRAITS, + new FilesIndexDescription().FILES_INDEX_COMP_FACTORIES, FilesIndexDescription.BLOOM_FILTER_FIELDS, externalFilesIndexDataFlowHelperFactory, null, propagateInput, retainNull, iNullWriterFactory, null, searchOpCallbackFactory, null); this.adapterFactory = adapterFactory; http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-metadata/src/main/java/org/apache/asterix/metadata/external/FilesIndexDescription.java ---------------------------------------------------------------------- diff --git a/asterix-metadata/src/main/java/org/apache/asterix/metadata/external/FilesIndexDescription.java b/asterix-metadata/src/main/java/org/apache/asterix/metadata/external/FilesIndexDescription.java index 0474ae5..cb4c0d2 100644 --- a/asterix-metadata/src/main/java/org/apache/asterix/metadata/external/FilesIndexDescription.java +++ b/asterix-metadata/src/main/java/org/apache/asterix/metadata/external/FilesIndexDescription.java @@ -41,58 +41,64 @@ public class FilesIndexDescription { public final static int FILE_KEY_INDEX = 0; public final static int FILE_KEY_SIZE = 1; public final static int FILE_PAYLOAD_INDEX = 1; - public static RecordDescriptor FILE_INDEX_RECORD_DESCRIPTOR; - public static RecordDescriptor FILE_BUDDY_BTREE_RECORD_DESCRIPTOR; public final static String[] payloadFieldNames = { "FileName", "FileSize", "FileModDate" }; public final static IAType[] payloadFieldTypes = { BuiltinType.ASTRING, BuiltinType.AINT64, BuiltinType.ADATETIME }; - public static ARecordType EXTERNAL_FILE_RECORD_TYPE; - public static ISerializerDeserializer EXTERNAL_FILE_RECORD_SERDE; - public static final ISerializerDeserializer[] EXTERNAL_FILE_BUDDY_BTREE_FIELDS = new ISerializerDeserializer[1]; - public static final ITypeTraits[] EXTERNAL_FILE_BUDDY_BTREE_TYPE_TRAITS = new ITypeTraits[1]; - public static final ISerializerDeserializer[] EXTERNAL_FILE_TUPLE_FIELDS = new ISerializerDeserializer[FILE_INDEX_TUPLE_SIZE]; - public static final ITypeTraits[] EXTERNAL_FILE_INDEX_TYPE_TRAITS = new ITypeTraits[FILE_INDEX_TUPLE_SIZE]; - public static final IBinaryComparatorFactory[] FILES_INDEX_COMP_FACTORIES = new IBinaryComparatorFactory[] { AqlBinaryComparatorFactoryProvider.INSTANCE - .getBinaryComparatorFactory(BuiltinType.AINT32, true) }; + public static final int[] BLOOM_FILTER_FIELDS = { 0 }; public static final int EXTERNAL_FILE_NAME_FIELD_INDEX = 0; public static final int EXTERNAL_FILE_SIZE_FIELD_INDEX = 1; public static final int EXTERNAL_FILE_MOD_DATE_FIELD_INDEX = 2; - static { - try { - EXTERNAL_FILE_RECORD_TYPE = new ARecordType("ExternalFileRecordType", payloadFieldNames, payloadFieldTypes, - true); - EXTERNAL_FILE_RECORD_SERDE = AqlSerializerDeserializerProvider.INSTANCE - .getSerializerDeserializer(EXTERNAL_FILE_RECORD_TYPE); - EXTERNAL_FILE_TUPLE_FIELDS[FILE_KEY_INDEX] = AqlSerializerDeserializerProvider.INSTANCE - .getSerializerDeserializer(IndexingConstants.FILE_NUMBER_FIELD_TYPE); - EXTERNAL_FILE_TUPLE_FIELDS[FILE_PAYLOAD_INDEX] = EXTERNAL_FILE_RECORD_SERDE; - EXTERNAL_FILE_BUDDY_BTREE_FIELDS[FILE_KEY_INDEX] = AqlSerializerDeserializerProvider.INSTANCE - .getSerializerDeserializer(IndexingConstants.FILE_NUMBER_FIELD_TYPE); + public final ARecordType EXTERNAL_FILE_RECORD_TYPE; + public final ITypeTraits[] EXTERNAL_FILE_BUDDY_BTREE_TYPE_TRAITS = new ITypeTraits[1]; + public final ITypeTraits[] EXTERNAL_FILE_INDEX_TYPE_TRAITS = new ITypeTraits[FILE_INDEX_TUPLE_SIZE]; - EXTERNAL_FILE_INDEX_TYPE_TRAITS[FILE_KEY_INDEX] = AqlTypeTraitProvider.INSTANCE - .getTypeTrait(IndexingConstants.FILE_NUMBER_FIELD_TYPE); - EXTERNAL_FILE_INDEX_TYPE_TRAITS[FILE_PAYLOAD_INDEX] = AqlTypeTraitProvider.INSTANCE - .getTypeTrait(EXTERNAL_FILE_RECORD_TYPE); - EXTERNAL_FILE_BUDDY_BTREE_TYPE_TRAITS[FILE_KEY_INDEX] = AqlTypeTraitProvider.INSTANCE - .getTypeTrait(IndexingConstants.FILE_NUMBER_FIELD_TYPE); + public final ISerializerDeserializer EXTERNAL_FILE_RECORD_SERDE; + public final RecordDescriptor FILE_INDEX_RECORD_DESCRIPTOR; + public final RecordDescriptor FILE_BUDDY_BTREE_RECORD_DESCRIPTOR; + public final ISerializerDeserializer[] EXTERNAL_FILE_BUDDY_BTREE_FIELDS = new ISerializerDeserializer[1]; + public final ISerializerDeserializer[] EXTERNAL_FILE_TUPLE_FIELDS = new ISerializerDeserializer[FILE_INDEX_TUPLE_SIZE]; + public final IBinaryComparatorFactory[] FILES_INDEX_COMP_FACTORIES = new IBinaryComparatorFactory[] { + AqlBinaryComparatorFactoryProvider.INSTANCE.getBinaryComparatorFactory(BuiltinType.AINT32, true) }; - FILE_INDEX_RECORD_DESCRIPTOR = new RecordDescriptor(EXTERNAL_FILE_TUPLE_FIELDS, - EXTERNAL_FILE_INDEX_TYPE_TRAITS); - - FILE_BUDDY_BTREE_RECORD_DESCRIPTOR = new RecordDescriptor(EXTERNAL_FILE_BUDDY_BTREE_FIELDS, - EXTERNAL_FILE_BUDDY_BTREE_TYPE_TRAITS); + public FilesIndexDescription() { + ARecordType type; + try { + type = new ARecordType("ExternalFileRecordType", payloadFieldNames, + payloadFieldTypes, true); } catch (Exception e) { e.printStackTrace(); - System.exit(1); + throw new RuntimeException(e); } + EXTERNAL_FILE_RECORD_TYPE = type; + EXTERNAL_FILE_INDEX_TYPE_TRAITS[FILE_KEY_INDEX] = AqlTypeTraitProvider.INSTANCE + .getTypeTrait(IndexingConstants.FILE_NUMBER_FIELD_TYPE); + EXTERNAL_FILE_INDEX_TYPE_TRAITS[FILE_PAYLOAD_INDEX] = AqlTypeTraitProvider.INSTANCE + .getTypeTrait(EXTERNAL_FILE_RECORD_TYPE); + EXTERNAL_FILE_BUDDY_BTREE_TYPE_TRAITS[FILE_KEY_INDEX] = AqlTypeTraitProvider.INSTANCE + .getTypeTrait(IndexingConstants.FILE_NUMBER_FIELD_TYPE); + + EXTERNAL_FILE_RECORD_SERDE = AqlSerializerDeserializerProvider.INSTANCE + .getSerializerDeserializer(EXTERNAL_FILE_RECORD_TYPE); + + EXTERNAL_FILE_TUPLE_FIELDS[FILE_KEY_INDEX] = AqlSerializerDeserializerProvider.INSTANCE + .getSerializerDeserializer(IndexingConstants.FILE_NUMBER_FIELD_TYPE); + EXTERNAL_FILE_TUPLE_FIELDS[FILE_PAYLOAD_INDEX] = EXTERNAL_FILE_RECORD_SERDE; + EXTERNAL_FILE_BUDDY_BTREE_FIELDS[FILE_KEY_INDEX] = AqlSerializerDeserializerProvider.INSTANCE + .getSerializerDeserializer(IndexingConstants.FILE_NUMBER_FIELD_TYPE); + + FILE_INDEX_RECORD_DESCRIPTOR = new RecordDescriptor(EXTERNAL_FILE_TUPLE_FIELDS, + EXTERNAL_FILE_INDEX_TYPE_TRAITS); + + FILE_BUDDY_BTREE_RECORD_DESCRIPTOR = new RecordDescriptor(EXTERNAL_FILE_BUDDY_BTREE_FIELDS, + EXTERNAL_FILE_BUDDY_BTREE_TYPE_TRAITS); } @SuppressWarnings("unchecked") - public static void getBuddyBTreeTupleFromFileNumber(ArrayTupleReference tuple, ArrayTupleBuilder tupleBuilder, + public void getBuddyBTreeTupleFromFileNumber(ArrayTupleReference tuple, ArrayTupleBuilder tupleBuilder, AMutableInt32 aInt32) throws IOException, AsterixException { tupleBuilder.reset(); - FilesIndexDescription.FILE_BUDDY_BTREE_RECORD_DESCRIPTOR.getFields()[0].serialize(aInt32, + FILE_BUDDY_BTREE_RECORD_DESCRIPTOR.getFields()[0].serialize(aInt32, tupleBuilder.getDataOutput()); tupleBuilder.addFieldEndOffset(); tuple.reset(tupleBuilder.getFieldEndOffsets(), tupleBuilder.getByteArray()); http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-metadata/src/main/java/org/apache/asterix/metadata/valueextractors/DatasetNameValueExtractor.java ---------------------------------------------------------------------- diff --git a/asterix-metadata/src/main/java/org/apache/asterix/metadata/valueextractors/DatasetNameValueExtractor.java b/asterix-metadata/src/main/java/org/apache/asterix/metadata/valueextractors/DatasetNameValueExtractor.java index a1a5b5c..e79fe1c 100644 --- a/asterix-metadata/src/main/java/org/apache/asterix/metadata/valueextractors/DatasetNameValueExtractor.java +++ b/asterix-metadata/src/main/java/org/apache/asterix/metadata/valueextractors/DatasetNameValueExtractor.java @@ -25,6 +25,7 @@ import java.io.DataInputStream; import org.apache.asterix.common.transactions.JobId; import org.apache.asterix.dataflow.data.nontagged.serde.AObjectSerializerDeserializer; +import org.apache.asterix.dataflow.data.nontagged.serde.AStringSerializerDeserializer; import org.apache.asterix.metadata.MetadataException; import org.apache.asterix.metadata.api.IValueExtractor; import org.apache.asterix.om.base.AString; @@ -36,6 +37,8 @@ import org.apache.hyracks.dataflow.common.data.accessors.ITupleReference; * contains a serialized representation of a Dataset metadata entity. */ public class DatasetNameValueExtractor implements IValueExtractor { + private final AObjectSerializerDeserializer aObjSerDer = new AObjectSerializerDeserializer(); + @Override public String getValue(JobId jobId, ITupleReference tuple) throws MetadataException, HyracksDataException { byte[] serRecord = tuple.getFieldData(2); @@ -43,6 +46,6 @@ public class DatasetNameValueExtractor implements IValueExtractor { int recordLength = tuple.getFieldLength(2); ByteArrayInputStream stream = new ByteArrayInputStream(serRecord, recordStartOffset, recordLength); DataInput in = new DataInputStream(stream); - return (((AString) AObjectSerializerDeserializer.INSTANCE.deserialize(in)).getStringValue()); + return (((AString) aObjSerDer.deserialize(in)).getStringValue()); } } http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-metadata/src/main/java/org/apache/asterix/metadata/valueextractors/DatatypeNameValueExtractor.java ---------------------------------------------------------------------- diff --git a/asterix-metadata/src/main/java/org/apache/asterix/metadata/valueextractors/DatatypeNameValueExtractor.java b/asterix-metadata/src/main/java/org/apache/asterix/metadata/valueextractors/DatatypeNameValueExtractor.java index 9d5e8b1..9a50a31 100644 --- a/asterix-metadata/src/main/java/org/apache/asterix/metadata/valueextractors/DatatypeNameValueExtractor.java +++ b/asterix-metadata/src/main/java/org/apache/asterix/metadata/valueextractors/DatatypeNameValueExtractor.java @@ -40,6 +40,7 @@ import org.apache.hyracks.dataflow.common.data.accessors.ITupleReference; public class DatatypeNameValueExtractor implements IValueExtractor { private final String dataverseName; private final MetadataNode metadataNode; + private final AObjectSerializerDeserializer aObjSerDer = new AObjectSerializerDeserializer(); public DatatypeNameValueExtractor(String dataverseName, MetadataNode metadataNode) { this.dataverseName = dataverseName; @@ -53,7 +54,7 @@ public class DatatypeNameValueExtractor implements IValueExtractor { int recordLength = tuple.getFieldLength(2); ByteArrayInputStream stream = new ByteArrayInputStream(serRecord, recordStartOffset, recordLength); DataInput in = new DataInputStream(stream); - String typeName = ((AString) AObjectSerializerDeserializer.INSTANCE.deserialize(in)).getStringValue(); + String typeName = ((AString) aObjSerDer.deserialize(in)).getStringValue(); try { if (metadataNode.getDatatype(jobId, dataverseName, typeName).getIsAnonymous()) { // Get index 0 because it is anonymous type, and it is used in http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-metadata/src/main/java/org/apache/asterix/metadata/valueextractors/NestedDatatypeNameValueExtractor.java ---------------------------------------------------------------------- diff --git a/asterix-metadata/src/main/java/org/apache/asterix/metadata/valueextractors/NestedDatatypeNameValueExtractor.java b/asterix-metadata/src/main/java/org/apache/asterix/metadata/valueextractors/NestedDatatypeNameValueExtractor.java index d046650..41d92c9 100644 --- a/asterix-metadata/src/main/java/org/apache/asterix/metadata/valueextractors/NestedDatatypeNameValueExtractor.java +++ b/asterix-metadata/src/main/java/org/apache/asterix/metadata/valueextractors/NestedDatatypeNameValueExtractor.java @@ -43,6 +43,7 @@ public class NestedDatatypeNameValueExtractor implements IValueExtractor public NestedDatatypeNameValueExtractor(String datatypeName) { this.datatypeName = datatypeName; } + private final AObjectSerializerDeserializer aObjSerDer = new AObjectSerializerDeserializer(); @Override public String getValue(JobId jobId, ITupleReference tuple) throws MetadataException, HyracksDataException { @@ -51,13 +52,13 @@ public class NestedDatatypeNameValueExtractor implements IValueExtractor int recordLength = tuple.getFieldLength(2); ByteArrayInputStream stream = new ByteArrayInputStream(serRecord, recordStartOffset, recordLength); DataInput in = new DataInputStream(stream); - String nestedType = ((AString) AObjectSerializerDeserializer.INSTANCE.deserialize(in)).getStringValue(); + String nestedType = ((AString) aObjSerDer.deserialize(in)).getStringValue(); if (nestedType.equals(datatypeName)) { recordStartOffset = tuple.getFieldStart(1); recordLength = tuple.getFieldLength(1); stream = new ByteArrayInputStream(serRecord, recordStartOffset, recordLength); in = new DataInputStream(stream); - return ((AString) AObjectSerializerDeserializer.INSTANCE.deserialize(in)).getStringValue(); + return ((AString) aObjSerDer.deserialize(in)).getStringValue(); } return null; } http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-om/src/main/java/org/apache/asterix/builders/RecordBuilder.java ---------------------------------------------------------------------- diff --git a/asterix-om/src/main/java/org/apache/asterix/builders/RecordBuilder.java b/asterix-om/src/main/java/org/apache/asterix/builders/RecordBuilder.java index d664e12..55ff32f 100644 --- a/asterix-om/src/main/java/org/apache/asterix/builders/RecordBuilder.java +++ b/asterix-om/src/main/java/org/apache/asterix/builders/RecordBuilder.java @@ -45,6 +45,7 @@ public class RecordBuilder implements IARecordBuilder { private final static int DEFAULT_NUM_OPEN_FIELDS = 10; private final static byte SER_NULL_TYPE_TAG = ATypeTag.NULL.serialize(); private final static byte RECORD_TYPE_TAG = ATypeTag.RECORD.serialize(); + private final UTF8StringSerializerDeserializer utf8SerDer = new UTF8StringSerializerDeserializer(); private int openPartOffsetArraySize; private byte[] openPartOffsetArray; @@ -226,9 +227,8 @@ public class RecordBuilder implements IARecordBuilder { for (int i = 1; i < numberOfOpenFields; i++) { if (utf8Comparator.compare(openBytes, (int) openPartOffsets[i - 1], openFieldNameLengths[i - 1], openBytes, (int) openPartOffsets[i], openFieldNameLengths[i]) == 0) { - String field = UTF8StringSerializerDeserializer.INSTANCE - .deserialize(new DataInputStream(new ByteArrayInputStream(openBytes, - (int) openPartOffsets[i], openFieldNameLengths[i]))); + String field = utf8SerDer.deserialize(new DataInputStream(new ByteArrayInputStream(openBytes, + (int) openPartOffsets[i], openFieldNameLengths[i]))); throw new AsterixException("Open fields " + (i - 1) + " and " + i + " have the same field name \"" + field + "\""); } http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-om/src/main/java/org/apache/asterix/dataflow/data/common/AListElementToken.java ---------------------------------------------------------------------- diff --git a/asterix-om/src/main/java/org/apache/asterix/dataflow/data/common/AListElementToken.java b/asterix-om/src/main/java/org/apache/asterix/dataflow/data/common/AListElementToken.java index f019f10..a3bff52 100644 --- a/asterix-om/src/main/java/org/apache/asterix/dataflow/data/common/AListElementToken.java +++ b/asterix-om/src/main/java/org/apache/asterix/dataflow/data/common/AListElementToken.java @@ -26,8 +26,8 @@ import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IToken; public class AListElementToken implements IToken { protected byte[] data; - protected int start; - protected int length; + protected int startOffset; + protected int endOffset; protected int tokenLength; protected int typeTag; @@ -37,13 +37,13 @@ public class AListElementToken implements IToken { } @Override - public int getLength() { - return length; + public int getEndOffset() { + return endOffset; } @Override - public int getStart() { - return start; + public int getStartOffset() { + return startOffset; } @Override @@ -52,10 +52,10 @@ public class AListElementToken implements IToken { } @Override - public void reset(byte[] data, int start, int length, int tokenLength, int tokenCount) { + public void reset(byte[] data, int startOffset, int endOffset, int tokenLength, int tokenCount) { this.data = data; - this.start = start; - this.length = length; + this.startOffset = startOffset; + this.endOffset = endOffset; this.tokenLength = tokenLength; // We abuse the last param, tokenCount, to pass the type tag. typeTag = tokenCount; @@ -64,7 +64,7 @@ public class AListElementToken implements IToken { @Override public void serializeToken(GrowableArray out) throws IOException { out.getDataOutput().writeByte(typeTag); - out.getDataOutput().write(data, start, length); + out.getDataOutput().write(data, startOffset, endOffset - startOffset); } @Override http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-om/src/main/java/org/apache/asterix/dataflow/data/common/AOrderedListBinaryTokenizer.java ---------------------------------------------------------------------- diff --git a/asterix-om/src/main/java/org/apache/asterix/dataflow/data/common/AOrderedListBinaryTokenizer.java b/asterix-om/src/main/java/org/apache/asterix/dataflow/data/common/AOrderedListBinaryTokenizer.java index 5f6e0b8..32207d3 100644 --- a/asterix-om/src/main/java/org/apache/asterix/dataflow/data/common/AOrderedListBinaryTokenizer.java +++ b/asterix-om/src/main/java/org/apache/asterix/dataflow/data/common/AOrderedListBinaryTokenizer.java @@ -59,9 +59,10 @@ public class AOrderedListBinaryTokenizer implements IBinaryTokenizer { itemOffset = getItemOffset(data, start, itemIndex); // Assuming homogeneous list. ATypeTag typeTag = EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(data[start + 1]); + // ? Can we handle the non-string type ? length = NonTaggedFormatUtil.getFieldValueLength(data, itemOffset, typeTag, false); // Last param is a hack to pass the type tag. - token.reset(data, itemOffset, length, length, data[start + 1]); + token.reset(data, itemOffset, itemOffset + length, length, data[start + 1]); } catch (AsterixException e) { throw new IllegalStateException(e); } http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/comparators/ListItemBinaryComparatorFactory.java ---------------------------------------------------------------------- diff --git a/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/comparators/ListItemBinaryComparatorFactory.java b/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/comparators/ListItemBinaryComparatorFactory.java index 64b5610..767a343 100644 --- a/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/comparators/ListItemBinaryComparatorFactory.java +++ b/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/comparators/ListItemBinaryComparatorFactory.java @@ -19,7 +19,6 @@ package org.apache.asterix.dataflow.data.nontagged.comparators; -import org.apache.asterix.formats.nontagged.UTF8StringLowercasePointable; import org.apache.asterix.om.types.ATypeTag; import org.apache.asterix.om.types.EnumDeserializer; import org.apache.hyracks.api.dataflow.value.IBinaryComparator; @@ -31,6 +30,7 @@ import org.apache.hyracks.data.std.primitive.DoublePointable; import org.apache.hyracks.data.std.primitive.FloatPointable; import org.apache.hyracks.data.std.primitive.IntegerPointable; import org.apache.hyracks.data.std.primitive.UTF8StringPointable; +import org.apache.hyracks.data.std.primitive.UTF8StringLowercasePointable; public class ListItemBinaryComparatorFactory implements IBinaryComparatorFactory { http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/hash/ListItemBinaryHashFunctionFactory.java ---------------------------------------------------------------------- diff --git a/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/hash/ListItemBinaryHashFunctionFactory.java b/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/hash/ListItemBinaryHashFunctionFactory.java index 6935f24..493833b 100644 --- a/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/hash/ListItemBinaryHashFunctionFactory.java +++ b/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/hash/ListItemBinaryHashFunctionFactory.java @@ -21,7 +21,6 @@ package org.apache.asterix.dataflow.data.nontagged.hash; import java.io.IOException; -import org.apache.asterix.formats.nontagged.UTF8StringLowercasePointable; import org.apache.asterix.om.types.ATypeTag; import org.apache.asterix.om.types.EnumDeserializer; import org.apache.hyracks.api.dataflow.value.IBinaryHashFunction; @@ -29,6 +28,7 @@ import org.apache.hyracks.api.dataflow.value.IBinaryHashFunctionFactory; import org.apache.hyracks.api.exceptions.HyracksDataException; import org.apache.hyracks.data.std.accessors.MurmurHash3BinaryHashFunctionFamily; import org.apache.hyracks.data.std.accessors.PointableBinaryHashFunctionFactory; +import org.apache.hyracks.data.std.primitive.UTF8StringLowercasePointable; import org.apache.hyracks.data.std.util.GrowableArray; /** http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/printers/PrintTools.java ---------------------------------------------------------------------- diff --git a/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/printers/PrintTools.java b/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/printers/PrintTools.java index 596e168..7d88a90 100644 --- a/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/printers/PrintTools.java +++ b/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/printers/PrintTools.java @@ -29,8 +29,13 @@ import java.io.IOException; import java.io.OutputStream; import java.io.PrintStream; +import org.apache.hyracks.data.std.primitive.UTF8StringPointable; +import org.apache.hyracks.util.bytes.HexPrinter; +import org.apache.hyracks.util.string.UTF8StringUtil; + public class PrintTools { + private static final GregorianCalendarSystem gCalInstance = GregorianCalendarSystem.getInstance(); private static long CHRONON_OF_DAY = 24 * 60 * 60 * 1000; @@ -185,13 +190,13 @@ public class PrintTools { } public static void writeUTF8StringAsCSV(byte[] b, int s, int l, OutputStream os) throws IOException { - int stringLength = UTF8StringPointable.getUTFLength(b, s); - int position = s + 2; // skip 2 bytes containing string size + int stringLength = UTF8StringUtil.getUTFLength(b, s); + int position = s + UTF8StringUtil.getNumBytesToStoreLength(stringLength); int maxPosition = position + stringLength; os.write('"'); while (position < maxPosition) { - char c = UTF8StringPointable.charAt(b, position); - int sz = UTF8StringPointable.charSize(b, position); + char c = UTF8StringUtil.charAt(b, position); + int sz = UTF8StringUtil.charSize(b, position); if (c == '"') { os.write('"'); } @@ -202,13 +207,13 @@ public class PrintTools { } public static void writeUTF8StringAsJSON(byte[] b, int s, int l, OutputStream os) throws IOException { - int stringLength = UTF8StringPointable.getUTFLength(b, s); - int position = s + 2; // skip 2 bytes containing string size - int maxPosition = position + stringLength; + int utfLength = UTF8StringUtil.getUTFLength(b, s); + int position = s + UTF8StringUtil.getNumBytesToStoreLength(utfLength); // skip 2 bytes containing string size + int maxPosition = position + utfLength; os.write('"'); while (position < maxPosition) { - char c = UTF8StringPointable.charAt(b, position); - int sz = UTF8StringPointable.charSize(b, position); + char c = UTF8StringUtil.charAt(b, position); + int sz = UTF8StringUtil.charSize(b, position); switch (c) { // escape case '\b': @@ -296,27 +301,9 @@ public class PrintTools { os.write('u'); os.write('0'); os.write('0'); - os.write(hex((c >>> 4) & 0x0f, CASE.LOWER_CASE)); - os.write(hex(c & 0x0f, CASE.LOWER_CASE)); + os.write(HexPrinter.hex((c >>> 4) & 0x0f, HexPrinter.CASE.LOWER_CASE)); + os.write(HexPrinter.hex(c & 0x0f, HexPrinter.CASE.LOWER_CASE)); } - public static Appendable printHexString(byte[] bytes, int start, int length, Appendable appendable) - throws IOException { - for (int i = 0; i < length; ++i) { - appendable.append((char) hex((bytes[start + i] >>> 4) & 0x0f, CASE.UPPER_CASE)); - appendable.append((char) hex((bytes[start + i] & 0x0f), CASE.UPPER_CASE)); - } - return appendable; - } - - public static byte hex(int i, CASE c) { - switch (c) { - case LOWER_CASE: - return (byte) (i < 10 ? i + '0' : i + ('a' - 10)); - case UPPER_CASE: - return (byte) (i < 10 ? i + '0' : i + ('A' - 10)); - } - return Byte.parseByte(null); - } }