asterixdb-notifications mailing list archives

From "Taewoo Kim (JIRA)" <j...@apache.org>
Subject [jira] [Commented] (ASTERIXDB-1544) Omit the fuzzyjoin on inverted index
Date Tue, 26 Jul 2016 02:04:21 GMT

    [ https://issues.apache.org/jira/browse/ASTERIXDB-1544?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15393053#comment-15393053 ]

Taewoo Kim commented on ASTERIXDB-1544:
---------------------------------------

@Chen: this issue will be fixed soon, once my patch is approved.

> Omit the fuzzyjoin on inverted index
> ------------------------------------
>
>                 Key: ASTERIXDB-1544
>                 URL: https://issues.apache.org/jira/browse/ASTERIXDB-1544
>             Project: Apache AsterixDB
>          Issue Type: Bug
>         Environment: MAC/linux
>            Reporter: Wenhai
>            Assignee: Taewoo Kim
>            Priority: Critical
>
> In the current master, we have NO test cases covering the fuzzy join on an (inverted-)indexed field. Once we trigger a fuzzy join "~=" on an indexed field, we always get an error with the following log.
> Schema
> {noformat}
> drop dataverse fuzzyjointest if exists;
> create dataverse fuzzyjointest;
> use dataverse fuzzyjointest;
> create type DBLPType as open {
>   tid: uuid,
>   id: int64,
>   dblpid: string?,
>   title: string?,
>   authors: string?,
>   misc: string?
> }
> create type CSXType as closed {
>   tid: uuid,
>   id: int64,
>   csxid: string?,
>   title: string?,
>   authors: string?,
>   misc: string?
> }
> create dataset DBLP(DBLPType) primary key tid autogenerated;
> create dataset CSX(CSXType) primary key tid autogenerated;
> load dataset DBLP
> using localfs
> (("path"="127.0.0.1:///Users/michael/Research/asterixdb-src/asterixdb-fuzzy/asterixdb/asterixdb/asterix-app/data/dblp-small/dblp-small-multi-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"))
pre-sorted;
> load dataset CSX
> using localfs
> (("path"="127.0.0.1:///Users/michael/Research/asterixdb-src/asterixdb-fuzzy/asterixdb/asterixdb/asterix-app/data/pub-small/csx-small-multi-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"));
> use dataverse fuzzyjointest;
> drop index DBLP.title_index if exists;
> create index title_index on DBLP(title) type keyword;
> drop index DBLP.author_index if exists;
> create index author_index on DBLP(authors) type keyword;
> drop index CSX.csx_author_index if exists;
> create index csx_author_index on CSX(authors) type keyword;
> {noformat}
> Query
> {noformat}
> use dataverse fuzzyjointest;
> set simthreshold '.7f';
> for $o in dataset('DBLP')
> for $t in dataset('CSX')
> where word-tokens($o.authors) ~= word-tokens($t.authors)
> return {"cid": $t.id, "did": $o.id}
> {noformat}
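> For reference, the plan below shows the optimizer rewriting the "~=" predicate into an explicit asterix:similarity-jaccard-check call with the 0.7 threshold. A roughly equivalent explicit form of the query (just a sketch; I have not verified that it takes the same inverted-index path) would be:
> {noformat}
> use dataverse fuzzyjointest;
> for $o in dataset('DBLP')
> for $t in dataset('CSX')
> where similarity-jaccard(word-tokens($o.authors), word-tokens($t.authors)) >= 0.7f
> return {"cid": $t.id, "did": $o.id}
> {noformat}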
> Plan
> {noformat}
> distribute result [%0->$$9]
> -- DISTRIBUTE_RESULT  |PARTITIONED|
>   exchange 
>   -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
>     project ([$$9])
>     -- STREAM_PROJECT  |PARTITIONED|
>       assign [$$9] <- [function-call: asterix:closed-record-constructor, Args:[AString: {cid}, %0->$$18, AString: {did}, %0->$$19]]
>       -- ASSIGN  |PARTITIONED|
>         project ([$$18, $$19])
>         -- STREAM_PROJECT  |PARTITIONED|
>           exchange 
>           -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
>             join (function-call: algebricks:eq, Args:[%0->$$22, %0->$$12])
>             -- HYBRID_HASH_JOIN [$$22][$$12]  |PARTITIONED|
>               exchange 
>               -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
>                 project ([$$19, $$22])
>                 -- STREAM_PROJECT  |PARTITIONED|
>                   assign [$$19] <- [function-call: asterix:field-access-by-index, Args:[%0->$$0, AInt32: {1}]]
>                   -- ASSIGN  |PARTITIONED|
>                     exchange 
>                     -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
>                       data-scan []<-[$$22, $$0] <- fuzzyjointest:DBLP
>                       -- DATASOURCE_SCAN  |PARTITIONED|
>                         exchange 
>                         -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
>                           empty-tuple-source
>                           -- EMPTY_TUPLE_SOURCE  |PARTITIONED|
>               exchange 
>               -- HASH_PARTITION_EXCHANGE [$$12]  |PARTITIONED|
>                 project ([$$18, $$12])
>                 -- STREAM_PROJECT  |PARTITIONED|
>                   select (function-call: asterix:get-item, Args:[function-call: asterix:similarity-jaccard-check, Args:[%0->$$14, function-call: asterix:word-tokens, Args:[%0->$$17], AFloat: {0.7}], AInt32: {0}])
>                   -- STREAM_SELECT  |PARTITIONED|
>                     project ([$$17, $$18, $$12, $$14])
>                     -- STREAM_PROJECT  |PARTITIONED|
>                       assign [$$18, $$17] <- [function-call: asterix:field-access-by-index, Args:[%0->$$1, AInt32: {1}], function-call: asterix:field-access-by-index, Args:[%0->$$1, AInt32: {4}]]
>                       -- ASSIGN  |PARTITIONED|
>                         project ([$$1, $$12, $$14])
>                         -- STREAM_PROJECT  |PARTITIONED|
>                           exchange 
>                           -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
>                             unnest-map [$$13, $$1] <- function-call: asterix:index-search, Args:[AString: {CSX}, AInt32: {0}, AString: {fuzzyjointest}, AString: {CSX}, ABoolean: {true}, ABoolean: {false}, AInt32: {1}, %0->$$26, AInt32: {1}, %0->$$26, TRUE, TRUE, TRUE]
>                             -- BTREE_SEARCH  |PARTITIONED|
>                               exchange 
>                               -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
>                                 order (ASC, %0->$$26) 
>                                 -- STABLE_SORT [$$26(ASC)]  |PARTITIONED|
>                                   exchange 
>                                   -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
>                                     unnest-map [$$26] <- function-call: asterix:index-search, Args:[AString: {csx_author_index}, AInt32: {4}, AString: {fuzzyjointest}, AString: {CSX}, ABoolean: {true}, ABoolean: {true}, AInt32: {1}, AFloat: {0.7}, AInt32: {25}, AInt32: {1}, %0->$$14]
>                                     -- LENGTH_PARTITIONED_INVERTED_INDEX_SEARCH  |PARTITIONED|
>                                       exchange 
>                                       -- BROADCAST_EXCHANGE  |PARTITIONED|
>                                         project ([$$12, $$14])
>                                         -- STREAM_PROJECT  |PARTITIONED|
>                                           assign [$$14] <- [function-call: asterix:word-tokens, Args:[function-call: asterix:field-access-by-index, Args:[%0->$$25, AInt32: {4}]]]
>                                           -- ASSIGN  |PARTITIONED|
>                                             exchange 
>                                             -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
>                                               data-scan []<-[$$12, $$25] <- fuzzyjointest:DBLP
>                                               -- DATASOURCE_SCAN  |PARTITIONED|
>                                                 exchange 
>                                                 -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
>                                                   empty-tuple-source
>                                                   -- EMPTY_TUPLE_SOURCE  |PARTITIONED|
> {noformat}
> The same error occurs, similar to issue-1487.
> {noformat}
> org.apache.hyracks.api.exceptions.HyracksException: Job failed on account of:
> HYR0002: null
> 	at org.apache.hyracks.control.cc.job.JobRun.waitForCompletion(JobRun.java:212)
> 	at org.apache.hyracks.control.cc.work.WaitForJobCompletionWork$1.run(WaitForJobCompletionWork.java:48)
> 	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
> 	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
> 	at java.lang.Thread.run(Thread.java:745)
> Caused by: org.apache.hyracks.api.exceptions.HyracksDataException: HYR0002: null
> 	at org.apache.hyracks.control.common.utils.ExceptionUtils.setNodeIds(ExceptionUtils.java:62)
> 	at org.apache.hyracks.control.nc.Task.run(Task.java:319)
> 	... 3 more
> Caused by: org.apache.hyracks.api.exceptions.HyracksDataException: null
> 	at org.apache.hyracks.control.nc.Task.pushFrames(Task.java:365)
> 	at org.apache.hyracks.control.nc.Task.run(Task.java:297)
> 	... 3 more
> Caused by: org.apache.hyracks.api.exceptions.HyracksDataException: null
> 	at org.apache.hyracks.storage.am.common.dataflow.IndexSearchOperatorNodePushable.open(IndexSearchOperatorNodePushable.java:143)
> 	at org.apache.hyracks.control.nc.Task.pushFrames(Task.java:341)
> 	... 4 more
> Caused by: java.lang.NullPointerException
> 	at org.apache.hyracks.storage.am.lsm.invertedindex.dataflow.LSMInvertedIndexSearchOperatorNodePushable.createSearchPredicate(LSMInvertedIndexSearchOperatorNodePushable.java:56)
> 	at org.apache.hyracks.storage.am.common.dataflow.IndexSearchOperatorNodePushable.open(IndexSearchOperatorNodePushable.java:131)
> 	... 5 more
> org.apache.hyracks.api.exceptions.HyracksException: Job failed on account of:
> HYR0002: null
> 	at org.apache.hyracks.control.cc.job.JobRun.waitForCompletion(JobRun.java:212)
> 	at org.apache.hyracks.control.cc.work.WaitForJobCompletionWork$1.run(WaitForJobCompletionWork.java:48)
> 	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
> 	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
> 	at java.lang.Thread.run(Thread.java:745)
> Caused by: org.apache.hyracks.api.exceptions.HyracksDataException: HYR0002: null
> 	at org.apache.hyracks.control.common.utils.ExceptionUtils.setNodeIds(ExceptionUtils.java:62)
> 	at org.apache.hyracks.control.nc.Task.run(Task.java:319)
> 	... 3 more
> Caused by: org.apache.hyracks.api.exceptions.HyracksDataException: null
> 	at org.apache.hyracks.control.nc.Task.pushFrames(Task.java:365)
> 	at org.apache.hyracks.control.nc.Task.run(Task.java:297)
> 	... 3 more
> Caused by: org.apache.hyracks.api.exceptions.HyracksDataException: null
> 	at org.apache.hyracks.storage.am.common.dataflow.IndexSearchOperatorNodePushable.open(IndexSearchOperatorNodePushable.java:143)
> 	at org.apache.hyracks.control.nc.Task.pushFrames(Task.java:341)
> 	... 4 more
> Caused by: java.lang.NullPointerException
> 	at org.apache.hyracks.storage.am.lsm.invertedindex.dataflow.LSMInvertedIndexSearchOperatorNodePushable.createSearchPredicate(LSMInvertedIndexSearchOperatorNodePushable.java:56)
> 	at org.apache.hyracks.storage.am.common.dataflow.IndexSearchOperatorNodePushable.open(IndexSearchOperatorNodePushable.java:131)
> 	... 5 more
> Jul 24, 2016 8:31:29 AM org.apache.asterix.api.http.servlet.APIServlet doPost
> SEVERE: Job failed on account of:
> HYR0002: null
> org.apache.hyracks.api.exceptions.HyracksException: Job failed on account of:
> HYR0002: null
> 	at org.apache.hyracks.control.cc.job.JobRun.waitForCompletion(JobRun.java:212)
> 	at org.apache.hyracks.control.cc.work.WaitForJobCompletionWork$1.run(WaitForJobCompletionWork.java:48)
> 	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
> 	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
> 	at java.lang.Thread.run(Thread.java:745)
> Caused by: org.apache.hyracks.api.exceptions.HyracksDataException: HYR0002: null
> 	at org.apache.hyracks.control.common.utils.ExceptionUtils.setNodeIds(ExceptionUtils.java:62)
> 	at org.apache.hyracks.control.nc.Task.run(Task.java:319)
> 	... 3 more
> Caused by: org.apache.hyracks.api.exceptions.HyracksDataException: null
> 	at org.apache.hyracks.control.nc.Task.pushFrames(Task.java:365)
> 	at org.apache.hyracks.control.nc.Task.run(Task.java:297)
> 	... 3 more
> Caused by: org.apache.hyracks.api.exceptions.HyracksDataException: null
> 	at org.apache.hyracks.storage.am.common.dataflow.IndexSearchOperatorNodePushable.open(IndexSearchOperatorNodePushable.java:143)
> 	at org.apache.hyracks.control.nc.Task.pushFrames(Task.java:341)
> 	... 4 more
> Caused by: java.lang.NullPointerException
> 	at org.apache.hyracks.storage.am.lsm.invertedindex.dataflow.LSMInvertedIndexSearchOperatorNodePushable.createSearchPredicate(LSMInvertedIndexSearchOperatorNodePushable.java:56)
> 	at org.apache.hyracks.storage.am.common.dataflow.IndexSearchOperatorNodePushable.open(IndexSearchOperatorNodePushable.java:131)
> 	... 5 more
> {noformat}



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)
