drill-issues mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "ASF GitHub Bot (JIRA)" <j...@apache.org>
Subject [jira] [Commented] (DRILL-5375) Nested loop join: return correct result for left join
Date Wed, 29 Mar 2017 10:36:41 GMT

    [ https://issues.apache.org/jira/browse/DRILL-5375?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15946907#comment-15946907
] 

ASF GitHub Bot commented on DRILL-5375:
---------------------------------------

Github user arina-ielchiieva commented on a diff in the pull request:

    https://github.com/apache/drill/pull/794#discussion_r108382080
  
    --- Diff: exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/join/NestedLoopJoinTemplate.java
---
    @@ -40,132 +41,133 @@
       // Record count of the left batch currently being processed
       private int leftRecordCount = 0;
     
    -  // List of record counts  per batch in the hyper container
    +  // List of record counts per batch in the hyper container
       private List<Integer> rightCounts = null;
     
       // Output batch
       private NestedLoopJoinBatch outgoing = null;
     
    -  // Next right batch to process
    -  private int nextRightBatchToProcess = 0;
    -
    -  // Next record in the current right batch to process
    -  private int nextRightRecordToProcess = 0;
    -
    -  // Next record in the left batch to process
    -  private int nextLeftRecordToProcess = 0;
    +  // Iteration status tracker
    +  private IterationStatusTracker tracker = new IterationStatusTracker();
     
       /**
        * Method initializes necessary state and invokes the doSetup() to set the
    -   * input and output value vector references
    +   * input and output value vector references.
    +   *
        * @param context Fragment context
        * @param left Current left input batch being processed
        * @param rightContainer Hyper container
    +   * @param rightCounts Counts for each right container
        * @param outgoing Output batch
        */
    -  public void setupNestedLoopJoin(FragmentContext context, RecordBatch left,
    +  public void setupNestedLoopJoin(FragmentContext context,
    +                                  RecordBatch left,
                                       ExpandableHyperContainer rightContainer,
                                       LinkedList<Integer> rightCounts,
                                       NestedLoopJoinBatch outgoing) {
         this.left = left;
    -    leftRecordCount = left.getRecordCount();
    +    this.leftRecordCount = left.getRecordCount();
         this.rightCounts = rightCounts;
         this.outgoing = outgoing;
     
         doSetup(context, rightContainer, left, outgoing);
       }
     
       /**
    -   * This method is the core of the nested loop join. For every record on the right we
go over
    -   * the left batch and produce the cross product output
    +   * Main entry point for producing the output records. Thin wrapper around populateOutgoingBatch(),
this method
    +   * controls which left batch we are processing and fetches the next left input batch
once we exhaust the current one.
    +   *
    +   * @param joinType join type (INNER ot LEFT)
    +   * @return the number of records produced in the output batch
    +   */
    +  public int outputRecords(JoinRelType joinType) {
    +    int outputIndex = 0;
    +    while (leftRecordCount != 0) {
    +      outputIndex = populateOutgoingBatch(joinType, outputIndex);
    +      if (outputIndex >= NestedLoopJoinBatch.MAX_BATCH_SIZE) {
    +        break;
    +      }
    +      // reset state and get next left batch
    +      resetAndGetNextLeft();
    +    }
    +    return outputIndex;
    +  }
    +
    +  /**
    +   * This method is the core of the nested loop join.For each left batch record looks
for matching record
    +   * from the list of right batches. Match is checked by calling {@link #doEval(int,
int, int)} method.
    +   * If matching record is found both left and right records are written into output
batch,
    +   * otherwise if join type is LEFT, than only left record is written, right batch record
values will be null.
    +   *
    +   * @param joinType join type (INNER or LEFT)
        * @param outputIndex index to start emitting records at
        * @return final outputIndex after producing records in the output batch
        */
    -  private int populateOutgoingBatch(int outputIndex) {
    -
    -    // Total number of batches on the right side
    -    int totalRightBatches = rightCounts.size();
    -
    -    // Total number of records on the left
    -    int localLeftRecordCount = leftRecordCount;
    -
    -    /*
    -     * The below logic is the core of the NLJ. To have better performance we copy the
instance members into local
    -     * method variables, once we are done with the loop we need to update the instance
variables to reflect the new
    -     * state. To avoid code duplication of resetting the instance members at every exit
point in the loop we are using
    -     * 'goto'
    -     */
    -    int localNextRightBatchToProcess = nextRightBatchToProcess;
    -    int localNextRightRecordToProcess = nextRightRecordToProcess;
    -    int localNextLeftRecordToProcess = nextLeftRecordToProcess;
    -
    -    outer: {
    -
    -      for (; localNextRightBatchToProcess< totalRightBatches; localNextRightBatchToProcess++)
{ // for every batch on the right
    -        int compositeIndexPart = localNextRightBatchToProcess << 16;
    -        int rightRecordCount = rightCounts.get(localNextRightBatchToProcess);
    -
    -        for (; localNextRightRecordToProcess < rightRecordCount; localNextRightRecordToProcess++)
{ // for every record in this right batch
    -          for (; localNextLeftRecordToProcess < localLeftRecordCount; localNextLeftRecordToProcess++)
{ // for every record in the left batch
    -
    +  private int populateOutgoingBatch(JoinRelType joinType, int outputIndex) {
    +    // copy index and match counters as local variables to speed up processing
    +    int nextRightBatchToProcess = tracker.getNextRightBatchToProcess();
    +    int nextRightRecordToProcess = tracker.getNextRightRecordToProcess();
    +    int nextLeftRecordToProcess = tracker.getNextLeftRecordToProcess();
    +    boolean rightRecordMatched = tracker.isRightRecordMatched();
    +
    +    outer:
    +    // for every record in the left batch
    +    for (; nextLeftRecordToProcess < leftRecordCount; nextLeftRecordToProcess++) {
    +      // for every batch on the right
    +      for (; nextRightBatchToProcess < rightCounts.size(); nextRightBatchToProcess++)
{
    +        int rightRecordCount = rightCounts.get(nextRightBatchToProcess);
    --- End diff --
    
    Here we keep track of right records within one right record batch. We have list of right
records counts `List<Integer> rightCounts`. This list is populated with record counts
from each right input record batch [1], `getRecordCount()` returns int [2]. 
    
    [1] https://github.com/apache/drill/blob/2af709f43d01f341b2a52c6473ea49d6761fdc61/exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/join/NestedLoopJoinBatch.java#L349
    
    [2] https://github.com/apache/drill/blob/master/exec/java-exec/src/main/java/org/apache/drill/exec/record/RecordBatch.java#L229


> Nested loop join: return correct result for left join
> -----------------------------------------------------
>
>                 Key: DRILL-5375
>                 URL: https://issues.apache.org/jira/browse/DRILL-5375
>             Project: Apache Drill
>          Issue Type: Bug
>    Affects Versions: 1.8.0
>            Reporter: Arina Ielchiieva
>            Assignee: Arina Ielchiieva
>              Labels: doc-impacting
>
> Mini repro:
> 1. Create 2 Hive tables with data
> {code}
> CREATE TABLE t1 (
>   FYQ varchar(999),
>   dts varchar(999),
>   dte varchar(999)
> )
> ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
> 2016-Q1,2016-06-01,2016-09-30
> 2016-Q2,2016-09-01,2016-12-31
> 2016-Q3,2017-01-01,2017-03-31
> 2016-Q4,2017-04-01,2017-06-30
> CREATE TABLE t2 (
>   who varchar(999),
>   event varchar(999),
>   dt varchar(999)
> )
> ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
> aperson,did somthing,2017-01-06
> aperson,did somthing else,2017-01-12
> aperson,had chrsitmas,2016-12-26
> aperson,went wild,2016-01-01
> {code}
> 2. Impala Query shows correct result
> {code}
> select t2.dt, t1.fyq, t2.who, t2.event
> from t2
> left join t1 on t2.dt between t1.dts and t1.dte
> order by t2.dt;
> +------------+---------+---------+-------------------+
> | dt         | fyq     | who     | event             |
> +------------+---------+---------+-------------------+
> | 2016-01-01 | NULL    | aperson | went wild         |
> | 2016-12-26 | 2016-Q2 | aperson | had chrsitmas     |
> | 2017-01-06 | 2016-Q3 | aperson | did somthing      |
> | 2017-01-12 | 2016-Q3 | aperson | did somthing else |
> +------------+---------+---------+-------------------+
> {code}
> 3. Drill query shows wrong results:
> {code}
> alter session set planner.enable_nljoin_for_scalar_only=false;
> use hive;
> select t2.dt, t1.fyq, t2.who, t2.event
> from t2
> left join t1 on t2.dt between t1.dts and t1.dte
> order by t2.dt;
> +-------------+----------+----------+--------------------+
> |     dt      |   fyq    |   who    |       event        |
> +-------------+----------+----------+--------------------+
> | 2016-12-26  | 2016-Q2  | aperson  | had chrsitmas      |
> | 2017-01-06  | 2016-Q3  | aperson  | did somthing       |
> | 2017-01-12  | 2016-Q3  | aperson  | did somthing else  |
> +-------------+----------+----------+--------------------+
> 3 rows selected (2.523 seconds)
> {code}



--
This message was sent by Atlassian JIRA
(v6.3.15#6346)

Mime
View raw message