hive-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From prasan...@apache.org
Subject hive git commit: HIVE-11043: ORC split strategies should adapt based on number of files (Gopal V reviewed by Prasanth Jayachandran)
Date Tue, 23 Jun 2015 21:21:26 GMT
Repository: hive
Updated Branches:
  refs/heads/master 724ef3383 -> 5f78f9ef1


HIVE-11043: ORC split strategies should adapt based on number of files (Gopal V reviewed by Prasanth Jayachandran)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/5f78f9ef
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/5f78f9ef
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/5f78f9ef

Branch: refs/heads/master
Commit: 5f78f9ef1e6c798849d34cc66721e6c1d9709b6f
Parents: 724ef33
Author: Prasanth Jayachandran <j.prasanth.j@gmail.com>
Authored: Tue Jun 23 14:19:13 2015 -0700
Committer: Prasanth Jayachandran <j.prasanth.j@gmail.com>
Committed: Tue Jun 23 14:19:13 2015 -0700

----------------------------------------------------------------------
 .../hadoop/hive/ql/io/orc/OrcInputFormat.java   | 18 +++-
 .../hive/ql/io/orc/TestInputOutputFormat.java   | 97 +++++++++++++++++++-
 2 files changed, 111 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/5f78f9ef/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
index 5d6c9da..62e6de7 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
@@ -374,6 +374,7 @@ public class OrcInputFormat  implements InputFormat<NullWritable, OrcStruct>,
     private final int numBuckets;
     private final long maxSize;
     private final long minSize;
+    private final int minSplits;
     private final boolean footerInSplits;
     private final boolean cacheStripeDetails;
     private final AtomicInteger cacheHitCounter = new AtomicInteger(0);
@@ -382,6 +383,10 @@ public class OrcInputFormat  implements InputFormat<NullWritable, OrcStruct>,
     private SplitStrategyKind splitStrategyKind;
 
     Context(Configuration conf) {
+      this(conf, 1);
+    }
+
+    Context(Configuration conf, final int minSplits) {
       this.conf = conf;
       minSize = conf.getLong(MIN_SPLIT_SIZE, DEFAULT_MIN_SPLIT_SIZE);
       maxSize = conf.getLong(MAX_SPLIT_SIZE, DEFAULT_MAX_SPLIT_SIZE);
@@ -404,6 +409,8 @@ public class OrcInputFormat  implements InputFormat<NullWritable, OrcStruct>,
 
       cacheStripeDetails = (cacheStripeDetailsSize > 0);
 
+      this.minSplits = Math.min(cacheStripeDetailsSize, minSplits);
+
       synchronized (Context.class) {
         if (threadPool == null) {
           threadPool = Executors.newFixedThreadPool(numThreads,
@@ -681,7 +688,7 @@ public class OrcInputFormat  implements InputFormat<NullWritable, OrcStruct>,
             break;
           default:
             // HYBRID strategy
-            if (avgFileSize > context.maxSize) {
+            if (avgFileSize > context.maxSize || numFiles <= context.minSplits) {
              splitStrategy = new ETLSplitStrategy(context, fs, dir, children, isOriginal, deltas,
                  covered);
             } else {
@@ -983,8 +990,13 @@ public class OrcInputFormat  implements InputFormat<NullWritable, OrcStruct>,
 
   static List<OrcSplit> generateSplitsInfo(Configuration conf)
       throws IOException {
+    return generateSplitsInfo(conf, -1);
+  }
+
+  static List<OrcSplit> generateSplitsInfo(Configuration conf, int numSplits)
+      throws IOException {
     // use threads to resolve directories into splits
-    Context context = new Context(conf);
+    Context context = new Context(conf, numSplits);
     List<OrcSplit> splits = Lists.newArrayList();
     List<Future<?>> pathFutures = Lists.newArrayList();
     List<Future<?>> splitFutures = Lists.newArrayList();
@@ -1049,7 +1061,7 @@ public class OrcInputFormat  implements InputFormat<NullWritable, OrcStruct>,
   public InputSplit[] getSplits(JobConf job,
                                 int numSplits) throws IOException {
     perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.ORC_GET_SPLITS);
-    List<OrcSplit> result = generateSplitsInfo(job);
+    List<OrcSplit> result = generateSplitsInfo(job, numSplits);
     perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.ORC_GET_SPLITS);
     return result.toArray(new InputSplit[result.size()]);
   }

http://git-wip-us.apache.org/repos/asf/hive/blob/5f78f9ef/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
index 0246cd5..12ae902 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
@@ -23,8 +23,11 @@ import static org.junit.Assert.assertTrue;
 
 import java.io.DataInput;
 import java.io.DataOutput;
+import java.io.File;
 import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
 import java.io.IOException;
+import java.io.PrintWriter;
 import java.net.URI;
 import java.net.URISyntaxException;
 import java.sql.Date;
@@ -67,6 +70,7 @@ import org.apache.hadoop.hive.ql.io.HiveInputFormat;
 import org.apache.hadoop.hive.ql.io.HiveOutputFormat;
 import org.apache.hadoop.hive.ql.io.InputFormatChecker;
 import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.SplitStrategy;
+import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.SplitStrategyKind;
 import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
 import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
 import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory;
@@ -394,6 +398,97 @@ public class TestInputOutputFormat {
         OrcInputFormat.getInputPaths(conf));
   }
 
+  private FileSystem generateMockFiles(final int count, final int size) {
+    final byte[] data = new byte[size];
+    MockFile[] files = new MockFile[count];
+    for (int i = 0; i < count; i++) {
+      files[i] = new MockFile(String.format("mock:/a/b/part-%d", i), size, data);
+    }
+    return new MockFileSystem(conf, files);
+  }
+
+  @Test
+  public void testSplitStrategySelection() throws Exception {
+
+    conf.set("mapreduce.input.fileinputformat.split.maxsize", "500");
+    conf.setLong(HiveConf.ConfVars.HIVE_ORC_CACHE_STRIPE_DETAILS_SIZE.varname,
+        100);
+    final int[] counts = { 1, 10, 100, 256 };
+    final int[] sizes = { 100, 1000 };
+    final int[] numSplits = { 1, 9, 10, 11, 99, 111 };
+    final String[] strategyResults = new String[] {
+    "ETLSplitStrategy", /* 1 files x 100 size for 1 splits */
+    "ETLSplitStrategy", /* 1 files x 100 size for 9 splits */
+    "ETLSplitStrategy", /* 1 files x 100 size for 10 splits */
+    "ETLSplitStrategy", /* 1 files x 100 size for 11 splits */
+    "ETLSplitStrategy", /* 1 files x 100 size for 99 splits */
+    "ETLSplitStrategy", /* 1 files x 100 size for 111 splits */
+    "ETLSplitStrategy", /* 1 files x 1000 size for 1 splits */
+    "ETLSplitStrategy", /* 1 files x 1000 size for 9 splits */
+    "ETLSplitStrategy", /* 1 files x 1000 size for 10 splits */
+    "ETLSplitStrategy", /* 1 files x 1000 size for 11 splits */
+    "ETLSplitStrategy", /* 1 files x 1000 size for 99 splits */
+    "ETLSplitStrategy", /* 1 files x 1000 size for 111 splits */
+    "BISplitStrategy", /* 10 files x 100 size for 1 splits */
+    "BISplitStrategy", /* 10 files x 100 size for 9 splits */
+    "ETLSplitStrategy", /* 10 files x 100 size for 10 splits */
+    "ETLSplitStrategy", /* 10 files x 100 size for 11 splits */
+    "ETLSplitStrategy", /* 10 files x 100 size for 99 splits */
+    "ETLSplitStrategy", /* 10 files x 100 size for 111 splits */
+    "ETLSplitStrategy", /* 10 files x 1000 size for 1 splits */
+    "ETLSplitStrategy", /* 10 files x 1000 size for 9 splits */
+    "ETLSplitStrategy", /* 10 files x 1000 size for 10 splits */
+    "ETLSplitStrategy", /* 10 files x 1000 size for 11 splits */
+    "ETLSplitStrategy", /* 10 files x 1000 size for 99 splits */
+    "ETLSplitStrategy", /* 10 files x 1000 size for 111 splits */
+    "BISplitStrategy", /* 100 files x 100 size for 1 splits */
+    "BISplitStrategy", /* 100 files x 100 size for 9 splits */
+    "BISplitStrategy", /* 100 files x 100 size for 10 splits */
+    "BISplitStrategy", /* 100 files x 100 size for 11 splits */
+    "BISplitStrategy", /* 100 files x 100 size for 99 splits */
+    "ETLSplitStrategy", /* 100 files x 100 size for 111 splits */
+    "ETLSplitStrategy", /* 100 files x 1000 size for 1 splits */
+    "ETLSplitStrategy", /* 100 files x 1000 size for 9 splits */
+    "ETLSplitStrategy", /* 100 files x 1000 size for 10 splits */
+    "ETLSplitStrategy", /* 100 files x 1000 size for 11 splits */
+    "ETLSplitStrategy", /* 100 files x 1000 size for 99 splits */
+    "ETLSplitStrategy", /* 100 files x 1000 size for 111 splits */
+    "BISplitStrategy", /* 256 files x 100 size for 1 splits */
+    "BISplitStrategy", /* 256 files x 100 size for 9 splits */
+    "BISplitStrategy", /* 256 files x 100 size for 10 splits */
+    "BISplitStrategy", /* 256 files x 100 size for 11 splits */
+    "BISplitStrategy", /* 256 files x 100 size for 99 splits */
+    "BISplitStrategy", /* 256 files x 100 size for 111 splits */
+    "ETLSplitStrategy", /* 256 files x 1000 size for 1 splits */
+    "ETLSplitStrategy", /* 256 files x 1000 size for 9 splits */
+    "ETLSplitStrategy", /* 256 files x 1000 size for 10 splits */
+    "ETLSplitStrategy", /* 256 files x 1000 size for 11 splits */
+    "ETLSplitStrategy", /* 256 files x 1000 size for 99 splits */
+    "ETLSplitStrategy", /* 256 files x 1000 size for 111 splits */
+    };
+
+    int k = 0;
+
+    for (int c : counts) {
+      for (int s : sizes) {
+        final FileSystem fs = generateMockFiles(c, s);
+        for (int n : numSplits) {
+          final OrcInputFormat.Context context = new OrcInputFormat.Context(
+              conf, n);
+          OrcInputFormat.FileGenerator gen = new OrcInputFormat.FileGenerator(
+              context, fs, new MockPath(fs, "mock:/a/b"));
+          final SplitStrategy splitStrategy = gen.call();
+          assertTrue(
+              String.format(
+                  "Split strategy for %d files x %d size for %d splits", c, s,
+                  n),
+              splitStrategy.getClass().getSimpleName()
+                  .equals(strategyResults[k++]));
+        }
+      }
+    }
+  }
+
   @Test
   public void testFileGenerator() throws Exception {
     OrcInputFormat.Context context = new OrcInputFormat.Context(conf);
@@ -1115,7 +1210,7 @@ public class TestInputOutputFormat {
     InputFormat<?,?> in = new OrcInputFormat();
     FileInputFormat.setInputPaths(conf, testFilePath.toString());
     InputSplit[] splits = in.getSplits(conf, 1);
-    assertTrue(1 == splits.length);
+    assertTrue(0 == splits.length);
     assertEquals(null, serde.getSerDeStats());
   }
 


Mime
View raw message