hudi-commits mailing list archives

From GitBox <...@apache.org>
Subject [GitHub] [incubator-hudi] xushiyan commented on a change in pull request #1404: [HUDI-344] Improve exporter tests
Date Sat, 14 Mar 2020 17:09:34 GMT
xushiyan commented on a change in pull request #1404: [HUDI-344] Improve exporter tests
URL: https://github.com/apache/incubator-hudi/pull/1404#discussion_r392603554
 
 

 ##########
 File path: hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHoodieSnapshotExporter.java
 ##########
 @@ -18,205 +18,144 @@
 
 package org.apache.hudi.utilities;
 
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hudi.DataSourceWriteOptions;
-import org.apache.hudi.common.HoodieCommonTestHarness;
+import org.apache.hudi.client.HoodieWriteClient;
+import org.apache.hudi.common.HoodieClientTestHarness;
 import org.apache.hudi.common.HoodieTestDataGenerator;
-import org.apache.hudi.common.model.HoodieTestUtils;
-import org.apache.hudi.common.util.FSUtils;
+import org.apache.hudi.common.model.HoodieAvroPayload;
+import org.apache.hudi.common.model.HoodieRecord;
+import org.apache.hudi.common.model.HoodieTableType;
+import org.apache.hudi.common.table.HoodieTableMetaClient;
+import org.apache.hudi.config.HoodieIndexConfig;
 import org.apache.hudi.config.HoodieWriteConfig;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.SaveMode;
-import org.apache.spark.sql.SparkSession;
+import org.apache.hudi.index.HoodieIndex.IndexType;
+import org.apache.hudi.utilities.HoodieSnapshotExporter.Config;
 
+import org.apache.hadoop.fs.LocatedFileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.RemoteIterator;
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.sql.SparkSession;
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
-import org.junit.rules.TemporaryFolder;
 
-import java.io.File;
 import java.io.IOException;
-import java.util.HashMap;
+import java.util.Arrays;
 import java.util.List;
-import java.util.Map;
 
 import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
 
-public class TestHoodieSnapshotExporter extends HoodieCommonTestHarness {
-  private static String TEST_WRITE_TOKEN = "1-0-1";
+public class TestHoodieSnapshotExporter extends HoodieClientTestHarness {
 
-  private SparkSession spark = null;
-  private HoodieTestDataGenerator dataGen = null;
-  private String outputPath = null;
-  private String rootPath = null;
-  private FileSystem fs = null;
-  private Map commonOpts;
-  private HoodieSnapshotExporter.Config cfg;
-  private JavaSparkContext jsc = null;
+  private static final Logger LOG = LogManager.getLogger(TestHoodieSnapshotExporter.class);
+  private static final int NUM_RECORDS = 100;
+  private static final String COMMIT_TIME = "20200101000000";
+  private static final String PARTITION_PATH = "2020/01/01";
+  private static final String TABLE_NAME = "testing";
+  private String sourcePath;
+  private String targetPath;
 
   @Before
-  public void initialize() throws IOException {
-    spark = SparkSession.builder()
-        .appName("Hoodie Datasource test")
-        .master("local[2]")
-        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
-        .getOrCreate();
-    jsc = new JavaSparkContext(spark.sparkContext());
-    dataGen = new HoodieTestDataGenerator();
-    folder.create();
-    basePath = folder.getRoot().getAbsolutePath();
-    fs = FSUtils.getFs(basePath, spark.sparkContext().hadoopConfiguration());
-    commonOpts = new HashMap();
-
-    commonOpts.put("hoodie.insert.shuffle.parallelism", "4");
-    commonOpts.put("hoodie.upsert.shuffle.parallelism", "4");
-    commonOpts.put(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key");
-    commonOpts.put(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition");
-    commonOpts.put(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "timestamp");
-    commonOpts.put(HoodieWriteConfig.TABLE_NAME, "hoodie_test");
-
-
-    cfg = new HoodieSnapshotExporter.Config();
-
-    cfg.sourceBasePath = basePath;
-    cfg.targetOutputPath = outputPath = basePath + "/target";
-    cfg.outputFormat = "json";
-    cfg.outputPartitionField = "partition";
-
+  public void setUp() throws Exception {
+    initSparkContexts();
+    initDFS();
+    dataGen = new HoodieTestDataGenerator(new String[] {PARTITION_PATH});
+
+    // Initialize test data dirs
+    sourcePath = dfsBasePath + "/source/";
+    targetPath = dfsBasePath + "/target/";
+    dfs.mkdirs(new Path(sourcePath));
+    dfs.mkdirs(new Path(targetPath));
+    HoodieTableMetaClient
+        .initTableType(jsc.hadoopConfiguration(), sourcePath, HoodieTableType.COPY_ON_WRITE, TABLE_NAME,
+            HoodieAvroPayload.class.getName());
+
+    // Prepare data as source Hudi dataset
+    HoodieWriteConfig cfg = getHoodieWriteConfig(sourcePath);
+    HoodieWriteClient hdfsWriteClient = new HoodieWriteClient(jsc, cfg);
+    hdfsWriteClient.startCommitWithTime(COMMIT_TIME);
+    List<HoodieRecord> records = dataGen.generateInserts(COMMIT_TIME, NUM_RECORDS);
+    JavaRDD<HoodieRecord> recordsRDD = jsc.parallelize(records, 1);
+    hdfsWriteClient.bulkInsert(recordsRDD, COMMIT_TIME);
+    hdfsWriteClient.close();
+
+    RemoteIterator<LocatedFileStatus> itr = dfs.listFiles(new Path(sourcePath), true);
+    while (itr.hasNext()) {
+      LOG.info(">>> Prepared test file: " + itr.next().getPath());
+    }
   }
 
   @After
-  public void cleanup() {
-    if (spark != null) {
-      spark.stop();
-    }
+  public void tearDown() throws Exception {
+    cleanupSparkContexts();
+    cleanupDFS();
+    cleanupTestDataGenerator();
   }
 
-  @Test
-  public void testSnapshotExporter() throws IOException {
-    // Insert Operation
-    List<String> records = DataSourceTestUtils.convertToStringList(dataGen.generateInserts("000", 100));
-    Dataset<Row> inputDF = spark.read().json(new JavaSparkContext(spark.sparkContext()).parallelize(records, 2));
-    inputDF.write().format("hudi")
-        .options(commonOpts)
-        .option(DataSourceWriteOptions.OPERATION_OPT_KEY(), DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL())
-        .mode(SaveMode.Overwrite)
-        .save(basePath);
-    long sourceCount = inputDF.count();
-
-    HoodieSnapshotExporter hoodieSnapshotExporter = new HoodieSnapshotExporter();
-    hoodieSnapshotExporter.export(spark, cfg);
-
-    long targetCount = spark.read().json(outputPath).count();
-
-    assertTrue(sourceCount == targetCount);
-
-    // Test Invalid OutputFormat
-    cfg.outputFormat = "foo";
-    int isError = hoodieSnapshotExporter.export(spark, cfg);
-    assertTrue(isError == -1);
+  private HoodieWriteConfig getHoodieWriteConfig(String basePath) {
+    return HoodieWriteConfig.newBuilder()
+        .withPath(basePath)
+        .withEmbeddedTimelineServerEnabled(false)
+        .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA)
+        .withParallelism(2, 2)
+        .withBulkInsertParallelism(2)
+        .forTable(TABLE_NAME)
+        .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(IndexType.BLOOM).build())
+        .build();
   }
 
-  // for testEmptySnapshotCopy
-  public void init() throws IOException {
-    TemporaryFolder folder = new TemporaryFolder();
-    folder.create();
-    rootPath = "file://" + folder.getRoot().getAbsolutePath();
-    basePath = rootPath + "/" + HoodieTestUtils.RAW_TRIPS_TEST_NAME;
-    outputPath = rootPath + "/output";
-
-    final Configuration hadoopConf = HoodieTestUtils.getDefaultHadoopConf();
-    fs = FSUtils.getFs(basePath, hadoopConf);
-    HoodieTestUtils.init(hadoopConf, basePath);
+  @Test
+  public void testExportAsParquet() throws IOException {
+    HoodieSnapshotExporter.Config cfg = new Config();
+    cfg.sourceBasePath = sourcePath;
+    cfg.targetOutputPath = targetPath;
+    cfg.outputFormat = "parquet";
+    new HoodieSnapshotExporter().export(SparkSession.builder().config(jsc.getConf()).getOrCreate(), cfg);
+    assertEquals(NUM_RECORDS, sqlContext.read().parquet(targetPath).count());
+    assertTrue(dfs.exists(new Path(targetPath + "/_SUCCESS")));
 
 Review comment:
   @leesf The exporter seems to have removed that logic for the hudi case, though the non-hudi case will do it automatically via Spark. I'll ensure both create the success tag.
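   A minimal sketch of what that could look like for the hudi case (assuming a Hadoop FileSystem handle and the configured target output path; createSuccessTag is a hypothetical helper for illustration, not the exporter's current API):

       import java.io.IOException;

       import org.apache.hadoop.fs.FileSystem;
       import org.apache.hadoop.fs.Path;

       public class SuccessTagSketch {

         // Hypothetical helper: write an empty _SUCCESS marker under the target output path,
         // mirroring what Spark does automatically for the non-hudi output formats.
         static void createSuccessTag(FileSystem fs, String targetOutputPath) throws IOException {
           Path successTagPath = new Path(targetOutputPath, "_SUCCESS");
           if (!fs.exists(successTagPath)) {
             fs.createNewFile(successTagPath);
           }
         }
       }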

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services
