hudi-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From GitBox <...@apache.org>
Subject [GitHub] [incubator-hudi] satishkotha commented on a change in pull request #1341: [HUDI-626] Add exportToTable option to CLI
Date Thu, 20 Feb 2020 18:59:50 GMT
satishkotha commented on a change in pull request #1341: [HUDI-626] Add exportToTable option
to CLI
URL: https://github.com/apache/incubator-hudi/pull/1341#discussion_r382195220
 
 

 ##########
 File path: hudi-cli/src/main/java/org/apache/hudi/cli/utils/TempTableUtil.java
 ##########
 @@ -0,0 +1,131 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.cli.utils;
+
+import org.apache.hudi.exception.HoodieException;
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.types.DataType;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.StructType;
+
+import java.util.List;
+import java.util.stream.Collectors;
+
+public class TempTableUtil {
+  private static final Logger LOG = LogManager.getLogger(TempTableUtil.class);
+
+  private JavaSparkContext jsc;
+  private SQLContext sqlContext;
+
+  public TempTableUtil(String appName) {
+    try {
+      SparkConf sparkConf = new SparkConf().setAppName(appName)
+              .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer").setMaster("local[8]");
+      jsc = new JavaSparkContext(sparkConf);
+      jsc.setLogLevel("ERROR");
+
+      sqlContext = new SQLContext(jsc);
+    } catch (Throwable ex) {
+      // log full stack trace and rethrow. Without this its difficult to debug failures,
if any
+      LOG.error("unable to initialize spark context ", ex);
+      throw new HoodieException(ex);
+    }
+  }
+
+  public void write(String tableName, List<String> headers, List<List<Comparable>>
rows) {
+    try {
+      if (headers.isEmpty() || rows.isEmpty()) {
+        return;
+      }
+
+      if (rows.stream().filter(row -> row.size() != headers.size()).count() > 0) {
+        throw new HoodieException("Invalid row, does not match headers " + headers.size()
+ " " + rows.size());
+      }
+
+      // replace all whitespaces in headers to make it easy to write sql queries
+      List<String> headersNoSpaces = headers.stream().map(title -> title.replaceAll("\\s+",""))
+              .collect(Collectors.toList());
+
+      // generate schema for table
+      StructType structType = new StructType();
+      for (int i = 0; i < headersNoSpaces.size(); i++) {
+        // try guessing data type from column data.
+        DataType headerDataType = getDataType(rows.get(0).get(i));
+        structType = structType.add(DataTypes.createStructField(headersNoSpaces.get(i), headerDataType,
true));
+      }
+      List<Row> records = rows.stream().map(row -> RowFactory.create(row.toArray(new
Comparable[row.size()])))
+              .collect(Collectors.toList());
+      Dataset<Row> dataset = this.sqlContext.createDataFrame(records, structType);
+      dataset.createOrReplaceTempView(tableName);
+      System.out.println("Wrote table view: " + tableName);
+    } catch (Throwable ex) {
+      // log full stack trace and rethrow. Without this its difficult to debug failures,
if any
+      LOG.error("unable to write ", ex);
+      throw new HoodieException(ex);
+    }
+  }
+
+  public void runQuery(String sqlText) {
+    try {
+      this.sqlContext.sql(sqlText).show(Integer.MAX_VALUE, false);
+    } catch (Throwable ex) {
+      // log full stack trace and rethrow. Without this its difficult to debug failures,
if any
+      LOG.error("unable to read ", ex);
+      throw new HoodieException(ex);
+    }
+  }
+
+  public void deleteTable(String tableName) {
+    try {
+      sqlContext.sql("DROP TABLE IF EXISTS " + tableName);
+    } catch (Throwable ex) {
+      // log full stack trace and rethrow. Without this its difficult to debug failures,
if any
+      LOG.error("unable to initialize spark context ", ex);
+      throw new HoodieException(ex);
+    }
+  }
+
+  private DataType getDataType(Comparable comparable) {
 
 Review comment:
  Not sure what you mean. This is dynamically inferring the schema of tables output by the CLI to
make them easy to filter. If you have suggestions on how to improve this, let me know.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services

Mime
View raw message