carbondata-issues mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From chenliang613 <>
Subject [GitHub] carbondata issue #1660: [CARBONDATA-1731,CARBONDATA-1728] [BugFix] Update fa...
Date Sat, 16 Dec 2017 00:02:30 GMT
Github user chenliang613 commented on the issue:
    @anubhav100 @sounakr  you can also use my example script below to reproduce the issue. The example
simulates 7,500,000 rows of data and reproduces CARBONDATA-1728, and this PR also fixes that issue. @sounakr,
 please double-check it again.
    @anubhav100  I still have one question: why do we need to append "return true" after "blockletDetails.get(index).addDeletedRows(blocklet.getDeletedRows());"?
    package org.apache.carbondata.examples

    import java.io.File
    import java.text.SimpleDateFormat

    import org.apache.spark.sql.{SaveMode, SparkSession}

    import org.apache.carbondata.core.constants.CarbonCommonConstants
    import org.apache.carbondata.core.util.CarbonProperties

    /**
     * Reproduction script for CARBONDATA-1728: loads 7,500,000 simulated rows
     * into CarbonData table `t3`, then deletes every row via a correlated
     * EXISTS subquery and verifies the table is empty.
     */
    object DataUpdateDeleteExample {

      def main(args: Array[String]): Unit = {
        // Resolve paths relative to the build output for local runs.
        val rootPath = new File(this.getClass.getResource("/").getPath
          + "../../../..").getCanonicalPath
        // For HDFS, use something like: val rootPath = "hdfs://hdfs-host/carbon"
        val storeLocation = s"$rootPath/examples/spark2/target/store"
        val warehouse = s"$rootPath/examples/spark2/target/warehouse"
        val metastoredb = s"$rootPath/examples/spark2/target"

        import org.apache.spark.sql.CarbonSession._
        // BUGFIX: the original snippet never called builder()/master()/appName()
        // and never materialized the session via getOrCreateCarbonSession, yet
        // went on to call spark.sql(...). It also passed an empty config key.
        val spark = SparkSession
          .builder()
          .master("local")
          .appName("DataUpdateDeleteExample")
          .config("spark.sql.warehouse.dir", warehouse)
          .config("spark.driver.host", "localhost")
          .config("spark.sql.crossJoin.enabled", "true")
          .getOrCreateCarbonSession(storeLocation, metastoredb)

        // Specify date format based on raw data. BUGFIX: addProperty belongs to
        // the CarbonProperties singleton, not to the SparkSession builder chain.
        CarbonProperties.getInstance()
          .addProperty(CarbonCommonConstants.CARBON_DATE_FORMAT, "yyyy-MM-dd")

        import spark.implicits._

        // Drop table so the run is repeatable.
        spark.sql("DROP TABLE IF EXISTS t3")

        // Simulate data and write it to table t3. Dates cycle over
        // 2015-07-10 .. 2015-07-19 depending on the row id.
        val sdf = new SimpleDateFormat("yyyy-MM-dd")
        val df = spark.sparkContext.parallelize(1 to 7500000)
          .map(x => (x, new java.sql.Date(sdf.parse("2015-07-" + (x % 10 + 10)).getTime),
            "china", "aaa" + x, "phone" + 555 * x, "ASD" + (60000 + x), 14999 + x))
          .toDF("t3_id", "t3_date", "t3_country", "t3_name",
            "t3_phonetype", "t3_serialname", "t3_salary")
        // BUGFIX: the write pipeline was missing entirely — .option(...) calls
        // were dangling on the DataFrame with no .write/.format/.save.
        df.write
          .format("carbondata")
          .option("tableName", "t3")
          .option("tempCSV", "true")
          .option("compress", "true")
          .mode(SaveMode.Overwrite)
          .save()

        // Query the data before the delete. BUGFIX: this SQL was bare text in
        // the original paste, not wrapped in spark.sql(...).
        spark.sql("SELECT * FROM t3 ORDER BY t3_id").show()

        // Delete every row — this is the statement that triggers
        // CARBONDATA-1728 at this data volume.
        spark.sql("delete from t3 where exists (select 1 from t3)").show()

        // Verify all rows are gone.
        spark.sql("SELECT count(*) FROM t3").show()

        // Drop table and release the session.
        spark.sql("DROP TABLE IF EXISTS t3")
        spark.stop()
      }
    }


View raw message