carbondata-issues mailing list archives

From jackylk <...@git.apache.org>
Subject [GitHub] carbondata pull request #1470: [CARBONDATA-1572] Support streaming ingest an...
Date Mon, 06 Nov 2017 13:29:58 GMT
Github user jackylk commented on a diff in the pull request:

    https://github.com/apache/carbondata/pull/1470#discussion_r149078206
  
    --- Diff: streaming/src/main/scala/org/apache/spark/sql/execution/streaming/CarbonAppendableStreamSink.scala ---
    @@ -0,0 +1,102 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one or more
    + * contributor license agreements.  See the NOTICE file distributed with
    + * this work for additional information regarding copyright ownership.
    + * The ASF licenses this file to You under the Apache License, Version 2.0
    + * (the "License"); you may not use this file except in compliance with
    + * the License.  You may obtain a copy of the License at
    + *
    + *    http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +package org.apache.spark.sql.execution.streaming
    +
    +import org.apache.spark.internal.io.FileCommitProtocol
    +import org.apache.spark.sql.{DataFrame, SparkSession}
    +
    +import org.apache.carbondata.common.logging.LogServiceFactory
    +import org.apache.carbondata.core.datastore.impl.FileFactory
    +import org.apache.carbondata.core.dictionary.server.DictionaryServer
    +import org.apache.carbondata.core.metadata.schema.table.CarbonTable
    +import org.apache.carbondata.core.util.path.CarbonStorePath
    +import org.apache.carbondata.hadoop.streaming.CarbonStreamOutputFormat
    +import org.apache.carbondata.processing.loading.model.CarbonLoadModel
    +import org.apache.carbondata.streaming.segment.StreamSegmentManager
    +
    +class CarbonAppendableStreamSink(
    +    sparkSession: SparkSession,
    +    val carbonTable: CarbonTable,
    +    var currentSegmentId: String,
    +    parameters: Map[String, String],
    +    carbonLoadModel: CarbonLoadModel,
    +    server: Option[DictionaryServer]) extends Sink {
    +
    +  private val LOGGER = LogServiceFactory.getLogService(this.getClass.getCanonicalName)
    +  private val carbonTablePath = CarbonStorePath
    +    .getCarbonTablePath(carbonTable.getAbsoluteTableIdentifier)
    +  private val fileLogPath = carbonTablePath.getStreamingLogDir
    +  private val fileLog = new FileStreamSinkLog(FileStreamSinkLog.VERSION, sparkSession, fileLogPath)
    +  // prepare configuration
    +  private val hadoopConf = {
    +    val conf = sparkSession.sessionState.newHadoopConf()
    +    CarbonStreamOutputFormat.setCarbonLoadModel(conf, carbonLoadModel)
    +    // put all parameters into hadoopConf
    +    parameters.foreach { entry =>
    +      conf.set(entry._1, entry._2)
    +    }
    +    conf
    +  }
    +
    +  override def addBatch(batchId: Long, data: DataFrame): Unit = {
    +    if (batchId <= fileLog.getLatest().map(_._1).getOrElse(-1L)) {
    +      LOGGER.info(s"Skipping already committed batch $batchId")
    +    } else {
    +      checkOrHandOffSegment()
    +
    +      val committer = FileCommitProtocol.instantiate(
    +        className = sparkSession.sessionState.conf.streamingFileCommitProtocolClass,
    +        jobId = batchId.toString,
    +        outputPath = fileLogPath,
    +        isAppend = false)
    +
    +      committer match {
    +        case manifestCommitter: ManifestFileCommitProtocol =>
    +          manifestCommitter.setupManifestOptions(fileLog, batchId)
    +        case _ => // Do nothing
    +      }
    +
    +      CarbonStreamProcessor.writeDataFileJob(
    +        sparkSession,
    +        carbonTable,
    +        parameters,
    +        batchId,
    +        currentSegmentId,
    +        data.queryExecution,
    +        committer,
    +        hadoopConf,
    +        server)
    +    }
    +  }
    +
    +  // if the directory size of the current segment exceeds the threshold, hand off to a new segment
    +  private def checkOrHandOffSegment(): Unit = {
    +    val segmentDir = carbonTablePath.getSegmentDir("0", currentSegmentId)
    +    val fileType = FileFactory.getFileType(segmentDir)
    +    if (StreamSegmentManager.STREAM_SEGMENT_MAX_SIZE <= FileFactory.getDirectorySize(segmentDir)) {
    --- End diff --
    
    Can we make use of metadata instead of checking file system for every batch?
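    
    One way this suggestion could look (a minimal sketch, not code from the PR; the names segmentSizeInBytes and bytesWrittenInBatch are hypothetical) is to keep a running byte counter that each write job updates, so the sink compares an in-memory value against StreamSegmentManager.STREAM_SEGMENT_MAX_SIZE instead of calling FileFactory.getDirectorySize on every micro-batch:
    
        // Sketch only: track the current segment size in memory instead of
        // scanning the file system for every batch.
        private var segmentSizeInBytes: Long = 0L
    
        private def checkOrHandOffSegment(bytesWrittenInBatch: Long): Unit = {
          segmentSizeInBytes += bytesWrittenInBatch
          if (segmentSizeInBytes >= StreamSegmentManager.STREAM_SEGMENT_MAX_SIZE) {
            // hand off to a new segment and reset the counter; persisting the
            // counter in the segment metadata would also let it survive restarts
            segmentSizeInBytes = 0L
          }
        }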


---
