Mailing-List: contact issues-help@carbondata.incubator.apache.org; run by ezmlm
Precedence: bulk
Reply-To: dev@carbondata.incubator.apache.org
Date: Wed, 12 Apr 2017 09:16:41 +0000 (UTC)
From: "Crabo Yang (JIRA)" <jira@apache.org>
To: issues@carbondata.incubator.apache.org
Message-ID: <JIRA.13063357.1491960569000.263798.1491988601753@Atlassian.JIRA>
In-Reply-To: <JIRA.13063357.1491960569000@Atlassian.JIRA>
References: <JIRA.13063357.1491960569000@Atlassian.JIRA> <JIRA.13063357.1491960569296@jira-lw-us.apache.org>
Subject: [jira] [Comment Edited] (CARBONDATA-906) Always OOM error when
 import large dataset (100milion rows)
MIME-Version: 1.0
Content-Type: text/plain; charset=utf-8
Content-Transfer-Encoding: quoted-printable
archived-at: Wed, 12 Apr 2017 09:16:52 -0000


    [ https://issues.apache.org/jira/browse/CARBONDATA-906?page=3Dcom.atlas=
sian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=3D=
15965585#comment-15965585 ]=20

Crabo Yang edited comment on CARBONDATA-906 at 4/12/17 9:16 AM:
----------------------------------------------------------------

1.oozie spark-opts
<spark-opts>
--jars rds.importer-1.0-SNAPSHOT.jar,carbondata_2.10-1.0.0-incubating-shade=
-hadoop2.6.0-cdh5.7.0.jar=20
--num-executors 12 --executor-cores 4 --executor-memory 13G
--conf spark.yarn.executor.memoryOverhead=3D5120
--conf spark.executor.heartbeatInterval=3D10000000
--conf spark.network.timeout=3D10000000
</spark-opts>

2.create script=20
CREATE TABLE IF NOT EXISTS dmp_trade(id STRING,buyerNick STRING,buyerAlipay=
NO STRING,clientType STRING,sellerNick STRING,receiverName STRING,receiverM=
obile STRING,receiverPhone STRING,receiverCountry STRING,receiverState STRI=
NG,receiverCity STRING,receiverDistrict STRING,receiverTown STRING,receiver=
Address STRING,receiverZip STRING,status STRING,tradeFrom STRING,type STRIN=
G,stepTradeStatus STRING,shippingType STRING,title STRING,buyerMessage STRI=
NG,buyerMemo STRING,rxAuditStatus STRING,buyerEmail STRING,picPath STRING,s=
hopPick STRING,creditCardFee STRING,markDesc STRING,sellerMemo STRING,invoi=
ceName STRING,invoiceType STRING,tradeAttr STRING,esRange STRING,esDate STR=
ING,osDate STRING,osRange STRING,o2oSnatchStatus STRING,market STRING,etTyp=
e STRING,obs STRING,tradeOriginalJson STRING,point STRING,omniAttr STRING,o=
mniParam STRING,identity STRING,omnichannelParam STRING,assembly STRING,tra=
deId BIGINT,itemId BIGINT,platFormId INT,num INT,sellerFlag INT,naSource IN=
T,etShopId INT,forbidConsign INT,buyerFlag INT,topHold INT,nvoiceKind INT,p=
ayment STRING,price STRING,totalFee STRING,discountFee STRING,postFee STRIN=
G,stepPaidFee STRING,adjustFee STRING,buyerCodFee STRING,orderTaxFee STRING=
,couponFee STRING,paidCouponFee STRING,sellerRate STRING,buyerRate STRING,p=
ostGateDeclare STRING,crossBondedDeclare STRING,hasBuyerMessage STRING,hasP=
ostFee STRING,isShShip STRING,created TIMESTAMP,payTime TIMESTAMP,modified =
TIMESTAMP,endTime TIMESTAMP,consignTime TIMESTAMP,estConTime TIMESTAMP) STO=
RED BY 'carbondata';

3.carbon.properties
#System Configuration
#Mandatory. Carbon Store path
carbon.storelocation=3Dhdfs://master.nascent.com:8020/Opt/CarbonStore
#Base directory for Data files
carbon.ddl.base.hdfs.url=3Dhdfs://master.nascent.com:8020/opt/data
#Path where the bad records are stored
carbon.badRecords.location=3D/opt/Carbon/Spark/badrecords
#Mandatory. path to kettle home
carbon.kettle.home=3D/usr/lib/spark/carbonlib/carbonplugins
#Performance Configuration
#DataLoading Configuration
carbon.load.use.batch.sort=3Dtrue
enable.unsafe.sort=3Dtrue
offheap.sort.chunk.size.inmb=3D1024
carbon.load.batch.sort.size.inmb=3D450
#File read buffer size used during sorting(in MB) :MIN=3D1:MAX=3D100
carbon.sort.file.buffer.size=3D10
#Rowset size exchanged between data load graph steps :MIN=3D500:MAX=3D10000=
00
carbon.graph.rowset.size=3D10000
#Number of cores to be used while data loading
carbon.number.of.cores.while.loading=3D6
#Record count to sort and write to temp intermediate files
carbon.sort.size=3D500000
#Algorithm for hashmap for hashkey calculation
carbon.enableXXHash=3Dtrue
#Number of cores to be used for block sort while dataloading
#carbon.number.of.cores.block.sort=3D7
#max level cache size upto which level cache will be loaded in memory
#carbon.max.level.cache.size=3D-1
#enable prefetch of data during merge sort while reading data from sort tem=
p files in data loading
#carbon.merge.sort.prefetch=3Dtrue
#Compaction Configuration
#Number of cores to be used while compacting
carbon.number.of.cores.while.compacting=3D8
#For minor compaction, Number of segments to be merged in stage 1, number o=
f compacted segments to be merged in stage 2.
carbon.compaction.level.threshold=3D4,3
#default size (in MB) for major compaction to be triggered
carbon.major.compaction.size=3D1024
#Query Configuration
#Number of cores to be used for loading index into memory
carbon.number.of.cores=3D8
#Number of records to be in memory while querying :MIN=3D100000:MAX=3D24000=
0
carbon.inmemory.record.size=3D120000
#Improves the performance of filter query
carbon.enable.quick.filter=3Dfalse
##number of core to load the blocks in driver
#no.of.cores.to.load.blocks.in.driver=3D10

#Extra Configuration
##Timestamp format of input data used for timestamp data type.
#carbon.timestamp.format=3Dyyyy-MM-dd HH:mm:ss
######## Dataload Configuration ########
##File write buffer size used during sorting.
#carbon.sort.file.write.buffer.size=3D10485760
##Locking mechanism for data loading on a table
carbon.lock.type=3DHDFSLOCK
##Minimum no of intermediate files after which sort merged to be started.
#carbon.sort.intermediate.files.limit=3D20
##space reserved in percentage for writing block meta data in carbon data f=
ile
#carbon.block.meta.size.reserved.percentage=3D10
##csv reading buffer size.
#carbon.csv.read.buffersize.byte=3D1048576
##To identify and apply compression for non-high cardinality columns
#high.cardinality.value=3D100000
##maximum no of threads used for reading intermediate files for final mergi=
ng.
#carbon.merge.sort.reader.thread=3D3
##Carbon blocklet size. Note: this configuration cannot be changed once sto=
re is generated
#carbon.blocklet.size=3D120000
##number of retries to get the metadata lock for loading data to table
#carbon.load.metadata.lock.retries=3D3
##Minimum blocklets needed for distribution.
#carbon.blockletdistribution.min.blocklet.size=3D10
##Interval between the retries to get the lock
#carbon.load.metadata.lock.retry.timeout.sec=3D5
##Temporary store location, By default it will take System.getProperty("jav=
a.io.tmpdir")
#carbon.tempstore.location=3D/opt/Carbon/TempStoreLoc
##data loading records count logger
#carbon.load.log.counter=3D500000
######## Compaction Configuration ########
##to specify number of segments to be preserved from compaction
#carbon.numberof.preserve.segments=3D0
##To determine the loads of number of days to be compacted
#carbon.allowed.compaction.days=3D0
##To enable compaction while data loading
#carbon.enable.auto.load.merge=3Dfalse
######## Query Configuration ########
##Maximum time allowed for one query to be executed.
max.query.execution.time=3D60
##Min max is feature added to enhance query performance. To disable this fe=
ature, make it false.
carbon.enableMinMax=3Dtrue
######## Global Dictionary Configurations ########
##To enable/disable identify high cardinality during first data loading
#high.cardinality.identify.enable=3Dtrue
##threshold to identify whether high cardinality column
#high.cardinality.threshold=3D1000000
##Percentage to identify whether column cardinality is more than configured=
 percent of total row count
#high.cardinality.row.count.percentage=3D80
##The property to set the date to be considered as start date for calculati=
ng the timestamp.
#carbon.cutOffTimestamp=3D2000-01-01 00:00:00
##The property to set the timestamp (i.e. millis) conversion to the SECOND,=
 MINUTE, HOUR or DAY level.
#carbon.timegranularity=3DSECOND


was (Author: crabo):
1.oozie spark-opts
<spark-opts>
--jars rds.importer-1.0-SNAPSHOT.jar,carbondata_2.10-1.0.0-incubating-shade=
-hadoop2.6.0-cdh5.7.0.jar=20
--num-executors 12 --executor-cores 4 --executor-memory 13G
--conf spark.yarn.executor.memoryOverhead=3D5120
--conf spark.executor.heartbeatInterval=3D10000000
--conf spark.network.timeout=3D10000000
</spark-opts>

2.create script=20
CREATE TABLE IF NOT EXISTS dmp_trade(id STRING,buyerNick STRING,buyerAlipay=
NO STRING,clientType STRING,sellerNick STRING,receiverName STRING,receiverM=
obile STRING,receiverPhone STRING,receiverCountry STRING,receiverState STRI=
NG,receiverCity STRING,receiverDistrict STRING,receiverTown STRING,receiver=
Address STRING,receiverZip STRING,status STRING,tradeFrom STRING,type STRIN=
G,stepTradeStatus STRING,shippingType STRING,title STRING,buyerMessage STRI=
NG,buyerMemo STRING,rxAuditStatus STRING,buyerEmail STRING,picPath STRING,s=
hopPick STRING,creditCardFee STRING,markDesc STRING,sellerMemo STRING,invoi=
ceName STRING,invoiceType STRING,tradeAttr STRING,esRange STRING,esDate STR=
ING,osDate STRING,osRange STRING,o2oSnatchStatus STRING,market STRING,etTyp=
e STRING,obs STRING,tradeOriginalJson STRING,point STRING,omniAttr STRING,o=
mniParam STRING,identity STRING,omnichannelParam STRING,assembly STRING,tra=
deId BIGINT,itemId BIGINT,platFormId INT,num INT,sellerFlag INT,naSource IN=
T,etShopId INT,forbidConsign INT,buyerFlag INT,topHold INT,nvoiceKind INT,p=
ayment STRING,price STRING,totalFee STRING,discountFee STRING,postFee STRIN=
G,stepPaidFee STRING,adjustFee STRING,buyerCodFee STRING,orderTaxFee STRING=
,couponFee STRING,paidCouponFee STRING,sellerRate STRING,buyerRate STRING,p=
ostGateDeclare STRING,crossBondedDeclare STRING,hasBuyerMessage STRING,hasP=
ostFee STRING,isShShip STRING,created TIMESTAMP,payTime TIMESTAMP,modified =
TIMESTAMP,endTime TIMESTAMP,consignTime TIMESTAMP,estConTime TIMESTAMP) STO=
RED BY 'carbondata';

3.carbon.properties
#System Configuration
#Mandatory. Carbon Store path
carbon.storelocation=3Dhdfs://master.nascent.com:8020/Opt/CarbonStore
#Base directory for Data files
carbon.ddl.base.hdfs.url=3Dhdfs://master.nascent.com:8020/opt/data
#Path where the bad records are stored
carbon.badRecords.location=3D/opt/Carbon/Spark/badrecords
#Mandatory. path to kettle home
carbon.kettle.home=3D/usr/lib/spark/carbonlib/carbonplugins

#Performance Configuration
# DataLoading Configuration
carbon.load.use.batch.sort=3Dtrue
enable.unsafe.sort=3Dtrue
offheap.sort.chunk.size.inmb=3D1024
carbon.load.batch.sort.size.inmb=3D450
#File read buffer size used during sorting(in MB) :MIN=3D1:MAX=3D100
carbon.sort.file.buffer.size=3D10
#Rowset size exchanged between data load graph steps :MIN=3D500:MAX=3D10000=
00
carbon.graph.rowset.size=3D10000
#Number of cores to be used while data loading
carbon.number.of.cores.while.loading=3D6
#Record count to sort and write to temp intermediate files
carbon.sort.size=3D500000
#Algorithm for hashmap for hashkey calculation
carbon.enableXXHash=3Dtrue
#Number of cores to be used for block sort while dataloading
#carbon.number.of.cores.block.sort=3D7
#max level cache size upto which level cache will be loaded in memory
#carbon.max.level.cache.size=3D-1
#enable prefetch of data during merge sort while reading data from sort tem=
p files in data loading
#carbon.merge.sort.prefetch=3Dtrue
#Compaction Configuration
#Number of cores to be used while compacting
carbon.number.of.cores.while.compacting=3D8
#For minor compaction, Number of segments to be merged in stage 1, number o=
f compacted segments to be merged in stage 2.
carbon.compaction.level.threshold=3D4,3
#default size (in MB) for major compaction to be triggered
carbon.major.compaction.size=3D1024
#Query Configuration
#Number of cores to be used for loading index into memory
carbon.number.of.cores=3D8
#Number of records to be in memory while querying :MIN=3D100000:MAX=3D24000=
0
carbon.inmemory.record.size=3D120000
#Improves the performance of filter query
carbon.enable.quick.filter=3Dfalse
##number of core to load the blocks in driver
#no.of.cores.to.load.blocks.in.driver=3D10

#Extra Configuration
##Timestamp format of input data used for timestamp data type.
#carbon.timestamp.format=3Dyyyy-MM-dd HH:mm:ss
######## Dataload Configuration ########
##File write buffer size used during sorting.
#carbon.sort.file.write.buffer.size=3D10485760
##Locking mechanism for data loading on a table
carbon.lock.type=3DHDFSLOCK
##Minimum no of intermediate files after which sort merged to be started.
#carbon.sort.intermediate.files.limit=3D20
##space reserved in percentage for writing block meta data in carbon data f=
ile
#carbon.block.meta.size.reserved.percentage=3D10
##csv reading buffer size.
#carbon.csv.read.buffersize.byte=3D1048576
##To identify and apply compression for non-high cardinality columns
#high.cardinality.value=3D100000
##maximum no of threads used for reading intermediate files for final mergi=
ng.
#carbon.merge.sort.reader.thread=3D3
##Carbon blocklet size. Note: this configuration cannot be changed once sto=
re is generated
#carbon.blocklet.size=3D120000
##number of retries to get the metadata lock for loading data to table
#carbon.load.metadata.lock.retries=3D3
##Minimum blocklets needed for distribution.
#carbon.blockletdistribution.min.blocklet.size=3D10
##Interval between the retries to get the lock
#carbon.load.metadata.lock.retry.timeout.sec=3D5
##Temporary store location, By default it will take System.getProperty("jav=
a.io.tmpdir")
#carbon.tempstore.location=3D/opt/Carbon/TempStoreLoc
##data loading records count logger
#carbon.load.log.counter=3D500000
######## Compaction Configuration ########
##to specify number of segments to be preserved from compaction
#carbon.numberof.preserve.segments=3D0
##To determine the loads of number of days to be compacted
#carbon.allowed.compaction.days=3D0
##To enable compaction while data loading
#carbon.enable.auto.load.merge=3Dfalse
######## Query Configuration ########
##Maximum time allowed for one query to be executed.
max.query.execution.time=3D60
##Min max is feature added to enhance query performance. To disable this fe=
ature, make it false.
carbon.enableMinMax=3Dtrue
######## Global Dictionary Configurations ########
##To enable/disable identify high cardinality during first data loading
#high.cardinality.identify.enable=3Dtrue
##threshold to identify whether high cardinality column
#high.cardinality.threshold=3D1000000
##Percentage to identify whether column cardinality is more than configured=
 percent of total row count
#high.cardinality.row.count.percentage=3D80
##The property to set the date to be considered as start date for calculati=
ng the timestamp.
#carbon.cutOffTimestamp=3D2000-01-01 00:00:00
##The property to set the timestamp (i.e. millis) conversion to the SECOND,=
 MINUTE, HOUR or DAY level.
#carbon.timegranularity=3DSECOND

> Always OOM error when import large dataset (100milion rows)
> -----------------------------------------------------------
>
>                 Key: CARBONDATA-906
>                 URL: https://issues.apache.org/jira/browse/CARBONDATA-906
>             Project: CarbonData
>          Issue Type: Bug
>          Components: data-load
>    Affects Versions: 1.0.0-incubating
>            Reporter: Crabo Yang
>         Attachments: carbon.properties
>
>
> java.lang.OutOfMemoryError: GC overhead limit exceeded
> =09at java.util.concurrent.ConcurrentHashMap$Segment.put(ConcurrentHashMa=
p.java:457)
> =09at java.util.concurrent.ConcurrentHashMap.put(ConcurrentHashMap.java:1=
130)
> =09at org.apache.carbondata.core.cache.dictionary.ColumnReverseDictionary=
Info.addDataToDictionaryMap(ColumnReverseDictionaryInfo.java:101)
> =09at org.apache.carbondata.core.cache.dictionary.ColumnReverseDictionary=
Info.addDictionaryChunk(ColumnReverseDictionaryInfo.java:88)
> =09at org.apache.carbondata.core.cache.dictionary.DictionaryCacheLoaderIm=
pl.fillDictionaryValuesAndAddToDictionaryChunks(DictionaryCacheLoaderImpl.j=
ava:113)
> =09at org.apache.carbondata.core.cache.dictionary.DictionaryCacheLoaderIm=
pl.load(DictionaryCacheLoaderImpl.java:81)
> =09at org.apache.carbondata.core.cache.dictionary.AbstractDictionaryCache=
.loadDictionaryData(AbstractDictionaryCache.java:236)
> =09at org.apache.carbondata.core.cache.dictionary.AbstractDictionaryCache=
.checkAndLoadDictionaryData(AbstractDictionaryCache.java:186)
> =09at org.apache.carbondata.core.cache.dictionary.ReverseDictionaryCache.=
getDictionary(ReverseDictionaryCache.java:174)
> =09at org.apache.carbondata.core.cache.dictionary.ReverseDictionaryCache.=
get(ReverseDictionaryCache.java:67)
> =09at org.apache.carbondata.core.cache.dictionary.ReverseDictionaryCache.=
get(ReverseDictionaryCache.java:38)
> =09at org.apache.carbondata.processing.newflow.converter.impl.DictionaryF=
ieldConverterImpl.<init>(DictionaryFieldConverterImpl.java:92)
> =09at org.apache.carbondata.processing.newflow.converter.impl.FieldEncode=
rFactory.createFieldEncoder(FieldEncoderFactory.java:77)
> =09at org.apache.carbondata.processing.newflow.converter.impl.RowConverte=
rImpl.initialize(RowConverterImpl.java:102)
> =09at org.apache.carbondata.processing.newflow.steps.DataConverterProcess=
orStepImpl.initialize(DataConverterProcessorStepImpl.java:69)
> =09at org.apache.carbondata.processing.newflow.steps.SortProcessorStepImp=
l.initialize(SortProcessorStepImpl.java:57)
> =09at org.apache.carbondata.processing.newflow.steps.DataWriterProcessorS=
tepImpl.initialize(DataWriterProcessorStepImpl.java:79)
> =09at org.apache.carbondata.processing.newflow.DataLoadExecutor.execute(D=
ataLoadExecutor.java:45)
> =09at org.apache.carbondata.spark.rdd.NewDataFrameLoaderRDD$$anon$2.<init=
>(NewCarbonDataLoadRDD.scala:425)
> =09at org.apache.carbondata.spark.rdd.NewDataFrameLoaderRDD.compute(NewCa=
rbonDataLoadRDD.scala:383)
> =09at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
> =09at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
> =09at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
> =09at org.apache.spark.scheduler.Task.run(Task.scala:89)
> =09at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:21=
3)
> =09at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecuto=
r.java:1145)
> =09at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecut=
or.java:615)
> =09at java.lang.Thread.run(Thread.java:745)


--
This message was sent by Atlassian JIRA
(v6.3.15#6346)