carbondata-issues mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "Crabo Yang (JIRA)" <j...@apache.org>
Subject [jira] [Commented] (CARBONDATA-906) Always OOM error when importing large dataset (100 million rows)
Date Wed, 12 Apr 2017 09:10:41 GMT

    [ https://issues.apache.org/jira/browse/CARBONDATA-906?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15965585#comment-15965585
] 

Crabo Yang commented on CARBONDATA-906:
---------------------------------------

1. Oozie spark-opts
<spark-opts>
--jars rds.importer-1.0-SNAPSHOT.jar,carbondata_2.10-1.0.0-incubating-shade-hadoop2.6.0-cdh5.7.0.jar

--num-executors 12 --executor-cores 4 --executor-memory 13G
--conf spark.yarn.executor.memoryOverhead=5120
--conf spark.executor.heartbeatInterval=10000000
--conf spark.network.timeout=10000000
</spark-opts>

2. Create table script
-- Wide denormalized e-commerce trade table stored in CarbonData.
-- Reformatted one column per line for readability; column names, types,
-- and ordering are exactly as in the original statement (camelCase kept,
-- since renaming would change the table's external schema).
CREATE TABLE IF NOT EXISTS dmp_trade (
    -- trade identity and buyer account
    id STRING,
    buyerNick STRING,
    buyerAlipayNO STRING,
    clientType STRING,
    sellerNick STRING,
    -- receiver / shipping address
    receiverName STRING,
    receiverMobile STRING,
    receiverPhone STRING,
    receiverCountry STRING,
    receiverState STRING,
    receiverCity STRING,
    receiverDistrict STRING,
    receiverTown STRING,
    receiverAddress STRING,
    receiverZip STRING,
    -- trade status and classification
    status STRING,
    tradeFrom STRING,
    type STRING,
    stepTradeStatus STRING,
    shippingType STRING,
    title STRING,
    buyerMessage STRING,
    buyerMemo STRING,
    rxAuditStatus STRING,
    buyerEmail STRING,
    picPath STRING,
    shopPick STRING,
    creditCardFee STRING,
    markDesc STRING,
    sellerMemo STRING,
    invoiceName STRING,
    invoiceType STRING,
    tradeAttr STRING,
    esRange STRING,
    esDate STRING,
    osDate STRING,
    osRange STRING,
    o2oSnatchStatus STRING,
    market STRING,
    etType STRING,
    obs STRING,
    tradeOriginalJson STRING,
    point STRING,
    -- omni-channel attributes
    omniAttr STRING,
    omniParam STRING,
    identity STRING,
    omnichannelParam STRING,
    assembly STRING,
    -- numeric identifiers and flags
    tradeId BIGINT,
    itemId BIGINT,
    platFormId INT,
    num INT,
    sellerFlag INT,
    naSource INT,
    etShopId INT,
    forbidConsign INT,
    buyerFlag INT,
    topHold INT,
    nvoiceKind INT,
    -- monetary amounts (kept as STRING, as in the original DDL)
    payment STRING,
    price STRING,
    totalFee STRING,
    discountFee STRING,
    postFee STRING,
    stepPaidFee STRING,
    adjustFee STRING,
    buyerCodFee STRING,
    orderTaxFee STRING,
    couponFee STRING,
    paidCouponFee STRING,
    -- ratings and declaration flags
    sellerRate STRING,
    buyerRate STRING,
    postGateDeclare STRING,
    crossBondedDeclare STRING,
    hasBuyerMessage STRING,
    hasPostFee STRING,
    isShShip STRING,
    -- lifecycle timestamps
    created TIMESTAMP,
    payTime TIMESTAMP,
    modified TIMESTAMP,
    endTime TIMESTAMP,
    consignTime TIMESTAMP,
    estConTime TIMESTAMP
)
STORED BY 'carbondata';


> Always OOM error when importing large dataset (100 million rows)
> ----------------------------------------------------------------
>
>                 Key: CARBONDATA-906
>                 URL: https://issues.apache.org/jira/browse/CARBONDATA-906
>             Project: CarbonData
>          Issue Type: Bug
>          Components: data-load
>    Affects Versions: 1.0.0-incubating
>            Reporter: Crabo Yang
>         Attachments: carbon.properties
>
>
> java.lang.OutOfMemoryError: GC overhead limit exceeded
> 	at java.util.concurrent.ConcurrentHashMap$Segment.put(ConcurrentHashMap.java:457)
> 	at java.util.concurrent.ConcurrentHashMap.put(ConcurrentHashMap.java:1130)
> 	at org.apache.carbondata.core.cache.dictionary.ColumnReverseDictionaryInfo.addDataToDictionaryMap(ColumnReverseDictionaryInfo.java:101)
> 	at org.apache.carbondata.core.cache.dictionary.ColumnReverseDictionaryInfo.addDictionaryChunk(ColumnReverseDictionaryInfo.java:88)
> 	at org.apache.carbondata.core.cache.dictionary.DictionaryCacheLoaderImpl.fillDictionaryValuesAndAddToDictionaryChunks(DictionaryCacheLoaderImpl.java:113)
> 	at org.apache.carbondata.core.cache.dictionary.DictionaryCacheLoaderImpl.load(DictionaryCacheLoaderImpl.java:81)
> 	at org.apache.carbondata.core.cache.dictionary.AbstractDictionaryCache.loadDictionaryData(AbstractDictionaryCache.java:236)
> 	at org.apache.carbondata.core.cache.dictionary.AbstractDictionaryCache.checkAndLoadDictionaryData(AbstractDictionaryCache.java:186)
> 	at org.apache.carbondata.core.cache.dictionary.ReverseDictionaryCache.getDictionary(ReverseDictionaryCache.java:174)
> 	at org.apache.carbondata.core.cache.dictionary.ReverseDictionaryCache.get(ReverseDictionaryCache.java:67)
> 	at org.apache.carbondata.core.cache.dictionary.ReverseDictionaryCache.get(ReverseDictionaryCache.java:38)
> 	at org.apache.carbondata.processing.newflow.converter.impl.DictionaryFieldConverterImpl.<init>(DictionaryFieldConverterImpl.java:92)
> 	at org.apache.carbondata.processing.newflow.converter.impl.FieldEncoderFactory.createFieldEncoder(FieldEncoderFactory.java:77)
> 	at org.apache.carbondata.processing.newflow.converter.impl.RowConverterImpl.initialize(RowConverterImpl.java:102)
> 	at org.apache.carbondata.processing.newflow.steps.DataConverterProcessorStepImpl.initialize(DataConverterProcessorStepImpl.java:69)
> 	at org.apache.carbondata.processing.newflow.steps.SortProcessorStepImpl.initialize(SortProcessorStepImpl.java:57)
> 	at org.apache.carbondata.processing.newflow.steps.DataWriterProcessorStepImpl.initialize(DataWriterProcessorStepImpl.java:79)
> 	at org.apache.carbondata.processing.newflow.DataLoadExecutor.execute(DataLoadExecutor.java:45)
> 	at org.apache.carbondata.spark.rdd.NewDataFrameLoaderRDD$$anon$2.<init>(NewCarbonDataLoadRDD.scala:425)
> 	at org.apache.carbondata.spark.rdd.NewDataFrameLoaderRDD.compute(NewCarbonDataLoadRDD.scala:383)
> 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
> 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
> 	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
> 	at org.apache.spark.scheduler.Task.run(Task.scala:89)
> 	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
> 	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
> 	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
> 	at java.lang.Thread.run(Thread.java:745)



--
This message was sent by Atlassian JIRA
(v6.3.15#6346)

Mime
View raw message