Mailing-List: contact issues-help@carbondata.incubator.apache.org; run by ezmlm
Precedence: bulk
Reply-To: dev@carbondata.incubator.apache.org
Date: Wed, 12 Apr 2017 09:10:41 +0000 (UTC)
From: "Crabo Yang (JIRA)" <jira@apache.org>
To: issues@carbondata.incubator.apache.org
Message-ID: <JIRA.13063357.1491960569000.263729.1491988241676@Atlassian.JIRA>
In-Reply-To: <JIRA.13063357.1491960569000@Atlassian.JIRA>
References: <JIRA.13063357.1491960569000@Atlassian.JIRA> <JIRA.13063357.1491960569296@jira-lw-us.apache.org>
Subject: [jira] [Commented] (CARBONDATA-906) Always OOM error when import
 large dataset (100milion rows)
MIME-Version: 1.0
Content-Type: text/plain; charset=utf-8
Content-Transfer-Encoding: quoted-printable
archived-at: Wed, 12 Apr 2017 09:10:46 -0000


    [ https://issues.apache.org/jira/browse/CARBONDATA-906?page=3Dcom.atlas=
sian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=3D=
15965585#comment-15965585 ]=20

Crabo Yang commented on CARBONDATA-906:
---------------------------------------

1. Oozie spark-opts
<spark-opts>
--jars rds.importer-1.0-SNAPSHOT.jar,carbondata_2.10-1.0.0-incubating-shade=
-hadoop2.6.0-cdh5.7.0.jar=20
--num-executors 12 --executor-cores 4 --executor-memory 13G
--conf spark.yarn.executor.memoryOverhead=3D5120
--conf spark.executor.heartbeatInterval=3D10000000
--conf spark.network.timeout=3D10000000
</spark-opts>

2. Create script=20
CREATE TABLE IF NOT EXISTS dmp_trade(id STRING,buyerNick STRING,buyerAlipay=
NO STRING,clientType STRING,sellerNick STRING,receiverName STRING,receiverM=
obile STRING,receiverPhone STRING,receiverCountry STRING,receiverState STRI=
NG,receiverCity STRING,receiverDistrict STRING,receiverTown STRING,receiver=
Address STRING,receiverZip STRING,status STRING,tradeFrom STRING,type STRIN=
G,stepTradeStatus STRING,shippingType STRING,title STRING,buyerMessage STRI=
NG,buyerMemo STRING,rxAuditStatus STRING,buyerEmail STRING,picPath STRING,s=
hopPick STRING,creditCardFee STRING,markDesc STRING,sellerMemo STRING,invoi=
ceName STRING,invoiceType STRING,tradeAttr STRING,esRange STRING,esDate STR=
ING,osDate STRING,osRange STRING,o2oSnatchStatus STRING,market STRING,etTyp=
e STRING,obs STRING,tradeOriginalJson STRING,point STRING,omniAttr STRING,o=
mniParam STRING,identity STRING,omnichannelParam STRING,assembly STRING,tra=
deId BIGINT,itemId BIGINT,platFormId INT,num INT,sellerFlag INT,naSource IN=
T,etShopId INT,forbidConsign INT,buyerFlag INT,topHold INT,nvoiceKind INT,p=
ayment STRING,price STRING,totalFee STRING,discountFee STRING,postFee STRIN=
G,stepPaidFee STRING,adjustFee STRING,buyerCodFee STRING,orderTaxFee STRING=
,couponFee STRING,paidCouponFee STRING,sellerRate STRING,buyerRate STRING,p=
ostGateDeclare STRING,crossBondedDeclare STRING,hasBuyerMessage STRING,hasP=
ostFee STRING,isShShip STRING,created TIMESTAMP,payTime TIMESTAMP,modified =
TIMESTAMP,endTime TIMESTAMP,consignTime TIMESTAMP,estConTime TIMESTAMP) STO=
RED BY 'carbondata';


> Always OOM error when import large dataset (100milion rows)
> -----------------------------------------------------------
>
>                 Key: CARBONDATA-906
>                 URL: https://issues.apache.org/jira/browse/CARBONDATA-906
>             Project: CarbonData
>          Issue Type: Bug
>          Components: data-load
>    Affects Versions: 1.0.0-incubating
>            Reporter: Crabo Yang
>         Attachments: carbon.properties
>
>
> java.lang.OutOfMemoryError: GC overhead limit exceeded
> =09at java.util.concurrent.ConcurrentHashMap$Segment.put(ConcurrentHashMa=
p.java:457)
> =09at java.util.concurrent.ConcurrentHashMap.put(ConcurrentHashMap.java:1=
130)
> =09at org.apache.carbondata.core.cache.dictionary.ColumnReverseDictionary=
Info.addDataToDictionaryMap(ColumnReverseDictionaryInfo.java:101)
> =09at org.apache.carbondata.core.cache.dictionary.ColumnReverseDictionary=
Info.addDictionaryChunk(ColumnReverseDictionaryInfo.java:88)
> =09at org.apache.carbondata.core.cache.dictionary.DictionaryCacheLoaderIm=
pl.fillDictionaryValuesAndAddToDictionaryChunks(DictionaryCacheLoaderImpl.j=
ava:113)
> =09at org.apache.carbondata.core.cache.dictionary.DictionaryCacheLoaderIm=
pl.load(DictionaryCacheLoaderImpl.java:81)
> =09at org.apache.carbondata.core.cache.dictionary.AbstractDictionaryCache=
.loadDictionaryData(AbstractDictionaryCache.java:236)
> =09at org.apache.carbondata.core.cache.dictionary.AbstractDictionaryCache=
.checkAndLoadDictionaryData(AbstractDictionaryCache.java:186)
> =09at org.apache.carbondata.core.cache.dictionary.ReverseDictionaryCache.=
getDictionary(ReverseDictionaryCache.java:174)
> =09at org.apache.carbondata.core.cache.dictionary.ReverseDictionaryCache.=
get(ReverseDictionaryCache.java:67)
> =09at org.apache.carbondata.core.cache.dictionary.ReverseDictionaryCache.=
get(ReverseDictionaryCache.java:38)
> =09at org.apache.carbondata.processing.newflow.converter.impl.DictionaryF=
ieldConverterImpl.<init>(DictionaryFieldConverterImpl.java:92)
> =09at org.apache.carbondata.processing.newflow.converter.impl.FieldEncode=
rFactory.createFieldEncoder(FieldEncoderFactory.java:77)
> =09at org.apache.carbondata.processing.newflow.converter.impl.RowConverte=
rImpl.initialize(RowConverterImpl.java:102)
> =09at org.apache.carbondata.processing.newflow.steps.DataConverterProcess=
orStepImpl.initialize(DataConverterProcessorStepImpl.java:69)
> =09at org.apache.carbondata.processing.newflow.steps.SortProcessorStepImp=
l.initialize(SortProcessorStepImpl.java:57)
> =09at org.apache.carbondata.processing.newflow.steps.DataWriterProcessorS=
tepImpl.initialize(DataWriterProcessorStepImpl.java:79)
> =09at org.apache.carbondata.processing.newflow.DataLoadExecutor.execute(D=
ataLoadExecutor.java:45)
> =09at org.apache.carbondata.spark.rdd.NewDataFrameLoaderRDD$$anon$2.<init=
>(NewCarbonDataLoadRDD.scala:425)
> =09at org.apache.carbondata.spark.rdd.NewDataFrameLoaderRDD.compute(NewCa=
rbonDataLoadRDD.scala:383)
> =09at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
> =09at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
> =09at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
> =09at org.apache.spark.scheduler.Task.run(Task.scala:89)
> =09at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:21=
3)
> =09at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecuto=
r.java:1145)
> =09at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecut=
or.java:615)
> =09at java.lang.Thread.run(Thread.java:745)


--
This message was sent by Atlassian JIRA
(v6.3.15#6346)