kylin-user mailing list archives

From Phong Pham <phongpham1...@gmail.com>
Subject Re:
Date Mon, 16 Jan 2017 02:47:37 GMT
Hi all,
    *We are still having problems with query performance. Here is the cube
info for one of the cubes*:
{
 "uuid": "6b2f4643-72a3-4a51-b9f2-47aa8e1322a5",
 "last_modified": 1484533219336,
 "version": "1.6.0",
 "name": "account_global_convtrack_summary_daily_test",
 "owner": "ADMIN",
 "descriptor": "account_global_convtrack_summary_daily_test",
 "cost": 50,
 "status": "READY",
 "segments": [
{
 "uuid": "85fa970e-6808-47c8-ae35-45d1975bb3bc",
 "name": "20160101000000_20161226000000",
 "storage_location_identifier": "KYLIN_7E4KIJ3YGX",
 "date_range_start": 1451606400000,
 "date_range_end": 1482710400000,
 "source_offset_start": 0,
 "source_offset_end": 0,
 "status": "READY",
 "size_kb": 9758001,
 "input_records": 8109122,
 "input_records_size": 102078756,
 "last_build_time": 1484533219335,
 "last_build_job_id": "a4f67403-17cb-4474-84d1-21ad64ed17a8",
 "create_time_utc": 1484527504660,
 "cuboid_shard_nums": {},
 "total_shards": 4,
 "blackout_cuboids": [],
 "binary_signature": null,
 "dictionaries": {
"METRIXA_GLOBAL_DATABASE.ACCOUNT_GLOBAL_CONVTRACK_SUMMARY_DAILY_ORC/CITYID":
"/dict/METRIXA_GLOBAL_DATABASE.ACCOUNT_GLOBAL_CONVTRACK_SUMMARY_DAILY_ORC/CITYID/0015e15c-9336-4040-b8ad-b7afba71d51c.dict",
"METRIXA_GLOBAL_DATABASE.ACCOUNT_GLOBAL_CONVTRACK_SUMMARY_DAILY_ORC/TYPE":
"/dict/METRIXA_GLOBAL_DATABASE.ACCOUNT_GLOBAL_CONVTRACK_SUMMARY_DAILY_ORC/TYPE/56cc3576-3c19-40fb-8704-29dba88e3511.dict",
"METRIXA_GLOBAL_DATABASE.ACCOUNT_GLOBAL_CONVTRACK_SUMMARY_DAILY_ORC/NETWORKID":
"/dict/METRIXA_GLOBAL_DATABASE.ACCOUNT_GLOBAL_CONVTRACK_SUMMARY_DAILY_ORC/NETWORKID/edc1b900-8b8a-4834-a8ab-4d23e0087d61.dict",
"METRIXA_GLOBAL_DATABASE.ACCOUNT_GLOBAL_CONVTRACK_SUMMARY_DAILY_ORC/WEEKGROUP":
"/dict/METRIXA_GLOBAL_DATABASE.ACCOUNT_GLOBAL_CONVTRACK_SUMMARY_DAILY_ORC/WEEKGROUP/3c3ae7e2-05a0-49a3-b396-ded7b1faaebd.dict",
"METRIXA_GLOBAL_DATABASE.ACCOUNT_GLOBAL_CONVTRACK_SUMMARY_DAILY_ORC/DATESTATSBIGINT":
"/dict/METRIXA_GLOBAL_DATABASE.ACCOUNT_GLOBAL_CONVTRACK_SUMMARY_DAILY_ORC/DATESTATSBIGINT/b2003335-f10c-48b5-ac98-6d2ddd25854b.dict",
"METRIXA_GLOBAL_DATABASE.ACCOUNT_GLOBAL_CONVTRACK_SUMMARY_DAILY_ORC/COUNTRYID":
"/dict/METRIXA_GLOBAL_DATABASE.ACCOUNT_GLOBAL_CONVTRACK_SUMMARY_DAILY_ORC/COUNTRYID/233a3b35-9e0f-46e3-bb01-3330c907ab33.dict",
"METRIXA_GLOBAL_DATABASE.ACCOUNT_GLOBAL_CONVTRACK_SUMMARY_DAILY_ORC/ACCOUNTID":
"/dict/METRIXA_GLOBAL_DATABASE.ACCOUNT_GLOBAL_CONVTRACK_SUMMARY_DAILY_ORC/ACCOUNTID/612d8a57-8ed8-4fdd-bf99-c64fb2a583fe.dict",
"METRIXA_GLOBAL_DATABASE.ACCOUNT_GLOBAL_CONVTRACK_SUMMARY_DAILY_ORC/DEVICEID":
"/dict/METRIXA_GLOBAL_DATABASE.ACCOUNT_GLOBAL_CONVTRACK_SUMMARY_DAILY_ORC/DEVICEID/8813544c-aac3-4f26-849b-3e3d1b71d9e2.dict",
"METRIXA_GLOBAL_DATABASE.ACCOUNT_GLOBAL_CONVTRACK_SUMMARY_DAILY_ORC/LANGUAGEID":
"/dict/METRIXA_GLOBAL_DATABASE.ACCOUNT_GLOBAL_CONVTRACK_SUMMARY_DAILY_ORC/LANGUAGEID/02dea027-86cf-44e6-9bcf-9dbd4c33e54b.dict",
"METRIXA_GLOBAL_DATABASE.ACCOUNT_GLOBAL_CONVTRACK_SUMMARY_DAILY_ORC/COMPANYID":
"/dict/METRIXA_GLOBAL_DATABASE.ACCOUNT_GLOBAL_CONVTRACK_SUMMARY_DAILY_ORC/COMPANYID/75a5566e-b419-4fc8-9184-757b207a35d2.dict",
"METRIXA_GLOBAL_DATABASE.ACCOUNT_GLOBAL_CONVTRACK_SUMMARY_DAILY_ORC/REGIONID":
"/dict/METRIXA_GLOBAL_DATABASE.ACCOUNT_GLOBAL_CONVTRACK_SUMMARY_DAILY_ORC/REGIONID/81d5b463-8639-4633-83b9-9ac9e43e32cb.dict",
"METRIXA_GLOBAL_DATABASE.ACCOUNT_GLOBAL_CONVTRACK_SUMMARY_DAILY_ORC/AFFILIATEID":
"/dict/METRIXA_GLOBAL_DATABASE.ACCOUNT_GLOBAL_CONVTRACK_SUMMARY_DAILY_ORC/AFFILIATEID/0a35d5ce-dabb-4e32-ad5f-b87ef4c18ee3.dict",
"METRIXA_GLOBAL_DATABASE.ACCOUNT_GLOBAL_CONVTRACK_SUMMARY_DAILY_ORC/SITEID":
"/dict/MTX_SYSTEM.TBL_CONVTRACK_SITES_ORC/SITEID/07e4f091-f6aa-4520-9069-416ee4c904de.dict",
"METRIXA_GLOBAL_DATABASE.ACCOUNT_GLOBAL_CONVTRACK_SUMMARY_DAILY_ORC/MONTHGROUP":
"/dict/METRIXA_GLOBAL_DATABASE.ACCOUNT_GLOBAL_CONVTRACK_SUMMARY_DAILY_ORC/MONTHGROUP/e3bf45aa-3ff3-477b-aafd-d2c38a70caea.dict",
"METRIXA_GLOBAL_DATABASE.ACCOUNT_GLOBAL_CONVTRACK_SUMMARY_DAILY_ORC/DATESTATS":
"/dict/METRIXA_GLOBAL_DATABASE.ACCOUNT_GLOBAL_CONVTRACK_SUMMARY_DAILY_ORC/DATESTATS/5a3d3dc6-90eb-493b-84d0-b1b8ca8b70ec.dict",
"METRIXA_GLOBAL_DATABASE.ACCOUNT_GLOBAL_CONVTRACK_SUMMARY_DAILY_ORC/ISMOBILEDEVICE":
"/dict/METRIXA_GLOBAL_DATABASE.ACCOUNT_GLOBAL_CONVTRACK_SUMMARY_DAILY_ORC/ISMOBILEDEVICE/eba9f8db-c5f0-4283-8a77-5f72d75c5867.dict",
"METRIXA_GLOBAL_DATABASE.ACCOUNT_GLOBAL_CONVTRACK_SUMMARY_DAILY_ORC/SOURCEURLID":
"/dict/METRIXA_GLOBAL_DATABASE.ACCOUNT_GLOBAL_CONVTRACK_SUMMARY_DAILY_ORC/SOURCEURLID/3f90d0de-6d04-4bc6-af20-0030a91326f0.dict"
 },
 "snapshots": {
"MTX_SYSTEM.TBL_MCM_COUNTRY_CITY_ORC":
"/table_snapshot/MTX_SYSTEM.TBL_MCM_COUNTRY_CITY_ORC/f32ec683-f83f-423a-820e-1bfd4b65426f.snapshot",
"METRIXA_GLOBAL_DATABASE.GLOBAL_SOURCEURL_ORC":
"/table_snapshot/METRIXA_GLOBAL_DATABASE.GLOBAL_SOURCEURL_ORC/32e8df3f-7188-4646-9eff-6c96792897f4.snapshot",
"MTX_SYSTEM.TBL_MCM_COUNTRY_REGION_ORC":
"/table_snapshot/MTX_SYSTEM.TBL_MCM_COUNTRY_REGION_ORC/e4378b9c-ff08-4207-92fa-3f0cf37f00d5.snapshot",
"MTX_SYSTEM.TBL_MCM_COUNTRY_ORC":
"/table_snapshot/MTX_SYSTEM.TBL_MCM_COUNTRY_ORC/2f2ffb19-d675-43a2-bb08-66a83801f875.snapshot",
"MTX_SYSTEM.GLOBAL_ACCOUNT_SEARCH_ENGINE_ORC":
"/table_snapshot/MTX_SYSTEM.GLOBAL_ACCOUNT_SEARCH_ENGINE_ORC/53ef6022-7249-4ef8-8518-b7d84c65fdfa.snapshot",
"MTX_SYSTEM.TBL_CONVTRACK_SITES_ORC":
"/table_snapshot/MTX_SYSTEM.TBL_CONVTRACK_SITES_ORC/0cbb0323-d434-44de-8891-85b024589743.snapshot",
"MTX_SYSTEM.TBL_MCM_LANGUAGE_ORC":
"/table_snapshot/MTX_SYSTEM.TBL_MCM_LANGUAGE_ORC/957e6a54-c618-4e5c-bc8d-c89952cafe1e.snapshot",
"MTX_SYSTEM.TBL_CONVTRACK_AFFILIATES_ORC":
"/table_snapshot/MTX_SYSTEM.TBL_CONVTRACK_AFFILIATES_ORC/f794bce2-dcb1-41b0-b9bf-fe3c9e1ad661.snapshot"
 },
 "index_path":
"/kylin/kylin_metadata/kylin-a4f67403-17cb-4474-84d1-21ad64ed17a8/account_global_convtrack_summary_daily_clone/secondary_index/",
 "rowkey_stats": [
[
 "DATESTATS",
 360,
 2
],
[
 "CITYID",
 60804,
 2
],
[
 "SOURCEURLID",
 38212,
 2
],
[
 "REGIONID",
 2822,
 2
],
[
 "COUNTRYID",
 238,
 1
],
[
 "LANGUAGEID",
 173,
 1
],
[
 "AFFILIATEID",
 36,
 1
],
[
 "ACCOUNTID",
 62,
 1
],
[
 "COMPANYID",
 19,
 1
],
[
 "SITEID",
 103,
 1
],
[
 "WEEKGROUP",
 52,
 1
],
[
 "MONTHGROUP",
 12,
 1
],
[
 "TYPE",
 2,
 1
],
[
 "ISMOBILEDEVICE",
 2,
 1
],
[
 "DEVICEID",
 338,
 2
],
[
 "NETWORKID",
 161,
 1
],
[
 "DATESTATSBIGINT",
 360,
 2
]
 ]
}
 ],
 "create_time_utc": 1484286587541,
 "size_kb": 9758001,
 "input_records_count": 8109122,
 "input_records_size": 102078756
}
*+ We have 2 columns with high cardinality* (from the rowkey_stats above): [
 "CITYID",
 60804,
 2
],
[
 "SOURCEURLID",
 38212,
 2
],
*+ We define left joins in the model for all relations*
*+ With the new aggregation group:*
        Includes:
["SITEID","COMPANYID","SOURCEURLID","DATESTATS","WEEKGROUP","MONTHGROUP","COUNTRYID","REGIONID","TYPE","ISMOBILEDEVICE","LANGUAGEID","DEVICEID","NETWORKID","ACCOUNTID","AFFILIATEID","CITYID"]

Mandatory Dimensions
["DATESTATS"]: Because we always use datestats as a filter

Hierarchy Dimensions: None. Maybe we will put COUNTRYID, REGIONID, and
CITYID here.
Joint Dimensions
["LANGUAGEID","ACCOUNTID","AFFILIATEID","SITEID","CITYID","REGIONID","COUNTRYID","SOURCEURLID"]:
Could you explain joint dimensions in more detail? I don't fully understand
this concept yet.
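For reference, here is a rough sketch of how I expect this aggregation group
to appear in the cube descriptor JSON (this assumes the Kylin 1.6 cube desc
format and uses the abbreviated dimension names from above, so please treat
it as illustrative only):

{
 "aggregation_groups": [
  {
   "includes": ["SITEID","COMPANYID","SOURCEURLID","DATESTATS","WEEKGROUP","MONTHGROUP","COUNTRYID","REGIONID","TYPE","ISMOBILEDEVICE","LANGUAGEID","DEVICEID","NETWORKID","ACCOUNTID","AFFILIATEID","CITYID"],
   "select_rule": {
    "mandatory_dims": ["DATESTATS"],
    "hierarchy_dims": [],
    "joint_dims": [
     ["LANGUAGEID","ACCOUNTID","AFFILIATEID","SITEID","CITYID","REGIONID","COUNTRYID","SOURCEURLID"]
    ]
   }
  }
 ]
}

My understanding is that the columns inside one joint_dims group always
appear together in the same cuboids instead of being enumerated separately,
but please correct me if that is wrong.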
*+ Rowkeys:*
We defined all rowkey columns with dict encoding, because none of them have
ultra-high cardinality.
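In the cube descriptor, I believe that rowkey definition looks roughly like
the sketch below (again assuming the Kylin 1.6 format; only the first few
columns are shown, with DATESTATS first since it is our mandatory filter
column):

{
 "rowkey": {
  "rowkey_columns": [
   {"column": "DATESTATS", "encoding": "dict"},
   {"column": "CITYID", "encoding": "dict"},
   {"column": "SOURCEURLID", "encoding": "dict"},
   {"column": "REGIONID", "encoding": "dict"},
   {"column": "COUNTRYID", "encoding": "dict"}
  ]
 }
}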

The query that is very slow is this:
+ We select all dims and metrics, left join all dim tables and group by all
dims
+ We set a DATESTATS condition for 1 year

And the query often takes a long time to execute: >10s

So are there any problems with our cube design? I hope to hear from you
soon.
Thanks so much for your help.

2017-01-12 21:28 GMT+07:00 ShaoFeng Shi <shaofengshi@apache.org>:

> Obviously there are too many segments (24*3=72), try to merge them as
> Billy suggested.
>
> Secondly, if possible, try to review and optimize the cube design
> (especially the rowkey sequence; put high-cardinality filter columns at the
> beginning to minimize the scan range), see http://www.slideshare.net/
> YangLi43/design-cube-in-apache-kylin
>
> Thirdly, try to give more power to the cluster, e.g. use physical machines;
> and also use multiple Kylin query nodes to balance the concurrent
> workload.
>
> Just my two cents, hope it helps.
>
> 2017-01-12 22:16 GMT+08:00 Billy Liu <billyliu@apache.org>:
>
>> I have concerns with so many segments. Please try querying only one cube
>> with one segment first.
>>
>> 2017-01-12 13:36 GMT+08:00 Phong Pham <phongpham1805@gmail.com>:
>>
>>> Hi,
>>> Thank you so much for your help, I really appreciate it. I'm really
>>> impressed with your project and am trying to apply it to our product. Our live
>>> product still runs on MySQL and MongoDB, but the data is growing fast.
>>> That's why we need your product as a replacement database engine.
>>> About our problem with many queries at the same time on Apache Kylin, I'm
>>> trying to monitor some elements of our system and review the cubes. Are
>>> there any tutorials about concurrency in Kylin or HBase?
>>> I will give you more details about our system:
>>> Hardware:
>>> 2 physical machines -> 7 virtual machines
>>> Each virtual machine:
>>> CPU: 8 cores
>>> RAM: 24 GB
>>> We set up the Hadoop env with Hortonworks 2.5 and set up HBase with 5
>>> RegionServers, 2 HBase masters
>>> We set up Apache Kylin on 2 machines:
>>> + Node 1: used for building cubes
>>> + Node 2: used only for queries (this node also contains a RegionServer)
>>> Cube and Queries:
>>> + Size of Cubes:
>>>   - Cube 1: 20GB/14M rows - 24 segments (maybe we need to merge them
>>> into 2-3 segments)
>>>   - Cube 2: 460MB/3M rows - 24 segments
>>>   - Cube 3: 1.3GB/1.4M rows - 24 segments
>>> + We use one query to read data from 3 cubes and union all into 1 result
>>> Test case:
>>> + On single request: 3s
>>> + On 5 requests at the same time (submitting multiple requests from the client):
>>> 20s/request
>>> And that is not acceptable when we go live.
>>> So we hope you can review our structure and give us some best practices with
>>> Kylin and HBase.
>>> Thanks
>>>
>>> 2017-01-12 8:24 GMT+07:00 ShaoFeng Shi <shaofengshi@apache.org>:
>>>
>>>> In this case you need to do some profiling to see what the bottleneck is:
>>>> Kylin, HBase, or other factors like CPU, memory or network; maybe it is
>>>> related to the cube design, so optimizing the cube design against the
>>>> executed query is also a way forward. It is hard to give a good answer in a
>>>> couple of words.
>>>>
>>>> 2017-01-11 19:50 GMT+08:00 Phong Pham <phongpham1805@gmail.com>:
>>>>
>>>>> Here are some details about our system:
>>>>>
>>>>> Hbase: 5 nodes
>>>>> Data size: 24M rows
>>>>>
>>>>> Query result:
>>>>> *Success: true*
>>>>> *Duration: 20s*
>>>>> *Project: metrixa_global_database*
>>>>> *Realization Names: [xxx, xxx, xxx]*
>>>>> *Cuboid Ids: [45971, 24]*
>>>>>
>>>>>
>>>>> 2017-01-11 18:34 GMT+07:00 Phong Pham <phongpham1805@gmail.com>:
>>>>>
>>>>>> Hi all,
>>>>>>     I have a problem with concurrency on Apache Kylin. Executing a
>>>>>> single query takes about 3s. However, when I run multiple queries at the
>>>>>> same time, each query takes about 13-15s. How can I solve this?
>>>>>> My Kylin version is 1.6.1
>>>>>> Thanks
>>>>>>
>>>>>
>>>>>
>>>>
>>>>
>>>> --
>>>> Best regards,
>>>>
>>>> Shaofeng Shi 史少锋
>>>>
>>>>
>>>
>>
>
>
> --
> Best regards,
>
> Shaofeng Shi 史少锋
>
>
