incubator-cassandra-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "Laing, Michael" <michael.la...@nytimes.com>
Subject Re: Composite Column Grouping
Date Wed, 11 Sep 2013 18:33:32 GMT
Here's a slightly better version and a python script. -ml

-- put this in <file> and run using 'cqlsh -f <file>

DROP KEYSPACE latest;

CREATE KEYSPACE latest WITH replication = {
    'class': 'SimpleStrategy',
    'replication_factor' : 1
};

USE latest;

CREATE TABLE time_series (
    bucket_userid text, -- bucket is the beginning of a datetime span
concatenated with a shard designator
    user_id text,
    pkid text,
    timeuuid text,
    colname text,
    PRIMARY KEY (bucket_userid, timeuuid)
);

UPDATE time_series
SET
    user_id = 'XYZ',
    pkid = '1000',
    colname = 'Col-Name-1'
WHERE
    bucket_userid = '2013-09-11T05:15-0_XYZ' AND
    timeuuid='200'
;
UPDATE time_series
SET
    user_id = 'XYZ',
    pkid = '1001',
    colname = 'Col-Name-2'
WHERE
    bucket_userid = '2013-09-11T05:15-1_XYZ' AND
    timeuuid='201'
;
UPDATE time_series
SET
    user_id = 'XYZ',
    pkid = '1000',
    colname = 'Col-Name-3'
WHERE
    bucket_userid = '2013-09-11T05:15-0_XYZ' AND
    timeuuid='202'
;
UPDATE time_series
SET
    user_id = 'XYZ',
    pkid = '1000',
    colname = 'Col-Name-4'
WHERE
    bucket_userid = '2013-09-11T05:30-1_XYZ' AND
    timeuuid='203'
;
UPDATE time_series
SET
    user_id = 'XYZ',
    pkid = '1002',
    colname = 'Col-Name-5'
WHERE
    bucket_userid = '2013-09-11T05:30-0_XYZ' AND
    timeuuid='204'
;

-- This query assumes that the 'current' span is 2013-09-11T05:30 and I am
interested in this span and the previous one.

SELECT * FROM time_series
WHERE bucket_userid IN ( -- go back as many spans as you need to, all
shards in each span (cartesian product)
    '2013-09-11T05:15-0_XYZ',
    '2013-09-11T05:15-1_XYZ',
    '2013-09-11T05:30-0_XYZ',
    '2013-09-11T05:30-1_XYZ'
) -- you could add a range condition on timeuuid to further restrict the
results
ORDER BY timeuuid DESC;

-- returns:
-- bucket_userid          | timeuuid | colname    | pkid | user_id
--------------------------+----------+------------+------+---------
-- 2013-09-11T05:30-0_XYZ |      204 | Col-Name-5 | 1002 |     XYZ
-- 2013-09-11T05:30-1_XYZ |      203 | Col-Name-4 | 1000 |     XYZ
-- 2013-09-11T05:15-0_XYZ |      202 | Col-Name-3 | 1000 |     XYZ
-- 2013-09-11T05:15-1_XYZ |      201 | Col-Name-2 | 1001 |     XYZ
-- 2013-09-11T05:15-0_XYZ |      200 | Col-Name-1 | 1000 |     XYZ

-- do a stable purge on pkid to get the result


python script:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import logging
logging.basicConfig()

import cassandra
from cassandra.cluster import Cluster
from cassandra.query import SimpleStatement

cql_cluster = Cluster()
cql_session = cql_cluster.connect()
cql_session.set_keyspace('latest')

select_stmt = """
    SELECT * FROM time_series
    WHERE bucket_userid IN ( -- go back as many spans as you need to, all
shards in each span (cartesian product)
        '2013-09-11T05:15-0_XYZ',
        '2013-09-11T05:15-1_XYZ',
        '2013-09-11T05:30-0_XYZ',
        '2013-09-11T05:30-1_XYZ'
    )
    ORDER BY timeuuid DESC;
"""

query = SimpleStatement(select_stmt)
rows = cql_session.execute(query)

pkids = set()
for row in rows:
    if row.pkid in pkids:
        continue
    else:
        print row.user_id, row.timeuuid, row.colname, row.pkid
        pkids.add(row.pkid)

# prints:

# XYZ 204 Col-Name-5 1002
# XYZ 203 Col-Name-4 1000
# XYZ 201 Col-Name-2 1001


On Wed, Sep 11, 2013 at 6:13 AM, Laing, Michael
<michael.laing@nytimes.com>wrote:

> Then you can do this. I handle millions of entries this way and it works
> well if you are mostly interested in recent activity.
>
> If you need to span all activity then you can use a separate table to
> maintain the 'latest'. This table should also be sharded as entries will be
> 'hot'. Sharding will spread the heat and the tombstones (compaction load)
> around the cluster.
>
> -ml
>
> -- put this in <file> and run using 'cqlsh -f <file>
>
> DROP KEYSPACE latest;
>
> CREATE KEYSPACE latest WITH replication = {
>     'class': 'SimpleStrategy',
>     'replication_factor' : 1
> };
>
> USE latest;
>
> CREATE TABLE time_series (
>     bucket_userid text, -- bucket is the beginning of a datetime span
> concatenated with a shard designator
>     pkid text,
>     timeuuid text,
>     colname text,
>     PRIMARY KEY (bucket_userid, timeuuid)
> );
>
> -- the example table is using 15 minute bucket spans and 2 shards for
> illustration (you would usually use more shards)
> -- adjust these appropriately for your application
>
> UPDATE time_series SET pkid = '1000', colname = 'Col-Name-1' where
> bucket_userid = '2013-09-11T05:15-0_XYZ' AND timeuuid='200';
> UPDATE time_series SET pkid = '1001', colname = 'Col-Name-2' where
> bucket_userid = '2013-09-11T05:15-1_XYZ' AND timeuuid='201';
> UPDATE time_series SET pkid = '1000', colname = 'Col-Name-3' where
> bucket_userid = '2013-09-11T05:15-0_XYZ' AND timeuuid='202';
> UPDATE time_series SET pkid = '1000', colname = 'Col-Name-4' where
> bucket_userid = '2013-09-11T05:30-1_XYZ' AND timeuuid='203';
> UPDATE time_series SET pkid = '1002', colname = 'Col-Name-5' where
> bucket_userid = '2013-09-11T05:30-0_XYZ' AND timeuuid='204';
>
> -- This query assumes that the 'current' span is 2013-09-11T05:30 and I am
> interested in this span and the previous one.
>
> SELECT * FROM time_series
> WHERE bucket_userid in ( -- go back as many spans as you need to, all
> shards in each span (cartesian product)
>     '2013-09-11T05:15-0_XYZ',
>     '2013-09-11T05:15-1_XYZ',
>     '2013-09-11T05:30-0_XYZ',
>     '2013-09-11T05:30-1_XYZ'
> )
> ORDER BY timeuuid DESC;
>
> -- returns:
> -- bucket_userid          | timeuuid | colname    | pkid
> --------------------------+----------+------------+------
> -- 2013-09-11T05:30-0_XYZ |      204 | Col-Name-5 | 1002
> -- 2013-09-11T05:30-1_XYZ |      203 | Col-Name-4 | 1000
> -- 2013-09-11T05:15-0_XYZ |      202 | Col-Name-3 | 1000
> -- 2013-09-11T05:15-1_XYZ |      201 | Col-Name-2 | 1001
> -- 2013-09-11T05:15-0_XYZ |      200 | Col-Name-1 | 1000
>
> -- do a stable purge on pkid to get the result.
>
>
> On Wed, Sep 11, 2013 at 1:01 AM, Ravikumar Govindarajan <
> ravikumar.govindarajan@gmail.com> wrote:
>
>> Thanks Michael,
>>
>> But I cannot sort the rows in memory, as the number of columns will be
>> quite huge.
>>
>> From the python script above:
>>    select_stmt = "select * from time_series where userid = 'XYZ'"
>>
>> This would return me many hundreds of thousands of columns. I need to go
>> in time-series order using ranges [Pagination queries].
>>
>>
>> On Wed, Sep 11, 2013 at 7:06 AM, Laing, Michael <
>> michael.laing@nytimes.com> wrote:
>>
>>> If you have set up the table as described in my previous message, you
>>> could run this python snippet to return the desired result:
>>>
>>> #!/usr/bin/env python
>>> # -*- coding: utf-8 -*-
>>> import logging
>>> logging.basicConfig()
>>>
>>> from operator import itemgetter
>>>
>>> import cassandra
>>> from cassandra.cluster import Cluster
>>> from cassandra.query import SimpleStatement
>>>
>>> cql_cluster = Cluster()
>>> cql_session = cql_cluster.connect()
>>> cql_session.set_keyspace('latest')
>>>
>>> select_stmt = "select * from time_series where userid = 'XYZ'"
>>> query = SimpleStatement(select_stmt)
>>> rows = cql_session.execute(query)
>>>
>>> results = []
>>> for row in rows:
>>>     max_time = max(row.colname.keys())
>>>     results.append((row.userid, row.pkid, max_time,
>>> row.colname[max_time]))
>>>
>>> sorted_results = sorted(results, key=itemgetter(2), reverse=True)
>>> for result in sorted_results: print result
>>>
>>> # prints:
>>>
>>> # (u'XYZ', u'1002', u'204', u'Col-Name-5')
>>> # (u'XYZ', u'1000', u'203', u'Col-Name-4')
>>> # (u'XYZ', u'1001', u'201', u'Col-Name-2')
>>>
>>>
>>>
>>> On Tue, Sep 10, 2013 at 6:32 PM, Laing, Michael <
>>> michael.laing@nytimes.com> wrote:
>>>
>>>> You could try this. C* doesn't do it all for you, but it will
>>>> efficiently get you the right data.
>>>>
>>>> -ml
>>>>
>>>> -- put this in <file> and run using 'cqlsh -f <file>
>>>>
>>>> DROP KEYSPACE latest;
>>>>
>>>> CREATE KEYSPACE latest WITH replication = {
>>>>     'class': 'SimpleStrategy',
>>>>     'replication_factor' : 1
>>>> };
>>>>
>>>> USE latest;
>>>>
>>>> CREATE TABLE time_series (
>>>>     userid text,
>>>>     pkid text,
>>>>     colname map<text, text>,
>>>>     PRIMARY KEY (userid, pkid)
>>>> );
>>>>
>>>> UPDATE time_series SET colname = colname + {'200':'Col-Name-1'} WHERE
>>>> userid = 'XYZ' AND pkid = '1000';
>>>> UPDATE time_series SET colname = colname +
>>>> {'201':'Col-Name-2'} WHERE userid = 'XYZ' AND pkid = '1001';
>>>> UPDATE time_series SET colname = colname +
>>>> {'202':'Col-Name-3'} WHERE userid = 'XYZ' AND pkid = '1000';
>>>> UPDATE time_series SET colname = colname +
>>>> {'203':'Col-Name-4'} WHERE userid = 'XYZ' AND pkid = '1000';
>>>> UPDATE time_series SET colname = colname +
>>>> {'204':'Col-Name-5'} WHERE userid = 'XYZ' AND pkid = '1002';
>>>>
>>>> SELECT * FROM time_series WHERE userid = 'XYZ';
>>>>
>>>> -- returns:
>>>> -- userid | pkid | colname
>>>>
>>>> ----------+------+-----------------------------------------------------------------
>>>> --    XYZ | 1000 | {'200': 'Col-Name-1', '202': 'Col-Name-3', '203':
>>>> 'Col-Name-4'}
>>>> --    XYZ | 1001 |                                           {'201':
>>>> 'Col-Name-2'}
>>>> --    XYZ | 1002 |                                           {'204':
>>>> 'Col-Name-5'}
>>>>
>>>> -- use an app to pop off the latest key/value from the map for each
>>>> row, then sort by key desc.
>>>>
>>>>
>>>> On Tue, Sep 10, 2013 at 9:21 AM, Ravikumar Govindarajan <
>>>> ravikumar.govindarajan@gmail.com> wrote:
>>>>
>>>>> I have been faced with a problem of grouping composites on the
>>>>> second-part.
>>>>>
>>>>> Lets say my CF contains this
>>>>>
>>>>>
>>>>> TimeSeriesCF
>>>>>                        key:                            UserID
>>>>>                        composite-col-name:    TimeUUID:PKID
>>>>>
>>>>> Some sample data
>>>>>
>>>>> UserID = XYZ
>>>>>                                  Time:PKID
>>>>>                Col-Name1 = 200:1000
>>>>>                Col-Name2 = 201:1001
>>>>>                Col-Name3 = 202:1000
>>>>>                Col-Name4 = 203:1000
>>>>>                Col-Name5 = 204:1002
>>>>>
>>>>> Whenever a time-series query is issued, it should return the following
>>>>> in time-desc order.
>>>>>
>>>>> UserID = XYZ
>>>>>               Col-Name5 = 204:1002
>>>>>               Col-Name4 = 203:1000
>>>>>               Col-Name2 = 201:1001
>>>>>
>>>>> Is something like this possible in Cassandra? Is there a different way
>>>>> to design and achieve the same objective?
>>>>>
>>>>> --
>>>>> Ravi
>>>>>
>>>>>
>>>>
>>>>
>>>
>>
>

Mime
View raw message