From dev-return-2801-archive-asf-public=cust-asf.ponee.io@madlib.apache.org Tue Feb 13 00:39:22 2018 Return-Path: X-Original-To: archive-asf-public@eu.ponee.io Delivered-To: archive-asf-public@eu.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by mx-eu-01.ponee.io (Postfix) with ESMTP id BC0E7180718 for ; Tue, 13 Feb 2018 00:39:22 +0100 (CET) Received: by cust-asf.ponee.io (Postfix) id ABF6F160C3F; Mon, 12 Feb 2018 23:39:22 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id BD3D3160C62 for ; Tue, 13 Feb 2018 00:39:21 +0100 (CET) Received: (qmail 68074 invoked by uid 500); 12 Feb 2018 23:39:20 -0000 Mailing-List: contact dev-help@madlib.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@madlib.apache.org Delivered-To: mailing list dev@madlib.apache.org Received: (qmail 67815 invoked by uid 99); 12 Feb 2018 23:39:20 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 12 Feb 2018 23:39:20 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 0B43AE38B8; Mon, 12 Feb 2018 23:39:20 +0000 (UTC) From: jingyimei To: dev@madlib.apache.org Reply-To: dev@madlib.apache.org References: In-Reply-To: Subject: [GitHub] madlib pull request #232: Multiple LDA improvements and fixes Content-Type: text/plain Message-Id: <20180212233920.0B43AE38B8@git1-us-west.apache.org> Date: Mon, 12 Feb 2018 23:39:20 +0000 (UTC) Github user jingyimei commented on a diff in the pull request: https://github.com/apache/madlib/pull/232#discussion_r167715361 --- Diff: src/ports/postgres/modules/utilities/text_utilities.sql_in --- @@ -74,175 +81,231 @@ tasks related to text. Flag to indicate if a vocabulary is to be created. If TRUE, an additional output table is created containing the vocabulary of all words, with an id assigned to each word. The table is called output_table_vocabulary - (suffix added to the output_table name) and contains the + (i.e., suffix added to the output_table name) and contains the following columns: - - \c wordid: An id assignment for each word - - \c word: The word/term + - \c wordid: An id for each word. + - \c word: The word/term corresponding to the id. @anchor examples @par Examples --# Prepare datasets with some example documents +-# First we create a document table with one document per row:
     DROP TABLE IF EXISTS documents;
    -CREATE TABLE documents(docid INTEGER, doc_contents TEXT);
    +CREATE TABLE documents(docid INT4, contents TEXT);
     INSERT INTO documents VALUES
    -(1, 'I like to eat broccoli and banana. I ate a banana and spinach smoothie for breakfast.'),
    -(2, 'Chinchillas and kittens are cute.'),
    -(3, 'My sister adopted two kittens yesterday'),
    -(4, 'Look at this cute hamster munching on a piece of broccoli');
    +(0, 'I like to eat broccoli and bananas. I ate a banana and spinach smoothie for breakfast.'),
    +(1, 'Chinchillas and kittens are cute.'),
    +(2, 'My sister adopted two kittens yesterday.'),
    +(3, 'Look at this cute hamster munching on a piece of broccoli.');
     
- --# Add a new column containing the words (lower-cased) in a text array +You can apply stemming, stop word removal and tokenization at this point +in order to prepare the documents for text processing. +Depending upon your database version, various tools are +available. Databases based on more recent versions of +PostgreSQL may do something like: +
    +SELECT tsvector_to_array(to_tsvector('english',contents)) from documents;
    +
+
    +                    tsvector_to_array                     
    ++----------------------------------------------------------
    + {ate,banana,breakfast,broccoli,eat,like,smoothi,spinach}
    + {chinchilla,cute,kitten}
    + {adopt,kitten,sister,two,yesterday}
    + {broccoli,cute,hamster,look,munch,piec}
    +(4 rows)
    +
+In this example, we assume a database based on an older +version of PostgreSQL and just perform basic punctuation +removal and tokenization. The array of words is added as +a new column to the documents table:
     ALTER TABLE documents ADD COLUMN words TEXT[];
    -UPDATE documents SET words = regexp_split_to_array(lower(doc_contents), E'[\\\\s+\\\\.]');
    +UPDATE documents SET words = 
    +    regexp_split_to_array(lower(
    +    regexp_replace(contents, E'[,.;\\']','', 'g')
    +    ), E'[\\\\s+]');
    +\\x on   
    +SELECT * FROM documents ORDER BY docid;
    +
+
    +-[ RECORD 1 ]------------------------------------------------------------------------------------
    +docid    | 0
    +contents | I like to eat broccoli and bananas. I ate a banana and spinach smoothie for breakfast.
    +words    | {i,like,to,eat,broccoli,and,bananas,i,ate,a,banana,and,spinach,smoothie,for,breakfast}
    +-[ RECORD 2 ]------------------------------------------------------------------------------------
    +docid    | 1
    +contents | Chinchillas and kittens are cute.
    +words    | {chinchillas,and,kittens,are,cute}
    +-[ RECORD 3 ]------------------------------------------------------------------------------------
    +docid    | 2
    +contents | My sister adopted two kittens yesterday.
    +words    | {my,sister,adopted,two,kittens,yesterday}
    +-[ RECORD 4 ]------------------------------------------------------------------------------------
    +docid    | 3
    +contents | Look at this cute hamster munching on a piece of broccoli.
    +words    | {look,at,this,cute,hamster,munching,on,a,piece,of,broccoli}
     
--# Compute the frequency of each word in each document +-# Compute the frequency of each word in each document:
    -DROP TABLE IF EXISTS documents_tf;
    -SELECT madlib.term_frequency('documents', 'docid', 'words', 'documents_tf');
    -SELECT * FROM documents_tf order by docid;
    +DROP TABLE IF EXISTS documents_tf, documents_tf_vocabulary;
    +SELECT madlib.term_frequency('documents',    -- input table
    +                             'docid',        -- document id
    +                             'words',        -- vector of words in document
    +                             'documents_tf'  -- output table
    +                            );
    +\\x off
    +SELECT * FROM documents_tf ORDER BY docid;
     
    - docid |    word    | count
    --------+------------+-------
    -     1 | ate        |     1
    -     1 | like       |     1
    -     1 | breakfast  |     1
    -     1 | to         |     1
    -     1 | broccoli   |     1
    -     1 | spinach    |     1
    -     1 | i          |     2
    -     1 | and        |     2
    -     1 | a          |     1
    -     1 |            |     2
    -     1 | smoothie   |     1
    -     1 | eat        |     1
    -     1 | banana     |     2
    -     1 | for        |     1
    -     2 | cute       |     1
    -     2 | are        |     1
    -     2 | kitten     |     1
    -     2 | and        |     1
    -     2 | chinchilla |     1
    -     3 | kitten     |     1
    -     3 | my         |     1
    -     3 | a          |     1
    -     3 | sister     |     1
    -     3 | adopted    |     1
    -     3 | yesterday  |     1
    -     4 | at         |     1
    -     4 | of         |     1
    -     4 | piece      |     1
    -     4 | this       |     1
    -     4 | a          |     1
    -     4 | broccoli   |     1
    -     4 | hamster    |     1
    -     4 | munching   |     1
    -     4 | cute       |     1
    -     4 | look       |     1
    -(35 rows)
    + docid |    word     | count 
    +-------+-------------+-------
    +     0 | a           |     1
    +     0 | breakfast   |     1
    +     0 | banana      |     1
    +     0 | and         |     2
    +     0 | eat         |     1
    +     0 | smoothie    |     1
    +     0 | to          |     1
    +     0 | like        |     1
    +     0 | broccoli    |     1
    +     0 | bananas     |     1
    +     0 | spinach     |     1
    +     0 | i           |     2
    +     0 | ate         |     1
    +     0 | for         |     1
    +     1 | are         |     1
    +     1 | cute        |     1
    +     1 | kittens     |     1
    +     1 | chinchillas |     1
    +     1 | and         |     1
    +     2 | two         |     1
    +     2 | yesterday   |     1
    +     2 | kittens     |     1
    +     2 | sister      |     1
    +     2 | my          |     1
    +     2 | adopted     |     1
    +     3 | this        |     1
    +     3 | at          |     1
    +     3 | a           |     1
    +     3 | broccoli    |     1
    +     3 | of          |     1
    +     3 | look        |     1
    +     3 | hamster     |     1
    +     3 | on          |     1
    +     3 | piece       |     1
    +     3 | cute        |     1
    +     3 | munching    |     1
    +(36 rows)
     
--# We also can create a vocabulary of the words and store a wordid in the output -table instead of the actual word. +-# Next we create a vocabulary of the words +and store a wordid in the output table instead of the +actual word:
    -DROP TABLE IF EXISTS documents_tf;
    -DROP TABLE IF EXISTS documents_tf_vocabulary;
    -SELECT madlib.term_frequency('documents', 'docid', 'words', 'documents_tf', TRUE);
    --- Output with wordid instead of the actual words
    -SELECT * FROM documents_tf order by docid;
    +DROP TABLE IF EXISTS documents_tf, documents_tf_vocabulary;
    +SELECT madlib.term_frequency('documents',    -- input table
    +                             'docid',        -- document id
    --- End diff --
    
    document id column


---