Github user jingyimei commented on a diff in the pull request:
https://github.com/apache/madlib/pull/232#discussion_r167715361
--- Diff: src/ports/postgres/modules/utilities/text_utilities.sql_in ---
@@ -74,175 +81,231 @@ tasks related to text.
Flag to indicate if a vocabulary is to be created. If TRUE, an additional
output table is created containing the vocabulary of all words, with an id
assigned to each word. The table is called <em>output_table</em>_vocabulary
- (suffix added to the <em>output_table</em> name) and contains the
+ (i.e., suffix added to the <em>output_table</em> name) and contains the
following columns:
- - \c wordid: An id assignment for each word
- - \c word: The word/term
+ - \c wordid: An id for each word.
+ - \c word: The word/term corresponding to the id.
</dd>
</dl>
@anchor examples
@par Examples
--# Prepare datasets with some example documents
+-# First we create a document table with one document per row:
<pre class="example">
DROP TABLE IF EXISTS documents;
-CREATE TABLE documents(docid INTEGER, doc_contents TEXT);
+CREATE TABLE documents(docid INT4, contents TEXT);
INSERT INTO documents VALUES
-(1, 'I like to eat broccoli and banana. I ate a banana and spinach smoothie for breakfast.'),
-(2, 'Chinchillas and kittens are cute.'),
-(3, 'My sister adopted two kittens yesterday'),
-(4, 'Look at this cute hamster munching on a piece of broccoli');
+(0, 'I like to eat broccoli and bananas. I ate a banana and spinach smoothie for breakfast.'),
+(1, 'Chinchillas and kittens are cute.'),
+(2, 'My sister adopted two kittens yesterday.'),
+(3, 'Look at this cute hamster munching on a piece of broccoli.');
</pre>
-
--# Add a new column containing the words (lower-cased) in a text array
+You can apply stemming, stop word removal and tokenization at this point
+in order to prepare the documents for text processing.
+Depending upon your database version, various tools are
+available. Databases based on more recent versions of
+PostgreSQL may do something like:
+<pre class="example">
+SELECT tsvector_to_array(to_tsvector('english',contents)) from documents;
+</pre>
+<pre class="result">
+ tsvector_to_array
++----------------------------------------------------------
+ {ate,banana,breakfast,broccoli,eat,like,smoothi,spinach}
+ {chinchilla,cute,kitten}
+ {adopt,kitten,sister,two,yesterday}
+ {broccoli,cute,hamster,look,munch,piec}
+(4 rows)
+</pre>
+In this example, we assume a database based on an older
+version of PostgreSQL and just perform basic punctuation
+removal and tokenization. The array of words is added as
+a new column to the documents table:
<pre class="example">
ALTER TABLE documents ADD COLUMN words TEXT[];
-UPDATE documents SET words = regexp_split_to_array(lower(doc_contents), E'[\\\\s+\\\\.]');
+UPDATE documents SET words =
+ regexp_split_to_array(lower(
+ regexp_replace(contents, E'[,.;\\']','', 'g')
+ ), E'[\\\\s+]');
+\\x on
+SELECT * FROM documents ORDER BY docid;
+</pre>
+<pre class="result">
+-[ RECORD 1 ]------------------------------------------------------------------------------------
+docid | 0
+contents | I like to eat broccoli and bananas. I ate a banana and spinach smoothie for
breakfast.
+words | {i,like,to,eat,broccoli,and,bananas,i,ate,a,banana,and,spinach,smoothie,for,breakfast}
+-[ RECORD 2 ]------------------------------------------------------------------------------------
+docid | 1
+contents | Chinchillas and kittens are cute.
+words | {chinchillas,and,kittens,are,cute}
+-[ RECORD 3 ]------------------------------------------------------------------------------------
+docid | 2
+contents | My sister adopted two kittens yesterday.
+words | {my,sister,adopted,two,kittens,yesterday}
+-[ RECORD 4 ]------------------------------------------------------------------------------------
+docid | 3
+contents | Look at this cute hamster munching on a piece of broccoli.
+words | {look,at,this,cute,hamster,munching,on,a,piece,of,broccoli}
</pre>
--# Compute the frequency of each word in each document
+-# Compute the frequency of each word in each document:
<pre class="example">
-DROP TABLE IF EXISTS documents_tf;
-SELECT madlib.term_frequency('documents', 'docid', 'words', 'documents_tf');
-SELECT * FROM documents_tf order by docid;
+DROP TABLE IF EXISTS documents_tf, documents_tf_vocabulary;
+SELECT madlib.term_frequency('documents', -- input table
+ 'docid', -- document id
+ 'words', -- vector of words in document
+ 'documents_tf' -- output table
+ );
+\\x off
+SELECT * FROM documents_tf ORDER BY docid;
</pre>
<pre class="result">
- docid | word | count
--------+------------+-------
- 1 | ate | 1
- 1 | like | 1
- 1 | breakfast | 1
- 1 | to | 1
- 1 | broccoli | 1
- 1 | spinach | 1
- 1 | i | 2
- 1 | and | 2
- 1 | a | 1
- 1 | | 2
- 1 | smoothie | 1
- 1 | eat | 1
- 1 | banana | 2
- 1 | for | 1
- 2 | cute | 1
- 2 | are | 1
- 2 | kitten | 1
- 2 | and | 1
- 2 | chinchilla | 1
- 3 | kitten | 1
- 3 | my | 1
- 3 | a | 1
- 3 | sister | 1
- 3 | adopted | 1
- 3 | yesterday | 1
- 4 | at | 1
- 4 | of | 1
- 4 | piece | 1
- 4 | this | 1
- 4 | a | 1
- 4 | broccoli | 1
- 4 | hamster | 1
- 4 | munching | 1
- 4 | cute | 1
- 4 | look | 1
-(35 rows)
+ docid | word | count
+-------+-------------+-------
+ 0 | a | 1
+ 0 | breakfast | 1
+ 0 | banana | 1
+ 0 | and | 2
+ 0 | eat | 1
+ 0 | smoothie | 1
+ 0 | to | 1
+ 0 | like | 1
+ 0 | broccoli | 1
+ 0 | bananas | 1
+ 0 | spinach | 1
+ 0 | i | 2
+ 0 | ate | 1
+ 0 | for | 1
+ 1 | are | 1
+ 1 | cute | 1
+ 1 | kittens | 1
+ 1 | chinchillas | 1
+ 1 | and | 1
+ 2 | two | 1
+ 2 | yesterday | 1
+ 2 | kittens | 1
+ 2 | sister | 1
+ 2 | my | 1
+ 2 | adopted | 1
+ 3 | this | 1
+ 3 | at | 1
+ 3 | a | 1
+ 3 | broccoli | 1
+ 3 | of | 1
+ 3 | look | 1
+ 3 | hamster | 1
+ 3 | on | 1
+ 3 | piece | 1
+ 3 | cute | 1
+ 3 | munching | 1
+(36 rows)
</pre>
--# We also can create a vocabulary of the words and store a wordid in the output
-table instead of the actual word.
+-# Next we create a vocabulary of the words
+and store a wordid in the output table instead of the
+actual word:
<pre class="example">
-DROP TABLE IF EXISTS documents_tf;
-DROP TABLE IF EXISTS documents_tf_vocabulary;
-SELECT madlib.term_frequency('documents', 'docid', 'words', 'documents_tf', TRUE);
--- Output with wordid instead of the actual words
-SELECT * FROM documents_tf order by docid;
+DROP TABLE IF EXISTS documents_tf, documents_tf_vocabulary;
+SELECT madlib.term_frequency('documents', -- input table
+ 'docid', -- document id
--- End diff --
document id column
---
|