mahout-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From build...@apache.org
Subject svn commit: r948822 - in /websites/staging/mahout/trunk/content: ./ users/environment/classify-a-doc-from-the-shell.html
Date Thu, 23 Apr 2015 00:57:56 GMT
Author: buildbot
Date: Thu Apr 23 00:57:55 2015
New Revision: 948822

Log:
Staging update by buildbot for mahout

Added:
    websites/staging/mahout/trunk/content/users/environment/classify-a-doc-from-the-shell.html
Modified:
    websites/staging/mahout/trunk/content/   (props changed)

Propchange: websites/staging/mahout/trunk/content/
------------------------------------------------------------------------------
--- cms:source-revision (original)
+++ cms:source-revision Thu Apr 23 00:57:55 2015
@@ -1 +1 @@
-1675192
+1675527

Added: websites/staging/mahout/trunk/content/users/environment/classify-a-doc-from-the-shell.html
==============================================================================
--- websites/staging/mahout/trunk/content/users/environment/classify-a-doc-from-the-shell.html (added)
+++ websites/staging/mahout/trunk/content/users/environment/classify-a-doc-from-the-shell.html Thu Apr 23 00:57:55 2015
@@ -0,0 +1,535 @@
+<!DOCTYPE html>
+<!--
+
+    Licensed to the Apache Software Foundation (ASF) under one or more
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership.
+    The ASF licenses this file to You under the Apache License, Version 2.0
+    (the "License"); you may not use this file except in compliance with
+    the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+-->
+
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
+  <title>Apache Mahout: Scalable machine learning and data mining</title>
+  <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+  <meta name="Distribution" content="Global">
+  <meta name="Robots" content="index,follow">
+  <meta name="keywords" content="apache, apache hadoop, apache lucene,
+        business data mining, cluster analysis,
+        collaborative filtering, data extraction, data filtering, data framework, data integration,
+        data matching, data mining, data mining algorithms, data mining analysis, data mining data,
+        data mining introduction, data mining software,
+        data mining techniques, data representation, data set, datamining,
+        feature extraction, fuzzy k means, genetic algorithm, hadoop,
+        hierarchical clustering, high dimensional, introduction to data mining, kmeans,
+        knowledge discovery, learning approach, learning approaches, learning methods,
+        learning techniques, lucene, machine learning, machine translation, mahout apache,
+        mahout taste, map reduce hadoop, mining data, mining methods, naive bayes,
+        natural language processing,
+        supervised, text mining, time series data, unsupervised, web data mining">
+  <link rel="shortcut icon" type="image/x-icon" href="http://mahout.apache.org/images/favicon.ico">
+  <script type="text/javascript" src="/js/prototype.js"></script>
+  <script type="text/javascript" src="/js/effects.js"></script>
+  <script type="text/javascript" src="/js/search.js"></script>
+  <script type="text/javascript" src="/js/slides.js"></script>
+
+  <link href="/css/bootstrap.min.css" rel="stylesheet" media="screen">
+  <link href="/css/bootstrap-responsive.css" rel="stylesheet">
+  <link rel="stylesheet" href="/css/global.css" type="text/css">
+
+  <!-- mathJax stuff -- use `\(...\)` for inline style math in markdown -->
+  <script type="text/x-mathjax-config">
+  MathJax.Hub.Config({
+    tex2jax: {
+      skipTags: ['script', 'noscript', 'style', 'textarea', 'pre']
+    }
+  });
+  MathJax.Hub.Queue(function() {
+    var all = MathJax.Hub.getAllJax(), i;
+    for(i = 0; i < all.length; i += 1) {
+      all[i].SourceElement().parentNode.className += ' has-jax';
+    }
+  });
+  </script>
+  <script type="text/javascript">
+    var mathjax = document.createElement('script'); 
+    mathjax.type = 'text/javascript'; 
+    mathjax.async = true;
+
+    mathjax.src = ('https:' == document.location.protocol) ?
+        'https://c328740.ssl.cf1.rackcdn.com/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML' : 
+        'http://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML';
+	
+	  var s = document.getElementsByTagName('script')[0]; 
+    s.parentNode.insertBefore(mathjax, s);
+  </script>
+</head>
+
+<body id="home" data-twttr-rendered="true">
+  <div id="wrap">
+   <div id="header">
+    <div id="logo"><a href="/"><img src="/images/mahout-logo-brudman.png" alt="Logos for Mahout and Apache Software Foundation" /></a></div>
+  <div id="search">
+    <form id="search-form" action="http://www.google.com/search" method="get" class="navbar-search pull-right">    
+      <input value="http://mahout.apache.org" name="sitesearch" type="hidden">
+      <input class="search-query" name="q" id="query" type="text">
+      <input id="submission" type="image" src="/images/mahout-lupe.png" alt="Search" />
+    </form>
+  </div>
+
+    <div class="navbar navbar-inverse" style="position:absolute;top:133px;padding-right:0px;padding-left:0px;">
+      <div class="navbar-inner" style="border: none; background: #999; border: none; border-radius: 0px;">
+        <div class="container">
+          <button type="button" class="btn btn-navbar" data-toggle="collapse" data-target=".nav-collapse">
+            <span class="icon-bar"></span>
+            <span class="icon-bar"></span>
+            <span class="icon-bar"></span>
+          </button>
+          <!-- <a class="brand" href="#">Apache Community Development Project</a> -->
+          <div class="nav-collapse collapse">
+            <ul class="nav">
+             <!-- <li><a href="/">Home</a></li> --> 
+              <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">General<b class="caret"></b></a>
+                <ul class="dropdown-menu">
+                  <li><a href="/general/downloads.html">Downloads</a>
+                  <li><a href="/general/who-we-are.html">Who we are</a>
+                  <li><a href="/general/mailing-lists,-irc-and-archives.html">Mailing Lists</a>
+                  <li><a href="/general/release-notes.html">Release Notes</a> 
+                  <li><a href="/general/books-tutorials-and-talks.html">Books, Tutorials, Talks</a></li>
+                  <li><a href="/general/powered-by-mahout.html">Powered By Mahout</a>
+                  <li><a href="/general/professional-support.html">Professional Support</a>
+                  <li class="divider"></li>
+                  <li class="nav-header">Resources</li>
+                  <li><a href="/general/reference-reading.html">Reference Reading</a>
+                  <li><a href="/general/faq.html">FAQ</a>
+                  <li class="divider"></li>
+                  <li class="nav-header">Legal</li>
+                  <li><a href="http://www.apache.org/licenses/">License</a></li>
+                  <li><a href="http://www.apache.org/security/">Security</a></li>
+                  <li><a href="/general/privacy-policy.html">Privacy Policy</a>
+                </ul>
+              </li>
+              <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Developers<b class="caret"></b></a>
+                <ul class="dropdown-menu">
+                  <li><a href="/developers/developer-resources.html">Developer resources</a></li>
+                  <li><a href="/developers/version-control.html">Version control</a></li>
+                  <li><a href="/developers/buildingmahout.html">Build from source</a></li>
+                  <li><a href="/developers/issue-tracker.html">Issue tracker</a></li>
+                  <li><a href="https://builds.apache.org/job/Mahout-Quality/" target="_blank">Code quality reports</a></li>
+                  <li class="divider"></li>
+                  <li class="nav-header">Contributions</li>
+                  <li><a href="/developers/how-to-contribute.html">How to contribute</a></li>
+                  <li><a href="/developers/how-to-become-a-committer.html">How to become a committer</a></li>
+                  <li><a href="/developers/gsoc.html">GSoC</a></li>
+                  <li class="divider"></li>
+                  <li class="nav-header">For committers</li>
+                  <li><a href="/developers/how-to-update-the-website.html">How to update the website</a></li>
+                  <li><a href="/developers/patch-check-list.html">Patch check list</a></li>
+                  <li><a href="/developers/github.html">Handling Github PRs</a></li>
+                  <li><a href="/developers/how-to-release.html">How to release</a></li>
+                  <li><a href="/developers/thirdparty-dependencies.html">Third party dependencies</a></li>
+                </ul>
+               </li>
+               <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Mahout-Samsara<b class="caret"></b></a>
+                <ul class="dropdown-menu">
+                  <li><a href="/users/sparkbindings/home.html">Scala &amp; Spark Bindings Overview</a></li>
+                  <li><a href="/users/sparkbindings/faq.html">FAQ</a></li>
+                  <li class="nav-header">Engines</li>
+                  <li><a href="/users/sparkbindings/home.html">Spark</a></li>
+                  <li><a href="/users/environment/h2o-internals.html">H2O</a></li>
+                  <li class="nav-header">References</li>
+                  <li><a href="/users/environment/in-core-reference.html">In-Core Algebraic DSL Reference</a></li>
+                  <li><a href="/users/environment/out-of-core-reference.html">Out-Of-Core Algebraic DSL Reference</a></li>
+                  <li class="nav-header">Tutorials</li>
+                  <li><a href="/users/sparkbindings/play-with-shell.html">Playing with Mahout's Spark Shell</a></li>
+                  <li><a href="/users/environment/how-to-build-an-app.html">How to build an app</a></li>
+                </ul>
+              </li>
+              <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Algorithms<b class="caret"></b></a>
+                <ul class="dropdown-menu">
+                  <li><a href="/users/basics/algorithms.html">List of algorithms</a>
+                  <li class="nav-header">Distributed Matrix Decomposition</li>
+                  <li><a href="/users/algorithms/d-qr.html">Cholesky QR</a></li>
+                  <li><a href="/users/algorithms/d-ssvd.html">SSVD</a></li>
+                  <li class="nav-header">Recommendations</li>
+                  <li><a href="/users/algorithms/recommender-overview.html">Recommender Overview</a></li>
+                  <li><a href="/users/algorithms/intro-cooccurrence-spark.html">Intro to cooccurrence-based<br/> recommendations with Spark</a></li>
+                  <li class="nav-header">Classification</li>
+                  <li><a href="/users/algorithms/spark-naive-bayes.html">Spark Naive Bayes</a></li>
+                </ul>
+               </li>
+               <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">MapReduce Basics<b class="caret"></b></a>
+                 <ul class="dropdown-menu">
+                  <li><a href="/users/basics/algorithms.html">List of algorithms</a>
+                  <li><a href="/users/basics/quickstart.html">Overview</a>
+                  <li class="divider"></li>
+                  <li class="nav-header">Working with text</li>
+                  <li><a href="/users/basics/creating-vectors-from-text.html">Creating vectors from text</a>
+                  <li><a href="/users/basics/collocations.html">Collocations</a>
+                  <li class="divider"></li>
+                  <li class="nav-header">Dimensionality reduction</li>
+                  <li><a href="/users/dim-reduction/dimensional-reduction.html">Singular Value Decomposition</a></li>
+                  <li><a href="/users/dim-reduction/ssvd.html">Stochastic SVD</a></li>
+                  <li class="divider"></li>
+                  <li class="nav-header">Topic Models</li>      
+                  <li><a href="/users/clustering/latent-dirichlet-allocation.html">Latent Dirichlet Allocation</a></li>
+                </ul>
+               </li>
+               <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Mahout MapReduce<b class="caret"></b></a>
+                <ul class="dropdown-menu">
+                <li class="nav-header">Classification</li>
+                  <li><a href="/users/classification/bayesian.html">Naive Bayes</a></li>
+                  <li><a href="/users/classification/hidden-markov-models.html">Hidden Markov Models</a></li>
+                  <li><a href="/users/classification/logistic-regression.html">Logistic Regression (Single Machine)</a></li>
+                  <li><a href="/users/classification/partial-implementation.html">Random Forest</a></li>
+                  <li class="nav-header">Classification Examples</li>
+                  <li><a href="/users/classification/breiman-example.html">Breiman example</a></li>
+                  <li><a href="/users/classification/twenty-newsgroups.html">20 newsgroups example</a></li>
+                  <li><a href="/users/classification/bankmarketing-example.html">SGD classifier bank marketing</a></li>
+                  <li><a href="/users/classification/wikipedia-classifier-example.html">Wikipedia XML parser and classifier</a></li>
+                  <li class="nav-header">Clustering</li>
+                  <li><a href="/users/clustering/k-means-clustering.html">k-Means</a></li>
+                  <li><a href="/users/clustering/canopy-clustering.html">Canopy</a></li>
+                  <li><a href="/users/clustering/fuzzy-k-means.html">Fuzzy k-Means</a></li>
+                  <li><a href="/users/clustering/streaming-k-means.html">Streaming KMeans</a></li>
+                  <li><a href="/users/clustering/spectral-clustering.html">Spectral Clustering</a></li>
+                  <li class="nav-header">Clustering Commandline usage</li>
+                  <li><a href="/users/clustering/k-means-commandline.html">Options for k-Means</a></li>
+                  <li><a href="/users/clustering/canopy-commandline.html">Options for Canopy</a></li>
+                  <li><a href="/users/clustering/fuzzy-k-means-commandline.html">Options for Fuzzy k-Means</a></li>
+                  <li class="nav-header">Clustering Examples</li>
+                  <li><a href="/users/clustering/clustering-of-synthetic-control-data.html">Synthetic data</a></li>
+                  <li class="nav-header">Cluster Post processing</li>
+                  <li><a href="/users/clustering/cluster-dumper.html">Cluster Dumper tool</a></li>
+                  <li><a href="/users/clustering/visualizing-sample-clusters.html">Cluster visualisation</a></li>
+                  <li class="nav-header">Recommendations</li>
+                  <li><a href="/users/recommender/recommender-first-timer-faq.html">First Timer FAQ</a></li>
+                  <li><a href="/users/recommender/userbased-5-minutes.html">A user-based recommender <br/>in 5 minutes</a></li>
+		  <li><a href="/users/recommender/matrix-factorization.html">Matrix factorization-based<br/> recommenders</a></li>
+                  <li><a href="/users/recommender/recommender-documentation.html">Overview</a></li>
+                  <li><a href="/users/recommender/intro-itembased-hadoop.html">Intro to item-based recommendations<br/> with Hadoop</a></li>
+                  <li><a href="/users/recommender/intro-als-hadoop.html">Intro to ALS recommendations<br/> with Hadoop</a></li>
+               </ul>
+              </li>
+              <!--  <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Recommendations<b class="caret"></b></a>
+                <ul class="dropdown-menu">
+                
+                </ul> -->
+            </li>
+           </ul>
+          </div><!--/.nav-collapse -->
+        </div>
+      </div>
+    </div>
+
+</div>
+
+ <div id="sidebar">
+  <div id="sidebar-wrap">
+    <h2>Twitter</h2>
+	<ul class="sidemenu">
+		<li>
+<a class="twitter-timeline" href="https://twitter.com/ApacheMahout" data-widget-id="422861673444028416">Tweets by @ApacheMahout</a>
+<script>!function(d,s,id){var js,fjs=d.getElementsByTagName(s)[0],p=/^http:/.test(d.location)?'http':'https';if(!d.getElementById(id)){js=d.createElement(s);js.id=id;js.src=p+"://platform.twitter.com/widgets.js";fjs.parentNode.insertBefore(js,fjs);}}(document,"script","twitter-wjs");</script>
+</li>
+	</ul>
+    <h2>Apache Software Foundation</h2>
+    <ul class="sidemenu">
+      <li><a href="http://www.apache.org/foundation/how-it-works.html">How the ASF works</a></li>
+      <li><a href="http://www.apache.org/foundation/getinvolved.html">Get Involved</a></li>
+      <li><a href="http://www.apache.org/dev/">Developer Resources</a></li>
+      <li><a href="http://www.apache.org/foundation/sponsorship.html">Sponsorship</a></li>
+      <li><a href="http://www.apache.org/foundation/thanks.html">Thanks</a></li>
+    </ul>
+    <h2>Related Projects</h2>
+    <ul class="sidemenu">
+      <li><a href="http://lucene.apache.org/">Lucene</a></li>
+      <li><a href="http://hadoop.apache.org/">Hadoop</a></li>
+      <li><a href="http://bigtop.apache.org/">Bigtop</a></li>
+      <li><a href="http://spark.apache.org/">Spark</a></li>
+    </ul>
+  </div>
+</div>
+
+  <div id="content-wrap" class="clearfix">
+   <div id="main">
+    <h1 id="classifying-a-document-with-the-mahout-shell">Classifying a Document with the Mahout Shell</h1>
+<p>This tutorial assumes that you have Spark configured for the <code>spark-shell</code> See <a href="http://mahout.apache.org/users/sparkbindings/play-with-shell.html">Playing with Mahout's Shell</a>.  As well we assume that Mahout is running in cluster mode (i.e. with the <code>MAHOUT_LOCAL</code> environment variable unset) so that the output is put into HDFS.</p>
+<h2 id="downloading-and-vectorizing-the-wikipedia-dataset">Downloading and Vectorizing the wikipedia dataset</h2>
+<p><em>As of Mahout v0.10.0, we are still reliant on the MapReduce versions of <code>mahout seqwiki</code> and <code>mahout seq2sparse</code> to extract and vectorize our text.  A</em> <a href="https://issues.apache.org/jira/browse/MAHOUT-1663"><em>Spark implementation of seq2sparse</em></a> <em>is in the works for Mahout v0.11.</em> However, to download the wikipedia dataset, extract the bodies of the documentation, label each document and vectorize the text into TF-IDF vectors, we can simply run the <a href="https://github.com/apache/mahout/blob/master/examples/bin/classify-wikipedia.sh">wikipedia-classifier.sh</a> example.  </p>
+<div class="codehilite"><pre><span class="n">Please</span> <span class="n">select</span> <span class="n">a</span> <span class="n">number</span> <span class="n">to</span> <span class="n">choose</span> <span class="n">the</span> <span class="n">corresponding</span> <span class="n">task</span> <span class="n">to</span> <span class="n">run</span>
+1<span class="p">.</span> <span class="n">CBayes</span> <span class="p">(</span><span class="n">may</span> <span class="n">require</span> <span class="n">increased</span> <span class="n">heap</span> <span class="n">space</span> <span class="n">on</span> <span class="n">yarn</span><span class="p">)</span>
+2<span class="p">.</span> <span class="n">BinaryCBayes</span>
+3<span class="p">.</span> <span class="n">clean</span> <span class="o">--</span> <span class="n">cleans</span> <span class="n">up</span> <span class="n">the</span> <span class="n">work</span> <span class="n">area</span> <span class="n">in</span> <span class="o">/</span><span class="n">tmp</span><span class="o">/</span><span class="n">mahout</span><span class="o">-</span><span class="n">work</span><span class="o">-</span><span class="n">wiki</span>
+<span class="n">Enter</span> <span class="n">your</span> <span class="n">choice</span> <span class="p">:</span>
+</pre></div>
+
+
+<p>Enter (2). This will download a large recent XML dump of the wikipedia database, into a <code>/tmp/mahout-work-wiki</code> directory, unzip it and  place it into HDFS.  It will run a <a href="http://mahout.apache.org/users/classification/wikipedia-classifier-example.html">MapReduce job to parse the wikipedia set</a>, extracting and labeling only pages with category tags for [United States] and [United Kingdom]. It will then run <code>mahout seq2sparse</code> to convert the documents into TF-IDF vectors.  The script will also build and test a <a href="http://mahout.apache.org/users/classification/bayesian.html">Naive Bayes model using MapReduce</a>.  When it is completed, you should see a confusion matrix on your screen.  For this tutorial, we will ignore the MapReduce model, and build a new model using Spark based on the vectorization data created by <code>seq2sparse</code>.</p>
+<h2 id="getting-started">Getting Started</h2>
+<p>Launch the <code>mahout-shell</code>.  There is an example script: <code>spark-document-classifier.mscala</code> (<code>.mscala</code> denotes a Mahout-Scala script which can be run similarly to an R-script).   We will be walking through this script for this tutorial but if you wanted to simply run the script, you could just issue the command: </p>
+<div class="codehilite"><pre><span class="n">mahout</span><span class="o">&gt;</span> <span class="p">:</span><span class="n">load</span> <span class="o">/</span><span class="n">path</span><span class="o">/</span><span class="n">to</span><span class="o">/</span><span class="n">mahout</span><span class="o">/</span><span class="n">examples</span><span class="o">/</span><span class="n">bin</span><span class="o">/</span><span class="n">spark</span><span class="o">-</span><span class="n">document</span><span class="o">-</span><span class="n">classifier</span><span class="p">.</span><span class="n">mscala</span>
+</pre></div>
+
+
+<p>For now, let's take the script apart piece by piece.</p>
+<h2 id="imports">Imports</h2>
+<p>Our mahout Naive Bayes Imports:</p>
+<div class="codehilite"><pre><span class="n">import</span> <span class="n">org</span><span class="p">.</span><span class="n">apache</span><span class="p">.</span><span class="n">mahout</span><span class="p">.</span><span class="n">classifier</span><span class="p">.</span><span class="n">naivebayes</span><span class="p">.</span><span class="n">_</span>
+<span class="n">import</span> <span class="n">org</span><span class="p">.</span><span class="n">apache</span><span class="p">.</span><span class="n">mahout</span><span class="p">.</span><span class="n">classifier</span><span class="p">.</span><span class="n">stats</span><span class="p">.</span><span class="n">_</span>
+<span class="n">import</span> <span class="n">org</span><span class="p">.</span><span class="n">apache</span><span class="p">.</span><span class="n">mahout</span><span class="p">.</span><span class="n">nlp</span><span class="p">.</span><span class="n">tfidf</span><span class="p">.</span><span class="n">_</span>
+</pre></div>
+
+
+<p>Hadoop Imports needed to read our dictionary:</p>
+<div class="codehilite"><pre><span class="n">import</span> <span class="n">org</span><span class="p">.</span><span class="n">apache</span><span class="p">.</span><span class="n">hadoop</span><span class="p">.</span><span class="n">io</span><span class="p">.</span><span class="n">Text</span>
+<span class="n">import</span> <span class="n">org</span><span class="p">.</span><span class="n">apache</span><span class="p">.</span><span class="n">hadoop</span><span class="p">.</span><span class="n">io</span><span class="p">.</span><span class="n">IntWritable</span>
+<span class="n">import</span> <span class="n">org</span><span class="p">.</span><span class="n">apache</span><span class="p">.</span><span class="n">hadoop</span><span class="p">.</span><span class="n">io</span><span class="p">.</span><span class="n">LongWritable</span>
+</pre></div>
+
+
+<h2 id="read-in-our-full-set-from-hdfs-as-vectorized-by-seq2sparse-in-classify-wikipediash">read in our full set from HDFS as vectorized by seq2sparse in classify-wikipedia.sh</h2>
+<div class="codehilite"><pre><span class="n">val</span> <span class="n">pathToData</span> <span class="p">=</span> &quot;<span class="o">/</span><span class="n">tmp</span><span class="o">/</span><span class="n">mahout</span><span class="o">-</span><span class="n">work</span><span class="o">-</span><span class="n">wiki</span><span class="o">/</span>&quot;
+<span class="n">val</span> <span class="n">fullData</span> <span class="p">=</span> <span class="n">drmDfsRead</span><span class="p">(</span><span class="n">pathToData</span> <span class="o">+</span> &quot;<span class="n">wikipediaVecs</span><span class="o">/</span><span class="n">tfidf</span><span class="o">-</span><span class="n">vectors</span>&quot;<span class="p">)</span>
+</pre></div>
+
+
+<h2 id="extract-the-category-of-each-observation-and-aggregate-those-observation-by-category">extract the category of each observation and aggregate those observation by category</h2>
+<div class="codehilite"><pre><span class="n">val</span> <span class="p">(</span><span class="n">labelIndex</span><span class="p">,</span> <span class="n">aggregatedObservations</span><span class="p">)</span> <span class="p">=</span> <span class="n">SparkNaiveBayes</span><span class="p">.</span><span class="n">extractLabelsAndAggregateObservations</span><span class="p">(</span><span class="n">fullData</span><span class="p">)</span>
+</pre></div>
+
+
+<h2 id="build-a-muitinomial-naive-bayes-model-and-self-test-on-the-training-set">build a Multinomial Naive Bayes model and self test on the training set</h2>
+<div class="codehilite"><pre><span class="n">val</span> <span class="n">model</span> <span class="p">=</span> <span class="n">SparkNaiveBayes</span><span class="p">.</span><span class="n">train</span><span class="p">(</span><span class="n">aggregatedObservations</span><span class="p">,</span> <span class="n">labelIndex</span><span class="p">,</span> <span class="n">false</span><span class="p">)</span>
+<span class="n">val</span> <span class="n">resAnalyzer</span> <span class="p">=</span> <span class="n">SparkNaiveBayes</span><span class="p">.</span><span class="n">test</span><span class="p">(</span><span class="n">model</span><span class="p">,</span> <span class="n">fullData</span><span class="p">,</span> <span class="n">false</span><span class="p">)</span>
+<span class="n">println</span><span class="p">(</span><span class="n">resAnalyzer</span><span class="p">)</span>
+</pre></div>
+
+
+<p>printing the result analyzer will display the confusion matrix</p>
+<h2 id="read-in-the-dictionary-and-document-frequency-count-from-hdfs">read in the dictionary and document frequency count from HDFS</h2>
+<div class="codehilite"><pre><span class="n">val</span> <span class="n">dictionary</span> <span class="p">=</span> <span class="n">sdc</span><span class="p">.</span><span class="n">sequenceFile</span><span class="p">(</span><span class="n">pathToData</span> <span class="o">+</span> &quot;<span class="n">wikipediaVecs</span><span class="o">/</span><span class="n">dictionary</span><span class="p">.</span><span class="n">file</span><span class="o">-</span>0&quot;<span class="p">,</span>
+                                  <span class="n">classOf</span><span class="p">[</span><span class="n">Text</span><span class="p">],</span>
+                                  <span class="n">classOf</span><span class="p">[</span><span class="n">IntWritable</span><span class="p">])</span>
+<span class="n">val</span> <span class="n">documentFrequencyCount</span> <span class="p">=</span> <span class="n">sdc</span><span class="p">.</span><span class="n">sequenceFile</span><span class="p">(</span><span class="n">pathToData</span> <span class="o">+</span> &quot;<span class="n">wikipediaVecs</span><span class="o">/</span><span class="n">df</span><span class="o">-</span><span class="n">count</span>&quot;<span class="p">,</span>
+                                              <span class="n">classOf</span><span class="p">[</span><span class="n">IntWritable</span><span class="p">],</span>
+                                              <span class="n">classOf</span><span class="p">[</span><span class="n">LongWritable</span><span class="p">])</span>
+
+<span class="o">//</span> <span class="n">setup</span> <span class="n">the</span> <span class="n">dictionary</span> <span class="n">and</span> <span class="n">document</span> <span class="n">frequency</span> <span class="n">count</span> <span class="n">as</span> <span class="n">maps</span>
+<span class="n">val</span> <span class="n">dictionaryRDD</span> <span class="p">=</span> <span class="n">dictionary</span><span class="p">.</span><span class="n">map</span> <span class="p">{</span> 
+                                <span class="k">case</span> <span class="p">(</span><span class="n">wKey</span><span class="p">,</span> <span class="n">wVal</span><span class="p">)</span> <span class="p">=</span><span class="o">&gt;</span> <span class="n">wKey</span><span class="p">.</span><span class="n">asInstanceOf</span><span class="p">[</span><span class="n">Text</span><span class="p">]</span>
+                                                         <span class="p">.</span><span class="n">toString</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="n">wVal</span><span class="p">.</span><span class="n">get</span><span class="p">()</span> 
+                                   <span class="p">}</span>
+
+<span class="n">val</span> <span class="n">documentFrequencyCountRDD</span> <span class="p">=</span> <span class="n">documentFrequencyCount</span><span class="p">.</span><span class="n">map</span> <span class="p">{</span>
+                                        <span class="k">case</span> <span class="p">(</span><span class="n">wKey</span><span class="p">,</span> <span class="n">wVal</span><span class="p">)</span> <span class="p">=</span><span class="o">&gt;</span> <span class="n">wKey</span><span class="p">.</span><span class="n">asInstanceOf</span><span class="p">[</span><span class="n">IntWritable</span><span class="p">]</span>
+                                                                 <span class="p">.</span><span class="n">get</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="n">wVal</span><span class="p">.</span><span class="n">get</span><span class="p">()</span> 
+                                                           <span class="p">}</span>
+
+<span class="n">val</span> <span class="n">dictionaryMap</span> <span class="p">=</span> <span class="n">dictionaryRDD</span><span class="p">.</span><span class="n">collect</span><span class="p">.</span><span class="n">map</span><span class="p">(</span><span class="n">x</span> <span class="p">=</span><span class="o">&gt;</span> <span class="n">x</span><span class="p">.</span><span class="n">_1</span><span class="p">.</span><span class="n">toString</span> <span class="o">-&gt;</span> <span class="n">x</span><span class="p">.</span><span class="n">_2</span><span class="p">.</span><span class="n">toInt</span><span class="p">).</span><span class="n">toMap</span>
+<span class="n">val</span> <span class="n">dfCountMap</span> <span class="p">=</span> <span class="n">documentFrequencyCountRDD</span><span class="p">.</span><span class="n">collect</span><span class="p">.</span><span class="n">map</span><span class="p">(</span><span class="n">x</span> <span class="p">=</span><span class="o">&gt;</span> <span class="n">x</span><span class="p">.</span><span class="n">_1</span><span class="p">.</span><span class="n">toInt</span> <span class="o">-&gt;</span> <span class="n">x</span><span class="p">.</span><span class="n">_2</span><span class="p">.</span><span class="n">toLong</span><span class="p">).</span><span class="n">toMap</span>
+</pre></div>
+
+
+<h2 id="define-a-function-to-tokeinze-and-vectorize-new-text-using-our-current-dictionary">define a function to tokenize and vectorize new text using our current dictionary</h2>
+<p>For this simple example, our function <code>vectorizeDocument(...)</code> will tokenize a new document into unigrams using native Java String methods and vectorize using our dictionary and document frequencies. You could also use a <a href="https://lucene.apache.org/core/">Lucene</a> analyzer for bigrams, trigrams, etc., and integrate Apache <a href="https://tika.apache.org/">Tika</a> to extract text from different document types (PDF, PPT, XLS, etc.).  Here, however, we will keep it simple and split our text using regexes and native String methods.</p>
+<div class="codehilite"><pre>def vectorizeDocument<span class="p">(</span>document: String<span class="p">,</span>
+                        dictionaryMap: Map<span class="p">[</span>String<span class="p">,</span>Int<span class="p">],</span>
+                        dfMap: Map<span class="p">[</span>Int<span class="p">,</span>Long<span class="p">])</span>: Vector <span class="o">=</span> <span class="p">{</span>
+    val wordCounts <span class="o">=</span> document.replaceAll<span class="p">(</span><span class="s">&quot;[^\\p{L}\\p{Nd}]+&quot;</span><span class="p">,</span> <span class="s">&quot; &quot;</span><span class="p">)</span>
+                                <span class="m">.</span>toLowerCase
+                                <span class="m">.</span>split<span class="p">(</span><span class="s">&quot; &quot;</span><span class="p">)</span>
+                                <span class="m">.</span>groupBy<span class="p">(</span>identity<span class="p">)</span>
+                                <span class="m">.</span>mapValues<span class="p">(</span>_<span class="m">.</span>length<span class="p">)</span>         
+    val vec <span class="o">=</span> new RandomAccessSparseVector<span class="p">(</span>dictionaryMap.size<span class="p">)</span>
+    val totalDFSize <span class="o">=</span> dfMap<span class="p">(</span><span class="m">-1</span><span class="p">)</span>
+    val docSize <span class="o">=</span> wordCounts.size
+    <span class="kr">for</span> <span class="p">(</span>word <span class="o">&lt;-</span> wordCounts<span class="p">)</span> <span class="p">{</span>
+        val term <span class="o">=</span> word._1
+        <span class="kr">if</span> <span class="p">(</span>dictionaryMap.contains<span class="p">(</span>term<span class="p">))</span> <span class="p">{</span>
+            val tfidf: TermWeight <span class="o">=</span> new TFIDF<span class="p">()</span>
+            val termFreq <span class="o">=</span> word._2
+            val dictIndex <span class="o">=</span> dictionaryMap<span class="p">(</span>term<span class="p">)</span>
+            val docFreq <span class="o">=</span> dfCountMap<span class="p">(</span>dictIndex<span class="p">)</span>
+            val currentTfIdf <span class="o">=</span> tfidf.calculate<span class="p">(</span>termFreq<span class="p">,</span>
+                                               docFreq.toInt<span class="p">,</span>
+                                               docSize<span class="p">,</span>
+                                               totalDFSize.toInt<span class="p">)</span>
+            vec.setQuick<span class="p">(</span>dictIndex<span class="p">,</span> currentTfIdf<span class="p">)</span>
+        <span class="p">}</span>
+    <span class="p">}</span>
+    vec
+<span class="p">}</span>
+</pre></div>
+
+
+<h2 id="setup-our-classifier">setup our classifier</h2>
+<div class="codehilite"><pre><span class="n">val</span> <span class="n">labelMap</span> <span class="p">=</span> <span class="n">model</span><span class="p">.</span><span class="n">labelIndex</span>
+<span class="n">val</span> <span class="n">numLabels</span> <span class="p">=</span> <span class="n">model</span><span class="p">.</span><span class="n">numLabels</span>
+<span class="n">val</span> <span class="n">reverseLabelMap</span> <span class="p">=</span> <span class="n">labelMap</span><span class="p">.</span><span class="n">map</span><span class="p">(</span><span class="n">x</span> <span class="p">=</span><span class="o">&gt;</span> <span class="n">x</span><span class="p">.</span><span class="n">_2</span> <span class="o">-&gt;</span> <span class="n">x</span><span class="p">.</span><span class="n">_1</span><span class="p">)</span>
+
+<span class="o">//</span> <span class="n">instantiate</span> <span class="n">the</span> <span class="n">correct</span> <span class="n">type</span> <span class="n">of</span> <span class="n">classifier</span>
+<span class="n">val</span> <span class="n">classifier</span> <span class="p">=</span> <span class="n">model</span><span class="p">.</span><span class="n">isComplementary</span> <span class="n">match</span> <span class="p">{</span>
+    <span class="k">case</span> <span class="n">true</span> <span class="p">=</span><span class="o">&gt;</span> <span class="n">new</span> <span class="n">ComplementaryNBClassifier</span><span class="p">(</span><span class="n">model</span><span class="p">)</span>
+    <span class="k">case</span> <span class="n">_</span> <span class="p">=</span><span class="o">&gt;</span> <span class="n">new</span> <span class="n">StandardNBClassifier</span><span class="p">(</span><span class="n">model</span><span class="p">)</span>
+<span class="p">}</span>
+</pre></div>
+
+
+<h2 id="define-an-argmax-function">define an argmax function</h2>
+<p>The label with the highest score wins the classification for a given document</p>
+<div class="codehilite"><pre>def argmax<span class="p">(</span>v: Vector<span class="p">)</span>: <span class="p">(</span>Int<span class="p">,</span> Double<span class="p">)</span> <span class="o">=</span> <span class="p">{</span>
+    var bestIdx: Int <span class="o">=</span> Integer.MIN_VALUE
+    var bestScore: Double <span class="o">=</span> Integer.MIN_VALUE.asInstanceOf<span class="p">[</span>Int<span class="p">]</span><span class="m">.</span>toDouble
+    <span class="kr">for</span><span class="p">(</span>i <span class="o">&lt;-</span> <span class="m">0</span> until v.size<span class="p">)</span> <span class="p">{</span>
+        <span class="kr">if</span><span class="p">(</span>v<span class="p">(</span>i<span class="p">)</span> <span class="o">&gt;</span> bestScore<span class="p">){</span>
+            bestScore <span class="o">=</span> v<span class="p">(</span>i<span class="p">)</span>
+            bestIdx <span class="o">=</span> i
+        <span class="p">}</span>
+    <span class="p">}</span>
+    <span class="p">(</span>bestIdx<span class="p">,</span> bestScore<span class="p">)</span>
+<span class="p">}</span>
+</pre></div>
+
+
+<h2 id="define-our-final-tf-idf-vector-classifier">define our final TF(-IDF) vector classifier</h2>
+<div class="codehilite"><pre><span class="n">def</span> <span class="n">classifyDocument</span><span class="p">(</span><span class="n">clvec</span><span class="p">:</span> <span class="n">Vector</span><span class="p">)</span> <span class="p">:</span> <span class="n">String</span> <span class="p">=</span> <span class="p">{</span>
+    <span class="n">val</span> <span class="n">cvec</span> <span class="p">=</span> <span class="n">classifier</span><span class="p">.</span><span class="n">classifyFull</span><span class="p">(</span><span class="n">clvec</span><span class="p">)</span>
+    <span class="n">val</span> <span class="p">(</span><span class="n">bestIdx</span><span class="p">,</span> <span class="n">bestScore</span><span class="p">)</span> <span class="p">=</span> <span class="n">argmax</span><span class="p">(</span><span class="n">cvec</span><span class="p">)</span>
+    <span class="n">reverseLabelMap</span><span class="p">(</span><span class="n">bestIdx</span><span class="p">)</span>
+<span class="p">}</span>
+</pre></div>
+
+
+<h2 id="two-sample-news-articles-united-states-football-and-united-kingdom-football">Two sample news articles: United States Football and United Kingdom Football</h2>
+<div class="codehilite"><pre><span class="c1">// A random United States football article</span>
+<span class="c1">// http://www.reuters.com/article/2015/01/28/us-nfl-superbowl-security-idUSKBN0L12JR20150128</span>
+<span class="n">val</span> <span class="n">UStextToClassify</span> <span class="o">=</span> <span class="k">new</span> <span class="n">String</span><span class="p">(</span><span class="s">&quot;(Reuters) - Super Bowl security officials acknowledge&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; the NFL championship game represents a high profile target on a world stage but are&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; unaware of any specific credible threats against Sunday&#39;s showcase. In advance of&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; one of the world&#39;s biggest single day sporting events, Homeland Security Secretary&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; Jeh Johnson was in Glendale on Wednesday to review security preparations and tour&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; University of Phoenix Stadium where the Seattle Seahawks and New England Patriots&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; will battle. Deadly shootings in Paris and arrest of suspects in Belgium, Greece and&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; Germany heightened fears of more attacks around the world and social media accounts&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; linked to Middle East militant groups have carried a number of threats to attack&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; high-profile U.S. events. There is no specific credible threat, said Johnson, who&quot;</span> <span class="o">+</span> 
+    <span class="s">&quot; has appointed a federal coordination team to work with local, state and federal&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; agencies to ensure safety of fans, players and other workers associated with the&quot;</span> <span class="o">+</span> 
+    <span class="s">&quot; Super Bowl. I&#39;m confident we will have a safe and secure and successful event.&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; Sunday&#39;s game has been given a Special Event Assessment Rating (SEAR) 1 rating, the&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; same as in previous years, except for the year after the Sept. 11, 2001 attacks, when&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; a higher level was declared. But security will be tight and visible around Super&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; Bowl-related events as well as during the game itself. All fans will pass through&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; metal detectors and pat downs. Over 4,000 private security personnel will be deployed&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; and the almost 3,000 member Phoenix police force will be on Super Bowl duty. Nuclear&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; device sniffing teams will be deployed and a network of Bio-Watch detectors will be&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; set up to provide a warning in the event of a biological attack. The Department of&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; Homeland Security (DHS) said in a press release it had held special cyber-security&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; and anti-sniper training sessions. A U.S. official said the Transportation Security&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; Administration, which is responsible for screening airline passengers, will add&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; screeners and checkpoint lanes at airports. Federal air marshals, behavior detection&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; officers and dog teams will help to secure transportation systems in the area. We&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; will be ramping it (security) up on Sunday, there is no doubt about that, said Federal&quot;</span><span class="o">+</span>
+    <span class="s">&quot; Coordinator Matthew Allen, the DHS point of contact for planning and support. I have&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; every confidence the public safety agencies that represented in the planning process&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; are going to have their best and brightest out there this weekend and we will have&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; a very safe Super Bowl.&quot;</span><span class="p">)</span>
+
+<span class="c1">// A random United Kingdom football article</span>
+<span class="c1">// http://www.reuters.com/article/2015/01/26/manchester-united-swissquote-idUSL6N0V52RZ20150126</span>
+<span class="n">val</span> <span class="n">UKtextToClassify</span> <span class="o">=</span> <span class="k">new</span> <span class="n">String</span><span class="p">(</span><span class="s">&quot;(Reuters) - Manchester United have signed a sponsorship&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; deal with online financial trading company Swissquote, expanding the commercial&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; partnerships that have helped to make the English club one of the richest teams in&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; world soccer. United did not give a value for the deal, the club&#39;s first in the sector,&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; but said on Monday it was a multi-year agreement. The Premier League club, 20 times&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; English champions, claim to have 659 million followers around the globe, making the&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; United name attractive to major brands like Chevrolet cars and sportswear group Adidas.&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; Swissquote said the global deal would allow it to use United&#39;s popularity in Asia to&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; help it meet its targets for expansion in China. Among benefits from the deal,&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; Swissquote&#39;s clients will have a chance to meet United players and get behind the scenes&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; at the Old Trafford stadium. Swissquote is a Geneva-based online trading company that&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; allows retail investors to buy and sell foreign exchange, equities, bonds and other asset&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; classes. Like other retail FX brokers, Swissquote was left nursing losses on the Swiss&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; franc after Switzerland&#39;s central bank stunned markets this month by abandoning its cap&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; on the currency. The fallout from the abrupt move put rival and West Ham United shirt&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; sponsor Alpari UK into administration. Swissquote itself was forced to book a 25 million&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; Swiss francs ($28 million) provision for its clients who were left out of pocket&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; following the franc&#39;s surge. United&#39;s ability to grow revenues off the pitch has made&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; them the second richest club in the world behind Spain&#39;s Real Madrid, despite a&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; downturn in their playing fortunes. United Managing Director Richard Arnold said&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; there was still lots of scope for United to develop sponsorships in other areas of&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; business. The last quoted statistics that we had showed that of the top 25 sponsorship&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; categories, we were only active in 15 of those, Arnold told Reuters. I think there is a&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; huge potential still for the club, and the other thing we have seen is there is very&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; significant growth even within categories. United have endured a tricky transition&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; following the retirement of manager Alex Ferguson in 2013, finishing seventh in the&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; Premier League last season and missing out on a place in the lucrative Champions League.&quot;</span> <span class="o">+</span>
+    <span class="s">&quot; ($1 = 0.8910 Swiss francs) (Writing by Neil Maidment, additional reporting by Jemima&quot;</span> <span class="o">+</span> 
+    <span class="s">&quot; Kelly; editing by Keith Weir)&quot;</span><span class="p">)</span>
+</pre></div>
+
+
+<h2 id="vectorize-and-classify-our-documents">vectorize and classify our documents</h2>
+<div class="codehilite"><pre><span class="n">val</span> <span class="n">usVec</span> <span class="p">=</span> <span class="n">vectorizeDocument</span><span class="p">(</span><span class="n">UStextToClassify</span><span class="p">,</span> <span class="n">dictionaryMap</span><span class="p">,</span> <span class="n">dfCountMap</span><span class="p">)</span>
+<span class="n">val</span> <span class="n">ukVec</span> <span class="p">=</span> <span class="n">vectorizeDocument</span><span class="p">(</span><span class="n">UKtextToClassify</span><span class="p">,</span> <span class="n">dictionaryMap</span><span class="p">,</span> <span class="n">dfCountMap</span><span class="p">)</span>
+
+<span class="n">println</span><span class="p">(</span>&quot;<span class="n">Classifying</span> <span class="n">the</span> <span class="n">news</span> <span class="n">article</span> <span class="n">about</span> <span class="n">superbowl</span> <span class="n">security</span> <span class="p">(</span><span class="n">united</span> <span class="n">states</span><span class="p">)</span>&quot;<span class="p">)</span>
+<span class="n">classifyDocument</span><span class="p">(</span><span class="n">usVec</span><span class="p">)</span>
+
+<span class="n">println</span><span class="p">(</span>&quot;<span class="n">Classifying</span> <span class="n">the</span> <span class="n">news</span> <span class="n">article</span> <span class="n">about</span> <span class="n">Manchester</span> <span class="n">United</span> <span class="p">(</span><span class="n">united</span> <span class="n">kingdom</span><span class="p">)</span>&quot;<span class="p">)</span>
+<span class="n">classifyDocument</span><span class="p">(</span><span class="n">ukVec</span><span class="p">)</span>
+</pre></div>
+
+
+<h2 id="tie-everything-together-in-a-new-method-to-classify-new-text">tie everything together in a new method to classify new text</h2>
+<div class="codehilite"><pre><span class="n">def</span> <span class="n">classifyText</span><span class="p">(</span><span class="n">txt</span><span class="p">:</span> <span class="n">String</span><span class="p">):</span> <span class="n">String</span> <span class="p">=</span> <span class="p">{</span>
+    <span class="n">val</span> <span class="n">v</span> <span class="p">=</span> <span class="n">vectorizeDocument</span><span class="p">(</span><span class="n">txt</span><span class="p">,</span> <span class="n">dictionaryMap</span><span class="p">,</span> <span class="n">dfCountMap</span><span class="p">)</span>
+    <span class="n">classifyDocument</span><span class="p">(</span><span class="n">v</span><span class="p">)</span>
+
+<span class="p">}</span>
+</pre></div>
+
+
+<h2 id="now-we-can-simply-call-our-classifytext-method-on-any-string">now we can simply call our classifyText method on any string</h2>
+<div class="codehilite"><pre><span class="n">classifyText</span><span class="p">(</span>&quot;<span class="n">Hello</span> <span class="n">world</span> <span class="n">from</span> <span class="n">Queens</span>&quot;<span class="p">)</span>
+<span class="n">classifyText</span><span class="p">(</span>&quot;<span class="n">Hello</span> <span class="n">world</span> <span class="n">from</span> <span class="n">London</span>&quot;<span class="p">)</span>
+</pre></div>
+   </div>
+  </div>     
+</div> 
+  <footer class="footer" align="center">
+    <div class="container">
+      <p>
+        Copyright &copy; 2014 The Apache Software Foundation, Licensed under
+        the <a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>.
+        <br />
+        Apache and the Apache feather logos are trademarks of The Apache Software Foundation.
+      </p>
+    </div>
+  </footer>
+  
+  <script src="/js/jquery-1.9.1.min.js"></script>
+  <script src="/js/bootstrap.min.js"></script>
+  <script>
+    (function() {
+      var cx = '012254517474945470291:vhsfv7eokdc';
+      var gcse = document.createElement('script');
+      gcse.type = 'text/javascript';
+      gcse.async = true;
+      gcse.src = (document.location.protocol == 'https:' ? 'https:' : 'http:') +
+          '//www.google.com/cse/cse.js?cx=' + cx;
+      var s = document.getElementsByTagName('script')[0];
+      s.parentNode.insertBefore(gcse, s);
+    })();
+  </script>
+</body>
+</html>



Mime
View raw message