predictionio-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From don...@apache.org
Subject [2/9] incubator-predictionio-template-text-classifier git commit: Filter out stop words from vectorization
Date Thu, 04 May 2017 18:25:24 GMT
Filter out stop words from vectorization

As per the discussion in
https://github.com/apache/incubator-predictionio-template-text-classifier/pull/8 ,
we implement a filter for stop words. The stop-word set is passed to the
constructor of TFHasher and the stop words are filtered out during
vectorization of words.


Project: http://git-wip-us.apache.org/repos/asf/incubator-predictionio-template-text-classifier/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-predictionio-template-text-classifier/commit/1a316143
Tree: http://git-wip-us.apache.org/repos/asf/incubator-predictionio-template-text-classifier/tree/1a316143
Diff: http://git-wip-us.apache.org/repos/asf/incubator-predictionio-template-text-classifier/diff/1a316143

Branch: refs/heads/master
Commit: 1a316143f169bc7804604d0914b380381dfb9fa1
Parents: 7bff411
Author: Natu Lauchande <nlauchande@gmail.com>
Authored: Mon Dec 5 17:36:04 2016 +0200
Committer: Natu Lauchande <nlauchande@gmail.com>
Committed: Tue Dec 6 04:04:47 2016 +0200

----------------------------------------------------------------------
 src/main/scala/Preparator.scala | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-predictionio-template-text-classifier/blob/1a316143/src/main/scala/Preparator.scala
----------------------------------------------------------------------
diff --git a/src/main/scala/Preparator.scala b/src/main/scala/Preparator.scala
index c8b35d0..1681acc 100644
--- a/src/main/scala/Preparator.scala
+++ b/src/main/scala/Preparator.scala
@@ -26,7 +26,7 @@ class Preparator(pp: PreparatorParams)
 
   def prepare(sc: SparkContext, td: TrainingData): PreparedData = {
 
-    val tfHasher = new TFHasher(pp.numFeatures, pp.nGram)
+    val tfHasher = new TFHasher(pp.numFeatures, pp.nGram, td.stopWords)
 
     // Convert trainingdata's observation text into TF vector
     // and then fit a IDF model
@@ -57,7 +57,8 @@ class Preparator(pp: PreparatorParams)
 
 class TFHasher(
   val numFeatures: Int,
-  val nGram: Int
+  val nGram: Int,
+  val stopWords:Set[String]
 ) extends Serializable {
 
   private val hasher = new HashingTF(numFeatures = numFeatures)
@@ -65,6 +66,7 @@ class TFHasher(
   /** Hashing function: Text -> term frequency vector. */
   def hashTF(text: String): Vector = {
     val newList : Array[String] = text.split(" ")
+    .filterNot(stopWords.contains(_))
     .sliding(nGram)
     .map(_.mkString)
     .toArray


Mime
View raw message