predictionio-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From don...@apache.org
Subject [1/9] incubator-predictionio-template-text-classifier git commit: Changed tokenizer to use Apache Lucene StandardAnalyzer for non-western languages
Date Thu, 04 May 2017 18:25:23 GMT
Repository: incubator-predictionio-template-text-classifier
Updated Branches:
  refs/heads/master 7bff41178 -> b24325a39


Changed tokenizer to use Apache Lucene StandardAnalyzer for non-western languages


Project: http://git-wip-us.apache.org/repos/asf/incubator-predictionio-template-text-classifier/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-predictionio-template-text-classifier/commit/2bcbdae6
Tree: http://git-wip-us.apache.org/repos/asf/incubator-predictionio-template-text-classifier/tree/2bcbdae6
Diff: http://git-wip-us.apache.org/repos/asf/incubator-predictionio-template-text-classifier/diff/2bcbdae6

Branch: refs/heads/master
Commit: 2bcbdae63326873d996da9b4e1aa9afd952ecd67
Parents: 3d609f8
Author: Sebastiaan de Man <sebastiaan@gmail.com>
Authored: Sun Oct 30 21:31:48 2016 +0100
Committer: Sebastiaan de Man <sebastiaan@gmail.com>
Committed: Sun Oct 30 21:31:48 2016 +0100

----------------------------------------------------------------------
 src/main/scala/Preparator.scala | 29 ++++++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-predictionio-template-text-classifier/blob/2bcbdae6/src/main/scala/Preparator.scala
----------------------------------------------------------------------
diff --git a/src/main/scala/Preparator.scala b/src/main/scala/Preparator.scala
index c990944..fd043d6 100644
--- a/src/main/scala/Preparator.scala
+++ b/src/main/scala/Preparator.scala
@@ -11,6 +11,14 @@ import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.rdd.RDD
 
+import org.apache.lucene.analysis.standard.StandardAnalyzer
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
+import org.apache.lucene.util.Version
+
+import java.io.StringReader
+
+import scala.collection.mutable
+
 /** Define Preparator parameters. Recall that for our data
   * representation we are only required to input the n-gram window
   * components.
@@ -62,9 +70,28 @@ class TFHasher(
 
   private val hasher = new HashingTF(numFeatures = numFeatures)
 
+/** Use Lucene StandardAnalyzer to tokenize text **/
+ def tokenize(content: String): Seq[String] = {
+    val tReader = new StringReader(content)
+    val analyzer = new StandardAnalyzer(Version.LATEST)
+    val tStream = analyzer.tokenStream("contents", tReader)
+    val term = tStream.addAttribute(classOf[CharTermAttribute])
+    tStream.reset()
+
+    val result = mutable.ArrayBuffer.empty[String]
+    while (tStream.incrementToken()) {
+      val termValue = term.toString
+      
+        result += term.toString
+      
+    }
+    result
+}
+
+
   /** Hashing function: Text -> term frequency vector. */
   def hashTF(text: String): Vector = {
-    val newList : Array[String] = text.split(" ")
+    val newList : Array[String] = tokenize(text)
     .sliding(nGram)
     .map(_.mkString)
     .toArray


Mime
View raw message