predictionio-commits mailing list archives

From don...@apache.org
Subject [6/9] incubator-predictionio-template-text-classifier git commit: Merge branch 'STOP_WORDS' of https://github.com/nlauchande/incubator-predictionio-template-text-classifier
Date Thu, 04 May 2017 18:25:28 GMT
Merge branch 'STOP_WORDS' of https://github.com/nlauchande/incubator-predictionio-template-text-classifier


Project: http://git-wip-us.apache.org/repos/asf/incubator-predictionio-template-text-classifier/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-predictionio-template-text-classifier/commit/3d2baf55
Tree: http://git-wip-us.apache.org/repos/asf/incubator-predictionio-template-text-classifier/tree/3d2baf55
Diff: http://git-wip-us.apache.org/repos/asf/incubator-predictionio-template-text-classifier/diff/3d2baf55

Branch: refs/heads/master
Commit: 3d2baf5544491d7c0cc33cf83c6e32374e5d70d7
Parents: 5c45ef9 1a31614
Author: Donald Szeto <donald@apache.org>
Authored: Thu May 4 10:46:57 2017 -0700
Committer: Donald Szeto <donald@apache.org>
Committed: Thu May 4 10:46:57 2017 -0700

----------------------------------------------------------------------
 src/main/scala/Preparator.scala | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-predictionio-template-text-classifier/blob/3d2baf55/src/main/scala/Preparator.scala
----------------------------------------------------------------------
diff --cc src/main/scala/Preparator.scala
index 9441257,1681acc..8a5cb5c
--- a/src/main/scala/Preparator.scala
+++ b/src/main/scala/Preparator.scala
@@@ -70,28 -63,10 +71,29 @@@ class TFHasher
  
    private val hasher = new HashingTF(numFeatures = numFeatures)
  
 +/** Use Lucene StandardAnalyzer to tokenize text **/
 + def tokenize(content: String): Seq[String] = {
 +    val tReader = new StringReader(content)
 +    val analyzer = new StandardAnalyzer(Version.LATEST)
 +    val tStream = analyzer.tokenStream("contents", tReader)
 +    val term = tStream.addAttribute(classOf[CharTermAttribute])
 +    tStream.reset()
 +
 +    val result = mutable.ArrayBuffer.empty[String]
 +    while (tStream.incrementToken()) {
 +      val termValue = term.toString
-       
++
 +        result += term.toString
-       
++
 +    }
 +    result
 +}
 +
 +
    /** Hashing function: Text -> term frequency vector. */
    def hashTF(text: String): Vector = {
 -    val newList : Array[String] = text.split(" ")
 +    val newList : Array[String] = tokenize(text)
+     .filterNot(stopWords.contains(_))
      .sliding(nGram)
      .map(_.mkString)
      .toArray
@@@ -104,7 -79,7 +106,7 @@@ class TFIDFModel
    val hasher: TFHasher,
    val idf: IDFModel
  ) extends Serializable {
--  
++
    /** transform text to tf-idf vector. */
    def transform(text: String): Vector = {
      // Map(n-gram -> document tf)


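----------------------------------------------------------------------
For readers who want the change in context rather than as a merge diff,
below is a minimal, self-contained sketch of what the TFHasher and
TFIDFModel look like after this commit: tokenize with Lucene's
StandardAnalyzer, drop stop words, build n-grams, and hash them with
Spark's HashingTF before applying the fitted IDF. The class names
TFHasherSketch and TFIDFModelSketch, and the constructor parameters
numFeatures, nGram and stopWords, are illustrative only (in the
template they are wired through the engine parameters); the
TFIDFModel.transform body is an assumption consistent with Spark's
IDFModel API, since the diff only shows its signature; and the
tokenStream end()/close() calls are added here even though the diff
omits them.

import java.io.StringReader

import scala.collection.mutable

import org.apache.lucene.analysis.standard.StandardAnalyzer
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
import org.apache.lucene.util.Version

import org.apache.spark.mllib.feature.{HashingTF, IDFModel}
import org.apache.spark.mllib.linalg.Vector

// Illustrative stand-in for the template's TFHasher; parameters are
// passed directly here instead of through the engine params.
class TFHasherSketch(
  numFeatures: Int,
  nGram: Int,
  stopWords: Set[String]
) extends Serializable {

  private val hasher = new HashingTF(numFeatures = numFeatures)

  /** Use Lucene's StandardAnalyzer to tokenize text. */
  def tokenize(content: String): Seq[String] = {
    val tReader = new StringReader(content)
    val analyzer = new StandardAnalyzer(Version.LATEST)
    val tStream = analyzer.tokenStream("contents", tReader)
    val term = tStream.addAttribute(classOf[CharTermAttribute])
    tStream.reset()

    val result = mutable.ArrayBuffer.empty[String]
    while (tStream.incrementToken()) {
      result += term.toString
    }
    // Release analyzer resources (not shown in the diff above).
    tStream.end()
    tStream.close()
    analyzer.close()
    result
  }

  /** Hashing function: text -> term frequency vector. */
  def hashTF(text: String): Vector = {
    val newList: Array[String] = tokenize(text)
      .filterNot(stopWords.contains(_))
      .sliding(nGram)
      .map(_.mkString)
      .toArray
    hasher.transform(newList)
  }
}

// Illustrative stand-in for the template's TFIDFModel.
class TFIDFModelSketch(
  val hasher: TFHasherSketch,
  val idf: IDFModel
) extends Serializable {

  /** Transform text to a tf-idf vector: term-frequency hash, then idf scaling. */
  def transform(text: String): Vector =
    idf.transform(hasher.hashTF(text))
}

Compared with the previous text.split(" "), the analyzer lowercases
tokens and strips punctuation, so the stop-word filter introduced by
the STOP_WORDS branch can actually match entries in the stop list.
----------------------------------------------------------------------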