spark-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Andy Davidson <A...@SantaCruzIntegration.com>
Subject how to extend java transformer from Scala UnaryTransformer ?
Date Fri, 01 Jan 2016 19:38:49 GMT
I am trying to write a trivial transformer to use in my pipeline. I am
using Java and Spark 1.5.2. It was suggested that I use the Tokenizer.scala
class as an example. This should be very easy; however, I do not understand
Scala, and I am having trouble debugging the following exception.

Any help would be greatly appreciated.

Happy New Year

Andy

java.lang.IllegalArgumentException: requirement failed: Param null__inputCol
does not belong to Stemmer_2f3aa96d-7919-4eaa-ad54-f7c620b92d1c.
at scala.Predef$.require(Predef.scala:233)
at org.apache.spark.ml.param.Params$class.shouldOwn(params.scala:557)
at org.apache.spark.ml.param.Params$class.set(params.scala:436)
at org.apache.spark.ml.PipelineStage.set(Pipeline.scala:37)
at org.apache.spark.ml.param.Params$class.set(params.scala:422)
at org.apache.spark.ml.PipelineStage.set(Pipeline.scala:37)
at org.apache.spark.ml.UnaryTransformer.setInputCol(Transformer.scala:83)
at com.pws.xxx.ml.StemmerTest.test(StemmerTest.java:30)



public class StemmerTest extends AbstractSparkTest {

    @Test

    public void test() {

        Stemmer stemmer = new Stemmer()

                                .setInputCol("raw²) //line 30

                                .setOutputCol("filtered");

    }

}


/**

 * @ see 
spark-1.5.1/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala

 * @ see 
https://chimpler.wordpress.com/2014/06/11/classifiying-documents-using-naive
-bayes-on-apache-spark-mllib/

 * @ see 
http://www.tonytruong.net/movie-rating-prediction-with-apache-spark-and-hort
onworks/

 * 

 * @author andrewdavidson

 *

 */

public class Stemmer extends UnaryTransformer<List<String>, List<String>,
Stemmer> implements Serializable{

    static Logger logger = LoggerFactory.getLogger(Stemmer.class);

    private static final long serialVersionUID = 1L;

    private static final  ArrayType inputType =
DataTypes.createArrayType(DataTypes.StringType, true);

    private final String uid = Stemmer.class.getSimpleName() + "_" +
UUID.randomUUID().toString();



    @Override

    public String uid() {

        return uid;

    }



    /*

       override protected def validateInputType(inputType: DataType): Unit =
{

    require(inputType == StringType, s"Input type must be string type but
got $inputType.")

  }

     */

    @Override

    public void validateInputType(DataType inputTypeArg) {

        String msg = "inputType must be " + inputType.simpleString() + " but
got " + inputTypeArg.simpleString();

        assert (inputType.equals(inputTypeArg)) : msg;

    }

    

    @Override

    public Function1<List<String>, List<String>> createTransformFunc() {

        // 
http://stackoverflow.com/questions/6545066/using-scala-from-java-passing-fun
ctions-as-parameters

        Function1<List<String>, List<String>> f = new
AbstractFunction1<List<String>, List<String>>() {

            public List<String> apply(List<String> words) {

                for(String word : words) {

                    logger.error("AEDWIP input word: {}", word);

                }

                return words;

            }

        };

        

        return f;

    }



    @Override

    public DataType outputDataType() {

        return DataTypes.createArrayType(DataTypes.StringType, true);

    }

}



Mime
View raw message