Author: robinanil
Date: Tue Feb 9 05:49:18 2010
New Revision: 907938
URL: http://svn.apache.org/viewvc?rev=907938&view=rev
Log:
MAHOUT-242 NGram Collocation using LLR (Drew Farris)
Added:
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocCombiner.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapper.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducer.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/Gram.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/LLRReducer.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/NGramCollector.java
lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/
lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/
lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/
lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducerTest.java
lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/GramTest.java
lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/LLRReducerTest.java
lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/NGramCollectorTest.java
Modified:
lucene/mahout/trunk/utils/pom.xml
Modified: lucene/mahout/trunk/utils/pom.xml
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/pom.xml?rev=907938&r1=907937&r2=907938&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/pom.xml (original)
+++ lucene/mahout/trunk/utils/pom.xml Tue Feb 9 05:49:18 2010
@@ -149,6 +149,16 @@
<artifactId>lucene-core</artifactId>
</dependency>
+ <dependency>
+ <groupId>org.easymock</groupId>
+ <artifactId>easymock</artifactId>
+ </dependency>
+
+ <dependency>
+ <groupId>org.easymock</groupId>
+ <artifactId>easymockclassextension</artifactId>
+ </dependency>
+
</dependencies>
Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocCombiner.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocCombiner.java?rev=907938&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocCombiner.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocCombiner.java Tue Feb 9 05:49:18 2010
@@ -0,0 +1,81 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.nlp.collocations.llr;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Iterator;
+
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+
+/**
+ * Combiner for pass 1 of the collocation identification job. Pre-aggregates
+ * the ngram counts seen for a single subgram key before they reach the
+ * reducer.
+ */
+public class CollocCombiner extends MapReduceBase implements
+    Reducer<Gram,Gram,Gram,Gram> {
+
+  /**
+   * Collocation finder, pass 1 combine phase.
+   *
+   * The key is a subgram (head or tail) and the values are the ngrams that
+   * subgram was found in, each carrying a partial frequency.
+   *
+   * Values that are equal (Gram equality ignores frequency) are merged and
+   * their frequencies summed. The key's frequency is set to the sum of the
+   * frequencies of all values seen. One (key, ngram) pair is emitted per
+   * distinct ngram:
+   *
+   * k:subgram:subgramfreq v:ngram:ngramfreq
+   *
+   * Each ngram's frequency is essentially counted twice, once under its head
+   * and once under its tail; both counts should be the same.
+   *
+   * @param key
+   *          the subgram; its frequency is overwritten before output
+   * @param value
+   *          the ngrams containing the subgram
+   * @param output
+   *          receives the (subgram, ngram) pairs
+   * @param reporter
+   *          not used
+   */
+  @Override
+  public void reduce(Gram key,
+                     Iterator<Gram> value,
+                     OutputCollector<Gram,Gram> output,
+                     Reporter reporter) throws IOException {
+
+    HashMap<Gram,Gram> distinctNgrams = new HashMap<Gram,Gram>();
+    int total = 0;
+
+    while (value.hasNext()) {
+      Gram incoming = value.next();
+      total += incoming.getFrequency();
+
+      Gram seen = distinctNgrams.get(incoming);
+      if (seen != null) {
+        seen.incrementFrequency(incoming.getFrequency());
+        continue;
+      }
+
+      // the iterator may hand back the same instance on every call, so a
+      // private copy is stored rather than the instance itself
+      Gram copy = new Gram(incoming);
+      distinctNgrams.put(copy, copy);
+    }
+
+    // emit subgram:subgramFreq ngram:ngramFreq pairs
+    key.setFrequency(total);
+
+    for (Gram ngram : distinctNgrams.keySet()) {
+      output.collect(key, ngram);
+    }
+  }
+
+}
Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java?rev=907938&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java Tue Feb 9 05:49:18 2010
@@ -0,0 +1,205 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.nlp.collocations.llr;
+
+import static org.apache.mahout.utils.nlp.collocations.llr.NGramCollector.Count.NGRAM_TOTAL;
+
+import java.io.IOException;
+
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.RunningJob;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.apache.hadoop.mapred.TextOutputFormat;
+import org.apache.hadoop.mapred.lib.IdentityMapper;
+import org.apache.mahout.common.CommandLineUtil;
+import org.apache.mahout.common.HadoopUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/** Driver for LLR collocation discovery mapreduce job */
+public class CollocDriver {
+
+  public static final String DEFAULT_OUTPUT_DIRECTORY = "output";
+  public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
+
+  /**
+   * Sub-directory of the output path written by pass 1 and read back by
+   * pass 2.
+   */
+  private static final String PASS1_SUBDIR = "/pass1";
+
+  /** Sub-directory of the output path that receives the pass 2 output. */
+  private static final String COLLOC_SUBDIR = "/colloc";
+
+  private static final Logger log = LoggerFactory.getLogger(CollocDriver.class);
+
+  /**
+   * Command line entry point. Parses the options, optionally clears the
+   * output directory, then runs pass 1 followed by pass 2.
+   *
+   * If the command line cannot be parsed the error is logged and the help
+   * text is printed; no job is run.
+   *
+   * @param args
+   *          command line arguments: --input and --output are required,
+   *          --maxNGramSize, --overwrite, --analyzerName and --help are
+   *          optional
+   * @throws Exception
+   *           if either job fails
+   */
+  public static void main(String[] args) throws Exception {
+    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
+    ArgumentBuilder abuilder = new ArgumentBuilder();
+    GroupBuilder gbuilder = new GroupBuilder();
+
+    Option inputOpt = obuilder.withLongName("input").withRequired(true)
+        .withArgument(
+          abuilder.withName("input").withMinimum(1).withMaximum(1).create())
+        .withDescription("The Path for input files.").withShortName("i")
+        .create();
+
+    Option outputOpt = obuilder.withLongName("output").withRequired(true)
+        .withArgument(
+          abuilder.withName("output").withMinimum(1).withMaximum(1).create())
+        .withDescription("The Path to write output to").withShortName("o")
+        .create();
+
+    Option maxNGramSizeOpt = obuilder
+        .withLongName("maxNGramSize")
+        .withRequired(false)
+        .withArgument(
+          abuilder.withName("size").withMinimum(1).withMaximum(1).create())
+        .withDescription(
+          "The maximum size of ngrams to create (2 = bigrams, 3 = trigrams, etc)")
+        .withShortName("n").create();
+
+    Option overwriteOutput = obuilder.withLongName("overwrite")
+        .withRequired(false)
+        .withDescription("If set, overwrite the output directory")
+        .withShortName("w").create();
+
+    Option analyzerNameOpt = obuilder.withLongName("analyzerName")
+        .withRequired(false)
+        .withArgument(
+          abuilder.withName("analyzerName").withMinimum(1).withMaximum(1)
+              .create())
+        .withDescription("Class name of analyzer to use for tokenization")
+        .withShortName("a").create();
+
+    Option helpOpt = obuilder.withLongName("help")
+        .withDescription("Print out help").withShortName("h").create();
+
+    Group group = gbuilder.withName("Options")
+        .withOption(inputOpt)
+        .withOption(outputOpt)
+        .withOption(maxNGramSizeOpt)
+        .withOption(overwriteOutput)
+        .withOption(analyzerNameOpt)
+        .withOption(helpOpt)
+        .create();
+
+    try {
+      Parser parser = new Parser();
+      parser.setGroup(group);
+      CommandLine cmdLine = parser.parse(args);
+
+      if (cmdLine.hasOption(helpOpt)) {
+        CommandLineUtil.printHelp(group);
+        return;
+      }
+
+      String input = cmdLine.getValue(inputOpt).toString();
+      String output = cmdLine.getValue(outputOpt).toString();
+
+      int maxNGramSize = DEFAULT_MAX_NGRAM_SIZE;
+
+      if (cmdLine.hasOption(maxNGramSizeOpt)) {
+        String requestedSize = cmdLine.getValue(maxNGramSizeOpt).toString();
+        try {
+          maxNGramSize = Integer.parseInt(requestedSize);
+        } catch (NumberFormatException ex) {
+          // best effort: keep going with the default, but say what was
+          // rejected and what is being used instead
+          log.warn("Could not parse ngram size option '{}', using default of {}",
+            requestedSize, DEFAULT_MAX_NGRAM_SIZE);
+        }
+      }
+
+      if (cmdLine.hasOption(overwriteOutput)) {
+        HadoopUtil.overwriteOutput(output);
+      }
+
+      String analyzerName = null;
+      if (cmdLine.hasOption(analyzerNameOpt)) {
+        analyzerName = cmdLine.getValue(analyzerNameOpt).toString();
+      }
+
+      // parse input and extract collocations
+      long ngramCount = runPass1(input, output, maxNGramSize, analyzerName);
+
+      // tally collocations and perform LLR calculation
+      runPass2(ngramCount, output);
+
+    } catch (OptionException e) {
+      log.error("Exception", e);
+      CommandLineUtil.printHelp(group);
+    }
+
+  }
+
+  /**
+   * pass1: generate collocations, ngrams.
+   *
+   * Reads sequence files from the input path and writes sequence files of
+   * Gram pairs to the "pass1" sub-directory of the output path, using
+   * CollocMapper, CollocCombiner and CollocReducer.
+   *
+   * @param input
+   *          path of the input sequence files
+   * @param output
+   *          base output path
+   * @param maxNGramSize
+   *          passed to the job as NGramCollector.MAX_SHINGLE_SIZE
+   * @param analyzerClass
+   *          passed to the job as NGramCollector.ANALYZER_CLASS; when null
+   *          the property is not set
+   * @return the value of the NGRAM_TOTAL counter after the job has run
+   * @throws IOException
+   *           if the job fails
+   */
+  public static long runPass1(String input,
+                              String output,
+                              int maxNGramSize,
+                              String analyzerClass) throws IOException {
+    JobConf conf = new JobConf(CollocDriver.class);
+
+    conf.setMapOutputKeyClass(Gram.class);
+    conf.setMapOutputValueClass(Gram.class);
+
+    conf.setOutputKeyClass(Gram.class);
+    conf.setOutputValueClass(Gram.class);
+
+    conf.setCombinerClass(CollocCombiner.class);
+
+    FileInputFormat.setInputPaths(conf, new Path(input));
+    Path outPath = new Path(output + PASS1_SUBDIR);
+    FileOutputFormat.setOutputPath(conf, outPath);
+
+    conf.setInputFormat(SequenceFileInputFormat.class);
+    conf.setMapperClass(CollocMapper.class);
+
+    conf.setOutputFormat(SequenceFileOutputFormat.class);
+    conf.setReducerClass(CollocReducer.class);
+    conf.set(NGramCollector.MAX_SHINGLE_SIZE, String.valueOf(maxNGramSize));
+
+    if (analyzerClass != null) {
+      conf.set(NGramCollector.ANALYZER_CLASS, analyzerClass);
+    }
+
+    RunningJob job = JobClient.runJob(conf);
+    return job.getCounters().findCounter(NGRAM_TOTAL).getValue();
+  }
+
+  /**
+   * pass2: perform the LLR calculation.
+   *
+   * Reads the pass 1 output from the "pass1" sub-directory of the output
+   * path, and writes text output keyed by the LLR score to the "colloc"
+   * sub-directory, using an identity mapper and LLRReducer.
+   *
+   * @param nGramTotal
+   *          total ngram count, handed to the reducer as
+   *          LLRReducer.NGRAM_TOTAL
+   * @param output
+   *          base output path, the same one given to pass 1
+   * @throws IOException
+   *           if the job fails
+   */
+  public static void runPass2(long nGramTotal, String output) throws IOException {
+    JobConf conf = new JobConf(CollocDriver.class);
+    conf.set(LLRReducer.NGRAM_TOTAL, String.valueOf(nGramTotal));
+
+    conf.setMapOutputKeyClass(Gram.class);
+    conf.setMapOutputValueClass(Gram.class);
+
+    conf.setOutputKeyClass(DoubleWritable.class);
+    conf.setOutputValueClass(Text.class);
+
+    FileInputFormat.setInputPaths(conf, new Path(output + PASS1_SUBDIR));
+    Path outPath = new Path(output + COLLOC_SUBDIR);
+    FileOutputFormat.setOutputPath(conf, outPath);
+
+    conf.setMapperClass(IdentityMapper.class);
+    conf.setInputFormat(SequenceFileInputFormat.class);
+    conf.setOutputFormat(TextOutputFormat.class);
+    conf.setReducerClass(LLRReducer.class);
+    JobClient.runJob(conf);
+  }
+}
Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapper.java?rev=907938&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapper.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapper.java Tue Feb 9 05:49:18 2010
@@ -0,0 +1,74 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.nlp.collocations.llr;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+
+/**
+ * Runs pass 1 of the Collocation discovery job on input of
+ * SequenceFile<Text,Text>, where the key is a document id and the value is the
+ * document contents. Delegates to NGramCollector to perform tokenization,
+ * ngram-creation and output collection.
+ *
+ * @see org.apache.mahout.text.SequenceFilesFromDirectory
+ * @see org.apache.mahout.utils.nlp.collocations.llr.NGramCollector
+ */
+public class CollocMapper extends MapReduceBase implements
+    Mapper<Text,Text,Gram,Gram> {
+
+  /** Does the actual work for each document; configured once per task. */
+  private final NGramCollector ngramCollector = new NGramCollector();
+
+  /**
+   * Passes the job configuration on to the ngram collector.
+   *
+   * @param job
+   *          the job configuration
+   */
+  @Override
+  public void configure(JobConf job) {
+    super.configure(job);
+    ngramCollector.configure(job);
+  }
+
+  /**
+   * Collocation finder: pass 1 map phase.
+   *
+   * Receives a full document in value and hands its text, wrapped in a
+   * Reader, to
+   * {@link NGramCollector#collectNgrams(Reader, OutputCollector, Reporter)}.
+   * The key is not used.
+   *
+   * @param key
+   *          document id, ignored
+   * @param value
+   *          document contents
+   * @param collector
+   *          passed through to the ngram collector
+   * @param reporter
+   *          passed through to the ngram collector
+   */
+  @Override
+  public void map(Text key,
+                  Text value,
+                  OutputCollector<Gram,Gram> collector,
+                  Reporter reporter) throws IOException {
+
+    Reader document = new StringReader(value.toString());
+    ngramCollector.collectNgrams(document, collector, reporter);
+  }
+}
Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducer.java?rev=907938&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducer.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducer.java Tue Feb 9 05:49:18 2010
@@ -0,0 +1,83 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.nlp.collocations.llr;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Iterator;
+
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+
+/**
+ * Reducer for Pass 1 of the collocation identification job. Generates counts
+ * for ngrams and subgrams.
+ */
+public class CollocReducer extends MapReduceBase implements
+    Reducer<Gram,Gram,Gram,Gram> {
+
+  /**
+   * Collocation finder, pass 1 reduce phase.
+   *
+   * The key is a subgram (head or tail) and the values are the ngrams that
+   * subgram was found in, each carrying a partial frequency.
+   *
+   * Values that are equal (Gram equality ignores frequency) are merged and
+   * their frequencies summed. The key's frequency is set to the sum of the
+   * frequencies of all values seen. The pair is then inverted on output, so
+   * that one (ngram, subgram) pair is emitted per distinct ngram:
+   *
+   * k:ngram:ngramfreq v:subgram:subgramfreq
+   *
+   * Each ngram's frequency is essentially counted twice, once under its head
+   * and once under its tail; both counts should be the same.
+   *
+   * @param key
+   *          the subgram; its frequency is overwritten before output
+   * @param value
+   *          the ngrams containing the subgram
+   * @param output
+   *          receives the (ngram, subgram) pairs
+   * @param reporter
+   *          not used
+   */
+  @Override
+  public void reduce(Gram key,
+                     Iterator<Gram> value,
+                     OutputCollector<Gram,Gram> output,
+                     Reporter reporter) throws IOException {
+
+    HashMap<Gram,Gram> distinctNgrams = new HashMap<Gram,Gram>();
+    int total = 0;
+
+    while (value.hasNext()) {
+      Gram incoming = value.next();
+      total += incoming.getFrequency();
+
+      Gram seen = distinctNgrams.get(incoming);
+      if (seen != null) {
+        seen.incrementFrequency(incoming.getFrequency());
+        continue;
+      }
+
+      // the iterator may hand back the same instance on every call, so a
+      // private copy is stored rather than the instance itself
+      Gram copy = new Gram(incoming);
+      distinctNgrams.put(copy, copy);
+    }
+
+    // emit ngram:ngramFreq, subgram:subgramFreq pairs.
+    key.setFrequency(total);
+
+    for (Gram ngram : distinctNgrams.keySet()) {
+      output.collect(ngram, key);
+    }
+  }
+}
Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/Gram.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/Gram.java?rev=907938&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/Gram.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/Gram.java Tue Feb 9 05:49:18 2010
@@ -0,0 +1,235 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.nlp.collocations.llr;
+
+import static org.apache.mahout.utils.nlp.collocations.llr.Gram.Position.HEAD;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.io.WritableComparable;
+
+/**
+ * Writable for holding data generated from the collocation discovery jobs.
+ * Depending on the job configuration gram may be one or more words. In some
+ * contexts this is used to hold a complete ngram, while in others it holds a
+ * part of an existing ngram (subgram). Tracks the frequency of the gram and its
+ * position in the ngram in which it was found.
+ */
+public class Gram implements WritableComparable<Gram> {
+
+  /** Where a gram sits within the ngram it was taken from. */
+  public enum Position {
+    HEAD,
+    TAIL
+  }
+
+  private String gram;
+  private int frequency;
+  private Position position;
+
+  /** Creates an empty gram; fields are filled in later by readFields. */
+  public Gram() {
+
+  }
+
+  /**
+   * Copy constructor.
+   *
+   * @param other
+   *          the gram whose string, frequency and position are copied
+   */
+  public Gram(Gram other) {
+    this.gram = other.gram;
+    this.frequency = other.frequency;
+    this.position = other.position;
+  }
+
+  /**
+   * Create a gram that is at the head of its text unit with a frequency of 1
+   *
+   * @param ngram
+   *          the gram string
+   */
+  public Gram(String ngram) {
+    this(ngram, 1, Position.HEAD);
+  }
+
+  /**
+   * Create a gram with a frequency of 1
+   *
+   * @param ngram
+   *          the gram string
+   * @param position
+   *          position of the gram in its text unit.
+   */
+  public Gram(String ngram, Position position) {
+    this(ngram, 1, position);
+  }
+
+  /**
+   * Create a gram that is at the head of its text unit
+   *
+   * @param ngram
+   *          the gram string
+   * @param frequency
+   *          the gram frequency
+   */
+  public Gram(String ngram, int frequency) {
+    this(ngram, frequency, Position.HEAD);
+  }
+
+  /**
+   * Create a gram with the given string, frequency and position
+   *
+   * @param ngram
+   *          the gram string
+   * @param frequency
+   *          the gram frequency
+   * @param position
+   *          position of the gram in its text unit.
+   */
+  public Gram(String ngram, int frequency, Position position) {
+    this.gram = ngram;
+    this.frequency = frequency;
+    this.position = position;
+  }
+
+  /**
+   * @return position of gram in the text unit.
+   */
+  public Position getPosition() {
+    return this.position;
+  }
+
+  /**
+   * @param position
+   *          position of the gram in the text unit.
+   */
+  public void setPosition(Position position) {
+    this.position = position;
+  }
+
+  /**
+   * @return gram term string
+   */
+  public String getString() {
+    return gram;
+  }
+
+  /**
+   * @param str
+   *          gram term string
+   */
+  public void setString(String str) {
+    this.gram = str;
+  }
+
+  /**
+   * @return gram frequency
+   */
+  public int getFrequency() {
+    return frequency;
+  }
+
+  /**
+   * @param frequency
+   *          gram's frequency
+   */
+  public void setFrequency(int frequency) {
+    this.frequency = frequency;
+  }
+
+  /**
+   * @param i
+   *          amount to add to the gram's frequency
+   */
+  public void incrementFrequency(int i) {
+    this.frequency += i;
+  }
+
+  /**
+   * Reads the gram in the layout produced by write: frequency (int), head
+   * flag (boolean), byte length (int), then the string bytes.
+   */
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    frequency = in.readInt();
+    position = in.readBoolean() ? Position.HEAD : Position.TAIL;
+
+    byte[] entry = new byte[in.readInt()];
+    in.readFully(entry);
+    gram = Bytes.toString(entry);
+  }
+
+  /**
+   * Writes frequency (int), head flag (boolean), byte length (int), then the
+   * string bytes. Any position other than HEAD is written as "not head", so
+   * it reads back as TAIL.
+   */
+  @Override
+  public void write(DataOutput out) throws IOException {
+    out.writeInt(frequency);
+    out.writeBoolean(position == Position.HEAD);
+
+    byte[] data = Bytes.toBytes(gram);
+    out.writeInt(data.length);
+    out.write(data);
+  }
+
+  /**
+   * Orders by gram string first; for equal strings a HEAD gram sorts before
+   * a gram that is not HEAD. Frequency is not considered.
+   */
+  @Override
+  public int compareTo(Gram other) {
+    int byString = getString().compareTo(other.getString());
+    if (byString != 0) {
+      return byString;
+    }
+
+    boolean thisIsHead = this.position == Position.HEAD;
+    boolean otherIsHead = other.position == Position.HEAD;
+    if (thisIsHead == otherIsHead) {
+      return 0;
+    }
+    return thisIsHead ? -1 : 1;
+  }
+
+  /** Generates hashcode, does not include frequency in the hash calculation */
+  @Override
+  public int hashCode() {
+    int gramHash = (gram == null) ? 0 : gram.hashCode();
+    int positionHash = (position == null) ? 0 : position.hashCode();
+    return 31 * (31 + gramHash) + positionHash;
+  }
+
+  /**
+   * Determines equality, does not include frequency in the equality calculation
+   */
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    }
+    if (obj == null || getClass() != obj.getClass()) {
+      return false;
+    }
+
+    Gram that = (Gram) obj;
+    boolean sameGram =
+        (gram == null) ? that.gram == null : gram.equals(that.gram);
+    if (!sameGram) {
+      return false;
+    }
+    return (position == null) ? that.position == null
+        : position.equals(that.position);
+  }
+
+  /** Renders as 'gram'[h]:frequency, or [t] for any gram that is not HEAD. */
+  @Override
+  public String toString() {
+    String marker = (position == Position.HEAD) ? "h" : "t";
+    return new StringBuilder()
+        .append("'").append(gram).append("'[")
+        .append(marker)
+        .append("]:").append(frequency)
+        .toString();
+  }
+
+}
Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/LLRReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/LLRReducer.java?rev=907938&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/LLRReducer.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/LLRReducer.java Tue Feb 9 05:49:18 2010
@@ -0,0 +1,173 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.nlp.collocations.llr;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.mahout.math.stats.LogLikelihood;
+import org.apache.mahout.utils.nlp.collocations.llr.Gram.Position;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Reducer for pass 2 of the collocation discovery job. Collects ngram and
+ * sub-ngram frequencies and performs the Log-likelihood ratio calculation.
+ */
+public class LLRReducer extends MapReduceBase implements
+    Reducer<Gram,Gram,DoubleWritable,Text> {
+
+  /**
+   * Counters incremented when an ngram is skipped because it did not arrive
+   * with exactly one head and one tail subgram.
+   */
+  public static enum Skipped {
+    EXTRA_HEAD,
+    EXTRA_TAIL,
+    MISSING_HEAD,
+    MISSING_TAIL
+  }
+
+  private static final Logger log = LoggerFactory.getLogger(LLRReducer.class);
+
+  /** Job configuration key under which the total ngram count is passed. */
+  public static final String NGRAM_TOTAL = "ngramTotal";
+
+  long ngramTotal;
+  private final LLCallback ll;
+
+  public LLRReducer() {
+    this.ll = new ConcreteLLCallback();
+  }
+
+  /**
+   * plug in an alternate LL implementation, used for testing
+   *
+   * @param ll
+   *          the LL to use.
+   * @see LLCallback
+   */
+  LLRReducer(LLCallback ll) {
+    this.ll = ll;
+  }
+
+  /**
+   * Reads the total ngram count from the job configuration.
+   *
+   * @param job
+   *          the job configuration; must carry a value for NGRAM_TOTAL
+   * @throws IllegalStateException
+   *           if NGRAM_TOTAL is absent from the configuration
+   */
+  @Override
+  public void configure(JobConf job) {
+    super.configure(job);
+    this.ngramTotal = job.getLong(NGRAM_TOTAL, -1);
+
+    log.info("NGram Total is {}", ngramTotal);
+
+    if (ngramTotal == -1) {
+      throw new IllegalStateException("No NGRAM_TOTAL available in job config");
+    }
+  }
+
+  /**
+   * Perform LLR calculation, input is: k:ngram:ngramFreq
+   * v:(h_|t_)subgram:subgramfreq N = ngram total
+   *
+   * Each ngram will have 2 subgrams, a head and a tail, referred to as A and B
+   * respectively below.
+   *
+   * <pre>
+   * k11  A+ B: number of times A and B appear together: ngramFreq
+   * k12  A+!B: number of times A appears without B: hSubgramFreq - ngramFreq
+   * k21 !A+ B: number of times B appears without A: tSubgramFreq - ngramFreq
+   * k22 !A+!B: number of times neither A nor B appears (in that order):
+   *            N - (hSubgramFreq + tSubgramFreq - ngramFreq)
+   * </pre>
+   *
+   * The ngram is skipped, without output, when it has more than one head or
+   * tail (a Skipped counter is incremented), when its head or tail is missing
+   * (a Skipped counter is incremented), when k22 does not fit in an int, or
+   * when the LLR implementation rejects the counts.
+   *
+   * @param key
+   *          the ngram and its frequency
+   * @param values
+   *          the head and tail subgrams with their frequencies
+   * @param output
+   *          receives (llr score, ngram string)
+   * @param reporter
+   *          used to record skipped ngrams
+   */
+  @Override
+  public void reduce(Gram key,
+                     Iterator<Gram> values,
+                     OutputCollector<DoubleWritable,Text> output,
+                     Reporter reporter) throws IOException {
+
+    Gram ngram = key;
+    // index 0 holds the head subgram, index 1 the tail; -1 marks "not seen"
+    String[] gram = new String[2];
+    int[] gramFreq = {-1, -1};
+
+    // FIXME: better way to handle errors? Wouldn't an exception thrown here
+    // cause hadoop to re-try the job?
+    while (values.hasNext()) {
+      Gram value = values.next();
+
+      int pos = (value.getPosition() == Position.HEAD ? 0 : 1);
+
+      if (gramFreq[pos] != -1) {
+        log.warn("Extra {} for {}, skipping", value.getPosition(), ngram);
+        if (value.getPosition() == Position.HEAD) {
+          reporter.incrCounter(Skipped.EXTRA_HEAD, 1);
+        } else {
+          reporter.incrCounter(Skipped.EXTRA_TAIL, 1);
+        }
+        return;
+      }
+
+      gram[pos] = value.getString();
+      gramFreq[pos] = value.getFrequency();
+    }
+
+    if (gramFreq[0] == -1) {
+      log.warn("Missing head for {}, skipping.", ngram);
+      reporter.incrCounter(Skipped.MISSING_HEAD, 1);
+      return;
+    } else if (gramFreq[1] == -1) {
+      log.warn("Missing tail for {}, skipping", ngram);
+      reporter.incrCounter(Skipped.MISSING_TAIL, 1);
+      return;
+    }
+
+    int ngramFreq = ngram.getFrequency();
+    int k11 = ngramFreq;               /* a+b */
+    int k12 = gramFreq[0] - ngramFreq; /* a+!b */
+    int k21 = gramFreq[1] - ngramFreq; /* !a+b */
+
+    // Work in long: the subgram frequencies can overflow when added as ints,
+    // and the ngram total is itself a long. The LLCallback contract takes
+    // ints, so a value that does not fit is reported instead of being
+    // silently truncated by a cast.
+    long k22Wide = ngramTotal - ((long) gramFreq[0] + gramFreq[1] - ngramFreq);
+    if (k22Wide < Integer.MIN_VALUE || k22Wide > Integer.MAX_VALUE) {
+      log.error("Cannot calculate LLR for {}: k22 value {} does not fit in an int",
+        ngram, k22Wide);
+      return;
+    }
+    int k22 = (int) k22Wide;           /* !a+!b */
+
+    try {
+      double llr = ll.logLikelihoodRatio(k11, k12, k21, k22);
+      DoubleWritable dd = new DoubleWritable(llr);
+      Text t = new Text(ngram.getString());
+      output.collect(dd, t);
+    } catch (IllegalArgumentException ex) {
+      // pass the exception along so the stack trace is not lost
+      log.error("Problem calculating LLR ratio: " + ex.getMessage(), ex);
+      log.error("NGram: {}", ngram);
+      log.error("HEAD: {}:{}", gram[0], gramFreq[0]);
+      log.error("TAIL: {}:{}", gram[1], gramFreq[1]);
+      log.error("k11: " + k11 + " k12: " + k12 + " k21: " + k21 + " k22: "
+                + k22);
+    }
+  }
+
+  /**
+   * provide interface so the input to the llr calculation can be captured for
+   * validation in unit testing
+   */
+  public static interface LLCallback {
+    public double logLikelihoodRatio(int k11, int k12, int k21, int k22);
+  }
+
+  /** concrete implementation delegates to LogLikelihood class */
+  public static final class ConcreteLLCallback implements LLCallback {
+    @Override
+    public double logLikelihoodRatio(int k11, int k12, int k21, int k22) {
+      return LogLikelihood.logLikelihoodRatio(k11, k12, k21, k22);
+    }
+  }
+}
Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/NGramCollector.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/NGramCollector.java?rev=907938&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/NGramCollector.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/NGramCollector.java Tue Feb 9 05:49:18 2010
@@ -0,0 +1,179 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.nlp.collocations.llr;
+
+import static org.apache.mahout.utils.nlp.collocations.llr.Gram.Position.HEAD;
+import static org.apache.mahout.utils.nlp.collocations.llr.Gram.Position.TAIL;
+import static org.apache.mahout.utils.nlp.collocations.llr.NGramCollector.Count.NGRAM_TOTAL;
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.shingle.ShingleFilter;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.Version;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Performs tokenization, ngram generation + collection for the first pass of
+ * the LLR collocation discovery job. Factors this code out of the mappers so
+ * that different input formats can be supported.
+ *
+ * @see org.apache.mahout.utils.nlp.collocations.llr.colloc.CollocMapperTextFile
+ */
+public class NGramCollector {
+
+ public static final String ANALYZER_CLASS = "analyzerClass";
+ public static final String MAX_SHINGLE_SIZE = "maxShingleSize";
+
+ public static enum Count {
+ NGRAM_TOTAL;
+ }
+
+ private static final Logger log = LoggerFactory
+ .getLogger(NGramCollector.class);
+
+ /**
+ * An analyzer to perform tokenization. A ShingleFilter will be wrapped around
+ * its output to create ngrams
+ */
+ private Analyzer a;
+
+ /** max size of shingles (ngrams) to create */
+ private int maxShingleSize;
+
+ public NGramCollector() {}
+
+ /**
+ * Configure the NGramCollector.
+ *
+ * Reads NGramCollector.ANALYZER_CLASS and instantiates that class if it is
+ * provided. Otherwise a lucene StandardAnalyzer will be used that is set to
+ * be compatible to LUCENE_24.
+ *
+ * Reads NGramCollector.MAX_SHINGLE_SIZE and uses this as the parameter to the
+ * ShingleFilter.
+ *
+ * @param job
+ */
+ public void configure(JobConf job) {
+ this.a = null;
+ try {
+ ClassLoader ccl = Thread.currentThread().getContextClassLoader();
+ String analyzerClass = job.get(NGramCollector.ANALYZER_CLASS);
+ if (analyzerClass != null) {
+ Class<?> cl = ccl.loadClass(analyzerClass);
+ a = (Analyzer) cl.newInstance();
+ }
+ } catch (ClassNotFoundException e) {
+ throw new IllegalStateException(e);
+ } catch (InstantiationException e) {
+ throw new IllegalStateException(e);
+ } catch (IllegalAccessException e) {
+ throw new IllegalStateException(e);
+ }
+
+ if (this.a == null) {
+ // No analyzer specified. Use the LUCENE_24 analzer here because
+ // it does not preserve stop word positions.
+ this.a = new StandardAnalyzer(Version.LUCENE_24);
+ }
+
+ this.maxShingleSize = job.getInt(NGramCollector.MAX_SHINGLE_SIZE, 2);
+
+ if (log.isInfoEnabled()) {
+ log.info("Analyzer is {}", this.a.toString());
+ log.info("Max Ngram size is {}", this.maxShingleSize);
+ }
+ }
+
+ /**
+ * Receives a document and uses a lucene analyzer to tokenize them. The
+ * ShingleFilter delivers ngrams of the appropriate size which aren then
+ * decomposed into head and tail subgrams which are collected in the following
+ * manner
+ *
+ * k:h_subgram v:ngram k:t_subgram v:ngram
+ *
+ * The 'h_' or 't_' prefix is used to specify whether the subgram in question
+ * is the head or tail of the ngram. In this implementation the head of the
+ * ngram is a (n-1)gram, and the tail is a (1)gram.
+ *
+ * For example, given 'click and clack' and an ngram length of 3: k:'h_click
+ * and' v:'clack and clack' k;'t_clack' v:'click and clack'
+ *
+ * Also counts the total number of ngrams encountered and adds it to the
+ * counter CollocDriver.Count.NGRAM_TOTAL
+ *
+ * @param r
+ * The reader to read input from -- used to create a tokenstream from
+ * the analyzer
+ *
+ * @param collector
+ * The collector to send output to
+ *
+ * @param reporter
+ * Used to deliver the final ngram-count.
+ *
+ * @throws IOException
+ * if there's a problem with the ShingleFilter reading data or the
+ * collector collecting output.
+ */
+ public void collectNgrams(Reader r,
+ OutputCollector<Gram,Gram> collector,
+ Reporter reporter) throws IOException {
+ TokenStream st = a.tokenStream("text", r);
+ ShingleFilter sf = new ShingleFilter(st, maxShingleSize);
+
+ sf.reset();
+ int count = 0; // ngram count
+
+ do {
+ String term = ((TermAttribute) sf.getAttribute(TermAttribute.class))
+ .term();
+ String type = ((TypeAttribute) sf.getAttribute(TypeAttribute.class))
+ .type();
+
+ if ("shingle".equals(type)) {
+ count++;
+ Gram ngram = new Gram(term);
+
+ // obtain components, the leading (n-1)gram and the trailing unigram.
+ int i = term.lastIndexOf(' ');
+ if (i != -1) {
+ collector.collect(new Gram(term.substring(0, i), HEAD), ngram);
+ collector.collect(new Gram(term.substring(i + 1), TAIL), ngram);
+ }
+ }
+ } while (sf.incrementToken());
+
+ reporter.incrCounter(NGRAM_TOTAL, count);
+
+ sf.end();
+ sf.close();
+ r.close();
+ }
+}
Added: lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducerTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducerTest.java?rev=907938&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducerTest.java (added)
+++ lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducerTest.java Tue Feb 9 05:49:18 2010
@@ -0,0 +1,86 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.nlp.collocations.llr;
+
+import static org.apache.mahout.utils.nlp.collocations.llr.Gram.Position.HEAD;
+import static org.apache.mahout.utils.nlp.collocations.llr.Gram.Position.TAIL;
+
+import java.util.LinkedList;
+import java.util.List;
+
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+import org.easymock.EasyMock;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Test the CollocReducer FIXME: add negative test cases.
+ */
+public class CollocReducerTest {
+
+ OutputCollector<Gram,Gram> output;
+ Reporter reporter;
+
+ @Before
+ @SuppressWarnings("unchecked")
+ public void setUp() {
+ output = EasyMock.createMock(OutputCollector.class);
+ reporter = EasyMock.createMock(Reporter.class);
+ }
+
+ @Test
+ public void testReduce() throws Exception {
+ // test input, input[*][0] is the key,
+ // input[*][1..n] are the values passed in via
+ // the iterator.
+ Gram[][] input = new Gram[][] {
+ { new Gram("the", HEAD), new Gram("the best"), new Gram("the worst") },
+ { new Gram("of", HEAD), new Gram("of times"), new Gram("of times") },
+ { new Gram("times", TAIL), new Gram("of times"), new Gram("of times") }
+ };
+
+ // expected results.
+ Gram[][] values = new Gram[][] {
+ { new Gram("the best", 1), new Gram("the", 2, HEAD) },
+ { new Gram("the worst", 1), new Gram("the", 2, HEAD) },
+ { new Gram("of times", 2), new Gram("of", 2, HEAD) },
+ { new Gram("of times", 2), new Gram("times", 2, TAIL) }
+ };
+
+ // set up expectations
+ for (Gram[] v : values) {
+ output.collect(v[0], v[1]);
+ }
+ EasyMock.replay(reporter, output);
+
+ // play back the input data.
+ CollocReducer c = new CollocReducer();
+
+ for (Gram[] ii : input) {
+ List<Gram> vv = new LinkedList<Gram>();
+ for (int i = 1; i < ii.length; i++) {
+ vv.add(ii[i]);
+ }
+ c.reduce(ii[0], vv.iterator(), output, reporter);
+ }
+
+ EasyMock.verify(reporter, output);
+ }
+
+}
Added: lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/GramTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/GramTest.java?rev=907938&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/GramTest.java (added)
+++ lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/GramTest.java Tue Feb 9 05:49:18 2010
@@ -0,0 +1,109 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.nlp.collocations.llr;
+
+import static org.apache.mahout.utils.nlp.collocations.llr.Gram.Position.HEAD;
+import static org.apache.mahout.utils.nlp.collocations.llr.Gram.Position.TAIL;
+
+import java.util.HashMap;
+
+import junit.framework.TestCase;
+
+import org.apache.mahout.utils.nlp.collocations.llr.Gram;
+import org.junit.Test;
+
+ /**
+  * Tests the equality and hashing behaviour of Gram as exercised below:
+  * frequency does not take part in equality, while the string and the
+  * position do.
+  */
+ public class GramTest {
+
+   /** Grams differing only in frequency are equal; differing string or position are not. */
+   @Test
+   public void testEquality() {
+     Gram one = new Gram("foo", 2, HEAD);
+     Gram two = new Gram("foo", 3, HEAD);
+
+     TestCase.assertTrue(one.equals(two));
+     TestCase.assertTrue(two.equals(one));
+
+     Gram three = new Gram("foo", 4, TAIL);
+     Gram four = new Gram("foo");
+
+     TestCase.assertFalse(three.equals(two));
+     TestCase.assertTrue(four.equals(one));
+     TestCase.assertTrue(one.equals(four));
+
+     Gram five = new Gram("foobar", 4, TAIL);
+
+     TestCase.assertFalse(five.equals(four));
+     TestCase.assertFalse(five.equals(three));
+     TestCase.assertFalse(five.equals(two));
+     TestCase.assertFalse(five.equals(one));
+   }
+
+   /**
+    * Uses Grams as hash keys: the first Gram of each equal group is stored and
+    * accumulates the frequencies of the later, equal ones.
+    */
+   @Test
+   public void testHashing() {
+     Gram[] input =
+     {
+      new Gram("foo", 2, HEAD),
+      new Gram("foo", 3, HEAD),
+      new Gram("foo", 4, TAIL),
+      new Gram("foo", 5, TAIL),
+      new Gram("bar", 6, HEAD),
+      new Gram("bar", 7, TAIL),
+      new Gram("bar", 8),
+      new Gram("bar")
+     };
+
+     HashMap<Gram,Gram> map = new HashMap<Gram,Gram>();
+     for (Gram n : input) {
+       Gram val = map.get(n);
+       if (val != null) {
+         val.incrementFrequency(n.getFrequency());
+       } else {
+         map.put(n, n);
+       }
+     }
+
+     // frequencies of the items in the map.
+     int[] freq = {
+                   5,
+                   3,
+                   9,
+                   5,
+                   15,
+                   7,
+                   8,
+                   1
+     };
+
+     // true if the index should be the item in the map
+     boolean[] memb = {
+                       true,
+                       false,
+                       true,
+                       false,
+                       true,
+                       true,
+                       false,
+                       false
+     };
+
+     for (int i = 0; i < input.length; i++) {
+       TestCase.assertEquals(freq[i], input[i].getFrequency());
+       TestCase.assertEquals(memb[i], input[i] == map.get(input[i]));
+     }
+   }
+ }
Added: lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/LLRReducerTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/LLRReducerTest.java?rev=907938&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/LLRReducerTest.java (added)
+++ lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/LLRReducerTest.java Tue Feb 9 05:49:18 2010
@@ -0,0 +1,130 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.nlp.collocations.llr;
+
+import static org.apache.mahout.utils.nlp.collocations.llr.Gram.Position.HEAD;
+import static org.apache.mahout.utils.nlp.collocations.llr.Gram.Position.TAIL;
+
+import java.io.IOException;
+import java.util.LinkedList;
+import java.util.List;
+
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.mahout.math.stats.LogLikelihood;
+import org.apache.mahout.utils.nlp.collocations.llr.LLRReducer.LLCallback;
+import org.easymock.EasyMock;
+import org.junit.Before;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/** Test the LLRReducer
+ * FIXME: Add negative test cases.
+ */
+@SuppressWarnings("deprecation")
+public class LLRReducerTest {
+
+ private static final Logger log =
+ LoggerFactory.getLogger(LLRReducerTest.class);
+
+ Reporter reporter;
+ LLCallback ll;
+ LLCallback cl;
+ // not verifying the llr algo output here, just the input, but it is handy
+ // to see the values emitted.
+ OutputCollector<DoubleWritable,Text> collector = new OutputCollector<DoubleWritable, Text>() {
+ @Override
+ public void collect(DoubleWritable key, Text value) throws IOException {
+ log.info(key.toString() + " " + value.toString());
+ }
+ };
+
+
+ @Before
+ public void setUp() {
+ reporter = EasyMock.createMock(Reporter.class);
+ ll = EasyMock.createMock(LLCallback.class);
+ cl = new LLCallback() {
+ @Override
+ public double logLikelihoodRatio(int k11, int k12, int k21, int k22) {
+ log.info("k11:" + k11 + " k12:" + k12 + " k21:" + k21 + " k22:" + k22);
+ try {
+ return LogLikelihood.logLikelihoodRatio(k11, k12, k21, k22);
+ }
+ catch (Exception e) {
+ e.printStackTrace();
+ return -1;
+ }
+ }
+
+ };
+ }
+
+ @Test
+ public void testReduce() throws Exception {
+ LLRReducer reducer = new LLRReducer(ll);
+
+ // test input, input[*][0] is the key,
+ // input[*][1..n] are the values passed in via
+ // the iterator.
+
+
+ Gram[][] input = {
+ {new Gram("the best", 1), new Gram("the", 2, HEAD), new Gram("best", 1, TAIL) },
+ {new Gram("best of", 1), new Gram("best", 1, HEAD), new Gram("of", 2, TAIL) },
+ {new Gram("of times", 2), new Gram("of", 2, HEAD), new Gram("times", 2, TAIL) },
+ {new Gram("times the", 1), new Gram("times", 1, HEAD), new Gram("the", 1, TAIL) },
+ {new Gram("the worst", 1), new Gram("the", 2, HEAD), new Gram("worst", 1, TAIL) },
+ {new Gram("worst of", 1), new Gram("worst", 1, HEAD), new Gram("of", 2, TAIL) }
+ };
+
+ int[][] expectations = {
+ // A+B, A+!B, !A+B, !A+!B
+ {1, 1, 0, 5}, // the best
+ {1, 0, 1, 5}, // best of
+ {2, 0, 0, 5}, // of times
+ {1, 0, 0, 6}, // times the
+ {1, 1, 0, 5}, // the worst
+ {1, 0, 1, 5} // worst of
+ };
+
+ for (int[] ee: expectations) {
+ EasyMock.expect(ll.logLikelihoodRatio(ee[0], ee[1], ee[2], ee[3])).andDelegateTo(cl);
+ }
+
+ EasyMock.replay(ll);
+
+ JobConf config = new JobConf(CollocDriver.class);
+ config.set(LLRReducer.NGRAM_TOTAL, "7");
+ reducer.configure(config);
+
+ for (Gram[] ii: input) {
+ List<Gram> vv = new LinkedList<Gram>();
+ for (int i = 1; i < ii.length; i++) {
+ vv.add(ii[i]);
+ }
+ reducer.reduce(ii[0], vv.iterator(), collector, reporter);
+ }
+
+ EasyMock.verify(ll);
+ }
+}
Added: lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/NGramCollectorTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/NGramCollectorTest.java?rev=907938&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/NGramCollectorTest.java (added)
+++ lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/NGramCollectorTest.java Tue Feb 9 05:49:18 2010
@@ -0,0 +1,118 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.nlp.collocations.llr;
+
+import static org.apache.mahout.utils.nlp.collocations.llr.Gram.Position.HEAD;
+import static org.apache.mahout.utils.nlp.collocations.llr.Gram.Position.TAIL;
+import static org.apache.mahout.utils.nlp.collocations.llr.NGramCollector.Count.NGRAM_TOTAL;
+
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.Collections;
+
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.util.Version;
+import org.apache.mahout.utils.nlp.collocations.llr.Gram.Position;
+import org.easymock.EasyMock;
+import org.junit.Before;
+import org.junit.Test;
+
+ /**
+  * Test for NGramCollector
+  * FIXME: Add negative test cases
+  */
+ @SuppressWarnings("deprecation")
+ public class NGramCollectorTest {
+
+   OutputCollector<Gram,Gram> collector;
+   Reporter reporter;
+
+   /** Creates fresh mocks for the collector and the reporter before each test. */
+   @Before
+   @SuppressWarnings("unchecked")
+   public void setUp() {
+     collector = EasyMock.createMock(OutputCollector.class);
+     reporter = EasyMock.createMock(Reporter.class);
+   }
+
+   /**
+    * Tokenizes one sentence with a maximum shingle size of 2 and verifies the
+    * (subgram, ngram) pairs collected as well as the ngram total reported.
+    */
+   @Test
+   public void testCollectNgrams() throws Exception {
+
+     String input = "the best of times the worst of times";
+
+     // expected output: { subgram with 'h_' (head) or 't_' (tail) prefix, ngram }
+     String[][] values =
+       new String[][]{
+         {"h_the", "the best"},
+         {"t_best", "the best"},
+         {"h_best", "best of"},
+         {"t_of", "best of"},
+         {"h_of", "of times"},
+         {"t_times", "of times"},
+         {"h_times", "times the"},
+         {"t_the", "times the"},
+         {"h_the", "the worst"},
+         {"t_worst", "the worst"},
+         {"h_worst", "worst of"},
+         {"t_of", "worst of"},
+         {"h_of", "of times"},
+         {"t_times", "of times"}
+       };
+
+     // set up expectations for mocks. ngram max size = 2
+     for (String[] v : values) {
+       Position p = v[0].startsWith("h") ? HEAD : TAIL;
+       // strip the two-character 'h_' / 't_' marker to get the subgram text
+       Gram subgram = new Gram(v[0].substring(2), p);
+       Gram ngram = new Gram(v[1]);
+       collector.collect(subgram, ngram);
+     }
+
+     // the eight input words yield seven bigrams
+     reporter.incrCounter(NGRAM_TOTAL, 7);
+     EasyMock.replay(reporter, collector);
+
+     Reader r = new StringReader(input);
+
+     JobConf conf = new JobConf();
+     conf.set(NGramCollector.MAX_SHINGLE_SIZE, "2");
+     conf.set(NGramCollector.ANALYZER_CLASS, TestAnalyzer.class.getName());
+
+     NGramCollector c = new NGramCollector();
+     c.configure(conf);
+
+     c.collectNgrams(r, collector, reporter);
+
+     EasyMock.verify(reporter, collector);
+   }
+
+   /**
+    * A lucene 2.9 standard analyzer with no stopwords. Public with a no-arg
+    * constructor because NGramCollector instantiates it by class name.
+    */
+   public static class TestAnalyzer extends Analyzer {
+     private final Analyzer a;
+
+     public TestAnalyzer() {
+       // typed empty set instead of the raw Collections.EMPTY_SET constant
+       a = new StandardAnalyzer(Version.LUCENE_29, Collections.emptySet());
+     }
+
+     @Override
+     public TokenStream tokenStream(String arg0, Reader arg1) {
+       return a.tokenStream(arg0, arg1);
+     }
+   }
+ }
|