mahout-commits mailing list archives

From robina...@apache.org
Subject svn commit: r907938 - in /lucene/mahout/trunk/utils: ./ src/main/java/org/apache/mahout/utils/nlp/ src/main/java/org/apache/mahout/utils/nlp/collocations/ src/main/java/org/apache/mahout/utils/nlp/collocations/llr/ src/test/java/org/apache/mahout/utils...
Date Tue, 09 Feb 2010 05:49:19 GMT
Author: robinanil
Date: Tue Feb  9 05:49:18 2010
New Revision: 907938

URL: http://svn.apache.org/viewvc?rev=907938&view=rev
Log:
MAHOUT-242 NGram Collocation using LLR (Drew Farris)

Added:
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocCombiner.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapper.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducer.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/Gram.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/LLRReducer.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/NGramCollector.java
    lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/
    lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/
    lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/
    lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducerTest.java
    lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/GramTest.java
    lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/LLRReducerTest.java
    lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/NGramCollectorTest.java
Modified:
    lucene/mahout/trunk/utils/pom.xml

Modified: lucene/mahout/trunk/utils/pom.xml
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/pom.xml?rev=907938&r1=907937&r2=907938&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/pom.xml (original)
+++ lucene/mahout/trunk/utils/pom.xml Tue Feb  9 05:49:18 2010
@@ -149,6 +149,16 @@
       <artifactId>lucene-core</artifactId>
     </dependency>
 
+    <dependency>
+      <groupId>org.easymock</groupId>
+      <artifactId>easymock</artifactId>
+    </dependency>
+
+    <dependency>
+      <groupId>org.easymock</groupId>
+      <artifactId>easymockclassextension</artifactId>
+    </dependency>
+
   </dependencies>
 
 

Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocCombiner.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocCombiner.java?rev=907938&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocCombiner.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocCombiner.java Tue Feb  9 05:49:18 2010
@@ -0,0 +1,81 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.nlp.collocations.llr;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Iterator;
+
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+
+public class CollocCombiner extends MapReduceBase implements
+    Reducer<Gram,Gram,Gram,Gram> {
+  
+  /**
+   * Collocation finder: pass 1 combiner phase.
+   * 
+   * Given input from the mapper (k:h_subgram:1 v:ngram:1, k:t_subgram:1
+   * v:ngram:1), counts ngrams and subgrams.
+   * 
+   * Output is:
+   * 
+   * k:h_subgram:subgramfreq v:ngram:ngramfreq, k:t_subgram:subgramfreq
+   * v:ngram:ngramfreq
+   * 
+   * Each ngram's frequency is essentially counted twice; the frequency should
+   * be the same for the head and tail. TODO: count only for the head and move
+   * the count into the value?
+   */
+  @Override
+  public void reduce(Gram key,
+                     Iterator<Gram> value,
+                     OutputCollector<Gram,Gram> output,
+                     Reporter reporter) throws IOException {
+    
+    HashMap<Gram,Gram> set = new HashMap<Gram,Gram>();
+    int subgramFrequency = 0;
+    
+    while (value.hasNext()) {
+      Gram t = value.next();
+      subgramFrequency += t.getFrequency();
+      
+      Gram s = set.get(t);
+      if (s == null) {
+        // t is potentially reused, so create a new object to populate the
+        // HashMap
+        Gram e = new Gram(t);
+        set.put(e, e);
+      } else {
+        s.incrementFrequency(t.getFrequency());
+      }
+    }
+    
+    // emit subgram:subgramFreq ngram:ngramFreq pairs
+    key.setFrequency(subgramFrequency);
+    
+    for (Gram t : set.keySet()) {
+      output.collect(key, t);
+    }
+  }
+  
+}
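
For a concrete view of the combiner contract described above, here is a minimal
sketch that drives CollocCombiner.reduce directly with a throwaway
OutputCollector; the class name CombinerSketch and the sample grams are
illustrative, not part of this commit:

    import java.util.Arrays;
    import java.util.Iterator;

    import org.apache.hadoop.mapred.OutputCollector;
    import org.apache.mahout.utils.nlp.collocations.llr.CollocCombiner;
    import org.apache.mahout.utils.nlp.collocations.llr.Gram;
    import org.apache.mahout.utils.nlp.collocations.llr.Gram.Position;

    public class CombinerSketch {
      public static void main(String[] args) throws Exception {
        // the mapper emitted the head subgram 'the' once per ngram, frequency 1 each
        Gram key = new Gram("the", Position.HEAD);
        Iterator<Gram> values =
            Arrays.asList(new Gram("the best"), new Gram("the worst")).iterator();

        // capture output instead of writing to a SequenceFile
        OutputCollector<Gram,Gram> out = new OutputCollector<Gram,Gram>() {
          @Override
          public void collect(Gram k, Gram v) {
            // prints 'the'[h]:2 paired with each distinct ngram at frequency 1
            System.out.println(k + " " + v);
          }
        };

        // the combiner never touches the Reporter, so null is safe in this sketch
        new CollocCombiner().reduce(key, values, out, null);
      }
    }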

Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java?rev=907938&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java Tue Feb  9 05:49:18 2010
@@ -0,0 +1,205 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.nlp.collocations.llr;
+
+import static org.apache.mahout.utils.nlp.collocations.llr.NGramCollector.Count.NGRAM_TOTAL;
+
+import java.io.IOException;
+
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.RunningJob;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.apache.hadoop.mapred.TextOutputFormat;
+import org.apache.hadoop.mapred.lib.IdentityMapper;
+import org.apache.mahout.common.CommandLineUtil;
+import org.apache.mahout.common.HadoopUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/** Driver for LLR collocation discovery mapreduce job */
+public class CollocDriver {
+  
+  public static final String DEFAULT_OUTPUT_DIRECTORY = "output";
+  public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
+  
+  private static final Logger log = LoggerFactory.getLogger(CollocDriver.class);
+  
+  /**
+   * @param args
+   *          command-line arguments
+   */
+  public static void main(String[] args) throws Exception {
+    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
+    ArgumentBuilder abuilder = new ArgumentBuilder();
+    GroupBuilder gbuilder = new GroupBuilder();
+    
+    Option inputOpt = obuilder.withLongName("input").withRequired(true)
+        .withArgument(
+          abuilder.withName("input").withMinimum(1).withMaximum(1).create())
+        .withDescription("The Path for input files.").withShortName("i")
+        .create();
+    
+    Option outputOpt = obuilder.withLongName("output").withRequired(true)
+        .withArgument(
+          abuilder.withName("output").withMinimum(1).withMaximum(1).create())
+        .withDescription("The Path write output to").withShortName("o")
+        .create();
+    
+    Option maxNGramSizeOpt = obuilder
+        .withLongName("maxNGramSize")
+        .withRequired(false)
+        .withArgument(
+          abuilder.withName("size").withMinimum(1).withMaximum(1).create())
+        .withDescription(
+          "The maximum size of ngrams to create (2 = bigrams, 3 = trigrams, etc)")
+        .withShortName("n").create();
+    
+    Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(
+      false).withDescription("If set, overwrite the output directory")
+        .withShortName("w").create();
+    
+    Option analyzerNameOpt = obuilder.withLongName("analyzerName")
+        .withRequired(false).withArgument(
+          abuilder.withName("analyzerName").withMinimum(1).withMaximum(1)
+              .create()).withDescription(
+          "Class name of analyzer to use for tokenization").withShortName("a")
+        .create();
+    
+    Option helpOpt = obuilder.withLongName("help").withDescription(
+      "Print out help").withShortName("h").create();
+    
+    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(
+      outputOpt).withOption(maxNGramSizeOpt).withOption(overwriteOutput)
+        .withOption(analyzerNameOpt).withOption(helpOpt).create();
+    
+    try {
+      Parser parser = new Parser();
+      parser.setGroup(group);
+      CommandLine cmdLine = parser.parse(args);
+      
+      if (cmdLine.hasOption(helpOpt)) {
+        CommandLineUtil.printHelp(group);
+        return;
+      }
+      
+      String input = cmdLine.getValue(inputOpt).toString();
+      String output = cmdLine.getValue(outputOpt).toString();
+      
+      int maxNGramSize = DEFAULT_MAX_NGRAM_SIZE;
+      
+      if (cmdLine.hasOption(maxNGramSizeOpt)) {
+        try {
+          maxNGramSize = Integer.parseInt(cmdLine.getValue(maxNGramSizeOpt)
+              .toString());
+        } catch (NumberFormatException ex) {
+          log.warn("Could not parse ngram size option");
+        }
+      }
+      
+      if (cmdLine.hasOption(overwriteOutput)) {
+        HadoopUtil.overwriteOutput(output);
+      }
+      
+      String analyzerName = null;
+      if (cmdLine.hasOption(analyzerNameOpt)) {
+        analyzerName = cmdLine.getValue(analyzerNameOpt).toString();
+      }
+      
+      // parse input and extract collocations
+      long ngramCount = runPass1(input, output, maxNGramSize, analyzerName);
+      
+      // tally collocations and perform LLR calculation
+      runPass2(ngramCount, output);
+      
+    } catch (OptionException e) {
+      log.error("Exception", e);
+      CommandLineUtil.printHelp(group);
+    }
+    
+  }
+  
+  /** pass1: generate collocations, ngrams */
+  public static long runPass1(String input,
+                              String output,
+                              int maxNGramSize,
+                              String analyzerClass) throws IOException {
+    JobConf conf = new JobConf(CollocDriver.class);
+    
+    conf.setMapOutputKeyClass(Gram.class);
+    conf.setMapOutputValueClass(Gram.class);
+    
+    conf.setOutputKeyClass(Gram.class);
+    conf.setOutputValueClass(Gram.class);
+    
+    conf.setCombinerClass(CollocCombiner.class);
+    
+    FileInputFormat.setInputPaths(conf, new Path(input));
+    Path outPath = new Path(output + "/pass1");
+    FileOutputFormat.setOutputPath(conf, outPath);
+    
+    conf.setInputFormat(SequenceFileInputFormat.class);
+    conf.setMapperClass(CollocMapper.class);
+    
+    conf.setOutputFormat(SequenceFileOutputFormat.class);
+    conf.setReducerClass(CollocReducer.class);
+    conf.set(NGramCollector.MAX_SHINGLE_SIZE, String.valueOf(maxNGramSize));
+    
+    if (analyzerClass != null) {
+      conf.set(NGramCollector.ANALYZER_CLASS, analyzerClass);
+    }
+    
+    RunningJob job = JobClient.runJob(conf);
+    return job.getCounters().findCounter(NGRAM_TOTAL).getValue();
+  }
+  
+  /** pass2: perform the LLR calculation */
+  public static void runPass2(long nGramTotal, String output) throws IOException {
+    JobConf conf = new JobConf(CollocDriver.class);
+    conf.set(LLRReducer.NGRAM_TOTAL, String.valueOf(nGramTotal));
+    
+    conf.setMapOutputKeyClass(Gram.class);
+    conf.setMapOutputValueClass(Gram.class);
+    
+    conf.setOutputKeyClass(DoubleWritable.class);
+    conf.setOutputValueClass(Text.class);
+    
+    FileInputFormat.setInputPaths(conf, new Path(output + "/pass1"));
+    Path outPath = new Path(output + "/colloc");
+    FileOutputFormat.setOutputPath(conf, outPath);
+    
+    conf.setMapperClass(IdentityMapper.class);
+    conf.setInputFormat(SequenceFileInputFormat.class);
+    conf.setOutputFormat(TextOutputFormat.class);
+    conf.setReducerClass(LLRReducer.class);
+    JobClient.runJob(conf);
+  }
+}
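
A hedged usage sketch of the two passes invoked programmatically rather than
through main(); the paths "docs" and "out" are illustrative:

    import org.apache.mahout.utils.nlp.collocations.llr.CollocDriver;

    public class CollocDriverSketch {
      public static void main(String[] args) throws Exception {
        // pass 1: tokenize SequenceFile<Text,Text> input under docs/ and count
        // ngrams and subgrams (bigrams, default analyzer)
        long ngramCount = CollocDriver.runPass1("docs", "out", 2, null);

        // pass 2: compute LLR scores from the pass-1 counts; text output lands
        // under out/colloc as score<TAB>ngram
        CollocDriver.runPass2(ngramCount, "out");
      }
    }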

Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapper.java?rev=907938&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapper.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapper.java Tue Feb  9 05:49:18 2010
@@ -0,0 +1,74 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.nlp.collocations.llr;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+
+/**
+ * Runs pass 1 of the collocation discovery job on input of
+ * SequenceFile<Text,Text>, where the key is a document id and the value is the
+ * document contents. Delegates to NGramCollector to perform tokenization,
+ * ngram creation and output collection.
+ * 
+ * @see org.apache.mahout.text.SequenceFilesFromDirectory
+ * @see org.apache.mahout.utils.nlp.collocations.llr.NGramCollector
+ */
+public class CollocMapper extends MapReduceBase implements
+    Mapper<Text,Text,Gram,Gram> {
+  
+  private final NGramCollector ngramCollector;
+  
+  public CollocMapper() {
+    ngramCollector = new NGramCollector();
+  }
+  
+  @Override
+  public void configure(JobConf job) {
+    super.configure(job);
+    ngramCollector.configure(job);
+  }
+  
+  /**
+   * Collocation finder: pass 1 map phase.
+   * 
+   * Receives full documents in the value and passes them to
+   * NGramCollector.collectNgrams.
+   * 
+   * @see org.apache.mahout.utils.nlp.collocations.llr.NGramCollector#collectNgrams(Reader,
+   *      OutputCollector, Reporter)
+   */
+  @Override
+  public void map(Text key,
+                  Text value,
+                  OutputCollector<Gram,Gram> collector,
+                  Reporter reporter) throws IOException {
+    
+    Reader r = new StringReader(value.toString());
+    ngramCollector.collectNgrams(r, collector, reporter);
+    
+  }
+}

Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducer.java?rev=907938&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducer.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducer.java Tue Feb  9 05:49:18 2010
@@ -0,0 +1,83 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.nlp.collocations.llr;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Iterator;
+
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+
+/**
+ * Reducer for Pass 1 of the collocation identification job. Generates counts
+ * for ngrams and subgrams.
+ */
+public class CollocReducer extends MapReduceBase implements
+    Reducer<Gram,Gram,Gram,Gram> {
+  
+  /**
+   * Collocation finder: pass 1 reduce phase.
+   * 
+   * Given input from the mapper (k:h_subgram v:ngram, k:t_subgram v:ngram),
+   * counts ngrams and subgrams.
+   * 
+   * Output is:
+   * 
+   * k:ngram:ngramfreq v:h_subgram:h_subgramfreq, k:ngram:ngramfreq
+   * v:t_subgram:t_subgramfreq
+   * 
+   * Each ngram's frequency is essentially counted twice; the frequency should
+   * be the same for the head and tail. TODO: count only for the head and move
+   * the count into the value?
+   */
+  @Override
+  public void reduce(Gram key,
+                     Iterator<Gram> value,
+                     OutputCollector<Gram,Gram> output,
+                     Reporter reporter) throws IOException {
+    
+    HashMap<Gram,Gram> set = new HashMap<Gram,Gram>();
+    int subgramFrequency = 0;
+    
+    while (value.hasNext()) {
+      Gram t = value.next();
+      subgramFrequency += t.getFrequency();
+      
+      Gram s = set.get(t);
+      if (s == null) {
+        // t is potentially reused, so create a new object to populate the
+        // HashMap
+        Gram e = new Gram(t);
+        set.put(e, e);
+      } else {
+        s.incrementFrequency(t.getFrequency());
+      }
+    }
+    
+    // emit ngram:ngramFreq, subgram:subgramFreq pairs.
+    key.setFrequency(subgramFrequency);
+    
+    for (Gram t : set.keySet()) {
+      output.collect(t, key);
+    }
+  }
+}

Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/Gram.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/Gram.java?rev=907938&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/Gram.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/Gram.java Tue Feb  9 05:49:18 2010
@@ -0,0 +1,235 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.nlp.collocations.llr;
+
+import static org.apache.mahout.utils.nlp.collocations.llr.Gram.Position.HEAD;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.io.WritableComparable;
+
+/**
+ * Writable for holding data generated from the collocation discovery jobs.
+ * Depending on the job configuration, a gram may be one or more words. In some
+ * contexts this is used to hold a complete ngram, while in others it holds a
+ * part of an existing ngram (subgram). Tracks the frequency of the gram and its
+ * position in the ngram in which it was found.
+ */
+public class Gram implements WritableComparable<Gram> {
+  
+  public static enum Position {
+    HEAD,
+    TAIL
+  };
+  
+  private String gram;
+  private int frequency;
+  private Position position;
+  
+  public Gram() {
+
+  }
+  
+  public Gram(Gram other) {
+    this.gram = other.gram;
+    this.frequency = other.frequency;
+    this.position = other.position;
+  }
+  
+  /**
+   * Create a gram that is at the head of its text unit with a frequency of 1.
+   * 
+   * @param ngram
+   *          the gram string
+   */
+  public Gram(String ngram) {
+    this(ngram, 1, HEAD);
+  }
+  
+  /**
+   * Create a gram with a frequency of 1.
+   * 
+   * @param ngram
+   *          the gram string
+   * @param position
+   *          the position of the gram in its text unit
+   */
+  public Gram(String ngram, Position position) {
+    this(ngram, 1, position);
+  }
+  
+  /**
+   * Create a gram at the head of its text unit with the given frequency.
+   * 
+   * @param ngram
+   *          the gram string
+   * @param frequency
+   *          the gram frequency
+   */
+  public Gram(String ngram, int frequency) {
+    this(ngram, frequency, HEAD);
+  }
+  
+  /**
+   * Create a gram with the given frequency and position.
+   * 
+   * @param ngram
+   *          the gram string
+   * @param frequency
+   *          the gram frequency
+   * @param position
+   *          the position of the gram in its text unit
+   */
+  public Gram(String ngram, int frequency, Position position) {
+    this.gram = ngram;
+    this.frequency = frequency;
+    this.position = position;
+  }
+  
+  /**
+   * @return position of gram in the text unit.
+   */
+  public Position getPosition() {
+    return this.position;
+  }
+  
+  /**
+   * @param position
+   *          position of the gram in the text unit.
+   */
+  public void setPosition(Position position) {
+    this.position = position;
+  }
+  
+  /**
+   * @return gram term string
+   */
+  public String getString() {
+    return gram;
+  }
+  
+  /**
+   * @param str
+   *          gram term string
+   */
+  public void setString(String str) {
+    this.gram = str;
+  }
+  
+  /**
+   * @return gram frequency
+   */
+  public int getFrequency() {
+    return frequency;
+  }
+  
+  /**
+   * @param frequency
+   *          gram's frequency
+   */
+  public void setFrequency(int frequency) {
+    this.frequency = frequency;
+  }
+  
+  public void incrementFrequency(int i) {
+    this.frequency += i;
+  }
+  
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    frequency = in.readInt();
+    boolean head = in.readBoolean();
+    
+    position = head ? Position.HEAD : Position.TAIL;
+    
+    int fieldLen = in.readInt();
+    byte[] entry = new byte[fieldLen];
+    in.readFully(entry);
+    gram = Bytes.toString(entry);
+  }
+  
+  @Override
+  public void write(DataOutput out) throws IOException {
+    out.writeInt(frequency);
+    
+    out.writeBoolean(position == Position.HEAD);
+    
+    byte[] data = Bytes.toBytes(gram);
+    out.writeInt(data.length);
+    out.write(data);
+    
+  }
+  
+  @Override
+  public int compareTo(Gram other) {
+    int ret = getString().compareTo(other.getString());
+    if (ret != 0) {
+      return ret;
+    }
+    
+    if (this.position == Position.HEAD && other.position != Position.HEAD) {
+      return -1;
+    }
+    
+    if (this.position != Position.HEAD && other.position == Position.HEAD) {
+      return 1;
+    }
+    
+    return 0;
+  }
+  
+  /** Generates hashcode, does not include frequency in the hash calculation */
+  @Override
+  public int hashCode() {
+    final int prime = 31;
+    int result = 1;
+    result = prime * result + ((gram == null) ? 0 : gram.hashCode());
+    result = prime * result + ((position == null) ? 0 : position.hashCode());
+    return result;
+  }
+  
+  /**
+   * Determines equality, does not include frequency in the equality calculation
+   */
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) return true;
+    if (obj == null) return false;
+    if (getClass() != obj.getClass()) return false;
+    Gram other = (Gram) obj;
+    if (gram == null) {
+      if (other.gram != null) return false;
+    } else if (!gram.equals(other.gram)) return false;
+    if (position == null) {
+      if (other.position != null) return false;
+    } else if (!position.equals(other.position)) return false;
+    return true;
+  }
+  
+  @Override
+  public String toString() {
+    return "'" + gram + "'[" + (position == Position.HEAD ? "h" : "t") + "]:"
+           + frequency;
+  }
+  
+}
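
A round-trip sketch of the Writable encoding above (an int frequency, a boolean
head flag, then length-prefixed gram bytes); the class name GramRoundTrip is
illustrative:

    import java.io.ByteArrayInputStream;
    import java.io.ByteArrayOutputStream;
    import java.io.DataInputStream;
    import java.io.DataOutputStream;

    import org.apache.mahout.utils.nlp.collocations.llr.Gram;
    import org.apache.mahout.utils.nlp.collocations.llr.Gram.Position;

    public class GramRoundTrip {
      public static void main(String[] args) throws Exception {
        Gram in = new Gram("of times", 2, Position.TAIL);

        // serialize: frequency, head flag, byte count, gram bytes
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        in.write(new DataOutputStream(buffer));

        // deserialize into a fresh instance
        Gram out = new Gram();
        out.readFields(new DataInputStream(
            new ByteArrayInputStream(buffer.toByteArray())));

        // equals()/hashCode() ignore frequency, so check it separately
        System.out.println(in.equals(out));                          // true
        System.out.println(in.getFrequency() == out.getFrequency()); // true
      }
    }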

Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/LLRReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/LLRReducer.java?rev=907938&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/LLRReducer.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/LLRReducer.java Tue Feb  9 05:49:18 2010
@@ -0,0 +1,173 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.nlp.collocations.llr;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.mahout.math.stats.LogLikelihood;
+import org.apache.mahout.utils.nlp.collocations.llr.Gram.Position;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Reducer for pass 2 of the collocation discovery job. Collects ngram and
+ * sub-ngram frequencies and performs the Log-likelihood ratio calculation.
+ */
+public class LLRReducer extends MapReduceBase implements
+    Reducer<Gram,Gram,DoubleWritable,Text> {
+  
+  public static enum Skipped {
+    EXTRA_HEAD,
+    EXTRA_TAIL,
+    MISSING_HEAD,
+    MISSING_TAIL;
+  };
+  
+  private static final Logger log = LoggerFactory.getLogger(LLRReducer.class);
+  
+  public static final String NGRAM_TOTAL = "ngramTotal";
+  
+  long ngramTotal;
+  private final LLCallback ll;
+  
+  public LLRReducer() {
+    this.ll = new ConcreteLLCallback();
+  }
+  
+  /**
+   * Plug in an alternate LL implementation; used for testing.
+   * 
+   * @param ll
+   *          the LL callback to use.
+   * @see org.apache.mahout.utils.nlp.collocations.llr.LLRReducer.LLCallback
+   */
+  LLRReducer(LLCallback ll) {
+    this.ll = ll;
+  }
+  
+  @Override
+  public void configure(JobConf job) {
+    super.configure(job);
+    this.ngramTotal = job.getLong(NGRAM_TOTAL, -1);
+    
+    log.info("NGram Total is " + ngramTotal);
+    
+    if (ngramTotal == -1) {
+      throw new RuntimeException("No NGRAM_TOTAL available in job config");
+    }
+  }
+  
+  /**
+   * Perform the LLR calculation. Input is k:ngram:ngramFreq
+   * v:(h_|t_)subgram:subgramFreq, with N = ngram total.
+   * 
+   * Each ngram has two subgrams, a head and a tail, referred to as A and B
+   * respectively below.
+   * 
+   * A+B: number of times A and B appear together: ngramFreq
+   * A+!B: number of times A appears without B: hSubgramFreq - ngramFreq
+   * !A+B: number of times B appears without A: tSubgramFreq - ngramFreq
+   * !A+!B: number of times neither A nor B appears (in that order):
+   * N - (hSubgramFreq + tSubgramFreq - ngramFreq)
+   */
+  @Override
+  public void reduce(Gram key,
+                     Iterator<Gram> values,
+                     OutputCollector<DoubleWritable,Text> output,
+                     Reporter reporter) throws IOException {
+    
+    Gram ngram = key;
+    String[] gram = new String[2];
+    int[] gramFreq = new int[2];
+    gramFreq[0] = gramFreq[1] = -1;
+    
+    // FIXME: better way to handle errors? Wouldn't an exception thrown here
+    // cause hadoop to re-try the job?
+    while (values.hasNext()) {
+      Gram value = values.next();
+      
+      int pos = (value.getPosition() == Position.HEAD ? 0 : 1);
+      
+      if (gramFreq[pos] != -1) {
+        log.warn("Extra {} for {}, skipping", value.getPosition(), ngram);
+        if (value.getPosition() == Position.HEAD) {
+          reporter.incrCounter(Skipped.EXTRA_HEAD, 1);
+        } else {
+          reporter.incrCounter(Skipped.EXTRA_TAIL, 1);
+        }
+        return;
+      }
+      
+      gram[pos] = value.getString();
+      gramFreq[pos] = value.getFrequency();
+    }
+    
+    if (gramFreq[0] == -1) {
+      log.warn("Missing head for {}, skipping.", ngram);
+      reporter.incrCounter(Skipped.MISSING_HEAD, 1);
+      return;
+    } else if (gramFreq[1] == -1) {
+      log.warn("Missing tail for {}, skipping", ngram);
+      reporter.incrCounter(Skipped.MISSING_TAIL, 1);
+      return;
+    }
+    
+    int k11 = ngram.getFrequency(); /* a+b */
+    int k12 = gramFreq[0] - ngram.getFrequency(); /* a+!b */
+    int k21 = gramFreq[1] - ngram.getFrequency(); /* !a+b */
+    int k22 = (int) (ngramTotal - (gramFreq[0] + gramFreq[1] - ngram
+        .getFrequency())); /* !a+!b */
+    
+    try {
+      double llr = ll.logLikelihoodRatio(k11, k12, k21, k22);
+      DoubleWritable dd = new DoubleWritable(llr);
+      Text t = new Text(ngram.getString());
+      output.collect(dd, t);
+    } catch (IllegalArgumentException ex) {
+      log.error("Problem calculating LLR ratio: " + ex.getMessage());
+      log.error("NGram: " + ngram);
+      log.error("HEAD: " + gram[0] + ":" + gramFreq[0]);
+      log.error("TAIL: " + gram[1] + ":" + gramFreq[1]);
+      log.error("k11: " + k11 + " k12: " + k12 + " k21: " + k21 + " k22: "
+                + k22);
+    }
+  }
+  
+  /**
+   * Provides an interface so the input to the LLR calculation can be captured
+   * for validation in unit testing.
+   */
+  public interface LLCallback {
+    double logLikelihoodRatio(int k11, int k12, int k21, int k22);
+  }
+  
+  /** Concrete implementation delegates to the LogLikelihood class. */
+  public static final class ConcreteLLCallback implements LLCallback {
+    public double logLikelihoodRatio(int k11, int k12, int k21, int k22) {
+      return LogLikelihood.logLikelihoodRatio(k11, k12, k21, k22);
+    }
+  }
+}
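
To make the contingency counts concrete, take the 'of times' row from the unit
test below, where N = 7, ngramFreq = 2, and the head 'of' and tail 'times' each
have frequency 2:

    k11 = 2                    // A+B:  'of' followed by 'times'
    k12 = 2 - 2 = 0            // A+!B: 'of' without 'times'
    k21 = 2 - 2 = 0            // !A+B: 'times' without 'of'
    k22 = 7 - (2 + 2 - 2) = 5  // !A+!B: neither

which matches the {2, 0, 0, 5} expectation passed to logLikelihoodRatio in
LLRReducerTest.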

Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/NGramCollector.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/NGramCollector.java?rev=907938&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/NGramCollector.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/NGramCollector.java Tue Feb  9 05:49:18 2010
@@ -0,0 +1,179 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.nlp.collocations.llr;
+
+import static org.apache.mahout.utils.nlp.collocations.llr.Gram.Position.HEAD;
+import static org.apache.mahout.utils.nlp.collocations.llr.Gram.Position.TAIL;
+import static org.apache.mahout.utils.nlp.collocations.llr.NGramCollector.Count.NGRAM_TOTAL;
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.shingle.ShingleFilter;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.Version;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Performs tokenization, ngram generation and collection for the first pass of
+ * the LLR collocation discovery job. This code is factored out of the mappers
+ * so that different input formats can be supported.
+ * 
+ * @see org.apache.mahout.utils.nlp.collocations.llr.CollocMapper
+ */
+public class NGramCollector {
+  
+  public static final String ANALYZER_CLASS = "analyzerClass";
+  public static final String MAX_SHINGLE_SIZE = "maxShingleSize";
+  
+  public static enum Count {
+    NGRAM_TOTAL;
+  }
+  
+  private static final Logger log = LoggerFactory
+      .getLogger(NGramCollector.class);
+  
+  /**
+   * An analyzer to perform tokenization. A ShingleFilter will be wrapped around
+   * its output to create ngrams
+   */
+  private Analyzer a;
+  
+  /** max size of shingles (ngrams) to create */
+  private int maxShingleSize;
+  
+  public NGramCollector() {}
+  
+  /**
+   * Configure the NGramCollector.
+   * 
+   * Reads NGramCollector.ANALYZER_CLASS and instantiates that class if it is
+   * provided. Otherwise a Lucene StandardAnalyzer set to be compatible with
+   * LUCENE_24 will be used.
+   * 
+   * Reads NGramCollector.MAX_SHINGLE_SIZE and uses this as the parameter to the
+   * ShingleFilter.
+   * 
+   * @param job
+   *          the job configuration
+   */
+  public void configure(JobConf job) {
+    this.a = null;
+    try {
+      ClassLoader ccl = Thread.currentThread().getContextClassLoader();
+      String analyzerClass = job.get(NGramCollector.ANALYZER_CLASS);
+      if (analyzerClass != null) {
+        Class<?> cl = ccl.loadClass(analyzerClass);
+        a = (Analyzer) cl.newInstance();
+      }
+    } catch (ClassNotFoundException e) {
+      throw new IllegalStateException(e);
+    } catch (InstantiationException e) {
+      throw new IllegalStateException(e);
+    } catch (IllegalAccessException e) {
+      throw new IllegalStateException(e);
+    }
+    
+    if (this.a == null) {
+      // No analyzer specified. Use the LUCENE_24 analyzer here because
+      // it does not preserve stop word positions.
+      this.a = new StandardAnalyzer(Version.LUCENE_24);
+    }
+    
+    this.maxShingleSize = job.getInt(NGramCollector.MAX_SHINGLE_SIZE, 2);
+    
+    if (log.isInfoEnabled()) {
+      log.info("Analyzer is {}", this.a.toString());
+      log.info("Max Ngram size is {}", this.maxShingleSize);
+    }
+  }
+  
+  /**
+   * Receives a document and uses a Lucene analyzer to tokenize it. The
+   * ShingleFilter delivers ngrams of the appropriate size, which are then
+   * decomposed into head and tail subgrams that are collected in the following
+   * manner:
+   * 
+   * k:h_subgram v:ngram, k:t_subgram v:ngram
+   * 
+   * The 'h_' or 't_' prefix is used to specify whether the subgram in question
+   * is the head or tail of the ngram. In this implementation the head of the
+   * ngram is an (n-1)gram and the tail is a (1)gram.
+   * 
+   * For example, given 'click and clack' and an ngram length of 3:
+   * k:'h_click and' v:'click and clack', k:'t_clack' v:'click and clack'
+   * 
+   * Also counts the total number of ngrams encountered and adds it to the
+   * counter NGramCollector.Count.NGRAM_TOTAL.
+   * 
+   * @param r
+   *          the reader to read input from; used to create a token stream
+   *          from the analyzer
+   * 
+   * @param collector
+   *          the collector to send output to
+   * 
+   * @param reporter
+   *          used to deliver the final ngram count
+   * 
+   * @throws IOException
+   *           if there's a problem with the ShingleFilter reading data or the
+   *           collector collecting output
+   */
+  public void collectNgrams(Reader r,
+                            OutputCollector<Gram,Gram> collector,
+                            Reporter reporter) throws IOException {
+    TokenStream st = a.tokenStream("text", r);
+    ShingleFilter sf = new ShingleFilter(st, maxShingleSize);
+    
+    TermAttribute termAttribute = (TermAttribute) sf.addAttribute(TermAttribute.class);
+    TypeAttribute typeAttribute = (TypeAttribute) sf.addAttribute(TypeAttribute.class);
+    
+    sf.reset();
+    int count = 0; // ngram count
+    
+    while (sf.incrementToken()) {
+      String term = termAttribute.term();
+      String type = typeAttribute.type();
+      
+      if ("shingle".equals(type)) {
+        count++;
+        Gram ngram = new Gram(term);
+        
+        // obtain components: the leading (n-1)gram and the trailing unigram.
+        int i = term.lastIndexOf(' ');
+        if (i != -1) {
+          collector.collect(new Gram(term.substring(0, i), HEAD), ngram);
+          collector.collect(new Gram(term.substring(i + 1), TAIL), ngram);
+        }
+      }
+    }
+    
+    reporter.incrCounter(NGRAM_TOTAL, count);
+    
+    sf.end();
+    sf.close();
+    r.close();
+  }
+}
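
A standalone usage sketch outside MapReduce, assuming only the classes in this
commit; the sample text and the println collector are illustrative:

    import java.io.StringReader;

    import org.apache.hadoop.mapred.JobConf;
    import org.apache.hadoop.mapred.OutputCollector;
    import org.apache.hadoop.mapred.Reporter;
    import org.apache.mahout.utils.nlp.collocations.llr.Gram;
    import org.apache.mahout.utils.nlp.collocations.llr.NGramCollector;

    public class NGramCollectorSketch {
      public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf();
        conf.set(NGramCollector.MAX_SHINGLE_SIZE, "2"); // bigrams

        NGramCollector ngrams = new NGramCollector();
        ngrams.configure(conf);

        // print each h_/t_ subgram -> ngram pair as it is collected
        OutputCollector<Gram,Gram> out = new OutputCollector<Gram,Gram>() {
          @Override
          public void collect(Gram subgram, Gram ngram) {
            System.out.println(subgram + " -> " + ngram);
          }
        };

        // Reporter.NULL silently discards the NGRAM_TOTAL counter update
        ngrams.collectNgrams(new StringReader("the best of times"), out, Reporter.NULL);
      }
    }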

Added: lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducerTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducerTest.java?rev=907938&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducerTest.java (added)
+++ lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducerTest.java Tue Feb  9 05:49:18 2010
@@ -0,0 +1,86 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.nlp.collocations.llr;
+
+import static org.apache.mahout.utils.nlp.collocations.llr.Gram.Position.HEAD;
+import static org.apache.mahout.utils.nlp.collocations.llr.Gram.Position.TAIL;
+
+import java.util.LinkedList;
+import java.util.List;
+
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+import org.easymock.EasyMock;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Test the CollocReducer. FIXME: add negative test cases.
+ */
+public class CollocReducerTest {
+  
+  OutputCollector<Gram,Gram> output;
+  Reporter reporter;
+  
+  @Before
+  @SuppressWarnings("unchecked")
+  public void setUp() {
+    output = EasyMock.createMock(OutputCollector.class);
+    reporter = EasyMock.createMock(Reporter.class);
+  }
+
+  @Test
+  public void testReduce() throws Exception {
+    // test input, input[*][0] is the key,
+    // input[*][1..n] are the values passed in via
+    // the iterator.
+    Gram[][] input = new Gram[][] {
+        { new Gram("the",   HEAD), new Gram("the best"), new Gram("the worst") },
+        { new Gram("of",    HEAD), new Gram("of times"), new Gram("of times") },
+        { new Gram("times", TAIL), new Gram("of times"), new Gram("of times") }
+    };
+
+    // expected results.
+    Gram[][] values = new Gram[][] {
+        { new Gram("the best",  1), new Gram("the", 2,   HEAD) }, 
+        { new Gram("the worst", 1), new Gram("the", 2,   HEAD) }, 
+        { new Gram("of times",  2), new Gram("of",  2,   HEAD) }, 
+        { new Gram("of times",  2), new Gram("times", 2, TAIL) }
+    };
+
+    // set up expectations
+    for (Gram[] v : values) {
+      output.collect(v[0], v[1]);
+    }
+    EasyMock.replay(reporter, output);
+    
+    // play back the input data.
+    CollocReducer c = new CollocReducer();
+    
+    for (Gram[] ii : input) {
+      List<Gram> vv = new LinkedList<Gram>();
+      for (int i = 1; i < ii.length; i++) {
+        vv.add(ii[i]);
+      }
+      c.reduce(ii[0], vv.iterator(), output, reporter);
+    }
+    
+    EasyMock.verify(reporter, output);
+  }
+  
+}

Added: lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/GramTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/GramTest.java?rev=907938&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/GramTest.java (added)
+++ lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/GramTest.java Tue Feb  9 05:49:18 2010
@@ -0,0 +1,109 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.nlp.collocations.llr;
+
+import static org.apache.mahout.utils.nlp.collocations.llr.Gram.Position.HEAD;
+import static org.apache.mahout.utils.nlp.collocations.llr.Gram.Position.TAIL;
+
+import java.util.HashMap;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+public class GramTest {
+  
+  @Test
+  public void testEquality() {
+    Gram one = new Gram("foo", 2, HEAD);
+    Gram two = new Gram("foo", 3, HEAD);
+    
+    Assert.assertTrue(one.equals(two));
+    Assert.assertTrue(two.equals(one));
+    
+    Gram three = new Gram("foo", 4, TAIL);
+    Gram four = new Gram("foo");
+    
+    Assert.assertFalse(three.equals(two));
+    Assert.assertTrue(four.equals(one));
+    Assert.assertTrue(one.equals(four));
+    
+    Gram five = new Gram("foobar", 4, TAIL);
+    
+    Assert.assertFalse(five.equals(four));
+    Assert.assertFalse(five.equals(three));
+    Assert.assertFalse(five.equals(two));
+    Assert.assertFalse(five.equals(one));
+  }
+  
+  @Test
+  public void testHashing() {
+    Gram[] input = 
+    {
+        new Gram("foo", 2, HEAD),
+        new Gram("foo", 3, HEAD),
+        new Gram("foo", 4, TAIL),
+        new Gram("foo", 5, TAIL),
+        new Gram("bar", 6, HEAD),
+        new Gram("bar", 7, TAIL),
+        new Gram("bar", 8),
+        new Gram("bar")
+    };
+    
+    HashMap<Gram,Gram> map = new HashMap<Gram,Gram>();
+    for (Gram n : input) {
+      Gram val = map.get(n);
+      if (val != null) {
+        val.incrementFrequency(n.getFrequency());
+      } else {
+        map.put(n, n);
+      }
+    }
+    
+    // frequencies of the items in the map.
+    int[] freq = {
+        5,
+        3,
+        9,
+        5,
+        15,
+        7,
+        8,
+        1
+    };
+    
+    // true if the index should be the item in the map
+    boolean[] memb = {
+        true,
+        false,
+        true,
+        false,
+        true,
+        true,
+        false,
+        false
+    };
+    
+    for (int i = 0; i < input.length; i++) {
+      Assert.assertEquals(freq[i], input[i].getFrequency());
+      Assert.assertEquals(memb[i], input[i] == map.get(input[i]));
+    }
+  }
+}

Added: lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/LLRReducerTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/LLRReducerTest.java?rev=907938&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/LLRReducerTest.java (added)
+++ lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/LLRReducerTest.java Tue Feb  9 05:49:18 2010
@@ -0,0 +1,130 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.nlp.collocations.llr;
+
+import static org.apache.mahout.utils.nlp.collocations.llr.Gram.Position.HEAD;
+import static org.apache.mahout.utils.nlp.collocations.llr.Gram.Position.TAIL;
+
+import java.io.IOException;
+import java.util.LinkedList;
+import java.util.List;
+
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.mahout.math.stats.LogLikelihood;
+import org.apache.mahout.utils.nlp.collocations.llr.LLRReducer.LLCallback;
+import org.easymock.EasyMock;
+import org.junit.Before;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/** Test the LLRReducer.
+ *  FIXME: add negative test cases.
+ */
+@SuppressWarnings("deprecation")
+public class LLRReducerTest {
+
+  private static final Logger log = 
+    LoggerFactory.getLogger(LLRReducerTest.class);
+
+  Reporter reporter;
+  LLCallback ll;
+  LLCallback cl;
+  // not verifying the llr algo output here, just the input, but it is handy
+  // to see the values emitted.
+  OutputCollector<DoubleWritable,Text> collector = new OutputCollector<DoubleWritable, Text>() {
+    @Override
+    public void collect(DoubleWritable key, Text value) throws IOException {
+      log.info(key.toString() + " " + value.toString());
+    }
+  };
+
+
+  @Before
+  public void setUp() {
+    reporter  = EasyMock.createMock(Reporter.class);
+    ll        = EasyMock.createMock(LLCallback.class);
+    cl        = new LLCallback() {
+      @Override
+      public double logLikelihoodRatio(int k11, int k12, int k21, int k22) {
+        log.info("k11:" + k11 + " k12:" + k12 + " k21:" + k21 + " k22:" + k22);
+        try {
+          return LogLikelihood.logLikelihoodRatio(k11, k12, k21, k22);
+        }
+        catch (Exception e) {
+          e.printStackTrace();
+          return -1;
+        }
+      }
+
+    };
+  }
+
+  @Test
+  public void testReduce() throws Exception {
+    LLRReducer reducer = new LLRReducer(ll);
+
+    // test input, input[*][0] is the key, 
+    // input[*][1..n] are the values passed in via
+    // the iterator.
+    
+    
+    Gram[][] input = {
+        {new Gram("the best",  1), new Gram("the",   2, HEAD), new Gram("best",  1, TAIL) },
+        {new Gram("best of",   1), new Gram("best",  1, HEAD), new Gram("of",    2, TAIL) },
+        {new Gram("of times",  2), new Gram("of",    2, HEAD), new Gram("times", 2, TAIL) },
+        {new Gram("times the", 1), new Gram("times", 1, HEAD), new Gram("the",   1, TAIL) },
+        {new Gram("the worst", 1), new Gram("the",   2, HEAD), new Gram("worst", 1, TAIL) },
+        {new Gram("worst of",  1), new Gram("worst", 1, HEAD), new Gram("of",    2, TAIL) }
+    };
+
+    int[][] expectations = {
+        // A+B, A+!B, !A+B, !A+!B
+        {1, 1, 0, 5}, // the best
+        {1, 0, 1, 5}, // best of
+        {2, 0, 0, 5}, // of times
+        {1, 0, 0, 6}, // times the
+        {1, 1, 0, 5}, // the worst
+        {1, 0, 1, 5}  // worst of
+    };
+
+    for (int[] ee: expectations) {
+      EasyMock.expect(ll.logLikelihoodRatio(ee[0], ee[1], ee[2], ee[3])).andDelegateTo(cl);
+    }
+
+    EasyMock.replay(ll);
+
+    JobConf config = new JobConf(CollocDriver.class);
+    config.set(LLRReducer.NGRAM_TOTAL, "7");
+    reducer.configure(config);
+
+    for (Gram[] ii: input) {
+      List<Gram> vv = new LinkedList<Gram>();
+      for (int i = 1; i < ii.length; i++) {
+        vv.add(ii[i]);
+      }
+      reducer.reduce(ii[0], vv.iterator(), collector, reporter);
+    }
+
+    EasyMock.verify(ll);
+  }
+}

Added: lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/NGramCollectorTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/NGramCollectorTest.java?rev=907938&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/NGramCollectorTest.java (added)
+++ lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/NGramCollectorTest.java Tue Feb  9 05:49:18 2010
@@ -0,0 +1,118 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.nlp.collocations.llr;
+
+import static org.apache.mahout.utils.nlp.collocations.llr.Gram.Position.HEAD;
+import static org.apache.mahout.utils.nlp.collocations.llr.Gram.Position.TAIL;
+import static org.apache.mahout.utils.nlp.collocations.llr.NGramCollector.Count.NGRAM_TOTAL;
+
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.Collections;
+
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.util.Version;
+import org.apache.mahout.utils.nlp.collocations.llr.Gram.Position;
+import org.easymock.EasyMock;
+import org.junit.Before;
+import org.junit.Test;
+
+/** Test for NGramCollector.
+ * FIXME: add negative test cases.
+ */
+@SuppressWarnings("deprecation")
+public class NGramCollectorTest {
+
+  OutputCollector<Gram,Gram> collector;
+  Reporter reporter;
+
+  @Before
+  @SuppressWarnings("unchecked")
+  public void setUp() {
+    collector = EasyMock.createMock(OutputCollector.class);
+    reporter  = EasyMock.createMock(Reporter.class);
+  }
+
+  @Test
+  public void testCollectNgrams() throws Exception {
+
+    String input = "the best of times the worst of times";
+
+    String[][] values = 
+      new String[][]{
+        {"h_the",   "the best"},
+        {"t_best",  "the best"},
+        {"h_best",  "best of"},
+        {"t_of",    "best of"},
+        {"h_of",    "of times"},
+        {"t_times", "of times"},
+        {"h_times", "times the"},
+        {"t_the",   "times the"},
+        {"h_the",   "the worst"},
+        {"t_worst", "the worst"},
+        {"h_worst", "worst of"},
+        {"t_of",    "worst of"},
+        {"h_of",    "of times"},
+        {"t_times", "of times"}
+    };
+
+    // set up expectations for the mocks; ngram max size = 2
+    for (String[] v: values) {
+      Position p = v[0].startsWith("h") ? HEAD : TAIL;
+      Gram subgram = new Gram(v[0].substring(2), p);
+      Gram ngram = new Gram(v[1]);
+      collector.collect(subgram, ngram);
+    }
+
+    reporter.incrCounter(NGRAM_TOTAL, 7);
+    EasyMock.replay(reporter, collector);
+    
+    Reader r = new StringReader(input);
+
+    JobConf conf = new JobConf();
+    conf.set(NGramCollector.MAX_SHINGLE_SIZE, "2");
+    conf.set(NGramCollector.ANALYZER_CLASS, TestAnalyzer.class.getName());
+
+    NGramCollector c = new NGramCollector();
+    c.configure(conf);
+    
+    c.collectNgrams(r, collector, reporter);
+
+    EasyMock.verify(reporter, collector);
+  }
+
+  /** A lucene 2.9 standard analyzer with no stopwords. */
+  public static class TestAnalyzer extends Analyzer {
+    final Analyzer a;
+    
+    public TestAnalyzer() {
+      a = new StandardAnalyzer(Version.LUCENE_29, Collections.EMPTY_SET);
+    }
+    
+    @Override
+    public TokenStream tokenStream(String arg0, Reader arg1) {
+      return a.tokenStream(arg0, arg1);
+    }
+  }
+}


