mahout-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From sro...@apache.org
Subject svn commit: r955489 - in /mahout/trunk/core/src: main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/
Date Thu, 17 Jun 2010 06:24:28 GMT
Author: srowen
Date: Thu Jun 17 06:24:28 2010
New Revision: 955489

URL: http://svn.apache.org/viewvc?rev=955489&view=rev
Log:
MAHOUT-407 committed per Sebastian

Added:
    mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CapSimilaritiesPerItemKeyWritable.java
    mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CapSimilaritiesPerItemMapper.java
    mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CapSimilaritiesPerItemReducer.java
    mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/RemoveDuplicatesReducer.java
    mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/SimilarItemWritable.java
Modified:
    mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java
    mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java

Added: mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CapSimilaritiesPerItemKeyWritable.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CapSimilaritiesPerItemKeyWritable.java?rev=955489&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CapSimilaritiesPerItemKeyWritable.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CapSimilaritiesPerItemKeyWritable.java Thu Jun 17 06:24:28 2010
@@ -0,0 +1,142 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.similarity.item;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.io.WritableComparator;
+import org.apache.hadoop.mapreduce.Partitioner;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.math.Varint;
+
+/**
+ * used as key for the {@link CapSimilaritiesPerItemReducer} to collect all items similar to the item with the itemID
+ *
+ * ensure that the similar items are seen in descending order by their similarity value via secondary sort
+ */
+public class CapSimilaritiesPerItemKeyWritable implements WritableComparable<CapSimilaritiesPerItemKeyWritable> {
+
+  private long itemID;
+  private double associatedSimilarity;
+
+  static {
+    WritableComparator.define(CapSimilaritiesPerItemKeyWritable.class, new CapSimilaritiesPerItemKeyComparator());
+  }
+
+  public CapSimilaritiesPerItemKeyWritable() {
+    super();
+  }
+
+  public CapSimilaritiesPerItemKeyWritable(long itemID, double associatedSimilarity) {
+    super();
+    this.itemID = itemID;
+    this.associatedSimilarity = associatedSimilarity;
+  }
+
+  public long getItemID() {
+    return itemID;
+  }
+
+  public double getAssociatedSimilarity() {
+    return associatedSimilarity;
+  }
+
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    itemID = Varint.readSignedVarLong(in);
+    associatedSimilarity = in.readDouble();
+  }
+
+  @Override
+  public void write(DataOutput out) throws IOException {
+    Varint.writeSignedVarLong(itemID, out);
+    out.writeDouble(associatedSimilarity);
+  }
+
+  @Override
+  public int compareTo(CapSimilaritiesPerItemKeyWritable other) {
+    return (itemID == other.itemID) ? 0 : (itemID < other.itemID) ? -1 : 1;
+  }
+
+  @Override
+  public int hashCode() {
+    return RandomUtils.hashLong(itemID);
+  }
+
+  @Override
+  public boolean equals(Object other) {
+    if (other instanceof CapSimilaritiesPerItemKeyWritable) {
+      return itemID == ((CapSimilaritiesPerItemKeyWritable)other).itemID;
+    }
+    return false;
+  }
+
+  public static class CapSimilaritiesPerItemKeyComparator extends WritableComparator {
+
+    public CapSimilaritiesPerItemKeyComparator() {
+      super(CapSimilaritiesPerItemKeyWritable.class, true);
+    }
+
+    @Override
+    public int compare(WritableComparable a, WritableComparable b) {
+      CapSimilaritiesPerItemKeyWritable capKey1 = (CapSimilaritiesPerItemKeyWritable) a;
+      CapSimilaritiesPerItemKeyWritable capKey2 = (CapSimilaritiesPerItemKeyWritable) b;
+
+      int result = compare(capKey1.getItemID(), capKey2.getItemID());
+      if (result == 0) {
+        result = -1 * compare(capKey1.getAssociatedSimilarity(), capKey2.getAssociatedSimilarity());
+      }
+      return result;
+    }
+
+    protected static int compare(long a, long b) {
+      return (a == b) ? 0 : (a < b) ? -1 : 1;
+    }
+
+    protected static int compare(double a, double b) {
+      return (a == b) ? 0 : (a < b) ? -1 : 1;
+    }
+  }
+
+  public static class CapSimilaritiesPerItemKeyPartitioner
+      extends Partitioner<CapSimilaritiesPerItemKeyWritable,SimilarItemWritable> {
+
+    @Override
+    public int getPartition(CapSimilaritiesPerItemKeyWritable key, SimilarItemWritable value, int numPartitions) {
+      return (key.hashCode() * 127) % numPartitions;
+    }
+  }
+
+
+  public static class CapSimilaritiesPerItemKeyGroupingComparator extends WritableComparator {
+
+    public CapSimilaritiesPerItemKeyGroupingComparator() {
+      super(CapSimilaritiesPerItemKeyWritable.class, true);
+    }
+
+    @Override
+    public int compare(WritableComparable a, WritableComparable b) {
+      CapSimilaritiesPerItemKeyWritable capKey1 = (CapSimilaritiesPerItemKeyWritable) a;
+      CapSimilaritiesPerItemKeyWritable capKey2 = (CapSimilaritiesPerItemKeyWritable) b;
+      return a.compareTo(b);
+    }
+  }
+}

Added: mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CapSimilaritiesPerItemMapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CapSimilaritiesPerItemMapper.java?rev=955489&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CapSimilaritiesPerItemMapper.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CapSimilaritiesPerItemMapper.java Thu Jun 17 06:24:28 2010
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.similarity.item;
+
+import java.io.IOException;
+
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.cf.taste.hadoop.EntityEntityWritable;
+
+/**
+ * maps out all pairs of similar items so that all similar items for an item can be collected in
+ * the {@link CapSimilaritiesPerItemReducer}
+ *
+ */
+public class CapSimilaritiesPerItemMapper
+    extends Mapper<EntityEntityWritable,DoubleWritable,CapSimilaritiesPerItemKeyWritable,SimilarItemWritable> {
+
+  @Override
+  protected void map(EntityEntityWritable itemPair, DoubleWritable similarity, Context ctx)
+      throws IOException, InterruptedException {
+
+    long itemIDA = itemPair.getAID();
+    long itemIDB = itemPair.getBID();
+    double value = similarity.get();
+
+    ctx.write(new CapSimilaritiesPerItemKeyWritable(itemIDA, value), new SimilarItemWritable(itemIDB, value));
+    ctx.write(new CapSimilaritiesPerItemKeyWritable(itemIDB, value), new SimilarItemWritable(itemIDA, value));
+  }
+}

Added: mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CapSimilaritiesPerItemReducer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CapSimilaritiesPerItemReducer.java?rev=955489&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CapSimilaritiesPerItemReducer.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CapSimilaritiesPerItemReducer.java Thu Jun 17 06:24:28 2010
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.similarity.item;
+
+import java.io.IOException;
+
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.cf.taste.hadoop.EntityEntityWritable;
+
+/**
+ * this reducer sees all similar items for an item in descending similarity value order and writes maximally as much
+ * as specified in the "maxSimilaritiesPerItem" option of {@link ItemSimilarityJob}
+ */
+public class CapSimilaritiesPerItemReducer
+    extends Reducer<CapSimilaritiesPerItemKeyWritable,SimilarItemWritable,EntityEntityWritable,DoubleWritable> {
+
+  private int maxSimilaritiesPerItem;
+
+  @Override
+  protected void setup(Context ctx) throws IOException, InterruptedException {
+    super.setup(ctx);
+    maxSimilaritiesPerItem = ctx.getConfiguration().getInt(ItemSimilarityJob.MAX_SIMILARITIES_PER_ITEM, -1);
+    if (maxSimilaritiesPerItem < 1) {
+      throw new IllegalStateException("Maximum similar items per item was not set correctly");
+    }
+  }
+
+  @Override
+  protected void reduce(CapSimilaritiesPerItemKeyWritable capKey, Iterable<SimilarItemWritable> similarItems,
+      Context ctx) throws IOException, InterruptedException {
+    long itemAID = capKey.getItemID();
+
+    /* we see the similar items in descending value order because of secondary sort */
+    int n=0;
+    for (SimilarItemWritable similarItem : similarItems) {
+      long itemBID = similarItem.getItemID();
+      EntityEntityWritable itemPair = toItemPair(itemAID, itemBID);
+      ctx.write(itemPair, new DoubleWritable(similarItem.getValue()));
+
+      if (++n == maxSimilaritiesPerItem) {
+        break;
+      }
+    }
+  }
+
+  protected EntityEntityWritable toItemPair(long itemAID, long itemBID) {
+    /* smaller ID first */
+    if (itemAID < itemBID) {
+      return new EntityEntityWritable(itemAID, itemBID);
+    } else {
+      return new EntityEntityWritable(itemBID, itemAID);
+    }
+  }
+}

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java?rev=955489&r1=955488&r2=955489&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java Thu Jun 17 06:24:28 2010
@@ -32,6 +32,7 @@ import org.apache.hadoop.io.DoubleWritab
 import org.apache.hadoop.io.IOUtils;
 import org.apache.hadoop.io.NullWritable;
 import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
 import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
 import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
 import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
@@ -48,14 +49,14 @@ import org.apache.mahout.math.VarIntWrit
 import org.apache.mahout.math.VarLongWritable;
 
 /**
- * <p>Runs a completely distributed computation of the cosine distance of the itemvectors of the user-item-matrix
+ * <p>Runs a completely distributed computation of the similarity of the itemvectors of the user-item-matrix
  *  as a series of mapreduces.</p>
  *
  * <p>Algorithm used is a slight modification from the algorithm described in
  * http://www.umiacs.umd.edu/~jimmylin/publications/Elsayed_etal_ACL2008_short.pdf</p>
  *
  * <pre>
- * Example:
+ * Example using cosine distance:
  *
  * user-item-matrix:
  *
@@ -97,6 +98,8 @@ import org.apache.mahout.math.VarLongWri
  * <li>-Dmapred.output.dir=(path): output path where the computations output should go</li>
  * <li>--similarityClassname (classname): an implemenation of {@link DistributedItemSimilarity} used to compute the
  * similarity</li>
+ * <li>--maxSimilaritiesPerItem (integer): try to cap the number of similar items per item to this number
+ * (default: 100)</li>
  * </ol>
  *
  *
@@ -109,10 +112,15 @@ import org.apache.mahout.math.VarLongWri
 public final class ItemSimilarityJob extends AbstractJob {
 
   public static final String DISTRIBUTED_SIMILARITY_CLASSNAME =
-    "org.apache.mahout.cf.taste.hadoop.similarity.item.ItemSimilarityJob.distributedSimilarityClassname";
+      "org.apache.mahout.cf.taste.hadoop.similarity.item.ItemSimilarityJob.distributedSimilarityClassname";
 
   public static final String NUMBER_OF_USERS =
-    "org.apache.mahout.cf.taste.hadoop.similarity.item.ItemSimilarityJob.numberOfUsers";
+      "org.apache.mahout.cf.taste.hadoop.similarity.item.ItemSimilarityJob.numberOfUsers";
+
+  public static final String MAX_SIMILARITIES_PER_ITEM =
+      "org.apache.mahout.cf.taste.hadoop.similarity.item.ItemSimilarityJob.maxSimilaritiesPerItem";
+
+  private static final Integer DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM = 100;
 
   @Override
   public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
@@ -120,6 +128,8 @@ public final class ItemSimilarityJob ext
     addInputOption();
     addOutputOption();
     addOption("similarityClassname", "s", "Name of distributed similarity class to instantiate");
+    addOption("maxSimilaritiesPerItem", "m", "try to cap the number of similar items per item to this number " +
+    		"(default: " + DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM + ")", String.valueOf(DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM));
 
     Map<String,String> parsedArgs = parseArguments(args);
     if (parsedArgs == null) {
@@ -127,6 +137,7 @@ public final class ItemSimilarityJob ext
     }
 
     String distributedSimilarityClassname = parsedArgs.get("--similarityClassname");
+    int maxSimilaritiesPerItem = Integer.parseInt(parsedArgs.get("--maxSimilaritiesPerItem"));
 
     Path inputPath = getInputPath();
     Path outputPath = getOutputPath();
@@ -135,6 +146,8 @@ public final class ItemSimilarityJob ext
     Path countUsersPath = new Path(tempDirPath, "countUsers");
     Path itemVectorsPath = new Path(tempDirPath, "itemVectors");
     Path userVectorsPath = new Path(tempDirPath, "userVectors");
+    Path similaritiesPath = new Path(tempDirPath, "similarities");
+    Path cappedSimilaritiesPath = new Path(tempDirPath, "cappedSimilarities");
 
     AtomicInteger currentPhase = new AtomicInteger();
 
@@ -186,7 +199,7 @@ public final class ItemSimilarityJob ext
 
     if (shouldRunNextPhase(parsedArgs, currentPhase)) {
       Job similarity = prepareJob(userVectorsPath,
-                                  outputPath,
+                                  similaritiesPath,
                                   SequenceFileInputFormat.class,
                                   CopreferredItemsMapper.class,
                                   ItemPairWritable.class,
@@ -194,7 +207,7 @@ public final class ItemSimilarityJob ext
                                   SimilarityReducer.class,
                                   EntityEntityWritable.class,
                                   DoubleWritable.class,
-                                  TextOutputFormat.class);
+                                  SequenceFileOutputFormat.class);
       Configuration conf = similarity.getConfiguration();
       int numberOfUsers = readNumberOfUsers(conf, countUsersPath);
       conf.set(DISTRIBUTED_SIMILARITY_CLASSNAME, distributedSimilarityClassname);
@@ -202,6 +215,40 @@ public final class ItemSimilarityJob ext
       similarity.waitForCompletion(true);
     }
 
+    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
+      Job capSimilaritiesPerItem = prepareJob(similaritiesPath,
+                                              cappedSimilaritiesPath,
+                                              SequenceFileInputFormat.class,
+                                              CapSimilaritiesPerItemMapper.class,
+                                              CapSimilaritiesPerItemKeyWritable.class,
+                                              SimilarItemWritable.class,
+                                              CapSimilaritiesPerItemReducer.class,
+                                              EntityEntityWritable.class,
+                                              DoubleWritable.class,
+                                              SequenceFileOutputFormat.class);
+
+      capSimilaritiesPerItem.getConfiguration().setInt(MAX_SIMILARITIES_PER_ITEM, maxSimilaritiesPerItem);
+      capSimilaritiesPerItem.setPartitionerClass(
+          CapSimilaritiesPerItemKeyWritable.CapSimilaritiesPerItemKeyPartitioner.class);
+      capSimilaritiesPerItem.setGroupingComparatorClass(
+          CapSimilaritiesPerItemKeyWritable.CapSimilaritiesPerItemKeyGroupingComparator.class);
+      capSimilaritiesPerItem.waitForCompletion(true);
+    }
+
+    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
+      Job removeDuplicates = prepareJob(cappedSimilaritiesPath,
+                                        outputPath,
+                                        SequenceFileInputFormat.class,
+                                        Mapper.class,
+                                        EntityEntityWritable.class,
+                                        DoubleWritable.class,
+                                        RemoveDuplicatesReducer.class,
+                                        EntityEntityWritable.class,
+                                        DoubleWritable.class,
+                                        TextOutputFormat.class);
+      removeDuplicates.waitForCompletion(true);
+    }
+
     return 0;
   }
 

Added: mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/RemoveDuplicatesReducer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/RemoveDuplicatesReducer.java?rev=955489&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/RemoveDuplicatesReducer.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/RemoveDuplicatesReducer.java Thu Jun 17 06:24:28 2010
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.similarity.item;
+
+import java.io.IOException;
+
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.cf.taste.hadoop.EntityEntityWritable;
+
+/**
+ * makes sure that every pair of similar items is written only once
+ */
+public class RemoveDuplicatesReducer
+    extends Reducer<EntityEntityWritable,DoubleWritable,EntityEntityWritable,DoubleWritable> {
+
+  @Override
+  protected void reduce(EntityEntityWritable itemPair, Iterable<DoubleWritable> values, Context ctx)
+      throws IOException, InterruptedException {
+    DoubleWritable value = values.iterator().next();
+    ctx.write(itemPair, value);
+  }
+}

Added: mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/SimilarItemWritable.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/SimilarItemWritable.java?rev=955489&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/SimilarItemWritable.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/SimilarItemWritable.java Thu Jun 17 06:24:28 2010
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.similarity.item;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.math.Varint;
+
+public class SimilarItemWritable implements Writable {
+
+  private long itemID;
+  private double value;
+
+  public SimilarItemWritable() {
+    super();
+  }
+
+  public SimilarItemWritable(long itemID, double value) {
+    super();
+    this.itemID = itemID;
+    this.value = value;
+  }
+
+  public long getItemID() {
+    return itemID;
+  }
+
+  public double getValue() {
+    return value;
+  }
+
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    itemID = Varint.readSignedVarLong(in);
+    value = in.readDouble();
+  }
+
+  @Override
+  public void write(DataOutput out) throws IOException {
+    Varint.writeSignedVarLong(itemID, out);
+    out.writeDouble(value);
+  }
+
+  @Override
+  public int hashCode() {
+    return RandomUtils.hashLong(itemID);
+  }
+
+  @Override
+  public boolean equals(Object other) {
+    if (other instanceof SimilarItemWritable) {
+      return (itemID == ((SimilarItemWritable)other).itemID);
+    }
+    return false;
+  }
+}

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java?rev=955489&r1=955488&r2=955489&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java Thu Jun 17 06:24:28 2010
@@ -42,6 +42,7 @@ import org.apache.mahout.cf.taste.hadoop
 import org.apache.mahout.cf.taste.hadoop.EntityPrefWritableArrayWritable;
 import org.apache.mahout.cf.taste.hadoop.ToUserPrefsMapper;
 import org.apache.mahout.cf.taste.hadoop.similarity.CoRating;
+import org.apache.mahout.cf.taste.hadoop.similarity.DistributedTanimotoCoefficientSimilarity;
 import org.apache.mahout.cf.taste.hadoop.similarity.DistributedUncenteredZeroAssumingCosineSimilarity;
 import org.apache.mahout.common.MahoutTestCase;
 import org.apache.mahout.math.VarIntWritable;
@@ -280,7 +281,63 @@ public final class ItemSimilarityTest ex
     SimilarityReducer reducer = new SimilarityReducer();
     reducer.setup(context);
     reducer.reduce(new ItemPairWritable(12L, 34L, 2.0, 10.0),
-                   Arrays.asList(new CoRating(2.5f, 2.0f),new CoRating(2.0f, 2.5f)), context);
+                   Arrays.asList(new CoRating(2.5f, 2.0f), new CoRating(2.0f, 2.5f)), context);
+
+    EasyMock.verify(context);
+  }
+
+  public void testCapSimilaritiesPerItemMapper() throws Exception {
+    Mapper<EntityEntityWritable,DoubleWritable,CapSimilaritiesPerItemKeyWritable,SimilarItemWritable>.Context context =
+      EasyMock.createMock(Mapper.Context.class);
+
+    context.write(new CapSimilaritiesPerItemKeyWritable(1L, 0.89d), new SimilarItemWritable(5L, 0.89d));
+    context.write(new CapSimilaritiesPerItemKeyWritable(5L, 0.89d), new SimilarItemWritable(1L, 0.89d));
+
+    EasyMock.replay(context);
+
+    CapSimilaritiesPerItemMapper mapper = new CapSimilaritiesPerItemMapper();
+    EntityEntityWritable itemPair = new EntityEntityWritable(1L, 5L);
+    mapper.map(itemPair, new DoubleWritable(0.89d), context);
+
+    EasyMock.verify(context);
+  }
+
+  public void testCapSimilaritiesPerItemReducer() throws Exception {
+    Reducer<CapSimilaritiesPerItemKeyWritable,SimilarItemWritable,EntityEntityWritable,DoubleWritable>.Context context =
+      EasyMock.createMock(Reducer.Context.class);
+
+    Configuration conf = new Configuration();
+    EasyMock.expect(context.getConfiguration()).andStubReturn(conf);
+    conf.setInt(ItemSimilarityJob.MAX_SIMILARITIES_PER_ITEM, 2);
+
+    context.write(new EntityEntityWritable(1L, 3L), new DoubleWritable(0.9d));
+    context.write(new EntityEntityWritable(1L, 6L), new DoubleWritable(0.7d));
+
+    EasyMock.replay(context);
+
+    CapSimilaritiesPerItemReducer reducer = new CapSimilaritiesPerItemReducer();
+
+    List<SimilarItemWritable> similarItems = Arrays.asList(new SimilarItemWritable(3L, 0.9d),
+        new SimilarItemWritable(6L, 0.7d), new SimilarItemWritable(123l, 0.2d));
+
+    reducer.setup(context);
+    reducer.reduce(new CapSimilaritiesPerItemKeyWritable(1L, 1d), similarItems, context);
+
+    EasyMock.verify(context);
+  }
+
+  public void testRemoveDuplicatesReducer() throws Exception {
+    Reducer<EntityEntityWritable,DoubleWritable,EntityEntityWritable,DoubleWritable>.Context context =
+      EasyMock.createMock(Reducer.Context.class);
+
+    context.write(new EntityEntityWritable(1L, 2L), new DoubleWritable(0.5d));
+
+    EasyMock.replay(context);
+
+    List<DoubleWritable> values = Arrays.asList(new DoubleWritable(0.5d), new DoubleWritable(0.5d));
+
+    RemoveDuplicatesReducer reducer = new RemoveDuplicatesReducer();
+    reducer.reduce(new EntityEntityWritable(1L, 2L), values, context);
 
     EasyMock.verify(context);
   }
@@ -321,7 +378,7 @@ public final class ItemSimilarityTest ex
     similarityJob.setConf(conf);
 
     similarityJob.run(new String[] { "--tempDir", tmpDir.getAbsolutePath(), "--similarityClassname",
-        "org.apache.mahout.cf.taste.hadoop.similarity.DistributedUncenteredZeroAssumingCosineSimilarity"});
+       DistributedUncenteredZeroAssumingCosineSimilarity.class.getName()});
 
     File countUsersPart = new File(tmpDir, "countUsers");
     int numberOfUsers = ItemSimilarityJob.readNumberOfUsers(new Configuration(),
@@ -364,7 +421,102 @@ public final class ItemSimilarityTest ex
 
     int linesWritten = currentLine-1;
     assertEquals(2, linesWritten);
+  }
 
+  public void testMaxSimilaritiesPerItem() throws Exception {
+
+    File inputFile = getTestTempFile("prefsForMaxSimilarities.txt");
+    File outputDir = getTestTempDir("output");
+    outputDir.delete();
+    File tmpDir = getTestTempDir("tmp");
+
+    /* user-item-matrix
+
+            i1  i2  i3
+        u1   1   0   1
+        u2   0   1   1
+        u3   1   1   0
+        u4   1   1   1
+        u5   0   1   0
+        u6   1   1   0
+
+        tanimoto(i1,i2) = 0.5
+        tanimoto(i2,i3) = 0.333
+        tanimoto(i3,i1) = 0.4
+
+        When we set maxSimilaritiesPerItem to 1 the following pairs should be found:
+
+        i1 --> i2
+        i2 --> i1
+        i3 --> i1
+
+     */
+
+    BufferedWriter writer = new BufferedWriter(new FileWriter(inputFile));
+    try {
+      writer.write("1,1,1\n" +
+                   "1,3,1\n" +
+                   "2,2,1\n" +
+                   "2,3,1\n" +
+                   "3,1,1\n" +
+                   "3,2,1\n" +
+                   "4,1,1\n" +
+                   "4,2,1\n" +
+                   "4,3,1\n" +
+                   "5,2,1\n" +
+                   "6,1,1\n" +
+                   "6,2,1\n");
+    } finally {
+      writer.close();
+    }
+
+    ItemSimilarityJob similarityJob = new ItemSimilarityJob();
+
+    Configuration conf = new Configuration();
+    conf.set("mapred.input.dir", inputFile.getAbsolutePath());
+    conf.set("mapred.output.dir", outputDir.getAbsolutePath());
+    conf.setBoolean("mapred.output.compress", false);
+
+    similarityJob.setConf(conf);
+
+    similarityJob.run(new String[] { "--tempDir", tmpDir.getAbsolutePath(), "--similarityClassname",
+        DistributedTanimotoCoefficientSimilarity.class.getName(), "--maxSimilaritiesPerItem", "1"});
+
+    File outPart = outputDir.listFiles(new FilenameFilter() {
+      @Override
+      public boolean accept(File dir, String name) {
+        return name.startsWith("part-");
+      }
+    })[0];
+    BufferedReader reader = new BufferedReader(new FileReader(outPart));
+
+    String line;
+    int currentLine = 1;
+    while ( (line = reader.readLine()) != null) {
+
+      String[] tokens = line.split("\t");
+
+      long itemAID = Long.parseLong(tokens[0]);
+      long itemBID = Long.parseLong(tokens[1]);
+      double similarity = Double.parseDouble(tokens[2]);
+
+      if (currentLine == 1) {
+        assertEquals(1L, itemAID);
+        assertEquals(2L, itemBID);
+        assertEquals(0.5d, similarity, 0.0001d);
+      }
+
+      if (currentLine == 2) {
+        assertEquals(1L, itemAID);
+        assertEquals(3L, itemBID);
+        assertEquals(0.4, similarity, 0.0001d);
+      }
+
+      currentLine++;
+    }
+
+    int linesWritten = currentLine-1;
+    assertEquals(2, linesWritten);
   }
 
 }



Mime
View raw message