Author: srowen
Date: Thu Jun 17 06:24:28 2010
New Revision: 955489
URL: http://svn.apache.org/viewvc?rev=955489&view=rev
Log:
MAHOUT-407 committed per Sebastian
Added:
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CapSimilaritiesPerItemKeyWritable.java
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CapSimilaritiesPerItemMapper.java
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CapSimilaritiesPerItemReducer.java
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/RemoveDuplicatesReducer.java
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/SimilarItemWritable.java
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java
mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java
Added: mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CapSimilaritiesPerItemKeyWritable.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CapSimilaritiesPerItemKeyWritable.java?rev=955489&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CapSimilaritiesPerItemKeyWritable.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CapSimilaritiesPerItemKeyWritable.java Thu Jun 17 06:24:28 2010
@@ -0,0 +1,142 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.similarity.item;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.io.WritableComparator;
+import org.apache.hadoop.mapreduce.Partitioner;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.math.Varint;
+
+/**
+ * Used as the key for the {@link CapSimilaritiesPerItemReducer} to collect all items similar to the item with the given itemID.
+ *
+ * A secondary sort ensures that the similar items are seen in descending order by their similarity value.
+ */
+public class CapSimilaritiesPerItemKeyWritable implements WritableComparable<CapSimilaritiesPerItemKeyWritable> {
+
+ private long itemID;
+ private double associatedSimilarity;
+
+ static {
+ WritableComparator.define(CapSimilaritiesPerItemKeyWritable.class, new CapSimilaritiesPerItemKeyComparator());
+ }
+
+ public CapSimilaritiesPerItemKeyWritable() {
+ super();
+ }
+
+ public CapSimilaritiesPerItemKeyWritable(long itemID, double associatedSimilarity) {
+ super();
+ this.itemID = itemID;
+ this.associatedSimilarity = associatedSimilarity;
+ }
+
+ public long getItemID() {
+ return itemID;
+ }
+
+ public double getAssociatedSimilarity() {
+ return associatedSimilarity;
+ }
+
+ @Override
+ public void readFields(DataInput in) throws IOException {
+ itemID = Varint.readSignedVarLong(in);
+ associatedSimilarity = in.readDouble();
+ }
+
+ @Override
+ public void write(DataOutput out) throws IOException {
+ Varint.writeSignedVarLong(itemID, out);
+ out.writeDouble(associatedSimilarity);
+ }
+
+ @Override
+ public int compareTo(CapSimilaritiesPerItemKeyWritable other) {
+ return (itemID == other.itemID) ? 0 : (itemID < other.itemID) ? -1 : 1;
+ }
+
+ @Override
+ public int hashCode() {
+ return RandomUtils.hashLong(itemID);
+ }
+
+ @Override
+ public boolean equals(Object other) {
+ if (other instanceof CapSimilaritiesPerItemKeyWritable) {
+ return itemID == ((CapSimilaritiesPerItemKeyWritable)other).itemID;
+ }
+ return false;
+ }
+
+ public static class CapSimilaritiesPerItemKeyComparator extends WritableComparator {
+
+ public CapSimilaritiesPerItemKeyComparator() {
+ super(CapSimilaritiesPerItemKeyWritable.class, true);
+ }
+
+ @Override
+ public int compare(WritableComparable a, WritableComparable b) {
+ CapSimilaritiesPerItemKeyWritable capKey1 = (CapSimilaritiesPerItemKeyWritable) a;
+ CapSimilaritiesPerItemKeyWritable capKey2 = (CapSimilaritiesPerItemKeyWritable) b;
+
+ int result = compare(capKey1.getItemID(), capKey2.getItemID());
+ if (result == 0) {
+ result = -1 * compare(capKey1.getAssociatedSimilarity(), capKey2.getAssociatedSimilarity());
+ }
+ return result;
+ }
+
+ protected static int compare(long a, long b) {
+ return (a == b) ? 0 : (a < b) ? -1 : 1;
+ }
+
+ protected static int compare(double a, double b) {
+ return (a == b) ? 0 : (a < b) ? -1 : 1;
+ }
+ }
+
+ public static class CapSimilaritiesPerItemKeyPartitioner
+ extends Partitioner<CapSimilaritiesPerItemKeyWritable,SimilarItemWritable> {
+
+ @Override
+ public int getPartition(CapSimilaritiesPerItemKeyWritable key, SimilarItemWritable value, int numPartitions) {
+ return (key.hashCode() * 127 & Integer.MAX_VALUE) % numPartitions; // mask the sign bit so the partition index is never negative
+ }
+ }
+
+
+ public static class CapSimilaritiesPerItemKeyGroupingComparator extends WritableComparator {
+
+ public CapSimilaritiesPerItemKeyGroupingComparator() {
+ super(CapSimilaritiesPerItemKeyWritable.class, true);
+ }
+
+ @Override
+ public int compare(WritableComparable a, WritableComparable b) {
+ CapSimilaritiesPerItemKeyWritable capKey1 = (CapSimilaritiesPerItemKeyWritable) a;
+ CapSimilaritiesPerItemKeyWritable capKey2 = (CapSimilaritiesPerItemKeyWritable) b;
+ return capKey1.compareTo(capKey2);
+ }
+ }
+}
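
The key class above encodes Hadoop's standard secondary-sort pattern: the raw comparator orders keys for the same item by descending similarity, while the grouping comparator ignores the similarity so that all candidates for one item arrive in a single reduce() call. A minimal sketch of that contract, assuming the classes from this commit plus hadoop-core and mahout-core are on the classpath; the class name SecondarySortSketch is made up for illustration.

package org.apache.mahout.cf.taste.hadoop.similarity.item;

/* Hypothetical illustration, not part of the commit. */
public final class SecondarySortSketch {

  public static void main(String[] args) {
    CapSimilaritiesPerItemKeyWritable strong = new CapSimilaritiesPerItemKeyWritable(1L, 0.9);
    CapSimilaritiesPerItemKeyWritable weak = new CapSimilaritiesPerItemKeyWritable(1L, 0.2);

    // sort comparator: same itemID, so the higher similarity sorts first (negative result)
    int sortOrder = new CapSimilaritiesPerItemKeyWritable.CapSimilaritiesPerItemKeyComparator()
        .compare(strong, weak);

    // grouping comparator: only the itemID counts, so both keys reach the same reduce() call (zero)
    int groupOrder = new CapSimilaritiesPerItemKeyWritable.CapSimilaritiesPerItemKeyGroupingComparator()
        .compare(strong, weak);

    System.out.println(sortOrder);  // -1
    System.out.println(groupOrder); // 0
  }
}
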
Added: mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CapSimilaritiesPerItemMapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CapSimilaritiesPerItemMapper.java?rev=955489&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CapSimilaritiesPerItemMapper.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CapSimilaritiesPerItemMapper.java Thu Jun 17 06:24:28 2010
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.similarity.item;
+
+import java.io.IOException;
+
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.cf.taste.hadoop.EntityEntityWritable;
+
+/**
+ * Emits each pair of similar items in both directions so that all items similar to an item can be collected in
+ * the {@link CapSimilaritiesPerItemReducer}.
+ *
+ */
+public class CapSimilaritiesPerItemMapper
+ extends Mapper<EntityEntityWritable,DoubleWritable,CapSimilaritiesPerItemKeyWritable,SimilarItemWritable> {
+
+ @Override
+ protected void map(EntityEntityWritable itemPair, DoubleWritable similarity, Context ctx)
+ throws IOException, InterruptedException {
+
+ long itemIDA = itemPair.getAID();
+ long itemIDB = itemPair.getBID();
+ double value = similarity.get();
+
+ ctx.write(new CapSimilaritiesPerItemKeyWritable(itemIDA, value), new SimilarItemWritable(itemIDB, value));
+ ctx.write(new CapSimilaritiesPerItemKeyWritable(itemIDB, value), new SimilarItemWritable(itemIDA, value));
+ }
+}
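
For one similarity record the mapper above writes two key/value pairs, one keyed by each side of the pair, so the downstream reducer can cap the similar-item list of both items. A small sketch of those two records, assuming the commit's classes are on the classpath; MapperOutputSketch is a made-up name.

package org.apache.mahout.cf.taste.hadoop.similarity.item;

/* Hypothetical illustration, not part of the commit. */
public final class MapperOutputSketch {

  public static void main(String[] args) {
    long itemIDA = 1L;
    long itemIDB = 5L;
    double similarity = 0.89;

    // record used to build item A's similar-item list
    CapSimilaritiesPerItemKeyWritable keyA = new CapSimilaritiesPerItemKeyWritable(itemIDA, similarity);
    SimilarItemWritable valueA = new SimilarItemWritable(itemIDB, similarity);

    // record used to build item B's similar-item list
    CapSimilaritiesPerItemKeyWritable keyB = new CapSimilaritiesPerItemKeyWritable(itemIDB, similarity);
    SimilarItemWritable valueB = new SimilarItemWritable(itemIDA, similarity);

    System.out.println(keyA.getItemID() + " -> " + valueA.getItemID() + " (" + valueA.getValue() + ')');
    System.out.println(keyB.getItemID() + " -> " + valueB.getItemID() + " (" + valueB.getValue() + ')');
  }
}
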
Added: mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CapSimilaritiesPerItemReducer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CapSimilaritiesPerItemReducer.java?rev=955489&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CapSimilaritiesPerItemReducer.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CapSimilaritiesPerItemReducer.java Thu Jun 17 06:24:28 2010
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.similarity.item;
+
+import java.io.IOException;
+
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.cf.taste.hadoop.EntityEntityWritable;
+
+/**
+ * This reducer sees all similar items for an item in descending order of similarity value and writes out
+ * at most as many as specified in the "maxSimilaritiesPerItem" option of {@link ItemSimilarityJob}.
+ */
+public class CapSimilaritiesPerItemReducer
+ extends Reducer<CapSimilaritiesPerItemKeyWritable,SimilarItemWritable,EntityEntityWritable,DoubleWritable> {
+
+ private int maxSimilaritiesPerItem;
+
+ @Override
+ protected void setup(Context ctx) throws IOException, InterruptedException {
+ super.setup(ctx);
+ maxSimilaritiesPerItem = ctx.getConfiguration().getInt(ItemSimilarityJob.MAX_SIMILARITIES_PER_ITEM, -1);
+ if (maxSimilaritiesPerItem < 1) {
+ throw new IllegalStateException("Maximum similar items per item was not set correctly");
+ }
+ }
+
+ @Override
+ protected void reduce(CapSimilaritiesPerItemKeyWritable capKey, Iterable<SimilarItemWritable> similarItems,
+ Context ctx) throws IOException, InterruptedException {
+ long itemAID = capKey.getItemID();
+
+ /* we see the similar items in descending value order because of secondary sort */
+ int n = 0;
+ for (SimilarItemWritable similarItem : similarItems) {
+ long itemBID = similarItem.getItemID();
+ EntityEntityWritable itemPair = toItemPair(itemAID, itemBID);
+ ctx.write(itemPair, new DoubleWritable(similarItem.getValue()));
+
+ if (++n == maxSimilaritiesPerItem) {
+ break;
+ }
+ }
+ }
+
+ protected EntityEntityWritable toItemPair(long itemAID, long itemBID) {
+ /* smaller ID first */
+ if (itemAID < itemBID) {
+ return new EntityEntityWritable(itemAID, itemBID);
+ } else {
+ return new EntityEntityWritable(itemBID, itemAID);
+ }
+ }
+}
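
Because both endpoints of a pair may keep it when their lists are capped, the reducer can emit the same canonical pair twice; toItemPair() puts the smaller ID first precisely so those duplicates become identical records that the RemoveDuplicatesReducer added below can collapse. A short sketch, assuming the commit's classes are on the classpath; CanonicalPairSketch is a made-up name.

package org.apache.mahout.cf.taste.hadoop.similarity.item;

import org.apache.mahout.cf.taste.hadoop.EntityEntityWritable;

/* Hypothetical illustration, not part of the commit. */
public final class CanonicalPairSketch {

  public static void main(String[] args) {
    CapSimilaritiesPerItemReducer reducer = new CapSimilaritiesPerItemReducer();

    // item 1 keeps its similarity to item 5, and item 5 keeps its similarity to item 1
    EntityEntityWritable fromItem1 = reducer.toItemPair(1L, 5L);
    EntityEntityWritable fromItem5 = reducer.toItemPair(5L, 1L);

    // both emissions are the same canonical pair (1, 5), so the dedup phase writes it only once
    System.out.println(fromItem1.equals(fromItem5)); // true
  }
}
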
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java?rev=955489&r1=955488&r2=955489&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java Thu Jun 17 06:24:28 2010
@@ -32,6 +32,7 @@ import org.apache.hadoop.io.DoubleWritab
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
@@ -48,14 +49,14 @@ import org.apache.mahout.math.VarIntWrit
import org.apache.mahout.math.VarLongWritable;
/**
- * <p>Runs a completely distributed computation of the cosine distance of the itemvectors of the user-item-matrix
+ * <p>Runs a completely distributed computation of the similarity of the itemvectors of the user-item-matrix
* as a series of mapreduces.</p>
*
* <p>Algorithm used is a slight modification from the algorithm described in
* http://www.umiacs.umd.edu/~jimmylin/publications/Elsayed_etal_ACL2008_short.pdf</p>
*
* <pre>
- * Example:
+ * Example using cosine distance:
*
* user-item-matrix:
*
@@ -97,6 +98,8 @@ import org.apache.mahout.math.VarLongWri
* <li>-Dmapred.output.dir=(path): output path where the computation's output should go</li>
* <li>--similarityClassname (classname): an implementation of {@link DistributedItemSimilarity} used to compute the
* similarity</li>
+ * <li>--maxSimilaritiesPerItem (integer): try to cap the number of similar items per item to this number
+ * (default: 100)</li>
* </ol>
*
*
@@ -109,10 +112,15 @@ import org.apache.mahout.math.VarLongWri
public final class ItemSimilarityJob extends AbstractJob {
public static final String DISTRIBUTED_SIMILARITY_CLASSNAME =
- "org.apache.mahout.cf.taste.hadoop.similarity.item.ItemSimilarityJob.distributedSimilarityClassname";
+ "org.apache.mahout.cf.taste.hadoop.similarity.item.ItemSimilarityJob.distributedSimilarityClassname";
public static final String NUMBER_OF_USERS =
- "org.apache.mahout.cf.taste.hadoop.similarity.item.ItemSimilarityJob.numberOfUsers";
+ "org.apache.mahout.cf.taste.hadoop.similarity.item.ItemSimilarityJob.numberOfUsers";
+
+ public static final String MAX_SIMILARITIES_PER_ITEM =
+ "org.apache.mahout.cf.taste.hadoop.similarity.item.ItemSimilarityJob.maxSimilaritiesPerItem";
+
+ private static final Integer DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM = 100;
@Override
public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
@@ -120,6 +128,8 @@ public final class ItemSimilarityJob ext
addInputOption();
addOutputOption();
addOption("similarityClassname", "s", "Name of distributed similarity class to instantiate");
+ addOption("maxSimilaritiesPerItem", "m", "try to cap the number of similar items per
item to this number " +
+ "(default: " + DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM + ")", String.valueOf(DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM));
Map<String,String> parsedArgs = parseArguments(args);
if (parsedArgs == null) {
@@ -127,6 +137,7 @@ public final class ItemSimilarityJob ext
}
String distributedSimilarityClassname = parsedArgs.get("--similarityClassname");
+ int maxSimilaritiesPerItem = Integer.parseInt(parsedArgs.get("--maxSimilaritiesPerItem"));
Path inputPath = getInputPath();
Path outputPath = getOutputPath();
@@ -135,6 +146,8 @@ public final class ItemSimilarityJob ext
Path countUsersPath = new Path(tempDirPath, "countUsers");
Path itemVectorsPath = new Path(tempDirPath, "itemVectors");
Path userVectorsPath = new Path(tempDirPath, "userVectors");
+ Path similaritiesPath = new Path(tempDirPath, "similarities");
+ Path cappedSimilaritiesPath = new Path(tempDirPath, "cappedSimilarities");
AtomicInteger currentPhase = new AtomicInteger();
@@ -186,7 +199,7 @@ public final class ItemSimilarityJob ext
if (shouldRunNextPhase(parsedArgs, currentPhase)) {
Job similarity = prepareJob(userVectorsPath,
- outputPath,
+ similaritiesPath,
SequenceFileInputFormat.class,
CopreferredItemsMapper.class,
ItemPairWritable.class,
@@ -194,7 +207,7 @@ public final class ItemSimilarityJob ext
SimilarityReducer.class,
EntityEntityWritable.class,
DoubleWritable.class,
- TextOutputFormat.class);
+ SequenceFileOutputFormat.class);
Configuration conf = similarity.getConfiguration();
int numberOfUsers = readNumberOfUsers(conf, countUsersPath);
conf.set(DISTRIBUTED_SIMILARITY_CLASSNAME, distributedSimilarityClassname);
@@ -202,6 +215,40 @@ public final class ItemSimilarityJob ext
similarity.waitForCompletion(true);
}
+ if (shouldRunNextPhase(parsedArgs, currentPhase)) {
+ Job capSimilaritiesPerItem = prepareJob(similaritiesPath,
+ cappedSimilaritiesPath,
+ SequenceFileInputFormat.class,
+ CapSimilaritiesPerItemMapper.class,
+ CapSimilaritiesPerItemKeyWritable.class,
+ SimilarItemWritable.class,
+ CapSimilaritiesPerItemReducer.class,
+ EntityEntityWritable.class,
+ DoubleWritable.class,
+ SequenceFileOutputFormat.class);
+
+ capSimilaritiesPerItem.getConfiguration().setInt(MAX_SIMILARITIES_PER_ITEM, maxSimilaritiesPerItem);
+ capSimilaritiesPerItem.setPartitionerClass(
+ CapSimilaritiesPerItemKeyWritable.CapSimilaritiesPerItemKeyPartitioner.class);
+ capSimilaritiesPerItem.setGroupingComparatorClass(
+ CapSimilaritiesPerItemKeyWritable.CapSimilaritiesPerItemKeyGroupingComparator.class);
+ capSimilaritiesPerItem.waitForCompletion(true);
+ }
+
+ if (shouldRunNextPhase(parsedArgs, currentPhase)) {
+ Job removeDuplicates = prepareJob(cappedSimilaritiesPath,
+ outputPath,
+ SequenceFileInputFormat.class,
+ Mapper.class,
+ EntityEntityWritable.class,
+ DoubleWritable.class,
+ RemoveDuplicatesReducer.class,
+ EntityEntityWritable.class,
+ DoubleWritable.class,
+ TextOutputFormat.class);
+ removeDuplicates.waitForCompletion(true);
+ }
+
return 0;
}
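
With the two extra phases wired in, the job is driven as before, now with the optional --maxSimilaritiesPerItem flag. A sketch of a programmatic invocation, assuming the usual ToolRunner entry point for Mahout's AbstractJob-based jobs; all paths, the cap of 10, and the driver class name RunItemSimilaritySketch are placeholders.

package org.apache.mahout.cf.taste.hadoop.similarity.item;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

/* Hypothetical driver, not part of the commit. */
public final class RunItemSimilaritySketch {

  public static void main(String[] args) throws Exception {
    ToolRunner.run(new Configuration(), new ItemSimilarityJob(), new String[] {
        "-Dmapred.input.dir=/path/to/prefs.csv",   // userID,itemID,preference per line
        "-Dmapred.output.dir=/path/to/output",
        "--tempDir", "/path/to/temp",
        "--similarityClassname",
        "org.apache.mahout.cf.taste.hadoop.similarity.DistributedUncenteredZeroAssumingCosineSimilarity",
        "--maxSimilaritiesPerItem", "10"           // keep at most 10 similar items per item
    });
  }
}
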
Added: mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/RemoveDuplicatesReducer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/RemoveDuplicatesReducer.java?rev=955489&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/RemoveDuplicatesReducer.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/RemoveDuplicatesReducer.java Thu Jun 17 06:24:28 2010
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.similarity.item;
+
+import java.io.IOException;
+
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.cf.taste.hadoop.EntityEntityWritable;
+
+/**
+ * Makes sure that every pair of similar items is written only once.
+ */
+public class RemoveDuplicatesReducer
+ extends Reducer<EntityEntityWritable,DoubleWritable,EntityEntityWritable,DoubleWritable> {
+
+ @Override
+ protected void reduce(EntityEntityWritable itemPair, Iterable<DoubleWritable> values, Context ctx)
+ throws IOException, InterruptedException {
+ DoubleWritable value = values.iterator().next();
+ ctx.write(itemPair, value);
+ }
+}
Added: mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/SimilarItemWritable.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/SimilarItemWritable.java?rev=955489&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/SimilarItemWritable.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/SimilarItemWritable.java Thu Jun 17 06:24:28 2010
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.similarity.item;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.math.Varint;
+
+public class SimilarItemWritable implements Writable {
+
+ private long itemID;
+ private double value;
+
+ public SimilarItemWritable() {
+ super();
+ }
+
+ public SimilarItemWritable(long itemID, double value) {
+ super();
+ this.itemID = itemID;
+ this.value = value;
+ }
+
+ public long getItemID() {
+ return itemID;
+ }
+
+ public double getValue() {
+ return value;
+ }
+
+ @Override
+ public void readFields(DataInput in) throws IOException {
+ itemID = Varint.readSignedVarLong(in);
+ value = in.readDouble();
+ }
+
+ @Override
+ public void write(DataOutput out) throws IOException {
+ Varint.writeSignedVarLong(itemID, out);
+ out.writeDouble(value);
+ }
+
+ @Override
+ public int hashCode() {
+ return RandomUtils.hashLong(itemID);
+ }
+
+ @Override
+ public boolean equals(Object other) {
+ if (other instanceof SimilarItemWritable) {
+ return (itemID == ((SimilarItemWritable)other).itemID);
+ }
+ return false;
+ }
+}
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java?rev=955489&r1=955488&r2=955489&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java Thu Jun 17 06:24:28 2010
@@ -42,6 +42,7 @@ import org.apache.mahout.cf.taste.hadoop
import org.apache.mahout.cf.taste.hadoop.EntityPrefWritableArrayWritable;
import org.apache.mahout.cf.taste.hadoop.ToUserPrefsMapper;
import org.apache.mahout.cf.taste.hadoop.similarity.CoRating;
+import org.apache.mahout.cf.taste.hadoop.similarity.DistributedTanimotoCoefficientSimilarity;
import org.apache.mahout.cf.taste.hadoop.similarity.DistributedUncenteredZeroAssumingCosineSimilarity;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.math.VarIntWritable;
@@ -280,7 +281,63 @@ public final class ItemSimilarityTest ex
SimilarityReducer reducer = new SimilarityReducer();
reducer.setup(context);
reducer.reduce(new ItemPairWritable(12L, 34L, 2.0, 10.0),
- Arrays.asList(new CoRating(2.5f, 2.0f),new CoRating(2.0f, 2.5f)), context);
+ Arrays.asList(new CoRating(2.5f, 2.0f), new CoRating(2.0f, 2.5f)), context);
+
+ EasyMock.verify(context);
+ }
+
+ public void testCapSimilaritiesPerItemMapper() throws Exception {
+ Mapper<EntityEntityWritable,DoubleWritable,CapSimilaritiesPerItemKeyWritable,SimilarItemWritable>.Context context =
+ EasyMock.createMock(Mapper.Context.class);
+
+ context.write(new CapSimilaritiesPerItemKeyWritable(1L, 0.89d), new SimilarItemWritable(5L, 0.89d));
+ context.write(new CapSimilaritiesPerItemKeyWritable(5L, 0.89d), new SimilarItemWritable(1L, 0.89d));
+
+ EasyMock.replay(context);
+
+ CapSimilaritiesPerItemMapper mapper = new CapSimilaritiesPerItemMapper();
+ EntityEntityWritable itemPair = new EntityEntityWritable(1L, 5L);
+ mapper.map(itemPair, new DoubleWritable(0.89d), context);
+
+ EasyMock.verify(context);
+ }
+
+ public void testCapSimilaritiesPerItemReducer() throws Exception {
+ Reducer<CapSimilaritiesPerItemKeyWritable,SimilarItemWritable,EntityEntityWritable,DoubleWritable>.Context context =
+ EasyMock.createMock(Reducer.Context.class);
+
+ Configuration conf = new Configuration();
+ EasyMock.expect(context.getConfiguration()).andStubReturn(conf);
+ conf.setInt(ItemSimilarityJob.MAX_SIMILARITIES_PER_ITEM, 2);
+
+ context.write(new EntityEntityWritable(1L, 3L), new DoubleWritable(0.9d));
+ context.write(new EntityEntityWritable(1L, 6L), new DoubleWritable(0.7d));
+
+ EasyMock.replay(context);
+
+ CapSimilaritiesPerItemReducer reducer = new CapSimilaritiesPerItemReducer();
+
+ List<SimilarItemWritable> similarItems = Arrays.asList(new SimilarItemWritable(3L, 0.9d),
+ new SimilarItemWritable(6L, 0.7d), new SimilarItemWritable(123L, 0.2d));
+
+ reducer.setup(context);
+ reducer.reduce(new CapSimilaritiesPerItemKeyWritable(1L, 1d), similarItems, context);
+
+ EasyMock.verify(context);
+ }
+
+ public void testRemoveDuplicatesReducer() throws Exception {
+ Reducer<EntityEntityWritable,DoubleWritable,EntityEntityWritable,DoubleWritable>.Context context =
+ EasyMock.createMock(Reducer.Context.class);
+
+ context.write(new EntityEntityWritable(1L, 2L), new DoubleWritable(0.5d));
+
+ EasyMock.replay(context);
+
+ List<DoubleWritable> values = Arrays.asList(new DoubleWritable(0.5d), new DoubleWritable(0.5d));
+
+ RemoveDuplicatesReducer reducer = new RemoveDuplicatesReducer();
+ reducer.reduce(new EntityEntityWritable(1L, 2L), values, context);
EasyMock.verify(context);
}
@@ -321,7 +378,7 @@ public final class ItemSimilarityTest ex
similarityJob.setConf(conf);
similarityJob.run(new String[] { "--tempDir", tmpDir.getAbsolutePath(), "--similarityClassname",
- "org.apache.mahout.cf.taste.hadoop.similarity.DistributedUncenteredZeroAssumingCosineSimilarity"});
+ DistributedUncenteredZeroAssumingCosineSimilarity.class.getName()});
File countUsersPart = new File(tmpDir, "countUsers");
int numberOfUsers = ItemSimilarityJob.readNumberOfUsers(new Configuration(),
@@ -364,7 +421,102 @@ public final class ItemSimilarityTest ex
int linesWritten = currentLine-1;
assertEquals(2, linesWritten);
+ }
+ public void testMaxSimilaritiesPerItem() throws Exception {
+
+ File inputFile = getTestTempFile("prefsForMaxSimilarities.txt");
+ File outputDir = getTestTempDir("output");
+ outputDir.delete();
+ File tmpDir = getTestTempDir("tmp");
+
+ /* user-item-matrix
+
+ i1 i2 i3
+ u1 1 0 1
+ u2 0 1 1
+ u3 1 1 0
+ u4 1 1 1
+ u5 0 1 0
+ u6 1 1 0
+
+ tanimoto(i1,i2) = 0.5
+ tanimoto(i2,i3) = 0.333
+ tanimoto(i3,i1) = 0.4
+
+ When we set maxSimilaritiesPerItem to 1 the following pairs should be found:
+
+ i1 --> i2
+ i2 --> i1
+ i3 --> i1
+
+ */
+
+ BufferedWriter writer = new BufferedWriter(new FileWriter(inputFile));
+ try {
+ writer.write("1,1,1\n" +
+ "1,3,1\n" +
+ "2,2,1\n" +
+ "2,3,1\n" +
+ "3,1,1\n" +
+ "3,2,1\n" +
+ "4,1,1\n" +
+ "4,2,1\n" +
+ "4,3,1\n" +
+ "5,2,1\n" +
+ "6,1,1\n" +
+ "6,2,1\n");
+ } finally {
+ writer.close();
+ }
+
+ ItemSimilarityJob similarityJob = new ItemSimilarityJob();
+
+ Configuration conf = new Configuration();
+ conf.set("mapred.input.dir", inputFile.getAbsolutePath());
+ conf.set("mapred.output.dir", outputDir.getAbsolutePath());
+ conf.setBoolean("mapred.output.compress", false);
+
+ similarityJob.setConf(conf);
+
+ similarityJob.run(new String[] { "--tempDir", tmpDir.getAbsolutePath(), "--similarityClassname",
+ DistributedTanimotoCoefficientSimilarity.class.getName(), "--maxSimilaritiesPerItem", "1"});
+
+ File outPart = outputDir.listFiles(new FilenameFilter() {
+ @Override
+ public boolean accept(File dir, String name) {
+ return name.startsWith("part-");
+ }
+ })[0];
+ BufferedReader reader = new BufferedReader(new FileReader(outPart));
+
+ String line;
+ int currentLine = 1;
+ while ( (line = reader.readLine()) != null) {
+
+ String[] tokens = line.split("\t");
+
+ long itemAID = Long.parseLong(tokens[0]);
+ long itemBID = Long.parseLong(tokens[1]);
+ double similarity = Double.parseDouble(tokens[2]);
+
+ if (currentLine == 1) {
+ assertEquals(1L, itemAID);
+ assertEquals(2L, itemBID);
+ assertEquals(0.5d, similarity, 0.0001d);
+ }
+
+ if (currentLine == 2) {
+ assertEquals(1L, itemAID);
+ assertEquals(3L, itemBID);
+ assertEquals(0.4, similarity, 0.0001d);
+ }
+
+ currentLine++;
+ }
+
+ int linesWritten = currentLine-1;
+ assertEquals(2, linesWritten);
}
}
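
For reference, the Tanimoto coefficients quoted in the comment of testMaxSimilaritiesPerItem() can be re-derived from the user sets of the three items as |intersection| / |union|. A standalone sketch of that arithmetic, not tied to Mahout's DistributedTanimotoCoefficientSimilarity; TanimotoSketch is a made-up name.

package org.apache.mahout.cf.taste.hadoop.similarity.item;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

/* Hypothetical illustration, not part of the commit. */
public final class TanimotoSketch {

  static double tanimoto(Set<Long> usersOfA, Set<Long> usersOfB) {
    Set<Long> intersection = new HashSet<Long>(usersOfA);
    intersection.retainAll(usersOfB);
    Set<Long> union = new HashSet<Long>(usersOfA);
    union.addAll(usersOfB);
    return (double) intersection.size() / union.size();
  }

  public static void main(String[] args) {
    Set<Long> i1 = new HashSet<Long>(Arrays.asList(1L, 3L, 4L, 6L));      // users of item 1: u1, u3, u4, u6
    Set<Long> i2 = new HashSet<Long>(Arrays.asList(2L, 3L, 4L, 5L, 6L));  // users of item 2: u2, u3, u4, u5, u6
    Set<Long> i3 = new HashSet<Long>(Arrays.asList(1L, 2L, 4L));          // users of item 3: u1, u2, u4

    System.out.println(tanimoto(i1, i2)); // 3/6 = 0.5
    System.out.println(tanimoto(i2, i3)); // 2/6 = 0.333...
    System.out.println(tanimoto(i3, i1)); // 2/5 = 0.4
  }
}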