commons-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From chtom...@apache.org
Subject [1/4] [text] TEXT-2 Jaccard Index and Jaccard Distance
Date Wed, 30 Nov 2016 20:43:20 GMT
Repository: commons-text
Updated Branches:
  refs/heads/master 43ba72325 -> e3bb7f75a


TEXT-2  Jaccard Index and Jaccard Distance


Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/a6a6a274
Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/a6a6a274
Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/a6a6a274

Branch: refs/heads/master
Commit: a6a6a274a6ed92945ae0ea0707b38629c9a01ace
Parents: 5888d49
Author: drajakumar <drajakumar@paypal.com>
Authored: Sun Nov 27 13:59:43 2016 +0530
Committer: drajakumar <drajakumar@paypal.com>
Committed: Sun Nov 27 13:59:43 2016 +0530

----------------------------------------------------------------------
 .../text/similarity/JaccardDistance.java        | 50 ++++++++++++
 .../text/similarity/JaccardSimilarity.java      | 86 ++++++++++++++++++++
 .../text/similarity/JaccardDistanceTest.java    | 66 +++++++++++++++
 .../text/similarity/JaccardSimilarityTest.java  | 66 +++++++++++++++
 4 files changed, 268 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/commons-text/blob/a6a6a274/src/main/java/org/apache/commons/text/similarity/JaccardDistance.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/JaccardDistance.java b/src/main/java/org/apache/commons/text/similarity/JaccardDistance.java
new file mode 100644
index 0000000..0c6d491
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/similarity/JaccardDistance.java
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+/**
+ * Measures the Jaccard distance of two sets of character sequence. Jaccard
+ * distance is the dissimilarity between two sets. Its the complementary of
+ * Jaccard similarity.
+ *
+ * <p>
+ * For further explanation about Jaccard Distance, refer
+ * https://en.wikipedia.org/wiki/Jaccard_index
+ * </p>
+ */
+public class JaccardDistance implements EditDistance<Double> {
+
+    private final JaccardSimilarity jaccardSimilarity = new JaccardSimilarity();
+
+    /**
+     * Calculates Jaccard distance of two set character sequence passed as
+     * input. Calculates Jaccard similarity and returns the complementary of it.
+     * 
+     * @param left first character sequence
+     * @param right second character sequence
+     * @return index
+     * @throws IllegalArgumentException
+     *             if either String input {@code null}
+     */
+    @Override
+    public Double apply(CharSequence left, CharSequence right) {
+        if (left == null || right == null) {
+            throw new IllegalArgumentException("Input cannot be null");
+        }
+        return Math.round((1 - jaccardSimilarity.apply(left, right)) * 100d) / 100d;
+    }
+}

http://git-wip-us.apache.org/repos/asf/commons-text/blob/a6a6a274/src/main/java/org/apache/commons/text/similarity/JaccardSimilarity.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/JaccardSimilarity.java b/src/main/java/org/apache/commons/text/similarity/JaccardSimilarity.java
new file mode 100644
index 0000000..6c78bad
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/similarity/JaccardSimilarity.java
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * Measures the Jaccard similarity (aka Jaccard index) of two sets of character
+ * sequence. Jaccard similarity is the size of the intersection divided by the
+ * size of the union of the two sets.
+ *
+ * <p>
+ * For further explanation about Jaccard Similarity, refer
+ * https://en.wikipedia.org/wiki/Jaccard_index
+ * </p>
+ */
+public class JaccardSimilarity implements EditDistance<Double> {
+
+    /**
+     * Calculates Jaccard Similarity of two set character sequence passed as
+     * input.
+     * 
+     * @param left first character sequence
+     * @param right second character sequence
+     * @return index
+     * @throws IllegalArgumentException
+     *             if either String input {@code null}
+     */
+    @Override
+    public Double apply(CharSequence left, CharSequence right) {
+        if (left == null || right == null) {
+            throw new IllegalArgumentException("Input cannot be null");
+        }
+        return Math.round(calculateJaccardSimilarity(left, right) * 100d) / 100d;
+    }
+
+    /**
+     * Calculates Jaccard Similarity of two set character sequence passed as
+     * input. Does the calculation by identifying the union (characters in at
+     * least one of the two sets) of the two sets and intersection (characters
+     * which are present in set one which are present in set two)
+     * 
+     * @param left first character sequence
+     * @param right second character sequence
+     * @return index
+     */
+    private Double calculateJaccardSimilarity(CharSequence left, CharSequence right) {
+        Set<String> intersectionSet = new HashSet<String>();
+        Set<String> unionSet = new HashSet<String>();
+        boolean unionFilled = false;
+        int leftLength = left.length();
+        int rightLength = right.length();
+        if (leftLength == 0 || rightLength == 0) {
+            return 0d;
+        }
+
+        for (int leftIndex = 0; leftIndex < leftLength; leftIndex++) {
+            unionSet.add(String.valueOf(left.charAt(leftIndex)));
+            for (int rightIndex = 0; rightIndex < rightLength; rightIndex++) {
+                if (!unionFilled) {
+                    unionSet.add(String.valueOf(right.charAt(rightIndex)));
+                }
+                if (left.charAt(leftIndex) == right.charAt(rightIndex)) {
+                    intersectionSet.add(String.valueOf(left.charAt(leftIndex)));
+                }
+            }
+            unionFilled = true;
+        }
+        return Double.valueOf(intersectionSet.size()) / Double.valueOf(unionSet.size());
+    }
+}

http://git-wip-us.apache.org/repos/asf/commons-text/blob/a6a6a274/src/test/java/org/apache/commons/text/similarity/JaccardDistanceTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/commons/text/similarity/JaccardDistanceTest.java b/src/test/java/org/apache/commons/text/similarity/JaccardDistanceTest.java
new file mode 100644
index 0000000..4f3f55d
--- /dev/null
+++ b/src/test/java/org/apache/commons/text/similarity/JaccardDistanceTest.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+import static org.junit.Assert.assertEquals;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/**
+ * Unit tests for {@link org.apache.commons.text.similarity.JaccardDistance}.
+ */
+public class JaccardDistanceTest {
+
+    private static JaccardDistance classBeingTested;
+    
+    @BeforeClass
+    public static void setUp() {
+        classBeingTested = new JaccardDistance();
+    }
+
+    @Test
+    public void testGettingJaccardDistance() {
+        assertEquals(1.00d, classBeingTested.apply("", ""), 0.0d);
+        assertEquals(1.00d, classBeingTested.apply("left", ""), 0.0d);
+        assertEquals(1.00d, classBeingTested.apply("", "right"), 0.0d);
+        assertEquals(0.25d, classBeingTested.apply("frog", "fog"), 0.0d);
+        assertEquals(1.00d, classBeingTested.apply("fly", "ant"), 0.0d);
+        assertEquals(0.78d, classBeingTested.apply("elephant", "hippo"), 0.0d);
+        assertEquals(0.36d, classBeingTested.apply("ABC Corporation", "ABC Corp"), 0.0d);
+        assertEquals(0.24d, classBeingTested.apply("D N H Enterprises Inc", "D & H Enterprises,
Inc."), 0.0d);
+        assertEquals(0.11d, classBeingTested.apply("My Gym Children's Fitness Center", "My
Gym. Childrens Fitness"), 0.0d);
+        assertEquals(0.10d, classBeingTested.apply("PENNSYLVANIA", "PENNCISYLVNIA"), 0.0d);
+        assertEquals(0.87d, classBeingTested.apply("left", "right"), 0.0d);
+        assertEquals(0.87d, classBeingTested.apply("leettteft", "ritttght"), 0.0d);
+    }
+
+    @Test(expected = IllegalArgumentException.class)
+    public void testGettingJaccardDistanceNullNull() throws Exception {
+        classBeingTested.apply(null, null);
+    }
+
+    @Test(expected = IllegalArgumentException.class)
+    public void testGettingJaccardDistanceStringNull() throws Exception {
+        classBeingTested.apply(" ", null);
+    }
+
+    @Test(expected = IllegalArgumentException.class)
+    public void testGettingJaccardDistanceNullString() throws Exception {
+        classBeingTested.apply(null, "right");
+    }
+}

http://git-wip-us.apache.org/repos/asf/commons-text/blob/a6a6a274/src/test/java/org/apache/commons/text/similarity/JaccardSimilarityTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/commons/text/similarity/JaccardSimilarityTest.java b/src/test/java/org/apache/commons/text/similarity/JaccardSimilarityTest.java
new file mode 100644
index 0000000..0e8773c
--- /dev/null
+++ b/src/test/java/org/apache/commons/text/similarity/JaccardSimilarityTest.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+import static org.junit.Assert.assertEquals;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/**
+ * Unit tests for {@link org.apache.commons.text.similarity.JaccardSimilarity}.
+ */
+public class JaccardSimilarityTest {
+
+    private static JaccardSimilarity classBeingTested;
+    
+    @BeforeClass
+    public static void setUp() {
+        classBeingTested = new JaccardSimilarity();
+    }
+
+    @Test
+    public void testGettingJaccardSimilarity() {
+        assertEquals(0.00d, classBeingTested.apply("", ""), 0.0d);
+        assertEquals(0.00d, classBeingTested.apply("left", ""), 0.0d);
+        assertEquals(0.00d, classBeingTested.apply("", "right"), 0.0d);
+        assertEquals(0.75d, classBeingTested.apply("frog", "fog"), 0.0d);
+        assertEquals(0.00d, classBeingTested.apply("fly", "ant"), 0.0d);
+        assertEquals(0.22d, classBeingTested.apply("elephant", "hippo"), 0.0d);
+        assertEquals(0.64d, classBeingTested.apply("ABC Corporation", "ABC Corp"), 0.0d);
+        assertEquals(0.76d, classBeingTested.apply("D N H Enterprises Inc", "D & H Enterprises,
Inc."), 0.0d);
+        assertEquals(0.89d, classBeingTested.apply("My Gym Children's Fitness Center", "My
Gym. Childrens Fitness"), 0.0d);
+        assertEquals(0.9d, classBeingTested.apply("PENNSYLVANIA", "PENNCISYLVNIA"), 0.0d);
+        assertEquals(0.13d, classBeingTested.apply("left", "right"), 0.0d);
+        assertEquals(0.13d, classBeingTested.apply("leettteft", "ritttght"), 0.0d);
+    }
+
+    @Test(expected = IllegalArgumentException.class)
+    public void testGettingJaccardSimilarityNullNull() throws Exception {
+        classBeingTested.apply(null, null);
+    }
+
+    @Test(expected = IllegalArgumentException.class)
+    public void testGettingJaccardSimilarityStringNull() throws Exception {
+        classBeingTested.apply(" ", null);
+    }
+
+    @Test(expected = IllegalArgumentException.class)
+    public void testGettingJaccardSimilarityNullString() throws Exception {
+        classBeingTested.apply(null, "right");
+    }
+}


Mime
View raw message