metron-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From o...@apache.org
Subject [1/2] metron git commit: METRON-1061 Add Fuzzy String Scoring to Stellar (ottobackwards) closes apache/metron#667
Date Sat, 26 Aug 2017 15:20:58 GMT
Repository: metron
Updated Branches:
  refs/heads/master a2bae0bce -> c8e84fa3b


METRON-1061 Add Fuzzy String Scoring to Stellar (ottobackwards) closes apache/metron#667


Project: http://git-wip-us.apache.org/repos/asf/metron/repo
Commit: http://git-wip-us.apache.org/repos/asf/metron/commit/d5dbfc20
Tree: http://git-wip-us.apache.org/repos/asf/metron/tree/d5dbfc20
Diff: http://git-wip-us.apache.org/repos/asf/metron/diff/d5dbfc20

Branch: refs/heads/master
Commit: d5dbfc20cb42708c27e9a02c5f7eacac98604745
Parents: a2bae0b
Author: ottobackwards <ottobackwards@gmail.com>
Authored: Sat Aug 26 10:46:01 2017 -0400
Committer: otto <otto@apache.org>
Committed: Sat Aug 26 10:46:01 2017 -0400

----------------------------------------------------------------------
 dependencies_with_url.csv                       |   1 +
 metron-stellar/stellar-common/README.md         |  14 +++
 metron-stellar/stellar-common/pom.xml           |   5 +
 .../stellar/dsl/functions/TextFunctions.java    | 112 +++++++++++++++++++
 .../dsl/functions/TextFunctionsTest.java        | 101 +++++++++++++++++
 5 files changed, 233 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/metron/blob/d5dbfc20/dependencies_with_url.csv
----------------------------------------------------------------------
diff --git a/dependencies_with_url.csv b/dependencies_with_url.csv
index 83078ad..fac1164 100644
--- a/dependencies_with_url.csv
+++ b/dependencies_with_url.csv
@@ -177,6 +177,7 @@ commons-logging:commons-logging:jar:1.1.3:compile,ASLv2,http://commons.apache.or
 commons-logging:commons-logging:jar:1.2:compile,ASLv2,http://commons.apache.org/proper/commons-logging/
 commons-net:commons-net:jar:3.1:compile,ASLv2,http://commons.apache.org/net/
 commons-net:commons-net:jar:3.1:provided,ASLv2,http://commons.apache.org/net/
+commons-text:commons-text:jar:1.1:compile,ASLv2,http://commons.apache.org/proper/commons-text/
 commons-validator:commons-validator:jar:1.4.0:compile,ASLv2,http://commons.apache.org/validator/
 commons-validator:commons-validator:jar:1.5.1:compile,ASLv2,http://commons.apache.org/proper/commons-validator/
 commons-validator:commons-validator:jar:1.6:compile,ASLv2,http://commons.apache.org/proper/commons-validator/

http://git-wip-us.apache.org/repos/asf/metron/blob/d5dbfc20/metron-stellar/stellar-common/README.md
----------------------------------------------------------------------
diff --git a/metron-stellar/stellar-common/README.md b/metron-stellar/stellar-common/README.md
index a25c831..8746e60 100644
--- a/metron-stellar/stellar-common/README.md
+++ b/metron-stellar/stellar-common/README.md
@@ -131,6 +131,8 @@ In the core language functions, we support basic functional programming
primitiv
 | [ `FILL_RIGHT`](#fill_right)                                                          
            |
 | [ `FILTER`](#filter)                                                                  
            |
 | [ `FLOOR`](#floor)                                                               |
+| [ `FUZZY_LANGS`](#fuzzy_langs)                                                   |
+| [ `FUZZY_SCORE`](#fuzzy_score)                                                   |
 | [ `FORMAT`](#format)                                                                  
            |
 | [ `GEO_GET`](#geo_get)                                                                
            |
 | [ `GET`](#get)                                                                        
            |
@@ -412,6 +414,18 @@ In the core language functions, we support basic functional programming
primitiv
     * format - string
     * arguments... - object(s)
   * Returns: A formatted string.
+  
+### `FUZZY_LANGS`
+  * Description: Returns a list of IETF BCP 47 language tags available to the system, such as en, fr, de.
+  * Returns: A list of IETF BCP 47 language tag strings
+
+### `FUZZY_SCORE`
+  * Description: Returns the Fuzzy Score which indicates the similarity score between two
strings. One point is given for every matched character. Subsequent matches yield two bonus
points. A higher score indicates a higher similarity.
+  * Input:
+    * string - The full term that should be matched against.
+    * string - The query that will be matched against a term.
+    * string - The IETF BCP 47 language code to use.
+  * Returns: An Integer representing the score.
 
 ### `GEO_GET`
   * Description: Look up an IPV4 address and returns geographic information about it

http://git-wip-us.apache.org/repos/asf/metron/blob/d5dbfc20/metron-stellar/stellar-common/pom.xml
----------------------------------------------------------------------
diff --git a/metron-stellar/stellar-common/pom.xml b/metron-stellar/stellar-common/pom.xml
index 2f4cb6e..5945bbd 100644
--- a/metron-stellar/stellar-common/pom.xml
+++ b/metron-stellar/stellar-common/pom.xml
@@ -97,6 +97,11 @@
             <version>1.10</version>
         </dependency>
         <dependency>
+            <groupId>org.apache.commons</groupId>
+            <artifactId>commons-text</artifactId>
+            <version>1.1</version>
+        </dependency>
+        <dependency>
             <groupId>commons-validator</groupId>
             <artifactId>commons-validator</artifactId>
             <version>1.6</version>

http://git-wip-us.apache.org/repos/asf/metron/blob/d5dbfc20/metron-stellar/stellar-common/src/main/java/org/apache/metron/stellar/dsl/functions/TextFunctions.java
----------------------------------------------------------------------
diff --git a/metron-stellar/stellar-common/src/main/java/org/apache/metron/stellar/dsl/functions/TextFunctions.java
b/metron-stellar/stellar-common/src/main/java/org/apache/metron/stellar/dsl/functions/TextFunctions.java
new file mode 100644
index 0000000..01e5da4
--- /dev/null
+++ b/metron-stellar/stellar-common/src/main/java/org/apache/metron/stellar/dsl/functions/TextFunctions.java
@@ -0,0 +1,112 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements.  See the NOTICE file distributed with this work for additional information
regarding
+ * copyright ownership.  The ASF licenses this file to you under the Apache License, Version
2.0
+ * (the "License"); you may not use this file except in compliance with the License.  You
may obtain
+ * a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
express
+ * or implied. See the License for the specific language governing permissions and limitations
under
+ * the License.
+ */
+
+package org.apache.metron.stellar.dsl.functions;
+
+import com.google.common.collect.ImmutableList;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+import org.apache.commons.lang.StringUtils;
+import org.apache.commons.text.similarity.FuzzyScore;
+import org.apache.metron.stellar.dsl.BaseStellarFunction;
+import org.apache.metron.stellar.dsl.ParseException;
+import org.apache.metron.stellar.dsl.Stellar;
+
+public class TextFunctions { // Stellar functions in the FUZZY namespace: FUZZY_LANGS and FUZZY_SCORE
+
+  private static final List<String> tagsList; // language tags accepted by FUZZY_SCORE; immutable once built
+
+  static { // runs once when the class is initialized
+    List<String> tags = new ArrayList<>();
+    for (Locale locale : Locale.getAvailableLocales()) {
+      tags.add(locale.toLanguageTag()); // store each available locale as its language tag string
+    }
+    tagsList = ImmutableList.copyOf(tags);
+  }
+
+  @Stellar(name = "LANGS",
+      namespace = "FUZZY",
+      description = "Returns a list of IETF BCP 47 available to the system, such as en, fr,
de. "
+          + "These values may be passed to FUZZY_SCORE",
+      params = {},
+      returns = "A list of IEF BCP 47 language tag strings")
+  /**
+   * GetAvailableLanaguageTags exposes the IETF BCP 47 language tags available to the system.
+   * The argument list is ignored; every call returns the same immutable list that the
+   * static initializer of TextFunctions built from Locale.getAvailableLocales().
+   */
+  public static class GetAvailableLanaguageTags extends BaseStellarFunction {
+
+    @Override
+    public Object apply(List<Object> list) {
+      return tagsList;
+    }
+  }
+
+  @Stellar(name = "SCORE",
+      namespace = "FUZZY",
+      description =
+          "Returns the Fuzzy Score which indicates the similarity score between two Strings
"
+              +
+              "One point is given for every matched character. Subsequent matches yield two
bonus "
+              +
+              "points. A higher score indicates a higher similarity",
+      params = {
+          "string - The full term that should be matched against",
+          "string - The query that will be matched against a term",
+          "string - The IETF BCP 47 language code to use such as en, fr, de "
+              +
+              "( SEE  FUZZY_LANGS  and https://tools.ietf.org/html/bcp47)"
+      },
+      returns = "integer representing the score")
+  /**
+   * FuzzyScoreFunction exposes the Apache Commons Text Similarity FuzzyScore through
+   * Stellar.
+   *
+   * The arguments are, in order: term, query, language tag.
+   * Fewer than three arguments raises IllegalStateException.
+   * If any of the three is not a String the result is 0 and no further checks run.
+   * A language tag that is not in the list returned by FUZZY_LANGS raises ParseException.
+   * An empty term or query yields 0; this is checked only after the language tag.
+   * Otherwise the result is FuzzyScore.fuzzyScore(term, query) for the given locale.
+   */
+  public static class FuzzyScoreFunction extends BaseStellarFunction {
+
+    @Override
+    public Object apply(List<Object> list) {
+      if (list.size() < 3) {
+        throw new IllegalStateException("FUZZY_SCORE expects three args: [string, string,
string]");
+      }
+      Object oterm = list.get(0);
+      Object oquery = list.get(1);
+      Object olang = list.get(2);
+
+      // Return 0 rather than failing when an argument is not a String: validate will pass 3 nulls.
+      // If validate is changed to pass defaults of the expected types, we can differentiate.
+      if (!(oterm instanceof String) || !(oquery instanceof String) || !(olang instanceof
String)) {
+        return 0;
+      }
+
+      String term = (String) oterm;
+      String query = (String) oquery;
+      String lang = (String) olang;
+
+      if (!tagsList.contains(lang)) { // exact string match against the tags built at class init
+        throw new ParseException(
+            "FUZZY_SCORE requires a valid IETF BCP47 language code see FUZZY_LANGS and https://tools.ietf.org/html/bcp47");
+      }
+      
+      if (StringUtils.isEmpty(term) || StringUtils.isEmpty(query)) { // nothing to score
+        return 0;
+      }
+
+      Locale locale = Locale.forLanguageTag(lang);
+      FuzzyScore score = new FuzzyScore(locale); // a new scorer is created on every call
+      return score.fuzzyScore(term, query);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/metron/blob/d5dbfc20/metron-stellar/stellar-common/src/test/java/org/apache/metron/stellar/dsl/functions/TextFunctionsTest.java
----------------------------------------------------------------------
diff --git a/metron-stellar/stellar-common/src/test/java/org/apache/metron/stellar/dsl/functions/TextFunctionsTest.java
b/metron-stellar/stellar-common/src/test/java/org/apache/metron/stellar/dsl/functions/TextFunctionsTest.java
new file mode 100644
index 0000000..07b3619
--- /dev/null
+++ b/metron-stellar/stellar-common/src/test/java/org/apache/metron/stellar/dsl/functions/TextFunctionsTest.java
@@ -0,0 +1,101 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements.  See the NOTICE file distributed with this work for additional information
regarding
+ * copyright ownership.  The ASF licenses this file to you under the Apache License, Version
2.0
+ * (the "License"); you may not use this file except in compliance with the License.  You
may obtain
+ * a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
express
+ * or implied. See the License for the specific language governing permissions and limitations
under
+ * the License.
+ */
+
+package org.apache.metron.stellar.dsl.functions;
+
+import static org.apache.metron.stellar.common.utils.StellarProcessorUtils.run;
+import static org.apache.metron.stellar.common.utils.StellarProcessorUtils.runPredicate;
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import org.apache.metron.stellar.dsl.DefaultVariableResolver;
+import org.apache.metron.stellar.dsl.ParseException;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class TextFunctionsTest { // exercises FUZZY_LANGS and FUZZY_SCORE via StellarProcessorUtils run and runPredicate
+
+  static final Map<String, String> variableMap = new HashMap<String, String>()
{{
+    put("metron", "metron");
+    put("sentence", "metron is great");
+    put("empty", ""); // empty-string fixture for term, query and language arguments
+    put("english", "en"); // language tag that testGetAvailableLanguageTags asserts is available
+    put("klingon", "Kling"); // language argument that testMissingLanguage expects to be rejected
+    put("asf", "Apache Software Foundation");
+  }};
+
+  @Test
+  public void testGetAvailableLanguageTags() { // FUZZY_LANGS returns a non-empty List containing en and fr
+    Object ret = run("FUZZY_LANGS()", new HashMap<>());
+    Assert.assertNotNull(ret);
+    Assert.assertTrue(ret instanceof List);
+    List<String> tags = (List<String>) ret;
+    Assert.assertTrue(tags.size() > 0);
+    Assert.assertTrue(tags.contains("en"));
+    Assert.assertTrue(tags.contains("fr"));
+  }
+
+  @Test()
+  public void testNoMatchStrings() throws Exception { // query shares no character with the term, so the score is 0
+    Assert.assertTrue(runPredicate("0 == FUZZY_SCORE(metron,'z',english)",
+        new DefaultVariableResolver(v -> variableMap.get(v),
+            v -> variableMap.containsKey(v))));
+  }
+
+  @Test(expected = ParseException.class)
+  public void testMissingLanguage() throws Exception { // unknown language tag must raise ParseException
+    runPredicate("0 == FUZZY_SCORE(metron,'z',klingon)",
+        new DefaultVariableResolver(v -> variableMap.get(v),
+            v -> variableMap.containsKey(v)));
+  }
+
+  @Test()
+  public void testEmptyFirstArg() throws Exception { // empty term with a valid language scores 0
+    Assert.assertTrue(runPredicate("0 == FUZZY_SCORE(empty,'z',english)",
+        new DefaultVariableResolver(v -> variableMap.get(v), v -> variableMap.containsKey(v))));
+  }
+
+  @Test()
+  public void testEmptyFirstTwoArgs() throws Exception { // empty term and query with a valid language score 0
+    Assert.assertTrue(runPredicate("0 == FUZZY_SCORE(empty,empty,english)",
+        new DefaultVariableResolver(v -> variableMap.get(v),
+            v -> variableMap.containsKey(v))));
+  }
+
+  @Test(expected = ParseException.class)
+  public void testEmptyArgs() throws Exception { // the language argument is also empty here, so the call is rejected
+    runPredicate("0 == FUZZY_SCORE(empty,empty,empty)",
+        new DefaultVariableResolver(v -> variableMap.get(v), v -> variableMap.containsKey(v)));
+  }
+
+  @Test(expected = ParseException.class)
+  public void testNoArgs() throws Exception { // NOTE(review): presumably the too-few-args failure surfaces as ParseException - confirm
+    runPredicate("0 == FUZZY_SCORE()",
+        new DefaultVariableResolver(v -> variableMap.get(v), v -> variableMap.containsKey(v)));
+  }
+
+  @Test
+  public void testHappyStringFunctions() throws Exception { // pins exact scores for three matching cases
+    Assert
+        .assertTrue(runPredicate("1 == FUZZY_SCORE(metron,'m',english)",
+            new DefaultVariableResolver(v -> variableMap.get(v), v -> variableMap.containsKey(v))));
+    Assert.assertTrue(
+        runPredicate("16 == FUZZY_SCORE(metron,'metron',english)", // 6 matched characters plus 5 consecutive-match bonuses of 2
+            new DefaultVariableResolver(v -> variableMap.get(v), v -> variableMap.containsKey(v))));
+    Assert.assertTrue(runPredicate("3 == FUZZY_SCORE(asf,'asf',english)", // 3 matched characters, none adjacent in the term
+        new DefaultVariableResolver(v -> variableMap.get(v), v -> variableMap.containsKey(v))));
+  }
+}


Mime
View raw message