metron-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ceste...@apache.org
Subject incubator-metron git commit: METRON-640: Add a Stellar function to compute shannon entropy for strings closes apache/incubator-metron#403
Date Wed, 11 Jan 2017 14:15:07 GMT
Repository: incubator-metron
Updated Branches:
  refs/heads/master 09cb50288 -> 763f2fc42


METRON-640: Add a Stellar function to compute shannon entropy for strings closes apache/incubator-metron#403


Project: http://git-wip-us.apache.org/repos/asf/incubator-metron/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-metron/commit/763f2fc4
Tree: http://git-wip-us.apache.org/repos/asf/incubator-metron/tree/763f2fc4
Diff: http://git-wip-us.apache.org/repos/asf/incubator-metron/diff/763f2fc4

Branch: refs/heads/master
Commit: 763f2fc4220caac8e9b4b994e5a6987b43a21bd3
Parents: 09cb502
Author: cstella <cestella@gmail.com>
Authored: Wed Jan 11 09:14:57 2017 -0500
Committer: cstella <cestella@gmail.com>
Committed: Wed Jan 11 09:14:57 2017 -0500

----------------------------------------------------------------------
 metron-platform/metron-common/README.md         |   7 +
 .../common/dsl/functions/StringFunctions.java   |  38 +++
 .../dsl/functions/StringFunctionsTest.java      | 254 ++++++++++---------
 3 files changed, 182 insertions(+), 117 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-metron/blob/763f2fc4/metron-platform/metron-common/README.md
----------------------------------------------------------------------
diff --git a/metron-platform/metron-common/README.md b/metron-platform/metron-common/README.md
index b41333b..de277bc 100644
--- a/metron-platform/metron-common/README.md
+++ b/metron-platform/metron-common/README.md
@@ -107,6 +107,7 @@ Using parens such as: "foo" : "\<ok\>" requires escaping; "foo":
"\'\<ok\>\'"
 | [ `STATS_SUM_LOGS`](../../metron-analytics/metron-statistics#stats_sum_logs)          
            |
 | [ `STATS_SUM_SQUARES`](../../metron-analytics/metron-statistics#stats_sum_squares)    
            |
 | [ `STATS_VARIANCE`](../../metron-analytics/metron-statistics#stats_variance)          
            |
+| [ `STRING_ENTROPY`](#string_entropy)                                                  
            |
 | [ `SYSTEM_ENV_GET`](#system_env_get)                                                  
            |
 | [ `SYSTEM_PROPERTY_GET`](#system_property_get)                                        
            |
 | [ `TO_DOUBLE`](#to_double)                                                            
            |
@@ -401,6 +402,12 @@ MAP_GET`
     * pattern - The proposed regex pattern
   * Returns: True if the regex pattern matches the string and false if otherwise.
 
+### `STRING_ENTROPY`
+  * Description: Computes the base-2 shannon entropy of a string.
+  * Input:
+    * input - String 
+  * Returns: The base-2 shannon entropy of the string (https://en.wikipedia.org/wiki/Entropy_(information_theory)#Definition).  The unit of this is bits.
+
 ### `SPLIT`
   * Description: Splits the string by the delimiter.
   * Input:

http://git-wip-us.apache.org/repos/asf/incubator-metron/blob/763f2fc4/metron-platform/metron-common/src/main/java/org/apache/metron/common/dsl/functions/StringFunctions.java
----------------------------------------------------------------------
diff --git a/metron-platform/metron-common/src/main/java/org/apache/metron/common/dsl/functions/StringFunctions.java
b/metron-platform/metron-common/src/main/java/org/apache/metron/common/dsl/functions/StringFunctions.java
index 1239abb..fa2b55b 100644
--- a/metron-platform/metron-common/src/main/java/org/apache/metron/common/dsl/functions/StringFunctions.java
+++ b/metron-platform/metron-common/src/main/java/org/apache/metron/common/dsl/functions/StringFunctions.java
@@ -21,13 +21,16 @@ package org.apache.metron.common.dsl.functions;
 import com.google.common.base.Joiner;
 import com.google.common.base.Splitter;
 import com.google.common.collect.Iterables;
+import org.apache.commons.lang.StringUtils;
 import org.apache.metron.common.dsl.BaseStellarFunction;
 import org.apache.metron.common.dsl.ParseException;
 import org.apache.metron.common.dsl.Stellar;
 import org.apache.metron.common.utils.ConversionUtils;
 
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
 
 public class StringFunctions {
 
@@ -284,4 +287,39 @@ public class StringFunctions {
     }
     return org.apache.commons.lang.StringUtils.rightPad(input,requiredLength,fill);
   }
+
+  @Stellar( namespace="STRING"
+          , name="ENTROPY"
+          , description = "Computes the base-2 shannon entropy of a string"
+          , params = { "input - String" }
+          , returns = "The base-2 shannon entropy of the string (https://en.wikipedia.org/wiki/Entropy_(information_theory)#Definition).  The unit of this is bits."
+  )
+  public static class Entropy extends BaseStellarFunction {
+    @Override
+    public Object apply(List<Object> strings) {
+      /*
+      Shannon entropy is defined as follows:
+      H(X) = -\sum_{i=0}^{n-1} p(x_i) * \log_2(p(x_i)), where the x_i are the distinct characters in the string and p(x_i) is the relative frequency of x_i.
+       */
+      Map<Character, Integer> frequency = new HashMap<>();
+      if(strings.size() != 1) {
+        throw new IllegalArgumentException("STRING_ENTROPY expects exactly one argument which is a string.");
+      }
+      String input = ConversionUtils.convert(strings.get(0), String.class);
+      if(StringUtils.isEmpty(input)) {
+        return 0.0;
+      }
+      for(int i = 0;i < input.length();++i) {
+        char c = input.charAt(i);
+        frequency.put(c, frequency.getOrDefault(c, 0) + 1);
+      }
+      double ret = 0.0;
+      double log2 = Math.log(2);
+      for(Integer f : frequency.values()) {
+        double p = f.doubleValue()/input.length();
+        ret -= p * Math.log(p) / log2;
+      }
+      return ret;
+    }
+  }
 }

http://git-wip-us.apache.org/repos/asf/incubator-metron/blob/763f2fc4/metron-platform/metron-common/src/test/java/org/apache/metron/common/dsl/functions/StringFunctionsTest.java
----------------------------------------------------------------------
diff --git a/metron-platform/metron-common/src/test/java/org/apache/metron/common/dsl/functions/StringFunctionsTest.java
b/metron-platform/metron-common/src/test/java/org/apache/metron/common/dsl/functions/StringFunctionsTest.java
index bee54b3..18d2eb2 100644
--- a/metron-platform/metron-common/src/test/java/org/apache/metron/common/dsl/functions/StringFunctionsTest.java
+++ b/metron-platform/metron-common/src/test/java/org/apache/metron/common/dsl/functions/StringFunctionsTest.java
@@ -19,6 +19,7 @@
 package org.apache.metron.common.dsl.functions;
 
 import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableMap;
 import org.apache.commons.collections.map.HashedMap;
 import org.apache.metron.common.dsl.ParseException;
 import org.junit.Assert;
@@ -32,124 +33,143 @@ import static org.apache.metron.common.utils.StellarProcessorUtils.runPredicate;
 
 public class StringFunctionsTest {
 
-    @Test
-    public void testStringFunctions() throws Exception {
-        final Map<String, String> variableMap = new HashMap<String, String>()
{{
-            put("foo", "casey");
-            put("ip", "192.168.0.1");
-            put("empty", "");
-            put("spaced", "metron is great");
-        }};
-        Assert.assertTrue(runPredicate("true and TO_UPPER(foo) == 'CASEY'", v -> variableMap.get(v)));
-        Assert.assertTrue(runPredicate("foo in [ TO_LOWER('CASEY'), 'david' ]", v -> variableMap.get(v)));
-        Assert.assertTrue(runPredicate("TO_UPPER(foo) in [ TO_UPPER('casey'), 'david' ] and
IN_SUBNET(ip, '192.168.0.0/24')", v -> variableMap.get(v)));
-        Assert.assertFalse(runPredicate("TO_LOWER(foo) in [ TO_UPPER('casey'), 'david' ]",
v -> variableMap.get(v)));
+  @Test
+  public void testStringFunctions() throws Exception {
+    final Map<String, String> variableMap = new HashMap<String, String>() {{
+      put("foo", "casey");
+      put("ip", "192.168.0.1");
+      put("empty", "");
+      put("spaced", "metron is great");
+    }};
+    Assert.assertTrue(runPredicate("true and TO_UPPER(foo) == 'CASEY'", v -> variableMap.get(v)));
+    Assert.assertTrue(runPredicate("foo in [ TO_LOWER('CASEY'), 'david' ]", v -> variableMap.get(v)));
+    Assert.assertTrue(runPredicate("TO_UPPER(foo) in [ TO_UPPER('casey'), 'david' ] and IN_SUBNET(ip, '192.168.0.0/24')", v -> variableMap.get(v)));
+    Assert.assertFalse(runPredicate("TO_LOWER(foo) in [ TO_UPPER('casey'), 'david' ]", v -> variableMap.get(v)));
+  }
+
+  @Test
+  public void testStringFunctions_advanced() throws Exception {
+    final Map<String, Object> variableMap = new HashMap<String, Object>() {{
+      put("foo", "casey");
+      put("bar", "bar.casey.grok");
+      put("ip", "192.168.0.1");
+      put("empty", "");
+      put("spaced", "metron is great");
+      put("myList", ImmutableList.of("casey", "apple", "orange"));
+    }};
+    Assert.assertTrue(runPredicate("foo in SPLIT(bar, '.')", v -> variableMap.get(v)));
+    Assert.assertFalse(runPredicate("foo in SPLIT(ip, '.')", v -> variableMap.get(v)));
+    Assert.assertTrue(runPredicate("foo in myList", v -> variableMap.get(v)));
+    Assert.assertFalse(runPredicate("foo not in myList", v -> variableMap.get(v)));
+  }
+
+  @Test
+  public void testLeftRightFills() throws Exception{
+    final Map<String, Object> variableMap = new HashMap<String, Object>() {{
+      put("foo", null);
+      put("bar", null);
+      put("notInt","oh my");
+    }};
+
+    //LEFT
+    Object left = run("FILL_LEFT('123','X', 10)",new HashedMap());
+    Assert.assertNotNull(left);
+    Assert.assertEquals(10,((String)left).length());
+    Assert.assertEquals("XXXXXXX123",(String)left);
+
+    //RIGHT
+    Object right = run("FILL_RIGHT('123','X', 10)", new HashedMap());
+    Assert.assertNotNull(right);
+    Assert.assertEquals(10,((String)right).length());
+    Assert.assertEquals("123XXXXXXX",(String)right);
+
+    //INPUT ALREADY LENGTH
+    Object same = run("FILL_RIGHT('123','X', 3)", new HashedMap());
+    Assert.assertEquals(3,((String)same).length());
+    Assert.assertEquals("123",(String)same);
+
+    //INPUT BIGGER THAN LENGTH
+    Object tooBig = run("FILL_RIGHT('1234567890','X', 3)", new HashedMap());
+    Assert.assertEquals(10,((String)tooBig).length());
+    Assert.assertEquals("1234567890",(String)tooBig);
+
+    //NULL VARIABLES
+    boolean thrown = false;
+    try{
+      run("FILL_RIGHT('123',foo,bar)", variableMap);
+    }catch(ParseException pe) {
+      thrown = true;
+      Assert.assertTrue(pe.getMessage().contains("are both required"));
     }
-
-    @Test
-    public void testStringFunctions_advanced() throws Exception {
-        final Map<String, Object> variableMap = new HashMap<String, Object>()
{{
-            put("foo", "casey");
-            put("bar", "bar.casey.grok");
-            put("ip", "192.168.0.1");
-            put("empty", "");
-            put("spaced", "metron is great");
-            put("myList", ImmutableList.of("casey", "apple", "orange"));
-        }};
-        Assert.assertTrue(runPredicate("foo in SPLIT(bar, '.')", v -> variableMap.get(v)));
-        Assert.assertFalse(runPredicate("foo in SPLIT(ip, '.')", v -> variableMap.get(v)));
-        Assert.assertTrue(runPredicate("foo in myList", v -> variableMap.get(v)));
-        Assert.assertFalse(runPredicate("foo not in myList", v -> variableMap.get(v)));
+    Assert.assertTrue(thrown);
+    thrown = false;
+
+    // NULL LENGTH
+    try{
+      run("FILL_RIGHT('123','X',bar)", variableMap);
+    }catch(ParseException pe) {
+      thrown = true;
+      Assert.assertTrue(pe.getMessage().contains("are both required"));
     }
-
-    @Test
-    public void testLeftRightFills() throws Exception{
-        final Map<String, Object> variableMap = new HashMap<String, Object>()
{{
-            put("foo", null);
-            put("bar", null);
-            put("notInt","oh my");
-        }};
-
-        //LEFT
-        Object left = run("FILL_LEFT('123','X', 10)",new HashedMap());
-        Assert.assertNotNull(left);
-        Assert.assertEquals(10,((String)left).length());
-        Assert.assertEquals("XXXXXXX123",(String)left);
-
-        //RIGHT
-        Object right = run("FILL_RIGHT('123','X', 10)", new HashedMap());
-        Assert.assertNotNull(right);
-        Assert.assertEquals(10,((String)right).length());
-        Assert.assertEquals("123XXXXXXX",(String)right);
-
-        //INPUT ALREADY LENGTH
-        Object same = run("FILL_RIGHT('123','X', 3)", new HashedMap());
-        Assert.assertEquals(3,((String)same).length());
-        Assert.assertEquals("123",(String)same);
-
-        //INPUT BIGGER THAN LENGTH
-        Object tooBig = run("FILL_RIGHT('1234567890','X', 3)", new HashedMap());
-        Assert.assertEquals(10,((String)tooBig).length());
-        Assert.assertEquals("1234567890",(String)tooBig);
-
-        //NULL VARIABLES
-        boolean thrown = false;
-        try{
-            run("FILL_RIGHT('123',foo,bar)", variableMap);
-        }catch(ParseException pe) {
-            thrown = true;
-            Assert.assertTrue(pe.getMessage().contains("are both required"));
-        }
-        Assert.assertTrue(thrown);
-        thrown = false;
-
-        // NULL LENGTH
-        try{
-            run("FILL_RIGHT('123','X',bar)", variableMap);
-        }catch(ParseException pe) {
-            thrown = true;
-            Assert.assertTrue(pe.getMessage().contains("are both required"));
-        }
-        Assert.assertTrue(thrown);
-        thrown = false;
-
-        // NULL FILL
-        try{
-            run("FILL_RIGHT('123',foo, 7)", variableMap);
-        }catch(ParseException pe) {
-            thrown = true;
-            Assert.assertTrue(pe.getMessage().contains("are both required"));
-        }
-        Assert.assertTrue(thrown);
-        thrown = false;
-
-        // NON INTEGER LENGTH
-        try {
-            run("FILL_RIGHT('123','X', 'z' )", new HashedMap());
-        }catch(ParseException pe){
-            thrown = true;
-            Assert.assertTrue(pe.getMessage().contains("not a valid Integer"));
-        }
-        Assert.assertTrue(thrown);
-        thrown = false;
-
-        // EMPTY STRING PAD
-        try {
-            Object returnValue = run("FILL_RIGHT('123','', 10 )", new HashedMap());
-        }catch(ParseException pe) {
-            thrown = true;
-            Assert.assertTrue(pe.getMessage().contains("cannot be an empty"));
-        }
-        Assert.assertTrue(thrown);
-        thrown = false;
-
-        //MISSING LENGTH PARAMETER
-        try {
-            run("FILL_RIGHT('123',foo)", variableMap);
-        }catch(ParseException pe){
-            thrown = true;
-            Assert.assertTrue(pe.getMessage().contains("expects three"));
-        }
-        Assert.assertTrue(thrown);
+    Assert.assertTrue(thrown);
+    thrown = false;
+
+    // NULL FILL
+    try{
+      run("FILL_RIGHT('123',foo, 7)", variableMap);
+    }catch(ParseException pe) {
+      thrown = true;
+      Assert.assertTrue(pe.getMessage().contains("are both required"));
+    }
+    Assert.assertTrue(thrown);
+    thrown = false;
+
+    // NON INTEGER LENGTH
+    try {
+      run("FILL_RIGHT('123','X', 'z' )", new HashedMap());
+    }catch(ParseException pe){
+      thrown = true;
+      Assert.assertTrue(pe.getMessage().contains("not a valid Integer"));
+    }
+    Assert.assertTrue(thrown);
+    thrown = false;
+
+    // EMPTY STRING PAD
+    try {
+      Object returnValue = run("FILL_RIGHT('123','', 10 )", new HashedMap());
+    }catch(ParseException pe) {
+      thrown = true;
+      Assert.assertTrue(pe.getMessage().contains("cannot be an empty"));
+    }
+    Assert.assertTrue(thrown);
+    thrown = false;
+
+    //MISSING LENGTH PARAMETER
+    try {
+      run("FILL_RIGHT('123',foo)", variableMap);
+    }catch(ParseException pe){
+      thrown = true;
+      Assert.assertTrue(pe.getMessage().contains("expects three"));
     }
+    Assert.assertTrue(thrown);
+  }
+
+  @Test
+  public void shannonEntropyTest() throws Exception {
+    //test empty string
+    Assert.assertEquals(0.0, (Double)run("STRING_ENTROPY('')", new HashMap<>()), 0.0);
+    Assert.assertEquals(0.0, (Double)run("STRING_ENTROPY(foo)", ImmutableMap.of("foo", "")), 0.0);
+
+    /*
+    Now consider the string aaaaaaaaaabbbbbccccc or 10 a's followed by 5 b's and 5 c's.
+    The probabilities of each character is as follows:
+    p(a) = 1/2
+    p(b) = 1/4
+    p(c) = 1/4
+    so the shannon entropy should be
+      -p(a)*log_2(p(a)) - p(b)*log_2(p(b)) - p(c)*log_2(p(c)) =
+      -0.5*-1 - 0.25*-2 - 0.25*-2 = 1.5
+     */
+    Assert.assertEquals(1.5, (Double)run("STRING_ENTROPY(foo)", ImmutableMap.of("foo", "aaaaaaaaaabbbbbccccc")), 0.0);
+  }
 }


Mime
View raw message