opennlp-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jo...@apache.org
Subject svn commit: r1176845 - in /incubator/opennlp/trunk/opennlp-tools/src: main/java/opennlp/tools/postag/ test/java/opennlp/tools/postag/ test/resources/opennlp/tools/postag/
Date Wed, 28 Sep 2011 12:28:58 GMT
Author: joern
Date: Wed Sep 28 12:28:58 2011
New Revision: 1176845

URL: http://svn.apache.org/viewvc?rev=1176845&view=rev
Log:
OPENNLP-286 Fixes to the POSDictionary and new test code to ensure that case-sensitive and
case-insensitive dictionaries work as expected.

Added:
    incubator/opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/postag/TagDictionaryCaseInsensitive.xml
  (with props)
    incubator/opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/postag/TagDictionaryCaseSensitive.xml
  (with props)
    incubator/opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/postag/TagDictionaryWithoutCaseAttribute.xml
  (with props)
Modified:
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/postag/POSDictionary.java
    incubator/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/postag/POSDictionaryTest.java

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/postag/POSDictionary.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/postag/POSDictionary.java?rev=1176845&r1=1176844&r2=1176845&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/postag/POSDictionary.java
(original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/postag/POSDictionary.java
Wed Sep 28 12:28:58 2011
@@ -36,6 +36,7 @@ import opennlp.tools.dictionary.serializ
 import opennlp.tools.dictionary.serializer.EntryInserter;
 import opennlp.tools.util.InvalidFormatException;
 import opennlp.tools.util.StringList;
+import opennlp.tools.util.StringUtil;
 
 /**
  * Provides a means of determining which tags are valid for a particular word
@@ -118,10 +119,12 @@ public class POSDictionary implements It
       for (int ti = 0, tl = parts.length - 1; ti < tl; ti++) {
         tags[ti] = parts[ti + 1];
       }
-      if (caseSensitive)
+      if (caseSensitive) {
         dictionary.put(parts[0], tags);
-      else
-        dictionary.put(parts[0].toLowerCase(), tags);
+      }
+      else {
+        dictionary.put(StringUtil.toLowerCase(parts[0]), tags);
+      }
     }
   }
 
@@ -293,6 +296,17 @@ public class POSDictionary implements It
 
     newPosDict.caseSensitive = isCaseSensitive;
     
+    // TODO: The dictionary API needs to be improved to do this better!
+    if (!isCaseSensitive) {
+      Map<String, String[]> lowerCasedDictionary = new HashMap<String, String[]>();
+      
+      for (Map.Entry<String, String[]> entry : newPosDict.dictionary.entrySet()) {
+        lowerCasedDictionary.put(StringUtil.toLowerCase(entry.getKey()), entry.getValue());
+      }
+      
+      newPosDict.dictionary = lowerCasedDictionary;
+    }
+    
     return newPosDict;
   }
 }

Modified: incubator/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/postag/POSDictionaryTest.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/postag/POSDictionaryTest.java?rev=1176845&r1=1176844&r2=1176845&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/postag/POSDictionaryTest.java
(original)
+++ incubator/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/postag/POSDictionaryTest.java
Wed Sep 28 12:28:58 2011
@@ -17,7 +17,7 @@
 
 package opennlp.tools.postag;
 
-import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.*;
 
 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
@@ -26,6 +26,7 @@ import java.io.InputStream;
 
 import opennlp.tools.util.InvalidFormatException;
 
+import org.junit.Assert;
 import org.junit.Test;
 
 /**
@@ -33,20 +34,15 @@ import org.junit.Test;
  */
 public class POSDictionaryTest {
 
-  @Test
-  public void testSerialization() throws IOException, InvalidFormatException {
-    POSDictionary dictionary = new POSDictionary();
-
-    dictionary.addTags("a", "1", "2", "3");
-    dictionary.addTags("b", "4", "5", "6");
-    dictionary.addTags("c", "7", "8", "9");
-    dictionary.addTags("Always", "RB","NNP");
-
-
+  private static POSDictionary loadDictionary(String name) throws IOException {
+    return POSDictionary.create(POSDictionaryTest.class.getResourceAsStream(name));
+  }
+  
+  private static POSDictionary serializeDeserializeDict(POSDictionary dict) throws IOException
{
     ByteArrayOutputStream out = new ByteArrayOutputStream();
 
     try {
-      dictionary.serialize(out);
+      dict.serialize(out);
     }
     finally {
        out.close();
@@ -61,7 +57,55 @@ public class POSDictionaryTest {
     finally {
         in.close();
     }
+    
+    return serializedDictionary;
+  }
+  
+  @Test
+  public void testSerialization() throws IOException, InvalidFormatException {
+    POSDictionary dictionary = new POSDictionary();
+
+    dictionary.addTags("a", "1", "2", "3");
+    dictionary.addTags("b", "4", "5", "6");
+    dictionary.addTags("c", "7", "8", "9");
+    dictionary.addTags("Always", "RB","NNP");
+
+    assertTrue(dictionary.equals(serializeDeserializeDict(dictionary)));
+  }
+  
+  @Test
+  public void testLoadingDictionaryWithoutCaseAttribute() throws IOException {
+    POSDictionary dict = loadDictionary("TagDictionaryWithoutCaseAttribute.xml");
+    
+    assertArrayEquals(new String[]{"NNP"}, dict.getTags("McKinsey"));
+    assertNull(dict.getTags("Mckinsey"));
+  }
 
-    assertTrue(dictionary.equals(serializedDictionary));
+  @Test
+  public void testCaseSensitiveDictionary() throws IOException {
+    POSDictionary dict = loadDictionary("TagDictionaryCaseSensitive.xml");
+
+    assertArrayEquals(new String[]{"NNP"}, dict.getTags("McKinsey"));
+    assertNull(dict.getTags("Mckinsey"));
+    
+    dict = serializeDeserializeDict(dict);
+    
+    assertArrayEquals(new String[]{"NNP"}, dict.getTags("McKinsey"));
+    assertNull(dict.getTags("Mckinsey"));
+  }
+  
+  @Test
+  public void testCaseInsensitiveDictionary() throws IOException {
+    POSDictionary dict = loadDictionary("TagDictionaryCaseInsensitive.xml");
+    
+    assertArrayEquals(new String[]{"NNP"}, dict.getTags("McKinsey"));
+    assertArrayEquals(new String[]{"NNP"}, dict.getTags("Mckinsey"));
+    assertArrayEquals(new String[]{"NNP"}, dict.getTags("MCKINSEY"));
+    assertArrayEquals(new String[]{"NNP"}, dict.getTags("mckinsey"));
+    
+    dict = serializeDeserializeDict(dict);
+    
+    assertArrayEquals(new String[]{"NNP"}, dict.getTags("McKinsey"));
+    assertArrayEquals(new String[]{"NNP"}, dict.getTags("Mckinsey"));
   }
 }
\ No newline at end of file

Added: incubator/opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/postag/TagDictionaryCaseInsensitive.xml
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/postag/TagDictionaryCaseInsensitive.xml?rev=1176845&view=auto
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/postag/TagDictionaryCaseInsensitive.xml
(added)
+++ incubator/opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/postag/TagDictionaryCaseInsensitive.xml
Wed Sep 28 12:28:58 2011
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.    
+-->
+
+<dictionary case_sensitive="false">
+<entry tags="NNP">
+<token>McKinsey</token>
+</entry>
+</dictionary>

Propchange: incubator/opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/postag/TagDictionaryCaseInsensitive.xml
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/postag/TagDictionaryCaseSensitive.xml
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/postag/TagDictionaryCaseSensitive.xml?rev=1176845&view=auto
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/postag/TagDictionaryCaseSensitive.xml
(added)
+++ incubator/opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/postag/TagDictionaryCaseSensitive.xml
Wed Sep 28 12:28:58 2011
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.    
+-->
+
+<dictionary case_sensitive="true">
+<entry tags="NNP">
+<token>McKinsey</token>
+</entry>
+</dictionary>

Propchange: incubator/opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/postag/TagDictionaryCaseSensitive.xml
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/postag/TagDictionaryWithoutCaseAttribute.xml
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/postag/TagDictionaryWithoutCaseAttribute.xml?rev=1176845&view=auto
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/postag/TagDictionaryWithoutCaseAttribute.xml
(added)
+++ incubator/opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/postag/TagDictionaryWithoutCaseAttribute.xml
Wed Sep 28 12:28:58 2011
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.    
+-->
+
+<dictionary>
+<entry tags="NNP">
+<token>McKinsey</token>
+</entry>
+</dictionary>

Propchange: incubator/opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/postag/TagDictionaryWithoutCaseAttribute.xml
------------------------------------------------------------------------------
    svn:mime-type = text/plain



Mime
View raw message