mahout-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From s..@apache.org
Subject svn commit: r1455258 - in /mahout/trunk/integration/src: main/java/org/apache/mahout/utils/vectors/arff/ test/java/org/apache/mahout/utils/vectors/arff/
Date Mon, 11 Mar 2013 17:51:30 GMT
Author: ssc
Date: Mon Mar 11 17:51:29 2013
New Revision: 1455258

URL: http://svn.apache.org/r1455258
Log:
MAHOUT-1150 ARFF Integration does not support quoted identifiers

Added:
    mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFTypeTest.java
Modified:
    mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFType.java
    mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java
    mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java
    mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java

Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFType.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFType.java?rev=1455258&r1=1455257&r2=1455258&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFType.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFType.java Mon Mar 11 17:51:29 2013
@@ -38,6 +38,25 @@ public enum ARFFType {
   
   public String getLabel(String line) {
     int idx = line.lastIndexOf(indicator);
-    return line.substring(ARFFModel.ATTRIBUTE.length(), idx).trim();
+    return removeQuotes(line.substring(ARFFModel.ATTRIBUTE.length(), idx));
+  }
+
+  /**
+   * Remove quotes and leading/trailing whitespace from a single or double quoted string
+   * @param str quotes from
+   * @return  A string without quotes
+   */
+  public static String removeQuotes(String str) {
+    String cleaned = str;
+    if (cleaned != null) {
+        cleaned = cleaned.trim();
+        boolean isQuoted = cleaned.length() > 1 &&
+            (cleaned.startsWith("\"") &&  cleaned.endsWith("\"") ||
+            cleaned.startsWith("'") &&  cleaned.endsWith("'"));
+        if (isQuoted) {
+          cleaned = cleaned.substring(1, cleaned.length() - 1);
+        }
+      }
+    return cleaned;
   }
 }

Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java?rev=1455258&r1=1455257&r2=1455258&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java Mon Mar 11 17:51:29 2013
@@ -85,7 +85,7 @@ public class ARFFVectorIterable implemen
       if (lower.startsWith(ARFFModel.ARFF_COMMENT)) {
         continue;
       } else if (lower.startsWith(ARFFModel.RELATION)) {
-        model.setRelation(line.substring(ARFFModel.RELATION.length()).trim());
+        model.setRelation(ARFFType.removeQuotes(line.substring(ARFFModel.RELATION.length())));
       } else if (lower.startsWith(ARFFModel.ATTRIBUTE)) {
         String label;
         ARFFType type;
@@ -108,7 +108,7 @@ public class ARFFVectorIterable implemen
           int classIdx = lower.indexOf(ARFFType.NOMINAL.getIndicator());
           String[] classes = COMMA_PATTERN.split(line.substring(classIdx + 1, line.length() - 1));
           for (int i = 0; i < classes.length; i++) {
-            model.addNominal(label, classes[i].trim(), i + 1);
+            model.addNominal(label, ARFFType.removeQuotes(classes[i]), i + 1);
           }
         } else if (lower.contains(ARFFType.DATE.getIndicator())) {
           label = ARFFType.DATE.getLabel(lower);

Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java?rev=1455258&r1=1455257&r2=1455258&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java Mon Mar 11 17:51:29 2013
@@ -117,7 +117,7 @@ public class MapBackedARFFModel implemen
     double result;
     Map<String,Integer> classes = nominalMap.get(label);
     if (classes != null) {
-      Integer ord = classes.get(data);
+      Integer ord = classes.get(ARFFType.removeQuotes(data));
       if (ord != null) {
         result = ord;
       } else {

Added: mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFTypeTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFTypeTest.java?rev=1455258&view=auto
==============================================================================
--- mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFTypeTest.java (added)
+++ mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFTypeTest.java Mon Mar 11 17:51:29 2013
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors.arff;
+
+import org.apache.mahout.utils.MahoutTestCase;
+import org.junit.Test;
+
+public class ARFFTypeTest extends MahoutTestCase{
+
+  @Test
+  public void removeQuotes() {
+    
+    assertEquals(null, ARFFType.removeQuotes(null));
+    assertEquals("", ARFFType.removeQuotes("\"\""));
+    assertEquals("", ARFFType.removeQuotes("''"));
+    assertEquals("", ARFFType.removeQuotes(""));
+    assertEquals("", ARFFType.removeQuotes("  "));
+    assertEquals("single", ARFFType.removeQuotes("'single'"));
+    assertEquals("double", ARFFType.removeQuotes("\"double\""));
+    assertEquals("trim", ARFFType.removeQuotes(" trim "));
+  }
+}

Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java?rev=1455258&r1=1455257&r2=1455258&view=diff
==============================================================================
--- mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java (original)
+++ mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java Mon Mar 11 17:51:29 2013
@@ -246,6 +246,55 @@ public final class ARFFVectorIterableTes
     assertEquals(3.0, vector.get(2), EPSILON);
   }
 
+  @Test
+  public void testQuotes() throws Exception {
+    
+    // ARFF allows quotes on identifiers
+    String arff = "@RELATION 'quotes'\n"
+        + "@ATTRIBUTE 'theNumeric' NUMERIC\n"
+        + "@ATTRIBUTE \"theInteger\" INTEGER\n"
+        + "@ATTRIBUTE theReal REAL\n"
+        + "@ATTRIBUTE theNominal {\"double-quote\", 'single-quote', no-quote}\n"
+        + "@DATA\n"
+        + "1.0,2,3.0,\"no-quote\"\n"
+        + "4.0,5,6.0,single-quote\n"
+        + "7.0,8,9.0,'double-quote'\n"
+      ;
+    ARFFModel model = new MapBackedARFFModel();
+    ARFFVectorIterable iterable = new ARFFVectorIterable(arff, model);
+    model = iterable.getModel();
+    assertNotNull(model);
+    assertEquals("quotes", model.getRelation());
+
+    // check attribute labels
+    assertEquals(4, model.getLabelSize());
+    assertEquals(ARFFType.NUMERIC, model.getARFFType(0));
+    assertEquals(ARFFType.INTEGER, model.getARFFType(1));
+    assertEquals(ARFFType.REAL, model.getARFFType(2));
+    assertEquals(ARFFType.NOMINAL, model.getARFFType(3));
+
+    Map<String, Integer> labelBindings = model.getLabelBindings();
+    assertTrue(labelBindings.keySet().contains("thenumeric"));
+    assertTrue(labelBindings.keySet().contains("theinteger"));
+    assertTrue(labelBindings.keySet().contains("thereal"));
+    assertTrue(labelBindings.keySet().contains("thenominal"));
+    
+    // check nominal values
+    Map<String, Integer> nominalMap = model.getNominalMap().get("thenominal");
+    assertNotNull(nominalMap);
+    assertEquals(3, nominalMap.size());
+    assertTrue(nominalMap.keySet().contains("double-quote"));
+    assertTrue(nominalMap.keySet().contains("single-quote"));
+    assertTrue(nominalMap.keySet().contains("no-quote"));
+
+    // check data values
+    Iterator<Vector> it = iterable.iterator();
+    Vector vector = it.next();
+    assertEquals(nominalMap.get("no-quote"), vector.get(3), EPSILON);
+    assertEquals(nominalMap.get("single-quote"), it.next().get(3), EPSILON);
+    assertEquals(nominalMap.get("double-quote"), it.next().get(3), EPSILON);
+  }
+
   private static final String SAMPLE_DENSE_ARFF = "   % Comments\n" + "   % \n" + "   % Comments go here"
                                                   + "   % \n" + "   @RELATION golf\n" + '\n'
                                                   + "   @ATTRIBUTE outlook {sunny,overcast, rain}\n"



Mime
View raw message