marmotta-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From wik...@apache.org
Subject [3/4] git commit: MARMOTTA-423: some more small fixes to what a fucking serialization
Date Mon, 20 Jan 2014 16:57:30 GMT
MARMOTTA-423: some more small fixes to what a fucking serialization


Project: http://git-wip-us.apache.org/repos/asf/marmotta/repo
Commit: http://git-wip-us.apache.org/repos/asf/marmotta/commit/68dcda20
Tree: http://git-wip-us.apache.org/repos/asf/marmotta/tree/68dcda20
Diff: http://git-wip-us.apache.org/repos/asf/marmotta/diff/68dcda20

Branch: refs/heads/develop
Commit: 68dcda20d8e6a1c116b5463c5c1f8a3b489cad1e
Parents: 8d1946c
Author: Sergio Fernández <wikier@apache.org>
Authored: Mon Jan 20 17:57:02 2014 +0100
Committer: Sergio Fernández <wikier@apache.org>
Committed: Mon Jan 20 17:57:02 2014 +0100

----------------------------------------------------------------------
 .../provider/freebase/FreebaseProvider.java     | 47 +++++++++++++++-----
 1 file changed, 36 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/marmotta/blob/68dcda20/libraries/ldclient/ldclient-provider-freebase/src/main/java/org/apache/marmotta/ldclient/provider/freebase/FreebaseProvider.java
----------------------------------------------------------------------
diff --git a/libraries/ldclient/ldclient-provider-freebase/src/main/java/org/apache/marmotta/ldclient/provider/freebase/FreebaseProvider.java
b/libraries/ldclient/ldclient-provider-freebase/src/main/java/org/apache/marmotta/ldclient/provider/freebase/FreebaseProvider.java
index ccd0159..97ea6c7 100644
--- a/libraries/ldclient/ldclient-provider-freebase/src/main/java/org/apache/marmotta/ldclient/provider/freebase/FreebaseProvider.java
+++ b/libraries/ldclient/ldclient-provider-freebase/src/main/java/org/apache/marmotta/ldclient/provider/freebase/FreebaseProvider.java
@@ -20,6 +20,7 @@ package org.apache.marmotta.ldclient.provider.freebase;
 import com.google.common.base.Preconditions;
 
 import javolution.util.function.Predicate;
+import org.apache.commons.lang3.StringEscapeUtils;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.marmotta.commons.sesame.model.ModelCommons;
 import org.apache.marmotta.ldclient.api.endpoint.Endpoint;
@@ -35,6 +36,8 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.io.*;
+import java.net.URI;
+import java.net.URISyntaxException;
 import java.util.Collections;
 import java.util.List;
 import java.util.regex.Matcher;
@@ -56,6 +59,7 @@ public class FreebaseProvider extends AbstractHttpProvider {
     public static final String DEFAULT_ENCODING = "UTF-8";
     private static final Pattern CHARSET_PATTERN = Pattern.compile("(?i)\\bcharset=\\s*\"?([^\\s;\"]*)");
     private static final  Pattern FREEBASE_LITERAL_PATTERN = Pattern.compile("^\\s+([a-z]+:[a-z]+(?:\\.[a-z]+)*)\\s+\"(.*)\"(@[a-z]+)?(;|\\.)$");
+    private static final  Pattern FREEBASE_RESOURCE_PATTERN = Pattern.compile("^\\s+([a-z]+:[a-z]+(?:\\.[a-z]+)*)\\s+([a-z]+:.*(\\..*))(;|\\.)$");
 
     /**
      * Return the name of this data provider. To be used e.g. in the configuration and in
log messages.
@@ -126,7 +130,8 @@ public class FreebaseProvider extends AbstractHttpProvider {
     }
 
     /**
-     * Fixes Freebase deficiencies on Turtle serialization
+     * Fixes Freebase deficiencies on Turtle serialization, doing
+     * some dirty things they may be semantically wrong.
      *
      * @param is stream with the raw data
      * @return fixed stream
@@ -136,19 +141,20 @@ public class FreebaseProvider extends AbstractHttpProvider {
         StringBuffer sb = new StringBuffer();
         String line;
         while ((line = br.readLine()) != null) {
-            Matcher m = FREEBASE_LITERAL_PATTERN.matcher(line);
-            if (m.matches()) {
+            Matcher literalMatcher = FREEBASE_LITERAL_PATTERN.matcher(line);
+            Matcher resourceMatcher = FREEBASE_RESOURCE_PATTERN.matcher(line);
+            if (literalMatcher.matches()) {
                 //literal found
                 try {
-                    final String literal = m.group(2);
+                    final String literal = literalMatcher.group(2);
                     final String fixed = fixLiteral(literal);
                     //log.debug("literal: --{}--{}", literal, fixed);
-                    String triple = m.group(1) + "    \"" + fixed + "\"";
-                    if (m.group(3) != null) {
-                        triple += m.group(3);
+                    String triple = literalMatcher.group(1) + "    \"" + fixed + "\"";
+                    if (literalMatcher.group(3) != null) {
+                        triple += literalMatcher.group(3);
                     }
                     log.debug("new triple: {}", triple);
-                    sb.append("    " + triple + m.group(4));
+                    sb.append("    " + triple + literalMatcher.group(4));
                     sb.append(("\n"));
                 } catch (Exception e) {
                     log.error("Error fixing line, so triple ignored: {}", e.getMessage());
@@ -157,8 +163,19 @@ public class FreebaseProvider extends AbstractHttpProvider {
                         sb.append((".\n"));
                     }
                 }
+            } else if (resourceMatcher.matches()) {
+                if (resourceMatcher.group(2).contains("$")) {
+                    //awaiting for https://openrdf.atlassian.net/browse/SES-2000
+                    if (line.endsWith(".")) {
+                        sb.append((".\n"));
+                    }
+                } else {
+                    //fine, so pass-through
+                    sb.append(line);
+                    sb.append(("\n"));
+                }
             } else {
-                //not a triple with a literal as object, so pass-through
+                //something else, so pass-through
                 sb.append(line);
                 sb.append(("\n"));
             }
@@ -169,17 +186,25 @@ public class FreebaseProvider extends AbstractHttpProvider {
     private String fixLiteral(String literal) throws UnsupportedEncodingException {
 
         //non-escaped quotes
-        literal = literal.replaceAll("\"", "\\\"");
+        literal = literal.replaceAll("\"", "'");
 
-        //wrong encoding
+        //wrong charset
         if (literal.contains("\\x")) {
             //TODO: find a way to re-code properly the literal
             //http://www.ic.unicamp.br/~stolfi/EXPORT/www/ISO-8859-1-Encoding.html
             literal = literal.replaceAll("\\\\xe1", "á");
+            literal = literal.replaceAll("\\\\xe3", "ã");
+            literal = literal.replaceAll("\\\\xe7", "ç");
             literal = literal.replaceAll("\\\\xe9", "é");
             literal = literal.replaceAll("\\\\xed", "í");
             literal = literal.replaceAll("\\\\xf3", "ó");
             literal = literal.replaceAll("\\\\xfa", "ú");
+            literal = literal.replaceAll("\\\\x", ""); //FIXME: wrong, wrong, wrong!
+        }
+
+        //wrong unicode encoding
+        if (literal.contains("\\u")) {
+            literal = StringEscapeUtils.unescapeJava(literal);
         }
 
         return literal;


Mime
View raw message