jena-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From rve...@apache.org
Subject [11/50] [abbrv] git commit: add CSV2RDF tool
Date Thu, 02 Oct 2014 09:48:20 GMT
add CSV2RDF tool

git-svn-id: http://svn.apache.org/repos/asf/jena/Experimental/jena-csv@1613797 13f79535-47bb-0310-9956-ffa450edef68


Project: http://git-wip-us.apache.org/repos/asf/jena/repo
Commit: http://git-wip-us.apache.org/repos/asf/jena/commit/5b0eaa4a
Tree: http://git-wip-us.apache.org/repos/asf/jena/tree/5b0eaa4a
Diff: http://git-wip-us.apache.org/repos/asf/jena/diff/5b0eaa4a

Branch: refs/heads/jena-csv
Commit: 5b0eaa4a5edfc50a225d5500f508a390d32b3dcb
Parents: 534d0cf
Author: Ying Jiang <jpz6311whu@apache.org>
Authored: Sun Jul 27 14:24:26 2014 +0000
Committer: Ying Jiang <jpz6311whu@apache.org>
Committed: Sun Jul 27 14:24:26 2014 +0000

----------------------------------------------------------------------
 src/main/java/riotcmd/LocatorOupputFile.java | 148 ++++++++++++++++
 src/main/java/riotcmd/ModDest.java           |  51 ++++++
 src/main/java/riotcmd/csv2rdf.java           | 205 ++++++++++++++++++++++
 3 files changed, 404 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/jena/blob/5b0eaa4a/src/main/java/riotcmd/LocatorOupputFile.java
----------------------------------------------------------------------
diff --git a/src/main/java/riotcmd/LocatorOupputFile.java b/src/main/java/riotcmd/LocatorOupputFile.java
new file mode 100644
index 0000000..3d9cc52
--- /dev/null
+++ b/src/main/java/riotcmd/LocatorOupputFile.java
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package riotcmd;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.security.AccessControlException;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.hp.hpl.jena.util.FileManager;
+import com.hp.hpl.jena.util.FileUtils;
+import com.hp.hpl.jena.util.LocatorFile;
+
+/**
+ * Maps a file name or URI onto a local {@link File} and opens it for writing.
+ * <p>
+ * A name that does not start with a path separator is resolved against the optional
+ * base directory given at construction time.
+ * <p>
+ * NOTE(review): the layout of this class looks adapted from {@link LocatorFile}; it does
+ * not extend or implement any locator type itself - confirm whether that is intended.
+ */
+public class LocatorOupputFile {
+    static final Logger log = LoggerFactory.getLogger(LocatorOupputFile.class) ;
+
+    /** Base directory without a trailing separator, or {@code null} for "no base directory". */
+    private final String thisDir ;
+    /** Suffix appended to trace messages; empty when there is no base directory. */
+    private final String thisDirLogStr ;
+
+    /**
+     * Creates a locator that resolves relative names against {@code dir}.
+     *
+     * @param dir base directory; one trailing {@code "/"} or platform separator is
+     *            stripped. May be {@code null}, meaning names are used as given.
+     */
+    public LocatorOupputFile(String dir)
+    {
+        String logStr = "" ;
+        if ( dir != null )
+        {
+            if ( dir.endsWith("/") || dir.endsWith(java.io.File.separator) )
+                dir = dir.substring(0,dir.length()-1) ;
+            logStr = " ["+dir+"]" ;
+        }
+        thisDir = dir ;
+        thisDirLogStr = logStr ;
+    }
+
+    /** Creates a locator with no base directory. */
+    LocatorOupputFile()
+    {
+        this(null) ;
+    }
+
+    /**
+     * Two locators are equal when they have the same base directory (both {@code null},
+     * or equal strings).
+     */
+    @Override
+    public boolean equals( Object other )
+    {
+        // The type test must match the cast below: testing for LocatorFile here made
+        // equal LocatorOupputFile instances compare unequal and made a real LocatorFile
+        // argument fail with ClassCastException.
+        return
+            other instanceof LocatorOupputFile
+            && equals( thisDir, ((LocatorOupputFile) other).thisDir );
+    }
+
+    /** Null-safe string equality. */
+    private boolean equals( String a, String b )
+    {
+        return a == null ? b == null : a.equals(  b  );
+    }
+
+    /** Hash of the base directory; a fixed value when there is none. */
+    @Override
+    public int hashCode()
+    {
+        if ( thisDir == null )
+            return 157 ;
+        return thisDir.hashCode();
+    }
+
+    /**
+     * Converts a file name or URI into a {@link File}, prefixing the base directory when
+     * one is set and the name does not start with {@code "/"} or
+     * {@code FileManager.filePathSeparator}.
+     *
+     * @return the file, or {@code null} if {@code FileUtils.toFilename} yields {@code null}
+     */
+    private File toFile(String filenameOrURI)
+    {
+        String fn = FileUtils.toFilename(filenameOrURI) ;
+        if ( fn == null )
+            return null ;
+
+        if ( thisDir != null && ! fn.startsWith("/") && ! fn.startsWith(FileManager.filePathSeparator) )
+            fn = thisDir+java.io.File.separator+fn ;
+
+        return new File(fn) ;
+    }
+
+    /**
+     * Tests whether the file named by {@code filenameOrURI} currently exists.
+     *
+     * @return {@code false} if the name cannot be converted to a file or the file is absent
+     */
+    public boolean exists(String filenameOrURI)
+    {
+        File f = toFile(filenameOrURI) ;
+
+        if ( f == null )
+            return false ;
+
+        return f.exists() ;
+    }
+
+    /**
+     * Opens the file named by {@code filenameOrURI} for writing.
+     * <p>
+     * The stream is created with {@link FileOutputStream#FileOutputStream(File)}, so an
+     * existing file is overwritten from the start. The caller owns the returned stream
+     * and is responsible for closing it.
+     *
+     * @return the open stream, or {@code null} if the name cannot be converted to a file,
+     *         the file cannot be opened for writing, or access is denied
+     */
+    public OutputStream open(String filenameOrURI)
+    {
+        // Worry about %20.
+        // toFile calls FileUtils.toFilename(filenameOrURI) ;
+        File f = toFile(filenameOrURI) ;
+        if ( f == null )
+        {
+            if ( log.isTraceEnabled())
+                log.trace("Not found: "+filenameOrURI+thisDirLogStr) ;
+            return null ;
+        }
+
+        try {
+            OutputStream out = new FileOutputStream(f) ;
+
+            if ( log.isTraceEnabled() )
+                log.trace("Found: "+filenameOrURI+thisDirLogStr) ;
+
+            return out ;
+        } catch (AccessControlException e) {
+            // The FileOutputStream constructor is the call that can be refused by a
+            // security manager, so the handler belongs here.
+            log.warn("Security problem testing for file", e);
+            return null;
+        } catch (IOException ioEx) {
+            // Includes FileNotFoundException (e.g. a directory, or not writable).
+            // Report it rather than failing silently; callers still just see null.
+            log.warn("Can't open file for writing: "+f.getPath(), ioEx) ;
+            return null ;
+        }
+    }
+
+    /** @return the base directory, or {@code null} if none was given */
+    public String getDir()  { return thisDir ; }
+
+    /**
+     * Returns a display name that includes the base directory when one is set.
+     * NOTE(review): the label is "LocatorFile", not this class's own name - kept as is
+     * in case something matches on it; confirm whether it should be renamed.
+     */
+    public String getName()
+    {
+        String tmp = "LocatorFile" ;
+        if ( thisDir != null )
+            tmp = tmp+"("+thisDir+")" ;
+        return tmp ;
+    }
+}

http://git-wip-us.apache.org/repos/asf/jena/blob/5b0eaa4a/src/main/java/riotcmd/ModDest.java
----------------------------------------------------------------------
diff --git a/src/main/java/riotcmd/ModDest.java b/src/main/java/riotcmd/ModDest.java
new file mode 100644
index 0000000..739adcd
--- /dev/null
+++ b/src/main/java/riotcmd/ModDest.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package riotcmd;
+
+import arq.cmd.CmdException;
+import arq.cmdline.ArgDecl;
+import arq.cmdline.ArgModuleGeneral;
+import arq.cmdline.CmdArgModule;
+import arq.cmdline.CmdGeneral;
+
+/**
+ * Command-line argument module for the mandatory {@code --dest} option, which names the
+ * destination output file.
+ */
+public class ModDest implements ArgModuleGeneral{
+
+    private final ArgDecl argDest = new ArgDecl(ArgDecl.HasValue, "dest") ;
+    /** Value of {@code --dest}; {@code null} until {@link #processArgs} has run. */
+    private String dest           = null ;
+
+    /**
+     * Reads the {@code --dest} value from the parsed command line.
+     *
+     * @throws CmdException if {@code --dest} was not supplied
+     */
+    @Override
+    public void processArgs(CmdArgModule cmdLine) {
+        if ( cmdLine.contains(argDest) ) {
+            dest = cmdLine.getValue(argDest) ;
+        } else {
+            throw new CmdException("No destination output file! Please add '--dest=file' in the program arguments") ;
+        }
+    }
+
+    /** Registers the {@code --dest} option and its usage text with the command. */
+    @Override
+    public void registerWith(CmdGeneral cmdLine) {
+        cmdLine.getUsage().startCategory("Destination Output") ;
+        cmdLine.add(argDest,    "--dest=file",      "The destination output file") ;
+    }
+
+    /** @return the destination given with {@code --dest}, or {@code null} if not yet processed */
+    public String getDest() {
+        return dest ;
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/jena/blob/5b0eaa4a/src/main/java/riotcmd/csv2rdf.java
----------------------------------------------------------------------
diff --git a/src/main/java/riotcmd/csv2rdf.java b/src/main/java/riotcmd/csv2rdf.java
new file mode 100644
index 0000000..882a29a
--- /dev/null
+++ b/src/main/java/riotcmd/csv2rdf.java
@@ -0,0 +1,205 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package riotcmd;
+
+import java.io.OutputStream;
+
+import org.apache.jena.atlas.io.IO;
+import org.apache.jena.atlas.web.ContentType;
+import org.apache.jena.atlas.web.TypedInputStream;
+import org.apache.jena.riot.Lang;
+import org.apache.jena.riot.RDFDataMgr;
+import org.apache.jena.riot.RDFLanguages;
+import org.apache.jena.riot.ReaderRIOT;
+import org.apache.jena.riot.RiotException;
+import org.apache.jena.riot.SysRIOT;
+import org.apache.jena.riot.lang.LabelToNode;
+import org.apache.jena.riot.lang.StreamRDFCounting;
+import org.apache.jena.riot.out.NodeToLabel;
+import org.apache.jena.riot.process.inf.InfFactory;
+import org.apache.jena.riot.system.ErrorHandler;
+import org.apache.jena.riot.system.ErrorHandlerFactory;
+import org.apache.jena.riot.system.RiotLib;
+import org.apache.jena.riot.system.StreamRDF;
+import org.apache.jena.riot.system.StreamRDF2;
+import org.apache.jena.riot.system.StreamRDFLib;
+import org.apache.jena.riot.system.SyntaxLabels;
+
+import arq.cmd.CmdException;
+
+import com.hp.hpl.jena.sparql.util.Utils;
+
+/**
+ * A command line tool for direct and scalable transformation from CSV to a formatted RDF
+ * syntax (i.e. N-Triples), with no intermediary Graph or PropertyTable.
+ *
+ * It reuses the parsing functions from CmdLangParse and sinks the triples into the
+ * destination output file named by {@code --dest}.
+ */
+public class csv2rdf extends CmdLangParse{
+
+    protected ModDest modDest = new ModDest() ;
+    /** Destination stream; opened on the first call to {@code parseRIOT} and then reused. */
+    protected OutputStream destOut;
+
+    public static void main(String... argv)
+    {
+        new csv2rdf(argv).mainRun() ;
+    }
+
+    protected csv2rdf(String[] argv)
+    {
+        super(argv) ;
+        super.addModule(modDest) ;
+    }
+
+    /** Always selects CSV, whatever the file name or content type suggests. */
+    @Override
+    protected Lang selectLang(String filename, ContentType contentType,
+            Lang dftLang) {
+        return RDFLanguages.CSV;
+    }
+
+    @Override
+    protected String getCommandName() {
+        return Utils.classShortName(csv2rdf.class) ;
+    }
+
+    @Override
+    protected String getSummary()
+    {
+        return getCommandName()+" --dest=outputFile inputFile ..." ;
+    }
+
+    /**
+     * Parses one input and streams the result to the destination file, and also to the
+     * inherited {@code output} unless the bit-bucket option is set.
+     * Overrides the original CmdLangParse.parseRIOT().
+     *
+     * @param baseURI  base IRI, passed through {@code SysRIOT.chooseBaseIRI} with the file name
+     * @param filename name of the input, used for base IRI choice and timing output
+     * @param in       the input to parse; closed by this method
+     */
+    @Override
+    protected void parseRIOT(String baseURI, String filename, TypedInputStream in)
+    {
+        String dest = modDest.getDest();
+
+        // Open the destination once and reuse it for every input file. Opening it on each
+        // call would truncate what earlier inputs wrote and leak the previous stream.
+        if ( destOut == null )
+            destOut = new LocatorOupputFile().open(dest);
+
+        if (destOut == null){
+            System.err.println("Can't write to destination output file: '"+dest+"' ") ;
+            // Nothing will be parsed, so release the input before giving up.
+            IO.close(in) ;
+            return ;
+        }
+
+        // It is a shame we effectively duplicate deciding the language but we want to control
+        // the parser at a deep level (in validation, we want line numbers to get into error
+        // messages). This code predates RDFDataMgr.
+
+        ContentType ct = in.getMediaType() ;
+
+        baseURI = SysRIOT.chooseBaseIRI(baseURI, filename) ;
+
+        boolean checking = true ;
+        if ( modLangParse.explicitChecking() )  checking = true ;
+        if ( modLangParse.explicitNoChecking() ) checking = false ;
+
+        ErrorHandler errHandler = null ;
+        if ( checking )
+        {
+            if ( modLangParse.stopOnBadTerm() )
+                errHandler = ErrorHandlerFactory.errorHandlerStd  ;
+            else
+                // Try to go on if possible.  This is the default behaviour.
+                errHandler = ErrorHandlerFactory.errorHandlerWarn ;
+        }
+
+        if ( modLangParse.skipOnBadTerm() )
+        {
+            // TODO skipOnBadterm
+        }
+
+        Lang lang = selectLang(filename, ct, RDFLanguages.NQUADS) ;
+        LangHandler handler = dispatch.get(lang) ;
+        if ( handler == null )
+            throw new CmdException("Undefined language: "+lang) ;
+
+        // If multiple files, choose the overall labels.
+        if ( langHandlerOverall == null )
+            langHandlerOverall = handler ;
+        else
+        {
+            if ( langHandlerOverall != langHandlerAny )
+            {
+                if ( langHandlerOverall != handler )
+                    langHandlerOverall = langHandlerAny ;
+            }
+        }
+
+        // Make a flag.
+        // Input and output subflags.
+        // If input is "label, then output using NodeToLabel.createBNodeByLabelRaw() ;
+        // else use NodeToLabel.createBNodeByLabel() ;
+        // Also, as URI.
+        final boolean labelsAsGiven = false ;
+
+        NodeToLabel labels = SyntaxLabels.createNodeToLabel() ;
+        if ( labelsAsGiven )
+            labels = NodeToLabel.createBNodeByLabelEncoded() ;
+
+        StreamRDF s = StreamRDFLib.sinkNull() ;
+        if ( ! modLangParse.toBitBucket() )
+            s = StreamRDFLib.writer(output) ;
+
+        // add dest output (destOut is known to be non-null here)
+        s = new StreamRDF2(s,  StreamRDFLib.writer(destOut));
+
+        if ( setup != null )
+            s = InfFactory.inf(s, setup) ;
+
+        StreamRDFCounting sink = StreamRDFLib.count(s) ;
+        s = null ;
+
+        ReaderRIOT reader = RDFDataMgr.createReader(lang) ;
+        try {
+            if ( checking ) {
+                if ( lang == RDFLanguages.NTRIPLES || lang == RDFLanguages.NQUADS )
+                    reader.setParserProfile(RiotLib.profile(baseURI, false, true, errHandler)) ;
+                else
+                    reader.setParserProfile(RiotLib.profile(baseURI, true, true, errHandler)) ;
+            } else
+                reader.setParserProfile(RiotLib.profile(baseURI, false, false, errHandler)) ;
+
+            if ( labelsAsGiven )
+                reader.getParserProfile().setLabelToNode(LabelToNode.createUseLabelAsGiven()) ;
+            modTime.startTimer() ;
+            reader.read(in, baseURI, ct, sink, null) ;
+        } catch (RiotException ex) {
+            // Should have handled the exception and logged a message by now.
+            // System.err.println("++++"+ex.getMessage());
+
+            if ( modLangParse.stopOnBadTerm() )
+                return ;
+        } finally {
+            // Not close - we may write again to the underlying output stream in another
+            // call to parse a file.
+            sink.finish() ;
+            IO.close(in) ;
+        }
+        long x = modTime.endTimer() ;
+        long n = sink.countTriples()+sink.countQuads() ;
+
+        if ( modTime.timingEnabled() )
+            output(filename, n, x, handler) ;
+
+        totalMillis += x ;
+        totalTuples += n ;
+    }
+}


Mime
View raw message