tika-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From max...@apache.org
Subject svn commit: r1303359 - in /tika/trunk/tika-app: pom.xml src/main/java/org/apache/tika/cli/TikaCLI.java src/test/java/org/apache/tika/cli/TikaCLITest.java src/test/resources/test-data/coffee.xls
Date Wed, 21 Mar 2012 11:05:05 GMT
Author: maxcom
Date: Wed Mar 21 11:05:04 2012
New Revision: 1303359

URL: http://svn.apache.org/viewvc?rev=1303359&view=rev
Log:
TIKA-877 - fix extraction for OLE-attachements in TikaCli

Added:
    tika/trunk/tika-app/src/test/resources/test-data/coffee.xls   (with props)
Modified:
    tika/trunk/tika-app/pom.xml
    tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
    tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java

Modified: tika/trunk/tika-app/pom.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/pom.xml?rev=1303359&r1=1303358&r2=1303359&view=diff
==============================================================================
--- tika/trunk/tika-app/pom.xml (original)
+++ tika/trunk/tika-app/pom.xml Wed Mar 21 11:05:04 2012
@@ -62,6 +62,12 @@
       <artifactId>junit</artifactId>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <artifactId>commons-io</artifactId>
+      <groupId>commons-io</groupId>
+      <version>2.1</version>
+      <scope>test</scope>
+    </dependency>
   </dependencies>
 
   <build>

Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=1303359&r1=1303358&r2=1303359&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java (original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java Wed Mar 21 11:05:04 2012
@@ -16,16 +16,7 @@
  */
 package org.apache.tika.cli;
 
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.io.OutputStreamWriter;
-import java.io.PrintStream;
-import java.io.PrintWriter;
-import java.io.UnsupportedEncodingException;
-import java.io.Writer;
+import java.io.*;
 import java.lang.reflect.Field;
 import java.net.ServerSocket;
 import java.net.Socket;
@@ -53,6 +44,10 @@ import org.apache.log4j.Level;
 import org.apache.log4j.Logger;
 import org.apache.log4j.SimpleLayout;
 import org.apache.log4j.WriterAppender;
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.tika.Tika;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.detect.CompositeDetector;
@@ -89,6 +84,7 @@ import com.google.gson.Gson;
  * Simple command line interface for Apache Tika.
  */
 public class TikaCLI {
+    private File extractDir = new File(".");
 
     public static void main(String[] args) throws Exception {
         BasicConfigurator.configure(
@@ -353,6 +349,8 @@ public class TikaCLI {
             type = LANGUAGE;
         } else if (arg.equals("-d") || arg.equals("--detect")) {
             type = DETECT;
+        } else if (arg.startsWith("--extract-dir=")) {
+            extractDir = new File(arg.substring("--extract-dir=".length()));
         } else if (arg.equals("-z") || arg.equals("--extract")) {
             type = NO_OUTPUT;
             context.set(EmbeddedDocumentExtractor.class, new FileEmbeddedDocumentExtractor());
@@ -427,6 +425,7 @@ public class TikaCLI {
         out.println("    -d  or --detect        Detect document type");
         out.println("    -eX or --encoding=X    Use output encoding X");
         out.println("    -z  or --extract       Extract all attachements into current directory");
       
+        out.println("    --extract-dir=<dir>    Specify target directory for -z");
       
         out.println("    -r  or --pretty-print  For XML and XHTML outputs, adds newlines
and");
         out.println("                           whitespace, for better readability");
         out.println();
@@ -685,7 +684,7 @@ public class TikaCLI {
                 }
             }
 
-            File outputFile = new File(name);
+            File outputFile = new File(extractDir, name);
             if (outputFile.exists()) {
                 System.err.println("File '"+name+"' already exists; skipping");
                 return;
@@ -695,10 +694,41 @@ public class TikaCLI {
 
             FileOutputStream os = new FileOutputStream(outputFile);
 
-            IOUtils.copy(inputStream, os);
+            if (inputStream instanceof TikaInputStream) {
+                TikaInputStream tin = (TikaInputStream) inputStream;
+
+                if (tin.getOpenContainer() != null && tin.getOpenContainer() instanceof
DirectoryEntry) {
+                    POIFSFileSystem fs = new POIFSFileSystem();
+                    copy((DirectoryEntry) tin.getOpenContainer(), fs.getRoot());
+                    fs.writeFilesystem(os);
+                } else {
+                    IOUtils.copy(inputStream, os);
+                }
+            } else {
+                IOUtils.copy(inputStream, os);
+            }
 
             os.close();
         }
+
+        protected void copy(DirectoryEntry sourceDir, DirectoryEntry destDir)
+                throws IOException {
+            for (org.apache.poi.poifs.filesystem.Entry entry : sourceDir) {
+                if (entry instanceof DirectoryEntry) {
+                    // Need to recurse
+                    DirectoryEntry newDir = destDir.createDirectory(entry.getName());
+                    copy((DirectoryEntry) entry, newDir);
+                } else {
+                    // Copy entry
+                    InputStream contents = new DocumentInputStream((DocumentEntry) entry);
+                    try {
+                        destDir.createDocument(entry.getName(), contents);
+                    } finally {
+                        contents.close();
+                    }
+                }
+            }
+        }
     }
 
     private class TikaServer extends Thread {

Modified: tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java?rev=1303359&r1=1303358&r2=1303359&view=diff
==============================================================================
--- tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java (original)
+++ tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java Wed Mar 21 11:05:04 2012
@@ -23,6 +23,7 @@ import java.net.URI;
 
 import junit.framework.Assert;
 import junit.framework.TestCase;
+import org.apache.commons.io.FileUtils;
 
 /**
  * Tests the Tika's cli
@@ -173,4 +174,28 @@ public class TikaCLITest extends TestCas
         System.setOut(stdout);
     }
 
+    public void testExtract() throws Exception {
+        File tempFile = File.createTempFile("tika-test-", "");
+        tempFile.delete();
+        tempFile.mkdir(); // not really good method for production usage, but ok for tests
+                          // google guava library has better solution
+
+        try {
+            String[] params = {"--extract-dir="+tempFile.getAbsolutePath(),"-z", resorcePrefix
+ "/coffee.xls"};
+            
+            TikaCLI.main(params);
+            
+            File expected1 = new File(tempFile, "MBD002B040A.wps");
+            File expected2 = new File(tempFile, "file5");
+            
+            assertTrue(expected1.exists());
+            assertTrue(expected2.exists());
+            
+            assertTrue(expected1.length()>0);
+            assertTrue(expected2.length()>0);
+        } finally {
+            FileUtils.deleteDirectory(tempFile);
+        }
+
+    }
 }

Added: tika/trunk/tika-app/src/test/resources/test-data/coffee.xls
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/resources/test-data/coffee.xls?rev=1303359&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-app/src/test/resources/test-data/coffee.xls
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream



Mime
View raw message