tika-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From amy...@apache.org
Subject svn commit: r1220687 - in /tika/trunk: tika-core/src/main/resources/org/apache/tika/mime/ tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ tika-parsers/src/test/java/org/apache/tika/detect/ tika-parsers/src/test/java/org/apache/tika/mime/ t...
Date Mon, 19 Dec 2011 11:15:57 GMT
Author: amylka
Date: Mon Dec 19 11:15:56 2011
New Revision: 1220687

URL: http://svn.apache.org/viewvc?rev=1220687&view=rev
Log:
TIKA-812 Support for detection of MS Works 7.0 Spreadsheet files

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/testWORKSSpreadsheet7.0.xlr
  (with props)
Modified:
    tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java

Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1220687&r1=1220686&r2=1220687&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original)
+++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Mon Dec
19 11:15:56 2011
@@ -2777,6 +2777,17 @@
     <_comment>OLE10 Native Embedded Document</_comment>
   </mime-type>
 
+  <mime-type type="application/x-tika-msworks-spreadsheet">
+    <glob pattern="*.xlr"/>
+    <sub-class-of type="application/vnd.ms-excel"/>
+    <!-- this has to be higher than the Excel match -->
+    <magic priority="60">
+      <match value="0xd0cf11e0a1b11ae1" type="string" offset="0:8">
+         <match value="W\x00k\x00s\x00S\x00S\x00W\x00o\x00r\x00k\x00B\x00o\x00o\x00k" type="string" offset="1152:4096" />
+      </match>
+    </magic>
+  </mime-type>
+
   <!-- =================================================================== -->
   <!-- Office Open XML file formats                                        -->
   <!-- http://www.ecma-international.org/publications/standards/Ecma-376.htm -->

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=1220687&r1=1220686&r2=1220687&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
(original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
Mon Dec 19 11:15:56 2011
@@ -69,6 +69,7 @@ public class OfficeParser extends Abstra
                     POIFSDocumentType.PROJECT.type,
                     POIFSDocumentType.VISIO.type,
                     // Works isn't supported
+                    POIFSDocumentType.XLR.type, // but Works 7.0 Spreadsheet is
                     POIFSDocumentType.OUTLOOK.type,
                     MediaType.application("vnd.ms-excel.sheet.binary.macroenabled.12")
                     )));
@@ -84,6 +85,7 @@ public class OfficeParser extends Abstra
         PROJECT("mpp", MediaType.application("vnd.ms-project")),
         VISIO("vsd", MediaType.application("vnd.visio")),
         WORKS("wps", MediaType.application("vnd.ms-works")),
+        XLR("xlr", MediaType.application("x-tika-msworks-spreadsheet")),
         OUTLOOK("msg", MediaType.application("vnd.ms-outlook"));
 
         private final String extension;
@@ -186,6 +188,7 @@ public class OfficeParser extends Abstra
            new HSLFExtractor(context).parse(root, xhtml);
            break;
         case WORKBOOK:
+        case XLR:
            Locale locale = context.get(Locale.class, Locale.getDefault());
            new ExcelExtractor(context).parse(root, xhtml, locale);
            break;

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java?rev=1220687&r1=1220686&r2=1220687&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
(original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
Mon Dec 19 11:15:56 2011
@@ -71,6 +71,9 @@ public class POIFSContainerDetector impl
 
     /** Microsoft Works */
     public static final MediaType WPS = application("vnd.ms-works");
+    
+    /** Microsoft Works Spreadsheet 7.0 */
+    public static final MediaType XLR = application("x-tika-msworks-spreadsheet");
 
     /** Microsoft Outlook */
     public static final MediaType MSG = application("vnd.ms-outlook");
@@ -133,7 +136,12 @@ public class POIFSContainerDetector impl
      */
     protected static MediaType detect(Set<String> names) {
         if (names != null) {
-            if (names.contains("Workbook")) {
+            if (names.contains("WksSSWorkBook")) {
+                // This check has to be before names.contains("Workbook")
+                // Works 7.0 spreadsheet files contain both
+                // we want to avoid classifying this as Excel
+                return XLR; 
+            } else if (names.contains("Workbook")) {
                 return XLS;
             } else if (names.contains("EncryptedPackage") && 
                     names.contains("EncryptionInfo") &&

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java?rev=1220687&r1=1220686&r2=1220687&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
(original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
Mon Dec 19 11:15:56 2011
@@ -70,6 +70,7 @@ public class TestContainerAwareDetector 
         assertTypeByData("testPUBLISHER.pub", "application/x-mspublisher");
         assertTypeByData("testWORKS.wps", "application/vnd.ms-works");
         assertTypeByData("testWORKS2000.wps", "application/vnd.ms-works");
+        assertTypeByData("testWORKSSpreadsheet7.0.xlr", "application/x-tika-msworks-spreadsheet");
         assertTypeByData("testPROJECT2003.mpp", "application/vnd.ms-project");
         assertTypeByData("testPROJECT2007.mpp", "application/vnd.ms-project");
 

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=1220687&r1=1220686&r2=1220687&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java Mon Dec
19 11:15:56 2011
@@ -1,5 +1,5 @@
 /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
+* Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
@@ -27,7 +27,9 @@ import junit.framework.TestCase;
 
 import org.apache.tika.Tika;
 import org.apache.tika.config.TikaConfig;
+import org.apache.tika.detect.DefaultDetector;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.microsoft.POIFSContainerDetector;
 
 /**
  * 
@@ -149,6 +151,35 @@ public class TestMimeTypes extends TestC
     }
     
     /**
+     * Files generated by Works 7.0 Spreadsheet application use the OLE2
+     * structure and resemble Excel files (they contain a "Workbook"). They are
+     * not Excel though. The {@link POIFSContainerDetector} can detect them
+     * properly. With plain {@link MimeTypes} they are detected as Excel,
+     * because of the "Workbook" string. It's a problem we discussed in TIKA-806
+     * and agreed that we live with that. The policy is that container-based
+     * detection should trump magic-based detection. It's implemented in
+     * {@link DefaultDetector} (TIKA-786) and users who don't want to use
+     * {@link DefaultDetector} should be aware of it.
+     * 
+     * @throws Exception
+     */
+    public void testWorks70Detection() throws Exception {
+        // this is possible due to MimeTypes guessing the type
+        // based on the WksSSWorkBook near the beginning of the
+        // file
+        assertTypeByData("application/x-tika-msworks-spreadsheet",
+                "testWORKSSpreadsheet7.0.xlr");
+        
+        // this is right, we made x-tika-msworks-spreadsheet a subtype of vnd.ms-excel
+        assertTypeByNameAndData("application/x-tika-msworks-spreadsheet",
+                "testWORKSSpreadsheet7.0.xlr");
+        
+        // with name-only, everything should be all right
+        assertTypeByName("application/x-tika-msworks-spreadsheet", 
+                "testWORKSSpreadsheet7.0.xlr");
+    }
+    
+    /**
      * Note - detecting container formats by mime magic is very very
      *  iffy, as we can't be sure where things will end up.
      * People really ought to use the container aware detection...

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java?rev=1220687&r1=1220686&r2=1220687&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
(original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
Mon Dec 19 11:15:56 2011
@@ -192,5 +192,22 @@ public class ExcelParserTest extends Tes
             input.close();
         }
     }
+    
+    public void testWorksSpreadsheet70() throws Exception {
+        InputStream input = ExcelParserTest.class.getResourceAsStream(
+                "/test-documents/testWORKSSpreadsheet7.0.xlr");
+        try {
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler(-1);
+            ParseContext context = new ParseContext();
+            context.set(Locale.class, Locale.US);
+            new OfficeParser().parse(input, handler, metadata, context);
+
+            String content = handler.toString();
+            assertTrue(content.contains("Microsoft Works"));
+        } finally {
+            input.close();
+        }
+    }
 
 }

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORKSSpreadsheet7.0.xlr
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testWORKSSpreadsheet7.0.xlr?rev=1220687&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORKSSpreadsheet7.0.xlr
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream



Mime
View raw message