tika-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From amy...@apache.org
Subject svn commit: r1221323 - in /tika/trunk: tika-core/src/main/resources/org/apache/tika/mime/ tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ tika-parsers/src/test/java/org/apache/tika/detect/ tika-parsers/src/test/java/org/apache/tika/mime/ tika-parsers/src/test/resources/test-documents/
Date Tue, 20 Dec 2011 15:55:48 GMT
Author: amylka
Date: Tue Dec 20 15:55:48 2011
New Revision: 1221323

URL: http://svn.apache.org/viewvc?rev=1221323&view=rev
Log:
TIKA-821 Added support for detection of old MS Works Word Processor files

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/testWORKSWordProcessor3.0.wps
  (with props)
    tika/trunk/tika-parsers/src/test/resources/test-documents/testWORKSWordProcessor4.0.wps
  (with props)
Modified:
    tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java

Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1221323&r1=1221322&r2=1221323&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original)
+++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Tue Dec 20 15:55:48 2011
@@ -1355,6 +1355,11 @@
   </mime-type>
 
   <mime-type type="application/vnd.ms-works">
+    <magic priority="50">
+      <match value="0xd0cf11e0a1b11ae1" type="string" offset="0:8">
+         <match value="M\x00a\x00t\x00O\x00S\x00T" type="string" offset="1152:4096" />
+      </match>
+    </magic>
     <glob pattern="*.wps"/>
     <glob pattern="*.wks"/>
     <glob pattern="*.wcm"/>

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java?rev=1221323&r1=1221322&r2=1221323&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java Tue Dec 20 15:55:48 2011
@@ -164,6 +164,9 @@ public class POIFSContainerDetector impl
                 return VSD;
             } else if (names.contains("\u0001Ole10Native")) {
                 return OLE10_NATIVE;
+            } else if (names.contains("MatOST")) {
+            	// this occurs on older Works Word Processor files (versions 3.0 and 4.0)
+            	return WPS;
             } else if (names.contains("CONTENTS") && names.contains("SPELLING")) {
                // Newer Works files
                return WPS;

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java?rev=1221323&r1=1221322&r2=1221323&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java Tue Dec 20 15:55:48 2011
@@ -70,6 +70,11 @@ public class TestContainerAwareDetector 
         assertTypeByData("testPUBLISHER.pub", "application/x-mspublisher");
         assertTypeByData("testWORKS.wps", "application/vnd.ms-works");
         assertTypeByData("testWORKS2000.wps", "application/vnd.ms-works");
+        // older Works Word Processor files can't be recognized
+    	// they were created with Works Word Processor 7.0 (hence the text inside)
+    	// and exported to the older formats with the "Save As" feature
+        assertTypeByData("testWORKSWordProcessor3.0.wps","application/vnd.ms-works");
+        assertTypeByData("testWORKSWordProcessor4.0.wps","application/vnd.ms-works");
         assertTypeByData("testWORKSSpreadsheet7.0.xlr", "application/x-tika-msworks-spreadsheet");
         assertTypeByData("testPROJECT2003.mpp", "application/vnd.ms-project");
         assertTypeByData("testPROJECT2007.mpp", "application/vnd.ms-project");
@@ -79,6 +84,7 @@ public class TestContainerAwareDetector 
         assertTypeByData("testQUATTRO.qpw", "application/x-quattro-pro");
         assertTypeByData("testQUATTRO.wb3", "application/x-quattro-pro");
         
+        
         // With the filename and data
         assertTypeByNameAndData("testEXCEL.xls", "application/vnd.ms-excel");
         assertTypeByNameAndData("testWORD.doc", "application/msword");

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=1221323&r1=1221322&r2=1221323&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java Tue Dec 20 15:55:48 2011
@@ -164,19 +164,40 @@ public class TestMimeTypes extends TestC
      * @throws Exception
      */
     public void testWorks70Detection() throws Exception {
-        // this is possible due to MimeTypes guessing the type
-        // based on the WksSSWorkBook near the beginning of the
-        // file
-        assertTypeByData("application/x-tika-msworks-spreadsheet",
-                "testWORKSSpreadsheet7.0.xlr");
-        
-        // this is right, we made x-xlr a subtype of vnd.ms-excel
-        assertTypeByNameAndData("application/x-tika-msworks-spreadsheet",
-                "testWORKSSpreadsheet7.0.xlr");
-        
-        // with name-only, everything should be all right
-        assertTypeByName("application/x-tika-msworks-spreadsheet", 
-                "testWORKSSpreadsheet7.0.xlr");
+    	assertTypeDetection("testWORKSSpreadsheet7.0.xlr",
+    			// with name-only, everything should be all right 
+    			"application/x-tika-msworks-spreadsheet",
+    			// this is possible due to MimeTypes guessing the type
+    	        // based on the WksSSWorkBook near the beginning of the
+    	        // file
+    			"application/x-tika-msworks-spreadsheet",
+    			// this is right, the magic-based detection works, there is
+    	        // no need for the name-based detection to refine it
+    			"application/x-tika-msworks-spreadsheet");
+    }
+    
+    /**
+     * Files generated by Works Word Processor versions 3.0 and 4.0 use the
+     * OLE2 structure. They don't resemble Word though.
+     * 
+     * @throws Exception
+     */
+    public void testOldWorksWordProcessorDetection() throws Exception {
+    	assertTypeDetection(
+    			"testWORKSWordProcessor3.0.wps",
+    			// .wps is just like any other works extension
+    			"application/vnd.ms-works",
+    			// this is due to MatOST substring
+    			"application/vnd.ms-works",
+    			// magic-based detection works, no need to refine it
+    			"application/vnd.ms-works");
+    	
+    	// files in version 4.0 are no different from those in version 3.0
+    	assertTypeDetection(
+    			"testWORKSWordProcessor4.0.wps",
+    			"application/vnd.ms-works",
+    			"application/vnd.ms-works",
+    			"application/vnd.ms-works");
     }
     
     /**
@@ -579,6 +600,13 @@ public class TestMimeTypes extends TestC
        }
     }
     
+    private void assertTypeDetection(String filename, String byName, String byData, 
+    		String byNameAndData) throws IOException {
+    	assertTypeByName(byName, filename);
+    	assertTypeByData(byData, filename);
+    	assertTypeByNameAndData(byNameAndData, filename);
+    }
+    
     private void assertTypeByNameAndData(String expected, String filename)
 	    throws IOException {
        assertEquals(expected, getTypeByNameAndData(filename).toString());

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORKSWordProcessor3.0.wps
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testWORKSWordProcessor3.0.wps?rev=1221323&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORKSWordProcessor3.0.wps
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORKSWordProcessor4.0.wps
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testWORKSWordProcessor4.0.wps?rev=1221323&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORKSWordProcessor4.0.wps
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream



Mime
View raw message