tika-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From amy...@apache.org
Subject svn commit: r1220698 - in /tika/trunk/tika-core/src/main/java/org/apache/tika: detect/TextDetector.java mime/MimeTypes.java
Date Mon, 19 Dec 2011 11:36:33 GMT
Author: amylka
Date: Mon Dec 19 11:36:33 2011
New Revision: 1220698

URL: http://svn.apache.org/viewvc?rev=1220698&view=rev
Log:
TIKA-814 MimeTypes detects plain text based on a larger sample of bytes.

Modified:
    tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java?rev=1220698&r1=1220697&r2=1220698&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java Mon Dec 19 11:36:33 2011
@@ -45,7 +45,7 @@ public class TextDetector implements Det
      * The number of bytes from the beginning of the document stream
      * to test for control bytes.
      */
-    private static final int NUMBER_OF_BYTES_TO_TEST = 512;
+    private static final int DEFAULT_NUMBER_OF_BYTES_TO_TEST = 512;
 
     /**
      * Lookup table for all the ASCII/ISO-Latin/UTF-8/etc. control bytes
@@ -81,6 +81,24 @@ public class TextDetector implements Det
         IS_CONTROL_BYTE[0x1B] = false; // escape
     }
 
+    private final int bytesToTest;
+    
+    /**
+     * Constructs a {@link TextDetector} which will look at the default number
+     * of bytes from the beginning of the document.
+     */
+    public TextDetector() {
+        this(DEFAULT_NUMBER_OF_BYTES_TO_TEST);
+    }
+
+    /**
+     * Constructs a {@link TextDetector} which will look at a given number of
+     * bytes from the beginning of the document.
+     */
+    public TextDetector(int bytesToTest) {
+        this.bytesToTest = bytesToTest;
+    }
+    
     /**
      * Looks at the beginning of the document input stream to determine
      * whether the document is text or not.
@@ -96,13 +114,13 @@ public class TextDetector implements Det
             return MediaType.OCTET_STREAM;
         }
 
-        input.mark(NUMBER_OF_BYTES_TO_TEST);
+        input.mark(bytesToTest);
         try {
             int chars = 0;
             int controls = 0;
             int asciis = 0;
             int ch = input.read();
-            while (ch != -1 && chars < NUMBER_OF_BYTES_TO_TEST) {
+            while (ch != -1 && chars < bytesToTest) {
                 if (ch < IS_CONTROL_BYTE.length && IS_CONTROL_BYTE[ch]) {
                     controls++;
                 } else if (ch < 127) {

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java?rev=1220698&r1=1220697&r2=1220698&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java Mon Dec 19 11:36:33 2011
@@ -210,7 +210,7 @@ public final class MimeTypes implements 
 
         // Finally, assume plain text if no control bytes are found
         try {
-            TextDetector detector = new TextDetector();
+            TextDetector detector = new TextDetector(getMinLength());
             ByteArrayInputStream stream = new ByteArrayInputStream(data);
             return forName(detector.detect(stream, new Metadata()).toString());
         } catch (Exception e) {



Mime
View raw message