any23-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ans...@apache.org
Subject svn commit: r1380397 [1/2] - in /incubator/any23/trunk: api/src/main/resources/ core/src/main/java/org/apache/any23/extractor/csv/ core/src/main/java/org/apache/any23/mime/ core/src/main/resources/ core/src/main/resources/org/apache/any23/mime/ core/sr...
Date Mon, 03 Sep 2012 23:11:17 GMT
Author: ansell
Date: Mon Sep  3 23:11:15 2012
New Revision: 1380397

URL: http://svn.apache.org/viewvc?rev=1380397&view=rev
Log:
ANY23-117 : Split out mime detection into a module

Added:
    incubator/any23/trunk/api/src/main/resources/
    incubator/any23/trunk/api/src/main/resources/default-configuration.properties
    incubator/any23/trunk/csvutils/
    incubator/any23/trunk/csvutils/pom.xml
    incubator/any23/trunk/csvutils/src/
    incubator/any23/trunk/csvutils/src/main/
    incubator/any23/trunk/csvutils/src/main/java/
    incubator/any23/trunk/csvutils/src/main/java/org/
    incubator/any23/trunk/csvutils/src/main/java/org/apache/
    incubator/any23/trunk/csvutils/src/main/java/org/apache/any23/
    incubator/any23/trunk/csvutils/src/main/java/org/apache/any23/extractor/
    incubator/any23/trunk/csvutils/src/main/java/org/apache/any23/extractor/csv/
    incubator/any23/trunk/csvutils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java
    incubator/any23/trunk/mime/
    incubator/any23/trunk/mime/pom.xml
    incubator/any23/trunk/mime/src/
    incubator/any23/trunk/mime/src/main/
    incubator/any23/trunk/mime/src/main/java/
    incubator/any23/trunk/mime/src/main/java/org/
    incubator/any23/trunk/mime/src/main/java/org/apache/
    incubator/any23/trunk/mime/src/main/java/org/apache/any23/
    incubator/any23/trunk/mime/src/main/java/org/apache/any23/mime/
    incubator/any23/trunk/mime/src/main/java/org/apache/any23/mime/NaiveMIMETypeDetector.java
    incubator/any23/trunk/mime/src/main/java/org/apache/any23/mime/TikaMIMETypeDetector.java
    incubator/any23/trunk/mime/src/main/java/org/apache/any23/mime/package-info.java
    incubator/any23/trunk/mime/src/main/java/org/apache/any23/mime/purifier/
    incubator/any23/trunk/mime/src/main/java/org/apache/any23/mime/purifier/WhiteSpacesPurifier.java
    incubator/any23/trunk/mime/src/main/resources/
    incubator/any23/trunk/mime/src/main/resources/org/
    incubator/any23/trunk/mime/src/main/resources/org/apache/
    incubator/any23/trunk/mime/src/main/resources/org/apache/any23/
    incubator/any23/trunk/mime/src/main/resources/org/apache/any23/mime/
    incubator/any23/trunk/mime/src/main/resources/org/apache/any23/mime/mimetypes.xml
    incubator/any23/trunk/mime/src/main/resources/org/apache/any23/mime/tika-config.xml
    incubator/any23/trunk/mime/src/test/
    incubator/any23/trunk/mime/src/test/java/
    incubator/any23/trunk/mime/src/test/java/org/
    incubator/any23/trunk/mime/src/test/java/org/apache/
    incubator/any23/trunk/mime/src/test/java/org/apache/any23/
    incubator/any23/trunk/mime/src/test/java/org/apache/any23/mime/
    incubator/any23/trunk/mime/src/test/java/org/apache/any23/mime/TikaMIMETypeDetectorTest.java
    incubator/any23/trunk/mime/src/test/java/org/apache/any23/mime/purifier/
    incubator/any23/trunk/mime/src/test/java/org/apache/any23/mime/purifier/WhiteSpacesPurifierTest.java
Removed:
    incubator/any23/trunk/core/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java
    incubator/any23/trunk/core/src/main/java/org/apache/any23/mime/
    incubator/any23/trunk/core/src/main/resources/default-configuration.properties
    incubator/any23/trunk/core/src/main/resources/org/apache/any23/mime/
    incubator/any23/trunk/core/src/test/java/org/apache/any23/mime/

Added: incubator/any23/trunk/api/src/main/resources/default-configuration.properties
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/api/src/main/resources/default-configuration.properties?rev=1380397&view=auto
==============================================================================
--- incubator/any23/trunk/api/src/main/resources/default-configuration.properties (added)
+++ incubator/any23/trunk/api/src/main/resources/default-configuration.properties Mon Sep  3 23:11:15 2012
@@ -0,0 +1,74 @@
+#
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+# Any23 Core Version
+any23.core.version=${project.version} (${implementation.build.tstamp})
+
+# HTTP Client Configuration.
+# ---- Default HTTP User Agent if not specified.
+any23.http.user.agent.default=Any23-CLI
+# ---- HTTP client timeout in milliseconds.
+any23.http.client.timeout=10000
+# ---- HTTP client max number of connections.
+any23.http.client.max.connections=5
+
+# RDFa Extractor
+any23.rdfa.extractor.xslt=rdfa.xslt
+
+# Allows to enable(on)/disable(off) addition
+# of timestamp and size metadata triples.
+any23.extraction.metadata.timesize=off
+
+# Allows to enable(on)/disable(off) addition
+# of nesting metadata triples.
+any23.extraction.metadata.nesting=on
+
+# Allows to enable(on)/disable(off)
+# the domain triple for every Microformat entity.
+any23.extraction.metadata.domain.per.entity=off
+
+# Allows to decide which RDFa Extractor to enable.
+# If 'on', the programmatic RDFa 1.1 Extractor
+# (org.deri.any23.extractor.rdfa.RDFa11Extractor) is activated; otherwise
+# the legacy RDFa 1.0 one (org.deri.any23.extractor.rdfa.RDFaExtractor) is registered.
+any23.extraction.rdfa.programmatic=on
+
+# The extraction context URI to be used by the
+# SingleDocumentExtraction. If == '?' the document URI will
+# be used. It can be overridden by specifying a different
+# value in ExtractionParameters.
+any23.extraction.context.uri=?
+
+# Any23 Core Plugin Dirs
+any23.plugin.dirs=./plugins
+
+# Microdata Configuration.
+# ---- Enables(on)/disables(off) strict Microdata extraction.
+#      If turned off, it uses the any23.microdata.ns.default
+#      to build URIs for anonymous properties
+any23.microdata.strict=off
+# ---- Microdata default namespace.
+any23.microdata.ns.default=http://rdf.data-vocabulary.org/
+
+# Allows to enable(on)/disable(off) the registration
+# of HTMLMetaExtractor.java to
+# ExtractionRegistry.java
+any23.extraction.head.meta=off
+
+# Allows to specify the CSV field separator and comment delimiter
+any23.extraction.csv.field=,
+any23.extraction.csv.comment=#
\ No newline at end of file

Added: incubator/any23/trunk/csvutils/pom.xml
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/csvutils/pom.xml?rev=1380397&view=auto
==============================================================================
--- incubator/any23/trunk/csvutils/pom.xml (added)
+++ incubator/any23/trunk/csvutils/pom.xml Mon Sep  3 23:11:15 2012
@@ -0,0 +1,24 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+    <artifactId>apache-any23</artifactId>
+    <groupId>org.apache.any23</groupId>
+    <version>0.7.1-incubating-SNAPSHOT</version>
+    <relativePath>..</relativePath>
+  </parent>
+  <artifactId>apache-any23-csvutils</artifactId>
+  <name>Apache Any23 :: CSV Utilities</name>
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>apache-any23-api</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.commons</groupId>
+      <artifactId>commons-csv</artifactId>
+    </dependency>
+  </dependencies>
+</project>
+

Added: incubator/any23/trunk/csvutils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/csvutils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java?rev=1380397&view=auto
==============================================================================
--- incubator/any23/trunk/csvutils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java (added)
+++ incubator/any23/trunk/csvutils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java Mon Sep  3 23:11:15 2012
@@ -0,0 +1,166 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.extractor.csv;
+
+import org.apache.any23.configuration.DefaultConfiguration;
+import org.apache.commons.csv.CSVParser;
+import org.apache.commons.csv.CSVStrategy;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+
+/**
+ * Builds {@link org.apache.commons.csv.CSVParser} instances, first guessing the configuration
+ * from the file itself and then, if not successful, from the
+ * {@link org.apache.any23.configuration.DefaultConfiguration}.
+ *
+ * @author Davide Palmisano ( dpalmisano@gmail.com )
+ * @author Michele Mostarda ( michele.mostarda@gmail.com )
+ */
+public class CSVReaderBuilder {
+
+    private static final String DEFAULT_FIELD_DELIMITER = ",";
+
+    private static final String DEFAULT_COMMENT_DELIMITER = "#";
+
+    /** Comment character given to the candidate strategies; despite the name it is a blank. */
+    public static final char NULL_CHAR = ' ';
+
+    /** Field delimiters tried, in this order, after the library default strategy. */
+    private static final char[] popularDelimiters = {'\t', '|', ',', ';'};
+
+    private static final DefaultConfiguration defaultConfiguration =
+            DefaultConfiguration.singleton();
+
+    /** Candidate strategies: the library default followed by one per popular delimiter. */
+    private static final CSVStrategy[] strategies;
+
+    static {
+        strategies = new CSVStrategy[ popularDelimiters.length + 1 ];
+        strategies[0] = CSVStrategy.DEFAULT_STRATEGY;
+        int index = 1;
+        for(char dlmt : popularDelimiters) {
+            strategies[index++] = getCsvStrategy(dlmt, NULL_CHAR);
+        }
+    }
+
+    /**
+     * Builds a not <code>null</code> {@link org.apache.commons.csv.CSVParser} guessing
+     * the strategy from the provided <i>CSV</i> file. When no candidate strategy fits,
+     * the strategy described by the configuration is used instead.
+     *
+     * @param is {@link InputStream} of the <i>CSV</i> file from which to guess the configuration;
+     *           it is marked and reset while the candidate strategies are probed.
+     * @return a {@link CSVParser} reading from <code>is</code>.
+     * @throws java.io.IOException if an error occurs while probing the stream.
+     */
+    public static CSVParser build(InputStream is) throws IOException {
+        CSVStrategy bestStrategy = getBestStrategy(is);
+        if(bestStrategy == null) bestStrategy = getCSVStrategyFromConfiguration();
+        return new CSVParser( new InputStreamReader(is), bestStrategy );
+    }
+
+    /**
+     * Checks whether the given input stream is a CSV or not.
+     *
+     * @param is input stream to be verified; it is marked and reset while being probed.
+     * @return <code>true</code> if the given <code>is</code> input stream contains a <i>CSV</i> content.
+     *         <code>false</code> otherwise.
+     * @throws IOException if an error occurs while probing the stream.
+     */
+    public static boolean isCSV(InputStream is) throws IOException {
+        return getBestStrategy(is) != null;
+    }
+
+    /**
+     * Returns the first candidate strategy accepted by {@link #testStrategy(InputStream, CSVStrategy)},
+     * or <code>null</code> if none is accepted.
+     */
+    private static CSVStrategy getBestStrategy(InputStream is) throws IOException {
+        for( CSVStrategy strategy : strategies ) {
+            if( testStrategy(is, strategy) ) {
+                return strategy;
+            }
+        }
+        return null;
+    }
+
+    /** Creates a strategy with the given delimiter and comment chars and a single quote as encapsulator. */
+    private static CSVStrategy getCsvStrategy(char delimiter, char comment) {
+        return new CSVStrategy(delimiter, '\'', comment);
+    }
+
+    /**
+     * Creates the strategy described by the <code>any23.extraction.csv.field</code> and
+     * <code>any23.extraction.csv.comment</code> configuration properties.
+     */
+    private static CSVStrategy getCSVStrategyFromConfiguration() {
+        char fieldDelimiter = getCharValueFromConfiguration(
+                "any23.extraction.csv.field",
+                DEFAULT_FIELD_DELIMITER
+        );
+        char commentDelimiter = getCharValueFromConfiguration(
+                "any23.extraction.csv.comment",
+                DEFAULT_COMMENT_DELIMITER
+        );
+        return new CSVStrategy(fieldDelimiter, '\'', commentDelimiter);
+    }
+
+    /**
+     * Reads a configuration property that must hold exactly one character.
+     *
+     * @param property name of the property to read.
+     * @param defaultValue value passed to the configuration as fallback.
+     * @return the single character held by the property.
+     * @throws RuntimeException if the value is not exactly one character long.
+     */
+    private static char getCharValueFromConfiguration(String property, String defaultValue) {
+        String delimiter = defaultConfiguration.getProperty(
+                property,
+                defaultValue
+        );
+        // A length of exactly one already rules out the empty string.
+        if (delimiter.length() != 1) {
+            throw new RuntimeException(property + " value must be a single character");
+        }
+        return delimiter.charAt(0);
+    }
+
+    /**
+     * Makes sure the reader has correct delimiter and quotation set.
+     * Checks at most the first 5 rows: every row must have at least 2 columns, and every row
+     * after the first must have as many columns as the first row or exactly one more.
+     * The stream is marked before reading and reset afterwards.
+     *
+     * @param is input stream to be checked.
+     * @param strategy strategy to be verified.
+     * @return <code>true</code> if all the checked rows satisfy the constraints above,
+     *         <code>false</code> otherwise.
+     * @throws IOException if an error occurs while reading or resetting the stream.
+     */
+    private static boolean testStrategy(InputStream is, CSVStrategy strategy) throws IOException {
+        final int MIN_COLUMNS = 2;
+
+        is.mark(Integer.MAX_VALUE);
+        try {
+            final CSVParser parser = new CSVParser(new InputStreamReader(is), strategy);
+            int linesToCheck = 5;
+            int headerColumnCount = -1;
+            while (linesToCheck > 0) {
+                String[] row;
+                row = parser.getLine();
+                if (row == null) {
+                    break;
+                }
+                if (row.length < MIN_COLUMNS) {
+                    return false;
+                }
+                if (headerColumnCount == -1) { // first row
+                    headerColumnCount = row.length;
+                } else { // make sure rows have the same number of columns or one more than the header
+                    if (row.length < headerColumnCount) {
+                        return false;
+                    } else if (row.length - 1 > headerColumnCount) {
+                        return false;
+                    }
+                }
+                linesToCheck--;
+            }
+            // NOTE(review): a stream yielding no row at all is accepted as well - confirm this is intended.
+            return true;
+        } finally {
+            is.reset();
+        }
+    }
+
+
+}

Added: incubator/any23/trunk/mime/pom.xml
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/mime/pom.xml?rev=1380397&view=auto
==============================================================================
--- incubator/any23/trunk/mime/pom.xml (added)
+++ incubator/any23/trunk/mime/pom.xml Mon Sep  3 23:11:15 2012
@@ -0,0 +1,69 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+    <artifactId>apache-any23</artifactId>
+    <groupId>org.apache.any23</groupId>
+    <version>0.7.1-incubating-SNAPSHOT</version>
+    <relativePath>..</relativePath>
+  </parent>
+  <artifactId>apache-any23-mime</artifactId>
+  <name>Apache Any23 :: Mime Type Detection</name>
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>apache-any23-api</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>apache-any23-csvutils</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>apache-any23-test-resources</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
+      <type>test-jar</type>
+    </dependency>
+    <dependency>
+      <groupId>junit</groupId>
+      <artifactId>junit</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>commons-io</groupId>
+      <artifactId>commons-io</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.tika</groupId>
+      <artifactId>tika-core</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.tika</groupId>
+      <artifactId>tika-parsers</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.openrdf.sesame</groupId>
+      <artifactId>sesame-rio-turtle</artifactId>
+      <scope>compile</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.openrdf.sesame</groupId>
+      <artifactId>sesame-rio-ntriples</artifactId>
+      <scope>runtime</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.openrdf.sesame</groupId>
+      <artifactId>sesame-rio-n3</artifactId>
+      <scope>runtime</scope>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>any23-nquads</artifactId>
+      <version>${project.version}</version>
+      <scope>runtime</scope>
+    </dependency>
+  </dependencies>
+</project>

Added: incubator/any23/trunk/mime/src/main/java/org/apache/any23/mime/NaiveMIMETypeDetector.java
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/mime/src/main/java/org/apache/any23/mime/NaiveMIMETypeDetector.java?rev=1380397&view=auto
==============================================================================
--- incubator/any23/trunk/mime/src/main/java/org/apache/any23/mime/NaiveMIMETypeDetector.java (added)
+++ incubator/any23/trunk/mime/src/main/java/org/apache/any23/mime/NaiveMIMETypeDetector.java Mon Sep  3 23:11:15 2012
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.mime;
+
+import org.openrdf.rio.RDFFormat;
+import org.openrdf.rio.Rio;
+
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Basic implementation of {@link MIMETypeDetector} based
+ * on file extensions.
+ */
+public class NaiveMIMETypeDetector implements MIMETypeDetector {
+
+    /** Maps a file extension (without the dot) to its MIME type. */
+    private final static Map<String, String> extensions = new HashMap<String, String>();
+
+    static {
+        // extension -> mime type
+        extensions.put("html" , "text/html"            );
+        extensions.put("htm"  , "text/html"            );
+        extensions.put("xhtml", "application/xhtml+xml");
+        extensions.put("xht"  , "application/xhtml+xml");
+        extensions.put("xrdf" , "application/rdf+xml"  );
+        extensions.put("rdfx" , "application/rdf+xml"  );
+        extensions.put("owl"  , "application/rdf+xml"  );
+        extensions.put("txt"  , "text/plain"           );
+    }
+
+    private final static Pattern extensionRegex = Pattern.compile(".*\\.([a-z0-9]+)");
+
+    /**
+     * Guesses the MIME type, in order of precedence, from the declared metadata type,
+     * from the RDF parser format associated with the file name and from the extension table.
+     * A file name with no recognized extension pattern is handled as <code>html</code>.
+     *
+     * @param fileName name of the data source.
+     * @param input not read by this implementation.
+     * @param mimeTypeFromMetadata mimetype declared in metadata; returned as is when not <code>null</code>.
+     * @return the guessed MIME type, or <code>null</code> if the extension is not in the table.
+     */
+    public MIMEType guessMIMEType(
+            String fileName,
+            InputStream input,
+
+            MIMEType mimeTypeFromMetadata
+    ) {
+        if (mimeTypeFromMetadata != null) {
+            return mimeTypeFromMetadata;
+        }
+
+        final RDFFormat rdfFormat = Rio.getParserFormatForFileName(fileName);
+        if (rdfFormat != null) {
+            return MIMEType.parse(rdfFormat.getDefaultMIMEType());
+        }
+
+        final String detectedExtension = getExtension(fileName);
+        // Assume index file on web server when there is no extension.
+        final String lookupKey = detectedExtension == null ? "html" : detectedExtension;
+        final String mimeType = extensions.get(lookupKey);
+        return mimeType == null ? null : MIMEType.parse(mimeType);
+    }
+
+    /**
+     * Returns the text following the last dot of the file name when it is made only of
+     * lower-case letters and digits, <code>null</code> otherwise.
+     */
+    private String getExtension(String filename) {
+        final Matcher matcher = extensionRegex.matcher(filename);
+        return matcher.matches() ? matcher.group(1) : null;
+    }
+
+}

Added: incubator/any23/trunk/mime/src/main/java/org/apache/any23/mime/TikaMIMETypeDetector.java
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/mime/src/main/java/org/apache/any23/mime/TikaMIMETypeDetector.java?rev=1380397&view=auto
==============================================================================
--- incubator/any23/trunk/mime/src/main/java/org/apache/any23/mime/TikaMIMETypeDetector.java (added)
+++ incubator/any23/trunk/mime/src/main/java/org/apache/any23/mime/TikaMIMETypeDetector.java Mon Sep  3 23:11:15 2012
@@ -0,0 +1,366 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.mime;
+
+import org.apache.any23.extractor.csv.CSVReaderBuilder;
+import org.apache.any23.mime.purifier.Purifier;
+import org.apache.any23.mime.purifier.WhiteSpacesPurifier;
+import org.apache.tika.Tika;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MimeType;
+import org.apache.tika.mime.MimeTypeException;
+import org.apache.tika.mime.MimeTypes;
+import org.openrdf.rio.RDFFormat;
+import org.openrdf.rio.RDFParser;
+import org.openrdf.rio.Rio;
+
+import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.regex.Pattern;
+
+/**
+ * Implementation of {@link MIMETypeDetector} based on
+ * <a href="http://tika.apache.org/">Apache Tika</a>.
+ *
+ * @author Michele Mostarda (michele.mostarda@gmail.com)
+ * @author Davide Palmisano (dpalmisano@gmail.com)
+ */
+public class TikaMIMETypeDetector implements MIMETypeDetector {
+
+    /** Applied to every non-null input stream before the detection takes place. */
+    private Purifier purifier;
+
+    /** MIME type reported when the content is recognized as <i>CSV</i>. */
+    public static final String CSV_MIMETYPE = "text/csv";
+
+    /** Classpath location of the Tika configuration file. */
+    public static final String RESOURCE_NAME = "/org/apache/any23/mime/tika-config.xml";
+
+    /**
+     * N3 patterns. Each one is anchored at the beginning of the sample it is applied to.
+     */
+    private static final Pattern[] N3_PATTERNS = {
+            Pattern.compile("^\\S+\\s*<\\S+>\\s*<\\S+>\\s*\\."             ), // * URI URI .
+            Pattern.compile("^\\S+\\s*<\\S+>\\s*_:\\S+\\s*\\."             ), // * URI BNODE .
+            Pattern.compile("^\\S+\\s*<\\S+>\\s*\".*\"(@\\S+)?\\s*\\."     ), // * URI LLITERAL .
+            Pattern.compile("^\\S+\\s*<\\S+>\\s*\".*\"(\\^\\^\\S+)?\\s*\\.")  // * URI TLITERAL .
+    };
+
+    /**
+     * N-Quads patterns. Same as the N3 ones with an additional trailing URI before the dot.
+     */
+    private static final Pattern[] NQUADS_PATTERNS = {
+            Pattern.compile("^\\S+\\s*<\\S+>\\s*<\\S+>\\s*\\<\\S+>\\s*\\."             ), // * URI URI      URI .
+            Pattern.compile("^\\S+\\s*<\\S+>\\s*_:\\S+\\s*\\<\\S+>\\s*\\."             ), // * URI BNODE    URI .
+            Pattern.compile("^\\S+\\s*<\\S+>\\s*\".*\"(@\\S+)?\\s*\\<\\S+>\\s*\\."     ), // * URI LLITERAL URI .
+            Pattern.compile("^\\S+\\s*<\\S+>\\s*\".*\"(\\^\\^\\S+)?\\s*\\<\\S+>\\s*\\.")  // * URI TLITERAL URI .
+    };
+
+    // The three fields below are shared by all instances and are initialized
+    // by the constructor only while they are still null.
+    private static TikaConfig config = null;
+
+    private static Tika tika;
+
+    private static MimeTypes types;
+
+    /**
+     * Tells whether a sample of the stream matches one of the <i>N3</i> triple patterns.
+     *
+     * @param is input stream to be verified.
+     * @return <code>true</code> if <i>N3</i> patterns are detected, <code>false</code> otherwise.
+     * @throws IOException if an error occurs while sampling the stream.
+     */
+    public static boolean checkN3Format(InputStream is) throws IOException {
+        final boolean n3Detected = findPattern(N3_PATTERNS, '.', is);
+        return n3Detected;
+    }
+
+    /**
+     * Tells whether a sample of the stream matches one of the <i>NQuads</i> patterns.
+     *
+     * @param is input stream to be verified.
+     * @return <code>true</code> if <i>NQuads</i> patterns are detected, <code>false</code> otherwise.
+     * @throws IOException if an error occurs while sampling the stream.
+     */
+    public static boolean checkNQuadsFormat(InputStream is) throws IOException {
+        final boolean nquadsDetected = findPattern(NQUADS_PATTERNS, '.', is);
+        return nquadsDetected;
+    }
+
+    /**
+     * Tells whether a sample of the stream can be parsed as <i>Turtle</i>.
+     * The sample ends at the first '.' found outside a block and is handed to a
+     * verifying Turtle parser that stops at the first error.
+     *
+     * @param is input stream to be verified.
+     * @return <code>true</code> if the sample is parsed without errors, <code>false</code> otherwise.
+     * @throws IOException if an error occurs while sampling the stream.
+     */
+    public static boolean checkTurtleFormat(InputStream is) throws IOException {
+        final String turtleSample = extractDataSample(is, '.');
+        final RDFParser parser = Rio.createParser(RDFFormat.TURTLE);
+        parser.setDatatypeHandling(RDFParser.DatatypeHandling.VERIFY);
+        parser.setStopAtFirstError(true);
+        parser.setVerifyData(true);
+        final ByteArrayInputStream sampleStream = new ByteArrayInputStream( turtleSample.getBytes() );
+        boolean parsable;
+        try {
+            parser.parse(sampleStream, "");
+            parsable = true;
+        } catch (Exception e) {
+            // Whatever goes wrong while parsing, the sample is not considered Turtle.
+            parsable = false;
+        }
+        return parsable;
+    }
+
+    /**
+     * Tells whether the stream holds <i>CSV</i> content, delegating the
+     * decision to {@link CSVReaderBuilder#isCSV(InputStream)}.
+     *
+     * @param is input stream to be verified.
+     * @return <code>true</code> if <i>CSV</i> patterns are detected, <code>false</code> otherwise.
+     * @throws IOException if an error occurs while reading the stream.
+     */
+    public static boolean checkCSVFormat(InputStream is) throws IOException {
+        final boolean csvDetected = CSVReaderBuilder.isCSV(is);
+        return csvDetected;
+    }
+
+    /**
+     * Tries to apply one of the given patterns on a sample of the input stream.
+     *
+     * @param patterns the patterns to apply.
+     * @param delimiterChar the delimiter of the sample.
+     * @param is the input stream to sample.
+     * @return <code>true</code> if a pattern has been applied, <code>false</code> otherwise.
+     * @throws IOException
+     */
+    private static boolean findPattern(Pattern[] patterns, char delimiterChar, InputStream is)
+    throws IOException {
+        String sample = extractDataSample(is, delimiterChar);
+        for(Pattern pattern : patterns) {
+            if(pattern.matcher(sample).find()) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Extracts a data sample from the input stream. Characters are collected until the first
+     * <i>breakChar</i> met outside a <code>&lt;...&gt;</code> or <code>"..."</code> block
+     * (the break char itself is part of the sample), until the end of the stream, or until
+     * <code>MAX_SIZE</code> characters have been collected, whichever comes first.
+     * <p/>
+     * The stream is always reset before returning. This method does not mark the stream itself,
+     * so the reset goes back to the mark previously set on <code>is</code>.
+     *
+     * @param is the input stream to sample.
+     * @param breakChar the char to break to sample.
+     * @return the sample string.
+     * @throws IOException if an error occurs during sampling or while resetting the stream.
+     */
+    private static String extractDataSample(InputStream is, char breakChar) throws IOException {
+        // The reader is local and discarded on return, hence it is neither marked nor reset:
+        // resetting it could fail with "Mark invalid" once its read-ahead limit was reached,
+        // turning a perfectly valid sampling into an IOException.
+        final BufferedReader br = new BufferedReader(new InputStreamReader(is));
+        final StringBuilder sb = new StringBuilder();
+        final int MAX_SIZE = 1024 * 2;
+        int c;
+        boolean insideBlock = false;
+        int read = 0;
+        try {
+            while ((c = br.read()) != -1) {
+                read++;
+                if (read > MAX_SIZE) {
+                    break;
+                }
+                // Track whether the current char lies inside a <...> or "..." block,
+                // where the break char must not terminate the sample.
+                if ('<' == c) {
+                    insideBlock = true;
+                } else if ('>' == c) {
+                    insideBlock = false;
+                } else if ('"' == c) {
+                    insideBlock = !insideBlock;
+                }
+                sb.append((char) c);
+                if (!insideBlock && breakChar == c) {
+                    break;
+                }
+            }
+        } finally {
+            is.reset();
+        }
+        return sb.toString();
+    }
+
+    public TikaMIMETypeDetector(Purifier purifier) {
+        this.purifier = purifier;
+        InputStream is = getResourceAsStream();
+        if (config == null) {
+            try {
+                config = new TikaConfig(is);
+            } catch (Exception e) {
+                throw new RuntimeException("Error while loading Tika configuration.", e);
+            }
+        }
+
+        if (types == null) {
+            types = config.getMimeRepository();
+        }
+
+        if(tika == null) {
+            tika = new Tika(config);
+        }
+    }
+
+    /**
+     * Creates a detector using a {@link WhiteSpacesPurifier} as purifier.
+     */
+    public TikaMIMETypeDetector() {
+        this( new WhiteSpacesPurifier() );
+    }
+
+    /**
+     * Estimates the <code>MIME</code> type of the content of input file.
+     * The <i>input</i> stream must be resettable.
+     *
+     * @param fileName name of the data source.
+     * @param input <code>null</code> or a <b>resettable</i> input stream containing data.
+     * @param mimeTypeFromMetadata mimetype declared in metadata.
+     * @return the supposed mime type or <code>null</code> if nothing appropriate found.
+     * @throws IllegalArgumentException if <i>input</i> is not <code>null</code> and is not resettable.
+     */
+    public MIMEType guessMIMEType(
+            String fileName,
+            InputStream input,
+            MIMEType mimeTypeFromMetadata
+    ) {
+        if(input != null) {
+            try {
+                this.purifier.purify(input);
+            } catch (IOException e) {
+                throw new RuntimeException("Error while purifying the provided input", e);
+            }
+        }
+
+        final Metadata meta = new Metadata();
+        if (mimeTypeFromMetadata != null)
+            meta.set(Metadata.CONTENT_TYPE, mimeTypeFromMetadata.getFullType());
+        if (fileName != null)
+            meta.set(Metadata.RESOURCE_NAME_KEY, fileName);
+
+        String type;
+        try {
+            final String mt = guessMimeTypeByInputAndMeta(input, meta);
+            if( ! MimeTypes.OCTET_STREAM.equals(mt) ) {
+                type = mt;
+            } else {
+                if( checkN3Format(input) ) {
+                    type = RDFFormat.N3.getDefaultMIMEType();
+                } else if( checkNQuadsFormat(input) ) {
+                    type = RDFFormat.NQUADS.getDefaultMIMEType();
+                } else if( checkTurtleFormat(input) ) {
+                    type = RDFFormat.TURTLE.getDefaultMIMEType();
+                } else if( checkCSVFormat(input) ) {
+                    type = CSV_MIMETYPE;
+                }
+                else {
+                    type = MimeTypes.OCTET_STREAM; 
+                }
+            }
+        } catch (IOException ioe) {
+            throw new RuntimeException("Error while retrieving mime type.", ioe);
+        }
+        return MIMEType.parse(type);
+    }
+
+     /**
+      * Loads the <code>Tika</code> configuration file.
+      *
+      * @return the input stream containing the configuration.
+      */
+     private InputStream getResourceAsStream() {
+         InputStream result;
+         result = TikaMIMETypeDetector.class.getResourceAsStream(RESOURCE_NAME);
+         if (result == null) {
+             result = TikaMIMETypeDetector.class.getClassLoader().getResourceAsStream(RESOURCE_NAME);
+             if (result == null) {
+                 result = ClassLoader.getSystemResourceAsStream(RESOURCE_NAME);
+             }
+         }
+         return result;
+     }
+
+    /**
+     * Guesses the MIME type of a document from three sources, consulted in
+     * this order: the stream content (via <code>tika.detect</code>), the
+     * <code>Content-Type</code> hint and the resource name hint.
+     * <p>
+     * A content based result is used only if it is not a generic type
+     * (octet stream, plain text or XML). A <code>Content-Type</code> hint that
+     * resolves to a plain type (octet stream or plain text) is kept only as a
+     * fallback, used when the resource name gives no match.
+     * <p>
+     * The given stream is expected to support marks.
+     *
+     * @param stream   document stream, can be <code>null</code>.
+     * @param metadata metadata hints.
+     * @return MIME type of the document, {@link MimeTypes#OCTET_STREAM} if nothing matches.
+     * @throws IOException if the document stream could not be read.
+     */
+    private String guessMimeTypeByInputAndMeta(InputStream stream, final Metadata metadata)
+    throws IOException {
+        // First choice: what the content itself reveals, unless it is too generic.
+        if (stream != null) {
+            final String detected = tika.detect(stream);
+            if (detected != null && !isGenericMIMEType(detected)) {
+                return detected;
+            }
+        }
+
+        // Second choice: the declared Content-Type, when it is more specific than plain.
+        String plainFallback = null;
+        final String declaredContentType = metadata.get(Metadata.CONTENT_TYPE);
+        if (declaredContentType != null) {
+            try {
+                final MimeType declared = types.forName(declaredContentType);
+                if (declared != null) {
+                    final String declaredName = declared.getName();
+                    if (isPlainMIMEType(declaredName)) {
+                        plainFallback = declaredName;
+                    } else {
+                        return declaredName;
+                    }
+                }
+            } catch (MimeTypeException ignored) {
+                // Malformed Content-Type value: ignored on purpose, other hints are tried next.
+            }
+        }
+
+        // Third choice: the type associated with the resource name.
+        final String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
+        if (name != null) {
+            final MimeType byName = types.getMimeType(name);
+            if (byName != null) {
+                return byName.getName();
+            }
+        }
+
+        // Nothing specific found: plain Content-Type hint if any, default type otherwise.
+        return plainFallback != null ? plainFallback : MimeTypes.OCTET_STREAM;
+    }
+
+    /**
+     * Tells whether the given MIME type name is one of the plain types.
+     *
+     * @param type MIME type name to test, must not be <code>null</code>.
+     * @return <code>true</code> if <i>type</i> equals {@link MimeTypes#OCTET_STREAM}
+     *         or {@link MimeTypes#PLAIN_TEXT}, <code>false</code> otherwise.
+     */
+    private boolean isPlainMIMEType(String type) {
+        if (type.equals(MimeTypes.OCTET_STREAM)) {
+            return true;
+        }
+        return type.equals(MimeTypes.PLAIN_TEXT);
+    }
+
+    /**
+     * Tells whether the given MIME type name is a generic type, that is a
+     * plain type (see {@link #isPlainMIMEType(String)}) or {@link MimeTypes#XML}.
+     *
+     * @param type MIME type name to test, must not be <code>null</code>.
+     * @return <code>true</code> if <i>type</i> is generic, <code>false</code> otherwise.
+     */
+    private boolean isGenericMIMEType(String type) {
+        if (isPlainMIMEType(type)) {
+            return true;
+        }
+        return type.equals(MimeTypes.XML);
+    }
+
+}
+

Added: incubator/any23/trunk/mime/src/main/java/org/apache/any23/mime/package-info.java
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/mime/src/main/java/org/apache/any23/mime/package-info.java?rev=1380397&view=auto
==============================================================================
--- incubator/any23/trunk/mime/src/main/java/org/apache/any23/mime/package-info.java (added)
+++ incubator/any23/trunk/mime/src/main/java/org/apache/any23/mime/package-info.java Mon Sep  3 23:11:15 2012
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * This package provides support for <code>MIME Type</code> data detection.
+ */
+package org.apache.any23.mime;
\ No newline at end of file

Added: incubator/any23/trunk/mime/src/main/java/org/apache/any23/mime/purifier/WhiteSpacesPurifier.java
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/mime/src/main/java/org/apache/any23/mime/purifier/WhiteSpacesPurifier.java?rev=1380397&view=auto
==============================================================================
--- incubator/any23/trunk/mime/src/main/java/org/apache/any23/mime/purifier/WhiteSpacesPurifier.java (added)
+++ incubator/any23/trunk/mime/src/main/java/org/apache/any23/mime/purifier/WhiteSpacesPurifier.java Mon Sep  3 23:11:15 2012
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.mime.purifier;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * {@link Purifier} implementation that skips the blank characters found at the
+ * beginning of a stream, which might otherwise prevent its <i>MIME Type</i> detection.
+ *
+ * @author Davide Palmisano ( dpalmisano@gmail.com )
+ */
+public class WhiteSpacesPurifier implements Purifier {
+
+    /**
+     * Advances <i>inputStream</i> past its leading blank characters.
+     *
+     * @param inputStream stream to purify, it must support marks.
+     * @throws IllegalArgumentException if <i>inputStream</i> does not support marks.
+     */
+    public void purify(InputStream inputStream) throws IOException {
+        if (!inputStream.markSupported()) {
+            throw new IllegalArgumentException("Provided InputStream does not support marks");
+        }
+        int current;
+        do {
+            // Mark before every read, so that the byte ending the loop can be given back.
+            inputStream.mark(Integer.MAX_VALUE);
+            current = inputStream.read();
+        } while (current != -1 && isBlank((char) current));
+        // Go back to the last mark: right before the first non blank byte, or at end of stream.
+        inputStream.reset();
+    }
+
+    // Blank characters are: tab, line feed, space, carriage return, backspace and form feed.
+    private boolean isBlank(char c) {
+        return "\t\n \r\b\f".indexOf(c) >= 0;
+    }
+}

Added: incubator/any23/trunk/mime/src/main/resources/org/apache/any23/mime/mimetypes.xml
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/mime/src/main/resources/org/apache/any23/mime/mimetypes.xml?rev=1380397&view=auto
==============================================================================
--- incubator/any23/trunk/mime/src/main/resources/org/apache/any23/mime/mimetypes.xml (added)
+++ incubator/any23/trunk/mime/src/main/resources/org/apache/any23/mime/mimetypes.xml Mon Sep  3 23:11:15 2012
@@ -0,0 +1,853 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<!--
+  Description: This xml file defines the valid mime types used
+  by Tika. The mime types within this file are based on the types in the
+  mime-types.xml file available in Apache Nutch.
+-->
+<mime-info>
+
+    <!-- DEFAULT -->
+    <mime-type type="application/octet-stream">
+        <magic priority="50">
+            <match value="\037\036" type="string" offset="0"/>
+            <match value="017437" type="host16" offset="0"/>
+            <match value="0x1fff" type="host16" offset="0"/>
+            <match value="\377\037" type="string" offset="0"/>
+            <match value="0145405" type="host16" offset="0"/>
+        </magic>
+        <glob pattern="*.bin"/>
+    </mime-type>
+
+    <mime-type type="text/plain">
+        <magic priority="50">
+            <match value="This is TeX," type="string" offset="0"/>
+            <match value="This is METAFONT," type="string" offset="0"/>
+        </magic>
+        <glob pattern="*.txt"/>
+        <glob pattern="*.asc"/>
+        <glob pattern="*.nt"/>
+        <glob pattern="INSTALL"/>
+        <glob pattern="KEYS"/>
+        <glob pattern="Makefile"/>
+        <glob pattern="README"/>
+        <glob pattern="abs-linkmap"/>
+        <glob pattern="abs-menulinks"/>
+    </mime-type>
+
+    <mime-type type="application/xml">
+        <alias type="text/xml"/>
+        <magic priority="50">
+            <match value="&lt;?xml" type="string" offset="0"/>
+            <match value="&lt;?XML" type="string" offset="0"/>
+            <match value="&lt;!--" type="string" offset="0"/>
+            <match value="0xFFFE3C003F0078006D006C00" type="string" offset="0"/>
+            <match value="0xFEFF003C003F0078006D006C" type="string" offset="0"/>
+        </magic>
+        <glob pattern="*.xml"/>
+        <glob pattern="*.xsl"/>
+        <glob pattern="*.xsd"/>
+        <sub-class-of type="text/plain"/>
+    </mime-type>
+
+     <mime-type type="text/csv">
+        <_comment>Comma separated Value</_comment>
+        <glob pattern="*.csv"/>
+         <sub-class-of type="text/plain"/>
+    </mime-type>
+
+    <!-- BEGIN: Semantic Web document mime types. -->
+
+    <!-- N3 -->
+    <mime-type type="text/rdf+n3">
+        <alias type="text/n3"/>
+        <alias type="application/n3"/>
+        <glob pattern="*.n3"/>
+        <magic priority="50">
+            <match value="@prefix" type="string" offset="0:64"/>
+        </magic>
+    </mime-type>
+
+    <!-- NQuads -->
+    <mime-type type="text/x-nquads">
+        <alias type="text/rdf+nq"/>
+        <alias type="text/nq"/>
+        <alias type="application/nq"/>
+        <glob pattern="*.nq"/>
+    </mime-type>
+
+    <!-- Turtle -->
+    <mime-type type="text/turtle"> 
+        <alias type="application/x-turtle"/>
+        <alias type="application/turtle"/>
+        <glob pattern="*.ttl"/>
+    </mime-type>
+
+    <!-- RDFXML -->
+    <mime-type type="application/rdf+xml">
+        <sub-class-of type="application/xml"/>
+        <root-XML localName="RDF"/>
+        <root-XML localName="rdf"/>
+        <root-XML namespaceURI="http://www.w3.org/1999/02/22-rdf-syntax-ns#"/>
+        <root-XML namespaceURI="http://purl.org/rss/1.0/"/>
+        <alias type="text/rdf"/>
+        <magic priority="50">
+            <match value="&lt;rdf:RDF" type="string" offset="0:64"/>
+            <match value="&lt;RDF" type="string" offset="0:64"/>
+            <match value="xmlns:rdf" type="string" offset="0:64"/>
+            <match value="*&lt;DOCTYPE rdf:RDF" type="string" offset="0:120"/>
+        </magic>
+        <glob pattern="*.rdf"/>
+        <glob pattern="*.rdfs"/>
+        <glob pattern="*.xrdf"/>
+        <glob pattern="*.owl"/>
+        <glob pattern="*.rdfx"/>
+    </mime-type>
+
+    <!-- TriX -->
+    <mime-type type="application/trix">
+        <sub-class-of type="application/xml"/>
+        <root-XML namespaceURI="http://www.w3.org/2004/03/trix/trix-1/" localName="TriX"/>
+        <root-XML localName="TriX"/>
+        <glob pattern="*.trx"/>
+        <glob pattern="*.trix"/>
+    </mime-type>
+
+    <!-- END Semantic Web document mime types. -->
+
+    <!-- RSS -->
+    <mime-type type="application/rss+xml">
+        <alias type="text/rss"/>
+        <root-XML localName="rss"/>
+        <root-XML namespaceURI="http://purl.org/rss/1.0/"/>
+        <glob pattern="*.rss"/>
+    </mime-type>
+
+    <!-- ATOM -->
+    <mime-type type="application/atom+xml">
+        <sub-class-of type="application/xml"/>
+        <root-XML localName="feed"/>
+        <root-XML namespaceURI="http://www.w3.org/2005/Atom/" localName="feed"/>
+        <root-XML namespaceURI="http://www.w3.org/2005/Atom"  localName="feed"/>
+    </mime-type>
+
+    <!-- WSDL -->
+    <mime-type type="application/x-wsdl">
+        <sub-class-of type="application/xml"/>
+        <root-XML localName="definitions"/>
+        <root-XML namespaceURI="http://schemas.xmlsoap.org/wsdl/" localName="definitions"/>
+        <glob pattern="*.wsdl"/>
+    </mime-type>
+
+    <!-- HTML -->
+    <mime-type type="text/html">
+        <magic priority="50">
+            <match value="&lt;!DOCTYPE HTML" type="string" offset="0:64"/>
+            <match value="&lt;!doctype html" type="string" offset="0:64"/>
+            <match value="&lt;HEAD" type="string" offset="0:64"/>
+            <match value="&lt;head" type="string" offset="0:64"/>
+            <match value="&lt;TITLE" type="string" offset="0:64"/>
+            <match value="&lt;title" type="string" offset="0:64"/>
+            <match value="&lt;html" type="string" offset="0:64"/>
+            <match value="&lt;HTML" type="string" offset="0:64"/>
+            <match value="&lt;BODY" type="string" offset="0"/>
+            <match value="&lt;body" type="string" offset="0"/>
+            <match value="&lt;TITLE" type="string" offset="0"/>
+            <match value="&lt;title" type="string" offset="0"/>
+            <match value="&lt;!--" type="string" offset="0"/>
+            <match value="&lt;h1" type="string" offset="0"/>
+            <match value="&lt;H1" type="string" offset="0"/>
+            <match value="&lt;!doctype HTML" type="string" offset="0"/>
+            <match value="&lt;!DOCTYPE html" type="string" offset="0"/>
+        </magic>
+        <glob pattern="*.html"/>
+        <glob pattern="*.htm"/>
+    </mime-type>
+
+    <!-- XHTML -->
+    <mime-type type="application/xhtml+xml">
+        <sub-class-of type="application/xml"/>
+        <root-XML namespaceURI='http://www.w3.org/1999/xhtml'
+                  localName='html'/>
+        <glob pattern="*.xhtml"/>
+        <magic priority="50">
+            <match value="&lt;!DOCTYPE html PUBLIC &quot;-//W3C//DTD XHTML+RDFa 1.0//EN" type="string" offset="0:64"/>
+        </magic>
+    </mime-type>
+
+    <!-- BEGIN: MS-Office documents -->
+    <mime-type type="application/vnd.ms-powerpoint">
+        <glob pattern="*.ppz"/>
+        <glob pattern="*.ppt"/>
+        <glob pattern="*.pps"/>
+        <glob pattern="*.pot"/>
+        <magic priority="50">
+            <match value="0xcfd0e011" type="little32" offset="0"/>
+        </magic>
+    </mime-type>
+
+    <mime-type type="application/vnd.ms-excel">
+        <magic priority="50">
+            <match value="Microsoft Excel 5.0 Worksheet" type="string"
+                   offset="2080"/>
+        </magic>
+        <glob pattern="*.xls"/>
+        <glob pattern="*.xlc"/>
+        <glob pattern="*.xll"/>
+        <glob pattern="*.xlm"/>
+        <glob pattern="*.xlw"/>
+        <glob pattern="*.xla"/>
+        <glob pattern="*.xlt"/>
+        <glob pattern="*.xld"/>
+        <alias type="application/msexcel"/>
+    </mime-type>
+
+    <mime-type type="application/msword">
+        <magic priority="50">
+            <match value="\x31\xbe\x00\x00" type="string" offset="0"/>
+            <match value="PO^Q`" type="string" offset="0"/>
+            <match value="\376\067\0\043" type="string" offset="0"/>
+            <match value="\333\245-\0\0\0" type="string" offset="0"/>
+            <match value="Microsoft Word 6.0 Document" type="string"
+                   offset="2080"/>
+            <match value="Microsoft Word document data" type="string"
+                   offset="2112"/>
+        </magic>
+        <glob pattern="*.doc"/>
+        <alias type="application/vnd.ms-word"/>
+    </mime-type>
+    <!-- END: MS-Office documents -->
+
+    <!--
+         =====================================================================
+         Open Document Format for Office Applications (OpenDocument) v1.0
+         http://www.oasis-open.org/specs/index.php#opendocumentv1.0
+         =====================================================================
+     -->
+
+    <mime-type type="application/vnd.oasis.opendocument.text">
+        <comment>OpenDocument v1.0: Text document</comment>
+        <alias type="application/x-vnd.oasis.opendocument.text"/>
+        <glob pattern="*.odt"/>
+        <magic>
+            <match type="string" offset="0" value="PK">
+                <match type="string" offset="30"
+                       value="mimetypeapplication/vnd.oasis.opendocument.text"/>
+            </match>
+        </magic>
+    </mime-type>
+
+    <mime-type type="application/vnd.oasis.opendocument.text-template">
+        <comment>OpenDocument v1.0: Text document used as template
+        </comment>
+        <alias type="application/x-vnd.oasis.opendocument.text-template"/>
+        <glob pattern="*.ott"/>
+        <magic>
+            <match type="string" offset="0" value="PK">
+                <match type="string" offset="30"
+                       value="mimetypeapplication/vnd.oasis.opendocument.text-template"/>
+            </match>
+        </magic>
+    </mime-type>
+
+    <mime-type type="application/vnd.oasis.opendocument.graphics">
+        <comment>OpenDocument v1.0: Graphics document (Drawing)
+        </comment>
+        <alias type="application/x-vnd.oasis.opendocument.graphics"/>
+        <glob pattern="*.odg"/>
+        <magic>
+            <match type="string" offset="0" value="PK">
+                <match type="string" offset="30"
+                       value="mimetypeapplication/vnd.oasis.opendocument.graphics"/>
+            </match>
+        </magic>
+    </mime-type>
+
+    <mime-type type="application/vnd.oasis.opendocument.graphics-template">
+        <comment>OpenDocument v1.0: Graphics document used as
+            template
+        </comment>
+        <alias type="application/x-vnd.oasis.opendocument.graphics-template"/>
+        <glob pattern="*.otg"/>
+        <magic>
+            <match type="string" offset="0" value="PK">
+                <match type="string" offset="30"
+                       value="mimetypeapplication/vnd.oasis.opendocument.graphics-template"/>
+            </match>
+        </magic>
+    </mime-type>
+
+    <mime-type type="application/vnd.oasis.opendocument.presentation">
+        <comment>OpenDocument v1.0: Presentation document
+        </comment>
+        <alias type="application/x-vnd.oasis.opendocument.presentation"/>
+        <glob pattern="*.odp"/>
+        <magic>
+            <match type="string" offset="0" value="PK">
+                <match type="string" offset="30"
+                       value="mimetypeapplication/vnd.oasis.opendocument.presentation"/>
+            </match>
+        </magic>
+    </mime-type>
+
+    <mime-type type="application/vnd.oasis.opendocument.presentation-template">
+        <comment>OpenDocument v1.0: Presentation document used as
+            template
+        </comment>
+        <alias type="application/x-vnd.oasis.opendocument.presentation-template"/>
+        <glob pattern="*.otp"/>
+        <magic>
+            <match type="string" offset="0" value="PK">
+                <match type="string" offset="30"
+                       value="mimetypeapplication/vnd.oasis.opendocument.presentation-template"/>
+            </match>
+        </magic>
+    </mime-type>
+
+    <mime-type type="application/vnd.oasis.opendocument.spreadsheet">
+        <comment>OpenDocument v1.0: Spreadsheet document</comment>
+        <alias type="application/x-vnd.oasis.opendocument.spreadsheet"/>
+        <glob pattern="*.ods"/>
+        <magic>
+            <match type="string" offset="0" value="PK">
+                <match type="string" offset="30"
+                       value="mimetypeapplication/vnd.oasis.opendocument.spreadsheet"/>
+            </match>
+        </magic>
+    </mime-type>
+
+    <mime-type type="application/vnd.oasis.opendocument.spreadsheet-template">
+        <comment>OpenDocument v1.0: Spreadsheet document used as
+            template
+        </comment>
+        <alias type="application/x-vnd.oasis.opendocument.spreadsheet-template"/>
+        <glob pattern="*.ots"/>
+        <magic>
+            <match type="string" offset="0" value="PK">
+                <match type="string" offset="30"
+                       value="mimetypeapplication/vnd.oasis.opendocument.spreadsheet-template"/>
+            </match>
+        </magic>
+    </mime-type>
+
+    <mime-type type="application/vnd.oasis.opendocument.chart">
+        <comment>OpenDocument v1.0: Chart document</comment>
+        <alias type="application/x-vnd.oasis.opendocument.chart"/>
+        <glob pattern="*.odc"/>
+        <magic>
+            <match type="string" offset="0" value="PK">
+                <match type="string" offset="30"
+                       value="mimetypeapplication/vnd.oasis.opendocument.chart"/>
+            </match>
+        </magic>
+    </mime-type>
+
+    <mime-type type="application/vnd.oasis.opendocument.chart-template">
+        <comment>OpenDocument v1.0: Chart document used as
+            template
+        </comment>
+        <alias type="application/x-vnd.oasis.opendocument.chart-template"/>
+        <glob pattern="*.otc"/>
+        <magic>
+            <match type="string" offset="0" value="PK">
+                <match type="string" offset="30"
+                       value="mimetypeapplication/vnd.oasis.opendocument.chart-template"/>
+            </match>
+        </magic>
+    </mime-type>
+
+    <mime-type type="application/vnd.oasis.opendocument.image">
+        <comment>OpenDocument v1.0: Image document</comment>
+        <alias type="application/x-vnd.oasis.opendocument.image"/>
+        <glob pattern="*.odi"/>
+        <magic>
+            <match type="string" offset="0" value="PK">
+                <match type="string" offset="30"
+                       value="mimetypeapplication/vnd.oasis.opendocument.image"/>
+            </match>
+        </magic>
+    </mime-type>
+
+    <mime-type type="application/vnd.oasis.opendocument.image-template">
+        <comment>OpenDocument v1.0: Image document used as
+            template
+        </comment>
+        <alias type="application/x-vnd.oasis.opendocument.image-template"/>
+        <glob pattern="*.oti"/>
+        <magic>
+            <match type="string" offset="0" value="PK">
+                <match type="string" offset="30"
+                       value="mimetypeapplication/vnd.oasis.opendocument.image-template"/>
+            </match>
+        </magic>
+    </mime-type>
+
+    <mime-type type="application/vnd.oasis.opendocument.formula">
+        <comment>OpenDocument v1.0: Formula document</comment>
+        <alias type="application/x-vnd.oasis.opendocument.formula"/>
+        <glob pattern="*.odf"/>
+        <magic>
+            <match type="string" offset="0" value="PK">
+                <match type="string" offset="30"
+                       value="mimetypeapplication/vnd.oasis.opendocument.formula"/>
+            </match>
+        </magic>
+    </mime-type>
+
+    <mime-type type="application/vnd.oasis.opendocument.formula-template">
+        <comment>OpenDocument v1.0: Formula document used as
+            template
+        </comment>
+        <alias type="application/x-vnd.oasis.opendocument.formula-template"/>
+        <glob pattern="*.otf"/>
+        <magic>
+            <match type="string" offset="0" value="PK">
+                <match type="string" offset="30"
+                       value="mimetypeapplication/vnd.oasis.opendocument.formula-template"/>
+            </match>
+        </magic>
+    </mime-type>
+
+    <mime-type type="application/vnd.oasis.opendocument.text-master">
+        <comment>OpenDocument v1.0: Global Text document</comment>
+        <alias type="application/x-vnd.oasis.opendocument.text-master"/>
+        <glob pattern="*.odm"/>
+        <magic>
+            <match type="string" offset="0" value="PK">
+                <match type="string" offset="30"
+                       value="mimetypeapplication/vnd.oasis.opendocument.text-master"/>
+            </match>
+        </magic>
+    </mime-type>
+
+    <mime-type type="application/vnd.oasis.opendocument.text-web">
+        <comment>OpenDocument v1.0: Text document used as template
+            for HTML documents
+        </comment>
+        <alias type="application/x-vnd.oasis.opendocument.text-web"/>
+        <glob pattern="*.oth"/>
+        <magic>
+            <match type="string" offset="0" value="PK">
+                <match type="string" offset="30"
+                       value="mimetypeapplication/vnd.oasis.opendocument.text-web"/>
+            </match>
+        </magic>
+    </mime-type>
+
+    <!-- BEGIN: OSX Specific -->
+    <mime-type type="application/mac-binhex40">
+        <glob pattern="*.hqx"/>
+    </mime-type>
+
+    <mime-type type="application/mac-compactpro">
+        <glob pattern="*.cpt"/>
+    </mime-type>
+    <!-- END: OSX Specific -->
+
+    <mime-type type="application/rtf">
+        <glob pattern="*.rtf"/>
+        <alias type="text/rtf"/>
+    </mime-type>
+
+    <mime-type type="application/pdf">
+        <magic priority="50">
+            <match value="%PDF-" type="string" offset="0"/>
+        </magic>
+        <glob pattern="*.pdf"/>
+        <alias type="application/x-pdf"/>
+    </mime-type>
+
+    <mime-type type="application/x-mif">
+        <alias type="application/vnd.mif"/>
+    </mime-type>
+
+    <mime-type type="application/vnd.wap.wbxml">
+        <glob pattern="*.wbxml"/>
+    </mime-type>
+
+    <mime-type type="application/vnd.wap.wmlc">
+        <_comment>Compiled WML Document</_comment>
+        <glob pattern="*.wmlc"/>
+    </mime-type>
+
+    <mime-type type="application/vnd.wap.wmlscriptc">
+        <_comment>Compiled WML Script</_comment>
+        <glob pattern="*.wmlsc"/>
+    </mime-type>
+
+    <mime-type type="text/vnd.wap.wmlscript">
+        <_comment>WML Script</_comment>
+        <glob pattern="*.wmls"/>
+    </mime-type>
+
+    <mime-type type="application/x-bzip">
+        <alias type="application/x-bzip2"/>
+    </mime-type>
+
+    <mime-type type="application/x-bzip-compressed-tar">
+        <glob pattern="*.tbz"/>
+        <glob pattern="*.tbz2"/>
+    </mime-type>
+
+    <mime-type type="application/x-cdlink">
+        <_comment>Virtual CD-ROM CD Image File</_comment>
+        <glob pattern="*.vcd"/>
+    </mime-type>
+
+    <mime-type type="application/x-director">
+        <_comment>Shockwave Movie</_comment>
+        <glob pattern="*.dcr"/>
+        <glob pattern="*.dir"/>
+        <glob pattern="*.dxr"/>
+    </mime-type>
+
+    <mime-type type="application/x-futuresplash">
+        <_comment>Macromedia FutureSplash File</_comment>
+        <glob pattern="*.spl"/>
+    </mime-type>
+
+    <mime-type type="application/x-java">
+        <alias type="application/java"/>
+    </mime-type>
+
+    <mime-type type="application/x-koan">
+        <_comment>SSEYO Koan File</_comment>
+        <glob pattern="*.skp"/>
+        <glob pattern="*.skd"/>
+        <glob pattern="*.skt"/>
+        <glob pattern="*.skm"/>
+    </mime-type>
+
+    <mime-type type="application/x-latex">
+        <_comment>LaTeX Source Document</_comment>
+        <glob pattern="*.latex"/>
+    </mime-type>
+
+    <mime-type type="application/x-ms-dos-executable">
+        <alias type="application/x-dosexec"/>
+    </mime-type>
+
+    <mime-type type="application/ogg">
+        <alias type="application/x-ogg"/>
+    </mime-type>
+
+    <mime-type type="application/x-rar">
+        <alias type="application/x-rar-compressed"/>
+    </mime-type>
+
+    <mime-type type="application/x-shellscript">
+        <alias type="application/x-sh"/>
+    </mime-type>
+
+    <mime-type type="audio/midi">
+        <glob pattern="*.kar"/>
+    </mime-type>
+
+    <mime-type type="audio/x-pn-realaudio">
+        <alias type="audio/x-realaudio"/>
+    </mime-type>
+
+    <mime-type type="image/tiff">
+        <magic priority="50">
+            <match value="0x4d4d2a00" type="string" offset="0"/>
+            <match value="0x49492a00" type="string" offset="0"/>
+        </magic>
+    </mime-type>
+
+    <mime-type type="message/rfc822">
+        <magic priority="50">
+            <match type="string" value="Relay-Version:" offset="0"/>
+            <match type="string" value="#! rnews" offset="0"/>
+            <match type="string" value="N#! rnews" offset="0"/>
+            <match type="string" value="Forward to" offset="0"/>
+            <match type="string" value="Pipe to" offset="0"/>
+            <match type="string" value="Return-Path:" offset="0"/>
+            <match type="string" value="From:" offset="0"/>
+            <match type="string" value="Message-ID:" offset="0"/>
+            <match type="string" value="Date:" offset="0"/>
+        </magic>
+    </mime-type>
+
+    <mime-type type="image/vnd.wap.wbmp">
+        <_comment>Wireless Bitmap File Format</_comment>
+        <glob pattern="*.wbmp"/>
+    </mime-type>
+
+    <mime-type type="image/x-psd">
+        <alias type="image/photoshop"/>
+    </mime-type>
+
+    <mime-type type="image/x-xcf">
+        <alias type="image/xcf"/>
+        <magic priority="50">
+            <match type="string" value="gimp xcf " offset="0"/>
+        </magic>
+    </mime-type>
+
+    <mime-type type="model/iges">
+        <_comment>
+            Initial Graphics Exchange Specification Format
+        </_comment>
+        <glob pattern="*.igs"/>
+        <glob pattern="*.iges"/>
+    </mime-type>
+
+    <mime-type type="model/mesh">
+        <glob pattern="*.msh"/>
+        <glob pattern="*.mesh"/>
+        <glob pattern="*.silo"/>
+    </mime-type>
+
+    <mime-type type="model/vrml">
+        <glob pattern="*.vrml"/>
+    </mime-type>
+
+    <mime-type type="text/x-tcl">
+        <alias type="application/x-tcl"/>
+    </mime-type>
+
+    <mime-type type="text/x-tex">
+        <alias type="application/x-tex"/>
+    </mime-type>
+
+    <mime-type type="text/x-texinfo">
+        <alias type="application/x-texinfo"/>
+    </mime-type>
+
+    <mime-type type="text/x-troff-me">
+        <alias type="application/x-troff-me"/>
+    </mime-type>
+
+    <mime-type type="video/vnd.mpegurl">
+        <glob pattern="*.mxu"/>
+    </mime-type>
+
+    <mime-type type="x-conference/x-cooltalk">
+        <_comment>Cooltalk Audio</_comment>
+        <glob pattern="*.ice"/>
+    </mime-type>
+
+    <!-- ===================================================================== -->
+    <!-- TIKA-85: http://www.apache.org/dev/svn-eol-style.txt                  -->
+    <!-- ===================================================================== -->
+
+    <mime-type type="image/x-icon">
+        <glob pattern="*.ico"/>
+    </mime-type>
+
+    <mime-type type="image/jpeg">
+        <glob pattern="*.jpg"/>
+    </mime-type>
+
+    <mime-type type="image/png">
+        <glob pattern="*.png"/>
+    </mime-type>
+
+    <mime-type type="application/zip">
+        <glob pattern="*.zip"/>
+        <magic priority="50">
+            <match type="string" value="PK" offset="0:2"/>
+        </magic>
+    </mime-type>
+
+
+    <mime-type type="application/gzip">
+        <glob pattern="*.gz"/>
+        <glob pattern="*.tgz"/>
+        <glob pattern="*.gzip"/>
+        <!--
+          <magic priority="50">
+              <match type="host16" value="1f 8b 08" offset="0:2" />
+          </magic>
+        -->
+    </mime-type>
+
+    <mime-type type="audio/basic">
+        <glob pattern="*.au"/>
+        <glob pattern="*.snd"/>
+    </mime-type>
+
+    <mime-type type="video/x-ms-asf">
+        <glob pattern="*.asf"/>
+        <magicNumber encoding="hex">30 26 b2 75 8e 66 cf 11 a6 d9 00 aa 00 62 ce 6c</magicNumber>
+    </mime-type>
+
+    <mime-type type="video/x-ms-asx">
+        <glob pattern="*.asx"/>
+        <magic priority="50">
+            <match type="string" value="&lt;asx" offset="0:64"/>
+            <match type="string" value="&lt;ASX" offset="0:64"/>
+        </magic>
+    </mime-type>
+
+    <mime-type type="audio/x-ms-wax">
+        <glob pattern="*.wax"/>
+    </mime-type>
+
+    <mime-type type="video/x-ms-wvx">
+        <glob pattern="*.wvx"/>
+    </mime-type>
+
+    <mime-type type="video/x-ms-wmx">
+        <glob pattern="*.wmx"/>
+    </mime-type>
+
+    <mime-type type="video/x-msvideo">
+        <glob pattern="*.avi"/>
+        <!--	<magicNumber encoding="hex" offset="8">41 56 49 20</magicNumber> -->
+    </mime-type>
+
+
+    <!--
+          This MIME type was invented to let the wmv and wma media types
+          share the magic number. Differentiation between these two files
+          is only possible based on file name extension.
+    -->
+
+    <mime-type type="application/x-ms-wm">
+        <magicNumber encoding="hex">30 26 b2 75 8e 66 cf 11 a6 d9 00 aa 00 62 ce 6c</magicNumber>
+    </mime-type>
+
+    <mime-type type="audio/x-ms-wma">
+        <sub-class-of type="application/x-ms-wm"/>
+        <glob pattern="*.wma"/>
+    </mime-type>
+
+    <mime-type type="video/x-ms-wmv">
+        <sub-class-of type="application/x-ms-wm"/>
+        <glob pattern="*.wmv"/>
+        <glob pattern="*.wm"/>
+    </mime-type>
+
+    <mime-type type="video/quicktime">
+        <glob pattern="*.mov"/>
+        <!--  <magicNumber encoding="string" offset="4">moov</magicNumber> -->
+    </mime-type>
+
+    <mime-type type="video/mpeg">
+        <glob pattern="*.mpg"/>
+        <glob pattern="*.mpeg"/>
+        <!--
+      <magicNumber encoding="hex">00 00 01 b3</magicNumber>
+      <magicNumber encoding="hex">00 00 01 ba</magicNumber> -->
+    </mime-type>
+
+    <mime-type type="application/x-shockwave-flash">
+        <glob pattern="*.swf"/>
+        <!--  <magicNumber encoding="hex">46 57 53</magicNumber>-->
+    </mime-type>
+
+    <mime-type type="application/x-ogg">
+        <glob pattern="*.ogg"/>
+        <magic priority="50">
+            <match type="string" value="OggS" offset="0:64"/>
+        </magic>
+    </mime-type>
+
+    <mime-type type="application/vnd.rn-realmedia">
+        <glob pattern="*.rm"/>
+        <glob pattern="*.ram"/>
+        <magic priority="50">
+            <match type="string" value=".RMF" offset="0:64"/>
+            <match type="string" value="rtsp://" offset="0:64"/>
+        </magic>
+    </mime-type>
+
+    <mime-type type="audio/x-wav">
+        <glob pattern="*.wav"/>
+        <magic priority="50">
+            <match type="string" value="WAVE" offset="0:64"/>
+        </magic>
+    </mime-type>
+
+    <mime-type type="audio/mpeg">
+        <glob pattern="*.mp3"/>
+        <glob pattern="*.mp2"/>
+        <magic priority="50">
+            <match type="string" value="ID3" offset="0:64"/>
+        </magic>
+
+    </mime-type>
+
+    <mime-type type="audio/midi">
+        <glob pattern="*.mid"/>
+        <glob pattern="*.midi"/>
+        <glob pattern="*.rmi"/>
+        <magic priority="50">
+            <match type="string" value="MThd" offset="0:64"/>
+            <match type="string" value="RMI" offset="0:8"/>
+        </magic>
+    </mime-type>
+
+    <mime-type type="video/x-msvideo">
+        <glob pattern="*.avi"/>
+        <!-- 	<magicNumber encoding="hex" offset="8">41 56 49 20</magicNumber> -->
+    </mime-type>
+
+    <mime-type type="video/mp4">
+        <glob pattern="*.mp4"/>
+        <glob pattern="*.mpg4"/>
+        <glob pattern="*.m4v"/>
+        <glob pattern="*.mp4v"/>
+        <glob pattern="*.divx"/>
+        <glob pattern="*.xvid"/>
+        <glob pattern="*.264"/>
+    </mime-type>
+
+    <mime-type type="audio/mp4">
+        <glob pattern="*.m4a"/>
+        <glob pattern="*.m4p"/>
+    </mime-type>
+
+    <mime-type type="video/3gpp">
+        <glob pattern="*.3gp"/>
+        <glob pattern="*.3g2"/>
+    </mime-type>
+
+    <mime-type type="audio/x-aiff">
+        <glob pattern="*.aiff"/>
+        <glob pattern="*.aif"/>
+        <glob pattern="*.aifc"/>
+        <glob pattern="*.aiff"/>
+        <magic priority="50">
+            <match type="string" value="FORM" offset="0:64"/>
+        </magic>
+    </mime-type>
+
+    <mime-type type="application/x-ms-wmd">
+        <sub-class-of type="application/zip"/>
+        <glob pattern="*.wmd"/>
+    </mime-type>
+
+    <mime-type type="video/x-flv">
+        <glob pattern="*.flv"/>
+        <magic priority="50">
+            <match type="string" value="FLV" offset="0:64"/>
+        </magic>
+    </mime-type>
+
+    <mime-type type="audio/flac">
+        <glob pattern="*.flac"/>
+        <!--<magicNumber encoding="hex">66 4c 61 43 00 00 00 22</magicNumber>-->
+    </mime-type>
+
+    <mime-type type="application/smil">
+        <glob pattern="*.smi"/>
+        <glob pattern="*.smil"/>
+    </mime-type>
+</mime-info>

Added: incubator/any23/trunk/mime/src/main/resources/org/apache/any23/mime/tika-config.xml
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/mime/src/main/resources/org/apache/any23/mime/tika-config.xml?rev=1380397&view=auto
==============================================================================
--- incubator/any23/trunk/mime/src/main/resources/org/apache/any23/mime/tika-config.xml (added)
+++ incubator/any23/trunk/mime/src/main/resources/org/apache/any23/mime/tika-config.xml Mon Sep  3 23:11:15 2012
@@ -0,0 +1,20 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+    <mimeTypeRepository resource="/org/apache/any23/mime/mimetypes.xml" magic="false"/>
+</properties>
\ No newline at end of file

Added: incubator/any23/trunk/mime/src/test/java/org/apache/any23/mime/TikaMIMETypeDetectorTest.java
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/mime/src/test/java/org/apache/any23/mime/TikaMIMETypeDetectorTest.java?rev=1380397&view=auto
==============================================================================
--- incubator/any23/trunk/mime/src/test/java/org/apache/any23/mime/TikaMIMETypeDetectorTest.java (added)
+++ incubator/any23/trunk/mime/src/test/java/org/apache/any23/mime/TikaMIMETypeDetectorTest.java Mon Sep  3 23:11:15 2012
@@ -0,0 +1,474 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.mime;
+
+import junit.framework.Assert;
+import org.apache.any23.mime.purifier.WhiteSpacesPurifier;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+import org.openrdf.rio.RDFFormat;
+
+import java.io.BufferedInputStream;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.List;
+
+/**
+ * Test case for {@link TikaMIMETypeDetector} class.
+ *
+ * @author juergen
+ * @author Michele Mostarda (michele.mostarda@gmail.com)
+ */
+public class TikaMIMETypeDetectorTest {
+
+    private static final String PLAIN  = "text/plain";
+    private static final String HTML   = "text/html";
+    private static final String XML    = "application/xml";
+    private static final String TRIX   = RDFFormat.TRIX.getDefaultMIMEType();
+    private static final String XHTML  = "application/xhtml+xml";
+    private static final String RDFXML = RDFFormat.RDFXML.getDefaultMIMEType();
+    private static final String TURTLE = RDFFormat.TURTLE.getDefaultMIMEType();
+    private static final String N3     = RDFFormat.N3.getDefaultMIMEType();
+    private static final String NQUADS = RDFFormat.NQUADS.getDefaultMIMEType();
+    private static final String CSV    = "text/csv";
+    private static final String RSS    = "application/rss+xml";
+    private static final String ATOM   = "application/atom+xml";
+
+    private TikaMIMETypeDetector detector;
+
+    @Before
+    public void setUp() throws Exception {
+        detector = new TikaMIMETypeDetector(new WhiteSpacesPurifier());
+    }
+
+    @After
+    public void tearDown() throws Exception {
+        detector = null;
+    }
+
+    @Test
+    public void testN3Detection() throws IOException {
+        assertN3Detection("<http://example.org/path> <http://foo.com> <http://example.org/Document/foo#> .");
+        assertN3Detection("_:bnode1 <http://foo.com> _:bnode2 .");
+        assertN3Detection("<http://www.example.com> <http://purl.org/dc/elements/1.1/title> \"x\" .");
+        assertN3Detection("<http://www.example.com> <http://purl.org/dc/elements/1.1/title> \"x\"@it .");
+        assertN3Detection("<http://www.example.com> <http://purl.org/dc/elements/1.1/title> \"x\"^^<http://xxx.net> .");
+        assertN3Detection("<http://www.example.com> <http://purl.org/dc/elements/1.1/title> \"x\"^^xsd:integer .");
+
+        // Invalid N3 line: the terminating '.' precedes the object.
+        assertN3DetectionFail("" +
+                "<http://wrong.example.org/path> <http://wrong.foo.com> . <http://wrong.org/Document/foo#>"
+        );
+        // An NQuads statement must not be mistaken for N3.
+        assertN3DetectionFail(
+            "<http://example.org/path> <http://foo.com> <http://dom.org/Document/foo#> <http://path/to/graph> ."
+        );
+    }
+
+    @Test
+    public void testNQuadsDetection() throws IOException {
+        assertNQuadsDetection(
+                "<http://www.ex.eu> <http://foo.com> <http://example.org/Document/foo#> <http://path.to.graph> ."
+        );
+        assertNQuadsDetection(
+                "_:bnode1 <http://foo.com> _:bnode2 <http://path.to.graph> ."
+        );
+        assertNQuadsDetection(
+                "<http://www.ex.eu> <http://purl.org/dc/elements/1.1/title> \"x\" <http://path.to.graph> ."
+        );
+        assertNQuadsDetection(
+                "<http://www.ex.eu> <http://purl.org/dc/elements/1.1/title> \"x\"@it <http://path.to.graph> ."
+        );
+        assertNQuadsDetection(
+                "<http://www.ex.eu> <http://dd.cc.org/1.1/p> \"xxx\"^^<http://www.sp.net/a#tt> <http://path.to.graph> ."
+        );
+        assertNQuadsDetection(
+                "<http://www.ex.eu> <http://purlo.org/1.1/title> \"yyy\"^^xsd:datetime <http://path.to.graph> ."
+        );
+
+        // Invalid NQuads line: the terminating '.' precedes the graph name.
+        assertNQuadsDetectionFail(
+                "<http://www.wrong.com> <http://wrong.com/1.1/tt> \"x\"^^<http://xxx.net/int> . <http://path.to.graph>"
+        );
+        // An N3 statement must not be mistaken for NQuads.
+        assertNQuadsDetectionFail(
+                "<http://example.org/path> <http://foo.com> <http://example.org/Document/foo#> ."
+        );
+    }
+
+    /* BEGIN: by content. */
+
+    @Test
+    public void testDetectRSS1ByContent() throws Exception {
+        detectMIMEtypeByContent(RDFXML, manifestRss1());
+    }
+
+    private List<String> manifestRss1() {
+        return Arrays.asList("/application/rss1/test1");
+    }
+
+    @Test
+    public void testDetectRSS2ByContent() throws Exception {
+        detectMIMEtypeByContent(RSS, manifestRss2());
+    }
+
+    private List<String> manifestRss2() {
+        return Arrays.asList("/application/rss2/index.html", "/application/rss2/rss2sample.xml", "/application/rss2/test1");
+    }
+
+    @Test
+    public void testDetectRDFN3ByContent() throws Exception {
+        detectMIMEtypeByContent(N3, manifestN3());
+    }
+
+    private List<String> manifestN3() {
+        return Arrays.asList("/application/rdfn3/test1", "/application/rdfn3/test2", "/application/rdfn3/test3");
+    }
+
+    @Test
+    public void testDetectRDFNQuadsByContent() throws Exception {
+        detectMIMEtypeByContent(NQUADS, manifestNQuads());
+    }
+
+    private List<String> manifestNQuads() {
+        return Arrays.asList("/application/nquads/test1.nq", "/application/nquads/test2.nq");
+    }
+
+    @Test
+    public void testDetectRDFXMLByContent() throws Exception {
+        detectMIMEtypeByContent(RDFXML, manifestRdfXml());
+    }
+
+    private List<String> manifestRdfXml() {
+        return Arrays.asList("/application/rdfxml/error.rdf", "/application/rdfxml/foaf", "/application/rdfxml/physics.owl", "/application/rdfxml/test1", "/application/rdfxml/test2", "/application/rdfxml/test3");
+    }
+
+    @Test
+    public void testDetectTriXByContent() throws Exception {
+        detectMIMEtypeByContent(TRIX, manifestTrix());
+    }
+
+    private List<String> manifestTrix() {
+        return Arrays.asList("/application/trix/test1.trx");
+    }
+
+    @Test
+    public void testDetectAtomByContent() throws Exception {
+        detectMIMEtypeByContent(ATOM, manifestAtom());
+    }
+
+    private List<String> manifestAtom() {
+        return Arrays.asList("/application/atom/atom.xml");
+    }
+
+    @Test
+    public void testDetectHTMLByContent() throws Exception {
+        detectMIMEtypeByContent(HTML, manifestHtml());
+    }
+
+    private List<String> manifestHtml() {
+        return Arrays.asList("/text/html/test1");
+    }
+
+    @Test
+    public void testDetectRDFaByContent() throws Exception {
+        detectMIMEtypeByContent(XHTML, manifestRdfa());
+    }
+
+    private List<String> manifestRdfa() {
+        return Arrays.asList("/application/rdfa/false.test", "/application/rdfa/london-gazette.html", "/application/rdfa/mic.xhtml", "/application/rdfa/test1.html");
+    }
+
+    @Test
+    public void testDetectXHTMLByContent() throws Exception {
+        detectMIMEtypeByContent(XHTML, manifestXHtml());
+    }
+
+    private List<String> manifestXHtml() {
+        return Arrays.asList("/application/xhtml/blank-file-header.xhtml", "/application/xhtml/index.html", "/application/xhtml/test1");
+    }
+
+    @Test
+    public void testDetectWSDLByContent() throws Exception {
+        detectMIMEtypeByContent("application/x-wsdl", manifestWsdl());
+    }
+
+    private List<String> manifestWsdl() {
+        return Arrays.asList("/application/wsdl/error.wsdl", "/application/wsdl/test1");
+    }
+
+    @Test
+    public void testDetectZIPByContent() throws Exception {
+        detectMIMEtypeByContent("application/zip", manifestZip());
+    }
+
+    private List<String> manifestZip() {
+        return Arrays.asList("/application/zip/4_entries.zip", "/application/zip/test1.zip", "/application/zip/test2");
+    }
+
+    @Test
+    public void testDetectCSVByContent() throws Exception {
+        detectMIMEtypeByContent(CSV, manifestCsv());
+    }
+
+    private List<String> manifestCsv() {
+        return Arrays.asList("/org/apache/any23/extractor/csv/test-comma.csv", "/org/apache/any23/extractor/csv/test-semicolon.csv", "/org/apache/any23/extractor/csv/test-tab.csv", "/org/apache/any23/extractor/csv/test-type.csv");
+    }
+
+    /* END: by content. */
+
+    /* BEGIN: by content metadata. */
+
+    @Test
+    public void testDetectContentPlainByMeta() throws IOException {
+        detectMIMETypeByMimeTypeHint(PLAIN, "text/plain");
+    }
+
+    @Test
+    public void testDetectTextRDFByMeta() throws IOException {
+        detectMIMETypeByMimeTypeHint(RDFXML, "text/rdf");
+    }
+
+    @Test
+    public void testDetectTextN3ByMeta() throws IOException {
+        detectMIMETypeByMimeTypeHint(N3, "text/rdf+n3");
+    }
+
+    @Test
+    public void testDetectTextNQuadsByMeta() throws IOException {
+        detectMIMETypeByMimeTypeHint(NQUADS, "text/x-nquads");
+    }
+
+    @Test
+    public void testDetectTextTurtleByMeta() throws IOException {
+        detectMIMETypeByMimeTypeHint(TURTLE, "text/turtle");
+    }
+
+    @Test
+    public void testDetectRDFXMLByMeta() throws IOException {
+        detectMIMETypeByMimeTypeHint(RDFXML, "application/rdf+xml");
+    }
+
+    @Test
+    public void testDetectXMLByMeta() throws IOException {
+        detectMIMETypeByMimeTypeHint(XML, "application/xml");
+    }
+
+    @Test
+    public void testDetectTriXByMeta() throws IOException {
+        detectMIMETypeByMimeTypeHint(TRIX, "application/trix");
+    }
+
+    @Test
+    public void testDetectExtensionN3ByMeta() throws IOException {
+        detectMIMETypeByMimeTypeHint(PLAIN, "text/plain");
+    }
+
+    @Test
+    public void testDetectXHTMLByMeta() throws IOException {
+        detectMIMETypeByMimeTypeHint(XHTML, "application/xhtml+xml");
+    }
+
+    @Test
+    public void testDetectTextHTMLByMeta() throws IOException {
+        detectMIMETypeByMimeTypeHint(HTML, "text/html");
+    }
+
+    @Test
+    public void testDetectTextPlainByMeta() throws IOException {
+        detectMIMETypeByMimeTypeHint(PLAIN, "text/plain");
+    }
+
+    @Test
+    public void testDetectApplicationXMLByMeta() throws IOException {
+        detectMIMETypeByMimeTypeHint(XML, "application/xml");
+    }
+
+    @Test
+    public void testDetectApplicationCSVByMeta() throws IOException {
+        detectMIMETypeByMimeTypeHint(CSV, "text/csv");
+    }
+
+    /* END: by content metadata. */
+
+    /* BEGIN: by content and name. */
+
+    @Test
+    public void testRDFXMLByContentAndName() throws Exception {
+        detectMIMETypeByContentAndName(RDFXML, manifestRdfXml());
+    }
+
+    @Test
+    public void testTriXByContentAndName() throws Exception {
+        detectMIMETypeByContentAndName(TRIX, manifestTrix());
+    }
+
+    @Test
+    public void testRSS1ByContentAndName() throws Exception {
+        detectMIMETypeByContentAndName(RDFXML, manifestRss1());
+    }
+
+    @Test
+    public void testRSS2ByContentAndName() throws Exception {
+        detectMIMETypeByContentAndName(RSS, manifestRss2());
+    }
+
+    @Test
+    public void testDetectRDFN3ByContentAndName() throws Exception {
+        detectMIMETypeByContentAndName(N3, manifestN3());
+    }
+
+    @Test
+    public void testDetectRDFNQuadsByContentAndName() throws Exception {
+        detectMIMETypeByContentAndName(NQUADS, manifestNQuads());
+    }
+
+    @Test
+    public void testAtomByContentAndName() throws Exception {
+        detectMIMETypeByContentAndName(ATOM, manifestAtom());
+    }
+
+    @Test
+    public void testHTMLByContentAndName() throws Exception {
+        detectMIMETypeByContentAndName(HTML, manifestHtml());
+    }
+
+    @Test
+    public void testXHTMLByContentAndName() throws Exception {
+        detectMIMETypeByContentAndName(XHTML, manifestXHtml());
+    }
+
+     @Test
+    public void testWSDLByContentAndName() throws Exception {
+        detectMIMETypeByContentAndName("application/x-wsdl", manifestWsdl());
+    }
+
+    @Test
+    public void testZipByContentAndName() throws Exception {
+        detectMIMETypeByContentAndName("application/zip", manifestZip());
+    }
+
+    @Test
+    public void testRDFaByContentAndName() throws Exception {
+        detectMIMETypeByContentAndName(XHTML, manifestRdfa());
+    }
+
+    @Test
+    public void testCSVByContentAndName() throws Exception {
+        detectMIMETypeByContentAndName(CSV, manifestCsv());
+    }
+
+    /* END: by content and name. */
+
+    /** Asserts that the N3 checker accepts the given expression, fed to it as UTF-8 bytes. */
+    private void assertN3Detection(String n3Exp) throws IOException {
+        Assert.assertTrue( TikaMIMETypeDetector.checkN3Format(new ByteArrayInputStream(n3Exp.getBytes("UTF-8"))) );
+    }
+
+    /** Asserts that the N3 checker rejects the given expression, fed to it as UTF-8 bytes. */
+    private void assertN3DetectionFail(String n3Exp) throws IOException {
+        Assert.assertFalse( TikaMIMETypeDetector.checkN3Format(new ByteArrayInputStream(n3Exp.getBytes("UTF-8"))) );
+    }
+
+    /** Asserts that the NQuads checker accepts the given expression, fed to it as UTF-8 bytes. */
+    private void assertNQuadsDetection(String n4Exp) throws IOException {
+        Assert.assertTrue( TikaMIMETypeDetector.checkNQuadsFormat(new ByteArrayInputStream(n4Exp.getBytes("UTF-8"))) );
+    }
+
+    /** Asserts that the NQuads checker rejects the given expression, fed to it as UTF-8 bytes. */
+    private void assertNQuadsDetectionFail(String n4Exp) throws IOException {
+        Assert.assertFalse( TikaMIMETypeDetector.checkNQuadsFormat(new ByteArrayInputStream(n4Exp.getBytes("UTF-8"))) );
+    }
+
+    /**
+     * Checks the detection of a specific MIME based on content analysis.
+     *
+     * @param expectedMimeType the expected mime type.
+     * @param testDir the target file.
+     * @throws IOException
+     */
+    private void detectMIMEtypeByContent(String expectedMimeType, Collection<String> manifest)
+    throws IOException {
+        String detectedMimeType;
+        for (String test : manifest) {
+            InputStream is = new BufferedInputStream(this.getClass().getResourceAsStream(test));
+            detectedMimeType = detector.guessMIMEType(
+                    null,
+                    is,
+                    null
+            ).toString();
+            if (test.contains("error"))
+                Assert.assertNotSame(expectedMimeType, detectedMimeType);
+            else {
+                Assert.assertEquals(
+                        String.format("Error in mimetype detection for file %s", test),
+                        expectedMimeType,
+                        detectedMimeType
+                );
+            }
+            is.close();
+        }
+    }
+
+    /**
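+     * Verifies the detection of a specific MIME type from the MIME type hint alone (no file name, no content).
+     *
+     * @param expectedMimeType the MIME type the detector is expected to return.
+     * @param contentTypeHeader the MIME type hint, parsed with <code>MIMEType.parse</code>.
+     * @throws IOException propagated from the detector call, if any.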
+     */
+    private void detectMIMETypeByMimeTypeHint(String expectedMimeType, String contentTypeHeader)
+    throws IOException {
+        String detectedMimeType = detector.guessMIMEType(
+                null,
+                null,
+                MIMEType.parse(contentTypeHeader)
+        ).toString();
+        Assert.assertEquals(expectedMimeType, detectedMimeType);
+    }
+
+    /**
+     * Verifies the detection of a specific MIME based on content and filename.
+     *
+     * @param expectedMimeType
+     * @param testDir
+     * @throws IOException
+     */
+    private void detectMIMETypeByContentAndName(String expectedMimeType, Collection<String> manifest) throws IOException {
+        String detectedMimeType;
+        for (String test : manifest) {
+            InputStream is = new BufferedInputStream(this.getClass().getResourceAsStream(test));
+            detectedMimeType = detector.guessMIMEType(test, is, null).toString();
+            if (test.contains("error"))
+                Assert.assertNotSame(expectedMimeType, detectedMimeType);
+            else {
+                Assert.assertEquals(
+                        String.format("Error while detecting mimetype in file %s", test),
+                        expectedMimeType,
+                        detectedMimeType
+                );
+            }
+            is.close();
+        }
+    }
+
+}



Mime
View raw message