incubator-any23-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ans...@apache.org
Subject svn commit: r1380400 - in /incubator/any23/trunk: ./ core/ core/src/main/java/org/apache/any23/encoding/ core/src/test/java/org/apache/any23/encoding/ encoding/ encoding/src/ encoding/src/main/ encoding/src/main/java/ encoding/src/main/java/org/ encodi...
Date Mon, 03 Sep 2012 23:21:45 GMT
Author: ansell
Date: Mon Sep  3 23:21:44 2012
New Revision: 1380400

URL: http://svn.apache.org/viewvc?rev=1380400&view=rev
Log:
ANY23-118 : Split encoding detection out into its own module

Added:
    incubator/any23/trunk/encoding/
    incubator/any23/trunk/encoding/pom.xml
    incubator/any23/trunk/encoding/src/
    incubator/any23/trunk/encoding/src/main/
    incubator/any23/trunk/encoding/src/main/java/
    incubator/any23/trunk/encoding/src/main/java/org/
    incubator/any23/trunk/encoding/src/main/java/org/apache/
    incubator/any23/trunk/encoding/src/main/java/org/apache/any23/
    incubator/any23/trunk/encoding/src/main/java/org/apache/any23/encoding/
    incubator/any23/trunk/encoding/src/main/java/org/apache/any23/encoding/TikaEncodingDetector.java
    incubator/any23/trunk/encoding/src/test/
    incubator/any23/trunk/encoding/src/test/java/
    incubator/any23/trunk/encoding/src/test/java/org/
    incubator/any23/trunk/encoding/src/test/java/org/apache/
    incubator/any23/trunk/encoding/src/test/java/org/apache/any23/
    incubator/any23/trunk/encoding/src/test/java/org/apache/any23/encoding/
    incubator/any23/trunk/encoding/src/test/java/org/apache/any23/encoding/TikaEncodingDetectorTest.java
Removed:
    incubator/any23/trunk/core/src/main/java/org/apache/any23/encoding/TikaEncodingDetector.java
    incubator/any23/trunk/core/src/test/java/org/apache/any23/encoding/TikaEncodingDetectorTest.java
Modified:
    incubator/any23/trunk/core/pom.xml
    incubator/any23/trunk/pom.xml

Modified: incubator/any23/trunk/core/pom.xml
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/core/pom.xml?rev=1380400&r1=1380399&r2=1380400&view=diff
==============================================================================
--- incubator/any23/trunk/core/pom.xml (original)
+++ incubator/any23/trunk/core/pom.xml Mon Sep  3 23:21:44 2012
@@ -47,6 +47,11 @@
     </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
+      <artifactId>apache-any23-encoding</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
       <artifactId>apache-any23-nquads</artifactId>
       <version>${project.version}</version>
       <scope>test</scope>

Added: incubator/any23/trunk/encoding/pom.xml
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/encoding/pom.xml?rev=1380400&view=auto
==============================================================================
--- incubator/any23/trunk/encoding/pom.xml (added)
+++ incubator/any23/trunk/encoding/pom.xml Mon Sep  3 23:21:44 2012
@@ -0,0 +1,40 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+    <artifactId>apache-any23</artifactId>
+    <groupId>org.apache.any23</groupId>
+    <version>0.7.1-incubating-SNAPSHOT</version>
+    <relativePath>..</relativePath>
+  </parent>
+  <artifactId>apache-any23-encoding</artifactId>
+  <name>Apache Any23 :: Encoding Detection</name>
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>apache-any23-api</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>apache-any23-test-resources</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
+      <type>test-jar</type>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.tika</groupId>
+      <artifactId>tika-parsers</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>junit</groupId>
+      <artifactId>junit</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-log4j12</artifactId>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+</project>

Added: incubator/any23/trunk/encoding/src/main/java/org/apache/any23/encoding/TikaEncodingDetector.java
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/encoding/src/main/java/org/apache/any23/encoding/TikaEncodingDetector.java?rev=1380400&view=auto
==============================================================================
--- incubator/any23/trunk/encoding/src/main/java/org/apache/any23/encoding/TikaEncodingDetector.java (added)
+++ incubator/any23/trunk/encoding/src/main/java/org/apache/any23/encoding/TikaEncodingDetector.java Mon Sep  3 23:21:44 2012
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.encoding;
+
+import org.apache.tika.parser.txt.CharsetDetector;
+import org.apache.tika.parser.txt.CharsetMatch;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * An implementation of {@link EncodingDetector} based on
+ * <a href="http://tika.apache.org/">Apache Tika</a>.
+ *
+ * @author Michele Mostarda ( michele.mostarda@gmail.com )
+ * @author Davide Palmisano ( dpalmisano@gmail.com )
+ * @version $Id$
+ */
+public class TikaEncodingDetector implements EncodingDetector {
+
+    public String guessEncoding(InputStream is) throws IOException {
+        CharsetDetector charsetDetector = new CharsetDetector();
+        charsetDetector.setText( is instanceof BufferedInputStream ? is : new BufferedInputStream(is) );
+        charsetDetector.enableInputFilter(true);
+        CharsetMatch cm = charsetDetector.detect();
+        return cm.getName();
+    }
+
+}

Added: incubator/any23/trunk/encoding/src/test/java/org/apache/any23/encoding/TikaEncodingDetectorTest.java
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/encoding/src/test/java/org/apache/any23/encoding/TikaEncodingDetectorTest.java?rev=1380400&view=auto
==============================================================================
--- incubator/any23/trunk/encoding/src/test/java/org/apache/any23/encoding/TikaEncodingDetectorTest.java (added)
+++ incubator/any23/trunk/encoding/src/test/java/org/apache/any23/encoding/TikaEncodingDetectorTest.java Mon Sep  3 23:21:44 2012
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.encoding;
+
+import junit.framework.Assert;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * Test case for {@link TikaEncodingDetector}.
+ *
+ * @author Michele Mostarda ( michele.mostarda@gmail.com )
+ * @author Davide Palmisano ( dpalmisano@gmail.com )
+ * @version $Id$
+ */
+public class TikaEncodingDetectorTest {
+
+    private TikaEncodingDetector detector;
+
+    @Before
+    public void setUp() {
+        detector = new TikaEncodingDetector();
+    }
+
+    @After
+    public void tearDown() {
+        detector = null;
+    }
+
+    @Test
+    public void testISO8859HTML() throws IOException {
+         assertEncoding( "ISO-8859-1", "/microformats/xfn/encoding-iso-8859-1.html" );
+    }
+
+    @Test
+    public void testISO8859XHTML() throws IOException {
+         assertEncoding( "ISO-8859-1", "/microformats/xfn/encoding-iso-8859-1.xhtml" );
+    }
+
+    @Test
+    public void testUTF8AfterTitle() throws IOException {
+         assertEncoding( "UTF-8", "/microformats/xfn/encoding-utf-8-after-title.html" );
+    }
+
+    @Test
+    public void testUTF8HTML() throws IOException {
+         assertEncoding( "UTF-8", "/microformats/xfn/encoding-utf-8.html" );
+    }
+
+    @Test
+    public void testUTF8XHTML() throws IOException {
+         assertEncoding( "UTF-8", "/microformats/xfn/encoding-utf-8.xhtml" );
+    }
+
+    @Test
+    public void testEncodingHTML() throws IOException {
+         assertEncoding( "UTF-8", "/html/encoding-test.html" );
+    }
+
+    private void assertEncoding(final String expected, final String resource) throws IOException {
+        InputStream fis = this.getClass().getResourceAsStream(resource);
+        try {
+            String encoding = detector.guessEncoding(fis);
+            Assert.assertEquals( "Unexpected encoding", expected, encoding );
+        } finally {
+            fis.close();
+        }
+    }
+
+}

Modified: incubator/any23/trunk/pom.xml
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/pom.xml?rev=1380400&r1=1380399&r2=1380400&view=diff
==============================================================================
--- incubator/any23/trunk/pom.xml (original)
+++ incubator/any23/trunk/pom.xml Mon Sep  3 23:21:44 2012
@@ -193,6 +193,7 @@
     <module>nquads</module>
     <module>csvutils</module>
     <module>mime</module>
+    <module>encoding</module>
     <module>core</module>
     <module>plugins/basic-crawler</module>
     <module>plugins/html-scraper</module>



Mime
View raw message