Return-Path: X-Original-To: apmail-incubator-any23-commits-archive@minotaur.apache.org Delivered-To: apmail-incubator-any23-commits-archive@minotaur.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 0C18DDE58 for ; Mon, 3 Sep 2012 23:22:33 +0000 (UTC) Received: (qmail 4734 invoked by uid 500); 3 Sep 2012 23:22:33 -0000 Delivered-To: apmail-incubator-any23-commits-archive@incubator.apache.org Received: (qmail 4704 invoked by uid 500); 3 Sep 2012 23:22:33 -0000 Mailing-List: contact any23-commits-help@incubator.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: any23-dev@incubator.apache.org Delivered-To: mailing list any23-commits@incubator.apache.org Received: (qmail 4693 invoked by uid 99); 3 Sep 2012 23:22:33 -0000 Received: from nike.apache.org (HELO nike.apache.org) (192.87.106.230) by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 03 Sep 2012 23:22:33 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=5.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 03 Sep 2012 23:22:30 +0000 Received: from eris.apache.org (localhost [127.0.0.1]) by eris.apache.org (Postfix) with ESMTP id EC87923888E3; Mon, 3 Sep 2012 23:21:45 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1380400 - in /incubator/any23/trunk: ./ core/ core/src/main/java/org/apache/any23/encoding/ core/src/test/java/org/apache/any23/encoding/ encoding/ encoding/src/ encoding/src/main/ encoding/src/main/java/ encoding/src/main/java/org/ encodi... Date: Mon, 03 Sep 2012 23:21:45 -0000 To: any23-commits@incubator.apache.org From: ansell@apache.org X-Mailer: svnmailer-1.0.8-patched Message-Id: <20120903232145.EC87923888E3@eris.apache.org> Author: ansell Date: Mon Sep 3 23:21:44 2012 New Revision: 1380400 URL: http://svn.apache.org/viewvc?rev=1380400&view=rev Log: ANY23-118 : Split encoding detection out into its own module Added: incubator/any23/trunk/encoding/ incubator/any23/trunk/encoding/pom.xml incubator/any23/trunk/encoding/src/ incubator/any23/trunk/encoding/src/main/ incubator/any23/trunk/encoding/src/main/java/ incubator/any23/trunk/encoding/src/main/java/org/ incubator/any23/trunk/encoding/src/main/java/org/apache/ incubator/any23/trunk/encoding/src/main/java/org/apache/any23/ incubator/any23/trunk/encoding/src/main/java/org/apache/any23/encoding/ incubator/any23/trunk/encoding/src/main/java/org/apache/any23/encoding/TikaEncodingDetector.java incubator/any23/trunk/encoding/src/test/ incubator/any23/trunk/encoding/src/test/java/ incubator/any23/trunk/encoding/src/test/java/org/ incubator/any23/trunk/encoding/src/test/java/org/apache/ incubator/any23/trunk/encoding/src/test/java/org/apache/any23/ incubator/any23/trunk/encoding/src/test/java/org/apache/any23/encoding/ incubator/any23/trunk/encoding/src/test/java/org/apache/any23/encoding/TikaEncodingDetectorTest.java Removed: incubator/any23/trunk/core/src/main/java/org/apache/any23/encoding/TikaEncodingDetector.java incubator/any23/trunk/core/src/test/java/org/apache/any23/encoding/TikaEncodingDetectorTest.java Modified: incubator/any23/trunk/core/pom.xml incubator/any23/trunk/pom.xml Modified: incubator/any23/trunk/core/pom.xml URL: http://svn.apache.org/viewvc/incubator/any23/trunk/core/pom.xml?rev=1380400&r1=1380399&r2=1380400&view=diff ============================================================================== --- incubator/any23/trunk/core/pom.xml (original) +++ incubator/any23/trunk/core/pom.xml Mon Sep 3 23:21:44 2012 @@ -47,6 +47,11 @@ ${project.groupId} + apache-any23-encoding + ${project.version} + + + ${project.groupId} apache-any23-nquads ${project.version} test Added: incubator/any23/trunk/encoding/pom.xml URL: http://svn.apache.org/viewvc/incubator/any23/trunk/encoding/pom.xml?rev=1380400&view=auto ============================================================================== --- incubator/any23/trunk/encoding/pom.xml (added) +++ incubator/any23/trunk/encoding/pom.xml Mon Sep 3 23:21:44 2012 @@ -0,0 +1,40 @@ + + 4.0.0 + + apache-any23 + org.apache.any23 + 0.7.1-incubating-SNAPSHOT + .. + + apache-any23-encoding + Apache Any23 :: Encoding Detection + + + ${project.groupId} + apache-any23-api + ${project.version} + + + ${project.groupId} + apache-any23-test-resources + ${project.version} + test + test-jar + + + org.apache.tika + tika-parsers + + + junit + junit + test + + + org.slf4j + slf4j-log4j12 + test + + + Added: incubator/any23/trunk/encoding/src/main/java/org/apache/any23/encoding/TikaEncodingDetector.java URL: http://svn.apache.org/viewvc/incubator/any23/trunk/encoding/src/main/java/org/apache/any23/encoding/TikaEncodingDetector.java?rev=1380400&view=auto ============================================================================== --- incubator/any23/trunk/encoding/src/main/java/org/apache/any23/encoding/TikaEncodingDetector.java (added) +++ incubator/any23/trunk/encoding/src/main/java/org/apache/any23/encoding/TikaEncodingDetector.java Mon Sep 3 23:21:44 2012 @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.encoding; + +import org.apache.tika.parser.txt.CharsetDetector; +import org.apache.tika.parser.txt.CharsetMatch; + +import java.io.BufferedInputStream; +import java.io.IOException; +import java.io.InputStream; + +/** + * An implementation of {@link EncodingDetector} based on + * Apache Tika. + * + * @author Michele Mostarda ( michele.mostarda@gmail.com ) + * @author Davide Palmisano ( dpalmisano@gmail.com ) + * @version $Id$ + */ +public class TikaEncodingDetector implements EncodingDetector { + + public String guessEncoding(InputStream is) throws IOException { + CharsetDetector charsetDetector = new CharsetDetector(); + charsetDetector.setText( is instanceof BufferedInputStream ? is : new BufferedInputStream(is) ); + charsetDetector.enableInputFilter(true); + CharsetMatch cm = charsetDetector.detect(); + return cm.getName(); + } + +} Added: incubator/any23/trunk/encoding/src/test/java/org/apache/any23/encoding/TikaEncodingDetectorTest.java URL: http://svn.apache.org/viewvc/incubator/any23/trunk/encoding/src/test/java/org/apache/any23/encoding/TikaEncodingDetectorTest.java?rev=1380400&view=auto ============================================================================== --- incubator/any23/trunk/encoding/src/test/java/org/apache/any23/encoding/TikaEncodingDetectorTest.java (added) +++ incubator/any23/trunk/encoding/src/test/java/org/apache/any23/encoding/TikaEncodingDetectorTest.java Mon Sep 3 23:21:44 2012 @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.encoding; + +import junit.framework.Assert; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import java.io.IOException; +import java.io.InputStream; + +/** + * Test case for {@link TikaEncodingDetector}. + * + * @author Michele Mostarda ( michele.mostarda@gmail.com ) + * @author Davide Palmisano ( dpalmisano@gmail.com ) + * @version $Id$ + */ +public class TikaEncodingDetectorTest { + + private TikaEncodingDetector detector; + + @Before + public void setUp() { + detector = new TikaEncodingDetector(); + } + + @After + public void tearDown() { + detector = null; + } + + @Test + public void testISO8859HTML() throws IOException { + assertEncoding( "ISO-8859-1", "/microformats/xfn/encoding-iso-8859-1.html" ); + } + + @Test + public void testISO8859XHTML() throws IOException { + assertEncoding( "ISO-8859-1", "/microformats/xfn/encoding-iso-8859-1.xhtml" ); + } + + @Test + public void testUTF8AfterTitle() throws IOException { + assertEncoding( "UTF-8", "/microformats/xfn/encoding-utf-8-after-title.html" ); + } + + @Test + public void testUTF8HTML() throws IOException { + assertEncoding( "UTF-8", "/microformats/xfn/encoding-utf-8.html" ); + } + + @Test + public void testUTF8XHTML() throws IOException { + assertEncoding( "UTF-8", "/microformats/xfn/encoding-utf-8.xhtml" ); + } + + @Test + public void testEncodingHTML() throws IOException { + assertEncoding( "UTF-8", "/html/encoding-test.html" ); + } + + private void assertEncoding(final String expected, final String resource) throws IOException { + InputStream fis = this.getClass().getResourceAsStream(resource); + try { + String encoding = detector.guessEncoding(fis); + Assert.assertEquals( "Unexpected encoding", expected, encoding ); + } finally { + fis.close(); + } + } + +} Modified: incubator/any23/trunk/pom.xml URL: http://svn.apache.org/viewvc/incubator/any23/trunk/pom.xml?rev=1380400&r1=1380399&r2=1380400&view=diff ============================================================================== --- incubator/any23/trunk/pom.xml (original) +++ incubator/any23/trunk/pom.xml Mon Sep 3 23:21:44 2012 @@ -193,6 +193,7 @@ nquads csvutils mime + encoding core plugins/basic-crawler plugins/html-scraper