Return-Path: X-Original-To: apmail-jackrabbit-oak-commits-archive@minotaur.apache.org Delivered-To: apmail-jackrabbit-oak-commits-archive@minotaur.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id A9B7911F61 for ; Fri, 2 May 2014 08:42:04 +0000 (UTC) Received: (qmail 89379 invoked by uid 500); 2 May 2014 08:42:04 -0000 Delivered-To: apmail-jackrabbit-oak-commits-archive@jackrabbit.apache.org Received: (qmail 89319 invoked by uid 500); 2 May 2014 08:42:00 -0000 Mailing-List: contact oak-commits-help@jackrabbit.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: oak-dev@jackrabbit.apache.org Delivered-To: mailing list oak-commits@jackrabbit.apache.org Received: (qmail 89277 invoked by uid 99); 2 May 2014 08:41:57 -0000 Received: from nike.apache.org (HELO nike.apache.org) (192.87.106.230) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 02 May 2014 08:41:57 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=5.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 02 May 2014 08:41:53 +0000 Received: from eris.apache.org (localhost [127.0.0.1]) by eris.apache.org (Postfix) with ESMTP id B787F238890D; Fri, 2 May 2014 08:41:29 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1591826 - in /jackrabbit/oak/trunk/oak-run: pom.xml src/main/java/org/apache/jackrabbit/oak/benchmark/wikipedia/WikipediaImport.java Date: Fri, 02 May 2014 08:41:29 -0000 To: oak-commits@jackrabbit.apache.org From: mreutegg@apache.org X-Mailer: svnmailer-1.0.9 Message-Id: <20140502084129.B787F238890D@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: mreutegg Date: Fri May 2 08:41:29 2014 New Revision: 1591826 URL: http://svn.apache.org/r1591826 Log: OAK-1790: Import of 
compressed wikipedia dump Modified: jackrabbit/oak/trunk/oak-run/pom.xml jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/benchmark/wikipedia/WikipediaImport.java Modified: jackrabbit/oak/trunk/oak-run/pom.xml URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/pom.xml?rev=1591826&r1=1591825&r2=1591826&view=diff ============================================================================== --- jackrabbit/oak/trunk/oak-run/pom.xml (original) +++ jackrabbit/oak/trunk/oak-run/pom.xml Fri May 2 08:41:29 2014 @@ -183,6 +183,11 @@ <version>2.0</version> </dependency> <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-compress</artifactId> + <version>1.8</version> + </dependency> + <dependency> <groupId>org.eclipse.jetty</groupId> <artifactId>jetty-servlet</artifactId> <version>${jetty.version}</version> Modified: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/benchmark/wikipedia/WikipediaImport.java URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/benchmark/wikipedia/WikipediaImport.java?rev=1591826&r1=1591825&r2=1591826&view=diff ============================================================================== --- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/benchmark/wikipedia/WikipediaImport.java (original) +++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/benchmark/wikipedia/WikipediaImport.java Fri May 2 08:41:29 2014 @@ -19,7 +19,9 @@ package org.apache.jackrabbit.oak.benchm import static com.google.common.base.Preconditions.checkState; import static java.lang.Math.min; +import java.io.BufferedInputStream; import java.io.File; +import java.io.FileInputStream; import javax.jcr.Node; import javax.jcr.NodeIterator; @@ -32,6 +34,7 @@ import javax.xml.stream.XMLStreamConstan import javax.xml.stream.XMLStreamReader; import javax.xml.transform.stream.StreamSource; +import org.apache.commons.compress.compressors.CompressorStreamFactory; import org.apache.jackrabbit.commons.JcrUtils; import org.apache.jackrabbit.oak.benchmark.Benchmark; import org.apache.jackrabbit.oak.fixture.RepositoryFixture; 
@@ -101,7 +104,7 @@ public class WikipediaImport extends Ben } String type = "nt:unstructured"; - if (flat) { + if (session.getWorkspace().getNodeTypeManager().hasNodeType("oak:Unstructured")) { type = "oak:Unstructured"; } Node wikipedia = session.getRootNode().addNode("wikipedia", type); @@ -118,8 +121,15 @@ public class WikipediaImport extends Ben String title = null; String text = null; XMLInputFactory factory = XMLInputFactory.newInstance(); - XMLStreamReader reader = - factory.createXMLStreamReader(new StreamSource(dump)); + StreamSource source; + if (dump.getName().endsWith(".xml")) { + source = new StreamSource(dump); + } else { + CompressorStreamFactory csf = new CompressorStreamFactory(); + source = new StreamSource(csf.createCompressorInputStream( + new BufferedInputStream(new FileInputStream(dump)))); + } + XMLStreamReader reader = factory.createXMLStreamReader(source); while (reader.hasNext()) { switch (reader.next()) { case XMLStreamConstants.START_ELEMENT: