commons-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From bode...@apache.org
Subject [commons-compress] 02/06: COMPRESS-124 : add testcases for extracting sparse
Date Wed, 01 Jan 2020 17:11:34 GMT
This is an automated email from the ASF dual-hosted git repository.

bodewig pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-compress.git

commit 50569e5bfb1526c54acf0abe5a6e4d5463c5a4bd
Author: Lee <peteralfredlee@gmail.com>
AuthorDate: Fri Nov 22 17:24:10 2019 +0800

    COMPRESS-124 : add testcases for extracting sparse
---
 .../compress/archivers/tar/TarArchiveEntry.java    |  24 +--
 .../archivers/tar/TarArchiveInputStream.java       | 237 ++++++++++++++++++---
 .../archivers/tar/TarArchiveSparseEntry.java       |   2 +
 .../archivers/tar/TarArchiveSparseInputStream.java | 222 -------------------
 .../archivers/tar/TarArchiveStructSparse.java      |  24 +++
 .../compress/archivers/tar/SparseFilesTest.java    | 205 +++++++++++++++++-
 .../compress/archivers/tar/TarUtilsTest.java       |  12 ++
 src/test/resources/oldgnu_extended_sparse.tar      | Bin 0 -> 10240 bytes
 8 files changed, 452 insertions(+), 274 deletions(-)

diff --git a/src/main/java/org/apache/commons/compress/archivers/tar/TarArchiveEntry.java
b/src/main/java/org/apache/commons/compress/archivers/tar/TarArchiveEntry.java
index 9012a3c..2aac73e 100644
--- a/src/main/java/org/apache/commons/compress/archivers/tar/TarArchiveEntry.java
+++ b/src/main/java/org/apache/commons/compress/archivers/tar/TarArchiveEntry.java
@@ -852,10 +852,14 @@ public class TarArchiveEntry implements ArchiveEntry, TarConstants {
 
     /**
      * Get this entry's real file size in case of a sparse file.
+     * If the file is not a sparse file, return size instead of realSize.
      *
-     * @return This entry's real file size.
+     * @return This entry's real file size, if the file is not a sparse file, return size
instead of realSize.
      */
     public long getRealSize() {
+        if (!isSparse()) {
+            return size;
+        }
         return realSize;
     }
 
@@ -1077,16 +1081,13 @@ public class TarArchiveEntry implements ArchiveEntry, TarConstants
{
     /**
      * Update the entry using a map of pax headers.
      * @param headers
-     * @param sparseHeaders for 0.0 PAX Format, the sparse headers may appear more than 1
time in headers map,
-*    *                      this means it can not be read from a map, therefore the sparse
headers have already
-*    *                      been parsed to a list and was passed through parameter sparseHeaders
      * @since 1.15
      */
-    void updateEntryFromPaxHeaders(Map<String, String> headers, final List<TarArchiveStructSparse>
sparseHeaders) {
+    void updateEntryFromPaxHeaders(Map<String, String> headers) {
         for (final Map.Entry<String, String> ent : headers.entrySet()) {
             final String key = ent.getKey();
             final String val = ent.getValue();
-            processPaxHeader(key, val, headers, sparseHeaders);
+            processPaxHeader(key, val, headers);
         }
     }
 
@@ -1101,10 +1102,6 @@ public class TarArchiveEntry implements ArchiveEntry, TarConstants
{
         processPaxHeader(key,val,extraPaxHeaders);
     }
 
-    private void processPaxHeader(String key, String val, Map<String, String> headers)
{
-        processPaxHeader(key, val, headers, null);
-    }
-
     /**
      * Process one pax header, using the supplied map as source for extra headers to be used
when handling
      * entries for sparse files
@@ -1112,13 +1109,9 @@ public class TarArchiveEntry implements ArchiveEntry, TarConstants
{
      * @param key  the header name.
      * @param val  the header value.
      * @param headers  map of headers used for dealing with sparse file.
-     * @param sparseHeaders  for 0.0 PAX Format, the sparse headers may appear more than
1 time in headers map,
-     *                       this means it can not be read from a map, therefore the sparse
headers have already
-     *                       been parsed to a list and was passed through parameter sparseHeaders
      * @since 1.15
      */
-    private void processPaxHeader(String key, String val, Map<String, String> headers,
-                                  final List<TarArchiveStructSparse> sparseHeaders)
{
+    private void processPaxHeader(String key, String val, Map<String, String> headers)
{
     /*
      * The following headers are defined for Pax.
      * atime, ctime, charset: cannot use these without changing TarArchiveEntry fields
@@ -1172,7 +1165,6 @@ public class TarArchiveEntry implements ArchiveEntry, TarConstants {
                 break;
             case "GNU.sparse.size":
                 fillGNUSparse0xData(headers);
-                this.sparseHeaders = sparseHeaders;
                 break;
             case "GNU.sparse.realsize":
                 fillGNUSparse1xData(headers);
diff --git a/src/main/java/org/apache/commons/compress/archivers/tar/TarArchiveInputStream.java
b/src/main/java/org/apache/commons/compress/archivers/tar/TarArchiveInputStream.java
index 9194db8..72b6653 100644
--- a/src/main/java/org/apache/commons/compress/archivers/tar/TarArchiveInputStream.java
+++ b/src/main/java/org/apache/commons/compress/archivers/tar/TarArchiveInputStream.java
@@ -26,16 +26,14 @@ package org.apache.commons.compress.archivers.tar;
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
+import java.util.*;
 
 import org.apache.commons.compress.archivers.ArchiveEntry;
 import org.apache.commons.compress.archivers.ArchiveInputStream;
 import org.apache.commons.compress.archivers.zip.ZipEncoding;
 import org.apache.commons.compress.archivers.zip.ZipEncodingHelper;
 import org.apache.commons.compress.utils.ArchiveUtils;
+import org.apache.commons.compress.utils.BoundedInputStream;
 import org.apache.commons.compress.utils.CharsetNames;
 import org.apache.commons.compress.utils.IOUtils;
 
@@ -70,7 +68,12 @@ public class TarArchiveInputStream extends ArchiveInputStream {
     /** An input stream to read from */
     private final InputStream is;
 
-    /** An input stream to read sparse file */
+    /** Input streams for reading sparse entries **/
+    private List<InputStream> sparseInputStreams;
+
+    /** the index of current input stream being read when reading sparse entries */
+    private int currentSparseInputStreamIndex;
+
     private InputStream sparseInputStream;
 
     /** The meta-data about the current entry */
@@ -86,7 +89,7 @@ public class TarArchiveInputStream extends ArchiveInputStream {
     private Map<String, String> globalPaxHeaders = new HashMap<>();
 
     // the global sparse headers, this is only used in PAX Format 0.X
-    private List<TarArchiveStructSparse> globalSparseHeaders = new ArrayList<>();
+    private final List<TarArchiveStructSparse> globalSparseHeaders = new ArrayList<>();
 
     private final boolean lenient;
 
@@ -193,6 +196,13 @@ public class TarArchiveInputStream extends ArchiveInputStream {
      */
     @Override
     public void close() throws IOException {
+        // Close all the input streams in sparseInputStreams
+        if(sparseInputStreams != null) {
+            for (InputStream inputStream : sparseInputStreams) {
+                inputStream.close();
+            }
+        }
+
         is.close();
     }
 
@@ -223,16 +233,10 @@ public class TarArchiveInputStream extends ArchiveInputStream {
             return 0;
         }
 
-        // for sparse entries, there are actually currEntry.getRealSize() bytes to read
-        long entryActualSize = entrySize;
-        if (currEntry.isSparse()) {
-            entryActualSize = currEntry.getRealSize();
-        }
-
-        if (entryActualSize - entryOffset > Integer.MAX_VALUE) {
+        if (currEntry.getRealSize() - entryOffset > Integer.MAX_VALUE) {
             return Integer.MAX_VALUE;
         }
-        return (int) (entryActualSize - entryOffset);
+        return (int) (currEntry.getRealSize() - entryOffset);
     }
 
 
@@ -258,15 +262,12 @@ public class TarArchiveInputStream extends ArchiveInputStream {
             return 0;
         }
 
-        long available;
+        long available = currEntry.getRealSize() - entryOffset;
         long skipped;
         if(!currEntry.isSparse()) {
-            available = entrySize - entryOffset;
             skipped = IOUtils.skip(is, Math.min(n, available));
         } else {
-            // for sparse entries, there are actually currEntry.getRealSize() bytes to read
-            available = currEntry.getRealSize() - entryOffset;
-            skipped = IOUtils.skip(sparseInputStream, Math.min(n, available));
+            skipped = skipSparse(n);
         }
         count(skipped);
         entryOffset += skipped;
@@ -274,6 +275,35 @@ public class TarArchiveInputStream extends ArchiveInputStream {
     }
 
     /**
+     * Skip n bytes from current input stream, if the current input stream doesn't have enough
data to skip,
+     * jump to the next input stream and skip the rest bytes, keep doing this until total
n bytes are skipped
+     * or the input streams are all skipped
+     *
+     * @param n bytes of data to skip
+     * @return actual bytes of data skipped
+     * @throws IOException
+     */
+    private long skipSparse(final long n) throws IOException {
+        if (sparseInputStreams.size() == 0) {
+            return is.skip(n);
+        }
+
+        long bytesSkipped = 0;
+        InputStream currentInputStream;
+
+        while (bytesSkipped < n && currentSparseInputStreamIndex < sparseInputStreams.size())
{
+            currentInputStream = sparseInputStreams.get(currentSparseInputStreamIndex);
+            bytesSkipped += currentInputStream.skip(n - bytesSkipped);
+
+            if (bytesSkipped < n) {
+                currentSparseInputStreamIndex++;
+            }
+        }
+
+        return bytesSkipped;
+    }
+
+    /**
      * Since we do not support marking just yet, we return false.
      *
      * @return False.
@@ -536,8 +566,8 @@ public class TarArchiveInputStream extends ArchiveInputStream {
         }
 
         // sparse headers are all done reading, we need to build
-        // a new input stream using these sparse headers
-        sparseInputStream = new TarArchiveSparseInputStream(sparseHeaders, is);
+        // sparse input streams using these sparse headers
+        buildSparseInputStreams();
     }
 
     /**
@@ -717,8 +747,8 @@ public class TarArchiveInputStream extends ArchiveInputStream {
     }
 
     private void applyPaxHeadersToCurrentEntry(final Map<String, String> headers, final
List<TarArchiveStructSparse> sparseHeaders) {
-        currEntry.updateEntryFromPaxHeaders(headers, sparseHeaders);
-
+        currEntry.updateEntryFromPaxHeaders(headers);
+        currEntry.setSparseHeaders(sparseHeaders);
     }
 
     /**
@@ -742,8 +772,8 @@ public class TarArchiveInputStream extends ArchiveInputStream {
         }
 
         // sparse headers are all done reading, we need to build
-        // a new input stream using these sparse headers
-        sparseInputStream = new TarArchiveSparseInputStream(currEntry.getSparseHeaders(),
is);
+        // sparse input streams using these sparse headers
+        buildSparseInputStreams();
     }
 
     private boolean isDirectory() {
@@ -816,19 +846,22 @@ public class TarArchiveInputStream extends ArchiveInputStream {
             throw new IllegalStateException("No current tar entry");
         }
 
-        if(!currEntry.isSparse() && entryOffset >= entrySize) {
-            return -1;
-        }
-
-        // for sparse entries, there are actually currEntry.getRealSize() bytes to read
-        if(currEntry.isSparse() && entryOffset >= currEntry.getRealSize()) {
-            return -1;
+        if (!currEntry.isSparse()) {
+            if (entryOffset >= entrySize) {
+                return -1;
+            }
+        } else {
+            // for sparse entries, there are actually currEntry.getRealSize() bytes to read
+            if (entryOffset >= currEntry.getRealSize()) {
+                return -1;
+            }
         }
 
         numToRead = Math.min(numToRead, available());
 
-        if(currEntry.isSparse()) {
-            totalRead = sparseInputStream.read(buf, offset, numToRead);
+        if (currEntry.isSparse()) {
+            // for sparse entries, we need to read them in another way
+            totalRead = readSparse(buf, offset, numToRead);
         } else {
             totalRead = is.read(buf, offset, numToRead);
         }
@@ -847,6 +880,61 @@ public class TarArchiveInputStream extends ArchiveInputStream {
     }
 
     /**
+     * For sparse tar entries, there are many "holes"(consisting of all 0) in the file. Only
the non-zero data is
+     * stored in tar files, and they are stored separately. The structure of non-zero data
is introduced by the
+     * sparse headers using the offset, where a block of non-zero data starts, and numbytes,
the length of the
+     * non-zero data block.
+     * When reading sparse entries, the actual data is read out with "holes" and non-zero
data combined together
+     * according to the sparse headers.
+     *
+     * @param buf The buffer into which to place bytes read.
+     * @param offset The offset at which to place bytes read.
+     * @param numToRead The number of bytes to read.
+     * @return The number of bytes read, or -1 at EOF.
+     * @throws IOException on error
+     */
+    private int readSparse(final byte[] buf, final int offset, int numToRead) throws IOException
{
+        // if there are no actual input streams, just read from the original input stream
+        if (sparseInputStreams.size() == 0) {
+            return is.read(buf, offset, numToRead);
+        }
+
+        if(currentSparseInputStreamIndex >= sparseInputStreams.size()) {
+            return -1;
+        }
+
+        InputStream currentInputStream = sparseInputStreams.get(currentSparseInputStreamIndex);
+        int readLen = currentInputStream.read(buf, offset, numToRead);
+
+        // if the current input stream is the last input stream,
+        // just return the number of bytes read from current input stream
+        if (currentSparseInputStreamIndex == sparseInputStreams.size() - 1) {
+            return readLen;
+        }
+
+        // if EOF of current input stream is met, open a new input stream and recursively
call read
+        if (readLen == -1) {
+            currentSparseInputStreamIndex++;
+            return readSparse(buf, offset, numToRead);
+        }
+
+        // if the remaining data of the current input stream is not long enough, open a new input
stream
+        // and recursively call read
+        if (readLen < numToRead) {
+            currentSparseInputStreamIndex++;
+            int readLenOfNext = readSparse(buf, offset + readLen, numToRead - readLen);
+            if (readLenOfNext == -1) {
+                return readLen;
+            }
+
+            return readLen + readLenOfNext;
+        }
+
+        // if the remaining data of the current input stream is enough (which means readLen == numToRead),
just return readLen
+        return readLen;
+    }
+
+    /**
      * Whether this class is able to read the given entry.
      *
      * <p>May return false if the current entry is a sparse file.</p>
@@ -937,4 +1025,85 @@ public class TarArchiveInputStream extends ArchiveInputStream {
                         signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN);
     }
 
+    /**
+     * Build the input streams consisting of all-zero input streams and non-zero input streams.
+     * When reading from the non-zero input streams, the data is actually read from the original
input stream.
+     * The size of each input stream is introduced by the sparse headers.
+     *
+     * NOTE : Some all-zero input streams and non-zero input streams have the size of 0.
We DO NOT store the
+     *        0 size input streams because they are meaningless.
+     */
+    private void buildSparseInputStreams() throws IOException {
+        currentSparseInputStreamIndex = -1;
+        sparseInputStreams = new ArrayList<>();
+        InputStream zeroInputStream = new TarArchiveSparseZeroInputStream();
+
+        long offset = 0;
+        List<TarArchiveStructSparse> sparseHeaders = currEntry.getSparseHeaders();
+        // sort the sparse headers in case they are written in wrong order
+        if (sparseHeaders != null && sparseHeaders.size() > 1) {
+            final Comparator<TarArchiveStructSparse> sparseHeaderComparator = new Comparator<TarArchiveStructSparse>()
{
+                @Override
+                public int compare(final TarArchiveStructSparse p, final TarArchiveStructSparse
q) {
+                    Long pOffset = p.getOffset();
+                    Long qOffset = q.getOffset();
+                    return pOffset.compareTo(qOffset);
+                }
+            };
+            Collections.sort(sparseHeaders, sparseHeaderComparator);
+        }
+
+        for (TarArchiveStructSparse sparseHeader : sparseHeaders) {
+            if (sparseHeader.getOffset() == 0 && sparseHeader.getNumbytes() == 0)
{
+                break;
+            }
+
+            if ((sparseHeader.getOffset() - offset) < 0) {
+                throw new IOException("Corrupted struct sparse detected");
+            }
+
+            // only store the input streams with non-zero size
+            if ((sparseHeader.getOffset() - offset) > 0) {
+                sparseInputStreams.add(new BoundedInputStream(zeroInputStream, sparseHeader.getOffset()
- offset));
+            }
+
+            // only store the input streams with non-zero size
+            if (sparseHeader.getNumbytes() > 0) {
+                sparseInputStreams.add(new BoundedInputStream(is, sparseHeader.getNumbytes()));
+            }
+
+            offset = sparseHeader.getOffset() + sparseHeader.getNumbytes();
+        }
+
+        if (sparseInputStreams.size() > 0) {
+            currentSparseInputStreamIndex = 0;
+        }
+    }
+
+    /**
+     * This is an input stream that always returns 0,
+     * this is used when reading the "holes" of a sparse file
+     */
+    public class TarArchiveSparseZeroInputStream extends InputStream {
+        /**
+         * Just return 0
+         * @return
+         * @throws IOException
+         */
+        @Override
+        public int read() throws IOException {
+            return 0;
+        }
+
+        /**
+         * there is nothing to do when skipping
+         *
+         * @param n bytes to skip
+         * @return bytes actually skipped
+         */
+        @Override
+        public long skip(final long n) {
+            return n;
+        }
+    }
 }
diff --git a/src/main/java/org/apache/commons/compress/archivers/tar/TarArchiveSparseEntry.java
b/src/main/java/org/apache/commons/compress/archivers/tar/TarArchiveSparseEntry.java
index 7a8e2ee..5599e06 100644
--- a/src/main/java/org/apache/commons/compress/archivers/tar/TarArchiveSparseEntry.java
+++ b/src/main/java/org/apache/commons/compress/archivers/tar/TarArchiveSparseEntry.java
@@ -19,6 +19,7 @@
 package org.apache.commons.compress.archivers.tar;
 
 import java.io.IOException;
+import java.util.ArrayList;
 import java.util.List;
 
 /**
@@ -56,6 +57,7 @@ public class TarArchiveSparseEntry implements TarConstants {
      */
     public TarArchiveSparseEntry(final byte[] headerBuf) throws IOException {
         int offset = 0;
+        sparseHeaders = new ArrayList<>();
         for(int i = 0; i < SPARSE_HEADERS_IN_EXTENSION_HEADER;i++) {
             TarArchiveStructSparse sparseHeader = TarUtils.parseSparse(headerBuf,
                     offset + i * (SPARSE_OFFSET_LEN + SPARSE_NUMBYTES_LEN));
diff --git a/src/main/java/org/apache/commons/compress/archivers/tar/TarArchiveSparseInputStream.java
b/src/main/java/org/apache/commons/compress/archivers/tar/TarArchiveSparseInputStream.java
deleted file mode 100644
index 7dd609b..0000000
--- a/src/main/java/org/apache/commons/compress/archivers/tar/TarArchiveSparseInputStream.java
+++ /dev/null
@@ -1,222 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.commons.compress.archivers.tar;
-
-import org.apache.commons.compress.utils.BoundedInputStream;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.List;
-
-/**
- * For sparse tar entries, there are many "holes"(consisting of all 0) in the file. Only
the non-zero data is
- * stored in tar files, and they are stored separately. The structure of non-zero data is
introduced by the
- * sparse headers using the offset, where a block of non-zero data starts, and numbytes,
the length of the
- * non-zero data block.
- * This class is used to construct an input stream that combines the "holes" and the non-zero
data together using
- * the sparse headers. When reading from this input stream, the actual data is read out with
"holes" and non-zero
- * data combined together according to the sparse headers.
- */
-public class TarArchiveSparseInputStream extends InputStream {
-    /** the sparse headers describing the sparse information */
-    private List<TarArchiveStructSparse> sparseHeaders;
-
-    /** the input stream of the tar file */
-    private InputStream inputStream;
-
-    /** the input streams consisting of all-zero input streams and non-zero streams */
-    private List<InputStream> inputStreams;
-
-    /** the index of current input stream being read */
-    private int currentInputStreamIndex = -1;
-
-    public TarArchiveSparseInputStream(List<TarArchiveStructSparse> sparseHeaders,
InputStream inputStream) {
-        this.sparseHeaders = sparseHeaders;
-        this.inputStream = inputStream;
-        buildInputStreams();
-
-        if (inputStreams.size() > 0) {
-            currentInputStreamIndex = 0;
-        }
-    }
-
-    @Override
-    public int read() throws IOException {
-        // if there are no actual input streams, just read from the original input stream
-        if (inputStreams.size() == 0) {
-            return inputStream.read();
-        }
-
-        int value = inputStreams.get(currentInputStreamIndex).read();
-        if (value != -1) {
-            return value;
-        }
-
-        if (currentInputStreamIndex == inputStreams.size() - 1) {
-            return -1;
-        }
-
-        currentInputStreamIndex++;
-        return inputStreams.get(currentInputStreamIndex).read();
-    }
-
-    @Override
-    public int read(byte[] buf) throws IOException {
-        return read(buf, 0, buf.length);
-    }
-
-    @Override
-    public int read(byte[] buf, int offset, int len) throws IOException {
-        // if there are no actual input streams, just read from the original input stream
-        if (inputStreams.size() == 0) {
-            return inputStream.read(buf, offset, len);
-        }
-
-        InputStream currentInputStream = inputStreams.get(currentInputStreamIndex);
-        int readLen = currentInputStream.read(buf, offset, len);
-
-        // if the current input stream is the last input stream,
-        // just return the number of bytes read from current input stream
-        if (currentInputStreamIndex == inputStreams.size() - 1) {
-            return readLen;
-        }
-
-        // if EOF of current input stream is meet, open a new input stream and recursively
call read
-        if (readLen == -1) {
-            currentInputStreamIndex++;
-            return read(buf, offset, len);
-        }
-
-        // if the rest data of current input stream is not long enough, open a new input
stream
-        // and recursively call read
-        if (readLen < len) {
-            currentInputStreamIndex++;
-            int readLenOfNext = read(buf, offset + readLen, len - readLen);
-            if (readLenOfNext == -1) {
-                return readLen;
-            }
-
-            return readLen + readLenOfNext;
-        }
-
-        // if the rest data of current input stream is enough(which means readLen == len),
just return readLen
-        return readLen;
-    }
-
-    /**
-     * Skip n bytes from current input stream, if the current input stream doesn't have enough
data to skip,
-     * jump to the next input stream and skip the rest bytes, keep doing this until total
n bytes are skipped
-     * or the input streams are all skipped
-     *
-     * @param n bytes of data to skip
-     * @return actual bytes of data skipped
-     * @throws IOException
-     */
-    @Override
-    public long skip(final long n) throws IOException {
-        if (inputStreams.size() == 0) {
-            return inputStream.skip(n);
-        }
-
-        long bytesSkipped = 0;
-        InputStream currentInputStream;
-
-        while (bytesSkipped < n && currentInputStreamIndex < inputStreams.size())
{
-            currentInputStream = inputStreams.get(currentInputStreamIndex);
-            bytesSkipped += currentInputStream.skip(n - bytesSkipped);
-
-            if (bytesSkipped < n) {
-                currentInputStreamIndex++;
-            }
-        }
-
-        return bytesSkipped;
-    }
-
-    /**
-     * Close all the input streams in inputStreams
-     * @throws IOException
-     */
-    @Override
-    public void close() throws IOException {
-        for (InputStream inputStream : inputStreams) {
-            inputStream.close();
-        }
-    }
-
-    /**
-     * Build the input streams consisting of all-zero input streams and non-zero input streams.
-     * When reading from the non-zero input streams, the data is actually read from the original
input stream.
-     * The size of each input stream is introduced by the sparse headers.
-     *
-     * NOTE : Some all-zero input streams and non-zero input streams have the size of 0.
We DO NOT store the
-     *        0 size input streams because they are meaningless.
-     */
-    private void buildInputStreams() {
-        inputStreams = new ArrayList<>();
-        InputStream zeroInputStream = new TarArchiveSparseZeroInputStream();
-
-        long offset = 0;
-        for (TarArchiveStructSparse sparseHeader : sparseHeaders) {
-            if (sparseHeader.getOffset() == 0 && sparseHeader.getNumbytes() == 0)
{
-                break;
-            }
-
-            // only store the input streams with non-zero size
-            if ((sparseHeader.getOffset() - offset) > 0) {
-                inputStreams.add(new BoundedInputStream(zeroInputStream, sparseHeader.getOffset()
- offset));
-            }
-
-            // only store the input streams with non-zero size
-            if (sparseHeader.getNumbytes() > 0) {
-                inputStreams.add(new BoundedInputStream(inputStream, sparseHeader.getNumbytes()));
-            }
-
-            offset = sparseHeader.getOffset() + sparseHeader.getNumbytes();
-        }
-    }
-
-    /**
-     * This is an inputstream that always return 0,
-     * this is used when writing the holes of a sparse file
-     */
-    public class TarArchiveSparseZeroInputStream extends InputStream {
-        /**
-         * Just return 0
-         * @return
-         * @throws IOException
-         */
-        @Override
-        public int read() throws IOException {
-            return 0;
-        }
-
-        /**
-         * these's nothing need to do when skipping
-         *
-         * @param n bytes to skip
-         * @return bytes actually skipped
-         */
-        @Override
-        public long skip(final long n) {
-            return n;
-        }
-    }
-}
diff --git a/src/main/java/org/apache/commons/compress/archivers/tar/TarArchiveStructSparse.java
b/src/main/java/org/apache/commons/compress/archivers/tar/TarArchiveStructSparse.java
index ac08e68..8221a25 100644
--- a/src/main/java/org/apache/commons/compress/archivers/tar/TarArchiveStructSparse.java
+++ b/src/main/java/org/apache/commons/compress/archivers/tar/TarArchiveStructSparse.java
@@ -18,6 +18,8 @@
  */
 package org.apache.commons.compress.archivers.tar;
 
+import java.util.Objects;
+
 /**
  * This class represents struct sparse in a Tar archive.
  * <p>
@@ -39,6 +41,28 @@ public class TarArchiveStructSparse {
         this.numbytes = numbytes;
     }
 
+    @Override
+    public boolean equals(Object o) {
+        if (this == o) return true;
+        if (o == null || getClass() != o.getClass()) return false;
+        TarArchiveStructSparse that = (TarArchiveStructSparse) o;
+        return offset == that.offset &&
+                numbytes == that.numbytes;
+    }
+
+    @Override
+    public int hashCode() {
+        return Objects.hash(offset, numbytes);
+    }
+
+    @Override
+    public String toString() {
+        return "TarArchiveStructSparse{" +
+                "offset=" + offset +
+                ", numbytes=" + numbytes +
+                '}';
+    }
+
     public long getOffset() {
         return offset;
     }
diff --git a/src/test/java/org/apache/commons/compress/archivers/tar/SparseFilesTest.java
b/src/test/java/org/apache/commons/compress/archivers/tar/SparseFilesTest.java
index 56ac227..33b0c91 100644
--- a/src/test/java/org/apache/commons/compress/archivers/tar/SparseFilesTest.java
+++ b/src/test/java/org/apache/commons/compress/archivers/tar/SparseFilesTest.java
@@ -18,15 +18,24 @@
 
 package org.apache.commons.compress.archivers.tar;
 
-import static org.apache.commons.compress.AbstractTestCase.getFile;
 import static org.junit.Assert.*;
+
+import org.apache.commons.compress.AbstractTestCase;
+import org.junit.Assert;
 import org.junit.Test;
+import shaded.org.apache.commons.io.IOUtils;
 
 import java.io.File;
 import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.List;
+import java.util.Locale;
+
 
+public class SparseFilesTest extends AbstractTestCase {
 
-public class SparseFilesTest {
+    private final boolean isOnWindows = System.getProperty("os.name").toLowerCase(Locale.ENGLISH).contains("windows");
 
     @Test
     public void testOldGNU() throws Throwable {
@@ -40,6 +49,18 @@ public class SparseFilesTest {
             assertTrue(ae.isGNUSparse());
             assertFalse(ae.isPaxGNUSparse());
             assertFalse(tin.canReadEntryData(ae));
+
+            List<TarArchiveStructSparse> sparseHeaders = ae.getSparseHeaders();
+            assertEquals(3, sparseHeaders.size());
+
+            assertEquals(0, sparseHeaders.get(0).getOffset());
+            assertEquals(2048, sparseHeaders.get(0).getNumbytes());
+
+            assertEquals(1050624L, sparseHeaders.get(1).getOffset());
+            assertEquals(2560, sparseHeaders.get(1).getNumbytes());
+
+            assertEquals(3101184L, sparseHeaders.get(2).getOffset());
+            assertEquals(0, sparseHeaders.get(2).getNumbytes());
         } finally {
             if (tin != null) {
                 tin.close();
@@ -63,6 +84,159 @@ public class SparseFilesTest {
         }
     }
 
+    @Test
+    public void testExtractSparseTarsOnWindows() throws IOException {
+        if (!isOnWindows) {
+            return;
+        }
+
+        final File oldGNUSparseTar = getFile("oldgnu_sparse.tar");
+        final File paxGNUSparseTar = getFile("pax_gnu_sparse.tar");
+        TarArchiveInputStream oldGNUSparseInputStream = null;
+        TarArchiveInputStream paxGNUSparseInputStream = null;
+        try {
+            // compare between old GNU and PAX 0.0
+            oldGNUSparseInputStream = new TarArchiveInputStream(new FileInputStream(oldGNUSparseTar));
+            oldGNUSparseInputStream.getNextTarEntry();
+            paxGNUSparseInputStream = new TarArchiveInputStream(new FileInputStream(paxGNUSparseTar));
+            paxGNUSparseInputStream.getNextTarEntry();
+            Assert.assertTrue(IOUtils.contentEquals(oldGNUSparseInputStream, paxGNUSparseInputStream));
+
+            // compare between old GNU and PAX 0.1
+            oldGNUSparseInputStream.close();
+            oldGNUSparseInputStream = new TarArchiveInputStream(new FileInputStream(oldGNUSparseTar));
+            oldGNUSparseInputStream.getNextTarEntry();
+            paxGNUSparseInputStream.getNextTarEntry();
+            Assert.assertTrue(IOUtils.contentEquals(oldGNUSparseInputStream, paxGNUSparseInputStream));
+
+            // compare between old GNU and PAX 1.0
+            oldGNUSparseInputStream.close();
+            oldGNUSparseInputStream = new TarArchiveInputStream(new FileInputStream(oldGNUSparseTar));
+            oldGNUSparseInputStream.getNextTarEntry();
+            paxGNUSparseInputStream.getNextTarEntry();
+            Assert.assertTrue(IOUtils.contentEquals(oldGNUSparseInputStream, paxGNUSparseInputStream));
+        } finally {
+            if (oldGNUSparseInputStream != null) {
+                oldGNUSparseInputStream.close();
+            }
+
+            if (paxGNUSparseInputStream != null) {
+                paxGNUSparseInputStream.close();
+            }
+        }
+    }
+
+    @Test
+    public void testExtractOldGNU() throws IOException, InterruptedException {
+        if (isOnWindows) {
+            return;
+        }
+
+        final File file = getFile("oldgnu_sparse.tar");
+        InputStream sparseFileInputStream = null;
+        TarArchiveInputStream tin = null;
+        try {
+            sparseFileInputStream = extractTarAndGetInputStream(file, "sparsefile");
+            tin = new TarArchiveInputStream(new FileInputStream(file));
+            tin.getNextTarEntry();
+
+            Assert.assertTrue(IOUtils.contentEquals(tin, sparseFileInputStream));
+        } finally {
+            if (sparseFileInputStream != null) {
+                sparseFileInputStream.close();
+            }
+
+            if (tin != null) {
+                tin.close();
+            }
+        }
+    }
+
+    @Test
+    public void testExtractExtendedOldGNU() throws IOException, InterruptedException {
+        if (isOnWindows) {
+            return;
+        }
+
+        final File file = getFile("oldgnu_extended_sparse.tar");
+        InputStream sparseFileInputStream = null;
+        TarArchiveInputStream tin = null;
+        try {
+            sparseFileInputStream = extractTarAndGetInputStream(file, "sparse6");
+            tin = new TarArchiveInputStream(new FileInputStream(file));
+            final TarArchiveEntry ae = tin.getNextTarEntry();
+
+            Assert.assertTrue(IOUtils.contentEquals(tin, sparseFileInputStream));
+
+            List<TarArchiveStructSparse> sparseHeaders = ae.getSparseHeaders();
+            assertEquals(7, sparseHeaders.size());
+
+            assertEquals(0, sparseHeaders.get(0).getOffset());
+            assertEquals(1024, sparseHeaders.get(0).getNumbytes());
+
+            assertEquals(10240, sparseHeaders.get(1).getOffset());
+            assertEquals(1024, sparseHeaders.get(1).getNumbytes());
+
+            assertEquals(16384, sparseHeaders.get(2).getOffset());
+            assertEquals(1024, sparseHeaders.get(2).getNumbytes());
+
+            assertEquals(24576, sparseHeaders.get(3).getOffset());
+            assertEquals(1024, sparseHeaders.get(3).getNumbytes());
+
+            assertEquals(29696, sparseHeaders.get(4).getOffset());
+            assertEquals(1024, sparseHeaders.get(4).getNumbytes());
+
+            assertEquals(36864, sparseHeaders.get(5).getOffset());
+            assertEquals(1024, sparseHeaders.get(5).getNumbytes());
+
+            assertEquals(51200, sparseHeaders.get(6).getOffset());
+            assertEquals(0, sparseHeaders.get(6).getNumbytes());
+        } finally {
+            if (sparseFileInputStream != null) {
+                sparseFileInputStream.close();
+            }
+
+            if (tin != null) {
+                tin.close();
+            }
+        }
+    }
+
+    @Test
+    public void testExtractPaxGNU() throws IOException, InterruptedException {
+        if (isOnWindows) {
+            return;
+        }
+
+        final File file = getFile("pax_gnu_sparse.tar");
+        InputStream sparseFileInputStream = null;
+        TarArchiveInputStream tin = null;
+        try {
+            sparseFileInputStream = extractTarAndGetInputStream(file, "sparsefile-0.0");
+            tin = new TarArchiveInputStream(new FileInputStream(file));
+            tin.getNextTarEntry();
+            Assert.assertTrue(IOUtils.contentEquals(tin, sparseFileInputStream));
+
+            // TODO : it's weird that I can only get a 0 size sparsefile-0.1 on my Ubuntu 16.04
+            //        using "tar -xf pax_gnu_sparse.tar"
+            sparseFileInputStream = extractTarAndGetInputStream(file, "sparsefile-0.0");
+            tin.getNextTarEntry();
+            Assert.assertTrue(IOUtils.contentEquals(tin, sparseFileInputStream));
+
+            sparseFileInputStream = extractTarAndGetInputStream(file, "sparsefile-1.0");
+            tin.getNextTarEntry();
+            Assert.assertTrue(IOUtils.contentEquals(tin, sparseFileInputStream));
+        } finally {
+            if (sparseFileInputStream != null) {
+                sparseFileInputStream.close();
+            }
+
+            if (tin != null) {
+                tin.close();
+            }
+        }
+    }
+
     private void assertPaxGNUEntry(final TarArchiveInputStream tin, final String suffix) throws Throwable {
         final TarArchiveEntry ae = tin.getNextTarEntry();
         assertEquals("sparsefile-" + suffix, ae.getName());
@@ -70,6 +244,33 @@ public class SparseFilesTest {
         assertTrue(ae.isPaxGNUSparse());
         assertFalse(ae.isOldGNUSparse());
         assertFalse(tin.canReadEntryData(ae));
+
+        List<TarArchiveStructSparse> sparseHeaders = ae.getSparseHeaders();
+        assertEquals(3, sparseHeaders.size());
+
+        assertEquals(0, sparseHeaders.get(0).getOffset());
+        assertEquals(2048, sparseHeaders.get(0).getNumbytes());
+
+        assertEquals(1050624L, sparseHeaders.get(1).getOffset());
+        assertEquals(2560, sparseHeaders.get(1).getNumbytes());
+
+        assertEquals(3101184L, sparseHeaders.get(2).getOffset());
+        assertEquals(0, sparseHeaders.get(2).getNumbytes());
+    }
+
+    private InputStream extractTarAndGetInputStream(File tarFile, String sparseFileName) throws IOException, InterruptedException {
+        Runtime runtime = Runtime.getRuntime();
+        Process process = runtime.exec("tar -xf " + tarFile.getPath() + " -C " + resultDir.getPath());
+        // wait until the extract finishes
+        process.waitFor();
+
+        for (File file : resultDir.listFiles()) {
+            if(file.getName().equals(sparseFileName)) {
+                return new FileInputStream(file);
+            }
+        }
+
+        return null;
     }
 }
 
diff --git a/src/test/java/org/apache/commons/compress/archivers/tar/TarUtilsTest.java b/src/test/java/org/apache/commons/compress/archivers/tar/TarUtilsTest.java
index 393c0aa..91b0ef7 100644
--- a/src/test/java/org/apache/commons/compress/archivers/tar/TarUtilsTest.java
+++ b/src/test/java/org/apache/commons/compress/archivers/tar/TarUtilsTest.java
@@ -381,4 +381,16 @@ public class TarUtilsTest {
         }
     }
 
+    @Test
+    public void testParseSparse() {
+        final long expectedOffset = 0100000;
+        final long expectedNumbytes = 0111000;
+        final byte [] buffer = new byte[] {
+                ' ', ' ', ' ', ' ', ' ', '0', '1', '0', '0', '0', '0', '0', // sparseOffset
+                ' ', ' ', ' ', ' ', ' ', '0', '1', '1', '1', '0', '0', '0'};
+        TarArchiveStructSparse sparse = TarUtils.parseSparse(buffer, 0);
+        assertEquals(sparse.getOffset(), expectedOffset);
+        assertEquals(sparse.getNumbytes(), expectedNumbytes);
+    }
+
 }
diff --git a/src/test/resources/oldgnu_extended_sparse.tar b/src/test/resources/oldgnu_extended_sparse.tar
new file mode 100644
index 0000000..cba3ebc
Binary files /dev/null and b/src/test/resources/oldgnu_extended_sparse.tar differ


Mime
View raw message