poi-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Dale Monti <dmo...@linoma.com>
Subject Re: Out of heap space in XSSFReader#getSharedStringsTable()
Date Wed, 13 May 2009 17:04:47 GMT
Hi Chris,
Thanks for the sample code. I am gearing up to start a similar project 
soon. Did you find the Microsoft documents Nick mentioned? I've been 
looking for an Excel XML reference manual but I can't find one.

This article might be helpful. There is a 'Read It' section about 2/3 of 
the article.

http://msdn.microsoft.com/en-us/library/bb332058.aspx


Thanks

Chris Lott wrote:
> Nick, what Microsoft documents do you mean?  I just don't know where
> to start.
>
> Please see below for a first implementation of a 
> ReadonlySharedStringsTable.  Initially I wanted to extend 
> POIXMLDocumentPart, but that supports read/write access.  I copied the 
> relevant method signatures from POIXMLDocumentPart and provided 
> implementations.  The program depends on finding the counts in the 
> root element to allocate space, which may be just too fragile, but 
> time will tell.  It certainly runs extremely quickly.
> HTH
>
> chris...
>
> Nick Burch wrote:
>> On Thu, 7 May 2009, Chris Lott wrote:
>>> Sounds reasonable.  Can you offer any insights into the shared 
>>> string table?
>>
>> Have a look at the microsoft docs, they're not bad.
>>
>>> It looks like the shared string ID in a sheet(n).xml file is simply 
>>> the index of the string in the table.  That suggests to me a simple 
>>> Java vector might be a suitable implementation with not only low 
>>> memory use but also constant lookup time; don't even need a map.
>>
>> I think you ought to be just fine with a vector or array / arraylist, 
>> yes.
>>
>> Nick
>
> -- 
>
> package com.telcordia.arroyo.beans.poi;
>
> import java.io.File;
> import java.io.IOException;
> import java.io.InputStream;
> import java.util.ArrayList;
>
> import javax.xml.parsers.ParserConfigurationException;
> import javax.xml.parsers.SAXParser;
> import javax.xml.parsers.SAXParserFactory;
>
> import org.apache.log4j.BasicConfigurator;
> import org.apache.log4j.Level;
> import org.apache.log4j.Logger;
> import org.apache.poi.openxml4j.opc.Package;
> import org.apache.poi.openxml4j.opc.PackageAccess;
> import org.apache.poi.openxml4j.opc.PackagePart;
> import org.apache.poi.openxml4j.opc.PackageRelationship;
> import org.apache.poi.xssf.usermodel.XSSFRelation;
> import org.xml.sax.Attributes;
> import org.xml.sax.InputSource;
> import org.xml.sax.SAXException;
> import org.xml.sax.XMLReader;
> import org.xml.sax.helpers.DefaultHandler;
>
> /**
> * Many methods copied from org.apache.poi.POIXMLDocumentPart because
> * I was not sure whether this class should extend that class.
> *
> */
> public class ReadonlySharedStringsTable extends DefaultHandler {
>
>     /**
>      * An integer representing the total count of strings in the 
> workbook. This count does not
>      * include any numbers, it counts only the total of text strings 
> in the workbook.
>      */
>     private int count;
>
>     /**
>      * An integer representing the total count of unique strings in 
> the Shared String Table.
>      * A string is unique even if it is a copy of another string, but 
> has different formatting applied
>      * at the character level.
>      */
>     private int uniqueCount;
>
>     /**
>      * The shared strings table.
>      */
>     private String [] strings;
>
>     /**
>      *      * @param pkg
>      * @throws IOException
>      * @throws SAXException
>      * @throws ParserConfigurationException
>      */
>     public ReadonlySharedStringsTable(Package pkg)
>     throws IOException, SAXException, ParserConfigurationException {
>         ArrayList<PackagePart> parts =             
> pkg.getPartsByContentType(XSSFRelation.SHARED_STRINGS.getContentType());
>         PackagePart sstPart = parts.get(0);
>         readFrom(sstPart.getInputStream());
>     }
>
>     /**
>      * Like POIXMLDocumentPart constructor
>      *      * @param part
>      * @param rel_ignored
>      * @throws IOException
>      */
>     public ReadonlySharedStringsTable(PackagePart part, 
> PackageRelationship rel_ignored)     throws IOException, SAXException, 
> ParserConfigurationException {
>         readFrom(part.getInputStream());
>     }
>
>     /**
>      * Read this shared strings table from an XML file.
>      *      * @param is The input stream containing the XML document.
>      * @throws IOException if an error occurs while reading.
>      * @throws SAXException      * @throws 
> ParserConfigurationException      */
>     public void readFrom(InputStream is) throws IOException, 
> SAXException, ParserConfigurationException {
>         InputSource sheetSource = new InputSource(is);
>         SAXParserFactory saxFactory = SAXParserFactory.newInstance();
>         SAXParser saxParser = saxFactory.newSAXParser();
>         XMLReader sheetParser = saxParser.getXMLReader();
>         sheetParser.setContentHandler(this);
>         sheetParser.parse(sheetSource);
>     }
>
>     /**
>      * Return an integer representing the total count of strings in 
> the workbook. This count does not
>      * include any numbers, it counts only the total of text strings 
> in the workbook.
>      *
>      * @return the total count of strings in the workbook
>      */
>     public int getCount(){
>         return this.count;
>     }
>
>     /**
>      * Returns an integer representing the total count of unique 
> strings in the Shared String Table.
>      * A string is unique even if it is a copy of another string, but 
> has different formatting applied
>      * at the character level.
>      *
>      * @return the total count of unique strings in the workbook
>      */
>     public int getUniqueCount(){
>         return this.uniqueCount;
>     }
>
>     /**
>      * Return a string item by index
>      *
>      * @param idx index of item to return.
>      * @return the item at the specified position in this Shared 
> String table.
>      */
>     public String getEntryAt(int idx) {
>         return strings[idx];
>     }
>
>     //// ContentHandler methods ////
>
>     private StringBuffer characters;
>     private boolean tIsOpen;
>     private int index;
>
>     /*
>      * (non-Javadoc)
>      * @see 
> org.xml.sax.helpers.DefaultHandler#startElement(java.lang.String, 
> java.lang.String, java.lang.String, org.xml.sax.Attributes)
>      */
>     public void startElement(String uri, String localName, String name,
>             Attributes attributes) throws SAXException {
>         if ("sst".equals(name)) {
>             String count = attributes.getValue("count");
>             String uniqueCount = attributes.getValue("uniqueCount");
>             this.count = Integer.parseInt(count);
>             this.uniqueCount = Integer.parseInt(uniqueCount);
>             this.strings = new String[this.uniqueCount];
>             index = 0;
>             characters = new StringBuffer();
>         }
>         else if ("t".equals(name)) {
>             characters.setLength(0);
>             tIsOpen = true;
>         }
>     }
>
>     /*
>      * (non-Javadoc)
>      * @see 
> org.xml.sax.helpers.DefaultHandler#endElement(java.lang.String, 
> java.lang.String, java.lang.String)
>      */
>     public void endElement(String uri, String localName, String name)
>     throws SAXException {
>         if ("t".equals(name)) {
>             strings[index] = characters.toString();
>             ++index;
>         }
>     }
>
>     /**
>      * Captures characters only if a t(ext?) element is open.
>      */
>     public void characters(char[] ch, int start, int length)
>     throws SAXException {
>         if (tIsOpen)
>             characters.append(ch, start, length);
>     }
>
>     ////////////// For testing //////////////
>
>     /**
>      * Dumps the shared string table from an XLSX package.
>      *      * @param args
>      * @throws Exception
>      */
>     public static void main(String [] args) throws Exception {
>
>         BasicConfigurator.configure();
>         Logger.getRootLogger().setLevel(Level.INFO);
>
>         if (args.length != 1) {
>             System.err.println("Usage: ReadonlySharedStringsTable 
> <file.xlsx>");
>             return;
>         }
>
>         File inputFile = new File(args[0]);
>         if (! inputFile.exists()) {
>             System.err.println("Failed to find file: " + 
> inputFile.getPath());
>             return;
>         }
>
>         Package pkg = Package.open(inputFile.getPath(), 
> PackageAccess.READ);
>         ReadonlySharedStringsTable sst = new 
> ReadonlySharedStringsTable(pkg);
>         int unique = sst.getUniqueCount();
>         for (int s = 0; s < unique; ++s)
>             System.out.println("Index " + s + ": " + 
> sst.getEntryAt(s));       
>         // Close without saving any changes.
>         pkg.revert();
>
>     }
> }
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: user-unsubscribe@poi.apache.org
> For additional commands, e-mail: user-help@poi.apache.org
>
>
> __________ Information from ESET NOD32 Antivirus, version of virus 
> signature database 4061 (20090507) __________
>
> The message was checked by ESET NOD32 Antivirus.
>
> http://www.eset.com
>
>
>
>

---------------------------------------------------------------------
To unsubscribe, e-mail: user-unsubscribe@poi.apache.org
For additional commands, e-mail: user-help@poi.apache.org


Mime
View raw message