poi-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ser...@apache.org
Subject svn commit: r1155336 [1/2] - in /poi/trunk/src: documentation/content/xdocs/ java/org/apache/poi/ java/org/apache/poi/hssf/extractor/ java/org/apache/poi/poifs/filesystem/ ooxml/java/org/apache/poi/extractor/ scratchpad/src/org/apache/poi/hwpf/ scratch...
Date Tue, 09 Aug 2011 12:38:53 GMT
Author: sergey
Date: Tue Aug  9 12:38:52 2011
New Revision: 1155336

URL: http://svn.apache.org/viewvc?rev=1155336&view=rev
Log:
Add Word-to-Text converter and use it as replacement for WordExtractor

Added:
    poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/usermodel/ObjectPoolImpl.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/usermodel/ObjectsPool.java
    poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToTextConverter.java
Modified:
    poi/trunk/src/documentation/content/xdocs/status.xml
    poi/trunk/src/java/org/apache/poi/POIOLE2TextExtractor.java
    poi/trunk/src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java
    poi/trunk/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java
    poi/trunk/src/java/org/apache/poi/poifs/filesystem/DirectoryEntry.java
    poi/trunk/src/java/org/apache/poi/poifs/filesystem/DirectoryNode.java
    poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocumentCore.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordConverter.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordUtils.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToFoConverter.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToHtmlConverter.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToTextConverter.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Field.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/usermodel/FieldImpl.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/usermodel/HeaderStories.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java
    poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java
    poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java

Modified: poi/trunk/src/documentation/content/xdocs/status.xml
URL: http://svn.apache.org/viewvc/poi/trunk/src/documentation/content/xdocs/status.xml?rev=1155336&r1=1155335&r2=1155336&view=diff
==============================================================================
--- poi/trunk/src/documentation/content/xdocs/status.xml (original)
+++ poi/trunk/src/documentation/content/xdocs/status.xml Tue Aug  9 12:38:52 2011
@@ -34,6 +34,7 @@
 
     <changes>
         <release version="3.8-beta4" date="2011-??-??">
+           <action dev="poi-developers" type="add">Add Word-to-Text converter and use it as replacement for WordExtractor</action>
            <action dev="poi-developers" type="fix">51604 - replace text fails for doc ( poi 3.8 beta release from download site )</action>
            <action dev="poi-developers" type="fix">Fixed incorrect encoding of non-breaking space (0xA0) in SXSSF</action>
            <action dev="poi-developers" type="add">Support for conditional formatting in XSSF</action>

Modified: poi/trunk/src/java/org/apache/poi/POIOLE2TextExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/java/org/apache/poi/POIOLE2TextExtractor.java?rev=1155336&r1=1155335&r2=1155336&view=diff
==============================================================================
--- poi/trunk/src/java/org/apache/poi/POIOLE2TextExtractor.java (original)
+++ poi/trunk/src/java/org/apache/poi/POIOLE2TextExtractor.java Tue Aug  9 12:38:52 2011
@@ -19,6 +19,7 @@ package org.apache.poi;
 import org.apache.poi.hpsf.DocumentSummaryInformation;
 import org.apache.poi.hpsf.SummaryInformation;
 import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor;
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 
 /**
@@ -39,7 +40,7 @@ public abstract class POIOLE2TextExtract
 	public POIOLE2TextExtractor(POIDocument document) {
 		super(document);
 	}
-	
+
 	/**
 	 * Returns the document information metadata for the document
 	 */
@@ -52,20 +53,28 @@ public abstract class POIOLE2TextExtract
 	public SummaryInformation getSummaryInformation() {
 		return document.getSummaryInformation();
 	}
-	
+
 	/**
-	 * Returns an HPSF powered text extractor for the 
+	 * Returns an HPSF powered text extractor for the
 	 *  document properties metadata, such as title and author.
 	 */
 	public POITextExtractor getMetadataTextExtractor() {
 		return new HPSFPropertiesExtractor(this);
 	}
 
-	/**
-	 * Return the underlying POIFS FileSystem of
-	 *  this document.
-	 */
-	public POIFSFileSystem getFileSystem() {
-		return document.directory.getFileSystem();
-	}
+    public DirectoryEntry getRoot()
+    {
+        return document.directory;
+    }
+
+    /**
+     * Return the underlying POIFS FileSystem of this document.
+     *
+     * @deprecated Use {@link #getRoot()} instead
+     */
+    @Deprecated
+    public POIFSFileSystem getFileSystem()
+    {
+        return document.directory.getFileSystem();
+    }
 }

Modified: poi/trunk/src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java?rev=1155336&r1=1155335&r2=1155336&view=diff
==============================================================================
--- poi/trunk/src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java (original)
+++ poi/trunk/src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java Tue Aug  9 12:38:52 2011
@@ -61,17 +61,27 @@ import org.apache.poi.poifs.filesystem.P
  */
 public class EventBasedExcelExtractor extends POIOLE2TextExtractor {
    private DirectoryNode _dir;
-	private POIFSFileSystem _fs;
 	boolean _includeSheetNames = true;
 	boolean _formulasNotResults = false;
 
-	public EventBasedExcelExtractor(DirectoryNode dir, POIFSFileSystem fs) {
-		super(null);
-		_dir = dir;
-		_fs = fs;
-	}
+    /**
+     * @deprecated Use {@link #EventBasedExcelExtractor(DirectoryNode)} instead
+     */
+    @Deprecated
+    @SuppressWarnings( "unused" )
+    public EventBasedExcelExtractor( DirectoryNode dir, POIFSFileSystem fs )
+    {
+        this( dir );
+    }
+
+    public EventBasedExcelExtractor( DirectoryNode dir )
+    {
+        super( null );
+        _dir = dir;
+    }
+
    public EventBasedExcelExtractor(POIFSFileSystem fs) {
-      this(fs.getRoot(), fs);
+      this(fs.getRoot());
    }
 
    /**
@@ -79,9 +89,9 @@ public class EventBasedExcelExtractor ex
     *  this document.
     */
    public POIFSFileSystem getFileSystem() {
-      return _fs;
+      return _dir.getFileSystem();
    }
-   
+
 	/**
 	 * Would return the document information metadata for the document,
 	 *  if we supported it
@@ -200,7 +210,7 @@ public class EventBasedExcelExtractor ex
 						outputNextStringValue = true;
 						nextRow = frec.getRow();
 					} else {
-						thisText = _ft.formatNumberDateCell(frec); 
+						thisText = _ft.formatNumberDateCell(frec);
 					}
 				}
 				break;
@@ -234,7 +244,7 @@ public class EventBasedExcelExtractor ex
 			case NumberRecord.sid:
 				NumberRecord numrec = (NumberRecord) record;
 				thisRow = numrec.getRow();
-				thisText = _ft.formatNumberDateCell(numrec); 
+				thisText = _ft.formatNumberDateCell(numrec);
 				break;
 			default:
 				break;

Modified: poi/trunk/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java?rev=1155336&r1=1155335&r2=1155336&view=diff
==============================================================================
--- poi/trunk/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java (original)
+++ poi/trunk/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java Tue Aug  9 12:38:52 2011
@@ -24,7 +24,6 @@ import java.io.InputStream;
 import java.io.PrintStream;
 
 import org.apache.poi.POIOLE2TextExtractor;
-import org.apache.poi.ss.formula.eval.ErrorEval;
 import org.apache.poi.hssf.usermodel.HSSFCell;
 import org.apache.poi.hssf.usermodel.HSSFCellStyle;
 import org.apache.poi.hssf.usermodel.HSSFComment;
@@ -35,12 +34,13 @@ import org.apache.poi.hssf.usermodel.HSS
 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.ss.formula.eval.ErrorEval;
 import org.apache.poi.ss.usermodel.HeaderFooter;
 
 /**
  * A text extractor for Excel files.
  * <p>
- * Returns the textual content of the file, suitable for 
+ * Returns the textual content of the file, suitable for
  *  indexing by something like Lucene, but not really
  *  intended for display to the user.
  * </p>
@@ -59,19 +59,27 @@ public class ExcelExtractor extends POIO
 	private boolean _includeCellComments = false;
 	private boolean _includeBlankCells = false;
 	private boolean _includeHeadersFooters = true;
-	
+
 	public ExcelExtractor(HSSFWorkbook wb) {
 		super(wb);
 		_wb = wb;
 		_formatter = new HSSFDataFormatter();
 	}
 	public ExcelExtractor(POIFSFileSystem fs) throws IOException {
-		this(fs.getRoot(), fs);
+		this(fs.getRoot());
 	}
-	public ExcelExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
-		this(new HSSFWorkbook(dir, fs, true));
+	/**
+     * @deprecated Use {@link #ExcelExtractor(DirectoryNode)} instead
+     */
+    @Deprecated
+    @SuppressWarnings( "unused" )
+    public ExcelExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
+        this( dir );
+    }
+    public ExcelExtractor(DirectoryNode dir) throws IOException {
+		this(new HSSFWorkbook(dir, true));
 	}
-	
+
 	private static final class CommandParseException extends Exception {
 		public CommandParseException(String msg) {
 			super(msg);
@@ -183,7 +191,7 @@ public class ExcelExtractor extends POIO
 			return _headersFooters;
 		}
 	}
-	
+
 	private static void printUsageMessage(PrintStream ps) {
 		ps.println("Use:");
 		ps.println("    " + ExcelExtractor.class.getName() + " [<flag> <value> [<flag> <value> [...]]] [-i <filename.xls>]");
@@ -201,7 +209,7 @@ public class ExcelExtractor extends POIO
 	 * Command line extractor.
 	 */
 	public static void main(String[] args) {
-		
+
 		CommandArgs cmdArgs;
 		try {
 			cmdArgs = new CommandArgs(args);
@@ -211,12 +219,12 @@ public class ExcelExtractor extends POIO
 			System.exit(1);
 			return; // suppress compiler error
 		}
-		
+
 		if (cmdArgs.isRequestHelp()) {
 			printUsageMessage(System.out);
 			return;
 		}
-		
+
 		try {
 			InputStream is;
 			if(cmdArgs.getInputFile() == null) {
@@ -270,9 +278,9 @@ public class ExcelExtractor extends POIO
 	 * Default is to include them.
 	 */
 	public void setIncludeHeadersFooters(boolean includeHeadersFooters) {
-		_includeHeadersFooters = includeHeadersFooters; 
+		_includeHeadersFooters = includeHeadersFooters;
 	}
-	
+
 	/**
 	 * Retrieves the text contents of the file
 	 */
@@ -282,12 +290,12 @@ public class ExcelExtractor extends POIO
 		// We don't care about the difference between
 		//  null (missing) and blank cells
 		_wb.setMissingCellPolicy(HSSFRow.RETURN_BLANK_AS_NULL);
-		
+
 		// Process each sheet in turn
 		for(int i=0;i<_wb.getNumberOfSheets();i++) {
 			HSSFSheet sheet = _wb.getSheetAt(i);
 			if(sheet == null) { continue; }
-			
+
 			if(_includeSheetNames) {
 				String name = _wb.getSheetName(i);
 				if(name != null) {
@@ -295,12 +303,12 @@ public class ExcelExtractor extends POIO
 					text.append("\n");
 				}
 			}
-			
+
 			// Header text, if there is any
 			if(_includeHeadersFooters) {
 				text.append(_extractHeaderFooter(sheet.getHeader()));
 			}
-			
+
 			int firstRow = sheet.getFirstRowNum();
 			int lastRow = sheet.getLastRowNum();
 			for(int j=firstRow;j<=lastRow;j++) {
@@ -313,7 +321,7 @@ public class ExcelExtractor extends POIO
 				if(_includeBlankCells) {
 					firstCell = 0;
 				}
-				
+
 				for(int k=firstCell;k<lastCell;k++) {
 					HSSFCell cell = row.getCell(k);
 					boolean outputContents = true;
@@ -368,14 +376,14 @@ public class ExcelExtractor extends POIO
 										case HSSFCell.CELL_TYPE_ERROR:
 											text.append(ErrorEval.getText(cell.getErrorCellValue()));
 											break;
-											
+
 									}
 								}
 								break;
 							default:
 								throw new RuntimeException("Unexpected cell type (" + cell.getCellType() + ")");
 						}
-						
+
 						// Output the comment, if requested and exists
 						HSSFComment comment = cell.getCellComment();
 						if(_includeCellComments && comment != null) {
@@ -385,29 +393,29 @@ public class ExcelExtractor extends POIO
 							text.append(" Comment by "+comment.getAuthor()+": "+commentText);
 						}
 					}
-					
+
 					// Output a tab if we're not on the last cell
 					if(outputContents && k < (lastCell-1)) {
 						text.append("\t");
 					}
 				}
-				
+
 				// Finish off the row
 				text.append("\n");
 			}
-			
+
 			// Finally Footer text, if there is any
 			if(_includeHeadersFooters) {
 				text.append(_extractHeaderFooter(sheet.getFooter()));
 			}
 		}
-		
+
 		return text.toString();
 	}
-	
+
 	public static String _extractHeaderFooter(HeaderFooter hf) {
 		StringBuffer text = new StringBuffer();
-		
+
 		if(hf.getLeft() != null) {
 			text.append(hf.getLeft());
 		}
@@ -423,7 +431,7 @@ public class ExcelExtractor extends POIO
 		}
 		if(text.length() > 0)
 			text.append("\n");
-		
+
 		return text.toString();
 	}
 }

Modified: poi/trunk/src/java/org/apache/poi/poifs/filesystem/DirectoryEntry.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/java/org/apache/poi/poifs/filesystem/DirectoryEntry.java?rev=1155336&r1=1155335&r2=1155336&view=diff
==============================================================================
--- poi/trunk/src/java/org/apache/poi/poifs/filesystem/DirectoryEntry.java (original)
+++ poi/trunk/src/java/org/apache/poi/poifs/filesystem/DirectoryEntry.java Tue Aug  9 12:38:52 2011
@@ -15,13 +15,14 @@
    See the License for the specific language governing permissions and
    limitations under the License.
 ==================================================================== */
-        
 
-package org.apache.poi.poifs.filesystem;
 
-import java.io.*;
+package org.apache.poi.poifs.filesystem;
 
-import java.util.*;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Iterator;
 
 import org.apache.poi.hpsf.ClassID;
 
@@ -68,6 +69,12 @@ public interface DirectoryEntry
     public int getEntryCount();
 
     /**
+     * Checks whether an entry with the specified name is present
+     */
+
+    public boolean hasEntry( final String name );
+
+    /**
      * get a specified Entry by name
      *
      * @param name the name of the Entry to obtain.

Modified: poi/trunk/src/java/org/apache/poi/poifs/filesystem/DirectoryNode.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/java/org/apache/poi/poifs/filesystem/DirectoryNode.java?rev=1155336&r1=1155335&r2=1155336&view=diff
==============================================================================
--- poi/trunk/src/java/org/apache/poi/poifs/filesystem/DirectoryNode.java (original)
+++ poi/trunk/src/java/org/apache/poi/poifs/filesystem/DirectoryNode.java Tue Aug  9 12:38:52 2011
@@ -15,7 +15,7 @@
    See the License for the specific language governing permissions and
    limitations under the License.
 ==================================================================== */
-        
+
 
 package org.apache.poi.poifs.filesystem;
 
@@ -53,7 +53,7 @@ public class DirectoryNode
     // the POIFSFileSystem we belong to
     private POIFSFileSystem   _ofilesystem;
     // the NPOIFSFileSytem we belong to
-    private NPOIFSFileSystem  _nfilesystem; 
+    private NPOIFSFileSystem  _nfilesystem;
 
     // the path described by this document
     private POIFSDocumentPath _path;
@@ -72,7 +72,7 @@ public class DirectoryNode
     {
        this(property, parent, filesystem, (NPOIFSFileSystem)null);
     }
-    
+
     /**
      * create a DirectoryNode. This method is not public by design; it
      * is intended strictly for the internal use of this package
@@ -87,7 +87,7 @@ public class DirectoryNode
     {
        this(property, parent, (POIFSFileSystem)null, nfilesystem);
     }
-    
+
     private DirectoryNode(final DirectoryProperty property,
                           final DirectoryNode parent,
                           final POIFSFileSystem ofilesystem,
@@ -96,7 +96,7 @@ public class DirectoryNode
         super(property, parent);
         this._ofilesystem = ofilesystem;
         this._nfilesystem = nfilesystem;
-        
+
         if (parent == null)
         {
             _path = new POIFSDocumentPath();
@@ -143,23 +143,23 @@ public class DirectoryNode
     {
         return _path;
     }
-    
+
     /**
      * @return the filesystem that this belongs to
      */
     public POIFSFileSystem getFileSystem()
     {
-        return _ofilesystem; 
+        return _ofilesystem;
     }
-    
+
     /**
      * @return the filesystem that this belongs to
      */
     public NPOIFSFileSystem getNFileSystem()
     {
-        return _nfilesystem; 
+        return _nfilesystem;
     }
-    
+
     /**
      * open a document in the directory's entry's list of entries
      *
@@ -195,7 +195,7 @@ public class DirectoryNode
             throw new IOException("Entry '" + document.getName()
                                   + "' is not a DocumentEntry");
         }
-        
+
         DocumentEntry entry = (DocumentEntry)document;
         return new DocumentInputStream(entry);
     }
@@ -217,7 +217,7 @@ public class DirectoryNode
 
         (( DirectoryProperty ) getProperty()).addChild(property);
         _ofilesystem.addDocument(document);
-        
+
         _entries.add(rval);
         _byname.put(property.getName(), rval);
         return rval;
@@ -240,7 +240,7 @@ public class DirectoryNode
 
         (( DirectoryProperty ) getProperty()).addChild(property);
         _nfilesystem.addDocument(document);
-        
+
         _entries.add(rval);
         _byname.put(property.getName(), rval);
         return rval;
@@ -290,7 +290,7 @@ public class DirectoryNode
         {
             _entries.remove(entry);
         	   _byname.remove(entry.getName());
-        	   
+
         	   if(_ofilesystem != null) {
                _ofilesystem.remove(entry);
         	   } else {
@@ -342,6 +342,11 @@ public class DirectoryNode
         return _entries.size();
     }
 
+    public boolean hasEntry( String name )
+    {
+        return name != null && _byname.containsKey( name );
+    }
+
     /**
      * get a specified Entry by name
      *
@@ -430,7 +435,7 @@ public class DirectoryNode
     {
         DirectoryNode rval;
         DirectoryProperty property = new DirectoryProperty(name);
-        
+
         if(_ofilesystem != null) {
            rval = new DirectoryNode(property, _ofilesystem, this);
            _ofilesystem.addDirectory(property);
@@ -562,7 +567,7 @@ public class DirectoryNode
      * Returns an Iterator over all the entries
      */
     public Iterator<Entry> iterator() {
-        return getEntries(); 
+        return getEntries();
     }
 
     /* **********  END  begin implementation of POIFSViewable ********** */

Modified: poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java?rev=1155336&r1=1155335&r2=1155336&view=diff
==============================================================================
--- poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java (original)
+++ poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java Tue Aug  9 12:38:52 2011
@@ -66,48 +66,48 @@ import org.apache.xmlbeans.XmlException;
 public class ExtractorFactory {
 	public static final String CORE_DOCUMENT_REL =
 		"http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument";
-	
-	
+
+
 	/** Should this thread prefer event based over usermodel based extractors? */
 	private static final ThreadLocal<Boolean> threadPreferEventExtractors = new ThreadLocal<Boolean>() {
       protected Boolean initialValue() { return Boolean.FALSE; }
 	};
 	/** Should all threads prefer event based over usermodel based extractors? */
 	private static Boolean allPreferEventExtractors;
-	
-   /** 
+
+   /**
     * Should this thread prefer event based over usermodel based extractors?
-    * (usermodel extractors tend to be more accurate, but use more memory) 
-    * Default is false. 
+    * (usermodel extractors tend to be more accurate, but use more memory)
+    * Default is false.
     */
 	public static boolean getThreadPrefersEventExtractors() {
 	   return threadPreferEventExtractors.get();
 	}
-   /** 
-    * Should all threads prefer event based over usermodel based extractors? 
-    * (usermodel extractors tend to be more accurate, but use more memory) 
-    * Default is to use the thread level setting, which defaults to false. 
+   /**
+    * Should all threads prefer event based over usermodel based extractors?
+    * (usermodel extractors tend to be more accurate, but use more memory)
+    * Default is to use the thread level setting, which defaults to false.
     */
 	public static Boolean getAllThreadsPreferEventExtractors() {
 	   return allPreferEventExtractors;
 	}
-	
-   /** 
+
+   /**
     * Should this thread prefer event based over usermodel based extractors?
-    * Will only be used if the All Threads setting is null. 
+    * Will only be used if the All Threads setting is null.
     */
    public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) {
       threadPreferEventExtractors.set(preferEventExtractors);
    }
-   /** 
+   /**
     * Should all threads prefer event based over usermodel based extractors?
-    * If set, will take preference over the Thread level setting. 
+    * If set, will take preference over the Thread level setting.
     */
    public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) {
       allPreferEventExtractors = preferEventExtractors;
    }
-	
-   
+
+
    /**
     * Should this thread use event based extractors is available?
     * Checks the all-threads one first, then thread specific.
@@ -118,8 +118,8 @@ public class ExtractorFactory {
       }
       return threadPreferEventExtractors.get();
    }
-   
-	
+
+
 	public static POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
 		InputStream inp = null;
         try {
@@ -137,14 +137,14 @@ public class ExtractorFactory {
             if(inp != null) inp.close();
         }
     }
-	
+
 	public static POITextExtractor createExtractor(InputStream inp) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
 		// Figure out the kind of stream
 		// If clearly doesn't do mark/reset, wrap up
 		if(! inp.markSupported()) {
 			inp = new PushbackInputStream(inp, 8);
 		}
-		
+
 		if(POIFSFileSystem.hasPOIFSHeader(inp)) {
 			return createExtractor(new POIFSFileSystem(inp));
 		}
@@ -153,16 +153,16 @@ public class ExtractorFactory {
 		}
 		throw new IllegalArgumentException("Your InputStream was neither an OLE2 stream, nor an OOXML stream");
 	}
-	
+
 	public static POIXMLTextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException {
-       PackageRelationshipCollection core = 
+       PackageRelationshipCollection core =
             pkg.getRelationshipsByType(CORE_DOCUMENT_REL);
        if(core.size() != 1) {
           throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size());
        }
 
        PackagePart corePart = pkg.getPart(core.getRelationship(0));
-        
+
        // Is it XSSF?
        for(XSSFRelation rel : XSSFExcelExtractor.SUPPORTED_TYPES) {
           if(corePart.getContentType().equals(rel.getContentType())) {
@@ -173,84 +173,98 @@ public class ExtractorFactory {
              }
           }
        }
-        
+
        // Is it XWPF?
        for(XWPFRelation rel : XWPFWordExtractor.SUPPORTED_TYPES) {
           if(corePart.getContentType().equals(rel.getContentType())) {
              return new XWPFWordExtractor(pkg);
           }
        }
-       
+
        // Is it XSLF?
        for(XSLFRelation rel : XSLFPowerPointExtractor.SUPPORTED_TYPES) {
           if(corePart.getContentType().equals(rel.getContentType())) {
              return new XSLFPowerPointExtractor(pkg);
           }
        }
-       
+
        throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+corePart.getContentType()+")");
 	}
-	
+
 	public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
 	   // Only ever an OLE2 one from the root of the FS
-		return (POIOLE2TextExtractor)createExtractor(fs.getRoot(), fs);
+		return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
 	}
-	public static POITextExtractor createExtractor(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
-		// Look for certain entries in the stream, to figure it
-		//  out from
-		for(Iterator<Entry> entries = poifsDir.getEntries(); entries.hasNext(); ) {
-			Entry entry = entries.next();
-			
-			if(entry.getName().equals("Workbook")) {
-			   if(getPreferEventExtractor()) {
-               return new EventBasedExcelExtractor(poifsDir, fs);
-			   } else {
-			      return new ExcelExtractor(poifsDir, fs);
-			   }
-			}
-			if(entry.getName().equals("WordDocument")) {
-			    // Old or new style word document?
-			    try {
-			        return new WordExtractor(poifsDir, fs);
-			    } catch(OldWordFileFormatException e) {
-			        return new Word6Extractor(poifsDir, fs);
-			    }
-			}
-			if(entry.getName().equals("PowerPoint Document")) {
-				return new PowerPointExtractor(poifsDir, fs);
-			}
-			if(entry.getName().equals("VisioDocument")) {
-				return new VisioTextExtractor(poifsDir, fs);
-			}
-         if(entry.getName().equals("Quill")) {
-            return new PublisherTextExtractor(poifsDir, fs);
-         }
-			if(
-                entry.getName().equals("__substg1.0_1000001E") ||
-                entry.getName().equals("__substg1.0_1000001F") ||
-                entry.getName().equals("__substg1.0_0047001E") ||
-                entry.getName().equals("__substg1.0_0047001F") ||
-                entry.getName().equals("__substg1.0_0037001E") ||
-                entry.getName().equals("__substg1.0_0037001F")
-			) {
-			   return new OutlookTextExtactor(poifsDir, fs);
-			}
-			if(entry.getName().equals("Package")) {
-			   OPCPackage pkg = OPCPackage.open(
-			         poifsDir.createDocumentInputStream(entry.getName())
-			   );
-			   return createExtractor(pkg);
-			}
-		}
-		throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
-	}
-	
-	
+
+    /**
+     * @deprecated Use {@link #createExtractor(DirectoryNode)} instead
+     */
+    @Deprecated
+    @SuppressWarnings("unused")
+    public static POITextExtractor createExtractor(DirectoryNode poifsDir, POIFSFileSystem fs)
+            throws IOException, InvalidFormatException, OpenXML4JException, XmlException
+    {
+        return createExtractor(poifsDir);
+    }
+
+    public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException,
+            InvalidFormatException, OpenXML4JException, XmlException
+    {
+        // Look for certain well-known entries in the stream to figure
+        // out which kind of document it is
+        if (poifsDir.hasEntry("Workbook")) {
+            if (getPreferEventExtractor()) {
+                return new EventBasedExcelExtractor(poifsDir);
+            }
+            return new ExcelExtractor(poifsDir);
+        }
+
+        if (poifsDir.hasEntry("WordDocument")) {
+            // Old or new style word document?
+            try {
+                return new WordExtractor(poifsDir);
+            } catch (OldWordFileFormatException e) {
+                return new Word6Extractor(poifsDir);
+            }
+        }
+
+        if (poifsDir.hasEntry("PowerPoint Document")) {
+            return new PowerPointExtractor(poifsDir);
+        }
+
+        if (poifsDir.hasEntry("VisioDocument")) {
+            return new VisioTextExtractor(poifsDir);
+        }
+
+        if (poifsDir.hasEntry("Quill")) {
+            return new PublisherTextExtractor(poifsDir);
+        }
+
+        if (poifsDir.hasEntry("__substg1.0_1000001E") || poifsDir.hasEntry("__substg1.0_1000001F")
+                || poifsDir.hasEntry("__substg1.0_0047001E")
+                || poifsDir.hasEntry("__substg1.0_0047001F")
+                || poifsDir.hasEntry("__substg1.0_0037001E")
+                || poifsDir.hasEntry("__substg1.0_0037001F"))
+        {
+            return new OutlookTextExtactor(poifsDir);
+        }
+
+        for (Iterator<Entry> entries = poifsDir.getEntries(); entries.hasNext();) {
+            Entry entry = entries.next();
+
+            if (entry.getName().equals("Package")) {
+                OPCPackage pkg = OPCPackage.open(poifsDir.createDocumentInputStream("Package"));
+                return createExtractor(pkg);
+            }
+        }
+        throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
+    }
+
 	/**
 	 * Returns an array of text extractors, one for each of
 	 *  the embeded documents in the file (if there are any).
 	 * If there are no embeded documents, you'll get back an
-	 *  empty array. Otherwise, you'll get one open 
+	 *  empty array. Otherwise, you'll get one open
 	 *  {@link POITextExtractor} for each embeded file.
 	 */
 	public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
@@ -258,16 +272,16 @@ public class ExtractorFactory {
 		ArrayList<Entry> dirs = new ArrayList<Entry>();
 		// For anything else not directly held in as a POIFS directory
 		ArrayList<InputStream> nonPOIFS = new ArrayList<InputStream>();
-		
+
       // Find all the embeded directories
-		POIFSFileSystem fs = ext.getFileSystem();
-		if(fs == null) {
+		DirectoryEntry root = ext.getRoot();
+		if(root == null) {
 			throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
 		}
-		
+
 		if(ext instanceof ExcelExtractor) {
 			// These are in MBD... under the root
-			Iterator<Entry> it = fs.getRoot().getEntries();
+			Iterator<Entry> it = root.getEntries();
 			while(it.hasNext()) {
 				Entry entry = it.next();
 				if(entry.getName().startsWith("MBD")) {
@@ -278,7 +292,7 @@ public class ExtractorFactory {
 			// These are in ObjectPool -> _... under the root
 			try {
 				DirectoryEntry op = (DirectoryEntry)
-					fs.getRoot().getEntry("ObjectPool");
+				        root.getEntry("ObjectPool");
 				Iterator<Entry> it = op.getEntries();
 				while(it.hasNext()) {
 					Entry entry = it.next();
@@ -302,7 +316,7 @@ public class ExtractorFactory {
 		      }
 		   }
 		}
-		
+
 		// Create the extractors
 		if(
 		      (dirs == null || dirs.size() == 0) &&
@@ -310,11 +324,11 @@ public class ExtractorFactory {
 		){
 			return new POITextExtractor[0];
 		}
-		
+
 		ArrayList<POITextExtractor> e = new ArrayList<POITextExtractor>();
 		for(int i=0; i<dirs.size(); i++) {
 			e.add( createExtractor(
-					(DirectoryNode)dirs.get(i), ext.getFileSystem()
+					(DirectoryNode)dirs.get(i)
 			) );
 		}
 		for(int i=0; i<nonPOIFS.size(); i++) {
@@ -336,7 +350,7 @@ public class ExtractorFactory {
 	 * Returns an array of text extractors, one for each of
 	 *  the embeded documents in the file (if there are any).
 	 * If there are no embeded documents, you'll get back an
-	 *  empty array. Otherwise, you'll get one open 
+	 *  empty array. Otherwise, you'll get one open
 	 *  {@link POITextExtractor} for each embeded file.
 	 */
 	public static POITextExtractor[] getEmbededDocsTextExtractors(POIXMLTextExtractor ext) {

Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java?rev=1155336&r1=1155335&r2=1155336&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java Tue Aug  9 12:38:52 2011
@@ -23,6 +23,8 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
 
+import org.apache.poi.hwpf.usermodel.ObjectPoolImpl;
+
 import org.apache.poi.hwpf.model.BookmarksTables;
 import org.apache.poi.hwpf.model.CHPBinTable;
 import org.apache.poi.hwpf.model.CPSplitCalculator;
@@ -190,7 +192,9 @@ public final class HWPFDocument extends 
    * @param pfilesystem The POIFSFileSystem that contains the Word document.
    * @throws IOException If there is an unexpected IOException from the passed
    *         in POIFSFileSystem.
+   * @deprecated Use {@link #HWPFDocument(DirectoryNode)} instead
    */
+  @Deprecated
   public HWPFDocument(DirectoryNode directory, POIFSFileSystem pfilesystem) throws IOException
   {
      this(directory);

Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocumentCore.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocumentCore.java?rev=1155336&r1=1155335&r2=1155336&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocumentCore.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocumentCore.java Tue Aug  9 12:38:52 2011
@@ -17,10 +17,17 @@
 
 package org.apache.poi.hwpf;
 
+import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.PushbackInputStream;
 
+import org.apache.poi.hwpf.usermodel.ObjectsPool;
+
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+
+import org.apache.poi.hwpf.usermodel.ObjectPoolImpl;
+
 import org.apache.poi.EncryptedDocumentException;
 import org.apache.poi.POIDocument;
 import org.apache.poi.hwpf.model.CHPBinTable;
@@ -46,6 +53,9 @@ import org.apache.poi.util.Internal;
  */
 public abstract class HWPFDocumentCore extends POIDocument
 {
+  /** Holds OLE2 objects */
+  protected ObjectPoolImpl _objectPool;
+
   /** The FIB */
   protected FileInformationBlock _fib;
 
@@ -148,7 +158,21 @@ public abstract class HWPFDocumentCore e
     if(_fib.isFEncrypted()) {
     	throw new EncryptedDocumentException("Cannot process encrypted word files!");
     }
-  }
+
+        {
+            DirectoryEntry objectPoolEntry;
+            try
+            {
+                objectPoolEntry = (DirectoryEntry) directory
+                        .getEntry( "ObjectPool" );
+            }
+            catch ( FileNotFoundException exc )
+            {
+                objectPoolEntry = directory.createDirectory( "ObjectPool" );
+            }
+            _objectPool = new ObjectPoolImpl( objectPoolEntry );
+        }
+    }
 
     /**
      * Returns the range which covers the whole of the document, but excludes
@@ -211,5 +235,10 @@ public abstract class HWPFDocumentCore e
     return _fib;
   }
 
+    public ObjectsPool getObjectsPool()
+    {
+        return _objectPool;
+    }
+
     public abstract TextPieceTable getTextTable();
 }

Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java?rev=1155336&r1=1155335&r2=1155336&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java Tue Aug  9 12:38:52 2011
@@ -44,6 +44,7 @@ public class HWPFOldDocument extends HWP
         this(fs.getRoot());
     }
 
+    @Deprecated
     public HWPFOldDocument(DirectoryNode directory, POIFSFileSystem fs)
             throws IOException {
        this(directory);

Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordConverter.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordConverter.java?rev=1155336&r1=1155335&r2=1155336&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordConverter.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordConverter.java Tue Aug  9 12:38:52 2011
@@ -47,6 +47,7 @@ import org.apache.poi.hwpf.usermodel.Sec
 import org.apache.poi.hwpf.usermodel.Table;
 import org.apache.poi.hwpf.usermodel.TableCell;
 import org.apache.poi.hwpf.usermodel.TableRow;
+import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.util.Beta;
 import org.apache.poi.util.POILogFactory;
 import org.apache.poi.util.POILogger;
@@ -56,6 +57,32 @@ import org.w3c.dom.Element;
 @Beta
 public abstract class AbstractWordConverter
 {
+    private static final class Structure implements Comparable<Structure>
+    {
+        final int end;
+        final int start;
+        final Object structure;
+
+        Structure( Bookmark bookmark )
+        {
+            this.start = bookmark.getStart();
+            this.end = bookmark.getEnd();
+            this.structure = bookmark;
+        }
+
+        Structure( Field field )
+        {
+            this.start = field.getFieldStartOffset();
+            this.end = field.getFieldEndOffset();
+            this.structure = field;
+        }
+
+        public int compareTo( Structure o )
+        {
+            return start < o.start ? -1 : start == o.start ? 0 : 1;
+        }
+    }
+
     private static final byte BEL_MARK = 7;
 
     private static final byte FIELD_BEGIN_MARK = 19;
@@ -396,6 +423,13 @@ public abstract class AbstractWordConver
                     processDrawnObject( doc, characterRun, block );
                     continue;
                 }
+                if ( characterRun.isOle2()
+                        && ( wordDocument instanceof HWPFDocument ) )
+                {
+                    HWPFDocument doc = (HWPFDocument) wordDocument;
+                    processOle2( doc, characterRun, block );
+                    continue;
+                }
             }
 
             if ( text.getBytes()[0] == FIELD_BEGIN_MARK )
@@ -613,10 +647,11 @@ public abstract class AbstractWordConver
             CharacterRun characterRun, OfficeDrawing officeDrawing,
             String path, Element block );
 
-    protected abstract void processEndnoteAutonumbered( HWPFDocument wordDocument,
-            int noteIndex, Element block, Range endnoteTextRange );
+    protected abstract void processEndnoteAutonumbered(
+            HWPFDocument wordDocument, int noteIndex, Element block,
+            Range endnoteTextRange );
 
-    protected void processField( HWPFDocument hwpfDocument, Range parentRange,
+    protected void processField( HWPFDocument wordDocument, Range parentRange,
             int currentTableLevel, Field field, Element currentBlock )
     {
         switch ( field.getType() )
@@ -633,7 +668,7 @@ public abstract class AbstractWordConver
                 if ( matcher.find() )
                 {
                     String pageref = matcher.group( 1 );
-                    processPageref( hwpfDocument, currentBlock,
+                    processPageref( wordDocument, currentBlock,
                             field.secondSubrange( parentRange ),
                             currentTableLevel, pageref );
                     return;
@@ -641,6 +676,36 @@ public abstract class AbstractWordConver
             }
             break;
         }
+        case 58: // Embedded Object
+        {
+            if ( !field.hasSeparator() )
+            {
+                logger.log( POILogger.WARN, parentRange + " contains " + field
+                        + " with 'Embedded Object' but without separator mark" );
+                return;
+            }
+
+            CharacterRun separator = field
+                    .getMarkSeparatorCharacterRun( parentRange );
+
+            if ( separator.isOle2() )
+            {
+                // the only supported so far
+                boolean processed = processOle2( wordDocument, separator,
+                        currentBlock );
+
+                // if we didn't output OLE - output field value
+                if ( !processed )
+                {
+                    processCharacters( wordDocument, currentTableLevel,
+                            field.secondSubrange( parentRange ), currentBlock );
+                }
+
+                return;
+            }
+
+            break;
+        }
         case 88: // hyperlink
         {
             final Range firstSubrange = field.firstSubrange( parentRange );
@@ -653,7 +718,7 @@ public abstract class AbstractWordConver
                 if ( matcher.find() )
                 {
                     String hyperlink = matcher.group( 1 );
-                    processHyperlink( hwpfDocument, currentBlock,
+                    processHyperlink( wordDocument, currentBlock,
                             field.secondSubrange( parentRange ),
                             currentTableLevel, hyperlink );
                     return;
@@ -665,12 +730,13 @@ public abstract class AbstractWordConver
 
         logger.log( POILogger.WARN, parentRange + " contains " + field
                 + " with unsupported type or format" );
-        processCharacters( hwpfDocument, currentTableLevel,
+        processCharacters( wordDocument, currentTableLevel,
                 field.secondSubrange( parentRange ), currentBlock );
     }
 
-    protected abstract void processFootnoteAutonumbered( HWPFDocument wordDocument,
-            int noteIndex, Element block, Range footnoteTextRange );
+    protected abstract void processFootnoteAutonumbered(
+            HWPFDocument wordDocument, int noteIndex, Element block,
+            Range footnoteTextRange );
 
     protected abstract void processHyperlink( HWPFDocumentCore wordDocument,
             Element currentBlock, Range textRange, int currentTableLevel,
@@ -732,6 +798,40 @@ public abstract class AbstractWordConver
         }
     }
 
+    private boolean processOle2( HWPFDocument doc, CharacterRun characterRun,
+            Element block )
+    {
+        Entry entry = doc.getObjectsPool().getObjectById(
+                "_" + characterRun.getPicOffset() );
+        if ( entry == null )
+        {
+            logger.log( POILogger.WARN, "Referenced OLE2 object '",
+                    Integer.valueOf( characterRun.getPicOffset() ),
+                    "' not found in ObjectPool" );
+            return false;
+        }
+
+        try
+        {
+            return processOle2( doc, block, entry );
+        }
+        catch ( Exception exc )
+        {
+            logger.log( POILogger.WARN,
+                    "Unable to convert internal OLE2 object '",
+                    Integer.valueOf( characterRun.getPicOffset() ), "': ", exc,
+                    exc );
+            return false;
+        }
+    }
+
+    @SuppressWarnings( "unused" )
+    protected boolean processOle2( HWPFDocument wordDocument, Element block,
+            Entry entry ) throws Exception
+    {
+        return false;
+    }
+
     protected abstract void processPageref( HWPFDocumentCore wordDocument,
             Element currentBlock, Range textRange, int currentTableLevel,
             String pageref );
@@ -896,30 +996,4 @@ public abstract class AbstractWordConver
         return endMark;
     }
 
-    private static final class Structure implements Comparable<Structure>
-    {
-        final int end;
-        final int start;
-        final Object structure;
-
-        Structure( Bookmark bookmark )
-        {
-            this.start = bookmark.getStart();
-            this.end = bookmark.getEnd();
-            this.structure = bookmark;
-        }
-
-        Structure( Field field )
-        {
-            this.start = field.getFieldStartOffset();
-            this.end = field.getFieldEndOffset();
-            this.structure = field;
-        }
-
-        public int compareTo( Structure o )
-        {
-            return start < o.start ? -1 : start == o.start ? 0 : 1;
-        }
-    }
-
 }

Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordUtils.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordUtils.java?rev=1155336&r1=1155335&r2=1155336&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordUtils.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordUtils.java Tue Aug  9 12:38:52 2011
@@ -34,6 +34,7 @@ import org.apache.poi.hwpf.usermodel.Par
 import org.apache.poi.hwpf.usermodel.Table;
 import org.apache.poi.hwpf.usermodel.TableCell;
 import org.apache.poi.hwpf.usermodel.TableRow;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.poi.util.Beta;
 import org.apache.poi.util.IOUtils;
@@ -422,6 +423,19 @@ public class AbstractWordUtils
         return !isEmpty( str );
     }
 
+    public static HWPFDocumentCore loadDoc( final DirectoryNode root )
+            throws IOException
+    {
+        try
+        {
+            return new HWPFDocument( root );
+        }
+        catch ( OldWordFileFormatException exc )
+        {
+            return new HWPFOldDocument( root );
+        }
+    }
+
     public static HWPFDocumentCore loadDoc( File docFile ) throws IOException
     {
         final FileInputStream istream = new FileInputStream( docFile );
@@ -438,16 +452,13 @@ public class AbstractWordUtils
     public static HWPFDocumentCore loadDoc( InputStream inputStream )
             throws IOException
     {
-        final POIFSFileSystem poifsFileSystem = HWPFDocumentCore
-                .verifyAndBuildPOIFS( inputStream );
-        try
-        {
-            return new HWPFDocument( poifsFileSystem );
-        }
-        catch ( OldWordFileFormatException exc )
-        {
-            return new HWPFOldDocument( poifsFileSystem );
-        }
+        return loadDoc( HWPFDocumentCore.verifyAndBuildPOIFS( inputStream ) );
+    }
+
+    public static HWPFDocumentCore loadDoc(
+            final POIFSFileSystem poifsFileSystem ) throws IOException
+    {
+        return loadDoc( poifsFileSystem.getRoot() );
     }
 
     static String substringBeforeLast( String str, String separator )

Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToFoConverter.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToFoConverter.java?rev=1155336&r1=1155335&r2=1155336&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToFoConverter.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToFoConverter.java Tue Aug  9 12:38:52 2011
@@ -276,8 +276,8 @@ public class WordToFoConverter extends A
     }
 
     @Override
-    protected void processEndnoteAutonumbered( HWPFDocument wordDocument, int noteIndex,
-            Element block, Range endnoteTextRange )
+    protected void processEndnoteAutonumbered( HWPFDocument wordDocument,
+            int noteIndex, Element block, Range endnoteTextRange )
     {
         final String textIndex = String.valueOf( internalLinkCounter
                 .incrementAndGet() );
@@ -297,7 +297,8 @@ public class WordToFoConverter extends A
         setId( backwardLink, forwardLinkName );
         endnote.appendChild( backwardLink );
 
-        processCharacters( wordDocument, Integer.MIN_VALUE, endnoteTextRange, endnote );
+        processCharacters( wordDocument, Integer.MIN_VALUE, endnoteTextRange,
+                endnote );
 
         WordToFoUtils.compactInlines( endnote );
         this.endnotes.add( endnote );

Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToHtmlConverter.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToHtmlConverter.java?rev=1155336&r1=1155335&r2=1155336&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToHtmlConverter.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToHtmlConverter.java Tue Aug  9 12:38:52 2011
@@ -63,7 +63,6 @@ import static org.apache.poi.hwpf.conver
 @Beta
 public class WordToHtmlConverter extends AbstractWordConverter
 {
-
     /**
      * Holds properties values, applied to current <tt>p</tt> element. Those
      * properties shall not be doubled in children <tt>span</tt> elements.
@@ -282,10 +281,11 @@ public class WordToHtmlConverter extends
     }
 
     @Override
-    protected void processEndnoteAutonumbered( HWPFDocument wordDocument, int noteIndex,
-            Element block, Range endnoteTextRange )
+    protected void processEndnoteAutonumbered( HWPFDocument wordDocument,
+            int noteIndex, Element block, Range endnoteTextRange )
     {
-        processNoteAutonumbered( wordDocument, "end", noteIndex, block, endnoteTextRange );
+        processNoteAutonumbered( wordDocument, "end", noteIndex, block,
+                endnoteTextRange );
     }
 
     @Override

Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToTextConverter.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToTextConverter.java?rev=1155336&r1=1155335&r2=1155336&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToTextConverter.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToTextConverter.java Tue Aug  9 12:38:52 2011
@@ -2,10 +2,14 @@ package org.apache.poi.hwpf.converter;
 
 import java.io.File;
 import java.io.FileWriter;
+import java.io.StringWriter;
+import java.lang.reflect.Method;
 import java.util.List;
 import java.util.concurrent.atomic.AtomicInteger;
 
+import javax.xml.parsers.DocumentBuilder;
 import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
 import javax.xml.transform.OutputKeys;
 import javax.xml.transform.Transformer;
 import javax.xml.transform.TransformerFactory;
@@ -25,6 +29,8 @@ import org.apache.poi.hwpf.usermodel.Sec
 import org.apache.poi.hwpf.usermodel.Table;
 import org.apache.poi.hwpf.usermodel.TableCell;
 import org.apache.poi.hwpf.usermodel.TableRow;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.util.Beta;
 import org.w3c.dom.Document;
 import org.w3c.dom.Element;
@@ -33,6 +39,29 @@ import org.w3c.dom.Element;
 public class WordToTextConverter extends AbstractWordConverter
 {
 
+    public static String getText( DirectoryNode root ) throws Exception
+    {
+        final HWPFDocumentCore wordDocument = AbstractWordUtils.loadDoc( root );
+        return getText( wordDocument );
+    }
+
+    public static String getText( File docFile ) throws Exception
+    {
+        final HWPFDocumentCore wordDocument = AbstractWordUtils
+                .loadDoc( docFile );
+        return getText( wordDocument );
+    }
+
+    public static String getText( final HWPFDocumentCore wordDocument )
+            throws Exception
+    {
+        WordToTextConverter wordToTextConverter = new WordToTextConverter(
+                DocumentBuilderFactory.newInstance().newDocumentBuilder()
+                        .newDocument() );
+        wordToTextConverter.processDocument( wordDocument );
+        return wordToTextConverter.getText();
+    }
+
     /**
      * Java main() interface to interact with {@link WordToTextConverter}
      * 
@@ -91,12 +120,28 @@ public class WordToTextConverter extends
 
     private Element notes = null;
 
+    private boolean outputSummaryInformation = false;
+
     private final TextDocumentFacade textDocumentFacade;
 
     /**
      * Creates new instance of {@link WordToTextConverter}. Can be used for
      * output several {@link HWPFDocument}s into single text document.
      * 
+     * @throws ParserConfigurationException
+     *             if an internal {@link DocumentBuilder} cannot be created
+     */
+    public WordToTextConverter() throws ParserConfigurationException
+    {
+        this.textDocumentFacade = new TextDocumentFacade(
+                DocumentBuilderFactory.newInstance().newDocumentBuilder()
+                        .newDocument() );
+    }
+
+    /**
+     * Creates new instance of {@link WordToTextConverter}. Can be used for
+     * output several {@link HWPFDocument}s into single text document.
+     * 
      * @param document
      *            XML DOM Document used as storage for text pieces
      */
@@ -110,6 +155,28 @@ public class WordToTextConverter extends
         return textDocumentFacade.getDocument();
     }
 
+    public String getText() throws Exception
+    {
+        StringWriter stringWriter = new StringWriter();
+        DOMSource domSource = new DOMSource( getDocument() );
+        StreamResult streamResult = new StreamResult( stringWriter );
+
+        TransformerFactory tf = TransformerFactory.newInstance();
+        Transformer serializer = tf.newTransformer();
+        // TODO set encoding from a command argument
+        serializer.setOutputProperty( OutputKeys.ENCODING, "UTF-8" );
+        serializer.setOutputProperty( OutputKeys.INDENT, "no" );
+        serializer.setOutputProperty( OutputKeys.METHOD, "text" );
+        serializer.transform( domSource, streamResult );
+
+        return stringWriter.toString();
+    }
+
+    public boolean isOutputSummaryInformation()
+    {
+        return outputSummaryInformation;
+    }
+
     @Override
     protected void outputCharacters( Element block, CharacterRun characterRun,
             String text )
@@ -138,18 +205,24 @@ public class WordToTextConverter extends
     protected void processDocumentInformation(
             SummaryInformation summaryInformation )
     {
-        if ( AbstractWordUtils.isNotEmpty( summaryInformation.getTitle() ) )
-            textDocumentFacade.setTitle( summaryInformation.getTitle() );
-
-        if ( AbstractWordUtils.isNotEmpty( summaryInformation.getAuthor() ) )
-            textDocumentFacade.addAuthor( summaryInformation.getAuthor() );
+        if ( isOutputSummaryInformation() )
+        {
+            if ( AbstractWordUtils.isNotEmpty( summaryInformation.getTitle() ) )
+                textDocumentFacade.setTitle( summaryInformation.getTitle() );
 
-        if ( AbstractWordUtils.isNotEmpty( summaryInformation.getComments() ) )
-            textDocumentFacade
-                    .addDescription( summaryInformation.getComments() );
+            if ( AbstractWordUtils.isNotEmpty( summaryInformation.getAuthor() ) )
+                textDocumentFacade.addAuthor( summaryInformation.getAuthor() );
 
-        if ( AbstractWordUtils.isNotEmpty( summaryInformation.getKeywords() ) )
-            textDocumentFacade.addKeywords( summaryInformation.getKeywords() );
+            if ( AbstractWordUtils
+                    .isNotEmpty( summaryInformation.getComments() ) )
+                textDocumentFacade.addDescription( summaryInformation
+                        .getComments() );
+
+            if ( AbstractWordUtils
+                    .isNotEmpty( summaryInformation.getKeywords() ) )
+                textDocumentFacade.addKeywords( summaryInformation
+                        .getKeywords() );
+        }
     }
 
     @Override
@@ -223,6 +296,48 @@ public class WordToTextConverter extends
     }
 
     @Override
+    protected boolean processOle2( HWPFDocument wordDocument, Element block,
+            Entry entry ) throws Exception
+    {
+        if ( !( entry instanceof DirectoryNode ) )
+            return false;
+        DirectoryNode directoryNode = (DirectoryNode) entry;
+
+        // even if no ExtractorFactory in classpath
+        if ( directoryNode.hasEntry( "WordDocument" ) )
+        {
+            String text = WordToTextConverter.getText( (DirectoryNode) entry );
+            block.appendChild( textDocumentFacade
+                    .createText( UNICODECHAR_ZERO_WIDTH_SPACE + text
+                            + UNICODECHAR_ZERO_WIDTH_SPACE ) );
+            return true;
+        }
+
+        try
+        {
+            Class<?> cls = Class
+                    .forName( "org.apache.poi.extractor.ExtractorFactory" );
+            Method createExtractor = cls.getMethod( "createExtractor",
+                    DirectoryNode.class );
+            Object extractor = createExtractor.invoke( null, directoryNode );
+
+            Method getText = extractor.getClass().getMethod( "getText" );
+            String text = (String) getText.invoke( extractor );
+
+            block.appendChild( textDocumentFacade
+                    .createText( UNICODECHAR_ZERO_WIDTH_SPACE + text
+                            + UNICODECHAR_ZERO_WIDTH_SPACE ) );
+            return true;
+        }
+        catch ( ClassNotFoundException exc )
+        {
+            // no extractor in classpath
+        }
+
+        return false;
+    }
+
+    @Override
     protected void processPageref( HWPFDocumentCore wordDocument,
             Element currentBlock, Range textRange, int currentTableLevel,
             String pageref )
@@ -254,7 +369,7 @@ public class WordToTextConverter extends
         textDocumentFacade.body.appendChild( sectionElement );
     }
 
-    protected void processTable( HWPFDocumentCore hwpfDocument, Element flow,
+    protected void processTable( HWPFDocumentCore wordDocument, Element flow,
             Table table )
     {
         final int tableRows = table.numRows();
@@ -275,8 +390,8 @@ public class WordToTextConverter extends
                     tableCellElement.appendChild( textDocumentFacade
                             .createText( "\t" ) );
 
-                processParagraphes( hwpfDocument, tableCellElement, tableCell,
-                        table.getTableLevel() );
+                processCharacters( wordDocument, table.getTableLevel(),
+                        tableCell, tableCellElement );
                 tableRowElement.appendChild( tableCellElement );
             }
 
@@ -285,4 +400,9 @@ public class WordToTextConverter extends
         }
     }
 
+    public void setOutputSummaryInformation( boolean outputDocumentInformation )
+    {
+        this.outputSummaryInformation = outputDocumentInformation;
+    }
+
 }

Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java?rev=1155336&r1=1155335&r2=1155336&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java Tue Aug  9 12:38:52 2011
@@ -19,6 +19,10 @@ package org.apache.poi.hwpf.extractor;
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.StringWriter;
+
+import org.apache.poi.hwpf.converter.WordToTextConverter;
+import org.apache.poi.hwpf.usermodel.HeaderStories;
 
 import org.apache.poi.POIOLE2TextExtractor;
 import org.apache.poi.hwpf.HWPFOldDocument;
@@ -47,16 +51,32 @@ public final class Word6Extractor extend
 		this( new POIFSFileSystem(is) );
 	}
 
-	/**
-	 * Create a new Word Extractor
-	 * @param fs POIFSFileSystem containing the word file
-	 */
-	public Word6Extractor(POIFSFileSystem fs) throws IOException {
-		this(fs.getRoot(), fs);
-	}
-	public Word6Extractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
-	    this(new HWPFOldDocument(dir,fs));
-	}
+    /**
+     * Create a new Word Extractor
+     * 
+     * @param fs
+     *            POIFSFileSystem containing the word file
+     */
+    public Word6Extractor( POIFSFileSystem fs ) throws IOException
+    {
+        this( fs.getRoot() );
+    }
+
+    /**
+     * @deprecated Use {@link #Word6Extractor(DirectoryNode)} instead
+     */
+    @Deprecated
+    @SuppressWarnings( "unused" )
+    public Word6Extractor( DirectoryNode dir, POIFSFileSystem fs )
+            throws IOException
+    {
+        this( dir );
+    }
+
+    public Word6Extractor( DirectoryNode dir ) throws IOException
+    {
+        this( new HWPFOldDocument( dir ) );
+    }
 
 	/**
 	 * Create a new Word Extractor
@@ -71,6 +91,7 @@ public final class Word6Extractor extend
      * Get the text from the word file, as an array with one String
      *  per paragraph
      */
+	@Deprecated
 	public String[] getParagraphText() {
 	    String[] ret;
 
@@ -95,13 +116,25 @@ public final class Word6Extractor extend
 	    return ret;
 	}
 
-    public String getText() {
-        StringBuffer text = new StringBuffer();
-        
-        for(String t : getParagraphText()) {
-            text.append(t);
+    public String getText()
+    {
+        try
+        {
+            WordToTextConverter wordToTextConverter = new WordToTextConverter();
+            wordToTextConverter.processDocument( doc );
+            return wordToTextConverter.getText();
         }
+        catch ( Exception exc )
+        {
+            // fall-back
+            StringBuffer text = new StringBuffer();
+
+            for ( String t : getParagraphText() )
+            {
+                text.append( t );
+            }
 
-        return text.toString();
+            return text.toString();
+        }
     }
 }

Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java?rev=1155336&r1=1155335&r2=1155336&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java Tue Aug  9 12:38:52 2011
@@ -20,9 +20,12 @@ package org.apache.poi.hwpf.extractor;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.StringWriter;
 import java.util.ArrayList;
 import java.util.Arrays;
 
+import org.apache.poi.hwpf.converter.WordToTextConverter;
+
 import org.apache.poi.POIOLE2TextExtractor;
 import org.apache.poi.hwpf.HWPFDocument;
 import org.apache.poi.hwpf.usermodel.HeaderStories;
@@ -33,231 +36,300 @@ import org.apache.poi.poifs.filesystem.P
 
 /**
  * Class to extract the text from a Word Document.
- *
- * You should use either getParagraphText() or getText() unless
- *  you have a strong reason otherwise.
- *
+ * 
+ * You should use either getParagraphText() or getText() unless you have a
+ * strong reason otherwise.
+ * 
  * @author Nick Burch
  */
-public final class WordExtractor extends POIOLE2TextExtractor {
-	private POIFSFileSystem fs;
-	private HWPFDocument doc;
-
-	/**
-	 * Create a new Word Extractor
-	 * @param is InputStream containing the word file
-	 */
-	public WordExtractor(InputStream is) throws IOException {
-		this( HWPFDocument.verifyAndBuildPOIFS(is) );
-	}
-
-	/**
-	 * Create a new Word Extractor
-	 * @param fs POIFSFileSystem containing the word file
-	 */
-	public WordExtractor(POIFSFileSystem fs) throws IOException {
-		this(new HWPFDocument(fs));
-		this.fs = fs;
-	}
-	public WordExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
-		this(new HWPFDocument(dir, fs));
-		this.fs = fs;
-	}
-
-	/**
-	 * Create a new Word Extractor
-	 * @param doc The HWPFDocument to extract from
-	 */
-	public WordExtractor(HWPFDocument doc) {
-		super(doc);
-		this.doc = doc;
-	}
-
-	/**
-	 * Command line extractor, so people will stop moaning that
-	 *  they can't just run this.
-	 */
-	public static void main(String[] args) throws IOException {
-		if(args.length == 0) {
-			System.err.println("Use:");
-			System.err.println("   java org.apache.poi.hwpf.extractor.WordExtractor <filename>");
-			System.exit(1);
-		}
-
-		// Process the first argument as a file
-		FileInputStream fin = new FileInputStream(args[0]);
-		WordExtractor extractor = new WordExtractor(fin);
-		System.out.println(extractor.getText());
-	}
-
-	/**
-	 * Get the text from the word file, as an array with one String
-	 *  per paragraph
-	 */
-        public String[] getParagraphText() {
-                String[] ret;
-
-                // Extract using the model code
-                try {
-                        Range r = doc.getRange();
-
-                        ret = getParagraphText(r);
-                } catch (Exception e) {
-                        // Something's up with turning the text pieces into paragraphs
-                        // Fall back to ripping out the text pieces
-                        ret = new String[1];
-                        ret[0] = getTextFromPieces();
-                }
-
-                return ret;
+public final class WordExtractor extends POIOLE2TextExtractor
+{
+    private HWPFDocument doc;
+
+    /**
+     * Create a new Word Extractor
+     * 
+     * @param is
+     *            InputStream containing the word file
+     */
+    public WordExtractor( InputStream is ) throws IOException
+    {
+        this( HWPFDocument.verifyAndBuildPOIFS( is ) );
+    }
+
+    /**
+     * Create a new Word Extractor
+     * 
+     * @param fs
+     *            POIFSFileSystem containing the word file
+     */
+    public WordExtractor( POIFSFileSystem fs ) throws IOException
+    {
+        this( new HWPFDocument( fs ) );
+    }
+
+    /**
+     * @deprecated Use {@link #WordExtractor(DirectoryNode)} instead
+     */
+    @Deprecated
+    public WordExtractor( DirectoryNode dir, POIFSFileSystem fs )
+            throws IOException
+    {
+        this( dir );
+    }
+
+    public WordExtractor( DirectoryNode dir ) throws IOException
+    {
+        this( new HWPFDocument( dir ) );
+    }
+
+    /**
+     * Create a new Word Extractor
+     * 
+     * @param doc
+     *            The HWPFDocument to extract from
+     */
+    public WordExtractor( HWPFDocument doc )
+    {
+        super( doc );
+        this.doc = doc;
+    }
+
+    /**
+     * Command line extractor, so people will stop moaning that they can't just
+     * run this.
+     */
+    public static void main( String[] args ) throws IOException
+    {
+        if ( args.length == 0 )
+        {
+            System.err.println( "Use:" );
+            System.err
+                    .println( "   java org.apache.poi.hwpf.extractor.WordExtractor <filename>" );
+            System.exit( 1 );
         }
 
-        public String[] getFootnoteText() {
-                Range r = doc.getFootnoteRange();
+        // Process the first argument as a file
+        FileInputStream fin = new FileInputStream( args[0] );
+        WordExtractor extractor = new WordExtractor( fin );
+        System.out.println( extractor.getText() );
+    }
+
+    /**
+     * Get the text from the word file, as an array with one String per
+     * paragraph
+     */
+    public String[] getParagraphText()
+    {
+        String[] ret;
+
+        // Extract using the model code
+        try
+        {
+            Range r = doc.getRange();
 
-                return getParagraphText(r);
+            ret = getParagraphText( r );
+        }
+        catch ( Exception e )
+        {
+            // Something's up with turning the text pieces into paragraphs
+            // Fall back to ripping out the text pieces
+            ret = new String[1];
+            ret[0] = getTextFromPieces();
         }
 
-        public String[] getMainTextboxText() {
-                Range r = doc.getMainTextboxRange();
+        return ret;
+    }
 
-                return getParagraphText(r);
+    public String[] getFootnoteText()
+    {
+        Range r = doc.getFootnoteRange();
+
+        return getParagraphText( r );
+    }
+
+    public String[] getMainTextboxText()
+    {
+        Range r = doc.getMainTextboxRange();
+
+        return getParagraphText( r );
+    }
+
+    public String[] getEndnoteText()
+    {
+        Range r = doc.getEndnoteRange();
+
+        return getParagraphText( r );
+    }
+
+    public String[] getCommentsText()
+    {
+        Range r = doc.getCommentsRange();
+
+        return getParagraphText( r );
+    }
+
+    protected static String[] getParagraphText( Range r )
+    {
+        String[] ret;
+        ret = new String[r.numParagraphs()];
+        for ( int i = 0; i < ret.length; i++ )
+        {
+            Paragraph p = r.getParagraph( i );
+            ret[i] = p.text();
+
+            // Fix the line ending
+            if ( ret[i].endsWith( "\r" ) )
+            {
+                ret[i] = ret[i] + "\n";
+            }
+        }
+        return ret;
+    }
+
+    /**
+     * Add the header/footer text, if it's not empty
+     */
+    private void appendHeaderFooter( String text, StringBuffer out )
+    {
+        if ( text == null || text.length() == 0 )
+            return;
+
+        text = text.replace( '\r', '\n' );
+        if ( !text.endsWith( "\n" ) )
+        {
+            out.append( text );
+            out.append( '\n' );
+            return;
+        }
+        if ( text.endsWith( "\n\n" ) )
+        {
+            out.append( text.substring( 0, text.length() - 1 ) );
+            return;
+        }
+        out.append( text );
+        return;
+    }
+
+    /**
+     * Grab the text from the headers
+     */
+    @Deprecated
+    public String getHeaderText()
+    {
+        HeaderStories hs = new HeaderStories( doc );
+
+        StringBuffer ret = new StringBuffer();
+        if ( hs.getFirstHeader() != null )
+        {
+            appendHeaderFooter( hs.getFirstHeader(), ret );
+        }
+        if ( hs.getEvenHeader() != null )
+        {
+            appendHeaderFooter( hs.getEvenHeader(), ret );
+        }
+        if ( hs.getOddHeader() != null )
+        {
+            appendHeaderFooter( hs.getOddHeader(), ret );
         }
 
-        public String[] getEndnoteText() {
-                Range r = doc.getEndnoteRange();
+        return ret.toString();
+    }
 
-                return getParagraphText(r);
+    /**
+     * Grab the text from the footers
+     */
+    @Deprecated
+    public String getFooterText()
+    {
+        HeaderStories hs = new HeaderStories( doc );
+
+        StringBuffer ret = new StringBuffer();
+        if ( hs.getFirstFooter() != null )
+        {
+            appendHeaderFooter( hs.getFirstFooter(), ret );
+        }
+        if ( hs.getEvenFooter() != null )
+        {
+            appendHeaderFooter( hs.getEvenFooter(), ret );
+        }
+        if ( hs.getOddFooter() != null )
+        {
+            appendHeaderFooter( hs.getOddFooter(), ret );
         }
 
-        public String[] getCommentsText() {
-                Range r = doc.getCommentsRange();
+        return ret.toString();
+    }
 
-                return getParagraphText(r);
+    /**
+     * Grab the text out of the text pieces. Might also include various bits of
+     * crud, but will work in cases where the text piece -> paragraph mapping is
+     * broken. Fast too.
+     */
+    public String getTextFromPieces()
+    {
+        String text = doc.getDocumentText();
+
+        // Fix line endings (Note - won't get all of them)
+        text = text.replaceAll( "\r\r\r", "\r\n\r\n\r\n" );
+        text = text.replaceAll( "\r\r", "\r\n\r\n" );
+
+        if ( text.endsWith( "\r" ) )
+        {
+            text += "\n";
         }
 
-        protected static String[] getParagraphText(Range r) {
-                String[] ret;
-                ret = new String[r.numParagraphs()];
-                for (int i = 0; i < ret.length; i++) {
-                        Paragraph p = r.getParagraph(i);
-                        ret[i] = p.text();
+        return text;
+    }
+
+    /**
+     * Grab the text, based on the WordToTextConverter. Shouldn't include any
+     * crud, but slower than getTextFromPieces().
+     */
+    public String getText()
+    {
+        try
+        {
+            final StringWriter stringWriter = new StringWriter();
+            @SuppressWarnings( "unused" )
+            WordToTextConverter wordToTextConverter = new WordToTextConverter()
+            {
+                {
+                    HeaderStories hs = new HeaderStories( doc );
+
+                    if ( hs.getFirstHeaderSubrange() != null )
+                        processDocumentPart( doc, hs.getFirstHeaderSubrange() );
+                    if ( hs.getEvenHeaderSubrange() != null )
+                        processDocumentPart( doc, hs.getEvenHeaderSubrange() );
+                    if ( hs.getOddHeaderSubrange() != null )
+                        processDocumentPart( doc, hs.getOddHeaderSubrange() );
+
+                    processDocument( doc );
+                    processDocumentPart( doc, doc.getMainTextboxRange() );
+
+                    if ( hs.getFirstFooterSubrange() != null )
+                        processDocumentPart( doc, hs.getFirstFooterSubrange() );
+                    if ( hs.getEvenFooterSubrange() != null )
+                        processDocumentPart( doc, hs.getEvenFooterSubrange() );
+                    if ( hs.getOddFooterSubrange() != null )
+                        processDocumentPart( doc, hs.getOddFooterSubrange() );
 
-                        // Fix the line ending
-                        if (ret[i].endsWith("\r")) {
-                                ret[i] = ret[i] + "\n";
-                        }
+                    stringWriter.append( getText() );
                 }
-                return ret;
+            };
+            return stringWriter.toString();
+        }
+        catch ( Exception exc )
+        {
+            throw new RuntimeException( exc );
         }
+    }
 
-        /**
-	 * Add the header/footer text, if it's not empty
-	 */
-	private void appendHeaderFooter(String text, StringBuffer out) {
-		if(text == null || text.length() == 0)
-			return;
-
-		text = text.replace('\r', '\n');
-		if(! text.endsWith("\n")) {
-			out.append(text);
-			out.append('\n');
-			return;
-		}
-		if(text.endsWith("\n\n")) {
-			out.append(text.substring(0, text.length()-1));
-			return;
-		}
-		out.append(text);
-		return;
-	}
-	/**
-	 * Grab the text from the headers
-	 */
-	public String getHeaderText() {
-		HeaderStories hs = new HeaderStories(doc);
-
-		StringBuffer ret = new StringBuffer();
-		if(hs.getFirstHeader() != null) {
-			appendHeaderFooter(hs.getFirstHeader(), ret);
-		}
-		if(hs.getEvenHeader() != null) {
-			appendHeaderFooter(hs.getEvenHeader(), ret);
-		}
-		if(hs.getOddHeader() != null) {
-			appendHeaderFooter(hs.getOddHeader(), ret);
-		}
-
-		return ret.toString();
-	}
-	/**
-	 * Grab the text from the footers
-	 */
-	public String getFooterText() {
-		HeaderStories hs = new HeaderStories(doc);
-
-		StringBuffer ret = new StringBuffer();
-		if(hs.getFirstFooter() != null) {
-			appendHeaderFooter(hs.getFirstFooter(), ret);
-		}
-		if(hs.getEvenFooter() != null) {
-			appendHeaderFooter(hs.getEvenFooter(), ret);
-		}
-		if(hs.getOddFooter() != null) {
-			appendHeaderFooter(hs.getOddFooter(), ret);
-		}
-
-		return ret.toString();
-	}
-
-	/**
-	 * Grab the text out of the text pieces. Might also include various
-	 *  bits of crud, but will work in cases where the text piece -> paragraph
-	 *  mapping is broken. Fast too.
-	 */
-	public String getTextFromPieces() {
-    	String text = doc.getDocumentText();
-
-    	// Fix line endings (Note - won't get all of them
-    	text = text.replaceAll("\r\r\r", "\r\n\r\n\r\n");
-    	text = text.replaceAll("\r\r", "\r\n\r\n");
-
-    	if(text.endsWith("\r")) {
-    		text += "\n";
-    	}
-
-    	return text;
-	}
-
-	/**
-	 * Grab the text, based on the paragraphs. Shouldn't include any crud,
-	 *  but slightly slower than getTextFromPieces().
-	 */
-	public String getText() {
-	   StringBuffer ret = new StringBuffer();
-
-	   ret.append(getHeaderText());
-
-	   ArrayList<String> text = new ArrayList<String>();
-	   text.addAll(Arrays.asList(getParagraphText()));
-	   text.addAll(Arrays.asList(getMainTextboxText()));
-	   text.addAll(Arrays.asList(getFootnoteText()));
-	   text.addAll(Arrays.asList(getEndnoteText()));
-
-	   for(String p : text) {
-	      ret.append(p);
-	   }
-
-	   ret.append(getFooterText());
-
-	   return ret.toString();
-	}
-
-	/**
-	 * Removes any fields (eg macros, page markers etc)
-	 *  from the string.
-	 */
-	public static String stripFields(String text) {
-		return Range.stripFields(text);
-	}
+    /**
+     * Removes any fields (eg macros, page markers etc) from the string.
+     */
+    public static String stripFields( String text )
+    {
+        return Range.stripFields( text );
+    }
 }

Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Field.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Field.java?rev=1155336&r1=1155335&r2=1155336&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Field.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Field.java Tue Aug  9 12:38:52 2011
@@ -17,17 +17,23 @@ public interface Field
      */
     int getFieldStartOffset();
 
+    CharacterRun getMarkEndCharacterRun( Range parent );
+
     /**
      * @return character position of end field mark
      */
     int getMarkEndOffset();
 
+    CharacterRun getMarkSeparatorCharacterRun( Range parent );
+
     /**
      * @return character position of separator field mark (if present,
      *         {@link NullPointerException} otherwise)
      */
     int getMarkSeparatorOffset();
 
+    CharacterRun getMarkStartCharacterRun( Range parent );
+
     /**
      * @return character position of start field mark
      */

Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/usermodel/FieldImpl.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/usermodel/FieldImpl.java?rev=1155336&r1=1155335&r2=1155336&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/usermodel/FieldImpl.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/usermodel/FieldImpl.java Tue Aug  9 12:38:52 2011
@@ -112,6 +112,12 @@ class FieldImpl implements Field
         return startPlex.getFcStart();
     }
 
+    public CharacterRun getMarkEndCharacterRun( Range parent )
+    {
+        return new Range( getMarkEndOffset(), getMarkEndOffset() + 1, parent )
+                .getCharacterRun( 0 );
+    }
+
     /**
      * @return character position of end field mark
      */
@@ -120,6 +126,15 @@ class FieldImpl implements Field
         return endPlex.getFcStart();
     }
 
+    public CharacterRun getMarkSeparatorCharacterRun( Range parent )
+    {
+        if ( !hasSeparator() )
+            return null;
+
+        return new Range( getMarkSeparatorOffset(),
+                getMarkSeparatorOffset() + 1, parent ).getCharacterRun( 0 );
+    }
+
     /**
      * @return character position of separator field mark (if present,
      *         {@link NullPointerException} otherwise)
@@ -129,6 +144,12 @@ class FieldImpl implements Field
         return separatorPlex.getFcStart();
     }
 
+    public CharacterRun getMarkStartCharacterRun( Range parent )
+    {
+        return new Range( getMarkStartOffset(), getMarkStartOffset() + 1,
+                parent ).getCharacterRun( 0 );
+    }
+
     /**
      * @return character position of start field mark
      */

Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/usermodel/HeaderStories.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/usermodel/HeaderStories.java?rev=1155336&r1=1155335&r2=1155336&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/usermodel/HeaderStories.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/usermodel/HeaderStories.java Tue Aug  9 12:38:52 2011
@@ -82,35 +82,96 @@ public final class HeaderStories {
                 fib.getPlcfHddSize(), 0 );
     }
 
-	public String getFootnoteSeparator() {
-		return getAt(0);
-	}
-	public String getFootnoteContSeparator() {
-		return getAt(1);
-	}
-	public String getFootnoteContNote() {
-		return getAt(2);
-	}
-	public String getEndnoteSeparator() {
-		return getAt(3);
-	}
-	public String getEndnoteContSeparator() {
-		return getAt(4);
-	}
-	public String getEndnoteContNote() {
-		return getAt(5);
-	}
+    @Deprecated
+    public String getFootnoteSeparator()
+    {
+        return getAt( 0 );
+    }
 
+    @Deprecated
+    public String getFootnoteContSeparator()
+    {
+        return getAt( 1 );
+    }
+
+    @Deprecated
+    public String getFootnoteContNote()
+    {
+        return getAt( 2 );
+    }
 
+    @Deprecated
+    public String getEndnoteSeparator()
+    {
+        return getAt( 3 );
+    }
+
+    @Deprecated
+    public String getEndnoteContSeparator()
+    {
+        return getAt( 4 );
+    }
+
+    @Deprecated
+    public String getEndnoteContNote()
+    {
+        return getAt( 5 );
+    }
+
+    public Range getFootnoteSeparatorSubrange()
+    {
+        return getSubrangeAt( 0 );
+    }
+
+    public Range getFootnoteContSeparatorSubrange()
+    {
+        return getSubrangeAt( 1 );
+    }
+
+    public Range getFootnoteContNoteSubrange()
+    {
+        return getSubrangeAt( 2 );
+    }
+
+    public Range getEndnoteSeparatorSubrange()
+    {
+        return getSubrangeAt( 3 );
+    }
+
+    public Range getEndnoteContSeparatorSubrange()
+    {
+        return getSubrangeAt( 4 );
+    }
+
+    public Range getEndnoteContNoteSubrange()
+    {
+        return getSubrangeAt( 5 );
+    }
+
+	@Deprecated
 	public String getEvenHeader() {
 		return getAt(6+0);
 	}
+    @Deprecated
 	public String getOddHeader() {
 		return getAt(6+1);
 	}
+    @Deprecated
 	public String getFirstHeader() {
 		return getAt(6+4);
 	}
+	
+
+    public Range getEvenHeaderSubrange() {
+        return getSubrangeAt(6+0);
+    }
+    public Range getOddHeaderSubrange() {
+        return getSubrangeAt(6+1);
+    }
+    public Range getFirstHeaderSubrange() {
+        return getSubrangeAt(6+4);
+    }
+    
 	/**
 	 * Returns the correct, defined header for the given
 	 *  one based page
@@ -135,16 +196,39 @@ public final class HeaderStories {
 		return getOddHeader();
 	}
 
+	@Deprecated
+    public String getEvenFooter()
+    {
+        return getAt( 6 + 2 );
+    }
+
+    @Deprecated
+    public String getOddFooter()
+    {
+        return getAt( 6 + 3 );
+    }
+
+    @Deprecated
+    public String getFirstFooter()
+    {
+        return getAt( 6 + 5 );
+    }
+
+    public Range getEvenFooterSubrange()
+    {
+        return getSubrangeAt( 6 + 2 );
+    }
+
+    public Range getOddFooterSubrange()
+    {
+        return getSubrangeAt( 6 + 3 );
+    }
+
+    public Range getFirstFooterSubrange()
+    {
+        return getSubrangeAt( 6 + 5 );
+    }
 
-	public String getEvenFooter() {
-		return getAt(6+2);
-	}
-	public String getOddFooter() {
-		return getAt(6+3);
-	}
-	public String getFirstFooter() {
-		return getAt(6+5);
-	}
 	/**
 	 * Returns the correct, defined footer for the given
 	 *  one based page
@@ -174,6 +258,7 @@ public final class HeaderStories {
 	 * Get the string that's pointed to by the
 	 *  given plcfHdd index
 	 */
+    @Deprecated
 	private String getAt(int plcfHddIndex) {
 		if(plcfHdd == null) return null;
 
@@ -209,6 +294,32 @@ public final class HeaderStories {
 		return text;
 	}
 
+    private Range getSubrangeAt( int plcfHddIndex )
+    {
+        if ( plcfHdd == null )
+            return null;
+
+        GenericPropertyNode prop = plcfHdd.getProperty( plcfHddIndex );
+        if ( prop.getStart() == prop.getEnd() )
+        {
+            // Empty story
+            return null;
+        }
+        if ( prop.getEnd() < prop.getStart() )
+        {
+            // Broken properties?
+            return null;
+        }
+
+        final int headersLength = headerStories.getEndOffset()
+                - headerStories.getStartOffset();
+        int start = Math.min( prop.getStart(), headersLength );
+        int end = Math.min( prop.getEnd(), headersLength );
+
+        return new Range( headerStories.getStartOffset() + start,
+                headerStories.getStartOffset() + end, headerStories );
+    }
+
 	public Range getRange() {
 		return headerStories;
 	}

Added: poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/usermodel/ObjectPoolImpl.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/usermodel/ObjectPoolImpl.java?rev=1155336&view=auto
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/usermodel/ObjectPoolImpl.java (added)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/usermodel/ObjectPoolImpl.java Tue Aug  9 12:38:52 2011
@@ -0,0 +1,34 @@
+package org.apache.poi.hwpf.usermodel;
+
+import java.io.FileNotFoundException;
+
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.util.Internal;
+
+@Internal
+public class ObjectPoolImpl implements ObjectsPool
+{
+    private DirectoryEntry _objectPool;
+
+    public ObjectPoolImpl( DirectoryEntry _objectPool )
+    {
+        super();
+        this._objectPool = _objectPool;
+    }
+
+    public Entry getObjectById( String objId )
+    {
+        if ( _objectPool == null )
+            return null;
+
+        try
+        {
+            return _objectPool.getEntry( objId );
+        }
+        catch ( FileNotFoundException exc )
+        {
+            return null;
+        }
+    }
+}

Added: poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/usermodel/ObjectsPool.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/usermodel/ObjectsPool.java?rev=1155336&view=auto
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/usermodel/ObjectsPool.java (added)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/usermodel/ObjectsPool.java Tue Aug  9 12:38:52 2011
@@ -0,0 +1,8 @@
+package org.apache.poi.hwpf.usermodel;
+
+import org.apache.poi.poifs.filesystem.Entry;
+
+public interface ObjectsPool
+{
+    public Entry getObjectById( String objId );
+}

Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java?rev=1155336&r1=1155335&r2=1155336&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java Tue Aug  9 12:38:52 2011
@@ -36,6 +36,8 @@ import org.apache.poi.hwpf.sprm.Characte
 import org.apache.poi.hwpf.sprm.ParagraphSprmCompressor;
 import org.apache.poi.hwpf.sprm.SprmBuffer;
 import org.apache.poi.util.LittleEndian;
+import org.apache.poi.util.POILogFactory;
+import org.apache.poi.util.POILogger;
 
 /**
  * This class is the central class of the HWPF object model. All properties that
@@ -52,6 +54,8 @@ import org.apache.poi.util.LittleEndian;
  */
 public class Range { // TODO -instantiable superclass
 
+    private POILogger logger = POILogFactory.getLogger( Range.class );
+    
 	public static final int TYPE_PARAGRAPH = 0;
 	public static final int TYPE_CHARACTER = 1;
 	public static final int TYPE_SECTION = 2;
@@ -888,9 +892,12 @@ public class Range { // TODO -instantiab
         initAll();
         if ( tableEndInclusive >= this._parEnd )
         {
-            throw new ArrayIndexOutOfBoundsException(
-                    "The table's bounds fall outside of this Range" );
+            logger.log( POILogger.WARN, "The table's bounds ", "["
+                    + this._parStart + "; " + tableEndInclusive + ")",
+                    " fall outside of this Range paragraphs numbers ", "["
+                            + this._parStart + "; " + this._parEnd + ")" );
         }
+
         if ( tableEndInclusive < 0 )
         {
             throw new ArrayIndexOutOfBoundsException(



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@poi.apache.org
For additional commands, e-mail: commits-help@poi.apache.org


Mime
View raw message