poi-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ser...@apache.org
Subject svn commit: r1142765 [1/2] - in /poi/trunk/src/scratchpad: src/org/apache/poi/hwpf/extractor/ testcases/org/apache/poi/hwpf/extractor/
Date Mon, 04 Jul 2011 19:08:07 GMT
Author: sergey
Date: Mon Jul  4 19:08:06 2011
New Revision: 1142765

URL: http://svn.apache.org/viewvc?rev=1142765&view=rev
Log:
add Word-to-HTML extractor

Added:
    poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/AbstractWordExtractor.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/AbstractWordUtils.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/FoDocumentFacade.java
      - copied, changed from r1139204, poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/AbstractToFoExtractor.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/HtmlDocumentFacade.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToHtmlExtractor.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToHtmlUtils.java
    poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordToExtractorSuite.java
      - copied, changed from r1139204, poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordToFoExtractorSuite.java
    poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordToHtmlExtractor.java
Removed:
    poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/AbstractToFoExtractor.java
    poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordToFoExtractorSuite.java
Modified:
    poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToFoExtractor.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToFoUtils.java

Added: poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/AbstractWordExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/AbstractWordExtractor.java?rev=1142765&view=auto
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/AbstractWordExtractor.java (added)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/AbstractWordExtractor.java Mon Jul  4 19:08:06 2011
@@ -0,0 +1,365 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf.extractor;
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.HWPFDocumentCore;
+import org.apache.poi.hwpf.model.ListFormatOverride;
+import org.apache.poi.hwpf.model.ListTables;
+import org.apache.poi.hwpf.usermodel.CharacterRun;
+import org.apache.poi.hwpf.usermodel.Paragraph;
+import org.apache.poi.hwpf.usermodel.Picture;
+import org.apache.poi.hwpf.usermodel.Range;
+import org.apache.poi.hwpf.usermodel.Section;
+import org.apache.poi.hwpf.usermodel.Table;
+import org.apache.poi.hwpf.usermodel.TableIterator;
+import org.apache.poi.util.POILogFactory;
+import org.apache.poi.util.POILogger;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+
+public abstract class AbstractWordExtractor
+{
+    /**
+     * Control character (value 7). When it is the last character of a run's
+     * text and the current table level is non-zero, it is stripped before the
+     * text is output.
+     */
+    private static final byte BEL_MARK = 7;
+
+    /** Value (19) that the start of a run's text is compared against to detect the opening of a field. */
+    private static final byte FIELD_BEGIN_MARK = 19;
+
+    /** Value (21) that the start of a run's text is compared against to detect the end of a field. */
+    private static final byte FIELD_END_MARK = 21;
+
+    /**
+     * Value (20) that the start of a run's text is compared against to detect
+     * the mark between the runs inspected as the field instruction and the
+     * runs output as the field value.
+     */
+    private static final byte FIELD_SEPARATOR_MARK = 20;
+
+    /** Logger used for warnings about null runs, unsupported fields and missing list tables. */
+    private static final POILogger logger = POILogFactory
+            .getLogger( AbstractWordExtractor.class );
+
+    /**
+     * Returns the DOM document associated with this extractor.
+     * NOTE(review): presumably the output document that the process* methods
+     * populate -- confirm against the concrete subclasses.
+     */
+    public abstract Document getDocument();
+
+    /**
+     * Emits the text of one character run into the given block element.
+     * Called once per ordinary run while the runs of a paragraph are walked.
+     *
+     * @param block
+     *            element the caller is currently filling
+     * @param characterRun
+     *            run the text belongs to
+     * @param text
+     *            run text with one trailing "\r" (or, inside a table, one
+     *            trailing BEL character) already removed; may be empty
+     */
+    protected abstract void outputCharacters( Element block,
+            CharacterRun characterRun, String text );
+
+    protected boolean processCharacters( HWPFDocumentCore hwpfDocument,
+            int currentTableLevel, Paragraph paragraph, final Element block,
+            List<CharacterRun> characterRuns, final int start, final int end )
+    {
+        boolean haveAnyText = false;
+
+        for ( int c = start; c < end; c++ )
+        {
+            CharacterRun characterRun = characterRuns.get( c );
+
+            if ( characterRun == null )
+                throw new AssertionError();
+
+            if ( hwpfDocument instanceof HWPFDocument
+                    && ( (HWPFDocument) hwpfDocument ).getPicturesTable()
+                            .hasPicture( characterRun ) )
+            {
+                HWPFDocument newFormat = (HWPFDocument) hwpfDocument;
+                Picture picture = newFormat.getPicturesTable().extractPicture(
+                        characterRun, true );
+
+                processImage( block, characterRun.text().charAt( 0 ) == 0x01,
+                        picture );
+                continue;
+            }
+
+            String text = characterRun.text();
+            if ( text.getBytes().length == 0 )
+                continue;
+
+            if ( text.getBytes()[0] == FIELD_BEGIN_MARK )
+            {
+                int skipTo = tryField( hwpfDocument, paragraph,
+                        currentTableLevel, characterRuns, c, block );
+
+                if ( skipTo != c )
+                {
+                    c = skipTo;
+                    continue;
+                }
+
+                continue;
+            }
+            if ( text.getBytes()[0] == FIELD_SEPARATOR_MARK )
+            {
+                // shall not appear without FIELD_BEGIN_MARK
+                continue;
+            }
+            if ( text.getBytes()[0] == FIELD_END_MARK )
+            {
+                // shall not appear without FIELD_BEGIN_MARK
+                continue;
+            }
+
+            if ( characterRun.isSpecialCharacter() || characterRun.isObj()
+                    || characterRun.isOle2() )
+            {
+                continue;
+            }
+
+            if ( text.endsWith( "\r" )
+                    || ( text.charAt( text.length() - 1 ) == BEL_MARK && currentTableLevel != 0 ) )
+                text = text.substring( 0, text.length() - 1 );
+
+            outputCharacters( block, characterRun, text );
+
+            haveAnyText |= text.trim().length() != 0;
+        }
+
+        return haveAnyText;
+    }
+
+    public void processDocument( HWPFDocumentCore wordDocument )
+    {
+        final Range range = wordDocument.getRange();
+        for ( int s = 0; s < range.numSections(); s++ )
+        {
+            processSection( wordDocument, range.getSection( s ), s );
+        }
+    }
+
+    /**
+     * Matches a field instruction of the form <tt>HYPERLINK "target"</tt>;
+     * group 1 is the text between the quotes.
+     */
+    private static final Pattern HYPERLINK_PATTERN = Pattern
+            .compile( "[ \\t\\r\\n]*HYPERLINK \"(.*)\"[ \\t\\r\\n]*" );
+
+    /**
+     * Matches a field instruction of the form <tt>PAGEREF name \h</tt>; group
+     * 1 is the referenced name.
+     */
+    private static final Pattern PAGEREF_PATTERN = Pattern
+            .compile( "[ \\t\\r\\n]*PAGEREF ([^ ]*)[ \\t\\r\\n]*\\\\h[ \\t\\r\\n]*" );
+
+    /**
+     * Processes one complete field delimited by a begin, separator and end
+     * mark. The first non-null run between the begin and separator marks is
+     * matched against the HYPERLINK and PAGEREF instruction patterns; on a
+     * match the field is delegated to {@link #processHyperlink} or
+     * {@link #processPageref}. Any other field is logged as unsupported and
+     * only its value (the runs between separator and end mark) is output.
+     * <p>
+     * NOTE(review): the marks are used both as indexes into
+     * <tt>paragraph</tt>'s runs here and as indexes into
+     * <tt>characterRuns</tt> by the methods this delegates to -- confirm that
+     * callers always pass a list that is index-compatible with the paragraph.
+     *
+     * @param wordDocument
+     *            document being processed
+     * @param currentBlock
+     *            element receiving the output
+     * @param paragraph
+     *            paragraph containing the field
+     * @param currentTableLevel
+     *            current table nesting level
+     * @param characterRuns
+     *            runs forwarded to the delegate methods
+     * @param beginMark
+     *            index of the run holding the field begin mark
+     * @param separatorMark
+     *            index of the run holding the field separator mark
+     * @param endMark
+     *            index of the run holding the field end mark
+     */
+    protected void processField( HWPFDocumentCore wordDocument,
+            Element currentBlock, Paragraph paragraph, int currentTableLevel,
+            List<CharacterRun> characterRuns, int beginMark, int separatorMark,
+            int endMark )
+    {
+        // find the first non-null run of the field instruction, warning
+        // about every null run skipped on the way
+        CharacterRun firstAfterBegin = null;
+        for ( int index = beginMark + 1; index < separatorMark; index++ )
+        {
+            firstAfterBegin = paragraph.getCharacterRun( index );
+            if ( firstAfterBegin != null )
+                break;
+
+            logger.log( POILogger.WARN,
+                    "Paragraph " + paragraph.getStartOffset() + "--"
+                            + paragraph.getEndOffset()
+                            + " contains null CharacterRun #" + index );
+        }
+
+        if ( firstAfterBegin != null )
+        {
+            final Matcher hyperlinkMatcher = HYPERLINK_PATTERN
+                    .matcher( firstAfterBegin.text() );
+            if ( hyperlinkMatcher.matches() )
+            {
+                String hyperlink = hyperlinkMatcher.group( 1 );
+                processHyperlink( wordDocument, currentBlock, paragraph,
+                        characterRuns, currentTableLevel, hyperlink,
+                        separatorMark + 1, endMark );
+                return;
+            }
+
+            final Matcher pagerefMatcher = PAGEREF_PATTERN
+                    .matcher( firstAfterBegin.text() );
+            if ( pagerefMatcher.matches() )
+            {
+                String pageref = pagerefMatcher.group( 1 );
+                processPageref( wordDocument, currentBlock, paragraph,
+                        characterRuns, currentTableLevel, pageref,
+                        separatorMark + 1, endMark );
+                return;
+            }
+        }
+
+        StringBuilder debug = new StringBuilder( "Unsupported field type: \n" );
+        for ( int i = beginMark; i <= endMark; i++ )
+        {
+            debug.append( "\t" );
+            debug.append( paragraph.getCharacterRun( i ) );
+            debug.append( "\n" );
+        }
+        logger.log( POILogger.WARN, debug );
+
+        // just output field value
+        if ( separatorMark + 1 < endMark )
+            processCharacters( wordDocument, currentTableLevel, paragraph,
+                    currentBlock, characterRuns, separatorMark + 1, endMark );
+    }
+
+    /**
+     * Outputs a hyperlink field. Called from <tt>processField</tt> when the
+     * field instruction matches <tt>HYPERLINK "..."</tt>.
+     *
+     * @param wordDocument
+     *            document being processed
+     * @param currentBlock
+     *            element receiving the output
+     * @param paragraph
+     *            paragraph containing the field
+     * @param characterRuns
+     *            character runs the indexes refer to
+     * @param currentTableLevel
+     *            current table nesting level
+     * @param hyperlink
+     *            text captured between the quotes of the field instruction
+     * @param i
+     *            index of the run following the field separator mark
+     * @param endMark
+     *            index of the run holding the field end mark
+     */
+    protected abstract void processHyperlink( HWPFDocumentCore wordDocument,
+            Element currentBlock, Paragraph paragraph,
+            List<CharacterRun> characterRuns, int currentTableLevel,
+            String hyperlink, int i, int endMark );
+
+    /**
+     * Outputs a picture found in a character run.
+     *
+     * @param currentBlock
+     *            element receiving the output
+     * @param inlined
+     *            <tt>true</tt> when the first character of the run's text is
+     *            0x01
+     * @param picture
+     *            picture extracted from the document's pictures table
+     */
+    protected abstract void processImage( Element currentBlock,
+            boolean inlined, Picture picture );
+
+    /**
+     * Outputs a page-reference field. Called from <tt>processField</tt> when
+     * the field instruction matches <tt>PAGEREF name \h</tt>.
+     *
+     * @param wordDocument
+     *            document being processed
+     * @param currentBlock
+     *            element receiving the output
+     * @param paragraph
+     *            paragraph containing the field
+     * @param characterRuns
+     *            character runs the indexes refer to
+     * @param currentTableLevel
+     *            current table nesting level
+     * @param pageref
+     *            name captured from the field instruction
+     * @param beginTextInclusive
+     *            index of the run following the field separator mark
+     * @param endTextExclusive
+     *            index of the run holding the field end mark
+     */
+    protected abstract void processPageref( HWPFDocumentCore wordDocument,
+            Element currentBlock, Paragraph paragraph,
+            List<CharacterRun> characterRuns, int currentTableLevel,
+            String pageref, int beginTextInclusive, int endTextExclusive );
+
+    /**
+     * Outputs a single paragraph.
+     *
+     * @param wordDocument
+     *            document being processed
+     * @param parentFopElement
+     *            element the paragraph output is added to
+     * @param currentTableLevel
+     *            current table nesting level
+     * @param paragraph
+     *            paragraph to output
+     * @param bulletText
+     *            list label computed for the paragraph, or the empty string
+     *            when the paragraph is not in a list or no list tables are
+     *            available
+     */
+    protected abstract void processParagraph( HWPFDocumentCore wordDocument,
+            Element parentFopElement, int currentTableLevel,
+            Paragraph paragraph, String bulletText );
+
+    /**
+     * Outputs one section of the document.
+     *
+     * @param wordDocument
+     *            document being processed
+     * @param section
+     *            section to output
+     * @param s
+     *            zero-based index of the section within the document range
+     *            (always 0 when called through <tt>processSingleSection</tt>)
+     */
+    protected abstract void processSection( HWPFDocumentCore wordDocument,
+            Section section, int s );
+
+    protected void processSectionParagraphes( HWPFDocumentCore wordDocument,
+            Element flow, Range range, int currentTableLevel )
+    {
+        final Map<Integer, Table> allTables = new HashMap<Integer, Table>();
+        for ( TableIterator tableIterator = AbstractWordUtils.newTableIterator(
+                range, currentTableLevel + 1 ); tableIterator.hasNext(); )
+        {
+            Table next = tableIterator.next();
+            allTables.put( Integer.valueOf( next.getStartOffset() ), next );
+        }
+
+        final ListTables listTables = wordDocument.getListTables();
+        int currentListInfo = 0;
+
+        final int paragraphs = range.numParagraphs();
+        for ( int p = 0; p < paragraphs; p++ )
+        {
+            Paragraph paragraph = range.getParagraph( p );
+
+            if ( allTables.containsKey( Integer.valueOf( paragraph
+                    .getStartOffset() ) ) )
+            {
+                Table table = allTables.get( Integer.valueOf( paragraph
+                        .getStartOffset() ) );
+                processTable( wordDocument, flow, table, currentTableLevel + 1 );
+                continue;
+            }
+
+            if ( paragraph.isInTable()
+                    && paragraph.getTableLevel() != currentTableLevel )
+            {
+                continue;
+            }
+
+            if ( paragraph.getIlfo() != currentListInfo )
+            {
+                currentListInfo = paragraph.getIlfo();
+            }
+
+            if ( currentListInfo != 0 )
+            {
+                if ( listTables != null )
+                {
+                    final ListFormatOverride listFormatOverride = listTables
+                            .getOverride( paragraph.getIlfo() );
+
+                    String label = AbstractWordUtils.getBulletText( listTables,
+                            paragraph, listFormatOverride.getLsid() );
+
+                    processParagraph( wordDocument, flow, currentTableLevel,
+                            paragraph, label );
+                }
+                else
+                {
+                    logger.log( POILogger.WARN,
+                            "Paragraph #" + paragraph.getStartOffset() + "-"
+                                    + paragraph.getEndOffset()
+                                    + " has reference to list structure #"
+                                    + currentListInfo
+                                    + ", but listTables not defined in file" );
+
+                    processParagraph( wordDocument, flow, currentTableLevel,
+                            paragraph, AbstractWordUtils.EMPTY );
+                }
+            }
+            else
+            {
+                processParagraph( wordDocument, flow, currentTableLevel,
+                        paragraph, AbstractWordUtils.EMPTY );
+            }
+        }
+
+    }
+
+    protected void processSingleSection( HWPFDocumentCore wordDocument,
+            Section section )
+    {
+        processSection( wordDocument, section, 0 );
+    }
+
+    /**
+     * Outputs a table.
+     *
+     * @param wordDocument
+     *            document being processed
+     * @param flow
+     *            element receiving the output
+     * @param table
+     *            table to output
+     * @param newTableLevel
+     *            nesting level of the table; the caller passes its own table
+     *            level plus one
+     */
+    protected abstract void processTable( HWPFDocumentCore wordDocument,
+            Element flow, Table table, int newTableLevel );
+
+    protected int tryField( HWPFDocumentCore wordDocument, Paragraph paragraph,
+            int currentTableLevel, List<CharacterRun> characterRuns,
+            int beginMark, Element currentBlock )
+    {
+        int separatorMark = -1;
+        int endMark = -1;
+        for ( int c = beginMark + 1; c < paragraph.numCharacterRuns(); c++ )
+        {
+            CharacterRun characterRun = paragraph.getCharacterRun( c );
+
+            String text = characterRun.text();
+            if ( text.getBytes().length == 0 )
+                continue;
+
+            if ( text.getBytes()[0] == FIELD_SEPARATOR_MARK )
+            {
+                if ( separatorMark != -1 )
+                {
+                    // double;
+                    return beginMark;
+                }
+
+                separatorMark = c;
+                continue;
+            }
+
+            if ( text.getBytes()[0] == FIELD_END_MARK )
+            {
+                if ( endMark != -1 )
+                {
+                    // double;
+                    return beginMark;
+                }
+
+                endMark = c;
+                break;
+            }
+
+        }
+
+        if ( separatorMark == -1 || endMark == -1 )
+            return beginMark;
+
+        processField( wordDocument, currentBlock, paragraph, currentTableLevel,
+                characterRuns, beginMark, separatorMark, endMark );
+
+        return endMark;
+    }
+
+}

Added: poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/AbstractWordUtils.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/AbstractWordUtils.java?rev=1142765&view=auto
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/AbstractWordUtils.java (added)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/AbstractWordUtils.java Mon Jul  4 19:08:06 2011
@@ -0,0 +1,404 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf.extractor;
+
+import java.io.Closeable;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.lang.reflect.Constructor;
+import java.lang.reflect.Field;
+import java.lang.reflect.Method;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.HWPFDocumentCore;
+import org.apache.poi.hwpf.HWPFOldDocument;
+import org.apache.poi.hwpf.OldWordFileFormatException;
+import org.apache.poi.hwpf.model.CHPX;
+import org.apache.poi.hwpf.model.ListLevel;
+import org.apache.poi.hwpf.model.ListTables;
+import org.apache.poi.hwpf.usermodel.BorderCode;
+import org.apache.poi.hwpf.usermodel.CharacterRun;
+import org.apache.poi.hwpf.usermodel.Paragraph;
+import org.apache.poi.hwpf.usermodel.Range;
+import org.apache.poi.hwpf.usermodel.Section;
+import org.apache.poi.hwpf.usermodel.SectionProperties;
+import org.apache.poi.hwpf.usermodel.TableIterator;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.util.POILogFactory;
+import org.apache.poi.util.POILogger;
+
+public class AbstractWordUtils
+{
+    /** Shared empty string, returned or passed where no text applies. */
+    static final String EMPTY = "";
+
+    /** Logger used to report resources that could not be closed. */
+    private static final POILogger logger = POILogFactory
+            .getLogger( AbstractWordUtils.class );
+
+    /** Number of twips in one inch. */
+    public static final float TWIPS_PER_INCH = 1440.0f;
+    /** Number of twips in one point. */
+    public static final int TWIPS_PER_PT = 20;
+
+    /**
+     * Closes the given resource without propagating any exception; a failure
+     * is logged at ERROR level instead.
+     *
+     * @param closeable
+     *            resource to close; <tt>null</tt> is ignored
+     */
+    static void closeQuietly( final Closeable closeable )
+    {
+        // nothing to close; avoids logging a NullPointerException as a
+        // failure to close
+        if ( closeable == null )
+            return;
+
+        try
+        {
+            closeable.close();
+        }
+        catch ( Exception exc )
+        {
+            logger.log( POILogger.ERROR, "Unable to close resource: " + exc,
+                    exc );
+        }
+    }
+
+    /**
+     * Null-safe string comparison.
+     *
+     * @param str1
+     *            first string, may be <tt>null</tt>
+     * @param str2
+     *            second string, may be <tt>null</tt>
+     * @return <tt>true</tt> if both are <tt>null</tt> or both hold equal
+     *         content
+     */
+    static boolean equals( String str1, String str2 )
+    {
+        if ( str1 != null )
+        {
+            return str1.equals( str2 );
+        }
+        return str2 == null;
+    }
+
+    // XXX incorporate into Range
+    /**
+     * Collects a {@link CharacterRun} for every CHPX of the range whose
+     * <tt>[start, end]</tt> interval intersects or touches the range's own
+     * <tt>[startOffset, endOffset]</tt> interval. Null CHPX entries and CHPX
+     * for which no run could be created are skipped.
+     *
+     * @param range
+     *            range to inspect
+     * @return the matching runs in CHPX order; never <tt>null</tt>
+     */
+    static List<CharacterRun> findCharacterRuns( Range range )
+    {
+        final int rangeStart = range.getStartOffset();
+        final int rangeEnd = range.getEndOffset();
+
+        final List<CharacterRun> runs = new ArrayList<CharacterRun>();
+        for ( CHPX chpx : getCharacters( range ) )
+        {
+            if ( chpx == null )
+                continue;
+
+            // bounds of the intersection; it is empty when low > high
+            final int low = Math.max( rangeStart, chpx.getStart() );
+            final int high = Math.min( rangeEnd, chpx.getEnd() );
+            if ( low > high )
+                continue;
+
+            final CharacterRun run = getCharacterRun( range, chpx );
+            if ( run != null )
+                runs.add( run );
+        }
+
+        return runs;
+    }
+
+    /**
+     * Maps the border type of a Word border code to a border-style keyword
+     * (<tt>solid</tt>, <tt>double</tt>, <tt>dotted</tt>, <tt>dashed</tt>,
+     * <tt>ridge</tt> or <tt>groove</tt>). Border types without a dedicated
+     * mapping yield <tt>solid</tt>.
+     *
+     * @param borderCode
+     *            border to inspect; must not be <tt>null</tt>
+     * @return the border-style keyword, never <tt>null</tt>
+     * @throws IllegalArgumentException
+     *             if <tt>borderCode</tt> is <tt>null</tt>
+     */
+    public static String getBorderType( BorderCode borderCode )
+    {
+        if ( borderCode == null )
+            throw new IllegalArgumentException( "borderCode is null" );
+
+        switch ( borderCode.getBorderType() )
+        {
+        case 1:
+        case 2:
+        case 5:
+        case 20:
+            return "solid";
+        case 3:
+        case 10:
+        case 11:
+        case 12:
+        case 13:
+        case 14:
+        case 15:
+        case 16:
+        case 17:
+        case 18:
+        case 19:
+        case 21:
+            return "double";
+        case 6:
+        case 9:
+            return "dotted";
+        case 7:
+        case 8:
+        case 22:
+        case 23:
+            return "dashed";
+        case 24:
+            return "ridge";
+        case 25:
+            // the border-style keyword is "groove"; "grooved" is not a valid
+            // value
+            return "groove";
+        default:
+            return "solid";
+        }
+    }
+
+    /**
+     * Formats the line width of a border as a length in points. The line
+     * width is treated as a count of eighths of a point: the whole part is
+     * <tt>lineWidth / 8</tt> and the remainder is written as a multiple of
+     * 125 after the decimal point (for example 9 gives <tt>1.125pt</tt>, 8
+     * gives <tt>1.0pt</tt>).
+     *
+     * @param borderCode
+     *            border whose line width is formatted
+     * @return the width followed by <tt>pt</tt>
+     */
+    public static String getBorderWidth( BorderCode borderCode )
+    {
+        final int eighths = borderCode.getLineWidth();
+        final int wholePoints = eighths / 8;
+        final int remainder = eighths - wholePoints * 8;
+        final int fraction = 1000 / 8 * remainder;
+
+        return wholePoints + "." + fraction + "pt";
+    }
+
+    /**
+     * Builds the list label for a paragraph from the number text of its list
+     * level. Characters with a value below 9 are placeholders: each one is
+     * replaced by the current number of the list level with that index,
+     * formatted with the number format of the paragraph's own level. All
+     * other characters are copied. Depending on the level's "follow" setting
+     * a tab (0) or a space (1) is appended.
+     * <p>
+     * Side effect: when a placeholder refers to the paragraph's own level,
+     * that level's start-at counter is incremented.
+     *
+     * @param listTables
+     *            list tables of the document
+     * @param paragraph
+     *            paragraph whose list level index (ilvl) is used
+     * @param listId
+     *            identifier of the list
+     * @return the label, or the empty string when the level has no number
+     *         text
+     */
+    public static String getBulletText( ListTables listTables,
+            Paragraph paragraph, int listId )
+    {
+        final ListLevel listLevel = listTables.getLevel( listId,
+                paragraph.getIlvl() );
+
+        final String numberText = listLevel.getNumberText();
+        if ( numberText == null )
+            return EMPTY;
+
+        final StringBuilder label = new StringBuilder();
+        for ( int i = 0; i < numberText.length(); i++ )
+        {
+            final char element = numberText.charAt( i );
+            if ( element >= 9 )
+            {
+                // literal character of the label
+                label.append( element );
+                continue;
+            }
+
+            // placeholder: the character value is the index of a list level
+            final ListLevel numLevel = listTables.getLevel( listId, element );
+            final int num = numLevel.getStartAt();
+            label.append( NumberFormatter.getNumber( num,
+                    listLevel.getNumberFormat() ) );
+
+            if ( numLevel == listLevel )
+                numLevel.setStartAt( numLevel.getStartAt() + 1 );
+        }
+
+        final byte follow = getIxchFollow( listLevel );
+        if ( follow == 0 )
+        {
+            label.append( "\t" );
+        }
+        else if ( follow == 1 )
+        {
+            label.append( " " );
+        }
+
+        return label.toString();
+    }
+
+    private static CharacterRun getCharacterRun( Range range, CHPX chpx )
+    {
+        try
+        {
+            Method method = Range.class.getDeclaredMethod( "getCharacterRun",
+                    CHPX.class );
+            method.setAccessible( true );
+            return (CharacterRun) method.invoke( range, chpx );
+        }
+        catch ( Exception exc )
+        {
+            throw new Error( exc );
+        }
+    }
+
+    private static List<CHPX> getCharacters( Range range )
+    {
+        try
+        {
+            Field field = Range.class.getDeclaredField( "_characters" );
+            field.setAccessible( true );
+            return (List<CHPX>) field.get( range );
+        }
+        catch ( Exception exc )
+        {
+            throw new Error( exc );
+        }
+    }
+
+    /**
+     * Maps a Word colour index (ico) to a colour name.
+     *
+     * @param ico
+     *            colour index; 1 to 16 have dedicated names
+     * @return the colour name; <tt>black</tt> for any index outside 1 to 16
+     */
+    public static String getColor( int ico )
+    {
+        // position i holds the name for colour index i + 1
+        final String[] names = { "black", "blue", "cyan", "green", "magenta",
+                "red", "yellow", "white", "darkblue", "darkcyan",
+                "darkgreen", "darkmagenta", "darkred", "darkyellow",
+                "darkgray", "lightgray" };
+
+        if ( ico < 1 || ico > names.length )
+        {
+            return "black";
+        }
+        return names[ico - 1];
+    }
+
+    /**
+     * Reflectively reads the non-public <tt>_ixchFollow</tt> field of a list
+     * level.
+     *
+     * @param listLevel
+     *            list level to read the field from
+     * @return the field value as a byte
+     * @throws Error
+     *             wrapping any exception raised by the lookup, the access or
+     *             the conversion
+     */
+    public static byte getIxchFollow( ListLevel listLevel )
+    {
+        try
+        {
+            final Field followField = ListLevel.class
+                    .getDeclaredField( "_ixchFollow" );
+            followField.setAccessible( true );
+            final Byte boxed = (Byte) followField.get( listLevel );
+            return boxed.byteValue();
+        }
+        catch ( Exception reflectionFailure )
+        {
+            throw new Error( reflectionFailure );
+        }
+    }
+
+    /**
+     * Maps a Word justification code to a text alignment keyword.
+     *
+     * @param js
+     *            justification code; 0 to 9 have a mapping
+     * @return the alignment keyword, or the empty string for any other code
+     */
+    public static String getJustification( int js )
+    {
+        // position i holds the keyword for justification code i
+        final String[] keywords = { "start", "center", "end", "justify",
+                "justify", "center", "left", "start", "end", "justify" };
+
+        if ( js >= 0 && js < keywords.length )
+        {
+            return keywords[js];
+        }
+        return "";
+    }
+
+    /**
+     * Returns the label for a list item number. Only the plain decimal
+     * representation is implemented; for any format other than 0 a warning
+     * is logged and the decimal representation is returned anyway.
+     *
+     * @param number
+     *            item number
+     * @param format
+     *            requested number format; only 0 is supported
+     * @return <tt>number</tt> as a decimal string
+     */
+    public static String getListItemNumberLabel( int number, int format )
+    {
+        // report through the class logger rather than System.err, and name
+        // the method that is actually being called
+        if ( format != 0 )
+            logger.log( POILogger.WARN, "NYI: getListItemNumberLabel(): "
+                    + format );
+
+        return String.valueOf( number );
+    }
+
+    /**
+     * Reflectively reads the non-public <tt>_props</tt> field of a section.
+     *
+     * @param section
+     *            section to read the field from
+     * @return the field value
+     * @throws Error
+     *             wrapping any exception raised by the lookup or the access
+     */
+    public static SectionProperties getSectionProperties( Section section )
+    {
+        final Object props;
+        try
+        {
+            final Field propsField = Section.class
+                    .getDeclaredField( "_props" );
+            propsField.setAccessible( true );
+            props = propsField.get( section );
+        }
+        catch ( Exception reflectionFailure )
+        {
+            throw new Error( reflectionFailure );
+        }
+        return (SectionProperties) props;
+    }
+
+    /**
+     * Tells whether a string carries no characters.
+     *
+     * @param str
+     *            string to test, may be <tt>null</tt>
+     * @return <tt>true</tt> for <tt>null</tt> and for the empty string
+     */
+    static boolean isEmpty( String str )
+    {
+        if ( str == null )
+        {
+            return true;
+        }
+        return str.length() == 0;
+    }
+
+    /**
+     * Tells whether a string carries at least one character.
+     *
+     * @param str
+     *            string to test, may be <tt>null</tt>
+     * @return <tt>false</tt> for <tt>null</tt> and for the empty string,
+     *         <tt>true</tt> otherwise
+     */
+    static boolean isNotEmpty( String str )
+    {
+        return str != null && str.length() != 0;
+    }
+
+    /**
+     * Loads a Word document from a file. The file stream is always closed
+     * before returning, whether or not loading succeeds.
+     *
+     * @param docFile
+     *            file to read
+     * @return the loaded document
+     * @throws IOException
+     *             if the file cannot be opened or read
+     */
+    public static HWPFDocumentCore loadDoc( File docFile ) throws IOException
+    {
+        final FileInputStream docStream = new FileInputStream( docFile );
+        final HWPFDocumentCore loaded;
+        try
+        {
+            loaded = loadDoc( docStream );
+        }
+        finally
+        {
+            closeQuietly( docStream );
+        }
+        return loaded;
+    }
+
+    /**
+     * Loads a Word document from a stream. The stream is first verified and
+     * turned into a POIFS file system; it is then opened as an
+     * {@link HWPFDocument}, falling back to {@link HWPFOldDocument} when the
+     * file turns out to be in the old Word format.
+     *
+     * @param inputStream
+     *            stream to read; it is not closed by this method
+     * @return the loaded document
+     * @throws IOException
+     *             if the stream cannot be read
+     */
+    public static HWPFDocumentCore loadDoc( InputStream inputStream )
+            throws IOException
+    {
+        final POIFSFileSystem fileSystem = HWPFDocumentCore
+                .verifyAndBuildPOIFS( inputStream );
+
+        HWPFDocumentCore document;
+        try
+        {
+            document = new HWPFDocument( fileSystem );
+        }
+        catch ( OldWordFileFormatException oldFormat )
+        {
+            // old binary format: use the legacy document implementation
+            document = new HWPFOldDocument( fileSystem );
+        }
+        return document;
+    }
+
+    /**
+     * Reflectively invokes the non-public
+     * <tt>TableIterator(Range, int)</tt> constructor.
+     *
+     * @param range
+     *            range passed to the constructor
+     * @param level
+     *            table level passed to the constructor
+     * @return the new iterator
+     * @throws Error
+     *             wrapping any exception raised by the lookup or the
+     *             instantiation
+     */
+    public static TableIterator newTableIterator( Range range, int level )
+    {
+        try
+        {
+            final Constructor<TableIterator> hiddenConstructor = TableIterator.class
+                    .getDeclaredConstructor( Range.class, int.class );
+            hiddenConstructor.setAccessible( true );
+
+            final Integer boxedLevel = Integer.valueOf( level );
+            return hiddenConstructor.newInstance( range, boxedLevel );
+        }
+        catch ( Exception reflectionFailure )
+        {
+            throw new Error( reflectionFailure );
+        }
+    }
+
+}

Copied: poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/FoDocumentFacade.java (from r1139204, poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/AbstractToFoExtractor.java)
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/FoDocumentFacade.java?p2=poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/FoDocumentFacade.java&p1=poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/AbstractToFoExtractor.java&r1=1139204&r2=1142765&rev=1142765&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/AbstractToFoExtractor.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/FoDocumentFacade.java Mon Jul  4 19:08:06 2011
@@ -1,37 +1,34 @@
-/*
- *  ====================================================================
- *    Licensed to the Apache Software Foundation (ASF) under one or more
- *    contributor license agreements.  See the NOTICE file distributed with
- *    this work for additional information regarding copyright ownership.
- *    The ASF licenses this file to You under the Apache License, Version 2.0
- *    (the "License"); you may not use this file except in compliance with
- *    the License.  You may obtain a copy of the License at
- *
- *        http://www.apache.org/licenses/LICENSE-2.0
- *
- *    Unless required by applicable law or agreed to in writing, software
- *    distributed under the License is distributed on an "AS IS" BASIS,
- *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *    See the License for the specific language governing permissions and
- *    limitations under the License.
- * ====================================================================
- */
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
 package org.apache.poi.hwpf.extractor;
 
 import org.w3c.dom.Document;
 import org.w3c.dom.Element;
 import org.w3c.dom.Text;
 
-public abstract class AbstractToFoExtractor
+public class FoDocumentFacade
 {
-
     private static final String NS_XSLFO = "http://www.w3.org/1999/XSL/Format";
 
     protected final Document document;
     protected final Element layoutMasterSet;
     protected final Element root;
 
-    public AbstractToFoExtractor( Document document )
+    public FoDocumentFacade( Document document )
     {
         this.document = document;
 
@@ -43,7 +40,7 @@ public abstract class AbstractToFoExtrac
         root.appendChild( layoutMasterSet );
     }
 
-    protected Element addFlowToPageSequence( final Element pageSequence,
+    public Element addFlowToPageSequence( final Element pageSequence,
             String flowName )
     {
         final Element flow = document.createElementNS( NS_XSLFO, "fo:flow" );
@@ -53,28 +50,28 @@ public abstract class AbstractToFoExtrac
         return flow;
     }
 
-    protected Element addListItem( Element listBlock )
+    public Element addListItem( Element listBlock )
     {
         Element result = createListItem();
         listBlock.appendChild( result );
         return result;
     }
 
-    protected Element addListItemBody( Element listItem )
+    public Element addListItemBody( Element listItem )
     {
         Element result = createListItemBody();
         listItem.appendChild( result );
         return result;
     }
 
-    protected Element addListItemLabel( Element listItem, String text )
+    public Element addListItemLabel( Element listItem, String text )
     {
         Element result = createListItemLabel( text );
         listItem.appendChild( result );
         return result;
     }
 
-    protected Element addPageSequence( String pageMaster )
+    public Element addPageSequence( String pageMaster )
     {
         final Element pageSequence = document.createElementNS( NS_XSLFO,
                 "fo:page-sequence" );
@@ -83,7 +80,7 @@ public abstract class AbstractToFoExtrac
         return pageSequence;
     }
 
-    protected Element addRegionBody( Element pageMaster )
+    public Element addRegionBody( Element pageMaster )
     {
         final Element regionBody = document.createElementNS( NS_XSLFO,
                 "fo:region-body" );
@@ -92,7 +89,7 @@ public abstract class AbstractToFoExtrac
         return regionBody;
     }
 
-    protected Element addSimplePageMaster( String masterName )
+    public Element addSimplePageMaster( String masterName )
     {
         final Element simplePageMaster = document.createElementNS( NS_XSLFO,
                 "fo:simple-page-master" );
@@ -110,7 +107,7 @@ public abstract class AbstractToFoExtrac
         return basicLink;
     }
 
-    protected Element createBasicLinkInternal( String internalDestination )
+    public Element createBasicLinkInternal( String internalDestination )
     {
         final Element basicLink = document.createElementNS( NS_XSLFO,
                 "fo:basic-link" );
@@ -118,12 +115,12 @@ public abstract class AbstractToFoExtrac
         return basicLink;
     }
 
-    protected Element createBlock()
+    public Element createBlock()
     {
         return document.createElementNS( NS_XSLFO, "fo:block" );
     }
 
-    protected Element createExternalGraphic( String source )
+    public Element createExternalGraphic( String source )
     {
         Element result = document.createElementNS( NS_XSLFO,
                 "fo:external-graphic" );
@@ -131,32 +128,32 @@ public abstract class AbstractToFoExtrac
         return result;
     }
 
-    protected Element createInline()
+    public Element createInline()
     {
         return document.createElementNS( NS_XSLFO, "fo:inline" );
     }
 
-    protected Element createLeader()
+    public Element createLeader()
     {
         return document.createElementNS( NS_XSLFO, "fo:leader" );
     }
 
-    protected Element createListBlock()
+    public Element createListBlock()
     {
         return document.createElementNS( NS_XSLFO, "fo:list-block" );
     }
 
-    protected Element createListItem()
+    public Element createListItem()
     {
         return document.createElementNS( NS_XSLFO, "fo:list-item" );
     }
 
-    protected Element createListItemBody()
+    public Element createListItemBody()
     {
         return document.createElementNS( NS_XSLFO, "fo:list-item-body" );
     }
 
-    protected Element createListItemLabel( String text )
+    public Element createListItemLabel( String text )
     {
         Element result = document.createElementNS( NS_XSLFO,
                 "fo:list-item-label" );

Added: poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/HtmlDocumentFacade.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/HtmlDocumentFacade.java?rev=1142765&view=auto
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/HtmlDocumentFacade.java (added)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/HtmlDocumentFacade.java Mon Jul  4 19:08:06 2011
@@ -0,0 +1,107 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf.extractor;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Text;
+
+public class HtmlDocumentFacade
+{
+
+    protected final Element body;
+    protected final Document document;
+    protected final Element head;
+    protected final Element html;
+
+    public HtmlDocumentFacade( Document document )
+    {
+        this.document = document;
+
+        html = document.createElement( "html" );
+        document.appendChild( html );
+
+        body = document.createElement( "body" );
+        head = document.createElement( "head" );
+
+        html.appendChild( head );
+        html.appendChild( body );
+    }
+
+    public Element createHyperlink( String internalDestination )
+    {
+        final Element basicLink = document.createElement( "a" );
+        basicLink.setAttribute( "href", internalDestination );
+        return basicLink;
+    }
+
+    public Element createListItem()
+    {
+        return document.createElement( "li" );
+    }
+
+    public Element createParagraph()
+    {
+        return document.createElement( "p" );
+    }
+
+    public Element createTable()
+    {
+        return document.createElement( "table" );
+    }
+
+    public Element createTableBody()
+    {
+        return document.createElement( "tbody" );
+    }
+
+    public Element createTableCell()
+    {
+        return document.createElement( "td" );
+    }
+
+    public Element createTableHeader()
+    {
+        return document.createElement( "thead" );
+    }
+
+    public Element createTableHeaderCell()
+    {
+        return document.createElement( "th" );
+    }
+
+    public Element createTableRow()
+    {
+        return document.createElement( "tr" );
+    }
+
+    public Text createText( String data )
+    {
+        return document.createTextNode( data );
+    }
+
+    public Element createUnorderedList()
+    {
+        return document.createElement( "ul" );
+    }
+
+    public Document getDocument()
+    {
+        return document;
+    }
+
+}

Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToFoExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToFoExtractor.java?rev=1142765&r1=1142764&r2=1142765&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToFoExtractor.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToFoExtractor.java Mon Jul  4 19:08:06 2011
@@ -1,32 +1,27 @@
-/*
- *  ====================================================================
- *    Licensed to the Apache Software Foundation (ASF) under one or more
- *    contributor license agreements.  See the NOTICE file distributed with
- *    this work for additional information regarding copyright ownership.
- *    The ASF licenses this file to You under the Apache License, Version 2.0
- *    (the "License"); you may not use this file except in compliance with
- *    the License.  You may obtain a copy of the License at
- *
- *        http://www.apache.org/licenses/LICENSE-2.0
- *
- *    Unless required by applicable law or agreed to in writing, software
- *    distributed under the License is distributed on an "AS IS" BASIS,
- *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *    See the License for the specific language governing permissions and
- *    limitations under the License.
- * ====================================================================
- */
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
 package org.apache.poi.hwpf.extractor;
 
 import java.io.File;
-import java.io.FileInputStream;
 import java.io.FileWriter;
-import java.io.IOException;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 import java.util.Stack;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
 
 import javax.xml.parsers.DocumentBuilderFactory;
 import javax.xml.transform.OutputKeys;
@@ -36,8 +31,10 @@ import javax.xml.transform.dom.DOMSource
 import javax.xml.transform.stream.StreamResult;
 
 import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.HWPFDocumentCore;
 import org.apache.poi.hwpf.model.ListFormatOverride;
 import org.apache.poi.hwpf.model.ListTables;
+import org.apache.poi.hwpf.usermodel.BorderCode;
 import org.apache.poi.hwpf.usermodel.CharacterRun;
 import org.apache.poi.hwpf.usermodel.Paragraph;
 import org.apache.poi.hwpf.usermodel.Picture;
@@ -54,12 +51,10 @@ import org.w3c.dom.Document;
 import org.w3c.dom.Element;
 import org.w3c.dom.Text;
 
-import static org.apache.poi.hwpf.extractor.WordToFoUtils.TWIPS_PER_INCH;
-
 /**
  * @author Sergey Vladimirov (vlsergey {at} gmail {dot} com)
  */
-public class WordToFoExtractor extends AbstractToFoExtractor
+public class WordToFoExtractor extends AbstractWordExtractor
 {
 
     /**
@@ -84,35 +79,55 @@ public class WordToFoExtractor extends A
         }
     }
 
-    private static final byte BEL_MARK = 7;
-
-    private static final byte FIELD_BEGIN_MARK = 19;
-
-    private static final byte FIELD_END_MARK = 21;
-
-    private static final byte FIELD_SEPARATOR_MARK = 20;
-
     private static final POILogger logger = POILogFactory
             .getLogger( WordToFoExtractor.class );
 
-    private static HWPFDocument loadDoc( File docFile ) throws IOException
+    public static String getBorderType( BorderCode borderCode )
     {
-        final FileInputStream istream = new FileInputStream( docFile );
-        try
+        if ( borderCode == null )
+            throw new IllegalArgumentException( "borderCode is null" );
+
+        switch ( borderCode.getBorderType() )
         {
-            return new HWPFDocument( istream );
-        }
-        finally
-        {
-            try
-            {
-                istream.close();
-            }
-            catch ( Exception exc )
-            {
-                logger.log( POILogger.ERROR,
-                        "Unable to close FileInputStream: " + exc, exc );
-            }
+        case 1:
+        case 2:
+            return "solid";
+        case 3:
+            return "double";
+        case 5:
+            return "solid";
+        case 6:
+            return "dotted";
+        case 7:
+        case 8:
+            return "dashed";
+        case 9:
+            return "dotted";
+        case 10:
+        case 11:
+        case 12:
+        case 13:
+        case 14:
+        case 15:
+        case 16:
+        case 17:
+        case 18:
+        case 19:
+            return "double";
+        case 20:
+            return "solid";
+        case 21:
+            return "double";
+        case 22:
+            return "dashed";
+        case 23:
+            return "dashed";
+        case 24:
+            return "ridge";
+        case 25:
+            return "grooved";
+        default:
+            return "solid";
         }
     }
 
@@ -160,7 +175,7 @@ public class WordToFoExtractor extends A
 
     static Document process( File docFile ) throws Exception
     {
-        final HWPFDocument hwpfDocument = loadDoc( docFile );
+        final HWPFDocumentCore hwpfDocument = WordToFoUtils.loadDoc( docFile );
         WordToFoExtractor wordToFoExtractor = new WordToFoExtractor(
                 DocumentBuilderFactory.newInstance().newDocumentBuilder()
                         .newDocument() );
@@ -170,6 +185,8 @@ public class WordToFoExtractor extends A
 
     private final Stack<BlockProperies> blocksProperies = new Stack<BlockProperies>();
 
+    protected final FoDocumentFacade foDocumentFacade;
+
     /**
      * Creates a new instance of {@link WordToFoExtractor}. Can be used to
      * output several {@link HWPFDocument}s into a single FO document.
@@ -180,27 +197,28 @@ public class WordToFoExtractor extends A
      */
     public WordToFoExtractor( Document document )
     {
-        super( document );
+        this.foDocumentFacade = new FoDocumentFacade( document );
     }
 
     protected String createPageMaster( SectionProperties sep, String type,
             int section )
     {
-        float height = sep.getYaPage() / TWIPS_PER_INCH;
-        float width = sep.getXaPage() / TWIPS_PER_INCH;
-        float leftMargin = sep.getDxaLeft() / TWIPS_PER_INCH;
-        float rightMargin = sep.getDxaRight() / TWIPS_PER_INCH;
-        float topMargin = sep.getDyaTop() / TWIPS_PER_INCH;
-        float bottomMargin = sep.getDyaBottom() / TWIPS_PER_INCH;
+        float height = sep.getYaPage() / WordToFoUtils.TWIPS_PER_INCH;
+        float width = sep.getXaPage() / WordToFoUtils.TWIPS_PER_INCH;
+        float leftMargin = sep.getDxaLeft() / WordToFoUtils.TWIPS_PER_INCH;
+        float rightMargin = sep.getDxaRight() / WordToFoUtils.TWIPS_PER_INCH;
+        float topMargin = sep.getDyaTop() / WordToFoUtils.TWIPS_PER_INCH;
+        float bottomMargin = sep.getDyaBottom() / WordToFoUtils.TWIPS_PER_INCH;
 
         // add these to the header
         String pageMasterName = type + "-page" + section;
 
-        Element pageMaster = addSimplePageMaster( pageMasterName );
+        Element pageMaster = foDocumentFacade
+                .addSimplePageMaster( pageMasterName );
         pageMaster.setAttribute( "page-height", height + "in" );
         pageMaster.setAttribute( "page-width", width + "in" );
 
-        Element regionBody = addRegionBody( pageMaster );
+        Element regionBody = foDocumentFacade.addRegionBody( pageMaster );
         regionBody.setAttribute( "margin", topMargin + "in " + rightMargin
                 + "in " + bottomMargin + "in " + leftMargin + "in" );
 
@@ -216,12 +234,13 @@ public class WordToFoExtractor extends A
 
         if ( sep.getCcolM1() > 0 )
         {
-            regionBody
-                    .setAttribute( "column-count", "" + (sep.getCcolM1() + 1) );
+            regionBody.setAttribute( "column-count", ""
+                    + ( sep.getCcolM1() + 1 ) );
             if ( sep.getFEvenlySpaced() )
             {
                 regionBody.setAttribute( "column-gap",
-                        (sep.getDxaColumns() / TWIPS_PER_INCH) + "in" );
+                        ( sep.getDxaColumns() / WordToFoUtils.TWIPS_PER_INCH )
+                                + "in" );
             }
             else
             {
@@ -232,171 +251,55 @@ public class WordToFoExtractor extends A
         return pageMasterName;
     }
 
-    protected boolean processCharacters( HWPFDocument hwpfDocument,
-            int currentTableLevel, Paragraph paragraph, final Element block,
-            final int start, final int end )
+    public Document getDocument()
     {
-        boolean haveAnyText = false;
-
-        for ( int c = start; c < end; c++ )
-        {
-            CharacterRun characterRun = paragraph.getCharacterRun( c );
-
-            if ( hwpfDocument.getPicturesTable().hasPicture( characterRun ) )
-            {
-                Picture picture = hwpfDocument.getPicturesTable()
-                        .extractPicture( characterRun, true );
-
-                processImage( block, characterRun.text().charAt( 0 ) == 0x01,
-                        picture );
-                continue;
-            }
-
-            String text = characterRun.text();
-            if ( text.getBytes().length == 0 )
-                continue;
-
-            if ( text.getBytes()[0] == FIELD_BEGIN_MARK )
-            {
-                int skipTo = tryField( hwpfDocument, paragraph,
-                        currentTableLevel, c, block );
-
-                if ( skipTo != c )
-                {
-                    c = skipTo;
-                    continue;
-                }
-
-                continue;
-            }
-            if ( text.getBytes()[0] == FIELD_SEPARATOR_MARK )
-            {
-                // shall not appear without FIELD_BEGIN_MARK
-                continue;
-            }
-            if ( text.getBytes()[0] == FIELD_END_MARK )
-            {
-                // shall not appear without FIELD_BEGIN_MARK
-                continue;
-            }
-
-            if ( characterRun.isSpecialCharacter() || characterRun.isObj()
-                    || characterRun.isOle2() )
-            {
-                continue;
-            }
-
-            BlockProperies blockProperies = this.blocksProperies.peek();
-            Element inline = createInline();
-            if ( characterRun.isBold() != blockProperies.pBold )
-            {
-                WordToFoUtils.setBold( inline, characterRun.isBold() );
-            }
-            if ( characterRun.isItalic() != blockProperies.pItalic )
-            {
-                WordToFoUtils.setItalic( inline, characterRun.isItalic() );
-            }
-            if ( !WordToFoUtils.equals( characterRun.getFontName(),
-                    blockProperies.pFontName ) )
-            {
-                WordToFoUtils
-                        .setFontFamily( inline, characterRun.getFontName() );
-            }
-            if ( characterRun.getFontSize() / 2 != blockProperies.pFontSize )
-            {
-                WordToFoUtils.setFontSize( inline,
-                        characterRun.getFontSize() / 2 );
-            }
-            WordToFoUtils.setCharactersProperties( characterRun, inline );
-            block.appendChild( inline );
-
-            if ( text.endsWith( "\r" )
-                    || (text.charAt( text.length() - 1 ) == BEL_MARK && currentTableLevel != 0) )
-                text = text.substring( 0, text.length() - 1 );
-
-            Text textNode = createText( text );
-            inline.appendChild( textNode );
-
-            haveAnyText |= text.trim().length() != 0;
-        }
-
-        return haveAnyText;
+        return foDocumentFacade.getDocument();
     }
 
-    public void processDocument( HWPFDocument hwpfDocument )
+    @Override
+    protected void outputCharacters( Element block, CharacterRun characterRun,
+            String text )
     {
-        final Range range = hwpfDocument.getRange();
-
-        for ( int s = 0; s < range.numSections(); s++ )
+        BlockProperies blockProperies = this.blocksProperies.peek();
+        Element inline = foDocumentFacade.createInline();
+        if ( characterRun.isBold() != blockProperies.pBold )
         {
-            processSection( hwpfDocument, range.getSection( s ), s );
+            WordToFoUtils.setBold( inline, characterRun.isBold() );
         }
-    }
-
-    protected void processField( HWPFDocument hwpfDocument,
-            Element currentBlock, Paragraph paragraph, int currentTableLevel,
-            int beginMark, int separatorMark, int endMark )
-    {
-
-        Pattern hyperlinkPattern = Pattern
-                .compile( "[ \\t\\r\\n]*HYPERLINK \"(.*)\"[ \\t\\r\\n]*" );
-        Pattern pagerefPattern = Pattern
-                .compile( "[ \\t\\r\\n]*PAGEREF ([^ ]*)[ \\t\\r\\n]*\\\\h[ \\t\\r\\n]*" );
-
-        if ( separatorMark - beginMark > 1 )
+        if ( characterRun.isItalic() != blockProperies.pItalic )
         {
-            CharacterRun firstAfterBegin = paragraph
-                    .getCharacterRun( beginMark + 1 );
-
-            final Matcher hyperlinkMatcher = hyperlinkPattern
-                    .matcher( firstAfterBegin.text() );
-            if ( hyperlinkMatcher.matches() )
-            {
-                String hyperlink = hyperlinkMatcher.group( 1 );
-                processHyperlink( hwpfDocument, currentBlock, paragraph,
-                        currentTableLevel, hyperlink, separatorMark + 1,
-                        endMark );
-                return;
-            }
-
-            final Matcher pagerefMatcher = pagerefPattern
-                    .matcher( firstAfterBegin.text() );
-            if ( pagerefMatcher.matches() )
-            {
-                String pageref = pagerefMatcher.group( 1 );
-                processPageref( hwpfDocument, currentBlock, paragraph,
-                        currentTableLevel, pageref, separatorMark + 1, endMark );
-                return;
-            }
+            WordToFoUtils.setItalic( inline, characterRun.isItalic() );
         }
-
-        StringBuilder debug = new StringBuilder( "Unsupported field type: \n" );
-        for ( int i = beginMark; i <= endMark; i++ )
+        if ( characterRun.getFontName() != null
+                && !AbstractWordUtils.equals( characterRun.getFontName(),
+                        blockProperies.pFontName ) )
         {
-            debug.append( "\t" );
-            debug.append( paragraph.getCharacterRun( i ) );
-            debug.append( "\n" );
+            WordToFoUtils.setFontFamily( inline, characterRun.getFontName() );
         }
-        logger.log( POILogger.WARN, debug );
-
-        // just output field value
-        if ( separatorMark + 1 < endMark )
-            processCharacters( hwpfDocument, currentTableLevel, paragraph,
-                    currentBlock, separatorMark + 1, endMark );
+        if ( characterRun.getFontSize() / 2 != blockProperies.pFontSize )
+        {
+            WordToFoUtils.setFontSize( inline, characterRun.getFontSize() / 2 );
+        }
+        WordToFoUtils.setCharactersProperties( characterRun, inline );
+        block.appendChild( inline );
 
-        return;
+        Text textNode = foDocumentFacade.createText( text );
+        inline.appendChild( textNode );
     }
 
-    protected void processHyperlink( HWPFDocument hwpfDocument,
-            Element currentBlock, Paragraph paragraph, int currentTableLevel,
+    protected void processHyperlink( HWPFDocumentCore hwpfDocument,
+            Element currentBlock, Paragraph paragraph,
+            List<CharacterRun> characterRuns, int currentTableLevel,
             String hyperlink, int beginTextInclusive, int endTextExclusive )
     {
-        Element basicLink = createBasicLinkExternal( hyperlink );
+        Element basicLink = foDocumentFacade
+                .createBasicLinkExternal( hyperlink );
         currentBlock.appendChild( basicLink );
 
         if ( beginTextInclusive < endTextExclusive )
             processCharacters( hwpfDocument, currentTableLevel, paragraph,
-                    basicLink, beginTextInclusive, endTextExclusive );
+                    basicLink, characterRuns, beginTextInclusive,
+                    endTextExclusive );
     }
 
     /**
@@ -422,27 +325,30 @@ public class WordToFoExtractor extends A
             Picture picture )
     {
         // no default implementation -- skip
-        currentBlock.appendChild( document.createComment( "Image link to '"
-                + picture.suggestFullFileName() + "' can be here" ) );
+        currentBlock.appendChild( foDocumentFacade.getDocument().createComment(
+                "Image link to '" + picture.suggestFullFileName()
+                        + "' can be here" ) );
     }
 
-    protected void processPageref( HWPFDocument hwpfDocument,
-            Element currentBlock, Paragraph paragraph, int currentTableLevel,
+    protected void processPageref( HWPFDocumentCore hwpfDocument,
+            Element currentBlock, Paragraph paragraph,
+            List<CharacterRun> characterRuns, int currentTableLevel,
             String pageref, int beginTextInclusive, int endTextExclusive )
     {
-        Element basicLink = createBasicLinkInternal( pageref );
+        Element basicLink = foDocumentFacade.createBasicLinkInternal( pageref );
         currentBlock.appendChild( basicLink );
 
         if ( beginTextInclusive < endTextExclusive )
             processCharacters( hwpfDocument, currentTableLevel, paragraph,
-                    basicLink, beginTextInclusive, endTextExclusive );
+                    basicLink, characterRuns, beginTextInclusive,
+                    endTextExclusive );
     }
 
-    protected void processParagraph( HWPFDocument hwpfDocument,
+    protected void processParagraph( HWPFDocumentCore hwpfDocument,
             Element parentFopElement, int currentTableLevel,
             Paragraph paragraph, String bulletText )
     {
-        final Element block = createBlock();
+        final Element block = foDocumentFacade.createBlock();
         parentFopElement.appendChild( block );
 
         WordToFoUtils.setParagraphProperties( paragraph, block );
@@ -480,21 +386,23 @@ public class WordToFoExtractor extends A
 
             if ( WordToFoUtils.isNotEmpty( bulletText ) )
             {
-                Element inline = createInline();
+                Element inline = foDocumentFacade.createInline();
                 block.appendChild( inline );
 
-                Text textNode = createText( bulletText );
+                Text textNode = foDocumentFacade.createText( bulletText );
                 inline.appendChild( textNode );
 
                 haveAnyText |= bulletText.trim().length() != 0;
             }
 
+            List<CharacterRun> characterRuns = WordToFoUtils
+                    .findCharacterRuns( paragraph );
             haveAnyText = processCharacters( hwpfDocument, currentTableLevel,
-                    paragraph, block, 0, charRuns );
+                    paragraph, block, characterRuns, 0, characterRuns.size() );
 
             if ( !haveAnyText )
             {
-                Element leader = createLeader();
+                Element leader = foDocumentFacade.createLeader();
                 block.appendChild( leader );
             }
         }
@@ -506,20 +414,21 @@ public class WordToFoExtractor extends A
         return;
     }
 
-    protected void processSection( HWPFDocument hwpfDocument, Section section,
-            int sectionCounter )
+    protected void processSection( HWPFDocumentCore wordDocument,
+            Section section, int sectionCounter )
     {
         String regularPage = createPageMaster(
                 WordToFoUtils.getSectionProperties( section ), "page",
                 sectionCounter );
 
-        Element pageSequence = addPageSequence( regularPage );
-        Element flow = addFlowToPageSequence( pageSequence, "xsl-region-body" );
+        Element pageSequence = foDocumentFacade.addPageSequence( regularPage );
+        Element flow = foDocumentFacade.addFlowToPageSequence( pageSequence,
+                "xsl-region-body" );
 
-        processSectionParagraphes( hwpfDocument, flow, section, 0 );
+        processSectionParagraphes( wordDocument, flow, section, 0 );
     }
 
-    protected void processSectionParagraphes( HWPFDocument hwpfDocument,
+    protected void processSectionParagraphes( HWPFDocument wordDocument,
             Element flow, Range range, int currentTableLevel )
     {
         final Map<Integer, Table> allTables = new HashMap<Integer, Table>();
@@ -530,7 +439,7 @@ public class WordToFoExtractor extends A
             allTables.put( Integer.valueOf( next.getStartOffset() ), next );
         }
 
-        final ListTables listTables = hwpfDocument.getListTables();
+        final ListTables listTables = wordDocument.getListTables();
         int currentListInfo = 0;
 
         final int paragraphs = range.numParagraphs();
@@ -543,7 +452,7 @@ public class WordToFoExtractor extends A
             {
                 Table table = allTables.get( Integer.valueOf( paragraph
                         .getStartOffset() ) );
-                processTable( hwpfDocument, flow, table, currentTableLevel + 1 );
+                processTable( wordDocument, flow, table, currentTableLevel + 1 );
                 continue;
             }
 
@@ -568,7 +477,7 @@ public class WordToFoExtractor extends A
                     String label = WordToFoUtils.getBulletText( listTables,
                             paragraph, listFormatOverride.getLsid() );
 
-                    processParagraph( hwpfDocument, flow, currentTableLevel,
+                    processParagraph( wordDocument, flow, currentTableLevel,
                             paragraph, label );
                 }
                 else
@@ -580,24 +489,24 @@ public class WordToFoExtractor extends A
                                     + currentListInfo
                                     + ", but listTables not defined in file" );
 
-                    processParagraph( hwpfDocument, flow, currentTableLevel,
+                    processParagraph( wordDocument, flow, currentTableLevel,
                             paragraph, WordToFoUtils.EMPTY );
                 }
             }
             else
             {
-                processParagraph( hwpfDocument, flow, currentTableLevel,
+                processParagraph( wordDocument, flow, currentTableLevel,
                         paragraph, WordToFoUtils.EMPTY );
             }
         }
 
     }
 
-    protected void processTable( HWPFDocument hwpfDocument, Element flow,
+    protected void processTable( HWPFDocumentCore wordDocument, Element flow,
             Table table, int thisTableLevel )
     {
-        Element tableHeader = createTableHeader();
-        Element tableBody = createTableBody();
+        Element tableHeader = foDocumentFacade.createTableHeader();
+        Element tableBody = foDocumentFacade.createTableBody();
 
         final int tableRows = table.numRows();
 
@@ -611,7 +520,7 @@ public class WordToFoExtractor extends A
         {
             TableRow tableRow = table.getRow( r );
 
-            Element tableRowElement = createTableRow();
+            Element tableRowElement = foDocumentFacade.createTableRow();
             WordToFoUtils.setTableRowProperties( tableRow, tableRowElement );
 
             final int rowCells = tableRow.numCells();
@@ -626,7 +535,7 @@ public class WordToFoExtractor extends A
                         && !tableCell.isFirstVerticallyMerged() )
                     continue;
 
-                Element tableCellElement = createTableCell();
+                Element tableCellElement = foDocumentFacade.createTableCell();
                 WordToFoUtils.setTableCellProperties( tableRow, tableCell,
                         tableCellElement, r == 0, r == tableRows - 1, c == 0,
                         c == rowCells - 1 );
@@ -649,9 +558,9 @@ public class WordToFoExtractor extends A
                 {
                     if ( c == rowCells - 1 && c != maxColumns - 1 )
                     {
-                        tableCellElement
-                                .setAttribute( "number-columns-spanned", ""
-                                        + (maxColumns - c) );
+                        tableCellElement.setAttribute(
+                                "number-columns-spanned", ""
+                                        + ( maxColumns - c ) );
                     }
                 }
 
@@ -673,12 +582,13 @@ public class WordToFoExtractor extends A
                             + count );
                 }
 
-                processSectionParagraphes( hwpfDocument, tableCellElement,
+                processSectionParagraphes( wordDocument, tableCellElement,
                         tableCell, thisTableLevel );
 
                 if ( !tableCellElement.hasChildNodes() )
                 {
-                    tableCellElement.appendChild( createBlock() );
+                    tableCellElement.appendChild( foDocumentFacade
+                            .createBlock() );
                 }
 
                 tableRowElement.appendChild( tableCellElement );
@@ -694,7 +604,7 @@ public class WordToFoExtractor extends A
             }
         }
 
-        final Element tableElement = createTable();
+        final Element tableElement = foDocumentFacade.createTable();
         if ( tableHeader.hasChildNodes() )
         {
             tableElement.appendChild( tableHeader );
@@ -714,51 +624,4 @@ public class WordToFoExtractor extends A
         }
     }
 
-    protected int tryField( HWPFDocument hwpfDocument, Paragraph paragraph,
-            int currentTableLevel, int beginMark, Element currentBlock )
-    {
-        int separatorMark = -1;
-        int endMark = -1;
-        for ( int c = beginMark + 1; c < paragraph.numCharacterRuns(); c++ )
-        {
-            CharacterRun characterRun = paragraph.getCharacterRun( c );
-
-            String text = characterRun.text();
-            if ( text.getBytes().length == 0 )
-                continue;
-
-            if ( text.getBytes()[0] == FIELD_SEPARATOR_MARK )
-            {
-                if ( separatorMark != -1 )
-                {
-                    // double;
-                    return beginMark;
-                }
-
-                separatorMark = c;
-                continue;
-            }
-
-            if ( text.getBytes()[0] == FIELD_END_MARK )
-            {
-                if ( endMark != -1 )
-                {
-                    // double;
-                    return beginMark;
-                }
-
-                endMark = c;
-                break;
-            }
-
-        }
-
-        if ( separatorMark == -1 || endMark == -1 )
-            return beginMark;
-
-        processField( hwpfDocument, currentBlock, paragraph, currentTableLevel,
-                beginMark, separatorMark, endMark );
-
-        return endMark;
-    }
 }



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@poi.apache.org
For additional commands, e-mail: commits-help@poi.apache.org


Mime
View raw message