Mailing-List: contact lucene-user-help@jakarta.apache.org; run by ezmlm
Precedence: bulk
Reply-To: "Lucene Users List" <lucene-user@jakarta.apache.org>
Message-ID: <024601c45ac9$021a7070$dadacda3@tdryan>
Reply-To: "Ryan Ackley" <sackley@apache.org>
From: "Ryan Ackley" <sackley@cfl.rr.com>
To: "Lucene Users List" <lucene-user@jakarta.apache.org>
References: <40DC1DA4.2020001@ifit.uni-klu.ac.at>
Subject: Re: Index MSOffice Documents
Date: Fri, 25 Jun 2004 11:28:04 -0400
MIME-Version: 1.0
Content-Type: text/plain;
	charset="iso-8859-1"
Content-Transfer-Encoding: 8bit

Thanks Sergiu,

You should also post to the Lucene Users list.

-Ryan

----- Original Message ----- 
From: "Sergiu Gordea" <gsergiu@ifit.uni-klu.ac.at>
To: "Lucene Users List" <lucene-user@jakarta.apache.org>;
<lucene-dev@jakarta.apache.org>
Cc: "POI Users List" <poi-user@jakarta.apache.org>
Sent: Friday, June 25, 2004 8:42 AM
Subject: Index MSOffice Documents


> Hi all,
>
>  I'm working on a project in which we are building a knowledge
> management platform. We are using Turbine/Velocity
> as framework and we are using lucene for search.
>
>  We want the search to be able to index MSOffice documents,
> therefore I was searching for some possibilities to extract the text
> from these
> documents. I found some examples based on the POI library
> (http://jakarta.apache.org/poi) and I adapted them to our needs.
> The extraction of the text elements from XLS files is, I think, reliable
> (the POI development community did a great job with the package that
> works with XLS files). The examples that extract the text from DOC and
> PPT files are not very general; I think they have problems with
> documents
> written with special charsets, but they are working just fine on the
> documents I use. I hope someone who has more experience than I have
> will improve this
> and will provide better source code.
>
>  Congratulations to all people involved in the development of the Jakarta
> project and its subprojects,
>
>  Sergiu Gordea
>
> PS: ExeConverterImpl uses an external stand-alone application (like
> antiword or pdf2txt) to extract the text.
>


----------------------------------------------------------------------------
----


> /* @(#) CWK 1.4 07.06.2004
>  *
>  * Copyright 2003-2005 ConfigWorks Informationssysteme & Consulting GmbH
>  * Universitätsstr. 94/7 9020 Klagenfurt Austria
>  * www.configworks.com
>  * All rights reserved.
>  */
>
> package com.configworks.cwk.be.search.converters;
>
> import java.io.BufferedWriter;
> import java.io.File;
> import java.io.FileNotFoundException;
> import java.io.IOException;
> import java.io.InputStream;
> import org.apache.commons.logging.Log;
> import org.apache.commons.logging.LogFactory;
> import org.apache.poi.hssf.usermodel.HSSFCell;
> import org.apache.poi.hssf.usermodel.HSSFRow;
> import org.apache.poi.hssf.usermodel.HSSFSheet;
> import org.apache.poi.hssf.usermodel.HSSFWorkbook;
>
> /**
>  * Class description
>  *
>  * @author sergiu
>  * @version 1.0
>  * @since CWK 1.5
>  */
> public class XLSConverterImpl extends JavaDocumentConverter {
>
>     private Log logger = null;
>     File dest = null;
>
>
>
>     public boolean extractText(InputStream reader, BufferedWriter writer)
throws FileNotFoundException,
>         IOException {
>
>         HSSFWorkbook workbook = new HSSFWorkbook(reader);
>
>         for (int k = 0; k < workbook.getNumberOfSheets(); k++) {
>             HSSFSheet sheet = workbook.getSheetAt(k);
>
>             if (sheet != null) {
>                 int rows = sheet.getLastRowNum();
>                 //I don't know why the last row = sheet.getRow(rows) and
first row = sheet.getRow(0)
>                 for (int r = 0; r <= rows; r++) {
>                 HSSFRow row = sheet.getRow(r);
>                 if (row != null) {
>                     int cells = row.getLastCellNum();
>                     for (int c = 0; c <= cells; c++) {
>                     HSSFCell cell = row.getCell((short) c);
>                     String value = null;
>                     if (cell != null) {
>                         switch (cell.getCellType()) {
>                             case HSSFCell.CELL_TYPE_FORMULA:
>                                 value = cell.getCellFormula();
>                                 break;
>                             case HSSFCell.CELL_TYPE_STRING:
>                                 value = cell.getStringCellValue();
>                                 break;
>                             case HSSFCell.CELL_TYPE_NUMERIC:
>                                 value = "" + cell.getNumericCellValue();
>                                 break;
>                             default:
>                                 value = cell.getStringCellValue();
>                         }
>                     }
>                     if (value != null) {
>                         writer.write(value + " ");
>                     }
>                 }//cels
>                 }
>             }//rows
>             }
>         }//sheets
>
>         //if no Exception was thrown consider that the conversion was
successful
>         return true;
>     }
>
>     /**
>      * @return Returns the logger.
>      */
>     public Log getLogger() {
>         if (logger == null)
>             logger = LogFactory.getLog(XLSConverterImpl.class);
>         return logger;
>     }
>
> }
>
>
>


----------------------------------------------------------------------------
----


> package com.configworks.cwk.be.search.converters;
>
> import com.configworks.cwk.share.Utils;
> import java.io.BufferedInputStream;
> import java.io.File;
> import java.io.FileReader;
> import java.io.IOException;
> import java.io.Reader;
> import org.apache.commons.logging.Log;
> import org.apache.commons.logging.LogFactory;
>
>
> /**
>  * Created by IntelliJ IDEA.
>  * User: Kostya
>  * Date: 12.09.2003
>  * Time: 11:39:25
>  * To change this template use Options | File Templates.
>  */
>
> public class ExeConverterImpl extends Converter {
>
>     private Log logger =
LogFactory.getLog(ExeConverterImpl.class.getName());
>
>     public Reader convertSource(File source) {
>         try {
>             // the type is not registered the file content will not be
added to the index
>             if (_config.getExecutionPath() == null) {
>                 return null;
>             }
>             // else convert file into a temp dir and return contents of
the converted file
>             else {
>                 // if no converter is specified the file will be added
withot conversion
>                 if (_config.getExecutionPath().length() == 0)
>                     return new FileReader(source);
>
>                 String execPath = _config.getExecutionPath();
>
>                 String sourcePath = source.getAbsolutePath();
>                 // create tempdir if it doesn't exists
>                 new File(_config.getTempDirectory()).mkdirs();
>
>                 String targetPath = _config.getTempDirectory() +
File.separator + source.getName()
>                     + ".txt";
>
>                 String params = "";
>                 if(_config.getPathParam()!= null){
>                 //add HOME parameter
>                 params += _config.getPathParam();
>                 }
>
>                 Process process = Utils.executeOSCommand(execPath,
sourcePath, targetPath, params);
>                 process.waitFor();
>                 if (logger.isTraceEnabled()) {
>                     BufferedInputStream stream=null;
>                     try {
>                     stream = new
BufferedInputStream(process.getErrorStream());
>                     int read = 0;
>                     String outErrorString = "";
>                     while ((read = stream.read()) > 0)
>                         outErrorString += ((char) read);
>                     stream.close();
>                     if (outErrorString.length() > 0)
>                         logger.error(outErrorString);
>                     } finally {
>                         if (stream!=null) {
>                             stream.close();
>                         }
>                     }
>                 }
>                 File convertedSource = new File(targetPath);
>                 convertedSource.deleteOnExit();
>                 return new FileReader(convertedSource);
>             }
>         } catch (IOException ex) {
>             if (logger.isErrorEnabled())
>                 logger.error("IOException: " + ex.getMessage());
>         } catch (InterruptedException ex) {
>             if (logger.isErrorEnabled())
>                 logger.error("InterruptedException: " + ex.getMessage());
>         }
>
>         return null;
>     }
> }
>


----------------------------------------------------------------------------
----


> /* @(#) CWK 1.4 25.06.2004
>  *
>  * Copyright 2003-2005 ConfigWorks Informationssysteme & Consulting GmbH
>  * Universitätsstr. 94/7 9020 Klagenfurt Austria
>  * www.configworks.com
>  * All rights reserved.
>  */
>
> package com.configworks.cwk.be.search.converters;
>
> import java.io.BufferedReader;
> import java.io.BufferedWriter;
> import java.io.File;
> import java.io.FileInputStream;
> import java.io.FileReader;
> import java.io.FileWriter;
> import java.io.IOException;
> import java.io.InputStream;
> import java.io.Reader;
>
> /**
>  * Class description
>  *
>  * @author sergiu
>  * @version 1.0
>  *
>  * @since CWK 1.5
>  */
> public abstract class JavaDocumentConverter extends Converter {
>
> File dest = null;
>
> /* (non-Javadoc)
> * @see
com.configworks.cwk.be.search.converters.Converter#convertSource(java.io.Fil
e)
> */
> public Reader convertSource(File source) {
> if (source == null)
> return null;
> Reader reader = null;
> InputStream inputStream = null;
> BufferedWriter writer = null;
> try {
> String filename = source.getName();
> filename = filename.replace('.', '_');
> filename += ".txt";
> File tmpDir = new File(_config.getTempDirectory());
> tmpDir.mkdirs();
> dest = new File(tmpDir.getPath(), filename);
> boolean created = dest.createNewFile();
>
> //create the input and output streams
> writer = new BufferedWriter(
> new FileWriter(dest));
> inputStream = new FileInputStream(source);
>
> extractText(inputStream, writer);
>
> if (!dest.exists())
> return null;
> dest.deleteOnExit();
> reader = new BufferedReader(new FileReader(dest));
>
> } catch (Exception e) {
> getLogger().error("JavaDocumentConverter cannot convert the source file: "
> + source.getAbsolutePath(), e);
> reader = null;
> }finally{
> try {
> if(writer != null)
> writer.close();
> if(inputStream != null)
> inputStream.close();
> } catch (IOException ex) {
> if(getLogger().isDebugEnabled())
> getLogger().error("Cannot close the stream: " + ex);
> }
> }
> return reader;
>
> }
>
> /**
> * @param inputStream
> * @param writer
> * @since CWK 1.4.1
> * @see
> */
> public abstract boolean extractText(InputStream inputStream,
BufferedWriter writer) throws IOException;
>
> }
>


----------------------------------------------------------------------------
----


> /* @(#) CWK 1.5 23.06.2004
>  *
>  * Copyright 2003-2005 ConfigWorks Informationssysteme & Consulting GmbH
>  * Universitätsstr. 94/7 9020 Klagenfurt Austria
>  * www.configworks.com
>  * All rights reserved.
>  */
>
> package com.configworks.cwk.be.search.converters;
>
> import java.io.BufferedWriter;
> import java.io.IOException;
> import java.io.InputStream;
> import org.apache.poi.hpsf.PropertySet;
> import org.apache.poi.hpsf.PropertySetFactory;
> import org.apache.poi.poifs.eventfilesystem.POIFSReader;
> import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
> import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
> import org.apache.poi.util.LittleEndian;
>
> /**
>  * Converter that extracts text from a PowerPoint (PPT) file by scanning
>  * the raw bytes of its "PowerPoint Document" stream.
>  *
>  * @author sergiu
>  * @version 1.0
>  * @since CWK 1.5
>  */
> public class PPTConverterImpl extends JavaDocumentConverter {
>
>     static final String lineSeparator =
>         System.getProperty("line.separator");
>
>     // Record type whose payload is copied to the output.
>     // NOTE(review): 4008 (0x0FA8) is presumably the 8-bit text record
>     // of the PowerPoint binary format - confirm.
>     private static final int TEXT_RECORD_TYPE = 4008;
>
>     /**
>      * Reads the POIFS container and lets the listener write the text it
>      * finds to the writer.
>      *
>      * @param reader stream on the PPT file
>      * @param writer destination of the extracted text
>      * @return always true; a failure is reported by an exception
>      * @throws IOException if the container cannot be read
>      */
>     public boolean extractText(InputStream reader, BufferedWriter writer)
>         throws IOException {
>         POIFSReader r = new POIFSReader();
>
>         /* Register a listener for *all* documents. */
>         MyPOIFSReaderListener listener = new MyPOIFSReaderListener(writer);
>         r.registerListener(listener);
>         r.read(reader);
>         // if no exception was thrown, consider that the conversion was
>         // successful
>         return true;
>     }
>
>     /**
>      * Listener that copies the text records of the "PowerPoint Document"
>      * stream to the writer; every other stream is ignored.
>      */
>     class MyPOIFSReaderListener implements POIFSReaderListener {
>         private BufferedWriter writer = null;
>
>         public MyPOIFSReaderListener(BufferedWriter writer) {
>             this.writer = writer;
>         }
>
>         /**
>          * Scans the stream byte by byte for text records and writes
>          * their bytes as characters. Errors are logged, not rethrown.
>          *
>          * @param event the stream found by the POIFS reader
>          */
>         public void processPOIFSReaderEvent(POIFSReaderEvent event) {
>             try {
>                 if (!event.getName().equalsIgnoreCase(
>                         "PowerPoint Document")) {
>                     return;
>                 }
>
>                 org.apache.poi.poifs.filesystem.DocumentInputStream dis =
>                     event.getStream();
>
>                 // the first 12 bytes of the stream are read and dropped
>                 dis.read(new byte[12]);
>
>                 byte[] data = new byte[dis.available()];
>                 dis.read(data, 0, dis.available());
>
>                 for (int i = 0; i < data.length - 20; i++) {
>                     long type = LittleEndian.getUShort(data, i + 2);
>                     long size = LittleEndian.getUInt(data, i + 4);
>                     if (type != TEXT_RECORD_TYPE) {
>                         continue;
>                     }
>
>                     int offset = i + 4 + 1;
>                     int length = (int) size + 3;
>                     int end = offset + length;
>                     // a wrong size must not lead past the buffer
>                     int limit = Math.min(end, data.length);
>
>                     for (int j = offset; j < limit; j++) {
>                         // mask the byte: a plain cast would turn the
>                         // values above 0x7F into U+FFxx characters
>                         writer.write(data[j] & 0xFF);
>                     }
>
>                     // go on right behind the record that was copied
>                     if (i < (end - 1)) {
>                         i = end - 1;
>                     }
>                 }
>             } catch (Exception e) {
>                 String msg = "Cannot index ppt file: \n";
>                 if (getLogger().isErrorEnabled()) {
>                     getLogger().error(msg + e, e);
>                 }
>             }
>         }
>     }
> }
>
>
>
>
>
>
>
>


----------------------------------------------------------------------------
----


> /* @(#) CWK 1.4 24.06.2004
>  *
>  * Copyright 2003-2005 ConfigWorks Informationssysteme & Consulting GmbH
>  * Universitätsstr. 94/7 9020 Klagenfurt Austria
>  * www.configworks.com
>  * All rights reserved.
>  */
>
> package com.configworks.cwk.be.search.converters;
>
> import java.io.BufferedWriter;
> import java.io.IOException;
> import java.io.InputStream;
> import java.util.ArrayList;
> import org.apache.poi.poifs.filesystem.DocumentEntry;
> import org.apache.poi.poifs.filesystem.DocumentInputStream;
> import org.apache.poi.poifs.filesystem.POIFSFileSystem;
> import org.apache.poi.util.LittleEndian;
>
> /**
>  * Class description
>  *
>  * @author sergiu
>  * @version 1.0
>  * @since CWK 1.5
>  */
> public class WordConverterImpl extends JavaDocumentConverter {
>
>     public boolean extractText(InputStream in, BufferedWriter writer)
throws IOException{
>     ArrayList text = new ArrayList();
>     POIFSFileSystem fsys = new POIFSFileSystem(in);
>
>     DocumentEntry headerProps =
>     (DocumentEntry)fsys.getRoot().getEntry("WordDocument");
>     DocumentInputStream din =
fsys.createDocumentInputStream("WordDocument");
>     byte[] header = new byte[headerProps.getSize()];
>
>     din.read(header);
>     din.close();
>     // Prende le informazioni dall'header del documento
>     int info = LittleEndian.getShort(header, 0xa);
>
>     boolean useTable1 = (info & 0x200) != 0;
>
>     // Prende informazioni dalla piece table
>     int complexOffset = LittleEndian.getInt(header, 0x1a2);
>
>
>     String tableName = null;
>     if (useTable1)
>     tableName = "1Table";
>     else
>     tableName = "0Table";
>
>     DocumentEntry table =
(DocumentEntry)fsys.getRoot().getEntry(tableName);
>     byte[] tableStream = new byte[table.getSize()];
>
>     din = fsys.createDocumentInputStream(tableName);
>
>     din.read(tableStream);
>     din.close();
>
>     din = null;
>     fsys = null;
>     table = null;
>     headerProps = null;
>
>     int multiple = findText(tableStream, complexOffset, text);
>
>     StringBuffer sb = new StringBuffer();
>     int size = text.size();
>     tableStream = null;
>
>     for (int x = 0; x < size; x++){
>     WordTextPiece nextPiece = (WordTextPiece)text.get(x);
>     int start = nextPiece.getStart();
>     int length = nextPiece.getLength();
>
>     boolean unicode = nextPiece.usesUnicode();
>     String toStr = null;
>     if (unicode)
>     toStr = new String(header, start, length * multiple, "UTF-16LE");
>     else
>     toStr = new String(header, start, length , "ISO-8859-1");
>
>     //sb.append(toStr).append(" ");
>     toStr += " ";
>     writer.write(toStr);
>     }
>     //if no exeption occured we say that the conversion was successfully
realized
>     return true;
>     }
>
>     private static int findText(byte[] tableStream, int complexOffset,
> ArrayList text) throws IOException{
>     //actual text
>     int pos = complexOffset;
>     int multiple = 2;
>     //skips through the prms before we reach the piece table. These
contain data
>     //for actual fast saved files
>     while(tableStream[pos] == 1){
>     pos++;
>     int skip = LittleEndian.getShort(tableStream, pos);
>     pos += 2 + skip;
>     }
>
>     if(tableStream[pos] != 2){
>     throw new IOException("corrupted Word file");
>     }else{
>     //parse out the text pieces
>     int pieceTableSize = LittleEndian.getInt(tableStream, ++pos);
>     pos += 4;
>     int pieces = (pieceTableSize - 4) / 12;
>     for (int x = 0; x < pieces; x++){
>     int filePos = LittleEndian.getInt(tableStream, pos + ((pieces
>     + 1) * 4) +
>     (x * 8) + 2);
>     boolean unicode = false;
>     if ((filePos & 0x40000000) == 0){
>     unicode = true;
>     }else{
>     unicode = false;
>     multiple = 1;
>     filePos &= ~(0x40000000);//gives me FC in doc stream
>     filePos /= 2;
>     }
>
>     int totLength = LittleEndian.getInt(tableStream, pos + (x + 1) * 4)
>     - LittleEndian.getInt(tableStream, pos + (x * 4));
>
>     WordTextPiece piece = new WordTextPiece(filePos, totLength, unicode);
>
>     text.add(piece);
>     }
>     }
>     return multiple;
>     }
>
>
>
> }
>
>
>


----------------------------------------------------------------------------
----


> /* @(#) CWK 1.4 07.06.2004
>  *
>  * Copyright 2003-2005 ConfigWorks Informationssysteme & Consulting GmbH
>  * Universitätsstr. 94/7 9020 Klagenfurt Austria
>  * www.configworks.com
>  * All rights reserved.
>  */
>
> package com.configworks.cwk.be.search.converters;
>
> /**
>  * Value holder for one text piece of a Word document: where the piece
>  * starts, how long it is and whether it is stored as Unicode.
>  *
>  * @author sergiu
>  * @version 1.0
>  *
>  * @since CWK 1.4
>  */
> class WordTextPiece {
>
>     private final int start;
>     private final int length;
>     private final boolean unicode;
>
>     /**
>      * @param start start position of the piece
>      * @param length length of the piece
>      * @param unicode true when the piece is stored as Unicode
>      */
>     public WordTextPiece(int start, int length, boolean unicode) {
>         this.start = start;
>         this.length = length;
>         this.unicode = unicode;
>     }
>
>     /** @return true when the piece is stored as Unicode */
>     public boolean usesUnicode() {
>         return this.unicode;
>     }
>
>     /** @return the start position given to the constructor */
>     public int getStart() {
>         return this.start;
>     }
>
>     /** @return the length given to the constructor */
>     public int getLength() {
>         return this.length;
>     }
>
> }
>
>
>
>
>
>


----------------------------------------------------------------------------
----


> package com.configworks.cwk.be.search.converters;
>
> import java.io.File;
> import java.io.Reader;
> import org.apache.commons.logging.Log;
> import org.apache.commons.logging.LogFactory;
>
> /**
>  * Base class of all converters: a converter turns a source file into a
>  * reader on its text.
>  *
>  * Created by Kostya on 11.09.2003.
>  */
> public abstract class Converter {
>     protected ConverterConfig _config;
>     private static Log logger = null;
>
>     /**
>      * Converts the source file.
>      *
>      * @param source the file to convert
>      * @return a reader on the text of the file; the implementations in
>      *         this package return null when the file cannot be converted
>      */
>     public abstract Reader convertSource(File source);
>
>     /**
>      * Stores the configuration used by the converter.
>      *
>      * @param config the configuration to use
>      */
>     protected void Initialize(ConverterConfig config) {
>         _config = config;
>     }
>
>     /**
>      * @return the logger shared by the converters, created on first use
>      */
>     public Log getLogger() {
>         if (logger == null) {
>             // the category is this base class, not one of the subclasses
>             logger = LogFactory.getLog(Converter.class);
>         }
>         return logger;
>     }
> }
>
>
>
>


----------------------------------------------------------------------------
----


> ---------------------------------------------------------------------
> To unsubscribe, e-mail: lucene-user-unsubscribe@jakarta.apache.org
> For additional commands, e-mail: lucene-user-help@jakarta.apache.org
>


---------------------------------------------------------------------
To unsubscribe, e-mail: lucene-user-unsubscribe@jakarta.apache.org
For additional commands, e-mail: lucene-user-help@jakarta.apache.org