lucene-java-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "Thierry Ferrero" <ferr...@itldev.info>
Subject Re: Indexing MS Files
Date Wed, 10 Nov 2004 17:23:47 GMT
I used the OpenOffice API to convert all versions of Word and Excel files.
For me it's "the solution" for complex Word and Excel documents.
http://api.openoffice.org/
Good luck !

// UNO API
import com.sun.star.bridge.XUnoUrlResolver;
import com.sun.star.uno.XComponentContext;
import com.sun.star.uno.UnoRuntime;
import com.sun.star.frame.XComponentLoader;
import com.sun.star.frame.XStorable;
import com.sun.star.beans.PropertyValue;
import com.sun.star.beans.XPropertySet;
import com.sun.star.lang.XComponent;
import com.sun.star.lang.XMultiComponentFactory;
import com.sun.star.connection.NoConnectException;
import com.sun.star.io.IOException;


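/**
 * Converts a document to another format with the help of a running
 * OpenOffice.org instance and returns the path of the converted file to the
 * caller. The class is meant to be used from a servlet environment (the
 * OpenOffice.org host and port are read via the ServletContext), but it is
 * not itself a servlet.
 */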
public class DocConverter {

 private String stringHost;
 private String stringPort;
 private Xcontext xcontext;
 private Xbase xbase;

 /**
  * Creates a converter and reads the OpenOffice.org connection settings.
  *
  * The host and port of the office instance are obtained through
  * {@code ApplicationUtil.getParameter} under the keys
  * {@code "openoffice.oohost"} and {@code "openoffice.ooport"}.
  *
  * @param xbase    application object kept for later use; the conversion
  *                 asks it for the {@code "local"} alias when building the
  *                 output path
  * @param xcontext application context, stored as-is
  * @param sc       servlet context handed to
  *                 {@code ApplicationUtil.getParameter}
  */
 public DocConverter(Xbase xbase, Xcontext xcontext, ServletContext sc) {
     this.stringHost = ApplicationUtil.getParameter(sc, "openoffice.oohost");
     this.stringPort = ApplicationUtil.getParameter(sc, "openoffice.ooport");
     this.xcontext = xcontext;
     this.xbase = xbase;
 }

 /**
  * Public entry point for a conversion; simply delegates to
  * {@code convertDocument}.
  *
  * The method is {@code synchronized}, so calls on the same instance are
  * executed one at a time. Despite the name, the target format is whatever
  * filter the caller passes in {@code stringConvertType}.
  *
  * @param namedoc           file name of the document to convert
  * @param pathdoc           location of the document
  * @param stringConvertType OpenOffice.org filter name to convert to
  * @param stringExtension   extension given to the converted file
  * @return whatever {@code convertDocument} returns: the converted file's
  *         path, or an empty string on failure
  */
 public synchronized String convertToTxt(String namedoc, String pathdoc,
         String stringConvertType, String stringExtension) {
     return convertDocument(namedoc, pathdoc, stringConvertType, stringExtension);
 }


 /** This method converts a document to a given type by using a running
 * OpenOffice.org and saves the converted document to the specified
 * working directory.
 * @param stringDocumentName The full path name of the file on the server to
be converted.
 * @param stringConvertType Type to convert to.
 * @param stringExtension This string will be appended to the file name of
the converted file.
 * @return The full path name of the converted file will be returned.
 * @see stringWorkingDirectory
 */
 private String convertDocument(String namedoc, String pathdoc, String
stringConvertType, String stringExtension ) {

 String tagerr="";
    String stringUrl="";
    String stringConvertedFile = "";
    // Converting the document to the favoured type
    try {
      tagerr="0";
      // Composing the URL - suppression de l'extension
      stringUrl = pathdoc+"/"+namedoc;
     stringUrl=stringUrl.replace( '\\', '/' );
      /* Bootstraps a component context with the jurt base components
         registered. Component context to be granted to a component for
running.
         Arbitrary values can be retrieved from the context. */
      XComponentContext xcomponentcontext =
      com.sun.star.comp.helper.Bootstrap.createInitialComponentContext(
null );

      /* Gets the service manager instance to be used (or null). This method
has
         been added for convenience, because the service manager is a often
used
         object. */
      XMultiComponentFactory xmulticomponentfactory =
      xcomponentcontext.getServiceManager();
   tagerr="2";
      /* Creates an instance of the component UnoUrlResolver which
         supports the services specified by the factory. */
      Object objectUrlResolver =
      xmulticomponentfactory.createInstanceWithContext(
      "com.sun.star.bridge.UnoUrlResolver", xcomponentcontext );
       // Create a new url resolver
      XUnoUrlResolver xurlresolver = ( XUnoUrlResolver )
      UnoRuntime.queryInterface( XUnoUrlResolver.class,
      objectUrlResolver );
        // Resolves an object that is specified as follow:
      // uno:<connection description>;<protocol description>;<initial object
name>
      Object objectInitial = xurlresolver.resolve(
      "uno:socket,host=" + stringHost + ",port=" + stringPort +
";urp;StarOffice.ServiceManager" );

      // Create a service manager from the initial object
      xmulticomponentfactory = ( XMultiComponentFactory )
      UnoRuntime.queryInterface( XMultiComponentFactory.class,
objectInitial );
      // Query for the XPropertySet interface.
      XPropertySet xpropertysetMultiComponentFactory = ( XPropertySet )
      UnoRuntime.queryInterface( XPropertySet.class,
xmulticomponentfactory );
       // Get the default context from the office server.
      Object objectDefaultContext =
      xpropertysetMultiComponentFactory.getPropertyValue(
"DefaultContext" );

      // Query for the interface XComponentContext.
      xcomponentcontext = ( XComponentContext ) UnoRuntime.queryInterface(
      XComponentContext.class, objectDefaultContext );

      /* A desktop environment contains tasks with one or more
         frames in which components can be loaded. Desktop is the
         environment for components which can instanciate within
         frames. */
      XComponentLoader xcomponentloader = ( XComponentLoader )
      UnoRuntime.queryInterface( XComponentLoader.class,
      xmulticomponentfactory.createInstanceWithContext(
      "com.sun.star.frame.Desktop", xcomponentcontext ) );

      // Preparing properties for loading the document
      PropertyValue propertyvalue[] = new PropertyValue[ 1 ];
      // Setting the flag for hidding the open document
      propertyvalue[ 0 ] = new PropertyValue();
      propertyvalue[ 0 ].Name = "Hidden";
      propertyvalue[ 0 ].Value = new Boolean(true);


      // Loading the wanted document
      Object objectDocumentToStore =
      xcomponentloader.loadComponentFromURL(
      stringUrl, "_blank", 0, propertyvalue );

      // Getting an object that will offer a simple way to store a document
to a URL.
      XStorable xstorable =
      ( XStorable ) UnoRuntime.queryInterface( XStorable.class,
      objectDocumentToStore );

      // Preparing properties for converting the document
      propertyvalue = new PropertyValue[2];
      // Setting the flag for overwriting
      propertyvalue[0] = new PropertyValue();
      propertyvalue[0].Name = "Overwrite";
      propertyvalue[0].Value = new Boolean(true);

      // Setting the filter name
      propertyvalue[1] = new PropertyValue();
      propertyvalue[1].Name = "FilterName";
      propertyvalue[1].Value = stringConvertType;

        // Appending the favoured extension to the origin document name
        //if(stringUrl.lastIndexOf(".")!=0){
   //stringUrl=stringUrl.substring(0,stringUrl.lastIndexOf("."));
  //}

        if(namedoc.lastIndexOf(".")!=-1){
   namedoc=namedoc.substring(0,namedoc.lastIndexOf("."));
  }

  //stringConvertedFile = stringUrl + "." + stringExtension;



stringConvertedFile=xbase.getAlias("local")+"/oo_tmp/"+namedoc+"."+stringExt
ension;

    stringConvertedFile=stringConvertedFile.replace( '\\', '/' );

      // Storing and converting the document
    xstorable.storeToURL( stringConvertedFile, propertyvalue );

      // Getting the method dispose() for closing the document
      XComponent xcomponent =
      ( XComponent ) UnoRuntime.queryInterface( XComponent.class,
      xstorable );

      // Closing the converted document
      xcomponent.dispose();
    }

     catch(NoConnectException ex ) {
      return( "" );
    }
 catch( IOException ex ) {
     return( "" );
 }
    catch( Exception ex ) {
        return( "" );
    }


    // Returning the name of the converted file
    return( stringConvertedFile );
  }


----- Original Message ----- 
From: "Luke Shannon" <lshannon@hypermedia.com>
To: "Lucene Users List" <lucene-user@jakarta.apache.org>
Sent: Wednesday, November 10, 2004 5:59 PM
Subject: Re: Indexing MS Files


> Thanks Otis. I am looking forward to this book. Any idea when it may be
> released?
>
> ----- Original Message ----- 
> From: "Otis Gospodnetic" <otis_gospodnetic@yahoo.com>
> To: "Lucene Users List" <lucene-user@jakarta.apache.org>
> Sent: Wednesday, November 10, 2004 11:54 AM
> Subject: Re: Indexing MS Files
>
>
> > That's one place to start.  The other one would be textmining.org, at
> > least for Word files.
> > I used both POI and Textmining API in Lucene in Action, and the latter
> > was much simpler to use.  You can also find some comments about both
> > libs in lucene-user archives.  People tend to like Textmining API
> > better.
> >
> > Otis
> >
> > --- Luke Shannon <lshannon@hypermedia.com> wrote:
> >
> > > I need to index Word, Excel and Power Point files.
> > >
> > > Is this the place to start?
> > >
> > > http://jakarta.apache.org/poi/
> > >
> > > Is there something better?
> > >
> > > Thanks,
> > >
> > > Luke
> >
> >
> > ---------------------------------------------------------------------
> > To unsubscribe, e-mail: lucene-user-unsubscribe@jakarta.apache.org
> > For additional commands, e-mail: lucene-user-help@jakarta.apache.org
> >
> >
>
>
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: lucene-user-unsubscribe@jakarta.apache.org
> For additional commands, e-mail: lucene-user-help@jakarta.apache.org
>
>



---------------------------------------------------------------------
To unsubscribe, e-mail: lucene-user-unsubscribe@jakarta.apache.org
For additional commands, e-mail: lucene-user-help@jakarta.apache.org


Mime
View raw message