jackrabbit-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Martin Perez <mper...@gmail.com>
Subject Re: Another issue with text filtering
Date Fri, 28 Oct 2005 10:07:14 GMT
Are these changes on the SVN repository?

I can't see them...

Regards,

Martin

On 10/28/05, Marcel Reutegger <marcel.reutegger@gmx.net> wrote:
>
> Hi Martin,
>
> I quickly put together a patch for the PDF text filter. It is completely
> untested because I'm a bit short of time at the moment.
>
> Any feedback on whether it works is appreciated.
>
> regards
> marcel
>
> Martin Perez wrote:
> > If you want to add a PDF document to a repository using a PdfTextFilter,
> > and you do the following steps:
> >
> > session.save()
> > node.checkin();
> >
> > The method PdfTextFilter.doFilter() gets called 4 times!!!
> >
> > The session's save method calls doFilter one time. This is normal.
> >
> > But the checkin method calls doFilter three times. Is this normal? I do not
> > see the sense in it.
> >
> > Thanks.
> >
> > Martin
> >
>
>
> Index: java/org/apache/jackrabbit/core/query/LazyReader.java
> ===================================================================
> --- java/org/apache/jackrabbit/core/query/LazyReader.java (revision 0)
> +++ java/org/apache/jackrabbit/core/query/LazyReader.java (revision 0)
> @@ -0,0 +1,66 @@
> +/*
> + * Copyright 2004-2005 The Apache Software Foundation or its licensors,
> + * as applicable.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +package org.apache.jackrabbit.core.query;
> +
> +import java.io.Reader;
> +import java.io.IOException;
> +
> +/**
> + * <code>LazyReader</code> implements a utility that allows an implementing
> + * class to lazily initialize an actual reader.
> + */
> +public abstract class LazyReader extends Reader {
> +
> + /**
> + * The actual reader, set by concrete sub class.
> + */
> + protected Reader delegate;
> +
> + /**
> + * Implementation must set the actual reader {@link #delegate} when
> + * this method is called.
> + *
> + * @throws IOException if an error occurs.
> + */
> + protected abstract void initializeReader() throws IOException;
> +
> + /**
> + * Closes the underlying reader.
> + *
> + * @throws IOException if an exception occurs while closing the underlying
> + * reader.
> + */
> + public void close() throws IOException {
> + if (delegate != null) {
> + delegate.close();
> + }
> + }
> +
> + /**
> + * @inheritDoc
> + */
> + public int read(char cbuf[], int off, int len) throws IOException {
> + if (delegate == null) {
> + initializeReader();
> + }
> + // be suspicious
> + if (delegate == null) {
> + throw new IOException("reader not initialized");
> + }
> + return delegate.read(cbuf, off, len);
> + }
> +}
>
> Property changes on: java/org/apache/jackrabbit/core/query/LazyReader.java
> ___________________________________________________________________
> Name: svn:eol-style
> + native
>
> Index: java/org/apache/jackrabbit/core/query/PdfTextFilter.java
> ===================================================================
> --- java/org/apache/jackrabbit/core/query/PdfTextFilter.java (revision 329171)
> +++ java/org/apache/jackrabbit/core/query/PdfTextFilter.java (working copy)
> @@ -57,31 +57,37 @@
> public Map doFilter(PropertyState data, String encoding) throws
> RepositoryException {
> InternalValue[] values = data.getValues();
> if (values.length > 0) {
> - BLOBFileValue blob = (BLOBFileValue) values[0].internalValue();
> -
> - try {
> - PDFParser parser = new PDFParser(blob.getStream());
> - parser.parse();
> -
> - PDDocument document = parser.getPDDocument();
> -
> - CharArrayWriter writer = new CharArrayWriter();
> -
> - PDFTextStripper stripper = new PDFTextStripper();
> - stripper.setLineSeparator("\n");
> - stripper.writeText(document, writer);
> -
> - document.close();
> - writer.close();
> -
> - Map result = new HashMap();
> - result.put(FieldNames.FULLTEXT, new CharArrayReader(writer.toCharArray
> ()));
> - return result;
> - }
> - catch (IOException ex) {
> - throw new RepositoryException(ex);
> - }
> - }
> + final BLOBFileValue blob = (BLOBFileValue) values[0].internalValue();
> + LazyReader reader = new LazyReader() {
> + protected void initializeReader() throws IOException {
> + PDFParser parser;
> + try {
> + parser = new PDFParser(blob.getStream());
> + } catch (RepositoryException e) {
> + throw new IOException(e.getMessage());
> + }
> + parser.parse();
> +
> + PDDocument document = parser.getPDDocument();
> +
> + CharArrayWriter writer = new CharArrayWriter();
> +
> + PDFTextStripper stripper = new PDFTextStripper();
> + stripper.setLineSeparator("\n");
> + stripper.writeText(document, writer);
> +
> + document.close();
> + writer.close();
> +
> + delegate = new CharArrayReader(writer.toCharArray());
> + }
> + };
> +
> +
> + Map result = new HashMap();
> + result.put(FieldNames.FULLTEXT, reader);
> + return result;
> + }
> else {
> // multi value not supported
> throw new RepositoryException("Multi-valued binary properties not
> supported.");
>
>
>

Mime
  • Unnamed multipart/alternative (inline, None, 0 bytes)
View raw message