mahout-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Ted Dunning <ted.dunn...@gmail.com>
Subject Re: svn commit: r1095864 - in /mahout/trunk/utils/src: main/java/org/apache/mahout/utils/vectors/lucene/ test/java/org/apache/mahout/utils/vectors/lucene/
Date Fri, 22 Apr 2011 06:13:50 GMT
Fixed.

Thanks for the catch.

On Thu, Apr 21, 2011 at 11:10 PM, Ted Dunning <ted.dunning@gmail.com> wrote:

> Yes.  Forgotten add.
>
>
> On Thu, Apr 21, 2011 at 10:21 PM, Dmitriy Lyubimov <dlieu.7@gmail.com> wrote:
>
>> Hm. I am getting this after this commit. Forgotten class?
>>
>> [INFO] Compilation failure
>>
>>
>> \projects\mahout\utils\src\main\java\org\apache\mahout\utils\vectors\lucene\LuceneIterator.java:[33,30] cannot find symbol
>> symbol  : class Bump125
>> location: package org.apache.mahout.utils
>>
>> \projects\mahout\utils\src\main\java\org\apache\mahout\utils\vectors\lucene\LuceneIterator.java:[55,10] cannot find symbol
>> symbol  : class Bump125
>> location: class org.apache.mahout.utils.vectors.lucene.LuceneIterator
>>
>> \projects\mahout\utils\src\main\java\org\apache\mahout\utils\vectors\lucene\LuceneIterator.java:[55,29] cannot find symbol
>> symbol  : class Bump125
>> location: class org.apache.mahout.utils.vectors.lucene.LuceneIterator
>>
>> [INFO]
>> ------------------------------------------------------------------------
>> [INFO] For more information, run Maven with the -e switch
>> [INFO]
>> ------------------------------------------------------------------------
>>
>> On Thu, Apr 21, 2011 at 9:58 PM,  <tdunning@apache.org> wrote:
>> > Author: tdunning
>> > Date: Fri Apr 22 04:58:14 2011
>> > New Revision: 1095864
>> >
>> > URL: http://svn.apache.org/viewvc?rev=1095864&view=rev
>> > Log:
>> > MAHOUT-675 - Add better handling of empty term vectors in lucene
>> conversion to vectors.
>> >
>> > Modified:
>> >
>>  mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
>> >
>>  mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
>> >
>>  mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
>> >
>> > Modified:
>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
>> > URL:
>> http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java?rev=1095864&r1=1095863&r2=1095864&view=diff
>> >
>> ==============================================================================
>> > ---
>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
>> (original)
>> > +++
>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
>> Fri Apr 22 04:58:14 2011
>> > @@ -35,11 +35,16 @@ public final class LuceneIterable implem
>> >   private final String idField;
>> >   private final VectorMapper mapper;
>> >   private final double normPower;
>> > +  private final double maxPercentErrorDocs;
>> >
>> >   public LuceneIterable(IndexReader reader, String idField, String
>> field, VectorMapper mapper) {
>> >     this(reader, idField, field, mapper, NO_NORMALIZING);
>> >   }
>> >
>> > +  public LuceneIterable(IndexReader indexReader, String idField, String
>> field, VectorMapper mapper, double normPower) {
>> > +    this(indexReader, idField, field, mapper, normPower, 0);
>> > +  }
>> > +
>> >   /**
>> >    * Produce a LuceneIterable that can create the Vector plus normalize
>> it.
>> >    *
>> > @@ -49,18 +54,19 @@ public final class LuceneIterable implem
>> >    * @param mapper {@link VectorMapper} for creating {@link Vector}s
>> from Lucene's TermVectors.
>> >    * @param normPower the normalization value. Must be nonnegative, or
>> {@link #NO_NORMALIZING}
>> >    */
>> > -  public LuceneIterable(IndexReader indexReader, String idField, String
>> field, VectorMapper mapper, double normPower) {
>> > +  public LuceneIterable(IndexReader indexReader, String idField, String
>> field, VectorMapper mapper, double normPower, double maxPercentErrorDocs) {
>> >     this.indexReader = indexReader;
>> >     this.idField = idField;
>> >     this.field = field;
>> >     this.mapper = mapper;
>> >     this.normPower = normPower;
>> > +    this.maxPercentErrorDocs = maxPercentErrorDocs;
>> >   }
>> >
>> >   @Override
>> >   public Iterator<Vector> iterator() {
>> >     try {
>> > -      return new LuceneIterator(indexReader, idField, field, mapper,
>> normPower);
>> > +      return new LuceneIterator(indexReader, idField, field, mapper,
>> normPower, maxPercentErrorDocs);
>> >     } catch (IOException e) {
>> >       throw new IllegalStateException(e);
>> >     }
>> >
>> > Modified:
>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
>> > URL:
>> http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java?rev=1095864&r1=1095863&r2=1095864&view=diff
>> >
>> ==============================================================================
>> > ---
>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
>> (original)
>> > +++
>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
>> Fri Apr 22 04:58:14 2011
>> > @@ -30,6 +30,7 @@ import org.apache.lucene.index.TermDocs;
>> >  import org.apache.lucene.index.TermFreqVector;
>> >  import org.apache.mahout.math.NamedVector;
>> >  import org.apache.mahout.math.Vector;
>> > +import org.apache.mahout.utils.Bump125;
>> >  import org.slf4j.Logger;
>> >  import org.slf4j.LoggerFactory;
>> >
>> > @@ -48,8 +49,12 @@ public final class LuceneIterator extend
>> >   private final VectorMapper mapper;
>> >   private final double normPower;
>> >   private final TermDocs termDocs;
>> > -  private int numErrorDocs;
>> > -  private int maxErrorDocs;
>> > +
>> > +  private int numErrorDocs = 0;
>> > +  private int maxErrorDocs = 0;
>> > +  private Bump125 bump = new Bump125();
>> > +  private long nextLogRecord = bump.increment();
>> > +  private int skippedErrorMessages = 0;
>> >
>> >   /**
>> >    * Produce a LuceneIterable that can create the Vector plus normalize
>> it.
>> > @@ -65,7 +70,7 @@ public final class LuceneIterator extend
>> >                         String field,
>> >                         VectorMapper mapper,
>> >                         double normPower) throws IOException {
>> > -    this(indexReader, idField, field, mapper, normPower, 1.0);
>> > +    this(indexReader, idField, field, mapper, normPower, 0.0);
>> >   }
>> >
>> >   /**
>> > @@ -91,7 +96,6 @@ public final class LuceneIterator extend
>> >     // term docs(null) is a better way of iterating all the docs in
>> Lucene
>> >     this.termDocs = indexReader.termDocs(null);
>> >     this.maxErrorDocs = (int) (maxPercentErrorDocs *
>> indexReader.numDocs());
>> > -    this.numErrorDocs = 0;
>> >   }
>> >
>> >   @Override
>> > @@ -104,11 +108,22 @@ public final class LuceneIterator extend
>> >       int doc = termDocs.doc();
>> >       TermFreqVector termFreqVector = indexReader.getTermFreqVector(doc,
>> field);
>> >       if (termFreqVector == null) {
>> > -        if (++numErrorDocs >= maxErrorDocs) {
>> > +        numErrorDocs++;
>> > +        if (numErrorDocs >= maxErrorDocs) {
>> >           log.error("There are too many documents that do not have a
>> term vector for {}", field);
>> >           throw new IllegalStateException("There are too many documents
>> that do not have a term vector for " + field);
>> >         }
>> > -        log.warn("{} does not have a term vector for {}",
>> indexReader.document(doc).get(idField), field);
>> > +        if (numErrorDocs >= nextLogRecord) {
>> > +          if (skippedErrorMessages == 0) {
>> > +            log.warn("{} does not have a term vector for {}",
>> indexReader.document(doc).get(idField), field);
>> > +          } else {
>> > +            log.warn("{} documents do not have a term vector for {}",
>> numErrorDocs, field);
>> > +          }
>> > +          nextLogRecord = bump.increment();
>> > +          skippedErrorMessages = 0;
>> > +        } else {
>> > +          skippedErrorMessages++;
>> > +        }
>> >         computeNext();
>> >       }
>> >
>> >
>> > Modified:
>> mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
>> > URL:
>> http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java?rev=1095864&r1=1095863&r2=1095864&view=diff
>> >
>> ==============================================================================
>> > ---
>> mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
>> (original)
>> > +++
>> mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
>> Fri Apr 22 04:58:14 2011
>> > @@ -97,16 +97,81 @@ public final class LuceneIterableTest ex
>> >     iterator.next();
>> >   }
>> >
>> > +  @Test
>> > +  public void testIterable_someNoiseTermVectors() throws IOException {
>> > +    //get noise vectors
>> > +    RAMDirectory directory = createTestIndex(Field.TermVector.YES, new
>> RAMDirectory(), true, 0);
>> > +    //get real vectors
>> > +    createTestIndex(Field.TermVector.NO, directory, false, 5);
>> > +
>> > +    IndexReader reader = IndexReader.open(directory, true);
>> > +    Weight weight = new TFIDF();
>> > +    TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100);
>> > +    VectorMapper mapper = new TFDFMapper(reader, weight, termInfo);
>> > +
>> > +    boolean exceptionThrown;
>> > +    //0 percent tolerance
>> > +    LuceneIterable iterable = new LuceneIterable(reader, "id",
>> "content", mapper);
>> > +    try {
>> > +        Iterator<Vector> iterator = iterable.iterator();
>> > +        while (iterator.hasNext()) {
>> > +            iterator.next();
>> > +        }
>> > +        exceptionThrown = false;
>> > +    }
>> > +    catch(IllegalStateException ise) {
>> > +        exceptionThrown = true;
>> > +    }
>> > +    assertTrue(exceptionThrown);
>> > +
>> > +    //100 percent tolerance
>> > +    iterable = new LuceneIterable(reader, "id", "content", mapper, -1,
>> 1.0);
>> > +    try {
>> > +        Iterator<Vector> iterator = iterable.iterator();
>> > +        while (iterator.hasNext()) {
>> > +            iterator.next();
>> > +        }
>> > +        exceptionThrown = false;
>> > +    }
>> > +    catch(IllegalStateException ise) {
>> > +        exceptionThrown = true;
>> > +    }
>> > +    assertFalse(exceptionThrown);
>> > +
>> > +    //50 percent tolerance
>> > +    iterable = new LuceneIterable(reader, "id", "content", mapper, -1,
>> 0.5);
>> > +    Iterator<Vector> iterator = iterable.iterator();
>> > +    iterator.next();
>> > +    iterator.next();
>> > +    iterator.next();
>> > +    iterator.next();
>> > +    iterator.next();
>> > +
>> > +    try {
>> > +        while (iterator.hasNext()) {
>> > +            iterator.next();
>> > +        }
>> > +        exceptionThrown = false;
>> > +    }
>> > +    catch(IllegalStateException ise) {
>> > +        exceptionThrown = true;
>> > +    }
>> > +    assertTrue(exceptionThrown);
>> > +  }
>> > +
>> >   private static RAMDirectory createTestIndex(Field.TermVector
>> termVector) throws IOException {
>> > -    RAMDirectory directory = new RAMDirectory();
>> > +      return createTestIndex(termVector, new RAMDirectory(), true, 0);
>> > +  }
>> > +
>> > +  private static RAMDirectory createTestIndex(Field.TermVector
>> termVector, RAMDirectory directory, boolean createNew, int startingId)
>> throws IOException {
>> >     IndexWriter writer = new IndexWriter(
>> >         directory,
>> >         new StandardAnalyzer(Version.LUCENE_30),
>> > -        true,
>> > +        createNew,
>> >         IndexWriter.MaxFieldLength.UNLIMITED);
>> >     for (int i = 0; i < LuceneIterableTest.DOCS.length; i++) {
>> >       Document doc = new Document();
>> > -      Fieldable id = new Field("id", "doc_" + i, Field.Store.YES,
>> Field.Index.NOT_ANALYZED_NO_NORMS);
>> > +      Fieldable id = new Field("id", "doc_" + (i + startingId),
>> Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
>> >       doc.add(id);
>> >       //Store both position and offset information
>> >       Fieldable text = new Field("content", DOCS[i], Field.Store.NO,
>> Field.Index.ANALYZED, termVector);
>> >
>> >
>> >
>>
>
>

Mime
  • Unnamed multipart/alternative (inline, None, 0 bytes)
View raw message