lucene-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "Federica Falini Data Management S.p.A" <ffal...@datamanagement.it>
Subject Problem sort on TermEnum
Date Tue, 07 Apr 2009 11:36:03 GMT
Good morning,
In Lucene 2.2 I modified Term.java and TermBuffer.java 
(*see below*) so that Term enumerations are sorted 
case-insensitively (when a field is not tokenized):
// Walk every term of the field, starting at the first term at or after
// ("myFieldNotTokenized", ""), and print each one.
TermEnum terms = reader.terms(new Term("myFieldNotTokenized", ""));
try {
  do {
    Term current = terms.term();
    // term() yields null once the enumeration is exhausted (for example when
    // no term exists at or after the start term), so guard before using it.
    if (current == null || !"myFieldNotTokenized".equals(current.field())) break;
    System.out.println( "     " + current);
  } while (terms.next());
} finally {
  terms.close();  // always close the enumeration, even if printing fails
}

For example, instead of obtaining this order from TermEnum:

Annales
Cafè
Zucche
cafe

I need to obtain this:

Annales
cafe
Cafè
Zucche

Now in Lucene 2.4 I find this difficult because the "index" package 
has changed a lot; could I have some pointers on how to keep my sort order?
Thanks in advance
Federica

*Term.java:*
package org.apache.lucene.index;

import java.text.CollationKey;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
  A Term represents a word from text.  This is the unit of search.  It is
  composed of two elements, the text of the word, as a string, and the name of
  the field that the text occurred in, an interned string.

  Note that terms may represent more than words from text fields, but also
  things like dates, email addresses, urls, etc.

  <p>This variant orders terms of the same field by the collation key that
  OpacCollator produces for the text (instead of plain String comparison) and
  treats texts that differ only in case as equal.  */

public final class Term implements Comparable, java.io.Serializable {
  String field;
  String text;
  /** Collation key of the text.  Transient: it is rebuilt whenever the text
   * is set and after deserialization. */
  transient CollationKey ckText;

  /** Constructs a Term with the given field and text.
   * <p>Note that a null field or null text value results in undefined
   * behavior for most Lucene APIs that accept a Term parameter. */
  public Term(String fld, String txt) {
    this(fld, txt, true);
  }

  /** Constructs a Term, interning the field name only when asked to.
   * @param fld the field name
   * @param txt the term text
   * @param intern false when fld is already known to be interned */
  Term(String fld, String txt, boolean intern) {
    field = intern ? fld.intern() : fld;      // field names are interned
    text = txt;                      // unless already known to be
    ckText = OpacCollator.getInstancePool().getCollationKey(text);
  }

  /** Returns the field of this term, an interned string.   The field indicates
    the part of a document which this term came from. */
  public final String field() { return field; }

  /** Returns the text of this term.  In the case of words, this is simply the
    text of the word.  In the case of dates and other types, this is an
    encoding of the object as a string.  */
  public final String text() { return text; }

  /**
   * Optimized construction of new Terms by reusing same field as this Term
   * - avoids field.intern() overhead
   * @param text The text of the new term (field is implicitly same as this Term instance)
   * @return A new Term
   */
  public Term createTerm(String text)
  {
      return new Term(field,text,false);
  }

  /** Compares two terms, returning true iff they have the same
      field and their texts are equal ignoring case. */
  public final boolean equals(Object o) {
    if (o == this)
      return true;
    if (!(o instanceof Term))        // also rejects null
      return false;
    Term other = (Term)o;
    // fields are interned, so an identity comparison is enough
    return field == other.field && text.equalsIgnoreCase(other.text);
  }

  /** Combines the hashCode() of the field with a case-insensitive hash of the
      text, so that terms which are equals() always share a hash code. */
  public final int hashCode() {
    return field.hashCode() + caseInsensitiveHash(text);
  }

  /** Hashes s after folding each char the same way String.equalsIgnoreCase
      does (upper-case, then lower-case), using the usual 31-based polynomial. */
  private static int caseInsensitiveHash(String s) {
    int h = 0;
    for (int i = 0; i < s.length(); i++) {
      h = 31 * h + Character.toLowerCase(Character.toUpperCase(s.charAt(i)));
    }
    return h;
  }

  /** Comparable entry point; delegates to {@link #compareTo(Term)}.
      @throws ClassCastException if other is not a Term */
  public int compareTo(Object other) {
    return compareTo((Term)other);
  }

  /** Compares two terms, returning a negative integer if this
    term belongs before the argument, zero if this term is equal to the
    argument, and a positive integer if this term belongs after the argument.

    The ordering of terms is first by field, then by text.  Texts are ordered
    by their collation keys rather than by String.compareTo (the stock
    behavior), and fields are ordered ignoring case.

    NOTE(review): this ordering can return zero for terms that equals()
    reports as different; confirm callers do not rely on the two agreeing.
    NOTE(review): TermBuffer.compareTo orders fields with the case-sensitive
    String.compareTo; confirm the difference is intended. */
  public final int compareTo(Term other) {
      if (field == other.field) { // fields are interned
          // The keys come from different pooled collators; every pooled
          // OpacCollator is built from the same rules and strength.
          return  ckText.compareTo(other.ckText);
      } else {
          // for the field a plain case-insensitive String comparison is enough
          return field.compareToIgnoreCase(other.field);
      }
  }

  /** Resets the field and text of a Term and rebuilds the collation key. */
  final void set(String fld, String txt) {
    field = fld;
    text = txt;
    ckText = OpacCollator.getInstancePool().getCollationKey(text);
  }

  public final String toString() { return field + ":" + text; }

  /** Restores the state that serialization does not carry: re-interns the
      field name and rebuilds the transient collation key. */
  private void readObject(java.io.ObjectInputStream in)
    throws java.io.IOException, ClassNotFoundException
  {
      in.defaultReadObject();
      field = field.intern();
      ckText = OpacCollator.getInstancePool().getCollationKey(text);
  }


}


*TermBuffer.java:*
/** Orders two buffers first by field name, then by their text via
    compareChars. */
public final int compareTo(TermBuffer other) {
    // Field names are interned, so differing references mean differing fields.
    if (field != other.field) {
      return field.compareTo(other.field);
    }
    return compareChars(text, textLength, other.text, other.textLength);
  }

/** Compares the first len1 chars of v1 with the first len2 chars of v2 by
    the collation keys a pooled OpacCollator produces for them. */
private static final int compareChars(char[] v1, int len1,
                                         char[] v2, int len2) {
    final String left = new String(v1, 0, len1);
    final String right = new String(v2, 0, len2);
    // Both keys are produced by the same collator instance.
    final OpacCollator pooled = OpacCollator.getInstancePool();
    final CollationKey leftKey = pooled.getCollationKey(left);
    final CollationKey rightKey = pooled.getCollationKey(right);
    return leftKey.compareTo(rightKey);
  }

*OpacCollator.java*
package org.apache.lucene.index;

import java.text.CollationKey;
import java.text.Collator;
import java.text.ParseException;
import java.text.RuleBasedCollator;
import java.util.BitSet;
import java.util.Locale;

import org.apache.log4j.Logger;

/**
 * Collator che considera lo spazio '\u0020' come primo carattere 
dell'ordinamento.L'underscore
 * sembra essere il primo di tutti
 */
public class OpacCollator extends RuleBasedCollator {
  /**
   * logger
   */
  private static Logger log = 
Logger.getLogger(OpacCollator.class.getName());
  private static String spacesRules = null;
  /** progressivo identificativo oggetto */
  // private static int seqIdPool = 0;
  /** identificativo dell'oggetto */
  private int id = 0;

  private final static int POOL = 
Integer.parseInt(System.getProperty("sebina.opac.collatorpool", "256"));
  private final static OpacCollator[] collatorPool = new OpacCollator[POOL];
  /** la posizione 'true' indica disponibile, la posizione 'false' 
indica in uso */
  private static BitSet bs = new BitSet(POOL);
  // private static int cntPool = 0;
  static {
    RuleBasedCollator it_ITcollator = (RuleBasedCollator) 
Collator.getInstance(new Locale("it", "IT"));
    // CASPH: per scaffafle virtuale Rimuove tutte le occorrenze del 
carattere di tabulazione nella stringa del collator
    spacesRules = it_ITcollator.getRules().replaceAll("='\t'=", "=");
    spacesRules = spacesRules.replaceAll(";'\t' ;", ";");
    // Aggiunto il carattere di tabulazione come prioritario allo spazio
    spacesRules = spacesRules.replaceAll("<'_'", "<'\t'<'\u0020'<'_'");
   
    for (int i = 0; i < POOL; i++) {
      try {
        collatorPool[i] = new OpacCollator(i);
        bs.set(i);
      } catch (ParseException e) {
        log.fatal("Rules:" + it_ITcollator.getRules(), e);
        break;
      }
    }
    log.info("dimensione pool: " + POOL);
  }

  public static synchronized OpacCollator getInstancePool() {
    int pos = -1;
    while (pos < 0) {
      pos = bs.nextSetBit(0);
    }
    bs.clear(pos);
    // log.debug("getting pool#" + pos);
    return collatorPool[pos];
  }

  /**
   * Constructor for OpacCollator.
   *
   * @throws ParseException
   */
  private OpacCollator() throws ParseException {
    super(spacesRules);
    this.setStrength(Collator.PRIMARY);
  }

  /**
   * Constructor for OpacCollator.
   *
   * @param id
   * @throws ParseException
   */
  public OpacCollator(int ident) throws ParseException {
    this();
    this.id = ident;
  }

  /**
   * @see java.text.Collator#getCollationKey(String)
   */
  public CollationKey getCollationKey(String arg0) {
    CollationKey ck = super.getCollationKey(arg0);
    bs.set(this.id);
    return ck;
  }

}


-- 
 
 
*Federica FALINI
*
Divisione Beni Culturali
 
Data Management S.p.A. 
<http://identity.datamanagement.it/web/img/email/dm_logo_email.gif>
 

tel: +39.0544.503.*886*
fax: +39.0544.461697
e-mail: ffalini@datamanagement.it <mailto:ffalini@datamanagement.it>
web: http://www.datamanagement.it <http://www.datamanagement.it/>
 
48100 - Ravenna (RA)
Via S.Cavina, n 7 
Italy
------------------------------------------------------------------------
Questo messaggio di posta elettronica contiene informazioni di carattere 
confidenziale rivolte esclusivamente al destinatario sopra indicato. E' 
vietato l'uso, la diffusione, distribuzione o riproduzione da parte di 
ogni altra persona. Nel caso aveste ricevuto questo messaggio di posta 
elettronica per errore, siete pregati di segnalarlo immediatamente al 
mittente e distruggere quanto ricevuto (compresi i file allegati) senza 
farne copia. /This e-mail transmission may contain legally privileged 
and/or confidential information. Please do not read it if you are not 
the intended recipient(S). Any use, distribution, reproduction or 
disclosure by any other person is strictly prohibited. If you have 
received this e-mail in error, please notify the sender and destroy the 
original transmission and its attachments without reading or saving it 
in any manner./


Mime
View raw message