I think the new contrib/collation package may address this use case?
It converts each term to its CollationKey, outside of Lucene.
Mike
On Tue, Apr 7, 2009 at 7:36 AM, Federica Falini Data Management S.p.A
<ffalini@datamanagement.it> wrote:
> Good morning,
> In Lucene 2.2 I modified Term.java and TermBuffer.java (see
> below) in order to have Term enumerations sorted case-insensitively (when a
> field is not tokenized):
> TermEnum terms = reader.terms(new Term("myFieldNotTokenized", ""));
> while ("myFieldNotTokenized".equals(terms.term().field())) {
>
> System.out.println( " " + terms.term());
> if (!terms.next()) break;
> }
>
> For example, instead of obtaining this sort order from TermEnum:
>
> Annales
> Cafè
> Zucche
> cafe
>
> I need to obtain this:
>
> Annales
> cafe
> Cafè
> Zucche
>
> Now in Lucene 2.4 I find this difficult because the "index" package has changed
> a lot; can I get some pointers on how to keep my sort order?
> Thanks in advance
> Federica
>
> Term.java:
> package org.apache.lucene.index;
>
> import java.text.CollationKey;
>
> /**
> * Licensed to the Apache Software Foundation (ASF) under one or more
> * contributor license agreements. See the NOTICE file distributed with
> * this work for additional information regarding copyright ownership.
> * The ASF licenses this file to You under the Apache License, Version 2.0
> * (the "License"); you may not use this file except in compliance with
> * the License. You may obtain a copy of the License at
> *
> * http://www.apache.org/licenses/LICENSE-2.0
> *
> * Unless required by applicable law or agreed to in writing, software
> * distributed under the License is distributed on an "AS IS" BASIS,
> * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> * See the License for the specific language governing permissions and
> * limitations under the License.
> */
>
> /**
> A Term represents a word from text. This is the unit of search. It is
> composed of two elements, the text of the word, as a string, and the name
> of the field that the text occurred in, an interned string.
>
> Note that terms may represent more than words from text fields, but also
> things like dates, email addresses, urls, etc. */
>
> public final class Term implements Comparable, java.io.Serializable {
> String field;
> String text;
> transient CollationKey ckText;
>
> /** Constructs a Term with the given field and text.
> * <p>Note that a null field or null text value results in undefined
> * behavior for most Lucene APIs that accept a Term parameter. */
> public Term(String fld, String txt) {
>
> this(fld, txt, true);
> }
> Term(String fld, String txt, boolean intern) {
> field = intern ? fld.intern() : fld; // field names are interned
> text = txt; // unless already known to be
> ckText = OpacCollator.getInstancePool().getCollationKey(text);
>
> }
>
> /** Returns the field of this term, an interned string. The field
> indicates
> the part of a document which this term came from. */
> public final String field() { return field; }
>
> /** Returns the text of this term. In the case of words, this is simply
> the
> text of the word. In the case of dates and other types, this is an
> encoding of the object as a string. */
> public final String text() { return text; }
>
> /**
> * Optimized construction of new Terms by reusing same field as this Term
> * - avoids field.intern() overhead
> * @param text The text of the new term (field is implicitly same as this
> Term instance)
> * @return A new Term
> */
> public Term createTerm(String text)
> {
> return new Term(field,text,false);
> }
>
> /** Compares two terms, returning true iff they have the same
> field and text. */
> public final boolean equals(Object o) {
> if (o == this)
> return true;
> if (o == null)
> return false;
> if (!(o instanceof Term))
> return false;
> Term other = (Term)o;
> //return field == other.field && text.equals(other.text);
> return field == other.field && text.equalsIgnoreCase(other.text);
> }
>
> /** Combines the hashCode() of the field and the text. */
> public final int hashCode() {
> return field.hashCode() + text.hashCode();
> }
>
> public int compareTo(Object other) {
> return compareTo((Term)other);
> }
>
> /** Compares two terms, returning a negative integer if this
> term belongs before the argument, zero if this term is equal to the
> argument, and a positive integer if this term belongs after the
> argument.
>
> The ordering of terms is first by field, then by text.*/
> // public final int compareTo(Term other) {
> // if (field == other.field) // fields are interned
> // return text.compareTo(other.text);
> // else
> // return field.compareTo(other.field);
> // }
> public final int compareTo(Term other) {
>
> if (field == other.field) { // fields are interned
> return ckText.compareTo(other.ckText);
> } else {
> // per il field basta il compareToIgnoreCase delle stringhe
> return field.compareToIgnoreCase(other.field);
> }
> }
>
> /** Resets the field and text of a Term. */
> final void set(String fld, String txt) {
> field = fld;
> text = txt;
> ckText = OpacCollator.getInstancePool().getCollationKey(text);
> }
>
> public final String toString() { return field + ":" + text; }
>
> private void readObject(java.io.ObjectInputStream in)
> throws java.io.IOException, ClassNotFoundException
> {
> in.defaultReadObject();
> field = field.intern();
> ckText = OpacCollator.getInstancePool().getCollationKey(text);
> }
>
>
> }
>
>
> TermBuffer.java:
> public final int compareTo(TermBuffer other) {
> if (field == other.field) { // fields are interned
> return compareChars(text, textLength, other.text, other.textLength);
> }
> else {
> return field.compareTo(other.field);
>
> }
> }
>
> private static final int compareChars(char[] v1, int len1,
> char[] v2, int len2) {
> String v1s = new String(v1,0,len1);
> String v2s = new String(v2,0,len2);
> OpacCollator oc = OpacCollator.getInstancePool();
> CollationKey v1k = oc.getCollationKey(v1s);
> CollationKey v2k = oc.getCollationKey(v2s);
> return v1k.compareTo(v2k);
> }
>
> OpacCollator.java
> package org.apache.lucene.index;
>
> import java.text.CollationKey;
> import java.text.Collator;
> import java.text.ParseException;
> import java.text.RuleBasedCollator;
> import java.util.BitSet;
> import java.util.Locale;
>
> import org.apache.log4j.Logger;
>
> /**
> * Collator che considera lo spazio '\u0020' come primo carattere
> dell'ordinamento.L'underscore
> * sembra essere il primo di tutti
> */
> public class OpacCollator extends RuleBasedCollator {
> /**
> * logger
> */
> private static Logger log =
> Logger.getLogger(OpacCollator.class.getName());
> private static String spacesRules = null;
> /** progressivo identificativo oggetto */
> // private static int seqIdPool = 0;
> /** identificativo dell'oggetto */
> private int id = 0;
>
> private final static int POOL =
> Integer.parseInt(System.getProperty("sebina.opac.collatorpool", "256"));
> private final static OpacCollator[] collatorPool = new OpacCollator[POOL];
> /** la posizione 'true' indica disponibile, la posizione 'false' indica in
> uso */
> private static BitSet bs = new BitSet(POOL);
> // private static int cntPool = 0;
> static {
> RuleBasedCollator it_ITcollator = (RuleBasedCollator)
> Collator.getInstance(new Locale("it", "IT"));
> // CASPH: per scaffafle virtuale Rimuove tutte le occorrenze del
> carattere di tabulazione nella stringa del collator
> spacesRules = it_ITcollator.getRules().replaceAll("='\t'=", "=");
> spacesRules = spacesRules.replaceAll(";'\t' ;", ";");
> // Aggiunto il carattere di tabulazione come prioritario allo spazio
> spacesRules = spacesRules.replaceAll("<'_'", "<'\t'<'\u0020'<'_'");
>
> for (int i = 0; i < POOL; i++) {
> try {
> collatorPool[i] = new OpacCollator(i);
> bs.set(i);
> } catch (ParseException e) {
> log.fatal("Rules:" + it_ITcollator.getRules(), e);
> break;
> }
> }
> log.info("dimensione pool: " + POOL);
> }
>
> public static synchronized OpacCollator getInstancePool() {
> int pos = -1;
> while (pos < 0) {
> pos = bs.nextSetBit(0);
> }
> bs.clear(pos);
> // log.debug("getting pool#" + pos);
> return collatorPool[pos];
> }
>
> /**
> * Constructor for OpacCollator.
> *
> * @throws ParseException
> */
> private OpacCollator() throws ParseException {
> super(spacesRules);
> this.setStrength(Collator.PRIMARY);
> }
>
> /**
> * Constructor for OpacCollator.
> *
> * @param id
> * @throws ParseException
> */
> public OpacCollator(int ident) throws ParseException {
> this();
> this.id = ident;
> }
>
> /**
> * @see java.text.Collator#getCollationKey(String)
> */
> public CollationKey getCollationKey(String arg0) {
> CollationKey ck = super.getCollationKey(arg0);
> bs.set(this.id);
> return ck;
> }
>
> }
>
>
> --
>
>
> Federica FALINI
> Divisione Beni Culturali
>
>
>
> tel: +39.0544.503.886
> fax: +39.0544.461697
> e-mail: ffalini@datamanagement.it
> web: http://www.datamanagement.it
>
> 48100 - Ravenna (RA)
> Via S.Cavina, n 7
> Italy
> ________________________________
> Questo messaggio di posta elettronica contiene informazioni di carattere
> confidenziale rivolte esclusivamente al destinatario sopra indicato. E'
> vietato l'uso, la diffusione, distribuzione o riproduzione da parte di ogni
> altra persona. Nel caso aveste ricevuto questo messaggio di posta
> elettronica per errore, siete pregati di segnalarlo immediatamente al
> mittente e distruggere quanto ricevuto (compresi i file allegati) senza
> farne copia. This e-mail transmission may contain legally privileged and/or
> confidential information. Please do not read it if you are not the intended
> recipient(S). Any use, distribution, reproduction or disclosure by any other
> person is strictly prohibited. If you have received this e-mail in error,
> please notify the sender and destroy the original transmission and its
> attachments without reading or saving it in any manner.
---------------------------------------------------------------------
To unsubscribe, e-mail: java-dev-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-dev-help@lucene.apache.org
|