lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From gsing...@apache.org
Subject svn commit: r607161 - in /lucene/java/trunk: ./ src/java/org/apache/lucene/analysis/standard/ src/test/org/apache/lucene/analysis/
Date Fri, 28 Dec 2007 02:46:12 GMT
Author: gsingers
Date: Thu Dec 27 18:46:11 2007
New Revision: 607161

URL: http://svn.apache.org/viewvc?rev=607161&view=rev
Log:
LUCENE-1068: updated StandardTokenizer and StandardAnalyzer to support the replaceInvalidAcronym flag

Modified:
    lucene/java/trunk/CHANGES.txt
    lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
    lucene/java/trunk/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java

Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=607161&r1=607160&r2=607161&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Thu Dec 27 18:46:11 2007
@@ -196,6 +196,12 @@
     Hits.length(), an application attempts to retrieve more hits 
     than actually exist, a ConcurrentModificationException 
     is thrown.  (Doron Cohen)
+
+27. LUCENE-1068: Changed StandardTokenizer to fix an issue with it marking
+  the type of some tokens incorrectly.  This is done by adding a new flag named
+  replaceInvalidAcronym which defaults to false, the current, incorrect behavior.  Setting
+  this flag to true fixes the problem.  This flag is a temporary fix and is already
+  marked as being deprecated.  3.x will implement the correct approach.  (Shai Erera via Grant Ingersoll)
     
     
 New features

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java?rev=607161&r1=607160&r2=607161&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java Thu
Dec 27 18:46:11 2007
@@ -33,6 +33,16 @@
 public class StandardAnalyzer extends Analyzer {
   private Set stopSet;
 
+  /**
+   * Specifies whether deprecated acronyms should be replaced with HOST type.
+   * This is false by default to support backward compatibility.
+   * 
+   * @deprecated this should be removed in the next release (3.0).
+   *
+   * See https://issues.apache.org/jira/browse/LUCENE-1068
+   */
+  private boolean replaceInvalidAcronym = false;
+  
   /** An array containing some common English words that are usually not
   useful for searching. */
   public static final String[] STOP_WORDS = StopAnalyzer.ENGLISH_STOP_WORDS;
@@ -66,10 +76,75 @@
     stopSet = WordlistLoader.getWordSet(stopwords);
   }
 
+  /**
+   *
+   * @param replaceInvalidAcronym Set to true if this analyzer should replace mischaracterized
acronyms in the StandardTokenizer
+   *
+   * See https://issues.apache.org/jira/browse/LUCENE-1068
+   *
+   * @deprecated Remove in 3.X and make true the only valid value
+   */
+  public StandardAnalyzer(boolean replaceInvalidAcronym) {
+    this.replaceInvalidAcronym = replaceInvalidAcronym;
+  }
+
+  /**
+   *  @param stopwords The stopwords to use
+   * @param replaceInvalidAcronym Set to true if this analyzer should replace mischaracterized
acronyms in the StandardTokenizer
+   *
+   * See https://issues.apache.org/jira/browse/LUCENE-1068
+   *
+   * @deprecated Remove in 3.X and make true the only valid value
+   */
+  public StandardAnalyzer(Reader stopwords, boolean replaceInvalidAcronym) throws IOException{
+    this(stopwords);
+    this.replaceInvalidAcronym = replaceInvalidAcronym;
+  }
+
+  /**
+   * @param stopwords The stopwords to use
+   * @param replaceInvalidAcronym Set to true if this analyzer should replace mischaracterized
acronyms in the StandardTokenizer
+   *
+   * See https://issues.apache.org/jira/browse/LUCENE-1068
+   *
+   * @deprecated Remove in 3.X and make true the only valid value
+   */
+  public StandardAnalyzer(File stopwords, boolean replaceInvalidAcronym) throws IOException{
+    this(stopwords);
+    this.replaceInvalidAcronym = replaceInvalidAcronym;
+  }
+
+  /**
+   *
+   * @param stopwords The stopwords to use
+   * @param replaceInvalidAcronym Set to true if this analyzer should replace mischaracterized
acronyms in the StandardTokenizer
+   *
+   * See https://issues.apache.org/jira/browse/LUCENE-1068
+   *
+   * @deprecated Remove in 3.X and make true the only valid value
+   */
+  public StandardAnalyzer(String [] stopwords, boolean replaceInvalidAcronym) throws IOException{
+    this(stopwords);
+    this.replaceInvalidAcronym = replaceInvalidAcronym;
+  }
+
+  /**
+   * @param stopwords The stopwords to use
+   * @param replaceInvalidAcronym Set to true if this analyzer should replace mischaracterized
acronyms in the StandardTokenizer
+   *
+   * See https://issues.apache.org/jira/browse/LUCENE-1068
+   *
+   * @deprecated Remove in 3.X and make true the only valid value
+   */
+  public StandardAnalyzer(Set stopwords, boolean replaceInvalidAcronym) throws IOException{
+    this(stopwords);
+    this.replaceInvalidAcronym = replaceInvalidAcronym;
+  }
+
   /** Constructs a {@link StandardTokenizer} filtered by a {@link
   StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. */
   public TokenStream tokenStream(String fieldName, Reader reader) {
-    TokenStream result = new StandardTokenizer(reader);
+    TokenStream result = new StandardTokenizer(reader, replaceInvalidAcronym);
     result = new StandardFilter(result);
     result = new LowerCaseFilter(result);
     result = new StopFilter(result, stopSet);
@@ -90,9 +165,32 @@
       streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
       streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream);
       streams.filteredTokenStream = new StopFilter(streams.filteredTokenStream, stopSet);
-    } else
+    } else {
       streams.tokenStream.reset(reader);
+    }
     
+    streams.tokenStream.setReplaceInvalidAcronym(replaceInvalidAcronym);
+
     return streams.filteredTokenStream;
+  }
+
+  /**
+   *
+   * @return true if this Analyzer is replacing mischaracterized acronyms in the StandardTokenizer
+   *
+   * See https://issues.apache.org/jira/browse/LUCENE-1068
+   */
+  public boolean isReplaceInvalidAcronym() {
+    return replaceInvalidAcronym;
+  }
+
+  /**
+   *
+   * @param replaceInvalidAcronym Set to true if this Analyzer should replace mischaracterized acronyms in the StandardTokenizer
+   *
+   * See https://issues.apache.org/jira/browse/LUCENE-1068
+   */
+  public void setReplaceInvalidAcronym(boolean replaceInvalidAcronym) {
+    this.replaceInvalidAcronym = replaceInvalidAcronym;
   }
 }

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java?rev=607161&r1=607160&r2=607161&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
(original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
Thu Dec 27 18:46:11 2007
@@ -43,6 +43,17 @@
 public class StandardTokenizer extends Tokenizer {
     /** A private instance of the JFlex-constructed scanner */
     private final StandardTokenizerImpl scanner;
+    
+  /**
+   * Specifies whether deprecated acronyms should be replaced with HOST type.
+   * This is false by default to support backward compatibility.
+   *<p/>
+   * See http://issues.apache.org/jira/browse/LUCENE-1068
+   * 
+   * @deprecated this should be removed in the next release (3.0).
+   */
+  private boolean replaceInvalidAcronym = false;
+    
   void setInput(Reader reader) {
     this.input = reader;
   }
@@ -52,11 +63,24 @@
      * <code>input</code> to a newly created JFlex scanner.
      */
     public StandardTokenizer(Reader input) {
-	this.input = input;
-	this.scanner = new StandardTokenizerImpl(input);
+	    this.input = input;
+	    this.scanner = new StandardTokenizerImpl(input);
     }
 
-    /*
+  /**
+   * Creates a new instance of the {@link org.apache.lucene.analysis.standard.StandardTokenizer}.  Attaches
+   * the <code>input</code> to the newly created JFlex scanner.
+   *
+   * @param input The input reader
+   * @param replaceInvalidAcronym Set to true to replace mischaracterized acronyms with HOST.
+   *
+   * See http://issues.apache.org/jira/browse/LUCENE-1068
+   */
+  public StandardTokenizer(Reader input, boolean replaceInvalidAcronym) {
+    this.replaceInvalidAcronym = replaceInvalidAcronym;
+    this.input = input;
+    this.scanner = new StandardTokenizerImpl(input);
+  }/*
      * (non-Javadoc)
      *
      * @see org.apache.lucene.analysis.TokenStream#next()
@@ -72,7 +96,19 @@
         final int start = scanner.yychar();
         result.setStartOffset(start);
         result.setEndOffset(start+result.termLength());
-        result.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
+        // This 'if' should be removed in the next release. For now, it converts
+        // invalid acronyms to HOST. When removed, only the 'else' part should
+        // remain.
+        if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) {
+          if (replaceInvalidAcronym) {
+            result.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
+            result.setTermLength(result.termLength() - 1); // remove extra '.'
+          } else {
+            result.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
+          }
+        } else {
+          result.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
+        }
         return result;
     }
 
@@ -90,4 +126,26 @@
         input = reader;
         reset();
     }
+
+  /**
+   * Prior to https://issues.apache.org/jira/browse/LUCENE-1068, StandardTokenizer mischaracterized tokens like www.abc.com as acronyms
+   * when they should have been labeled as hosts instead.
+   * @return true if StandardTokenizer now returns these tokens as Hosts, otherwise false
+   *
+   * @deprecated Remove in 3.X and make true the only valid value
+   */
+  public boolean isReplaceInvalidAcronym() {
+    return replaceInvalidAcronym;
+  }
+
+  /**
+   *
+   * @param replaceInvalidAcronym Set to true to replace mischaracterized acronyms as HOST.
+   * @deprecated Remove in 3.X and make true the only valid value
+   *
+   * See https://issues.apache.org/jira/browse/LUCENE-1068
+   */
+  public void setReplaceInvalidAcronym(boolean replaceInvalidAcronym) {
+    this.replaceInvalidAcronym = replaceInvalidAcronym;
+  }
 }

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java?rev=607161&r1=607160&r2=607161&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
(original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
Thu Dec 27 18:46:11 2007
@@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.4.1 on 8/9/07 10:15 AM */
+/* The following code was generated by JFlex 1.4.1 on 12/18/07 9:22 PM */
 
 package org.apache.lucene.analysis.standard;
 
@@ -25,8 +25,8 @@
 /**
  * This class is a scanner generated by 
  * <a href="http://www.jflex.de/">JFlex</a> 1.4.1
- * on 8/9/07 10:15 AM from the specification file
- * <tt>/tango/mike/src/lucene.tokenfix/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex</tt>
+ * on 12/18/07 9:22 PM from the specification file
+ * <tt>/Volumes/User/grantingersoll/projects/lucene/java/lucene-clean/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex</tt>
  */
 class StandardTokenizerImpl {
 
@@ -63,12 +63,13 @@
   private static final int [] ZZ_ACTION = zzUnpackAction();
 
   private static final String ZZ_ACTION_PACKED_0 =
-    "\1\0\1\1\4\2\1\3\1\1\14\0\1\4\4\5"+
-    "\2\6\2\0\1\7\1\0\1\7\3\5\6\7\3\5"+
-    "\1\10\4\0\1\10\2\0\2\10\2\5\1\11";
+    "\1\0\1\1\4\2\1\3\1\1\6\0\2\2\6\0"+
+    "\1\4\4\5\2\6\2\0\1\7\1\0\1\7\3\5"+
+    "\6\7\3\5\1\10\1\0\1\11\2\0\1\10\1\11"+
+    "\1\0\2\11\2\10\2\5\1\12";
 
   private static int [] zzUnpackAction() {
-    int [] result = new int[57];
+    int [] result = new int[61];
     int offset = 0;
     offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
     return result;
@@ -96,14 +97,14 @@
     "\0\0\0\17\0\36\0\55\0\74\0\113\0\17\0\132"+
     "\0\151\0\170\0\207\0\226\0\245\0\264\0\303\0\322"+
     "\0\341\0\360\0\377\0\u010e\0\u011d\0\u012c\0\u013b\0\u014a"+
-    "\0\u0159\0\207\0\u0168\0\u0177\0\u0186\0\u0195\0\u01a4\0\u01b3"+
+    "\0\u0159\0\u0168\0\u0177\0\207\0\u0186\0\u0195\0\u01a4\0\u01b3"+
     "\0\u01c2\0\u01d1\0\u01e0\0\u01ef\0\u01fe\0\u020d\0\u021c\0\u022b"+
     "\0\u023a\0\u0249\0\u0258\0\u0267\0\u0276\0\u0285\0\u0294\0\u02a3"+
-    "\0\u02b2\0\u02c1\0\u02d0\0\u02df\0\170\0\377\0\u02ee\0\u02fd"+
-    "\0\u030c";
+    "\0\u02b2\0\u02c1\0\u02d0\0\u02df\0\u02ee\0\u02fd\0\u012c\0\341"+
+    "\0\170\0\u011d\0\u030c\0\u031b\0\u032a";
 
   private static int [] zzUnpackRowMap() {
-    int [] result = new int[57];
+    int [] result = new int[61];
     int offset = 0;
     offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
     return result;
@@ -127,49 +128,50 @@
 
   private static final String ZZ_TRANS_PACKED_0 =
     "\10\2\1\3\1\4\1\5\1\6\1\7\1\10\1\2"+
-    "\20\0\1\11\1\12\1\13\1\14\2\15\1\16\1\3"+
-    "\1\4\1\5\1\6\5\0\1\17\1\0\1\20\2\21"+
-    "\1\22\3\4\1\6\4\0\1\11\1\23\1\13\1\14"+
-    "\2\21\1\22\1\5\1\4\1\5\1\6\5\0\1\24"+
-    "\1\0\1\20\2\15\1\16\4\6\21\0\1\2\10\0"+
-    "\1\25\1\0\1\25\14\0\1\26\1\27\1\30\1\31"+
-    "\13\0\1\32\1\0\1\32\14\0\1\33\1\34\1\33"+
-    "\1\34\13\0\1\35\2\36\1\37\13\0\1\16\2\40"+
-    "\14\0\1\41\2\42\1\43\13\0\4\34\13\0\1\44"+
-    "\2\45\1\46\13\0\1\47\2\50\1\51\13\0\1\52"+
-    "\1\42\1\53\1\43\13\0\1\54\2\27\1\31\4\0"+
-    "\1\11\6\0\1\25\1\0\1\25\6\0\1\55\1\0"+
-    "\1\20\2\56\1\0\1\26\1\27\1\30\1\31\5\0"+
-    "\1\57\1\0\1\20\2\60\1\61\3\27\1\31\5\0"+
-    "\1\62\1\0\1\20\2\60\1\61\1\30\1\27\1\30"+
-    "\1\31\5\0\1\63\1\0\1\20\2\56\1\0\4\31"+
-    "\5\0\1\64\2\0\1\64\2\0\1\33\1\34\1\33"+
-    "\1\34\5\0\1\64\2\0\1\64\2\0\4\34\5\0"+
-    "\1\56\1\0\1\20\2\56\1\0\1\35\2\36\1\37"+
-    "\5\0\1\60\1\0\1\20\2\60\1\61\3\36\1\37"+
-    "\5\0\1\56\1\0\1\20\2\56\1\0\4\37\5\0"+
-    "\1\61\2\0\3\61\3\40\6\0\1\24\1\0\1\20"+
-    "\2\15\1\16\1\41\2\42\1\43\5\0\1\17\1\0"+
-    "\1\20\2\21\1\22\3\42\1\43\5\0\1\24\1\0"+
-    "\1\20\2\15\1\16\4\43\5\0\1\15\1\0\1\20"+
-    "\2\15\1\16\1\44\2\45\1\46\5\0\1\21\1\0"+
-    "\1\20\2\21\1\22\3\45\1\46\5\0\1\15\1\0"+
-    "\1\20\2\15\1\16\4\46\5\0\1\16\2\0\3\16"+
-    "\1\47\2\50\1\51\5\0\1\22\2\0\3\22\3\50"+
-    "\1\51\5\0\1\16\2\0\3\16\4\51\5\0\1\65"+
-    "\1\0\1\20\2\15\1\16\1\52\1\42\1\53\1\43"+
-    "\5\0\1\66\1\0\1\20\2\21\1\22\1\53\1\42"+
-    "\1\53\1\43\5\0\1\63\1\0\1\20\2\56\1\0"+
-    "\1\54\2\27\1\31\13\0\1\67\1\31\1\67\1\31"+
-    "\13\0\4\37\13\0\4\43\13\0\4\46\13\0\4\51"+
-    "\13\0\1\70\1\43\1\70\1\43\13\0\4\31\13\0"+
-    "\4\71\5\0\1\55\1\0\1\20\2\56\1\0\1\67"+
-    "\1\31\1\67\1\31\5\0\1\65\1\0\1\20\2\15"+
-    "\1\16\1\70\1\43\1\70\1\43\5\0\1\64\2\0"+
-    "\1\64\2\0\4\71\3\0";
+    "\20\0\1\11\1\12\1\13\1\14\2\15\1\16\1\17"+
+    "\1\4\1\20\1\6\5\0\1\21\1\0\1\22\2\23"+
+    "\1\24\3\4\1\6\4\0\1\11\1\25\1\13\1\14"+
+    "\2\23\1\24\1\20\1\4\1\20\1\6\5\0\1\26"+
+    "\1\0\1\22\2\15\1\16\4\6\21\0\1\2\10\0"+
+    "\1\27\1\0\1\27\14\0\1\30\1\31\1\32\1\33"+
+    "\13\0\1\34\1\0\1\34\14\0\1\35\1\36\1\35"+
+    "\1\36\13\0\1\37\2\40\1\41\13\0\1\16\2\42"+
+    "\5\0\1\11\1\26\1\13\1\14\2\15\1\16\1\17"+
+    "\1\4\1\20\1\6\4\0\1\11\1\21\1\13\1\14"+
+    "\2\23\1\24\1\20\1\4\1\20\1\6\13\0\1\43"+
+    "\2\44\1\45\13\0\4\36\13\0\1\46\2\47\1\50"+
+    "\13\0\1\51\2\52\1\53\13\0\1\54\1\44\1\55"+
+    "\1\45\13\0\1\56\2\31\1\33\4\0\1\11\6\0"+
+    "\1\27\1\0\1\27\6\0\1\57\1\0\1\22\2\60"+
+    "\1\0\1\56\2\31\1\33\5\0\1\61\1\0\1\22"+
+    "\2\62\1\63\3\31\1\33\5\0\1\64\1\0\1\22"+
+    "\2\62\1\63\3\31\1\33\5\0\1\65\1\0\1\22"+
+    "\2\60\1\0\4\33\5\0\1\66\2\0\1\66\2\0"+
+    "\1\35\1\36\1\35\1\36\5\0\1\66\2\0\1\66"+
+    "\2\0\4\36\5\0\1\60\1\0\1\22\2\60\1\0"+
+    "\1\37\2\40\1\41\5\0\1\62\1\0\1\22\2\62"+
+    "\1\63\3\40\1\41\5\0\1\60\1\0\1\22\2\60"+
+    "\1\0\4\41\5\0\1\63\2\0\3\63\3\42\6\0"+
+    "\1\67\1\0\1\22\2\15\1\16\1\43\2\44\1\45"+
+    "\5\0\1\70\1\0\1\22\2\23\1\24\3\44\1\45"+
+    "\5\0\1\67\1\0\1\22\2\15\1\16\4\45\5\0"+
+    "\1\15\1\0\1\22\2\15\1\16\1\46\2\47\1\50"+
+    "\5\0\1\23\1\0\1\22\2\23\1\24\3\47\1\50"+
+    "\5\0\1\15\1\0\1\22\2\15\1\16\4\50\5\0"+
+    "\1\16\2\0\3\16\1\51\2\52\1\53\5\0\1\24"+
+    "\2\0\3\24\3\52\1\53\5\0\1\16\2\0\3\16"+
+    "\4\53\5\0\1\71\1\0\1\22\2\15\1\16\1\43"+
+    "\2\44\1\45\5\0\1\72\1\0\1\22\2\23\1\24"+
+    "\3\44\1\45\5\0\1\65\1\0\1\22\2\60\1\0"+
+    "\1\56\2\31\1\33\13\0\1\73\1\33\1\73\1\33"+
+    "\13\0\4\41\13\0\4\45\13\0\4\50\13\0\4\53"+
+    "\13\0\1\74\1\45\1\74\1\45\13\0\4\33\13\0"+
+    "\4\75\5\0\1\57\1\0\1\22\2\60\1\0\4\33"+
+    "\5\0\1\71\1\0\1\22\2\15\1\16\4\45\5\0"+
+    "\1\66\2\0\1\66\2\0\4\75\3\0";
 
   private static int [] zzUnpackTrans() {
-    int [] result = new int[795];
+    int [] result = new int[825];
     int offset = 0;
     offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
     return result;
@@ -207,11 +209,12 @@
   private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
 
   private static final String ZZ_ATTRIBUTE_PACKED_0 =
-    "\1\0\1\11\4\1\1\11\1\1\14\0\7\1\2\0"+
-    "\1\1\1\0\16\1\4\0\1\1\2\0\5\1";
+    "\1\0\1\11\4\1\1\11\1\1\6\0\2\1\6\0"+
+    "\7\1\2\0\1\1\1\0\16\1\1\0\1\1\2\0"+
+    "\2\1\1\0\7\1";
 
   private static int [] zzUnpackAttribute() {
-    int [] result = new int[57];
+    int [] result = new int[61];
     int offset = 0;
     offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
     return result;
@@ -288,6 +291,12 @@
 public static final int HOST              = 5;
 public static final int NUM               = 6;
 public static final int CJ                = 7;
+/**
+ * @deprecated this solves a bug where HOSTs that end with '.' are identified
+ *             as ACRONYMs. It is deprecated and will be removed in the next
+ *             release.
+ */
+public static final int ACRONYM_DEP       = 8;
 
 public static final String [] TOKEN_TYPES = new String [] {
     "<ALPHANUM>",
@@ -297,7 +306,8 @@
     "<EMAIL>",
     "<HOST>",
     "<NUM>",
-    "<CJ>"
+    "<CJ>",
+    "<ACRONYM_DEP>"
 };
 
 public final int yychar()
@@ -605,39 +615,43 @@
         case 5: 
           { return HOST;
           }
-        case 10: break;
+        case 11: break;
+        case 9: 
+          { return ACRONYM_DEP;
+          }
+        case 12: break;
         case 8: 
           { return ACRONYM;
           }
-        case 11: break;
+        case 13: break;
         case 1: 
           { /* ignore */
           }
-        case 12: break;
+        case 14: break;
         case 7: 
           { return NUM;
           }
-        case 13: break;
+        case 15: break;
         case 3: 
           { return CJ;
           }
-        case 14: break;
+        case 16: break;
         case 2: 
           { return ALPHANUM;
           }
-        case 15: break;
+        case 17: break;
         case 6: 
           { return COMPANY;
           }
-        case 16: break;
+        case 18: break;
         case 4: 
           { return APOSTROPHE;
           }
-        case 17: break;
-        case 9: 
+        case 19: break;
+        case 10: 
           { return EMAIL;
           }
-        case 18: break;
+        case 20: break;
         default: 
           if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
             zzAtEOF = true;

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex?rev=607161&r1=607160&r2=607161&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
(original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
Thu Dec 27 18:46:11 2007
@@ -38,6 +38,12 @@
 public static final int HOST              = 5;
 public static final int NUM               = 6;
 public static final int CJ                = 7;
+/**
+ * @deprecated this solves a bug where HOSTs that end with '.' are identified
+ *             as ACRONYMs. It is deprecated and will be removed in the next
+ *             release.
+ */
+public static final int ACRONYM_DEP       = 8;
 
 public static final String [] TOKEN_TYPES = new String [] {
     "<ALPHANUM>",
@@ -47,7 +53,8 @@
     "<EMAIL>",
     "<HOST>",
     "<NUM>",
-    "<CJ>"
+    "<CJ>",
+    "<ACRONYM_DEP>"
 };
 
 public final int yychar()
@@ -72,7 +79,9 @@
 
 // acronyms: U.S.A., I.B.M., etc.
 // use a post-filter to remove dots
-ACRONYM    =  {ALPHA} "." ({ALPHA} ".")+
+ACRONYM    =  {LETTER} "." ({LETTER} ".")+
+
+ACRONYM_DEP	= {ALPHANUM} "." ({ALPHANUM} ".")+
 
 // company names like AT&T and Excite@Home.
 COMPANY    =  {ALPHA} ("&"|"@") {ALPHA}
@@ -125,6 +134,7 @@
 {HOST}                                                         { return HOST; }
 {NUM}                                                          { return NUM; }
 {CJ}                                                           { return CJ; }
+{ACRONYM_DEP}                                                  { return ACRONYM_DEP; }
 
 /** Ignore the rest */
 . | {WHITESPACE}                                               { /* ignore */ }

Modified: lucene/java/trunk/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java?rev=607161&r1=607160&r2=607161&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java Thu Dec
27 18:46:11 2007
@@ -92,6 +92,12 @@
   public void testDomainNames() throws Exception {
     // domain names
     assertAnalyzesTo(a, "www.nutch.org", new String[]{"www.nutch.org"});
+    //Notice the trailing .  See https://issues.apache.org/jira/browse/LUCENE-1068.
+     //TODO: Remove in 3.x
+     assertAnalyzesTo(a, "www.nutch.org.", new String[]{ "wwwnutchorg" }, new String[] {
"<ACRONYM>" });
+     // the following should be recognized as HOST. The code that sets replaceInvalidAcronym should be removed in the next release.
+     ((StandardAnalyzer) a).setReplaceInvalidAcronym(true);
+ 	  assertAnalyzesTo(a, "www.nutch.org.", new String[]{ "www.nutch.org" }, new String[] {
"<HOST>" });
   }
 
   public void testEMailAddresses() throws Exception {
@@ -195,4 +201,11 @@
                     "<ALPHANUM>", "<NUM>", "<HOST>", "<NUM>", "<ALPHANUM>",
                     "<ALPHANUM>", "<HOST>"});
   }
+
+  /** @deprecated this should be removed in 3.0. */
+   public void testDeprecatedAcronyms() throws Exception {
+ 	// test backward compatibility for applications that require the old behavior.
+ 	// this should be removed once replaceInvalidAcronym is removed.
+ 	  assertAnalyzesTo(a, "lucene.apache.org.", new String[]{ "luceneapacheorg" }, new String[]
{ "<ACRONYM>" });
+   }
 }



Mime
View raw message