lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mikemcc...@apache.org
Subject svn commit: r618001 - in /lucene/java/trunk: CHANGES.txt src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java
Date Sun, 03 Feb 2008 15:33:17 GMT
Author: mikemccand
Date: Sun Feb  3 07:33:16 2008
New Revision: 618001

URL: http://svn.apache.org/viewvc?rev=618001&view=rev
Log:
LUCENE-1151: don't mis-identify HOST as ACRONYM, but, provide static method/property to revert
to backwards-compatible but buggy behavior

Modified:
    lucene/java/trunk/CHANGES.txt
    lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
    lucene/java/trunk/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java

Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=618001&r1=618000&r2=618001&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Sun Feb  3 07:33:16 2008
@@ -5,6 +5,19 @@
 
 Changes in runtime behavior
 
+ 1. LUCENE-1151: Fix StandardAnalyzer to not mis-identify host names
+    (eg lucene.apache.org) as an ACRONYM.  To get back to the pre-2.4
+    backwards compatible, but buggy, behavior, you can either call
+    StandardAnalyzer.setDefaultReplaceInvalidAcronym(false) (static
+    method), or, set system property
+    org.apache.lucene.analysis.standard.StandardAnalyzer.replaceInvalidAcronym
+    to "false" on JVM startup.  All StandardAnalyzer instances created
+    after that will then show the pre-2.4 behavior.  Alternatively,
+    you can call setReplaceInvalidAcronym(false) to change the
+    behavior per instance of StandardAnalyzer.  This backwards
+    compatibility will be removed in 3.0 (hardwiring the value to
+    true).  (Mike McCandless)
+
 API Changes
 
  1. LUCENE-1084: Changed all IndexWriter constructors to take an

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java?rev=618001&r1=618000&r2=618001&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java Sun Feb  3 07:33:16 2008
@@ -41,8 +41,49 @@
    *
    * See https://issues.apache.org/jira/browse/LUCENE-1068
    */
-  private boolean replaceInvalidAcronym = false;
-  
+  private boolean replaceInvalidAcronym = defaultReplaceInvalidAcronym;
+
+  private static boolean defaultReplaceInvalidAcronym;
+
+  // Default to true (the fixed behavior), unless the system prop is set to a value other than "true"
+  static {
+    final String v = System.getProperty("org.apache.lucene.analysis.standard.StandardAnalyzer.replaceInvalidAcronym");
+    if (v == null || v.equals("true"))
+      defaultReplaceInvalidAcronym = true;
+    else
+      defaultReplaceInvalidAcronym = false;
+  }
+
+  /**
+   *
+   * @return true if new instances of StandardTokenizer will
+   * replace mischaracterized acronyms
+   *
+   * See https://issues.apache.org/jira/browse/LUCENE-1068
+   * @deprecated This will be removed (hardwired to true) in 3.0
+   */
+  public static boolean getDefaultReplaceInvalidAcronym() {
+    return defaultReplaceInvalidAcronym;
+  }
+
+  /**
+   *
+   * @param replaceInvalidAcronym Set to true to have new
+   * instances of StandardTokenizer replace mischaracterized
+   * acronyms by default.  Set to false to preserve the
+   * previous (before 2.4) buggy behavior.  Alternatively,
+   * set the system property
+   * org.apache.lucene.analysis.standard.StandardAnalyzer.replaceInvalidAcronym
+   * to false.
+   *
+   * See https://issues.apache.org/jira/browse/LUCENE-1068
+   * @deprecated This will be removed (hardwired to true) in 3.0
+   */
+  public static void setDefaultReplaceInvalidAcronym(boolean replaceInvalidAcronym) {
+    defaultReplaceInvalidAcronym = replaceInvalidAcronym;
+  }
+
+
   /** An array containing some common English words that are usually not
   useful for searching. */
   public static final String[] STOP_WORDS = StopAnalyzer.ENGLISH_STOP_WORDS;
@@ -204,6 +245,7 @@
    * @return true if this Analyzer is replacing mischaracterized acronyms in the StandardTokenizer
    *
    * See https://issues.apache.org/jira/browse/LUCENE-1068
+   * @deprecated This will be removed (hardwired to true) in 3.0
    */
   public boolean isReplaceInvalidAcronym() {
     return replaceInvalidAcronym;
@@ -214,6 +256,7 @@
    * @param replaceInvalidAcronym Set to true if this Analyzer is replacing mischaracterized acronyms in the StandardTokenizer
    *
    * See https://issues.apache.org/jira/browse/LUCENE-1068
+   * @deprecated This will be removed (hardwired to true) in 3.0
    */
   public void setReplaceInvalidAcronym(boolean replaceInvalidAcronym) {
     this.replaceInvalidAcronym = replaceInvalidAcronym;

Modified: lucene/java/trunk/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java?rev=618001&r1=618000&r2=618001&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java Sun Feb  3 07:33:16 2008
@@ -134,11 +134,11 @@
     // domain names
     assertAnalyzesTo(a, "www.nutch.org", new String[]{"www.nutch.org"});
     //Notice the trailing .  See https://issues.apache.org/jira/browse/LUCENE-1068.
-     //TODO: Remove in 3.x
-     assertAnalyzesTo(a, "www.nutch.org.", new String[]{ "wwwnutchorg" }, new String[] {
"<ACRONYM>" });
-     // the following should be recognized as HOST. The code that sets replaceDepAcronym
should be removed in the next release.
-     ((StandardAnalyzer) a).setReplaceInvalidAcronym(true);
- 	  assertAnalyzesTo(a, "www.nutch.org.", new String[]{ "www.nutch.org" }, new String[] {
"<HOST>" });
+    // the following should be recognized as HOST:
+    assertAnalyzesTo(a, "www.nutch.org.", new String[]{ "www.nutch.org" }, new String[] { "<HOST>" });
+    ((StandardAnalyzer) a).setReplaceInvalidAcronym(false);
+    assertAnalyzesTo(a, "www.nutch.org.", new String[]{ "wwwnutchorg" }, new String[] { "<ACRONYM>" });
+    ((StandardAnalyzer) a).setReplaceInvalidAcronym(true);
   }
 
   public void testEMailAddresses() throws Exception {
@@ -247,6 +247,6 @@
    public void testDeprecatedAcronyms() throws Exception {
  	// test backward compatibility for applications that require the old behavior.
  	// this should be removed once replaceDepAcronym is removed.
- 	  assertAnalyzesTo(a, "lucene.apache.org.", new String[]{ "luceneapacheorg" }, new String[]
{ "<ACRONYM>" });
+ 	  assertAnalyzesTo(a, "lucene.apache.org.", new String[]{ "lucene.apache.org" }, new String[]
{ "<HOST>" });
    }
 }



Mime
View raw message