nutch-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From si...@apache.org
Subject svn commit: r160462 - in incubator/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/analysis/NutchAnalysis.java src/java/org/apache/nutch/analysis/NutchAnalysis.jj src/test/org/apache/nutch/analysis/ src/test/org/apache/nutch/analysis/TestQueryParser.java
Date Thu, 07 Apr 2005 20:33:14 GMT
Author: siren
Date: Thu Apr  7 13:33:14 2005
New Revision: 160462

URL: http://svn.apache.org/viewcvs?view=rev&rev=160462
Log:
Fix for bug #4 - Unbalanced quote in query eats all resources.


Added:
    incubator/nutch/trunk/src/test/org/apache/nutch/analysis/
    incubator/nutch/trunk/src/test/org/apache/nutch/analysis/TestQueryParser.java
Modified:
    incubator/nutch/trunk/CHANGES.txt
    incubator/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysis.java
    incubator/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysis.jj

Modified: incubator/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/CHANGES.txt?view=diff&r1=160461&r2=160462
==============================================================================
--- incubator/nutch/trunk/CHANGES.txt (original)
+++ incubator/nutch/trunk/CHANGES.txt Thu Apr  7 13:33:14 2005
@@ -56,6 +56,9 @@
     servers are not queried until tey come back online, watchdog keeps
     an eye for your searchservers and writes simple statistics.
     (Sami Siren, 20050407)
+    
+11. Fix for bug #4 - Unbalanced quote in query eats all resources.
+	(Piotr Kosiorowski, Sami Siren, 20050407)
 
 Release 0.6
 

Modified: incubator/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysis.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysis.java?view=diff&r1=160461&r2=160462
==============================================================================
--- incubator/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysis.java (original)
+++ incubator/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysis.java Thu Apr  7 13:33:14 2005
@@ -147,7 +147,6 @@
     label_2:
     while (true) {
       switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
-      case 0:
       case PLUS:
       case MINUS:
       case COLON:
@@ -181,7 +180,6 @@
       label_4:
       while (true) {
         switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
-        case 0:
         case PLUS:
         case MINUS:
         case COLON:
@@ -200,7 +198,18 @@
       }
     }
     end = token.endColumn;
-    jj_consume_token(QUOTE);
+    switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
+    case QUOTE:
+      jj_consume_token(QUOTE);
+      break;
+    case 0:
+      jj_consume_token(0);
+      break;
+    default:
+      jj_la1[7] = jj_gen;
+      jj_consume_token(-1);
+      throw new ParseException();
+    }
     if (QueryFilters.isRawField(field)) {
       result.clear();
       result.add(queryString.substring(start, end));
@@ -240,7 +249,7 @@
           ;
           break;
         default:
-          jj_la1[7] = jj_gen;
+          jj_la1[8] = jj_gen;
           break label_6;
         }
       }
@@ -269,7 +278,7 @@
       token = jj_consume_token(SIGRAM);
       break;
     default:
-      jj_la1[8] = jj_gen;
+      jj_la1[9] = jj_gen;
       jj_consume_token(-1);
       throw new ParseException();
     }
@@ -292,11 +301,30 @@
     case APOSTROPHE:
       infix();
       break;
+    default:
+      jj_la1[10] = jj_gen;
+      jj_consume_token(-1);
+      throw new ParseException();
+    }
+  }
+
+  final public void nonTermOrEOF() throws ParseException {
+    switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
+    case PLUS:
+    case MINUS:
+    case COLON:
+    case SLASH:
+    case DOT:
+    case ATSIGN:
+    case APOSTROPHE:
+    case WHITE:
+      nonTerm();
+      break;
     case 0:
       jj_consume_token(0);
       break;
     default:
-      jj_la1[9] = jj_gen;
+      jj_la1[11] = jj_gen;
       jj_consume_token(-1);
       throw new ParseException();
     }
@@ -332,14 +360,14 @@
           jj_consume_token(MINUS);
           break;
         default:
-          jj_la1[10] = jj_gen;
+          jj_la1[12] = jj_gen;
           jj_consume_token(-1);
           throw new ParseException();
         }
-        nonTerm();
+        nonTermOrEOF();
         break;
       default:
-        jj_la1[11] = jj_gen;
+        jj_la1[13] = jj_gen;
         jj_consume_token(-1);
         throw new ParseException();
       }
@@ -363,7 +391,7 @@
       nonOpInfix();
       break;
     default:
-      jj_la1[12] = jj_gen;
+      jj_la1[14] = jj_gen;
       jj_consume_token(-1);
       throw new ParseException();
     }
@@ -388,7 +416,7 @@
       jj_consume_token(APOSTROPHE);
       break;
     default:
-      jj_la1[13] = jj_gen;
+      jj_la1[15] = jj_gen;
       jj_consume_token(-1);
       throw new ParseException();
     }
@@ -434,7 +462,7 @@
     return false;
   }
 
-  final private boolean jj_3R_25() {
+  final private boolean jj_3R_26() {
     if (jj_3R_16()) return true;
     return false;
   }
@@ -444,17 +472,6 @@
     return false;
   }
 
-  final private boolean jj_3R_21() {
-    Token xsp;
-    if (jj_3R_25()) return true;
-    while (true) {
-      xsp = jj_scanpos;
-      if (jj_3R_25()) { jj_scanpos = xsp; break; }
-    }
-    if (jj_3R_11()) return true;
-    return false;
-  }
-
   final private boolean jj_3R_16() {
     Token xsp;
     xsp = jj_scanpos;
@@ -468,18 +485,24 @@
     return false;
   }
 
-  final private boolean jj_3R_15() {
-    if (jj_3R_11()) return true;
+  final private boolean jj_3R_21() {
     Token xsp;
+    if (jj_3R_26()) return true;
     while (true) {
       xsp = jj_scanpos;
-      if (jj_3R_21()) { jj_scanpos = xsp; break; }
+      if (jj_3R_26()) { jj_scanpos = xsp; break; }
     }
+    if (jj_3R_11()) return true;
     return false;
   }
 
-  final private boolean jj_3R_23() {
-    if (jj_3R_16()) return true;
+  final private boolean jj_3R_15() {
+    if (jj_3R_11()) return true;
+    Token xsp;
+    while (true) {
+      xsp = jj_scanpos;
+      if (jj_3R_21()) { jj_scanpos = xsp; break; }
+    }
     return false;
   }
 
@@ -496,6 +519,11 @@
     return false;
   }
 
+  final private boolean jj_3R_27() {
+    if (jj_3R_16()) return true;
+    return false;
+  }
+
   final private boolean jj_3_1() {
     if (jj_scan_token(WORD)) return true;
     if (jj_scan_token(COLON)) return true;
@@ -508,6 +536,21 @@
     return false;
   }
 
+  final private boolean jj_3R_23() {
+    if (jj_3R_24()) return true;
+    return false;
+  }
+
+  final private boolean jj_3R_18() {
+    Token xsp;
+    xsp = jj_scanpos;
+    if (jj_3R_23()) {
+    jj_scanpos = xsp;
+    if (jj_scan_token(0)) return true;
+    }
+    return false;
+  }
+
   final private boolean jj_3R_13() {
     Token xsp;
     xsp = jj_scanpos;
@@ -519,21 +562,28 @@
     return false;
   }
 
-  final private boolean jj_3R_18() {
+  final private boolean jj_3R_24() {
     Token xsp;
     xsp = jj_scanpos;
     if (jj_scan_token(15)) {
     jj_scanpos = xsp;
-    if (jj_3R_23()) {
-    jj_scanpos = xsp;
-    if (jj_scan_token(0)) return true;
-    }
+    if (jj_3R_27()) return true;
     }
     return false;
   }
 
-  final private boolean jj_3R_24() {
-    if (jj_3R_18()) return true;
+  final private boolean jj_3R_22() {
+    if (jj_3R_17()) return true;
+    return false;
+  }
+
+  final private boolean jj_3R_25() {
+    if (jj_3R_24()) return true;
+    return false;
+  }
+
+  final private boolean jj_3R_12() {
+    if (jj_3R_17()) return true;
     return false;
   }
 
@@ -555,7 +605,7 @@
     Token xsp;
     while (true) {
       xsp = jj_scanpos;
-      if (jj_3R_24()) { jj_scanpos = xsp; break; }
+      if (jj_3R_25()) { jj_scanpos = xsp; break; }
     }
     return false;
   }
@@ -565,18 +615,8 @@
     return false;
   }
 
-  final private boolean jj_3R_22() {
-    if (jj_3R_17()) return true;
-    return false;
-  }
-
   final private boolean jj_3R_19() {
-    if (jj_3R_18()) return true;
-    return false;
-  }
-
-  final private boolean jj_3R_12() {
-    if (jj_3R_17()) return true;
+    if (jj_3R_24()) return true;
     return false;
   }
 
@@ -607,7 +647,11 @@
       xsp = jj_scanpos;
       if (jj_3R_20()) { jj_scanpos = xsp; break; }
     }
-    if (jj_scan_token(QUOTE)) return true;
+    xsp = jj_scanpos;
+    if (jj_scan_token(9)) {
+    jj_scanpos = xsp;
+    if (jj_scan_token(0)) return true;
+    }
     return false;
   }
 
@@ -619,13 +663,13 @@
   public boolean lookingAhead = false;
   private boolean jj_semLA;
   private int jj_gen;
-  final private int[] jj_la1 = new int[14];
+  final private int[] jj_la1 = new int[16];
   static private int[] jj_la1_0;
   static {
       jj_la1_0();
    }
    private static void jj_la1_0() {
-      jj_la1_0 = new int[] {0x38e,0x180,0x180,0x20e,0xfd81,0xe,0xfd81,0x7d80,0xe,0xfd81,0x180,0xfd80,0x7d80,0x7c00,};
+      jj_la1_0 = new int[] {0x38e,0x180,0x180,0x20e,0xfd80,0xe,0xfd80,0x201,0x7d80,0xe,0xfd80,0xfd81,0x180,0xfd80,0x7d80,0x7c00,};
    }
   final private JJCalls[] jj_2_rtns = new JJCalls[3];
   private boolean jj_rescan = false;
@@ -636,7 +680,7 @@
     token = new Token();
     jj_ntk = -1;
     jj_gen = 0;
-    for (int i = 0; i < 14; i++) jj_la1[i] = -1;
+    for (int i = 0; i < 16; i++) jj_la1[i] = -1;
     for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
   }
 
@@ -645,7 +689,7 @@
     token = new Token();
     jj_ntk = -1;
     jj_gen = 0;
-    for (int i = 0; i < 14; i++) jj_la1[i] = -1;
+    for (int i = 0; i < 16; i++) jj_la1[i] = -1;
     for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
   }
 
@@ -654,7 +698,7 @@
     token = new Token();
     jj_ntk = -1;
     jj_gen = 0;
-    for (int i = 0; i < 14; i++) jj_la1[i] = -1;
+    for (int i = 0; i < 16; i++) jj_la1[i] = -1;
     for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
   }
 
@@ -663,7 +707,7 @@
     token = new Token();
     jj_ntk = -1;
     jj_gen = 0;
-    for (int i = 0; i < 14; i++) jj_la1[i] = -1;
+    for (int i = 0; i < 16; i++) jj_la1[i] = -1;
     for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
   }
 
@@ -782,7 +826,7 @@
       la1tokens[jj_kind] = true;
       jj_kind = -1;
     }
-    for (int i = 0; i < 14; i++) {
+    for (int i = 0; i < 16; i++) {
       if (jj_la1[i] == jj_gen) {
         for (int j = 0; j < 32; j++) {
           if ((jj_la1_0[i] & (1<<j)) != 0) {

Modified: incubator/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysis.jj
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysis.jj?view=diff&r1=160461&r2=160462
==============================================================================
--- incubator/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysis.jj (original)
+++ incubator/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysis.jj Thu Apr  7 13:33:14 2005
@@ -239,7 +239,7 @@
 
   { end = token.endColumn; }
 
-  <QUOTE>
+  (<QUOTE>|<EOF>)
     
   {
     if (QueryFilters.isRawField(field)) {
@@ -294,15 +294,20 @@
 void nonTerm() :
 {}
 {
-  <WHITE> | infix() | <EOF>
+  <WHITE> | infix()
 }
 
+void nonTermOrEOF() :
+{}
+{
+  nonTerm() | <EOF>
+}
 
 /** Parse anything but a term or an operator (plur or minus or quote). */
 void nonOpOrTerm() :
 {}
 {
-  (LOOKAHEAD(2) (<WHITE> | nonOpInfix() | ((<PLUS>|<MINUS>) nonTerm())))*
+  (LOOKAHEAD(2) (<WHITE> | nonOpInfix() | ((<PLUS>|<MINUS>) nonTermOrEOF())))*
 }
 
 /** Characters which can be used to form compound terms. */

Added: incubator/nutch/trunk/src/test/org/apache/nutch/analysis/TestQueryParser.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/test/org/apache/nutch/analysis/TestQueryParser.java?view=auto&rev=160462
==============================================================================
--- incubator/nutch/trunk/src/test/org/apache/nutch/analysis/TestQueryParser.java (added)
+++ incubator/nutch/trunk/src/test/org/apache/nutch/analysis/TestQueryParser.java Thu Apr  7 13:33:14 2005
@@ -0,0 +1,90 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.analysis;
+
+import org.apache.nutch.searcher.Query;
+
+import junit.framework.TestCase;
+
+/**
+ * JUnit tests for query parser
+ *  
+ */
+public class TestQueryParser extends TestCase {
+
+  public void assertQueryEquals(String query, String result) throws Exception {
+    try {
+      Query q = NutchAnalysis.parseQuery(query);
+      String s = q.toString();
+      if (!s.equals(result)) {
+        fail("Query /" + query + "/ yielded /" + s + "/, expecting /" + result
+            + "/");
+      }
+    } catch (Exception e) {
+      throw new Exception("error: While parsing query:" + query, e);
+    }
+  }
+
+  /**
+   * Test query parser
+   * 
+   * @throws Exception
+   */
+  public void testParseQuery() throws Exception {
+    //simple tests
+    assertQueryEquals("x", "x");
+    assertQueryEquals("X", "x");
+    assertQueryEquals("+x", "x");
+    assertQueryEquals("-x", "-x");
+    assertQueryEquals("x y", "x y");
+    assertQueryEquals(" x  y ", "x y");
+    assertQueryEquals("test +", "test");
+
+    // missing fourth double quote
+    assertQueryEquals("\" abc def \" \" def ghi ", "\"abc def\" \"def ghi\"");
+
+    //empty query
+    assertQueryEquals("\"", "");
+
+    //fields
+    assertQueryEquals("field:x -another:y", "field:x -another:y");
+    assertQueryEquals("the:x", "the:x");
+
+    //ACRONYM
+    assertQueryEquals("w.s.o.p.", "wsop");
+
+    //STOPWORD
+    assertQueryEquals("the", "");
+    assertQueryEquals("field:the -y", "-y");
+    assertQueryEquals("+the -y", "the -y");
+
+    //PHRASE
+    assertQueryEquals("\"hello world\"", "\"hello world\"");
+    assertQueryEquals("\"phrase a.b.c. phrase\"", "\"phrase abc phrase\"");
+    assertQueryEquals("\"the end\"", "\"the end\"");
+    assertQueryEquals("term\"the end\"", "term \"the end\"");
+    //unbalanced
+    assertQueryEquals("term\"the end", "term \"the end\"");
+
+    //SIGRAM
+    assertQueryEquals("\u3040\u3041\u3042", "\u3040 \u3041 \u3042");
+
+    //COMPOUND
+    assertQueryEquals("term some.email@adress.here",
+        "term \"some email adress here\"");
+  }
+}



Mime
View raw message