jakarta-regexp-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From vgritse...@apache.org
Subject svn commit: r232192 - in /jakarta/regexp/trunk: src/java/org/apache/regexp/RE.java src/java/org/apache/regexp/REProgram.java src/java/org/apache/regexp/RETest.java xdocs/changes.xml
Date Fri, 12 Aug 2005 03:04:14 GMT
Author: vgritsenko
Date: Thu Aug 11 20:04:07 2005
New Revision: 232192

URL: http://svn.apache.org/viewcvs?rev=232192&view=rev
Log:
Applied patches for Bug #27795:
Add optimization for regexps which start with ^ (BOL)


Modified:
    jakarta/regexp/trunk/src/java/org/apache/regexp/RE.java
    jakarta/regexp/trunk/src/java/org/apache/regexp/REProgram.java
    jakarta/regexp/trunk/src/java/org/apache/regexp/RETest.java
    jakarta/regexp/trunk/xdocs/changes.xml

Modified: jakarta/regexp/trunk/src/java/org/apache/regexp/RE.java
URL: http://svn.apache.org/viewcvs/jakarta/regexp/trunk/src/java/org/apache/regexp/RE.java?rev=232192&r1=232191&r2=232192&view=diff
==============================================================================
--- jakarta/regexp/trunk/src/java/org/apache/regexp/RE.java (original)
+++ jakarta/regexp/trunk/src/java/org/apache/regexp/RE.java Thu Aug 11 20:04:07 2005
@@ -1414,6 +1414,43 @@
         // Save string to search
         this.search = search;
 
+        // Can we optimize the search by looking for new lines?
+        if ((program.flags & REProgram.OPT_HASBOL) == REProgram.OPT_HASBOL)
+        {
+            // Non multi-line matching with BOL: Must match at '0' index
+            if ((matchFlags & MATCH_MULTILINE) == 0)
+            {
+                return i == 0 && matchAt(i);
+            }
+
+            // Multi-line matching with BOL: Seek to next line
+            for ( ;! search.isEnd(i); i++)
+            {
+                // Skip if we are at the beginning of the line
+                if (isNewline(i))
+                {
+                    continue;
+                }
+
+                // Match at the beginning of the line
+                if (matchAt(i))
+                {
+                    return true;
+                }
+
+                // Skip to the end of line
+                for ( ;! search.isEnd(i); i++)
+                {
+                    if (isNewline(i))
+                    {
+                        break;
+                    }
+                }
+            }
+
+            return false;
+        }
+
         // Can we optimize the search by looking for a prefix string?
         if (program.prefix == null)
         {

Modified: jakarta/regexp/trunk/src/java/org/apache/regexp/REProgram.java
URL: http://svn.apache.org/viewcvs/jakarta/regexp/trunk/src/java/org/apache/regexp/REProgram.java?rev=232192&r1=232191&r2=232192&view=diff
==============================================================================
--- jakarta/regexp/trunk/src/java/org/apache/regexp/REProgram.java (original)
+++ jakarta/regexp/trunk/src/java/org/apache/regexp/REProgram.java Thu Aug 11 20:04:07 2005
@@ -33,6 +33,7 @@
 public class REProgram implements Serializable
 {
     static final int OPT_HASBACKREFS = 1;
+    static final int OPT_HASBOL      = 2;
 
     char[] instruction;         // The compiled regular expression 'program'
     int lenInstruction;         // The amount of the instruction buffer in use
@@ -81,7 +82,7 @@
         // Ensure program has been compiled!
         if (lenInstruction != 0)
         {
-            // Return copy of program 
+            // Return copy of program
             char[] ret = new char[lenInstruction];
             System.arraycopy(instruction, 0, ret, 0, lenInstruction);
             return ret;
@@ -116,16 +117,23 @@
             if (lenInstruction >= RE.nodeSize && instruction[0 + RE.offsetOpcode] == RE.OP_BRANCH)
             {
                 // to the end node
-                int next = instruction[0 + RE.offsetNext];
-                if (instruction[next + RE.offsetOpcode] == RE.OP_END)
+                char next = instruction[0 + RE.offsetNext];
+                if (instruction[next + RE.offsetOpcode] == RE.OP_END && lenInstruction >= (RE.nodeSize * 2))
                 {
-                    // and the branch starts with an atom
-                    if (lenInstruction >= (RE.nodeSize * 2) && instruction[RE.nodeSize + RE.offsetOpcode] == RE.OP_ATOM)
+                    final char nextOp = instruction[RE.nodeSize + RE.offsetOpcode];
+                    // the branch starts with an atom
+                    if (nextOp == RE.OP_ATOM)
                     {
                         // then get that atom as an prefix because there's no other choice
                         int lenAtom = instruction[RE.nodeSize + RE.offsetOpdata];
                         prefix = new char[lenAtom];
                         System.arraycopy(instruction, RE.nodeSize * 2, prefix, 0, lenAtom);
+                    }
+                    // the branch starts with a BOL
+                    else if (nextOp == RE.OP_BOL)
+                    {
+                        // then set the flag indicating that BOL is present
+                        flags |= OPT_HASBOL;
                     }
                 }
             }

Modified: jakarta/regexp/trunk/src/java/org/apache/regexp/RETest.java
URL: http://svn.apache.org/viewcvs/jakarta/regexp/trunk/src/java/org/apache/regexp/RETest.java?rev=232192&r1=232191&r2=232192&view=diff
==============================================================================
--- jakarta/regexp/trunk/src/java/org/apache/regexp/RETest.java (original)
+++ jakarta/regexp/trunk/src/java/org/apache/regexp/RETest.java Thu Aug 11 20:04:07 2005
@@ -378,6 +378,12 @@
             showParens(r);
         }
 
+        // Test for eol/bol symbols.
+        r = new RE("^abc$");
+        if (r.match("\nabc")) {
+            fail("\"\\nabc\" matches \"^abc$\"");
+        }
+
         // Test MATCH_MULTILINE. Test for eol/bol symbols.
         r = new RE("^abc$", RE.MATCH_MULTILINE);
         if (!r.match("\nabc")) {

Modified: jakarta/regexp/trunk/xdocs/changes.xml
URL: http://svn.apache.org/viewcvs/jakarta/regexp/trunk/xdocs/changes.xml?rev=232192&r1=232191&r2=232192&view=diff
==============================================================================
--- jakarta/regexp/trunk/xdocs/changes.xml (original)
+++ jakarta/regexp/trunk/xdocs/changes.xml Thu Aug 11 20:04:07 2005
@@ -34,53 +34,56 @@
 
 <h3>Version 1.4-dev</h3>
 <ul>
-<li>Fixed Bug 
+<li>Applied patches for Bug
+    <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=27795">27795</a>:
+    Add optimization for regexps which start with ^ (BOL) (VG)</li>
+<li>Fixed Bug
     <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=25985">25985</a>:
     In MATCH_MULTILINE mode $ does not match end of line (VG)</li>
-<li>Fixed Bug 
+<li>Fixed Bug
     <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=2121">2121</a>:
     '.' or '-' in bracket expression gives unexpected results (VG)</li>
 <li>Regexp is relicensed to <a href="http://www.apache.org/licenses/LICENSE-2.0">
     Apache License, Version 2.0</a> (VG)</li>
-<li>Fixed Bug 
+<li>Fixed Bug
     <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=2525">2525</a>:
     Leading zero-length string splitted by RE (VG)</li>
-<li>Applied patches for Bug 
+<li>Applied patches for Bug
     <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=4137">4137</a>:
     Regexp match gets different results on different platforms (VG)</li>
-<li>Applied patches for Bug 
+<li>Applied patches for Bug
     <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=3303">3303</a>:
     Unicode 3.0 character \\uFFFD (VG)</li>
-<li>Applied patches for Bug 
+<li>Applied patches for Bug
     <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=3773">3773</a>:
     Problem with parsing greedy match modifiers (VG)</li>
-<li>Applied patches for Bug 
+<li>Applied patches for Bug
     <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=3273">3273</a>:
     CharacterArrayCharacterIterator docs and implementation mismatch (VG)</li>
-<li>Fixed Bug 
+<li>Fixed Bug
     <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=22928">22928</a>:
     subst() with REPLACE_BACKREFERENCES cuts first 2 characters (VG)</li>
 </ul>
 
 <h3>Version 1.3</h3>
 <ul>
-<li>Fixed Bug 
+<li>Fixed Bug
     <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=22804">22804</a>:
     ArrayIndexOutOfBoundsException on negated classes (VG)</li>
 <li>New Feature: subst() can now process backreferences when flag
     REPLACE_BACKREFERENCES is set. See API docs for details.
     Patch provided by Tobias Schaefer. (VG)</li>
-<li>Applied patches for Bug 
+<li>Applied patches for Bug
     <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=16592">16592</a>:
     Syntax error: Too many bracketed closures (limit is 10) (VG)</li>
-<li>Fixed Bug 
+<li>Fixed Bug
     <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=5212">5212</a>, aka
     <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=14954">14954</a>:
     A bug caused by '-' in character class definition ('[...]') (VG)</li>
-<li>Fixed Bug 
+<li>Fixed Bug
     <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=4057">4057</a>:
     \w does not match underscore (VG)</li>
-<li>Fixed Bug 
+<li>Fixed Bug
     <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=1030">1030</a>, aka
     <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=10893">10893</a>:
     {n.m} notation work incorrect if n=0 (VG)</li>
@@ -89,22 +92,22 @@
     Expressions using {0,n} match 0 to n+1 times instead of 0 to n times.
     Now, expression "[a-z]{0,3}" matches "123abcdefg123" resulting in ""
     (empty string). (VG)</li>
-<li>Fixed Bug 
+<li>Fixed Bug
     <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=306">306</a>:
     Why is the RE class not Serializable? (VG)</li>
-<li>Applied patches for Bug 
+<li>Applied patches for Bug
     <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=3879">3879</a>:
     Expressions using {0,n} match 0 to n+1 times instead of 0 to n times. (JSS)</li>
-<li>Applied patches for Bug 
+<li>Applied patches for Bug
     <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=7288">7288</a>:
     Bug in negative character ranges. (JSS)</li>
-<li>Applied patches for Bug 
+<li>Applied patches for Bug
     <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=986">986</a>:
     Leading "\b" word boundary is ignored. (JSS)</li>
-<li>Applied patches for Bug 
+<li>Applied patches for Bug
     <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=3877">3877</a>:
     {n} and {n,m} not thread safe. (JSS)</li>
-<li>Applied patches for Bug 
+<li>Applied patches for Bug
     <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=8467">8467</a>:
     Number of paren pairs limited to 16 (JSS)</li>
 <li>Fixed RE.grep() documentation to reflect a String[] is returned
@@ -117,7 +120,7 @@
 <h3>Version 1.2</h3>
 <ul>
 <li>Updated to Ant 1.2 (JSS)</li>
-<li>Documentation now built with <a 
+<li>Documentation now built with <a
 href="http://jakarta.apache.org/site/jakarta-site2.html">Anakia</a> (JSS)</li>
 <li><a href="http://jakarta.apache.org/cvsweb/index.cgi/jakarta-regexp/src/java/org/apache/regexp/RE.java?rev=1.3&amp;content-type=text/vnd.viewcvs-markup">Fixed bug</a></li>
 <li><a href="http://jakarta.apache.org/cvsweb/index.cgi/jakarta-regexp/src/java/org/apache/regexp/RE.java?rev=1.4&amp;content-type=text/vnd.viewcvs-markup">



---------------------------------------------------------------------
To unsubscribe, e-mail: regexp-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: regexp-dev-help@jakarta.apache.org


Mime
View raw message