httpd-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Justin Erenkrantz <jerenkra...@ebuilt.com>
Subject [PATCH] Take 3 of mod_include patch...
Date Wed, 05 Sep 2001 23:44:53 GMT
Okay, I've cleaned this up and I think it is ready for commit.
However, I'd really like some eyes on this.  =-)  

In Ian and Brian's testing, this does seem to make mod_include
faster.  I can't guarantee that there aren't any bugs here,
but I've tested it with what I have and looked at the code as
best as I can.  I think this gets us moving in the right
direction.

Changes since last post:
- don't call strlen on the static string (thanks Cliff!)
- move the compilation of the starting_sequence to post_config
- comment (well, tried to...)
- fixed some of the parameters to fit our type conventions.  

Comments?  Can someone who has knowledge of the bndm algorithm
verify this?  I'll try glancing at the paper tonight or tomorrow.
-- justin

Index: modules/filters/mod_include.c
===================================================================
RCS file: /home/cvs/httpd-2.0/modules/filters/mod_include.c,v
retrieving revision 1.143
diff -u -r1.143 mod_include.c
--- modules/filters/mod_include.c	2001/09/04 02:13:58	1.143
+++ modules/filters/mod_include.c	2001/09/05 23:39:18
@@ -206,6 +206,113 @@
 
 /* --------------------------- Parser functions --------------------------- */
 
+/* This is an implementation of the BNDM search algorithm.
+ *
+ * Fast and Flexible String Matching by Combining Bit-parallelism and 
+ * Suffix Automata (2001) 
+ * Gonzalo Navarro, Mathieu Raffinot
+ *
+ * http://www-igm.univ-mlv.fr/~raffinot/ftp/jea2001.ps.gz
+ *
+ * Initial code submitted by Sascha Schumann.
+ */
+   
+typedef struct {
+    unsigned int T[256];
+    unsigned int x;
+} bndm_t;
+
+/* This is the pattern matcher that holds the STARTING_SEQUENCE bndm_t
+ * structure.
+ */
+static bndm_t start_seq_pat;
+
+/* Precompile the bndm_t data structure. */
+static void bndm_compile(bndm_t *t, const char *n, apr_size_t nl)
+{
+    unsigned int x;
+    const char *ne = n + nl;
+
+    memset(t->T, 0, sizeof(unsigned int) * 256);
+    
+    for (x = 1; n < ne; x <<= 1)
+        t->T[(unsigned char) *n++] |= x;
+
+    t->x = x - 1;
+}
+
+/* Implements the BNDM search algorithm (as described above).
+ *
+ * n  - the pattern to search for
+ * nl - length of the pattern to search for
+ * h  - the string to look in
+ * hl - length of the string to look for
+ * t  - precompiled bndm structure against the pattern 
+ *
+ * Returns the count of character that is the first match or hl if no
+ * match is found.
+ */
+static int bndm(const char *n, apr_size_t nl, const char *h, apr_size_t hl, 
+                bndm_t *t)
+{
+    apr_size_t skip;
+    const char *he, *p, *pi;
+    unsigned int *T, x, d;
+
+    he = h + hl;
+
+    T = t->T;
+    x = t->x << 1;
+
+    pi = h - 1; /* pi: p initial */
+    p = pi + nl; /* compare window right to left. point to the first char */
+
+    do {
+        skip = nl;
+        d = x;
+        do {
+            d = (d >> 1) & T[(unsigned char) *p--];
+            if ((d & 1)) {
+                if (p != pi)
+                    skip = p - pi;
+                else
+                    return p - h + 1;
+            }
+        } while (d > 1);
+        p = (pi += skip) + nl; 
+    } while (p < he);
+
+    return hl;
+}
+
+/* We've now found a start sequence tag... */
+static apr_bucket* found_start_sequence(apr_bucket *dptr,
+                                          include_ctx_t *ctx, 
+                                          int tagStart)
+{
+    /* We want to split the bucket at the '<'. */
+    ctx->state = PARSE_DIRECTIVE;
+    ctx->tag_length = 0;
+    ctx->parse_pos = 0;
+    ctx->tag_start_bucket = dptr;
+    ctx->tag_start_index = tagStart;
+    if (ctx->head_start_index > 0) {
+        apr_bucket *tmp_bkt;
+
+        /* Split the bucket with the start of the tag in it */
+        apr_bucket_split(ctx->head_start_bucket, ctx->head_start_index);
+        tmp_bkt = APR_BUCKET_NEXT(ctx->head_start_bucket);
+        /* If it was a one bucket match */
+        if (dptr == ctx->head_start_bucket) {
+            ctx->tag_start_bucket = tmp_bkt;
+            ctx->tag_start_index = tagStart - ctx->head_start_index;
+        }
+        ctx->head_start_bucket = tmp_bkt;
+        ctx->head_start_index = 0;
+    }
+    return ctx->head_start_bucket;
+}
+
 /* This function returns either a pointer to the split bucket containing the
  * first byte of the BEGINNING_SEQUENCE (after finding a complete match) or it
  * returns NULL if no match found.
@@ -217,8 +324,8 @@
     const char *c;
     const char *buf;
     const char *str = STARTING_SEQUENCE;
-    apr_bucket *tmp_bkt;
-    apr_size_t  start_index;
+    int slen = sizeof(STARTING_SEQUENCE) - 1;
+    int pos;
 
     *do_cleanup = 0;
 
@@ -269,8 +376,53 @@
         if (len == 0) { /* end of pipe? */
             break;
         }
+
+        /* Set our buffer to use. */
         c = buf;
-        while (c < buf + len) {
+
+        /* The last bucket had a left over partial match that we need to
+         * complete. 
+         */
+        if (ctx->state == PARSE_HEAD)
+        {
+            apr_size_t tmpLen;
+            tmpLen = (len > (slen - 1)) ? len : (slen - 1);
+
+            while (c < buf + tmpLen && *c == str[ctx->parse_pos])
+            {
+                c++; 
+                ctx->parse_pos++;
+            }
+
+            if (str[ctx->parse_pos] == '\0')
+            {
+                ctx->bytes_parsed += c - buf;
+                return found_start_sequence(dptr, ctx, c - buf);
+            }
+
+            /* False alarm... */
+            ctx->state = PRE_HEAD;
+        }
+
+        if (len)
+        {
+            pos = bndm(str, slen, c, len, &start_seq_pat);
+            if (pos != len)
+            {
+                ctx->head_start_bucket = dptr;
+                ctx->head_start_index = pos;
+                ctx->bytes_parsed += pos + slen;
+                return found_start_sequence(dptr, ctx, pos + slen);
+            }
+        }
+        
+        /* Consider the case where we have <!-- at the end of the bucket. */
+        ctx->bytes_parsed += len - slen;
+        ctx->parse_pos = 0;
+
+        c = buf + len - slen + 1;
+        while (c < buf + len)
+        {
             if (*c == str[ctx->parse_pos]) {
                 if (ctx->state == PRE_HEAD) {
                     ctx->state = PARSE_HEAD;
@@ -279,48 +431,24 @@
                 }
                 ctx->parse_pos++;
             }
-            else {
-                if (str[ctx->parse_pos] == '\0') {
-                    /* We want to split the bucket at the '<'. */
-                    ctx->bytes_parsed++;
-                    ctx->state = PARSE_DIRECTIVE;
-                    ctx->tag_length = 0;
-                    ctx->parse_pos = 0;
-                    ctx->tag_start_bucket = dptr;
-                    ctx->tag_start_index = c - buf;
-                    if (ctx->head_start_index > 0) {
-                        start_index = (c - buf) - ctx->head_start_index;
-                        apr_bucket_split(ctx->head_start_bucket, 
-                                         ctx->head_start_index);
-                        tmp_bkt = APR_BUCKET_NEXT(ctx->head_start_bucket);
-                        if (dptr == ctx->head_start_bucket) {
-                            ctx->tag_start_bucket = tmp_bkt;
-                            ctx->tag_start_index = start_index;
-                        }
-                        ctx->head_start_bucket = tmp_bkt;
-                        ctx->head_start_index = 0;
-                    }
-                    return ctx->head_start_bucket;
+            else if (ctx->parse_pos != 0) 
+            {
+                /* The reason for this, is that we need to make sure 
+                 * that we catch cases like <<!--#.  This makes the 
+                 * second check after the original check fails.
+                 * If parse_pos was already 0 then we already checked this.
+                 */
+                /* FIXME: Why? */
+                *do_cleanup = 1;
+                if (*c == str[0]) {
+                    ctx->parse_pos = 1;
+                    ctx->head_start_index = c - buf;
                 }
-                else if (ctx->parse_pos != 0) {
-                    /* The reason for this, is that we need to make sure 
-                     * that we catch cases like <<!--#.  This makes the 
-                     * second check after the original check fails.
-                     * If parse_pos was already 0 then we already checked this.
-                     */
-                    *do_cleanup = 1;
-                    if (*c == str[0]) {
-                        ctx->parse_pos = 1;
-                        ctx->state = PARSE_HEAD;
-                        ctx->head_start_bucket = dptr;
-                        ctx->head_start_index = c - buf;
-                    }
-                    else {
-                        ctx->parse_pos = 0;
-                        ctx->state = PRE_HEAD;
-                        ctx->head_start_bucket = NULL;
-                        ctx->head_start_index = 0;
-                    }
+                else {
+                    ctx->parse_pos = 0;
+                    ctx->state = PRE_HEAD;
+                    ctx->head_start_bucket = NULL;
+                    ctx->head_start_index = 0;
                 }
             }
             c++;
@@ -2970,6 +3098,10 @@
                                 apr_pool_t *ptemp, server_rec *s)
 {
     include_hash = apr_hash_make(p);
+
+    /* compile the pattern used by find_start_sequence */
+    bndm_compile(&start_seq_pat, STARTING_SEQUENCE, 
+                 sizeof(STARTING_SEQUENCE)-1); 
 
     ssi_pfn_register = APR_RETRIEVE_OPTIONAL_FN(ap_register_include_handler);
 


Mime
View raw message