jerenkrantz 01/09/06 16:58:29
Modified: . CHANGES
modules/filters mod_include.c
Log:
Make find_start_sequence use the BNDM search algorithm. We handle
edge cases via the old slow mechanism.
Previously, find_start_sequence would be responsible for ~25% of the
usr CPU time in tests (as performed by Ian). No more.
Revision Changes Path
1.358 +3 -0 httpd-2.0/CHANGES
Index: CHANGES
===================================================================
RCS file: /home/cvs/httpd-2.0/CHANGES,v
retrieving revision 1.357
retrieving revision 1.358
diff -u -r1.357 -r1.358
--- CHANGES 2001/09/06 17:58:27 1.357
+++ CHANGES 2001/09/06 23:58:29 1.358
@@ -1,5 +1,8 @@
Changes with Apache 2.0.26-dev
+ *) Rewrite find_start_sequence to use a better search algorithm
+ to find the start tag. [Justin Erenkrantz]
+
*) Fix a seg fault in mod_include. When we are generating an
internal redirect, we must set r->uri to "", not a bogus
string, and not NULL. [Ryan Bloom]
1.144 +176 -44 httpd-2.0/modules/filters/mod_include.c
Index: mod_include.c
===================================================================
RCS file: /home/cvs/httpd-2.0/modules/filters/mod_include.c,v
retrieving revision 1.143
retrieving revision 1.144
diff -u -r1.143 -r1.144
--- mod_include.c 2001/09/04 02:13:58 1.143
+++ mod_include.c 2001/09/06 23:58:29 1.144
@@ -206,6 +206,113 @@
/* --------------------------- Parser functions --------------------------- */
+/* This is an implementation of the BNDM search algorithm.
+ *
+ * Fast and Flexible String Matching by Combining Bit-parallelism and
+ * Suffix Automata (2001)
+ * Gonzalo Navarro, Mathieu Raffinot
+ *
+ * http://www-igm.univ-mlv.fr/~raffinot/ftp/jea2001.ps.gz
+ *
+ * Initial code submitted by Sascha Schumann.
+ */
+
+typedef struct {
+ unsigned int T[256];
+ unsigned int x;
+} bndm_t;
+
+/* This is the pattern matcher that holds the STARTING_SEQUENCE bndm_t
+ * structure.
+ */
+static bndm_t start_seq_pat;
+
+/* Precompile the bndm_t data structure. */
+static void bndm_compile(bndm_t *t, const char *n, apr_size_t nl)
+{
+ unsigned int x;
+ const char *ne = n + nl;
+
+ memset(t->T, 0, sizeof(unsigned int) * 256);
+
+ for (x = 1; n < ne; x <<= 1)
+ t->T[(unsigned char) *n++] |= x;
+
+ t->x = x - 1;
+}
+
+/* Implements the BNDM search algorithm (as described above).
+ *
+ * n - the pattern to search for
+ * nl - length of the pattern to search for
+ * h - the string to look in
+ * hl - length of the string to look for
+ * t - precompiled bndm structure against the pattern
+ *
+ * Returns the count of character that is the first match or hl if no
+ * match is found.
+ */
+static int bndm(const char *n, apr_size_t nl, const char *h, apr_size_t hl,
+ bndm_t *t)
+{
+ apr_size_t skip;
+ const char *he, *p, *pi;
+ unsigned int *T, x, d;
+
+ he = h + hl;
+
+ T = t->T;
+ x = t->x << 1;
+
+ pi = h - 1; /* pi: p initial */
+ p = pi + nl; /* compare window right to left. point to the first char */
+
+ while (p < he) {
+ skip = nl;
+ d = x;
+ do {
+ d = (d >> 1) & T[(unsigned char) *p--];
+ if ((d & 1)) {
+ if (p != pi)
+ skip = p - pi;
+ else
+ return p - h + 1;
+ }
+ } while (d > 1);
+ p = (pi += skip) + nl;
+ }
+
+ return hl;
+}
+
+/* We've now found a start sequence tag... */
+static apr_bucket* found_start_sequence(apr_bucket *dptr,
+ include_ctx_t *ctx,
+ int tagStart)
+{
+ /* We want to split the bucket at the '<'. */
+ ctx->state = PARSE_DIRECTIVE;
+ ctx->tag_length = 0;
+ ctx->parse_pos = 0;
+ ctx->tag_start_bucket = dptr;
+ ctx->tag_start_index = tagStart;
+ if (ctx->head_start_index > 0) {
+ apr_bucket *tmp_bkt;
+
+ /* Split the bucket with the start of the tag in it */
+ apr_bucket_split(ctx->head_start_bucket, ctx->head_start_index);
+ tmp_bkt = APR_BUCKET_NEXT(ctx->head_start_bucket);
+ /* If it was a one bucket match */
+ if (dptr == ctx->head_start_bucket) {
+ ctx->tag_start_bucket = tmp_bkt;
+ ctx->tag_start_index = tagStart - ctx->head_start_index;
+ }
+ ctx->head_start_bucket = tmp_bkt;
+ ctx->head_start_index = 0;
+ }
+ return ctx->head_start_bucket;
+}
+
/* This function returns either a pointer to the split bucket containing the
* first byte of the BEGINNING_SEQUENCE (after finding a complete match) or it
* returns NULL if no match found.
@@ -217,8 +324,8 @@
const char *c;
const char *buf;
const char *str = STARTING_SEQUENCE;
- apr_bucket *tmp_bkt;
- apr_size_t start_index;
+ int slen = sizeof(STARTING_SEQUENCE) - 1;
+ int pos;
*do_cleanup = 0;
@@ -269,8 +376,53 @@
if (len == 0) { /* end of pipe? */
break;
}
+
+ /* Set our buffer to use. */
c = buf;
- while (c < buf + len) {
+
+ /* The last bucket had a left over partial match that we need to
+ * complete.
+ */
+ if (ctx->state == PARSE_HEAD)
+ {
+ apr_size_t tmpLen;
+ tmpLen = (len > (slen - 1)) ? len : (slen - 1);
+
+ while (c < buf + tmpLen && *c == str[ctx->parse_pos])
+ {
+ c++;
+ ctx->parse_pos++;
+ }
+
+ if (str[ctx->parse_pos] == '\0')
+ {
+ ctx->bytes_parsed += c - buf;
+ return found_start_sequence(dptr, ctx, c - buf);
+ }
+
+ /* False alarm... */
+ ctx->state = PRE_HEAD;
+ }
+
+ if (len)
+ {
+ pos = bndm(str, slen, c, len, &start_seq_pat);
+ if (pos != len)
+ {
+ ctx->head_start_bucket = dptr;
+ ctx->head_start_index = pos;
+ ctx->bytes_parsed += pos + slen;
+ return found_start_sequence(dptr, ctx, pos + slen);
+ }
+ }
+
+ /* Consider the case where we have <!-- at the end of the bucket. */
+ ctx->bytes_parsed += len - slen;
+ ctx->parse_pos = 0;
+
+ c = buf + len - slen + 1;
+ while (c < buf + len)
+ {
if (*c == str[ctx->parse_pos]) {
if (ctx->state == PRE_HEAD) {
ctx->state = PARSE_HEAD;
@@ -279,48 +431,24 @@
}
ctx->parse_pos++;
}
- else {
- if (str[ctx->parse_pos] == '\0') {
- /* We want to split the bucket at the '<'. */
- ctx->bytes_parsed++;
- ctx->state = PARSE_DIRECTIVE;
- ctx->tag_length = 0;
- ctx->parse_pos = 0;
- ctx->tag_start_bucket = dptr;
- ctx->tag_start_index = c - buf;
- if (ctx->head_start_index > 0) {
- start_index = (c - buf) - ctx->head_start_index;
- apr_bucket_split(ctx->head_start_bucket,
- ctx->head_start_index);
- tmp_bkt = APR_BUCKET_NEXT(ctx->head_start_bucket);
- if (dptr == ctx->head_start_bucket) {
- ctx->tag_start_bucket = tmp_bkt;
- ctx->tag_start_index = start_index;
- }
- ctx->head_start_bucket = tmp_bkt;
- ctx->head_start_index = 0;
- }
- return ctx->head_start_bucket;
+ else if (ctx->parse_pos != 0)
+ {
+ /* The reason for this, is that we need to make sure
+ * that we catch cases like <<!--#. This makes the
+ * second check after the original check fails.
+ * If parse_pos was already 0 then we already checked this.
+ */
+ /* FIXME: Why? */
+ *do_cleanup = 1;
+ if (*c == str[0]) {
+ ctx->parse_pos = 1;
+ ctx->head_start_index = c - buf;
}
- else if (ctx->parse_pos != 0) {
- /* The reason for this, is that we need to make sure
- * that we catch cases like <<!--#. This makes the
- * second check after the original check fails.
- * If parse_pos was already 0 then we already checked this.
- */
- *do_cleanup = 1;
- if (*c == str[0]) {
- ctx->parse_pos = 1;
- ctx->state = PARSE_HEAD;
- ctx->head_start_bucket = dptr;
- ctx->head_start_index = c - buf;
- }
- else {
- ctx->parse_pos = 0;
- ctx->state = PRE_HEAD;
- ctx->head_start_bucket = NULL;
- ctx->head_start_index = 0;
- }
+ else {
+ ctx->parse_pos = 0;
+ ctx->state = PRE_HEAD;
+ ctx->head_start_bucket = NULL;
+ ctx->head_start_index = 0;
}
}
c++;
@@ -2970,6 +3098,10 @@
apr_pool_t *ptemp, server_rec *s)
{
include_hash = apr_hash_make(p);
+
+ /* compile the pattern used by find_start_sequence */
+ bndm_compile(&start_seq_pat, STARTING_SEQUENCE,
+ sizeof(STARTING_SEQUENCE)-1);
ssi_pfn_register = APR_RETRIEVE_OPTIONAL_FN(ap_register_include_handler);
|