[augeas-devel] augeas: master - libfa: handle regexps with embedded NUL characters

David Lutterkort lutter at fedoraproject.org
Tue Sep 1 18:09:08 UTC 2009


Gitweb:        http://git.fedorahosted.org/git/augeas.git?p=augeas.git;a=commitdiff;h=40b84befb36130bcb91e3597556384439c34f9ab
Commit:        40b84befb36130bcb91e3597556384439c34f9ab
Parent:        82152586f01e19d8ed4bc144f83855328d1196dc
Author:        David Lutterkort <lutter at redhat.com>
AuthorDate:    Fri Aug 14 18:54:41 2009 -0700
Committer:     David Lutterkort <lutter at redhat.com>
CommitterDate: Mon Aug 31 14:36:27 2009 -0700

libfa: handle regexps with embedded NUL characters

So far, the API promised that regexps could contain embedded NUL characters, but that wasn't implemented.
---
 src/fa.c       |  246 +++++++++++++++++++++++++++++++++++---------------------
 tests/fatest.c |   24 +++++-
 2 files changed, 176 insertions(+), 94 deletions(-)

diff --git a/src/fa.c b/src/fa.c
index 491ec63..d3edc3c 100644
--- a/src/fa.c
+++ b/src/fa.c
@@ -133,7 +133,7 @@ static void bitset_free(bitset *bs) {
 
 /*
  * Representation of a parsed regular expression. The regular expression is
- * parsed according to the following grammar by RE_PARSE:
+ * parsed according to the following grammar by PARSE_REGEXP:
  *
  * regexp:        concat_exp ('|' regexp)?
  * concat_exp:    repeated_exp concat_exp?
@@ -152,8 +152,6 @@ static void bitset_free(bitset *bs) {
  *           | CHAR
  */
 
-static const char * const special_chars = ".()[]{}*+?";
-
 enum re_type {
     UNION,
     CONCAT,
@@ -195,6 +193,12 @@ struct re_parse {
     int         error;       /* error code */
 };
 
+/* String with explicit length, used when converting re to string */
+struct re_str {
+    char  *rx;
+    size_t len;
+};
+
 static struct re *parse_regexp(struct re_parse *parse);
 
 /* A map from a set of states to a state. */
@@ -2627,7 +2631,7 @@ static bool match(struct re_parse *parse, char m) {
 }
 
 static bool peek(struct re_parse *parse, const char *chars) {
-    return strchr(chars, *parse->rx) != NULL;
+    return *parse->rx != '\0' && strchr(chars, *parse->rx) != NULL;
 }
 
 static bool next(struct re_parse *parse, char *c) {
@@ -2730,15 +2734,15 @@ static struct re *parse_simple_exp(struct re_parse *parse) {
             goto error;
         }
         add_re_char(re, '\n', '\n');
-    } else {
-        if (more(parse)) {
-            char c;
-            if (!parse_char(parse, 1, &c)) {
-                parse->error = REG_EESCAPE;
-                goto error;
-            }
-            re = make_re_char(c);
+    } else if (more(parse)) {
+        char c;
+        if (!parse_char(parse, 1, &c)) {
+            parse->error = REG_EESCAPE;
+            goto error;
         }
+        re = make_re_char(c);
+    } else {
+        re = make_re(EPSILON);
     }
     return re;
  error:
@@ -2861,7 +2865,18 @@ static struct re *parse_regexp(struct re_parse *parse) {
  * we try to be clever and avoid unneeded parens and concatenation with
  * epsilon etc.
  */
-static char *re_as_string(const struct re *re);
+static int re_as_string(const struct re *re, struct re_str *str);
+
+static void release_re_str(struct re_str *str) {
+    if (str == NULL)
+        return;
+    FREE(str->rx);
+    str->len = 0;
+}
+
+static int re_str_alloc(struct re_str *str) {
+    return ALLOC_N(str->rx, str->len + 1);
+}
 
 static int re_binop_count(enum re_type type, const struct re *re) {
     assert(type == CONCAT || type == UNION);
@@ -2885,13 +2900,13 @@ static int re_binop_store(enum re_type type, const struct re *re,
     return pos;
 }
 
-static char *re_union_as_string(const struct re *re) {
+static int re_union_as_string(const struct re *re, struct re_str *str) {
     assert(re->type == UNION);
 
+    int result = -1;
     const struct re **res = NULL;
-    char **strings = NULL;
+    struct re_str *strings = NULL;
     int nre = 0, r;
-    char *result = NULL;
 
     nre = re_binop_count(re->type, re);
     r = ALLOC_N(res, nre);
@@ -2902,34 +2917,41 @@ static char *re_union_as_string(const struct re *re) {
 
     r = ALLOC_N(strings, nre);
     if (r < 0)
-        goto done;
+        goto error;
 
-    int len = 0;
+    str->len = 0;
     for (int i=0; i < nre; i++) {
-        strings[i] = re_as_string(res[i]);
-        len += strlen(strings[i]);
+        if (re_as_string(res[i], strings + i) < 0)
+            goto error;
+        str->len += strings[i].len;
     }
-    len += (nre-1) + 1;
+    str->len += nre-1;
 
-    r = ALLOC_N(result, len);
+    r = re_str_alloc(str);
     if (r < 0)
-        goto done;
+        goto error;
 
-    char *p = result;
+    char *p = str->rx;
     for (int i=0; i < nre; i++) {
         if (i>0)
             *p++ = '|';
-        p = stpcpy(p, strings[i]);
+        memcpy(p, strings[i].rx, strings[i].len);
+        p += strings[i].len;
     }
 
+    result = 0;
  done:
     free(res);
     if (strings != NULL) {
         for (int i=0; i < nre; i++)
-            free(strings[i]);
+            release_re_str(strings + i);
     }
     free(strings);
     return result;
+ error:
+    release_re_str(str);
+    result = -1;
+    goto done;
 }
 
 ATTRIBUTE_PURE
@@ -2940,61 +2962,67 @@ static int re_needs_parens_in_concat(const struct re *re) {
     return (re->type != CHAR && re->type != CSET);
 }
 
-static char *re_concat_as_string(const struct re *re) {
+static int re_concat_as_string(const struct re *re, struct re_str *str) {
     assert(re->type == CONCAT);
 
     const struct re **res = NULL;
-    char **strings = NULL;
+    struct re_str *strings = NULL;
     int nre = 0, r;
-    char *result = NULL;
+    int result = -1;
 
     nre = re_binop_count(re->type, re);
     r = ALLOC_N(res, nre);
     if (r < 0)
-        goto done;
+        goto error;
     re_binop_store(re->type, re, res);
 
     r = ALLOC_N(strings, nre);
     if (r < 0)
-        goto done;
+        goto error;
 
-    int len = 0;
+    str->len = 0;
     for (int i=0; i < nre; i++) {
         if (res[i]->type == EPSILON)
             continue;
-        strings[i] = re_as_string(res[i]);
-        len += strlen(strings[i]);
+        if (re_as_string(res[i], strings + i) < 0)
+            goto error;
+        str->len += strings[i].len;
         if (re_needs_parens_in_concat(res[i]))
-            len += 2;
+            str->len += 2;
     }
-    len += 1;
 
-    r = ALLOC_N(result, len);
+    r = re_str_alloc(str);
     if (r < 0)
-        goto done;
+        goto error;
 
-    char *p = result;
+    char *p = str->rx;
     for (int i=0; i < nre; i++) {
         if (res[i]->type == EPSILON)
             continue;
         if (re_needs_parens_in_concat(res[i]))
             *p++ = '(';
-        p = stpcpy(p, strings[i]);
+        p = memcpy(p, strings[i].rx, strings[i].len);
+        p += strings[i].len;
         if (re_needs_parens_in_concat(res[i]))
             *p++ = ')';
     }
 
+    result = 0;
  done:
     free(res);
     if (strings != NULL) {
         for (int i=0; i < nre; i++)
-            free(strings[i]);
+            release_re_str(strings + i);
     }
     free(strings);
     return result;
+ error:
+    release_re_str(str);
+    result = -1;
+    goto done;
 }
 
-static char *re_cset_as_string(const struct re *re) {
+static int re_cset_as_string(const struct re *re, struct re_str *str) {
     const uchar rbrack = ']';
     const uchar dash = '-';
     const uchar nul = '\0';
@@ -3003,13 +3031,13 @@ static char *re_cset_as_string(const struct re *re) {
     static const char *const total_set = "(.|\n)";
     static const char *const not_newline = ".";
 
-    char *result = NULL, *s;
+    char *s;
     int from, to, negate;
-    size_t set_len, len;
+    size_t len;
     int incl_rbrack, incl_dash;
     int r;
 
-    set_len = strlen(empty_set);
+    str->len = strlen(empty_set);
 
     /* We can not include NUL explicitly in a CSET since we use ordinary
        NUL delimited strings to represent them. That means that we need to
@@ -3022,7 +3050,8 @@ static char *re_cset_as_string(const struct re *re) {
              from += 1);
         if (from > UCHAR_MAX) {
             /* Special case: the set matches every character */
-            return strdup(total_set);
+            str->rx = strdup(total_set);
+            goto done;
         }
         if (from == '\n') {
             for (from += 1;
@@ -3030,7 +3059,8 @@ static char *re_cset_as_string(const struct re *re) {
                  from += 1);
             if (from > UCHAR_MAX) {
                 /* Special case: the set matches everything but '\n' */
-                return strdup(not_newline);
+                str->rx = strdup(not_newline);
+                goto done;
             }
         }
     }
@@ -3064,17 +3094,17 @@ static char *re_cset_as_string(const struct re *re) {
             incl_rbrack = 0;
         if (from < dash && dash < to)
             incl_dash = 0;
-        set_len += len;
+        str->len += len;
     }
-    set_len += incl_rbrack + incl_dash;
+    str->len += incl_rbrack + incl_dash;
     if (negate)
-        set_len += 1;        /* For the ^ */
+        str->len += 1;        /* For the ^ */
 
-    r = ALLOC_N(result, set_len + 1);
+    r = re_str_alloc(str);
     if (r < 0)
-        return NULL;
+        goto error;
 
-    s = result;
+    s = str->rx;
     *s++ = '[';
     if (negate)
         *s++ = '^';
@@ -3112,18 +3142,23 @@ static char *re_cset_as_string(const struct re *re) {
         *s++ = dash;
 
     *s = ']';
-
-    return result;
+ done:
+    if (str->rx == NULL)
+        goto error;
+    str->len = strlen(str->rx);
+    return 0;
+ error:
+    release_re_str(str);
+    return -1;
 }
 
-static char *re_iter_as_string(const struct re *re) {
+static int re_iter_as_string(const struct re *re, struct re_str *str) {
     const char *quant = NULL;
-    char *result, *exp;
-    int r;
+    char *iter = NULL;
+    int r, result = -1;
 
-    exp = re_as_string(re->exp);
-    if (exp == NULL)
-        return NULL;
+    if (re_as_string(re->exp, str) < 0)
+        return -1;
 
     if (re->min == 0 && re->max == -1) {
         quant = "*";
@@ -3131,57 +3166,76 @@ static char *re_iter_as_string(const struct re *re) {
         quant = "+";
     } else if (re->min == 0 && re->max == 1) {
         quant = "?";
+    } else {
+        r = asprintf(&iter, "{%d,%d}", re->min, re->max);
+        if (r < 0)
+            return -1;
+        quant = iter;
     }
 
     if (re->exp->type == CHAR || re->exp->type == CSET) {
-        if (quant == NULL) {
-            r = asprintf(&result, "%s{%d,%d}", exp, re->min, re->max);
-        } else {
-            r = asprintf(&result, "%s%s", exp, quant);
-        }
+        if (REALLOC_N(str->rx, str->len + strlen(quant) + 1) < 0)
+            goto error;
+        strcpy(str->rx + str->len, quant);
+        str->len += strlen(quant);
     } else {
-        if (quant == NULL) {
-            r = asprintf(&result, "(%s){%d,%d}", exp, re->min, re->max);
-        } else {
-            r = asprintf(&result, "(%s)%s", exp, quant);
-        }
+        /* Format '(' + str->rx + ')' + quant */
+        if (REALLOC_N(str->rx, str->len + strlen(quant) + 1 + 2) < 0)
+            goto error;
+        memmove(str->rx + 1, str->rx, str->len);
+        str->rx[0] = '(';
+        str->rx[str->len + 1] = ')';
+        str->len += 2;
+        strcpy(str->rx + str->len, quant);
+        str->len += strlen(quant);
     }
-    FREE(exp);
 
-    return (r < 0) ? NULL : result;
+    result = 0;
+ done:
+    FREE(iter);
+    return result;
+ error:
+    release_re_str(str);
+    goto done;
 }
 
-static char *re_as_string(const struct re *re) {
-    char *result = NULL;
+static int re_as_string(const struct re *re, struct re_str *str) {
+    /* Characters that must be escaped */
+    static const char * const special_chars = ".()[]{}*+?\\";
+    int result = 0;
+
     switch(re->type) {
     case UNION:
-        result = re_union_as_string(re);
+        result = re_union_as_string(re, str);
         break;
     case CONCAT:
-        result = re_concat_as_string(re);
+        result = re_concat_as_string(re, str);
         break;
     case CSET:
-        result = re_cset_as_string(re);
+        result = re_cset_as_string(re, str);
         break;
     case CHAR:
-        if (re->c != '\\' && strchr(special_chars, re->c) == NULL) {
-            if (ALLOC_N(result, 2) == 0) {
-                result[0] = re->c;
-            }
+        if (re->c == '\0' || strchr(special_chars, re->c) == NULL) {
+            if (ALLOC_N(str->rx, 2) < 0)
+                goto error;
+            str->rx[0] = re->c;
+            str->len = 1;
         } else {
-            if (ALLOC_N(result, 3) == 0) {
-                result[0] = '\\';
-                result[1] = re->c;
-            }
+            if (ALLOC_N(str->rx, 3) < 0)
+                goto error;
+            str->rx[0] = '\\';
+            str->rx[1] = re->c;
+            str->len = strlen(str->rx);
         }
         break;
     case ITER:
-        result = re_iter_as_string(re);
+        result = re_iter_as_string(re, str);
         break;
     case EPSILON:
-        if (ALLOC_N(result, 3) == 0) {
-            strcpy(result, "()");
-        }
+        if (ALLOC_N(str->rx, 3) < 0)
+            goto error;
+        strcpy(str->rx, "()");
+        str->len = strlen(str->rx);
         break;
     default:
         assert(0);
@@ -3189,6 +3243,9 @@ static char *re_as_string(const struct re *re) {
         break;
     }
     return result;
+ error:
+    release_re_str(str);
+    return -1;
 }
 
 static int convert_trans_to_re(struct state *s) {
@@ -3394,7 +3451,12 @@ int fa_as_regexp(struct fa *fa, char **regexp, size_t *regexp_len) {
 
     for_each_trans(t, fa->initial) {
         if (t->to == fin) {
-            *regexp = re_as_string(t->re);
+            struct re_str str;
+            MEMZERO(&str, 1);
+            if (re_as_string(t->re, &str) < 0)
+                goto error;
+            *regexp = str.rx;
+            *regexp_len = str.len;
         }
     }
 
@@ -3404,8 +3466,6 @@ int fa_as_regexp(struct fa *fa, char **regexp, size_t *regexp_len) {
         }
     }
     fa_free(fa);
-    if (*regexp != NULL)
-        *regexp_len = strlen(*regexp);
 
     return 0;
  error:
diff --git a/tests/fatest.c b/tests/fatest.c
index bf1071d..e17990e 100644
--- a/tests/fatest.c
+++ b/tests/fatest.c
@@ -85,7 +85,10 @@ static void assertAsRegexp(CuTest *tc, struct fa *fa) {
     r = fa_as_regexp(fa1, &re, &re_len);
     CuAssertIntEquals(tc, 0, r);
 
-    r = fa_compile(re, strlen(re), &fa2);
+    r = fa_compile(re, re_len, &fa2);
+    if (r != REG_NOERROR) {
+        print_regerror(r, re);
+    }
     CuAssertIntEquals(tc, REG_NOERROR, r);
 
     CuAssertTrue(tc, fa_equals(fa, fa2));
@@ -439,6 +442,24 @@ static void testRangeEnd(CuTest *tc) {
     make_fa(tc, re, strlen(re), REG_ERANGE);
 }
 
+static void testNul(CuTest *tc) {
+    static const char *const re0 = "a\0b";
+    int re0_len = 3;
+
+    struct fa *fa1 = make_fa(tc, "a\0b", re0_len, REG_NOERROR);
+    struct fa *fa2 = make_good_fa(tc, "a.b");
+    char *re;
+    size_t re_len;
+    int r;
+
+    CuAssertTrue(tc, fa_contains(fa1, fa2));
+
+    r = fa_as_regexp(fa1, &re, &re_len);
+    CuAssertIntEquals(tc, 0, r);
+    CuAssertIntEquals(tc, re0_len, re_len);
+    CuAssertIntEquals(tc, 0, memcmp(re0, re, re0_len));
+}
+
 int main(int argc, char **argv) {
     if (argc == 1) {
         char *output = NULL;
@@ -458,6 +479,7 @@ int main(int argc, char **argv) {
         SUITE_ADD_TEST(suite, testAsRegexp);
         SUITE_ADD_TEST(suite, testAsRegexpMinus);
         SUITE_ADD_TEST(suite, testRangeEnd);
+        SUITE_ADD_TEST(suite, testNul);
 
         CuSuiteRun(suite);
         CuSuiteSummary(suite, &output);




More information about the augeas-devel mailing list