[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]

[lvm-devel] Using glibc regex



This patch replaces internal  dm_regex and uses  glibc regex routines so we
achieve better performance for things like hundreds patterns.

Needs some testing...

Zdenek

>From 844352bc6974d0bca718af3832ad7512cec1fb0b Mon Sep 17 00:00:00 2001
From: Zdenek Kabelac <zkabelac redhat com>
Date: Tue, 20 Apr 2010 16:04:15 +0200
Subject: [PATCH] Switch to glibc regex to speedup pattern matching.

Signed-off-by: Zdenek Kabelac <zkabelac redhat com>
---
 libdm/regex/matcher.c |  351 +++++--------------------------------------------
 1 files changed, 31 insertions(+), 320 deletions(-)

diff --git a/libdm/regex/matcher.c b/libdm/regex/matcher.c
index d9c2d8a..6ca3faa 100644
--- a/libdm/regex/matcher.c
+++ b/libdm/regex/matcher.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) 2001-2004 Sistina Software, Inc. All rights reserved.  
- * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2004-2010 Red Hat, Inc. All rights reserved.
  *
  * This file is part of the device-mapper userspace tools.
  *
@@ -14,345 +14,56 @@
  */
 
 #include "dmlib.h"
-#include "parse_rx.h"
-#include "ttree.h"
-#include "assert.h"
+#include <regex.h>
 
-struct dfa_state {
-	int final;
-	struct dfa_state *lookup[256];
+struct dm_regex {
+	int num_patterns;       /* Number of compiled regex patterns */
+	regex_t regex[0];       /* Compiled regex */
 };
 
-struct state_queue {
-	struct dfa_state *s;
-	dm_bitset_t bits;
-	struct state_queue *next;
-};
-
-struct dm_regex {		/* Instance variables for the lexer */
-	struct dfa_state *start;
-	unsigned num_nodes;
-	int nodes_entered;
-	struct rx_node **nodes;
-	struct dm_pool *scratch, *mem;
-};
-
-#define TARGET_TRANS '\0'
-
-static int _count_nodes(struct rx_node *rx)
-{
-	int r = 1;
-
-	if (rx->left)
-		r += _count_nodes(rx->left);
-
-	if (rx->right)
-		r += _count_nodes(rx->right);
-
-	return r;
-}
-
-static void _fill_table(struct dm_regex *m, struct rx_node *rx)
-{
-	assert((rx->type != OR) || (rx->left && rx->right));
-
-	if (rx->left)
-		_fill_table(m, rx->left);
-
-	if (rx->right)
-		_fill_table(m, rx->right);
-
-	m->nodes[m->nodes_entered++] = rx;
-}
-
-static void _create_bitsets(struct dm_regex *m)
-{
-	int i;
-
-	for (i = 0; i < m->num_nodes; i++) {
-		struct rx_node *n = m->nodes[i];
-		n->firstpos = dm_bitset_create(m->scratch, m->num_nodes);
-		n->lastpos = dm_bitset_create(m->scratch, m->num_nodes);
-		n->followpos = dm_bitset_create(m->scratch, m->num_nodes);
-	}
-}
-
-static void _calc_functions(struct dm_regex *m)
-{
-	int i, j, final = 1;
-	struct rx_node *rx, *c1, *c2;
-
-	for (i = 0; i < m->num_nodes; i++) {
-		rx = m->nodes[i];
-		c1 = rx->left;
-		c2 = rx->right;
-
-		if (dm_bit(rx->charset, TARGET_TRANS))
-			rx->final = final++;
-
-		switch (rx->type) {
-		case CAT:
-			if (c1->nullable)
-				dm_bit_union(rx->firstpos,
-					  c1->firstpos, c2->firstpos);
-			else
-				dm_bit_copy(rx->firstpos, c1->firstpos);
-
-			if (c2->nullable)
-				dm_bit_union(rx->lastpos,
-					  c1->lastpos, c2->lastpos);
-			else
-				dm_bit_copy(rx->lastpos, c2->lastpos);
-
-			rx->nullable = c1->nullable && c2->nullable;
-			break;
-
-		case PLUS:
-			dm_bit_copy(rx->firstpos, c1->firstpos);
-			dm_bit_copy(rx->lastpos, c1->lastpos);
-			rx->nullable = c1->nullable;
-			break;
-
-		case OR:
-			dm_bit_union(rx->firstpos, c1->firstpos, c2->firstpos);
-			dm_bit_union(rx->lastpos, c1->lastpos, c2->lastpos);
-			rx->nullable = c1->nullable || c2->nullable;
-			break;
-
-		case QUEST:
-		case STAR:
-			dm_bit_copy(rx->firstpos, c1->firstpos);
-			dm_bit_copy(rx->lastpos, c1->lastpos);
-			rx->nullable = 1;
-			break;
-
-		case CHARSET:
-			dm_bit_set(rx->firstpos, i);
-			dm_bit_set(rx->lastpos, i);
-			rx->nullable = 0;
-			break;
-
-		default:
-			log_error(INTERNAL_ERROR "Unknown calc node type");
-		}
-
-		/*
-		 * followpos has it's own switch
-		 * because PLUS and STAR do the
-		 * same thing.
-		 */
-		switch (rx->type) {
-		case CAT:
-			for (j = 0; j < m->num_nodes; j++) {
-				if (dm_bit(c1->lastpos, j)) {
-					struct rx_node *n = m->nodes[j];
-					dm_bit_union(n->followpos,
-						  n->followpos, c2->firstpos);
-				}
-			}
-			break;
-
-		case PLUS:
-		case STAR:
-			for (j = 0; j < m->num_nodes; j++) {
-				if (dm_bit(rx->lastpos, j)) {
-					struct rx_node *n = m->nodes[j];
-					dm_bit_union(n->followpos,
-						  n->followpos, rx->firstpos);
-				}
-			}
-			break;
-		}
-	}
-}
-
-static struct dfa_state *_create_dfa_state(struct dm_pool *mem)
-{
-	return dm_pool_zalloc(mem, sizeof(struct dfa_state));
-}
-
-static struct state_queue *_create_state_queue(struct dm_pool *mem,
-					       struct dfa_state *dfa,
-					       dm_bitset_t bits)
-{
-	struct state_queue *r = dm_pool_alloc(mem, sizeof(*r));
-
-	if (!r) {
-		stack;
-		return NULL;
-	}
-
-	r->s = dfa;
-	r->bits = dm_bitset_create(mem, bits[0]);	/* first element is the size */
-	dm_bit_copy(r->bits, bits);
-	r->next = 0;
-	return r;
-}
-
-static int _calc_states(struct dm_regex *m, struct rx_node *rx)
-{
-	unsigned iwidth = (m->num_nodes / DM_BITS_PER_INT) + 1;
-	struct ttree *tt = ttree_create(m->scratch, iwidth);
-	struct state_queue *h, *t, *tmp;
-	struct dfa_state *dfa, *ldfa;
-	int i, a, set_bits = 0, count = 0;
-	dm_bitset_t bs, dfa_bits;
-
-	if (!tt)
-		return_0;
-
-	if (!(bs = dm_bitset_create(m->scratch, m->num_nodes)))
-		return_0;
-
-	/* create first state */
-	dfa = _create_dfa_state(m->mem);
-	m->start = dfa;
-	ttree_insert(tt, rx->firstpos + 1, dfa);
-
-	/* prime the queue */
-	h = t = _create_state_queue(m->scratch, dfa, rx->firstpos);
-	while (h) {
-		/* pop state off front of the queue */
-		dfa = h->s;
-		dfa_bits = h->bits;
-		h = h->next;
-
-		/* iterate through all the inputs for this state */
-		dm_bit_clear_all(bs);
-		for (a = 0; a < 256; a++) {
-			/* iterate through all the states in firstpos */
-			for (i = dm_bit_get_first(dfa_bits);
-			     i >= 0; i = dm_bit_get_next(dfa_bits, i)) {
-				if (dm_bit(m->nodes[i]->charset, a)) {
-					if (a == TARGET_TRANS)
-						dfa->final = m->nodes[i]->final;
-
-					dm_bit_union(bs, bs,
-						  m->nodes[i]->followpos);
-					set_bits = 1;
-				}
-			}
-
-			if (set_bits) {
-				ldfa = ttree_lookup(tt, bs + 1);
-				if (!ldfa) {
-					/* push */
-					ldfa = _create_dfa_state(m->mem);
-					ttree_insert(tt, bs + 1, ldfa);
-					tmp =
-					    _create_state_queue(m->scratch,
-								ldfa, bs);
-					if (!h)
-						h = t = tmp;
-					else {
-						t->next = tmp;
-						t = tmp;
-					}
-
-					count++;
-				}
-
-				dfa->lookup[a] = ldfa;
-				set_bits = 0;
-				dm_bit_clear_all(bs);
-			}
-		}
-	}
-
-	log_debug("Matcher built with %d dfa states", count);
-	return 1;
-}
-
 struct dm_regex *dm_regex_create(struct dm_pool *mem, const char **patterns,
 				 unsigned num_patterns)
 {
-	char *all, *ptr;
 	int i;
-	size_t len = 0;
-	struct rx_node *rx;
-	struct dm_pool *scratch = dm_pool_create("regex matcher", 10 * 1024);
 	struct dm_regex *m;
 
-	if (!scratch)
+	if (!(m = dm_pool_alloc(mem, sizeof(*m) + num_patterns * sizeof(regex_t))))
 		return_NULL;
 
-	if (!(m = dm_pool_alloc(mem, sizeof(*m)))) {
-		dm_pool_destroy(scratch);
-		return_NULL;
-	}
-
-	memset(m, 0, sizeof(*m));
-
-	/* join the regexps together, delimiting with zero */
-	for (i = 0; i < num_patterns; i++)
-		len += strlen(patterns[i]) + 8;
-
-	ptr = all = dm_pool_alloc(scratch, len + 1);
-
-	if (!all)
-		goto_bad;
-
 	for (i = 0; i < num_patterns; i++) {
-		ptr += sprintf(ptr, "(.*(%s)%c)", patterns[i], TARGET_TRANS);
-		if (i < (num_patterns - 1))
-			*ptr++ = '|';
-	}
-
-	/* parse this expression */
-	if (!(rx = rx_parse_tok(scratch, all, ptr))) {
-		log_error("Couldn't parse regex");
-		goto bad;
+		log_debug("Pattern %d/%d: %s", i, num_patterns, patterns[i]);
+		if (regcomp(&m->regex[i], patterns[i], REG_EXTENDED) != 0) {
+			log_error("Failed to compile pattern: %s\n", patterns[i]);
+			dm_pool_free(mem, m);
+			return_NULL;
+		}
 	}
 
-	m->mem = mem;
-	m->scratch = scratch;
-	m->num_nodes = _count_nodes(rx);
-	m->nodes = dm_pool_alloc(scratch, sizeof(*m->nodes) * m->num_nodes);
-
-	if (!m->nodes)
-		goto_bad;
-
-	_fill_table(m, rx);
-	_create_bitsets(m);
-	_calc_functions(m);
-	_calc_states(m, rx);
-	dm_pool_destroy(scratch);
-	m->scratch = NULL;
+	m->num_patterns = num_patterns;
 
 	return m;
-
-      bad:
-	dm_pool_destroy(scratch);
-	dm_pool_free(mem, m);
-	return NULL;
-}
-
-static struct dfa_state *_step_matcher(int c, struct dfa_state *cs, int *r)
-{
-	if (!(cs = cs->lookup[(unsigned char) c]))
-		return NULL;
-
-	if (cs->final && (cs->final > *r))
-		*r = cs->final;
-
-	return cs;
 }
 
+/**
+ * Match string against set of regular expressions.
+ * Note: testing from the last regex pattern.
+ *
+ * \param regex
+ * Structure dm_regex holds precompiled regex patterns.
+ *
+ * \param s
+ * String for matching.
+ *
+ * \return
+ * Index of matched regular expresion or -1 if there is no match.
+ */
 int dm_regex_match(struct dm_regex *regex, const char *s)
 {
-	struct dfa_state *cs = regex->start;
-	int r = 0;
-
-	if (!(cs = _step_matcher(HAT_CHAR, cs, &r)))
-		goto out;
-
-	for (; *s; s++)
-		if (!(cs = _step_matcher(*s, cs, &r)))
-			goto out;
+	int i;
 
-	_step_matcher(DOLLAR_CHAR, cs, &r);
+	for (i = regex->num_patterns - 1; i >= 0; --i)
+		if (regexec(&regex->regex[i], s, 0, NULL, 0) == 0)
+			break;
 
-      out:
-	/* subtract 1 to get back to zero index */
-	return r - 1;
+	return i;
 }
-- 
1.7.0.1


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]