summaryrefslogtreecommitdiff
path: root/posix/regcomp.c
diff options
context:
space:
mode:
authorUlrich Drepper <drepper@redhat.com>2003-11-12 19:09:20 +0000
committerUlrich Drepper <drepper@redhat.com>2003-11-12 19:09:20 +0000
commit14744156b935eb7fb1a2013fdc3ce6613defa94d (patch)
treeb6dfa5ab8046feddf82d275f25b9baee162dddfc /posix/regcomp.c
parent3c0fb5745f66c8920ed4cfa8d3ead55216b15ec1 (diff)
Update.
* posix/regcomp.c (optimize_utf8): New function.
(re_compile_fastmap_iter): Use dfa->mb_cur_max > 1 instead of !icase.
(re_compile_internal): Call optimize_utf8 if not case insensitive and in UTF-8 locale.
* posix/regex_internal.h: Ifdef out some prototypes if RE_NO_INTERNAL_PROTOTYPES is defined to shut up warnings.
* posix/Makefile (tests): Add bug-regex20.
(bug-regex20-ENV): Add LOCPATH.
* posix/bug-regex20.c: New test.
2003-11-12 Jakub Jelinek <jakub@redhat.com>
Diffstat (limited to 'posix/regcomp.c')
-rw-r--r--posix/regcomp.c68
1 files changed, 66 insertions, 2 deletions
diff --git a/posix/regcomp.c b/posix/regcomp.c
index 82d4bb1c57..ce91ef6807 100644
--- a/posix/regcomp.c
+++ b/posix/regcomp.c
@@ -30,6 +30,9 @@ static void free_charset (re_charset_t *cset);
#endif /* RE_ENABLE_I18N */
static void free_workarea_compile (regex_t *preg);
static reg_errcode_t create_initial_state (re_dfa_t *dfa);
+#ifdef RE_ENABLE_I18N
+static void optimize_utf8 (re_dfa_t *dfa);
+#endif
static reg_errcode_t analyze (re_dfa_t *dfa);
static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
@@ -322,7 +325,7 @@ re_compile_fastmap_iter (bufp, init_state, fastmap)
{
re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
#ifdef RE_ENABLE_I18N
- if ((bufp->syntax & RE_ICASE) && !icase)
+ if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
{
unsigned char *buf = alloca (dfa->mb_cur_max), *p;
wchar_t wc;
@@ -389,7 +392,7 @@ re_compile_fastmap_iter (bufp, init_state, fastmap)
memset (&state, '\0', sizeof (state));
__wcrtomb (buf, cset->mbchars[i], &state);
re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
- if ((bufp->syntax & RE_ICASE) && !icase)
+ if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
{
__wcrtomb (buf, towlower (cset->mbchars[i]), &state);
re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
@@ -760,6 +763,12 @@ re_compile_internal (preg, pattern, length, syntax)
if (BE (dfa->str_tree == NULL, 0))
goto re_compile_internal_free_return;
+#ifdef RE_ENABLE_I18N
+ /* If possible, do searching in single byte encoding to speed things up. */
+ if (dfa->is_utf8 && !(syntax & RE_ICASE))
+ optimize_utf8 (dfa);
+#endif
+
/* Analyze the tree and collect information which is necessary to
create the dfa. */
err = analyze (dfa);
@@ -945,6 +954,61 @@ create_initial_state (dfa)
return REG_NOERROR;
}
+#ifdef RE_ENABLE_I18N
+/* Try to switch DFA from UTF-8 to single byte searching to speed things up.
+ Every node is inspected; if any is a CHARACTER >= 0x80, an ANCHOR other than line/buffer start/end, or a node type not listed below, DFA is left unchanged.
+ Otherwise set dfa->mb_cur_max to 1, clear is_utf8 and leave has_mb_node set only when dfa->nbackref > 0. */
+
+static void
+optimize_utf8 (dfa)
+ re_dfa_t *dfa;
+{
+ int node;
+
+ for (node = 0; node < dfa->nodes_len; ++node)
+ switch (dfa->nodes[node].type)
+ {
+ case CHARACTER:
+ /* Chars >= 0x80 are optimizable in some cases (e.g. when not
+ followed by DUP operator, not in bracket etc.).
+ For now punt on them all, i.e. return without modifying DFA. */
+ if (dfa->nodes[node].opr.c >= 0x80)
+ return;
+ break;
+ case ANCHOR:
+ switch (dfa->nodes[node].opr.idx)
+ {
+ case LINE_FIRST:
+ case LINE_LAST:
+ case BUF_FIRST:
+ case BUF_LAST:
+ break;
+ default:
+ /* Word anchors etc. cannot be handled; return without modifying DFA. */
+ return;
+ }
+ break;
+ case OP_BACK_REF:
+ case OP_ALT:
+ case END_OF_RE:
+ case BACK_SLASH:
+ case OP_DUP_ASTERISK:
+ case OP_DUP_QUESTION:
+ case OP_DUP_PLUS:
+ case OP_OPEN_SUBEXP:
+ case OP_CLOSE_SUBEXP:
+ break;
+ default:
+ return; /* Any node type not listed above: give up, DFA stays as it was. */
+ }
+
+ /* No node forced an early return, so the search can be in single byte locale. */
+ dfa->mb_cur_max = 1;
+ dfa->is_utf8 = 0;
+ dfa->has_mb_node = dfa->nbackref > 0;
+}
+#endif
+
/* Analyze the structure tree, and calculate "first", "next", "edest",
"eclosure", and "inveclosure". */