summary | refs | log | tree | commit | diff
path: root/posix
diff options
context:
space:
mode:
author Ulrich Drepper <drepper@redhat.com> 2003-11-16 07:14:28 +0000
committer Ulrich Drepper <drepper@redhat.com> 2003-11-16 07:14:28 +0000
commit f0c7c524bb92cdc42cc4e0f7ba1ddda865a4494c (patch)
tree f06d459ac9d5a2d7c02591cd5375dd7ad6ff7a1e /posix
parent 2def87644d44b41bb908d4ed150a110d4d9399ea (diff)
Update.
* posix/regex_internal.h: Add forward declaration of re_dfa_t. Replace last two parameters of re_string_allocate and re_string_construct with pointer to DFA. (re_dfa_t): Add map_notascii field.
* posix/regcomp.c (re_compile_internal): Adjust call of re_string_construct. (init_dfa): Initialize map_notascii.
* posix/regex_internal.c: Adjust definitions of re_string_allocate and re_string_construct. Pass DFA to re_string_construct_common. (re_string_construct_common): Adjust definition. Initialize map_notascii field. (build_wcs_upper_buffer): If map_notascii is zero use simplified method to map ASCII values to upper case.
* posix/regex.c: Include localeinfo.h.
* posix/regexec.c: Adjust call of re_string_allocate.
* locale/langinfo.h: Add _NL_CTYPE_MAP_TO_NONASCII.
* locale/localeinfo.h (LIMAGIC): Change value.
* locale/categories.def: Add entry for _NL_CTYPE_MAP_TO_NONASCII.
* locale/C-ctype.h: Likewise.
* locale/programs/ld-ctype.c: Compute whether any mapping maps from ASCII to non-ASCII value. Write out that value.
Diffstat (limited to 'posix')
-rw-r--r-- posix/regcomp.c 5
-rw-r--r-- posix/regex.c 26
-rw-r--r-- posix/regex_internal.c 166
-rw-r--r-- posix/regex_internal.h 18
-rw-r--r-- posix/regexec.c 9
5 files changed, 146 insertions, 78 deletions
diff --git a/posix/regcomp.c b/posix/regcomp.c
index 9f56b389df..68ce551c3a 100644
--- a/posix/regcomp.c
+++ b/posix/regcomp.c
@@ -748,8 +748,7 @@ re_compile_internal (preg, pattern, length, syntax)
#endif
err = re_string_construct (&regexp, pattern, length, preg->translate,
- syntax & RE_ICASE, dfa->mb_cur_max,
- dfa->is_utf8);
+ syntax & RE_ICASE, dfa);
if (BE (err != REG_NOERROR, 0))
{
re_free (dfa);
@@ -828,6 +827,8 @@ init_dfa (dfa, pat_len)
if (dfa->mb_cur_max > 1
&& strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
dfa->is_utf8 = 1;
+ dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
+ != 0);
#endif
if (BE (dfa->nodes == NULL || dfa->state_table == NULL
diff --git a/posix/regex.c b/posix/regex.c
index 98d86e1b80..f18178a479 100644
--- a/posix/regex.c
+++ b/posix/regex.c
@@ -20,25 +20,27 @@
#ifdef _LIBC
/* We have to keep the namespace clean. */
-# define regfree(preg) __regfree (preg)
-# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
-# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
-# define regerror(errcode, preg, errbuf, errbuf_size) \
+# define regfree(preg) __regfree (preg)
+# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
+# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
+# define regerror(errcode, preg, errbuf, errbuf_size) \
__regerror(errcode, preg, errbuf, errbuf_size)
-# define re_set_registers(bu, re, nu, st, en) \
+# define re_set_registers(bu, re, nu, st, en) \
__re_set_registers (bu, re, nu, st, en)
-# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
+# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
__re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
-# define re_match(bufp, string, size, pos, regs) \
+# define re_match(bufp, string, size, pos, regs) \
__re_match (bufp, string, size, pos, regs)
-# define re_search(bufp, string, size, startpos, range, regs) \
+# define re_search(bufp, string, size, startpos, range, regs) \
__re_search (bufp, string, size, startpos, range, regs)
-# define re_compile_pattern(pattern, length, bufp) \
+# define re_compile_pattern(pattern, length, bufp) \
__re_compile_pattern (pattern, length, bufp)
-# define re_set_syntax(syntax) __re_set_syntax (syntax)
-# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
+# define re_set_syntax(syntax) __re_set_syntax (syntax)
+# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
__re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
-# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
+# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
+
+# include "../locale/localeinfo.h"
#endif
/* POSIX says that <sys/types.h> must be included (by the caller) before
diff --git a/posix/regex_internal.c b/posix/regex_internal.c
index 329fc81de9..6f07bd4dd1 100644
--- a/posix/regex_internal.c
+++ b/posix/regex_internal.c
@@ -21,7 +21,7 @@
static void re_string_construct_common (const char *str, int len,
re_string_t *pstr,
RE_TRANSLATE_TYPE trans, int icase,
- int mb_cur_max, int is_utf8);
+ const re_dfa_t *dfa);
#ifdef RE_ENABLE_I18N
static int re_string_skip_chars (re_string_t *pstr, int new_raw_idx,
wint_t *last_wc);
@@ -47,17 +47,16 @@ static unsigned int inline calc_state_hash (const re_node_set *nodes,
re_string_reconstruct before using the object. */
static reg_errcode_t
-re_string_allocate (pstr, str, len, init_len, trans, icase,
- mb_cur_max, is_utf8)
+re_string_allocate (pstr, str, len, init_len, trans, icase, dfa)
re_string_t *pstr;
const char *str;
- int len, init_len, icase, mb_cur_max, is_utf8;
+ int len, init_len, icase;
RE_TRANSLATE_TYPE trans;
+ const re_dfa_t *dfa;
{
reg_errcode_t ret;
int init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
- re_string_construct_common (str, len, pstr, trans, icase,
- mb_cur_max, is_utf8);
+ re_string_construct_common (str, len, pstr, trans, icase, dfa);
pstr->stop = pstr->len;
ret = re_string_realloc_buffers (pstr, init_buf_len);
@@ -68,22 +67,22 @@ re_string_allocate (pstr, str, len, init_len, trans, icase,
: (unsigned char *) str);
pstr->mbs = MBS_ALLOCATED (pstr) ? pstr->mbs : pstr->mbs_case;
pstr->valid_len = (MBS_CASE_ALLOCATED (pstr) || MBS_ALLOCATED (pstr)
- || mb_cur_max > 1) ? pstr->valid_len : len;
+ || dfa->mb_cur_max > 1) ? pstr->valid_len : len;
return REG_NOERROR;
}
/* This function allocate the buffers, and initialize them. */
static reg_errcode_t
-re_string_construct (pstr, str, len, trans, icase, mb_cur_max, is_utf8)
+re_string_construct (pstr, str, len, trans, icase, dfa)
re_string_t *pstr;
const char *str;
- int len, icase, mb_cur_max, is_utf8;
+ int len, icase;
RE_TRANSLATE_TYPE trans;
+ const re_dfa_t *dfa;
{
reg_errcode_t ret;
- re_string_construct_common (str, len, pstr, trans, icase,
- mb_cur_max, is_utf8);
+ re_string_construct_common (str, len, pstr, trans, icase, dfa);
pstr->stop = pstr->len;
/* Set 0 so that this function can initialize whole buffers. */
pstr->valid_len = 0;
@@ -101,7 +100,7 @@ re_string_construct (pstr, str, len, trans, icase, mb_cur_max, is_utf8)
if (icase)
{
#ifdef RE_ENABLE_I18N
- if (mb_cur_max > 1)
+ if (dfa->mb_cur_max > 1)
build_wcs_upper_buffer (pstr);
else
#endif /* RE_ENABLE_I18N */
@@ -110,7 +109,7 @@ re_string_construct (pstr, str, len, trans, icase, mb_cur_max, is_utf8)
else
{
#ifdef RE_ENABLE_I18N
- if (mb_cur_max > 1)
+ if (dfa->mb_cur_max > 1)
build_wcs_buffer (pstr);
else
#endif /* RE_ENABLE_I18N */
@@ -167,20 +166,22 @@ re_string_realloc_buffers (pstr, new_buf_len)
static void
-re_string_construct_common (str, len, pstr, trans, icase, mb_cur_max, is_utf8)
+re_string_construct_common (str, len, pstr, trans, icase, dfa)
const char *str;
int len;
re_string_t *pstr;
RE_TRANSLATE_TYPE trans;
- int icase, mb_cur_max, is_utf8;
+ int icase;
+ const re_dfa_t *dfa;
{
memset (pstr, '\0', sizeof (re_string_t));
pstr->raw_mbs = (const unsigned char *) str;
pstr->len = len;
pstr->trans = trans;
pstr->icase = icase ? 1 : 0;
- pstr->mb_cur_max = mb_cur_max;
- pstr->is_utf8 = is_utf8;
+ pstr->mb_cur_max = dfa->mb_cur_max;
+ pstr->is_utf8 = dfa->is_utf8;
+ pstr->map_notascii = dfa->map_notascii;
}
#ifdef RE_ENABLE_I18N
@@ -253,47 +254,110 @@ build_wcs_upper_buffer (pstr)
/* Build the buffers from pstr->valid_len to either pstr->len or
pstr->bufs_len. */
end_idx = (pstr->bufs_len > pstr->len)? pstr->len : pstr->bufs_len;
- for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
- {
- wchar_t wc;
- remain_len = end_idx - byte_idx;
- prev_st = pstr->cur_state;
- mbclen = mbrtowc (&wc, ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
- + byte_idx), remain_len, &pstr->cur_state);
- if (BE (mbclen == (size_t) -2, 0))
+
+#ifdef _LIBC
+ /* The following optimization assumes that the wchar_t encoding is
+ always ISO 10646. */
+ if (! pstr->map_notascii && pstr->trans == NULL)
+ for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
+ if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
+ && mbsinit (&pstr->cur_state))
{
- /* The buffer doesn't have enough space, finish to build. */
- pstr->cur_state = prev_st;
- break;
+ /* In case of a singlebyte character. */
+ pstr->mbs[byte_idx]
+ = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
+ /* The next step uses the assumption that wchar_t is encoded
+ with ISO 10646: all ASCII values can be converted like this. */
+ pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
+ ++byte_idx;
}
- else if (mbclen == 1 || mbclen == (size_t) -1 || mbclen == 0)
+ else
{
- /* In case of a singlebyte character. */
- int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
- /* Apply the translation if we need. */
- if (pstr->trans != NULL && mbclen == 1)
+ wchar_t wc;
+ remain_len = end_idx - byte_idx;
+ prev_st = pstr->cur_state;
+ mbclen = mbrtowc (&wc,
+ ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
+ + byte_idx), remain_len, &pstr->cur_state);
+ if (BE (mbclen > 1, 1))
{
- ch = pstr->trans[ch];
- pstr->mbs_case[byte_idx] = ch;
+ if (iswlower (wc))
+ wcrtomb ((char *) pstr->mbs + byte_idx, towupper (wc),
+ &prev_st);
+ else
+ memcpy (pstr->mbs + byte_idx,
+ pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
+ pstr->wcs[byte_idx++] = towupper (wc);
+ /* Write paddings. */
+ for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
+ pstr->wcs[byte_idx++] = WEOF;
+ }
+ else if (mbclen == (size_t) -1 || mbclen == 0)
+ {
+ /* In case of a singlebyte character. */
+ int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
+ /* Apply the translation if we need. */
+ if (BE (pstr->trans != NULL, 0) && mbclen == 1)
+ {
+ ch = pstr->trans[ch];
+ pstr->mbs_case[byte_idx] = ch;
+ }
+ pstr->wcs[byte_idx] = towupper (wc);
+ pstr->mbs[byte_idx++] = toupper (ch);
+ if (BE (mbclen == (size_t) -1, 0))
+ pstr->cur_state = prev_st;
}
- pstr->wcs[byte_idx] = iswlower (wc) ? towupper (wc) : wc;
- pstr->mbs[byte_idx++] = islower (ch) ? toupper (ch) : ch;
- if (BE (mbclen == (size_t) -1, 0))
- pstr->cur_state = prev_st;
- }
- else /* mbclen > 1 */
- {
- if (iswlower (wc))
- wcrtomb ((char *) pstr->mbs + byte_idx, towupper (wc), &prev_st);
else
- memcpy (pstr->mbs + byte_idx,
- pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
- pstr->wcs[byte_idx++] = iswlower (wc) ? towupper (wc) : wc;
- /* Write paddings. */
- for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
- pstr->wcs[byte_idx++] = WEOF;
+ {
+ /* The buffer doesn't have enough space, finish to build. */
+ pstr->cur_state = prev_st;
+ break;
+ }
}
- }
+ else
+#endif
+ for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
+ {
+ wchar_t wc;
+ remain_len = end_idx - byte_idx;
+ prev_st = pstr->cur_state;
+ mbclen = mbrtowc (&wc,
+ ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
+ + byte_idx), remain_len, &pstr->cur_state);
+ if (mbclen == 1 || mbclen == (size_t) -1 || mbclen == 0)
+ {
+ /* In case of a singlebyte character. */
+ int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
+ /* Apply the translation if we need. */
+ if (BE (pstr->trans != NULL, 0) && mbclen == 1)
+ {
+ ch = pstr->trans[ch];
+ pstr->mbs_case[byte_idx] = ch;
+ }
+ pstr->wcs[byte_idx] = towupper (wc);
+ pstr->mbs[byte_idx++] = toupper (ch);
+ if (BE (mbclen == (size_t) -1, 0))
+ pstr->cur_state = prev_st;
+ }
+ else if (BE (mbclen != (size_t) -2, 1))
+ {
+ if (iswlower (wc))
+ wcrtomb ((char *) pstr->mbs + byte_idx, towupper (wc), &prev_st);
+ else
+ memcpy (pstr->mbs + byte_idx,
+ pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
+ pstr->wcs[byte_idx++] = towupper (wc);
+ /* Write paddings. */
+ for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
+ pstr->wcs[byte_idx++] = WEOF;
+ }
+ else
+ {
+ /* The buffer doesn't have enough space, finish to build. */
+ pstr->cur_state = prev_st;
+ break;
+ }
+ }
pstr->valid_len = byte_idx;
}
diff --git a/posix/regex_internal.h b/posix/regex_internal.h
index 18622cad5c..9fcf865f65 100644
--- a/posix/regex_internal.h
+++ b/posix/regex_internal.h
@@ -335,6 +335,7 @@ struct re_string_t
/* 1 if REG_ICASE. */
unsigned int icase : 1;
unsigned int is_utf8 : 1;
+ unsigned int map_notascii : 1;
int mb_cur_max;
};
typedef struct re_string_t re_string_t;
@@ -345,31 +346,32 @@ typedef struct re_string_t re_string_t;
#define MBS_CASE_ALLOCATED(pstr) (pstr->trans != NULL)
+struct re_dfa_t;
+typedef struct re_dfa_t re_dfa_t;
#ifndef RE_NO_INTERNAL_PROTOTYPES
static reg_errcode_t re_string_allocate (re_string_t *pstr, const char *str,
int len, int init_len,
RE_TRANSLATE_TYPE trans, int icase,
- int mb_cur_max, int is_utf8);
+ const re_dfa_t *dfa);
static reg_errcode_t re_string_construct (re_string_t *pstr, const char *str,
int len, RE_TRANSLATE_TYPE trans,
- int icase, int mb_cur_max,
- int is_utf8);
+ int icase, const re_dfa_t *dfa);
static reg_errcode_t re_string_reconstruct (re_string_t *pstr, int idx,
int eflags, int newline);
static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
int new_buf_len);
-#ifdef RE_ENABLE_I18N
+# ifdef RE_ENABLE_I18N
static void build_wcs_buffer (re_string_t *pstr);
static void build_wcs_upper_buffer (re_string_t *pstr);
-#endif /* RE_ENABLE_I18N */
+# endif /* RE_ENABLE_I18N */
static void build_upper_buffer (re_string_t *pstr);
static void re_string_translate_buffer (re_string_t *pstr);
static void re_string_destruct (re_string_t *pstr);
-#ifdef RE_ENABLE_I18N
+# ifdef RE_ENABLE_I18N
static int re_string_elem_size_at (const re_string_t *pstr, int idx);
static inline int re_string_char_size_at (const re_string_t *pstr, int idx);
static inline wint_t re_string_wchar_at (const re_string_t *pstr, int idx);
-#endif /* RE_ENABLE_I18N */
+# endif /* RE_ENABLE_I18N */
static unsigned int re_string_context_at (const re_string_t *input, int idx,
int eflags, int newline_anchor);
#endif
@@ -610,9 +612,9 @@ struct re_dfa_t
collating element. */
unsigned int has_mb_node : 1;
unsigned int is_utf8 : 1;
+ unsigned int map_notascii : 1;
int mb_cur_max;
};
-typedef struct re_dfa_t re_dfa_t;
#ifndef RE_NO_INTERNAL_PROTOTYPES
static reg_errcode_t re_node_set_alloc (re_node_set *set, int size);
diff --git a/posix/regexec.c b/posix/regexec.c
index e962275ba3..7470197506 100644
--- a/posix/regexec.c
+++ b/posix/regexec.c
@@ -605,8 +605,7 @@ re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
fl_longest_match = (nmatch != 0 || dfa->nbackref);
err = re_string_allocate (&input, string, length, dfa->nodes_len + 1,
- preg->translate, preg->syntax & RE_ICASE,
- dfa->mb_cur_max, dfa->is_utf8);
+ preg->translate, preg->syntax & RE_ICASE, dfa);
if (BE (err != REG_NOERROR, 0))
goto free_return;
input.stop = stop;
@@ -1760,7 +1759,7 @@ check_dst_limits_calc_pos (dfa, mctx, limit, eclosures, subexp_idx, from_node,
{
struct re_backref_cache_entry *ent = mctx->bkref_ents + bi;
int dst, cpos;
-
+
/* If this backreference goes beyond the point we're
examining, don't go any further. */
if (ent->str_idx > str_idx)
@@ -1797,12 +1796,12 @@ check_dst_limits_calc_pos (dfa, mctx, limit, eclosures, subexp_idx, from_node,
}
break;
}
-
+
case OP_OPEN_SUBEXP:
if (str_idx == lim->subexp_from && subexp_idx == dfa->nodes[node].opr.idx)
return -1;
break;
-
+
case OP_CLOSE_SUBEXP:
if (str_idx == lim->subexp_to && subexp_idx == dfa->nodes[node].opr.idx)
return 0;