summaryrefslogtreecommitdiff
path: root/posix
diff options
context:
space:
mode:
authorUlrich Drepper <drepper@redhat.com>2002-04-26 20:52:02 +0000
committerUlrich Drepper <drepper@redhat.com>2002-04-26 20:52:02 +0000
commit434d3784f194e382d86edd72c9c6a1d0051a7b96 (patch)
tree3678759b0c16bc0f694e6892eb7f111c0605dc8d /posix
parent58fe8d109631d84a4392c7a8f77db3d163e37345 (diff)
Update.
2002-04-26 Isamu Hasegawa <isamu@yamato.ibm.com> * posix/regcomp.c (re_compile_fastmap_iter): Fix fastmap in case of not _LIBC and RE_ENABLE_I18N. (build_range_exp): Implement for not _LIBC. (build_collating_symbol): Likewise. (parse_bracket_exp): Unify redundant error handlings. Don't erase mbcset for non matching list in multibyte envs. (build_word_op): Add '_' to matching list for \w operator. * posix/regex_internal.c (re_string_construct): Invoke build_upper_buffer in case of not RE_ENABLE_I18N. (re_string_reconstruct): Don't touch cur_state in case of not RE_ENABLE_I18N. * posix/regex_internal.h (attribute_hidden): New macro in case of not _LIBC. (re_charset_t): Define range_starts/ends in case of not _LIBC. * posix/regexec.c (sift_states_iter_mb): Hide in case of not RE_ENABLE_I18N. (transit_state_mb): Likewise. (check_node_accept_bytes): Implement the code evaluating range expression in case of not _LIBC. (find_collation_sequence_value): Hide in case of not _LIBC. 2002-04-26 Jakub Jelinek <jakub@redhat.com> * sysdeps/unix/sysv/linux/sparc/sparc32/semctl.c: Copied from i386/semctl.c. (__old_semctl, __new_semctl): Only use va_arg if the argument will be used.
Diffstat (limited to 'posix')
-rw-r--r--posix/regcomp.c227
-rw-r--r--posix/regex_internal.c9
-rw-r--r--posix/regex_internal.h8
-rw-r--r--posix/regexec.c123
4 files changed, 278 insertions, 89 deletions
diff --git a/posix/regcomp.c b/posix/regcomp.c
index 149814cf98..59836b15e0 100644
--- a/posix/regcomp.c
+++ b/posix/regcomp.c
@@ -114,6 +114,16 @@ static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
re_string_t *regexp,
re_token_t *token);
+#ifndef _LIBC
+static reg_errcode_t build_range_exp (re_charset_t *mbcset,
+ re_bitset_ptr_t sbcset, int *range_alloc,
+ bracket_elem_t *start_elem,
+ bracket_elem_t *end_elem);
+static reg_errcode_t build_collating_symbol (re_charset_t *mbcset,
+ re_bitset_ptr_t sbcset,
+ int *coll_sym_alloc,
+ unsigned char *name);
+#endif /* not _LIBC */
static reg_errcode_t build_equiv_class (re_charset_t *mbcset,
re_bitset_ptr_t sbcset,
int *equiv_class_alloc,
@@ -354,7 +364,14 @@ re_compile_fastmap_iter (bufp, init_state, fastmap)
if (table[ch] < 0)
fastmap[ch] = 1;
}
-#endif
+#else
+# ifdef RE_ENABLE_I18N
+ if (MB_CUR_MAX > 1)
+ for (i = 0; i < SBC_MAX; ++i)
+ if (__btowc (i) == WEOF)
+ fastmap[i] = 1;
+# endif /* RE_ENABLE_I18N */
+#endif /* not _LIBC */
}
for (i = 0; i < cset->nmbchars; ++i)
{
@@ -2207,6 +2224,136 @@ parse_dup_op (dup_elem, regexp, dfa, token, syntax, err)
I'm not sure, but maybe enough. */
#define BRACKET_NAME_BUF_SIZE 32
+#ifndef _LIBC
+/* Local function for parse_bracket_exp only used in case of NOT _LIBC.
+   Build the range expression which starts from START_ELEM, and ends
+   at END_ELEM.  The results are written to MBCSET and SBCSET.
+   RANGE_ALLOC is the allocated size of mbcset->range_starts and
+   mbcset->range_ends; it is a pointer argument since we may
+   update it.
+
+   Return REG_NOERROR on success; REG_ERANGE if an end point is an
+   equivalence class or a character class, or if START_ELEM collates
+   after END_ELEM; REG_ECOLLATE if an end point is a multi character
+   collating element or a byte without a wide character counterpart;
+   REG_ESPACE if the range arrays cannot be enlarged.  */
+
+static reg_errcode_t
+build_range_exp (mbcset, sbcset, range_alloc, start_elem, end_elem)
+     re_charset_t *mbcset;
+     re_bitset_ptr_t sbcset;
+     int *range_alloc;
+     bracket_elem_t *start_elem, *end_elem;
+{
+  unsigned int start_ch, end_ch;
+
+  /* Equivalence Classes and Character Classes can't be a range start/end.  */
+  if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
+          || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
+          0))
+    return REG_ERANGE;
+
+  /* We can handle no multi character collating elements without libc
+     support.  The names are cast explicitly because strlen takes a
+     plain char pointer.  */
+  if (BE ((start_elem->type == COLL_SYM
+           && strlen ((const char *) start_elem->opr.name) > 1)
+          || (end_elem->type == COLL_SYM
+              && strlen ((const char *) end_elem->opr.name) > 1),
+          0))
+    return REG_ECOLLATE;
+
+  /* Reduce each end point to a single byte.  End points of any other
+     type yield 0 here; the RE_ENABLE_I18N code below reads opr.wch for
+     them instead.  */
+  start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
+              : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
+                 : 0));
+  end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
+            : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
+               : 0));
+
+# ifdef RE_ENABLE_I18N
+  {
+    wchar_t wc, start_wc, end_wc;
+    /* Three one-character wide strings laid out back to back:
+       cmp_buf[0] is the range start, cmp_buf[2] the candidate and
+       cmp_buf[4] the range end; the odd slots stay L'\0' and act as
+       terminators for wcscoll.  */
+    wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
+
+    start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
+                ? __btowc (start_ch) : start_elem->opr.wch);
+    end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
+              ? __btowc (end_ch) : end_elem->opr.wch);
+    /* A byte without a wide character counterpart can't bound a range.  */
+    if (BE (start_wc == WEOF || end_wc == WEOF, 0))
+      return REG_ECOLLATE;
+    cmp_buf[0] = start_wc;
+    cmp_buf[4] = end_wc;
+    if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
+      return REG_ERANGE;
+
+    /* Check the space of the arrays.  */
+    if (*range_alloc == mbcset->nranges)
+      {
+        /* There is not enough space, need realloc.  */
+        wchar_t *new_array_start, *new_array_end;
+        int new_nranges;
+
+        /* +1 in case of mbcset->nranges is 0.  */
+        new_nranges = 2 * mbcset->nranges + 1;
+        /* Use realloc since mbcset->range_starts and mbcset->range_ends
+           are NULL if *range_alloc == 0.  Store each array back as soon
+           as it has been enlarged: a successful realloc may move the
+           block, so MBCSET must never be left holding the old pointer
+           when the other allocation fails.  */
+        new_array_start = re_realloc (mbcset->range_starts, wchar_t,
+                                      new_nranges);
+        if (BE (new_array_start == NULL, 0))
+          return REG_ESPACE;
+        mbcset->range_starts = new_array_start;
+
+        new_array_end = re_realloc (mbcset->range_ends, wchar_t,
+                                    new_nranges);
+        if (BE (new_array_end == NULL, 0))
+          return REG_ESPACE;
+        mbcset->range_ends = new_array_end;
+
+        *range_alloc = new_nranges;
+      }
+
+    mbcset->range_starts[mbcset->nranges] = start_wc;
+    mbcset->range_ends[mbcset->nranges++] = end_wc;
+
+    /* Build the table for single byte characters.  The bound is
+       exclusive, like the other SBC_MAX loops in this file.  */
+    for (wc = 0; wc < SBC_MAX; ++wc)
+      {
+        cmp_buf[2] = wc;
+        if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
+            && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
+          bitset_set (sbcset, wc);
+      }
+  }
+# else /* not RE_ENABLE_I18N */
+  {
+    unsigned int ch;
+
+    if (start_ch > end_ch)
+      return REG_ERANGE;
+    /* Build the table for single byte characters.  The bound is
+       exclusive, like the other SBC_MAX loops in this file.  */
+    for (ch = 0; ch < SBC_MAX; ++ch)
+      if (start_ch <= ch && ch <= end_ch)
+        bitset_set (sbcset, ch);
+  }
+# endif /* not RE_ENABLE_I18N */
+  return REG_NOERROR;
+}
+#endif /* not _LIBC */
+
+#ifndef _LIBC
+/* Helper function for parse_bracket_exp only used in case of NOT _LIBC.
+   Build the collating element which is represented by NAME.
+   Without libc support only single byte collating elements can be
+   handled, so the result is written to SBCSET alone; MBCSET and
+   COLL_SYM_ALLOC are accepted but not used by this variant.
+
+   Return REG_NOERROR on success, or REG_ECOLLATE if NAME is not
+   exactly one byte long.  */
+
+static reg_errcode_t
+build_collating_symbol (mbcset, sbcset, coll_sym_alloc, name)
+     re_charset_t *mbcset;
+     re_bitset_ptr_t sbcset;
+     int *coll_sym_alloc;
+     unsigned char *name;
+{
+  /* NAME is an unsigned char string; cast explicitly because strlen
+     takes a plain char pointer.  */
+  if (BE (strlen ((const char *) name) != 1, 0))
+    return REG_ECOLLATE;
+
+  bitset_set (sbcset, name[0]);
+  return REG_NOERROR;
+}
+#endif /* not _LIBC */
+
/* This function parse bracket expression like "[abc]", "[a-c]",
"[[.a-a.]]" etc. */
@@ -2225,7 +2372,7 @@ parse_bracket_exp (regexp, dfa, token, syntax, err)
const int32_t *symb_table;
const unsigned char *extra;
- /* Local function for parse_bracket_exp.
+ /* Local function for parse_bracket_exp used in _LIBC environment.
 Seek the collating symbol entry corresponding to NAME.
Return the index of the symbol in the SYMB_TABLE. */
@@ -2257,7 +2404,7 @@ parse_bracket_exp (regexp, dfa, token, syntax, err)
return elem;
}
- /* Local function for parse_bracket_exp.
+ /* Local function for parse_bracket_exp used in _LIBC environment.
Look up the collation sequence value of BR_ELEM.
Return the value if succeeded, UINT_MAX otherwise. */
@@ -2321,7 +2468,7 @@ parse_bracket_exp (regexp, dfa, token, syntax, err)
return UINT_MAX;
}
- /* Local function for parse_bracket_exp.
+ /* Local function for parse_bracket_exp used in _LIBC environment.
Build the range expression which starts from START_ELEM, and ends
at END_ELEM. The result are written to MBCSET and SBCSET.
RANGE_ALLOC is the allocated size of mbcset->range_starts, and
@@ -2364,6 +2511,8 @@ parse_bracket_exp (regexp, dfa, token, syntax, err)
*range_alloc = new_nranges;
}
+ /* Equivalence Classes and Character Classes can't be a range
+ start/end. */
if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
|| end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
0))
@@ -2397,9 +2546,8 @@ parse_bracket_exp (regexp, dfa, token, syntax, err)
}
return REG_NOERROR;
}
-#endif
- /* Local function for parse_bracket_exp.
+ /* Local function for parse_bracket_exp used in _LIBC environment.
Build the collating element which is represented by NAME.
The result are written to MBCSET and SBCSET.
COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
@@ -2412,7 +2560,6 @@ parse_bracket_exp (regexp, dfa, token, syntax, err)
int *coll_sym_alloc;
unsigned char *name;
{
-#ifdef _LIBC
int32_t elem, idx;
if (nrules != 0)
{
@@ -2452,7 +2599,6 @@ parse_bracket_exp (regexp, dfa, token, syntax, err)
return REG_NOERROR;
}
else
-#endif
{
if (BE (strlen (name) != 1, 0))
return REG_ECOLLATE;
@@ -2463,6 +2609,8 @@ parse_bracket_exp (regexp, dfa, token, syntax, err)
}
}
}
+#endif
+
re_token_t br_token;
re_bitset_ptr_t sbcset;
re_charset_t *mbcset;
@@ -2497,10 +2645,8 @@ parse_bracket_exp (regexp, dfa, token, syntax, err)
token_len = peek_token_bracket (token, regexp, syntax);
if (BE (token->type == END_OF_RE, 0))
{
- re_free (sbcset);
- free_charset (mbcset);
*err = REG_BADPAT;
- return NULL;
+ goto parse_bracket_exp_free_return;
}
if (token->type == OP_NON_MATCH_LIST)
{
@@ -2512,10 +2658,8 @@ parse_bracket_exp (regexp, dfa, token, syntax, err)
token_len = peek_token_bracket (token, regexp, syntax);
if (BE (token->type == END_OF_RE, 0))
{
- re_free (sbcset);
- free_charset (mbcset);
*err = REG_BADPAT;
- return NULL;
+ goto parse_bracket_exp_free_return;
}
if (MB_CUR_MAX > 1)
for (i = 0; i < SBC_MAX; ++i)
@@ -2541,19 +2685,15 @@ parse_bracket_exp (regexp, dfa, token, syntax, err)
syntax);
if (BE (ret != REG_NOERROR, 0))
{
- re_free (sbcset);
- free_charset (mbcset);
*err = ret;
- return NULL;
+ goto parse_bracket_exp_free_return;
}
token_len = peek_token_bracket (token, regexp, syntax);
if (BE (token->type == END_OF_RE, 0))
{
- re_free (sbcset);
- free_charset (mbcset);
*err = REG_BADPAT;
- return NULL;
+ goto parse_bracket_exp_free_return;
}
if (token->type == OP_CHARSET_RANGE)
{
@@ -2561,10 +2701,8 @@ parse_bracket_exp (regexp, dfa, token, syntax, err)
token_len2 = peek_token_bracket (&token2, regexp, syntax);
if (BE (token->type == END_OF_RE, 0))
{
- re_free (sbcset);
- free_charset (mbcset);
*err = REG_BADPAT;
- return NULL;
+ goto parse_bracket_exp_free_return;
}
if (token2.type == OP_CLOSE_BRACKET)
{
@@ -2583,28 +2721,20 @@ parse_bracket_exp (regexp, dfa, token, syntax, err)
dfa, syntax);
if (BE (ret != REG_NOERROR, 0))
{
- re_free (sbcset);
- free_charset (mbcset);
*err = ret;
- return NULL;
+ goto parse_bracket_exp_free_return;
}
token_len = peek_token_bracket (token, regexp, syntax);
if (BE (token->type == END_OF_RE, 0))
{
- re_free (sbcset);
- free_charset (mbcset);
*err = REG_BADPAT;
- return NULL;
+ goto parse_bracket_exp_free_return;
}
*err = build_range_exp (mbcset, sbcset, &range_alloc, &start_elem,
&end_elem);
if (BE (*err != REG_NOERROR, 0))
- {
- re_free (sbcset);
- free_charset (mbcset);
- return NULL;
- }
+ goto parse_bracket_exp_free_return;
}
else
{
@@ -2632,21 +2762,13 @@ parse_bracket_exp (regexp, dfa, token, syntax, err)
*err = build_equiv_class (mbcset, sbcset, &equiv_class_alloc,
start_elem.opr.name);
if (BE (*err != REG_NOERROR, 0))
- {
- re_free (sbcset);
- free_charset (mbcset);
- return NULL;
- }
+ goto parse_bracket_exp_free_return;
break;
case COLL_SYM:
*err = build_collating_symbol (mbcset, sbcset, &coll_sym_alloc,
start_elem.opr.name);
if (BE (*err != REG_NOERROR, 0))
- {
- re_free (sbcset);
- free_charset (mbcset);
- return NULL;
- }
+ goto parse_bracket_exp_free_return;
break;
case CHAR_CLASS:
ret = build_charclass (mbcset, sbcset, &char_class_alloc,
@@ -2678,7 +2800,8 @@ parse_bracket_exp (regexp, dfa, token, syntax, err)
goto parse_bracket_exp_espace;
if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
- || mbcset->nranges || (mbcset->nchar_classes && MB_CUR_MAX > 1))
+ || mbcset->nranges || (MB_CUR_MAX > 1 && (mbcset->nchar_classes
+ || mbcset->non_match)))
{
re_token_t alt_token;
bin_tree_t *mbc_tree;
@@ -2704,11 +2827,15 @@ parse_bracket_exp (regexp, dfa, token, syntax, err)
}
parse_bracket_exp_espace:
- free_charset (mbcset);
*err = REG_ESPACE;
+ parse_bracket_exp_free_return:
+ re_free (sbcset);
+ free_charset (mbcset);
return NULL;
}
+/* Parse an element in the bracket expression. */
+
static reg_errcode_t
parse_bracket_element (elem, regexp, token, token_len, dfa, syntax)
bracket_elem_t *elem;
@@ -2738,6 +2865,10 @@ parse_bracket_element (elem, regexp, token, token_len, dfa, syntax)
return REG_NOERROR;
}
+/* Parse a bracket symbol in the bracket expression. Bracket symbols are
+ such as [:<character_class>:], [.<collating_element>.], and
+ [=<equivalent_class>=]. */
+
static reg_errcode_t
parse_bracket_symbol (elem, regexp, token)
bracket_elem_t *elem;
@@ -2968,10 +3099,12 @@ build_word_op (dfa, not, err)
if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
bitset_set(cset->sbcset, '\0');
*/
+#ifdef RE_ENABLE_I18N
if (MB_CUR_MAX > 1)
for (i = 0; i < SBC_MAX; ++i)
if (__btowc (i) == WEOF)
bitset_set (sbcset, i);
+#endif /* RE_ENABLE_I18N */
}
/* We don't care the syntax in this case. */
@@ -2983,6 +3116,8 @@ build_word_op (dfa, not, err)
*err = REG_ESPACE;
return NULL;
}
+ /* \w match '_' also. */
+ bitset_set (sbcset, '_');
/* If it is non-matching list. */
if (mbcset->non_match)
diff --git a/posix/regex_internal.c b/posix/regex_internal.c
index b688d0f7d9..5327c265c2 100644
--- a/posix/regex_internal.c
+++ b/posix/regex_internal.c
@@ -60,7 +60,9 @@
static void re_string_construct_common (const unsigned char *str,
int len, re_string_t *pstr,
RE_TRANSLATE_TYPE trans, int icase);
+#ifdef RE_ENABLE_I18N
static int re_string_skip_chars (re_string_t *pstr, int new_raw_idx);
+#endif /* RE_ENABLE_I18N */
static re_dfastate_t *create_newstate_common (re_dfa_t *dfa,
const re_node_set *nodes,
unsigned int hash);
@@ -134,8 +136,8 @@ re_string_construct (pstr, str, len, trans, icase)
if (MB_CUR_MAX > 1)
build_wcs_upper_buffer (pstr);
else
- build_upper_buffer (pstr);
#endif /* RE_ENABLE_I18N */
+ build_upper_buffer (pstr);
}
else
{
@@ -409,7 +411,10 @@ re_string_reconstruct (pstr, idx, eflags, newline)
if (offset < 0)
{
/* Reset buffer. */
- memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
+#ifdef RE_ENABLE_I18N
+ if (MB_CUR_MAX > 1)
+ memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
+#endif /* RE_ENABLE_I18N */
pstr->valid_len = pstr->raw_mbs_idx = 0;
pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
: CONTEXT_NEWLINE | CONTEXT_BEGBUF);
diff --git a/posix/regex_internal.h b/posix/regex_internal.h
index f676ae2746..75cc81517b 100644
--- a/posix/regex_internal.h
+++ b/posix/regex_internal.h
@@ -37,7 +37,8 @@
# define __iswctype iswctype
# define __btowc btowc
# define __mempcpy memcpy
-#endif
+# define attribute_hidden
+#endif /* not _LIBC */
extern const char __re_error_msgid[] attribute_hidden;
extern const size_t __re_error_msgid_idx[] attribute_hidden;
@@ -161,7 +162,10 @@ typedef struct
#ifdef _LIBC
uint32_t *range_starts;
uint32_t *range_ends;
-#endif
+#else /* not _LIBC */
+ wchar_t *range_starts;
+ wchar_t *range_ends;
+#endif /* not _LIBC */
int nranges;
/* Character classes. */
diff --git a/posix/regexec.c b/posix/regexec.c
index e888970936..2c7a2774eb 100644
--- a/posix/regexec.c
+++ b/posix/regexec.c
@@ -64,9 +64,11 @@ static int proceed_next_node (const regex_t *preg,
static reg_errcode_t set_regs (const regex_t *preg,
const re_match_context_t *mctx,
size_t nmatch, regmatch_t *pmatch, int last);
+#ifdef RE_ENABLE_I18N
static int sift_states_iter_mb (const regex_t *preg,
const re_match_context_t *mctx,
int node_idx, int str_idx, int max_str_idx);
+#endif /* RE_ENABLE_I18N */
static int sift_states_iter_bkref (const re_dfa_t *dfa,
re_dfastate_t **state_log,
struct re_backref_cache_entry *mctx_entry,
@@ -88,9 +90,11 @@ static re_dfastate_t *transit_state_sb (reg_errcode_t *err, const regex_t *preg,
re_dfastate_t *pstate,
int fl_search,
re_match_context_t *mctx);
+#ifdef RE_ENABLE_I18N
static reg_errcode_t transit_state_mb (const regex_t *preg,
re_dfastate_t *pstate,
re_match_context_t *mctx);
+#endif /* RE_ENABLE_I18N */
static reg_errcode_t transit_state_bkref (const regex_t *preg,
re_dfastate_t *pstate,
re_match_context_t *mctx);
@@ -101,10 +105,14 @@ static reg_errcode_t transit_state_bkref_loop (const regex_t *preg,
static re_dfastate_t **build_trtable (const regex_t *dfa,
const re_dfastate_t *state,
int fl_search);
+#ifdef RE_ENABLE_I18N
static int check_node_accept_bytes (const regex_t *preg, int node_idx,
const re_string_t *input, int idx);
+# ifdef _LIBC
static unsigned int find_collation_sequence_value (const unsigned char *mbs,
size_t name_len);
+# endif /* _LIBC */
+#endif /* RE_ENABLE_I18N */
static int group_nodes_into_DFAstates (const regex_t *dfa,
const re_dfastate_t *state,
re_node_set *states_node,
@@ -912,9 +920,12 @@ proceed_next_node (preg, mctx, pidx, node, eps_via_nodes)
type = dfa->nodes[entity].type;
}
+#ifdef RE_ENABLE_I18N
if (ACCEPT_MB_NODE (type))
naccepted = check_node_accept_bytes (preg, entity, mctx->input, *pidx);
- else if (type == OP_BACK_REF)
+ else
+#endif /* RE_ENABLE_I18N */
+ if (type == OP_BACK_REF)
{
for (i = 0; i < mctx->nbkref_ents; ++i)
{
@@ -1121,13 +1132,16 @@ sift_states_backward (preg, mctx, last_node)
type = dfa->nodes[entity].type;
}
+#ifdef RE_ENABLE_I18N
/* If the node may accept `multi byte'. */
if (ACCEPT_MB_NODE (type))
naccepted = sift_states_iter_mb (preg, mctx, entity, str_idx,
mctx->match_last);
/* If the node is a back reference. */
- else if (type == OP_BACK_REF)
+ else
+#endif /* RE_ENABLE_I18N */
+ if (type == OP_BACK_REF)
for (j = 0; j < mctx->nbkref_ents; ++j)
{
naccepted = sift_states_iter_bkref (dfa, mctx->state_log,
@@ -1201,6 +1215,7 @@ clean_state_log_if_need (mctx, next_state_log_idx)
return REG_NOERROR;
}
+#ifdef RE_ENABLE_I18N
static int
sift_states_iter_mb (preg, mctx, node_idx, str_idx, max_str_idx)
const regex_t *preg;
@@ -1222,6 +1237,7 @@ sift_states_iter_mb (preg, mctx, node_idx, str_idx, max_str_idx)
`naccepted' bytes input. */
return naccepted;
}
+#endif /* RE_ENABLE_I18N */
static int
sift_states_iter_bkref (dfa, state_log, mctx_entry, node_idx, idx, match_last)
@@ -1317,6 +1333,7 @@ transit_state (err, preg, mctx, state, fl_search)
}
else
{
+#ifdef RE_ENABLE_I18N
/* If the current state can accept multibyte. */
if (state->accept_mb)
{
@@ -1324,6 +1341,7 @@ transit_state (err, preg, mctx, state, fl_search)
if (BE (*err != REG_NOERROR, 0))
return NULL;
}
+#endif /* RE_ENABLE_I18N */
/* Then decide the next state with the single byte. */
if (1)
@@ -1474,6 +1492,7 @@ transit_state_sb (err, preg, state, fl_search, mctx)
return next_state;
}
+#ifdef RE_ENABLE_I18N
static reg_errcode_t
transit_state_mb (preg, pstate, mctx)
const regex_t *preg;
@@ -1543,6 +1562,7 @@ transit_state_mb (preg, pstate, mctx)
}
return REG_NOERROR;
}
+#endif /* RE_ENABLE_I18N */
static reg_errcode_t
transit_state_bkref (preg, pstate, mctx)
@@ -1991,7 +2011,14 @@ group_nodes_into_DFAstates (preg, state, dests_node, dests_ch)
return ndests;
}
-/* Check how many bytes the node `dfa->nodes[node_idx]' accepts. */
+#ifdef RE_ENABLE_I18N
+/* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
+ Return the number of bytes the node accepts.
+ STR_IDX is the current index of the input string.
+
+ This function handles the nodes which can accept one character, or
+ one collating element, like '.' and '[a-z]', as opposed to the other
+ nodes, which can only accept one byte. */
static int
check_node_accept_bytes (preg, node_idx, input, str_idx)
@@ -2003,14 +2030,16 @@ check_node_accept_bytes (preg, node_idx, input, str_idx)
const re_token_t *node = dfa->nodes + node_idx;
int elem_len = re_string_elem_size_at (input, str_idx);
int char_len = re_string_char_size_at (input, str_idx);
- int i, j;
-#ifdef _LIBC
+ int i;
+# ifdef _LIBC
+ int j;
uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
-#endif /* _LIBC */
+# endif /* _LIBC */
if (elem_len <= 1 && char_len <= 1)
return 0;
if (node->type == OP_PERIOD)
{
+ /* '.' accepts any one character except the following two cases. */
if ((!(preg->syntax & RE_DOT_NEWLINE) &&
re_string_byte_at (input, str_idx) == '\n') ||
((preg->syntax & RE_DOT_NOT_NULL) &&
@@ -2021,18 +2050,40 @@ check_node_accept_bytes (preg, node_idx, input, str_idx)
else if (node->type == COMPLEX_BRACKET)
{
const re_charset_t *cset = node->opr.mbcset;
+# ifdef _LIBC
const unsigned char *pin = re_string_get_buffer (input) + str_idx;
-#ifdef _LIBC
+# endif /* _LIBC */
+ int match_len = 0;
+ wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
+ ? re_string_wchar_at (input, str_idx) : 0);
+
+ /* match with multibyte character? */
+ for (i = 0; i < cset->nmbchars; ++i)
+ if (wc == cset->mbchars[i])
+ {
+ match_len = char_len;
+ goto check_node_accept_bytes_match;
+ }
+ /* match with character_class? */
+ for (i = 0; i < cset->nchar_classes; ++i)
+ {
+ wctype_t wt = cset->char_classes[i];
+ if (__iswctype (wc, wt))
+ {
+ match_len = char_len;
+ goto check_node_accept_bytes_match;
+ }
+ }
+
+# ifdef _LIBC
if (nrules != 0)
{
- int match_len = 0;
unsigned int in_collseq = 0;
const int32_t *table, *indirect;
const unsigned char *weights, *extra, *collseqwc;
int32_t idx;
- wchar_t wc = 0;
/* This #include defines a local function! */
-# include <locale/weight.h>
+# include <locale/weight.h>
/* match with collating_symbol? */
if (cset->ncoll_syms)
@@ -2057,9 +2108,6 @@ check_node_accept_bytes (preg, node_idx, input, str_idx)
}
}
- if (cset->nranges || cset->nchar_classes || cset->nmbchars)
- wc = re_string_wchar_at (input, str_idx);
-
if (cset->nranges)
{
if (elem_len <= char_len)
@@ -2112,43 +2160,39 @@ check_node_accept_bytes (preg, node_idx, input, str_idx)
}
}
}
-
- /* match with multibyte character? */
- for (i = 0; i < cset->nmbchars; ++i)
- if (wc == cset->mbchars[i])
- {
- match_len = char_len;
- goto check_node_accept_bytes_match;
- }
-
- /* match with character_class? */
- for (i = 0; i < cset->nchar_classes; ++i)
+ }
+ else
+# endif /* _LIBC */
+ {
+ /* match with range expression? */
+ wchar_t cmp_buf[6] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
+ for (i = 0; i < cset->nranges; ++i)
{
- wctype_t wt = cset->char_classes[i];
- if (__iswctype (wc, wt))
+ cmp_buf[0] = cset->range_starts[i];
+ cmp_buf[4] = cset->range_ends[i];
+ if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
+ && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
{
match_len = char_len;
goto check_node_accept_bytes_match;
}
}
-
- check_node_accept_bytes_match:
- if (!cset->non_match)
- return match_len;
+ }
+ check_node_accept_bytes_match:
+ if (!cset->non_match)
+ return match_len;
+ else
+ {
+ if (match_len > 0)
+ return 0;
else
- {
- if (match_len > 0)
- return 0;
- else
- return re_string_elem_size_at (input, str_idx);
- }
+ return (elem_len > char_len) ? elem_len : char_len;
}
-#endif
}
return 0;
}
-#ifdef _LIBC
+# ifdef _LIBC
static unsigned int
find_collation_sequence_value (mbs, mbs_len)
const unsigned char *mbs;
@@ -2204,7 +2248,8 @@ find_collation_sequence_value (mbs, mbs_len)
}
}
}
-#endif
+# endif /* _LIBC */
+#endif /* RE_ENABLE_I18N */
/* Check whether the node accepts the byte which is IDX-th
byte of the INPUT. */