summaryrefslogtreecommitdiff
path: root/posix/regexec.c
diff options
context:
space:
mode:
Diffstat (limited to 'posix/regexec.c')
-rw-r--r--posix/regexec.c352
1 files changed, 193 insertions, 159 deletions
diff --git a/posix/regexec.c b/posix/regexec.c
index 72ae70b916..a03df2636a 100644
--- a/posix/regexec.c
+++ b/posix/regexec.c
@@ -22,8 +22,6 @@ static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
int n) internal_function;
static void match_ctx_clean (re_match_context_t *mctx) internal_function;
static void match_ctx_free (re_match_context_t *cache) internal_function;
-static void match_ctx_free_subtops (re_match_context_t *mctx)
- internal_function;
static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
int str_idx, int from, int to)
internal_function;
@@ -606,15 +604,16 @@ re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
reg_errcode_t err;
re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
int left_lim, right_lim, incr;
- int fl_longest_match, match_first, match_last = -1;
- int fast_translate, sb;
+ int fl_longest_match, match_first, match_kind, match_last = -1;
+ int sb, ch;
#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
re_match_context_t mctx = { .dfa = dfa };
#else
re_match_context_t mctx;
#endif
- char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
- && range && !preg->can_be_null) ? preg->fastmap : NULL);
+ char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
+ && range && !preg->can_be_null) ? preg->fastmap : NULL;
+ unsigned RE_TRANSLATE_TYPE t = (unsigned RE_TRANSLATE_TYPE) preg->translate;
#if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
memset (&mctx, '\0', sizeof (re_match_context_t));
@@ -685,88 +684,100 @@ re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
left_lim = (range < 0) ? start + range : start;
right_lim = (range < 0) ? start : start + range;
sb = dfa->mb_cur_max == 1;
- fast_translate = sb || !(preg->syntax & RE_ICASE || preg->translate);
-
- for (;;)
+ match_kind =
+ (fastmap
+ ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
+ | (range >= 0 ? 2 : 0)
+ | (t != NULL ? 1 : 0))
+ : 8);
+
+ for (;; match_first += incr)
{
- /* At first get the current byte from input string. */
- if (fastmap)
+ err = REG_NOMATCH;
+ if (match_first < left_lim || right_lim < match_first)
+ goto free_return;
+
+ /* Advance as rapidly as possible through the string, until we
+ find a plausible place to start matching. This may be done
+ with varying efficiency, so there are various possibilities:
+ only the most common of them are specialized, in order to
+ save on code size. We use a switch statement for speed. */
+ switch (match_kind)
{
- if (BE (fast_translate, 1))
+ case 8:
+ /* No fastmap. */
+ break;
+
+ case 7:
+ /* Fastmap with single-byte translation, match forward. */
+ while (BE (match_first < right_lim, 1)
+ && !fastmap[t[(unsigned char) string[match_first]]])
+ ++match_first;
+ goto forward_match_found_start_or_reached_end;
+
+ case 6:
+ /* Fastmap without translation, match forward. */
+ while (BE (match_first < right_lim, 1)
+ && !fastmap[(unsigned char) string[match_first]])
+ ++match_first;
+
+ forward_match_found_start_or_reached_end:
+ if (BE (match_first == right_lim, 0))
{
- unsigned RE_TRANSLATE_TYPE t
- = (unsigned RE_TRANSLATE_TYPE) preg->translate;
- if (BE (range >= 0, 1))
- {
- if (BE (t != NULL, 0))
- {
- while (BE (match_first < right_lim, 1)
- && !fastmap[t[(unsigned char) string[match_first]]])
- ++match_first;
- }
- else
- {
- while (BE (match_first < right_lim, 1)
- && !fastmap[(unsigned char) string[match_first]])
- ++match_first;
- }
- if (BE (match_first == right_lim, 0))
- {
- int ch = match_first >= length
- ? 0 : (unsigned char) string[match_first];
- if (!fastmap[t ? t[ch] : ch])
- break;
- }
- }
- else
- {
- while (match_first >= left_lim)
- {
- int ch = match_first >= length
- ? 0 : (unsigned char) string[match_first];
- if (fastmap[t ? t[ch] : ch])
- break;
- --match_first;
- }
- if (match_first < left_lim)
- break;
- }
+ ch = match_first >= length
+ ? 0 : (unsigned char) string[match_first];
+ if (!fastmap[t ? t[ch] : ch])
+ goto free_return;
}
- else
+ break;
+
+ case 4:
+ case 5:
+ /* Fastmap without multi-byte translation, match backwards. */
+ while (match_first >= left_lim)
{
- int ch;
+ ch = match_first >= length
+ ? 0 : (unsigned char) string[match_first];
+ if (fastmap[t ? t[ch] : ch])
+ break;
+ --match_first;
+ }
+ if (match_first < left_lim)
+ goto free_return;
+ break;
- do
+ default:
+ /* In this case, we can't determine easily the current byte,
+ since it might be a component byte of a multibyte
+ character. Then we use the constructed buffer instead. */
+ for (;;)
+ {
+ /* If MATCH_FIRST is out of the valid range, reconstruct the
+ buffers. */
+ unsigned int offset = match_first - mctx.input.raw_mbs_idx;
+ if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
{
- /* In this case, we can't determine easily the current byte,
- since it might be a component byte of a multibyte
- character. Then we use the constructed buffer
- instead. */
- /* If MATCH_FIRST is out of the valid range, reconstruct the
- buffers. */
- if (mctx.input.raw_mbs_idx + mctx.input.valid_raw_len
- <= match_first
- || match_first < mctx.input.raw_mbs_idx)
- {
- err = re_string_reconstruct (&mctx.input, match_first,
- eflags);
- if (BE (err != REG_NOERROR, 0))
- goto free_return;
- }
- /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
- Note that MATCH_FIRST must not be smaller than 0. */
- ch = ((match_first >= length) ? 0
- : re_string_byte_at (&mctx.input,
- match_first
- - mctx.input.raw_mbs_idx));
- if (fastmap[ch])
- break;
- match_first += incr;
+ err = re_string_reconstruct (&mctx.input, match_first,
+ eflags);
+ if (BE (err != REG_NOERROR, 0))
+ goto free_return;
+
+ offset = match_first - mctx.input.raw_mbs_idx;
}
- while (match_first >= left_lim && match_first <= right_lim);
- if (! fastmap[ch])
+ /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
+ Note that MATCH_FIRST must not be smaller than 0. */
+ ch = (match_first >= length
+ ? 0 : re_string_byte_at (&mctx.input, offset));
+ if (fastmap[ch])
break;
+ match_first += incr;
+ if (match_first < left_lim || match_first > right_lim)
+ {
+ err = REG_NOMATCH;
+ goto free_return;
+ }
}
+ break;
}
/* Reconstruct the buffers so that the matcher can assume that
@@ -774,57 +785,60 @@ re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
err = re_string_reconstruct (&mctx.input, match_first, eflags);
if (BE (err != REG_NOERROR, 0))
goto free_return;
+
#ifdef RE_ENABLE_I18N
- /* Eliminate it when it is a component of a multibyte character
- and isn't the head of a multibyte character. */
- if (sb || re_string_first_byte (&mctx.input, 0))
+ /* Don't consider this char as a possible match start if it part,
+ yet isn't the head, of a multibyte character. */
+ if (!sb && !re_string_first_byte (&mctx.input, 0))
+ continue;
#endif
+
+ /* It seems to be appropriate one, then use the matcher. */
+ /* We assume that the matching starts from 0. */
+ mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
+ match_last = check_matching (&mctx, fl_longest_match,
+ range >= 0 ? &match_first : NULL);
+ if (match_last != -1)
{
- /* It seems to be appropriate one, then use the matcher. */
- /* We assume that the matching starts from 0. */
- mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
- match_last = check_matching (&mctx, fl_longest_match,
- range >= 0 ? &match_first : NULL);
- if (match_last != -1)
+ if (BE (match_last == -2, 0))
{
- if (BE (match_last == -2, 0))
+ err = REG_ESPACE;
+ goto free_return;
+ }
+ else
+ {
+ mctx.match_last = match_last;
+ if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
{
- err = REG_ESPACE;
- goto free_return;
+ re_dfastate_t *pstate = mctx.state_log[match_last];
+ mctx.last_node = check_halt_state_context (&mctx, pstate,
+ match_last);
}
- else
+ if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
+ || dfa->nbackref)
{
- mctx.match_last = match_last;
- if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
- {
- re_dfastate_t *pstate = mctx.state_log[match_last];
- mctx.last_node = check_halt_state_context (&mctx, pstate,
- match_last);
- }
- if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
- || dfa->nbackref)
- {
- err = prune_impossible_nodes (&mctx);
- if (err == REG_NOERROR)
- break;
- if (BE (err != REG_NOMATCH, 0))
- goto free_return;
- match_last = -1;
- }
- else
- break; /* We found a match. */
+ err = prune_impossible_nodes (&mctx);
+ if (err == REG_NOERROR)
+ break;
+ if (BE (err != REG_NOMATCH, 0))
+ goto free_return;
+ match_last = -1;
}
+ else
+ break; /* We found a match. */
}
- match_ctx_clean (&mctx);
}
- /* Update counter. */
- match_first += incr;
- if (match_first < left_lim || right_lim < match_first)
- break;
+
+ match_ctx_clean (&mctx);
}
+#ifdef DEBUG
+ assert (match_last != -1);
+ assert (err == REG_NOERROR);
+#endif
+
/* Set pmatch[] if we need. */
- if (match_last != -1 && nmatch > 0)
+ if (nmatch > 0)
{
int reg_idx;
@@ -869,7 +883,7 @@ re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
pmatch[reg_idx].rm_eo += match_first;
}
}
- err = (match_last == -1) ? REG_NOMATCH : REG_NOERROR;
+
free_return:
re_free (mctx.state_log);
if (dfa->nbackref)
@@ -1072,6 +1086,20 @@ check_matching (mctx, fl_longest_match, p_match_first)
while (!re_string_eoi (&mctx->input))
{
re_dfastate_t *old_state = cur_state;
+ int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
+
+ if (BE (next_char_idx >= mctx->input.bufs_len, 0)
+ || (BE (next_char_idx >= mctx->input.valid_len, 0)
+ && mctx->input.valid_len < mctx->input.len))
+ {
+ err = extend_buffers (mctx);
+ if (BE (err != REG_NOERROR, 0))
+ {
+ assert (err == REG_ESPACE);
+ return -2;
+ }
+ }
+
cur_state = transit_state (&err, mctx, cur_state);
if (mctx->state_log != NULL)
cur_state = merge_state_with_log (&err, mctx, cur_state);
@@ -1090,10 +1118,10 @@ check_matching (mctx, fl_longest_match, p_match_first)
break;
}
- if (at_init_state)
+ if (BE (at_init_state, 0))
{
if (old_state == cur_state)
- next_start_idx = re_string_cur_idx (&mctx->input);
+ next_start_idx = next_char_idx;
else
at_init_state = 0;
}
@@ -1109,13 +1137,16 @@ check_matching (mctx, fl_longest_match, p_match_first)
/* We found an appropriate halt state. */
match_last = re_string_cur_idx (&mctx->input);
match = 1;
+
+ /* We found a match, do not modify match_first below. */
+ p_match_first = NULL;
if (!fl_longest_match)
break;
}
}
- }
+ }
- if (match_last == -1 && p_match_first)
+ if (p_match_first)
*p_match_first += next_start_idx;
return match_last;
@@ -1854,7 +1885,12 @@ check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, from_node, bkref_idx)
{
int dst, cpos;
- if (ent->node != node || ent->subexp_from != ent->subexp_to)
+ if (ent->node != node)
+ continue;
+
+ if (subexp_idx <= 8 * sizeof (ent->eps_reachable_subexps_map)
+ && (ent->eps_reachable_subexps_map
+ & (1 << (subexp_idx - 1))) == 0)
continue;
/* Recurse trying to reach the OP_OPEN_SUBEXP and
@@ -1875,11 +1911,13 @@ check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, from_node, bkref_idx)
cpos = check_dst_limits_calc_pos_1 (mctx, boundaries,
subexp_idx, dst, bkref_idx);
- if (cpos == -1 && (boundaries & 1))
+ if (cpos == -1 /* && (boundaries & 1) */)
return -1;
- if (cpos == 0 /* && (boundaries & 2) */)
+ if (cpos == 0 && (boundaries & 2))
return 0;
+
+ ent->eps_reachable_subexps_map &= ~(1 << (subexp_idx - 1));
}
while (ent++->more);
break;
@@ -2167,23 +2205,14 @@ transit_state (err, mctx, state)
re_dfastate_t **trtable;
unsigned char ch;
- if (re_string_cur_idx (&mctx->input) + 1 >= mctx->input.bufs_len
- || (re_string_cur_idx (&mctx->input) + 1 >= mctx->input.valid_len
- && mctx->input.valid_len < mctx->input.len))
+#ifdef RE_ENABLE_I18N
+ /* If the current state can accept multibyte. */
+ if (BE (state->accept_mb, 0))
{
- *err = extend_buffers (mctx);
+ *err = transit_state_mb (mctx, state);
if (BE (*err != REG_NOERROR, 0))
return NULL;
}
-
-#ifdef RE_ENABLE_I18N
- /* If the current state can accept multibyte. */
- if (state->accept_mb)
- {
- *err = transit_state_mb (mctx, state);
- if (BE (*err != REG_NOERROR, 0))
- return NULL;
- }
#endif /* RE_ENABLE_I18N */
/* Then decide the next state with the single byte. */
@@ -4078,28 +4107,6 @@ static void
match_ctx_clean (mctx)
re_match_context_t *mctx;
{
- match_ctx_free_subtops (mctx);
- mctx->nsub_tops = 0;
- mctx->nbkref_ents = 0;
-}
-
-/* Free all the memory associated with MCTX. */
-
-static void
-match_ctx_free (mctx)
- re_match_context_t *mctx;
-{
- match_ctx_free_subtops (mctx);
- re_free (mctx->sub_tops);
- re_free (mctx->bkref_ents);
-}
-
-/* Free all the memory associated with MCTX->SUB_TOPS. */
-
-static void
-match_ctx_free_subtops (mctx)
- re_match_context_t *mctx;
-{
int st_idx;
for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
{
@@ -4119,6 +4126,21 @@ match_ctx_free_subtops (mctx)
}
free (top);
}
+
+ mctx->nsub_tops = 0;
+ mctx->nbkref_ents = 0;
+}
+
+/* Free all the memory associated with MCTX. */
+
+static void
+match_ctx_free (mctx)
+ re_match_context_t *mctx;
+{
+ /* First, free all the memory associated with MCTX->SUB_TOPS. */
+ match_ctx_clean (mctx);
+ re_free (mctx->sub_tops);
+ re_free (mctx->bkref_ents);
}
/* Add a new backreference entry to MCTX.
@@ -4154,6 +4176,18 @@ match_ctx_add_entry (mctx, node, str_idx, from, to)
mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
+
+ /* This is a cache that saves negative results of check_dst_limits_calc_pos.
+ If bit N is clear, means that this entry won't epsilon-transition to
+ an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
+ it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
+ such node.
+
+ A backreference does not epsilon-transition unless it is empty, so set
+ to all zeros if FROM != TO. */
+ mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
+ = (from == to ? ~0 : 0);
+
mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
if (mctx->max_mb_elem_len < to - from)
mctx->max_mb_elem_len = to - from;