summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJakub Jelinek <jakub@redhat.com>2006-09-07 10:42:04 +0000
committerJakub Jelinek <jakub@redhat.com>2006-09-07 10:42:04 +0000
commiteaa2ea12a27561393da77bcf435dad34d84d3e14 (patch)
tree6e62528215ef815a96aea18ed1f6e40915aca6ec
parent8021fca5412b5b1664e806556205d5cede365974 (diff)
* posix/regex_internal.c (re_string_reconstruct): Handlecvs/fedora-glibc-2_4_90-30
offset < pstr->valid_raw_len && pstr->offsets_needed case. Ensure no bytes read before raw_mbs array. Pass a saved copy of pstr->valid_len - 1 rather than pstr->valid_raw_len - 1 to re_string_context_at. * posix/Makefile: Add rules to build and run bug-regex26 test. * posix/bug-regex26.c: New test.
-rw-r--r--ChangeLog8
-rw-r--r--fedora/glibc.spec.in7
-rw-r--r--posix/Makefile3
-rw-r--r--posix/bug-regex26.c38
-rw-r--r--posix/regex_internal.c106
5 files changed, 139 insertions, 23 deletions
diff --git a/ChangeLog b/ChangeLog
index 8d30040fdc..413d398e91 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,13 @@
2006-09-06 Jakub Jelinek <jakub@redhat.com>
+ * posix/regex_internal.c (re_string_reconstruct): Handle
+ offset < pstr->valid_raw_len && pstr->offsets_needed case.
+ Ensure no bytes read before raw_mbs array. Pass a saved copy of
+ pstr->valid_len - 1 rather than pstr->valid_raw_len - 1 to
+ re_string_context_at.
+ * posix/Makefile: Add rules to build and run bug-regex26 test.
+ * posix/bug-regex26.c: New test.
+
* locale/programs/ld-collate.c (collate_read): Goto sym_equiv_free
rather than col_sym_free. Move seqp declaration earlier.
diff --git a/fedora/glibc.spec.in b/fedora/glibc.spec.in
index 791e04b0a8..0ffcae11a8 100644
--- a/fedora/glibc.spec.in
+++ b/fedora/glibc.spec.in
@@ -1455,10 +1455,13 @@ rm -f *.filelist*
%changelog
* Thu Sep 7 2006 Jakub Jelinek <jakub@redhat.com> 2.4.90-30
-- fix or_IN February name (#204730)
-- fix pthread_create called from cancellation handlers (BZ#3124)
- add librtkaio, to use it add /%{lib}/rtkaio to your
LD_LIBRARY_PATH or /etc/ld.so.conf
+- fix or_IN February name (#204730)
+- fix pthread_create called from cancellation handlers (BZ#3124)
+- fix regex case insensitive searches with characters where upper
+ and lower case multibyte representations have different length
+ (e.g. I and dotless i, #202991)
* Tue Sep 5 2006 Jakub Jelinek <jakub@redhat.com> 2.4.90-29
- randomize resolver query ids before use instead after use (#205113)
diff --git a/posix/Makefile b/posix/Makefile
index 605d02c896..311712d24e 100644
--- a/posix/Makefile
+++ b/posix/Makefile
@@ -81,7 +81,7 @@ tests := tstgetopt testfnm runtests runptests \
bug-regex13 bug-regex14 bug-regex15 bug-regex16 \
bug-regex17 bug-regex18 bug-regex19 bug-regex20 \
bug-regex21 bug-regex22 bug-regex23 bug-regex24 \
- bug-regex25 tst-nice tst-nanosleep tst-regex2 \
+ bug-regex25 bug-regex26 tst-nice tst-nanosleep tst-regex2 \
transbug tst-rxspencer tst-pcre tst-boost \
bug-ga1 tst-vfork1 tst-vfork2 tst-waitid \
tst-getaddrinfo2 bug-glob1 bug-glob2 tst-sysconf \
@@ -190,6 +190,7 @@ bug-regex20-ENV = LOCPATH=$(common-objpfx)localedata
bug-regex22-ENV = LOCPATH=$(common-objpfx)localedata
bug-regex23-ENV = LOCPATH=$(common-objpfx)localedata
bug-regex25-ENV = LOCPATH=$(common-objpfx)localedata
+bug-regex26-ENV = LOCPATH=$(common-objpfx)localedata
tst-rxspencer-ARGS = --utf8 rxspencer/tests
tst-rxspencer-ENV = LOCPATH=$(common-objpfx)localedata
tst-pcre-ARGS = PCRE.tests
diff --git a/posix/bug-regex26.c b/posix/bug-regex26.c
new file mode 100644
index 0000000000..b784cfcef8
--- /dev/null
+++ b/posix/bug-regex26.c
@@ -0,0 +1,38 @@
+/* Test re_search with dotless i.
+ Copyright (C) 2006 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Jakub Jelinek <jakub@redhat.com>, 2006.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <locale.h>
+#include <regex.h>
+#include <string.h>
+
+int
+main (void)
+{
+ struct re_pattern_buffer r;
+ struct re_registers s;
+ setlocale (LC_ALL, "en_US.UTF-8");
+ memset (&r, 0, sizeof (r));
+ memset (&s, 0, sizeof (s));
+ re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE | RE_ICASE);
+ re_compile_pattern ("insert into", 11, &r);
+ re_search (&r, "\xFF\0\x12\xA2\xAA\xC4\xB1,K\x12\xC4\xB1*\xACK",
+ 15, 0, 15, &s);
+ return 0;
+}
diff --git a/posix/regex_internal.c b/posix/regex_internal.c
index ac312db0cd..66154e0cea 100644
--- a/posix/regex_internal.c
+++ b/posix/regex_internal.c
@@ -585,34 +585,98 @@ re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
if (BE (offset != 0, 1))
{
- /* Are the characters which are already checked remain? */
- if (BE (offset < pstr->valid_raw_len, 1)
-#ifdef RE_ENABLE_I18N
- /* Handling this would enlarge the code too much.
- Accept a slowdown in that case. */
- && pstr->offsets_needed == 0
-#endif
- )
+ /* Should the already checked characters be kept? */
+ if (BE (offset < pstr->valid_raw_len, 1))
{
/* Yes, move them to the front of the buffer. */
- pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
#ifdef RE_ENABLE_I18N
- if (pstr->mb_cur_max > 1)
- memmove (pstr->wcs, pstr->wcs + offset,
- (pstr->valid_len - offset) * sizeof (wint_t));
+ if (BE (pstr->offsets_needed, 0))
+ {
+ int low = 0, high = pstr->valid_len, mid;
+ do
+ {
+ mid = (high + low) / 2;
+ if (pstr->offsets[mid] > offset)
+ high = mid;
+ else if (pstr->offsets[mid] < offset)
+ low = mid + 1;
+ else
+ break;
+ }
+ while (low < high);
+ if (pstr->offsets[mid] < offset)
+ ++mid;
+ pstr->tip_context = re_string_context_at (pstr, mid - 1,
+ eflags);
+ /* This can be quite complicated, so handle specially
+ only the common and easy case where the character with
+ different length representation of lower and upper
+ case is present at or after offset. */
+ if (pstr->valid_len > offset
+ && mid == offset && pstr->offsets[mid] == offset)
+ {
+ memmove (pstr->wcs, pstr->wcs + offset,
+ (pstr->valid_len - offset) * sizeof (wint_t));
+ memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
+ pstr->valid_len -= offset;
+ pstr->valid_raw_len -= offset;
+ for (low = 0; low < pstr->valid_len; low++)
+ pstr->offsets[low] = pstr->offsets[low + offset] - offset;
+ }
+ else
+ {
+ /* Otherwise, just find out how long the partial multibyte
+ character at offset is and fill it with WEOF/255. */
+ pstr->len = pstr->raw_len - idx + offset;
+ pstr->stop = pstr->raw_stop - idx + offset;
+ pstr->offsets_needed = 0;
+ while (mid > 0 && pstr->offsets[mid - 1] == offset)
+ --mid;
+ while (mid < pstr->valid_len)
+ if (pstr->wcs[mid] != WEOF)
+ break;
+ else
+ ++mid;
+ if (mid == pstr->valid_len)
+ pstr->valid_len = 0;
+ else
+ {
+ pstr->valid_len = pstr->offsets[mid] - offset;
+ if (pstr->valid_len)
+ {
+ for (low = 0; low < pstr->valid_len; ++low)
+ pstr->wcs[low] = WEOF;
+ memset (pstr->mbs, 255, pstr->valid_len);
+ }
+ }
+ pstr->valid_raw_len = pstr->valid_len;
+ }
+ }
+ else
+#endif
+ {
+ pstr->tip_context = re_string_context_at (pstr, offset - 1,
+ eflags);
+#ifdef RE_ENABLE_I18N
+ if (pstr->mb_cur_max > 1)
+ memmove (pstr->wcs, pstr->wcs + offset,
+ (pstr->valid_len - offset) * sizeof (wint_t));
#endif /* RE_ENABLE_I18N */
- if (BE (pstr->mbs_allocated, 0))
- memmove (pstr->mbs, pstr->mbs + offset,
- pstr->valid_len - offset);
- pstr->valid_len -= offset;
- pstr->valid_raw_len -= offset;
+ if (BE (pstr->mbs_allocated, 0))
+ memmove (pstr->mbs, pstr->mbs + offset,
+ pstr->valid_len - offset);
+ pstr->valid_len -= offset;
+ pstr->valid_raw_len -= offset;
#if DEBUG
- assert (pstr->valid_len > 0);
+ assert (pstr->valid_len > 0);
#endif
+ }
}
else
{
/* No, skip all characters until IDX. */
+ int prev_valid_len = pstr->valid_len;
+
#ifdef RE_ENABLE_I18N
if (BE (pstr->offsets_needed, 0))
{
@@ -636,6 +700,8 @@ re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
byte other than 0x80 - 0xbf. */
raw = pstr->raw_mbs + pstr->raw_mbs_idx;
end = raw + (offset - pstr->mb_cur_max);
+ if (end < pstr->raw_mbs)
+ end = pstr->raw_mbs;
p = raw + offset - 1;
#ifdef _LIBC
/* We know the wchar_t encoding is UCS4, so for the simple
@@ -643,7 +709,7 @@ re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
if (isascii (*p) && BE (pstr->trans == NULL, 1))
{
memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
- pstr->valid_len = 0;
+ /* pstr->valid_len = 0; */
wc = (wchar_t) *p;
}
else
@@ -686,7 +752,7 @@ re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
if (wc == WEOF)
pstr->tip_context
- = re_string_context_at (pstr, pstr->valid_raw_len - 1, eflags);
+ = re_string_context_at (pstr, prev_valid_len - 1, eflags);
else
pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
&& IS_WIDE_WORD_CHAR (wc))