summary refs log tree commit diff
path: root/posix/tst-fnmatch.input
diff options
context:
space:
mode:
Diffstat (limited to 'posix/tst-fnmatch.input')
-rw-r--r-- posix/tst-fnmatch.input 125
1 files changed, 83 insertions, 42 deletions
diff --git a/posix/tst-fnmatch.input b/posix/tst-fnmatch.input
index 589fb2a940..dc2ca8d01a 100644
--- a/posix/tst-fnmatch.input
+++ b/posix/tst-fnmatch.input
@@ -23,6 +23,63 @@
# wording describing the situations to be tested. It does not specify
# any specific tests. I.e., the tests below are in no case sufficient.
# They are hopefully necessary, though.
+#
+# See:
+#
+# http://pubs.opengroup.org/onlinepubs/9699919799/xrat/V4_xbd_chap09.html
+#
+# > RE Bracket Expression
+# >
+# > Range expressions are, historically, an integral part of REs.
+# > However, the requirements of "natural language behavior" and
+# > portability do conflict. In the POSIX locale, ranges must be treated
+# > according to the collating sequence and include such characters that
+# > fall within the range based on that collating sequence, regardless
+# > of character values. In other locales, ranges have unspecified behavior.
+# > ...
+# > The current standard leaves unspecified the behavior of a range
+# > expression outside the POSIX locale. This makes it clearer that
+# > conforming applications should avoid range expressions outside the
+# > POSIX locale, and it allows implementations and compatible user-mode
+# > matchers to interpret range expressions using native order, CEO,
+# > collation sequence, or other, more advanced techniques. The concerns
+# > which led to this change were raised in IEEE PASC interpretation
+# > 1003.2 #43 and others, and related to ambiguities in the
+# > specification of how multi-character collating elements should be
+# > handled in range expressions. These ambiguities had led to multiple
+# > interpretations of the specification, in conflicting ways, which led
+# > to varying implementations. As noted above, efforts were made to
+# > resolve the differences, but no solution has been found that would
+# > be specific enough to allow for portable software while not
+# > invalidating existing implementations.
+#
+# Therefore, using [a-z] does not make much sense except in the C/POSIX locale.
+# The new iso14651_t1_common lists upper case and lower case Latin characters
+# in a different order than the old one which causes surprising results
+# for example in the de_DE locale: [a-z] now includes A because A comes
+# after a in iso14651_t1_common but does not include Z because that comes
+# after z in iso14651_t1_common.
+#
+# This led to several bugs and problems with user scripts that do not
+# expect [a-z] to match uppercase characters.
+#
+# See the following bugs:
+# https://sourceware.org/bugzilla/show_bug.cgi?id=23393
+# https://sourceware.org/bugzilla/show_bug.cgi?id=23420
+#
+# No consensus exists on how best to handle the changes, so the
+# iso14651_t1_common collation element order (CEO) has been changed to
+# deinterlace the a-z and A-Z regions.
+#
+# With the deinterlacing, commit ac3a3b4b0d561d776b60317d6a926050c8541655
+# could be reverted to re-test the correct non-interleaved expectations.
+#
+# Please note that despite the region being deinterlaced, the ordering
+# of collation remains the same. In glibc we implement CEO and because of
+# that we can reorder the elements to reorder ranges without impacting
+# collation which depends on weights. The collation element ordering
+# could have been changed to include just a-z, A-Z, and 0-9 in three
+# distinct blocks, but this needs more discussion by the community.
# B.6 004(C)
C "!#%+,-./01234567889" "!#%+,-./01234567889" 0
@@ -418,47 +475,21 @@ C "-" "[Z-\\]]" NOMATCH
# Following are tests outside the scope of IEEE 2003.2 since they are using
# locales other than the C locale. The main focus of the tests is on the
# handling of ranges and the recognition of character (vs bytes).
-#
-# See:
-#
-# http://pubs.opengroup.org/onlinepubs/7908799/xbd/re.html
-#
-# > A range expression represents the set of collating elements that fall
-# > between two elements in the current collation sequence,
-# > inclusively. It is expressed as the starting point and the ending
-# > point separated by a hyphen (-).
-# >
-# > Range expressions must not be used in portable applications because
-# > their behaviour is dependent on the collating sequence. Ranges will be
-# > treated according to the current collating sequence, and include such
-# > characters that fall within the range based on that collating
-# > sequence, regardless of character values. This, however, means that
-# > the interpretation will differ depending on collating sequence. If,
-# > for instance, one collating sequence defines ä as a variant of a,
-# > while another defines it as a letter following z, then the expression
-# > [ä-z] is valid in the first language and invalid in the second.
-#
-# Therefore, using [a-z] does not make much sense except in the C/POSIX locale.
-# The new iso14651_t1_common lists upper case and lower case Latin characters
-# in a different order than the old one which causes surprising results
-# for example in the de_DE locale: [a-z] now includes A because A comes
-# after a in iso14651_t1_common but does not include Z because that comes
-# after z in iso14651_t1_common.
de_DE.ISO-8859-1 "a" "[a-z]" 0
de_DE.ISO-8859-1 "z" "[a-z]" 0
de_DE.ISO-8859-1 "ä" "[a-z]" 0
de_DE.ISO-8859-1 "ö" "[a-z]" 0
de_DE.ISO-8859-1 "ü" "[a-z]" 0
-de_DE.ISO-8859-1 "A" "[a-z]" 0 # surprising but correct!
+de_DE.ISO-8859-1 "A" "[a-z]" NOMATCH
de_DE.ISO-8859-1 "Z" "[a-z]" NOMATCH
-de_DE.ISO-8859-1 "Ä" "[a-z]" 0 # surprising but correct!
-de_DE.ISO-8859-1 "Ö" "[a-z]" 0 # surprising but correct!
-de_DE.ISO-8859-1 "Ü" "[a-z]" 0 # surprising but correct!
+de_DE.ISO-8859-1 "Ä" "[a-z]" NOMATCH
+de_DE.ISO-8859-1 "Ö" "[a-z]" NOMATCH
+de_DE.ISO-8859-1 "Ü" "[a-z]" NOMATCH
de_DE.ISO-8859-1 "a" "[A-Z]" NOMATCH
-de_DE.ISO-8859-1 "z" "[A-Z]" 0 # surprising but correct!
-de_DE.ISO-8859-1 "ä" "[A-Z]" 0 # surprising but correct!
-de_DE.ISO-8859-1 "ö" "[A-Z]" 0 # surprising but correct!
-de_DE.ISO-8859-1 "ü" "[A-Z]" 0 # surprising but correct!
+de_DE.ISO-8859-1 "z" "[A-Z]" NOMATCH
+de_DE.ISO-8859-1 "ä" "[A-Z]" NOMATCH
+de_DE.ISO-8859-1 "ö" "[A-Z]" NOMATCH
+de_DE.ISO-8859-1 "ü" "[A-Z]" NOMATCH
de_DE.ISO-8859-1 "A" "[A-Z]" 0
de_DE.ISO-8859-1 "Z" "[A-Z]" 0
de_DE.ISO-8859-1 "Ä" "[A-Z]" 0
@@ -536,21 +567,31 @@ de_DE.ISO-8859-1 "ba" "[[.a.]]a" NOMATCH
# And with a multibyte character set.
+en_US.UTF-8 "a" "[a-z]" 0
+en_US.UTF-8 "z" "[a-z]" 0
+en_US.UTF-8 "A" "[a-z]" NOMATCH
+en_US.UTF-8 "Z" "[a-z]" NOMATCH
+en_US.UTF-8 "a" "[A-Z]" NOMATCH
+en_US.UTF-8 "z" "[A-Z]" NOMATCH
+en_US.UTF-8 "A" "[A-Z]" 0
+en_US.UTF-8 "Z" "[A-Z]" 0
+en_US.UTF-8 "0" "[0-9]" 0
+en_US.UTF-8 "9" "[0-9]" 0
de_DE.UTF-8 "a" "[a-z]" 0
de_DE.UTF-8 "z" "[a-z]" 0
de_DE.UTF-8 "ä" "[a-z]" 0
de_DE.UTF-8 "ö" "[a-z]" 0
de_DE.UTF-8 "ü" "[a-z]" 0
-de_DE.UTF-8 "A" "[a-z]" 0 # surprising but correct!
+de_DE.UTF-8 "A" "[a-z]" NOMATCH
de_DE.UTF-8 "Z" "[a-z]" NOMATCH
-de_DE.UTF-8 "Ä" "[a-z]" 0 # surprising but correct!
-de_DE.UTF-8 "Ö" "[a-z]" 0 # surprising but correct!
-de_DE.UTF-8 "Ü" "[a-z]" 0 # surprising but correct!
+de_DE.UTF-8 "Ä" "[a-z]" NOMATCH
+de_DE.UTF-8 "Ö" "[a-z]" NOMATCH
+de_DE.UTF-8 "Ü" "[a-z]" NOMATCH
de_DE.UTF-8 "a" "[A-Z]" NOMATCH
-de_DE.UTF-8 "z" "[A-Z]" 0 # surprising but correct!
-de_DE.UTF-8 "ä" "[A-Z]" 0 # surprising but correct!
-de_DE.UTF-8 "ö" "[A-Z]" 0 # surprising but correct!
-de_DE.UTF-8 "ü" "[A-Z]" 0 # surprising but correct!
+de_DE.UTF-8 "z" "[A-Z]" NOMATCH
+de_DE.UTF-8 "ä" "[A-Z]" NOMATCH
+de_DE.UTF-8 "ö" "[A-Z]" NOMATCH
+de_DE.UTF-8 "ü" "[A-Z]" NOMATCH
de_DE.UTF-8 "A" "[A-Z]" 0
de_DE.UTF-8 "Z" "[A-Z]" 0
de_DE.UTF-8 "Ä" "[A-Z]" 0