summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- ChangeLog 24
-rw-r--r-- locale/C-collate.c 3
-rw-r--r-- locale/categories.def 3
-rw-r--r-- locale/langinfo.h 1
-rw-r--r-- locale/lc-collate.c 18
-rw-r--r-- locale/localeinfo.h 9
-rw-r--r-- locale/programs/ld-collate.c 121
-rw-r--r-- locale/programs/ld-ctype.c 6
-rw-r--r-- locale/weight.h 251
-rw-r--r-- string/strxfrm.c 525
-rw-r--r-- wcsmbs/wcsxfrm.c 23
-rw-r--r-- wctype/wctrans.c 2
12 files changed, 526 insertions, 460 deletions
diff --git a/ChangeLog b/ChangeLog
index 257dee79f4..f0d5a021c6 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,27 @@
+1999-12-25 Ulrich Drepper <drepper@cygnus.com>
+
+ * locale/C-collate.c (_nl_C_LC_COLLATE): Add one more entry for the
+ indirect table.
+ * locale/langinfo.h: Likewise.
+ * locale/categories.def: Likewise. Remove reference to postload
+ functions.
+ * locale/lc-collate.c (_nl_postload_collate): Removed. Also remove
+ __collate_tablemb, __collate_weightmb, and __collate_extramb.
+ * locale/localeinfo.h: Remove declaration for removed variables above.
+ Remove prototype for _nl_init_era_entries.
+ * locale/weight.h: Complete rewrite for new collate implementation.
+ * locale/programs/ld-collate.c: Many changes to make output file
+ usable in strxfrm/strcoll.
+ * string/strxfrm.c: Complete rewrite for new collate implementation.
+ * wcsmbs/wcsxfrm.c: Don't use strxfrm.c, provide dummy implementation
+ locally.
+
+1999-12-25 Shinya Hanataka <hanataka@abyss.rim.or.jp>
+
+ * locale/programs/ld-ctype.c (allocate_arrays): Correctly assign
+ transformation values for chars >255.
+ * wctype/wctrans.c: Return pointer unmodified.
+
1999-12-24 Ulrich Drepper <drepper@cygnus.com>
* sysdeps/posix/system.c (__libc_system): Check whether command
diff --git a/locale/C-collate.c b/locale/C-collate.c
index 94f6e0f60f..7875f5de22 100644
--- a/locale/C-collate.c
+++ b/locale/C-collate.c
@@ -150,12 +150,13 @@ const struct locale_data _nl_C_LC_COLLATE =
_nl_C_name,
NULL, 0, 0, /* no file mapped */
UNDELETABLE,
- 5,
+ 6,
{
{ word: 0 },
{ string: NULL },
{ string: NULL },
{ string: NULL },
+ { string: NULL },
{ string: NULL }
}
};
diff --git a/locale/categories.def b/locale/categories.def
index 06d79ed202..40fc74213c 100644
--- a/locale/categories.def
+++ b/locale/categories.def
@@ -47,7 +47,8 @@ DEFINE_CATEGORY
DEFINE_ELEMENT (_NL_COLLATE_TABLEMB, "collate-tablemb", std, string)
DEFINE_ELEMENT (_NL_COLLATE_WEIGHTMB, "collate-weightmb", std, string)
DEFINE_ELEMENT (_NL_COLLATE_EXTRAMB, "collate-extramb", std, string)
- ), _nl_postload_collate)
+ DEFINE_ELEMENT (_NL_COLLATE_INDIRECTMB, "collate-indirectmb", std, string)
+ ), NO_POSTLOAD)
/* The actual definition of ctype is meaningless here. It is hard coded in
diff --git a/locale/langinfo.h b/locale/langinfo.h
index ff48fab35f..3f39298c17 100644
--- a/locale/langinfo.h
+++ b/locale/langinfo.h
@@ -235,6 +235,7 @@ enum
_NL_COLLATE_TABLEMB,
_NL_COLLATE_WEIGHTMB,
_NL_COLLATE_EXTRAMB,
+ _NL_COLLATE_INDIRECTMB,
_NL_NUM_LC_COLLATE,
/* LC_CTYPE category: character classification.
diff --git a/locale/lc-collate.c b/locale/lc-collate.c
index 02262b5ce2..623be06e26 100644
--- a/locale/lc-collate.c
+++ b/locale/lc-collate.c
@@ -22,21 +22,3 @@
_NL_CURRENT_DEFINE (LC_COLLATE);
-
-const int32_t *__collate_tablemb;
-const unsigned char *__collate_weightmb;
-const unsigned char *__collate_extramb;
-
-/* We are called after loading LC_CTYPE data to load it into
- the variables used by the collation functions and regex. */
-void
-_nl_postload_collate (void)
-{
-#define paste(a,b) paste1(a,b)
-#define paste1(a,b) a##b
-#define current(x) _NL_CURRENT (LC_COLLATE, paste(_NL_COLLATE_,x))
-
- __collate_tablemb = (const int32_t *) current (TABLEMB);
- __collate_weightmb = (const unsigned char *) current (WEIGHTMB);
- __collate_extramb = (const unsigned char *) current (EXTRAMB);
-}
diff --git a/locale/localeinfo.h b/locale/localeinfo.h
index 078e205f4f..ced96ac4a9 100644
--- a/locale/localeinfo.h
+++ b/locale/localeinfo.h
@@ -165,9 +165,6 @@ extern void _nl_unload_locale (struct locale_data *locale);
extern void _nl_remove_locale (int locale, struct locale_data *data);
-/* initialize `era' entries */
-extern void _nl_init_era_entries (void);
-
/* Return `era' entry which corresponds to TP. Used in strftime. */
extern struct era_entry *_nl_get_era_entry (const struct tm *tp);
@@ -180,10 +177,4 @@ extern const char *_nl_get_alt_digit (unsigned int number);
/* Similar, but now for wide characters. */
extern const wchar_t *_nl_get_walt_digit (unsigned int number);
-
-/* Global variables for LC_COLLATE category data. */
-extern const int32_t *__collate_tablemb;
-extern const unsigned char *__collate_extrweightmb;
-extern const unsigned char *__collate_extramb;
-
#endif /* localeinfo.h */
diff --git a/locale/programs/ld-collate.c b/locale/programs/ld-collate.c
index 65229275ff..c629bd477a 100644
--- a/locale/programs/ld-collate.c
+++ b/locale/programs/ld-collate.c
@@ -137,9 +137,6 @@ struct locale_collate_t
/* To make handling of errors easier we have another section. */
struct section_list error_section;
- /* Number of sorting rules given in order_start line. */
- uint32_t nrules;
-
/* Start of the order list. */
struct element_t *start;
@@ -176,7 +173,7 @@ struct locale_collate_t
/* We have a few global variables which are used for reading all
LC_COLLATE category descriptions in all files. */
-static int nrules;
+static uint32_t nrules;
/* These are definitions used by some of the functions for handling
@@ -426,7 +423,7 @@ read_directions (struct linereader *ldfile, struct token *arg,
if (! warned)
{
lr_error (ldfile, _("\
-%s: `%s' mentioned twice in definition of weight %d in category `%s'"),
+%s: `%s' mentioned twice in definition of weight %d"),
"LC_COLLATE", "position", cnt + 1);
}
}
@@ -450,7 +447,13 @@ read_directions (struct linereader *ldfile, struct token *arg,
/* See whether we have to increment the counter. */
if (arg->tok != tok_comma && rules[cnt] != 0)
- ++cnt;
+ {
+ /* Add the default `forward' if we have seen only `position'. */
+ if (rules[cnt] == sort_position)
+ rules[cnt] = sort_position | sort_forward;
+
+ ++cnt;
+ }
if (arg->tok == tok_eof || arg->tok == tok_eol)
/* End of line or file, so we exit the loop. */
@@ -876,7 +879,7 @@ insert_value (struct linereader *ldfile, struct token *arg,
elem->nmbs = seq->nbytes;
}
- if (elem->wcs == NULL && seq != ILLEGAL_CHAR_VALUE)
+ if (elem->wcs == NULL && wc != ILLEGAL_CHAR_VALUE)
{
uint32_t wcs[2] = { wc, 0 };
@@ -1552,7 +1555,7 @@ collate_finish (struct localedef_t *locale, struct charmap_t *charmap)
}
-static inline int32_t
+static int32_t
output_weight (struct obstack *pool, struct locale_collate_t *collate,
struct element_t *elem)
{
@@ -1575,25 +1578,18 @@ output_weight (struct obstack *pool, struct locale_collate_t *collate,
int len = 0;
int i;
- /* Add the direction. */
- obstack_1grow (pool, elem->section->rules[cnt]);
-
for (i = 0; i < elem->weights[cnt].cnt; ++i)
- /* Encode the weight value. */
- if (elem->weights[cnt].w[i] == NULL)
- {
- /* This entry was IGNORE. */
- buf[len++] = IGNORE_CHAR;
- }
- else
+ /* Encode the weight value. We do nothing for IGNORE entries. */
+ if (elem->weights[cnt].w[i] != NULL)
len += utf8_encode (&buf[len],
elem->weights[cnt].w[i]->mborder[cnt]);
/* And add the buffer content. */
+ obstack_1grow (pool, len);
obstack_grow (pool, buf, len);
}
- return retval;
+ return retval | ((elem->section->ruleidx & 0x7f) << 24);
}
@@ -1611,11 +1607,13 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap,
int32_t tablemb[256];
struct obstack weightpool;
struct obstack extrapool;
+ struct obstack indirectpool;
struct section_list *sect;
int i;
obstack_init (&weightpool);
obstack_init (&extrapool);
+ obstack_init (&indirectpool);
data.magic = LIMAGIC (LC_COLLATE);
data.n = nelems;
@@ -1629,7 +1627,7 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap,
cnt = 0;
assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_NRULES));
- iov[2 + cnt].iov_base = &collate->nrules;
+ iov[2 + cnt].iov_base = &nrules;
iov[2 + cnt].iov_len = sizeof (uint32_t);
idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
++cnt;
@@ -1638,7 +1636,12 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap,
for (sect = collate->sections, i = 0; sect != NULL; sect = sect->next)
if (sect->ruleidx == i)
{
- obstack_grow (&weightpool, sect->rules, nrules);
+ int j;
+
+ obstack_make_room (&weightpool, nrules);
+
+ for (j = 0; j < nrules; ++j)
+ obstack_1grow_fast (&weightpool, sect->rules[j]);
++i;
}
/* And align the output. */
@@ -1674,7 +1677,7 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap,
&& collate->mbheads[ch]->nmbs == 1)
{
tablemb[ch] = output_weight (&weightpool, collate,
- collate->mbheads[ch]);
+ collate->mbheads[ch]);
}
else
{
@@ -1719,38 +1722,60 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap,
{
int i;
- /* More than one consecutive entry. We mark this by having
- a negative index into the weight table. */
- weightidx = -weightidx;
-
/* Now add first the initial byte sequence. */
added = ((sizeof (int32_t) + 1 + 1 + 2 * (runp->nmbs - 1)
+ __alignof__ (int32_t) - 1)
& ~(__alignof__ (int32_t) - 1));
obstack_make_room (&extrapool, added);
+ /* More than one consecutive entry. We mark this by having
+ a negative index into the indirect table. */
if (sizeof (int32_t) == sizeof (int))
- obstack_int_grow_fast (&extrapool, weightidx);
+ obstack_int_grow_fast (&extrapool,
+ obstack_object_size (&indirectpool)
+ / sizeof (int32_t));
else
- obstack_grow (&extrapool, &weightidx, sizeof (int32_t));
- obstack_1grow_fast (&extrapool, runp->section->ruleidx);
+ {
+ int32_t i = (obstack_object_size (&indirectpool)
+ / sizeof (int32_t));
+ obstack_grow (&extrapool, &i, sizeof (int32_t));
+ }
obstack_1grow_fast (&extrapool, runp->nmbs - 1);
for (i = 1; i < runp->nmbs; ++i)
obstack_1grow_fast (&extrapool, runp->mbs[i]);
- /* Now find the end of the consecutive sequence. */
- do
- runp = runp->next;
- while (runp->mbnext != NULL
- && runp->nmbs == runp->mbnext->nmbs
- && memcmp (runp->mbs, runp->mbnext->mbs,
- runp->nmbs - 1) == 0
- && (runp->mbs[runp->nmbs - 1] + 1
- == runp->mbnext->mbs[runp->nmbs - 1]));
-
- /* And add the end by sequence. Without length this time. */
+ /* Now find the end of the consecutive sequence and
+ add all the indices in the indirect pool. */
+ while (1)
+ {
+ if (sizeof (int32_t) == sizeof (int))
+ obstack_int_grow_fast (&extrapool, weightidx);
+ else
+ obstack_grow (&extrapool, &weightidx, sizeof (int32_t));
+
+ runp = runp->next;
+ if (runp->mbnext == NULL
+ || runp->nmbs != runp->mbnext->nmbs
+ || memcmp (runp->mbs, runp->mbnext->mbs,
+ runp->nmbs - 1) != 0
+ || (runp->mbs[runp->nmbs - 1] + 1
+ != runp->mbnext->mbs[runp->nmbs - 1]))
+ break;
+
+ /* Insert the weight. */
+ weightidx = output_weight (&weightpool, collate, runp);
+ }
+
+ /* And add the end byte sequence. Without length this
+ time. */
for (i = 1; i < runp->nmbs; ++i)
obstack_1grow_fast (&extrapool, runp->mbs[i]);
+
+ weightidx = output_weight (&weightpool, collate, runp);
+ if (sizeof (int32_t) == sizeof (int))
+ obstack_int_grow_fast (&extrapool, weightidx);
+ else
+ obstack_grow (&extrapool, &weightidx, sizeof (int32_t));
}
else
{
@@ -1768,7 +1793,6 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap,
obstack_int_grow_fast (&extrapool, weightidx);
else
obstack_grow (&extrapool, &weightidx, sizeof (int32_t));
- obstack_1grow_fast (&extrapool, runp->section->ruleidx);
obstack_1grow_fast (&extrapool, runp->nmbs - 1);
for (i = 1; i < runp->nmbs; ++i)
obstack_1grow_fast (&extrapool, runp->mbs[i]);
@@ -1835,6 +1859,12 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap,
idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
++cnt;
+ assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_INDIRECTMB));
+ iov[2 + cnt].iov_len = obstack_object_size (&indirectpool);
+ iov[2 + cnt].iov_base = obstack_finish (&indirectpool);
+ idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
+ ++cnt;
+
assert (cnt == _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE));
@@ -1842,6 +1872,7 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap,
obstack_free (&weightpool, NULL);
obstack_free (&extrapool, NULL);
+ obstack_free (&indirectpool, NULL);
}
@@ -2291,16 +2322,16 @@ error while adding equivalent collating symbol"));
uint32_t cnt;
/* This means we have exactly one rule: `forward'. */
- if (collate->nrules > 1)
+ if (nrules > 1)
lr_error (ldfile, _("\
%s: invalid number of sorting rules"),
"LC_COLLATE");
else
- collate->nrules = 1;
+ nrules = 1;
sp->rules = obstack_alloc (&collate->mempool,
(sizeof (enum coll_sort_rule)
- * collate->nrules));
- for (cnt = 0; cnt < collate->nrules; ++cnt)
+ * nrules));
+ for (cnt = 0; cnt < nrules; ++cnt)
sp->rules[cnt] = sort_forward;
/* Next line. */
diff --git a/locale/programs/ld-ctype.c b/locale/programs/ld-ctype.c
index 86d086021d..d98b7bdfd2 100644
--- a/locale/programs/ld-ctype.c
+++ b/locale/programs/ld-ctype.c
@@ -3073,10 +3073,8 @@ Computing table size for character classes might take a while..."),
while (idx2 < ctype->map_collection_act[idx])
{
if (ctype->map_collection[idx][idx2] != 0)
- *find_idx (ctype, &ctype->map32[idx],
- &ctype->map_collection_max[idx],
- &ctype->map_collection_act[idx],
- ctype->names[idx2]) = ctype->map_collection[idx][idx2];
+ ctype->map32[idx][ctype->charnames[idx2]] =
+ ctype->map_collection[idx][idx2];
++idx2;
}
}
diff --git a/locale/weight.h b/locale/weight.h
index 6e31e2d495..356ee57855 100644
--- a/locale/weight.h
+++ b/locale/weight.h
@@ -17,191 +17,106 @@
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
-#include <alloca.h>
-#include <errno.h>
-#include <langinfo.h>
-#include "localeinfo.h"
-
-#ifndef STRING_TYPE
-# error STRING_TYPE not defined
-#endif
+/* Find index of weight. */
+static inline int32_t
+findidx (const unsigned char **cpp)
+{
+ int_fast32_t i = table[*(*cpp)++];
+ const unsigned char *cp;
-#ifndef USTRING_TYPE
-# error USTRING_TYPE not defined
-#endif
+ if (i >= 0)
+ /* This is an index into the weight table. Cool. */
+ return i;
-typedef struct weight_t
-{
- struct weight_t *prev;
- struct weight_t *next;
- struct data_pair
+ /* Oh well, more than one sequence starting with this byte.
+ Search for the correct one. */
+ cp = &extra[-i];
+ while (1)
{
- int number;
- const uint32_t *value;
- } data[0];
-} weight_t;
-
-
-/* The following five macros grant access to the values in the
- collate locale file that do not depend on byte order. */
-#ifndef USE_IN_EXTENDED_LOCALE_MODEL
-# define collate_nrules \
- (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES))
-# define collate_hash_size \
- (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_HASH_SIZE))
-# define collate_hash_layers \
- (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_HASH_LAYERS))
-# define collate_undefined \
- (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_UNDEFINED_WC))
-# define collate_rules \
- ((uint32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_RULES))
-
-static __inline void get_weight (const STRING_TYPE **str, weight_t *result);
-static __inline void
-get_weight (const STRING_TYPE **str, weight_t *result)
-#else
-# define collate_nrules \
- current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word
-# define collate_hash_size \
- current->values[_NL_ITEM_INDEX (_NL_COLLATE_HASH_SIZE)].word
-# define collate_hash_layers \
- current->values[_NL_ITEM_INDEX (_NL_COLLATE_HASH_LAYERS)].word
-# define collate_undefined \
- current->values[_NL_ITEM_INDEX (_NL_COLLATE_UNDEFINED_WC)].word
-# define collate_rules \
- ((uint32_t *) current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULES)].string)
-
-static __inline void get_weight (const STRING_TYPE **str, weight_t *result,
- struct locale_data *current,
- const uint32_t *__collate_tablewc,
- const uint32_t *__collate_extrawc);
-static __inline void
-get_weight (const STRING_TYPE **str, weight_t *result,
- struct locale_data *current, const uint32_t *__collate_tablewc,
- const uint32_t *__collate_extrawc)
-#endif
-{
- unsigned int ch = *((USTRING_TYPE *) (*str))++;
- size_t slot;
+ size_t nhere;
+ const unsigned char *usrc = *cpp;
- if (sizeof (STRING_TYPE) == 1)
- slot = ch * (collate_nrules + 1);
- else
- {
- const size_t level_size = collate_hash_size * (collate_nrules + 1);
- size_t level;
+ /* The first thing is the index. */
+ i = *((int32_t *) cp);
+ cp += sizeof (int32_t);
- slot = (ch % collate_hash_size) * (collate_nrules + 1);
+ /* Next is the length of the byte sequence. These are always
+ short byte sequences so there is no reason to call any
+ function (even if they are inlined). */
+ nhere = *cp++;
- level = 0;
- while (__collate_tablewc[slot] != (uint32_t) ch)
+ if (i >= 0)
{
- if (__collate_tablewc[slot + 1] == 0
- || ++level >= collate_hash_layers)
- {
- size_t idx = collate_undefined;
- size_t cnt;
+ /* It is a single character. If it matches we found our
+ index. Note that at the end of each list there is an
+ entry of length zero which represents the single byte
+ sequence. The first (and here only) byte was tested
+ already. */
+ size_t cnt;
- for (cnt = 0; cnt < collate_nrules; ++cnt)
- {
- result->data[cnt].number = __collate_extrawc[idx++];
- result->data[cnt].value = &__collate_extrawc[idx];
- idx += result->data[cnt].number;
- }
- /* The Unix standard requires that a character outside
- the domain is signalled by setting `errno'. */
- __set_errno (EINVAL);
- return;
- }
- slot += level_size;
- }
- }
+ for (cnt = 0; cnt < nhere; ++cnt)
+ if (cp[cnt] != usrc[cnt])
+ break;
- if (__collate_tablewc[slot + 1] != (uint32_t) FORWARD_CHAR)
- {
- /* We have a simple form. One value for each weight. */
- size_t cnt;
+ if (cnt == nhere)
+ {
+ /* Found it. */
+ *cpp += nhere;
+ return i;
+ }
- for (cnt = 0; cnt < collate_nrules; ++cnt)
- {
- result->data[cnt].number = 1;
- result->data[cnt].value = &__collate_tablewc[slot + 1 + cnt];
+ /* Up to the next entry. */
+ cp += nhere;
}
- return;
- }
+ else
+ {
+ /* This is a range of characters. First decide whether the
+ current byte sequence lies in the range. */
+ size_t cnt;
+ size_t offset = 0;
- /* We now look for any collation element which starts with CH.
- There might none, but the last list member is a catch-all case
- because it is simple the character CH. The value of this entry
- might be the same as UNDEFINED. */
- slot = __collate_tablewc[slot + 2];
+ for (cnt = 0; cnt < nhere; ++cnt)
+ if (cp[cnt] != usrc[cnt])
+ break;
- while (1)
- {
- size_t idx;
+ if (cnt != nhere)
+ {
+ if (cp[cnt] > usrc[cnt])
+ {
+ /* Cannot be in this range. */
+ cp += 2 * nhere;
+ continue;
+ }
- /* This is a comparison between a uint32_t array (aka wchar_t) and
- an 8-bit string. */
- for (idx = 0; __collate_extrawc[slot + 2 + idx] != 0; ++idx)
- if (__collate_extrawc[slot + 2 + idx] != (uint32_t) (*str)[idx])
- break;
+ /* Test against the end of the range. */
+ for (cnt = 0; cnt < nhere; ++cnt)
+ if (cp[nhere + cnt] != usrc[cnt])
+ break;
- /* When the loop finished with all character of the collation
- element used, we found the longest prefix. */
- if (__collate_extrawc[slot + 2 + idx] == 0)
- {
- size_t cnt;
+ if (cnt != nhere && cp[nhere + cnt] < usrc[cnt])
+ {
+ /* Cannot be in this range. */
+ cp += 2 * nhere;
+ continue;
+ }
- *str += idx;
- idx += slot + 3;
- for (cnt = 0; cnt < collate_nrules; ++cnt)
- {
- result->data[cnt].number = __collate_extrawc[idx++];
- result->data[cnt].value = &__collate_extrawc[idx];
- idx += result->data[cnt].number;
+ /* This range matches the next characters. Now find
+ the offset in the indirect table. */
+ for (cnt = 0; cp[cnt] == usrc[cnt]; ++cnt);
+
+ do
+ {
+ offset <<= 8;
+ offset += usrc[cnt] - cp[cnt];
+ }
+ while (++cnt < nhere);
}
- return;
- }
- /* To next entry in list. */
- slot += __collate_extrawc[slot];
+ *cpp += nhere;
+ return offset;
+ }
}
-}
-
-/* To process a string efficiently we retrieve all information about
- the string at once. The following macro constructs a double linked
- list of this information. It is a macro because we use `alloca'
- and we use a double linked list because of the backward collation
- order.
-
- We have this strange extra macro since the functions which use the
- given locale (not the global one) cannot use the global tables. */
-#ifndef USE_IN_EXTENDED_LOCALE_MODEL
-# define call_get_weight(strp, newp) get_weight ((strp), (newp))
-#else
-# define call_get_weight(strp, newp) \
- get_weight ((strp), (newp), current, collate_table, collate_extra)
-#endif
-
-#define get_string(str, forw, backw) \
- do \
- { \
- weight_t *newp; \
- while (*str != '\0') \
- { \
- newp = (weight_t *) alloca (sizeof (weight_t) \
- + (collate_nrules \
- * sizeof (struct data_pair))); \
- \
- newp->prev = backw; \
- if (backw == NULL) \
- forw = newp; \
- else \
- backw->next = newp; \
- newp->next = NULL; \
- backw = newp; \
- call_get_weight (&str, newp); \
- } \
- } \
- while (0)
+ /* NOTREACHED */
+ return 0x43219876;
+}
diff --git a/string/strxfrm.c b/string/strxfrm.c
index 2a3a8a9032..344e65b957 100644
--- a/string/strxfrm.c
+++ b/string/strxfrm.c
@@ -17,282 +17,397 @@
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
+#include <langinfo.h>
#include <stddef.h>
+#include <stdint.h>
#include <stdlib.h>
#include <string.h>
-#ifndef WIDE_VERSION
-# define STRING_TYPE char
-# define USTRING_TYPE unsigned char
-# define L_(Ch) Ch
-# ifdef USE_IN_EXTENDED_LOCALE_MODEL
-# define STRXFRM __strxfrm_l
-# else
-# define STRXFRM strxfrm
-# endif
-# define STRLEN strlen
-# define STPNCPY __stpncpy
-#endif
+#include "../locale/localeinfo.h"
-#ifndef USE_IN_EXTENDED_LOCALE_MODEL
-size_t
-STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n)
+#ifdef USE_IN_EXTENDED_LOCALE_MODEL
+# define STRXFRM __strxfrm_l
#else
-size_t
-STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
+# define STRXFRM strxfrm
#endif
-{
- if (n != 0)
- STPNCPY (dest, src, n);
- return STRLEN (src);
-}
-
-#if 0
-/* Include the shared helper functions. `strxfrm'/`wcsxfrm' also use
- these functions. */
-#include "../locale/weight.h"
-
-#ifndef WIDE_VERSION
-/* Write 32 bit value UTF-8 encoded but only if enough space is left. */
-static __inline size_t
-print_val (u_int32_t value, char *dest, size_t max, size_t act)
+/* These are definitions used by some of the functions for handling
+ UTF-8 encoding below. */
+static const uint32_t encoding_mask[] =
{
- char tmp[6];
- int idx = 0;
+ ~0x7ff, ~0xffff, ~0x1fffff, ~0x3ffffff
+};
- if (value < 0x80)
- tmp[idx++] = (char) value;
- else
- {
- tmp[idx++] = '\x80' + (char) (value & 0x3f);
- value >>= 6;
-
- if (value < 0x20)
- tmp[idx++] = '\xc0' + (char) value;
- else
- {
- tmp[idx++] = '\x80' + (char) (value & 0x3f);
- value >>= 6;
-
- if (value < 0x10)
- tmp[idx++] = '\xe0' + (char) value;
- else
- {
- tmp[idx++] = '\x80' + (char) (value & 0x3f);
- value >>= 6;
-
- if (value < 0x08)
- tmp[idx++] = '\xf0' + (char) value;
- else
- {
- tmp[idx++] = '\x80' + (char) (value & 0x3f);
- value >>= 6;
-
- if (value < 0x04)
- tmp[idx++] = '\xf8' + (char) value;
- else
- {
- tmp[idx++] = '\x80' + (char) (value & 0x3f);
- tmp[idx++] = '\xfc' + (char) (value >> 6);
- }
- }
- }
- }
- }
+static const unsigned char encoding_byte[] =
+{
+ 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
+};
- while (idx-- > 0)
- {
- if (act < max)
- dest[act] = tmp[idx];
- ++act;
- }
- return act;
-}
-#else
-static __inline size_t
-print_val (u_int32_t value, wchar_t *dest, size_t max, size_t act)
+/* We need UTF-8 encoding of numbers. */
+static inline int
+utf8_encode (char *buf, int val)
{
- /* We cannot really assume wchar_t is 32 bits wide. But it is for
- GCC and so we don't do much optimization for the other case. */
- if (sizeof (wchar_t) == 4)
+ char *startp = buf;
+ int retval;
+
+ if (val < 0x80)
{
- if (act < max)
- dest[act] = (wchar_t) value;
- ++act;
+ *buf++ = (char) val;
+ retval = 1;
}
else
{
- wchar_t tmp[3];
- size_t idx = 0;
+ int step;
- if (value < 0x8000)
- tmp[idx++] = (wchar_t) act;
- else
- {
- tmp[idx++] = (wchar_t) (0x8000 + (value & 0x3fff));
- value >>= 14;
- if (value < 0x2000)
- tmp[idx++] = (wchar_t) (0xc000 + value);
- else
- {
- tmp[idx++] = (wchar_t) (0x8000 + (value & 0x3fff));
- value >>= 14;
- tmp[idx++] = (wchar_t) (0xe000 + value);
- }
- }
- while (idx-- > 0)
+ for (step = 2; step < 6; ++step)
+ if ((val & encoding_mask[step - 2]) == 0)
+ break;
+ retval = step;
+
+ *buf = encoding_byte[step - 2];
+ --step;
+ do
{
- if (act < max)
- dest[act] = tmp[idx];
- ++act;
+ buf[step] = 0x80 | (val & 0x3f);
+ val >>= 6;
}
+ while (--step > 0);
+ *buf |= val;
}
- return act;
+
+ return buf - startp;
}
-#endif
-/* Transform SRC into a form such that the result of strcmp
- on two strings that have been transformed by strxfrm is
- the same as the result of strcoll on the two strings before
- their transformation. The transformed string is put in at
- most N characters of DEST and its length is returned. */
#ifndef USE_IN_EXTENDED_LOCALE_MODEL
size_t
-STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n)
+STRXFRM (char *dest, const char *src, size_t n)
#else
size_t
-STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
+STRXFRM (char *dest, const char *src, size_t n, __locale_t l)
#endif
{
#ifdef USE_IN_EXTENDED_LOCALE_MODEL
struct locale_data *current = l->__locales[LC_COLLATE];
-# if BYTE_ORDER == BIG_ENDIAN
- const u_int32_t *collate_table = (const u_int32_t *)
- current->values[_NL_ITEM_INDEX (_NL_COLLATE_TABLE_EB)].string;
- const u_int32_t *collate_extra = (const u_int32_t *)
- current->values[_NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EB)].string;
-# elif BYTE_ORDER == LITTLE_ENDIAN
- const u_int32_t *collate_table = (const u_int32_t *)
- current->values[_NL_ITEM_INDEX (_NL_COLLATE_TABLE_EL)].string;
- const u_int32_t *collate_extra = (const u_int32_t *)
- current->values[_NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EL)].string;
-# else
-# error bizarre byte order
-# endif
+ uint_fast32_t nrules = *((uint32_t *) current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].string);
+#else
+ uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
#endif
- weight_t *forw = NULL;
- weight_t *backw = NULL;
- size_t pass;
- size_t written;
-
- /* If the current locale does not specify locale data we use normal
- 8-bit string comparison. */
- if (collate_nrules == 0)
+ /* We don't assign the following values right away since it might be
+ unnecessary in case there are no rules. */
+ const unsigned char *rulesets;
+ const int32_t *table;
+ const unsigned char *weights;
+ const unsigned char *extra;
+ const int32_t *indirect;
+ uint_fast32_t pass;
+ size_t needed;
+ const unsigned char *usrc;
+ size_t srclen = strlen (src);
+ int32_t *idxarr;
+ unsigned char *rulearr;
+ size_t idxmax;
+ size_t idxcnt;
+ int use_malloc = 0;
+
+#include "../locale/weight.h"
+
+ if (nrules == 0)
{
if (n != 0)
- STPNCPY (dest, src, n);
+ __stpncpy (dest, src, n);
- return STRLEN (src);
+ return srclen;
}
+#ifdef USE_IN_EXTENDED_LOCALE_MODEL
+ rulesets = (const unsigned char *)
+ current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
+ table = (const int32_t *)
+ current->values[_NL_ITEM_INDEX (_NL_COLLATE_TABLEMB)].string;
+ weights = (const unsigned char *)
+ current->values[_NL_ITEM_INDEX (_NL_COLLATE_WEIGHTMB)].string;
+ extra = (const unsigned char *)
+ current->values[_NL_ITEM_INDEX (_NL_COLLATE_EXTRAMB)].string;
+ indirect = (const int32_t *)
+ current->values[_NL_ITEM_INDEX (_NL_COLLATE_INDIRECTMB)].string;
+#else
+ rulesets = (const unsigned char *)
+ _NL_CURRENT (LC_COLLATE, _NL_COLLATE_RULESETS);
+ table = (const int32_t *)
+ _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
+ weights = (const unsigned char *)
+ _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
+ extra = (const unsigned char *)
+ _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
+ indirect = (const int32_t *)
+ _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
+#endif
+
/* Handle an empty string as a special case. */
- if (*src == '\0')
+ if (srclen == 0)
{
if (n != 0)
- *dest = '\0';
+ *dest = '\0';
return 1;
}
- /* Get full information about the string. This means we get
- information for all passes in a special data structure. */
- get_string (src, forw, backw);
+ /* We need the elements of the string as unsigned values since they
+ are used as indices. */
+ usrc = (const unsigned char *) src;
+
+ /* Perform the first pass over the string and while doing this find
+ and store the weights for each character. Since we want this to
+ be as fast as possible we are using `alloca' to store the temporary
+ values. But since there is no limit on the length of the string
+ we have to use `malloc' if the string is too long. We should be
+ very conservative here. */
+ if (srclen >= 16384)
+ {
+ idxarr = (int32_t *) malloc (srclen * (sizeof (int32_t) + 1));
+ rulearr = (unsigned char *) &idxarr[srclen];
+
+ if (idxarr == NULL)
+ /* No memory. Well, go with the stack then.
+
+ XXX Once this implementation is stable we will handle this
+ differently. Instead of precomputing the indices we will
+ compute them on the fly. This means, though, that this happens for
+ every pass again. */
+ goto try_stack;
+ use_malloc = 1;
+ }
+ else
+ {
+ try_stack:
+ idxarr = (int32_t *) alloca (srclen * sizeof (int32_t));
+ rulearr = (unsigned char *) alloca (srclen);
+ }
- /* Now we have all the information. In at most the given number of
- passes we can finally decide about the order. */
- written = 0;
- for (pass = 0; pass < collate_nrules; ++pass)
+ idxmax = 0;
+ do
{
- int forward = (collate_rules[pass] & sort_forward) != 0;
- const weight_t *run = forward ? forw : backw;
- int idx = forward ? 0 : run->data[pass].number - 1;
+ int32_t tmp = findidx (&usrc);
+ rulearr[idxmax] = tmp >> 24;
+ idxarr[idxmax] = tmp & 0x80ffffff;
- while (1)
+ ++idxmax;
+ }
+ while (*usrc != '\0');
+
+ /* Now the passes over the weights. We now use the indices we found
+ before. */
+ needed = 0;
+ for (pass = 0; pass < nrules; ++pass)
+ {
+ size_t backw_stop = ~0ul;
+ int rule = rulesets[rulearr[0] * nrules + pass];
+ /* We assume that if a rule has defined `position' in one section
+ this is true for all of them. */
+ int position = rule & sort_position;
+
+ if (position == 0)
{
- int ignore = 0;
- u_int32_t w = 0;
-
- /* Here we have to check for IGNORE entries. If these are
- found we count them and go on with he next value. */
- while (run != NULL
- && ((w = run->data[pass].value[idx])
- == (u_int32_t) IGNORE_CHAR))
+ for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
{
- ++ignore;
- if (forward
- ? ++idx >= run->data[pass].number
- : --idx < 0)
+ if ((rule & sort_forward) != 0)
{
- weight_t *nextp = forward ? run->next : run->prev;
- if (nextp == NULL)
+ size_t len;
+
+ if (backw_stop != ~0ul)
{
- w = 0;
- /* No more non-INGOREd elements means lowest
- possible value. */
- ignore = -1;
+ /* Handle the pushed elements now. */
+ size_t backw;
+
+ for (backw = idxcnt - 1; backw >= backw_stop; --backw)
+ {
+ len = weights[idxarr[backw]++];
+
+ if (needed + len < n)
+ while (len-- > 0)
+ dest[needed++] = weights[idxarr[backw]++];
+ else
+ {
+ /* No more characters fit into the buffer. */
+ needed += len;
+ idxarr[backw] += len;
+ }
+ }
+
+ backw_stop = ~0ul;
}
+
+ /* Now handle the forward element. */
+ len = weights[idxarr[idxcnt]++];
+ if (needed + len < n)
+ while (len-- > 0)
+ dest[needed++] = weights[idxarr[idxcnt]++];
else
- idx = forward ? 0 : nextp->data[pass].number - 1;
- run = nextp;
+ {
+ /* No more characters fit into the buffer. */
+ needed += len;
+ idxarr[idxcnt] += len;
+ }
+ }
+ else
+ {
+ /* Remember where the backwards series started. */
+ if (backw_stop == ~0ul)
+ backw_stop = idxcnt;
}
+
+ rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
}
- /* Stop if all characters are processed. */
- if (run == NULL)
- break;
- /* Now we have information of the number of ignored weights
- and the value of the next weight. We have to add 2
- because 0 means EOS and 1 is the intermediate string end. */
- if ((collate_rules[pass] & sort_position) != 0)
- written = print_val (ignore + 2, dest, n, written);
+ if (backw_stop != ~0ul)
+ {
+ /* Handle the pushed elements now. */
+ size_t backw;
- if (w != 0)
- written = print_val (w, dest, n, written);
+ for (backw = idxcnt - 1; backw >= backw_stop; --backw)
+ {
+ size_t len = weights[idxarr[backw]++];
- /* We have to increment the index counters. */
- if (forward)
+ if (needed + len < n)
+ while (len-- > 0)
+ dest[needed++] = weights[idxarr[backw]++];
+ else
+ {
+ /* No more characters fit into the buffer. */
+ needed += len;
+ idxarr[backw] += len;
+ }
+ }
+ }
+ }
+ else
+ {
+ int val = 1;
+ char buf[7];
+ size_t buflen;
+ size_t i;
+
+ for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
{
- if (++idx >= run->data[pass].number)
+ if ((rule & sort_forward) != 0)
+ {
+ size_t len;
+
+ if (backw_stop != ~0ul)
+ {
+ /* Handle the pushed elements now. */
+ size_t backw;
+
+ for (backw = idxcnt - 1; backw >= backw_stop; --backw)
+ {
+ len = weights[idxarr[backw]++];
+ if (len != 0)
+ {
+ buflen = utf8_encode (buf, val);
+ if (needed + buflen + len < n)
+ {
+ for (i = 0; i < buflen; ++i)
+ dest[needed + i] = buf[i];
+ for (i = 0; i < len; ++i)
+ dest[needed + buflen + i] =
+ weights[idxarr[backw] + i];
+ }
+ idxarr[backw] += len;
+ needed += buflen + len;
+ val = 1;
+ }
+ else
+ ++val;
+ }
+
+ backw_stop = ~0ul;
+ }
+
+ /* Now handle the forward element. */
+ len = weights[idxarr[idxcnt]++];
+ if (len != 0)
+ {
+ buflen = utf8_encode (buf, val);
+ if (needed + buflen + len < n)
+ {
+ for (i = 0; i < buflen; ++i)
+ dest[needed + i] = buf[i];
+ for (i = 0; i < len; ++i)
+ dest[needed + buflen + i] =
+ weights[idxarr[idxcnt] + i];
+ }
+ idxarr[idxcnt] += len;
+ needed += buflen + len;
+ val = 1;
+ }
+ else
+ /* Note that we don't have to increment `idxarr[idxcnt]'
+ since the length is zero. */
+ ++val;
+ }
+ else
{
- run = run->next;
- idx = 0;
+ /* Remember where the backwards series started. */
+ if (backw_stop == ~0ul)
+ backw_stop = idxcnt;
}
+
+ rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
}
- else
+
+ if (backw_stop != ~0)
{
- if (--idx < 0)
+ /* Handle the pushed elements now. */
+ size_t backw;
+
+ for (backw = idxmax - 1; backw >= backw_stop; --backw)
{
- run = run->prev;
- if (run != NULL)
- idx = run->data[pass].number - 1;
+ size_t len = weights[idxarr[backw]++];
+ if (len != 0)
+ {
+ buflen = utf8_encode (buf, val);
+ if (needed + buflen + len < n)
+ {
+ for (i = 0; i < buflen; ++i)
+ dest[needed + i] = buf[i];
+ for (i = 0; i < len; ++i)
+ dest[needed + buflen + i] =
+ weights[idxarr[backw] + i];
+ }
+ idxarr[backw] += len;
+ needed += buflen + len;
+ val = 1;
+ }
+ else
+ ++val;
}
}
}
- /* Write marker for end of word. */
- if (pass + 1 < collate_nrules)
- written = print_val (1, dest, n, written);
+ /* Finally store the byte to separate the passes or terminate
+ the string. */
+ if (needed < n)
+ dest[needed] = pass + 1 < nrules ? '\1' : '\0';
+ ++needed;
+ }
+
+ /* This is a little optimization: many collation specifications have
+ a `position' rule at the end and if no non-ignored character
+ is found the last \1 byte is immediately followed by a \0 byte
+ signalling this. We can avoid the \1 byte(s). */
+ if (needed > 2 && dest[needed - 2] == '\1')
+ {
+ /* Remove the \1 byte. */
+ --needed;
+ dest[needed - 1] = '\0';
}
- /* Terminate string. */
- if (written < n)
- dest[written] = L_('\0');
+ /* Free the memory if needed. */
+ if (use_malloc)
+ free (idxarr);
- /* Return length without counting the terminating '\0'. */
- return written;
+ return needed;
}
-#endif
diff --git a/wcsmbs/wcsxfrm.c b/wcsmbs/wcsxfrm.c
index e41251f559..99a359399e 100644
--- a/wcsmbs/wcsxfrm.c
+++ b/wcsmbs/wcsxfrm.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 1996, 1997, 1998 Free Software Foundation, Inc.
+/* Copyright (C) 1996, 1997, 1998, 1999 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1996.
@@ -19,16 +19,23 @@
#include <wchar.h>
-#define WIDE_VERSION 1
-#define STRING_TYPE wchar_t
-#define USTRING_TYPE wint_t
-#define L_(Ch) L##Ch
#ifdef USE_IN_EXTENDED_LOCALE_MODEL
# define STRXFRM __wcsxfrm_l
#else
# define STRXFRM wcsxfrm
#endif
-#define STRLEN __wcslen
-#define STPNCPY __wcpncpy
-#include <string/strxfrm.c>
+
+#ifndef USE_IN_EXTENDED_LOCALE_MODEL
+size_t
+STRXFRM (wchar_t *dest, const wchar_t *src, size_t n)
+#else
+size_t
+STRXFRM (wchar_t *dest, const wchar_t *src, size_t n, __locale_t l)
+#endif
+{
+ if (n != 0)
+ __wcpncpy (dest, src, n);
+
+ return __wcslen (src);
+}
diff --git a/wctype/wctrans.c b/wctype/wctrans.c
index a5b4a32aac..5d7b5bda38 100644
--- a/wctype/wctrans.c
+++ b/wctype/wctrans.c
@@ -52,5 +52,5 @@ wctrans (const char *property)
/* We have to search the table. */
result = (int32_t *) _NL_CURRENT (LC_CTYPE, _NL_NUM_LC_CTYPE + cnt - 2);
- return (wctrans_t) (result + 128);
+ return (wctrans_t) result;
}