summaryrefslogtreecommitdiff
path: root/locale/programs/linereader.c
diff options
context:
space:
mode:
Diffstat (limited to 'locale/programs/linereader.c')
-rw-r--r--locale/programs/linereader.c452
1 files changed, 321 insertions, 131 deletions
diff --git a/locale/programs/linereader.c b/locale/programs/linereader.c
index 31278d63c2..99ed0f2480 100644
--- a/locale/programs/linereader.c
+++ b/locale/programs/linereader.c
@@ -1,6 +1,6 @@
/* Copyright (C) 1996, 1997, 1998, 1999 Free Software Foundation, Inc.
This file is part of the GNU C Library.
- Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1996.
+ Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
@@ -28,22 +28,20 @@
#include <stdlib.h>
#include <string.h>
+#include "charmap.h"
#include "error.h"
#include "linereader.h"
-#include "charset.h"
+#include "localedef.h"
#include "stringtrans.h"
-void *xmalloc (size_t __n);
-void *xrealloc (void *__p, size_t __n);
-char *xstrdup (const char *__str);
-
-
+/* Prototypes for local functions. */
static struct token *get_toplvl_escape (struct linereader *lr);
static struct token *get_symname (struct linereader *lr);
static struct token *get_ident (struct linereader *lr);
static struct token *get_string (struct linereader *lr,
- const struct charset_t *charset);
+ const struct charmap_t *charmap,
+ const struct repertoire_t *repertoire);
struct linereader *
@@ -126,9 +124,14 @@ lr_next (struct linereader *lr)
if (n > 1 && lr->buf[n - 2] == lr->escape_char && lr->buf[n - 1] == '\n')
{
+#if 0
+ /* XXX Is this correct? */
/* An escaped newline character is substituted with a single <SP>. */
--n;
lr->buf[n - 1] = ' ';
+#else
+ n -= 2;
+#endif
}
lr->buf[n] = '\0';
@@ -149,7 +152,8 @@ extern char *program_name;
struct token *
-lr_token (struct linereader *lr, const struct charset_t *charset)
+lr_token (struct linereader *lr, const struct charmap_t *charmap,
+ const struct repertoire_t *repertoire)
{
int ch;
@@ -193,12 +197,29 @@ lr_token (struct linereader *lr, const struct charset_t *charset)
return get_toplvl_escape (lr);
/* Match ellipsis. */
- if (ch == '.' && strncmp (&lr->buf[lr->idx], "..", 2) == 0)
+ if (ch == '.')
{
- lr_getc (lr);
- lr_getc (lr);
- lr->token.tok = tok_ellipsis;
- return &lr->token;
+ if (strncmp (&lr->buf[lr->idx], "...", 3) == 0)
+ {
+ lr_getc (lr);
+ lr_getc (lr);
+ lr_getc (lr);
+ lr->token.tok = tok_ellipsis4;
+ return &lr->token;
+ }
+ if (strncmp (&lr->buf[lr->idx], "..", 2) == 0)
+ {
+ lr_getc (lr);
+ lr_getc (lr);
+ lr->token.tok = tok_ellipsis3;
+ return &lr->token;
+ }
+ if (lr->buf[lr->idx] == '.')
+ {
+ lr_getc (lr);
+ lr->token.tok = tok_ellipsis2;
+ return &lr->token;
+ }
}
switch (ch)
@@ -238,7 +259,7 @@ lr_token (struct linereader *lr, const struct charset_t *charset)
return &lr->token;
case '"':
- return get_string (lr, charset);
+ return get_string (lr, charmap, repertoire);
case '-':
ch = lr_getc (lr);
@@ -261,7 +282,7 @@ get_toplvl_escape (struct linereader *lr)
/* This is supposed to be a numeric value. We return the
numerical value and the number of bytes. */
size_t start_idx = lr->idx - 1;
- unsigned int value = 0;
+ char *bytes = lr->token.val.charcode.bytes;
int nbytes = 0;
int ch;
@@ -287,11 +308,11 @@ get_toplvl_escape (struct linereader *lr)
|| (base != 16 && (ch < '0' || ch >= (int) ('0' + base))))
{
esc_error:
- lr->token.val.str.start = &lr->buf[start_idx];
+ lr->token.val.str.startmb = &lr->buf[start_idx];
while (ch != EOF && !isspace (ch))
ch = lr_getc (lr);
- lr->token.val.str.len = lr->idx - start_idx;
+ lr->token.val.str.lenmb = lr->idx - start_idx;
lr->token.tok = tok_error;
return &lr->token;
@@ -300,7 +321,7 @@ get_toplvl_escape (struct linereader *lr)
if (isdigit (ch))
byte = ch - '0';
else
- byte = _tolower (ch) - 'a' + 10;
+ byte = tolower (ch) - 'a' + 10;
ch = lr_getc (lr);
if ((base == 16 && !isxdigit (ch))
@@ -311,7 +332,7 @@ get_toplvl_escape (struct linereader *lr)
if (isdigit (ch))
byte += ch - '0';
else
- byte += _tolower (ch) - 'a' + 10;
+ byte += tolower (ch) - 'a' + 10;
ch = lr_getc (lr);
if (base != 16 && isdigit (ch))
@@ -322,10 +343,7 @@ get_toplvl_escape (struct linereader *lr)
ch = lr_getc (lr);
}
- value *= 256;
- value += byte;
-
- ++nbytes;
+ bytes[nbytes++] = byte;
}
while (ch == lr->escape_char && nbytes < 4);
@@ -335,23 +353,52 @@ get_toplvl_escape (struct linereader *lr)
lr_ungetn (lr, 1);
lr->token.tok = tok_charcode;
- lr->token.val.charcode.val = value;
lr->token.val.charcode.nbytes = nbytes;
return &lr->token;
}
-#define ADDC(ch) \
- do \
- { \
- if (bufact == bufmax) \
- { \
- bufmax *= 2; \
- buf = xrealloc (buf, bufmax); \
- } \
- buf[bufact++] = (ch); \
- } \
+#define ADDC(ch) /* Append the single byte CH to the caller's buffer `buf'. */ \
+ do \
+ { \
+ if (bufact == bufmax) /* Buffer is full: double it before storing. */ \
+ { \
+ bufmax *= 2; \
+ buf = xrealloc (buf, bufmax); \
+ } \
+ buf[bufact++] = (ch); \
+ } \
+ while (0)
+
+
+#define ADDS(s, l) /* Append the L bytes at S to `buf', enlarging it if needed. */ \
+ do \
+ { \
+ size_t _l = (l); \
+ if (bufact + _l > bufmax) \
+ { \
+ if (bufmax < _l) /* Doubling alone is too small for a large _l. */ \
+ bufmax = _l; \
+ bufmax *= 2; /* Now bufmax >= old bufmax + _l >= bufact + _l. */ \
+ buf = xrealloc (buf, bufmax); \
+ } \
+ memcpy (&buf[bufact], s, _l); \
+ bufact += _l; \
+ } \
+ while (0)
+
+
+#define ADDWC(ch) /* Append the wide character CH to the caller's buffer `buf2'. */ \
+ do \
+ { \
+ if (buf2act == buf2max) /* Buffer is full: double it before storing. */ \
+ { \
+ buf2max *= 2; \
+ buf2 = xrealloc (buf2, buf2max * 4); /* NOTE(review): 4 is presumably sizeof (uint32_t) -- confirm. */ \
+ } \
+ buf2[buf2act++] = (ch); \
+ } \
+ while (0)
@@ -399,9 +446,8 @@ get_symname (struct linereader *lr)
if (cp == &buf[bufact - 1])
{
/* Yes, it is. */
- lr->token.tok = bufact == 6 ? tok_ucs2 : tok_ucs4;
- lr->token.val.charcode.val = strtoul (buf, NULL, 16);
- lr->token.val.charcode.nbytes = lr->token.tok == tok_ucs2 ? 2 : 4;
+ lr->token.tok = tok_ucs4;
+ lr->token.val.ucs4 = strtoul (buf + 1, NULL, 16);
return &lr->token;
}
@@ -422,8 +468,8 @@ get_symname (struct linereader *lr)
buf[bufact] = '\0';
buf = xrealloc (buf, bufact + 1);
- lr->token.val.str.start = buf;
- lr->token.val.str.len = bufact - 1;
+ lr->token.val.str.startmb = buf;
+ lr->token.val.str.lenmb = bufact - 1;
}
return &lr->token;
@@ -446,8 +492,18 @@ get_ident (struct linereader *lr)
while (!isspace ((ch = lr_getc (lr))) && ch != '"' && ch != ';'
&& ch != '<' && ch != ',')
- /* XXX Handle escape sequences? */
- ADDC (ch);
+ {
+ if (ch == lr->escape_char)
+ {
+ ch = lr_getc (lr);
+ if (ch == '\n' || ch == EOF)
+ {
+ lr_error (lr, _("invalid escape sequence"));
+ break;
+ }
+ }
+ ADDC (ch);
+ }
lr_ungetn (lr, 1);
@@ -465,8 +521,8 @@ get_ident (struct linereader *lr)
buf[bufact] = '\0';
buf = xrealloc (buf, bufact + 1);
- lr->token.val.str.start = buf;
- lr->token.val.str.len = bufact;
+ lr->token.val.str.startmb = buf;
+ lr->token.val.str.lenmb = bufact;
}
return &lr->token;
@@ -474,113 +530,247 @@ get_ident (struct linereader *lr)
static struct token *
-get_string (struct linereader *lr, const struct charset_t *charset)
+get_string (struct linereader *lr, const struct charmap_t *charmap,
+ const struct repertoire_t *repertoire)
{
- int illegal_string = 0;
- char *buf, *cp;
+ int return_widestr = lr->return_widestr;
+ char *buf;
+ char *buf2 = NULL;
size_t bufact;
size_t bufmax = 56;
- int ch;
+ /* We must return two different strings. */
buf = xmalloc (bufmax);
bufact = 0;
- while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
- if (ch != '<' || charset == NULL)
- {
- if (ch == lr->escape_char)
- {
- ch = lr_getc (lr);
- if (ch == '\n' || ch == EOF)
- break;
- }
+ /* We know it'll be a string. */
+ lr->token.tok = tok_string;
+
+ /* If we need not translate the strings (i.e., expand <...> parts)
+ we can run a simple loop. */
+ if (!lr->translate_strings)
+ {
+ int ch;
+
+ buf2 = NULL;
+ while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
ADDC (ch);
- }
- else
- {
- /* We have to get the value of the symbol. */
- unsigned int value;
- size_t startidx = bufact;
-
- if (!lr->translate_strings)
- ADDC ('<');
-
- while ((ch = lr_getc (lr)) != '>' && ch != '\n' && ch != EOF)
- {
- if (ch == lr->escape_char)
- {
- ch = lr_getc (lr);
- if (ch == '\n' || ch == EOF)
- break;
- }
- ADDC (ch);
- }
-
- if (ch == '\n' || ch == EOF)
- lr_error (lr, _("unterminated string"));
- else
- if (!lr->translate_strings)
- ADDC ('>');
-
- if (lr->translate_strings)
- {
- value = charset_find_value (&charset->char_table, &buf[startidx],
- bufact - startidx);
- if ((wchar_t) value == ILLEGAL_CHAR_VALUE)
+
+ /* Catch errors with trailing escape character. */
+ if (bufact > 0 && buf[bufact - 1] == lr->escape_char
+ && (bufact == 1 || buf[bufact - 2] != lr->escape_char))
+ {
+ lr_error (lr, _("illegal escape sequence at end of string"));
+ --bufact;
+ }
+ else if (ch == '\n' || ch == EOF)
+ lr_error (lr, _("unterminated string"));
+
+ ADDC ('\0');
+ }
+ else
+ {
+ int illegal_string = 0;
+ size_t buf2act = 0;
+ size_t buf2max = 56 * sizeof (uint32_t);
+ int ch;
+ int warned = 0;
+
+ /* We have to provide the wide character result as well. */
+ if (return_widestr)
+ buf2 = xmalloc (buf2max);
+
+ /* Read until the end of the string (or end of the line or file). */
+ while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
+ {
+ size_t startidx;
+ uint32_t wch;
+ struct charseq *seq;
+
+ if (ch != '<')
+ {
+ /* The standards leave it up to the implementation to decide
+ what to do with character which stand for themself. We
+ could jump through hoops to find out the value relative to
+ the charmap and the repertoire map, but instead we leave
+ it up to the locale definition author to write a better
+ definition. We assume here that every character which
+ stands for itself is encoded using ISO 8859-1. Using the
+ escape character is allowed. */
+ if (ch == lr->escape_char)
+ {
+ ch = lr_getc (lr);
+ if (ch == '\n' || ch == EOF)
+ break;
+ }
+
+ if (verbose && !warned)
+ {
+ lr_error (lr, _("\
+non-symbolic character value should not be used"));
+ warned = 1;
+ }
+
+ ADDC (ch);
+ if (return_widestr)
+ ADDWC ((uint32_t) ch);
+
+ continue;
+ }
+
+ /* Now we have to search for the end of the symbolic name, i.e.,
+ the closing '>'. */
+ startidx = bufact;
+ while ((ch = lr_getc (lr)) != '>' && ch != '\n' && ch != EOF)
+ {
+ if (ch == lr->escape_char)
+ {
+ ch = lr_getc (lr);
+ if (ch == '\n' || ch == EOF)
+ break;
+ }
+ ADDC (ch);
+ }
+ if (ch == '\n' || ch == EOF)
+ /* Not a correct string. */
+ break;
+ if (bufact == startidx)
+ {
+ /* <> is no correct name. Ignore it and also signal an
+ error. */
illegal_string = 1;
- bufact = startidx;
+ continue;
+ }
- if (bufmax - bufact < 8)
- {
- bufmax *= 2;
- buf = (char *) xrealloc (buf, bufmax);
- }
+ /* It might be a Uxxxx symbol. */
+ if (buf[startidx] == 'U'
+ && (bufact - startidx == 5 || bufact - startidx == 9))
+ {
+ char *cp = buf + startidx + 1;
+ while (cp < &buf[bufact] && isxdigit (*cp))
+ ++cp;
+
+ if (cp == &buf[bufact])
+ {
+ const char *symbol = NULL;
+
+ /* Yes, it is. */
+ ADDC ('\0');
+ wch = strtoul (buf + startidx + 1, NULL, 16);
+
+ /* Now forget about the name we just added. */
+ bufact = startidx;
+
+ if (return_widestr)
+ ADDWC (wch);
+
+ /* Now determine from the repertoire the name of the
+ character and find it in the charmap. */
+ if (repertoire != NULL)
+ symbol = repertoire_find_symbol (repertoire, wch);
+
+ if (symbol == NULL)
+ {
+ /* We cannot generate a string since we cannot map
+ from the Unicode number to the character symbol. */
+ lr_error (lr,
+ _("character <U%0*X> not in repertoire map"),
+ wch > 0xffff ? 8 : 4, wch);
+
+ illegal_string = 1;
+ }
+ else
+ {
+ seq = charmap_find_value (charmap, symbol,
+ strlen (symbol));
+
+ if (seq == NULL)
+ {
+ /* Not a known name. */
+ lr_error (lr,
+ _("symbol `%s' not in charmap"), symbol);
+ illegal_string = 1;
+ }
+ else
+ ADDS (seq->bytes, seq->nbytes);
+ }
+
+ continue;
+ }
+ }
+
+ if (return_widestr)
+ {
+ /* We now have the symbolic name in buf[startidx] to
+ buf[bufact-1]. Now find out the value for this
+ character in the repertoire map as well as in the
+ charmap (in this order). */
+ wch = repertoire_find_value (repertoire, &buf[startidx],
+ bufact - startidx);
+ if (wch == ILLEGAL_CHAR_VALUE)
+ {
+ /* This name is not in the repertoire map. */
+ lr_error (lr, _("symbol `%.*s' not in repertoire map"),
+ bufact - startidx, &buf[startidx]);
+ illegal_string = 1;
+ }
+ else
+ ADDWC (wch);
+ }
+
+ /* Now the same for the multibyte representation. */
+ seq = charmap_find_value (charmap, &buf[startidx],
+ bufact - startidx);
- cp = &buf[bufact];
- if (encode_char (value, &cp))
+ if (seq == NULL)
+ {
+ /* This name is not in the charmap. */
+ lr_error (lr, _("symbol `%.*s' not in charmap"),
+ bufact - startidx, &buf[startidx]);
illegal_string = 1;
- bufact = cp - buf;
- }
- }
+ /* Now forget about the name we just added. */
+ bufact = startidx;
+ }
+ else
+ {
+ /* Now forget about the name we just added. */
+ bufact = startidx;
- /* Catch errors with trailing escape character. */
- if (bufact > 0 && buf[bufact - 1] == lr->escape_char
- && (bufact == 1 || buf[bufact - 2] != lr->escape_char))
- {
- lr_error (lr, _("illegal escape sequence at end of string"));
- --bufact;
- }
- else if (ch == '\n' || ch == EOF)
- lr_error (lr, _("unterminated string"));
+ ADDS (seq->bytes, seq->nbytes);
+ }
+ }
- /* Terminate string if necessary. */
- if (lr->translate_strings)
- {
- cp = &buf[bufact];
- if (encode_char (0, &cp))
- illegal_string = 1;
+ if (ch == '\n' || ch == EOF)
+ {
+ lr_error (lr, _("unterminated string"));
+ illegal_string = 1;
+ }
- bufact = cp - buf;
- }
- else
- ADDC ('\0');
+ if (illegal_string)
+ {
+ free (buf);
+ if (buf2 != NULL)
+ free (buf2);
+ lr->token.val.str.startmb = NULL;
+ lr->token.val.str.lenmb = 0;
- lr->token.tok = tok_string;
+ return &lr->token;
+ }
- if (illegal_string)
- {
- free (buf);
- lr->token.val.str.start = NULL;
- lr->token.val.str.len = 0;
- }
- else
- {
- buf = xrealloc (buf, bufact + 1);
+ ADDC ('\0');
- lr->token.val.str.start = buf;
- lr->token.val.str.len = bufact;
+ if (return_widestr)
+ {
+ ADDWC (0);
+ lr->token.val.str.startwc = xrealloc (buf2,
+ buf2act * sizeof (uint32_t));
+ lr->token.val.str.lenwc = buf2act;
+ }
}
+ lr->token.val.str.startmb = xrealloc (buf, bufact);
+ lr->token.val.str.lenmb = bufact;
+
return &lr->token;
}