summaryrefslogtreecommitdiff
path: root/locale/locfile-lex.c
diff options
context:
space:
mode:
Diffstat (limited to 'locale/locfile-lex.c')
-rw-r--r--locale/locfile-lex.c533
1 files changed, 533 insertions, 0 deletions
diff --git a/locale/locfile-lex.c b/locale/locfile-lex.c
new file mode 100644
index 0000000000..20e4f0f9cd
--- /dev/null
+++ b/locale/locfile-lex.c
@@ -0,0 +1,533 @@
+/* Copyright (C) 1995 Free Software Foundation, Inc.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB. If
+not, write to the Free Software Foundation, Inc., 675 Mass Ave,
+Cambridge, MA 02139, USA. */
+
+#include <ctype.h>
+#include <langinfo.h>
+#include <libintl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "localedef.h"
+#include "token.h"
+
+
+/* Keyword lookup.  `in_word_set' is declared here and "keyword.h" is
+   included right after it; that header is expected to provide the
+   keyword hash table and the definition of this function (presumably
+   generated perfect-hash code -- confirm against keyword.h).  The
+   lexer passes the start and length of an identifier and treats a
+   NULL result as "not a keyword".  */
+const struct locale_keyword* in_word_set (register const char *str,
+ register int len);
+#include "keyword.h"
+
+
+/* State of reading the locale definition file: file name, line
+   number, line buffer, string buffer, read position and the current
+   escape and comment characters.  */
+struct locfile_data locfile_data;
+
+/* Flag consulted by `get_char' when a symbolic character name is not
+   found: while it is nonzero (and `verbose' is set) the unknown name
+   is reported.  Collation input is the only place where element names
+   besides the ones defined in the character map are allowed, so the
+   flag is meant to be cleared there to suppress those messages.  */
+int reject_new_char = 1;
+
+/* Prototypes for local functions. */
+static int get_char (void);
+
+
+/* Shorthand for the global lexer state used throughout this file.  */
+#define LD locfile_data
+
+/* Open the locale definition file and initialize the status data
+   structure for the following calls of `locfile_lex'.
+
+   FNAME is the name of the file to read; it is reopened on stdin.  If
+   FNAME is NULL the current stdin is used unchanged.  Failure to open
+   the file is reported through `error' with exit status 4.
+
+   The line buffer and the string buffer are both sized after the
+   system's LINE_MAX.  The escape and comment characters are reset to
+   their defaults `\' and `#'.  */
+void
+locfile_open (const char *fname)
+{
+  long int line_max;
+
+  if (fname == NULL)
+    /* We read from stdin.  */
+    LD.filename = "<stdin>";
+  else
+    {
+      if (freopen (fname, "r", stdin) == NULL)
+	error (4, 0, gettext ("input file `%s' not found"), fname);
+      LD.filename = fname;
+    }
+
+  /* Set default values.  */
+  LD.escape_char = '\\';
+  LD.comment_char = '#';
+
+  /* sysconf returns -1 when the limit is indeterminate or the name is
+     not supported.  Using that value as a buffer size would request an
+     absurd allocation, so fall back to 2048, the smallest LINE_MAX
+     POSIX permits (_POSIX2_LINE_MAX).  */
+  line_max = sysconf (_SC_LINE_MAX);
+  if (line_max <= 0)
+    line_max = 2048;
+
+  LD.bufsize = line_max;
+  LD.buf = (char *) xmalloc (LD.bufsize);
+  LD.strbuf = (char *) xmalloc (LD.bufsize);
+
+  LD.buf_ptr = LD.returned_tokens = LD.line_no = 0;
+
+  /* An empty buffer together with the continuation flag makes the
+     first call of `locfile_lex' read a line immediately.  */
+  LD.continue_line = 1;
+  LD.buf[LD.buf_ptr] = '\0';
+}
+
+
+int
+xlocfile_lex (char **token, int *token_len)
+{
+ int retval = locfile_lex (token, token_len);
+
+ if (retval == 0)
+ /* I.e. end of file. */
+ error (4, 0, gettext ("%s: unexpected end of file in locale defintion "
+ "file"), locfile_data.filename);
+
+ return retval;
+}
+
+/* Return the next token of the locale definition file read from stdin.
+
+   The result is 0 at end of file, otherwise a TOK_* value.  For an
+   escape sequence introduced by the current escape character the
+   decoded character value itself is returned.
+   NOTE(review): that value shares the return channel with the TOK_*
+   ids; confirm that the parser expects this.
+
+   *TOKEN is set to the start of the token text inside the line
+   buffer, or to the expanded text in the string buffer for
+   TOK_STRING.  *TOKEN_LEN receives
+     - the length of the text for identifiers, keywords and strings,
+     - the numeric value for TOK_NUMBER,
+     - the character value for TOK_CHAR,
+   and is left untouched for all other tokens.
+
+   The keywords `escape_char' and `comment_char' are consumed here and
+   never handed to the caller; they change the lexer's escape and
+   comment character for the rest of the file.  */
+int
+locfile_lex (char **token, int *token_len)
+{
+  int start_again;
+  int retval = 0;
+
+  do
+    {
+      int start_ptr;
+
+      start_again = 0;
+
+      /* Read the next line.  Skip over empty lines and comments.  The
+	 bounds test has to come first: LD.buf_ptr is set to LD.bufsize
+	 to request a fresh line, and the buffer must not be indexed
+	 with that value.  */
+      if (LD.buf_ptr >= LD.bufsize
+	  || (LD.buf[LD.buf_ptr] == '\0' && LD.continue_line != 0)
+	  || (posix_conformance == 0
+	      && LD.buf[LD.buf_ptr] == LD.comment_char))
+	do
+	  {
+	    size_t linelen;
+
+	    LD.buf_ptr = 0;
+
+	    if (fgets (LD.buf, LD.bufsize, stdin) == NULL)
+	      {
+		/* Leave an empty buffer behind so that later calls do
+		   not see stale text.  */
+		LD.buf[0] = '\0';
+		return 0;
+	      }
+
+	    /* Increment line number counter.  */
+	    ++LD.line_no;
+
+	    /* We now have to look whether this line is continued and
+	       whether it fits into our buffer at all.  */
+	    linelen = strlen (LD.buf);
+
+	    /* A completely filled buffer only means truncation when the
+	       newline did not make it in.  */
+	    if (linelen > 0 && linelen == LD.bufsize - 1
+		&& LD.buf[linelen - 1] != '\n')
+	      /* The line did not fit into the buffer.  */
+	      error (2, 0, gettext ("%s:%Zd: line too long; use "
+				    "`getconf LINE_MAX' to get the maximum "
+				    "line length"), LD.filename, LD.line_no);
+
+	    /* Remove '\n' at end of line.  LINELEN can be zero when the
+	       line starts with a NUL byte.  */
+	    if (linelen > 0 && LD.buf[linelen - 1] == '\n')
+	      LD.buf[--linelen] = '\0';
+
+	    if (linelen > 0 && LD.buf[linelen - 1] == LD.escape_char)
+	      {
+		LD.buf[--linelen] = '\0';
+		LD.continue_line = 1;
+	      }
+	    else
+	      LD.continue_line = 0;
+
+	    while (isspace ((unsigned char) LD.buf[LD.buf_ptr]))
+	      ++LD.buf_ptr;
+
+	    /* We are not so restrictive and allow white spaces before
+	       a comment.  */
+	    if (posix_conformance == 0
+		&& LD.buf[LD.buf_ptr] == LD.comment_char
+		&& LD.buf_ptr != 0)
+	      error (0, 0, gettext ("%s:%Zd: comment does not start in "
+				    "column 1"), LD.filename, LD.line_no);
+	  }
+	while (LD.buf[LD.buf_ptr] == '\0'
+	       || LD.buf[LD.buf_ptr] == LD.comment_char);
+
+
+      /* Get information for return values.  */
+      *token = LD.buf + LD.buf_ptr;
+      start_ptr = LD.buf_ptr;
+
+      /* If no further character is in the line this is the end of a
+	 logical line.  This information is needed in the parser.  */
+      if (LD.buf[LD.buf_ptr] == '\0')
+	{
+	  LD.buf_ptr = LD.bufsize;
+	  retval = TOK_ENDOFLINE;
+	}
+      else if (isalpha ((unsigned char) LD.buf[LD.buf_ptr]))
+	/* The token is an identifier.  The POSIX standard does not say
+	   what characters might be contained but official POSIX locale
+	   definition files contain besides alnum characters '_', '-'
+	   and '+'.  */
+	{
+	  const struct locale_keyword *kw;
+
+	  do
+	    ++LD.buf_ptr;
+	  while (isalnum ((unsigned char) LD.buf[LD.buf_ptr])
+		 || LD.buf[LD.buf_ptr] == '_'
+		 || LD.buf[LD.buf_ptr] == '-'
+		 || LD.buf[LD.buf_ptr] == '+');
+
+	  /* Look in table of keywords.  */
+	  kw = in_word_set (*token, LD.buf_ptr - start_ptr);
+	  if (kw == NULL)
+	    retval = TOK_IDENT;
+	  else
+	    {
+	      if (kw->token_id == TOK_ESCAPE_CHAR
+		  || kw->token_id == TOK_COMMENT_CHAR)
+		/* `escape_char' and `comment_char' are keywords for the
+		   lexer.  Do not give them to the parser.  */
+		{
+		  int syntax_error;
+
+		  start_again = 1;
+
+		  syntax_error
+		    = (!isspace ((unsigned char) LD.buf[LD.buf_ptr])
+		       || (posix_conformance && LD.returned_tokens > 0));
+
+		  /* Step over the separator, but never beyond the end
+		     of the line.  */
+		  if (LD.buf[LD.buf_ptr] != '\0')
+		    do
+		      ++LD.buf_ptr;
+		    while (isspace ((unsigned char) LD.buf[LD.buf_ptr]));
+
+		  if (LD.buf[LD.buf_ptr] == '\0')
+		    /* The keyword has no argument; keep the old
+		       character instead of storing a NUL.  */
+		    syntax_error = 1;
+		  else if (kw->token_id == TOK_ESCAPE_CHAR)
+		    LD.escape_char = LD.buf[LD.buf_ptr++];
+		  else
+		    LD.comment_char = LD.buf[LD.buf_ptr++];
+
+		  if (syntax_error)
+		    error (0, 0, gettext ("%s:%Zd: syntax error in locale "
+					  "definition file"),
+			   LD.filename, LD.line_no);
+
+		  ignore_to_eol (0, posix_conformance);
+		}
+	      else
+		/* It is one of the normal keywords.  */
+		retval = kw->token_id;
+	    }
+
+	  *token_len = LD.buf_ptr - start_ptr;
+	}
+      else if (LD.buf[LD.buf_ptr] == '"')
+	/* Read a string.  All symbolic character descriptions are
+	   expanded.  This has to be done in a local buffer because a
+	   simple symbolic character like <A> may expand to up to 6
+	   bytes.  */
+	{
+	  char *last = LD.strbuf;
+
+	  ++LD.buf_ptr;
+	  while (LD.buf[LD.buf_ptr] != '"')
+	    {
+	      int pre = LD.buf_ptr;
+	      int char_val = get_char ();
+
+	      if (char_val == 0)
+		{
+		  error (4, 0, gettext ("%s:%Zd: unterminated string at end "
+					"of line"), LD.filename, LD.line_no);
+		  /* NOTREACHED */
+		}
+
+	      if (char_val > 0)
+		{
+		  /* Encode into a scratch area first: the expansion can
+		     be longer than the source text, so the string
+		     buffer, which is only as large as the line buffer,
+		     has to be checked before it is written.  One byte
+		     stays reserved for the terminating NUL.  */
+		  char encoded[6];
+		  int nbytes = char_to_utf (encoded, char_val);
+
+		  if ((size_t) (last - LD.strbuf) + (size_t) nbytes
+		      >= (size_t) LD.bufsize)
+		    error (4, 0, gettext ("%s:%Zd: string too long"),
+			   LD.filename, LD.line_no);
+
+		  memcpy (last, encoded, nbytes);
+		  last += nbytes;
+		}
+	      else
+		{
+		  /* Unknown characters are simply not stored.  */
+		  char tmp[LD.buf_ptr - pre + 1];
+		  memcpy (tmp, &LD.buf[pre], LD.buf_ptr - pre);
+		  tmp[LD.buf_ptr - pre] = '\0';
+		  error (0, 0, gettext ("%s:%Zd: character `%s' not defined"),
+			 LD.filename, LD.line_no, tmp);
+		}
+	    }
+	  if (LD.buf[LD.buf_ptr] != '\0')
+	    ++LD.buf_ptr;
+
+	  *last = '\0';
+	  *token = LD.strbuf;
+	  *token_len = last - LD.strbuf;
+	  retval = TOK_STRING;
+	}
+      else if (LD.buf[LD.buf_ptr] == '.' && LD.buf[LD.buf_ptr + 1] == '.'
+	       && LD.buf[LD.buf_ptr + 2] == '.')
+	{
+	  LD.buf_ptr += 3;
+	  retval = TOK_ELLIPSIS;
+	}
+      else if (LD.buf[LD.buf_ptr] == LD.escape_char)
+	{
+	  char *endp;
+
+	  ++LD.buf_ptr;
+	  switch (LD.buf[LD.buf_ptr])
+	    {
+	    case 'x':
+	      /* Hexadecimal value: at least two digits, at most 255.
+		 Otherwise the `x' stands for itself.  */
+	      if (isxdigit ((unsigned char) LD.buf[++LD.buf_ptr]))
+		{
+		  retval = strtol (&LD.buf[LD.buf_ptr], &endp, 16);
+		  if (endp - (LD.buf + LD.buf_ptr) < 2 || retval > 255)
+		    retval = 'x';
+		  else
+		    LD.buf_ptr = endp - LD.buf;
+		}
+	      else
+		retval = 'x';
+	      break;
+	    case 'd':
+	      /* Decimal value, same rules as for `x'.  */
+	      if (isdigit ((unsigned char) LD.buf[++LD.buf_ptr]))
+		{
+		  retval = strtol (&LD.buf[LD.buf_ptr], &endp, 10);
+		  if (endp - (LD.buf + LD.buf_ptr) < 2 || retval > 255)
+		    retval = 'd';
+		  else
+		    LD.buf_ptr = endp - LD.buf;
+		}
+	      else
+		retval = 'd';
+	      break;
+	    case '0' ... '9':
+	      /* Octal value; if it is not usable the digit stands for
+		 itself.  */
+	      retval = strtol (&LD.buf[LD.buf_ptr], &endp, 8);
+	      if (endp - (LD.buf + LD.buf_ptr) < 2 || retval > 255)
+		retval = LD.buf[LD.buf_ptr++];
+	      else
+		LD.buf_ptr = endp - LD.buf;
+	      break;
+	    case 'a':
+	      retval = '\a';
+	      ++LD.buf_ptr;
+	      break;
+	    case 'b':
+	      retval = '\b';
+	      ++LD.buf_ptr;
+	      break;
+	    case 'f':
+	      retval = '\f';
+	      ++LD.buf_ptr;
+	      break;
+	    case 'n':
+	      retval = '\n';
+	      ++LD.buf_ptr;
+	      break;
+	    case 'r':
+	      retval = '\r';
+	      ++LD.buf_ptr;
+	      break;
+	    case 't':
+	      retval = '\t';
+	      ++LD.buf_ptr;
+	      break;
+	    case 'v':
+	      retval = '\v';
+	      ++LD.buf_ptr;
+	      break;
+	    default:
+	      retval = LD.buf[LD.buf_ptr++];
+	      break;
+	    }
+	}
+      else if (isdigit ((unsigned char) LD.buf[LD.buf_ptr]))
+	{
+	  char *endp;
+
+	  *token_len = strtol (&LD.buf[LD.buf_ptr], &endp, 10);
+	  LD.buf_ptr = endp - LD.buf;
+	  retval = TOK_NUMBER;
+	}
+      else if (LD.buf[LD.buf_ptr] == '-' && LD.buf[LD.buf_ptr + 1] == '1')
+	{
+	  LD.buf_ptr += 2;
+	  retval = TOK_MINUS1;
+	}
+      else
+	{
+	  int ch = get_char ();
+	  if (ch != -1)
+	    {
+	      *token_len = ch;
+	      retval = TOK_CHAR;
+	    }
+	  else
+	    retval = TOK_ILL_CHAR;
+	}
+
+      /* Ignore white space.  LD.buf_ptr equals LD.bufsize after an end
+	 of line or after `ignore_to_eol'; the buffer must not be read
+	 in that case.  */
+      while (LD.buf_ptr < LD.bufsize
+	     && isspace ((unsigned char) LD.buf[LD.buf_ptr]))
+	++LD.buf_ptr;
+    }
+  while (start_again != 0);
+
+  ++LD.returned_tokens;
+  return retval;
+}
+
+
+/* Store the character CHAR_VAL at BUF and return the number of bytes
+   written.  If the character map is a single-byte one
+   (charmap_data.mb_cur_max == 1) the value is stored as one byte.
+   Otherwise it is stored as a UTF-8 style sequence of one to six
+   bytes: a lead byte followed by continuation bytes which carry six
+   bits each.  BUF must have room for the bytes written.  */
+int
+char_to_utf (char *buf, int char_val)
+{
+  if (charmap_data.mb_cur_max == 1)
+    {
+      buf[0] = char_val;
+      return 1;
+    }
+  else
+    {
+/* The number of bits coded in each character. */
+#define CBPC 6
+      /* One row per sequence length: the largest value which fits and
+	 the marker bits of the lead byte.  */
+      static struct coding_tab
+      {
+	int mask;
+	int val;
+      }
+      tab[] =
+      {
+	{ 0x7f, 0x00 },
+	{ 0x7ff, 0xc0 },
+	{ 0xffff, 0xe0 },
+	{ 0x1fffff, 0xf0 },
+	{ 0x3ffffff, 0xf8 },
+	{ 0x7fffffff, 0xfc },
+	{ 0, }
+      };
+      struct coding_tab *entry = tab;
+      int nbytes = 1;
+      int idx;
+
+      /* Find the shortest sequence which can hold the value.  */
+      while (char_val > entry->mask)
+	{
+	  ++entry;
+	  ++nbytes;
+	}
+
+      /* Fill in the continuation bytes from the end towards the
+	 front, consuming the low bits first.  */
+      for (idx = nbytes - 1; idx > 0; --idx)
+	{
+	  buf[idx] = 0x80 | (char_val & ((1 << CBPC) - 1));
+	  char_val >>= CBPC;
+	}
+
+      /* What is left of the value goes into the lead byte.  */
+      buf[0] = entry->val | char_val;
+
+      return nbytes;
+    }
+}
+
+
+/* Ignore the rest of the logical line.
+
+   TOKEN is the token read last; if it is TOK_ENDOFLINE the line is
+   already finished and nothing happens.  If WARN_FLAG is nonzero and
+   there is still text left in the line buffer a "trailing garbage"
+   message is printed.  Continuation lines belonging to the same
+   logical line are read from stdin and thrown away.  Afterwards the
+   read position is set so that the next call of `locfile_lex' starts
+   with a fresh line.  */
+void
+ignore_to_eol (int token, int warn_flag)
+{
+  if (token == TOK_ENDOFLINE)
+    return;
+
+  /* LD.buf_ptr equals LD.bufsize when a new line has already been
+     requested; there is nothing in the buffer to look at then.  */
+  if (warn_flag
+      && LD.buf_ptr < LD.bufsize
+      && LD.buf[LD.buf_ptr] != '\0')
+    error (0, 0, gettext ("%s:%Zd: trailing garbage at end of line"),
+	   locfile_data.filename, locfile_data.line_no);
+
+  while (LD.continue_line)
+    {
+      LD.continue_line = 0;
+
+      /* Increment line number counter.  */
+      ++LD.line_no;
+
+      if (fgets (LD.buf, LD.bufsize, stdin) != NULL)
+	{
+	  /* We now have to look whether this line is continued and
+	     whether it fits into our buffer at all.  */
+	  size_t linelen = strlen (LD.buf);
+
+	  /* A completely filled buffer only means truncation when the
+	     newline did not make it in.  */
+	  if (linelen > 0 && linelen == LD.bufsize - 1
+	      && LD.buf[linelen - 1] != '\n')
+	    /* The line did not fit into the buffer.  */
+	    error (2, 0, gettext ("%s:%Zd: line too long; use `getconf "
+				  "LINE_MAX' to get the current maximum "
+				  "line length"), LD.filename, LD.line_no);
+
+	  /* Remove '\n' at end of line.  */
+	  if (linelen > 0 && LD.buf[linelen - 1] == '\n')
+	    --linelen;
+
+	  /* An empty line has no last character to inspect; without
+	     the length test the byte in front of the buffer would be
+	     read.  */
+	  if (linelen > 0 && LD.buf[linelen - 1] == LD.escape_char)
+	    LD.continue_line = 1;
+	}
+    }
+
+  /* This causes the next line to be read.  */
+  LD.buf_ptr = LD.bufsize;
+}
+
+
+/* Return the value of the character at the current read position of
+   the line buffer and advance the position behind it.
+
+   A symbolic name of the form <name> is looked up with `find_char'
+   and the result of that call is returned; this file treats -1 as
+   "not defined".  Inside the name the escape character makes the
+   following character part of the name.  The name is compacted in
+   place, so the line buffer is modified.  An undefined name is only
+   reported when both `verbose' and `reject_new_char' are nonzero.
+
+   Any other character is returned as is.
+   NOTE(review): the plain character is converted from `char', so on
+   platforms where `char' is signed bytes above 127 come out negative
+   and byte 0xff equals the "not defined" value -1; confirm whether
+   callers rely on this.  */
+static int
+get_char (void)
+{
+  if (LD.buf[LD.buf_ptr] == '<')
+    /* This is a symbolic character name.  */
+    {
+      int char_val;
+      char *startp = LD.buf + (++LD.buf_ptr);
+      char *endp = startp;
+
+      /* The <ctype.h> functions need a value in the range of
+	 `unsigned char'; a plain `char' may be negative.  */
+      while (LD.buf[LD.buf_ptr] != '>'
+	     && isprint ((unsigned char) LD.buf[LD.buf_ptr]))
+	{
+	  if (LD.buf[LD.buf_ptr] == '\0'
+	      || (LD.buf[LD.buf_ptr] == LD.escape_char
+		  && LD.buf[++LD.buf_ptr] == '\0'))
+	    break;
+
+	  *endp++ = LD.buf[LD.buf_ptr++];
+	}
+
+      if (LD.buf[LD.buf_ptr] == '\0')
+	{
+	  error (0, 0, gettext ("%s:%Zd: end of line in character symbol"),
+		 LD.filename, LD.line_no);
+
+	  if (startp == endp)
+	    return -1;
+	}
+      else
+	/* Step over the closing '>' or over the character which ended
+	   the name.  */
+	++LD.buf_ptr;
+
+      char_val = find_char (startp, endp - startp);
+      if (char_val == -1 && verbose != 0 && reject_new_char != 0)
+	{
+	  /* Locale definitions are often written very generally.
+	     Missing characters are only reported when explicitly
+	     requested.  */
+	  char tmp[endp - startp + 3];
+
+	  tmp[0] = '<';
+	  memcpy (tmp + 1, startp, endp - startp);
+	  tmp[endp - startp + 1] = '>';
+	  tmp[endp - startp + 2] = '\0';
+
+	  error (0, 0, gettext ("%s:%Zd: character `%s' not defined"),
+		 LD.filename, LD.line_no, tmp);
+	}
+
+      return char_val;
+    }
+  else
+    return (int) LD.buf[LD.buf_ptr++];
+}
+
+/*
+ * Local Variables:
+ * mode:c
+ * c-basic-offset:2
+ * End:
+ */