summaryrefslogtreecommitdiff
path: root/localedata/unicode-gen/unicode_utils.py
diff options
context:
space:
mode:
Diffstat (limited to 'localedata/unicode-gen/unicode_utils.py')
-rw-r--r--localedata/unicode-gen/unicode_utils.py516
1 files changed, 516 insertions, 0 deletions
diff --git a/localedata/unicode-gen/unicode_utils.py b/localedata/unicode-gen/unicode_utils.py
new file mode 100644
index 0000000000..8cc5f2ba2a
--- /dev/null
+++ b/localedata/unicode-gen/unicode_utils.py
@@ -0,0 +1,516 @@
+# Utilities to generate Unicode data for glibc from upstream Unicode data.
+#
+# Copyright (C) 2014-2016 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+#
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <http://www.gnu.org/licenses/>.
+
+'''
+This module contains utilities used by the scripts to generate
+Unicode data for glibc from upstream Unicode data files.
+'''
+
+import sys
+import re
+
# Dictionary holding the entire contents of the UnicodeData.txt file.
#
# Keys are integer code points.  Each value is the dictionary that
# fill_attribute() builds from one line of the file: the case mapping
# entries ('upper', 'lower', 'title') are integers or None, every other
# entry is the string taken from the corresponding column.
#
# Contents of this dictionary look like this:
#
# {0: {'category': 'Cc',
#      'title': None,
#      'digit': '',
#      'name': '<control>',
#      'bidi': 'BN',
#      'combining': '0',
#      'comment': '',
#      'oldname': 'NULL',
#      'decomposition': '',
#      'upper': None,
#      'mirrored': 'N',
#      'lower': None,
#      'decdigit': '',
#      'numeric': ''},
#  …
# }
UNICODE_ATTRIBUTES = {}
+
# Dictionary holding the entire contents of the DerivedCoreProperties.txt
# file.
#
# Keys are integer code points.  Each value is the list of property
# names recorded for that code point, in the order in which
# fill_derived_core_properties() encountered them in the file.
#
# Contents of this dictionary look like this:
#
# {917504: ['Default_Ignorable_Code_Point'],
#  917505: ['Case_Ignorable', 'Default_Ignorable_Code_Point'],
#  …
# }
DERIVED_CORE_PROPERTIES = {}
+
# Dictionary holding the entire contents of the EastAsianWidth.txt file.
#
# Keys are integer code points.  Each value is the width property
# string that fill_east_asian_widths() read for that code point.
#
# Contents of this dictionary look like this:
#
# {0: 'N', … , 45430: 'W', …}
EAST_ASIAN_WIDTHS = {}
+
def fill_attribute(code_point, fields):
    '''Records one UnicodeData.txt line in UNICODE_ATTRIBUTES.

    code_point is the integer key to store the entry under.  fields is
    the list of the 15 semicolon separated columns of the line; column 0
    (the code point itself) is not looked at here.

    Columns 1 to 11 are stored unchanged as strings.  The three case
    mapping columns 12 to 14 are converted from hexadecimal to int, or
    stored as None when the column is empty.

    '''
    def hex_or_none(column):
        # An empty case mapping column means “no mapping”.
        return int(column, 16) if column else None

    string_keys = (
        'name',           # Character name
        'category',       # General category
        'combining',      # Canonical combining classes
        'bidi',           # Bidirectional category
        'decomposition',  # Character decomposition mapping
        'decdigit',       # Decimal digit value
        'digit',          # Digit value
        'numeric',        # Numeric value
        'mirrored',       # mirrored
        'oldname',        # Old Unicode 1.0 name
        'comment',        # comment
    )
    entry = dict(zip(string_keys, fields[1:12]))
    entry['upper'] = hex_or_none(fields[12])  # Uppercase mapping
    entry['lower'] = hex_or_none(fields[13])  # Lowercase mapping
    entry['title'] = hex_or_none(fields[14])  # Titlecase mapping
    UNICODE_ATTRIBUTES[code_point] = entry
+
def fill_attributes(filename):
    '''Stores the entire contents of the UnicodeData.txt file
    in the UNICODE_ATTRIBUTES dictionary.

    filename is the path of the UnicodeData.txt file to read.

    A typical line for a single code point in UnicodeData.txt looks
    like this:

    0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;

    Code point ranges are indicated by pairs of lines like this:

    4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
    9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;

    Every code point of such a range gets its own entry, named after
    the part of the range name before the comma (“CJK Ideograph”).
    Lines of category “Cs” (surrogates) are skipped.

    A line which does not have exactly 15 fields, or a “Last” line
    which does not match the preceding “First” line, is reported on
    stderr and terminates the program with exit status 1.
    '''
    def fail(reason, line):
        # Use sys.exit() rather than the exit() helper injected by the
        # “site” module: the latter is meant for interactive use only
        # and does not exist when site is not loaded (python -S).
        sys.stderr.write(
            '%(r)s in file "%(f)s": %(l)s\n' %{
                'r': reason, 'f': filename, 'l': line})
        sys.exit(1)

    with open(filename, mode='r') as unicode_data_file:
        # Fields of the pending “First” line of a range, or [] if the
        # previous line did not open a range.
        fields_start = []
        for line in unicode_data_file:
            fields = line.strip().split(';')
            if len(fields) != 15:
                fail('short line', line)
            if fields[2] == 'Cs':
                # Surrogates are UTF-16 artefacts,
                # not real characters. Ignore them.
                fields_start = []
                continue
            if fields[1].endswith(', First>'):
                # '<CJK Ideograph, First>' -> 'CJK Ideograph'
                fields[1] = fields[1].split(',')[0][1:]
                fields_start = fields
                continue
            if fields[1].endswith(', Last>'):
                fields[1] = fields[1].split(',')[0][1:]
                # Apart from the code point itself, both lines of a
                # range have to carry identical data.
                if fields[1:] != fields_start[1:]:
                    fail('broken code point range', line)
                for code_point in range(
                        int(fields_start[0], 16),
                        int(fields[0], 16)+1):
                    fill_attribute(code_point, fields)
            else:
                fill_attribute(int(fields[0], 16), fields)
            fields_start = []
+
def fill_derived_core_properties(filename):
    '''Reads DerivedCoreProperties.txt into DERIVED_CORE_PROPERTIES.

    filename is the path of the file to read.  For every code point
    mentioned in the file, the property name of the line is appended to
    the list stored under that code point.

    A line describes either a range of code points:

    0061..007A ; Lowercase # L& [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z

    or just one code point:

    00AA ; Lowercase # Lo FEMININE ORDINAL INDICATOR

    Lines of any other shape (comments, empty lines) are skipped.

    '''
    line_pattern = re.compile(
        r'^(?P<codepoint1>[0-9A-F]{4,6})'
        + r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
        + r'\s*;\s*(?P<property>[a-zA-Z_]+)')
    with open(filename, mode='r') as derived_core_properties_file:
        for line in derived_core_properties_file:
            parsed = line_pattern.match(line)
            if parsed is None:
                continue
            first_text = parsed.group('codepoint1')
            # A single code point is a range of length one.
            last_text = parsed.group('codepoint2') or first_text
            name = parsed.group('property')
            for code_point in range(int(first_text, 16),
                                    int(last_text, 16) + 1):
                DERIVED_CORE_PROPERTIES.setdefault(
                    code_point, []).append(name)
+
def fill_east_asian_widths(filename):
    '''Reads EastAsianWidth.txt into EAST_ASIAN_WIDTHS.

    filename is the path of the file to read.  Every code point
    mentioned in the file is mapped to the width property of its line;
    a later line overwrites the value of an earlier one.

    A line describes either a range of code points:

    9FCD..9FFF;W # Cn [51] <reserved-9FCD>..<reserved-9FFF>

    or just one code point:

    A015;W # Lm YI SYLLABLE WU

    Lines of any other shape (comments, empty lines) are skipped.
    '''
    line_pattern = re.compile(
        r'^(?P<codepoint1>[0-9A-F]{4,6})'
        +r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
        +r'\s*;\s*(?P<property>[a-zA-Z]+)')
    with open(filename, mode='r') as east_asian_widths_file:
        for line in east_asian_widths_file:
            parsed = line_pattern.match(line)
            if parsed is None:
                continue
            first_text = parsed.group('codepoint1')
            # A single code point is a range of length one.
            last_text = parsed.group('codepoint2') or first_text
            width = parsed.group('property')
            for code_point in range(int(first_text, 16),
                                    int(last_text, 16) + 1):
                EAST_ASIAN_WIDTHS[code_point] = width
+
def to_upper(code_point):
    '''Maps a code point to its uppercase counterpart.

    Returns the 'upper' entry stored in UNICODE_ATTRIBUTES for the
    code point, or the code point itself when the character has no
    name or no uppercase mapping.
    '''
    attributes = UNICODE_ATTRIBUTES[code_point]
    if not attributes['name'] or not attributes['upper']:
        return code_point
    return attributes['upper']
+
def to_lower(code_point):
    '''Maps a code point to its lowercase counterpart.

    Returns the 'lower' entry stored in UNICODE_ATTRIBUTES for the
    code point, or the code point itself when the character has no
    name or no lowercase mapping.
    '''
    attributes = UNICODE_ATTRIBUTES[code_point]
    if not attributes['name'] or not attributes['lower']:
        return code_point
    return attributes['lower']
+
def to_upper_turkish(code_point):
    '''Maps a code point to its uppercase counterpart, Turkish style.

    Identical to to_upper() except that U+0069 (i) is mapped to
    U+0130 (İ).
    '''
    return 0x0130 if code_point == 0x0069 else to_upper(code_point)
+
def to_lower_turkish(code_point):
    '''Maps a code point to its lowercase counterpart, Turkish style.

    Identical to to_lower() except that U+0049 (I) is mapped to
    U+0131 (ı).
    '''
    return 0x0131 if code_point == 0x0049 else to_lower(code_point)
+
def to_title(code_point):
    '''Maps a code point to its titlecase counterpart.

    Returns the 'title' entry stored in UNICODE_ATTRIBUTES for the
    code point, or the code point itself when the character has no
    name or no titlecase mapping.
    '''
    attributes = UNICODE_ATTRIBUTES[code_point]
    if not attributes['name'] or not attributes['title']:
        return code_point
    return attributes['title']
+
def is_upper(code_point):
    '''Tells whether the character with this code point is uppercase.

    That is the case if it has a lowercase mapping different from
    itself, or if DERIVED_CORE_PROPERTIES lists it as “Uppercase”.
    '''
    if to_lower(code_point) != code_point:
        return True
    return 'Uppercase' in DERIVED_CORE_PROPERTIES.get(code_point, ())
+
def is_lower(code_point):
    '''Tells whether the character with this code point is lowercase.

    That is the case if it has an uppercase mapping different from
    itself, if it is U+00DF, or if DERIVED_CORE_PROPERTIES lists it
    as “Lowercase”.
    '''
    if to_upper(code_point) != code_point:
        return True
    # <U00DF> is lowercase, but without simple to_upper mapping.
    if code_point == 0x00DF:
        return True
    # Some characters are defined as “Lowercase” in
    # DerivedCoreProperties.txt but do not have a mapping to upper
    # case. For example, ꜰ U+A72F “LATIN LETTER SMALL CAPITAL F” is
    # one of these.
    return 'Lowercase' in DERIVED_CORE_PROPERTIES.get(code_point, ())
+
def is_alpha(code_point):
    '''Tells whether the character with this code point is alphabetic.

    That is the case if DERIVED_CORE_PROPERTIES lists it as
    “Alphabetic”, or if it is a decimal digit (category “Nd”) outside
    of the ASCII digits U+0030..U+0039.
    '''
    if 'Alphabetic' in DERIVED_CORE_PROPERTIES.get(code_point, ()):
        return True
    # Consider all the non-ASCII digits as alphabetic.
    # ISO C 99 forbids us to have them in category “digit”,
    # but we want iswalnum to return true on them.
    decimal_digit = UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd'
    return decimal_digit and not 0x0030 <= code_point <= 0x0039
+
def is_digit(code_point):
    '''Checks whether the character with this code point is a digit

    Returns True exactly for the ASCII digits U+0030..U+0039.
    '''
    # SUSV2 gives us some freedom for the "digit" category, but ISO C 99
    # takes it away:
    # 7.25.2.1.5:
    #    The iswdigit function tests for any wide character that
    #    corresponds to a decimal-digit character (as defined in 5.2.1).
    # 5.2.1:
    #    the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
    #
    # Classifying every character of general category “Nd” as a digit
    # is therefore not an option.  (Were that ever done, note that
    # U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
    # a zero; <0> would have to be added in front of them by hand.)
    return 0x0030 <= code_point <= 0x0039
+
def is_outdigit(code_point):
    '''Tells whether the character with this code point is an outdigit.

    Returns True exactly for the ASCII digits U+0030..U+0039.
    '''
    return 0x0030 <= code_point <= 0x0039
+
def is_blank(code_point):
    '''Tells whether the character with this code point is blank.

    Blank are the tab character and the named characters of category
    “Zs” whose decomposition does not mention “<noBreak>”.
    '''
    if code_point == 0x0009: # '\t'
        return True
    attributes = UNICODE_ATTRIBUTES[code_point]
    # Category Zs without mention of '<noBreak>'
    return (attributes['name']
            and attributes['category'] == 'Zs'
            and '<noBreak>' not in attributes['decomposition'])
+
def is_space(code_point):
    '''Tells whether the character with this code point is a space.

    Spaces are the six ASCII white space characters and the named
    characters of category “Zl”, “Zp”, or “Zs”; those of category “Zs”
    only if their decomposition does not mention “<noBreak>”.
    '''
    ascii_white_space = (
        0x0020, # ' '
        0x000C, # '\f'
        0x000A, # '\n'
        0x000D, # '\r'
        0x0009, # '\t'
        0x000B, # '\v'
    )
    if code_point in ascii_white_space:
        return True
    attributes = UNICODE_ATTRIBUTES[code_point]
    # Don’t make U+00A0 a space. Non-breaking space means that all programs
    # should treat it like a punctuation character, not like a space.
    return (attributes['name']
            and (attributes['category'] in ('Zl', 'Zp')
                 or (attributes['category'] in ('Zs',)
                     and '<noBreak>' not in attributes['decomposition'])))
+
def is_cntrl(code_point):
    '''Tells whether the character with this code point is a control
    character.

    Control characters are those named “<control>” and the named
    characters of category “Zl” or “Zp”.
    '''
    attributes = UNICODE_ATTRIBUTES[code_point]
    name = attributes['name']
    return name and (name == '<control>'
                     or attributes['category'] in ('Zl', 'Zp'))
+
def is_xdigit(code_point):
    '''Checks whether the character with this code point is
    a hexadecimal digit

    Returns True exactly for the ASCII characters 0-9, A-F and a-f.
    '''
    # SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
    # takes it away:
    # 7.25.2.1.12:
    #    The iswxdigit function tests for any wide character that
    #    corresponds to a hexadecimal-digit character (as defined
    #    in 6.4.4.1).
    # 6.4.4.1:
    #    hexadecimal-digit: one of
    #    0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
    return (0x0030 <= code_point <= 0x0039
            or 0x0041 <= code_point <= 0x0046
            or 0x0061 <= code_point <= 0x0066)
+
def is_graph(code_point):
    '''Tells whether the character with this code point is a graphical
    character.

    Graphical are the named characters which are neither named
    “<control>” nor spaces according to is_space().
    '''
    name = UNICODE_ATTRIBUTES[code_point]['name']
    return name and name != '<control>' and not is_space(code_point)
+
def is_print(code_point):
    '''Tells whether the character with this code point is printable.

    Printable are the named characters which are neither named
    “<control>” nor of category “Zl” or “Zp”.
    '''
    attributes = UNICODE_ATTRIBUTES[code_point]
    name = attributes['name']
    return (name
            and name != '<control>'
            and attributes['category'] not in ('Zl', 'Zp'))
+
def is_punct(code_point):
    '''Checks whether the character with this code point is punctuation

    Punctuation is every graphic character, as determined by
    is_graph(), which is neither alphabetic nor a digit according to
    is_alpha() and is_digit().
    '''
    # The traditional POSIX definition of punctuation is every graphic,
    # non-alphanumeric character.  The Unicode general categories
    # starting with “P” are deliberately not used for this.
    return (is_graph(code_point)
            and not is_alpha(code_point)
            and not is_digit(code_point))
+
def is_combining(code_point):
    '''Tells whether the character with this code point is a combining
    character.

    Combining are the named characters of the general categories
    “Mn”, “Mc”, and “Me”.
    '''
    # Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
    # file. In 3.0.1 it was identical to the union of the general categories
    # "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
    # PropList.txt file, so we take the latter definition.
    attributes = UNICODE_ATTRIBUTES[code_point]
    return (attributes['name']
            and attributes['category'] in ('Mn', 'Mc', 'Me'))
+
def is_combining_level3(code_point):
    '''Tells whether the character with this code point is a combining
    level3 character.

    These are the combining characters, as determined by
    is_combining(), whose canonical combining class is at least 0
    and less than 200.
    '''
    return (is_combining(code_point)
            and 0 <= int(UNICODE_ATTRIBUTES[code_point]['combining']) < 200)
+
def ucs_symbol(code_point):
    '''Formats a code point as UCS symbol string.

    Code points below 0x10000 are written with four uppercase
    hexadecimal digits, e.g. <U0041>, all others with eight,
    e.g. <U0001F600>.
    '''
    template = '<U{:04X}>' if code_point < 0x10000 else '<U{:08X}>'
    return template.format(code_point)
+
def ucs_symbol_range(code_point_low, code_point_high):
    '''Formats a code point range as UCS symbol string.

    The result consists of the UCS symbols of both ends of the range,
    joined by two dots.  Example:

    <U0041>..<U005A>
    '''
    return '..'.join(
        (ucs_symbol(code_point_low), ucs_symbol(code_point_high)))
+
def verifications():
    '''Tests whether the is_* functions observe the known restrictions

    Walks over all code points in UNICODE_ATTRIBUTES in ascending order
    and writes one line to stderr for every violated restriction.
    Nothing is returned and violations do not stop the program.
    '''
    for code_point in sorted(UNICODE_ATTRIBUTES):
        sym = ucs_symbol(code_point)
        # Evaluate every classification once per code point.  Some of
        # the is_* functions may return a falsy value other than False,
        # so these results are only ever used for their truth value.
        cased = is_lower(code_point) or is_upper(code_point)
        alpha = is_alpha(code_point)
        digit = is_digit(code_point)
        xdigit = is_xdigit(code_point)
        space = is_space(code_point)
        cntrl = is_cntrl(code_point)
        punct = is_punct(code_point)
        graph = is_graph(code_point)
        printable = is_print(code_point)
        ascii_space = code_point == 0x0020

        # toupper and tolower restriction: “Only characters specified
        # for the keywords lower and upper shall be specified.”
        for function, mapped in (('toupper', to_upper(code_point)),
                                 ('tolower', to_lower(code_point))):
            if mapped != code_point and not cased:
                sys.stderr.write(
                    ('%(sym)s is not upper|lower '
                     + 'but %(f)s(0x%(c)04X) = 0x%(uc)04X\n') %{
                         'sym': sym,
                         'f': function,
                         'c': code_point,
                         'uc': mapped})

        # Each pair is (restriction violated?, message); the order of
        # the pairs is the order in which violations are reported.
        checks = (
            # alpha restriction: “Characters classified as either upper
            # or lower shall automatically belong to this class.”
            (cased and not alpha, 'is upper|lower but not alpha'),
            # alpha restriction: “No character specified for the
            # keywords cntrl, digit, punct or space shall be specified.”
            (alpha and cntrl, 'is alpha and cntrl'),
            (alpha and digit, 'is alpha and digit'),
            (alpha and punct, 'is alpha and punct'),
            (alpha and space, 'is alpha and space'),
            # space restriction: “No character specified for the
            # keywords upper, lower, alpha, digit, graph or xdigit shall
            # be specified.”  upper, lower, alpha already checked above.
            (space and digit, 'is space and digit'),
            (space and graph, 'is space and graph'),
            (space and xdigit, 'is space and xdigit'),
            # cntrl restriction: “No character specified for the
            # keywords upper, lower, alpha, digit, punct, graph, print
            # or xdigit shall be specified.”  upper, lower, alpha
            # already checked above.
            (cntrl and digit, 'is cntrl and digit'),
            (cntrl and punct, 'is cntrl and punct'),
            (cntrl and graph, 'is cntrl and graph'),
            (cntrl and printable, 'is cntrl and print'),
            (cntrl and xdigit, 'is cntrl and xdigit'),
            # punct restriction: “No character specified for the
            # keywords upper, lower, alpha, digit, cntrl, xdigit or as
            # the <space> character shall be specified.”  upper, lower,
            # alpha, cntrl already checked above.
            (punct and digit, 'is punct and digit'),
            (punct and xdigit, 'is punct and xdigit'),
            (punct and ascii_space, 'is punct'),
            # graph restriction and print restriction: “No character
            # specified for the keyword cntrl shall be specified.”
            # Already checked above.
            #
            # graph - print relation: differ only in the <space>
            # character.  How is this possible if there are more than
            # one space character?!  I think susv2/xbd/locale.html
            # should speak of “space characters”, not “space character”.
            (printable and not (graph or space),
             'is print but not graph|<space>'),
            (not printable and (graph or ascii_space),
             'is graph|<space> but not print'),
        )
        for violated, message in checks:
            if violated:
                sys.stderr.write('%(sym)s %(msg)s\n' %{
                    'sym': sym, 'msg': message})