diff options
Diffstat (limited to 'localedata/unicode-gen/gen_unicode_ctype.py')
-rwxr-xr-x | localedata/unicode-gen/gen_unicode_ctype.py | 513 |
1 files changed, 43 insertions, 470 deletions
diff --git a/localedata/unicode-gen/gen_unicode_ctype.py b/localedata/unicode-gen/gen_unicode_ctype.py index 0c74f2a849..58acc9550a 100755 --- a/localedata/unicode-gen/gen_unicode_ctype.py +++ b/localedata/unicode-gen/gen_unicode_ctype.py @@ -1,7 +1,7 @@ #!/usr/bin/python3 # # Generate a Unicode conforming LC_CTYPE category from a UnicodeData file. -# Copyright (C) 2014-2015 Free Software Foundation, Inc. +# Copyright (C) 2014-2016 Free Software Foundation, Inc. # This file is part of the GNU C Library. # Based on gen-unicode-ctype.c by Bruno Haible <haible@clisp.cons.org>, 2000. # @@ -30,345 +30,9 @@ To see how this script is used, call it with the “-h” option: ''' import argparse -import sys import time import re - -# Dictionary holding the entire contents of the UnicodeData.txt file -# -# Contents of this dictionary look like this: -# -# {0: {'category': 'Cc', -# 'title': None, -# 'digit': '', -# 'name': '<control>', -# 'bidi': 'BN', -# 'combining': '0', -# 'comment': '', -# 'oldname': 'NULL', -# 'decomposition': '', -# 'upper': None, -# 'mirrored': 'N', -# 'lower': None, -# 'decdigit': '', -# 'numeric': ''}, -# … -# } -UNICODE_ATTRIBUTES = {} - -# Dictionary holding the entire contents of the DerivedCoreProperties.txt file -# -# Contents of this dictionary look like this: -# -# {917504: ['Default_Ignorable_Code_Point'], -# 917505: ['Case_Ignorable', 'Default_Ignorable_Code_Point'], -# … -# } -DERIVED_CORE_PROPERTIES = {} - -def fill_attribute(code_point, fields): - '''Stores in UNICODE_ATTRIBUTES[code_point] the values from the fields. - - One entry in the UNICODE_ATTRIBUTES dictionary represents one line - in the UnicodeData.txt file. - - ''' - UNICODE_ATTRIBUTES[code_point] = { - 'name': fields[1], # Character name - 'category': fields[2], # General category - 'combining': fields[3], # Canonical combining classes - 'bidi': fields[4], # Bidirectional category - 'decomposition': fields[5], # Character decomposition mapping - 'decdigit': fields[6], # Decimal digit value - 'digit': fields[7], # Digit value - 'numeric': fields[8], # Numeric value - 'mirrored': fields[9], # mirrored - 'oldname': fields[10], # Old Unicode 1.0 name - 'comment': fields[11], # comment - # Uppercase mapping - 'upper': int(fields[12], 16) if fields[12] else None, - # Lowercase mapping - 'lower': int(fields[13], 16) if fields[13] else None, - # Titlecase mapping - 'title': int(fields[14], 16) if fields[14] else None, - } - -def fill_attributes(filename): - '''Stores the entire contents of the UnicodeData.txt file - in the UNICODE_ATTRIBUTES dictionary. - - A typical line for a single code point in UnicodeData.txt looks - like this: - - 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061; - - Code point ranges are indicated by pairs of lines like this: - - 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;; - 9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;; - ''' - with open(filename, mode='r') as unicode_data_file: - fields_start = [] - for line in unicode_data_file: - fields = line.strip().split(';') - if len(fields) != 15: - sys.stderr.write( - 'short line in file "%(f)s": %(l)s\n' %{ - 'f': filename, 'l': line}) - exit(1) - if fields[2] == 'Cs': - # Surrogates are UTF-16 artefacts, - # not real characters. Ignore them. - fields_start = [] - continue - if fields[1].endswith(', First>'): - fields_start = fields - fields_start[1] = fields_start[1].split(',')[0][1:] - continue - if fields[1].endswith(', Last>'): - fields[1] = fields[1].split(',')[0][1:] - if fields[1:] != fields_start[1:]: - sys.stderr.write( - 'broken code point range in file "%(f)s": %(l)s\n' %{ - 'f': filename, 'l': line}) - exit(1) - for code_point in range( - int(fields_start[0], 16), - int(fields[0], 16)+1): - fill_attribute(code_point, fields) - fields_start = [] - continue - fill_attribute(int(fields[0], 16), fields) - fields_start = [] - -def fill_derived_core_properties(filename): - '''Stores the entire contents of the DerivedCoreProperties.txt file - in the DERIVED_CORE_PROPERTIES dictionary. - - Lines in DerivedCoreProperties.txt are either a code point range like - this: - - 0061..007A ; Lowercase # L& [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z - - or a single code point like this: - - 00AA ; Lowercase # Lo FEMININE ORDINAL INDICATOR - - ''' - with open(filename, mode='r') as derived_core_properties_file: - for line in derived_core_properties_file: - match = re.match( - r'^(?P<codepoint1>[0-9A-F]{4,6})' - + r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?' - + r'\s*;\s*(?P<property>[a-zA-Z_]+)', - line) - if not match: - continue - start = match.group('codepoint1') - end = match.group('codepoint2') - if not end: - end = start - for code_point in range(int(start, 16), int(end, 16)+1): - prop = match.group('property') - if code_point in DERIVED_CORE_PROPERTIES: - DERIVED_CORE_PROPERTIES[code_point].append(prop) - else: - DERIVED_CORE_PROPERTIES[code_point] = [prop] - -def to_upper(code_point): - '''Returns the code point of the uppercase version - of the given code point''' - if (UNICODE_ATTRIBUTES[code_point]['name'] - and UNICODE_ATTRIBUTES[code_point]['upper']): - return UNICODE_ATTRIBUTES[code_point]['upper'] - else: - return code_point - -def to_lower(code_point): - '''Returns the code point of the lowercase version - of the given code point''' - if (UNICODE_ATTRIBUTES[code_point]['name'] - and UNICODE_ATTRIBUTES[code_point]['lower']): - return UNICODE_ATTRIBUTES[code_point]['lower'] - else: - return code_point - -def to_title(code_point): - '''Returns the code point of the titlecase version - of the given code point''' - if (UNICODE_ATTRIBUTES[code_point]['name'] - and UNICODE_ATTRIBUTES[code_point]['title']): - return UNICODE_ATTRIBUTES[code_point]['title'] - else: - return code_point - -def is_upper(code_point): - '''Checks whether the character with this code point is uppercase''' - return (to_lower(code_point) != code_point - or (code_point in DERIVED_CORE_PROPERTIES - and 'Uppercase' in DERIVED_CORE_PROPERTIES[code_point])) - -def is_lower(code_point): - '''Checks whether the character with this code point is lowercase''' - # Some characters are defined as “Lowercase” in - # DerivedCoreProperties.txt but do not have a mapping to upper - # case. For example, ꜰ U+A72F “LATIN LETTER SMALL CAPITAL F” is - # one of these. - return (to_upper(code_point) != code_point - # <U00DF> is lowercase, but without simple to_upper mapping. - or code_point == 0x00DF - or (code_point in DERIVED_CORE_PROPERTIES - and 'Lowercase' in DERIVED_CORE_PROPERTIES[code_point])) - -def is_alpha(code_point): - '''Checks whether the character with this code point is alphabetic''' - return ((code_point in DERIVED_CORE_PROPERTIES - and - 'Alphabetic' in DERIVED_CORE_PROPERTIES[code_point]) - or - # Consider all the non-ASCII digits as alphabetic. - # ISO C 99 forbids us to have them in category “digit”, - # but we want iswalnum to return true on them. - (UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd' - and not (code_point >= 0x0030 and code_point <= 0x0039))) - -def is_digit(code_point): - '''Checks whether the character with this code point is a digit''' - if False: - return (UNICODE_ATTRIBUTES[code_point]['name'] - and UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd') - # Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without - # a zero. Must add <0> in front of them by hand. - else: - # SUSV2 gives us some freedom for the "digit" category, but ISO C 99 - # takes it away: - # 7.25.2.1.5: - # The iswdigit function tests for any wide character that - # corresponds to a decimal-digit character (as defined in 5.2.1). - # 5.2.1: - # the 10 decimal digits 0 1 2 3 4 5 6 7 8 9 - return (code_point >= 0x0030 and code_point <= 0x0039) - -def is_outdigit(code_point): - '''Checks whether the character with this code point is outdigit''' - return (code_point >= 0x0030 and code_point <= 0x0039) - -def is_blank(code_point): - '''Checks whether the character with this code point is blank''' - return (code_point == 0x0009 # '\t' - # Category Zs without mention of '<noBreak>' - or (UNICODE_ATTRIBUTES[code_point]['name'] - and UNICODE_ATTRIBUTES[code_point]['category'] == 'Zs' - and '<noBreak>' not in - UNICODE_ATTRIBUTES[code_point]['decomposition'])) - -def is_space(code_point): - '''Checks whether the character with this code point is a space''' - # Don’t make U+00A0 a space. Non-breaking space means that all programs - # should treat it like a punctuation character, not like a space. - return (code_point == 0x0020 # ' ' - or code_point == 0x000C # '\f' - or code_point == 0x000A # '\n' - or code_point == 0x000D # '\r' - or code_point == 0x0009 # '\t' - or code_point == 0x000B # '\v' - # Categories Zl, Zp, and Zs without mention of "<noBreak>" - or (UNICODE_ATTRIBUTES[code_point]['name'] - and - (UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp'] - or - (UNICODE_ATTRIBUTES[code_point]['category'] in ['Zs'] - and - '<noBreak>' not in - UNICODE_ATTRIBUTES[code_point]['decomposition'])))) - -def is_cntrl(code_point): - '''Checks whether the character with this code point is - a control character''' - return (UNICODE_ATTRIBUTES[code_point]['name'] - and (UNICODE_ATTRIBUTES[code_point]['name'] == '<control>' - or - UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp'])) - -def is_xdigit(code_point): - '''Checks whether the character with this code point is - a hexadecimal digit''' - if False: - return (is_digit(code_point) - or (code_point >= 0x0041 and code_point <= 0x0046) - or (code_point >= 0x0061 and code_point <= 0x0066)) - else: - # SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99 - # takes it away: - # 7.25.2.1.12: - # The iswxdigit function tests for any wide character that - # corresponds to a hexadecimal-digit character (as defined - # in 6.4.4.1). - # 6.4.4.1: - # hexadecimal-digit: one of - # 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F - return ((code_point >= 0x0030 and code_point <= 0x0039) - or (code_point >= 0x0041 and code_point <= 0x0046) - or (code_point >= 0x0061 and code_point <= 0x0066)) - -def is_graph(code_point): - '''Checks whether the character with this code point is - a graphical character''' - return (UNICODE_ATTRIBUTES[code_point]['name'] - and UNICODE_ATTRIBUTES[code_point]['name'] != '<control>' - and not is_space(code_point)) - -def is_print(code_point): - '''Checks whether the character with this code point is printable''' - return (UNICODE_ATTRIBUTES[code_point]['name'] - and UNICODE_ATTRIBUTES[code_point]['name'] != '<control>' - and UNICODE_ATTRIBUTES[code_point]['category'] not in ['Zl', 'Zp']) - -def is_punct(code_point): - '''Checks whether the character with this code point is punctuation''' - if False: - return (UNICODE_ATTRIBUTES[code_point]['name'] - and UNICODE_ATTRIBUTES[code_point]['category'].startswith('P')) - else: - # The traditional POSIX definition of punctuation is every graphic, - # non-alphanumeric character. - return (is_graph(code_point) - and not is_alpha(code_point) - and not is_digit(code_point)) - -def is_combining(code_point): - '''Checks whether the character with this code point is - a combining character''' - # Up to Unicode 3.0.1 we took the Combining property from the PropList.txt - # file. In 3.0.1 it was identical to the union of the general categories - # "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the - # PropList.txt file, so we take the latter definition. - return (UNICODE_ATTRIBUTES[code_point]['name'] - and - UNICODE_ATTRIBUTES[code_point]['category'] in ['Mn', 'Mc', 'Me']) - -def is_combining_level3(code_point): - '''Checks whether the character with this code point is - a combining level3 character''' - return (is_combining(code_point) - and - int(UNICODE_ATTRIBUTES[code_point]['combining']) in range(0, 200)) - -def ucs_symbol(code_point): - '''Return the UCS symbol string for a Unicode character.''' - if code_point < 0x10000: - return '<U{:04X}>'.format(code_point) - else: - return '<U{:08X}>'.format(code_point) - -def ucs_symbol_range(code_point_low, code_point_high): - '''Returns a string UCS symbol string for a code point range. - - Example: - - <U0041>..<U005A> - ''' - return ucs_symbol(code_point_low) + '..' + ucs_symbol(code_point_high) +import unicode_utils def code_point_ranges(is_class_function): '''Returns a list of ranges of code points for which is_class_function @@ -379,7 +43,7 @@ def code_point_ranges(is_class_function): [[65, 90], [192, 214], [216, 222], [256], … ] ''' cp_ranges = [] - for code_point in sorted(UNICODE_ATTRIBUTES): + for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES): if is_class_function(code_point): if (cp_ranges and cp_ranges[-1][-1] == code_point - 1): @@ -413,9 +77,9 @@ def output_charclass(i18n_file, class_name, is_class_function): if line.strip(): line += ';' if len(code_point_range) == 1: - range_string = ucs_symbol(code_point_range[0]) + range_string = unicode_utils.ucs_symbol(code_point_range[0]) else: - range_string = ucs_symbol_range( + range_string = unicode_utils.ucs_symbol_range( code_point_range[0], code_point_range[-1]) if len(line+range_string) > max_column: i18n_file.write(line+'/\n') @@ -441,15 +105,15 @@ def output_charmap(i18n_file, map_name, map_function): line = prefix map_string = '' i18n_file.write('%s /\n' %map_name) - for code_point in sorted(UNICODE_ATTRIBUTES): + for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES): mapped = map_function(code_point) if code_point != mapped: if line.strip(): line += ';' map_string = '(' \ - + ucs_symbol(code_point) \ + + unicode_utils.ucs_symbol(code_point) \ + ',' \ - + ucs_symbol(mapped) \ + + unicode_utils.ucs_symbol(mapped) \ + ')' if len(line+map_string) > max_column: i18n_file.write(line+'/\n') @@ -459,110 +123,6 @@ def output_charmap(i18n_file, map_name, map_function): i18n_file.write(line+'\n') i18n_file.write('\n') -def verifications(): - '''Tests whether the is_* functions observe the known restrictions''' - for code_point in sorted(UNICODE_ATTRIBUTES): - # toupper restriction: "Only characters specified for the keywords - # lower and upper shall be specified. - if (to_upper(code_point) != code_point - and not (is_lower(code_point) or is_upper(code_point))): - sys.stderr.write( - ('%(sym)s is not upper|lower ' - + 'but toupper(0x%(c)04X) = 0x%(uc)04X\n') %{ - 'sym': ucs_symbol(code_point), - 'c': code_point, - 'uc': to_upper(code_point)}) - # tolower restriction: "Only characters specified for the keywords - # lower and upper shall be specified. - if (to_lower(code_point) != code_point - and not (is_lower(code_point) or is_upper(code_point))): - sys.stderr.write( - ('%(sym)s is not upper|lower ' - + 'but tolower(0x%(c)04X) = 0x%(uc)04X\n') %{ - 'sym': ucs_symbol(code_point), - 'c': code_point, - 'uc': to_lower(code_point)}) - # alpha restriction: "Characters classified as either upper or lower - # shall automatically belong to this class. - if ((is_lower(code_point) or is_upper(code_point)) - and not is_alpha(code_point)): - sys.stderr.write('%(sym)s is upper|lower but not alpha\n' %{ - 'sym': ucs_symbol(code_point)}) - # alpha restriction: “No character specified for the keywords cntrl, - # digit, punct or space shall be specified.” - if (is_alpha(code_point) and is_cntrl(code_point)): - sys.stderr.write('%(sym)s is alpha and cntrl\n' %{ - 'sym': ucs_symbol(code_point)}) - if (is_alpha(code_point) and is_digit(code_point)): - sys.stderr.write('%(sym)s is alpha and digit\n' %{ - 'sym': ucs_symbol(code_point)}) - if (is_alpha(code_point) and is_punct(code_point)): - sys.stderr.write('%(sym)s is alpha and punct\n' %{ - 'sym': ucs_symbol(code_point)}) - if (is_alpha(code_point) and is_space(code_point)): - sys.stderr.write('%(sym)s is alpha and space\n' %{ - 'sym': ucs_symbol(code_point)}) - # space restriction: “No character specified for the keywords upper, - # lower, alpha, digit, graph or xdigit shall be specified.” - # upper, lower, alpha already checked above. - if (is_space(code_point) and is_digit(code_point)): - sys.stderr.write('%(sym)s is space and digit\n' %{ - 'sym': ucs_symbol(code_point)}) - if (is_space(code_point) and is_graph(code_point)): - sys.stderr.write('%(sym)s is space and graph\n' %{ - 'sym': ucs_symbol(code_point)}) - if (is_space(code_point) and is_xdigit(code_point)): - sys.stderr.write('%(sym)s is space and xdigit\n' %{ - 'sym': ucs_symbol(code_point)}) - # cntrl restriction: “No character specified for the keywords upper, - # lower, alpha, digit, punct, graph, print or xdigit shall be - # specified.” upper, lower, alpha already checked above. - if (is_cntrl(code_point) and is_digit(code_point)): - sys.stderr.write('%(sym)s is cntrl and digit\n' %{ - 'sym': ucs_symbol(code_point)}) - if (is_cntrl(code_point) and is_punct(code_point)): - sys.stderr.write('%(sym)s is cntrl and punct\n' %{ - 'sym': ucs_symbol(code_point)}) - if (is_cntrl(code_point) and is_graph(code_point)): - sys.stderr.write('%(sym)s is cntrl and graph\n' %{ - 'sym': ucs_symbol(code_point)}) - if (is_cntrl(code_point) and is_print(code_point)): - sys.stderr.write('%(sym)s is cntrl and print\n' %{ - 'sym': ucs_symbol(code_point)}) - if (is_cntrl(code_point) and is_xdigit(code_point)): - sys.stderr.write('%(sym)s is cntrl and xdigit\n' %{ - 'sym': ucs_symbol(code_point)}) - # punct restriction: “No character specified for the keywords upper, - # lower, alpha, digit, cntrl, xdigit or as the <space> character shall - # be specified.” upper, lower, alpha, cntrl already checked above. - if (is_punct(code_point) and is_digit(code_point)): - sys.stderr.write('%(sym)s is punct and digit\n' %{ - 'sym': ucs_symbol(code_point)}) - if (is_punct(code_point) and is_xdigit(code_point)): - sys.stderr.write('%(sym)s is punct and xdigit\n' %{ - 'sym': ucs_symbol(code_point)}) - if (is_punct(code_point) and code_point == 0x0020): - sys.stderr.write('%(sym)s is punct\n' %{ - 'sym': ucs_symbol(code_point)}) - # graph restriction: “No character specified for the keyword cntrl - # shall be specified.” Already checked above. - - # print restriction: “No character specified for the keyword cntrl - # shall be specified.” Already checked above. - - # graph - print relation: differ only in the <space> character. - # How is this possible if there are more than one space character?! - # I think susv2/xbd/locale.html should speak of “space characters”, - # not “space character”. - if (is_print(code_point) - and not (is_graph(code_point) or is_space(code_point))): - sys.stderr.write('%(sym)s is print but not graph|<space>\n' %{ - 'sym': ucs_symbol(code_point)}) - if (not is_print(code_point) - and (is_graph(code_point) or code_point == 0x0020)): - sys.stderr.write('%(sym)s is graph|<space> but not print\n' %{ - 'sym': ucs_symbol(code_point)}) - def read_input_file(filename): '''Reads the original glibc i18n file to get the original head and tail. @@ -636,7 +196,7 @@ def output_tail(i18n_file, tail=''): else: i18n_file.write('END LC_CTYPE\n') -def output_tables(i18n_file, unicode_version): +def output_tables(i18n_file, unicode_version, turkish): '''Write the new LC_CTYPE character classes to the output file''' i18n_file.write('% The following is the 14652 i18n fdcc-set ' + 'LC_CTYPE category.\n') @@ -648,18 +208,18 @@ def output_tables(i18n_file, unicode_version): + 'program.\n\n') i18n_file.write('% The "upper" class reflects the uppercase ' + 'characters of class "alpha"\n') - output_charclass(i18n_file, 'upper', is_upper) + output_charclass(i18n_file, 'upper', unicode_utils.is_upper) i18n_file.write('% The "lower" class reflects the lowercase ' + 'characters of class "alpha"\n') - output_charclass(i18n_file, 'lower', is_lower) + output_charclass(i18n_file, 'lower', unicode_utils.is_lower) i18n_file.write('% The "alpha" class of the "i18n" FDCC-set is ' + 'reflecting\n') i18n_file.write('% the recommendations in TR 10176 annex A\n') - output_charclass(i18n_file, 'alpha', is_alpha) + output_charclass(i18n_file, 'alpha', unicode_utils.is_alpha) i18n_file.write('% The "digit" class must only contain the ' + 'BASIC LATIN digits, says ISO C 99\n') i18n_file.write('% (sections 7.25.2.1.5 and 5.2.1).\n') - output_charclass(i18n_file, 'digit', is_digit) + output_charclass(i18n_file, 'digit', unicode_utils.is_digit) i18n_file.write('% The "outdigit" information is by default ' + '"0" to "9". We don\'t have to\n') i18n_file.write('% provide it here since localedef will fill ' @@ -669,29 +229,36 @@ def output_tables(i18n_file, unicode_version): i18n_file.write('% outdigit /\n') i18n_file.write('% <U0030>..<U0039>\n\n') # output_charclass(i18n_file, 'outdigit', is_outdigit) - output_charclass(i18n_file, 'space', is_space) - output_charclass(i18n_file, 'cntrl', is_cntrl) - output_charclass(i18n_file, 'punct', is_punct) - output_charclass(i18n_file, 'graph', is_graph) - output_charclass(i18n_file, 'print', is_print) + output_charclass(i18n_file, 'space', unicode_utils.is_space) + output_charclass(i18n_file, 'cntrl', unicode_utils.is_cntrl) + output_charclass(i18n_file, 'punct', unicode_utils.is_punct) + output_charclass(i18n_file, 'graph', unicode_utils.is_graph) + output_charclass(i18n_file, 'print', unicode_utils.is_print) i18n_file.write('% The "xdigit" class must only contain the ' + 'BASIC LATIN digits and A-F, a-f,\n') i18n_file.write('% says ISO C 99 ' + '(sections 7.25.2.1.12 and 6.4.4.1).\n') - output_charclass(i18n_file, 'xdigit', is_xdigit) - output_charclass(i18n_file, 'blank', is_blank) - output_charmap(i18n_file, 'toupper', to_upper) - output_charmap(i18n_file, 'tolower', to_lower) - output_charmap(i18n_file, 'map "totitle";', to_title) + output_charclass(i18n_file, 'xdigit', unicode_utils.is_xdigit) + output_charclass(i18n_file, 'blank', unicode_utils.is_blank) + if turkish: + i18n_file.write('% The case conversions reflect ' + + 'Turkish conventions.\n') + output_charmap(i18n_file, 'toupper', unicode_utils.to_upper_turkish) + output_charmap(i18n_file, 'tolower', unicode_utils.to_lower_turkish) + else: + output_charmap(i18n_file, 'toupper', unicode_utils.to_upper) + output_charmap(i18n_file, 'tolower', unicode_utils.to_lower) + output_charmap(i18n_file, 'map "totitle";', unicode_utils.to_title) i18n_file.write('% The "combining" class reflects ISO/IEC 10646-1 ' + 'annex B.1\n') i18n_file.write('% That is, all combining characters (level 2+3).\n') - output_charclass(i18n_file, 'class "combining";', is_combining) + output_charclass(i18n_file, 'class "combining";', + unicode_utils.is_combining) i18n_file.write('% The "combining_level3" class reflects ' + 'ISO/IEC 10646-1 annex B.2\n') i18n_file.write('% That is, combining characters of level 3.\n') - output_charclass(i18n_file, - 'class "combining_level3";', is_combining_level3) + output_charclass(i18n_file, 'class "combining_level3";', + unicode_utils.is_combining_level3) if __name__ == "__main__": PARSER = argparse.ArgumentParser( @@ -737,15 +304,21 @@ if __name__ == "__main__": required=True, type=str, help='The Unicode version of the input files used.') + PARSER.add_argument( + '--turkish', + action='store_true', + help='Use Turkish case conversions.') ARGS = PARSER.parse_args() - fill_attributes(ARGS.unicode_data_file) - fill_derived_core_properties(ARGS.derived_core_properties_file) - verifications() + unicode_utils.fill_attributes( + ARGS.unicode_data_file) + unicode_utils.fill_derived_core_properties( + ARGS.derived_core_properties_file) + unicode_utils.verifications() HEAD = TAIL = '' if ARGS.input_file: (HEAD, TAIL) = read_input_file(ARGS.input_file) with open(ARGS.output_file, mode='w') as I18N_FILE: output_head(I18N_FILE, ARGS.unicode_version, head=HEAD) - output_tables(I18N_FILE, ARGS.unicode_version) + output_tables(I18N_FILE, ARGS.unicode_version, ARGS.turkish) output_tail(I18N_FILE, tail=TAIL) |