diff options
Diffstat (limited to 'localedata/unicode-gen/utf8_gen.py')
-rwxr-xr-x | localedata/unicode-gen/utf8_gen.py | 186 |
1 file changed, 135 insertions, 51 deletions
diff --git a/localedata/unicode-gen/utf8_gen.py b/localedata/unicode-gen/utf8_gen.py index 2c63787a35..2d8d631a96 100755 --- a/localedata/unicode-gen/utf8_gen.py +++ b/localedata/unicode-gen/utf8_gen.py @@ -1,6 +1,6 @@ #!/usr/bin/python3 # -*- coding: utf-8 -*- -# Copyright (C) 2014-2016 Free Software Foundation, Inc. +# Copyright (C) 2014-2018 Free Software Foundation, Inc. # This file is part of the GNU C Library. # # The GNU C Library is free software; you can redistribute it and/or @@ -27,6 +27,7 @@ Usage: python3 utf8_gen.py UnicodeData.txt EastAsianWidth.txt It will output UTF-8 file ''' +import argparse import sys import re import unicode_utils @@ -197,9 +198,10 @@ def write_header_charmap(outfile): outfile.write("% alias ISO-10646/UTF-8\n") outfile.write("CHARMAP\n") -def write_header_width(outfile): +def write_header_width(outfile, unicode_version): '''Writes the header on top of the WIDTH section to the output file''' - outfile.write('% Character width according to Unicode 7.0.0.\n') + outfile.write('% Character width according to Unicode ' + + '{:s}.\n'.format(unicode_version)) outfile.write('% - Default width is 1.\n') outfile.write('% - Double-width characters have width 2; generated from\n') outfile.write('% "grep \'^[^;]*;[WF]\' EastAsianWidth.txt"\n') @@ -215,66 +217,148 @@ def write_header_width(outfile): # outfile.write("% \"grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt\"\n") outfile.write("WIDTH\n") -def process_width(outfile, ulines, elines): +def process_width(outfile, ulines, elines, plines): '''ulines are lines from UnicodeData.txt, elines are lines from - EastAsianWidth.txt + EastAsianWidth.txt containing characters with width “W” or “F”, + plines are lines from PropList.txt which contain characters + with the property “Prepended_Concatenation_Mark”. ''' width_dict = {} + for line in elines: + fields = line.split(";") + if not '..' 
in fields[0]: + code_points = (fields[0], fields[0]) + else: + code_points = fields[0].split("..") + for key in range(int(code_points[0], 16), + int(code_points[1], 16)+1): + width_dict[key] = 2 + for line in ulines: fields = line.split(";") - if fields[4] == "NSM" or fields[2] == "Cf": - width_dict[int(fields[0], 16)] = unicode_utils.ucs_symbol( - int(fields[0], 16)) + '\t0' + if fields[4] == "NSM" or fields[2] in ("Cf", "Me", "Mn"): + width_dict[int(fields[0], 16)] = 0 - for line in elines: - # If an entry in EastAsianWidth.txt is found, it overrides entries in - # UnicodeData.txt: + for line in plines: + # Characters with the property “Prepended_Concatenation_Mark” + # should have the width 1: fields = line.split(";") if not '..' in fields[0]: - width_dict[int(fields[0], 16)] = unicode_utils.ucs_symbol( - int(fields[0], 16)) + '\t2' + code_points = (fields[0], fields[0]) else: code_points = fields[0].split("..") - for key in range(int(code_points[0], 16), - int(code_points[1], 16)+1): - if key in width_dict: - del width_dict[key] - width_dict[int(code_points[0], 16)] = '{:s}...{:s}\t2'.format( - unicode_utils.ucs_symbol(int(code_points[0], 16)), - unicode_utils.ucs_symbol(int(code_points[1], 16))) + for key in range(int(code_points[0], 16), + int(code_points[1], 16)+1): + del width_dict[key] # default width is 1 + + # handle special cases for compatibility + for key in list((0x00AD,)): + # https://www.cs.tut.fi/~jkorpela/shy.html + if key in width_dict: + del width_dict[key] # default width is 1 + for key in list(range(0x1160, 0x1200)): + width_dict[key] = 0 + for key in list(range(0x3248, 0x3250)): + # These are “A” which means we can decide whether to treat them + # as “W” or “N” based on context: + # http://www.unicode.org/mail-arch/unicode-ml/y2017-m08/0023.html + # For us, “W” seems better. 
+ width_dict[key] = 2 + for key in list(range(0x4DC0, 0x4E00)): + width_dict[key] = 2 + same_width_lists = [] + current_width_list = [] for key in sorted(width_dict): - outfile.write(width_dict[key]+'\n') + if not current_width_list: + current_width_list = [key] + elif (key == current_width_list[-1] + 1 + and width_dict[key] == width_dict[current_width_list[0]]): + current_width_list.append(key) + else: + same_width_lists.append(current_width_list) + current_width_list = [key] + if current_width_list: + same_width_lists.append(current_width_list) + + for same_width_list in same_width_lists: + if len(same_width_list) == 1: + outfile.write('{:s}\t{:d}\n'.format( + unicode_utils.ucs_symbol(same_width_list[0]), + width_dict[same_width_list[0]])) + else: + outfile.write('{:s}...{:s}\t{:d}\n'.format( + unicode_utils.ucs_symbol(same_width_list[0]), + unicode_utils.ucs_symbol(same_width_list[-1]), + width_dict[same_width_list[0]])) if __name__ == "__main__": - if len(sys.argv) < 3: - print("USAGE: python3 utf8_gen.py UnicodeData.txt EastAsianWidth.txt") - else: - with open(sys.argv[1], mode='r') as UNIDATA_FILE: - UNICODE_DATA_LINES = UNIDATA_FILE.readlines() - with open(sys.argv[2], mode='r') as EAST_ASIAN_WIDTH_FILE: - EAST_ASIAN_WIDTH_LINES = [] - for LINE in EAST_ASIAN_WIDTH_FILE: - # If characters from EastAasianWidth.txt which are from - # from reserved ranges (i.e. not yet assigned code points) - # are added to the WIDTH section of the UTF-8 file, then - # “make check” produces “Unknown Character” errors for - # these code points because such unassigned code points - # are not in the CHARMAP section of the UTF-8 file. - # - # Therefore, we skip all reserved code points when reading - # the EastAsianWidth.txt file. 
- if re.match(r'.*<reserved-.+>\.\.<reserved-.+>.*', LINE): - continue - if re.match(r'^[^;]*;[WF]', LINE): - EAST_ASIAN_WIDTH_LINES.append(LINE.strip()) - with open('UTF-8', mode='w') as OUTFILE: - # Processing UnicodeData.txt and write CHARMAP to UTF-8 file - write_header_charmap(OUTFILE) - process_charmap(UNICODE_DATA_LINES, OUTFILE) - OUTFILE.write("END CHARMAP\n\n") - # Processing EastAsianWidth.txt and write WIDTH to UTF-8 file - write_header_width(OUTFILE) - process_width(OUTFILE, UNICODE_DATA_LINES, EAST_ASIAN_WIDTH_LINES) - OUTFILE.write("END WIDTH\n") + PARSER = argparse.ArgumentParser( + description=''' + Generate a UTF-8 file from UnicodeData.txt, EastAsianWidth.txt, and PropList.txt. + ''') + PARSER.add_argument( + '-u', '--unicode_data_file', + nargs='?', + type=str, + default='UnicodeData.txt', + help=('The UnicodeData.txt file to read, ' + + 'default: %(default)s')) + PARSER.add_argument( + '-e', '--east_asian_with_file', + nargs='?', + type=str, + default='EastAsianWidth.txt', + help=('The EastAsianWidth.txt file to read, ' + + 'default: %(default)s')) + PARSER.add_argument( + '-p', '--prop_list_file', + nargs='?', + type=str, + default='PropList.txt', + help=('The PropList.txt file to read, ' + + 'default: %(default)s')) + PARSER.add_argument( + '--unicode_version', + nargs='?', + required=True, + type=str, + help='The Unicode version of the input files used.') + ARGS = PARSER.parse_args() + + with open(ARGS.unicode_data_file, mode='r') as UNIDATA_FILE: + UNICODE_DATA_LINES = UNIDATA_FILE.readlines() + with open(ARGS.east_asian_with_file, mode='r') as EAST_ASIAN_WIDTH_FILE: + EAST_ASIAN_WIDTH_LINES = [] + for LINE in EAST_ASIAN_WIDTH_FILE: + # If characters from EastAasianWidth.txt which are from + # from reserved ranges (i.e. 
not yet assigned code points) + # are added to the WIDTH section of the UTF-8 file, then + # “make check” produces “Unknown Character” errors for + # these code points because such unassigned code points + # are not in the CHARMAP section of the UTF-8 file. + # + # Therefore, we skip all reserved code points when reading + # the EastAsianWidth.txt file. + if re.match(r'.*<reserved-.+>\.\.<reserved-.+>.*', LINE): + continue + if re.match(r'^[^;]*;[WF]', LINE): + EAST_ASIAN_WIDTH_LINES.append(LINE.strip()) + with open(ARGS.prop_list_file, mode='r') as PROP_LIST_FILE: + PROP_LIST_LINES = [] + for LINE in PROP_LIST_FILE: + if re.match(r'^[^;]*;[\s]*Prepended_Concatenation_Mark', LINE): + PROP_LIST_LINES.append(LINE.strip()) + with open('UTF-8', mode='w') as OUTFILE: + # Processing UnicodeData.txt and write CHARMAP to UTF-8 file + write_header_charmap(OUTFILE) + process_charmap(UNICODE_DATA_LINES, OUTFILE) + OUTFILE.write("END CHARMAP\n\n") + # Processing EastAsianWidth.txt and write WIDTH to UTF-8 file + write_header_width(OUTFILE, ARGS.unicode_version) + process_width(OUTFILE, + UNICODE_DATA_LINES, + EAST_ASIAN_WIDTH_LINES, + PROP_LIST_LINES) + OUTFILE.write("END WIDTH\n") |