summaryrefslogtreecommitdiff
path: root/localedata/unicode-gen/utf8_gen.py
diff options
context:
space:
mode:
Diffstat (limited to 'localedata/unicode-gen/utf8_gen.py')
-rwxr-xr-x  localedata/unicode-gen/utf8_gen.py  186
1 files changed, 135 insertions, 51 deletions
diff --git a/localedata/unicode-gen/utf8_gen.py b/localedata/unicode-gen/utf8_gen.py
index 2c63787a35..2d8d631a96 100755
--- a/localedata/unicode-gen/utf8_gen.py
+++ b/localedata/unicode-gen/utf8_gen.py
@@ -1,6 +1,6 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
-# Copyright (C) 2014-2016 Free Software Foundation, Inc.
+# Copyright (C) 2014-2018 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
@@ -27,6 +27,7 @@ Usage: python3 utf8_gen.py UnicodeData.txt EastAsianWidth.txt
It will output UTF-8 file
'''
+import argparse
import sys
import re
import unicode_utils
@@ -197,9 +198,10 @@ def write_header_charmap(outfile):
outfile.write("% alias ISO-10646/UTF-8\n")
outfile.write("CHARMAP\n")
-def write_header_width(outfile):
+def write_header_width(outfile, unicode_version):
'''Writes the header on top of the WIDTH section to the output file'''
- outfile.write('% Character width according to Unicode 7.0.0.\n')
+ outfile.write('% Character width according to Unicode '
+ + '{:s}.\n'.format(unicode_version))
outfile.write('% - Default width is 1.\n')
outfile.write('% - Double-width characters have width 2; generated from\n')
outfile.write('% "grep \'^[^;]*;[WF]\' EastAsianWidth.txt"\n')
@@ -215,66 +217,148 @@ def write_header_width(outfile):
# outfile.write("% \"grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt\"\n")
outfile.write("WIDTH\n")
-def process_width(outfile, ulines, elines):
+def process_width(outfile, ulines, elines, plines):
'''ulines are lines from UnicodeData.txt, elines are lines from
- EastAsianWidth.txt
+ EastAsianWidth.txt containing characters with width “W” or “F”,
+ plines are lines from PropList.txt which contain characters
+ with the property “Prepended_Concatenation_Mark”.
'''
width_dict = {}
+ for line in elines:
+ fields = line.split(";")
+ if not '..' in fields[0]:
+ code_points = (fields[0], fields[0])
+ else:
+ code_points = fields[0].split("..")
+ for key in range(int(code_points[0], 16),
+ int(code_points[1], 16)+1):
+ width_dict[key] = 2
+
for line in ulines:
fields = line.split(";")
- if fields[4] == "NSM" or fields[2] == "Cf":
- width_dict[int(fields[0], 16)] = unicode_utils.ucs_symbol(
- int(fields[0], 16)) + '\t0'
+ if fields[4] == "NSM" or fields[2] in ("Cf", "Me", "Mn"):
+ width_dict[int(fields[0], 16)] = 0
- for line in elines:
- # If an entry in EastAsianWidth.txt is found, it overrides entries in
- # UnicodeData.txt:
+ for line in plines:
+ # Characters with the property “Prepended_Concatenation_Mark”
+ # should have the width 1:
fields = line.split(";")
if not '..' in fields[0]:
- width_dict[int(fields[0], 16)] = unicode_utils.ucs_symbol(
- int(fields[0], 16)) + '\t2'
+ code_points = (fields[0], fields[0])
else:
code_points = fields[0].split("..")
- for key in range(int(code_points[0], 16),
- int(code_points[1], 16)+1):
- if key in width_dict:
- del width_dict[key]
- width_dict[int(code_points[0], 16)] = '{:s}...{:s}\t2'.format(
- unicode_utils.ucs_symbol(int(code_points[0], 16)),
- unicode_utils.ucs_symbol(int(code_points[1], 16)))
+ for key in range(int(code_points[0], 16),
+ int(code_points[1], 16)+1):
+ del width_dict[key] # default width is 1
+
+ # handle special cases for compatibility
+ for key in list((0x00AD,)):
+ # https://www.cs.tut.fi/~jkorpela/shy.html
+ if key in width_dict:
+ del width_dict[key] # default width is 1
+ for key in list(range(0x1160, 0x1200)):
+ width_dict[key] = 0
+ for key in list(range(0x3248, 0x3250)):
+ # These are “A” which means we can decide whether to treat them
+ # as “W” or “N” based on context:
+ # http://www.unicode.org/mail-arch/unicode-ml/y2017-m08/0023.html
+ # For us, “W” seems better.
+ width_dict[key] = 2
+ for key in list(range(0x4DC0, 0x4E00)):
+ width_dict[key] = 2
+ same_width_lists = []
+ current_width_list = []
for key in sorted(width_dict):
- outfile.write(width_dict[key]+'\n')
+ if not current_width_list:
+ current_width_list = [key]
+ elif (key == current_width_list[-1] + 1
+ and width_dict[key] == width_dict[current_width_list[0]]):
+ current_width_list.append(key)
+ else:
+ same_width_lists.append(current_width_list)
+ current_width_list = [key]
+ if current_width_list:
+ same_width_lists.append(current_width_list)
+
+ for same_width_list in same_width_lists:
+ if len(same_width_list) == 1:
+ outfile.write('{:s}\t{:d}\n'.format(
+ unicode_utils.ucs_symbol(same_width_list[0]),
+ width_dict[same_width_list[0]]))
+ else:
+ outfile.write('{:s}...{:s}\t{:d}\n'.format(
+ unicode_utils.ucs_symbol(same_width_list[0]),
+ unicode_utils.ucs_symbol(same_width_list[-1]),
+ width_dict[same_width_list[0]]))
if __name__ == "__main__":
- if len(sys.argv) < 3:
- print("USAGE: python3 utf8_gen.py UnicodeData.txt EastAsianWidth.txt")
- else:
- with open(sys.argv[1], mode='r') as UNIDATA_FILE:
- UNICODE_DATA_LINES = UNIDATA_FILE.readlines()
- with open(sys.argv[2], mode='r') as EAST_ASIAN_WIDTH_FILE:
- EAST_ASIAN_WIDTH_LINES = []
- for LINE in EAST_ASIAN_WIDTH_FILE:
- # If characters from EastAasianWidth.txt which are from
- # from reserved ranges (i.e. not yet assigned code points)
- # are added to the WIDTH section of the UTF-8 file, then
- # “make check” produces “Unknown Character” errors for
- # these code points because such unassigned code points
- # are not in the CHARMAP section of the UTF-8 file.
- #
- # Therefore, we skip all reserved code points when reading
- # the EastAsianWidth.txt file.
- if re.match(r'.*<reserved-.+>\.\.<reserved-.+>.*', LINE):
- continue
- if re.match(r'^[^;]*;[WF]', LINE):
- EAST_ASIAN_WIDTH_LINES.append(LINE.strip())
- with open('UTF-8', mode='w') as OUTFILE:
- # Processing UnicodeData.txt and write CHARMAP to UTF-8 file
- write_header_charmap(OUTFILE)
- process_charmap(UNICODE_DATA_LINES, OUTFILE)
- OUTFILE.write("END CHARMAP\n\n")
- # Processing EastAsianWidth.txt and write WIDTH to UTF-8 file
- write_header_width(OUTFILE)
- process_width(OUTFILE, UNICODE_DATA_LINES, EAST_ASIAN_WIDTH_LINES)
- OUTFILE.write("END WIDTH\n")
+ PARSER = argparse.ArgumentParser(
+ description='''
+ Generate a UTF-8 file from UnicodeData.txt, EastAsianWidth.txt, and PropList.txt.
+ ''')
+ PARSER.add_argument(
+ '-u', '--unicode_data_file',
+ nargs='?',
+ type=str,
+ default='UnicodeData.txt',
+ help=('The UnicodeData.txt file to read, '
+ + 'default: %(default)s'))
+ PARSER.add_argument(
+ '-e', '--east_asian_width_file',
+ nargs='?',
+ type=str,
+ default='EastAsianWidth.txt',
+ help=('The EastAsianWidth.txt file to read, '
+ + 'default: %(default)s'))
+ PARSER.add_argument(
+ '-p', '--prop_list_file',
+ nargs='?',
+ type=str,
+ default='PropList.txt',
+ help=('The PropList.txt file to read, '
+ + 'default: %(default)s'))
+ PARSER.add_argument(
+ '--unicode_version',
+ nargs='?',
+ required=True,
+ type=str,
+ help='The Unicode version of the input files used.')
+ ARGS = PARSER.parse_args()
+
+ with open(ARGS.unicode_data_file, mode='r') as UNIDATA_FILE:
+ UNICODE_DATA_LINES = UNIDATA_FILE.readlines()
+ with open(ARGS.east_asian_width_file, mode='r') as EAST_ASIAN_WIDTH_FILE:
+ EAST_ASIAN_WIDTH_LINES = []
+ for LINE in EAST_ASIAN_WIDTH_FILE:
+ # If characters from EastAsianWidth.txt which are
+ # from reserved ranges (i.e. not yet assigned code points)
+ # are added to the WIDTH section of the UTF-8 file, then
+ # “make check” produces “Unknown Character” errors for
+ # these code points because such unassigned code points
+ # are not in the CHARMAP section of the UTF-8 file.
+ #
+ # Therefore, we skip all reserved code points when reading
+ # the EastAsianWidth.txt file.
+ if re.match(r'.*<reserved-.+>\.\.<reserved-.+>.*', LINE):
+ continue
+ if re.match(r'^[^;]*;[WF]', LINE):
+ EAST_ASIAN_WIDTH_LINES.append(LINE.strip())
+ with open(ARGS.prop_list_file, mode='r') as PROP_LIST_FILE:
+ PROP_LIST_LINES = []
+ for LINE in PROP_LIST_FILE:
+ if re.match(r'^[^;]*;[\s]*Prepended_Concatenation_Mark', LINE):
+ PROP_LIST_LINES.append(LINE.strip())
+ with open('UTF-8', mode='w') as OUTFILE:
+ # Processing UnicodeData.txt and write CHARMAP to UTF-8 file
+ write_header_charmap(OUTFILE)
+ process_charmap(UNICODE_DATA_LINES, OUTFILE)
+ OUTFILE.write("END CHARMAP\n\n")
+ # Processing EastAsianWidth.txt and write WIDTH to UTF-8 file
+ write_header_width(OUTFILE, ARGS.unicode_version)
+ process_width(OUTFILE,
+ UNICODE_DATA_LINES,
+ EAST_ASIAN_WIDTH_LINES,
+ PROP_LIST_LINES)
+ OUTFILE.write("END WIDTH\n")