diff options
Diffstat (limited to 'localedata/unicode-gen/utf8_gen.py')
-rwxr-xr-x | localedata/unicode-gen/utf8_gen.py | 186 |
1 file changed, 135 insertions, 51 deletions
diff --git a/localedata/unicode-gen/utf8_gen.py b/localedata/unicode-gen/utf8_gen.py index 2c63787a35..2d8d631a96 100755 --- a/localedata/unicode-gen/utf8_gen.py +++ b/localedata/unicode-gen/utf8_gen.py @@ -1,6 +1,6 @@ #!/usr/bin/python3 # -*- coding: utf-8 -*- -# Copyright (C) 2014-2016 Free Software Foundation, Inc. +# Copyright (C) 2014-2018 Free Software Foundation, Inc. # This file is part of the GNU C Library. # # The GNU C Library is free software; you can redistribute it and/or @@ -27,6 +27,7 @@ Usage: python3 utf8_gen.py UnicodeData.txt EastAsianWidth.txt It will output UTF-8 file ''' +import argparse import sys import re import unicode_utils @@ -197,9 +198,10 @@ def write_header_charmap(outfile): outfile.write("% alias ISO-10646/UTF-8\n") outfile.write("CHARMAP\n") -def write_header_width(outfile): +def write_header_width(outfile, unicode_version): '''Writes the header on top of the WIDTH section to the output file''' - outfile.write('% Character width according to Unicode 7.0.0.\n') + outfile.write('% Character width according to Unicode ' + + '{:s}.\n'.format(unicode_version)) outfile.write('% - Default width is 1.\n') outfile.write('% - Double-width characters have width 2; generated from\n') outfile.write('% "grep \'^[^;]*;[WF]\' EastAsianWidth.txt"\n') @@ -215,66 +217,148 @@ def write_header_width(outfile): # outfile.write("% \"grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt\"\n") outfile.write("WIDTH\n") -def process_width(outfile, ulines, elines): +def process_width(outfile, ulines, elines, plines): '''ulines are lines from UnicodeData.txt, elines are lines from - EastAsianWidth.txt + EastAsianWidth.txt containing characters with width “W” or “F”, + plines are lines from PropList.txt which contain characters + with the property “Prepended_Concatenation_Mark”. ''' width_dict = {} + for line in elines: + fields = line.split(";") + if not '..' 
in fields[0]: + code_points = (fields[0], fields[0]) + else: + code_points = fields[0].split("..") + for key in range(int(code_points[0], 16), + int(code_points[1], 16)+1): + width_dict[key] = 2 + for line in ulines: fields = line.split(";") - if fields[4] == "NSM" or fields[2] == "Cf": - width_dict[int(fields[0], 16)] = unicode_utils.ucs_symbol( - int(fields[0], 16)) + '\t0' + if fields[4] == "NSM" or fields[2] in ("Cf", "Me", "Mn"): + width_dict[int(fields[0], 16)] = 0 - for line in elines: - # If an entry in EastAsianWidth.txt is found, it overrides entries in - # UnicodeData.txt: + for line in plines: + # Characters with the property “Prepended_Concatenation_Mark” + # should have the width 1: fields = line.split(";") if not '..' in fields[0]: - width_dict[int(fields[0], 16)] = unicode_utils.ucs_symbol( - int(fields[0], 16)) + '\t2' + code_points = (fields[0], fields[0]) else: code_points = fields[0].split("..") - for key in range(int(code_points[0], 16), - int(code_points[1], 16)+1): - if key in width_dict: - del width_dict[key] - width_dict[int(code_points[0], 16)] = '{:s}...{:s}\t2'.format( - unicode_utils.ucs_symbol(int(code_points[0], 16)), - unicode_utils.ucs_symbol(int(code_points[1], 16))) + for key in range(int(code_points[0], 16), + int(code_points[1], 16)+1): + del width_dict[key] # default width is 1 + + # handle special cases for compatibility + for key in list((0x00AD,)): + # https://www.cs.tut.fi/~jkorpela/shy.html + if key in width_dict: + del width_dict[key] # default width is 1 + for key in list(range(0x1160, 0x1200)): + width_dict[key] = 0 + for key in list(range(0x3248, 0x3250)): + # These are “A” which means we can decide whether to treat them + # as “W” or “N” based on context: + # http://www.unicode.org/mail-arch/unicode-ml/y2017-m08/0023.html + # For us, “W” seems better. 
+ width_dict[key] = 2 + for key in list(range(0x4DC0, 0x4E00)): + width_dict[key] = 2 + same_width_lists = [] + current_width_list = [] for key in sorted(width_dict): - outfile.write(width_dict[key]+'\n') + if not current_width_list: + current_width_list = [key] + elif (key == current_width_list[-1] + 1 + and width_dict[key] == width_dict[current_width_list[0]]): + current_width_list.append(key) + else: + same_width_lists.append(current_width_list) + current_width_list = [key] + if current_width_list: + same_width_lists.append(current_width_list) + + for same_width_list in same_width_lists: + if len(same_width_list) == 1: + outfile.write('{:s}\t{:d}\n'.format( + unicode_utils.ucs_symbol(same_width_list[0]), + width_dict[same_width_list[0]])) + else: + outfile.write('{:s}...{:s}\t{:d}\n'.format( + unicode_utils.ucs_symbol(same_width_list[0]), + unicode_utils.ucs_symbol(same_width_list[-1]), + width_dict[same_width_list[0]])) if __name__ == "__main__": - if len(sys.argv) < 3: - print("USAGE: python3 utf8_gen.py UnicodeData.txt EastAsianWidth.txt") - else: - with open(sys.argv[1], mode='r') as UNIDATA_FILE: - UNICODE_DATA_LINES = UNIDATA_FILE.readlines() - with open(sys.argv[2], mode='r') as EAST_ASIAN_WIDTH_FILE: - EAST_ASIAN_WIDTH_LINES = [] - for LINE in EAST_ASIAN_WIDTH_FILE: - # If characters from EastAasianWidth.txt which are from - # from reserved ranges (i.e. not yet assigned code points) - # are added to the WIDTH section of the UTF-8 file, then - # “make check” produces “Unknown Character” errors for - # these code points because such unassigned code points - # are not in the CHARMAP section of the UTF-8 file. - # - # Therefore, we skip all reserved code points when reading - # the EastAsianWidth.txt file. 
- if re.match(r'.*<reserved-.+>\.\.<reserved-.+>.*', LINE): - continue - if re.match(r'^[^;]*;[WF]', LINE): - EAST_ASIAN_WIDTH_LINES.append(LINE.strip()) - with open('UTF-8', mode='w') as OUTFILE: - # Processing UnicodeData.txt and write CHARMAP to UTF-8 file - write_header_charmap(OUTFILE) - process_charmap(UNICODE_DATA_LINES, OUTFILE) - OUTFILE.write("END CHARMAP\n\n") - # Processing EastAsianWidth.txt and write WIDTH to UTF-8 file - write_header_width(OUTFILE) - process_width(OUTFILE, UNICODE_DATA_LINES, EAST_ASIAN_WIDTH_LINES) - OUTFILE.write("END WIDTH\n") + PARSER = argparse.ArgumentParser( + description=''' + Generate a UTF-8 file from UnicodeData.txt, EastAsianWidth.txt, and PropList.txt. + ''') + PARSER.add_argument( + '-u', '--unicode_data_file', + nargs='?', + type=str, + default='UnicodeData.txt', + help=('The UnicodeData.txt file to read, ' + + 'default: %(default)s')) + PARSER.add_argument( + '-e', '--east_asian_with_file', + nargs='?', + type=str, + default='EastAsianWidth.txt', + help=('The EastAsianWidth.txt file to read, ' + + 'default: %(default)s')) + PARSER.add_argument( + '-p', '--prop_list_file', + nargs='?', + type=str, + default='PropList.txt', + help=('The PropList.txt file to read, ' + + 'default: %(default)s')) + PARSER.add_argument( + '--unicode_version', + nargs='?', + required=True, + type=str, + help='The Unicode version of the input files used.') + ARGS = PARSER.parse_args() + + with open(ARGS.unicode_data_file, mode='r') as UNIDATA_FILE: + UNICODE_DATA_LINES = UNIDATA_FILE.readlines() + with open(ARGS.east_asian_with_file, mode='r') as EAST_ASIAN_WIDTH_FILE: + EAST_ASIAN_WIDTH_LINES = [] + for LINE in EAST_ASIAN_WIDTH_FILE: + # If characters from EastAasianWidth.txt which are from + # from reserved ranges (i.e. 
not yet assigned code points) + # are added to the WIDTH section of the UTF-8 file, then + # “make check” produces “Unknown Character” errors for + # these code points because such unassigned code points + # are not in the CHARMAP section of the UTF-8 file. + # + # Therefore, we skip all reserved code points when reading + # the EastAsianWidth.txt file. + if re.match(r'.*<reserved-.+>\.\.<reserved-.+>.*', LINE): + continue + if re.match(r'^[^;]*;[WF]', LINE): + EAST_ASIAN_WIDTH_LINES.append(LINE.strip()) + with open(ARGS.prop_list_file, mode='r') as PROP_LIST_FILE: + PROP_LIST_LINES = [] + for LINE in PROP_LIST_FILE: + if re.match(r'^[^;]*;[\s]*Prepended_Concatenation_Mark', LINE): + PROP_LIST_LINES.append(LINE.strip()) + with open('UTF-8', mode='w') as OUTFILE: + # Processing UnicodeData.txt and write CHARMAP to UTF-8 file + write_header_charmap(OUTFILE) + process_charmap(UNICODE_DATA_LINES, OUTFILE) + OUTFILE.write("END CHARMAP\n\n") + # Processing EastAsianWidth.txt and write WIDTH to UTF-8 file + write_header_width(OUTFILE, ARGS.unicode_version) + process_width(OUTFILE, + UNICODE_DATA_LINES, + EAST_ASIAN_WIDTH_LINES, + PROP_LIST_LINES) + OUTFILE.write("END WIDTH\n") |