Automate LC_CTYPE generation for tr_TR, update to Unicode 8.0.0 (bug 18491).

This patch extends the automated Unicode LC_CTYPE generation so that it can also generate the modified LC_CTYPE used for Turkish, where the case conversions of 'i' and 'I' differ from ASCII conventions ('i' uppercases to U+0130 and 'I' lowercases to U+0131). This allows the tr_TR locale to be kept in sync more readily with future Unicode updates. The patch includes the locale update generated by the scripts.

Tested for x86_64.

[BZ #18491]
* unicode-gen/unicode_utils.py (to_upper_turkish): New function.
(to_lower_turkish): Likewise.
* unicode-gen/gen_unicode_ctype.py (output_tables): Support producing output with Turkish case conversions.
(--turkish): New command-line option.
* unicode-gen/Makefile (GENERATED): Add tr_TR.
(tr_TR): New rule.
* locales/tr_TR: Regenerate LC_CTYPE.
author: Joseph Myers <joseph@codesourcery.com> 2015-12-11 12:45:19 +0000
committer: Joseph Myers <joseph@codesourcery.com> 2015-12-11 12:45:19 +0000
commit: 85bafe6f3d0e4647af5e74178b46abdf44d3b981 (patch)
tree: bb6033cdf1d2cd10e26ddf42fa009a923e256807 /localedata/unicode-gen
parent: 77356912e83601fd0240d22fe4d960348b82b5c3 (diff)
3 files changed, 37 insertions, 5 deletions
diff --git a/localedata/unicode-gen/Makefile b/localedata/unicode-gen/Makefile
index 5b7305d54e..4cb1fabc77 100644
--- a/localedata/unicode-gen/Makefile
+++ b/localedata/unicode-gen/Makefile
@@ -41,7 +41,7 @@ PYTHON3 = python3
 WGET = wget
 
 DOWNLOADS = UnicodeData.txt DerivedCoreProperties.txt EastAsianWidth.txt
-GENERATED = i18n UTF-8 translit_combining translit_compat translit_circle translit_cjk_compat translit_font translit_fraction
+GENERATED = i18n tr_TR UTF-8 translit_combining translit_compat translit_circle translit_cjk_compat translit_font translit_fraction
 REPORTS = i18n-report UTF-8-report
 
 all: $(GENERATED)
@@ -50,6 +50,7 @@ check: check-i18n check-UTF-8
 
 install:
 	cp -p i18n ../locales/i18n
+	cp -p tr_TR ../locales/tr_TR
 	cp -p UTF-8 ../charmaps/UTF-8
 	cp -p translit_combining ../locales/translit_combining
 	cp -p translit_compat ../locales/translit_compat
@@ -82,6 +83,13 @@ check-i18n: i18n-report
 		i18n-report; \
 	then echo manual verification required; false; else true; fi
 
+tr_TR: UnicodeData.txt DerivedCoreProperties.txt
+tr_TR: ../locales/tr_TR # Preserve non-ctype information.
+tr_TR: gen_unicode_ctype.py
+	$(PYTHON3) gen_unicode_ctype.py -u UnicodeData.txt \
+	  -d DerivedCoreProperties.txt -i ../locales/tr_TR -o $@ \
+	  --unicode_version $(UNICODE_VERSION) --turkish
+
 UTF-8: UnicodeData.txt EastAsianWidth.txt
 UTF-8: utf8_gen.py
 	$(PYTHON3) utf8_gen.py UnicodeData.txt EastAsianWidth.txt
diff --git a/localedata/unicode-gen/gen_unicode_ctype.py b/localedata/unicode-gen/gen_unicode_ctype.py
index 0f064f5ba5..bcb50bf9a5 100755
--- a/localedata/unicode-gen/gen_unicode_ctype.py
+++ b/localedata/unicode-gen/gen_unicode_ctype.py
@@ -196,7 +196,7 @@ def output_tail(i18n_file, tail=''):
     else:
         i18n_file.write('END LC_CTYPE\n')
 
-def output_tables(i18n_file, unicode_version):
+def output_tables(i18n_file, unicode_version, turkish):
     '''Write the new LC_CTYPE character classes to the output file'''
     i18n_file.write('% The following is the 14652 i18n fdcc-set '
                     + 'LC_CTYPE category.\n')
@@ -240,8 +240,14 @@ def output_tables(i18n_file, unicode_version):
                     + '(sections 7.25.2.1.12 and 6.4.4.1).\n')
     output_charclass(i18n_file, 'xdigit', unicode_utils.is_xdigit)
     output_charclass(i18n_file, 'blank', unicode_utils.is_blank)
-    output_charmap(i18n_file, 'toupper', unicode_utils.to_upper)
-    output_charmap(i18n_file, 'tolower', unicode_utils.to_lower)
+    if turkish:
+        i18n_file.write('% The case conversions reflect '
+                        + 'Turkish conventions.\n')
+        output_charmap(i18n_file, 'toupper', unicode_utils.to_upper_turkish)
+        output_charmap(i18n_file, 'tolower', unicode_utils.to_lower_turkish)
+    else:
+        output_charmap(i18n_file, 'toupper', unicode_utils.to_upper)
+        output_charmap(i18n_file, 'tolower', unicode_utils.to_lower)
     output_charmap(i18n_file, 'map "totitle";', unicode_utils.to_title)
     i18n_file.write('% The "combining" class reflects ISO/IEC 10646-1 '
                     + 'annex B.1\n')
@@ -298,6 +304,10 @@ if __name__ == "__main__":
         required=True,
         type=str,
         help='The Unicode version of the input files used.')
+    PARSER.add_argument(
+        '--turkish',
+        action='store_true',
+        help='Use Turkish case conversions.')
     ARGS = PARSER.parse_args()
 
     unicode_utils.fill_attributes(
@@ -310,5 +320,5 @@ if __name__ == "__main__":
         (HEAD, TAIL) = read_input_file(ARGS.input_file)
     with open(ARGS.output_file, mode='w') as I18N_FILE:
         output_head(I18N_FILE, ARGS.unicode_version, head=HEAD)
-        output_tables(I18N_FILE, ARGS.unicode_version)
+        output_tables(I18N_FILE, ARGS.unicode_version, ARGS.turkish)
         output_tail(I18N_FILE, tail=TAIL)
diff --git a/localedata/unicode-gen/unicode_utils.py b/localedata/unicode-gen/unicode_utils.py
index ee91582823..26a57ef293 100644
--- a/localedata/unicode-gen/unicode_utils.py
+++ b/localedata/unicode-gen/unicode_utils.py
@@ -220,6 +220,20 @@ def to_lower(code_point):
     else:
         return code_point
 
+def to_upper_turkish(code_point):
+    '''Returns the code point of the Turkish uppercase version
+    of the given code point'''
+    if code_point == 0x0069:
+        return 0x0130
+    return to_upper(code_point)
+
+def to_lower_turkish(code_point):
+    '''Returns the code point of the Turkish lowercase version
+    of the given code point'''
+    if code_point == 0x0049:
+        return 0x0131
+    return to_lower(code_point)
+
 def to_title(code_point):
     '''Returns the code point of the titlecase version
     of the given code point'''
author	Joseph Myers <joseph@codesourcery.com>	2015-12-11 12:45:19 +0000
committer	Joseph Myers <joseph@codesourcery.com>	2015-12-11 12:45:19 +0000
commit	85bafe6f3d0e4647af5e74178b46abdf44d3b981 (patch)
tree	bb6033cdf1d2cd10e26ddf42fa009a923e256807 /localedata/unicode-gen
parent	77356912e83601fd0240d22fe4d960348b82b5c3 (diff)