1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
|
#!/usr/bin/env python3
import os
import sys
import textwrap
# Directory holding this script; the Unicode data file is looked up relative
# to it, so the script does not depend on the current working directory.
self_path = os.path.dirname(os.path.realpath(__file__))

# Statuses of the CaseFolding.txt entries to keep; entries with any other
# status are skipped. (Per the header of CaseFolding.txt, "C" stands for
# common and "F" for full case folding.)
status_list = [ "C", "F" ]

# folding_list[N-1] maps a codepoint to its folding consisting of exactly
# N codepoints (N = 1, 2 or 3).
folding_list = [ dict(), dict(), dict() ]

# Filter the foldings for "full" folding.
#
# The file is opened through a context manager so that it gets closed even
# when a malformed line aborts the parsing, and the encoding is given
# explicitly so that the result does not depend on the locale.
with open(self_path + "/unicode/CaseFolding.txt", "r", encoding="utf-8") as f:
    for line in f:
        # Drop the trailing comment (if any) and skip lines left empty.
        comment_off = line.find("#")
        if comment_off >= 0:
            line = line[:comment_off]
        line = line.strip()
        if not line:
            continue

        # Expected shape of a data line: "<codepoint>; <status>; <mapping>;"
        raw_codepoint, status, raw_mapping, ignored_tail = line.split(";", 3)
        if status.strip() not in status_list:
            continue

        codepoint = int(raw_codepoint.strip(), 16)
        # split() without a separator tolerates runs of whitespace between
        # the mapped codepoints.
        mapping = [int(it, 16) for it in raw_mapping.split()]
        mapping_len = len(mapping)

        # Raise explicitly rather than assert: asserts are removed under -O,
        # which would silently drop the unsupported mapping.
        if not 1 <= mapping_len <= len(folding_list):
            raise ValueError(
                "Unsupported folding of U+{:04X}: {} codepoints".format(
                    codepoint, mapping_len))
        folding_list[mapping_len-1][codepoint] = mapping
# Assuming that (index0 ... index-1) already forms a range (as defined below),
# check that the newly provided index is compatible with the range too; i.e.
# verify that the range can be extended without breaking its properties.
#
# Currently, we can handle ranges which:
#
# (1) either form a consecutive sequence of codepoints which is mapped to
#     another consecutive range of codepoints (of the same length);
#
# (2) or form a sequence of codepoints with step 2 where each codepoint CP
#     is mapped to the codepoint CP+1
#     (e.g. 0x1234 -> 0x1235; 0x1236 -> 0x1237; 0x1238 -> 0x1239; ...).
#
# Note: When the codepoints in the range are mapped to multiple codepoints,
# only the 1st mapped codepoint is considered by the rules above. All the
# other mapped codepoints have to be the same for every mapping covered by
# the range.
def is_range_compatible(folding, codepoint_list, index0, index):
    """Tell whether the range starting at index0 may be extended up to index.

    folding        -- dict mapping a codepoint to the list of codepoints it
                      folds to.
    codepoint_list -- list of the keys of `folding`; `index0` and `index` are
                      positions in this list.
    index0         -- position of the first codepoint of the range.
    index          -- position of the candidate last codepoint of the range.

    Returns True if the codepoints at index0, index0+1 and index fit one of
    the two supported range types, False otherwise. Only the first, second
    and candidate entries are inspected here; the entries in between are not
    re-examined by this call.
    """
    span = index - index0

    first_cp = codepoint_list[index0]
    second_cp = codepoint_list[index0+1]
    last_cp = codepoint_list[index]

    first_map = folding[first_cp]
    second_map = folding[second_cp]
    last_map = folding[last_cp]

    # The distance between the first two codepoints selects the range type.
    step = second_cp - first_cp

    if step == 1:
        # Type (1): consecutive codepoints mapped to consecutive codepoints;
        # anything after the 1st mapped codepoint has to be identical.
        return (last_cp - first_cp == span
                and second_map[0] - first_map[0] == 1
                and second_map[1:] == first_map[1:]
                and last_map[0] - first_map[0] == span
                and last_map[1:] == first_map[1:])

    if step == 2:
        # Type (2): every other codepoint, each one mapped to its successor;
        # again with identical remainders of the mappings.
        return (last_cp - first_cp == 2 * span
                and first_map[0] - first_cp == 1
                and second_map[0] - second_cp == 1
                and second_map[1:] == first_map[1:]
                and last_map[0] - last_cp == 1
                and last_map[1:] == first_map[1:])

    return False
def mapping_str(list, mapping):
    """Format the codepoints of `mapping` as comma-separated hex literals.

    Each codepoint is rendered as "0x" followed by at least four lowercase
    hexadecimal digits. The first parameter is not used by the function; it
    is kept only so the existing call sites stay valid.
    """
    as_hex = "0x{:04x}".format
    return ",".join(map(as_hex, mapping))
def _write_c_array(name, items):
    """Write `items` to stdout as the C array `static const unsigned <name>[]`.

    The items are joined with ", " and wrapped to lines of at most 110
    characters.
    """
    sys.stdout.write("static const unsigned {}[] = {{\n".format(name))
    sys.stdout.write("\n".join(textwrap.wrap(", ".join(items), 110,
                initial_indent = " ", subsequent_indent=" ")))
    sys.stdout.write("\n};\n")


# Emit one pair of tables (FOLD_MAP_<N> and FOLD_MAP_<N>_DATA) for each
# supported mapping length N.
for mapping_len in range(1, 4):
    folding = folding_list[mapping_len-1]

    # Sort explicitly: the range detection below compares neighbouring
    # entries and therefore needs the codepoints in ascending order. Relying
    # on the order of the input file (and on dict insertion order) would make
    # that an unchecked assumption.
    codepoint_list = sorted(folding)

    index0 = 0
    count = len(codepoint_list)

    # `records` lists the source codepoints (singles or ranges); the mapped
    # codepoints are collected in `data_records`, in the same order.
    records = list()
    data_records = list()

    while index0 < count:
        # Grow the candidate range [index0, index1) as far as possible.
        index1 = index0 + 1
        while index1 < count and is_range_compatible(folding, codepoint_list, index0, index1):
            index1 += 1

        if index1 - index0 > 2:
            # Range of codepoints. A range always emits two data records
            # (the mappings of its first and last codepoint), so it is only
            # used for runs of more than two codepoints.
            records.append("R(0x{:04x},0x{:04x})".format(codepoint_list[index0], codepoint_list[index1-1]))
            data_records.append(mapping_str(data_records, folding[codepoint_list[index0]]))
            data_records.append(mapping_str(data_records, folding[codepoint_list[index1-1]]))
            index0 = index1
        else:
            # Single codepoint. Only one entry is consumed here, so the next
            # codepoint still gets a chance to start a range of its own.
            records.append("S(0x{:04x})".format(codepoint_list[index0]))
            data_records.append(mapping_str(data_records, folding[codepoint_list[index0]]))
            index0 += 1

    # NOTE(review): R(...) and S(...) are written verbatim; presumably they
    # are macros provided by the C code which includes the generated output.
    _write_c_array("FOLD_MAP_{}".format(mapping_len), records)
    _write_c_array("FOLD_MAP_{}_DATA".format(mapping_len), data_records)
|