2
0
mirror of https://github.com/boostorg/parser.git synced 2026-01-19 16:32:13 +00:00
Files
parser/misc/generate_case_fold_data.py
Zach Laine d0fe18b61d Add single-code point Unicode case folding function; data and tests are
generated from CaseFolding.txt.

Partially addressed #44.
2024-01-04 02:26:36 -06:00

123 lines
3.6 KiB
Python
Executable File

#!/usr/bin/env python3
# Copyright (c) 2024 T. Zachary Laine
#
# Distributed under the Boost Software License, Version 1.0. (See accompanying
# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
# Get the latest version of the data file at:
# https://www.unicode.org/Public/UCD/latest/ucd/CaseFolding.txt
import itertools
def print_structs():
    """Print the C++ definition of the ``short_mapping_range`` struct."""
    members = (
        'char32_t cp_first_;',
        'char32_t cp_last_;',
        'uint16_t stride_;',
        'uint16_t first_idx_;',
    )
    # One declaration per output line, framed by the struct's opening and
    # closing lines.
    print('\n'.join(('struct short_mapping_range {',) + members + ('};',)))
# Load the raw case-folding table.  The context manager closes the file as
# soon as it has been read, and the explicit encoding keeps the read
# independent of the platform's default locale.
with open('CaseFolding.txt', encoding='utf-8') as f:
    lines = f.readlines()

# Keep only the data lines of interest: drop comment lines, blank (or
# whitespace-only) lines, and every T; and S; entry.
lines = [
    x for x in lines
    if x.strip()
    and not x.startswith('#')
    and ' T;' not in x
    and ' S;' not in x
]

# Split each line on ';' and discard the trailing field (the comment), leaving
# [code point, status, mapping].
lines = [x.split(';')[:-1] for x in lines]

# Indices of the F; lines.  The lines are already split at this point, so the
# status has to be read from field 1; searching the field list for the raw
# text ' F;' can never match.
f_line_indices = [
    i for i, fields in enumerate(lines)
    if len(fields) > 1 and fields[1].strip() == 'F'
]

# Remove all C; lines that come before a F; line that applies to the same code
# point.  Walking the indices in reverse keeps the not-yet-visited (smaller)
# indices valid while entries are deleted.
for f_idx in reversed(f_line_indices):
    prev = f_idx - 1
    if (
        prev >= 0
        and lines[prev][0] == lines[f_idx][0]
        and lines[prev][1].strip() == 'C'
    ):
        del lines[prev]

# The mapping code points, in the order listed in lines (only individual
# mapping-cps).
all_mapping_cps = []

# Length, in code points, of the longest mapping seen (never less than 1).
max_mapping_len = 1

# (code point, [mapping code points]) pairs, in the order listed in lines
# (only multiple mapping-cps).
big_mappings = []

for line in lines:
    mappings = line[-1].strip().split(' ')
    max_mapping_len = max(len(mappings), max_mapping_len)
    if len(mappings) > 1:
        big_mappings.append((line[0], mappings))
    else:
        # Record where this line's single mapping lives in all_mapping_cps by
        # appending that index to the line itself.
        line.append(len(all_mapping_cps))
        all_mapping_cps += mappings
def print_long_mapping():
    """Print the C++ ``long_mapping`` struct and the ``long_mappings`` table.

    Reads the module-level ``max_mapping_len`` (emitted as
    ``longest_mapping``) and ``big_mappings`` (one table row per entry, each
    row terminated with a trailing ``0`` element).
    """
    print(
        f'inline constexpr int longest_mapping = {max_mapping_len};\n'
        'struct long_mapping {\n'
        'char32_t cp_;\n'
        'char32_t mapping_[longest_mapping + 1];\n'
        '};'
    )
    print(f'inline constexpr long_mapping long_mappings[{len(big_mappings)}] = {{')
    for cp, folded in big_mappings:
        # Every mapping code point becomes a hex literal.
        joined = ', '.join('0x' + piece for piece in folded)
        print(f' {{0x{cp}, {{{joined} , 0}}}},')
    print('};')
def print_mapping_cps():
    """Print the C++ ``single_mapping_cps`` array, eight entries per row.

    Reads the module-level ``all_mapping_cps``; each entry is written as a
    hex literal followed by ``', '``.
    """
    per_row = 8
    total = len(all_mapping_cps)
    print(f'inline constexpr char32_t single_mapping_cps[{total}] = {{')
    # Step through the table one row at a time; the last row may be shorter.
    for start in range(0, total, per_row):
        row = all_mapping_cps[start:start + per_row]
        print(' ' + ''.join('0x' + cp + ', ' for cp in row))
    print('};\n')
# Collapse the single-code-point mappings into ranges.  Each entry of
# `ranges` is (first code point, last code point, stride, first index): the
# code points in [first, last) are spaced `stride` apart, and the first index
# is the last field of the range's first line (the index appended to each
# single-mapping line earlier in this script).
ranges = []

# Only lines whose mapping field holds exactly one code point take part.
lines = [x for x in lines if len(x[2].strip().split(' ')) == 1]

# Lazily pair every remaining line with its successor.
pairs = zip(lines, lines[1:])


def pair_diff(pair):
    """Return the code-point distance between the two lines of *pair*."""
    earlier, later = pair
    prev_cp = int('0x' + earlier[0], 0)
    cp = int('0x' + later[0], 0)
    return cp - prev_cp


# Consecutive pairs that share the same distance form one range.
for stride, run in itertools.groupby(pairs, pair_diff):
    run = list(run)
    first_pair, last_pair = run[0], run[-1]
    ranges.append(
        (first_pair[0][0], last_pair[1][0], stride, first_pair[0][-1]))

# The upper bound of every range is exclusive, so the final range's bound is
# rewritten as "<last code point> + 1" to take in the very last line.
tail = ranges[-1]
ranges[-1] = (tail[0], lines[-1][0] + ' + 1', tail[2], tail[3])
def print_ranges():
    """Print the C++ ``mapping_ranges`` table, one row per entry of ``ranges``.

    Reads the module-level ``ranges``; the first two fields of each entry are
    written with a ``0x`` prefix, the last two verbatim.
    """
    print(f'inline constexpr short_mapping_range mapping_ranges[{len(ranges)}] = {{')
    for first, last, stride, first_idx in ranges:
        print(f' {{0x{first}, 0x{last}, {stride}, {first_idx}}}, ')
    print('};')
# Emit the generated C++ header on stdout: licence banner and namespace
# opener first, then each section, then the namespace's closing brace.
print(
    '// Copyright (c) 2024 T. Zachary Laine\n'
    '//\n'
    '// Distributed under the Boost Software License, Version 1.0. (See accompanying\n'
    '// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)\n'
    '// Warning: This header is auto-generated (see misc/generate_case_fold_data.py).\n'
    '// The lack of include guards is intentional.\n'
    'namespace boost::parser::detail {\n'
)
# The struct definitions are printed before the tables that use them.
for emit_section in (
        print_structs,
        print_long_mapping,
        print_ranges,
        print_mapping_cps,
):
    emit_section()
print('}')