Last active
January 28, 2022 04:55
-
-
Save chayleaf/e0334b1d87616c8d362c4e064a565570 to your computer and use it in GitHub Desktop.
lightweight unicode collation+lowercasing (only cyrillics+some diacritics)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# Copyright: pavlukivan 2022 | |
# License: 0BSD (Meaning this notice can be removed, use this as you like) | |
# Tested on Py3.10, should work even on Py3.6 though | |
import functools | |
import itertools | |
import unicodedata | |
# Number of spaces used for a single indentation level in the generated code
IDENT = 2
# Letters that are replaced by a plain ASCII sequence, grouped by language.
#
# Keys should be the UPPERCASE form of the letter. For the German Eszett this
# matters: upper-casing the lowercase 'ß' yields the two letters 'SS', while
# lower-casing the capital 'ẞ' (added to the alphabet in 2017) yields 'ß'.
#
# Every key must be exactly one code point. The Dutch ligature is therefore
# written as an escape (U+0132): spelled as the two ASCII letters "IJ" it is
# not a single character.
combined_chars = {
    'german': {'ẞ': 'SS'},
    'french': {'Æ': 'AE', 'Œ': 'OE'},
    'danish': {'Æ': 'AE', 'Ǽ': 'AE'},
    'dutch': {'\u0132': 'IJ'},
    'norwegian': {'Æ': 'AE'},
}
# Letters, per language, that cannot be decomposed into an ASCII-only
# sequence. Each value is a string whose individual characters are the letters.
# NOTE(review): the 'czech' entry ends with ĂÂÎȘȚ, which look like Romanian
# letters -- confirm whether a separate entry was intended.
other_chars = dict(
    czech='ÁČĎÉĚÍŇÓŘŠŤÚŮÝŽĂÂÎȘȚ',
    danish='ÅÁÉÍÓÚÝØǾ',
    dutch='ÁÉÍÓÚÝ',
    finnish='ÅÄÖŠŽ',
    french='ÀÂÇÉÈÊËÎÏÔÙÛÜŸ',
    german='ÄÖÜ',
    hungarian='ÁÉÓŐÖÚŰÜ',
    italian='ÀÈÉÌÍÎÒÓÙÚ',
    norwegian='ÅÀÉÈÊÓÒÔÙÜØ',
    polish='ĄĆĘŁŃÓŚŹŻ',
    portuguese='ÁÂÃÀÇÉÊÈÍÌÓÔÕÙÚ',
    russian='АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ',
    spanish='Ñ',
    swedish='ÅÄÖ',
    ukrainian='АБВГҐДЕЄЖЗИІЇЙКЛМНОПРСТУФХЦЧШЩЬЮЯ',
)
def set_of_chars(d):
    """Flatten a language -> characters dict into one set of characters.

    Every value of d is iterated element by element (for a string that means
    character by character) and each element is NFC-normalized on its own
    before it is added to the result. The language keys are ignored.
    """
    return {
        unicodedata.normalize('NFC', char)
        for chars in d.values()
        for char in chars
    }
# lru_cache(maxsize=None) is what functools.cache is shorthand for, but it is
# also available on Python versions older than 3.9.
@functools.lru_cache(maxsize=None)
def all_forms(ch):
    """Return the spellings (all, or most) that NFC-normalize to ch.

    ch must be a single, NFC-normalized character. The result is a set of
    strings: precomposed single characters as well as base + combining-mark
    sequences. Results are memoized, so callers must not mutate the set.
    """
    assert len(ch) == 1
    decomposed = unicodedata.normalize('NFD', ch)
    n = len(decomposed)
    # same_as[i]: single characters whose NFD form is exactly decomposed[i]
    same_as = [set() for _ in range(n)]
    # prefixes[i]: single characters whose NFD form equals the first i + 1
    # code points of decomposed
    prefixes = [set() for _ in range(n)]
    # NOTE(review): this scans U+0000..U+10FFF only, not the whole Unicode
    # range (which ends at U+10FFFF) -- presumably enough for the Latin and
    # Cyrillic letters this script targets; confirm before widening, since
    # the scan runs once per distinct ch.
    for codepoint in range(0x11000):
        candidate = chr(codepoint)
        candidate_nfd = unicodedata.normalize('NFD', candidate)
        for i, part in enumerate(decomposed):
            if candidate_nfd == part:
                same_as[i].add(candidate)
        if decomposed.startswith(candidate_nfd):
            prefixes[len(candidate_nfd) - 1].add(candidate)
    # single characters whose full decomposition matches
    ret = prefixes[n - 1]
    # just in case, the character is a spelling of itself
    ret.add(ch)
    if n == 1:
        return ret
    # Combine every partial start with the remaining marks. This won't work
    # that well with more than 2 diacritics, but that probably doesn't
    # matter for the letters handled here.
    for i in range(n - 1):
        # every ordering of every choice of remaining marks (diacritics can
        # be applied in any order); computed once per split point
        tails = set()
        for combo in itertools.product(*same_as[i + 1:]):
            tails.update(itertools.permutations(combo))
        for start in prefixes[i]:
            for tail in tails:
                spelling = start + ''.join(tail)
                # keep only spellings that really compose back to ch
                if unicodedata.normalize('NFC', spelling) == ch:
                    ret.add(spelling)
    return ret
# spelling (UTF-8 bytes) -> lowercase replacement (UTF-8 bytes)
mapping = {}
# reverse lookup: spelling (UTF-8 bytes) -> the character it was derived from
originals = {}


def add_raw(k, v, original):
    """Record v as the lowercase replacement for the spelling k.

    Nothing is recorded when k and v carry the same non-ASCII characters in
    the same order, i.e. when the two differ in their ASCII part only.
    Both k and v are stored UTF-8 encoded; original is stored as given.
    """
    def non_ascii(text):
        return [c for c in text if ord(c) >= 128]

    if non_ascii(k) == non_ascii(v):
        return
    key = k.encode('utf-8')
    mapping[key] = v.encode('utf-8')
    originals[key] = original
def add_char(ch, turn_into=None):
    """Register every spelling of ch together with its lowercase replacement.

    ch is NFC-normalized first. turn_into is the text ch should be mapped to
    before lower-casing; when it is None, ch maps to itself.
    """
    ch = unicodedata.normalize('NFC', ch)
    if turn_into is None:
        turn_into = ch
    # Decompose so that more of the result is plain ASCII, which is far
    # easier to handle. upper() runs before lower() so that characters whose
    # uppercase form is longer (e.g. 'ß' -> 'SS') are expanded first.
    norm_lower = unicodedata.normalize('NFD', turn_into).upper().lower()
    for form in all_forms(ch):
        lowered = form.lower()
        # the spelling itself, its lowercase, and its re-uppercased lowercase
        for spelling in (form, lowered, lowered.upper()):
            add_raw(spelling, norm_lower, ch)
# Register every configured character; the language grouping is not used.
for replacements in combined_chars.values():
    for char, ascii_form in replacements.items():
        add_char(char, ascii_form)
for char in set_of_chars(other_chars):
    add_char(char)

# Distinct key lengths (in bytes), longest first
all_lengths = reversed(sorted({len(key) for key in mapping}))
def quote_for_c(b):
    """Return b as a double-quoted C string literal.

    b may be bytes or str (str is encoded as UTF-8 first). Double quotes,
    backslashes, tab, newline and carriage return use their named escapes,
    other printable ASCII is copied verbatim, and every remaining byte is
    written as a two-digit hexadecimal escape.
    """
    if isinstance(b, str):
        b = b.encode('utf-8')
    # Iterating over bytes yields ints, so the table is keyed by byte value.
    simple_escapes = {
        ord('"'): '\\"',
        ord('\\'): '\\\\',
        ord('\t'): '\\t',
        ord('\n'): '\\n',
        ord('\r'): '\\r',
    }
    hex_digits = b'0123456789abcdefABCDEF'
    pieces = ['"']
    after_hex_escape = False
    for byte in b:
        if byte in simple_escapes:
            pieces.append(simple_escapes[byte])
            after_hex_escape = False
        elif 32 <= byte < 127:
            # A C hex escape consumes every hex digit that follows it, so a
            # literal hex-digit character right after one must start a new
            # (adjacent, implicitly concatenated) string literal.
            if after_hex_escape and byte in hex_digits:
                pieces.append('""')
            pieces.append(chr(byte))
            after_hex_escape = False
        else:
            pieces.append('\\x%02x' % byte)
            after_hex_escape = True
    pieces.append('"')
    return ''.join(pieces)
def c_byteshift(x, n, used_type):
    """Build a C expression that widens char x and shifts it left by n bytes.

    x is the C expression for the char; it is cast to unsigned char and then
    to used_type. For n == 0 no shift (and no outer parentheses) is emitted.
    """
    widened = f'({used_type})(unsigned char){x}'
    if n == 0:
        return widened
    return f'({widened} << {n * 8})'
def to_c_number(b):
    """Return the bytes of b as one big-endian C hexadecimal literal.

    b may be bytes or str (str is encoded as UTF-8 first). Every byte is
    written as exactly two hex digits so that bytes below 0x10 keep their
    position in the number. At most 8 bytes are accepted, which is the
    widest value (unsigned long long) the generator loop selects.
    """
    if isinstance(b, str):
        b = b.encode('utf-8')
    assert len(b) <= 8
    return '0x' + ''.join('%02x' % byte for byte in b)
# Emit one `if (...) { switch (...) { ... } }` block per key length,
# longest keys first.
for length in all_lengths:
    # pick the smallest unsigned type that can hold `length` bytes
    assert length <= 8
    if length <= 2:
        used_type = 'unsigned short'
    elif length <= 4:
        used_type = 'unsigned int'
    else:
        used_type = 'unsigned long long'
    outer = ' ' * IDENT
    inner = ' ' * (IDENT * 2)
    print(f'if (chars_left_including_current >= {length}) {{')
    # pack the next `length` bytes into one number, first byte most significant
    terms = [c_byteshift('chars[i]', length - 1, used_type)]
    terms.extend(
        c_byteshift(f'chars[i + {offset}]', length - offset - 1, used_type)
        for offset in range(1, length)
    )
    packed = ' | '.join(terms)
    print(f'{outer}{used_type} next_n_bytes = {packed};')
    print(f'{outer}switch (next_n_bytes) {{')
    # all entries whose key has this length, sorted by key
    entries = sorted(
        (entry for entry in mapping.items() if len(entry[0]) == length),
        key=lambda entry: entry[0],
    )
    for key, value in entries:
        print(
            f'{inner}case {to_c_number(key)}: /* {originals[key]} */ '
            f'answer += {quote_for_c(value)}; i += {length - 1}; continue;'
        )
    # prevent non-exhaustiveness warnings
    print(f'{inner}default: break;')
    # close switch
    print(f'{outer}}}')
    # close if
    print('}')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Copyright: pavlukivan 2022 | |
// License: 0BSD (Meaning this notice can be removed, use this as you like) | |
#include "lightweight_unicode.h" | |
// See https://en.wikipedia.org/wiki/UTF-8#Encoding
// Number of bytes the UTF-8 character beginning with `start` is supposed to
// occupy, or 0 when `start` cannot begin a character.
int utf8_char_length(unsigned char start) {
    if ((start & 0x80) == 0x00) return 1;  // 0xxxxxxx
    if ((start & 0xE0) == 0xC0) return 2;  // 110xxxxx
    if ((start & 0xF0) == 0xE0) return 3;  // 1110xxxx
    if ((start & 0xF8) == 0xF0) return 4;  // 11110xxx
    // anything else (including 10xxxxxx continuation bytes) is not a start
    return 0;
}
// Lowercases ASCII 'A'..'Z'; every other value is returned unchanged.
char ascii_to_lower(char in) {
    const bool is_upper = 'A' <= in && in <= 'Z';
    return is_upper ? static_cast<char>(in - 'A' + 'a') : in;
}
std::string to_lower_utf8(const std::string &source) { | |
int length = source.length(); | |
const char *chars = source.data(); | |
std::string answer; | |
for (int i = 0; i < length; ++i) { | |
int chars_left_including_current = length - i; | |
int supposed_char_length = utf8_char_length(chars[i]); | |
if ( | |
// invalid character start | |
supposed_char_length <= 0 | |
// character is longer than there are bytes left | |
|| supposed_char_length > chars_left_including_current) | |
{ | |
answer += ascii_to_lower(chars[i]); | |
continue; | |
} | |
// autogenerated part start | |
{ | |
if (chars_left_including_current >= 4) { | |
unsigned int next_n_bytes = ((unsigned int)(unsigned char)chars[i] << 24) | ((unsigned int)(unsigned char)chars[i + 1] << 16) | ((unsigned int)(unsigned char)chars[i + 2] << 8) | (unsigned int)(unsigned char)chars[i + 3]; | |
switch (next_n_bytes) { | |
case 0xc386cd81: /* Ǽ */ answer += "ae"; i += 3; continue; | |
case 0xc3a6cd81: /* Ǽ */ answer += "ae"; i += 3; continue; | |
case 0xc386cc81: /* Ǽ */ answer += "ae"; i += 3; continue; | |
case 0xc3a6cc81: /* Ǽ */ answer += "ae"; i += 3; continue; | |
case 0xc398cd81: /* Ǿ */ answer += "\xc3\xb8\xcc\x81"; i += 3; continue; | |
case 0xc3b8cd81: /* Ǿ */ answer += "\xc3\xb8\xcc\x81"; i += 3; continue; | |
case 0xc398cc81: /* Ǿ */ answer += "\xc3\xb8\xcc\x81"; i += 3; continue; | |
case 0xd095cc88: /* Ё */ answer += "\xd0\xb5\xcc\x88"; i += 3; continue; | |
case 0xd098cc86: /* Й */ answer += "\xd0\xb8\xcc\x86"; i += 3; continue; | |
case 0xd086cc88: /* Ї */ answer += "\xd1\x96\xcc\x88"; i += 3; continue; | |
default: break; | |
} | |
} | |
if (chars_left_including_current >= 3) { | |
unsigned int next_n_bytes = ((unsigned int)(unsigned char)chars[i] << 16) | ((unsigned int)(unsigned char)chars[i + 1] << 8) | (unsigned int)(unsigned char)chars[i + 2]; | |
switch (next_n_bytes) { | |
case 0x41cd80: /* À */ answer += "a\xcc\x80"; i += 2; continue; | |
case 0x61cd80: /* À */ answer += "a\xcc\x80"; i += 2; continue; | |
case 0x41cd81: /* Á */ answer += "a\xcc\x81"; i += 2; continue; | |
case 0x61cd81: /* Á */ answer += "a\xcc\x81"; i += 2; continue; | |
case 0xe284ab: /* Å */ answer += "a\xcc\x8a"; i += 2; continue; | |
case 0x43cd81: /* Ć */ answer += "c\xcc\x81"; i += 2; continue; | |
case 0x63cd81: /* Ć */ answer += "c\xcc\x81"; i += 2; continue; | |
case 0x45cd80: /* È */ answer += "e\xcc\x80"; i += 2; continue; | |
case 0x65cd80: /* È */ answer += "e\xcc\x80"; i += 2; continue; | |
case 0x45cd81: /* É */ answer += "e\xcc\x81"; i += 2; continue; | |
case 0x65cd81: /* É */ answer += "e\xcc\x81"; i += 2; continue; | |
case 0x49cd80: /* Ì */ answer += "i\xcc\x80"; i += 2; continue; | |
case 0x69cd80: /* Ì */ answer += "i\xcc\x80"; i += 2; continue; | |
case 0x49cd81: /* Í */ answer += "i\xcc\x81"; i += 2; continue; | |
case 0x69cd81: /* Í */ answer += "i\xcc\x81"; i += 2; continue; | |
case 0x4ecd81: /* Ń */ answer += "n\xcc\x81"; i += 2; continue; | |
case 0x6ecd81: /* Ń */ answer += "n\xcc\x81"; i += 2; continue; | |
case 0x4fcd80: /* Ò */ answer += "o\xcc\x80"; i += 2; continue; | |
case 0x6fcd80: /* Ò */ answer += "o\xcc\x80"; i += 2; continue; | |
case 0x4fcd81: /* Ó */ answer += "o\xcc\x81"; i += 2; continue; | |
case 0x6fcd81: /* Ó */ answer += "o\xcc\x81"; i += 2; continue; | |
case 0xe1ba9e: /* ẞ */ answer += "ss"; i += 2; continue; | |
case 0x53cd81: /* Ś */ answer += "s\xcc\x81"; i += 2; continue; | |
case 0x73cd81: /* Ś */ answer += "s\xcc\x81"; i += 2; continue; | |
case 0x55cd80: /* Ù */ answer += "u\xcc\x80"; i += 2; continue; | |
case 0x75cd80: /* Ù */ answer += "u\xcc\x80"; i += 2; continue; | |
case 0x55cd81: /* Ú */ answer += "u\xcc\x81"; i += 2; continue; | |
case 0x75cd81: /* Ú */ answer += "u\xcc\x81"; i += 2; continue; | |
case 0x59cd81: /* Ý */ answer += "y\xcc\x81"; i += 2; continue; | |
case 0x79cd81: /* Ý */ answer += "y\xcc\x81"; i += 2; continue; | |
case 0x5acd81: /* Ź */ answer += "z\xcc\x81"; i += 2; continue; | |
case 0x7acd81: /* Ź */ answer += "z\xcc\x81"; i += 2; continue; | |
default: break; | |
} | |
} | |
if (chars_left_including_current >= 2) { | |
unsigned short next_n_bytes = ((unsigned short)(unsigned char)chars[i] << 8) | (unsigned short)(unsigned char)chars[i + 1]; | |
switch (next_n_bytes) { | |
case 0xc386: /* Æ */ answer += "ae"; i += 1; continue; | |
case 0xc3a6: /* Æ */ answer += "ae"; i += 1; continue; | |
case 0xc7bc: /* Ǽ */ answer += "ae"; i += 1; continue; | |
case 0xc7bd: /* Ǽ */ answer += "ae"; i += 1; continue; | |
case 0xc380: /* À */ answer += "a\xcc\x80"; i += 1; continue; | |
case 0xc3a0: /* À */ answer += "a\xcc\x80"; i += 1; continue; | |
case 0xc381: /* Á */ answer += "a\xcc\x81"; i += 1; continue; | |
case 0xc3a1: /* Á */ answer += "a\xcc\x81"; i += 1; continue; | |
case 0xc382: /* Â */ answer += "a\xcc\x82"; i += 1; continue; | |
case 0xc3a2: /* Â */ answer += "a\xcc\x82"; i += 1; continue; | |
case 0xc383: /* Ã */ answer += "a\xcc\x83"; i += 1; continue; | |
case 0xc3a3: /* Ã */ answer += "a\xcc\x83"; i += 1; continue; | |
case 0xc482: /* Ă */ answer += "a\xcc\x86"; i += 1; continue; | |
case 0xc483: /* Ă */ answer += "a\xcc\x86"; i += 1; continue; | |
case 0xc384: /* Ä */ answer += "a\xcc\x88"; i += 1; continue; | |
case 0xc3a4: /* Ä */ answer += "a\xcc\x88"; i += 1; continue; | |
case 0xc3a5: /* Å */ answer += "a\xcc\x8a"; i += 1; continue; | |
case 0xc385: /* Å */ answer += "a\xcc\x8a"; i += 1; continue; | |
case 0xc484: /* Ą */ answer += "a\xcc\xa8"; i += 1; continue; | |
case 0xc485: /* Ą */ answer += "a\xcc\xa8"; i += 1; continue; | |
case 0xc486: /* Ć */ answer += "c\xcc\x81"; i += 1; continue; | |
case 0xc487: /* Ć */ answer += "c\xcc\x81"; i += 1; continue; | |
case 0xc48c: /* Č */ answer += "c\xcc\x8c"; i += 1; continue; | |
case 0xc48d: /* Č */ answer += "c\xcc\x8c"; i += 1; continue; | |
case 0xc387: /* Ç */ answer += "c\xcc\xa7"; i += 1; continue; | |
case 0xc3a7: /* Ç */ answer += "c\xcc\xa7"; i += 1; continue; | |
case 0xc48e: /* Ď */ answer += "d\xcc\x8c"; i += 1; continue; | |
case 0xc48f: /* Ď */ answer += "d\xcc\x8c"; i += 1; continue; | |
case 0xc388: /* È */ answer += "e\xcc\x80"; i += 1; continue; | |
case 0xc3a8: /* È */ answer += "e\xcc\x80"; i += 1; continue; | |
case 0xc389: /* É */ answer += "e\xcc\x81"; i += 1; continue; | |
case 0xc3a9: /* É */ answer += "e\xcc\x81"; i += 1; continue; | |
case 0xc38a: /* Ê */ answer += "e\xcc\x82"; i += 1; continue; | |
case 0xc3aa: /* Ê */ answer += "e\xcc\x82"; i += 1; continue; | |
case 0xc38b: /* Ë */ answer += "e\xcc\x88"; i += 1; continue; | |
case 0xc3ab: /* Ë */ answer += "e\xcc\x88"; i += 1; continue; | |
case 0xc49a: /* Ě */ answer += "e\xcc\x8c"; i += 1; continue; | |
case 0xc49b: /* Ě */ answer += "e\xcc\x8c"; i += 1; continue; | |
case 0xc498: /* Ę */ answer += "e\xcc\xa8"; i += 1; continue; | |
case 0xc499: /* Ę */ answer += "e\xcc\xa8"; i += 1; continue; | |
case 0xc4b2: /* IJ */ answer += "ij"; i += 1; continue; | |
case 0xc4b3: /* IJ */ answer += "ij"; i += 1; continue; | |
case 0xc38c: /* Ì */ answer += "i\xcc\x80"; i += 1; continue; | |
case 0xc3ac: /* Ì */ answer += "i\xcc\x80"; i += 1; continue; | |
case 0xc38d: /* Í */ answer += "i\xcc\x81"; i += 1; continue; | |
case 0xc3ad: /* Í */ answer += "i\xcc\x81"; i += 1; continue; | |
case 0xc38e: /* Î */ answer += "i\xcc\x82"; i += 1; continue; | |
case 0xc3ae: /* Î */ answer += "i\xcc\x82"; i += 1; continue; | |
case 0xc38f: /* Ï */ answer += "i\xcc\x88"; i += 1; continue; | |
case 0xc3af: /* Ï */ answer += "i\xcc\x88"; i += 1; continue; | |
case 0xc583: /* Ń */ answer += "n\xcc\x81"; i += 1; continue; | |
case 0xc584: /* Ń */ answer += "n\xcc\x81"; i += 1; continue; | |
case 0xc391: /* Ñ */ answer += "n\xcc\x83"; i += 1; continue; | |
case 0xc3b1: /* Ñ */ answer += "n\xcc\x83"; i += 1; continue; | |
case 0xc587: /* Ň */ answer += "n\xcc\x8c"; i += 1; continue; | |
case 0xc588: /* Ň */ answer += "n\xcc\x8c"; i += 1; continue; | |
case 0xc592: /* Œ */ answer += "oe"; i += 1; continue; | |
case 0xc593: /* Œ */ answer += "oe"; i += 1; continue; | |
case 0xc392: /* Ò */ answer += "o\xcc\x80"; i += 1; continue; | |
case 0xc3b2: /* Ò */ answer += "o\xcc\x80"; i += 1; continue; | |
case 0xc393: /* Ó */ answer += "o\xcc\x81"; i += 1; continue; | |
case 0xc3b3: /* Ó */ answer += "o\xcc\x81"; i += 1; continue; | |
case 0xc394: /* Ô */ answer += "o\xcc\x82"; i += 1; continue; | |
case 0xc3b4: /* Ô */ answer += "o\xcc\x82"; i += 1; continue; | |
case 0xc395: /* Õ */ answer += "o\xcc\x83"; i += 1; continue; | |
case 0xc3b5: /* Õ */ answer += "o\xcc\x83"; i += 1; continue; | |
case 0xc396: /* Ö */ answer += "o\xcc\x88"; i += 1; continue; | |
case 0xc3b6: /* Ö */ answer += "o\xcc\x88"; i += 1; continue; | |
case 0xc590: /* Ő */ answer += "o\xcc\x8b"; i += 1; continue; | |
case 0xc591: /* Ő */ answer += "o\xcc\x8b"; i += 1; continue; | |
case 0xc598: /* Ř */ answer += "r\xcc\x8c"; i += 1; continue; | |
case 0xc599: /* Ř */ answer += "r\xcc\x8c"; i += 1; continue; | |
case 0xc39f: /* ẞ */ answer += "ss"; i += 1; continue; | |
case 0xc59a: /* Ś */ answer += "s\xcc\x81"; i += 1; continue; | |
case 0xc59b: /* Ś */ answer += "s\xcc\x81"; i += 1; continue; | |
case 0xc5a0: /* Š */ answer += "s\xcc\x8c"; i += 1; continue; | |
case 0xc5a1: /* Š */ answer += "s\xcc\x8c"; i += 1; continue; | |
case 0xc898: /* Ș */ answer += "s\xcc\xa6"; i += 1; continue; | |
case 0xc899: /* Ș */ answer += "s\xcc\xa6"; i += 1; continue; | |
case 0xc5a4: /* Ť */ answer += "t\xcc\x8c"; i += 1; continue; | |
case 0xc5a5: /* Ť */ answer += "t\xcc\x8c"; i += 1; continue; | |
case 0xc89a: /* Ț */ answer += "t\xcc\xa6"; i += 1; continue; | |
case 0xc89b: /* Ț */ answer += "t\xcc\xa6"; i += 1; continue; | |
case 0xc399: /* Ù */ answer += "u\xcc\x80"; i += 1; continue; | |
case 0xc3b9: /* Ù */ answer += "u\xcc\x80"; i += 1; continue; | |
case 0xc39a: /* Ú */ answer += "u\xcc\x81"; i += 1; continue; | |
case 0xc3ba: /* Ú */ answer += "u\xcc\x81"; i += 1; continue; | |
case 0xc39b: /* Û */ answer += "u\xcc\x82"; i += 1; continue; | |
case 0xc3bb: /* Û */ answer += "u\xcc\x82"; i += 1; continue; | |
case 0xc39c: /* Ü */ answer += "u\xcc\x88"; i += 1; continue; | |
case 0xc3bc: /* Ü */ answer += "u\xcc\x88"; i += 1; continue; | |
case 0xc5ae: /* Ů */ answer += "u\xcc\x8a"; i += 1; continue; | |
case 0xc5af: /* Ů */ answer += "u\xcc\x8a"; i += 1; continue; | |
case 0xc5b0: /* Ű */ answer += "u\xcc\x8b"; i += 1; continue; | |
case 0xc5b1: /* Ű */ answer += "u\xcc\x8b"; i += 1; continue; | |
case 0xc39d: /* Ý */ answer += "y\xcc\x81"; i += 1; continue; | |
case 0xc3bd: /* Ý */ answer += "y\xcc\x81"; i += 1; continue; | |
case 0xc5b8: /* Ÿ */ answer += "y\xcc\x88"; i += 1; continue; | |
case 0xc3bf: /* Ÿ */ answer += "y\xcc\x88"; i += 1; continue; | |
case 0xc5b9: /* Ź */ answer += "z\xcc\x81"; i += 1; continue; | |
case 0xc5ba: /* Ź */ answer += "z\xcc\x81"; i += 1; continue; | |
case 0xc5bb: /* Ż */ answer += "z\xcc\x87"; i += 1; continue; | |
case 0xc5bc: /* Ż */ answer += "z\xcc\x87"; i += 1; continue; | |
case 0xc5bd: /* Ž */ answer += "z\xcc\x8c"; i += 1; continue; | |
case 0xc5be: /* Ž */ answer += "z\xcc\x8c"; i += 1; continue; | |
case 0xc398: /* Ø */ answer += "\xc3\xb8"; i += 1; continue; | |
case 0xc7be: /* Ǿ */ answer += "\xc3\xb8\xcc\x81"; i += 1; continue; | |
case 0xc7bf: /* Ǿ */ answer += "\xc3\xb8\xcc\x81"; i += 1; continue; | |
case 0xc581: /* Ł */ answer += "\xc5\x82"; i += 1; continue; | |
case 0xd090: /* А */ answer += "\xd0\xb0"; i += 1; continue; | |
case 0xd091: /* Б */ answer += "\xd0\xb1"; i += 1; continue; | |
case 0xd092: /* В */ answer += "\xd0\xb2"; i += 1; continue; | |
case 0xd093: /* Г */ answer += "\xd0\xb3"; i += 1; continue; | |
case 0xd094: /* Д */ answer += "\xd0\xb4"; i += 1; continue; | |
case 0xd095: /* Е */ answer += "\xd0\xb5"; i += 1; continue; | |
case 0xd081: /* Ё */ answer += "\xd0\xb5\xcc\x88"; i += 1; continue; | |
case 0xd191: /* Ё */ answer += "\xd0\xb5\xcc\x88"; i += 1; continue; | |
case 0xd096: /* Ж */ answer += "\xd0\xb6"; i += 1; continue; | |
case 0xd097: /* З */ answer += "\xd0\xb7"; i += 1; continue; | |
case 0xd098: /* И */ answer += "\xd0\xb8"; i += 1; continue; | |
case 0xd099: /* Й */ answer += "\xd0\xb8\xcc\x86"; i += 1; continue; | |
case 0xd0b9: /* Й */ answer += "\xd0\xb8\xcc\x86"; i += 1; continue; | |
case 0xd09a: /* К */ answer += "\xd0\xba"; i += 1; continue; | |
case 0xd09b: /* Л */ answer += "\xd0\xbb"; i += 1; continue; | |
case 0xd09c: /* М */ answer += "\xd0\xbc"; i += 1; continue; | |
case 0xd09d: /* Н */ answer += "\xd0\xbd"; i += 1; continue; | |
case 0xd09e: /* О */ answer += "\xd0\xbe"; i += 1; continue; | |
case 0xd09f: /* П */ answer += "\xd0\xbf"; i += 1; continue; | |
case 0xd0a0: /* Р */ answer += "\xd1\x80"; i += 1; continue; | |
case 0xd0a1: /* С */ answer += "\xd1\x81"; i += 1; continue; | |
case 0xd0a2: /* Т */ answer += "\xd1\x82"; i += 1; continue; | |
case 0xd0a3: /* У */ answer += "\xd1\x83"; i += 1; continue; | |
case 0xd0a4: /* Ф */ answer += "\xd1\x84"; i += 1; continue; | |
case 0xd0a5: /* Х */ answer += "\xd1\x85"; i += 1; continue; | |
case 0xd0a6: /* Ц */ answer += "\xd1\x86"; i += 1; continue; | |
case 0xd0a7: /* Ч */ answer += "\xd1\x87"; i += 1; continue; | |
case 0xd0a8: /* Ш */ answer += "\xd1\x88"; i += 1; continue; | |
case 0xd0a9: /* Щ */ answer += "\xd1\x89"; i += 1; continue; | |
case 0xd0aa: /* Ъ */ answer += "\xd1\x8a"; i += 1; continue; | |
case 0xd0ab: /* Ы */ answer += "\xd1\x8b"; i += 1; continue; | |
case 0xd0ac: /* Ь */ answer += "\xd1\x8c"; i += 1; continue; | |
case 0xd0ad: /* Э */ answer += "\xd1\x8d"; i += 1; continue; | |
case 0xd0ae: /* Ю */ answer += "\xd1\x8e"; i += 1; continue; | |
case 0xd0af: /* Я */ answer += "\xd1\x8f"; i += 1; continue; | |
case 0xd084: /* Є */ answer += "\xd1\x94"; i += 1; continue; | |
case 0xd086: /* І */ answer += "\xd1\x96"; i += 1; continue; | |
case 0xd087: /* Ї */ answer += "\xd1\x96\xcc\x88"; i += 1; continue; | |
case 0xd197: /* Ї */ answer += "\xd1\x96\xcc\x88"; i += 1; continue; | |
case 0xd290: /* Ґ */ answer += "\xd2\x91"; i += 1; continue; | |
default: break; | |
} | |
} | |
} | |
// autogenerated part end | |
// fallthrough | |
for (int start_i = i; i < start_i + supposed_char_length; ++i) | |
answer += ascii_to_lower(chars[i]); | |
// it will get autoincremented on next cycle, reduce it by one till then | |
--i; | |
} | |
return answer; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Copyright: pavlukivan 2022 | |
// License: 0BSD (Meaning this notice can be removed, use this as you like) | |
// The guard name deliberately has no leading or double underscore: such
// identifiers are reserved for the implementation.
#ifndef LIGHTWEIGHT_UNICODE_H
#define LIGHTWEIGHT_UNICODE_H
#include <string>

// Returns a lowercased copy of the UTF-8 string `source`. ASCII letters and
// the Latin-with-diacritics and Cyrillic letters known to the implementation
// are lowercased; everything else is copied unchanged.
std::string to_lower_utf8(const std::string &source);

#endif  // LIGHTWEIGHT_UNICODE_H
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment