Skip to content

Instantly share code, notes, and snippets.

@chayleaf
Last active January 28, 2022 04:55
Show Gist options
  • Save chayleaf/e0334b1d87616c8d362c4e064a565570 to your computer and use it in GitHub Desktop.
Save chayleaf/e0334b1d87616c8d362c4e064a565570 to your computer and use it in GitHub Desktop.
lightweight unicode collation+lowercasing (only cyrillics+some diacritics)
#!/usr/bin/env python3
# Copyright: pavlukivan 2022
# License: 0BSD (Meaning this notice can be removed, use this as you like)
# Tested on Py3.10, should work even on Py3.6 though
import functools
import itertools
import unicodedata
# Number of spaces per indentation level in the generated C++ code
IDENT = 2
# Single characters that are folded into a plain ASCII sequence, grouped by
# language: {language: {character: ASCII replacement}}.
#
# Every key must be exactly ONE code point (all_forms() asserts this), so the
# keys are spelled as escapes: some of them (the IJ ligature in particular)
# are visually indistinguishable from their ASCII look-alikes.
#
# Use the uppercase character as the key. German Eszett is the reason:
# uppercasing the lowercase 'ß' gives the two letters 'SS', whereas the
# uppercase 'ẞ' (added to the alphabet in 2017) lowercases to the single
# character 'ß', so starting from the uppercase form reaches both cases.
combined_chars = {
    'german': {
        '\u1e9e': 'SS',  # ẞ LATIN CAPITAL LETTER SHARP S
    },
    'french': {
        '\u00c6': 'AE',  # Æ LATIN CAPITAL LETTER AE
        '\u0152': 'OE',  # Œ LATIN CAPITAL LIGATURE OE
    },
    'danish': {
        '\u00c6': 'AE',  # Æ LATIN CAPITAL LETTER AE
        '\u01fc': 'AE',  # Ǽ LATIN CAPITAL LETTER AE WITH ACUTE
    },
    'dutch': {
        '\u0132': 'IJ',  # Ĳ LATIN CAPITAL LIGATURE IJ (one code point)
    },
    'norwegian': {
        '\u00c6': 'AE',  # Æ LATIN CAPITAL LETTER AE
    },
}
# Characters which can't be decomposed into an ascii-only sequence, as one
# string of uppercase letters per language.
# The language names are labels only: set_of_chars() iterates the values and
# ignores the keys, so a letter listed under several languages is processed
# once.
other_chars = {
# NOTE(review): the trailing 'ĂÂÎȘȚ' look like Romanian letters rather than
# Czech ones - presumably a missing 'romanian' entry; harmless since keys are
# not used.
'czech': 'ÁČĎÉĚÍŇÓŘŠŤÚŮÝŽĂÂÎȘȚ',
'danish': 'ÅÁÉÍÓÚÝØǾ',
'dutch': 'ÁÉÍÓÚÝ',
'finnish': 'ÅÄÖŠŽ',
'french': 'ÀÂÇÉÈÊËÎÏÔÙÛÜŸ',
'german': 'ÄÖÜ',
'hungarian': 'ÁÉÓŐÖÚŰÜ',
'italian': 'ÀÈÉÌÍÎÒÓÙÚ',
'norwegian': 'ÅÀÉÈÊÓÒÔÙÜØ',
'polish': 'ĄĆĘŁŃÓŚŹŻ',
'portuguese': 'ÁÂÃÀÇÉÊÈÍÌÓÔÕÙÚ',
'russian': 'АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ',
'spanish': 'Ñ',
'swedish': 'ÅÄÖ',
'ukrainian': 'АБВГҐДЕЄЖЗИІЇЙКЛМНОПРСТУФХЦЧШЩЬЮЯ',
}
def set_of_chars(d):
    '''
    Flatten a {language: characters} dict into a single set.

    Every element of every value is NFC-normalized on its own and added to
    the result; the language keys are ignored.
    '''
    return {
        unicodedata.normalize('NFC', symbol)
        for symbols in d.values()
        for symbol in symbols
    }
@functools.lru_cache(maxsize=None)
def _canonical_decomposition_index():
    '''
    Build, once, a dict mapping each NFD string to the set of single code
    points whose NFD form is exactly that string.
    '''
    index = {}
    # NOTE(review): the original comment said "for all unicode chars", but
    # the scanned range stops at 0x11000 (the full range would be 0x110000).
    # Kept as is so the generated table does not change - confirm intent.
    for codepoint in range(0x11000):
        uchar = chr(codepoint)
        decomp = unicodedata.normalize('NFD', uchar)
        index.setdefault(decomp, set()).add(uchar)
    return index


# lru_cache(maxsize=None) is what functools.cache is an alias for, but it
# also exists on the older Python versions this script claims to support.
@functools.lru_cache(maxsize=None)
def all_forms(ch):
    '''
    Get all (or most) character sequences that turn into ch when normalized.

    ch must be a single NFC-normalized character. Returns a set of strings;
    the set is cached, so callers must not modify it.
    '''
    assert (len(ch) == 1)
    decomposed = unicodedata.normalize('NFD', ch)
    n = len(decomposed)
    index = _canonical_decomposition_index()
    # chars[i]: all single characters whose NFD form is exactly decomposed[i]
    chars = [set(index.get(c, ())) for c in decomposed]
    # chars_up_till[i]: all single characters whose NFD form is exactly the
    # first i + 1 code points of decomposed
    chars_up_till = [set(index.get(decomposed[:i + 1], ())) for i in range(n)]
    # chars which are exactly the same when decomposed
    ret = chars_up_till[n - 1]
    # just in case, add the original char as a representation
    ret.add(ch)
    if n == 1:
        # nothing to combine: only the single-character equivalents
        return chars[0]
    # process diacritics
    # this won't work that well with more than 2 diacritics, but it
    # probably doesn't matter in this case
    for i in range(n - 1):
        # every ordering of every choice of the remaining marks (diacritics
        # can be applied in any order); independent of the start, so it is
        # computed once per split point
        endings = {
            ''.join(ordering)
            for combo in itertools.product(*chars[i + 1:])
            for ordering in itertools.permutations(combo)
        }
        # for each possible start
        for start in chars_up_till[i]:
            for end in endings:
                cur_ch = start + end
                # keep only the sequences that really compose back into ch
                if unicodedata.normalize('NFC', cur_ch) == ch:
                    ret.add(cur_ch)
    return ret
# UTF-8 bytes of an input form -> UTF-8 bytes of its lowercase replacement
mapping = {}
# UTF-8 bytes of an input form -> the character that form was derived from
originals = {}


def add_raw(k, v, original):
    '''
    Register v as the lowercase replacement for the form k.

    Nothing is stored when k and v have the same non-ASCII characters in
    the same order, i.e. when they differ at most in their ASCII part.
    Otherwise both tables are keyed by the UTF-8 encoding of k.
    '''
    key_non_ascii = [sym for sym in k if ord(sym) > 127]
    value_non_ascii = [sym for sym in v if ord(sym) > 127]
    if key_non_ascii != value_non_ascii:
        encoded_key = k.encode('utf-8')
        mapping[encoded_key] = v.encode('utf-8')
        originals[encoded_key] = original
def add_char(ch, turn_into=None):
    '''
    Register every known spelling of ch in the lowercase table.

    ch is NFC-normalized first. turn_into optionally names what ch should
    be mapped into; when omitted, ch maps to itself. The stored target is
    always the decomposed (NFD), lowercased form of turn_into.
    '''
    ch = unicodedata.normalize('NFC', ch)
    if turn_into is None:
        # if not specified, turn into the same character
        turn_into = ch
    # decompose, so that there's more ascii in the results, since ascii
    # can be handled way easier
    norm_lower = unicodedata.normalize('NFD', turn_into).upper().lower()
    for form in all_forms(ch):
        lowered = form.lower()
        # map the form itself, its lowercase and the re-uppercased lowercase
        # to norm_lower (add_raw drops the entries that need no table row)
        add_raw(form, norm_lower, ch)
        add_raw(lowered, norm_lower, ch)
        add_raw(lowered.upper(), norm_lower, ch)
# fill the tables: first the characters with an explicit ASCII replacement,
# then the ones that simply map to their own lowercase
for replacements in combined_chars.values():
    for k, v in replacements.items():
        add_char(k, v)
for ch in set_of_chars(other_chars):
    add_char(ch)
# unique key lengths in bytes, longest first; a list rather than a one-shot
# iterator so it can be iterated more than once
all_lengths = sorted({len(k) for k in mapping}, reverse=True)
def quote_for_c(b):
    '''
    Quote a bytestring as a C string literal.

    b may be bytes or a str (encoded as UTF-8 first). Double quotes,
    backslashes, tab, newline and carriage return get their short escapes,
    other printable ASCII is copied, and everything else becomes a
    two-digit \\x escape. Returns the literal including its quotes.
    '''
    if isinstance(b, str):
        b = b.encode('utf-8')
    # iterating over bytes yields ints, so the table is keyed by byte value
    simple_escapes = {
        ord('"'): '\\"',
        ord('\\'): '\\\\',
        ord('\t'): '\\t',
        ord('\n'): '\\n',
        ord('\r'): '\\r',
    }
    hex_digits = b'0123456789abcdefABCDEF'
    ret = ['"']
    after_hex_escape = False
    for x in b:
        if x in simple_escapes:
            ret.append(simple_escapes[x])
            after_hex_escape = False
        # printable ascii
        elif 32 <= x < 127:
            if after_hex_escape and x in hex_digits:
                # a C hex escape consumes every following hex digit, so
                # close the literal and reopen it before such a character
                ret.append('""')
            ret.append(chr(x))
            after_hex_escape = False
        else:
            ret.append('\\x%02x' % x)
            after_hex_escape = True
    ret.append('"')
    return ''.join(ret)
def c_byteshift(x, n, used_type):
    '''
    Build a C expression that reads the char expression x as an unsigned
    byte, widens it to used_type and shifts it left by n bytes.

    With n == 0 no shift (and no outer parentheses) is emitted.
    '''
    widened = '(%s)(unsigned char)%s' % (used_type, x)
    if n == 0:
        return widened
    bits = n * 8
    return '(%s << %s)' % (widened, bits)
def to_c_number(b):
    '''
    Render a bytestring as a big-endian C hexadecimal integer literal.

    b may be bytes or a str (encoded as UTF-8 first). At most 8 bytes are
    accepted, matching the widest type (unsigned long long) the generator
    loop is prepared to use.
    '''
    if isinstance(b, str):
        b = b.encode('utf-8')
    assert (len(b) <= 8)
    # two digits per byte: without the padding a byte below 0x10 would
    # shift every digit after it and produce a different number
    return '0x' + ''.join('%02x' % n for n in b)
# Emit one C++ `if (...) { switch (...) { ... } }` block per key length,
# longest keys first. The emitted code refers to the variables `chars`, `i`,
# `chars_left_including_current` and `answer` of the surrounding C++ loop.
for length in all_lengths:
    # find the smallest type that can fit in the largest amount of bytes we'll use
    assert (length <= 8)
    if length > 4:
        used_type = 'unsigned long long'
    elif length > 2:
        used_type = 'unsigned int'
    else:
        used_type = 'unsigned short'
    indent = ' ' * IDENT
    print('if (chars_left_including_current >= ', length, ') {', sep='')
    # pack the next `length` bytes into one integer, first byte highest
    packed = ' | '.join(
        c_byteshift(
            'chars[i]' if offset == 0 else 'chars[i + ' + str(offset) + ']',
            length - offset - 1,
            used_type,
        )
        for offset in range(length)
    )
    print(indent, used_type, ' next_n_bytes = ', packed, ';', sep='')
    print(indent, 'switch (next_n_bytes) {', sep='')
    # all k:v pairs of given length
    all_kvs = [(k, v) for k, v in mapping.items() if len(k) == length]
    # sort by value, then by key so the output is the same on every run
    all_kvs.sort(key=lambda kv: (kv[1], kv[0]))
    for k, v in all_kvs:
        print(
            indent * 2,
            'case ', to_c_number(k), ': ',
            '/* ', originals[k], ' */ ',
            'answer += ',
            quote_for_c(v),
            # the C++ loop's own ++i accounts for the remaining byte
            '; i += ', length - 1, '; continue;',
            sep=''
        )
    # prevent non-exhaustiveness warnings
    print(indent * 2, 'default: break;', sep='')
    # close switch
    print(indent, '}', sep='')
    # close if
    print('}')
// Copyright: pavlukivan 2022
// License: 0BSD (Meaning this notice can be removed, use this as you like)
#include "lightweight_unicode.h"
// See https://en.wikipedia.org/wiki/UTF-8#Encoding
// Length in bytes of the UTF-8 sequence announced by the lead byte `start`,
// or 0 when `start` cannot begin a sequence (continuation byte 10xxxxxx or
// a byte of the form 11111xxx).
int utf8_char_length(unsigned char start) {
    if ((start & 0x80) == 0x00) return 1;  // 0xxxxxxx
    if ((start & 0xE0) == 0xC0) return 2;  // 110xxxxx
    if ((start & 0xF0) == 0xE0) return 3;  // 1110xxxx
    if ((start & 0xF8) == 0xF0) return 4;  // 11110xxx
    // invalid character!
    return 0;
}
// Lowercase a single ASCII letter; every other byte is returned unchanged.
char ascii_to_lower(char in) {
    const bool is_upper = !(in < 'A' || in > 'Z');
    // 'a' - 'A' is the same offset as 'z' - 'Z'
    return is_upper ? static_cast<char>(in + ('a' - 'A')) : in;
}
// Return a lowercased copy of the UTF-8 string `source`.
//
// The input is scanned left to right. At each position the next 4, 3 and
// then 2 bytes are compared against the generated table below; on a match
// the table's replacement bytes are appended and the matched bytes skipped.
// Accented Latin letters are emitted in decomposed form (ASCII base letter
// followed by the combining mark), ligatures as plain ASCII ("ae", "oe",
// "ij", "ss"), Cyrillic capitals as their lowercase letters.
// Anything without a table entry is copied byte by byte through
// ascii_to_lower(), so only ASCII letters change. A byte that is not a
// valid lead byte, or a sequence cut off by the end of the string, is
// handled one byte at a time in the same way.
// NOTE(review): the length is stored in an int, so this presumably assumes
// strings shorter than INT_MAX bytes - confirm.
std::string to_lower_utf8(const std::string &source) {
int length = source.length();
const char *chars = source.data();
std::string answer;
for (int i = 0; i < length; ++i) {
int chars_left_including_current = length - i;
int supposed_char_length = utf8_char_length(chars[i]);
if (
// invalid character start
supposed_char_length <= 0
// character is longer than there are bytes left
|| supposed_char_length > chars_left_including_current)
{
// copy this single byte (lowercased if it is an ASCII letter) and move on
answer += ascii_to_lower(chars[i]);
continue;
}
// autogenerated part start
// Each case value is the big-endian packing of the matched bytes; the
// comment names the character the entry was generated for. `i` is advanced
// by (match length - 1) because the for loop's ++i supplies the last step.
{
if (chars_left_including_current >= 4) {
unsigned int next_n_bytes = ((unsigned int)(unsigned char)chars[i] << 24) | ((unsigned int)(unsigned char)chars[i + 1] << 16) | ((unsigned int)(unsigned char)chars[i + 2] << 8) | (unsigned int)(unsigned char)chars[i + 3];
switch (next_n_bytes) {
case 0xc386cd81: /* Ǽ */ answer += "ae"; i += 3; continue;
case 0xc3a6cd81: /* Ǽ */ answer += "ae"; i += 3; continue;
case 0xc386cc81: /* Ǽ */ answer += "ae"; i += 3; continue;
case 0xc3a6cc81: /* Ǽ */ answer += "ae"; i += 3; continue;
case 0xc398cd81: /* Ǿ */ answer += "\xc3\xb8\xcc\x81"; i += 3; continue;
case 0xc3b8cd81: /* Ǿ */ answer += "\xc3\xb8\xcc\x81"; i += 3; continue;
case 0xc398cc81: /* Ǿ */ answer += "\xc3\xb8\xcc\x81"; i += 3; continue;
case 0xd095cc88: /* Ё */ answer += "\xd0\xb5\xcc\x88"; i += 3; continue;
case 0xd098cc86: /* Й */ answer += "\xd0\xb8\xcc\x86"; i += 3; continue;
case 0xd086cc88: /* Ї */ answer += "\xd1\x96\xcc\x88"; i += 3; continue;
default: break;
}
}
if (chars_left_including_current >= 3) {
unsigned int next_n_bytes = ((unsigned int)(unsigned char)chars[i] << 16) | ((unsigned int)(unsigned char)chars[i + 1] << 8) | (unsigned int)(unsigned char)chars[i + 2];
switch (next_n_bytes) {
case 0x41cd80: /* À */ answer += "a\xcc\x80"; i += 2; continue;
case 0x61cd80: /* À */ answer += "a\xcc\x80"; i += 2; continue;
case 0x41cd81: /* Á */ answer += "a\xcc\x81"; i += 2; continue;
case 0x61cd81: /* Á */ answer += "a\xcc\x81"; i += 2; continue;
case 0xe284ab: /* Å */ answer += "a\xcc\x8a"; i += 2; continue;
case 0x43cd81: /* Ć */ answer += "c\xcc\x81"; i += 2; continue;
case 0x63cd81: /* Ć */ answer += "c\xcc\x81"; i += 2; continue;
case 0x45cd80: /* È */ answer += "e\xcc\x80"; i += 2; continue;
case 0x65cd80: /* È */ answer += "e\xcc\x80"; i += 2; continue;
case 0x45cd81: /* É */ answer += "e\xcc\x81"; i += 2; continue;
case 0x65cd81: /* É */ answer += "e\xcc\x81"; i += 2; continue;
case 0x49cd80: /* Ì */ answer += "i\xcc\x80"; i += 2; continue;
case 0x69cd80: /* Ì */ answer += "i\xcc\x80"; i += 2; continue;
case 0x49cd81: /* Í */ answer += "i\xcc\x81"; i += 2; continue;
case 0x69cd81: /* Í */ answer += "i\xcc\x81"; i += 2; continue;
case 0x4ecd81: /* Ń */ answer += "n\xcc\x81"; i += 2; continue;
case 0x6ecd81: /* Ń */ answer += "n\xcc\x81"; i += 2; continue;
case 0x4fcd80: /* Ò */ answer += "o\xcc\x80"; i += 2; continue;
case 0x6fcd80: /* Ò */ answer += "o\xcc\x80"; i += 2; continue;
case 0x4fcd81: /* Ó */ answer += "o\xcc\x81"; i += 2; continue;
case 0x6fcd81: /* Ó */ answer += "o\xcc\x81"; i += 2; continue;
case 0xe1ba9e: /* ẞ */ answer += "ss"; i += 2; continue;
case 0x53cd81: /* Ś */ answer += "s\xcc\x81"; i += 2; continue;
case 0x73cd81: /* Ś */ answer += "s\xcc\x81"; i += 2; continue;
case 0x55cd80: /* Ù */ answer += "u\xcc\x80"; i += 2; continue;
case 0x75cd80: /* Ù */ answer += "u\xcc\x80"; i += 2; continue;
case 0x55cd81: /* Ú */ answer += "u\xcc\x81"; i += 2; continue;
case 0x75cd81: /* Ú */ answer += "u\xcc\x81"; i += 2; continue;
case 0x59cd81: /* Ý */ answer += "y\xcc\x81"; i += 2; continue;
case 0x79cd81: /* Ý */ answer += "y\xcc\x81"; i += 2; continue;
case 0x5acd81: /* Ź */ answer += "z\xcc\x81"; i += 2; continue;
case 0x7acd81: /* Ź */ answer += "z\xcc\x81"; i += 2; continue;
default: break;
}
}
if (chars_left_including_current >= 2) {
unsigned short next_n_bytes = ((unsigned short)(unsigned char)chars[i] << 8) | (unsigned short)(unsigned char)chars[i + 1];
switch (next_n_bytes) {
case 0xc386: /* Æ */ answer += "ae"; i += 1; continue;
case 0xc3a6: /* Æ */ answer += "ae"; i += 1; continue;
case 0xc7bc: /* Ǽ */ answer += "ae"; i += 1; continue;
case 0xc7bd: /* Ǽ */ answer += "ae"; i += 1; continue;
case 0xc380: /* À */ answer += "a\xcc\x80"; i += 1; continue;
case 0xc3a0: /* À */ answer += "a\xcc\x80"; i += 1; continue;
case 0xc381: /* Á */ answer += "a\xcc\x81"; i += 1; continue;
case 0xc3a1: /* Á */ answer += "a\xcc\x81"; i += 1; continue;
case 0xc382: /* Â */ answer += "a\xcc\x82"; i += 1; continue;
case 0xc3a2: /* Â */ answer += "a\xcc\x82"; i += 1; continue;
case 0xc383: /* Ã */ answer += "a\xcc\x83"; i += 1; continue;
case 0xc3a3: /* Ã */ answer += "a\xcc\x83"; i += 1; continue;
case 0xc482: /* Ă */ answer += "a\xcc\x86"; i += 1; continue;
case 0xc483: /* Ă */ answer += "a\xcc\x86"; i += 1; continue;
case 0xc384: /* Ä */ answer += "a\xcc\x88"; i += 1; continue;
case 0xc3a4: /* Ä */ answer += "a\xcc\x88"; i += 1; continue;
case 0xc3a5: /* Å */ answer += "a\xcc\x8a"; i += 1; continue;
case 0xc385: /* Å */ answer += "a\xcc\x8a"; i += 1; continue;
case 0xc484: /* Ą */ answer += "a\xcc\xa8"; i += 1; continue;
case 0xc485: /* Ą */ answer += "a\xcc\xa8"; i += 1; continue;
case 0xc486: /* Ć */ answer += "c\xcc\x81"; i += 1; continue;
case 0xc487: /* Ć */ answer += "c\xcc\x81"; i += 1; continue;
case 0xc48c: /* Č */ answer += "c\xcc\x8c"; i += 1; continue;
case 0xc48d: /* Č */ answer += "c\xcc\x8c"; i += 1; continue;
case 0xc387: /* Ç */ answer += "c\xcc\xa7"; i += 1; continue;
case 0xc3a7: /* Ç */ answer += "c\xcc\xa7"; i += 1; continue;
case 0xc48e: /* Ď */ answer += "d\xcc\x8c"; i += 1; continue;
case 0xc48f: /* Ď */ answer += "d\xcc\x8c"; i += 1; continue;
case 0xc388: /* È */ answer += "e\xcc\x80"; i += 1; continue;
case 0xc3a8: /* È */ answer += "e\xcc\x80"; i += 1; continue;
case 0xc389: /* É */ answer += "e\xcc\x81"; i += 1; continue;
case 0xc3a9: /* É */ answer += "e\xcc\x81"; i += 1; continue;
case 0xc38a: /* Ê */ answer += "e\xcc\x82"; i += 1; continue;
case 0xc3aa: /* Ê */ answer += "e\xcc\x82"; i += 1; continue;
case 0xc38b: /* Ë */ answer += "e\xcc\x88"; i += 1; continue;
case 0xc3ab: /* Ë */ answer += "e\xcc\x88"; i += 1; continue;
case 0xc49a: /* Ě */ answer += "e\xcc\x8c"; i += 1; continue;
case 0xc49b: /* Ě */ answer += "e\xcc\x8c"; i += 1; continue;
case 0xc498: /* Ę */ answer += "e\xcc\xa8"; i += 1; continue;
case 0xc499: /* Ę */ answer += "e\xcc\xa8"; i += 1; continue;
case 0xc4b2: /* Ĳ */ answer += "ij"; i += 1; continue;
case 0xc4b3: /* Ĳ */ answer += "ij"; i += 1; continue;
case 0xc38c: /* Ì */ answer += "i\xcc\x80"; i += 1; continue;
case 0xc3ac: /* Ì */ answer += "i\xcc\x80"; i += 1; continue;
case 0xc38d: /* Í */ answer += "i\xcc\x81"; i += 1; continue;
case 0xc3ad: /* Í */ answer += "i\xcc\x81"; i += 1; continue;
case 0xc38e: /* Î */ answer += "i\xcc\x82"; i += 1; continue;
case 0xc3ae: /* Î */ answer += "i\xcc\x82"; i += 1; continue;
case 0xc38f: /* Ï */ answer += "i\xcc\x88"; i += 1; continue;
case 0xc3af: /* Ï */ answer += "i\xcc\x88"; i += 1; continue;
case 0xc583: /* Ń */ answer += "n\xcc\x81"; i += 1; continue;
case 0xc584: /* Ń */ answer += "n\xcc\x81"; i += 1; continue;
case 0xc391: /* Ñ */ answer += "n\xcc\x83"; i += 1; continue;
case 0xc3b1: /* Ñ */ answer += "n\xcc\x83"; i += 1; continue;
case 0xc587: /* Ň */ answer += "n\xcc\x8c"; i += 1; continue;
case 0xc588: /* Ň */ answer += "n\xcc\x8c"; i += 1; continue;
case 0xc592: /* Œ */ answer += "oe"; i += 1; continue;
case 0xc593: /* Œ */ answer += "oe"; i += 1; continue;
case 0xc392: /* Ò */ answer += "o\xcc\x80"; i += 1; continue;
case 0xc3b2: /* Ò */ answer += "o\xcc\x80"; i += 1; continue;
case 0xc393: /* Ó */ answer += "o\xcc\x81"; i += 1; continue;
case 0xc3b3: /* Ó */ answer += "o\xcc\x81"; i += 1; continue;
case 0xc394: /* Ô */ answer += "o\xcc\x82"; i += 1; continue;
case 0xc3b4: /* Ô */ answer += "o\xcc\x82"; i += 1; continue;
case 0xc395: /* Õ */ answer += "o\xcc\x83"; i += 1; continue;
case 0xc3b5: /* Õ */ answer += "o\xcc\x83"; i += 1; continue;
case 0xc396: /* Ö */ answer += "o\xcc\x88"; i += 1; continue;
case 0xc3b6: /* Ö */ answer += "o\xcc\x88"; i += 1; continue;
case 0xc590: /* Ő */ answer += "o\xcc\x8b"; i += 1; continue;
case 0xc591: /* Ő */ answer += "o\xcc\x8b"; i += 1; continue;
case 0xc598: /* Ř */ answer += "r\xcc\x8c"; i += 1; continue;
case 0xc599: /* Ř */ answer += "r\xcc\x8c"; i += 1; continue;
case 0xc39f: /* ẞ */ answer += "ss"; i += 1; continue;
case 0xc59a: /* Ś */ answer += "s\xcc\x81"; i += 1; continue;
case 0xc59b: /* Ś */ answer += "s\xcc\x81"; i += 1; continue;
case 0xc5a0: /* Š */ answer += "s\xcc\x8c"; i += 1; continue;
case 0xc5a1: /* Š */ answer += "s\xcc\x8c"; i += 1; continue;
case 0xc898: /* Ș */ answer += "s\xcc\xa6"; i += 1; continue;
case 0xc899: /* Ș */ answer += "s\xcc\xa6"; i += 1; continue;
case 0xc5a4: /* Ť */ answer += "t\xcc\x8c"; i += 1; continue;
case 0xc5a5: /* Ť */ answer += "t\xcc\x8c"; i += 1; continue;
case 0xc89a: /* Ț */ answer += "t\xcc\xa6"; i += 1; continue;
case 0xc89b: /* Ț */ answer += "t\xcc\xa6"; i += 1; continue;
case 0xc399: /* Ù */ answer += "u\xcc\x80"; i += 1; continue;
case 0xc3b9: /* Ù */ answer += "u\xcc\x80"; i += 1; continue;
case 0xc39a: /* Ú */ answer += "u\xcc\x81"; i += 1; continue;
case 0xc3ba: /* Ú */ answer += "u\xcc\x81"; i += 1; continue;
case 0xc39b: /* Û */ answer += "u\xcc\x82"; i += 1; continue;
case 0xc3bb: /* Û */ answer += "u\xcc\x82"; i += 1; continue;
case 0xc39c: /* Ü */ answer += "u\xcc\x88"; i += 1; continue;
case 0xc3bc: /* Ü */ answer += "u\xcc\x88"; i += 1; continue;
case 0xc5ae: /* Ů */ answer += "u\xcc\x8a"; i += 1; continue;
case 0xc5af: /* Ů */ answer += "u\xcc\x8a"; i += 1; continue;
case 0xc5b0: /* Ű */ answer += "u\xcc\x8b"; i += 1; continue;
case 0xc5b1: /* Ű */ answer += "u\xcc\x8b"; i += 1; continue;
case 0xc39d: /* Ý */ answer += "y\xcc\x81"; i += 1; continue;
case 0xc3bd: /* Ý */ answer += "y\xcc\x81"; i += 1; continue;
case 0xc5b8: /* Ÿ */ answer += "y\xcc\x88"; i += 1; continue;
case 0xc3bf: /* Ÿ */ answer += "y\xcc\x88"; i += 1; continue;
case 0xc5b9: /* Ź */ answer += "z\xcc\x81"; i += 1; continue;
case 0xc5ba: /* Ź */ answer += "z\xcc\x81"; i += 1; continue;
case 0xc5bb: /* Ż */ answer += "z\xcc\x87"; i += 1; continue;
case 0xc5bc: /* Ż */ answer += "z\xcc\x87"; i += 1; continue;
case 0xc5bd: /* Ž */ answer += "z\xcc\x8c"; i += 1; continue;
case 0xc5be: /* Ž */ answer += "z\xcc\x8c"; i += 1; continue;
case 0xc398: /* Ø */ answer += "\xc3\xb8"; i += 1; continue;
case 0xc7be: /* Ǿ */ answer += "\xc3\xb8\xcc\x81"; i += 1; continue;
case 0xc7bf: /* Ǿ */ answer += "\xc3\xb8\xcc\x81"; i += 1; continue;
case 0xc581: /* Ł */ answer += "\xc5\x82"; i += 1; continue;
case 0xd090: /* А */ answer += "\xd0\xb0"; i += 1; continue;
case 0xd091: /* Б */ answer += "\xd0\xb1"; i += 1; continue;
case 0xd092: /* В */ answer += "\xd0\xb2"; i += 1; continue;
case 0xd093: /* Г */ answer += "\xd0\xb3"; i += 1; continue;
case 0xd094: /* Д */ answer += "\xd0\xb4"; i += 1; continue;
case 0xd095: /* Е */ answer += "\xd0\xb5"; i += 1; continue;
case 0xd081: /* Ё */ answer += "\xd0\xb5\xcc\x88"; i += 1; continue;
case 0xd191: /* Ё */ answer += "\xd0\xb5\xcc\x88"; i += 1; continue;
case 0xd096: /* Ж */ answer += "\xd0\xb6"; i += 1; continue;
case 0xd097: /* З */ answer += "\xd0\xb7"; i += 1; continue;
case 0xd098: /* И */ answer += "\xd0\xb8"; i += 1; continue;
case 0xd099: /* Й */ answer += "\xd0\xb8\xcc\x86"; i += 1; continue;
case 0xd0b9: /* Й */ answer += "\xd0\xb8\xcc\x86"; i += 1; continue;
case 0xd09a: /* К */ answer += "\xd0\xba"; i += 1; continue;
case 0xd09b: /* Л */ answer += "\xd0\xbb"; i += 1; continue;
case 0xd09c: /* М */ answer += "\xd0\xbc"; i += 1; continue;
case 0xd09d: /* Н */ answer += "\xd0\xbd"; i += 1; continue;
case 0xd09e: /* О */ answer += "\xd0\xbe"; i += 1; continue;
case 0xd09f: /* П */ answer += "\xd0\xbf"; i += 1; continue;
case 0xd0a0: /* Р */ answer += "\xd1\x80"; i += 1; continue;
case 0xd0a1: /* С */ answer += "\xd1\x81"; i += 1; continue;
case 0xd0a2: /* Т */ answer += "\xd1\x82"; i += 1; continue;
case 0xd0a3: /* У */ answer += "\xd1\x83"; i += 1; continue;
case 0xd0a4: /* Ф */ answer += "\xd1\x84"; i += 1; continue;
case 0xd0a5: /* Х */ answer += "\xd1\x85"; i += 1; continue;
case 0xd0a6: /* Ц */ answer += "\xd1\x86"; i += 1; continue;
case 0xd0a7: /* Ч */ answer += "\xd1\x87"; i += 1; continue;
case 0xd0a8: /* Ш */ answer += "\xd1\x88"; i += 1; continue;
case 0xd0a9: /* Щ */ answer += "\xd1\x89"; i += 1; continue;
case 0xd0aa: /* Ъ */ answer += "\xd1\x8a"; i += 1; continue;
case 0xd0ab: /* Ы */ answer += "\xd1\x8b"; i += 1; continue;
case 0xd0ac: /* Ь */ answer += "\xd1\x8c"; i += 1; continue;
case 0xd0ad: /* Э */ answer += "\xd1\x8d"; i += 1; continue;
case 0xd0ae: /* Ю */ answer += "\xd1\x8e"; i += 1; continue;
case 0xd0af: /* Я */ answer += "\xd1\x8f"; i += 1; continue;
case 0xd084: /* Є */ answer += "\xd1\x94"; i += 1; continue;
case 0xd086: /* І */ answer += "\xd1\x96"; i += 1; continue;
case 0xd087: /* Ї */ answer += "\xd1\x96\xcc\x88"; i += 1; continue;
case 0xd197: /* Ї */ answer += "\xd1\x96\xcc\x88"; i += 1; continue;
case 0xd290: /* Ґ */ answer += "\xd2\x91"; i += 1; continue;
default: break;
}
}
}
// autogenerated part end
// fallthrough: no table entry matched, so copy the whole sequence byte by
// byte (only ASCII letters are changed by ascii_to_lower)
for (int start_i = i; i < start_i + supposed_char_length; ++i)
answer += ascii_to_lower(chars[i]);
// it will get autoincremented on next cycle, reduce it by one till then
--i;
}
return answer;
}
// Copyright: pavlukivan 2022
// License: 0BSD (Meaning this notice can be removed, use this as you like)
// Include guard. The name deliberately avoids a double underscore:
// identifiers containing "__" are reserved for the implementation.
#ifndef LIGHTWEIGHT_UNICODE_H
#define LIGHTWEIGHT_UNICODE_H
#include <string>
// Return a lowercased copy of the UTF-8 encoded string `source`.
std::string to_lower_utf8(const std::string &source);
#endif  // LIGHTWEIGHT_UNICODE_H
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment