Created
August 6, 2024 08:40
-
-
Save dezashibi/8f157667a51bec3ded2a00c4678a544f to your computer and use it in GitHub Desktop.
Get Unicode (UTF-8) characters working in C on Windows and POSIX
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <locale.h> | |
#include <stddef.h> | |
#include <stdio.h> | |
#include <wchar.h> | |
#include <wctype.h> | |
// Platform configuration for setup_locale().
// On Windows the locale name ".UTF-8" is requested explicitly; elsewhere the
// empty string asks setlocale() for the locale configured in the environment.
// NOTE(review): the ".UTF-8" locale name needs a C runtime that supports it
// (recent Universal CRT) - confirm for the toolchains you target.
#if defined(_WIN32) || defined(_WIN64)
#include <Windows.h>
#define LOCALE_STR ".UTF-8"
#else
#define LOCALE_STR ""
#endif

// Prepares the process for printing non-ASCII text.
//
// On Windows the console output code page is switched to UTF-8 first; on every
// platform the C locale is then set to LOCALE_STR for all categories.
//
// Both steps are best effort: a failure is reported on stderr and execution
// continues, because the caller has no way to receive an error (void return).
void setup_locale(void)
{
#if defined(_WIN32) || defined(_WIN64)
    // SetConsoleOutputCP() returns zero on failure.
    if (!SetConsoleOutputCP(CP_UTF8))
    {
        fputs("warning: could not switch the console output code page to UTF-8\n", stderr);
    }
#endif

    // setlocale() returns NULL when the requested locale cannot be honored,
    // in which case the previous locale stays in effect.
    if (setlocale(LC_ALL, LOCALE_STR) == NULL)
    {
        fputs("warning: setlocale(LC_ALL, \"" LOCALE_STR "\") failed; "
              "wide characters may not print correctly\n",
              stderr);
    }
}
// Expands to the string "is" when the classification function `func` reports a
// match for `letter`, and to "is not" otherwise.
// The result is compared against zero rather than tested for "> 0" because the
// <wctype.h> classifiers only promise a non-zero value for a match, not a
// positive one.
#define CHECK_CATEGORY(func, letter) (func(letter) != 0 ? "is" : "is not")
// Adapted from https://stackoverflow.com/a/40474925
//
// Reports whether `w` is a character representing one of the digits 0 (zero)
// to 9 (nine) in any of the scripts listed in the table below.
//
// Returns 1 for such a digit and 0 for everything else.
//
// General "number" characters
// (http://www.fileformat.info/info/unicode/category/No/list.htm) are not
// included, apart from the ETHIOPIC digits 1-9 which the original list carries.
// Decorated digits are deliberately left out, e.g.:
//   0x2460 - 0x2468  CIRCLED DIGITs 1-9
//   0x24ea           CIRCLED DIGIT 0
//   0x2474 - 0x247c  PARENTHESIZED DIGITs 1-9
//   0x2488 - 0x2490  DIGITs 1-9 FULL STOP
//   0x24f5 - 0x24fd  DOUBLE CIRCLED DIGITs 1-9
//   and so on and so forth.
//
// The restriction to 16-bit code points is arbitrary. The first candidates
// above 16 bits would be:
//   (0x10107 - 0x1010f  AEGEAN NUMBERs 1-9)
//   0x102e1 - 0x102e9   COPTIC EPACT DIGITs 1-9
//   0x10a40 - 0x10a43   KHAROSHTHI DIGITs 1-4
//   0x10e60 - 0x10e68   RUMI DIGITs 1-9
//   0x1f100             DIGIT ZERO FULL STOP (far away from its 1-9 siblings)
//   0x1f101 - 0x1f10a   digits followed by a comma
//
// The table is not an exhaustive list of Unicode decimal digits; scripts that
// are missing from it are only recognized if iswdigit() accepts them.
int is_utf16_digit(wchar_t w)
{
    // Inclusive code point ranges, sorted in ascending order.
    static const struct
    {
        unsigned long first;
        unsigned long last;
    } digit_ranges[] = {
        {0x0660, 0x0669}, // ARABIC-INDIC DIGITs
        {0x06f0, 0x06f9}, // EXTENDED ARABIC-INDIC DIGITs
        {0x0966, 0x096f}, // DEVANAGARI DIGITs
        {0x09e6, 0x09ef}, // BENGALI DIGITs
        {0x0a66, 0x0a6f}, // GURMUKHI DIGITs
        {0x0ae6, 0x0aef}, // GUJARATI DIGITs
        {0x0b66, 0x0b6f}, // ORIYA DIGITs
        {0x0be6, 0x0bef}, // TAMIL DIGITs (0x0be6 is TAMIL DIGIT ZERO)
        {0x0c66, 0x0c6f}, // TELUGU DIGITs
        {0x0ce6, 0x0cef}, // KANNADA DIGITs
        {0x0d66, 0x0d6f}, // MALAYALAM DIGITs
        {0x0e50, 0x0e59}, // THAI DIGITs
        {0x0ed0, 0x0ed9}, // LAO DIGITs
        {0x0f20, 0x0f29}, // TIBETAN DIGITs
        {0x1040, 0x1049}, // MYANMAR DIGITs
        {0x1369, 0x1371}, // ETHIOPIC DIGITs 1-9
        {0x17e0, 0x17e9}, // KHMER DIGITs
        {0x1810, 0x1819}, // MONGOLIAN DIGITs
        {0xff10, 0xff19}, // FULLWIDTH DIGITs (meh?)
    };

    // A small shortcut: whatever the C library already calls a digit.
    if (iswdigit(w))
    {
        return 1;
    }

    // wchar_t may be signed; going through an unsigned type turns a negative
    // value into a huge one that cannot fall into any range.
    const unsigned long code = (unsigned long)w;

    for (size_t i = 0; i < sizeof digit_ranges / sizeof digit_ranges[0]; ++i)
    {
        if (code < digit_ranges[i].first)
        {
            // The table is sorted, so no later range can contain `code`.
            return 0;
        }
        if (code <= digit_ranges[i].last)
        {
            return 1;
        }
    }
    return 0;
}
int main() | |
{ | |
setup_locale(); | |
wchar_t c = L'ت'; | |
wchar_t d = L'۲'; | |
wchar_t d2 = L'2'; | |
wchar_t d3 = L'؛'; | |
printf("'%lc' %s letter, '%lc' %s digit\n", c, CHECK_CATEGORY(iswalpha, c), c, CHECK_CATEGORY(is_utf16_digit, c)); | |
printf("'%lc' %s letter, '%lc' %s digit\n", d, CHECK_CATEGORY(iswalpha, d), d, CHECK_CATEGORY(is_utf16_digit, d)); | |
printf("'%lc' %s letter, '%lc' %s digit\n", d2, CHECK_CATEGORY(iswalpha, d2), d2, CHECK_CATEGORY(is_utf16_digit, d2)); | |
printf("'%lc' %s letter, '%lc' %s punctuation\n", d3, CHECK_CATEGORY(iswalpha, d3), d3, CHECK_CATEGORY(iswpunct, d3)); | |
printf("%s", "List of all Persian/Arabic family alphabets: "); | |
for (wchar_t i = L'آ'; i <= L'ی'; ++i) | |
if (iswalpha(i) && !is_utf16_digit(i)) | |
printf("%lc ", i); | |
puts(""); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment