Skip to content

Instantly share code, notes, and snippets.

@dezashibi
Created August 6, 2024 08:40
Show Gist options
  • Save dezashibi/8f157667a51bec3ded2a00c4678a544f to your computer and use it in GitHub Desktop.
Save dezashibi/8f157667a51bec3ded2a00c4678a544f to your computer and use it in GitHub Desktop.
Get Unicode (UTF-8) characters working in C on Windows and POSIX
#include <locale.h>
#include <stddef.h>
#include <stdio.h>
#include <wchar.h>
#include <wctype.h>
#if defined(_WIN32) || defined(_WIN64)
#include <Windows.h>
#define LOCALE_STR ".UTF-8"
#else
#define LOCALE_STR ""
#endif
/*
 * Prepare the process for UTF-8 text output.
 *
 * On Windows the console output code page is switched to UTF-8 first. On
 * every platform the C locale is then set for all categories from LOCALE_STR.
 *
 * Takes no arguments and returns nothing. A setlocale() failure is not fatal:
 * a warning is written to stderr and the previously active locale stays in
 * effect, so later wide-character output may not render as expected.
 */
void setup_locale(void)
{
#if defined(_WIN32) || defined(_WIN64)
    // Best effort: the result is deliberately ignored, as in the original code.
    (void)SetConsoleOutputCP(CP_UTF8);
#endif
    // setlocale() returns a null pointer when the request cannot be honoured.
    if (setlocale(LC_ALL, LOCALE_STR) == NULL)
    {
        fprintf(stderr, "warning: setlocale(LC_ALL, \"%s\") failed; keeping the current locale\n", LOCALE_STR);
    }
}
// Expands to the string "is" when the classification function `func` reports
// `letter` as a member of its category, and to "is not" otherwise.
// The <wctype.h> classifiers only promise a *nonzero* result for a match, so
// the result is compared against zero rather than tested for being positive.
#define CHECK_CATEGORY(func, letter) (func(letter) != 0 ? "is" : "is not")
// Taken from here: https://stackoverflow.com/a/40474925
//
// Reports whether `w` is a character that represents a decimal digit between
// 0 (zero) and 9 (nine) in one of the scripts listed in the table below.
//
// Returns 1 for a digit and 0 for anything else.
//
// Numbers (http://www.fileformat.info/info/unicode/category/No/list.htm)
// are not included.
// The restriction to 16-bit code points is arbitrary; the table is a
// hand-picked list of scripts and makes no claim to be exhaustive.
int is_utf16_digit(wchar_t w)
{
    // Inclusive code point ranges, sorted in ascending order so the scan
    // below can stop as soon as it has passed the candidate.
    static const struct
    {
        unsigned long first;
        unsigned long last;
    } digit_ranges[] = {
        {0x0660UL, 0x0669UL}, // ARABIC-INDIC DIGITs
        {0x06f0UL, 0x06f9UL}, // EXTENDED ARABIC-INDIC DIGITs
        {0x0966UL, 0x096fUL}, // DEVANAGARI DIGITs
        {0x09e6UL, 0x09efUL}, // BENGALI DIGITs
        {0x0a66UL, 0x0a6fUL}, // GURMUKHI DIGITs
        {0x0ae6UL, 0x0aefUL}, // GUJARATI DIGITs
        {0x0b66UL, 0x0b6fUL}, // ORIYA DIGITs
        {0x0be6UL, 0x0befUL}, // TAMIL DIGITs (0x0be6 is TAMIL DIGIT ZERO)
        {0x0c66UL, 0x0c6fUL}, // TELUGU DIGITs
        {0x0ce6UL, 0x0cefUL}, // KANNADA DIGITs
        {0x0d66UL, 0x0d6fUL}, // MALAYALAM DIGITs
        {0x0e50UL, 0x0e59UL}, // THAI DIGITs
        {0x0ed0UL, 0x0ed9UL}, // LAO DIGITs
        {0x0f20UL, 0x0f29UL}, // TIBETAN DIGITs
        {0x1040UL, 0x1049UL}, // MYANMAR DIGITs
        {0x1369UL, 0x1371UL}, // ETHIOPIC DIGITs (1-9 only, there is no zero)
        {0x17e0UL, 0x17e9UL}, // KHMER DIGITs
        {0x1810UL, 0x1819UL}, // MONGOLIAN DIGITs
        /*
        Deliberately left out:
        0x2460 - 0x2468 CIRCLED DIGITs 1-9
        0x24ea CIRCLED DIGITs 0
        0x2474 - 0x247c PARENTHESIZED DIGITs 1-9
        0x2488 - 0x2490 DIGITs 1-9 FULL STOP
        0x24f5 - 0x24fd DOUBLE CIRCLED DIGIT 1-9
        and so on and so forth
        */
        {0xff10UL, 0xff19UL}, // FULLWIDTH DIGITs (meh?)
        /*
        First characters larger than 16 bits (not handled):
        (0x10107 - 0x1010f AEGEAN NUMBERs 1-9)
        0x102e1 - 0x102e9 COPTIC EPACT DIGITs 1-9
        0x10a40 - 0x10a43 KHAROSHTHI DIGITs 1-4
        0x10e60 - 0x10e68 RUMI DIGITs 1-9
        Interestingly, the zero for DIGITs 1-9 FULL STOP is at 0x1f100.
        Forgotten by the committee?
        First consecutive digits are the digits with a comma:
        0x1f101 - 0x1f10a
        */
    };
    const size_t range_count = sizeof digit_ranges / sizeof digit_ranges[0];
    // Converting to an unsigned type maps a negative wchar_t (where wchar_t is
    // signed) to a value far above every range, so it can never match.
    const unsigned long code = (unsigned long)w;

    // a small shortcut for the digits the C library already knows about
    if (iswdigit(w))
    {
        return 1;
    }

    for (size_t i = 0; i < range_count; ++i)
    {
        if (code < digit_ranges[i].first)
        {
            // Sorted table: every remaining range starts even higher.
            break;
        }
        if (code <= digit_ranges[i].last)
        {
            return 1;
        }
    }
    return 0;
}
/*
 * Demo driver.
 *
 * After the locale is set up it prints, for three sample characters, whether
 * each one is a letter and whether it is a digit, then whether a fourth
 * sample is a letter / punctuation. Finally it walks the code points from
 * 'آ' through 'ی' and prints every one that iswalpha() accepts and
 * is_utf16_digit() rejects.
 */
int main()
{
    const wchar_t samples[] = {L'ت', L'۲', L'2'};
    const wchar_t semicolon = L'؛';

    setup_locale();

    // Letter/digit report for each sample, in declaration order.
    for (size_t k = 0; k < sizeof samples / sizeof samples[0]; ++k)
    {
        const wchar_t ch = samples[k];
        printf("'%lc' %s letter, '%lc' %s digit\n", ch, CHECK_CATEGORY(iswalpha, ch), ch, CHECK_CATEGORY(is_utf16_digit, ch));
    }

    // The last sample is checked against the punctuation class instead.
    printf("'%lc' %s letter, '%lc' %s punctuation\n", semicolon, CHECK_CATEGORY(iswalpha, semicolon), semicolon, CHECK_CATEGORY(iswpunct, semicolon));

    fputs("List of all Persian/Arabic family alphabets: ", stdout);
    for (wchar_t cur = L'آ'; cur <= L'ی'; ++cur)
    {
        // Skip anything that is not alphabetic or that counts as a digit.
        if (!iswalpha(cur) || is_utf16_digit(cur))
        {
            continue;
        }
        printf("%lc ", cur);
    }
    putchar('\n');

    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment