dezashibi · August 6, 2024 08:40
diff --git a/windows_posix_utf8_characters.c b/windows_posix_utf8_characters.c
 #include <locale.h>
 #include <stddef.h>
 #include <stdio.h>
 #include <wchar.h>
 #include <wctype.h>

 #if defined(_WIN32) || defined(_WIN64)
 #include <Windows.h>
 #define LOCALE_STR ".UTF-8"
 #else
 #define LOCALE_STR ""
 #endif

 // Prepares the process for printing UTF-8 encoded wide characters.
 // On Windows the console output code page is switched to UTF-8 and the
 // ".UTF-8" locale is requested; elsewhere the locale named by the
 // environment ("") is used.
 // A rejected locale is reported on stderr instead of being ignored, because
 // the program would otherwise keep running in the default "C" locale.
 void setup_locale(void)
 {
 #if defined(_WIN32) || defined(_WIN64)
    // Best effort: the result is intentionally not checked.
    SetConsoleOutputCP(CP_UTF8);
 #endif
    if (setlocale(LC_ALL, LOCALE_STR) == NULL)
    {
        fprintf(stderr,
                "warning: setlocale(LC_ALL, \"%s\") failed; wide characters may not print correctly\n",
                LOCALE_STR);
    }
 }

 // Expands to the string "is" when the classification function `func` reports
 // a match for `letter`, and to "is not" otherwise.
 // Classification functions such as iswalpha() only promise a nonzero result
 // on a match, not a positive one, so the result is compared against zero.
 #define CHECK_CATEGORY(func, letter) ((func(letter)) != 0 ? "is" : "is not")

 // Taken from here: https://stackoverflow.com/a/40474925
 // Reports whether `w` is a digit: returns 1 if it is, 0 otherwise.
 // Anything iswdigit() accepts counts as a digit; on top of that, the table
 // below lists script-specific digit blocks with code points up to 0xFFFF.
 // Numbers (http://www.fileformat.info/info/unicode/category/No/list.htm)
 // such as circled, parenthesized or full-stop digits are not included.
 // The table is not exhaustive, and code points above 0xFFFF are not handled.
 int is_utf16_digit(wchar_t w)
 {
    // Inclusive [first, last] code point ranges, in ascending order.
    static const struct
    {
        wchar_t first;
        wchar_t last;
    } digit_blocks[] = {
        {0x0660, 0x0669}, // ARABIC-INDIC DIGITs
        {0x06f0, 0x06f9}, // EXTENDED ARABIC-INDIC DIGITs
        {0x0966, 0x096f}, // DEVANAGARI DIGITs
        {0x09e6, 0x09ef}, // BENGALI DIGITs
        {0x0a66, 0x0a6f}, // GURMUKHI DIGITs
        {0x0ae6, 0x0aef}, // GUJARATI DIGITs
        {0x0b66, 0x0b6f}, // ORIYA DIGITs
        {0x0be6, 0x0bef}, // TAMIL DIGITs (starts at 0x0be6, TAMIL DIGIT ZERO)
        {0x0c66, 0x0c6f}, // TELUGU DIGITs
        {0x0ce6, 0x0cef}, // KANNADA DIGITs
        {0x0d66, 0x0d6f}, // MALAYALAM DIGITs
        {0x0e50, 0x0e59}, // THAI DIGITs
        {0x0ed0, 0x0ed9}, // LAO DIGITs
        {0x0f20, 0x0f29}, // TIBETAN DIGITs
        {0x1040, 0x1049}, // MYANMAR DIGITs
        {0x1369, 0x1371}, // ETHIOPIC DIGITs (1-9 only)
        {0x17e0, 0x17e9}, // KHMER DIGITs
        {0x1810, 0x1819}, // MONGOLIAN DIGITs
        {0xff10, 0xff19}, // FULLWIDTH DIGITs
    };
    /*
      Deliberately left out:
      0x2460 - 0x2468 CIRCLED DIGITs 1-9
      0x24ea          CIRCLED DIGIT 0
      0x2474 - 0x247c PARENTHESIZED DIGITs 1-9
      0x2488 - 0x2490 DIGITs 1-9 FULL STOP
      0x24f5 - 0x24fd DOUBLE CIRCLED DIGITs 1-9
      and everything larger than 16 bits, e.g.
      0x102e1 - 0x102e9 COPTIC EPACT DIGITs 1-9
      0x10a40 - 0x10a43 KHAROSHTHI DIGITs 1-4
      0x10e60 - 0x10e68 RUMI DIGITs 1-9
      0x1f101 - 0x1f10a DIGITs 0-9 COMMA
    */
    const size_t block_count = sizeof digit_blocks / sizeof digit_blocks[0];

    // a small shortcut
    if (iswdigit(w))
    {
        return 1;
    }

    for (size_t i = 0; i < block_count; ++i)
    {
        if (w >= digit_blocks[i].first && w <= digit_blocks[i].last)
        {
            return 1;
        }
    }
    return 0;
 }

 // Demo entry point: classifies four sample characters, then prints every
 // code point from U+0622 to U+06CC that is alphabetic and not a digit.
 int main()
 {
    const wchar_t arabic_letter = L'ت';    // U+062A
    const wchar_t persian_digit = L'۲';    // U+06F2
    const wchar_t ascii_digit = L'2';
    const wchar_t arabic_semicolon = L'؛'; // U+061B
    const wchar_t range_first = L'آ';      // U+0622
    const wchar_t range_last = L'ی';       // U+06CC

    setup_locale();

    printf("'%lc' %s letter, '%lc' %s digit\n",
           arabic_letter, CHECK_CATEGORY(iswalpha, arabic_letter),
           arabic_letter, CHECK_CATEGORY(is_utf16_digit, arabic_letter));
    printf("'%lc' %s letter, '%lc' %s digit\n",
           persian_digit, CHECK_CATEGORY(iswalpha, persian_digit),
           persian_digit, CHECK_CATEGORY(is_utf16_digit, persian_digit));
    printf("'%lc' %s letter, '%lc' %s digit\n",
           ascii_digit, CHECK_CATEGORY(iswalpha, ascii_digit),
           ascii_digit, CHECK_CATEGORY(is_utf16_digit, ascii_digit));
    printf("'%lc' %s letter, '%lc' %s punctuation\n",
           arabic_semicolon, CHECK_CATEGORY(iswalpha, arabic_semicolon),
           arabic_semicolon, CHECK_CATEGORY(iswpunct, arabic_semicolon));

    printf("%s", "List of all Persian/Arabic family alphabets: ");
    for (wchar_t cp = range_first; cp <= range_last; ++cp)
    {
        // Skip anything that is not a letter, or that is a digit.
        if (!iswalpha(cp) || is_utf16_digit(cp))
        {
            continue;
        }
        printf("%lc ", cp);
    }

    puts("");

    return 0;
 }
	#include <locale.h>
	#include <stddef.h>
	#include <stdio.h>
	#include <wchar.h>
	#include <wctype.h>

	#if defined(_WIN32) || defined(_WIN64)
	#include <Windows.h>
	#define LOCALE_STR ".UTF-8"
	#else
	#define LOCALE_STR ""
	#endif

	// Prepares the process for printing UTF-8 encoded wide characters.
	// On Windows the console output code page is switched to UTF-8 and the
	// ".UTF-8" locale is requested; elsewhere the locale named by the
	// environment ("") is used.
	// A rejected locale is reported on stderr instead of being ignored, because
	// the program would otherwise keep running in the default "C" locale.
	void setup_locale(void)
	{
	#if defined(_WIN32) || defined(_WIN64)
		// Best effort: the result is intentionally not checked.
		SetConsoleOutputCP(CP_UTF8);
	#endif
		if (setlocale(LC_ALL, LOCALE_STR) == NULL)
		{
			fprintf(stderr,
				"warning: setlocale(LC_ALL, \"%s\") failed; wide characters may not print correctly\n",
				LOCALE_STR);
		}
	}

	// Expands to the string "is" when the classification function `func` reports
	// a match for `letter`, and to "is not" otherwise.
	// Classification functions such as iswalpha() only promise a nonzero result
	// on a match, not a positive one, so the result is compared against zero.
	#define CHECK_CATEGORY(func, letter) ((func(letter)) != 0 ? "is" : "is not")

	// Taken from here: https://stackoverflow.com/a/40474925
	// Reports whether `w` is a digit: returns 1 if it is, 0 otherwise.
	// Anything iswdigit() accepts counts as a digit; on top of that, the table
	// below lists script-specific digit blocks with code points up to 0xFFFF.
	// Numbers (http://www.fileformat.info/info/unicode/category/No/list.htm)
	// such as circled, parenthesized or full-stop digits are not included.
	// The table is not exhaustive, and code points above 0xFFFF are not handled.
	int is_utf16_digit(wchar_t w)
	{
		// Inclusive [first, last] code point ranges, in ascending order.
		static const struct
		{
			wchar_t first;
			wchar_t last;
		} digit_blocks[] = {
			{0x0660, 0x0669}, // ARABIC-INDIC DIGITs
			{0x06f0, 0x06f9}, // EXTENDED ARABIC-INDIC DIGITs
			{0x0966, 0x096f}, // DEVANAGARI DIGITs
			{0x09e6, 0x09ef}, // BENGALI DIGITs
			{0x0a66, 0x0a6f}, // GURMUKHI DIGITs
			{0x0ae6, 0x0aef}, // GUJARATI DIGITs
			{0x0b66, 0x0b6f}, // ORIYA DIGITs
			{0x0be6, 0x0bef}, // TAMIL DIGITs (starts at 0x0be6, TAMIL DIGIT ZERO)
			{0x0c66, 0x0c6f}, // TELUGU DIGITs
			{0x0ce6, 0x0cef}, // KANNADA DIGITs
			{0x0d66, 0x0d6f}, // MALAYALAM DIGITs
			{0x0e50, 0x0e59}, // THAI DIGITs
			{0x0ed0, 0x0ed9}, // LAO DIGITs
			{0x0f20, 0x0f29}, // TIBETAN DIGITs
			{0x1040, 0x1049}, // MYANMAR DIGITs
			{0x1369, 0x1371}, // ETHIOPIC DIGITs (1-9 only)
			{0x17e0, 0x17e9}, // KHMER DIGITs
			{0x1810, 0x1819}, // MONGOLIAN DIGITs
			{0xff10, 0xff19}, // FULLWIDTH DIGITs
		};
		/*
		  Deliberately left out:
		  0x2460 - 0x2468 CIRCLED DIGITs 1-9
		  0x24ea          CIRCLED DIGIT 0
		  0x2474 - 0x247c PARENTHESIZED DIGITs 1-9
		  0x2488 - 0x2490 DIGITs 1-9 FULL STOP
		  0x24f5 - 0x24fd DOUBLE CIRCLED DIGITs 1-9
		  and everything larger than 16 bits, e.g.
		  0x102e1 - 0x102e9 COPTIC EPACT DIGITs 1-9
		  0x10a40 - 0x10a43 KHAROSHTHI DIGITs 1-4
		  0x10e60 - 0x10e68 RUMI DIGITs 1-9
		  0x1f101 - 0x1f10a DIGITs 0-9 COMMA
		*/
		const size_t block_count = sizeof digit_blocks / sizeof digit_blocks[0];

		// a small shortcut
		if (iswdigit(w))
		{
			return 1;
		}

		for (size_t i = 0; i < block_count; ++i)
		{
			if (w >= digit_blocks[i].first && w <= digit_blocks[i].last)
			{
				return 1;
			}
		}
		return 0;
	}

	// Demo entry point: classifies four sample characters, then prints every
	// code point from U+0622 to U+06CC that is alphabetic and not a digit.
	int main()
	{
		const wchar_t arabic_letter = L'ت';    // U+062A
		const wchar_t persian_digit = L'۲';    // U+06F2
		const wchar_t ascii_digit = L'2';
		const wchar_t arabic_semicolon = L'؛'; // U+061B
		const wchar_t range_first = L'آ';      // U+0622
		const wchar_t range_last = L'ی';       // U+06CC

		setup_locale();

		printf("'%lc' %s letter, '%lc' %s digit\n",
			arabic_letter, CHECK_CATEGORY(iswalpha, arabic_letter),
			arabic_letter, CHECK_CATEGORY(is_utf16_digit, arabic_letter));
		printf("'%lc' %s letter, '%lc' %s digit\n",
			persian_digit, CHECK_CATEGORY(iswalpha, persian_digit),
			persian_digit, CHECK_CATEGORY(is_utf16_digit, persian_digit));
		printf("'%lc' %s letter, '%lc' %s digit\n",
			ascii_digit, CHECK_CATEGORY(iswalpha, ascii_digit),
			ascii_digit, CHECK_CATEGORY(is_utf16_digit, ascii_digit));
		printf("'%lc' %s letter, '%lc' %s punctuation\n",
			arabic_semicolon, CHECK_CATEGORY(iswalpha, arabic_semicolon),
			arabic_semicolon, CHECK_CATEGORY(iswpunct, arabic_semicolon));

		printf("%s", "List of all Persian/Arabic family alphabets: ");
		for (wchar_t cp = range_first; cp <= range_last; ++cp)
		{
			// Skip anything that is not a letter, or that is a digit.
			if (!iswalpha(cp) || is_utf16_digit(cp))
			{
				continue;
			}
			printf("%lc ", cp);
		}

		puts("");

		return 0;
	}