Created
January 5, 2017 06:41
-
-
Save cloudwu/4705dbf6a0a4657bee78f2749e8da601 to your computer and use it in GitHub Desktop.
A filter for converting UTF-8 to UTF-16 on Windows.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Build with MinGW on Windows:
//   gcc -O2 -o utf8.exe utf8to16.c
#include <stdio.h>
#include <wchar.h>

#include <fcntl.h>
#include <io.h>
/*
 * Indexed by the value of the first byte of a UTF-8 sequence; yields the
 * number of bytes that follow that first byte.
 *
 *   0x00-0xBF -> 0   (ASCII; continuation-range bytes 0x80-0xBF also map to 0)
 *   0xC0-0xDF -> 1
 *   0xE0-0xEF -> 2
 *   0xF0-0xF7 -> 3
 *   0xF8-0xFB -> 4   (5-byte form)
 *   0xFC-0xFF -> 5   (6-byte form)
 *
 * One row per high nibble of the lead byte.
 */
static const char trailingBytesForUTF8[256] = {
	/* 0x00 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0x10 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0x20 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0x30 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0x40 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0x50 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0x60 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0x70 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0x80 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0x90 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0xA0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0xB0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0xC0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	/* 0xD0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	/* 0xE0 */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
	/* 0xF0 */ 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
/*
 * Indexed by the number of trailing bytes (0-5).  When the raw bytes of a
 * sequence are accumulated as ch = (ch << 6) + byte, the marker bits of the
 * lead byte and of every continuation byte (0x80) end up summed into ch;
 * subtracting the matching entry removes them and leaves the code point.
 *
 * Example, 2-byte form: (0xC0 << 6) + 0x80 == 0x3080.
 * The last two entries are the same sums reduced to 32 bits.
 */
static const unsigned long offsetsFromUTF8[6] = {
	0x00000000UL,	/* 1-byte sequence */
	0x00003080UL,	/* 2-byte sequence */
	0x000E2080UL,	/* 3-byte sequence */
	0x03C82080UL,	/* 4-byte sequence */
	0xFA082080UL,	/* 5-byte sequence */
	0x82082080UL	/* 6-byte sequence */
};
/*
 * Constants for splitting a code point above 0xFFFF into a UTF-16
 * surrogate pair: subtract UNI_BASE, then the bits above UNI_SHIFT are
 * added to UNI_SUR_HIGH_START and the low UNI_MASK bits are added to
 * UNI_SUR_LOW_START.
 */
#define UNI_SHIFT          10       /* bits carried by each surrogate */
#define UNI_BASE           0x10000  /* first code point needing a pair */
#define UNI_MASK           0x3FF    /* low UNI_SHIFT bits */
#define UNI_SUR_HIGH_START 0xD800   /* first high (leading) surrogate */
#define UNI_SUR_LOW_START  0xDC00   /* first low (trailing) surrogate */
int | |
main() { | |
int c; | |
_setmode(_fileno(stdout), _O_WTEXT); | |
while ((c=fgetc(stdin)) >= 0) { | |
unsigned long ch = 0; | |
int extra = trailingBytesForUTF8[c]; | |
#define READBYTE() ch += c; ch <<= 6; if ((c=fgetc(stdin)) < 0) return 1; | |
switch (extra) { | |
case 5: READBYTE(); | |
case 4: READBYTE(); | |
case 3: READBYTE(); | |
case 2: READBYTE(); | |
case 1: READBYTE(); | |
case 0: ch += c; | |
} | |
ch -= offsetsFromUTF8[extra]; | |
if (ch <= 0xffff) { | |
wprintf(L"%lc", ch); | |
} else { | |
ch -= UNI_BASE; | |
wprintf(L"%lc%lc", (ch >> UNI_SHIFT) + UNI_SUR_HIGH_START, | |
(ch & UNI_MASK) + UNI_SUR_LOW_START); | |
} | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment