Skip to content

Instantly share code, notes, and snippets.

@mtornwall
Created August 13, 2013 04:07
Show Gist options
  • Save mtornwall/6217806 to your computer and use it in GitHub Desktop.
Save mtornwall/6217806 to your computer and use it in GitHub Desktop.
naïve utf8 encode/decode
#include <stdio.h>
#include <string.h>
#include <stdint.h>
typedef uint32_t rune;
/**
 * Encode an array of Unicode code points as a NUL-terminated UTF-8 string.
 *
 * @param runes  Code points to encode; exactly `length` elements are read.
 * @param buf    Destination buffer. Its size is not passed in, so no bounds
 *               checking is possible here: the caller must provide room for
 *               the worst case of 4 * length + 1 bytes.
 * @param length Number of code points in `runes`.
 * @return Pointer to the terminating NUL written after the last encoded
 *         byte (so `result - buf` is the encoded size), or NULL if a code
 *         point is not a Unicode scalar value, i.e. it is above U+10FFFF or
 *         is a UTF-16 surrogate (U+D800..U+DFFF); RFC 3629 forbids both.
 *         On failure the bytes encoded so far remain in `buf` without a
 *         terminator.
 */
uint8_t *
encode(const rune *runes, uint8_t *buf, size_t length)
{
    /* Marker bits of the lead byte, indexed by total sequence length. */
    static const uint8_t lead_marker[] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0 };

    for (size_t i = 0; i < length; i++) {
        rune r = runes[i];
        unsigned n;

        if (r > 0x10FFFF || (r >= 0xD800 && r <= 0xDFFF)) {
            return NULL; /* Not a Unicode scalar value (RFC 3629). */
        }

        if (r < 0x80) {
            /* ASCII is encoded as itself. */
            *buf++ = (uint8_t) r;
            continue;
        } else if (r < 0x800) {
            n = 2;
        } else if (r < 0x10000) {
            n = 3;
        } else {
            n = 4;
        }

        /*
         * The high-order bits must come first, but the low-order bits are
         * the easy ones to peel off, so fill the continuation bytes from
         * the last one back to the second.
         */
        for (unsigned k = n - 1; k > 0; k--) {
            buf[k] = (uint8_t) (0x80 | (r & 0x3F));
            r >>= 6;
        }

        /* After the shifts, r holds only the bits that fit the lead byte. */
        buf[0] = (uint8_t) (lead_marker[n] | r);
        buf += n;
    }

    *buf = 0;
    return buf;
}
/**
 * Decode a NUL-terminated UTF-8 string into an array of code points.
 *
 * @param u      NUL-terminated UTF-8 input.
 * @param buf    Destination array. Its size is not passed in, so no bounds
 *               checking is possible here: the caller must provide room for
 *               one rune per input byte (the worst case, all-ASCII input).
 * @param length Out: number of runes stored in `buf`. On failure it holds
 *               the number of runes decoded before the malformed sequence.
 * @return `buf` on success, or NULL if the input is not well-formed UTF-8:
 *         a stray continuation byte, an invalid lead byte (0xF8..0xFF), a
 *         truncated sequence, an overlong encoding, an encoded surrogate
 *         (U+D800..U+DFFF) or a value above U+10FFFF.
 */
rune *
decode(const uint8_t *u, rune *buf, size_t *length)
{
    /* Smallest code point that legitimately needs an n-byte sequence. */
    static const rune min_for_len[] = { 0, 0, 0x80, 0x800, 0x10000 };

    *length = 0;

    while (*u) {
        uint8_t lead = *u++;
        unsigned n;
        rune r;

        /* Classify the lead byte: sequence length and its payload bits. */
        if (lead < 0x80) {
            buf[(*length)++] = lead; /* ASCII maps to itself. */
            continue;
        } else if ((lead & 0xE0) == 0xC0) {
            n = 2;
            r = lead & 0x1F;
        } else if ((lead & 0xF0) == 0xE0) {
            n = 3;
            r = lead & 0x0F;
        } else if ((lead & 0xF8) == 0xF0) {
            n = 4;
            r = lead & 0x07;
        } else {
            /* Stray continuation byte (10xxxxxx) or lead byte >= 0xF8. */
            return NULL;
        }

        /*
         * Consume the n - 1 continuation bytes. The terminating NUL fails
         * the 10xxxxxx check, so a truncated sequence never reads past the
         * end of the string.
         */
        for (unsigned k = 1; k < n; k++) {
            if ((*u & 0xC0) != 0x80) {
                return NULL;
            }
            r = (r << 6) | (*u++ & 0x3Fu);
        }

        /* Reject overlong forms, surrogates and out-of-range values. */
        if (r < min_for_len[n] || r > 0x10FFFF ||
            (r >= 0xD800 && r <= 0xDFFF)) {
            return NULL;
        }

        buf[(*length)++] = r;
    }

    return buf;
}
/*
 * Round-trip demo: read one line from stdin, decode it to code points,
 * print each code point in hexadecimal, then re-encode the code points and
 * print the resulting UTF-8 text.
 *
 * Returns 0 on success, or 1 if no input could be read or if decode() or
 * encode() rejects the data.
 */
int main(void)
{
    /* Byte buffers are uint8_t to match the encode()/decode() signatures. */
    uint8_t input[1024];
    /* One rune per input byte is the worst case (all-ASCII input). */
    rune runes[1024];
    /* Worst case: every rune re-encodes to 4 bytes, plus the terminator. */
    uint8_t output[4 * 1024 + 1];
    size_t length;

    /* fgets() keeps the trailing newline; it is decoded like any other byte. */
    if (fgets((char *) input, sizeof input, stdin) == NULL) {
        fputs("error: no input\n", stderr);
        return 1;
    }

    if (decode(input, runes, &length) == NULL) {
        fputs("error: input is not valid UTF-8\n", stderr);
        return 1;
    }

    for (size_t i = 0; i < length; i++) {
        /* unsigned long is at least 32 bits wide, so it always holds a rune. */
        printf("%lx ", (unsigned long) runes[i]);
    }

    if (encode(runes, output, length) == NULL) {
        fputs("\nerror: code point cannot be encoded as UTF-8\n", stderr);
        return 1;
    }

    printf("\n%s\n", (char *) output);
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment