Last active
January 4, 2022 23:00
-
-
Save liangfu/19b848da2757d9b8a596bdb5182b6095 to your computer and use it in GitHub Desktop.
An implementation of IEEE 16-bit floating point data type
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <inttypes.h>
#include <stdio.h>
#include <string.h>
/* Field masks for a 32-bit single-precision bit pattern: 1 sign bit,
 * 8 exponent bits, 23 mantissa bits (most significant bit first). */
static const uint32_t kSingleSignMask = 0x80000000;
static const uint32_t kSingleExpMask = 0x7f800000;
static const uint32_t kSingleMantMask = 0x007fffff;

/* Field masks for a 16-bit half-precision bit pattern: 1 sign bit,
 * 5 exponent bits, 10 mantissa bits. Kept as 32-bit values so they can be
 * combined with uint32_t field arguments without narrowing. */
static const uint32_t kHalfSignMask = 0x8000;
static const uint32_t kHalfExpMask = 0x7c00;
static const uint32_t kHalfMantMask = 0x03ff;
/*!
 * \brief IEEE 16-bit floating point value, stored as its raw bit pattern.
 *
 * Layout of \c data, most significant bit first: 1 sign bit, 5 exponent bits
 * (bias 15) and 10 mantissa bits.
 */
struct half {
  uint16_t data;

  /*! \brief Creates a value whose bit pattern is all zeros (+0.0). */
  half() : data(0) {}

  /*! \brief Wraps an existing 16-bit pattern; \p a is stored unchanged. */
  half(uint16_t a) : data(a) {}

  /*! \brief Widens to double by way of to_float(). */
  operator double() const { return to_float(); }

  /*! \brief Widens to float; identical to to_float(). */
  operator float() const { return to_float(); }

  // Referring to https://github.com/ramenhut/half/blob/master/half.h
  /*!
   * \brief Expands the stored half-precision pattern to single precision.
   *
   * Every half value is exactly representable as a float, so the conversion
   * is lossless. Signed zeros, subnormals, infinities and NaNs are each
   * mapped to the corresponding single-precision encoding.
   *
   * \return the float with the same numeric value (or the same Inf/NaN class).
   */
  float to_float() const {
    const uint32_t half_exp_all_ones = 0x1f;
    const uint32_t half_mant_mask = 0x03ff;
    const uint32_t half_hidden_bit = 0x0400;  // implicit leading 1 of a normal
    const uint32_t exp_rebias = 127 - 15;     // single bias minus half bias
    const uint32_t mant_shift = 23 - 10;      // extra mantissa bits in a float

    const uint32_t sign = static_cast<uint32_t>(data >> 15) << 31;
    uint32_t exponent = (static_cast<uint32_t>(data) >> 10) & half_exp_all_ones;
    uint32_t mantissa = data & half_mant_mask;

    uint32_t bits;
    if (exponent == half_exp_all_ones) {
      // Infinity (mantissa == 0) or NaN: the exponent field must stay all
      // ones; the NaN payload is carried over in the top mantissa bits.
      bits = sign | 0x7f800000u | (mantissa << mant_shift);
    } else if (exponent != 0) {
      // Normal number: only the exponent bias changes.
      bits = sign | ((exponent + exp_rebias) << 23) | (mantissa << mant_shift);
    } else if (mantissa == 0) {
      // Signed zero.
      bits = sign;
    } else {
      // Subnormal: value is mantissa * 2^-24. It is a normal float, so
      // shift the mantissa up until the hidden bit appears and lower the
      // exponent by the same amount.
      exponent = exp_rebias + 1;
      while ((mantissa & half_hidden_bit) == 0) {
        mantissa <<= 1;
        --exponent;
      }
      mantissa &= half_mant_mask;  // drop the now-implicit leading 1
      bits = sign | (exponent << 23) | (mantissa << mant_shift);
    }

    // memcpy is the well-defined way to reinterpret the bits as a float.
    float value;
    memcpy(&value, &bits, sizeof value);
    return value;
  }
};
/*!
 * \brief Google's 16-bit floating point data type.
 *
 * \c data holds the upper 16 bits of a 32-bit single-precision bit pattern;
 * the lower 16 bits are treated as zero when converting back to float.
 */
struct bfloat16 {
  uint16_t data;

  /*! \brief Widens to double by way of to_float(). */
  operator double() const { return to_float(); }

  /*! \brief Widens to float; identical to to_float(). */
  operator float() const { return to_float(); }

  /*!
   * \brief Rebuilds the float whose upper 16 bits are \c data.
   * \return the float with bit pattern (data << 16).
   */
  float to_float() const {
    // Widen before shifting: a bare `data << 16` is evaluated in signed int
    // and overflows whenever the sign bit (bit 15) is set.
    const uint32_t bits = static_cast<uint32_t>(data) << 16;

    // memcpy is the well-defined way to reinterpret the bits as a float.
    float value;
    memcpy(&value, &bits, sizeof value);
    return value;
  }
};
/*!
 * \brief Packs separate sign/exponent/mantissa fields into a half and prints
 *        the resulting encoding to stdout.
 *
 * Each field is shifted into place and masked, so bits that do not fit the
 * field are silently dropped.
 *
 * \param sig_bits sign field; only the lowest bit is used.
 * \param exp_bits biased exponent field; only the lowest 5 bits are used.
 * \param man_bits mantissa field; only the lowest 10 bits are used.
 * \return the half whose \c data is the packed 16-bit pattern.
 */
half GetHalf(uint32_t sig_bits, uint32_t exp_bits, uint32_t man_bits) {
  const uint32_t bits = ((sig_bits << 15) & kHalfSignMask) |
                        ((exp_bits << 10) & kHalfExpMask) |
                        (man_bits & kHalfMantMask);

  // PRIx32 keeps the format specifiers matched to uint32_t on every platform.
  printf("0x%" PRIx32 " (0x%" PRIx32 ", 0x%" PRIx32 ", 0x%" PRIx32 ")\n",
         bits, sig_bits, exp_bits, man_bits);

  // The masks above guarantee the value fits in 16 bits.
  return half(static_cast<uint16_t>(bits));
}
/*!
 * \brief Packs separate sign/exponent/mantissa fields into a bfloat16 and
 *        prints the intermediate 32-bit encoding to stdout.
 *
 * The fields are first assembled as a 32-bit single-precision pattern, with
 * the mantissa placed in the top of the mantissa field; the upper 16 bits of
 * that pattern become the result. Bits that do not fit a field are silently
 * dropped.
 *
 * \param sig_bits sign field; only the lowest bit is used.
 * \param exp_bits biased exponent field; only the lowest 8 bits are used.
 * \param man_bits mantissa field; only the lowest 7 bits are used.
 * \return the bfloat16 whose \c data is the packed 16-bit pattern.
 */
bfloat16 GetBFloat16(uint32_t sig_bits, uint32_t exp_bits, uint32_t man_bits) {
  // Move the mantissa into bits 16..22 so it survives the final >> 16.
  const uint32_t shifted_man = man_bits << 16;

  const uint32_t bits = ((sig_bits << 31) & kSingleSignMask) |
                        ((exp_bits << 23) & kSingleExpMask) |
                        (shifted_man & kSingleMantMask);

  // The mantissa is reported in its shifted (unmasked) form.
  // PRIx32 keeps the format specifiers matched to uint32_t on every platform.
  printf("0x%" PRIx32 " (0x%" PRIx32 ", 0x%" PRIx32 ", 0x%" PRIx32 ")\n",
         bits, sig_bits, exp_bits, shifted_man);

  bfloat16 h;
  h.data = static_cast<uint16_t>(bits >> 16);
  return h;
}
/*! \brief Returns the raw 32-bit pattern of \p value (well-defined type pun). */
static uint32_t FloatBits(float value) {
  uint32_t bits;
  memcpy(&bits, &value, sizeof bits);
  return bits;
}

/*!
 * \brief Demo driver: builds two half values and one bfloat16, converts them
 *        to double/float and prints the values and their float bit patterns.
 */
int main() {
  // Positive half: exponent field 0b10000, mantissa all ones.
  half a = GetHalf(0, 0b10000, 0x03ff);
  double d = a;
  float f = a;
  uint32_t u = FloatBits(f);
  printf("%g\n", d);
  printf("%g\n", f);
  printf("0x%" PRIx32 "\n", u);
  printf("0x%" PRIx32 "\n", u >> 23);  // sign and exponent fields of the float
  printf("--\n");
  fflush(stdout);

  // Same magnitude with the sign bit set.
  a = GetHalf(1, 0b10000, 0x03ff);
  d = a;
  f = a;
  u = FloatBits(f);
  printf("%g\n", d);
  printf("%g\n", f);
  printf("0x%" PRIx32 "\n", u);
  printf("0x%" PRIx32 "\n", u >> 23);
  printf("--\n");
  fflush(stdout);

  // bfloat16 round trip: keep only the upper 16 bits of a float.
  float g = -0.165894;
  bfloat16 b;
  b.data = static_cast<uint16_t>(FloatBits(g) >> 16);
  f = b;
  u = FloatBits(f);
  printf("%g\n", g);
  printf("%g\n", f);
  printf("0x%" PRIx32 "\n", u);
  printf("--\n");
  fflush(stdout);
  return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment