Here are my two implementations. The first one relies on branches, which can be a problem on in-order CPUs, but it uses very little memory:
#include <stdint.h>

/* This method is faster than the OpenEXR implementation (very often
 * used, e.g. in Ogre), with the additional benefit of rounding, inspired
 * by James Tursa’s half-precision code. */
static inline uint16_t float_to_half_branch(uint32_t x)
{
    uint16_t bits = (x >> 16) & 0x8000; /* Get the sign */
    uint16_t m = (x >> 12) & 0x07ff; /* Keep one extra bit for rounding */
    unsigned int e = (x >> 23) & 0xff; /* Using int is faster here */

    /* If zero, or denormal, or exponent underflows too much for a denormal
     * half, return signed zero. */
    if (e < 103)
        return bits;

    /* If NaN, return NaN. If Inf or exponent overflow, return Inf. */
    if (e > 142)
    {
        bits |= 0x7c00u;
        /* If exponent was 0xff and one mantissa bit was set, it means NaN,
         * not Inf, so make sure we set one mantissa bit too. */
        bits |= e == 255 && (x & 0x007fffffu);
        return bits;
    }

    /* If exponent underflows but not too much, return a denormal */
    if (e < 113)
    {
        m |= 0x0800u;
        /* Extra rounding may overflow and set mantissa to 0 and exponent
         * to 1, which is OK. */
        bits |= (m >> (114 - e)) + ((m >> (113 - e)) & 1);
        return bits;
    }

    bits |= ((e - 112) << 10) | (m >> 1);
    /* Extra rounding. An overflow will set mantissa to 0 and increment
     * the exponent, which is OK. */
    bits += m & 1;
    return bits;
}
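Note that the function takes the float’s raw bit pattern, not the float itself. Here is a minimal usage sketch (not part of the original code, assuming a C99 compiler) that type-puns a float through memcpy:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    float f = 3.14159265f;
    uint32_t x;
    /* Copy the float into its IEEE 754 bit pattern; memcpy avoids
     * strict-aliasing problems. */
    memcpy(&x, &f, sizeof(x));
    uint16_t h = float_to_half_branch(x);
    printf("%f -> 0x%04x\n", f, h); /* prints 3.141593 -> 0x4248 */
    return 0;
}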
The second one uses only lookup tables; this costs some memory and can therefore suffer from cache misses if the function is called only occasionally:
/* These macros implement a finite iterator useful to build lookup
 * tables. For instance, S64(0) will call S1(x) for all values of x
 * between 0 and 63.
 * Since each level expands to four copies of the previous one, the
 * exponential growth can put significant stress on the compiler. */
#define S4(x) S1((x)), S1((x)+1), S1((x)+2), S1((x)+3)
#define S16(x) S4((x)), S4((x)+4), S4((x)+8), S4((x)+12)
#define S64(x) S16((x)), S16((x)+16), S16((x)+32), S16((x)+48)
#define S256(x) S64((x)), S64((x)+64), S64((x)+128), S64((x)+192)
#define S1024(x) S256((x)), S256((x)+256), S256((x)+512), S256((x)+768)
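To see what these macros do, here is a hypothetical illustration that is not part of the conversion code: with S1 defined to square its argument, S16(0) expands at compile time to the first sixteen squares.

/* Hypothetical example only: fills the array with 0, 1, 4, ..., 225. */
#define S1(n) ((n) * (n))
static int const first_squares[16] = { S16(0) };
#undef S1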
/* Lookup table-based algorithm from “Fast Half Float Conversions”
 * by Jeroen van der Zijp, November 2008. No rounding is performed,
 * and some NaN values may be incorrectly converted to Inf. */
static inline uint16_t float_to_half_nobranch(uint32_t x)
{
    static uint16_t const basetable[512] =
    {
#define S1(i) (((i) < 103) ? 0x0000 : \
               ((i) < 113) ? 0x0400 >> (113 - (i)) : \
               ((i) < 143) ? ((i) - 112) << 10 : 0x7c00)
        S256(0),
#undef S1
#define S1(i) (0x8000 | (((i) < 103) ? 0x0000 : \
                         ((i) < 113) ? 0x0400 >> (113 - (i)) : \
                         ((i) < 143) ? ((i) - 112) << 10 : 0x7c00))
        S256(0),
#undef S1
    };

    static uint8_t const shifttable[512] =
    {
#define S1(i) (((i) < 103) ? 24 : \
               ((i) < 113) ? 126 - (i) : \
               ((i) < 143 || (i) == 255) ? 13 : 24)
        S256(0), S256(0),
#undef S1
    };

    uint16_t bits = basetable[(x >> 23) & 0x1ff];
    bits |= (x & 0x007fffff) >> shifttable[(x >> 23) & 0x1ff];
    return bits;
}
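Because the table version truncates where the branch version rounds, the two functions can disagree by one ULP. Here is a quick sketch of my own (not from either reference implementation) with an input chosen to sit exactly between two half-precision values:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    float f = 1.00048828125f; /* 1 + 2^-11, halfway between two halves */
    uint32_t x;
    memcpy(&x, &f, sizeof(x));
    printf("branch:   0x%04x\n", float_to_half_branch(x));   /* 0x3c01, rounded up */
    printf("nobranch: 0x%04x\n", float_to_half_nobranch(x)); /* 0x3c00, truncated */
    return 0;
}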