Created
August 25, 2017 02:13
-
-
Save nigeltao/2884d66f16fd3109184967a78cfea7a5 to your computer and use it in GitHub Desktop.
zlib: diff -u inffast.c contrib/inffast64/inffast64.c
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- inffast.c 2017-08-25 12:05:02.309161736 +1000
+++ contrib/inffast64/inffast64.c 2017-08-25 12:09:05.504252162 +1000
@@ -11,8 +11,11 @@
#ifdef ASMINF
# pragma message("Assembler code may have bugs -- use at your own risk")
#elif INFLATE_FAST64
-/* Skip this implementation; use contrib/inffast64/inffast64.c instead. */
-#else
+
+#if defined(ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS) || \
+ defined(ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS_LITTLE_ENDIAN)
+#include <stdint.h>
+#endif
/*
Decode literal, length, and distance codes and write out the resulting
@@ -45,10 +48,28 @@
Therefore if strm->avail_in >= 6, then there is enough input to avoid
checking for available input while decoding.
+ - On some architectures, it can be significantly faster (e.g. up to 1.2x
+ faster on x86_64) to load from strm->next_in 64 bits, or 8 bytes, at a
+ time, so INFLATE_FAST_MIN_HAVE == 8. This requires a little endian
+ architecture.
+
- The maximum bytes that a single length/distance pair can output is 258
bytes, which is the maximum length that can be coded. inflate_fast()
requires strm->avail_out >= 258 for each loop to avoid checking for
output space.
+
+ - On some architectures, for length-distance copies, we similarly load and
+ store 8 bytes at a time, if the distance is at least 8. Again, this can
+ be significantly faster (e.g. up to 1.3x faster on x86_64). Rounding up
+ to a multiple of 8 gives INFLATE_FAST_MIN_LEFT == 264. This does not
+ require a little endian architecture. This always copies 8*L bytes (where
+ L is the smallest integer such that 8*L >= len, i.e. we round length up
+ to a multiple of 8), instead of only len bytes, but that's OK, as
+ subsequent iterations will fix the overrun.
+
+ - Combining those two optimizations for 64 bit unaligned loads gives up to
+ a 1.5x throughput improvement on x86_64.
+
*/
void ZLIB_INTERNAL inflate_fast(strm, start)
z_streamp strm;
@@ -67,7 +88,54 @@
unsigned whave; /* valid bytes in the window */
unsigned wnext; /* window write index */
unsigned char FAR *window; /* allocated sliding window, if wsize != 0 */
- unsigned long hold; /* local strm->hold */
+
+ /* hold is a local copy of strm->hold. By default, hold satisfies the same
+ invariants that strm->hold does, namely that (hold >> bits) == 0. This
+ invariant is kept by loading bits into hold one byte at a time, like:
+
+ hold |= next_byte_of_input << bits; in++; bits += 8;
+
+ If we need to ensure that bits >= 15 then this code snippet is simply
+ repeated. Over one iteration of the outermost do/while loop, this
+ happens up to six times (48 bits of input), as described in the NOTES
+ above.
+
+ However, on some little endian architectures, it can be significantly
+ faster to load 64 bits once instead of 8 bits six times:
+
+ if (bits <= 16) {
+ hold |= next_8_bytes_of_input << bits; in += 6; bits += 48;
+ }
+
+ Unlike the simpler one byte load, shifting the next_8_bytes_of_input
+ by bits will overflow and lose those high bits, up to 2 bytes' worth.
+ The conservative estimate is therefore that we have read only 6 bytes
+ (48 bits). Again, as per the NOTES above, 48 bits is sufficient for the
+ rest of the iteration, and we will not need to load another 8 bytes.
+
+ Inside this function, we no longer satisfy (hold >> bits) == 0, but
+ this is not problematic, even if that overflow does not land on an 8 bit
+ byte boundary. Those excess bits will eventually shift down lower as the
+ Huffman decoder consumes input, and when new input bits need to be loaded
+ into the bits variable, the same input bits will be or'ed over those
+ existing bits. A bitwise or is idempotent: (a | b | b) equals (a | b).
+ Note that we therefore write that load operation as "hold |= etc" and not
+ "hold += etc".
+
+ Outside that loop, at the end of the function, hold is bitwise and'ed
+ with (1<<bits)-1 to drop those excess bits so that, on function exit, we
+ keep the invariant that (state->hold >> state->bits) == 0.
+
+ TODO: rename the bits variable to nbits, so that this block comment
+ is less confusing when discussing bits (the variable) and bits (one
+ eighth of a byte).
+ */
+#ifdef ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS_LITTLE_ENDIAN
+ uint64_t hold;
+#else
+ unsigned long hold;
+#endif
+
unsigned bits; /* local strm->bits */
code const FAR *lcode; /* local strm->lencode */
code const FAR *dcode; /* local strm->distcode */
@@ -105,10 +173,41 @@
input data or output space */
do {
if (bits < 15) {
+#ifdef ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS_LITTLE_ENDIAN
+ /* Example disassembly on __x86_64__:
+ 49 8b 06 mov (%r14),%rax
+ 89 f1 mov %esi,%ecx
+ 49 83 c6 06 add $0x6,%r14
+ 83 c6 30 add $0x30,%esi
+ 48 d3 e0 shl %cl,%rax
+ 48 09 c2 or %rax,%rdx
+ */
+ hold |= *((uint64_t *)(in)) << bits;
+ in += 6;
+ bits += 48;
+#else
+ /* Example disassembly on __x86_64__:
+ 45 0f b6 45 00 movzbl 0x0(%r13),%r8d
+ 89 f1 mov %esi,%ecx
+ 49 83 c5 02 add $0x2,%r13
+ 49 d3 e0 shl %cl,%r8
+ 8d 4e 08 lea 0x8(%rsi),%ecx
+ 83 c6 10 add $0x10,%esi
+ 49 01 d0 add %rdx,%r8
+ 41 0f b6 55 ff movzbl -0x1(%r13),%edx
+ 48 d3 e2 shl %cl,%rdx
+ 4c 01 c2 add %r8,%rdx
+ */
+ /* TODO: replace "hold += etc" with "hold |= etc", here and below,
+ to be consistent with the 64 bit unaligned code path. This is
+ only a comment for now so that the commit that introduced this
+ comment has no effect whatsoever on architectures without
+ ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS_LITTLE_ENDIAN. */
hold += (unsigned long)(*in++) << bits;
bits += 8;
hold += (unsigned long)(*in++) << bits;
bits += 8;
+#endif
}
here = lcode[hold & lmask];
dolen:
@@ -127,8 +226,14 @@
op &= 15; /* number of extra bits */
if (op) {
if (bits < op) {
+#ifdef ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS_LITTLE_ENDIAN
+ hold |= *((uint64_t *)(in)) << bits;
+ in += 6;
+ bits += 48;
+#else
hold += (unsigned long)(*in++) << bits;
bits += 8;
+#endif
}
len += (unsigned)hold & ((1U << op) - 1);
hold >>= op;
@@ -136,10 +241,16 @@
}
Tracevv((stderr, "inflate: length %u\n", len));
if (bits < 15) {
+#ifdef ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS_LITTLE_ENDIAN
+ hold |= *((uint64_t *)(in)) << bits;
+ in += 6;
+ bits += 48;
+#else
hold += (unsigned long)(*in++) << bits;
bits += 8;
hold += (unsigned long)(*in++) << bits;
bits += 8;
+#endif
}
here = dcode[hold & dmask];
dodist:
@@ -151,12 +262,18 @@
dist = (unsigned)(here.val);
op &= 15; /* number of extra bits */
if (bits < op) {
+#ifdef ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS_LITTLE_ENDIAN
+ hold |= *((uint64_t *)(in)) << bits;
+ in += 6;
+ bits += 48;
+#else
hold += (unsigned long)(*in++) << bits;
bits += 8;
if (bits < op) {
hold += (unsigned long)(*in++) << bits;
bits += 8;
}
+#endif
}
dist += (unsigned)hold & ((1U << op) - 1);
#ifdef INFLATE_STRICT
@@ -239,6 +356,22 @@
from = out - dist; /* rest from output */
}
}
+
+#ifdef ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS
+ if (dist >= 8) {
+ while (1) {
+ *((uint64_t*)(out)) = *((uint64_t*)(from));
+ if (len <= 8) {
+ out += len;
+ break;
+ }
+ out += 8;
+ from += 8;
+ len -= 8;
+ }
+ continue;
+ }
+#endif
while (len > 2) {
*out++ = *from++;
*out++ = *from++;
@@ -253,6 +386,21 @@
}
else {
from = out - dist; /* copy direct from output */
+#ifdef ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS
+ if (dist >= 8) {
+ while (1) {
+ *((uint64_t*)(out)) = *((uint64_t*)(from));
+ if (len <= 8) {
+ out += len;
+ break;
+ }
+ out += 8;
+ from += 8;
+ len -= 8;
+ }
+ continue;
+ }
+#endif
do { /* minimum length is three */
*out++ = *from++;
*out++ = *from++;
@@ -326,4 +474,4 @@
- Moving len -= 3 statement into middle of loop
*/
-#endif /* !ASMINF && !INFLATE_FAST64 */
+#endif /* !ASMINF && INFLATE_FAST64 */
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment