Created
August 25, 2017 02:13
-
-
Save nigeltao/2884d66f16fd3109184967a78cfea7a5 to your computer and use it in GitHub Desktop.
zlib: diff -u inffast.c contrib/inffast64/inffast64.c
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- inffast.c 2017-08-25 12:05:02.309161736 +1000
+++ contrib/inffast64/inffast64.c 2017-08-25 12:09:05.504252162 +1000
@@ -11,8 +11,11 @@
#ifdef ASMINF
# pragma message("Assembler code may have bugs -- use at your own risk")
#elif INFLATE_FAST64
-/* Skip this implementation; use contrib/inffast64/inffast64.c instead. */
-#else
+
+#if defined(ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS) || \
+ defined(ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS_LITTLE_ENDIAN)
+#include <stdint.h>
+#endif
/*
Decode literal, length, and distance codes and write out the resulting
@@ -45,10 +48,28 @@
Therefore if strm->avail_in >= 6, then there is enough input to avoid
checking for available input while decoding.
+ - On some architectures, it can be significantly faster (e.g. up to 1.2x
+ faster on x86_64) to load from strm->next_in 64 bits, or 8 bytes, at a
+ time, so INFLATE_FAST_MIN_HAVE == 8. This requires a little endian
+ architecture.
+
- The maximum bytes that a single length/distance pair can output is 258
bytes, which is the maximum length that can be coded. inflate_fast()
requires strm->avail_out >= 258 for each loop to avoid checking for
output space.
+
+ - On some architectures, for length-distance copies, we similarly load and
+ store 8 bytes at a time, if the distance is at least 8. Again, this can
+ be significantly faster (e.g. up to 1.3x faster on x86_64). Rounding up
+ to a multiple of 8 gives INFLATE_FAST_MIN_LEFT == 264. This does not
+ require a little endian architecture. This always copies 8*L bytes (where
+ L is the smallest integer such that 8*L >= len, i.e. we round length up
+ to a multiple of 8), instead of only len bytes, but that's OK, as
+ subsequent iterations will fix the overrun.
+
+ - Combining those two optimizations for 64 bit unaligned loads gives up to
+ a 1.5x throughput improvement on x86_64.
+
*/
void ZLIB_INTERNAL inflate_fast(strm, start)
z_streamp strm;
@@ -67,7 +88,54 @@
unsigned whave; /* valid bytes in the window */
unsigned wnext; /* window write index */
unsigned char FAR *window; /* allocated sliding window, if wsize != 0 */
- unsigned long hold; /* local strm->hold */
+
+ /* hold is a local copy of strm->hold. By default, hold satisfies the same
+ invariants that strm->hold does, namely that (hold >> bits) == 0. This
+ invariant is kept by loading bits into hold one byte at a time, like:
+
+ hold |= next_byte_of_input << bits; in++; bits += 8;
+
+ If we need to ensure that bits >= 15 then this code snippet is simply
+ repeated. Over one iteration of the outermost do/while loop, this
+ happens up to six times (48 bits of input), as described in the NOTES
+ above.
+
+ However, on some little endian architectures, it can be significantly
+ faster to load 64 bits once instead of 8 bits six times:
+
+ if (bits <= 16) {
+ hold |= next_8_bytes_of_input << bits; in += 6; bits += 48;
+ }
+
+ Unlike the simpler one byte load, shifting the next_8_bytes_of_input
+ by bits will overflow and lose those high bits, up to 2 bytes' worth.
+ The conservative estimate is therefore that we have read only 6 bytes
+ (48 bits). Again, as per the NOTES above, 48 bits is sufficient for the
+ rest of the iteration, and we will not need to load another 8 bytes.
+
+ Inside this function, we no longer satisfy (hold >> bits) == 0, but
+ this is not problematic, even if that overflow does not land on an 8 bit
+ byte boundary. Those excess bits will eventually shift down lower as the
+ Huffman decoder consumes input, and when new input bits need to be loaded
+ into the bits variable, the same input bits will be or'ed over those
+ existing bits. A bitwise or is idempotent: (a | b | b) equals (a | b).
+ Note that we therefore write that load operation as "hold |= etc" and not
+ "hold += etc".
+
+ Outside that loop, at the end of the function, hold is bitwise and'ed
+ with (1<<bits)-1 to drop those excess bits so that, on function exit, we
+ keep the invariant that (state->hold >> state->bits) == 0.
+
+ TODO: rename the bits variable to nbits, so that this block comment
+ is less confusing when discussing bits (the variable) and bits (one
+ eighth of a byte).
+ */
+#ifdef ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS_LITTLE_ENDIAN
+ uint64_t hold;
+#else
+ unsigned long hold;
+#endif
+
unsigned bits; /* local strm->bits */
code const FAR *lcode; /* local strm->lencode */
code const FAR *dcode; /* local strm->distcode */
@@ -105,10 +173,41 @@
input data or output space */
do {
if (bits < 15) {
+#ifdef ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS_LITTLE_ENDIAN
+ /* Example disassembly on __x86_64__:
+ 49 8b 06 mov (%r14),%rax
+ 89 f1 mov %esi,%ecx
+ 49 83 c6 06 add $0x6,%r14
+ 83 c6 30 add $0x30,%esi
+ 48 d3 e0 shl %cl,%rax
+ 48 09 c2 or %rax,%rdx
+ */
+ hold |= *((uint64_t *)(in)) << bits;
+ in += 6;
+ bits += 48;
+#else
+ /* Example disassembly on __x86_64__:
+ 45 0f b6 45 00 movzbl 0x0(%r13),%r8d
+ 89 f1 mov %esi,%ecx
+ 49 83 c5 02 add $0x2,%r13
+ 49 d3 e0 shl %cl,%r8
+ 8d 4e 08 lea 0x8(%rsi),%ecx
+ 83 c6 10 add $0x10,%esi
+ 49 01 d0 add %rdx,%r8
+ 41 0f b6 55 ff movzbl -0x1(%r13),%edx
+ 48 d3 e2 shl %cl,%rdx
+ 4c 01 c2 add %r8,%rdx
+ */
+ /* TODO: replace "hold += etc" with "hold |= etc", here and below,
+ to be consistent with the 64 bit unaligned code path. This is
+ only a comment for now so that the commit that introduced this
+ comment has no effect whatsoever on architectures without
+ ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS_LITTLE_ENDIAN. */
hold += (unsigned long)(*in++) << bits;
bits += 8;
hold += (unsigned long)(*in++) << bits;
bits += 8;
+#endif
}
here = lcode[hold & lmask];
dolen:
@@ -127,8 +226,14 @@
op &= 15; /* number of extra bits */
if (op) {
if (bits < op) {
+#ifdef ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS_LITTLE_ENDIAN
+ hold |= *((uint64_t *)(in)) << bits;
+ in += 6;
+ bits += 48;
+#else
hold += (unsigned long)(*in++) << bits;
bits += 8;
+#endif
}
len += (unsigned)hold & ((1U << op) - 1);
hold >>= op;
@@ -136,10 +241,16 @@
}
Tracevv((stderr, "inflate: length %u\n", len));
if (bits < 15) {
+#ifdef ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS_LITTLE_ENDIAN
+ hold |= *((uint64_t *)(in)) << bits;
+ in += 6;
+ bits += 48;
+#else
hold += (unsigned long)(*in++) << bits;
bits += 8;
hold += (unsigned long)(*in++) << bits;
bits += 8;
+#endif
}
here = dcode[hold & dmask];
dodist:
@@ -151,12 +262,18 @@
dist = (unsigned)(here.val);
op &= 15; /* number of extra bits */
if (bits < op) {
+#ifdef ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS_LITTLE_ENDIAN
+ hold |= *((uint64_t *)(in)) << bits;
+ in += 6;
+ bits += 48;
+#else
hold += (unsigned long)(*in++) << bits;
bits += 8;
if (bits < op) {
hold += (unsigned long)(*in++) << bits;
bits += 8;
}
+#endif
}
dist += (unsigned)hold & ((1U << op) - 1);
#ifdef INFLATE_STRICT
@@ -239,6 +356,22 @@
from = out - dist; /* rest from output */
}
}
+
+#ifdef ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS
+ if (dist >= 8) {
+ while (1) {
+ *((uint64_t*)(out)) = *((uint64_t*)(from));
+ if (len <= 8) {
+ out += len;
+ break;
+ }
+ out += 8;
+ from += 8;
+ len -= 8;
+ }
+ continue;
+ }
+#endif
while (len > 2) {
*out++ = *from++;
*out++ = *from++;
@@ -253,6 +386,21 @@
}
else {
from = out - dist; /* copy direct from output */
+#ifdef ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS
+ if (dist >= 8) {
+ while (1) {
+ *((uint64_t*)(out)) = *((uint64_t*)(from));
+ if (len <= 8) {
+ out += len;
+ break;
+ }
+ out += 8;
+ from += 8;
+ len -= 8;
+ }
+ continue;
+ }
+#endif
do { /* minimum length is three */
*out++ = *from++;
*out++ = *from++;
@@ -326,4 +474,4 @@
- Moving len -= 3 statement into middle of loop
*/
-#endif /* !ASMINF && !INFLATE_FAST64 */
+#endif /* !ASMINF && INFLATE_FAST64 */
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment