summaryrefslogtreecommitdiffstats
path: root/src/3rdparty/libjpeg/src/jcphuff.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/3rdparty/libjpeg/src/jcphuff.c')
-rw-r--r--src/3rdparty/libjpeg/src/jcphuff.c675
1 files changed, 473 insertions, 202 deletions
diff --git a/src/3rdparty/libjpeg/src/jcphuff.c b/src/3rdparty/libjpeg/src/jcphuff.c
index 046e2e18d4..024d3af0fb 100644
--- a/src/3rdparty/libjpeg/src/jcphuff.c
+++ b/src/3rdparty/libjpeg/src/jcphuff.c
@@ -4,7 +4,8 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1995-1997, Thomas G. Lane.
* libjpeg-turbo Modifications:
- * Copyright (C) 2015, D. R. Commander.
+ * Copyright (C) 2011, 2015, 2018, D. R. Commander.
+ * Copyright (C) 2016, 2018, Matthieu Darbois.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -18,15 +19,69 @@
#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
-#include "jchuff.h" /* Declarations shared with jchuff.c */
+#include "jsimd.h"
+#include "jconfigint.h"
+#include <limits.h>
+
+#ifdef HAVE_INTRIN_H
+#include <intrin.h>
+#ifdef _MSC_VER
+#ifdef HAVE_BITSCANFORWARD64
+#pragma intrinsic(_BitScanForward64)
+#endif
+#ifdef HAVE_BITSCANFORWARD
+#pragma intrinsic(_BitScanForward)
+#endif
+#endif
+#endif
#ifdef C_PROGRESSIVE_SUPPORTED
+/*
+ * NOTE: If USE_CLZ_INTRINSIC is defined, then clz/bsr instructions will be
+ * used for bit counting rather than the lookup table. This will reduce the
+ * memory footprint by 64k, which is important for some mobile applications
+ * that create many isolated instances of libjpeg-turbo (web browsers, for
+ * instance.) This may improve performance on some mobile platforms as well.
+ * This feature is enabled by default only on ARM processors, because some x86
+ * chips have a slow implementation of bsr, and the use of clz/bsr cannot be
+ * shown to have a significant performance impact even on the x86 chips that
+ * have a fast implementation of it. When building for ARMv6, you can
+ * explicitly disable the use of clz/bsr by adding -mthumb to the compiler
+ * flags (this defines __thumb__).
+ */
+
+/* NOTE: Both GCC and Clang define __GNUC__ */
+#if defined __GNUC__ && (defined __arm__ || defined __aarch64__)
+#if !defined __thumb__ || defined __thumb2__
+#define USE_CLZ_INTRINSIC
+#endif
+#endif
+
+/*
+ * JPEG_NBITS(x) yields the number of bits needed to represent the magnitude
+ * x, with JPEG_NBITS(0) == 0.  JPEG_NBITS_NONZERO(x) is the variant for
+ * callers that already know x != 0; in the clz-based build it must not be
+ * given 0.  The clz-based form assumes a 32-bit unsigned int, and the
+ * clz-based JPEG_NBITS() evaluates its argument twice, so the argument
+ * should be free of side effects.
+ */
+#ifdef USE_CLZ_INTRINSIC
+#define JPEG_NBITS_NONZERO(x) (32 - __builtin_clz(x))
+#define JPEG_NBITS(x) ((x) ? JPEG_NBITS_NONZERO(x) : 0)
+#else
+#include "jpeg_nbits_table.h"
+#define JPEG_NBITS(x) (jpeg_nbits_table[x])
+#define JPEG_NBITS_NONZERO(x) JPEG_NBITS(x)
+#endif
+
+
/* Expanded entropy encoder object for progressive Huffman encoding. */
typedef struct {
struct jpeg_entropy_encoder pub; /* public fields */
+ /* Pointer to routine to prepare data for encode_mcu_AC_first() */
+ void (*AC_first_prepare) (const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *values, size_t *zerobits);
+ /* Pointer to routine to prepare data for encode_mcu_AC_refine() */
+ int (*AC_refine_prepare) (const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *absvalues, size_t *bits);
+
/* Mode flag: TRUE for optimization, FALSE for actual data output */
boolean gather_statistics;
@@ -79,26 +134,60 @@ typedef phuff_entropy_encoder *phuff_entropy_ptr;
#ifdef RIGHT_SHIFT_IS_UNSIGNED
#define ISHIFT_TEMPS int ishift_temp;
-#define IRIGHT_SHIFT(x,shft) \
- ((ishift_temp = (x)) < 0 ? \
- (ishift_temp >> (shft)) | ((~0) << (16-(shft))) : \
- (ishift_temp >> (shft)))
+#define IRIGHT_SHIFT(x, shft) \
+ ((ishift_temp = (x)) < 0 ? \
+ (ishift_temp >> (shft)) | ((~0) << (16 - (shft))) : \
+ (ishift_temp >> (shft)))
#else
#define ISHIFT_TEMPS
-#define IRIGHT_SHIFT(x,shft) ((x) >> (shft))
+#define IRIGHT_SHIFT(x, shft) ((x) >> (shft))
#endif
+/* Round v up to the next multiple of p; p must be a power of two. */
+#define PAD(v, p) (((v) + (p) - 1) & (~((p) - 1)))
+
/* Forward declarations */
-METHODDEF(boolean) encode_mcu_DC_first (j_compress_ptr cinfo,
+METHODDEF(boolean) encode_mcu_DC_first(j_compress_ptr cinfo,
+ JBLOCKROW *MCU_data);
+METHODDEF(void) encode_mcu_AC_first_prepare
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ JCOEF *values, size_t *zerobits);
+METHODDEF(boolean) encode_mcu_AC_first(j_compress_ptr cinfo,
+ JBLOCKROW *MCU_data);
+METHODDEF(boolean) encode_mcu_DC_refine(j_compress_ptr cinfo,
JBLOCKROW *MCU_data);
-METHODDEF(boolean) encode_mcu_AC_first (j_compress_ptr cinfo,
+METHODDEF(int) encode_mcu_AC_refine_prepare
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ JCOEF *absvalues, size_t *bits);
+METHODDEF(boolean) encode_mcu_AC_refine(j_compress_ptr cinfo,
JBLOCKROW *MCU_data);
-METHODDEF(boolean) encode_mcu_DC_refine (j_compress_ptr cinfo,
- JBLOCKROW *MCU_data);
-METHODDEF(boolean) encode_mcu_AC_refine (j_compress_ptr cinfo,
- JBLOCKROW *MCU_data);
-METHODDEF(void) finish_pass_phuff (j_compress_ptr cinfo);
-METHODDEF(void) finish_pass_gather_phuff (j_compress_ptr cinfo);
+METHODDEF(void) finish_pass_phuff(j_compress_ptr cinfo);
+METHODDEF(void) finish_pass_gather_phuff(j_compress_ptr cinfo);
+
+
+/*
+ * Count the trailing zero bits of *x and shift them out.
+ *
+ * Returns the number of consecutive zero bits at the least-significant end
+ * of *x and right-shifts *x by that amount, so that bit 0 of *x is set on
+ * return.  *x must be nonzero on entry: the bit-scan intrinsics do not
+ * produce a usable index for a zero argument, and the portable fallback loop
+ * would never terminate.
+ */
+INLINE
+METHODDEF(int)
+count_zeroes(size_t *x)
+{
+#if defined(HAVE_BUILTIN_CTZL)
+  int result;
+
+  result = __builtin_ctzl(*x);
+  *x >>= result;
+#elif defined(HAVE_BITSCANFORWARD64)
+  /* _BitScanForward64() stores the bit index through an unsigned long * */
+  unsigned long result;
+
+  _BitScanForward64(&result, *x);
+  *x >>= result;
+#elif defined(HAVE_BITSCANFORWARD)
+  /* _BitScanForward() stores the bit index through an unsigned long * */
+  unsigned long result;
+
+  _BitScanForward(&result, *x);
+  *x >>= result;
+#else
+  int result = 0;
+
+  while ((*x & 1) == 0) {
+    ++result;
+    *x >>= 1;
+  }
+#endif
+  return (int)result;
+}
/*
@@ -106,9 +195,9 @@ METHODDEF(void) finish_pass_gather_phuff (j_compress_ptr cinfo);
*/
METHODDEF(void)
-start_pass_phuff (j_compress_ptr cinfo, boolean gather_statistics)
+start_pass_phuff(j_compress_ptr cinfo, boolean gather_statistics)
{
- phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+ phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
boolean is_DC_band;
int ci, tbl;
jpeg_component_info *compptr;
@@ -126,15 +215,23 @@ start_pass_phuff (j_compress_ptr cinfo, boolean gather_statistics)
entropy->pub.encode_mcu = encode_mcu_DC_first;
else
entropy->pub.encode_mcu = encode_mcu_AC_first;
+ if (jsimd_can_encode_mcu_AC_first_prepare())
+ entropy->AC_first_prepare = jsimd_encode_mcu_AC_first_prepare;
+ else
+ entropy->AC_first_prepare = encode_mcu_AC_first_prepare;
} else {
if (is_DC_band)
entropy->pub.encode_mcu = encode_mcu_DC_refine;
else {
entropy->pub.encode_mcu = encode_mcu_AC_refine;
+ if (jsimd_can_encode_mcu_AC_refine_prepare())
+ entropy->AC_refine_prepare = jsimd_encode_mcu_AC_refine_prepare;
+ else
+ entropy->AC_refine_prepare = encode_mcu_AC_refine_prepare;
/* AC refinement needs a correction bit buffer */
if (entropy->bit_buffer == NULL)
entropy->bit_buffer = (char *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
MAX_CORR_BITS * sizeof(char));
}
}
@@ -167,14 +264,14 @@ start_pass_phuff (j_compress_ptr cinfo, boolean gather_statistics)
/* Note that jpeg_gen_optimal_table expects 257 entries in each table! */
if (entropy->count_ptrs[tbl] == NULL)
entropy->count_ptrs[tbl] = (long *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
257 * sizeof(long));
MEMZERO(entropy->count_ptrs[tbl], 257 * sizeof(long));
} else {
/* Compute derived values for Huffman table */
/* We may do this more than once for a table, but it's not expensive */
jpeg_make_c_derived_tbl(cinfo, is_DC_band, tbl,
- & entropy->derived_tbls[tbl]);
+ &entropy->derived_tbls[tbl]);
}
}
@@ -198,19 +295,20 @@ start_pass_phuff (j_compress_ptr cinfo, boolean gather_statistics)
*/
/* Emit a byte */
-#define emit_byte(entropy,val) \
- { *(entropy)->next_output_byte++ = (JOCTET) (val); \
- if (--(entropy)->free_in_buffer == 0) \
- dump_buffer(entropy); }
+#define emit_byte(entropy, val) { \
+ *(entropy)->next_output_byte++ = (JOCTET)(val); \
+ if (--(entropy)->free_in_buffer == 0) \
+ dump_buffer(entropy); \
+}
LOCAL(void)
-dump_buffer (phuff_entropy_ptr entropy)
+dump_buffer(phuff_entropy_ptr entropy)
/* Empty the output buffer; we do not support suspension in this module. */
{
struct jpeg_destination_mgr *dest = entropy->cinfo->dest;
- if (! (*dest->empty_output_buffer) (entropy->cinfo))
+ if (!(*dest->empty_output_buffer) (entropy->cinfo))
ERREXIT(entropy->cinfo, JERR_CANT_SUSPEND);
/* After a successful buffer dump, must reset buffer pointers */
entropy->next_output_byte = dest->next_output_byte;
@@ -227,11 +325,11 @@ dump_buffer (phuff_entropy_ptr entropy)
*/
LOCAL(void)
-emit_bits (phuff_entropy_ptr entropy, unsigned int code, int size)
+emit_bits(phuff_entropy_ptr entropy, unsigned int code, int size)
/* Emit some bits, unless we are in gather mode */
{
/* This routine is heavily used, so it's worth coding tightly. */
- register size_t put_buffer = (size_t) code;
+ register size_t put_buffer = (size_t)code;
register int put_bits = entropy->put_bits;
/* if size is 0, caller used an invalid Huffman table entry */
@@ -241,7 +339,7 @@ emit_bits (phuff_entropy_ptr entropy, unsigned int code, int size)
if (entropy->gather_statistics)
return; /* do nothing if we're only getting stats */
- put_buffer &= (((size_t) 1)<<size) - 1; /* mask off any extra bits in code */
+ put_buffer &= (((size_t)1) << size) - 1; /* mask off any extra bits in code */
put_bits += size; /* new number of bits in buffer */
@@ -250,7 +348,7 @@ emit_bits (phuff_entropy_ptr entropy, unsigned int code, int size)
put_buffer |= entropy->put_buffer; /* and merge with old buffer contents */
while (put_bits >= 8) {
- int c = (int) ((put_buffer >> 16) & 0xFF);
+ int c = (int)((put_buffer >> 16) & 0xFF);
emit_byte(entropy, c);
if (c == 0xFF) { /* need to stuff a zero byte? */
@@ -266,7 +364,7 @@ emit_bits (phuff_entropy_ptr entropy, unsigned int code, int size)
LOCAL(void)
-flush_bits (phuff_entropy_ptr entropy)
+flush_bits(phuff_entropy_ptr entropy)
{
emit_bits(entropy, 0x7F, 7); /* fill any partial byte with ones */
entropy->put_buffer = 0; /* and reset bit-buffer to empty */
@@ -279,7 +377,7 @@ flush_bits (phuff_entropy_ptr entropy)
*/
LOCAL(void)
-emit_symbol (phuff_entropy_ptr entropy, int tbl_no, int symbol)
+emit_symbol(phuff_entropy_ptr entropy, int tbl_no, int symbol)
{
if (entropy->gather_statistics)
entropy->count_ptrs[tbl_no][symbol]++;
@@ -295,14 +393,14 @@ emit_symbol (phuff_entropy_ptr entropy, int tbl_no, int symbol)
*/
LOCAL(void)
-emit_buffered_bits (phuff_entropy_ptr entropy, char *bufstart,
- unsigned int nbits)
+emit_buffered_bits(phuff_entropy_ptr entropy, char *bufstart,
+ unsigned int nbits)
{
if (entropy->gather_statistics)
return; /* no real work */
while (nbits > 0) {
- emit_bits(entropy, (unsigned int) (*bufstart), 1);
+ emit_bits(entropy, (unsigned int)(*bufstart), 1);
bufstart++;
nbits--;
}
@@ -314,15 +412,13 @@ emit_buffered_bits (phuff_entropy_ptr entropy, char *bufstart,
*/
LOCAL(void)
-emit_eobrun (phuff_entropy_ptr entropy)
+emit_eobrun(phuff_entropy_ptr entropy)
{
register int temp, nbits;
if (entropy->EOBRUN > 0) { /* if there is any pending EOBRUN */
temp = entropy->EOBRUN;
- nbits = 0;
- while ((temp >>= 1))
- nbits++;
+ nbits = JPEG_NBITS_NONZERO(temp) - 1;
/* safety check: shouldn't happen given limited correction-bit buffer */
if (nbits > 14)
ERREXIT(entropy->cinfo, JERR_HUFF_MISSING_CODE);
@@ -345,13 +441,13 @@ emit_eobrun (phuff_entropy_ptr entropy)
*/
LOCAL(void)
-emit_restart (phuff_entropy_ptr entropy, int restart_num)
+emit_restart(phuff_entropy_ptr entropy, int restart_num)
{
int ci;
emit_eobrun(entropy);
- if (! entropy->gather_statistics) {
+ if (!entropy->gather_statistics) {
flush_bits(entropy);
emit_byte(entropy, 0xFF);
emit_byte(entropy, JPEG_RST0 + restart_num);
@@ -375,10 +471,10 @@ emit_restart (phuff_entropy_ptr entropy, int restart_num)
*/
METHODDEF(boolean)
-encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_DC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
{
- phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
- register int temp, temp2;
+ phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
+ register int temp, temp2, temp3;
register int nbits;
int blkn, ci;
int Al = cinfo->Al;
@@ -403,31 +499,31 @@ encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
/* Compute the DC value after the required point transform by Al.
* This is simply an arithmetic right shift.
*/
- temp2 = IRIGHT_SHIFT((int) ((*block)[0]), Al);
+ temp2 = IRIGHT_SHIFT((int)((*block)[0]), Al);
/* DC differences are figured on the point-transformed values. */
temp = temp2 - entropy->last_dc_val[ci];
entropy->last_dc_val[ci] = temp2;
/* Encode the DC coefficient difference per section G.1.2.1 */
- temp2 = temp;
- if (temp < 0) {
- temp = -temp; /* temp is abs value of input */
- /* For a negative input, want temp2 = bitwise complement of abs(input) */
- /* This code assumes we are on a two's complement machine */
- temp2--;
- }
+
+ /* This is a well-known technique for obtaining the absolute value without
+ * a branch. It is derived from an assembly language technique presented
+ * in "How to Optimize for the Pentium Processors", Copyright (c) 1996,
+ * 1997 by Agner Fog.
+ */
+ temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
+ temp ^= temp3;
+ temp -= temp3; /* temp is abs value of input */
+ /* For a negative input, want temp2 = bitwise complement of abs(input) */
+ temp2 = temp ^ temp3;
/* Find the number of bits needed for the magnitude of the coefficient */
- nbits = 0;
- while (temp) {
- nbits++;
- temp >>= 1;
- }
+ nbits = JPEG_NBITS(temp);
/* Check for out-of-range coefficient values.
* Since we're encoding a difference, the range limit is twice as much.
*/
- if (nbits > MAX_COEF_BITS+1)
+ if (nbits > MAX_COEF_BITS + 1)
ERREXIT(cinfo, JERR_BAD_DCT_COEF);
/* Count/emit the Huffman-coded symbol for the number of bits */
@@ -436,7 +532,7 @@ encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
/* Emit that number of bits of the value, if positive, */
/* or the complement of its magnitude, if negative. */
if (nbits) /* emit_bits rejects calls with size 0 */
- emit_bits(entropy, (unsigned int) temp2, nbits);
+ emit_bits(entropy, (unsigned int)temp2, nbits);
}
cinfo->dest->next_output_byte = entropy->next_output_byte;
@@ -457,20 +553,115 @@ encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
/*
+ * Data preparation for encode_mcu_AC_first().
+ */
+
+/*
+ * Loop shared by both halves of encode_mcu_AC_first_prepare().  It expands in
+ * a scope that must provide block, jpeg_natural_order_start, values, Al, k,
+ * temp, temp2 and zerobits.
+ *
+ * For each k in [0, Sl) whose coefficient block[jpeg_natural_order_start[k]]
+ * is still nonzero after its absolute value has been shifted right by Al:
+ *   values[k]            receives the shifted absolute value,
+ *   values[k + DCTSIZE2] receives the bits to emit (the value itself, or the
+ *                        bitwise complement of its magnitude if negative),
+ *   bit k of zerobits    is set.
+ * Entries for all other k are left unwritten.
+ *
+ * NOTE: the sign mask (temp2) relies on >> of a negative int behaving as an
+ * arithmetic shift, which is implementation-defined in C.
+ */
+#define COMPUTE_ABSVALUES_AC_FIRST(Sl) { \
+ for (k = 0; k < Sl; k++) { \
+ temp = block[jpeg_natural_order_start[k]]; \
+ if (temp == 0) \
+ continue; \
+ /* We must apply the point transform by Al. For AC coefficients this \
+ * is an integer division with rounding towards 0. To do this portably \
+ * in C, we shift after obtaining the absolute value; so the code is \
+ * interwoven with finding the abs value (temp) and output bits (temp2). \
+ */ \
+ temp2 = temp >> (CHAR_BIT * sizeof(int) - 1); \
+ temp ^= temp2; \
+ temp -= temp2; /* temp is abs value of input */ \
+ temp >>= Al; /* apply the point transform */ \
+ /* Watch out for case that nonzero coef is zero after point transform */ \
+ if (temp == 0) \
+ continue; \
+ /* For a negative coef, want temp2 = bitwise complement of abs(coef) */ \
+ temp2 ^= temp; \
+ values[k] = temp; \
+ values[k + DCTSIZE2] = temp2; \
+ zerobits |= ((size_t)1U) << k; \
+ } \
+}
+
+/*
+ * Non-SIMD implementation of the AC_first_prepare method; start_pass_phuff()
+ * installs it when jsimd_can_encode_mcu_AC_first_prepare() reports false.
+ *
+ * block                     coefficients of the block being encoded
+ * jpeg_natural_order_start  indices into block of the Sl coefficients to scan
+ * Sl                        number of coefficients to scan
+ * Al                        right shift applied to each absolute value
+ * values                    output: values[k] is the shifted absolute value
+ *                           and values[k + DCTSIZE2] the bits to emit, written
+ *                           only for coefficients that remain nonzero
+ * bits                      output bitmap of the coefficients that remain
+ *                           nonzero.  bits[0] holds one bit per coefficient;
+ *                           when SIZEOF_SIZE_T == 4 it covers only the first
+ *                           32 coefficients and bits[1] covers the rest.
+ *
+ * The COMPUTE_ABSVALUES_AC_FIRST() expansions use the locals k, temp, temp2
+ * and zerobits by name, so those names must not be changed independently.
+ */
+METHODDEF(void)
+encode_mcu_AC_first_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *values, size_t *bits)
+{
+ register int k, temp, temp2;
+ size_t zerobits = 0U;
+ int Sl0 = Sl;
+
+#if SIZEOF_SIZE_T == 4
+ /* A 32-bit bitmap can only describe the first 32 coefficients */
+ if (Sl0 > 32)
+ Sl0 = 32;
+#endif
+
+ COMPUTE_ABSVALUES_AC_FIRST(Sl0);
+
+ bits[0] = zerobits;
+#if SIZEOF_SIZE_T == 4
+ zerobits = 0U;
+
+ if (Sl > 32) {
+ /* Rebase the inputs and output so that k again starts at 0 */
+ Sl -= 32;
+ jpeg_natural_order_start += 32;
+ values += 32;
+
+ COMPUTE_ABSVALUES_AC_FIRST(Sl);
+ }
+ bits[1] = zerobits;
+#endif
+}
+
+/*
* MCU encoding for AC initial scan (either spectral selection,
* or first pass of successive approximation).
*/
+/*
+ * Emission loop used by encode_mcu_AC_first().  It expands in a scope that
+ * must provide cinfo, entropy, zerobits, cvalue, r, temp, temp2 and nbits.
+ *
+ * While zerobits still has a set bit, count_zeroes() yields the number of
+ * skipped coefficients (r) and cvalue is advanced past them.  The loop then
+ * emits one 0xF0 symbol per full run of 16, the (run << 4) + nbits symbol,
+ * and nbits bits taken from cvalue[DCTSIZE2].
+ *
+ * The "label" argument is pasted immediately before the coefficient is read.
+ * Passing a real label lets the caller jump into the loop body with r and
+ * cvalue already set up; pass "(void)0;" when no entry point is needed.
+ */
+#define ENCODE_COEFS_AC_FIRST(label) { \
+ while (zerobits) { \
+ r = count_zeroes(&zerobits); \
+ cvalue += r; \
+label \
+ temp = cvalue[0]; \
+ temp2 = cvalue[DCTSIZE2]; \
+ \
+ /* if run length > 15, must emit special run-length-16 codes (0xF0) */ \
+ while (r > 15) { \
+ emit_symbol(entropy, entropy->ac_tbl_no, 0xF0); \
+ r -= 16; \
+ } \
+ \
+ /* Find the number of bits needed for the magnitude of the coefficient */ \
+ nbits = JPEG_NBITS_NONZERO(temp); /* there must be at least one 1 bit */ \
+ /* Check for out-of-range coefficient values */ \
+ if (nbits > MAX_COEF_BITS) \
+ ERREXIT(cinfo, JERR_BAD_DCT_COEF); \
+ \
+ /* Count/emit Huffman symbol for run length / number of bits */ \
+ emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + nbits); \
+ \
+ /* Emit that number of bits of the value, if positive, */ \
+ /* or the complement of its magnitude, if negative. */ \
+ emit_bits(entropy, (unsigned int)temp2, nbits); \
+ \
+ cvalue++; \
+ zerobits >>= 1; \
+ } \
+}
+
METHODDEF(boolean)
-encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_AC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
{
- phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+ phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
register int temp, temp2;
- register int nbits;
- register int r, k;
- int Se = cinfo->Se;
+ register int nbits, r;
+ int Sl = cinfo->Se - cinfo->Ss + 1;
int Al = cinfo->Al;
- JBLOCKROW block;
+ JCOEF values_unaligned[2 * DCTSIZE2 + 15];
+ JCOEF *values;
+ const JCOEF *cvalue;
+ size_t zerobits;
+ size_t bits[8 / SIZEOF_SIZE_T];
entropy->next_output_byte = cinfo->dest->next_output_byte;
entropy->free_in_buffer = cinfo->dest->free_in_buffer;
@@ -480,66 +671,48 @@ encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
if (entropy->restarts_to_go == 0)
emit_restart(entropy, entropy->next_restart_num);
- /* Encode the MCU data block */
- block = MCU_data[0];
+#ifdef WITH_SIMD
+ cvalue = values = (JCOEF *)PAD((size_t)values_unaligned, 16);
+#else
+ /* Not using SIMD, so alignment is not needed */
+ cvalue = values = values_unaligned;
+#endif
- /* Encode the AC coefficients per section G.1.2.2, fig. G.3 */
+ /* Prepare data */
+ entropy->AC_first_prepare(MCU_data[0][0], jpeg_natural_order + cinfo->Ss,
+ Sl, Al, values, bits);
- r = 0; /* r = run length of zeros */
-
- for (k = cinfo->Ss; k <= Se; k++) {
- if ((temp = (*block)[jpeg_natural_order[k]]) == 0) {
- r++;
- continue;
- }
- /* We must apply the point transform by Al. For AC coefficients this
- * is an integer division with rounding towards 0. To do this portably
- * in C, we shift after obtaining the absolute value; so the code is
- * interwoven with finding the abs value (temp) and output bits (temp2).
- */
- if (temp < 0) {
- temp = -temp; /* temp is abs value of input */
- temp >>= Al; /* apply the point transform */
- /* For a negative coef, want temp2 = bitwise complement of abs(coef) */
- temp2 = ~temp;
- } else {
- temp >>= Al; /* apply the point transform */
- temp2 = temp;
- }
- /* Watch out for case that nonzero coef is zero after point transform */
- if (temp == 0) {
- r++;
- continue;
- }
+ zerobits = bits[0];
+#if SIZEOF_SIZE_T == 4
+ zerobits |= bits[1];
+#endif
- /* Emit any pending EOBRUN */
- if (entropy->EOBRUN > 0)
- emit_eobrun(entropy);
- /* if run length > 15, must emit special run-length-16 codes (0xF0) */
- while (r > 15) {
- emit_symbol(entropy, entropy->ac_tbl_no, 0xF0);
- r -= 16;
- }
+ /* Emit any pending EOBRUN */
+ if (zerobits && (entropy->EOBRUN > 0))
+ emit_eobrun(entropy);
- /* Find the number of bits needed for the magnitude of the coefficient */
- nbits = 1; /* there must be at least one 1 bit */
- while ((temp >>= 1))
- nbits++;
- /* Check for out-of-range coefficient values */
- if (nbits > MAX_COEF_BITS)
- ERREXIT(cinfo, JERR_BAD_DCT_COEF);
+#if SIZEOF_SIZE_T == 4
+ zerobits = bits[0];
+#endif
- /* Count/emit Huffman symbol for run length / number of bits */
- emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + nbits);
+ /* Encode the AC coefficients per section G.1.2.2, fig. G.3 */
- /* Emit that number of bits of the value, if positive, */
- /* or the complement of its magnitude, if negative. */
- emit_bits(entropy, (unsigned int) temp2, nbits);
+ ENCODE_COEFS_AC_FIRST((void)0;);
- r = 0; /* reset zero run length */
+#if SIZEOF_SIZE_T == 4
+ zerobits = bits[1];
+ if (zerobits) {
+ int diff = ((values + DCTSIZE2 / 2) - cvalue);
+ r = count_zeroes(&zerobits);
+ r += diff;
+ cvalue += r;
+ goto first_iter_ac_first;
}
- if (r > 0) { /* If there are trailing zeroes, */
+ ENCODE_COEFS_AC_FIRST(first_iter_ac_first:);
+#endif
+
+ if (cvalue < (values + Sl)) { /* If there are trailing zeroes, */
entropy->EOBRUN++; /* count an EOB */
if (entropy->EOBRUN == 0x7FFF)
emit_eobrun(entropy); /* force it out to avoid overflow */
@@ -569,9 +742,9 @@ encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
*/
METHODDEF(boolean)
-encode_mcu_DC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_DC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
{
- phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+ phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
register int temp;
int blkn;
int Al = cinfo->Al;
@@ -591,7 +764,7 @@ encode_mcu_DC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
/* We simply emit the Al'th bit of the DC coefficient value. */
temp = (*block)[0];
- emit_bits(entropy, (unsigned int) (temp >> Al), 1);
+ emit_bits(entropy, (unsigned int)(temp >> Al), 1);
}
cinfo->dest->next_output_byte = entropy->next_output_byte;
@@ -612,22 +785,148 @@ encode_mcu_DC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
/*
+ * Data preparation for encode_mcu_AC_refine().
+ */
+
+/*
+ * Loop shared by both halves of encode_mcu_AC_refine_prepare().  It expands
+ * in a scope that must provide block, jpeg_natural_order_start, absvalues,
+ * Al, k, temp, temp2, zerobits, signbits and EOB.
+ *
+ * For each k in [0, Sl), absvalues[k] receives the coefficient's absolute
+ * value shifted right by Al (zero values are stored too).  When that value is
+ * nonzero, bit k of zerobits is set, and bit k of signbits is set if the
+ * coefficient is non-negative.  EOB is updated to k + koffset each time the
+ * stored value equals 1.
+ *
+ * NOTE: the sign mask (temp2) relies on >> of a negative int behaving as an
+ * arithmetic shift, which is implementation-defined in C.
+ */
+#define COMPUTE_ABSVALUES_AC_REFINE(Sl, koffset) { \
+ /* It is convenient to make a pre-pass to determine the transformed \
+ * coefficients' absolute values and the EOB position. \
+ */ \
+ for (k = 0; k < Sl; k++) { \
+ temp = block[jpeg_natural_order_start[k]]; \
+ /* We must apply the point transform by Al. For AC coefficients this \
+ * is an integer division with rounding towards 0. To do this portably \
+ * in C, we shift after obtaining the absolute value. \
+ */ \
+ temp2 = temp >> (CHAR_BIT * sizeof(int) - 1); \
+ temp ^= temp2; \
+ temp -= temp2; /* temp is abs value of input */ \
+ temp >>= Al; /* apply the point transform */ \
+ if (temp != 0) { \
+ zerobits |= ((size_t)1U) << k; \
+ signbits |= ((size_t)(temp2 + 1)) << k; \
+ } \
+ absvalues[k] = (JCOEF)temp; /* save abs value for main pass */ \
+ if (temp == 1) \
+ EOB = k + koffset; /* EOB = index of last newly-nonzero coef */ \
+ } \
+}
+
+/*
+ * Non-SIMD implementation of the AC_refine_prepare method; start_pass_phuff()
+ * installs it when jsimd_can_encode_mcu_AC_refine_prepare() reports false.
+ *
+ * block                     coefficients of the block being encoded
+ * jpeg_natural_order_start  indices into block of the Sl coefficients to scan
+ * Sl                        number of coefficients to scan
+ * Al                        right shift applied to each absolute value
+ * absvalues                 output: shifted absolute value of every scanned
+ *                           coefficient, including zeros
+ * bits                      output bitmaps.  When SIZEOF_SIZE_T == 8, bits[0]
+ *                           is the nonzero map and bits[1] the sign map.
+ *                           Otherwise bits[0]/bits[1] are the nonzero map for
+ *                           coefficients 0-31 / 32 and up, and bits[2]/bits[3]
+ *                           are the matching sign maps.  A set sign bit marks
+ *                           a non-negative coefficient.
+ *
+ * Returns the index, relative to jpeg_natural_order_start, of the last
+ * coefficient whose shifted absolute value is exactly 1, or 0 if none is.
+ *
+ * The COMPUTE_ABSVALUES_AC_REFINE() expansions use the locals k, temp, temp2,
+ * zerobits, signbits and EOB by name.
+ */
+METHODDEF(int)
+encode_mcu_AC_refine_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *absvalues, size_t *bits)
+{
+ register int k, temp, temp2;
+ int EOB = 0;
+ size_t zerobits = 0U, signbits = 0U;
+ int Sl0 = Sl;
+
+#if SIZEOF_SIZE_T == 4
+ /* A 32-bit bitmap can only describe the first 32 coefficients */
+ if (Sl0 > 32)
+ Sl0 = 32;
+#endif
+
+ COMPUTE_ABSVALUES_AC_REFINE(Sl0, 0);
+
+ bits[0] = zerobits;
+#if SIZEOF_SIZE_T == 8
+ bits[1] = signbits;
+#else
+ bits[2] = signbits;
+
+ zerobits = 0U;
+ signbits = 0U;
+
+ if (Sl > 32) {
+ /* Rebase the inputs and output so that k again starts at 0; the koffset
+ * of 32 keeps EOB relative to the original start.
+ */
+ Sl -= 32;
+ jpeg_natural_order_start += 32;
+ absvalues += 32;
+
+ COMPUTE_ABSVALUES_AC_REFINE(Sl, 32);
+ }
+
+ bits[1] = zerobits;
+ bits[3] = signbits;
+#endif
+
+ return EOB;
+}
+
+
+/*
* MCU encoding for AC successive approximation refinement scan.
*/
+/*
+ * Emission loop used by encode_mcu_AC_refine().  It expands in a scope that
+ * must provide entropy, zerobits, signbits, cabsvalue, EOBPTR, r, temp, BR
+ * and BR_buffer.
+ *
+ * While zerobits still has a set bit, the zero coefficients in front of it
+ * are skipped: r and cabsvalue advance by their count and signbits is shifted
+ * in step.  ZRL symbols (0xF0) are emitted while r > 15 and the current
+ * coefficient is not past EOBPTR.  A coefficient whose absolute value is
+ * greater than 1 only appends a correction bit to BR_buffer.  Otherwise the
+ * loop emits any pending EOBRUN, the (r << 4) + 1 symbol, the sign bit and
+ * the buffered correction bits, then resets r and BR.
+ *
+ * The "label" argument is pasted immediately after the skip step.  Passing a
+ * real label lets the caller jump into the loop body with r, cabsvalue and
+ * signbits already set up; pass "(void)0;" when no entry point is needed.
+ */
+#define ENCODE_COEFS_AC_REFINE(label) { \
+ while (zerobits) { \
+ int idx = count_zeroes(&zerobits); \
+ r += idx; \
+ cabsvalue += idx; \
+ signbits >>= idx; \
+label \
+ /* Emit any required ZRLs, but not if they can be folded into EOB */ \
+ while (r > 15 && (cabsvalue <= EOBPTR)) { \
+ /* emit any pending EOBRUN and the BE correction bits */ \
+ emit_eobrun(entropy); \
+ /* Emit ZRL */ \
+ emit_symbol(entropy, entropy->ac_tbl_no, 0xF0); \
+ r -= 16; \
+ /* Emit buffered correction bits that must be associated with ZRL */ \
+ emit_buffered_bits(entropy, BR_buffer, BR); \
+ BR_buffer = entropy->bit_buffer; /* BE bits are gone now */ \
+ BR = 0; \
+ } \
+ \
+ temp = *cabsvalue++; \
+ \
+ /* If the coef was previously nonzero, it only needs a correction bit. \
+ * NOTE: a straight translation of the spec's figure G.7 would suggest \
+ * that we also need to test r > 15. But if r > 15, we can only get here \
+ * if k > EOB, which implies that this coefficient is not 1. \
+ */ \
+ if (temp > 1) { \
+ /* The correction bit is the next bit of the absolute value. */ \
+ BR_buffer[BR++] = (char)(temp & 1); \
+ signbits >>= 1; \
+ zerobits >>= 1; \
+ continue; \
+ } \
+ \
+ /* Emit any pending EOBRUN and the BE correction bits */ \
+ emit_eobrun(entropy); \
+ \
+ /* Count/emit Huffman symbol for run length / number of bits */ \
+ emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + 1); \
+ \
+ /* Emit output bit for newly-nonzero coef */ \
+ temp = signbits & 1; /* ((*block)[jpeg_natural_order_start[k]] < 0) ? 0 : 1 */ \
+ emit_bits(entropy, (unsigned int)temp, 1); \
+ \
+ /* Emit buffered correction bits that must be associated with this code */ \
+ emit_buffered_bits(entropy, BR_buffer, BR); \
+ BR_buffer = entropy->bit_buffer; /* BE bits are gone now */ \
+ BR = 0; \
+ r = 0; /* reset zero run length */ \
+ signbits >>= 1; \
+ zerobits >>= 1; \
+ } \
+}
+
METHODDEF(boolean)
-encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_AC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
{
- phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
- register int temp;
- register int r, k;
- int EOB;
+ phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
+ register int temp, r;
char *BR_buffer;
unsigned int BR;
- int Se = cinfo->Se;
+ int Sl = cinfo->Se - cinfo->Ss + 1;
int Al = cinfo->Al;
- JBLOCKROW block;
- int absvalues[DCTSIZE2];
+ JCOEF absvalues_unaligned[DCTSIZE2 + 15];
+ JCOEF *absvalues;
+ const JCOEF *cabsvalue, *EOBPTR;
+ size_t zerobits, signbits;
+ size_t bits[16 / SIZEOF_SIZE_T];
entropy->next_output_byte = cinfo->dest->next_output_byte;
entropy->free_in_buffer = cinfo->dest->free_in_buffer;
@@ -637,26 +936,17 @@ encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
if (entropy->restarts_to_go == 0)
emit_restart(entropy, entropy->next_restart_num);
- /* Encode the MCU data block */
- block = MCU_data[0];
+#ifdef WITH_SIMD
+ cabsvalue = absvalues = (JCOEF *)PAD((size_t)absvalues_unaligned, 16);
+#else
+ /* Not using SIMD, so alignment is not needed */
+ cabsvalue = absvalues = absvalues_unaligned;
+#endif
- /* It is convenient to make a pre-pass to determine the transformed
- * coefficients' absolute values and the EOB position.
- */
- EOB = 0;
- for (k = cinfo->Ss; k <= Se; k++) {
- temp = (*block)[jpeg_natural_order[k]];
- /* We must apply the point transform by Al. For AC coefficients this
- * is an integer division with rounding towards 0. To do this portably
- * in C, we shift after obtaining the absolute value.
- */
- if (temp < 0)
- temp = -temp; /* temp is abs value of input */
- temp >>= Al; /* apply the point transform */
- absvalues[k] = temp; /* save abs value for main pass */
- if (temp == 1)
- EOB = k; /* EOB = index of last newly-nonzero coef */
- }
+ /* Prepare data */
+ EOBPTR = absvalues +
+ entropy->AC_refine_prepare(MCU_data[0][0], jpeg_natural_order + cinfo->Ss,
+ Sl, Al, absvalues, bits);
/* Encode the AC coefficients per section G.1.2.3, fig. G.7 */
@@ -664,52 +954,32 @@ encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
BR = 0; /* BR = count of buffered bits added now */
BR_buffer = entropy->bit_buffer + entropy->BE; /* Append bits to buffer */
- for (k = cinfo->Ss; k <= Se; k++) {
- if ((temp = absvalues[k]) == 0) {
- r++;
- continue;
- }
-
- /* Emit any required ZRLs, but not if they can be folded into EOB */
- while (r > 15 && k <= EOB) {
- /* emit any pending EOBRUN and the BE correction bits */
- emit_eobrun(entropy);
- /* Emit ZRL */
- emit_symbol(entropy, entropy->ac_tbl_no, 0xF0);
- r -= 16;
- /* Emit buffered correction bits that must be associated with ZRL */
- emit_buffered_bits(entropy, BR_buffer, BR);
- BR_buffer = entropy->bit_buffer; /* BE bits are gone now */
- BR = 0;
- }
-
- /* If the coef was previously nonzero, it only needs a correction bit.
- * NOTE: a straight translation of the spec's figure G.7 would suggest
- * that we also need to test r > 15. But if r > 15, we can only get here
- * if k > EOB, which implies that this coefficient is not 1.
- */
- if (temp > 1) {
- /* The correction bit is the next bit of the absolute value. */
- BR_buffer[BR++] = (char) (temp & 1);
- continue;
- }
-
- /* Emit any pending EOBRUN and the BE correction bits */
- emit_eobrun(entropy);
-
- /* Count/emit Huffman symbol for run length / number of bits */
- emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + 1);
+ zerobits = bits[0];
+#if SIZEOF_SIZE_T == 8
+ signbits = bits[1];
+#else
+ signbits = bits[2];
+#endif
+ ENCODE_COEFS_AC_REFINE((void)0;);
+
+#if SIZEOF_SIZE_T == 4
+ zerobits = bits[1];
+ signbits = bits[3];
+
+ if (zerobits) {
+ int diff = ((absvalues + DCTSIZE2 / 2) - cabsvalue);
+ int idx = count_zeroes(&zerobits);
+ signbits >>= idx;
+ idx += diff;
+ r += idx;
+ cabsvalue += idx;
+ goto first_iter_ac_refine;
+ }
- /* Emit output bit for newly-nonzero coef */
- temp = ((*block)[jpeg_natural_order[k]] < 0) ? 0 : 1;
- emit_bits(entropy, (unsigned int) temp, 1);
+ ENCODE_COEFS_AC_REFINE(first_iter_ac_refine:);
+#endif
- /* Emit buffered correction bits that must be associated with this code */
- emit_buffered_bits(entropy, BR_buffer, BR);
- BR_buffer = entropy->bit_buffer; /* BE bits are gone now */
- BR = 0;
- r = 0; /* reset zero run length */
- }
+ r |= (int)((absvalues + Sl) - cabsvalue);
if (r > 0 || BR > 0) { /* If there are trailing zeroes, */
entropy->EOBRUN++; /* count an EOB */
@@ -718,7 +988,8 @@ encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
* 1. overflow of the EOB counter;
* 2. overflow of the correction bit buffer during the next MCU.
*/
- if (entropy->EOBRUN == 0x7FFF || entropy->BE > (MAX_CORR_BITS-DCTSIZE2+1))
+ if (entropy->EOBRUN == 0x7FFF ||
+ entropy->BE > (MAX_CORR_BITS - DCTSIZE2 + 1))
emit_eobrun(entropy);
}
@@ -744,9 +1015,9 @@ encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
*/
METHODDEF(void)
-finish_pass_phuff (j_compress_ptr cinfo)
+finish_pass_phuff(j_compress_ptr cinfo)
{
- phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+ phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
entropy->next_output_byte = cinfo->dest->next_output_byte;
entropy->free_in_buffer = cinfo->dest->free_in_buffer;
@@ -765,9 +1036,9 @@ finish_pass_phuff (j_compress_ptr cinfo)
*/
METHODDEF(void)
-finish_pass_gather_phuff (j_compress_ptr cinfo)
+finish_pass_gather_phuff(j_compress_ptr cinfo)
{
- phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+ phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
boolean is_DC_band;
int ci, tbl;
jpeg_component_info *compptr;
@@ -793,13 +1064,13 @@ finish_pass_gather_phuff (j_compress_ptr cinfo)
} else {
tbl = compptr->ac_tbl_no;
}
- if (! did[tbl]) {
+ if (!did[tbl]) {
if (is_DC_band)
- htblptr = & cinfo->dc_huff_tbl_ptrs[tbl];
+ htblptr = &cinfo->dc_huff_tbl_ptrs[tbl];
else
- htblptr = & cinfo->ac_huff_tbl_ptrs[tbl];
+ htblptr = &cinfo->ac_huff_tbl_ptrs[tbl];
if (*htblptr == NULL)
- *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
+ *htblptr = jpeg_alloc_huff_table((j_common_ptr)cinfo);
jpeg_gen_optimal_table(cinfo, *htblptr, entropy->count_ptrs[tbl]);
did[tbl] = TRUE;
}
@@ -812,15 +1083,15 @@ finish_pass_gather_phuff (j_compress_ptr cinfo)
*/
GLOBAL(void)
-jinit_phuff_encoder (j_compress_ptr cinfo)
+jinit_phuff_encoder(j_compress_ptr cinfo)
{
phuff_entropy_ptr entropy;
int i;
entropy = (phuff_entropy_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(phuff_entropy_encoder));
- cinfo->entropy = (struct jpeg_entropy_encoder *) entropy;
+ cinfo->entropy = (struct jpeg_entropy_encoder *)entropy;
entropy->pub.start_pass = start_pass_phuff;
/* Mark tables unallocated */