15 files changed, 728 insertions, 482 deletions
diff --git a/src/3rdparty/libwebp/src/dec/buffer.c b/src/3rdparty/libwebp/src/dec/buffer.c
index 42feac7..2129312 100644
--- a/src/3rdparty/libwebp/src/dec/buffer.c
+++ b/src/3rdparty/libwebp/src/dec/buffer.c
@@ -33,6 +33,11 @@ static int IsValidColorspace(int webp_csp_mode) {
   return (webp_csp_mode >= MODE_RGB && webp_csp_mode < MODE_LAST);
 }
 
+// Smallest valid plane size: the very last (or first, if flipped) row
+// doesn't require padding, so it only counts WIDTH (same unit as STRIDE).
+#define MIN_BUFFER_SIZE(WIDTH, HEIGHT, STRIDE)       \
+    ((uint64_t)(STRIDE) * ((HEIGHT) - 1) + (WIDTH))
+
 static VP8StatusCode CheckDecBuffer(const WebPDecBuffer* const buffer) {
   int ok = 1;
   const WEBP_CSP_MODE mode = buffer->colorspace;
@@ -42,20 +47,22 @@ static VP8StatusCode CheckDecBuffer(const WebPDecBuffer* const buffer) {
     ok = 0;
   } else if (!WebPIsRGBMode(mode)) {   // YUV checks
     const WebPYUVABuffer* const buf = &buffer->u.YUVA;
+    const int uv_width  = (width  + 1) / 2;
+    const int uv_height = (height + 1) / 2;
     const int y_stride = abs(buf->y_stride);
     const int u_stride = abs(buf->u_stride);
     const int v_stride = abs(buf->v_stride);
     const int a_stride = abs(buf->a_stride);
-    const uint64_t y_size = (uint64_t)y_stride * height;
-    const uint64_t u_size = (uint64_t)u_stride * ((height + 1) / 2);
-    const uint64_t v_size = (uint64_t)v_stride * ((height + 1) / 2);
-    const uint64_t a_size = (uint64_t)a_stride * height;
+    const uint64_t y_size = MIN_BUFFER_SIZE(width, height, y_stride);
+    const uint64_t u_size = MIN_BUFFER_SIZE(uv_width, uv_height, u_stride);
+    const uint64_t v_size = MIN_BUFFER_SIZE(uv_width, uv_height, v_stride);
+    const uint64_t a_size = MIN_BUFFER_SIZE(width, height, a_stride);
     ok &= (y_size <= buf->y_size);
     ok &= (u_size <= buf->u_size);
     ok &= (v_size <= buf->v_size);
     ok &= (y_stride >= width);
-    ok &= (u_stride >= (width + 1) / 2);
-    ok &= (v_stride >= (width + 1) / 2);
+    ok &= (u_stride >= uv_width);
+    ok &= (v_stride >= uv_width);
     ok &= (buf->y != NULL);
     ok &= (buf->u != NULL);
     ok &= (buf->v != NULL);
@@ -67,13 +74,16 @@ static VP8StatusCode CheckDecBuffer(const WebPDecBuffer* const buffer) {
   } else {    // RGB checks
     const WebPRGBABuffer* const buf = &buffer->u.RGBA;
     const int stride = abs(buf->stride);
-    const uint64_t size = (uint64_t)stride * height;
+    // The last row holds width * bpp bytes: size it in bytes, not pixels.
+    const uint64_t size =
+        MIN_BUFFER_SIZE(width * kModeBpp[mode], height, stride);
     ok &= (size <= buf->size);
     ok &= (stride >= width * kModeBpp[mode]);
     ok &= (buf->rgba != NULL);
   }
   return ok ? VP8_STATUS_OK : VP8_STATUS_INVALID_PARAM;
 }
+#undef MIN_BUFFER_SIZE
 
 static VP8StatusCode AllocateBuffer(WebPDecBuffer* const buffer) {
   const int w = buffer->width;
diff --git a/src/3rdparty/libwebp/src/dec/io.c b/src/3rdparty/libwebp/src/dec/io.c
index 8094e44..b2e72f0 100644
--- a/src/3rdparty/libwebp/src/dec/io.c
+++ b/src/3rdparty/libwebp/src/dec/io.c
@@ -322,37 +322,31 @@ static int InitYUVRescaler(const VP8Io* const io, WebPDecParams* const p) {
   const size_t work_size = 2 * out_width;   // scratch memory for luma rescaler
   const size_t uv_work_size = 2 * uv_out_width;  // and for each u/v ones
   size_t tmp_size;
-  int32_t* work;
+  rescaler_t* work;
 
   tmp_size = (work_size + 2 * uv_work_size) * sizeof(*work);
   if (has_alpha) {
     tmp_size += work_size * sizeof(*work);
   }
-  p->memory = WebPSafeCalloc(1ULL, tmp_size);
+  p->memory = WebPSafeMalloc(1ULL, tmp_size);
   if (p->memory == NULL) {
     return 0;   // memory error
   }
-  work = (int32_t*)p->memory;
+  work = (rescaler_t*)p->memory;
   WebPRescalerInit(&p->scaler_y, io->mb_w, io->mb_h,
                    buf->y, out_width, out_height, buf->y_stride, 1,
-                   io->mb_w, out_width, io->mb_h, out_height,
                    work);
   WebPRescalerInit(&p->scaler_u, uv_in_width, uv_in_height,
                    buf->u, uv_out_width, uv_out_height, buf->u_stride, 1,
-                   uv_in_width, uv_out_width,
-                   uv_in_height, uv_out_height,
                    work + work_size);
   WebPRescalerInit(&p->scaler_v, uv_in_width, uv_in_height,
                    buf->v, uv_out_width, uv_out_height, buf->v_stride, 1,
-                   uv_in_width, uv_out_width,
-                   uv_in_height, uv_out_height,
                    work + work_size + uv_work_size);
   p->emit = EmitRescaledYUV;
 
   if (has_alpha) {
     WebPRescalerInit(&p->scaler_a, io->mb_w, io->mb_h,
                      buf->a, out_width, out_height, buf->a_stride, 1,
-                     io->mb_w, out_width, io->mb_h, out_height,
                      work + work_size + 2 * uv_work_size);
     p->emit_alpha = EmitRescaledAlphaYUV;
     WebPInitAlphaProcessing();
@@ -375,9 +369,9 @@ static int ExportRGB(WebPDecParams* const p, int y_pos) {
          WebPRescalerHasPendingOutput(&p->scaler_u)) {
     assert(p->last_y + y_pos + num_lines_out < p->output->height);
     assert(p->scaler_u.y_accum == p->scaler_v.y_accum);
-    WebPRescalerExportRow(&p->scaler_y, 0);
-    WebPRescalerExportRow(&p->scaler_u, 0);
-    WebPRescalerExportRow(&p->scaler_v, 0);
+    WebPRescalerExportRow(&p->scaler_y);
+    WebPRescalerExportRow(&p->scaler_u);
+    WebPRescalerExportRow(&p->scaler_v);
     convert(p->scaler_y.dst, p->scaler_u.dst, p->scaler_v.dst,
             dst, p->scaler_y.dst_width);
     dst += buf->stride;
@@ -425,7 +419,7 @@ static int ExportAlpha(WebPDecParams* const p, int y_pos) {
   while (WebPRescalerHasPendingOutput(&p->scaler_a)) {
     int i;
     assert(p->last_y + y_pos + num_lines_out < p->output->height);
-    WebPRescalerExportRow(&p->scaler_a, 0);
+    WebPRescalerExportRow(&p->scaler_a);
     for (i = 0; i < width; ++i) {
       const uint32_t alpha_value = p->scaler_a.dst[i];
       dst[4 * i] = alpha_value;
@@ -458,7 +452,7 @@ static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos) {
   while (WebPRescalerHasPendingOutput(&p->scaler_a)) {
     int i;
     assert(p->last_y + y_pos + num_lines_out < p->output->height);
-    WebPRescalerExportRow(&p->scaler_a, 0);
+    WebPRescalerExportRow(&p->scaler_a);
     for (i = 0; i < width; ++i) {
       // Fill in the alpha value (converted to 4 bits).
       const uint32_t alpha_value = p->scaler_a.dst[i] >> 4;
@@ -495,7 +489,7 @@ static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) {
   const int uv_in_width  = (io->mb_w + 1) >> 1;
   const int uv_in_height = (io->mb_h + 1) >> 1;
   const size_t work_size = 2 * out_width;   // scratch memory for one rescaler
-  int32_t* work;  // rescalers work area
+  rescaler_t* work;  // rescalers work area
   uint8_t* tmp;   // tmp storage for scaled YUV444 samples before RGB conversion
   size_t tmp_size1, tmp_size2, total_size;
 
@@ -506,30 +500,26 @@ static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) {
     tmp_size2 += out_width;
   }
   total_size = tmp_size1 * sizeof(*work) + tmp_size2 * sizeof(*tmp);
-  p->memory = WebPSafeCalloc(1ULL, total_size);
+  p->memory = WebPSafeMalloc(1ULL, total_size);
   if (p->memory == NULL) {
     return 0;   // memory error
   }
-  work = (int32_t*)p->memory;
+  work = (rescaler_t*)p->memory;
   tmp = (uint8_t*)(work + tmp_size1);
   WebPRescalerInit(&p->scaler_y, io->mb_w, io->mb_h,
                    tmp + 0 * out_width, out_width, out_height, 0, 1,
-                   io->mb_w, out_width, io->mb_h, out_height,
                    work + 0 * work_size);
   WebPRescalerInit(&p->scaler_u, uv_in_width, uv_in_height,
                    tmp + 1 * out_width, out_width, out_height, 0, 1,
-                   io->mb_w, 2 * out_width, io->mb_h, 2 * out_height,
                    work + 1 * work_size);
   WebPRescalerInit(&p->scaler_v, uv_in_width, uv_in_height,
                    tmp + 2 * out_width, out_width, out_height, 0, 1,
-                   io->mb_w, 2 * out_width, io->mb_h, 2 * out_height,
                    work + 2 * work_size);
   p->emit = EmitRescaledRGB;
 
   if (has_alpha) {
     WebPRescalerInit(&p->scaler_a, io->mb_w, io->mb_h,
                      tmp + 3 * out_width, out_width, out_height, 0, 1,
-                     io->mb_w, out_width, io->mb_h, out_height,
                      work + 3 * work_size);
     p->emit_alpha = EmitRescaledAlphaRGB;
     if (p->output->colorspace == MODE_RGBA_4444 ||
diff --git a/src/3rdparty/libwebp/src/dec/vp8i.h b/src/3rdparty/libwebp/src/dec/vp8i.h
index a02d9ff..0e6c8f5 100644
--- a/src/3rdparty/libwebp/src/dec/vp8i.h
+++ b/src/3rdparty/libwebp/src/dec/vp8i.h
@@ -31,7 +31,7 @@ extern "C" {
 // version numbers
 #define DEC_MAJ_VERSION 0
 #define DEC_MIN_VERSION 4
-#define DEC_REV_VERSION 3
+#define DEC_REV_VERSION 4
 
 // intra prediction modes
 enum { B_DC_PRED = 0,   // 4x4 modes
diff --git a/src/3rdparty/libwebp/src/dec/vp8l.c b/src/3rdparty/libwebp/src/dec/vp8l.c
index e2780e5..2fa5f40 100644
--- a/src/3rdparty/libwebp/src/dec/vp8l.c
+++ b/src/3rdparty/libwebp/src/dec/vp8l.c
@@ -390,13 +390,13 @@ static int AllocateAndInitRescaler(VP8LDecoder* const dec, VP8Io* const io) {
   const int in_height = io->mb_h;
   const int out_height = io->scaled_height;
   const uint64_t work_size = 2 * num_channels * (uint64_t)out_width;
-  int32_t* work;        // Rescaler work area.
-  const uint64_t scaled_data_size = num_channels * (uint64_t)out_width;
+  rescaler_t* work;        // Rescaler work area.
+  const uint64_t scaled_data_size = (uint64_t)out_width;
   uint32_t* scaled_data;  // Temporary storage for scaled BGRA data.
   const uint64_t memory_size = sizeof(*dec->rescaler) +
                                work_size * sizeof(*work) +
                                scaled_data_size * sizeof(*scaled_data);
-  uint8_t* memory = (uint8_t*)WebPSafeCalloc(memory_size, sizeof(*memory));
+  uint8_t* memory = (uint8_t*)WebPSafeMalloc(memory_size, sizeof(*memory));
   if (memory == NULL) {
     dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
     return 0;
@@ -406,13 +406,12 @@ static int AllocateAndInitRescaler(VP8LDecoder* const dec, VP8Io* const io) {
 
   dec->rescaler = (WebPRescaler*)memory;
   memory += sizeof(*dec->rescaler);
-  work = (int32_t*)memory;
+  work = (rescaler_t*)memory;
   memory += work_size * sizeof(*work);
   scaled_data = (uint32_t*)memory;
 
   WebPRescalerInit(dec->rescaler, in_width, in_height, (uint8_t*)scaled_data,
-                   out_width, out_height, 0, num_channels,
-                   in_width, out_width, in_height, out_height, work);
+                   out_width, out_height, 0, num_channels, work);
   return 1;
 }
 
@@ -427,7 +426,7 @@ static int Export(WebPRescaler* const rescaler, WEBP_CSP_MODE colorspace,
   int num_lines_out = 0;
   while (WebPRescalerHasPendingOutput(rescaler)) {
     uint8_t* const dst = rgba + num_lines_out * rgba_stride;
-    WebPRescalerExportRow(rescaler, 0);
+    WebPRescalerExportRow(rescaler);
     WebPMultARGBRow(src, dst_width, 1);
     VP8LConvertFromBGRA(src, dst_width, colorspace, dst);
     ++num_lines_out;
@@ -545,7 +544,7 @@ static int ExportYUVA(const VP8LDecoder* const dec, int y_pos) {
   const int dst_width = rescaler->dst_width;
   int num_lines_out = 0;
   while (WebPRescalerHasPendingOutput(rescaler)) {
-    WebPRescalerExportRow(rescaler, 0);
+    WebPRescalerExportRow(rescaler);
     WebPMultARGBRow(src, dst_width, 1);
     ConvertToYUVA(src, dst_width, y_pos, dec->output_);
     ++y_pos;
diff --git a/src/3rdparty/libwebp/src/dsp/dec_neon.c b/src/3rdparty/libwebp/src/dsp/dec_neon.c
index 9c5bc1c..4afae07 100644
--- a/src/3rdparty/libwebp/src/dsp/dec_neon.c
+++ b/src/3rdparty/libwebp/src/dsp/dec_neon.c
@@ -24,24 +24,24 @@
 
 // Load/Store vertical edge
 #define LOAD8x4(c1, c2, c3, c4, b1, b2, stride)                                \
-  "vld4.8   {" #c1"[0], " #c2"[0], " #c3"[0], " #c4"[0]}," #b1 "," #stride"\n" \
-  "vld4.8   {" #c1"[1], " #c2"[1], " #c3"[1], " #c4"[1]}," #b2 "," #stride"\n" \
-  "vld4.8   {" #c1"[2], " #c2"[2], " #c3"[2], " #c4"[2]}," #b1 "," #stride"\n" \
-  "vld4.8   {" #c1"[3], " #c2"[3], " #c3"[3], " #c4"[3]}," #b2 "," #stride"\n" \
-  "vld4.8   {" #c1"[4], " #c2"[4], " #c3"[4], " #c4"[4]}," #b1 "," #stride"\n" \
-  "vld4.8   {" #c1"[5], " #c2"[5], " #c3"[5], " #c4"[5]}," #b2 "," #stride"\n" \
-  "vld4.8   {" #c1"[6], " #c2"[6], " #c3"[6], " #c4"[6]}," #b1 "," #stride"\n" \
-  "vld4.8   {" #c1"[7], " #c2"[7], " #c3"[7], " #c4"[7]}," #b2 "," #stride"\n"
+  "vld4.8 {" #c1 "[0]," #c2 "[0]," #c3 "[0]," #c4 "[0]}," #b1 "," #stride "\n" \
+  "vld4.8 {" #c1 "[1]," #c2 "[1]," #c3 "[1]," #c4 "[1]}," #b2 "," #stride "\n" \
+  "vld4.8 {" #c1 "[2]," #c2 "[2]," #c3 "[2]," #c4 "[2]}," #b1 "," #stride "\n" \
+  "vld4.8 {" #c1 "[3]," #c2 "[3]," #c3 "[3]," #c4 "[3]}," #b2 "," #stride "\n" \
+  "vld4.8 {" #c1 "[4]," #c2 "[4]," #c3 "[4]," #c4 "[4]}," #b1 "," #stride "\n" \
+  "vld4.8 {" #c1 "[5]," #c2 "[5]," #c3 "[5]," #c4 "[5]}," #b2 "," #stride "\n" \
+  "vld4.8 {" #c1 "[6]," #c2 "[6]," #c3 "[6]," #c4 "[6]}," #b1 "," #stride "\n" \
+  "vld4.8 {" #c1 "[7]," #c2 "[7]," #c3 "[7]," #c4 "[7]}," #b2 "," #stride "\n"
 
 #define STORE8x2(c1, c2, p, stride)                                            \
-  "vst2.8   {" #c1"[0], " #c2"[0]}," #p "," #stride " \n"                      \
-  "vst2.8   {" #c1"[1], " #c2"[1]}," #p "," #stride " \n"                      \
-  "vst2.8   {" #c1"[2], " #c2"[2]}," #p "," #stride " \n"                      \
-  "vst2.8   {" #c1"[3], " #c2"[3]}," #p "," #stride " \n"                      \
-  "vst2.8   {" #c1"[4], " #c2"[4]}," #p "," #stride " \n"                      \
-  "vst2.8   {" #c1"[5], " #c2"[5]}," #p "," #stride " \n"                      \
-  "vst2.8   {" #c1"[6], " #c2"[6]}," #p "," #stride " \n"                      \
-  "vst2.8   {" #c1"[7], " #c2"[7]}," #p "," #stride " \n"
+  "vst2.8   {" #c1 "[0], " #c2 "[0]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[1], " #c2 "[1]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[2], " #c2 "[2]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[3], " #c2 "[3]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[4], " #c2 "[4]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[5], " #c2 "[5]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[6], " #c2 "[6]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[7], " #c2 "[7]}," #p "," #stride " \n"
 
 #if !defined(WORK_AROUND_GCC)
 
diff --git a/src/3rdparty/libwebp/src/dsp/dsp.h b/src/3rdparty/libwebp/src/dsp/dsp.h
index 2409bae..a2c3951 100644
--- a/src/3rdparty/libwebp/src/dsp/dsp.h
+++ b/src/3rdparty/libwebp/src/dsp/dsp.h
@@ -36,14 +36,9 @@ extern "C" {
 # define LOCAL_GCC_PREREQ(maj, min) 0
 #endif
 
-#ifdef __clang__
-# define LOCAL_CLANG_VERSION ((__clang_major__ << 8) | __clang_minor__)
-# define LOCAL_CLANG_PREREQ(maj, min) \
-    (LOCAL_CLANG_VERSION >= (((maj) << 8) | (min)))
-#else
-# define LOCAL_CLANG_VERSION 0
-# define LOCAL_CLANG_PREREQ(maj, min) 0
-#endif  // __clang__
+#ifndef __has_builtin
+# define __has_builtin(x) 0
+#endif
 
 #if defined(_MSC_VER) && _MSC_VER > 1310 && \
     (defined(_M_X64) || defined(_M_IX86))
@@ -73,7 +68,8 @@ extern "C" {
 #define WEBP_USE_NEON
 #endif
 
-#if defined(__mips__) && !defined(__mips64) && (__mips_isa_rev < 6)
+#if defined(__mips__) && !defined(__mips64) && \
+    defined(__mips_isa_rev) && (__mips_isa_rev >= 1) && (__mips_isa_rev < 6)
 #define WEBP_USE_MIPS32
 #if (__mips_isa_rev >= 2)
 #define WEBP_USE_MIPS32_R2
diff --git a/src/3rdparty/libwebp/src/dsp/enc_mips32.c b/src/3rdparty/libwebp/src/dsp/enc_mips32.c
index def9a16..6cede18 100644
--- a/src/3rdparty/libwebp/src/dsp/enc_mips32.c
+++ b/src/3rdparty/libwebp/src/dsp/enc_mips32.c
@@ -34,26 +34,26 @@ static const int kC2 = 35468;
 // TEMP0..TEMP3 - registers for corresponding tmp elements
 // TEMP4..TEMP5 - temporary registers
 #define VERTICAL_PASS(A, B, C, D, TEMP4, TEMP0, TEMP1, TEMP2, TEMP3)        \
-  "lh      %[temp16],      "#A"(%[temp20])                 \n\t"            \
-  "lh      %[temp18],      "#B"(%[temp20])                 \n\t"            \
-  "lh      %[temp17],      "#C"(%[temp20])                 \n\t"            \
-  "lh      %[temp19],      "#D"(%[temp20])                 \n\t"            \
-  "addu    %["#TEMP4"],    %[temp16],      %[temp18]       \n\t"            \
-  "subu    %[temp16],      %[temp16],      %[temp18]       \n\t"            \
-  "mul     %["#TEMP0"],    %[temp17],      %[kC2]          \n\t"            \
-  "mul     %[temp18],      %[temp19],      %[kC1]          \n\t"            \
-  "mul     %[temp17],      %[temp17],      %[kC1]          \n\t"            \
-  "mul     %[temp19],      %[temp19],      %[kC2]          \n\t"            \
-  "sra     %["#TEMP0"],    %["#TEMP0"],    16              \n\n"            \
-  "sra     %[temp18],      %[temp18],      16              \n\n"            \
-  "sra     %[temp17],      %[temp17],      16              \n\n"            \
-  "sra     %[temp19],      %[temp19],      16              \n\n"            \
-  "subu    %["#TEMP2"],    %["#TEMP0"],    %[temp18]       \n\t"            \
-  "addu    %["#TEMP3"],    %[temp17],      %[temp19]       \n\t"            \
-  "addu    %["#TEMP0"],    %["#TEMP4"],    %["#TEMP3"]     \n\t"            \
-  "addu    %["#TEMP1"],    %[temp16],      %["#TEMP2"]     \n\t"            \
-  "subu    %["#TEMP2"],    %[temp16],      %["#TEMP2"]     \n\t"            \
-  "subu    %["#TEMP3"],    %["#TEMP4"],    %["#TEMP3"]     \n\t"
+  "lh      %[temp16],      " #A "(%[temp20])                 \n\t"          \
+  "lh      %[temp18],      " #B "(%[temp20])                 \n\t"          \
+  "lh      %[temp17],      " #C "(%[temp20])                 \n\t"          \
+  "lh      %[temp19],      " #D "(%[temp20])                 \n\t"          \
+  "addu    %[" #TEMP4 "],    %[temp16],      %[temp18]       \n\t"          \
+  "subu    %[temp16],      %[temp16],      %[temp18]         \n\t"          \
+  "mul     %[" #TEMP0 "],    %[temp17],      %[kC2]          \n\t"          \
+  "mul     %[temp18],      %[temp19],      %[kC1]            \n\t"          \
+  "mul     %[temp17],      %[temp17],      %[kC1]            \n\t"          \
+  "mul     %[temp19],      %[temp19],      %[kC2]            \n\t"          \
+  "sra     %[" #TEMP0 "],    %[" #TEMP0 "],    16            \n\n"          \
+  "sra     %[temp18],      %[temp18],      16                \n\n"          \
+  "sra     %[temp17],      %[temp17],      16                \n\n"          \
+  "sra     %[temp19],      %[temp19],      16                \n\n"          \
+  "subu    %[" #TEMP2 "],    %[" #TEMP0 "],    %[temp18]     \n\t"          \
+  "addu    %[" #TEMP3 "],    %[temp17],      %[temp19]       \n\t"          \
+  "addu    %[" #TEMP0 "],    %[" #TEMP4 "],    %[" #TEMP3 "] \n\t"          \
+  "addu    %[" #TEMP1 "],    %[temp16],      %[" #TEMP2 "]   \n\t"          \
+  "subu    %[" #TEMP2 "],    %[temp16],      %[" #TEMP2 "]   \n\t"          \
+  "subu    %[" #TEMP3 "],    %[" #TEMP4 "],    %[" #TEMP3 "] \n\t"
 
 // macro for one horizontal pass in ITransformOne
 // MUL and STORE macros inlined
@@ -61,59 +61,59 @@ static const int kC2 = 35468;
 // temp0..temp15 holds tmp[0]..tmp[15]
 // A..D - offsets in bytes to load from ref and store to dst buffer
 // TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
-#define HORIZONTAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12)            \
-  "addiu   %["#TEMP0"],    %["#TEMP0"],    4               \n\t"            \
-  "addu    %[temp16],      %["#TEMP0"],    %["#TEMP8"]     \n\t"            \
-  "subu    %[temp17],      %["#TEMP0"],    %["#TEMP8"]     \n\t"            \
-  "mul     %["#TEMP0"],    %["#TEMP4"],    %[kC2]          \n\t"            \
-  "mul     %["#TEMP8"],    %["#TEMP12"],   %[kC1]          \n\t"            \
-  "mul     %["#TEMP4"],    %["#TEMP4"],    %[kC1]          \n\t"            \
-  "mul     %["#TEMP12"],   %["#TEMP12"],   %[kC2]          \n\t"            \
-  "sra     %["#TEMP0"],    %["#TEMP0"],    16              \n\t"            \
-  "sra     %["#TEMP8"],    %["#TEMP8"],    16              \n\t"            \
-  "sra     %["#TEMP4"],    %["#TEMP4"],    16              \n\t"            \
-  "sra     %["#TEMP12"],   %["#TEMP12"],   16              \n\t"            \
-  "subu    %[temp18],      %["#TEMP0"],    %["#TEMP8"]     \n\t"            \
-  "addu    %[temp19],      %["#TEMP4"],    %["#TEMP12"]    \n\t"            \
-  "addu    %["#TEMP0"],    %[temp16],      %[temp19]       \n\t"            \
-  "addu    %["#TEMP4"],    %[temp17],      %[temp18]       \n\t"            \
-  "subu    %["#TEMP8"],    %[temp17],      %[temp18]       \n\t"            \
-  "subu    %["#TEMP12"],   %[temp16],      %[temp19]       \n\t"            \
-  "lw      %[temp20],      0(%[args])                      \n\t"            \
-  "sra     %["#TEMP0"],    %["#TEMP0"],    3               \n\t"            \
-  "sra     %["#TEMP4"],    %["#TEMP4"],    3               \n\t"            \
-  "sra     %["#TEMP8"],    %["#TEMP8"],    3               \n\t"            \
-  "sra     %["#TEMP12"],   %["#TEMP12"],   3               \n\t"            \
-  "lbu     %[temp16],      "#A"(%[temp20])                 \n\t"            \
-  "lbu     %[temp17],      "#B"(%[temp20])                 \n\t"            \
-  "lbu     %[temp18],      "#C"(%[temp20])                 \n\t"            \
-  "lbu     %[temp19],      "#D"(%[temp20])                 \n\t"            \
-  "addu    %["#TEMP0"],    %[temp16],      %["#TEMP0"]     \n\t"            \
-  "addu    %["#TEMP4"],    %[temp17],      %["#TEMP4"]     \n\t"            \
-  "addu    %["#TEMP8"],    %[temp18],      %["#TEMP8"]     \n\t"            \
-  "addu    %["#TEMP12"],   %[temp19],      %["#TEMP12"]    \n\t"            \
-  "slt     %[temp16],      %["#TEMP0"],    $zero           \n\t"            \
-  "slt     %[temp17],      %["#TEMP4"],    $zero           \n\t"            \
-  "slt     %[temp18],      %["#TEMP8"],    $zero           \n\t"            \
-  "slt     %[temp19],      %["#TEMP12"],   $zero           \n\t"            \
-  "movn    %["#TEMP0"],    $zero,          %[temp16]       \n\t"            \
-  "movn    %["#TEMP4"],    $zero,          %[temp17]       \n\t"            \
-  "movn    %["#TEMP8"],    $zero,          %[temp18]       \n\t"            \
-  "movn    %["#TEMP12"],   $zero,          %[temp19]       \n\t"            \
-  "addiu   %[temp20],      $zero,          255             \n\t"            \
-  "slt     %[temp16],      %["#TEMP0"],    %[temp20]       \n\t"            \
-  "slt     %[temp17],      %["#TEMP4"],    %[temp20]       \n\t"            \
-  "slt     %[temp18],      %["#TEMP8"],    %[temp20]       \n\t"            \
-  "slt     %[temp19],      %["#TEMP12"],   %[temp20]       \n\t"            \
-  "movz    %["#TEMP0"],    %[temp20],      %[temp16]       \n\t"            \
-  "movz    %["#TEMP4"],    %[temp20],      %[temp17]       \n\t"            \
-  "lw      %[temp16],      8(%[args])                      \n\t"            \
-  "movz    %["#TEMP8"],    %[temp20],      %[temp18]       \n\t"            \
-  "movz    %["#TEMP12"],   %[temp20],      %[temp19]       \n\t"            \
-  "sb      %["#TEMP0"],    "#A"(%[temp16])                 \n\t"            \
-  "sb      %["#TEMP4"],    "#B"(%[temp16])                 \n\t"            \
-  "sb      %["#TEMP8"],    "#C"(%[temp16])                 \n\t"            \
-  "sb      %["#TEMP12"],   "#D"(%[temp16])                 \n\t"
+#define HORIZONTAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12)              \
+  "addiu   %[" #TEMP0 "],    %[" #TEMP0 "],    4             \n\t"            \
+  "addu    %[temp16],      %[" #TEMP0 "],    %[" #TEMP8 "]   \n\t"            \
+  "subu    %[temp17],      %[" #TEMP0 "],    %[" #TEMP8 "]   \n\t"            \
+  "mul     %[" #TEMP0 "],    %[" #TEMP4 "],    %[kC2]        \n\t"            \
+  "mul     %[" #TEMP8 "],    %[" #TEMP12 "],   %[kC1]        \n\t"            \
+  "mul     %[" #TEMP4 "],    %[" #TEMP4 "],    %[kC1]        \n\t"            \
+  "mul     %[" #TEMP12 "],   %[" #TEMP12 "],   %[kC2]        \n\t"            \
+  "sra     %[" #TEMP0 "],    %[" #TEMP0 "],    16            \n\t"            \
+  "sra     %[" #TEMP8 "],    %[" #TEMP8 "],    16            \n\t"            \
+  "sra     %[" #TEMP4 "],    %[" #TEMP4 "],    16            \n\t"            \
+  "sra     %[" #TEMP12 "],   %[" #TEMP12 "],   16            \n\t"            \
+  "subu    %[temp18],      %[" #TEMP0 "],    %[" #TEMP8 "]   \n\t"            \
+  "addu    %[temp19],      %[" #TEMP4 "],    %[" #TEMP12 "]  \n\t"            \
+  "addu    %[" #TEMP0 "],    %[temp16],      %[temp19]       \n\t"            \
+  "addu    %[" #TEMP4 "],    %[temp17],      %[temp18]       \n\t"            \
+  "subu    %[" #TEMP8 "],    %[temp17],      %[temp18]       \n\t"            \
+  "subu    %[" #TEMP12 "],   %[temp16],      %[temp19]       \n\t"            \
+  "lw      %[temp20],      0(%[args])                        \n\t"            \
+  "sra     %[" #TEMP0 "],    %[" #TEMP0 "],    3             \n\t"            \
+  "sra     %[" #TEMP4 "],    %[" #TEMP4 "],    3             \n\t"            \
+  "sra     %[" #TEMP8 "],    %[" #TEMP8 "],    3             \n\t"            \
+  "sra     %[" #TEMP12 "],   %[" #TEMP12 "],   3             \n\t"            \
+  "lbu     %[temp16],      " #A "(%[temp20])                 \n\t"            \
+  "lbu     %[temp17],      " #B "(%[temp20])                 \n\t"            \
+  "lbu     %[temp18],      " #C "(%[temp20])                 \n\t"            \
+  "lbu     %[temp19],      " #D "(%[temp20])                 \n\t"            \
+  "addu    %[" #TEMP0 "],    %[temp16],      %[" #TEMP0 "]   \n\t"            \
+  "addu    %[" #TEMP4 "],    %[temp17],      %[" #TEMP4 "]   \n\t"            \
+  "addu    %[" #TEMP8 "],    %[temp18],      %[" #TEMP8 "]   \n\t"            \
+  "addu    %[" #TEMP12 "],   %[temp19],      %[" #TEMP12 "]  \n\t"            \
+  "slt     %[temp16],      %[" #TEMP0 "],    $zero           \n\t"            \
+  "slt     %[temp17],      %[" #TEMP4 "],    $zero           \n\t"            \
+  "slt     %[temp18],      %[" #TEMP8 "],    $zero           \n\t"            \
+  "slt     %[temp19],      %[" #TEMP12 "],   $zero           \n\t"            \
+  "movn    %[" #TEMP0 "],    $zero,          %[temp16]       \n\t"            \
+  "movn    %[" #TEMP4 "],    $zero,          %[temp17]       \n\t"            \
+  "movn    %[" #TEMP8 "],    $zero,          %[temp18]       \n\t"            \
+  "movn    %[" #TEMP12 "],   $zero,          %[temp19]       \n\t"            \
+  "addiu   %[temp20],      $zero,          255               \n\t"            \
+  "slt     %[temp16],      %[" #TEMP0 "],    %[temp20]       \n\t"            \
+  "slt     %[temp17],      %[" #TEMP4 "],    %[temp20]       \n\t"            \
+  "slt     %[temp18],      %[" #TEMP8 "],    %[temp20]       \n\t"            \
+  "slt     %[temp19],      %[" #TEMP12 "],   %[temp20]       \n\t"            \
+  "movz    %[" #TEMP0 "],    %[temp20],      %[temp16]       \n\t"            \
+  "movz    %[" #TEMP4 "],    %[temp20],      %[temp17]       \n\t"            \
+  "lw      %[temp16],      8(%[args])                        \n\t"            \
+  "movz    %[" #TEMP8 "],    %[temp20],      %[temp18]       \n\t"            \
+  "movz    %[" #TEMP12 "],   %[temp20],      %[temp19]       \n\t"            \
+  "sb      %[" #TEMP0 "],    " #A "(%[temp16])               \n\t"            \
+  "sb      %[" #TEMP4 "],    " #B "(%[temp16])               \n\t"            \
+  "sb      %[" #TEMP8 "],    " #C "(%[temp16])               \n\t"            \
+  "sb      %[" #TEMP12 "],   " #D "(%[temp16])               \n\t"
 
 // Does one or two inverse transforms.
 static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
@@ -164,9 +164,9 @@ static void ITransform(const uint8_t* ref, const int16_t* in,
 // K - offset in bytes (kZigzag[n] * 4)
 // N - offset in bytes (n * 2)
 #define QUANTIZE_ONE(J, K, N)                                               \
-  "lh           %[temp0],       "#J"(%[ppin])                       \n\t"   \
-  "lhu          %[temp1],       "#J"(%[ppsharpen])                  \n\t"   \
-  "lw           %[temp2],       "#K"(%[ppzthresh])                  \n\t"   \
+  "lh           %[temp0],       " #J "(%[ppin])                     \n\t"   \
+  "lhu          %[temp1],       " #J "(%[ppsharpen])                \n\t"   \
+  "lw           %[temp2],       " #K "(%[ppzthresh])                \n\t"   \
   "sra          %[sign],        %[temp0],           15              \n\t"   \
   "xor          %[coeff],       %[temp0],           %[sign]         \n\t"   \
   "subu         %[coeff],       %[coeff],           %[sign]         \n\t"   \
@@ -175,9 +175,9 @@ static void ITransform(const uint8_t* ref, const int16_t* in,
   "addiu        %[temp5],       $zero,              0               \n\t"   \
   "addiu        %[level],       $zero,              0               \n\t"   \
   "beqz         %[temp4],       2f                                  \n\t"   \
-  "lhu          %[temp1],       "#J"(%[ppiq])                       \n\t"   \
-  "lw           %[temp2],       "#K"(%[ppbias])                     \n\t"   \
-  "lhu          %[temp3],       "#J"(%[ppq])                        \n\t"   \
+  "lhu          %[temp1],       " #J "(%[ppiq])                     \n\t"   \
+  "lw           %[temp2],       " #K "(%[ppbias])                   \n\t"   \
+  "lhu          %[temp3],       " #J "(%[ppq])                      \n\t"   \
   "mul          %[level],       %[coeff],           %[temp1]        \n\t"   \
   "addu         %[level],       %[level],           %[temp2]        \n\t"   \
   "sra          %[level],       %[level],           17              \n\t"   \
@@ -187,8 +187,8 @@ static void ITransform(const uint8_t* ref, const int16_t* in,
   "subu         %[level],       %[level],           %[sign]         \n\t"   \
   "mul          %[temp5],       %[level],           %[temp3]        \n\t"   \
 "2:                                                                 \n\t"   \
-  "sh           %[temp5],       "#J"(%[ppin])                       \n\t"   \
-  "sh           %[level],       "#N"(%[pout])                       \n\t"
+  "sh           %[temp5],       " #J "(%[ppin])                     \n\t"   \
+  "sh           %[level],       " #N "(%[pout])                     \n\t"
 
 static int QuantizeBlock(int16_t in[16], int16_t out[16],
                          const VP8Matrix* const mtx) {
@@ -249,14 +249,14 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
 // E..H - offsets in bytes to store first results to tmp buffer
 // E1..H1 - offsets in bytes to store second results to tmp buffer
 #define HORIZONTAL_PASS(A, B, C, D, E, F, G, H, E1, F1, G1, H1)   \
-  "lbu    %[temp0],  "#A"(%[a])              \n\t"                \
-  "lbu    %[temp1],  "#B"(%[a])              \n\t"                \
-  "lbu    %[temp2],  "#C"(%[a])              \n\t"                \
-  "lbu    %[temp3],  "#D"(%[a])              \n\t"                \
-  "lbu    %[temp4],  "#A"(%[b])              \n\t"                \
-  "lbu    %[temp5],  "#B"(%[b])              \n\t"                \
-  "lbu    %[temp6],  "#C"(%[b])              \n\t"                \
-  "lbu    %[temp7],  "#D"(%[b])              \n\t"                \
+  "lbu    %[temp0],  " #A "(%[a])            \n\t"                \
+  "lbu    %[temp1],  " #B "(%[a])            \n\t"                \
+  "lbu    %[temp2],  " #C "(%[a])            \n\t"                \
+  "lbu    %[temp3],  " #D "(%[a])            \n\t"                \
+  "lbu    %[temp4],  " #A "(%[b])            \n\t"                \
+  "lbu    %[temp5],  " #B "(%[b])            \n\t"                \
+  "lbu    %[temp6],  " #C "(%[b])            \n\t"                \
+  "lbu    %[temp7],  " #D "(%[b])            \n\t"                \
   "addu   %[temp8],  %[temp0],    %[temp2]   \n\t"                \
   "subu   %[temp0],  %[temp0],    %[temp2]   \n\t"                \
   "addu   %[temp2],  %[temp1],    %[temp3]   \n\t"                \
@@ -273,14 +273,14 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
   "subu   %[temp3],  %[temp3],    %[temp6]   \n\t"                \
   "addu   %[temp6],  %[temp4],    %[temp5]   \n\t"                \
   "subu   %[temp4],  %[temp4],    %[temp5]   \n\t"                \
-  "sw     %[temp7],  "#E"(%[tmp])            \n\t"                \
-  "sw     %[temp2],  "#H"(%[tmp])            \n\t"                \
-  "sw     %[temp8],  "#F"(%[tmp])            \n\t"                \
-  "sw     %[temp0],  "#G"(%[tmp])            \n\t"                \
-  "sw     %[temp1],  "#E1"(%[tmp])           \n\t"                \
-  "sw     %[temp3],  "#H1"(%[tmp])           \n\t"                \
-  "sw     %[temp6],  "#F1"(%[tmp])           \n\t"                \
-  "sw     %[temp4],  "#G1"(%[tmp])           \n\t"
+  "sw     %[temp7],  " #E "(%[tmp])          \n\t"                \
+  "sw     %[temp2],  " #H "(%[tmp])          \n\t"                \
+  "sw     %[temp8],  " #F "(%[tmp])          \n\t"                \
+  "sw     %[temp0],  " #G "(%[tmp])          \n\t"                \
+  "sw     %[temp1],  " #E1 "(%[tmp])         \n\t"                \
+  "sw     %[temp3],  " #H1 "(%[tmp])         \n\t"                \
+  "sw     %[temp6],  " #F1 "(%[tmp])         \n\t"                \
+  "sw     %[temp4],  " #G1 "(%[tmp])         \n\t"
 
 // macro for one vertical pass in Disto4x4 (TTransform)
 // two calls of function TTransform are merged into single one
@@ -295,10 +295,10 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
 // A1..D1 - offsets in bytes to load second results from tmp buffer
 // E..H - offsets in bytes to load from w buffer
 #define VERTICAL_PASS(A, B, C, D, A1, B1, C1, D1, E, F, G, H)     \
-  "lw     %[temp0],  "#A1"(%[tmp])           \n\t"                \
-  "lw     %[temp1],  "#C1"(%[tmp])           \n\t"                \
-  "lw     %[temp2],  "#B1"(%[tmp])           \n\t"                \
-  "lw     %[temp3],  "#D1"(%[tmp])           \n\t"                \
+  "lw     %[temp0],  " #A1 "(%[tmp])         \n\t"                \
+  "lw     %[temp1],  " #C1 "(%[tmp])         \n\t"                \
+  "lw     %[temp2],  " #B1 "(%[tmp])         \n\t"                \
+  "lw     %[temp3],  " #D1 "(%[tmp])         \n\t"                \
   "addu   %[temp8],  %[temp0],    %[temp1]   \n\t"                \
   "subu   %[temp0],  %[temp0],    %[temp1]   \n\t"                \
   "addu   %[temp1],  %[temp2],    %[temp3]   \n\t"                \
@@ -319,18 +319,18 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
   "subu   %[temp1],  %[temp1],    %[temp5]   \n\t"                \
   "subu   %[temp0],  %[temp0],    %[temp6]   \n\t"                \
   "subu   %[temp8],  %[temp8],    %[temp7]   \n\t"                \
-  "lhu    %[temp4],  "#E"(%[w])              \n\t"                \
-  "lhu    %[temp5],  "#F"(%[w])              \n\t"                \
-  "lhu    %[temp6],  "#G"(%[w])              \n\t"                \
-  "lhu    %[temp7],  "#H"(%[w])              \n\t"                \
+  "lhu    %[temp4],  " #E "(%[w])            \n\t"                \
+  "lhu    %[temp5],  " #F "(%[w])            \n\t"                \
+  "lhu    %[temp6],  " #G "(%[w])            \n\t"                \
+  "lhu    %[temp7],  " #H "(%[w])            \n\t"                \
   "madd   %[temp4],  %[temp3]                \n\t"                \
   "madd   %[temp5],  %[temp1]                \n\t"                \
   "madd   %[temp6],  %[temp0]                \n\t"                \
   "madd   %[temp7],  %[temp8]                \n\t"                \
-  "lw     %[temp0],  "#A"(%[tmp])            \n\t"                \
-  "lw     %[temp1],  "#C"(%[tmp])            \n\t"                \
-  "lw     %[temp2],  "#B"(%[tmp])            \n\t"                \
-  "lw     %[temp3],  "#D"(%[tmp])            \n\t"                \
+  "lw     %[temp0],  " #A "(%[tmp])          \n\t"                \
+  "lw     %[temp1],  " #C "(%[tmp])          \n\t"                \
+  "lw     %[temp2],  " #B "(%[tmp])          \n\t"                \
+  "lw     %[temp3],  " #D "(%[tmp])          \n\t"                \
   "addu   %[temp8],  %[temp0],    %[temp1]   \n\t"                \
   "subu   %[temp0],  %[temp0],    %[temp1]   \n\t"                \
   "addu   %[temp1],  %[temp2],    %[temp3]   \n\t"                \
@@ -407,71 +407,71 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
 // temp0..temp15 holds tmp[0]..tmp[15]
 // A..D - offsets in bytes to load from src and ref buffers
 // TEMP0..TEMP3 - registers for corresponding tmp elements
-#define HORIZONTAL_PASS(A, B, C, D, TEMP0, TEMP1, TEMP2, TEMP3) \
-  "lw     %["#TEMP1"],  0(%[args])                     \n\t"    \
-  "lw     %["#TEMP2"],  4(%[args])                     \n\t"    \
-  "lbu    %[temp16],    "#A"(%["#TEMP1"])              \n\t"    \
-  "lbu    %[temp17],    "#A"(%["#TEMP2"])              \n\t"    \
-  "lbu    %[temp18],    "#B"(%["#TEMP1"])              \n\t"    \
-  "lbu    %[temp19],    "#B"(%["#TEMP2"])              \n\t"    \
-  "subu   %[temp20],    %[temp16],    %[temp17]        \n\t"    \
-  "lbu    %[temp16],    "#C"(%["#TEMP1"])              \n\t"    \
-  "lbu    %[temp17],    "#C"(%["#TEMP2"])              \n\t"    \
-  "subu   %["#TEMP0"],  %[temp18],    %[temp19]        \n\t"    \
-  "lbu    %[temp18],    "#D"(%["#TEMP1"])              \n\t"    \
-  "lbu    %[temp19],    "#D"(%["#TEMP2"])              \n\t"    \
-  "subu   %["#TEMP1"],  %[temp16],    %[temp17]        \n\t"    \
-  "subu   %["#TEMP2"],  %[temp18],    %[temp19]        \n\t"    \
-  "addu   %["#TEMP3"],  %[temp20],    %["#TEMP2"]      \n\t"    \
-  "subu   %["#TEMP2"],  %[temp20],    %["#TEMP2"]      \n\t"    \
-  "addu   %[temp20],    %["#TEMP0"],  %["#TEMP1"]      \n\t"    \
-  "subu   %["#TEMP0"],  %["#TEMP0"],  %["#TEMP1"]      \n\t"    \
-  "mul    %[temp16],    %["#TEMP2"],  %[c5352]         \n\t"    \
-  "mul    %[temp17],    %["#TEMP2"],  %[c2217]         \n\t"    \
-  "mul    %[temp18],    %["#TEMP0"],  %[c5352]         \n\t"    \
-  "mul    %[temp19],    %["#TEMP0"],  %[c2217]         \n\t"    \
-  "addu   %["#TEMP1"],  %["#TEMP3"],  %[temp20]        \n\t"    \
-  "subu   %[temp20],    %["#TEMP3"],  %[temp20]        \n\t"    \
-  "sll    %["#TEMP0"],  %["#TEMP1"],  3                \n\t"    \
-  "sll    %["#TEMP2"],  %[temp20],    3                \n\t"    \
-  "addiu  %[temp16],    %[temp16],    1812             \n\t"    \
-  "addiu  %[temp17],    %[temp17],    937              \n\t"    \
-  "addu   %[temp16],    %[temp16],    %[temp19]        \n\t"    \
-  "subu   %[temp17],    %[temp17],    %[temp18]        \n\t"    \
-  "sra    %["#TEMP1"],  %[temp16],    9                \n\t"    \
-  "sra    %["#TEMP3"],  %[temp17],    9                \n\t"
+#define HORIZONTAL_PASS(A, B, C, D, TEMP0, TEMP1, TEMP2, TEMP3)   \
+  "lw     %[" #TEMP1 "],  0(%[args])                     \n\t"    \
+  "lw     %[" #TEMP2 "],  4(%[args])                     \n\t"    \
+  "lbu    %[temp16],    " #A "(%[" #TEMP1 "])            \n\t"    \
+  "lbu    %[temp17],    " #A "(%[" #TEMP2 "])            \n\t"    \
+  "lbu    %[temp18],    " #B "(%[" #TEMP1 "])            \n\t"    \
+  "lbu    %[temp19],    " #B "(%[" #TEMP2 "])            \n\t"    \
+  "subu   %[temp20],    %[temp16],    %[temp17]          \n\t"    \
+  "lbu    %[temp16],    " #C "(%[" #TEMP1 "])            \n\t"    \
+  "lbu    %[temp17],    " #C "(%[" #TEMP2 "])            \n\t"    \
+  "subu   %[" #TEMP0 "],  %[temp18],    %[temp19]        \n\t"    \
+  "lbu    %[temp18],    " #D "(%[" #TEMP1 "])            \n\t"    \
+  "lbu    %[temp19],    " #D "(%[" #TEMP2 "])            \n\t"    \
+  "subu   %[" #TEMP1 "],  %[temp16],    %[temp17]        \n\t"    \
+  "subu   %[" #TEMP2 "],  %[temp18],    %[temp19]        \n\t"    \
+  "addu   %[" #TEMP3 "],  %[temp20],    %[" #TEMP2 "]    \n\t"    \
+  "subu   %[" #TEMP2 "],  %[temp20],    %[" #TEMP2 "]    \n\t"    \
+  "addu   %[temp20],    %[" #TEMP0 "],  %[" #TEMP1 "]    \n\t"    \
+  "subu   %[" #TEMP0 "],  %[" #TEMP0 "],  %[" #TEMP1 "]  \n\t"    \
+  "mul    %[temp16],    %[" #TEMP2 "],  %[c5352]         \n\t"    \
+  "mul    %[temp17],    %[" #TEMP2 "],  %[c2217]         \n\t"    \
+  "mul    %[temp18],    %[" #TEMP0 "],  %[c5352]         \n\t"    \
+  "mul    %[temp19],    %[" #TEMP0 "],  %[c2217]         \n\t"    \
+  "addu   %[" #TEMP1 "],  %[" #TEMP3 "],  %[temp20]      \n\t"    \
+  "subu   %[temp20],    %[" #TEMP3 "],  %[temp20]        \n\t"    \
+  "sll    %[" #TEMP0 "],  %[" #TEMP1 "],  3              \n\t"    \
+  "sll    %[" #TEMP2 "],  %[temp20],    3                \n\t"    \
+  "addiu  %[temp16],    %[temp16],    1812               \n\t"    \
+  "addiu  %[temp17],    %[temp17],    937                \n\t"    \
+  "addu   %[temp16],    %[temp16],    %[temp19]          \n\t"    \
+  "subu   %[temp17],    %[temp17],    %[temp18]          \n\t"    \
+  "sra    %[" #TEMP1 "],  %[temp16],    9                \n\t"    \
+  "sra    %[" #TEMP3 "],  %[temp17],    9                \n\t"
 
 // macro for one vertical pass in FTransform
 // temp0..temp15 holds tmp[0]..tmp[15]
 // A..D - offsets in bytes to store to out buffer
 // TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
-#define VERTICAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12)  \
-  "addu   %[temp16],    %["#TEMP0"],  %["#TEMP12"]     \n\t"    \
-  "subu   %[temp19],    %["#TEMP0"],  %["#TEMP12"]     \n\t"    \
-  "addu   %[temp17],    %["#TEMP4"],  %["#TEMP8"]      \n\t"    \
-  "subu   %[temp18],    %["#TEMP4"],  %["#TEMP8"]      \n\t"    \
-  "mul    %["#TEMP8"],  %[temp19],    %[c2217]         \n\t"    \
-  "mul    %["#TEMP12"], %[temp18],    %[c2217]         \n\t"    \
-  "mul    %["#TEMP4"],  %[temp19],    %[c5352]         \n\t"    \
-  "mul    %[temp18],    %[temp18],    %[c5352]         \n\t"    \
-  "addiu  %[temp16],    %[temp16],    7                \n\t"    \
-  "addu   %["#TEMP0"],  %[temp16],    %[temp17]        \n\t"    \
-  "sra    %["#TEMP0"],  %["#TEMP0"],  4                \n\t"    \
-  "addu   %["#TEMP12"], %["#TEMP12"], %["#TEMP4"]      \n\t"    \
-  "subu   %["#TEMP4"],  %[temp16],    %[temp17]        \n\t"    \
-  "sra    %["#TEMP4"],  %["#TEMP4"],  4                \n\t"    \
-  "addiu  %["#TEMP8"],  %["#TEMP8"],  30000            \n\t"    \
-  "addiu  %["#TEMP12"], %["#TEMP12"], 12000            \n\t"    \
-  "addiu  %["#TEMP8"],  %["#TEMP8"],  21000            \n\t"    \
-  "subu   %["#TEMP8"],  %["#TEMP8"],  %[temp18]        \n\t"    \
-  "sra    %["#TEMP12"], %["#TEMP12"], 16               \n\t"    \
-  "sra    %["#TEMP8"],  %["#TEMP8"],  16               \n\t"    \
-  "addiu  %[temp16],    %["#TEMP12"], 1                \n\t"    \
-  "movn   %["#TEMP12"], %[temp16],    %[temp19]        \n\t"    \
-  "sh     %["#TEMP0"],  "#A"(%[temp20])                \n\t"    \
-  "sh     %["#TEMP4"],  "#C"(%[temp20])                \n\t"    \
-  "sh     %["#TEMP8"],  "#D"(%[temp20])                \n\t"    \
-  "sh     %["#TEMP12"], "#B"(%[temp20])                \n\t"
+#define VERTICAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12)    \
+  "addu   %[temp16],    %[" #TEMP0 "],  %[" #TEMP12 "]   \n\t"    \
+  "subu   %[temp19],    %[" #TEMP0 "],  %[" #TEMP12 "]   \n\t"    \
+  "addu   %[temp17],    %[" #TEMP4 "],  %[" #TEMP8 "]    \n\t"    \
+  "subu   %[temp18],    %[" #TEMP4 "],  %[" #TEMP8 "]    \n\t"    \
+  "mul    %[" #TEMP8 "],  %[temp19],    %[c2217]         \n\t"    \
+  "mul    %[" #TEMP12 "], %[temp18],    %[c2217]         \n\t"    \
+  "mul    %[" #TEMP4 "],  %[temp19],    %[c5352]         \n\t"    \
+  "mul    %[temp18],    %[temp18],    %[c5352]           \n\t"    \
+  "addiu  %[temp16],    %[temp16],    7                  \n\t"    \
+  "addu   %[" #TEMP0 "],  %[temp16],    %[temp17]        \n\t"    \
+  "sra    %[" #TEMP0 "],  %[" #TEMP0 "],  4              \n\t"    \
+  "addu   %[" #TEMP12 "], %[" #TEMP12 "], %[" #TEMP4 "]  \n\t"    \
+  "subu   %[" #TEMP4 "],  %[temp16],    %[temp17]        \n\t"    \
+  "sra    %[" #TEMP4 "],  %[" #TEMP4 "],  4              \n\t"    \
+  "addiu  %[" #TEMP8 "],  %[" #TEMP8 "],  30000          \n\t"    \
+  "addiu  %[" #TEMP12 "], %[" #TEMP12 "], 12000          \n\t"    \
+  "addiu  %[" #TEMP8 "],  %[" #TEMP8 "],  21000          \n\t"    \
+  "subu   %[" #TEMP8 "],  %[" #TEMP8 "],  %[temp18]      \n\t"    \
+  "sra    %[" #TEMP12 "], %[" #TEMP12 "], 16             \n\t"    \
+  "sra    %[" #TEMP8 "],  %[" #TEMP8 "],  16             \n\t"    \
+  "addiu  %[temp16],    %[" #TEMP12 "], 1                \n\t"    \
+  "movn   %[" #TEMP12 "], %[temp16],    %[temp19]        \n\t"    \
+  "sh     %[" #TEMP0 "],  " #A "(%[temp20])              \n\t"    \
+  "sh     %[" #TEMP4 "],  " #C "(%[temp20])              \n\t"    \
+  "sh     %[" #TEMP8 "],  " #D "(%[temp20])              \n\t"    \
+  "sh     %[" #TEMP12 "], " #B "(%[temp20])              \n\t"
 
 static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
   int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
@@ -622,14 +622,14 @@ int VP8GetResidualCostMIPS32(int ctx0, const VP8Residual* const res) {
 }
 
 #define GET_SSE_INNER(A, B, C, D)                               \
-  "lbu     %[temp0],    "#A"(%[a])                   \n\t"      \
-  "lbu     %[temp1],    "#A"(%[b])                   \n\t"      \
-  "lbu     %[temp2],    "#B"(%[a])                   \n\t"      \
-  "lbu     %[temp3],    "#B"(%[b])                   \n\t"      \
-  "lbu     %[temp4],    "#C"(%[a])                   \n\t"      \
-  "lbu     %[temp5],    "#C"(%[b])                   \n\t"      \
-  "lbu     %[temp6],    "#D"(%[a])                   \n\t"      \
-  "lbu     %[temp7],    "#D"(%[b])                   \n\t"      \
+  "lbu     %[temp0],    " #A "(%[a])                 \n\t"      \
+  "lbu     %[temp1],    " #A "(%[b])                 \n\t"      \
+  "lbu     %[temp2],    " #B "(%[a])                 \n\t"      \
+  "lbu     %[temp3],    " #B "(%[b])                 \n\t"      \
+  "lbu     %[temp4],    " #C "(%[a])                 \n\t"      \
+  "lbu     %[temp5],    " #C "(%[b])                 \n\t"      \
+  "lbu     %[temp6],    " #D "(%[a])                 \n\t"      \
+  "lbu     %[temp7],    " #D "(%[b])                 \n\t"      \
   "subu    %[temp0],    %[temp0],     %[temp1]       \n\t"      \
   "subu    %[temp2],    %[temp2],     %[temp3]       \n\t"      \
   "subu    %[temp4],    %[temp4],     %[temp5]       \n\t"      \
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_mips32.c b/src/3rdparty/libwebp/src/dsp/lossless_mips32.c
index 1308580..5562c41 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless_mips32.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless_mips32.c
@@ -285,28 +285,28 @@ static VP8LStreaks HuffmanCostCombinedCount(const uint32_t* X,
 // literal_ and successive histograms could be unaligned
 // so we must use ulw and usw
 #define ADD_TO_OUT(A, B, C, D, E, P0, P1, P2)           \
-    "ulw    %[temp0], "#A"(%["#P0"])        \n\t"       \
-    "ulw    %[temp1], "#B"(%["#P0"])        \n\t"       \
-    "ulw    %[temp2], "#C"(%["#P0"])        \n\t"       \
-    "ulw    %[temp3], "#D"(%["#P0"])        \n\t"       \
-    "ulw    %[temp4], "#A"(%["#P1"])        \n\t"       \
-    "ulw    %[temp5], "#B"(%["#P1"])        \n\t"       \
-    "ulw    %[temp6], "#C"(%["#P1"])        \n\t"       \
-    "ulw    %[temp7], "#D"(%["#P1"])        \n\t"       \
+    "ulw    %[temp0], " #A "(%[" #P0 "])    \n\t"       \
+    "ulw    %[temp1], " #B "(%[" #P0 "])    \n\t"       \
+    "ulw    %[temp2], " #C "(%[" #P0 "])    \n\t"       \
+    "ulw    %[temp3], " #D "(%[" #P0 "])    \n\t"       \
+    "ulw    %[temp4], " #A "(%[" #P1 "])    \n\t"       \
+    "ulw    %[temp5], " #B "(%[" #P1 "])    \n\t"       \
+    "ulw    %[temp6], " #C "(%[" #P1 "])    \n\t"       \
+    "ulw    %[temp7], " #D "(%[" #P1 "])    \n\t"       \
     "addu   %[temp4], %[temp4],   %[temp0]  \n\t"       \
     "addu   %[temp5], %[temp5],   %[temp1]  \n\t"       \
     "addu   %[temp6], %[temp6],   %[temp2]  \n\t"       \
     "addu   %[temp7], %[temp7],   %[temp3]  \n\t"       \
-    "addiu  %["#P0"],  %["#P0"],  16        \n\t"       \
-  ".if "#E" == 1                            \n\t"       \
-    "addiu  %["#P1"],  %["#P1"],  16        \n\t"       \
+    "addiu  %[" #P0 "],  %[" #P0 "],  16    \n\t"       \
+  ".if " #E " == 1                          \n\t"       \
+    "addiu  %[" #P1 "],  %[" #P1 "],  16    \n\t"       \
   ".endif                                   \n\t"       \
-    "usw    %[temp4], "#A"(%["#P2"])        \n\t"       \
-    "usw    %[temp5], "#B"(%["#P2"])        \n\t"       \
-    "usw    %[temp6], "#C"(%["#P2"])        \n\t"       \
-    "usw    %[temp7], "#D"(%["#P2"])        \n\t"       \
-    "addiu  %["#P2"], %["#P2"],   16        \n\t"       \
-    "bne    %["#P0"], %[LoopEnd], 1b        \n\t"       \
+    "usw    %[temp4], " #A "(%[" #P2 "])    \n\t"       \
+    "usw    %[temp5], " #B "(%[" #P2 "])    \n\t"       \
+    "usw    %[temp6], " #C "(%[" #P2 "])    \n\t"       \
+    "usw    %[temp7], " #D "(%[" #P2 "])    \n\t"       \
+    "addiu  %[" #P2 "], %[" #P2 "],   16    \n\t"       \
+    "bne    %[" #P0 "], %[LoopEnd], 1b      \n\t"       \
     ".set   pop                             \n\t"       \
 
 #define ASM_END_COMMON_0                                \
diff --git a/src/3rdparty/libwebp/src/enc/histogram.c b/src/3rdparty/libwebp/src/enc/histogram.c
index 7c6abb4..a2266b4 100644
--- a/src/3rdparty/libwebp/src/enc/histogram.c
+++ b/src/3rdparty/libwebp/src/enc/histogram.c
@@ -20,6 +20,9 @@
 #include "../dsp/lossless.h"
 #include "../utils/utils.h"
 
+#define ALIGN_CST 15
+#define DO_ALIGN(PTR) ((uintptr_t)((PTR) + ALIGN_CST) & ~ALIGN_CST)
+
 #define MAX_COST 1.e38
 
 // Number of partitions for the three dominant (literal, red and blue) symbol
@@ -101,9 +104,9 @@ VP8LHistogram* VP8LAllocateHistogram(int cache_bits) {
 VP8LHistogramSet* VP8LAllocateHistogramSet(int size, int cache_bits) {
   int i;
   VP8LHistogramSet* set;
-  const size_t total_size = sizeof(*set)
-                            + sizeof(*set->histograms) * size
-                            + (size_t)VP8LGetHistogramSize(cache_bits) * size;
+  const int histo_size = VP8LGetHistogramSize(cache_bits);
+  const size_t total_size =
+      sizeof(*set) + size * (sizeof(*set->histograms) + histo_size + ALIGN_CST);
   uint8_t* memory = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*memory));
   if (memory == NULL) return NULL;
 
@@ -114,12 +117,12 @@ VP8LHistogramSet* VP8LAllocateHistogramSet(int size, int cache_bits) {
   set->max_size = size;
   set->size = size;
   for (i = 0; i < size; ++i) {
+    memory = (uint8_t*)DO_ALIGN(memory);
     set->histograms[i] = (VP8LHistogram*)memory;
     // literal_ won't necessary be aligned.
     set->histograms[i]->literal_ = (uint32_t*)(memory + sizeof(VP8LHistogram));
     VP8LHistogramInit(set->histograms[i], cache_bits);
-    // There's no padding/alignment between successive histograms.
-    memory += VP8LGetHistogramSize(cache_bits);
+    memory += histo_size;
   }
   return set;
 }
diff --git a/src/3rdparty/libwebp/src/enc/picture_rescale.c b/src/3rdparty/libwebp/src/enc/picture_rescale.c
index de52848..9e45551 100644
--- a/src/3rdparty/libwebp/src/enc/picture_rescale.c
+++ b/src/3rdparty/libwebp/src/enc/picture_rescale.c
@@ -175,17 +175,13 @@ static void RescalePlane(const uint8_t* src,
                          int src_width, int src_height, int src_stride,
                          uint8_t* dst,
                          int dst_width, int dst_height, int dst_stride,
-                         int32_t* const work,
+                         rescaler_t* const work,
                          int num_channels) {
   WebPRescaler rescaler;
   int y = 0;
   WebPRescalerInit(&rescaler, src_width, src_height,
                    dst, dst_width, dst_height, dst_stride,
-                   num_channels,
-                   src_width, dst_width,
-                   src_height, dst_height,
-                   work);
-  memset(work, 0, 2 * dst_width * num_channels * sizeof(*work));
+                   num_channels, work);
   while (y < src_height) {
     y += WebPRescalerImport(&rescaler, src_height - y,
                             src + y * src_stride, src_stride);
@@ -209,7 +205,7 @@ static void AlphaMultiplyY(WebPPicture* const pic, int inverse) {
 int WebPPictureRescale(WebPPicture* pic, int width, int height) {
   WebPPicture tmp;
   int prev_width, prev_height;
-  int32_t* work;
+  rescaler_t* work;
 
   if (pic == NULL) return 0;
   prev_width = pic->width;
@@ -231,7 +227,7 @@ int WebPPictureRescale(WebPPicture* pic, int width, int height) {
   if (!WebPPictureAlloc(&tmp)) return 0;
 
   if (!pic->use_argb) {
-    work = (int32_t*)WebPSafeMalloc(2ULL * width, sizeof(*work));
+    work = (rescaler_t*)WebPSafeMalloc(2ULL * width, sizeof(*work));
     if (work == NULL) {
       WebPPictureFree(&tmp);
       return 0;
@@ -259,7 +255,7 @@ int WebPPictureRescale(WebPPicture* pic, int width, int height) {
                  tmp.v,
                  HALVE(width), HALVE(height), tmp.uv_stride, work, 1);
   } else {
-    work = (int32_t*)WebPSafeMalloc(2ULL * width * 4, sizeof(*work));
+    work = (rescaler_t*)WebPSafeMalloc(2ULL * width * 4, sizeof(*work));
     if (work == NULL) {
       WebPPictureFree(&tmp);
       return 0;
diff --git a/src/3rdparty/libwebp/src/enc/vp8enci.h b/src/3rdparty/libwebp/src/enc/vp8enci.h
index 74c8f70..20f58c6 100644
--- a/src/3rdparty/libwebp/src/enc/vp8enci.h
+++ b/src/3rdparty/libwebp/src/enc/vp8enci.h
@@ -30,7 +30,7 @@ extern "C" {
 // version numbers
 #define ENC_MAJ_VERSION 0
 #define ENC_MIN_VERSION 4
-#define ENC_REV_VERSION 3
+#define ENC_REV_VERSION 4
 
 // intra prediction modes
 enum { B_DC_PRED = 0,   // 4x4 modes
diff --git a/src/3rdparty/libwebp/src/utils/endian_inl.h b/src/3rdparty/libwebp/src/utils/endian_inl.h
index cd56c37..e11260f 100644
--- a/src/3rdparty/libwebp/src/utils/endian_inl.h
+++ b/src/3rdparty/libwebp/src/utils/endian_inl.h
@@ -35,14 +35,14 @@
 #endif
 
 #if !defined(HAVE_CONFIG_H)
-// clang-3.3 and gcc-4.3 have builtin functions for swap32/swap64
-#if LOCAL_GCC_PREREQ(4,3) || LOCAL_CLANG_PREREQ(3,3)
+#if LOCAL_GCC_PREREQ(4,8) || __has_builtin(__builtin_bswap16)
+#define HAVE_BUILTIN_BSWAP16
+#endif
+#if LOCAL_GCC_PREREQ(4,3) || __has_builtin(__builtin_bswap32)
 #define HAVE_BUILTIN_BSWAP32
-#define HAVE_BUILTIN_BSWAP64
 #endif
-// clang-3.3 and gcc-4.8 have a builtin function for swap16
-#if LOCAL_GCC_PREREQ(4,8) || LOCAL_CLANG_PREREQ(3,3)
-#define HAVE_BUILTIN_BSWAP16
+#if LOCAL_GCC_PREREQ(4,3) || __has_builtin(__builtin_bswap64)
+#define HAVE_BUILTIN_BSWAP64
 #endif
 #endif  // !HAVE_CONFIG_H
 
diff --git a/src/3rdparty/libwebp/src/utils/rescaler.c b/src/3rdparty/libwebp/src/utils/rescaler.c
index fad9c6b..3a43229 100644
--- a/src/3rdparty/libwebp/src/utils/rescaler.c
+++ b/src/3rdparty/libwebp/src/utils/rescaler.c
@@ -13,77 +13,192 @@
 
 #include <assert.h>
 #include <stdlib.h>
+#include <string.h>
 #include "./rescaler.h"
 #include "../dsp/dsp.h"
 
 //------------------------------------------------------------------------------
 // Implementations of critical functions ImportRow / ExportRow
 
-void (*WebPRescalerImportRow)(WebPRescaler* const wrk,
-                              const uint8_t* const src, int channel) = NULL;
-void (*WebPRescalerExportRow)(WebPRescaler* const wrk, int x_out) = NULL;
+// Import a row of data and save its contribution in the rescaler.
+// All channels of the row are imported in one call. 'Expand' corresponds to
+// the wrk->x_expand case. Otherwise, 'Shrink' is to be used.
+typedef void (*WebPRescalerImportRowFunc)(WebPRescaler* const wrk,
+                                          const uint8_t* src);
+static WebPRescalerImportRowFunc WebPRescalerImportRowExpand;
+static WebPRescalerImportRowFunc WebPRescalerImportRowShrink;
 
-#define RFIX 30
-#define MULT_FIX(x, y) (((int64_t)(x) * (y) + (1 << (RFIX - 1))) >> RFIX)
+// Export one row from the rescaler to wrk->dst.
+// 'Expand' corresponds to the wrk->y_expand case.
+// Otherwise 'Shrink' is to be used
+typedef void (*WebPRescalerExportRowFunc)(WebPRescaler* const wrk);
+static WebPRescalerExportRowFunc WebPRescalerExportRowExpand;
+static WebPRescalerExportRowFunc WebPRescalerExportRowShrink;
 
-static void ImportRowC(WebPRescaler* const wrk,
-                       const uint8_t* const src, int channel) {
+#define WEBP_RESCALER_RFIX 32   // fixed-point precision for multiplies
+#define WEBP_RESCALER_ONE (1ull << WEBP_RESCALER_RFIX)
+#define WEBP_RESCALER_FRAC(x, y) \
+    ((uint32_t)(((uint64_t)(x) << WEBP_RESCALER_RFIX) / (y)))
+#define ROUNDER (WEBP_RESCALER_ONE >> 1)
+#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
+
+static void ImportRowExpandC(WebPRescaler* const wrk, const uint8_t* src) {
   const int x_stride = wrk->num_channels;
   const int x_out_max = wrk->dst_width * wrk->num_channels;
-  int x_in = channel;
-  int x_out;
-  int accum = 0;
-  if (!wrk->x_expand) {
-    int sum = 0;
-    for (x_out = channel; x_out < x_out_max; x_out += x_stride) {
+  int channel;
+  assert(!WebPRescalerInputDone(wrk));
+  assert(wrk->x_expand);
+  for (channel = 0; channel < x_stride; ++channel) {
+    int x_in = channel;
+    int x_out = channel;
+    // simple bilinear interpolation
+    int accum = wrk->x_add;
+    int left = src[x_in];
+    int right = (wrk->src_width > 1) ? src[x_in + x_stride] : left;
+    x_in += x_stride;
+    while (1) {
+      wrk->frow[x_out] = right * wrk->x_add + (left - right) * accum;
+      x_out += x_stride;
+      if (x_out >= x_out_max) break;
+      accum -= wrk->x_sub;
+      if (accum < 0) {
+        left = right;
+        x_in += x_stride;
+        assert(x_in < wrk->src_width * x_stride);
+        right = src[x_in];
+        accum += wrk->x_add;
+      }
+    }
+    assert(wrk->x_sub == 0 /* <- special case for src_width=1 */ || accum == 0);
+  }
+}
+
+static void ImportRowShrinkC(WebPRescaler* const wrk, const uint8_t* src) {
+  const int x_stride = wrk->num_channels;
+  const int x_out_max = wrk->dst_width * wrk->num_channels;
+  int channel;
+  assert(!WebPRescalerInputDone(wrk));
+  assert(!wrk->x_expand);
+  for (channel = 0; channel < x_stride; ++channel) {
+    int x_in = channel;
+    int x_out = channel;
+    uint32_t sum = 0;
+    int accum = 0;
+    while (x_out < x_out_max) {
+      uint32_t base = 0;
       accum += wrk->x_add;
-      for (; accum > 0; accum -= wrk->x_sub) {
-        sum += src[x_in];
+      while (accum > 0) {
+        accum -= wrk->x_sub;
+        assert(x_in < wrk->src_width * x_stride);
+        base = src[x_in];
+        sum += base;
         x_in += x_stride;
       }
       {        // Emit next horizontal pixel.
-        const int32_t base = src[x_in];
-        const int32_t frac = base * (-accum);
-        x_in += x_stride;
-        wrk->frow[x_out] = (sum + base) * wrk->x_sub - frac;
+        const rescaler_t frac = base * (-accum);
+        wrk->frow[x_out] = sum * wrk->x_sub - frac;
         // fresh fractional start for next pixel
         sum = (int)MULT_FIX(frac, wrk->fx_scale);
       }
+      x_out += x_stride;
     }
-  } else {        // simple bilinear interpolation
-    int left = src[channel], right = src[channel];
-    for (x_out = channel; x_out < x_out_max; x_out += x_stride) {
-      if (accum < 0) {
-        left = right;
-        x_in += x_stride;
-        right = src[x_in];
-        accum += wrk->x_add;
-      }
-      wrk->frow[x_out] = right * wrk->x_add + (left - right) * accum;
-      accum -= wrk->x_sub;
-    }
+    assert(accum == 0);
   }
-  // Accumulate the contribution of the new row.
-  for (x_out = channel; x_out < x_out_max; x_out += x_stride) {
-    wrk->irow[x_out] += wrk->frow[x_out];
+}
+
+//------------------------------------------------------------------------------
+// Row export
+
+static void ExportRowExpandC(WebPRescaler* const wrk) {
+  int x_out;
+  uint8_t* const dst = wrk->dst;
+  rescaler_t* const irow = wrk->irow;
+  const int x_out_max = wrk->dst_width * wrk->num_channels;
+  const rescaler_t* const frow = wrk->frow;
+  assert(!WebPRescalerOutputDone(wrk));
+  assert(wrk->y_accum <= 0);
+  assert(wrk->y_expand);
+  assert(wrk->y_sub != 0);
+  if (wrk->y_accum == 0) {
+    for (x_out = 0; x_out < x_out_max; ++x_out) {
+      const uint32_t J = frow[x_out];
+      const int v = (int)MULT_FIX(J, wrk->fy_scale);
+      assert(v >= 0 && v <= 255);
+      dst[x_out] = v;
+    }
+  } else {
+    const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
+    const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
+    for (x_out = 0; x_out < x_out_max; ++x_out) {
+      const uint64_t I = (uint64_t)A * frow[x_out]
+                       + (uint64_t)B * irow[x_out];
+      const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
+      const int v = (int)MULT_FIX(J, wrk->fy_scale);
+      assert(v >= 0 && v <= 255);
+      dst[x_out] = v;
+    }
   }
 }
 
-static void ExportRowC(WebPRescaler* const wrk, int x_out) {
-  if (wrk->y_accum <= 0) {
-    uint8_t* const dst = wrk->dst;
-    int32_t* const irow = wrk->irow;
-    const int32_t* const frow = wrk->frow;
-    const int yscale = wrk->fy_scale * (-wrk->y_accum);
-    const int x_out_max = wrk->dst_width * wrk->num_channels;
-    for (; x_out < x_out_max; ++x_out) {
-      const int frac = (int)MULT_FIX(frow[x_out], yscale);
+static void ExportRowShrinkC(WebPRescaler* const wrk) {
+  int x_out;
+  uint8_t* const dst = wrk->dst;
+  rescaler_t* const irow = wrk->irow;
+  const int x_out_max = wrk->dst_width * wrk->num_channels;
+  const rescaler_t* const frow = wrk->frow;
+  const uint32_t yscale = wrk->fy_scale * (-wrk->y_accum);
+  assert(!WebPRescalerOutputDone(wrk));
+  assert(wrk->y_accum <= 0);
+  assert(!wrk->y_expand);
+  if (yscale) {
+    for (x_out = 0; x_out < x_out_max; ++x_out) {
+      const uint32_t frac = (uint32_t)MULT_FIX(frow[x_out], yscale);
       const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale);
-      dst[x_out] = (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
+      assert(v >= 0 && v <= 255);
+      dst[x_out] = v;
       irow[x_out] = frac;   // new fractional start
     }
+  } else {
+    for (x_out = 0; x_out < x_out_max; ++x_out) {
+      const int v = (int)MULT_FIX(irow[x_out], wrk->fxy_scale);
+      assert(v >= 0 && v <= 255);
+      dst[x_out] = v;
+      irow[x_out] = 0;
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Main entry calls
+
+void WebPRescalerImportRow(WebPRescaler* const wrk, const uint8_t* src) {
+  assert(!WebPRescalerInputDone(wrk));
+  if (!wrk->x_expand) {
+    WebPRescalerImportRowShrink(wrk, src);
+  } else {
+    WebPRescalerImportRowExpand(wrk, src);
+  }
+}
+
+void WebPRescalerExportRow(WebPRescaler* const wrk) {
+  if (wrk->y_accum <= 0) {
+    assert(!WebPRescalerOutputDone(wrk));
+    if (wrk->y_expand) {
+      WebPRescalerExportRowExpand(wrk);
+    } else if (wrk->fxy_scale) {
+      WebPRescalerExportRowShrink(wrk);
+    } else {  // very special case for src = dst = 1x1
+      int i;
+      assert(wrk->src_width == 1 && wrk->dst_width <= 2);
+      assert(wrk->src_height == 1 && wrk->dst_height == 1);
+      for (i = 0; i < wrk->num_channels * wrk->dst_width; ++i) {
+        wrk->dst[i] = wrk->irow[i];
+        wrk->irow[i] = 0;
+      }
+    }
     wrk->y_accum += wrk->y_add;
     wrk->dst += wrk->dst_stride;
+    ++wrk->dst_y;
   }
 }
 
@@ -92,23 +207,25 @@ static void ExportRowC(WebPRescaler* const wrk, int x_out) {
 
 #if defined(WEBP_USE_MIPS32)
 
-static void ImportRowMIPS(WebPRescaler* const wrk,
-                          const uint8_t* const src, int channel) {
+static void ImportRowShrinkMIPS(WebPRescaler* const wrk, const uint8_t* src) {
   const int x_stride = wrk->num_channels;
   const int x_out_max = wrk->dst_width * wrk->num_channels;
   const int fx_scale = wrk->fx_scale;
   const int x_add = wrk->x_add;
   const int x_sub = wrk->x_sub;
-  int* frow = wrk->frow + channel;
-  int* irow = wrk->irow + channel;
-  const uint8_t* src1 = src + channel;
-  int temp1, temp2, temp3;
-  int base, frac, sum;
-  int accum, accum1;
   const int x_stride1 = x_stride << 2;
-  int loop_c = x_out_max - channel;
+  int channel;   // interleaved channels are processed one at a time
+  assert(!wrk->x_expand);
+  assert(!WebPRescalerInputDone(wrk));
+
+  for (channel = 0; channel < x_stride; ++channel) {
+    const uint8_t* src1 = src + channel;
+    rescaler_t* frow = wrk->frow + channel;
+    int temp1, temp2, temp3;
+    int base, frac, sum;
+    int accum, accum1;
+    int loop_c = x_out_max - channel;   // counted down inside the asm loop
 
-  if (!wrk->x_expand) {
     __asm__ volatile (
       "li     %[temp1],   0x8000                    \n\t"
       "li     %[temp2],   0x10000                   \n\t"
@@ -116,179 +233,295 @@ static void ImportRowMIPS(WebPRescaler* const wrk,
       "li     %[accum],   0                         \n\t"
     "1:                                             \n\t"
       "addu   %[accum],   %[accum],   %[x_add]      \n\t"
+      "li     %[base],    0                         \n\t"
       "blez   %[accum],   3f                        \n\t"
     "2:                                             \n\t"
-      "lbu    %[temp3],   0(%[src1])                \n\t"
+      "lbu    %[base],    0(%[src1])                \n\t"
       "subu   %[accum],   %[accum],   %[x_sub]      \n\t"
       "addu   %[src1],    %[src1],    %[x_stride]   \n\t"
-      "addu   %[sum],     %[sum],     %[temp3]      \n\t"
+      "addu   %[sum],     %[sum],     %[base]       \n\t"
       "bgtz   %[accum],   2b                        \n\t"
     "3:                                             \n\t"
-      "lbu    %[base],    0(%[src1])                \n\t"
-      "addu   %[src1],    %[src1],    %[x_stride]   \n\t"
       "negu   %[accum1],  %[accum]                  \n\t"
       "mul    %[frac],    %[base],    %[accum1]     \n\t"
-      "addu   %[temp3],   %[sum],     %[base]       \n\t"
-      "mul    %[temp3],   %[temp3],   %[x_sub]      \n\t"
-      "lw     %[base],    0(%[irow])                \n\t"
+      "mul    %[temp3],   %[sum],     %[x_sub]      \n\t"
       "subu   %[loop_c],  %[loop_c],  %[x_stride]   \n\t"
-      "sll    %[accum1],  %[frac],    2             \n\t"
       "mult   %[temp1],   %[temp2]                  \n\t"
-      "madd   %[accum1],  %[fx_scale]               \n\t"
+      "maddu  %[frac],    %[fx_scale]               \n\t"  // hi:lo += product
       "mfhi   %[sum]                                \n\t"
       "subu   %[temp3],   %[temp3],   %[frac]       \n\t"
       "sw     %[temp3],   0(%[frow])                \n\t"
-      "add    %[base],    %[base],    %[temp3]      \n\t"
-      "sw     %[base],    0(%[irow])                \n\t"
-      "addu   %[irow],    %[irow],    %[x_stride1]  \n\t"
       "addu   %[frow],    %[frow],    %[x_stride1]  \n\t"
       "bgtz   %[loop_c],  1b                        \n\t"
+      : [accum]"=&r"(accum), [src1]"+r"(src1), [temp3]"=&r"(temp3),
+        [sum]"=&r"(sum), [base]"=&r"(base), [frac]"=&r"(frac),
+        [frow]"+r"(frow), [accum1]"=&r"(accum1), [loop_c]"+r"(loop_c),
+        [temp2]"=&r"(temp2), [temp1]"=&r"(temp1)
+      : [x_stride]"r"(x_stride), [fx_scale]"r"(fx_scale),
+        [x_sub]"r"(x_sub), [x_add]"r"(x_add),
+        [x_stride1]"r"(x_stride1)   // loop_c is written: listed as "+r" above
+      : "memory", "hi", "lo"
+    );
+    assert(accum == 0);
+  }
+}
+
+static void ImportRowExpandMIPS(WebPRescaler* const wrk, const uint8_t* src) {
+  const int x_stride = wrk->num_channels;
+  const int x_out_max = wrk->dst_width * wrk->num_channels;
+  const int x_add = wrk->x_add;
+  const int x_sub = wrk->x_sub;
+  const int src_width = wrk->src_width;
+  const int x_stride1 = x_stride << 2;   // frow[] step in bytes (4-byte entries)
+  int channel;
+  assert(wrk->x_expand);
+  assert(!WebPRescalerInputDone(wrk));
 
-      : [accum] "=&r" (accum), [src1] "+r" (src1), [temp3] "=&r" (temp3),
-        [sum] "=&r" (sum), [base] "=&r" (base), [frac] "=&r" (frac),
-        [frow] "+r" (frow), [irow] "+r" (irow), [accum1] "=&r" (accum1),
-        [temp2] "=&r" (temp2), [temp1] "=&r" (temp1)
-      : [x_stride] "r" (x_stride), [fx_scale] "r" (fx_scale),
-        [x_sub] "r" (x_sub), [x_add] "r" (x_add),
-        [loop_c] "r" (loop_c), [x_stride1] "r" (x_stride1)
+  for (channel = 0; channel < x_stride; ++channel) {
+    const uint8_t* src1 = src + channel;
+    rescaler_t* frow = wrk->frow + channel;
+    int temp1, temp2, temp3, temp4;
+    int frac;   // NOTE(review): declared as an asm output but never used in it
+    int accum;
+    int x_out = channel;
+
+    __asm__ volatile (
+      "addiu  %[temp3],   %[src_width], -1            \n\t"
+      "lbu    %[temp2],   0(%[src1])                  \n\t"
+      "addu   %[src1],    %[src1],      %[x_stride]   \n\t"
+      "bgtz   %[temp3],   0f                          \n\t"
+      "addiu  %[temp1],   %[temp2],     0             \n\t"
+      "b      3f                                      \n\t"
+    "0:                                               \n\t"
+      "lbu    %[temp1],   0(%[src1])                  \n\t"
+    "3:                                               \n\t"
+      "addiu  %[accum],   %[x_add],     0             \n\t"
+    "1:                                               \n\t"
+      "subu   %[temp3],   %[temp2],     %[temp1]      \n\t"
+      "mul    %[temp3],   %[temp3],     %[accum]      \n\t"
+      "mul    %[temp4],   %[temp1],     %[x_add]      \n\t"
+      "addu   %[temp3],   %[temp4],     %[temp3]      \n\t"
+      "sw     %[temp3],   0(%[frow])                  \n\t"  // t1*x_add+(t2-t1)*accum
+      "addu   %[frow],    %[frow],      %[x_stride1]  \n\t"
+      "addu   %[x_out],   %[x_out],     %[x_stride]   \n\t"
+      "subu   %[temp3],   %[x_out],     %[x_out_max]  \n\t"
+      "bgez   %[temp3],   2f                          \n\t"  // all outputs written
+      "subu   %[accum],   %[accum],     %[x_sub]      \n\t"
+      "bgez   %[accum],   4f                          \n\t"
+      "addiu  %[temp2],   %[temp1],     0             \n\t"
+      "addu   %[src1],    %[src1],      %[x_stride]   \n\t"
+      "lbu    %[temp1],   0(%[src1])                  \n\t"
+      "addu   %[accum],   %[accum],     %[x_add]      \n\t"
+    "4:                                               \n\t"
+      "b      1b                                      \n\t"
+    "2:                                               \n\t"
+      : [src1]"+r"(src1), [accum]"=&r"(accum), [temp1]"=&r"(temp1),
+        [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
+        [x_out]"+r"(x_out), [frac]"=&r"(frac), [frow]"+r"(frow)
+      : [x_stride]"r"(x_stride), [x_add]"r"(x_add), [x_sub]"r"(x_sub),
+        [x_stride1]"r"(x_stride1), [src_width]"r"(src_width),
+        [x_out_max]"r"(x_out_max)
+      : "memory", "hi", "lo"
+    );
+    assert(wrk->x_sub == 0 /* <- special case for src_width=1 */ || accum == 0);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Row export
+
+static void ExportRowExpandMIPS(WebPRescaler* const wrk) {
+  uint8_t* dst = wrk->dst;
+  rescaler_t* irow = wrk->irow;
+  const int x_out_max = wrk->dst_width * wrk->num_channels;
+  const rescaler_t* frow = wrk->frow;
+  int temp0, temp1, temp3, temp4, temp5, loop_end;
+  const int temp2 = (int)wrk->fy_scale;
+  const int temp6 = x_out_max << 2;   // row length in bytes (4-byte entries)
+  assert(!WebPRescalerOutputDone(wrk));
+  assert(wrk->y_accum <= 0);
+  assert(wrk->y_expand);
+  assert(wrk->y_sub != 0);
+  if (wrk->y_accum == 0) {   // only frow[] contributes to this output row
+    __asm__ volatile (
+      "li       %[temp3],    0x10000                    \n\t"
+      "li       %[temp4],    0x8000                     \n\t"
+      "addu     %[loop_end], %[frow],     %[temp6]      \n\t"
+    "1:                                                 \n\t"
+      "lw       %[temp0],    0(%[frow])                 \n\t"
+      "addiu    %[dst],      %[dst],      1             \n\t"
+      "addiu    %[frow],     %[frow],     4             \n\t"
+      "mult     %[temp3],    %[temp4]                   \n\t"  // hi:lo = 1 << 31
+      "maddu    %[temp0],    %[temp2]                   \n\t"  // += frow * fy_scale
+      "mfhi     %[temp5]                                \n\t"  // rounded >> 32
+      "sb       %[temp5],    -1(%[dst])                 \n\t"
+      "bne      %[frow],     %[loop_end], 1b            \n\t"
+      : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
+        [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
+        [dst]"+r"(dst), [loop_end]"=&r"(loop_end)
+      : [temp2]"r"(temp2), [temp6]"r"(temp6)
       : "memory", "hi", "lo"
     );
   } else {
+    const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);  // irow[] weight
+    const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);   // frow[] weight
     __asm__ volatile (
-      "lbu    %[temp1],   0(%[src1])                \n\t"
-      "move   %[temp2],   %[temp1]                  \n\t"
-      "li     %[accum],   0                         \n\t"
-    "1:                                             \n\t"
-      "bgez   %[accum],   2f                        \n\t"
-      "move   %[temp2],   %[temp1]                  \n\t"
-      "addu   %[src1],    %[x_stride]               \n\t"
-      "lbu    %[temp1],   0(%[src1])                \n\t"
-      "addu   %[accum],   %[x_add]                  \n\t"
-    "2:                                             \n\t"
-      "subu   %[temp3],   %[temp2],   %[temp1]      \n\t"
-      "mul    %[temp3],   %[temp3],   %[accum]      \n\t"
-      "mul    %[base],    %[temp1],   %[x_add]      \n\t"
-      "subu   %[accum],   %[accum],   %[x_sub]      \n\t"
-      "lw     %[frac],    0(%[irow])                \n\t"
-      "subu   %[loop_c],  %[loop_c],  %[x_stride]   \n\t"
-      "addu   %[temp3],   %[base],    %[temp3]      \n\t"
-      "sw     %[temp3],   0(%[frow])                \n\t"
-      "addu   %[frow],    %[x_stride1]              \n\t"
-      "addu   %[frac],    %[temp3]                  \n\t"
-      "sw     %[frac],    0(%[irow])                \n\t"
-      "addu   %[irow],    %[x_stride1]              \n\t"
-      "bgtz   %[loop_c],  1b                        \n\t"
-
-      : [src1] "+r" (src1), [accum] "=&r" (accum), [temp1] "=&r" (temp1),
-        [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), [base] "=&r" (base),
-        [frac] "=&r" (frac), [frow] "+r" (frow), [irow] "+r" (irow)
-      : [x_stride] "r" (x_stride), [x_add] "r" (x_add), [x_sub] "r" (x_sub),
-        [x_stride1] "r" (x_stride1), [loop_c] "r" (loop_c)
+      "li       %[temp3],    0x10000                    \n\t"
+      "li       %[temp4],    0x8000                     \n\t"
+      "addu     %[loop_end], %[frow],     %[temp6]      \n\t"
+    "1:                                                 \n\t"
+      "lw       %[temp0],    0(%[frow])                 \n\t"
+      "lw       %[temp1],    0(%[irow])                 \n\t"
+      "addiu    %[dst],      %[dst],      1             \n\t"
+      "mult     %[temp3],    %[temp4]                   \n\t"  // hi:lo = 1 << 31
+      "maddu    %[A],        %[temp0]                   \n\t"  // += A * frow
+      "maddu    %[B],        %[temp1]                   \n\t"  // += B * irow
+      "addiu    %[frow],     %[frow],     4             \n\t"
+      "addiu    %[irow],     %[irow],     4             \n\t"
+      "mfhi     %[temp5]                                \n\t"  // blended value
+      "mult     %[temp3],    %[temp4]                   \n\t"
+      "maddu    %[temp5],    %[temp2]                   \n\t"  // scale by fy_scale
+      "mfhi     %[temp5]                                \n\t"
+      "sb       %[temp5],    -1(%[dst])                 \n\t"
+      "bne      %[frow],     %[loop_end], 1b            \n\t"
+      : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
+        [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
+        [irow]"+r"(irow), [dst]"+r"(dst), [loop_end]"=&r"(loop_end)
+      : [temp2]"r"(temp2), [temp6]"r"(temp6), [A]"r"(A), [B]"r"(B)
       : "memory", "hi", "lo"
     );
   }
 }
 
-static void ExportRowMIPS(WebPRescaler* const wrk, int x_out) {
-  if (wrk->y_accum <= 0) {
-    uint8_t* const dst = wrk->dst;
-    int32_t* const irow = wrk->irow;
-    const int32_t* const frow = wrk->frow;
-    const int yscale = wrk->fy_scale * (-wrk->y_accum);
-    const int x_out_max = wrk->dst_width * wrk->num_channels;
-    // if wrk->fxy_scale can fit into 32 bits use optimized code,
-    // otherwise use C code
-    if ((wrk->fxy_scale >> 32) == 0) {
-      int temp0, temp1, temp3, temp4, temp5, temp6, temp7, loop_end;
-      const int temp2 = (int)(wrk->fxy_scale);
-      const int temp8 = x_out_max << 2;
-      uint8_t* dst_t = (uint8_t*)dst;
-      int32_t* irow_t = (int32_t*)irow;
-      const int32_t* frow_t = (const int32_t*)frow;
-
-      __asm__ volatile(
-        "addiu    %[temp6],    $zero,       -256          \n\t"
-        "addiu    %[temp7],    $zero,       255           \n\t"
-        "li       %[temp3],    0x10000                    \n\t"
-        "li       %[temp4],    0x8000                     \n\t"
-        "addu     %[loop_end], %[frow_t],   %[temp8]      \n\t"
-      "1:                                                 \n\t"
-        "lw       %[temp0],    0(%[frow_t])               \n\t"
-        "mult     %[temp3],    %[temp4]                   \n\t"
-        "addiu    %[frow_t],   %[frow_t],   4             \n\t"
-        "sll      %[temp0],    %[temp0],    2             \n\t"
-        "madd     %[temp0],    %[yscale]                  \n\t"
-        "mfhi     %[temp1]                                \n\t"
-        "lw       %[temp0],    0(%[irow_t])               \n\t"
-        "addiu    %[dst_t],    %[dst_t],    1             \n\t"
-        "addiu    %[irow_t],   %[irow_t],   4             \n\t"
-        "subu     %[temp0],    %[temp0],    %[temp1]      \n\t"
-        "mult     %[temp3],    %[temp4]                   \n\t"
-        "sll      %[temp0],    %[temp0],    2             \n\t"
-        "madd     %[temp0],    %[temp2]                   \n\t"
-        "mfhi     %[temp5]                                \n\t"
-        "sw       %[temp1],    -4(%[irow_t])              \n\t"
-        "and      %[temp0],    %[temp5],    %[temp6]      \n\t"
-        "slti     %[temp1],    %[temp5],    0             \n\t"
-        "beqz     %[temp0],    2f                         \n\t"
-        "xor      %[temp5],    %[temp5],    %[temp5]      \n\t"
-        "movz     %[temp5],    %[temp7],    %[temp1]      \n\t"
-      "2:                                                 \n\t"
-        "sb       %[temp5],    -1(%[dst_t])               \n\t"
-        "bne      %[frow_t],   %[loop_end], 1b            \n\t"
-
-        : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
-          [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),
-          [temp7]"=&r"(temp7), [frow_t]"+r"(frow_t), [irow_t]"+r"(irow_t),
-          [dst_t]"+r"(dst_t), [loop_end]"=&r"(loop_end)
-        : [temp2]"r"(temp2), [yscale]"r"(yscale), [temp8]"r"(temp8)
-        : "memory", "hi", "lo"
-      );
-      wrk->y_accum += wrk->y_add;
-      wrk->dst += wrk->dst_stride;
-    } else {
-      ExportRowC(wrk, x_out);
-    }
+static void ExportRowShrinkMIPS(WebPRescaler* const wrk) {
+  const int x_out_max = wrk->dst_width * wrk->num_channels;
+  uint8_t* dst = wrk->dst;
+  rescaler_t* irow = wrk->irow;
+  const rescaler_t* frow = wrk->frow;
+  const int yscale = wrk->fy_scale * (-wrk->y_accum);
+  int temp0, temp1, temp3, temp4, temp5, loop_end;
+  const int temp2 = (int)wrk->fxy_scale;
+  const int temp6 = x_out_max << 2;   // row length in bytes (4-byte entries)
+
+  assert(!WebPRescalerOutputDone(wrk));
+  assert(wrk->y_accum <= 0);
+  assert(!wrk->y_expand);
+  assert(wrk->fxy_scale != 0);
+  if (yscale) {   // a fraction of frow[] is carried over into irow[]
+    __asm__ volatile (
+      "li       %[temp3],    0x10000                    \n\t"
+      "li       %[temp4],    0x8000                     \n\t"
+      "addu     %[loop_end], %[frow],     %[temp6]      \n\t"
+    "1:                                                 \n\t"
+      "lw       %[temp0],    0(%[frow])                 \n\t"
+      "mult     %[temp3],    %[temp4]                   \n\t"  // hi:lo = 1 << 31
+      "addiu    %[frow],     %[frow],     4             \n\t"
+      "maddu    %[temp0],    %[yscale]                  \n\t"  // += frow * yscale
+      "mfhi     %[temp1]                                \n\t"  // temp1 = frac
+      "lw       %[temp0],    0(%[irow])                 \n\t"
+      "addiu    %[dst],      %[dst],      1             \n\t"
+      "addiu    %[irow],     %[irow],     4             \n\t"
+      "subu     %[temp0],    %[temp0],    %[temp1]      \n\t"  // irow - frac
+      "mult     %[temp3],    %[temp4]                   \n\t"
+      "maddu    %[temp0],    %[temp2]                   \n\t"  // scale by fxy_scale
+      "mfhi     %[temp5]                                \n\t"
+      "sw       %[temp1],    -4(%[irow])                \n\t"  // new fractional start
+      "sb       %[temp5],    -1(%[dst])                 \n\t"
+      "bne      %[frow],     %[loop_end], 1b            \n\t"
+      : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
+        [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
+        [irow]"+r"(irow), [dst]"+r"(dst), [loop_end]"=&r"(loop_end)
+      : [temp2]"r"(temp2), [yscale]"r"(yscale), [temp6]"r"(temp6)
+      : "memory", "hi", "lo"
+    );
+  } else {   // nothing carried over: irow[] is emitted, then cleared
+    __asm__ volatile (
+      "li       %[temp3],    0x10000                    \n\t"
+      "li       %[temp4],    0x8000                     \n\t"
+      "addu     %[loop_end], %[irow],     %[temp6]      \n\t"
+    "1:                                                 \n\t"
+      "lw       %[temp0],    0(%[irow])                 \n\t"
+      "addiu    %[dst],      %[dst],      1             \n\t"
+      "addiu    %[irow],     %[irow],     4             \n\t"
+      "mult     %[temp3],    %[temp4]                   \n\t"  // hi:lo = 1 << 31
+      "maddu    %[temp0],    %[temp2]                   \n\t"  // += irow * fxy_scale
+      "mfhi     %[temp5]                                \n\t"
+      "sw       $zero,       -4(%[irow])                \n\t"  // reset accumulator
+      "sb       %[temp5],    -1(%[dst])                 \n\t"
+      "bne      %[irow],     %[loop_end], 1b            \n\t"
+      : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
+        [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [irow]"+r"(irow),
+        [dst]"+r"(dst), [loop_end]"=&r"(loop_end)
+      : [temp2]"r"(temp2), [temp6]"r"(temp6)
+      : "memory", "hi", "lo"
+    );
   }
 }
+
 #endif   // WEBP_USE_MIPS32
 
 //------------------------------------------------------------------------------
 
 void WebPRescalerInit(WebPRescaler* const wrk, int src_width, int src_height,
-                      uint8_t* const dst, int dst_width, int dst_height,
-                      int dst_stride, int num_channels, int x_add, int x_sub,
-                      int y_add, int y_sub, int32_t* const work) {
+                      uint8_t* const dst,
+                      int dst_width, int dst_height, int dst_stride,
+                      int num_channels, rescaler_t* const work) {
+  const int x_add = src_width, x_sub = dst_width;
+  const int y_add = src_height, y_sub = dst_height;
   wrk->x_expand = (src_width < dst_width);
+  wrk->y_expand = (src_height < dst_height);
   wrk->src_width = src_width;
   wrk->src_height = src_height;
   wrk->dst_width = dst_width;
   wrk->dst_height = dst_height;
+  wrk->src_y = 0;   // no row imported yet
+  wrk->dst_y = 0;   // no row exported yet
   wrk->dst = dst;
   wrk->dst_stride = dst_stride;
   wrk->num_channels = num_channels;
+
   // for 'x_expand', we use bilinear interpolation
-  wrk->x_add = wrk->x_expand ? (x_sub - 1) : x_add - x_sub;
+  wrk->x_add = wrk->x_expand ? (x_sub - 1) : x_add;
   wrk->x_sub = wrk->x_expand ? (x_add - 1) : x_sub;
-  wrk->y_accum = y_add;
-  wrk->y_add = y_add;
-  wrk->y_sub = y_sub;
-  wrk->fx_scale = (1 << RFIX) / x_sub;
-  wrk->fy_scale = (1 << RFIX) / y_sub;
-  wrk->fxy_scale = wrk->x_expand ?
-      ((int64_t)dst_height << RFIX) / (x_sub * src_height) :
-      ((int64_t)dst_height << RFIX) / (x_add * src_height);
+  if (!wrk->x_expand) {  // fx_scale is not used otherwise
+    wrk->fx_scale = WEBP_RESCALER_FRAC(1, wrk->x_sub);
+  }
+  // vertical scaling parameters
+  wrk->y_add = wrk->y_expand ? y_add - 1 : y_add;
+  wrk->y_sub = wrk->y_expand ? y_sub - 1 : y_sub;
+  wrk->y_accum = wrk->y_expand ? wrk->y_sub : wrk->y_add;
+  if (!wrk->y_expand) {
+    // this is WEBP_RESCALER_FRAC(dst_height, x_add * y_add) without the cast.
+    const uint64_t ratio = (uint64_t)dst_height * WEBP_RESCALER_ONE /
+        ((uint64_t)wrk->x_add * wrk->y_add);   // 64-bit: no 'int' overflow
+    if (ratio != (uint32_t)ratio) {
+      // We can't represent the ratio with the current fixed-point precision.
+      // => We special-case fxy_scale = 0, in WebPRescalerExportRow().
+      wrk->fxy_scale = 0;
+    } else {
+      wrk->fxy_scale = (uint32_t)ratio;
+    }
+    wrk->fy_scale = WEBP_RESCALER_FRAC(1, wrk->y_sub);
+  } else {
+    wrk->fy_scale = WEBP_RESCALER_FRAC(1, wrk->x_add);
+    // wrk->fxy_scale is unused here.
+  }
   wrk->irow = work;
   wrk->frow = work + num_channels * dst_width;
+  memset(work, 0, 2 * dst_width * num_channels * sizeof(*work));  // clear both rows
 
-  if (WebPRescalerImportRow == NULL) {
-    WebPRescalerImportRow = ImportRowC;
-    WebPRescalerExportRow = ExportRowC;
+  if (WebPRescalerImportRowExpand == NULL) {   // install row functions once
+    WebPRescalerImportRowExpand = ImportRowExpandC;
+    WebPRescalerImportRowShrink = ImportRowShrinkC;
+    WebPRescalerExportRowExpand = ExportRowExpandC;
+    WebPRescalerExportRowShrink = ExportRowShrinkC;
     if (VP8GetCPUInfo != NULL) {
 #if defined(WEBP_USE_MIPS32)
       if (VP8GetCPUInfo(kMIPS32)) {
-        WebPRescalerImportRow = ImportRowMIPS;
-        WebPRescalerExportRow = ExportRowMIPS;
+        WebPRescalerImportRowExpand = ImportRowExpandMIPS;
+        WebPRescalerImportRowShrink = ImportRowShrinkMIPS;
+        WebPRescalerExportRowExpand = ExportRowExpandMIPS;
+        WebPRescalerExportRowShrink = ExportRowShrinkMIPS;
       }
 #endif
     }
@@ -296,7 +529,10 @@ void WebPRescalerInit(WebPRescaler* const wrk, int src_width, int src_height,
 }
 
 #undef MULT_FIX
-#undef RFIX
+#undef WEBP_RESCALER_RFIX
+#undef WEBP_RESCALER_ONE
+#undef WEBP_RESCALER_FRAC
+#undef ROUNDER
 
 //------------------------------------------------------------------------------
 // all-in-one calls
@@ -309,11 +545,20 @@ int WebPRescaleNeededLines(const WebPRescaler* const wrk, int max_num_lines) {
 int WebPRescalerImport(WebPRescaler* const wrk, int num_lines,
                        const uint8_t* src, int src_stride) {
   int total_imported = 0;
-  while (total_imported < num_lines && wrk->y_accum > 0) {
-    int channel;
-    for (channel = 0; channel < wrk->num_channels; ++channel) {
-      WebPRescalerImportRow(wrk, src, channel);
+  while (total_imported < num_lines && !WebPRescalerHasPendingOutput(wrk)) {
+    if (wrk->y_expand) {
+      rescaler_t* const tmp = wrk->irow;
+      wrk->irow = wrk->frow;
+      wrk->frow = tmp;
+    }
+    WebPRescalerImportRow(wrk, src);
+    if (!wrk->y_expand) {     // Accumulate the contribution of the new row.
+      int x;
+      for (x = 0; x < wrk->num_channels * wrk->dst_width; ++x) {
+        wrk->irow[x] += wrk->frow[x];
+      }
     }
+    ++wrk->src_y;
     src += src_stride;
     ++total_imported;
     wrk->y_accum -= wrk->y_sub;
@@ -324,7 +569,7 @@ int WebPRescalerImport(WebPRescaler* const wrk, int num_lines,
 int WebPRescalerExport(WebPRescaler* const rescaler) {
   int total_exported = 0;
   while (WebPRescalerHasPendingOutput(rescaler)) {
-    WebPRescalerExportRow(rescaler, 0);
+    WebPRescalerExportRow(rescaler);
     ++total_exported;
   }
   return total_exported;
diff --git a/src/3rdparty/libwebp/src/utils/rescaler.h b/src/3rdparty/libwebp/src/utils/rescaler.h
index a6f3787..8244cfe 100644
--- a/src/3rdparty/libwebp/src/utils/rescaler.h
+++ b/src/3rdparty/libwebp/src/utils/rescaler.h
@@ -21,20 +21,23 @@ extern "C" {
 #include "../webp/types.h"
 
 // Structure used for on-the-fly rescaling
+typedef uint32_t rescaler_t;   // type for side-buffer
 typedef struct {
   int x_expand;               // true if we're expanding in the x direction
+  int y_expand;               // true if we're expanding in the y direction
   int num_channels;           // bytes to jump between pixels
-  int fy_scale, fx_scale;     // fixed-point scaling factor
-  int64_t fxy_scale;          // ''
-  // we need hpel-precise add/sub increments, for the downsampled U/V planes.
+  uint32_t fx_scale;          // fixed-point 1/x_sub (set only if !x_expand)
+  uint32_t fy_scale;          // fixed-point 1/y_sub, or 1/x_add if y_expand
+  uint32_t fxy_scale;         // 0 if not representable; unused if y_expand
   int y_accum;                // vertical accumulator
-  int y_add, y_sub;           // vertical increments (add ~= src, sub ~= dst)
-  int x_add, x_sub;           // horizontal increments (add ~= src, sub ~= dst)
+  int y_add, y_sub;           // vertical increments (from src/dst heights)
+  int x_add, x_sub;           // horizontal increments (from src/dst widths)
   int src_width, src_height;  // source dimensions
   int dst_width, dst_height;  // destination dimensions
+  int src_y, dst_y;           // rows imported so far / rows exported so far
   uint8_t* dst;
   int dst_stride;
-  int32_t* irow, *frow;       // work buffer
+  rescaler_t* irow, *frow;    // two work rows (dst_width * num_channels each)
 } WebPRescaler;
 
 // Initialize a rescaler given scratch area 'work' and dimensions of src & dst.
@@ -43,9 +46,7 @@ void WebPRescalerInit(WebPRescaler* const rescaler,
                       uint8_t* const dst,
                       int dst_width, int dst_height, int dst_stride,
                       int num_channels,
-                      int x_add, int x_sub,
-                      int y_add, int y_sub,
-                      int32_t* const work);
+                      rescaler_t* const work);
 
 // Returns the number of input lines needed next to produce one output line,
 // considering that the maximum available input lines are 'max_num_lines'.
@@ -57,21 +58,29 @@ int WebPRescaleNeededLines(const WebPRescaler* const rescaler,
 int WebPRescalerImport(WebPRescaler* const rescaler, int num_rows,
                        const uint8_t* src, int src_stride);
 
-// Import a row of data and save its contribution in the rescaler.
-// 'channel' denotes the channel number to be imported.
-extern void (*WebPRescalerImportRow)(WebPRescaler* const wrk,
-                                     const uint8_t* const src, int channel);
+// Export as many rows as possible. Return the number of rows written.
+int WebPRescalerExport(WebPRescaler* const rescaler);
+void WebPRescalerImportRow(WebPRescaler* const wrk,
+                           const uint8_t* src);
 // Export one row (starting at x_out position) from rescaler.
-extern void (*WebPRescalerExportRow)(WebPRescaler* const wrk, int x_out);
+void WebPRescalerExportRow(WebPRescaler* const wrk);
 
-// Return true if there is pending output rows ready.
+// Return true once all 'src_height' source rows have been imported.
 static WEBP_INLINE
-int WebPRescalerHasPendingOutput(const WebPRescaler* const rescaler) {
-  return (rescaler->y_accum <= 0);
+int WebPRescalerInputDone(const WebPRescaler* const rescaler) {
+  return (rescaler->src_y >= rescaler->src_height);
+}
+// Return true once all 'dst_height' output rows have been exported.
+static WEBP_INLINE
+int WebPRescalerOutputDone(const WebPRescaler* const rescaler) {
+  return (rescaler->dst_y >= rescaler->dst_height);
 }
 
-// Export as many rows as possible. Return the numbers of rows written.
-int WebPRescalerExport(WebPRescaler* const rescaler);
+// Return true if a row is ready for export and output is not finished yet.
+static WEBP_INLINE
+int WebPRescalerHasPendingOutput(const WebPRescaler* const rescaler) {
+  return !WebPRescalerOutputDone(rescaler) && (rescaler->y_accum <= 0);
+}
 
 //------------------------------------------------------------------------------
 
diff --git a/src/3rdparty/libwebp/src/utils/utils.h b/src/3rdparty/libwebp/src/utils/utils.h
index f2c498a..0bbbcab 100644
--- a/src/3rdparty/libwebp/src/utils/utils.h
+++ b/src/3rdparty/libwebp/src/utils/utils.h
@@ -90,7 +90,7 @@ static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
 #pragma intrinsic(_BitScanReverse)
 
 static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
-  uint32_t first_set_bit;
+  unsigned long first_set_bit;   // _BitScanReverse() expects 'unsigned long*'
   _BitScanReverse(&first_set_bit, n);
   return first_set_bit;
 }