1 files changed, 547 insertions, 434 deletions
diff --git a/src/3rdparty/libwebp/src/dsp/lossless.c b/src/3rdparty/libwebp/src/dsp/lossless.c
index bab76d2..ee334bc 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless.c
@@ -15,21 +15,16 @@
 
 #include "./dsp.h"
 
-#if defined(WEBP_USE_SSE2)
-#include <emmintrin.h>
-#endif
-
 #include <math.h>
 #include <stdlib.h>
-#include "./lossless.h"
 #include "../dec/vp8li.h"
+#include "../utils/endian_inl.h"
+#include "./lossless.h"
 #include "./yuv.h"
 
 #define MAX_DIFF_COST (1e30f)
 
 // lookup table for small values of log2(int)
-#define APPROX_LOG_MAX  4096
-#define LOG_2_RECIPROCAL 1.44269504088896338700465094007086
 const float kLog2Table[LOG_LOOKUP_IDX_MAX] = {
   0.0000000000000000f, 0.0000000000000000f,
   1.0000000000000000f, 1.5849625007211560f,
@@ -331,30 +326,59 @@ const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX] = {
   112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126
 };
 
-float VP8LFastSLog2Slow(int v) {
+// Threshold below which the approximate version of log_2 can be used.
+// In practice, we could drop the call to log() entirely, as the two values
+// match to a very high degree (their ratio is 0.99999x).
+// Keeping a high threshold for now.
+#define APPROX_LOG_WITH_CORRECTION_MAX  65536
+#define APPROX_LOG_MAX                   4096
+#define LOG_2_RECIPROCAL 1.44269504088896338700465094007086
+static float FastSLog2Slow(uint32_t v) {
   assert(v >= LOG_LOOKUP_IDX_MAX);
-  if (v < APPROX_LOG_MAX) {
+  if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
     int log_cnt = 0;
+    uint32_t y = 1;
+    int correction = 0;
     const float v_f = (float)v;
-    while (v >= LOG_LOOKUP_IDX_MAX) {
+    const uint32_t orig_v = v;
+    do {
       ++log_cnt;
       v = v >> 1;
-    }
-    return v_f * (kLog2Table[v] + log_cnt);
+      y = y << 1;
+    } while (v >= LOG_LOOKUP_IDX_MAX);
+    // vf = (2^log_cnt) * Xf; where y = 2^log_cnt and Xf < 256
+    // Xf = floor(Xf) * (1 + (v % y) / v)
+    // log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v)
+    // The correction factor: log(1 + d) ~ d; for very small d values, so
+    // log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y)/v
+    // LOG_2_RECIPROCAL ~ 23/16
+    correction = (23 * (orig_v & (y - 1))) >> 4;
+    return v_f * (kLog2Table[v] + log_cnt) + correction;
   } else {
     return (float)(LOG_2_RECIPROCAL * v * log((double)v));
   }
 }
 
-float VP8LFastLog2Slow(int v) {
+static float FastLog2Slow(uint32_t v) {
   assert(v >= LOG_LOOKUP_IDX_MAX);
-  if (v < APPROX_LOG_MAX) {
+  if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
     int log_cnt = 0;
-    while (v >= LOG_LOOKUP_IDX_MAX) {
+    uint32_t y = 1;
+    const uint32_t orig_v = v;
+    double log_2;
+    do {
       ++log_cnt;
       v = v >> 1;
+      y = y << 1;
+    } while (v >= LOG_LOOKUP_IDX_MAX);
+    log_2 = kLog2Table[v] + log_cnt;
+    if (orig_v >= APPROX_LOG_MAX) {
+      // Since the division is still expensive, add this correction factor only
+      // for large values of 'v'.
+      const int correction = (23 * (orig_v & (y - 1))) >> 4;
+      log_2 += (double)correction / orig_v;
     }
-    return kLog2Table[v] + log_cnt;
+    return (float)log_2;
   } else {
     return (float)(LOG_2_RECIPROCAL * log((double)v));
   }
@@ -363,6 +387,9 @@ float VP8LFastLog2Slow(int v) {
 //------------------------------------------------------------------------------
 // Image transforms.
 
+// Mostly used to reduce code size and improve readability.
+static WEBP_INLINE int GetMin(int a, int b) { return (a > b) ? b : a; }
+
 // In-place sum of each component with mod 256.
 static WEBP_INLINE void AddPixelsEq(uint32_t* a, uint32_t b) {
   const uint32_t alpha_and_green = (*a & 0xff00ff00u) + (b & 0xff00ff00u);
@@ -406,7 +433,7 @@ static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
                                          (c1 >> 8) & 0xff,
                                          (c2 >> 8) & 0xff);
   const int b = AddSubtractComponentFull(c0 & 0xff, c1 & 0xff, c2 & 0xff);
-  return (a << 24) | (r << 16) | (g << 8) | b;
+  return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b;
 }
 
 static WEBP_INLINE int AddSubtractComponentHalf(int a, int b) {
@@ -420,15 +447,24 @@ static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
   const int r = AddSubtractComponentHalf((ave >> 16) & 0xff, (c2 >> 16) & 0xff);
   const int g = AddSubtractComponentHalf((ave >> 8) & 0xff, (c2 >> 8) & 0xff);
   const int b = AddSubtractComponentHalf((ave >> 0) & 0xff, (c2 >> 0) & 0xff);
-  return (a << 24) | (r << 16) | (g << 8) | b;
+  return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b;
 }
 
-static WEBP_INLINE int Sub3(int a, int b, int c) {
+// gcc-4.9 on ARM generates incorrect code in Select() when Sub3() is inlined.
+#if defined(__arm__) && LOCAL_GCC_VERSION == 0x409
+# define LOCAL_INLINE __attribute__ ((noinline))
+#else
+# define LOCAL_INLINE WEBP_INLINE
+#endif
+
+static LOCAL_INLINE int Sub3(int a, int b, int c) {
   const int pb = b - c;
   const int pa = a - c;
   return abs(pb) - abs(pa);
 }
 
+#undef LOCAL_INLINE
+
 static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
   const int pa_minus_pb =
       Sub3((a >> 24)       , (b >> 24)       , (c >> 24)       ) +
@@ -489,21 +525,19 @@ static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
   return pred;
 }
 static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = VP8LSelect(top[0], left, top[-1]);
+  const uint32_t pred = Select(top[0], left, top[-1]);
   return pred;
 }
 static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = VP8LClampedAddSubtractFull(left, top[0], top[-1]);
+  const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
   return pred;
 }
 static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = VP8LClampedAddSubtractHalf(left, top[0], top[-1]);
+  const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
   return pred;
 }
 
-// TODO(vikasa): Export the predictor array, to allow SSE2 variants.
-typedef uint32_t (*PredictorFunc)(uint32_t left, const uint32_t* const top);
-static const PredictorFunc kPredictors[16] = {
+static const VP8LPredictorFunc kPredictorsC[16] = {
   Predictor0, Predictor1, Predictor2, Predictor3,
   Predictor4, Predictor5, Predictor6, Predictor7,
   Predictor8, Predictor9, Predictor10, Predictor11,
@@ -511,10 +545,9 @@ static const PredictorFunc kPredictors[16] = {
   Predictor0, Predictor0    // <- padding security sentinels
 };
 
-// TODO(vikasa): Replace 256 etc with defines.
-static float PredictionCostSpatial(const int* counts,
-                                   int weight_0, double exp_val) {
-  const int significant_symbols = 16;
+static float PredictionCostSpatial(const int counts[256], int weight_0,
+                                   double exp_val) {
+  const int significant_symbols = 256 >> 4;
   const double exp_decay_factor = 0.6;
   double bits = weight_0 * counts[0];
   int i;
@@ -526,19 +559,19 @@ static float PredictionCostSpatial(const int* counts,
 }
 
 // Compute the combined Shanon's entropy for distribution {X} and {X+Y}
-static float CombinedShannonEntropy(const int* const X,
-                                    const int* const Y, int n) {
+static float CombinedShannonEntropy(const int X[256], const int Y[256]) {
   int i;
   double retval = 0.;
   int sumX = 0, sumXY = 0;
-  for (i = 0; i < n; ++i) {
+  for (i = 0; i < 256; ++i) {
     const int x = X[i];
-    const int xy = X[i] + Y[i];
+    const int xy = x + Y[i];
     if (x != 0) {
       sumX += x;
       retval -= VP8LFastSLog2(x);
-    }
-    if (xy != 0) {
+      sumXY += xy;
+      retval -= VP8LFastSLog2(xy);
+    } else if (xy != 0) {
       sumXY += xy;
       retval -= VP8LFastSLog2(xy);
     }
@@ -547,50 +580,53 @@ static float CombinedShannonEntropy(const int* const X,
   return (float)retval;
 }
 
-static float PredictionCostSpatialHistogram(int accumulated[4][256],
-                                            int tile[4][256]) {
+static float PredictionCostSpatialHistogram(const int accumulated[4][256],
+                                            const int tile[4][256]) {
   int i;
   double retval = 0;
   for (i = 0; i < 4; ++i) {
     const double kExpValue = 0.94;
     retval += PredictionCostSpatial(tile[i], 1, kExpValue);
-    retval += CombinedShannonEntropy(tile[i], accumulated[i], 256);
+    retval += CombinedShannonEntropy(tile[i], accumulated[i]);
   }
   return (float)retval;
 }
 
+static WEBP_INLINE void UpdateHisto(int histo_argb[4][256], uint32_t argb) {
+  ++histo_argb[0][argb >> 24];
+  ++histo_argb[1][(argb >> 16) & 0xff];
+  ++histo_argb[2][(argb >> 8) & 0xff];
+  ++histo_argb[3][argb & 0xff];
+}
+
 static int GetBestPredictorForTile(int width, int height,
                                    int tile_x, int tile_y, int bits,
-                                   int accumulated[4][256],
+                                   const int accumulated[4][256],
                                    const uint32_t* const argb_scratch) {
   const int kNumPredModes = 14;
   const int col_start = tile_x << bits;
   const int row_start = tile_y << bits;
   const int tile_size = 1 << bits;
-  const int ymax = (tile_size <= height - row_start) ?
-      tile_size : height - row_start;
-  const int xmax = (tile_size <= width - col_start) ?
-      tile_size : width - col_start;
-  int histo[4][256];
+  const int max_y = GetMin(tile_size, height - row_start);
+  const int max_x = GetMin(tile_size, width - col_start);
   float best_diff = MAX_DIFF_COST;
   int best_mode = 0;
-
   int mode;
   for (mode = 0; mode < kNumPredModes; ++mode) {
     const uint32_t* current_row = argb_scratch;
-    const PredictorFunc pred_func = kPredictors[mode];
+    const VP8LPredictorFunc pred_func = VP8LPredictors[mode];
     float cur_diff;
     int y;
-    memset(&histo[0][0], 0, sizeof(histo));
-    for (y = 0; y < ymax; ++y) {
+    int histo_argb[4][256];
+    memset(histo_argb, 0, sizeof(histo_argb));
+    for (y = 0; y < max_y; ++y) {
       int x;
       const int row = row_start + y;
       const uint32_t* const upper_row = current_row;
       current_row = upper_row + width;
-      for (x = 0; x < xmax; ++x) {
+      for (x = 0; x < max_x; ++x) {
         const int col = col_start + x;
         uint32_t predict;
-        uint32_t predict_diff;
         if (row == 0) {
           predict = (col == 0) ? ARGB_BLACK : current_row[col - 1];  // Left.
         } else if (col == 0) {
@@ -598,14 +634,11 @@ static int GetBestPredictorForTile(int width, int height,
         } else {
           predict = pred_func(current_row[col - 1], upper_row + col);
         }
-        predict_diff = VP8LSubPixels(current_row[col], predict);
-        ++histo[0][predict_diff >> 24];
-        ++histo[1][((predict_diff >> 16) & 0xff)];
-        ++histo[2][((predict_diff >> 8) & 0xff)];
-        ++histo[3][(predict_diff & 0xff)];
+        UpdateHisto(histo_argb, VP8LSubPixels(current_row[col], predict));
       }
     }
-    cur_diff = PredictionCostSpatialHistogram(accumulated, histo);
+    cur_diff = PredictionCostSpatialHistogram(
+        accumulated, (const int (*)[256])histo_argb);
     if (cur_diff < best_diff) {
       best_diff = cur_diff;
       best_mode = mode;
@@ -622,20 +655,18 @@ static void CopyTileWithPrediction(int width, int height,
   const int col_start = tile_x << bits;
   const int row_start = tile_y << bits;
   const int tile_size = 1 << bits;
-  const int ymax = (tile_size <= height - row_start) ?
-      tile_size : height - row_start;
-  const int xmax = (tile_size <= width - col_start) ?
-      tile_size : width - col_start;
-  const PredictorFunc pred_func = kPredictors[mode];
+  const int max_y = GetMin(tile_size, height - row_start);
+  const int max_x = GetMin(tile_size, width - col_start);
+  const VP8LPredictorFunc pred_func = VP8LPredictors[mode];
   const uint32_t* current_row = argb_scratch;
 
   int y;
-  for (y = 0; y < ymax; ++y) {
+  for (y = 0; y < max_y; ++y) {
     int x;
     const int row = row_start + y;
     const uint32_t* const upper_row = current_row;
     current_row = upper_row + width;
-    for (x = 0; x < xmax; ++x) {
+    for (x = 0; x < max_x; ++x) {
       const int col = col_start + x;
       const int pix = row * width + col;
       uint32_t predict;
@@ -681,7 +712,8 @@ void VP8LResidualImage(int width, int height, int bits,
       if (all_x_max > width) {
         all_x_max = width;
       }
-      pred = GetBestPredictorForTile(width, height, tile_x, tile_y, bits, histo,
+      pred = GetBestPredictorForTile(width, height, tile_x, tile_y, bits,
+                                     (const int (*)[256])histo,
                                      argb_scratch);
       image[tile_y * tiles_per_row + tile_x] = 0xff000000u | (pred << 8);
       CopyTileWithPrediction(width, height, tile_x, tile_y, bits, pred,
@@ -695,11 +727,7 @@ void VP8LResidualImage(int width, int height, int bits,
         }
         ix = all_y * width + tile_x_offset;
         for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) {
-          const uint32_t a = argb[ix];
-          ++histo[0][a >> 24];
-          ++histo[1][((a >> 16) & 0xff)];
-          ++histo[2][((a >> 8) & 0xff)];
-          ++histo[3][(a & 0xff)];
+          UpdateHisto(histo, argb[ix]);
         }
       }
     }
@@ -724,29 +752,36 @@ static void PredictorInverseTransform(const VP8LTransform* const transform,
 
   {
     int y = y_start;
-    const int mask = (1 << transform->bits_) - 1;
+    const int tile_width = 1 << transform->bits_;
+    const int mask = tile_width - 1;
+    const int safe_width = width & ~mask;
     const int tiles_per_row = VP8LSubSampleSize(width, transform->bits_);
     const uint32_t* pred_mode_base =
         transform->data_ + (y >> transform->bits_) * tiles_per_row;
 
     while (y < y_end) {
-      int x;
       const uint32_t pred2 = Predictor2(data[-1], data - width);
       const uint32_t* pred_mode_src = pred_mode_base;
-      PredictorFunc pred_func;
-
+      VP8LPredictorFunc pred_func;
+      int x = 1;
+      int t = 1;
       // First pixel follows the T (mode=2) mode.
       AddPixelsEq(data, pred2);
-
       // .. the rest:
-      pred_func = kPredictors[((*pred_mode_src++) >> 8) & 0xf];
-      for (x = 1; x < width; ++x) {
-        uint32_t pred;
-        if ((x & mask) == 0) {    // start of tile. Read predictor function.
-          pred_func = kPredictors[((*pred_mode_src++) >> 8) & 0xf];
+      while (x < safe_width) {
+        pred_func = VP8LPredictors[((*pred_mode_src++) >> 8) & 0xf];
+        for (; t < tile_width; ++t, ++x) {
+          const uint32_t pred = pred_func(data[x - 1], data + x - width);
+          AddPixelsEq(data + x, pred);
+        }
+        t = 0;
+      }
+      if (x < width) {
+        pred_func = VP8LPredictors[((*pred_mode_src++) >> 8) & 0xf];
+        for (; x < width; ++x) {
+          const uint32_t pred = pred_func(data[x - 1], data + x - width);
+          AddPixelsEq(data + x, pred);
         }
-        pred = pred_func(data[x - 1], data + x - width);
-        AddPixelsEq(data + x, pred);
       }
       data += width;
       ++y;
@@ -757,9 +792,9 @@ static void PredictorInverseTransform(const VP8LTransform* const transform,
   }
 }
 
-static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixs) {
-  int i = 0;
-  for (; i < num_pixs; ++i) {
+void VP8LSubtractGreenFromBlueAndRed_C(uint32_t* argb_data, int num_pixels) {
+  int i;
+  for (i = 0; i < num_pixels; ++i) {
     const uint32_t argb = argb_data[i];
     const uint32_t green = (argb >> 8) & 0xff;
     const uint32_t new_r = (((argb >> 16) & 0xff) - green) & 0xff;
@@ -770,26 +805,19 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixs) {
 
 // Add green to blue and red channels (i.e. perform the inverse transform of
 // 'subtract green').
-static void AddGreenToBlueAndRed(uint32_t* data, const uint32_t* data_end) {
-  while (data < data_end) {
-    const uint32_t argb = *data;
+void VP8LAddGreenToBlueAndRed_C(uint32_t* data, int num_pixels) {
+  int i;
+  for (i = 0; i < num_pixels; ++i) {
+    const uint32_t argb = data[i];
     const uint32_t green = ((argb >> 8) & 0xff);
     uint32_t red_blue = (argb & 0x00ff00ffu);
     red_blue += (green << 16) | green;
     red_blue &= 0x00ff00ffu;
-    *data++ = (argb & 0xff00ff00u) | red_blue;
+    data[i] = (argb & 0xff00ff00u) | red_blue;
   }
 }
 
-typedef struct {
-  // Note: the members are uint8_t, so that any negative values are
-  // automatically converted to "mod 256" values.
-  uint8_t green_to_red_;
-  uint8_t green_to_blue_;
-  uint8_t red_to_blue_;
-} Multipliers;
-
-static WEBP_INLINE void MultipliersClear(Multipliers* m) {
+static WEBP_INLINE void MultipliersClear(VP8LMultipliers* const m) {
   m->green_to_red_ = 0;
   m->green_to_blue_ = 0;
   m->red_to_blue_ = 0;
@@ -801,40 +829,54 @@ static WEBP_INLINE uint32_t ColorTransformDelta(int8_t color_pred,
 }
 
 static WEBP_INLINE void ColorCodeToMultipliers(uint32_t color_code,
-                                               Multipliers* const m) {
+                                               VP8LMultipliers* const m) {
   m->green_to_red_  = (color_code >>  0) & 0xff;
   m->green_to_blue_ = (color_code >>  8) & 0xff;
   m->red_to_blue_   = (color_code >> 16) & 0xff;
 }
 
-static WEBP_INLINE uint32_t MultipliersToColorCode(Multipliers* const m) {
+static WEBP_INLINE uint32_t MultipliersToColorCode(
+    const VP8LMultipliers* const m) {
   return 0xff000000u |
          ((uint32_t)(m->red_to_blue_) << 16) |
          ((uint32_t)(m->green_to_blue_) << 8) |
          m->green_to_red_;
 }
 
-static WEBP_INLINE uint32_t TransformColor(const Multipliers* const m,
-                                           uint32_t argb, int inverse) {
-  const uint32_t green = argb >> 8;
-  const uint32_t red = argb >> 16;
-  uint32_t new_red = red;
-  uint32_t new_blue = argb;
+void VP8LTransformColor_C(const VP8LMultipliers* const m, uint32_t* data,
+                          int num_pixels) {
+  int i;
+  for (i = 0; i < num_pixels; ++i) {
+    const uint32_t argb = data[i];
+    const uint32_t green = argb >> 8;
+    const uint32_t red = argb >> 16;
+    uint32_t new_red = red;
+    uint32_t new_blue = argb;
+    new_red -= ColorTransformDelta(m->green_to_red_, green);
+    new_red &= 0xff;
+    new_blue -= ColorTransformDelta(m->green_to_blue_, green);
+    new_blue -= ColorTransformDelta(m->red_to_blue_, red);
+    new_blue &= 0xff;
+    data[i] = (argb & 0xff00ff00u) | (new_red << 16) | (new_blue);
+  }
+}
 
-  if (inverse) {
+void VP8LTransformColorInverse_C(const VP8LMultipliers* const m, uint32_t* data,
+                                 int num_pixels) {
+  int i;
+  for (i = 0; i < num_pixels; ++i) {
+    const uint32_t argb = data[i];
+    const uint32_t green = argb >> 8;
+    const uint32_t red = argb >> 16;
+    uint32_t new_red = red;
+    uint32_t new_blue = argb;
     new_red += ColorTransformDelta(m->green_to_red_, green);
     new_red &= 0xff;
     new_blue += ColorTransformDelta(m->green_to_blue_, green);
     new_blue += ColorTransformDelta(m->red_to_blue_, new_red);
     new_blue &= 0xff;
-  } else {
-    new_red -= ColorTransformDelta(m->green_to_red_, green);
-    new_red &= 0xff;
-    new_blue -= ColorTransformDelta(m->green_to_blue_, green);
-    new_blue -= ColorTransformDelta(m->red_to_blue_, red);
-    new_blue &= 0xff;
+    data[i] = (argb & 0xff00ff00u) | (new_red << 16) | (new_blue);
   }
-  return (argb & 0xff00ff00u) | (new_red << 16) | (new_blue);
 }
 
 static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red,
@@ -856,225 +898,251 @@ static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue,
   return (new_blue & 0xff);
 }
 
-static WEBP_INLINE int SkipRepeatedPixels(const uint32_t* const argb,
-                                          int ix, int xsize) {
-  const uint32_t v = argb[ix];
-  if (ix >= xsize + 3) {
-    if (v == argb[ix - xsize] &&
-        argb[ix - 1] == argb[ix - xsize - 1] &&
-        argb[ix - 2] == argb[ix - xsize - 2] &&
-        argb[ix - 3] == argb[ix - xsize - 3]) {
-      return 1;
-    }
-    return v == argb[ix - 3] && v == argb[ix - 2] && v == argb[ix - 1];
-  } else if (ix >= 3) {
-    return v == argb[ix - 3] && v == argb[ix - 2] && v == argb[ix - 1];
-  }
-  return 0;
-}
-
 static float PredictionCostCrossColor(const int accumulated[256],
                                       const int counts[256]) {
   // Favor low entropy, locally and globally.
   // Favor small absolute values for PredictionCostSpatial
   static const double kExpValue = 2.4;
-  return CombinedShannonEntropy(counts, accumulated, 256) +
+  return CombinedShannonEntropy(counts, accumulated) +
          PredictionCostSpatial(counts, 3, kExpValue);
 }
 
-static Multipliers GetBestColorTransformForTile(
-    int tile_x, int tile_y, int bits,
-    Multipliers prevX,
-    Multipliers prevY,
-    int step, int xsize, int ysize,
-    int* accumulated_red_histo,
-    int* accumulated_blue_histo,
-    const uint32_t* const argb) {
-  float best_diff = MAX_DIFF_COST;
+static float GetPredictionCostCrossColorRed(
+    int tile_x_offset, int tile_y_offset, int all_x_max, int all_y_max,
+    int xsize, VP8LMultipliers prev_x, VP8LMultipliers prev_y, int green_to_red,
+    const int accumulated_red_histo[256], const uint32_t* const argb) {
+  int all_y;
+  int histo[256] = { 0 };
   float cur_diff;
-  const int halfstep = step / 2;
-  const int max_tile_size = 1 << bits;
-  const int tile_y_offset = tile_y * max_tile_size;
-  const int tile_x_offset = tile_x * max_tile_size;
-  int green_to_red;
-  int green_to_blue;
-  int red_to_blue;
-  int all_x_max = tile_x_offset + max_tile_size;
-  int all_y_max = tile_y_offset + max_tile_size;
-  Multipliers best_tx;
-  MultipliersClear(&best_tx);
-  if (all_x_max > xsize) {
-    all_x_max = xsize;
+  for (all_y = tile_y_offset; all_y < all_y_max; ++all_y) {
+    int ix = all_y * xsize + tile_x_offset;
+    int all_x;
+    for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) {
+      ++histo[TransformColorRed(green_to_red, argb[ix])];  // red.
+    }
   }
-  if (all_y_max > ysize) {
-    all_y_max = ysize;
+  cur_diff = PredictionCostCrossColor(accumulated_red_histo, histo);
+  if ((uint8_t)green_to_red == prev_x.green_to_red_) {
+    cur_diff -= 3;  // favor keeping the areas locally similar
   }
-
-  for (green_to_red = -64; green_to_red <= 64; green_to_red += halfstep) {
-    int histo[256] = { 0 };
-    int all_y;
-
-    for (all_y = tile_y_offset; all_y < all_y_max; ++all_y) {
-      int ix = all_y * xsize + tile_x_offset;
-      int all_x;
-      for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) {
-        if (SkipRepeatedPixels(argb, ix, xsize)) {
-          continue;
-        }
-        ++histo[TransformColorRed(green_to_red, argb[ix])];  // red.
-      }
-    }
-    cur_diff = PredictionCostCrossColor(&accumulated_red_histo[0], &histo[0]);
-    if ((uint8_t)green_to_red == prevX.green_to_red_) {
-      cur_diff -= 3;  // favor keeping the areas locally similar
+  if ((uint8_t)green_to_red == prev_y.green_to_red_) {
+    cur_diff -= 3;  // favor keeping the areas locally similar
+  }
+  if (green_to_red == 0) {
+    cur_diff -= 3;
+  }
+  return cur_diff;
+}
+
+static void GetBestGreenToRed(
+    int tile_x_offset, int tile_y_offset, int all_x_max, int all_y_max,
+    int xsize, VP8LMultipliers prev_x, VP8LMultipliers prev_y,
+    const int accumulated_red_histo[256], const uint32_t* const argb,
+    VP8LMultipliers* const best_tx) {
+  int min_green_to_red = -64;
+  int max_green_to_red = 64;
+  int green_to_red = 0;
+  int eval_min = 1;
+  int eval_max = 1;
+  float cur_diff_min = MAX_DIFF_COST;
+  float cur_diff_max = MAX_DIFF_COST;
+  // Do a binary search to find the optimal green_to_red color transform.
+  while (max_green_to_red - min_green_to_red > 2) {
+    if (eval_min) {
+      cur_diff_min = GetPredictionCostCrossColorRed(
+          tile_x_offset, tile_y_offset, all_x_max, all_y_max, xsize,
+          prev_x, prev_y, min_green_to_red, accumulated_red_histo, argb);
+      eval_min = 0;
     }
-    if ((uint8_t)green_to_red == prevY.green_to_red_) {
-      cur_diff -= 3;  // favor keeping the areas locally similar
+    if (eval_max) {
+      cur_diff_max = GetPredictionCostCrossColorRed(
+          tile_x_offset, tile_y_offset, all_x_max, all_y_max, xsize,
+          prev_x, prev_y, max_green_to_red, accumulated_red_histo, argb);
+      eval_max = 0;
     }
-    if (green_to_red == 0) {
-      cur_diff -= 3;
+    if (cur_diff_min < cur_diff_max) {
+      green_to_red = min_green_to_red;
+      max_green_to_red = (max_green_to_red + min_green_to_red) / 2;
+      eval_max = 1;
+    } else {
+      green_to_red = max_green_to_red;
+      min_green_to_red = (max_green_to_red + min_green_to_red) / 2;
+      eval_min = 1;
     }
-    if (cur_diff < best_diff) {
-      best_diff = cur_diff;
-      best_tx.green_to_red_ = green_to_red;
+  }
+  best_tx->green_to_red_ = green_to_red;
+}
+
+static float GetPredictionCostCrossColorBlue(
+    int tile_x_offset, int tile_y_offset, int all_x_max, int all_y_max,
+    int xsize, VP8LMultipliers prev_x, VP8LMultipliers prev_y,
+    int green_to_blue, int red_to_blue, const int accumulated_blue_histo[256],
+    const uint32_t* const argb) {
+  int all_y;
+  int histo[256] = { 0 };
+  float cur_diff;
+  for (all_y = tile_y_offset; all_y < all_y_max; ++all_y) {
+    int all_x;
+    int ix = all_y * xsize + tile_x_offset;
+    for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) {
+      ++histo[TransformColorBlue(green_to_blue, red_to_blue, argb[ix])];
     }
   }
-  best_diff = MAX_DIFF_COST;
-  for (green_to_blue = -32; green_to_blue <= 32; green_to_blue += step) {
-    for (red_to_blue = -32; red_to_blue <= 32; red_to_blue += step) {
-      int all_y;
-      int histo[256] = { 0 };
-      for (all_y = tile_y_offset; all_y < all_y_max; ++all_y) {
-        int all_x;
-        int ix = all_y * xsize + tile_x_offset;
-        for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) {
-          if (SkipRepeatedPixels(argb, ix, xsize)) {
-            continue;
-          }
-          ++histo[TransformColorBlue(green_to_blue, red_to_blue, argb[ix])];
-        }
-      }
-      cur_diff =
-          PredictionCostCrossColor(&accumulated_blue_histo[0], &histo[0]);
-      if ((uint8_t)green_to_blue == prevX.green_to_blue_) {
-        cur_diff -= 3;  // favor keeping the areas locally similar
-      }
-      if ((uint8_t)green_to_blue == prevY.green_to_blue_) {
-        cur_diff -= 3;  // favor keeping the areas locally similar
-      }
-      if ((uint8_t)red_to_blue == prevX.red_to_blue_) {
-        cur_diff -= 3;  // favor keeping the areas locally similar
-      }
-      if ((uint8_t)red_to_blue == prevY.red_to_blue_) {
-        cur_diff -= 3;  // favor keeping the areas locally similar
-      }
-      if (green_to_blue == 0) {
-        cur_diff -= 3;
-      }
-      if (red_to_blue == 0) {
-        cur_diff -= 3;
-      }
+  cur_diff = PredictionCostCrossColor(accumulated_blue_histo, histo);
+  if ((uint8_t)green_to_blue == prev_x.green_to_blue_) {
+    cur_diff -= 3;  // favor keeping the areas locally similar
+  }
+  if ((uint8_t)green_to_blue == prev_y.green_to_blue_) {
+    cur_diff -= 3;  // favor keeping the areas locally similar
+  }
+  if ((uint8_t)red_to_blue == prev_x.red_to_blue_) {
+    cur_diff -= 3;  // favor keeping the areas locally similar
+  }
+  if ((uint8_t)red_to_blue == prev_y.red_to_blue_) {
+    cur_diff -= 3;  // favor keeping the areas locally similar
+  }
+  if (green_to_blue == 0) {
+    cur_diff -= 3;
+  }
+  if (red_to_blue == 0) {
+    cur_diff -= 3;
+  }
+  return cur_diff;
+}
+
+static void GetBestGreenRedToBlue(
+    int tile_x_offset, int tile_y_offset, int all_x_max, int all_y_max,
+    int xsize, VP8LMultipliers prev_x, VP8LMultipliers prev_y, int quality,
+    const int accumulated_blue_histo[256], const uint32_t* const argb,
+    VP8LMultipliers* const best_tx) {
+  float best_diff = MAX_DIFF_COST;
+  float cur_diff;
+  const int step = (quality < 25) ? 32 : (quality > 50) ? 8 : 16;
+  const int min_green_to_blue = -32;
+  const int max_green_to_blue = 32;
+  const int min_red_to_blue = -32;
+  const int max_red_to_blue = 32;
+  const int num_iters =
+      (1 + (max_green_to_blue - min_green_to_blue) / step) *
+      (1 + (max_red_to_blue - min_red_to_blue) / step);
+  // Number of tries to get optimal green_to_blue & red_to_blue color transforms
+  // after finding a local minimum.
+  const int max_tries_after_min = 4 + (num_iters >> 2);
+  int num_tries_after_min = 0;
+  int green_to_blue;
+  for (green_to_blue = min_green_to_blue;
+       green_to_blue <= max_green_to_blue &&
+       num_tries_after_min < max_tries_after_min;
+       green_to_blue += step) {
+    int red_to_blue;
+    for (red_to_blue = min_red_to_blue;
+         red_to_blue <= max_red_to_blue &&
+         num_tries_after_min < max_tries_after_min;
+         red_to_blue += step) {
+      cur_diff = GetPredictionCostCrossColorBlue(
+          tile_x_offset, tile_y_offset, all_x_max, all_y_max, xsize, prev_x,
+          prev_y, green_to_blue, red_to_blue, accumulated_blue_histo, argb);
       if (cur_diff < best_diff) {
         best_diff = cur_diff;
-        best_tx.green_to_blue_ = green_to_blue;
-        best_tx.red_to_blue_ = red_to_blue;
+        best_tx->green_to_blue_ = green_to_blue;
+        best_tx->red_to_blue_ = red_to_blue;
+        num_tries_after_min = 0;
+      } else {
+        ++num_tries_after_min;
       }
     }
   }
+}
+
+static VP8LMultipliers GetBestColorTransformForTile(
+    int tile_x, int tile_y, int bits,
+    VP8LMultipliers prev_x,
+    VP8LMultipliers prev_y,
+    int quality, int xsize, int ysize,
+    const int accumulated_red_histo[256],
+    const int accumulated_blue_histo[256],
+    const uint32_t* const argb) {
+  const int max_tile_size = 1 << bits;
+  const int tile_y_offset = tile_y * max_tile_size;
+  const int tile_x_offset = tile_x * max_tile_size;
+  const int all_x_max = GetMin(tile_x_offset + max_tile_size, xsize);
+  const int all_y_max = GetMin(tile_y_offset + max_tile_size, ysize);
+  VP8LMultipliers best_tx;
+  MultipliersClear(&best_tx);
+
+  GetBestGreenToRed(tile_x_offset, tile_y_offset, all_x_max, all_y_max, xsize,
+                    prev_x, prev_y, accumulated_red_histo, argb, &best_tx);
+  GetBestGreenRedToBlue(tile_x_offset, tile_y_offset, all_x_max, all_y_max,
+                        xsize, prev_x, prev_y, quality, accumulated_blue_histo,
+                        argb, &best_tx);
   return best_tx;
 }
 
 static void CopyTileWithColorTransform(int xsize, int ysize,
-                                       int tile_x, int tile_y, int bits,
-                                       Multipliers color_transform,
-                                       uint32_t* const argb) {
-  int y;
-  int xscan = 1 << bits;
-  int yscan = 1 << bits;
-  tile_x <<= bits;
-  tile_y <<= bits;
-  if (xscan > xsize - tile_x) {
-    xscan = xsize - tile_x;
-  }
-  if (yscan > ysize - tile_y) {
-    yscan = ysize - tile_y;
-  }
-  yscan += tile_y;
-  for (y = tile_y; y < yscan; ++y) {
-    int ix = y * xsize + tile_x;
-    const int end_ix = ix + xscan;
-    for (; ix < end_ix; ++ix) {
-      argb[ix] = TransformColor(&color_transform, argb[ix], 0);
-    }
+                                       int tile_x, int tile_y,
+                                       int max_tile_size,
+                                       VP8LMultipliers color_transform,
+                                       uint32_t* argb) {
+  const int xscan = GetMin(max_tile_size, xsize - tile_x);
+  int yscan = GetMin(max_tile_size, ysize - tile_y);
+  argb += tile_y * xsize + tile_x;
+  while (yscan-- > 0) {
+    VP8LTransformColor(&color_transform, argb, xscan);
+    argb += xsize;
   }
 }
 
-void VP8LColorSpaceTransform(int width, int height, int bits, int step,
+void VP8LColorSpaceTransform(int width, int height, int bits, int quality,
                              uint32_t* const argb, uint32_t* image) {
   const int max_tile_size = 1 << bits;
-  int tile_xsize = VP8LSubSampleSize(width, bits);
-  int tile_ysize = VP8LSubSampleSize(height, bits);
+  const int tile_xsize = VP8LSubSampleSize(width, bits);
+  const int tile_ysize = VP8LSubSampleSize(height, bits);
   int accumulated_red_histo[256] = { 0 };
   int accumulated_blue_histo[256] = { 0 };
-  int tile_y;
-  int tile_x;
-  Multipliers prevX;
-  Multipliers prevY;
-  MultipliersClear(&prevY);
-  MultipliersClear(&prevX);
+  int tile_x, tile_y;
+  VP8LMultipliers prev_x, prev_y;
+  MultipliersClear(&prev_y);
+  MultipliersClear(&prev_x);
   for (tile_y = 0; tile_y < tile_ysize; ++tile_y) {
     for (tile_x = 0; tile_x < tile_xsize; ++tile_x) {
-      Multipliers color_transform;
-      int all_x_max;
       int y;
-      const int tile_y_offset = tile_y * max_tile_size;
       const int tile_x_offset = tile_x * max_tile_size;
+      const int tile_y_offset = tile_y * max_tile_size;
+      const int all_x_max = GetMin(tile_x_offset + max_tile_size, width);
+      const int all_y_max = GetMin(tile_y_offset + max_tile_size, height);
+      const int offset = tile_y * tile_xsize + tile_x;
       if (tile_y != 0) {
-        ColorCodeToMultipliers(image[tile_y * tile_xsize + tile_x - 1], &prevX);
-        ColorCodeToMultipliers(image[(tile_y - 1) * tile_xsize + tile_x],
-                               &prevY);
-      } else if (tile_x != 0) {
-        ColorCodeToMultipliers(image[tile_y * tile_xsize + tile_x - 1], &prevX);
+        ColorCodeToMultipliers(image[offset - tile_xsize], &prev_y);
       }
-      color_transform =
-          GetBestColorTransformForTile(tile_x, tile_y, bits,
-                                       prevX, prevY,
-                                       step, width, height,
-                                       &accumulated_red_histo[0],
-                                       &accumulated_blue_histo[0],
-                                       argb);
-      image[tile_y * tile_xsize + tile_x] =
-          MultipliersToColorCode(&color_transform);
-      CopyTileWithColorTransform(width, height, tile_x, tile_y, bits,
-                                 color_transform, argb);
+      prev_x = GetBestColorTransformForTile(tile_x, tile_y, bits,
+                                            prev_x, prev_y,
+                                            quality, width, height,
+                                            accumulated_red_histo,
+                                            accumulated_blue_histo,
+                                            argb);
+      image[offset] = MultipliersToColorCode(&prev_x);
+      CopyTileWithColorTransform(width, height, tile_x_offset, tile_y_offset,
+                                 max_tile_size, prev_x, argb);
 
       // Gather accumulated histogram data.
-      all_x_max = tile_x_offset + max_tile_size;
-      if (all_x_max > width) {
-        all_x_max = width;
-      }
-      for (y = 0; y < max_tile_size; ++y) {
-        int ix;
-        int all_x;
-        int all_y = tile_y_offset + y;
-        if (all_y >= height) {
-          break;
-        }
-        ix = all_y * width + tile_x_offset;
-        for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) {
+      for (y = tile_y_offset; y < all_y_max; ++y) {
+        int ix = y * width + tile_x_offset;
+        const int ix_end = ix + all_x_max - tile_x_offset;
+        for (; ix < ix_end; ++ix) {
+          const uint32_t pix = argb[ix];
           if (ix >= 2 &&
-              argb[ix] == argb[ix - 2] &&
-              argb[ix] == argb[ix - 1]) {
+              pix == argb[ix - 2] &&
+              pix == argb[ix - 1]) {
             continue;  // repeated pixels are handled by backward references
           }
           if (ix >= width + 2 &&
               argb[ix - 2] == argb[ix - width - 2] &&
               argb[ix - 1] == argb[ix - width - 1] &&
-              argb[ix] == argb[ix - width]) {
+              pix == argb[ix - width]) {
             continue;  // repeated pixels are handled by backward references
           }
-          ++accumulated_red_histo[(argb[ix] >> 16) & 0xff];
-          ++accumulated_blue_histo[argb[ix] & 0xff];
+          ++accumulated_red_histo[(pix >> 16) & 0xff];
+          ++accumulated_blue_histo[(pix >> 0) & 0xff];
         }
       }
     }
@@ -1085,7 +1153,10 @@ void VP8LColorSpaceTransform(int width, int height, int bits, int step,
 static void ColorSpaceInverseTransform(const VP8LTransform* const transform,
                                        int y_start, int y_end, uint32_t* data) {
   const int width = transform->xsize_;
-  const int mask = (1 << transform->bits_) - 1;
+  const int tile_width = 1 << transform->bits_;
+  const int mask = tile_width - 1;
+  const int safe_width = width & ~mask;
+  const int remaining_width = width - safe_width;
   const int tiles_per_row = VP8LSubSampleSize(width, transform->bits_);
   int y = y_start;
   const uint32_t* pred_row =
@@ -1093,16 +1164,21 @@ static void ColorSpaceInverseTransform(const VP8LTransform* const transform,
 
   while (y < y_end) {
     const uint32_t* pred = pred_row;
-    Multipliers m = { 0, 0, 0 };
-    int x;
-
-    for (x = 0; x < width; ++x) {
-      if ((x & mask) == 0) ColorCodeToMultipliers(*pred++, &m);
-      data[x] = TransformColor(&m, data[x], 1);
+    VP8LMultipliers m = { 0, 0, 0 };
+    const uint32_t* const data_safe_end = data + safe_width;
+    const uint32_t* const data_end = data + width;
+    while (data < data_safe_end) {
+      ColorCodeToMultipliers(*pred++, &m);
+      VP8LTransformColorInverse(&m, data, tile_width);
+      data += tile_width;
+    }
+    if (data < data_end) {  // Left-overs: partial tile at the end of the row.
+      ColorCodeToMultipliers(*pred++, &m);
+      VP8LTransformColorInverse(&m, data, remaining_width);
+      data += remaining_width;
     }
-    data += width;
     ++y;
-    if ((y & mask) == 0) pred_row += tiles_per_row;;
+    if ((y & mask) == 0) pred_row += tiles_per_row;
   }
 }
 
@@ -1173,7 +1249,7 @@ void VP8LInverseTransform(const VP8LTransform* const transform,
   assert(row_end <= transform->ysize_);
   switch (transform->type_) {
     case SUBTRACT_GREEN:
-      VP8LAddGreenToBlueAndRed(out, out + (row_end - row_start) * width);
+      VP8LAddGreenToBlueAndRed(out, (row_end - row_start) * width);
       break;
     case PREDICTOR_TRANSFORM:
       PredictorInverseTransform(transform, row_start, row_end, out);
@@ -1218,8 +1294,8 @@ static int is_big_endian(void) {
   return (tmp.b[0] != 1);
 }
 
-static void ConvertBGRAToRGB(const uint32_t* src,
-                             int num_pixels, uint8_t* dst) {
+void VP8LConvertBGRAToRGB_C(const uint32_t* src,
+                            int num_pixels, uint8_t* dst) {
   const uint32_t* const src_end = src + num_pixels;
   while (src < src_end) {
     const uint32_t argb = *src++;
@@ -1229,8 +1305,8 @@ static void ConvertBGRAToRGB(const uint32_t* src,
   }
 }
 
-static void ConvertBGRAToRGBA(const uint32_t* src,
-                              int num_pixels, uint8_t* dst) {
+void VP8LConvertBGRAToRGBA_C(const uint32_t* src,
+                             int num_pixels, uint8_t* dst) {
   const uint32_t* const src_end = src + num_pixels;
   while (src < src_end) {
     const uint32_t argb = *src++;
@@ -1241,8 +1317,8 @@ static void ConvertBGRAToRGBA(const uint32_t* src,
   }
 }
 
-static void ConvertBGRAToRGBA4444(const uint32_t* src,
-                                  int num_pixels, uint8_t* dst) {
+void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src,
+                                 int num_pixels, uint8_t* dst) {
   const uint32_t* const src_end = src + num_pixels;
   while (src < src_end) {
     const uint32_t argb = *src++;
@@ -1258,8 +1334,8 @@ static void ConvertBGRAToRGBA4444(const uint32_t* src,
   }
 }
 
-static void ConvertBGRAToRGB565(const uint32_t* src,
-                                int num_pixels, uint8_t* dst) {
+void VP8LConvertBGRAToRGB565_C(const uint32_t* src,
+                               int num_pixels, uint8_t* dst) {
   const uint32_t* const src_end = src + num_pixels;
   while (src < src_end) {
     const uint32_t argb = *src++;
@@ -1275,8 +1351,8 @@ static void ConvertBGRAToRGB565(const uint32_t* src,
   }
 }
 
-static void ConvertBGRAToBGR(const uint32_t* src,
-                             int num_pixels, uint8_t* dst) {
+void VP8LConvertBGRAToBGR_C(const uint32_t* src,
+                            int num_pixels, uint8_t* dst) {
   const uint32_t* const src_end = src + num_pixels;
   while (src < src_end) {
     const uint32_t argb = *src++;
@@ -1291,29 +1367,18 @@ static void CopyOrSwap(const uint32_t* src, int num_pixels, uint8_t* dst,
   if (is_big_endian() == swap_on_big_endian) {
     const uint32_t* const src_end = src + num_pixels;
     while (src < src_end) {
-      uint32_t argb = *src++;
+      const uint32_t argb = *src++;
 
-#if !defined(__BIG_ENDIAN__)
+#if !defined(WORDS_BIGENDIAN)
 #if !defined(WEBP_REFERENCE_IMPLEMENTATION)
-#if defined(__i386__) || defined(__x86_64__)
-      __asm__ volatile("bswap %0" : "=r"(argb) : "0"(argb));
-      *(uint32_t*)dst = argb;
-#elif defined(_MSC_VER)
-      argb = _byteswap_ulong(argb);
-      *(uint32_t*)dst = argb;
-#else
-      dst[0] = (argb >> 24) & 0xff;
-      dst[1] = (argb >> 16) & 0xff;
-      dst[2] = (argb >>  8) & 0xff;
-      dst[3] = (argb >>  0) & 0xff;
-#endif
+      *(uint32_t*)dst = BSwap32(argb);
 #else  // WEBP_REFERENCE_IMPLEMENTATION
       dst[0] = (argb >> 24) & 0xff;
       dst[1] = (argb >> 16) & 0xff;
       dst[2] = (argb >>  8) & 0xff;
       dst[3] = (argb >>  0) & 0xff;
 #endif
-#else  // __BIG_ENDIAN__
+#else  // WORDS_BIGENDIAN
       dst[0] = (argb >>  0) & 0xff;
       dst[1] = (argb >>  8) & 0xff;
       dst[2] = (argb >> 16) & 0xff;
@@ -1330,17 +1395,17 @@ void VP8LConvertFromBGRA(const uint32_t* const in_data, int num_pixels,
                          WEBP_CSP_MODE out_colorspace, uint8_t* const rgba) {
   switch (out_colorspace) {
     case MODE_RGB:
-      ConvertBGRAToRGB(in_data, num_pixels, rgba);
+      VP8LConvertBGRAToRGB(in_data, num_pixels, rgba);
       break;
     case MODE_RGBA:
-      ConvertBGRAToRGBA(in_data, num_pixels, rgba);
+      VP8LConvertBGRAToRGBA(in_data, num_pixels, rgba);
       break;
     case MODE_rgbA:
-      ConvertBGRAToRGBA(in_data, num_pixels, rgba);
+      VP8LConvertBGRAToRGBA(in_data, num_pixels, rgba);
       WebPApplyAlphaMultiply(rgba, 0, num_pixels, 1, 0);
       break;
     case MODE_BGR:
-      ConvertBGRAToBGR(in_data, num_pixels, rgba);
+      VP8LConvertBGRAToBGR(in_data, num_pixels, rgba);
       break;
     case MODE_BGRA:
       CopyOrSwap(in_data, num_pixels, rgba, 1);
@@ -1357,20 +1422,21 @@ void VP8LConvertFromBGRA(const uint32_t* const in_data, int num_pixels,
       WebPApplyAlphaMultiply(rgba, 1, num_pixels, 1, 0);
       break;
     case MODE_RGBA_4444:
-      ConvertBGRAToRGBA4444(in_data, num_pixels, rgba);
+      VP8LConvertBGRAToRGBA4444(in_data, num_pixels, rgba);
       break;
     case MODE_rgbA_4444:
-      ConvertBGRAToRGBA4444(in_data, num_pixels, rgba);
+      VP8LConvertBGRAToRGBA4444(in_data, num_pixels, rgba);
       WebPApplyAlphaMultiply4444(rgba, num_pixels, 1, 0);
       break;
     case MODE_RGB_565:
-      ConvertBGRAToRGB565(in_data, num_pixels, rgba);
+      VP8LConvertBGRAToRGB565(in_data, num_pixels, rgba);
       break;
     default:
       assert(0);          // Code flow should not reach here.
   }
 }
 
+//------------------------------------------------------------------------------
 // Bundles multiple (1, 2, 4 or 8) pixels into a single pixel.
 void VP8LBundleColorMap(const uint8_t* const row, int width,
                         int xbits, uint32_t* const dst) {
@@ -1394,129 +1460,166 @@ void VP8LBundleColorMap(const uint8_t* const row, int width,
 
 //------------------------------------------------------------------------------
 
-// TODO(vikasa): Move the SSE2 functions to lossless_dsp.c (new file), once
-// color-space conversion methods (ConvertFromBGRA) are also updated for SSE2.
-#if defined(WEBP_USE_SSE2)
-static WEBP_INLINE uint32_t ClampedAddSubtractFullSSE2(uint32_t c0, uint32_t c1,
-                                                       uint32_t c2) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
-  const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
-  const __m128i C2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
-  const __m128i V1 = _mm_add_epi16(C0, C1);
-  const __m128i V2 = _mm_sub_epi16(V1, C2);
-  const __m128i b = _mm_packus_epi16(V2, V2);
-  const uint32_t output = _mm_cvtsi128_si32(b);
-  return output;
-}
-
-static WEBP_INLINE uint32_t ClampedAddSubtractHalfSSE2(uint32_t c0, uint32_t c1,
-                                                       uint32_t c2) {
-  const uint32_t ave = Average2(c0, c1);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ave), zero);
-  const __m128i B0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
-  const __m128i A1 = _mm_sub_epi16(A0, B0);
-  const __m128i BgtA = _mm_cmpgt_epi16(B0, A0);
-  const __m128i A2 = _mm_sub_epi16(A1, BgtA);
-  const __m128i A3 = _mm_srai_epi16(A2, 1);
-  const __m128i A4 = _mm_add_epi16(A0, A3);
-  const __m128i A5 = _mm_packus_epi16(A4, A4);
-  const uint32_t output = _mm_cvtsi128_si32(A5);
-  return output;
-}
-
-static WEBP_INLINE uint32_t SelectSSE2(uint32_t a, uint32_t b, uint32_t c) {
-  int pa_minus_pb;
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i A0 = _mm_cvtsi32_si128(a);
-  const __m128i B0 = _mm_cvtsi32_si128(b);
-  const __m128i C0 = _mm_cvtsi32_si128(c);
-  const __m128i AC0 = _mm_subs_epu8(A0, C0);
-  const __m128i CA0 = _mm_subs_epu8(C0, A0);
-  const __m128i BC0 = _mm_subs_epu8(B0, C0);
-  const __m128i CB0 = _mm_subs_epu8(C0, B0);
-  const __m128i AC = _mm_or_si128(AC0, CA0);
-  const __m128i BC = _mm_or_si128(BC0, CB0);
-  const __m128i pa = _mm_unpacklo_epi8(AC, zero);  // |a - c|
-  const __m128i pb = _mm_unpacklo_epi8(BC, zero);  // |b - c|
-  const __m128i diff = _mm_sub_epi16(pb, pa);
-  {
-    int16_t out[8];
-    _mm_storeu_si128((__m128i*)out, diff);
-    pa_minus_pb = out[0] + out[1] + out[2] + out[3];
-  }
-  return (pa_minus_pb <= 0) ? a : b;
+static double ExtraCost(const uint32_t* population, int length) {
+  int i;
+  double cost = 0.;
+  for (i = 2; i < length - 2; ++i) cost += (i >> 1) * population[i + 2];
+  return cost;
 }
 
-static void SubtractGreenFromBlueAndRedSSE2(uint32_t* argb_data, int num_pixs) {
-  int i = 0;
-  const __m128i mask = _mm_set1_epi32(0x0000ff00);
-  for (; i + 4 < num_pixs; i += 4) {
-    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
-    const __m128i in_00g0 = _mm_and_si128(in, mask);     // 00g0|00g0|...
-    const __m128i in_0g00 = _mm_slli_epi32(in_00g0, 8);  // 0g00|0g00|...
-    const __m128i in_000g = _mm_srli_epi32(in_00g0, 8);  // 000g|000g|...
-    const __m128i in_0g0g = _mm_or_si128(in_0g00, in_000g);
-    const __m128i out = _mm_sub_epi8(in, in_0g0g);
-    _mm_storeu_si128((__m128i*)&argb_data[i], out);
+static double ExtraCostCombined(const uint32_t* X, const uint32_t* Y,
+                                int length) {
+  int i;
+  double cost = 0.;
+  for (i = 2; i < length - 2; ++i) {
+    const int xy = X[i + 2] + Y[i + 2];
+    cost += (i >> 1) * xy;
   }
-  // fallthrough and finish off with plain-C
-  for (; i < num_pixs; ++i) {
-    const uint32_t argb = argb_data[i];
-    const uint32_t green = (argb >> 8) & 0xff;
-    const uint32_t new_r = (((argb >> 16) & 0xff) - green) & 0xff;
-    const uint32_t new_b = ((argb & 0xff) - green) & 0xff;
-    argb_data[i] = (argb & 0xff00ff00) | (new_r << 16) | new_b;
+  return cost;
+}
+
+// Returns run-length statistics (streak counts and lengths) of 'population'.
+static VP8LStreaks HuffmanCostCount(const uint32_t* population, int length) {
+  int i;
+  int streak = 0;
+  VP8LStreaks stats;
+  memset(&stats, 0, sizeof(stats));
+  for (i = 0; i < length - 1; ++i) {
+    ++streak;
+    if (population[i] == population[i + 1]) {
+      continue;
+    }
+    stats.counts[population[i] != 0] += (streak > 3);
+    stats.streaks[population[i] != 0][(streak > 3)] += streak;
+    streak = 0;
   }
+  ++streak;
+  stats.counts[population[i] != 0] += (streak > 3);
+  stats.streaks[population[i] != 0][(streak > 3)] += streak;
+  return stats;
 }
 
-static void AddGreenToBlueAndRedSSE2(uint32_t* data, const uint32_t* data_end) {
-  const __m128i mask = _mm_set1_epi32(0x0000ff00);
-  for (; data + 4 < data_end; data += 4) {
-    const __m128i in = _mm_loadu_si128((__m128i*)data);
-    const __m128i in_00g0 = _mm_and_si128(in, mask);     // 00g0|00g0|...
-    const __m128i in_0g00 = _mm_slli_epi32(in_00g0, 8);  // 0g00|0g00|...
-    const __m128i in_000g = _mm_srli_epi32(in_00g0, 8);  // 000g|000g|...
-    const __m128i in_0g0g = _mm_or_si128(in_0g00, in_000g);
-    const __m128i out = _mm_add_epi8(in, in_0g0g);
-    _mm_storeu_si128((__m128i*)data, out);
+static VP8LStreaks HuffmanCostCombinedCount(const uint32_t* X,
+                                            const uint32_t* Y, int length) {
+  int i;
+  int streak = 0;
+  VP8LStreaks stats;
+  memset(&stats, 0, sizeof(stats));
+  for (i = 0; i < length - 1; ++i) {
+    const int xy = X[i] + Y[i];
+    const int xy_next = X[i + 1] + Y[i + 1];
+    ++streak;
+    if (xy == xy_next) {
+      continue;
+    }
+    stats.counts[xy != 0] += (streak > 3);
+    stats.streaks[xy != 0][(streak > 3)] += streak;
+    streak = 0;
   }
-  // fallthrough and finish off with plain-C
-  while (data < data_end) {
-    const uint32_t argb = *data;
-    const uint32_t green = ((argb >> 8) & 0xff);
-    uint32_t red_blue = (argb & 0x00ff00ffu);
-    red_blue += (green << 16) | green;
-    red_blue &= 0x00ff00ffu;
-    *data++ = (argb & 0xff00ff00u) | red_blue;
+  {
+    const int xy = X[i] + Y[i];
+    ++streak;
+    stats.counts[xy != 0] += (streak > 3);
+    stats.streaks[xy != 0][(streak > 3)] += streak;
   }
+  return stats;
 }
 
-extern void VP8LDspInitSSE2(void);
+//------------------------------------------------------------------------------
 
-void VP8LDspInitSSE2(void) {
-  VP8LClampedAddSubtractFull = ClampedAddSubtractFullSSE2;
-  VP8LClampedAddSubtractHalf = ClampedAddSubtractHalfSSE2;
-  VP8LSelect = SelectSSE2;
-  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRedSSE2;
-  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRedSSE2;
+static void HistogramAdd(const VP8LHistogram* const a,
+                         const VP8LHistogram* const b,
+                         VP8LHistogram* const out) {
+  int i;
+  const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_);
+  assert(a->palette_code_bits_ == b->palette_code_bits_);
+  if (b != out) {
+    for (i = 0; i < literal_size; ++i) {
+      out->literal_[i] = a->literal_[i] + b->literal_[i];
+    }
+    for (i = 0; i < NUM_DISTANCE_CODES; ++i) {
+      out->distance_[i] = a->distance_[i] + b->distance_[i];
+    }
+    for (i = 0; i < NUM_LITERAL_CODES; ++i) {
+      out->red_[i] = a->red_[i] + b->red_[i];
+      out->blue_[i] = a->blue_[i] + b->blue_[i];
+      out->alpha_[i] = a->alpha_[i] + b->alpha_[i];
+    }
+  } else {
+    for (i = 0; i < literal_size; ++i) {
+      out->literal_[i] += a->literal_[i];
+    }
+    for (i = 0; i < NUM_DISTANCE_CODES; ++i) {
+      out->distance_[i] += a->distance_[i];
+    }
+    for (i = 0; i < NUM_LITERAL_CODES; ++i) {
+      out->red_[i] += a->red_[i];
+      out->blue_[i] += a->blue_[i];
+      out->alpha_[i] += a->alpha_[i];
+    }
+  }
 }
-#endif
+
 //------------------------------------------------------------------------------
 
-VP8LPredClampedAddSubFunc VP8LClampedAddSubtractFull;
-VP8LPredClampedAddSubFunc VP8LClampedAddSubtractHalf;
-VP8LPredSelectFunc VP8LSelect;
-VP8LSubtractGreenFromBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
-VP8LAddGreenToBlueAndRedFunc VP8LAddGreenToBlueAndRed;
+VP8LProcessBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
+VP8LProcessBlueAndRedFunc VP8LAddGreenToBlueAndRed;
+VP8LPredictorFunc VP8LPredictors[16];
+
+VP8LTransformColorFunc VP8LTransformColor;
+VP8LTransformColorFunc VP8LTransformColorInverse;
+
+VP8LConvertFunc VP8LConvertBGRAToRGB;
+VP8LConvertFunc VP8LConvertBGRAToRGBA;
+VP8LConvertFunc VP8LConvertBGRAToRGBA4444;
+VP8LConvertFunc VP8LConvertBGRAToRGB565;
+VP8LConvertFunc VP8LConvertBGRAToBGR;
+
+VP8LFastLog2SlowFunc VP8LFastLog2Slow;
+VP8LFastLog2SlowFunc VP8LFastSLog2Slow;
+
+VP8LCostFunc VP8LExtraCost;
+VP8LCostCombinedFunc VP8LExtraCostCombined;
+
+VP8LCostCountFunc VP8LHuffmanCostCount;
+VP8LCostCombinedCountFunc VP8LHuffmanCostCombinedCount;
+
+VP8LHistogramAddFunc VP8LHistogramAdd;
+
+extern void VP8LDspInitSSE2(void);
+extern void VP8LDspInitNEON(void);
+extern void VP8LDspInitMIPS32(void);
+
+static volatile VP8CPUInfo lossless_last_cpuinfo_used =
+    (VP8CPUInfo)&lossless_last_cpuinfo_used;
 
 void VP8LDspInit(void) {
-  VP8LClampedAddSubtractFull = ClampedAddSubtractFull;
-  VP8LClampedAddSubtractHalf = ClampedAddSubtractHalf;
-  VP8LSelect = Select;
-  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
-  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
+  if (lossless_last_cpuinfo_used == VP8GetCPUInfo) return;
+
+  memcpy(VP8LPredictors, kPredictorsC, sizeof(VP8LPredictors));
+
+  VP8LSubtractGreenFromBlueAndRed = VP8LSubtractGreenFromBlueAndRed_C;
+  VP8LAddGreenToBlueAndRed = VP8LAddGreenToBlueAndRed_C;
+
+  VP8LTransformColor = VP8LTransformColor_C;
+  VP8LTransformColorInverse = VP8LTransformColorInverse_C;
+
+  VP8LConvertBGRAToRGB = VP8LConvertBGRAToRGB_C;
+  VP8LConvertBGRAToRGBA = VP8LConvertBGRAToRGBA_C;
+  VP8LConvertBGRAToRGBA4444 = VP8LConvertBGRAToRGBA4444_C;
+  VP8LConvertBGRAToRGB565 = VP8LConvertBGRAToRGB565_C;
+  VP8LConvertBGRAToBGR = VP8LConvertBGRAToBGR_C;
+
+  VP8LFastLog2Slow = FastLog2Slow;
+  VP8LFastSLog2Slow = FastSLog2Slow;
+
+  VP8LExtraCost = ExtraCost;
+  VP8LExtraCostCombined = ExtraCostCombined;
+
+  VP8LHuffmanCostCount = HuffmanCostCount;
+  VP8LHuffmanCostCombinedCount = HuffmanCostCombinedCount;
+
+  VP8LHistogramAdd = HistogramAdd;
 
   // If defined, use CPUInfo() to overwrite some pointers with faster versions.
   if (VP8GetCPUInfo != NULL) {
@@ -1525,8 +1628,18 @@ void VP8LDspInit(void) {
       VP8LDspInitSSE2();
     }
 #endif
+#if defined(WEBP_USE_NEON)
+    if (VP8GetCPUInfo(kNEON)) {
+      VP8LDspInitNEON();
+    }
+#endif
+#if defined(WEBP_USE_MIPS32)
+    if (VP8GetCPUInfo(kMIPS32)) {
+      VP8LDspInitMIPS32();
+    }
+#endif
   }
+  lossless_last_cpuinfo_used = VP8GetCPUInfo;
 }
 
 //------------------------------------------------------------------------------
-