Update Chromium to beta version 37.0.2062.68

Change-Id: I188e3b5aff1bec75566014291b654eb19f5bc8ca
Reviewed-by: Andras Becsi <andras.becsi@digia.com>
author: Jocelyn Turcotte <jocelyn.turcotte@digia.com> 2014-08-08 14:30:41 +0200
committer: Jocelyn Turcotte <jocelyn.turcotte@digia.com> 2014-08-12 13:49:54 +0200
commit: ab0a50979b9eb4dfa3320eff7e187e41efedf7a9 (patch)
tree: 498dfb8a97ff3361a9f7486863a52bb4e26bb898 /chromium/third_party/skia/src/opts
parent: 4ce69f7403811819800e7c5ae1318b2647e778d1 (diff)
44 files changed, 4870 insertions, 2796 deletions
diff --git a/chromium/third_party/skia/src/opts/SkBitmapFilter_opts_SSE2.cpp b/chromium/third_party/skia/src/opts/SkBitmapFilter_opts_SSE2.cpp
index 259e2efc0ec..b0405669218 100644
--- a/chromium/third_party/skia/src/opts/SkBitmapFilter_opts_SSE2.cpp
+++ b/chromium/third_party/skia/src/opts/SkBitmapFilter_opts_SSE2.cpp
@@ -5,17 +5,15 @@
  * found in the LICENSE file.
  */
 
-#include "SkBitmapProcState.h"
+#include <emmintrin.h>
 #include "SkBitmap.h"
+#include "SkBitmapFilter_opts_SSE2.h"
+#include "SkBitmapProcState.h"
 #include "SkColor.h"
 #include "SkColorPriv.h"
-#include "SkUnPreMultiply.h"
-#include "SkShader.h"
 #include "SkConvolver.h"
-
-#include "SkBitmapFilter_opts_SSE2.h"
-
-#include <emmintrin.h>
+#include "SkShader.h"
+#include "SkUnPreMultiply.h"
 
 #if 0
 static inline void print128i(__m128i value) {
@@ -175,7 +173,6 @@ void highQualityFilter_ScaleOnly_SSE2(const SkBitmapProcState &s, int x, int y,
 
         s.fInvProc(s.fInvMatrix, SkIntToScalar(x),
                     SkIntToScalar(y), &srcPt);
-
     }
 }
 
@@ -185,126 +182,126 @@ void convolveHorizontally_SSE2(const unsigned char* src_data,
                                const SkConvolutionFilter1D& filter,
                                unsigned char* out_row,
                                bool /*has_alpha*/) {
-  int num_values = filter.numValues();
-
-  int filter_offset, filter_length;
-  __m128i zero = _mm_setzero_si128();
-  __m128i mask[4];
-  // |mask| will be used to decimate all extra filter coefficients that are
-  // loaded by SIMD when |filter_length| is not divisible by 4.
-  // mask[0] is not used in following algorithm.
-  mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
-  mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
-  mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
-
-  // Output one pixel each iteration, calculating all channels (RGBA) together.
-  for (int out_x = 0; out_x < num_values; out_x++) {
-    const SkConvolutionFilter1D::ConvolutionFixed* filter_values =
-        filter.FilterForValue(out_x, &filter_offset, &filter_length);
-
-    __m128i accum = _mm_setzero_si128();
-
-    // Compute the first pixel in this row that the filter affects. It will
-    // touch |filter_length| pixels (4 bytes each) after this.
-    const __m128i* row_to_filter =
-        reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]);
-
-    // We will load and accumulate with four coefficients per iteration.
-    for (int filter_x = 0; filter_x < filter_length >> 2; filter_x++) {
-
-      // Load 4 coefficients => duplicate 1st and 2nd of them for all channels.
-      __m128i coeff, coeff16;
-      // [16] xx xx xx xx c3 c2 c1 c0
-      coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
-      // [16] xx xx xx xx c1 c1 c0 c0
-      coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
-      // [16] c1 c1 c1 c1 c0 c0 c0 c0
-      coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
-
-      // Load four pixels => unpack the first two pixels to 16 bits =>
-      // multiply with coefficients => accumulate the convolution result.
-      // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
-      __m128i src8 = _mm_loadu_si128(row_to_filter);
-      // [16] a1 b1 g1 r1 a0 b0 g0 r0
-      __m128i src16 = _mm_unpacklo_epi8(src8, zero);
-      __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
-      __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
-      // [32]  a0*c0 b0*c0 g0*c0 r0*c0
-      __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
-      accum = _mm_add_epi32(accum, t);
-      // [32]  a1*c1 b1*c1 g1*c1 r1*c1
-      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
-      accum = _mm_add_epi32(accum, t);
-
-      // Duplicate 3rd and 4th coefficients for all channels =>
-      // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients
-      // => accumulate the convolution results.
-      // [16] xx xx xx xx c3 c3 c2 c2
-      coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
-      // [16] c3 c3 c3 c3 c2 c2 c2 c2
-      coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
-      // [16] a3 g3 b3 r3 a2 g2 b2 r2
-      src16 = _mm_unpackhi_epi8(src8, zero);
-      mul_hi = _mm_mulhi_epi16(src16, coeff16);
-      mul_lo = _mm_mullo_epi16(src16, coeff16);
-      // [32]  a2*c2 b2*c2 g2*c2 r2*c2
-      t = _mm_unpacklo_epi16(mul_lo, mul_hi);
-      accum = _mm_add_epi32(accum, t);
-      // [32]  a3*c3 b3*c3 g3*c3 r3*c3
-      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
-      accum = _mm_add_epi32(accum, t);
-
-      // Advance the pixel and coefficients pointers.
-      row_to_filter += 1;
-      filter_values += 4;
-    }
+    int num_values = filter.numValues();
+
+    int filter_offset, filter_length;
+    __m128i zero = _mm_setzero_si128();
+    __m128i mask[4];
+    // |mask| will be used to decimate all extra filter coefficients that are
+    // loaded by SIMD when |filter_length| is not divisible by 4.
+    // mask[0] is not used in following algorithm.
+    mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
+    mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
+    mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
+
+    // Output one pixel each iteration, calculating all channels (RGBA) together.
+    for (int out_x = 0; out_x < num_values; out_x++) {
+        const SkConvolutionFilter1D::ConvolutionFixed* filter_values =
+            filter.FilterForValue(out_x, &filter_offset, &filter_length);
+
+        __m128i accum = _mm_setzero_si128();
+
+        // Compute the first pixel in this row that the filter affects. It will
+        // touch |filter_length| pixels (4 bytes each) after this.
+        const __m128i* row_to_filter =
+            reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]);
+
+        // We will load and accumulate with four coefficients per iteration.
+        for (int filter_x = 0; filter_x < filter_length >> 2; filter_x++) {
+
+            // Load 4 coefficients => duplicate 1st and 2nd of them for all channels.
+            __m128i coeff, coeff16;
+            // [16] xx xx xx xx c3 c2 c1 c0
+            coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
+            // [16] xx xx xx xx c1 c1 c0 c0
+            coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
+            // [16] c1 c1 c1 c1 c0 c0 c0 c0
+            coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
+
+            // Load four pixels => unpack the first two pixels to 16 bits =>
+            // multiply with coefficients => accumulate the convolution result.
+            // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
+            __m128i src8 = _mm_loadu_si128(row_to_filter);
+            // [16] a1 b1 g1 r1 a0 b0 g0 r0
+            __m128i src16 = _mm_unpacklo_epi8(src8, zero);
+            __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
+            __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
+            // [32]  a0*c0 b0*c0 g0*c0 r0*c0
+            __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+            accum = _mm_add_epi32(accum, t);
+            // [32]  a1*c1 b1*c1 g1*c1 r1*c1
+            t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+            accum = _mm_add_epi32(accum, t);
+
+            // Duplicate 3rd and 4th coefficients for all channels =>
+            // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients
+            // => accumulate the convolution results.
+            // [16] xx xx xx xx c3 c3 c2 c2
+            coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
+            // [16] c3 c3 c3 c3 c2 c2 c2 c2
+            coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
+            // [16] a3 g3 b3 r3 a2 g2 b2 r2
+            src16 = _mm_unpackhi_epi8(src8, zero);
+            mul_hi = _mm_mulhi_epi16(src16, coeff16);
+            mul_lo = _mm_mullo_epi16(src16, coeff16);
+            // [32]  a2*c2 b2*c2 g2*c2 r2*c2
+            t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+            accum = _mm_add_epi32(accum, t);
+            // [32]  a3*c3 b3*c3 g3*c3 r3*c3
+            t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+            accum = _mm_add_epi32(accum, t);
+
+            // Advance the pixel and coefficients pointers.
+            row_to_filter += 1;
+            filter_values += 4;
+        }
 
-    // When |filter_length| is not divisible by 4, we need to decimate some of
-    // the filter coefficient that was loaded incorrectly to zero; Other than
-    // that the algorithm is same with above, exceot that the 4th pixel will be
-    // always absent.
-    int r = filter_length&3;
-    if (r) {
-      // Note: filter_values must be padded to align_up(filter_offset, 8).
-      __m128i coeff, coeff16;
-      coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
-      // Mask out extra filter taps.
-      coeff = _mm_and_si128(coeff, mask[r]);
-      coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
-      coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
-
-      // Note: line buffer must be padded to align_up(filter_offset, 16).
-      // We resolve this by use C-version for the last horizontal line.
-      __m128i src8 = _mm_loadu_si128(row_to_filter);
-      __m128i src16 = _mm_unpacklo_epi8(src8, zero);
-      __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
-      __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
-      __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
-      accum = _mm_add_epi32(accum, t);
-      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
-      accum = _mm_add_epi32(accum, t);
-
-      src16 = _mm_unpackhi_epi8(src8, zero);
-      coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
-      coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
-      mul_hi = _mm_mulhi_epi16(src16, coeff16);
-      mul_lo = _mm_mullo_epi16(src16, coeff16);
-      t = _mm_unpacklo_epi16(mul_lo, mul_hi);
-      accum = _mm_add_epi32(accum, t);
-    }
+        // When |filter_length| is not divisible by 4, we need to decimate some of
+        // the filter coefficient that was loaded incorrectly to zero; Other than
+        // that the algorithm is the same as above, except that the 4th pixel will be
+        // always absent.
+        int r = filter_length&3;
+        if (r) {
+            // Note: filter_values must be padded to align_up(filter_offset, 8).
+            __m128i coeff, coeff16;
+            coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
+            // Mask out extra filter taps.
+            coeff = _mm_and_si128(coeff, mask[r]);
+            coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
+            coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
+
+            // Note: line buffer must be padded to align_up(filter_offset, 16).
+            // We resolve this by using the C version for the last horizontal line.
+            __m128i src8 = _mm_loadu_si128(row_to_filter);
+            __m128i src16 = _mm_unpacklo_epi8(src8, zero);
+            __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
+            __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
+            __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+            accum = _mm_add_epi32(accum, t);
+            t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+            accum = _mm_add_epi32(accum, t);
+
+            src16 = _mm_unpackhi_epi8(src8, zero);
+            coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
+            coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
+            mul_hi = _mm_mulhi_epi16(src16, coeff16);
+            mul_lo = _mm_mullo_epi16(src16, coeff16);
+            t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+            accum = _mm_add_epi32(accum, t);
+        }
 
-    // Shift right for fixed point implementation.
-    accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits);
+        // Shift right for fixed point implementation.
+        accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits);
 
-    // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
-    accum = _mm_packs_epi32(accum, zero);
-    // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
-    accum = _mm_packus_epi16(accum, zero);
+        // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
+        accum = _mm_packs_epi32(accum, zero);
+        // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
+        accum = _mm_packus_epi16(accum, zero);
 
-    // Store the pixel value of 32 bits.
-    *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum);
-    out_row += 4;
-  }
+        // Store the pixel value of 32 bits.
+        *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum);
+        out_row += 4;
+    }
 }
 
 // Convolves horizontally along four rows. The row data is given in
@@ -314,116 +311,116 @@ void convolveHorizontally_SSE2(const unsigned char* src_data,
 void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4],
                                     const SkConvolutionFilter1D& filter,
                                     unsigned char* out_row[4]) {
-  int num_values = filter.numValues();
-
-  int filter_offset, filter_length;
-  __m128i zero = _mm_setzero_si128();
-  __m128i mask[4];
-  // |mask| will be used to decimate all extra filter coefficients that are
-  // loaded by SIMD when |filter_length| is not divisible by 4.
-  // mask[0] is not used in following algorithm.
-  mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
-  mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
-  mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
-
-  // Output one pixel each iteration, calculating all channels (RGBA) together.
-  for (int out_x = 0; out_x < num_values; out_x++) {
-    const SkConvolutionFilter1D::ConvolutionFixed* filter_values =
-        filter.FilterForValue(out_x, &filter_offset, &filter_length);
-
-    // four pixels in a column per iteration.
-    __m128i accum0 = _mm_setzero_si128();
-    __m128i accum1 = _mm_setzero_si128();
-    __m128i accum2 = _mm_setzero_si128();
-    __m128i accum3 = _mm_setzero_si128();
-    int start = (filter_offset<<2);
-    // We will load and accumulate with four coefficients per iteration.
-    for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) {
-      __m128i coeff, coeff16lo, coeff16hi;
-      // [16] xx xx xx xx c3 c2 c1 c0
-      coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
-      // [16] xx xx xx xx c1 c1 c0 c0
-      coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
-      // [16] c1 c1 c1 c1 c0 c0 c0 c0
-      coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
-      // [16] xx xx xx xx c3 c3 c2 c2
-      coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
-      // [16] c3 c3 c3 c3 c2 c2 c2 c2
-      coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
-
-      __m128i src8, src16, mul_hi, mul_lo, t;
-
-#define ITERATION(src, accum)                                          \
-      src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));   \
-      src16 = _mm_unpacklo_epi8(src8, zero);                           \
-      mul_hi = _mm_mulhi_epi16(src16, coeff16lo);                      \
-      mul_lo = _mm_mullo_epi16(src16, coeff16lo);                      \
-      t = _mm_unpacklo_epi16(mul_lo, mul_hi);                          \
-      accum = _mm_add_epi32(accum, t);                                 \
-      t = _mm_unpackhi_epi16(mul_lo, mul_hi);                          \
-      accum = _mm_add_epi32(accum, t);                                 \
-      src16 = _mm_unpackhi_epi8(src8, zero);                           \
-      mul_hi = _mm_mulhi_epi16(src16, coeff16hi);                      \
-      mul_lo = _mm_mullo_epi16(src16, coeff16hi);                      \
-      t = _mm_unpacklo_epi16(mul_lo, mul_hi);                          \
-      accum = _mm_add_epi32(accum, t);                                 \
-      t = _mm_unpackhi_epi16(mul_lo, mul_hi);                          \
-      accum = _mm_add_epi32(accum, t)
-
-      ITERATION(src_data[0] + start, accum0);
-      ITERATION(src_data[1] + start, accum1);
-      ITERATION(src_data[2] + start, accum2);
-      ITERATION(src_data[3] + start, accum3);
-
-      start += 16;
-      filter_values += 4;
-    }
+    int num_values = filter.numValues();
+
+    int filter_offset, filter_length;
+    __m128i zero = _mm_setzero_si128();
+    __m128i mask[4];
+    // |mask| will be used to decimate all extra filter coefficients that are
+    // loaded by SIMD when |filter_length| is not divisible by 4.
+    // mask[0] is not used in following algorithm.
+    mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
+    mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
+    mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
+
+    // Output one pixel each iteration, calculating all channels (RGBA) together.
+    for (int out_x = 0; out_x < num_values; out_x++) {
+        const SkConvolutionFilter1D::ConvolutionFixed* filter_values =
+            filter.FilterForValue(out_x, &filter_offset, &filter_length);
+
+        // four pixels in a column per iteration.
+        __m128i accum0 = _mm_setzero_si128();
+        __m128i accum1 = _mm_setzero_si128();
+        __m128i accum2 = _mm_setzero_si128();
+        __m128i accum3 = _mm_setzero_si128();
+        int start = (filter_offset<<2);
+        // We will load and accumulate with four coefficients per iteration.
+        for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) {
+            __m128i coeff, coeff16lo, coeff16hi;
+            // [16] xx xx xx xx c3 c2 c1 c0
+            coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
+            // [16] xx xx xx xx c1 c1 c0 c0
+            coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
+            // [16] c1 c1 c1 c1 c0 c0 c0 c0
+            coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
+            // [16] xx xx xx xx c3 c3 c2 c2
+            coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
+            // [16] c3 c3 c3 c3 c2 c2 c2 c2
+            coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
+
+            __m128i src8, src16, mul_hi, mul_lo, t;
+
+#define ITERATION(src, accum)                                                \
+            src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));   \
+            src16 = _mm_unpacklo_epi8(src8, zero);                           \
+            mul_hi = _mm_mulhi_epi16(src16, coeff16lo);                      \
+            mul_lo = _mm_mullo_epi16(src16, coeff16lo);                      \
+            t = _mm_unpacklo_epi16(mul_lo, mul_hi);                          \
+            accum = _mm_add_epi32(accum, t);                                 \
+            t = _mm_unpackhi_epi16(mul_lo, mul_hi);                          \
+            accum = _mm_add_epi32(accum, t);                                 \
+            src16 = _mm_unpackhi_epi8(src8, zero);                           \
+            mul_hi = _mm_mulhi_epi16(src16, coeff16hi);                      \
+            mul_lo = _mm_mullo_epi16(src16, coeff16hi);                      \
+            t = _mm_unpacklo_epi16(mul_lo, mul_hi);                          \
+            accum = _mm_add_epi32(accum, t);                                 \
+            t = _mm_unpackhi_epi16(mul_lo, mul_hi);                          \
+            accum = _mm_add_epi32(accum, t)
+
+            ITERATION(src_data[0] + start, accum0);
+            ITERATION(src_data[1] + start, accum1);
+            ITERATION(src_data[2] + start, accum2);
+            ITERATION(src_data[3] + start, accum3);
+
+            start += 16;
+            filter_values += 4;
+        }
 
-    int r = filter_length & 3;
-    if (r) {
-      // Note: filter_values must be padded to align_up(filter_offset, 8);
-      __m128i coeff;
-      coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
-      // Mask out extra filter taps.
-      coeff = _mm_and_si128(coeff, mask[r]);
-
-      __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
-      /* c1 c1 c1 c1 c0 c0 c0 c0 */
-      coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
-      __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
-      coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
-
-      __m128i src8, src16, mul_hi, mul_lo, t;
-
-      ITERATION(src_data[0] + start, accum0);
-      ITERATION(src_data[1] + start, accum1);
-      ITERATION(src_data[2] + start, accum2);
-      ITERATION(src_data[3] + start, accum3);
-    }
+        int r = filter_length & 3;
+        if (r) {
+            // Note: filter_values must be padded to align_up(filter_offset, 8);
+            __m128i coeff;
+            coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
+            // Mask out extra filter taps.
+            coeff = _mm_and_si128(coeff, mask[r]);
+
+            __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
+            /* c1 c1 c1 c1 c0 c0 c0 c0 */
+            coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
+            __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
+            coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
+
+            __m128i src8, src16, mul_hi, mul_lo, t;
+
+            ITERATION(src_data[0] + start, accum0);
+            ITERATION(src_data[1] + start, accum1);
+            ITERATION(src_data[2] + start, accum2);
+            ITERATION(src_data[3] + start, accum3);
+        }
 
-    accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
-    accum0 = _mm_packs_epi32(accum0, zero);
-    accum0 = _mm_packus_epi16(accum0, zero);
-    accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
-    accum1 = _mm_packs_epi32(accum1, zero);
-    accum1 = _mm_packus_epi16(accum1, zero);
-    accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
-    accum2 = _mm_packs_epi32(accum2, zero);
-    accum2 = _mm_packus_epi16(accum2, zero);
-    accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits);
-    accum3 = _mm_packs_epi32(accum3, zero);
-    accum3 = _mm_packus_epi16(accum3, zero);
-
-    *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0);
-    *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1);
-    *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2);
-    *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3);
-
-    out_row[0] += 4;
-    out_row[1] += 4;
-    out_row[2] += 4;
-    out_row[3] += 4;
-  }
+        accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
+        accum0 = _mm_packs_epi32(accum0, zero);
+        accum0 = _mm_packus_epi16(accum0, zero);
+        accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
+        accum1 = _mm_packs_epi32(accum1, zero);
+        accum1 = _mm_packus_epi16(accum1, zero);
+        accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
+        accum2 = _mm_packs_epi32(accum2, zero);
+        accum2 = _mm_packus_epi16(accum2, zero);
+        accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits);
+        accum3 = _mm_packs_epi32(accum3, zero);
+        accum3 = _mm_packus_epi16(accum3, zero);
+
+        *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0);
+        *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1);
+        *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2);
+        *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3);
+
+        out_row[0] += 4;
+        out_row[1] += 4;
+        out_row[2] += 4;
+        out_row[3] += 4;
+    }
 }
 
 // Does vertical convolution to produce one output row. The filter values and
@@ -438,166 +435,166 @@ void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filt
                              unsigned char* const* source_data_rows,
                              int pixel_width,
                              unsigned char* out_row) {
-  int width = pixel_width & ~3;
-
-  __m128i zero = _mm_setzero_si128();
-  __m128i accum0, accum1, accum2, accum3, coeff16;
-  const __m128i* src;
-  // Output four pixels per iteration (16 bytes).
-  for (int out_x = 0; out_x < width; out_x += 4) {
-
-    // Accumulated result for each pixel. 32 bits per RGBA channel.
-    accum0 = _mm_setzero_si128();
-    accum1 = _mm_setzero_si128();
-    accum2 = _mm_setzero_si128();
-    accum3 = _mm_setzero_si128();
-
-    // Convolve with one filter coefficient per iteration.
-    for (int filter_y = 0; filter_y < filter_length; filter_y++) {
-
-      // Duplicate the filter coefficient 8 times.
-      // [16] cj cj cj cj cj cj cj cj
-      coeff16 = _mm_set1_epi16(filter_values[filter_y]);
-
-      // Load four pixels (16 bytes) together.
-      // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
-      src = reinterpret_cast<const __m128i*>(
-          &source_data_rows[filter_y][out_x << 2]);
-      __m128i src8 = _mm_loadu_si128(src);
-
-      // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channels =>
-      // multiply with current coefficient => accumulate the result.
-      // [16] a1 b1 g1 r1 a0 b0 g0 r0
-      __m128i src16 = _mm_unpacklo_epi8(src8, zero);
-      __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
-      __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
-      // [32] a0 b0 g0 r0
-      __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
-      accum0 = _mm_add_epi32(accum0, t);
-      // [32] a1 b1 g1 r1
-      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
-      accum1 = _mm_add_epi32(accum1, t);
-
-      // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channels =>
-      // multiply with current coefficient => accumulate the result.
-      // [16] a3 b3 g3 r3 a2 b2 g2 r2
-      src16 = _mm_unpackhi_epi8(src8, zero);
-      mul_hi = _mm_mulhi_epi16(src16, coeff16);
-      mul_lo = _mm_mullo_epi16(src16, coeff16);
-      // [32] a2 b2 g2 r2
-      t = _mm_unpacklo_epi16(mul_lo, mul_hi);
-      accum2 = _mm_add_epi32(accum2, t);
-      // [32] a3 b3 g3 r3
-      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
-      accum3 = _mm_add_epi32(accum3, t);
-    }
-
-    // Shift right for fixed point implementation.
-    accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
-    accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
-    accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
-    accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits);
-
-    // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
-    // [16] a1 b1 g1 r1 a0 b0 g0 r0
-    accum0 = _mm_packs_epi32(accum0, accum1);
-    // [16] a3 b3 g3 r3 a2 b2 g2 r2
-    accum2 = _mm_packs_epi32(accum2, accum3);
+    int width = pixel_width & ~3;
+
+    __m128i zero = _mm_setzero_si128();
+    __m128i accum0, accum1, accum2, accum3, coeff16;
+    const __m128i* src;
+    // Output four pixels per iteration (16 bytes).
+    for (int out_x = 0; out_x < width; out_x += 4) {
+
+        // Accumulated result for each pixel. 32 bits per RGBA channel.
+        accum0 = _mm_setzero_si128();
+        accum1 = _mm_setzero_si128();
+        accum2 = _mm_setzero_si128();
+        accum3 = _mm_setzero_si128();
+
+        // Convolve with one filter coefficient per iteration.
+        for (int filter_y = 0; filter_y < filter_length; filter_y++) {
+
+            // Duplicate the filter coefficient 8 times.
+            // [16] cj cj cj cj cj cj cj cj
+            coeff16 = _mm_set1_epi16(filter_values[filter_y]);
+
+            // Load four pixels (16 bytes) together.
+            // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
+            src = reinterpret_cast<const __m128i*>(
+                &source_data_rows[filter_y][out_x << 2]);
+            __m128i src8 = _mm_loadu_si128(src);
+
+            // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channels =>
+            // multiply with current coefficient => accumulate the result.
+            // [16] a1 b1 g1 r1 a0 b0 g0 r0
+            __m128i src16 = _mm_unpacklo_epi8(src8, zero);
+            __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
+            __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
+            // [32] a0 b0 g0 r0
+            __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+            accum0 = _mm_add_epi32(accum0, t);
+            // [32] a1 b1 g1 r1
+            t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+            accum1 = _mm_add_epi32(accum1, t);
+
+            // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channels =>
+            // multiply with current coefficient => accumulate the result.
+            // [16] a3 b3 g3 r3 a2 b2 g2 r2
+            src16 = _mm_unpackhi_epi8(src8, zero);
+            mul_hi = _mm_mulhi_epi16(src16, coeff16);
+            mul_lo = _mm_mullo_epi16(src16, coeff16);
+            // [32] a2 b2 g2 r2
+            t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+            accum2 = _mm_add_epi32(accum2, t);
+            // [32] a3 b3 g3 r3
+            t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+            accum3 = _mm_add_epi32(accum3, t);
+        }
 
-    // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
-    // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
-    accum0 = _mm_packus_epi16(accum0, accum2);
+        // Shift right for fixed point implementation.
+        accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
+        accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
+        accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
+        accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits);
+
+        // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
+        // [16] a1 b1 g1 r1 a0 b0 g0 r0
+        accum0 = _mm_packs_epi32(accum0, accum1);
+        // [16] a3 b3 g3 r3 a2 b2 g2 r2
+        accum2 = _mm_packs_epi32(accum2, accum3);
+
+        // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
+        // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
+        accum0 = _mm_packus_epi16(accum0, accum2);
+
+        if (has_alpha) {
+            // Compute the max(ri, gi, bi) for each pixel.
+            // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
+            __m128i a = _mm_srli_epi32(accum0, 8);
+            // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
+            __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.
+            // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
+            a = _mm_srli_epi32(accum0, 16);
+            // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
+            b = _mm_max_epu8(a, b);  // Max of r and g and b.
+            // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
+            b = _mm_slli_epi32(b, 24);
+
+            // Make sure the value of alpha channel is always larger than maximum
+            // value of color channels.
+            accum0 = _mm_max_epu8(b, accum0);
+        } else {
+            // Set value of alpha channels to 0xFF.
+            __m128i mask = _mm_set1_epi32(0xff000000);
+            accum0 = _mm_or_si128(accum0, mask);
+        }
 
-    if (has_alpha) {
-      // Compute the max(ri, gi, bi) for each pixel.
-      // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
-      __m128i a = _mm_srli_epi32(accum0, 8);
-      // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
-      __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.
-      // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
-      a = _mm_srli_epi32(accum0, 16);
-      // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
-      b = _mm_max_epu8(a, b);  // Max of r and g and b.
-      // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
-      b = _mm_slli_epi32(b, 24);
-
-      // Make sure the value of alpha channel is always larger than maximum
-      // value of color channels.
-      accum0 = _mm_max_epu8(b, accum0);
-    } else {
-      // Set value of alpha channels to 0xFF.
-      __m128i mask = _mm_set1_epi32(0xff000000);
-      accum0 = _mm_or_si128(accum0, mask);
+        // Store the convolution result (16 bytes) and advance the pixel pointers.
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0);
+        out_row += 16;
     }
 
-    // Store the convolution result (16 bytes) and advance the pixel pointers.
-    _mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0);
-    out_row += 16;
-  }
-
-  // When the width of the output is not divisible by 4, We need to save one
-  // pixel (4 bytes) each time. And also the fourth pixel is always absent.
-  if (pixel_width & 3) {
-    accum0 = _mm_setzero_si128();
-    accum1 = _mm_setzero_si128();
-    accum2 = _mm_setzero_si128();
-    for (int filter_y = 0; filter_y < filter_length; ++filter_y) {
-      coeff16 = _mm_set1_epi16(filter_values[filter_y]);
-      // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
-      src = reinterpret_cast<const __m128i*>(
-          &source_data_rows[filter_y][width<<2]);
-      __m128i src8 = _mm_loadu_si128(src);
-      // [16] a1 b1 g1 r1 a0 b0 g0 r0
-      __m128i src16 = _mm_unpacklo_epi8(src8, zero);
-      __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
-      __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
-      // [32] a0 b0 g0 r0
-      __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
-      accum0 = _mm_add_epi32(accum0, t);
-      // [32] a1 b1 g1 r1
-      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
-      accum1 = _mm_add_epi32(accum1, t);
-      // [16] a3 b3 g3 r3 a2 b2 g2 r2
-      src16 = _mm_unpackhi_epi8(src8, zero);
-      mul_hi = _mm_mulhi_epi16(src16, coeff16);
-      mul_lo = _mm_mullo_epi16(src16, coeff16);
-      // [32] a2 b2 g2 r2
-      t = _mm_unpacklo_epi16(mul_lo, mul_hi);
-      accum2 = _mm_add_epi32(accum2, t);
-    }
+    // When the width of the output is not divisible by 4, we need to store one
+    // pixel (4 bytes) at a time; the fourth pixel of this group is always absent.
+    if (pixel_width & 3) {
+        accum0 = _mm_setzero_si128();
+        accum1 = _mm_setzero_si128();
+        accum2 = _mm_setzero_si128();
+        for (int filter_y = 0; filter_y < filter_length; ++filter_y) {
+            coeff16 = _mm_set1_epi16(filter_values[filter_y]);
+            // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
+            src = reinterpret_cast<const __m128i*>(
+                &source_data_rows[filter_y][width<<2]);
+            __m128i src8 = _mm_loadu_si128(src);
+            // [16] a1 b1 g1 r1 a0 b0 g0 r0
+            __m128i src16 = _mm_unpacklo_epi8(src8, zero);
+            __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
+            __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
+            // [32] a0 b0 g0 r0
+            __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+            accum0 = _mm_add_epi32(accum0, t);
+            // [32] a1 b1 g1 r1
+            t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+            accum1 = _mm_add_epi32(accum1, t);
+            // [16] a3 b3 g3 r3 a2 b2 g2 r2
+            src16 = _mm_unpackhi_epi8(src8, zero);
+            mul_hi = _mm_mulhi_epi16(src16, coeff16);
+            mul_lo = _mm_mullo_epi16(src16, coeff16);
+            // [32] a2 b2 g2 r2
+            t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+            accum2 = _mm_add_epi32(accum2, t);
+        }
 
-    accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
-    accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
-    accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
-    // [16] a1 b1 g1 r1 a0 b0 g0 r0
-    accum0 = _mm_packs_epi32(accum0, accum1);
-    // [16] a3 b3 g3 r3 a2 b2 g2 r2
-    accum2 = _mm_packs_epi32(accum2, zero);
-    // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
-    accum0 = _mm_packus_epi16(accum0, accum2);
-    if (has_alpha) {
-      // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
-      __m128i a = _mm_srli_epi32(accum0, 8);
-      // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
-      __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.
-      // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
-      a = _mm_srli_epi32(accum0, 16);
-      // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
-      b = _mm_max_epu8(a, b);  // Max of r and g and b.
-      // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
-      b = _mm_slli_epi32(b, 24);
-      accum0 = _mm_max_epu8(b, accum0);
-    } else {
-      __m128i mask = _mm_set1_epi32(0xff000000);
-      accum0 = _mm_or_si128(accum0, mask);
-    }
+        accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
+        accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
+        accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
+        // [16] a1 b1 g1 r1 a0 b0 g0 r0
+        accum0 = _mm_packs_epi32(accum0, accum1);
+        // [16] a3 b3 g3 r3 a2 b2 g2 r2
+        accum2 = _mm_packs_epi32(accum2, zero);
+        // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
+        accum0 = _mm_packus_epi16(accum0, accum2);
+        if (has_alpha) {
+            // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
+            __m128i a = _mm_srli_epi32(accum0, 8);
+            // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
+            __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.
+            // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
+            a = _mm_srli_epi32(accum0, 16);
+            // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
+            b = _mm_max_epu8(a, b);  // Max of r and g and b.
+            // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
+            b = _mm_slli_epi32(b, 24);
+            accum0 = _mm_max_epu8(b, accum0);
+        } else {
+            __m128i mask = _mm_set1_epi32(0xff000000);
+            accum0 = _mm_or_si128(accum0, mask);
+        }
 
-    for (int out_x = width; out_x < pixel_width; out_x++) {
-      *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0);
-      accum0 = _mm_srli_si128(accum0, 4);
-      out_row += 4;
+        for (int out_x = width; out_x < pixel_width; out_x++) {
+            *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0);
+            accum0 = _mm_srli_si128(accum0, 4);
+            out_row += 4;
+        }
     }
-  }
 }
 
 void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filter_values,
@@ -606,19 +603,19 @@ void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filt
                              int pixel_width,
                              unsigned char* out_row,
                              bool has_alpha) {
-  if (has_alpha) {
-    convolveVertically_SSE2<true>(filter_values,
-                                  filter_length,
-                                  source_data_rows,
-                                  pixel_width,
-                                  out_row);
-  } else {
-    convolveVertically_SSE2<false>(filter_values,
-                                   filter_length,
-                                   source_data_rows,
-                                   pixel_width,
-                                   out_row);
-  }
+    if (has_alpha) {
+        convolveVertically_SSE2<true>(filter_values,
+                                      filter_length,
+                                      source_data_rows,
+                                      pixel_width,
+                                      out_row);
+    } else {
+        convolveVertically_SSE2<false>(filter_values,
+                                       filter_length,
+                                       source_data_rows,
+                                       pixel_width,
+                                       out_row);
+    }
 }
 
 void applySIMDPadding_SSE2(SkConvolutionFilter1D *filter) {
diff --git a/chromium/third_party/skia/src/opts/SkBitmapFilter_opts_SSE2.h b/chromium/third_party/skia/src/opts/SkBitmapFilter_opts_SSE2.h
index 588f4ef18bb..661a824e227 100644
--- a/chromium/third_party/skia/src/opts/SkBitmapFilter_opts_SSE2.h
+++ b/chromium/third_party/skia/src/opts/SkBitmapFilter_opts_SSE2.h
@@ -1,4 +1,3 @@
-
 /*
  * Copyright 2013 Google Inc.
  *
@@ -6,7 +5,6 @@
  * found in the LICENSE file.
  */
 
-
 #ifndef SkBitmapFilter_opts_sse2_DEFINED
 #define SkBitmapFilter_opts_sse2_DEFINED
 
@@ -14,9 +12,9 @@
 #include "SkConvolver.h"
 
 void highQualityFilter_ScaleOnly_SSE2(const SkBitmapProcState &s, int x, int y,
-                          SkPMColor *SK_RESTRICT colors, int count);
+                                      SkPMColor *SK_RESTRICT colors, int count);
 void highQualityFilter_SSE2(const SkBitmapProcState &s, int x, int y,
-                SkPMColor *SK_RESTRICT colors, int count);
+                            SkPMColor *SK_RESTRICT colors, int count);
 
 
 void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filter_values,
diff --git a/chromium/third_party/skia/src/opts/SkBitmapProcState_filter_neon.h b/chromium/third_party/skia/src/opts/SkBitmapProcState_filter_neon.h
index e56b683b874..0887145c3d0 100644
--- a/chromium/third_party/skia/src/opts/SkBitmapProcState_filter_neon.h
+++ b/chromium/third_party/skia/src/opts/SkBitmapProcState_filter_neon.h
@@ -17,12 +17,15 @@
  * exact results for the color components, but if the 4 incoming colors are
  * all opaque, then the output color must also be opaque. Subsequent parts of
  * the drawing pipeline may rely on this (e.g. which blitrow proc to use).
+ *
  */
-
-static inline void Filter_32_opaque_neon(unsigned x, unsigned y,
-                                         SkPMColor a00, SkPMColor a01,
-                                         SkPMColor a10, SkPMColor a11,
-                                         SkPMColor *dst) {
+// Chrome on Android uses -Os so we need to force these inline. Otherwise
+// calling the function in the inner loops will cause significant overhead on
+// some platforms.
+static SK_ALWAYS_INLINE void Filter_32_opaque_neon(unsigned x, unsigned y,
+                                                   SkPMColor a00, SkPMColor a01,
+                                                   SkPMColor a10, SkPMColor a11,
+                                                   SkPMColor *dst) {
     uint8x8_t vy, vconst16_8, v16_y, vres;
     uint16x4_t vx, vconst16_16, v16_x, tmp;
     uint32x2_t va0, va1;
@@ -53,10 +56,11 @@ static inline void Filter_32_opaque_neon(unsigned x, unsigned y,
     vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);         // store result
 }
 
-static inline void Filter_32_alpha_neon(unsigned x, unsigned y,
-                                        SkPMColor a00, SkPMColor a01,
-                                        SkPMColor a10, SkPMColor a11,
-                                        SkPMColor *dst, uint16_t scale) {
+static SK_ALWAYS_INLINE void Filter_32_alpha_neon(unsigned x, unsigned y,
+                                                  SkPMColor a00, SkPMColor a01,
+                                                  SkPMColor a10, SkPMColor a11,
+                                                  SkPMColor *dst,
+                                                  uint16_t scale) {
     uint8x8_t vy, vconst16_8, v16_y, vres;
     uint16x4_t vx, vconst16_16, v16_x, tmp, vscale;
     uint32x2_t va0, va1;
diff --git a/chromium/third_party/skia/src/opts/SkBitmapProcState_matrixProcs_neon.cpp b/chromium/third_party/skia/src/opts/SkBitmapProcState_matrixProcs_neon.cpp
index e81da670526..7789031c028 100644
--- a/chromium/third_party/skia/src/opts/SkBitmapProcState_matrixProcs_neon.cpp
+++ b/chromium/third_party/skia/src/opts/SkBitmapProcState_matrixProcs_neon.cpp
@@ -10,26 +10,140 @@
 #include "SkUtilsArm.h"
 #include "SkBitmapProcState_utils.h"
 
+#include <arm_neon.h>
+
 extern const SkBitmapProcState::MatrixProc ClampX_ClampY_Procs_neon[];
 extern const SkBitmapProcState::MatrixProc RepeatX_RepeatY_Procs_neon[];
 
 static void decal_nofilter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count);
 static void decal_filter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count);
 
-#define MAKENAME(suffix)        ClampX_ClampY ## suffix ## _neon
-#define TILEX_PROCF(fx, max)    SkClampMax((fx) >> 16, max)
-#define TILEY_PROCF(fy, max)    SkClampMax((fy) >> 16, max)
-#define TILEX_LOW_BITS(fx, max) (((fx) >> 12) & 0xF)
-#define TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF)
+// Eight-lane NEON form of TILEX_PROCF(fx, max) = SkClampMax((fx) >> 16, max); returns 16-bit lanes.
+static inline int16x8_t sbpsm_clamp_tile8(int32x4_t low, int32x4_t high, unsigned max) {
+    int16x8_t res;
+
+    // Keep the hi 16 bits of every 32-bit lane: the four from |low| first, then the four from |high|.
+    res = vuzpq_s16(vreinterpretq_s16_s32(low), vreinterpretq_s16_s32(high)).val[1];
+
+    // Clamp each lane to [0, max]. NOTE(review): max is narrowed to int16_t here; presumably <= 0x7FFF -- confirm.
+    res = vmaxq_s16(res, vdupq_n_s16(0));
+    res = vminq_s16(res, vdupq_n_s16(max));
+
+    return res;
+}
+
+// Four-lane NEON form of TILEX_PROCF(fx, max) = SkClampMax((fx) >> 16, max); results stay in 32-bit lanes.
+static inline int32x4_t sbpsm_clamp_tile4(int32x4_t f, unsigned max) {
+    int32x4_t res;
+
+    // Arithmetic shift right by 16 leaves the (sign-extended) hi 16 bits of each lane.
+    res = vshrq_n_s32(f, 16);
+
+    // Clamp each lane to [0, max].
+    res = vmaxq_s32(res, vdupq_n_s32(0));
+    res = vminq_s32(res, vdupq_n_s32(max));
+
+    return res;
+}
+
+// Four-lane NEON form of TILE{X,Y}_LOW_BITS(f, max) = (((f) >> 12) & 0xF), minus the final & 0xF mask.
+static inline int32x4_t sbpsm_clamp_tile4_low_bits(int32x4_t fx) {
+    int32x4_t ret;
+
+    ret = vshrq_n_s32(fx, 12);
+
+    /* The & 0xF mask is deliberately skipped: the caller is expected to
+     * overwrite the non-masked bits, so only the low four bits matter.
+     */
+    //ret = vandq_s32(ret, vdupq_n_s32(0xF));
+
+    return ret;
+}
+
+// Eight-lane NEON form of the repeat-tile TILEX_PROCF(fx, max) = (((fx) & 0xFFFF) * ((max) + 1) >> 16).
+static inline int16x8_t sbpsm_repeat_tile8(int32x4_t low, int32x4_t high, unsigned max) {
+    uint16x8_t res;
+    uint32x4_t tmpl, tmph;
+
+    // Keep the lower 16 bits of every 32-bit lane: the four from |low| first, then the four from |high|.
+    res = vuzpq_u16(vreinterpretq_u16_s32(low), vreinterpretq_u16_s32(high)).val[0];
+
+    // Bare widening multiply, not SkFixedMul. NOTE(review): max+1 is narrowed to uint16_t; presumably max < 0xFFFF -- confirm.
+    tmpl = vmull_u16(vget_low_u16(res), vdup_n_u16(max+1));
+    tmph = vmull_u16(vget_high_u16(res), vdup_n_u16(max+1));
+
+    // Keep the upper 16 bits of each 32-bit product (the ">> 16" of the scalar macro).
+    res = vuzpq_u16(vreinterpretq_u16_u32(tmpl), vreinterpretq_u16_u32(tmph)).val[1];
+
+    return vreinterpretq_s16_u16(res);
+}
+
+// Four-lane NEON form of the repeat-tile TILEX_PROCF(fx, max) = (((fx) & 0xFFFF) * ((max) + 1) >> 16).
+static inline int32x4_t sbpsm_repeat_tile4(int32x4_t f, unsigned max) {
+    uint16x4_t res;
+    uint32x4_t tmp;
+
+    // Narrow each 32-bit lane to its lower 16 bits.
+    res = vmovn_u32(vreinterpretq_u32_s32(f));
+
+    // Bare widening multiply, not SkFixedMul. NOTE(review): max+1 is narrowed to uint16_t; presumably max < 0xFFFF -- confirm.
+    tmp = vmull_u16(res, vdup_n_u16(max+1));
+
+    // Logical shift right by 16 keeps the upper 16 bits of each product.
+    tmp = vshrq_n_u32(tmp, 16);
+
+    return vreinterpretq_s32_u32(tmp);
+}
+
+// Four-lane NEON form of TILEX_LOW_BITS(fx, max) = ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF), minus the final mask.
+static inline int32x4_t sbpsm_repeat_tile4_low_bits(int32x4_t fx, unsigned max) {
+    uint16x4_t res;
+    uint32x4_t tmp;
+    int32x4_t ret;
+
+    // Narrow each 32-bit lane to its lower 16 bits.
+    res = vmovn_u32(vreinterpretq_u32_s32(fx));
+
+    // Bare widening multiply, not SkFixedMul. NOTE(review): max + 1 is narrowed to uint16_t; presumably max < 0xFFFF -- confirm.
+    tmp = vmull_u16(res, vdup_n_u16(max + 1));
+
+    // Shift right by 12 (on the signed view of the product); the & 0xF mask is intentionally not applied.
+    ret = vshrq_n_s32(vreinterpretq_s32_u32(tmp), 12);
+
+    /* The & 0xF mask is deliberately skipped: the caller is expected to
+     * overwrite the non-masked bits, so only the low four bits matter.
+     */
+    //ret = vandq_s32(ret, vdupq_n_s32(0xF));
+
+    return ret;
+}
+
+#define MAKENAME(suffix)                ClampX_ClampY ## suffix ## _neon
+#define TILEX_PROCF(fx, max)            SkClampMax((fx) >> 16, max)
+#define TILEY_PROCF(fy, max)            SkClampMax((fy) >> 16, max)
+#define TILEX_PROCF_NEON8(l, h, max)    sbpsm_clamp_tile8(l, h, max)
+#define TILEY_PROCF_NEON8(l, h, max)    sbpsm_clamp_tile8(l, h, max)
+#define TILEX_PROCF_NEON4(fx, max)      sbpsm_clamp_tile4(fx, max)
+#define TILEY_PROCF_NEON4(fy, max)      sbpsm_clamp_tile4(fy, max)
+#define TILEX_LOW_BITS(fx, max)         (((fx) >> 12) & 0xF)
+#define TILEY_LOW_BITS(fy, max)         (((fy) >> 12) & 0xF)
+#define TILEX_LOW_BITS_NEON4(fx, max)   sbpsm_clamp_tile4_low_bits(fx)
+#define TILEY_LOW_BITS_NEON4(fy, max)   sbpsm_clamp_tile4_low_bits(fy)
 #define CHECK_FOR_DECAL
-#include "SkBitmapProcState_matrix_clamp_neon.h"
-
-#define MAKENAME(suffix)        RepeatX_RepeatY ## suffix ## _neon
-#define TILEX_PROCF(fx, max)    SK_USHIFT16(((fx) & 0xFFFF) * ((max) + 1))
-#define TILEY_PROCF(fy, max)    SK_USHIFT16(((fy) & 0xFFFF) * ((max) + 1))
-#define TILEX_LOW_BITS(fx, max) ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
-#define TILEY_LOW_BITS(fy, max) ((((fy) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
-#include "SkBitmapProcState_matrix_repeat_neon.h"
+#include "SkBitmapProcState_matrix_neon.h"
+
+#define MAKENAME(suffix)                RepeatX_RepeatY ## suffix ## _neon
+#define TILEX_PROCF(fx, max)            SK_USHIFT16(((fx) & 0xFFFF) * ((max) + 1))
+#define TILEY_PROCF(fy, max)            SK_USHIFT16(((fy) & 0xFFFF) * ((max) + 1))
+#define TILEX_PROCF_NEON8(l, h, max)    sbpsm_repeat_tile8(l, h, max)
+#define TILEY_PROCF_NEON8(l, h, max)    sbpsm_repeat_tile8(l, h, max)
+#define TILEX_PROCF_NEON4(fx, max)      sbpsm_repeat_tile4(fx, max)
+#define TILEY_PROCF_NEON4(fy, max)      sbpsm_repeat_tile4(fy, max)
+#define TILEX_LOW_BITS(fx, max)         ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
+#define TILEY_LOW_BITS(fy, max)         ((((fy) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
+#define TILEX_LOW_BITS_NEON4(fx, max)   sbpsm_repeat_tile4_low_bits(fx, max)
+#define TILEY_LOW_BITS_NEON4(fy, max)   sbpsm_repeat_tile4_low_bits(fy, max)
+#include "SkBitmapProcState_matrix_neon.h"
 
 
 
diff --git a/chromium/third_party/skia/src/opts/SkBitmapProcState_matrix_clamp_neon.h b/chromium/third_party/skia/src/opts/SkBitmapProcState_matrix_clamp_neon.h
deleted file mode 100644
index a615e26b240..00000000000
--- a/chromium/third_party/skia/src/opts/SkBitmapProcState_matrix_clamp_neon.h
+++ /dev/null
@@ -1,911 +0,0 @@
-/* NEON optimized code (C) COPYRIGHT 2009 Motorola
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-/*
- * Modifications done in-house at Motorola
- *
- * this is a clone of SkBitmapProcState_matrix.h
- * and has been tuned to work with the NEON unit.
- *
- * Still going back and forth between whether this approach
- * (clone the entire SkBitmapProcState_matrix.h file or
- * if I should put just the modified routines in here and
- * then use a construct like #define DONT_DO_THIS_FUNCTION or
- * something like that...
- *
- * This is for the ClampX_ClampY instance
- *
- */
-
-
-#include <arm_neon.h>
-
-/*
- * This has been modified on the knowledge that (at the time)
- * we had the following macro definitions in the parent file
- *
- * #define MAKENAME(suffix)        ClampX_ClampY ## suffix
- * #define TILEX_PROCF(fx, max)    SkClampMax((fx) >> 16, max)
- * #define TILEY_PROCF(fy, max)    SkClampMax((fy) >> 16, max)
- * #define TILEX_LOW_BITS(fx, max) (((fx) >> 12) & 0xF)
- * #define TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF)
- * #define CHECK_FOR_DECAL
- */
-
-/* SkClampMax(val,max) -- bound to 0..max */
-
-#define SCALE_NOFILTER_NAME     MAKENAME(_nofilter_scale)
-#define SCALE_FILTER_NAME       MAKENAME(_filter_scale)
-#define AFFINE_NOFILTER_NAME    MAKENAME(_nofilter_affine)
-#define AFFINE_FILTER_NAME      MAKENAME(_filter_affine)
-#define PERSP_NOFILTER_NAME     MAKENAME(_nofilter_persp)
-#define PERSP_FILTER_NAME       MAKENAME(_filter_persp)
-
-#define PACK_FILTER_X_NAME  MAKENAME(_pack_filter_x)
-#define PACK_FILTER_Y_NAME  MAKENAME(_pack_filter_y)
-
-#ifndef PREAMBLE
-    #define PREAMBLE(state)
-    #define PREAMBLE_PARAM_X
-    #define PREAMBLE_PARAM_Y
-    #define PREAMBLE_ARG_X
-    #define PREAMBLE_ARG_Y
-#endif
-
-static void SCALE_NOFILTER_NAME(const SkBitmapProcState& s,
-                                uint32_t xy[], int count, int x, int y) {
-    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
-                             SkMatrix::kScale_Mask)) == 0);
-
-    PREAMBLE(s);
-    // we store y, x, x, x, x, x
-
-    const unsigned maxX = s.fBitmap->width() - 1;
-    SkFixed fx;
-    {
-        SkPoint pt;
-        s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
-                                 SkIntToScalar(y) + SK_ScalarHalf, &pt);
-        fx = SkScalarToFixed(pt.fY);
-        const unsigned maxY = s.fBitmap->height() - 1;
-        *xy++ = TILEY_PROCF(fx, maxY);
-        fx = SkScalarToFixed(pt.fX);
-    }
-
-    if (0 == maxX) {
-        // all of the following X values must be 0
-        memset(xy, 0, count * sizeof(uint16_t));
-        return;
-    }
-
-    const SkFixed dx = s.fInvSx;
-
-#ifdef CHECK_FOR_DECAL
-    // test if we don't need to apply the tile proc
-    if ((unsigned)(fx >> 16) <= maxX &&
-        (unsigned)((fx + dx * (count - 1)) >> 16) <= maxX) {
-        decal_nofilter_scale_neon(xy, fx, dx, count);
-        return;
-    }
-#endif
-
-    int i;
-
-    /* very much like done in decal_nofilter, but with
-     * an extra clamping function applied.
-     * TILEX_PROCF(fx,max) SkClampMax((fx)>>16, max)
-     */
-    if (count >= 8) {
-        /* SkFixed is 16.16 fixed point */
-        SkFixed dx2 = dx+dx;
-        SkFixed dx4 = dx2+dx2;
-        SkFixed dx8 = dx4+dx4;
-
-        /* now build fx/fx+dx/fx+2dx/fx+3dx */
-        SkFixed fx1, fx2, fx3;
-        int32x4_t lbase, hbase;
-        int16_t *dst16 = (int16_t *)xy;
-
-        fx1 = fx+dx;
-        fx2 = fx1+dx;
-        fx3 = fx2+dx;
-
-        /* build my template(s) */
-        /* avoid the 'lbase unitialized' warning */
-        lbase = vdupq_n_s32(fx);
-        lbase = vsetq_lane_s32(fx1, lbase, 1);
-        lbase = vsetq_lane_s32(fx2, lbase, 2);
-        lbase = vsetq_lane_s32(fx3, lbase, 3);
-
-        hbase = vaddq_s32(lbase, vdupq_n_s32(dx4));
-
-        /* store & bump */
-        do {
-            int32x4_t lout;
-            int32x4_t hout;
-            int16x8_t hi16;
-
-            /* get the hi 16s of all those 32s */
-            lout = lbase;
-            hout = hbase;
-            /* this sets up all lout's then all hout's in hout */
-            asm ("vuzpq.16 %q0, %q1" : "+w" (lout), "+w" (hout));
-            hi16 = vreinterpretq_s16_s32(hout);
-
-            /* clamp & output */
-            hi16 = vmaxq_s16(hi16, vdupq_n_s16(0));
-            hi16 = vminq_s16(hi16, vdupq_n_s16(maxX));
-            vst1q_s16(dst16, hi16);
-
-            /* but preserving base & on to the next */
-            lbase = vaddq_s32 (lbase, vdupq_n_s32(dx8));
-            hbase = vaddq_s32 (hbase, vdupq_n_s32(dx8));
-            dst16 += 8;
-            count -= 8;
-            fx += dx8;
-        } while (count >= 8);
-        xy = (uint32_t *) dst16;
-    }
-
-    uint16_t* xx = (uint16_t*)xy;
-    for (i = count; i > 0; --i) {
-        *xx++ = TILEX_PROCF(fx, maxX); fx += dx;
-    }
-}
-
-// note: we could special-case on a matrix which is skewed in X but not Y.
-// this would require a more general setup thatn SCALE does, but could use
-// SCALE's inner loop that only looks at dx
-
-static void AFFINE_NOFILTER_NAME(const SkBitmapProcState& s,
-                                 uint32_t xy[], int count, int x, int y) {
-    SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
-    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
-                             SkMatrix::kScale_Mask |
-                             SkMatrix::kAffine_Mask)) == 0);
-
-    PREAMBLE(s);
-    SkPoint srcPt;
-    s.fInvProc(s.fInvMatrix,
-               SkIntToScalar(x) + SK_ScalarHalf,
-               SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
-
-    SkFixed fx = SkScalarToFixed(srcPt.fX);
-    SkFixed fy = SkScalarToFixed(srcPt.fY);
-    SkFixed dx = s.fInvSx;
-    SkFixed dy = s.fInvKy;
-    int maxX = s.fBitmap->width() - 1;
-    int maxY = s.fBitmap->height() - 1;
-
-    /* NEON lets us do an 8x unrolling */
-    if (count >= 8) {
-        /* SkFixed is 16.16 fixed point */
-        SkFixed dx4 = dx * 4;
-        SkFixed dy4 = dy * 4;
-        SkFixed dx8 = dx * 8;
-        SkFixed dy8 = dy * 8;
-
-        int32x4_t xbase, ybase;
-        int32x4_t x2base, y2base;
-        int16_t *dst16 = (int16_t *) xy;
-
-        /* my sets of maxx/maxy for clamping */
-        int32_t maxpair = (maxX&0xffff) | ((maxY&0xffff)<<16);
-        int16x8_t maxXY = vreinterpretq_s16_s32(vdupq_n_s32(maxpair));
-
-        /* now build fx/fx+dx/fx+2dx/fx+3dx */
-        /* avoid the 'xbase unitialized' warning...*/
-        xbase = vdupq_n_s32(fx);
-        xbase = vsetq_lane_s32(fx+dx, xbase, 1);
-        xbase = vsetq_lane_s32(fx+dx+dx, xbase, 2);
-        xbase = vsetq_lane_s32(fx+dx+dx+dx, xbase, 3);
-
-        /* same for fy */
-        /* avoid the 'ybase unitialized' warning...*/
-        ybase = vdupq_n_s32(fy);
-        ybase = vsetq_lane_s32(fy+dy, ybase, 1);
-        ybase = vsetq_lane_s32(fy+dy+dy, ybase, 2);
-        ybase = vsetq_lane_s32(fy+dy+dy+dy, ybase, 3);
-
-        x2base = vaddq_s32(xbase, vdupq_n_s32(dx4));
-        y2base = vaddq_s32(ybase, vdupq_n_s32(dy4));
-
-        /* store & bump */
-        do {
-            int32x4_t xout, yout;
-            int32x4_t x2out, y2out;
-            int16x8_t hi16, hi16_2;
-
-            xout = xbase;
-            yout = ybase;
-
-            /* overlay y's low16 with hi16 from x */
-            /* so we properly shifted xyxyxyxy */
-            yout = vsriq_n_s32(yout, xout, 16);
-            hi16 = vreinterpretq_s16_s32 (yout);
-
-            /* do the clamping; both guys get 0's */
-            hi16 = vmaxq_s16 (hi16, vdupq_n_s16(0));
-            hi16 = vminq_s16 (hi16, maxXY);
-
-            vst1q_s16 (dst16, hi16);
-
-            /* and for the other 4 pieces of this iteration */
-            x2out = x2base;
-            y2out = y2base;
-
-            /* overlay y's low16 with hi16 from x */
-            /* so we properly shifted xyxyxyxy */
-            y2out = vsriq_n_s32(y2out, x2out, 16);
-            hi16_2 = vreinterpretq_s16_s32 (y2out);
-
-            /* do the clamping; both guys get 0's */
-            hi16_2 = vmaxq_s16 (hi16_2, vdupq_n_s16(0));
-            hi16_2 = vminq_s16 (hi16_2, maxXY);
-
-            /* RBE: gcc regenerates dst16+8 all the time instead
-             * of folding it into an addressing mode. *sigh* */
-            vst1q_s16 (dst16+8, hi16_2);
-
-            /* moving base and on to the next */
-            xbase = vaddq_s32 (xbase, vdupq_n_s32 (dx8));
-            ybase = vaddq_s32 (ybase, vdupq_n_s32 (dy8));
-            x2base = vaddq_s32 (x2base, vdupq_n_s32 (dx8));
-            y2base = vaddq_s32 (y2base, vdupq_n_s32 (dy8));
-
-            dst16 += 16;        /* 8x32 aka 16x16 */
-            count -= 8;
-            fx += dx8;
-            fy += dy8;
-        } while (count >= 8);
-        xy = (uint32_t *) dst16;
-    }
-
-    for (int i = count; i > 0; --i) {
-        *xy++ = (TILEY_PROCF(fy, maxY) << 16) | TILEX_PROCF(fx, maxX);
-        fx += dx; fy += dy;
-    }
-}
-
-#undef    DEBUG_PERSP_NOFILTER
-
-static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s,
-                                uint32_t* SK_RESTRICT xy,
-                                int count, int x, int y) {
-    SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask);
-
-    PREAMBLE(s);
-    /* max{X,Y} are int here, but later shown/assumed to fit in 16 bits */
-    int maxX = s.fBitmap->width() - 1;
-    int maxY = s.fBitmap->height() - 1;
-
-    SkPerspIter   iter(s.fInvMatrix,
-                       SkIntToScalar(x) + SK_ScalarHalf,
-                       SkIntToScalar(y) + SK_ScalarHalf, count);
-
-    while ((count = iter.next()) != 0) {
-        const SkFixed* SK_RESTRICT srcXY = iter.getXY();
-
-#if defined(DEBUG_PERSP_NOFILTER)
-    /* debugging stuff */
-    const SkFixed *end_srcXY = srcXY + (count*2);
-    uint32_t *end_xy = xy + (count);
-    const SkFixed *base_srcXY = srcXY;
-    uint32_t *base_xy = xy;
-    int base_count = count;
-#endif
-
-#if 1
-        // 2009/9/30: crashes in ApiDemos - Views - Animation - 3D Transition
-    // 2009/10/9: reworked to avoid illegal (but allowed by gas) insn
-
-        /* srcXY is a batch of 32 bit numbers X0,Y0,X1,Y1...
-         * but we immediately discard the low 16 bits...
-         * so what we're going to do is vld4, which will give us
-         * xlo,xhi,ylo,yhi distribution and we can ignore the 'lo'
-         * parts....
-         */
-        if (count >= 8) {
-            int16_t *mysrc = (int16_t *) srcXY;
-            int16_t *mydst = (int16_t *) xy;
-            int16x4_t maxX4 = vdup_n_s16((int16_t)maxX);
-            int16x4_t maxY4 = vdup_n_s16((int16_t)maxY);
-            int16x4_t zero4 = vdup_n_s16(0);
-
-        /* The constructs with local blocks for register assignments
-         * and asm() instructions is to make keep any hard register
-         * assignments to as small a scope as possible. and to avoid
-         * burning call-preserved hard registers on the vld/vst
-         * instructions.
-         */
-
-            do {
-                int16x4_t xhi, yhi;
-                int16x4_t x2hi, y2hi;
-
-                /* vld4 does the de-interleaving for us */
-        {
-                    register int16x4_t t_xlo asm("d0");
-                    register int16x4_t t_xhi asm("d1");
-                    register int16x4_t t_ylo asm("d2");
-                    register int16x4_t t_yhi asm("d3");
-
-                    asm ("vld4.16    {d0-d3},[%4]  /* xlo=%P0 xhi=%P1 ylo=%P2 yhi=%P3 */"
-                        : "=w" (t_xlo), "=w" (t_xhi), "=w" (t_ylo), "=w" (t_yhi)
-                        : "r" (mysrc)
-                    );
-            xhi = t_xhi;
-            yhi = t_yhi;
-        }
-
-                /* clamp X>>16 (aka xhi) to 0..maxX */
-                xhi = vmax_s16(xhi, zero4);    /* now 0.. */
-                xhi = vmin_s16(xhi, maxX4);    /* now 0..maxX */
-
-                /* clamp Y>>16 (aka yhi) to 0..maxY */
-                yhi = vmax_s16(yhi, zero4);    /* now 0.. */
-                yhi = vmin_s16(yhi, maxY4);    /* now 0..maxY */
-
-        /* deal with the second set of numbers */
-        {
-                    register int16x4_t t_xlo asm("d4");
-                    register int16x4_t t_xhi asm("d5");
-                    register int16x4_t t_ylo asm("d6");
-                    register int16x4_t t_yhi asm("d7");
-
-                    /* offset == 256 bits == 32 bytes == 8 longs == 16 shorts */
-                    asm ("vld4.16    {d4-d7},[%4]  /* xlo=%P0 xhi=%P1 ylo=%P2 yhi=%P3 */"
-                        : "=w" (t_xlo), "=w" (t_xhi), "=w" (t_ylo), "=w" (t_yhi)
-                        : "r" (mysrc+16)
-                    );
-            x2hi = t_xhi;
-            y2hi = t_yhi;
-        }
-
-                /* clamp the second 4 here */
-
-        if (0) { extern void rbe(void); rbe(); }
-
-                /* clamp X>>16 (aka xhi) to 0..maxX */
-                x2hi = vmax_s16(x2hi, zero4);    /* now 0.. */
-                x2hi = vmin_s16(x2hi, maxX4);    /* now 0..maxX */
-
-                /* clamp Y>>16 (aka yhi) to 0..maxY */
-                y2hi = vmax_s16(y2hi, zero4);    /* now 0.. */
-                y2hi = vmin_s16(y2hi, maxY4);    /* now 0..maxY */
-
-                /* we're storing as {x,y}s: x is [0], y is [1] */
-                /* we'll use vst2 to make this happen */
-
-        {
-                    register int16x4_t out_x asm("d16") = xhi;
-                    register int16x4_t out_y asm("d17") = yhi;
-
-                    asm ("vst2.16    {d16-d17},[%2]  /* xlo=%P0 xhi=%P1 */"
-            :
-            : "w" (out_x), "w" (out_y), "r" (mydst)
-            );
-        }
-        {
-                    register int16x4_t out_x asm("d18") = x2hi;
-                    register int16x4_t out_y asm("d19") = y2hi;
-
-                    asm ("vst2.16    {d18-d19},[%2]  /* xlo=%P0 xhi=%P1 */"
-            :
-            : "w" (out_x), "w" (out_y), "r" (mydst+8)
-            );
-        }
-
-                /* XXX: gcc isn't interleaving these with the NEON ops
-                 * but i think that all the scoreboarding works out */
-                count -= 8;    /* 8 iterations */
-                mysrc += 32;    /* 16 longs, aka 32 shorts */
-                mydst += 16;    /* 16 shorts, aka 8 longs */
-            } while (count >= 8);
-            /* get xy and srcXY fixed up */
-            srcXY = (const SkFixed *) mysrc;
-            xy = (uint32_t *) mydst;
-        }
-#endif
-
-        while (--count >= 0) {
-            *xy++ = (TILEY_PROCF(srcXY[1], maxY) << 16) |
-                     TILEX_PROCF(srcXY[0], maxX);
-            srcXY += 2;
-        }
-
-#if defined(DEBUG_PERSP_NOFILTER)
-    /* for checking our NEON-produced results against vanilla code */
-    {
-        int bad = (-1);
-        for (int i = 0; i < base_count; i++) {
-            uint32_t val;
-            val = (TILEY_PROCF (base_srcXY[i * 2 + 1], maxY) << 16) |
-                    TILEX_PROCF (base_srcXY[i * 2 + 0], maxX);
-
-            if (val != base_xy[i]) {
-                bad = i;
-                break;
-            }
-        }
-        if (bad >= 0) {
-            SkDebugf("clamp-nofilter-persp failed piece %d\n", bad);
-            SkDebugf("    maxX %08x maxY %08x\n", maxX, maxY);
-            bad -= (bad & 0x7);           /* align */
-            for (int i = bad; i < bad + 8; i++) {
-                uint32_t val;
-                val = (TILEY_PROCF (base_srcXY[i * 2 + 1], maxY) << 16) |
-                TILEX_PROCF (base_srcXY[i * 2 + 0], maxX);
-
-                SkDebugf("%d: got %08x want %08x srcXY[0] %08x srcXY[1] %08x\n",
-                          i, base_xy[i], val, base_srcXY[i * 2 + 0],
-                 base_srcXY[i * 2 + 1]);
-            }
-            SkDebugf ("---\n");
-        }
-
-        if (end_xy != xy) {
-            SkDebugf("xy ended at %08x, should be %08x\n", xy, end_xy);
-        }
-        if (end_srcXY != srcXY) {
-            SkDebugf("srcXY ended at %08x, should be %08x\n", srcXY,
-                      end_srcXY);
-        }
-    }
-#endif
-    }
-}
-
-#undef    DEBUG_PERSP_NOFILTER
-
-//////////////////////////////////////////////////////////////////////////////
-
-static inline uint32_t PACK_FILTER_Y_NAME(SkFixed f, unsigned max,
-                                          SkFixed one PREAMBLE_PARAM_Y) {
-    unsigned i = TILEY_PROCF(f, max);
-    i = (i << 4) | TILEY_LOW_BITS(f, max);
-    return (i << 14) | (TILEY_PROCF((f + one), max));
-}
-
-static inline uint32_t PACK_FILTER_X_NAME(SkFixed f, unsigned max,
-                                          SkFixed one PREAMBLE_PARAM_X) {
-    unsigned i = TILEX_PROCF(f, max);
-    i = (i << 4) | TILEX_LOW_BITS(f, max);
-    return (i << 14) | (TILEX_PROCF((f + one), max));
-}
-
-static void SCALE_FILTER_NAME(const SkBitmapProcState& s,
-                              uint32_t xy[], int count, int x, int y) {
-    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
-                             SkMatrix::kScale_Mask)) == 0);
-    SkASSERT(s.fInvKy == 0);
-
-    PREAMBLE(s);
-
-    const unsigned maxX = s.fBitmap->width() - 1;
-    const SkFixed one = s.fFilterOneX;
-    const SkFixed dx = s.fInvSx;
-    SkFixed fx;
-
-    {
-        SkPoint pt;
-        s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
-                                 SkIntToScalar(y) + SK_ScalarHalf, &pt);
-        const SkFixed fy = SkScalarToFixed(pt.fY) - (s.fFilterOneY >> 1);
-        const unsigned maxY = s.fBitmap->height() - 1;
-        // compute our two Y values up front
-        *xy++ = PACK_FILTER_Y_NAME(fy, maxY, s.fFilterOneY PREAMBLE_ARG_Y);
-        // now initialize fx
-        fx = SkScalarToFixed(pt.fX) - (one >> 1);
-    }
-
-#ifdef CHECK_FOR_DECAL
-    // test if we don't need to apply the tile proc
-    if (dx > 0 &&
-            (unsigned)(fx >> 16) <= maxX &&
-            (unsigned)((fx + dx * (count - 1)) >> 16) < maxX) {
-        decal_filter_scale_neon(xy, fx, dx, count);
-    } else
-#endif
-
-    if (count >= 4) {
-        int32x4_t wide_one, wide_fx, wide_fx1, wide_i, wide_lo;
-    #if 0
-        /* verification hooks -- see below */
-        SkFixed debug_fx = fx;
-        int count_done = 0;
-    #endif
-
-        wide_fx = vdupq_n_s32(fx);
-        wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1);
-        wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2);
-        wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3);
-
-        wide_one = vdupq_n_s32(one);
-
-        while (count >= 4) {
-            /* original expands to:
-             * unsigned i = SkClampMax((f) >> 16, max);
-             * i = (i << 4) | (((f) >> 12) & 0xF);
-             * return (i << 14) | (SkClampMax(((f + one)) >> 16, max));
-             */
-
-            /* i = SkClampMax(f>>16, maxX) */
-            wide_i = vmaxq_s32(vshrq_n_s32(wide_fx,16), vdupq_n_s32(0));
-            wide_i = vminq_s32(wide_i, vdupq_n_s32(maxX));
-
-            /* i<<4 | TILEX_LOW_BITS(fx) */
-            wide_lo = vshrq_n_s32(wide_fx, 12);
-            wide_i = vsliq_n_s32(wide_lo, wide_i, 4);
-
-            /* i<<14 */
-            wide_i = vshlq_n_s32(wide_i, 14);
-
-            /* SkClampMax(((f + one)) >> 16, max) */
-            wide_fx1 = vaddq_s32(wide_fx, wide_one);
-            wide_fx1 = vmaxq_s32(vshrq_n_s32(wide_fx1,16), vdupq_n_s32(0));
-            wide_fx1 = vminq_s32(wide_fx1, vdupq_n_s32(maxX));
-
-            /* final combination */
-            wide_i = vorrq_s32(wide_i, wide_fx1);
-
-            vst1q_u32(xy, vreinterpretq_u32_s32(wide_i));
-
-    #if 0
-            /* having a verification hook is a good idea */
-            /* use debug_fx, debug_fx+dx, etc. */
-
-            for (int i=0;i<4;i++) {
-            uint32_t want = PACK_FILTER_X_NAME(debug_fx, maxX, one PREAMBLE_ARG_X);
-                    if (xy[i] != want)
-                {
-                /* print a nastygram */
-                SkDebugf("clamp-filter-scale fails\n");
-                SkDebugf("got %08x want %08x\n", xy[i], want);
-                SkDebugf("fx %08x debug_fx %08x dx %08x done %d\n",
-                fx, debug_fx, dx, count_done);
-                SkDebugf(" maxX %08x one %08x\n", maxX, one);
-
-                }
-            debug_fx += dx;
-            count_done++;
-            }
-    #endif
-            wide_fx += vdupq_n_s32(dx+dx+dx+dx);
-            fx += dx+dx+dx+dx;
-            xy += 4;
-            count -= 4;
-        }
-    }
-
-    while (--count >= 0) {
-        *xy++ = PACK_FILTER_X_NAME(fx, maxX, one PREAMBLE_ARG_X);
-        fx += dx;
-    }
-}
-
-static void AFFINE_FILTER_NAME(const SkBitmapProcState& s,
-                               uint32_t xy[], int count, int x, int y) {
-    SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
-    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
-                             SkMatrix::kScale_Mask |
-                             SkMatrix::kAffine_Mask)) == 0);
-
-    PREAMBLE(s);
-    SkPoint srcPt;
-    s.fInvProc(s.fInvMatrix,
-               SkIntToScalar(x) + SK_ScalarHalf,
-               SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
-
-    SkFixed oneX = s.fFilterOneX;
-    SkFixed oneY = s.fFilterOneY;
-    SkFixed fx = SkScalarToFixed(srcPt.fX) - (oneX >> 1);
-    SkFixed fy = SkScalarToFixed(srcPt.fY) - (oneY >> 1);
-    SkFixed dx = s.fInvSx;
-    SkFixed dy = s.fInvKy;
-    unsigned maxX = s.fBitmap->width() - 1;
-    unsigned maxY = s.fBitmap->height() - 1;
-
-    if (count >= 4) {
-        int32x4_t wide_i, wide_lo;
-        int32x4_t wide_fx, wide_onex, wide_fx1;
-        int32x4_t wide_fy, wide_oney, wide_fy1;
-
-    #undef    AFFINE_DEBUG
-    #if    defined(AFFINE_DEBUG)
-        SkFixed fyp = fy;
-        SkFixed fxp = fx;
-        uint32_t *xyp = xy;
-        int count_done = 0;
-    #endif
-
-        wide_fx = vdupq_n_s32(fx);
-        wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1);
-        wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2);
-        wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3);
-
-        wide_fy = vdupq_n_s32(fy);
-        wide_fy = vsetq_lane_s32(fy+dy, wide_fy, 1);
-        wide_fy = vsetq_lane_s32(fy+dy+dy, wide_fy, 2);
-        wide_fy = vsetq_lane_s32(fy+dy+dy+dy, wide_fy, 3);
-
-        wide_onex = vdupq_n_s32(oneX);
-        wide_oney = vdupq_n_s32(oneY);
-
-        while (count >= 4) {
-            int32x4_t wide_x;
-            int32x4_t wide_y;
-
-            /* do the X side, then the Y side, then interleave them */
-
-            /* original expands to:
-             * unsigned i = SkClampMax((f) >> 16, max);
-             * i = (i << 4) | (((f) >> 12) & 0xF);
-             * return (i << 14) | (SkClampMax(((f + one)) >> 16, max));
-             */
-
-            /* i = SkClampMax(f>>16, maxX) */
-            wide_i = vmaxq_s32(vshrq_n_s32(wide_fx,16), vdupq_n_s32(0));
-            wide_i = vminq_s32(wide_i, vdupq_n_s32(maxX));
-
-            /* i<<4 | TILEX_LOW_BITS(fx) */
-            wide_lo = vshrq_n_s32(wide_fx, 12);
-            wide_i = vsliq_n_s32(wide_lo, wide_i, 4);
-
-            /* i<<14 */
-            wide_i = vshlq_n_s32(wide_i, 14);
-
-            /* SkClampMax(((f + one)) >> 16, max) */
-            wide_fx1 = vaddq_s32(wide_fx, wide_onex);
-            wide_fx1 = vmaxq_s32(vshrq_n_s32(wide_fx1,16), vdupq_n_s32(0));
-            wide_fx1 = vminq_s32(wide_fx1, vdupq_n_s32(maxX));
-
-            /* final combination */
-            wide_x = vorrq_s32(wide_i, wide_fx1);
-
-            /* And now the Y side */
-
-            /* i = SkClampMax(f>>16, maxX) */
-            wide_i = vmaxq_s32(vshrq_n_s32(wide_fy,16), vdupq_n_s32(0));
-            wide_i = vminq_s32(wide_i, vdupq_n_s32(maxY));
-
-            /* i<<4 | TILEX_LOW_BITS(fx) */
-            wide_lo = vshrq_n_s32(wide_fy, 12);
-            wide_i = vsliq_n_s32(wide_lo, wide_i, 4);
-
-            /* i<<14 */
-            wide_i = vshlq_n_s32(wide_i, 14);
-
-            /* SkClampMax(((f + one)) >> 16, max) */
-            wide_fy1 = vaddq_s32(wide_fy, wide_oney);
-            wide_fy1 = vmaxq_s32(vshrq_n_s32(wide_fy1,16), vdupq_n_s32(0));
-            wide_fy1 = vminq_s32(wide_fy1, vdupq_n_s32(maxY));
-
-            /* final combination */
-            wide_y = vorrq_s32(wide_i, wide_fy1);
-
-            /* interleave as YXYXYXYX as part of the storing */
-        {
-                /* vst2.32 needs side-by-side registers */
-                register int32x4_t t_x asm("q1");
-                register int32x4_t t_y asm("q0");
-
-        t_x = wide_x; t_y = wide_y;
-                asm ("vst2.32    {q0-q1},[%2]  /* y=%q0 x=%q1 */"
-                    :
-                    : "w" (t_y), "w" (t_x), "r" (xy)
-                    );
-        }
-
-    #if    defined(AFFINE_DEBUG)
-            /* make sure we're good here -- check the 4 we just output */
-            for (int i = 0; i<4;i++) {
-            uint32_t val;
-            val = PACK_FILTER_Y_NAME(fyp, maxY, oneY PREAMBLE_ARG_Y);
-            if (val != xy[i*2+0]) {
-                /* print a nastygram */
-                SkDebugf("clamp-filter-affine fails\n");
-                SkDebugf("[bad-y] got %08x want %08x\n", xy[i*2+0], val);
-                SkDebugf("fy %08x fxp %08x fyp %08x dx %08x dy %08x done %d\n",
-                fy, fxp, fyp, dx, dy, count_done);
-                SkDebugf(" maxY %08x oneY %08x\n", maxY, oneY);
-                }
-            val = PACK_FILTER_X_NAME(fxp, maxX, oneX PREAMBLE_ARG_X);
-            if (val != xy[i*2+1]) {
-                /* print a nastygram */
-                SkDebugf("clamp-filter-affine fails\n");
-                SkDebugf("[bad-x] got %08x want %08x\n", xy[i*2+1], val);
-                SkDebugf("fx %08x fxp %08x fyp %08x dx %08x dy %08x done %d\n",
-                fx, fxp, fyp, dx, dy, count_done);
-                SkDebugf(" maxX %08x one %08x\n", maxX, oneX);
-            }
-            fyp += dy;
-            fxp += dx;
-            count_done++;
-            }
-    #endif
-
-            wide_fx += vdupq_n_s32(dx+dx+dx+dx);
-            fx += dx+dx+dx+dx;
-            wide_fy += vdupq_n_s32(dy+dy+dy+dy);
-            fy += dy+dy+dy+dy;
-            xy += 8;        /* 4 x's, 4 y's */
-            count -= 4;
-        }
-    }
-
-    while (--count >= 0) {
-        /* NB: writing Y/X */
-        *xy++ = PACK_FILTER_Y_NAME(fy, maxY, oneY PREAMBLE_ARG_Y);
-        fy += dy;
-        *xy++ = PACK_FILTER_X_NAME(fx, maxX, oneX PREAMBLE_ARG_X);
-        fx += dx;
-    }
-}
-
-static void PERSP_FILTER_NAME(const SkBitmapProcState& s,
-                              uint32_t* SK_RESTRICT xy, int count,
-                              int x, int y) {
-    SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask);
-
-    PREAMBLE(s);
-    unsigned maxX = s.fBitmap->width() - 1;
-    unsigned maxY = s.fBitmap->height() - 1;
-    SkFixed oneX = s.fFilterOneX;
-    SkFixed oneY = s.fFilterOneY;
-
-    SkPerspIter   iter(s.fInvMatrix,
-                       SkIntToScalar(x) + SK_ScalarHalf,
-                       SkIntToScalar(y) + SK_ScalarHalf, count);
-
-    while ((count = iter.next()) != 0) {
-        const SkFixed* SK_RESTRICT srcXY = iter.getXY();
-
-        if (count >= 4) {
-            int32x4_t wide_i, wide_lo;
-            int32x4_t wide_fx1;
-            int32x4_t wide_fy1;
-            int32x4_t wide_x, wide_y;
-
-            while (count >= 4) {
-                /* RBE: it's good, but:
-                 * -- we spill a constant that could be easily regnerated
-                 *    [perhaps tweak gcc's NEON constant costs?]
-                 */
-
-                /* load src:  x-y-x-y-x-y-x-y */
-        {
-            register int32x4_t q0 asm ("q0");
-            register int32x4_t q1 asm ("q1");
-                    asm ("vld2.32    {q0-q1},[%2]  /* x=%q0 y=%q1 */"
-                         : "=w" (q0), "=w" (q1)
-                         : "r" (srcXY));
-            wide_x = q0; wide_y = q1;
-        }
-
-                /* do the X side, then the Y side, then interleave them */
-
-                wide_x = vsubq_s32(wide_x, vdupq_n_s32 (oneX>>1));
-
-                /* original expands to:
-                 * unsigned i = SkClampMax((f) >> 16, max);
-                 * i = (i << 4) | (((f) >> 12) & 0xF);
-                 * return (i << 14) | (SkClampMax(((f + one)) >> 16, max));
-                 */
-
-                /* i = SkClampMax(f>>16, maxX) */
-                wide_i = vmaxq_s32 (vshrq_n_s32 (wide_x, 16), vdupq_n_s32 (0));
-                wide_i = vminq_s32 (wide_i, vdupq_n_s32 (maxX));
-
-                /* i<<4 | TILEX_LOW_BITS(fx) */
-                wide_lo = vshrq_n_s32 (wide_x, 12);
-                wide_i = vsliq_n_s32 (wide_lo, wide_i, 4);
-
-                /* i<<14 */
-                wide_i = vshlq_n_s32 (wide_i, 14);
-
-                /* SkClampMax(((f + one)) >> 16, max) */
-                wide_fx1 = vaddq_s32 (wide_x, vdupq_n_s32(oneX));
-                wide_fx1 = vmaxq_s32 (vshrq_n_s32 (wide_fx1, 16), vdupq_n_s32 (0));
-                wide_fx1 = vminq_s32 (wide_fx1, vdupq_n_s32 (maxX));
-
-                /* final combination */
-                wide_x = vorrq_s32 (wide_i, wide_fx1);
-
-
-                /* And now the Y side */
-
-                wide_y = vsubq_s32(wide_y, vdupq_n_s32 (oneY>>1));
-
-                /* i = SkClampMax(f>>16, maxX) */
-                wide_i = vmaxq_s32 (vshrq_n_s32 (wide_y, 16), vdupq_n_s32 (0));
-                wide_i = vminq_s32 (wide_i, vdupq_n_s32 (maxY));
-
-                /* i<<4 | TILEX_LOW_BITS(fx) */
-                wide_lo = vshrq_n_s32 (wide_y, 12);
-                wide_i = vsliq_n_s32 (wide_lo, wide_i, 4);
-
-                /* i<<14 */
-                wide_i = vshlq_n_s32 (wide_i, 14);
-
-                /* SkClampMax(((f + one)) >> 16, max) */
-
-                /* wide_fy1_1 and wide_fy1_2 are just temporary variables to
-                 * work-around an ICE in debug */
-                int32x4_t wide_fy1_1 = vaddq_s32 (wide_y, vdupq_n_s32(oneY));
-                int32x4_t wide_fy1_2 = vmaxq_s32 (vshrq_n_s32 (wide_fy1_1, 16),
-                                                  vdupq_n_s32 (0));
-                wide_fy1 = vminq_s32 (wide_fy1_2, vdupq_n_s32 (maxY));
-
-                /* final combination */
-                wide_y = vorrq_s32 (wide_i, wide_fy1);
-
-                /* switch them around; have to do it this way to get them
-                 * in the proper registers to match our instruction */
-
-                /* iteration bookkeeping, ahead of the asm() for scheduling */
-                srcXY += 2*4;
-                count -= 4;
-
-                /* store interleaved as y-x-y-x-y-x-y-x (NB != read order) */
-        {
-            register int32x4_t q0 asm ("q0") = wide_y;
-            register int32x4_t q1 asm ("q1") = wide_x;
-
-                    asm ("vst2.32    {q0-q1},[%2]  /* y=%q0 x=%q1 */"
-                        :
-                        : "w" (q0), "w" (q1), "r" (xy));
-        }
-
-                /* on to the next iteration */
-                /* count, srcXY are handled above */
-                xy += 2*4;
-            }
-        }
-
-        /* was do-while; NEON code invalidates original count>0 assumption */
-        while (--count >= 0) {
-        /* NB: we read x/y, we write y/x */
-            *xy++ = PACK_FILTER_Y_NAME(srcXY[1] - (oneY >> 1), maxY,
-                                       oneY PREAMBLE_ARG_Y);
-            *xy++ = PACK_FILTER_X_NAME(srcXY[0] - (oneX >> 1), maxX,
-                                       oneX PREAMBLE_ARG_X);
-            srcXY += 2;
-        }
-    }
-}
-
-const SkBitmapProcState::MatrixProc MAKENAME(_Procs)[] = {
-    SCALE_NOFILTER_NAME,
-    SCALE_FILTER_NAME,
-    AFFINE_NOFILTER_NAME,
-    AFFINE_FILTER_NAME,
-    PERSP_NOFILTER_NAME,
-    PERSP_FILTER_NAME
-};
-
-#undef MAKENAME
-#undef TILEX_PROCF
-#undef TILEY_PROCF
-#ifdef CHECK_FOR_DECAL
-    #undef CHECK_FOR_DECAL
-#endif
-
-#undef SCALE_NOFILTER_NAME
-#undef SCALE_FILTER_NAME
-#undef AFFINE_NOFILTER_NAME
-#undef AFFINE_FILTER_NAME
-#undef PERSP_NOFILTER_NAME
-#undef PERSP_FILTER_NAME
-
-#undef PREAMBLE
-#undef PREAMBLE_PARAM_X
-#undef PREAMBLE_PARAM_Y
-#undef PREAMBLE_ARG_X
-#undef PREAMBLE_ARG_Y
-
-#undef TILEX_LOW_BITS
-#undef TILEY_LOW_BITS
diff --git a/chromium/third_party/skia/src/opts/SkBitmapProcState_matrix_neon.h b/chromium/third_party/skia/src/opts/SkBitmapProcState_matrix_neon.h
new file mode 100644
index 00000000000..72bf1bce336
--- /dev/null
+++ b/chromium/third_party/skia/src/opts/SkBitmapProcState_matrix_neon.h
@@ -0,0 +1,506 @@
+
+#include <arm_neon.h>
+
+
+#define SCALE_NOFILTER_NAME     MAKENAME(_nofilter_scale)
+#define SCALE_FILTER_NAME       MAKENAME(_filter_scale)
+#define AFFINE_NOFILTER_NAME    MAKENAME(_nofilter_affine)
+#define AFFINE_FILTER_NAME      MAKENAME(_filter_affine)
+#define PERSP_NOFILTER_NAME     MAKENAME(_nofilter_persp)
+#define PERSP_FILTER_NAME       MAKENAME(_filter_persp)
+
+#define PACK_FILTER_X_NAME  MAKENAME(_pack_filter_x)
+#define PACK_FILTER_Y_NAME  MAKENAME(_pack_filter_y)
+#define PACK_FILTER_X4_NAME MAKENAME(_pack_filter_x4)
+#define PACK_FILTER_Y4_NAME MAKENAME(_pack_filter_y4)
+
+#ifndef PREAMBLE
+    #define PREAMBLE(state)
+    #define PREAMBLE_PARAM_X
+    #define PREAMBLE_PARAM_Y
+    #define PREAMBLE_ARG_X
+    #define PREAMBLE_ARG_Y
+#endif
+
+// Matrix proc for scale/translate-only inverse matrices, no filtering.
+// Output layout: xy[0] receives one tiled Y index as a 32-bit value; the
+// `count` tiled X indices follow, packed as consecutive 16-bit values.
+static void SCALE_NOFILTER_NAME(const SkBitmapProcState& s,
+                                uint32_t xy[], int count, int x, int y) {
+    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
+                             SkMatrix::kScale_Mask)) == 0);
+
+    PREAMBLE(s);
+
+    // we store y, x, x, x, x, x
+    const unsigned maxX = s.fBitmap->width() - 1;
+    SkFractionalInt fx;
+    {
+        // map (x + 0.5, y + 0.5) through the inverse matrix
+        SkPoint pt;
+        s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
+                                 SkIntToScalar(y) + SK_ScalarHalf, &pt);
+        const unsigned maxY = s.fBitmap->height() - 1;
+        const SkFractionalInt fy = SkScalarToFractionalInt(pt.fY);
+        *xy++ = TILEY_PROCF(SkFractionalIntToFixed(fy), maxY);
+        fx = SkScalarToFractionalInt(pt.fX);
+    }
+
+    if (0 == maxX) {
+        // width is 1, so every X index that follows must be 0
+        memset(xy, 0, count * sizeof(uint16_t));
+        return;
+    }
+
+    const SkFractionalInt dx = s.fInvSxFractionalInt;
+
+#ifdef CHECK_FOR_DECAL
+    // test if we don't need to apply the tile proc
+    if (can_truncate_to_fixed_for_decal(fx, dx, count, maxX)) {
+        decal_nofilter_scale_neon(xy, SkFractionalIntToFixed(fx),
+                             SkFractionalIntToFixed(dx), count);
+        return;
+    }
+#endif
+
+    if (count >= 8) {
+        const SkFractionalInt dx2 = dx+dx;
+        const SkFractionalInt dx4 = dx2+dx2;
+        const SkFractionalInt dx8 = dx4+dx4;
+
+        // per-iteration advance of eight samples, identical for every lane
+        const int32x4_t step8 = vdupq_n_s32(SkFractionalIntToFixed(dx8));
+        int16_t* out16 = (int16_t*)xy;
+
+        // lo = { fx, fx+dx, fx+2dx, fx+3dx }, hi = the following four samples
+        int32x4_t lo = vdupq_n_s32(SkFractionalIntToFixed(fx));
+        lo = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx), lo, 1);
+        lo = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx), lo, 2);
+        lo = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx+dx), lo, 3);
+        int32x4_t hi = vaddq_s32(lo, vdupq_n_s32(SkFractionalIntToFixed(dx4)));
+
+        // tile and store eight 16-bit indices per pass
+        for (; count >= 8; count -= 8) {
+            const int16x8_t tiled = TILEX_PROCF_NEON8(lo, hi, maxX);
+
+            vst1q_s16(out16, tiled);
+            out16 += 8;
+
+            lo = vaddq_s32(lo, step8);
+            hi = vaddq_s32(hi, step8);
+            // keep the scalar position in step for the tail loop below
+            fx += dx8;
+        }
+        xy = (uint32_t*)out16;
+    }
+
+    // scalar tail: fewer than eight samples left
+    uint16_t* xx = (uint16_t*)xy;
+    while (count-- > 0) {
+        *xx++ = TILEX_PROCF(SkFractionalIntToFixed(fx), maxX);
+        fx += dx;
+    }
+}
+
+// Matrix proc for affine inverse matrices, no filtering.
+// For each of `count` samples it writes one 32-bit entry holding the tiled
+// X index and the tiled Y index as a pair of 16-bit values.
+static void AFFINE_NOFILTER_NAME(const SkBitmapProcState& s,
+                                 uint32_t xy[], int count, int x, int y) {
+    SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
+    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
+                             SkMatrix::kScale_Mask |
+                             SkMatrix::kAffine_Mask)) == 0);
+
+    PREAMBLE(s);
+
+    // map (x + 0.5, y + 0.5) through the inverse matrix
+    SkPoint srcPt;
+    s.fInvProc(s.fInvMatrix,
+               SkIntToScalar(x) + SK_ScalarHalf,
+               SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
+
+    SkFractionalInt fx = SkScalarToFractionalInt(srcPt.fX);
+    SkFractionalInt fy = SkScalarToFractionalInt(srcPt.fY);
+    SkFractionalInt dx = s.fInvSxFractionalInt;
+    SkFractionalInt dy = s.fInvKyFractionalInt;
+    int maxX = s.fBitmap->width() - 1;
+    int maxY = s.fBitmap->height() - 1;
+
+    if (count >= 8) {
+        const SkFractionalInt dx4 = dx * 4;
+        const SkFractionalInt dy4 = dy * 4;
+        const SkFractionalInt dx8 = dx * 8;
+        const SkFractionalInt dy8 = dy * 8;
+
+        // per-iteration advance of eight samples, identical for every lane
+        const int32x4_t xstep8 = vdupq_n_s32(SkFractionalIntToFixed(dx8));
+        const int32x4_t ystep8 = vdupq_n_s32(SkFractionalIntToFixed(dy8));
+
+        int16_t* out16 = (int16_t*)xy;
+
+        // x0 = { fx, fx+dx, fx+2dx, fx+3dx }
+        int32x4_t x0 = vdupq_n_s32(SkFractionalIntToFixed(fx));
+        x0 = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx), x0, 1);
+        x0 = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx), x0, 2);
+        x0 = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx+dx), x0, 3);
+
+        // y0 = { fy, fy+dy, fy+2dy, fy+3dy }
+        int32x4_t y0 = vdupq_n_s32(SkFractionalIntToFixed(fy));
+        y0 = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy), y0, 1);
+        y0 = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy+dy), y0, 2);
+        y0 = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy+dy+dy), y0, 3);
+
+        // samples 4..7
+        int32x4_t x1 = vaddq_s32(x0, vdupq_n_s32(SkFractionalIntToFixed(dx4)));
+        int32x4_t y1 = vaddq_s32(y0, vdupq_n_s32(SkFractionalIntToFixed(dy4)));
+
+        // count >= 8 is guaranteed on entry, so this runs at least once
+        for (; count >= 8; count -= 8) {
+            int16x8x2_t pair;
+
+            pair.val[0] = TILEX_PROCF_NEON8(x0, x1, maxX);
+            pair.val[1] = TILEY_PROCF_NEON8(y0, y1, maxY);
+
+            // vst2 interleaves the two vectors: x, y, x, y, ...
+            vst2q_s16(out16, pair);
+            out16 += 16; // 8x32 aka 16x16
+
+            x0 = vaddq_s32(x0, xstep8);
+            y0 = vaddq_s32(y0, ystep8);
+            x1 = vaddq_s32(x1, xstep8);
+            y1 = vaddq_s32(y1, ystep8);
+
+            // keep the scalar position in step for the tail loop below
+            fx += dx8;
+            fy += dy8;
+        }
+        xy = (uint32_t*)out16;
+    }
+
+    // scalar tail: fewer than eight samples left
+    while (count-- > 0) {
+        *xy++ = (TILEY_PROCF(SkFractionalIntToFixed(fy), maxY) << 16) |
+                 TILEX_PROCF(SkFractionalIntToFixed(fx), maxX);
+        fx += dx;
+        fy += dy;
+    }
+}
+
+// Matrix proc for perspective inverse matrices, no filtering.
+// SkPerspIter hands back batches of interleaved fixed-point x,y pairs; each
+// pair becomes one 32-bit output entry built from the tiled X and Y indices.
+static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s,
+                                uint32_t* SK_RESTRICT xy,
+                                int count, int x, int y) {
+    SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask);
+
+    PREAMBLE(s);
+    // max{X,Y} are int here, but later shown/assumed to fit in 16 bits
+    int maxX = s.fBitmap->width() - 1;
+    int maxY = s.fBitmap->height() - 1;
+
+    SkPerspIter iter(s.fInvMatrix,
+                     SkIntToScalar(x) + SK_ScalarHalf,
+                     SkIntToScalar(y) + SK_ScalarHalf, count);
+
+    // iter.next() yields the size of the next batch, 0 when exhausted
+    while ((count = iter.next()) != 0) {
+        const SkFixed* SK_RESTRICT srcXY = iter.getXY();
+
+        if (count >= 8) {
+            int32_t* in32 = (int32_t*)srcXY;
+            int16_t* out16 = (int16_t*)xy;
+
+            // count >= 8 is guaranteed on entry, so this runs at least once
+            for (; count >= 8; count -= 8) {
+                // vld2 de-interleaves: val[0] = four x's, val[1] = four y's
+                const int32x4x2_t first = vld2q_s32(in32);
+                const int32x4x2_t second = vld2q_s32(in32 + 8);
+
+                int16x8x2_t pair;
+                pair.val[0] = TILEX_PROCF_NEON8(first.val[0], second.val[0], maxX);
+                pair.val[1] = TILEY_PROCF_NEON8(first.val[1], second.val[1], maxY);
+
+                // vst2 re-interleaves as x, y, x, y, ...
+                vst2q_s16(out16, pair);
+
+                in32 += 16;  // 8 pairs of 32-bit values consumed
+                out16 += 16; // 16 shorts, aka 8 longs
+            }
+            // hand the advanced positions back to the scalar tail
+            srcXY = (const SkFixed*)in32;
+            xy = (uint32_t*)out16;
+        }
+
+        // scalar tail: fewer than eight pairs left in this batch
+        for (; count > 0; --count) {
+            *xy++ = (TILEY_PROCF(srcXY[1], maxY) << 16) |
+                     TILEX_PROCF(srcXY[0], maxX);
+            srcXY += 2;
+        }
+    }
+}
+
+// Packs one filtered Y coordinate into a 32-bit entry:
+//   ((tile(f) << 4 | low_bits(f)) << 14) | tile(f + one)
+// where tile/low_bits are the TILEY_PROCF / TILEY_LOW_BITS macros.
+static inline uint32_t PACK_FILTER_Y_NAME(SkFixed f, unsigned max,
+                                          SkFixed one PREAMBLE_PARAM_Y) {
+    const unsigned first = TILEY_PROCF(f, max);
+    const unsigned withLow = (first << 4) | TILEY_LOW_BITS(f, max);
+    return (withLow << 14) | (TILEY_PROCF((f + one), max));
+}
+
+// Packs one filtered X coordinate into a 32-bit entry:
+//   ((tile(f) << 4 | low_bits(f)) << 14) | tile(f + one)
+// where tile/low_bits are the TILEX_PROCF / TILEX_LOW_BITS macros.
+static inline uint32_t PACK_FILTER_X_NAME(SkFixed f, unsigned max,
+                                          SkFixed one PREAMBLE_PARAM_X) {
+    const unsigned first = TILEX_PROCF(f, max);
+    const unsigned withLow = (first << 4) | TILEX_LOW_BITS(f, max);
+    return (withLow << 14) | (TILEX_PROCF((f + one), max));
+}
+
+// Four-lane version of PACK_FILTER_X_NAME: applies the same packing to each
+// 32-bit lane of f using the *_NEON4 variants of the X tiling macros.
+static inline int32x4_t PACK_FILTER_X4_NAME(int32x4_t f, unsigned max,
+                                          SkFixed one PREAMBLE_PARAM_X) {
+    // broadcast the neighbour offset to all lanes
+    const int32x4_t wide_one = vdupq_n_s32(one);
+
+    // tiled index of f
+    const int32x4_t index0 = TILEX_PROCF_NEON4(f, max);
+
+    // (index0 << 4) | low bits: vsli shifts index0 left by 4 and inserts it
+    // above the low 4 bits of its first operand
+    int32x4_t packed = TILEX_LOW_BITS_NEON4(f, max);
+    packed = vsliq_n_s32(packed, index0, 4);
+
+    // tiled index of the neighbouring sample, f + one
+    const int32x4_t index1 = TILEX_PROCF_NEON4(vaddq_s32(f, wide_one), max);
+
+    return vorrq_s32(vshlq_n_s32(packed, 14), index1);
+}
+
+// Four-lane version of PACK_FILTER_Y_NAME: applies the same packing to each
+// 32-bit lane of f using the *_NEON4 variants of the Y tiling macros.
+//
+// The trailing macro parameter is PREAMBLE_PARAM_Y (not _X): the body only
+// uses the Y tiling macros and callers pass PREAMBLE_ARG_Y, so a non-empty
+// PREAMBLE must supply the Y-side state here.
+static inline int32x4_t PACK_FILTER_Y4_NAME(int32x4_t f, unsigned max,
+                                          SkFixed one PREAMBLE_PARAM_Y) {
+    int32x4_t ret, res, wide_one;
+
+    // Prepare constants
+    wide_one = vdupq_n_s32(one);
+
+    // Step 1: tiled index of f
+    res = TILEY_PROCF_NEON4(f, max);
+
+    // Step 2: (index << 4) | low bits; vsli shifts res left by 4 and inserts
+    // it above the low 4 bits of ret
+    ret = TILEY_LOW_BITS_NEON4(f, max);
+    ret = vsliq_n_s32(ret, res, 4);
+
+    // Step 3: shift up by 14 and OR in the tiled index of f + one
+    res = TILEY_PROCF_NEON4(vaddq_s32(f, wide_one), max);
+    ret = vorrq_s32(vshlq_n_s32(ret, 14), res);
+
+    return ret;
+}
+
+// Matrix proc for scale/translate-only inverse matrices with filtering.
+// xy[0] receives one packed Y entry (PACK_FILTER_Y_NAME); it is followed by
+// `count` packed X entries (PACK_FILTER_X_NAME or its 4-lane NEON variant).
+static void SCALE_FILTER_NAME(const SkBitmapProcState& s,
+                              uint32_t xy[], int count, int x, int y) {
+    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
+                             SkMatrix::kScale_Mask)) == 0);
+    SkASSERT(s.fInvKy == 0);
+
+    PREAMBLE(s);
+
+    const unsigned maxX = s.fBitmap->width() - 1;
+    const SkFixed one = s.fFilterOneX;
+    const SkFractionalInt dx = s.fInvSxFractionalInt;
+    SkFractionalInt fx;
+
+    {
+        // map (x + 0.5, y + 0.5) through the inverse matrix
+        SkPoint pt;
+        s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
+                                 SkIntToScalar(y) + SK_ScalarHalf, &pt);
+        const unsigned maxY = s.fBitmap->height() - 1;
+        // back off by half of the Y filter step before packing
+        const SkFixed fy = SkScalarToFixed(pt.fY) - (s.fFilterOneY >> 1);
+        // compute our two Y values up front
+        *xy++ = PACK_FILTER_Y_NAME(fy, maxY, s.fFilterOneY PREAMBLE_ARG_Y);
+        // starting X, likewise backed off by half of the X filter step
+        fx = SkScalarToFractionalInt(pt.fX) - (SkFixedToFractionalInt(one) >> 1);
+    }
+
+#ifdef CHECK_FOR_DECAL
+    // test if we don't need to apply the tile proc
+    if (can_truncate_to_fixed_for_decal(fx, dx, count, maxX)) {
+        decal_filter_scale_neon(xy, SkFractionalIntToFixed(fx),
+                             SkFractionalIntToFixed(dx), count);
+        return;
+    }
+#endif
+
+    if (count >= 4) {
+        const SkFractionalInt dx4 = dx+dx+dx+dx;
+        // per-iteration advance of four samples, identical for every lane
+        const int32x4_t step4 = vdupq_n_s32(SkFractionalIntToFixed(dx4));
+
+        // wide_fx = { fx, fx+dx, fx+2dx, fx+3dx }
+        int32x4_t wide_fx = vdupq_n_s32(SkFractionalIntToFixed(fx));
+        wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx), wide_fx, 1);
+        wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx), wide_fx, 2);
+        wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx+dx), wide_fx, 3);
+
+        for (; count >= 4; count -= 4) {
+            const int32x4_t packed =
+                    PACK_FILTER_X4_NAME(wide_fx, maxX, one PREAMBLE_ARG_X);
+
+            vst1q_u32(xy, vreinterpretq_u32_s32(packed));
+            xy += 4;
+
+            wide_fx = vaddq_s32(wide_fx, step4);
+            // keep the scalar position in step for the tail loop below
+            fx += dx4;
+        }
+    }
+
+    // scalar tail: fewer than four samples left
+    for (; count > 0; --count) {
+        *xy++ = PACK_FILTER_X_NAME(SkFractionalIntToFixed(fx), maxX, one PREAMBLE_ARG_X);
+        fx += dx;
+    }
+}
+
+// Matrix proc for affine inverse matrices with filtering.
+// For each of `count` samples it writes two 32-bit entries, the packed Y
+// entry first and the packed X entry second.
+static void AFFINE_FILTER_NAME(const SkBitmapProcState& s,
+                               uint32_t xy[], int count, int x, int y) {
+    SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
+    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
+                             SkMatrix::kScale_Mask |
+                             SkMatrix::kAffine_Mask)) == 0);
+
+    PREAMBLE(s);
+
+    // map (x + 0.5, y + 0.5) through the inverse matrix
+    SkPoint srcPt;
+    s.fInvProc(s.fInvMatrix,
+               SkIntToScalar(x) + SK_ScalarHalf,
+               SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
+
+    SkFixed oneX = s.fFilterOneX;
+    SkFixed oneY = s.fFilterOneY;
+    // start half a filter step before the mapped point on each axis
+    SkFixed fx = SkScalarToFixed(srcPt.fX) - (oneX >> 1);
+    SkFixed fy = SkScalarToFixed(srcPt.fY) - (oneY >> 1);
+    SkFixed dx = s.fInvSx;
+    SkFixed dy = s.fInvKy;
+    unsigned maxX = s.fBitmap->width() - 1;
+    unsigned maxY = s.fBitmap->height() - 1;
+
+    if (count >= 4) {
+        const SkFixed dx4 = dx+dx+dx+dx;
+        const SkFixed dy4 = dy+dy+dy+dy;
+        // per-iteration advance of four samples, identical for every lane
+        const int32x4_t xstep4 = vdupq_n_s32(dx4);
+        const int32x4_t ystep4 = vdupq_n_s32(dy4);
+
+        // wide_fx = { fx, fx+dx, fx+2dx, fx+3dx }
+        int32x4_t wide_fx = vdupq_n_s32(fx);
+        wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1);
+        wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2);
+        wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3);
+
+        // wide_fy = { fy, fy+dy, fy+2dy, fy+3dy }
+        int32x4_t wide_fy = vdupq_n_s32(fy);
+        wide_fy = vsetq_lane_s32(fy+dy, wide_fy, 1);
+        wide_fy = vsetq_lane_s32(fy+dy+dy, wide_fy, 2);
+        wide_fy = vsetq_lane_s32(fy+dy+dy+dy, wide_fy, 3);
+
+        for (; count >= 4; count -= 4) {
+            int32x4x2_t vxy;
+
+            // pack the Y side into val[0] and the X side into val[1]
+            vxy.val[0] = PACK_FILTER_Y4_NAME(wide_fy, maxY, oneY PREAMBLE_ARG_Y);
+            vxy.val[1] = PACK_FILTER_X4_NAME(wide_fx, maxX, oneX PREAMBLE_ARG_X);
+
+            // interleave as YXYXYXYX as part of the storing
+            vst2q_s32((int32_t*)xy, vxy);
+            xy += 8; // 4 x's, 4 y's
+
+            // prepare next iteration; the scalars track the vectors so the
+            // tail loop below resumes at the right position
+            wide_fx = vaddq_s32(wide_fx, xstep4);
+            fx += dx4;
+            wide_fy = vaddq_s32(wide_fy, ystep4);
+            fy += dy4;
+        }
+    }
+
+    // scalar tail: fewer than four samples left
+    for (; count > 0; --count) {
+        // NB: writing Y/X
+        *xy++ = PACK_FILTER_Y_NAME(fy, maxY, oneY PREAMBLE_ARG_Y);
+        fy += dy;
+        *xy++ = PACK_FILTER_X_NAME(fx, maxX, oneX PREAMBLE_ARG_X);
+        fx += dx;
+    }
+}
+
+static void PERSP_FILTER_NAME(const SkBitmapProcState& s,
+                              uint32_t* SK_RESTRICT xy, int count,
+                              int x, int y) {
+    SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask);
+    // for every x/y pair read from the iterator, writes a packed Y entry then a packed X entry to xy
+    PREAMBLE(s);
+    unsigned maxX = s.fBitmap->width() - 1;
+    unsigned maxY = s.fBitmap->height() - 1;
+    SkFixed oneX = s.fFilterOneX;
+    SkFixed oneY = s.fFilterOneY;
+
+    SkPerspIter iter(s.fInvMatrix,
+                     SkIntToScalar(x) + SK_ScalarHalf,
+                     SkIntToScalar(y) + SK_ScalarHalf, count);
+    // count is reassigned per batch from iter.next(); a result of 0 ends the outer loop
+    while ((count = iter.next()) != 0) {
+        const SkFixed* SK_RESTRICT srcXY = iter.getXY();
+
+        while (count >= 4) {
+            int32x4_t wide_x, wide_y;
+            int32x4x2_t vxy, vresyx;
+
+            // load src:  x-y-x-y-x-y-x-y  (vld2q de-interleaves: val[0] = x's, val[1] = y's)
+            vxy = vld2q_s32(srcXY);
+
+            // subtract half of oneX / oneY from each coordinate before packing
+            wide_x = vsubq_s32(vxy.val[0], vdupq_n_s32(oneX>>1));
+            wide_y = vsubq_s32(vxy.val[1], vdupq_n_s32(oneY>>1));
+
+            vresyx.val[0] = PACK_FILTER_Y4_NAME(wide_y, maxY, oneY PREAMBLE_ARG_Y);
+            vresyx.val[1] = PACK_FILTER_X4_NAME(wide_x, maxX, oneX PREAMBLE_ARG_X);
+
+            // store interleaved as y-x-y-x-y-x-y-x (NB != read order)
+            vst2q_s32((int32_t*)xy, vresyx);
+
+            // on to the next iteration: four pairs consumed, eight words produced
+            srcXY += 2*4;
+            count -= 4;
+            xy += 2*4;
+        }
+
+        while (--count >= 0) {
+            // scalar tail for the rest of this batch; NB: we read x/y, we write y/x
+            *xy++ = PACK_FILTER_Y_NAME(srcXY[1] - (oneY >> 1), maxY,
+                                       oneY PREAMBLE_ARG_Y);
+            *xy++ = PACK_FILTER_X_NAME(srcXY[0] - (oneX >> 1), maxX,
+                                       oneX PREAMBLE_ARG_X);
+            srcXY += 2;
+        }
+    }
+}
+
+const SkBitmapProcState::MatrixProc MAKENAME(_Procs)[] = {  // scale, affine, persp; each as nofilter then filter
+    SCALE_NOFILTER_NAME,
+    SCALE_FILTER_NAME,
+    AFFINE_NOFILTER_NAME,
+    AFFINE_FILTER_NAME,
+    PERSP_NOFILTER_NAME,
+    PERSP_FILTER_NAME  // NOTE(review): callers presumably index this table by position, so keep the order - confirm
+};
+
+#undef TILEX_PROCF_NEON8
+#undef TILEY_PROCF_NEON8
+#undef TILEX_PROCF_NEON4
+#undef TILEY_PROCF_NEON4
+#undef TILEX_LOW_BITS_NEON4
+#undef TILEY_LOW_BITS_NEON4
+
+#undef MAKENAME
+#undef TILEX_PROCF
+#undef TILEY_PROCF
+#ifdef CHECK_FOR_DECAL
+    #undef CHECK_FOR_DECAL
+#endif
+
+#undef SCALE_NOFILTER_NAME
+#undef SCALE_FILTER_NAME
+#undef AFFINE_NOFILTER_NAME
+#undef AFFINE_FILTER_NAME
+#undef PERSP_NOFILTER_NAME
+#undef PERSP_FILTER_NAME
+
+#undef PREAMBLE
+#undef PREAMBLE_PARAM_X
+#undef PREAMBLE_PARAM_Y
+#undef PREAMBLE_ARG_X
+#undef PREAMBLE_ARG_Y
+
+#undef TILEX_LOW_BITS
+#undef TILEY_LOW_BITS
diff --git a/chromium/third_party/skia/src/opts/SkBitmapProcState_matrix_repeat_neon.h b/chromium/third_party/skia/src/opts/SkBitmapProcState_matrix_repeat_neon.h
deleted file mode 100644
index 55e2997a5ef..00000000000
--- a/chromium/third_party/skia/src/opts/SkBitmapProcState_matrix_repeat_neon.h
+++ /dev/null
@@ -1,542 +0,0 @@
-/* NEON optimized code (C) COPYRIGHT 2009 Motorola
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-/*
- * Modifications done in-house at Motorola
- *
- * this is a clone of SkBitmapProcState_matrix.h
- * and has been tuned to work with the NEON unit.
- *
- * Still going back and forth between whether this approach
- * (clone the entire SkBitmapProcState_matrix.h file or
- * if I should put just the modified routines in here and
- * then use a construct like #define DONT_DO_THIS_FUNCTION or
- * something like that...
- *
- * This is for the RepeatX_RepeatY part of the world
- */
-
-
-#include <arm_neon.h>
-
-/*
- * This has been modified on the knowledge that (at the time)
- * we had the following macro definitions in the parent file
- *
- * #define MAKENAME(suffix)        RepeatX_RepeatY ## suffix
- * #define TILEX_PROCF(fx, max)    (((fx) & 0xFFFF) * ((max) + 1) >> 16)
- * #define TILEY_PROCF(fy, max)    (((fy) & 0xFFFF) * ((max) + 1) >> 16)
- * #define TILEX_LOW_BITS(fx, max) ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
- * #define TILEY_LOW_BITS(fy, max) ((((fy) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
- */
-
-/* SkClampMax(val,max) -- bound to 0..max */
-
-#define SCALE_NOFILTER_NAME     MAKENAME(_nofilter_scale)
-#define SCALE_FILTER_NAME       MAKENAME(_filter_scale)
-#define AFFINE_NOFILTER_NAME    MAKENAME(_nofilter_affine)
-#define AFFINE_FILTER_NAME      MAKENAME(_filter_affine)
-#define PERSP_NOFILTER_NAME     MAKENAME(_nofilter_persp)
-#define PERSP_FILTER_NAME       MAKENAME(_filter_persp)
-
-#define PACK_FILTER_X_NAME  MAKENAME(_pack_filter_x)
-#define PACK_FILTER_Y_NAME  MAKENAME(_pack_filter_y)
-
-#ifndef PREAMBLE
-    #define PREAMBLE(state)
-    #define PREAMBLE_PARAM_X
-    #define PREAMBLE_PARAM_Y
-    #define PREAMBLE_ARG_X
-    #define PREAMBLE_ARG_Y
-#endif
-
-static void SCALE_NOFILTER_NAME(const SkBitmapProcState& s,
-                                uint32_t xy[], int count, int x, int y) {
-    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
-                             SkMatrix::kScale_Mask)) == 0);
-
-    PREAMBLE(s);
-    // we store y, x, x, x, x, x
-
-    const unsigned maxX = s.fBitmap->width() - 1;
-    SkFixed fx;
-    {
-        SkPoint pt;
-        s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
-                                 SkIntToScalar(y) + SK_ScalarHalf, &pt);
-        fx = SkScalarToFixed(pt.fY);
-        const unsigned maxY = s.fBitmap->height() - 1;
-        *xy++ = TILEY_PROCF(fx, maxY);
-        fx = SkScalarToFixed(pt.fX);
-    }
-
-    if (0 == maxX) {
-        // all of the following X values must be 0
-        memset(xy, 0, count * sizeof(uint16_t));
-        return;
-    }
-
-    const SkFixed dx = s.fInvSx;
-
-#ifdef CHECK_FOR_DECAL
-    // test if we don't need to apply the tile proc
-    if ((unsigned)(fx >> 16) <= maxX &&
-        (unsigned)((fx + dx * (count - 1)) >> 16) <= maxX) {
-        decal_nofilter_scale_neon(xy, fx, dx, count);
-    } else
-#endif
-    {
-        int i;
-
-    /* RBE: very much like done in decal_nofilter ,
-     * but some processing of the 'fx' information
-         * TILEX_PROCF(fx, max)    (((fx) & 0xFFFF) * ((max) + 1) >> 16)
-     */
-    if (count >= 8) {
-        /* SkFixed is 16.16 fixed point */
-        SkFixed dx2 = dx+dx;
-        SkFixed dx4 = dx2+dx2;
-        SkFixed dx8 = dx4+dx4;
-
-        /* now build fx/fx+dx/fx+2dx/fx+3dx */
-        SkFixed fx1, fx2, fx3;
-        int32x4_t lbase, hbase;
-        int16_t *dst16 = (int16_t *)xy;
-
-        fx1 = fx+dx;
-        fx2 = fx1+dx;
-        fx3 = fx2+dx;
-
-        lbase = vdupq_n_s32(fx);
-        lbase = vsetq_lane_s32(fx1, lbase, 1);
-        lbase = vsetq_lane_s32(fx2, lbase, 2);
-        lbase = vsetq_lane_s32(fx3, lbase, 3);
-        hbase = vaddq_s32(lbase, vdupq_n_s32(dx4));
-
-        /* store & bump */
-        do
-        {
-            int32x4_t lout;
-        int32x4_t hout;
-        int16x8_t hi16;
-
-             /* TILEX_PROCF(fx, max) (((fx)&0xFFFF)*((max)+1)>> 16) */
-        /* mask to low 16 [would like to use uzp tricks) */
-            lout = vandq_s32(lbase, vdupq_n_s32(0xffff));
-            hout = vandq_s32(hbase, vdupq_n_s32(0xffff));
-        /* bare multiplication, not SkFixedMul */
-        lout = vmulq_s32(lout, vdupq_n_s32(maxX+1));
-        hout = vmulq_s32(hout, vdupq_n_s32(maxX+1));
-
-        /* extraction, using uzp */
-        /* this is ok -- we want all hi(lout)s then all hi(hout)s */
-        asm ("vuzpq.16 %q0, %q1" : "+w" (lout), "+w" (hout));
-        hi16 = vreinterpretq_s16_s32(hout);
-        vst1q_s16(dst16, hi16);
-
-        /* bump our base on to the next */
-        lbase = vaddq_s32 (lbase, vdupq_n_s32(dx8));
-        hbase = vaddq_s32 (hbase, vdupq_n_s32(dx8));
-        dst16 += 8;
-        count -= 8;
-        fx += dx8;
-        } while (count >= 8);
-        xy = (uint32_t *) dst16;
-    }
-        uint16_t* xx = (uint16_t*)xy;
-        for (i = count; i > 0; --i) {
-            *xx++ = TILEX_PROCF(fx, maxX); fx += dx;
-        }
-    }
-}
-
-// note: we could special-case on a matrix which is skewed in X but not Y.
-// this would require a more general setup thatn SCALE does, but could use
-// SCALE's inner loop that only looks at dx
-
-
-static void AFFINE_NOFILTER_NAME(const SkBitmapProcState& s,
-                                 uint32_t xy[], int count, int x, int y) {
-    SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
-    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
-                             SkMatrix::kScale_Mask |
-                             SkMatrix::kAffine_Mask)) == 0);
-
-    PREAMBLE(s);
-    SkPoint srcPt;
-    s.fInvProc(s.fInvMatrix,
-               SkIntToScalar(x) + SK_ScalarHalf,
-               SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
-
-    SkFixed fx = SkScalarToFixed(srcPt.fX);
-    SkFixed fy = SkScalarToFixed(srcPt.fY);
-    SkFixed dx = s.fInvSx;
-    SkFixed dy = s.fInvKy;
-    int maxX = s.fBitmap->width() - 1;
-    int maxY = s.fBitmap->height() - 1;
-
-#if 0
-    int ocount = count;
-    uint32_t *oxy = xy;
-    SkFixed bfx = fx, bfy=fy, bdx=dx, bdy=dy;
-#endif
-
-
-    if (0) { extern void rbe(void); rbe(); }
-
-    /* RBE: benchmarks show this eats up time; can we neonize it? */
-    /* RBE: very much like done in decal_nofilter ,
-     * but some processing of the 'fx' information
-         * TILEX_PROCF(fx, max)    (((fx) & 0xFFFF) * ((max) + 1) >> 16)
-     */
-    if (count >= 4) {
-        /* SkFixed is 16.16 fixed point */
-        SkFixed dx4 = dx*4;
-        SkFixed dy4 = dy*4;
-
-        /* now build fx/fx+dx/fx+2dx/fx+3dx */
-        int32x4_t xbase, ybase;
-        int16_t *dst16 = (int16_t *)xy;
-
-        /* synthesize 4x for both X and Y */
-        xbase = vdupq_n_s32(fx);
-        xbase = vsetq_lane_s32(fx+dx, xbase, 1);
-        xbase = vsetq_lane_s32(fx+dx+dx, xbase, 2);
-        xbase = vsetq_lane_s32(fx+dx+dx+dx, xbase, 3);
-
-        ybase = vdupq_n_s32(fy);
-        ybase = vsetq_lane_s32(fy+dy, ybase, 1);
-        ybase = vsetq_lane_s32(fy+dy+dy, ybase, 2);
-        ybase = vsetq_lane_s32(fy+dy+dy+dy, ybase, 3);
-
-        /* store & bump */
-        do {
-            int32x4_t xout;
-            int32x4_t yout;
-            int16x8_t hi16;
-
-             /* TILEX_PROCF(fx, max) (((fx)&0xFFFF)*((max)+1)>> 16) */
-        /* mask to low 16 [would like to use uzp tricks) */
-            xout = vandq_s32(xbase, vdupq_n_s32(0xffff));
-            yout = vandq_s32(ybase, vdupq_n_s32(0xffff));
-        /* bare multiplication, not SkFixedMul */
-        xout = vmulq_s32(xout, vdupq_n_s32(maxX+1));
-        yout = vmulq_s32(yout, vdupq_n_s32(maxY+1));
-
-        /* put hi16 from xout over low16 from yout */
-        yout = vsriq_n_s32(yout, xout, 16);
-
-        /* and then yout has the interleaved upper 16's */
-        hi16 = vreinterpretq_s16_s32(yout);
-        vst1q_s16(dst16, hi16);
-
-        /* bump preserved base & on to the next */
-        xbase = vaddq_s32 (xbase, vdupq_n_s32(dx4));
-        ybase = vaddq_s32 (ybase, vdupq_n_s32(dy4));
-        dst16 += 8;    /* 8 x16 aka 4x32 */
-        count -= 4;
-        fx += dx4;
-        fy += dy4;
-        } while (count >= 4);
-        xy = (uint32_t *) dst16;
-    }
-
-#if 0
-    /* diagnostics... see whether we agree with the NEON code */
-    int bad = 0;
-    uint32_t *myxy = oxy;
-    int myi = (-1);
-    SkFixed ofx = bfx, ofy= bfy, odx= bdx, ody= bdy;
-    for (myi = ocount; myi > 0; --myi) {
-    uint32_t val = (TILEY_PROCF(ofy, maxY) << 16) | TILEX_PROCF(ofx, maxX);
-    if (val != *myxy++) {
-        bad++;
-        break;
-    }
-        ofx += odx; ofy += ody;
-    }
-    if (bad) {
-        SkDebugf("repeat-nofilter-affine fails\n");
-        SkDebugf("count %d myi %d\n", ocount, myi);
-        SkDebugf(" bfx %08x, bdx %08x, bfy %08x bdy %08x\n",
-                bfx, bdx, bfy, bdy);
-        SkDebugf("maxX %08x maxY %08x\n", maxX, maxY);
-    }
-#endif
-
-    for (int i = count; i > 0; --i) {
-    /* fx, fy, dx, dy are all 32 bit 16.16 fixed point */
-    /* (((fx) & 0xFFFF) * ((max) + 1) >> 16) */
-        *xy++ = (TILEY_PROCF(fy, maxY) << 16) | TILEX_PROCF(fx, maxX);
-        fx += dx; fy += dy;
-    }
-}
-
-static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s,
-                                uint32_t* SK_RESTRICT xy,
-                                int count, int x, int y) {
-    SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask);
-
-    PREAMBLE(s);
-    int maxX = s.fBitmap->width() - 1;
-    int maxY = s.fBitmap->height() - 1;
-
-    SkPerspIter   iter(s.fInvMatrix,
-                       SkIntToScalar(x) + SK_ScalarHalf,
-                       SkIntToScalar(y) + SK_ScalarHalf, count);
-
-    while ((count = iter.next()) != 0) {
-        const SkFixed* SK_RESTRICT srcXY = iter.getXY();
-
-    /* RBE: */
-    /* TILEX_PROCF(fx, max) (((fx) & 0xFFFF) * ((max) + 1) >> 16) */
-    /* it's a little more complicated than what I did for the
-     * clamp case -- where I could immediately snip to the top
-     * 16 bits and do my min/max games there.
-     * ... might only be able to get 4x unrolling here
-     */
-
-    /* vld2 to get a set of 32x4's ... */
-    /* do the tile[xy]_procf operations */
-    /* which includes doing vuzp to get hi16's */
-    /* store it */
-    /* -- inner loop (other than vld2) can be had from above */
-
-    /* srcXY is a batch of 32 bit numbers X0,Y0,X1,Y1...
-     * but we immediately discard the low 16 bits...
-     * so what we're going to do is vld4, which will give us
-     * xlo,xhi,ylo,yhi distribution and we can ignore the 'lo'
-     * parts....
-     */
-    if (0) { extern void rbe(void); rbe(); }
-    if (count >= 8) {
-        int32_t *mysrc = (int32_t *) srcXY;
-        int16_t *mydst = (int16_t *) xy;
-        do {
-        int32x4_t x, y, x2, y2;
-        int16x8_t hi, hi2;
-
-        /* read array of x,y,x,y,x,y */
-            /* vld2 does the de-interleaving for us */
-        /* isolate reg-bound scopes; gcc will minimize register
-         * motion if possible; this ensures that we don't lose
-         * a register across a debugging call because it happens
-         * to be bound into a call-clobbered register
-         */
-        {
-            register int32x4_t q0 asm("q0");
-            register int32x4_t q1 asm("q1");
-            asm ("vld2.32    {q0-q1},[%2]  /* x=%q0 y=%q1 */"
-                : "=w" (q0), "=w" (q1)
-                : "r" (mysrc)
-                );
-            x = q0; y = q1;
-        }
-
-        /* offset == 256 bits == 32 bytes == 8 longs */
-        {
-            register int32x4_t q2 asm("q2");
-            register int32x4_t q3 asm("q3");
-            asm ("vld2.32    {q2-q3},[%2]  /* x=%q0 y=%q1 */"
-                : "=w" (q2), "=w" (q3)
-                : "r" (mysrc+8)
-                );
-            x2 = q2; y2 = q3;
-        }
-
-             /* TILEX_PROCF(fx, max) (((fx)&0xFFFF)*((max)+1)>> 16) */
-        /* mask to low 16 [would like to use uzp tricks) */
-        /* bare multiplication, not SkFixedMul */
-            x = vandq_s32(x, vdupq_n_s32(0xffff));
-        x = vmulq_s32(x, vdupq_n_s32(maxX+1));
-            y = vandq_s32(y, vdupq_n_s32(0xffff));
-        y = vmulq_s32(y, vdupq_n_s32(maxY+1));
-
-            x2 = vandq_s32(x2, vdupq_n_s32(0xffff));
-        x2 = vmulq_s32(x2, vdupq_n_s32(maxX+1));
-            y2 = vandq_s32(y2, vdupq_n_s32(0xffff));
-        y2 = vmulq_s32(y2, vdupq_n_s32(maxY+1));
-
-        /* now collect interleaved high 16's */
-        /* (hi-x, hi-y)4  (hi-x2; hi-y2)4 */
-
-        /* extraction, using uzp, leaves hi16's in y */
-        y = vsriq_n_s32(y, x, 16);
-        hi = vreinterpretq_s16_s32(y);
-        vst1q_s16(mydst, hi);
-
-        /* and likewise for the second 8 entries */
-        y2 = vsriq_n_s32(y2, x2, 16);
-        hi2 = vreinterpretq_s16_s32(y2);
-        vst1q_s16(mydst+8, hi2);
-
-        /* XXX: gcc isn't interleaving these with the NEON ops
-         * but i think that all the scoreboarding works out */
-        count -= 8;    /* 8 iterations */
-        mysrc += 16;    /* 16 longs */
-        mydst += 16;    /* 16 shorts, aka 8 longs */
-        } while (count >= 8);
-        /* get xy and srcXY fixed up */
-        srcXY = (const SkFixed *) mysrc;
-        xy = (uint32_t *) mydst;
-    }
-        while (--count >= 0) {
-            *xy++ = (TILEY_PROCF(srcXY[1], maxY) << 16) |
-                     TILEX_PROCF(srcXY[0], maxX);
-            srcXY += 2;
-        }
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////////
-
-static inline uint32_t PACK_FILTER_Y_NAME(SkFixed f, unsigned max,
-                                          SkFixed one PREAMBLE_PARAM_Y) {
-    unsigned i = TILEY_PROCF(f, max);
-    i = (i << 4) | TILEY_LOW_BITS(f, max);
-    return (i << 14) | (TILEY_PROCF((f + one), max));
-}
-
-static inline uint32_t PACK_FILTER_X_NAME(SkFixed f, unsigned max,
-                                          SkFixed one PREAMBLE_PARAM_X) {
-    unsigned i = TILEX_PROCF(f, max);
-    i = (i << 4) | TILEX_LOW_BITS(f, max);
-    return (i << 14) | (TILEX_PROCF((f + one), max));
-}
-
-static void SCALE_FILTER_NAME(const SkBitmapProcState& s,
-                              uint32_t xy[], int count, int x, int y) {
-    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
-                             SkMatrix::kScale_Mask)) == 0);
-    SkASSERT(s.fInvKy == 0);
-
-    PREAMBLE(s);
-
-    const unsigned maxX = s.fBitmap->width() - 1;
-    const SkFixed one = s.fFilterOneX;
-    const SkFractionalInt dx = s.fInvSxFractionalInt;
-    SkFractionalInt fx;
-
-    {
-        SkPoint pt;
-        s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
-                                 SkIntToScalar(y) + SK_ScalarHalf, &pt);
-        const SkFixed fy = SkScalarToFixed(pt.fY) - (s.fFilterOneY >> 1);
-        const unsigned maxY = s.fBitmap->height() - 1;
-        // compute our two Y values up front
-        *xy++ = PACK_FILTER_Y_NAME(fy, maxY, s.fFilterOneY PREAMBLE_ARG_Y);
-        // now initialize fx
-        fx = SkScalarToFractionalInt(pt.fX) - (SkFixedToFractionalInt(one) >> 1);
-    }
-
-#ifdef CHECK_FOR_DECAL
-    // test if we don't need to apply the tile proc
-    if (can_truncate_to_fixed_for_decal(fx, dx, count, maxX)) {
-        decal_filter_scale_neon(xy, SkFractionalIntToFixed(fx),
-                                SkFractionalIntToFixed(dx), count);
-    } else
-#endif
-    {
-        do {
-            SkFixed fixedFx = SkFractionalIntToFixed(fx);
-            *xy++ = PACK_FILTER_X_NAME(fixedFx, maxX, one PREAMBLE_ARG_X);
-            fx += dx;
-        } while (--count != 0);
-    }
-}
-
-static void AFFINE_FILTER_NAME(const SkBitmapProcState& s,
-                               uint32_t xy[], int count, int x, int y) {
-    SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
-    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
-                             SkMatrix::kScale_Mask |
-                             SkMatrix::kAffine_Mask)) == 0);
-
-    PREAMBLE(s);
-    SkPoint srcPt;
-    s.fInvProc(s.fInvMatrix,
-               SkIntToScalar(x) + SK_ScalarHalf,
-               SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
-
-    SkFixed oneX = s.fFilterOneX;
-    SkFixed oneY = s.fFilterOneY;
-    SkFixed fx = SkScalarToFixed(srcPt.fX) - (oneX >> 1);
-    SkFixed fy = SkScalarToFixed(srcPt.fY) - (oneY >> 1);
-    SkFixed dx = s.fInvSx;
-    SkFixed dy = s.fInvKy;
-    unsigned maxX = s.fBitmap->width() - 1;
-    unsigned maxY = s.fBitmap->height() - 1;
-
-    do {
-        *xy++ = PACK_FILTER_Y_NAME(fy, maxY, oneY PREAMBLE_ARG_Y);
-        fy += dy;
-        *xy++ = PACK_FILTER_X_NAME(fx, maxX, oneX PREAMBLE_ARG_X);
-        fx += dx;
-    } while (--count != 0);
-}
-
-static void PERSP_FILTER_NAME(const SkBitmapProcState& s,
-                              uint32_t* SK_RESTRICT xy, int count,
-                              int x, int y) {
-    SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask);
-
-    extern void rbe(void);
-
-    PREAMBLE(s);
-    unsigned maxX = s.fBitmap->width() - 1;
-    unsigned maxY = s.fBitmap->height() - 1;
-    SkFixed oneX = s.fFilterOneX;
-    SkFixed oneY = s.fFilterOneY;
-
-
-
-    SkPerspIter   iter(s.fInvMatrix,
-                       SkIntToScalar(x) + SK_ScalarHalf,
-                       SkIntToScalar(y) + SK_ScalarHalf, count);
-
-    while ((count = iter.next()) != 0) {
-        const SkFixed* SK_RESTRICT srcXY = iter.getXY();
-        do {
-            *xy++ = PACK_FILTER_Y_NAME(srcXY[1] - (oneY >> 1), maxY,
-                                       oneY PREAMBLE_ARG_Y);
-            *xy++ = PACK_FILTER_X_NAME(srcXY[0] - (oneX >> 1), maxX,
-                                       oneX PREAMBLE_ARG_X);
-            srcXY += 2;
-        } while (--count != 0);
-    }
-}
-
-const SkBitmapProcState::MatrixProc MAKENAME(_Procs)[] = {
-    SCALE_NOFILTER_NAME,
-    SCALE_FILTER_NAME,
-    AFFINE_NOFILTER_NAME,
-    AFFINE_FILTER_NAME,
-    PERSP_NOFILTER_NAME,
-    PERSP_FILTER_NAME
-};
-
-#undef MAKENAME
-#undef TILEX_PROCF
-#undef TILEY_PROCF
-#ifdef CHECK_FOR_DECAL
-    #undef CHECK_FOR_DECAL
-#endif
-
-#undef SCALE_NOFILTER_NAME
-#undef SCALE_FILTER_NAME
-#undef AFFINE_NOFILTER_NAME
-#undef AFFINE_FILTER_NAME
-#undef PERSP_NOFILTER_NAME
-#undef PERSP_FILTER_NAME
-
-#undef PREAMBLE
-#undef PREAMBLE_PARAM_X
-#undef PREAMBLE_PARAM_Y
-#undef PREAMBLE_ARG_X
-#undef PREAMBLE_ARG_Y
-
-#undef TILEX_LOW_BITS
-#undef TILEY_LOW_BITS
diff --git a/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSE2.cpp b/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSE2.cpp
index 0b079977eb8..1f3bbc1f8f7 100644
--- a/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSE2.cpp
+++ b/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSE2.cpp
@@ -1,4 +1,3 @@
-
 /*
  * Copyright 2009 The Android Open Source Project
  *
@@ -6,9 +5,9 @@
  * found in the LICENSE file.
  */
 
-
 #include <emmintrin.h>
 #include "SkBitmapProcState_opts_SSE2.h"
+#include "SkColorPriv.h"
 #include "SkPaint.h"
 #include "SkUtils.h"
 
@@ -17,7 +16,7 @@ void S32_opaque_D32_filter_DX_SSE2(const SkBitmapProcState& s,
                                    int count, uint32_t* colors) {
     SkASSERT(count > 0 && colors != NULL);
     SkASSERT(s.fFilterLevel != SkPaint::kNone_FilterLevel);
-    SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config);
+    SkASSERT(kN32_SkColorType == s.fBitmap->colorType());
     SkASSERT(s.fAlphaScale == 256);
 
     const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels());
@@ -123,7 +122,7 @@ void S32_alpha_D32_filter_DX_SSE2(const SkBitmapProcState& s,
                                   int count, uint32_t* colors) {
     SkASSERT(count > 0 && colors != NULL);
     SkASSERT(s.fFilterLevel != SkPaint::kNone_FilterLevel);
-    SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config);
+    SkASSERT(kN32_SkColorType == s.fBitmap->colorType());
     SkASSERT(s.fAlphaScale < 256);
 
     const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels());
@@ -639,11 +638,11 @@ void ClampX_ClampY_nofilter_affine_SSE2(const SkBitmapProcState& s,
  *  It combines S32_opaque_D32_filter_DX_SSE2 and SkPixel32ToPixel16
  */
 void S32_D16_filter_DX_SSE2(const SkBitmapProcState& s,
-                                   const uint32_t* xy,
-                                   int count, uint16_t* colors) {
+                            const uint32_t* xy,
+                            int count, uint16_t* colors) {
     SkASSERT(count > 0 && colors != NULL);
     SkASSERT(s.fFilterLevel != SkPaint::kNone_FilterLevel);
-    SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config);
+    SkASSERT(kN32_SkColorType == s.fBitmap->colorType());
     SkASSERT(s.fBitmap->isOpaque());
 
     SkPMColor dstColor;
@@ -744,23 +743,6 @@ void S32_D16_filter_DX_SSE2(const SkBitmapProcState& s,
         // Extract low int and store.
         dstColor = _mm_cvtsi128_si32(sum);
 
-        //*colors++ = SkPixel32ToPixel16(dstColor);
-        // below is much faster than the above. It's tested for Android benchmark--Softweg
-        __m128i _m_temp1 = _mm_set1_epi32(dstColor);
-        __m128i _m_temp2 = _mm_srli_epi32(_m_temp1, 3);
-
-        unsigned int r32 = _mm_cvtsi128_si32(_m_temp2);
-        unsigned r = (r32 & ((1<<5) -1)) << 11;
-
-        _m_temp2 = _mm_srli_epi32(_m_temp2, 7);
-        unsigned int g32 = _mm_cvtsi128_si32(_m_temp2);
-        unsigned g = (g32 & ((1<<6) -1)) << 5;
-
-        _m_temp2 = _mm_srli_epi32(_m_temp2, 9);
-        unsigned int b32 = _mm_cvtsi128_si32(_m_temp2);
-        unsigned b = (b32 & ((1<<5) -1));
-
-        *colors++ = r | g | b;
-
+        *colors++ = SkPixel32ToPixel16(dstColor);
     } while (--count > 0);
 }
diff --git a/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSE2.h b/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSE2.h
index 46e35a0f96f..82c5cc8d6e1 100644
--- a/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSE2.h
+++ b/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSE2.h
@@ -1,4 +1,3 @@
-
 /*
  * Copyright 2009 The Android Open Source Project
  *
@@ -6,6 +5,8 @@
  * found in the LICENSE file.
  */
 
+#ifndef SkBitmapProcState_opts_SSE2_DEFINED
+#define SkBitmapProcState_opts_SSE2_DEFINED
 
 #include "SkBitmapProcState.h"
 
@@ -24,7 +25,9 @@ void ClampX_ClampY_nofilter_scale_SSE2(const SkBitmapProcState& s,
 void ClampX_ClampY_filter_affine_SSE2(const SkBitmapProcState& s,
                                       uint32_t xy[], int count, int x, int y);
 void ClampX_ClampY_nofilter_affine_SSE2(const SkBitmapProcState& s,
-                                       uint32_t xy[], int count, int x, int y);
+                                        uint32_t xy[], int count, int x, int y);
 void S32_D16_filter_DX_SSE2(const SkBitmapProcState& s,
-                                  const uint32_t* xy,
-                                  int count, uint16_t* colors);
+                            const uint32_t* xy,
+                            int count, uint16_t* colors);
+
+#endif
diff --git a/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSSE3.cpp b/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSSE3.cpp
index f8342ecaad5..5b97215cc01 100644
--- a/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSSE3.cpp
+++ b/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSSE3.cpp
@@ -5,11 +5,19 @@
  * found in the LICENSE file.
  */
 
-#include <tmmintrin.h>  // SSSE3
 #include "SkBitmapProcState_opts_SSSE3.h"
 #include "SkPaint.h"
 #include "SkUtils.h"
 
+/* With the exception of the Android framework we always build the SSSE3 functions
+ * and enable the caller to determine SSSE3 support.  However for the Android framework
+ * if the device does not support SSSE3 then the compiler will not supply the required
+ * -mssse3 option needed to build this file, so instead we provide a stub implementation.
+ */
+#if !defined(SK_BUILD_FOR_ANDROID_FRAMEWORK) || SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
+
+#include <tmmintrin.h>  // SSSE3
+
 // adding anonymous namespace seemed to force gcc to inline directly the
 // instantiation, instead of creating the functions
 // S32_generic_D32_filter_DX_SSSE3<true> and
@@ -387,7 +395,7 @@ void S32_generic_D32_filter_DX_SSSE3(const SkBitmapProcState& s,
                                      int count, uint32_t* colors) {
     SkASSERT(count > 0 && colors != NULL);
     SkASSERT(s.fFilterLevel != SkPaint::kNone_FilterLevel);
-    SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config);
+    SkASSERT(kN32_SkColorType == s.fBitmap->colorType());
     if (has_alpha) {
         SkASSERT(s.fAlphaScale < 256);
     } else {
@@ -417,9 +425,10 @@ void S32_generic_D32_filter_DX_SSSE3(const SkBitmapProcState& s,
     const __m128i zero = _mm_setzero_si128();
 
     __m128i alpha = _mm_setzero_si128();
-    if (has_alpha)
+    if (has_alpha) {
         // 8x(alpha)
         alpha = _mm_set1_epi16(s.fAlphaScale);
+    }
 
     if (sub_y == 0) {
         // Unroll 4x, interleave bytes, use pmaddubsw (all_x is small)
@@ -578,7 +587,7 @@ void S32_generic_D32_filter_DXDY_SSSE3(const SkBitmapProcState& s,
                                        int count, uint32_t* colors) {
     SkASSERT(count > 0 && colors != NULL);
     SkASSERT(s.fFilterLevel != SkPaint::kNone_FilterLevel);
-    SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config);
+    SkASSERT(kN32_SkColorType == s.fBitmap->colorType());
     if (has_alpha) {
         SkASSERT(s.fAlphaScale < 256);
     } else {
@@ -697,7 +706,7 @@ void S32_generic_D32_filter_DXDY_SSSE3(const SkBitmapProcState& s,
         *colors++ = _mm_cvtsi128_si32(sum0);
     }
 }
-}  // namepace
+}  // namespace
 
 void S32_opaque_D32_filter_DX_SSSE3(const SkBitmapProcState& s,
                                     const uint32_t* xy,
@@ -722,3 +731,31 @@ void S32_alpha_D32_filter_DXDY_SSSE3(const SkBitmapProcState& s,
                                    int count, uint32_t* colors) {
     S32_generic_D32_filter_DXDY_SSSE3<true>(s, xy, count, colors);
 }
+
+#else // !defined(SK_BUILD_FOR_ANDROID_FRAMEWORK) || SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
+
+void S32_opaque_D32_filter_DX_SSSE3(const SkBitmapProcState& s,
+                                    const uint32_t* xy,
+                                    int count, uint32_t* colors) {
+    sk_throw();  // stub: compiled only in the #else branch, i.e. when the SSSE3 code above is excluded
+}
+
+void S32_alpha_D32_filter_DX_SSSE3(const SkBitmapProcState& s,
+                                   const uint32_t* xy,
+                                   int count, uint32_t* colors) {
+    sk_throw();  // stub: compiled only in the #else branch, i.e. when the SSSE3 code above is excluded
+}
+
+void S32_opaque_D32_filter_DXDY_SSSE3(const SkBitmapProcState& s,
+                                    const uint32_t* xy,
+                                    int count, uint32_t* colors) {
+    sk_throw();  // stub: compiled only in the #else branch, i.e. when the SSSE3 code above is excluded
+}
+
+void S32_alpha_D32_filter_DXDY_SSSE3(const SkBitmapProcState& s,
+                                   const uint32_t* xy,
+                                   int count, uint32_t* colors) {
+    sk_throw();  // stub: compiled only in the #else branch, i.e. when the SSSE3 code above is excluded
+}
+
+#endif
diff --git a/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSSE3.h b/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSSE3.h
index 176f2bfbe74..9fd074aacf2 100644
--- a/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSSE3.h
+++ b/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSSE3.h
@@ -5,6 +5,9 @@
  * found in the LICENSE file.
  */
 
+#ifndef SkBitmapProcState_opts_SSSE3_DEFINED
+#define SkBitmapProcState_opts_SSSE3_DEFINED
+
 #include "SkBitmapProcState.h"
 
 void S32_opaque_D32_filter_DX_SSSE3(const SkBitmapProcState& s,
@@ -19,3 +22,5 @@ void S32_opaque_D32_filter_DXDY_SSSE3(const SkBitmapProcState& s,
 void S32_alpha_D32_filter_DXDY_SSSE3(const SkBitmapProcState& s,
                                    const uint32_t* xy,
                                    int count, uint32_t* colors);
+
+#endif
diff --git a/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_arm.cpp b/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_arm.cpp
index 96fbebd4e19..ffa0ccfa8aa 100644
--- a/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_arm.cpp
+++ b/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_arm.cpp
@@ -15,7 +15,7 @@
 
 #include "SkConvolver.h"
 
-#if SK_ARM_ARCH >= 6 && !defined(SK_CPU_BENDIAN)
+#if !defined(SK_CPU_ARM64) && SK_ARM_ARCH >= 6 && !defined(SK_CPU_BENDIAN)
 void SI8_D16_nofilter_DX_arm(
     const SkBitmapProcState& s,
     const uint32_t* SK_RESTRICT xy,
@@ -186,7 +186,7 @@ void SI8_opaque_D32_nofilter_DX_arm(const SkBitmapProcState& s,
 
     s.fBitmap->getColorTable()->unlockColors();
 }
-#endif // SK_ARM_ARCH >= 6 && !defined(SK_CPU_BENDIAN)
+#endif // !defined(SK_CPU_ARM64) && SK_ARM_ARCH >= 6 && !defined(SK_CPU_BENDIAN)
 
 ///////////////////////////////////////////////////////////////////////////////
 
@@ -194,6 +194,7 @@ void SI8_opaque_D32_nofilter_DX_arm(const SkBitmapProcState& s,
     otherwise the shader won't even look at the matrix/sampler
  */
 void SkBitmapProcState::platformProcs() {
+#if !defined(SK_CPU_ARM64) && SK_ARM_ARCH >= 6 && !defined(SK_CPU_BENDIAN)
     bool isOpaque = 256 == fAlphaScale;
     bool justDx = false;
 
@@ -201,9 +202,8 @@ void SkBitmapProcState::platformProcs() {
         justDx = true;
     }
 
-    switch (fBitmap->config()) {
-        case SkBitmap::kIndex8_Config:
-#if SK_ARM_ARCH >= 6 && !defined(SK_CPU_BENDIAN)
+    switch (fBitmap->colorType()) {
+        case kIndex_8_SkColorType:
             if (justDx && SkPaint::kNone_FilterLevel == fFilterLevel) {
 #if 0   /* crashing on android device */
                 fSampleProc16 = SI8_D16_nofilter_DX_arm;
@@ -215,11 +215,11 @@ void SkBitmapProcState::platformProcs() {
                     fShaderProc32 = NULL;
                 }
             }
-#endif
             break;
         default:
             break;
     }
+#endif
 }
 
 ///////////////////////////////////////////////////////////////////////////////
diff --git a/chromium/third_party/skia/src/opts/SkBlitMask_opts_arm.cpp b/chromium/third_party/skia/src/opts/SkBlitMask_opts_arm.cpp
index 2bf760313c1..11e172c0d1d 100644
--- a/chromium/third_party/skia/src/opts/SkBlitMask_opts_arm.cpp
+++ b/chromium/third_party/skia/src/opts/SkBlitMask_opts_arm.cpp
@@ -1,3 +1,9 @@
+/*
+ * Copyright 2014 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
 
 #include "SkColor.h"
 #include "SkColorPriv.h"
@@ -5,21 +11,24 @@
 #include "SkUtilsArm.h"
 #include "SkBlitMask_opts_arm_neon.h"
 
-SkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkBitmap::Config dstConfig,
+SkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkColorType dstCT,
                                                      SkMask::Format maskFormat,
                                                      SkColor color) {
 #if SK_ARM_NEON_IS_NONE
     return NULL;
 #else
+/* ** This has been disabled until we can diagnose and fix the SIGILL generated
+   ** in the NEON code.  See http://skbug.com/2067 for details.
 #if SK_ARM_NEON_IS_DYNAMIC
     if (!sk_cpu_arm_has_neon()) {
         return NULL;
     }
 #endif
-    if ((SkBitmap::kARGB_8888_Config == dstConfig) &&
+    if ((kN32_SkColorType == dstCT) &&
         (SkMask::kA8_Format == maskFormat)) {
             return D32_A8_Factory_neon(color);
     }
+*/
 #endif
 
     // We don't need to handle the SkMask::kLCD16_Format case as the default
@@ -36,7 +45,7 @@ SkBlitMask::BlitLCD16RowProc SkBlitMask::PlatformBlitRowProcs16(bool isOpaque) {
     }
 }
 
-SkBlitMask::RowProc SkBlitMask::PlatformRowProcs(SkBitmap::Config dstConfig,
+SkBlitMask::RowProc SkBlitMask::PlatformRowProcs(SkColorType dstCT,
                                                  SkMask::Format maskFormat,
                                                  RowFlags flags) {
     return NULL;
diff --git a/chromium/third_party/skia/src/opts/SkBlitMask_opts_none.cpp b/chromium/third_party/skia/src/opts/SkBlitMask_opts_none.cpp
index 0ad09193871..90f89a71292 100644
--- a/chromium/third_party/skia/src/opts/SkBlitMask_opts_none.cpp
+++ b/chromium/third_party/skia/src/opts/SkBlitMask_opts_none.cpp
@@ -1,7 +1,13 @@
+/*
+ * Copyright 2014 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
 
 #include "SkBlitMask.h"
 
-SkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkBitmap::Config dstConfig,
+SkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkColorType dstCT,
                                                      SkMask::Format maskFormat,
                                                      SkColor color) {
     return NULL;
@@ -11,7 +17,7 @@ SkBlitMask::BlitLCD16RowProc SkBlitMask::PlatformBlitRowProcs16(bool isOpaque) {
     return NULL;
 }
 
-SkBlitMask::RowProc SkBlitMask::PlatformRowProcs(SkBitmap::Config dstConfig,
+SkBlitMask::RowProc SkBlitMask::PlatformRowProcs(SkColorType dstCT,
                                                  SkMask::Format maskFormat,
                                                  RowFlags flags) {
     return NULL;
diff --git a/chromium/third_party/skia/src/opts/SkBlitRect_opts_SSE2.cpp b/chromium/third_party/skia/src/opts/SkBlitRect_opts_SSE2.cpp
index 3cb2b9c6d09..d65a313dadf 100644
--- a/chromium/third_party/skia/src/opts/SkBlitRect_opts_SSE2.cpp
+++ b/chromium/third_party/skia/src/opts/SkBlitRect_opts_SSE2.cpp
@@ -5,15 +5,14 @@
  * found in the LICENSE file.
  */
 
+#include <emmintrin.h>
 #include "SkBlitRect_opts_SSE2.h"
 #include "SkBlitRow.h"
 #include "SkColorPriv.h"
 
-#include <emmintrin.h>
-
-/** Simple blitting of opaque rectangles less than 31 pixels wide:
-    inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2.
-*/
+/* Simple blitting of opaque rectangles less than 31 pixels wide:
+ * inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2.
+ */
 static void BlitRect32_OpaqueNarrow_SSE2(SkPMColor* SK_RESTRICT destination,
                                   int width, int height,
                                   size_t rowBytes, uint32_t color) {
@@ -42,12 +41,12 @@ static void BlitRect32_OpaqueNarrow_SSE2(SkPMColor* SK_RESTRICT destination,
     }
 }
 
-/**
-  Fast blitting of opaque rectangles at least 31 pixels wide:
-  inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2.
-  A 31 pixel rectangle is guaranteed to have at least one
-  16-pixel aligned span that can take advantage of mm_store.
-*/
+/*
+ * Fast blitting of opaque rectangles at least 31 pixels wide:
+ * inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2.
+ * A 31 pixel rectangle is guaranteed to have at least one
+ * 16-pixel aligned span that can take advantage of mm_store.
+ */
 static void BlitRect32_OpaqueWide_SSE2(SkPMColor* SK_RESTRICT destination,
                                 int width, int height,
                                 size_t rowBytes, uint32_t color) {
diff --git a/chromium/third_party/skia/src/opts/SkBlitRect_opts_SSE2.h b/chromium/third_party/skia/src/opts/SkBlitRect_opts_SSE2.h
index 4d2f74a4b1b..3d09f5c3abc 100644
--- a/chromium/third_party/skia/src/opts/SkBlitRect_opts_SSE2.h
+++ b/chromium/third_party/skia/src/opts/SkBlitRect_opts_SSE2.h
@@ -8,13 +8,11 @@
 #ifndef SkBlitRect_opts_SSE2_DEFINED
 #define SkBlitRect_opts_SSE2_DEFINED
 
-/*
-  These functions' implementations copy sections of both
-  SkBlitRow_opts_SSE2 and SkUtils_opts_SSE2.
-*/
-
 #include "SkColor.h"
 
+/* These functions' implementations copy sections of both
+ * SkBlitRow_opts_SSE2 and SkUtils_opts_SSE2.
+ */
 void ColorRect32_SSE2(SkPMColor* SK_RESTRICT dst,
                       int width, int height,
                       size_t rowBytes, uint32_t color);
diff --git a/chromium/third_party/skia/src/opts/SkBlitRow_opts_SSE2.cpp b/chromium/third_party/skia/src/opts/SkBlitRow_opts_SSE2.cpp
index f3d010e3bc4..391b24c8673 100644
--- a/chromium/third_party/skia/src/opts/SkBlitRow_opts_SSE2.cpp
+++ b/chromium/third_party/skia/src/opts/SkBlitRow_opts_SSE2.cpp
@@ -5,14 +5,14 @@
  * found in the LICENSE file.
  */
 
-
-#include "SkBlitRow_opts_SSE2.h"
+#include <emmintrin.h>
 #include "SkBitmapProcState_opts_SSE2.h"
+#include "SkBlitRow_opts_SSE2.h"
 #include "SkColorPriv.h"
+#include "SkColor_opts_SSE2.h"
+#include "SkDither.h"
 #include "SkUtils.h"
 
-#include <emmintrin.h>
-
 /* SSE2 version of S32_Blend_BlitRow32()
  * portable version is in core/SkBlitRow_D32.cpp
  */
@@ -177,7 +177,7 @@ void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
             d++;
             count -= 4;
         }
-    #else
+#else
         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
         __m128i c_256 = _mm_set1_epi16(0x0100);  // 8 copies of 256 (16-bit)
         while (count >= 4) {
@@ -340,7 +340,6 @@ void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
  */
 void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
                   SkPMColor color) {
-
     if (count <= 0) {
         return;
     }
@@ -404,7 +403,7 @@ void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
             }
             src = reinterpret_cast<const SkPMColor*>(s);
             dst = reinterpret_cast<SkPMColor*>(d);
-         }
+        }
 
         while (count > 0) {
             *dst = color + SkAlphaMulQ(*src, scale);
@@ -502,7 +501,7 @@ void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
             }
             dst = reinterpret_cast<SkPMColor *>(d);
         }
-        while(count > 0) {
+        while (count > 0) {
             *dst= SkBlendARGB32(color, *dst, *mask);
             dst += 1;
             mask++;
@@ -851,3 +850,512 @@ void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],
         width--;
     }
 }
+
+/* SSE2 version of S32_D565_Opaque()
+ * portable version is in core/SkBlitRow_D16.cpp
+ */
+void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
+                          const SkPMColor* SK_RESTRICT src, int count,
+                          U8CPU alpha, int /*x*/, int /*y*/) {
+    SkASSERT(255 == alpha);
+
+    if (count <= 0) {
+        return;
+    }
+
+    if (count >= 8) {
+        while (((size_t)dst & 0x0F) != 0) {
+            SkPMColor c = *src++;
+            SkPMColorAssert(c);
+
+            *dst++ = SkPixel32ToPixel16_ToU16(c);
+            count--;
+        }
+
+        const __m128i* s = reinterpret_cast<const __m128i*>(src);
+        __m128i* d = reinterpret_cast<__m128i*>(dst);
+        __m128i r16_mask = _mm_set1_epi32(SK_R16_MASK);
+        __m128i g16_mask = _mm_set1_epi32(SK_G16_MASK);
+        __m128i b16_mask = _mm_set1_epi32(SK_B16_MASK);
+
+        while (count >= 8) {
+            // Load 8 pixels of src.
+            __m128i src_pixel1 = _mm_loadu_si128(s++);
+            __m128i src_pixel2 = _mm_loadu_si128(s++);
+
+            // Calculate result r.
+            __m128i r1 = _mm_srli_epi32(src_pixel1,
+                                        SK_R32_SHIFT + (8 - SK_R16_BITS));
+            r1 = _mm_and_si128(r1, r16_mask);
+            __m128i r2 = _mm_srli_epi32(src_pixel2,
+                                        SK_R32_SHIFT + (8 - SK_R16_BITS));
+            r2 = _mm_and_si128(r2, r16_mask);
+            __m128i r = _mm_packs_epi32(r1, r2);
+
+            // Calculate result g.
+            __m128i g1 = _mm_srli_epi32(src_pixel1,
+                                        SK_G32_SHIFT + (8 - SK_G16_BITS));
+            g1 = _mm_and_si128(g1, g16_mask);
+            __m128i g2 = _mm_srli_epi32(src_pixel2,
+                                        SK_G32_SHIFT + (8 - SK_G16_BITS));
+            g2 = _mm_and_si128(g2, g16_mask);
+            __m128i g = _mm_packs_epi32(g1, g2);
+
+            // Calculate result b.
+            __m128i b1 = _mm_srli_epi32(src_pixel1,
+                                        SK_B32_SHIFT + (8 - SK_B16_BITS));
+            b1 = _mm_and_si128(b1, b16_mask);
+            __m128i b2 = _mm_srli_epi32(src_pixel2,
+                                        SK_B32_SHIFT + (8 - SK_B16_BITS));
+            b2 = _mm_and_si128(b2, b16_mask);
+            __m128i b = _mm_packs_epi32(b1, b2);
+
+            // Store 8 16-bit colors in dst.
+            __m128i d_pixel = SkPackRGB16_SSE2(r, g, b);
+            _mm_store_si128(d++, d_pixel);
+            count -= 8;
+        }
+        src = reinterpret_cast<const SkPMColor*>(s);
+        dst = reinterpret_cast<uint16_t*>(d);
+    }
+
+    if (count > 0) {
+        do {
+            SkPMColor c = *src++;
+            SkPMColorAssert(c);
+            *dst++ = SkPixel32ToPixel16_ToU16(c);
+        } while (--count != 0);
+    }
+}
+
+/* SSE2 version of S32A_D565_Opaque()
+ * portable version is in core/SkBlitRow_D16.cpp
+ */
+void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
+                           const SkPMColor* SK_RESTRICT src,
+                           int count, U8CPU alpha, int /*x*/, int /*y*/) {
+    SkASSERT(255 == alpha);
+
+    if (count <= 0) {
+        return;
+    }
+
+    if (count >= 8) {
+        // Process pixels one at a time until dst is 16-byte aligned.
+        while (((size_t)dst & 0x0F) != 0) {
+            SkPMColor c = *src++;
+            if (c) {
+              *dst = SkSrcOver32To16(c, *dst);
+            }
+            dst += 1;
+            count--;
+        }
+
+        const __m128i* s = reinterpret_cast<const __m128i*>(src);
+        __m128i* d = reinterpret_cast<__m128i*>(dst);
+        __m128i var255 = _mm_set1_epi16(255);
+        __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
+        __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
+        __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
+
+        while (count >= 8) {
+            // Load 8 pixels of src.
+            __m128i src_pixel1 = _mm_loadu_si128(s++);
+            __m128i src_pixel2 = _mm_loadu_si128(s++);
+
+            // Compare the src pixels against zero and collect the top bit of
+            // each byte of the result. If every src pixel is zero, src_cmp1
+            // and src_cmp2 are both 0xFFFF and these 8 dst pixels are skipped.
+            int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1,
+                                             _mm_setzero_si128()));
+            int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2,
+                                             _mm_setzero_si128()));
+            if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) {
+                d++;
+                count -= 8;
+                continue;
+            }
+
+            // Load 8 pixels of dst.
+            __m128i dst_pixel = _mm_load_si128(d);
+
+            // Extract A from src.
+            __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
+            sa1 = _mm_srli_epi32(sa1, 24);
+            __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
+            sa2 = _mm_srli_epi32(sa2, 24);
+            __m128i sa = _mm_packs_epi32(sa1, sa2);
+
+            // Extract R from src.
+            __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
+            sr1 = _mm_srli_epi32(sr1, 24);
+            __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
+            sr2 = _mm_srli_epi32(sr2, 24);
+            __m128i sr = _mm_packs_epi32(sr1, sr2);
+
+            // Extract G from src.
+            __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
+            sg1 = _mm_srli_epi32(sg1, 24);
+            __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
+            sg2 = _mm_srli_epi32(sg2, 24);
+            __m128i sg = _mm_packs_epi32(sg1, sg2);
+
+            // Extract B from src.
+            __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
+            sb1 = _mm_srli_epi32(sb1, 24);
+            __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
+            sb2 = _mm_srli_epi32(sb2, 24);
+            __m128i sb = _mm_packs_epi32(sb1, sb2);
+
+            // Extract R G B from dst.
+            __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
+            dr = _mm_and_si128(dr, r16_mask);
+            __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
+            dg = _mm_and_si128(dg, g16_mask);
+            __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
+            db = _mm_and_si128(db, b16_mask);
+
+            __m128i isa = _mm_sub_epi16(var255, sa); // 255 - sa
+
+            // Calculate R G B of result.
+            // Original algorithm is in SkSrcOver32To16().
+            dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE2(dr, isa, SK_R16_BITS));
+            dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS);
+            dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE2(dg, isa, SK_G16_BITS));
+            dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS);
+            db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE2(db, isa, SK_B16_BITS));
+            db = _mm_srli_epi16(db, 8 - SK_B16_BITS);
+
+            // Pack R G B into 16-bit color.
+            __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);
+
+            // Store 8 16-bit colors in dst.
+            _mm_store_si128(d++, d_pixel);
+            count -= 8;
+        }
+
+        src = reinterpret_cast<const SkPMColor*>(s);
+        dst = reinterpret_cast<uint16_t*>(d);
+    }
+
+    if (count > 0) {
+        do {
+            SkPMColor c = *src++;
+            SkPMColorAssert(c);
+            if (c) {
+                *dst = SkSrcOver32To16(c, *dst);
+            }
+            dst += 1;
+        } while (--count != 0);
+    }
+}
+
+void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
+                                 const SkPMColor* SK_RESTRICT src,
+                                 int count, U8CPU alpha, int x, int y) {
+    SkASSERT(255 == alpha);
+
+    if (count <= 0) {
+        return;
+    }
+
+    if (count >= 8) {
+        while (((size_t)dst & 0x0F) != 0) {
+            DITHER_565_SCAN(y);
+            SkPMColor c = *src++;
+            SkPMColorAssert(c);
+
+            unsigned dither = DITHER_VALUE(x);
+            *dst++ = SkDitherRGB32To565(c, dither);
+            DITHER_INC_X(x);
+            count--;
+        }
+
+        unsigned short dither_value[8];
+        __m128i dither;
+#ifdef ENABLE_DITHER_MATRIX_4X4
+        const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
+        dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
+        dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
+        dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
+        dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
+#else
+        const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
+        dither_value[0] = dither_value[4] = (dither_scan
+                                             >> (((x) & 3) << 2)) & 0xF;
+        dither_value[1] = dither_value[5] = (dither_scan
+                                             >> (((x + 1) & 3) << 2)) & 0xF;
+        dither_value[2] = dither_value[6] = (dither_scan
+                                             >> (((x + 2) & 3) << 2)) & 0xF;
+        dither_value[3] = dither_value[7] = (dither_scan
+                                             >> (((x + 3) & 3) << 2)) & 0xF;
+#endif
+        dither = _mm_loadu_si128((__m128i*) dither_value);
+
+        const __m128i* s = reinterpret_cast<const __m128i*>(src);
+        __m128i* d = reinterpret_cast<__m128i*>(dst);
+
+        while (count >= 8) {
+            // Load 8 pixels of src.
+            __m128i src_pixel1 = _mm_loadu_si128(s++);
+            __m128i src_pixel2 = _mm_loadu_si128(s++);
+
+            // Extract R from src.
+            __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
+            sr1 = _mm_srli_epi32(sr1, 24);
+            __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
+            sr2 = _mm_srli_epi32(sr2, 24);
+            __m128i sr = _mm_packs_epi32(sr1, sr2);
+
+            // SkDITHER_R32To565(sr, dither)
+            __m128i sr_offset = _mm_srli_epi16(sr, 5);
+            sr = _mm_add_epi16(sr, dither);
+            sr = _mm_sub_epi16(sr, sr_offset);
+            sr = _mm_srli_epi16(sr, SK_R32_BITS - SK_R16_BITS);
+
+            // Extract G from src.
+            __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
+            sg1 = _mm_srli_epi32(sg1, 24);
+            __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
+            sg2 = _mm_srli_epi32(sg2, 24);
+            __m128i sg = _mm_packs_epi32(sg1, sg2);
+
+            // SkDITHER_G32To565(sg, dither)
+            __m128i sg_offset = _mm_srli_epi16(sg, 6);
+            sg = _mm_add_epi16(sg, _mm_srli_epi16(dither, 1));
+            sg = _mm_sub_epi16(sg, sg_offset);
+            sg = _mm_srli_epi16(sg, SK_G32_BITS - SK_G16_BITS);
+
+            // Extract B from src.
+            __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
+            sb1 = _mm_srli_epi32(sb1, 24);
+            __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
+            sb2 = _mm_srli_epi32(sb2, 24);
+            __m128i sb = _mm_packs_epi32(sb1, sb2);
+
+            // SkDITHER_B32To565(sb, dither)
+            __m128i sb_offset = _mm_srli_epi16(sb, 5);
+            sb = _mm_add_epi16(sb, dither);
+            sb = _mm_sub_epi16(sb, sb_offset);
+            sb = _mm_srli_epi16(sb, SK_B32_BITS - SK_B16_BITS);
+
+            // Pack and store 16-bit dst pixel.
+            __m128i d_pixel = SkPackRGB16_SSE2(sr, sg, sb);
+            _mm_store_si128(d++, d_pixel);
+
+            count -= 8;
+            x += 8;
+        }
+
+        src = reinterpret_cast<const SkPMColor*>(s);
+        dst = reinterpret_cast<uint16_t*>(d);
+    }
+
+    if (count > 0) {
+        DITHER_565_SCAN(y);
+        do {
+            SkPMColor c = *src++;
+            SkPMColorAssert(c);
+
+            unsigned dither = DITHER_VALUE(x);
+            *dst++ = SkDitherRGB32To565(c, dither);
+            DITHER_INC_X(x);
+        } while (--count != 0);
+    }
+}
+
+/* SSE2 version of S32A_D565_Opaque_Dither()
+ * portable version is in core/SkBlitRow_D16.cpp
+ */
+void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
+                                  const SkPMColor* SK_RESTRICT src,
+                                  int count, U8CPU alpha, int x, int y) {
+    SkASSERT(255 == alpha);
+
+    if (count <= 0) {
+        return;
+    }
+
+    if (count >= 8) {
+        while (((size_t)dst & 0x0F) != 0) {
+            DITHER_565_SCAN(y);
+            SkPMColor c = *src++;
+            SkPMColorAssert(c);
+            if (c) {
+                unsigned a = SkGetPackedA32(c);
+
+                int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
+
+                unsigned sr = SkGetPackedR32(c);
+                unsigned sg = SkGetPackedG32(c);
+                unsigned sb = SkGetPackedB32(c);
+                sr = SkDITHER_R32_FOR_565(sr, d);
+                sg = SkDITHER_G32_FOR_565(sg, d);
+                sb = SkDITHER_B32_FOR_565(sb, d);
+
+                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
+                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
+                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
+                // now src and dst expanded are in g:11 r:10 x:1 b:10
+                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
+            }
+            dst += 1;
+            DITHER_INC_X(x);
+            count--;
+        }
+
+        unsigned short dither_value[8];
+        __m128i dither, dither_cur;
+#ifdef ENABLE_DITHER_MATRIX_4X4
+        const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
+        dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
+        dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
+        dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
+        dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
+#else
+        const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
+        dither_value[0] = dither_value[4] = (dither_scan
+                                             >> (((x) & 3) << 2)) & 0xF;
+        dither_value[1] = dither_value[5] = (dither_scan
+                                             >> (((x + 1) & 3) << 2)) & 0xF;
+        dither_value[2] = dither_value[6] = (dither_scan
+                                             >> (((x + 2) & 3) << 2)) & 0xF;
+        dither_value[3] = dither_value[7] = (dither_scan
+                                             >> (((x + 3) & 3) << 2)) & 0xF;
+#endif
+        dither = _mm_loadu_si128((__m128i*) dither_value);
+
+        const __m128i* s = reinterpret_cast<const __m128i*>(src);
+        __m128i* d = reinterpret_cast<__m128i*>(dst);
+        __m128i var256 = _mm_set1_epi16(256);
+        __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
+        __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
+        __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
+
+        while (count >= 8) {
+            // Load 8 pixels of src and dst.
+            __m128i src_pixel1 = _mm_loadu_si128(s++);
+            __m128i src_pixel2 = _mm_loadu_si128(s++);
+            __m128i dst_pixel = _mm_load_si128(d);
+
+            // Extract A from src.
+            __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
+            sa1 = _mm_srli_epi32(sa1, 24);
+            __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
+            sa2 = _mm_srli_epi32(sa2, 24);
+            __m128i sa = _mm_packs_epi32(sa1, sa2);
+
+            // Calculate current dither value.
+            dither_cur = _mm_mullo_epi16(dither,
+                                         _mm_add_epi16(sa, _mm_set1_epi16(1)));
+            dither_cur = _mm_srli_epi16(dither_cur, 8);
+
+            // Extract R from src.
+            __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
+            sr1 = _mm_srli_epi32(sr1, 24);
+            __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
+            sr2 = _mm_srli_epi32(sr2, 24);
+            __m128i sr = _mm_packs_epi32(sr1, sr2);
+
+            // SkDITHER_R32_FOR_565(sr, d)
+            __m128i sr_offset = _mm_srli_epi16(sr, 5);
+            sr = _mm_add_epi16(sr, dither_cur);
+            sr = _mm_sub_epi16(sr, sr_offset);
+
+            // Expand sr.
+            sr = _mm_slli_epi16(sr, 2);
+
+            // Extract G from src.
+            __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
+            sg1 = _mm_srli_epi32(sg1, 24);
+            __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
+            sg2 = _mm_srli_epi32(sg2, 24);
+            __m128i sg = _mm_packs_epi32(sg1, sg2);
+
+            // sg = SkDITHER_G32_FOR_565(sg, d).
+            __m128i sg_offset = _mm_srli_epi16(sg, 6);
+            sg = _mm_add_epi16(sg, _mm_srli_epi16(dither_cur, 1));
+            sg = _mm_sub_epi16(sg, sg_offset);
+
+            // Expand sg.
+            sg = _mm_slli_epi16(sg, 3);
+
+            // Extract B from src.
+            __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
+            sb1 = _mm_srli_epi32(sb1, 24);
+            __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
+            sb2 = _mm_srli_epi32(sb2, 24);
+            __m128i sb = _mm_packs_epi32(sb1, sb2);
+
+            // sb = SkDITHER_B32_FOR_565(sb, d).
+            __m128i sb_offset = _mm_srli_epi16(sb, 5);
+            sb = _mm_add_epi16(sb, dither_cur);
+            sb = _mm_sub_epi16(sb, sb_offset);
+
+            // Expand sb.
+            sb = _mm_slli_epi16(sb, 2);
+
+            // Extract R G B from dst.
+            __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
+            dr = _mm_and_si128(dr, r16_mask);
+            __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
+            dg = _mm_and_si128(dg, g16_mask);
+            __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
+            db = _mm_and_si128(db, b16_mask);
+
+            // SkAlpha255To256(255 - a) >> 3
+            __m128i isa = _mm_sub_epi16(var256, sa);
+            isa = _mm_srli_epi16(isa, 3);
+
+            dr = _mm_mullo_epi16(dr, isa);
+            dr = _mm_add_epi16(dr, sr);
+            dr = _mm_srli_epi16(dr, 5);
+
+            dg = _mm_mullo_epi16(dg, isa);
+            dg = _mm_add_epi16(dg, sg);
+            dg = _mm_srli_epi16(dg, 5);
+
+            db = _mm_mullo_epi16(db, isa);
+            db = _mm_add_epi16(db, sb);
+            db = _mm_srli_epi16(db, 5);
+
+            // Pack and store dst pixel.
+            __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);
+            _mm_store_si128(d++, d_pixel);
+
+            count -= 8;
+            x += 8;
+        }
+
+        src = reinterpret_cast<const SkPMColor*>(s);
+        dst = reinterpret_cast<uint16_t*>(d);
+    }
+
+    if (count > 0) {
+        DITHER_565_SCAN(y);
+        do {
+            SkPMColor c = *src++;
+            SkPMColorAssert(c);
+            if (c) {
+                unsigned a = SkGetPackedA32(c);
+
+                int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
+
+                unsigned sr = SkGetPackedR32(c);
+                unsigned sg = SkGetPackedG32(c);
+                unsigned sb = SkGetPackedB32(c);
+                sr = SkDITHER_R32_FOR_565(sr, d);
+                sg = SkDITHER_G32_FOR_565(sg, d);
+                sb = SkDITHER_B32_FOR_565(sb, d);
+
+                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
+                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
+                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
+                // now src and dst expanded are in g:11 r:10 x:1 b:10
+                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
+            }
+            dst += 1;
+            DITHER_INC_X(x);
+        } while (--count != 0);
+    }
+}
diff --git a/chromium/third_party/skia/src/opts/SkBlitRow_opts_SSE2.h b/chromium/third_party/skia/src/opts/SkBlitRow_opts_SSE2.h
index b443ec7f213..29fd96e5e91 100644
--- a/chromium/third_party/skia/src/opts/SkBlitRow_opts_SSE2.h
+++ b/chromium/third_party/skia/src/opts/SkBlitRow_opts_SSE2.h
@@ -1,4 +1,3 @@
-
 /*
  * Copyright 2009 The Android Open Source Project
  *
@@ -6,6 +5,8 @@
  * found in the LICENSE file.
  */
 
+#ifndef SkBlitRow_opts_SSE2_DEFINED
+#define SkBlitRow_opts_SSE2_DEFINED
 
 #include "SkBlitRow.h"
 
@@ -28,3 +29,18 @@ void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t src[],
                          SkColor color, int width, SkPMColor);
 void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t src[],
                                SkColor color, int width, SkPMColor opaqueDst);
+
+void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
+                          const SkPMColor* SK_RESTRICT src, int count,
+                          U8CPU alpha, int /*x*/, int /*y*/);
+void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
+                           const SkPMColor* SK_RESTRICT src,
+                           int count, U8CPU alpha, int /*x*/, int /*y*/);
+void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
+                                 const SkPMColor* SK_RESTRICT src,
+                                 int count, U8CPU alpha, int x, int y);
+void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
+                                  const SkPMColor* SK_RESTRICT src,
+                                  int count, U8CPU alpha, int x, int y);
+
+#endif
diff --git a/chromium/third_party/skia/src/opts/SkBlitRow_opts_arm.cpp b/chromium/third_party/skia/src/opts/SkBlitRow_opts_arm.cpp
index e8e544e9dcb..34b8564723c 100644
--- a/chromium/third_party/skia/src/opts/SkBlitRow_opts_arm.cpp
+++ b/chromium/third_party/skia/src/opts/SkBlitRow_opts_arm.cpp
@@ -12,8 +12,6 @@
 #include "SkUtils.h"
 #include "SkUtilsArm.h"
 
-#include "SkCachePreload_arm.h"
-
 // Define USE_NEON_CODE to indicate that we need to build NEON routines
 #define USE_NEON_CODE  (!SK_ARM_NEON_IS_NONE)
 
@@ -376,3 +374,7 @@ SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
 SkBlitRow::ColorProc SkBlitRow::PlatformColorProc() {
     return SK_ARM_NEON_WRAP(Color32_arm);
 }
+
+SkBlitRow::ColorRectProc PlatformColorRectProcFactory() {
+    return NULL;  // no platform-specific ColorRectProc is provided by this file
+}
diff --git a/chromium/third_party/skia/src/opts/SkBlitRow_opts_arm_neon.cpp b/chromium/third_party/skia/src/opts/SkBlitRow_opts_arm_neon.cpp
index 672980d0d26..01a6a2aa745 100644
--- a/chromium/third_party/skia/src/opts/SkBlitRow_opts_arm_neon.cpp
+++ b/chromium/third_party/skia/src/opts/SkBlitRow_opts_arm_neon.cpp
@@ -14,10 +14,56 @@
 #include "SkMathPriv.h"
 #include "SkUtils.h"
 
-#include "SkCachePreload_arm.h"
 #include "SkColor_opts_neon.h"
 #include <arm_neon.h>
 
+#ifdef SK_CPU_ARM64
+static inline uint8x8x4_t sk_vld4_u8_arm64_3(const SkPMColor* SK_RESTRICT & src) {
+    uint8x8x4_t vsrc;   // NOTE: val[3] is never written by this helper
+    uint8x8_t vsrc_0, vsrc_1, vsrc_2;
+    // De-interleaving 32-byte load; advances the caller's src by 32 bytes; keeps lanes 0-2.
+    asm (
+        "ld4    {v0.8b - v3.8b}, [%[src]], #32 \t\n"
+        "mov    %[vsrc0].8b, v0.8b             \t\n"
+        "mov    %[vsrc1].8b, v1.8b             \t\n"
+        "mov    %[vsrc2].8b, v2.8b             \t\n"
+        : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
+          [vsrc2] "=w" (vsrc_2), [src] "+&r" (src)
+        : : "v0", "v1", "v2", "v3", "memory"   // asm reads *src, which is not an operand
+    );
+
+    vsrc.val[0] = vsrc_0;
+    vsrc.val[1] = vsrc_1;
+    vsrc.val[2] = vsrc_2;
+
+    return vsrc;
+}
+
+static inline uint8x8x4_t sk_vld4_u8_arm64_4(const SkPMColor* SK_RESTRICT & src) {
+    uint8x8x4_t vsrc;
+    uint8x8_t vsrc_0, vsrc_1, vsrc_2, vsrc_3;
+    // De-interleaving 32-byte load; advances the caller's src by 32 bytes; keeps all 4 lanes.
+    asm (
+        "ld4    {v0.8b - v3.8b}, [%[src]], #32 \t\n"
+        "mov    %[vsrc0].8b, v0.8b             \t\n"
+        "mov    %[vsrc1].8b, v1.8b             \t\n"
+        "mov    %[vsrc2].8b, v2.8b             \t\n"
+        "mov    %[vsrc3].8b, v3.8b             \t\n"
+        : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
+          [vsrc2] "=w" (vsrc_2), [vsrc3] "=w" (vsrc_3),
+          [src] "+&r" (src)
+        : : "v0", "v1", "v2", "v3", "memory"   // asm reads *src, which is not an operand
+    );
+
+    vsrc.val[0] = vsrc_0;
+    vsrc.val[1] = vsrc_1;
+    vsrc.val[2] = vsrc_2;
+    vsrc.val[3] = vsrc_3;
+
+    return vsrc;
+}
+#endif
+
 void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                            const SkPMColor* SK_RESTRICT src, int count,
                            U8CPU alpha, int /*x*/, int /*y*/) {
@@ -28,7 +74,12 @@ void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
         uint16x8_t vdst;
 
         // Load
+#ifdef SK_CPU_ARM64
+        vsrc = sk_vld4_u8_arm64_3(src);
+#else
         vsrc = vld4_u8((uint8_t*)src);
+        src += 8;
+#endif
 
         // Convert src to 565
         vdst = SkPixel32ToPixel16_neon8(vsrc);
@@ -38,7 +89,6 @@ void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
 
         // Prepare next iteration
         dst += 8;
-        src += 8;
         count -= 8;
     };
 
@@ -52,6 +102,92 @@ void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
     };
 }
 
+void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
+                          const SkPMColor* SK_RESTRICT src, int count,
+                          U8CPU alpha, int /*x*/, int /*y*/) {
+    SkASSERT(255 > alpha);
+    // Per 565 channel: dst += ((src565 - dst) * SkAlpha255To256(alpha)) >> 8.
+    uint16x8_t vmask_blue, vscale;
+
+    // prepare constants
+    vscale = vdupq_n_u16(SkAlpha255To256(alpha));
+    vmask_blue = vmovq_n_u16(0x1F);
+
+    while (count >= 8) {
+        uint8x8x4_t vsrc;
+        uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
+        uint16x8_t vres_r, vres_g, vres_b;
+
+        // Load 8 src pixels (de-interleaved) and advance src past them
+#ifdef SK_CPU_ARM64
+        vsrc = sk_vld4_u8_arm64_3(src);
+#else
+        {
+        register uint8x8_t d0 asm("d0");
+        register uint8x8_t d1 asm("d1");
+        register uint8x8_t d2 asm("d2");
+        register uint8x8_t d3 asm("d3");
+
+        asm (
+            "vld4.8    {d0-d3},[%[src]]!"
+            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
+            : : "memory"   // the load reads *src, which is not listed as an operand
+        );
+        vsrc.val[0] = d0;
+        vsrc.val[1] = d1;
+        vsrc.val[2] = d2;
+        }
+#endif
+
+        // Load and unpack dst
+        vdst = vld1q_u16(dst);
+        vdst_g = vshlq_n_u16(vdst, 5);        // shift green to top of lanes
+        vdst_b = vandq_u16(vdst, vmask_blue); // extract blue
+        vdst_r = vshrq_n_u16(vdst, 6+5);      // extract red
+        vdst_g = vshrq_n_u16(vdst_g, 5+5);    // extract green
+
+        // Shift src to 565 range
+        vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 3);
+        vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 2);
+        vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 3);
+
+        // Scale src - dst (u16 lanes: a negative difference wraps modulo 2^16)
+        vres_r = vmovl_u8(vsrc.val[NEON_R]) - vdst_r;
+        vres_g = vmovl_u8(vsrc.val[NEON_G]) - vdst_g;
+        vres_b = vmovl_u8(vsrc.val[NEON_B]) - vdst_b;
+
+        vres_r = vshrq_n_u16(vres_r * vscale, 8);
+        vres_g = vshrq_n_u16(vres_g * vscale, 8);
+        vres_b = vshrq_n_u16(vres_b * vscale, 8);
+
+        vres_r += vdst_r;
+        vres_g += vdst_g;
+        vres_b += vdst_b;
+
+        // Combine (each insert overwrites the bits above the previous channel's field)
+        vres_b = vsliq_n_u16(vres_b, vres_g, 5);    // insert green into blue
+        vres_b = vsliq_n_u16(vres_b, vres_r, 6+5);  // insert red into green/blue
+
+        // Store
+        vst1q_u16(dst, vres_b);
+        dst += 8;
+        count -= 8;
+    }
+    if (count > 0) {   // scalar tail for the remaining (< 8) pixels
+        int scale = SkAlpha255To256(alpha);
+        do {
+            SkPMColor c = *src++;
+            SkPMColorAssert(c);
+            uint16_t d = *dst;
+            *dst++ = SkPackRGB16(
+                    SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale),
+                    SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale),
+                    SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale));
+        } while (--count != 0);
+    }
+}
+
+#ifdef SK_CPU_ARM32
 void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                            const SkPMColor* SK_RESTRICT src, int count,
                            U8CPU alpha, int /*x*/, int /*y*/) {
@@ -229,114 +365,129 @@ void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                       );
     }
 }
+#endif
+
+static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
+    const uint16x8_t biased = vaddq_u16(prod, vdupq_n_u16(128));          // t = prod + 128
+    const uint16x8_t folded = vaddq_u16(biased, vshrq_n_u16(biased, 8));  // t + (t >> 8)
+    return vshrq_n_u16(folded, 8);  // (t + (t >> 8)) >> 8, per 16-bit lane
+}
 
 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src, int count,
                           U8CPU alpha, int /*x*/, int /*y*/) {
+   SkASSERT(255 > alpha);
 
-    U8CPU alpha_for_asm = alpha;
-
-    asm volatile (
-    /* This code implements a Neon version of S32A_D565_Blend. The output differs from
-     * the original in two respects:
-     *  1. The results have a few mismatches compared to the original code. These mismatches
-     *     never exceed 1. It's possible to improve accuracy vs. a floating point
-     *     implementation by introducing rounding right shifts (vrshr) for the final stage.
-     *     Rounding is not present in the code below, because although results would be closer
-     *     to a floating point implementation, the number of mismatches compared to the
-     *     original code would be far greater.
-     *  2. On certain inputs, the original code can overflow, causing colour channels to
-     *     mix. Although the Neon code can also overflow, it doesn't allow one colour channel
-     *     to affect another.
+    /* This code implements a Neon version of S32A_D565_Blend. The results have
+     * a few mismatches compared to the original code. These mismatches never
+     * exceed 1.
      */
 
-#if 1
-        /* reflects SkAlpha255To256()'s change from a+a>>7 to a+1 */
-                  "add        %[alpha], %[alpha], #1         \n\t"   // adjust range of alpha 0-256
+    if (count >= 8) {
+        uint16x8_t valpha_max, vmask_blue;
+        uint8x8_t valpha;
+
+        // prepare constants
+        valpha_max = vmovq_n_u16(255);
+        valpha = vdup_n_u8(alpha);
+        vmask_blue = vmovq_n_u16(SK_B16_MASK);
+
+        do {
+            uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
+            uint16x8_t vres_a, vres_r, vres_g, vres_b;
+            uint8x8x4_t vsrc;
+
+            // load pixels
+            vdst = vld1q_u16(dst);
+#ifdef SK_CPU_ARM64
+            vsrc = sk_vld4_u8_arm64_4(src);
 #else
-                  "add        %[alpha], %[alpha], %[alpha], lsr #7    \n\t"   // adjust range of alpha 0-256
+#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
+            asm (
+                "vld4.u8 %h[vsrc], [%[src]]!"
+                : [vsrc] "=w" (vsrc), [src] "+&r" (src)
+                : :
+            );
+#else
+            register uint8x8_t d0 asm("d0");
+            register uint8x8_t d1 asm("d1");
+            register uint8x8_t d2 asm("d2");
+            register uint8x8_t d3 asm("d3");
+
+            asm volatile (
+                "vld4.u8    {d0-d3},[%[src]]!;"
+                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
+                  [src] "+&r" (src)
+                : :
+            );
+            vsrc.val[0] = d0;
+            vsrc.val[1] = d1;
+            vsrc.val[2] = d2;
+            vsrc.val[3] = d3;
 #endif
-                  "vmov.u16   q3, #255                        \n\t"   // set up constant
-                  "movs       r4, %[count], lsr #3            \n\t"   // calc. count>>3
-                  "vmov.u16   d2[0], %[alpha]                 \n\t"   // move alpha to Neon
-                  "beq        2f                              \n\t"   // if count8 == 0, exit
-                  "vmov.u16   q15, #0x1f                      \n\t"   // set up blue mask
-
-                  "1:                                             \n\t"
-                  "vld1.u16   {d0, d1}, [%[dst]]              \n\t"   // load eight dst RGB565 pixels
-                  "subs       r4, r4, #1                      \n\t"   // decrement loop counter
-                  "vld4.u8    {d24, d25, d26, d27}, [%[src]]! \n\t"   // load eight src ABGR32 pixels
-                  //  and deinterleave
-
-                  "vshl.u16   q9, q0, #5                      \n\t"   // shift green to top of lanes
-                  "vand       q10, q0, q15                    \n\t"   // extract blue
-                  "vshr.u16   q8, q0, #11                     \n\t"   // extract red
-                  "vshr.u16   q9, q9, #10                     \n\t"   // extract green
-                  // dstrgb = {q8, q9, q10}
-
-                  "vshr.u8    d24, d24, #3                    \n\t"   // shift red to 565 range
-                  "vshr.u8    d25, d25, #2                    \n\t"   // shift green to 565 range
-                  "vshr.u8    d26, d26, #3                    \n\t"   // shift blue to 565 range
-
-                  "vmovl.u8   q11, d24                        \n\t"   // widen red to 16 bits
-                  "vmovl.u8   q12, d25                        \n\t"   // widen green to 16 bits
-                  "vmovl.u8   q14, d27                        \n\t"   // widen alpha to 16 bits
-                  "vmovl.u8   q13, d26                        \n\t"   // widen blue to 16 bits
-                  // srcrgba = {q11, q12, q13, q14}
-
-                  "vmul.u16   q2, q14, d2[0]                  \n\t"   // sa * src_scale
-                  "vmul.u16   q11, q11, d2[0]                 \n\t"   // red result = src_red * src_scale
-                  "vmul.u16   q12, q12, d2[0]                 \n\t"   // grn result = src_grn * src_scale
-                  "vmul.u16   q13, q13, d2[0]                 \n\t"   // blu result = src_blu * src_scale
-
-                  "vshr.u16   q2, q2, #8                      \n\t"   // sa * src_scale >> 8
-                  "vsub.u16   q2, q3, q2                      \n\t"   // 255 - (sa * src_scale >> 8)
-                  // dst_scale = q2
-
-                  "vmla.u16   q11, q8, q2                     \n\t"   // red result += dst_red * dst_scale
-                  "vmla.u16   q12, q9, q2                     \n\t"   // grn result += dst_grn * dst_scale
-                  "vmla.u16   q13, q10, q2                    \n\t"   // blu result += dst_blu * dst_scale
-
-#if 1
-    // trying for a better match with SkDiv255Round(a)
-    // C alg is:  a+=128; (a+a>>8)>>8
-    // we'll use just a rounding shift [q2 is available for scratch]
-                  "vrshr.u16   q11, q11, #8                    \n\t"   // shift down red
-                  "vrshr.u16   q12, q12, #8                    \n\t"   // shift down green
-                  "vrshr.u16   q13, q13, #8                    \n\t"   // shift down blue
+#endif // #ifdef SK_CPU_ARM64
+
+
+            // deinterleave dst
+            vdst_g = vshlq_n_u16(vdst, SK_R16_BITS);        // shift green to top of lanes
+            vdst_b = vdst & vmask_blue;                     // extract blue
+            vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT);       // extract red
+            vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green
+
+            // shift src to 565
+            vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS);
+            vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS);
+            vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS);
+
+            // calc src * src_scale
+            vres_a = vmull_u8(vsrc.val[NEON_A], valpha);
+            vres_r = vmull_u8(vsrc.val[NEON_R], valpha);
+            vres_g = vmull_u8(vsrc.val[NEON_G], valpha);
+            vres_b = vmull_u8(vsrc.val[NEON_B], valpha);
+
+            // prepare dst_scale
+            vres_a = SkDiv255Round_neon8(vres_a);
+            vres_a = valpha_max - vres_a; // 255 - (sa * src_scale) / 255
+
+            // add dst * dst_scale to previous result
+            vres_r = vmlaq_u16(vres_r, vdst_r, vres_a);
+            vres_g = vmlaq_u16(vres_g, vdst_g, vres_a);
+            vres_b = vmlaq_u16(vres_b, vdst_b, vres_a);
+
+#ifdef S32A_D565_BLEND_EXACT
+            // It is possible to get exact results with this but it is slow,
+            // even slower than C code in some cases
+            vres_r = SkDiv255Round_neon8(vres_r);
+            vres_g = SkDiv255Round_neon8(vres_g);
+            vres_b = SkDiv255Round_neon8(vres_b);
 #else
-    // arm's original "truncating divide by 256"
-                  "vshr.u16   q11, q11, #8                    \n\t"   // shift down red
-                  "vshr.u16   q12, q12, #8                    \n\t"   // shift down green
-                  "vshr.u16   q13, q13, #8                    \n\t"   // shift down blue
+            vres_r = vrshrq_n_u16(vres_r, 8);
+            vres_g = vrshrq_n_u16(vres_g, 8);
+            vres_b = vrshrq_n_u16(vres_b, 8);
 #endif
+            // pack result
+            vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT); // insert green into blue
+            vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT); // insert red into green/blue
 
-                  "vsli.u16   q13, q12, #5                    \n\t"   // insert green into blue
-                  "vsli.u16   q13, q11, #11                   \n\t"   // insert red into green/blue
-                  "vst1.16    {d26, d27}, [%[dst]]!           \n\t"   // write pixel back to dst, update ptr
-
-                  "bne        1b                              \n\t"   // if counter != 0, loop
-                  "2:                                             \n\t"   // exit
-
-                  : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count), [alpha] "+r" (alpha_for_asm)
-                  :
-                  : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
-                  );
+            // store
+            vst1q_u16(dst, vres_b);
+            dst += 8;
+            count -= 8;
+        } while (count >= 8);
+    }
 
-    count &= 7;
-    if (count > 0) {
-        do {
-            SkPMColor sc = *src++;
-            if (sc) {
-                uint16_t dc = *dst;
-                unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
-                unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
-                unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
-                unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
-                *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
-            }
-            dst += 1;
-        } while (--count != 0);
+    // leftovers
+    while (count-- > 0) {
+        SkPMColor sc = *src++;
+        if (sc) {
+            uint16_t dc = *dst;
+            unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
+            unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
+            unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
+            unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
+            *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
+        }
+        dst += 1;
     }
 }
 
@@ -374,6 +525,7 @@ void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
 
         do {
 
+            uint8x8x4_t vsrc;
             uint8x8_t vsrc_r, vsrc_g, vsrc_b;
             uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
             uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
@@ -384,6 +536,9 @@ void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
             int8x8_t vres8_r, vres8_g, vres8_b;
 
             // Load source and add dither
+#ifdef SK_CPU_ARM64
+            vsrc = sk_vld4_u8_arm64_3(src);
+#else
             {
             register uint8x8_t d0 asm("d0");
             register uint8x8_t d1 asm("d1");
@@ -391,17 +546,18 @@ void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
             register uint8x8_t d3 asm("d3");
 
             asm (
-                "vld4.8    {d0-d3},[%[src]]!  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
+                "vld4.8    {d0-d3},[%[src]]! "
                 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
                 :
             );
-            vsrc_g = d1;
-#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
-            vsrc_r = d2; vsrc_b = d0;
-#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
-            vsrc_r = d0; vsrc_b = d2;
-#endif
+            vsrc.val[0] = d0;
+            vsrc.val[1] = d1;
+            vsrc.val[2] = d2;
             }
+#endif
+            vsrc_r = vsrc.val[NEON_R];
+            vsrc_g = vsrc.val[NEON_G];
+            vsrc_b = vsrc.val[NEON_B];
 
             vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
             vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
@@ -766,76 +922,67 @@ void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                               const SkPMColor* SK_RESTRICT src,
                               int count, U8CPU alpha) {
     SkASSERT(alpha <= 255);
-    if (count > 0) {
-        uint16_t src_scale = SkAlpha255To256(alpha);
-        uint16_t dst_scale = 256 - src_scale;
-
-    /* run them N at a time through the NEON unit */
-    /* note that each 1 is 4 bytes, each treated exactly the same,
-     * so we can work under that guise. We *do* know that the src&dst
-     * will be 32-bit aligned quantities, so we can specify that on
-     * the load/store ops and do a neon 'reinterpret' to get us to
-     * byte-sized (pun intended) pieces that we widen/multiply/shift
-     * we're limited at 128 bits in the wide ops, which is 8x16bits
-     * or a pair of 32 bit src/dsts.
-     */
-    /* we *could* manually unroll this loop so that we load 128 bits
-     * (as a pair of 64s) from each of src and dst, processing them
-     * in pieces. This might give us a little better management of
-     * the memory latency, but my initial attempts here did not
-     * produce an instruction stream that looked all that nice.
-     */
-#define    UNROLL    2
-    while (count >= UNROLL) {
-        uint8x8_t  src_raw, dst_raw, dst_final;
-        uint16x8_t  src_wide, dst_wide;
 
-        /* get 64 bits of src, widen it, multiply by src_scale */
-        src_raw = vreinterpret_u8_u32(vld1_u32(src));
-        src_wide = vmovl_u8(src_raw);
-        /* gcc hoists vdupq_n_u16(), better than using vmulq_n_u16() */
-        src_wide = vmulq_u16 (src_wide, vdupq_n_u16(src_scale));
+    if (count <= 0) {
+        return;
+    }
 
-        /* ditto with dst */
-        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
-        dst_wide = vmovl_u8(dst_raw);
+    uint16_t src_scale = SkAlpha255To256(alpha);
+    uint16_t dst_scale = 256 - src_scale;
 
-        /* combine add with dst multiply into mul-accumulate */
-        dst_wide = vmlaq_u16(src_wide, dst_wide, vdupq_n_u16(dst_scale));
+    while (count >= 2) {
+        uint8x8_t vsrc, vdst, vres;
+        uint16x8_t vsrc_wide, vdst_wide;
 
-        dst_final = vshrn_n_u16(dst_wide, 8);
-        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
+        /* The commented-out prefetches below are a big win for count
+         * values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4.
+         * They also hurt a little (<5%) on an A15.
+         */
+        //__builtin_prefetch(src+32);
+        //__builtin_prefetch(dst+32);
 
-        src += UNROLL;
-        dst += UNROLL;
-        count -= UNROLL;
+        // Load
+        vsrc = vreinterpret_u8_u32(vld1_u32(src));
+        vdst = vreinterpret_u8_u32(vld1_u32(dst));
+
+        // Process src
+        vsrc_wide = vmovl_u8(vsrc);
+        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
+
+        // Process dst
+        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
+
+        // Combine
+        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
+
+        // Store
+        vst1_u32(dst, vreinterpret_u32_u8(vres));
+
+        src += 2;
+        dst += 2;
+        count -= 2;
     }
-    /* RBE: well, i don't like how gcc manages src/dst across the above
-     * loop it's constantly calculating src+bias, dst+bias and it only
-     * adjusts the real ones when we leave the loop. Not sure why
-     * it's "hoisting down" (hoisting implies above in my lexicon ;))
-     * the adjustments to src/dst/count, but it does...
-     * (might be SSA-style internal logic...
-     */
 
-#if    UNROLL == 2
     if (count == 1) {
-            *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
-    }
-#else
-    if (count > 0) {
-            do {
-                *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
-                src += 1;
-                dst += 1;
-            } while (--count > 0);
-    }
-#endif
+        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
+        uint16x8_t vsrc_wide, vdst_wide;
 
-#undef    UNROLL
+        // Load
+        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
+        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
+
+        // Process
+        vsrc_wide = vmovl_u8(vsrc);
+        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
+        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
+        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
+
+        // Store
+        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
     }
 }
 
+#ifdef SK_CPU_ARM32
 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src,
                          int count, U8CPU alpha) {
@@ -961,6 +1108,7 @@ static void showme16(char *str, void *p, int len)
     SkDebugf("%s\n", buf);
 }
 #endif
+#endif // #ifdef SK_CPU_ARM32
 
 void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
                                    const SkPMColor* SK_RESTRICT src,
@@ -970,9 +1118,8 @@ void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
 #define    UNROLL    8
 
     if (count >= UNROLL) {
-    uint8x8_t dbase;
 
-#if    defined(DEBUG_OPAQUE_DITHER)
+#if defined(DEBUG_OPAQUE_DITHER)
     uint16_t tmpbuf[UNROLL];
     int td[UNROLL];
     int tdv[UNROLL];
@@ -983,35 +1130,37 @@ void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
     int noisy = 0;
 #endif
 
+    uint8x8_t dbase;
     const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
     dbase = vld1_u8(dstart);
 
         do {
+        uint8x8x4_t vsrc;
         uint8x8_t sr, sg, sb, sa, d;
         uint16x8_t dst8, scale8, alpha8;
         uint16x8_t dst_r, dst_g, dst_b;
 
-#if    defined(DEBUG_OPAQUE_DITHER)
-    /* calculate 8 elements worth into a temp buffer */
-    {
-      int my_y = y;
-      int my_x = x;
-      SkPMColor* my_src = (SkPMColor*)src;
-      uint16_t* my_dst = dst;
-      int i;
-
-          DITHER_565_SCAN(my_y);
-          for(i=0;i<UNROLL;i++) {
+#if defined(DEBUG_OPAQUE_DITHER)
+        // calculate 8 elements worth into a temp buffer
+        {
+        int my_y = y;
+        int my_x = x;
+        SkPMColor* my_src = (SkPMColor*)src;
+        uint16_t* my_dst = dst;
+        int i;
+
+        DITHER_565_SCAN(my_y);
+        for(i = 0; i < UNROLL; i++) {
             SkPMColor c = *my_src++;
             SkPMColorAssert(c);
             if (c) {
                 unsigned a = SkGetPackedA32(c);
 
                 int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
-        tdv[i] = DITHER_VALUE(my_x);
-        ta[i] = a;
-        tap[i] = SkAlpha255To256(a);
-        td[i] = d;
+                tdv[i] = DITHER_VALUE(my_x);
+                ta[i] = a;
+                tap[i] = SkAlpha255To256(a);
+                td[i] = d;
 
                 unsigned sr = SkGetPackedR32(c);
                 unsigned sg = SkGetPackedG32(c);
@@ -1025,147 +1174,132 @@ void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
                 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                 // now src and dst expanded are in g:11 r:10 x:1 b:10
                 tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
-        td[i] = d;
-
+                td[i] = d;
             } else {
-        tmpbuf[i] = *my_dst;
-        ta[i] = tdv[i] = td[i] = 0xbeef;
-        }
-        in_dst[i] = *my_dst;
+                tmpbuf[i] = *my_dst;
+                ta[i] = tdv[i] = td[i] = 0xbeef;
+            }
+            in_dst[i] = *my_dst;
             my_dst += 1;
             DITHER_INC_X(my_x);
-          }
-    }
+        }
+        }
 #endif
 
-        /* source is in ABGR */
+#ifdef SK_CPU_ARM64
+        vsrc = sk_vld4_u8_arm64_4(src);
+#else
         {
         register uint8x8_t d0 asm("d0");
         register uint8x8_t d1 asm("d1");
         register uint8x8_t d2 asm("d2");
         register uint8x8_t d3 asm("d3");
 
-        asm ("vld4.8    {d0-d3},[%4]  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
-            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
-            : "r" (src)
-                    );
-            sr = d0; sg = d1; sb = d2; sa = d3;
+        asm ("vld4.8    {d0-d3},[%[src]]! "
+            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src)
+            :
+        );
+        vsrc.val[0] = d0;
+        vsrc.val[1] = d1;
+        vsrc.val[2] = d2;
+        vsrc.val[3] = d3;
         }
-
-        /* calculate 'd', which will be 0..7 */
-        /* dbase[] is 0..7; alpha is 0..256; 16 bits suffice */
-#if defined(SK_BUILD_FOR_ANDROID)
-        /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
-        alpha8 = vaddw_u8(vmovl_u8(sa), vdup_n_u8(1));
-#else
-        alpha8 = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7));
 #endif
-        alpha8 = vmulq_u16(alpha8, vmovl_u8(dbase));
-        d = vshrn_n_u16(alpha8, 8);    /* narrowing too */
+        sa = vsrc.val[NEON_A];
+        sr = vsrc.val[NEON_R];
+        sg = vsrc.val[NEON_G];
+        sb = vsrc.val[NEON_B];
 
-        /* sr = sr - (sr>>5) + d */
+        /* calculate 'd', which will be 0..7
+         * dbase[] is 0..7; alpha is 0..256; 16 bits suffice
+         */
+        alpha8 = vmovl_u8(dbase);
+        alpha8 = vmlal_u8(alpha8, sa, dbase);
+        d = vshrn_n_u16(alpha8, 8);    // narrowing too
+
+        // sr = sr - (sr>>5) + d
         /* watching for 8-bit overflow.  d is 0..7; risky range of
          * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
-         * safe  as long as we do ((sr-sr>>5) + d) */
+         * safe as long as we do ((sr-sr>>5) + d)
+         */
         sr = vsub_u8(sr, vshr_n_u8(sr, 5));
         sr = vadd_u8(sr, d);
 
-        /* sb = sb - (sb>>5) + d */
+        // sb = sb - (sb>>5) + d
         sb = vsub_u8(sb, vshr_n_u8(sb, 5));
         sb = vadd_u8(sb, d);
 
-        /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
+        // sg = sg - (sg>>6) + d>>1; similar logic for overflows
         sg = vsub_u8(sg, vshr_n_u8(sg, 6));
         sg = vadd_u8(sg, vshr_n_u8(d,1));
 
-        /* need to pick up 8 dst's -- at 16 bits each, 128 bits */
+        // need to pick up 8 dst's -- at 16 bits each, 128 bits
         dst8 = vld1q_u16(dst);
-        dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F));
-        dst_g = vandq_u16(vshrq_n_u16(dst8,5), vdupq_n_u16(0x003F));
-        dst_r = vshrq_n_u16(dst8,11);    /* clearing hi bits */
-
-        /* blend */
-#if 1
-        /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
-        /* originally 255-sa + 1 */
+        dst_b = vandq_u16(dst8, vdupq_n_u16(SK_B16_MASK));
+        dst_g = vshrq_n_u16(vshlq_n_u16(dst8, SK_R16_BITS), SK_R16_BITS + SK_B16_BITS);
+        dst_r = vshrq_n_u16(dst8, SK_R16_SHIFT);    // clearing hi bits
+
+        // blend
         scale8 = vsubw_u8(vdupq_n_u16(256), sa);
-#else
-        scale8 = vsubw_u8(vdupq_n_u16(255), sa);
-        scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7));
-#endif
 
-#if 1
-        /* combine the addq and mul, save 3 insns */
+        // combine the addq and mul, save 3 insns
         scale8 = vshrq_n_u16(scale8, 3);
         dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
         dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
         dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
-#else
-        /* known correct, but +3 insns over above */
-        scale8 = vshrq_n_u16(scale8, 3);
-        dst_b = vmulq_u16(dst_b, scale8);
-        dst_g = vmulq_u16(dst_g, scale8);
-        dst_r = vmulq_u16(dst_r, scale8);
-
-        /* combine */
-        /* NB: vshll widens, need to preserve those bits */
-        dst_b = vaddq_u16(dst_b, vshll_n_u8(sb,2));
-        dst_g = vaddq_u16(dst_g, vshll_n_u8(sg,3));
-        dst_r = vaddq_u16(dst_r, vshll_n_u8(sr,2));
-#endif
 
-        /* repack to store */
-        dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F));
+        // repack to store
+        dst8 = vshrq_n_u16(dst_b, 5);
         dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
         dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);
 
         vst1q_u16(dst, dst8);
 
-#if    defined(DEBUG_OPAQUE_DITHER)
-        /* verify my 8 elements match the temp buffer */
-    {
-       int i, bad=0;
-       static int invocation;
-
-       for (i=0;i<UNROLL;i++)
-        if (tmpbuf[i] != dst[i]) bad=1;
-       if (bad) {
-        SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
-            invocation, offset);
-        SkDebugf("  alpha 0x%x\n", alpha);
-        for (i=0;i<UNROLL;i++)
-            SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
-            i, ((tmpbuf[i] != dst[i])?"BAD":"got"),
-            dst[i], tmpbuf[i], in_dst[i], src[i], td[i], tdv[i], tap[i], ta[i]);
-
-        showme16("alpha8", &alpha8, sizeof(alpha8));
-        showme16("scale8", &scale8, sizeof(scale8));
-        showme8("d", &d, sizeof(d));
-        showme16("dst8", &dst8, sizeof(dst8));
-        showme16("dst_b", &dst_b, sizeof(dst_b));
-        showme16("dst_g", &dst_g, sizeof(dst_g));
-        showme16("dst_r", &dst_r, sizeof(dst_r));
-        showme8("sb", &sb, sizeof(sb));
-        showme8("sg", &sg, sizeof(sg));
-        showme8("sr", &sr, sizeof(sr));
-
-        /* cop out */
-        return;
-       }
-       offset += UNROLL;
-       invocation++;
-    }
-#endif
+#if defined(DEBUG_OPAQUE_DITHER)
+        // verify my 8 elements match the temp buffer
+        {
+        int i, bad=0;
+        static int invocation;
 
-            dst += UNROLL;
-        src += UNROLL;
+        for (i = 0; i < UNROLL; i++) {
+            if (tmpbuf[i] != dst[i]) {
+                bad=1;
+            }
+        }
+        if (bad) {
+            SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
+                     invocation, offset);
+            SkDebugf("  alpha 0x%x\n", alpha);
+            for (i = 0; i < UNROLL; i++)
+                SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
+                         i, ((tmpbuf[i] != dst[i])?"BAD":"got"), dst[i], tmpbuf[i],
+                         in_dst[i], src[i-8], td[i], tdv[i], tap[i], ta[i]);
+
+            showme16("alpha8", &alpha8, sizeof(alpha8));
+            showme16("scale8", &scale8, sizeof(scale8));
+            showme8("d", &d, sizeof(d));
+            showme16("dst8", &dst8, sizeof(dst8));
+            showme16("dst_b", &dst_b, sizeof(dst_b));
+            showme16("dst_g", &dst_g, sizeof(dst_g));
+            showme16("dst_r", &dst_r, sizeof(dst_r));
+            showme8("sb", &sb, sizeof(sb));
+            showme8("sg", &sg, sizeof(sg));
+            showme8("sr", &sr, sizeof(sr));
+
+            return;
+        }
+        offset += UNROLL;
+        invocation++;
+        }
+#endif
+        dst += UNROLL;
         count -= UNROLL;
-        /* skip x += UNROLL, since it's unchanged mod-4 */
+        // skip x += UNROLL, since it's unchanged mod-4
         } while (count >= UNROLL);
     }
 #undef    UNROLL
 
-    /* residuals */
+    // residuals
     if (count > 0) {
         DITHER_565_SCAN(y);
         do {
@@ -1218,7 +1352,11 @@ void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
         uint8x8_t sr, sg, sb;
         uint16x8_t dr, dg, db;
         uint16x8_t dst8;
+        uint8x8x4_t vsrc;
 
+#ifdef SK_CPU_ARM64
+        vsrc = sk_vld4_u8_arm64_3(src);
+#else
         {
         register uint8x8_t d0 asm("d0");
         register uint8x8_t d1 asm("d1");
@@ -1226,17 +1364,19 @@ void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
         register uint8x8_t d3 asm("d3");
 
         asm (
-            "vld4.8    {d0-d3},[%[src]]!  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
+            "vld4.8    {d0-d3},[%[src]]! "
             : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
             :
         );
-        sg = d1;
-#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
-        sr = d2; sb = d0;
-#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
-        sr = d0; sb = d2;
-#endif
+        vsrc.val[0] = d0;
+        vsrc.val[1] = d1;
+        vsrc.val[2] = d2;
         }
+#endif
+        sr = vsrc.val[NEON_R];
+        sg = vsrc.val[NEON_G];
+        sb = vsrc.val[NEON_B];
+
         /* XXX: if we want to prefetch, hide it in the above asm()
          * using the gcc __builtin_prefetch(), the prefetch will
          * fall to the bottom of the loop -- it won't stick up
@@ -1321,84 +1461,88 @@ void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
     unsigned colorA = SkGetPackedA32(color);
     if (255 == colorA) {
         sk_memset32(dst, color, count);
-    } else {
-        unsigned scale = 256 - SkAlpha255To256(colorA);
+        return;
+    }
 
-        if (count >= 8) {
-            // at the end of this assembly, count will have been decremented
-            // to a negative value. That is, if count mod 8 = x, it will be
-            // -8 +x coming out.
-            asm volatile (
-                PLD128(src, 0)
-
-                "vdup.32    q0, %[color]                \n\t"
-
-                PLD128(src, 128)
-
-                // scale numerical interval [0-255], so load as 8 bits
-                "vdup.8     d2, %[scale]                \n\t"
-
-                PLD128(src, 256)
-
-                "subs       %[count], %[count], #8      \n\t"
-
-                PLD128(src, 384)
-
-                "Loop_Color32:                          \n\t"
-
-                // load src color, 8 pixels, 4 64 bit registers
-                // (and increment src).
-                "vld1.32    {d4-d7}, [%[src]]!          \n\t"
-
-                PLD128(src, 384)
-
-                // multiply long by scale, 64 bits at a time,
-                // destination into a 128 bit register.
-                "vmull.u8   q4, d4, d2                  \n\t"
-                "vmull.u8   q5, d5, d2                  \n\t"
-                "vmull.u8   q6, d6, d2                  \n\t"
-                "vmull.u8   q7, d7, d2                  \n\t"
-
-                // shift the 128 bit registers, containing the 16
-                // bit scaled values back to 8 bits, narrowing the
-                // results to 64 bit registers.
-                "vshrn.i16  d8, q4, #8                  \n\t"
-                "vshrn.i16  d9, q5, #8                  \n\t"
-                "vshrn.i16  d10, q6, #8                 \n\t"
-                "vshrn.i16  d11, q7, #8                 \n\t"
-
-                // adding back the color, using 128 bit registers.
-                "vadd.i8    q6, q4, q0                  \n\t"
-                "vadd.i8    q7, q5, q0                  \n\t"
-
-                // store back the 8 calculated pixels (2 128 bit
-                // registers), and increment dst.
-                "vst1.32    {d12-d15}, [%[dst]]!        \n\t"
-
-                "subs       %[count], %[count], #8      \n\t"
-                "bge        Loop_Color32                \n\t"
-                : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
-                : [color] "r" (color), [scale] "r" (scale)
-                : "cc", "memory",
-                  "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
-                  "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15"
-                          );
-            // At this point, if we went through the inline assembly, count is
-            // a negative value:
-            // if the value is -8, there is no pixel left to process.
-            // if the value is -7, there is one pixel left to process
-            // ...
-            // And'ing it with 7 will give us the number of pixels
-            // left to process.
-            count = count & 0x7;
-        }
+    unsigned scale = 256 - SkAlpha255To256(colorA);
 
-        while (count > 0) {
-            *dst = color + SkAlphaMulQ(*src, scale);
-            src += 1;
-            dst += 1;
-            count--;
-        }
+    if (count >= 8) {
+        uint32x4_t vcolor;
+        uint8x8_t vscale;
+
+        vcolor = vdupq_n_u32(color);
+
+        // scale numerical interval [0-255], so load as 8 bits
+        vscale = vdup_n_u8(scale);
+
+        do {
+            // load src color, 8 pixels, 4 64 bit registers
+            // (and increment src).
+            uint32x2x4_t vsrc;
+#if defined(SK_CPU_ARM32) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)))
+            asm (
+                "vld1.32    %h[vsrc], [%[src]]!"
+                : [vsrc] "=w" (vsrc), [src] "+r" (src)
+                : :
+            );
+#else // 64bit targets and Clang
+            vsrc.val[0] = vld1_u32(src);
+            vsrc.val[1] = vld1_u32(src+2);
+            vsrc.val[2] = vld1_u32(src+4);
+            vsrc.val[3] = vld1_u32(src+6);
+            src += 8;
+#endif
+
+            // multiply long by scale, 64 bits at a time,
+            // destination into a 128 bit register.
+            uint16x8x4_t vtmp;
+            vtmp.val[0] = vmull_u8(vreinterpret_u8_u32(vsrc.val[0]), vscale);
+            vtmp.val[1] = vmull_u8(vreinterpret_u8_u32(vsrc.val[1]), vscale);
+            vtmp.val[2] = vmull_u8(vreinterpret_u8_u32(vsrc.val[2]), vscale);
+            vtmp.val[3] = vmull_u8(vreinterpret_u8_u32(vsrc.val[3]), vscale);
+
+            // shift the 128 bit registers, containing the 16
+            // bit scaled values back to 8 bits, narrowing the
+            // results to 64 bit registers.
+            uint8x16x2_t vres;
+            vres.val[0] = vcombine_u8(
+                            vshrn_n_u16(vtmp.val[0], 8),
+                            vshrn_n_u16(vtmp.val[1], 8));
+            vres.val[1] = vcombine_u8(
+                            vshrn_n_u16(vtmp.val[2], 8),
+                            vshrn_n_u16(vtmp.val[3], 8));
+
+            // adding back the color, using 128 bit registers.
+            uint32x4x2_t vdst;
+            vdst.val[0] = vreinterpretq_u32_u8(vres.val[0] +
+                                               vreinterpretq_u8_u32(vcolor));
+            vdst.val[1] = vreinterpretq_u32_u8(vres.val[1] +
+                                               vreinterpretq_u8_u32(vcolor));
+
+            // store back the 8 calculated pixels (2 128 bit
+            // registers), and increment dst.
+#if defined(SK_CPU_ARM32) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)))
+            asm (
+                "vst1.32    %h[vdst], [%[dst]]!"
+                : [dst] "+r" (dst)
+                : [vdst] "w" (vdst)
+                : "memory"
+            );
+#else // 64bit targets and Clang
+            vst1q_u32(dst, vdst.val[0]);
+            vst1q_u32(dst+4, vdst.val[1]);
+            dst += 8;
+#endif
+            count -= 8;
+
+        } while (count >= 8);
+    }
+
+    while (count > 0) {
+        *dst = color + SkAlphaMulQ(*src, scale);
+        src += 1;
+        dst += 1;
+        count--;
     }
 }
 
@@ -1406,12 +1550,13 @@ void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
 
 const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
     // no dither
-    // NOTE: For the S32_D565_Blend function below, we don't have a special
-    //       version that assumes that each source pixel is opaque. But our
-    //       S32A is still faster than the default, so use it.
     S32_D565_Opaque_neon,
-    S32A_D565_Blend_neon,   // really S32_D565_Blend
+    S32_D565_Blend_neon,
+#ifdef SK_CPU_ARM32
     S32A_D565_Opaque_neon,
+#else
+    NULL,
+#endif
     S32A_D565_Blend_neon,
 
     // dither
@@ -1439,5 +1584,9 @@ const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
 #else
     S32A_Opaque_BlitRow32_neon,     // S32A_Opaque,
 #endif
+#ifdef SK_CPU_ARM32
     S32A_Blend_BlitRow32_neon        // S32A_Blend
+#else
+    NULL
+#endif
 };
diff --git a/chromium/third_party/skia/src/opts/SkBlitRow_opts_mips_dsp.cpp b/chromium/third_party/skia/src/opts/SkBlitRow_opts_mips_dsp.cpp
new file mode 100644
index 00000000000..30bb4c2701a
--- /dev/null
+++ b/chromium/third_party/skia/src/opts/SkBlitRow_opts_mips_dsp.cpp
@@ -0,0 +1,848 @@
+/*
+ * Copyright 2014 The Android Open Source Project
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "SkBlitRow.h"
+#include "SkBlitMask.h"
+#include "SkColorPriv.h"
+#include "SkDither.h"
+#include "SkMathPriv.h"
+
+static void S32_D565_Blend_mips_dsp(uint16_t* SK_RESTRICT dst,
+                                    const SkPMColor* SK_RESTRICT src, int count,
+                                    U8CPU alpha, int /*x*/, int /*y*/) {  // Blends 32-bit src pixels into 16-bit dst pixels, weighted by alpha; x and y are unused.
+    register uint32_t t0, t1, t2, t3, t4, t5, t6;  // scratch operands for the asm loop
+    register uint32_t s0, s1, s2, s4, s5, s6;
+
+    alpha += 1;  // blend scale is alpha + 1; used by both the asm loop and the scalar tail
+    if (count >= 2) {  // the asm loop consumes two pixels per pass
+        __asm__ volatile (
+           ".set             push                          \n\t"
+           ".set             noreorder                     \n\t"  // branch delay slots below are filled by hand
+            "sll             %[s4],    %[alpha], 8         \n\t"
+            "or              %[s4],    %[s4],    %[alpha]  \n\t"  // s4 = (alpha << 8) | alpha
+            "repl.ph         %[s5],    0x1f                \n\t"  // s5 = 0x1f in each halfword (5-bit field mask)
+            "repl.ph         %[s6],    0x3f                \n\t"  // s6 = 0x3f in each halfword (6-bit field mask)
+        "1:                                                \n\t"
+            "lw              %[s2],    0(%[src])           \n\t"  // s2 = src[0]
+            "lw              %[s1],    4(%[src])           \n\t"  // s1 = src[1]
+            "lwr             %[s0],    0(%[dst])           \n\t"  // lwr/lwl pair: load dst[0..1] as one word without
+            "lwl             %[s0],    3(%[dst])           \n\t"  // requiring dst to be word aligned
+            "and             %[t1],    %[s0],    %[s5]     \n\t"  // t1 = bits 0-4 of each dst pixel
+            "shra.ph         %[t0],    %[s0],    5         \n\t"
+            "and             %[t2],    %[t0],    %[s6]     \n\t"  // t2 = bits 5-10 of each dst pixel
+#ifdef __MIPS_HAVE_DSPR2
+            "shrl.ph         %[t3],    %[s0],    11        \n\t"  // t3 = bits 11-15 of each dst pixel
+#else
+            "shra.ph         %[t0],    %[s0],    11        \n\t"
+            "and             %[t3],    %[t0],    %[s5]     \n\t"  // same field via arithmetic shift + mask
+#endif
+            "precrq.ph.w     %[t0],    %[s1],    %[s2]     \n\t"  // t0 = upper halfwords of src[1] : src[0]
+            "shrl.qb         %[t5],    %[t0],    3         \n\t"
+            "and             %[t4],    %[t5],    %[s5]     \n\t"  // t4 = (byte 2 of each src pixel) >> 3
+            "ins             %[s2],    %[s1],    16, 16    \n\t"  // s2 = lower halfwords of src[1] : src[0]
+            "preceu.ph.qbra  %[t0],    %[s2]               \n\t"
+            "shrl.qb         %[t6],    %[t0],    3         \n\t"  // t6 = (byte 0 of each src pixel) >> 3
+#ifdef __MIPS_HAVE_DSPR2
+            "shrl.ph         %[t5],    %[s2],    10        \n\t"  // t5 = (byte 1 of each src pixel) >> 2
+#else
+            "shra.ph         %[t0],    %[s2],    10        \n\t"
+            "and             %[t5],    %[t0],    %[s6]     \n\t"  // same field via arithmetic shift + mask
+#endif
+            "subu.qb         %[t4],    %[t4],    %[t1]     \n\t"  // per field: src - dst (byte-wise)
+            "subu.qb         %[t5],    %[t5],    %[t2]     \n\t"
+            "subu.qb         %[t6],    %[t6],    %[t3]     \n\t"
+            "muleu_s.ph.qbr  %[t4],    %[s4],    %[t4]     \n\t"  // per field: multiply by the scale bytes held in s4
+            "muleu_s.ph.qbr  %[t5],    %[s4],    %[t5]     \n\t"
+            "muleu_s.ph.qbr  %[t6],    %[s4],    %[t6]     \n\t"
+            "addiu           %[count], %[count], -2        \n\t"  // two pixels consumed
+            "addiu           %[src],   %[src],   8         \n\t"  // src += 2 pixels
+            "shra.ph         %[t4],    %[t4],    8         \n\t"  // per field: product >> 8
+            "shra.ph         %[t5],    %[t5],    8         \n\t"
+            "shra.ph         %[t6],    %[t6],    8         \n\t"
+            "addu.qb         %[t4],    %[t4],    %[t1]     \n\t"  // per field: add the dst field back
+            "addu.qb         %[t5],    %[t5],    %[t2]     \n\t"
+            "addu.qb         %[t6],    %[t6],    %[t3]     \n\t"
+            "andi            %[s0],    %[t4],    0xffff    \n\t"  // repack the low halfwords: t4 | (t5 << 5) | (t6 << 11)
+            "andi            %[t0],    %[t5],    0xffff    \n\t"
+            "sll             %[t0],    %[t0],    0x5       \n\t"
+            "or              %[s0],    %[s0],    %[t0]     \n\t"
+            "sll             %[t0],    %[t6],    0xb       \n\t"
+            "or              %[t0],    %[t0],    %[s0]     \n\t"
+            "sh              %[t0],    0(%[dst])           \n\t"  // store dst[0]
+            "srl             %[s1],    %[t4],    16        \n\t"  // repack the high halfwords the same way
+            "srl             %[t0],    %[t5],    16        \n\t"
+            "sll             %[t5],    %[t0],    5         \n\t"
+            "or              %[t0],    %[t5],    %[s1]     \n\t"
+            "srl             %[s0],    %[t6],    16        \n\t"
+            "sll             %[s2],    %[s0],    0xb       \n\t"
+            "or              %[s1],    %[s2],    %[t0]     \n\t"
+            "sh              %[s1],    2(%[dst])           \n\t"  // store dst[1]
+            "bge             %[count], 2,        1b        \n\t"  // loop while at least two pixels remain
+            " addiu          %[dst],   %[dst],   4         \n\t"  // branch delay slot: dst += 2 pixels
+            ".set            pop                           \n\t"
+            : [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+              [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [s0]"=&r"(s0),
+              [s1]"=&r"(s1), [s2]"=&r"(s2), [s4]"=&r"(s4), [s5]"=&r"(s5),
+              [s6]"=&r"(s6), [count]"+r"(count), [dst]"+r"(dst),
+              [src]"+r"(src)
+            : [alpha]"r"(alpha)
+            : "memory", "hi", "lo"
+        );
+    }
+
+    if (count == 1) {  // scalar tail for a single leftover pixel
+        SkPMColor c = *src++;
+        SkPMColorAssert(c);
+        SkASSERT(SkGetPackedA32(c) == 255);  // src pixels are expected to be opaque
+        uint16_t d = *dst;
+        *dst++ = SkPackRGB16(SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), alpha),  // alpha here is the incremented scale
+                             SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), alpha),
+                             SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), alpha));
+    }
+}
+
+static void S32A_D565_Opaque_Dither_mips_dsp(uint16_t* __restrict__ dst,
+                                             const SkPMColor* __restrict__ src,
+                                             int count, U8CPU alpha, int x, int y) {  // Dithers 32-bit src pixels and blends them into 16-bit dst pixels; alpha is not referenced in the body.
+    __asm__ volatile (
+        "pref  0,   0(%[src])     \n\t"  // prefetch the first src and dst bytes
+        "pref  1,   0(%[dst])     \n\t"  // (pref hint 0 = load, hint 1 = store)
+        "pref  0,   32(%[src])    \n\t"
+        "pref  1,   32(%[dst])    \n\t"
+        :
+        : [src]"r"(src), [dst]"r"(dst)
+        : "memory"
+    );
+
+    register int32_t t0, t1, t2, t3, t4, t5, t6;  // scratch operands for the asm loop
+    register int32_t t7, t8, t9, s0, s1, s2, s3;
+    const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];  // dither row for this scanline, one 4-bit entry per x & 3
+
+    if (count >= 2) {  // the asm loop consumes two pixels per pass
+        __asm__ volatile (
+            ".set            push                                \n\t"
+            ".set            noreorder                           \n\t"  // branch delay slots below are filled by hand
+            "li              %[s1],    0x01010101                \n\t"  // s1 = 0x0101 in each halfword
+            "li              %[s2],    -2017                     \n\t"  // s2 = 0xfffff81f, i.e. ~0x7e0
+        "1:                                                      \n\t"
+            "bnez            %[s3],    4f                        \n\t"  // NOTE(review): s3 is an output-only operand read here before anything writes it - confirm intent
+            " li             %[s3],    2                         \n\t"  // branch delay slot: executes whether or not the branch is taken
+            "pref            0,        64(%[src])                \n\t"
+            "pref            1,        64(%[dst])                \n\t"
+        "4:                                                      \n\t"
+            "addiu           %[s3],    %[s3],    -1              \n\t"
+            "lw              %[t1],    0(%[src])                 \n\t"  // t1 = src[0]
+            "andi            %[t3],    %[x],     0x3             \n\t"
+            "addiu           %[x],     %[x],     1               \n\t"
+            "sll             %[t4],    %[t3],    2               \n\t"
+            "srav            %[t5],    %[dither_scan], %[t4]     \n\t"
+            "andi            %[t3],    %[t5],    0xf             \n\t"  // t3 = dither entry for the first pixel's x
+            "lw              %[t2],    4(%[src])                 \n\t"  // t2 = src[1]
+            "andi            %[t4],    %[x],     0x3             \n\t"
+            "sll             %[t5],    %[t4],    2               \n\t"
+            "srav            %[t6],    %[dither_scan], %[t5]     \n\t"
+            "addiu           %[x],     %[x],     1               \n\t"
+            "ins             %[t3],    %[t6],    8,    4         \n\t"  // bits 8-11 of t3 = dither entry for the second pixel
+            "srl             %[t4],    %[t1],    24              \n\t"
+            "addiu           %[t0],    %[t4],    1               \n\t"  // t0 = (bits 24-31 of src[0]) + 1
+            "srl             %[t4],    %[t2],    24              \n\t"
+            "addiu           %[t5],    %[t4],    1               \n\t"
+            "ins             %[t0],    %[t5],    16,   16        \n\t"  // upper halfword of t0 = (bits 24-31 of src[1]) + 1
+            "muleu_s.ph.qbr  %[t4],    %[t3],    %[t0]           \n\t"  // per pixel: dither entry * (top byte + 1)
+            "preceu.ph.qbla  %[t3],    %[t4]                     \n\t"  // t3 = high byte of each product: the scaled dither d
+            "andi            %[t4],    %[t1],    0xff            \n\t"
+            "ins             %[t4],    %[t2],    16,   8         \n\t"  // t4 = byte 0 of src[1] : src[0]
+            "shrl.qb         %[t5],    %[t4],    5               \n\t"
+            "subu.qb         %[t6],    %[t3],    %[t5]           \n\t"
+            "addq.ph         %[t5],    %[t6],    %[t4]           \n\t"  // t5 = c + (d - (c >> 5)) for byte 0
+            "ext             %[t4],    %[t1],    8,    8         \n\t"
+            "srl             %[t6],    %[t2],    8               \n\t"
+            "ins             %[t4],    %[t6],    16,   8         \n\t"  // t4 = byte 1 of src[1] : src[0]
+            "shrl.qb         %[t6],    %[t4],    6               \n\t"
+            "shrl.qb         %[t7],    %[t3],    1               \n\t"
+            "subu.qb         %[t8],    %[t7],    %[t6]           \n\t"
+            "addq.ph         %[t6],    %[t8],    %[t4]           \n\t"  // t6 = c + ((d >> 1) - (c >> 6)) for byte 1
+            "ext             %[t4],    %[t1],    16,   8         \n\t"
+            "srl             %[t7],    %[t2],    16              \n\t"
+            "ins             %[t4],    %[t7],    16,   8         \n\t"  // t4 = byte 2 of src[1] : src[0]
+            "shrl.qb         %[t7],    %[t4],    5               \n\t"
+            "subu.qb         %[t8],    %[t3],    %[t7]           \n\t"
+            "addq.ph         %[t7],    %[t8],    %[t4]           \n\t"  // t7 = c + (d - (c >> 5)) for byte 2
+            "shll.ph         %[t4],    %[t7],    2               \n\t"
+            "andi            %[t9],    %[t4],    0xffff          \n\t"  // split every packed pair into a first-pixel value
+            "srl             %[s0],    %[t4],    16              \n\t"  // (low halfword) and a second-pixel value (high halfword)
+            "andi            %[t3],    %[t6],    0xffff          \n\t"
+            "srl             %[t4],    %[t6],    16              \n\t"
+            "andi            %[t6],    %[t5],    0xffff          \n\t"
+            "srl             %[t7],    %[t5],    16              \n\t"
+            "subq.ph         %[t5],    %[s1],    %[t0]           \n\t"  // per halfword: 0x0101 - (a + 1) = 256 - a
+            "srl             %[t0],    %[t5],    3               \n\t"  // t0 = dst scales (256 - a) >> 3; the first pixel's is masked with 0xff below
+            "beqz            %[t1],    3f                        \n\t"  // src[0] == 0: leave dst[0] untouched
+            " lhu            %[t5],    0(%[dst])                 \n\t"  // branch delay slot: t5 = dst[0]
+            "sll             %[t1],    %[t6],    13              \n\t"  // assemble expanded src: (byte 1 value << 24) | (byte 0 value << 13) | (byte 2 value << 2)
+            "or              %[t8],    %[t9],    %[t1]           \n\t"
+            "sll             %[t1],    %[t3],    24              \n\t"
+            "or              %[t9],    %[t1],    %[t8]           \n\t"
+            "andi            %[t3],    %[t5],    0x7e0           \n\t"  // expand dst: bits 5-10 move up 16 bits, the rest stays put
+            "sll             %[t6],    %[t3],    0x10            \n\t"
+            "and             %[t8],    %[s2],    %[t5]           \n\t"
+            "or              %[t5],    %[t6],    %[t8]           \n\t"
+            "andi            %[t6],    %[t0],    0xff            \n\t"  // first pixel's dst scale
+            "mul             %[t1],    %[t6],    %[t5]           \n\t"
+            "addu            %[t5],    %[t1],    %[t9]           \n\t"  // expanded dst * scale + expanded src
+            "srl             %[t6],    %[t5],    5               \n\t"
+            "and             %[t5],    %[s2],    %[t6]           \n\t"  // compact back to 16 bits
+            "srl             %[t8],    %[t6],    16              \n\t"
+            "andi            %[t6],    %[t8],    0x7e0           \n\t"
+            "or              %[t1],    %[t5],    %[t6]           \n\t"
+            "sh              %[t1],    0(%[dst])                 \n\t"  // store dst[0]
+        "3:                                                      \n\t"
+            "beqz            %[t2],    2f                        \n\t"  // src[1] == 0: leave dst[1] untouched
+            " lhu            %[t5],    2(%[dst])                 \n\t"  // branch delay slot: t5 = dst[1]
+            "sll             %[t1],    %[t7],    13              \n\t"  // same expand / scale / compact sequence for the second pixel
+            "or              %[t8],    %[s0],    %[t1]           \n\t"
+            "sll             %[t1],    %[t4],    24              \n\t"
+            "or              %[t9],    %[t1],    %[t8]           \n\t"
+            "andi            %[t3],    %[t5],    0x7e0           \n\t"
+            "sll             %[t6],    %[t3],    0x10            \n\t"
+            "and             %[t8],    %[s2],    %[t5]           \n\t"
+            "or              %[t5],    %[t6],    %[t8]           \n\t"
+            "srl             %[t6],    %[t0],    16              \n\t"  // second pixel's dst scale
+            "mul             %[t1],    %[t6],    %[t5]           \n\t"
+            "addu            %[t5],    %[t1],    %[t9]           \n\t"
+            "srl             %[t6],    %[t5],    5               \n\t"
+            "and             %[t5],    %[s2],    %[t6]           \n\t"
+            "srl             %[t8],    %[t6],    16              \n\t"
+            "andi            %[t6],    %[t8],    0x7e0           \n\t"
+            "or              %[t1],    %[t5],    %[t6]           \n\t"
+            "sh              %[t1],    2(%[dst])                 \n\t"  // store dst[1]
+        "2:                                                      \n\t"
+            "addiu           %[count], %[count], -2              \n\t"  // two pixels consumed
+            "addiu           %[src],   %[src],   8               \n\t"  // src += 2 pixels
+            "addiu           %[t1],    %[count], -1              \n\t"
+            "bgtz            %[t1],    1b                        \n\t"  // loop while count - 1 > 0, i.e. at least two pixels remain
+            " addiu          %[dst],  %[dst],    4               \n\t"  // branch delay slot: dst += 2 pixels
+            ".set            pop                                 \n\t"
+            : [src]"+r"(src), [count]"+r"(count), [dst]"+r"(dst), [x]"+r"(x),
+              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+              [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7),
+              [t8]"=&r"(t8),  [t9]"=&r"(t9), [s0]"=&r"(s0), [s1]"=&r"(s1),
+              [s2]"=&r"(s2), [s3]"=&r"(s3)
+            : [dither_scan]"r"(dither_scan)
+            : "memory", "hi", "lo"
+        );
+    }
+
+    if (count == 1) {  // scalar tail for a single leftover pixel; x was advanced inside the asm loop
+        SkPMColor c = *src++;
+        SkPMColorAssert(c);
+        if (c) {  // a zero src pixel leaves dst unchanged
+            unsigned a = SkGetPackedA32(c);
+            int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));  // dither entry scaled by src alpha
+
+            unsigned sr = SkGetPackedR32(c);
+            unsigned sg = SkGetPackedG32(c);
+            unsigned sb = SkGetPackedB32(c);
+            sr = SkDITHER_R32_FOR_565(sr, d);
+            sg = SkDITHER_G32_FOR_565(sg, d);
+            sb = SkDITHER_B32_FOR_565(sb, d);
+
+            uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
+            uint32_t dst_expanded = SkExpand_rgb_16(*dst);
+            dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);  // weight dst by the inverse alpha, reduced by 3 bits
+            // now src and dst expanded are in g:11 r:10 x:1 b:10
+            *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
+        }
+        dst += 1;
+        DITHER_INC_X(x);
+    }
+}
+
+static void S32_D565_Opaque_Dither_mips_dsp(uint16_t* __restrict__ dst,
+                                            const SkPMColor* __restrict__ src,
+                                            int count, U8CPU alpha, int x, int y) {  // Dithers opaque 32-bit src pixels down to 16-bit dst pixels; alpha is not referenced in the body.
+    uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];  // dither row for this scanline, one 4-bit entry per x & 3
+    register uint32_t t0, t1, t2, t3, t4, t5;  // scratch operands for the asm loop
+    register uint32_t t6, t7, t8, t9, s0;  // s0 is the dither-pair toggle, also read by the scalar tail
+    int dither[4];  // dither[i] = entry for the i-th pixel of each group of four
+    int i;
+
+    for (i = 0; i < 4; i++, x++) {  // note: x always ends up advanced by exactly 4
+        dither[i] = (dither_scan >> ((x & 3) << 2)) & 0xF;
+    }
+
+    __asm__ volatile (
+        ".set            push                          \n\t"
+        ".set            noreorder                     \n\t"  // branch delay slots below are filled by hand
+        "li              %[s0],    1                   \n\t"  // s0 != 0: next pair uses dither[0..1]; s0 == 0: dither[2..3]
+    "2:                                                \n\t"
+        "beqz            %[count], 1f                  \n\t"  // exit when no pixels remain
+        " nop                                          \n\t"
+        "addiu           %[t0],    %[count], -1        \n\t"
+        "beqz            %[t0],    1f                  \n\t"  // exit when exactly one pixel remains (scalar tail)
+        " nop                                          \n\t"
+        "beqz            %[s0],    3f                  \n\t"
+        " nop                                          \n\t"
+        "lw              %[t0],    0(%[dither])        \n\t"  // t0 = dither[0]
+        "lw              %[t1],    4(%[dither])        \n\t"  // t1 = dither[1]
+        "li              %[s0],    0                   \n\t"
+        "b               4f                            \n\t"
+        " nop                                          \n\t"
+    "3:                                                \n\t"
+        "lw              %[t0],    8(%[dither])        \n\t"  // t0 = dither[2]
+        "lw              %[t1],    12(%[dither])       \n\t"  // t1 = dither[3]
+        "li              %[s0],    1                   \n\t"
+    "4:                                                \n\t"
+        "sll             %[t2],    %[t0],    16        \n\t"
+        "or              %[t1],    %[t2],    %[t1]     \n\t"  // t1 = (first pixel's dither << 16) | second pixel's dither
+        "lw              %[t0],    0(%[src])           \n\t"  // t0 = src[0]
+        "lw              %[t2],    4(%[src])           \n\t"  // t2 = src[1]
+        "precrq.ph.w     %[t3],    %[t0],    %[t2]     \n\t"  // t3 = upper halfwords of src[0] : src[1]
+        "preceu.ph.qbra  %[t9],    %[t3]               \n\t"  // t9 = byte 2 of src[0] : src[1], one per halfword
+#ifdef __MIPS_HAVE_DSPR2
+        "append          %[t0],    %[t2],    16        \n\t"  // t0 = lower halfwords of src[0] : src[1]
+        "preceu.ph.qbra  %[t4],    %[t0]               \n\t"  // t4 = byte 0 of each pixel
+        "preceu.ph.qbla  %[t5],    %[t0]               \n\t"  // t5 = byte 1 of each pixel
+#else
+        "sll             %[t6],    %[t0],    16        \n\t"
+        "sll             %[t7],    %[t2],    16        \n\t"
+        "precrq.ph.w     %[t8],    %[t6],    %[t7]     \n\t"  // t8 = lower halfwords of src[0] : src[1]
+        "preceu.ph.qbra  %[t4],    %[t8]               \n\t"  // t4 = byte 0 of each pixel
+        "preceu.ph.qbla  %[t5],    %[t8]               \n\t"  // t5 = byte 1 of each pixel
+#endif
+        "addu.qb         %[t0],    %[t4],    %[t1]     \n\t"
+        "shra.ph         %[t2],    %[t4],    5         \n\t"
+        "subu.qb         %[t3],    %[t0],    %[t2]     \n\t"
+        "shra.ph         %[t6],    %[t3],    3         \n\t"  // t6 = (c + d - (c >> 5)) >> 3 for byte 0
+        "addu.qb         %[t0],    %[t9],    %[t1]     \n\t"
+        "shra.ph         %[t2],    %[t9],    5         \n\t"
+        "subu.qb         %[t3],    %[t0],    %[t2]     \n\t"
+        "shra.ph         %[t7],    %[t3],    3         \n\t"  // t7 = (c + d - (c >> 5)) >> 3 for byte 2
+        "shra.ph         %[t0],    %[t1],    1         \n\t"
+        "shra.ph         %[t2],    %[t5],    6         \n\t"
+        "addu.qb         %[t3],    %[t5],    %[t0]     \n\t"
+        "subu.qb         %[t4],    %[t3],    %[t2]     \n\t"
+        "shra.ph         %[t8],    %[t4],    2         \n\t"  // t8 = (c + (d >> 1) - (c >> 6)) >> 2 for byte 1
+        "precrq.ph.w     %[t0],    %[t6],    %[t7]     \n\t"  // t0 = first pixel's byte-0 : byte-2 results
+#ifdef __MIPS_HAVE_DSPR2
+        "append          %[t6],    %[t7],    16        \n\t"  // t6 = second pixel's byte-0 : byte-2 results
+#else
+        "sll             %[t6],    %[t6],    16        \n\t"
+        "sll             %[t2],    %[t7],    16        \n\t"
+        "precrq.ph.w     %[t6],    %[t6],    %[t2]     \n\t"  // t6 = second pixel's byte-0 : byte-2 results
+#endif
+        "sra             %[t4],    %[t8],    16        \n\t"  // t4 = first pixel's byte-1 result
+        "andi            %[t5],    %[t8],    0xFF      \n\t"  // t5 = second pixel's byte-1 result
+        "sll             %[t7],    %[t4],    5         \n\t"  // pack the first pixel into a 16-bit value
+        "sra             %[t8],    %[t0],    5         \n\t"
+        "or              %[t9],    %[t7],    %[t8]     \n\t"
+        "or              %[t3],    %[t9],    %[t0]     \n\t"
+        "andi            %[t4],    %[t3],    0xFFFF    \n\t"
+        "sll             %[t7],    %[t5],    5         \n\t"  // pack the second pixel the same way
+        "sra             %[t8],    %[t6],    5         \n\t"
+        "or              %[t9],    %[t7],    %[t8]     \n\t"
+        "or              %[t3],    %[t9],    %[t6]     \n\t"
+        "and             %[t7],    %[t3],    0xFFFF    \n\t"
+        "sh              %[t4],    0(%[dst])           \n\t"  // store dst[0]
+        "sh              %[t7],    2(%[dst])           \n\t"  // store dst[1]
+        "addiu           %[count], %[count], -2        \n\t"  // two pixels consumed
+        "addiu           %[src],   %[src],   8         \n\t"  // src += 2 pixels
+        "b               2b                            \n\t"
+        " addiu          %[dst],   %[dst],   4         \n\t"  // branch delay slot: dst += 2 pixels
+    "1:                                                \n\t"
+        ".set            pop                           \n\t"
+        : [dst]"+r"(dst), [src]"+r"(src), [count]"+r"(count),
+          [x]"+r"(x), [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2),
+          [t3]"=&r"(t3), [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6),
+          [t7]"=&r"(t7), [t8]"=&r"(t8), [t9]"=&r"(t9), [s0]"=&r"(s0)
+        : [dither] "r" (dither)
+        : "memory"
+    );
+
+    if (count == 1) {  // scalar tail for a single leftover pixel
+        SkPMColor c = *src++;
+        SkPMColorAssert(c); // only if DEBUG is turned on
+        SkASSERT(SkGetPackedA32(c) == 255);
+        unsigned tail_dither = s0 ? dither[0] : dither[2];  // x advanced by 4 no matter how many pixels the loop consumed, so pick the entry from the pair toggle: s0 names the pair the loop would use next
+        *dst++ = SkDitherRGB32To565(c, tail_dither);
+    }
+}
+
+static void S32_D565_Blend_Dither_mips_dsp(uint16_t* dst,
+                                           const SkPMColor* src,
+                                           int count, U8CPU alpha, int x, int y) {
+    register int32_t t0, t1, t2, t3, t4, t5, t6;
+    register int32_t s0, s1, s2, s3;
+    register int x1 = 0;
+    register uint32_t sc_mul;
+    register uint32_t sc_add;
+#ifdef ENABLE_DITHER_MATRIX_4X4
+    const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
+#else // ENABLE_DITHER_MATRIX_4X4
+    const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
+#endif // ENABLE_DITHER_MATRIX_4X4
+    int dither[4];
+
+    for (int i = 0; i < 4; i++) {
+        dither[i] = (dither_scan >> ((x & 3) << 2)) & 0xF;
+        x += 1;
+    }
+    alpha += 1;
+    __asm__ volatile (
+        ".set            push                              \n\t"
+        ".set            noreorder                         \n\t"
+        "li              %[t0],     0x100                  \n\t"
+        "subu            %[t0],     %[t0],     %[alpha]    \n\t"
+        "replv.ph        %[sc_mul], %[alpha]               \n\t"
+        "beqz            %[alpha],  1f                     \n\t"
+        " nop                                              \n\t"
+        "replv.qb        %[sc_add], %[t0]                  \n\t"
+        "b               2f                                \n\t"
+        " nop                                              \n\t"
+    "1:                                                    \n\t"
+        "replv.qb        %[sc_add], %[alpha]               \n\t"
+    "2:                                                    \n\t"
+        "addiu           %[t2],     %[count],  -1          \n\t"
+        "blez            %[t2],     3f                     \n\t"
+        " nop                                              \n\t"
+        "lw              %[s0],     0(%[src])              \n\t"
+        "lw              %[s1],     4(%[src])              \n\t"
+        "bnez            %[x1],     4f                     \n\t"
+        " nop                                              \n\t"
+        "lw              %[t0],     0(%[dither])           \n\t"
+        "lw              %[t1],     4(%[dither])           \n\t"
+        "li              %[x1],     1                      \n\t"
+        "b               5f                                \n\t"
+        " nop                                              \n\t"
+    "4:                                                    \n\t"
+        "lw              %[t0],     8(%[dither])           \n\t"
+        "lw              %[t1],     12(%[dither])          \n\t"
+        "li              %[x1],     0                      \n\t"
+    "5:                                                    \n\t"
+        "sll             %[t3],     %[t0],     7           \n\t"
+        "sll             %[t4],     %[t1],     7           \n\t"
+#ifdef __MIPS_HAVE_DSPR2
+        "append          %[t0],     %[t1],     16          \n\t"
+#else
+        "sll             %[t0],     %[t0],     8           \n\t"
+        "sll             %[t2],     %[t1],     8           \n\t"
+        "precrq.qb.ph    %[t0],     %[t0],     %[t2]       \n\t"
+#endif
+        "precrq.qb.ph    %[t1],     %[t3],     %[t4]       \n\t"
+        "sll             %[t5],     %[s0],     8           \n\t"
+        "sll             %[t6],     %[s1],     8           \n\t"
+        "precrq.qb.ph    %[t4],     %[t5],     %[t6]       \n\t"
+        "precrq.qb.ph    %[t6],     %[s0],     %[s1]       \n\t"
+        "preceu.ph.qbla  %[t5],     %[t4]                  \n\t"
+        "preceu.ph.qbra  %[t4],     %[t4]                  \n\t"
+        "preceu.ph.qbra  %[t6],     %[t6]                  \n\t"
+        "lh              %[t2],     0(%[dst])              \n\t"
+        "lh              %[s1],     2(%[dst])              \n\t"
+#ifdef __MIPS_HAVE_DSPR2
+        "append          %[t2],     %[s1],     16          \n\t"
+#else
+        "sll             %[s1],     %[s1],     16          \n\t"
+        "packrl.ph       %[t2],     %[t2],     %[s1]       \n\t"
+#endif
+        "shra.ph         %[s1],     %[t2],     11          \n\t"
+        "and             %[s1],     %[s1],     0x1F001F    \n\t"
+        "shra.ph         %[s2],     %[t2],     5           \n\t"
+        "and             %[s2],     %[s2],     0x3F003F    \n\t"
+        "and             %[s3],     %[t2],     0x1F001F    \n\t"
+        "shrl.qb         %[t3],     %[t4],     5           \n\t"
+        "addu.qb         %[t4],     %[t4],     %[t0]       \n\t"
+        "subu.qb         %[t4],     %[t4],     %[t3]       \n\t"
+        "shrl.qb         %[t4],     %[t4],     3           \n\t"
+        "shrl.qb         %[t3],     %[t5],     5           \n\t"
+        "addu.qb         %[t5],     %[t5],     %[t0]       \n\t"
+        "subu.qb         %[t5],     %[t5],     %[t3]       \n\t"
+        "shrl.qb         %[t5],     %[t5],     3           \n\t"
+        "shrl.qb         %[t3],     %[t6],     6           \n\t"
+        "addu.qb         %[t6],     %[t6],     %[t1]       \n\t"
+        "subu.qb         %[t6],     %[t6],     %[t3]       \n\t"
+        "shrl.qb         %[t6],     %[t6],     2           \n\t"
+        "cmpu.lt.qb      %[t4],     %[s1]                  \n\t"
+        "pick.qb         %[s0],     %[sc_add], $0          \n\t"
+        "addu.qb         %[s0],     %[s0],     %[s1]       \n\t"
+        "subu.qb         %[t4],     %[t4],     %[s1]       \n\t"
+        "muleu_s.ph.qbl  %[t0],     %[t4],     %[sc_mul]   \n\t"
+        "muleu_s.ph.qbr  %[t1],     %[t4],     %[sc_mul]   \n\t"
+        "precrq.qb.ph    %[t4],     %[t0],     %[t1]       \n\t"
+        "addu.qb         %[t4],     %[t4],     %[s0]       \n\t"
+        "cmpu.lt.qb      %[t5],     %[s3]                  \n\t"
+        "pick.qb         %[s0],     %[sc_add], $0          \n\t"
+        "addu.qb         %[s0],     %[s0],     %[s3]       \n\t"
+        "subu.qb         %[t5],     %[t5],     %[s3]       \n\t"
+        "muleu_s.ph.qbl  %[t0],     %[t5],     %[sc_mul]   \n\t"
+        "muleu_s.ph.qbr  %[t1],     %[t5],     %[sc_mul]   \n\t"
+        "precrq.qb.ph    %[t5],     %[t0],     %[t1]       \n\t"
+        "addu.qb         %[t5],     %[t5],     %[s0]       \n\t"
+        "cmpu.lt.qb      %[t6],     %[s2]                  \n\t"
+        "pick.qb         %[s0],     %[sc_add], $0          \n\t"
+        "addu.qb         %[s0],     %[s0],     %[s2]       \n\t"
+        "subu.qb         %[t6],     %[t6],     %[s2]       \n\t"
+        "muleu_s.ph.qbl  %[t0],     %[t6],     %[sc_mul]   \n\t"
+        "muleu_s.ph.qbr  %[t1],     %[t6],     %[sc_mul]   \n\t"
+        "precrq.qb.ph    %[t6],     %[t0],     %[t1]       \n\t"
+        "addu.qb         %[t6],     %[t6],     %[s0]       \n\t"
+        "shll.ph         %[s1],     %[t4],     11          \n\t"
+        "shll.ph         %[t0],     %[t6],     5           \n\t"
+        "or              %[s0],     %[s1],     %[t0]       \n\t"
+        "or              %[s1],     %[s0],     %[t5]       \n\t"
+        "srl             %[t2],     %[s1],     16          \n\t"
+        "and             %[t3],     %[s1],     0xFFFF      \n\t"
+        "sh              %[t2],     0(%[dst])              \n\t"
+        "sh              %[t3],     2(%[dst])              \n\t"
+        "addiu           %[src],    %[src],    8           \n\t"
+        "addi            %[count],  %[count],  -2          \n\t"
+        "b               2b                                \n\t"
+        " addu           %[dst],    %[dst],    4           \n\t"
+    "3:                                                    \n\t"
+        ".set            pop                               \n\t"
+        : [src]"+r"(src), [dst]"+r"(dst), [count]"+r"(count),
+          [x1]"+r"(x1), [sc_mul]"=&r"(sc_mul), [sc_add]"=&r"(sc_add),
+          [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+          [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [s0]"=&r"(s0),
+          [s1]"=&r"(s1), [s2]"=&r"(s2), [s3]"=&r"(s3)
+        : [dither]"r"(dither), [alpha]"r"(alpha)
+        : "memory", "hi", "lo"
+    );
+
+    if(count == 1) {
+        SkPMColor c = *src++;
+        SkPMColorAssert(c);
+        SkASSERT(SkGetPackedA32(c) == 255);
+        DITHER_565_SCAN(y);
+        int dither = DITHER_VALUE(x);
+        int sr = SkGetPackedR32(c);
+        int sg = SkGetPackedG32(c);
+        int sb = SkGetPackedB32(c);
+        sr = SkDITHER_R32To565(sr, dither);
+        sg = SkDITHER_G32To565(sg, dither);
+        sb = SkDITHER_B32To565(sb, dither);
+
+        uint16_t d = *dst;
+        *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), alpha),
+                             SkAlphaBlend(sg, SkGetPackedG16(d), alpha),
+                             SkAlphaBlend(sb, SkGetPackedB16(d), alpha));
+        DITHER_INC_X(x);
+    }
+}
+
+// Blends premultiplied 32-bit source pixels over an RGB565 destination row.
+//
+// The DSP assembly loop consumes two pixels per iteration and exits once fewer
+// than two remain; a single leftover pixel is then blended by the scalar tail
+// via SkSrcOver32To16(). The alpha, x and y arguments are not read here.
+//
+// dst   - destination row of 565 pixels, updated in place
+// src   - source row of premultiplied SkPMColor pixels
+// count - number of pixels to blend; count <= 0 does nothing
+static void S32A_D565_Opaque_mips_dsp(uint16_t* __restrict__ dst,
+                                      const SkPMColor* __restrict__ src,
+                                      int count, U8CPU alpha, int x, int y) {
+
+    // Warm the cache for the first 64 bytes of each row before the main loop.
+    __asm__ volatile (
+        "pref  0,  0(%[src])     \n\t"
+        "pref  1,  0(%[dst])     \n\t"
+        "pref  0,  32(%[src])    \n\t"
+        "pref  1,  32(%[dst])    \n\t"
+        :
+        : [src]"r"(src), [dst]"r"(dst)
+        : "memory"
+    );
+
+    register uint32_t t0, t1, t2, t3, t4, t5, t6, t7, t8;
+    // t16 is the prefetch countdown. The loop tests it ("bnez %[t16]") before
+    // ever writing it, so it must enter the asm with a defined value and be
+    // bound as a read-write ("+r") operand rather than a write-only output.
+    register uint32_t t16 = 0;
+    register uint32_t add_x10 = 0x100010;
+    register uint32_t add_x20 = 0x200020;
+    register uint32_t sa = 0xff00ff;
+
+    __asm__ volatile (
+        ".set           push                            \n\t"
+        ".set           noreorder                       \n\t"
+        "blez           %[count], 1f                    \n\t"
+        " nop                                           \n\t"
+    "2:                                                 \n\t"
+        "beqz           %[count], 1f                    \n\t"
+        " nop                                           \n\t"
+        "addiu          %[t0],    %[count], -1          \n\t"
+        "beqz           %[t0],    1f                    \n\t"
+        " nop                                           \n\t"
+        "bnez           %[t16],   3f                    \n\t"
+        " nop                                           \n\t"
+        "li             %[t16],   2                     \n\t"
+        "pref           0,        64(%[src])            \n\t"
+        "pref           1,        64(%[dst])            \n\t"
+    "3:                                                 \n\t"
+        "addiu          %[t16],   %[t16],   -1          \n\t"
+        "lw             %[t0],    0(%[src])             \n\t"
+        "lw             %[t1],    4(%[src])             \n\t"
+        "precrq.ph.w    %[t2],    %[t0],    %[t1]       \n\t"
+        "preceu.ph.qbra %[t8],    %[t2]                 \n\t"
+#ifdef __MIPS_HAVE_DSPR2
+        "append         %[t0],    %[t1],    16          \n\t"
+#else
+        "sll            %[t0],    %[t0],    16          \n\t"
+        "sll            %[t6],    %[t1],    16          \n\t"
+        "precrq.ph.w    %[t0],    %[t0],    %[t6]       \n\t"
+#endif
+        "preceu.ph.qbra %[t3],    %[t0]                 \n\t"
+        "preceu.ph.qbla %[t4],    %[t0]                 \n\t"
+        "preceu.ph.qbla %[t0],    %[t2]                 \n\t"
+        "subq.ph        %[t1],    %[sa],    %[t0]       \n\t"
+        "sra            %[t2],    %[t1],    8           \n\t"
+        "or             %[t5],    %[t2],    %[t1]       \n\t"
+        "replv.ph       %[t2],    %[t5]                 \n\t"
+        "lh             %[t0],    0(%[dst])             \n\t"
+        "lh             %[t1],    2(%[dst])             \n\t"
+        "and            %[t1],    %[t1],    0xffff      \n\t"
+#ifdef __MIPS_HAVE_DSPR2
+        "append         %[t0],    %[t1],    16          \n\t"
+#else
+        "sll            %[t5],    %[t0],    16          \n\t"
+        "or             %[t0],    %[t5],    %[t1]       \n\t"
+#endif
+        "and            %[t1],    %[t0],    0x1f001f    \n\t"
+        "shra.ph        %[t6],    %[t0],    11          \n\t"
+        "and            %[t6],    %[t6],    0x1f001f    \n\t"
+        "and            %[t7],    %[t0],    0x7e007e0   \n\t"
+        "shra.ph        %[t5],    %[t7],    5           \n\t"
+        "muleu_s.ph.qbl %[t0],    %[t2],    %[t6]       \n\t"
+        "addq.ph        %[t7],    %[t0],    %[add_x10]  \n\t"
+        "shra.ph        %[t6],    %[t7],    5           \n\t"
+        "addq.ph        %[t6],    %[t7],    %[t6]       \n\t"
+        "shra.ph        %[t0],    %[t6],    5           \n\t"
+        "addq.ph        %[t7],    %[t0],    %[t3]       \n\t"
+        "shra.ph        %[t6],    %[t7],    3           \n\t"
+        "muleu_s.ph.qbl %[t0],    %[t2],    %[t1]       \n\t"
+        "addq.ph        %[t7],    %[t0],    %[add_x10]  \n\t"
+        "shra.ph        %[t0],    %[t7],    5           \n\t"
+        "addq.ph        %[t7],    %[t7],    %[t0]       \n\t"
+        "shra.ph        %[t0],    %[t7],    5           \n\t"
+        "addq.ph        %[t7],    %[t0],    %[t8]       \n\t"
+        "shra.ph        %[t3],    %[t7],    3           \n\t"
+        "muleu_s.ph.qbl %[t0],    %[t2],    %[t5]       \n\t"
+        "addq.ph        %[t7],    %[t0],    %[add_x20]  \n\t"
+        "shra.ph        %[t0],    %[t7],    6           \n\t"
+        "addq.ph        %[t8],    %[t7],    %[t0]       \n\t"
+        "shra.ph        %[t0],    %[t8],    6           \n\t"
+        "addq.ph        %[t7],    %[t0],    %[t4]       \n\t"
+        "shra.ph        %[t8],    %[t7],    2           \n\t"
+        "shll.ph        %[t0],    %[t8],    5           \n\t"
+        "shll.ph        %[t1],    %[t6],    11          \n\t"
+        "or             %[t2],    %[t0],    %[t1]       \n\t"
+        "or             %[t3],    %[t2],    %[t3]       \n\t"
+        "sra            %[t4],    %[t3],    16          \n\t"
+        "sh             %[t4],    0(%[dst])             \n\t"
+        "sh             %[t3],    2(%[dst])             \n\t"
+        "addiu          %[count], %[count], -2          \n\t"
+        "addiu          %[src],   %[src],   8           \n\t"
+        "b              2b                              \n\t"
+        " addiu         %[dst],   %[dst],   4           \n\t"
+    "1:                                                 \n\t"
+        ".set           pop                             \n\t"
+        : [dst]"+r"(dst), [src]"+r"(src), [count]"+r"(count),
+          [t16]"+r"(t16), [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2),
+          [t3]"=&r"(t3), [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6),
+          [t7]"=&r"(t7), [t8]"=&r"(t8)
+        : [add_x10]"r"(add_x10), [add_x20]"r"(add_x20), [sa]"r"(sa)
+        : "memory", "hi", "lo"
+    );
+
+    // Scalar tail: the asm loop leaves count == 1 when the row length is odd.
+    if (count == 1) {
+        SkPMColor c = *src++;
+        SkPMColorAssert(c);
+        if (c) {
+            *dst = SkSrcOver32To16(c, *dst);
+        }
+        dst += 1;
+    }
+}
+
+// Blends premultiplied 32-bit source pixels onto an RGB565 destination row,
+// additionally scaled by a global alpha.
+//
+// The DSP assembly loop handles two pixels per iteration and stops once fewer
+// than two remain (it exits when count - 1 <= 0). A single leftover pixel is
+// blended by the scalar tail, which spells out the per-pixel formula:
+//   dst_scale = 255 - SkMulDiv255Round(src_alpha, alpha)
+//   channel   = SkDiv255Round(src565 * alpha + dst565 * dst_scale)
+// Unlike the tail, the assembly loop has no branch that skips zero source
+// pixels. The x and y arguments are unused.
+//
+// dst   - destination row of 565 pixels, updated in place
+// src   - source row of premultiplied SkPMColor pixels
+// count - number of pixels to blend
+// alpha - global alpha; only its low 8 bits are replicated by replv.qb
+static void S32A_D565_Blend_mips_dsp(uint16_t* SK_RESTRICT dst,
+                                     const SkPMColor* SK_RESTRICT src, int count,
+                                     U8CPU alpha, int /*x*/, int /*y*/) {
+    register uint32_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9;
+    register uint32_t  s0, s1, s2, s3;
+    // Scratch register for the asm block; bound below as a "+r" operand.
+    register unsigned dst_scale = 0;
+
+    // Loop-invariant setup before label 1: t0 = alpha replicated into all four
+    // bytes, t6 = 0x0080 in both halfwords, t7 = 0x00FF in both halfwords.
+    __asm__ volatile (
+        ".set            push                                       \n\t"
+        ".set            noreorder                                  \n\t"
+        "replv.qb        %[t0],        %[alpha]                     \n\t"
+        "repl.ph         %[t6],        0x80                         \n\t"
+        "repl.ph         %[t7],        0xFF                         \n\t"
+    "1:                                                             \n\t"
+        "addiu           %[t8],        %[count],     -1             \n\t"
+        "blez            %[t8],        2f                           \n\t"
+        " nop                                                       \n\t"
+        "lw              %[t8],        0(%[src])                    \n\t"
+        "lw              %[t9],        4(%[src])                    \n\t"
+        "lh              %[t4],        0(%[dst])                    \n\t"
+        "lh              %[t5],        2(%[dst])                    \n\t"
+        "sll             %[t5],        %[t5],        16             \n\t"
+        "sll             %[t2],        %[t8],        8              \n\t"
+        "sll             %[t3],        %[t9],        8              \n\t"
+        "precrq.qb.ph    %[t1],        %[t2],        %[t3]          \n\t"
+        "precrq.qb.ph    %[t3],        %[t8],        %[t9]          \n\t"
+        "preceu.ph.qbla  %[t8],        %[t3]                        \n\t"
+        "muleu_s.ph.qbr  %[s3],        %[t0],        %[t8]          \n\t"
+        "preceu.ph.qbla  %[t2],        %[t1]                        \n\t"
+        "preceu.ph.qbra  %[t1],        %[t1]                        \n\t"
+        "preceu.ph.qbra  %[t3],        %[t3]                        \n\t"
+        "packrl.ph       %[t9],        %[t4],        %[t5]          \n\t"
+        "shra.ph         %[s0],        %[t9],        11             \n\t"
+        "and             %[s0],        %[s0],        0x1F001F       \n\t"
+        "shra.ph         %[s1],        %[t9],        5              \n\t"
+        "and             %[s1],        %[s1],        0x3F003F       \n\t"
+        "and             %[s2],        %[t9],        0x1F001F       \n\t"
+        "addq.ph         %[s3],        %[s3],        %[t6]          \n\t"
+        "shra.ph         %[t5],        %[s3],        8              \n\t"
+        "and             %[t5],        %[t5],        0xFF00FF       \n\t"
+        "addq.ph         %[dst_scale], %[s3],        %[t5]          \n\t"
+        "shra.ph         %[dst_scale], %[dst_scale], 8              \n\t"
+        "subq_s.ph       %[dst_scale], %[t7],        %[dst_scale]   \n\t"
+        "sll             %[dst_scale], %[dst_scale], 8              \n\t"
+        "precrq.qb.ph    %[dst_scale], %[dst_scale], %[dst_scale]   \n\t"
+        "shrl.qb         %[t1],        %[t1],        3              \n\t"
+        "shrl.qb         %[t2],        %[t2],        3              \n\t"
+        "shrl.qb         %[t3],        %[t3],        2              \n\t"
+        "muleu_s.ph.qbl  %[t1],        %[t0],        %[t1]          \n\t"
+        "muleu_s.ph.qbl  %[t2],        %[t0],        %[t2]          \n\t"
+        "muleu_s.ph.qbl  %[t3],        %[t0],        %[t3]          \n\t"
+        "muleu_s.ph.qbl  %[t8],        %[dst_scale], %[s0]          \n\t"
+        "muleu_s.ph.qbl  %[t9],        %[dst_scale], %[s2]          \n\t"
+        "muleu_s.ph.qbl  %[t4],        %[dst_scale], %[s1]          \n\t"
+        "addq.ph         %[t1],        %[t1],        %[t8]          \n\t"
+        "addq.ph         %[t2],        %[t2],        %[t9]          \n\t"
+        "addq.ph         %[t3],        %[t3],        %[t4]          \n\t"
+        "addq.ph         %[t8],        %[t1],        %[t6]          \n\t"
+        "addq.ph         %[t9],        %[t2],        %[t6]          \n\t"
+        "addq.ph         %[t4],        %[t3],        %[t6]          \n\t"
+        "shra.ph         %[t1],        %[t8],        8              \n\t"
+        "addq.ph         %[t1],        %[t1],        %[t8]          \n\t"
+        "preceu.ph.qbla  %[t1],        %[t1]                        \n\t"
+        "shra.ph         %[t2],        %[t9],        8              \n\t"
+        "addq.ph         %[t2],        %[t2],        %[t9]          \n\t"
+        "preceu.ph.qbla  %[t2],        %[t2]                        \n\t"
+        "shra.ph         %[t3],        %[t4],        8              \n\t"
+        "addq.ph         %[t3],        %[t3],        %[t4]          \n\t"
+        "preceu.ph.qbla  %[t3],        %[t3]                        \n\t"
+        "shll.ph         %[t8],        %[t1],        11             \n\t"
+        "shll.ph         %[t9],        %[t3],        5              \n\t"
+        "or              %[t8],        %[t8],        %[t9]          \n\t"
+        "or              %[s0],        %[t8],        %[t2]          \n\t"
+        "srl             %[t8],        %[s0],        16             \n\t"
+        "and             %[t9],        %[s0],        0xFFFF         \n\t"
+        "sh              %[t8],        0(%[dst])                    \n\t"
+        "sh              %[t9],        2(%[dst])                    \n\t"
+        "addiu           %[src],       %[src],       8              \n\t"
+        "addiu           %[count],     %[count],     -2             \n\t"
+        "b               1b                                         \n\t"
+        " addiu          %[dst],       %[dst],       4              \n\t"
+    "2:                                                             \n\t"
+        ".set            pop                                        \n\t"
+        : [src]"+r"(src), [dst]"+r"(dst), [count]"+r"(count),
+          [dst_scale]"+r"(dst_scale), [s0]"=&r"(s0), [s1]"=&r"(s1),
+          [s2]"=&r"(s2), [s3]"=&r"(s3), [t0]"=&r"(t0), [t1]"=&r"(t1),
+          [t2]"=&r"(t2), [t3]"=&r"(t3), [t4]"=&r"(t4), [t5]"=&r"(t5),
+          [t6]"=&r"(t6), [t7]"=&r"(t7), [t8]"=&r"(t8), [t9]"=&r"(t9)
+        : [alpha]"r"(alpha)
+        : "memory", "hi", "lo"
+    );
+
+    // Scalar tail for the odd last pixel; fully transparent source is skipped.
+    if (count == 1) {
+        SkPMColor sc = *src++;
+        SkPMColorAssert(sc);
+        if (sc) {
+            uint16_t dc = *dst;
+            // Note: this local shadows the asm scratch variable of the same name.
+            unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
+            unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) +
+                          SkMulS16(SkGetPackedR16(dc), dst_scale);
+            unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) +
+                          SkMulS16(SkGetPackedG16(dc), dst_scale);
+            unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) +
+                          SkMulS16(SkGetPackedB16(dc), dst_scale);
+            *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
+        }
+        dst += 1;
+    }
+}
+
+// Blends a row of 32-bit source pixels into a 32-bit destination row using a
+// single global alpha, one pixel per loop iteration.
+//
+// With src_scale = alpha + 1 and dst_scale = 256 - src_scale, every byte lane
+// becomes ((src * src_scale) >> 8) + ((dst * dst_scale) >> 8).
+//
+// dst   - destination row, updated in place
+// src   - source row
+// count - number of pixels to blend; count <= 0 does nothing
+// alpha - global alpha. replv.qb replicates only the low 8 bits of each
+//         scale, so alpha == 255 (src_scale == 256) would yield a zero scale;
+//         NOTE(review): presumably callers select this proc only for
+//         alpha < 255 - confirm against the dispatch code.
+static void S32_Blend_BlitRow32_mips_dsp(SkPMColor* SK_RESTRICT dst,
+                                         const SkPMColor* SK_RESTRICT src,
+                                         int count, U8CPU alpha) {
+    register int32_t t0, t1, t2, t3, t4, t5, t6, t7;
+
+    __asm__ volatile (
+        ".set            push                         \n\t"
+        ".set            noreorder                    \n\t"
+        "li              %[t2],    0x100              \n\t"
+        "addiu           %[t0],    %[alpha], 1        \n\t"
+        "subu            %[t1],    %[t2],    %[t0]    \n\t"
+        "replv.qb        %[t7],    %[t0]              \n\t"
+        "replv.qb        %[t6],    %[t1]              \n\t"
+    "1:                                               \n\t"
+        // The exit branch needs an explicit delay-slot nop under noreorder:
+        // otherwise the following lw runs in the delay slot even when the
+        // loop exits, reading one pixel past the end of src.
+        "blez            %[count], 2f                 \n\t"
+        " nop                                         \n\t"
+        "lw              %[t0],    0(%[src])          \n\t"
+        "lw              %[t1],    0(%[dst])          \n\t"
+        "preceu.ph.qbr   %[t2],    %[t0]              \n\t"
+        "preceu.ph.qbl   %[t3],    %[t0]              \n\t"
+        "preceu.ph.qbr   %[t4],    %[t1]              \n\t"
+        "preceu.ph.qbl   %[t5],    %[t1]              \n\t"
+        "muleu_s.ph.qbr  %[t2],    %[t7],    %[t2]    \n\t"
+        "muleu_s.ph.qbr  %[t3],    %[t7],    %[t3]    \n\t"
+        "muleu_s.ph.qbr  %[t4],    %[t6],    %[t4]    \n\t"
+        "muleu_s.ph.qbr  %[t5],    %[t6],    %[t5]    \n\t"
+        "addiu           %[src],   %[src],   4        \n\t"
+        "addiu           %[count], %[count], -1       \n\t"
+        "precrq.qb.ph    %[t0],    %[t3],    %[t2]    \n\t"
+        "precrq.qb.ph    %[t2],    %[t5],    %[t4]    \n\t"
+        "addu            %[t1],    %[t0],    %[t2]    \n\t"
+        "sw              %[t1],    0(%[dst])          \n\t"
+        "b               1b                           \n\t"
+        // Pointer arithmetic uses the non-trapping addiu, matching the other
+        // blit routines in this file.
+        " addiu          %[dst],   %[dst],   4        \n\t"
+    "2:                                               \n\t"
+        ".set            pop                          \n\t"
+        : [src]"+r"(src), [dst]"+r"(dst), [count]"+r"(count),
+          [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+          [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
+        : [alpha]"r"(alpha)
+        : "memory", "hi", "lo"
+    );
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MIPS DSP blit procs for RGB565 destinations. SkBlitRow::PlatformProcs565()
+// indexes this table directly with its flags argument, so entry order matters.
+// A NULL entry means no DSP routine is supplied for that combination
+// (presumably the caller then uses a portable implementation - confirm).
+const SkBlitRow::Proc platform_565_procs_mips_dsp[] = {
+    // no dither
+    NULL,
+    S32_D565_Blend_mips_dsp,
+    S32A_D565_Opaque_mips_dsp,
+    S32A_D565_Blend_mips_dsp,
+
+    // dither
+    S32_D565_Opaque_Dither_mips_dsp,
+    S32_D565_Blend_Dither_mips_dsp,
+    S32A_D565_Opaque_Dither_mips_dsp,
+    NULL,
+};
+
+// MIPS DSP blit procs for 32-bit destinations. SkBlitRow::PlatformProcs32()
+// indexes this table directly with its flags argument. Only the S32_Blend
+// slot has a DSP routine; the NULL slots have none.
+static const SkBlitRow::Proc32 platform_32_procs_mips_dsp[] = {
+    NULL,   // S32_Opaque,
+    S32_Blend_BlitRow32_mips_dsp,   // S32_Blend,
+    NULL,   // S32A_Opaque,
+    NULL,   // S32A_Blend,
+};
+
+SkBlitRow::Proc SkBlitRow::PlatformProcs565(unsigned flags) {
+    // Look up the MIPS DSP 565 proc for this flag combination; the table
+    // entry may be NULL. flags is used as the index without a range check.
+    const SkBlitRow::Proc selected = platform_565_procs_mips_dsp[flags];
+    return selected;
+}
+
+SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
+    // Look up the MIPS DSP 32-bit proc for this flag combination; the table
+    // entry may be NULL. flags is used as the index without a range check.
+    const SkBlitRow::Proc32 selected = platform_32_procs_mips_dsp[flags];
+    return selected;
+}
+
+// No MIPS DSP color-rect proc is provided; always returns NULL.
+SkBlitRow::ColorRectProc PlatformColorRectProcFactory() {
+    return NULL;
+}
+
+// No MIPS DSP color proc is provided; always returns NULL.
+SkBlitRow::ColorProc SkBlitRow::PlatformColorProc() {
+    return NULL;
+}
diff --git a/chromium/third_party/skia/src/opts/SkBlurImage_opts_SSE2.cpp b/chromium/third_party/skia/src/opts/SkBlurImage_opts_SSE2.cpp
index 93830d78b46..bbc6a66462e 100644
--- a/chromium/third_party/skia/src/opts/SkBlurImage_opts_SSE2.cpp
+++ b/chromium/third_party/skia/src/opts/SkBlurImage_opts_SSE2.cpp
@@ -5,36 +5,31 @@
  * found in the LICENSE file.
  */
 
-
+#include <emmintrin.h>
 #include "SkBitmap.h"
-#include "SkColorPriv.h"
 #include "SkBlurImage_opts_SSE2.h"
+#include "SkColorPriv.h"
 #include "SkRect.h"
 
-#include <emmintrin.h>
-
 namespace {
-
 enum BlurDirection {
     kX, kY
 };
 
-/**
- * Helper function to spread the components of a 32-bit integer into the
+/* Helper function to spread the components of a 32-bit integer into the
  * lower 8 bits of each 32-bit element of an SSE register.
  */
-
 inline __m128i expand(int a) {
-      const __m128i zero = _mm_setzero_si128();
+    const __m128i zero = _mm_setzero_si128();
 
-      // 0 0 0 0   0 0 0 0   0 0 0 0   A R G B
-      __m128i result = _mm_cvtsi32_si128(a);
+    // 0 0 0 0   0 0 0 0   0 0 0 0   A R G B
+    __m128i result = _mm_cvtsi32_si128(a);
 
-      // 0 0 0 0   0 0 0 0   0 A 0 R   0 G 0 B
-      result = _mm_unpacklo_epi8(result, zero);
+    // 0 0 0 0   0 0 0 0   0 A 0 R   0 G 0 B
+    result = _mm_unpacklo_epi8(result, zero);
 
-      // 0 0 0 A   0 0 0 R   0 0 0 G   0 0 0 B
-      return _mm_unpacklo_epi16(result, zero);
+    // 0 0 0 A   0 0 0 R   0 0 0 G   0 0 0 B
+    return _mm_unpacklo_epi16(result, zero);
 }
 
 template<BlurDirection srcDirection, BlurDirection dstDirection>
diff --git a/chromium/third_party/skia/src/opts/SkBlurImage_opts_SSE2.h b/chromium/third_party/skia/src/opts/SkBlurImage_opts_SSE2.h
index c8deea4bb9c..db104bacf4f 100644
--- a/chromium/third_party/skia/src/opts/SkBlurImage_opts_SSE2.h
+++ b/chromium/third_party/skia/src/opts/SkBlurImage_opts_SSE2.h
@@ -5,9 +5,14 @@
  * found in the LICENSE file.
  */
 
+#ifndef SkBlurImage_opts_SSE2_DEFINED
+#define SkBlurImage_opts_SSE2_DEFINED
+
 #include "SkBlurImage_opts.h"
 
 bool SkBoxBlurGetPlatformProcs_SSE2(SkBoxBlurProc* boxBlurX,
                                     SkBoxBlurProc* boxBlurY,
                                     SkBoxBlurProc* boxBlurXY,
                                     SkBoxBlurProc* boxBlurYX);
+
+#endif
diff --git a/chromium/third_party/skia/src/opts/SkBlurImage_opts_arm.cpp b/chromium/third_party/skia/src/opts/SkBlurImage_opts_arm.cpp
new file mode 100644
index 00000000000..10d595afa59
--- /dev/null
+++ b/chromium/third_party/skia/src/opts/SkBlurImage_opts_arm.cpp
@@ -0,0 +1,25 @@
+/*
+ * Copyright 2014 ARM Ltd.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "SkBlurImage_opts_neon.h"
+#include "SkUtilsArm.h"
+
+// Selects the ARM box-blur procs. Returns false when NEON is compiled out or,
+// for dynamic-detection builds, when the running CPU reports no NEON support;
+// otherwise forwards to SkBoxBlurGetPlatformProcs_NEON() and returns its result.
+bool SkBoxBlurGetPlatformProcs(SkBoxBlurProc* boxBlurX,
+                               SkBoxBlurProc* boxBlurY,
+                               SkBoxBlurProc* boxBlurXY,
+                               SkBoxBlurProc* boxBlurYX) {
+#if SK_ARM_NEON_IS_NONE
+    return false;
+#elif SK_ARM_NEON_IS_DYNAMIC
+    // Runtime probe first; && short-circuits so the NEON setup is skipped
+    // when the CPU lacks NEON.
+    return sk_cpu_arm_has_neon() &&
+           SkBoxBlurGetPlatformProcs_NEON(boxBlurX, boxBlurY, boxBlurXY, boxBlurYX);
+#else
+    return SkBoxBlurGetPlatformProcs_NEON(boxBlurX, boxBlurY, boxBlurXY, boxBlurYX);
+#endif
+}
diff --git a/chromium/third_party/skia/src/opts/SkBlurImage_opts_neon.cpp b/chromium/third_party/skia/src/opts/SkBlurImage_opts_neon.cpp
index 4e33d72d462..08187f3e55e 100644
--- a/chromium/third_party/skia/src/opts/SkBlurImage_opts_neon.cpp
+++ b/chromium/third_party/skia/src/opts/SkBlurImage_opts_neon.cpp
@@ -20,6 +20,86 @@ enum BlurDirection {
 };
 
 /**
+ * Helper function to load 2 pixels from different rows to an 8x8 NEON register
+ * and also pre-load pixels for future read
+ */
+template<BlurDirection srcDirection>
+inline uint8x8_t load_2_pixels(const SkPMColor* src, int srcStride) {
+    if (srcDirection != kX) {
+        // The two pixels are adjacent in memory: one 8-byte load fetches both.
+        return vld1_u8((uint8_t*)src);
+    }
+
+    // Prefetch 16 pixels ahead on both rows (noted upstream as ~10% faster).
+    SK_PREFETCH(src + 16);
+    SK_PREFETCH(src + srcStride + 16);
+
+    // Lane 0 takes the pixel at src, lane 1 the pixel one stride further on.
+    uint32x2_t rowPair = vdup_n_u32(0);
+    rowPair = vld1_lane_u32(src, rowPair, 0);
+    rowPair = vld1_lane_u32(src + srcStride, rowPair, 1);
+    return vreinterpret_u8_u32(rowPair);
+}
+
+/**
+ * Helper function to store the low 8-bits from a 16x8 NEON register to 2 rows
+ */
+template<BlurDirection dstDirection>
+inline void store_2_pixels(uint16x8_t result16x8, SkPMColor* dst, int dstStride) {
+    // Keep only the low byte of each 16-bit lane: 8 bytes, i.e. two pixels.
+    const uint8x8_t packed = vmovn_u16(result16x8);
+
+    if (dstDirection != kX) {
+        // Both pixels are adjacent in the destination: one 8-byte store.
+        vst1_u8((uint8_t*)dst, packed);
+        return;
+    }
+
+    // First pixel goes to dst, the second one stride further on.
+    const uint32x2_t pixelPair = vreinterpret_u32_u8(packed);
+    vst1_lane_u32(dst, pixelPair, 0);
+    vst1_lane_u32(dst + dstStride, pixelPair, 1);
+}
+
+/**
+ * fast path for kernel size less than 128
+ */
+template<BlurDirection srcDirection, BlurDirection dstDirection>
+void SkDoubleRowBoxBlur_NEON(const SkPMColor** src, int srcStride, SkPMColor** dst, int kernelSize,
+                        int leftOffset, int rightOffset, int width, int* height)
+{
+    // Box-blurs two rows at a time, keeping the running sums in 16-bit lanes.
+    //
+    // In/out parameters: *src and *dst are advanced past every row pair that
+    // was processed, and *height is reduced by the number of rows consumed,
+    // so the caller can finish any remaining rows with its general path.
+    //
+    // The scale (1 << 15) / kernelSize is fed to vqrdmulhq_s16 as a signed
+    // 16-bit value. For kernelSize == 1 it is 0x8000, which reinterprets as
+    // -32768 and would negate every output. Leave such rows untouched so the
+    // caller's general path handles them. (The caller is responsible for the
+    // upper bound on kernelSize.)
+    if (kernelSize < 2) {
+        return;
+    }
+
+    const int rightBorder = SkMin32(rightOffset + 1, width);
+    const int srcStrideX = srcDirection == kX ? 1 : srcStride;
+    // Captured before the loop: *height is decremented as rows are consumed.
+    const int dstStrideX = dstDirection == kX ? 1 : *height;
+    const int srcStrideY = srcDirection == kX ? srcStride : 1;
+    const int dstStrideY = dstDirection == kX ? width : 1;
+    const uint16x8_t scale = vdupq_n_u16((1 << 15) / kernelSize);
+
+    for (; *height >= 2; *height -= 2) {
+        // Prime the running sum with the pixels initially inside the window.
+        uint16x8_t sum = vdupq_n_u16(0);
+        const SkPMColor* p = *src;
+        for (int i = 0; i < rightBorder; i++) {
+            sum = vaddw_u8(sum,
+                load_2_pixels<srcDirection>(p, srcStride));
+            p += srcStrideX;
+        }
+
+        const SkPMColor* sptr = *src;
+        SkPMColor* dptr = *dst;
+        for (int x = 0; x < width; x++) {
+            // val = (sum * scale * 2 + 0x8000) >> 16
+            uint16x8_t resultPixels = vreinterpretq_u16_s16(vqrdmulhq_s16(
+                vreinterpretq_s16_u16(sum), vreinterpretq_s16_u16(scale)));
+            store_2_pixels<dstDirection>(resultPixels, dptr, width);
+
+            // Slide the window: drop the pixel leaving on the left ...
+            if (x >= leftOffset) {
+                sum = vsubw_u8(sum,
+                    load_2_pixels<srcDirection>(sptr - leftOffset * srcStrideX, srcStride));
+            }
+            // ... and add the pixel entering on the right, if still in range.
+            if (x + rightOffset + 1 < width) {
+                sum = vaddw_u8(sum,
+                    load_2_pixels<srcDirection>(sptr + (rightOffset + 1) * srcStrideX, srcStride));
+            }
+            sptr += srcStrideX;
+            dptr += dstStrideX;
+        }
+        *src += srcStrideY * 2;
+        *dst += dstStrideY * 2;
+    }
+}
+
+
+/**
  * Helper function to spread the components of a 32-bit integer into the
  * lower 8 bits of each 16-bit element of a NEON register.
  */
@@ -42,7 +122,14 @@ void SkBoxBlur_NEON(const SkPMColor* src, int srcStride, SkPMColor* dst, int ker
     const int dstStrideY = dstDirection == kX ? width : 1;
     const uint32x4_t scale = vdupq_n_u32((1 << 24) / kernelSize);
     const uint32x4_t half = vdupq_n_u32(1 << 23);
-    for (int y = 0; y < height; ++y) {
+
+    if (kernelSize < 128)
+    {
+        SkDoubleRowBoxBlur_NEON<srcDirection, dstDirection>(&src, srcStride, &dst, kernelSize,
+            leftOffset, rightOffset, width, &height);
+    }
+
+    for (; height > 0; height--) {
         uint32x4_t sum = vdupq_n_u32(0);
         const SkPMColor* p = src;
         for (int i = 0; i < rightBorder; ++i) {
@@ -77,8 +164,8 @@ void SkBoxBlur_NEON(const SkPMColor* src, int srcStride, SkPMColor* dst, int ker
                 sum = vaddw_u16(sum, expand(*r));
             }
             sptr += srcStrideX;
-            if (srcDirection == kY) {
-                SK_PREFETCH(sptr + (rightOffset + 1) * srcStrideX);
+            if (srcDirection == kX) {
+                SK_PREFETCH(sptr + (rightOffset + 16) * srcStrideX);
             }
             dptr += dstStrideX;
         }
diff --git a/chromium/third_party/skia/src/opts/SkCachePreload_arm.h b/chromium/third_party/skia/src/opts/SkCachePreload_arm.h
deleted file mode 100644
index cff8c2a9b79..00000000000
--- a/chromium/third_party/skia/src/opts/SkCachePreload_arm.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright 2012 The Android Open Source Project
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-#ifndef SkCachePreload_arm_DEFINED
-#define SkCachePreload_arm_DEFINED
-
-// This file defines macros for preload instructions for ARM. These macros
-// are designed to be embedded inside GNU inline assembly.
-// For the use of these macros, __ARM_USE_PLD needs to be enabled. The cache
-// line size also needs to be known (and needs to be contained inside
-// __ARM_CACHE_LINE_SIZE).
-#if defined(__ARM_USE_PLD)
-
-#define PLD(x, n)           "pld        [%["#x"], #("#n")]\n\t"
-
-#if __ARM_CACHE_LINE_SIZE == 32
-    #define PLD64(x, n)      PLD(x, n) PLD(x, (n) + 32)
-#elif __ARM_CACHE_LINE_SIZE == 64
-    #define PLD64(x, n)      PLD(x, n)
-#else
-    #error "unknown __ARM_CACHE_LINE_SIZE."
-#endif
-#else
-    // PLD is disabled, all macros become empty.
-    #define PLD(x, n)
-    #define PLD64(x, n)
-#endif
-
-#define PLD128(x, n)         PLD64(x, n) PLD64(x, (n) + 64)
-
-#endif  // SkCachePreload_arm_DEFINED
diff --git a/chromium/third_party/skia/src/opts/SkColor_opts_SSE2.h b/chromium/third_party/skia/src/opts/SkColor_opts_SSE2.h
new file mode 100644
index 00000000000..7e61d526b3b
--- /dev/null
+++ b/chromium/third_party/skia/src/opts/SkColor_opts_SSE2.h
@@ -0,0 +1,186 @@
+/*
+ * Copyright 2014 The Android Open Source Project
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkColor_opts_SSE2_DEFINED
+#define SkColor_opts_SSE2_DEFINED
+
+#include <emmintrin.h>
+
+// SSE2 has no _mm_mul_epi32(), so we emulate it here.
+// Multiplies 4 32-bit integers from a by 4 32-bit integers from b.
+// Each of the 4 multiplication results must be representable in a 32-bit
+// integer; otherwise it overflows.
+static inline  __m128i Multiply32_SSE2(const __m128i& a, const __m128i& b) {
+    // Calculate results of a0 * b0 and a2 * b2.
+    __m128i r1 = _mm_mul_epu32(a, b);
+    // Calculate results of a1 * b1 and a3 * b3.
+    __m128i r2 = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4));
+    // Shuffle results to [63..0] and interleave the results.
+    __m128i r = _mm_unpacklo_epi32(_mm_shuffle_epi32(r1, _MM_SHUFFLE(0,0,2,0)),
+                                   _mm_shuffle_epi32(r2, _MM_SHUFFLE(0,0,2,0)));
+    return r;
+}
+
+static inline __m128i SkAlpha255To256_SSE2(const __m128i& alpha) {
+    return _mm_add_epi32(alpha, _mm_set1_epi32(1));
+}
+
+// See #define SkAlphaMulAlpha(a, b)  SkMulDiv255Round(a, b) in SkXfermode.cpp.
+static inline __m128i SkAlphaMulAlpha_SSE2(const __m128i& a,
+                                           const __m128i& b) {
+    __m128i prod = _mm_mullo_epi16(a, b);
+    prod = _mm_add_epi32(prod, _mm_set1_epi32(128));
+    prod = _mm_add_epi32(prod, _mm_srli_epi32(prod, 8));
+    prod = _mm_srli_epi32(prod, 8);
+
+    return prod;
+}
+
+// Portable version SkAlphaMulQ is in SkColorPriv.h.
+static inline __m128i SkAlphaMulQ_SSE2(const __m128i& c, const __m128i& scale) {
+    __m128i mask = _mm_set1_epi32(0xFF00FF);
+    __m128i s = _mm_or_si128(_mm_slli_epi32(scale, 16), scale);
+
+    // uint32_t rb = ((c & mask) * scale) >> 8
+    __m128i rb = _mm_and_si128(mask, c);
+    rb = _mm_mullo_epi16(rb, s);
+    rb = _mm_srli_epi16(rb, 8);
+
+    // uint32_t ag = ((c >> 8) & mask) * scale
+    __m128i ag = _mm_srli_epi16(c, 8);
+    ag = _mm_and_si128(ag, mask);
+    ag = _mm_mullo_epi16(ag, s);
+
+    // (rb & mask) | (ag & ~mask)
+    rb = _mm_and_si128(mask, rb);
+    ag = _mm_andnot_si128(mask, ag);
+    return _mm_or_si128(rb, ag);
+}
+
+static inline __m128i SkGetPackedA32_SSE2(const __m128i& src) {
+    __m128i a = _mm_slli_epi32(src, (24 - SK_A32_SHIFT));
+    return _mm_srli_epi32(a, 24);
+}
+
+static inline __m128i SkGetPackedR32_SSE2(const __m128i& src) {
+    __m128i r = _mm_slli_epi32(src, (24 - SK_R32_SHIFT));
+    return _mm_srli_epi32(r, 24);
+}
+
+static inline __m128i SkGetPackedG32_SSE2(const __m128i& src) {
+    __m128i g = _mm_slli_epi32(src, (24 - SK_G32_SHIFT));
+    return _mm_srli_epi32(g, 24);
+}
+
+static inline __m128i SkGetPackedB32_SSE2(const __m128i& src) {
+    __m128i b = _mm_slli_epi32(src, (24 - SK_B32_SHIFT));
+    return _mm_srli_epi32(b, 24);
+}
+
+static inline __m128i SkMul16ShiftRound_SSE2(const __m128i& a,
+                                             const __m128i& b, int shift) {
+    __m128i prod = _mm_mullo_epi16(a, b);
+    prod = _mm_add_epi16(prod, _mm_set1_epi16(1 << (shift - 1)));
+    prod = _mm_add_epi16(prod, _mm_srli_epi16(prod, shift));
+    prod = _mm_srli_epi16(prod, shift);
+
+    return prod;
+}
+
+static inline __m128i SkPackRGB16_SSE2(const __m128i& r,
+                                       const __m128i& g, const __m128i& b) {
+    __m128i dr = _mm_slli_epi16(r, SK_R16_SHIFT);
+    __m128i dg = _mm_slli_epi16(g, SK_G16_SHIFT);
+    __m128i db = _mm_slli_epi16(b, SK_B16_SHIFT);
+
+    __m128i c = _mm_or_si128(dr, dg);
+    return _mm_or_si128(c, db);
+}
+
+static inline __m128i SkPackARGB32_SSE2(const __m128i& a, const __m128i& r,
+                                        const __m128i& g, const __m128i& b) {
+    __m128i da = _mm_slli_epi32(a, SK_A32_SHIFT);
+    __m128i dr = _mm_slli_epi32(r, SK_R32_SHIFT);
+    __m128i dg = _mm_slli_epi32(g, SK_G32_SHIFT);
+    __m128i db = _mm_slli_epi32(b, SK_B32_SHIFT);
+
+    __m128i c = _mm_or_si128(da, dr);
+    c = _mm_or_si128(c, dg);
+    return _mm_or_si128(c, db);
+}
+
+static inline __m128i SkPacked16ToR32_SSE2(const __m128i& src) {
+    __m128i r = _mm_srli_epi32(src, SK_R16_SHIFT);
+    r = _mm_and_si128(r, _mm_set1_epi32(SK_R16_MASK));
+    r = _mm_or_si128(_mm_slli_epi32(r, (8 - SK_R16_BITS)),
+                     _mm_srli_epi32(r, (2 * SK_R16_BITS - 8)));
+
+    return r;
+}
+
+static inline __m128i SkPacked16ToG32_SSE2(const __m128i& src) {
+    __m128i g = _mm_srli_epi32(src, SK_G16_SHIFT);
+    g = _mm_and_si128(g, _mm_set1_epi32(SK_G16_MASK));
+    g = _mm_or_si128(_mm_slli_epi32(g, (8 - SK_G16_BITS)),
+                     _mm_srli_epi32(g, (2 * SK_G16_BITS - 8)));
+
+    return g;
+}
+
+static inline __m128i SkPacked16ToB32_SSE2(const __m128i& src) {
+    __m128i b = _mm_srli_epi32(src, SK_B16_SHIFT);
+    b = _mm_and_si128(b, _mm_set1_epi32(SK_B16_MASK));
+    b = _mm_or_si128(_mm_slli_epi32(b, (8 - SK_B16_BITS)),
+                     _mm_srli_epi32(b, (2 * SK_B16_BITS - 8)));
+
+    return b;
+}
+
+static inline __m128i SkPixel16ToPixel32_SSE2(const __m128i& src) {
+    __m128i r = SkPacked16ToR32_SSE2(src);
+    __m128i g = SkPacked16ToG32_SSE2(src);
+    __m128i b = SkPacked16ToB32_SSE2(src);
+
+    return SkPackARGB32_SSE2(_mm_set1_epi32(0xFF), r, g, b);
+}
+
+static inline __m128i SkPixel32ToPixel16_ToU16_SSE2(const __m128i& src_pixel1,
+                                                    const __m128i& src_pixel2) {
+    // Calculate result r.
+    __m128i r1 = _mm_srli_epi32(src_pixel1,
+                                SK_R32_SHIFT + (8 - SK_R16_BITS));
+    r1 = _mm_and_si128(r1, _mm_set1_epi32(SK_R16_MASK));
+    __m128i r2 = _mm_srli_epi32(src_pixel2,
+                                SK_R32_SHIFT + (8 - SK_R16_BITS));
+    r2 = _mm_and_si128(r2, _mm_set1_epi32(SK_R16_MASK));
+    __m128i r = _mm_packs_epi32(r1, r2);
+
+    // Calculate result g.
+    __m128i g1 = _mm_srli_epi32(src_pixel1,
+                                SK_G32_SHIFT + (8 - SK_G16_BITS));
+    g1 = _mm_and_si128(g1, _mm_set1_epi32(SK_G16_MASK));
+    __m128i g2 = _mm_srli_epi32(src_pixel2,
+                                SK_G32_SHIFT + (8 - SK_G16_BITS));
+    g2 = _mm_and_si128(g2, _mm_set1_epi32(SK_G16_MASK));
+    __m128i g = _mm_packs_epi32(g1, g2);
+
+    // Calculate result b.
+    __m128i b1 = _mm_srli_epi32(src_pixel1,
+                                SK_B32_SHIFT + (8 - SK_B16_BITS));
+    b1 = _mm_and_si128(b1, _mm_set1_epi32(SK_B16_MASK));
+    __m128i b2 = _mm_srli_epi32(src_pixel2,
+                                SK_B32_SHIFT + (8 - SK_B16_BITS));
+    b2 = _mm_and_si128(b2, _mm_set1_epi32(SK_B16_MASK));
+    __m128i b = _mm_packs_epi32(b1, b2);
+
+    // Pack the 8 16-bit colors into a single register to return.
+    __m128i d_pixel = SkPackRGB16_SSE2(r, g, b);
+
+    return d_pixel;
+}
+
+#endif // SkColor_opts_SSE2_DEFINED
diff --git a/chromium/third_party/skia/src/opts/SkMath_opts_SSE2.h b/chromium/third_party/skia/src/opts/SkMath_opts_SSE2.h
new file mode 100644
index 00000000000..2cc21afa0df
--- /dev/null
+++ b/chromium/third_party/skia/src/opts/SkMath_opts_SSE2.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright 2014 The Android Open Source Project
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkMath_opts_SSE2_DEFINED
+#define SkMath_opts_SSE2_DEFINED
+
+#include <emmintrin.h>
+
+// SSE2 has no _mm_div_epi32(), so we emulate it with float division.
+// When using this function, make sure a and b don't exceed float's precision.
+static inline __m128i shim_mm_div_epi32(const __m128i& a, const __m128i& b) {
+    __m128 x = _mm_cvtepi32_ps(a);
+    __m128 y = _mm_cvtepi32_ps(b);
+    return _mm_cvttps_epi32(_mm_div_ps(x, y));
+}
+
+// Portable version of SkSqrtBits is in SkMath.cpp.
+static inline __m128i SkSqrtBits_SSE2(const __m128i& x, int count) {
+    __m128i root =  _mm_setzero_si128();
+    __m128i remHi = _mm_setzero_si128();
+    __m128i remLo = x;
+    __m128i one128 = _mm_set1_epi32(1);
+
+    do {
+        root = _mm_slli_epi32(root, 1);
+
+        remHi = _mm_or_si128(_mm_slli_epi32(remHi, 2),
+                             _mm_srli_epi32(remLo, 30));
+        remLo = _mm_slli_epi32(remLo, 2);
+
+        __m128i testDiv = _mm_slli_epi32(root, 1);
+        testDiv = _mm_add_epi32(testDiv, _mm_set1_epi32(1));
+
+        __m128i cmp = _mm_cmplt_epi32(remHi, testDiv);
+        __m128i remHi1 = _mm_and_si128(cmp, remHi);
+        __m128i root1 = _mm_and_si128(cmp, root);
+        __m128i remHi2 = _mm_andnot_si128(cmp, _mm_sub_epi32(remHi, testDiv));
+        __m128i root2 = _mm_andnot_si128(cmp, _mm_add_epi32(root, one128));
+
+        remHi = _mm_or_si128(remHi1, remHi2);
+        root = _mm_or_si128(root1, root2);
+    } while (--count >= 0);
+
+    return root;
+}
+
+#endif // SkMath_opts_SSE2_DEFINED
diff --git a/chromium/third_party/skia/src/opts/SkMorphology_opts.h b/chromium/third_party/skia/src/opts/SkMorphology_opts.h
index e3ad853cf64..7ea7c546231 100644
--- a/chromium/third_party/skia/src/opts/SkMorphology_opts.h
+++ b/chromium/third_party/skia/src/opts/SkMorphology_opts.h
@@ -5,17 +5,10 @@
  * found in the LICENSE file.
  */
 
-#include <SkColor.h>
+#ifndef SkMorphology_opts_DEFINED
+#define SkMorphology_opts_DEFINED
 
-/**
- * All morphology procs have the same signature: src is the source buffer, dst the
- * destination buffer, radius is the morphology radius, width and height are the bounds
- * of the destination buffer (in pixels), and srcStride and dstStride are the
- * number of pixels per row in each buffer. All buffers are 8888.
- */
-
-typedef void (*SkMorphologyProc)(const SkPMColor* src, SkPMColor* dst, int radius,
-                                 int width, int height, int srcStride, int dstStride);
+#include <SkMorphologyImageFilter.h>
 
 enum SkMorphologyProcType {
     kDilateX_SkMorphologyProcType,
@@ -24,4 +17,6 @@ enum SkMorphologyProcType {
     kErodeY_SkMorphologyProcType
 };
 
-SkMorphologyProc SkMorphologyGetPlatformProc(SkMorphologyProcType type);
+SkMorphologyImageFilter::Proc SkMorphologyGetPlatformProc(SkMorphologyProcType type);
+
+#endif
diff --git a/chromium/third_party/skia/src/opts/SkMorphology_opts_SSE2.cpp b/chromium/third_party/skia/src/opts/SkMorphology_opts_SSE2.cpp
index b58fced2c12..e782950956a 100644
--- a/chromium/third_party/skia/src/opts/SkMorphology_opts_SSE2.cpp
+++ b/chromium/third_party/skia/src/opts/SkMorphology_opts_SSE2.cpp
@@ -5,12 +5,10 @@
  * found in the LICENSE file.
  */
 
-
+#include <emmintrin.h>
 #include "SkColorPriv.h"
 #include "SkMorphology_opts_SSE2.h"
 
-#include <emmintrin.h>
-
 /* SSE2 version of dilateX, dilateY, erodeX, erodeY.
  * portable versions are in src/effects/SkMorphologyImageFilter.cpp.
  */
@@ -48,8 +46,12 @@ static void SkMorph_SSE2(const SkPMColor* src, SkPMColor* dst, int radius,
             lp += srcStrideY;
             up += srcStrideY;
         }
-        if (x >= radius) src += srcStrideX;
-        if (x + radius < width - 1) upperSrc += srcStrideX;
+        if (x >= radius) {
+            src += srcStrideX;
+        }
+        if (x + radius < width - 1) {
+            upperSrc += srcStrideX;
+        }
         dst += dstStrideX;
     }
 }
diff --git a/chromium/third_party/skia/src/opts/SkMorphology_opts_SSE2.h b/chromium/third_party/skia/src/opts/SkMorphology_opts_SSE2.h
index bd103e6eba9..bf5aa03b092 100644
--- a/chromium/third_party/skia/src/opts/SkMorphology_opts_SSE2.h
+++ b/chromium/third_party/skia/src/opts/SkMorphology_opts_SSE2.h
@@ -5,6 +5,11 @@
  * found in the LICENSE file.
  */
 
+#ifndef SkMorphology_opts_SSE2_DEFINED
+#define SkMorphology_opts_SSE2_DEFINED
+
+#include "SkColor.h"
+
 void SkDilateX_SSE2(const SkPMColor* src, SkPMColor* dst, int radius,
                     int width, int height, int srcStride, int dstStride);
 void SkDilateY_SSE2(const SkPMColor* src, SkPMColor* dst, int radius,
@@ -13,3 +18,5 @@ void SkErodeX_SSE2(const SkPMColor* src, SkPMColor* dst, int radius,
                    int width, int height, int srcStride, int dstStride);
 void SkErodeY_SSE2(const SkPMColor* src, SkPMColor* dst, int radius,
                    int width, int height, int srcStride, int dstStride);
+
+#endif
diff --git a/chromium/third_party/skia/src/opts/SkMorphology_opts_arm.cpp b/chromium/third_party/skia/src/opts/SkMorphology_opts_arm.cpp
new file mode 100644
index 00000000000..2bba4929c22
--- /dev/null
+++ b/chromium/third_party/skia/src/opts/SkMorphology_opts_arm.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2014 ARM Ltd.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "SkMorphology_opts.h"
+#include "SkMorphology_opts_neon.h"
+#include "SkUtilsArm.h"
+
+SkMorphologyImageFilter::Proc SkMorphologyGetPlatformProc(SkMorphologyProcType type) {
+#if SK_ARM_NEON_IS_NONE
+    return NULL;
+#else
+#if SK_ARM_NEON_IS_DYNAMIC
+    if (!sk_cpu_arm_has_neon()) {
+        return NULL;
+    }
+#endif
+    switch (type) {
+        case kDilateX_SkMorphologyProcType:
+            return SkDilateX_neon;
+        case kDilateY_SkMorphologyProcType:
+            return SkDilateY_neon;
+        case kErodeX_SkMorphologyProcType:
+            return SkErodeX_neon;
+        case kErodeY_SkMorphologyProcType:
+            return SkErodeY_neon;
+        default:
+            return NULL;
+    }
+#endif
+}
diff --git a/chromium/third_party/skia/src/opts/SkMorphology_opts_none.cpp b/chromium/third_party/skia/src/opts/SkMorphology_opts_none.cpp
index 66d58ba571f..ade261fc7d2 100644
--- a/chromium/third_party/skia/src/opts/SkMorphology_opts_none.cpp
+++ b/chromium/third_party/skia/src/opts/SkMorphology_opts_none.cpp
@@ -7,6 +7,6 @@
 
 #include "SkMorphology_opts.h"
 
-SkMorphologyProc SkMorphologyGetPlatformProc(SkMorphologyProcType) {
+SkMorphologyImageFilter::Proc SkMorphologyGetPlatformProc(SkMorphologyProcType) {
     return NULL;
 }
diff --git a/chromium/third_party/skia/src/opts/SkUtils_opts_SSE2.cpp b/chromium/third_party/skia/src/opts/SkUtils_opts_SSE2.cpp
index e22044d39d3..bd2f9b29a44 100644
--- a/chromium/third_party/skia/src/opts/SkUtils_opts_SSE2.cpp
+++ b/chromium/third_party/skia/src/opts/SkUtils_opts_SSE2.cpp
@@ -1,4 +1,3 @@
-
 /*
  * Copyright 2009 The Android Open Source Project
  *
@@ -6,7 +5,6 @@
  * found in the LICENSE file.
  */
 
-
 #include <emmintrin.h>
 #include "SkUtils_opts_SSE2.h"
 
@@ -69,3 +67,33 @@ void sk_memset32_SSE2(uint32_t *dst, uint32_t value, int count)
         --count;
     }
 }
+
+void sk_memcpy32_SSE2(uint32_t *dst, const uint32_t *src, int count)
+{
+    if (count >= 16) {
+        while (((size_t)dst) & 0x0F) {
+            *dst++ = *src++;
+            --count;
+        }
+        __m128i *dst128 = reinterpret_cast<__m128i*>(dst);
+        const __m128i *src128 = reinterpret_cast<const __m128i*>(src);
+        while (count >= 16) {
+            __m128i a =  _mm_loadu_si128(src128++);
+            __m128i b =  _mm_loadu_si128(src128++);
+            __m128i c =  _mm_loadu_si128(src128++);
+            __m128i d =  _mm_loadu_si128(src128++);
+
+            _mm_store_si128(dst128++, a);
+            _mm_store_si128(dst128++, b);
+            _mm_store_si128(dst128++, c);
+            _mm_store_si128(dst128++, d);
+            count -= 16;
+        }
+        dst = reinterpret_cast<uint32_t*>(dst128);
+        src = reinterpret_cast<const uint32_t*>(src128);
+    }
+    while (count > 0) {
+        *dst++ = *src++;
+        --count;
+    }
+}
diff --git a/chromium/third_party/skia/src/opts/SkUtils_opts_SSE2.h b/chromium/third_party/skia/src/opts/SkUtils_opts_SSE2.h
index ed24c1ffa40..009f01894b4 100644
--- a/chromium/third_party/skia/src/opts/SkUtils_opts_SSE2.h
+++ b/chromium/third_party/skia/src/opts/SkUtils_opts_SSE2.h
@@ -1,4 +1,3 @@
-
 /*
  * Copyright 2009 The Android Open Source Project
  *
@@ -6,8 +5,13 @@
  * found in the LICENSE file.
  */
 
+#ifndef SkUtils_opts_SSE2_DEFINED
+#define SkUtils_opts_SSE2_DEFINED
 
 #include "SkTypes.h"
 
 void sk_memset16_SSE2(uint16_t *dst, uint16_t value, int count);
 void sk_memset32_SSE2(uint32_t *dst, uint32_t value, int count);
+void sk_memcpy32_SSE2(uint32_t *dst, const uint32_t *src, int count);
+
+#endif
diff --git a/chromium/third_party/skia/src/opts/SkUtils_opts_arm.cpp b/chromium/third_party/skia/src/opts/SkUtils_opts_arm.cpp
new file mode 100644
index 00000000000..b1c9d0aa93e
--- /dev/null
+++ b/chromium/third_party/skia/src/opts/SkUtils_opts_arm.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2014 ARM Ltd.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "SkUtils.h"
+#include "SkUtilsArm.h"
+
+#if defined(SK_CPU_LENDIAN) && !SK_ARM_NEON_IS_NONE
+extern "C" void memset16_neon(uint16_t dst[], uint16_t value, int count);
+extern "C" void memset32_neon(uint32_t dst[], uint32_t value, int count);
+#endif
+
+#if defined(SK_CPU_LENDIAN)
+extern "C" void arm_memset16(uint16_t* dst, uint16_t value, int count);
+extern "C" void arm_memset32(uint32_t* dst, uint32_t value, int count);
+#endif
+
+SkMemset16Proc SkMemset16GetPlatformProc() {
+    // FIXME: memset.arm.S is using syntax incompatible with XCode
+#if !defined(SK_CPU_LENDIAN) || defined(SK_BUILD_FOR_IOS)
+    return NULL;
+#elif SK_ARM_NEON_IS_DYNAMIC
+    if (sk_cpu_arm_has_neon()) {
+        return memset16_neon;
+    } else {
+        return arm_memset16;
+    }
+#elif SK_ARM_NEON_IS_ALWAYS
+    return memset16_neon;
+#else
+    return arm_memset16;
+#endif
+}
+
+SkMemset32Proc SkMemset32GetPlatformProc() {
+    // FIXME: memset.arm.S is using syntax incompatible with XCode
+#if !defined(SK_CPU_LENDIAN) || defined(SK_BUILD_FOR_IOS)
+    return NULL;
+#elif SK_ARM_NEON_IS_DYNAMIC
+    if (sk_cpu_arm_has_neon()) {
+        return memset32_neon;
+    } else {
+        return arm_memset32;
+    }
+#elif SK_ARM_NEON_IS_ALWAYS
+    return memset32_neon;
+#else
+    return arm_memset32;
+#endif
+}
+
+SkMemcpy32Proc SkMemcpy32GetPlatformProc() {
+    return NULL;
+}
diff --git a/chromium/third_party/skia/src/opts/SkUtils_opts_none.cpp b/chromium/third_party/skia/src/opts/SkUtils_opts_none.cpp
index 286f10d7e53..18f52496db4 100644
--- a/chromium/third_party/skia/src/opts/SkUtils_opts_none.cpp
+++ b/chromium/third_party/skia/src/opts/SkUtils_opts_none.cpp
@@ -16,3 +16,7 @@ SkMemset16Proc SkMemset16GetPlatformProc() {
 SkMemset32Proc SkMemset32GetPlatformProc() {
     return NULL;
 }
+
+SkMemcpy32Proc SkMemcpy32GetPlatformProc() {
+    return NULL;
+}
diff --git a/chromium/third_party/skia/src/opts/SkXfermode_opts_SSE2.cpp b/chromium/third_party/skia/src/opts/SkXfermode_opts_SSE2.cpp
new file mode 100644
index 00000000000..94f9a4aea3b
--- /dev/null
+++ b/chromium/third_party/skia/src/opts/SkXfermode_opts_SSE2.cpp
@@ -0,0 +1,819 @@
+/*
+ * Copyright 2014 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "SkColorPriv.h"
+#include "SkColor_opts_SSE2.h"
+#include "SkMathPriv.h"
+#include "SkMath_opts_SSE2.h"
+#include "SkXfermode.h"
+#include "SkXfermode_opts_SSE2.h"
+#include "SkXfermode_proccoeff.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// 4 pixels SSE2 version functions
+////////////////////////////////////////////////////////////////////////////////
+
+static inline __m128i SkDiv255Round_SSE2(const __m128i& a) {
+    __m128i prod = _mm_add_epi32(a, _mm_set1_epi32(128)); // prod += 128;
+    prod = _mm_add_epi32(prod, _mm_srli_epi32(prod, 8));  // prod + (prod >> 8)
+    prod = _mm_srli_epi32(prod, 8);                       // >> 8
+
+    return prod;
+}
+
+static inline __m128i saturated_add_SSE2(const __m128i& a, const __m128i& b) {
+    __m128i sum = _mm_add_epi32(a, b);
+    __m128i cmp = _mm_cmpgt_epi32(sum, _mm_set1_epi32(255));
+
+    sum = _mm_or_si128(_mm_and_si128(cmp, _mm_set1_epi32(255)),
+                       _mm_andnot_si128(cmp, sum));
+    return sum;
+}
+
+static inline __m128i clamp_signed_byte_SSE2(const __m128i& n) {
+    __m128i cmp1 = _mm_cmplt_epi32(n, _mm_setzero_si128());
+    __m128i cmp2 = _mm_cmpgt_epi32(n, _mm_set1_epi32(255));
+    __m128i ret = _mm_and_si128(cmp2, _mm_set1_epi32(255));
+
+    __m128i cmp = _mm_or_si128(cmp1, cmp2);
+    ret = _mm_or_si128(_mm_and_si128(cmp, ret), _mm_andnot_si128(cmp, n));
+
+    return ret;
+}
+
+static inline __m128i clamp_div255round_SSE2(const __m128i& prod) {
+    // test if > 0
+    __m128i cmp1 = _mm_cmpgt_epi32(prod, _mm_setzero_si128());
+    // test if < 255*255
+    __m128i cmp2 = _mm_cmplt_epi32(prod, _mm_set1_epi32(255*255));
+
+    __m128i ret = _mm_setzero_si128();
+
+    // if value >= 255*255, value = 255
+    ret = _mm_andnot_si128(cmp2,  _mm_set1_epi32(255));
+
+    __m128i div = SkDiv255Round_SSE2(prod);
+
+    // test if > 0 && < 255*255
+    __m128i cmp = _mm_and_si128(cmp1, cmp2);
+
+    ret = _mm_or_si128(_mm_and_si128(cmp, div), _mm_andnot_si128(cmp, ret));
+
+    return ret;
+}
+
+static __m128i srcover_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
+    __m128i isa = _mm_sub_epi32(_mm_set1_epi32(256), SkGetPackedA32_SSE2(src));
+    return _mm_add_epi32(src, SkAlphaMulQ_SSE2(dst, isa));
+}
+
+static __m128i dstover_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
+    __m128i ida = _mm_sub_epi32(_mm_set1_epi32(256), SkGetPackedA32_SSE2(dst));
+    return _mm_add_epi32(dst, SkAlphaMulQ_SSE2(src, ida));
+}
+
+static __m128i srcin_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
+    __m128i da = SkGetPackedA32_SSE2(dst);
+    return SkAlphaMulQ_SSE2(src, SkAlpha255To256_SSE2(da));
+}
+
+static __m128i dstin_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
+    __m128i sa = SkGetPackedA32_SSE2(src);
+    return SkAlphaMulQ_SSE2(dst, SkAlpha255To256_SSE2(sa));
+}
+
+static __m128i srcout_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
+    __m128i ida = _mm_sub_epi32(_mm_set1_epi32(256), SkGetPackedA32_SSE2(dst));
+    return SkAlphaMulQ_SSE2(src, ida);
+}
+
+static __m128i dstout_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
+    __m128i isa = _mm_sub_epi32(_mm_set1_epi32(256), SkGetPackedA32_SSE2(src));
+    return SkAlphaMulQ_SSE2(dst, isa);
+}
+
+static __m128i srcatop_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
+    __m128i sa = SkGetPackedA32_SSE2(src);
+    __m128i da = SkGetPackedA32_SSE2(dst);
+    __m128i isa = _mm_sub_epi32(_mm_set1_epi32(255), sa);
+
+    __m128i a = da;
+
+    __m128i r1 = SkAlphaMulAlpha_SSE2(da, SkGetPackedR32_SSE2(src));
+    __m128i r2 = SkAlphaMulAlpha_SSE2(isa, SkGetPackedR32_SSE2(dst));
+    __m128i r = _mm_add_epi32(r1, r2);
+
+    __m128i g1 = SkAlphaMulAlpha_SSE2(da, SkGetPackedG32_SSE2(src));
+    __m128i g2 = SkAlphaMulAlpha_SSE2(isa, SkGetPackedG32_SSE2(dst));
+    __m128i g = _mm_add_epi32(g1, g2);
+
+    __m128i b1 = SkAlphaMulAlpha_SSE2(da, SkGetPackedB32_SSE2(src));
+    __m128i b2 = SkAlphaMulAlpha_SSE2(isa, SkGetPackedB32_SSE2(dst));
+    __m128i b = _mm_add_epi32(b1, b2);
+
+    return SkPackARGB32_SSE2(a, r, g, b);
+}
+
+static __m128i dstatop_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
+    __m128i sa = SkGetPackedA32_SSE2(src);
+    __m128i da = SkGetPackedA32_SSE2(dst);
+    __m128i ida = _mm_sub_epi32(_mm_set1_epi32(255), da);
+
+    __m128i a = sa;
+
+    __m128i r1 = SkAlphaMulAlpha_SSE2(ida, SkGetPackedR32_SSE2(src));
+    __m128i r2 = SkAlphaMulAlpha_SSE2(sa, SkGetPackedR32_SSE2(dst));
+    __m128i r = _mm_add_epi32(r1, r2);
+
+    __m128i g1 = SkAlphaMulAlpha_SSE2(ida, SkGetPackedG32_SSE2(src));
+    __m128i g2 = SkAlphaMulAlpha_SSE2(sa, SkGetPackedG32_SSE2(dst));
+    __m128i g = _mm_add_epi32(g1, g2);
+
+    __m128i b1 = SkAlphaMulAlpha_SSE2(ida, SkGetPackedB32_SSE2(src));
+    __m128i b2 = SkAlphaMulAlpha_SSE2(sa, SkGetPackedB32_SSE2(dst));
+    __m128i b = _mm_add_epi32(b1, b2);
+
+    return SkPackARGB32_SSE2(a, r, g, b);
+}
+
+static __m128i xor_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
+    __m128i sa = SkGetPackedA32_SSE2(src);
+    __m128i da = SkGetPackedA32_SSE2(dst);
+    __m128i isa = _mm_sub_epi32(_mm_set1_epi32(255), sa);
+    __m128i ida = _mm_sub_epi32(_mm_set1_epi32(255), da);
+
+    __m128i a1 = _mm_add_epi32(sa, da);
+    __m128i a2 = SkAlphaMulAlpha_SSE2(sa, da);
+    a2 = _mm_slli_epi32(a2, 1);
+    __m128i a = _mm_sub_epi32(a1, a2);
+
+    __m128i r1 = SkAlphaMulAlpha_SSE2(ida, SkGetPackedR32_SSE2(src));
+    __m128i r2 = SkAlphaMulAlpha_SSE2(isa, SkGetPackedR32_SSE2(dst));
+    __m128i r = _mm_add_epi32(r1, r2);
+
+    __m128i g1 = SkAlphaMulAlpha_SSE2(ida, SkGetPackedG32_SSE2(src));
+    __m128i g2 = SkAlphaMulAlpha_SSE2(isa, SkGetPackedG32_SSE2(dst));
+    __m128i g = _mm_add_epi32(g1, g2);
+
+    __m128i b1 = SkAlphaMulAlpha_SSE2(ida, SkGetPackedB32_SSE2(src));
+    __m128i b2 = SkAlphaMulAlpha_SSE2(isa, SkGetPackedB32_SSE2(dst));
+    __m128i b = _mm_add_epi32(b1, b2);
+
+    return SkPackARGB32_SSE2(a, r, g, b);
+}
+
+static __m128i plus_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
+    __m128i b = saturated_add_SSE2(SkGetPackedB32_SSE2(src),
+                                   SkGetPackedB32_SSE2(dst));
+    __m128i g = saturated_add_SSE2(SkGetPackedG32_SSE2(src),
+                                   SkGetPackedG32_SSE2(dst));
+    __m128i r = saturated_add_SSE2(SkGetPackedR32_SSE2(src),
+                                   SkGetPackedR32_SSE2(dst));
+    __m128i a = saturated_add_SSE2(SkGetPackedA32_SSE2(src),
+                                   SkGetPackedA32_SSE2(dst));
+    return SkPackARGB32_SSE2(a, r, g, b);
+}
+
+static __m128i modulate_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
+    __m128i a = SkAlphaMulAlpha_SSE2(SkGetPackedA32_SSE2(src),
+                                     SkGetPackedA32_SSE2(dst));
+    __m128i r = SkAlphaMulAlpha_SSE2(SkGetPackedR32_SSE2(src),
+                                     SkGetPackedR32_SSE2(dst));
+    __m128i g = SkAlphaMulAlpha_SSE2(SkGetPackedG32_SSE2(src),
+                                     SkGetPackedG32_SSE2(dst));
+    __m128i b = SkAlphaMulAlpha_SSE2(SkGetPackedB32_SSE2(src),
+                                     SkGetPackedB32_SSE2(dst));
+    return SkPackARGB32_SSE2(a, r, g, b);
+}
+
+static inline __m128i SkMin32_SSE2(const __m128i& a, const __m128i& b) {
+    __m128i cmp = _mm_cmplt_epi32(a, b);
+    return _mm_or_si128(_mm_and_si128(cmp, a), _mm_andnot_si128(cmp, b));
+}
+
+static inline __m128i srcover_byte_SSE2(const __m128i& a, const __m128i& b) {
+    // a + b - SkAlphaMulAlpha(a, b);
+    return _mm_sub_epi32(_mm_add_epi32(a, b), SkAlphaMulAlpha_SSE2(a, b));
+
+}
+
+static inline __m128i blendfunc_multiply_byte_SSE2(const __m128i& sc, const __m128i& dc,
+                                                   const __m128i& sa, const __m128i& da) {
+    // sc * (255 - da)
+    __m128i ret1 = _mm_sub_epi32(_mm_set1_epi32(255), da);
+    ret1 = _mm_mullo_epi16(sc, ret1);
+
+    // dc * (255 - sa)
+    __m128i ret2 = _mm_sub_epi32(_mm_set1_epi32(255), sa);
+    ret2 = _mm_mullo_epi16(dc, ret2);
+
+    // sc * dc
+    __m128i ret3 = _mm_mullo_epi16(sc, dc);
+
+    __m128i ret = _mm_add_epi32(ret1, ret2);
+    ret = _mm_add_epi32(ret, ret3);
+
+    return clamp_div255round_SSE2(ret);
+}
+
+static __m128i multiply_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
+    __m128i sa = SkGetPackedA32_SSE2(src);
+    __m128i da = SkGetPackedA32_SSE2(dst);
+    __m128i a = srcover_byte_SSE2(sa, da);
+
+    __m128i sr = SkGetPackedR32_SSE2(src);
+    __m128i dr = SkGetPackedR32_SSE2(dst);
+    __m128i r = blendfunc_multiply_byte_SSE2(sr, dr, sa, da);
+
+    __m128i sg = SkGetPackedG32_SSE2(src);
+    __m128i dg = SkGetPackedG32_SSE2(dst);
+    __m128i g = blendfunc_multiply_byte_SSE2(sg, dg, sa, da);
+
+
+    __m128i sb = SkGetPackedB32_SSE2(src);
+    __m128i db = SkGetPackedB32_SSE2(dst);
+    __m128i b = blendfunc_multiply_byte_SSE2(sb, db, sa, da);
+
+    return SkPackARGB32_SSE2(a, r, g, b);
+}
+
+static __m128i screen_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
+    __m128i a = srcover_byte_SSE2(SkGetPackedA32_SSE2(src),
+                                  SkGetPackedA32_SSE2(dst));
+    __m128i r = srcover_byte_SSE2(SkGetPackedR32_SSE2(src),
+                                  SkGetPackedR32_SSE2(dst));
+    __m128i g = srcover_byte_SSE2(SkGetPackedG32_SSE2(src),
+                                  SkGetPackedG32_SSE2(dst));
+    __m128i b = srcover_byte_SSE2(SkGetPackedB32_SSE2(src),
+                                  SkGetPackedB32_SSE2(dst));
+    return SkPackARGB32_SSE2(a, r, g, b);
+}
+
+// Portable version overlay_byte() is in SkXfermode.cpp.
+static inline __m128i overlay_byte_SSE2(const __m128i& sc, const __m128i& dc,
+                                        const __m128i& sa, const __m128i& da) {
+    __m128i ida = _mm_sub_epi32(_mm_set1_epi32(255), da);
+    __m128i tmp1 = _mm_mullo_epi16(sc, ida);
+    __m128i isa = _mm_sub_epi32(_mm_set1_epi32(255), sa);
+    __m128i tmp2 = _mm_mullo_epi16(dc, isa);
+    __m128i tmp = _mm_add_epi32(tmp1, tmp2);
+
+    __m128i cmp = _mm_cmpgt_epi32(_mm_slli_epi32(dc, 1), da);
+    __m128i rc1 = _mm_slli_epi32(sc, 1);                        // 2 * sc
+    rc1 = Multiply32_SSE2(rc1, dc);                             // *dc
+
+    __m128i rc2 = _mm_mullo_epi16(sa, da);                      // sa * da
+    __m128i tmp3 = _mm_slli_epi32(_mm_sub_epi32(da, dc), 1);    // 2 * (da - dc)
+    tmp3 = Multiply32_SSE2(tmp3, _mm_sub_epi32(sa, sc));        // * (sa - sc)
+    rc2 = _mm_sub_epi32(rc2, tmp3);
+
+    __m128i rc = _mm_or_si128(_mm_andnot_si128(cmp, rc1),
+                              _mm_and_si128(cmp, rc2));
+    return clamp_div255round_SSE2(_mm_add_epi32(rc, tmp));
+}
+
+static __m128i overlay_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
+    __m128i sa = SkGetPackedA32_SSE2(src);
+    __m128i da = SkGetPackedA32_SSE2(dst);
+
+    __m128i a = srcover_byte_SSE2(sa, da);
+    __m128i r = overlay_byte_SSE2(SkGetPackedR32_SSE2(src),
+                                  SkGetPackedR32_SSE2(dst), sa, da);
+    __m128i g = overlay_byte_SSE2(SkGetPackedG32_SSE2(src),
+                                  SkGetPackedG32_SSE2(dst), sa, da);
+    __m128i b = overlay_byte_SSE2(SkGetPackedB32_SSE2(src),
+                                  SkGetPackedB32_SSE2(dst), sa, da);
+    return SkPackARGB32_SSE2(a, r, g, b);
+}
+
+// Darken, one channel per 32-bit lane: sc + dc - SkDiv255Round(max(sc * da, dc * sa)).
+static inline __m128i darken_byte_SSE2(const __m128i& sc, const __m128i& dc,
+                                       const __m128i& sa, const __m128i& da) {
+    __m128i srcTimesDstA = _mm_mullo_epi16(sc, da);
+    __m128i dstTimesSrcA = _mm_mullo_epi16(dc, sa);
+    __m128i sum = _mm_add_epi32(sc, dc);
+    // Lanes where sc * da < dc * sa subtract the dc * sa term; the others subtract sc * da.
+    __m128i pickDs = _mm_cmplt_epi32(srcTimesDstA, dstTimesSrcA);
+    __m128i viaDs = _mm_and_si128(pickDs,
+                                  _mm_sub_epi32(sum, SkDiv255Round_SSE2(dstTimesSrcA)));
+    __m128i viaSd = _mm_andnot_si128(pickDs,
+                                     _mm_sub_epi32(sum, SkDiv255Round_SSE2(srcTimesDstA)));
+    return _mm_or_si128(viaDs, viaSd);
+}
+
+// Darken: src-over alpha plus darken_byte_SSE2() on each of R, G and B.
+static __m128i darken_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
+    __m128i srcA = SkGetPackedA32_SSE2(src);
+    __m128i dstA = SkGetPackedA32_SSE2(dst);
+    __m128i red = darken_byte_SSE2(SkGetPackedR32_SSE2(src),
+                                   SkGetPackedR32_SSE2(dst), srcA, dstA);
+    __m128i green = darken_byte_SSE2(SkGetPackedG32_SSE2(src),
+                                     SkGetPackedG32_SSE2(dst), srcA, dstA);
+    __m128i blue = darken_byte_SSE2(SkGetPackedB32_SSE2(src),
+                                    SkGetPackedB32_SSE2(dst), srcA, dstA);
+    __m128i alpha = srcover_byte_SSE2(srcA, dstA);
+    return SkPackARGB32_SSE2(alpha, red, green, blue);
+}
+
+// Lighten, one channel per 32-bit lane: sc + dc - SkDiv255Round(min(sc * da, dc * sa)).
+static inline __m128i lighten_byte_SSE2(const __m128i& sc, const __m128i& dc,
+                                        const __m128i& sa, const __m128i& da) {
+    __m128i srcTimesDstA = _mm_mullo_epi16(sc, da);
+    __m128i dstTimesSrcA = _mm_mullo_epi16(dc, sa);
+    __m128i sum = _mm_add_epi32(sc, dc);
+    // Lanes where sc * da > dc * sa subtract the dc * sa term; the others subtract sc * da.
+    __m128i pickDs = _mm_cmpgt_epi32(srcTimesDstA, dstTimesSrcA);
+    __m128i viaDs = _mm_and_si128(pickDs,
+                                  _mm_sub_epi32(sum, SkDiv255Round_SSE2(dstTimesSrcA)));
+    __m128i viaSd = _mm_andnot_si128(pickDs,
+                                     _mm_sub_epi32(sum, SkDiv255Round_SSE2(srcTimesDstA)));
+    return _mm_or_si128(viaDs, viaSd);
+}
+
+// Lighten: src-over alpha plus lighten_byte_SSE2() on each of R, G and B.
+static __m128i lighten_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
+    __m128i srcA = SkGetPackedA32_SSE2(src);
+    __m128i dstA = SkGetPackedA32_SSE2(dst);
+    __m128i red = lighten_byte_SSE2(SkGetPackedR32_SSE2(src),
+                                    SkGetPackedR32_SSE2(dst), srcA, dstA);
+    __m128i green = lighten_byte_SSE2(SkGetPackedG32_SSE2(src),
+                                      SkGetPackedG32_SSE2(dst), srcA, dstA);
+    __m128i blue = lighten_byte_SSE2(SkGetPackedB32_SSE2(src),
+                                     SkGetPackedB32_SSE2(dst), srcA, dstA);
+    __m128i alpha = srcover_byte_SSE2(srcA, dstA);
+    return SkPackARGB32_SSE2(alpha, red, green, blue);
+}
+
+static inline __m128i colordodge_byte_SSE2(const __m128i& sc, const __m128i& dc,
+                                           const __m128i& sa, const __m128i& da) {
+    __m128i diff = _mm_sub_epi32(sa, sc);                     // sa - sc
+    __m128i ida = _mm_sub_epi32(_mm_set1_epi32(255), da);     // 255 - da
+    __m128i isa = _mm_sub_epi32(_mm_set1_epi32(255), sa);     // 255 - sa
+    // Color dodge, one channel per 32-bit lane: each case is computed, masked, then OR-ed.
+    // if (0 == dc)
+    __m128i cmp1 = _mm_cmpeq_epi32(dc, _mm_setzero_si128());
+    __m128i rc1 = _mm_and_si128(cmp1, SkAlphaMulAlpha_SSE2(sc, ida));
+
+    // else if (0 == diff)
+    __m128i cmp2 = _mm_cmpeq_epi32(diff, _mm_setzero_si128());
+    __m128i cmp = _mm_andnot_si128(cmp1, cmp2);               // dc != 0 && diff == 0
+    __m128i tmp1 = _mm_mullo_epi16(sa, da);                   // sa * da
+    __m128i tmp2 = _mm_mullo_epi16(sc, ida);                  // sc * (255 - da)
+    __m128i tmp3 = _mm_mullo_epi16(dc, isa);                  // dc * (255 - sa)
+    __m128i rc2 = _mm_add_epi32(tmp1, tmp2);
+    rc2 = _mm_add_epi32(rc2, tmp3);
+    rc2 = clamp_div255round_SSE2(rc2);
+    rc2 = _mm_and_si128(cmp, rc2);
+
+    // else
+    __m128i cmp3 = _mm_or_si128(cmp1, cmp2);                  // set where an earlier case applies
+    __m128i value = _mm_mullo_epi16(dc, sa);                  // dc * sa
+    diff = shim_mm_div_epi32(value, diff);                    // dc * sa / (sa - sc)
+    // NOTE(review): lanes with diff == 0 are divided too, then dropped via cmp3 - confirm the shim allows it.
+    __m128i tmp4 = SkMin32_SSE2(da, diff);
+    tmp4 = Multiply32_SSE2(sa, tmp4);                         // sa * min(da, quotient)
+    __m128i rc3 = _mm_add_epi32(tmp4, tmp2);
+    rc3 = _mm_add_epi32(rc3, tmp3);
+    rc3 = clamp_div255round_SSE2(rc3);
+    rc3 = _mm_andnot_si128(cmp3, rc3);                        // keep only lanes not claimed above
+
+    __m128i rc = _mm_or_si128(rc1, rc2);
+    rc = _mm_or_si128(rc, rc3);
+
+    return rc;
+}
+
+// Color dodge: src-over alpha plus colordodge_byte_SSE2() on each of R, G and B.
+static __m128i colordodge_modeproc_SSE2(const __m128i& src,
+                                        const __m128i& dst) {
+    __m128i srcA = SkGetPackedA32_SSE2(src);
+    __m128i dstA = SkGetPackedA32_SSE2(dst);
+    __m128i red = colordodge_byte_SSE2(SkGetPackedR32_SSE2(src),
+                                       SkGetPackedR32_SSE2(dst), srcA, dstA);
+    __m128i green = colordodge_byte_SSE2(SkGetPackedG32_SSE2(src),
+                                         SkGetPackedG32_SSE2(dst), srcA, dstA);
+    __m128i blue = colordodge_byte_SSE2(SkGetPackedB32_SSE2(src),
+                                        SkGetPackedB32_SSE2(dst), srcA, dstA);
+    __m128i alpha = srcover_byte_SSE2(srcA, dstA);
+    return SkPackARGB32_SSE2(alpha, red, green, blue);
+}
+
+static inline __m128i colorburn_byte_SSE2(const __m128i& sc, const __m128i& dc,
+                                          const __m128i& sa, const __m128i& da) {
+    __m128i ida = _mm_sub_epi32(_mm_set1_epi32(255), da);     // 255 - da
+    __m128i isa = _mm_sub_epi32(_mm_set1_epi32(255), sa);     // 255 - sa
+    // Color burn, one channel per 32-bit lane: each case is computed, masked, then OR-ed.
+    // if (dc == da)
+    __m128i cmp1 = _mm_cmpeq_epi32(dc, da);
+    __m128i tmp1 = _mm_mullo_epi16(sa, da);                   // sa * da
+    __m128i tmp2 = _mm_mullo_epi16(sc, ida);                  // sc * (255 - da)
+    __m128i tmp3 = _mm_mullo_epi16(dc, isa);                  // dc * (255 - sa)
+    __m128i rc1 = _mm_add_epi32(tmp1, tmp2);
+    rc1 = _mm_add_epi32(rc1, tmp3);
+    rc1 = clamp_div255round_SSE2(rc1);
+    rc1 = _mm_and_si128(cmp1, rc1);
+
+    // else if (0 == sc)
+    __m128i cmp2 = _mm_cmpeq_epi32(sc, _mm_setzero_si128());
+    __m128i rc2 = SkAlphaMulAlpha_SSE2(dc, isa);
+    __m128i cmp = _mm_andnot_si128(cmp1, cmp2);               // dc != da && sc == 0
+    rc2 = _mm_and_si128(cmp, rc2);
+
+    // else
+    __m128i cmp3 = _mm_or_si128(cmp1, cmp2);                  // set where an earlier case applies
+    __m128i tmp4 = _mm_sub_epi32(da, dc);                     // da - dc
+    tmp4 = Multiply32_SSE2(tmp4, sa);                         // (da - dc) * sa
+    tmp4 = shim_mm_div_epi32(tmp4, sc);                       // (da - dc) * sa / sc
+    // NOTE(review): lanes with sc == 0 are divided too, then dropped via cmp3 - confirm the shim allows it.
+    __m128i tmp5 = _mm_sub_epi32(da, SkMin32_SSE2(da, tmp4)); // da - min(da, quotient)
+    tmp5 = Multiply32_SSE2(sa, tmp5);
+    __m128i rc3 = _mm_add_epi32(tmp5, tmp2);
+    rc3 = _mm_add_epi32(rc3, tmp3);
+    rc3 = clamp_div255round_SSE2(rc3);
+    rc3 = _mm_andnot_si128(cmp3, rc3);                        // keep only lanes not claimed above
+
+    __m128i rc = _mm_or_si128(rc1, rc2);
+    rc = _mm_or_si128(rc, rc3);
+
+    return rc;
+}
+
+// Color burn: src-over alpha plus colorburn_byte_SSE2() on each of R, G and B.
+static __m128i colorburn_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
+    __m128i srcA = SkGetPackedA32_SSE2(src);
+    __m128i dstA = SkGetPackedA32_SSE2(dst);
+    __m128i red = colorburn_byte_SSE2(SkGetPackedR32_SSE2(src),
+                                      SkGetPackedR32_SSE2(dst), srcA, dstA);
+    __m128i green = colorburn_byte_SSE2(SkGetPackedG32_SSE2(src),
+                                        SkGetPackedG32_SSE2(dst), srcA, dstA);
+    __m128i blue = colorburn_byte_SSE2(SkGetPackedB32_SSE2(src),
+                                       SkGetPackedB32_SSE2(dst), srcA, dstA);
+    __m128i alpha = srcover_byte_SSE2(srcA, dstA);
+    return SkPackARGB32_SSE2(alpha, red, green, blue);
+}
+
+static inline __m128i hardlight_byte_SSE2(const __m128i& sc, const __m128i& dc,
+                                          const __m128i& sa, const __m128i& da) {
+    // Hard light per 32-bit lane. if (2 * sc <= sa) -- cmp1 holds the inverse test, 2 * sc > sa.
+    __m128i tmp1 = _mm_slli_epi32(sc, 1);
+    __m128i cmp1 = _mm_cmpgt_epi32(tmp1, sa);
+    __m128i rc1 = _mm_mullo_epi16(sc, dc);                // sc * dc;
+    rc1 = _mm_slli_epi32(rc1, 1);                         // 2 * sc * dc
+    rc1 = _mm_andnot_si128(cmp1, rc1);                    // kept only where 2 * sc <= sa
+
+    // else
+    tmp1 = _mm_mullo_epi16(sa, da);                       // sa * da
+    __m128i tmp2 = Multiply32_SSE2(_mm_sub_epi32(da, dc),
+                                   _mm_sub_epi32(sa, sc));
+    tmp2 = _mm_slli_epi32(tmp2, 1);                       // 2 * (da - dc) * (sa - sc)
+    __m128i rc2 = _mm_sub_epi32(tmp1, tmp2);
+    rc2 = _mm_and_si128(cmp1, rc2);                       // kept only where 2 * sc > sa
+
+    __m128i rc = _mm_or_si128(rc1, rc2);
+    // Both cases then add sc * (255 - da) + dc * (255 - sa) before the final divide.
+    __m128i ida = _mm_sub_epi32(_mm_set1_epi32(255), da);
+    tmp1 = _mm_mullo_epi16(sc, ida);
+    __m128i isa = _mm_sub_epi32(_mm_set1_epi32(255), sa);
+    tmp2 = _mm_mullo_epi16(dc, isa);
+    rc = _mm_add_epi32(rc, tmp1);
+    rc = _mm_add_epi32(rc, tmp2);
+    return clamp_div255round_SSE2(rc);
+}
+
+// Hard light: src-over alpha plus hardlight_byte_SSE2() on each of R, G and B.
+static __m128i hardlight_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
+    __m128i srcA = SkGetPackedA32_SSE2(src);
+    __m128i dstA = SkGetPackedA32_SSE2(dst);
+    __m128i red = hardlight_byte_SSE2(SkGetPackedR32_SSE2(src),
+                                      SkGetPackedR32_SSE2(dst), srcA, dstA);
+    __m128i green = hardlight_byte_SSE2(SkGetPackedG32_SSE2(src),
+                                        SkGetPackedG32_SSE2(dst), srcA, dstA);
+    __m128i blue = hardlight_byte_SSE2(SkGetPackedB32_SSE2(src),
+                                       SkGetPackedB32_SSE2(dst), srcA, dstA);
+    __m128i alpha = srcover_byte_SSE2(srcA, dstA);
+    return SkPackARGB32_SSE2(alpha, red, green, blue);
+}
+
+static __m128i sqrt_unit_byte_SSE2(const __m128i& n) {  // used by softlight_byte_SSE2() on m
+    return SkSqrtBits_SSE2(n, 15+4);  // NOTE(review): meaning of the 15+4 count is not visible here - confirm
+}
+
+static inline __m128i softlight_byte_SSE2(const __m128i& sc, const __m128i& dc,
+                                          const __m128i& sa, const __m128i& da) {
+    __m128i tmp1, tmp2, tmp3;
+    // Soft light, one channel per 32-bit lane; the three cases are masked and OR-ed into rc.
+    // int m = da ? dc * 256 / da : 0;
+    __m128i cmp = _mm_cmpeq_epi32(da, _mm_setzero_si128());
+    __m128i m = _mm_slli_epi32(dc, 8);
+    __m128 x = _mm_cvtepi32_ps(m);
+    __m128 y = _mm_cvtepi32_ps(da);
+    m = _mm_cvttps_epi32(_mm_div_ps(x, y));            // float divide, truncated back to int
+    m = _mm_andnot_si128(cmp, m);                      // force m = 0 in lanes where da == 0
+
+    // if (2 * sc <= sa)
+    tmp1 = _mm_slli_epi32(sc, 1);                      // 2 * sc
+    __m128i cmp1 = _mm_cmpgt_epi32(tmp1, sa);          // inverse test: 2 * sc > sa
+    tmp1 = _mm_sub_epi32(tmp1, sa);                    // 2 * sc - sa
+    tmp2 = _mm_sub_epi32(_mm_set1_epi32(256), m);      // 256 - m
+    tmp1 = Multiply32_SSE2(tmp1, tmp2);                // (2 * sc - sa) * (256 - m)
+    tmp1 = _mm_srai_epi32(tmp1, 8);                    // >> 8
+    tmp1 = _mm_add_epi32(sa, tmp1);                    // sa + ...
+    tmp1 = Multiply32_SSE2(dc, tmp1);                  // dc * (sa + ...)
+    __m128i rc1 = _mm_andnot_si128(cmp1, tmp1);        // kept only where 2 * sc <= sa
+
+    // else if (4 * dc <= da)
+    tmp2 = _mm_slli_epi32(dc, 2);                      // dc * 4
+    __m128i cmp2 = _mm_cmpgt_epi32(tmp2, da);          // inverse test: 4 * dc > da
+    __m128i i = _mm_slli_epi32(m, 2);                  // 4 * m
+    __m128i j = _mm_add_epi32(i, _mm_set1_epi32(256)); // 4 * m + 256
+    __m128i k = Multiply32_SSE2(i, j);                 // 4 * m * (4 * m + 256)
+    __m128i t = _mm_sub_epi32(m, _mm_set1_epi32(256)); // m - 256
+    i = Multiply32_SSE2(k, t);                         // 4 * m * (4 * m + 256) * (m - 256)
+    i = _mm_srai_epi32(i, 16);                         // >> 16
+    j = Multiply32_SSE2(_mm_set1_epi32(7), m);         // 7 * m
+    tmp2 = _mm_add_epi32(i, j);
+    i = Multiply32_SSE2(dc, sa);                       // dc * sa
+    j = _mm_slli_epi32(sc, 1);                         // 2 * sc
+    j = _mm_sub_epi32(j, sa);                          // 2 * sc - sa
+    j = Multiply32_SSE2(da, j);                        // da * (2 * sc - sa)
+    tmp2 = Multiply32_SSE2(j, tmp2);                   // * tmp
+    tmp2 = _mm_srai_epi32(tmp2, 8);                    // >> 8
+    tmp2 = _mm_add_epi32(i, tmp2);
+    cmp = _mm_andnot_si128(cmp2, cmp1);                // 2 * sc > sa && 4 * dc <= da
+    __m128i rc2 = _mm_and_si128(cmp, tmp2);
+    __m128i rc = _mm_or_si128(rc1, rc2);
+
+    // else
+    tmp3 = sqrt_unit_byte_SSE2(m);
+    tmp3 = _mm_sub_epi32(tmp3, m);
+    tmp3 = Multiply32_SSE2(j, tmp3);                   // j = da * (2 * sc - sa)
+    tmp3 = _mm_srai_epi32(tmp3, 8);
+    tmp3 = _mm_add_epi32(i, tmp3);                     // i = dc * sa
+    cmp = _mm_and_si128(cmp1, cmp2);                   // 2 * sc > sa && 4 * dc > da
+    __m128i rc3 = _mm_and_si128(cmp, tmp3);
+    rc = _mm_or_si128(rc, rc3);
+    // All cases then add sc * (255 - da) + dc * (255 - sa) before the final divide.
+    tmp1 = _mm_sub_epi32(_mm_set1_epi32(255), da);     // 255 - da
+    tmp1 = _mm_mullo_epi16(sc, tmp1);
+    tmp2 = _mm_sub_epi32(_mm_set1_epi32(255), sa);     // 255 - sa
+    tmp2 = _mm_mullo_epi16(dc, tmp2);
+    rc = _mm_add_epi32(rc, tmp1);
+    rc = _mm_add_epi32(rc, tmp2);
+    return clamp_div255round_SSE2(rc);
+}
+
+// Soft light: src-over alpha plus softlight_byte_SSE2() on each of R, G and B.
+static __m128i softlight_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
+    __m128i srcA = SkGetPackedA32_SSE2(src);
+    __m128i dstA = SkGetPackedA32_SSE2(dst);
+    __m128i red = softlight_byte_SSE2(SkGetPackedR32_SSE2(src),
+                                      SkGetPackedR32_SSE2(dst), srcA, dstA);
+    __m128i green = softlight_byte_SSE2(SkGetPackedG32_SSE2(src),
+                                        SkGetPackedG32_SSE2(dst), srcA, dstA);
+    __m128i blue = softlight_byte_SSE2(SkGetPackedB32_SSE2(src),
+                                       SkGetPackedB32_SSE2(dst), srcA, dstA);
+    __m128i alpha = srcover_byte_SSE2(srcA, dstA);
+    return SkPackARGB32_SSE2(alpha, red, green, blue);
+}
+
+// Difference, one channel per 32-bit lane:
+//   clamp_signed_byte(sc + dc - 2 * SkDiv255Round(min(sc * da, dc * sa)))
+static inline __m128i difference_byte_SSE2(const __m128i& sc, const __m128i& dc,
+                                           const __m128i& sa, const __m128i& da) {
+    __m128i srcTimesDstA = _mm_mullo_epi16(sc, da);
+    __m128i dstTimesSrcA = _mm_mullo_epi16(dc, sa);
+    __m128i smaller = SkMin32_SSE2(srcTimesDstA, dstTimesSrcA);
+    __m128i twiceScaled = _mm_slli_epi32(SkDiv255Round_SSE2(smaller), 1);
+
+    __m128i sum = _mm_add_epi32(sc, dc);
+    __m128i raw = _mm_sub_epi32(sum, twiceScaled);
+    return clamp_signed_byte_SSE2(raw);
+}
+
+// Difference: src-over alpha plus difference_byte_SSE2() on each of R, G and B.
+static __m128i difference_modeproc_SSE2(const __m128i& src,
+                                        const __m128i& dst) {
+    __m128i srcA = SkGetPackedA32_SSE2(src);
+    __m128i dstA = SkGetPackedA32_SSE2(dst);
+    __m128i red = difference_byte_SSE2(SkGetPackedR32_SSE2(src),
+                                       SkGetPackedR32_SSE2(dst), srcA, dstA);
+    __m128i green = difference_byte_SSE2(SkGetPackedG32_SSE2(src),
+                                         SkGetPackedG32_SSE2(dst), srcA, dstA);
+    __m128i blue = difference_byte_SSE2(SkGetPackedB32_SSE2(src),
+                                        SkGetPackedB32_SSE2(dst), srcA, dstA);
+    __m128i alpha = srcover_byte_SSE2(srcA, dstA);
+    return SkPackARGB32_SSE2(alpha, red, green, blue);
+}
+
+// Exclusion, one channel per 32-bit lane: clamp_div255round(255 * (sc + dc) - 2 * sc * dc).
+// Both alpha arguments are unused; presumably kept so the signature matches the other helpers.
+static inline __m128i exclusion_byte_SSE2(const __m128i& sc, const __m128i& dc,
+                                          const __m128i& /*sa*/, const __m128i& /*da*/) {
+    __m128i tmp1 = _mm_mullo_epi16(_mm_set1_epi32(255), sc); // 255 * sc
+    __m128i tmp2 = _mm_mullo_epi16(_mm_set1_epi32(255), dc); // 255 * dc
+    tmp1 = _mm_add_epi32(tmp1, tmp2);                        // 255 * (sc + dc)
+    tmp2 = _mm_slli_epi32(_mm_mullo_epi16(sc, dc), 1);       // 2 * sc * dc
+    __m128i r = _mm_sub_epi32(tmp1, tmp2);
+    return clamp_div255round_SSE2(r);
+}
+
+// Exclusion: src-over alpha plus exclusion_byte_SSE2() on each of R, G and B.
+static __m128i exclusion_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
+    __m128i srcA = SkGetPackedA32_SSE2(src);
+    __m128i dstA = SkGetPackedA32_SSE2(dst);
+    __m128i red = exclusion_byte_SSE2(SkGetPackedR32_SSE2(src),
+                                      SkGetPackedR32_SSE2(dst), srcA, dstA);
+    __m128i green = exclusion_byte_SSE2(SkGetPackedG32_SSE2(src),
+                                        SkGetPackedG32_SSE2(dst), srcA, dstA);
+    __m128i blue = exclusion_byte_SSE2(SkGetPackedB32_SSE2(src),
+                                       SkGetPackedB32_SSE2(dst), srcA, dstA);
+    __m128i alpha = srcover_byte_SSE2(srcA, dstA);
+    return SkPackARGB32_SSE2(alpha, red, green, blue);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+typedef __m128i (*SkXfermodeProcSIMD)(const __m128i& src, const __m128i& dst);
+
+extern SkXfermodeProcSIMD gSSE2XfermodeProcs[];
+
+SkSSE2ProcCoeffXfermode::SkSSE2ProcCoeffXfermode(SkReadBuffer& buffer)
+    : INHERITED(buffer)
+    , fProcSIMD(reinterpret_cast<void*>(gSSE2XfermodeProcs[this->getMode()])) {
+    buffer.validate(NULL != fProcSIMD);  // a NULL table entry is reported through validate()
+}
+
+void SkSSE2ProcCoeffXfermode::xfer32(SkPMColor dst[], const SkPMColor src[],
+                                     int count, const SkAlpha aa[]) const {
+    SkASSERT(dst && src && count >= 0);
+    // Without aa: 4 pixels per procSIMD call once dst is aligned. With aa: scalar proc only.
+    SkXfermodeProc proc = this->getProc();
+    SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD);
+    SkASSERT(procSIMD != NULL);
+
+    if (NULL == aa) {
+        if (count >= 4) {
+            while (((size_t)dst & 0x0F) != 0) {  // scalar until dst sits on a 16-byte boundary
+                *dst = proc(*src, *dst);
+                dst++;
+                src++;
+                count--;
+            }
+
+            const __m128i* s = reinterpret_cast<const __m128i*>(src);
+            __m128i* d = reinterpret_cast<__m128i*>(dst);
+
+            while (count >= 4) {
+                __m128i src_pixel = _mm_loadu_si128(s++);  // src is read with an unaligned load
+                __m128i dst_pixel = _mm_load_si128(d);     // dst was aligned by the loop above
+
+                dst_pixel = procSIMD(src_pixel, dst_pixel);
+                _mm_store_si128(d++, dst_pixel);
+                count -= 4;
+            }
+            // Hand the advanced positions back to the scalar pointers for the tail loop.
+            src = reinterpret_cast<const SkPMColor*>(s);
+            dst = reinterpret_cast<SkPMColor*>(d);
+        }
+
+        for (int i = count - 1; i >= 0; --i) {  // whatever the SIMD path did not consume
+            *dst = proc(*src, *dst);
+            dst++;
+            src++;
+        }
+    } else {
+        for (int i = count - 1; i >= 0; --i) {
+            unsigned a = aa[i];
+            if (0 != a) {  // aa[i] == 0 leaves dst[i] untouched
+                SkPMColor dstC = dst[i];
+                SkPMColor C = proc(src[i], dstC);
+                if (a != 0xFF) {  // any other value except 0xFF goes through SkFourByteInterp
+                    C = SkFourByteInterp(C, dstC, a);
+                }
+                dst[i] = C;
+            }
+        }
+    }
+}
+
+void SkSSE2ProcCoeffXfermode::xfer16(uint16_t dst[], const SkPMColor src[],
+                                     int count, const SkAlpha aa[]) const {
+    SkASSERT(dst && src && count >= 0);
+    // Without aa: 8 pixels per iteration once dst is aligned. With aa: scalar proc only.
+    SkXfermodeProc proc = this->getProc();
+    SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD);
+    SkASSERT(procSIMD != NULL);
+
+    if (NULL == aa) {
+        if (count >= 8) {
+            while (((size_t)dst & 0x0F) != 0) {  // scalar until dst sits on a 16-byte boundary
+                SkPMColor dstC = SkPixel16ToPixel32(*dst);
+                *dst = SkPixel32ToPixel16_ToU16(proc(*src, dstC));
+                dst++;
+                src++;
+                count--;
+            }
+
+            const __m128i* s = reinterpret_cast<const __m128i*>(src);
+            __m128i* d = reinterpret_cast<__m128i*>(dst);
+
+            while (count >= 8) {
+                __m128i src_pixel1 = _mm_loadu_si128(s++);  // first 4 source pixels (unaligned load)
+                __m128i src_pixel2 = _mm_loadu_si128(s++);  // next 4 source pixels
+                __m128i dst_pixel = _mm_load_si128(d);      // 8 16-bit destination pixels
+                // Zero-extend each 16-bit destination pixel into its own 32-bit lane.
+                __m128i dst_pixel1 = _mm_unpacklo_epi16(dst_pixel, _mm_setzero_si128());
+                __m128i dst_pixel2 = _mm_unpackhi_epi16(dst_pixel, _mm_setzero_si128());
+
+                __m128i dstC1 = SkPixel16ToPixel32_SSE2(dst_pixel1);
+                __m128i dstC2 = SkPixel16ToPixel32_SSE2(dst_pixel2);
+
+                dst_pixel1 = procSIMD(src_pixel1, dstC1);
+                dst_pixel2 = procSIMD(src_pixel2, dstC2);
+                dst_pixel = SkPixel32ToPixel16_ToU16_SSE2(dst_pixel1, dst_pixel2);
+
+                _mm_store_si128(d++, dst_pixel);
+                count -= 8;
+            }
+            // Hand the advanced positions back to the scalar pointers for the tail loop.
+            src = reinterpret_cast<const SkPMColor*>(s);
+            dst = reinterpret_cast<uint16_t*>(d);
+        }
+
+        for (int i = count - 1; i >= 0; --i) {  // whatever the SIMD path did not consume
+            SkPMColor dstC = SkPixel16ToPixel32(*dst);
+            *dst = SkPixel32ToPixel16_ToU16(proc(*src, dstC));
+            dst++;
+            src++;
+        }
+    } else {
+        for (int i = count - 1; i >= 0; --i) {
+            unsigned a = aa[i];
+            if (0 != a) {  // aa[i] == 0 leaves dst[i] untouched
+                SkPMColor dstC = SkPixel16ToPixel32(dst[i]);
+                SkPMColor C = proc(src[i], dstC);
+                if (0xFF != a) {  // any other value except 0xFF goes through SkFourByteInterp
+                    C = SkFourByteInterp(C, dstC, a);
+                }
+                dst[i] = SkPixel32ToPixel16_ToU16(C);
+            }
+        }
+    }
+}
+
+#ifndef SK_IGNORE_TO_STRING
+void SkSSE2ProcCoeffXfermode::toString(SkString* str) const {
+    INHERITED::toString(str);  // forwarded unchanged to the base class
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+
+// 4 pixels modeprocs with SSE2, indexed by SkXfermode::Mode. NULL means no SSE2 proc for that mode.
+SkXfermodeProcSIMD gSSE2XfermodeProcs[] = {
+    NULL, // kClear_Mode
+    NULL, // kSrc_Mode
+    NULL, // kDst_Mode
+    srcover_modeproc_SSE2,
+    dstover_modeproc_SSE2,
+    srcin_modeproc_SSE2,
+    dstin_modeproc_SSE2,
+    srcout_modeproc_SSE2,
+    dstout_modeproc_SSE2,
+    srcatop_modeproc_SSE2,
+    dstatop_modeproc_SSE2,
+    xor_modeproc_SSE2,
+    plus_modeproc_SSE2,
+    modulate_modeproc_SSE2,
+    screen_modeproc_SSE2,
+
+    overlay_modeproc_SSE2,
+    darken_modeproc_SSE2,
+    lighten_modeproc_SSE2,
+    colordodge_modeproc_SSE2,
+    colorburn_modeproc_SSE2,
+    hardlight_modeproc_SSE2,
+    softlight_modeproc_SSE2,
+    difference_modeproc_SSE2,
+    exclusion_modeproc_SSE2,
+    multiply_modeproc_SSE2,
+    // NOTE(review): entry order must match the SkXfermode::Mode enum - confirm when modes change.
+    NULL, // kHue_Mode
+    NULL, // kSaturation_Mode
+    NULL, // kColor_Mode
+    NULL, // kLuminosity_Mode
+};
+
+// Returns a new SSE2 xfermode for |mode|, or NULL when the table has no SSE2 proc for it.
+SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl_SSE2(const ProcCoeff& rec,
+                                                         SkXfermode::Mode mode) {
+    void* simdProc = reinterpret_cast<void*>(gSSE2XfermodeProcs[mode]);
+    if (NULL == simdProc) {
+        return NULL;
+    }
+    return SkNEW_ARGS(SkSSE2ProcCoeffXfermode, (rec, mode, simdProc));
+}
diff --git a/chromium/third_party/skia/src/opts/SkXfermode_opts_SSE2.h b/chromium/third_party/skia/src/opts/SkXfermode_opts_SSE2.h
new file mode 100644
index 00000000000..bfc143937a8
--- /dev/null
+++ b/chromium/third_party/skia/src/opts/SkXfermode_opts_SSE2.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2014 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkXfermode_opts_SSE2_DEFINED
+#define SkXfermode_opts_SSE2_DEFINED
+
+#include "SkTypes.h"
+#include "SkXfermode_proccoeff.h"
+
+class SK_API SkSSE2ProcCoeffXfermode : public SkProcCoeffXfermode {  // adds an SSE2 proc pointer
+public:
+    SkSSE2ProcCoeffXfermode(const ProcCoeff& rec, SkXfermode::Mode mode,
+                            void* procSIMD)
+        : INHERITED(rec, mode), fProcSIMD(procSIMD) {}
+    // Blit overrides; their definitions cast fProcSIMD back to a function pointer.
+    virtual void xfer32(SkPMColor dst[], const SkPMColor src[], int count,
+                        const SkAlpha aa[]) const SK_OVERRIDE;
+    virtual void xfer16(uint16_t dst[], const SkPMColor src[],
+                        int count, const SkAlpha aa[]) const SK_OVERRIDE;
+
+    SK_TO_STRING_OVERRIDE()
+    SK_DECLARE_PUBLIC_FLATTENABLE_DESERIALIZATION_PROCS(SkSSE2ProcCoeffXfermode)
+
+private:
+    SkSSE2ProcCoeffXfermode(SkReadBuffer& buffer);  // deserialization: looks the proc up by mode
+
+    void* fProcSIMD;  // NOTE(review): void* presumably keeps <emmintrin.h> out of this header - confirm
+    typedef SkProcCoeffXfermode INHERITED;
+};
+
+SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl_SSE2(const ProcCoeff& rec,
+                                                         SkXfermode::Mode mode);
+
+#endif // SkXfermode_opts_SSE2_DEFINED
diff --git a/chromium/third_party/skia/src/opts/SkXfermode_opts_arm_neon.cpp b/chromium/third_party/skia/src/opts/SkXfermode_opts_arm_neon.cpp
index 6a79b737263..70e92af66bc 100644
--- a/chromium/third_party/skia/src/opts/SkXfermode_opts_arm_neon.cpp
+++ b/chromium/third_party/skia/src/opts/SkXfermode_opts_arm_neon.cpp
@@ -41,8 +41,13 @@ static inline uint16x8_t SkAlphaMulAlpha_neon8_16(uint8x8_t color, uint8x8_t alp
 static inline uint8x8_t SkDiv255Round_neon8_32_8(int32x4_t p1, int32x4_t p2) {
     uint16x8_t tmp;
 
+#ifdef SK_CPU_ARM64
+    tmp = vmovn_high_u32(vmovn_u32(vreinterpretq_u32_s32(p1)),
+                         vreinterpretq_u32_s32(p2));
+#else
     tmp = vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(p1)),
                        vmovn_u32(vreinterpretq_u32_s32(p2)));
+#endif
 
     tmp += vdupq_n_u16(128);
     tmp += vshrq_n_u16(tmp, 8);
@@ -66,7 +71,11 @@ static inline uint8x8_t clamp_div255round_simd8_32(int32x4_t val1, int32x4_t val
     // Test if <= 0
     cmp1 = vcleq_s32(val1, vdupq_n_s32(0));
     cmp2 = vcleq_s32(val2, vdupq_n_s32(0));
+#ifdef SK_CPU_ARM64
+    cmp16 = vmovn_high_u32(vmovn_u32(cmp1), cmp2);
+#else
     cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2));
+#endif
     cmp8_1 = vmovn_u16(cmp16);
 
     // Init to zero
@@ -75,7 +84,11 @@ static inline uint8x8_t clamp_div255round_simd8_32(int32x4_t val1, int32x4_t val
     // Test if >= 255*255
     cmp1 = vcgeq_s32(val1, vdupq_n_s32(255*255));
     cmp2 = vcgeq_s32(val2, vdupq_n_s32(255*255));
+#ifdef SK_CPU_ARM64
+    cmp16 = vmovn_high_u32(vmovn_u32(cmp1), cmp2);
+#else
     cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2));
+#endif
     cmp8 = vmovn_u16(cmp16);
 
     // Insert 255 where true
@@ -409,11 +422,19 @@ static inline uint8x8_t overlay_hardlight_color(uint8x8_t sc, uint8x8_t dc,
     if (overlay) {
         dc2 = vshll_n_u8(dc, 1);
         scdc2_1 = vmull_u16(vget_low_u16(dc2), vget_low_u16(vmovl_u8(sc)));
+#ifdef SK_CPU_ARM64
+        scdc2_2 = vmull_high_u16(dc2, vmovl_u8(sc));
+#else
         scdc2_2 = vmull_u16(vget_high_u16(dc2), vget_high_u16(vmovl_u8(sc)));
+#endif
     } else {
         sc2 = vshll_n_u8(sc, 1);
         scdc2_1 = vmull_u16(vget_low_u16(sc2), vget_low_u16(vmovl_u8(dc)));
+#ifdef SK_CPU_ARM64
+        scdc2_2 = vmull_high_u16(sc2, vmovl_u8(dc));
+#else
         scdc2_2 = vmull_u16(vget_high_u16(sc2), vget_high_u16(vmovl_u8(dc)));
+#endif
     }
 
     // Calc COM
@@ -421,12 +442,20 @@ static inline uint8x8_t overlay_hardlight_color(uint8x8_t sc, uint8x8_t dc,
     com1 = vreinterpretq_s32_u32(
                 vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc)));
     com2 = vreinterpretq_s32_u32(
+#ifdef SK_CPU_ARM64
+                vmull_high_u16(const255, sc_plus_dc));
+#else
                 vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc)));
+#endif
 
     // Calc SUB
     int32x4_t sub1, sub2;
     sub1 = vreinterpretq_s32_u32(vaddl_u16(vget_low_u16(scda), vget_low_u16(dcsa)));
+#ifdef SK_CPU_ARM64
+    sub2 = vreinterpretq_s32_u32(vaddl_high_u16(scda, dcsa));
+#else
     sub2 = vreinterpretq_s32_u32(vaddl_u16(vget_high_u16(scda), vget_high_u16(dcsa)));
+#endif
     sub1 = vsubq_s32(sub1, vreinterpretq_s32_u32(scdc2_1));
     sub2 = vsubq_s32(sub2, vreinterpretq_s32_u32(scdc2_2));
 
@@ -444,10 +473,14 @@ static inline uint8x8_t overlay_hardlight_color(uint8x8_t sc, uint8x8_t dc,
     int32x4_t val2_1, val2_2;
     uint32x4_t cmp1, cmp2;
 
-    cmp1 = vmovl_u16(vget_low_u16(cmp));
-    cmp1 |= vshlq_n_u32(cmp1, 16);
-    cmp2 = vmovl_u16(vget_high_u16(cmp));
-    cmp2 |= vshlq_n_u32(cmp2, 16);
+    // Doing a signed lengthening allows to save a few instructions
+    // thanks to sign extension.
+    cmp1 = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_low_u16(cmp))));
+#ifdef SK_CPU_ARM64
+    cmp2 = vreinterpretq_u32_s32(vmovl_high_s16(vreinterpretq_s16_u16(cmp)));
+#else
+    cmp2 = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_high_u16(cmp))));
+#endif
 
     // Calc COM - SUB
     val1_1 = com1 - sub1;
@@ -458,7 +491,11 @@ static inline uint8x8_t overlay_hardlight_color(uint8x8_t sc, uint8x8_t dc,
     val2_2 = com2 + sub2;
 
     val2_1 = vsubq_s32(val2_1, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sada))));
+#ifdef SK_CPU_ARM64
+    val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_high_u16(sada)));
+#else
     val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sada))));
+#endif
 
     // Insert where needed
     val1_1 = vbslq_s32(cmp1, val1_1, val2_1);
@@ -628,11 +665,19 @@ static inline uint8x8_t exclusion_color(uint8x8_t sc, uint8x8_t dc,
     term1_1 = vreinterpretq_s32_u32(
                 vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc)));
     term1_2 = vreinterpretq_s32_u32(
+#ifdef SK_CPU_ARM64
+                vmull_high_u16(const255, sc_plus_dc));
+#else
                 vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc)));
+#endif
 
     /* Calc the second term */
     term2_1 = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(scdc), 1));
+#ifdef SK_CPU_ARM64
+    term2_2 = vreinterpretq_s32_u32(vshll_high_n_u16(scdc, 1));
+#else
     term2_2 = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(scdc), 1));
+#endif
 
     return clamp_div255round_simd8_32(term1_1 - term2_1, term1_2 - term2_2);
 }
@@ -661,10 +706,18 @@ static inline uint8x8_t blendfunc_multiply_color(uint8x8_t sc, uint8x8_t dc,
     scdc = vmull_u8(sc, dc);
 
     val1 = vaddl_u16(vget_low_u16(t1), vget_low_u16(t2));
+#ifdef SK_CPU_ARM64
+    val2 = vaddl_high_u16(t1, t2);
+#else
     val2 = vaddl_u16(vget_high_u16(t1), vget_high_u16(t2));
+#endif
 
     val1 = vaddw_u16(val1, vget_low_u16(scdc));
+#ifdef SK_CPU_ARM64
+    val2 = vaddw_high_u16(val2, scdc);
+#else
     val2 = vaddw_u16(val2, vget_high_u16(scdc));
+#endif
 
     return clamp_div255round_simd8_32(
                 vreinterpretq_s32_u32(val1), vreinterpretq_s32_u32(val2));
@@ -690,7 +743,7 @@ typedef uint8x8x4_t (*SkXfermodeProcSIMD)(uint8x8x4_t src, uint8x8x4_t dst);
 
 extern SkXfermodeProcSIMD gNEONXfermodeProcs[];
 
-SkNEONProcCoeffXfermode::SkNEONProcCoeffXfermode(SkFlattenableReadBuffer& buffer)
+SkNEONProcCoeffXfermode::SkNEONProcCoeffXfermode(SkReadBuffer& buffer)
         : INHERITED(buffer) {
     fProcSIMD = reinterpret_cast<void*>(gNEONXfermodeProcs[this->getMode()]);
 }
@@ -708,6 +761,10 @@ void SkNEONProcCoeffXfermode::xfer32(SkPMColor dst[], const SkPMColor src[],
         while (count >= 8) {
             uint8x8x4_t vsrc, vdst, vres;
 
+#ifdef SK_CPU_ARM64
+            vsrc = vld4_u8((uint8_t*)src);
+            vdst = vld4_u8((uint8_t*)dst);
+#else
 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
             asm volatile (
                 "vld4.u8    %h[vsrc], [%[src]]!  \t\n"
@@ -740,6 +797,7 @@ void SkNEONProcCoeffXfermode::xfer32(SkPMColor dst[], const SkPMColor src[],
             vsrc.val[2] = d2; vdst.val[2] = d6;
             vsrc.val[3] = d3; vdst.val[3] = d7;
 #endif
+#endif // #ifdef SK_CPU_ARM64
 
             vres = procSIMD(vsrc, vdst);
 
@@ -747,6 +805,9 @@ void SkNEONProcCoeffXfermode::xfer32(SkPMColor dst[], const SkPMColor src[],
 
             count -= 8;
             dst += 8;
+#ifdef SK_CPU_ARM64
+            src += 8;
+#endif
         }
         // Leftovers
         for (int i = 0; i < count; i++) {
@@ -783,6 +844,9 @@ void SkNEONProcCoeffXfermode::xfer16(uint16_t* SK_RESTRICT dst,
 
             vdst = vld1q_u16(dst);
 
+#ifdef SK_CPU_ARM64
+            vsrc = vld4_u8((uint8_t*)src);
+#else
 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
             asm volatile (
                 "vld4.u8    %h[vsrc], [%[src]]!  \t\n"
@@ -806,6 +870,7 @@ void SkNEONProcCoeffXfermode::xfer16(uint16_t* SK_RESTRICT dst,
             vsrc.val[2] = d2;
             vsrc.val[3] = d3;
 #endif
+#endif // #ifdef SK_CPU_ARM64
 
             vdst32 = SkPixel16ToPixel32_neon8(vdst);
             vres = procSIMD(vsrc, vdst32);
@@ -815,6 +880,9 @@ void SkNEONProcCoeffXfermode::xfer16(uint16_t* SK_RESTRICT dst,
 
             count -= 8;
             dst += 8;
+#ifdef SK_CPU_ARM64
+            src += 8;
+#endif
         }
         for (int i = 0; i < count; i++) {
             SkPMColor dstC = SkPixel16ToPixel32(dst[i]);
@@ -835,7 +903,7 @@ void SkNEONProcCoeffXfermode::xfer16(uint16_t* SK_RESTRICT dst,
     }
 }
 
-#ifdef SK_DEVELOPER
+#ifndef SK_IGNORE_TO_STRING
 void SkNEONProcCoeffXfermode::toString(SkString* str) const {
     this->INHERITED::toString(str);
 }
diff --git a/chromium/third_party/skia/src/opts/SkXfermode_opts_arm_neon.h b/chromium/third_party/skia/src/opts/SkXfermode_opts_arm_neon.h
index a8d438195eb..8f3aaaea9d9 100644
--- a/chromium/third_party/skia/src/opts/SkXfermode_opts_arm_neon.h
+++ b/chromium/third_party/skia/src/opts/SkXfermode_opts_arm_neon.h
@@ -14,11 +14,11 @@ public:
     virtual void xfer16(uint16_t* SK_RESTRICT dst, const SkPMColor* SK_RESTRICT src,
                         int count, const SkAlpha* SK_RESTRICT aa) const SK_OVERRIDE;
 
-    SK_DEVELOPER_TO_STRING()
+    SK_TO_STRING_OVERRIDE()
     SK_DECLARE_PUBLIC_FLATTENABLE_DESERIALIZATION_PROCS(SkNEONProcCoeffXfermode)
 
 private:
-    SkNEONProcCoeffXfermode(SkFlattenableReadBuffer& buffer);
+    SkNEONProcCoeffXfermode(SkReadBuffer& buffer);
 
     // void* is used to avoid pulling arm_neon.h in the core and having to build
     // it with -mfpu=neon.
diff --git a/chromium/third_party/skia/src/opts/opts_check_SSE2.cpp b/chromium/third_party/skia/src/opts/opts_check_SSE2.cpp
deleted file mode 100644
index aaf6b2ef824..00000000000
--- a/chromium/third_party/skia/src/opts/opts_check_SSE2.cpp
+++ /dev/null
@@ -1,294 +0,0 @@
-/*
- * Copyright 2009 The Android Open Source Project
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#include "SkBitmapProcState_opts_SSE2.h"
-#include "SkBitmapProcState_opts_SSSE3.h"
-#include "SkBitmapFilter_opts_SSE2.h"
-#include "SkBlitMask.h"
-#include "SkBlitRow.h"
-#include "SkBlitRect_opts_SSE2.h"
-#include "SkBlitRow_opts_SSE2.h"
-#include "SkBlurImage_opts_SSE2.h"
-#include "SkUtils_opts_SSE2.h"
-#include "SkUtils.h"
-#include "SkMorphology_opts.h"
-#include "SkMorphology_opts_SSE2.h"
-
-#include "SkRTConf.h"
-
-#if defined(_MSC_VER) && defined(_WIN64)
-#include <intrin.h>
-#endif
-
-/* This file must *not* be compiled with -msse or -msse2, otherwise
-   gcc may generate sse2 even for scalar ops (and thus give an invalid
-   instruction on Pentium3 on the code below).  Only files named *_SSE2.cpp
-   in this directory should be compiled with -msse2. */
-
-
-#ifdef _MSC_VER
-static inline void getcpuid(int info_type, int info[4]) {
-#if defined(_WIN64)
-    __cpuid(info, info_type);
-#else
-    __asm {
-        mov    eax, [info_type]
-        cpuid
-        mov    edi, [info]
-        mov    [edi], eax
-        mov    [edi+4], ebx
-        mov    [edi+8], ecx
-        mov    [edi+12], edx
-    }
-#endif
-}
-#else
-#if defined(__x86_64__)
-static inline void getcpuid(int info_type, int info[4]) {
-    asm volatile (
-        "cpuid \n\t"
-        : "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3])
-        : "a"(info_type)
-    );
-}
-#else
-static inline void getcpuid(int info_type, int info[4]) {
-    // We save and restore ebx, so this code can be compatible with -fPIC
-    asm volatile (
-        "pushl %%ebx      \n\t"
-        "cpuid            \n\t"
-        "movl %%ebx, %1   \n\t"
-        "popl %%ebx       \n\t"
-        : "=a"(info[0]), "=r"(info[1]), "=c"(info[2]), "=d"(info[3])
-        : "a"(info_type)
-    );
-}
-#endif
-#endif
-
-#if defined(__x86_64__) || defined(_WIN64) || SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
-/* All x86_64 machines have SSE2, or we know it's supported at compile time,  so don't even bother checking. */
-static inline bool hasSSE2() {
-    return true;
-}
-#else
-
-static inline bool hasSSE2() {
-    int cpu_info[4] = { 0 };
-    getcpuid(1, cpu_info);
-    return (cpu_info[3] & (1<<26)) != 0;
-}
-#endif
-
-#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
-/* If we know SSSE3 is supported at compile time, don't even bother checking. */
-static inline bool hasSSSE3() {
-    return true;
-}
-#else
-
-static inline bool hasSSSE3() {
-    int cpu_info[4] = { 0 };
-    getcpuid(1, cpu_info);
-    return (cpu_info[2] & 0x200) != 0;
-}
-#endif
-
-static bool cachedHasSSE2() {
-    static bool gHasSSE2 = hasSSE2();
-    return gHasSSE2;
-}
-
-static bool cachedHasSSSE3() {
-    static bool gHasSSSE3 = hasSSSE3();
-    return gHasSSSE3;
-}
-
-SK_CONF_DECLARE( bool, c_hqfilter_sse, "bitmap.filter.highQualitySSE", false, "Use SSE optimized version of high quality image filters");
-
-void SkBitmapProcState::platformConvolutionProcs(SkConvolutionProcs* procs) {
-    if (cachedHasSSE2()) {
-        procs->fExtraHorizontalReads = 3;
-        procs->fConvolveVertically = &convolveVertically_SSE2;
-        procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_SSE2;
-        procs->fConvolveHorizontally = &convolveHorizontally_SSE2;
-        procs->fApplySIMDPadding = &applySIMDPadding_SSE2;
-    }
-}
-
-void SkBitmapProcState::platformProcs() {
-    if (cachedHasSSSE3()) {
-        if (fSampleProc32 == S32_opaque_D32_filter_DX) {
-            fSampleProc32 = S32_opaque_D32_filter_DX_SSSE3;
-        } else if (fSampleProc32 == S32_alpha_D32_filter_DX) {
-            fSampleProc32 = S32_alpha_D32_filter_DX_SSSE3;
-        }
-
-        if (fSampleProc32 == S32_opaque_D32_filter_DXDY) {
-            fSampleProc32 = S32_opaque_D32_filter_DXDY_SSSE3;
-        } else if (fSampleProc32 == S32_alpha_D32_filter_DXDY) {
-            fSampleProc32 = S32_alpha_D32_filter_DXDY_SSSE3;
-        }
-    } else if (cachedHasSSE2()) {
-        if (fSampleProc32 == S32_opaque_D32_filter_DX) {
-            fSampleProc32 = S32_opaque_D32_filter_DX_SSE2;
-        } else if (fSampleProc32 == S32_alpha_D32_filter_DX) {
-            fSampleProc32 = S32_alpha_D32_filter_DX_SSE2;
-        }
-
-        if (fSampleProc16 == S32_D16_filter_DX) {
-            fSampleProc16 = S32_D16_filter_DX_SSE2;
-        }
-    }
-
-    if (cachedHasSSSE3() || cachedHasSSE2()) {
-        if (fMatrixProc == ClampX_ClampY_filter_scale) {
-            fMatrixProc = ClampX_ClampY_filter_scale_SSE2;
-        } else if (fMatrixProc == ClampX_ClampY_nofilter_scale) {
-            fMatrixProc = ClampX_ClampY_nofilter_scale_SSE2;
-        }
-
-        if (fMatrixProc == ClampX_ClampY_filter_affine) {
-            fMatrixProc = ClampX_ClampY_filter_affine_SSE2;
-        } else if (fMatrixProc == ClampX_ClampY_nofilter_affine) {
-            fMatrixProc = ClampX_ClampY_nofilter_affine_SSE2;
-        }
-        if (c_hqfilter_sse) {
-            if (fShaderProc32 == highQualityFilter32) {
-                fShaderProc32 = highQualityFilter_SSE2;
-            }
-        }
-    }
-}
-
-static SkBlitRow::Proc32 platform_32_procs[] = {
-    NULL,                               // S32_Opaque,
-    S32_Blend_BlitRow32_SSE2,           // S32_Blend,
-    S32A_Opaque_BlitRow32_SSE2,         // S32A_Opaque
-    S32A_Blend_BlitRow32_SSE2,          // S32A_Blend,
-};
-
-SkBlitRow::Proc SkBlitRow::PlatformProcs565(unsigned flags) {
-    return NULL;
-}
-
-SkBlitRow::ColorProc SkBlitRow::PlatformColorProc() {
-    if (cachedHasSSE2()) {
-        return Color32_SSE2;
-    } else {
-        return NULL;
-    }
-}
-
-SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
-    if (cachedHasSSE2()) {
-        return platform_32_procs[flags];
-    } else {
-        return NULL;
-    }
-}
-
-
-SkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkBitmap::Config dstConfig,
-                                                     SkMask::Format maskFormat,
-                                                     SkColor color) {
-    if (SkMask::kA8_Format != maskFormat) {
-        return NULL;
-    }
-
-    ColorProc proc = NULL;
-    if (cachedHasSSE2()) {
-        switch (dstConfig) {
-            case SkBitmap::kARGB_8888_Config:
-                // The SSE2 version is not (yet) faster for black, so we check
-                // for that.
-                if (SK_ColorBLACK != color) {
-                    proc = SkARGB32_A8_BlitMask_SSE2;
-                }
-                break;
-            default:
-                break;
-        }
-    }
-    return proc;
-}
-
-SkBlitMask::BlitLCD16RowProc SkBlitMask::PlatformBlitRowProcs16(bool isOpaque) {
-    if (cachedHasSSE2()) {
-        if (isOpaque) {
-            return SkBlitLCD16OpaqueRow_SSE2;
-        } else {
-            return SkBlitLCD16Row_SSE2;
-        }
-    } else {
-        return NULL;
-    }
-
-}
-SkBlitMask::RowProc SkBlitMask::PlatformRowProcs(SkBitmap::Config dstConfig,
-                                                 SkMask::Format maskFormat,
-                                                 RowFlags flags) {
-    return NULL;
-}
-
-SkMemset16Proc SkMemset16GetPlatformProc() {
-    if (cachedHasSSE2()) {
-        return sk_memset16_SSE2;
-    } else {
-        return NULL;
-    }
-}
-
-SkMemset32Proc SkMemset32GetPlatformProc() {
-    if (cachedHasSSE2()) {
-        return sk_memset32_SSE2;
-    } else {
-        return NULL;
-    }
-}
-
-SkMorphologyProc SkMorphologyGetPlatformProc(SkMorphologyProcType type) {
-    if (!cachedHasSSE2()) {
-        return NULL;
-    }
-    switch (type) {
-        case kDilateX_SkMorphologyProcType:
-            return SkDilateX_SSE2;
-        case kDilateY_SkMorphologyProcType:
-            return SkDilateY_SSE2;
-        case kErodeX_SkMorphologyProcType:
-            return SkErodeX_SSE2;
-        case kErodeY_SkMorphologyProcType:
-            return SkErodeY_SSE2;
-        default:
-            return NULL;
-    }
-}
-
-bool SkBoxBlurGetPlatformProcs(SkBoxBlurProc* boxBlurX,
-                               SkBoxBlurProc* boxBlurY,
-                               SkBoxBlurProc* boxBlurXY,
-                               SkBoxBlurProc* boxBlurYX) {
-#ifdef SK_DISABLE_BLUR_DIVISION_OPTIMIZATION
-    return false;
-#else
-    if (!cachedHasSSE2()) {
-        return false;
-    }
-    return SkBoxBlurGetPlatformProcs_SSE2(boxBlurX, boxBlurY, boxBlurXY, boxBlurYX);
-#endif
-}
-
-SkBlitRow::ColorRectProc PlatformColorRectProcFactory(); // suppress warning
-
-SkBlitRow::ColorRectProc PlatformColorRectProcFactory() {
-    if (cachedHasSSE2()) {
-        return ColorRect32_SSE2;
-    } else {
-        return NULL;
-    }
-}
diff --git a/chromium/third_party/skia/src/opts/opts_check_arm.cpp b/chromium/third_party/skia/src/opts/opts_check_arm.cpp
deleted file mode 100644
index 3a322aa0e9b..00000000000
--- a/chromium/third_party/skia/src/opts/opts_check_arm.cpp
+++ /dev/null
@@ -1,110 +0,0 @@
-/***************************************************************************
- * Copyright (c) 2010, Code Aurora Forum. All rights reserved.
- * Copyright 2006-2010, The Android Open Source Project
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- ***************************************************************************/
-
-/* Changes:
- * 2011-04-01 ARM
- *    Merged the functions from src/opts/opts_check_arm_neon.cpp
- *    Modified to return ARM version of memset16 and memset32 if no neon
- *    available in the core
- */
-
-#include "SkBlitRow.h"
-#include "SkUtils.h"
-
-#include "SkUtilsArm.h"
-#include "SkMorphology_opts.h"
-#include "SkMorphology_opts_neon.h"
-#include "SkBlurImage_opts_neon.h"
-
-#if defined(SK_CPU_LENDIAN) && !SK_ARM_NEON_IS_NONE
-extern "C" void memset16_neon(uint16_t dst[], uint16_t value, int count);
-extern "C" void memset32_neon(uint32_t dst[], uint32_t value, int count);
-#endif
-
-#if defined(SK_CPU_LENDIAN)
-extern "C" void arm_memset16(uint16_t* dst, uint16_t value, int count);
-extern "C" void arm_memset32(uint32_t* dst, uint32_t value, int count);
-#endif
-
-SkMemset16Proc SkMemset16GetPlatformProc() {
-    // FIXME: memset.arm.S is using syntax incompatible with XCode
-#if !defined(SK_CPU_LENDIAN) || defined(SK_BUILD_FOR_IOS)
-    return NULL;
-#elif SK_ARM_NEON_IS_DYNAMIC
-    if (sk_cpu_arm_has_neon()) {
-        return memset16_neon;
-    } else {
-        return arm_memset16;
-    }
-#elif SK_ARM_NEON_IS_ALWAYS
-    return memset16_neon;
-#else
-    return arm_memset16;
-#endif
-}
-
-SkMemset32Proc SkMemset32GetPlatformProc() {
-    // FIXME: memset.arm.S is using syntax incompatible with XCode
-#if !defined(SK_CPU_LENDIAN) || defined(SK_BUILD_FOR_IOS)
-    return NULL;
-#elif SK_ARM_NEON_IS_DYNAMIC
-    if (sk_cpu_arm_has_neon()) {
-        return memset32_neon;
-    } else {
-        return arm_memset32;
-    }
-#elif SK_ARM_NEON_IS_ALWAYS
-    return memset32_neon;
-#else
-    return arm_memset32;
-#endif
-}
-
-SkBlitRow::ColorRectProc PlatformColorRectProcFactory() {
-    return NULL;
-}
-
-SkMorphologyProc SkMorphologyGetPlatformProc(SkMorphologyProcType type) {
-#if SK_ARM_NEON_IS_NONE
-    return NULL;
-#else
-#if SK_ARM_NEON_IS_DYNAMIC
-    if (!sk_cpu_arm_has_neon()) {
-        return NULL;
-    }
-#endif
-    switch (type) {
-        case kDilateX_SkMorphologyProcType:
-            return SkDilateX_neon;
-        case kDilateY_SkMorphologyProcType:
-            return SkDilateY_neon;
-        case kErodeX_SkMorphologyProcType:
-            return SkErodeX_neon;
-        case kErodeY_SkMorphologyProcType:
-            return SkErodeY_neon;
-        default:
-            return NULL;
-    }
-#endif
-}
-
-bool SkBoxBlurGetPlatformProcs(SkBoxBlurProc* boxBlurX,
-                               SkBoxBlurProc* boxBlurY,
-                               SkBoxBlurProc* boxBlurXY,
-                               SkBoxBlurProc* boxBlurYX) {
-#if SK_ARM_NEON_IS_NONE
-    return false;
-#else
-#if SK_ARM_NEON_IS_DYNAMIC
-    if (!sk_cpu_arm_has_neon()) {
-        return false;
-    }
-#endif
-    return SkBoxBlurGetPlatformProcs_NEON(boxBlurX, boxBlurY, boxBlurXY, boxBlurYX);
-#endif
-}
diff --git a/chromium/third_party/skia/src/opts/opts_check_x86.cpp b/chromium/third_party/skia/src/opts/opts_check_x86.cpp
new file mode 100644
index 00000000000..6af47729cd0
--- /dev/null
+++ b/chromium/third_party/skia/src/opts/opts_check_x86.cpp
@@ -0,0 +1,379 @@
+/*
+ * Copyright 2009 The Android Open Source Project
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "SkBitmapFilter_opts_SSE2.h"
+#include "SkBitmapProcState_opts_SSE2.h"
+#include "SkBitmapProcState_opts_SSSE3.h"
+#include "SkBlitMask.h"
+#include "SkBlitRect_opts_SSE2.h"
+#include "SkBlitRow.h"
+#include "SkBlitRow_opts_SSE2.h"
+#include "SkBlurImage_opts_SSE2.h"
+#include "SkMorphology_opts.h"
+#include "SkMorphology_opts_SSE2.h"
+#include "SkRTConf.h"
+#include "SkUtils.h"
+#include "SkUtils_opts_SSE2.h"
+#include "SkXfermode.h"
+#include "SkXfermode_proccoeff.h"
+
+#if defined(_MSC_VER) && defined(_WIN64)
+#include <intrin.h>
+#endif
+
+/* This file must *not* be compiled with -msse or any other optional SIMD
+   extension, otherwise gcc may generate SIMD instructions even for scalar ops
+   (and thus give an invalid instruction on Pentium3 on the code below).
+   For example, only files named *_SSE2.cpp in this directory should be
+   compiled with -msse2 or higher. */
+
+
+/* Function to get the CPU SSE-level in runtime, for different compilers. */
+#ifdef _MSC_VER
+static inline void getcpuid(int info_type, int info[4]) {
+#if defined(_WIN64)
+    __cpuid(info, info_type);
+#else
+    __asm {
+        mov    eax, [info_type]
+        cpuid
+        mov    edi, [info]
+        mov    [edi], eax
+        mov    [edi+4], ebx
+        mov    [edi+8], ecx
+        mov    [edi+12], edx
+    }
+#endif
+}
+#elif defined(__x86_64__)
+static inline void getcpuid(int info_type, int info[4]) {
+    asm volatile (
+        "cpuid \n\t"
+        : "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3])
+        : "a"(info_type)
+    );
+}
+#else
+static inline void getcpuid(int info_type, int info[4]) {
+    // We save and restore ebx, so this code can be compatible with -fPIC
+    asm volatile (
+        "pushl %%ebx      \n\t"
+        "cpuid            \n\t"
+        "movl %%ebx, %1   \n\t"
+        "popl %%ebx       \n\t"
+        : "=a"(info[0]), "=r"(info[1]), "=c"(info[2]), "=d"(info[3])
+        : "a"(info_type)
+    );
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+
+/* Fetch the SIMD level directly from the CPU, at run-time.
+ * Only checks the levels needed by the optimizations in this file.
+ */
+static int get_SIMD_level() {
+    int cpu_info[4] = { 0 };
+
+    getcpuid(1, cpu_info);
+    if ((cpu_info[2] & (1<<20)) != 0) {
+        return SK_CPU_SSE_LEVEL_SSE42;
+    } else if ((cpu_info[2] & (1<<9)) != 0) {
+        return SK_CPU_SSE_LEVEL_SSSE3;
+    } else if ((cpu_info[3] & (1<<26)) != 0) {
+        return SK_CPU_SSE_LEVEL_SSE2;
+    } else {
+        return 0;
+    }
+}
+
+/* Verify that the requested SIMD level is supported in the build.
+ * If not, check if the platform supports it.
+ */
+static inline bool supports_simd(int minLevel) {
+#if defined(SK_CPU_SSE_LEVEL)
+    if (minLevel <= SK_CPU_SSE_LEVEL) {
+        return true;
+    } else
+#endif
+    {
+#if defined(SK_BUILD_FOR_ANDROID_FRAMEWORK)
+        /* For the Android framework we should always know at compile time if the device
+         * we are building for supports SSSE3.  The one exception to this rule is on the
+         * emulator where we are compiled without the -mssse3 option (so we have no
+         * SSSE3 procs) but can be run on a host machine that supports SSSE3
+         * instructions. So for that particular case we disable our SSSE3 options.
+         */
+        return false;
+#else
+        static int gSIMDLevel = get_SIMD_level();
+        return (minLevel <= gSIMDLevel);
+#endif
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+SK_CONF_DECLARE( bool, c_hqfilter_sse, "bitmap.filter.highQualitySSE", false, "Use SSE optimized version of high quality image filters");
+
+void SkBitmapProcState::platformConvolutionProcs(SkConvolutionProcs* procs) {
+    if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
+        procs->fExtraHorizontalReads = 3;
+        procs->fConvolveVertically = &convolveVertically_SSE2;
+        procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_SSE2;
+        procs->fConvolveHorizontally = &convolveHorizontally_SSE2;
+        procs->fApplySIMDPadding = &applySIMDPadding_SSE2;
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+void SkBitmapProcState::platformProcs() {
+    /* Every optimization in the function requires at least SSE2 */
+    if (!supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
+        return;
+    }
+
+    /* Check fSampleProc32 */
+    if (fSampleProc32 == S32_opaque_D32_filter_DX) {
+        if (supports_simd(SK_CPU_SSE_LEVEL_SSSE3)) {
+            fSampleProc32 = S32_opaque_D32_filter_DX_SSSE3;
+        } else {
+            fSampleProc32 = S32_opaque_D32_filter_DX_SSE2;
+        }
+    } else if (fSampleProc32 == S32_opaque_D32_filter_DXDY) {
+        if (supports_simd(SK_CPU_SSE_LEVEL_SSSE3)) {
+            fSampleProc32 = S32_opaque_D32_filter_DXDY_SSSE3;
+        }
+    } else if (fSampleProc32 == S32_alpha_D32_filter_DX) {
+        if (supports_simd(SK_CPU_SSE_LEVEL_SSSE3)) {
+            fSampleProc32 = S32_alpha_D32_filter_DX_SSSE3;
+        } else {
+            fSampleProc32 = S32_alpha_D32_filter_DX_SSE2;
+        }
+    } else if (fSampleProc32 == S32_alpha_D32_filter_DXDY) {
+        if (supports_simd(SK_CPU_SSE_LEVEL_SSSE3)) {
+            fSampleProc32 = S32_alpha_D32_filter_DXDY_SSSE3;
+        }
+    }
+
+    /* Check fSampleProc16 */
+    if (fSampleProc16 == S32_D16_filter_DX) {
+        fSampleProc16 = S32_D16_filter_DX_SSE2;
+    }
+
+    /* Check fMatrixProc */
+    if (fMatrixProc == ClampX_ClampY_filter_scale) {
+        fMatrixProc = ClampX_ClampY_filter_scale_SSE2;
+    } else if (fMatrixProc == ClampX_ClampY_nofilter_scale) {
+        fMatrixProc = ClampX_ClampY_nofilter_scale_SSE2;
+    } else if (fMatrixProc == ClampX_ClampY_filter_affine) {
+        fMatrixProc = ClampX_ClampY_filter_affine_SSE2;
+    } else if (fMatrixProc == ClampX_ClampY_nofilter_affine) {
+        fMatrixProc = ClampX_ClampY_nofilter_affine_SSE2;
+    }
+
+    /* Check fShaderProc32 */
+    if (c_hqfilter_sse) {
+        if (fShaderProc32 == highQualityFilter32) {
+            fShaderProc32 = highQualityFilter_SSE2;
+        }
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+static SkBlitRow::Proc platform_16_procs[] = {
+    S32_D565_Opaque_SSE2,               // S32_D565_Opaque
+    NULL,                               // S32_D565_Blend
+    S32A_D565_Opaque_SSE2,              // S32A_D565_Opaque
+    NULL,                               // S32A_D565_Blend
+    S32_D565_Opaque_Dither_SSE2,        // S32_D565_Opaque_Dither
+    NULL,                               // S32_D565_Blend_Dither
+    S32A_D565_Opaque_Dither_SSE2,       // S32A_D565_Opaque_Dither
+    NULL,                               // S32A_D565_Blend_Dither
+};
+
+SkBlitRow::Proc SkBlitRow::PlatformProcs565(unsigned flags) {
+    if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
+        return platform_16_procs[flags];
+    } else {
+        return NULL;
+    }
+}
+
+static SkBlitRow::Proc32 platform_32_procs[] = {
+    NULL,                               // S32_Opaque,
+    S32_Blend_BlitRow32_SSE2,           // S32_Blend,
+    S32A_Opaque_BlitRow32_SSE2,         // S32A_Opaque
+    S32A_Blend_BlitRow32_SSE2,          // S32A_Blend,
+};
+
+SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
+    if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
+        return platform_32_procs[flags];
+    } else {
+        return NULL;
+    }
+}
+
+SkBlitRow::ColorProc SkBlitRow::PlatformColorProc() {
+    if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
+        return Color32_SSE2;
+    } else {
+        return NULL;
+    }
+}
+
+SkBlitRow::ColorRectProc PlatformColorRectProcFactory(); // suppress warning
+
+SkBlitRow::ColorRectProc PlatformColorRectProcFactory() {
+/* Return NULL for now, since the optimized path in ColorRect32_SSE2 is disabled.
+    if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
+        return ColorRect32_SSE2;
+    } else {
+        return NULL;
+    }
+*/
+    return NULL;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+SkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkColorType dstCT,
+                                                     SkMask::Format maskFormat,
+                                                     SkColor color) {
+    if (SkMask::kA8_Format != maskFormat) {
+        return NULL;
+    }
+
+    ColorProc proc = NULL;
+    if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
+        switch (dstCT) {
+            case kN32_SkColorType:
+                // The SSE2 version is not (yet) faster for black, so we check
+                // for that.
+                if (SK_ColorBLACK != color) {
+                    proc = SkARGB32_A8_BlitMask_SSE2;
+                }
+                break;
+            default:
+                break;
+        }
+    }
+    return proc;
+}
+
+SkBlitMask::BlitLCD16RowProc SkBlitMask::PlatformBlitRowProcs16(bool isOpaque) {
+    if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
+        if (isOpaque) {
+            return SkBlitLCD16OpaqueRow_SSE2;
+        } else {
+            return SkBlitLCD16Row_SSE2;
+        }
+    } else {
+        return NULL;
+    }
+
+}
+
+SkBlitMask::RowProc SkBlitMask::PlatformRowProcs(SkColorType, SkMask::Format, RowFlags) {
+    return NULL;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+SkMemset16Proc SkMemset16GetPlatformProc() {
+    if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
+        return sk_memset16_SSE2;
+    } else {
+        return NULL;
+    }
+}
+
+SkMemset32Proc SkMemset32GetPlatformProc() {
+    if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
+        return sk_memset32_SSE2;
+    } else {
+        return NULL;
+    }
+}
+
+SkMemcpy32Proc SkMemcpy32GetPlatformProc() {
+    if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
+        return sk_memcpy32_SSE2;
+    } else {
+        return NULL;
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+SkMorphologyImageFilter::Proc SkMorphologyGetPlatformProc(SkMorphologyProcType type) {
+    if (!supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
+        return NULL;
+    }
+    switch (type) {
+        case kDilateX_SkMorphologyProcType:
+            return SkDilateX_SSE2;
+        case kDilateY_SkMorphologyProcType:
+            return SkDilateY_SSE2;
+        case kErodeX_SkMorphologyProcType:
+            return SkErodeX_SSE2;
+        case kErodeY_SkMorphologyProcType:
+            return SkErodeY_SSE2;
+        default:
+            return NULL;
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+bool SkBoxBlurGetPlatformProcs(SkBoxBlurProc* boxBlurX,
+                               SkBoxBlurProc* boxBlurY,
+                               SkBoxBlurProc* boxBlurXY,
+                               SkBoxBlurProc* boxBlurYX) {
+#ifdef SK_DISABLE_BLUR_DIVISION_OPTIMIZATION
+    return false;
+#else
+    if (!supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
+        return false;
+    }
+    return SkBoxBlurGetPlatformProcs_SSE2(boxBlurX, boxBlurY, boxBlurXY, boxBlurYX);
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+extern SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl_SSE2(const ProcCoeff& rec,
+                                                                SkXfermode::Mode mode);
+
+SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl(const ProcCoeff& rec,
+                                                    SkXfermode::Mode mode);
+
+SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl(const ProcCoeff& rec,
+                                                    SkXfermode::Mode mode) {
+    return NULL;
+}
+
+SkProcCoeffXfermode* SkPlatformXfermodeFactory(const ProcCoeff& rec,
+                                               SkXfermode::Mode mode);
+
+SkProcCoeffXfermode* SkPlatformXfermodeFactory(const ProcCoeff& rec,
+                                               SkXfermode::Mode mode) {
+    if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
+        return SkPlatformXfermodeFactory_impl_SSE2(rec, mode);
+    } else {
+        return SkPlatformXfermodeFactory_impl(rec, mode);
+    }
+}
+
+SkXfermodeProc SkPlatformXfermodeProcFactory(SkXfermode::Mode mode);
+
+SkXfermodeProc SkPlatformXfermodeProcFactory(SkXfermode::Mode mode) {
+    return NULL;
+}
author	Jocelyn Turcotte <jocelyn.turcotte@digia.com>	2014-08-08 14:30:41 +0200
committer	Jocelyn Turcotte <jocelyn.turcotte@digia.com>	2014-08-12 13:49:54 +0200
commit	ab0a50979b9eb4dfa3320eff7e187e41efedf7a9 (patch)
tree	498dfb8a97ff3361a9f7486863a52bb4e26bb898 /chromium/third_party/skia/src/opts
parent	4ce69f7403811819800e7c5ae1318b2647e778d1 (diff)