author     Jocelyn Turcotte <jocelyn.turcotte@digia.com>  2014-08-08 14:30:41 +0200
committer  Jocelyn Turcotte <jocelyn.turcotte@digia.com>  2014-08-12 13:49:54 +0200
commit     ab0a50979b9eb4dfa3320eff7e187e41efedf7a9 (patch)
tree       498dfb8a97ff3361a9f7486863a52bb4e26bb898 /chromium/third_party/skia/src/opts
parent     4ce69f7403811819800e7c5ae1318b2647e778d1 (diff)
Update Chromium to beta version 37.0.2062.68
Change-Id: I188e3b5aff1bec75566014291b654eb19f5bc8ca
Reviewed-by: Andras Becsi <andras.becsi@digia.com>
Diffstat (limited to 'chromium/third_party/skia/src/opts')
-rw-r--r--  chromium/third_party/skia/src/opts/SkBitmapFilter_opts_SSE2.cpp          793
-rw-r--r--  chromium/third_party/skia/src/opts/SkBitmapFilter_opts_SSE2.h              6
-rw-r--r--  chromium/third_party/skia/src/opts/SkBitmapProcState_filter_neon.h        22
-rw-r--r--  chromium/third_party/skia/src/opts/SkBitmapProcState_matrixProcs_neon.cpp 140
-rw-r--r--  chromium/third_party/skia/src/opts/SkBitmapProcState_matrix_clamp_neon.h  911
-rw-r--r--  chromium/third_party/skia/src/opts/SkBitmapProcState_matrix_neon.h        506
-rw-r--r--  chromium/third_party/skia/src/opts/SkBitmapProcState_matrix_repeat_neon.h 542
-rw-r--r--  chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSE2.cpp         32
-rw-r--r--  chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSE2.h           11
-rw-r--r--  chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSSE3.cpp        47
-rw-r--r--  chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSSE3.h           5
-rw-r--r--  chromium/third_party/skia/src/opts/SkBitmapProcState_opts_arm.cpp          12
-rw-r--r--  chromium/third_party/skia/src/opts/SkBlitMask_opts_arm.cpp                 15
-rw-r--r--  chromium/third_party/skia/src/opts/SkBlitMask_opts_none.cpp                10
-rw-r--r--  chromium/third_party/skia/src/opts/SkBlitRect_opts_SSE2.cpp                21
-rw-r--r--  chromium/third_party/skia/src/opts/SkBlitRect_opts_SSE2.h                   8
-rw-r--r--  chromium/third_party/skia/src/opts/SkBlitRow_opts_SSE2.cpp                524
-rw-r--r--  chromium/third_party/skia/src/opts/SkBlitRow_opts_SSE2.h                   18
-rw-r--r--  chromium/third_party/skia/src/opts/SkBlitRow_opts_arm.cpp                   6
-rw-r--r--  chromium/third_party/skia/src/opts/SkBlitRow_opts_arm_neon.cpp            875
-rw-r--r--  chromium/third_party/skia/src/opts/SkBlitRow_opts_mips_dsp.cpp            848
-rw-r--r--  chromium/third_party/skia/src/opts/SkBlurImage_opts_SSE2.cpp               25
-rw-r--r--  chromium/third_party/skia/src/opts/SkBlurImage_opts_SSE2.h                  5
-rw-r--r--  chromium/third_party/skia/src/opts/SkBlurImage_opts_arm.cpp                25
-rw-r--r--  chromium/third_party/skia/src/opts/SkBlurImage_opts_neon.cpp               93
-rw-r--r--  chromium/third_party/skia/src/opts/SkCachePreload_arm.h                    34
-rw-r--r--  chromium/third_party/skia/src/opts/SkColor_opts_SSE2.h                    186
-rw-r--r--  chromium/third_party/skia/src/opts/SkMath_opts_SSE2.h                      51
-rw-r--r--  chromium/third_party/skia/src/opts/SkMorphology_opts.h                     17
-rw-r--r--  chromium/third_party/skia/src/opts/SkMorphology_opts_SSE2.cpp              12
-rw-r--r--  chromium/third_party/skia/src/opts/SkMorphology_opts_SSE2.h                 7
-rw-r--r--  chromium/third_party/skia/src/opts/SkMorphology_opts_arm.cpp               34
-rw-r--r--  chromium/third_party/skia/src/opts/SkMorphology_opts_none.cpp               2
-rw-r--r--  chromium/third_party/skia/src/opts/SkUtils_opts_SSE2.cpp                   32
-rw-r--r--  chromium/third_party/skia/src/opts/SkUtils_opts_SSE2.h                      6
-rw-r--r--  chromium/third_party/skia/src/opts/SkUtils_opts_arm.cpp                    57
-rw-r--r--  chromium/third_party/skia/src/opts/SkUtils_opts_none.cpp                    4
-rw-r--r--  chromium/third_party/skia/src/opts/SkXfermode_opts_SSE2.cpp               819
-rw-r--r--  chromium/third_party/skia/src/opts/SkXfermode_opts_SSE2.h                  38
-rw-r--r--  chromium/third_party/skia/src/opts/SkXfermode_opts_arm_neon.cpp            80
-rw-r--r--  chromium/third_party/skia/src/opts/SkXfermode_opts_arm_neon.h               4
-rw-r--r--  chromium/third_party/skia/src/opts/opts_check_SSE2.cpp                    294
-rw-r--r--  chromium/third_party/skia/src/opts/opts_check_arm.cpp                     110
-rw-r--r--  chromium/third_party/skia/src/opts/opts_check_x86.cpp                     379
44 files changed, 4870 insertions, 2796 deletions
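
For orientation, the convolveHorizontally_SSE2 / convolveVertically_SSE2 routines touched below implement a fixed-point convolution that, per output pixel, reduces to a plain scalar multiply-accumulate over the four RGBA channels. The sketch below is an illustrative C++ equivalent added for this summary, not code from the change: convolveHorizontallyScalar is a hypothetical name, and kShiftBits = 14 is assumed to match SkConvolutionFilter1D::kShiftBits.

#include <algorithm>
#include <cstdint>

// Assumed to match SkConvolutionFilter1D::kShiftBits.
static const int kShiftBits = 14;

// Hypothetical scalar reference for one output pixel of the horizontal pass.
static void convolveHorizontallyScalar(const unsigned char* src_data,
                                       int filter_offset, int filter_length,
                                       const int16_t* filter_values,
                                       unsigned char* out_pixel) {
    // One 32-bit accumulator per RGBA channel, like |accum| in the SSE2 path.
    int32_t accum[4] = {0, 0, 0, 0};
    const unsigned char* row = &src_data[filter_offset << 2];  // 4 bytes per pixel
    for (int i = 0; i < filter_length; ++i) {
        for (int c = 0; c < 4; ++c) {
            accum[c] += static_cast<int32_t>(row[i * 4 + c]) * filter_values[i];
        }
    }
    for (int c = 0; c < 4; ++c) {
        // Arithmetic shift for the fixed-point representation, then saturate to
        // 8 bits (mirrors _mm_srai_epi32 / _mm_packs_epi32 / _mm_packus_epi16).
        int v = accum[c] >> kShiftBits;
        out_pixel[c] = static_cast<unsigned char>(std::max(0, std::min(255, v)));
    }
}

The SSE2 code below performs the same arithmetic sixteen multiplies at a time by unpacking four pixels to 16 bits per channel and combining _mm_mullo_epi16 / _mm_mulhi_epi16 into 32-bit partial products.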
diff --git a/chromium/third_party/skia/src/opts/SkBitmapFilter_opts_SSE2.cpp b/chromium/third_party/skia/src/opts/SkBitmapFilter_opts_SSE2.cpp
index 259e2efc0ec..b0405669218 100644
--- a/chromium/third_party/skia/src/opts/SkBitmapFilter_opts_SSE2.cpp
+++ b/chromium/third_party/skia/src/opts/SkBitmapFilter_opts_SSE2.cpp
@@ -5,17 +5,15 @@
* found in the LICENSE file.
*/
-#include "SkBitmapProcState.h"
+#include <emmintrin.h>
#include "SkBitmap.h"
+#include "SkBitmapFilter_opts_SSE2.h"
+#include "SkBitmapProcState.h"
#include "SkColor.h"
#include "SkColorPriv.h"
-#include "SkUnPreMultiply.h"
-#include "SkShader.h"
#include "SkConvolver.h"
-
-#include "SkBitmapFilter_opts_SSE2.h"
-
-#include <emmintrin.h>
+#include "SkShader.h"
+#include "SkUnPreMultiply.h"
#if 0
static inline void print128i(__m128i value) {
@@ -175,7 +173,6 @@ void highQualityFilter_ScaleOnly_SSE2(const SkBitmapProcState &s, int x, int y,
s.fInvProc(s.fInvMatrix, SkIntToScalar(x),
SkIntToScalar(y), &srcPt);
-
}
}
@@ -185,126 +182,126 @@ void convolveHorizontally_SSE2(const unsigned char* src_data,
const SkConvolutionFilter1D& filter,
unsigned char* out_row,
bool /*has_alpha*/) {
- int num_values = filter.numValues();
-
- int filter_offset, filter_length;
- __m128i zero = _mm_setzero_si128();
- __m128i mask[4];
- // |mask| will be used to decimate all extra filter coefficients that are
- // loaded by SIMD when |filter_length| is not divisible by 4.
- // mask[0] is not used in following algorithm.
- mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
- mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
- mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
-
- // Output one pixel each iteration, calculating all channels (RGBA) together.
- for (int out_x = 0; out_x < num_values; out_x++) {
- const SkConvolutionFilter1D::ConvolutionFixed* filter_values =
- filter.FilterForValue(out_x, &filter_offset, &filter_length);
-
- __m128i accum = _mm_setzero_si128();
-
- // Compute the first pixel in this row that the filter affects. It will
- // touch |filter_length| pixels (4 bytes each) after this.
- const __m128i* row_to_filter =
- reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]);
-
- // We will load and accumulate with four coefficients per iteration.
- for (int filter_x = 0; filter_x < filter_length >> 2; filter_x++) {
-
- // Load 4 coefficients => duplicate 1st and 2nd of them for all channels.
- __m128i coeff, coeff16;
- // [16] xx xx xx xx c3 c2 c1 c0
- coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
- // [16] xx xx xx xx c1 c1 c0 c0
- coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
- // [16] c1 c1 c1 c1 c0 c0 c0 c0
- coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
-
- // Load four pixels => unpack the first two pixels to 16 bits =>
- // multiply with coefficients => accumulate the convolution result.
- // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
- __m128i src8 = _mm_loadu_si128(row_to_filter);
- // [16] a1 b1 g1 r1 a0 b0 g0 r0
- __m128i src16 = _mm_unpacklo_epi8(src8, zero);
- __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
- __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
- // [32] a0*c0 b0*c0 g0*c0 r0*c0
- __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
- accum = _mm_add_epi32(accum, t);
- // [32] a1*c1 b1*c1 g1*c1 r1*c1
- t = _mm_unpackhi_epi16(mul_lo, mul_hi);
- accum = _mm_add_epi32(accum, t);
-
- // Duplicate 3rd and 4th coefficients for all channels =>
- // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients
- // => accumulate the convolution results.
- // [16] xx xx xx xx c3 c3 c2 c2
- coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
- // [16] c3 c3 c3 c3 c2 c2 c2 c2
- coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
- // [16] a3 g3 b3 r3 a2 g2 b2 r2
- src16 = _mm_unpackhi_epi8(src8, zero);
- mul_hi = _mm_mulhi_epi16(src16, coeff16);
- mul_lo = _mm_mullo_epi16(src16, coeff16);
- // [32] a2*c2 b2*c2 g2*c2 r2*c2
- t = _mm_unpacklo_epi16(mul_lo, mul_hi);
- accum = _mm_add_epi32(accum, t);
- // [32] a3*c3 b3*c3 g3*c3 r3*c3
- t = _mm_unpackhi_epi16(mul_lo, mul_hi);
- accum = _mm_add_epi32(accum, t);
-
- // Advance the pixel and coefficients pointers.
- row_to_filter += 1;
- filter_values += 4;
- }
+ int num_values = filter.numValues();
+
+ int filter_offset, filter_length;
+ __m128i zero = _mm_setzero_si128();
+ __m128i mask[4];
+ // |mask| will be used to decimate all extra filter coefficients that are
+ // loaded by SIMD when |filter_length| is not divisible by 4.
+ // mask[0] is not used in the following algorithm.
+ mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
+ mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
+ mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
+
+ // Output one pixel each iteration, calculating all channels (RGBA) together.
+ for (int out_x = 0; out_x < num_values; out_x++) {
+ const SkConvolutionFilter1D::ConvolutionFixed* filter_values =
+ filter.FilterForValue(out_x, &filter_offset, &filter_length);
+
+ __m128i accum = _mm_setzero_si128();
+
+ // Compute the first pixel in this row that the filter affects. It will
+ // touch |filter_length| pixels (4 bytes each) after this.
+ const __m128i* row_to_filter =
+ reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]);
+
+ // We will load and accumulate with four coefficients per iteration.
+ for (int filter_x = 0; filter_x < filter_length >> 2; filter_x++) {
+
+ // Load 4 coefficients => duplicate 1st and 2nd of them for all channels.
+ __m128i coeff, coeff16;
+ // [16] xx xx xx xx c3 c2 c1 c0
+ coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
+ // [16] xx xx xx xx c1 c1 c0 c0
+ coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
+ // [16] c1 c1 c1 c1 c0 c0 c0 c0
+ coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
+
+ // Load four pixels => unpack the first two pixels to 16 bits =>
+ // multiply with coefficients => accumulate the convolution result.
+ // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
+ __m128i src8 = _mm_loadu_si128(row_to_filter);
+ // [16] a1 b1 g1 r1 a0 b0 g0 r0
+ __m128i src16 = _mm_unpacklo_epi8(src8, zero);
+ __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
+ __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
+ // [32] a0*c0 b0*c0 g0*c0 r0*c0
+ __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+ accum = _mm_add_epi32(accum, t);
+ // [32] a1*c1 b1*c1 g1*c1 r1*c1
+ t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+ accum = _mm_add_epi32(accum, t);
+
+ // Duplicate 3rd and 4th coefficients for all channels =>
+ // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients
+ // => accumulate the convolution results.
+ // [16] xx xx xx xx c3 c3 c2 c2
+ coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
+ // [16] c3 c3 c3 c3 c2 c2 c2 c2
+ coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
+ // [16] a3 g3 b3 r3 a2 g2 b2 r2
+ src16 = _mm_unpackhi_epi8(src8, zero);
+ mul_hi = _mm_mulhi_epi16(src16, coeff16);
+ mul_lo = _mm_mullo_epi16(src16, coeff16);
+ // [32] a2*c2 b2*c2 g2*c2 r2*c2
+ t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+ accum = _mm_add_epi32(accum, t);
+ // [32] a3*c3 b3*c3 g3*c3 r3*c3
+ t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+ accum = _mm_add_epi32(accum, t);
+
+ // Advance the pixel and coefficients pointers.
+ row_to_filter += 1;
+ filter_values += 4;
+ }
- // When |filter_length| is not divisible by 4, we need to decimate some of
- // the filter coefficient that was loaded incorrectly to zero; Other than
- // that the algorithm is same with above, exceot that the 4th pixel will be
- // always absent.
- int r = filter_length&3;
- if (r) {
- // Note: filter_values must be padded to align_up(filter_offset, 8).
- __m128i coeff, coeff16;
- coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
- // Mask out extra filter taps.
- coeff = _mm_and_si128(coeff, mask[r]);
- coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
- coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
-
- // Note: line buffer must be padded to align_up(filter_offset, 16).
- // We resolve this by use C-version for the last horizontal line.
- __m128i src8 = _mm_loadu_si128(row_to_filter);
- __m128i src16 = _mm_unpacklo_epi8(src8, zero);
- __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
- __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
- __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
- accum = _mm_add_epi32(accum, t);
- t = _mm_unpackhi_epi16(mul_lo, mul_hi);
- accum = _mm_add_epi32(accum, t);
-
- src16 = _mm_unpackhi_epi8(src8, zero);
- coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
- coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
- mul_hi = _mm_mulhi_epi16(src16, coeff16);
- mul_lo = _mm_mullo_epi16(src16, coeff16);
- t = _mm_unpacklo_epi16(mul_lo, mul_hi);
- accum = _mm_add_epi32(accum, t);
- }
+ // When |filter_length| is not divisible by 4, we need to decimate some of
+ // the filter coefficients that were loaded incorrectly to zero; other than
+ // that, the algorithm is the same as above, except that the 4th pixel will
+ // always be absent.
+ int r = filter_length&3;
+ if (r) {
+ // Note: filter_values must be padded to align_up(filter_offset, 8).
+ __m128i coeff, coeff16;
+ coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
+ // Mask out extra filter taps.
+ coeff = _mm_and_si128(coeff, mask[r]);
+ coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
+ coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
+
+ // Note: line buffer must be padded to align_up(filter_offset, 16).
+ // We resolve this by using the C version for the last horizontal line.
+ __m128i src8 = _mm_loadu_si128(row_to_filter);
+ __m128i src16 = _mm_unpacklo_epi8(src8, zero);
+ __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
+ __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
+ __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+ accum = _mm_add_epi32(accum, t);
+ t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+ accum = _mm_add_epi32(accum, t);
+
+ src16 = _mm_unpackhi_epi8(src8, zero);
+ coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
+ coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
+ mul_hi = _mm_mulhi_epi16(src16, coeff16);
+ mul_lo = _mm_mullo_epi16(src16, coeff16);
+ t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+ accum = _mm_add_epi32(accum, t);
+ }
- // Shift right for fixed point implementation.
- accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits);
+ // Shift right for fixed point implementation.
+ accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits);
- // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
- accum = _mm_packs_epi32(accum, zero);
- // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
- accum = _mm_packus_epi16(accum, zero);
+ // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
+ accum = _mm_packs_epi32(accum, zero);
+ // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
+ accum = _mm_packus_epi16(accum, zero);
- // Store the pixel value of 32 bits.
- *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum);
- out_row += 4;
- }
+ // Store the pixel value of 32 bits.
+ *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum);
+ out_row += 4;
+ }
}
// Convolves horizontally along four rows. The row data is given in
@@ -314,116 +311,116 @@ void convolveHorizontally_SSE2(const unsigned char* src_data,
void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4],
const SkConvolutionFilter1D& filter,
unsigned char* out_row[4]) {
- int num_values = filter.numValues();
-
- int filter_offset, filter_length;
- __m128i zero = _mm_setzero_si128();
- __m128i mask[4];
- // |mask| will be used to decimate all extra filter coefficients that are
- // loaded by SIMD when |filter_length| is not divisible by 4.
- // mask[0] is not used in following algorithm.
- mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
- mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
- mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
-
- // Output one pixel each iteration, calculating all channels (RGBA) together.
- for (int out_x = 0; out_x < num_values; out_x++) {
- const SkConvolutionFilter1D::ConvolutionFixed* filter_values =
- filter.FilterForValue(out_x, &filter_offset, &filter_length);
-
- // four pixels in a column per iteration.
- __m128i accum0 = _mm_setzero_si128();
- __m128i accum1 = _mm_setzero_si128();
- __m128i accum2 = _mm_setzero_si128();
- __m128i accum3 = _mm_setzero_si128();
- int start = (filter_offset<<2);
- // We will load and accumulate with four coefficients per iteration.
- for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) {
- __m128i coeff, coeff16lo, coeff16hi;
- // [16] xx xx xx xx c3 c2 c1 c0
- coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
- // [16] xx xx xx xx c1 c1 c0 c0
- coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
- // [16] c1 c1 c1 c1 c0 c0 c0 c0
- coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
- // [16] xx xx xx xx c3 c3 c2 c2
- coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
- // [16] c3 c3 c3 c3 c2 c2 c2 c2
- coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
-
- __m128i src8, src16, mul_hi, mul_lo, t;
-
-#define ITERATION(src, accum) \
- src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)); \
- src16 = _mm_unpacklo_epi8(src8, zero); \
- mul_hi = _mm_mulhi_epi16(src16, coeff16lo); \
- mul_lo = _mm_mullo_epi16(src16, coeff16lo); \
- t = _mm_unpacklo_epi16(mul_lo, mul_hi); \
- accum = _mm_add_epi32(accum, t); \
- t = _mm_unpackhi_epi16(mul_lo, mul_hi); \
- accum = _mm_add_epi32(accum, t); \
- src16 = _mm_unpackhi_epi8(src8, zero); \
- mul_hi = _mm_mulhi_epi16(src16, coeff16hi); \
- mul_lo = _mm_mullo_epi16(src16, coeff16hi); \
- t = _mm_unpacklo_epi16(mul_lo, mul_hi); \
- accum = _mm_add_epi32(accum, t); \
- t = _mm_unpackhi_epi16(mul_lo, mul_hi); \
- accum = _mm_add_epi32(accum, t)
-
- ITERATION(src_data[0] + start, accum0);
- ITERATION(src_data[1] + start, accum1);
- ITERATION(src_data[2] + start, accum2);
- ITERATION(src_data[3] + start, accum3);
-
- start += 16;
- filter_values += 4;
- }
+ int num_values = filter.numValues();
+
+ int filter_offset, filter_length;
+ __m128i zero = _mm_setzero_si128();
+ __m128i mask[4];
+ // |mask| will be used to decimate all extra filter coefficients that are
+ // loaded by SIMD when |filter_length| is not divisible by 4.
+ // mask[0] is not used in the following algorithm.
+ mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
+ mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
+ mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
+
+ // Output one pixel each iteration, calculating all channels (RGBA) together.
+ for (int out_x = 0; out_x < num_values; out_x++) {
+ const SkConvolutionFilter1D::ConvolutionFixed* filter_values =
+ filter.FilterForValue(out_x, &filter_offset, &filter_length);
+
+ // four pixels in a column per iteration.
+ __m128i accum0 = _mm_setzero_si128();
+ __m128i accum1 = _mm_setzero_si128();
+ __m128i accum2 = _mm_setzero_si128();
+ __m128i accum3 = _mm_setzero_si128();
+ int start = (filter_offset<<2);
+ // We will load and accumulate with four coefficients per iteration.
+ for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) {
+ __m128i coeff, coeff16lo, coeff16hi;
+ // [16] xx xx xx xx c3 c2 c1 c0
+ coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
+ // [16] xx xx xx xx c1 c1 c0 c0
+ coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
+ // [16] c1 c1 c1 c1 c0 c0 c0 c0
+ coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
+ // [16] xx xx xx xx c3 c3 c2 c2
+ coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
+ // [16] c3 c3 c3 c3 c2 c2 c2 c2
+ coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
+
+ __m128i src8, src16, mul_hi, mul_lo, t;
+
+#define ITERATION(src, accum) \
+ src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)); \
+ src16 = _mm_unpacklo_epi8(src8, zero); \
+ mul_hi = _mm_mulhi_epi16(src16, coeff16lo); \
+ mul_lo = _mm_mullo_epi16(src16, coeff16lo); \
+ t = _mm_unpacklo_epi16(mul_lo, mul_hi); \
+ accum = _mm_add_epi32(accum, t); \
+ t = _mm_unpackhi_epi16(mul_lo, mul_hi); \
+ accum = _mm_add_epi32(accum, t); \
+ src16 = _mm_unpackhi_epi8(src8, zero); \
+ mul_hi = _mm_mulhi_epi16(src16, coeff16hi); \
+ mul_lo = _mm_mullo_epi16(src16, coeff16hi); \
+ t = _mm_unpacklo_epi16(mul_lo, mul_hi); \
+ accum = _mm_add_epi32(accum, t); \
+ t = _mm_unpackhi_epi16(mul_lo, mul_hi); \
+ accum = _mm_add_epi32(accum, t)
+
+ ITERATION(src_data[0] + start, accum0);
+ ITERATION(src_data[1] + start, accum1);
+ ITERATION(src_data[2] + start, accum2);
+ ITERATION(src_data[3] + start, accum3);
+
+ start += 16;
+ filter_values += 4;
+ }
- int r = filter_length & 3;
- if (r) {
- // Note: filter_values must be padded to align_up(filter_offset, 8);
- __m128i coeff;
- coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
- // Mask out extra filter taps.
- coeff = _mm_and_si128(coeff, mask[r]);
-
- __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
- /* c1 c1 c1 c1 c0 c0 c0 c0 */
- coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
- __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
- coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
-
- __m128i src8, src16, mul_hi, mul_lo, t;
-
- ITERATION(src_data[0] + start, accum0);
- ITERATION(src_data[1] + start, accum1);
- ITERATION(src_data[2] + start, accum2);
- ITERATION(src_data[3] + start, accum3);
- }
+ int r = filter_length & 3;
+ if (r) {
+ // Note: filter_values must be padded to align_up(filter_offset, 8);
+ __m128i coeff;
+ coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
+ // Mask out extra filter taps.
+ coeff = _mm_and_si128(coeff, mask[r]);
+
+ __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
+ /* c1 c1 c1 c1 c0 c0 c0 c0 */
+ coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
+ __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
+ coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
+
+ __m128i src8, src16, mul_hi, mul_lo, t;
+
+ ITERATION(src_data[0] + start, accum0);
+ ITERATION(src_data[1] + start, accum1);
+ ITERATION(src_data[2] + start, accum2);
+ ITERATION(src_data[3] + start, accum3);
+ }
- accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
- accum0 = _mm_packs_epi32(accum0, zero);
- accum0 = _mm_packus_epi16(accum0, zero);
- accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
- accum1 = _mm_packs_epi32(accum1, zero);
- accum1 = _mm_packus_epi16(accum1, zero);
- accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
- accum2 = _mm_packs_epi32(accum2, zero);
- accum2 = _mm_packus_epi16(accum2, zero);
- accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits);
- accum3 = _mm_packs_epi32(accum3, zero);
- accum3 = _mm_packus_epi16(accum3, zero);
-
- *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0);
- *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1);
- *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2);
- *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3);
-
- out_row[0] += 4;
- out_row[1] += 4;
- out_row[2] += 4;
- out_row[3] += 4;
- }
+ accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
+ accum0 = _mm_packs_epi32(accum0, zero);
+ accum0 = _mm_packus_epi16(accum0, zero);
+ accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
+ accum1 = _mm_packs_epi32(accum1, zero);
+ accum1 = _mm_packus_epi16(accum1, zero);
+ accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
+ accum2 = _mm_packs_epi32(accum2, zero);
+ accum2 = _mm_packus_epi16(accum2, zero);
+ accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits);
+ accum3 = _mm_packs_epi32(accum3, zero);
+ accum3 = _mm_packus_epi16(accum3, zero);
+
+ *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0);
+ *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1);
+ *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2);
+ *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3);
+
+ out_row[0] += 4;
+ out_row[1] += 4;
+ out_row[2] += 4;
+ out_row[3] += 4;
+ }
}
// Does vertical convolution to produce one output row. The filter values and
@@ -438,166 +435,166 @@ void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filt
unsigned char* const* source_data_rows,
int pixel_width,
unsigned char* out_row) {
- int width = pixel_width & ~3;
-
- __m128i zero = _mm_setzero_si128();
- __m128i accum0, accum1, accum2, accum3, coeff16;
- const __m128i* src;
- // Output four pixels per iteration (16 bytes).
- for (int out_x = 0; out_x < width; out_x += 4) {
-
- // Accumulated result for each pixel. 32 bits per RGBA channel.
- accum0 = _mm_setzero_si128();
- accum1 = _mm_setzero_si128();
- accum2 = _mm_setzero_si128();
- accum3 = _mm_setzero_si128();
-
- // Convolve with one filter coefficient per iteration.
- for (int filter_y = 0; filter_y < filter_length; filter_y++) {
-
- // Duplicate the filter coefficient 8 times.
- // [16] cj cj cj cj cj cj cj cj
- coeff16 = _mm_set1_epi16(filter_values[filter_y]);
-
- // Load four pixels (16 bytes) together.
- // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
- src = reinterpret_cast<const __m128i*>(
- &source_data_rows[filter_y][out_x << 2]);
- __m128i src8 = _mm_loadu_si128(src);
-
- // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channels =>
- // multiply with current coefficient => accumulate the result.
- // [16] a1 b1 g1 r1 a0 b0 g0 r0
- __m128i src16 = _mm_unpacklo_epi8(src8, zero);
- __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
- __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
- // [32] a0 b0 g0 r0
- __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
- accum0 = _mm_add_epi32(accum0, t);
- // [32] a1 b1 g1 r1
- t = _mm_unpackhi_epi16(mul_lo, mul_hi);
- accum1 = _mm_add_epi32(accum1, t);
-
- // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channels =>
- // multiply with current coefficient => accumulate the result.
- // [16] a3 b3 g3 r3 a2 b2 g2 r2
- src16 = _mm_unpackhi_epi8(src8, zero);
- mul_hi = _mm_mulhi_epi16(src16, coeff16);
- mul_lo = _mm_mullo_epi16(src16, coeff16);
- // [32] a2 b2 g2 r2
- t = _mm_unpacklo_epi16(mul_lo, mul_hi);
- accum2 = _mm_add_epi32(accum2, t);
- // [32] a3 b3 g3 r3
- t = _mm_unpackhi_epi16(mul_lo, mul_hi);
- accum3 = _mm_add_epi32(accum3, t);
- }
-
- // Shift right for fixed point implementation.
- accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
- accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
- accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
- accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits);
-
- // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
- // [16] a1 b1 g1 r1 a0 b0 g0 r0
- accum0 = _mm_packs_epi32(accum0, accum1);
- // [16] a3 b3 g3 r3 a2 b2 g2 r2
- accum2 = _mm_packs_epi32(accum2, accum3);
+ int width = pixel_width & ~3;
+
+ __m128i zero = _mm_setzero_si128();
+ __m128i accum0, accum1, accum2, accum3, coeff16;
+ const __m128i* src;
+ // Output four pixels per iteration (16 bytes).
+ for (int out_x = 0; out_x < width; out_x += 4) {
+
+ // Accumulated result for each pixel. 32 bits per RGBA channel.
+ accum0 = _mm_setzero_si128();
+ accum1 = _mm_setzero_si128();
+ accum2 = _mm_setzero_si128();
+ accum3 = _mm_setzero_si128();
+
+ // Convolve with one filter coefficient per iteration.
+ for (int filter_y = 0; filter_y < filter_length; filter_y++) {
+
+ // Duplicate the filter coefficient 8 times.
+ // [16] cj cj cj cj cj cj cj cj
+ coeff16 = _mm_set1_epi16(filter_values[filter_y]);
+
+ // Load four pixels (16 bytes) together.
+ // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
+ src = reinterpret_cast<const __m128i*>(
+ &source_data_rows[filter_y][out_x << 2]);
+ __m128i src8 = _mm_loadu_si128(src);
+
+ // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channel =>
+ // multiply with current coefficient => accumulate the result.
+ // [16] a1 b1 g1 r1 a0 b0 g0 r0
+ __m128i src16 = _mm_unpacklo_epi8(src8, zero);
+ __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
+ __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
+ // [32] a0 b0 g0 r0
+ __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+ accum0 = _mm_add_epi32(accum0, t);
+ // [32] a1 b1 g1 r1
+ t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+ accum1 = _mm_add_epi32(accum1, t);
+
+ // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channel =>
+ // multiply with current coefficient => accumulate the result.
+ // [16] a3 b3 g3 r3 a2 b2 g2 r2
+ src16 = _mm_unpackhi_epi8(src8, zero);
+ mul_hi = _mm_mulhi_epi16(src16, coeff16);
+ mul_lo = _mm_mullo_epi16(src16, coeff16);
+ // [32] a2 b2 g2 r2
+ t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+ accum2 = _mm_add_epi32(accum2, t);
+ // [32] a3 b3 g3 r3
+ t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+ accum3 = _mm_add_epi32(accum3, t);
+ }
- // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
- // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
- accum0 = _mm_packus_epi16(accum0, accum2);
+ // Shift right for fixed point implementation.
+ accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
+ accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
+ accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
+ accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits);
+
+ // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
+ // [16] a1 b1 g1 r1 a0 b0 g0 r0
+ accum0 = _mm_packs_epi32(accum0, accum1);
+ // [16] a3 b3 g3 r3 a2 b2 g2 r2
+ accum2 = _mm_packs_epi32(accum2, accum3);
+
+ // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
+ // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
+ accum0 = _mm_packus_epi16(accum0, accum2);
+
+ if (has_alpha) {
+ // Compute the max(ri, gi, bi) for each pixel.
+ // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
+ __m128i a = _mm_srli_epi32(accum0, 8);
+ // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
+ __m128i b = _mm_max_epu8(a, accum0); // Max of r and g.
+ // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
+ a = _mm_srli_epi32(accum0, 16);
+ // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
+ b = _mm_max_epu8(a, b); // Max of r and g and b.
+ // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
+ b = _mm_slli_epi32(b, 24);
+
+ // Make sure the value of alpha channel is always larger than maximum
+ // value of color channels.
+ accum0 = _mm_max_epu8(b, accum0);
+ } else {
+ // Set value of alpha channels to 0xFF.
+ __m128i mask = _mm_set1_epi32(0xff000000);
+ accum0 = _mm_or_si128(accum0, mask);
+ }
- if (has_alpha) {
- // Compute the max(ri, gi, bi) for each pixel.
- // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
- __m128i a = _mm_srli_epi32(accum0, 8);
- // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
- __m128i b = _mm_max_epu8(a, accum0); // Max of r and g.
- // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
- a = _mm_srli_epi32(accum0, 16);
- // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
- b = _mm_max_epu8(a, b); // Max of r and g and b.
- // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
- b = _mm_slli_epi32(b, 24);
-
- // Make sure the value of alpha channel is always larger than maximum
- // value of color channels.
- accum0 = _mm_max_epu8(b, accum0);
- } else {
- // Set value of alpha channels to 0xFF.
- __m128i mask = _mm_set1_epi32(0xff000000);
- accum0 = _mm_or_si128(accum0, mask);
+ // Store the convolution result (16 bytes) and advance the pixel pointers.
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0);
+ out_row += 16;
}
- // Store the convolution result (16 bytes) and advance the pixel pointers.
- _mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0);
- out_row += 16;
- }
-
- // When the width of the output is not divisible by 4, We need to save one
- // pixel (4 bytes) each time. And also the fourth pixel is always absent.
- if (pixel_width & 3) {
- accum0 = _mm_setzero_si128();
- accum1 = _mm_setzero_si128();
- accum2 = _mm_setzero_si128();
- for (int filter_y = 0; filter_y < filter_length; ++filter_y) {
- coeff16 = _mm_set1_epi16(filter_values[filter_y]);
- // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
- src = reinterpret_cast<const __m128i*>(
- &source_data_rows[filter_y][width<<2]);
- __m128i src8 = _mm_loadu_si128(src);
- // [16] a1 b1 g1 r1 a0 b0 g0 r0
- __m128i src16 = _mm_unpacklo_epi8(src8, zero);
- __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
- __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
- // [32] a0 b0 g0 r0
- __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
- accum0 = _mm_add_epi32(accum0, t);
- // [32] a1 b1 g1 r1
- t = _mm_unpackhi_epi16(mul_lo, mul_hi);
- accum1 = _mm_add_epi32(accum1, t);
- // [16] a3 b3 g3 r3 a2 b2 g2 r2
- src16 = _mm_unpackhi_epi8(src8, zero);
- mul_hi = _mm_mulhi_epi16(src16, coeff16);
- mul_lo = _mm_mullo_epi16(src16, coeff16);
- // [32] a2 b2 g2 r2
- t = _mm_unpacklo_epi16(mul_lo, mul_hi);
- accum2 = _mm_add_epi32(accum2, t);
- }
+ // When the width of the output is not divisible by 4, we need to save one
+ // pixel (4 bytes) at a time, and the fourth pixel is always absent.
+ if (pixel_width & 3) {
+ accum0 = _mm_setzero_si128();
+ accum1 = _mm_setzero_si128();
+ accum2 = _mm_setzero_si128();
+ for (int filter_y = 0; filter_y < filter_length; ++filter_y) {
+ coeff16 = _mm_set1_epi16(filter_values[filter_y]);
+ // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
+ src = reinterpret_cast<const __m128i*>(
+ &source_data_rows[filter_y][width<<2]);
+ __m128i src8 = _mm_loadu_si128(src);
+ // [16] a1 b1 g1 r1 a0 b0 g0 r0
+ __m128i src16 = _mm_unpacklo_epi8(src8, zero);
+ __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
+ __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
+ // [32] a0 b0 g0 r0
+ __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+ accum0 = _mm_add_epi32(accum0, t);
+ // [32] a1 b1 g1 r1
+ t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+ accum1 = _mm_add_epi32(accum1, t);
+ // [16] a3 b3 g3 r3 a2 b2 g2 r2
+ src16 = _mm_unpackhi_epi8(src8, zero);
+ mul_hi = _mm_mulhi_epi16(src16, coeff16);
+ mul_lo = _mm_mullo_epi16(src16, coeff16);
+ // [32] a2 b2 g2 r2
+ t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+ accum2 = _mm_add_epi32(accum2, t);
+ }
- accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
- accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
- accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
- // [16] a1 b1 g1 r1 a0 b0 g0 r0
- accum0 = _mm_packs_epi32(accum0, accum1);
- // [16] a3 b3 g3 r3 a2 b2 g2 r2
- accum2 = _mm_packs_epi32(accum2, zero);
- // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
- accum0 = _mm_packus_epi16(accum0, accum2);
- if (has_alpha) {
- // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
- __m128i a = _mm_srli_epi32(accum0, 8);
- // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
- __m128i b = _mm_max_epu8(a, accum0); // Max of r and g.
- // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
- a = _mm_srli_epi32(accum0, 16);
- // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
- b = _mm_max_epu8(a, b); // Max of r and g and b.
- // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
- b = _mm_slli_epi32(b, 24);
- accum0 = _mm_max_epu8(b, accum0);
- } else {
- __m128i mask = _mm_set1_epi32(0xff000000);
- accum0 = _mm_or_si128(accum0, mask);
- }
+ accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
+ accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
+ accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
+ // [16] a1 b1 g1 r1 a0 b0 g0 r0
+ accum0 = _mm_packs_epi32(accum0, accum1);
+ // [16] a3 b3 g3 r3 a2 b2 g2 r2
+ accum2 = _mm_packs_epi32(accum2, zero);
+ // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
+ accum0 = _mm_packus_epi16(accum0, accum2);
+ if (has_alpha) {
+ // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
+ __m128i a = _mm_srli_epi32(accum0, 8);
+ // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
+ __m128i b = _mm_max_epu8(a, accum0); // Max of r and g.
+ // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
+ a = _mm_srli_epi32(accum0, 16);
+ // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
+ b = _mm_max_epu8(a, b); // Max of r and g and b.
+ // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
+ b = _mm_slli_epi32(b, 24);
+ accum0 = _mm_max_epu8(b, accum0);
+ } else {
+ __m128i mask = _mm_set1_epi32(0xff000000);
+ accum0 = _mm_or_si128(accum0, mask);
+ }
- for (int out_x = width; out_x < pixel_width; out_x++) {
- *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0);
- accum0 = _mm_srli_si128(accum0, 4);
- out_row += 4;
+ for (int out_x = width; out_x < pixel_width; out_x++) {
+ *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0);
+ accum0 = _mm_srli_si128(accum0, 4);
+ out_row += 4;
+ }
}
- }
}
void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filter_values,
@@ -606,19 +603,19 @@ void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filt
int pixel_width,
unsigned char* out_row,
bool has_alpha) {
- if (has_alpha) {
- convolveVertically_SSE2<true>(filter_values,
- filter_length,
- source_data_rows,
- pixel_width,
- out_row);
- } else {
- convolveVertically_SSE2<false>(filter_values,
- filter_length,
- source_data_rows,
- pixel_width,
- out_row);
- }
+ if (has_alpha) {
+ convolveVertically_SSE2<true>(filter_values,
+ filter_length,
+ source_data_rows,
+ pixel_width,
+ out_row);
+ } else {
+ convolveVertically_SSE2<false>(filter_values,
+ filter_length,
+ source_data_rows,
+ pixel_width,
+ out_row);
+ }
}
void applySIMDPadding_SSE2(SkConvolutionFilter1D *filter) {
diff --git a/chromium/third_party/skia/src/opts/SkBitmapFilter_opts_SSE2.h b/chromium/third_party/skia/src/opts/SkBitmapFilter_opts_SSE2.h
index 588f4ef18bb..661a824e227 100644
--- a/chromium/third_party/skia/src/opts/SkBitmapFilter_opts_SSE2.h
+++ b/chromium/third_party/skia/src/opts/SkBitmapFilter_opts_SSE2.h
@@ -1,4 +1,3 @@
-
/*
* Copyright 2013 Google Inc.
*
@@ -6,7 +5,6 @@
* found in the LICENSE file.
*/
-
#ifndef SkBitmapFilter_opts_sse2_DEFINED
#define SkBitmapFilter_opts_sse2_DEFINED
@@ -14,9 +12,9 @@
#include "SkConvolver.h"
void highQualityFilter_ScaleOnly_SSE2(const SkBitmapProcState &s, int x, int y,
- SkPMColor *SK_RESTRICT colors, int count);
+ SkPMColor *SK_RESTRICT colors, int count);
void highQualityFilter_SSE2(const SkBitmapProcState &s, int x, int y,
- SkPMColor *SK_RESTRICT colors, int count);
+ SkPMColor *SK_RESTRICT colors, int count);
void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filter_values,
diff --git a/chromium/third_party/skia/src/opts/SkBitmapProcState_filter_neon.h b/chromium/third_party/skia/src/opts/SkBitmapProcState_filter_neon.h
index e56b683b874..0887145c3d0 100644
--- a/chromium/third_party/skia/src/opts/SkBitmapProcState_filter_neon.h
+++ b/chromium/third_party/skia/src/opts/SkBitmapProcState_filter_neon.h
@@ -17,12 +17,15 @@
* exact results for the color components, but if the 4 incoming colors are
* all opaque, then the output color must also be opaque. Subsequent parts of
* the drawing pipeline may rely on this (e.g. which blitrow proc to use).
+ *
*/
-
-static inline void Filter_32_opaque_neon(unsigned x, unsigned y,
- SkPMColor a00, SkPMColor a01,
- SkPMColor a10, SkPMColor a11,
- SkPMColor *dst) {
+// Chrome on Android uses -Os so we need to force these inline. Otherwise
+// calling the function in the inner loops will cause significant overhead on
+// some platforms.
+static SK_ALWAYS_INLINE void Filter_32_opaque_neon(unsigned x, unsigned y,
+ SkPMColor a00, SkPMColor a01,
+ SkPMColor a10, SkPMColor a11,
+ SkPMColor *dst) {
uint8x8_t vy, vconst16_8, v16_y, vres;
uint16x4_t vx, vconst16_16, v16_x, tmp;
uint32x2_t va0, va1;
@@ -53,10 +56,11 @@ static inline void Filter_32_opaque_neon(unsigned x, unsigned y,
vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); // store result
}
-static inline void Filter_32_alpha_neon(unsigned x, unsigned y,
- SkPMColor a00, SkPMColor a01,
- SkPMColor a10, SkPMColor a11,
- SkPMColor *dst, uint16_t scale) {
+static SK_ALWAYS_INLINE void Filter_32_alpha_neon(unsigned x, unsigned y,
+ SkPMColor a00, SkPMColor a01,
+ SkPMColor a10, SkPMColor a11,
+ SkPMColor *dst,
+ uint16_t scale) {
uint8x8_t vy, vconst16_8, v16_y, vres;
uint16x4_t vx, vconst16_16, v16_x, tmp, vscale;
uint32x2_t va0, va1;
diff --git a/chromium/third_party/skia/src/opts/SkBitmapProcState_matrixProcs_neon.cpp b/chromium/third_party/skia/src/opts/SkBitmapProcState_matrixProcs_neon.cpp
index e81da670526..7789031c028 100644
--- a/chromium/third_party/skia/src/opts/SkBitmapProcState_matrixProcs_neon.cpp
+++ b/chromium/third_party/skia/src/opts/SkBitmapProcState_matrixProcs_neon.cpp
@@ -10,26 +10,140 @@
#include "SkUtilsArm.h"
#include "SkBitmapProcState_utils.h"
+#include <arm_neon.h>
+
extern const SkBitmapProcState::MatrixProc ClampX_ClampY_Procs_neon[];
extern const SkBitmapProcState::MatrixProc RepeatX_RepeatY_Procs_neon[];
static void decal_nofilter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count);
static void decal_filter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count);
-#define MAKENAME(suffix) ClampX_ClampY ## suffix ## _neon
-#define TILEX_PROCF(fx, max) SkClampMax((fx) >> 16, max)
-#define TILEY_PROCF(fy, max) SkClampMax((fy) >> 16, max)
-#define TILEX_LOW_BITS(fx, max) (((fx) >> 12) & 0xF)
-#define TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF)
+// TILEX_PROCF(fx, max) SkClampMax((fx) >> 16, max)
+static inline int16x8_t sbpsm_clamp_tile8(int32x4_t low, int32x4_t high, unsigned max) {
+ int16x8_t res;
+
+ // get the hi 16s of all those 32s
+ res = vuzpq_s16(vreinterpretq_s16_s32(low), vreinterpretq_s16_s32(high)).val[1];
+
+ // clamp
+ res = vmaxq_s16(res, vdupq_n_s16(0));
+ res = vminq_s16(res, vdupq_n_s16(max));
+
+ return res;
+}
+
+// TILEX_PROCF(fx, max) SkClampMax((fx) >> 16, max)
+static inline int32x4_t sbpsm_clamp_tile4(int32x4_t f, unsigned max) {
+ int32x4_t res;
+
+ // get the hi 16s of all those 32s
+ res = vshrq_n_s32(f, 16);
+
+ // clamp
+ res = vmaxq_s32(res, vdupq_n_s32(0));
+ res = vminq_s32(res, vdupq_n_s32(max));
+
+ return res;
+}
+
+// TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF)
+static inline int32x4_t sbpsm_clamp_tile4_low_bits(int32x4_t fx) {
+ int32x4_t ret;
+
+ ret = vshrq_n_s32(fx, 12);
+
+ /* We don't need the mask below because the caller will
+ * overwrite the non-masked bits
+ */
+ //ret = vandq_s32(ret, vdupq_n_s32(0xF));
+
+ return ret;
+}
+
+// TILEX_PROCF(fx, max) (((fx)&0xFFFF)*((max)+1)>> 16)
+static inline int16x8_t sbpsm_repeat_tile8(int32x4_t low, int32x4_t high, unsigned max) {
+ uint16x8_t res;
+ uint32x4_t tmpl, tmph;
+
+ // get the lower 16 bits
+ res = vuzpq_u16(vreinterpretq_u16_s32(low), vreinterpretq_u16_s32(high)).val[0];
+
+ // bare multiplication, not SkFixedMul
+ tmpl = vmull_u16(vget_low_u16(res), vdup_n_u16(max+1));
+ tmph = vmull_u16(vget_high_u16(res), vdup_n_u16(max+1));
+
+ // extraction of the 16 upper bits
+ res = vuzpq_u16(vreinterpretq_u16_u32(tmpl), vreinterpretq_u16_u32(tmph)).val[1];
+
+ return vreinterpretq_s16_u16(res);
+}
+
+// TILEX_PROCF(fx, max) (((fx)&0xFFFF)*((max)+1)>> 16)
+static inline int32x4_t sbpsm_repeat_tile4(int32x4_t f, unsigned max) {
+ uint16x4_t res;
+ uint32x4_t tmp;
+
+ // get the lower 16 bits
+ res = vmovn_u32(vreinterpretq_u32_s32(f));
+
+ // bare multiplication, not SkFixedMul
+ tmp = vmull_u16(res, vdup_n_u16(max+1));
+
+ // extraction of the 16 upper bits
+ tmp = vshrq_n_u32(tmp, 16);
+
+ return vreinterpretq_s32_u32(tmp);
+}
+
+// TILEX_LOW_BITS(fx, max) ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
+static inline int32x4_t sbpsm_repeat_tile4_low_bits(int32x4_t fx, unsigned max) {
+ uint16x4_t res;
+ uint32x4_t tmp;
+ int32x4_t ret;
+
+ // get the lower 16 bits
+ res = vmovn_u32(vreinterpretq_u32_s32(fx));
+
+ // bare multiplication, not SkFixedMul
+ tmp = vmull_u16(res, vdup_n_u16(max + 1));
+
+ // shift and mask
+ ret = vshrq_n_s32(vreinterpretq_s32_u32(tmp), 12);
+
+ /* We don't need the mask below because the caller will
+ * overwrite the non-masked bits
+ */
+ //ret = vandq_s32(ret, vdupq_n_s32(0xF));
+
+ return ret;
+}
+
+#define MAKENAME(suffix) ClampX_ClampY ## suffix ## _neon
+#define TILEX_PROCF(fx, max) SkClampMax((fx) >> 16, max)
+#define TILEY_PROCF(fy, max) SkClampMax((fy) >> 16, max)
+#define TILEX_PROCF_NEON8(l, h, max) sbpsm_clamp_tile8(l, h, max)
+#define TILEY_PROCF_NEON8(l, h, max) sbpsm_clamp_tile8(l, h, max)
+#define TILEX_PROCF_NEON4(fx, max) sbpsm_clamp_tile4(fx, max)
+#define TILEY_PROCF_NEON4(fy, max) sbpsm_clamp_tile4(fy, max)
+#define TILEX_LOW_BITS(fx, max) (((fx) >> 12) & 0xF)
+#define TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF)
+#define TILEX_LOW_BITS_NEON4(fx, max) sbpsm_clamp_tile4_low_bits(fx)
+#define TILEY_LOW_BITS_NEON4(fy, max) sbpsm_clamp_tile4_low_bits(fy)
#define CHECK_FOR_DECAL
-#include "SkBitmapProcState_matrix_clamp_neon.h"
-
-#define MAKENAME(suffix) RepeatX_RepeatY ## suffix ## _neon
-#define TILEX_PROCF(fx, max) SK_USHIFT16(((fx) & 0xFFFF) * ((max) + 1))
-#define TILEY_PROCF(fy, max) SK_USHIFT16(((fy) & 0xFFFF) * ((max) + 1))
-#define TILEX_LOW_BITS(fx, max) ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
-#define TILEY_LOW_BITS(fy, max) ((((fy) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
-#include "SkBitmapProcState_matrix_repeat_neon.h"
+#include "SkBitmapProcState_matrix_neon.h"
+
+#define MAKENAME(suffix) RepeatX_RepeatY ## suffix ## _neon
+#define TILEX_PROCF(fx, max) SK_USHIFT16(((fx) & 0xFFFF) * ((max) + 1))
+#define TILEY_PROCF(fy, max) SK_USHIFT16(((fy) & 0xFFFF) * ((max) + 1))
+#define TILEX_PROCF_NEON8(l, h, max) sbpsm_repeat_tile8(l, h, max)
+#define TILEY_PROCF_NEON8(l, h, max) sbpsm_repeat_tile8(l, h, max)
+#define TILEX_PROCF_NEON4(fx, max) sbpsm_repeat_tile4(fx, max)
+#define TILEY_PROCF_NEON4(fy, max) sbpsm_repeat_tile4(fy, max)
+#define TILEX_LOW_BITS(fx, max) ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
+#define TILEY_LOW_BITS(fy, max) ((((fy) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
+#define TILEX_LOW_BITS_NEON4(fx, max) sbpsm_repeat_tile4_low_bits(fx, max)
+#define TILEY_LOW_BITS_NEON4(fy, max) sbpsm_repeat_tile4_low_bits(fy, max)
+#include "SkBitmapProcState_matrix_neon.h"
diff --git a/chromium/third_party/skia/src/opts/SkBitmapProcState_matrix_clamp_neon.h b/chromium/third_party/skia/src/opts/SkBitmapProcState_matrix_clamp_neon.h
deleted file mode 100644
index a615e26b240..00000000000
--- a/chromium/third_party/skia/src/opts/SkBitmapProcState_matrix_clamp_neon.h
+++ /dev/null
@@ -1,911 +0,0 @@
-/* NEON optimized code (C) COPYRIGHT 2009 Motorola
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-/*
- * Modifications done in-house at Motorola
- *
- * this is a clone of SkBitmapProcState_matrix.h
- * and has been tuned to work with the NEON unit.
- *
- * Still going back and forth between whether this approach
- * (clone the entire SkBitmapProcState_matrix.h file or
- * if I should put just the modified routines in here and
- * then use a construct like #define DONT_DO_THIS_FUNCTION or
- * something like that...
- *
- * This is for the ClampX_ClampY instance
- *
- */
-
-
-#include <arm_neon.h>
-
-/*
- * This has been modified on the knowledge that (at the time)
- * we had the following macro definitions in the parent file
- *
- * #define MAKENAME(suffix) ClampX_ClampY ## suffix
- * #define TILEX_PROCF(fx, max) SkClampMax((fx) >> 16, max)
- * #define TILEY_PROCF(fy, max) SkClampMax((fy) >> 16, max)
- * #define TILEX_LOW_BITS(fx, max) (((fx) >> 12) & 0xF)
- * #define TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF)
- * #define CHECK_FOR_DECAL
- */
-
-/* SkClampMax(val,max) -- bound to 0..max */
-
-#define SCALE_NOFILTER_NAME MAKENAME(_nofilter_scale)
-#define SCALE_FILTER_NAME MAKENAME(_filter_scale)
-#define AFFINE_NOFILTER_NAME MAKENAME(_nofilter_affine)
-#define AFFINE_FILTER_NAME MAKENAME(_filter_affine)
-#define PERSP_NOFILTER_NAME MAKENAME(_nofilter_persp)
-#define PERSP_FILTER_NAME MAKENAME(_filter_persp)
-
-#define PACK_FILTER_X_NAME MAKENAME(_pack_filter_x)
-#define PACK_FILTER_Y_NAME MAKENAME(_pack_filter_y)
-
-#ifndef PREAMBLE
- #define PREAMBLE(state)
- #define PREAMBLE_PARAM_X
- #define PREAMBLE_PARAM_Y
- #define PREAMBLE_ARG_X
- #define PREAMBLE_ARG_Y
-#endif
-
-static void SCALE_NOFILTER_NAME(const SkBitmapProcState& s,
- uint32_t xy[], int count, int x, int y) {
- SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
- SkMatrix::kScale_Mask)) == 0);
-
- PREAMBLE(s);
- // we store y, x, x, x, x, x
-
- const unsigned maxX = s.fBitmap->width() - 1;
- SkFixed fx;
- {
- SkPoint pt;
- s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
- SkIntToScalar(y) + SK_ScalarHalf, &pt);
- fx = SkScalarToFixed(pt.fY);
- const unsigned maxY = s.fBitmap->height() - 1;
- *xy++ = TILEY_PROCF(fx, maxY);
- fx = SkScalarToFixed(pt.fX);
- }
-
- if (0 == maxX) {
- // all of the following X values must be 0
- memset(xy, 0, count * sizeof(uint16_t));
- return;
- }
-
- const SkFixed dx = s.fInvSx;
-
-#ifdef CHECK_FOR_DECAL
- // test if we don't need to apply the tile proc
- if ((unsigned)(fx >> 16) <= maxX &&
- (unsigned)((fx + dx * (count - 1)) >> 16) <= maxX) {
- decal_nofilter_scale_neon(xy, fx, dx, count);
- return;
- }
-#endif
-
- int i;
-
- /* very much like done in decal_nofilter, but with
- * an extra clamping function applied.
- * TILEX_PROCF(fx,max) SkClampMax((fx)>>16, max)
- */
- if (count >= 8) {
- /* SkFixed is 16.16 fixed point */
- SkFixed dx2 = dx+dx;
- SkFixed dx4 = dx2+dx2;
- SkFixed dx8 = dx4+dx4;
-
- /* now build fx/fx+dx/fx+2dx/fx+3dx */
- SkFixed fx1, fx2, fx3;
- int32x4_t lbase, hbase;
- int16_t *dst16 = (int16_t *)xy;
-
- fx1 = fx+dx;
- fx2 = fx1+dx;
- fx3 = fx2+dx;
-
- /* build my template(s) */
- /* avoid the 'lbase unitialized' warning */
- lbase = vdupq_n_s32(fx);
- lbase = vsetq_lane_s32(fx1, lbase, 1);
- lbase = vsetq_lane_s32(fx2, lbase, 2);
- lbase = vsetq_lane_s32(fx3, lbase, 3);
-
- hbase = vaddq_s32(lbase, vdupq_n_s32(dx4));
-
- /* store & bump */
- do {
- int32x4_t lout;
- int32x4_t hout;
- int16x8_t hi16;
-
- /* get the hi 16s of all those 32s */
- lout = lbase;
- hout = hbase;
- /* this sets up all lout's then all hout's in hout */
- asm ("vuzpq.16 %q0, %q1" : "+w" (lout), "+w" (hout));
- hi16 = vreinterpretq_s16_s32(hout);
-
- /* clamp & output */
- hi16 = vmaxq_s16(hi16, vdupq_n_s16(0));
- hi16 = vminq_s16(hi16, vdupq_n_s16(maxX));
- vst1q_s16(dst16, hi16);
-
- /* but preserving base & on to the next */
- lbase = vaddq_s32 (lbase, vdupq_n_s32(dx8));
- hbase = vaddq_s32 (hbase, vdupq_n_s32(dx8));
- dst16 += 8;
- count -= 8;
- fx += dx8;
- } while (count >= 8);
- xy = (uint32_t *) dst16;
- }
-
- uint16_t* xx = (uint16_t*)xy;
- for (i = count; i > 0; --i) {
- *xx++ = TILEX_PROCF(fx, maxX); fx += dx;
- }
-}
-
-// note: we could special-case on a matrix which is skewed in X but not Y.
-// this would require a more general setup thatn SCALE does, but could use
-// SCALE's inner loop that only looks at dx
-
-static void AFFINE_NOFILTER_NAME(const SkBitmapProcState& s,
- uint32_t xy[], int count, int x, int y) {
- SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
- SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
- SkMatrix::kScale_Mask |
- SkMatrix::kAffine_Mask)) == 0);
-
- PREAMBLE(s);
- SkPoint srcPt;
- s.fInvProc(s.fInvMatrix,
- SkIntToScalar(x) + SK_ScalarHalf,
- SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
-
- SkFixed fx = SkScalarToFixed(srcPt.fX);
- SkFixed fy = SkScalarToFixed(srcPt.fY);
- SkFixed dx = s.fInvSx;
- SkFixed dy = s.fInvKy;
- int maxX = s.fBitmap->width() - 1;
- int maxY = s.fBitmap->height() - 1;
-
- /* NEON lets us do an 8x unrolling */
- if (count >= 8) {
- /* SkFixed is 16.16 fixed point */
- SkFixed dx4 = dx * 4;
- SkFixed dy4 = dy * 4;
- SkFixed dx8 = dx * 8;
- SkFixed dy8 = dy * 8;
-
- int32x4_t xbase, ybase;
- int32x4_t x2base, y2base;
- int16_t *dst16 = (int16_t *) xy;
-
- /* my sets of maxx/maxy for clamping */
- int32_t maxpair = (maxX&0xffff) | ((maxY&0xffff)<<16);
- int16x8_t maxXY = vreinterpretq_s16_s32(vdupq_n_s32(maxpair));
-
- /* now build fx/fx+dx/fx+2dx/fx+3dx */
- /* avoid the 'xbase unitialized' warning...*/
- xbase = vdupq_n_s32(fx);
- xbase = vsetq_lane_s32(fx+dx, xbase, 1);
- xbase = vsetq_lane_s32(fx+dx+dx, xbase, 2);
- xbase = vsetq_lane_s32(fx+dx+dx+dx, xbase, 3);
-
- /* same for fy */
- /* avoid the 'ybase unitialized' warning...*/
- ybase = vdupq_n_s32(fy);
- ybase = vsetq_lane_s32(fy+dy, ybase, 1);
- ybase = vsetq_lane_s32(fy+dy+dy, ybase, 2);
- ybase = vsetq_lane_s32(fy+dy+dy+dy, ybase, 3);
-
- x2base = vaddq_s32(xbase, vdupq_n_s32(dx4));
- y2base = vaddq_s32(ybase, vdupq_n_s32(dy4));
-
- /* store & bump */
- do {
- int32x4_t xout, yout;
- int32x4_t x2out, y2out;
- int16x8_t hi16, hi16_2;
-
- xout = xbase;
- yout = ybase;
-
- /* overlay y's low16 with hi16 from x */
- /* so we properly shifted xyxyxyxy */
- yout = vsriq_n_s32(yout, xout, 16);
- hi16 = vreinterpretq_s16_s32 (yout);
-
- /* do the clamping; both guys get 0's */
- hi16 = vmaxq_s16 (hi16, vdupq_n_s16(0));
- hi16 = vminq_s16 (hi16, maxXY);
-
- vst1q_s16 (dst16, hi16);
-
- /* and for the other 4 pieces of this iteration */
- x2out = x2base;
- y2out = y2base;
-
- /* overlay y's low16 with hi16 from x */
- /* so we properly shifted xyxyxyxy */
- y2out = vsriq_n_s32(y2out, x2out, 16);
- hi16_2 = vreinterpretq_s16_s32 (y2out);
-
- /* do the clamping; both guys get 0's */
- hi16_2 = vmaxq_s16 (hi16_2, vdupq_n_s16(0));
- hi16_2 = vminq_s16 (hi16_2, maxXY);
-
- /* RBE: gcc regenerates dst16+8 all the time instead
- * of folding it into an addressing mode. *sigh* */
- vst1q_s16 (dst16+8, hi16_2);
-
- /* moving base and on to the next */
- xbase = vaddq_s32 (xbase, vdupq_n_s32 (dx8));
- ybase = vaddq_s32 (ybase, vdupq_n_s32 (dy8));
- x2base = vaddq_s32 (x2base, vdupq_n_s32 (dx8));
- y2base = vaddq_s32 (y2base, vdupq_n_s32 (dy8));
-
- dst16 += 16; /* 8x32 aka 16x16 */
- count -= 8;
- fx += dx8;
- fy += dy8;
- } while (count >= 8);
- xy = (uint32_t *) dst16;
- }
-
- for (int i = count; i > 0; --i) {
- *xy++ = (TILEY_PROCF(fy, maxY) << 16) | TILEX_PROCF(fx, maxX);
- fx += dx; fy += dy;
- }
-}
-
-#undef DEBUG_PERSP_NOFILTER
-
-static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s,
- uint32_t* SK_RESTRICT xy,
- int count, int x, int y) {
- SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask);
-
- PREAMBLE(s);
- /* max{X,Y} are int here, but later shown/assumed to fit in 16 bits */
- int maxX = s.fBitmap->width() - 1;
- int maxY = s.fBitmap->height() - 1;
-
- SkPerspIter iter(s.fInvMatrix,
- SkIntToScalar(x) + SK_ScalarHalf,
- SkIntToScalar(y) + SK_ScalarHalf, count);
-
- while ((count = iter.next()) != 0) {
- const SkFixed* SK_RESTRICT srcXY = iter.getXY();
-
-#if defined(DEBUG_PERSP_NOFILTER)
- /* debugging stuff */
- const SkFixed *end_srcXY = srcXY + (count*2);
- uint32_t *end_xy = xy + (count);
- const SkFixed *base_srcXY = srcXY;
- uint32_t *base_xy = xy;
- int base_count = count;
-#endif
-
-#if 1
- // 2009/9/30: crashes in ApiDemos - Views - Animation - 3D Transition
- // 2009/10/9: reworked to avoid illegal (but allowed by gas) insn
-
- /* srcXY is a batch of 32 bit numbers X0,Y0,X1,Y1...
- * but we immediately discard the low 16 bits...
- * so what we're going to do is vld4, which will give us
- * xlo,xhi,ylo,yhi distribution and we can ignore the 'lo'
- * parts....
- */
- if (count >= 8) {
- int16_t *mysrc = (int16_t *) srcXY;
- int16_t *mydst = (int16_t *) xy;
- int16x4_t maxX4 = vdup_n_s16((int16_t)maxX);
- int16x4_t maxY4 = vdup_n_s16((int16_t)maxY);
- int16x4_t zero4 = vdup_n_s16(0);
-
- /* The constructs with local blocks for register assignments
- * and asm() instructions is to make keep any hard register
- * assignments to as small a scope as possible. and to avoid
- * burning call-preserved hard registers on the vld/vst
- * instructions.
- */
-
- do {
- int16x4_t xhi, yhi;
- int16x4_t x2hi, y2hi;
-
- /* vld4 does the de-interleaving for us */
- {
- register int16x4_t t_xlo asm("d0");
- register int16x4_t t_xhi asm("d1");
- register int16x4_t t_ylo asm("d2");
- register int16x4_t t_yhi asm("d3");
-
- asm ("vld4.16 {d0-d3},[%4] /* xlo=%P0 xhi=%P1 ylo=%P2 yhi=%P3 */"
- : "=w" (t_xlo), "=w" (t_xhi), "=w" (t_ylo), "=w" (t_yhi)
- : "r" (mysrc)
- );
- xhi = t_xhi;
- yhi = t_yhi;
- }
-
- /* clamp X>>16 (aka xhi) to 0..maxX */
- xhi = vmax_s16(xhi, zero4); /* now 0.. */
- xhi = vmin_s16(xhi, maxX4); /* now 0..maxX */
-
- /* clamp Y>>16 (aka yhi) to 0..maxY */
- yhi = vmax_s16(yhi, zero4); /* now 0.. */
- yhi = vmin_s16(yhi, maxY4); /* now 0..maxY */
-
- /* deal with the second set of numbers */
- {
- register int16x4_t t_xlo asm("d4");
- register int16x4_t t_xhi asm("d5");
- register int16x4_t t_ylo asm("d6");
- register int16x4_t t_yhi asm("d7");
-
- /* offset == 256 bits == 32 bytes == 8 longs == 16 shorts */
- asm ("vld4.16 {d4-d7},[%4] /* xlo=%P0 xhi=%P1 ylo=%P2 yhi=%P3 */"
- : "=w" (t_xlo), "=w" (t_xhi), "=w" (t_ylo), "=w" (t_yhi)
- : "r" (mysrc+16)
- );
- x2hi = t_xhi;
- y2hi = t_yhi;
- }
-
- /* clamp the second 4 here */
-
- if (0) { extern void rbe(void); rbe(); }
-
- /* clamp X>>16 (aka xhi) to 0..maxX */
- x2hi = vmax_s16(x2hi, zero4); /* now 0.. */
- x2hi = vmin_s16(x2hi, maxX4); /* now 0..maxX */
-
- /* clamp Y>>16 (aka yhi) to 0..maxY */
- y2hi = vmax_s16(y2hi, zero4); /* now 0.. */
- y2hi = vmin_s16(y2hi, maxY4); /* now 0..maxY */
-
- /* we're storing as {x,y}s: x is [0], y is [1] */
- /* we'll use vst2 to make this happen */
-
- {
- register int16x4_t out_x asm("d16") = xhi;
- register int16x4_t out_y asm("d17") = yhi;
-
- asm ("vst2.16 {d16-d17},[%2] /* xlo=%P0 xhi=%P1 */"
- :
- : "w" (out_x), "w" (out_y), "r" (mydst)
- );
- }
- {
- register int16x4_t out_x asm("d18") = x2hi;
- register int16x4_t out_y asm("d19") = y2hi;
-
- asm ("vst2.16 {d18-d19},[%2] /* xlo=%P0 xhi=%P1 */"
- :
- : "w" (out_x), "w" (out_y), "r" (mydst+8)
- );
- }
-
- /* XXX: gcc isn't interleaving these with the NEON ops
- * but I think that all the scoreboarding works out */
- count -= 8; /* 8 iterations */
- mysrc += 32; /* 16 longs, aka 32 shorts */
- mydst += 16; /* 16 shorts, aka 8 longs */
- } while (count >= 8);
- /* get xy and srcXY fixed up */
- srcXY = (const SkFixed *) mysrc;
- xy = (uint32_t *) mydst;
- }
-#endif
-
- while (--count >= 0) {
- *xy++ = (TILEY_PROCF(srcXY[1], maxY) << 16) |
- TILEX_PROCF(srcXY[0], maxX);
- srcXY += 2;
- }
-
-#if defined(DEBUG_PERSP_NOFILTER)
- /* for checking our NEON-produced results against vanilla code */
- {
- int bad = (-1);
- for (int i = 0; i < base_count; i++) {
- uint32_t val;
- val = (TILEY_PROCF (base_srcXY[i * 2 + 1], maxY) << 16) |
- TILEX_PROCF (base_srcXY[i * 2 + 0], maxX);
-
- if (val != base_xy[i]) {
- bad = i;
- break;
- }
- }
- if (bad >= 0) {
- SkDebugf("clamp-nofilter-persp failed piece %d\n", bad);
- SkDebugf(" maxX %08x maxY %08x\n", maxX, maxY);
- bad -= (bad & 0x7); /* align */
- for (int i = bad; i < bad + 8; i++) {
- uint32_t val;
- val = (TILEY_PROCF (base_srcXY[i * 2 + 1], maxY) << 16) |
- TILEX_PROCF (base_srcXY[i * 2 + 0], maxX);
-
- SkDebugf("%d: got %08x want %08x srcXY[0] %08x srcXY[1] %08x\n",
- i, base_xy[i], val, base_srcXY[i * 2 + 0],
- base_srcXY[i * 2 + 1]);
- }
- SkDebugf ("---\n");
- }
-
- if (end_xy != xy) {
- SkDebugf("xy ended at %08x, should be %08x\n", xy, end_xy);
- }
- if (end_srcXY != srcXY) {
- SkDebugf("srcXY ended at %08x, should be %08x\n", srcXY,
- end_srcXY);
- }
- }
-#endif
- }
-}
-
-#undef DEBUG_PERSP_NOFILTER
-
-//////////////////////////////////////////////////////////////////////////////
-
-static inline uint32_t PACK_FILTER_Y_NAME(SkFixed f, unsigned max,
- SkFixed one PREAMBLE_PARAM_Y) {
- unsigned i = TILEY_PROCF(f, max);
- i = (i << 4) | TILEY_LOW_BITS(f, max);
- return (i << 14) | (TILEY_PROCF((f + one), max));
-}
-
-static inline uint32_t PACK_FILTER_X_NAME(SkFixed f, unsigned max,
- SkFixed one PREAMBLE_PARAM_X) {
- unsigned i = TILEX_PROCF(f, max);
- i = (i << 4) | TILEX_LOW_BITS(f, max);
- return (i << 14) | (TILEX_PROCF((f + one), max));
-}
-
-static void SCALE_FILTER_NAME(const SkBitmapProcState& s,
- uint32_t xy[], int count, int x, int y) {
- SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
- SkMatrix::kScale_Mask)) == 0);
- SkASSERT(s.fInvKy == 0);
-
- PREAMBLE(s);
-
- const unsigned maxX = s.fBitmap->width() - 1;
- const SkFixed one = s.fFilterOneX;
- const SkFixed dx = s.fInvSx;
- SkFixed fx;
-
- {
- SkPoint pt;
- s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
- SkIntToScalar(y) + SK_ScalarHalf, &pt);
- const SkFixed fy = SkScalarToFixed(pt.fY) - (s.fFilterOneY >> 1);
- const unsigned maxY = s.fBitmap->height() - 1;
- // compute our two Y values up front
- *xy++ = PACK_FILTER_Y_NAME(fy, maxY, s.fFilterOneY PREAMBLE_ARG_Y);
- // now initialize fx
- fx = SkScalarToFixed(pt.fX) - (one >> 1);
- }
-
-#ifdef CHECK_FOR_DECAL
- // test if we don't need to apply the tile proc
- if (dx > 0 &&
- (unsigned)(fx >> 16) <= maxX &&
- (unsigned)((fx + dx * (count - 1)) >> 16) < maxX) {
- decal_filter_scale_neon(xy, fx, dx, count);
- } else
-#endif
-
- if (count >= 4) {
- int32x4_t wide_one, wide_fx, wide_fx1, wide_i, wide_lo;
- #if 0
- /* verification hooks -- see below */
- SkFixed debug_fx = fx;
- int count_done = 0;
- #endif
-
- wide_fx = vdupq_n_s32(fx);
- wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1);
- wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2);
- wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3);
-
- wide_one = vdupq_n_s32(one);
-
- while (count >= 4) {
- /* original expands to:
- * unsigned i = SkClampMax((f) >> 16, max);
- * i = (i << 4) | (((f) >> 12) & 0xF);
- * return (i << 14) | (SkClampMax(((f + one)) >> 16, max));
- */
-
- /* i = SkClampMax(f>>16, maxX) */
- wide_i = vmaxq_s32(vshrq_n_s32(wide_fx,16), vdupq_n_s32(0));
- wide_i = vminq_s32(wide_i, vdupq_n_s32(maxX));
-
- /* i<<4 | TILEX_LOW_BITS(fx) */
- wide_lo = vshrq_n_s32(wide_fx, 12);
- wide_i = vsliq_n_s32(wide_lo, wide_i, 4);
-
- /* i<<14 */
- wide_i = vshlq_n_s32(wide_i, 14);
-
- /* SkClampMax(((f + one)) >> 16, max) */
- wide_fx1 = vaddq_s32(wide_fx, wide_one);
- wide_fx1 = vmaxq_s32(vshrq_n_s32(wide_fx1,16), vdupq_n_s32(0));
- wide_fx1 = vminq_s32(wide_fx1, vdupq_n_s32(maxX));
-
- /* final combination */
- wide_i = vorrq_s32(wide_i, wide_fx1);
-
- vst1q_u32(xy, vreinterpretq_u32_s32(wide_i));
-
- #if 0
- /* having a verification hook is a good idea */
- /* use debug_fx, debug_fx+dx, etc. */
-
- for (int i=0;i<4;i++) {
- uint32_t want = PACK_FILTER_X_NAME(debug_fx, maxX, one PREAMBLE_ARG_X);
- if (xy[i] != want)
- {
- /* print a nastygram */
- SkDebugf("clamp-filter-scale fails\n");
- SkDebugf("got %08x want %08x\n", xy[i], want);
- SkDebugf("fx %08x debug_fx %08x dx %08x done %d\n",
- fx, debug_fx, dx, count_done);
- SkDebugf(" maxX %08x one %08x\n", maxX, one);
-
- }
- debug_fx += dx;
- count_done++;
- }
- #endif
- wide_fx += vdupq_n_s32(dx+dx+dx+dx);
- fx += dx+dx+dx+dx;
- xy += 4;
- count -= 4;
- }
- }
-
- while (--count >= 0) {
- *xy++ = PACK_FILTER_X_NAME(fx, maxX, one PREAMBLE_ARG_X);
- fx += dx;
- }
-}
-
-static void AFFINE_FILTER_NAME(const SkBitmapProcState& s,
- uint32_t xy[], int count, int x, int y) {
- SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
- SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
- SkMatrix::kScale_Mask |
- SkMatrix::kAffine_Mask)) == 0);
-
- PREAMBLE(s);
- SkPoint srcPt;
- s.fInvProc(s.fInvMatrix,
- SkIntToScalar(x) + SK_ScalarHalf,
- SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
-
- SkFixed oneX = s.fFilterOneX;
- SkFixed oneY = s.fFilterOneY;
- SkFixed fx = SkScalarToFixed(srcPt.fX) - (oneX >> 1);
- SkFixed fy = SkScalarToFixed(srcPt.fY) - (oneY >> 1);
- SkFixed dx = s.fInvSx;
- SkFixed dy = s.fInvKy;
- unsigned maxX = s.fBitmap->width() - 1;
- unsigned maxY = s.fBitmap->height() - 1;
-
- if (count >= 4) {
- int32x4_t wide_i, wide_lo;
- int32x4_t wide_fx, wide_onex, wide_fx1;
- int32x4_t wide_fy, wide_oney, wide_fy1;
-
- #undef AFFINE_DEBUG
- #if defined(AFFINE_DEBUG)
- SkFixed fyp = fy;
- SkFixed fxp = fx;
- uint32_t *xyp = xy;
- int count_done = 0;
- #endif
-
- wide_fx = vdupq_n_s32(fx);
- wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1);
- wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2);
- wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3);
-
- wide_fy = vdupq_n_s32(fy);
- wide_fy = vsetq_lane_s32(fy+dy, wide_fy, 1);
- wide_fy = vsetq_lane_s32(fy+dy+dy, wide_fy, 2);
- wide_fy = vsetq_lane_s32(fy+dy+dy+dy, wide_fy, 3);
-
- wide_onex = vdupq_n_s32(oneX);
- wide_oney = vdupq_n_s32(oneY);
-
- while (count >= 4) {
- int32x4_t wide_x;
- int32x4_t wide_y;
-
- /* do the X side, then the Y side, then interleave them */
-
- /* original expands to:
- * unsigned i = SkClampMax((f) >> 16, max);
- * i = (i << 4) | (((f) >> 12) & 0xF);
- * return (i << 14) | (SkClampMax(((f + one)) >> 16, max));
- */
-
- /* i = SkClampMax(f>>16, maxX) */
- wide_i = vmaxq_s32(vshrq_n_s32(wide_fx,16), vdupq_n_s32(0));
- wide_i = vminq_s32(wide_i, vdupq_n_s32(maxX));
-
- /* i<<4 | TILEX_LOW_BITS(fx) */
- wide_lo = vshrq_n_s32(wide_fx, 12);
- wide_i = vsliq_n_s32(wide_lo, wide_i, 4);
-
- /* i<<14 */
- wide_i = vshlq_n_s32(wide_i, 14);
-
- /* SkClampMax(((f + one)) >> 16, max) */
- wide_fx1 = vaddq_s32(wide_fx, wide_onex);
- wide_fx1 = vmaxq_s32(vshrq_n_s32(wide_fx1,16), vdupq_n_s32(0));
- wide_fx1 = vminq_s32(wide_fx1, vdupq_n_s32(maxX));
-
- /* final combination */
- wide_x = vorrq_s32(wide_i, wide_fx1);
-
- /* And now the Y side */
-
- /* i = SkClampMax(f>>16, maxX) */
- wide_i = vmaxq_s32(vshrq_n_s32(wide_fy,16), vdupq_n_s32(0));
- wide_i = vminq_s32(wide_i, vdupq_n_s32(maxY));
-
- /* i<<4 | TILEX_LOW_BITS(fx) */
- wide_lo = vshrq_n_s32(wide_fy, 12);
- wide_i = vsliq_n_s32(wide_lo, wide_i, 4);
-
- /* i<<14 */
- wide_i = vshlq_n_s32(wide_i, 14);
-
- /* SkClampMax(((f + one)) >> 16, max) */
- wide_fy1 = vaddq_s32(wide_fy, wide_oney);
- wide_fy1 = vmaxq_s32(vshrq_n_s32(wide_fy1,16), vdupq_n_s32(0));
- wide_fy1 = vminq_s32(wide_fy1, vdupq_n_s32(maxY));
-
- /* final combination */
- wide_y = vorrq_s32(wide_i, wide_fy1);
-
- /* interleave as YXYXYXYX as part of the storing */
- {
- /* vst2.32 needs side-by-side registers */
- register int32x4_t t_x asm("q1");
- register int32x4_t t_y asm("q0");
-
- t_x = wide_x; t_y = wide_y;
- asm ("vst2.32 {q0-q1},[%2] /* y=%q0 x=%q1 */"
- :
- : "w" (t_y), "w" (t_x), "r" (xy)
- );
- }
-
- #if defined(AFFINE_DEBUG)
- /* make sure we're good here -- check the 4 we just output */
- for (int i = 0; i<4;i++) {
- uint32_t val;
- val = PACK_FILTER_Y_NAME(fyp, maxY, oneY PREAMBLE_ARG_Y);
- if (val != xy[i*2+0]) {
- /* print a nastygram */
- SkDebugf("clamp-filter-affine fails\n");
- SkDebugf("[bad-y] got %08x want %08x\n", xy[i*2+0], val);
- SkDebugf("fy %08x fxp %08x fyp %08x dx %08x dy %08x done %d\n",
- fy, fxp, fyp, dx, dy, count_done);
- SkDebugf(" maxY %08x oneY %08x\n", maxY, oneY);
- }
- val = PACK_FILTER_X_NAME(fxp, maxX, oneX PREAMBLE_ARG_X);
- if (val != xy[i*2+1]) {
- /* print a nastygram */
- SkDebugf("clamp-filter-affine fails\n");
- SkDebugf("[bad-x] got %08x want %08x\n", xy[i*2+1], val);
- SkDebugf("fx %08x fxp %08x fyp %08x dx %08x dy %08x done %d\n",
- fx, fxp, fyp, dx, dy, count_done);
- SkDebugf(" maxX %08x one %08x\n", maxX, oneX);
- }
- fyp += dy;
- fxp += dx;
- count_done++;
- }
- #endif
-
- wide_fx += vdupq_n_s32(dx+dx+dx+dx);
- fx += dx+dx+dx+dx;
- wide_fy += vdupq_n_s32(dy+dy+dy+dy);
- fy += dy+dy+dy+dy;
- xy += 8; /* 4 x's, 4 y's */
- count -= 4;
- }
- }
-
- while (--count >= 0) {
- /* NB: writing Y/X */
- *xy++ = PACK_FILTER_Y_NAME(fy, maxY, oneY PREAMBLE_ARG_Y);
- fy += dy;
- *xy++ = PACK_FILTER_X_NAME(fx, maxX, oneX PREAMBLE_ARG_X);
- fx += dx;
- }
-}
-
-static void PERSP_FILTER_NAME(const SkBitmapProcState& s,
- uint32_t* SK_RESTRICT xy, int count,
- int x, int y) {
- SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask);
-
- PREAMBLE(s);
- unsigned maxX = s.fBitmap->width() - 1;
- unsigned maxY = s.fBitmap->height() - 1;
- SkFixed oneX = s.fFilterOneX;
- SkFixed oneY = s.fFilterOneY;
-
- SkPerspIter iter(s.fInvMatrix,
- SkIntToScalar(x) + SK_ScalarHalf,
- SkIntToScalar(y) + SK_ScalarHalf, count);
-
- while ((count = iter.next()) != 0) {
- const SkFixed* SK_RESTRICT srcXY = iter.getXY();
-
- if (count >= 4) {
- int32x4_t wide_i, wide_lo;
- int32x4_t wide_fx1;
- int32x4_t wide_fy1;
- int32x4_t wide_x, wide_y;
-
- while (count >= 4) {
- /* RBE: it's good, but:
- * -- we spill a constant that could be easily regenerated
- * [perhaps tweak gcc's NEON constant costs?]
- */
-
- /* load src: x-y-x-y-x-y-x-y */
- {
- register int32x4_t q0 asm ("q0");
- register int32x4_t q1 asm ("q1");
- asm ("vld2.32 {q0-q1},[%2] /* x=%q0 y=%q1 */"
- : "=w" (q0), "=w" (q1)
- : "r" (srcXY));
- wide_x = q0; wide_y = q1;
- }
-
- /* do the X side, then the Y side, then interleave them */
-
- wide_x = vsubq_s32(wide_x, vdupq_n_s32 (oneX>>1));
-
- /* original expands to:
- * unsigned i = SkClampMax((f) >> 16, max);
- * i = (i << 4) | (((f) >> 12) & 0xF);
- * return (i << 14) | (SkClampMax(((f + one)) >> 16, max));
- */
-
- /* i = SkClampMax(f>>16, maxX) */
- wide_i = vmaxq_s32 (vshrq_n_s32 (wide_x, 16), vdupq_n_s32 (0));
- wide_i = vminq_s32 (wide_i, vdupq_n_s32 (maxX));
-
- /* i<<4 | TILEX_LOW_BITS(fx) */
- wide_lo = vshrq_n_s32 (wide_x, 12);
- wide_i = vsliq_n_s32 (wide_lo, wide_i, 4);
-
- /* i<<14 */
- wide_i = vshlq_n_s32 (wide_i, 14);
-
- /* SkClampMax(((f + one)) >> 16, max) */
- wide_fx1 = vaddq_s32 (wide_x, vdupq_n_s32(oneX));
- wide_fx1 = vmaxq_s32 (vshrq_n_s32 (wide_fx1, 16), vdupq_n_s32 (0));
- wide_fx1 = vminq_s32 (wide_fx1, vdupq_n_s32 (maxX));
-
- /* final combination */
- wide_x = vorrq_s32 (wide_i, wide_fx1);
-
-
- /* And now the Y side */
-
- wide_y = vsubq_s32(wide_y, vdupq_n_s32 (oneY>>1));
-
- /* i = SkClampMax(f>>16, maxX) */
- wide_i = vmaxq_s32 (vshrq_n_s32 (wide_y, 16), vdupq_n_s32 (0));
- wide_i = vminq_s32 (wide_i, vdupq_n_s32 (maxY));
-
- /* i<<4 | TILEX_LOW_BITS(fx) */
- wide_lo = vshrq_n_s32 (wide_y, 12);
- wide_i = vsliq_n_s32 (wide_lo, wide_i, 4);
-
- /* i<<14 */
- wide_i = vshlq_n_s32 (wide_i, 14);
-
- /* SkClampMax(((f + one)) >> 16, max) */
-
- /* wide_fy1_1 and wide_fy1_2 are just temporary variables to
- * work-around an ICE in debug */
- int32x4_t wide_fy1_1 = vaddq_s32 (wide_y, vdupq_n_s32(oneY));
- int32x4_t wide_fy1_2 = vmaxq_s32 (vshrq_n_s32 (wide_fy1_1, 16),
- vdupq_n_s32 (0));
- wide_fy1 = vminq_s32 (wide_fy1_2, vdupq_n_s32 (maxY));
-
- /* final combination */
- wide_y = vorrq_s32 (wide_i, wide_fy1);
-
- /* switch them around; have to do it this way to get them
- * in the proper registers to match our instruction */
-
- /* iteration bookkeeping, ahead of the asm() for scheduling */
- srcXY += 2*4;
- count -= 4;
-
- /* store interleaved as y-x-y-x-y-x-y-x (NB != read order) */
- {
- register int32x4_t q0 asm ("q0") = wide_y;
- register int32x4_t q1 asm ("q1") = wide_x;
-
- asm ("vst2.32 {q0-q1},[%2] /* y=%q0 x=%q1 */"
- :
- : "w" (q0), "w" (q1), "r" (xy));
- }
-
- /* on to the next iteration */
- /* count, srcXY are handled above */
- xy += 2*4;
- }
- }
-
- /* was do-while; NEON code invalidates original count>0 assumption */
- while (--count >= 0) {
- /* NB: we read x/y, we write y/x */
- *xy++ = PACK_FILTER_Y_NAME(srcXY[1] - (oneY >> 1), maxY,
- oneY PREAMBLE_ARG_Y);
- *xy++ = PACK_FILTER_X_NAME(srcXY[0] - (oneX >> 1), maxX,
- oneX PREAMBLE_ARG_X);
- srcXY += 2;
- }
- }
-}
-
-const SkBitmapProcState::MatrixProc MAKENAME(_Procs)[] = {
- SCALE_NOFILTER_NAME,
- SCALE_FILTER_NAME,
- AFFINE_NOFILTER_NAME,
- AFFINE_FILTER_NAME,
- PERSP_NOFILTER_NAME,
- PERSP_FILTER_NAME
-};
-
-#undef MAKENAME
-#undef TILEX_PROCF
-#undef TILEY_PROCF
-#ifdef CHECK_FOR_DECAL
- #undef CHECK_FOR_DECAL
-#endif
-
-#undef SCALE_NOFILTER_NAME
-#undef SCALE_FILTER_NAME
-#undef AFFINE_NOFILTER_NAME
-#undef AFFINE_FILTER_NAME
-#undef PERSP_NOFILTER_NAME
-#undef PERSP_FILTER_NAME
-
-#undef PREAMBLE
-#undef PREAMBLE_PARAM_X
-#undef PREAMBLE_PARAM_Y
-#undef PREAMBLE_ARG_X
-#undef PREAMBLE_ARG_Y
-
-#undef TILEX_LOW_BITS
-#undef TILEY_LOW_BITS
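For reference, the packed filter-coordinate format that the deleted clamp header builds (and that the "original expands to" comments above spell out) can be written as a small scalar helper. This is only an illustrative sketch; pack_filter_clamp_x and clamp_max are hypothetical names, not Skia symbols. The packed word holds the first texel index in bits 31..18, the 4-bit sub-pixel weight in bits 17..14, and the second texel index in bits 13..0.

#include <stdint.h>

static inline uint32_t clamp_max(int32_t v, uint32_t max) {
    if (v < 0) return 0;                      /* negatives clamp to 0 */
    return (uint32_t)v > max ? max : (uint32_t)v;
}

static inline uint32_t pack_filter_clamp_x(int32_t f, uint32_t max, int32_t one) {
    uint32_t i = clamp_max(f >> 16, max);     /* integer part of the 16.16 coord */
    i = (i << 4) | ((f >> 12) & 0xF);         /* append the 4 sub-pixel bits */
    return (i << 14) | clamp_max((f + one) >> 16, max);  /* second sample index */
}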
diff --git a/chromium/third_party/skia/src/opts/SkBitmapProcState_matrix_neon.h b/chromium/third_party/skia/src/opts/SkBitmapProcState_matrix_neon.h
new file mode 100644
index 00000000000..72bf1bce336
--- /dev/null
+++ b/chromium/third_party/skia/src/opts/SkBitmapProcState_matrix_neon.h
@@ -0,0 +1,506 @@
+
+#include <arm_neon.h>
+
+
+#define SCALE_NOFILTER_NAME MAKENAME(_nofilter_scale)
+#define SCALE_FILTER_NAME MAKENAME(_filter_scale)
+#define AFFINE_NOFILTER_NAME MAKENAME(_nofilter_affine)
+#define AFFINE_FILTER_NAME MAKENAME(_filter_affine)
+#define PERSP_NOFILTER_NAME MAKENAME(_nofilter_persp)
+#define PERSP_FILTER_NAME MAKENAME(_filter_persp)
+
+#define PACK_FILTER_X_NAME MAKENAME(_pack_filter_x)
+#define PACK_FILTER_Y_NAME MAKENAME(_pack_filter_y)
+#define PACK_FILTER_X4_NAME MAKENAME(_pack_filter_x4)
+#define PACK_FILTER_Y4_NAME MAKENAME(_pack_filter_y4)
+
+#ifndef PREAMBLE
+ #define PREAMBLE(state)
+ #define PREAMBLE_PARAM_X
+ #define PREAMBLE_PARAM_Y
+ #define PREAMBLE_ARG_X
+ #define PREAMBLE_ARG_Y
+#endif
+
+static void SCALE_NOFILTER_NAME(const SkBitmapProcState& s,
+ uint32_t xy[], int count, int x, int y) {
+ SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
+ SkMatrix::kScale_Mask)) == 0);
+
+ PREAMBLE(s);
+
+ // we store y, x, x, x, x, x
+ const unsigned maxX = s.fBitmap->width() - 1;
+ SkFractionalInt fx;
+ {
+ SkPoint pt;
+ s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
+ SkIntToScalar(y) + SK_ScalarHalf, &pt);
+ fx = SkScalarToFractionalInt(pt.fY);
+ const unsigned maxY = s.fBitmap->height() - 1;
+ *xy++ = TILEY_PROCF(SkFractionalIntToFixed(fx), maxY);
+ fx = SkScalarToFractionalInt(pt.fX);
+ }
+
+ if (0 == maxX) {
+ // all of the following X values must be 0
+ memset(xy, 0, count * sizeof(uint16_t));
+ return;
+ }
+
+ const SkFractionalInt dx = s.fInvSxFractionalInt;
+
+#ifdef CHECK_FOR_DECAL
+ // test if we don't need to apply the tile proc
+ if (can_truncate_to_fixed_for_decal(fx, dx, count, maxX)) {
+ decal_nofilter_scale_neon(xy, SkFractionalIntToFixed(fx),
+ SkFractionalIntToFixed(dx), count);
+ return;
+ }
+#endif
+
+ if (count >= 8) {
+ SkFractionalInt dx2 = dx+dx;
+ SkFractionalInt dx4 = dx2+dx2;
+ SkFractionalInt dx8 = dx4+dx4;
+
+ // now build fx/fx+dx/fx+2dx/fx+3dx
+ SkFractionalInt fx1, fx2, fx3;
+ int32x4_t lbase, hbase;
+ int16_t *dst16 = (int16_t *)xy;
+
+ fx1 = fx+dx;
+ fx2 = fx1+dx;
+ fx3 = fx2+dx;
+
+ lbase = vdupq_n_s32(SkFractionalIntToFixed(fx));
+ lbase = vsetq_lane_s32(SkFractionalIntToFixed(fx1), lbase, 1);
+ lbase = vsetq_lane_s32(SkFractionalIntToFixed(fx2), lbase, 2);
+ lbase = vsetq_lane_s32(SkFractionalIntToFixed(fx3), lbase, 3);
+ hbase = vaddq_s32(lbase, vdupq_n_s32(SkFractionalIntToFixed(dx4)));
+
+ // store & bump
+ while (count >= 8) {
+
+ int16x8_t fx8;
+
+ fx8 = TILEX_PROCF_NEON8(lbase, hbase, maxX);
+
+ vst1q_s16(dst16, fx8);
+
+            // bump the preserved bases and on to the next
+ lbase = vaddq_s32 (lbase, vdupq_n_s32(SkFractionalIntToFixed(dx8)));
+ hbase = vaddq_s32 (hbase, vdupq_n_s32(SkFractionalIntToFixed(dx8)));
+ dst16 += 8;
+ count -= 8;
+ fx += dx8;
+ };
+ xy = (uint32_t *) dst16;
+ }
+
+ uint16_t* xx = (uint16_t*)xy;
+ for (int i = count; i > 0; --i) {
+ *xx++ = TILEX_PROCF(SkFractionalIntToFixed(fx), maxX);
+ fx += dx;
+ }
+}
+
+static void AFFINE_NOFILTER_NAME(const SkBitmapProcState& s,
+ uint32_t xy[], int count, int x, int y) {
+ SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
+ SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
+ SkMatrix::kScale_Mask |
+ SkMatrix::kAffine_Mask)) == 0);
+
+ PREAMBLE(s);
+ SkPoint srcPt;
+ s.fInvProc(s.fInvMatrix,
+ SkIntToScalar(x) + SK_ScalarHalf,
+ SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
+
+ SkFractionalInt fx = SkScalarToFractionalInt(srcPt.fX);
+ SkFractionalInt fy = SkScalarToFractionalInt(srcPt.fY);
+ SkFractionalInt dx = s.fInvSxFractionalInt;
+ SkFractionalInt dy = s.fInvKyFractionalInt;
+ int maxX = s.fBitmap->width() - 1;
+ int maxY = s.fBitmap->height() - 1;
+
+ if (count >= 8) {
+ SkFractionalInt dx4 = dx * 4;
+ SkFractionalInt dy4 = dy * 4;
+ SkFractionalInt dx8 = dx * 8;
+ SkFractionalInt dy8 = dy * 8;
+
+ int32x4_t xbase, ybase;
+ int32x4_t x2base, y2base;
+ int16_t *dst16 = (int16_t *) xy;
+
+ // now build fx, fx+dx, fx+2dx, fx+3dx
+ xbase = vdupq_n_s32(SkFractionalIntToFixed(fx));
+ xbase = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx), xbase, 1);
+ xbase = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx), xbase, 2);
+ xbase = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx+dx), xbase, 3);
+
+ // same for fy
+ ybase = vdupq_n_s32(SkFractionalIntToFixed(fy));
+ ybase = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy), ybase, 1);
+ ybase = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy+dy), ybase, 2);
+ ybase = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy+dy+dy), ybase, 3);
+
+ x2base = vaddq_s32(xbase, vdupq_n_s32(SkFractionalIntToFixed(dx4)));
+ y2base = vaddq_s32(ybase, vdupq_n_s32(SkFractionalIntToFixed(dy4)));
+
+ // store & bump
+ do {
+ int16x8x2_t hi16;
+
+ hi16.val[0] = TILEX_PROCF_NEON8(xbase, x2base, maxX);
+ hi16.val[1] = TILEY_PROCF_NEON8(ybase, y2base, maxY);
+
+ vst2q_s16(dst16, hi16);
+
+ // moving base and on to the next
+ xbase = vaddq_s32(xbase, vdupq_n_s32(SkFractionalIntToFixed(dx8)));
+ ybase = vaddq_s32(ybase, vdupq_n_s32(SkFractionalIntToFixed(dy8)));
+ x2base = vaddq_s32(x2base, vdupq_n_s32(SkFractionalIntToFixed(dx8)));
+ y2base = vaddq_s32(y2base, vdupq_n_s32(SkFractionalIntToFixed(dy8)));
+
+ dst16 += 16; // 8x32 aka 16x16
+ count -= 8;
+ fx += dx8;
+ fy += dy8;
+ } while (count >= 8);
+ xy = (uint32_t *) dst16;
+ }
+
+ for (int i = count; i > 0; --i) {
+ *xy++ = (TILEY_PROCF(SkFractionalIntToFixed(fy), maxY) << 16) |
+ TILEX_PROCF(SkFractionalIntToFixed(fx), maxX);
+ fx += dx; fy += dy;
+ }
+}
+
+static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s,
+ uint32_t* SK_RESTRICT xy,
+ int count, int x, int y) {
+ SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask);
+
+ PREAMBLE(s);
+ // max{X,Y} are int here, but later shown/assumed to fit in 16 bits
+ int maxX = s.fBitmap->width() - 1;
+ int maxY = s.fBitmap->height() - 1;
+
+ SkPerspIter iter(s.fInvMatrix,
+ SkIntToScalar(x) + SK_ScalarHalf,
+ SkIntToScalar(y) + SK_ScalarHalf, count);
+
+ while ((count = iter.next()) != 0) {
+ const SkFixed* SK_RESTRICT srcXY = iter.getXY();
+
+ if (count >= 8) {
+ int32_t *mysrc = (int32_t *) srcXY;
+ int16_t *mydst = (int16_t *) xy;
+ do {
+ int16x8x2_t hi16;
+ int32x4x2_t xy1, xy2;
+
+ xy1 = vld2q_s32(mysrc);
+ xy2 = vld2q_s32(mysrc+8);
+
+ hi16.val[0] = TILEX_PROCF_NEON8(xy1.val[0], xy2.val[0], maxX);
+ hi16.val[1] = TILEY_PROCF_NEON8(xy1.val[1], xy2.val[1], maxY);
+
+ vst2q_s16(mydst, hi16);
+
+ count -= 8; // 8 iterations
+ mysrc += 16; // 16 longs
+ mydst += 16; // 16 shorts, aka 8 longs
+ } while (count >= 8);
+ // get xy and srcXY fixed up
+ srcXY = (const SkFixed *) mysrc;
+ xy = (uint32_t *) mydst;
+ }
+
+ while (--count >= 0) {
+ *xy++ = (TILEY_PROCF(srcXY[1], maxY) << 16) |
+ TILEX_PROCF(srcXY[0], maxX);
+ srcXY += 2;
+ }
+ }
+}
+
+static inline uint32_t PACK_FILTER_Y_NAME(SkFixed f, unsigned max,
+ SkFixed one PREAMBLE_PARAM_Y) {
+ unsigned i = TILEY_PROCF(f, max);
+ i = (i << 4) | TILEY_LOW_BITS(f, max);
+ return (i << 14) | (TILEY_PROCF((f + one), max));
+}
+
+static inline uint32_t PACK_FILTER_X_NAME(SkFixed f, unsigned max,
+ SkFixed one PREAMBLE_PARAM_X) {
+ unsigned i = TILEX_PROCF(f, max);
+ i = (i << 4) | TILEX_LOW_BITS(f, max);
+ return (i << 14) | (TILEX_PROCF((f + one), max));
+}
+
+static inline int32x4_t PACK_FILTER_X4_NAME(int32x4_t f, unsigned max,
+ SkFixed one PREAMBLE_PARAM_X) {
+ int32x4_t ret, res, wide_one;
+
+ // Prepare constants
+ wide_one = vdupq_n_s32(one);
+
+ // Step 1
+ res = TILEX_PROCF_NEON4(f, max);
+
+ // Step 2
+ ret = TILEX_LOW_BITS_NEON4(f, max);
+ ret = vsliq_n_s32(ret, res, 4);
+
+ // Step 3
+ res = TILEX_PROCF_NEON4(f + wide_one, max);
+ ret = vorrq_s32(vshlq_n_s32(ret, 14), res);
+
+ return ret;
+}
+
+static inline int32x4_t PACK_FILTER_Y4_NAME(int32x4_t f, unsigned max,
+ SkFixed one PREAMBLE_PARAM_X) {
+ int32x4_t ret, res, wide_one;
+
+ // Prepare constants
+ wide_one = vdupq_n_s32(one);
+
+ // Step 1
+ res = TILEY_PROCF_NEON4(f, max);
+
+ // Step 2
+ ret = TILEY_LOW_BITS_NEON4(f, max);
+ ret = vsliq_n_s32(ret, res, 4);
+
+ // Step 3
+ res = TILEY_PROCF_NEON4(f + wide_one, max);
+ ret = vorrq_s32(vshlq_n_s32(ret, 14), res);
+
+ return ret;
+}
+
+static void SCALE_FILTER_NAME(const SkBitmapProcState& s,
+ uint32_t xy[], int count, int x, int y) {
+ SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
+ SkMatrix::kScale_Mask)) == 0);
+ SkASSERT(s.fInvKy == 0);
+
+ PREAMBLE(s);
+
+ const unsigned maxX = s.fBitmap->width() - 1;
+ const SkFixed one = s.fFilterOneX;
+ const SkFractionalInt dx = s.fInvSxFractionalInt;
+ SkFractionalInt fx;
+
+ {
+ SkPoint pt;
+ s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
+ SkIntToScalar(y) + SK_ScalarHalf, &pt);
+ const SkFixed fy = SkScalarToFixed(pt.fY) - (s.fFilterOneY >> 1);
+ const unsigned maxY = s.fBitmap->height() - 1;
+ // compute our two Y values up front
+ *xy++ = PACK_FILTER_Y_NAME(fy, maxY, s.fFilterOneY PREAMBLE_ARG_Y);
+ // now initialize fx
+ fx = SkScalarToFractionalInt(pt.fX) - (SkFixedToFractionalInt(one) >> 1);
+ }
+
+#ifdef CHECK_FOR_DECAL
+ // test if we don't need to apply the tile proc
+ if (can_truncate_to_fixed_for_decal(fx, dx, count, maxX)) {
+ decal_filter_scale_neon(xy, SkFractionalIntToFixed(fx),
+ SkFractionalIntToFixed(dx), count);
+ return;
+ }
+#endif
+ {
+
+ if (count >= 4) {
+ int32x4_t wide_fx;
+
+ wide_fx = vdupq_n_s32(SkFractionalIntToFixed(fx));
+ wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx), wide_fx, 1);
+ wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx), wide_fx, 2);
+ wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx+dx), wide_fx, 3);
+
+ while (count >= 4) {
+ int32x4_t res;
+
+ res = PACK_FILTER_X4_NAME(wide_fx, maxX, one PREAMBLE_ARG_X);
+
+ vst1q_u32(xy, vreinterpretq_u32_s32(res));
+
+ wide_fx += vdupq_n_s32(SkFractionalIntToFixed(dx+dx+dx+dx));
+ fx += dx+dx+dx+dx;
+ xy += 4;
+ count -= 4;
+ }
+ }
+
+ while (--count >= 0) {
+ *xy++ = PACK_FILTER_X_NAME(SkFractionalIntToFixed(fx), maxX, one PREAMBLE_ARG_X);
+ fx += dx;
+ }
+
+ }
+}
+
+static void AFFINE_FILTER_NAME(const SkBitmapProcState& s,
+ uint32_t xy[], int count, int x, int y) {
+ SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
+ SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
+ SkMatrix::kScale_Mask |
+ SkMatrix::kAffine_Mask)) == 0);
+
+ PREAMBLE(s);
+ SkPoint srcPt;
+ s.fInvProc(s.fInvMatrix,
+ SkIntToScalar(x) + SK_ScalarHalf,
+ SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
+
+ SkFixed oneX = s.fFilterOneX;
+ SkFixed oneY = s.fFilterOneY;
+ SkFixed fx = SkScalarToFixed(srcPt.fX) - (oneX >> 1);
+ SkFixed fy = SkScalarToFixed(srcPt.fY) - (oneY >> 1);
+ SkFixed dx = s.fInvSx;
+ SkFixed dy = s.fInvKy;
+ unsigned maxX = s.fBitmap->width() - 1;
+ unsigned maxY = s.fBitmap->height() - 1;
+
+ if (count >= 4) {
+ int32x4_t wide_fy, wide_fx;
+
+ wide_fx = vdupq_n_s32(fx);
+ wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1);
+ wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2);
+ wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3);
+
+ wide_fy = vdupq_n_s32(fy);
+ wide_fy = vsetq_lane_s32(fy+dy, wide_fy, 1);
+ wide_fy = vsetq_lane_s32(fy+dy+dy, wide_fy, 2);
+ wide_fy = vsetq_lane_s32(fy+dy+dy+dy, wide_fy, 3);
+
+ while (count >= 4) {
+ int32x4x2_t vxy;
+
+ // do the X side, then the Y side, then interleave them
+ vxy.val[0] = PACK_FILTER_Y4_NAME(wide_fy, maxY, oneY PREAMBLE_ARG_Y);
+ vxy.val[1] = PACK_FILTER_X4_NAME(wide_fx, maxX, oneX PREAMBLE_ARG_X);
+
+ // interleave as YXYXYXYX as part of the storing
+ vst2q_s32((int32_t*)xy, vxy);
+
+ // prepare next iteration
+ wide_fx += vdupq_n_s32(dx+dx+dx+dx);
+ fx += dx + dx + dx + dx;
+ wide_fy += vdupq_n_s32(dy+dy+dy+dy);
+ fy += dy+dy+dy+dy;
+ xy += 8; // 4 x's, 4 y's
+ count -= 4;
+ }
+ }
+
+ while (--count >= 0) {
+ // NB: writing Y/X
+ *xy++ = PACK_FILTER_Y_NAME(fy, maxY, oneY PREAMBLE_ARG_Y);
+ fy += dy;
+ *xy++ = PACK_FILTER_X_NAME(fx, maxX, oneX PREAMBLE_ARG_X);
+ fx += dx;
+ }
+}
+
+static void PERSP_FILTER_NAME(const SkBitmapProcState& s,
+ uint32_t* SK_RESTRICT xy, int count,
+ int x, int y) {
+ SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask);
+
+ PREAMBLE(s);
+ unsigned maxX = s.fBitmap->width() - 1;
+ unsigned maxY = s.fBitmap->height() - 1;
+ SkFixed oneX = s.fFilterOneX;
+ SkFixed oneY = s.fFilterOneY;
+
+ SkPerspIter iter(s.fInvMatrix,
+ SkIntToScalar(x) + SK_ScalarHalf,
+ SkIntToScalar(y) + SK_ScalarHalf, count);
+
+ while ((count = iter.next()) != 0) {
+ const SkFixed* SK_RESTRICT srcXY = iter.getXY();
+
+ while (count >= 4) {
+ int32x4_t wide_x, wide_y;
+ int32x4x2_t vxy, vresyx;
+
+ // load src: x-y-x-y-x-y-x-y
+ vxy = vld2q_s32(srcXY);
+
+ // do the X side, then the Y side, then interleave them
+ wide_x = vsubq_s32(vxy.val[0], vdupq_n_s32(oneX>>1));
+ wide_y = vsubq_s32(vxy.val[1], vdupq_n_s32(oneY>>1));
+
+ vresyx.val[0] = PACK_FILTER_Y4_NAME(wide_y, maxY, oneY PREAMBLE_ARG_Y);
+ vresyx.val[1] = PACK_FILTER_X4_NAME(wide_x, maxX, oneX PREAMBLE_ARG_X);
+
+ // store interleaved as y-x-y-x-y-x-y-x (NB != read order)
+ vst2q_s32((int32_t*)xy, vresyx);
+
+ // on to the next iteration
+ srcXY += 2*4;
+ count -= 4;
+ xy += 2*4;
+ }
+
+ while (--count >= 0) {
+ // NB: we read x/y, we write y/x
+ *xy++ = PACK_FILTER_Y_NAME(srcXY[1] - (oneY >> 1), maxY,
+ oneY PREAMBLE_ARG_Y);
+ *xy++ = PACK_FILTER_X_NAME(srcXY[0] - (oneX >> 1), maxX,
+ oneX PREAMBLE_ARG_X);
+ srcXY += 2;
+ }
+ }
+}
+
+const SkBitmapProcState::MatrixProc MAKENAME(_Procs)[] = {
+ SCALE_NOFILTER_NAME,
+ SCALE_FILTER_NAME,
+ AFFINE_NOFILTER_NAME,
+ AFFINE_FILTER_NAME,
+ PERSP_NOFILTER_NAME,
+ PERSP_FILTER_NAME
+};
+
+#undef TILEX_PROCF_NEON8
+#undef TILEY_PROCF_NEON8
+#undef TILEX_PROCF_NEON4
+#undef TILEY_PROCF_NEON4
+#undef TILEX_LOW_BITS_NEON4
+#undef TILEY_LOW_BITS_NEON4
+
+#undef MAKENAME
+#undef TILEX_PROCF
+#undef TILEY_PROCF
+#ifdef CHECK_FOR_DECAL
+ #undef CHECK_FOR_DECAL
+#endif
+
+#undef SCALE_NOFILTER_NAME
+#undef SCALE_FILTER_NAME
+#undef AFFINE_NOFILTER_NAME
+#undef AFFINE_FILTER_NAME
+#undef PERSP_NOFILTER_NAME
+#undef PERSP_FILTER_NAME
+
+#undef PREAMBLE
+#undef PREAMBLE_PARAM_X
+#undef PREAMBLE_PARAM_Y
+#undef PREAMBLE_ARG_X
+#undef PREAMBLE_ARG_Y
+
+#undef TILEX_LOW_BITS
+#undef TILEY_LOW_BITS
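The new header leans on TILEX_PROCF_NEON8 / TILEY_PROCF_NEON8 macros that the including file defines and that this diff does not show. A plausible clamp-mode shape for such a helper, assuming eight 16.16 fixed-point coordinates arrive as two int32x4_t halves, is sketched below; clamp_procf_neon8 is a hypothetical name used only for illustration.

#include <arm_neon.h>

static inline int16x8_t clamp_procf_neon8(int32x4_t lo, int32x4_t hi, int max) {
    int32x4_t maxv = vdupq_n_s32(max);
    int32x4_t zero = vdupq_n_s32(0);
    lo = vshrq_n_s32(lo, 16);                        /* keep the integer part */
    hi = vshrq_n_s32(hi, 16);
    lo = vminq_s32(vmaxq_s32(lo, zero), maxv);       /* clamp to 0..max */
    hi = vminq_s32(vmaxq_s32(hi, zero), maxv);
    return vcombine_s16(vmovn_s32(lo), vmovn_s32(hi)); /* narrow to 8x16-bit indices */
}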
diff --git a/chromium/third_party/skia/src/opts/SkBitmapProcState_matrix_repeat_neon.h b/chromium/third_party/skia/src/opts/SkBitmapProcState_matrix_repeat_neon.h
deleted file mode 100644
index 55e2997a5ef..00000000000
--- a/chromium/third_party/skia/src/opts/SkBitmapProcState_matrix_repeat_neon.h
+++ /dev/null
@@ -1,542 +0,0 @@
-/* NEON optimized code (C) COPYRIGHT 2009 Motorola
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-/*
- * Modifications done in-house at Motorola
- *
- * this is a clone of SkBitmapProcState_matrix.h
- * and has been tuned to work with the NEON unit.
- *
- * Still going back and forth on whether this approach
- * (cloning the entire SkBitmapProcState_matrix.h file) is better
- * than putting just the modified routines in here and then using
- * a construct like #define DONT_DO_THIS_FUNCTION or something
- * along those lines...
- *
- * This is for the RepeatX_RepeatY part of the world
- */
-
-
-#include <arm_neon.h>
-
-/*
- * This has been modified on the knowledge that (at the time)
- * we had the following macro definitions in the parent file
- *
- * #define MAKENAME(suffix) RepeatX_RepeatY ## suffix
- * #define TILEX_PROCF(fx, max) (((fx) & 0xFFFF) * ((max) + 1) >> 16)
- * #define TILEY_PROCF(fy, max) (((fy) & 0xFFFF) * ((max) + 1) >> 16)
- * #define TILEX_LOW_BITS(fx, max) ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
- * #define TILEY_LOW_BITS(fy, max) ((((fy) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
- */
-
-/* SkClampMax(val,max) -- bound to 0..max */
-
-#define SCALE_NOFILTER_NAME MAKENAME(_nofilter_scale)
-#define SCALE_FILTER_NAME MAKENAME(_filter_scale)
-#define AFFINE_NOFILTER_NAME MAKENAME(_nofilter_affine)
-#define AFFINE_FILTER_NAME MAKENAME(_filter_affine)
-#define PERSP_NOFILTER_NAME MAKENAME(_nofilter_persp)
-#define PERSP_FILTER_NAME MAKENAME(_filter_persp)
-
-#define PACK_FILTER_X_NAME MAKENAME(_pack_filter_x)
-#define PACK_FILTER_Y_NAME MAKENAME(_pack_filter_y)
-
-#ifndef PREAMBLE
- #define PREAMBLE(state)
- #define PREAMBLE_PARAM_X
- #define PREAMBLE_PARAM_Y
- #define PREAMBLE_ARG_X
- #define PREAMBLE_ARG_Y
-#endif
-
-static void SCALE_NOFILTER_NAME(const SkBitmapProcState& s,
- uint32_t xy[], int count, int x, int y) {
- SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
- SkMatrix::kScale_Mask)) == 0);
-
- PREAMBLE(s);
- // we store y, x, x, x, x, x
-
- const unsigned maxX = s.fBitmap->width() - 1;
- SkFixed fx;
- {
- SkPoint pt;
- s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
- SkIntToScalar(y) + SK_ScalarHalf, &pt);
- fx = SkScalarToFixed(pt.fY);
- const unsigned maxY = s.fBitmap->height() - 1;
- *xy++ = TILEY_PROCF(fx, maxY);
- fx = SkScalarToFixed(pt.fX);
- }
-
- if (0 == maxX) {
- // all of the following X values must be 0
- memset(xy, 0, count * sizeof(uint16_t));
- return;
- }
-
- const SkFixed dx = s.fInvSx;
-
-#ifdef CHECK_FOR_DECAL
- // test if we don't need to apply the tile proc
- if ((unsigned)(fx >> 16) <= maxX &&
- (unsigned)((fx + dx * (count - 1)) >> 16) <= maxX) {
- decal_nofilter_scale_neon(xy, fx, dx, count);
- } else
-#endif
- {
- int i;
-
- /* RBE: very much like what's done in decal_nofilter,
- * but with some processing of the 'fx' information
- * TILEX_PROCF(fx, max) (((fx) & 0xFFFF) * ((max) + 1) >> 16)
- */
- if (count >= 8) {
- /* SkFixed is 16.16 fixed point */
- SkFixed dx2 = dx+dx;
- SkFixed dx4 = dx2+dx2;
- SkFixed dx8 = dx4+dx4;
-
- /* now build fx/fx+dx/fx+2dx/fx+3dx */
- SkFixed fx1, fx2, fx3;
- int32x4_t lbase, hbase;
- int16_t *dst16 = (int16_t *)xy;
-
- fx1 = fx+dx;
- fx2 = fx1+dx;
- fx3 = fx2+dx;
-
- lbase = vdupq_n_s32(fx);
- lbase = vsetq_lane_s32(fx1, lbase, 1);
- lbase = vsetq_lane_s32(fx2, lbase, 2);
- lbase = vsetq_lane_s32(fx3, lbase, 3);
- hbase = vaddq_s32(lbase, vdupq_n_s32(dx4));
-
- /* store & bump */
- do
- {
- int32x4_t lout;
- int32x4_t hout;
- int16x8_t hi16;
-
- /* TILEX_PROCF(fx, max) (((fx)&0xFFFF)*((max)+1)>> 16) */
- /* mask to low 16 (would like to use uzp tricks) */
- lout = vandq_s32(lbase, vdupq_n_s32(0xffff));
- hout = vandq_s32(hbase, vdupq_n_s32(0xffff));
- /* bare multiplication, not SkFixedMul */
- lout = vmulq_s32(lout, vdupq_n_s32(maxX+1));
- hout = vmulq_s32(hout, vdupq_n_s32(maxX+1));
-
- /* extraction, using uzp */
- /* this is ok -- we want all hi(lout)s then all hi(hout)s */
- asm ("vuzpq.16 %q0, %q1" : "+w" (lout), "+w" (hout));
- hi16 = vreinterpretq_s16_s32(hout);
- vst1q_s16(dst16, hi16);
-
- /* bump our base on to the next */
- lbase = vaddq_s32 (lbase, vdupq_n_s32(dx8));
- hbase = vaddq_s32 (hbase, vdupq_n_s32(dx8));
- dst16 += 8;
- count -= 8;
- fx += dx8;
- } while (count >= 8);
- xy = (uint32_t *) dst16;
- }
- uint16_t* xx = (uint16_t*)xy;
- for (i = count; i > 0; --i) {
- *xx++ = TILEX_PROCF(fx, maxX); fx += dx;
- }
- }
-}
-
-// note: we could special-case on a matrix which is skewed in X but not Y.
-// this would require a more general setup than SCALE does, but could use
-// SCALE's inner loop that only looks at dx
-
-
-static void AFFINE_NOFILTER_NAME(const SkBitmapProcState& s,
- uint32_t xy[], int count, int x, int y) {
- SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
- SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
- SkMatrix::kScale_Mask |
- SkMatrix::kAffine_Mask)) == 0);
-
- PREAMBLE(s);
- SkPoint srcPt;
- s.fInvProc(s.fInvMatrix,
- SkIntToScalar(x) + SK_ScalarHalf,
- SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
-
- SkFixed fx = SkScalarToFixed(srcPt.fX);
- SkFixed fy = SkScalarToFixed(srcPt.fY);
- SkFixed dx = s.fInvSx;
- SkFixed dy = s.fInvKy;
- int maxX = s.fBitmap->width() - 1;
- int maxY = s.fBitmap->height() - 1;
-
-#if 0
- int ocount = count;
- uint32_t *oxy = xy;
- SkFixed bfx = fx, bfy=fy, bdx=dx, bdy=dy;
-#endif
-
-
- if (0) { extern void rbe(void); rbe(); }
-
- /* RBE: benchmarks show this eats up time; can we neonize it? */
- /* RBE: very much like what's done in decal_nofilter,
- * but with some processing of the 'fx' information
- * TILEX_PROCF(fx, max) (((fx) & 0xFFFF) * ((max) + 1) >> 16)
- */
- if (count >= 4) {
- /* SkFixed is 16.16 fixed point */
- SkFixed dx4 = dx*4;
- SkFixed dy4 = dy*4;
-
- /* now build fx/fx+dx/fx+2dx/fx+3dx */
- int32x4_t xbase, ybase;
- int16_t *dst16 = (int16_t *)xy;
-
- /* synthesize 4x for both X and Y */
- xbase = vdupq_n_s32(fx);
- xbase = vsetq_lane_s32(fx+dx, xbase, 1);
- xbase = vsetq_lane_s32(fx+dx+dx, xbase, 2);
- xbase = vsetq_lane_s32(fx+dx+dx+dx, xbase, 3);
-
- ybase = vdupq_n_s32(fy);
- ybase = vsetq_lane_s32(fy+dy, ybase, 1);
- ybase = vsetq_lane_s32(fy+dy+dy, ybase, 2);
- ybase = vsetq_lane_s32(fy+dy+dy+dy, ybase, 3);
-
- /* store & bump */
- do {
- int32x4_t xout;
- int32x4_t yout;
- int16x8_t hi16;
-
- /* TILEX_PROCF(fx, max) (((fx)&0xFFFF)*((max)+1)>> 16) */
- /* mask to low 16 (would like to use uzp tricks) */
- xout = vandq_s32(xbase, vdupq_n_s32(0xffff));
- yout = vandq_s32(ybase, vdupq_n_s32(0xffff));
- /* bare multiplication, not SkFixedMul */
- xout = vmulq_s32(xout, vdupq_n_s32(maxX+1));
- yout = vmulq_s32(yout, vdupq_n_s32(maxY+1));
-
- /* put hi16 from xout over low16 from yout */
- yout = vsriq_n_s32(yout, xout, 16);
-
- /* and then yout has the interleaved upper 16's */
- hi16 = vreinterpretq_s16_s32(yout);
- vst1q_s16(dst16, hi16);
-
- /* bump preserved base & on to the next */
- xbase = vaddq_s32 (xbase, vdupq_n_s32(dx4));
- ybase = vaddq_s32 (ybase, vdupq_n_s32(dy4));
- dst16 += 8; /* 8 x16 aka 4x32 */
- count -= 4;
- fx += dx4;
- fy += dy4;
- } while (count >= 4);
- xy = (uint32_t *) dst16;
- }
-
-#if 0
- /* diagnostics... see whether we agree with the NEON code */
- int bad = 0;
- uint32_t *myxy = oxy;
- int myi = (-1);
- SkFixed ofx = bfx, ofy= bfy, odx= bdx, ody= bdy;
- for (myi = ocount; myi > 0; --myi) {
- uint32_t val = (TILEY_PROCF(ofy, maxY) << 16) | TILEX_PROCF(ofx, maxX);
- if (val != *myxy++) {
- bad++;
- break;
- }
- ofx += odx; ofy += ody;
- }
- if (bad) {
- SkDebugf("repeat-nofilter-affine fails\n");
- SkDebugf("count %d myi %d\n", ocount, myi);
- SkDebugf(" bfx %08x, bdx %08x, bfy %08x bdy %08x\n",
- bfx, bdx, bfy, bdy);
- SkDebugf("maxX %08x maxY %08x\n", maxX, maxY);
- }
-#endif
-
- for (int i = count; i > 0; --i) {
- /* fx, fy, dx, dy are all 32 bit 16.16 fixed point */
- /* (((fx) & 0xFFFF) * ((max) + 1) >> 16) */
- *xy++ = (TILEY_PROCF(fy, maxY) << 16) | TILEX_PROCF(fx, maxX);
- fx += dx; fy += dy;
- }
-}
-
-static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s,
- uint32_t* SK_RESTRICT xy,
- int count, int x, int y) {
- SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask);
-
- PREAMBLE(s);
- int maxX = s.fBitmap->width() - 1;
- int maxY = s.fBitmap->height() - 1;
-
- SkPerspIter iter(s.fInvMatrix,
- SkIntToScalar(x) + SK_ScalarHalf,
- SkIntToScalar(y) + SK_ScalarHalf, count);
-
- while ((count = iter.next()) != 0) {
- const SkFixed* SK_RESTRICT srcXY = iter.getXY();
-
- /* RBE: */
- /* TILEX_PROCF(fx, max) (((fx) & 0xFFFF) * ((max) + 1) >> 16) */
- /* it's a little more complicated than what I did for the
- * clamp case -- where I could immediately snip to the top
- * 16 bits and do my min/max games there.
- * ... might only be able to get 4x unrolling here
- */
-
- /* vld2 to get a set of 32x4's ... */
- /* do the tile[xy]_procf operations */
- /* which includes doing vuzp to get hi16's */
- /* store it */
- /* -- inner loop (other than vld2) can be had from above */
-
- /* srcXY is a batch of 32 bit numbers X0,Y0,X1,Y1...
- * but we immediately discard the low 16 bits...
- * so what we're going to do is vld4, which will give us
- * xlo,xhi,ylo,yhi distribution and we can ignore the 'lo'
- * parts....
- */
- if (0) { extern void rbe(void); rbe(); }
- if (count >= 8) {
- int32_t *mysrc = (int32_t *) srcXY;
- int16_t *mydst = (int16_t *) xy;
- do {
- int32x4_t x, y, x2, y2;
- int16x8_t hi, hi2;
-
- /* read array of x,y,x,y,x,y */
- /* vld2 does the de-interleaving for us */
- /* isolate reg-bound scopes; gcc will minimize register
- * motion if possible; this ensures that we don't lose
- * a register across a debugging call because it happens
- * to be bound into a call-clobbered register
- */
- {
- register int32x4_t q0 asm("q0");
- register int32x4_t q1 asm("q1");
- asm ("vld2.32 {q0-q1},[%2] /* x=%q0 y=%q1 */"
- : "=w" (q0), "=w" (q1)
- : "r" (mysrc)
- );
- x = q0; y = q1;
- }
-
- /* offset == 256 bits == 32 bytes == 8 longs */
- {
- register int32x4_t q2 asm("q2");
- register int32x4_t q3 asm("q3");
- asm ("vld2.32 {q2-q3},[%2] /* x=%q0 y=%q1 */"
- : "=w" (q2), "=w" (q3)
- : "r" (mysrc+8)
- );
- x2 = q2; y2 = q3;
- }
-
- /* TILEX_PROCF(fx, max) (((fx)&0xFFFF)*((max)+1)>> 16) */
- /* mask to low 16 (would like to use uzp tricks) */
- /* bare multiplication, not SkFixedMul */
- x = vandq_s32(x, vdupq_n_s32(0xffff));
- x = vmulq_s32(x, vdupq_n_s32(maxX+1));
- y = vandq_s32(y, vdupq_n_s32(0xffff));
- y = vmulq_s32(y, vdupq_n_s32(maxY+1));
-
- x2 = vandq_s32(x2, vdupq_n_s32(0xffff));
- x2 = vmulq_s32(x2, vdupq_n_s32(maxX+1));
- y2 = vandq_s32(y2, vdupq_n_s32(0xffff));
- y2 = vmulq_s32(y2, vdupq_n_s32(maxY+1));
-
- /* now collect interleaved high 16's */
- /* (hi-x, hi-y)4 (hi-x2; hi-y2)4 */
-
- /* extraction, using vsri, leaves hi16's in y */
- y = vsriq_n_s32(y, x, 16);
- hi = vreinterpretq_s16_s32(y);
- vst1q_s16(mydst, hi);
-
- /* and likewise for the second 8 entries */
- y2 = vsriq_n_s32(y2, x2, 16);
- hi2 = vreinterpretq_s16_s32(y2);
- vst1q_s16(mydst+8, hi2);
-
- /* XXX: gcc isn't interleaving these with the NEON ops
- * but I think that all the scoreboarding works out */
- count -= 8; /* 8 iterations */
- mysrc += 16; /* 16 longs */
- mydst += 16; /* 16 shorts, aka 8 longs */
- } while (count >= 8);
- /* get xy and srcXY fixed up */
- srcXY = (const SkFixed *) mysrc;
- xy = (uint32_t *) mydst;
- }
- while (--count >= 0) {
- *xy++ = (TILEY_PROCF(srcXY[1], maxY) << 16) |
- TILEX_PROCF(srcXY[0], maxX);
- srcXY += 2;
- }
- }
-}
-
-//////////////////////////////////////////////////////////////////////////////
-
-static inline uint32_t PACK_FILTER_Y_NAME(SkFixed f, unsigned max,
- SkFixed one PREAMBLE_PARAM_Y) {
- unsigned i = TILEY_PROCF(f, max);
- i = (i << 4) | TILEY_LOW_BITS(f, max);
- return (i << 14) | (TILEY_PROCF((f + one), max));
-}
-
-static inline uint32_t PACK_FILTER_X_NAME(SkFixed f, unsigned max,
- SkFixed one PREAMBLE_PARAM_X) {
- unsigned i = TILEX_PROCF(f, max);
- i = (i << 4) | TILEX_LOW_BITS(f, max);
- return (i << 14) | (TILEX_PROCF((f + one), max));
-}
-
-static void SCALE_FILTER_NAME(const SkBitmapProcState& s,
- uint32_t xy[], int count, int x, int y) {
- SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
- SkMatrix::kScale_Mask)) == 0);
- SkASSERT(s.fInvKy == 0);
-
- PREAMBLE(s);
-
- const unsigned maxX = s.fBitmap->width() - 1;
- const SkFixed one = s.fFilterOneX;
- const SkFractionalInt dx = s.fInvSxFractionalInt;
- SkFractionalInt fx;
-
- {
- SkPoint pt;
- s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
- SkIntToScalar(y) + SK_ScalarHalf, &pt);
- const SkFixed fy = SkScalarToFixed(pt.fY) - (s.fFilterOneY >> 1);
- const unsigned maxY = s.fBitmap->height() - 1;
- // compute our two Y values up front
- *xy++ = PACK_FILTER_Y_NAME(fy, maxY, s.fFilterOneY PREAMBLE_ARG_Y);
- // now initialize fx
- fx = SkScalarToFractionalInt(pt.fX) - (SkFixedToFractionalInt(one) >> 1);
- }
-
-#ifdef CHECK_FOR_DECAL
- // test if we don't need to apply the tile proc
- if (can_truncate_to_fixed_for_decal(fx, dx, count, maxX)) {
- decal_filter_scale_neon(xy, SkFractionalIntToFixed(fx),
- SkFractionalIntToFixed(dx), count);
- } else
-#endif
- {
- do {
- SkFixed fixedFx = SkFractionalIntToFixed(fx);
- *xy++ = PACK_FILTER_X_NAME(fixedFx, maxX, one PREAMBLE_ARG_X);
- fx += dx;
- } while (--count != 0);
- }
-}
-
-static void AFFINE_FILTER_NAME(const SkBitmapProcState& s,
- uint32_t xy[], int count, int x, int y) {
- SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
- SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
- SkMatrix::kScale_Mask |
- SkMatrix::kAffine_Mask)) == 0);
-
- PREAMBLE(s);
- SkPoint srcPt;
- s.fInvProc(s.fInvMatrix,
- SkIntToScalar(x) + SK_ScalarHalf,
- SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
-
- SkFixed oneX = s.fFilterOneX;
- SkFixed oneY = s.fFilterOneY;
- SkFixed fx = SkScalarToFixed(srcPt.fX) - (oneX >> 1);
- SkFixed fy = SkScalarToFixed(srcPt.fY) - (oneY >> 1);
- SkFixed dx = s.fInvSx;
- SkFixed dy = s.fInvKy;
- unsigned maxX = s.fBitmap->width() - 1;
- unsigned maxY = s.fBitmap->height() - 1;
-
- do {
- *xy++ = PACK_FILTER_Y_NAME(fy, maxY, oneY PREAMBLE_ARG_Y);
- fy += dy;
- *xy++ = PACK_FILTER_X_NAME(fx, maxX, oneX PREAMBLE_ARG_X);
- fx += dx;
- } while (--count != 0);
-}
-
-static void PERSP_FILTER_NAME(const SkBitmapProcState& s,
- uint32_t* SK_RESTRICT xy, int count,
- int x, int y) {
- SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask);
-
- extern void rbe(void);
-
- PREAMBLE(s);
- unsigned maxX = s.fBitmap->width() - 1;
- unsigned maxY = s.fBitmap->height() - 1;
- SkFixed oneX = s.fFilterOneX;
- SkFixed oneY = s.fFilterOneY;
-
-
-
- SkPerspIter iter(s.fInvMatrix,
- SkIntToScalar(x) + SK_ScalarHalf,
- SkIntToScalar(y) + SK_ScalarHalf, count);
-
- while ((count = iter.next()) != 0) {
- const SkFixed* SK_RESTRICT srcXY = iter.getXY();
- do {
- *xy++ = PACK_FILTER_Y_NAME(srcXY[1] - (oneY >> 1), maxY,
- oneY PREAMBLE_ARG_Y);
- *xy++ = PACK_FILTER_X_NAME(srcXY[0] - (oneX >> 1), maxX,
- oneX PREAMBLE_ARG_X);
- srcXY += 2;
- } while (--count != 0);
- }
-}
-
-const SkBitmapProcState::MatrixProc MAKENAME(_Procs)[] = {
- SCALE_NOFILTER_NAME,
- SCALE_FILTER_NAME,
- AFFINE_NOFILTER_NAME,
- AFFINE_FILTER_NAME,
- PERSP_NOFILTER_NAME,
- PERSP_FILTER_NAME
-};
-
-#undef MAKENAME
-#undef TILEX_PROCF
-#undef TILEY_PROCF
-#ifdef CHECK_FOR_DECAL
- #undef CHECK_FOR_DECAL
-#endif
-
-#undef SCALE_NOFILTER_NAME
-#undef SCALE_FILTER_NAME
-#undef AFFINE_NOFILTER_NAME
-#undef AFFINE_FILTER_NAME
-#undef PERSP_NOFILTER_NAME
-#undef PERSP_FILTER_NAME
-
-#undef PREAMBLE
-#undef PREAMBLE_PARAM_X
-#undef PREAMBLE_PARAM_Y
-#undef PREAMBLE_ARG_X
-#undef PREAMBLE_ARG_Y
-
-#undef TILEX_LOW_BITS
-#undef TILEY_LOW_BITS
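The repeat tiling described by the deleted header's macro comments boils down to wrapping the fractional 16 bits of the coordinate and rescaling them to the bitmap size. A scalar sketch (repeat_procf is an illustrative name only):

#include <stdint.h>

static inline uint32_t repeat_procf(int32_t fx, uint32_t max) {
    /* keep only the fractional part, then scale it to 0..max */
    return (uint32_t)((fx & 0xFFFF) * (max + 1)) >> 16;
}

/* e.g. with max = 7 (an 8-pixel-wide bitmap), fx = 0x18000 (1.5) wraps to
 * 0x8000 and maps to index 4. */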
diff --git a/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSE2.cpp b/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSE2.cpp
index 0b079977eb8..1f3bbc1f8f7 100644
--- a/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSE2.cpp
+++ b/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSE2.cpp
@@ -1,4 +1,3 @@
-
/*
* Copyright 2009 The Android Open Source Project
*
@@ -6,9 +5,9 @@
* found in the LICENSE file.
*/
-
#include <emmintrin.h>
#include "SkBitmapProcState_opts_SSE2.h"
+#include "SkColorPriv.h"
#include "SkPaint.h"
#include "SkUtils.h"
@@ -17,7 +16,7 @@ void S32_opaque_D32_filter_DX_SSE2(const SkBitmapProcState& s,
int count, uint32_t* colors) {
SkASSERT(count > 0 && colors != NULL);
SkASSERT(s.fFilterLevel != SkPaint::kNone_FilterLevel);
- SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config);
+ SkASSERT(kN32_SkColorType == s.fBitmap->colorType());
SkASSERT(s.fAlphaScale == 256);
const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels());
@@ -123,7 +122,7 @@ void S32_alpha_D32_filter_DX_SSE2(const SkBitmapProcState& s,
int count, uint32_t* colors) {
SkASSERT(count > 0 && colors != NULL);
SkASSERT(s.fFilterLevel != SkPaint::kNone_FilterLevel);
- SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config);
+ SkASSERT(kN32_SkColorType == s.fBitmap->colorType());
SkASSERT(s.fAlphaScale < 256);
const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels());
@@ -639,11 +638,11 @@ void ClampX_ClampY_nofilter_affine_SSE2(const SkBitmapProcState& s,
* It combines S32_opaque_D32_filter_DX_SSE2 and SkPixel32ToPixel16
*/
void S32_D16_filter_DX_SSE2(const SkBitmapProcState& s,
- const uint32_t* xy,
- int count, uint16_t* colors) {
+ const uint32_t* xy,
+ int count, uint16_t* colors) {
SkASSERT(count > 0 && colors != NULL);
SkASSERT(s.fFilterLevel != SkPaint::kNone_FilterLevel);
- SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config);
+ SkASSERT(kN32_SkColorType == s.fBitmap->colorType());
SkASSERT(s.fBitmap->isOpaque());
SkPMColor dstColor;
@@ -744,23 +743,6 @@ void S32_D16_filter_DX_SSE2(const SkBitmapProcState& s,
// Extract low int and store.
dstColor = _mm_cvtsi128_si32(sum);
- //*colors++ = SkPixel32ToPixel16(dstColor);
- // below is much faster than the above. It's tested for Android benchmark--Softweg
- __m128i _m_temp1 = _mm_set1_epi32(dstColor);
- __m128i _m_temp2 = _mm_srli_epi32(_m_temp1, 3);
-
- unsigned int r32 = _mm_cvtsi128_si32(_m_temp2);
- unsigned r = (r32 & ((1<<5) -1)) << 11;
-
- _m_temp2 = _mm_srli_epi32(_m_temp2, 7);
- unsigned int g32 = _mm_cvtsi128_si32(_m_temp2);
- unsigned g = (g32 & ((1<<6) -1)) << 5;
-
- _m_temp2 = _mm_srli_epi32(_m_temp2, 9);
- unsigned int b32 = _mm_cvtsi128_si32(_m_temp2);
- unsigned b = (b32 & ((1<<5) -1));
-
- *colors++ = r | g | b;
-
+ *colors++ = SkPixel32ToPixel16(dstColor);
} while (--count > 0);
}
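The last hunk above drops the hand-unrolled SSE2 conversion to 565 and goes back to SkPixel32ToPixel16. As a rough sketch of what that conversion does, assuming an 8888 pixel laid out as A:31-24 R:23-16 G:15-8 B:7-0 (the actual channel order depends on the build's SkPMColor configuration):

#include <stdint.h>

static inline uint16_t pixel32_to_565(uint32_t c) {
    uint32_t r = (c >> 19) & 0x1F;   /* top 5 bits of red   */
    uint32_t g = (c >> 10) & 0x3F;   /* top 6 bits of green */
    uint32_t b = (c >>  3) & 0x1F;   /* top 5 bits of blue  */
    return (uint16_t)((r << 11) | (g << 5) | b);
}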
diff --git a/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSE2.h b/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSE2.h
index 46e35a0f96f..82c5cc8d6e1 100644
--- a/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSE2.h
+++ b/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSE2.h
@@ -1,4 +1,3 @@
-
/*
* Copyright 2009 The Android Open Source Project
*
@@ -6,6 +5,8 @@
* found in the LICENSE file.
*/
+#ifndef SkBitmapProcState_opts_SSE2_DEFINED
+#define SkBitmapProcState_opts_SSE2_DEFINED
#include "SkBitmapProcState.h"
@@ -24,7 +25,9 @@ void ClampX_ClampY_nofilter_scale_SSE2(const SkBitmapProcState& s,
void ClampX_ClampY_filter_affine_SSE2(const SkBitmapProcState& s,
uint32_t xy[], int count, int x, int y);
void ClampX_ClampY_nofilter_affine_SSE2(const SkBitmapProcState& s,
- uint32_t xy[], int count, int x, int y);
+ uint32_t xy[], int count, int x, int y);
void S32_D16_filter_DX_SSE2(const SkBitmapProcState& s,
- const uint32_t* xy,
- int count, uint16_t* colors);
+ const uint32_t* xy,
+ int count, uint16_t* colors);
+
+#endif
diff --git a/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSSE3.cpp b/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSSE3.cpp
index f8342ecaad5..5b97215cc01 100644
--- a/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSSE3.cpp
+++ b/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSSE3.cpp
@@ -5,11 +5,19 @@
* found in the LICENSE file.
*/
-#include <tmmintrin.h> // SSSE3
#include "SkBitmapProcState_opts_SSSE3.h"
#include "SkPaint.h"
#include "SkUtils.h"
+/* With the exception of the Android framework we always build the SSSE3 functions
+ * and enable the caller to determine SSSE3 support. However for the Android framework
+ * if the device does not support SSSE3 then the compiler will not supply the required
+ * -mssse3 option needed to build this file, so instead we provide a stub implementation.
+ */
+#if !defined(SK_BUILD_FOR_ANDROID_FRAMEWORK) || SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
+
+#include <tmmintrin.h> // SSSE3
+
// adding anonymous namespace seemed to force gcc to inline directly the
// instantiation, instead of creating the functions
// S32_generic_D32_filter_DX_SSSE3<true> and
@@ -387,7 +395,7 @@ void S32_generic_D32_filter_DX_SSSE3(const SkBitmapProcState& s,
int count, uint32_t* colors) {
SkASSERT(count > 0 && colors != NULL);
SkASSERT(s.fFilterLevel != SkPaint::kNone_FilterLevel);
- SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config);
+ SkASSERT(kN32_SkColorType == s.fBitmap->colorType());
if (has_alpha) {
SkASSERT(s.fAlphaScale < 256);
} else {
@@ -417,9 +425,10 @@ void S32_generic_D32_filter_DX_SSSE3(const SkBitmapProcState& s,
const __m128i zero = _mm_setzero_si128();
__m128i alpha = _mm_setzero_si128();
- if (has_alpha)
+ if (has_alpha) {
// 8x(alpha)
alpha = _mm_set1_epi16(s.fAlphaScale);
+ }
if (sub_y == 0) {
// Unroll 4x, interleave bytes, use pmaddubsw (all_x is small)
@@ -578,7 +587,7 @@ void S32_generic_D32_filter_DXDY_SSSE3(const SkBitmapProcState& s,
int count, uint32_t* colors) {
SkASSERT(count > 0 && colors != NULL);
SkASSERT(s.fFilterLevel != SkPaint::kNone_FilterLevel);
- SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config);
+ SkASSERT(kN32_SkColorType == s.fBitmap->colorType());
if (has_alpha) {
SkASSERT(s.fAlphaScale < 256);
} else {
@@ -697,7 +706,7 @@ void S32_generic_D32_filter_DXDY_SSSE3(const SkBitmapProcState& s,
*colors++ = _mm_cvtsi128_si32(sum0);
}
}
-} // namepace
+} // namespace
void S32_opaque_D32_filter_DX_SSSE3(const SkBitmapProcState& s,
const uint32_t* xy,
@@ -722,3 +731,31 @@ void S32_alpha_D32_filter_DXDY_SSSE3(const SkBitmapProcState& s,
int count, uint32_t* colors) {
S32_generic_D32_filter_DXDY_SSSE3<true>(s, xy, count, colors);
}
+
+#else // !defined(SK_BUILD_FOR_ANDROID_FRAMEWORK) || SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
+
+void S32_opaque_D32_filter_DX_SSSE3(const SkBitmapProcState& s,
+ const uint32_t* xy,
+ int count, uint32_t* colors) {
+ sk_throw();
+}
+
+void S32_alpha_D32_filter_DX_SSSE3(const SkBitmapProcState& s,
+ const uint32_t* xy,
+ int count, uint32_t* colors) {
+ sk_throw();
+}
+
+void S32_opaque_D32_filter_DXDY_SSSE3(const SkBitmapProcState& s,
+ const uint32_t* xy,
+ int count, uint32_t* colors) {
+ sk_throw();
+}
+
+void S32_alpha_D32_filter_DXDY_SSSE3(const SkBitmapProcState& s,
+ const uint32_t* xy,
+ int count, uint32_t* colors) {
+ sk_throw();
+}
+
+#endif
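The added comment leaves SSSE3 detection to the caller. Purely as an illustration of such a caller-side gate (Skia's real dispatch goes through its own CPU-feature plumbing; filter_DX_dispatch is a made-up name), a sketch using the GCC/Clang builtin:

#include "SkBitmapProcState_opts_SSE2.h"
#include "SkBitmapProcState_opts_SSSE3.h"

static void filter_DX_dispatch(const SkBitmapProcState& s, const uint32_t* xy,
                               int count, uint32_t* colors) {
    if (__builtin_cpu_supports("ssse3")) {
        S32_opaque_D32_filter_DX_SSSE3(s, xy, count, colors);   /* fast path */
    } else {
        S32_opaque_D32_filter_DX_SSE2(s, xy, count, colors);    /* baseline */
    }
}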
diff --git a/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSSE3.h b/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSSE3.h
index 176f2bfbe74..9fd074aacf2 100644
--- a/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSSE3.h
+++ b/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSSE3.h
@@ -5,6 +5,9 @@
* found in the LICENSE file.
*/
+#ifndef SkBitmapProcState_opts_SSSE3_DEFINED
+#define SkBitmapProcState_opts_SSSE3_DEFINED
+
#include "SkBitmapProcState.h"
void S32_opaque_D32_filter_DX_SSSE3(const SkBitmapProcState& s,
@@ -19,3 +22,5 @@ void S32_opaque_D32_filter_DXDY_SSSE3(const SkBitmapProcState& s,
void S32_alpha_D32_filter_DXDY_SSSE3(const SkBitmapProcState& s,
const uint32_t* xy,
int count, uint32_t* colors);
+
+#endif
diff --git a/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_arm.cpp b/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_arm.cpp
index 96fbebd4e19..ffa0ccfa8aa 100644
--- a/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_arm.cpp
+++ b/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_arm.cpp
@@ -15,7 +15,7 @@
#include "SkConvolver.h"
-#if SK_ARM_ARCH >= 6 && !defined(SK_CPU_BENDIAN)
+#if !defined(SK_CPU_ARM64) && SK_ARM_ARCH >= 6 && !defined(SK_CPU_BENDIAN)
void SI8_D16_nofilter_DX_arm(
const SkBitmapProcState& s,
const uint32_t* SK_RESTRICT xy,
@@ -186,7 +186,7 @@ void SI8_opaque_D32_nofilter_DX_arm(const SkBitmapProcState& s,
s.fBitmap->getColorTable()->unlockColors();
}
-#endif // SK_ARM_ARCH >= 6 && !defined(SK_CPU_BENDIAN)
+#endif // !defined(SK_CPU_ARM64) && SK_ARM_ARCH >= 6 && !defined(SK_CPU_BENDIAN)
///////////////////////////////////////////////////////////////////////////////
@@ -194,6 +194,7 @@ void SI8_opaque_D32_nofilter_DX_arm(const SkBitmapProcState& s,
otherwise the shader won't even look at the matrix/sampler
*/
void SkBitmapProcState::platformProcs() {
+#if !defined(SK_CPU_ARM64) && SK_ARM_ARCH >= 6 && !defined(SK_CPU_BENDIAN)
bool isOpaque = 256 == fAlphaScale;
bool justDx = false;
@@ -201,9 +202,8 @@ void SkBitmapProcState::platformProcs() {
justDx = true;
}
- switch (fBitmap->config()) {
- case SkBitmap::kIndex8_Config:
-#if SK_ARM_ARCH >= 6 && !defined(SK_CPU_BENDIAN)
+ switch (fBitmap->colorType()) {
+ case kIndex_8_SkColorType:
if (justDx && SkPaint::kNone_FilterLevel == fFilterLevel) {
#if 0 /* crashing on android device */
fSampleProc16 = SI8_D16_nofilter_DX_arm;
@@ -215,11 +215,11 @@ void SkBitmapProcState::platformProcs() {
fShaderProc32 = NULL;
}
}
-#endif
break;
default:
break;
}
+#endif
}
///////////////////////////////////////////////////////////////////////////////
diff --git a/chromium/third_party/skia/src/opts/SkBlitMask_opts_arm.cpp b/chromium/third_party/skia/src/opts/SkBlitMask_opts_arm.cpp
index 2bf760313c1..11e172c0d1d 100644
--- a/chromium/third_party/skia/src/opts/SkBlitMask_opts_arm.cpp
+++ b/chromium/third_party/skia/src/opts/SkBlitMask_opts_arm.cpp
@@ -1,3 +1,9 @@
+/*
+ * Copyright 2014 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
#include "SkColor.h"
#include "SkColorPriv.h"
@@ -5,21 +11,24 @@
#include "SkUtilsArm.h"
#include "SkBlitMask_opts_arm_neon.h"
-SkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkBitmap::Config dstConfig,
+SkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkColorType dstCT,
SkMask::Format maskFormat,
SkColor color) {
#if SK_ARM_NEON_IS_NONE
return NULL;
#else
+/* ** This has been disabled until we can diagnose and fix the SIGILL generated
+ ** in the NEON code. See http://skbug.com/2067 for details.
#if SK_ARM_NEON_IS_DYNAMIC
if (!sk_cpu_arm_has_neon()) {
return NULL;
}
#endif
- if ((SkBitmap::kARGB_8888_Config == dstConfig) &&
+ if ((kN32_SkColorType == dstCT) &&
(SkMask::kA8_Format == maskFormat)) {
return D32_A8_Factory_neon(color);
}
+*/
#endif
// We don't need to handle the SkMask::kLCD16_Format case as the default
@@ -36,7 +45,7 @@ SkBlitMask::BlitLCD16RowProc SkBlitMask::PlatformBlitRowProcs16(bool isOpaque) {
}
}
-SkBlitMask::RowProc SkBlitMask::PlatformRowProcs(SkBitmap::Config dstConfig,
+SkBlitMask::RowProc SkBlitMask::PlatformRowProcs(SkColorType dstCT,
SkMask::Format maskFormat,
RowFlags flags) {
return NULL;
diff --git a/chromium/third_party/skia/src/opts/SkBlitMask_opts_none.cpp b/chromium/third_party/skia/src/opts/SkBlitMask_opts_none.cpp
index 0ad09193871..90f89a71292 100644
--- a/chromium/third_party/skia/src/opts/SkBlitMask_opts_none.cpp
+++ b/chromium/third_party/skia/src/opts/SkBlitMask_opts_none.cpp
@@ -1,7 +1,13 @@
+/*
+ * Copyright 2014 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
#include "SkBlitMask.h"
-SkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkBitmap::Config dstConfig,
+SkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkColorType dstCT,
SkMask::Format maskFormat,
SkColor color) {
return NULL;
@@ -11,7 +17,7 @@ SkBlitMask::BlitLCD16RowProc SkBlitMask::PlatformBlitRowProcs16(bool isOpaque) {
return NULL;
}
-SkBlitMask::RowProc SkBlitMask::PlatformRowProcs(SkBitmap::Config dstConfig,
+SkBlitMask::RowProc SkBlitMask::PlatformRowProcs(SkColorType dstCT,
SkMask::Format maskFormat,
RowFlags flags) {
return NULL;
diff --git a/chromium/third_party/skia/src/opts/SkBlitRect_opts_SSE2.cpp b/chromium/third_party/skia/src/opts/SkBlitRect_opts_SSE2.cpp
index 3cb2b9c6d09..d65a313dadf 100644
--- a/chromium/third_party/skia/src/opts/SkBlitRect_opts_SSE2.cpp
+++ b/chromium/third_party/skia/src/opts/SkBlitRect_opts_SSE2.cpp
@@ -5,15 +5,14 @@
* found in the LICENSE file.
*/
+#include <emmintrin.h>
#include "SkBlitRect_opts_SSE2.h"
#include "SkBlitRow.h"
#include "SkColorPriv.h"
-#include <emmintrin.h>
-
-/** Simple blitting of opaque rectangles less than 31 pixels wide:
- inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2.
-*/
+/* Simple blitting of opaque rectangles less than 31 pixels wide:
+ * inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2.
+ */
static void BlitRect32_OpaqueNarrow_SSE2(SkPMColor* SK_RESTRICT destination,
int width, int height,
size_t rowBytes, uint32_t color) {
@@ -42,12 +41,12 @@ static void BlitRect32_OpaqueNarrow_SSE2(SkPMColor* SK_RESTRICT destination,
}
}
-/**
- Fast blitting of opaque rectangles at least 31 pixels wide:
- inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2.
- A 31 pixel rectangle is guaranteed to have at least one
- 16-pixel aligned span that can take advantage of mm_store.
-*/
+/*
+ * Fast blitting of opaque rectangles at least 31 pixels wide:
+ * inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2.
+ * A 31 pixel rectangle is guaranteed to have at least one
+ * 16-pixel aligned span that can take advantage of mm_store.
+ */
static void BlitRect32_OpaqueWide_SSE2(SkPMColor* SK_RESTRICT destination,
int width, int height,
size_t rowBytes, uint32_t color) {
diff --git a/chromium/third_party/skia/src/opts/SkBlitRect_opts_SSE2.h b/chromium/third_party/skia/src/opts/SkBlitRect_opts_SSE2.h
index 4d2f74a4b1b..3d09f5c3abc 100644
--- a/chromium/third_party/skia/src/opts/SkBlitRect_opts_SSE2.h
+++ b/chromium/third_party/skia/src/opts/SkBlitRect_opts_SSE2.h
@@ -8,13 +8,11 @@
#ifndef SkBlitRect_opts_SSE2_DEFINED
#define SkBlitRect_opts_SSE2_DEFINED
-/*
- These functions' implementations copy sections of both
- SkBlitRow_opts_SSE2 and SkUtils_opts_SSE2.
-*/
-
#include "SkColor.h"
+/* These functions' implementations copy sections of both
+ * SkBlitRow_opts_SSE2 and SkUtils_opts_SSE2.
+ */
void ColorRect32_SSE2(SkPMColor* SK_RESTRICT dst,
int width, int height,
size_t rowBytes, uint32_t color);
diff --git a/chromium/third_party/skia/src/opts/SkBlitRow_opts_SSE2.cpp b/chromium/third_party/skia/src/opts/SkBlitRow_opts_SSE2.cpp
index f3d010e3bc4..391b24c8673 100644
--- a/chromium/third_party/skia/src/opts/SkBlitRow_opts_SSE2.cpp
+++ b/chromium/third_party/skia/src/opts/SkBlitRow_opts_SSE2.cpp
@@ -5,14 +5,14 @@
* found in the LICENSE file.
*/
-
-#include "SkBlitRow_opts_SSE2.h"
+#include <emmintrin.h>
#include "SkBitmapProcState_opts_SSE2.h"
+#include "SkBlitRow_opts_SSE2.h"
#include "SkColorPriv.h"
+#include "SkColor_opts_SSE2.h"
+#include "SkDither.h"
#include "SkUtils.h"
-#include <emmintrin.h>
-
/* SSE2 version of S32_Blend_BlitRow32()
* portable version is in core/SkBlitRow_D32.cpp
*/
@@ -177,7 +177,7 @@ void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
d++;
count -= 4;
}
- #else
+#else
__m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
__m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit)
while (count >= 4) {
@@ -340,7 +340,6 @@ void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
*/
void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
SkPMColor color) {
-
if (count <= 0) {
return;
}
@@ -404,7 +403,7 @@ void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
}
src = reinterpret_cast<const SkPMColor*>(s);
dst = reinterpret_cast<SkPMColor*>(d);
- }
+ }
while (count > 0) {
*dst = color + SkAlphaMulQ(*src, scale);
@@ -502,7 +501,7 @@ void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
}
dst = reinterpret_cast<SkPMColor *>(d);
}
- while(count > 0) {
+ while (count > 0) {
*dst= SkBlendARGB32(color, *dst, *mask);
dst += 1;
mask++;
@@ -851,3 +850,512 @@ void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],
width--;
}
}
+
+/* SSE2 version of S32_D565_Opaque()
+ * portable version is in core/SkBlitRow_D16.cpp
+ */
+void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
+ const SkPMColor* SK_RESTRICT src, int count,
+ U8CPU alpha, int /*x*/, int /*y*/) {
+ SkASSERT(255 == alpha);
+
+ if (count <= 0) {
+ return;
+ }
+
+ if (count >= 8) {
+ while (((size_t)dst & 0x0F) != 0) {
+ SkPMColor c = *src++;
+ SkPMColorAssert(c);
+
+ *dst++ = SkPixel32ToPixel16_ToU16(c);
+ count--;
+ }
+
+ const __m128i* s = reinterpret_cast<const __m128i*>(src);
+ __m128i* d = reinterpret_cast<__m128i*>(dst);
+ __m128i r16_mask = _mm_set1_epi32(SK_R16_MASK);
+ __m128i g16_mask = _mm_set1_epi32(SK_G16_MASK);
+ __m128i b16_mask = _mm_set1_epi32(SK_B16_MASK);
+
+ while (count >= 8) {
+ // Load 8 pixels of src.
+ __m128i src_pixel1 = _mm_loadu_si128(s++);
+ __m128i src_pixel2 = _mm_loadu_si128(s++);
+
+ // Calculate result r.
+ __m128i r1 = _mm_srli_epi32(src_pixel1,
+ SK_R32_SHIFT + (8 - SK_R16_BITS));
+ r1 = _mm_and_si128(r1, r16_mask);
+ __m128i r2 = _mm_srli_epi32(src_pixel2,
+ SK_R32_SHIFT + (8 - SK_R16_BITS));
+ r2 = _mm_and_si128(r2, r16_mask);
+ __m128i r = _mm_packs_epi32(r1, r2);
+
+ // Calculate result g.
+ __m128i g1 = _mm_srli_epi32(src_pixel1,
+ SK_G32_SHIFT + (8 - SK_G16_BITS));
+ g1 = _mm_and_si128(g1, g16_mask);
+ __m128i g2 = _mm_srli_epi32(src_pixel2,
+ SK_G32_SHIFT + (8 - SK_G16_BITS));
+ g2 = _mm_and_si128(g2, g16_mask);
+ __m128i g = _mm_packs_epi32(g1, g2);
+
+ // Calculate result b.
+ __m128i b1 = _mm_srli_epi32(src_pixel1,
+ SK_B32_SHIFT + (8 - SK_B16_BITS));
+ b1 = _mm_and_si128(b1, b16_mask);
+ __m128i b2 = _mm_srli_epi32(src_pixel2,
+ SK_B32_SHIFT + (8 - SK_B16_BITS));
+ b2 = _mm_and_si128(b2, b16_mask);
+ __m128i b = _mm_packs_epi32(b1, b2);
+
+ // Store 8 16-bit colors in dst.
+ __m128i d_pixel = SkPackRGB16_SSE2(r, g, b);
+ _mm_store_si128(d++, d_pixel);
+ count -= 8;
+ }
+ src = reinterpret_cast<const SkPMColor*>(s);
+ dst = reinterpret_cast<uint16_t*>(d);
+ }
+
+ if (count > 0) {
+ do {
+ SkPMColor c = *src++;
+ SkPMColorAssert(c);
+ *dst++ = SkPixel32ToPixel16_ToU16(c);
+ } while (--count != 0);
+ }
+}
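For reference, the vector loop above performs the plain 8888 -> 565 down-shift per pixel (the scalar helper is SkPixel32ToPixel16_ToU16). A minimal scalar sketch, assuming the usual channel shifts (SK_R32_SHIFT == 16, SK_G32_SHIFT == 8, SK_B32_SHIFT == 0):

    #include <cstdint>

    static inline uint16_t pixel32_to_565(uint32_t argb) {
        uint32_t r = (argb >> 16) & 0xFF;  // assumed SK_R32_SHIFT == 16
        uint32_t g = (argb >>  8) & 0xFF;  // assumed SK_G32_SHIFT == 8
        uint32_t b =  argb        & 0xFF;  // assumed SK_B32_SHIFT == 0
        // keep the top 5/6/5 bits of each channel and pack
        return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
    }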
+
+/* SSE2 version of S32A_D565_Opaque()
+ * portable version is in core/SkBlitRow_D16.cpp
+ */
+void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
+ const SkPMColor* SK_RESTRICT src,
+ int count, U8CPU alpha, int /*x*/, int /*y*/) {
+ SkASSERT(255 == alpha);
+
+ if (count <= 0) {
+ return;
+ }
+
+ if (count >= 8) {
+        // Make dst 16-byte aligned.
+ while (((size_t)dst & 0x0F) != 0) {
+ SkPMColor c = *src++;
+ if (c) {
+ *dst = SkSrcOver32To16(c, *dst);
+ }
+ dst += 1;
+ count--;
+ }
+
+ const __m128i* s = reinterpret_cast<const __m128i*>(src);
+ __m128i* d = reinterpret_cast<__m128i*>(dst);
+ __m128i var255 = _mm_set1_epi16(255);
+ __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
+ __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
+ __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
+
+ while (count >= 8) {
+ // Load 8 pixels of src.
+ __m128i src_pixel1 = _mm_loadu_si128(s++);
+ __m128i src_pixel2 = _mm_loadu_si128(s++);
+
+            // Compare the src pixels against zero and collect the high bit of
+            // each result byte; if all eight src pixels are zero, src_cmp1 and
+            // src_cmp2 will both be 0xFFFF.
+ int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1,
+ _mm_setzero_si128()));
+ int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2,
+ _mm_setzero_si128()));
+ if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) {
+ d++;
+ count -= 8;
+ continue;
+ }
+
+ // Load 8 pixels of dst.
+ __m128i dst_pixel = _mm_load_si128(d);
+
+ // Extract A from src.
+ __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
+ sa1 = _mm_srli_epi32(sa1, 24);
+ __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
+ sa2 = _mm_srli_epi32(sa2, 24);
+ __m128i sa = _mm_packs_epi32(sa1, sa2);
+
+ // Extract R from src.
+ __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
+ sr1 = _mm_srli_epi32(sr1, 24);
+ __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
+ sr2 = _mm_srli_epi32(sr2, 24);
+ __m128i sr = _mm_packs_epi32(sr1, sr2);
+
+ // Extract G from src.
+ __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
+ sg1 = _mm_srli_epi32(sg1, 24);
+ __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
+ sg2 = _mm_srli_epi32(sg2, 24);
+ __m128i sg = _mm_packs_epi32(sg1, sg2);
+
+ // Extract B from src.
+ __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
+ sb1 = _mm_srli_epi32(sb1, 24);
+ __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
+ sb2 = _mm_srli_epi32(sb2, 24);
+ __m128i sb = _mm_packs_epi32(sb1, sb2);
+
+ // Extract R G B from dst.
+ __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
+ dr = _mm_and_si128(dr, r16_mask);
+ __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
+ dg = _mm_and_si128(dg, g16_mask);
+ __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
+ db = _mm_and_si128(db, b16_mask);
+
+        __m128i isa = _mm_sub_epi16(var255, sa); // 255 - sa
+
+ // Calculate R G B of result.
+ // Original algorithm is in SkSrcOver32To16().
+ dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE2(dr, isa, SK_R16_BITS));
+ dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS);
+ dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE2(dg, isa, SK_G16_BITS));
+ dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS);
+ db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE2(db, isa, SK_B16_BITS));
+ db = _mm_srli_epi16(db, 8 - SK_B16_BITS);
+
+ // Pack R G B into 16-bit color.
+ __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);
+
+ // Store 8 16-bit colors in dst.
+ _mm_store_si128(d++, d_pixel);
+ count -= 8;
+ }
+
+ src = reinterpret_cast<const SkPMColor*>(s);
+ dst = reinterpret_cast<uint16_t*>(d);
+ }
+
+ if (count > 0) {
+ do {
+ SkPMColor c = *src++;
+ SkPMColorAssert(c);
+ if (c) {
+ *dst = SkSrcOver32To16(c, *dst);
+ }
+ dst += 1;
+ } while (--count != 0);
+ }
+}
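The vector loop above mirrors the scalar SkSrcOver32To16(): each 565 destination channel is scaled by (255 - srcAlpha), added to the corresponding 8-bit source channel, and reduced back to 5/6/5 bits. A scalar sketch, with the rounding helper written out under the assumption that it matches SkMul16ShiftRound_SSE2 (rounded a*b / ((1 << shift) - 1)) and the usual ARGB channel order:

    #include <cstdint>

    static inline unsigned mul16_shift_round(unsigned a, unsigned b, int shift) {
        unsigned prod = a * b + (1u << (shift - 1));
        return (prod + (prod >> shift)) >> shift;
    }

    static inline uint16_t srcover_32_to_16(uint32_t src, uint16_t dst) {
        unsigned a  = (src >> 24) & 0xFF;   // assumed SK_A32_SHIFT == 24
        unsigned sr = (src >> 16) & 0xFF;
        unsigned sg = (src >>  8) & 0xFF;
        unsigned sb =  src        & 0xFF;
        unsigned dr = (dst >> 11) & 0x1F;
        unsigned dg = (dst >>  5) & 0x3F;
        unsigned db =  dst        & 0x1F;
        unsigned isa = 255 - a;
        unsigned r = (sr + mul16_shift_round(dr, isa, 5)) >> 3;
        unsigned g = (sg + mul16_shift_round(dg, isa, 6)) >> 2;
        unsigned b = (sb + mul16_shift_round(db, isa, 5)) >> 3;
        return (uint16_t)((r << 11) | (g << 5) | b);
    }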
+
+void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
+ const SkPMColor* SK_RESTRICT src,
+ int count, U8CPU alpha, int x, int y) {
+ SkASSERT(255 == alpha);
+
+ if (count <= 0) {
+ return;
+ }
+
+ if (count >= 8) {
+ while (((size_t)dst & 0x0F) != 0) {
+ DITHER_565_SCAN(y);
+ SkPMColor c = *src++;
+ SkPMColorAssert(c);
+
+ unsigned dither = DITHER_VALUE(x);
+ *dst++ = SkDitherRGB32To565(c, dither);
+ DITHER_INC_X(x);
+ count--;
+ }
+
+ unsigned short dither_value[8];
+ __m128i dither;
+#ifdef ENABLE_DITHER_MATRIX_4X4
+ const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
+ dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
+ dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
+ dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
+ dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
+#else
+ const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
+ dither_value[0] = dither_value[4] = (dither_scan
+ >> (((x) & 3) << 2)) & 0xF;
+ dither_value[1] = dither_value[5] = (dither_scan
+ >> (((x + 1) & 3) << 2)) & 0xF;
+ dither_value[2] = dither_value[6] = (dither_scan
+ >> (((x + 2) & 3) << 2)) & 0xF;
+ dither_value[3] = dither_value[7] = (dither_scan
+ >> (((x + 3) & 3) << 2)) & 0xF;
+#endif
+ dither = _mm_loadu_si128((__m128i*) dither_value);
+
+ const __m128i* s = reinterpret_cast<const __m128i*>(src);
+ __m128i* d = reinterpret_cast<__m128i*>(dst);
+
+ while (count >= 8) {
+ // Load 8 pixels of src.
+ __m128i src_pixel1 = _mm_loadu_si128(s++);
+ __m128i src_pixel2 = _mm_loadu_si128(s++);
+
+ // Extract R from src.
+ __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
+ sr1 = _mm_srli_epi32(sr1, 24);
+ __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
+ sr2 = _mm_srli_epi32(sr2, 24);
+ __m128i sr = _mm_packs_epi32(sr1, sr2);
+
+ // SkDITHER_R32To565(sr, dither)
+ __m128i sr_offset = _mm_srli_epi16(sr, 5);
+ sr = _mm_add_epi16(sr, dither);
+ sr = _mm_sub_epi16(sr, sr_offset);
+ sr = _mm_srli_epi16(sr, SK_R32_BITS - SK_R16_BITS);
+
+ // Extract G from src.
+ __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
+ sg1 = _mm_srli_epi32(sg1, 24);
+ __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
+ sg2 = _mm_srli_epi32(sg2, 24);
+ __m128i sg = _mm_packs_epi32(sg1, sg2);
+
+            // SkDITHER_G32To565(sg, dither)
+ __m128i sg_offset = _mm_srli_epi16(sg, 6);
+ sg = _mm_add_epi16(sg, _mm_srli_epi16(dither, 1));
+ sg = _mm_sub_epi16(sg, sg_offset);
+ sg = _mm_srli_epi16(sg, SK_G32_BITS - SK_G16_BITS);
+
+ // Extract B from src.
+ __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
+ sb1 = _mm_srli_epi32(sb1, 24);
+ __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
+ sb2 = _mm_srli_epi32(sb2, 24);
+ __m128i sb = _mm_packs_epi32(sb1, sb2);
+
+            // SkDITHER_B32To565(sb, dither)
+ __m128i sb_offset = _mm_srli_epi16(sb, 5);
+ sb = _mm_add_epi16(sb, dither);
+ sb = _mm_sub_epi16(sb, sb_offset);
+ sb = _mm_srli_epi16(sb, SK_B32_BITS - SK_B16_BITS);
+
+ // Pack and store 16-bit dst pixel.
+ __m128i d_pixel = SkPackRGB16_SSE2(sr, sg, sb);
+ _mm_store_si128(d++, d_pixel);
+
+ count -= 8;
+ x += 8;
+ }
+
+ src = reinterpret_cast<const SkPMColor*>(s);
+ dst = reinterpret_cast<uint16_t*>(d);
+ }
+
+ if (count > 0) {
+ DITHER_565_SCAN(y);
+ do {
+ SkPMColor c = *src++;
+ SkPMColorAssert(c);
+
+ unsigned dither = DITHER_VALUE(x);
+ *dst++ = SkDitherRGB32To565(c, dither);
+ DITHER_INC_X(x);
+ } while (--count != 0);
+ }
+}
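Per pixel, the loop above is the dithered 8888 -> 565 conversion (SkDitherRGB32To565). A scalar sketch, assuming the usual ARGB order, with d the 3-bit dither value picked by DITHER_VALUE(x):

    #include <cstdint>

    static inline uint16_t dither_32_to_565(uint32_t c, unsigned d /* 0..7 */) {
        unsigned r = (c >> 16) & 0xFF;
        unsigned g = (c >>  8) & 0xFF;
        unsigned b =  c        & 0xFF;
        r = (r + d        - (r >> 5)) >> 3;   // SkDITHER_R32To565(r, d)
        g = (g + (d >> 1) - (g >> 6)) >> 2;   // SkDITHER_G32To565(g, d)
        b = (b + d        - (b >> 5)) >> 3;   // SkDITHER_B32To565(b, d)
        return (uint16_t)((r << 11) | (g << 5) | b);
    }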
+
+/* SSE2 version of S32A_D565_Opaque_Dither()
+ * portable version is in core/SkBlitRow_D16.cpp
+ */
+void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
+ const SkPMColor* SK_RESTRICT src,
+ int count, U8CPU alpha, int x, int y) {
+ SkASSERT(255 == alpha);
+
+ if (count <= 0) {
+ return;
+ }
+
+ if (count >= 8) {
+ while (((size_t)dst & 0x0F) != 0) {
+ DITHER_565_SCAN(y);
+ SkPMColor c = *src++;
+ SkPMColorAssert(c);
+ if (c) {
+ unsigned a = SkGetPackedA32(c);
+
+ int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
+
+ unsigned sr = SkGetPackedR32(c);
+ unsigned sg = SkGetPackedG32(c);
+ unsigned sb = SkGetPackedB32(c);
+ sr = SkDITHER_R32_FOR_565(sr, d);
+ sg = SkDITHER_G32_FOR_565(sg, d);
+ sb = SkDITHER_B32_FOR_565(sb, d);
+
+ uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
+ uint32_t dst_expanded = SkExpand_rgb_16(*dst);
+ dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
+ // now src and dst expanded are in g:11 r:10 x:1 b:10
+ *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
+ }
+ dst += 1;
+ DITHER_INC_X(x);
+ count--;
+ }
+
+ unsigned short dither_value[8];
+ __m128i dither, dither_cur;
+#ifdef ENABLE_DITHER_MATRIX_4X4
+ const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
+ dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
+ dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
+ dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
+ dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
+#else
+ const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
+ dither_value[0] = dither_value[4] = (dither_scan
+ >> (((x) & 3) << 2)) & 0xF;
+ dither_value[1] = dither_value[5] = (dither_scan
+ >> (((x + 1) & 3) << 2)) & 0xF;
+ dither_value[2] = dither_value[6] = (dither_scan
+ >> (((x + 2) & 3) << 2)) & 0xF;
+ dither_value[3] = dither_value[7] = (dither_scan
+ >> (((x + 3) & 3) << 2)) & 0xF;
+#endif
+ dither = _mm_loadu_si128((__m128i*) dither_value);
+
+ const __m128i* s = reinterpret_cast<const __m128i*>(src);
+ __m128i* d = reinterpret_cast<__m128i*>(dst);
+ __m128i var256 = _mm_set1_epi16(256);
+ __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
+ __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
+ __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
+
+ while (count >= 8) {
+ // Load 8 pixels of src and dst.
+ __m128i src_pixel1 = _mm_loadu_si128(s++);
+ __m128i src_pixel2 = _mm_loadu_si128(s++);
+ __m128i dst_pixel = _mm_load_si128(d);
+
+ // Extract A from src.
+ __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
+ sa1 = _mm_srli_epi32(sa1, 24);
+ __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
+ sa2 = _mm_srli_epi32(sa2, 24);
+ __m128i sa = _mm_packs_epi32(sa1, sa2);
+
+ // Calculate current dither value.
+ dither_cur = _mm_mullo_epi16(dither,
+ _mm_add_epi16(sa, _mm_set1_epi16(1)));
+ dither_cur = _mm_srli_epi16(dither_cur, 8);
+
+ // Extract R from src.
+ __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
+ sr1 = _mm_srli_epi32(sr1, 24);
+ __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
+ sr2 = _mm_srli_epi32(sr2, 24);
+ __m128i sr = _mm_packs_epi32(sr1, sr2);
+
+            // sr = SkDITHER_R32_FOR_565(sr, d).
+ __m128i sr_offset = _mm_srli_epi16(sr, 5);
+ sr = _mm_add_epi16(sr, dither_cur);
+ sr = _mm_sub_epi16(sr, sr_offset);
+
+ // Expand sr.
+ sr = _mm_slli_epi16(sr, 2);
+
+ // Extract G from src.
+ __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
+ sg1 = _mm_srli_epi32(sg1, 24);
+ __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
+ sg2 = _mm_srli_epi32(sg2, 24);
+ __m128i sg = _mm_packs_epi32(sg1, sg2);
+
+ // sg = SkDITHER_G32_FOR_565(sg, d).
+ __m128i sg_offset = _mm_srli_epi16(sg, 6);
+ sg = _mm_add_epi16(sg, _mm_srli_epi16(dither_cur, 1));
+ sg = _mm_sub_epi16(sg, sg_offset);
+
+ // Expand sg.
+ sg = _mm_slli_epi16(sg, 3);
+
+ // Extract B from src.
+ __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
+ sb1 = _mm_srli_epi32(sb1, 24);
+ __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
+ sb2 = _mm_srli_epi32(sb2, 24);
+ __m128i sb = _mm_packs_epi32(sb1, sb2);
+
+ // sb = SkDITHER_B32_FOR_565(sb, d).
+ __m128i sb_offset = _mm_srli_epi16(sb, 5);
+ sb = _mm_add_epi16(sb, dither_cur);
+ sb = _mm_sub_epi16(sb, sb_offset);
+
+ // Expand sb.
+ sb = _mm_slli_epi16(sb, 2);
+
+ // Extract R G B from dst.
+ __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
+ dr = _mm_and_si128(dr, r16_mask);
+ __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
+ dg = _mm_and_si128(dg, g16_mask);
+ __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
+ db = _mm_and_si128(db, b16_mask);
+
+ // SkAlpha255To256(255 - a) >> 3
+ __m128i isa = _mm_sub_epi16(var256, sa);
+ isa = _mm_srli_epi16(isa, 3);
+
+ dr = _mm_mullo_epi16(dr, isa);
+ dr = _mm_add_epi16(dr, sr);
+ dr = _mm_srli_epi16(dr, 5);
+
+ dg = _mm_mullo_epi16(dg, isa);
+ dg = _mm_add_epi16(dg, sg);
+ dg = _mm_srli_epi16(dg, 5);
+
+ db = _mm_mullo_epi16(db, isa);
+ db = _mm_add_epi16(db, sb);
+ db = _mm_srli_epi16(db, 5);
+
+            // Pack and store dst pixel.
+ __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);
+ _mm_store_si128(d++, d_pixel);
+
+ count -= 8;
+ x += 8;
+ }
+
+ src = reinterpret_cast<const SkPMColor*>(s);
+ dst = reinterpret_cast<uint16_t*>(d);
+ }
+
+ if (count > 0) {
+ DITHER_565_SCAN(y);
+ do {
+ SkPMColor c = *src++;
+ SkPMColorAssert(c);
+ if (c) {
+ unsigned a = SkGetPackedA32(c);
+
+ int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
+
+ unsigned sr = SkGetPackedR32(c);
+ unsigned sg = SkGetPackedG32(c);
+ unsigned sb = SkGetPackedB32(c);
+ sr = SkDITHER_R32_FOR_565(sr, d);
+ sg = SkDITHER_G32_FOR_565(sg, d);
+ sb = SkDITHER_B32_FOR_565(sb, d);
+
+ uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
+ uint32_t dst_expanded = SkExpand_rgb_16(*dst);
+ dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
+ // now src and dst expanded are in g:11 r:10 x:1 b:10
+ *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
+ }
+ dst += 1;
+ DITHER_INC_X(x);
+ } while (--count != 0);
+ }
+}
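The scalar tail above uses Skia's expanded-565 trick: SkExpand_rgb_16() moves the green field into the upper half-word so all three channels can be scaled by a 5-bit factor with a single 32-bit multiply, and SkCompact_rgb_16() packs the result back. A sketch, assuming the standard 565 masks (0x07E0 green, 0xF81F red/blue):

    #include <cstdint>

    static inline uint32_t expand_rgb_16(uint16_t c) {
        return ((uint32_t)(c & 0x07E0) << 16) | (c & 0xF81F);
    }

    static inline uint16_t compact_rgb_16(uint32_t c) {
        return (uint16_t)(((c >> 16) & 0x07E0) | (c & 0xF81F));
    }

    // Usage, as in the tail loop (scale is (256 - a') >> 3, i.e. 0..32):
    //   *dst = compact_rgb_16((src_expanded + expand_rgb_16(*dst) * scale) >> 5);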
diff --git a/chromium/third_party/skia/src/opts/SkBlitRow_opts_SSE2.h b/chromium/third_party/skia/src/opts/SkBlitRow_opts_SSE2.h
index b443ec7f213..29fd96e5e91 100644
--- a/chromium/third_party/skia/src/opts/SkBlitRow_opts_SSE2.h
+++ b/chromium/third_party/skia/src/opts/SkBlitRow_opts_SSE2.h
@@ -1,4 +1,3 @@
-
/*
* Copyright 2009 The Android Open Source Project
*
@@ -6,6 +5,8 @@
* found in the LICENSE file.
*/
+#ifndef SkBlitRow_opts_SSE2_DEFINED
+#define SkBlitRow_opts_SSE2_DEFINED
#include "SkBlitRow.h"
@@ -28,3 +29,18 @@ void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t src[],
SkColor color, int width, SkPMColor);
void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t src[],
SkColor color, int width, SkPMColor opaqueDst);
+
+void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
+ const SkPMColor* SK_RESTRICT src, int count,
+ U8CPU alpha, int /*x*/, int /*y*/);
+void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
+ const SkPMColor* SK_RESTRICT src,
+ int count, U8CPU alpha, int /*x*/, int /*y*/);
+void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
+ const SkPMColor* SK_RESTRICT src,
+ int count, U8CPU alpha, int x, int y);
+void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
+ const SkPMColor* SK_RESTRICT src,
+ int count, U8CPU alpha, int x, int y);
+
+#endif
diff --git a/chromium/third_party/skia/src/opts/SkBlitRow_opts_arm.cpp b/chromium/third_party/skia/src/opts/SkBlitRow_opts_arm.cpp
index e8e544e9dcb..34b8564723c 100644
--- a/chromium/third_party/skia/src/opts/SkBlitRow_opts_arm.cpp
+++ b/chromium/third_party/skia/src/opts/SkBlitRow_opts_arm.cpp
@@ -12,8 +12,6 @@
#include "SkUtils.h"
#include "SkUtilsArm.h"
-#include "SkCachePreload_arm.h"
-
// Define USE_NEON_CODE to indicate that we need to build NEON routines
#define USE_NEON_CODE (!SK_ARM_NEON_IS_NONE)
@@ -376,3 +374,7 @@ SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
SkBlitRow::ColorProc SkBlitRow::PlatformColorProc() {
return SK_ARM_NEON_WRAP(Color32_arm);
}
+
+SkBlitRow::ColorRectProc PlatformColorRectProcFactory() {
+ return NULL;
+}
diff --git a/chromium/third_party/skia/src/opts/SkBlitRow_opts_arm_neon.cpp b/chromium/third_party/skia/src/opts/SkBlitRow_opts_arm_neon.cpp
index 672980d0d26..01a6a2aa745 100644
--- a/chromium/third_party/skia/src/opts/SkBlitRow_opts_arm_neon.cpp
+++ b/chromium/third_party/skia/src/opts/SkBlitRow_opts_arm_neon.cpp
@@ -14,10 +14,56 @@
#include "SkMathPriv.h"
#include "SkUtils.h"
-#include "SkCachePreload_arm.h"
#include "SkColor_opts_neon.h"
#include <arm_neon.h>
+#ifdef SK_CPU_ARM64
+static inline uint8x8x4_t sk_vld4_u8_arm64_3(const SkPMColor* SK_RESTRICT & src) {
+ uint8x8x4_t vsrc;
+ uint8x8_t vsrc_0, vsrc_1, vsrc_2;
+
+ asm (
+ "ld4 {v0.8b - v3.8b}, [%[src]], #32 \t\n"
+ "mov %[vsrc0].8b, v0.8b \t\n"
+ "mov %[vsrc1].8b, v1.8b \t\n"
+ "mov %[vsrc2].8b, v2.8b \t\n"
+ : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
+ [vsrc2] "=w" (vsrc_2), [src] "+&r" (src)
+ : : "v0", "v1", "v2", "v3"
+ );
+
+ vsrc.val[0] = vsrc_0;
+ vsrc.val[1] = vsrc_1;
+ vsrc.val[2] = vsrc_2;
+
+ return vsrc;
+}
+
+static inline uint8x8x4_t sk_vld4_u8_arm64_4(const SkPMColor* SK_RESTRICT & src) {
+ uint8x8x4_t vsrc;
+ uint8x8_t vsrc_0, vsrc_1, vsrc_2, vsrc_3;
+
+ asm (
+ "ld4 {v0.8b - v3.8b}, [%[src]], #32 \t\n"
+ "mov %[vsrc0].8b, v0.8b \t\n"
+ "mov %[vsrc1].8b, v1.8b \t\n"
+ "mov %[vsrc2].8b, v2.8b \t\n"
+ "mov %[vsrc3].8b, v3.8b \t\n"
+ : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
+ [vsrc2] "=w" (vsrc_2), [vsrc3] "=w" (vsrc_3),
+ [src] "+&r" (src)
+ : : "v0", "v1", "v2", "v3"
+ );
+
+ vsrc.val[0] = vsrc_0;
+ vsrc.val[1] = vsrc_1;
+ vsrc.val[2] = vsrc_2;
+ vsrc.val[3] = vsrc_3;
+
+ return vsrc;
+}
+#endif
+
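sk_vld4_u8_arm64_3/_4 above wrap an ld4 load in inline assembly so the deinterleaving load of 8 pixels also post-increments src. An intrinsics-only equivalent would look roughly like the following sketch (illustrative name, not Skia API):

    #include <arm_neon.h>
    #include <stdint.h>

    static inline uint8x8x4_t load8_deinterleaved(const uint32_t*& src) {
        // vld4_u8 splits 8 four-byte pixels into four vectors of bytes
        uint8x8x4_t v = vld4_u8(reinterpret_cast<const uint8_t*>(src));
        src += 8;   // advance by 8 pixels (32 bytes)
        return v;
    }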
void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
const SkPMColor* SK_RESTRICT src, int count,
U8CPU alpha, int /*x*/, int /*y*/) {
@@ -28,7 +74,12 @@ void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
uint16x8_t vdst;
// Load
+#ifdef SK_CPU_ARM64
+ vsrc = sk_vld4_u8_arm64_3(src);
+#else
vsrc = vld4_u8((uint8_t*)src);
+ src += 8;
+#endif
// Convert src to 565
vdst = SkPixel32ToPixel16_neon8(vsrc);
@@ -38,7 +89,6 @@ void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
// Prepare next iteration
dst += 8;
- src += 8;
count -= 8;
};
@@ -52,6 +102,92 @@ void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
};
}
+void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
+ const SkPMColor* SK_RESTRICT src, int count,
+ U8CPU alpha, int /*x*/, int /*y*/) {
+ SkASSERT(255 > alpha);
+
+ uint16x8_t vmask_blue, vscale;
+
+ // prepare constants
+ vscale = vdupq_n_u16(SkAlpha255To256(alpha));
+ vmask_blue = vmovq_n_u16(0x1F);
+
+ while (count >= 8) {
+ uint8x8x4_t vsrc;
+ uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
+ uint16x8_t vres_r, vres_g, vres_b;
+
+ // Load src
+#ifdef SK_CPU_ARM64
+ vsrc = sk_vld4_u8_arm64_3(src);
+#else
+ {
+ register uint8x8_t d0 asm("d0");
+ register uint8x8_t d1 asm("d1");
+ register uint8x8_t d2 asm("d2");
+ register uint8x8_t d3 asm("d3");
+
+ asm (
+ "vld4.8 {d0-d3},[%[src]]!"
+ : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
+ :
+ );
+ vsrc.val[0] = d0;
+ vsrc.val[1] = d1;
+ vsrc.val[2] = d2;
+ }
+#endif
+
+ // Load and unpack dst
+ vdst = vld1q_u16(dst);
+ vdst_g = vshlq_n_u16(vdst, 5); // shift green to top of lanes
+ vdst_b = vandq_u16(vdst, vmask_blue); // extract blue
+ vdst_r = vshrq_n_u16(vdst, 6+5); // extract red
+ vdst_g = vshrq_n_u16(vdst_g, 5+5); // extract green
+
+ // Shift src to 565 range
+ vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 3);
+ vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 2);
+ vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 3);
+
+ // Scale src - dst
+ vres_r = vmovl_u8(vsrc.val[NEON_R]) - vdst_r;
+ vres_g = vmovl_u8(vsrc.val[NEON_G]) - vdst_g;
+ vres_b = vmovl_u8(vsrc.val[NEON_B]) - vdst_b;
+
+ vres_r = vshrq_n_u16(vres_r * vscale, 8);
+ vres_g = vshrq_n_u16(vres_g * vscale, 8);
+ vres_b = vshrq_n_u16(vres_b * vscale, 8);
+
+ vres_r += vdst_r;
+ vres_g += vdst_g;
+ vres_b += vdst_b;
+
+ // Combine
+ vres_b = vsliq_n_u16(vres_b, vres_g, 5); // insert green into blue
+ vres_b = vsliq_n_u16(vres_b, vres_r, 6+5); // insert red into green/blue
+
+ // Store
+ vst1q_u16(dst, vres_b);
+ dst += 8;
+ count -= 8;
+ }
+ if (count > 0) {
+ int scale = SkAlpha255To256(alpha);
+ do {
+ SkPMColor c = *src++;
+ SkPMColorAssert(c);
+ uint16_t d = *dst;
+ *dst++ = SkPackRGB16(
+ SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale),
+ SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale),
+ SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale));
+ } while (--count != 0);
+ }
+}
+
+#ifdef SK_CPU_ARM32
void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
const SkPMColor* SK_RESTRICT src, int count,
U8CPU alpha, int /*x*/, int /*y*/) {
@@ -229,114 +365,129 @@ void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
);
}
}
+#endif
+
+static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
+ prod += vdupq_n_u16(128);
+ prod += vshrq_n_u16(prod, 8);
+ return vshrq_n_u16(prod, 8);
+}
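SkDiv255Round_neon8 above is the vector form of the usual exact divide-by-255 with rounding; the scalar equivalent, for reference:

    // round(x / 255.0) for x in [0, 255*255], without a division
    static inline unsigned div255_round(unsigned x) {
        x += 128;
        return (x + (x >> 8)) >> 8;
    }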
void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
const SkPMColor* SK_RESTRICT src, int count,
U8CPU alpha, int /*x*/, int /*y*/) {
+ SkASSERT(255 > alpha);
- U8CPU alpha_for_asm = alpha;
-
- asm volatile (
- /* This code implements a Neon version of S32A_D565_Blend. The output differs from
- * the original in two respects:
- * 1. The results have a few mismatches compared to the original code. These mismatches
- * never exceed 1. It's possible to improve accuracy vs. a floating point
- * implementation by introducing rounding right shifts (vrshr) for the final stage.
- * Rounding is not present in the code below, because although results would be closer
- * to a floating point implementation, the number of mismatches compared to the
- * original code would be far greater.
- * 2. On certain inputs, the original code can overflow, causing colour channels to
- * mix. Although the Neon code can also overflow, it doesn't allow one colour channel
- * to affect another.
+ /* This code implements a Neon version of S32A_D565_Blend. The results have
+ * a few mismatches compared to the original code. These mismatches never
+ * exceed 1.
*/
-#if 1
- /* reflects SkAlpha255To256()'s change from a+a>>7 to a+1 */
- "add %[alpha], %[alpha], #1 \n\t" // adjust range of alpha 0-256
+ if (count >= 8) {
+ uint16x8_t valpha_max, vmask_blue;
+ uint8x8_t valpha;
+
+ // prepare constants
+ valpha_max = vmovq_n_u16(255);
+ valpha = vdup_n_u8(alpha);
+ vmask_blue = vmovq_n_u16(SK_B16_MASK);
+
+ do {
+ uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
+ uint16x8_t vres_a, vres_r, vres_g, vres_b;
+ uint8x8x4_t vsrc;
+
+ // load pixels
+ vdst = vld1q_u16(dst);
+#ifdef SK_CPU_ARM64
+ vsrc = sk_vld4_u8_arm64_4(src);
#else
- "add %[alpha], %[alpha], %[alpha], lsr #7 \n\t" // adjust range of alpha 0-256
+#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
+ asm (
+ "vld4.u8 %h[vsrc], [%[src]]!"
+ : [vsrc] "=w" (vsrc), [src] "+&r" (src)
+ : :
+ );
+#else
+ register uint8x8_t d0 asm("d0");
+ register uint8x8_t d1 asm("d1");
+ register uint8x8_t d2 asm("d2");
+ register uint8x8_t d3 asm("d3");
+
+ asm volatile (
+ "vld4.u8 {d0-d3},[%[src]]!;"
+ : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
+ [src] "+&r" (src)
+ : :
+ );
+ vsrc.val[0] = d0;
+ vsrc.val[1] = d1;
+ vsrc.val[2] = d2;
+ vsrc.val[3] = d3;
#endif
- "vmov.u16 q3, #255 \n\t" // set up constant
- "movs r4, %[count], lsr #3 \n\t" // calc. count>>3
- "vmov.u16 d2[0], %[alpha] \n\t" // move alpha to Neon
- "beq 2f \n\t" // if count8 == 0, exit
- "vmov.u16 q15, #0x1f \n\t" // set up blue mask
-
- "1: \n\t"
- "vld1.u16 {d0, d1}, [%[dst]] \n\t" // load eight dst RGB565 pixels
- "subs r4, r4, #1 \n\t" // decrement loop counter
- "vld4.u8 {d24, d25, d26, d27}, [%[src]]! \n\t" // load eight src ABGR32 pixels
- // and deinterleave
-
- "vshl.u16 q9, q0, #5 \n\t" // shift green to top of lanes
- "vand q10, q0, q15 \n\t" // extract blue
- "vshr.u16 q8, q0, #11 \n\t" // extract red
- "vshr.u16 q9, q9, #10 \n\t" // extract green
- // dstrgb = {q8, q9, q10}
-
- "vshr.u8 d24, d24, #3 \n\t" // shift red to 565 range
- "vshr.u8 d25, d25, #2 \n\t" // shift green to 565 range
- "vshr.u8 d26, d26, #3 \n\t" // shift blue to 565 range
-
- "vmovl.u8 q11, d24 \n\t" // widen red to 16 bits
- "vmovl.u8 q12, d25 \n\t" // widen green to 16 bits
- "vmovl.u8 q14, d27 \n\t" // widen alpha to 16 bits
- "vmovl.u8 q13, d26 \n\t" // widen blue to 16 bits
- // srcrgba = {q11, q12, q13, q14}
-
- "vmul.u16 q2, q14, d2[0] \n\t" // sa * src_scale
- "vmul.u16 q11, q11, d2[0] \n\t" // red result = src_red * src_scale
- "vmul.u16 q12, q12, d2[0] \n\t" // grn result = src_grn * src_scale
- "vmul.u16 q13, q13, d2[0] \n\t" // blu result = src_blu * src_scale
-
- "vshr.u16 q2, q2, #8 \n\t" // sa * src_scale >> 8
- "vsub.u16 q2, q3, q2 \n\t" // 255 - (sa * src_scale >> 8)
- // dst_scale = q2
-
- "vmla.u16 q11, q8, q2 \n\t" // red result += dst_red * dst_scale
- "vmla.u16 q12, q9, q2 \n\t" // grn result += dst_grn * dst_scale
- "vmla.u16 q13, q10, q2 \n\t" // blu result += dst_blu * dst_scale
-
-#if 1
- // trying for a better match with SkDiv255Round(a)
- // C alg is: a+=128; (a+a>>8)>>8
- // we'll use just a rounding shift [q2 is available for scratch]
- "vrshr.u16 q11, q11, #8 \n\t" // shift down red
- "vrshr.u16 q12, q12, #8 \n\t" // shift down green
- "vrshr.u16 q13, q13, #8 \n\t" // shift down blue
+#endif // #ifdef SK_CPU_ARM64
+
+
+ // deinterleave dst
+ vdst_g = vshlq_n_u16(vdst, SK_R16_BITS); // shift green to top of lanes
+ vdst_b = vdst & vmask_blue; // extract blue
+ vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT); // extract red
+ vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green
+
+ // shift src to 565
+ vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS);
+ vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS);
+ vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS);
+
+ // calc src * src_scale
+ vres_a = vmull_u8(vsrc.val[NEON_A], valpha);
+ vres_r = vmull_u8(vsrc.val[NEON_R], valpha);
+ vres_g = vmull_u8(vsrc.val[NEON_G], valpha);
+ vres_b = vmull_u8(vsrc.val[NEON_B], valpha);
+
+ // prepare dst_scale
+ vres_a = SkDiv255Round_neon8(vres_a);
+ vres_a = valpha_max - vres_a; // 255 - (sa * src_scale) / 255
+
+ // add dst * dst_scale to previous result
+ vres_r = vmlaq_u16(vres_r, vdst_r, vres_a);
+ vres_g = vmlaq_u16(vres_g, vdst_g, vres_a);
+ vres_b = vmlaq_u16(vres_b, vdst_b, vres_a);
+
+#ifdef S32A_D565_BLEND_EXACT
+ // It is possible to get exact results with this but it is slow,
+ // even slower than C code in some cases
+ vres_r = SkDiv255Round_neon8(vres_r);
+ vres_g = SkDiv255Round_neon8(vres_g);
+ vres_b = SkDiv255Round_neon8(vres_b);
#else
- // arm's original "truncating divide by 256"
- "vshr.u16 q11, q11, #8 \n\t" // shift down red
- "vshr.u16 q12, q12, #8 \n\t" // shift down green
- "vshr.u16 q13, q13, #8 \n\t" // shift down blue
+ vres_r = vrshrq_n_u16(vres_r, 8);
+ vres_g = vrshrq_n_u16(vres_g, 8);
+ vres_b = vrshrq_n_u16(vres_b, 8);
#endif
+ // pack result
+ vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT); // insert green into blue
+ vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT); // insert red into green/blue
- "vsli.u16 q13, q12, #5 \n\t" // insert green into blue
- "vsli.u16 q13, q11, #11 \n\t" // insert red into green/blue
- "vst1.16 {d26, d27}, [%[dst]]! \n\t" // write pixel back to dst, update ptr
-
- "bne 1b \n\t" // if counter != 0, loop
- "2: \n\t" // exit
-
- : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count), [alpha] "+r" (alpha_for_asm)
- :
- : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
- );
+ // store
+ vst1q_u16(dst, vres_b);
+ dst += 8;
+ count -= 8;
+ } while (count >= 8);
+ }
- count &= 7;
- if (count > 0) {
- do {
- SkPMColor sc = *src++;
- if (sc) {
- uint16_t dc = *dst;
- unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
- unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
- unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
- unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
- *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
- }
- dst += 1;
- } while (--count != 0);
+ // leftovers
+ while (count-- > 0) {
+ SkPMColor sc = *src++;
+ if (sc) {
+ uint16_t dc = *dst;
+ unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
+ unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
+ unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
+ unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
+ *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
+ }
+ dst += 1;
}
}
@@ -374,6 +525,7 @@ void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
do {
+ uint8x8x4_t vsrc;
uint8x8_t vsrc_r, vsrc_g, vsrc_b;
uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
@@ -384,6 +536,9 @@ void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
int8x8_t vres8_r, vres8_g, vres8_b;
// Load source and add dither
+#ifdef SK_CPU_ARM64
+ vsrc = sk_vld4_u8_arm64_3(src);
+#else
{
register uint8x8_t d0 asm("d0");
register uint8x8_t d1 asm("d1");
@@ -391,17 +546,18 @@ void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
register uint8x8_t d3 asm("d3");
asm (
- "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */"
+ "vld4.8 {d0-d3},[%[src]]! "
: "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
:
);
- vsrc_g = d1;
-#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
- vsrc_r = d2; vsrc_b = d0;
-#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
- vsrc_r = d0; vsrc_b = d2;
-#endif
+ vsrc.val[0] = d0;
+ vsrc.val[1] = d1;
+ vsrc.val[2] = d2;
}
+#endif
+ vsrc_r = vsrc.val[NEON_R];
+ vsrc_g = vsrc.val[NEON_G];
+ vsrc_b = vsrc.val[NEON_B];
vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
@@ -766,76 +922,67 @@ void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
const SkPMColor* SK_RESTRICT src,
int count, U8CPU alpha) {
SkASSERT(alpha <= 255);
- if (count > 0) {
- uint16_t src_scale = SkAlpha255To256(alpha);
- uint16_t dst_scale = 256 - src_scale;
-
- /* run them N at a time through the NEON unit */
- /* note that each 1 is 4 bytes, each treated exactly the same,
- * so we can work under that guise. We *do* know that the src&dst
- * will be 32-bit aligned quantities, so we can specify that on
- * the load/store ops and do a neon 'reinterpret' to get us to
- * byte-sized (pun intended) pieces that we widen/multiply/shift
- * we're limited at 128 bits in the wide ops, which is 8x16bits
- * or a pair of 32 bit src/dsts.
- */
- /* we *could* manually unroll this loop so that we load 128 bits
- * (as a pair of 64s) from each of src and dst, processing them
- * in pieces. This might give us a little better management of
- * the memory latency, but my initial attempts here did not
- * produce an instruction stream that looked all that nice.
- */
-#define UNROLL 2
- while (count >= UNROLL) {
- uint8x8_t src_raw, dst_raw, dst_final;
- uint16x8_t src_wide, dst_wide;
- /* get 64 bits of src, widen it, multiply by src_scale */
- src_raw = vreinterpret_u8_u32(vld1_u32(src));
- src_wide = vmovl_u8(src_raw);
- /* gcc hoists vdupq_n_u16(), better than using vmulq_n_u16() */
- src_wide = vmulq_u16 (src_wide, vdupq_n_u16(src_scale));
+ if (count <= 0) {
+ return;
+ }
- /* ditto with dst */
- dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
- dst_wide = vmovl_u8(dst_raw);
+ uint16_t src_scale = SkAlpha255To256(alpha);
+ uint16_t dst_scale = 256 - src_scale;
- /* combine add with dst multiply into mul-accumulate */
- dst_wide = vmlaq_u16(src_wide, dst_wide, vdupq_n_u16(dst_scale));
+ while (count >= 2) {
+ uint8x8_t vsrc, vdst, vres;
+ uint16x8_t vsrc_wide, vdst_wide;
- dst_final = vshrn_n_u16(dst_wide, 8);
- vst1_u32(dst, vreinterpret_u32_u8(dst_final));
+ /* These commented prefetches are a big win for count
+ * values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4.
+         * They also hurt a little (<5%) on an A15.
+ */
+ //__builtin_prefetch(src+32);
+ //__builtin_prefetch(dst+32);
- src += UNROLL;
- dst += UNROLL;
- count -= UNROLL;
+ // Load
+ vsrc = vreinterpret_u8_u32(vld1_u32(src));
+ vdst = vreinterpret_u8_u32(vld1_u32(dst));
+
+ // Process src
+ vsrc_wide = vmovl_u8(vsrc);
+ vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
+
+ // Process dst
+ vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
+
+ // Combine
+ vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
+
+ // Store
+ vst1_u32(dst, vreinterpret_u32_u8(vres));
+
+ src += 2;
+ dst += 2;
+ count -= 2;
}
- /* RBE: well, i don't like how gcc manages src/dst across the above
- * loop it's constantly calculating src+bias, dst+bias and it only
- * adjusts the real ones when we leave the loop. Not sure why
- * it's "hoisting down" (hoisting implies above in my lexicon ;))
- * the adjustments to src/dst/count, but it does...
- * (might be SSA-style internal logic...
- */
-#if UNROLL == 2
if (count == 1) {
- *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
- }
-#else
- if (count > 0) {
- do {
- *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
- src += 1;
- dst += 1;
- } while (--count > 0);
- }
-#endif
+ uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
+ uint16x8_t vsrc_wide, vdst_wide;
-#undef UNROLL
+ // Load
+ vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
+ vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
+
+ // Process
+ vsrc_wide = vmovl_u8(vsrc);
+ vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
+ vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
+ vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
+
+ // Store
+ vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
}
}
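The NEON loop above is the vector form of its own scalar fallback: each pixel becomes SkAlphaMulQ(src, src_scale) + SkAlphaMulQ(dst, dst_scale). SkAlphaMulQ scales all four bytes of a packed pixel with two 16-bit-lane multiplies; a sketch of that helper (mask assumed to be the usual 0x00FF00FF):

    #include <cstdint>

    static inline uint32_t alpha_mul_q(uint32_t c, unsigned scale /* 0..256 */) {
        const uint32_t mask = 0x00FF00FF;
        uint32_t rb = ((c & mask) * scale) >> 8;   // red and blue bytes
        uint32_t ag = ((c >> 8) & mask) * scale;   // alpha and green bytes
        return (rb & mask) | (ag & ~mask);
    }

    // blended = alpha_mul_q(src, src_scale) + alpha_mul_q(dst, 256 - src_scale);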
+#ifdef SK_CPU_ARM32
void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
const SkPMColor* SK_RESTRICT src,
int count, U8CPU alpha) {
@@ -961,6 +1108,7 @@ static void showme16(char *str, void *p, int len)
SkDebugf("%s\n", buf);
}
#endif
+#endif // #ifdef SK_CPU_ARM32
void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
const SkPMColor* SK_RESTRICT src,
@@ -970,9 +1118,8 @@ void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
#define UNROLL 8
if (count >= UNROLL) {
- uint8x8_t dbase;
-#if defined(DEBUG_OPAQUE_DITHER)
+#if defined(DEBUG_OPAQUE_DITHER)
uint16_t tmpbuf[UNROLL];
int td[UNROLL];
int tdv[UNROLL];
@@ -983,35 +1130,37 @@ void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
int noisy = 0;
#endif
+ uint8x8_t dbase;
const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
dbase = vld1_u8(dstart);
do {
+ uint8x8x4_t vsrc;
uint8x8_t sr, sg, sb, sa, d;
uint16x8_t dst8, scale8, alpha8;
uint16x8_t dst_r, dst_g, dst_b;
-#if defined(DEBUG_OPAQUE_DITHER)
- /* calculate 8 elements worth into a temp buffer */
- {
- int my_y = y;
- int my_x = x;
- SkPMColor* my_src = (SkPMColor*)src;
- uint16_t* my_dst = dst;
- int i;
-
- DITHER_565_SCAN(my_y);
- for(i=0;i<UNROLL;i++) {
+#if defined(DEBUG_OPAQUE_DITHER)
+ // calculate 8 elements worth into a temp buffer
+ {
+ int my_y = y;
+ int my_x = x;
+ SkPMColor* my_src = (SkPMColor*)src;
+ uint16_t* my_dst = dst;
+ int i;
+
+ DITHER_565_SCAN(my_y);
+ for(i = 0; i < UNROLL; i++) {
SkPMColor c = *my_src++;
SkPMColorAssert(c);
if (c) {
unsigned a = SkGetPackedA32(c);
int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
- tdv[i] = DITHER_VALUE(my_x);
- ta[i] = a;
- tap[i] = SkAlpha255To256(a);
- td[i] = d;
+ tdv[i] = DITHER_VALUE(my_x);
+ ta[i] = a;
+ tap[i] = SkAlpha255To256(a);
+ td[i] = d;
unsigned sr = SkGetPackedR32(c);
unsigned sg = SkGetPackedG32(c);
@@ -1025,147 +1174,132 @@ void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
// now src and dst expanded are in g:11 r:10 x:1 b:10
tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
- td[i] = d;
-
+ td[i] = d;
} else {
- tmpbuf[i] = *my_dst;
- ta[i] = tdv[i] = td[i] = 0xbeef;
- }
- in_dst[i] = *my_dst;
+ tmpbuf[i] = *my_dst;
+ ta[i] = tdv[i] = td[i] = 0xbeef;
+ }
+ in_dst[i] = *my_dst;
my_dst += 1;
DITHER_INC_X(my_x);
- }
- }
+ }
+ }
#endif
- /* source is in ABGR */
+#ifdef SK_CPU_ARM64
+ vsrc = sk_vld4_u8_arm64_4(src);
+#else
{
register uint8x8_t d0 asm("d0");
register uint8x8_t d1 asm("d1");
register uint8x8_t d2 asm("d2");
register uint8x8_t d3 asm("d3");
- asm ("vld4.8 {d0-d3},[%4] /* r=%P0 g=%P1 b=%P2 a=%P3 */"
- : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
- : "r" (src)
- );
- sr = d0; sg = d1; sb = d2; sa = d3;
+ asm ("vld4.8 {d0-d3},[%[src]]! "
+ : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src)
+ :
+ );
+ vsrc.val[0] = d0;
+ vsrc.val[1] = d1;
+ vsrc.val[2] = d2;
+ vsrc.val[3] = d3;
}
-
- /* calculate 'd', which will be 0..7 */
- /* dbase[] is 0..7; alpha is 0..256; 16 bits suffice */
-#if defined(SK_BUILD_FOR_ANDROID)
- /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
- alpha8 = vaddw_u8(vmovl_u8(sa), vdup_n_u8(1));
-#else
- alpha8 = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7));
#endif
- alpha8 = vmulq_u16(alpha8, vmovl_u8(dbase));
- d = vshrn_n_u16(alpha8, 8); /* narrowing too */
+ sa = vsrc.val[NEON_A];
+ sr = vsrc.val[NEON_R];
+ sg = vsrc.val[NEON_G];
+ sb = vsrc.val[NEON_B];
- /* sr = sr - (sr>>5) + d */
+ /* calculate 'd', which will be 0..7
+ * dbase[] is 0..7; alpha is 0..256; 16 bits suffice
+ */
+ alpha8 = vmovl_u8(dbase);
+ alpha8 = vmlal_u8(alpha8, sa, dbase);
+ d = vshrn_n_u16(alpha8, 8); // narrowing too
+
+ // sr = sr - (sr>>5) + d
/* watching for 8-bit overflow. d is 0..7; risky range of
* sr is >248; and then (sr>>5) is 7 so it offsets 'd';
- * safe as long as we do ((sr-sr>>5) + d) */
+ * safe as long as we do ((sr-sr>>5) + d)
+ */
sr = vsub_u8(sr, vshr_n_u8(sr, 5));
sr = vadd_u8(sr, d);
- /* sb = sb - (sb>>5) + d */
+ // sb = sb - (sb>>5) + d
sb = vsub_u8(sb, vshr_n_u8(sb, 5));
sb = vadd_u8(sb, d);
- /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
+ // sg = sg - (sg>>6) + d>>1; similar logic for overflows
sg = vsub_u8(sg, vshr_n_u8(sg, 6));
sg = vadd_u8(sg, vshr_n_u8(d,1));
- /* need to pick up 8 dst's -- at 16 bits each, 128 bits */
+ // need to pick up 8 dst's -- at 16 bits each, 128 bits
dst8 = vld1q_u16(dst);
- dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F));
- dst_g = vandq_u16(vshrq_n_u16(dst8,5), vdupq_n_u16(0x003F));
- dst_r = vshrq_n_u16(dst8,11); /* clearing hi bits */
-
- /* blend */
-#if 1
- /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
- /* originally 255-sa + 1 */
+ dst_b = vandq_u16(dst8, vdupq_n_u16(SK_B16_MASK));
+ dst_g = vshrq_n_u16(vshlq_n_u16(dst8, SK_R16_BITS), SK_R16_BITS + SK_B16_BITS);
+ dst_r = vshrq_n_u16(dst8, SK_R16_SHIFT); // clearing hi bits
+
+ // blend
scale8 = vsubw_u8(vdupq_n_u16(256), sa);
-#else
- scale8 = vsubw_u8(vdupq_n_u16(255), sa);
- scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7));
-#endif
-#if 1
- /* combine the addq and mul, save 3 insns */
+ // combine the addq and mul, save 3 insns
scale8 = vshrq_n_u16(scale8, 3);
dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
-#else
- /* known correct, but +3 insns over above */
- scale8 = vshrq_n_u16(scale8, 3);
- dst_b = vmulq_u16(dst_b, scale8);
- dst_g = vmulq_u16(dst_g, scale8);
- dst_r = vmulq_u16(dst_r, scale8);
-
- /* combine */
- /* NB: vshll widens, need to preserve those bits */
- dst_b = vaddq_u16(dst_b, vshll_n_u8(sb,2));
- dst_g = vaddq_u16(dst_g, vshll_n_u8(sg,3));
- dst_r = vaddq_u16(dst_r, vshll_n_u8(sr,2));
-#endif
- /* repack to store */
- dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F));
+ // repack to store
+ dst8 = vshrq_n_u16(dst_b, 5);
dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);
vst1q_u16(dst, dst8);
-#if defined(DEBUG_OPAQUE_DITHER)
- /* verify my 8 elements match the temp buffer */
- {
- int i, bad=0;
- static int invocation;
-
- for (i=0;i<UNROLL;i++)
- if (tmpbuf[i] != dst[i]) bad=1;
- if (bad) {
- SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
- invocation, offset);
- SkDebugf(" alpha 0x%x\n", alpha);
- for (i=0;i<UNROLL;i++)
- SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
- i, ((tmpbuf[i] != dst[i])?"BAD":"got"),
- dst[i], tmpbuf[i], in_dst[i], src[i], td[i], tdv[i], tap[i], ta[i]);
-
- showme16("alpha8", &alpha8, sizeof(alpha8));
- showme16("scale8", &scale8, sizeof(scale8));
- showme8("d", &d, sizeof(d));
- showme16("dst8", &dst8, sizeof(dst8));
- showme16("dst_b", &dst_b, sizeof(dst_b));
- showme16("dst_g", &dst_g, sizeof(dst_g));
- showme16("dst_r", &dst_r, sizeof(dst_r));
- showme8("sb", &sb, sizeof(sb));
- showme8("sg", &sg, sizeof(sg));
- showme8("sr", &sr, sizeof(sr));
-
- /* cop out */
- return;
- }
- offset += UNROLL;
- invocation++;
- }
-#endif
+#if defined(DEBUG_OPAQUE_DITHER)
+ // verify my 8 elements match the temp buffer
+ {
+ int i, bad=0;
+ static int invocation;
- dst += UNROLL;
- src += UNROLL;
+ for (i = 0; i < UNROLL; i++) {
+ if (tmpbuf[i] != dst[i]) {
+ bad=1;
+ }
+ }
+ if (bad) {
+ SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
+ invocation, offset);
+ SkDebugf(" alpha 0x%x\n", alpha);
+ for (i = 0; i < UNROLL; i++)
+ SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
+ i, ((tmpbuf[i] != dst[i])?"BAD":"got"), dst[i], tmpbuf[i],
+ in_dst[i], src[i-8], td[i], tdv[i], tap[i], ta[i]);
+
+ showme16("alpha8", &alpha8, sizeof(alpha8));
+ showme16("scale8", &scale8, sizeof(scale8));
+ showme8("d", &d, sizeof(d));
+ showme16("dst8", &dst8, sizeof(dst8));
+ showme16("dst_b", &dst_b, sizeof(dst_b));
+ showme16("dst_g", &dst_g, sizeof(dst_g));
+ showme16("dst_r", &dst_r, sizeof(dst_r));
+ showme8("sb", &sb, sizeof(sb));
+ showme8("sg", &sg, sizeof(sg));
+ showme8("sr", &sr, sizeof(sr));
+
+ return;
+ }
+ offset += UNROLL;
+ invocation++;
+ }
+#endif
+ dst += UNROLL;
count -= UNROLL;
- /* skip x += UNROLL, since it's unchanged mod-4 */
+ // skip x += UNROLL, since it's unchanged mod-4
} while (count >= UNROLL);
}
#undef UNROLL
- /* residuals */
+ // residuals
if (count > 0) {
DITHER_565_SCAN(y);
do {
@@ -1218,7 +1352,11 @@ void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
uint8x8_t sr, sg, sb;
uint16x8_t dr, dg, db;
uint16x8_t dst8;
+ uint8x8x4_t vsrc;
+#ifdef SK_CPU_ARM64
+ vsrc = sk_vld4_u8_arm64_3(src);
+#else
{
register uint8x8_t d0 asm("d0");
register uint8x8_t d1 asm("d1");
@@ -1226,17 +1364,19 @@ void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
register uint8x8_t d3 asm("d3");
asm (
- "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */"
+ "vld4.8 {d0-d3},[%[src]]! "
: "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
:
);
- sg = d1;
-#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
- sr = d2; sb = d0;
-#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
- sr = d0; sb = d2;
-#endif
+ vsrc.val[0] = d0;
+ vsrc.val[1] = d1;
+ vsrc.val[2] = d2;
}
+#endif
+ sr = vsrc.val[NEON_R];
+ sg = vsrc.val[NEON_G];
+ sb = vsrc.val[NEON_B];
+
/* XXX: if we want to prefetch, hide it in the above asm()
* using the gcc __builtin_prefetch(), the prefetch will
* fall to the bottom of the loop -- it won't stick up
@@ -1321,84 +1461,88 @@ void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
unsigned colorA = SkGetPackedA32(color);
if (255 == colorA) {
sk_memset32(dst, color, count);
- } else {
- unsigned scale = 256 - SkAlpha255To256(colorA);
+ return;
+ }
- if (count >= 8) {
- // at the end of this assembly, count will have been decremented
- // to a negative value. That is, if count mod 8 = x, it will be
- // -8 +x coming out.
- asm volatile (
- PLD128(src, 0)
-
- "vdup.32 q0, %[color] \n\t"
-
- PLD128(src, 128)
-
- // scale numerical interval [0-255], so load as 8 bits
- "vdup.8 d2, %[scale] \n\t"
-
- PLD128(src, 256)
-
- "subs %[count], %[count], #8 \n\t"
-
- PLD128(src, 384)
-
- "Loop_Color32: \n\t"
-
- // load src color, 8 pixels, 4 64 bit registers
- // (and increment src).
- "vld1.32 {d4-d7}, [%[src]]! \n\t"
-
- PLD128(src, 384)
-
- // multiply long by scale, 64 bits at a time,
- // destination into a 128 bit register.
- "vmull.u8 q4, d4, d2 \n\t"
- "vmull.u8 q5, d5, d2 \n\t"
- "vmull.u8 q6, d6, d2 \n\t"
- "vmull.u8 q7, d7, d2 \n\t"
-
- // shift the 128 bit registers, containing the 16
- // bit scaled values back to 8 bits, narrowing the
- // results to 64 bit registers.
- "vshrn.i16 d8, q4, #8 \n\t"
- "vshrn.i16 d9, q5, #8 \n\t"
- "vshrn.i16 d10, q6, #8 \n\t"
- "vshrn.i16 d11, q7, #8 \n\t"
-
- // adding back the color, using 128 bit registers.
- "vadd.i8 q6, q4, q0 \n\t"
- "vadd.i8 q7, q5, q0 \n\t"
-
- // store back the 8 calculated pixels (2 128 bit
- // registers), and increment dst.
- "vst1.32 {d12-d15}, [%[dst]]! \n\t"
-
- "subs %[count], %[count], #8 \n\t"
- "bge Loop_Color32 \n\t"
- : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
- : [color] "r" (color), [scale] "r" (scale)
- : "cc", "memory",
- "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
- "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15"
- );
- // At this point, if we went through the inline assembly, count is
- // a negative value:
- // if the value is -8, there is no pixel left to process.
- // if the value is -7, there is one pixel left to process
- // ...
- // And'ing it with 7 will give us the number of pixels
- // left to process.
- count = count & 0x7;
- }
+ unsigned scale = 256 - SkAlpha255To256(colorA);
- while (count > 0) {
- *dst = color + SkAlphaMulQ(*src, scale);
- src += 1;
- dst += 1;
- count--;
- }
+ if (count >= 8) {
+ uint32x4_t vcolor;
+ uint8x8_t vscale;
+
+ vcolor = vdupq_n_u32(color);
+
+ // scale numerical interval [0-255], so load as 8 bits
+ vscale = vdup_n_u8(scale);
+
+ do {
+ // load src color, 8 pixels, 4 64 bit registers
+ // (and increment src).
+ uint32x2x4_t vsrc;
+#if defined(SK_CPU_ARM32) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)))
+ asm (
+ "vld1.32 %h[vsrc], [%[src]]!"
+ : [vsrc] "=w" (vsrc), [src] "+r" (src)
+ : :
+ );
+#else // 64bit targets and Clang
+ vsrc.val[0] = vld1_u32(src);
+ vsrc.val[1] = vld1_u32(src+2);
+ vsrc.val[2] = vld1_u32(src+4);
+ vsrc.val[3] = vld1_u32(src+6);
+ src += 8;
+#endif
+
+ // multiply long by scale, 64 bits at a time,
+ // destination into a 128 bit register.
+ uint16x8x4_t vtmp;
+ vtmp.val[0] = vmull_u8(vreinterpret_u8_u32(vsrc.val[0]), vscale);
+ vtmp.val[1] = vmull_u8(vreinterpret_u8_u32(vsrc.val[1]), vscale);
+ vtmp.val[2] = vmull_u8(vreinterpret_u8_u32(vsrc.val[2]), vscale);
+ vtmp.val[3] = vmull_u8(vreinterpret_u8_u32(vsrc.val[3]), vscale);
+
+ // shift the 128 bit registers, containing the 16
+ // bit scaled values back to 8 bits, narrowing the
+ // results to 64 bit registers.
+ uint8x16x2_t vres;
+ vres.val[0] = vcombine_u8(
+ vshrn_n_u16(vtmp.val[0], 8),
+ vshrn_n_u16(vtmp.val[1], 8));
+ vres.val[1] = vcombine_u8(
+ vshrn_n_u16(vtmp.val[2], 8),
+ vshrn_n_u16(vtmp.val[3], 8));
+
+ // adding back the color, using 128 bit registers.
+ uint32x4x2_t vdst;
+ vdst.val[0] = vreinterpretq_u32_u8(vres.val[0] +
+ vreinterpretq_u8_u32(vcolor));
+ vdst.val[1] = vreinterpretq_u32_u8(vres.val[1] +
+ vreinterpretq_u8_u32(vcolor));
+
+ // store back the 8 calculated pixels (2 128 bit
+ // registers), and increment dst.
+#if defined(SK_CPU_ARM32) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)))
+ asm (
+ "vst1.32 %h[vdst], [%[dst]]!"
+ : [dst] "+r" (dst)
+ : [vdst] "w" (vdst)
+ : "memory"
+ );
+#else // 64bit targets and Clang
+ vst1q_u32(dst, vdst.val[0]);
+ vst1q_u32(dst+4, vdst.val[1]);
+ dst += 8;
+#endif
+ count -= 8;
+
+ } while (count >= 8);
+ }
+
+ while (count > 0) {
+ *dst = color + SkAlphaMulQ(*src, scale);
+ src += 1;
+ dst += 1;
+ count--;
}
}
@@ -1406,12 +1550,13 @@ void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
// no dither
- // NOTE: For the S32_D565_Blend function below, we don't have a special
- // version that assumes that each source pixel is opaque. But our
- // S32A is still faster than the default, so use it.
S32_D565_Opaque_neon,
- S32A_D565_Blend_neon, // really S32_D565_Blend
+ S32_D565_Blend_neon,
+#ifdef SK_CPU_ARM32
S32A_D565_Opaque_neon,
+#else
+ NULL,
+#endif
S32A_D565_Blend_neon,
// dither
@@ -1439,5 +1584,9 @@ const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
#else
S32A_Opaque_BlitRow32_neon, // S32A_Opaque,
#endif
+#ifdef SK_CPU_ARM32
S32A_Blend_BlitRow32_neon // S32A_Blend
+#else
+ NULL
+#endif
};
diff --git a/chromium/third_party/skia/src/opts/SkBlitRow_opts_mips_dsp.cpp b/chromium/third_party/skia/src/opts/SkBlitRow_opts_mips_dsp.cpp
new file mode 100644
index 00000000000..30bb4c2701a
--- /dev/null
+++ b/chromium/third_party/skia/src/opts/SkBlitRow_opts_mips_dsp.cpp
@@ -0,0 +1,848 @@
+/*
+ * Copyright 2014 The Android Open Source Project
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "SkBlitRow.h"
+#include "SkBlitMask.h"
+#include "SkColorPriv.h"
+#include "SkDither.h"
+#include "SkMathPriv.h"
+
+static void S32_D565_Blend_mips_dsp(uint16_t* SK_RESTRICT dst,
+ const SkPMColor* SK_RESTRICT src, int count,
+ U8CPU alpha, int /*x*/, int /*y*/) {
+ register uint32_t t0, t1, t2, t3, t4, t5, t6;
+ register uint32_t s0, s1, s2, s4, s5, s6;
+
+ alpha += 1;
+ if (count >= 2) {
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "sll %[s4], %[alpha], 8 \n\t"
+ "or %[s4], %[s4], %[alpha] \n\t"
+ "repl.ph %[s5], 0x1f \n\t"
+ "repl.ph %[s6], 0x3f \n\t"
+ "1: \n\t"
+ "lw %[s2], 0(%[src]) \n\t"
+ "lw %[s1], 4(%[src]) \n\t"
+ "lwr %[s0], 0(%[dst]) \n\t"
+ "lwl %[s0], 3(%[dst]) \n\t"
+ "and %[t1], %[s0], %[s5] \n\t"
+ "shra.ph %[t0], %[s0], 5 \n\t"
+ "and %[t2], %[t0], %[s6] \n\t"
+#ifdef __MIPS_HAVE_DSPR2
+ "shrl.ph %[t3], %[s0], 11 \n\t"
+#else
+ "shra.ph %[t0], %[s0], 11 \n\t"
+ "and %[t3], %[t0], %[s5] \n\t"
+#endif
+ "precrq.ph.w %[t0], %[s1], %[s2] \n\t"
+ "shrl.qb %[t5], %[t0], 3 \n\t"
+ "and %[t4], %[t5], %[s5] \n\t"
+ "ins %[s2], %[s1], 16, 16 \n\t"
+ "preceu.ph.qbra %[t0], %[s2] \n\t"
+ "shrl.qb %[t6], %[t0], 3 \n\t"
+#ifdef __MIPS_HAVE_DSPR2
+ "shrl.ph %[t5], %[s2], 10 \n\t"
+#else
+ "shra.ph %[t0], %[s2], 10 \n\t"
+ "and %[t5], %[t0], %[s6] \n\t"
+#endif
+ "subu.qb %[t4], %[t4], %[t1] \n\t"
+ "subu.qb %[t5], %[t5], %[t2] \n\t"
+ "subu.qb %[t6], %[t6], %[t3] \n\t"
+ "muleu_s.ph.qbr %[t4], %[s4], %[t4] \n\t"
+ "muleu_s.ph.qbr %[t5], %[s4], %[t5] \n\t"
+ "muleu_s.ph.qbr %[t6], %[s4], %[t6] \n\t"
+ "addiu %[count], %[count], -2 \n\t"
+ "addiu %[src], %[src], 8 \n\t"
+ "shra.ph %[t4], %[t4], 8 \n\t"
+ "shra.ph %[t5], %[t5], 8 \n\t"
+ "shra.ph %[t6], %[t6], 8 \n\t"
+ "addu.qb %[t4], %[t4], %[t1] \n\t"
+ "addu.qb %[t5], %[t5], %[t2] \n\t"
+ "addu.qb %[t6], %[t6], %[t3] \n\t"
+ "andi %[s0], %[t4], 0xffff \n\t"
+ "andi %[t0], %[t5], 0xffff \n\t"
+ "sll %[t0], %[t0], 0x5 \n\t"
+ "or %[s0], %[s0], %[t0] \n\t"
+ "sll %[t0], %[t6], 0xb \n\t"
+ "or %[t0], %[t0], %[s0] \n\t"
+ "sh %[t0], 0(%[dst]) \n\t"
+ "srl %[s1], %[t4], 16 \n\t"
+ "srl %[t0], %[t5], 16 \n\t"
+ "sll %[t5], %[t0], 5 \n\t"
+ "or %[t0], %[t5], %[s1] \n\t"
+ "srl %[s0], %[t6], 16 \n\t"
+ "sll %[s2], %[s0], 0xb \n\t"
+ "or %[s1], %[s2], %[t0] \n\t"
+ "sh %[s1], 2(%[dst]) \n\t"
+ "bge %[count], 2, 1b \n\t"
+ " addiu %[dst], %[dst], 4 \n\t"
+ ".set pop \n\t"
+ : [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+ [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [s0]"=&r"(s0),
+ [s1]"=&r"(s1), [s2]"=&r"(s2), [s4]"=&r"(s4), [s5]"=&r"(s5),
+ [s6]"=&r"(s6), [count]"+r"(count), [dst]"+r"(dst),
+ [src]"+r"(src)
+ : [alpha]"r"(alpha)
+ : "memory", "hi", "lo"
+ );
+ }
+
+ if (count == 1) {
+ SkPMColor c = *src++;
+ SkPMColorAssert(c);
+ SkASSERT(SkGetPackedA32(c) == 255);
+ uint16_t d = *dst;
+ *dst++ = SkPackRGB16(SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), alpha),
+ SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), alpha),
+ SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), alpha));
+ }
+}
+
+static void S32A_D565_Opaque_Dither_mips_dsp(uint16_t* __restrict__ dst,
+ const SkPMColor* __restrict__ src,
+ int count, U8CPU alpha, int x, int y) {
+ __asm__ volatile (
+ "pref 0, 0(%[src]) \n\t"
+ "pref 1, 0(%[dst]) \n\t"
+ "pref 0, 32(%[src]) \n\t"
+ "pref 1, 32(%[dst]) \n\t"
+ :
+ : [src]"r"(src), [dst]"r"(dst)
+ : "memory"
+ );
+
+ register int32_t t0, t1, t2, t3, t4, t5, t6;
+ register int32_t t7, t8, t9, s0, s1, s2, s3;
+ const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
+
+ if (count >= 2) {
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "li %[s1], 0x01010101 \n\t"
+ "li %[s2], -2017 \n\t"
+ "1: \n\t"
+ "bnez %[s3], 4f \n\t"
+ " li %[s3], 2 \n\t"
+ "pref 0, 64(%[src]) \n\t"
+ "pref 1, 64(%[dst]) \n\t"
+ "4: \n\t"
+ "addiu %[s3], %[s3], -1 \n\t"
+ "lw %[t1], 0(%[src]) \n\t"
+ "andi %[t3], %[x], 0x3 \n\t"
+ "addiu %[x], %[x], 1 \n\t"
+ "sll %[t4], %[t3], 2 \n\t"
+ "srav %[t5], %[dither_scan], %[t4] \n\t"
+ "andi %[t3], %[t5], 0xf \n\t"
+ "lw %[t2], 4(%[src]) \n\t"
+ "andi %[t4], %[x], 0x3 \n\t"
+ "sll %[t5], %[t4], 2 \n\t"
+ "srav %[t6], %[dither_scan], %[t5] \n\t"
+ "addiu %[x], %[x], 1 \n\t"
+ "ins %[t3], %[t6], 8, 4 \n\t"
+ "srl %[t4], %[t1], 24 \n\t"
+ "addiu %[t0], %[t4], 1 \n\t"
+ "srl %[t4], %[t2], 24 \n\t"
+ "addiu %[t5], %[t4], 1 \n\t"
+ "ins %[t0], %[t5], 16, 16 \n\t"
+ "muleu_s.ph.qbr %[t4], %[t3], %[t0] \n\t"
+ "preceu.ph.qbla %[t3], %[t4] \n\t"
+ "andi %[t4], %[t1], 0xff \n\t"
+ "ins %[t4], %[t2], 16, 8 \n\t"
+ "shrl.qb %[t5], %[t4], 5 \n\t"
+ "subu.qb %[t6], %[t3], %[t5] \n\t"
+ "addq.ph %[t5], %[t6], %[t4] \n\t"
+ "ext %[t4], %[t1], 8, 8 \n\t"
+ "srl %[t6], %[t2], 8 \n\t"
+ "ins %[t4], %[t6], 16, 8 \n\t"
+ "shrl.qb %[t6], %[t4], 6 \n\t"
+ "shrl.qb %[t7], %[t3], 1 \n\t"
+ "subu.qb %[t8], %[t7], %[t6] \n\t"
+ "addq.ph %[t6], %[t8], %[t4] \n\t"
+ "ext %[t4], %[t1], 16, 8 \n\t"
+ "srl %[t7], %[t2], 16 \n\t"
+ "ins %[t4], %[t7], 16, 8 \n\t"
+ "shrl.qb %[t7], %[t4], 5 \n\t"
+ "subu.qb %[t8], %[t3], %[t7] \n\t"
+ "addq.ph %[t7], %[t8], %[t4] \n\t"
+ "shll.ph %[t4], %[t7], 2 \n\t"
+ "andi %[t9], %[t4], 0xffff \n\t"
+ "srl %[s0], %[t4], 16 \n\t"
+ "andi %[t3], %[t6], 0xffff \n\t"
+ "srl %[t4], %[t6], 16 \n\t"
+ "andi %[t6], %[t5], 0xffff \n\t"
+ "srl %[t7], %[t5], 16 \n\t"
+ "subq.ph %[t5], %[s1], %[t0] \n\t"
+ "srl %[t0], %[t5], 3 \n\t"
+ "beqz %[t1], 3f \n\t"
+ " lhu %[t5], 0(%[dst]) \n\t"
+ "sll %[t1], %[t6], 13 \n\t"
+ "or %[t8], %[t9], %[t1] \n\t"
+ "sll %[t1], %[t3], 24 \n\t"
+ "or %[t9], %[t1], %[t8] \n\t"
+ "andi %[t3], %[t5], 0x7e0 \n\t"
+ "sll %[t6], %[t3], 0x10 \n\t"
+ "and %[t8], %[s2], %[t5] \n\t"
+ "or %[t5], %[t6], %[t8] \n\t"
+ "andi %[t6], %[t0], 0xff \n\t"
+ "mul %[t1], %[t6], %[t5] \n\t"
+ "addu %[t5], %[t1], %[t9] \n\t"
+ "srl %[t6], %[t5], 5 \n\t"
+ "and %[t5], %[s2], %[t6] \n\t"
+ "srl %[t8], %[t6], 16 \n\t"
+ "andi %[t6], %[t8], 0x7e0 \n\t"
+ "or %[t1], %[t5], %[t6] \n\t"
+ "sh %[t1], 0(%[dst]) \n\t"
+ "3: \n\t"
+ "beqz %[t2], 2f \n\t"
+ " lhu %[t5], 2(%[dst]) \n\t"
+ "sll %[t1], %[t7], 13 \n\t"
+ "or %[t8], %[s0], %[t1] \n\t"
+ "sll %[t1], %[t4], 24 \n\t"
+ "or %[t9], %[t1], %[t8] \n\t"
+ "andi %[t3], %[t5], 0x7e0 \n\t"
+ "sll %[t6], %[t3], 0x10 \n\t"
+ "and %[t8], %[s2], %[t5] \n\t"
+ "or %[t5], %[t6], %[t8] \n\t"
+ "srl %[t6], %[t0], 16 \n\t"
+ "mul %[t1], %[t6], %[t5] \n\t"
+ "addu %[t5], %[t1], %[t9] \n\t"
+ "srl %[t6], %[t5], 5 \n\t"
+ "and %[t5], %[s2], %[t6] \n\t"
+ "srl %[t8], %[t6], 16 \n\t"
+ "andi %[t6], %[t8], 0x7e0 \n\t"
+ "or %[t1], %[t5], %[t6] \n\t"
+ "sh %[t1], 2(%[dst]) \n\t"
+ "2: \n\t"
+ "addiu %[count], %[count], -2 \n\t"
+ "addiu %[src], %[src], 8 \n\t"
+ "addiu %[t1], %[count], -1 \n\t"
+ "bgtz %[t1], 1b \n\t"
+ " addiu %[dst], %[dst], 4 \n\t"
+ ".set pop \n\t"
+ : [src]"+r"(src), [count]"+r"(count), [dst]"+r"(dst), [x]"+r"(x),
+ [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+ [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7),
+ [t8]"=&r"(t8), [t9]"=&r"(t9), [s0]"=&r"(s0), [s1]"=&r"(s1),
+ [s2]"=&r"(s2), [s3]"=&r"(s3)
+ : [dither_scan]"r"(dither_scan)
+ : "memory", "hi", "lo"
+ );
+ }
+
+ if (count == 1) {
+ SkPMColor c = *src++;
+ SkPMColorAssert(c);
+ if (c) {
+ unsigned a = SkGetPackedA32(c);
+ int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
+
+ unsigned sr = SkGetPackedR32(c);
+ unsigned sg = SkGetPackedG32(c);
+ unsigned sb = SkGetPackedB32(c);
+ sr = SkDITHER_R32_FOR_565(sr, d);
+ sg = SkDITHER_G32_FOR_565(sg, d);
+ sb = SkDITHER_B32_FOR_565(sb, d);
+
+ uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
+ uint32_t dst_expanded = SkExpand_rgb_16(*dst);
+ dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
+ // now src and dst expanded are in g:11 r:10 x:1 b:10
+ *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
+ }
+ dst += 1;
+ DITHER_INC_X(x);
+ }
+}
+
+static void S32_D565_Opaque_Dither_mips_dsp(uint16_t* __restrict__ dst,
+ const SkPMColor* __restrict__ src,
+ int count, U8CPU alpha, int x, int y) {
+ uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
+ register uint32_t t0, t1, t2, t3, t4, t5;
+ register uint32_t t6, t7, t8, t9, s0;
+ int dither[4];
+ int i;
+
+ for (i = 0; i < 4; i++, x++) {
+ dither[i] = (dither_scan >> ((x & 3) << 2)) & 0xF;
+ }
+
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "li %[s0], 1 \n\t"
+ "2: \n\t"
+ "beqz %[count], 1f \n\t"
+ " nop \n\t"
+ "addiu %[t0], %[count], -1 \n\t"
+ "beqz %[t0], 1f \n\t"
+ " nop \n\t"
+ "beqz %[s0], 3f \n\t"
+ " nop \n\t"
+ "lw %[t0], 0(%[dither]) \n\t"
+ "lw %[t1], 4(%[dither]) \n\t"
+ "li %[s0], 0 \n\t"
+ "b 4f \n\t"
+ " nop \n\t"
+ "3: \n\t"
+ "lw %[t0], 8(%[dither]) \n\t"
+ "lw %[t1], 12(%[dither]) \n\t"
+ "li %[s0], 1 \n\t"
+ "4: \n\t"
+ "sll %[t2], %[t0], 16 \n\t"
+ "or %[t1], %[t2], %[t1] \n\t"
+ "lw %[t0], 0(%[src]) \n\t"
+ "lw %[t2], 4(%[src]) \n\t"
+ "precrq.ph.w %[t3], %[t0], %[t2] \n\t"
+ "preceu.ph.qbra %[t9], %[t3] \n\t"
+#ifdef __MIPS_HAVE_DSPR2
+ "append %[t0], %[t2], 16 \n\t"
+ "preceu.ph.qbra %[t4], %[t0] \n\t"
+ "preceu.ph.qbla %[t5], %[t0] \n\t"
+#else
+ "sll %[t6], %[t0], 16 \n\t"
+ "sll %[t7], %[t2], 16 \n\t"
+ "precrq.ph.w %[t8], %[t6], %[t7] \n\t"
+ "preceu.ph.qbra %[t4], %[t8] \n\t"
+ "preceu.ph.qbla %[t5], %[t8] \n\t"
+#endif
+ "addu.qb %[t0], %[t4], %[t1] \n\t"
+ "shra.ph %[t2], %[t4], 5 \n\t"
+ "subu.qb %[t3], %[t0], %[t2] \n\t"
+ "shra.ph %[t6], %[t3], 3 \n\t"
+ "addu.qb %[t0], %[t9], %[t1] \n\t"
+ "shra.ph %[t2], %[t9], 5 \n\t"
+ "subu.qb %[t3], %[t0], %[t2] \n\t"
+ "shra.ph %[t7], %[t3], 3 \n\t"
+ "shra.ph %[t0], %[t1], 1 \n\t"
+ "shra.ph %[t2], %[t5], 6 \n\t"
+ "addu.qb %[t3], %[t5], %[t0] \n\t"
+ "subu.qb %[t4], %[t3], %[t2] \n\t"
+ "shra.ph %[t8], %[t4], 2 \n\t"
+ "precrq.ph.w %[t0], %[t6], %[t7] \n\t"
+#ifdef __MIPS_HAVE_DSPR2
+ "append %[t6], %[t7], 16 \n\t"
+#else
+ "sll %[t6], %[t6], 16 \n\t"
+ "sll %[t2], %[t7], 16 \n\t"
+ "precrq.ph.w %[t6], %[t6], %[t2] \n\t"
+#endif
+ "sra %[t4], %[t8], 16 \n\t"
+ "andi %[t5], %[t8], 0xFF \n\t"
+ "sll %[t7], %[t4], 5 \n\t"
+ "sra %[t8], %[t0], 5 \n\t"
+ "or %[t9], %[t7], %[t8] \n\t"
+ "or %[t3], %[t9], %[t0] \n\t"
+ "andi %[t4], %[t3], 0xFFFF \n\t"
+ "sll %[t7], %[t5], 5 \n\t"
+ "sra %[t8], %[t6], 5 \n\t"
+ "or %[t9], %[t7], %[t8] \n\t"
+ "or %[t3], %[t9], %[t6] \n\t"
+ "and %[t7], %[t3], 0xFFFF \n\t"
+ "sh %[t4], 0(%[dst]) \n\t"
+ "sh %[t7], 2(%[dst]) \n\t"
+ "addiu %[count], %[count], -2 \n\t"
+ "addiu %[src], %[src], 8 \n\t"
+ "b 2b \n\t"
+ " addiu %[dst], %[dst], 4 \n\t"
+ "1: \n\t"
+ ".set pop \n\t"
+ : [dst]"+r"(dst), [src]"+r"(src), [count]"+r"(count),
+ [x]"+r"(x), [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2),
+ [t3]"=&r"(t3), [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6),
+ [t7]"=&r"(t7), [t8]"=&r"(t8), [t9]"=&r"(t9), [s0]"=&r"(s0)
+ : [dither] "r" (dither)
+ : "memory"
+ );
+
+ if (count == 1) {
+ SkPMColor c = *src++;
+ SkPMColorAssert(c); // only if DEBUG is turned on
+ SkASSERT(SkGetPackedA32(c) == 255);
+ unsigned dither = DITHER_VALUE(x);
+ *dst++ = SkDitherRGB32To565(c, dither);
+ }
+}
+
+static void S32_D565_Blend_Dither_mips_dsp(uint16_t* dst,
+ const SkPMColor* src,
+ int count, U8CPU alpha, int x, int y) {
+ register int32_t t0, t1, t2, t3, t4, t5, t6;
+ register int32_t s0, s1, s2, s3;
+ register int x1 = 0;
+ register uint32_t sc_mul;
+ register uint32_t sc_add;
+#ifdef ENABLE_DITHER_MATRIX_4X4
+ const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
+#else // ENABLE_DITHER_MATRIX_4X4
+ const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
+#endif // ENABLE_DITHER_MATRIX_4X4
+ int dither[4];
+
+ for (int i = 0; i < 4; i++) {
+ dither[i] = (dither_scan >> ((x & 3) << 2)) & 0xF;
+ x += 1;
+ }
+ alpha += 1;
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "li %[t0], 0x100 \n\t"
+ "subu %[t0], %[t0], %[alpha] \n\t"
+ "replv.ph %[sc_mul], %[alpha] \n\t"
+ "beqz %[alpha], 1f \n\t"
+ " nop \n\t"
+ "replv.qb %[sc_add], %[t0] \n\t"
+ "b 2f \n\t"
+ " nop \n\t"
+ "1: \n\t"
+ "replv.qb %[sc_add], %[alpha] \n\t"
+ "2: \n\t"
+ "addiu %[t2], %[count], -1 \n\t"
+ "blez %[t2], 3f \n\t"
+ " nop \n\t"
+ "lw %[s0], 0(%[src]) \n\t"
+ "lw %[s1], 4(%[src]) \n\t"
+ "bnez %[x1], 4f \n\t"
+ " nop \n\t"
+ "lw %[t0], 0(%[dither]) \n\t"
+ "lw %[t1], 4(%[dither]) \n\t"
+ "li %[x1], 1 \n\t"
+ "b 5f \n\t"
+ " nop \n\t"
+ "4: \n\t"
+ "lw %[t0], 8(%[dither]) \n\t"
+ "lw %[t1], 12(%[dither]) \n\t"
+ "li %[x1], 0 \n\t"
+ "5: \n\t"
+ "sll %[t3], %[t0], 7 \n\t"
+ "sll %[t4], %[t1], 7 \n\t"
+#ifdef __MIPS_HAVE_DSPR2
+ "append %[t0], %[t1], 16 \n\t"
+#else
+ "sll %[t0], %[t0], 8 \n\t"
+ "sll %[t2], %[t1], 8 \n\t"
+ "precrq.qb.ph %[t0], %[t0], %[t2] \n\t"
+#endif
+ "precrq.qb.ph %[t1], %[t3], %[t4] \n\t"
+ "sll %[t5], %[s0], 8 \n\t"
+ "sll %[t6], %[s1], 8 \n\t"
+ "precrq.qb.ph %[t4], %[t5], %[t6] \n\t"
+ "precrq.qb.ph %[t6], %[s0], %[s1] \n\t"
+ "preceu.ph.qbla %[t5], %[t4] \n\t"
+ "preceu.ph.qbra %[t4], %[t4] \n\t"
+ "preceu.ph.qbra %[t6], %[t6] \n\t"
+ "lh %[t2], 0(%[dst]) \n\t"
+ "lh %[s1], 2(%[dst]) \n\t"
+#ifdef __MIPS_HAVE_DSPR2
+ "append %[t2], %[s1], 16 \n\t"
+#else
+ "sll %[s1], %[s1], 16 \n\t"
+ "packrl.ph %[t2], %[t2], %[s1] \n\t"
+#endif
+ "shra.ph %[s1], %[t2], 11 \n\t"
+ "and %[s1], %[s1], 0x1F001F \n\t"
+ "shra.ph %[s2], %[t2], 5 \n\t"
+ "and %[s2], %[s2], 0x3F003F \n\t"
+ "and %[s3], %[t2], 0x1F001F \n\t"
+ "shrl.qb %[t3], %[t4], 5 \n\t"
+ "addu.qb %[t4], %[t4], %[t0] \n\t"
+ "subu.qb %[t4], %[t4], %[t3] \n\t"
+ "shrl.qb %[t4], %[t4], 3 \n\t"
+ "shrl.qb %[t3], %[t5], 5 \n\t"
+ "addu.qb %[t5], %[t5], %[t0] \n\t"
+ "subu.qb %[t5], %[t5], %[t3] \n\t"
+ "shrl.qb %[t5], %[t5], 3 \n\t"
+ "shrl.qb %[t3], %[t6], 6 \n\t"
+ "addu.qb %[t6], %[t6], %[t1] \n\t"
+ "subu.qb %[t6], %[t6], %[t3] \n\t"
+ "shrl.qb %[t6], %[t6], 2 \n\t"
+ "cmpu.lt.qb %[t4], %[s1] \n\t"
+ "pick.qb %[s0], %[sc_add], $0 \n\t"
+ "addu.qb %[s0], %[s0], %[s1] \n\t"
+ "subu.qb %[t4], %[t4], %[s1] \n\t"
+ "muleu_s.ph.qbl %[t0], %[t4], %[sc_mul] \n\t"
+ "muleu_s.ph.qbr %[t1], %[t4], %[sc_mul] \n\t"
+ "precrq.qb.ph %[t4], %[t0], %[t1] \n\t"
+ "addu.qb %[t4], %[t4], %[s0] \n\t"
+ "cmpu.lt.qb %[t5], %[s3] \n\t"
+ "pick.qb %[s0], %[sc_add], $0 \n\t"
+ "addu.qb %[s0], %[s0], %[s3] \n\t"
+ "subu.qb %[t5], %[t5], %[s3] \n\t"
+ "muleu_s.ph.qbl %[t0], %[t5], %[sc_mul] \n\t"
+ "muleu_s.ph.qbr %[t1], %[t5], %[sc_mul] \n\t"
+ "precrq.qb.ph %[t5], %[t0], %[t1] \n\t"
+ "addu.qb %[t5], %[t5], %[s0] \n\t"
+ "cmpu.lt.qb %[t6], %[s2] \n\t"
+ "pick.qb %[s0], %[sc_add], $0 \n\t"
+ "addu.qb %[s0], %[s0], %[s2] \n\t"
+ "subu.qb %[t6], %[t6], %[s2] \n\t"
+ "muleu_s.ph.qbl %[t0], %[t6], %[sc_mul] \n\t"
+ "muleu_s.ph.qbr %[t1], %[t6], %[sc_mul] \n\t"
+ "precrq.qb.ph %[t6], %[t0], %[t1] \n\t"
+ "addu.qb %[t6], %[t6], %[s0] \n\t"
+ "shll.ph %[s1], %[t4], 11 \n\t"
+ "shll.ph %[t0], %[t6], 5 \n\t"
+ "or %[s0], %[s1], %[t0] \n\t"
+ "or %[s1], %[s0], %[t5] \n\t"
+ "srl %[t2], %[s1], 16 \n\t"
+ "and %[t3], %[s1], 0xFFFF \n\t"
+ "sh %[t2], 0(%[dst]) \n\t"
+ "sh %[t3], 2(%[dst]) \n\t"
+ "addiu %[src], %[src], 8 \n\t"
+ "addi %[count], %[count], -2 \n\t"
+ "b 2b \n\t"
+ " addu %[dst], %[dst], 4 \n\t"
+ "3: \n\t"
+ ".set pop \n\t"
+ : [src]"+r"(src), [dst]"+r"(dst), [count]"+r"(count),
+ [x1]"+r"(x1), [sc_mul]"=&r"(sc_mul), [sc_add]"=&r"(sc_add),
+ [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+ [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [s0]"=&r"(s0),
+ [s1]"=&r"(s1), [s2]"=&r"(s2), [s3]"=&r"(s3)
+ : [dither]"r"(dither), [alpha]"r"(alpha)
+ : "memory", "hi", "lo"
+ );
+
+    if (count == 1) {
+ SkPMColor c = *src++;
+ SkPMColorAssert(c);
+ SkASSERT(SkGetPackedA32(c) == 255);
+ DITHER_565_SCAN(y);
+ int dither = DITHER_VALUE(x);
+ int sr = SkGetPackedR32(c);
+ int sg = SkGetPackedG32(c);
+ int sb = SkGetPackedB32(c);
+ sr = SkDITHER_R32To565(sr, dither);
+ sg = SkDITHER_G32To565(sg, dither);
+ sb = SkDITHER_B32To565(sb, dither);
+
+ uint16_t d = *dst;
+ *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), alpha),
+ SkAlphaBlend(sg, SkGetPackedG16(d), alpha),
+ SkAlphaBlend(sb, SkGetPackedB16(d), alpha));
+ DITHER_INC_X(x);
+ }
+}
+
+static void S32A_D565_Opaque_mips_dsp(uint16_t* __restrict__ dst,
+ const SkPMColor* __restrict__ src,
+ int count, U8CPU alpha, int x, int y) {
+
+ __asm__ volatile (
+ "pref 0, 0(%[src]) \n\t"
+ "pref 1, 0(%[dst]) \n\t"
+ "pref 0, 32(%[src]) \n\t"
+ "pref 1, 32(%[dst]) \n\t"
+ :
+ : [src]"r"(src), [dst]"r"(dst)
+ : "memory"
+ );
+
+ register uint32_t t0, t1, t2, t3, t4, t5, t6, t7, t8;
+ register uint32_t t16;
+ register uint32_t add_x10 = 0x100010;
+ register uint32_t add_x20 = 0x200020;
+ register uint32_t sa = 0xff00ff;
+
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "blez %[count], 1f \n\t"
+ " nop \n\t"
+ "2: \n\t"
+ "beqz %[count], 1f \n\t"
+ " nop \n\t"
+ "addiu %[t0], %[count], -1 \n\t"
+ "beqz %[t0], 1f \n\t"
+ " nop \n\t"
+ "bnez %[t16], 3f \n\t"
+ " nop \n\t"
+ "li %[t16], 2 \n\t"
+ "pref 0, 64(%[src]) \n\t"
+ "pref 1, 64(%[dst]) \n\t"
+ "3: \n\t"
+ "addiu %[t16], %[t16], -1 \n\t"
+ "lw %[t0], 0(%[src]) \n\t"
+ "lw %[t1], 4(%[src]) \n\t"
+ "precrq.ph.w %[t2], %[t0], %[t1] \n\t"
+ "preceu.ph.qbra %[t8], %[t2] \n\t"
+#ifdef __MIPS_HAVE_DSPR2
+ "append %[t0], %[t1], 16 \n\t"
+#else
+ "sll %[t0], %[t0], 16 \n\t"
+ "sll %[t6], %[t1], 16 \n\t"
+ "precrq.ph.w %[t0], %[t0], %[t6] \n\t"
+#endif
+ "preceu.ph.qbra %[t3], %[t0] \n\t"
+ "preceu.ph.qbla %[t4], %[t0] \n\t"
+ "preceu.ph.qbla %[t0], %[t2] \n\t"
+ "subq.ph %[t1], %[sa], %[t0] \n\t"
+ "sra %[t2], %[t1], 8 \n\t"
+ "or %[t5], %[t2], %[t1] \n\t"
+ "replv.ph %[t2], %[t5] \n\t"
+ "lh %[t0], 0(%[dst]) \n\t"
+ "lh %[t1], 2(%[dst]) \n\t"
+ "and %[t1], %[t1], 0xffff \n\t"
+#ifdef __MIPS_HAVE_DSPR2
+ "append %[t0], %[t1], 16 \n\t"
+#else
+ "sll %[t5], %[t0], 16 \n\t"
+ "or %[t0], %[t5], %[t1] \n\t"
+#endif
+ "and %[t1], %[t0], 0x1f001f \n\t"
+ "shra.ph %[t6], %[t0], 11 \n\t"
+ "and %[t6], %[t6], 0x1f001f \n\t"
+ "and %[t7], %[t0], 0x7e007e0 \n\t"
+ "shra.ph %[t5], %[t7], 5 \n\t"
+ "muleu_s.ph.qbl %[t0], %[t2], %[t6] \n\t"
+ "addq.ph %[t7], %[t0], %[add_x10] \n\t"
+ "shra.ph %[t6], %[t7], 5 \n\t"
+ "addq.ph %[t6], %[t7], %[t6] \n\t"
+ "shra.ph %[t0], %[t6], 5 \n\t"
+ "addq.ph %[t7], %[t0], %[t3] \n\t"
+ "shra.ph %[t6], %[t7], 3 \n\t"
+ "muleu_s.ph.qbl %[t0], %[t2], %[t1] \n\t"
+ "addq.ph %[t7], %[t0], %[add_x10] \n\t"
+ "shra.ph %[t0], %[t7], 5 \n\t"
+ "addq.ph %[t7], %[t7], %[t0] \n\t"
+ "shra.ph %[t0], %[t7], 5 \n\t"
+ "addq.ph %[t7], %[t0], %[t8] \n\t"
+ "shra.ph %[t3], %[t7], 3 \n\t"
+ "muleu_s.ph.qbl %[t0], %[t2], %[t5] \n\t"
+ "addq.ph %[t7], %[t0], %[add_x20] \n\t"
+ "shra.ph %[t0], %[t7], 6 \n\t"
+ "addq.ph %[t8], %[t7], %[t0] \n\t"
+ "shra.ph %[t0], %[t8], 6 \n\t"
+ "addq.ph %[t7], %[t0], %[t4] \n\t"
+ "shra.ph %[t8], %[t7], 2 \n\t"
+ "shll.ph %[t0], %[t8], 5 \n\t"
+ "shll.ph %[t1], %[t6], 11 \n\t"
+ "or %[t2], %[t0], %[t1] \n\t"
+ "or %[t3], %[t2], %[t3] \n\t"
+ "sra %[t4], %[t3], 16 \n\t"
+ "sh %[t4], 0(%[dst]) \n\t"
+ "sh %[t3], 2(%[dst]) \n\t"
+ "addiu %[count], %[count], -2 \n\t"
+ "addiu %[src], %[src], 8 \n\t"
+ "b 2b \n\t"
+ " addiu %[dst], %[dst], 4 \n\t"
+ "1: \n\t"
+ ".set pop \n\t"
+ : [dst]"+r"(dst), [src]"+r"(src), [count]"+r"(count),
+ [t16]"=&r"(t16), [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2),
+ [t3]"=&r"(t3), [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6),
+ [t7]"=&r"(t7), [t8]"=&r"(t8)
+ : [add_x10]"r"(add_x10), [add_x20]"r"(add_x20), [sa]"r"(sa)
+ : "memory", "hi", "lo"
+ );
+
+ if (count == 1) {
+ SkPMColor c = *src++;
+ SkPMColorAssert(c);
+ if (c) {
+ *dst = SkSrcOver32To16(c, *dst);
+ }
+ dst += 1;
+ }
+}
+
+static void S32A_D565_Blend_mips_dsp(uint16_t* SK_RESTRICT dst,
+ const SkPMColor* SK_RESTRICT src, int count,
+ U8CPU alpha, int /*x*/, int /*y*/) {
+ register uint32_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9;
+ register uint32_t s0, s1, s2, s3;
+ register unsigned dst_scale = 0;
+
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "replv.qb %[t0], %[alpha] \n\t"
+ "repl.ph %[t6], 0x80 \n\t"
+ "repl.ph %[t7], 0xFF \n\t"
+ "1: \n\t"
+ "addiu %[t8], %[count], -1 \n\t"
+ "blez %[t8], 2f \n\t"
+ " nop \n\t"
+ "lw %[t8], 0(%[src]) \n\t"
+ "lw %[t9], 4(%[src]) \n\t"
+ "lh %[t4], 0(%[dst]) \n\t"
+ "lh %[t5], 2(%[dst]) \n\t"
+ "sll %[t5], %[t5], 16 \n\t"
+ "sll %[t2], %[t8], 8 \n\t"
+ "sll %[t3], %[t9], 8 \n\t"
+ "precrq.qb.ph %[t1], %[t2], %[t3] \n\t"
+ "precrq.qb.ph %[t3], %[t8], %[t9] \n\t"
+ "preceu.ph.qbla %[t8], %[t3] \n\t"
+ "muleu_s.ph.qbr %[s3], %[t0], %[t8] \n\t"
+ "preceu.ph.qbla %[t2], %[t1] \n\t"
+ "preceu.ph.qbra %[t1], %[t1] \n\t"
+ "preceu.ph.qbra %[t3], %[t3] \n\t"
+ "packrl.ph %[t9], %[t4], %[t5] \n\t"
+ "shra.ph %[s0], %[t9], 11 \n\t"
+ "and %[s0], %[s0], 0x1F001F \n\t"
+ "shra.ph %[s1], %[t9], 5 \n\t"
+ "and %[s1], %[s1], 0x3F003F \n\t"
+ "and %[s2], %[t9], 0x1F001F \n\t"
+ "addq.ph %[s3], %[s3], %[t6] \n\t"
+ "shra.ph %[t5], %[s3], 8 \n\t"
+ "and %[t5], %[t5], 0xFF00FF \n\t"
+ "addq.ph %[dst_scale], %[s3], %[t5] \n\t"
+ "shra.ph %[dst_scale], %[dst_scale], 8 \n\t"
+ "subq_s.ph %[dst_scale], %[t7], %[dst_scale] \n\t"
+ "sll %[dst_scale], %[dst_scale], 8 \n\t"
+ "precrq.qb.ph %[dst_scale], %[dst_scale], %[dst_scale] \n\t"
+ "shrl.qb %[t1], %[t1], 3 \n\t"
+ "shrl.qb %[t2], %[t2], 3 \n\t"
+ "shrl.qb %[t3], %[t3], 2 \n\t"
+ "muleu_s.ph.qbl %[t1], %[t0], %[t1] \n\t"
+ "muleu_s.ph.qbl %[t2], %[t0], %[t2] \n\t"
+ "muleu_s.ph.qbl %[t3], %[t0], %[t3] \n\t"
+ "muleu_s.ph.qbl %[t8], %[dst_scale], %[s0] \n\t"
+ "muleu_s.ph.qbl %[t9], %[dst_scale], %[s2] \n\t"
+ "muleu_s.ph.qbl %[t4], %[dst_scale], %[s1] \n\t"
+ "addq.ph %[t1], %[t1], %[t8] \n\t"
+ "addq.ph %[t2], %[t2], %[t9] \n\t"
+ "addq.ph %[t3], %[t3], %[t4] \n\t"
+ "addq.ph %[t8], %[t1], %[t6] \n\t"
+ "addq.ph %[t9], %[t2], %[t6] \n\t"
+ "addq.ph %[t4], %[t3], %[t6] \n\t"
+ "shra.ph %[t1], %[t8], 8 \n\t"
+ "addq.ph %[t1], %[t1], %[t8] \n\t"
+ "preceu.ph.qbla %[t1], %[t1] \n\t"
+ "shra.ph %[t2], %[t9], 8 \n\t"
+ "addq.ph %[t2], %[t2], %[t9] \n\t"
+ "preceu.ph.qbla %[t2], %[t2] \n\t"
+ "shra.ph %[t3], %[t4], 8 \n\t"
+ "addq.ph %[t3], %[t3], %[t4] \n\t"
+ "preceu.ph.qbla %[t3], %[t3] \n\t"
+ "shll.ph %[t8], %[t1], 11 \n\t"
+ "shll.ph %[t9], %[t3], 5 \n\t"
+ "or %[t8], %[t8], %[t9] \n\t"
+ "or %[s0], %[t8], %[t2] \n\t"
+ "srl %[t8], %[s0], 16 \n\t"
+ "and %[t9], %[s0], 0xFFFF \n\t"
+ "sh %[t8], 0(%[dst]) \n\t"
+ "sh %[t9], 2(%[dst]) \n\t"
+ "addiu %[src], %[src], 8 \n\t"
+ "addiu %[count], %[count], -2 \n\t"
+ "b 1b \n\t"
+ " addiu %[dst], %[dst], 4 \n\t"
+ "2: \n\t"
+ ".set pop \n\t"
+ : [src]"+r"(src), [dst]"+r"(dst), [count]"+r"(count),
+ [dst_scale]"+r"(dst_scale), [s0]"=&r"(s0), [s1]"=&r"(s1),
+ [s2]"=&r"(s2), [s3]"=&r"(s3), [t0]"=&r"(t0), [t1]"=&r"(t1),
+ [t2]"=&r"(t2), [t3]"=&r"(t3), [t4]"=&r"(t4), [t5]"=&r"(t5),
+ [t6]"=&r"(t6), [t7]"=&r"(t7), [t8]"=&r"(t8), [t9]"=&r"(t9)
+ : [alpha]"r"(alpha)
+ : "memory", "hi", "lo"
+ );
+
+ if (count == 1) {
+ SkPMColor sc = *src++;
+ SkPMColorAssert(sc);
+ if (sc) {
+ uint16_t dc = *dst;
+ unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
+ unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) +
+ SkMulS16(SkGetPackedR16(dc), dst_scale);
+ unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) +
+ SkMulS16(SkGetPackedG16(dc), dst_scale);
+ unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) +
+ SkMulS16(SkGetPackedB16(dc), dst_scale);
+ *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
+ }
+ dst += 1;
+ }
+}
+
+static void S32_Blend_BlitRow32_mips_dsp(SkPMColor* SK_RESTRICT dst,
+ const SkPMColor* SK_RESTRICT src,
+ int count, U8CPU alpha) {
+ register int32_t t0, t1, t2, t3, t4, t5, t6, t7;
+
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "li %[t2], 0x100 \n\t"
+ "addiu %[t0], %[alpha], 1 \n\t"
+ "subu %[t1], %[t2], %[t0] \n\t"
+ "replv.qb %[t7], %[t0] \n\t"
+ "replv.qb %[t6], %[t1] \n\t"
+ "1: \n\t"
+ "blez %[count], 2f \n\t"
+ "lw %[t0], 0(%[src]) \n\t"
+ "lw %[t1], 0(%[dst]) \n\t"
+ "preceu.ph.qbr %[t2], %[t0] \n\t"
+ "preceu.ph.qbl %[t3], %[t0] \n\t"
+ "preceu.ph.qbr %[t4], %[t1] \n\t"
+ "preceu.ph.qbl %[t5], %[t1] \n\t"
+ "muleu_s.ph.qbr %[t2], %[t7], %[t2] \n\t"
+ "muleu_s.ph.qbr %[t3], %[t7], %[t3] \n\t"
+ "muleu_s.ph.qbr %[t4], %[t6], %[t4] \n\t"
+ "muleu_s.ph.qbr %[t5], %[t6], %[t5] \n\t"
+ "addiu %[src], %[src], 4 \n\t"
+ "addiu %[count], %[count], -1 \n\t"
+ "precrq.qb.ph %[t0], %[t3], %[t2] \n\t"
+ "precrq.qb.ph %[t2], %[t5], %[t4] \n\t"
+ "addu %[t1], %[t0], %[t2] \n\t"
+ "sw %[t1], 0(%[dst]) \n\t"
+ "b 1b \n\t"
+ " addi %[dst], %[dst], 4 \n\t"
+ "2: \n\t"
+ ".set pop \n\t"
+ : [src]"+r"(src), [dst]"+r"(dst), [count]"+r"(count),
+ [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+ [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
+ : [alpha]"r"(alpha)
+ : "memory", "hi", "lo"
+ );
+}
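For reference, a scalar sketch of the per-byte blend the DSP loop above performs (helper name hypothetical): with src_scale = alpha + 1 and dst_scale = 256 - src_scale, each channel becomes ((s * src_scale) >> 8) + ((d * dst_scale) >> 8), the same arithmetic as the portable blend path.

    // Illustrative only: one 8-bit channel of the blend above.
    static inline uint8_t blend_channel_sketch(uint8_t s, uint8_t d, unsigned alpha /* 0..255 */) {
        unsigned src_scale = alpha + 1;        // 1..256
        unsigned dst_scale = 256 - src_scale;  // 0..255
        return (uint8_t)(((s * src_scale) >> 8) + ((d * dst_scale) >> 8));
    }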
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+const SkBlitRow::Proc platform_565_procs_mips_dsp[] = {
+ // no dither
+ NULL,
+ S32_D565_Blend_mips_dsp,
+ S32A_D565_Opaque_mips_dsp,
+ S32A_D565_Blend_mips_dsp,
+
+ // dither
+ S32_D565_Opaque_Dither_mips_dsp,
+ S32_D565_Blend_Dither_mips_dsp,
+ S32A_D565_Opaque_Dither_mips_dsp,
+ NULL,
+};
+
+static const SkBlitRow::Proc32 platform_32_procs_mips_dsp[] = {
+ NULL, // S32_Opaque,
+ S32_Blend_BlitRow32_mips_dsp, // S32_Blend,
+ NULL, // S32A_Opaque,
+ NULL, // S32A_Blend,
+};
+
+SkBlitRow::Proc SkBlitRow::PlatformProcs565(unsigned flags) {
+ return platform_565_procs_mips_dsp[flags];
+}
+
+SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
+ return platform_32_procs_mips_dsp[flags];
+}
+
+SkBlitRow::ColorRectProc PlatformColorRectProcFactory() {
+ return NULL;
+}
+
+SkBlitRow::ColorProc SkBlitRow::PlatformColorProc() {
+ return NULL;
+}
diff --git a/chromium/third_party/skia/src/opts/SkBlurImage_opts_SSE2.cpp b/chromium/third_party/skia/src/opts/SkBlurImage_opts_SSE2.cpp
index 93830d78b46..bbc6a66462e 100644
--- a/chromium/third_party/skia/src/opts/SkBlurImage_opts_SSE2.cpp
+++ b/chromium/third_party/skia/src/opts/SkBlurImage_opts_SSE2.cpp
@@ -5,36 +5,31 @@
* found in the LICENSE file.
*/
-
+#include <emmintrin.h>
#include "SkBitmap.h"
-#include "SkColorPriv.h"
#include "SkBlurImage_opts_SSE2.h"
+#include "SkColorPriv.h"
#include "SkRect.h"
-#include <emmintrin.h>
-
namespace {
-
enum BlurDirection {
kX, kY
};
-/**
- * Helper function to spread the components of a 32-bit integer into the
+/* Helper function to spread the components of a 32-bit integer into the
* lower 8 bits of each 32-bit element of an SSE register.
*/
-
inline __m128i expand(int a) {
- const __m128i zero = _mm_setzero_si128();
+ const __m128i zero = _mm_setzero_si128();
- // 0 0 0 0 0 0 0 0 0 0 0 0 A R G B
- __m128i result = _mm_cvtsi32_si128(a);
+ // 0 0 0 0 0 0 0 0 0 0 0 0 A R G B
+ __m128i result = _mm_cvtsi32_si128(a);
- // 0 0 0 0 0 0 0 0 0 A 0 R 0 G 0 B
- result = _mm_unpacklo_epi8(result, zero);
+ // 0 0 0 0 0 0 0 0 0 A 0 R 0 G 0 B
+ result = _mm_unpacklo_epi8(result, zero);
- // 0 0 0 A 0 0 0 R 0 0 0 G 0 0 0 B
- return _mm_unpacklo_epi16(result, zero);
+ // 0 0 0 A 0 0 0 R 0 0 0 G 0 0 0 B
+ return _mm_unpacklo_epi16(result, zero);
}
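A worked value may help (illustrative only): expand(0xAABBCCDD) yields the 32-bit lanes {0xDD, 0xCC, 0xBB, 0xAA}, i.e. each source byte lands in the low 8 bits of its own lane.

    // Illustrative only:
    //   __m128i v = expand(0xAABBCCDD);
    //   // lanes 0..3 now hold 0xDD, 0xCC, 0xBB, 0xAA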
template<BlurDirection srcDirection, BlurDirection dstDirection>
diff --git a/chromium/third_party/skia/src/opts/SkBlurImage_opts_SSE2.h b/chromium/third_party/skia/src/opts/SkBlurImage_opts_SSE2.h
index c8deea4bb9c..db104bacf4f 100644
--- a/chromium/third_party/skia/src/opts/SkBlurImage_opts_SSE2.h
+++ b/chromium/third_party/skia/src/opts/SkBlurImage_opts_SSE2.h
@@ -5,9 +5,14 @@
* found in the LICENSE file.
*/
+#ifndef SkBlurImage_opts_SSE2_DEFINED
+#define SkBlurImage_opts_SSE2_DEFINED
+
#include "SkBlurImage_opts.h"
bool SkBoxBlurGetPlatformProcs_SSE2(SkBoxBlurProc* boxBlurX,
SkBoxBlurProc* boxBlurY,
SkBoxBlurProc* boxBlurXY,
SkBoxBlurProc* boxBlurYX);
+
+#endif
diff --git a/chromium/third_party/skia/src/opts/SkBlurImage_opts_arm.cpp b/chromium/third_party/skia/src/opts/SkBlurImage_opts_arm.cpp
new file mode 100644
index 00000000000..10d595afa59
--- /dev/null
+++ b/chromium/third_party/skia/src/opts/SkBlurImage_opts_arm.cpp
@@ -0,0 +1,25 @@
+/*
+ * Copyright 2014 ARM Ltd.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "SkBlurImage_opts_neon.h"
+#include "SkUtilsArm.h"
+
+bool SkBoxBlurGetPlatformProcs(SkBoxBlurProc* boxBlurX,
+ SkBoxBlurProc* boxBlurY,
+ SkBoxBlurProc* boxBlurXY,
+ SkBoxBlurProc* boxBlurYX) {
+#if SK_ARM_NEON_IS_NONE
+ return false;
+#else
+#if SK_ARM_NEON_IS_DYNAMIC
+ if (!sk_cpu_arm_has_neon()) {
+ return false;
+ }
+#endif
+ return SkBoxBlurGetPlatformProcs_NEON(boxBlurX, boxBlurY, boxBlurXY, boxBlurYX);
+#endif
+}
diff --git a/chromium/third_party/skia/src/opts/SkBlurImage_opts_neon.cpp b/chromium/third_party/skia/src/opts/SkBlurImage_opts_neon.cpp
index 4e33d72d462..08187f3e55e 100644
--- a/chromium/third_party/skia/src/opts/SkBlurImage_opts_neon.cpp
+++ b/chromium/third_party/skia/src/opts/SkBlurImage_opts_neon.cpp
@@ -20,6 +20,86 @@ enum BlurDirection {
};
/**
+ * Helper function to load 2 pixels from different rows into an 8x8 NEON register
+ * and also pre-load pixels for future reads
+ */
+template<BlurDirection srcDirection>
+inline uint8x8_t load_2_pixels(const SkPMColor* src, int srcStride) {
+ if (srcDirection == kX) {
+ uint32x2_t temp = vdup_n_u32(0);
+ // 10% faster by adding these 2 prefetches
+ SK_PREFETCH(src + 16);
+ SK_PREFETCH(src + srcStride + 16);
+ return vreinterpret_u8_u32(vld1_lane_u32(src + srcStride, vld1_lane_u32(src, temp, 0), 1));
+ } else {
+ return vld1_u8((uint8_t*)src);
+ }
+}
+
+/**
+ * Helper function to store the low 8 bits of each lane of a 16x8 NEON register to 2 rows
+ */
+template<BlurDirection dstDirection>
+inline void store_2_pixels(uint16x8_t result16x8, SkPMColor* dst, int dstStride) {
+ if (dstDirection == kX) {
+ uint32x2_t temp = vreinterpret_u32_u8(vmovn_u16(result16x8));
+ vst1_lane_u32(dst, temp, 0);
+ vst1_lane_u32(dst + dstStride, temp, 1);
+ } else {
+ uint8x8_t temp = vmovn_u16(result16x8);
+ vst1_u8((uint8_t*)dst, temp);
+ }
+}
+
+/**
+ * fast path for kernel size less than 128
+ */
+template<BlurDirection srcDirection, BlurDirection dstDirection>
+void SkDoubleRowBoxBlur_NEON(const SkPMColor** src, int srcStride, SkPMColor** dst, int kernelSize,
+ int leftOffset, int rightOffset, int width, int* height)
+{
+ const int rightBorder = SkMin32(rightOffset + 1, width);
+ const int srcStrideX = srcDirection == kX ? 1 : srcStride;
+ const int dstStrideX = dstDirection == kX ? 1 : *height;
+ const int srcStrideY = srcDirection == kX ? srcStride : 1;
+ const int dstStrideY = dstDirection == kX ? width : 1;
+ const uint16x8_t scale = vdupq_n_u16((1 << 15) / kernelSize);
+
+ for (; *height >= 2; *height -= 2) {
+ uint16x8_t sum = vdupq_n_u16(0);
+ const SkPMColor* p = *src;
+ for (int i = 0; i < rightBorder; i++) {
+ sum = vaddw_u8(sum,
+ load_2_pixels<srcDirection>(p, srcStride));
+ p += srcStrideX;
+ }
+
+ const SkPMColor* sptr = *src;
+ SkPMColor* dptr = *dst;
+ for (int x = 0; x < width; x++) {
+ // val = (sum * scale * 2 + 0x8000) >> 16
+ uint16x8_t resultPixels = vreinterpretq_u16_s16(vqrdmulhq_s16(
+ vreinterpretq_s16_u16(sum), vreinterpretq_s16_u16(scale)));
+ store_2_pixels<dstDirection>(resultPixels, dptr, width);
+
+ if (x >= leftOffset) {
+ sum = vsubw_u8(sum,
+ load_2_pixels<srcDirection>(sptr - leftOffset * srcStrideX, srcStride));
+ }
+ if (x + rightOffset + 1 < width) {
+ sum = vaddw_u8(sum,
+ load_2_pixels<srcDirection>(sptr + (rightOffset + 1) * srcStrideX, srcStride));
+ }
+ sptr += srcStrideX;
+ dptr += dstStrideX;
+ }
+ *src += srcStrideY * 2;
+ *dst += dstStrideY * 2;
+ }
+}
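For reference, a scalar sketch of the normalization vqrdmulhq_s16 performs above (helper name hypothetical): with scale = (1 << 15) / kernelSize, each 16-bit channel sum is reduced as (sum * scale * 2 + 0x8000) >> 16, which approximates sum / kernelSize for the sums a sub-128 kernel can accumulate.

    // Illustrative only: one lane of the rounded fixed-point divide.
    static inline uint16_t box_blur_normalize_sketch(uint16_t sum, int kernelSize) {
        int scale = (1 << 15) / kernelSize;
        return (uint16_t)(((int32_t)sum * scale * 2 + 0x8000) >> 16);
    }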
+
+
+/**
* Helper function to spread the components of a 32-bit integer into the
* lower 8 bits of each 16-bit element of a NEON register.
*/
@@ -42,7 +122,14 @@ void SkBoxBlur_NEON(const SkPMColor* src, int srcStride, SkPMColor* dst, int ker
const int dstStrideY = dstDirection == kX ? width : 1;
const uint32x4_t scale = vdupq_n_u32((1 << 24) / kernelSize);
const uint32x4_t half = vdupq_n_u32(1 << 23);
- for (int y = 0; y < height; ++y) {
+
+ if (kernelSize < 128)
+ {
+ SkDoubleRowBoxBlur_NEON<srcDirection, dstDirection>(&src, srcStride, &dst, kernelSize,
+ leftOffset, rightOffset, width, &height);
+ }
+
+ for (; height > 0; height--) {
uint32x4_t sum = vdupq_n_u32(0);
const SkPMColor* p = src;
for (int i = 0; i < rightBorder; ++i) {
@@ -77,8 +164,8 @@ void SkBoxBlur_NEON(const SkPMColor* src, int srcStride, SkPMColor* dst, int ker
sum = vaddw_u16(sum, expand(*r));
}
sptr += srcStrideX;
- if (srcDirection == kY) {
- SK_PREFETCH(sptr + (rightOffset + 1) * srcStrideX);
+ if (srcDirection == kX) {
+ SK_PREFETCH(sptr + (rightOffset + 16) * srcStrideX);
}
dptr += dstStrideX;
}
diff --git a/chromium/third_party/skia/src/opts/SkCachePreload_arm.h b/chromium/third_party/skia/src/opts/SkCachePreload_arm.h
deleted file mode 100644
index cff8c2a9b79..00000000000
--- a/chromium/third_party/skia/src/opts/SkCachePreload_arm.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright 2012 The Android Open Source Project
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-#ifndef SkCachePreload_arm_DEFINED
-#define SkCachePreload_arm_DEFINED
-
-// This file defines macros for preload instructions for ARM. These macros
-// are designed to be embedded inside GNU inline assembly.
-// For the use of these macros, __ARM_USE_PLD needs to be enabled. The cache
-// line size also needs to be known (and needs to be contained inside
-// __ARM_CACHE_LINE_SIZE).
-#if defined(__ARM_USE_PLD)
-
-#define PLD(x, n) "pld [%["#x"], #("#n")]\n\t"
-
-#if __ARM_CACHE_LINE_SIZE == 32
- #define PLD64(x, n) PLD(x, n) PLD(x, (n) + 32)
-#elif __ARM_CACHE_LINE_SIZE == 64
- #define PLD64(x, n) PLD(x, n)
-#else
- #error "unknown __ARM_CACHE_LINE_SIZE."
-#endif
-#else
- // PLD is disabled, all macros become empty.
- #define PLD(x, n)
- #define PLD64(x, n)
-#endif
-
-#define PLD128(x, n) PLD64(x, n) PLD64(x, (n) + 64)
-
-#endif // SkCachePreload_arm_DEFINED
diff --git a/chromium/third_party/skia/src/opts/SkColor_opts_SSE2.h b/chromium/third_party/skia/src/opts/SkColor_opts_SSE2.h
new file mode 100644
index 00000000000..7e61d526b3b
--- /dev/null
+++ b/chromium/third_party/skia/src/opts/SkColor_opts_SSE2.h
@@ -0,0 +1,186 @@
+/*
+ * Copyright 2014 The Android Open Source Project
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkColor_opts_SSE2_DEFINED
+#define SkColor_opts_SSE2_DEFINED
+
+#include <emmintrin.h>
+
+// Because there is no _mm_mul_epi32() in SSE2, we emulate it here.
+// Multiplies 4 32-bit integers from a by 4 32-bit integers from b.
+// The 4 multiplication results must fit within 32-bit
+// integers, otherwise they would overflow.
+static inline __m128i Multiply32_SSE2(const __m128i& a, const __m128i& b) {
+ // Calculate results of a0 * b0 and a2 * b2.
+ __m128i r1 = _mm_mul_epu32(a, b);
+ // Calculate results of a1 * b1 and a3 * b3.
+ __m128i r2 = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4));
+ // Shuffle results to [63..0] and interleave the results.
+ __m128i r = _mm_unpacklo_epi32(_mm_shuffle_epi32(r1, _MM_SHUFFLE(0,0,2,0)),
+ _mm_shuffle_epi32(r2, _MM_SHUFFLE(0,0,2,0)));
+ return r;
+}
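A small usage sketch, values illustrative only: _mm_mul_epu32 covers lanes 0 and 2, the byte-shifted call covers lanes 1 and 3, and the shuffle/unpack re-interleaves the low 32 bits of each product.

    // Illustrative only:
    //   __m128i a = _mm_setr_epi32(1, 2, 3, 4);
    //   __m128i b = _mm_setr_epi32(10, 20, 30, 40);
    //   __m128i p = Multiply32_SSE2(a, b);   // lanes: {10, 40, 90, 160}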
+
+static inline __m128i SkAlpha255To256_SSE2(const __m128i& alpha) {
+ return _mm_add_epi32(alpha, _mm_set1_epi32(1));
+}
+
+// See #define SkAlphaMulAlpha(a, b) SkMulDiv255Round(a, b) in SkXfermode.cpp.
+static inline __m128i SkAlphaMulAlpha_SSE2(const __m128i& a,
+ const __m128i& b) {
+ __m128i prod = _mm_mullo_epi16(a, b);
+ prod = _mm_add_epi32(prod, _mm_set1_epi32(128));
+ prod = _mm_add_epi32(prod, _mm_srli_epi32(prod, 8));
+ prod = _mm_srli_epi32(prod, 8);
+
+ return prod;
+}
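For reference, the scalar form of the rounding trick above (helper name hypothetical): adding 128 and then the high byte of the partial sum before shifting approximates round(a * b / 255); for example a = b = 255 gives (65025 + 128 + 254) >> 8 = 255.

    // Illustrative only: scalar equivalent of one 32-bit lane.
    static inline unsigned mul_div255_round_sketch(unsigned a, unsigned b) {
        unsigned prod = a * b + 128;
        return (prod + (prod >> 8)) >> 8;
    }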
+
+// The portable version of SkAlphaMulQ is in SkColorPriv.h.
+static inline __m128i SkAlphaMulQ_SSE2(const __m128i& c, const __m128i& scale) {
+ __m128i mask = _mm_set1_epi32(0xFF00FF);
+ __m128i s = _mm_or_si128(_mm_slli_epi32(scale, 16), scale);
+
+ // uint32_t rb = ((c & mask) * scale) >> 8
+ __m128i rb = _mm_and_si128(mask, c);
+ rb = _mm_mullo_epi16(rb, s);
+ rb = _mm_srli_epi16(rb, 8);
+
+ // uint32_t ag = ((c >> 8) & mask) * scale
+ __m128i ag = _mm_srli_epi16(c, 8);
+ ag = _mm_and_si128(ag, mask);
+ ag = _mm_mullo_epi16(ag, s);
+
+ // (rb & mask) | (ag & ~mask)
+ rb = _mm_and_si128(mask, rb);
+ ag = _mm_andnot_si128(mask, ag);
+ return _mm_or_si128(rb, ag);
+}
+
+static inline __m128i SkGetPackedA32_SSE2(const __m128i& src) {
+ __m128i a = _mm_slli_epi32(src, (24 - SK_A32_SHIFT));
+ return _mm_srli_epi32(a, 24);
+}
+
+static inline __m128i SkGetPackedR32_SSE2(const __m128i& src) {
+ __m128i r = _mm_slli_epi32(src, (24 - SK_R32_SHIFT));
+ return _mm_srli_epi32(r, 24);
+}
+
+static inline __m128i SkGetPackedG32_SSE2(const __m128i& src) {
+ __m128i g = _mm_slli_epi32(src, (24 - SK_G32_SHIFT));
+ return _mm_srli_epi32(g, 24);
+}
+
+static inline __m128i SkGetPackedB32_SSE2(const __m128i& src) {
+ __m128i b = _mm_slli_epi32(src, (24 - SK_B32_SHIFT));
+ return _mm_srli_epi32(b, 24);
+}
+
+static inline __m128i SkMul16ShiftRound_SSE2(const __m128i& a,
+ const __m128i& b, int shift) {
+ __m128i prod = _mm_mullo_epi16(a, b);
+ prod = _mm_add_epi16(prod, _mm_set1_epi16(1 << (shift - 1)));
+ prod = _mm_add_epi16(prod, _mm_srli_epi16(prod, shift));
+ prod = _mm_srli_epi16(prod, shift);
+
+ return prod;
+}
+
+static inline __m128i SkPackRGB16_SSE2(const __m128i& r,
+ const __m128i& g, const __m128i& b) {
+ __m128i dr = _mm_slli_epi16(r, SK_R16_SHIFT);
+ __m128i dg = _mm_slli_epi16(g, SK_G16_SHIFT);
+ __m128i db = _mm_slli_epi16(b, SK_B16_SHIFT);
+
+ __m128i c = _mm_or_si128(dr, dg);
+ return _mm_or_si128(c, db);
+}
+
+static inline __m128i SkPackARGB32_SSE2(const __m128i& a, const __m128i& r,
+ const __m128i& g, const __m128i& b) {
+ __m128i da = _mm_slli_epi32(a, SK_A32_SHIFT);
+ __m128i dr = _mm_slli_epi32(r, SK_R32_SHIFT);
+ __m128i dg = _mm_slli_epi32(g, SK_G32_SHIFT);
+ __m128i db = _mm_slli_epi32(b, SK_B32_SHIFT);
+
+ __m128i c = _mm_or_si128(da, dr);
+ c = _mm_or_si128(c, dg);
+ return _mm_or_si128(c, db);
+}
+
+static inline __m128i SkPacked16ToR32_SSE2(const __m128i& src) {
+ __m128i r = _mm_srli_epi32(src, SK_R16_SHIFT);
+ r = _mm_and_si128(r, _mm_set1_epi32(SK_R16_MASK));
+ r = _mm_or_si128(_mm_slli_epi32(r, (8 - SK_R16_BITS)),
+ _mm_srli_epi32(r, (2 * SK_R16_BITS - 8)));
+
+ return r;
+}
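A worked value (illustrative): with SK_R16_BITS == 5, a 5-bit component of 31 expands as (31 << 3) | (31 >> 2) = 248 | 7 = 255, so full 565 intensity maps to full 8888 intensity.

    // Illustrative only (SK_R16_BITS == 5):
    //   31 -> (31 << 3) | (31 >> 2) = 255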
+
+static inline __m128i SkPacked16ToG32_SSE2(const __m128i& src) {
+ __m128i g = _mm_srli_epi32(src, SK_G16_SHIFT);
+ g = _mm_and_si128(g, _mm_set1_epi32(SK_G16_MASK));
+ g = _mm_or_si128(_mm_slli_epi32(g, (8 - SK_G16_BITS)),
+ _mm_srli_epi32(g, (2 * SK_G16_BITS - 8)));
+
+ return g;
+}
+
+static inline __m128i SkPacked16ToB32_SSE2(const __m128i& src) {
+ __m128i b = _mm_srli_epi32(src, SK_B16_SHIFT);
+ b = _mm_and_si128(b, _mm_set1_epi32(SK_B16_MASK));
+ b = _mm_or_si128(_mm_slli_epi32(b, (8 - SK_B16_BITS)),
+ _mm_srli_epi32(b, (2 * SK_B16_BITS - 8)));
+
+ return b;
+}
+
+static inline __m128i SkPixel16ToPixel32_SSE2(const __m128i& src) {
+ __m128i r = SkPacked16ToR32_SSE2(src);
+ __m128i g = SkPacked16ToG32_SSE2(src);
+ __m128i b = SkPacked16ToB32_SSE2(src);
+
+ return SkPackARGB32_SSE2(_mm_set1_epi32(0xFF), r, g, b);
+}
+
+static inline __m128i SkPixel32ToPixel16_ToU16_SSE2(const __m128i& src_pixel1,
+ const __m128i& src_pixel2) {
+ // Calculate result r.
+ __m128i r1 = _mm_srli_epi32(src_pixel1,
+ SK_R32_SHIFT + (8 - SK_R16_BITS));
+ r1 = _mm_and_si128(r1, _mm_set1_epi32(SK_R16_MASK));
+ __m128i r2 = _mm_srli_epi32(src_pixel2,
+ SK_R32_SHIFT + (8 - SK_R16_BITS));
+ r2 = _mm_and_si128(r2, _mm_set1_epi32(SK_R16_MASK));
+ __m128i r = _mm_packs_epi32(r1, r2);
+
+ // Calculate result g.
+ __m128i g1 = _mm_srli_epi32(src_pixel1,
+ SK_G32_SHIFT + (8 - SK_G16_BITS));
+ g1 = _mm_and_si128(g1, _mm_set1_epi32(SK_G16_MASK));
+ __m128i g2 = _mm_srli_epi32(src_pixel2,
+ SK_G32_SHIFT + (8 - SK_G16_BITS));
+ g2 = _mm_and_si128(g2, _mm_set1_epi32(SK_G16_MASK));
+ __m128i g = _mm_packs_epi32(g1, g2);
+
+ // Calculate result b.
+ __m128i b1 = _mm_srli_epi32(src_pixel1,
+ SK_B32_SHIFT + (8 - SK_B16_BITS));
+ b1 = _mm_and_si128(b1, _mm_set1_epi32(SK_B16_MASK));
+ __m128i b2 = _mm_srli_epi32(src_pixel2,
+ SK_B32_SHIFT + (8 - SK_B16_BITS));
+ b2 = _mm_and_si128(b2, _mm_set1_epi32(SK_B16_MASK));
+ __m128i b = _mm_packs_epi32(b1, b2);
+
+ // Store 8 16-bit colors in dst.
+ __m128i d_pixel = SkPackRGB16_SSE2(r, g, b);
+
+ return d_pixel;
+}
+
+#endif // SkColor_opts_SSE2_DEFINED
diff --git a/chromium/third_party/skia/src/opts/SkMath_opts_SSE2.h b/chromium/third_party/skia/src/opts/SkMath_opts_SSE2.h
new file mode 100644
index 00000000000..2cc21afa0df
--- /dev/null
+++ b/chromium/third_party/skia/src/opts/SkMath_opts_SSE2.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright 2014 The Android Open Source Project
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkMath_opts_SSE2_DEFINED
+#define SkMath_opts_SSE2_DEFINED
+
+#include <emmintrin.h>
+
+// Because there is no _mm_div_epi32() in SSE2, we emulate it with float division.
+// When using this function, make sure a and b don't exceed float's precision.
+static inline __m128i shim_mm_div_epi32(const __m128i& a, const __m128i& b) {
+ __m128 x = _mm_cvtepi32_ps(a);
+ __m128 y = _mm_cvtepi32_ps(b);
+ return _mm_cvttps_epi32(_mm_div_ps(x, y));
+}
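A brief usage sketch (values illustrative): the division is exact for the small operands typical here, e.g. {510, 255, 100, 9} / {2, 5, 10, 3} returns {255, 51, 10, 3}; operands beyond float's 24-bit mantissa may round, as the comment above warns.

    // Illustrative only:
    //   __m128i q = shim_mm_div_epi32(_mm_setr_epi32(510, 255, 100, 9),
    //                                 _mm_setr_epi32(2, 5, 10, 3));
    //   // q = {255, 51, 10, 3}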
+
+// Portable version of SkSqrtBits is in SkMath.cpp.
+static inline __m128i SkSqrtBits_SSE2(const __m128i& x, int count) {
+ __m128i root = _mm_setzero_si128();
+ __m128i remHi = _mm_setzero_si128();
+ __m128i remLo = x;
+ __m128i one128 = _mm_set1_epi32(1);
+
+ do {
+ root = _mm_slli_epi32(root, 1);
+
+ remHi = _mm_or_si128(_mm_slli_epi32(remHi, 2),
+ _mm_srli_epi32(remLo, 30));
+ remLo = _mm_slli_epi32(remLo, 2);
+
+ __m128i testDiv = _mm_slli_epi32(root, 1);
+ testDiv = _mm_add_epi32(testDiv, _mm_set1_epi32(1));
+
+ __m128i cmp = _mm_cmplt_epi32(remHi, testDiv);
+ __m128i remHi1 = _mm_and_si128(cmp, remHi);
+ __m128i root1 = _mm_and_si128(cmp, root);
+ __m128i remHi2 = _mm_andnot_si128(cmp, _mm_sub_epi32(remHi, testDiv));
+ __m128i root2 = _mm_andnot_si128(cmp, _mm_add_epi32(root, one128));
+
+ remHi = _mm_or_si128(remHi1, remHi2);
+ root = _mm_or_si128(root1, root2);
+ } while (--count >= 0);
+
+ return root;
+}
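For reference, a scalar sketch of the restoring square root that the cmplt/and/andnot selects above implement without branches (helper name hypothetical; assumes a non-negative input). With count == 15 it consumes all 32 bits of x two at a time and returns floor(sqrt(x)).

    // Illustrative only: branchy equivalent of one SSE2 lane.
    static inline uint32_t sqrt_bits_sketch(uint32_t x, int count) {
        uint32_t root = 0, remHi = 0, remLo = x;
        do {
            root <<= 1;
            remHi = (remHi << 2) | (remLo >> 30);
            remLo <<= 2;
            uint32_t testDiv = (root << 1) + 1;
            if (remHi >= testDiv) {   // the vector code takes this path via a mask
                remHi -= testDiv;
                root++;
            }
        } while (--count >= 0);
        return root;
    }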
+
+#endif // SkMath_opts_SSE2_DEFINED
diff --git a/chromium/third_party/skia/src/opts/SkMorphology_opts.h b/chromium/third_party/skia/src/opts/SkMorphology_opts.h
index e3ad853cf64..7ea7c546231 100644
--- a/chromium/third_party/skia/src/opts/SkMorphology_opts.h
+++ b/chromium/third_party/skia/src/opts/SkMorphology_opts.h
@@ -5,17 +5,10 @@
* found in the LICENSE file.
*/
-#include <SkColor.h>
+#ifndef SkMorphology_opts_DEFINED
+#define SkMorphology_opts_DEFINED
-/**
- * All morphology procs have the same signature: src is the source buffer, dst the
- * destination buffer, radius is the morphology radius, width and height are the bounds
- * of the destination buffer (in pixels), and srcStride and dstStride are the
- * number of pixels per row in each buffer. All buffers are 8888.
- */
-
-typedef void (*SkMorphologyProc)(const SkPMColor* src, SkPMColor* dst, int radius,
- int width, int height, int srcStride, int dstStride);
+#include <SkMorphologyImageFilter.h>
enum SkMorphologyProcType {
kDilateX_SkMorphologyProcType,
@@ -24,4 +17,6 @@ enum SkMorphologyProcType {
kErodeY_SkMorphologyProcType
};
-SkMorphologyProc SkMorphologyGetPlatformProc(SkMorphologyProcType type);
+SkMorphologyImageFilter::Proc SkMorphologyGetPlatformProc(SkMorphologyProcType type);
+
+#endif
diff --git a/chromium/third_party/skia/src/opts/SkMorphology_opts_SSE2.cpp b/chromium/third_party/skia/src/opts/SkMorphology_opts_SSE2.cpp
index b58fced2c12..e782950956a 100644
--- a/chromium/third_party/skia/src/opts/SkMorphology_opts_SSE2.cpp
+++ b/chromium/third_party/skia/src/opts/SkMorphology_opts_SSE2.cpp
@@ -5,12 +5,10 @@
* found in the LICENSE file.
*/
-
+#include <emmintrin.h>
#include "SkColorPriv.h"
#include "SkMorphology_opts_SSE2.h"
-#include <emmintrin.h>
-
/* SSE2 version of dilateX, dilateY, erodeX, erodeY.
* portable versions are in src/effects/SkMorphologyImageFilter.cpp.
*/
@@ -48,8 +46,12 @@ static void SkMorph_SSE2(const SkPMColor* src, SkPMColor* dst, int radius,
lp += srcStrideY;
up += srcStrideY;
}
- if (x >= radius) src += srcStrideX;
- if (x + radius < width - 1) upperSrc += srcStrideX;
+ if (x >= radius) {
+ src += srcStrideX;
+ }
+ if (x + radius < width - 1) {
+ upperSrc += srcStrideX;
+ }
dst += dstStrideX;
}
}
diff --git a/chromium/third_party/skia/src/opts/SkMorphology_opts_SSE2.h b/chromium/third_party/skia/src/opts/SkMorphology_opts_SSE2.h
index bd103e6eba9..bf5aa03b092 100644
--- a/chromium/third_party/skia/src/opts/SkMorphology_opts_SSE2.h
+++ b/chromium/third_party/skia/src/opts/SkMorphology_opts_SSE2.h
@@ -5,6 +5,11 @@
* found in the LICENSE file.
*/
+#ifndef SkMorphology_opts_SSE2_DEFINED
+#define SkMorphology_opts_SSE2_DEFINED
+
+#include "SkColor.h"
+
void SkDilateX_SSE2(const SkPMColor* src, SkPMColor* dst, int radius,
int width, int height, int srcStride, int dstStride);
void SkDilateY_SSE2(const SkPMColor* src, SkPMColor* dst, int radius,
@@ -13,3 +18,5 @@ void SkErodeX_SSE2(const SkPMColor* src, SkPMColor* dst, int radius,
int width, int height, int srcStride, int dstStride);
void SkErodeY_SSE2(const SkPMColor* src, SkPMColor* dst, int radius,
int width, int height, int srcStride, int dstStride);
+
+#endif
diff --git a/chromium/third_party/skia/src/opts/SkMorphology_opts_arm.cpp b/chromium/third_party/skia/src/opts/SkMorphology_opts_arm.cpp
new file mode 100644
index 00000000000..2bba4929c22
--- /dev/null
+++ b/chromium/third_party/skia/src/opts/SkMorphology_opts_arm.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2014 ARM Ltd.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "SkMorphology_opts.h"
+#include "SkMorphology_opts_neon.h"
+#include "SkUtilsArm.h"
+
+SkMorphologyImageFilter::Proc SkMorphologyGetPlatformProc(SkMorphologyProcType type) {
+#if SK_ARM_NEON_IS_NONE
+ return NULL;
+#else
+#if SK_ARM_NEON_IS_DYNAMIC
+ if (!sk_cpu_arm_has_neon()) {
+ return NULL;
+ }
+#endif
+ switch (type) {
+ case kDilateX_SkMorphologyProcType:
+ return SkDilateX_neon;
+ case kDilateY_SkMorphologyProcType:
+ return SkDilateY_neon;
+ case kErodeX_SkMorphologyProcType:
+ return SkErodeX_neon;
+ case kErodeY_SkMorphologyProcType:
+ return SkErodeY_neon;
+ default:
+ return NULL;
+ }
+#endif
+}
diff --git a/chromium/third_party/skia/src/opts/SkMorphology_opts_none.cpp b/chromium/third_party/skia/src/opts/SkMorphology_opts_none.cpp
index 66d58ba571f..ade261fc7d2 100644
--- a/chromium/third_party/skia/src/opts/SkMorphology_opts_none.cpp
+++ b/chromium/third_party/skia/src/opts/SkMorphology_opts_none.cpp
@@ -7,6 +7,6 @@
#include "SkMorphology_opts.h"
-SkMorphologyProc SkMorphologyGetPlatformProc(SkMorphologyProcType) {
+SkMorphologyImageFilter::Proc SkMorphologyGetPlatformProc(SkMorphologyProcType) {
return NULL;
}
diff --git a/chromium/third_party/skia/src/opts/SkUtils_opts_SSE2.cpp b/chromium/third_party/skia/src/opts/SkUtils_opts_SSE2.cpp
index e22044d39d3..bd2f9b29a44 100644
--- a/chromium/third_party/skia/src/opts/SkUtils_opts_SSE2.cpp
+++ b/chromium/third_party/skia/src/opts/SkUtils_opts_SSE2.cpp
@@ -1,4 +1,3 @@
-
/*
* Copyright 2009 The Android Open Source Project
*
@@ -6,7 +5,6 @@
* found in the LICENSE file.
*/
-
#include <emmintrin.h>
#include "SkUtils_opts_SSE2.h"
@@ -69,3 +67,33 @@ void sk_memset32_SSE2(uint32_t *dst, uint32_t value, int count)
--count;
}
}
+
+void sk_memcpy32_SSE2(uint32_t *dst, const uint32_t *src, int count)
+{
+ if (count >= 16) {
+ while (((size_t)dst) & 0x0F) {
+ *dst++ = *src++;
+ --count;
+ }
+ __m128i *dst128 = reinterpret_cast<__m128i*>(dst);
+ const __m128i *src128 = reinterpret_cast<const __m128i*>(src);
+ while (count >= 16) {
+ __m128i a = _mm_loadu_si128(src128++);
+ __m128i b = _mm_loadu_si128(src128++);
+ __m128i c = _mm_loadu_si128(src128++);
+ __m128i d = _mm_loadu_si128(src128++);
+
+ _mm_store_si128(dst128++, a);
+ _mm_store_si128(dst128++, b);
+ _mm_store_si128(dst128++, c);
+ _mm_store_si128(dst128++, d);
+ count -= 16;
+ }
+ dst = reinterpret_cast<uint32_t*>(dst128);
+ src = reinterpret_cast<const uint32_t*>(src128);
+ }
+ while (count > 0) {
+ *dst++ = *src++;
+ --count;
+ }
+}
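A brief usage sketch (buffers illustrative): the head loop aligns dst to 16 bytes so the four _mm_store_si128 stores are aligned, the _mm_loadu_si128 loads tolerate any src alignment, and the tail loop copies the remaining pixels.

    // Illustrative only:
    //   uint32_t srcRow[64] = {0}, dstRow[64];
    //   sk_memcpy32_SSE2(dstRow, srcRow, 64);   // copies 64 32-bit pixels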
diff --git a/chromium/third_party/skia/src/opts/SkUtils_opts_SSE2.h b/chromium/third_party/skia/src/opts/SkUtils_opts_SSE2.h
index ed24c1ffa40..009f01894b4 100644
--- a/chromium/third_party/skia/src/opts/SkUtils_opts_SSE2.h
+++ b/chromium/third_party/skia/src/opts/SkUtils_opts_SSE2.h
@@ -1,4 +1,3 @@
-
/*
* Copyright 2009 The Android Open Source Project
*
@@ -6,8 +5,13 @@
* found in the LICENSE file.
*/
+#ifndef SkUtils_opts_SSE2_DEFINED
+#define SkUtils_opts_SSE2_DEFINED
#include "SkTypes.h"
void sk_memset16_SSE2(uint16_t *dst, uint16_t value, int count);
void sk_memset32_SSE2(uint32_t *dst, uint32_t value, int count);
+void sk_memcpy32_SSE2(uint32_t *dst, const uint32_t *src, int count);
+
+#endif
diff --git a/chromium/third_party/skia/src/opts/SkUtils_opts_arm.cpp b/chromium/third_party/skia/src/opts/SkUtils_opts_arm.cpp
new file mode 100644
index 00000000000..b1c9d0aa93e
--- /dev/null
+++ b/chromium/third_party/skia/src/opts/SkUtils_opts_arm.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2014 ARM Ltd.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "SkUtils.h"
+#include "SkUtilsArm.h"
+
+#if defined(SK_CPU_LENDIAN) && !SK_ARM_NEON_IS_NONE
+extern "C" void memset16_neon(uint16_t dst[], uint16_t value, int count);
+extern "C" void memset32_neon(uint32_t dst[], uint32_t value, int count);
+#endif
+
+#if defined(SK_CPU_LENDIAN)
+extern "C" void arm_memset16(uint16_t* dst, uint16_t value, int count);
+extern "C" void arm_memset32(uint32_t* dst, uint32_t value, int count);
+#endif
+
+SkMemset16Proc SkMemset16GetPlatformProc() {
+ // FIXME: memset.arm.S is using syntax incompatible with XCode
+#if !defined(SK_CPU_LENDIAN) || defined(SK_BUILD_FOR_IOS)
+ return NULL;
+#elif SK_ARM_NEON_IS_DYNAMIC
+ if (sk_cpu_arm_has_neon()) {
+ return memset16_neon;
+ } else {
+ return arm_memset16;
+ }
+#elif SK_ARM_NEON_IS_ALWAYS
+ return memset16_neon;
+#else
+ return arm_memset16;
+#endif
+}
+
+SkMemset32Proc SkMemset32GetPlatformProc() {
+ // FIXME: memset.arm.S is using syntax incompatible with XCode
+#if !defined(SK_CPU_LENDIAN) || defined(SK_BUILD_FOR_IOS)
+ return NULL;
+#elif SK_ARM_NEON_IS_DYNAMIC
+ if (sk_cpu_arm_has_neon()) {
+ return memset32_neon;
+ } else {
+ return arm_memset32;
+ }
+#elif SK_ARM_NEON_IS_ALWAYS
+ return memset32_neon;
+#else
+ return arm_memset32;
+#endif
+}
+
+SkMemcpy32Proc SkMemcpy32GetPlatformProc() {
+ return NULL;
+}
diff --git a/chromium/third_party/skia/src/opts/SkUtils_opts_none.cpp b/chromium/third_party/skia/src/opts/SkUtils_opts_none.cpp
index 286f10d7e53..18f52496db4 100644
--- a/chromium/third_party/skia/src/opts/SkUtils_opts_none.cpp
+++ b/chromium/third_party/skia/src/opts/SkUtils_opts_none.cpp
@@ -16,3 +16,7 @@ SkMemset16Proc SkMemset16GetPlatformProc() {
SkMemset32Proc SkMemset32GetPlatformProc() {
return NULL;
}
+
+SkMemcpy32Proc SkMemcpy32GetPlatformProc() {
+ return NULL;
+}
diff --git a/chromium/third_party/skia/src/opts/SkXfermode_opts_SSE2.cpp b/chromium/third_party/skia/src/opts/SkXfermode_opts_SSE2.cpp
new file mode 100644
index 00000000000..94f9a4aea3b
--- /dev/null
+++ b/chromium/third_party/skia/src/opts/SkXfermode_opts_SSE2.cpp
@@ -0,0 +1,819 @@
+/*
+ * Copyright 2014 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "SkColorPriv.h"
+#include "SkColor_opts_SSE2.h"
+#include "SkMathPriv.h"
+#include "SkMath_opts_SSE2.h"
+#include "SkXfermode.h"
+#include "SkXfermode_opts_SSE2.h"
+#include "SkXfermode_proccoeff.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// 4 pixels SSE2 version functions
+////////////////////////////////////////////////////////////////////////////////
+
+static inline __m128i SkDiv255Round_SSE2(const __m128i& a) {
+ __m128i prod = _mm_add_epi32(a, _mm_set1_epi32(128)); // prod += 128;
+ prod = _mm_add_epi32(prod, _mm_srli_epi32(prod, 8)); // prod + (prod >> 8)
+ prod = _mm_srli_epi32(prod, 8); // >> 8
+
+ return prod;
+}
+
+static inline __m128i saturated_add_SSE2(const __m128i& a, const __m128i& b) {
+ __m128i sum = _mm_add_epi32(a, b);
+ __m128i cmp = _mm_cmpgt_epi32(sum, _mm_set1_epi32(255));
+
+ sum = _mm_or_si128(_mm_and_si128(cmp, _mm_set1_epi32(255)),
+ _mm_andnot_si128(cmp, sum));
+ return sum;
+}
+
+static inline __m128i clamp_signed_byte_SSE2(const __m128i& n) {
+ __m128i cmp1 = _mm_cmplt_epi32(n, _mm_setzero_si128());
+ __m128i cmp2 = _mm_cmpgt_epi32(n, _mm_set1_epi32(255));
+ __m128i ret = _mm_and_si128(cmp2, _mm_set1_epi32(255));
+
+ __m128i cmp = _mm_or_si128(cmp1, cmp2);
+ ret = _mm_or_si128(_mm_and_si128(cmp, ret), _mm_andnot_si128(cmp, n));
+
+ return ret;
+}
+
+static inline __m128i clamp_div255round_SSE2(const __m128i& prod) {
+ // test if > 0
+ __m128i cmp1 = _mm_cmpgt_epi32(prod, _mm_setzero_si128());
+ // test if < 255*255
+ __m128i cmp2 = _mm_cmplt_epi32(prod, _mm_set1_epi32(255*255));
+
+ __m128i ret = _mm_setzero_si128();
+
+ // if value >= 255*255, value = 255
+ ret = _mm_andnot_si128(cmp2, _mm_set1_epi32(255));
+
+ __m128i div = SkDiv255Round_SSE2(prod);
+
+ // test if > 0 && < 255*255
+ __m128i cmp = _mm_and_si128(cmp1, cmp2);
+
+ ret = _mm_or_si128(_mm_and_si128(cmp, div), _mm_andnot_si128(cmp, ret));
+
+ return ret;
+}
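For reference, a scalar reading of the branch-free selects above (illustrative only):

    //   if (prod <= 0)           return 0;
    //   if (prod >= 255 * 255)   return 255;
    //   return SkDiv255Round(prod);   // (prod + 128, plus its high byte) >> 8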
+
+static __m128i srcover_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
+ __m128i isa = _mm_sub_epi32(_mm_set1_epi32(256), SkGetPackedA32_SSE2(src));
+ return _mm_add_epi32(src, SkAlphaMulQ_SSE2(dst, isa));
+}
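Per channel this is the usual Porter-Duff src-over, sketched here in scalar form (illustrative only): d' = s + ((d * (256 - sa)) >> 8), which SkAlphaMulQ_SSE2 applies to all four channels of four pixels at once.

    // Illustrative only, one channel:
    //   unsigned isa = 256 - sa;               // sa = source alpha, 0..255
    //   unsigned out = s + ((d * isa) >> 8);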
+
+static __m128i dstover_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
+ __m128i ida = _mm_sub_epi32(_mm_set1_epi32(256), SkGetPackedA32_SSE2(dst));
+ return _mm_add_epi32(dst, SkAlphaMulQ_SSE2(src, ida));
+}
+
+static __m128i srcin_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
+ __m128i da = SkGetPackedA32_SSE2(dst);
+ return SkAlphaMulQ_SSE2(src, SkAlpha255To256_SSE2(da));
+}
+
+static __m128i dstin_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
+ __m128i sa = SkGetPackedA32_SSE2(src);
+ return SkAlphaMulQ_SSE2(dst, SkAlpha255To256_SSE2(sa));
+}
+
+static __m128i srcout_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
+ __m128i ida = _mm_sub_epi32(_mm_set1_epi32(256), SkGetPackedA32_SSE2(dst));
+ return SkAlphaMulQ_SSE2(src, ida);
+}
+
+static __m128i dstout_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
+ __m128i isa = _mm_sub_epi32(_mm_set1_epi32(256), SkGetPackedA32_SSE2(src));
+ return SkAlphaMulQ_SSE2(dst, isa);
+}
+
+static __m128i srcatop_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
+ __m128i sa = SkGetPackedA32_SSE2(src);
+ __m128i da = SkGetPackedA32_SSE2(dst);
+ __m128i isa = _mm_sub_epi32(_mm_set1_epi32(255), sa);
+
+ __m128i a = da;
+
+ __m128i r1 = SkAlphaMulAlpha_SSE2(da, SkGetPackedR32_SSE2(src));
+ __m128i r2 = SkAlphaMulAlpha_SSE2(isa, SkGetPackedR32_SSE2(dst));
+ __m128i r = _mm_add_epi32(r1, r2);
+
+ __m128i g1 = SkAlphaMulAlpha_SSE2(da, SkGetPackedG32_SSE2(src));
+ __m128i g2 = SkAlphaMulAlpha_SSE2(isa, SkGetPackedG32_SSE2(dst));
+ __m128i g = _mm_add_epi32(g1, g2);
+
+ __m128i b1 = SkAlphaMulAlpha_SSE2(da, SkGetPackedB32_SSE2(src));
+ __m128i b2 = SkAlphaMulAlpha_SSE2(isa, SkGetPackedB32_SSE2(dst));
+ __m128i b = _mm_add_epi32(b1, b2);
+
+ return SkPackARGB32_SSE2(a, r, g, b);
+}
+
+static __m128i dstatop_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
+ __m128i sa = SkGetPackedA32_SSE2(src);
+ __m128i da = SkGetPackedA32_SSE2(dst);
+ __m128i ida = _mm_sub_epi32(_mm_set1_epi32(255), da);
+
+ __m128i a = sa;
+
+ __m128i r1 = SkAlphaMulAlpha_SSE2(ida, SkGetPackedR32_SSE2(src));
+ __m128i r2 = SkAlphaMulAlpha_SSE2(sa, SkGetPackedR32_SSE2(dst));
+ __m128i r = _mm_add_epi32(r1, r2);
+
+ __m128i g1 = SkAlphaMulAlpha_SSE2(ida, SkGetPackedG32_SSE2(src));
+ __m128i g2 = SkAlphaMulAlpha_SSE2(sa, SkGetPackedG32_SSE2(dst));
+ __m128i g = _mm_add_epi32(g1, g2);
+
+ __m128i b1 = SkAlphaMulAlpha_SSE2(ida, SkGetPackedB32_SSE2(src));
+ __m128i b2 = SkAlphaMulAlpha_SSE2(sa, SkGetPackedB32_SSE2(dst));
+ __m128i b = _mm_add_epi32(b1, b2);
+
+ return SkPackARGB32_SSE2(a, r, g, b);
+}
+
+static __m128i xor_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
+ __m128i sa = SkGetPackedA32_SSE2(src);
+ __m128i da = SkGetPackedA32_SSE2(dst);
+ __m128i isa = _mm_sub_epi32(_mm_set1_epi32(255), sa);
+ __m128i ida = _mm_sub_epi32(_mm_set1_epi32(255), da);
+
+ __m128i a1 = _mm_add_epi32(sa, da);
+ __m128i a2 = SkAlphaMulAlpha_SSE2(sa, da);
+ a2 = _mm_slli_epi32(a2, 1);
+ __m128i a = _mm_sub_epi32(a1, a2);
+
+ __m128i r1 = SkAlphaMulAlpha_SSE2(ida, SkGetPackedR32_SSE2(src));
+ __m128i r2 = SkAlphaMulAlpha_SSE2(isa, SkGetPackedR32_SSE2(dst));
+ __m128i r = _mm_add_epi32(r1, r2);
+
+ __m128i g1 = SkAlphaMulAlpha_SSE2(ida, SkGetPackedG32_SSE2(src));
+ __m128i g2 = SkAlphaMulAlpha_SSE2(isa, SkGetPackedG32_SSE2(dst));
+ __m128i g = _mm_add_epi32(g1, g2);
+
+ __m128i b1 = SkAlphaMulAlpha_SSE2(ida, SkGetPackedB32_SSE2(src));
+ __m128i b2 = SkAlphaMulAlpha_SSE2(isa, SkGetPackedB32_SSE2(dst));
+ __m128i b = _mm_add_epi32(b1, b2);
+
+ return SkPackARGB32_SSE2(a, r, g, b);
+}
+
+static __m128i plus_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
+ __m128i b = saturated_add_SSE2(SkGetPackedB32_SSE2(src),
+ SkGetPackedB32_SSE2(dst));
+ __m128i g = saturated_add_SSE2(SkGetPackedG32_SSE2(src),
+ SkGetPackedG32_SSE2(dst));
+ __m128i r = saturated_add_SSE2(SkGetPackedR32_SSE2(src),
+ SkGetPackedR32_SSE2(dst));
+ __m128i a = saturated_add_SSE2(SkGetPackedA32_SSE2(src),
+ SkGetPackedA32_SSE2(dst));
+ return SkPackARGB32_SSE2(a, r, g, b);
+}
+
+static __m128i modulate_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
+ __m128i a = SkAlphaMulAlpha_SSE2(SkGetPackedA32_SSE2(src),
+ SkGetPackedA32_SSE2(dst));
+ __m128i r = SkAlphaMulAlpha_SSE2(SkGetPackedR32_SSE2(src),
+ SkGetPackedR32_SSE2(dst));
+ __m128i g = SkAlphaMulAlpha_SSE2(SkGetPackedG32_SSE2(src),
+ SkGetPackedG32_SSE2(dst));
+ __m128i b = SkAlphaMulAlpha_SSE2(SkGetPackedB32_SSE2(src),
+ SkGetPackedB32_SSE2(dst));
+ return SkPackARGB32_SSE2(a, r, g, b);
+}
+
+static inline __m128i SkMin32_SSE2(const __m128i& a, const __m128i& b) {
+ __m128i cmp = _mm_cmplt_epi32(a, b);
+ return _mm_or_si128(_mm_and_si128(cmp, a), _mm_andnot_si128(cmp, b));
+}
+
+static inline __m128i srcover_byte_SSE2(const __m128i& a, const __m128i& b) {
+ // a + b - SkAlphaMulAlpha(a, b);
+ return _mm_sub_epi32(_mm_add_epi32(a, b), SkAlphaMulAlpha_SSE2(a, b));
+}
+
+static inline __m128i blendfunc_multiply_byte_SSE2(const __m128i& sc, const __m128i& dc,
+ const __m128i& sa, const __m128i& da) {
+ // sc * (255 - da)
+ __m128i ret1 = _mm_sub_epi32(_mm_set1_epi32(255), da);
+ ret1 = _mm_mullo_epi16(sc, ret1);
+
+ // dc * (255 - sa)
+ __m128i ret2 = _mm_sub_epi32(_mm_set1_epi32(255), sa);
+ ret2 = _mm_mullo_epi16(dc, ret2);
+
+ // sc * dc
+ __m128i ret3 = _mm_mullo_epi16(sc, dc);
+
+ __m128i ret = _mm_add_epi32(ret1, ret2);
+ ret = _mm_add_epi32(ret, ret3);
+
+ return clamp_div255round_SSE2(ret);
+}
+
+static __m128i multiply_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
+ __m128i sa = SkGetPackedA32_SSE2(src);
+ __m128i da = SkGetPackedA32_SSE2(dst);
+ __m128i a = srcover_byte_SSE2(sa, da);
+
+ __m128i sr = SkGetPackedR32_SSE2(src);
+ __m128i dr = SkGetPackedR32_SSE2(dst);
+ __m128i r = blendfunc_multiply_byte_SSE2(sr, dr, sa, da);
+
+ __m128i sg = SkGetPackedG32_SSE2(src);
+ __m128i dg = SkGetPackedG32_SSE2(dst);
+ __m128i g = blendfunc_multiply_byte_SSE2(sg, dg, sa, da);
+
+ __m128i sb = SkGetPackedB32_SSE2(src);
+ __m128i db = SkGetPackedB32_SSE2(dst);
+ __m128i b = blendfunc_multiply_byte_SSE2(sb, db, sa, da);
+
+ return SkPackARGB32_SSE2(a, r, g, b);
+}
+
+static __m128i screen_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
+ __m128i a = srcover_byte_SSE2(SkGetPackedA32_SSE2(src),
+ SkGetPackedA32_SSE2(dst));
+ __m128i r = srcover_byte_SSE2(SkGetPackedR32_SSE2(src),
+ SkGetPackedR32_SSE2(dst));
+ __m128i g = srcover_byte_SSE2(SkGetPackedG32_SSE2(src),
+ SkGetPackedG32_SSE2(dst));
+ __m128i b = srcover_byte_SSE2(SkGetPackedB32_SSE2(src),
+ SkGetPackedB32_SSE2(dst));
+ return SkPackARGB32_SSE2(a, r, g, b);
+}
+
+// Portable version overlay_byte() is in SkXfermode.cpp.
+static inline __m128i overlay_byte_SSE2(const __m128i& sc, const __m128i& dc,
+ const __m128i& sa, const __m128i& da) {
+ __m128i ida = _mm_sub_epi32(_mm_set1_epi32(255), da);
+ __m128i tmp1 = _mm_mullo_epi16(sc, ida);
+ __m128i isa = _mm_sub_epi32(_mm_set1_epi32(255), sa);
+ __m128i tmp2 = _mm_mullo_epi16(dc, isa);
+ __m128i tmp = _mm_add_epi32(tmp1, tmp2);
+
+ __m128i cmp = _mm_cmpgt_epi32(_mm_slli_epi32(dc, 1), da);
+ __m128i rc1 = _mm_slli_epi32(sc, 1); // 2 * sc
+ rc1 = Multiply32_SSE2(rc1, dc); // *dc
+
+ __m128i rc2 = _mm_mullo_epi16(sa, da); // sa * da
+ __m128i tmp3 = _mm_slli_epi32(_mm_sub_epi32(da, dc), 1); // 2 * (da - dc)
+ tmp3 = Multiply32_SSE2(tmp3, _mm_sub_epi32(sa, sc)); // * (sa - sc)
+ rc2 = _mm_sub_epi32(rc2, tmp3);
+
+ __m128i rc = _mm_or_si128(_mm_andnot_si128(cmp, rc1),
+ _mm_and_si128(cmp, rc2));
+ return clamp_div255round_SSE2(_mm_add_epi32(rc, tmp));
+}
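For readers following the branchless select above, this is the scalar computation it vectorizes; it mirrors the portable overlay_byte() referenced in the comment, with clamp_div255round() assumed to behave like its counterpart in SkXfermode.cpp (a sketch, not part of the change):

// rc1 is chosen when 2 * dc <= da, rc2 otherwise; the (255 - alpha) cross terms
// are folded in before the final clamp and divide-by-255.
static inline int overlay_byte_sketch(int sc, int dc, int sa, int da) {
    int tmp = sc * (255 - da) + dc * (255 - sa);
    int rc;
    if (2 * dc <= da) {
        rc = 2 * sc * dc;
    } else {
        rc = sa * da - 2 * (da - dc) * (sa - sc);
    }
    return clamp_div255round(rc + tmp);   // assumed helper from the portable code
}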
+
+static __m128i overlay_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
+ __m128i sa = SkGetPackedA32_SSE2(src);
+ __m128i da = SkGetPackedA32_SSE2(dst);
+
+ __m128i a = srcover_byte_SSE2(sa, da);
+ __m128i r = overlay_byte_SSE2(SkGetPackedR32_SSE2(src),
+ SkGetPackedR32_SSE2(dst), sa, da);
+ __m128i g = overlay_byte_SSE2(SkGetPackedG32_SSE2(src),
+ SkGetPackedG32_SSE2(dst), sa, da);
+ __m128i b = overlay_byte_SSE2(SkGetPackedB32_SSE2(src),
+ SkGetPackedB32_SSE2(dst), sa, da);
+ return SkPackARGB32_SSE2(a, r, g, b);
+}
+
+static inline __m128i darken_byte_SSE2(const __m128i& sc, const __m128i& dc,
+ const __m128i& sa, const __m128i& da) {
+ __m128i sd = _mm_mullo_epi16(sc, da);
+ __m128i ds = _mm_mullo_epi16(dc, sa);
+
+ __m128i cmp = _mm_cmplt_epi32(sd, ds);
+
+ __m128i tmp = _mm_add_epi32(sc, dc);
+ __m128i ret1 = _mm_sub_epi32(tmp, SkDiv255Round_SSE2(ds));
+ __m128i ret2 = _mm_sub_epi32(tmp, SkDiv255Round_SSE2(sd));
+ __m128i ret = _mm_or_si128(_mm_and_si128(cmp, ret1),
+ _mm_andnot_si128(cmp, ret2));
+ return ret;
+}
+
+static __m128i darken_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
+ __m128i sa = SkGetPackedA32_SSE2(src);
+ __m128i da = SkGetPackedA32_SSE2(dst);
+
+ __m128i a = srcover_byte_SSE2(sa, da);
+ __m128i r = darken_byte_SSE2(SkGetPackedR32_SSE2(src),
+ SkGetPackedR32_SSE2(dst), sa, da);
+ __m128i g = darken_byte_SSE2(SkGetPackedG32_SSE2(src),
+ SkGetPackedG32_SSE2(dst), sa, da);
+ __m128i b = darken_byte_SSE2(SkGetPackedB32_SSE2(src),
+ SkGetPackedB32_SSE2(dst), sa, da);
+ return SkPackARGB32_SSE2(a, r, g, b);
+}
+
+static inline __m128i lighten_byte_SSE2(const __m128i& sc, const __m128i& dc,
+ const __m128i& sa, const __m128i& da) {
+ __m128i sd = _mm_mullo_epi16(sc, da);
+ __m128i ds = _mm_mullo_epi16(dc, sa);
+
+ __m128i cmp = _mm_cmpgt_epi32(sd, ds);
+
+ __m128i tmp = _mm_add_epi32(sc, dc);
+ __m128i ret1 = _mm_sub_epi32(tmp, SkDiv255Round_SSE2(ds));
+ __m128i ret2 = _mm_sub_epi32(tmp, SkDiv255Round_SSE2(sd));
+ __m128i ret = _mm_or_si128(_mm_and_si128(cmp, ret1),
+ _mm_andnot_si128(cmp, ret2));
+ return ret;
+}
+
+static __m128i lighten_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
+ __m128i sa = SkGetPackedA32_SSE2(src);
+ __m128i da = SkGetPackedA32_SSE2(dst);
+
+ __m128i a = srcover_byte_SSE2(sa, da);
+ __m128i r = lighten_byte_SSE2(SkGetPackedR32_SSE2(src),
+ SkGetPackedR32_SSE2(dst), sa, da);
+ __m128i g = lighten_byte_SSE2(SkGetPackedG32_SSE2(src),
+ SkGetPackedG32_SSE2(dst), sa, da);
+ __m128i b = lighten_byte_SSE2(SkGetPackedB32_SSE2(src),
+ SkGetPackedB32_SSE2(dst), sa, da);
+ return SkPackARGB32_SSE2(a, r, g, b);
+}
+
+static inline __m128i colordodge_byte_SSE2(const __m128i& sc, const __m128i& dc,
+ const __m128i& sa, const __m128i& da) {
+ __m128i diff = _mm_sub_epi32(sa, sc);
+ __m128i ida = _mm_sub_epi32(_mm_set1_epi32(255), da);
+ __m128i isa = _mm_sub_epi32(_mm_set1_epi32(255), sa);
+
+ // if (0 == dc)
+ __m128i cmp1 = _mm_cmpeq_epi32(dc, _mm_setzero_si128());
+ __m128i rc1 = _mm_and_si128(cmp1, SkAlphaMulAlpha_SSE2(sc, ida));
+
+ // else if (0 == diff)
+ __m128i cmp2 = _mm_cmpeq_epi32(diff, _mm_setzero_si128());
+ __m128i cmp = _mm_andnot_si128(cmp1, cmp2);
+ __m128i tmp1 = _mm_mullo_epi16(sa, da);
+ __m128i tmp2 = _mm_mullo_epi16(sc, ida);
+ __m128i tmp3 = _mm_mullo_epi16(dc, isa);
+ __m128i rc2 = _mm_add_epi32(tmp1, tmp2);
+ rc2 = _mm_add_epi32(rc2, tmp3);
+ rc2 = clamp_div255round_SSE2(rc2);
+ rc2 = _mm_and_si128(cmp, rc2);
+
+ // else
+ __m128i cmp3 = _mm_or_si128(cmp1, cmp2);
+ __m128i value = _mm_mullo_epi16(dc, sa);
+ diff = shim_mm_div_epi32(value, diff);
+
+ __m128i tmp4 = SkMin32_SSE2(da, diff);
+ tmp4 = Multiply32_SSE2(sa, tmp4);
+ __m128i rc3 = _mm_add_epi32(tmp4, tmp2);
+ rc3 = _mm_add_epi32(rc3, tmp3);
+ rc3 = clamp_div255round_SSE2(rc3);
+ rc3 = _mm_andnot_si128(cmp3, rc3);
+
+ __m128i rc = _mm_or_si128(rc1, rc2);
+ rc = _mm_or_si128(rc, rc3);
+
+ return rc;
+}
+
+static __m128i colordodge_modeproc_SSE2(const __m128i& src,
+ const __m128i& dst) {
+ __m128i sa = SkGetPackedA32_SSE2(src);
+ __m128i da = SkGetPackedA32_SSE2(dst);
+
+ __m128i a = srcover_byte_SSE2(sa, da);
+ __m128i r = colordodge_byte_SSE2(SkGetPackedR32_SSE2(src),
+ SkGetPackedR32_SSE2(dst), sa, da);
+ __m128i g = colordodge_byte_SSE2(SkGetPackedG32_SSE2(src),
+ SkGetPackedG32_SSE2(dst), sa, da);
+ __m128i b = colordodge_byte_SSE2(SkGetPackedB32_SSE2(src),
+ SkGetPackedB32_SSE2(dst), sa, da);
+ return SkPackARGB32_SSE2(a, r, g, b);
+}
+
+static inline __m128i colorburn_byte_SSE2(const __m128i& sc, const __m128i& dc,
+ const __m128i& sa, const __m128i& da) {
+ __m128i ida = _mm_sub_epi32(_mm_set1_epi32(255), da);
+ __m128i isa = _mm_sub_epi32(_mm_set1_epi32(255), sa);
+
+ // if (dc == da)
+ __m128i cmp1 = _mm_cmpeq_epi32(dc, da);
+ __m128i tmp1 = _mm_mullo_epi16(sa, da);
+ __m128i tmp2 = _mm_mullo_epi16(sc, ida);
+ __m128i tmp3 = _mm_mullo_epi16(dc, isa);
+ __m128i rc1 = _mm_add_epi32(tmp1, tmp2);
+ rc1 = _mm_add_epi32(rc1, tmp3);
+ rc1 = clamp_div255round_SSE2(rc1);
+ rc1 = _mm_and_si128(cmp1, rc1);
+
+ // else if (0 == sc)
+ __m128i cmp2 = _mm_cmpeq_epi32(sc, _mm_setzero_si128());
+ __m128i rc2 = SkAlphaMulAlpha_SSE2(dc, isa);
+ __m128i cmp = _mm_andnot_si128(cmp1, cmp2);
+ rc2 = _mm_and_si128(cmp, rc2);
+
+ // else
+ __m128i cmp3 = _mm_or_si128(cmp1, cmp2);
+ __m128i tmp4 = _mm_sub_epi32(da, dc);
+ tmp4 = Multiply32_SSE2(tmp4, sa);
+ tmp4 = shim_mm_div_epi32(tmp4, sc);
+
+ __m128i tmp5 = _mm_sub_epi32(da, SkMin32_SSE2(da, tmp4));
+ tmp5 = Multiply32_SSE2(sa, tmp5);
+ __m128i rc3 = _mm_add_epi32(tmp5, tmp2);
+ rc3 = _mm_add_epi32(rc3, tmp3);
+ rc3 = clamp_div255round_SSE2(rc3);
+ rc3 = _mm_andnot_si128(cmp3, rc3);
+
+ __m128i rc = _mm_or_si128(rc1, rc2);
+ rc = _mm_or_si128(rc, rc3);
+
+ return rc;
+}
+
+static __m128i colorburn_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
+ __m128i sa = SkGetPackedA32_SSE2(src);
+ __m128i da = SkGetPackedA32_SSE2(dst);
+
+ __m128i a = srcover_byte_SSE2(sa, da);
+ __m128i r = colorburn_byte_SSE2(SkGetPackedR32_SSE2(src),
+ SkGetPackedR32_SSE2(dst), sa, da);
+ __m128i g = colorburn_byte_SSE2(SkGetPackedG32_SSE2(src),
+ SkGetPackedG32_SSE2(dst), sa, da);
+ __m128i b = colorburn_byte_SSE2(SkGetPackedB32_SSE2(src),
+ SkGetPackedB32_SSE2(dst), sa, da);
+ return SkPackARGB32_SSE2(a, r, g, b);
+}
+
+static inline __m128i hardlight_byte_SSE2(const __m128i& sc, const __m128i& dc,
+ const __m128i& sa, const __m128i& da) {
+ // if (2 * sc <= sa)
+ __m128i tmp1 = _mm_slli_epi32(sc, 1);
+ __m128i cmp1 = _mm_cmpgt_epi32(tmp1, sa);
+ __m128i rc1 = _mm_mullo_epi16(sc, dc); // sc * dc;
+ rc1 = _mm_slli_epi32(rc1, 1); // 2 * sc * dc
+ rc1 = _mm_andnot_si128(cmp1, rc1);
+
+ // else
+ tmp1 = _mm_mullo_epi16(sa, da);
+ __m128i tmp2 = Multiply32_SSE2(_mm_sub_epi32(da, dc),
+ _mm_sub_epi32(sa, sc));
+ tmp2 = _mm_slli_epi32(tmp2, 1);
+ __m128i rc2 = _mm_sub_epi32(tmp1, tmp2);
+ rc2 = _mm_and_si128(cmp1, rc2);
+
+ __m128i rc = _mm_or_si128(rc1, rc2);
+
+ __m128i ida = _mm_sub_epi32(_mm_set1_epi32(255), da);
+ tmp1 = _mm_mullo_epi16(sc, ida);
+ __m128i isa = _mm_sub_epi32(_mm_set1_epi32(255), sa);
+ tmp2 = _mm_mullo_epi16(dc, isa);
+ rc = _mm_add_epi32(rc, tmp1);
+ rc = _mm_add_epi32(rc, tmp2);
+ return clamp_div255round_SSE2(rc);
+}
+
+static __m128i hardlight_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
+ __m128i sa = SkGetPackedA32_SSE2(src);
+ __m128i da = SkGetPackedA32_SSE2(dst);
+
+ __m128i a = srcover_byte_SSE2(sa, da);
+ __m128i r = hardlight_byte_SSE2(SkGetPackedR32_SSE2(src),
+ SkGetPackedR32_SSE2(dst), sa, da);
+ __m128i g = hardlight_byte_SSE2(SkGetPackedG32_SSE2(src),
+ SkGetPackedG32_SSE2(dst), sa, da);
+ __m128i b = hardlight_byte_SSE2(SkGetPackedB32_SSE2(src),
+ SkGetPackedB32_SSE2(dst), sa, da);
+ return SkPackARGB32_SSE2(a, r, g, b);
+}
+
+static __m128i sqrt_unit_byte_SSE2(const __m128i& n) {
+ return SkSqrtBits_SSE2(n, 15+4);
+}
+
+static inline __m128i softlight_byte_SSE2(const __m128i& sc, const __m128i& dc,
+ const __m128i& sa, const __m128i& da) {
+ __m128i tmp1, tmp2, tmp3;
+
+ // int m = da ? dc * 256 / da : 0;
+ __m128i cmp = _mm_cmpeq_epi32(da, _mm_setzero_si128());
+ __m128i m = _mm_slli_epi32(dc, 8);
+ __m128 x = _mm_cvtepi32_ps(m);
+ __m128 y = _mm_cvtepi32_ps(da);
+ m = _mm_cvttps_epi32(_mm_div_ps(x, y));
+ m = _mm_andnot_si128(cmp, m);
+
+ // if (2 * sc <= sa)
+ tmp1 = _mm_slli_epi32(sc, 1); // 2 * sc
+ __m128i cmp1 = _mm_cmpgt_epi32(tmp1, sa);
+ tmp1 = _mm_sub_epi32(tmp1, sa); // 2 * sc - sa
+ tmp2 = _mm_sub_epi32(_mm_set1_epi32(256), m); // 256 - m
+ tmp1 = Multiply32_SSE2(tmp1, tmp2);
+ tmp1 = _mm_srai_epi32(tmp1, 8);
+ tmp1 = _mm_add_epi32(sa, tmp1);
+ tmp1 = Multiply32_SSE2(dc, tmp1);
+ __m128i rc1 = _mm_andnot_si128(cmp1, tmp1);
+
+ // else if (4 * dc <= da)
+ tmp2 = _mm_slli_epi32(dc, 2); // dc * 4
+ __m128i cmp2 = _mm_cmpgt_epi32(tmp2, da);
+ __m128i i = _mm_slli_epi32(m, 2); // 4 * m
+ __m128i j = _mm_add_epi32(i, _mm_set1_epi32(256)); // 4 * m + 256
+ __m128i k = Multiply32_SSE2(i, j); // 4 * m * (4 * m + 256)
+ __m128i t = _mm_sub_epi32(m, _mm_set1_epi32(256)); // m - 256
+ i = Multiply32_SSE2(k, t); // 4 * m * (4 * m + 256) * (m - 256)
+ i = _mm_srai_epi32(i, 16); // >> 16
+ j = Multiply32_SSE2(_mm_set1_epi32(7), m); // 7 * m
+ tmp2 = _mm_add_epi32(i, j);
+ i = Multiply32_SSE2(dc, sa); // dc * sa
+ j = _mm_slli_epi32(sc, 1); // 2 * sc
+ j = _mm_sub_epi32(j, sa); // 2 * sc - sa
+ j = Multiply32_SSE2(da, j); // da * (2 * sc - sa)
+ tmp2 = Multiply32_SSE2(j, tmp2); // * tmp
+ tmp2 = _mm_srai_epi32(tmp2, 8); // >> 8
+ tmp2 = _mm_add_epi32(i, tmp2);
+ cmp = _mm_andnot_si128(cmp2, cmp1);
+ __m128i rc2 = _mm_and_si128(cmp, tmp2);
+ __m128i rc = _mm_or_si128(rc1, rc2);
+
+ // else
+ tmp3 = sqrt_unit_byte_SSE2(m);
+ tmp3 = _mm_sub_epi32(tmp3, m);
+ tmp3 = Multiply32_SSE2(j, tmp3); // j = da * (2 * sc - sa)
+ tmp3 = _mm_srai_epi32(tmp3, 8);
+ tmp3 = _mm_add_epi32(i, tmp3); // i = dc * sa
+ cmp = _mm_and_si128(cmp1, cmp2);
+ __m128i rc3 = _mm_and_si128(cmp, tmp3);
+ rc = _mm_or_si128(rc, rc3);
+
+ tmp1 = _mm_sub_epi32(_mm_set1_epi32(255), da); // 255 - da
+ tmp1 = _mm_mullo_epi16(sc, tmp1);
+ tmp2 = _mm_sub_epi32(_mm_set1_epi32(255), sa); // 255 - sa
+ tmp2 = _mm_mullo_epi16(dc, tmp2);
+ rc = _mm_add_epi32(rc, tmp1);
+ rc = _mm_add_epi32(rc, tmp2);
+ return clamp_div255round_SSE2(rc);
+}
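The soft-light math is the hardest of these to read in vector form. A scalar sketch of the same three cases, assuming clamp_div255round() and a sqrt_unit_byte() that match the portable helpers in SkXfermode.cpp (illustration only, not part of the change):

// m rescales dc to a 0..256 fraction of da; the three branches correspond to the
// cmp1/cmp2 masks combined above with _mm_and_si128/_mm_andnot_si128.
static inline int softlight_byte_sketch(int sc, int dc, int sa, int da) {
    int m = da ? dc * 256 / da : 0;
    int rc;
    if (2 * sc <= sa) {
        rc = dc * (sa + (((2 * sc - sa) * (256 - m)) >> 8));
    } else if (4 * dc <= da) {
        int tmp = ((4 * m * (4 * m + 256) * (m - 256)) >> 16) + 7 * m;
        rc = dc * sa + ((da * (2 * sc - sa) * tmp) >> 8);
    } else {
        int tmp = sqrt_unit_byte(m) - m;                  // assumed helper
        rc = dc * sa + ((da * (2 * sc - sa) * tmp) >> 8);
    }
    return clamp_div255round(rc + sc * (255 - da) + dc * (255 - sa));   // assumed helper
}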
+
+static __m128i softlight_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
+ __m128i sa = SkGetPackedA32_SSE2(src);
+ __m128i da = SkGetPackedA32_SSE2(dst);
+
+ __m128i a = srcover_byte_SSE2(sa, da);
+ __m128i r = softlight_byte_SSE2(SkGetPackedR32_SSE2(src),
+ SkGetPackedR32_SSE2(dst), sa, da);
+ __m128i g = softlight_byte_SSE2(SkGetPackedG32_SSE2(src),
+ SkGetPackedG32_SSE2(dst), sa, da);
+ __m128i b = softlight_byte_SSE2(SkGetPackedB32_SSE2(src),
+ SkGetPackedB32_SSE2(dst), sa, da);
+ return SkPackARGB32_SSE2(a, r, g, b);
+}
+
+static inline __m128i difference_byte_SSE2(const __m128i& sc, const __m128i& dc,
+ const __m128i& sa, const __m128i& da) {
+ __m128i tmp1 = _mm_mullo_epi16(sc, da);
+ __m128i tmp2 = _mm_mullo_epi16(dc, sa);
+ __m128i tmp = SkMin32_SSE2(tmp1, tmp2);
+
+ __m128i ret1 = _mm_add_epi32(sc, dc);
+ __m128i ret2 = _mm_slli_epi32(SkDiv255Round_SSE2(tmp), 1);
+ __m128i ret = _mm_sub_epi32(ret1, ret2);
+
+ ret = clamp_signed_byte_SSE2(ret);
+ return ret;
+}
+
+static __m128i difference_modeproc_SSE2(const __m128i& src,
+ const __m128i& dst) {
+ __m128i sa = SkGetPackedA32_SSE2(src);
+ __m128i da = SkGetPackedA32_SSE2(dst);
+
+ __m128i a = srcover_byte_SSE2(sa, da);
+ __m128i r = difference_byte_SSE2(SkGetPackedR32_SSE2(src),
+ SkGetPackedR32_SSE2(dst), sa, da);
+ __m128i g = difference_byte_SSE2(SkGetPackedG32_SSE2(src),
+ SkGetPackedG32_SSE2(dst), sa, da);
+ __m128i b = difference_byte_SSE2(SkGetPackedB32_SSE2(src),
+ SkGetPackedB32_SSE2(dst), sa, da);
+ return SkPackARGB32_SSE2(a, r, g, b);
+}
+
+static inline __m128i exclusion_byte_SSE2(const __m128i& sc, const __m128i& dc,
+ const __m128i&, __m128i&) {
+ __m128i tmp1 = _mm_mullo_epi16(_mm_set1_epi32(255), sc); // 255 * sc
+ __m128i tmp2 = _mm_mullo_epi16(_mm_set1_epi32(255), dc); // 255 * dc
+ tmp1 = _mm_add_epi32(tmp1, tmp2);
+ tmp2 = _mm_mullo_epi16(sc, dc); // sc * dc
+ tmp2 = _mm_slli_epi32(tmp2, 1); // 2 * sc * dc
+
+ __m128i r = _mm_sub_epi32(tmp1, tmp2);
+ return clamp_div255round_SSE2(r);
+}
+
+static __m128i exclusion_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
+ __m128i sa = SkGetPackedA32_SSE2(src);
+ __m128i da = SkGetPackedA32_SSE2(dst);
+
+ __m128i a = srcover_byte_SSE2(sa, da);
+ __m128i r = exclusion_byte_SSE2(SkGetPackedR32_SSE2(src),
+ SkGetPackedR32_SSE2(dst), sa, da);
+ __m128i g = exclusion_byte_SSE2(SkGetPackedG32_SSE2(src),
+ SkGetPackedG32_SSE2(dst), sa, da);
+ __m128i b = exclusion_byte_SSE2(SkGetPackedB32_SSE2(src),
+ SkGetPackedB32_SSE2(dst), sa, da);
+ return SkPackARGB32_SSE2(a, r, g, b);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+typedef __m128i (*SkXfermodeProcSIMD)(const __m128i& src, const __m128i& dst);
+
+extern SkXfermodeProcSIMD gSSE2XfermodeProcs[];
+
+SkSSE2ProcCoeffXfermode::SkSSE2ProcCoeffXfermode(SkReadBuffer& buffer)
+ : INHERITED(buffer) {
+ fProcSIMD = reinterpret_cast<void*>(gSSE2XfermodeProcs[this->getMode()]);
+ buffer.validate(fProcSIMD != NULL);
+}
+
+void SkSSE2ProcCoeffXfermode::xfer32(SkPMColor dst[], const SkPMColor src[],
+ int count, const SkAlpha aa[]) const {
+ SkASSERT(dst && src && count >= 0);
+
+ SkXfermodeProc proc = this->getProc();
+ SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD);
+ SkASSERT(procSIMD != NULL);
+
+ if (NULL == aa) {
+ if (count >= 4) {
+ while (((size_t)dst & 0x0F) != 0) {
+ *dst = proc(*src, *dst);
+ dst++;
+ src++;
+ count--;
+ }
+
+ const __m128i* s = reinterpret_cast<const __m128i*>(src);
+ __m128i* d = reinterpret_cast<__m128i*>(dst);
+
+ while (count >= 4) {
+ __m128i src_pixel = _mm_loadu_si128(s++);
+ __m128i dst_pixel = _mm_load_si128(d);
+
+ dst_pixel = procSIMD(src_pixel, dst_pixel);
+ _mm_store_si128(d++, dst_pixel);
+ count -= 4;
+ }
+
+ src = reinterpret_cast<const SkPMColor*>(s);
+ dst = reinterpret_cast<SkPMColor*>(d);
+ }
+
+ for (int i = count - 1; i >= 0; --i) {
+ *dst = proc(*src, *dst);
+ dst++;
+ src++;
+ }
+ } else {
+ for (int i = count - 1; i >= 0; --i) {
+ unsigned a = aa[i];
+ if (0 != a) {
+ SkPMColor dstC = dst[i];
+ SkPMColor C = proc(src[i], dstC);
+ if (a != 0xFF) {
+ C = SkFourByteInterp(C, dstC, a);
+ }
+ dst[i] = C;
+ }
+ }
+ }
+}
+
+void SkSSE2ProcCoeffXfermode::xfer16(uint16_t dst[], const SkPMColor src[],
+ int count, const SkAlpha aa[]) const {
+ SkASSERT(dst && src && count >= 0);
+
+ SkXfermodeProc proc = this->getProc();
+ SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD);
+ SkASSERT(procSIMD != NULL);
+
+ if (NULL == aa) {
+ if (count >= 8) {
+ while (((size_t)dst & 0x0F) != 0) {
+ SkPMColor dstC = SkPixel16ToPixel32(*dst);
+ *dst = SkPixel32ToPixel16_ToU16(proc(*src, dstC));
+ dst++;
+ src++;
+ count--;
+ }
+
+ const __m128i* s = reinterpret_cast<const __m128i*>(src);
+ __m128i* d = reinterpret_cast<__m128i*>(dst);
+
+ while (count >= 8) {
+ __m128i src_pixel1 = _mm_loadu_si128(s++);
+ __m128i src_pixel2 = _mm_loadu_si128(s++);
+ __m128i dst_pixel = _mm_load_si128(d);
+
+ __m128i dst_pixel1 = _mm_unpacklo_epi16(dst_pixel, _mm_setzero_si128());
+ __m128i dst_pixel2 = _mm_unpackhi_epi16(dst_pixel, _mm_setzero_si128());
+
+ __m128i dstC1 = SkPixel16ToPixel32_SSE2(dst_pixel1);
+ __m128i dstC2 = SkPixel16ToPixel32_SSE2(dst_pixel2);
+
+ dst_pixel1 = procSIMD(src_pixel1, dstC1);
+ dst_pixel2 = procSIMD(src_pixel2, dstC2);
+ dst_pixel = SkPixel32ToPixel16_ToU16_SSE2(dst_pixel1, dst_pixel2);
+
+ _mm_store_si128(d++, dst_pixel);
+ count -= 8;
+ }
+
+ src = reinterpret_cast<const SkPMColor*>(s);
+ dst = reinterpret_cast<uint16_t*>(d);
+ }
+
+ for (int i = count - 1; i >= 0; --i) {
+ SkPMColor dstC = SkPixel16ToPixel32(*dst);
+ *dst = SkPixel32ToPixel16_ToU16(proc(*src, dstC));
+ dst++;
+ src++;
+ }
+ } else {
+ for (int i = count - 1; i >= 0; --i) {
+ unsigned a = aa[i];
+ if (0 != a) {
+ SkPMColor dstC = SkPixel16ToPixel32(dst[i]);
+ SkPMColor C = proc(src[i], dstC);
+ if (0xFF != a) {
+ C = SkFourByteInterp(C, dstC, a);
+ }
+ dst[i] = SkPixel32ToPixel16_ToU16(C);
+ }
+ }
+ }
+}
+
+#ifndef SK_IGNORE_TO_STRING
+void SkSSE2ProcCoeffXfermode::toString(SkString* str) const {
+ this->INHERITED::toString(str);
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+
+// SSE2 modeprocs operating on 4 pixels at a time
+SkXfermodeProcSIMD gSSE2XfermodeProcs[] = {
+ NULL, // kClear_Mode
+ NULL, // kSrc_Mode
+ NULL, // kDst_Mode
+ srcover_modeproc_SSE2,
+ dstover_modeproc_SSE2,
+ srcin_modeproc_SSE2,
+ dstin_modeproc_SSE2,
+ srcout_modeproc_SSE2,
+ dstout_modeproc_SSE2,
+ srcatop_modeproc_SSE2,
+ dstatop_modeproc_SSE2,
+ xor_modeproc_SSE2,
+ plus_modeproc_SSE2,
+ modulate_modeproc_SSE2,
+ screen_modeproc_SSE2,
+
+ overlay_modeproc_SSE2,
+ darken_modeproc_SSE2,
+ lighten_modeproc_SSE2,
+ colordodge_modeproc_SSE2,
+ colorburn_modeproc_SSE2,
+ hardlight_modeproc_SSE2,
+ softlight_modeproc_SSE2,
+ difference_modeproc_SSE2,
+ exclusion_modeproc_SSE2,
+ multiply_modeproc_SSE2,
+
+ NULL, // kHue_Mode
+ NULL, // kSaturation_Mode
+ NULL, // kColor_Mode
+ NULL, // kLuminosity_Mode
+};
+
+SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl_SSE2(const ProcCoeff& rec,
+ SkXfermode::Mode mode) {
+ void* procSIMD = reinterpret_cast<void*>(gSSE2XfermodeProcs[mode]);
+
+ if (procSIMD != NULL) {
+ return SkNEW_ARGS(SkSSE2ProcCoeffXfermode, (rec, mode, procSIMD));
+ }
+ return NULL;
+}
diff --git a/chromium/third_party/skia/src/opts/SkXfermode_opts_SSE2.h b/chromium/third_party/skia/src/opts/SkXfermode_opts_SSE2.h
new file mode 100644
index 00000000000..bfc143937a8
--- /dev/null
+++ b/chromium/third_party/skia/src/opts/SkXfermode_opts_SSE2.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2014 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkXfermode_opts_SSE2_DEFINED
+#define SkXfermode_opts_SSE2_DEFINED
+
+#include "SkTypes.h"
+#include "SkXfermode_proccoeff.h"
+
+class SK_API SkSSE2ProcCoeffXfermode : public SkProcCoeffXfermode {
+public:
+ SkSSE2ProcCoeffXfermode(const ProcCoeff& rec, SkXfermode::Mode mode,
+ void* procSIMD)
+ : INHERITED(rec, mode), fProcSIMD(procSIMD) {}
+
+ virtual void xfer32(SkPMColor dst[], const SkPMColor src[], int count,
+ const SkAlpha aa[]) const SK_OVERRIDE;
+ virtual void xfer16(uint16_t dst[], const SkPMColor src[],
+ int count, const SkAlpha aa[]) const SK_OVERRIDE;
+
+ SK_TO_STRING_OVERRIDE()
+ SK_DECLARE_PUBLIC_FLATTENABLE_DESERIALIZATION_PROCS(SkSSE2ProcCoeffXfermode)
+
+private:
+ SkSSE2ProcCoeffXfermode(SkReadBuffer& buffer);
+
+ void* fProcSIMD;
+ typedef SkProcCoeffXfermode INHERITED;
+};
+
+SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl_SSE2(const ProcCoeff& rec,
+ SkXfermode::Mode mode);
+
+#endif // SkXfermode_opts_SSE2_DEFINED
diff --git a/chromium/third_party/skia/src/opts/SkXfermode_opts_arm_neon.cpp b/chromium/third_party/skia/src/opts/SkXfermode_opts_arm_neon.cpp
index 6a79b737263..70e92af66bc 100644
--- a/chromium/third_party/skia/src/opts/SkXfermode_opts_arm_neon.cpp
+++ b/chromium/third_party/skia/src/opts/SkXfermode_opts_arm_neon.cpp
@@ -41,8 +41,13 @@ static inline uint16x8_t SkAlphaMulAlpha_neon8_16(uint8x8_t color, uint8x8_t alp
static inline uint8x8_t SkDiv255Round_neon8_32_8(int32x4_t p1, int32x4_t p2) {
uint16x8_t tmp;
+#ifdef SK_CPU_ARM64
+ tmp = vmovn_high_u32(vmovn_u32(vreinterpretq_u32_s32(p1)),
+ vreinterpretq_u32_s32(p2));
+#else
tmp = vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(p1)),
vmovn_u32(vreinterpretq_u32_s32(p2)));
+#endif
tmp += vdupq_n_u16(128);
tmp += vshrq_n_u16(tmp, 8);
@@ -66,7 +71,11 @@ static inline uint8x8_t clamp_div255round_simd8_32(int32x4_t val1, int32x4_t val
// Test if <= 0
cmp1 = vcleq_s32(val1, vdupq_n_s32(0));
cmp2 = vcleq_s32(val2, vdupq_n_s32(0));
+#ifdef SK_CPU_ARM64
+ cmp16 = vmovn_high_u32(vmovn_u32(cmp1), cmp2);
+#else
cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2));
+#endif
cmp8_1 = vmovn_u16(cmp16);
// Init to zero
@@ -75,7 +84,11 @@ static inline uint8x8_t clamp_div255round_simd8_32(int32x4_t val1, int32x4_t val
// Test if >= 255*255
cmp1 = vcgeq_s32(val1, vdupq_n_s32(255*255));
cmp2 = vcgeq_s32(val2, vdupq_n_s32(255*255));
+#ifdef SK_CPU_ARM64
+ cmp16 = vmovn_high_u32(vmovn_u32(cmp1), cmp2);
+#else
cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2));
+#endif
cmp8 = vmovn_u16(cmp16);
// Insert 255 where true
@@ -409,11 +422,19 @@ static inline uint8x8_t overlay_hardlight_color(uint8x8_t sc, uint8x8_t dc,
if (overlay) {
dc2 = vshll_n_u8(dc, 1);
scdc2_1 = vmull_u16(vget_low_u16(dc2), vget_low_u16(vmovl_u8(sc)));
+#ifdef SK_CPU_ARM64
+ scdc2_2 = vmull_high_u16(dc2, vmovl_u8(sc));
+#else
scdc2_2 = vmull_u16(vget_high_u16(dc2), vget_high_u16(vmovl_u8(sc)));
+#endif
} else {
sc2 = vshll_n_u8(sc, 1);
scdc2_1 = vmull_u16(vget_low_u16(sc2), vget_low_u16(vmovl_u8(dc)));
+#ifdef SK_CPU_ARM64
+ scdc2_2 = vmull_high_u16(sc2, vmovl_u8(dc));
+#else
scdc2_2 = vmull_u16(vget_high_u16(sc2), vget_high_u16(vmovl_u8(dc)));
+#endif
}
// Calc COM
@@ -421,12 +442,20 @@ static inline uint8x8_t overlay_hardlight_color(uint8x8_t sc, uint8x8_t dc,
com1 = vreinterpretq_s32_u32(
vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc)));
com2 = vreinterpretq_s32_u32(
+#ifdef SK_CPU_ARM64
+ vmull_high_u16(const255, sc_plus_dc));
+#else
vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc)));
+#endif
// Calc SUB
int32x4_t sub1, sub2;
sub1 = vreinterpretq_s32_u32(vaddl_u16(vget_low_u16(scda), vget_low_u16(dcsa)));
+#ifdef SK_CPU_ARM64
+ sub2 = vreinterpretq_s32_u32(vaddl_high_u16(scda, dcsa));
+#else
sub2 = vreinterpretq_s32_u32(vaddl_u16(vget_high_u16(scda), vget_high_u16(dcsa)));
+#endif
sub1 = vsubq_s32(sub1, vreinterpretq_s32_u32(scdc2_1));
sub2 = vsubq_s32(sub2, vreinterpretq_s32_u32(scdc2_2));
@@ -444,10 +473,14 @@ static inline uint8x8_t overlay_hardlight_color(uint8x8_t sc, uint8x8_t dc,
int32x4_t val2_1, val2_2;
uint32x4_t cmp1, cmp2;
- cmp1 = vmovl_u16(vget_low_u16(cmp));
- cmp1 |= vshlq_n_u32(cmp1, 16);
- cmp2 = vmovl_u16(vget_high_u16(cmp));
- cmp2 |= vshlq_n_u32(cmp2, 16);
+ // Doing a signed lengthening lets us save a few instructions
+ // thanks to sign extension.
+ cmp1 = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_low_u16(cmp))));
+#ifdef SK_CPU_ARM64
+ cmp2 = vreinterpretq_u32_s32(vmovl_high_s16(vreinterpretq_s16_u16(cmp)));
+#else
+ cmp2 = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_high_u16(cmp))));
+#endif
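The replaced unsigned form (the two removed lines above) needed an extra shift-and-OR because a zero-extended 16-bit mask only fills the low half of each lane; the signed widen does not:

// 16-bit compare result per lane:  0xFFFF (true)  or 0x0000 (false)
// after vmovl_s16 (sign-extend):   0xFFFFFFFF     or 0x00000000  -- usable as a 32-bit mask directly
// after vmovl_u16 (zero-extend):   0x0000FFFF     or 0x00000000  -- needs the old "|= << 16" fix-up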
// Calc COM - SUB
val1_1 = com1 - sub1;
@@ -458,7 +491,11 @@ static inline uint8x8_t overlay_hardlight_color(uint8x8_t sc, uint8x8_t dc,
val2_2 = com2 + sub2;
val2_1 = vsubq_s32(val2_1, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sada))));
+#ifdef SK_CPU_ARM64
+ val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_high_u16(sada)));
+#else
val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sada))));
+#endif
// Insert where needed
val1_1 = vbslq_s32(cmp1, val1_1, val2_1);
@@ -628,11 +665,19 @@ static inline uint8x8_t exclusion_color(uint8x8_t sc, uint8x8_t dc,
term1_1 = vreinterpretq_s32_u32(
vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc)));
term1_2 = vreinterpretq_s32_u32(
+#ifdef SK_CPU_ARM64
+ vmull_high_u16(const255, sc_plus_dc));
+#else
vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc)));
+#endif
/* Calc the second term */
term2_1 = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(scdc), 1));
+#ifdef SK_CPU_ARM64
+ term2_2 = vreinterpretq_s32_u32(vshll_high_n_u16(scdc, 1));
+#else
term2_2 = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(scdc), 1));
+#endif
return clamp_div255round_simd8_32(term1_1 - term2_1, term1_2 - term2_2);
}
@@ -661,10 +706,18 @@ static inline uint8x8_t blendfunc_multiply_color(uint8x8_t sc, uint8x8_t dc,
scdc = vmull_u8(sc, dc);
val1 = vaddl_u16(vget_low_u16(t1), vget_low_u16(t2));
+#ifdef SK_CPU_ARM64
+ val2 = vaddl_high_u16(t1, t2);
+#else
val2 = vaddl_u16(vget_high_u16(t1), vget_high_u16(t2));
+#endif
val1 = vaddw_u16(val1, vget_low_u16(scdc));
+#ifdef SK_CPU_ARM64
+ val2 = vaddw_high_u16(val2, scdc);
+#else
val2 = vaddw_u16(val2, vget_high_u16(scdc));
+#endif
return clamp_div255round_simd8_32(
vreinterpretq_s32_u32(val1), vreinterpretq_s32_u32(val2));
@@ -690,7 +743,7 @@ typedef uint8x8x4_t (*SkXfermodeProcSIMD)(uint8x8x4_t src, uint8x8x4_t dst);
extern SkXfermodeProcSIMD gNEONXfermodeProcs[];
-SkNEONProcCoeffXfermode::SkNEONProcCoeffXfermode(SkFlattenableReadBuffer& buffer)
+SkNEONProcCoeffXfermode::SkNEONProcCoeffXfermode(SkReadBuffer& buffer)
: INHERITED(buffer) {
fProcSIMD = reinterpret_cast<void*>(gNEONXfermodeProcs[this->getMode()]);
}
@@ -708,6 +761,10 @@ void SkNEONProcCoeffXfermode::xfer32(SkPMColor dst[], const SkPMColor src[],
while (count >= 8) {
uint8x8x4_t vsrc, vdst, vres;
+#ifdef SK_CPU_ARM64
+ vsrc = vld4_u8((uint8_t*)src);
+ vdst = vld4_u8((uint8_t*)dst);
+#else
#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
asm volatile (
"vld4.u8 %h[vsrc], [%[src]]! \t\n"
@@ -740,6 +797,7 @@ void SkNEONProcCoeffXfermode::xfer32(SkPMColor dst[], const SkPMColor src[],
vsrc.val[2] = d2; vdst.val[2] = d6;
vsrc.val[3] = d3; vdst.val[3] = d7;
#endif
+#endif // #ifdef SK_CPU_ARM64
vres = procSIMD(vsrc, vdst);
@@ -747,6 +805,9 @@ void SkNEONProcCoeffXfermode::xfer32(SkPMColor dst[], const SkPMColor src[],
count -= 8;
dst += 8;
+#ifdef SK_CPU_ARM64
+ src += 8;
+#endif
}
// Leftovers
for (int i = 0; i < count; i++) {
@@ -783,6 +844,9 @@ void SkNEONProcCoeffXfermode::xfer16(uint16_t* SK_RESTRICT dst,
vdst = vld1q_u16(dst);
+#ifdef SK_CPU_ARM64
+ vsrc = vld4_u8((uint8_t*)src);
+#else
#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
asm volatile (
"vld4.u8 %h[vsrc], [%[src]]! \t\n"
@@ -806,6 +870,7 @@ void SkNEONProcCoeffXfermode::xfer16(uint16_t* SK_RESTRICT dst,
vsrc.val[2] = d2;
vsrc.val[3] = d3;
#endif
+#endif // #ifdef SK_CPU_ARM64
vdst32 = SkPixel16ToPixel32_neon8(vdst);
vres = procSIMD(vsrc, vdst32);
@@ -815,6 +880,9 @@ void SkNEONProcCoeffXfermode::xfer16(uint16_t* SK_RESTRICT dst,
count -= 8;
dst += 8;
+#ifdef SK_CPU_ARM64
+ src += 8;
+#endif
}
for (int i = 0; i < count; i++) {
SkPMColor dstC = SkPixel16ToPixel32(dst[i]);
@@ -835,7 +903,7 @@ void SkNEONProcCoeffXfermode::xfer16(uint16_t* SK_RESTRICT dst,
}
}
-#ifdef SK_DEVELOPER
+#ifndef SK_IGNORE_TO_STRING
void SkNEONProcCoeffXfermode::toString(SkString* str) const {
this->INHERITED::toString(str);
}
diff --git a/chromium/third_party/skia/src/opts/SkXfermode_opts_arm_neon.h b/chromium/third_party/skia/src/opts/SkXfermode_opts_arm_neon.h
index a8d438195eb..8f3aaaea9d9 100644
--- a/chromium/third_party/skia/src/opts/SkXfermode_opts_arm_neon.h
+++ b/chromium/third_party/skia/src/opts/SkXfermode_opts_arm_neon.h
@@ -14,11 +14,11 @@ public:
virtual void xfer16(uint16_t* SK_RESTRICT dst, const SkPMColor* SK_RESTRICT src,
int count, const SkAlpha* SK_RESTRICT aa) const SK_OVERRIDE;
- SK_DEVELOPER_TO_STRING()
+ SK_TO_STRING_OVERRIDE()
SK_DECLARE_PUBLIC_FLATTENABLE_DESERIALIZATION_PROCS(SkNEONProcCoeffXfermode)
private:
- SkNEONProcCoeffXfermode(SkFlattenableReadBuffer& buffer);
+ SkNEONProcCoeffXfermode(SkReadBuffer& buffer);
// void* is used to avoid pulling arm_neon.h in the core and having to build
// it with -mfpu=neon.
diff --git a/chromium/third_party/skia/src/opts/opts_check_SSE2.cpp b/chromium/third_party/skia/src/opts/opts_check_SSE2.cpp
deleted file mode 100644
index aaf6b2ef824..00000000000
--- a/chromium/third_party/skia/src/opts/opts_check_SSE2.cpp
+++ /dev/null
@@ -1,294 +0,0 @@
-/*
- * Copyright 2009 The Android Open Source Project
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#include "SkBitmapProcState_opts_SSE2.h"
-#include "SkBitmapProcState_opts_SSSE3.h"
-#include "SkBitmapFilter_opts_SSE2.h"
-#include "SkBlitMask.h"
-#include "SkBlitRow.h"
-#include "SkBlitRect_opts_SSE2.h"
-#include "SkBlitRow_opts_SSE2.h"
-#include "SkBlurImage_opts_SSE2.h"
-#include "SkUtils_opts_SSE2.h"
-#include "SkUtils.h"
-#include "SkMorphology_opts.h"
-#include "SkMorphology_opts_SSE2.h"
-
-#include "SkRTConf.h"
-
-#if defined(_MSC_VER) && defined(_WIN64)
-#include <intrin.h>
-#endif
-
-/* This file must *not* be compiled with -msse or -msse2, otherwise
- gcc may generate sse2 even for scalar ops (and thus give an invalid
- instruction on Pentium3 on the code below). Only files named *_SSE2.cpp
- in this directory should be compiled with -msse2. */
-
-
-#ifdef _MSC_VER
-static inline void getcpuid(int info_type, int info[4]) {
-#if defined(_WIN64)
- __cpuid(info, info_type);
-#else
- __asm {
- mov eax, [info_type]
- cpuid
- mov edi, [info]
- mov [edi], eax
- mov [edi+4], ebx
- mov [edi+8], ecx
- mov [edi+12], edx
- }
-#endif
-}
-#else
-#if defined(__x86_64__)
-static inline void getcpuid(int info_type, int info[4]) {
- asm volatile (
- "cpuid \n\t"
- : "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3])
- : "a"(info_type)
- );
-}
-#else
-static inline void getcpuid(int info_type, int info[4]) {
- // We save and restore ebx, so this code can be compatible with -fPIC
- asm volatile (
- "pushl %%ebx \n\t"
- "cpuid \n\t"
- "movl %%ebx, %1 \n\t"
- "popl %%ebx \n\t"
- : "=a"(info[0]), "=r"(info[1]), "=c"(info[2]), "=d"(info[3])
- : "a"(info_type)
- );
-}
-#endif
-#endif
-
-#if defined(__x86_64__) || defined(_WIN64) || SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
-/* All x86_64 machines have SSE2, or we know it's supported at compile time, so don't even bother checking. */
-static inline bool hasSSE2() {
- return true;
-}
-#else
-
-static inline bool hasSSE2() {
- int cpu_info[4] = { 0 };
- getcpuid(1, cpu_info);
- return (cpu_info[3] & (1<<26)) != 0;
-}
-#endif
-
-#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
-/* If we know SSSE3 is supported at compile time, don't even bother checking. */
-static inline bool hasSSSE3() {
- return true;
-}
-#else
-
-static inline bool hasSSSE3() {
- int cpu_info[4] = { 0 };
- getcpuid(1, cpu_info);
- return (cpu_info[2] & 0x200) != 0;
-}
-#endif
-
-static bool cachedHasSSE2() {
- static bool gHasSSE2 = hasSSE2();
- return gHasSSE2;
-}
-
-static bool cachedHasSSSE3() {
- static bool gHasSSSE3 = hasSSSE3();
- return gHasSSSE3;
-}
-
-SK_CONF_DECLARE( bool, c_hqfilter_sse, "bitmap.filter.highQualitySSE", false, "Use SSE optimized version of high quality image filters");
-
-void SkBitmapProcState::platformConvolutionProcs(SkConvolutionProcs* procs) {
- if (cachedHasSSE2()) {
- procs->fExtraHorizontalReads = 3;
- procs->fConvolveVertically = &convolveVertically_SSE2;
- procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_SSE2;
- procs->fConvolveHorizontally = &convolveHorizontally_SSE2;
- procs->fApplySIMDPadding = &applySIMDPadding_SSE2;
- }
-}
-
-void SkBitmapProcState::platformProcs() {
- if (cachedHasSSSE3()) {
- if (fSampleProc32 == S32_opaque_D32_filter_DX) {
- fSampleProc32 = S32_opaque_D32_filter_DX_SSSE3;
- } else if (fSampleProc32 == S32_alpha_D32_filter_DX) {
- fSampleProc32 = S32_alpha_D32_filter_DX_SSSE3;
- }
-
- if (fSampleProc32 == S32_opaque_D32_filter_DXDY) {
- fSampleProc32 = S32_opaque_D32_filter_DXDY_SSSE3;
- } else if (fSampleProc32 == S32_alpha_D32_filter_DXDY) {
- fSampleProc32 = S32_alpha_D32_filter_DXDY_SSSE3;
- }
- } else if (cachedHasSSE2()) {
- if (fSampleProc32 == S32_opaque_D32_filter_DX) {
- fSampleProc32 = S32_opaque_D32_filter_DX_SSE2;
- } else if (fSampleProc32 == S32_alpha_D32_filter_DX) {
- fSampleProc32 = S32_alpha_D32_filter_DX_SSE2;
- }
-
- if (fSampleProc16 == S32_D16_filter_DX) {
- fSampleProc16 = S32_D16_filter_DX_SSE2;
- }
- }
-
- if (cachedHasSSSE3() || cachedHasSSE2()) {
- if (fMatrixProc == ClampX_ClampY_filter_scale) {
- fMatrixProc = ClampX_ClampY_filter_scale_SSE2;
- } else if (fMatrixProc == ClampX_ClampY_nofilter_scale) {
- fMatrixProc = ClampX_ClampY_nofilter_scale_SSE2;
- }
-
- if (fMatrixProc == ClampX_ClampY_filter_affine) {
- fMatrixProc = ClampX_ClampY_filter_affine_SSE2;
- } else if (fMatrixProc == ClampX_ClampY_nofilter_affine) {
- fMatrixProc = ClampX_ClampY_nofilter_affine_SSE2;
- }
- if (c_hqfilter_sse) {
- if (fShaderProc32 == highQualityFilter32) {
- fShaderProc32 = highQualityFilter_SSE2;
- }
- }
- }
-}
-
-static SkBlitRow::Proc32 platform_32_procs[] = {
- NULL, // S32_Opaque,
- S32_Blend_BlitRow32_SSE2, // S32_Blend,
- S32A_Opaque_BlitRow32_SSE2, // S32A_Opaque
- S32A_Blend_BlitRow32_SSE2, // S32A_Blend,
-};
-
-SkBlitRow::Proc SkBlitRow::PlatformProcs565(unsigned flags) {
- return NULL;
-}
-
-SkBlitRow::ColorProc SkBlitRow::PlatformColorProc() {
- if (cachedHasSSE2()) {
- return Color32_SSE2;
- } else {
- return NULL;
- }
-}
-
-SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
- if (cachedHasSSE2()) {
- return platform_32_procs[flags];
- } else {
- return NULL;
- }
-}
-
-
-SkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkBitmap::Config dstConfig,
- SkMask::Format maskFormat,
- SkColor color) {
- if (SkMask::kA8_Format != maskFormat) {
- return NULL;
- }
-
- ColorProc proc = NULL;
- if (cachedHasSSE2()) {
- switch (dstConfig) {
- case SkBitmap::kARGB_8888_Config:
- // The SSE2 version is not (yet) faster for black, so we check
- // for that.
- if (SK_ColorBLACK != color) {
- proc = SkARGB32_A8_BlitMask_SSE2;
- }
- break;
- default:
- break;
- }
- }
- return proc;
-}
-
-SkBlitMask::BlitLCD16RowProc SkBlitMask::PlatformBlitRowProcs16(bool isOpaque) {
- if (cachedHasSSE2()) {
- if (isOpaque) {
- return SkBlitLCD16OpaqueRow_SSE2;
- } else {
- return SkBlitLCD16Row_SSE2;
- }
- } else {
- return NULL;
- }
-
-}
-SkBlitMask::RowProc SkBlitMask::PlatformRowProcs(SkBitmap::Config dstConfig,
- SkMask::Format maskFormat,
- RowFlags flags) {
- return NULL;
-}
-
-SkMemset16Proc SkMemset16GetPlatformProc() {
- if (cachedHasSSE2()) {
- return sk_memset16_SSE2;
- } else {
- return NULL;
- }
-}
-
-SkMemset32Proc SkMemset32GetPlatformProc() {
- if (cachedHasSSE2()) {
- return sk_memset32_SSE2;
- } else {
- return NULL;
- }
-}
-
-SkMorphologyProc SkMorphologyGetPlatformProc(SkMorphologyProcType type) {
- if (!cachedHasSSE2()) {
- return NULL;
- }
- switch (type) {
- case kDilateX_SkMorphologyProcType:
- return SkDilateX_SSE2;
- case kDilateY_SkMorphologyProcType:
- return SkDilateY_SSE2;
- case kErodeX_SkMorphologyProcType:
- return SkErodeX_SSE2;
- case kErodeY_SkMorphologyProcType:
- return SkErodeY_SSE2;
- default:
- return NULL;
- }
-}
-
-bool SkBoxBlurGetPlatformProcs(SkBoxBlurProc* boxBlurX,
- SkBoxBlurProc* boxBlurY,
- SkBoxBlurProc* boxBlurXY,
- SkBoxBlurProc* boxBlurYX) {
-#ifdef SK_DISABLE_BLUR_DIVISION_OPTIMIZATION
- return false;
-#else
- if (!cachedHasSSE2()) {
- return false;
- }
- return SkBoxBlurGetPlatformProcs_SSE2(boxBlurX, boxBlurY, boxBlurXY, boxBlurYX);
-#endif
-}
-
-SkBlitRow::ColorRectProc PlatformColorRectProcFactory(); // suppress warning
-
-SkBlitRow::ColorRectProc PlatformColorRectProcFactory() {
- if (cachedHasSSE2()) {
- return ColorRect32_SSE2;
- } else {
- return NULL;
- }
-}
diff --git a/chromium/third_party/skia/src/opts/opts_check_arm.cpp b/chromium/third_party/skia/src/opts/opts_check_arm.cpp
deleted file mode 100644
index 3a322aa0e9b..00000000000
--- a/chromium/third_party/skia/src/opts/opts_check_arm.cpp
+++ /dev/null
@@ -1,110 +0,0 @@
-/***************************************************************************
- * Copyright (c) 2010, Code Aurora Forum. All rights reserved.
- * Copyright 2006-2010, The Android Open Source Project
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- ***************************************************************************/
-
-/* Changes:
- * 2011-04-01 ARM
- * Merged the functions from src/opts/opts_check_arm_neon.cpp
- * Modified to return ARM version of memset16 and memset32 if no neon
- * available in the core
- */
-
-#include "SkBlitRow.h"
-#include "SkUtils.h"
-
-#include "SkUtilsArm.h"
-#include "SkMorphology_opts.h"
-#include "SkMorphology_opts_neon.h"
-#include "SkBlurImage_opts_neon.h"
-
-#if defined(SK_CPU_LENDIAN) && !SK_ARM_NEON_IS_NONE
-extern "C" void memset16_neon(uint16_t dst[], uint16_t value, int count);
-extern "C" void memset32_neon(uint32_t dst[], uint32_t value, int count);
-#endif
-
-#if defined(SK_CPU_LENDIAN)
-extern "C" void arm_memset16(uint16_t* dst, uint16_t value, int count);
-extern "C" void arm_memset32(uint32_t* dst, uint32_t value, int count);
-#endif
-
-SkMemset16Proc SkMemset16GetPlatformProc() {
- // FIXME: memset.arm.S is using syntax incompatible with XCode
-#if !defined(SK_CPU_LENDIAN) || defined(SK_BUILD_FOR_IOS)
- return NULL;
-#elif SK_ARM_NEON_IS_DYNAMIC
- if (sk_cpu_arm_has_neon()) {
- return memset16_neon;
- } else {
- return arm_memset16;
- }
-#elif SK_ARM_NEON_IS_ALWAYS
- return memset16_neon;
-#else
- return arm_memset16;
-#endif
-}
-
-SkMemset32Proc SkMemset32GetPlatformProc() {
- // FIXME: memset.arm.S is using syntax incompatible with XCode
-#if !defined(SK_CPU_LENDIAN) || defined(SK_BUILD_FOR_IOS)
- return NULL;
-#elif SK_ARM_NEON_IS_DYNAMIC
- if (sk_cpu_arm_has_neon()) {
- return memset32_neon;
- } else {
- return arm_memset32;
- }
-#elif SK_ARM_NEON_IS_ALWAYS
- return memset32_neon;
-#else
- return arm_memset32;
-#endif
-}
-
-SkBlitRow::ColorRectProc PlatformColorRectProcFactory() {
- return NULL;
-}
-
-SkMorphologyProc SkMorphologyGetPlatformProc(SkMorphologyProcType type) {
-#if SK_ARM_NEON_IS_NONE
- return NULL;
-#else
-#if SK_ARM_NEON_IS_DYNAMIC
- if (!sk_cpu_arm_has_neon()) {
- return NULL;
- }
-#endif
- switch (type) {
- case kDilateX_SkMorphologyProcType:
- return SkDilateX_neon;
- case kDilateY_SkMorphologyProcType:
- return SkDilateY_neon;
- case kErodeX_SkMorphologyProcType:
- return SkErodeX_neon;
- case kErodeY_SkMorphologyProcType:
- return SkErodeY_neon;
- default:
- return NULL;
- }
-#endif
-}
-
-bool SkBoxBlurGetPlatformProcs(SkBoxBlurProc* boxBlurX,
- SkBoxBlurProc* boxBlurY,
- SkBoxBlurProc* boxBlurXY,
- SkBoxBlurProc* boxBlurYX) {
-#if SK_ARM_NEON_IS_NONE
- return false;
-#else
-#if SK_ARM_NEON_IS_DYNAMIC
- if (!sk_cpu_arm_has_neon()) {
- return false;
- }
-#endif
- return SkBoxBlurGetPlatformProcs_NEON(boxBlurX, boxBlurY, boxBlurXY, boxBlurYX);
-#endif
-}
diff --git a/chromium/third_party/skia/src/opts/opts_check_x86.cpp b/chromium/third_party/skia/src/opts/opts_check_x86.cpp
new file mode 100644
index 00000000000..6af47729cd0
--- /dev/null
+++ b/chromium/third_party/skia/src/opts/opts_check_x86.cpp
@@ -0,0 +1,379 @@
+/*
+ * Copyright 2009 The Android Open Source Project
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "SkBitmapFilter_opts_SSE2.h"
+#include "SkBitmapProcState_opts_SSE2.h"
+#include "SkBitmapProcState_opts_SSSE3.h"
+#include "SkBlitMask.h"
+#include "SkBlitRect_opts_SSE2.h"
+#include "SkBlitRow.h"
+#include "SkBlitRow_opts_SSE2.h"
+#include "SkBlurImage_opts_SSE2.h"
+#include "SkMorphology_opts.h"
+#include "SkMorphology_opts_SSE2.h"
+#include "SkRTConf.h"
+#include "SkUtils.h"
+#include "SkUtils_opts_SSE2.h"
+#include "SkXfermode.h"
+#include "SkXfermode_proccoeff.h"
+
+#if defined(_MSC_VER) && defined(_WIN64)
+#include <intrin.h>
+#endif
+
+/* This file must *not* be compiled with -msse or any other optional SIMD
+ extension, otherwise gcc may generate SIMD instructions even for scalar ops
+   (and thus hit an invalid instruction on a Pentium 3 in the code below).
+ For example, only files named *_SSE2.cpp in this directory should be
+ compiled with -msse2 or higher. */
+
+
+/* Function to get the CPU's SSE level at run-time, for different compilers. */
+#ifdef _MSC_VER
+static inline void getcpuid(int info_type, int info[4]) {
+#if defined(_WIN64)
+ __cpuid(info, info_type);
+#else
+ __asm {
+ mov eax, [info_type]
+ cpuid
+ mov edi, [info]
+ mov [edi], eax
+ mov [edi+4], ebx
+ mov [edi+8], ecx
+ mov [edi+12], edx
+ }
+#endif
+}
+#elif defined(__x86_64__)
+static inline void getcpuid(int info_type, int info[4]) {
+ asm volatile (
+ "cpuid \n\t"
+ : "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3])
+ : "a"(info_type)
+ );
+}
+#else
+static inline void getcpuid(int info_type, int info[4]) {
+ // We save and restore ebx, so this code can be compatible with -fPIC
+ asm volatile (
+ "pushl %%ebx \n\t"
+ "cpuid \n\t"
+ "movl %%ebx, %1 \n\t"
+ "popl %%ebx \n\t"
+ : "=a"(info[0]), "=r"(info[1]), "=c"(info[2]), "=d"(info[3])
+ : "a"(info_type)
+ );
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+
+/* Fetch the SIMD level directly from the CPU, at run-time.
+ * Only checks the levels needed by the optimizations in this file.
+ */
+static int get_SIMD_level() {
+ int cpu_info[4] = { 0 };
+
+ getcpuid(1, cpu_info);
+ if ((cpu_info[2] & (1<<20)) != 0) {
+ return SK_CPU_SSE_LEVEL_SSE42;
+ } else if ((cpu_info[2] & (1<<9)) != 0) {
+ return SK_CPU_SSE_LEVEL_SSSE3;
+ } else if ((cpu_info[3] & (1<<26)) != 0) {
+ return SK_CPU_SSE_LEVEL_SSE2;
+ } else {
+ return 0;
+ }
+}
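The bit positions tested above come from CPUID leaf 1; the mapping (taken from the Intel/AMD manuals, not from this change) is:

// getcpuid(1, cpu_info) output consulted by get_SIMD_level():
//   cpu_info[2] (ECX) bit 20 -> SSE4.2
//   cpu_info[2] (ECX) bit  9 -> SSSE3
//   cpu_info[3] (EDX) bit 26 -> SSE2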
+
+/* Verify that the requested SIMD level is supported in the build.
+ * If not, check at run-time whether the CPU supports it.
+ */
+static inline bool supports_simd(int minLevel) {
+#if defined(SK_CPU_SSE_LEVEL)
+ if (minLevel <= SK_CPU_SSE_LEVEL) {
+ return true;
+ } else
+#endif
+ {
+#if defined(SK_BUILD_FOR_ANDROID_FRAMEWORK)
+ /* For the Android framework we should always know at compile time if the device
+ * we are building for supports SSSE3. The one exception to this rule is on the
+ * emulator where we are compiled without the -mssse3 option (so we have no
+ * SSSE3 procs) but can be run on a host machine that supports SSSE3
+ * instructions. So for that particular case we disable our SSSE3 options.
+ */
+ return false;
+#else
+ static int gSIMDLevel = get_SIMD_level();
+ return (minLevel <= gSIMDLevel);
+#endif
+ }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+SK_CONF_DECLARE( bool, c_hqfilter_sse, "bitmap.filter.highQualitySSE", false, "Use SSE optimized version of high quality image filters");
+
+void SkBitmapProcState::platformConvolutionProcs(SkConvolutionProcs* procs) {
+ if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
+ procs->fExtraHorizontalReads = 3;
+ procs->fConvolveVertically = &convolveVertically_SSE2;
+ procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_SSE2;
+ procs->fConvolveHorizontally = &convolveHorizontally_SSE2;
+ procs->fApplySIMDPadding = &applySIMDPadding_SSE2;
+ }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+void SkBitmapProcState::platformProcs() {
+ /* Every optimization in the function requires at least SSE2 */
+ if (!supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
+ return;
+ }
+
+ /* Check fSampleProc32 */
+ if (fSampleProc32 == S32_opaque_D32_filter_DX) {
+ if (supports_simd(SK_CPU_SSE_LEVEL_SSSE3)) {
+ fSampleProc32 = S32_opaque_D32_filter_DX_SSSE3;
+ } else {
+ fSampleProc32 = S32_opaque_D32_filter_DX_SSE2;
+ }
+ } else if (fSampleProc32 == S32_opaque_D32_filter_DXDY) {
+ if (supports_simd(SK_CPU_SSE_LEVEL_SSSE3)) {
+ fSampleProc32 = S32_opaque_D32_filter_DXDY_SSSE3;
+ }
+ } else if (fSampleProc32 == S32_alpha_D32_filter_DX) {
+ if (supports_simd(SK_CPU_SSE_LEVEL_SSSE3)) {
+ fSampleProc32 = S32_alpha_D32_filter_DX_SSSE3;
+ } else {
+ fSampleProc32 = S32_alpha_D32_filter_DX_SSE2;
+ }
+ } else if (fSampleProc32 == S32_alpha_D32_filter_DXDY) {
+ if (supports_simd(SK_CPU_SSE_LEVEL_SSSE3)) {
+ fSampleProc32 = S32_alpha_D32_filter_DXDY_SSSE3;
+ }
+ }
+
+ /* Check fSampleProc16 */
+ if (fSampleProc16 == S32_D16_filter_DX) {
+ fSampleProc16 = S32_D16_filter_DX_SSE2;
+ }
+
+ /* Check fMatrixProc */
+ if (fMatrixProc == ClampX_ClampY_filter_scale) {
+ fMatrixProc = ClampX_ClampY_filter_scale_SSE2;
+ } else if (fMatrixProc == ClampX_ClampY_nofilter_scale) {
+ fMatrixProc = ClampX_ClampY_nofilter_scale_SSE2;
+ } else if (fMatrixProc == ClampX_ClampY_filter_affine) {
+ fMatrixProc = ClampX_ClampY_filter_affine_SSE2;
+ } else if (fMatrixProc == ClampX_ClampY_nofilter_affine) {
+ fMatrixProc = ClampX_ClampY_nofilter_affine_SSE2;
+ }
+
+ /* Check fShaderProc32 */
+ if (c_hqfilter_sse) {
+ if (fShaderProc32 == highQualityFilter32) {
+ fShaderProc32 = highQualityFilter_SSE2;
+ }
+ }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+static SkBlitRow::Proc platform_16_procs[] = {
+ S32_D565_Opaque_SSE2, // S32_D565_Opaque
+ NULL, // S32_D565_Blend
+ S32A_D565_Opaque_SSE2, // S32A_D565_Opaque
+ NULL, // S32A_D565_Blend
+ S32_D565_Opaque_Dither_SSE2, // S32_D565_Opaque_Dither
+ NULL, // S32_D565_Blend_Dither
+ S32A_D565_Opaque_Dither_SSE2, // S32A_D565_Opaque_Dither
+ NULL, // S32A_D565_Blend_Dither
+};
+
+SkBlitRow::Proc SkBlitRow::PlatformProcs565(unsigned flags) {
+ if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
+ return platform_16_procs[flags];
+ } else {
+ return NULL;
+ }
+}
+
+static SkBlitRow::Proc32 platform_32_procs[] = {
+ NULL, // S32_Opaque,
+ S32_Blend_BlitRow32_SSE2, // S32_Blend,
+ S32A_Opaque_BlitRow32_SSE2, // S32A_Opaque
+ S32A_Blend_BlitRow32_SSE2, // S32A_Blend,
+};
+
+SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
+ if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
+ return platform_32_procs[flags];
+ } else {
+ return NULL;
+ }
+}
+
+SkBlitRow::ColorProc SkBlitRow::PlatformColorProc() {
+ if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
+ return Color32_SSE2;
+ } else {
+ return NULL;
+ }
+}
+
+SkBlitRow::ColorRectProc PlatformColorRectProcFactory(); // suppress warning
+
+SkBlitRow::ColorRectProc PlatformColorRectProcFactory() {
+/* Return NULL for now, since the optimized path in ColorRect32_SSE2 is disabled.
+ if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
+ return ColorRect32_SSE2;
+ } else {
+ return NULL;
+ }
+*/
+ return NULL;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+SkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkColorType dstCT,
+ SkMask::Format maskFormat,
+ SkColor color) {
+ if (SkMask::kA8_Format != maskFormat) {
+ return NULL;
+ }
+
+ ColorProc proc = NULL;
+ if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
+ switch (dstCT) {
+ case kN32_SkColorType:
+ // The SSE2 version is not (yet) faster for black, so we check
+ // for that.
+ if (SK_ColorBLACK != color) {
+ proc = SkARGB32_A8_BlitMask_SSE2;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+ return proc;
+}
+
+SkBlitMask::BlitLCD16RowProc SkBlitMask::PlatformBlitRowProcs16(bool isOpaque) {
+ if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
+ if (isOpaque) {
+ return SkBlitLCD16OpaqueRow_SSE2;
+ } else {
+ return SkBlitLCD16Row_SSE2;
+ }
+ } else {
+ return NULL;
+ }
+}
+
+SkBlitMask::RowProc SkBlitMask::PlatformRowProcs(SkColorType, SkMask::Format, RowFlags) {
+ return NULL;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+SkMemset16Proc SkMemset16GetPlatformProc() {
+ if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
+ return sk_memset16_SSE2;
+ } else {
+ return NULL;
+ }
+}
+
+SkMemset32Proc SkMemset32GetPlatformProc() {
+ if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
+ return sk_memset32_SSE2;
+ } else {
+ return NULL;
+ }
+}
+
+SkMemcpy32Proc SkMemcpy32GetPlatformProc() {
+ if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
+ return sk_memcpy32_SSE2;
+ } else {
+ return NULL;
+ }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+SkMorphologyImageFilter::Proc SkMorphologyGetPlatformProc(SkMorphologyProcType type) {
+ if (!supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
+ return NULL;
+ }
+ switch (type) {
+ case kDilateX_SkMorphologyProcType:
+ return SkDilateX_SSE2;
+ case kDilateY_SkMorphologyProcType:
+ return SkDilateY_SSE2;
+ case kErodeX_SkMorphologyProcType:
+ return SkErodeX_SSE2;
+ case kErodeY_SkMorphologyProcType:
+ return SkErodeY_SSE2;
+ default:
+ return NULL;
+ }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+bool SkBoxBlurGetPlatformProcs(SkBoxBlurProc* boxBlurX,
+ SkBoxBlurProc* boxBlurY,
+ SkBoxBlurProc* boxBlurXY,
+ SkBoxBlurProc* boxBlurYX) {
+#ifdef SK_DISABLE_BLUR_DIVISION_OPTIMIZATION
+ return false;
+#else
+ if (!supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
+ return false;
+ }
+ return SkBoxBlurGetPlatformProcs_SSE2(boxBlurX, boxBlurY, boxBlurXY, boxBlurYX);
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+extern SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl_SSE2(const ProcCoeff& rec,
+ SkXfermode::Mode mode);
+
+SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl(const ProcCoeff& rec,
+ SkXfermode::Mode mode);
+
+SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl(const ProcCoeff& rec,
+ SkXfermode::Mode mode) {
+ return NULL;
+}
+
+SkProcCoeffXfermode* SkPlatformXfermodeFactory(const ProcCoeff& rec,
+ SkXfermode::Mode mode);
+
+SkProcCoeffXfermode* SkPlatformXfermodeFactory(const ProcCoeff& rec,
+ SkXfermode::Mode mode) {
+ if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
+ return SkPlatformXfermodeFactory_impl_SSE2(rec, mode);
+ } else {
+ return SkPlatformXfermodeFactory_impl(rec, mode);
+ }
+}
+
+SkXfermodeProc SkPlatformXfermodeProcFactory(SkXfermode::Mode mode);
+
+SkXfermodeProc SkPlatformXfermodeProcFactory(SkXfermode::Mode mode) {
+ return NULL;
+}