diff options
author | Jocelyn Turcotte <jocelyn.turcotte@digia.com> | 2014-08-08 14:30:41 +0200 |
---|---|---|
committer | Jocelyn Turcotte <jocelyn.turcotte@digia.com> | 2014-08-12 13:49:54 +0200 |
commit | ab0a50979b9eb4dfa3320eff7e187e41efedf7a9 (patch) | |
tree | 498dfb8a97ff3361a9f7486863a52bb4e26bb898 /chromium/third_party/skia/src/opts | |
parent | 4ce69f7403811819800e7c5ae1318b2647e778d1 (diff) |
Update Chromium to beta version 37.0.2062.68
Change-Id: I188e3b5aff1bec75566014291b654eb19f5bc8ca
Reviewed-by: Andras Becsi <andras.becsi@digia.com>
Diffstat (limited to 'chromium/third_party/skia/src/opts')
44 files changed, 4870 insertions, 2796 deletions
diff --git a/chromium/third_party/skia/src/opts/SkBitmapFilter_opts_SSE2.cpp b/chromium/third_party/skia/src/opts/SkBitmapFilter_opts_SSE2.cpp index 259e2efc0ec..b0405669218 100644 --- a/chromium/third_party/skia/src/opts/SkBitmapFilter_opts_SSE2.cpp +++ b/chromium/third_party/skia/src/opts/SkBitmapFilter_opts_SSE2.cpp @@ -5,17 +5,15 @@ * found in the LICENSE file. */ -#include "SkBitmapProcState.h" +#include <emmintrin.h> #include "SkBitmap.h" +#include "SkBitmapFilter_opts_SSE2.h" +#include "SkBitmapProcState.h" #include "SkColor.h" #include "SkColorPriv.h" -#include "SkUnPreMultiply.h" -#include "SkShader.h" #include "SkConvolver.h" - -#include "SkBitmapFilter_opts_SSE2.h" - -#include <emmintrin.h> +#include "SkShader.h" +#include "SkUnPreMultiply.h" #if 0 static inline void print128i(__m128i value) { @@ -175,7 +173,6 @@ void highQualityFilter_ScaleOnly_SSE2(const SkBitmapProcState &s, int x, int y, s.fInvProc(s.fInvMatrix, SkIntToScalar(x), SkIntToScalar(y), &srcPt); - } } @@ -185,126 +182,126 @@ void convolveHorizontally_SSE2(const unsigned char* src_data, const SkConvolutionFilter1D& filter, unsigned char* out_row, bool /*has_alpha*/) { - int num_values = filter.numValues(); - - int filter_offset, filter_length; - __m128i zero = _mm_setzero_si128(); - __m128i mask[4]; - // |mask| will be used to decimate all extra filter coefficients that are - // loaded by SIMD when |filter_length| is not divisible by 4. - // mask[0] is not used in following algorithm. - mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1); - mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1); - mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1); - - // Output one pixel each iteration, calculating all channels (RGBA) together. - for (int out_x = 0; out_x < num_values; out_x++) { - const SkConvolutionFilter1D::ConvolutionFixed* filter_values = - filter.FilterForValue(out_x, &filter_offset, &filter_length); - - __m128i accum = _mm_setzero_si128(); - - // Compute the first pixel in this row that the filter affects. It will - // touch |filter_length| pixels (4 bytes each) after this. - const __m128i* row_to_filter = - reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]); - - // We will load and accumulate with four coefficients per iteration. - for (int filter_x = 0; filter_x < filter_length >> 2; filter_x++) { - - // Load 4 coefficients => duplicate 1st and 2nd of them for all channels. - __m128i coeff, coeff16; - // [16] xx xx xx xx c3 c2 c1 c0 - coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); - // [16] xx xx xx xx c1 c1 c0 c0 - coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); - // [16] c1 c1 c1 c1 c0 c0 c0 c0 - coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); - - // Load four pixels => unpack the first two pixels to 16 bits => - // multiply with coefficients => accumulate the convolution result. - // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 - __m128i src8 = _mm_loadu_si128(row_to_filter); - // [16] a1 b1 g1 r1 a0 b0 g0 r0 - __m128i src16 = _mm_unpacklo_epi8(src8, zero); - __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); - __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); - // [32] a0*c0 b0*c0 g0*c0 r0*c0 - __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); - accum = _mm_add_epi32(accum, t); - // [32] a1*c1 b1*c1 g1*c1 r1*c1 - t = _mm_unpackhi_epi16(mul_lo, mul_hi); - accum = _mm_add_epi32(accum, t); - - // Duplicate 3rd and 4th coefficients for all channels => - // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients - // => accumulate the convolution results. - // [16] xx xx xx xx c3 c3 c2 c2 - coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); - // [16] c3 c3 c3 c3 c2 c2 c2 c2 - coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); - // [16] a3 g3 b3 r3 a2 g2 b2 r2 - src16 = _mm_unpackhi_epi8(src8, zero); - mul_hi = _mm_mulhi_epi16(src16, coeff16); - mul_lo = _mm_mullo_epi16(src16, coeff16); - // [32] a2*c2 b2*c2 g2*c2 r2*c2 - t = _mm_unpacklo_epi16(mul_lo, mul_hi); - accum = _mm_add_epi32(accum, t); - // [32] a3*c3 b3*c3 g3*c3 r3*c3 - t = _mm_unpackhi_epi16(mul_lo, mul_hi); - accum = _mm_add_epi32(accum, t); - - // Advance the pixel and coefficients pointers. - row_to_filter += 1; - filter_values += 4; - } + int num_values = filter.numValues(); + + int filter_offset, filter_length; + __m128i zero = _mm_setzero_si128(); + __m128i mask[4]; + // |mask| will be used to decimate all extra filter coefficients that are + // loaded by SIMD when |filter_length| is not divisible by 4. + // mask[0] is not used in following algorithm. + mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1); + mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1); + mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1); + + // Output one pixel each iteration, calculating all channels (RGBA) together. + for (int out_x = 0; out_x < num_values; out_x++) { + const SkConvolutionFilter1D::ConvolutionFixed* filter_values = + filter.FilterForValue(out_x, &filter_offset, &filter_length); + + __m128i accum = _mm_setzero_si128(); + + // Compute the first pixel in this row that the filter affects. It will + // touch |filter_length| pixels (4 bytes each) after this. + const __m128i* row_to_filter = + reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]); + + // We will load and accumulate with four coefficients per iteration. + for (int filter_x = 0; filter_x < filter_length >> 2; filter_x++) { + + // Load 4 coefficients => duplicate 1st and 2nd of them for all channels. + __m128i coeff, coeff16; + // [16] xx xx xx xx c3 c2 c1 c0 + coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); + // [16] xx xx xx xx c1 c1 c0 c0 + coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); + // [16] c1 c1 c1 c1 c0 c0 c0 c0 + coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); + + // Load four pixels => unpack the first two pixels to 16 bits => + // multiply with coefficients => accumulate the convolution result. + // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 + __m128i src8 = _mm_loadu_si128(row_to_filter); + // [16] a1 b1 g1 r1 a0 b0 g0 r0 + __m128i src16 = _mm_unpacklo_epi8(src8, zero); + __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); + __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); + // [32] a0*c0 b0*c0 g0*c0 r0*c0 + __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); + accum = _mm_add_epi32(accum, t); + // [32] a1*c1 b1*c1 g1*c1 r1*c1 + t = _mm_unpackhi_epi16(mul_lo, mul_hi); + accum = _mm_add_epi32(accum, t); + + // Duplicate 3rd and 4th coefficients for all channels => + // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients + // => accumulate the convolution results. + // [16] xx xx xx xx c3 c3 c2 c2 + coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); + // [16] c3 c3 c3 c3 c2 c2 c2 c2 + coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); + // [16] a3 g3 b3 r3 a2 g2 b2 r2 + src16 = _mm_unpackhi_epi8(src8, zero); + mul_hi = _mm_mulhi_epi16(src16, coeff16); + mul_lo = _mm_mullo_epi16(src16, coeff16); + // [32] a2*c2 b2*c2 g2*c2 r2*c2 + t = _mm_unpacklo_epi16(mul_lo, mul_hi); + accum = _mm_add_epi32(accum, t); + // [32] a3*c3 b3*c3 g3*c3 r3*c3 + t = _mm_unpackhi_epi16(mul_lo, mul_hi); + accum = _mm_add_epi32(accum, t); + + // Advance the pixel and coefficients pointers. + row_to_filter += 1; + filter_values += 4; + } - // When |filter_length| is not divisible by 4, we need to decimate some of - // the filter coefficient that was loaded incorrectly to zero; Other than - // that the algorithm is same with above, exceot that the 4th pixel will be - // always absent. - int r = filter_length&3; - if (r) { - // Note: filter_values must be padded to align_up(filter_offset, 8). - __m128i coeff, coeff16; - coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); - // Mask out extra filter taps. - coeff = _mm_and_si128(coeff, mask[r]); - coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); - coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); - - // Note: line buffer must be padded to align_up(filter_offset, 16). - // We resolve this by use C-version for the last horizontal line. - __m128i src8 = _mm_loadu_si128(row_to_filter); - __m128i src16 = _mm_unpacklo_epi8(src8, zero); - __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); - __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); - __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); - accum = _mm_add_epi32(accum, t); - t = _mm_unpackhi_epi16(mul_lo, mul_hi); - accum = _mm_add_epi32(accum, t); - - src16 = _mm_unpackhi_epi8(src8, zero); - coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); - coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); - mul_hi = _mm_mulhi_epi16(src16, coeff16); - mul_lo = _mm_mullo_epi16(src16, coeff16); - t = _mm_unpacklo_epi16(mul_lo, mul_hi); - accum = _mm_add_epi32(accum, t); - } + // When |filter_length| is not divisible by 4, we need to decimate some of + // the filter coefficient that was loaded incorrectly to zero; Other than + // that the algorithm is same with above, exceot that the 4th pixel will be + // always absent. + int r = filter_length&3; + if (r) { + // Note: filter_values must be padded to align_up(filter_offset, 8). + __m128i coeff, coeff16; + coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); + // Mask out extra filter taps. + coeff = _mm_and_si128(coeff, mask[r]); + coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); + coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); + + // Note: line buffer must be padded to align_up(filter_offset, 16). + // We resolve this by use C-version for the last horizontal line. + __m128i src8 = _mm_loadu_si128(row_to_filter); + __m128i src16 = _mm_unpacklo_epi8(src8, zero); + __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); + __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); + __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); + accum = _mm_add_epi32(accum, t); + t = _mm_unpackhi_epi16(mul_lo, mul_hi); + accum = _mm_add_epi32(accum, t); + + src16 = _mm_unpackhi_epi8(src8, zero); + coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); + coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); + mul_hi = _mm_mulhi_epi16(src16, coeff16); + mul_lo = _mm_mullo_epi16(src16, coeff16); + t = _mm_unpacklo_epi16(mul_lo, mul_hi); + accum = _mm_add_epi32(accum, t); + } - // Shift right for fixed point implementation. - accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits); + // Shift right for fixed point implementation. + accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits); - // Packing 32 bits |accum| to 16 bits per channel (signed saturation). - accum = _mm_packs_epi32(accum, zero); - // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation). - accum = _mm_packus_epi16(accum, zero); + // Packing 32 bits |accum| to 16 bits per channel (signed saturation). + accum = _mm_packs_epi32(accum, zero); + // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation). + accum = _mm_packus_epi16(accum, zero); - // Store the pixel value of 32 bits. - *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum); - out_row += 4; - } + // Store the pixel value of 32 bits. + *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum); + out_row += 4; + } } // Convolves horizontally along four rows. The row data is given in @@ -314,116 +311,116 @@ void convolveHorizontally_SSE2(const unsigned char* src_data, void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4], const SkConvolutionFilter1D& filter, unsigned char* out_row[4]) { - int num_values = filter.numValues(); - - int filter_offset, filter_length; - __m128i zero = _mm_setzero_si128(); - __m128i mask[4]; - // |mask| will be used to decimate all extra filter coefficients that are - // loaded by SIMD when |filter_length| is not divisible by 4. - // mask[0] is not used in following algorithm. - mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1); - mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1); - mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1); - - // Output one pixel each iteration, calculating all channels (RGBA) together. - for (int out_x = 0; out_x < num_values; out_x++) { - const SkConvolutionFilter1D::ConvolutionFixed* filter_values = - filter.FilterForValue(out_x, &filter_offset, &filter_length); - - // four pixels in a column per iteration. - __m128i accum0 = _mm_setzero_si128(); - __m128i accum1 = _mm_setzero_si128(); - __m128i accum2 = _mm_setzero_si128(); - __m128i accum3 = _mm_setzero_si128(); - int start = (filter_offset<<2); - // We will load and accumulate with four coefficients per iteration. - for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) { - __m128i coeff, coeff16lo, coeff16hi; - // [16] xx xx xx xx c3 c2 c1 c0 - coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); - // [16] xx xx xx xx c1 c1 c0 c0 - coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); - // [16] c1 c1 c1 c1 c0 c0 c0 c0 - coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo); - // [16] xx xx xx xx c3 c3 c2 c2 - coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); - // [16] c3 c3 c3 c3 c2 c2 c2 c2 - coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi); - - __m128i src8, src16, mul_hi, mul_lo, t; - -#define ITERATION(src, accum) \ - src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)); \ - src16 = _mm_unpacklo_epi8(src8, zero); \ - mul_hi = _mm_mulhi_epi16(src16, coeff16lo); \ - mul_lo = _mm_mullo_epi16(src16, coeff16lo); \ - t = _mm_unpacklo_epi16(mul_lo, mul_hi); \ - accum = _mm_add_epi32(accum, t); \ - t = _mm_unpackhi_epi16(mul_lo, mul_hi); \ - accum = _mm_add_epi32(accum, t); \ - src16 = _mm_unpackhi_epi8(src8, zero); \ - mul_hi = _mm_mulhi_epi16(src16, coeff16hi); \ - mul_lo = _mm_mullo_epi16(src16, coeff16hi); \ - t = _mm_unpacklo_epi16(mul_lo, mul_hi); \ - accum = _mm_add_epi32(accum, t); \ - t = _mm_unpackhi_epi16(mul_lo, mul_hi); \ - accum = _mm_add_epi32(accum, t) - - ITERATION(src_data[0] + start, accum0); - ITERATION(src_data[1] + start, accum1); - ITERATION(src_data[2] + start, accum2); - ITERATION(src_data[3] + start, accum3); - - start += 16; - filter_values += 4; - } + int num_values = filter.numValues(); + + int filter_offset, filter_length; + __m128i zero = _mm_setzero_si128(); + __m128i mask[4]; + // |mask| will be used to decimate all extra filter coefficients that are + // loaded by SIMD when |filter_length| is not divisible by 4. + // mask[0] is not used in following algorithm. + mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1); + mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1); + mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1); + + // Output one pixel each iteration, calculating all channels (RGBA) together. + for (int out_x = 0; out_x < num_values; out_x++) { + const SkConvolutionFilter1D::ConvolutionFixed* filter_values = + filter.FilterForValue(out_x, &filter_offset, &filter_length); + + // four pixels in a column per iteration. + __m128i accum0 = _mm_setzero_si128(); + __m128i accum1 = _mm_setzero_si128(); + __m128i accum2 = _mm_setzero_si128(); + __m128i accum3 = _mm_setzero_si128(); + int start = (filter_offset<<2); + // We will load and accumulate with four coefficients per iteration. + for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) { + __m128i coeff, coeff16lo, coeff16hi; + // [16] xx xx xx xx c3 c2 c1 c0 + coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); + // [16] xx xx xx xx c1 c1 c0 c0 + coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); + // [16] c1 c1 c1 c1 c0 c0 c0 c0 + coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo); + // [16] xx xx xx xx c3 c3 c2 c2 + coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); + // [16] c3 c3 c3 c3 c2 c2 c2 c2 + coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi); + + __m128i src8, src16, mul_hi, mul_lo, t; + +#define ITERATION(src, accum) \ + src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)); \ + src16 = _mm_unpacklo_epi8(src8, zero); \ + mul_hi = _mm_mulhi_epi16(src16, coeff16lo); \ + mul_lo = _mm_mullo_epi16(src16, coeff16lo); \ + t = _mm_unpacklo_epi16(mul_lo, mul_hi); \ + accum = _mm_add_epi32(accum, t); \ + t = _mm_unpackhi_epi16(mul_lo, mul_hi); \ + accum = _mm_add_epi32(accum, t); \ + src16 = _mm_unpackhi_epi8(src8, zero); \ + mul_hi = _mm_mulhi_epi16(src16, coeff16hi); \ + mul_lo = _mm_mullo_epi16(src16, coeff16hi); \ + t = _mm_unpacklo_epi16(mul_lo, mul_hi); \ + accum = _mm_add_epi32(accum, t); \ + t = _mm_unpackhi_epi16(mul_lo, mul_hi); \ + accum = _mm_add_epi32(accum, t) + + ITERATION(src_data[0] + start, accum0); + ITERATION(src_data[1] + start, accum1); + ITERATION(src_data[2] + start, accum2); + ITERATION(src_data[3] + start, accum3); + + start += 16; + filter_values += 4; + } - int r = filter_length & 3; - if (r) { - // Note: filter_values must be padded to align_up(filter_offset, 8); - __m128i coeff; - coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); - // Mask out extra filter taps. - coeff = _mm_and_si128(coeff, mask[r]); - - __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); - /* c1 c1 c1 c1 c0 c0 c0 c0 */ - coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo); - __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); - coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi); - - __m128i src8, src16, mul_hi, mul_lo, t; - - ITERATION(src_data[0] + start, accum0); - ITERATION(src_data[1] + start, accum1); - ITERATION(src_data[2] + start, accum2); - ITERATION(src_data[3] + start, accum3); - } + int r = filter_length & 3; + if (r) { + // Note: filter_values must be padded to align_up(filter_offset, 8); + __m128i coeff; + coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); + // Mask out extra filter taps. + coeff = _mm_and_si128(coeff, mask[r]); + + __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); + /* c1 c1 c1 c1 c0 c0 c0 c0 */ + coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo); + __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); + coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi); + + __m128i src8, src16, mul_hi, mul_lo, t; + + ITERATION(src_data[0] + start, accum0); + ITERATION(src_data[1] + start, accum1); + ITERATION(src_data[2] + start, accum2); + ITERATION(src_data[3] + start, accum3); + } - accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); - accum0 = _mm_packs_epi32(accum0, zero); - accum0 = _mm_packus_epi16(accum0, zero); - accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); - accum1 = _mm_packs_epi32(accum1, zero); - accum1 = _mm_packus_epi16(accum1, zero); - accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); - accum2 = _mm_packs_epi32(accum2, zero); - accum2 = _mm_packus_epi16(accum2, zero); - accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits); - accum3 = _mm_packs_epi32(accum3, zero); - accum3 = _mm_packus_epi16(accum3, zero); - - *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0); - *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1); - *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2); - *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3); - - out_row[0] += 4; - out_row[1] += 4; - out_row[2] += 4; - out_row[3] += 4; - } + accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); + accum0 = _mm_packs_epi32(accum0, zero); + accum0 = _mm_packus_epi16(accum0, zero); + accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); + accum1 = _mm_packs_epi32(accum1, zero); + accum1 = _mm_packus_epi16(accum1, zero); + accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); + accum2 = _mm_packs_epi32(accum2, zero); + accum2 = _mm_packus_epi16(accum2, zero); + accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits); + accum3 = _mm_packs_epi32(accum3, zero); + accum3 = _mm_packus_epi16(accum3, zero); + + *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0); + *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1); + *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2); + *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3); + + out_row[0] += 4; + out_row[1] += 4; + out_row[2] += 4; + out_row[3] += 4; + } } // Does vertical convolution to produce one output row. The filter values and @@ -438,166 +435,166 @@ void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filt unsigned char* const* source_data_rows, int pixel_width, unsigned char* out_row) { - int width = pixel_width & ~3; - - __m128i zero = _mm_setzero_si128(); - __m128i accum0, accum1, accum2, accum3, coeff16; - const __m128i* src; - // Output four pixels per iteration (16 bytes). - for (int out_x = 0; out_x < width; out_x += 4) { - - // Accumulated result for each pixel. 32 bits per RGBA channel. - accum0 = _mm_setzero_si128(); - accum1 = _mm_setzero_si128(); - accum2 = _mm_setzero_si128(); - accum3 = _mm_setzero_si128(); - - // Convolve with one filter coefficient per iteration. - for (int filter_y = 0; filter_y < filter_length; filter_y++) { - - // Duplicate the filter coefficient 8 times. - // [16] cj cj cj cj cj cj cj cj - coeff16 = _mm_set1_epi16(filter_values[filter_y]); - - // Load four pixels (16 bytes) together. - // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 - src = reinterpret_cast<const __m128i*>( - &source_data_rows[filter_y][out_x << 2]); - __m128i src8 = _mm_loadu_si128(src); - - // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channels => - // multiply with current coefficient => accumulate the result. - // [16] a1 b1 g1 r1 a0 b0 g0 r0 - __m128i src16 = _mm_unpacklo_epi8(src8, zero); - __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); - __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); - // [32] a0 b0 g0 r0 - __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); - accum0 = _mm_add_epi32(accum0, t); - // [32] a1 b1 g1 r1 - t = _mm_unpackhi_epi16(mul_lo, mul_hi); - accum1 = _mm_add_epi32(accum1, t); - - // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channels => - // multiply with current coefficient => accumulate the result. - // [16] a3 b3 g3 r3 a2 b2 g2 r2 - src16 = _mm_unpackhi_epi8(src8, zero); - mul_hi = _mm_mulhi_epi16(src16, coeff16); - mul_lo = _mm_mullo_epi16(src16, coeff16); - // [32] a2 b2 g2 r2 - t = _mm_unpacklo_epi16(mul_lo, mul_hi); - accum2 = _mm_add_epi32(accum2, t); - // [32] a3 b3 g3 r3 - t = _mm_unpackhi_epi16(mul_lo, mul_hi); - accum3 = _mm_add_epi32(accum3, t); - } - - // Shift right for fixed point implementation. - accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); - accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); - accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); - accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits); - - // Packing 32 bits |accum| to 16 bits per channel (signed saturation). - // [16] a1 b1 g1 r1 a0 b0 g0 r0 - accum0 = _mm_packs_epi32(accum0, accum1); - // [16] a3 b3 g3 r3 a2 b2 g2 r2 - accum2 = _mm_packs_epi32(accum2, accum3); + int width = pixel_width & ~3; + + __m128i zero = _mm_setzero_si128(); + __m128i accum0, accum1, accum2, accum3, coeff16; + const __m128i* src; + // Output four pixels per iteration (16 bytes). + for (int out_x = 0; out_x < width; out_x += 4) { + + // Accumulated result for each pixel. 32 bits per RGBA channel. + accum0 = _mm_setzero_si128(); + accum1 = _mm_setzero_si128(); + accum2 = _mm_setzero_si128(); + accum3 = _mm_setzero_si128(); + + // Convolve with one filter coefficient per iteration. + for (int filter_y = 0; filter_y < filter_length; filter_y++) { + + // Duplicate the filter coefficient 8 times. + // [16] cj cj cj cj cj cj cj cj + coeff16 = _mm_set1_epi16(filter_values[filter_y]); + + // Load four pixels (16 bytes) together. + // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 + src = reinterpret_cast<const __m128i*>( + &source_data_rows[filter_y][out_x << 2]); + __m128i src8 = _mm_loadu_si128(src); + + // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channels => + // multiply with current coefficient => accumulate the result. + // [16] a1 b1 g1 r1 a0 b0 g0 r0 + __m128i src16 = _mm_unpacklo_epi8(src8, zero); + __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); + __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); + // [32] a0 b0 g0 r0 + __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); + accum0 = _mm_add_epi32(accum0, t); + // [32] a1 b1 g1 r1 + t = _mm_unpackhi_epi16(mul_lo, mul_hi); + accum1 = _mm_add_epi32(accum1, t); + + // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channels => + // multiply with current coefficient => accumulate the result. + // [16] a3 b3 g3 r3 a2 b2 g2 r2 + src16 = _mm_unpackhi_epi8(src8, zero); + mul_hi = _mm_mulhi_epi16(src16, coeff16); + mul_lo = _mm_mullo_epi16(src16, coeff16); + // [32] a2 b2 g2 r2 + t = _mm_unpacklo_epi16(mul_lo, mul_hi); + accum2 = _mm_add_epi32(accum2, t); + // [32] a3 b3 g3 r3 + t = _mm_unpackhi_epi16(mul_lo, mul_hi); + accum3 = _mm_add_epi32(accum3, t); + } - // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation). - // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 - accum0 = _mm_packus_epi16(accum0, accum2); + // Shift right for fixed point implementation. + accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); + accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); + accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); + accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits); + + // Packing 32 bits |accum| to 16 bits per channel (signed saturation). + // [16] a1 b1 g1 r1 a0 b0 g0 r0 + accum0 = _mm_packs_epi32(accum0, accum1); + // [16] a3 b3 g3 r3 a2 b2 g2 r2 + accum2 = _mm_packs_epi32(accum2, accum3); + + // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation). + // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 + accum0 = _mm_packus_epi16(accum0, accum2); + + if (has_alpha) { + // Compute the max(ri, gi, bi) for each pixel. + // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 + __m128i a = _mm_srli_epi32(accum0, 8); + // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 + __m128i b = _mm_max_epu8(a, accum0); // Max of r and g. + // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 + a = _mm_srli_epi32(accum0, 16); + // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 + b = _mm_max_epu8(a, b); // Max of r and g and b. + // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 + b = _mm_slli_epi32(b, 24); + + // Make sure the value of alpha channel is always larger than maximum + // value of color channels. + accum0 = _mm_max_epu8(b, accum0); + } else { + // Set value of alpha channels to 0xFF. + __m128i mask = _mm_set1_epi32(0xff000000); + accum0 = _mm_or_si128(accum0, mask); + } - if (has_alpha) { - // Compute the max(ri, gi, bi) for each pixel. - // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 - __m128i a = _mm_srli_epi32(accum0, 8); - // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 - __m128i b = _mm_max_epu8(a, accum0); // Max of r and g. - // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 - a = _mm_srli_epi32(accum0, 16); - // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 - b = _mm_max_epu8(a, b); // Max of r and g and b. - // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 - b = _mm_slli_epi32(b, 24); - - // Make sure the value of alpha channel is always larger than maximum - // value of color channels. - accum0 = _mm_max_epu8(b, accum0); - } else { - // Set value of alpha channels to 0xFF. - __m128i mask = _mm_set1_epi32(0xff000000); - accum0 = _mm_or_si128(accum0, mask); + // Store the convolution result (16 bytes) and advance the pixel pointers. + _mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0); + out_row += 16; } - // Store the convolution result (16 bytes) and advance the pixel pointers. - _mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0); - out_row += 16; - } - - // When the width of the output is not divisible by 4, We need to save one - // pixel (4 bytes) each time. And also the fourth pixel is always absent. - if (pixel_width & 3) { - accum0 = _mm_setzero_si128(); - accum1 = _mm_setzero_si128(); - accum2 = _mm_setzero_si128(); - for (int filter_y = 0; filter_y < filter_length; ++filter_y) { - coeff16 = _mm_set1_epi16(filter_values[filter_y]); - // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 - src = reinterpret_cast<const __m128i*>( - &source_data_rows[filter_y][width<<2]); - __m128i src8 = _mm_loadu_si128(src); - // [16] a1 b1 g1 r1 a0 b0 g0 r0 - __m128i src16 = _mm_unpacklo_epi8(src8, zero); - __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); - __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); - // [32] a0 b0 g0 r0 - __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); - accum0 = _mm_add_epi32(accum0, t); - // [32] a1 b1 g1 r1 - t = _mm_unpackhi_epi16(mul_lo, mul_hi); - accum1 = _mm_add_epi32(accum1, t); - // [16] a3 b3 g3 r3 a2 b2 g2 r2 - src16 = _mm_unpackhi_epi8(src8, zero); - mul_hi = _mm_mulhi_epi16(src16, coeff16); - mul_lo = _mm_mullo_epi16(src16, coeff16); - // [32] a2 b2 g2 r2 - t = _mm_unpacklo_epi16(mul_lo, mul_hi); - accum2 = _mm_add_epi32(accum2, t); - } + // When the width of the output is not divisible by 4, We need to save one + // pixel (4 bytes) each time. And also the fourth pixel is always absent. + if (pixel_width & 3) { + accum0 = _mm_setzero_si128(); + accum1 = _mm_setzero_si128(); + accum2 = _mm_setzero_si128(); + for (int filter_y = 0; filter_y < filter_length; ++filter_y) { + coeff16 = _mm_set1_epi16(filter_values[filter_y]); + // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 + src = reinterpret_cast<const __m128i*>( + &source_data_rows[filter_y][width<<2]); + __m128i src8 = _mm_loadu_si128(src); + // [16] a1 b1 g1 r1 a0 b0 g0 r0 + __m128i src16 = _mm_unpacklo_epi8(src8, zero); + __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); + __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); + // [32] a0 b0 g0 r0 + __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); + accum0 = _mm_add_epi32(accum0, t); + // [32] a1 b1 g1 r1 + t = _mm_unpackhi_epi16(mul_lo, mul_hi); + accum1 = _mm_add_epi32(accum1, t); + // [16] a3 b3 g3 r3 a2 b2 g2 r2 + src16 = _mm_unpackhi_epi8(src8, zero); + mul_hi = _mm_mulhi_epi16(src16, coeff16); + mul_lo = _mm_mullo_epi16(src16, coeff16); + // [32] a2 b2 g2 r2 + t = _mm_unpacklo_epi16(mul_lo, mul_hi); + accum2 = _mm_add_epi32(accum2, t); + } - accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); - accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); - accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); - // [16] a1 b1 g1 r1 a0 b0 g0 r0 - accum0 = _mm_packs_epi32(accum0, accum1); - // [16] a3 b3 g3 r3 a2 b2 g2 r2 - accum2 = _mm_packs_epi32(accum2, zero); - // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 - accum0 = _mm_packus_epi16(accum0, accum2); - if (has_alpha) { - // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 - __m128i a = _mm_srli_epi32(accum0, 8); - // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 - __m128i b = _mm_max_epu8(a, accum0); // Max of r and g. - // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 - a = _mm_srli_epi32(accum0, 16); - // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 - b = _mm_max_epu8(a, b); // Max of r and g and b. - // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 - b = _mm_slli_epi32(b, 24); - accum0 = _mm_max_epu8(b, accum0); - } else { - __m128i mask = _mm_set1_epi32(0xff000000); - accum0 = _mm_or_si128(accum0, mask); - } + accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); + accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); + accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); + // [16] a1 b1 g1 r1 a0 b0 g0 r0 + accum0 = _mm_packs_epi32(accum0, accum1); + // [16] a3 b3 g3 r3 a2 b2 g2 r2 + accum2 = _mm_packs_epi32(accum2, zero); + // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 + accum0 = _mm_packus_epi16(accum0, accum2); + if (has_alpha) { + // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 + __m128i a = _mm_srli_epi32(accum0, 8); + // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 + __m128i b = _mm_max_epu8(a, accum0); // Max of r and g. + // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 + a = _mm_srli_epi32(accum0, 16); + // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 + b = _mm_max_epu8(a, b); // Max of r and g and b. + // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 + b = _mm_slli_epi32(b, 24); + accum0 = _mm_max_epu8(b, accum0); + } else { + __m128i mask = _mm_set1_epi32(0xff000000); + accum0 = _mm_or_si128(accum0, mask); + } - for (int out_x = width; out_x < pixel_width; out_x++) { - *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0); - accum0 = _mm_srli_si128(accum0, 4); - out_row += 4; + for (int out_x = width; out_x < pixel_width; out_x++) { + *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0); + accum0 = _mm_srli_si128(accum0, 4); + out_row += 4; + } } - } } void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filter_values, @@ -606,19 +603,19 @@ void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filt int pixel_width, unsigned char* out_row, bool has_alpha) { - if (has_alpha) { - convolveVertically_SSE2<true>(filter_values, - filter_length, - source_data_rows, - pixel_width, - out_row); - } else { - convolveVertically_SSE2<false>(filter_values, - filter_length, - source_data_rows, - pixel_width, - out_row); - } + if (has_alpha) { + convolveVertically_SSE2<true>(filter_values, + filter_length, + source_data_rows, + pixel_width, + out_row); + } else { + convolveVertically_SSE2<false>(filter_values, + filter_length, + source_data_rows, + pixel_width, + out_row); + } } void applySIMDPadding_SSE2(SkConvolutionFilter1D *filter) { diff --git a/chromium/third_party/skia/src/opts/SkBitmapFilter_opts_SSE2.h b/chromium/third_party/skia/src/opts/SkBitmapFilter_opts_SSE2.h index 588f4ef18bb..661a824e227 100644 --- a/chromium/third_party/skia/src/opts/SkBitmapFilter_opts_SSE2.h +++ b/chromium/third_party/skia/src/opts/SkBitmapFilter_opts_SSE2.h @@ -1,4 +1,3 @@ - /* * Copyright 2013 Google Inc. * @@ -6,7 +5,6 @@ * found in the LICENSE file. */ - #ifndef SkBitmapFilter_opts_sse2_DEFINED #define SkBitmapFilter_opts_sse2_DEFINED @@ -14,9 +12,9 @@ #include "SkConvolver.h" void highQualityFilter_ScaleOnly_SSE2(const SkBitmapProcState &s, int x, int y, - SkPMColor *SK_RESTRICT colors, int count); + SkPMColor *SK_RESTRICT colors, int count); void highQualityFilter_SSE2(const SkBitmapProcState &s, int x, int y, - SkPMColor *SK_RESTRICT colors, int count); + SkPMColor *SK_RESTRICT colors, int count); void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filter_values, diff --git a/chromium/third_party/skia/src/opts/SkBitmapProcState_filter_neon.h b/chromium/third_party/skia/src/opts/SkBitmapProcState_filter_neon.h index e56b683b874..0887145c3d0 100644 --- a/chromium/third_party/skia/src/opts/SkBitmapProcState_filter_neon.h +++ b/chromium/third_party/skia/src/opts/SkBitmapProcState_filter_neon.h @@ -17,12 +17,15 @@ * exact results for the color components, but if the 4 incoming colors are * all opaque, then the output color must also be opaque. Subsequent parts of * the drawing pipeline may rely on this (e.g. which blitrow proc to use). + * */ - -static inline void Filter_32_opaque_neon(unsigned x, unsigned y, - SkPMColor a00, SkPMColor a01, - SkPMColor a10, SkPMColor a11, - SkPMColor *dst) { +// Chrome on Android uses -Os so we need to force these inline. Otherwise +// calling the function in the inner loops will cause significant overhead on +// some platforms. +static SK_ALWAYS_INLINE void Filter_32_opaque_neon(unsigned x, unsigned y, + SkPMColor a00, SkPMColor a01, + SkPMColor a10, SkPMColor a11, + SkPMColor *dst) { uint8x8_t vy, vconst16_8, v16_y, vres; uint16x4_t vx, vconst16_16, v16_x, tmp; uint32x2_t va0, va1; @@ -53,10 +56,11 @@ static inline void Filter_32_opaque_neon(unsigned x, unsigned y, vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); // store result } -static inline void Filter_32_alpha_neon(unsigned x, unsigned y, - SkPMColor a00, SkPMColor a01, - SkPMColor a10, SkPMColor a11, - SkPMColor *dst, uint16_t scale) { +static SK_ALWAYS_INLINE void Filter_32_alpha_neon(unsigned x, unsigned y, + SkPMColor a00, SkPMColor a01, + SkPMColor a10, SkPMColor a11, + SkPMColor *dst, + uint16_t scale) { uint8x8_t vy, vconst16_8, v16_y, vres; uint16x4_t vx, vconst16_16, v16_x, tmp, vscale; uint32x2_t va0, va1; diff --git a/chromium/third_party/skia/src/opts/SkBitmapProcState_matrixProcs_neon.cpp b/chromium/third_party/skia/src/opts/SkBitmapProcState_matrixProcs_neon.cpp index e81da670526..7789031c028 100644 --- a/chromium/third_party/skia/src/opts/SkBitmapProcState_matrixProcs_neon.cpp +++ b/chromium/third_party/skia/src/opts/SkBitmapProcState_matrixProcs_neon.cpp @@ -10,26 +10,140 @@ #include "SkUtilsArm.h" #include "SkBitmapProcState_utils.h" +#include <arm_neon.h> + extern const SkBitmapProcState::MatrixProc ClampX_ClampY_Procs_neon[]; extern const SkBitmapProcState::MatrixProc RepeatX_RepeatY_Procs_neon[]; static void decal_nofilter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count); static void decal_filter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count); -#define MAKENAME(suffix) ClampX_ClampY ## suffix ## _neon -#define TILEX_PROCF(fx, max) SkClampMax((fx) >> 16, max) -#define TILEY_PROCF(fy, max) SkClampMax((fy) >> 16, max) -#define TILEX_LOW_BITS(fx, max) (((fx) >> 12) & 0xF) -#define TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF) +// TILEX_PROCF(fx, max) SkClampMax((fx) >> 16, max) +static inline int16x8_t sbpsm_clamp_tile8(int32x4_t low, int32x4_t high, unsigned max) { + int16x8_t res; + + // get the hi 16s of all those 32s + res = vuzpq_s16(vreinterpretq_s16_s32(low), vreinterpretq_s16_s32(high)).val[1]; + + // clamp + res = vmaxq_s16(res, vdupq_n_s16(0)); + res = vminq_s16(res, vdupq_n_s16(max)); + + return res; +} + +// TILEX_PROCF(fx, max) SkClampMax((fx) >> 16, max) +static inline int32x4_t sbpsm_clamp_tile4(int32x4_t f, unsigned max) { + int32x4_t res; + + // get the hi 16s of all those 32s + res = vshrq_n_s32(f, 16); + + // clamp + res = vmaxq_s32(res, vdupq_n_s32(0)); + res = vminq_s32(res, vdupq_n_s32(max)); + + return res; +} + +// TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF) +static inline int32x4_t sbpsm_clamp_tile4_low_bits(int32x4_t fx) { + int32x4_t ret; + + ret = vshrq_n_s32(fx, 12); + + /* We don't need the mask below because the caller will + * overwrite the non-masked bits + */ + //ret = vandq_s32(ret, vdupq_n_s32(0xF)); + + return ret; +} + +// TILEX_PROCF(fx, max) (((fx)&0xFFFF)*((max)+1)>> 16) +static inline int16x8_t sbpsm_repeat_tile8(int32x4_t low, int32x4_t high, unsigned max) { + uint16x8_t res; + uint32x4_t tmpl, tmph; + + // get the lower 16 bits + res = vuzpq_u16(vreinterpretq_u16_s32(low), vreinterpretq_u16_s32(high)).val[0]; + + // bare multiplication, not SkFixedMul + tmpl = vmull_u16(vget_low_u16(res), vdup_n_u16(max+1)); + tmph = vmull_u16(vget_high_u16(res), vdup_n_u16(max+1)); + + // extraction of the 16 upper bits + res = vuzpq_u16(vreinterpretq_u16_u32(tmpl), vreinterpretq_u16_u32(tmph)).val[1]; + + return vreinterpretq_s16_u16(res); +} + +// TILEX_PROCF(fx, max) (((fx)&0xFFFF)*((max)+1)>> 16) +static inline int32x4_t sbpsm_repeat_tile4(int32x4_t f, unsigned max) { + uint16x4_t res; + uint32x4_t tmp; + + // get the lower 16 bits + res = vmovn_u32(vreinterpretq_u32_s32(f)); + + // bare multiplication, not SkFixedMul + tmp = vmull_u16(res, vdup_n_u16(max+1)); + + // extraction of the 16 upper bits + tmp = vshrq_n_u32(tmp, 16); + + return vreinterpretq_s32_u32(tmp); +} + +// TILEX_LOW_BITS(fx, max) ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF) +static inline int32x4_t sbpsm_repeat_tile4_low_bits(int32x4_t fx, unsigned max) { + uint16x4_t res; + uint32x4_t tmp; + int32x4_t ret; + + // get the lower 16 bits + res = vmovn_u32(vreinterpretq_u32_s32(fx)); + + // bare multiplication, not SkFixedMul + tmp = vmull_u16(res, vdup_n_u16(max + 1)); + + // shift and mask + ret = vshrq_n_s32(vreinterpretq_s32_u32(tmp), 12); + + /* We don't need the mask below because the caller will + * overwrite the non-masked bits + */ + //ret = vandq_s32(ret, vdupq_n_s32(0xF)); + + return ret; +} + +#define MAKENAME(suffix) ClampX_ClampY ## suffix ## _neon +#define TILEX_PROCF(fx, max) SkClampMax((fx) >> 16, max) +#define TILEY_PROCF(fy, max) SkClampMax((fy) >> 16, max) +#define TILEX_PROCF_NEON8(l, h, max) sbpsm_clamp_tile8(l, h, max) +#define TILEY_PROCF_NEON8(l, h, max) sbpsm_clamp_tile8(l, h, max) +#define TILEX_PROCF_NEON4(fx, max) sbpsm_clamp_tile4(fx, max) +#define TILEY_PROCF_NEON4(fy, max) sbpsm_clamp_tile4(fy, max) +#define TILEX_LOW_BITS(fx, max) (((fx) >> 12) & 0xF) +#define TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF) +#define TILEX_LOW_BITS_NEON4(fx, max) sbpsm_clamp_tile4_low_bits(fx) +#define TILEY_LOW_BITS_NEON4(fy, max) sbpsm_clamp_tile4_low_bits(fy) #define CHECK_FOR_DECAL -#include "SkBitmapProcState_matrix_clamp_neon.h" - -#define MAKENAME(suffix) RepeatX_RepeatY ## suffix ## _neon -#define TILEX_PROCF(fx, max) SK_USHIFT16(((fx) & 0xFFFF) * ((max) + 1)) -#define TILEY_PROCF(fy, max) SK_USHIFT16(((fy) & 0xFFFF) * ((max) + 1)) -#define TILEX_LOW_BITS(fx, max) ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF) -#define TILEY_LOW_BITS(fy, max) ((((fy) & 0xFFFF) * ((max) + 1) >> 12) & 0xF) -#include "SkBitmapProcState_matrix_repeat_neon.h" +#include "SkBitmapProcState_matrix_neon.h" + +#define MAKENAME(suffix) RepeatX_RepeatY ## suffix ## _neon +#define TILEX_PROCF(fx, max) SK_USHIFT16(((fx) & 0xFFFF) * ((max) + 1)) +#define TILEY_PROCF(fy, max) SK_USHIFT16(((fy) & 0xFFFF) * ((max) + 1)) +#define TILEX_PROCF_NEON8(l, h, max) sbpsm_repeat_tile8(l, h, max) +#define TILEY_PROCF_NEON8(l, h, max) sbpsm_repeat_tile8(l, h, max) +#define TILEX_PROCF_NEON4(fx, max) sbpsm_repeat_tile4(fx, max) +#define TILEY_PROCF_NEON4(fy, max) sbpsm_repeat_tile4(fy, max) +#define TILEX_LOW_BITS(fx, max) ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF) +#define TILEY_LOW_BITS(fy, max) ((((fy) & 0xFFFF) * ((max) + 1) >> 12) & 0xF) +#define TILEX_LOW_BITS_NEON4(fx, max) sbpsm_repeat_tile4_low_bits(fx, max) +#define TILEY_LOW_BITS_NEON4(fy, max) sbpsm_repeat_tile4_low_bits(fy, max) +#include "SkBitmapProcState_matrix_neon.h" diff --git a/chromium/third_party/skia/src/opts/SkBitmapProcState_matrix_clamp_neon.h b/chromium/third_party/skia/src/opts/SkBitmapProcState_matrix_clamp_neon.h deleted file mode 100644 index a615e26b240..00000000000 --- a/chromium/third_party/skia/src/opts/SkBitmapProcState_matrix_clamp_neon.h +++ /dev/null @@ -1,911 +0,0 @@ -/* NEON optimized code (C) COPYRIGHT 2009 Motorola - * - * Use of this source code is governed by a BSD-style license that can be - * found in the LICENSE file. - */ - -/* - * Modifications done in-house at Motorola - * - * this is a clone of SkBitmapProcState_matrix.h - * and has been tuned to work with the NEON unit. - * - * Still going back and forth between whether this approach - * (clone the entire SkBitmapProcState_matrix.h file or - * if I should put just the modified routines in here and - * then use a construct like #define DONT_DO_THIS_FUNCTION or - * something like that... - * - * This is for the ClampX_ClampY instance - * - */ - - -#include <arm_neon.h> - -/* - * This has been modified on the knowledge that (at the time) - * we had the following macro definitions in the parent file - * - * #define MAKENAME(suffix) ClampX_ClampY ## suffix - * #define TILEX_PROCF(fx, max) SkClampMax((fx) >> 16, max) - * #define TILEY_PROCF(fy, max) SkClampMax((fy) >> 16, max) - * #define TILEX_LOW_BITS(fx, max) (((fx) >> 12) & 0xF) - * #define TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF) - * #define CHECK_FOR_DECAL - */ - -/* SkClampMax(val,max) -- bound to 0..max */ - -#define SCALE_NOFILTER_NAME MAKENAME(_nofilter_scale) -#define SCALE_FILTER_NAME MAKENAME(_filter_scale) -#define AFFINE_NOFILTER_NAME MAKENAME(_nofilter_affine) -#define AFFINE_FILTER_NAME MAKENAME(_filter_affine) -#define PERSP_NOFILTER_NAME MAKENAME(_nofilter_persp) -#define PERSP_FILTER_NAME MAKENAME(_filter_persp) - -#define PACK_FILTER_X_NAME MAKENAME(_pack_filter_x) -#define PACK_FILTER_Y_NAME MAKENAME(_pack_filter_y) - -#ifndef PREAMBLE - #define PREAMBLE(state) - #define PREAMBLE_PARAM_X - #define PREAMBLE_PARAM_Y - #define PREAMBLE_ARG_X - #define PREAMBLE_ARG_Y -#endif - -static void SCALE_NOFILTER_NAME(const SkBitmapProcState& s, - uint32_t xy[], int count, int x, int y) { - SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | - SkMatrix::kScale_Mask)) == 0); - - PREAMBLE(s); - // we store y, x, x, x, x, x - - const unsigned maxX = s.fBitmap->width() - 1; - SkFixed fx; - { - SkPoint pt; - s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf, - SkIntToScalar(y) + SK_ScalarHalf, &pt); - fx = SkScalarToFixed(pt.fY); - const unsigned maxY = s.fBitmap->height() - 1; - *xy++ = TILEY_PROCF(fx, maxY); - fx = SkScalarToFixed(pt.fX); - } - - if (0 == maxX) { - // all of the following X values must be 0 - memset(xy, 0, count * sizeof(uint16_t)); - return; - } - - const SkFixed dx = s.fInvSx; - -#ifdef CHECK_FOR_DECAL - // test if we don't need to apply the tile proc - if ((unsigned)(fx >> 16) <= maxX && - (unsigned)((fx + dx * (count - 1)) >> 16) <= maxX) { - decal_nofilter_scale_neon(xy, fx, dx, count); - return; - } -#endif - - int i; - - /* very much like done in decal_nofilter, but with - * an extra clamping function applied. - * TILEX_PROCF(fx,max) SkClampMax((fx)>>16, max) - */ - if (count >= 8) { - /* SkFixed is 16.16 fixed point */ - SkFixed dx2 = dx+dx; - SkFixed dx4 = dx2+dx2; - SkFixed dx8 = dx4+dx4; - - /* now build fx/fx+dx/fx+2dx/fx+3dx */ - SkFixed fx1, fx2, fx3; - int32x4_t lbase, hbase; - int16_t *dst16 = (int16_t *)xy; - - fx1 = fx+dx; - fx2 = fx1+dx; - fx3 = fx2+dx; - - /* build my template(s) */ - /* avoid the 'lbase unitialized' warning */ - lbase = vdupq_n_s32(fx); - lbase = vsetq_lane_s32(fx1, lbase, 1); - lbase = vsetq_lane_s32(fx2, lbase, 2); - lbase = vsetq_lane_s32(fx3, lbase, 3); - - hbase = vaddq_s32(lbase, vdupq_n_s32(dx4)); - - /* store & bump */ - do { - int32x4_t lout; - int32x4_t hout; - int16x8_t hi16; - - /* get the hi 16s of all those 32s */ - lout = lbase; - hout = hbase; - /* this sets up all lout's then all hout's in hout */ - asm ("vuzpq.16 %q0, %q1" : "+w" (lout), "+w" (hout)); - hi16 = vreinterpretq_s16_s32(hout); - - /* clamp & output */ - hi16 = vmaxq_s16(hi16, vdupq_n_s16(0)); - hi16 = vminq_s16(hi16, vdupq_n_s16(maxX)); - vst1q_s16(dst16, hi16); - - /* but preserving base & on to the next */ - lbase = vaddq_s32 (lbase, vdupq_n_s32(dx8)); - hbase = vaddq_s32 (hbase, vdupq_n_s32(dx8)); - dst16 += 8; - count -= 8; - fx += dx8; - } while (count >= 8); - xy = (uint32_t *) dst16; - } - - uint16_t* xx = (uint16_t*)xy; - for (i = count; i > 0; --i) { - *xx++ = TILEX_PROCF(fx, maxX); fx += dx; - } -} - -// note: we could special-case on a matrix which is skewed in X but not Y. -// this would require a more general setup thatn SCALE does, but could use -// SCALE's inner loop that only looks at dx - -static void AFFINE_NOFILTER_NAME(const SkBitmapProcState& s, - uint32_t xy[], int count, int x, int y) { - SkASSERT(s.fInvType & SkMatrix::kAffine_Mask); - SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | - SkMatrix::kScale_Mask | - SkMatrix::kAffine_Mask)) == 0); - - PREAMBLE(s); - SkPoint srcPt; - s.fInvProc(s.fInvMatrix, - SkIntToScalar(x) + SK_ScalarHalf, - SkIntToScalar(y) + SK_ScalarHalf, &srcPt); - - SkFixed fx = SkScalarToFixed(srcPt.fX); - SkFixed fy = SkScalarToFixed(srcPt.fY); - SkFixed dx = s.fInvSx; - SkFixed dy = s.fInvKy; - int maxX = s.fBitmap->width() - 1; - int maxY = s.fBitmap->height() - 1; - - /* NEON lets us do an 8x unrolling */ - if (count >= 8) { - /* SkFixed is 16.16 fixed point */ - SkFixed dx4 = dx * 4; - SkFixed dy4 = dy * 4; - SkFixed dx8 = dx * 8; - SkFixed dy8 = dy * 8; - - int32x4_t xbase, ybase; - int32x4_t x2base, y2base; - int16_t *dst16 = (int16_t *) xy; - - /* my sets of maxx/maxy for clamping */ - int32_t maxpair = (maxX&0xffff) | ((maxY&0xffff)<<16); - int16x8_t maxXY = vreinterpretq_s16_s32(vdupq_n_s32(maxpair)); - - /* now build fx/fx+dx/fx+2dx/fx+3dx */ - /* avoid the 'xbase unitialized' warning...*/ - xbase = vdupq_n_s32(fx); - xbase = vsetq_lane_s32(fx+dx, xbase, 1); - xbase = vsetq_lane_s32(fx+dx+dx, xbase, 2); - xbase = vsetq_lane_s32(fx+dx+dx+dx, xbase, 3); - - /* same for fy */ - /* avoid the 'ybase unitialized' warning...*/ - ybase = vdupq_n_s32(fy); - ybase = vsetq_lane_s32(fy+dy, ybase, 1); - ybase = vsetq_lane_s32(fy+dy+dy, ybase, 2); - ybase = vsetq_lane_s32(fy+dy+dy+dy, ybase, 3); - - x2base = vaddq_s32(xbase, vdupq_n_s32(dx4)); - y2base = vaddq_s32(ybase, vdupq_n_s32(dy4)); - - /* store & bump */ - do { - int32x4_t xout, yout; - int32x4_t x2out, y2out; - int16x8_t hi16, hi16_2; - - xout = xbase; - yout = ybase; - - /* overlay y's low16 with hi16 from x */ - /* so we properly shifted xyxyxyxy */ - yout = vsriq_n_s32(yout, xout, 16); - hi16 = vreinterpretq_s16_s32 (yout); - - /* do the clamping; both guys get 0's */ - hi16 = vmaxq_s16 (hi16, vdupq_n_s16(0)); - hi16 = vminq_s16 (hi16, maxXY); - - vst1q_s16 (dst16, hi16); - - /* and for the other 4 pieces of this iteration */ - x2out = x2base; - y2out = y2base; - - /* overlay y's low16 with hi16 from x */ - /* so we properly shifted xyxyxyxy */ - y2out = vsriq_n_s32(y2out, x2out, 16); - hi16_2 = vreinterpretq_s16_s32 (y2out); - - /* do the clamping; both guys get 0's */ - hi16_2 = vmaxq_s16 (hi16_2, vdupq_n_s16(0)); - hi16_2 = vminq_s16 (hi16_2, maxXY); - - /* RBE: gcc regenerates dst16+8 all the time instead - * of folding it into an addressing mode. *sigh* */ - vst1q_s16 (dst16+8, hi16_2); - - /* moving base and on to the next */ - xbase = vaddq_s32 (xbase, vdupq_n_s32 (dx8)); - ybase = vaddq_s32 (ybase, vdupq_n_s32 (dy8)); - x2base = vaddq_s32 (x2base, vdupq_n_s32 (dx8)); - y2base = vaddq_s32 (y2base, vdupq_n_s32 (dy8)); - - dst16 += 16; /* 8x32 aka 16x16 */ - count -= 8; - fx += dx8; - fy += dy8; - } while (count >= 8); - xy = (uint32_t *) dst16; - } - - for (int i = count; i > 0; --i) { - *xy++ = (TILEY_PROCF(fy, maxY) << 16) | TILEX_PROCF(fx, maxX); - fx += dx; fy += dy; - } -} - -#undef DEBUG_PERSP_NOFILTER - -static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s, - uint32_t* SK_RESTRICT xy, - int count, int x, int y) { - SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask); - - PREAMBLE(s); - /* max{X,Y} are int here, but later shown/assumed to fit in 16 bits */ - int maxX = s.fBitmap->width() - 1; - int maxY = s.fBitmap->height() - 1; - - SkPerspIter iter(s.fInvMatrix, - SkIntToScalar(x) + SK_ScalarHalf, - SkIntToScalar(y) + SK_ScalarHalf, count); - - while ((count = iter.next()) != 0) { - const SkFixed* SK_RESTRICT srcXY = iter.getXY(); - -#if defined(DEBUG_PERSP_NOFILTER) - /* debugging stuff */ - const SkFixed *end_srcXY = srcXY + (count*2); - uint32_t *end_xy = xy + (count); - const SkFixed *base_srcXY = srcXY; - uint32_t *base_xy = xy; - int base_count = count; -#endif - -#if 1 - // 2009/9/30: crashes in ApiDemos - Views - Animation - 3D Transition - // 2009/10/9: reworked to avoid illegal (but allowed by gas) insn - - /* srcXY is a batch of 32 bit numbers X0,Y0,X1,Y1... - * but we immediately discard the low 16 bits... - * so what we're going to do is vld4, which will give us - * xlo,xhi,ylo,yhi distribution and we can ignore the 'lo' - * parts.... - */ - if (count >= 8) { - int16_t *mysrc = (int16_t *) srcXY; - int16_t *mydst = (int16_t *) xy; - int16x4_t maxX4 = vdup_n_s16((int16_t)maxX); - int16x4_t maxY4 = vdup_n_s16((int16_t)maxY); - int16x4_t zero4 = vdup_n_s16(0); - - /* The constructs with local blocks for register assignments - * and asm() instructions is to make keep any hard register - * assignments to as small a scope as possible. and to avoid - * burning call-preserved hard registers on the vld/vst - * instructions. - */ - - do { - int16x4_t xhi, yhi; - int16x4_t x2hi, y2hi; - - /* vld4 does the de-interleaving for us */ - { - register int16x4_t t_xlo asm("d0"); - register int16x4_t t_xhi asm("d1"); - register int16x4_t t_ylo asm("d2"); - register int16x4_t t_yhi asm("d3"); - - asm ("vld4.16 {d0-d3},[%4] /* xlo=%P0 xhi=%P1 ylo=%P2 yhi=%P3 */" - : "=w" (t_xlo), "=w" (t_xhi), "=w" (t_ylo), "=w" (t_yhi) - : "r" (mysrc) - ); - xhi = t_xhi; - yhi = t_yhi; - } - - /* clamp X>>16 (aka xhi) to 0..maxX */ - xhi = vmax_s16(xhi, zero4); /* now 0.. */ - xhi = vmin_s16(xhi, maxX4); /* now 0..maxX */ - - /* clamp Y>>16 (aka yhi) to 0..maxY */ - yhi = vmax_s16(yhi, zero4); /* now 0.. */ - yhi = vmin_s16(yhi, maxY4); /* now 0..maxY */ - - /* deal with the second set of numbers */ - { - register int16x4_t t_xlo asm("d4"); - register int16x4_t t_xhi asm("d5"); - register int16x4_t t_ylo asm("d6"); - register int16x4_t t_yhi asm("d7"); - - /* offset == 256 bits == 32 bytes == 8 longs == 16 shorts */ - asm ("vld4.16 {d4-d7},[%4] /* xlo=%P0 xhi=%P1 ylo=%P2 yhi=%P3 */" - : "=w" (t_xlo), "=w" (t_xhi), "=w" (t_ylo), "=w" (t_yhi) - : "r" (mysrc+16) - ); - x2hi = t_xhi; - y2hi = t_yhi; - } - - /* clamp the second 4 here */ - - if (0) { extern void rbe(void); rbe(); } - - /* clamp X>>16 (aka xhi) to 0..maxX */ - x2hi = vmax_s16(x2hi, zero4); /* now 0.. */ - x2hi = vmin_s16(x2hi, maxX4); /* now 0..maxX */ - - /* clamp Y>>16 (aka yhi) to 0..maxY */ - y2hi = vmax_s16(y2hi, zero4); /* now 0.. */ - y2hi = vmin_s16(y2hi, maxY4); /* now 0..maxY */ - - /* we're storing as {x,y}s: x is [0], y is [1] */ - /* we'll use vst2 to make this happen */ - - { - register int16x4_t out_x asm("d16") = xhi; - register int16x4_t out_y asm("d17") = yhi; - - asm ("vst2.16 {d16-d17},[%2] /* xlo=%P0 xhi=%P1 */" - : - : "w" (out_x), "w" (out_y), "r" (mydst) - ); - } - { - register int16x4_t out_x asm("d18") = x2hi; - register int16x4_t out_y asm("d19") = y2hi; - - asm ("vst2.16 {d18-d19},[%2] /* xlo=%P0 xhi=%P1 */" - : - : "w" (out_x), "w" (out_y), "r" (mydst+8) - ); - } - - /* XXX: gcc isn't interleaving these with the NEON ops - * but i think that all the scoreboarding works out */ - count -= 8; /* 8 iterations */ - mysrc += 32; /* 16 longs, aka 32 shorts */ - mydst += 16; /* 16 shorts, aka 8 longs */ - } while (count >= 8); - /* get xy and srcXY fixed up */ - srcXY = (const SkFixed *) mysrc; - xy = (uint32_t *) mydst; - } -#endif - - while (--count >= 0) { - *xy++ = (TILEY_PROCF(srcXY[1], maxY) << 16) | - TILEX_PROCF(srcXY[0], maxX); - srcXY += 2; - } - -#if defined(DEBUG_PERSP_NOFILTER) - /* for checking our NEON-produced results against vanilla code */ - { - int bad = (-1); - for (int i = 0; i < base_count; i++) { - uint32_t val; - val = (TILEY_PROCF (base_srcXY[i * 2 + 1], maxY) << 16) | - TILEX_PROCF (base_srcXY[i * 2 + 0], maxX); - - if (val != base_xy[i]) { - bad = i; - break; - } - } - if (bad >= 0) { - SkDebugf("clamp-nofilter-persp failed piece %d\n", bad); - SkDebugf(" maxX %08x maxY %08x\n", maxX, maxY); - bad -= (bad & 0x7); /* align */ - for (int i = bad; i < bad + 8; i++) { - uint32_t val; - val = (TILEY_PROCF (base_srcXY[i * 2 + 1], maxY) << 16) | - TILEX_PROCF (base_srcXY[i * 2 + 0], maxX); - - SkDebugf("%d: got %08x want %08x srcXY[0] %08x srcXY[1] %08x\n", - i, base_xy[i], val, base_srcXY[i * 2 + 0], - base_srcXY[i * 2 + 1]); - } - SkDebugf ("---\n"); - } - - if (end_xy != xy) { - SkDebugf("xy ended at %08x, should be %08x\n", xy, end_xy); - } - if (end_srcXY != srcXY) { - SkDebugf("srcXY ended at %08x, should be %08x\n", srcXY, - end_srcXY); - } - } -#endif - } -} - -#undef DEBUG_PERSP_NOFILTER - -////////////////////////////////////////////////////////////////////////////// - -static inline uint32_t PACK_FILTER_Y_NAME(SkFixed f, unsigned max, - SkFixed one PREAMBLE_PARAM_Y) { - unsigned i = TILEY_PROCF(f, max); - i = (i << 4) | TILEY_LOW_BITS(f, max); - return (i << 14) | (TILEY_PROCF((f + one), max)); -} - -static inline uint32_t PACK_FILTER_X_NAME(SkFixed f, unsigned max, - SkFixed one PREAMBLE_PARAM_X) { - unsigned i = TILEX_PROCF(f, max); - i = (i << 4) | TILEX_LOW_BITS(f, max); - return (i << 14) | (TILEX_PROCF((f + one), max)); -} - -static void SCALE_FILTER_NAME(const SkBitmapProcState& s, - uint32_t xy[], int count, int x, int y) { - SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | - SkMatrix::kScale_Mask)) == 0); - SkASSERT(s.fInvKy == 0); - - PREAMBLE(s); - - const unsigned maxX = s.fBitmap->width() - 1; - const SkFixed one = s.fFilterOneX; - const SkFixed dx = s.fInvSx; - SkFixed fx; - - { - SkPoint pt; - s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf, - SkIntToScalar(y) + SK_ScalarHalf, &pt); - const SkFixed fy = SkScalarToFixed(pt.fY) - (s.fFilterOneY >> 1); - const unsigned maxY = s.fBitmap->height() - 1; - // compute our two Y values up front - *xy++ = PACK_FILTER_Y_NAME(fy, maxY, s.fFilterOneY PREAMBLE_ARG_Y); - // now initialize fx - fx = SkScalarToFixed(pt.fX) - (one >> 1); - } - -#ifdef CHECK_FOR_DECAL - // test if we don't need to apply the tile proc - if (dx > 0 && - (unsigned)(fx >> 16) <= maxX && - (unsigned)((fx + dx * (count - 1)) >> 16) < maxX) { - decal_filter_scale_neon(xy, fx, dx, count); - } else -#endif - - if (count >= 4) { - int32x4_t wide_one, wide_fx, wide_fx1, wide_i, wide_lo; - #if 0 - /* verification hooks -- see below */ - SkFixed debug_fx = fx; - int count_done = 0; - #endif - - wide_fx = vdupq_n_s32(fx); - wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1); - wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2); - wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3); - - wide_one = vdupq_n_s32(one); - - while (count >= 4) { - /* original expands to: - * unsigned i = SkClampMax((f) >> 16, max); - * i = (i << 4) | (((f) >> 12) & 0xF); - * return (i << 14) | (SkClampMax(((f + one)) >> 16, max)); - */ - - /* i = SkClampMax(f>>16, maxX) */ - wide_i = vmaxq_s32(vshrq_n_s32(wide_fx,16), vdupq_n_s32(0)); - wide_i = vminq_s32(wide_i, vdupq_n_s32(maxX)); - - /* i<<4 | TILEX_LOW_BITS(fx) */ - wide_lo = vshrq_n_s32(wide_fx, 12); - wide_i = vsliq_n_s32(wide_lo, wide_i, 4); - - /* i<<14 */ - wide_i = vshlq_n_s32(wide_i, 14); - - /* SkClampMax(((f + one)) >> 16, max) */ - wide_fx1 = vaddq_s32(wide_fx, wide_one); - wide_fx1 = vmaxq_s32(vshrq_n_s32(wide_fx1,16), vdupq_n_s32(0)); - wide_fx1 = vminq_s32(wide_fx1, vdupq_n_s32(maxX)); - - /* final combination */ - wide_i = vorrq_s32(wide_i, wide_fx1); - - vst1q_u32(xy, vreinterpretq_u32_s32(wide_i)); - - #if 0 - /* having a verification hook is a good idea */ - /* use debug_fx, debug_fx+dx, etc. */ - - for (int i=0;i<4;i++) { - uint32_t want = PACK_FILTER_X_NAME(debug_fx, maxX, one PREAMBLE_ARG_X); - if (xy[i] != want) - { - /* print a nastygram */ - SkDebugf("clamp-filter-scale fails\n"); - SkDebugf("got %08x want %08x\n", xy[i], want); - SkDebugf("fx %08x debug_fx %08x dx %08x done %d\n", - fx, debug_fx, dx, count_done); - SkDebugf(" maxX %08x one %08x\n", maxX, one); - - } - debug_fx += dx; - count_done++; - } - #endif - wide_fx += vdupq_n_s32(dx+dx+dx+dx); - fx += dx+dx+dx+dx; - xy += 4; - count -= 4; - } - } - - while (--count >= 0) { - *xy++ = PACK_FILTER_X_NAME(fx, maxX, one PREAMBLE_ARG_X); - fx += dx; - } -} - -static void AFFINE_FILTER_NAME(const SkBitmapProcState& s, - uint32_t xy[], int count, int x, int y) { - SkASSERT(s.fInvType & SkMatrix::kAffine_Mask); - SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | - SkMatrix::kScale_Mask | - SkMatrix::kAffine_Mask)) == 0); - - PREAMBLE(s); - SkPoint srcPt; - s.fInvProc(s.fInvMatrix, - SkIntToScalar(x) + SK_ScalarHalf, - SkIntToScalar(y) + SK_ScalarHalf, &srcPt); - - SkFixed oneX = s.fFilterOneX; - SkFixed oneY = s.fFilterOneY; - SkFixed fx = SkScalarToFixed(srcPt.fX) - (oneX >> 1); - SkFixed fy = SkScalarToFixed(srcPt.fY) - (oneY >> 1); - SkFixed dx = s.fInvSx; - SkFixed dy = s.fInvKy; - unsigned maxX = s.fBitmap->width() - 1; - unsigned maxY = s.fBitmap->height() - 1; - - if (count >= 4) { - int32x4_t wide_i, wide_lo; - int32x4_t wide_fx, wide_onex, wide_fx1; - int32x4_t wide_fy, wide_oney, wide_fy1; - - #undef AFFINE_DEBUG - #if defined(AFFINE_DEBUG) - SkFixed fyp = fy; - SkFixed fxp = fx; - uint32_t *xyp = xy; - int count_done = 0; - #endif - - wide_fx = vdupq_n_s32(fx); - wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1); - wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2); - wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3); - - wide_fy = vdupq_n_s32(fy); - wide_fy = vsetq_lane_s32(fy+dy, wide_fy, 1); - wide_fy = vsetq_lane_s32(fy+dy+dy, wide_fy, 2); - wide_fy = vsetq_lane_s32(fy+dy+dy+dy, wide_fy, 3); - - wide_onex = vdupq_n_s32(oneX); - wide_oney = vdupq_n_s32(oneY); - - while (count >= 4) { - int32x4_t wide_x; - int32x4_t wide_y; - - /* do the X side, then the Y side, then interleave them */ - - /* original expands to: - * unsigned i = SkClampMax((f) >> 16, max); - * i = (i << 4) | (((f) >> 12) & 0xF); - * return (i << 14) | (SkClampMax(((f + one)) >> 16, max)); - */ - - /* i = SkClampMax(f>>16, maxX) */ - wide_i = vmaxq_s32(vshrq_n_s32(wide_fx,16), vdupq_n_s32(0)); - wide_i = vminq_s32(wide_i, vdupq_n_s32(maxX)); - - /* i<<4 | TILEX_LOW_BITS(fx) */ - wide_lo = vshrq_n_s32(wide_fx, 12); - wide_i = vsliq_n_s32(wide_lo, wide_i, 4); - - /* i<<14 */ - wide_i = vshlq_n_s32(wide_i, 14); - - /* SkClampMax(((f + one)) >> 16, max) */ - wide_fx1 = vaddq_s32(wide_fx, wide_onex); - wide_fx1 = vmaxq_s32(vshrq_n_s32(wide_fx1,16), vdupq_n_s32(0)); - wide_fx1 = vminq_s32(wide_fx1, vdupq_n_s32(maxX)); - - /* final combination */ - wide_x = vorrq_s32(wide_i, wide_fx1); - - /* And now the Y side */ - - /* i = SkClampMax(f>>16, maxX) */ - wide_i = vmaxq_s32(vshrq_n_s32(wide_fy,16), vdupq_n_s32(0)); - wide_i = vminq_s32(wide_i, vdupq_n_s32(maxY)); - - /* i<<4 | TILEX_LOW_BITS(fx) */ - wide_lo = vshrq_n_s32(wide_fy, 12); - wide_i = vsliq_n_s32(wide_lo, wide_i, 4); - - /* i<<14 */ - wide_i = vshlq_n_s32(wide_i, 14); - - /* SkClampMax(((f + one)) >> 16, max) */ - wide_fy1 = vaddq_s32(wide_fy, wide_oney); - wide_fy1 = vmaxq_s32(vshrq_n_s32(wide_fy1,16), vdupq_n_s32(0)); - wide_fy1 = vminq_s32(wide_fy1, vdupq_n_s32(maxY)); - - /* final combination */ - wide_y = vorrq_s32(wide_i, wide_fy1); - - /* interleave as YXYXYXYX as part of the storing */ - { - /* vst2.32 needs side-by-side registers */ - register int32x4_t t_x asm("q1"); - register int32x4_t t_y asm("q0"); - - t_x = wide_x; t_y = wide_y; - asm ("vst2.32 {q0-q1},[%2] /* y=%q0 x=%q1 */" - : - : "w" (t_y), "w" (t_x), "r" (xy) - ); - } - - #if defined(AFFINE_DEBUG) - /* make sure we're good here -- check the 4 we just output */ - for (int i = 0; i<4;i++) { - uint32_t val; - val = PACK_FILTER_Y_NAME(fyp, maxY, oneY PREAMBLE_ARG_Y); - if (val != xy[i*2+0]) { - /* print a nastygram */ - SkDebugf("clamp-filter-affine fails\n"); - SkDebugf("[bad-y] got %08x want %08x\n", xy[i*2+0], val); - SkDebugf("fy %08x fxp %08x fyp %08x dx %08x dy %08x done %d\n", - fy, fxp, fyp, dx, dy, count_done); - SkDebugf(" maxY %08x oneY %08x\n", maxY, oneY); - } - val = PACK_FILTER_X_NAME(fxp, maxX, oneX PREAMBLE_ARG_X); - if (val != xy[i*2+1]) { - /* print a nastygram */ - SkDebugf("clamp-filter-affine fails\n"); - SkDebugf("[bad-x] got %08x want %08x\n", xy[i*2+1], val); - SkDebugf("fx %08x fxp %08x fyp %08x dx %08x dy %08x done %d\n", - fx, fxp, fyp, dx, dy, count_done); - SkDebugf(" maxX %08x one %08x\n", maxX, oneX); - } - fyp += dy; - fxp += dx; - count_done++; - } - #endif - - wide_fx += vdupq_n_s32(dx+dx+dx+dx); - fx += dx+dx+dx+dx; - wide_fy += vdupq_n_s32(dy+dy+dy+dy); - fy += dy+dy+dy+dy; - xy += 8; /* 4 x's, 4 y's */ - count -= 4; - } - } - - while (--count >= 0) { - /* NB: writing Y/X */ - *xy++ = PACK_FILTER_Y_NAME(fy, maxY, oneY PREAMBLE_ARG_Y); - fy += dy; - *xy++ = PACK_FILTER_X_NAME(fx, maxX, oneX PREAMBLE_ARG_X); - fx += dx; - } -} - -static void PERSP_FILTER_NAME(const SkBitmapProcState& s, - uint32_t* SK_RESTRICT xy, int count, - int x, int y) { - SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask); - - PREAMBLE(s); - unsigned maxX = s.fBitmap->width() - 1; - unsigned maxY = s.fBitmap->height() - 1; - SkFixed oneX = s.fFilterOneX; - SkFixed oneY = s.fFilterOneY; - - SkPerspIter iter(s.fInvMatrix, - SkIntToScalar(x) + SK_ScalarHalf, - SkIntToScalar(y) + SK_ScalarHalf, count); - - while ((count = iter.next()) != 0) { - const SkFixed* SK_RESTRICT srcXY = iter.getXY(); - - if (count >= 4) { - int32x4_t wide_i, wide_lo; - int32x4_t wide_fx1; - int32x4_t wide_fy1; - int32x4_t wide_x, wide_y; - - while (count >= 4) { - /* RBE: it's good, but: - * -- we spill a constant that could be easily regnerated - * [perhaps tweak gcc's NEON constant costs?] - */ - - /* load src: x-y-x-y-x-y-x-y */ - { - register int32x4_t q0 asm ("q0"); - register int32x4_t q1 asm ("q1"); - asm ("vld2.32 {q0-q1},[%2] /* x=%q0 y=%q1 */" - : "=w" (q0), "=w" (q1) - : "r" (srcXY)); - wide_x = q0; wide_y = q1; - } - - /* do the X side, then the Y side, then interleave them */ - - wide_x = vsubq_s32(wide_x, vdupq_n_s32 (oneX>>1)); - - /* original expands to: - * unsigned i = SkClampMax((f) >> 16, max); - * i = (i << 4) | (((f) >> 12) & 0xF); - * return (i << 14) | (SkClampMax(((f + one)) >> 16, max)); - */ - - /* i = SkClampMax(f>>16, maxX) */ - wide_i = vmaxq_s32 (vshrq_n_s32 (wide_x, 16), vdupq_n_s32 (0)); - wide_i = vminq_s32 (wide_i, vdupq_n_s32 (maxX)); - - /* i<<4 | TILEX_LOW_BITS(fx) */ - wide_lo = vshrq_n_s32 (wide_x, 12); - wide_i = vsliq_n_s32 (wide_lo, wide_i, 4); - - /* i<<14 */ - wide_i = vshlq_n_s32 (wide_i, 14); - - /* SkClampMax(((f + one)) >> 16, max) */ - wide_fx1 = vaddq_s32 (wide_x, vdupq_n_s32(oneX)); - wide_fx1 = vmaxq_s32 (vshrq_n_s32 (wide_fx1, 16), vdupq_n_s32 (0)); - wide_fx1 = vminq_s32 (wide_fx1, vdupq_n_s32 (maxX)); - - /* final combination */ - wide_x = vorrq_s32 (wide_i, wide_fx1); - - - /* And now the Y side */ - - wide_y = vsubq_s32(wide_y, vdupq_n_s32 (oneY>>1)); - - /* i = SkClampMax(f>>16, maxX) */ - wide_i = vmaxq_s32 (vshrq_n_s32 (wide_y, 16), vdupq_n_s32 (0)); - wide_i = vminq_s32 (wide_i, vdupq_n_s32 (maxY)); - - /* i<<4 | TILEX_LOW_BITS(fx) */ - wide_lo = vshrq_n_s32 (wide_y, 12); - wide_i = vsliq_n_s32 (wide_lo, wide_i, 4); - - /* i<<14 */ - wide_i = vshlq_n_s32 (wide_i, 14); - - /* SkClampMax(((f + one)) >> 16, max) */ - - /* wide_fy1_1 and wide_fy1_2 are just temporary variables to - * work-around an ICE in debug */ - int32x4_t wide_fy1_1 = vaddq_s32 (wide_y, vdupq_n_s32(oneY)); - int32x4_t wide_fy1_2 = vmaxq_s32 (vshrq_n_s32 (wide_fy1_1, 16), - vdupq_n_s32 (0)); - wide_fy1 = vminq_s32 (wide_fy1_2, vdupq_n_s32 (maxY)); - - /* final combination */ - wide_y = vorrq_s32 (wide_i, wide_fy1); - - /* switch them around; have to do it this way to get them - * in the proper registers to match our instruction */ - - /* iteration bookkeeping, ahead of the asm() for scheduling */ - srcXY += 2*4; - count -= 4; - - /* store interleaved as y-x-y-x-y-x-y-x (NB != read order) */ - { - register int32x4_t q0 asm ("q0") = wide_y; - register int32x4_t q1 asm ("q1") = wide_x; - - asm ("vst2.32 {q0-q1},[%2] /* y=%q0 x=%q1 */" - : - : "w" (q0), "w" (q1), "r" (xy)); - } - - /* on to the next iteration */ - /* count, srcXY are handled above */ - xy += 2*4; - } - } - - /* was do-while; NEON code invalidates original count>0 assumption */ - while (--count >= 0) { - /* NB: we read x/y, we write y/x */ - *xy++ = PACK_FILTER_Y_NAME(srcXY[1] - (oneY >> 1), maxY, - oneY PREAMBLE_ARG_Y); - *xy++ = PACK_FILTER_X_NAME(srcXY[0] - (oneX >> 1), maxX, - oneX PREAMBLE_ARG_X); - srcXY += 2; - } - } -} - -const SkBitmapProcState::MatrixProc MAKENAME(_Procs)[] = { - SCALE_NOFILTER_NAME, - SCALE_FILTER_NAME, - AFFINE_NOFILTER_NAME, - AFFINE_FILTER_NAME, - PERSP_NOFILTER_NAME, - PERSP_FILTER_NAME -}; - -#undef MAKENAME -#undef TILEX_PROCF -#undef TILEY_PROCF -#ifdef CHECK_FOR_DECAL - #undef CHECK_FOR_DECAL -#endif - -#undef SCALE_NOFILTER_NAME -#undef SCALE_FILTER_NAME -#undef AFFINE_NOFILTER_NAME -#undef AFFINE_FILTER_NAME -#undef PERSP_NOFILTER_NAME -#undef PERSP_FILTER_NAME - -#undef PREAMBLE -#undef PREAMBLE_PARAM_X -#undef PREAMBLE_PARAM_Y -#undef PREAMBLE_ARG_X -#undef PREAMBLE_ARG_Y - -#undef TILEX_LOW_BITS -#undef TILEY_LOW_BITS diff --git a/chromium/third_party/skia/src/opts/SkBitmapProcState_matrix_neon.h b/chromium/third_party/skia/src/opts/SkBitmapProcState_matrix_neon.h new file mode 100644 index 00000000000..72bf1bce336 --- /dev/null +++ b/chromium/third_party/skia/src/opts/SkBitmapProcState_matrix_neon.h @@ -0,0 +1,506 @@ + +#include <arm_neon.h> + + +#define SCALE_NOFILTER_NAME MAKENAME(_nofilter_scale) +#define SCALE_FILTER_NAME MAKENAME(_filter_scale) +#define AFFINE_NOFILTER_NAME MAKENAME(_nofilter_affine) +#define AFFINE_FILTER_NAME MAKENAME(_filter_affine) +#define PERSP_NOFILTER_NAME MAKENAME(_nofilter_persp) +#define PERSP_FILTER_NAME MAKENAME(_filter_persp) + +#define PACK_FILTER_X_NAME MAKENAME(_pack_filter_x) +#define PACK_FILTER_Y_NAME MAKENAME(_pack_filter_y) +#define PACK_FILTER_X4_NAME MAKENAME(_pack_filter_x4) +#define PACK_FILTER_Y4_NAME MAKENAME(_pack_filter_y4) + +#ifndef PREAMBLE + #define PREAMBLE(state) + #define PREAMBLE_PARAM_X + #define PREAMBLE_PARAM_Y + #define PREAMBLE_ARG_X + #define PREAMBLE_ARG_Y +#endif + +static void SCALE_NOFILTER_NAME(const SkBitmapProcState& s, + uint32_t xy[], int count, int x, int y) { + SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | + SkMatrix::kScale_Mask)) == 0); + + PREAMBLE(s); + + // we store y, x, x, x, x, x + const unsigned maxX = s.fBitmap->width() - 1; + SkFractionalInt fx; + { + SkPoint pt; + s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf, + SkIntToScalar(y) + SK_ScalarHalf, &pt); + fx = SkScalarToFractionalInt(pt.fY); + const unsigned maxY = s.fBitmap->height() - 1; + *xy++ = TILEY_PROCF(SkFractionalIntToFixed(fx), maxY); + fx = SkScalarToFractionalInt(pt.fX); + } + + if (0 == maxX) { + // all of the following X values must be 0 + memset(xy, 0, count * sizeof(uint16_t)); + return; + } + + const SkFractionalInt dx = s.fInvSxFractionalInt; + +#ifdef CHECK_FOR_DECAL + // test if we don't need to apply the tile proc + if (can_truncate_to_fixed_for_decal(fx, dx, count, maxX)) { + decal_nofilter_scale_neon(xy, SkFractionalIntToFixed(fx), + SkFractionalIntToFixed(dx), count); + return; + } +#endif + + if (count >= 8) { + SkFractionalInt dx2 = dx+dx; + SkFractionalInt dx4 = dx2+dx2; + SkFractionalInt dx8 = dx4+dx4; + + // now build fx/fx+dx/fx+2dx/fx+3dx + SkFractionalInt fx1, fx2, fx3; + int32x4_t lbase, hbase; + int16_t *dst16 = (int16_t *)xy; + + fx1 = fx+dx; + fx2 = fx1+dx; + fx3 = fx2+dx; + + lbase = vdupq_n_s32(SkFractionalIntToFixed(fx)); + lbase = vsetq_lane_s32(SkFractionalIntToFixed(fx1), lbase, 1); + lbase = vsetq_lane_s32(SkFractionalIntToFixed(fx2), lbase, 2); + lbase = vsetq_lane_s32(SkFractionalIntToFixed(fx3), lbase, 3); + hbase = vaddq_s32(lbase, vdupq_n_s32(SkFractionalIntToFixed(dx4))); + + // store & bump + while (count >= 8) { + + int16x8_t fx8; + + fx8 = TILEX_PROCF_NEON8(lbase, hbase, maxX); + + vst1q_s16(dst16, fx8); + + // but preserving base & on to the next + lbase = vaddq_s32 (lbase, vdupq_n_s32(SkFractionalIntToFixed(dx8))); + hbase = vaddq_s32 (hbase, vdupq_n_s32(SkFractionalIntToFixed(dx8))); + dst16 += 8; + count -= 8; + fx += dx8; + }; + xy = (uint32_t *) dst16; + } + + uint16_t* xx = (uint16_t*)xy; + for (int i = count; i > 0; --i) { + *xx++ = TILEX_PROCF(SkFractionalIntToFixed(fx), maxX); + fx += dx; + } +} + +static void AFFINE_NOFILTER_NAME(const SkBitmapProcState& s, + uint32_t xy[], int count, int x, int y) { + SkASSERT(s.fInvType & SkMatrix::kAffine_Mask); + SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | + SkMatrix::kScale_Mask | + SkMatrix::kAffine_Mask)) == 0); + + PREAMBLE(s); + SkPoint srcPt; + s.fInvProc(s.fInvMatrix, + SkIntToScalar(x) + SK_ScalarHalf, + SkIntToScalar(y) + SK_ScalarHalf, &srcPt); + + SkFractionalInt fx = SkScalarToFractionalInt(srcPt.fX); + SkFractionalInt fy = SkScalarToFractionalInt(srcPt.fY); + SkFractionalInt dx = s.fInvSxFractionalInt; + SkFractionalInt dy = s.fInvKyFractionalInt; + int maxX = s.fBitmap->width() - 1; + int maxY = s.fBitmap->height() - 1; + + if (count >= 8) { + SkFractionalInt dx4 = dx * 4; + SkFractionalInt dy4 = dy * 4; + SkFractionalInt dx8 = dx * 8; + SkFractionalInt dy8 = dy * 8; + + int32x4_t xbase, ybase; + int32x4_t x2base, y2base; + int16_t *dst16 = (int16_t *) xy; + + // now build fx, fx+dx, fx+2dx, fx+3dx + xbase = vdupq_n_s32(SkFractionalIntToFixed(fx)); + xbase = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx), xbase, 1); + xbase = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx), xbase, 2); + xbase = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx+dx), xbase, 3); + + // same for fy + ybase = vdupq_n_s32(SkFractionalIntToFixed(fy)); + ybase = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy), ybase, 1); + ybase = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy+dy), ybase, 2); + ybase = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy+dy+dy), ybase, 3); + + x2base = vaddq_s32(xbase, vdupq_n_s32(SkFractionalIntToFixed(dx4))); + y2base = vaddq_s32(ybase, vdupq_n_s32(SkFractionalIntToFixed(dy4))); + + // store & bump + do { + int16x8x2_t hi16; + + hi16.val[0] = TILEX_PROCF_NEON8(xbase, x2base, maxX); + hi16.val[1] = TILEY_PROCF_NEON8(ybase, y2base, maxY); + + vst2q_s16(dst16, hi16); + + // moving base and on to the next + xbase = vaddq_s32(xbase, vdupq_n_s32(SkFractionalIntToFixed(dx8))); + ybase = vaddq_s32(ybase, vdupq_n_s32(SkFractionalIntToFixed(dy8))); + x2base = vaddq_s32(x2base, vdupq_n_s32(SkFractionalIntToFixed(dx8))); + y2base = vaddq_s32(y2base, vdupq_n_s32(SkFractionalIntToFixed(dy8))); + + dst16 += 16; // 8x32 aka 16x16 + count -= 8; + fx += dx8; + fy += dy8; + } while (count >= 8); + xy = (uint32_t *) dst16; + } + + for (int i = count; i > 0; --i) { + *xy++ = (TILEY_PROCF(SkFractionalIntToFixed(fy), maxY) << 16) | + TILEX_PROCF(SkFractionalIntToFixed(fx), maxX); + fx += dx; fy += dy; + } +} + +static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s, + uint32_t* SK_RESTRICT xy, + int count, int x, int y) { + SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask); + + PREAMBLE(s); + // max{X,Y} are int here, but later shown/assumed to fit in 16 bits + int maxX = s.fBitmap->width() - 1; + int maxY = s.fBitmap->height() - 1; + + SkPerspIter iter(s.fInvMatrix, + SkIntToScalar(x) + SK_ScalarHalf, + SkIntToScalar(y) + SK_ScalarHalf, count); + + while ((count = iter.next()) != 0) { + const SkFixed* SK_RESTRICT srcXY = iter.getXY(); + + if (count >= 8) { + int32_t *mysrc = (int32_t *) srcXY; + int16_t *mydst = (int16_t *) xy; + do { + int16x8x2_t hi16; + int32x4x2_t xy1, xy2; + + xy1 = vld2q_s32(mysrc); + xy2 = vld2q_s32(mysrc+8); + + hi16.val[0] = TILEX_PROCF_NEON8(xy1.val[0], xy2.val[0], maxX); + hi16.val[1] = TILEY_PROCF_NEON8(xy1.val[1], xy2.val[1], maxY); + + vst2q_s16(mydst, hi16); + + count -= 8; // 8 iterations + mysrc += 16; // 16 longs + mydst += 16; // 16 shorts, aka 8 longs + } while (count >= 8); + // get xy and srcXY fixed up + srcXY = (const SkFixed *) mysrc; + xy = (uint32_t *) mydst; + } + + while (--count >= 0) { + *xy++ = (TILEY_PROCF(srcXY[1], maxY) << 16) | + TILEX_PROCF(srcXY[0], maxX); + srcXY += 2; + } + } +} + +static inline uint32_t PACK_FILTER_Y_NAME(SkFixed f, unsigned max, + SkFixed one PREAMBLE_PARAM_Y) { + unsigned i = TILEY_PROCF(f, max); + i = (i << 4) | TILEY_LOW_BITS(f, max); + return (i << 14) | (TILEY_PROCF((f + one), max)); +} + +static inline uint32_t PACK_FILTER_X_NAME(SkFixed f, unsigned max, + SkFixed one PREAMBLE_PARAM_X) { + unsigned i = TILEX_PROCF(f, max); + i = (i << 4) | TILEX_LOW_BITS(f, max); + return (i << 14) | (TILEX_PROCF((f + one), max)); +} + +static inline int32x4_t PACK_FILTER_X4_NAME(int32x4_t f, unsigned max, + SkFixed one PREAMBLE_PARAM_X) { + int32x4_t ret, res, wide_one; + + // Prepare constants + wide_one = vdupq_n_s32(one); + + // Step 1 + res = TILEX_PROCF_NEON4(f, max); + + // Step 2 + ret = TILEX_LOW_BITS_NEON4(f, max); + ret = vsliq_n_s32(ret, res, 4); + + // Step 3 + res = TILEX_PROCF_NEON4(f + wide_one, max); + ret = vorrq_s32(vshlq_n_s32(ret, 14), res); + + return ret; +} + +static inline int32x4_t PACK_FILTER_Y4_NAME(int32x4_t f, unsigned max, + SkFixed one PREAMBLE_PARAM_X) { + int32x4_t ret, res, wide_one; + + // Prepare constants + wide_one = vdupq_n_s32(one); + + // Step 1 + res = TILEY_PROCF_NEON4(f, max); + + // Step 2 + ret = TILEY_LOW_BITS_NEON4(f, max); + ret = vsliq_n_s32(ret, res, 4); + + // Step 3 + res = TILEY_PROCF_NEON4(f + wide_one, max); + ret = vorrq_s32(vshlq_n_s32(ret, 14), res); + + return ret; +} + +static void SCALE_FILTER_NAME(const SkBitmapProcState& s, + uint32_t xy[], int count, int x, int y) { + SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | + SkMatrix::kScale_Mask)) == 0); + SkASSERT(s.fInvKy == 0); + + PREAMBLE(s); + + const unsigned maxX = s.fBitmap->width() - 1; + const SkFixed one = s.fFilterOneX; + const SkFractionalInt dx = s.fInvSxFractionalInt; + SkFractionalInt fx; + + { + SkPoint pt; + s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf, + SkIntToScalar(y) + SK_ScalarHalf, &pt); + const SkFixed fy = SkScalarToFixed(pt.fY) - (s.fFilterOneY >> 1); + const unsigned maxY = s.fBitmap->height() - 1; + // compute our two Y values up front + *xy++ = PACK_FILTER_Y_NAME(fy, maxY, s.fFilterOneY PREAMBLE_ARG_Y); + // now initialize fx + fx = SkScalarToFractionalInt(pt.fX) - (SkFixedToFractionalInt(one) >> 1); + } + +#ifdef CHECK_FOR_DECAL + // test if we don't need to apply the tile proc + if (can_truncate_to_fixed_for_decal(fx, dx, count, maxX)) { + decal_filter_scale_neon(xy, SkFractionalIntToFixed(fx), + SkFractionalIntToFixed(dx), count); + return; + } +#endif + { + + if (count >= 4) { + int32x4_t wide_fx; + + wide_fx = vdupq_n_s32(SkFractionalIntToFixed(fx)); + wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx), wide_fx, 1); + wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx), wide_fx, 2); + wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx+dx), wide_fx, 3); + + while (count >= 4) { + int32x4_t res; + + res = PACK_FILTER_X4_NAME(wide_fx, maxX, one PREAMBLE_ARG_X); + + vst1q_u32(xy, vreinterpretq_u32_s32(res)); + + wide_fx += vdupq_n_s32(SkFractionalIntToFixed(dx+dx+dx+dx)); + fx += dx+dx+dx+dx; + xy += 4; + count -= 4; + } + } + + while (--count >= 0) { + *xy++ = PACK_FILTER_X_NAME(SkFractionalIntToFixed(fx), maxX, one PREAMBLE_ARG_X); + fx += dx; + } + + } +} + +static void AFFINE_FILTER_NAME(const SkBitmapProcState& s, + uint32_t xy[], int count, int x, int y) { + SkASSERT(s.fInvType & SkMatrix::kAffine_Mask); + SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | + SkMatrix::kScale_Mask | + SkMatrix::kAffine_Mask)) == 0); + + PREAMBLE(s); + SkPoint srcPt; + s.fInvProc(s.fInvMatrix, + SkIntToScalar(x) + SK_ScalarHalf, + SkIntToScalar(y) + SK_ScalarHalf, &srcPt); + + SkFixed oneX = s.fFilterOneX; + SkFixed oneY = s.fFilterOneY; + SkFixed fx = SkScalarToFixed(srcPt.fX) - (oneX >> 1); + SkFixed fy = SkScalarToFixed(srcPt.fY) - (oneY >> 1); + SkFixed dx = s.fInvSx; + SkFixed dy = s.fInvKy; + unsigned maxX = s.fBitmap->width() - 1; + unsigned maxY = s.fBitmap->height() - 1; + + if (count >= 4) { + int32x4_t wide_fy, wide_fx; + + wide_fx = vdupq_n_s32(fx); + wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1); + wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2); + wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3); + + wide_fy = vdupq_n_s32(fy); + wide_fy = vsetq_lane_s32(fy+dy, wide_fy, 1); + wide_fy = vsetq_lane_s32(fy+dy+dy, wide_fy, 2); + wide_fy = vsetq_lane_s32(fy+dy+dy+dy, wide_fy, 3); + + while (count >= 4) { + int32x4x2_t vxy; + + // do the X side, then the Y side, then interleave them + vxy.val[0] = PACK_FILTER_Y4_NAME(wide_fy, maxY, oneY PREAMBLE_ARG_Y); + vxy.val[1] = PACK_FILTER_X4_NAME(wide_fx, maxX, oneX PREAMBLE_ARG_X); + + // interleave as YXYXYXYX as part of the storing + vst2q_s32((int32_t*)xy, vxy); + + // prepare next iteration + wide_fx += vdupq_n_s32(dx+dx+dx+dx); + fx += dx + dx + dx + dx; + wide_fy += vdupq_n_s32(dy+dy+dy+dy); + fy += dy+dy+dy+dy; + xy += 8; // 4 x's, 4 y's + count -= 4; + } + } + + while (--count >= 0) { + // NB: writing Y/X + *xy++ = PACK_FILTER_Y_NAME(fy, maxY, oneY PREAMBLE_ARG_Y); + fy += dy; + *xy++ = PACK_FILTER_X_NAME(fx, maxX, oneX PREAMBLE_ARG_X); + fx += dx; + } +} + +static void PERSP_FILTER_NAME(const SkBitmapProcState& s, + uint32_t* SK_RESTRICT xy, int count, + int x, int y) { + SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask); + + PREAMBLE(s); + unsigned maxX = s.fBitmap->width() - 1; + unsigned maxY = s.fBitmap->height() - 1; + SkFixed oneX = s.fFilterOneX; + SkFixed oneY = s.fFilterOneY; + + SkPerspIter iter(s.fInvMatrix, + SkIntToScalar(x) + SK_ScalarHalf, + SkIntToScalar(y) + SK_ScalarHalf, count); + + while ((count = iter.next()) != 0) { + const SkFixed* SK_RESTRICT srcXY = iter.getXY(); + + while (count >= 4) { + int32x4_t wide_x, wide_y; + int32x4x2_t vxy, vresyx; + + // load src: x-y-x-y-x-y-x-y + vxy = vld2q_s32(srcXY); + + // do the X side, then the Y side, then interleave them + wide_x = vsubq_s32(vxy.val[0], vdupq_n_s32(oneX>>1)); + wide_y = vsubq_s32(vxy.val[1], vdupq_n_s32(oneY>>1)); + + vresyx.val[0] = PACK_FILTER_Y4_NAME(wide_y, maxY, oneY PREAMBLE_ARG_Y); + vresyx.val[1] = PACK_FILTER_X4_NAME(wide_x, maxX, oneX PREAMBLE_ARG_X); + + // store interleaved as y-x-y-x-y-x-y-x (NB != read order) + vst2q_s32((int32_t*)xy, vresyx); + + // on to the next iteration + srcXY += 2*4; + count -= 4; + xy += 2*4; + } + + while (--count >= 0) { + // NB: we read x/y, we write y/x + *xy++ = PACK_FILTER_Y_NAME(srcXY[1] - (oneY >> 1), maxY, + oneY PREAMBLE_ARG_Y); + *xy++ = PACK_FILTER_X_NAME(srcXY[0] - (oneX >> 1), maxX, + oneX PREAMBLE_ARG_X); + srcXY += 2; + } + } +} + +const SkBitmapProcState::MatrixProc MAKENAME(_Procs)[] = { + SCALE_NOFILTER_NAME, + SCALE_FILTER_NAME, + AFFINE_NOFILTER_NAME, + AFFINE_FILTER_NAME, + PERSP_NOFILTER_NAME, + PERSP_FILTER_NAME +}; + +#undef TILEX_PROCF_NEON8 +#undef TILEY_PROCF_NEON8 +#undef TILEX_PROCF_NEON4 +#undef TILEY_PROCF_NEON4 +#undef TILEX_LOW_BITS_NEON4 +#undef TILEY_LOW_BITS_NEON4 + +#undef MAKENAME +#undef TILEX_PROCF +#undef TILEY_PROCF +#ifdef CHECK_FOR_DECAL + #undef CHECK_FOR_DECAL +#endif + +#undef SCALE_NOFILTER_NAME +#undef SCALE_FILTER_NAME +#undef AFFINE_NOFILTER_NAME +#undef AFFINE_FILTER_NAME +#undef PERSP_NOFILTER_NAME +#undef PERSP_FILTER_NAME + +#undef PREAMBLE +#undef PREAMBLE_PARAM_X +#undef PREAMBLE_PARAM_Y +#undef PREAMBLE_ARG_X +#undef PREAMBLE_ARG_Y + +#undef TILEX_LOW_BITS +#undef TILEY_LOW_BITS diff --git a/chromium/third_party/skia/src/opts/SkBitmapProcState_matrix_repeat_neon.h b/chromium/third_party/skia/src/opts/SkBitmapProcState_matrix_repeat_neon.h deleted file mode 100644 index 55e2997a5ef..00000000000 --- a/chromium/third_party/skia/src/opts/SkBitmapProcState_matrix_repeat_neon.h +++ /dev/null @@ -1,542 +0,0 @@ -/* NEON optimized code (C) COPYRIGHT 2009 Motorola - * - * Use of this source code is governed by a BSD-style license that can be - * found in the LICENSE file. - */ - -/* - * Modifications done in-house at Motorola - * - * this is a clone of SkBitmapProcState_matrix.h - * and has been tuned to work with the NEON unit. - * - * Still going back and forth between whether this approach - * (clone the entire SkBitmapProcState_matrix.h file or - * if I should put just the modified routines in here and - * then use a construct like #define DONT_DO_THIS_FUNCTION or - * something like that... - * - * This is for the RepeatX_RepeatY part of the world - */ - - -#include <arm_neon.h> - -/* - * This has been modified on the knowledge that (at the time) - * we had the following macro definitions in the parent file - * - * #define MAKENAME(suffix) RepeatX_RepeatY ## suffix - * #define TILEX_PROCF(fx, max) (((fx) & 0xFFFF) * ((max) + 1) >> 16) - * #define TILEY_PROCF(fy, max) (((fy) & 0xFFFF) * ((max) + 1) >> 16) - * #define TILEX_LOW_BITS(fx, max) ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF) - * #define TILEY_LOW_BITS(fy, max) ((((fy) & 0xFFFF) * ((max) + 1) >> 12) & 0xF) - */ - -/* SkClampMax(val,max) -- bound to 0..max */ - -#define SCALE_NOFILTER_NAME MAKENAME(_nofilter_scale) -#define SCALE_FILTER_NAME MAKENAME(_filter_scale) -#define AFFINE_NOFILTER_NAME MAKENAME(_nofilter_affine) -#define AFFINE_FILTER_NAME MAKENAME(_filter_affine) -#define PERSP_NOFILTER_NAME MAKENAME(_nofilter_persp) -#define PERSP_FILTER_NAME MAKENAME(_filter_persp) - -#define PACK_FILTER_X_NAME MAKENAME(_pack_filter_x) -#define PACK_FILTER_Y_NAME MAKENAME(_pack_filter_y) - -#ifndef PREAMBLE - #define PREAMBLE(state) - #define PREAMBLE_PARAM_X - #define PREAMBLE_PARAM_Y - #define PREAMBLE_ARG_X - #define PREAMBLE_ARG_Y -#endif - -static void SCALE_NOFILTER_NAME(const SkBitmapProcState& s, - uint32_t xy[], int count, int x, int y) { - SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | - SkMatrix::kScale_Mask)) == 0); - - PREAMBLE(s); - // we store y, x, x, x, x, x - - const unsigned maxX = s.fBitmap->width() - 1; - SkFixed fx; - { - SkPoint pt; - s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf, - SkIntToScalar(y) + SK_ScalarHalf, &pt); - fx = SkScalarToFixed(pt.fY); - const unsigned maxY = s.fBitmap->height() - 1; - *xy++ = TILEY_PROCF(fx, maxY); - fx = SkScalarToFixed(pt.fX); - } - - if (0 == maxX) { - // all of the following X values must be 0 - memset(xy, 0, count * sizeof(uint16_t)); - return; - } - - const SkFixed dx = s.fInvSx; - -#ifdef CHECK_FOR_DECAL - // test if we don't need to apply the tile proc - if ((unsigned)(fx >> 16) <= maxX && - (unsigned)((fx + dx * (count - 1)) >> 16) <= maxX) { - decal_nofilter_scale_neon(xy, fx, dx, count); - } else -#endif - { - int i; - - /* RBE: very much like done in decal_nofilter , - * but some processing of the 'fx' information - * TILEX_PROCF(fx, max) (((fx) & 0xFFFF) * ((max) + 1) >> 16) - */ - if (count >= 8) { - /* SkFixed is 16.16 fixed point */ - SkFixed dx2 = dx+dx; - SkFixed dx4 = dx2+dx2; - SkFixed dx8 = dx4+dx4; - - /* now build fx/fx+dx/fx+2dx/fx+3dx */ - SkFixed fx1, fx2, fx3; - int32x4_t lbase, hbase; - int16_t *dst16 = (int16_t *)xy; - - fx1 = fx+dx; - fx2 = fx1+dx; - fx3 = fx2+dx; - - lbase = vdupq_n_s32(fx); - lbase = vsetq_lane_s32(fx1, lbase, 1); - lbase = vsetq_lane_s32(fx2, lbase, 2); - lbase = vsetq_lane_s32(fx3, lbase, 3); - hbase = vaddq_s32(lbase, vdupq_n_s32(dx4)); - - /* store & bump */ - do - { - int32x4_t lout; - int32x4_t hout; - int16x8_t hi16; - - /* TILEX_PROCF(fx, max) (((fx)&0xFFFF)*((max)+1)>> 16) */ - /* mask to low 16 [would like to use uzp tricks) */ - lout = vandq_s32(lbase, vdupq_n_s32(0xffff)); - hout = vandq_s32(hbase, vdupq_n_s32(0xffff)); - /* bare multiplication, not SkFixedMul */ - lout = vmulq_s32(lout, vdupq_n_s32(maxX+1)); - hout = vmulq_s32(hout, vdupq_n_s32(maxX+1)); - - /* extraction, using uzp */ - /* this is ok -- we want all hi(lout)s then all hi(hout)s */ - asm ("vuzpq.16 %q0, %q1" : "+w" (lout), "+w" (hout)); - hi16 = vreinterpretq_s16_s32(hout); - vst1q_s16(dst16, hi16); - - /* bump our base on to the next */ - lbase = vaddq_s32 (lbase, vdupq_n_s32(dx8)); - hbase = vaddq_s32 (hbase, vdupq_n_s32(dx8)); - dst16 += 8; - count -= 8; - fx += dx8; - } while (count >= 8); - xy = (uint32_t *) dst16; - } - uint16_t* xx = (uint16_t*)xy; - for (i = count; i > 0; --i) { - *xx++ = TILEX_PROCF(fx, maxX); fx += dx; - } - } -} - -// note: we could special-case on a matrix which is skewed in X but not Y. -// this would require a more general setup thatn SCALE does, but could use -// SCALE's inner loop that only looks at dx - - -static void AFFINE_NOFILTER_NAME(const SkBitmapProcState& s, - uint32_t xy[], int count, int x, int y) { - SkASSERT(s.fInvType & SkMatrix::kAffine_Mask); - SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | - SkMatrix::kScale_Mask | - SkMatrix::kAffine_Mask)) == 0); - - PREAMBLE(s); - SkPoint srcPt; - s.fInvProc(s.fInvMatrix, - SkIntToScalar(x) + SK_ScalarHalf, - SkIntToScalar(y) + SK_ScalarHalf, &srcPt); - - SkFixed fx = SkScalarToFixed(srcPt.fX); - SkFixed fy = SkScalarToFixed(srcPt.fY); - SkFixed dx = s.fInvSx; - SkFixed dy = s.fInvKy; - int maxX = s.fBitmap->width() - 1; - int maxY = s.fBitmap->height() - 1; - -#if 0 - int ocount = count; - uint32_t *oxy = xy; - SkFixed bfx = fx, bfy=fy, bdx=dx, bdy=dy; -#endif - - - if (0) { extern void rbe(void); rbe(); } - - /* RBE: benchmarks show this eats up time; can we neonize it? */ - /* RBE: very much like done in decal_nofilter , - * but some processing of the 'fx' information - * TILEX_PROCF(fx, max) (((fx) & 0xFFFF) * ((max) + 1) >> 16) - */ - if (count >= 4) { - /* SkFixed is 16.16 fixed point */ - SkFixed dx4 = dx*4; - SkFixed dy4 = dy*4; - - /* now build fx/fx+dx/fx+2dx/fx+3dx */ - int32x4_t xbase, ybase; - int16_t *dst16 = (int16_t *)xy; - - /* synthesize 4x for both X and Y */ - xbase = vdupq_n_s32(fx); - xbase = vsetq_lane_s32(fx+dx, xbase, 1); - xbase = vsetq_lane_s32(fx+dx+dx, xbase, 2); - xbase = vsetq_lane_s32(fx+dx+dx+dx, xbase, 3); - - ybase = vdupq_n_s32(fy); - ybase = vsetq_lane_s32(fy+dy, ybase, 1); - ybase = vsetq_lane_s32(fy+dy+dy, ybase, 2); - ybase = vsetq_lane_s32(fy+dy+dy+dy, ybase, 3); - - /* store & bump */ - do { - int32x4_t xout; - int32x4_t yout; - int16x8_t hi16; - - /* TILEX_PROCF(fx, max) (((fx)&0xFFFF)*((max)+1)>> 16) */ - /* mask to low 16 [would like to use uzp tricks) */ - xout = vandq_s32(xbase, vdupq_n_s32(0xffff)); - yout = vandq_s32(ybase, vdupq_n_s32(0xffff)); - /* bare multiplication, not SkFixedMul */ - xout = vmulq_s32(xout, vdupq_n_s32(maxX+1)); - yout = vmulq_s32(yout, vdupq_n_s32(maxY+1)); - - /* put hi16 from xout over low16 from yout */ - yout = vsriq_n_s32(yout, xout, 16); - - /* and then yout has the interleaved upper 16's */ - hi16 = vreinterpretq_s16_s32(yout); - vst1q_s16(dst16, hi16); - - /* bump preserved base & on to the next */ - xbase = vaddq_s32 (xbase, vdupq_n_s32(dx4)); - ybase = vaddq_s32 (ybase, vdupq_n_s32(dy4)); - dst16 += 8; /* 8 x16 aka 4x32 */ - count -= 4; - fx += dx4; - fy += dy4; - } while (count >= 4); - xy = (uint32_t *) dst16; - } - -#if 0 - /* diagnostics... see whether we agree with the NEON code */ - int bad = 0; - uint32_t *myxy = oxy; - int myi = (-1); - SkFixed ofx = bfx, ofy= bfy, odx= bdx, ody= bdy; - for (myi = ocount; myi > 0; --myi) { - uint32_t val = (TILEY_PROCF(ofy, maxY) << 16) | TILEX_PROCF(ofx, maxX); - if (val != *myxy++) { - bad++; - break; - } - ofx += odx; ofy += ody; - } - if (bad) { - SkDebugf("repeat-nofilter-affine fails\n"); - SkDebugf("count %d myi %d\n", ocount, myi); - SkDebugf(" bfx %08x, bdx %08x, bfy %08x bdy %08x\n", - bfx, bdx, bfy, bdy); - SkDebugf("maxX %08x maxY %08x\n", maxX, maxY); - } -#endif - - for (int i = count; i > 0; --i) { - /* fx, fy, dx, dy are all 32 bit 16.16 fixed point */ - /* (((fx) & 0xFFFF) * ((max) + 1) >> 16) */ - *xy++ = (TILEY_PROCF(fy, maxY) << 16) | TILEX_PROCF(fx, maxX); - fx += dx; fy += dy; - } -} - -static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s, - uint32_t* SK_RESTRICT xy, - int count, int x, int y) { - SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask); - - PREAMBLE(s); - int maxX = s.fBitmap->width() - 1; - int maxY = s.fBitmap->height() - 1; - - SkPerspIter iter(s.fInvMatrix, - SkIntToScalar(x) + SK_ScalarHalf, - SkIntToScalar(y) + SK_ScalarHalf, count); - - while ((count = iter.next()) != 0) { - const SkFixed* SK_RESTRICT srcXY = iter.getXY(); - - /* RBE: */ - /* TILEX_PROCF(fx, max) (((fx) & 0xFFFF) * ((max) + 1) >> 16) */ - /* it's a little more complicated than what I did for the - * clamp case -- where I could immediately snip to the top - * 16 bits and do my min/max games there. - * ... might only be able to get 4x unrolling here - */ - - /* vld2 to get a set of 32x4's ... */ - /* do the tile[xy]_procf operations */ - /* which includes doing vuzp to get hi16's */ - /* store it */ - /* -- inner loop (other than vld2) can be had from above */ - - /* srcXY is a batch of 32 bit numbers X0,Y0,X1,Y1... - * but we immediately discard the low 16 bits... - * so what we're going to do is vld4, which will give us - * xlo,xhi,ylo,yhi distribution and we can ignore the 'lo' - * parts.... - */ - if (0) { extern void rbe(void); rbe(); } - if (count >= 8) { - int32_t *mysrc = (int32_t *) srcXY; - int16_t *mydst = (int16_t *) xy; - do { - int32x4_t x, y, x2, y2; - int16x8_t hi, hi2; - - /* read array of x,y,x,y,x,y */ - /* vld2 does the de-interleaving for us */ - /* isolate reg-bound scopes; gcc will minimize register - * motion if possible; this ensures that we don't lose - * a register across a debugging call because it happens - * to be bound into a call-clobbered register - */ - { - register int32x4_t q0 asm("q0"); - register int32x4_t q1 asm("q1"); - asm ("vld2.32 {q0-q1},[%2] /* x=%q0 y=%q1 */" - : "=w" (q0), "=w" (q1) - : "r" (mysrc) - ); - x = q0; y = q1; - } - - /* offset == 256 bits == 32 bytes == 8 longs */ - { - register int32x4_t q2 asm("q2"); - register int32x4_t q3 asm("q3"); - asm ("vld2.32 {q2-q3},[%2] /* x=%q0 y=%q1 */" - : "=w" (q2), "=w" (q3) - : "r" (mysrc+8) - ); - x2 = q2; y2 = q3; - } - - /* TILEX_PROCF(fx, max) (((fx)&0xFFFF)*((max)+1)>> 16) */ - /* mask to low 16 [would like to use uzp tricks) */ - /* bare multiplication, not SkFixedMul */ - x = vandq_s32(x, vdupq_n_s32(0xffff)); - x = vmulq_s32(x, vdupq_n_s32(maxX+1)); - y = vandq_s32(y, vdupq_n_s32(0xffff)); - y = vmulq_s32(y, vdupq_n_s32(maxY+1)); - - x2 = vandq_s32(x2, vdupq_n_s32(0xffff)); - x2 = vmulq_s32(x2, vdupq_n_s32(maxX+1)); - y2 = vandq_s32(y2, vdupq_n_s32(0xffff)); - y2 = vmulq_s32(y2, vdupq_n_s32(maxY+1)); - - /* now collect interleaved high 16's */ - /* (hi-x, hi-y)4 (hi-x2; hi-y2)4 */ - - /* extraction, using uzp, leaves hi16's in y */ - y = vsriq_n_s32(y, x, 16); - hi = vreinterpretq_s16_s32(y); - vst1q_s16(mydst, hi); - - /* and likewise for the second 8 entries */ - y2 = vsriq_n_s32(y2, x2, 16); - hi2 = vreinterpretq_s16_s32(y2); - vst1q_s16(mydst+8, hi2); - - /* XXX: gcc isn't interleaving these with the NEON ops - * but i think that all the scoreboarding works out */ - count -= 8; /* 8 iterations */ - mysrc += 16; /* 16 longs */ - mydst += 16; /* 16 shorts, aka 8 longs */ - } while (count >= 8); - /* get xy and srcXY fixed up */ - srcXY = (const SkFixed *) mysrc; - xy = (uint32_t *) mydst; - } - while (--count >= 0) { - *xy++ = (TILEY_PROCF(srcXY[1], maxY) << 16) | - TILEX_PROCF(srcXY[0], maxX); - srcXY += 2; - } - } -} - -////////////////////////////////////////////////////////////////////////////// - -static inline uint32_t PACK_FILTER_Y_NAME(SkFixed f, unsigned max, - SkFixed one PREAMBLE_PARAM_Y) { - unsigned i = TILEY_PROCF(f, max); - i = (i << 4) | TILEY_LOW_BITS(f, max); - return (i << 14) | (TILEY_PROCF((f + one), max)); -} - -static inline uint32_t PACK_FILTER_X_NAME(SkFixed f, unsigned max, - SkFixed one PREAMBLE_PARAM_X) { - unsigned i = TILEX_PROCF(f, max); - i = (i << 4) | TILEX_LOW_BITS(f, max); - return (i << 14) | (TILEX_PROCF((f + one), max)); -} - -static void SCALE_FILTER_NAME(const SkBitmapProcState& s, - uint32_t xy[], int count, int x, int y) { - SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | - SkMatrix::kScale_Mask)) == 0); - SkASSERT(s.fInvKy == 0); - - PREAMBLE(s); - - const unsigned maxX = s.fBitmap->width() - 1; - const SkFixed one = s.fFilterOneX; - const SkFractionalInt dx = s.fInvSxFractionalInt; - SkFractionalInt fx; - - { - SkPoint pt; - s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf, - SkIntToScalar(y) + SK_ScalarHalf, &pt); - const SkFixed fy = SkScalarToFixed(pt.fY) - (s.fFilterOneY >> 1); - const unsigned maxY = s.fBitmap->height() - 1; - // compute our two Y values up front - *xy++ = PACK_FILTER_Y_NAME(fy, maxY, s.fFilterOneY PREAMBLE_ARG_Y); - // now initialize fx - fx = SkScalarToFractionalInt(pt.fX) - (SkFixedToFractionalInt(one) >> 1); - } - -#ifdef CHECK_FOR_DECAL - // test if we don't need to apply the tile proc - if (can_truncate_to_fixed_for_decal(fx, dx, count, maxX)) { - decal_filter_scale_neon(xy, SkFractionalIntToFixed(fx), - SkFractionalIntToFixed(dx), count); - } else -#endif - { - do { - SkFixed fixedFx = SkFractionalIntToFixed(fx); - *xy++ = PACK_FILTER_X_NAME(fixedFx, maxX, one PREAMBLE_ARG_X); - fx += dx; - } while (--count != 0); - } -} - -static void AFFINE_FILTER_NAME(const SkBitmapProcState& s, - uint32_t xy[], int count, int x, int y) { - SkASSERT(s.fInvType & SkMatrix::kAffine_Mask); - SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | - SkMatrix::kScale_Mask | - SkMatrix::kAffine_Mask)) == 0); - - PREAMBLE(s); - SkPoint srcPt; - s.fInvProc(s.fInvMatrix, - SkIntToScalar(x) + SK_ScalarHalf, - SkIntToScalar(y) + SK_ScalarHalf, &srcPt); - - SkFixed oneX = s.fFilterOneX; - SkFixed oneY = s.fFilterOneY; - SkFixed fx = SkScalarToFixed(srcPt.fX) - (oneX >> 1); - SkFixed fy = SkScalarToFixed(srcPt.fY) - (oneY >> 1); - SkFixed dx = s.fInvSx; - SkFixed dy = s.fInvKy; - unsigned maxX = s.fBitmap->width() - 1; - unsigned maxY = s.fBitmap->height() - 1; - - do { - *xy++ = PACK_FILTER_Y_NAME(fy, maxY, oneY PREAMBLE_ARG_Y); - fy += dy; - *xy++ = PACK_FILTER_X_NAME(fx, maxX, oneX PREAMBLE_ARG_X); - fx += dx; - } while (--count != 0); -} - -static void PERSP_FILTER_NAME(const SkBitmapProcState& s, - uint32_t* SK_RESTRICT xy, int count, - int x, int y) { - SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask); - - extern void rbe(void); - - PREAMBLE(s); - unsigned maxX = s.fBitmap->width() - 1; - unsigned maxY = s.fBitmap->height() - 1; - SkFixed oneX = s.fFilterOneX; - SkFixed oneY = s.fFilterOneY; - - - - SkPerspIter iter(s.fInvMatrix, - SkIntToScalar(x) + SK_ScalarHalf, - SkIntToScalar(y) + SK_ScalarHalf, count); - - while ((count = iter.next()) != 0) { - const SkFixed* SK_RESTRICT srcXY = iter.getXY(); - do { - *xy++ = PACK_FILTER_Y_NAME(srcXY[1] - (oneY >> 1), maxY, - oneY PREAMBLE_ARG_Y); - *xy++ = PACK_FILTER_X_NAME(srcXY[0] - (oneX >> 1), maxX, - oneX PREAMBLE_ARG_X); - srcXY += 2; - } while (--count != 0); - } -} - -const SkBitmapProcState::MatrixProc MAKENAME(_Procs)[] = { - SCALE_NOFILTER_NAME, - SCALE_FILTER_NAME, - AFFINE_NOFILTER_NAME, - AFFINE_FILTER_NAME, - PERSP_NOFILTER_NAME, - PERSP_FILTER_NAME -}; - -#undef MAKENAME -#undef TILEX_PROCF -#undef TILEY_PROCF -#ifdef CHECK_FOR_DECAL - #undef CHECK_FOR_DECAL -#endif - -#undef SCALE_NOFILTER_NAME -#undef SCALE_FILTER_NAME -#undef AFFINE_NOFILTER_NAME -#undef AFFINE_FILTER_NAME -#undef PERSP_NOFILTER_NAME -#undef PERSP_FILTER_NAME - -#undef PREAMBLE -#undef PREAMBLE_PARAM_X -#undef PREAMBLE_PARAM_Y -#undef PREAMBLE_ARG_X -#undef PREAMBLE_ARG_Y - -#undef TILEX_LOW_BITS -#undef TILEY_LOW_BITS diff --git a/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSE2.cpp b/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSE2.cpp index 0b079977eb8..1f3bbc1f8f7 100644 --- a/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSE2.cpp +++ b/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSE2.cpp @@ -1,4 +1,3 @@ - /* * Copyright 2009 The Android Open Source Project * @@ -6,9 +5,9 @@ * found in the LICENSE file. */ - #include <emmintrin.h> #include "SkBitmapProcState_opts_SSE2.h" +#include "SkColorPriv.h" #include "SkPaint.h" #include "SkUtils.h" @@ -17,7 +16,7 @@ void S32_opaque_D32_filter_DX_SSE2(const SkBitmapProcState& s, int count, uint32_t* colors) { SkASSERT(count > 0 && colors != NULL); SkASSERT(s.fFilterLevel != SkPaint::kNone_FilterLevel); - SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config); + SkASSERT(kN32_SkColorType == s.fBitmap->colorType()); SkASSERT(s.fAlphaScale == 256); const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels()); @@ -123,7 +122,7 @@ void S32_alpha_D32_filter_DX_SSE2(const SkBitmapProcState& s, int count, uint32_t* colors) { SkASSERT(count > 0 && colors != NULL); SkASSERT(s.fFilterLevel != SkPaint::kNone_FilterLevel); - SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config); + SkASSERT(kN32_SkColorType == s.fBitmap->colorType()); SkASSERT(s.fAlphaScale < 256); const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels()); @@ -639,11 +638,11 @@ void ClampX_ClampY_nofilter_affine_SSE2(const SkBitmapProcState& s, * It combines S32_opaque_D32_filter_DX_SSE2 and SkPixel32ToPixel16 */ void S32_D16_filter_DX_SSE2(const SkBitmapProcState& s, - const uint32_t* xy, - int count, uint16_t* colors) { + const uint32_t* xy, + int count, uint16_t* colors) { SkASSERT(count > 0 && colors != NULL); SkASSERT(s.fFilterLevel != SkPaint::kNone_FilterLevel); - SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config); + SkASSERT(kN32_SkColorType == s.fBitmap->colorType()); SkASSERT(s.fBitmap->isOpaque()); SkPMColor dstColor; @@ -744,23 +743,6 @@ void S32_D16_filter_DX_SSE2(const SkBitmapProcState& s, // Extract low int and store. dstColor = _mm_cvtsi128_si32(sum); - //*colors++ = SkPixel32ToPixel16(dstColor); - // below is much faster than the above. It's tested for Android benchmark--Softweg - __m128i _m_temp1 = _mm_set1_epi32(dstColor); - __m128i _m_temp2 = _mm_srli_epi32(_m_temp1, 3); - - unsigned int r32 = _mm_cvtsi128_si32(_m_temp2); - unsigned r = (r32 & ((1<<5) -1)) << 11; - - _m_temp2 = _mm_srli_epi32(_m_temp2, 7); - unsigned int g32 = _mm_cvtsi128_si32(_m_temp2); - unsigned g = (g32 & ((1<<6) -1)) << 5; - - _m_temp2 = _mm_srli_epi32(_m_temp2, 9); - unsigned int b32 = _mm_cvtsi128_si32(_m_temp2); - unsigned b = (b32 & ((1<<5) -1)); - - *colors++ = r | g | b; - + *colors++ = SkPixel32ToPixel16(dstColor); } while (--count > 0); } diff --git a/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSE2.h b/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSE2.h index 46e35a0f96f..82c5cc8d6e1 100644 --- a/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSE2.h +++ b/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSE2.h @@ -1,4 +1,3 @@ - /* * Copyright 2009 The Android Open Source Project * @@ -6,6 +5,8 @@ * found in the LICENSE file. */ +#ifndef SkBitmapProcState_opts_SSE2_DEFINED +#define SkBitmapProcState_opts_SSE2_DEFINED #include "SkBitmapProcState.h" @@ -24,7 +25,9 @@ void ClampX_ClampY_nofilter_scale_SSE2(const SkBitmapProcState& s, void ClampX_ClampY_filter_affine_SSE2(const SkBitmapProcState& s, uint32_t xy[], int count, int x, int y); void ClampX_ClampY_nofilter_affine_SSE2(const SkBitmapProcState& s, - uint32_t xy[], int count, int x, int y); + uint32_t xy[], int count, int x, int y); void S32_D16_filter_DX_SSE2(const SkBitmapProcState& s, - const uint32_t* xy, - int count, uint16_t* colors); + const uint32_t* xy, + int count, uint16_t* colors); + +#endif diff --git a/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSSE3.cpp b/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSSE3.cpp index f8342ecaad5..5b97215cc01 100644 --- a/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSSE3.cpp +++ b/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSSE3.cpp @@ -5,11 +5,19 @@ * found in the LICENSE file. */ -#include <tmmintrin.h> // SSSE3 #include "SkBitmapProcState_opts_SSSE3.h" #include "SkPaint.h" #include "SkUtils.h" +/* With the exception of the Android framework we always build the SSSE3 functions + * and enable the caller to determine SSSE3 support. However for the Android framework + * if the device does not support SSSE3 then the compiler will not supply the required + * -mssse3 option needed to build this file, so instead we provide a stub implementation. + */ +#if !defined(SK_BUILD_FOR_ANDROID_FRAMEWORK) || SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 + +#include <tmmintrin.h> // SSSE3 + // adding anonymous namespace seemed to force gcc to inline directly the // instantiation, instead of creating the functions // S32_generic_D32_filter_DX_SSSE3<true> and @@ -387,7 +395,7 @@ void S32_generic_D32_filter_DX_SSSE3(const SkBitmapProcState& s, int count, uint32_t* colors) { SkASSERT(count > 0 && colors != NULL); SkASSERT(s.fFilterLevel != SkPaint::kNone_FilterLevel); - SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config); + SkASSERT(kN32_SkColorType == s.fBitmap->colorType()); if (has_alpha) { SkASSERT(s.fAlphaScale < 256); } else { @@ -417,9 +425,10 @@ void S32_generic_D32_filter_DX_SSSE3(const SkBitmapProcState& s, const __m128i zero = _mm_setzero_si128(); __m128i alpha = _mm_setzero_si128(); - if (has_alpha) + if (has_alpha) { // 8x(alpha) alpha = _mm_set1_epi16(s.fAlphaScale); + } if (sub_y == 0) { // Unroll 4x, interleave bytes, use pmaddubsw (all_x is small) @@ -578,7 +587,7 @@ void S32_generic_D32_filter_DXDY_SSSE3(const SkBitmapProcState& s, int count, uint32_t* colors) { SkASSERT(count > 0 && colors != NULL); SkASSERT(s.fFilterLevel != SkPaint::kNone_FilterLevel); - SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config); + SkASSERT(kN32_SkColorType == s.fBitmap->colorType()); if (has_alpha) { SkASSERT(s.fAlphaScale < 256); } else { @@ -697,7 +706,7 @@ void S32_generic_D32_filter_DXDY_SSSE3(const SkBitmapProcState& s, *colors++ = _mm_cvtsi128_si32(sum0); } } -} // namepace +} // namespace void S32_opaque_D32_filter_DX_SSSE3(const SkBitmapProcState& s, const uint32_t* xy, @@ -722,3 +731,31 @@ void S32_alpha_D32_filter_DXDY_SSSE3(const SkBitmapProcState& s, int count, uint32_t* colors) { S32_generic_D32_filter_DXDY_SSSE3<true>(s, xy, count, colors); } + +#else // !defined(SK_BUILD_FOR_ANDROID_FRAMEWORK) || SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 + +void S32_opaque_D32_filter_DX_SSSE3(const SkBitmapProcState& s, + const uint32_t* xy, + int count, uint32_t* colors) { + sk_throw(); +} + +void S32_alpha_D32_filter_DX_SSSE3(const SkBitmapProcState& s, + const uint32_t* xy, + int count, uint32_t* colors) { + sk_throw(); +} + +void S32_opaque_D32_filter_DXDY_SSSE3(const SkBitmapProcState& s, + const uint32_t* xy, + int count, uint32_t* colors) { + sk_throw(); +} + +void S32_alpha_D32_filter_DXDY_SSSE3(const SkBitmapProcState& s, + const uint32_t* xy, + int count, uint32_t* colors) { + sk_throw(); +} + +#endif diff --git a/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSSE3.h b/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSSE3.h index 176f2bfbe74..9fd074aacf2 100644 --- a/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSSE3.h +++ b/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_SSSE3.h @@ -5,6 +5,9 @@ * found in the LICENSE file. */ +#ifndef SkBitmapProcState_opts_SSSE3_DEFINED +#define SkBitmapProcState_opts_SSSE3_DEFINED + #include "SkBitmapProcState.h" void S32_opaque_D32_filter_DX_SSSE3(const SkBitmapProcState& s, @@ -19,3 +22,5 @@ void S32_opaque_D32_filter_DXDY_SSSE3(const SkBitmapProcState& s, void S32_alpha_D32_filter_DXDY_SSSE3(const SkBitmapProcState& s, const uint32_t* xy, int count, uint32_t* colors); + +#endif diff --git a/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_arm.cpp b/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_arm.cpp index 96fbebd4e19..ffa0ccfa8aa 100644 --- a/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_arm.cpp +++ b/chromium/third_party/skia/src/opts/SkBitmapProcState_opts_arm.cpp @@ -15,7 +15,7 @@ #include "SkConvolver.h" -#if SK_ARM_ARCH >= 6 && !defined(SK_CPU_BENDIAN) +#if !defined(SK_CPU_ARM64) && SK_ARM_ARCH >= 6 && !defined(SK_CPU_BENDIAN) void SI8_D16_nofilter_DX_arm( const SkBitmapProcState& s, const uint32_t* SK_RESTRICT xy, @@ -186,7 +186,7 @@ void SI8_opaque_D32_nofilter_DX_arm(const SkBitmapProcState& s, s.fBitmap->getColorTable()->unlockColors(); } -#endif // SK_ARM_ARCH >= 6 && !defined(SK_CPU_BENDIAN) +#endif // !defined(SK_CPU_ARM64) && SK_ARM_ARCH >= 6 && !defined(SK_CPU_BENDIAN) /////////////////////////////////////////////////////////////////////////////// @@ -194,6 +194,7 @@ void SI8_opaque_D32_nofilter_DX_arm(const SkBitmapProcState& s, otherwise the shader won't even look at the matrix/sampler */ void SkBitmapProcState::platformProcs() { +#if !defined(SK_CPU_ARM64) && SK_ARM_ARCH >= 6 && !defined(SK_CPU_BENDIAN) bool isOpaque = 256 == fAlphaScale; bool justDx = false; @@ -201,9 +202,8 @@ void SkBitmapProcState::platformProcs() { justDx = true; } - switch (fBitmap->config()) { - case SkBitmap::kIndex8_Config: -#if SK_ARM_ARCH >= 6 && !defined(SK_CPU_BENDIAN) + switch (fBitmap->colorType()) { + case kIndex_8_SkColorType: if (justDx && SkPaint::kNone_FilterLevel == fFilterLevel) { #if 0 /* crashing on android device */ fSampleProc16 = SI8_D16_nofilter_DX_arm; @@ -215,11 +215,11 @@ void SkBitmapProcState::platformProcs() { fShaderProc32 = NULL; } } -#endif break; default: break; } +#endif } /////////////////////////////////////////////////////////////////////////////// diff --git a/chromium/third_party/skia/src/opts/SkBlitMask_opts_arm.cpp b/chromium/third_party/skia/src/opts/SkBlitMask_opts_arm.cpp index 2bf760313c1..11e172c0d1d 100644 --- a/chromium/third_party/skia/src/opts/SkBlitMask_opts_arm.cpp +++ b/chromium/third_party/skia/src/opts/SkBlitMask_opts_arm.cpp @@ -1,3 +1,9 @@ +/* + * Copyright 2014 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ #include "SkColor.h" #include "SkColorPriv.h" @@ -5,21 +11,24 @@ #include "SkUtilsArm.h" #include "SkBlitMask_opts_arm_neon.h" -SkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkBitmap::Config dstConfig, +SkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkColorType dstCT, SkMask::Format maskFormat, SkColor color) { #if SK_ARM_NEON_IS_NONE return NULL; #else +/* ** This has been disabled until we can diagnose and fix the SIGILL generated + ** in the NEON code. See http://skbug.com/2067 for details. #if SK_ARM_NEON_IS_DYNAMIC if (!sk_cpu_arm_has_neon()) { return NULL; } #endif - if ((SkBitmap::kARGB_8888_Config == dstConfig) && + if ((kN32_SkColorType == dstCT) && (SkMask::kA8_Format == maskFormat)) { return D32_A8_Factory_neon(color); } +*/ #endif // We don't need to handle the SkMask::kLCD16_Format case as the default @@ -36,7 +45,7 @@ SkBlitMask::BlitLCD16RowProc SkBlitMask::PlatformBlitRowProcs16(bool isOpaque) { } } -SkBlitMask::RowProc SkBlitMask::PlatformRowProcs(SkBitmap::Config dstConfig, +SkBlitMask::RowProc SkBlitMask::PlatformRowProcs(SkColorType dstCT, SkMask::Format maskFormat, RowFlags flags) { return NULL; diff --git a/chromium/third_party/skia/src/opts/SkBlitMask_opts_none.cpp b/chromium/third_party/skia/src/opts/SkBlitMask_opts_none.cpp index 0ad09193871..90f89a71292 100644 --- a/chromium/third_party/skia/src/opts/SkBlitMask_opts_none.cpp +++ b/chromium/third_party/skia/src/opts/SkBlitMask_opts_none.cpp @@ -1,7 +1,13 @@ +/* + * Copyright 2014 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ #include "SkBlitMask.h" -SkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkBitmap::Config dstConfig, +SkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkColorType dstCT, SkMask::Format maskFormat, SkColor color) { return NULL; @@ -11,7 +17,7 @@ SkBlitMask::BlitLCD16RowProc SkBlitMask::PlatformBlitRowProcs16(bool isOpaque) { return NULL; } -SkBlitMask::RowProc SkBlitMask::PlatformRowProcs(SkBitmap::Config dstConfig, +SkBlitMask::RowProc SkBlitMask::PlatformRowProcs(SkColorType dstCT, SkMask::Format maskFormat, RowFlags flags) { return NULL; diff --git a/chromium/third_party/skia/src/opts/SkBlitRect_opts_SSE2.cpp b/chromium/third_party/skia/src/opts/SkBlitRect_opts_SSE2.cpp index 3cb2b9c6d09..d65a313dadf 100644 --- a/chromium/third_party/skia/src/opts/SkBlitRect_opts_SSE2.cpp +++ b/chromium/third_party/skia/src/opts/SkBlitRect_opts_SSE2.cpp @@ -5,15 +5,14 @@ * found in the LICENSE file. */ +#include <emmintrin.h> #include "SkBlitRect_opts_SSE2.h" #include "SkBlitRow.h" #include "SkColorPriv.h" -#include <emmintrin.h> - -/** Simple blitting of opaque rectangles less than 31 pixels wide: - inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2. -*/ +/* Simple blitting of opaque rectangles less than 31 pixels wide: + * inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2. + */ static void BlitRect32_OpaqueNarrow_SSE2(SkPMColor* SK_RESTRICT destination, int width, int height, size_t rowBytes, uint32_t color) { @@ -42,12 +41,12 @@ static void BlitRect32_OpaqueNarrow_SSE2(SkPMColor* SK_RESTRICT destination, } } -/** - Fast blitting of opaque rectangles at least 31 pixels wide: - inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2. - A 31 pixel rectangle is guaranteed to have at least one - 16-pixel aligned span that can take advantage of mm_store. -*/ +/* + * Fast blitting of opaque rectangles at least 31 pixels wide: + * inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2. + * A 31 pixel rectangle is guaranteed to have at least one + * 16-pixel aligned span that can take advantage of mm_store. + */ static void BlitRect32_OpaqueWide_SSE2(SkPMColor* SK_RESTRICT destination, int width, int height, size_t rowBytes, uint32_t color) { diff --git a/chromium/third_party/skia/src/opts/SkBlitRect_opts_SSE2.h b/chromium/third_party/skia/src/opts/SkBlitRect_opts_SSE2.h index 4d2f74a4b1b..3d09f5c3abc 100644 --- a/chromium/third_party/skia/src/opts/SkBlitRect_opts_SSE2.h +++ b/chromium/third_party/skia/src/opts/SkBlitRect_opts_SSE2.h @@ -8,13 +8,11 @@ #ifndef SkBlitRect_opts_SSE2_DEFINED #define SkBlitRect_opts_SSE2_DEFINED -/* - These functions' implementations copy sections of both - SkBlitRow_opts_SSE2 and SkUtils_opts_SSE2. -*/ - #include "SkColor.h" +/* These functions' implementations copy sections of both + * SkBlitRow_opts_SSE2 and SkUtils_opts_SSE2. + */ void ColorRect32_SSE2(SkPMColor* SK_RESTRICT dst, int width, int height, size_t rowBytes, uint32_t color); diff --git a/chromium/third_party/skia/src/opts/SkBlitRow_opts_SSE2.cpp b/chromium/third_party/skia/src/opts/SkBlitRow_opts_SSE2.cpp index f3d010e3bc4..391b24c8673 100644 --- a/chromium/third_party/skia/src/opts/SkBlitRow_opts_SSE2.cpp +++ b/chromium/third_party/skia/src/opts/SkBlitRow_opts_SSE2.cpp @@ -5,14 +5,14 @@ * found in the LICENSE file. */ - -#include "SkBlitRow_opts_SSE2.h" +#include <emmintrin.h> #include "SkBitmapProcState_opts_SSE2.h" +#include "SkBlitRow_opts_SSE2.h" #include "SkColorPriv.h" +#include "SkColor_opts_SSE2.h" +#include "SkDither.h" #include "SkUtils.h" -#include <emmintrin.h> - /* SSE2 version of S32_Blend_BlitRow32() * portable version is in core/SkBlitRow_D32.cpp */ @@ -177,7 +177,7 @@ void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, d++; count -= 4; } - #else +#else __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); __m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit) while (count >= 4) { @@ -340,7 +340,6 @@ void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, */ void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count, SkPMColor color) { - if (count <= 0) { return; } @@ -404,7 +403,7 @@ void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count, } src = reinterpret_cast<const SkPMColor*>(s); dst = reinterpret_cast<SkPMColor*>(d); - } + } while (count > 0) { *dst = color + SkAlphaMulQ(*src, scale); @@ -502,7 +501,7 @@ void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr, } dst = reinterpret_cast<SkPMColor *>(d); } - while(count > 0) { + while (count > 0) { *dst= SkBlendARGB32(color, *dst, *mask); dst += 1; mask++; @@ -851,3 +850,512 @@ void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[], width--; } } + +/* SSE2 version of S32_D565_Opaque() + * portable version is in core/SkBlitRow_D16.cpp + */ +void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst, + const SkPMColor* SK_RESTRICT src, int count, + U8CPU alpha, int /*x*/, int /*y*/) { + SkASSERT(255 == alpha); + + if (count <= 0) { + return; + } + + if (count >= 8) { + while (((size_t)dst & 0x0F) != 0) { + SkPMColor c = *src++; + SkPMColorAssert(c); + + *dst++ = SkPixel32ToPixel16_ToU16(c); + count--; + } + + const __m128i* s = reinterpret_cast<const __m128i*>(src); + __m128i* d = reinterpret_cast<__m128i*>(dst); + __m128i r16_mask = _mm_set1_epi32(SK_R16_MASK); + __m128i g16_mask = _mm_set1_epi32(SK_G16_MASK); + __m128i b16_mask = _mm_set1_epi32(SK_B16_MASK); + + while (count >= 8) { + // Load 8 pixels of src. + __m128i src_pixel1 = _mm_loadu_si128(s++); + __m128i src_pixel2 = _mm_loadu_si128(s++); + + // Calculate result r. + __m128i r1 = _mm_srli_epi32(src_pixel1, + SK_R32_SHIFT + (8 - SK_R16_BITS)); + r1 = _mm_and_si128(r1, r16_mask); + __m128i r2 = _mm_srli_epi32(src_pixel2, + SK_R32_SHIFT + (8 - SK_R16_BITS)); + r2 = _mm_and_si128(r2, r16_mask); + __m128i r = _mm_packs_epi32(r1, r2); + + // Calculate result g. + __m128i g1 = _mm_srli_epi32(src_pixel1, + SK_G32_SHIFT + (8 - SK_G16_BITS)); + g1 = _mm_and_si128(g1, g16_mask); + __m128i g2 = _mm_srli_epi32(src_pixel2, + SK_G32_SHIFT + (8 - SK_G16_BITS)); + g2 = _mm_and_si128(g2, g16_mask); + __m128i g = _mm_packs_epi32(g1, g2); + + // Calculate result b. + __m128i b1 = _mm_srli_epi32(src_pixel1, + SK_B32_SHIFT + (8 - SK_B16_BITS)); + b1 = _mm_and_si128(b1, b16_mask); + __m128i b2 = _mm_srli_epi32(src_pixel2, + SK_B32_SHIFT + (8 - SK_B16_BITS)); + b2 = _mm_and_si128(b2, b16_mask); + __m128i b = _mm_packs_epi32(b1, b2); + + // Store 8 16-bit colors in dst. + __m128i d_pixel = SkPackRGB16_SSE2(r, g, b); + _mm_store_si128(d++, d_pixel); + count -= 8; + } + src = reinterpret_cast<const SkPMColor*>(s); + dst = reinterpret_cast<uint16_t*>(d); + } + + if (count > 0) { + do { + SkPMColor c = *src++; + SkPMColorAssert(c); + *dst++ = SkPixel32ToPixel16_ToU16(c); + } while (--count != 0); + } +} + +/* SSE2 version of S32A_D565_Opaque() + * portable version is in core/SkBlitRow_D16.cpp + */ +void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst, + const SkPMColor* SK_RESTRICT src, + int count, U8CPU alpha, int /*x*/, int /*y*/) { + SkASSERT(255 == alpha); + + if (count <= 0) { + return; + } + + if (count >= 8) { + // Make dst 16 bytes alignment + while (((size_t)dst & 0x0F) != 0) { + SkPMColor c = *src++; + if (c) { + *dst = SkSrcOver32To16(c, *dst); + } + dst += 1; + count--; + } + + const __m128i* s = reinterpret_cast<const __m128i*>(src); + __m128i* d = reinterpret_cast<__m128i*>(dst); + __m128i var255 = _mm_set1_epi16(255); + __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK); + __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK); + __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK); + + while (count >= 8) { + // Load 8 pixels of src. + __m128i src_pixel1 = _mm_loadu_si128(s++); + __m128i src_pixel2 = _mm_loadu_si128(s++); + + // Check whether src pixels are equal to 0 and get the highest bit + // of each byte of result, if src pixels are all zero, src_cmp1 and + // src_cmp2 will be 0xFFFF. + int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1, + _mm_setzero_si128())); + int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2, + _mm_setzero_si128())); + if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) { + d++; + count -= 8; + continue; + } + + // Load 8 pixels of dst. + __m128i dst_pixel = _mm_load_si128(d); + + // Extract A from src. + __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT)); + sa1 = _mm_srli_epi32(sa1, 24); + __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT)); + sa2 = _mm_srli_epi32(sa2, 24); + __m128i sa = _mm_packs_epi32(sa1, sa2); + + // Extract R from src. + __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT)); + sr1 = _mm_srli_epi32(sr1, 24); + __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT)); + sr2 = _mm_srli_epi32(sr2, 24); + __m128i sr = _mm_packs_epi32(sr1, sr2); + + // Extract G from src. + __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT)); + sg1 = _mm_srli_epi32(sg1, 24); + __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT)); + sg2 = _mm_srli_epi32(sg2, 24); + __m128i sg = _mm_packs_epi32(sg1, sg2); + + // Extract B from src. + __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT)); + sb1 = _mm_srli_epi32(sb1, 24); + __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT)); + sb2 = _mm_srli_epi32(sb2, 24); + __m128i sb = _mm_packs_epi32(sb1, sb2); + + // Extract R G B from dst. + __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT); + dr = _mm_and_si128(dr, r16_mask); + __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT); + dg = _mm_and_si128(dg, g16_mask); + __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT); + db = _mm_and_si128(db, b16_mask); + + __m128i isa = _mm_sub_epi16(var255, sa); // 255 -sa + + // Calculate R G B of result. + // Original algorithm is in SkSrcOver32To16(). + dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE2(dr, isa, SK_R16_BITS)); + dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS); + dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE2(dg, isa, SK_G16_BITS)); + dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS); + db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE2(db, isa, SK_B16_BITS)); + db = _mm_srli_epi16(db, 8 - SK_B16_BITS); + + // Pack R G B into 16-bit color. + __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db); + + // Store 8 16-bit colors in dst. + _mm_store_si128(d++, d_pixel); + count -= 8; + } + + src = reinterpret_cast<const SkPMColor*>(s); + dst = reinterpret_cast<uint16_t*>(d); + } + + if (count > 0) { + do { + SkPMColor c = *src++; + SkPMColorAssert(c); + if (c) { + *dst = SkSrcOver32To16(c, *dst); + } + dst += 1; + } while (--count != 0); + } +} + +void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst, + const SkPMColor* SK_RESTRICT src, + int count, U8CPU alpha, int x, int y) { + SkASSERT(255 == alpha); + + if (count <= 0) { + return; + } + + if (count >= 8) { + while (((size_t)dst & 0x0F) != 0) { + DITHER_565_SCAN(y); + SkPMColor c = *src++; + SkPMColorAssert(c); + + unsigned dither = DITHER_VALUE(x); + *dst++ = SkDitherRGB32To565(c, dither); + DITHER_INC_X(x); + count--; + } + + unsigned short dither_value[8]; + __m128i dither; +#ifdef ENABLE_DITHER_MATRIX_4X4 + const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3]; + dither_value[0] = dither_value[4] = dither_scan[(x) & 3]; + dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3]; + dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3]; + dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3]; +#else + const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3]; + dither_value[0] = dither_value[4] = (dither_scan + >> (((x) & 3) << 2)) & 0xF; + dither_value[1] = dither_value[5] = (dither_scan + >> (((x + 1) & 3) << 2)) & 0xF; + dither_value[2] = dither_value[6] = (dither_scan + >> (((x + 2) & 3) << 2)) & 0xF; + dither_value[3] = dither_value[7] = (dither_scan + >> (((x + 3) & 3) << 2)) & 0xF; +#endif + dither = _mm_loadu_si128((__m128i*) dither_value); + + const __m128i* s = reinterpret_cast<const __m128i*>(src); + __m128i* d = reinterpret_cast<__m128i*>(dst); + + while (count >= 8) { + // Load 8 pixels of src. + __m128i src_pixel1 = _mm_loadu_si128(s++); + __m128i src_pixel2 = _mm_loadu_si128(s++); + + // Extract R from src. + __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT)); + sr1 = _mm_srli_epi32(sr1, 24); + __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT)); + sr2 = _mm_srli_epi32(sr2, 24); + __m128i sr = _mm_packs_epi32(sr1, sr2); + + // SkDITHER_R32To565(sr, dither) + __m128i sr_offset = _mm_srli_epi16(sr, 5); + sr = _mm_add_epi16(sr, dither); + sr = _mm_sub_epi16(sr, sr_offset); + sr = _mm_srli_epi16(sr, SK_R32_BITS - SK_R16_BITS); + + // Extract G from src. + __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT)); + sg1 = _mm_srli_epi32(sg1, 24); + __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT)); + sg2 = _mm_srli_epi32(sg2, 24); + __m128i sg = _mm_packs_epi32(sg1, sg2); + + // SkDITHER_R32To565(sg, dither) + __m128i sg_offset = _mm_srli_epi16(sg, 6); + sg = _mm_add_epi16(sg, _mm_srli_epi16(dither, 1)); + sg = _mm_sub_epi16(sg, sg_offset); + sg = _mm_srli_epi16(sg, SK_G32_BITS - SK_G16_BITS); + + // Extract B from src. + __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT)); + sb1 = _mm_srli_epi32(sb1, 24); + __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT)); + sb2 = _mm_srli_epi32(sb2, 24); + __m128i sb = _mm_packs_epi32(sb1, sb2); + + // SkDITHER_R32To565(sb, dither) + __m128i sb_offset = _mm_srli_epi16(sb, 5); + sb = _mm_add_epi16(sb, dither); + sb = _mm_sub_epi16(sb, sb_offset); + sb = _mm_srli_epi16(sb, SK_B32_BITS - SK_B16_BITS); + + // Pack and store 16-bit dst pixel. + __m128i d_pixel = SkPackRGB16_SSE2(sr, sg, sb); + _mm_store_si128(d++, d_pixel); + + count -= 8; + x += 8; + } + + src = reinterpret_cast<const SkPMColor*>(s); + dst = reinterpret_cast<uint16_t*>(d); + } + + if (count > 0) { + DITHER_565_SCAN(y); + do { + SkPMColor c = *src++; + SkPMColorAssert(c); + + unsigned dither = DITHER_VALUE(x); + *dst++ = SkDitherRGB32To565(c, dither); + DITHER_INC_X(x); + } while (--count != 0); + } +} + +/* SSE2 version of S32A_D565_Opaque_Dither() + * portable version is in core/SkBlitRow_D16.cpp + */ +void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst, + const SkPMColor* SK_RESTRICT src, + int count, U8CPU alpha, int x, int y) { + SkASSERT(255 == alpha); + + if (count <= 0) { + return; + } + + if (count >= 8) { + while (((size_t)dst & 0x0F) != 0) { + DITHER_565_SCAN(y); + SkPMColor c = *src++; + SkPMColorAssert(c); + if (c) { + unsigned a = SkGetPackedA32(c); + + int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a)); + + unsigned sr = SkGetPackedR32(c); + unsigned sg = SkGetPackedG32(c); + unsigned sb = SkGetPackedB32(c); + sr = SkDITHER_R32_FOR_565(sr, d); + sg = SkDITHER_G32_FOR_565(sg, d); + sb = SkDITHER_B32_FOR_565(sb, d); + + uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2); + uint32_t dst_expanded = SkExpand_rgb_16(*dst); + dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); + // now src and dst expanded are in g:11 r:10 x:1 b:10 + *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); + } + dst += 1; + DITHER_INC_X(x); + count--; + } + + unsigned short dither_value[8]; + __m128i dither, dither_cur; +#ifdef ENABLE_DITHER_MATRIX_4X4 + const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3]; + dither_value[0] = dither_value[4] = dither_scan[(x) & 3]; + dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3]; + dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3]; + dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3]; +#else + const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3]; + dither_value[0] = dither_value[4] = (dither_scan + >> (((x) & 3) << 2)) & 0xF; + dither_value[1] = dither_value[5] = (dither_scan + >> (((x + 1) & 3) << 2)) & 0xF; + dither_value[2] = dither_value[6] = (dither_scan + >> (((x + 2) & 3) << 2)) & 0xF; + dither_value[3] = dither_value[7] = (dither_scan + >> (((x + 3) & 3) << 2)) & 0xF; +#endif + dither = _mm_loadu_si128((__m128i*) dither_value); + + const __m128i* s = reinterpret_cast<const __m128i*>(src); + __m128i* d = reinterpret_cast<__m128i*>(dst); + __m128i var256 = _mm_set1_epi16(256); + __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK); + __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK); + __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK); + + while (count >= 8) { + // Load 8 pixels of src and dst. + __m128i src_pixel1 = _mm_loadu_si128(s++); + __m128i src_pixel2 = _mm_loadu_si128(s++); + __m128i dst_pixel = _mm_load_si128(d); + + // Extract A from src. + __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT)); + sa1 = _mm_srli_epi32(sa1, 24); + __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT)); + sa2 = _mm_srli_epi32(sa2, 24); + __m128i sa = _mm_packs_epi32(sa1, sa2); + + // Calculate current dither value. + dither_cur = _mm_mullo_epi16(dither, + _mm_add_epi16(sa, _mm_set1_epi16(1))); + dither_cur = _mm_srli_epi16(dither_cur, 8); + + // Extract R from src. + __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT)); + sr1 = _mm_srli_epi32(sr1, 24); + __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT)); + sr2 = _mm_srli_epi32(sr2, 24); + __m128i sr = _mm_packs_epi32(sr1, sr2); + + // SkDITHER_R32_FOR_565(sr, d) + __m128i sr_offset = _mm_srli_epi16(sr, 5); + sr = _mm_add_epi16(sr, dither_cur); + sr = _mm_sub_epi16(sr, sr_offset); + + // Expand sr. + sr = _mm_slli_epi16(sr, 2); + + // Extract G from src. + __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT)); + sg1 = _mm_srli_epi32(sg1, 24); + __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT)); + sg2 = _mm_srli_epi32(sg2, 24); + __m128i sg = _mm_packs_epi32(sg1, sg2); + + // sg = SkDITHER_G32_FOR_565(sg, d). + __m128i sg_offset = _mm_srli_epi16(sg, 6); + sg = _mm_add_epi16(sg, _mm_srli_epi16(dither_cur, 1)); + sg = _mm_sub_epi16(sg, sg_offset); + + // Expand sg. + sg = _mm_slli_epi16(sg, 3); + + // Extract B from src. + __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT)); + sb1 = _mm_srli_epi32(sb1, 24); + __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT)); + sb2 = _mm_srli_epi32(sb2, 24); + __m128i sb = _mm_packs_epi32(sb1, sb2); + + // sb = SkDITHER_B32_FOR_565(sb, d). + __m128i sb_offset = _mm_srli_epi16(sb, 5); + sb = _mm_add_epi16(sb, dither_cur); + sb = _mm_sub_epi16(sb, sb_offset); + + // Expand sb. + sb = _mm_slli_epi16(sb, 2); + + // Extract R G B from dst. + __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT); + dr = _mm_and_si128(dr, r16_mask); + __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT); + dg = _mm_and_si128(dg, g16_mask); + __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT); + db = _mm_and_si128(db, b16_mask); + + // SkAlpha255To256(255 - a) >> 3 + __m128i isa = _mm_sub_epi16(var256, sa); + isa = _mm_srli_epi16(isa, 3); + + dr = _mm_mullo_epi16(dr, isa); + dr = _mm_add_epi16(dr, sr); + dr = _mm_srli_epi16(dr, 5); + + dg = _mm_mullo_epi16(dg, isa); + dg = _mm_add_epi16(dg, sg); + dg = _mm_srli_epi16(dg, 5); + + db = _mm_mullo_epi16(db, isa); + db = _mm_add_epi16(db, sb); + db = _mm_srli_epi16(db, 5); + + // Package and store dst pixel. + __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db); + _mm_store_si128(d++, d_pixel); + + count -= 8; + x += 8; + } + + src = reinterpret_cast<const SkPMColor*>(s); + dst = reinterpret_cast<uint16_t*>(d); + } + + if (count > 0) { + DITHER_565_SCAN(y); + do { + SkPMColor c = *src++; + SkPMColorAssert(c); + if (c) { + unsigned a = SkGetPackedA32(c); + + int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a)); + + unsigned sr = SkGetPackedR32(c); + unsigned sg = SkGetPackedG32(c); + unsigned sb = SkGetPackedB32(c); + sr = SkDITHER_R32_FOR_565(sr, d); + sg = SkDITHER_G32_FOR_565(sg, d); + sb = SkDITHER_B32_FOR_565(sb, d); + + uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2); + uint32_t dst_expanded = SkExpand_rgb_16(*dst); + dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); + // now src and dst expanded are in g:11 r:10 x:1 b:10 + *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); + } + dst += 1; + DITHER_INC_X(x); + } while (--count != 0); + } +} diff --git a/chromium/third_party/skia/src/opts/SkBlitRow_opts_SSE2.h b/chromium/third_party/skia/src/opts/SkBlitRow_opts_SSE2.h index b443ec7f213..29fd96e5e91 100644 --- a/chromium/third_party/skia/src/opts/SkBlitRow_opts_SSE2.h +++ b/chromium/third_party/skia/src/opts/SkBlitRow_opts_SSE2.h @@ -1,4 +1,3 @@ - /* * Copyright 2009 The Android Open Source Project * @@ -6,6 +5,8 @@ * found in the LICENSE file. */ +#ifndef SkBlitRow_opts_SSE2_DEFINED +#define SkBlitRow_opts_SSE2_DEFINED #include "SkBlitRow.h" @@ -28,3 +29,18 @@ void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t src[], SkColor color, int width, SkPMColor); void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t src[], SkColor color, int width, SkPMColor opaqueDst); + +void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst, + const SkPMColor* SK_RESTRICT src, int count, + U8CPU alpha, int /*x*/, int /*y*/); +void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst, + const SkPMColor* SK_RESTRICT src, + int count, U8CPU alpha, int /*x*/, int /*y*/); +void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst, + const SkPMColor* SK_RESTRICT src, + int count, U8CPU alpha, int x, int y); +void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst, + const SkPMColor* SK_RESTRICT src, + int count, U8CPU alpha, int x, int y); + +#endif diff --git a/chromium/third_party/skia/src/opts/SkBlitRow_opts_arm.cpp b/chromium/third_party/skia/src/opts/SkBlitRow_opts_arm.cpp index e8e544e9dcb..34b8564723c 100644 --- a/chromium/third_party/skia/src/opts/SkBlitRow_opts_arm.cpp +++ b/chromium/third_party/skia/src/opts/SkBlitRow_opts_arm.cpp @@ -12,8 +12,6 @@ #include "SkUtils.h" #include "SkUtilsArm.h" -#include "SkCachePreload_arm.h" - // Define USE_NEON_CODE to indicate that we need to build NEON routines #define USE_NEON_CODE (!SK_ARM_NEON_IS_NONE) @@ -376,3 +374,7 @@ SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) { SkBlitRow::ColorProc SkBlitRow::PlatformColorProc() { return SK_ARM_NEON_WRAP(Color32_arm); } + +SkBlitRow::ColorRectProc PlatformColorRectProcFactory() { + return NULL; +} diff --git a/chromium/third_party/skia/src/opts/SkBlitRow_opts_arm_neon.cpp b/chromium/third_party/skia/src/opts/SkBlitRow_opts_arm_neon.cpp index 672980d0d26..01a6a2aa745 100644 --- a/chromium/third_party/skia/src/opts/SkBlitRow_opts_arm_neon.cpp +++ b/chromium/third_party/skia/src/opts/SkBlitRow_opts_arm_neon.cpp @@ -14,10 +14,56 @@ #include "SkMathPriv.h" #include "SkUtils.h" -#include "SkCachePreload_arm.h" #include "SkColor_opts_neon.h" #include <arm_neon.h> +#ifdef SK_CPU_ARM64 +static inline uint8x8x4_t sk_vld4_u8_arm64_3(const SkPMColor* SK_RESTRICT & src) { + uint8x8x4_t vsrc; + uint8x8_t vsrc_0, vsrc_1, vsrc_2; + + asm ( + "ld4 {v0.8b - v3.8b}, [%[src]], #32 \t\n" + "mov %[vsrc0].8b, v0.8b \t\n" + "mov %[vsrc1].8b, v1.8b \t\n" + "mov %[vsrc2].8b, v2.8b \t\n" + : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1), + [vsrc2] "=w" (vsrc_2), [src] "+&r" (src) + : : "v0", "v1", "v2", "v3" + ); + + vsrc.val[0] = vsrc_0; + vsrc.val[1] = vsrc_1; + vsrc.val[2] = vsrc_2; + + return vsrc; +} + +static inline uint8x8x4_t sk_vld4_u8_arm64_4(const SkPMColor* SK_RESTRICT & src) { + uint8x8x4_t vsrc; + uint8x8_t vsrc_0, vsrc_1, vsrc_2, vsrc_3; + + asm ( + "ld4 {v0.8b - v3.8b}, [%[src]], #32 \t\n" + "mov %[vsrc0].8b, v0.8b \t\n" + "mov %[vsrc1].8b, v1.8b \t\n" + "mov %[vsrc2].8b, v2.8b \t\n" + "mov %[vsrc3].8b, v3.8b \t\n" + : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1), + [vsrc2] "=w" (vsrc_2), [vsrc3] "=w" (vsrc_3), + [src] "+&r" (src) + : : "v0", "v1", "v2", "v3" + ); + + vsrc.val[0] = vsrc_0; + vsrc.val[1] = vsrc_1; + vsrc.val[2] = vsrc_2; + vsrc.val[3] = vsrc_3; + + return vsrc; +} +#endif + void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, const SkPMColor* SK_RESTRICT src, int count, U8CPU alpha, int /*x*/, int /*y*/) { @@ -28,7 +74,12 @@ void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, uint16x8_t vdst; // Load +#ifdef SK_CPU_ARM64 + vsrc = sk_vld4_u8_arm64_3(src); +#else vsrc = vld4_u8((uint8_t*)src); + src += 8; +#endif // Convert src to 565 vdst = SkPixel32ToPixel16_neon8(vsrc); @@ -38,7 +89,6 @@ void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, // Prepare next iteration dst += 8; - src += 8; count -= 8; }; @@ -52,6 +102,92 @@ void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, }; } +void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst, + const SkPMColor* SK_RESTRICT src, int count, + U8CPU alpha, int /*x*/, int /*y*/) { + SkASSERT(255 > alpha); + + uint16x8_t vmask_blue, vscale; + + // prepare constants + vscale = vdupq_n_u16(SkAlpha255To256(alpha)); + vmask_blue = vmovq_n_u16(0x1F); + + while (count >= 8) { + uint8x8x4_t vsrc; + uint16x8_t vdst, vdst_r, vdst_g, vdst_b; + uint16x8_t vres_r, vres_g, vres_b; + + // Load src +#ifdef SK_CPU_ARM64 + vsrc = sk_vld4_u8_arm64_3(src); +#else + { + register uint8x8_t d0 asm("d0"); + register uint8x8_t d1 asm("d1"); + register uint8x8_t d2 asm("d2"); + register uint8x8_t d3 asm("d3"); + + asm ( + "vld4.8 {d0-d3},[%[src]]!" + : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) + : + ); + vsrc.val[0] = d0; + vsrc.val[1] = d1; + vsrc.val[2] = d2; + } +#endif + + // Load and unpack dst + vdst = vld1q_u16(dst); + vdst_g = vshlq_n_u16(vdst, 5); // shift green to top of lanes + vdst_b = vandq_u16(vdst, vmask_blue); // extract blue + vdst_r = vshrq_n_u16(vdst, 6+5); // extract red + vdst_g = vshrq_n_u16(vdst_g, 5+5); // extract green + + // Shift src to 565 range + vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 3); + vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 2); + vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 3); + + // Scale src - dst + vres_r = vmovl_u8(vsrc.val[NEON_R]) - vdst_r; + vres_g = vmovl_u8(vsrc.val[NEON_G]) - vdst_g; + vres_b = vmovl_u8(vsrc.val[NEON_B]) - vdst_b; + + vres_r = vshrq_n_u16(vres_r * vscale, 8); + vres_g = vshrq_n_u16(vres_g * vscale, 8); + vres_b = vshrq_n_u16(vres_b * vscale, 8); + + vres_r += vdst_r; + vres_g += vdst_g; + vres_b += vdst_b; + + // Combine + vres_b = vsliq_n_u16(vres_b, vres_g, 5); // insert green into blue + vres_b = vsliq_n_u16(vres_b, vres_r, 6+5); // insert red into green/blue + + // Store + vst1q_u16(dst, vres_b); + dst += 8; + count -= 8; + } + if (count > 0) { + int scale = SkAlpha255To256(alpha); + do { + SkPMColor c = *src++; + SkPMColorAssert(c); + uint16_t d = *dst; + *dst++ = SkPackRGB16( + SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale), + SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale), + SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale)); + } while (--count != 0); + } +} + +#ifdef SK_CPU_ARM32 void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, const SkPMColor* SK_RESTRICT src, int count, U8CPU alpha, int /*x*/, int /*y*/) { @@ -229,114 +365,129 @@ void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, ); } } +#endif + +static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) { + prod += vdupq_n_u16(128); + prod += vshrq_n_u16(prod, 8); + return vshrq_n_u16(prod, 8); +} void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst, const SkPMColor* SK_RESTRICT src, int count, U8CPU alpha, int /*x*/, int /*y*/) { + SkASSERT(255 > alpha); - U8CPU alpha_for_asm = alpha; - - asm volatile ( - /* This code implements a Neon version of S32A_D565_Blend. The output differs from - * the original in two respects: - * 1. The results have a few mismatches compared to the original code. These mismatches - * never exceed 1. It's possible to improve accuracy vs. a floating point - * implementation by introducing rounding right shifts (vrshr) for the final stage. - * Rounding is not present in the code below, because although results would be closer - * to a floating point implementation, the number of mismatches compared to the - * original code would be far greater. - * 2. On certain inputs, the original code can overflow, causing colour channels to - * mix. Although the Neon code can also overflow, it doesn't allow one colour channel - * to affect another. + /* This code implements a Neon version of S32A_D565_Blend. The results have + * a few mismatches compared to the original code. These mismatches never + * exceed 1. */ -#if 1 - /* reflects SkAlpha255To256()'s change from a+a>>7 to a+1 */ - "add %[alpha], %[alpha], #1 \n\t" // adjust range of alpha 0-256 + if (count >= 8) { + uint16x8_t valpha_max, vmask_blue; + uint8x8_t valpha; + + // prepare constants + valpha_max = vmovq_n_u16(255); + valpha = vdup_n_u8(alpha); + vmask_blue = vmovq_n_u16(SK_B16_MASK); + + do { + uint16x8_t vdst, vdst_r, vdst_g, vdst_b; + uint16x8_t vres_a, vres_r, vres_g, vres_b; + uint8x8x4_t vsrc; + + // load pixels + vdst = vld1q_u16(dst); +#ifdef SK_CPU_ARM64 + vsrc = sk_vld4_u8_arm64_4(src); #else - "add %[alpha], %[alpha], %[alpha], lsr #7 \n\t" // adjust range of alpha 0-256 +#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) + asm ( + "vld4.u8 %h[vsrc], [%[src]]!" + : [vsrc] "=w" (vsrc), [src] "+&r" (src) + : : + ); +#else + register uint8x8_t d0 asm("d0"); + register uint8x8_t d1 asm("d1"); + register uint8x8_t d2 asm("d2"); + register uint8x8_t d3 asm("d3"); + + asm volatile ( + "vld4.u8 {d0-d3},[%[src]]!;" + : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), + [src] "+&r" (src) + : : + ); + vsrc.val[0] = d0; + vsrc.val[1] = d1; + vsrc.val[2] = d2; + vsrc.val[3] = d3; #endif - "vmov.u16 q3, #255 \n\t" // set up constant - "movs r4, %[count], lsr #3 \n\t" // calc. count>>3 - "vmov.u16 d2[0], %[alpha] \n\t" // move alpha to Neon - "beq 2f \n\t" // if count8 == 0, exit - "vmov.u16 q15, #0x1f \n\t" // set up blue mask - - "1: \n\t" - "vld1.u16 {d0, d1}, [%[dst]] \n\t" // load eight dst RGB565 pixels - "subs r4, r4, #1 \n\t" // decrement loop counter - "vld4.u8 {d24, d25, d26, d27}, [%[src]]! \n\t" // load eight src ABGR32 pixels - // and deinterleave - - "vshl.u16 q9, q0, #5 \n\t" // shift green to top of lanes - "vand q10, q0, q15 \n\t" // extract blue - "vshr.u16 q8, q0, #11 \n\t" // extract red - "vshr.u16 q9, q9, #10 \n\t" // extract green - // dstrgb = {q8, q9, q10} - - "vshr.u8 d24, d24, #3 \n\t" // shift red to 565 range - "vshr.u8 d25, d25, #2 \n\t" // shift green to 565 range - "vshr.u8 d26, d26, #3 \n\t" // shift blue to 565 range - - "vmovl.u8 q11, d24 \n\t" // widen red to 16 bits - "vmovl.u8 q12, d25 \n\t" // widen green to 16 bits - "vmovl.u8 q14, d27 \n\t" // widen alpha to 16 bits - "vmovl.u8 q13, d26 \n\t" // widen blue to 16 bits - // srcrgba = {q11, q12, q13, q14} - - "vmul.u16 q2, q14, d2[0] \n\t" // sa * src_scale - "vmul.u16 q11, q11, d2[0] \n\t" // red result = src_red * src_scale - "vmul.u16 q12, q12, d2[0] \n\t" // grn result = src_grn * src_scale - "vmul.u16 q13, q13, d2[0] \n\t" // blu result = src_blu * src_scale - - "vshr.u16 q2, q2, #8 \n\t" // sa * src_scale >> 8 - "vsub.u16 q2, q3, q2 \n\t" // 255 - (sa * src_scale >> 8) - // dst_scale = q2 - - "vmla.u16 q11, q8, q2 \n\t" // red result += dst_red * dst_scale - "vmla.u16 q12, q9, q2 \n\t" // grn result += dst_grn * dst_scale - "vmla.u16 q13, q10, q2 \n\t" // blu result += dst_blu * dst_scale - -#if 1 - // trying for a better match with SkDiv255Round(a) - // C alg is: a+=128; (a+a>>8)>>8 - // we'll use just a rounding shift [q2 is available for scratch] - "vrshr.u16 q11, q11, #8 \n\t" // shift down red - "vrshr.u16 q12, q12, #8 \n\t" // shift down green - "vrshr.u16 q13, q13, #8 \n\t" // shift down blue +#endif // #ifdef SK_CPU_ARM64 + + + // deinterleave dst + vdst_g = vshlq_n_u16(vdst, SK_R16_BITS); // shift green to top of lanes + vdst_b = vdst & vmask_blue; // extract blue + vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT); // extract red + vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green + + // shift src to 565 + vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS); + vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS); + vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS); + + // calc src * src_scale + vres_a = vmull_u8(vsrc.val[NEON_A], valpha); + vres_r = vmull_u8(vsrc.val[NEON_R], valpha); + vres_g = vmull_u8(vsrc.val[NEON_G], valpha); + vres_b = vmull_u8(vsrc.val[NEON_B], valpha); + + // prepare dst_scale + vres_a = SkDiv255Round_neon8(vres_a); + vres_a = valpha_max - vres_a; // 255 - (sa * src_scale) / 255 + + // add dst * dst_scale to previous result + vres_r = vmlaq_u16(vres_r, vdst_r, vres_a); + vres_g = vmlaq_u16(vres_g, vdst_g, vres_a); + vres_b = vmlaq_u16(vres_b, vdst_b, vres_a); + +#ifdef S32A_D565_BLEND_EXACT + // It is possible to get exact results with this but it is slow, + // even slower than C code in some cases + vres_r = SkDiv255Round_neon8(vres_r); + vres_g = SkDiv255Round_neon8(vres_g); + vres_b = SkDiv255Round_neon8(vres_b); #else - // arm's original "truncating divide by 256" - "vshr.u16 q11, q11, #8 \n\t" // shift down red - "vshr.u16 q12, q12, #8 \n\t" // shift down green - "vshr.u16 q13, q13, #8 \n\t" // shift down blue + vres_r = vrshrq_n_u16(vres_r, 8); + vres_g = vrshrq_n_u16(vres_g, 8); + vres_b = vrshrq_n_u16(vres_b, 8); #endif + // pack result + vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT); // insert green into blue + vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT); // insert red into green/blue - "vsli.u16 q13, q12, #5 \n\t" // insert green into blue - "vsli.u16 q13, q11, #11 \n\t" // insert red into green/blue - "vst1.16 {d26, d27}, [%[dst]]! \n\t" // write pixel back to dst, update ptr - - "bne 1b \n\t" // if counter != 0, loop - "2: \n\t" // exit - - : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count), [alpha] "+r" (alpha_for_asm) - : - : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31" - ); + // store + vst1q_u16(dst, vres_b); + dst += 8; + count -= 8; + } while (count >= 8); + } - count &= 7; - if (count > 0) { - do { - SkPMColor sc = *src++; - if (sc) { - uint16_t dc = *dst; - unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha); - unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale); - unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale); - unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale); - *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db)); - } - dst += 1; - } while (--count != 0); + // leftovers + while (count-- > 0) { + SkPMColor sc = *src++; + if (sc) { + uint16_t dc = *dst; + unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha); + unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale); + unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale); + unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale); + *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db)); + } + dst += 1; } } @@ -374,6 +525,7 @@ void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src, do { + uint8x8x4_t vsrc; uint8x8_t vsrc_r, vsrc_g, vsrc_b; uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b; uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b; @@ -384,6 +536,9 @@ void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src, int8x8_t vres8_r, vres8_g, vres8_b; // Load source and add dither +#ifdef SK_CPU_ARM64 + vsrc = sk_vld4_u8_arm64_3(src); +#else { register uint8x8_t d0 asm("d0"); register uint8x8_t d1 asm("d1"); @@ -391,17 +546,18 @@ void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src, register uint8x8_t d3 asm("d3"); asm ( - "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */" + "vld4.8 {d0-d3},[%[src]]! " : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) : ); - vsrc_g = d1; -#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) - vsrc_r = d2; vsrc_b = d0; -#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) - vsrc_r = d0; vsrc_b = d2; -#endif + vsrc.val[0] = d0; + vsrc.val[1] = d1; + vsrc.val[2] = d2; } +#endif + vsrc_r = vsrc.val[NEON_R]; + vsrc_g = vsrc.val[NEON_G]; + vsrc_b = vsrc.val[NEON_B]; vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6 vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5 @@ -766,76 +922,67 @@ void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, const SkPMColor* SK_RESTRICT src, int count, U8CPU alpha) { SkASSERT(alpha <= 255); - if (count > 0) { - uint16_t src_scale = SkAlpha255To256(alpha); - uint16_t dst_scale = 256 - src_scale; - - /* run them N at a time through the NEON unit */ - /* note that each 1 is 4 bytes, each treated exactly the same, - * so we can work under that guise. We *do* know that the src&dst - * will be 32-bit aligned quantities, so we can specify that on - * the load/store ops and do a neon 'reinterpret' to get us to - * byte-sized (pun intended) pieces that we widen/multiply/shift - * we're limited at 128 bits in the wide ops, which is 8x16bits - * or a pair of 32 bit src/dsts. - */ - /* we *could* manually unroll this loop so that we load 128 bits - * (as a pair of 64s) from each of src and dst, processing them - * in pieces. This might give us a little better management of - * the memory latency, but my initial attempts here did not - * produce an instruction stream that looked all that nice. - */ -#define UNROLL 2 - while (count >= UNROLL) { - uint8x8_t src_raw, dst_raw, dst_final; - uint16x8_t src_wide, dst_wide; - /* get 64 bits of src, widen it, multiply by src_scale */ - src_raw = vreinterpret_u8_u32(vld1_u32(src)); - src_wide = vmovl_u8(src_raw); - /* gcc hoists vdupq_n_u16(), better than using vmulq_n_u16() */ - src_wide = vmulq_u16 (src_wide, vdupq_n_u16(src_scale)); + if (count <= 0) { + return; + } - /* ditto with dst */ - dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); - dst_wide = vmovl_u8(dst_raw); + uint16_t src_scale = SkAlpha255To256(alpha); + uint16_t dst_scale = 256 - src_scale; - /* combine add with dst multiply into mul-accumulate */ - dst_wide = vmlaq_u16(src_wide, dst_wide, vdupq_n_u16(dst_scale)); + while (count >= 2) { + uint8x8_t vsrc, vdst, vres; + uint16x8_t vsrc_wide, vdst_wide; - dst_final = vshrn_n_u16(dst_wide, 8); - vst1_u32(dst, vreinterpret_u32_u8(dst_final)); + /* These commented prefetches are a big win for count + * values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4. + * They also hurt a little (<5%) on an A15 + */ + //__builtin_prefetch(src+32); + //__builtin_prefetch(dst+32); - src += UNROLL; - dst += UNROLL; - count -= UNROLL; + // Load + vsrc = vreinterpret_u8_u32(vld1_u32(src)); + vdst = vreinterpret_u8_u32(vld1_u32(dst)); + + // Process src + vsrc_wide = vmovl_u8(vsrc); + vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale)); + + // Process dst + vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale)); + + // Combine + vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); + + // Store + vst1_u32(dst, vreinterpret_u32_u8(vres)); + + src += 2; + dst += 2; + count -= 2; } - /* RBE: well, i don't like how gcc manages src/dst across the above - * loop it's constantly calculating src+bias, dst+bias and it only - * adjusts the real ones when we leave the loop. Not sure why - * it's "hoisting down" (hoisting implies above in my lexicon ;)) - * the adjustments to src/dst/count, but it does... - * (might be SSA-style internal logic... - */ -#if UNROLL == 2 if (count == 1) { - *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); - } -#else - if (count > 0) { - do { - *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); - src += 1; - dst += 1; - } while (--count > 0); - } -#endif + uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres; + uint16x8_t vsrc_wide, vdst_wide; -#undef UNROLL + // Load + vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0)); + vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0)); + + // Process + vsrc_wide = vmovl_u8(vsrc); + vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale)); + vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale)); + vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); + + // Store + vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); } } +#ifdef SK_CPU_ARM32 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, const SkPMColor* SK_RESTRICT src, int count, U8CPU alpha) { @@ -961,6 +1108,7 @@ static void showme16(char *str, void *p, int len) SkDebugf("%s\n", buf); } #endif +#endif // #ifdef SK_CPU_ARM32 void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, const SkPMColor* SK_RESTRICT src, @@ -970,9 +1118,8 @@ void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, #define UNROLL 8 if (count >= UNROLL) { - uint8x8_t dbase; -#if defined(DEBUG_OPAQUE_DITHER) +#if defined(DEBUG_OPAQUE_DITHER) uint16_t tmpbuf[UNROLL]; int td[UNROLL]; int tdv[UNROLL]; @@ -983,35 +1130,37 @@ void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, int noisy = 0; #endif + uint8x8_t dbase; const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; dbase = vld1_u8(dstart); do { + uint8x8x4_t vsrc; uint8x8_t sr, sg, sb, sa, d; uint16x8_t dst8, scale8, alpha8; uint16x8_t dst_r, dst_g, dst_b; -#if defined(DEBUG_OPAQUE_DITHER) - /* calculate 8 elements worth into a temp buffer */ - { - int my_y = y; - int my_x = x; - SkPMColor* my_src = (SkPMColor*)src; - uint16_t* my_dst = dst; - int i; - - DITHER_565_SCAN(my_y); - for(i=0;i<UNROLL;i++) { +#if defined(DEBUG_OPAQUE_DITHER) + // calculate 8 elements worth into a temp buffer + { + int my_y = y; + int my_x = x; + SkPMColor* my_src = (SkPMColor*)src; + uint16_t* my_dst = dst; + int i; + + DITHER_565_SCAN(my_y); + for(i = 0; i < UNROLL; i++) { SkPMColor c = *my_src++; SkPMColorAssert(c); if (c) { unsigned a = SkGetPackedA32(c); int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a)); - tdv[i] = DITHER_VALUE(my_x); - ta[i] = a; - tap[i] = SkAlpha255To256(a); - td[i] = d; + tdv[i] = DITHER_VALUE(my_x); + ta[i] = a; + tap[i] = SkAlpha255To256(a); + td[i] = d; unsigned sr = SkGetPackedR32(c); unsigned sg = SkGetPackedG32(c); @@ -1025,147 +1174,132 @@ void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); // now src and dst expanded are in g:11 r:10 x:1 b:10 tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); - td[i] = d; - + td[i] = d; } else { - tmpbuf[i] = *my_dst; - ta[i] = tdv[i] = td[i] = 0xbeef; - } - in_dst[i] = *my_dst; + tmpbuf[i] = *my_dst; + ta[i] = tdv[i] = td[i] = 0xbeef; + } + in_dst[i] = *my_dst; my_dst += 1; DITHER_INC_X(my_x); - } - } + } + } #endif - /* source is in ABGR */ +#ifdef SK_CPU_ARM64 + vsrc = sk_vld4_u8_arm64_4(src); +#else { register uint8x8_t d0 asm("d0"); register uint8x8_t d1 asm("d1"); register uint8x8_t d2 asm("d2"); register uint8x8_t d3 asm("d3"); - asm ("vld4.8 {d0-d3},[%4] /* r=%P0 g=%P1 b=%P2 a=%P3 */" - : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3) - : "r" (src) - ); - sr = d0; sg = d1; sb = d2; sa = d3; + asm ("vld4.8 {d0-d3},[%[src]]! " + : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src) + : + ); + vsrc.val[0] = d0; + vsrc.val[1] = d1; + vsrc.val[2] = d2; + vsrc.val[3] = d3; } - - /* calculate 'd', which will be 0..7 */ - /* dbase[] is 0..7; alpha is 0..256; 16 bits suffice */ -#if defined(SK_BUILD_FOR_ANDROID) - /* SkAlpha255To256() semantic a+1 vs a+a>>7 */ - alpha8 = vaddw_u8(vmovl_u8(sa), vdup_n_u8(1)); -#else - alpha8 = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7)); #endif - alpha8 = vmulq_u16(alpha8, vmovl_u8(dbase)); - d = vshrn_n_u16(alpha8, 8); /* narrowing too */ + sa = vsrc.val[NEON_A]; + sr = vsrc.val[NEON_R]; + sg = vsrc.val[NEON_G]; + sb = vsrc.val[NEON_B]; - /* sr = sr - (sr>>5) + d */ + /* calculate 'd', which will be 0..7 + * dbase[] is 0..7; alpha is 0..256; 16 bits suffice + */ + alpha8 = vmovl_u8(dbase); + alpha8 = vmlal_u8(alpha8, sa, dbase); + d = vshrn_n_u16(alpha8, 8); // narrowing too + + // sr = sr - (sr>>5) + d /* watching for 8-bit overflow. d is 0..7; risky range of * sr is >248; and then (sr>>5) is 7 so it offsets 'd'; - * safe as long as we do ((sr-sr>>5) + d) */ + * safe as long as we do ((sr-sr>>5) + d) + */ sr = vsub_u8(sr, vshr_n_u8(sr, 5)); sr = vadd_u8(sr, d); - /* sb = sb - (sb>>5) + d */ + // sb = sb - (sb>>5) + d sb = vsub_u8(sb, vshr_n_u8(sb, 5)); sb = vadd_u8(sb, d); - /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */ + // sg = sg - (sg>>6) + d>>1; similar logic for overflows sg = vsub_u8(sg, vshr_n_u8(sg, 6)); sg = vadd_u8(sg, vshr_n_u8(d,1)); - /* need to pick up 8 dst's -- at 16 bits each, 128 bits */ + // need to pick up 8 dst's -- at 16 bits each, 128 bits dst8 = vld1q_u16(dst); - dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F)); - dst_g = vandq_u16(vshrq_n_u16(dst8,5), vdupq_n_u16(0x003F)); - dst_r = vshrq_n_u16(dst8,11); /* clearing hi bits */ - - /* blend */ -#if 1 - /* SkAlpha255To256() semantic a+1 vs a+a>>7 */ - /* originally 255-sa + 1 */ + dst_b = vandq_u16(dst8, vdupq_n_u16(SK_B16_MASK)); + dst_g = vshrq_n_u16(vshlq_n_u16(dst8, SK_R16_BITS), SK_R16_BITS + SK_B16_BITS); + dst_r = vshrq_n_u16(dst8, SK_R16_SHIFT); // clearing hi bits + + // blend scale8 = vsubw_u8(vdupq_n_u16(256), sa); -#else - scale8 = vsubw_u8(vdupq_n_u16(255), sa); - scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7)); -#endif -#if 1 - /* combine the addq and mul, save 3 insns */ + // combine the addq and mul, save 3 insns scale8 = vshrq_n_u16(scale8, 3); dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8); dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8); dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8); -#else - /* known correct, but +3 insns over above */ - scale8 = vshrq_n_u16(scale8, 3); - dst_b = vmulq_u16(dst_b, scale8); - dst_g = vmulq_u16(dst_g, scale8); - dst_r = vmulq_u16(dst_r, scale8); - - /* combine */ - /* NB: vshll widens, need to preserve those bits */ - dst_b = vaddq_u16(dst_b, vshll_n_u8(sb,2)); - dst_g = vaddq_u16(dst_g, vshll_n_u8(sg,3)); - dst_r = vaddq_u16(dst_r, vshll_n_u8(sr,2)); -#endif - /* repack to store */ - dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F)); + // repack to store + dst8 = vshrq_n_u16(dst_b, 5); dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5); dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11); vst1q_u16(dst, dst8); -#if defined(DEBUG_OPAQUE_DITHER) - /* verify my 8 elements match the temp buffer */ - { - int i, bad=0; - static int invocation; - - for (i=0;i<UNROLL;i++) - if (tmpbuf[i] != dst[i]) bad=1; - if (bad) { - SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n", - invocation, offset); - SkDebugf(" alpha 0x%x\n", alpha); - for (i=0;i<UNROLL;i++) - SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n", - i, ((tmpbuf[i] != dst[i])?"BAD":"got"), - dst[i], tmpbuf[i], in_dst[i], src[i], td[i], tdv[i], tap[i], ta[i]); - - showme16("alpha8", &alpha8, sizeof(alpha8)); - showme16("scale8", &scale8, sizeof(scale8)); - showme8("d", &d, sizeof(d)); - showme16("dst8", &dst8, sizeof(dst8)); - showme16("dst_b", &dst_b, sizeof(dst_b)); - showme16("dst_g", &dst_g, sizeof(dst_g)); - showme16("dst_r", &dst_r, sizeof(dst_r)); - showme8("sb", &sb, sizeof(sb)); - showme8("sg", &sg, sizeof(sg)); - showme8("sr", &sr, sizeof(sr)); - - /* cop out */ - return; - } - offset += UNROLL; - invocation++; - } -#endif +#if defined(DEBUG_OPAQUE_DITHER) + // verify my 8 elements match the temp buffer + { + int i, bad=0; + static int invocation; - dst += UNROLL; - src += UNROLL; + for (i = 0; i < UNROLL; i++) { + if (tmpbuf[i] != dst[i]) { + bad=1; + } + } + if (bad) { + SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n", + invocation, offset); + SkDebugf(" alpha 0x%x\n", alpha); + for (i = 0; i < UNROLL; i++) + SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n", + i, ((tmpbuf[i] != dst[i])?"BAD":"got"), dst[i], tmpbuf[i], + in_dst[i], src[i-8], td[i], tdv[i], tap[i], ta[i]); + + showme16("alpha8", &alpha8, sizeof(alpha8)); + showme16("scale8", &scale8, sizeof(scale8)); + showme8("d", &d, sizeof(d)); + showme16("dst8", &dst8, sizeof(dst8)); + showme16("dst_b", &dst_b, sizeof(dst_b)); + showme16("dst_g", &dst_g, sizeof(dst_g)); + showme16("dst_r", &dst_r, sizeof(dst_r)); + showme8("sb", &sb, sizeof(sb)); + showme8("sg", &sg, sizeof(sg)); + showme8("sr", &sr, sizeof(sr)); + + return; + } + offset += UNROLL; + invocation++; + } +#endif + dst += UNROLL; count -= UNROLL; - /* skip x += UNROLL, since it's unchanged mod-4 */ + // skip x += UNROLL, since it's unchanged mod-4 } while (count >= UNROLL); } #undef UNROLL - /* residuals */ + // residuals if (count > 0) { DITHER_565_SCAN(y); do { @@ -1218,7 +1352,11 @@ void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst, uint8x8_t sr, sg, sb; uint16x8_t dr, dg, db; uint16x8_t dst8; + uint8x8x4_t vsrc; +#ifdef SK_CPU_ARM64 + vsrc = sk_vld4_u8_arm64_3(src); +#else { register uint8x8_t d0 asm("d0"); register uint8x8_t d1 asm("d1"); @@ -1226,17 +1364,19 @@ void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst, register uint8x8_t d3 asm("d3"); asm ( - "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */" + "vld4.8 {d0-d3},[%[src]]! " : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) : ); - sg = d1; -#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) - sr = d2; sb = d0; -#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) - sr = d0; sb = d2; -#endif + vsrc.val[0] = d0; + vsrc.val[1] = d1; + vsrc.val[2] = d2; } +#endif + sr = vsrc.val[NEON_R]; + sg = vsrc.val[NEON_G]; + sb = vsrc.val[NEON_B]; + /* XXX: if we want to prefetch, hide it in the above asm() * using the gcc __builtin_prefetch(), the prefetch will * fall to the bottom of the loop -- it won't stick up @@ -1321,84 +1461,88 @@ void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count, unsigned colorA = SkGetPackedA32(color); if (255 == colorA) { sk_memset32(dst, color, count); - } else { - unsigned scale = 256 - SkAlpha255To256(colorA); + return; + } - if (count >= 8) { - // at the end of this assembly, count will have been decremented - // to a negative value. That is, if count mod 8 = x, it will be - // -8 +x coming out. - asm volatile ( - PLD128(src, 0) - - "vdup.32 q0, %[color] \n\t" - - PLD128(src, 128) - - // scale numerical interval [0-255], so load as 8 bits - "vdup.8 d2, %[scale] \n\t" - - PLD128(src, 256) - - "subs %[count], %[count], #8 \n\t" - - PLD128(src, 384) - - "Loop_Color32: \n\t" - - // load src color, 8 pixels, 4 64 bit registers - // (and increment src). - "vld1.32 {d4-d7}, [%[src]]! \n\t" - - PLD128(src, 384) - - // multiply long by scale, 64 bits at a time, - // destination into a 128 bit register. - "vmull.u8 q4, d4, d2 \n\t" - "vmull.u8 q5, d5, d2 \n\t" - "vmull.u8 q6, d6, d2 \n\t" - "vmull.u8 q7, d7, d2 \n\t" - - // shift the 128 bit registers, containing the 16 - // bit scaled values back to 8 bits, narrowing the - // results to 64 bit registers. - "vshrn.i16 d8, q4, #8 \n\t" - "vshrn.i16 d9, q5, #8 \n\t" - "vshrn.i16 d10, q6, #8 \n\t" - "vshrn.i16 d11, q7, #8 \n\t" - - // adding back the color, using 128 bit registers. - "vadd.i8 q6, q4, q0 \n\t" - "vadd.i8 q7, q5, q0 \n\t" - - // store back the 8 calculated pixels (2 128 bit - // registers), and increment dst. - "vst1.32 {d12-d15}, [%[dst]]! \n\t" - - "subs %[count], %[count], #8 \n\t" - "bge Loop_Color32 \n\t" - : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count) - : [color] "r" (color), [scale] "r" (scale) - : "cc", "memory", - "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", - "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15" - ); - // At this point, if we went through the inline assembly, count is - // a negative value: - // if the value is -8, there is no pixel left to process. - // if the value is -7, there is one pixel left to process - // ... - // And'ing it with 7 will give us the number of pixels - // left to process. - count = count & 0x7; - } + unsigned scale = 256 - SkAlpha255To256(colorA); - while (count > 0) { - *dst = color + SkAlphaMulQ(*src, scale); - src += 1; - dst += 1; - count--; - } + if (count >= 8) { + uint32x4_t vcolor; + uint8x8_t vscale; + + vcolor = vdupq_n_u32(color); + + // scale numerical interval [0-255], so load as 8 bits + vscale = vdup_n_u8(scale); + + do { + // load src color, 8 pixels, 4 64 bit registers + // (and increment src). + uint32x2x4_t vsrc; +#if defined(SK_CPU_ARM32) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))) + asm ( + "vld1.32 %h[vsrc], [%[src]]!" + : [vsrc] "=w" (vsrc), [src] "+r" (src) + : : + ); +#else // 64bit targets and Clang + vsrc.val[0] = vld1_u32(src); + vsrc.val[1] = vld1_u32(src+2); + vsrc.val[2] = vld1_u32(src+4); + vsrc.val[3] = vld1_u32(src+6); + src += 8; +#endif + + // multiply long by scale, 64 bits at a time, + // destination into a 128 bit register. + uint16x8x4_t vtmp; + vtmp.val[0] = vmull_u8(vreinterpret_u8_u32(vsrc.val[0]), vscale); + vtmp.val[1] = vmull_u8(vreinterpret_u8_u32(vsrc.val[1]), vscale); + vtmp.val[2] = vmull_u8(vreinterpret_u8_u32(vsrc.val[2]), vscale); + vtmp.val[3] = vmull_u8(vreinterpret_u8_u32(vsrc.val[3]), vscale); + + // shift the 128 bit registers, containing the 16 + // bit scaled values back to 8 bits, narrowing the + // results to 64 bit registers. + uint8x16x2_t vres; + vres.val[0] = vcombine_u8( + vshrn_n_u16(vtmp.val[0], 8), + vshrn_n_u16(vtmp.val[1], 8)); + vres.val[1] = vcombine_u8( + vshrn_n_u16(vtmp.val[2], 8), + vshrn_n_u16(vtmp.val[3], 8)); + + // adding back the color, using 128 bit registers. + uint32x4x2_t vdst; + vdst.val[0] = vreinterpretq_u32_u8(vres.val[0] + + vreinterpretq_u8_u32(vcolor)); + vdst.val[1] = vreinterpretq_u32_u8(vres.val[1] + + vreinterpretq_u8_u32(vcolor)); + + // store back the 8 calculated pixels (2 128 bit + // registers), and increment dst. +#if defined(SK_CPU_ARM32) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))) + asm ( + "vst1.32 %h[vdst], [%[dst]]!" + : [dst] "+r" (dst) + : [vdst] "w" (vdst) + : "memory" + ); +#else // 64bit targets and Clang + vst1q_u32(dst, vdst.val[0]); + vst1q_u32(dst+4, vdst.val[1]); + dst += 8; +#endif + count -= 8; + + } while (count >= 8); + } + + while (count > 0) { + *dst = color + SkAlphaMulQ(*src, scale); + src += 1; + dst += 1; + count--; } } @@ -1406,12 +1550,13 @@ void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count, const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = { // no dither - // NOTE: For the S32_D565_Blend function below, we don't have a special - // version that assumes that each source pixel is opaque. But our - // S32A is still faster than the default, so use it. S32_D565_Opaque_neon, - S32A_D565_Blend_neon, // really S32_D565_Blend + S32_D565_Blend_neon, +#ifdef SK_CPU_ARM32 S32A_D565_Opaque_neon, +#else + NULL, +#endif S32A_D565_Blend_neon, // dither @@ -1439,5 +1584,9 @@ const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { #else S32A_Opaque_BlitRow32_neon, // S32A_Opaque, #endif +#ifdef SK_CPU_ARM32 S32A_Blend_BlitRow32_neon // S32A_Blend +#else + NULL +#endif }; diff --git a/chromium/third_party/skia/src/opts/SkBlitRow_opts_mips_dsp.cpp b/chromium/third_party/skia/src/opts/SkBlitRow_opts_mips_dsp.cpp new file mode 100644 index 00000000000..30bb4c2701a --- /dev/null +++ b/chromium/third_party/skia/src/opts/SkBlitRow_opts_mips_dsp.cpp @@ -0,0 +1,848 @@ +/* + * Copyright 2014 The Android Open Source Project + * + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ + +#include "SkBlitRow.h" +#include "SkBlitMask.h" +#include "SkColorPriv.h" +#include "SkDither.h" +#include "SkMathPriv.h" + +static void S32_D565_Blend_mips_dsp(uint16_t* SK_RESTRICT dst, + const SkPMColor* SK_RESTRICT src, int count, + U8CPU alpha, int /*x*/, int /*y*/) { + register uint32_t t0, t1, t2, t3, t4, t5, t6; + register uint32_t s0, s1, s2, s4, s5, s6; + + alpha += 1; + if (count >= 2) { + __asm__ volatile ( + ".set push \n\t" + ".set noreorder \n\t" + "sll %[s4], %[alpha], 8 \n\t" + "or %[s4], %[s4], %[alpha] \n\t" + "repl.ph %[s5], 0x1f \n\t" + "repl.ph %[s6], 0x3f \n\t" + "1: \n\t" + "lw %[s2], 0(%[src]) \n\t" + "lw %[s1], 4(%[src]) \n\t" + "lwr %[s0], 0(%[dst]) \n\t" + "lwl %[s0], 3(%[dst]) \n\t" + "and %[t1], %[s0], %[s5] \n\t" + "shra.ph %[t0], %[s0], 5 \n\t" + "and %[t2], %[t0], %[s6] \n\t" +#ifdef __MIPS_HAVE_DSPR2 + "shrl.ph %[t3], %[s0], 11 \n\t" +#else + "shra.ph %[t0], %[s0], 11 \n\t" + "and %[t3], %[t0], %[s5] \n\t" +#endif + "precrq.ph.w %[t0], %[s1], %[s2] \n\t" + "shrl.qb %[t5], %[t0], 3 \n\t" + "and %[t4], %[t5], %[s5] \n\t" + "ins %[s2], %[s1], 16, 16 \n\t" + "preceu.ph.qbra %[t0], %[s2] \n\t" + "shrl.qb %[t6], %[t0], 3 \n\t" +#ifdef __MIPS_HAVE_DSPR2 + "shrl.ph %[t5], %[s2], 10 \n\t" +#else + "shra.ph %[t0], %[s2], 10 \n\t" + "and %[t5], %[t0], %[s6] \n\t" +#endif + "subu.qb %[t4], %[t4], %[t1] \n\t" + "subu.qb %[t5], %[t5], %[t2] \n\t" + "subu.qb %[t6], %[t6], %[t3] \n\t" + "muleu_s.ph.qbr %[t4], %[s4], %[t4] \n\t" + "muleu_s.ph.qbr %[t5], %[s4], %[t5] \n\t" + "muleu_s.ph.qbr %[t6], %[s4], %[t6] \n\t" + "addiu %[count], %[count], -2 \n\t" + "addiu %[src], %[src], 8 \n\t" + "shra.ph %[t4], %[t4], 8 \n\t" + "shra.ph %[t5], %[t5], 8 \n\t" + "shra.ph %[t6], %[t6], 8 \n\t" + "addu.qb %[t4], %[t4], %[t1] \n\t" + "addu.qb %[t5], %[t5], %[t2] \n\t" + "addu.qb %[t6], %[t6], %[t3] \n\t" + "andi %[s0], %[t4], 0xffff \n\t" + "andi %[t0], %[t5], 0xffff \n\t" + "sll %[t0], %[t0], 0x5 \n\t" + "or %[s0], %[s0], %[t0] \n\t" + "sll %[t0], %[t6], 0xb \n\t" + "or %[t0], %[t0], %[s0] \n\t" + "sh %[t0], 0(%[dst]) \n\t" + "srl %[s1], %[t4], 16 \n\t" + "srl %[t0], %[t5], 16 \n\t" + "sll %[t5], %[t0], 5 \n\t" + "or %[t0], %[t5], %[s1] \n\t" + "srl %[s0], %[t6], 16 \n\t" + "sll %[s2], %[s0], 0xb \n\t" + "or %[s1], %[s2], %[t0] \n\t" + "sh %[s1], 2(%[dst]) \n\t" + "bge %[count], 2, 1b \n\t" + " addiu %[dst], %[dst], 4 \n\t" + ".set pop \n\t" + : [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3), + [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [s0]"=&r"(s0), + [s1]"=&r"(s1), [s2]"=&r"(s2), [s4]"=&r"(s4), [s5]"=&r"(s5), + [s6]"=&r"(s6), [count]"+r"(count), [dst]"+r"(dst), + [src]"+r"(src) + : [alpha]"r"(alpha) + : "memory", "hi", "lo" + ); + } + + if (count == 1) { + SkPMColor c = *src++; + SkPMColorAssert(c); + SkASSERT(SkGetPackedA32(c) == 255); + uint16_t d = *dst; + *dst++ = SkPackRGB16(SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), alpha), + SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), alpha), + SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), alpha)); + } +} + +static void S32A_D565_Opaque_Dither_mips_dsp(uint16_t* __restrict__ dst, + const SkPMColor* __restrict__ src, + int count, U8CPU alpha, int x, int y) { + __asm__ volatile ( + "pref 0, 0(%[src]) \n\t" + "pref 1, 0(%[dst]) \n\t" + "pref 0, 32(%[src]) \n\t" + "pref 1, 32(%[dst]) \n\t" + : + : [src]"r"(src), [dst]"r"(dst) + : "memory" + ); + + register int32_t t0, t1, t2, t3, t4, t5, t6; + register int32_t t7, t8, t9, s0, s1, s2, s3; + const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3]; + + if (count >= 2) { + __asm__ volatile ( + ".set push \n\t" + ".set noreorder \n\t" + "li %[s1], 0x01010101 \n\t" + "li %[s2], -2017 \n\t" + "1: \n\t" + "bnez %[s3], 4f \n\t" + " li %[s3], 2 \n\t" + "pref 0, 64(%[src]) \n\t" + "pref 1, 64(%[dst]) \n\t" + "4: \n\t" + "addiu %[s3], %[s3], -1 \n\t" + "lw %[t1], 0(%[src]) \n\t" + "andi %[t3], %[x], 0x3 \n\t" + "addiu %[x], %[x], 1 \n\t" + "sll %[t4], %[t3], 2 \n\t" + "srav %[t5], %[dither_scan], %[t4] \n\t" + "andi %[t3], %[t5], 0xf \n\t" + "lw %[t2], 4(%[src]) \n\t" + "andi %[t4], %[x], 0x3 \n\t" + "sll %[t5], %[t4], 2 \n\t" + "srav %[t6], %[dither_scan], %[t5] \n\t" + "addiu %[x], %[x], 1 \n\t" + "ins %[t3], %[t6], 8, 4 \n\t" + "srl %[t4], %[t1], 24 \n\t" + "addiu %[t0], %[t4], 1 \n\t" + "srl %[t4], %[t2], 24 \n\t" + "addiu %[t5], %[t4], 1 \n\t" + "ins %[t0], %[t5], 16, 16 \n\t" + "muleu_s.ph.qbr %[t4], %[t3], %[t0] \n\t" + "preceu.ph.qbla %[t3], %[t4] \n\t" + "andi %[t4], %[t1], 0xff \n\t" + "ins %[t4], %[t2], 16, 8 \n\t" + "shrl.qb %[t5], %[t4], 5 \n\t" + "subu.qb %[t6], %[t3], %[t5] \n\t" + "addq.ph %[t5], %[t6], %[t4] \n\t" + "ext %[t4], %[t1], 8, 8 \n\t" + "srl %[t6], %[t2], 8 \n\t" + "ins %[t4], %[t6], 16, 8 \n\t" + "shrl.qb %[t6], %[t4], 6 \n\t" + "shrl.qb %[t7], %[t3], 1 \n\t" + "subu.qb %[t8], %[t7], %[t6] \n\t" + "addq.ph %[t6], %[t8], %[t4] \n\t" + "ext %[t4], %[t1], 16, 8 \n\t" + "srl %[t7], %[t2], 16 \n\t" + "ins %[t4], %[t7], 16, 8 \n\t" + "shrl.qb %[t7], %[t4], 5 \n\t" + "subu.qb %[t8], %[t3], %[t7] \n\t" + "addq.ph %[t7], %[t8], %[t4] \n\t" + "shll.ph %[t4], %[t7], 2 \n\t" + "andi %[t9], %[t4], 0xffff \n\t" + "srl %[s0], %[t4], 16 \n\t" + "andi %[t3], %[t6], 0xffff \n\t" + "srl %[t4], %[t6], 16 \n\t" + "andi %[t6], %[t5], 0xffff \n\t" + "srl %[t7], %[t5], 16 \n\t" + "subq.ph %[t5], %[s1], %[t0] \n\t" + "srl %[t0], %[t5], 3 \n\t" + "beqz %[t1], 3f \n\t" + " lhu %[t5], 0(%[dst]) \n\t" + "sll %[t1], %[t6], 13 \n\t" + "or %[t8], %[t9], %[t1] \n\t" + "sll %[t1], %[t3], 24 \n\t" + "or %[t9], %[t1], %[t8] \n\t" + "andi %[t3], %[t5], 0x7e0 \n\t" + "sll %[t6], %[t3], 0x10 \n\t" + "and %[t8], %[s2], %[t5] \n\t" + "or %[t5], %[t6], %[t8] \n\t" + "andi %[t6], %[t0], 0xff \n\t" + "mul %[t1], %[t6], %[t5] \n\t" + "addu %[t5], %[t1], %[t9] \n\t" + "srl %[t6], %[t5], 5 \n\t" + "and %[t5], %[s2], %[t6] \n\t" + "srl %[t8], %[t6], 16 \n\t" + "andi %[t6], %[t8], 0x7e0 \n\t" + "or %[t1], %[t5], %[t6] \n\t" + "sh %[t1], 0(%[dst]) \n\t" + "3: \n\t" + "beqz %[t2], 2f \n\t" + " lhu %[t5], 2(%[dst]) \n\t" + "sll %[t1], %[t7], 13 \n\t" + "or %[t8], %[s0], %[t1] \n\t" + "sll %[t1], %[t4], 24 \n\t" + "or %[t9], %[t1], %[t8] \n\t" + "andi %[t3], %[t5], 0x7e0 \n\t" + "sll %[t6], %[t3], 0x10 \n\t" + "and %[t8], %[s2], %[t5] \n\t" + "or %[t5], %[t6], %[t8] \n\t" + "srl %[t6], %[t0], 16 \n\t" + "mul %[t1], %[t6], %[t5] \n\t" + "addu %[t5], %[t1], %[t9] \n\t" + "srl %[t6], %[t5], 5 \n\t" + "and %[t5], %[s2], %[t6] \n\t" + "srl %[t8], %[t6], 16 \n\t" + "andi %[t6], %[t8], 0x7e0 \n\t" + "or %[t1], %[t5], %[t6] \n\t" + "sh %[t1], 2(%[dst]) \n\t" + "2: \n\t" + "addiu %[count], %[count], -2 \n\t" + "addiu %[src], %[src], 8 \n\t" + "addiu %[t1], %[count], -1 \n\t" + "bgtz %[t1], 1b \n\t" + " addiu %[dst], %[dst], 4 \n\t" + ".set pop \n\t" + : [src]"+r"(src), [count]"+r"(count), [dst]"+r"(dst), [x]"+r"(x), + [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3), + [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7), + [t8]"=&r"(t8), [t9]"=&r"(t9), [s0]"=&r"(s0), [s1]"=&r"(s1), + [s2]"=&r"(s2), [s3]"=&r"(s3) + : [dither_scan]"r"(dither_scan) + : "memory", "hi", "lo" + ); + } + + if (count == 1) { + SkPMColor c = *src++; + SkPMColorAssert(c); + if (c) { + unsigned a = SkGetPackedA32(c); + int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a)); + + unsigned sr = SkGetPackedR32(c); + unsigned sg = SkGetPackedG32(c); + unsigned sb = SkGetPackedB32(c); + sr = SkDITHER_R32_FOR_565(sr, d); + sg = SkDITHER_G32_FOR_565(sg, d); + sb = SkDITHER_B32_FOR_565(sb, d); + + uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2); + uint32_t dst_expanded = SkExpand_rgb_16(*dst); + dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); + // now src and dst expanded are in g:11 r:10 x:1 b:10 + *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); + } + dst += 1; + DITHER_INC_X(x); + } +} + +static void S32_D565_Opaque_Dither_mips_dsp(uint16_t* __restrict__ dst, + const SkPMColor* __restrict__ src, + int count, U8CPU alpha, int x, int y) { + uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3]; + register uint32_t t0, t1, t2, t3, t4, t5; + register uint32_t t6, t7, t8, t9, s0; + int dither[4]; + int i; + + for (i = 0; i < 4; i++, x++) { + dither[i] = (dither_scan >> ((x & 3) << 2)) & 0xF; + } + + __asm__ volatile ( + ".set push \n\t" + ".set noreorder \n\t" + "li %[s0], 1 \n\t" + "2: \n\t" + "beqz %[count], 1f \n\t" + " nop \n\t" + "addiu %[t0], %[count], -1 \n\t" + "beqz %[t0], 1f \n\t" + " nop \n\t" + "beqz %[s0], 3f \n\t" + " nop \n\t" + "lw %[t0], 0(%[dither]) \n\t" + "lw %[t1], 4(%[dither]) \n\t" + "li %[s0], 0 \n\t" + "b 4f \n\t" + " nop \n\t" + "3: \n\t" + "lw %[t0], 8(%[dither]) \n\t" + "lw %[t1], 12(%[dither]) \n\t" + "li %[s0], 1 \n\t" + "4: \n\t" + "sll %[t2], %[t0], 16 \n\t" + "or %[t1], %[t2], %[t1] \n\t" + "lw %[t0], 0(%[src]) \n\t" + "lw %[t2], 4(%[src]) \n\t" + "precrq.ph.w %[t3], %[t0], %[t2] \n\t" + "preceu.ph.qbra %[t9], %[t3] \n\t" +#ifdef __MIPS_HAVE_DSPR2 + "append %[t0], %[t2], 16 \n\t" + "preceu.ph.qbra %[t4], %[t0] \n\t" + "preceu.ph.qbla %[t5], %[t0] \n\t" +#else + "sll %[t6], %[t0], 16 \n\t" + "sll %[t7], %[t2], 16 \n\t" + "precrq.ph.w %[t8], %[t6], %[t7] \n\t" + "preceu.ph.qbra %[t4], %[t8] \n\t" + "preceu.ph.qbla %[t5], %[t8] \n\t" +#endif + "addu.qb %[t0], %[t4], %[t1] \n\t" + "shra.ph %[t2], %[t4], 5 \n\t" + "subu.qb %[t3], %[t0], %[t2] \n\t" + "shra.ph %[t6], %[t3], 3 \n\t" + "addu.qb %[t0], %[t9], %[t1] \n\t" + "shra.ph %[t2], %[t9], 5 \n\t" + "subu.qb %[t3], %[t0], %[t2] \n\t" + "shra.ph %[t7], %[t3], 3 \n\t" + "shra.ph %[t0], %[t1], 1 \n\t" + "shra.ph %[t2], %[t5], 6 \n\t" + "addu.qb %[t3], %[t5], %[t0] \n\t" + "subu.qb %[t4], %[t3], %[t2] \n\t" + "shra.ph %[t8], %[t4], 2 \n\t" + "precrq.ph.w %[t0], %[t6], %[t7] \n\t" +#ifdef __MIPS_HAVE_DSPR2 + "append %[t6], %[t7], 16 \n\t" +#else + "sll %[t6], %[t6], 16 \n\t" + "sll %[t2], %[t7], 16 \n\t" + "precrq.ph.w %[t6], %[t6], %[t2] \n\t" +#endif + "sra %[t4], %[t8], 16 \n\t" + "andi %[t5], %[t8], 0xFF \n\t" + "sll %[t7], %[t4], 5 \n\t" + "sra %[t8], %[t0], 5 \n\t" + "or %[t9], %[t7], %[t8] \n\t" + "or %[t3], %[t9], %[t0] \n\t" + "andi %[t4], %[t3], 0xFFFF \n\t" + "sll %[t7], %[t5], 5 \n\t" + "sra %[t8], %[t6], 5 \n\t" + "or %[t9], %[t7], %[t8] \n\t" + "or %[t3], %[t9], %[t6] \n\t" + "and %[t7], %[t3], 0xFFFF \n\t" + "sh %[t4], 0(%[dst]) \n\t" + "sh %[t7], 2(%[dst]) \n\t" + "addiu %[count], %[count], -2 \n\t" + "addiu %[src], %[src], 8 \n\t" + "b 2b \n\t" + " addiu %[dst], %[dst], 4 \n\t" + "1: \n\t" + ".set pop \n\t" + : [dst]"+r"(dst), [src]"+r"(src), [count]"+r"(count), + [x]"+r"(x), [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), + [t3]"=&r"(t3), [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), + [t7]"=&r"(t7), [t8]"=&r"(t8), [t9]"=&r"(t9), [s0]"=&r"(s0) + : [dither] "r" (dither) + : "memory" + ); + + if (count == 1) { + SkPMColor c = *src++; + SkPMColorAssert(c); // only if DEBUG is turned on + SkASSERT(SkGetPackedA32(c) == 255); + unsigned dither = DITHER_VALUE(x); + *dst++ = SkDitherRGB32To565(c, dither); + } +} + +static void S32_D565_Blend_Dither_mips_dsp(uint16_t* dst, + const SkPMColor* src, + int count, U8CPU alpha, int x, int y) { + register int32_t t0, t1, t2, t3, t4, t5, t6; + register int32_t s0, s1, s2, s3; + register int x1 = 0; + register uint32_t sc_mul; + register uint32_t sc_add; +#ifdef ENABLE_DITHER_MATRIX_4X4 + const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3]; +#else // ENABLE_DITHER_MATRIX_4X4 + const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3]; +#endif // ENABLE_DITHER_MATRIX_4X4 + int dither[4]; + + for (int i = 0; i < 4; i++) { + dither[i] = (dither_scan >> ((x & 3) << 2)) & 0xF; + x += 1; + } + alpha += 1; + __asm__ volatile ( + ".set push \n\t" + ".set noreorder \n\t" + "li %[t0], 0x100 \n\t" + "subu %[t0], %[t0], %[alpha] \n\t" + "replv.ph %[sc_mul], %[alpha] \n\t" + "beqz %[alpha], 1f \n\t" + " nop \n\t" + "replv.qb %[sc_add], %[t0] \n\t" + "b 2f \n\t" + " nop \n\t" + "1: \n\t" + "replv.qb %[sc_add], %[alpha] \n\t" + "2: \n\t" + "addiu %[t2], %[count], -1 \n\t" + "blez %[t2], 3f \n\t" + " nop \n\t" + "lw %[s0], 0(%[src]) \n\t" + "lw %[s1], 4(%[src]) \n\t" + "bnez %[x1], 4f \n\t" + " nop \n\t" + "lw %[t0], 0(%[dither]) \n\t" + "lw %[t1], 4(%[dither]) \n\t" + "li %[x1], 1 \n\t" + "b 5f \n\t" + " nop \n\t" + "4: \n\t" + "lw %[t0], 8(%[dither]) \n\t" + "lw %[t1], 12(%[dither]) \n\t" + "li %[x1], 0 \n\t" + "5: \n\t" + "sll %[t3], %[t0], 7 \n\t" + "sll %[t4], %[t1], 7 \n\t" +#ifdef __MIPS_HAVE_DSPR2 + "append %[t0], %[t1], 16 \n\t" +#else + "sll %[t0], %[t0], 8 \n\t" + "sll %[t2], %[t1], 8 \n\t" + "precrq.qb.ph %[t0], %[t0], %[t2] \n\t" +#endif + "precrq.qb.ph %[t1], %[t3], %[t4] \n\t" + "sll %[t5], %[s0], 8 \n\t" + "sll %[t6], %[s1], 8 \n\t" + "precrq.qb.ph %[t4], %[t5], %[t6] \n\t" + "precrq.qb.ph %[t6], %[s0], %[s1] \n\t" + "preceu.ph.qbla %[t5], %[t4] \n\t" + "preceu.ph.qbra %[t4], %[t4] \n\t" + "preceu.ph.qbra %[t6], %[t6] \n\t" + "lh %[t2], 0(%[dst]) \n\t" + "lh %[s1], 2(%[dst]) \n\t" +#ifdef __MIPS_HAVE_DSPR2 + "append %[t2], %[s1], 16 \n\t" +#else + "sll %[s1], %[s1], 16 \n\t" + "packrl.ph %[t2], %[t2], %[s1] \n\t" +#endif + "shra.ph %[s1], %[t2], 11 \n\t" + "and %[s1], %[s1], 0x1F001F \n\t" + "shra.ph %[s2], %[t2], 5 \n\t" + "and %[s2], %[s2], 0x3F003F \n\t" + "and %[s3], %[t2], 0x1F001F \n\t" + "shrl.qb %[t3], %[t4], 5 \n\t" + "addu.qb %[t4], %[t4], %[t0] \n\t" + "subu.qb %[t4], %[t4], %[t3] \n\t" + "shrl.qb %[t4], %[t4], 3 \n\t" + "shrl.qb %[t3], %[t5], 5 \n\t" + "addu.qb %[t5], %[t5], %[t0] \n\t" + "subu.qb %[t5], %[t5], %[t3] \n\t" + "shrl.qb %[t5], %[t5], 3 \n\t" + "shrl.qb %[t3], %[t6], 6 \n\t" + "addu.qb %[t6], %[t6], %[t1] \n\t" + "subu.qb %[t6], %[t6], %[t3] \n\t" + "shrl.qb %[t6], %[t6], 2 \n\t" + "cmpu.lt.qb %[t4], %[s1] \n\t" + "pick.qb %[s0], %[sc_add], $0 \n\t" + "addu.qb %[s0], %[s0], %[s1] \n\t" + "subu.qb %[t4], %[t4], %[s1] \n\t" + "muleu_s.ph.qbl %[t0], %[t4], %[sc_mul] \n\t" + "muleu_s.ph.qbr %[t1], %[t4], %[sc_mul] \n\t" + "precrq.qb.ph %[t4], %[t0], %[t1] \n\t" + "addu.qb %[t4], %[t4], %[s0] \n\t" + "cmpu.lt.qb %[t5], %[s3] \n\t" + "pick.qb %[s0], %[sc_add], $0 \n\t" + "addu.qb %[s0], %[s0], %[s3] \n\t" + "subu.qb %[t5], %[t5], %[s3] \n\t" + "muleu_s.ph.qbl %[t0], %[t5], %[sc_mul] \n\t" + "muleu_s.ph.qbr %[t1], %[t5], %[sc_mul] \n\t" + "precrq.qb.ph %[t5], %[t0], %[t1] \n\t" + "addu.qb %[t5], %[t5], %[s0] \n\t" + "cmpu.lt.qb %[t6], %[s2] \n\t" + "pick.qb %[s0], %[sc_add], $0 \n\t" + "addu.qb %[s0], %[s0], %[s2] \n\t" + "subu.qb %[t6], %[t6], %[s2] \n\t" + "muleu_s.ph.qbl %[t0], %[t6], %[sc_mul] \n\t" + "muleu_s.ph.qbr %[t1], %[t6], %[sc_mul] \n\t" + "precrq.qb.ph %[t6], %[t0], %[t1] \n\t" + "addu.qb %[t6], %[t6], %[s0] \n\t" + "shll.ph %[s1], %[t4], 11 \n\t" + "shll.ph %[t0], %[t6], 5 \n\t" + "or %[s0], %[s1], %[t0] \n\t" + "or %[s1], %[s0], %[t5] \n\t" + "srl %[t2], %[s1], 16 \n\t" + "and %[t3], %[s1], 0xFFFF \n\t" + "sh %[t2], 0(%[dst]) \n\t" + "sh %[t3], 2(%[dst]) \n\t" + "addiu %[src], %[src], 8 \n\t" + "addi %[count], %[count], -2 \n\t" + "b 2b \n\t" + " addu %[dst], %[dst], 4 \n\t" + "3: \n\t" + ".set pop \n\t" + : [src]"+r"(src), [dst]"+r"(dst), [count]"+r"(count), + [x1]"+r"(x1), [sc_mul]"=&r"(sc_mul), [sc_add]"=&r"(sc_add), + [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3), + [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [s0]"=&r"(s0), + [s1]"=&r"(s1), [s2]"=&r"(s2), [s3]"=&r"(s3) + : [dither]"r"(dither), [alpha]"r"(alpha) + : "memory", "hi", "lo" + ); + + if(count == 1) { + SkPMColor c = *src++; + SkPMColorAssert(c); + SkASSERT(SkGetPackedA32(c) == 255); + DITHER_565_SCAN(y); + int dither = DITHER_VALUE(x); + int sr = SkGetPackedR32(c); + int sg = SkGetPackedG32(c); + int sb = SkGetPackedB32(c); + sr = SkDITHER_R32To565(sr, dither); + sg = SkDITHER_G32To565(sg, dither); + sb = SkDITHER_B32To565(sb, dither); + + uint16_t d = *dst; + *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), alpha), + SkAlphaBlend(sg, SkGetPackedG16(d), alpha), + SkAlphaBlend(sb, SkGetPackedB16(d), alpha)); + DITHER_INC_X(x); + } +} + +static void S32A_D565_Opaque_mips_dsp(uint16_t* __restrict__ dst, + const SkPMColor* __restrict__ src, + int count, U8CPU alpha, int x, int y) { + + __asm__ volatile ( + "pref 0, 0(%[src]) \n\t" + "pref 1, 0(%[dst]) \n\t" + "pref 0, 32(%[src]) \n\t" + "pref 1, 32(%[dst]) \n\t" + : + : [src]"r"(src), [dst]"r"(dst) + : "memory" + ); + + register uint32_t t0, t1, t2, t3, t4, t5, t6, t7, t8; + register uint32_t t16; + register uint32_t add_x10 = 0x100010; + register uint32_t add_x20 = 0x200020; + register uint32_t sa = 0xff00ff; + + __asm__ volatile ( + ".set push \n\t" + ".set noreorder \n\t" + "blez %[count], 1f \n\t" + " nop \n\t" + "2: \n\t" + "beqz %[count], 1f \n\t" + " nop \n\t" + "addiu %[t0], %[count], -1 \n\t" + "beqz %[t0], 1f \n\t" + " nop \n\t" + "bnez %[t16], 3f \n\t" + " nop \n\t" + "li %[t16], 2 \n\t" + "pref 0, 64(%[src]) \n\t" + "pref 1, 64(%[dst]) \n\t" + "3: \n\t" + "addiu %[t16], %[t16], -1 \n\t" + "lw %[t0], 0(%[src]) \n\t" + "lw %[t1], 4(%[src]) \n\t" + "precrq.ph.w %[t2], %[t0], %[t1] \n\t" + "preceu.ph.qbra %[t8], %[t2] \n\t" +#ifdef __MIPS_HAVE_DSPR2 + "append %[t0], %[t1], 16 \n\t" +#else + "sll %[t0], %[t0], 16 \n\t" + "sll %[t6], %[t1], 16 \n\t" + "precrq.ph.w %[t0], %[t0], %[t6] \n\t" +#endif + "preceu.ph.qbra %[t3], %[t0] \n\t" + "preceu.ph.qbla %[t4], %[t0] \n\t" + "preceu.ph.qbla %[t0], %[t2] \n\t" + "subq.ph %[t1], %[sa], %[t0] \n\t" + "sra %[t2], %[t1], 8 \n\t" + "or %[t5], %[t2], %[t1] \n\t" + "replv.ph %[t2], %[t5] \n\t" + "lh %[t0], 0(%[dst]) \n\t" + "lh %[t1], 2(%[dst]) \n\t" + "and %[t1], %[t1], 0xffff \n\t" +#ifdef __MIPS_HAVE_DSPR2 + "append %[t0], %[t1], 16 \n\t" +#else + "sll %[t5], %[t0], 16 \n\t" + "or %[t0], %[t5], %[t1] \n\t" +#endif + "and %[t1], %[t0], 0x1f001f \n\t" + "shra.ph %[t6], %[t0], 11 \n\t" + "and %[t6], %[t6], 0x1f001f \n\t" + "and %[t7], %[t0], 0x7e007e0 \n\t" + "shra.ph %[t5], %[t7], 5 \n\t" + "muleu_s.ph.qbl %[t0], %[t2], %[t6] \n\t" + "addq.ph %[t7], %[t0], %[add_x10] \n\t" + "shra.ph %[t6], %[t7], 5 \n\t" + "addq.ph %[t6], %[t7], %[t6] \n\t" + "shra.ph %[t0], %[t6], 5 \n\t" + "addq.ph %[t7], %[t0], %[t3] \n\t" + "shra.ph %[t6], %[t7], 3 \n\t" + "muleu_s.ph.qbl %[t0], %[t2], %[t1] \n\t" + "addq.ph %[t7], %[t0], %[add_x10] \n\t" + "shra.ph %[t0], %[t7], 5 \n\t" + "addq.ph %[t7], %[t7], %[t0] \n\t" + "shra.ph %[t0], %[t7], 5 \n\t" + "addq.ph %[t7], %[t0], %[t8] \n\t" + "shra.ph %[t3], %[t7], 3 \n\t" + "muleu_s.ph.qbl %[t0], %[t2], %[t5] \n\t" + "addq.ph %[t7], %[t0], %[add_x20] \n\t" + "shra.ph %[t0], %[t7], 6 \n\t" + "addq.ph %[t8], %[t7], %[t0] \n\t" + "shra.ph %[t0], %[t8], 6 \n\t" + "addq.ph %[t7], %[t0], %[t4] \n\t" + "shra.ph %[t8], %[t7], 2 \n\t" + "shll.ph %[t0], %[t8], 5 \n\t" + "shll.ph %[t1], %[t6], 11 \n\t" + "or %[t2], %[t0], %[t1] \n\t" + "or %[t3], %[t2], %[t3] \n\t" + "sra %[t4], %[t3], 16 \n\t" + "sh %[t4], 0(%[dst]) \n\t" + "sh %[t3], 2(%[dst]) \n\t" + "addiu %[count], %[count], -2 \n\t" + "addiu %[src], %[src], 8 \n\t" + "b 2b \n\t" + " addiu %[dst], %[dst], 4 \n\t" + "1: \n\t" + ".set pop \n\t" + : [dst]"+r"(dst), [src]"+r"(src), [count]"+r"(count), + [t16]"=&r"(t16), [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), + [t3]"=&r"(t3), [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), + [t7]"=&r"(t7), [t8]"=&r"(t8) + : [add_x10]"r"(add_x10), [add_x20]"r"(add_x20), [sa]"r"(sa) + : "memory", "hi", "lo" + ); + + if (count == 1) { + SkPMColor c = *src++; + SkPMColorAssert(c); + if (c) { + *dst = SkSrcOver32To16(c, *dst); + } + dst += 1; + } +} + +static void S32A_D565_Blend_mips_dsp(uint16_t* SK_RESTRICT dst, + const SkPMColor* SK_RESTRICT src, int count, + U8CPU alpha, int /*x*/, int /*y*/) { + register uint32_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9; + register uint32_t s0, s1, s2, s3; + register unsigned dst_scale = 0; + + __asm__ volatile ( + ".set push \n\t" + ".set noreorder \n\t" + "replv.qb %[t0], %[alpha] \n\t" + "repl.ph %[t6], 0x80 \n\t" + "repl.ph %[t7], 0xFF \n\t" + "1: \n\t" + "addiu %[t8], %[count], -1 \n\t" + "blez %[t8], 2f \n\t" + " nop \n\t" + "lw %[t8], 0(%[src]) \n\t" + "lw %[t9], 4(%[src]) \n\t" + "lh %[t4], 0(%[dst]) \n\t" + "lh %[t5], 2(%[dst]) \n\t" + "sll %[t5], %[t5], 16 \n\t" + "sll %[t2], %[t8], 8 \n\t" + "sll %[t3], %[t9], 8 \n\t" + "precrq.qb.ph %[t1], %[t2], %[t3] \n\t" + "precrq.qb.ph %[t3], %[t8], %[t9] \n\t" + "preceu.ph.qbla %[t8], %[t3] \n\t" + "muleu_s.ph.qbr %[s3], %[t0], %[t8] \n\t" + "preceu.ph.qbla %[t2], %[t1] \n\t" + "preceu.ph.qbra %[t1], %[t1] \n\t" + "preceu.ph.qbra %[t3], %[t3] \n\t" + "packrl.ph %[t9], %[t4], %[t5] \n\t" + "shra.ph %[s0], %[t9], 11 \n\t" + "and %[s0], %[s0], 0x1F001F \n\t" + "shra.ph %[s1], %[t9], 5 \n\t" + "and %[s1], %[s1], 0x3F003F \n\t" + "and %[s2], %[t9], 0x1F001F \n\t" + "addq.ph %[s3], %[s3], %[t6] \n\t" + "shra.ph %[t5], %[s3], 8 \n\t" + "and %[t5], %[t5], 0xFF00FF \n\t" + "addq.ph %[dst_scale], %[s3], %[t5] \n\t" + "shra.ph %[dst_scale], %[dst_scale], 8 \n\t" + "subq_s.ph %[dst_scale], %[t7], %[dst_scale] \n\t" + "sll %[dst_scale], %[dst_scale], 8 \n\t" + "precrq.qb.ph %[dst_scale], %[dst_scale], %[dst_scale] \n\t" + "shrl.qb %[t1], %[t1], 3 \n\t" + "shrl.qb %[t2], %[t2], 3 \n\t" + "shrl.qb %[t3], %[t3], 2 \n\t" + "muleu_s.ph.qbl %[t1], %[t0], %[t1] \n\t" + "muleu_s.ph.qbl %[t2], %[t0], %[t2] \n\t" + "muleu_s.ph.qbl %[t3], %[t0], %[t3] \n\t" + "muleu_s.ph.qbl %[t8], %[dst_scale], %[s0] \n\t" + "muleu_s.ph.qbl %[t9], %[dst_scale], %[s2] \n\t" + "muleu_s.ph.qbl %[t4], %[dst_scale], %[s1] \n\t" + "addq.ph %[t1], %[t1], %[t8] \n\t" + "addq.ph %[t2], %[t2], %[t9] \n\t" + "addq.ph %[t3], %[t3], %[t4] \n\t" + "addq.ph %[t8], %[t1], %[t6] \n\t" + "addq.ph %[t9], %[t2], %[t6] \n\t" + "addq.ph %[t4], %[t3], %[t6] \n\t" + "shra.ph %[t1], %[t8], 8 \n\t" + "addq.ph %[t1], %[t1], %[t8] \n\t" + "preceu.ph.qbla %[t1], %[t1] \n\t" + "shra.ph %[t2], %[t9], 8 \n\t" + "addq.ph %[t2], %[t2], %[t9] \n\t" + "preceu.ph.qbla %[t2], %[t2] \n\t" + "shra.ph %[t3], %[t4], 8 \n\t" + "addq.ph %[t3], %[t3], %[t4] \n\t" + "preceu.ph.qbla %[t3], %[t3] \n\t" + "shll.ph %[t8], %[t1], 11 \n\t" + "shll.ph %[t9], %[t3], 5 \n\t" + "or %[t8], %[t8], %[t9] \n\t" + "or %[s0], %[t8], %[t2] \n\t" + "srl %[t8], %[s0], 16 \n\t" + "and %[t9], %[s0], 0xFFFF \n\t" + "sh %[t8], 0(%[dst]) \n\t" + "sh %[t9], 2(%[dst]) \n\t" + "addiu %[src], %[src], 8 \n\t" + "addiu %[count], %[count], -2 \n\t" + "b 1b \n\t" + " addiu %[dst], %[dst], 4 \n\t" + "2: \n\t" + ".set pop \n\t" + : [src]"+r"(src), [dst]"+r"(dst), [count]"+r"(count), + [dst_scale]"+r"(dst_scale), [s0]"=&r"(s0), [s1]"=&r"(s1), + [s2]"=&r"(s2), [s3]"=&r"(s3), [t0]"=&r"(t0), [t1]"=&r"(t1), + [t2]"=&r"(t2), [t3]"=&r"(t3), [t4]"=&r"(t4), [t5]"=&r"(t5), + [t6]"=&r"(t6), [t7]"=&r"(t7), [t8]"=&r"(t8), [t9]"=&r"(t9) + : [alpha]"r"(alpha) + : "memory", "hi", "lo" + ); + + if (count == 1) { + SkPMColor sc = *src++; + SkPMColorAssert(sc); + if (sc) { + uint16_t dc = *dst; + unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha); + unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + + SkMulS16(SkGetPackedR16(dc), dst_scale); + unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + + SkMulS16(SkGetPackedG16(dc), dst_scale); + unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + + SkMulS16(SkGetPackedB16(dc), dst_scale); + *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db)); + } + dst += 1; + } +} + +static void S32_Blend_BlitRow32_mips_dsp(SkPMColor* SK_RESTRICT dst, + const SkPMColor* SK_RESTRICT src, + int count, U8CPU alpha) { + register int32_t t0, t1, t2, t3, t4, t5, t6, t7; + + __asm__ volatile ( + ".set push \n\t" + ".set noreorder \n\t" + "li %[t2], 0x100 \n\t" + "addiu %[t0], %[alpha], 1 \n\t" + "subu %[t1], %[t2], %[t0] \n\t" + "replv.qb %[t7], %[t0] \n\t" + "replv.qb %[t6], %[t1] \n\t" + "1: \n\t" + "blez %[count], 2f \n\t" + "lw %[t0], 0(%[src]) \n\t" + "lw %[t1], 0(%[dst]) \n\t" + "preceu.ph.qbr %[t2], %[t0] \n\t" + "preceu.ph.qbl %[t3], %[t0] \n\t" + "preceu.ph.qbr %[t4], %[t1] \n\t" + "preceu.ph.qbl %[t5], %[t1] \n\t" + "muleu_s.ph.qbr %[t2], %[t7], %[t2] \n\t" + "muleu_s.ph.qbr %[t3], %[t7], %[t3] \n\t" + "muleu_s.ph.qbr %[t4], %[t6], %[t4] \n\t" + "muleu_s.ph.qbr %[t5], %[t6], %[t5] \n\t" + "addiu %[src], %[src], 4 \n\t" + "addiu %[count], %[count], -1 \n\t" + "precrq.qb.ph %[t0], %[t3], %[t2] \n\t" + "precrq.qb.ph %[t2], %[t5], %[t4] \n\t" + "addu %[t1], %[t0], %[t2] \n\t" + "sw %[t1], 0(%[dst]) \n\t" + "b 1b \n\t" + " addi %[dst], %[dst], 4 \n\t" + "2: \n\t" + ".set pop \n\t" + : [src]"+r"(src), [dst]"+r"(dst), [count]"+r"(count), + [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3), + [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7) + : [alpha]"r"(alpha) + : "memory", "hi", "lo" + ); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +const SkBlitRow::Proc platform_565_procs_mips_dsp[] = { + // no dither + NULL, + S32_D565_Blend_mips_dsp, + S32A_D565_Opaque_mips_dsp, + S32A_D565_Blend_mips_dsp, + + // dither + S32_D565_Opaque_Dither_mips_dsp, + S32_D565_Blend_Dither_mips_dsp, + S32A_D565_Opaque_Dither_mips_dsp, + NULL, +}; + +static const SkBlitRow::Proc32 platform_32_procs_mips_dsp[] = { + NULL, // S32_Opaque, + S32_Blend_BlitRow32_mips_dsp, // S32_Blend, + NULL, // S32A_Opaque, + NULL, // S32A_Blend, +}; + +SkBlitRow::Proc SkBlitRow::PlatformProcs565(unsigned flags) { + return platform_565_procs_mips_dsp[flags]; +} + +SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) { + return platform_32_procs_mips_dsp[flags]; +} + +SkBlitRow::ColorRectProc PlatformColorRectProcFactory() { + return NULL; +} + +SkBlitRow::ColorProc SkBlitRow::PlatformColorProc() { + return NULL; +} diff --git a/chromium/third_party/skia/src/opts/SkBlurImage_opts_SSE2.cpp b/chromium/third_party/skia/src/opts/SkBlurImage_opts_SSE2.cpp index 93830d78b46..bbc6a66462e 100644 --- a/chromium/third_party/skia/src/opts/SkBlurImage_opts_SSE2.cpp +++ b/chromium/third_party/skia/src/opts/SkBlurImage_opts_SSE2.cpp @@ -5,36 +5,31 @@ * found in the LICENSE file. */ - +#include <emmintrin.h> #include "SkBitmap.h" -#include "SkColorPriv.h" #include "SkBlurImage_opts_SSE2.h" +#include "SkColorPriv.h" #include "SkRect.h" -#include <emmintrin.h> - namespace { - enum BlurDirection { kX, kY }; -/** - * Helper function to spread the components of a 32-bit integer into the +/* Helper function to spread the components of a 32-bit integer into the * lower 8 bits of each 32-bit element of an SSE register. */ - inline __m128i expand(int a) { - const __m128i zero = _mm_setzero_si128(); + const __m128i zero = _mm_setzero_si128(); - // 0 0 0 0 0 0 0 0 0 0 0 0 A R G B - __m128i result = _mm_cvtsi32_si128(a); + // 0 0 0 0 0 0 0 0 0 0 0 0 A R G B + __m128i result = _mm_cvtsi32_si128(a); - // 0 0 0 0 0 0 0 0 0 A 0 R 0 G 0 B - result = _mm_unpacklo_epi8(result, zero); + // 0 0 0 0 0 0 0 0 0 A 0 R 0 G 0 B + result = _mm_unpacklo_epi8(result, zero); - // 0 0 0 A 0 0 0 R 0 0 0 G 0 0 0 B - return _mm_unpacklo_epi16(result, zero); + // 0 0 0 A 0 0 0 R 0 0 0 G 0 0 0 B + return _mm_unpacklo_epi16(result, zero); } template<BlurDirection srcDirection, BlurDirection dstDirection> diff --git a/chromium/third_party/skia/src/opts/SkBlurImage_opts_SSE2.h b/chromium/third_party/skia/src/opts/SkBlurImage_opts_SSE2.h index c8deea4bb9c..db104bacf4f 100644 --- a/chromium/third_party/skia/src/opts/SkBlurImage_opts_SSE2.h +++ b/chromium/third_party/skia/src/opts/SkBlurImage_opts_SSE2.h @@ -5,9 +5,14 @@ * found in the LICENSE file. */ +#ifndef SkBlurImage_opts_SSE2_DEFINED +#define SkBlurImage_opts_SSE2_DEFINED + #include "SkBlurImage_opts.h" bool SkBoxBlurGetPlatformProcs_SSE2(SkBoxBlurProc* boxBlurX, SkBoxBlurProc* boxBlurY, SkBoxBlurProc* boxBlurXY, SkBoxBlurProc* boxBlurYX); + +#endif diff --git a/chromium/third_party/skia/src/opts/SkBlurImage_opts_arm.cpp b/chromium/third_party/skia/src/opts/SkBlurImage_opts_arm.cpp new file mode 100644 index 00000000000..10d595afa59 --- /dev/null +++ b/chromium/third_party/skia/src/opts/SkBlurImage_opts_arm.cpp @@ -0,0 +1,25 @@ +/* + * Copyright 2014 ARM Ltd. + * + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ + +#include "SkBlurImage_opts_neon.h" +#include "SkUtilsArm.h" + +bool SkBoxBlurGetPlatformProcs(SkBoxBlurProc* boxBlurX, + SkBoxBlurProc* boxBlurY, + SkBoxBlurProc* boxBlurXY, + SkBoxBlurProc* boxBlurYX) { +#if SK_ARM_NEON_IS_NONE + return false; +#else +#if SK_ARM_NEON_IS_DYNAMIC + if (!sk_cpu_arm_has_neon()) { + return false; + } +#endif + return SkBoxBlurGetPlatformProcs_NEON(boxBlurX, boxBlurY, boxBlurXY, boxBlurYX); +#endif +} diff --git a/chromium/third_party/skia/src/opts/SkBlurImage_opts_neon.cpp b/chromium/third_party/skia/src/opts/SkBlurImage_opts_neon.cpp index 4e33d72d462..08187f3e55e 100644 --- a/chromium/third_party/skia/src/opts/SkBlurImage_opts_neon.cpp +++ b/chromium/third_party/skia/src/opts/SkBlurImage_opts_neon.cpp @@ -20,6 +20,86 @@ enum BlurDirection { }; /** + * Helper function to load 2 pixels from diffent rows to a 8x8 NEON register + * and also pre-load pixels for future read + */ +template<BlurDirection srcDirection> +inline uint8x8_t load_2_pixels(const SkPMColor* src, int srcStride) { + if (srcDirection == kX) { + uint32x2_t temp = vdup_n_u32(0); + // 10% faster by adding these 2 prefetches + SK_PREFETCH(src + 16); + SK_PREFETCH(src + srcStride + 16); + return vreinterpret_u8_u32(vld1_lane_u32(src + srcStride, vld1_lane_u32(src, temp, 0), 1)); + } else { + return vld1_u8((uint8_t*)src); + } +} + +/** + * Helper function to store the low 8-bits from a 16x8 NEON register to 2 rows + */ +template<BlurDirection dstDirection> +inline void store_2_pixels(uint16x8_t result16x8, SkPMColor* dst, int dstStride) { + if (dstDirection == kX) { + uint32x2_t temp = vreinterpret_u32_u8(vmovn_u16(result16x8)); + vst1_lane_u32(dst, temp, 0); + vst1_lane_u32(dst + dstStride, temp, 1); + } else { + uint8x8_t temp = vmovn_u16(result16x8); + vst1_u8((uint8_t*)dst, temp); + } +} + +/** + * fast path for kernel size less than 128 + */ +template<BlurDirection srcDirection, BlurDirection dstDirection> +void SkDoubleRowBoxBlur_NEON(const SkPMColor** src, int srcStride, SkPMColor** dst, int kernelSize, + int leftOffset, int rightOffset, int width, int* height) +{ + const int rightBorder = SkMin32(rightOffset + 1, width); + const int srcStrideX = srcDirection == kX ? 1 : srcStride; + const int dstStrideX = dstDirection == kX ? 1 : *height; + const int srcStrideY = srcDirection == kX ? srcStride : 1; + const int dstStrideY = dstDirection == kX ? width : 1; + const uint16x8_t scale = vdupq_n_u16((1 << 15) / kernelSize); + + for (; *height >= 2; *height -= 2) { + uint16x8_t sum = vdupq_n_u16(0); + const SkPMColor* p = *src; + for (int i = 0; i < rightBorder; i++) { + sum = vaddw_u8(sum, + load_2_pixels<srcDirection>(p, srcStride)); + p += srcStrideX; + } + + const SkPMColor* sptr = *src; + SkPMColor* dptr = *dst; + for (int x = 0; x < width; x++) { + // val = (sum * scale * 2 + 0x8000) >> 16 + uint16x8_t resultPixels = vreinterpretq_u16_s16(vqrdmulhq_s16( + vreinterpretq_s16_u16(sum), vreinterpretq_s16_u16(scale))); + store_2_pixels<dstDirection>(resultPixels, dptr, width); + + if (x >= leftOffset) { + sum = vsubw_u8(sum, + load_2_pixels<srcDirection>(sptr - leftOffset * srcStrideX, srcStride)); + } + if (x + rightOffset + 1 < width) { + sum = vaddw_u8(sum, + load_2_pixels<srcDirection>(sptr + (rightOffset + 1) * srcStrideX, srcStride)); + } + sptr += srcStrideX; + dptr += dstStrideX; + } + *src += srcStrideY * 2; + *dst += dstStrideY * 2; + } +} + + +/** * Helper function to spread the components of a 32-bit integer into the * lower 8 bits of each 16-bit element of a NEON register. */ @@ -42,7 +122,14 @@ void SkBoxBlur_NEON(const SkPMColor* src, int srcStride, SkPMColor* dst, int ker const int dstStrideY = dstDirection == kX ? width : 1; const uint32x4_t scale = vdupq_n_u32((1 << 24) / kernelSize); const uint32x4_t half = vdupq_n_u32(1 << 23); - for (int y = 0; y < height; ++y) { + + if (kernelSize < 128) + { + SkDoubleRowBoxBlur_NEON<srcDirection, dstDirection>(&src, srcStride, &dst, kernelSize, + leftOffset, rightOffset, width, &height); + } + + for (; height > 0; height--) { uint32x4_t sum = vdupq_n_u32(0); const SkPMColor* p = src; for (int i = 0; i < rightBorder; ++i) { @@ -77,8 +164,8 @@ void SkBoxBlur_NEON(const SkPMColor* src, int srcStride, SkPMColor* dst, int ker sum = vaddw_u16(sum, expand(*r)); } sptr += srcStrideX; - if (srcDirection == kY) { - SK_PREFETCH(sptr + (rightOffset + 1) * srcStrideX); + if (srcDirection == kX) { + SK_PREFETCH(sptr + (rightOffset + 16) * srcStrideX); } dptr += dstStrideX; } diff --git a/chromium/third_party/skia/src/opts/SkCachePreload_arm.h b/chromium/third_party/skia/src/opts/SkCachePreload_arm.h deleted file mode 100644 index cff8c2a9b79..00000000000 --- a/chromium/third_party/skia/src/opts/SkCachePreload_arm.h +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Copyright 2012 The Android Open Source Project - * - * Use of this source code is governed by a BSD-style license that can be - * found in the LICENSE file. - */ -#ifndef SkCachePreload_arm_DEFINED -#define SkCachePreload_arm_DEFINED - -// This file defines macros for preload instructions for ARM. These macros -// are designed to be embedded inside GNU inline assembly. -// For the use of these macros, __ARM_USE_PLD needs to be enabled. The cache -// line size also needs to be known (and needs to be contained inside -// __ARM_CACHE_LINE_SIZE). -#if defined(__ARM_USE_PLD) - -#define PLD(x, n) "pld [%["#x"], #("#n")]\n\t" - -#if __ARM_CACHE_LINE_SIZE == 32 - #define PLD64(x, n) PLD(x, n) PLD(x, (n) + 32) -#elif __ARM_CACHE_LINE_SIZE == 64 - #define PLD64(x, n) PLD(x, n) -#else - #error "unknown __ARM_CACHE_LINE_SIZE." -#endif -#else - // PLD is disabled, all macros become empty. - #define PLD(x, n) - #define PLD64(x, n) -#endif - -#define PLD128(x, n) PLD64(x, n) PLD64(x, (n) + 64) - -#endif // SkCachePreload_arm_DEFINED diff --git a/chromium/third_party/skia/src/opts/SkColor_opts_SSE2.h b/chromium/third_party/skia/src/opts/SkColor_opts_SSE2.h new file mode 100644 index 00000000000..7e61d526b3b --- /dev/null +++ b/chromium/third_party/skia/src/opts/SkColor_opts_SSE2.h @@ -0,0 +1,186 @@ +/* + * Copyright 2014 The Android Open Source Project + * + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ + +#ifndef SkColor_opts_SSE2_DEFINED +#define SkColor_opts_SSE2_DEFINED + +#include <emmintrin.h> + +// Because no _mm_mul_epi32() in SSE2, we emulate it here. +// Multiplies 4 32-bit integers from a by 4 32-bit intergers from b. +// The 4 multiplication results should be represented within 32-bit +// integers, otherwise they would be overflow. +static inline __m128i Multiply32_SSE2(const __m128i& a, const __m128i& b) { + // Calculate results of a0 * b0 and a2 * b2. + __m128i r1 = _mm_mul_epu32(a, b); + // Calculate results of a1 * b1 and a3 * b3. + __m128i r2 = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)); + // Shuffle results to [63..0] and interleave the results. + __m128i r = _mm_unpacklo_epi32(_mm_shuffle_epi32(r1, _MM_SHUFFLE(0,0,2,0)), + _mm_shuffle_epi32(r2, _MM_SHUFFLE(0,0,2,0))); + return r; +} + +static inline __m128i SkAlpha255To256_SSE2(const __m128i& alpha) { + return _mm_add_epi32(alpha, _mm_set1_epi32(1)); +} + +// See #define SkAlphaMulAlpha(a, b) SkMulDiv255Round(a, b) in SkXfermode.cpp. +static inline __m128i SkAlphaMulAlpha_SSE2(const __m128i& a, + const __m128i& b) { + __m128i prod = _mm_mullo_epi16(a, b); + prod = _mm_add_epi32(prod, _mm_set1_epi32(128)); + prod = _mm_add_epi32(prod, _mm_srli_epi32(prod, 8)); + prod = _mm_srli_epi32(prod, 8); + + return prod; +} + +// Portable version SkAlphaMulQ is in SkColorPriv.h. +static inline __m128i SkAlphaMulQ_SSE2(const __m128i& c, const __m128i& scale) { + __m128i mask = _mm_set1_epi32(0xFF00FF); + __m128i s = _mm_or_si128(_mm_slli_epi32(scale, 16), scale); + + // uint32_t rb = ((c & mask) * scale) >> 8 + __m128i rb = _mm_and_si128(mask, c); + rb = _mm_mullo_epi16(rb, s); + rb = _mm_srli_epi16(rb, 8); + + // uint32_t ag = ((c >> 8) & mask) * scale + __m128i ag = _mm_srli_epi16(c, 8); + ag = _mm_and_si128(ag, mask); + ag = _mm_mullo_epi16(ag, s); + + // (rb & mask) | (ag & ~mask) + rb = _mm_and_si128(mask, rb); + ag = _mm_andnot_si128(mask, ag); + return _mm_or_si128(rb, ag); +} + +static inline __m128i SkGetPackedA32_SSE2(const __m128i& src) { + __m128i a = _mm_slli_epi32(src, (24 - SK_A32_SHIFT)); + return _mm_srli_epi32(a, 24); +} + +static inline __m128i SkGetPackedR32_SSE2(const __m128i& src) { + __m128i r = _mm_slli_epi32(src, (24 - SK_R32_SHIFT)); + return _mm_srli_epi32(r, 24); +} + +static inline __m128i SkGetPackedG32_SSE2(const __m128i& src) { + __m128i g = _mm_slli_epi32(src, (24 - SK_G32_SHIFT)); + return _mm_srli_epi32(g, 24); +} + +static inline __m128i SkGetPackedB32_SSE2(const __m128i& src) { + __m128i b = _mm_slli_epi32(src, (24 - SK_B32_SHIFT)); + return _mm_srli_epi32(b, 24); +} + +static inline __m128i SkMul16ShiftRound_SSE2(const __m128i& a, + const __m128i& b, int shift) { + __m128i prod = _mm_mullo_epi16(a, b); + prod = _mm_add_epi16(prod, _mm_set1_epi16(1 << (shift - 1))); + prod = _mm_add_epi16(prod, _mm_srli_epi16(prod, shift)); + prod = _mm_srli_epi16(prod, shift); + + return prod; +} + +static inline __m128i SkPackRGB16_SSE2(const __m128i& r, + const __m128i& g, const __m128i& b) { + __m128i dr = _mm_slli_epi16(r, SK_R16_SHIFT); + __m128i dg = _mm_slli_epi16(g, SK_G16_SHIFT); + __m128i db = _mm_slli_epi16(b, SK_B16_SHIFT); + + __m128i c = _mm_or_si128(dr, dg); + return _mm_or_si128(c, db); +} + +static inline __m128i SkPackARGB32_SSE2(const __m128i& a, const __m128i& r, + const __m128i& g, const __m128i& b) { + __m128i da = _mm_slli_epi32(a, SK_A32_SHIFT); + __m128i dr = _mm_slli_epi32(r, SK_R32_SHIFT); + __m128i dg = _mm_slli_epi32(g, SK_G32_SHIFT); + __m128i db = _mm_slli_epi32(b, SK_B32_SHIFT); + + __m128i c = _mm_or_si128(da, dr); + c = _mm_or_si128(c, dg); + return _mm_or_si128(c, db); +} + +static inline __m128i SkPacked16ToR32_SSE2(const __m128i& src) { + __m128i r = _mm_srli_epi32(src, SK_R16_SHIFT); + r = _mm_and_si128(r, _mm_set1_epi32(SK_R16_MASK)); + r = _mm_or_si128(_mm_slli_epi32(r, (8 - SK_R16_BITS)), + _mm_srli_epi32(r, (2 * SK_R16_BITS - 8))); + + return r; +} + +static inline __m128i SkPacked16ToG32_SSE2(const __m128i& src) { + __m128i g = _mm_srli_epi32(src, SK_G16_SHIFT); + g = _mm_and_si128(g, _mm_set1_epi32(SK_G16_MASK)); + g = _mm_or_si128(_mm_slli_epi32(g, (8 - SK_G16_BITS)), + _mm_srli_epi32(g, (2 * SK_G16_BITS - 8))); + + return g; +} + +static inline __m128i SkPacked16ToB32_SSE2(const __m128i& src) { + __m128i b = _mm_srli_epi32(src, SK_B16_SHIFT); + b = _mm_and_si128(b, _mm_set1_epi32(SK_B16_MASK)); + b = _mm_or_si128(_mm_slli_epi32(b, (8 - SK_B16_BITS)), + _mm_srli_epi32(b, (2 * SK_B16_BITS - 8))); + + return b; +} + +static inline __m128i SkPixel16ToPixel32_SSE2(const __m128i& src) { + __m128i r = SkPacked16ToR32_SSE2(src); + __m128i g = SkPacked16ToG32_SSE2(src); + __m128i b = SkPacked16ToB32_SSE2(src); + + return SkPackARGB32_SSE2(_mm_set1_epi32(0xFF), r, g, b); +} + +static inline __m128i SkPixel32ToPixel16_ToU16_SSE2(const __m128i& src_pixel1, + const __m128i& src_pixel2) { + // Calculate result r. + __m128i r1 = _mm_srli_epi32(src_pixel1, + SK_R32_SHIFT + (8 - SK_R16_BITS)); + r1 = _mm_and_si128(r1, _mm_set1_epi32(SK_R16_MASK)); + __m128i r2 = _mm_srli_epi32(src_pixel2, + SK_R32_SHIFT + (8 - SK_R16_BITS)); + r2 = _mm_and_si128(r2, _mm_set1_epi32(SK_R16_MASK)); + __m128i r = _mm_packs_epi32(r1, r2); + + // Calculate result g. + __m128i g1 = _mm_srli_epi32(src_pixel1, + SK_G32_SHIFT + (8 - SK_G16_BITS)); + g1 = _mm_and_si128(g1, _mm_set1_epi32(SK_G16_MASK)); + __m128i g2 = _mm_srli_epi32(src_pixel2, + SK_G32_SHIFT + (8 - SK_G16_BITS)); + g2 = _mm_and_si128(g2, _mm_set1_epi32(SK_G16_MASK)); + __m128i g = _mm_packs_epi32(g1, g2); + + // Calculate result b. + __m128i b1 = _mm_srli_epi32(src_pixel1, + SK_B32_SHIFT + (8 - SK_B16_BITS)); + b1 = _mm_and_si128(b1, _mm_set1_epi32(SK_B16_MASK)); + __m128i b2 = _mm_srli_epi32(src_pixel2, + SK_B32_SHIFT + (8 - SK_B16_BITS)); + b2 = _mm_and_si128(b2, _mm_set1_epi32(SK_B16_MASK)); + __m128i b = _mm_packs_epi32(b1, b2); + + // Store 8 16-bit colors in dst. + __m128i d_pixel = SkPackRGB16_SSE2(r, g, b); + + return d_pixel; +} + +#endif // SkColor_opts_SSE2_DEFINED diff --git a/chromium/third_party/skia/src/opts/SkMath_opts_SSE2.h b/chromium/third_party/skia/src/opts/SkMath_opts_SSE2.h new file mode 100644 index 00000000000..2cc21afa0df --- /dev/null +++ b/chromium/third_party/skia/src/opts/SkMath_opts_SSE2.h @@ -0,0 +1,51 @@ +/* + * Copyright 2014 The Android Open Source Project + * + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ + +#ifndef SkMath_opts_SSE2_DEFINED +#define SkMath_opts_SSE2_DEFINED + +#include <emmintrin.h> + +// Because no _mm_div_epi32() in SSE2, we use float division to emulate. +// When using this function, make sure a and b don't exceed float's precision. +static inline __m128i shim_mm_div_epi32(const __m128i& a, const __m128i& b) { + __m128 x = _mm_cvtepi32_ps(a); + __m128 y = _mm_cvtepi32_ps(b); + return _mm_cvttps_epi32(_mm_div_ps(x, y)); +} + +// Portable version of SkSqrtBits is in SkMath.cpp. +static inline __m128i SkSqrtBits_SSE2(const __m128i& x, int count) { + __m128i root = _mm_setzero_si128(); + __m128i remHi = _mm_setzero_si128(); + __m128i remLo = x; + __m128i one128 = _mm_set1_epi32(1); + + do { + root = _mm_slli_epi32(root, 1); + + remHi = _mm_or_si128(_mm_slli_epi32(remHi, 2), + _mm_srli_epi32(remLo, 30)); + remLo = _mm_slli_epi32(remLo, 2); + + __m128i testDiv = _mm_slli_epi32(root, 1); + testDiv = _mm_add_epi32(testDiv, _mm_set1_epi32(1)); + + __m128i cmp = _mm_cmplt_epi32(remHi, testDiv); + __m128i remHi1 = _mm_and_si128(cmp, remHi); + __m128i root1 = _mm_and_si128(cmp, root); + __m128i remHi2 = _mm_andnot_si128(cmp, _mm_sub_epi32(remHi, testDiv)); + __m128i root2 = _mm_andnot_si128(cmp, _mm_add_epi32(root, one128)); + + remHi = _mm_or_si128(remHi1, remHi2); + root = _mm_or_si128(root1, root2); + } while (--count >= 0); + + return root; +} + +#endif // SkMath_opts_SSE2_DEFINED diff --git a/chromium/third_party/skia/src/opts/SkMorphology_opts.h b/chromium/third_party/skia/src/opts/SkMorphology_opts.h index e3ad853cf64..7ea7c546231 100644 --- a/chromium/third_party/skia/src/opts/SkMorphology_opts.h +++ b/chromium/third_party/skia/src/opts/SkMorphology_opts.h @@ -5,17 +5,10 @@ * found in the LICENSE file. */ -#include <SkColor.h> +#ifndef SkMorphology_opts_DEFINED +#define SkMorphology_opts_DEFINED -/** - * All morphology procs have the same signature: src is the source buffer, dst the - * destination buffer, radius is the morphology radius, width and height are the bounds - * of the destination buffer (in pixels), and srcStride and dstStride are the - * number of pixels per row in each buffer. All buffers are 8888. - */ - -typedef void (*SkMorphologyProc)(const SkPMColor* src, SkPMColor* dst, int radius, - int width, int height, int srcStride, int dstStride); +#include <SkMorphologyImageFilter.h> enum SkMorphologyProcType { kDilateX_SkMorphologyProcType, @@ -24,4 +17,6 @@ enum SkMorphologyProcType { kErodeY_SkMorphologyProcType }; -SkMorphologyProc SkMorphologyGetPlatformProc(SkMorphologyProcType type); +SkMorphologyImageFilter::Proc SkMorphologyGetPlatformProc(SkMorphologyProcType type); + +#endif diff --git a/chromium/third_party/skia/src/opts/SkMorphology_opts_SSE2.cpp b/chromium/third_party/skia/src/opts/SkMorphology_opts_SSE2.cpp index b58fced2c12..e782950956a 100644 --- a/chromium/third_party/skia/src/opts/SkMorphology_opts_SSE2.cpp +++ b/chromium/third_party/skia/src/opts/SkMorphology_opts_SSE2.cpp @@ -5,12 +5,10 @@ * found in the LICENSE file. */ - +#include <emmintrin.h> #include "SkColorPriv.h" #include "SkMorphology_opts_SSE2.h" -#include <emmintrin.h> - /* SSE2 version of dilateX, dilateY, erodeX, erodeY. * portable versions are in src/effects/SkMorphologyImageFilter.cpp. */ @@ -48,8 +46,12 @@ static void SkMorph_SSE2(const SkPMColor* src, SkPMColor* dst, int radius, lp += srcStrideY; up += srcStrideY; } - if (x >= radius) src += srcStrideX; - if (x + radius < width - 1) upperSrc += srcStrideX; + if (x >= radius) { + src += srcStrideX; + } + if (x + radius < width - 1) { + upperSrc += srcStrideX; + } dst += dstStrideX; } } diff --git a/chromium/third_party/skia/src/opts/SkMorphology_opts_SSE2.h b/chromium/third_party/skia/src/opts/SkMorphology_opts_SSE2.h index bd103e6eba9..bf5aa03b092 100644 --- a/chromium/third_party/skia/src/opts/SkMorphology_opts_SSE2.h +++ b/chromium/third_party/skia/src/opts/SkMorphology_opts_SSE2.h @@ -5,6 +5,11 @@ * found in the LICENSE file. */ +#ifndef SkMorphology_opts_SSE2_DEFINED +#define SkMorphology_opts_SSE2_DEFINED + +#include "SkColor.h" + void SkDilateX_SSE2(const SkPMColor* src, SkPMColor* dst, int radius, int width, int height, int srcStride, int dstStride); void SkDilateY_SSE2(const SkPMColor* src, SkPMColor* dst, int radius, @@ -13,3 +18,5 @@ void SkErodeX_SSE2(const SkPMColor* src, SkPMColor* dst, int radius, int width, int height, int srcStride, int dstStride); void SkErodeY_SSE2(const SkPMColor* src, SkPMColor* dst, int radius, int width, int height, int srcStride, int dstStride); + +#endif diff --git a/chromium/third_party/skia/src/opts/SkMorphology_opts_arm.cpp b/chromium/third_party/skia/src/opts/SkMorphology_opts_arm.cpp new file mode 100644 index 00000000000..2bba4929c22 --- /dev/null +++ b/chromium/third_party/skia/src/opts/SkMorphology_opts_arm.cpp @@ -0,0 +1,34 @@ +/* + * Copyright 2014 ARM Ltd. + * + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ + +#include "SkMorphology_opts.h" +#include "SkMorphology_opts_neon.h" +#include "SkUtilsArm.h" + +SkMorphologyImageFilter::Proc SkMorphologyGetPlatformProc(SkMorphologyProcType type) { +#if SK_ARM_NEON_IS_NONE + return NULL; +#else +#if SK_ARM_NEON_IS_DYNAMIC + if (!sk_cpu_arm_has_neon()) { + return NULL; + } +#endif + switch (type) { + case kDilateX_SkMorphologyProcType: + return SkDilateX_neon; + case kDilateY_SkMorphologyProcType: + return SkDilateY_neon; + case kErodeX_SkMorphologyProcType: + return SkErodeX_neon; + case kErodeY_SkMorphologyProcType: + return SkErodeY_neon; + default: + return NULL; + } +#endif +} diff --git a/chromium/third_party/skia/src/opts/SkMorphology_opts_none.cpp b/chromium/third_party/skia/src/opts/SkMorphology_opts_none.cpp index 66d58ba571f..ade261fc7d2 100644 --- a/chromium/third_party/skia/src/opts/SkMorphology_opts_none.cpp +++ b/chromium/third_party/skia/src/opts/SkMorphology_opts_none.cpp @@ -7,6 +7,6 @@ #include "SkMorphology_opts.h" -SkMorphologyProc SkMorphologyGetPlatformProc(SkMorphologyProcType) { +SkMorphologyImageFilter::Proc SkMorphologyGetPlatformProc(SkMorphologyProcType) { return NULL; } diff --git a/chromium/third_party/skia/src/opts/SkUtils_opts_SSE2.cpp b/chromium/third_party/skia/src/opts/SkUtils_opts_SSE2.cpp index e22044d39d3..bd2f9b29a44 100644 --- a/chromium/third_party/skia/src/opts/SkUtils_opts_SSE2.cpp +++ b/chromium/third_party/skia/src/opts/SkUtils_opts_SSE2.cpp @@ -1,4 +1,3 @@ - /* * Copyright 2009 The Android Open Source Project * @@ -6,7 +5,6 @@ * found in the LICENSE file. */ - #include <emmintrin.h> #include "SkUtils_opts_SSE2.h" @@ -69,3 +67,33 @@ void sk_memset32_SSE2(uint32_t *dst, uint32_t value, int count) --count; } } + +void sk_memcpy32_SSE2(uint32_t *dst, const uint32_t *src, int count) +{ + if (count >= 16) { + while (((size_t)dst) & 0x0F) { + *dst++ = *src++; + --count; + } + __m128i *dst128 = reinterpret_cast<__m128i*>(dst); + const __m128i *src128 = reinterpret_cast<const __m128i*>(src); + while (count >= 16) { + __m128i a = _mm_loadu_si128(src128++); + __m128i b = _mm_loadu_si128(src128++); + __m128i c = _mm_loadu_si128(src128++); + __m128i d = _mm_loadu_si128(src128++); + + _mm_store_si128(dst128++, a); + _mm_store_si128(dst128++, b); + _mm_store_si128(dst128++, c); + _mm_store_si128(dst128++, d); + count -= 16; + } + dst = reinterpret_cast<uint32_t*>(dst128); + src = reinterpret_cast<const uint32_t*>(src128); + } + while (count > 0) { + *dst++ = *src++; + --count; + } +} diff --git a/chromium/third_party/skia/src/opts/SkUtils_opts_SSE2.h b/chromium/third_party/skia/src/opts/SkUtils_opts_SSE2.h index ed24c1ffa40..009f01894b4 100644 --- a/chromium/third_party/skia/src/opts/SkUtils_opts_SSE2.h +++ b/chromium/third_party/skia/src/opts/SkUtils_opts_SSE2.h @@ -1,4 +1,3 @@ - /* * Copyright 2009 The Android Open Source Project * @@ -6,8 +5,13 @@ * found in the LICENSE file. */ +#ifndef SkUtils_opts_SSE2_DEFINED +#define SkUtils_opts_SSE2_DEFINED #include "SkTypes.h" void sk_memset16_SSE2(uint16_t *dst, uint16_t value, int count); void sk_memset32_SSE2(uint32_t *dst, uint32_t value, int count); +void sk_memcpy32_SSE2(uint32_t *dst, const uint32_t *src, int count); + +#endif diff --git a/chromium/third_party/skia/src/opts/SkUtils_opts_arm.cpp b/chromium/third_party/skia/src/opts/SkUtils_opts_arm.cpp new file mode 100644 index 00000000000..b1c9d0aa93e --- /dev/null +++ b/chromium/third_party/skia/src/opts/SkUtils_opts_arm.cpp @@ -0,0 +1,57 @@ +/* + * Copyright 2014 ARM Ltd. + * + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ + +#include "SkUtils.h" +#include "SkUtilsArm.h" + +#if defined(SK_CPU_LENDIAN) && !SK_ARM_NEON_IS_NONE +extern "C" void memset16_neon(uint16_t dst[], uint16_t value, int count); +extern "C" void memset32_neon(uint32_t dst[], uint32_t value, int count); +#endif + +#if defined(SK_CPU_LENDIAN) +extern "C" void arm_memset16(uint16_t* dst, uint16_t value, int count); +extern "C" void arm_memset32(uint32_t* dst, uint32_t value, int count); +#endif + +SkMemset16Proc SkMemset16GetPlatformProc() { + // FIXME: memset.arm.S is using syntax incompatible with XCode +#if !defined(SK_CPU_LENDIAN) || defined(SK_BUILD_FOR_IOS) + return NULL; +#elif SK_ARM_NEON_IS_DYNAMIC + if (sk_cpu_arm_has_neon()) { + return memset16_neon; + } else { + return arm_memset16; + } +#elif SK_ARM_NEON_IS_ALWAYS + return memset16_neon; +#else + return arm_memset16; +#endif +} + +SkMemset32Proc SkMemset32GetPlatformProc() { + // FIXME: memset.arm.S is using syntax incompatible with XCode +#if !defined(SK_CPU_LENDIAN) || defined(SK_BUILD_FOR_IOS) + return NULL; +#elif SK_ARM_NEON_IS_DYNAMIC + if (sk_cpu_arm_has_neon()) { + return memset32_neon; + } else { + return arm_memset32; + } +#elif SK_ARM_NEON_IS_ALWAYS + return memset32_neon; +#else + return arm_memset32; +#endif +} + +SkMemcpy32Proc SkMemcpy32GetPlatformProc() { + return NULL; +} diff --git a/chromium/third_party/skia/src/opts/SkUtils_opts_none.cpp b/chromium/third_party/skia/src/opts/SkUtils_opts_none.cpp index 286f10d7e53..18f52496db4 100644 --- a/chromium/third_party/skia/src/opts/SkUtils_opts_none.cpp +++ b/chromium/third_party/skia/src/opts/SkUtils_opts_none.cpp @@ -16,3 +16,7 @@ SkMemset16Proc SkMemset16GetPlatformProc() { SkMemset32Proc SkMemset32GetPlatformProc() { return NULL; } + +SkMemcpy32Proc SkMemcpy32GetPlatformProc() { + return NULL; +} diff --git a/chromium/third_party/skia/src/opts/SkXfermode_opts_SSE2.cpp b/chromium/third_party/skia/src/opts/SkXfermode_opts_SSE2.cpp new file mode 100644 index 00000000000..94f9a4aea3b --- /dev/null +++ b/chromium/third_party/skia/src/opts/SkXfermode_opts_SSE2.cpp @@ -0,0 +1,819 @@ +/* + * Copyright 2014 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ + +#include "SkColorPriv.h" +#include "SkColor_opts_SSE2.h" +#include "SkMathPriv.h" +#include "SkMath_opts_SSE2.h" +#include "SkXfermode.h" +#include "SkXfermode_opts_SSE2.h" +#include "SkXfermode_proccoeff.h" + +//////////////////////////////////////////////////////////////////////////////// +// 4 pixels SSE2 version functions +//////////////////////////////////////////////////////////////////////////////// + +static inline __m128i SkDiv255Round_SSE2(const __m128i& a) { + __m128i prod = _mm_add_epi32(a, _mm_set1_epi32(128)); // prod += 128; + prod = _mm_add_epi32(prod, _mm_srli_epi32(prod, 8)); // prod + (prod >> 8) + prod = _mm_srli_epi32(prod, 8); // >> 8 + + return prod; +} + +static inline __m128i saturated_add_SSE2(const __m128i& a, const __m128i& b) { + __m128i sum = _mm_add_epi32(a, b); + __m128i cmp = _mm_cmpgt_epi32(sum, _mm_set1_epi32(255)); + + sum = _mm_or_si128(_mm_and_si128(cmp, _mm_set1_epi32(255)), + _mm_andnot_si128(cmp, sum)); + return sum; +} + +static inline __m128i clamp_signed_byte_SSE2(const __m128i& n) { + __m128i cmp1 = _mm_cmplt_epi32(n, _mm_setzero_si128()); + __m128i cmp2 = _mm_cmpgt_epi32(n, _mm_set1_epi32(255)); + __m128i ret = _mm_and_si128(cmp2, _mm_set1_epi32(255)); + + __m128i cmp = _mm_or_si128(cmp1, cmp2); + ret = _mm_or_si128(_mm_and_si128(cmp, ret), _mm_andnot_si128(cmp, n)); + + return ret; +} + +static inline __m128i clamp_div255round_SSE2(const __m128i& prod) { + // test if > 0 + __m128i cmp1 = _mm_cmpgt_epi32(prod, _mm_setzero_si128()); + // test if < 255*255 + __m128i cmp2 = _mm_cmplt_epi32(prod, _mm_set1_epi32(255*255)); + + __m128i ret = _mm_setzero_si128(); + + // if value >= 255*255, value = 255 + ret = _mm_andnot_si128(cmp2, _mm_set1_epi32(255)); + + __m128i div = SkDiv255Round_SSE2(prod); + + // test if > 0 && < 255*255 + __m128i cmp = _mm_and_si128(cmp1, cmp2); + + ret = _mm_or_si128(_mm_and_si128(cmp, div), _mm_andnot_si128(cmp, ret)); + + return ret; +} + +static __m128i srcover_modeproc_SSE2(const __m128i& src, const __m128i& dst) { + __m128i isa = _mm_sub_epi32(_mm_set1_epi32(256), SkGetPackedA32_SSE2(src)); + return _mm_add_epi32(src, SkAlphaMulQ_SSE2(dst, isa)); +} + +static __m128i dstover_modeproc_SSE2(const __m128i& src, const __m128i& dst) { + __m128i ida = _mm_sub_epi32(_mm_set1_epi32(256), SkGetPackedA32_SSE2(dst)); + return _mm_add_epi32(dst, SkAlphaMulQ_SSE2(src, ida)); +} + +static __m128i srcin_modeproc_SSE2(const __m128i& src, const __m128i& dst) { + __m128i da = SkGetPackedA32_SSE2(dst); + return SkAlphaMulQ_SSE2(src, SkAlpha255To256_SSE2(da)); +} + +static __m128i dstin_modeproc_SSE2(const __m128i& src, const __m128i& dst) { + __m128i sa = SkGetPackedA32_SSE2(src); + return SkAlphaMulQ_SSE2(dst, SkAlpha255To256_SSE2(sa)); +} + +static __m128i srcout_modeproc_SSE2(const __m128i& src, const __m128i& dst) { + __m128i ida = _mm_sub_epi32(_mm_set1_epi32(256), SkGetPackedA32_SSE2(dst)); + return SkAlphaMulQ_SSE2(src, ida); +} + +static __m128i dstout_modeproc_SSE2(const __m128i& src, const __m128i& dst) { + __m128i isa = _mm_sub_epi32(_mm_set1_epi32(256), SkGetPackedA32_SSE2(src)); + return SkAlphaMulQ_SSE2(dst, isa); +} + +static __m128i srcatop_modeproc_SSE2(const __m128i& src, const __m128i& dst) { + __m128i sa = SkGetPackedA32_SSE2(src); + __m128i da = SkGetPackedA32_SSE2(dst); + __m128i isa = _mm_sub_epi32(_mm_set1_epi32(255), sa); + + __m128i a = da; + + __m128i r1 = SkAlphaMulAlpha_SSE2(da, SkGetPackedR32_SSE2(src)); + __m128i r2 = SkAlphaMulAlpha_SSE2(isa, SkGetPackedR32_SSE2(dst)); + __m128i r = _mm_add_epi32(r1, r2); + + __m128i g1 = SkAlphaMulAlpha_SSE2(da, SkGetPackedG32_SSE2(src)); + __m128i g2 = SkAlphaMulAlpha_SSE2(isa, SkGetPackedG32_SSE2(dst)); + __m128i g = _mm_add_epi32(g1, g2); + + __m128i b1 = SkAlphaMulAlpha_SSE2(da, SkGetPackedB32_SSE2(src)); + __m128i b2 = SkAlphaMulAlpha_SSE2(isa, SkGetPackedB32_SSE2(dst)); + __m128i b = _mm_add_epi32(b1, b2); + + return SkPackARGB32_SSE2(a, r, g, b); +} + +static __m128i dstatop_modeproc_SSE2(const __m128i& src, const __m128i& dst) { + __m128i sa = SkGetPackedA32_SSE2(src); + __m128i da = SkGetPackedA32_SSE2(dst); + __m128i ida = _mm_sub_epi32(_mm_set1_epi32(255), da); + + __m128i a = sa; + + __m128i r1 = SkAlphaMulAlpha_SSE2(ida, SkGetPackedR32_SSE2(src)); + __m128i r2 = SkAlphaMulAlpha_SSE2(sa, SkGetPackedR32_SSE2(dst)); + __m128i r = _mm_add_epi32(r1, r2); + + __m128i g1 = SkAlphaMulAlpha_SSE2(ida, SkGetPackedG32_SSE2(src)); + __m128i g2 = SkAlphaMulAlpha_SSE2(sa, SkGetPackedG32_SSE2(dst)); + __m128i g = _mm_add_epi32(g1, g2); + + __m128i b1 = SkAlphaMulAlpha_SSE2(ida, SkGetPackedB32_SSE2(src)); + __m128i b2 = SkAlphaMulAlpha_SSE2(sa, SkGetPackedB32_SSE2(dst)); + __m128i b = _mm_add_epi32(b1, b2); + + return SkPackARGB32_SSE2(a, r, g, b); +} + +static __m128i xor_modeproc_SSE2(const __m128i& src, const __m128i& dst) { + __m128i sa = SkGetPackedA32_SSE2(src); + __m128i da = SkGetPackedA32_SSE2(dst); + __m128i isa = _mm_sub_epi32(_mm_set1_epi32(255), sa); + __m128i ida = _mm_sub_epi32(_mm_set1_epi32(255), da); + + __m128i a1 = _mm_add_epi32(sa, da); + __m128i a2 = SkAlphaMulAlpha_SSE2(sa, da); + a2 = _mm_slli_epi32(a2, 1); + __m128i a = _mm_sub_epi32(a1, a2); + + __m128i r1 = SkAlphaMulAlpha_SSE2(ida, SkGetPackedR32_SSE2(src)); + __m128i r2 = SkAlphaMulAlpha_SSE2(isa, SkGetPackedR32_SSE2(dst)); + __m128i r = _mm_add_epi32(r1, r2); + + __m128i g1 = SkAlphaMulAlpha_SSE2(ida, SkGetPackedG32_SSE2(src)); + __m128i g2 = SkAlphaMulAlpha_SSE2(isa, SkGetPackedG32_SSE2(dst)); + __m128i g = _mm_add_epi32(g1, g2); + + __m128i b1 = SkAlphaMulAlpha_SSE2(ida, SkGetPackedB32_SSE2(src)); + __m128i b2 = SkAlphaMulAlpha_SSE2(isa, SkGetPackedB32_SSE2(dst)); + __m128i b = _mm_add_epi32(b1, b2); + + return SkPackARGB32_SSE2(a, r, g, b); +} + +static __m128i plus_modeproc_SSE2(const __m128i& src, const __m128i& dst) { + __m128i b = saturated_add_SSE2(SkGetPackedB32_SSE2(src), + SkGetPackedB32_SSE2(dst)); + __m128i g = saturated_add_SSE2(SkGetPackedG32_SSE2(src), + SkGetPackedG32_SSE2(dst)); + __m128i r = saturated_add_SSE2(SkGetPackedR32_SSE2(src), + SkGetPackedR32_SSE2(dst)); + __m128i a = saturated_add_SSE2(SkGetPackedA32_SSE2(src), + SkGetPackedA32_SSE2(dst)); + return SkPackARGB32_SSE2(a, r, g, b); +} + +static __m128i modulate_modeproc_SSE2(const __m128i& src, const __m128i& dst) { + __m128i a = SkAlphaMulAlpha_SSE2(SkGetPackedA32_SSE2(src), + SkGetPackedA32_SSE2(dst)); + __m128i r = SkAlphaMulAlpha_SSE2(SkGetPackedR32_SSE2(src), + SkGetPackedR32_SSE2(dst)); + __m128i g = SkAlphaMulAlpha_SSE2(SkGetPackedG32_SSE2(src), + SkGetPackedG32_SSE2(dst)); + __m128i b = SkAlphaMulAlpha_SSE2(SkGetPackedB32_SSE2(src), + SkGetPackedB32_SSE2(dst)); + return SkPackARGB32_SSE2(a, r, g, b); +} + +static inline __m128i SkMin32_SSE2(const __m128i& a, const __m128i& b) { + __m128i cmp = _mm_cmplt_epi32(a, b); + return _mm_or_si128(_mm_and_si128(cmp, a), _mm_andnot_si128(cmp, b)); +} + +static inline __m128i srcover_byte_SSE2(const __m128i& a, const __m128i& b) { + // a + b - SkAlphaMulAlpha(a, b); + return _mm_sub_epi32(_mm_add_epi32(a, b), SkAlphaMulAlpha_SSE2(a, b)); + +} + +static inline __m128i blendfunc_multiply_byte_SSE2(const __m128i& sc, const __m128i& dc, + const __m128i& sa, const __m128i& da) { + // sc * (255 - da) + __m128i ret1 = _mm_sub_epi32(_mm_set1_epi32(255), da); + ret1 = _mm_mullo_epi16(sc, ret1); + + // dc * (255 - sa) + __m128i ret2 = _mm_sub_epi32(_mm_set1_epi32(255), sa); + ret2 = _mm_mullo_epi16(dc, ret2); + + // sc * dc + __m128i ret3 = _mm_mullo_epi16(sc, dc); + + __m128i ret = _mm_add_epi32(ret1, ret2); + ret = _mm_add_epi32(ret, ret3); + + return clamp_div255round_SSE2(ret); +} + +static __m128i multiply_modeproc_SSE2(const __m128i& src, const __m128i& dst) { + __m128i sa = SkGetPackedA32_SSE2(src); + __m128i da = SkGetPackedA32_SSE2(dst); + __m128i a = srcover_byte_SSE2(sa, da); + + __m128i sr = SkGetPackedR32_SSE2(src); + __m128i dr = SkGetPackedR32_SSE2(dst); + __m128i r = blendfunc_multiply_byte_SSE2(sr, dr, sa, da); + + __m128i sg = SkGetPackedG32_SSE2(src); + __m128i dg = SkGetPackedG32_SSE2(dst); + __m128i g = blendfunc_multiply_byte_SSE2(sg, dg, sa, da); + + + __m128i sb = SkGetPackedB32_SSE2(src); + __m128i db = SkGetPackedB32_SSE2(dst); + __m128i b = blendfunc_multiply_byte_SSE2(sb, db, sa, da); + + return SkPackARGB32_SSE2(a, r, g, b); +} + +static __m128i screen_modeproc_SSE2(const __m128i& src, const __m128i& dst) { + __m128i a = srcover_byte_SSE2(SkGetPackedA32_SSE2(src), + SkGetPackedA32_SSE2(dst)); + __m128i r = srcover_byte_SSE2(SkGetPackedR32_SSE2(src), + SkGetPackedR32_SSE2(dst)); + __m128i g = srcover_byte_SSE2(SkGetPackedG32_SSE2(src), + SkGetPackedG32_SSE2(dst)); + __m128i b = srcover_byte_SSE2(SkGetPackedB32_SSE2(src), + SkGetPackedB32_SSE2(dst)); + return SkPackARGB32_SSE2(a, r, g, b); +} + +// Portable version overlay_byte() is in SkXfermode.cpp. +static inline __m128i overlay_byte_SSE2(const __m128i& sc, const __m128i& dc, + const __m128i& sa, const __m128i& da) { + __m128i ida = _mm_sub_epi32(_mm_set1_epi32(255), da); + __m128i tmp1 = _mm_mullo_epi16(sc, ida); + __m128i isa = _mm_sub_epi32(_mm_set1_epi32(255), sa); + __m128i tmp2 = _mm_mullo_epi16(dc, isa); + __m128i tmp = _mm_add_epi32(tmp1, tmp2); + + __m128i cmp = _mm_cmpgt_epi32(_mm_slli_epi32(dc, 1), da); + __m128i rc1 = _mm_slli_epi32(sc, 1); // 2 * sc + rc1 = Multiply32_SSE2(rc1, dc); // *dc + + __m128i rc2 = _mm_mullo_epi16(sa, da); // sa * da + __m128i tmp3 = _mm_slli_epi32(_mm_sub_epi32(da, dc), 1); // 2 * (da - dc) + tmp3 = Multiply32_SSE2(tmp3, _mm_sub_epi32(sa, sc)); // * (sa - sc) + rc2 = _mm_sub_epi32(rc2, tmp3); + + __m128i rc = _mm_or_si128(_mm_andnot_si128(cmp, rc1), + _mm_and_si128(cmp, rc2)); + return clamp_div255round_SSE2(_mm_add_epi32(rc, tmp)); +} + +static __m128i overlay_modeproc_SSE2(const __m128i& src, const __m128i& dst) { + __m128i sa = SkGetPackedA32_SSE2(src); + __m128i da = SkGetPackedA32_SSE2(dst); + + __m128i a = srcover_byte_SSE2(sa, da); + __m128i r = overlay_byte_SSE2(SkGetPackedR32_SSE2(src), + SkGetPackedR32_SSE2(dst), sa, da); + __m128i g = overlay_byte_SSE2(SkGetPackedG32_SSE2(src), + SkGetPackedG32_SSE2(dst), sa, da); + __m128i b = overlay_byte_SSE2(SkGetPackedB32_SSE2(src), + SkGetPackedB32_SSE2(dst), sa, da); + return SkPackARGB32_SSE2(a, r, g, b); +} + +static inline __m128i darken_byte_SSE2(const __m128i& sc, const __m128i& dc, + const __m128i& sa, const __m128i& da) { + __m128i sd = _mm_mullo_epi16(sc, da); + __m128i ds = _mm_mullo_epi16(dc, sa); + + __m128i cmp = _mm_cmplt_epi32(sd, ds); + + __m128i tmp = _mm_add_epi32(sc, dc); + __m128i ret1 = _mm_sub_epi32(tmp, SkDiv255Round_SSE2(ds)); + __m128i ret2 = _mm_sub_epi32(tmp, SkDiv255Round_SSE2(sd)); + __m128i ret = _mm_or_si128(_mm_and_si128(cmp, ret1), + _mm_andnot_si128(cmp, ret2)); + return ret; +} + +static __m128i darken_modeproc_SSE2(const __m128i& src, const __m128i& dst) { + __m128i sa = SkGetPackedA32_SSE2(src); + __m128i da = SkGetPackedA32_SSE2(dst); + + __m128i a = srcover_byte_SSE2(sa, da); + __m128i r = darken_byte_SSE2(SkGetPackedR32_SSE2(src), + SkGetPackedR32_SSE2(dst), sa, da); + __m128i g = darken_byte_SSE2(SkGetPackedG32_SSE2(src), + SkGetPackedG32_SSE2(dst), sa, da); + __m128i b = darken_byte_SSE2(SkGetPackedB32_SSE2(src), + SkGetPackedB32_SSE2(dst), sa, da); + return SkPackARGB32_SSE2(a, r, g, b); +} + +static inline __m128i lighten_byte_SSE2(const __m128i& sc, const __m128i& dc, + const __m128i& sa, const __m128i& da) { + __m128i sd = _mm_mullo_epi16(sc, da); + __m128i ds = _mm_mullo_epi16(dc, sa); + + __m128i cmp = _mm_cmpgt_epi32(sd, ds); + + __m128i tmp = _mm_add_epi32(sc, dc); + __m128i ret1 = _mm_sub_epi32(tmp, SkDiv255Round_SSE2(ds)); + __m128i ret2 = _mm_sub_epi32(tmp, SkDiv255Round_SSE2(sd)); + __m128i ret = _mm_or_si128(_mm_and_si128(cmp, ret1), + _mm_andnot_si128(cmp, ret2)); + return ret; +} + +static __m128i lighten_modeproc_SSE2(const __m128i& src, const __m128i& dst) { + __m128i sa = SkGetPackedA32_SSE2(src); + __m128i da = SkGetPackedA32_SSE2(dst); + + __m128i a = srcover_byte_SSE2(sa, da); + __m128i r = lighten_byte_SSE2(SkGetPackedR32_SSE2(src), + SkGetPackedR32_SSE2(dst), sa, da); + __m128i g = lighten_byte_SSE2(SkGetPackedG32_SSE2(src), + SkGetPackedG32_SSE2(dst), sa, da); + __m128i b = lighten_byte_SSE2(SkGetPackedB32_SSE2(src), + SkGetPackedB32_SSE2(dst), sa, da); + return SkPackARGB32_SSE2(a, r, g, b); +} + +static inline __m128i colordodge_byte_SSE2(const __m128i& sc, const __m128i& dc, + const __m128i& sa, const __m128i& da) { + __m128i diff = _mm_sub_epi32(sa, sc); + __m128i ida = _mm_sub_epi32(_mm_set1_epi32(255), da); + __m128i isa = _mm_sub_epi32(_mm_set1_epi32(255), sa); + + // if (0 == dc) + __m128i cmp1 = _mm_cmpeq_epi32(dc, _mm_setzero_si128()); + __m128i rc1 = _mm_and_si128(cmp1, SkAlphaMulAlpha_SSE2(sc, ida)); + + // else if (0 == diff) + __m128i cmp2 = _mm_cmpeq_epi32(diff, _mm_setzero_si128()); + __m128i cmp = _mm_andnot_si128(cmp1, cmp2); + __m128i tmp1 = _mm_mullo_epi16(sa, da); + __m128i tmp2 = _mm_mullo_epi16(sc, ida); + __m128i tmp3 = _mm_mullo_epi16(dc, isa); + __m128i rc2 = _mm_add_epi32(tmp1, tmp2); + rc2 = _mm_add_epi32(rc2, tmp3); + rc2 = clamp_div255round_SSE2(rc2); + rc2 = _mm_and_si128(cmp, rc2); + + // else + __m128i cmp3 = _mm_or_si128(cmp1, cmp2); + __m128i value = _mm_mullo_epi16(dc, sa); + diff = shim_mm_div_epi32(value, diff); + + __m128i tmp4 = SkMin32_SSE2(da, diff); + tmp4 = Multiply32_SSE2(sa, tmp4); + __m128i rc3 = _mm_add_epi32(tmp4, tmp2); + rc3 = _mm_add_epi32(rc3, tmp3); + rc3 = clamp_div255round_SSE2(rc3); + rc3 = _mm_andnot_si128(cmp3, rc3); + + __m128i rc = _mm_or_si128(rc1, rc2); + rc = _mm_or_si128(rc, rc3); + + return rc; +} + +static __m128i colordodge_modeproc_SSE2(const __m128i& src, + const __m128i& dst) { + __m128i sa = SkGetPackedA32_SSE2(src); + __m128i da = SkGetPackedA32_SSE2(dst); + + __m128i a = srcover_byte_SSE2(sa, da); + __m128i r = colordodge_byte_SSE2(SkGetPackedR32_SSE2(src), + SkGetPackedR32_SSE2(dst), sa, da); + __m128i g = colordodge_byte_SSE2(SkGetPackedG32_SSE2(src), + SkGetPackedG32_SSE2(dst), sa, da); + __m128i b = colordodge_byte_SSE2(SkGetPackedB32_SSE2(src), + SkGetPackedB32_SSE2(dst), sa, da); + return SkPackARGB32_SSE2(a, r, g, b); +} + +static inline __m128i colorburn_byte_SSE2(const __m128i& sc, const __m128i& dc, + const __m128i& sa, const __m128i& da) { + __m128i ida = _mm_sub_epi32(_mm_set1_epi32(255), da); + __m128i isa = _mm_sub_epi32(_mm_set1_epi32(255), sa); + + // if (dc == da) + __m128i cmp1 = _mm_cmpeq_epi32(dc, da); + __m128i tmp1 = _mm_mullo_epi16(sa, da); + __m128i tmp2 = _mm_mullo_epi16(sc, ida); + __m128i tmp3 = _mm_mullo_epi16(dc, isa); + __m128i rc1 = _mm_add_epi32(tmp1, tmp2); + rc1 = _mm_add_epi32(rc1, tmp3); + rc1 = clamp_div255round_SSE2(rc1); + rc1 = _mm_and_si128(cmp1, rc1); + + // else if (0 == sc) + __m128i cmp2 = _mm_cmpeq_epi32(sc, _mm_setzero_si128()); + __m128i rc2 = SkAlphaMulAlpha_SSE2(dc, isa); + __m128i cmp = _mm_andnot_si128(cmp1, cmp2); + rc2 = _mm_and_si128(cmp, rc2); + + // else + __m128i cmp3 = _mm_or_si128(cmp1, cmp2); + __m128i tmp4 = _mm_sub_epi32(da, dc); + tmp4 = Multiply32_SSE2(tmp4, sa); + tmp4 = shim_mm_div_epi32(tmp4, sc); + + __m128i tmp5 = _mm_sub_epi32(da, SkMin32_SSE2(da, tmp4)); + tmp5 = Multiply32_SSE2(sa, tmp5); + __m128i rc3 = _mm_add_epi32(tmp5, tmp2); + rc3 = _mm_add_epi32(rc3, tmp3); + rc3 = clamp_div255round_SSE2(rc3); + rc3 = _mm_andnot_si128(cmp3, rc3); + + __m128i rc = _mm_or_si128(rc1, rc2); + rc = _mm_or_si128(rc, rc3); + + return rc; +} + +static __m128i colorburn_modeproc_SSE2(const __m128i& src, const __m128i& dst) { + __m128i sa = SkGetPackedA32_SSE2(src); + __m128i da = SkGetPackedA32_SSE2(dst); + + __m128i a = srcover_byte_SSE2(sa, da); + __m128i r = colorburn_byte_SSE2(SkGetPackedR32_SSE2(src), + SkGetPackedR32_SSE2(dst), sa, da); + __m128i g = colorburn_byte_SSE2(SkGetPackedG32_SSE2(src), + SkGetPackedG32_SSE2(dst), sa, da); + __m128i b = colorburn_byte_SSE2(SkGetPackedB32_SSE2(src), + SkGetPackedB32_SSE2(dst), sa, da); + return SkPackARGB32_SSE2(a, r, g, b); +} + +static inline __m128i hardlight_byte_SSE2(const __m128i& sc, const __m128i& dc, + const __m128i& sa, const __m128i& da) { + // if (2 * sc <= sa) + __m128i tmp1 = _mm_slli_epi32(sc, 1); + __m128i cmp1 = _mm_cmpgt_epi32(tmp1, sa); + __m128i rc1 = _mm_mullo_epi16(sc, dc); // sc * dc; + rc1 = _mm_slli_epi32(rc1, 1); // 2 * sc * dc + rc1 = _mm_andnot_si128(cmp1, rc1); + + // else + tmp1 = _mm_mullo_epi16(sa, da); + __m128i tmp2 = Multiply32_SSE2(_mm_sub_epi32(da, dc), + _mm_sub_epi32(sa, sc)); + tmp2 = _mm_slli_epi32(tmp2, 1); + __m128i rc2 = _mm_sub_epi32(tmp1, tmp2); + rc2 = _mm_and_si128(cmp1, rc2); + + __m128i rc = _mm_or_si128(rc1, rc2); + + __m128i ida = _mm_sub_epi32(_mm_set1_epi32(255), da); + tmp1 = _mm_mullo_epi16(sc, ida); + __m128i isa = _mm_sub_epi32(_mm_set1_epi32(255), sa); + tmp2 = _mm_mullo_epi16(dc, isa); + rc = _mm_add_epi32(rc, tmp1); + rc = _mm_add_epi32(rc, tmp2); + return clamp_div255round_SSE2(rc); +} + +static __m128i hardlight_modeproc_SSE2(const __m128i& src, const __m128i& dst) { + __m128i sa = SkGetPackedA32_SSE2(src); + __m128i da = SkGetPackedA32_SSE2(dst); + + __m128i a = srcover_byte_SSE2(sa, da); + __m128i r = hardlight_byte_SSE2(SkGetPackedR32_SSE2(src), + SkGetPackedR32_SSE2(dst), sa, da); + __m128i g = hardlight_byte_SSE2(SkGetPackedG32_SSE2(src), + SkGetPackedG32_SSE2(dst), sa, da); + __m128i b = hardlight_byte_SSE2(SkGetPackedB32_SSE2(src), + SkGetPackedB32_SSE2(dst), sa, da); + return SkPackARGB32_SSE2(a, r, g, b); +} + +static __m128i sqrt_unit_byte_SSE2(const __m128i& n) { + return SkSqrtBits_SSE2(n, 15+4); +} + +static inline __m128i softlight_byte_SSE2(const __m128i& sc, const __m128i& dc, + const __m128i& sa, const __m128i& da) { + __m128i tmp1, tmp2, tmp3; + + // int m = da ? dc * 256 / da : 0; + __m128i cmp = _mm_cmpeq_epi32(da, _mm_setzero_si128()); + __m128i m = _mm_slli_epi32(dc, 8); + __m128 x = _mm_cvtepi32_ps(m); + __m128 y = _mm_cvtepi32_ps(da); + m = _mm_cvttps_epi32(_mm_div_ps(x, y)); + m = _mm_andnot_si128(cmp, m); + + // if (2 * sc <= sa) + tmp1 = _mm_slli_epi32(sc, 1); // 2 * sc + __m128i cmp1 = _mm_cmpgt_epi32(tmp1, sa); + tmp1 = _mm_sub_epi32(tmp1, sa); // 2 * sc - sa + tmp2 = _mm_sub_epi32(_mm_set1_epi32(256), m); // 256 - m + tmp1 = Multiply32_SSE2(tmp1, tmp2); + tmp1 = _mm_srai_epi32(tmp1, 8); + tmp1 = _mm_add_epi32(sa, tmp1); + tmp1 = Multiply32_SSE2(dc, tmp1); + __m128i rc1 = _mm_andnot_si128(cmp1, tmp1); + + // else if (4 * dc <= da) + tmp2 = _mm_slli_epi32(dc, 2); // dc * 4 + __m128i cmp2 = _mm_cmpgt_epi32(tmp2, da); + __m128i i = _mm_slli_epi32(m, 2); // 4 * m + __m128i j = _mm_add_epi32(i, _mm_set1_epi32(256)); // 4 * m + 256 + __m128i k = Multiply32_SSE2(i, j); // 4 * m * (4 * m + 256) + __m128i t = _mm_sub_epi32(m, _mm_set1_epi32(256)); // m - 256 + i = Multiply32_SSE2(k, t); // 4 * m * (4 * m + 256) * (m - 256) + i = _mm_srai_epi32(i, 16); // >> 16 + j = Multiply32_SSE2(_mm_set1_epi32(7), m); // 7 * m + tmp2 = _mm_add_epi32(i, j); + i = Multiply32_SSE2(dc, sa); // dc * sa + j = _mm_slli_epi32(sc, 1); // 2 * sc + j = _mm_sub_epi32(j, sa); // 2 * sc - sa + j = Multiply32_SSE2(da, j); // da * (2 * sc - sa) + tmp2 = Multiply32_SSE2(j, tmp2); // * tmp + tmp2 = _mm_srai_epi32(tmp2, 8); // >> 8 + tmp2 = _mm_add_epi32(i, tmp2); + cmp = _mm_andnot_si128(cmp2, cmp1); + __m128i rc2 = _mm_and_si128(cmp, tmp2); + __m128i rc = _mm_or_si128(rc1, rc2); + + // else + tmp3 = sqrt_unit_byte_SSE2(m); + tmp3 = _mm_sub_epi32(tmp3, m); + tmp3 = Multiply32_SSE2(j, tmp3); // j = da * (2 * sc - sa) + tmp3 = _mm_srai_epi32(tmp3, 8); + tmp3 = _mm_add_epi32(i, tmp3); // i = dc * sa + cmp = _mm_and_si128(cmp1, cmp2); + __m128i rc3 = _mm_and_si128(cmp, tmp3); + rc = _mm_or_si128(rc, rc3); + + tmp1 = _mm_sub_epi32(_mm_set1_epi32(255), da); // 255 - da + tmp1 = _mm_mullo_epi16(sc, tmp1); + tmp2 = _mm_sub_epi32(_mm_set1_epi32(255), sa); // 255 - sa + tmp2 = _mm_mullo_epi16(dc, tmp2); + rc = _mm_add_epi32(rc, tmp1); + rc = _mm_add_epi32(rc, tmp2); + return clamp_div255round_SSE2(rc); +} + +static __m128i softlight_modeproc_SSE2(const __m128i& src, const __m128i& dst) { + __m128i sa = SkGetPackedA32_SSE2(src); + __m128i da = SkGetPackedA32_SSE2(dst); + + __m128i a = srcover_byte_SSE2(sa, da); + __m128i r = softlight_byte_SSE2(SkGetPackedR32_SSE2(src), + SkGetPackedR32_SSE2(dst), sa, da); + __m128i g = softlight_byte_SSE2(SkGetPackedG32_SSE2(src), + SkGetPackedG32_SSE2(dst), sa, da); + __m128i b = softlight_byte_SSE2(SkGetPackedB32_SSE2(src), + SkGetPackedB32_SSE2(dst), sa, da); + return SkPackARGB32_SSE2(a, r, g, b); +} + +static inline __m128i difference_byte_SSE2(const __m128i& sc, const __m128i& dc, + const __m128i& sa, const __m128i& da) { + __m128i tmp1 = _mm_mullo_epi16(sc, da); + __m128i tmp2 = _mm_mullo_epi16(dc, sa); + __m128i tmp = SkMin32_SSE2(tmp1, tmp2); + + __m128i ret1 = _mm_add_epi32(sc, dc); + __m128i ret2 = _mm_slli_epi32(SkDiv255Round_SSE2(tmp), 1); + __m128i ret = _mm_sub_epi32(ret1, ret2); + + ret = clamp_signed_byte_SSE2(ret); + return ret; +} + +static __m128i difference_modeproc_SSE2(const __m128i& src, + const __m128i& dst) { + __m128i sa = SkGetPackedA32_SSE2(src); + __m128i da = SkGetPackedA32_SSE2(dst); + + __m128i a = srcover_byte_SSE2(sa, da); + __m128i r = difference_byte_SSE2(SkGetPackedR32_SSE2(src), + SkGetPackedR32_SSE2(dst), sa, da); + __m128i g = difference_byte_SSE2(SkGetPackedG32_SSE2(src), + SkGetPackedG32_SSE2(dst), sa, da); + __m128i b = difference_byte_SSE2(SkGetPackedB32_SSE2(src), + SkGetPackedB32_SSE2(dst), sa, da); + return SkPackARGB32_SSE2(a, r, g, b); +} + +static inline __m128i exclusion_byte_SSE2(const __m128i& sc, const __m128i& dc, + const __m128i&, __m128i&) { + __m128i tmp1 = _mm_mullo_epi16(_mm_set1_epi32(255), sc); // 255 * sc + __m128i tmp2 = _mm_mullo_epi16(_mm_set1_epi32(255), dc); // 255 * dc + tmp1 = _mm_add_epi32(tmp1, tmp2); + tmp2 = _mm_mullo_epi16(sc, dc); // sc * dc + tmp2 = _mm_slli_epi32(tmp2, 1); // 2 * sc * dc + + __m128i r = _mm_sub_epi32(tmp1, tmp2); + return clamp_div255round_SSE2(r); +} + +static __m128i exclusion_modeproc_SSE2(const __m128i& src, const __m128i& dst) { + __m128i sa = SkGetPackedA32_SSE2(src); + __m128i da = SkGetPackedA32_SSE2(dst); + + __m128i a = srcover_byte_SSE2(sa, da); + __m128i r = exclusion_byte_SSE2(SkGetPackedR32_SSE2(src), + SkGetPackedR32_SSE2(dst), sa, da); + __m128i g = exclusion_byte_SSE2(SkGetPackedG32_SSE2(src), + SkGetPackedG32_SSE2(dst), sa, da); + __m128i b = exclusion_byte_SSE2(SkGetPackedB32_SSE2(src), + SkGetPackedB32_SSE2(dst), sa, da); + return SkPackARGB32_SSE2(a, r, g, b); +} + +//////////////////////////////////////////////////////////////////////////////// + +typedef __m128i (*SkXfermodeProcSIMD)(const __m128i& src, const __m128i& dst); + +extern SkXfermodeProcSIMD gSSE2XfermodeProcs[]; + +SkSSE2ProcCoeffXfermode::SkSSE2ProcCoeffXfermode(SkReadBuffer& buffer) + : INHERITED(buffer) { + fProcSIMD = reinterpret_cast<void*>(gSSE2XfermodeProcs[this->getMode()]); + buffer.validate(fProcSIMD != NULL); +} + +void SkSSE2ProcCoeffXfermode::xfer32(SkPMColor dst[], const SkPMColor src[], + int count, const SkAlpha aa[]) const { + SkASSERT(dst && src && count >= 0); + + SkXfermodeProc proc = this->getProc(); + SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD); + SkASSERT(procSIMD != NULL); + + if (NULL == aa) { + if (count >= 4) { + while (((size_t)dst & 0x0F) != 0) { + *dst = proc(*src, *dst); + dst++; + src++; + count--; + } + + const __m128i* s = reinterpret_cast<const __m128i*>(src); + __m128i* d = reinterpret_cast<__m128i*>(dst); + + while (count >= 4) { + __m128i src_pixel = _mm_loadu_si128(s++); + __m128i dst_pixel = _mm_load_si128(d); + + dst_pixel = procSIMD(src_pixel, dst_pixel); + _mm_store_si128(d++, dst_pixel); + count -= 4; + } + + src = reinterpret_cast<const SkPMColor*>(s); + dst = reinterpret_cast<SkPMColor*>(d); + } + + for (int i = count - 1; i >= 0; --i) { + *dst = proc(*src, *dst); + dst++; + src++; + } + } else { + for (int i = count - 1; i >= 0; --i) { + unsigned a = aa[i]; + if (0 != a) { + SkPMColor dstC = dst[i]; + SkPMColor C = proc(src[i], dstC); + if (a != 0xFF) { + C = SkFourByteInterp(C, dstC, a); + } + dst[i] = C; + } + } + } +} + +void SkSSE2ProcCoeffXfermode::xfer16(uint16_t dst[], const SkPMColor src[], + int count, const SkAlpha aa[]) const { + SkASSERT(dst && src && count >= 0); + + SkXfermodeProc proc = this->getProc(); + SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD); + SkASSERT(procSIMD != NULL); + + if (NULL == aa) { + if (count >= 8) { + while (((size_t)dst & 0x0F) != 0) { + SkPMColor dstC = SkPixel16ToPixel32(*dst); + *dst = SkPixel32ToPixel16_ToU16(proc(*src, dstC)); + dst++; + src++; + count--; + } + + const __m128i* s = reinterpret_cast<const __m128i*>(src); + __m128i* d = reinterpret_cast<__m128i*>(dst); + + while (count >= 8) { + __m128i src_pixel1 = _mm_loadu_si128(s++); + __m128i src_pixel2 = _mm_loadu_si128(s++); + __m128i dst_pixel = _mm_load_si128(d); + + __m128i dst_pixel1 = _mm_unpacklo_epi16(dst_pixel, _mm_setzero_si128()); + __m128i dst_pixel2 = _mm_unpackhi_epi16(dst_pixel, _mm_setzero_si128()); + + __m128i dstC1 = SkPixel16ToPixel32_SSE2(dst_pixel1); + __m128i dstC2 = SkPixel16ToPixel32_SSE2(dst_pixel2); + + dst_pixel1 = procSIMD(src_pixel1, dstC1); + dst_pixel2 = procSIMD(src_pixel2, dstC2); + dst_pixel = SkPixel32ToPixel16_ToU16_SSE2(dst_pixel1, dst_pixel2); + + _mm_store_si128(d++, dst_pixel); + count -= 8; + } + + src = reinterpret_cast<const SkPMColor*>(s); + dst = reinterpret_cast<uint16_t*>(d); + } + + for (int i = count - 1; i >= 0; --i) { + SkPMColor dstC = SkPixel16ToPixel32(*dst); + *dst = SkPixel32ToPixel16_ToU16(proc(*src, dstC)); + dst++; + src++; + } + } else { + for (int i = count - 1; i >= 0; --i) { + unsigned a = aa[i]; + if (0 != a) { + SkPMColor dstC = SkPixel16ToPixel32(dst[i]); + SkPMColor C = proc(src[i], dstC); + if (0xFF != a) { + C = SkFourByteInterp(C, dstC, a); + } + dst[i] = SkPixel32ToPixel16_ToU16(C); + } + } + } +} + +#ifndef SK_IGNORE_TO_STRING +void SkSSE2ProcCoeffXfermode::toString(SkString* str) const { + this->INHERITED::toString(str); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// + +// 4 pixels modeprocs with SSE2 +SkXfermodeProcSIMD gSSE2XfermodeProcs[] = { + NULL, // kClear_Mode + NULL, // kSrc_Mode + NULL, // kDst_Mode + srcover_modeproc_SSE2, + dstover_modeproc_SSE2, + srcin_modeproc_SSE2, + dstin_modeproc_SSE2, + srcout_modeproc_SSE2, + dstout_modeproc_SSE2, + srcatop_modeproc_SSE2, + dstatop_modeproc_SSE2, + xor_modeproc_SSE2, + plus_modeproc_SSE2, + modulate_modeproc_SSE2, + screen_modeproc_SSE2, + + overlay_modeproc_SSE2, + darken_modeproc_SSE2, + lighten_modeproc_SSE2, + colordodge_modeproc_SSE2, + colorburn_modeproc_SSE2, + hardlight_modeproc_SSE2, + softlight_modeproc_SSE2, + difference_modeproc_SSE2, + exclusion_modeproc_SSE2, + multiply_modeproc_SSE2, + + NULL, // kHue_Mode + NULL, // kSaturation_Mode + NULL, // kColor_Mode + NULL, // kLuminosity_Mode +}; + +SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl_SSE2(const ProcCoeff& rec, + SkXfermode::Mode mode) { + void* procSIMD = reinterpret_cast<void*>(gSSE2XfermodeProcs[mode]); + + if (procSIMD != NULL) { + return SkNEW_ARGS(SkSSE2ProcCoeffXfermode, (rec, mode, procSIMD)); + } + return NULL; +} diff --git a/chromium/third_party/skia/src/opts/SkXfermode_opts_SSE2.h b/chromium/third_party/skia/src/opts/SkXfermode_opts_SSE2.h new file mode 100644 index 00000000000..bfc143937a8 --- /dev/null +++ b/chromium/third_party/skia/src/opts/SkXfermode_opts_SSE2.h @@ -0,0 +1,38 @@ +/* + * Copyright 2014 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ + +#ifndef SkXfermode_opts_SSE2_DEFINED +#define SkXfermode_opts_SSE2_DEFINED + +#include "SkTypes.h" +#include "SkXfermode_proccoeff.h" + +class SK_API SkSSE2ProcCoeffXfermode : public SkProcCoeffXfermode { +public: + SkSSE2ProcCoeffXfermode(const ProcCoeff& rec, SkXfermode::Mode mode, + void* procSIMD) + : INHERITED(rec, mode), fProcSIMD(procSIMD) {} + + virtual void xfer32(SkPMColor dst[], const SkPMColor src[], int count, + const SkAlpha aa[]) const SK_OVERRIDE; + virtual void xfer16(uint16_t dst[], const SkPMColor src[], + int count, const SkAlpha aa[]) const SK_OVERRIDE; + + SK_TO_STRING_OVERRIDE() + SK_DECLARE_PUBLIC_FLATTENABLE_DESERIALIZATION_PROCS(SkSSE2ProcCoeffXfermode) + +private: + SkSSE2ProcCoeffXfermode(SkReadBuffer& buffer); + + void* fProcSIMD; + typedef SkProcCoeffXfermode INHERITED; +}; + +SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl_SSE2(const ProcCoeff& rec, + SkXfermode::Mode mode); + +#endif // SkXfermode_opts_SSE2_DEFINED diff --git a/chromium/third_party/skia/src/opts/SkXfermode_opts_arm_neon.cpp b/chromium/third_party/skia/src/opts/SkXfermode_opts_arm_neon.cpp index 6a79b737263..70e92af66bc 100644 --- a/chromium/third_party/skia/src/opts/SkXfermode_opts_arm_neon.cpp +++ b/chromium/third_party/skia/src/opts/SkXfermode_opts_arm_neon.cpp @@ -41,8 +41,13 @@ static inline uint16x8_t SkAlphaMulAlpha_neon8_16(uint8x8_t color, uint8x8_t alp static inline uint8x8_t SkDiv255Round_neon8_32_8(int32x4_t p1, int32x4_t p2) { uint16x8_t tmp; +#ifdef SK_CPU_ARM64 + tmp = vmovn_high_u32(vmovn_u32(vreinterpretq_u32_s32(p1)), + vreinterpretq_u32_s32(p2)); +#else tmp = vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(p1)), vmovn_u32(vreinterpretq_u32_s32(p2))); +#endif tmp += vdupq_n_u16(128); tmp += vshrq_n_u16(tmp, 8); @@ -66,7 +71,11 @@ static inline uint8x8_t clamp_div255round_simd8_32(int32x4_t val1, int32x4_t val // Test if <= 0 cmp1 = vcleq_s32(val1, vdupq_n_s32(0)); cmp2 = vcleq_s32(val2, vdupq_n_s32(0)); +#ifdef SK_CPU_ARM64 + cmp16 = vmovn_high_u32(vmovn_u32(cmp1), cmp2); +#else cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2)); +#endif cmp8_1 = vmovn_u16(cmp16); // Init to zero @@ -75,7 +84,11 @@ static inline uint8x8_t clamp_div255round_simd8_32(int32x4_t val1, int32x4_t val // Test if >= 255*255 cmp1 = vcgeq_s32(val1, vdupq_n_s32(255*255)); cmp2 = vcgeq_s32(val2, vdupq_n_s32(255*255)); +#ifdef SK_CPU_ARM64 + cmp16 = vmovn_high_u32(vmovn_u32(cmp1), cmp2); +#else cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2)); +#endif cmp8 = vmovn_u16(cmp16); // Insert 255 where true @@ -409,11 +422,19 @@ static inline uint8x8_t overlay_hardlight_color(uint8x8_t sc, uint8x8_t dc, if (overlay) { dc2 = vshll_n_u8(dc, 1); scdc2_1 = vmull_u16(vget_low_u16(dc2), vget_low_u16(vmovl_u8(sc))); +#ifdef SK_CPU_ARM64 + scdc2_2 = vmull_high_u16(dc2, vmovl_u8(sc)); +#else scdc2_2 = vmull_u16(vget_high_u16(dc2), vget_high_u16(vmovl_u8(sc))); +#endif } else { sc2 = vshll_n_u8(sc, 1); scdc2_1 = vmull_u16(vget_low_u16(sc2), vget_low_u16(vmovl_u8(dc))); +#ifdef SK_CPU_ARM64 + scdc2_2 = vmull_high_u16(sc2, vmovl_u8(dc)); +#else scdc2_2 = vmull_u16(vget_high_u16(sc2), vget_high_u16(vmovl_u8(dc))); +#endif } // Calc COM @@ -421,12 +442,20 @@ static inline uint8x8_t overlay_hardlight_color(uint8x8_t sc, uint8x8_t dc, com1 = vreinterpretq_s32_u32( vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc))); com2 = vreinterpretq_s32_u32( +#ifdef SK_CPU_ARM64 + vmull_high_u16(const255, sc_plus_dc)); +#else vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc))); +#endif // Calc SUB int32x4_t sub1, sub2; sub1 = vreinterpretq_s32_u32(vaddl_u16(vget_low_u16(scda), vget_low_u16(dcsa))); +#ifdef SK_CPU_ARM64 + sub2 = vreinterpretq_s32_u32(vaddl_high_u16(scda, dcsa)); +#else sub2 = vreinterpretq_s32_u32(vaddl_u16(vget_high_u16(scda), vget_high_u16(dcsa))); +#endif sub1 = vsubq_s32(sub1, vreinterpretq_s32_u32(scdc2_1)); sub2 = vsubq_s32(sub2, vreinterpretq_s32_u32(scdc2_2)); @@ -444,10 +473,14 @@ static inline uint8x8_t overlay_hardlight_color(uint8x8_t sc, uint8x8_t dc, int32x4_t val2_1, val2_2; uint32x4_t cmp1, cmp2; - cmp1 = vmovl_u16(vget_low_u16(cmp)); - cmp1 |= vshlq_n_u32(cmp1, 16); - cmp2 = vmovl_u16(vget_high_u16(cmp)); - cmp2 |= vshlq_n_u32(cmp2, 16); + // Doing a signed lengthening allows to save a few instructions + // thanks to sign extension. + cmp1 = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_low_u16(cmp)))); +#ifdef SK_CPU_ARM64 + cmp2 = vreinterpretq_u32_s32(vmovl_high_s16(vreinterpretq_s16_u16(cmp))); +#else + cmp2 = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_high_u16(cmp)))); +#endif // Calc COM - SUB val1_1 = com1 - sub1; @@ -458,7 +491,11 @@ static inline uint8x8_t overlay_hardlight_color(uint8x8_t sc, uint8x8_t dc, val2_2 = com2 + sub2; val2_1 = vsubq_s32(val2_1, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sada)))); +#ifdef SK_CPU_ARM64 + val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_high_u16(sada))); +#else val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sada)))); +#endif // Insert where needed val1_1 = vbslq_s32(cmp1, val1_1, val2_1); @@ -628,11 +665,19 @@ static inline uint8x8_t exclusion_color(uint8x8_t sc, uint8x8_t dc, term1_1 = vreinterpretq_s32_u32( vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc))); term1_2 = vreinterpretq_s32_u32( +#ifdef SK_CPU_ARM64 + vmull_high_u16(const255, sc_plus_dc)); +#else vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc))); +#endif /* Calc the second term */ term2_1 = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(scdc), 1)); +#ifdef SK_CPU_ARM64 + term2_2 = vreinterpretq_s32_u32(vshll_high_n_u16(scdc, 1)); +#else term2_2 = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(scdc), 1)); +#endif return clamp_div255round_simd8_32(term1_1 - term2_1, term1_2 - term2_2); } @@ -661,10 +706,18 @@ static inline uint8x8_t blendfunc_multiply_color(uint8x8_t sc, uint8x8_t dc, scdc = vmull_u8(sc, dc); val1 = vaddl_u16(vget_low_u16(t1), vget_low_u16(t2)); +#ifdef SK_CPU_ARM64 + val2 = vaddl_high_u16(t1, t2); +#else val2 = vaddl_u16(vget_high_u16(t1), vget_high_u16(t2)); +#endif val1 = vaddw_u16(val1, vget_low_u16(scdc)); +#ifdef SK_CPU_ARM64 + val2 = vaddw_high_u16(val2, scdc); +#else val2 = vaddw_u16(val2, vget_high_u16(scdc)); +#endif return clamp_div255round_simd8_32( vreinterpretq_s32_u32(val1), vreinterpretq_s32_u32(val2)); @@ -690,7 +743,7 @@ typedef uint8x8x4_t (*SkXfermodeProcSIMD)(uint8x8x4_t src, uint8x8x4_t dst); extern SkXfermodeProcSIMD gNEONXfermodeProcs[]; -SkNEONProcCoeffXfermode::SkNEONProcCoeffXfermode(SkFlattenableReadBuffer& buffer) +SkNEONProcCoeffXfermode::SkNEONProcCoeffXfermode(SkReadBuffer& buffer) : INHERITED(buffer) { fProcSIMD = reinterpret_cast<void*>(gNEONXfermodeProcs[this->getMode()]); } @@ -708,6 +761,10 @@ void SkNEONProcCoeffXfermode::xfer32(SkPMColor dst[], const SkPMColor src[], while (count >= 8) { uint8x8x4_t vsrc, vdst, vres; +#ifdef SK_CPU_ARM64 + vsrc = vld4_u8((uint8_t*)src); + vdst = vld4_u8((uint8_t*)dst); +#else #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) asm volatile ( "vld4.u8 %h[vsrc], [%[src]]! \t\n" @@ -740,6 +797,7 @@ void SkNEONProcCoeffXfermode::xfer32(SkPMColor dst[], const SkPMColor src[], vsrc.val[2] = d2; vdst.val[2] = d6; vsrc.val[3] = d3; vdst.val[3] = d7; #endif +#endif // #ifdef SK_CPU_ARM64 vres = procSIMD(vsrc, vdst); @@ -747,6 +805,9 @@ void SkNEONProcCoeffXfermode::xfer32(SkPMColor dst[], const SkPMColor src[], count -= 8; dst += 8; +#ifdef SK_CPU_ARM64 + src += 8; +#endif } // Leftovers for (int i = 0; i < count; i++) { @@ -783,6 +844,9 @@ void SkNEONProcCoeffXfermode::xfer16(uint16_t* SK_RESTRICT dst, vdst = vld1q_u16(dst); +#ifdef SK_CPU_ARM64 + vsrc = vld4_u8((uint8_t*)src); +#else #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) asm volatile ( "vld4.u8 %h[vsrc], [%[src]]! \t\n" @@ -806,6 +870,7 @@ void SkNEONProcCoeffXfermode::xfer16(uint16_t* SK_RESTRICT dst, vsrc.val[2] = d2; vsrc.val[3] = d3; #endif +#endif // #ifdef SK_CPU_ARM64 vdst32 = SkPixel16ToPixel32_neon8(vdst); vres = procSIMD(vsrc, vdst32); @@ -815,6 +880,9 @@ void SkNEONProcCoeffXfermode::xfer16(uint16_t* SK_RESTRICT dst, count -= 8; dst += 8; +#ifdef SK_CPU_ARM64 + src += 8; +#endif } for (int i = 0; i < count; i++) { SkPMColor dstC = SkPixel16ToPixel32(dst[i]); @@ -835,7 +903,7 @@ void SkNEONProcCoeffXfermode::xfer16(uint16_t* SK_RESTRICT dst, } } -#ifdef SK_DEVELOPER +#ifndef SK_IGNORE_TO_STRING void SkNEONProcCoeffXfermode::toString(SkString* str) const { this->INHERITED::toString(str); } diff --git a/chromium/third_party/skia/src/opts/SkXfermode_opts_arm_neon.h b/chromium/third_party/skia/src/opts/SkXfermode_opts_arm_neon.h index a8d438195eb..8f3aaaea9d9 100644 --- a/chromium/third_party/skia/src/opts/SkXfermode_opts_arm_neon.h +++ b/chromium/third_party/skia/src/opts/SkXfermode_opts_arm_neon.h @@ -14,11 +14,11 @@ public: virtual void xfer16(uint16_t* SK_RESTRICT dst, const SkPMColor* SK_RESTRICT src, int count, const SkAlpha* SK_RESTRICT aa) const SK_OVERRIDE; - SK_DEVELOPER_TO_STRING() + SK_TO_STRING_OVERRIDE() SK_DECLARE_PUBLIC_FLATTENABLE_DESERIALIZATION_PROCS(SkNEONProcCoeffXfermode) private: - SkNEONProcCoeffXfermode(SkFlattenableReadBuffer& buffer); + SkNEONProcCoeffXfermode(SkReadBuffer& buffer); // void* is used to avoid pulling arm_neon.h in the core and having to build // it with -mfpu=neon. diff --git a/chromium/third_party/skia/src/opts/opts_check_SSE2.cpp b/chromium/third_party/skia/src/opts/opts_check_SSE2.cpp deleted file mode 100644 index aaf6b2ef824..00000000000 --- a/chromium/third_party/skia/src/opts/opts_check_SSE2.cpp +++ /dev/null @@ -1,294 +0,0 @@ -/* - * Copyright 2009 The Android Open Source Project - * - * Use of this source code is governed by a BSD-style license that can be - * found in the LICENSE file. - */ - -#include "SkBitmapProcState_opts_SSE2.h" -#include "SkBitmapProcState_opts_SSSE3.h" -#include "SkBitmapFilter_opts_SSE2.h" -#include "SkBlitMask.h" -#include "SkBlitRow.h" -#include "SkBlitRect_opts_SSE2.h" -#include "SkBlitRow_opts_SSE2.h" -#include "SkBlurImage_opts_SSE2.h" -#include "SkUtils_opts_SSE2.h" -#include "SkUtils.h" -#include "SkMorphology_opts.h" -#include "SkMorphology_opts_SSE2.h" - -#include "SkRTConf.h" - -#if defined(_MSC_VER) && defined(_WIN64) -#include <intrin.h> -#endif - -/* This file must *not* be compiled with -msse or -msse2, otherwise - gcc may generate sse2 even for scalar ops (and thus give an invalid - instruction on Pentium3 on the code below). Only files named *_SSE2.cpp - in this directory should be compiled with -msse2. */ - - -#ifdef _MSC_VER -static inline void getcpuid(int info_type, int info[4]) { -#if defined(_WIN64) - __cpuid(info, info_type); -#else - __asm { - mov eax, [info_type] - cpuid - mov edi, [info] - mov [edi], eax - mov [edi+4], ebx - mov [edi+8], ecx - mov [edi+12], edx - } -#endif -} -#else -#if defined(__x86_64__) -static inline void getcpuid(int info_type, int info[4]) { - asm volatile ( - "cpuid \n\t" - : "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3]) - : "a"(info_type) - ); -} -#else -static inline void getcpuid(int info_type, int info[4]) { - // We save and restore ebx, so this code can be compatible with -fPIC - asm volatile ( - "pushl %%ebx \n\t" - "cpuid \n\t" - "movl %%ebx, %1 \n\t" - "popl %%ebx \n\t" - : "=a"(info[0]), "=r"(info[1]), "=c"(info[2]), "=d"(info[3]) - : "a"(info_type) - ); -} -#endif -#endif - -#if defined(__x86_64__) || defined(_WIN64) || SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 -/* All x86_64 machines have SSE2, or we know it's supported at compile time, so don't even bother checking. */ -static inline bool hasSSE2() { - return true; -} -#else - -static inline bool hasSSE2() { - int cpu_info[4] = { 0 }; - getcpuid(1, cpu_info); - return (cpu_info[3] & (1<<26)) != 0; -} -#endif - -#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 -/* If we know SSSE3 is supported at compile time, don't even bother checking. */ -static inline bool hasSSSE3() { - return true; -} -#else - -static inline bool hasSSSE3() { - int cpu_info[4] = { 0 }; - getcpuid(1, cpu_info); - return (cpu_info[2] & 0x200) != 0; -} -#endif - -static bool cachedHasSSE2() { - static bool gHasSSE2 = hasSSE2(); - return gHasSSE2; -} - -static bool cachedHasSSSE3() { - static bool gHasSSSE3 = hasSSSE3(); - return gHasSSSE3; -} - -SK_CONF_DECLARE( bool, c_hqfilter_sse, "bitmap.filter.highQualitySSE", false, "Use SSE optimized version of high quality image filters"); - -void SkBitmapProcState::platformConvolutionProcs(SkConvolutionProcs* procs) { - if (cachedHasSSE2()) { - procs->fExtraHorizontalReads = 3; - procs->fConvolveVertically = &convolveVertically_SSE2; - procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_SSE2; - procs->fConvolveHorizontally = &convolveHorizontally_SSE2; - procs->fApplySIMDPadding = &applySIMDPadding_SSE2; - } -} - -void SkBitmapProcState::platformProcs() { - if (cachedHasSSSE3()) { - if (fSampleProc32 == S32_opaque_D32_filter_DX) { - fSampleProc32 = S32_opaque_D32_filter_DX_SSSE3; - } else if (fSampleProc32 == S32_alpha_D32_filter_DX) { - fSampleProc32 = S32_alpha_D32_filter_DX_SSSE3; - } - - if (fSampleProc32 == S32_opaque_D32_filter_DXDY) { - fSampleProc32 = S32_opaque_D32_filter_DXDY_SSSE3; - } else if (fSampleProc32 == S32_alpha_D32_filter_DXDY) { - fSampleProc32 = S32_alpha_D32_filter_DXDY_SSSE3; - } - } else if (cachedHasSSE2()) { - if (fSampleProc32 == S32_opaque_D32_filter_DX) { - fSampleProc32 = S32_opaque_D32_filter_DX_SSE2; - } else if (fSampleProc32 == S32_alpha_D32_filter_DX) { - fSampleProc32 = S32_alpha_D32_filter_DX_SSE2; - } - - if (fSampleProc16 == S32_D16_filter_DX) { - fSampleProc16 = S32_D16_filter_DX_SSE2; - } - } - - if (cachedHasSSSE3() || cachedHasSSE2()) { - if (fMatrixProc == ClampX_ClampY_filter_scale) { - fMatrixProc = ClampX_ClampY_filter_scale_SSE2; - } else if (fMatrixProc == ClampX_ClampY_nofilter_scale) { - fMatrixProc = ClampX_ClampY_nofilter_scale_SSE2; - } - - if (fMatrixProc == ClampX_ClampY_filter_affine) { - fMatrixProc = ClampX_ClampY_filter_affine_SSE2; - } else if (fMatrixProc == ClampX_ClampY_nofilter_affine) { - fMatrixProc = ClampX_ClampY_nofilter_affine_SSE2; - } - if (c_hqfilter_sse) { - if (fShaderProc32 == highQualityFilter32) { - fShaderProc32 = highQualityFilter_SSE2; - } - } - } -} - -static SkBlitRow::Proc32 platform_32_procs[] = { - NULL, // S32_Opaque, - S32_Blend_BlitRow32_SSE2, // S32_Blend, - S32A_Opaque_BlitRow32_SSE2, // S32A_Opaque - S32A_Blend_BlitRow32_SSE2, // S32A_Blend, -}; - -SkBlitRow::Proc SkBlitRow::PlatformProcs565(unsigned flags) { - return NULL; -} - -SkBlitRow::ColorProc SkBlitRow::PlatformColorProc() { - if (cachedHasSSE2()) { - return Color32_SSE2; - } else { - return NULL; - } -} - -SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) { - if (cachedHasSSE2()) { - return platform_32_procs[flags]; - } else { - return NULL; - } -} - - -SkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkBitmap::Config dstConfig, - SkMask::Format maskFormat, - SkColor color) { - if (SkMask::kA8_Format != maskFormat) { - return NULL; - } - - ColorProc proc = NULL; - if (cachedHasSSE2()) { - switch (dstConfig) { - case SkBitmap::kARGB_8888_Config: - // The SSE2 version is not (yet) faster for black, so we check - // for that. - if (SK_ColorBLACK != color) { - proc = SkARGB32_A8_BlitMask_SSE2; - } - break; - default: - break; - } - } - return proc; -} - -SkBlitMask::BlitLCD16RowProc SkBlitMask::PlatformBlitRowProcs16(bool isOpaque) { - if (cachedHasSSE2()) { - if (isOpaque) { - return SkBlitLCD16OpaqueRow_SSE2; - } else { - return SkBlitLCD16Row_SSE2; - } - } else { - return NULL; - } - -} -SkBlitMask::RowProc SkBlitMask::PlatformRowProcs(SkBitmap::Config dstConfig, - SkMask::Format maskFormat, - RowFlags flags) { - return NULL; -} - -SkMemset16Proc SkMemset16GetPlatformProc() { - if (cachedHasSSE2()) { - return sk_memset16_SSE2; - } else { - return NULL; - } -} - -SkMemset32Proc SkMemset32GetPlatformProc() { - if (cachedHasSSE2()) { - return sk_memset32_SSE2; - } else { - return NULL; - } -} - -SkMorphologyProc SkMorphologyGetPlatformProc(SkMorphologyProcType type) { - if (!cachedHasSSE2()) { - return NULL; - } - switch (type) { - case kDilateX_SkMorphologyProcType: - return SkDilateX_SSE2; - case kDilateY_SkMorphologyProcType: - return SkDilateY_SSE2; - case kErodeX_SkMorphologyProcType: - return SkErodeX_SSE2; - case kErodeY_SkMorphologyProcType: - return SkErodeY_SSE2; - default: - return NULL; - } -} - -bool SkBoxBlurGetPlatformProcs(SkBoxBlurProc* boxBlurX, - SkBoxBlurProc* boxBlurY, - SkBoxBlurProc* boxBlurXY, - SkBoxBlurProc* boxBlurYX) { -#ifdef SK_DISABLE_BLUR_DIVISION_OPTIMIZATION - return false; -#else - if (!cachedHasSSE2()) { - return false; - } - return SkBoxBlurGetPlatformProcs_SSE2(boxBlurX, boxBlurY, boxBlurXY, boxBlurYX); -#endif -} - -SkBlitRow::ColorRectProc PlatformColorRectProcFactory(); // suppress warning - -SkBlitRow::ColorRectProc PlatformColorRectProcFactory() { - if (cachedHasSSE2()) { - return ColorRect32_SSE2; - } else { - return NULL; - } -} diff --git a/chromium/third_party/skia/src/opts/opts_check_arm.cpp b/chromium/third_party/skia/src/opts/opts_check_arm.cpp deleted file mode 100644 index 3a322aa0e9b..00000000000 --- a/chromium/third_party/skia/src/opts/opts_check_arm.cpp +++ /dev/null @@ -1,110 +0,0 @@ -/*************************************************************************** - * Copyright (c) 2010, Code Aurora Forum. All rights reserved. - * Copyright 2006-2010, The Android Open Source Project - * - * Use of this source code is governed by a BSD-style license that can be - * found in the LICENSE file. - ***************************************************************************/ - -/* Changes: - * 2011-04-01 ARM - * Merged the functions from src/opts/opts_check_arm_neon.cpp - * Modified to return ARM version of memset16 and memset32 if no neon - * available in the core - */ - -#include "SkBlitRow.h" -#include "SkUtils.h" - -#include "SkUtilsArm.h" -#include "SkMorphology_opts.h" -#include "SkMorphology_opts_neon.h" -#include "SkBlurImage_opts_neon.h" - -#if defined(SK_CPU_LENDIAN) && !SK_ARM_NEON_IS_NONE -extern "C" void memset16_neon(uint16_t dst[], uint16_t value, int count); -extern "C" void memset32_neon(uint32_t dst[], uint32_t value, int count); -#endif - -#if defined(SK_CPU_LENDIAN) -extern "C" void arm_memset16(uint16_t* dst, uint16_t value, int count); -extern "C" void arm_memset32(uint32_t* dst, uint32_t value, int count); -#endif - -SkMemset16Proc SkMemset16GetPlatformProc() { - // FIXME: memset.arm.S is using syntax incompatible with XCode -#if !defined(SK_CPU_LENDIAN) || defined(SK_BUILD_FOR_IOS) - return NULL; -#elif SK_ARM_NEON_IS_DYNAMIC - if (sk_cpu_arm_has_neon()) { - return memset16_neon; - } else { - return arm_memset16; - } -#elif SK_ARM_NEON_IS_ALWAYS - return memset16_neon; -#else - return arm_memset16; -#endif -} - -SkMemset32Proc SkMemset32GetPlatformProc() { - // FIXME: memset.arm.S is using syntax incompatible with XCode -#if !defined(SK_CPU_LENDIAN) || defined(SK_BUILD_FOR_IOS) - return NULL; -#elif SK_ARM_NEON_IS_DYNAMIC - if (sk_cpu_arm_has_neon()) { - return memset32_neon; - } else { - return arm_memset32; - } -#elif SK_ARM_NEON_IS_ALWAYS - return memset32_neon; -#else - return arm_memset32; -#endif -} - -SkBlitRow::ColorRectProc PlatformColorRectProcFactory() { - return NULL; -} - -SkMorphologyProc SkMorphologyGetPlatformProc(SkMorphologyProcType type) { -#if SK_ARM_NEON_IS_NONE - return NULL; -#else -#if SK_ARM_NEON_IS_DYNAMIC - if (!sk_cpu_arm_has_neon()) { - return NULL; - } -#endif - switch (type) { - case kDilateX_SkMorphologyProcType: - return SkDilateX_neon; - case kDilateY_SkMorphologyProcType: - return SkDilateY_neon; - case kErodeX_SkMorphologyProcType: - return SkErodeX_neon; - case kErodeY_SkMorphologyProcType: - return SkErodeY_neon; - default: - return NULL; - } -#endif -} - -bool SkBoxBlurGetPlatformProcs(SkBoxBlurProc* boxBlurX, - SkBoxBlurProc* boxBlurY, - SkBoxBlurProc* boxBlurXY, - SkBoxBlurProc* boxBlurYX) { -#if SK_ARM_NEON_IS_NONE - return false; -#else -#if SK_ARM_NEON_IS_DYNAMIC - if (!sk_cpu_arm_has_neon()) { - return false; - } -#endif - return SkBoxBlurGetPlatformProcs_NEON(boxBlurX, boxBlurY, boxBlurXY, boxBlurYX); -#endif -} diff --git a/chromium/third_party/skia/src/opts/opts_check_x86.cpp b/chromium/third_party/skia/src/opts/opts_check_x86.cpp new file mode 100644 index 00000000000..6af47729cd0 --- /dev/null +++ b/chromium/third_party/skia/src/opts/opts_check_x86.cpp @@ -0,0 +1,379 @@ +/* + * Copyright 2009 The Android Open Source Project + * + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ + +#include "SkBitmapFilter_opts_SSE2.h" +#include "SkBitmapProcState_opts_SSE2.h" +#include "SkBitmapProcState_opts_SSSE3.h" +#include "SkBlitMask.h" +#include "SkBlitRect_opts_SSE2.h" +#include "SkBlitRow.h" +#include "SkBlitRow_opts_SSE2.h" +#include "SkBlurImage_opts_SSE2.h" +#include "SkMorphology_opts.h" +#include "SkMorphology_opts_SSE2.h" +#include "SkRTConf.h" +#include "SkUtils.h" +#include "SkUtils_opts_SSE2.h" +#include "SkXfermode.h" +#include "SkXfermode_proccoeff.h" + +#if defined(_MSC_VER) && defined(_WIN64) +#include <intrin.h> +#endif + +/* This file must *not* be compiled with -msse or any other optional SIMD + extension, otherwise gcc may generate SIMD instructions even for scalar ops + (and thus give an invalid instruction on Pentium3 on the code below). + For example, only files named *_SSE2.cpp in this directory should be + compiled with -msse2 or higher. */ + + +/* Function to get the CPU SSE-level in runtime, for different compilers. */ +#ifdef _MSC_VER +static inline void getcpuid(int info_type, int info[4]) { +#if defined(_WIN64) + __cpuid(info, info_type); +#else + __asm { + mov eax, [info_type] + cpuid + mov edi, [info] + mov [edi], eax + mov [edi+4], ebx + mov [edi+8], ecx + mov [edi+12], edx + } +#endif +} +#elif defined(__x86_64__) +static inline void getcpuid(int info_type, int info[4]) { + asm volatile ( + "cpuid \n\t" + : "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3]) + : "a"(info_type) + ); +} +#else +static inline void getcpuid(int info_type, int info[4]) { + // We save and restore ebx, so this code can be compatible with -fPIC + asm volatile ( + "pushl %%ebx \n\t" + "cpuid \n\t" + "movl %%ebx, %1 \n\t" + "popl %%ebx \n\t" + : "=a"(info[0]), "=r"(info[1]), "=c"(info[2]), "=d"(info[3]) + : "a"(info_type) + ); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// + +/* Fetch the SIMD level directly from the CPU, at run-time. + * Only checks the levels needed by the optimizations in this file. + */ +static int get_SIMD_level() { + int cpu_info[4] = { 0 }; + + getcpuid(1, cpu_info); + if ((cpu_info[2] & (1<<20)) != 0) { + return SK_CPU_SSE_LEVEL_SSE42; + } else if ((cpu_info[2] & (1<<9)) != 0) { + return SK_CPU_SSE_LEVEL_SSSE3; + } else if ((cpu_info[3] & (1<<26)) != 0) { + return SK_CPU_SSE_LEVEL_SSE2; + } else { + return 0; + } +} + +/* Verify that the requested SIMD level is supported in the build. + * If not, check if the platform supports it. + */ +static inline bool supports_simd(int minLevel) { +#if defined(SK_CPU_SSE_LEVEL) + if (minLevel <= SK_CPU_SSE_LEVEL) { + return true; + } else +#endif + { +#if defined(SK_BUILD_FOR_ANDROID_FRAMEWORK) + /* For the Android framework we should always know at compile time if the device + * we are building for supports SSSE3. The one exception to this rule is on the + * emulator where we are compiled without the -mssse3 option (so we have no + * SSSE3 procs) but can be run on a host machine that supports SSSE3 + * instructions. So for that particular case we disable our SSSE3 options. + */ + return false; +#else + static int gSIMDLevel = get_SIMD_level(); + return (minLevel <= gSIMDLevel); +#endif + } +} + +//////////////////////////////////////////////////////////////////////////////// + +SK_CONF_DECLARE( bool, c_hqfilter_sse, "bitmap.filter.highQualitySSE", false, "Use SSE optimized version of high quality image filters"); + +void SkBitmapProcState::platformConvolutionProcs(SkConvolutionProcs* procs) { + if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) { + procs->fExtraHorizontalReads = 3; + procs->fConvolveVertically = &convolveVertically_SSE2; + procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_SSE2; + procs->fConvolveHorizontally = &convolveHorizontally_SSE2; + procs->fApplySIMDPadding = &applySIMDPadding_SSE2; + } +} + +//////////////////////////////////////////////////////////////////////////////// + +void SkBitmapProcState::platformProcs() { + /* Every optimization in the function requires at least SSE2 */ + if (!supports_simd(SK_CPU_SSE_LEVEL_SSE2)) { + return; + } + + /* Check fSampleProc32 */ + if (fSampleProc32 == S32_opaque_D32_filter_DX) { + if (supports_simd(SK_CPU_SSE_LEVEL_SSSE3)) { + fSampleProc32 = S32_opaque_D32_filter_DX_SSSE3; + } else { + fSampleProc32 = S32_opaque_D32_filter_DX_SSE2; + } + } else if (fSampleProc32 == S32_opaque_D32_filter_DXDY) { + if (supports_simd(SK_CPU_SSE_LEVEL_SSSE3)) { + fSampleProc32 = S32_opaque_D32_filter_DXDY_SSSE3; + } + } else if (fSampleProc32 == S32_alpha_D32_filter_DX) { + if (supports_simd(SK_CPU_SSE_LEVEL_SSSE3)) { + fSampleProc32 = S32_alpha_D32_filter_DX_SSSE3; + } else { + fSampleProc32 = S32_alpha_D32_filter_DX_SSE2; + } + } else if (fSampleProc32 == S32_alpha_D32_filter_DXDY) { + if (supports_simd(SK_CPU_SSE_LEVEL_SSSE3)) { + fSampleProc32 = S32_alpha_D32_filter_DXDY_SSSE3; + } + } + + /* Check fSampleProc16 */ + if (fSampleProc16 == S32_D16_filter_DX) { + fSampleProc16 = S32_D16_filter_DX_SSE2; + } + + /* Check fMatrixProc */ + if (fMatrixProc == ClampX_ClampY_filter_scale) { + fMatrixProc = ClampX_ClampY_filter_scale_SSE2; + } else if (fMatrixProc == ClampX_ClampY_nofilter_scale) { + fMatrixProc = ClampX_ClampY_nofilter_scale_SSE2; + } else if (fMatrixProc == ClampX_ClampY_filter_affine) { + fMatrixProc = ClampX_ClampY_filter_affine_SSE2; + } else if (fMatrixProc == ClampX_ClampY_nofilter_affine) { + fMatrixProc = ClampX_ClampY_nofilter_affine_SSE2; + } + + /* Check fShaderProc32 */ + if (c_hqfilter_sse) { + if (fShaderProc32 == highQualityFilter32) { + fShaderProc32 = highQualityFilter_SSE2; + } + } +} + +//////////////////////////////////////////////////////////////////////////////// + +static SkBlitRow::Proc platform_16_procs[] = { + S32_D565_Opaque_SSE2, // S32_D565_Opaque + NULL, // S32_D565_Blend + S32A_D565_Opaque_SSE2, // S32A_D565_Opaque + NULL, // S32A_D565_Blend + S32_D565_Opaque_Dither_SSE2, // S32_D565_Opaque_Dither + NULL, // S32_D565_Blend_Dither + S32A_D565_Opaque_Dither_SSE2, // S32A_D565_Opaque_Dither + NULL, // S32A_D565_Blend_Dither +}; + +SkBlitRow::Proc SkBlitRow::PlatformProcs565(unsigned flags) { + if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) { + return platform_16_procs[flags]; + } else { + return NULL; + } +} + +static SkBlitRow::Proc32 platform_32_procs[] = { + NULL, // S32_Opaque, + S32_Blend_BlitRow32_SSE2, // S32_Blend, + S32A_Opaque_BlitRow32_SSE2, // S32A_Opaque + S32A_Blend_BlitRow32_SSE2, // S32A_Blend, +}; + +SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) { + if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) { + return platform_32_procs[flags]; + } else { + return NULL; + } +} + +SkBlitRow::ColorProc SkBlitRow::PlatformColorProc() { + if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) { + return Color32_SSE2; + } else { + return NULL; + } +} + +SkBlitRow::ColorRectProc PlatformColorRectProcFactory(); // suppress warning + +SkBlitRow::ColorRectProc PlatformColorRectProcFactory() { +/* Return NULL for now, since the optimized path in ColorRect32_SSE2 is disabled. + if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) { + return ColorRect32_SSE2; + } else { + return NULL; + } +*/ + return NULL; +} + +//////////////////////////////////////////////////////////////////////////////// + +SkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkColorType dstCT, + SkMask::Format maskFormat, + SkColor color) { + if (SkMask::kA8_Format != maskFormat) { + return NULL; + } + + ColorProc proc = NULL; + if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) { + switch (dstCT) { + case kN32_SkColorType: + // The SSE2 version is not (yet) faster for black, so we check + // for that. + if (SK_ColorBLACK != color) { + proc = SkARGB32_A8_BlitMask_SSE2; + } + break; + default: + break; + } + } + return proc; +} + +SkBlitMask::BlitLCD16RowProc SkBlitMask::PlatformBlitRowProcs16(bool isOpaque) { + if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) { + if (isOpaque) { + return SkBlitLCD16OpaqueRow_SSE2; + } else { + return SkBlitLCD16Row_SSE2; + } + } else { + return NULL; + } + +} + +SkBlitMask::RowProc SkBlitMask::PlatformRowProcs(SkColorType, SkMask::Format, RowFlags) { + return NULL; +} + +//////////////////////////////////////////////////////////////////////////////// + +SkMemset16Proc SkMemset16GetPlatformProc() { + if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) { + return sk_memset16_SSE2; + } else { + return NULL; + } +} + +SkMemset32Proc SkMemset32GetPlatformProc() { + if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) { + return sk_memset32_SSE2; + } else { + return NULL; + } +} + +SkMemcpy32Proc SkMemcpy32GetPlatformProc() { + if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) { + return sk_memcpy32_SSE2; + } else { + return NULL; + } +} + +//////////////////////////////////////////////////////////////////////////////// + +SkMorphologyImageFilter::Proc SkMorphologyGetPlatformProc(SkMorphologyProcType type) { + if (!supports_simd(SK_CPU_SSE_LEVEL_SSE2)) { + return NULL; + } + switch (type) { + case kDilateX_SkMorphologyProcType: + return SkDilateX_SSE2; + case kDilateY_SkMorphologyProcType: + return SkDilateY_SSE2; + case kErodeX_SkMorphologyProcType: + return SkErodeX_SSE2; + case kErodeY_SkMorphologyProcType: + return SkErodeY_SSE2; + default: + return NULL; + } +} + +//////////////////////////////////////////////////////////////////////////////// + +bool SkBoxBlurGetPlatformProcs(SkBoxBlurProc* boxBlurX, + SkBoxBlurProc* boxBlurY, + SkBoxBlurProc* boxBlurXY, + SkBoxBlurProc* boxBlurYX) { +#ifdef SK_DISABLE_BLUR_DIVISION_OPTIMIZATION + return false; +#else + if (!supports_simd(SK_CPU_SSE_LEVEL_SSE2)) { + return false; + } + return SkBoxBlurGetPlatformProcs_SSE2(boxBlurX, boxBlurY, boxBlurXY, boxBlurYX); +#endif +} + +//////////////////////////////////////////////////////////////////////////////// + +extern SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl_SSE2(const ProcCoeff& rec, + SkXfermode::Mode mode); + +SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl(const ProcCoeff& rec, + SkXfermode::Mode mode); + +SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl(const ProcCoeff& rec, + SkXfermode::Mode mode) { + return NULL; +} + +SkProcCoeffXfermode* SkPlatformXfermodeFactory(const ProcCoeff& rec, + SkXfermode::Mode mode); + +SkProcCoeffXfermode* SkPlatformXfermodeFactory(const ProcCoeff& rec, + SkXfermode::Mode mode) { + if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) { + return SkPlatformXfermodeFactory_impl_SSE2(rec, mode); + } else { + return SkPlatformXfermodeFactory_impl(rec, mode); + } +} + +SkXfermodeProc SkPlatformXfermodeProcFactory(SkXfermode::Mode mode); + +SkXfermodeProc SkPlatformXfermodeProcFactory(SkXfermode::Mode mode) { + return NULL; +} |