diff options
Diffstat (limited to 'chromium/third_party/skia/src/opts/SkBitmapProcState_matrix_neon.h')
-rw-r--r-- | chromium/third_party/skia/src/opts/SkBitmapProcState_matrix_neon.h | 506 |
1 files changed, 506 insertions, 0 deletions
diff --git a/chromium/third_party/skia/src/opts/SkBitmapProcState_matrix_neon.h b/chromium/third_party/skia/src/opts/SkBitmapProcState_matrix_neon.h new file mode 100644 index 00000000000..72bf1bce336 --- /dev/null +++ b/chromium/third_party/skia/src/opts/SkBitmapProcState_matrix_neon.h @@ -0,0 +1,506 @@ + +#include <arm_neon.h> + + +#define SCALE_NOFILTER_NAME MAKENAME(_nofilter_scale) +#define SCALE_FILTER_NAME MAKENAME(_filter_scale) +#define AFFINE_NOFILTER_NAME MAKENAME(_nofilter_affine) +#define AFFINE_FILTER_NAME MAKENAME(_filter_affine) +#define PERSP_NOFILTER_NAME MAKENAME(_nofilter_persp) +#define PERSP_FILTER_NAME MAKENAME(_filter_persp) + +#define PACK_FILTER_X_NAME MAKENAME(_pack_filter_x) +#define PACK_FILTER_Y_NAME MAKENAME(_pack_filter_y) +#define PACK_FILTER_X4_NAME MAKENAME(_pack_filter_x4) +#define PACK_FILTER_Y4_NAME MAKENAME(_pack_filter_y4) + +#ifndef PREAMBLE + #define PREAMBLE(state) + #define PREAMBLE_PARAM_X + #define PREAMBLE_PARAM_Y + #define PREAMBLE_ARG_X + #define PREAMBLE_ARG_Y +#endif + +static void SCALE_NOFILTER_NAME(const SkBitmapProcState& s, + uint32_t xy[], int count, int x, int y) { + SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | + SkMatrix::kScale_Mask)) == 0); + + PREAMBLE(s); + + // we store y, x, x, x, x, x + const unsigned maxX = s.fBitmap->width() - 1; + SkFractionalInt fx; + { + SkPoint pt; + s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf, + SkIntToScalar(y) + SK_ScalarHalf, &pt); + fx = SkScalarToFractionalInt(pt.fY); + const unsigned maxY = s.fBitmap->height() - 1; + *xy++ = TILEY_PROCF(SkFractionalIntToFixed(fx), maxY); + fx = SkScalarToFractionalInt(pt.fX); + } + + if (0 == maxX) { + // all of the following X values must be 0 + memset(xy, 0, count * sizeof(uint16_t)); + return; + } + + const SkFractionalInt dx = s.fInvSxFractionalInt; + +#ifdef CHECK_FOR_DECAL + // test if we don't need to apply the tile proc + if (can_truncate_to_fixed_for_decal(fx, dx, count, maxX)) { + decal_nofilter_scale_neon(xy, SkFractionalIntToFixed(fx), + SkFractionalIntToFixed(dx), count); + return; + } +#endif + + if (count >= 8) { + SkFractionalInt dx2 = dx+dx; + SkFractionalInt dx4 = dx2+dx2; + SkFractionalInt dx8 = dx4+dx4; + + // now build fx/fx+dx/fx+2dx/fx+3dx + SkFractionalInt fx1, fx2, fx3; + int32x4_t lbase, hbase; + int16_t *dst16 = (int16_t *)xy; + + fx1 = fx+dx; + fx2 = fx1+dx; + fx3 = fx2+dx; + + lbase = vdupq_n_s32(SkFractionalIntToFixed(fx)); + lbase = vsetq_lane_s32(SkFractionalIntToFixed(fx1), lbase, 1); + lbase = vsetq_lane_s32(SkFractionalIntToFixed(fx2), lbase, 2); + lbase = vsetq_lane_s32(SkFractionalIntToFixed(fx3), lbase, 3); + hbase = vaddq_s32(lbase, vdupq_n_s32(SkFractionalIntToFixed(dx4))); + + // store & bump + while (count >= 8) { + + int16x8_t fx8; + + fx8 = TILEX_PROCF_NEON8(lbase, hbase, maxX); + + vst1q_s16(dst16, fx8); + + // but preserving base & on to the next + lbase = vaddq_s32 (lbase, vdupq_n_s32(SkFractionalIntToFixed(dx8))); + hbase = vaddq_s32 (hbase, vdupq_n_s32(SkFractionalIntToFixed(dx8))); + dst16 += 8; + count -= 8; + fx += dx8; + }; + xy = (uint32_t *) dst16; + } + + uint16_t* xx = (uint16_t*)xy; + for (int i = count; i > 0; --i) { + *xx++ = TILEX_PROCF(SkFractionalIntToFixed(fx), maxX); + fx += dx; + } +} + +static void AFFINE_NOFILTER_NAME(const SkBitmapProcState& s, + uint32_t xy[], int count, int x, int y) { + SkASSERT(s.fInvType & SkMatrix::kAffine_Mask); + SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | + SkMatrix::kScale_Mask | + SkMatrix::kAffine_Mask)) == 0); + + PREAMBLE(s); + SkPoint srcPt; + s.fInvProc(s.fInvMatrix, + SkIntToScalar(x) + SK_ScalarHalf, + SkIntToScalar(y) + SK_ScalarHalf, &srcPt); + + SkFractionalInt fx = SkScalarToFractionalInt(srcPt.fX); + SkFractionalInt fy = SkScalarToFractionalInt(srcPt.fY); + SkFractionalInt dx = s.fInvSxFractionalInt; + SkFractionalInt dy = s.fInvKyFractionalInt; + int maxX = s.fBitmap->width() - 1; + int maxY = s.fBitmap->height() - 1; + + if (count >= 8) { + SkFractionalInt dx4 = dx * 4; + SkFractionalInt dy4 = dy * 4; + SkFractionalInt dx8 = dx * 8; + SkFractionalInt dy8 = dy * 8; + + int32x4_t xbase, ybase; + int32x4_t x2base, y2base; + int16_t *dst16 = (int16_t *) xy; + + // now build fx, fx+dx, fx+2dx, fx+3dx + xbase = vdupq_n_s32(SkFractionalIntToFixed(fx)); + xbase = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx), xbase, 1); + xbase = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx), xbase, 2); + xbase = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx+dx), xbase, 3); + + // same for fy + ybase = vdupq_n_s32(SkFractionalIntToFixed(fy)); + ybase = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy), ybase, 1); + ybase = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy+dy), ybase, 2); + ybase = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy+dy+dy), ybase, 3); + + x2base = vaddq_s32(xbase, vdupq_n_s32(SkFractionalIntToFixed(dx4))); + y2base = vaddq_s32(ybase, vdupq_n_s32(SkFractionalIntToFixed(dy4))); + + // store & bump + do { + int16x8x2_t hi16; + + hi16.val[0] = TILEX_PROCF_NEON8(xbase, x2base, maxX); + hi16.val[1] = TILEY_PROCF_NEON8(ybase, y2base, maxY); + + vst2q_s16(dst16, hi16); + + // moving base and on to the next + xbase = vaddq_s32(xbase, vdupq_n_s32(SkFractionalIntToFixed(dx8))); + ybase = vaddq_s32(ybase, vdupq_n_s32(SkFractionalIntToFixed(dy8))); + x2base = vaddq_s32(x2base, vdupq_n_s32(SkFractionalIntToFixed(dx8))); + y2base = vaddq_s32(y2base, vdupq_n_s32(SkFractionalIntToFixed(dy8))); + + dst16 += 16; // 8x32 aka 16x16 + count -= 8; + fx += dx8; + fy += dy8; + } while (count >= 8); + xy = (uint32_t *) dst16; + } + + for (int i = count; i > 0; --i) { + *xy++ = (TILEY_PROCF(SkFractionalIntToFixed(fy), maxY) << 16) | + TILEX_PROCF(SkFractionalIntToFixed(fx), maxX); + fx += dx; fy += dy; + } +} + +static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s, + uint32_t* SK_RESTRICT xy, + int count, int x, int y) { + SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask); + + PREAMBLE(s); + // max{X,Y} are int here, but later shown/assumed to fit in 16 bits + int maxX = s.fBitmap->width() - 1; + int maxY = s.fBitmap->height() - 1; + + SkPerspIter iter(s.fInvMatrix, + SkIntToScalar(x) + SK_ScalarHalf, + SkIntToScalar(y) + SK_ScalarHalf, count); + + while ((count = iter.next()) != 0) { + const SkFixed* SK_RESTRICT srcXY = iter.getXY(); + + if (count >= 8) { + int32_t *mysrc = (int32_t *) srcXY; + int16_t *mydst = (int16_t *) xy; + do { + int16x8x2_t hi16; + int32x4x2_t xy1, xy2; + + xy1 = vld2q_s32(mysrc); + xy2 = vld2q_s32(mysrc+8); + + hi16.val[0] = TILEX_PROCF_NEON8(xy1.val[0], xy2.val[0], maxX); + hi16.val[1] = TILEY_PROCF_NEON8(xy1.val[1], xy2.val[1], maxY); + + vst2q_s16(mydst, hi16); + + count -= 8; // 8 iterations + mysrc += 16; // 16 longs + mydst += 16; // 16 shorts, aka 8 longs + } while (count >= 8); + // get xy and srcXY fixed up + srcXY = (const SkFixed *) mysrc; + xy = (uint32_t *) mydst; + } + + while (--count >= 0) { + *xy++ = (TILEY_PROCF(srcXY[1], maxY) << 16) | + TILEX_PROCF(srcXY[0], maxX); + srcXY += 2; + } + } +} + +static inline uint32_t PACK_FILTER_Y_NAME(SkFixed f, unsigned max, + SkFixed one PREAMBLE_PARAM_Y) { + unsigned i = TILEY_PROCF(f, max); + i = (i << 4) | TILEY_LOW_BITS(f, max); + return (i << 14) | (TILEY_PROCF((f + one), max)); +} + +static inline uint32_t PACK_FILTER_X_NAME(SkFixed f, unsigned max, + SkFixed one PREAMBLE_PARAM_X) { + unsigned i = TILEX_PROCF(f, max); + i = (i << 4) | TILEX_LOW_BITS(f, max); + return (i << 14) | (TILEX_PROCF((f + one), max)); +} + +static inline int32x4_t PACK_FILTER_X4_NAME(int32x4_t f, unsigned max, + SkFixed one PREAMBLE_PARAM_X) { + int32x4_t ret, res, wide_one; + + // Prepare constants + wide_one = vdupq_n_s32(one); + + // Step 1 + res = TILEX_PROCF_NEON4(f, max); + + // Step 2 + ret = TILEX_LOW_BITS_NEON4(f, max); + ret = vsliq_n_s32(ret, res, 4); + + // Step 3 + res = TILEX_PROCF_NEON4(f + wide_one, max); + ret = vorrq_s32(vshlq_n_s32(ret, 14), res); + + return ret; +} + +static inline int32x4_t PACK_FILTER_Y4_NAME(int32x4_t f, unsigned max, + SkFixed one PREAMBLE_PARAM_X) { + int32x4_t ret, res, wide_one; + + // Prepare constants + wide_one = vdupq_n_s32(one); + + // Step 1 + res = TILEY_PROCF_NEON4(f, max); + + // Step 2 + ret = TILEY_LOW_BITS_NEON4(f, max); + ret = vsliq_n_s32(ret, res, 4); + + // Step 3 + res = TILEY_PROCF_NEON4(f + wide_one, max); + ret = vorrq_s32(vshlq_n_s32(ret, 14), res); + + return ret; +} + +static void SCALE_FILTER_NAME(const SkBitmapProcState& s, + uint32_t xy[], int count, int x, int y) { + SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | + SkMatrix::kScale_Mask)) == 0); + SkASSERT(s.fInvKy == 0); + + PREAMBLE(s); + + const unsigned maxX = s.fBitmap->width() - 1; + const SkFixed one = s.fFilterOneX; + const SkFractionalInt dx = s.fInvSxFractionalInt; + SkFractionalInt fx; + + { + SkPoint pt; + s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf, + SkIntToScalar(y) + SK_ScalarHalf, &pt); + const SkFixed fy = SkScalarToFixed(pt.fY) - (s.fFilterOneY >> 1); + const unsigned maxY = s.fBitmap->height() - 1; + // compute our two Y values up front + *xy++ = PACK_FILTER_Y_NAME(fy, maxY, s.fFilterOneY PREAMBLE_ARG_Y); + // now initialize fx + fx = SkScalarToFractionalInt(pt.fX) - (SkFixedToFractionalInt(one) >> 1); + } + +#ifdef CHECK_FOR_DECAL + // test if we don't need to apply the tile proc + if (can_truncate_to_fixed_for_decal(fx, dx, count, maxX)) { + decal_filter_scale_neon(xy, SkFractionalIntToFixed(fx), + SkFractionalIntToFixed(dx), count); + return; + } +#endif + { + + if (count >= 4) { + int32x4_t wide_fx; + + wide_fx = vdupq_n_s32(SkFractionalIntToFixed(fx)); + wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx), wide_fx, 1); + wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx), wide_fx, 2); + wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx+dx), wide_fx, 3); + + while (count >= 4) { + int32x4_t res; + + res = PACK_FILTER_X4_NAME(wide_fx, maxX, one PREAMBLE_ARG_X); + + vst1q_u32(xy, vreinterpretq_u32_s32(res)); + + wide_fx += vdupq_n_s32(SkFractionalIntToFixed(dx+dx+dx+dx)); + fx += dx+dx+dx+dx; + xy += 4; + count -= 4; + } + } + + while (--count >= 0) { + *xy++ = PACK_FILTER_X_NAME(SkFractionalIntToFixed(fx), maxX, one PREAMBLE_ARG_X); + fx += dx; + } + + } +} + +static void AFFINE_FILTER_NAME(const SkBitmapProcState& s, + uint32_t xy[], int count, int x, int y) { + SkASSERT(s.fInvType & SkMatrix::kAffine_Mask); + SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | + SkMatrix::kScale_Mask | + SkMatrix::kAffine_Mask)) == 0); + + PREAMBLE(s); + SkPoint srcPt; + s.fInvProc(s.fInvMatrix, + SkIntToScalar(x) + SK_ScalarHalf, + SkIntToScalar(y) + SK_ScalarHalf, &srcPt); + + SkFixed oneX = s.fFilterOneX; + SkFixed oneY = s.fFilterOneY; + SkFixed fx = SkScalarToFixed(srcPt.fX) - (oneX >> 1); + SkFixed fy = SkScalarToFixed(srcPt.fY) - (oneY >> 1); + SkFixed dx = s.fInvSx; + SkFixed dy = s.fInvKy; + unsigned maxX = s.fBitmap->width() - 1; + unsigned maxY = s.fBitmap->height() - 1; + + if (count >= 4) { + int32x4_t wide_fy, wide_fx; + + wide_fx = vdupq_n_s32(fx); + wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1); + wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2); + wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3); + + wide_fy = vdupq_n_s32(fy); + wide_fy = vsetq_lane_s32(fy+dy, wide_fy, 1); + wide_fy = vsetq_lane_s32(fy+dy+dy, wide_fy, 2); + wide_fy = vsetq_lane_s32(fy+dy+dy+dy, wide_fy, 3); + + while (count >= 4) { + int32x4x2_t vxy; + + // do the X side, then the Y side, then interleave them + vxy.val[0] = PACK_FILTER_Y4_NAME(wide_fy, maxY, oneY PREAMBLE_ARG_Y); + vxy.val[1] = PACK_FILTER_X4_NAME(wide_fx, maxX, oneX PREAMBLE_ARG_X); + + // interleave as YXYXYXYX as part of the storing + vst2q_s32((int32_t*)xy, vxy); + + // prepare next iteration + wide_fx += vdupq_n_s32(dx+dx+dx+dx); + fx += dx + dx + dx + dx; + wide_fy += vdupq_n_s32(dy+dy+dy+dy); + fy += dy+dy+dy+dy; + xy += 8; // 4 x's, 4 y's + count -= 4; + } + } + + while (--count >= 0) { + // NB: writing Y/X + *xy++ = PACK_FILTER_Y_NAME(fy, maxY, oneY PREAMBLE_ARG_Y); + fy += dy; + *xy++ = PACK_FILTER_X_NAME(fx, maxX, oneX PREAMBLE_ARG_X); + fx += dx; + } +} + +static void PERSP_FILTER_NAME(const SkBitmapProcState& s, + uint32_t* SK_RESTRICT xy, int count, + int x, int y) { + SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask); + + PREAMBLE(s); + unsigned maxX = s.fBitmap->width() - 1; + unsigned maxY = s.fBitmap->height() - 1; + SkFixed oneX = s.fFilterOneX; + SkFixed oneY = s.fFilterOneY; + + SkPerspIter iter(s.fInvMatrix, + SkIntToScalar(x) + SK_ScalarHalf, + SkIntToScalar(y) + SK_ScalarHalf, count); + + while ((count = iter.next()) != 0) { + const SkFixed* SK_RESTRICT srcXY = iter.getXY(); + + while (count >= 4) { + int32x4_t wide_x, wide_y; + int32x4x2_t vxy, vresyx; + + // load src: x-y-x-y-x-y-x-y + vxy = vld2q_s32(srcXY); + + // do the X side, then the Y side, then interleave them + wide_x = vsubq_s32(vxy.val[0], vdupq_n_s32(oneX>>1)); + wide_y = vsubq_s32(vxy.val[1], vdupq_n_s32(oneY>>1)); + + vresyx.val[0] = PACK_FILTER_Y4_NAME(wide_y, maxY, oneY PREAMBLE_ARG_Y); + vresyx.val[1] = PACK_FILTER_X4_NAME(wide_x, maxX, oneX PREAMBLE_ARG_X); + + // store interleaved as y-x-y-x-y-x-y-x (NB != read order) + vst2q_s32((int32_t*)xy, vresyx); + + // on to the next iteration + srcXY += 2*4; + count -= 4; + xy += 2*4; + } + + while (--count >= 0) { + // NB: we read x/y, we write y/x + *xy++ = PACK_FILTER_Y_NAME(srcXY[1] - (oneY >> 1), maxY, + oneY PREAMBLE_ARG_Y); + *xy++ = PACK_FILTER_X_NAME(srcXY[0] - (oneX >> 1), maxX, + oneX PREAMBLE_ARG_X); + srcXY += 2; + } + } +} + +const SkBitmapProcState::MatrixProc MAKENAME(_Procs)[] = { + SCALE_NOFILTER_NAME, + SCALE_FILTER_NAME, + AFFINE_NOFILTER_NAME, + AFFINE_FILTER_NAME, + PERSP_NOFILTER_NAME, + PERSP_FILTER_NAME +}; + +#undef TILEX_PROCF_NEON8 +#undef TILEY_PROCF_NEON8 +#undef TILEX_PROCF_NEON4 +#undef TILEY_PROCF_NEON4 +#undef TILEX_LOW_BITS_NEON4 +#undef TILEY_LOW_BITS_NEON4 + +#undef MAKENAME +#undef TILEX_PROCF +#undef TILEY_PROCF +#ifdef CHECK_FOR_DECAL + #undef CHECK_FOR_DECAL +#endif + +#undef SCALE_NOFILTER_NAME +#undef SCALE_FILTER_NAME +#undef AFFINE_NOFILTER_NAME +#undef AFFINE_FILTER_NAME +#undef PERSP_NOFILTER_NAME +#undef PERSP_FILTER_NAME + +#undef PREAMBLE +#undef PREAMBLE_PARAM_X +#undef PREAMBLE_PARAM_Y +#undef PREAMBLE_ARG_X +#undef PREAMBLE_ARG_Y + +#undef TILEX_LOW_BITS +#undef TILEY_LOW_BITS |