diff options
Diffstat (limited to 'src/3rdparty/libwebp/src/dsp/dec_neon.c')
-rw-r--r-- | src/3rdparty/libwebp/src/dsp/dec_neon.c | 397 |
1 files changed, 372 insertions, 25 deletions
diff --git a/src/3rdparty/libwebp/src/dsp/dec_neon.c b/src/3rdparty/libwebp/src/dsp/dec_neon.c index 4afae07..a63f43f 100644 --- a/src/3rdparty/libwebp/src/dsp/dec_neon.c +++ b/src/3rdparty/libwebp/src/dsp/dec_neon.c @@ -389,9 +389,9 @@ static WEBP_INLINE void Store4x8x2(const uint8x16_t p1, const uint8x16_t p0, #endif // !WORK_AROUND_GCC -// Treats 'v' as an uint8x8_t and zero extends to an int16x8_t. -static WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) { - return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v))); +// Zero extend 'v' to an int16x8_t. +static WEBP_INLINE int16x8_t ConvertU8ToS16(uint8x8_t v) { + return vreinterpretq_s16_u16(vmovl_u8(v)); } // Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result @@ -423,8 +423,8 @@ static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23, { // Convert to 16b. - const int16x8_t dst01_s16 = ConvertU8ToS16(dst01); - const int16x8_t dst23_s16 = ConvertU8ToS16(dst23); + const int16x8_t dst01_s16 = ConvertU8ToS16(vreinterpret_u8_u32(dst01)); + const int16x8_t dst23_s16 = ConvertU8ToS16(vreinterpret_u8_u32(dst23)); // Descale with rounding. const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3); @@ -479,6 +479,21 @@ static int8x16_t GetBaseDelta0(const int8x16_t p0, const int8x16_t q0) { //------------------------------------------------------------------------------ +static void ApplyFilter2NoFlip(const int8x16_t p0s, const int8x16_t q0s, + const int8x16_t delta, + int8x16_t* const op0, int8x16_t* const oq0) { + const int8x16_t kCst3 = vdupq_n_s8(0x03); + const int8x16_t kCst4 = vdupq_n_s8(0x04); + const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3); + const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4); + const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3); + const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3); + *op0 = vqaddq_s8(p0s, delta3); + *oq0 = vqsubq_s8(q0s, delta4); +} + +#if defined(WEBP_USE_INTRINSICS) + static void ApplyFilter2(const int8x16_t p0s, const int8x16_t q0s, const int8x16_t delta, uint8x16_t* const op0, uint8x16_t* const oq0) { @@ -494,8 +509,6 @@ static void ApplyFilter2(const int8x16_t p0s, const int8x16_t q0s, *oq0 = FlipSignBack(sq0); } -#if defined(USE_INTRINSICS) - static void DoFilter2(const uint8x16_t p1, const uint8x16_t p0, const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t mask, @@ -626,7 +639,7 @@ static void SimpleHFilter16(uint8_t* p, int stride, int thresh) { ); } -#endif // USE_INTRINSICS +#endif // WEBP_USE_INTRINSICS static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) { uint32_t k; @@ -721,11 +734,7 @@ static void DoFilter4( const int8x16_t delta = GetBaseDelta(p1s, p0s, q0s, q1s); const int8x16_t simple_lf_delta = vandq_s8(delta, vreinterpretq_s8_u8(simple_lf_mask)); - uint8x16_t tmp_p0, tmp_q0; - ApplyFilter2(p0s, q0s, simple_lf_delta, &tmp_p0, &tmp_q0); - // TODO(skal): avoid the double FlipSign() in ApplyFilter2() and here - p0s = FlipSign(tmp_p0); - q0s = FlipSign(tmp_q0); + ApplyFilter2NoFlip(p0s, q0s, simple_lf_delta, &p0s, &q0s); } // do_filter4 part (complex loopfilter on pixels without hev) @@ -797,11 +806,7 @@ static void DoFilter6( { const int8x16_t simple_lf_delta = vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask)); - uint8x16_t tmp_p0, tmp_q0; - ApplyFilter2(p0s, q0s, simple_lf_delta, &tmp_p0, &tmp_q0); - // TODO(skal): avoid the double FlipSign() in ApplyFilter2() and here - p0s = FlipSign(tmp_p0); - q0s = FlipSign(tmp_q0); + ApplyFilter2NoFlip(p0s, q0s, simple_lf_delta, &p0s, &q0s); } // do_filter6 part (complex loopfilter on pixels without hev) @@ -986,7 +991,7 @@ static void HFilter8i(uint8_t* u, uint8_t* v, int stride, static const int16_t kC1 = 20091; static const int16_t kC2 = 17734; // half of kC2, actually. See comment above. -#if defined(USE_INTRINSICS) +#if defined(WEBP_USE_INTRINSICS) static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1, int16x8x2_t* const out) { // a0 a1 a2 a3 | b0 b1 b2 b3 => a0 b0 c0 d0 | a1 b1 c1 d1 @@ -1163,7 +1168,7 @@ static void TransformOne(const int16_t* in, uint8_t* dst) { ); } -#endif // USE_INTRINSICS +#endif // WEBP_USE_INTRINSICS static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) { TransformOne(in, dst); @@ -1241,7 +1246,7 @@ static void TransformWHT(const int16_t* in, int16_t* out) { static void TransformAC3(const int16_t* in, uint8_t* dst) { static const int kC1_full = 20091 + (1 << 16); static const int kC2_full = 35468; - const int16x4_t A = vdup_n_s16(in[0]); + const int16x4_t A = vld1_dup_s16(in); const int16x4_t c4 = vdup_n_s16(MUL(in[4], kC2_full)); const int16x4_t d4 = vdup_n_s16(MUL(in[4], kC1_full)); const int c1 = MUL(in[1], kC2_full); @@ -1258,15 +1263,330 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) { } #undef MUL -#endif // WEBP_USE_NEON +//------------------------------------------------------------------------------ +// 4x4 + +static void DC4(uint8_t* dst) { // DC + const uint8x8_t A = vld1_u8(dst - BPS); // top row + const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top + const uint16x4_t p1 = vpadd_u16(p0, p0); + const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1)); + const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1)); + const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1)); + const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1)); + const uint16x8_t s0 = vaddq_u16(L0, L1); + const uint16x8_t s1 = vaddq_u16(L2, L3); + const uint16x8_t s01 = vaddq_u16(s0, s1); + const uint16x8_t sum = vaddq_u16(s01, vcombine_u16(p1, p1)); + const uint8x8_t dc0 = vrshrn_n_u16(sum, 3); // (sum + 4) >> 3 + const uint8x8_t dc = vdup_lane_u8(dc0, 0); + int i; + for (i = 0; i < 4; ++i) { + vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc), 0); + } +} + +// TrueMotion (4x4 + 8x8) +static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) { + const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1); // top-left pixel 'A[-1]' + const uint8x8_t T = vld1_u8(dst - BPS); // top row 'A[0..3]' + const int16x8_t d = vreinterpretq_s16_u16(vsubl_u8(T, TL)); // A[c] - A[-1] + int y; + for (y = 0; y < size; y += 4) { + // left edge + const int16x8_t L0 = ConvertU8ToS16(vld1_dup_u8(dst + 0 * BPS - 1)); + const int16x8_t L1 = ConvertU8ToS16(vld1_dup_u8(dst + 1 * BPS - 1)); + const int16x8_t L2 = ConvertU8ToS16(vld1_dup_u8(dst + 2 * BPS - 1)); + const int16x8_t L3 = ConvertU8ToS16(vld1_dup_u8(dst + 3 * BPS - 1)); + const int16x8_t r0 = vaddq_s16(L0, d); // L[r] + A[c] - A[-1] + const int16x8_t r1 = vaddq_s16(L1, d); + const int16x8_t r2 = vaddq_s16(L2, d); + const int16x8_t r3 = vaddq_s16(L3, d); + // Saturate and store the result. + const uint32x2_t r0_u32 = vreinterpret_u32_u8(vqmovun_s16(r0)); + const uint32x2_t r1_u32 = vreinterpret_u32_u8(vqmovun_s16(r1)); + const uint32x2_t r2_u32 = vreinterpret_u32_u8(vqmovun_s16(r2)); + const uint32x2_t r3_u32 = vreinterpret_u32_u8(vqmovun_s16(r3)); + if (size == 4) { + vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0_u32, 0); + vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1_u32, 0); + vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2_u32, 0); + vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3_u32, 0); + } else { + vst1_u32((uint32_t*)(dst + 0 * BPS), r0_u32); + vst1_u32((uint32_t*)(dst + 1 * BPS), r1_u32); + vst1_u32((uint32_t*)(dst + 2 * BPS), r2_u32); + vst1_u32((uint32_t*)(dst + 3 * BPS), r3_u32); + } + dst += 4 * BPS; + } +} + +static void TM4(uint8_t* dst) { TrueMotion(dst, 4); } + +static void VE4(uint8_t* dst) { // vertical + // NB: avoid vld1_u64 here as an alignment hint may be added -> SIGBUS. + const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(dst - BPS - 1)); // top row + const uint64x1_t A1 = vshr_n_u64(A0, 8); + const uint64x1_t A2 = vshr_n_u64(A0, 16); + const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0); + const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1); + const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2); + const uint8x8_t b = vhadd_u8(ABCDEFGH, CDEFGH00); + const uint8x8_t avg = vrhadd_u8(b, BCDEFGH0); + int i; + for (i = 0; i < 4; ++i) { + vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(avg), 0); + } +} + +static void RD4(uint8_t* dst) { // Down-right + const uint8x8_t XABCD_u8 = vld1_u8(dst - BPS - 1); + const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8); + const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32); + const uint32_t I = dst[-1 + 0 * BPS]; + const uint32_t J = dst[-1 + 1 * BPS]; + const uint32_t K = dst[-1 + 2 * BPS]; + const uint32_t L = dst[-1 + 3 * BPS]; + const uint64x1_t LKJI____ = vcreate_u64(L | (K << 8) | (J << 16) | (I << 24)); + const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC); + const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8)); + const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16)); + const uint8_t D = vget_lane_u8(XABCD_u8, 4); + const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6); + const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC); + const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8); + const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_); + const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2); + const uint32x2_t r3 = vreinterpret_u32_u8(avg2); + const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8)); + const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16)); + const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24)); + vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0); + vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0); + vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0); + vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0); +} + +static void LD4(uint8_t* dst) { // Down-left + // Note using the same shift trick as VE4() is slower here. + const uint8x8_t ABCDEFGH = vld1_u8(dst - BPS + 0); + const uint8x8_t BCDEFGH0 = vld1_u8(dst - BPS + 1); + const uint8x8_t CDEFGH00 = vld1_u8(dst - BPS + 2); + const uint8x8_t CDEFGHH0 = vset_lane_u8(dst[-BPS + 7], CDEFGH00, 6); + const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGHH0); + const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0); + const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2); + const uint32x2_t r0 = vreinterpret_u32_u8(avg2); + const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8)); + const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16)); + const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24)); + vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0); + vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0); + vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0); + vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0); +} + +//------------------------------------------------------------------------------ +// Chroma + +static void VE8uv(uint8_t* dst) { // vertical + const uint8x8_t top = vld1_u8(dst - BPS); + int j; + for (j = 0; j < 8; ++j) { + vst1_u8(dst + j * BPS, top); + } +} + +static void HE8uv(uint8_t* dst) { // horizontal + int j; + for (j = 0; j < 8; ++j) { + const uint8x8_t left = vld1_dup_u8(dst - 1); + vst1_u8(dst, left); + dst += BPS; + } +} + +static WEBP_INLINE void DC8(uint8_t* dst, int do_top, int do_left) { + uint16x8_t sum_top; + uint16x8_t sum_left; + uint8x8_t dc0; + + if (do_top) { + const uint8x8_t A = vld1_u8(dst - BPS); // top row + const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top + const uint16x4_t p1 = vpadd_u16(p0, p0); + const uint16x4_t p2 = vpadd_u16(p1, p1); + sum_top = vcombine_u16(p2, p2); + } + + if (do_left) { + const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1)); + const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1)); + const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1)); + const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1)); + const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + 4 * BPS - 1)); + const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + 5 * BPS - 1)); + const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + 6 * BPS - 1)); + const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + 7 * BPS - 1)); + const uint16x8_t s0 = vaddq_u16(L0, L1); + const uint16x8_t s1 = vaddq_u16(L2, L3); + const uint16x8_t s2 = vaddq_u16(L4, L5); + const uint16x8_t s3 = vaddq_u16(L6, L7); + const uint16x8_t s01 = vaddq_u16(s0, s1); + const uint16x8_t s23 = vaddq_u16(s2, s3); + sum_left = vaddq_u16(s01, s23); + } + + if (do_top && do_left) { + const uint16x8_t sum = vaddq_u16(sum_left, sum_top); + dc0 = vrshrn_n_u16(sum, 4); + } else if (do_top) { + dc0 = vrshrn_n_u16(sum_top, 3); + } else if (do_left) { + dc0 = vrshrn_n_u16(sum_left, 3); + } else { + dc0 = vdup_n_u8(0x80); + } + + { + const uint8x8_t dc = vdup_lane_u8(dc0, 0); + int i; + for (i = 0; i < 8; ++i) { + vst1_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc)); + } + } +} + +static void DC8uv(uint8_t* dst) { DC8(dst, 1, 1); } +static void DC8uvNoTop(uint8_t* dst) { DC8(dst, 0, 1); } +static void DC8uvNoLeft(uint8_t* dst) { DC8(dst, 1, 0); } +static void DC8uvNoTopLeft(uint8_t* dst) { DC8(dst, 0, 0); } + +static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); } + +//------------------------------------------------------------------------------ +// 16x16 + +static void VE16(uint8_t* dst) { // vertical + const uint8x16_t top = vld1q_u8(dst - BPS); + int j; + for (j = 0; j < 16; ++j) { + vst1q_u8(dst + j * BPS, top); + } +} + +static void HE16(uint8_t* dst) { // horizontal + int j; + for (j = 0; j < 16; ++j) { + const uint8x16_t left = vld1q_dup_u8(dst - 1); + vst1q_u8(dst, left); + dst += BPS; + } +} + +static WEBP_INLINE void DC16(uint8_t* dst, int do_top, int do_left) { + uint16x8_t sum_top; + uint16x8_t sum_left; + uint8x8_t dc0; + + if (do_top) { + const uint8x16_t A = vld1q_u8(dst - BPS); // top row + const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top + const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); + const uint16x4_t p2 = vpadd_u16(p1, p1); + const uint16x4_t p3 = vpadd_u16(p2, p2); + sum_top = vcombine_u16(p3, p3); + } + + if (do_left) { + int i; + sum_left = vdupq_n_u16(0); + for (i = 0; i < 16; i += 8) { + const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + (i + 0) * BPS - 1)); + const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + (i + 1) * BPS - 1)); + const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + (i + 2) * BPS - 1)); + const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + (i + 3) * BPS - 1)); + const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + (i + 4) * BPS - 1)); + const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + (i + 5) * BPS - 1)); + const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + (i + 6) * BPS - 1)); + const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + (i + 7) * BPS - 1)); + const uint16x8_t s0 = vaddq_u16(L0, L1); + const uint16x8_t s1 = vaddq_u16(L2, L3); + const uint16x8_t s2 = vaddq_u16(L4, L5); + const uint16x8_t s3 = vaddq_u16(L6, L7); + const uint16x8_t s01 = vaddq_u16(s0, s1); + const uint16x8_t s23 = vaddq_u16(s2, s3); + const uint16x8_t sum = vaddq_u16(s01, s23); + sum_left = vaddq_u16(sum_left, sum); + } + } + + if (do_top && do_left) { + const uint16x8_t sum = vaddq_u16(sum_left, sum_top); + dc0 = vrshrn_n_u16(sum, 5); + } else if (do_top) { + dc0 = vrshrn_n_u16(sum_top, 4); + } else if (do_left) { + dc0 = vrshrn_n_u16(sum_left, 4); + } else { + dc0 = vdup_n_u8(0x80); + } + + { + const uint8x16_t dc = vdupq_lane_u8(dc0, 0); + int i; + for (i = 0; i < 16; ++i) { + vst1q_u8(dst + i * BPS, dc); + } + } +} + +static void DC16TopLeft(uint8_t* dst) { DC16(dst, 1, 1); } +static void DC16NoTop(uint8_t* dst) { DC16(dst, 0, 1); } +static void DC16NoLeft(uint8_t* dst) { DC16(dst, 1, 0); } +static void DC16NoTopLeft(uint8_t* dst) { DC16(dst, 0, 0); } + +static void TM16(uint8_t* dst) { + const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1); // top-left pixel 'A[-1]' + const uint8x16_t T = vld1q_u8(dst - BPS); // top row 'A[0..15]' + // A[c] - A[-1] + const int16x8_t d_lo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(T), TL)); + const int16x8_t d_hi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(T), TL)); + int y; + for (y = 0; y < 16; y += 4) { + // left edge + const int16x8_t L0 = ConvertU8ToS16(vld1_dup_u8(dst + 0 * BPS - 1)); + const int16x8_t L1 = ConvertU8ToS16(vld1_dup_u8(dst + 1 * BPS - 1)); + const int16x8_t L2 = ConvertU8ToS16(vld1_dup_u8(dst + 2 * BPS - 1)); + const int16x8_t L3 = ConvertU8ToS16(vld1_dup_u8(dst + 3 * BPS - 1)); + const int16x8_t r0_lo = vaddq_s16(L0, d_lo); // L[r] + A[c] - A[-1] + const int16x8_t r1_lo = vaddq_s16(L1, d_lo); + const int16x8_t r2_lo = vaddq_s16(L2, d_lo); + const int16x8_t r3_lo = vaddq_s16(L3, d_lo); + const int16x8_t r0_hi = vaddq_s16(L0, d_hi); + const int16x8_t r1_hi = vaddq_s16(L1, d_hi); + const int16x8_t r2_hi = vaddq_s16(L2, d_hi); + const int16x8_t r3_hi = vaddq_s16(L3, d_hi); + // Saturate and store the result. + const uint8x16_t row0 = vcombine_u8(vqmovun_s16(r0_lo), vqmovun_s16(r0_hi)); + const uint8x16_t row1 = vcombine_u8(vqmovun_s16(r1_lo), vqmovun_s16(r1_hi)); + const uint8x16_t row2 = vcombine_u8(vqmovun_s16(r2_lo), vqmovun_s16(r2_hi)); + const uint8x16_t row3 = vcombine_u8(vqmovun_s16(r3_lo), vqmovun_s16(r3_hi)); + vst1q_u8(dst + 0 * BPS, row0); + vst1q_u8(dst + 1 * BPS, row1); + vst1q_u8(dst + 2 * BPS, row2); + vst1q_u8(dst + 3 * BPS, row3); + dst += 4 * BPS; + } +} //------------------------------------------------------------------------------ // Entry point extern void VP8DspInitNEON(void); -void VP8DspInitNEON(void) { -#if defined(WEBP_USE_NEON) +WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitNEON(void) { VP8Transform = TransformTwo; VP8TransformAC3 = TransformAC3; VP8TransformDC = TransformDC; @@ -1288,5 +1608,32 @@ void VP8DspInitNEON(void) { VP8SimpleHFilter16 = SimpleHFilter16; VP8SimpleVFilter16i = SimpleVFilter16i; VP8SimpleHFilter16i = SimpleHFilter16i; -#endif // WEBP_USE_NEON + + VP8PredLuma4[0] = DC4; + VP8PredLuma4[1] = TM4; + VP8PredLuma4[2] = VE4; + VP8PredLuma4[4] = RD4; + VP8PredLuma4[6] = LD4; + + VP8PredLuma16[0] = DC16TopLeft; + VP8PredLuma16[1] = TM16; + VP8PredLuma16[2] = VE16; + VP8PredLuma16[3] = HE16; + VP8PredLuma16[4] = DC16NoTop; + VP8PredLuma16[5] = DC16NoLeft; + VP8PredLuma16[6] = DC16NoTopLeft; + + VP8PredChroma8[0] = DC8uv; + VP8PredChroma8[1] = TM8uv; + VP8PredChroma8[2] = VE8uv; + VP8PredChroma8[3] = HE8uv; + VP8PredChroma8[4] = DC8uvNoTop; + VP8PredChroma8[5] = DC8uvNoLeft; + VP8PredChroma8[6] = DC8uvNoTopLeft; } + +#else // !WEBP_USE_NEON + +WEBP_DSP_INIT_STUB(VP8DspInitNEON) + +#endif // WEBP_USE_NEON |