summary | refs | log | tree | commit | diff | stats
path: root/src/3rdparty/libwebp/src/dsp/rescaler_msa.c
diff options
context:
space:
mode:
author: Liang Qi <liang.qi@qt.io> 2017-03-07 13:05:21 +0100
committer: Liang Qi <liang.qi@qt.io> 2017-03-13 10:47:45 +0000
commit: b7ec9e78633d8f2c75a8b02e17e169497bb103e2 (patch)
tree: e4be04af4dbcf8cd635715efdf4e769281183746 /src/3rdparty/libwebp/src/dsp/rescaler_msa.c
parent: f2dbc67c2b032a5f27d0224e020fb6dfcd3fd142 (diff)
Bundled libwebp updated to version 0.6.0
This commit imports libwebp 0.6.0, including AUTHORS, COPYING, ChangeLog, NEWS, PATENTS, README and src directories. In src, only includes header and source files. Upstream changes since 0.5.1 have been merged in. Also updated version in qt_attribution.json.

Conflicts:
  src/3rdparty/libwebp.pri
  src/3rdparty/libwebp/qt_attribution.json
  src/3rdparty/libwebp/src/webp/config.h

Change-Id: I001aa7a3fabf0130b54f9005c23aa822bc1d0ec1
Reviewed-by: Eirik Aavitsland <eirik.aavitsland@qt.io>
Diffstat (limited to 'src/3rdparty/libwebp/src/dsp/rescaler_msa.c')
-rw-r--r-- src/3rdparty/libwebp/src/dsp/rescaler_msa.c 444
1 files changed, 444 insertions, 0 deletions
diff --git a/src/3rdparty/libwebp/src/dsp/rescaler_msa.c b/src/3rdparty/libwebp/src/dsp/rescaler_msa.c
new file mode 100644
index 0000000..2c10e55
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/rescaler_msa.c
@@ -0,0 +1,444 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MSA version of rescaling functions
+//
+// Author: Prashant Patil (prashant.patil@imgtec.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_MSA)
+
+#include <assert.h>
+
+#include "../utils/rescaler_utils.h"
+#include "./msa_macro.h"
+
+// Half of WEBP_RESCALER_ONE; MULT_FIX adds it before shifting.
+// NOTE(review): this rounds to nearest only if WEBP_RESCALER_ONE equals
+// 1 << WEBP_RESCALER_RFIX (both are defined outside this file) -- confirm.
+#define ROUNDER (WEBP_RESCALER_ONE >> 1)
+// Scalar fixed-point multiply: widens 'x' to 64 bits, multiplies by 'y', adds
+// ROUNDER and shifts the sum right by WEBP_RESCALER_RFIX.
+#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
+
+// Scales sixteen 32-bit values, supplied as the four vectors in0..in3, and
+// stores the sixteen resulting bytes in the v16u8 'dst'. The callers in this
+// file use it on the vector path where their scalar tail uses MULT_FIX.
+// Each input is interleaved with 'zero', dot-multiplied with 'scale' into
+// 64-bit lanes, passed through SRAR_D4_UD with 'shift', and then packed down
+// with successive PCKEV_B steps.
+// Requirements on the call site:
+//  - a variable named 'zero' must be in scope (it is not a macro parameter);
+//  - arguments must not be named tmp0..tmp3, t0..t5 or out0..out3, because the
+//    macro declares locals with those names.
+// NOTE(review): the ILVRL/DOTP/SRAR/PCKEV helpers are defined outside this
+// file (presumably msa_macro.h); SRAR is presumably a rounding right shift,
+// which would make this match MULT_FIX exactly -- confirm.
+#define CALC_MULT_FIX_16(in0, in1, in2, in3, scale, shift, dst) do { \
+ v4u32 tmp0, tmp1, tmp2, tmp3; \
+ v16u8 t0, t1, t2, t3, t4, t5; \
+ v2u64 out0, out1, out2, out3; \
+ ILVRL_W2_UW(zero, in0, tmp0, tmp1); \
+ ILVRL_W2_UW(zero, in1, tmp2, tmp3); \
+ DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1); \
+ DOTP_UW2_UD(tmp2, tmp3, scale, scale, out2, out3); \
+ SRAR_D4_UD(out0, out1, out2, out3, shift); \
+ PCKEV_B2_UB(out1, out0, out3, out2, t0, t1); \
+ ILVRL_W2_UW(zero, in2, tmp0, tmp1); \
+ ILVRL_W2_UW(zero, in3, tmp2, tmp3); \
+ DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1); \
+ DOTP_UW2_UD(tmp2, tmp3, scale, scale, out2, out3); \
+ SRAR_D4_UD(out0, out1, out2, out3, shift); \
+ PCKEV_B2_UB(out1, out0, out3, out2, t2, t3); \
+ PCKEV_B2_UB(t1, t0, t3, t2, t4, t5); \
+ dst = (v16u8)__msa_pckev_b((v16i8)t5, (v16i8)t4); \
+} while (0)
+
+// Four-value variant of CALC_MULT_FIX_16: scales the four 32-bit values of
+// the vector 'in0' by 'scale' and assigns to 'dst' the 32-bit word taken from
+// element 0 of the packed result (callers store it as four output bytes).
+// Requirements on the call site:
+//  - a variable named 'zero' must be in scope (it is not a macro parameter);
+//  - arguments must not be named tmp0, tmp1, t0, t1, out0 or out1, because
+//    the macro declares locals with those names.
+// NOTE(review): byte order of the four results inside the word depends on the
+// PCKEV_B semantics and target endianness -- confirm against msa_macro.h.
+#define CALC_MULT_FIX_4(in0, scale, shift, dst) do { \
+ v4u32 tmp0, tmp1; \
+ v16i8 t0, t1; \
+ v2u64 out0, out1; \
+ ILVRL_W2_UW(zero, in0, tmp0, tmp1); \
+ DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1); \
+ SRAR_D2_UD(out0, out1, shift); \
+ t0 = __msa_pckev_b((v16i8)out1, (v16i8)out0); \
+ t1 = __msa_pckev_b(t0, t0); \
+ t0 = __msa_pckev_b(t1, t1); \
+ dst = __msa_copy_s_w((v4i32)t0, 0); \
+} while (0)
+
+// Scales sixteen 32-bit values (vectors in0..in3) by 'fyscale' like
+// CALC_MULT_FIX_16, but packs the results with PCKEV_W2_UW into the four
+// vectors dst0..dst3 instead of narrowing them to bytes. ExportRowShrink_0
+// uses it for the 'frac' values that are subtracted from and written back to
+// 'irow'.
+// Requirements on the call site:
+//  - a variable named 'zero' must be in scope (it is not a macro parameter);
+//  - arguments must not be named tmp0..tmp3 or out0..out3.
+// NOTE(review): presumably each dstN holds four 32-bit results in input
+// order -- confirm against the PCKEV_W2_UW definition.
+#define CALC_MULT_FIX1_16(in0, in1, in2, in3, fyscale, shift, \
+ dst0, dst1, dst2, dst3) do { \
+ v4u32 tmp0, tmp1, tmp2, tmp3; \
+ v2u64 out0, out1, out2, out3; \
+ ILVRL_W2_UW(zero, in0, tmp0, tmp1); \
+ ILVRL_W2_UW(zero, in1, tmp2, tmp3); \
+ DOTP_UW2_UD(tmp0, tmp1, fyscale, fyscale, out0, out1); \
+ DOTP_UW2_UD(tmp2, tmp3, fyscale, fyscale, out2, out3); \
+ SRAR_D4_UD(out0, out1, out2, out3, shift); \
+ PCKEV_W2_UW(out1, out0, out3, out2, dst0, dst1); \
+ ILVRL_W2_UW(zero, in2, tmp0, tmp1); \
+ ILVRL_W2_UW(zero, in3, tmp2, tmp3); \
+ DOTP_UW2_UD(tmp0, tmp1, fyscale, fyscale, out0, out1); \
+ DOTP_UW2_UD(tmp2, tmp3, fyscale, fyscale, out2, out3); \
+ SRAR_D4_UD(out0, out1, out2, out3, shift); \
+ PCKEV_W2_UW(out1, out0, out3, out2, dst2, dst3); \
+} while (0)
+
+// Four-value variant of CALC_MULT_FIX1_16: scales the four 32-bit values of
+// 'in0' by 'scale' and assigns the result, packed with __msa_pckev_w, to the
+// v4u32 'dst'.
+// Requirements on the call site:
+//  - a variable named 'zero' must be in scope (it is not a macro parameter);
+//  - arguments must not be named tmp0, tmp1, out0 or out1.
+#define CALC_MULT_FIX1_4(in0, scale, shift, dst) do { \
+ v4u32 tmp0, tmp1; \
+ v2u64 out0, out1; \
+ ILVRL_W2_UW(zero, in0, tmp0, tmp1); \
+ DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1); \
+ SRAR_D2_UD(out0, out1, shift); \
+ dst = (v4u32)__msa_pckev_w((v4i32)out1, (v4i32)out0); \
+} while (0)
+
+// Two-stage fixed-point step over eight value pairs. in0/in1 are interleaved
+// with in2/in3 respectively, dot-multiplied with 'mult', shifted with SRAR,
+// then multiplied by 'scale' and shifted again; the results are packed with
+// PCKEV_B2_UB into dst0 and dst1.
+// ExportRowExpand_1 calls it with in0/in1 from 'frow', in2/in3 from 'irow'
+// and mult = the interleaved A/B weights, i.e. as the vector counterpart of
+// its scalar tail (blend, then MULT_FIX by fy_scale).
+// Unlike the CALC_MULT_FIX_* macros above, this one does not reference
+// 'zero'. Arguments must not be named tmp0..tmp3 or out0..out3.
+// NOTE(review): dst0/dst1 are only partially narrowed here; the caller packs
+// them further before storing -- confirm lane layout against msa_macro.h.
+#define CALC_MULT_FIX2_16(in0, in1, in2, in3, mult, scale, shift, \
+ dst0, dst1) do { \
+ v4u32 tmp0, tmp1, tmp2, tmp3; \
+ v2u64 out0, out1, out2, out3; \
+ ILVRL_W2_UW(in0, in2, tmp0, tmp1); \
+ ILVRL_W2_UW(in1, in3, tmp2, tmp3); \
+ DOTP_UW2_UD(tmp0, tmp1, mult, mult, out0, out1); \
+ DOTP_UW2_UD(tmp2, tmp3, mult, mult, out2, out3); \
+ SRAR_D4_UD(out0, out1, out2, out3, shift); \
+ DOTP_UW2_UD(out0, out1, scale, scale, out0, out1); \
+ DOTP_UW2_UD(out2, out3, scale, scale, out2, out3); \
+ SRAR_D4_UD(out0, out1, out2, out3, shift); \
+ PCKEV_B2_UB(out1, out0, out3, out2, dst0, dst1); \
+} while (0)
+
+// Four-pair variant of CALC_MULT_FIX2_16: interleaves 'in0' with 'in1',
+// dot-multiplies with 'mult', shifts, multiplies by 'scale', shifts again,
+// and assigns to 'dst' the 32-bit word taken from element 0 of the packed
+// result (callers store it as four output bytes).
+// Does not reference 'zero'. Arguments must not be named tmp0, tmp1, out0,
+// out1, t0 or t1, because the macro declares locals with those names.
+#define CALC_MULT_FIX2_4(in0, in1, mult, scale, shift, dst) do { \
+ v4u32 tmp0, tmp1; \
+ v2u64 out0, out1; \
+ v16i8 t0, t1; \
+ ILVRL_W2_UW(in0, in1, tmp0, tmp1); \
+ DOTP_UW2_UD(tmp0, tmp1, mult, mult, out0, out1); \
+ SRAR_D2_UD(out0, out1, shift); \
+ DOTP_UW2_UD(out0, out1, scale, scale, out0, out1); \
+ SRAR_D2_UD(out0, out1, shift); \
+ t0 = __msa_pckev_b((v16i8)out1, (v16i8)out0); \
+ t1 = __msa_pckev_b(t0, t0); \
+ t0 = __msa_pckev_b(t1, t1); \
+ dst = __msa_copy_s_w((v4i32)t0, 0); \
+} while (0)
+
+// Writes 'length' bytes to 'dst', where dst[i] is frow[i] scaled by
+// wrk->fy_scale (MULT_FIX in the scalar tail, CALC_MULT_FIX_* on the vector
+// path). Values are consumed 16 at a time, then at most one group of 12, 8 or
+// 4, and the final 0..3 values one by one.
+static WEBP_INLINE void ExportRowExpand_0(const uint32_t* frow, uint8_t* dst,
+                                          int length,
+                                          WebPRescaler* const wrk) {
+  const v4i32 zero = { 0 };  // picked up by name inside CALC_MULT_FIX_*
+  const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
+  const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale);
+  int x_out;
+
+  // Bulk: sixteen values per pass.
+  for (; length >= 16; length -= 16, frow += 16, dst += 16) {
+    v4u32 in_a, in_b, in_c, in_d;
+    v16u8 packed;
+    LD_UW4(frow, 4, in_a, in_b, in_c, in_d);
+    CALC_MULT_FIX_16(in_a, in_b, in_c, in_d, scale, shift, packed);
+    ST_UB(packed, dst);
+  }
+  if (length <= 0) return;
+
+  // 1..15 values remain: take at most one step of 12, 8 or 4 values.
+  switch (length >> 2) {
+    case 3: {
+      uint32_t pix_a, pix_b, pix_c;
+      v4u32 in_a, in_b, in_c;
+      LD_UW3(frow, 4, in_a, in_b, in_c);
+      CALC_MULT_FIX_4(in_a, scale, shift, pix_a);
+      CALC_MULT_FIX_4(in_b, scale, shift, pix_b);
+      CALC_MULT_FIX_4(in_c, scale, shift, pix_c);
+      SW3(pix_a, pix_b, pix_c, dst, 4);
+      frow += 12;
+      dst += 12;
+      length -= 12;
+      break;
+    }
+    case 2: {
+      uint32_t pix_a, pix_b;
+      v4u32 in_a, in_b;
+      LD_UW2(frow, 4, in_a, in_b);
+      CALC_MULT_FIX_4(in_a, scale, shift, pix_a);
+      CALC_MULT_FIX_4(in_b, scale, shift, pix_b);
+      SW2(pix_a, pix_b, dst, 4);
+      frow += 8;
+      dst += 8;
+      length -= 8;
+      break;
+    }
+    case 1: {
+      uint32_t pix_a;
+      const v4u32 in_a = LD_UW(frow);
+      CALC_MULT_FIX_4(in_a, scale, shift, pix_a);
+      SW(pix_a, dst);
+      frow += 4;
+      dst += 4;
+      length -= 4;
+      break;
+    }
+    default:
+      break;
+  }
+
+  // Scalar tail for the last 0..3 values.
+  for (x_out = 0; x_out < length; ++x_out) {
+    const uint32_t sample = frow[x_out];
+    const int v = (int)MULT_FIX(sample, wrk->fy_scale);
+    assert(v >= 0 && v <= 255);
+    dst[x_out] = v;
+  }
+}
+
+// Writes 'length' bytes to 'dst'. For each index i the scalar formula is
+//   I = A * frow[i] + B * irow[i]
+//   J = (I + ROUNDER) >> WEBP_RESCALER_RFIX
+//   dst[i] = MULT_FIX(J, wrk->fy_scale)
+// with B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub) and
+// A = WEBP_RESCALER_ONE - B. The vector path (CALC_MULT_FIX2_*) is used for
+// groups of 16, then at most one group of 12, 8 or 4; the last 0..3 values
+// use the scalar formula. Neither 'frow' nor 'irow' is written.
+static WEBP_INLINE void ExportRowExpand_1(const uint32_t* frow, uint32_t* irow,
+                                          uint8_t* dst, int length,
+                                          WebPRescaler* const wrk) {
+  const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
+  const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
+  const v4i32 B1 = __msa_fill_w(B);
+  const v4i32 A1 = __msa_fill_w(A);
+  // A and B interleaved, so one dot product against interleaved frow/irow
+  // values yields A * frow + B * irow per 64-bit lane.
+  const v4i32 AB = __msa_ilvr_w(A1, B1);
+  const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale);
+  const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
+
+  while (length >= 16) {
+    v4u32 frow0, frow1, frow2, frow3, irow0, irow1, irow2, irow3;
+    v16u8 t0, t1, t2, t3, t4, t5;
+    LD_UW4(frow, 4, frow0, frow1, frow2, frow3);
+    LD_UW4(irow, 4, irow0, irow1, irow2, irow3);
+    CALC_MULT_FIX2_16(frow0, frow1, irow0, irow1, AB, scale, shift, t0, t1);
+    CALC_MULT_FIX2_16(frow2, frow3, irow2, irow3, AB, scale, shift, t2, t3);
+    // Narrow the four partial results down to sixteen bytes.
+    PCKEV_B2_UB(t1, t0, t3, t2, t4, t5);
+    t0 = (v16u8)__msa_pckev_b((v16i8)t5, (v16i8)t4);
+    ST_UB(t0, dst);
+    frow += 16;
+    irow += 16;
+    dst += 16;
+    length -= 16;
+  }
+  if (length > 0) {
+    int x_out;
+    if (length >= 12) {
+      uint32_t val0_m, val1_m, val2_m;
+      v4u32 frow0, frow1, frow2, irow0, irow1, irow2;
+      LD_UW3(frow, 4, frow0, frow1, frow2);
+      LD_UW3(irow, 4, irow0, irow1, irow2);
+      CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
+      CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m);
+      CALC_MULT_FIX2_4(frow2, irow2, AB, scale, shift, val2_m);
+      SW3(val0_m, val1_m, val2_m, dst, 4);
+      frow += 12;
+      irow += 12;
+      dst += 12;
+      length -= 12;
+    } else if (length >= 8) {
+      uint32_t val0_m, val1_m;
+      v4u32 frow0, frow1, irow0, irow1;
+      LD_UW2(frow, 4, frow0, frow1);
+      LD_UW2(irow, 4, irow0, irow1);
+      CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
+      CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m);
+      SW2(val0_m, val1_m, dst, 4);
+      // Eight values were loaded and stored above, so step past all eight.
+      // Stepping by only four would make the scalar loop below recompute
+      // values that are already written.
+      frow += 8;
+      irow += 8;
+      dst += 8;
+      length -= 8;
+    } else if (length >= 4) {
+      uint32_t val0_m;
+      const v4u32 frow0 = LD_UW(frow + 0);
+      const v4u32 irow0 = LD_UW(irow + 0);
+      CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
+      SW(val0_m, dst);
+      frow += 4;
+      irow += 4;
+      dst += 4;
+      length -= 4;
+    }
+    // Scalar tail for the last 0..3 values.
+    for (x_out = 0; x_out < length; ++x_out) {
+      const uint64_t I = (uint64_t)A * frow[x_out]
+                       + (uint64_t)B * irow[x_out];
+      const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
+      const int v = (int)MULT_FIX(J, wrk->fy_scale);
+      assert(v >= 0 && v <= 255);
+      dst[x_out] = v;
+    }
+  }
+}
+
+// Exports one row of wrk->dst_width * wrk->num_channels bytes to wrk->dst for
+// the vertical-expansion case. When wrk->y_accum is zero only wrk->frow is
+// scaled; otherwise wrk->frow and wrk->irow are blended first.
+static void RescalerExportRowExpand(WebPRescaler* const wrk) {
+  const int x_out_max = wrk->dst_width * wrk->num_channels;
+  const rescaler_t* const frow = wrk->frow;
+  rescaler_t* const irow = wrk->irow;
+  uint8_t* const dst = wrk->dst;
+  assert(!WebPRescalerOutputDone(wrk));
+  assert(wrk->y_accum <= 0);
+  assert(wrk->y_expand);
+  assert(wrk->y_sub != 0);
+  if (wrk->y_accum != 0) {
+    ExportRowExpand_1(frow, irow, dst, x_out_max, wrk);
+  } else {
+    ExportRowExpand_0(frow, dst, x_out_max, wrk);
+  }
+}
+
+// Writes 'length' bytes to 'dst' and updates 'irow' in place. For each index
+// i the scalar formula is
+//   frac    = MULT_FIX(frow[i], yscale)
+//   dst[i]  = MULT_FIX(irow[i] - frac, wrk->fxy_scale)
+//   irow[i] = frac
+// Values are consumed 16 at a time, then at most one group of 12, 8 or 4
+// (vector path), and the final 0..3 values with the scalar formula.
+static WEBP_INLINE void ExportRowShrink_0(const uint32_t* frow, uint32_t* irow,
+                                          uint8_t* dst, int length,
+                                          const uint32_t yscale,
+                                          WebPRescaler* const wrk) {
+  const v4i32 zero = { 0 };  // picked up by name inside CALC_MULT_FIX*
+  const v4u32 v_shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
+  const v4u32 v_yscale = (v4u32)__msa_fill_w(yscale);
+  const v4u32 v_fxy = (v4u32)__msa_fill_w(wrk->fxy_scale);
+  int x_out;
+
+  // Bulk: sixteen values per pass.
+  for (; length >= 16; length -= 16, frow += 16, irow += 16, dst += 16) {
+    v4u32 acc_a, acc_b, acc_c, acc_d, frac_a, frac_b, frac_c, frac_d;
+    v16u8 packed;
+    LD_UW4(frow, 4, acc_a, acc_b, acc_c, acc_d);
+    CALC_MULT_FIX1_16(acc_a, acc_b, acc_c, acc_d, v_yscale, v_shift,
+                      frac_a, frac_b, frac_c, frac_d);
+    LD_UW4(irow, 4, acc_a, acc_b, acc_c, acc_d);
+    SUB4(acc_a, frac_a, acc_b, frac_b, acc_c, frac_c, acc_d, frac_d,
+         acc_a, acc_b, acc_c, acc_d);
+    CALC_MULT_FIX_16(acc_a, acc_b, acc_c, acc_d, v_fxy, v_shift, packed);
+    ST_UB(packed, dst);
+    ST_UW4(frac_a, frac_b, frac_c, frac_d, irow, 4);
+  }
+  if (length <= 0) return;
+
+  // 1..15 values remain: take at most one step of 12, 8 or 4 values.
+  switch (length >> 2) {
+    case 3: {
+      uint32_t pix_a, pix_b, pix_c;
+      v4u32 acc_a, acc_b, acc_c, frac_a, frac_b, frac_c;
+      LD_UW3(frow, 4, acc_a, acc_b, acc_c);
+      CALC_MULT_FIX1_4(acc_a, v_yscale, v_shift, frac_a);
+      CALC_MULT_FIX1_4(acc_b, v_yscale, v_shift, frac_b);
+      CALC_MULT_FIX1_4(acc_c, v_yscale, v_shift, frac_c);
+      LD_UW3(irow, 4, acc_a, acc_b, acc_c);
+      SUB3(acc_a, frac_a, acc_b, frac_b, acc_c, frac_c, acc_a, acc_b, acc_c);
+      CALC_MULT_FIX_4(acc_a, v_fxy, v_shift, pix_a);
+      CALC_MULT_FIX_4(acc_b, v_fxy, v_shift, pix_b);
+      CALC_MULT_FIX_4(acc_c, v_fxy, v_shift, pix_c);
+      SW3(pix_a, pix_b, pix_c, dst, 4);
+      ST_UW3(frac_a, frac_b, frac_c, irow, 4);
+      frow += 12;
+      irow += 12;
+      dst += 12;
+      length -= 12;
+      break;
+    }
+    case 2: {
+      uint32_t pix_a, pix_b;
+      v4u32 acc_a, acc_b, frac_a, frac_b;
+      LD_UW2(frow, 4, acc_a, acc_b);
+      CALC_MULT_FIX1_4(acc_a, v_yscale, v_shift, frac_a);
+      CALC_MULT_FIX1_4(acc_b, v_yscale, v_shift, frac_b);
+      LD_UW2(irow, 4, acc_a, acc_b);
+      SUB2(acc_a, frac_a, acc_b, frac_b, acc_a, acc_b);
+      CALC_MULT_FIX_4(acc_a, v_fxy, v_shift, pix_a);
+      CALC_MULT_FIX_4(acc_b, v_fxy, v_shift, pix_b);
+      SW2(pix_a, pix_b, dst, 4);
+      ST_UW2(frac_a, frac_b, irow, 4);
+      frow += 8;
+      irow += 8;
+      dst += 8;
+      length -= 8;
+      break;
+    }
+    case 1: {
+      uint32_t pix_a;
+      v4u32 frac_a;
+      v4u32 acc_a = LD_UW(frow);
+      CALC_MULT_FIX1_4(acc_a, v_yscale, v_shift, frac_a);
+      acc_a = LD_UW(irow);
+      acc_a = acc_a - frac_a;
+      CALC_MULT_FIX_4(acc_a, v_fxy, v_shift, pix_a);
+      SW(pix_a, dst);
+      ST_UW(frac_a, irow);
+      frow += 4;
+      irow += 4;
+      dst += 4;
+      length -= 4;
+      break;
+    }
+    default:
+      break;
+  }
+
+  // Scalar tail for the last 0..3 values.
+  for (x_out = 0; x_out < length; ++x_out) {
+    const uint32_t frac = (uint32_t)MULT_FIX(frow[x_out], yscale);
+    const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale);
+    assert(v >= 0 && v <= 255);
+    dst[x_out] = v;
+    irow[x_out] = frac;
+  }
+}
+
+// Writes 'length' bytes to 'dst', where dst[i] is irow[i] scaled by
+// wrk->fxy_scale (MULT_FIX in the scalar tail, CALC_MULT_FIX_* on the vector
+// path), and resets every consumed irow[i] to zero. Values are consumed 16 at
+// a time, then at most one group of 12, 8 or 4, then one by one.
+static WEBP_INLINE void ExportRowShrink_1(uint32_t* irow, uint8_t* dst,
+                                          int length,
+                                          WebPRescaler* const wrk) {
+  const v4i32 zero = { 0 };  // stored into irow and used by CALC_MULT_FIX_*
+  const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
+  const v4u32 scale = (v4u32)__msa_fill_w(wrk->fxy_scale);
+  int x_out;
+
+  // Bulk: sixteen values per pass.
+  for (; length >= 16; length -= 16, irow += 16, dst += 16) {
+    v4u32 in_a, in_b, in_c, in_d;
+    v16u8 packed;
+    LD_UW4(irow, 4, in_a, in_b, in_c, in_d);
+    CALC_MULT_FIX_16(in_a, in_b, in_c, in_d, scale, shift, packed);
+    ST_UB(packed, dst);
+    ST_SW4(zero, zero, zero, zero, irow, 4);
+  }
+  if (length <= 0) return;
+
+  // 1..15 values remain: take at most one step of 12, 8 or 4 values.
+  switch (length >> 2) {
+    case 3: {
+      uint32_t pix_a, pix_b, pix_c;
+      v4u32 in_a, in_b, in_c;
+      LD_UW3(irow, 4, in_a, in_b, in_c);
+      CALC_MULT_FIX_4(in_a, scale, shift, pix_a);
+      CALC_MULT_FIX_4(in_b, scale, shift, pix_b);
+      CALC_MULT_FIX_4(in_c, scale, shift, pix_c);
+      SW3(pix_a, pix_b, pix_c, dst, 4);
+      ST_SW3(zero, zero, zero, irow, 4);
+      irow += 12;
+      dst += 12;
+      length -= 12;
+      break;
+    }
+    case 2: {
+      uint32_t pix_a, pix_b;
+      v4u32 in_a, in_b;
+      LD_UW2(irow, 4, in_a, in_b);
+      CALC_MULT_FIX_4(in_a, scale, shift, pix_a);
+      CALC_MULT_FIX_4(in_b, scale, shift, pix_b);
+      SW2(pix_a, pix_b, dst, 4);
+      ST_SW2(zero, zero, irow, 4);
+      irow += 8;
+      dst += 8;
+      length -= 8;
+      break;
+    }
+    case 1: {
+      uint32_t pix_a;
+      const v4u32 in_a = LD_UW(irow + 0);
+      CALC_MULT_FIX_4(in_a, scale, shift, pix_a);
+      SW(pix_a, dst);
+      ST_SW(zero, irow);
+      irow += 4;
+      dst += 4;
+      length -= 4;
+      break;
+    }
+    default:
+      break;
+  }
+
+  // Scalar tail for the last 0..3 values.
+  for (x_out = 0; x_out < length; ++x_out) {
+    const int v = (int)MULT_FIX(irow[x_out], wrk->fxy_scale);
+    assert(v >= 0 && v <= 255);
+    dst[x_out] = v;
+    irow[x_out] = 0;
+  }
+}
+
+// Exports one row of wrk->dst_width * wrk->num_channels bytes to wrk->dst for
+// the vertical-shrink case. yscale = wrk->fy_scale * (-wrk->y_accum); when it
+// is zero only wrk->irow is scaled and cleared, otherwise the wrk->frow share
+// is split off first and kept in wrk->irow.
+static void RescalerExportRowShrink(WebPRescaler* const wrk) {
+  const uint32_t yscale = wrk->fy_scale * (-wrk->y_accum);
+  const int x_out_max = wrk->dst_width * wrk->num_channels;
+  const rescaler_t* const frow = wrk->frow;
+  rescaler_t* const irow = wrk->irow;
+  uint8_t* const dst = wrk->dst;
+  assert(!WebPRescalerOutputDone(wrk));
+  assert(wrk->y_accum <= 0);
+  assert(!wrk->y_expand);
+  if (yscale == 0) {
+    ExportRowShrink_1(irow, dst, x_out_max, wrk);
+  } else {
+    ExportRowShrink_0(frow, irow, dst, x_out_max, yscale, wrk);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void WebPRescalerDspInitMSA(void);
+
+// Installs the MSA row exporters of this file into the
+// WebPRescalerExportRowShrink / WebPRescalerExportRowExpand hooks.
+WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMSA(void) {
+  WebPRescalerExportRowShrink = RescalerExportRowShrink;
+  WebPRescalerExportRowExpand = RescalerExportRowExpand;
+}
+
+#else // !WEBP_USE_MSA
+
+// Build without WEBP_USE_MSA: none of the MSA code above is compiled.
+// NOTE(review): WEBP_DSP_INIT_STUB is defined outside this file; presumably it
+// emits a no-op WebPRescalerDspInitMSA so the symbol still exists -- confirm.
+WEBP_DSP_INIT_STUB(WebPRescalerDspInitMSA)
+
+#endif // WEBP_USE_MSA