Diffstat (limited to 'src/gui/painting/qdrawhelper_mips_dspr2_asm.S')
-rw-r--r--  src/gui/painting/qdrawhelper_mips_dspr2_asm.S | 550
1 file changed, 549 insertions(+), 1 deletion(-)
diff --git a/src/gui/painting/qdrawhelper_mips_dspr2_asm.S b/src/gui/painting/qdrawhelper_mips_dspr2_asm.S
index ec220732be..c7a603eebe 100644
--- a/src/gui/painting/qdrawhelper_mips_dspr2_asm.S
+++ b/src/gui/painting/qdrawhelper_mips_dspr2_asm.S
@@ -1,6 +1,6 @@
/****************************************************************************
**
-** Copyright (C) 2012 MIPS Technologies, www.mips.com, author Damir Tatalovic <dtatalovic@mips.com>
+** Copyright (C) 2013 Imagination Technologies Limited, www.imgtec.com
** Contact: http://www.qt-project.org/legal
**
** This file is part of the QtGui module of the Qt Toolkit.
@@ -122,3 +122,551 @@ LEAF_MIPS_DSPR2(qConvertRgb16To32_asm_mips_dspr2)
nop
END(qConvertRgb16To32_asm_mips_dspr2)
+
+
+#if defined(__MIPSEL) && __MIPSEL
+# define PACK(r, s, t) packrl.ph r, s, t
+# define LDHI(r, o, b) lwl r, o + 1 (b)
+# define LDLO(r, o, b) lwr r, o + 2 (b)
+#else
+# define PACK(r, s, t) packrl.ph r, t, s
+# define LDHI(r, o, b) lwr r, o + 1 (b)
+# define LDLO(r, o, b) lwl r, o + 2 (b)
+#endif
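
The macros above absorb the endianness differences in the misaligned-source path below: LDHI/LDLO pick between the lwl/lwr partial-word loads, and PACK swaps the operand order of packrl.ph on big-endian builds so the same loop body yields pixels in the right order. A minimal C sketch of what the little-endian PACK(r, s, t) computes (illustrative only, not part of the patch; pack_halves is a hypothetical name):

    #include <stdint.h>

    /* packrl.ph r, s, t: the low halfword of s becomes the high half of
     * the result, the high halfword of t becomes the low half */
    static inline uint32_t pack_halves(uint32_t s, uint32_t t)
    {
        return (s << 16) | (t >> 16);
    }

On big-endian builds the macro simply swaps the two source operands.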
+
+
+LEAF_MIPS_DSPR2(qt_blend_rgb16_on_rgb16_mips_dspr2_asm)
+/*
+ * a0 - dst (*r5g6b5)
+ * a1 - src (const *r5g6b5)
+ * a2 - len (unsigned int) - batch length
+ * a3 - alpha (int)
+ *
+ * Register usage:
+ * t0-3 - Scratch registers
+ * t4 - Number of iterations to do in unrolled loops
+ * t5 - Inverse alpha
+ * t6 - Alpha >> 2
+ * t7 - Inverse alpha >> 2
+ * t8 - magic1 (0x07e007e0)
+ * t9 - magic2 (0xf81ff81f)
+ *
+ * NOTE:
+ * Cannot use DSP instructions for the multiplication of two
+ * 16-bit values: overflow would always be rounded or saturated.
+ */
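
As a reading aid (not part of the patch; blend_rgb16 is a hypothetical name), the alpha adjustment done in the prologue and the per-pixel arithmetic of the scalar paths below correspond roughly to this C:

    #include <stdint.h>

    static inline uint16_t blend_rgb16(uint16_t s, uint16_t d, unsigned alpha)
    {
        /* prologue: scale alpha, derive the inverse, bias both by one */
        unsigned a  = (alpha * 255) >> 8;
        unsigned ia = 255 - a;
        a  += 1;
        ia += 1;

        /* green is multiplied by the full adjusted alpha; red and blue
         * share one mask, so they use alpha >> 2 to keep the blue
         * product from spilling into the red field before the shift */
        unsigned sg  = ((s & 0x07e0) * a)         >> 8;
        unsigned srb = ((s & 0xf81f) * (a >> 2))  >> 6;
        unsigned dg  = ((d & 0x07e0) * ia)        >> 8;
        unsigned drb = ((d & 0xf81f) * (ia >> 2)) >> 6;

        return (uint16_t)(((sg & 0x07e0) | (srb & 0xf81f)) +
                          ((dg & 0x07e0) | (drb & 0xf81f)));
    }

The real code hoists the alpha adjustment out of the loops; the unrolled paths apply the same formula to two pixels per 32-bit word.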
+
+ beqz a2, 0f
+ andi t0, a0, 0x3
+ andi t1, a1, 0x3
+ /* Adjust alpha value, and calculate inverse alpha value */
+ li t5, 255
+ or t2, t0, t1 /* t2 = (dst & 0x3) | (src & 0x3) */
+ sll t8, a3, 8
+ subu a3, t8, a3
+ li t8, 0x07e007e0 /* magic1 */
+ srl a3, a3, 8 /* alpha >>= 8 */
+ li t9, 0xf81ff81f /* magic2 */
+ subu t5, t5, a3 /* ialpha = 255 - alpha */
+ addiu a3, a3, 1 /* alpha++ */
+ addiu t5, t5, 1 /* ialpha++ */
+ srl t6, a3, 2 /* ashift = alpha >> 2 */
+
+ beqz t2, 4f /* both aligned */
+ srl t7, t5, 2 /* iashift = ialpha >> 2 */
+
+ beqz t1, 2f /* src aligned, dst unaligned */
+ nop
+
+ beqz t0, 3f /* dst aligned, src unaligned */
+ nop
+
+ /*
+ * Both src/dst are unaligned: read 1 halfword from each, then
+ * fall through to continue with word-aligned operation.
+ */
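
In C terms this peel is roughly (sketch only, reusing the hypothetical blend_rgb16 from above; dst/src/len/alpha stand for a0/a1/a2/a3):

    /* both pointers are halfword- but not word-aligned: blending one
     * leading pixel makes both word-aligned for the unrolled loop */
    if (len && ((uintptr_t)dst & 3) && ((uintptr_t)src & 3)) {
        *dst = blend_rgb16(*src, *dst, alpha);
        ++dst; ++src; --len;
    }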
+ lhu t1, 0 (a1)
+ lhu t0, 0 (a0)
+ addiu a2, a2, -1 /* len-- */
+ andi t2, t1, 0x07e0
+ andi t1, t1, 0xf81f
+ mul t2, t2, a3
+ mul t1, t1, t6
+ andi t3, t0, 0x07e0
+ andi t0, t0, 0xf81f
+ mul t3, t3, t5
+ mul t0, t0, t7
+ addiu a1, a1, 2 /* src++ */
+ srl t2, t2, 8
+ srl t1, t1, 6
+ andi t2, t2, 0x07e0
+ andi t1, t1, 0xf81f
+ or t1, t1, t2
+ srl t3, t3, 8
+ srl t0, t0, 6
+ andi t3, t3, 0x07e0
+ andi t0, t0, 0xf81f
+ or t0, t0, t3
+ addu t0, t0, t1 /* src * alpha + dst * ialpha */
+ sh t0, 0 (a0)
+ addiu a0, a0, 2 /* dst++ */
+
+ /*
+ * Both src/dst pointers are word-aligned, process eight
+ * items at a time in an unrolled loop.
+ */
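
Each 32-bit word loaded below carries two RGB565 pixels. magic1 and magic2 select the green and red|blue fields of both pixels at once, but (per the NOTE above) each half still needs its own 16-bit multiply; append then glues the two per-half results back into one word. Per source word the reduction is roughly as follows (sketch only; w is a loaded source word, a the adjusted alpha from the prologue; the destination word gets the same treatment with the inverse alpha, and the two results are summed per halfword by addu.ph):

    uint32_t g   = w & 0x07e007e0;                   /* greens of both pixels   */
    uint32_t rb  = w & 0xf81ff81f;                   /* red|blue of both pixels */
    uint32_t g0  = ((g  & 0xffff) * a)        >> 8;  /* low pixel               */
    uint32_t g1  = ((g  >> 16)    * a)        >> 8;  /* high pixel              */
    uint32_t rb0 = ((rb & 0xffff) * (a >> 2)) >> 6;
    uint32_t rb1 = ((rb >> 16)    * (a >> 2)) >> 6;
    uint32_t src_part = (((g1  << 16) | g0)  & 0x07e007e0)
                      | (((rb1 << 16) | rb0) & 0xf81ff81f);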
+4: beqz a2, 0f
+ srl t4, a2, 3 /* t4 = len / 8 */
+ beqz t4, 5f
+ andi a2, a2, 0x7 /* len = len % 8 */
+ SAVE_REGS_ON_STACK 12, s0, s1, s2, s3, s4, v0, v1
+
+1: lw t1, 0 (a1) /* [s0, s1] */
+ lw v1, 4 (a1) /* [s2, s3] */
+ lw s1, 8 (a1) /* [s4, s5] */
+ lw s3, 12 (a1) /* [s6, s7] */
+
+ lw t0, 0 (a0) /* [d0, d1] */
+ lw v0, 4 (a0) /* [d2, d3] */
+ lw s0, 8 (a0) /* [d4, d5] */
+ lw s2, 12 (a0) /* [d6, d7] */
+
+ pref 4, 16 (a1)
+ pref 5, 16 (a0)
+
+ and t2, t1, t8
+ ext t3, t2, 0, 16
+ srl t2, t2, 16
+ mul t3, t3, a3
+ mul t2, t2, a3
+ and t1, t1, t9
+ ext s4, t1, 0, 16
+ mul s4, s4, t6
+ srl t1, t1, 16
+ mul t1, t1, t6
+ srl t3, t3, 8
+ srl t2, t2, 8
+ append t2, t3, 16
+ and t2, t2, t8
+ srl s4, s4, 6
+ and t3, v1, t8
+ srl t1, t1, 6
+ append t1, s4, 16
+ and t1, t1, t9
+ or t1, t1, t2
+ ext t2, t3, 0, 16
+ srl t3, t3, 16
+ mul t2, t2, a3
+ mul t3, t3, a3
+ and v1, v1, t9
+ ext s4, v1, 0, 16
+ mul s4, s4, t6
+ srl v1, v1, 16
+ mul v1, v1, t6
+ srl t2, t2, 8
+ srl t3, t3, 8
+ append t3, t2, 16
+ and t3, t3, t8
+ srl s4, s4, 6
+ and t2, s1, t8
+ srl v1, v1, 6
+ append v1, s4, 16
+ and v1, v1, t9
+ or v1, v1, t3
+ ext t3, t2, 0, 16
+ srl t2, t2, 16
+ mul t3, t3, a3
+ mul t2, t2, a3
+ and s1, s1, t9
+ ext s4, s1, 0, 16
+ mul s4, s4, t6
+ srl s1, s1, 16
+ mul s1, s1, t6
+ srl t3, t3, 8
+ srl t2, t2, 8
+ append t2, t3, 16
+ and t2, t2, t8
+ srl s4, s4, 6
+ and t3, s3, t8
+ srl s1, s1, 6
+ append s1, s4, 16
+ and s1, s1, t9
+ or s1, s1, t2
+ ext t2, t3, 0, 16
+ srl t3, t3, 16
+ mul t2, t2, a3
+ mul t3, t3, a3
+ and s3, s3, t9
+ ext s4, s3, 0, 16
+ mul s4, s4, t6
+ srl s3, s3, 16
+ mul s3, s3, t6
+ srl t2, t2, 8
+ srl t3, t3, 8
+ append t3, t2, 16
+ and t3, t3, t8
+ srl s4, s4, 6
+ and t2, t0, t8
+ srl s3, s3, 6
+ append s3, s4, 16
+ and s3, s3, t9
+ or s3, s3, t3
+ ext t3, t2, 0, 16
+ srl t2, t2, 16
+ mul t3, t3, t5
+ mul t2, t2, t5
+ and t0, t0, t9
+ ext s4, t0, 0, 16
+ mul s4, s4, t7
+ srl t0, t0, 16
+ mul t0, t0, t7
+ srl t3, t3, 8
+ srl t2, t2, 8
+ append t2, t3, 16
+ and t2, t2, t8
+ srl s4, s4, 6
+ and t3, v0, t8
+ srl t0, t0, 6
+ append t0, s4, 16
+ and t0, t0, t9
+ or t0, t0, t2
+ ext t2, t3, 0, 16
+ srl t3, t3, 16
+ mul t2, t2, t5
+ mul t3, t3, t5
+ and v0, v0, t9
+ ext s4, v0, 0, 16
+ mul s4, s4, t7
+ srl v0, v0, 16
+ mul v0, v0, t7
+ srl t2, t2, 8
+ srl t3, t3, 8
+ append t3, t2, 16
+ and t3, t3, t8
+ srl s4, s4, 6
+ and t2, s0, t8
+ srl v0, v0, 6
+ append v0, s4, 16
+ and v0, v0, t9
+ or v0, v0, t3
+ ext t3, t2, 0, 16
+ srl t2, t2, 16
+ mul t3, t3, t5
+ mul t2, t2, t5
+ and s0, s0, t9
+ ext s4, s0, 0, 16
+ mul s4, s4, t7
+ srl s0, s0, 16
+ mul s0, s0, t7
+ srl t3, t3, 8
+ srl t2, t2, 8
+ append t2, t3, 16
+ and t2, t2, t8
+ srl s4, s4, 6
+ and t3, s2, t8
+ srl s0, s0, 6
+ append s0, s4, 16
+ and s0, s0, t9
+ or s0, s0, t2
+ ext t2, t3, 0, 16
+ srl t3, t3, 16
+ mul t2, t2, t5
+ mul t3, t3, t5
+ and s2, s2, t9
+ ext s4, s2, 0, 16
+ mul s4, s4, t7
+ srl s2, s2, 16
+ mul s2, s2, t7
+ srl t2, t2, 8
+ srl t3, t3, 8
+ append t3, t2, 16
+ and t3, t3, t8
+ srl s4, s4, 6
+ addu.ph t0, t0, t1
+ srl s2, s2, 6
+ append s2, s4, 16
+ and s2, s2, t9
+ or s2, s2, t3
+ addu.ph v0, v0, v1 /* v0 = [S2 + D2, S3 + D3] */
+ addu.ph s0, s0, s1 /* s0 = [S4 + D4, S5 + D5] */
+ addu.ph s2, s2, s3 /* s2 = [S6 + D6, S7 + D7] */
+
+ sw t0, 0 (a0) /* [SS0, SS1] */
+ sw v0, 4 (a0) /* [SS2, SS3] */
+ sw s0, 8 (a0) /* [SS4, SS5] */
+ sw s2, 12 (a0) /* [SS6, SS7] */
+
+ addiu t4, t4, -1 /* t4-- */
+ addiu a1, a1, 16 /* src += 8 */
+
+ bnez t4, 1b
+ addiu a0, a0, 16 /* dst += 8 */
+
+ RESTORE_REGS_FROM_STACK 12, s0, s1, s2, s3, s4, v0, v1
+ b 5f
+ nop
+
+
+ /* dst unaligned: do one item; dst is then aligned and src is not, so fall through to the src-unaligned case */
+2: lhu t1, 0 (a1)
+ lhu t0, 0 (a0)
+ addiu a2, a2, -1 /* len-- */
+ andi t2, t1, 0x07e0
+ andi t1, t1, 0xf81f
+ mul t2, t2, a3
+ mul t1, t1, t6
+ andi t3, t0, 0x07e0
+ andi t0, t0, 0xf81f
+ mul t3, t3, t5
+ mul t0, t0, t7
+ addiu a1, a1, 2 /* src++ */
+ srl t2, t2, 8
+ srl t1, t1, 6
+ andi t2, t2, 0x07e0
+ andi t1, t1, 0xf81f
+ or t1, t1, t2
+ srl t3, t3, 8
+ srl t0, t0, 6
+ andi t3, t3, 0x07e0
+ andi t0, t0, 0xf81f
+ or t0, t0, t3
+ addu t0, t0, t1 /* src * alpha + dst * ialpha */
+ sh t0, 0 (a0)
+ addiu a0, a0, 2 /* dst++ */
+
+ /* src unaligned */
+3: beqz a2, 0f
+ srl t4, a2, 3 /* t4 = len / 8 */
+ beqz t4, 5f
+ andi a2, a2, 0x7 /* len = len % 8 */
+ SAVE_REGS_ON_STACK 12, s0, s1, s2, s3, s4, v0, v1
+
+1: lw t0, 0 (a0) /* [d0, d1] */
+ lw v0, 4 (a0) /* [d2, d3] */
+ lw s0, 8 (a0) /* [d4, d5] */
+ lw s2, 12 (a0) /* [d6, d7] */
+
+ LDHI (t1, 0, a1) /* [s0, __] */
+ lw v1, 2 (a1) /* [s1, s2] */
+ lw s1, 6 (a1) /* [s3, s4] */
+ lw s3, 10 (a1) /* [s5, s6] */
+ LDLO (s4, 12, a1) /* [__, s7] */
+
+ pref 4, 14 (a1)
+ pref 5, 16 (a0)
+
+ PACK (t1, v1, t1) /* [s0, s1] */
+ PACK (v1, s1, v1) /* [s2, s3] */
+ PACK (s1, s3, s1) /* [s4, s5] */
+ PACK (s3, s4, s3) /* [s6, s7] */
+
+ and t2, t1, t8
+ ext t3, t2, 0, 16
+ srl t2, t2, 16
+ mul t3, t3, a3
+ mul t2, t2, a3
+ and t1, t1, t9
+ ext s4, t1, 0, 16
+ mul s4, s4, t6
+ srl t1, t1, 16
+ mul t1, t1, t6
+ srl t3, t3, 8
+ srl t2, t2, 8
+ append t2, t3, 16
+ and t2, t2, t8
+ srl s4, s4, 6
+ and t3, v1, t8
+ srl t1, t1, 6
+ append t1, s4, 16
+ and t1, t1, t9
+ or t1, t1, t2
+ ext t2, t3, 0, 16
+ srl t3, t3, 16
+ mul t2, t2, a3
+ mul t3, t3, a3
+ and v1, v1, t9
+ ext s4, v1, 0, 16
+ mul s4, s4, t6
+ srl v1, v1, 16
+ mul v1, v1, t6
+ srl t2, t2, 8
+ srl t3, t3, 8
+ append t3, t2, 16
+ and t3, t3, t8
+ srl s4, s4, 6
+ and t2, s1, t8
+ srl v1, v1, 6
+ append v1, s4, 16
+ and v1, v1, t9
+ or v1, v1, t3
+ ext t3, t2, 0, 16
+ srl t2, t2, 16
+ mul t3, t3, a3
+ mul t2, t2, a3
+ and s1, s1, t9
+ ext s4, s1, 0, 16
+ mul s4, s4, t6
+ srl s1, s1, 16
+ mul s1, s1, t6
+ srl t3, t3, 8
+ srl t2, t2, 8
+ append t2, t3, 16
+ and t2, t2, t8
+ srl s4, s4, 6
+ and t3, s3, t8
+ srl s1, s1, 6
+ append s1, s4, 16
+ and s1, s1, t9
+ or s1, s1, t2
+ ext t2, t3, 0, 16
+ srl t3, t3, 16
+ mul t2, t2, a3
+ mul t3, t3, a3
+ and s3, s3, t9
+ ext s4, s3, 0, 16
+ mul s4, s4, t6
+ srl s3, s3, 16
+ mul s3, s3, t6
+ srl t2, t2, 8
+ srl t3, t3, 8
+ append t3, t2, 16
+ and t3, t3, t8
+ srl s4, s4, 6
+ and t2, t0, t8
+ srl s3, s3, 6
+ append s3, s4, 16
+ and s3, s3, t9
+ or s3, s3, t3
+ ext t3, t2, 0, 16
+ srl t2, t2, 16
+ mul t3, t3, t5
+ mul t2, t2, t5
+ and t0, t0, t9
+ ext s4, t0, 0, 16
+ mul s4, s4, t7
+ srl t0, t0, 16
+ mul t0, t0, t7
+ srl t3, t3, 8
+ srl t2, t2, 8
+ append t2, t3, 16
+ and t2, t2, t8
+ srl s4, s4, 6
+ and t3, v0, t8
+ srl t0, t0, 6
+ append t0, s4, 16
+ and t0, t0, t9
+ or t0, t0, t2
+ ext t2, t3, 0, 16
+ srl t3, t3, 16
+ mul t2, t2, t5
+ mul t3, t3, t5
+ and v0, v0, t9
+ ext s4, v0, 0, 16
+ mul s4, s4, t7
+ srl v0, v0, 16
+ mul v0, v0, t7
+ srl t2, t2, 8
+ srl t3, t3, 8
+ append t3, t2, 16
+ and t3, t3, t8
+ srl s4, s4, 6
+ and t2, s0, t8
+ srl v0, v0, 6
+ append v0, s4, 16
+ and v0, v0, t9
+ or v0, v0, t3
+ ext t3, t2, 0, 16
+ srl t2, t2, 16
+ mul t3, t3, t5
+ mul t2, t2, t5
+ and s0, s0, t9
+ ext s4, s0, 0, 16
+ mul s4, s4, t7
+ srl s0, s0, 16
+ mul s0, s0, t7
+ srl t3, t3, 8
+ srl t2, t2, 8
+ append t2, t3, 16
+ and t2, t2, t8
+ srl s4, s4, 6
+ and t3, s2, t8
+ srl s0, s0, 6
+ append s0, s4, 16
+ and s0, s0, t9
+ or s0, s0, t2
+ ext t2, t3, 0, 16
+ srl t3, t3, 16
+ mul t2, t2, t5
+ mul t3, t3, t5
+ and s2, s2, t9
+ ext s4, s2, 0, 16
+ mul s4, s4, t7
+ srl s2, s2, 16
+ mul s2, s2, t7
+ srl t2, t2, 8
+ srl t3, t3, 8
+ append t3, t2, 16
+ and t3, t3, t8
+ srl s4, s4, 6
+ addu.ph t0, t0, t1
+ srl s2, s2, 6
+ append s2, s4, 16
+ and s2, s2, t9
+ or s2, s2, t3
+ addu.ph v0, v0, v1 /* v0 = [S2 + D2, S3 + D3] */
+ addu.ph s0, s0, s1 /* s0 = [S4 + D4, S5 + D5] */
+ addu.ph s2, s2, s3 /* s2 = [S6 + D6, S7 + D7] */
+
+ sw t0, 0 (a0) /* [SS0, SS1] */
+ sw v0, 4 (a0) /* [SS2, SS3] */
+ sw s0, 8 (a0) /* [SS4, SS5] */
+ sw s2, 12 (a0) /* [SS6, SS7] */
+
+ addiu t4, t4, -1 /* t4-- */
+ addiu a1, a1, 16 /* src += 8 */
+
+ bnez t4, 1b
+ addiu a0, a0, 16 /* dst += 8 */
+
+ RESTORE_REGS_FROM_STACK 12, s0, s1, s2, s3, s4, v0, v1
+
+5: /* Process remaining items (len < 8), one at a time */
+ beqz a2, 0f
+ nop
+
+1: lhu t1, 0 (a1)
+ lhu t0, 0 (a0)
+ addiu a1, a1, 2 /* src++ */
+ andi t2, t1, 0x07e0
+ andi t1, t1, 0xf81f
+ mul t2, t2, a3
+ mul t1, t1, t6
+ andi t3, t0, 0x07e0
+ andi t0, t0, 0xf81f
+ mul t3, t3, t5
+ mul t0, t0, t7
+ addiu a2, a2, -1 /* len-- */
+ srl t2, t2, 8
+ srl t1, t1, 6
+ andi t2, t2, 0x07e0
+ andi t1, t1, 0xf81f
+ or t1, t1, t2
+ srl t3, t3, 8
+ srl t0, t0, 6
+ andi t3, t3, 0x07e0
+ andi t0, t0, 0xf81f
+ or t0, t0, t3
+
+ addu t0, t0, t1 /* src*alpha + dst*ialpha */
+ sh t0, 0 (a0)
+ bnez a2, 1b
+ addiu a0, a0, 2 /* dst++ */
+
+0: jr ra
+ nop
+
+END(qt_blend_rgb16_on_rgb16_mips_dspr2_asm)
+
+#undef PACK
+#undef LDHI
+#undef LDLO