Diffstat (limited to 'src/gui/painting/qdrawhelper_mips_dspr2_asm.S')
-rw-r--r-- | src/gui/painting/qdrawhelper_mips_dspr2_asm.S | 550 |
1 files changed, 549 insertions, 1 deletions
diff --git a/src/gui/painting/qdrawhelper_mips_dspr2_asm.S b/src/gui/painting/qdrawhelper_mips_dspr2_asm.S
index ec220732be..c7a603eebe 100644
--- a/src/gui/painting/qdrawhelper_mips_dspr2_asm.S
+++ b/src/gui/painting/qdrawhelper_mips_dspr2_asm.S
@@ -1,6 +1,6 @@
 /****************************************************************************
 **
-** Copyright (C) 2012 MIPS Technologies, www.mips.com, author Damir Tatalovic <dtatalovic@mips.com>
+** Copyright (C) 2013 Imagination Technologies Limited, www.imgtec.com
 ** Contact: http://www.qt-project.org/legal
 **
 ** This file is part of the QtGui module of the Qt Toolkit.
@@ -122,3 +122,551 @@ LEAF_MIPS_DSPR2(qConvertRgb16To32_asm_mips_dspr2)
     nop
 END(qConvertRgb16To32_asm_mips_dspr2)
+
+
+#if defined(__MIPSEL) && __MIPSEL
+# define PACK(r, s, t) packrl.ph r, s, t
+# define LDHI(r, o, b) lwl r, o + 1 (b)
+# define LDLO(r, o, b) lwr r, o + 2 (b)
+#else
+# define PACK(r, s, t) packrl.ph r, t, s
+# define LDHI(r, o, b) lwr r, o + 1 (b)
+# define LDLO(r, o, b) lwl r, o + 2 (b)
+#endif
+
+
+LEAF_MIPS_DSPR2(qt_blend_rgb16_on_rgb16_mips_dspr2_asm)
+/*
+ * a0 - dst (*r5g6b5)
+ * a1 - src (const *r5g6b5)
+ * a2 - len (unsigned int) - batch length
+ * a3 - alpha (int)
+ *
+ * Register usage:
+ *   t0-t3 - scratch registers
+ *   t4    - number of iterations of the unrolled loops
+ *   t5    - inverse alpha
+ *   t6    - alpha >> 2
+ *   t7    - inverse alpha >> 2
+ *   t8    - magic1 (0x07e007e0)
+ *   t9    - magic2 (0xf81ff81f)
+ *
+ * NOTE:
+ *   DSP instructions cannot be used for the multiplication of two
+ *   16-bit values: the overflow would always be rounded or saturated.
+ */
+    beqz    a2, 0f
+     andi   t0, a0, 0x3
+    andi    t1, a1, 0x3
+    /* Adjust the alpha value and calculate the inverse alpha value */
+    li      t5, 255
+    or      t2, t0, t1          /* t2 = (dst & 0x3) | (src & 0x3) */
+    sll     t8, a3, 8
+    subu    a3, t8, a3
+    li      t8, 0x07e007e0      /* magic1 */
+    srl     a3, a3, 8           /* alpha >>= 8 */
+    li      t9, 0xf81ff81f      /* magic2 */
+    subu    t5, t5, a3          /* ialpha = 255 - alpha */
+    addiu   a3, a3, 1           /* alpha++ */
+    addiu   t5, t5, 1           /* ialpha++ */
+    srl     t6, a3, 2           /* ashift = alpha >> 2 */
+
+    beqz    t2, 4f              /* both aligned */
+     srl    t7, t5, 2           /* iashift = ialpha >> 2 */
+
+    beqz    t1, 2f              /* src aligned, dst unaligned */
+     nop
+
+    beqz    t0, 3f              /* dst aligned, src unaligned */
+     nop
+
+    /*
+     * Both src/dst are unaligned: blend one halfword from each, then
+     * fall through to the word-aligned operation.
+     */
+    lhu     t1, 0 (a1)
+    lhu     t0, 0 (a0)
+    addiu   a2, a2, -1          /* len-- */
+    andi    t2, t1, 0x07e0
+    andi    t1, t1, 0xf81f
+    mul     t2, t2, a3
+    mul     t1, t1, t6
+    andi    t3, t0, 0x07e0
+    andi    t0, t0, 0xf81f
+    mul     t3, t3, t5
+    mul     t0, t0, t7
+    addiu   a1, a1, 2           /* src++ */
+    srl     t2, t2, 8
+    srl     t1, t1, 6
+    andi    t2, t2, 0x07e0
+    andi    t1, t1, 0xf81f
+    or      t1, t1, t2
+    srl     t3, t3, 8
+    srl     t0, t0, 6
+    andi    t3, t3, 0x07e0
+    andi    t0, t0, 0xf81f
+    or      t0, t0, t3
+    addu    t0, t0, t1          /* src * alpha + dst * ialpha */
+    sh      t0, 0 (a0)
+    addiu   a0, a0, 2           /* dst++ */
+
+    /*
+     * Both src/dst pointers are word-aligned; process eight
+     * items at a time in an unrolled loop.
+     */
+4:  beqz    a2, 0f
+     srl    t4, a2, 3           /* t4 = len / 8 */
+    beqz    t4, 5f
+     andi   a2, a2, 0x7         /* len = len % 8 */
+    SAVE_REGS_ON_STACK 12, s0, s1, s2, s3, s4, v0, v1
+
+1:  lw      t1, 0 (a1)          /* [s0, s1] */
+    lw      v1, 4 (a1)          /* [s2, s3] */
+    lw      s1, 8 (a1)          /* [s4, s5] */
+    lw      s3, 12 (a1)         /* [s6, s7] */
+
+    lw      t0, 0 (a0)          /* [d0, d1] */
+    lw      v0, 4 (a0)          /* [d2, d3] */
+    lw      s0, 8 (a0)          /* [d4, d5] */
+    lw      s2, 12 (a0)         /* [d6, d7] */
+
+    pref    4, 16 (a1)
+    pref    5, 16 (a0)
+
+    and     t2, t1, t8
+    ext     t3, t2, 0, 16
+    srl     t2, t2, 16
+    mul     t3, t3, a3
+    mul     t2, t2, a3
+    and     t1, t1, t9
+    ext     s4, t1, 0, 16
+    mul     s4, s4, t6
+    srl     t1, t1, 16
+    mul     t1, t1, t6
+    srl     t3, t3, 8
+    srl     t2, t2, 8
+    append  t2, t3, 16
+    and     t2, t2, t8
+    srl     s4, s4, 6
+    and     t3, v1, t8
+    srl     t1, t1, 6
+    append  t1, s4, 16
+    and     t1, t1, t9
+    or      t1, t1, t2
+    ext     t2, t3, 0, 16
+    srl     t3, t3, 16
+    mul     t2, t2, a3
+    mul     t3, t3, a3
+    and     v1, v1, t9
+    ext     s4, v1, 0, 16
+    mul     s4, s4, t6
+    srl     v1, v1, 16
+    mul     v1, v1, t6
+    srl     t2, t2, 8
+    srl     t3, t3, 8
+    append  t3, t2, 16
+    and     t3, t3, t8
+    srl     s4, s4, 6
+    and     t2, s1, t8
+    srl     v1, v1, 6
+    append  v1, s4, 16
+    and     v1, v1, t9
+    or      v1, v1, t3
+    ext     t3, t2, 0, 16
+    srl     t2, t2, 16
+    mul     t3, t3, a3
+    mul     t2, t2, a3
+    and     s1, s1, t9
+    ext     s4, s1, 0, 16
+    mul     s4, s4, t6
+    srl     s1, s1, 16
+    mul     s1, s1, t6
+    srl     t3, t3, 8
+    srl     t2, t2, 8
+    append  t2, t3, 16
+    and     t2, t2, t8
+    srl     s4, s4, 6
+    and     t3, s3, t8
+    srl     s1, s1, 6
+    append  s1, s4, 16
+    and     s1, s1, t9
+    or      s1, s1, t2
+    ext     t2, t3, 0, 16
+    srl     t3, t3, 16
+    mul     t2, t2, a3
+    mul     t3, t3, a3
+    and     s3, s3, t9
+    ext     s4, s3, 0, 16
+    mul     s4, s4, t6
+    srl     s3, s3, 16
+    mul     s3, s3, t6
+    srl     t2, t2, 8
+    srl     t3, t3, 8
+    append  t3, t2, 16
+    and     t3, t3, t8
+    srl     s4, s4, 6
+    and     t2, t0, t8
+    srl     s3, s3, 6
+    append  s3, s4, 16
+    and     s3, s3, t9
+    or      s3, s3, t3
+    ext     t3, t2, 0, 16
+    srl     t2, t2, 16
+    mul     t3, t3, t5
+    mul     t2, t2, t5
+    and     t0, t0, t9
+    ext     s4, t0, 0, 16
+    mul     s4, s4, t7
+    srl     t0, t0, 16
+    mul     t0, t0, t7
+    srl     t3, t3, 8
+    srl     t2, t2, 8
+    append  t2, t3, 16
+    and     t2, t2, t8
+    srl     s4, s4, 6
+    and     t3, v0, t8
+    srl     t0, t0, 6
+    append  t0, s4, 16
+    and     t0, t0, t9
+    or      t0, t0, t2
+    ext     t2, t3, 0, 16
+    srl     t3, t3, 16
+    mul     t2, t2, t5
+    mul     t3, t3, t5
+    and     v0, v0, t9
+    ext     s4, v0, 0, 16
+    mul     s4, s4, t7
+    srl     v0, v0, 16
+    mul     v0, v0, t7
+    srl     t2, t2, 8
+    srl     t3, t3, 8
+    append  t3, t2, 16
+    and     t3, t3, t8
+    srl     s4, s4, 6
+    and     t2, s0, t8
+    srl     v0, v0, 6
+    append  v0, s4, 16
+    and     v0, v0, t9
+    or      v0, v0, t3
+    ext     t3, t2, 0, 16
+    srl     t2, t2, 16
+    mul     t3, t3, t5
+    mul     t2, t2, t5
+    and     s0, s0, t9
+    ext     s4, s0, 0, 16
+    mul     s4, s4, t7
+    srl     s0, s0, 16
+    mul     s0, s0, t7
+    srl     t3, t3, 8
+    srl     t2, t2, 8
+    append  t2, t3, 16
+    and     t2, t2, t8
+    srl     s4, s4, 6
+    and     t3, s2, t8
+    srl     s0, s0, 6
+    append  s0, s4, 16
+    and     s0, s0, t9
+    or      s0, s0, t2
+    ext     t2, t3, 0, 16
+    srl     t3, t3, 16
+    mul     t2, t2, t5
+    mul     t3, t3, t5
+    and     s2, s2, t9
+    ext     s4, s2, 0, 16
+    mul     s4, s4, t7
+    srl     s2, s2, 16
+    mul     s2, s2, t7
+    srl     t2, t2, 8
+    srl     t3, t3, 8
+    append  t3, t2, 16
+    and     t3, t3, t8
+    srl     s4, s4, 6
+    addu.ph t0, t0, t1
+    srl     s2, s2, 6
+    append  s2, s4, 16
+    and     s2, s2, t9
+    or      s2, s2, t3
+    addu.ph v0, v0, v1          /* v0 = [S2 + D2, S3 + D3] */
+    addu.ph s0, s0, s1          /* s0 = [S4 + D4, S5 + D5] */
+    addu.ph s2, s2, s3          /* s2 = [S6 + D6, S7 + D7] */
+
+    sw      t0, 0 (a0)          /* [SS0, SS1] */
+    sw      v0, 4 (a0)          /* [SS2, SS3] */
+    sw      s0, 8 (a0)          /* [SS4, SS5] */
+    sw      s2, 12 (a0)         /* [SS6, SS7] */
+
+    addiu   t4, t4, -1          /* t4-- */
+    addiu   a1, a1, 16          /* src += 8 */
+
+    bnez    t4, 1b
+     addiu  a0, a0, 16          /* dst += 8 */
+
+    RESTORE_REGS_FROM_STACK 12, s0, s1, s2, s3, s4, v0, v1
+
+    b       5f
+     nop
+
+
+    /* dst unaligned: blend one item, then fall through to the src-unaligned case */
+2:  lhu     t1, 0 (a1)
+    lhu     t0, 0 (a0)
+    addiu   a2, a2, -1          /* len-- */
+    andi    t2, t1, 0x07e0
+    andi    t1, t1, 0xf81f
+    mul     t2, t2, a3
+    mul     t1, t1, t6
+    andi    t3, t0, 0x07e0
+    andi    t0, t0, 0xf81f
+    mul     t3, t3, t5
+    mul     t0, t0, t7
+    addiu   a1, a1, 2           /* src++ */
+    srl     t2, t2, 8
+    srl     t1, t1, 6
+    andi    t2, t2, 0x07e0
+    andi    t1, t1, 0xf81f
+    or      t1, t1, t2
+    srl     t3, t3, 8
+    srl     t0, t0, 6
+    andi    t3, t3, 0x07e0
+    andi    t0, t0, 0xf81f
+    or      t0, t0, t3
+    addu    t0, t0, t1          /* src * alpha + dst * ialpha */
+    sh      t0, 0 (a0)
+    addiu   a0, a0, 2           /* dst++ */
+
+    /* src unaligned */
+3:  beqz    a2, 0f
+     srl    t4, a2, 3           /* t4 = len / 8 */
+    beqz    t4, 5f
+     andi   a2, a2, 0x7         /* len = len % 8 */
+    SAVE_REGS_ON_STACK 12, s0, s1, s2, s3, s4, v0, v1
+
+1:  lw      t0, 0 (a0)          /* [d0, d1] */
+    lw      v0, 4 (a0)          /* [d2, d3] */
+    lw      s0, 8 (a0)          /* [d4, d5] */
+    lw      s2, 12 (a0)         /* [d6, d7] */
+
+    LDHI    (t1, 0, a1)         /* [s0, __] */
+    lw      v1, 2 (a1)          /* [s1, s2] */
+    lw      s1, 6 (a1)          /* [s3, s4] */
+    lw      s3, 10 (a1)         /* [s5, s6] */
+    LDLO    (s4, 12, a1)        /* [__, s7] */
+
+    pref    4, 14 (a1)
+    pref    5, 16 (a0)
+
+    PACK    (t1, v1, t1)        /* [s0, s1] */
+    PACK    (v1, s1, v1)        /* [s2, s3] */
+    PACK    (s1, s3, s1)        /* [s4, s5] */
+    PACK    (s3, s4, s3)        /* [s6, s7] */
+
+    and     t2, t1, t8
+    ext     t3, t2, 0, 16
+    srl     t2, t2, 16
+    mul     t3, t3, a3
+    mul     t2, t2, a3
+    and     t1, t1, t9
+    ext     s4, t1, 0, 16
+    mul     s4, s4, t6
+    srl     t1, t1, 16
+    mul     t1, t1, t6
+    srl     t3, t3, 8
+    srl     t2, t2, 8
+    append  t2, t3, 16
+    and     t2, t2, t8
+    srl     s4, s4, 6
+    and     t3, v1, t8
+    srl     t1, t1, 6
+    append  t1, s4, 16
+    and     t1, t1, t9
+    or      t1, t1, t2
+    ext     t2, t3, 0, 16
+    srl     t3, t3, 16
+    mul     t2, t2, a3
+    mul     t3, t3, a3
+    and     v1, v1, t9
+    ext     s4, v1, 0, 16
+    mul     s4, s4, t6
+    srl     v1, v1, 16
+    mul     v1, v1, t6
+    srl     t2, t2, 8
+    srl     t3, t3, 8
+    append  t3, t2, 16
+    and     t3, t3, t8
+    srl     s4, s4, 6
+    and     t2, s1, t8
+    srl     v1, v1, 6
+    append  v1, s4, 16
+    and     v1, v1, t9
+    or      v1, v1, t3
+    ext     t3, t2, 0, 16
+    srl     t2, t2, 16
+    mul     t3, t3, a3
+    mul     t2, t2, a3
+    and     s1, s1, t9
+    ext     s4, s1, 0, 16
+    mul     s4, s4, t6
+    srl     s1, s1, 16
+    mul     s1, s1, t6
+    srl     t3, t3, 8
+    srl     t2, t2, 8
+    append  t2, t3, 16
+    and     t2, t2, t8
+    srl     s4, s4, 6
+    and     t3, s3, t8
+    srl     s1, s1, 6
+    append  s1, s4, 16
+    and     s1, s1, t9
+    or      s1, s1, t2
+    ext     t2, t3, 0, 16
+    srl     t3, t3, 16
+    mul     t2, t2, a3
+    mul     t3, t3, a3
+    and     s3, s3, t9
+    ext     s4, s3, 0, 16
+    mul     s4, s4, t6
+    srl     s3, s3, 16
+    mul     s3, s3, t6
+    srl     t2, t2, 8
+    srl     t3, t3, 8
+    append  t3, t2, 16
+    and     t3, t3, t8
+    srl     s4, s4, 6
+    and     t2, t0, t8
+    srl     s3, s3, 6
+    append  s3, s4, 16
+    and     s3, s3, t9
+    or      s3, s3, t3
+    ext     t3, t2, 0, 16
+    srl     t2, t2, 16
+    mul     t3, t3, t5
+    mul     t2, t2, t5
+    and     t0, t0, t9
+    ext     s4, t0, 0, 16
+    mul     s4, s4, t7
+    srl     t0, t0, 16
+    mul     t0, t0, t7
+    srl     t3, t3, 8
+    srl     t2, t2, 8
+    append  t2, t3, 16
+    and     t2, t2, t8
+    srl     s4, s4, 6
+    and     t3, v0, t8
+    srl     t0, t0, 6
+    append  t0, s4, 16
+    and     t0, t0, t9
+    or      t0, t0, t2
+    ext     t2, t3, 0, 16
+    srl     t3, t3, 16
+    mul     t2, t2, t5
+    mul     t3, t3, t5
+    and     v0, v0, t9
+    ext     s4, v0, 0, 16
+    mul     s4, s4, t7
+    srl     v0, v0, 16
+    mul     v0, v0, t7
+    srl     t2, t2, 8
+    srl     t3, t3, 8
+    append  t3, t2, 16
+    and     t3, t3, t8
+    srl     s4, s4, 6
+    and     t2, s0, t8
+    srl     v0, v0, 6
+    append  v0, s4, 16
+    and     v0, v0, t9
+    or      v0, v0, t3
+    ext     t3, t2, 0, 16
+    srl     t2, t2, 16
+    mul     t3, t3, t5
+    mul     t2, t2, t5
+    and     s0, s0, t9
+    ext     s4, s0, 0, 16
+    mul     s4, s4, t7
+    srl     s0, s0, 16
+    mul     s0, s0, t7
+    srl     t3, t3, 8
+    srl     t2, t2, 8
+    append  t2, t3, 16
+    and     t2, t2, t8
+    srl     s4, s4, 6
+    and     t3, s2, t8
+    srl     s0, s0, 6
+    append  s0, s4, 16
+    and     s0, s0, t9
+    or      s0, s0, t2
+    ext     t2, t3, 0, 16
+    srl     t3, t3, 16
+    mul     t2, t2, t5
+    mul     t3, t3, t5
+    and     s2, s2, t9
+    ext     s4, s2, 0, 16
+    mul     s4, s4, t7
+    srl     s2, s2, 16
+    mul     s2, s2, t7
+    srl     t2, t2, 8
+    srl     t3, t3, 8
+    append  t3, t2, 16
+    and     t3, t3, t8
+    srl     s4, s4, 6
+    addu.ph t0, t0, t1
+    srl     s2, s2, 6
+    append  s2, s4, 16
+    and     s2, s2, t9
+    or      s2, s2, t3
+    addu.ph v0, v0, v1          /* v0 = [S2 + D2, S3 + D3] */
+    addu.ph s0, s0, s1          /* s0 = [S4 + D4, S5 + D5] */
+    addu.ph s2, s2, s3          /* s2 = [S6 + D6, S7 + D7] */
+
+    sw      t0, 0 (a0)          /* [SS0, SS1] */
+    sw      v0, 4 (a0)          /* [SS2, SS3] */
+    sw      s0, 8 (a0)          /* [SS4, SS5] */
+    sw      s2, 12 (a0)         /* [SS6, SS7] */
+
+    addiu   t4, t4, -1          /* t4-- */
+    addiu   a1, a1, 16          /* src += 8 */
+
+    bnez    t4, 1b
+     addiu  a0, a0, 16          /* dst += 8 */
+
+    RESTORE_REGS_FROM_STACK 12, s0, s1, s2, s3, s4, v0, v1
+
+5:  /* Process the remaining items (len < 8) one at a time */
+    beqz    a2, 0f
+     nop
+
+1:  lhu     t1, 0 (a1)
+    lhu     t0, 0 (a0)
+    addiu   a1, a1, 2           /* src++ */
+    andi    t2, t1, 0x07e0
+    andi    t1, t1, 0xf81f
+    mul     t2, t2, a3
+    mul     t1, t1, t6
+    andi    t3, t0, 0x07e0
+    andi    t0, t0, 0xf81f
+    mul     t3, t3, t5
+    mul     t0, t0, t7
+    addiu   a2, a2, -1          /* len-- */
+    srl     t2, t2, 8
+    srl     t1, t1, 6
+    andi    t2, t2, 0x07e0
+    andi    t1, t1, 0xf81f
+    or      t1, t1, t2
+    srl     t3, t3, 8
+    srl     t0, t0, 6
+    andi    t3, t3, 0x07e0
+    andi    t0, t0, 0xf81f
+    or      t0, t0, t3
+
+    addu    t0, t0, t1          /* src * alpha + dst * ialpha */
+    sh      t0, 0 (a0)
+    bnez    a2, 1b
+     addiu  a0, a0, 2           /* dst++ */
+
+0:  jr      ra
+     nop
+
+END(qt_blend_rgb16_on_rgb16_mips_dspr2_asm)
+
+#undef PACK
+#undef LDHI
+#undef LDLO
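For readers following the arithmetic: the per-pixel math implemented by the scalar paths and both unrolled loops can be modeled in a few lines of C. The sketch below is illustrative only; the function name and signature are placeholders, not Qt's actual C fallback. It shows the role of the two magic constants: green (0x07e0) occupies its own bit field and can take the full 8-bit weight with a >> 8 shift back, while red and blue are masked together (0xf81f) and use the quartered weight (alpha >> 2, compensated by >> 6 instead of >> 8) so the blue product cannot carry into red's low bits — the overflow the NOTE in the header comment refers to.

/* Minimal C model of the per-pixel blend; hypothetical helper,
 * assuming the same alpha pre-scaling as the function prologue. */
#include <stdint.h>
#include <stddef.h>

static void blend_rgb16_scalar(uint16_t *dst, const uint16_t *src,
                               size_t len, int alpha)
{
    int a  = ((alpha << 8) - alpha) >> 8;  /* sll/subu/srl: alpha * 255 / 256 */
    int ia = 255 - a;                      /* t5: inverse alpha               */
    a  += 1;                               /* bias so alpha == 255 weighs 256 */
    ia += 1;
    int as  = a  >> 2;                     /* t6: alpha >> 2                  */
    int ias = ia >> 2;                     /* t7: inverse alpha >> 2          */

    for (size_t i = 0; i < len; i++) {
        uint32_t s = src[i], d = dst[i];
        /* green is a lone field: full weight, shift back by 8 */
        uint32_t sg  = ((s & 0x07e0) * (uint32_t)a)   >> 8;
        uint32_t dg  = ((d & 0x07e0) * (uint32_t)ia)  >> 8;
        /* red and blue share a word: quartered weight, shift back by 6,
         * so blue * weight (at most 31 * 63) stays below bit 11 and
         * cannot corrupt the red field before the shift */
        uint32_t srb = ((s & 0xf81f) * (uint32_t)as)  >> 6;
        uint32_t drb = ((d & 0xf81f) * (uint32_t)ias) >> 6;
        dst[i] = (uint16_t)(((sg & 0x07e0) | (srb & 0xf81f)) +
                            ((dg & 0x07e0) | (drb & 0xf81f)));
    }
}

The word loops compute exactly this, but two pixels per multiply: ext and append split and rejoin the 16-bit halves, and addu.ph performs the final src + dst addition on both halfwords at once.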
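The src-unaligned loop at label 3: never issues an unaligned load: LDHI/LDLO expand to lwl/lwr pairs that fetch the ragged first and last halfwords without touching bytes outside the span, and PACK (packrl.ph) splices each source word from the adjoining halves of two aligned words. A rough little-endian C model of that splice, with a hypothetical helper name:

#include <stdint.h>

/* What one PACK achieves on little-endian MIPS: combine the high
 * halfword of the earlier word with the low halfword of the later one. */
static inline uint32_t splice(uint32_t earlier, uint32_t later)
{
    return (earlier >> 16) | (later << 16);
}

packrl.ph does in a single instruction what would take two shifts and an OR in plain MIPS32, and the big-endian branch of the macros swaps the operand roles so the same loop body works on either endianness.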