From f10356ead13e39c9501b84ee5e92efe722a2d2c7 Mon Sep 17 00:00:00 2001 From: lpapuga Date: Wed, 20 Nov 2013 17:09:57 +0100 Subject: MIPS DSP build system fix and additional optimizations. Changed MIPS DSP portion of the mkspecs/features/simd.prf file in order to fix the corrupted build system for MIPS platforms. List of the additionally optimized functions from file src/gui/painting/qdrawhelper.cpp: - qt_blend_rgb16_on_rgb16 - qt_fetchUntransformed_888 - qt_fetchUntransformed_444 - qt_fetchUntransformed_argb8565 from file src/gui/image/qimage.cpp: - convert_ARGB_to_ARGB_PM_inplace from file src/corelib/qstring.cpp: - ucstrncmp - toLatin1_helper - fromLatin1_helper Change-Id: I5c47a69784917eee29a8dbd2718828a390b27c93 Reviewed-by: Thiago Macieira --- src/gui/painting/qdrawhelper_mips_dsp_asm.S | 478 +++++++++++++++++++++++++++- 1 file changed, 477 insertions(+), 1 deletion(-) (limited to 'src/gui/painting/qdrawhelper_mips_dsp_asm.S') diff --git a/src/gui/painting/qdrawhelper_mips_dsp_asm.S b/src/gui/painting/qdrawhelper_mips_dsp_asm.S index 64fc635970..26b48f9d62 100644 --- a/src/gui/painting/qdrawhelper_mips_dsp_asm.S +++ b/src/gui/painting/qdrawhelper_mips_dsp_asm.S @@ -1,6 +1,6 @@ /**************************************************************************** ** -** Copyright (C) 2012 MIPS Technologies, www.mips.com, author Damir Tatalovic +** Copyright (C) 2013 Imagination Technologies Limited, www.imgtec.com ** Contact: http://www.qt-project.org/legal ** ** This file is part of the QtGui module of the Qt Toolkit. 
@@ -1601,3 +1601,479 @@ LEAF_MIPS_DSP(qt_blend_argb32_on_argb32_const_alpha_256_mips_dsp_asm) nop END(qt_blend_argb32_on_argb32_const_alpha_256_mips_dsp_asm) + + +#if defined(__MIPSEL) && __MIPSEL /* presumably the little-endian build - confirm */ +# define PACK(r, s, t) packrl.ph r, s, t +# define SWHI(r, o, b) swl r, o + 1 (b) +# define SWLO(r, o, b) swr r, o + 0 (b) +# define LDHI(r, o, b) lwl r, o + 1 (b) +# define LDLO(r, o, b) lwr r, o + 2 (b) +#else /* same helpers with the left/right instructions and PACK operands swapped */ +# define PACK(r, s, t) packrl.ph r, t, s +# define SWHI(r, o, b) swr r, o + 1 (b) +# define SWLO(r, o, b) swl r, o + 0 (b) +# define LDHI(r, o, b) lwr r, o + 1 (b) +# define LDLO(r, o, b) lwl r, o + 2 (b) +#endif + +LEAF_MIPS_DSP(qt_blend_rgb16_on_rgb16_const_alpha_256_mips_dsp_asm) +/* Copies len 16-bit items from src to dst unchanged (no blending is done in this routine). + * a0 - dst (*r5g6b5) + * a1 - src (const *r5g6b5) + * a2 - len (unsigned int) + * + * Register usage: + * t0-3 - Scratch registers + * t4 - Number of iterations to do in unrolled loops + * t5-7 - Auxiliary scratch registers. + * + * Check if base addresses of src/dst are aligned, cases: + * a) Both aligned. + * b) Both unaligned: + * 1. Copy a halfword + * 2. Use aligned case. + * c) dst aligned, src unaligned: + * 1. Load src with LDHI/lw/LDLO at halfword-shifted offsets. + * 2. PACK neighbouring loads into words and store them to dst. + * d) dst unaligned, src aligned: + * 1. Load whole words from src. + * 2. Store with SWLO/SWHI at both ends and PACK-ed words between. + */ + + beqz a2, 0f /* if (a2:len == 0): return */ + andi t0, a0, 0x3 /* t0 = a0:dst % 4 */ + andi t1, a1, 0x3 /* t1 = a1:src % 4 */ + or t2, t0, t1 /* t2 = t0 | t1 */ + + beqz t2, 4f /* both aligned */ + nop + beqz t0, 3f /* dst aligned, src unaligned */ + nop + beqz t1, 2f /* src aligned, dst unaligned */ + nop + + /* + * Both src/dst are unaligned: read 1 halfword from src and store it to dst, + * then fall through to continue with the word-aligned copy. 
+ */ + lhu t0, 0 (a1) /* t0 <- ((uint16_t*) src)[0] */ + addiu a1, a1, 2 /* src++ */ + addiu a2, a2,-1 /* len-- */ + sh t0, 0 (a0) /* t0 -> ((uint16_t*) dst)[0] */ + addiu a0, a0, 2 /* dst++ */ + + /* + * Both src/dst pointers are word-aligned, process eight + * items at a time in an unrolled loop. + */ +4: beqz a2, 0f /* if (len == 0): return */ + srl t4, a2, 3 /* t4 = len / 8 */ + + beqz t4, 5f /* if (t4 == 0): tail */ + andi a2, a2, 0x07 /* len = len % 8 */ + +1: lw t0, 0 (a1) + lw t1, 4 (a1) + lw t2, 8 (a1) + lw t3, 12 (a1) + + addiu t4, t4, -1 /* t4-- */ + addiu a1, a1, 16 /* src += 8 */ + + sw t0, 0 (a0) + sw t1, 4 (a0) + sw t2, 8 (a0) + sw t3, 12 (a0) + + bnez t4, 1b + addiu a0, a0, 16 /* dst += 8 */ + + b 5f /* unrolled part done: go to the tail */ + nop + + + /* + * dst pointer is unaligned (src is word-aligned) + */ +2: beqz a2, 0f /* if (len == 0): return */ + srl t4, a2, 3 /* t4 = len / 8 */ + beqz t4, 5f /* if (t4 == 0): tail */ + andi a2, a2, 0x07 /* len = len % 8 */ + +1: lw t0, 0 (a1) + lw t1, 4 (a1) + lw t2, 8 (a1) + lw t3, 12 (a1) + + addiu t4, t4, -1 /* t4-- */ + addiu a1, a1, 16 /* src += 8 */ + + SWLO (t0, 0, a0) /* partial-word store at the start of the 16-byte group */ + PACK (t5, t1, t0) /* t5 = one halfword each from t0 and t1 */ + PACK (t6, t2, t1) /* t6 = one halfword each from t1 and t2 */ + PACK (t7, t3, t2) /* t7 = one halfword each from t2 and t3 */ + SWHI (t3, 14, a0) /* partial-word store at the end of the 16-byte group */ + sw t5, 2 (a0) + sw t6, 6 (a0) + sw t7, 10 (a0) + + bnez t4, 1b + addiu a0, a0, 16 /* dst += 8 */ + + b 5f /* unrolled part done: go to the tail */ + nop + + /* + * src pointer is unaligned (dst is word-aligned) + */ +3: beqz a2, 0f /* if (len == 0): return */ + srl t4, a2, 3 /* t4 = len / 8 */ + beqz t4, 5f /* if (t4 == 0): tail */ + andi a2, a2, 0x07 /* len = len % 8 */ + +1: LDHI (t0, 0, a1) /* partial-word load at the start of the 16-byte group */ + lw t1, 2 (a1) + lw t2, 6 (a1) + lw t3, 10 (a1) + LDLO (t5, 12, a1) /* partial-word load at the end of the 16-byte group */ + + addiu t4, t4, -1 /* t4-- */ + addiu a1, a1, 16 /* src += 8 */ + + PACK (t0, t1, t0) /* t0 = one halfword each from t0 and t1 */ + PACK (t6, t2, t1) /* t6 = one halfword each from t1 and t2 */ + PACK (t7, t3, t2) /* t7 = one halfword each from t2 and t3 */ + sw t0, 0 (a0) + PACK (t0, t5, t3) /* t0 = one halfword each from t3 and t5 */ + sw t6, 4 (a0) + sw t7, 8 (a0) + sw t0, 12 (a0) + + bnez t4, 1b + addiu a0, a0, 16 /* dst += 8 */ + + +5: /* Process remaining items (a2:len < 8), one at a time */ + beqz a2, 0f + nop + +1: lhu t0, 0 (a1) /* t0 <- ((uint16_t*) src)[0] */ + addiu a2, a2,-1 /* 
len-- */ + addiu a1, a1, 2 /* src++ */ + sh t0, 0 (a0) /* t0 -> ((uint16_t*) dst)[0] */ + bnez a2, 1b /* if (len != 0): loop */ + addiu a0, a0, 2 /* dst++ */ + +0: jr ra /* return */ + nop + +END(qt_blend_rgb16_on_rgb16_const_alpha_256_mips_dsp_asm) + + +#undef LDHI +#undef LDLO +#undef PACK +#undef SWHI +#undef SWLO + + +LEAF_MIPS_DSP(qt_blend_rgb16_on_rgb16_mips_dsp_asm) +/* For each of len pixels, stores to dst a weighted sum of the src and dst pixel; the weights are derived from alpha below. + * a0 - dst (*r5g6b5) + * a1 - src (const *r5g6b5) + * a2 - len (unsigned int) - batch length + * a3 - alpha (int) + */ + + beqz a2, 2f /* if (len == 0): return */ + li t9, 255 /* t9 = 255 */ + sll t8, a3, 8 /* t8 = alpha * 256 */ + subu a3, t8, a3 /* a3 = alpha * 255 */ + srl a3, a3, 8 /* a3 = (alpha * 255) >> 8 */ + subu t9, t9, a3 /* t9 = 255 - a3 */ + addiu a3, a3, 1 /* a3 = src weight for the 0x07e0 field */ + srl t4, a3, 2 /* t4 = a3 >> 2: src weight for the 0xf81f fields */ + addiu t9, t9, 1 /* t9 = dst weight for the 0x07e0 field */ + srl t5, t9, 2 /* t5 = t9 >> 2: dst weight for the 0xf81f fields */ +1: + lhu t0, 0(a1) /* t0 = src pixel */ + lhu t1, 0(a0) /* t1 = dst pixel */ + addiu a2, a2, -1 /* len-- */ + andi t2, t0, 0x07e0 /* t2 = src middle (6-bit) field */ + andi t0, t0, 0xf81f /* t0 = src outer (5-bit) fields */ + mul t2, t2, a3 + mul t0, t0, t4 + andi t3, t1, 0x07e0 /* t3 = dst middle (6-bit) field */ + andi t1, t1, 0xf81f /* t1 = dst outer (5-bit) fields */ + mul t3, t3, t9 + mul t1, t1, t5 + addiu a1, a1, 2 /* src++ */ + srl t2, t2, 8 + srl t0, t0, 6 + andi t2, t2, 0x07e0 /* drop bits that left the field */ + andi t0, t0, 0xf81f /* drop bits that left the fields */ + or t0, t0, t2 /* t0 = weighted src pixel */ + srl t3, t3, 8 + srl t1, t1, 6 + andi t3, t3, 0x07e0 + andi t1, t1, 0xf81f + or t1, t1, t3 /* t1 = weighted dst pixel */ + addu t0, t0, t1 /* t0 = weighted src + weighted dst */ + sh t0, 0(a0) /* store the result to dst */ + bgtz a2, 1b /* if (len > 0): loop */ + addiu a0, a0, 2 /* dst++ */ +2: + jr ra /* return */ + nop + +END(qt_blend_rgb16_on_rgb16_mips_dsp_asm) + + +LEAF_MIPS_DSP(fetchUntransformed_888_asm_mips_dsp) +/* Expands length 3-byte pixels from src into 32-bit words in dst, with the top byte of each word set to 0xff. + * a0 - dst address (address of 32-bit aRGB value) + * a1 - src address + * a2 - length + */ + + beqz a2, 4f /* if (length == 0): return */ + lui t8, 0xff00 /* t8 = 0xff000000 */ + andi t0, a2, 0x1 /* t0 = length % 2 */ + beqz t0, 1f /* even length: skip the single-pixel step */ + nop +/* odd length: convert a single pixel first */ + lbu t1, 0(a1) + lbu v1, 2(a1) + lbu t0, 1(a1) + addiu a1, a1, 3 + addiu a2, a2, -1 + sll t1, t1, 0x10 /* byte 0 goes to bits 23..16 */ + or v1, v1, t8 /* byte 2 stays in bits 7..0, top byte = 0xff */ + sll t0, t0, 0x8 /* byte 1 goes to bits 15..8 */ + or v1, v1, t1 + or v1, v1, t0 + sw v1, 0(a0) + addiu a0, a0, 4 + + beqz a2, 4f /* only one pixel is present (length = 1) */ + nop +1: + andi t0, a1, 0x1 /* t0 = src address % 2 */ + beqz t0, 3f /* even src address: use loop 3, otherwise loop 2 */ + nop +2: + lbu t0, 0(a1) /* t0 = | 0 | 0 | 0 | R1 | */ + lhu t1, 1(a1) /* t1 = | 0 | 0 | B1 | G1 | */ + addiu a1, a1, 3 + lhu t2, 0(a1) /* t2 = | 0 | 0 | G2 | R2 | */ + lbu t3, 2(a1) /* t3 = | 0 | 0 | 0 | B2 | */ + + sll t0, t0, 16 + or t0, t0, 
t8 /* t0 = | ff | R1 | 0 | 0 | */ + shll.ph t4, t1, 8 /* t4 = | 0 | 0 | G1 | 0 | */ + srl t5, t1, 8 /* t5 = | 0 | 0 | 0 | B1 | */ + or t4, t4, t5 /* t4 = | 0 | 0 | G1 | B1 | */ + or t0, t0, t4 /* t0 = | ff | R1 | G1 | B1 | */ + + shll.ph t4, t2, 8 /* t4 = | 0 | 0 | R2 | 0 | */ + srl t5, t2, 8 /* t5 = | 0 | 0 | 0 | G2 | */ + or t4, t4, t5 /* t4 = | 0 | 0 | R2 | G2 | */ + sll t4, t4, 8 /* t4 = | 0 | R2 | G2 | 0 | */ + or t5, t3, t8 /* t5 = | ff | 0 | 0 | B2 | */ + or t2, t4, t5 /* t2 = | ff | R2 | G2 | B2 | */ + + sw t0, 0(a0) + addiu a1, a1, 3 + sw t2, 4(a0) + addiu a2, a2, -2 /* two pixels done */ + bnez a2, 2b /* loop while pixels remain */ + addiu a0, a0, 8 + b 4f /* done */ + nop +3: + lhu t0, 0(a1) /* t0 = | 0 | 0 | G1 | R1 | */ + lbu t1, 2(a1) /* t1 = | 0 | 0 | 0 | B1 | */ + addiu a1, a1, 3 + lbu t2, 0(a1) /* t2 = | 0 | 0 | 0 | R2 | */ + lhu t3, 1(a1) /* t3 = | 0 | 0 | B2 | G2 | */ + + srl t4, t0, 8 /* t4 = | 0 | 0 | 0 | G1 | */ + shll.ph t5, t0, 8 /* t5 = | 0 | 0 | R1 | 0 | */ + or t0, t4, t5 /* t0 = | 0 | 0 | R1 | G1 | */ + sll t6, t0, 8 /* t6 = | 0 | R1 | G1 | 0 | */ + or t4, t1, t8 /* t4 = | ff | 0 | 0 | B1 | */ + or t0, t6, t4 /* t0 = | ff | R1 | G1 | B1 | */ + + sll t2, t2, 16 /* t2 = | 0 | R2 | 0 | 0 | */ + srl t4, t3, 8 /* t4 = | 0 | 0 | 0 | B2 | */ + shll.ph t5, t3, 8 /* t5 = | 0 | 0 | G2 | 0 | */ + or t3, t4, t5 /* t3 = | 0 | 0 | G2 | B2 | */ + or t2, t2, t3 /* t2 = | 0 | R2 | G2 | B2 | */ + or t2, t2, t8 /* t2 = | ff | R2 | G2 | B2 | */ + + sw t0, 0(a0) + addiu a1, a1, 3 + sw t2, 4(a0) + addiu a2, a2, -2 /* two pixels done */ + bnez a2, 3b /* loop while pixels remain */ + addiu a0, a0, 8 +4: + jr ra /* return */ + nop + +END(fetchUntransformed_888_asm_mips_dsp) + + +LEAF_MIPS_DSP(fetchUntransformed_444_asm_mips_dsp) +/* Expands length 16-bit pixels (three 4-bit channels) into 32-bit words: each nibble is duplicated into a byte and the top byte is set to 0xff. + * a0 - dst address (address of 32-bit aRGB value) + * a1 - src address + * a2 - length + */ + + lui t8, 0xff00 /* t8 = 0xff000000 */ + li t4, 0x1 /* t4 = 1, compared with length after the single-pixel step */ + + beqz a2, 5f /* if (length == 0): return */ + move v0, a0 /* v0 = dst buffer address; note that v0 is + * reused as scratch by the single-pixel paths */ + andi t0, a2, 0x1 /* t0 = length % 2 */ + beqz t0, 2f /* even length: go and check the + * src memory alignment (word) */ + nop +1: + lhu v0, 0(a1) /* v0 = 16-bit source pixel */ + addiu a1, a1, 2 + addiu a2, a2, -1 + andi t0, v0, 0xf00 /* t0 = R nibble, in bits 11..8 */ + andi v1, v0, 0xf /* v1 = B nibble, in bits 3..0 */ + andi v0, v0, 0xf0 /* v0 = G nibble, in bits 7..4 */ + sra t3, t0, 0x4 /* t3 = R in bits 7..4 */ + sra t1, v0, 0x4 /* t1 = G in bits 3..0 */ + sra t0, t0, 0x8 /* t0 = R in bits 3..0 */ + sll t2, v1, 0x4 /* t2 = B in bits 7..4 */ + or t0, t0, t3 /* t0 = RR */ + or v0, t1, v0 /* v0 = GG */ + lui t1, 0xff00 + or v1, t2, v1 /* v1 = BB */ + sll t0, t0, 0x10 /* RR to bits 23..16 */ + or v1, v1, t1 /* v1 = | ff | 0 | 0 | BB | */ + sll v0, v0, 0x8 /* GG to bits 15..8 */ + or v1, v1, t0 + or v0, v1, v0 /* v0 = | ff | RR | GG | BB | */ + sw v0, 0(a0) + addiu 
a0, a0, 4 + beqz a2, 5f /* no more pixels for processing */ + nop + beq a2, t4, 4f /* only one pixel remains */ + nop +/* check if src memory address is word aligned */ +2: + andi t0, a1, 0x3 + beqz t0, 3f /* memory is word aligned */ + andi a3, a2, 0x1 /* set the a3 register as the comparison value + * for ending the unrolled loop + * (1 if odd, 0 if even) */ + b 1b /* not word aligned: + * take another turn through the + * single-pixel code at label 1 */ + nop +3: + lw t0, 0(a1) /* t0 = two source pixels */ + addiu a2, a2, -2 + preceu.ph.qbr t1, t0 /* t1 = | 0 | aR1 | 0 | G1B1 | */ + preceu.ph.qbl t2, t0 /* t2 = | 0 | aR2 | 0 | G2B2 | */ + shll.qb t3, t1, 4 /* t3 = | 0 | R1 0 | 0 | B1 0 | */ + srl t4, t3, 4 /* t4 = | 0 | 0 R1 | 0 | 0 B1 | */ + or t0, t3, t4 /* t0 = | 0 | R1R1 | 0 | B1B1 | */ + andi t3, t1, 0xf0 /* t3 = G1 nibble, in bits 7..4 */ + sll t3, t3, 8 /* t3 = G1 in bits 15..12 */ + srl t4, t3, 4 /* t4 = G1 in bits 11..8 */ + or t1, t3, t4 /* t1 = | 0 | 0 | G1G1 | 0 | */ + or t0, t0, t1 /* t0 = | 0 | R1R1 | G1G1 | B1B1 | */ + or t0, t0, t8 /* t0 = | ff | R1R1 | G1G1 | B1B1 | */ + + shll.qb t3, t2, 4 /* t3 = | 0 | R2 0 | 0 | B2 0 | */ + srl t4, t3, 4 /* t4 = | 0 | 0 R2 | 0 | 0 B2 | */ + or t7, t3, t4 /* t7 = | 0 | R2R2 | 0 | B2B2 | */ + andi t3, t2, 0xf0 /* t3 = G2 nibble, in bits 7..4 */ + sll t3, t3, 8 /* t3 = G2 in bits 15..12 */ + srl t4, t3, 4 /* t4 = G2 in bits 11..8 */ + or t1, t3, t4 /* t1 = | 0 | 0 | G2G2 | 0 | */ + or t2, t7, t1 /* t2 = | 0 | R2R2 | G2G2 | B2B2 | */ + or t2, t2, t8 /* t2 = | ff | R2R2 | G2G2 | B2B2 | */ + + sw t0, 0(a0) + addiu a1, a1, 4 + sw t2, 4(a0) + bne a2, a3, 3b /* loop until length == a3 */ + addiu a0, a0, 8 + beqz a2, 5f /* no more pixels for processing */ + nop +4: +/* one pixel remains after the unrolled loop; same conversion as at label 1 */ + lhu v0, 0(a1) + addiu a1, a1, 2 + addiu a2, a2, -1 + andi t0, v0, 0xf00 + andi v1, v0, 0xf + andi v0, v0, 0xf0 + sra t3, t0, 0x4 + sra t1, v0, 0x4 + sra t0, t0, 0x8 + sll t2, v1, 0x4 + or t0, t0, t3 + or v0, t1, v0 + lui t1, 0xff00 + or v1, t2, v1 + sll t0, t0, 0x10 + or v1, v1, t1 + sll v0, v0, 0x8 + or v1, v1, t0 + or v0, v1, v0 /* v0 = | ff | RR | GG | BB | */ + sw v0, 0(a0) + addiu a0, a0, 4 +5: + jr ra /* return */ + nop + +END(fetchUntransformed_444_asm_mips_dsp) + + +LEAF_MIPS_DSP(fetchUntransformed_argb8565_premultiplied_asm_mips_dsp) +/* Converts length 3-byte pixels (16-bit colour plus 8-bit alpha) into 32-bit aarrggbb words, one pixel per iteration. + * a0 - dst address + * a1 - src address + * a2 - length + */ + + 
beqz a2, 2f /* if (length == 0): return */ + nop + +1: + ulh t1, 0(a1) /* t1 = first two source bytes as a halfword */ + lbu t2, 2(a1) /* t2 = third source byte, used as alpha below */ + addiu a2, a2, -1 /* length-- */ + wsbh t1, t1 /* swap the two bytes of that halfword */ + sll t0, t1, 8 /* t0 = 00000000rrrrrggggggbbbbb00000000 */ + ins t0, t1, 3, 16 /* t0 = 00000000rrrrrrrrrrggggggbbbbb000 */ + ins t0, t1, 5, 11 /* t0 = 00000000rrrrrrrrggggggbbbbbbb000 */ + srl t4, t1, 9 /* t4 = 0000000000000000000000000rrrrrgg */ + replv.qb t3, t2 /* t3 = alpha replicated into all four bytes */ + ins t0, t4, 8, 2 /* t0 = 00000000rrrrrrrrggggggggbbbbb000 */ + ins t0, t1, 3, 5 /* t0 = 00000000rrrrrrrrggggggggbbbbb000 */ + srl t4, t1, 2 /* t4 = 000000000000000000rrrrrggggggbbb */ + ins t0, t4, 0, 3 /* t0 = 00000000rrrrrrrrggggggggbbbbbbbb */ + ins t0, t2, 24, 8 /* t0 = aaaaaaaarrrrrrrrggggggggbbbbbbbb */ + cmpu.lt.qb t3, t0 /* per byte: flag where alpha < byte of t0 */ + pick.qb t0, t3, t0 /* flagged bytes take alpha, the others keep t0 */ + addiu a1, a1, 3 /* src += 3 bytes */ + sw t0, 0(a0) + bgtz a2, 1b /* if (length > 0): loop */ + addiu a0, a0, 4 +2: + jr ra /* return */ + nop + +END(fetchUntransformed_argb8565_premultiplied_asm_mips_dsp) -- cgit v1.2.3