From f10356ead13e39c9501b84ee5e92efe722a2d2c7 Mon Sep 17 00:00:00 2001 From: lpapuga Date: Wed, 20 Nov 2013 17:09:57 +0100 Subject: MIPS DSP build system fix and additional optimizations. Changed MIPS DSP portion of the mkspecs/features/simd.prf file in order to fix the corrupted build system for MIPS platforms. List of the additionally optimized functions from file src/gui/painting/qdrawhelper.cpp: - qt_blend_rgb16_on_rgb16 - qt_fetchUntransformed_888 - qt_fetchUntransformed_444 - qt_fetchUntransformed_argb8565 from file src/gui/image/qimage.cpp: - convert_ARGB_to_ARGB_PM_inplace from file src/corelib/qstring.cpp: - ucstrncmp - toLatin1_helper - fromLatin1_helper Change-Id: I5c47a69784917eee29a8dbd2718828a390b27c93 Reviewed-by: Thiago Macieira --- mkspecs/features/simd.prf | 12 +- src/corelib/corelib.pro | 14 + src/corelib/tools/qstring.cpp | 35 ++ src/corelib/tools/qstring_mips_dsp_asm.S | 449 +++++++++++++++++++++ src/corelib/tools/tools.pri | 4 + src/gui/image/image.pri | 2 + src/gui/image/qimage.cpp | 6 + src/gui/image/qimage_mips_dspr2.cpp | 69 ++++ src/gui/image/qimage_mips_dspr2_asm.S | 207 ++++++++++ src/gui/painting/qdrawhelper.cpp | 17 +- src/gui/painting/qdrawhelper_mips_dsp.cpp | 97 ++++- src/gui/painting/qdrawhelper_mips_dsp_asm.S | 478 +++++++++++++++++++++- src/gui/painting/qdrawhelper_mips_dsp_p.h | 45 ++- src/gui/painting/qdrawhelper_mips_dspr2_asm.S | 550 +++++++++++++++++++++++++- src/gui/painting/qt_mips_asm_dsp_p.h | 166 +++++++- 15 files changed, 2141 insertions(+), 10 deletions(-) create mode 100644 src/corelib/tools/qstring_mips_dsp_asm.S create mode 100644 src/gui/image/qimage_mips_dspr2.cpp create mode 100644 src/gui/image/qimage_mips_dspr2_asm.S diff --git a/mkspecs/features/simd.prf b/mkspecs/features/simd.prf index a98683d929..ad8c545819 100644 --- a/mkspecs/features/simd.prf +++ b/mkspecs/features/simd.prf @@ -174,10 +174,19 @@ QT_CPU_FEATURES = $$eval(QT_CPU_FEATURES.$$QT_ARCH) mips_dsp_assembler.name = assembling[mips_dsp] 
${QMAKE_FILE_IN} silent:mips_dsp_assembler.commands = @echo assembling[mips_dsp] ${QMAKE_FILE_IN} && $$mips_dsp_assembler.commands QMAKE_EXTRA_COMPILERS += mips_dsp_compiler + QMAKE_EXTRA_COMPILERS += mips_dsp_assembler } mips_dspr2 { HEADERS += $$MIPS_DSP_HEADERS + mips_dspr2_compiler.commands = $$QMAKE_CXX -c + mips_dspr2_compiler.commands += $(CXXFLAGS) $(INCPATH) ${QMAKE_FILE_IN} -o ${QMAKE_FILE_OUT} + mips_dspr2_compiler.dependency_type = TYPE_C + mips_dspr2_compiler.output = ${QMAKE_VAR_OBJECTS_DIR}${QMAKE_FILE_BASE}$${first(QMAKE_EXT_OBJ)} + mips_dspr2_compiler.input = MIPS_DSPR2_SOURCES + mips_dspr2_compiler.variable_out = OBJECTS + mips_dspr2_compiler.name = compiling[mips_dspr2] ${QMAKE_FILE_IN} + silent:mips_dspr2_compiler.commands = @echo compiling[mips_dspr2] ${QMAKE_FILE_IN} && $$mips_dspr2_compiler.commands mips_dspr2_assembler.commands = $$QMAKE_CC -c mips_dspr2_assembler.commands += $(CFLAGS) $(INCPATH) ${QMAKE_FILE_IN} -o ${QMAKE_FILE_OUT} mips_dspr2_assembler.dependency_type = TYPE_C @@ -186,6 +195,7 @@ QT_CPU_FEATURES = $$eval(QT_CPU_FEATURES.$$QT_ARCH) mips_dspr2_assembler.variable_out = OBJECTS mips_dspr2_assembler.name = assembling[mips_dspr2] ${QMAKE_FILE_IN} silent:mips_dspr2_assembler.commands = @echo assembling[mips_dspr2] ${QMAKE_FILE_IN} && $$mips_dspr2_assembler.commands + QMAKE_EXTRA_COMPILERS += mips_dspr2_compiler QMAKE_EXTRA_COMPILERS += mips_dspr2_assembler } } else:win32-msvc*|winrt { @@ -297,7 +307,7 @@ QT_CPU_FEATURES = $$eval(QT_CPU_FEATURES.$$QT_ARCH) $$AVX_SOURCES $$AVX2_SOURCES \ $$NEON_SOURCES $$NEON_ASM \ $$IWMMXT_SOURCES \ - $$MIPS_DSP_SOURCES $$MIPS_DSP_ASM $$MIPS_DSPR2_ASM + $$MIPS_DSP_SOURCES $$MIPS_DSPR2_SOURCES $$MIPS_DSP_ASM $$MIPS_DSPR2_ASM # Headers are already done in the above sections. 
} diff --git a/src/corelib/corelib.pro b/src/corelib/corelib.pro index b513149e7c..df28183fdc 100644 --- a/src/corelib/corelib.pro +++ b/src/corelib/corelib.pro @@ -111,3 +111,17 @@ ctest_qt5_module_files.files += $$ctest_macros_file.output $$cmake_extras_mkspec ctest_qt5_module_files.path = $$[QT_INSTALL_LIBS]/cmake/Qt5Core INSTALLS += ctest_qt5_module_files cmake_qt5_umbrella_module_files + +mips_dsp:*-g++* { + HEADERS += $$MIPS_DSP_HEADERS + + mips_dsp_corelib_assembler.commands = $$QMAKE_CXX -c + mips_dsp_corelib_assembler.commands += $(CXXFLAGS) $(INCPATH) -mips32r2 -mdsp ${QMAKE_FILE_IN} -o ${QMAKE_FILE_OUT} + mips_dsp_corelib_assembler.dependency_type = TYPE_C + mips_dsp_corelib_assembler.output = ${QMAKE_VAR_OBJECTS_DIR}${QMAKE_FILE_BASE}$${first(QMAKE_EXT_OBJ)} + mips_dsp_corelib_assembler.input = MIPS_DSP_ASM + mips_dsp_corelib_assembler.variable_out = OBJECTS + mips_dsp_corelib_assembler.name = assembling[mips_dsp] ${QMAKE_FILE_IN} + silent:mips_dsp_corelib_assembler.commands = @echo assembling[mips_dsp] ${QMAKE_FILE_IN} && $$mips_dsp_corelib_assembler.commands + QMAKE_EXTRA_COMPILERS += mips_dsp_corelib_assembler +} diff --git a/src/corelib/tools/qstring.cpp b/src/corelib/tools/qstring.cpp index 3976f2cb6f..d682207314 100644 --- a/src/corelib/tools/qstring.cpp +++ b/src/corelib/tools/qstring.cpp @@ -188,9 +188,23 @@ static int ucstricmp(const ushort *a, const ushort *ae, const uchar *b, const uc return 1; } +#if defined(__mips_dsp) +// From qstring_mips_dsp_asm.S +extern "C" int qt_ucstrncmp_mips_dsp_asm(const ushort *a, + const ushort *b, + unsigned len); +#endif + // Unicode case-sensitive compare two same-sized strings static int ucstrncmp(const QChar *a, const QChar *b, int l) { +#if defined(__mips_dsp) + if (l >= 8) { /* only strings of 8 or more code units take the assembly path; shorter ones use the plain loop below */ + return qt_ucstrncmp_mips_dsp_asm(reinterpret_cast<const ushort *>(a), + reinterpret_cast<const ushort *>(b), + l); + } +#endif // __mips_dsp while (l-- && *a == *b) a++,b++; if (l==-1) @@ -3937,6 +3951,10 @@ static inline __m128i mergeQuestionMarks(__m128i chunk) 
} #endif +#if defined(__mips_dsp) +extern "C" void qt_toLatin1_mips_dsp_asm(uchar *dst, const ushort *src, int length); +#endif + static QByteArray toLatin1_helper(const QChar *data, int length) { QByteArray ba; @@ -3989,10 +4007,14 @@ static QByteArray toLatin1_helper(const QChar *data, int length) length = length % 8; } #endif +#if defined(__mips_dsp) + qt_toLatin1_mips_dsp_asm(dst, src, length); +#else while (length--) { *dst++ = (*src>0xff) ? '?' : (uchar) *src; ++src; } +#endif } return ba; } @@ -4104,6 +4126,12 @@ QVector QString::toUcs4() const return v; } +#if defined(__mips_dsp) +// From qstring_mips_dsp_asm.S +extern "C" void qt_fromlatin1_mips_asm_unroll4 (ushort*, const char*, uint); +extern "C" void qt_fromlatin1_mips_asm_unroll8 (ushort*, const char*, uint); +#endif + QString::Data *QString::fromLatin1_helper(const char *str, int size) { Data *d; @@ -4144,8 +4172,15 @@ QString::Data *QString::fromLatin1_helper(const char *str, int size) size = size % 16; } #endif +#if defined(__mips_dsp) + if (size > 20) + qt_fromlatin1_mips_asm_unroll8(dst, str, size); + else + qt_fromlatin1_mips_asm_unroll4(dst, str, size); +#else while (size--) *dst++ = (uchar)*str++; +#endif } return d; } diff --git a/src/corelib/tools/qstring_mips_dsp_asm.S b/src/corelib/tools/qstring_mips_dsp_asm.S new file mode 100644 index 0000000000..aee162c290 --- /dev/null +++ b/src/corelib/tools/qstring_mips_dsp_asm.S @@ -0,0 +1,449 @@ +/**************************************************************************** +** +** Copyright (C) 2013 Imagination Technologies Limited, www.imgtec.com +** Contact: http://www.qt-project.org/legal +** +** This file is part of the QtGui module of the Qt Toolkit. 
+** +** $QT_BEGIN_LICENSE:LGPL$ +** Commercial License Usage +** Licensees holding valid commercial Qt licenses may use this file in +** accordance with the commercial license agreement provided with the +** Software or, alternatively, in accordance with the terms contained in +** a written agreement between you and Digia. For licensing terms and +** conditions see http://qt.digia.com/licensing. For further information +** use the contact form at http://qt.digia.com/contact-us. +** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 2.1 as published by the Free Software +** Foundation and appearing in the file LICENSE.LGPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU Lesser General Public License version 2.1 requirements +** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. +** +** In addition, as a special exception, Digia gives you certain additional +** rights. These rights are described in the Digia Qt LGPL Exception +** version 1.1, included in the file LGPL_EXCEPTION.txt in this package. +** +** GNU General Public License Usage +** Alternatively, this file may be used under the terms of the GNU +** General Public License version 3.0 as published by the Free Software +** Foundation and appearing in the file LICENSE.GPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU General Public License version 3.0 requirements will be +** met: http://www.gnu.org/copyleft/gpl.html. 
+** +** +** $QT_END_LICENSE$ +** +****************************************************************************/ + +#include "../../gui/painting/qt_mips_asm_dsp_p.h" + +.macro EXTRACT_HALVES_x2 src1, src2, hi_1, hi_2 +#if defined(__MIPSEL) && __MIPSEL + srl \hi_1, \src1, 16 + ext \src1, \src1, 0, 16 + srl \hi_2, \src2, 16 + ext \src2, \src2, 0, 16 +#else + ext \hi_1, \src1, 0, 16 + srl \src1, \src1, 16 + ext \hi_2, \src2, 0, 16 + srl \src2, \src2, 16 +#endif +.endm + + +LEAF_MIPS_DSP(qt_ucstrncmp_mips_dsp_asm) +/* + * Arguments: + * a0 - string_a (uint16_t*) + * a1 - string_b (uint16_t*) + * a2 - length (uint32_t) + * + * Register usage: + * t0 - batches + */ + + move v0, zero /* result = 0 */ + andi t0, a0, 0x3 /* t0 = string_a % 4 */ + andi t1, a1, 0x3 /* t1 = string_b % 4 */ + or t2, t0, t1 /* t2 = t0 | t1 */ + + beqz t2, 5f /* both aligned */ + and t2, t0 ,t1 /* (delay slot) t2 = t0 & t1 */ + beqz t2, 6f /* one aligned */ + nop + + /* + * Both strings are unaligned: read 1 halfword from each, + * then fall through to continue with the both-aligned case. + */ + lhu t0, 0 (a0) + lhu t1, 0 (a1) + addiu a2, a2, -1 /* len-- */ + sub v0, t0, t1 /* v0 = t0-t1 */ + addiu a0, a0, 2 /* string_a++ */ + bnez v0, 0f /* if (t0-t1): return */ + addiu a1, a1, 2 /* string_b++ */ + beqz a2, 0f /* if !len: return */ + /* next instruction (srl) fills the branch delay slot */ + +5: /* Both string pointers are aligned */ + srl t0, a2, 3 /* batches = length / 8 */ + beqz t0, 9f /* if !batches: tail */ + andi a2, a2, 0x7 /* length = length % 8 */ + + SAVE_REGS_ON_STACK 0, s0, s1, s2, s3 + +1: lw t1, 0 (a0) /* [a0 a1] */ + lw t3, 4 (a0) /* [a2 a3] */ + lw t5, 8 (a0) /* [a4 a5] */ + lw t7, 12 (a0) /* [a6 a7] */ + + lw t2, 0 (a1) /* [b0 b1] */ + lw t4, 4 (a1) /* [b2 b3] */ + lw t6, 8 (a1) /* [b4 b5] */ + lw t8, 12 (a1) /* [b6 b7] */ + + /* + * Subtract elements one by one, if the result is zero + * both halves of the registers (shorts) are equal. 
+ */ + subq.ph s0, t1, t2 /* [a0-b0 a1-b1] */ + subq.ph s1, t3, t4 /* [a2-b2 a3-b3] */ + + bnez s0, 1f + subq.ph s2, t5, t6 /* [a4-b4 a5-b5] */ + bnez s1, 2f + subq.ph s3, t7, t8 /* [a6-b6 a7-b7] */ + bnez s2, 3f + addiu t0, t0, -1 /* batches-- */ + bnez s3, 4f + addiu a0, a0, 8*2 /* string_a += 8 */ + + bnez t0, 1b /* if batches: loop */ + addiu a1, a1, 8*2 /* string_b += 8 */ + + RESTORE_REGS_FROM_STACK 0, s0, s1, s2, s3 + + bnez a2, 9f /* if length: tail */ + nop + jr ra + nop + + + 1: /* Check t1 [a0 a1] vs. t2 [b0 b1] */ + EXTRACT_HALVES_x2 t1, t2, t3, t4 /* a0, b0, a1, b1 */ + sub v0, t1, t2 + RESTORE_REGS_FROM_STACK 0, s0, s1, s2, s3 + bnez v0, 0f /* if (a0-b0): return */ + nop + jr ra + sub v0, t3, t4 /* return a1-b1 */ + + 2: /* Check t3 [a2 a3] vs. t4 [b2 b3] */ + EXTRACT_HALVES_x2 t3, t4, t1, t2 /* a2, b2, a3, b3 */ + sub v0, t3, t4 + RESTORE_REGS_FROM_STACK 0, s0, s1, s2, s3 + bnez v0, 0f /* if (a2-b2): return */ + nop + jr ra + sub v0, t1, t2 /* return a3-b3 */ + + 3: /* Check t5 [a4 a5] vs. t6 [b4 b5] */ + EXTRACT_HALVES_x2 t5, t6, t1, t2 /* a4, b4, a5, b5 */ + sub v0, t5, t6 + RESTORE_REGS_FROM_STACK 0, s0, s1, s2, s3 + bnez v0, 0f /* if (a4-b4): return */ + nop + jr ra + sub v0, t1, t2 /* return a5-b5 */ + + 4: /* Check t7 [a6 a7] vs. t8 [b6 b7] */ + EXTRACT_HALVES_x2 t7, t8, t1, t2 /* a6, b6, a7, b7 */ + sub v0, t7, t8 + RESTORE_REGS_FROM_STACK 0, s0, s1, s2, s3 + bnez v0, 0f /* if (a6-b6): return */ + nop + jr ra + sub v0, t1, t2 + + + /* Process remaining tail items. 
*/ +9: lhu t1, 0 (a0) /* a */ + lhu t2, 0 (a1) /* b */ + addiu a2, a2, -1 /* length-- */ + sub v0, t1, t2 /* result = (a - b) */ + + bnez v0, 0f /* if (a - b): return */ + addiu a0, a0, 2 /* string_a++ */ + + bnez a2, 9b /* if length: loop */ + addiu a1, a1, 2 /* string_b++ */ + +0: jr ra + nop + + + /* One of the inputs is unaligned, do unrolled half-word loads */ +6: srl t0, a2, 3 /* batches = length / 8 */ + andi a2, a2, 0x7 /* length = length % 8 */ + +1: lhu t1, 0 (a0) + lhu t2, 0 (a1) + lhu t3, 2 (a0) + lhu t4, 2 (a1) + lhu t5, 4 (a0) + lhu t6, 4 (a1) + lhu t7, 6 (a0) + lhu t8, 6 (a1) + + sub v0, t1, t2 + sub t1, t3, t4 + + bnez v0, 0f + sub t2, t5, t6 + bnez t1, 2f + sub t3, t7, t8 + bnez t2, 3f + lhu t1, 8 (a0) + bnez t3, 4f + lhu t2, 8 (a1) + + lhu t3, 10 (a0) + lhu t4, 10 (a1) + lhu t5, 12 (a0) + lhu t6, 12 (a1) + lhu t7, 14 (a0) + lhu t8, 14 (a1) + + sub v0, t1, t2 + sub t1, t3, t4 + + bnez v0, 0f + sub t2, t5, t6 + bnez t1, 2f + sub t3, t7, t8 + bnez t2, 3f + addiu t0, t0, -1 /* batches-- */ + bnez t3, 4f + addiu a0, a0, 8*2 /* string_a += 8 */ + + bnez t0, 1b + addiu a1, a1, 8*2 /* string_b += 8 */ + + bnez a2, 9b /* if length: tail */ + nop + +0: jr ra + nop +2: jr ra + move v0, t1 +3: jr ra + move v0, t2 +4: jr ra + move v0, t3 + +END(qt_ucstrncmp_mips_dsp_asm) + + +#if defined(__mips_dspr2) +LEAF_MIPS_DSPR2(qt_fromlatin1_mips_asm_unroll8) +#else +LEAF_MIPS_DSP(qt_fromlatin1_mips_asm_unroll8) +#endif + + andi t0, a1, 0x3 + beqz t0, 9f /* check that src is aligned */ + nop + +1: lbu t1, 0 (a1) + addiu a1, a1, 1 + addiu a2, a2, -1 + sh t1, 0 (a0) + beqz a2, 0f + andi t0, a1, 0x3 + bnez t0, 1b + addiu a0, a0, 2 + +9: /* source pointer is aligned: do batches of 8 elements */ + andi t0, a0, 3 /* check if dst is aligned */ + bnez t0, 6f + srl t0, a2, 3 /* batches = len / 8 */ + andi a2, a2, 0x7 /* tail = len % 8 */ + + beqz t0, 8f /* if !batches: tail */ + nop + +1: lw t1, 0 (a1) + lw t2, 4 (a1) + + addiu a1, a1, 8*1 + addiu t0, t0, -1 + + preceu.ph.qbl 
t3, t1 + preceu.ph.qbr t1, t1 + preceu.ph.qbl t4, t2 + preceu.ph.qbr t2, t2 + +#if defined(__MIPSEL) && __MIPSEL + sw t1, 0 (a0) + sw t3, 4 (a0) + sw t2, 8 (a0) + sw t4, 12 (a0) +#else + sw t3, 0 (a0) + sw t1, 4 (a0) + sw t4, 8 (a0) + sw t2, 12 (a0) +#endif + + bnez t0, 1b + addiu a0, a0, 8*2 + +8: /* process tail items */ + beqz a2, 0f + nop + +1: lbu t1, 0 (a1) + addiu a2, a2, -1 + sh t1, 0 (a0) + addiu a1, a1, 1 + bnez a2, 1b + addiu a0, a0, 2 + +0: jr ra + nop + +6: beqz t0, 8b + andi a2, a2, 7 +7: lw t1, 0(a1) + lw t2, 4(a1) + addiu t0, t0, -1 + addiu a1, a1, 8 + andi t3, t1, 0xff +#if defined(__mips_dspr2) + prepend t1, t2, 8 +#else + sll t4, t2, 24 /* t4 = t2[7:0] << 24 (source must be t2, t4 is not loaded yet) */ + srl t1, t1, 8 + or t1, t1, t4 /* t1 = (t2 << 24) | (t1 >> 8), same result as "prepend t1, t2, 8" */ +#endif + srl t2, t2, 8 + preceu.ph.qbr t4, t1 + preceu.ph.qbl t1, t1 + preceu.ph.qbr t5, t2 + srl t2, t2, 16 + sh t3, 0(a0) + sw t4, 2(a0) + sw t1, 6(a0) + sw t5, 10(a0) + sh t2, 14(a0) + bnez t0, 7b + addiu a0, a0, 16 + bnez a2, 1b + nop + + jr ra + nop + +END(qt_fromlatin1_mips_asm_unroll8) + + +LEAF_MIPS_DSP(qt_fromlatin1_mips_asm_unroll4) +/* + * Arguments: + * a0 - dst (uint16_t*) + * a1 - src (const char*) + * a2 - len (unsigned int) + */ + + /* + * QString::fromLatin1_helper() already handles the len==0 + * case: assume that len is never zero. 
+ */ + srl t0, a2, 2 + beqz t0, 9f + andi a2, a2, 0x3 + +1: lbu t1, 0(a1) + lbu t2, 1(a1) + lbu t3, 2(a1) + lbu t4, 3(a1) + sh t1, 0(a0) + sh t2, 2(a0) + sh t3, 4(a0) + sh t4, 6(a0) + addiu t0, t0, -1 + addiu a1, a1, 4 + bnez t0, 1b + addiu a0, a0, 8 + +8: beqz a2, 0f + nop + +9: lbu t1, 0(a1) + addiu a2, a2, -1 + addiu a1, a1, 1 + sh t1, 0(a0) + bnez a2, 9b + addiu a0, a0, 2 + +0: jr ra + nop + +END(qt_fromlatin1_mips_asm_unroll4) + + +LEAF_MIPS_DSP(qt_toLatin1_mips_dsp_asm) + /* + * a0 - dst + * a1 - src + * a2 - length + */ + + addiu t9, zero, 0x3f + srl t8, a2, 2 + beqz t8, 2f + andi a2, a2, 3 +1: + lhu t0, 0(a1) + lhu t1, 2(a1) + lhu t2, 4(a1) + lhu t3, 6(a1) + srl t4, t0, 8 + srl t5, t1, 8 + srl t6, t2, 8 + srl t7, t3, 8 + movn t0, t9, t4 + movn t1, t9, t5 + movn t2, t9, t6 + movn t3, t9, t7 + addiu a1, a1, 8 + addiu t8, t8, -1 + sb t0, 0(a0) + sb t1, 1(a0) + sb t2, 2(a0) + sb t3, 3(a0) + bgtz t8, 1b + addiu a0, a0, 4 +2: beqz a2, 4f + nop +3: + lhu t0, 0(a1) + addiu a1, a1, 2 + addiu a2, a2, -1 + srl t1, t0, 8 + movn t0, t9, t1 + sb t0, 0(a0) + bgtz a2, 3b + addiu a0, a0, 1 +4: + jr ra + nop + +END(qt_toLatin1_mips_dsp_asm) + diff --git a/src/corelib/tools/tools.pri b/src/corelib/tools/tools.pri index e4a7b02aee..cac596f0bc 100644 --- a/src/corelib/tools/tools.pri +++ b/src/corelib/tools/tools.pri @@ -192,3 +192,7 @@ INCLUDEPATH += ../3rdparty/md5 \ !macx-icc:!vxworks:unix:LIBS_PRIVATE += -lm TR_EXCLUDE += ../3rdparty/* + +# MIPS DSP +MIPS_DSP_ASM += tools/qstring_mips_dsp_asm.S +MIPS_DSP_HEADERS += ../gui/painting/qt_mips_asm_dsp_p.h diff --git a/src/gui/image/image.pri b/src/gui/image/image.pri index a80ab4a2fe..bf4b5ddf01 100644 --- a/src/gui/image/image.pri +++ b/src/gui/image/image.pri @@ -78,3 +78,5 @@ NEON_SOURCES += image/qimage_neon.cpp SSE2_SOURCES += image/qimage_sse2.cpp SSSE3_SOURCES += image/qimage_ssse3.cpp AVX_SOURCES += image/qimage_avx.cpp +MIPS_DSPR2_SOURCES += image/qimage_mips_dspr2.cpp +MIPS_DSPR2_ASM += image/qimage_mips_dspr2_asm.S 
diff --git a/src/gui/image/qimage.cpp b/src/gui/image/qimage.cpp index 12ab5eaffa..70fe7b783f 100644 --- a/src/gui/image/qimage.cpp +++ b/src/gui/image/qimage.cpp @@ -3960,6 +3960,12 @@ void qInitImageConversions() return; } #endif + +#ifdef QT_COMPILER_SUPPORTS_MIPS_DSPR2 + extern bool convert_ARGB_to_ARGB_PM_inplace_mips_dspr2(QImageData *data, Qt::ImageConversionFlags); + inplace_converter_map[QImage::Format_ARGB32][QImage::Format_ARGB32_Premultiplied] = convert_ARGB_to_ARGB_PM_inplace_mips_dspr2; + return; +#endif } extern const uchar *qt_pow_rgb_gamma(); diff --git a/src/gui/image/qimage_mips_dspr2.cpp b/src/gui/image/qimage_mips_dspr2.cpp new file mode 100644 index 0000000000..a1c40a16df --- /dev/null +++ b/src/gui/image/qimage_mips_dspr2.cpp @@ -0,0 +1,69 @@ +/**************************************************************************** +** +** Copyright (C) 2013 Imagination Technologies Limited, www.imgtec.com +** Contact: http://www.qt-project.org/legal +** +** This file is part of the QtGui module of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:LGPL$ +** Commercial License Usage +** Licensees holding valid commercial Qt licenses may use this file in +** accordance with the commercial license agreement provided with the +** Software or, alternatively, in accordance with the terms contained in +** a written agreement between you and Digia. For licensing terms and +** conditions see http://qt.digia.com/licensing. For further information +** use the contact form at http://qt.digia.com/contact-us. +** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 2.1 as published by the Free Software +** Foundation and appearing in the file LICENSE.LGPL included in the +** packaging of this file. 
Please review the following information to +** ensure the GNU Lesser General Public License version 2.1 requirements +** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. +** +** In addition, as a special exception, Digia gives you certain additional +** rights. These rights are described in the Digia Qt LGPL Exception +** version 1.1, included in the file LGPL_EXCEPTION.txt in this package. +** +** GNU General Public License Usage +** Alternatively, this file may be used under the terms of the GNU +** General Public License version 3.0 as published by the Free Software +** Foundation and appearing in the file LICENSE.GPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU General Public License version 3.0 requirements will be +** met: http://www.gnu.org/copyleft/gpl.html. +** +** +** $QT_END_LICENSE$ +** +****************************************************************************/ + +#include "qimage.h" +#include + +QT_BEGIN_NAMESPACE + +// Defined in qimage_mips_dspr2_asm.S +// +extern "C" void premultiply_argb_inplace_mips_asm(void*, unsigned, unsigned, int); + +bool convert_ARGB_to_ARGB_PM_inplace_mips_dspr2(QImageData *data, Qt::ImageConversionFlags) +{ + Q_ASSERT(data->format == QImage::Format_ARGB32); + + if (!data->width || !data->height) + return true; + + Q_ASSERT((data->bytes_per_line - (data->width << 2)) >= 0); + + premultiply_argb_inplace_mips_asm(data->data, + data->height, + data->width, + data->bytes_per_line - (data->width << 2)); + + data->format = QImage::Format_ARGB32_Premultiplied; + return true; +} + +QT_END_NAMESPACE diff --git a/src/gui/image/qimage_mips_dspr2_asm.S b/src/gui/image/qimage_mips_dspr2_asm.S new file mode 100644 index 0000000000..1f03b72dd4 --- /dev/null +++ b/src/gui/image/qimage_mips_dspr2_asm.S @@ -0,0 +1,207 @@ +/**************************************************************************** +** +** Copyright (C) 2013 Imagination Technologies Limited, 
www.imgtec.com +** Contact: http://www.qt-project.org/legal +** +** This file is part of the QtGui module of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:LGPL$ +** Commercial License Usage +** Licensees holding valid commercial Qt licenses may use this file in +** accordance with the commercial license agreement provided with the +** Software or, alternatively, in accordance with the terms contained in +** a written agreement between you and Digia. For licensing terms and +** conditions see http://qt.digia.com/licensing. For further information +** use the contact form at http://qt.digia.com/contact-us. +** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 2.1 as published by the Free Software +** Foundation and appearing in the file LICENSE.LGPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU Lesser General Public License version 2.1 requirements +** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. +** +** In addition, as a special exception, Digia gives you certain additional +** rights. These rights are described in the Digia Qt LGPL Exception +** version 1.1, included in the file LGPL_EXCEPTION.txt in this package. +** +** GNU General Public License Usage +** Alternatively, this file may be used under the terms of the GNU +** General Public License version 3.0 as published by the Free Software +** Foundation and appearing in the file LICENSE.GPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU General Public License version 3.0 requirements will be +** met: http://www.gnu.org/copyleft/gpl.html. 
+** +** +** $QT_END_LICENSE$ +** +****************************************************************************/ + +#include "../painting/qt_mips_asm_dsp_p.h" + +LEAF_MIPS_DSPR2(premultiply_argb_inplace_mips_asm) + + SAVE_REGS_ON_STACK 0, s0, s1, s2, s3 + +3: srl v1, a2, 3 /* t1 = linelen / 8 */ + addiu a1, a1, -1 /* numlines-- */ + beqz v1, 1f /* if (!(linelen / 8)): tail */ + andi v0, a2, 0x7 /* v0 = linelen % 8 */ + pref 5, 0 (a0) /* cache-hint: store-streamed */ + + /* unrolled loop, handles (v1 = len / 8) batches of 8 pixels */ +2: addiu v1, v1, -1 + pref 5, 0(a0) + pref 5, 32(a0) + + lw t0, 0(a0) + lw t1, 4(a0) + lw t2, 8(a0) + lw t3, 12(a0) + srl t4, t0, 24 /* 00|00|00|A1 */ + replv.ph t5, t4 /* 00|A1|00|A1 */ + srl t6, t1, 24 /* 00|00|00|A2 */ + replv.ph t7, t6 /* 00|A2|00|A2 */ + muleu_s.ph.qbl t8, t0, t5 /* A1*A1|A1*R1 */ + muleu_s.ph.qbr t0, t0, t5 /* A1*G1|A1*B1 */ + muleu_s.ph.qbl t9, t1, t7 /* A2*A2|A2*R2 */ + muleu_s.ph.qbr t1, t1, t7 /* A2*G2|A2*B2 */ + srl t5, t2, 24 /* 00|00|00|A3 */ + replv.ph s0, t5 /* 00|A3|00|A3 */ + srl t7, t3, 24 /* 00|00|00|A4 */ + replv.ph s1, t7 /* 00|A4|00|A4 */ + muleu_s.ph.qbl s2, t2, s0 /* A3*A3|A3*R3 */ + muleu_s.ph.qbr t2, t2, s0 /* A3*G3|A3*B3 */ + muleu_s.ph.qbl s0, t3, s1 /* A4*A4|A4*R4 */ + muleu_s.ph.qbr t3, t3, s1 /* A4*G4|A4*B4 */ + preceu.ph.qbla s1, t8 + preceu.ph.qbla s3, t0 + addu.ph t8, t8, s1 + addu.ph t0, t0, s3 + preceu.ph.qbla s1, t9 + preceu.ph.qbla s3, t1 + addu.ph t9, t9, s1 + addu.ph t1, t1, s3 + preceu.ph.qbla s1, s2 + preceu.ph.qbla s3, t2 + addu.ph s2, s2, s1 + addu.ph t2, t2, s3 + preceu.ph.qbla s1, s0 + preceu.ph.qbla s3, t3 + addu.ph s0, s0, s1 + addu.ph t3, t3, s3 + shra_r.ph t8, t8, 8 /* xxAA1|xxRR1 */ + shra_r.ph t0, t0, 8 /* xxBB1|xxGG1 */ + shra_r.ph t9, t9, 8 + shra_r.ph t1, t1, 8 + shra_r.ph s2, s2, 8 + shra_r.ph t2, t2, 8 + shra_r.ph s0, s0, 8 + shra_r.ph t3, t3, 8 + precr.qb.ph t0, t8, t0 + precr.qb.ph t1, t9, t1 + precr.qb.ph t2, s2, t2 + precr.qb.ph t3, s0, t3 + append t4, t0, 24 
+ append t6, t1, 24 + append t5, t2, 24 + append t7, t3, 24 + sw t4, 0(a0) + sw t6, 4(a0) + sw t5, 8(a0) + sw t7, 12(a0) + + lw t0, 16(a0) + lw t1, 20(a0) + lw t2, 24(a0) + lw t3, 28(a0) + srl t4, t0, 24 /* 00|00|00|A1 */ + replv.ph t5, t4 /* 00|A1|00|A1 */ + srl t6, t1, 24 /* 00|00|00|A2 */ + replv.ph t7, t6 /* 00|A2|00|A2 */ + muleu_s.ph.qbl t8, t0, t5 /* A1*A1|A1*R1 */ + muleu_s.ph.qbr t0, t0, t5 /* A1*G1|A1*B1 */ + muleu_s.ph.qbl t9, t1, t7 /* A2*A2|A2*R2 */ + muleu_s.ph.qbr t1, t1, t7 /* A2*G2|A2*B2 */ + srl t5, t2, 24 /* 00|00|00|A3 */ + replv.ph s0, t5 /* 00|A3|00|A3 */ + srl t7, t3, 24 /* 00|00|00|A4 */ + replv.ph s1, t7 /* 00|A4|00|A4 */ + muleu_s.ph.qbl s2, t2, s0 /* A3*A3|A3*R3 */ + muleu_s.ph.qbr t2, t2, s0 /* A3*G3|A3*B3 */ + muleu_s.ph.qbl s0, t3, s1 /* A4*A4|A4*R4 */ + muleu_s.ph.qbr t3, t3, s1 /* A4*G4|A4*B4 */ + preceu.ph.qbla s1, t8 + preceu.ph.qbla s3, t0 + addu.ph t8, t8, s1 + addu.ph t0, t0, s3 + preceu.ph.qbla s1, t9 + preceu.ph.qbla s3, t1 + addu.ph t9, t9, s1 + addu.ph t1, t1, s3 + preceu.ph.qbla s1, s2 + preceu.ph.qbla s3, t2 + addu.ph s2, s2, s1 + addu.ph t2, t2, s3 + preceu.ph.qbla s1, s0 + preceu.ph.qbla s3, t3 + addu.ph s0, s0, s1 + addu.ph t3, t3, s3 + shra_r.ph t8, t8, 8 /* xxAA1|xxRR1 */ + shra_r.ph t0, t0, 8 /* xxBB1|xxGG1 */ + shra_r.ph t9, t9, 8 + shra_r.ph t1, t1, 8 + shra_r.ph s2, s2, 8 + shra_r.ph t2, t2, 8 + shra_r.ph s0, s0, 8 + shra_r.ph t3, t3, 8 + precr.qb.ph t0, t8, t0 + precr.qb.ph t1, t9, t1 + precr.qb.ph t2, s2, t2 + precr.qb.ph t3, s0, t3 + append t4, t0, 24 + append t6, t1, 24 + append t5, t2, 24 + append t7, t3, 24 + sw t4, 16(a0) + sw t6, 20(a0) + sw t5, 24(a0) + sw t7, 28(a0) + bgtz v1, 2b /* if (t1): unrolled loop */ + addiu a0, a0, 32 /* data += 8 */ + + beqz v0, 4f /* if (!v0): skip tail loop */ + nop + + /* tail loop, handles (len < 8), one pixel at a time */ +1: lw t1, 0 (a0) + addiu v0, v0, -1 /* len-- */ + srl t2, t1, 24 /* t2 = alpha */ + replv.ph t3, t2 + muleu_s.ph.qbl t4, t1, t3 + muleu_s.ph.qbr t1, t1, 
t3 + preceu.ph.qbla t3, t4 + preceu.ph.qbla t5, t1 + addu.ph t4, t4, t3 + addu.ph t1, t1, t5 + shra_r.ph t4, t4, 8 + shra_r.ph t1, t1, 8 + precr.qb.ph t1, t4, t1 + append t2, t1, 24 + sw t2, 0(a0) + bgtz v0, 1b + addiu a0, a0, 4 /* src++ */ + +4: bnez a1, 3b /* if (numlines): loop */ + addu a0, a0, a3 /* src += srclineskip */ + +0: /* return */ + RESTORE_REGS_FROM_STACK 0, s0, s1, s2, s3 + + jr ra + nop + +END(premultiply_argb_inplace_mips_asm) + diff --git a/src/gui/painting/qdrawhelper.cpp b/src/gui/painting/qdrawhelper.cpp index a037545dc2..cc28076b7c 100644 --- a/src/gui/painting/qdrawhelper.cpp +++ b/src/gui/painting/qdrawhelper.cpp @@ -1790,7 +1790,7 @@ static const uint *QT_FASTCALL fetchTransformedBilinear(uint *buffer, const Oper return buffer; } -static const SourceFetchProc sourceFetch[NBlendTypes][QImage::NImageFormats] = { +static SourceFetchProc sourceFetch[NBlendTypes][QImage::NImageFormats] = { // Untransformed { 0, // Invalid @@ -6402,6 +6402,21 @@ void qInitDrawhelperAsm() destStoreProc[QImage::Format_ARGB32] = qt_destStoreARGB32_mips_dsp; + sourceFetch[BlendUntransformed][QImage::Format_RGB888] = qt_fetchUntransformed_888_mips_dsp; + sourceFetch[BlendTiled][QImage::Format_RGB888] = qt_fetchUntransformed_888_mips_dsp; + + sourceFetch[BlendUntransformed][QImage::Format_RGB444] = qt_fetchUntransformed_444_mips_dsp; + sourceFetch[BlendTiled][QImage::Format_RGB444] = qt_fetchUntransformed_444_mips_dsp; + + sourceFetch[BlendUntransformed][QImage::Format_ARGB8565_Premultiplied] = qt_fetchUntransformed_argb8565_premultiplied_mips_dsp; + sourceFetch[BlendTiled][QImage::Format_ARGB8565_Premultiplied] = qt_fetchUntransformed_argb8565_premultiplied_mips_dsp; + +#if defined(QT_COMPILER_SUPPORTS_MIPS_DSPR2) + qBlendFunctions[QImage::Format_RGB16][QImage::Format_RGB16] = qt_blend_rgb16_on_rgb16_mips_dspr2; +#else + qBlendFunctions[QImage::Format_RGB16][QImage::Format_RGB16] = qt_blend_rgb16_on_rgb16_mips_dsp; +#endif // QT_COMPILER_SUPPORTS_MIPS_DSPR2 + #endif 
// QT_COMPILER_SUPPORTS_MIPS_DSP if (functionForModeSolidAsm) { const int destinationMode = QPainter::CompositionMode_Destination; diff --git a/src/gui/painting/qdrawhelper_mips_dsp.cpp b/src/gui/painting/qdrawhelper_mips_dsp.cpp index a9b551c226..2202b78ce8 100644 --- a/src/gui/painting/qdrawhelper_mips_dsp.cpp +++ b/src/gui/painting/qdrawhelper_mips_dsp.cpp @@ -1,6 +1,6 @@ /**************************************************************************** ** -** Copyright (C) 2012 MIPS Technologies, www.mips.com, author Damir Tatalovic +** Copyright (C) 2013 Imagination Technologies Limited, www.imgtec.com ** Contact: http://www.qt-project.org/legal ** ** This file is part of the QtGui module of the Qt Toolkit. @@ -110,6 +110,78 @@ void qt_blend_rgb32_on_rgb32_mips_dsp(uchar *destPixels, int dbpl, } } +#if defined QT_COMPILER_SUPPORTS_MIPS_DSPR2 +void qt_blend_rgb16_on_rgb16_mips_dspr2(uchar *destPixels, int dbpl, + const uchar *srcPixels, int sbpl, + int w, int h, + int const_alpha) +{ + if (const_alpha == 256) { + if (w < 256) { + const quint16 *src = (const quint16*) srcPixels; + quint16 *dst = (quint16*) destPixels; + for (int y = 0; y < h; ++y) { + qt_blend_rgb16_on_rgb16_const_alpha_256_mips_dsp_asm(dst, src, w); + dst = (quint16*) (((uchar*) dst) + dbpl); + src = (quint16*) (((uchar*) src) + sbpl); + } + } + else { + int length = w << 1; + while (h--) { + memcpy(destPixels, srcPixels, length); + destPixels += dbpl; + srcPixels += sbpl; + } + } + } + else if (const_alpha != 0) { + const quint16 *src = (const quint16*) srcPixels; + quint16 *dst = (quint16*) destPixels; + for (int y = 0; y < h; ++y) { + qt_blend_rgb16_on_rgb16_mips_dspr2_asm(dst, src, w, const_alpha); + dst = (quint16*) (((uchar*) dst) + dbpl); + src = (quint16*) (((uchar*) src) + sbpl); + } + } +} +#else +void qt_blend_rgb16_on_rgb16_mips_dsp(uchar *destPixels, int dbpl, + const uchar *srcPixels, int sbpl, + int w, int h, + int const_alpha) +{ + if (const_alpha == 256) { + if (w < 256) { + const 
quint16 *src = (const quint16*) srcPixels; + quint16 *dst = (quint16*) destPixels; + for (int y = 0; y < h; ++y) { + qt_blend_rgb16_on_rgb16_const_alpha_256_mips_dsp_asm(dst, src, w); + dst = (quint16*) (((uchar*) dst) + dbpl); + src = (quint16*) (((uchar*) src) + sbpl); + } + } + else { + int length = w << 1; + while (h--) { + memcpy(destPixels, srcPixels, length); + destPixels += dbpl; + srcPixels += sbpl; + } + } + } + else if (const_alpha != 0) { + const quint16 *src = (const quint16*) srcPixels; + quint16 *dst = (quint16*) destPixels; + for (int y = 0; y < h; ++y) { + qt_blend_rgb16_on_rgb16_mips_dsp_asm(dst, src, w, const_alpha); + dst = (quint16*) (((uchar*) dst) + dbpl); + src = (quint16*) (((uchar*) src) + sbpl); + } + } +} +#endif + void comp_func_Source_mips_dsp(uint *dest, const uint *src, int length, uint const_alpha) { if (const_alpha == 255) { @@ -422,5 +494,28 @@ void QT_FASTCALL comp_func_SourceOut_mips_dsp(uint *dest, const uint *src, int l comp_func_SourceOut_dsp_asm_x2(dest, src, length, const_alpha); } +const uint * QT_FASTCALL qt_fetchUntransformed_888_mips_dsp (uint *buffer, const Operator *, const QSpanData *data, + int y, int x, int length) +{ /* Converts length RGB888 pixels of row y, starting at pixel x, into 32-bit values in buffer; returns buffer. */ + uchar *line = (uchar *)data->texture.scanLine(y) + x * 3; /* x is a pixel index: the asm fetcher consumes 3 bytes per pixel */ + fetchUntransformed_888_asm_mips_dsp(buffer, line, length); + return buffer; +} + +const uint * QT_FASTCALL qt_fetchUntransformed_444_mips_dsp (uint *buffer, const Operator *, const QSpanData *data, + int y, int x, int length) +{ /* Converts length RGB444 pixels of row y, starting at pixel x, into 32-bit values in buffer; returns buffer. */ + uchar *line = (uchar *)data->texture.scanLine(y) + x * 2; /* x is a pixel index: the asm fetcher consumes one halfword (2 bytes) per pixel */ + fetchUntransformed_444_asm_mips_dsp(buffer, line, length); + return buffer; +} + +const uint * QT_FASTCALL qt_fetchUntransformed_argb8565_premultiplied_mips_dsp (uint *buffer, const Operator *, const QSpanData *data, + int y, int x, int length) +{ /* Converts length ARGB8565 premultiplied pixels of row y, starting at pixel x, into 32-bit values in buffer; returns buffer. */ + uchar *line = (uchar *)data->texture.scanLine(y) + x * 3; /* x is a pixel index: the asm fetcher consumes 3 bytes per pixel */ + fetchUntransformed_argb8565_premultiplied_asm_mips_dsp(buffer, line, length); + return buffer; +} QT_END_NAMESPACE diff --git
a/src/gui/painting/qdrawhelper_mips_dsp_asm.S b/src/gui/painting/qdrawhelper_mips_dsp_asm.S index 64fc635970..26b48f9d62 100644 --- a/src/gui/painting/qdrawhelper_mips_dsp_asm.S +++ b/src/gui/painting/qdrawhelper_mips_dsp_asm.S @@ -1,6 +1,6 @@ /**************************************************************************** ** -** Copyright (C) 2012 MIPS Technologies, www.mips.com, author Damir Tatalovic +** Copyright (C) 2013 Imagination Technologies Limited, www.imgtec.com ** Contact: http://www.qt-project.org/legal ** ** This file is part of the QtGui module of the Qt Toolkit. @@ -1601,3 +1601,479 @@ LEAF_MIPS_DSP(qt_blend_argb32_on_argb32_const_alpha_256_mips_dsp_asm) nop END(qt_blend_argb32_on_argb32_const_alpha_256_mips_dsp_asm) + + +#if defined(__MIPSEL) && __MIPSEL +# define PACK(r, s, t) packrl.ph r, s, t +# define SWHI(r, o, b) swl r, o + 1 (b) +# define SWLO(r, o, b) swr r, o + 0 (b) +# define LDHI(r, o, b) lwl r, o + 1 (b) +# define LDLO(r, o, b) lwr r, o + 2 (b) +#else +# define PACK(r, s, t) packrl.ph r, t, s +# define SWHI(r, o, b) swr r, o + 1 (b) +# define SWLO(r, o, b) swl r, o + 0 (b) +# define LDHI(r, o, b) lwr r, o + 1 (b) +# define LDLO(r, o, b) lwl r, o + 2 (b) +#endif + +LEAF_MIPS_DSP(qt_blend_rgb16_on_rgb16_const_alpha_256_mips_dsp_asm) +/* + * a0 - dst (*r5g6b5) + * a1 - src (const *r5g6b5) + * a2 - len (unsigned int) + * + * Register usage: + * t0-3 - Scratch registers + * t4 - Number of iterations to do in unrolled loops + * t5-7 - Auxiliary scratch registers. + * + * Check if base addresses of src/dst are aligned, cases: + * a) Both aligned. + * b) Both unaligned: + * 1. Copy a halfword + * 2. Use aligned case. + * c) dst aligned, src unaligned: + * 1. Read a word from dst, halfword from src. + * 2. Continue reading words from both. + * d) dst unaligned, src aligned: + * 1. Read a word from src, halfword from dst. + * 2. Continue reading words from both. 
+ */ + + beqz a2, 0f /* if (a2:len == 0): return */ + andi t0, a0, 0x3 /* t0 = a0:dst % 4 */ + andi t1, a1, 0x3 /* t1 = a1:src % 4 */ + or t2, t0, t1 /* t2 = t0 | t1 */ + + beqz t2, 4f /* both aligned */ + nop + beqz t0, 3f /* dst aligned, src unaligned */ + nop + beqz t1, 2f /* src aligned, dst unaligned */ + nop + + /* + * Both src/dst are unaligned: read 1 halfword from each, + * then fall-off to continue with word-aligned copy. + */ + lhu t0, 0 (a1) /* t0 <- ((uint16_t*) src)[0] */ + addiu a1, a1, 2 /* src++ */ + addiu a2, a2,-1 /* len-- */ + sh t0, 0 (a0) /* t0 -> ((uint16_t*) dst)[0] */ + addiu a0, a0, 2 /* dst++ */ + + /* + * Both src/dst pointers are word-aligned, process eight + * items at a time in an unrolled loop. + */ +4: beqz a2, 0f /* if (len == 0): return */ + srl t4, a2, 3 /* t4 = len / 8 */ + + beqz t4, 5f /* if (t4 == 0): tail */ + andi a2, a2, 0x07 /* len = len % 8 */ + +1: lw t0, 0 (a1) + lw t1, 4 (a1) + lw t2, 8 (a1) + lw t3, 12 (a1) + + addiu t4, t4, -1 /* t4-- */ + addiu a1, a1, 16 /* src += 8 */ + + sw t0, 0 (a0) + sw t1, 4 (a0) + sw t2, 8 (a0) + sw t3, 12 (a0) + + bnez t4, 1b + addiu a0, a0, 16 /* dst += 8 */ + + b 5f + nop + + + /* + * dst pointer is unaligned + */ +2: beqz a2, 0f /* if (len == 0): return */ + srl t4, a2, 3 /* t4 = len / 8 */ + beqz t4, 5f /* if (t4 == 0): tail */ + andi a2, a2, 0x07 /* len = len % 8 */ + +1: lw t0, 0 (a1) + lw t1, 4 (a1) + lw t2, 8 (a1) + lw t3, 12 (a1) + + addiu t4, t4, -1 /* t4-- */ + addiu a1, a1, 16 /* src += 8 */ + + SWLO (t0, 0, a0) + PACK (t5, t1, t0) + PACK (t6, t2, t1) + PACK (t7, t3, t2) + SWHI (t3, 14, a0) + sw t5, 2 (a0) + sw t6, 6 (a0) + sw t7, 10 (a0) + + bnez t4, 1b + addiu a0, a0, 16 /* dst += 8 */ + + b 5f + nop + + /* + * src pointer is unaligned + */ +3: beqz a2, 0f /* if (len == 0): return */ + srl t4, a2, 3 /* t4 = len / 8 */ + beqz t4, 5f /* if (t4 == 0): tail */ + andi a2, a2, 0x07 /* len = len % 8 */ + +1: LDHI (t0, 0, a1) + lw t1, 2 (a1) + lw t2, 6 (a1) + lw t3, 10 (a1) + LDLO
(t5, 12, a1) + + addiu t4, t4, -1 /* t4-- */ + addiu a1, a1, 16 /* src += 8 */ + + PACK (t0, t1, t0) + PACK (t6, t2, t1) + PACK (t7, t3, t2) + sw t0, 0 (a0) + PACK (t0, t5, t3) + sw t6, 4 (a0) + sw t7, 8 (a0) + sw t0, 12 (a0) + + bnez t4, 1b + addiu a0, a0, 16 /* dst += 8 */ + + +5: /* Process remaining items (a2:len < 8), one at a time */ + beqz a2, 0f + nop + +1: lhu t0, 0 (a1) /* t0 <- ((uint16_t*) src)[0] */ + addiu a2, a2,-1 /* len-- */ + addiu a1, a1, 2 /* src++ */ + sh t0, 0 (a0) /* t0 -> ((uint16_t*) dst)[0] */ + bnez a2, 1b /* if (len != 0): loop */ + addiu a0, a0, 2 /* dst++ */ + +0: jr ra + nop + +END(qt_blend_rgb16_on_rgb16_const_alpha_256_mips_dsp_asm) + + +#undef LDHI +#undef LDLO +#undef PACK +#undef SWHI +#undef SWLO + + +LEAF_MIPS_DSP(qt_blend_rgb16_on_rgb16_mips_dsp_asm) +/* + * a0 - dst (*r5g6b5) + * a1 - src (const *r5g6b5) + * a2 - len (unsigned int) - batch length + * a3 - alpha (int) + */ + + beqz a2, 2f /* if (len == 0): return */ + li t9, 255 + sll t8, a3, 8 + subu a3, t8, a3 /* a3 = alpha * 255 */ + srl a3, a3, 8 /* alpha = (alpha * 255) >> 8 */ + subu t9, t9, a3 /* t9 = ialpha = 255 - alpha */ + addiu a3, a3, 1 /* alpha++ */ + srl t4, a3, 2 /* t4 = alpha >> 2 */ + addiu t9, t9, 1 /* ialpha++ */ + srl t5, t9, 2 /* t5 = ialpha >> 2 */ +1: + lhu t0, 0(a1) /* t0 = src pixel */ + lhu t1, 0(a0) /* t1 = dst pixel */ + addiu a2, a2, -1 /* len-- */ + andi t2, t0, 0x07e0 /* src green field */ + andi t0, t0, 0xf81f /* src red and blue fields */ + mul t2, t2, a3 + mul t0, t0, t4 + andi t3, t1, 0x07e0 /* dst green field */ + andi t1, t1, 0xf81f /* dst red and blue fields */ + mul t3, t3, t9 + mul t1, t1, t5 + addiu a1, a1, 2 /* src++ */ + srl t2, t2, 8 + srl t0, t0, 6 + andi t2, t2, 0x07e0 + andi t0, t0, 0xf81f + or t0, t0, t2 + srl t3, t3, 8 + srl t1, t1, 6 + andi t3, t3, 0x07e0 + andi t1, t1, 0xf81f + or t1, t1, t3 + addu t0, t0, t1 /* src * alpha + dst * ialpha */ + sh t0, 0(a0) + bgtz a2, 1b + addiu a0, a0, 2 /* dst++ */ +2: + jr ra + nop + +END(qt_blend_rgb16_on_rgb16_mips_dsp_asm) + + +LEAF_MIPS_DSP(fetchUntransformed_888_asm_mips_dsp) +/* + * a0 - dst address (address of 32-bit aRGB value) + * a1 - src address + * a2 - length + */ + + beqz a2, 4f + lui t8, 0xff00 + andi t0, a2, 0x1 + beqz t0, 1f + nop +/* case for one pixel */ + lbu t1, 0(a1) + lbu v1, 2(a1) + lbu t0, 1(a1) + addiu a1, a1, 3 + addiu a2, a2, -1 + sll t1, t1, 0x10 + or v1, v1,
t8 + sll t0, t0, 0x8 + or v1, v1, t1 + or v1, v1, t0 + sw v1, 0(a0) + addiu a0, a0, 4 + + beqz a2, 4f /* only one pixel is present (length = 1) */ + nop +1: + andi t0, a1, 0x1 + beqz t0, 3f + nop +2: + lbu t0, 0(a1) /* t0 = | 0 | 0 | 0 | R1 | */ + lhu t1, 1(a1) /* t1 = | 0 | 0 | B1 | G1 | */ + addiu a1, a1, 3 + lhu t2, 0(a1) /* t2 = | 0 | 0 | G2 | R2 | */ + lbu t3, 2(a1) /* t3 = | 0 | 0 | 0 | B2 | */ + + sll t0, t0, 16 + or t0, t0, t8 /* t0 = | ff | R1 | 0 | 0 | */ + shll.ph t4, t1, 8 /* t4 = | 0 | 0 | G1 | 0 | */ + srl t5, t1, 8 + or t4, t4, t5 /* t4 = | 0 | 0 | G1 | B1 | */ + or t0, t0, t4 /* t0 = | ff | R1 | G1 | B1 | */ + + shll.ph t4, t2, 8 /* t4 = | 0 | 0 | R2 | 0 | */ + srl t5, t2, 8 /* t5 = | 0 | 0 | 0 | G2 | */ + or t4, t4, t5 + sll t4, t4, 8 /* t4 = | 0 | R2 | G2 | 0 | */ + or t5, t3, t8 + or t2, t4, t5 /* t2 = | ff | R2 | G2 | B2 | */ + + sw t0, 0(a0) + addiu a1, a1, 3 + sw t2, 4(a0) + addiu a2, a2, -2 + bnez a2, 2b + addiu a0, a0, 8 + b 4f + nop +3: + lhu t0, 0(a1) /* t0 = | 0 | 0 | G1 | R1 | */ + lbu t1, 2(a1) /* t1 = | 0 | 0 | 0 | B1 | */ + addiu a1, a1, 3 + lbu t2, 0(a1) /* t2 = | 0 | 0 | 0 | R2 | */ + lhu t3, 1(a1) /* t3 = | 0 | 0 | B2 | G2 | */ + + srl t4, t0, 8 /* t4 = | 0 | 0 | 0 | G1 | */ + shll.ph t5, t0, 8 /* t5 = | 0 | 0 | R1 | 0 | */ + or t0, t4, t5 + sll t6, t0, 8 /* t6 = | 0 | R1 | G1 | 0 | */ + or t4, t1, t8 /* t4 = | ff | 0 | 0 | B1 | */ + or t0, t6, t4 + + sll t2, t2, 16 + srl t4, t3, 8 + shll.ph t5, t3, 8 + or t3, t4, t5 + or t2, t2, t3 + or t2, t2, t8 + + sw t0, 0(a0) + addiu a1, a1, 3 + sw t2, 4(a0) + addiu a2, a2, -2 + bnez a2, 3b + addiu a0, a0, 8 +4: + jr ra + nop + +END(fetchUntransformed_888_asm_mips_dsp) + + +LEAF_MIPS_DSP(fetchUntransformed_444_asm_mips_dsp) +/* + * a0 - dst address (address of 32-bit aRGB value) + * a1 - src address + * a2 - length + */ + + lui t8, 0xff00 + li t4, 0x1 + + beqz a2, 5f + move v0, a0 /* just return the address of buffer + * for storing returning values */ + andi t0, a2, 0x1 + beqz t0, 2f /* 
there is more then one pixel + * (check src memory alignment (word)) */ + nop +1: + lhu v0, 0(a1) + addiu a1, a1, 2 + addiu a2, a2, -1 + andi t0, v0, 0xf00 + andi v1, v0, 0xf + andi v0, v0, 0xf0 + sra t3, t0, 0x4 + sra t1, v0, 0x4 + sra t0, t0, 0x8 + sll t2, v1, 0x4 + or t0, t0, t3 + or v0, t1, v0 + lui t1, 0xff00 + or v1, t2, v1 + sll t0, t0, 0x10 + or v1, v1, t1 + sll v0, v0, 0x8 + or v1, v1, t0 + or v0, v1, v0 + sw v0, 0(a0) + addiu a0, a0, 4 + beqz a2, 5f /* no more pixels for processing */ + nop + beq a2, t4, 4f /* only one more pixel remained */ + nop +/* check if src memory address is word aligned */ +2: + andi t0, a1, 0x3 + beqz t0, 3f /* memory is word aligned */ + andi a3, a2, 0x1 /* set the a3 register as the comparison value + * for ending the unrolled loop + * (1 if odd, 0 if even) */ + b 1b /* not word aligned, + * go another turn with + * just one pixel processing */ + nop +3: + lw t0, 0(a1) + addiu a2, a2, -2 + preceu.ph.qbr t1, t0 /* t1 = | 0 | aR1 | 0 | G1B1 | */ + preceu.ph.qbl t2, t0 /* t2 = | 0 | aR2 | 0 | G2B2 | */ + shll.qb t3, t1, 4 /* t3 = | 0 | R1 0 | 0 | B1 0 | */ + srl t4, t3, 4 + or t0, t3, t4 /* t0 = | 0 | R1R1 | 0 | B1B1 | */ + andi t3, t1, 0xf0 + sll t3, t3, 8 + srl t4, t3, 4 + or t1, t3, t4 + or t0, t0, t1 /* t0 = | 0 | R1R1 | G1G1 | B1B1 | */ + or t0, t0, t8 /* t0 = | ff | R1R1 | G1G1 | B1B1 | */ + + shll.qb t3, t2, 4 /* t3 = | 0 | R2 0 | 0 | B2 0 | */ + srl t4, t3, 4 + or t7, t3, t4 /* t7 = | 0 | R2R2 | 0 | B2B2 | */ + andi t3, t2, 0xf0 + sll t3, t3, 8 + srl t4, t3, 4 + or t1, t3, t4 + or t2, t7, t1 /* t2 = | 0 | R2R2 | G2G2 | B2B2 | */ + or t2, t2, t8 /* t2 = | ff | R2R2 | G2G2 | B2B2 | */ + + sw t0, 0(a0) + addiu a1, a1, 4 + sw t2, 4(a0) + bne a2, a3, 3b + addiu a0, a0, 8 + beqz a2, 5f /* no more pixels for processing */ + nop +4: +/* one more pixel remained (after loop unrolling process finished) */ + lhu v0, 0(a1) + addiu a1, a1, 2 + addiu a2, a2, -1 + andi t0, v0, 0xf00 + andi v1, v0, 0xf + andi v0, v0, 0xf0 + sra t3, t0, 0x4 +
sra t1, v0, 0x4 + sra t0, t0, 0x8 + sll t2, v1, 0x4 + or t0, t0, t3 + or v0, t1, v0 + lui t1, 0xff00 + or v1, t2, v1 + sll t0, t0, 0x10 + or v1, v1, t1 + sll v0, v0, 0x8 + or v1, v1, t0 + or v0, v1, v0 + sw v0, 0(a0) + addiu a0, a0, 4 +5: + jr ra + nop + +END(fetchUntransformed_444_asm_mips_dsp) + + +LEAF_MIPS_DSP(fetchUntransformed_argb8565_premultiplied_asm_mips_dsp) +/* + * a0 - dst address + * a1 - src address + * a2 - length + */ + + beqz a2, 2f + nop + +1: + ulh t1, 0(a1) + lbu t2, 2(a1) + addiu a2, a2, -1 + wsbh t1, t1 + sll t0, t1, 8 /* t0 = 00000000rrrrrggggggbbbbb00000000 */ + ins t0, t1, 3, 16 /* t0 = 00000000rrrrrrrrrrggggggbbbbb000 */ + ins t0, t1, 5, 11 /* t0 = 00000000rrrrrrrrggggggbbbbbbb000 */ + srl t4, t1, 9 /* t4 = 0000000000000000000000000rrrrrgg */ + replv.qb t3, t2 + ins t0, t4, 8, 2 /* t0 = 00000000rrrrrrrrggggggggbbbbb000 */ + ins t0, t1, 3, 5 /* t0 = 00000000rrrrrrrrggggggggbbbbb000 */ + srl t4, t1, 2 /* t4 = 000000000000000000rrrrrggggggbbb */ + ins t0, t4, 0, 3 /* t0 = 00000000rrrrrrrrggggggggbbbbbbbb */ + ins t0, t2, 24, 8 /* t0 =aaaaaaaarrrrrrrrggggggggbbbbbbbb */ + cmpu.lt.qb t3, t0 + pick.qb t0, t3, t0 + addiu a1, a1, 3 + sw t0, 0(a0) + bgtz a2, 1b + addiu a0, a0, 4 +2: + jr ra + nop + +END(fetchUntransformed_argb8565_premultiplied_asm_mips_dsp) diff --git a/src/gui/painting/qdrawhelper_mips_dsp_p.h b/src/gui/painting/qdrawhelper_mips_dsp_p.h index 2df7d4920a..3dceb7793d 100644 --- a/src/gui/painting/qdrawhelper_mips_dsp_p.h +++ b/src/gui/painting/qdrawhelper_mips_dsp_p.h @@ -1,6 +1,6 @@ /**************************************************************************** ** -** Copyright (C) 2012 MIPS Technologies, www.mips.com, author Damir Tatalovic +** Copyright (C) 2013 Imagination Technologies Limited, www.imgtec.com ** Contact: http://www.qt-project.org/legal ** ** This file is part of the QtGui module of the Qt Toolkit. 
@@ -101,19 +101,34 @@ extern "C" void qt_blend_argb32_on_argb32_mips_dsp_asm_x2(uint *dest, const uint extern "C" void qt_blend_argb32_on_argb32_const_alpha_256_mips_dsp_asm(uint *dest, const uint *src, int length); +extern "C" void qt_blend_rgb16_on_rgb16_const_alpha_256_mips_dsp_asm(quint16 *dest, const quint16 *src, int length); + +extern "C" void qt_blend_rgb16_on_rgb16_mips_dsp_asm(quint16 *dest, const quint16 *src, int length, uint const_alpha); + extern "C" uint * destfetchARGB32_asm_mips_dsp(uint *buffer, const uint *data, int length); extern "C" uint * qt_destStoreARGB32_asm_mips_dsp(uint *buffer, const uint *data, int length); +extern "C" uint * fetchUntransformed_888_asm_mips_dsp(uint *buffer, const uchar *line, int length); + +extern "C" uint * fetchUntransformed_444_asm_mips_dsp(uint *buffer, const uchar *line, int length); + +extern "C" uint * fetchUntransformed_argb8565_premultiplied_asm_mips_dsp(uint *buffer, const uchar *line, int length); + void qt_blend_argb32_on_argb32_mips_dsp(uchar *destPixels, int dbpl, const uchar *srcPixels, int sbpl, int w, int h, int const_alpha); void qt_blend_rgb32_on_rgb32_mips_dsp(uchar *destPixels, int dbpl, - const uchar *srcPixels, int sbpl, - int w, int h, - int const_alpha); + const uchar *srcPixels, int sbpl, + int w, int h, + int const_alpha); + +void qt_blend_rgb16_on_rgb16_mips_dsp(uchar *destPixels, int dbpl, + const uchar *srcPixels, int sbpl, + int w, int h, + int const_alpha); void comp_func_Source_mips_dsp(uint *dest, const uint *src, int length, uint const_alpha); @@ -164,6 +179,21 @@ void QT_FASTCALL comp_func_SourceOut_mips_dsp(uint *dest, const uint *src, int l void QT_FASTCALL comp_func_XOR_mips_dsp(uint *dest, const uint *src, int length, uint const_alpha); +const uint * QT_FASTCALL qt_fetchUntransformed_888_mips_dsp (uint *buffer, + const Operator *, + const QSpanData *data, + int y, int x, int length); + +const uint * QT_FASTCALL qt_fetchUntransformed_444_mips_dsp (uint *buffer, + const Operator 
*, + const QSpanData *data, + int y, int x, int length); + +const uint * QT_FASTCALL qt_fetchUntransformed_argb8565_premultiplied_mips_dsp (uint *buffer, + const Operator *, + const QSpanData *data, + int y, int x, int length); + #endif // QT_COMPILER_SUPPORTS_MIPS_DSP @@ -171,6 +201,13 @@ void QT_FASTCALL comp_func_XOR_mips_dsp(uint *dest, const uint *src, int length, extern "C" void qConvertRgb16To32_asm_mips_dspr2(quint32 *dest, const quint16 *src, int length); +extern "C" void qt_blend_rgb16_on_rgb16_mips_dspr2_asm(quint16 *dest, const quint16 *src, int length, uint const_alpha); + +void qt_blend_rgb16_on_rgb16_mips_dspr2(uchar *destPixels, int dbpl, + const uchar *srcPixels, int sbpl, + int w, int h, + int const_alpha); + #endif // QT_COMPILER_SUPPORTS_MIPS_DSPR2 QT_END_NAMESPACE diff --git a/src/gui/painting/qdrawhelper_mips_dspr2_asm.S b/src/gui/painting/qdrawhelper_mips_dspr2_asm.S index ec220732be..c7a603eebe 100644 --- a/src/gui/painting/qdrawhelper_mips_dspr2_asm.S +++ b/src/gui/painting/qdrawhelper_mips_dspr2_asm.S @@ -1,6 +1,6 @@ /**************************************************************************** ** -** Copyright (C) 2012 MIPS Technologies, www.mips.com, author Damir Tatalovic +** Copyright (C) 2013 Imagination Technologies Limited, www.imgtec.com ** Contact: http://www.qt-project.org/legal ** ** This file is part of the QtGui module of the Qt Toolkit. 
@@ -122,3 +122,551 @@ LEAF_MIPS_DSPR2(qConvertRgb16To32_asm_mips_dspr2) nop END(qConvertRgb16To32_asm_mips_dspr2) + + +#if defined(__MIPSEL) && __MIPSEL +# define PACK(r, s, t) packrl.ph r, s, t +# define LDHI(r, o, b) lwl r, o + 1 (b) +# define LDLO(r, o, b) lwr r, o + 2 (b) +#else +# define PACK(r, s, t) packrl.ph r, t, s +# define LDHI(r, o, b) lwr r, o + 1 (b) +# define LDLO(r, o, b) lwl r, o + 2 (b) +#endif + + +LEAF_MIPS_DSPR2(qt_blend_rgb16_on_rgb16_mips_dspr2_asm) +/* + * a0 - dst (*r5g6b5) + * a1 - src (const *r5g6b5) + * a2 - len (unsigned int) - batch length + * a3 - alpha (int) + * + * Register usage: + * t0-3 - Scratch registers + * t4 - Number of iterations to do in unrolled loops + * t5 - Inverse alpha + * t6 - Alpha >> 2 + * t7 - Inverse alpha >> 2 + * t8 - magic1 (0x07e007e0) + * t9 - magic2 (0xf81ff81f) + * + * NOTE: + * Cannot use DSP instructions for the multiplication of two + * 16-bit values: overflow would be always rounded or saturated. + */ + + beqz a2, 0f + andi t0, a0, 0x3 + andi t1, a1, 0x3 + /* Adjust alpha value, and calculate inverse alpha value */ + li t5, 255 + or t2, t0, t1 /* t2 = (dst & 0x3) | (src & 0x3) */ + sll t8, a3, 8 + subu a3, t8, a3 + li t8, 0x07e007e0 /* magic1 */ + srl a3, a3, 8 /* alpha >>= 8 */ + li t9, 0xf81ff81f /* magic2 */ + subu t5, t5, a3 /* ialpha = 255 - alpha */ + addiu a3, a3, 1 /* alpha++ */ + addiu t5, t5, 1 /* ialpha++ */ + srl t6, a3, 2 /* ashift = alpha >> 2 */ + + beqz t2, 4f /* both aligned */ + srl t7, t5, 2 /* iashift = ialpha >> 2 */ + + beqz t1, 2f /* src aligned, dst unaligned */ + nop + + beqz t0, 3f /* dst aligned, src unaligned */ + nop + + /* + * Both src/dst are unaligned: read 1 halfword from each, then + * fall-off to continue with word-aligned operation.
+ */ + lhu t1, 0 (a1) + lhu t0, 0 (a0) + addiu a2, a2, -1 /* len-- */ + andi t2, t1, 0x07e0 + andi t1, t1, 0xf81f + mul t2, t2, a3 + mul t1, t1, t6 + andi t3, t0, 0x07e0 + andi t0, t0, 0xf81f + mul t3, t3, t5 + mul t0, t0, t7 + addiu a1, a1, 2 /* src++ */ + srl t2, t2, 8 + srl t1, t1, 6 + andi t2, t2, 0x07e0 + andi t1, t1, 0xf81f + or t1, t1, t2 + srl t3, t3, 8 + srl t0, t0, 6 + andi t3, t3, 0x07e0 + andi t0, t0, 0xf81f + or t0, t0, t3 + addu t0, t0, t1 /* src * alpha + dst * ialpha */ + sh t0, 0 (a0) + addiu a0, a0, 2 /* dst++ */ + + /* + * Both src/dst pointers are word-aligned, process eight + * items at a time in an unrolled loop. + */ +4: beqz a2, 0f + srl t4, a2, 3 /* t4 = len / 8 */ + beqz t4, 5f + andi a2, a2, 0x7 /* len = len % 8 */ + SAVE_REGS_ON_STACK 12, s0, s1, s2, s3, s4, v0, v1 + +1: lw t1, 0 (a1) /* [s0, s1] */ + lw v1, 4 (a1) /* [s2, s3] */ + lw s1, 8 (a1) /* [s4, s5] */ + lw s3, 12 (a1) /* [s6, s7] */ + + lw t0, 0 (a0) /* [d0, d1] */ + lw v0, 4 (a0) /* [d2, d3] */ + lw s0, 8 (a0) /* [d4, d5] */ + lw s2, 12 (a0) /* [d6, d7] */ + + pref 4, 16 (a1) + pref 5, 16 (a0) + + and t2, t1, t8 + ext t3, t2, 0, 16 + srl t2, t2, 16 + mul t3, t3, a3 + mul t2, t2, a3 + and t1, t1, t9 + ext s4, t1, 0, 16 + mul s4, s4, t6 + srl t1, t1, 16 + mul t1, t1, t6 + srl t3, t3, 8 + srl t2, t2, 8 + append t2, t3, 16 + and t2, t2, t8 + srl s4, s4, 6 + and t3, v1, t8 + srl t1, t1, 6 + append t1, s4, 16 + and t1, t1, t9 + or t1, t1, t2 + ext t2, t3, 0, 16 + srl t3, t3, 16 + mul t2, t2, a3 + mul t3, t3, a3 + and v1, v1, t9 + ext s4, v1, 0, 16 + mul s4, s4, t6 + srl v1, v1, 16 + mul v1, v1, t6 + srl t2, t2, 8 + srl t3, t3, 8 + append t3, t2, 16 + and t3, t3, t8 + srl s4, s4, 6 + and t2, s1, t8 + srl v1, v1, 6 + append v1, s4, 16 + and v1, v1, t9 + or v1, v1, t3 + ext t3, t2, 0, 16 + srl t2, t2, 16 + mul t3, t3, a3 + mul t2, t2, a3 + and s1, s1, t9 + ext s4, s1, 0, 16 + mul s4, s4, t6 + srl s1, s1, 16 + mul s1, s1, t6 + srl t3, t3, 8 + srl t2, t2, 8 + append t2, t3, 16 + and t2, 
t2, t8 + srl s4, s4, 6 + and t3, s3, t8 + srl s1, s1, 6 + append s1, s4, 16 + and s1, s1, t9 + or s1, s1, t2 + ext t2, t3, 0, 16 + srl t3, t3, 16 + mul t2, t2, a3 + mul t3, t3, a3 + and s3, s3, t9 + ext s4, s3, 0, 16 + mul s4, s4, t6 + srl s3, s3, 16 + mul s3, s3, t6 + srl t2, t2, 8 + srl t3, t3, 8 + append t3, t2, 16 + and t3, t3, t8 + srl s4, s4, 6 + and t2, t0, t8 + srl s3, s3, 6 + append s3, s4, 16 + and s3, s3, t9 + or s3, s3, t3 + ext t3, t2, 0, 16 + srl t2, t2, 16 + mul t3, t3, t5 + mul t2, t2, t5 + and t0, t0, t9 + ext s4, t0, 0, 16 + mul s4, s4, t7 + srl t0, t0, 16 + mul t0, t0, t7 + srl t3, t3, 8 + srl t2, t2, 8 + append t2, t3, 16 + and t2, t2, t8 + srl s4, s4, 6 + and t3, v0, t8 + srl t0, t0, 6 + append t0, s4, 16 + and t0, t0, t9 + or t0, t0, t2 + ext t2, t3, 0, 16 + srl t3, t3, 16 + mul t2, t2, t5 + mul t3, t3, t5 + and v0, v0, t9 + ext s4, v0, 0, 16 + mul s4, s4, t7 + srl v0, v0, 16 + mul v0, v0, t7 + srl t2, t2, 8 + srl t3, t3, 8 + append t3, t2, 16 + and t3, t3, t8 + srl s4, s4, 6 + and t2, s0, t8 + srl v0, v0, 6 + append v0, s4, 16 + and v0, v0, t9 + or v0, v0, t3 + ext t3, t2, 0, 16 + srl t2, t2, 16 + mul t3, t3, t5 + mul t2, t2, t5 + and s0, s0, t9 + ext s4, s0, 0, 16 + mul s4, s4, t7 + srl s0, s0, 16 + mul s0, s0, t7 + srl t3, t3, 8 + srl t2, t2, 8 + append t2, t3, 16 + and t2, t2, t8 + srl s4, s4, 6 + and t3, s2, t8 + srl s0, s0, 6 + append s0, s4, 16 + and s0, s0, t9 + or s0, s0, t2 + ext t2, t3, 0, 16 + srl t3, t3, 16 + mul t2, t2, t5 + mul t3, t3, t5 + and s2, s2, t9 + ext s4, s2, 0, 16 + mul s4, s4, t7 + srl s2, s2, 16 + mul s2, s2, t7 + srl t2, t2, 8 + srl t3, t3, 8 + append t3, t2, 16 + and t3, t3, t8 + srl s4, s4, 6 + addu.ph t0, t0, t1 + srl s2, s2, 6 + append s2, s4, 16 + and s2, s2, t9 + or s2, s2, t3 + addu.ph v0, v0, v1 /* v0 = [S2 + D2, S3 + D3] */ + addu.ph s0, s0, s1 /* s0 = [S4 + D4, S5 + D5] */ + addu.ph s2, s2, s3 /* s2 = [S6 + D6, S7 + D7] */ + + sw t0, 0 (a0) /* [SS0, SS1] */ + sw v0, 4 (a0) /* [SS2, SS3] */ + sw s0, 8 (a0) 
/* [SS4, SS5] */ + sw s2, 12 (a0) /* [SS6, SS7] */ + + addiu t4, t4, -1 /* t4-- */ + addiu a1, a1, 16 /* src += 8 */ + + bnez t4, 1b + addiu a0, a0, 16 /* dst += 8 */ + + RESTORE_REGS_FROM_STACK 12, s0, s1, s2, s3, s4, v0, v1 + b 5f + nop + + + /* dst unaligned: do one item and fall down to the src unaligned case */ +2: lhu t1, 0 (a1) + lhu t0, 0 (a0) + addiu a2, a2, -1 /* len-- */ + andi t2, t1, 0x07e0 + andi t1, t1, 0xf81f + mul t2, t2, a3 + mul t1, t1, t6 + andi t3, t0, 0x07e0 + andi t0, t0, 0xf81f + mul t3, t3, t5 + mul t0, t0, t7 + addiu a1, a1, 2 /* src++ */ + srl t2, t2, 8 + srl t1, t1, 6 + andi t2, t2, 0x07e0 + andi t1, t1, 0xf81f + or t1, t1, t2 + srl t3, t3, 8 + srl t0, t0, 6 + andi t3, t3, 0x07e0 + andi t0, t0, 0xf81f + or t0, t0, t3 + addu t0, t0, t1 /* src * alpha + dst * ialpha */ + sh t0, 0 (a0) + addiu a0, a0, 2 /* dst++ */ + + /* src unaligned */ +3: beqz a2, 0f + srl t4, a2, 3 /* t4 = len / 8 */ + beqz t4, 5f + andi a2, a2, 0x7 /* len = len % 8 */ + SAVE_REGS_ON_STACK 12, s0, s1, s2, s3, s4, v0, v1 + +1: lw t0, 0 (a0) /* [d0, d1] */ + lw v0, 4 (a0) /* [d2, d3] */ + lw s0, 8 (a0) /* [d4, d5] */ + lw s2, 12 (a0) /* [d6, d7] */ + + LDHI (t1, 0, a1) /* [s0, __] */ + lw v1, 2 (a1) /* [s1, s2] */ + lw s1, 6 (a1) /* [s3, s4] */ + lw s3, 10 (a1) /* [s5, s6] */ + LDLO (s4, 12, a1) /* [__, s7] */ + + pref 4, 14 (a1) + pref 5, 16 (a0) + + PACK (t1, v1, t1) /* [s0, s1] */ + PACK (v1, s1, v1) /* [s2, s3] */ + PACK (s1, s3, s1) /* [s4, s5] */ + PACK (s3, s4, s3) /* [s6, s7] */ + + and t2, t1, t8 + ext t3, t2, 0, 16 + srl t2, t2, 16 + mul t3, t3, a3 + mul t2, t2, a3 + and t1, t1, t9 + ext s4, t1, 0, 16 + mul s4, s4, t6 + srl t1, t1, 16 + mul t1, t1, t6 + srl t3, t3, 8 + srl t2, t2, 8 + append t2, t3, 16 + and t2, t2, t8 + srl s4, s4, 6 + and t3, v1, t8 + srl t1, t1, 6 + append t1, s4, 16 + and t1, t1, t9 + or t1, t1, t2 + ext t2, t3, 0, 16 + srl t3, t3, 16 + mul t2, t2, a3 + mul t3, t3, a3 + and v1, v1, t9 + ext s4, v1, 0, 16 + mul s4, s4, t6 + srl v1, v1, 16 + 
mul v1, v1, t6 + srl t2, t2, 8 + srl t3, t3, 8 + append t3, t2, 16 + and t3, t3, t8 + srl s4, s4, 6 + and t2, s1, t8 + srl v1, v1, 6 + append v1, s4, 16 + and v1, v1, t9 + or v1, v1, t3 + ext t3, t2, 0, 16 + srl t2, t2, 16 + mul t3, t3, a3 + mul t2, t2, a3 + and s1, s1, t9 + ext s4, s1, 0, 16 + mul s4, s4, t6 + srl s1, s1, 16 + mul s1, s1, t6 + srl t3, t3, 8 + srl t2, t2, 8 + append t2, t3, 16 + and t2, t2, t8 + srl s4, s4, 6 + and t3, s3, t8 + srl s1, s1, 6 + append s1, s4, 16 + and s1, s1, t9 + or s1, s1, t2 + ext t2, t3, 0, 16 + srl t3, t3, 16 + mul t2, t2, a3 + mul t3, t3, a3 + and s3, s3, t9 + ext s4, s3, 0, 16 + mul s4, s4, t6 + srl s3, s3, 16 + mul s3, s3, t6 + srl t2, t2, 8 + srl t3, t3, 8 + append t3, t2, 16 + and t3, t3, t8 + srl s4, s4, 6 + and t2, t0, t8 + srl s3, s3, 6 + append s3, s4, 16 + and s3, s3, t9 + or s3, s3, t3 + ext t3, t2, 0, 16 + srl t2, t2, 16 + mul t3, t3, t5 + mul t2, t2, t5 + and t0, t0, t9 + ext s4, t0, 0, 16 + mul s4, s4, t7 + srl t0, t0, 16 + mul t0, t0, t7 + srl t3, t3, 8 + srl t2, t2, 8 + append t2, t3, 16 + and t2, t2, t8 + srl s4, s4, 6 + and t3, v0, t8 + srl t0, t0, 6 + append t0, s4, 16 + and t0, t0, t9 + or t0, t0, t2 + ext t2, t3, 0, 16 + srl t3, t3, 16 + mul t2, t2, t5 + mul t3, t3, t5 + and v0, v0, t9 + ext s4, v0, 0, 16 + mul s4, s4, t7 + srl v0, v0, 16 + mul v0, v0, t7 + srl t2, t2, 8 + srl t3, t3, 8 + append t3, t2, 16 + and t3, t3, t8 + srl s4, s4, 6 + and t2, s0, t8 + srl v0, v0, 6 + append v0, s4, 16 + and v0, v0, t9 + or v0, v0, t3 + ext t3, t2, 0, 16 + srl t2, t2, 16 + mul t3, t3, t5 + mul t2, t2, t5 + and s0, s0, t9 + ext s4, s0, 0, 16 + mul s4, s4, t7 + srl s0, s0, 16 + mul s0, s0, t7 + srl t3, t3, 8 + srl t2, t2, 8 + append t2, t3, 16 + and t2, t2, t8 + srl s4, s4, 6 + and t3, s2, t8 + srl s0, s0, 6 + append s0, s4, 16 + and s0, s0, t9 + or s0, s0, t2 + ext t2, t3, 0, 16 + srl t3, t3, 16 + mul t2, t2, t5 + mul t3, t3, t5 + and s2, s2, t9 + ext s4, s2, 0, 16 + mul s4, s4, t7 + srl s2, s2, 16 + mul s2, s2, t7 + 
srl t2, t2, 8 + srl t3, t3, 8 + append t3, t2, 16 + and t3, t3, t8 + srl s4, s4, 6 + addu.ph t0, t0, t1 + srl s2, s2, 6 + append s2, s4, 16 + and s2, s2, t9 + or s2, s2, t3 + addu.ph v0, v0, v1 /* v0 = [S2 + D2, S3 + D3] */ + addu.ph s0, s0, s1 /* s0 = [S4 + D4, S5 + D5] */ + addu.ph s2, s2, s3 /* s2 = [S6 + D6, S7 + D7] */ + + sw t0, 0 (a0) /* [SS0, SS1] */ + sw v0, 4 (a0) /* [SS2, SS3] */ + sw s0, 8 (a0) /* [SS4, SS5] */ + sw s2, 12 (a0) /* [SS6, SS7] */ + + addiu t4, t4, -1 /* t4-- */ + addiu a1, a1, 16 /* src += 8 */ + + bnez t4, 1b + addiu a0, a0, 16 /* dst += 8 */ + + RESTORE_REGS_FROM_STACK 12, s0, s1, s2, s3, s4, v0, v1 + +5: /* Process remaining items (len < 8), one at a time */ + beqz a2, 0f + nop + +1: lhu t1, 0 (a1) + lhu t0, 0 (a0) + addiu a1, a1, 2 /* src++ */ + andi t2, t1, 0x07e0 + andi t1, t1, 0xf81f + mul t2, t2, a3 + mul t1, t1, t6 + andi t3, t0, 0x07e0 + andi t0, t0, 0xf81f + mul t3, t3, t5 + mul t0, t0, t7 + addiu a2, a2, -1 /* len-- */ + srl t2, t2, 8 + srl t1, t1, 6 + andi t2, t2, 0x07e0 + andi t1, t1, 0xf81f + or t1, t1, t2 + srl t3, t3, 8 + srl t0, t0, 6 + andi t3, t3, 0x07e0 + andi t0, t0, 0xf81f + or t0, t0, t3 + + addu t0, t0, t1 /* src*alpha + dst*ialpha */ + sh t0, 0 (a0) + bnez a2, 1b + addiu a0, a0, 2 /* dst++ */ + +0: jr ra + nop + +END(qt_blend_rgb16_on_rgb16_mips_dspr2_asm) + +#undef PACK +#undef LDHI +#undef LDLO diff --git a/src/gui/painting/qt_mips_asm_dsp_p.h b/src/gui/painting/qt_mips_asm_dsp_p.h index 1b78eaf52c..54fcfab206 100644 --- a/src/gui/painting/qt_mips_asm_dsp_p.h +++ b/src/gui/painting/qt_mips_asm_dsp_p.h @@ -1,6 +1,6 @@ /**************************************************************************** ** -** Copyright (C) 2012 MIPS Technologies, www.mips.com, author Damir Tatalovic +** Copyright (C) 2013 Imagination Technologies Limited, www.imgtec.com ** Contact: http://www.qt-project.org/legal ** ** This file is part of the QtGui module of the Qt Toolkit. 
@@ -261,4 +261,168 @@ LEAF_MIPS32R2(symbol) \ or \out_1, \scratch1, \scratch3 .endm +/* + * Checks if stack offset is big enough for storing/restoring regs_num + * registers to/from stack. The registers need regs_num*4 bytes, but the + * stack offset only has to be at least regs_num*4 - 16 bytes: + * since MIPS ABI allows usage of first 16 bytes of stack frame (this is + * reserved for input arguments of the functions, already stored in a0-a3), + * stack size can be further optimized by utilizing this space. + */ +.macro CHECK_STACK_OFFSET regs_num, stack_offset +.if \stack_offset < \regs_num * 4 - 16 +.error "Stack offset too small." +.endif +.endm + +/* + * Saves a set of registers on stack. Maximum number of registers that + * can be saved on stack is limited to 14 (a0-a3, v0-v1 and s0-s7). + * Stack offset is the number of bytes subtracted from stack pointer (sp) + * before registers are stored in order to provide enough space on stack + * (offset must be multiple of 4, and must be big enough, as described by + * CHECK_STACK_OFFSET macro). This macro is intended to be used in + * combination with RESTORE_REGS_FROM_STACK macro. Example: + * SAVE_REGS_ON_STACK 4, v0, v1, s0, s1 + * RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1 + */ +.macro SAVE_REGS_ON_STACK stack_offset = 0, r1, \ + r2 = 0, r3 = 0, r4 = 0, \ + r5 = 0, r6 = 0, r7 = 0, \ + r8 = 0, r9 = 0, r10 = 0, \ + r11 = 0, r12 = 0, r13 = 0, \ + r14 = 0 + .if (\stack_offset < 0) || (\stack_offset - (\stack_offset / 4) * 4) + .error "Stack offset must be positive and multiple of 4." 
+ .endif + .if \stack_offset != 0 + addiu sp, sp, -\stack_offset + .endif + sw \r1, 0(sp) + .if \r2 != 0 + sw \r2, 4(sp) + .endif + .if \r3 != 0 + sw \r3, 8(sp) + .endif + .if \r4 != 0 + sw \r4, 12(sp) + .endif + .if \r5 != 0 + CHECK_STACK_OFFSET 5, \stack_offset + sw \r5, 16(sp) + .endif + .if \r6 != 0 + CHECK_STACK_OFFSET 6, \stack_offset + sw \r6, 20(sp) + .endif + .if \r7 != 0 + CHECK_STACK_OFFSET 7, \stack_offset + sw \r7, 24(sp) + .endif + .if \r8 != 0 + CHECK_STACK_OFFSET 8, \stack_offset + sw \r8, 28(sp) + .endif + .if \r9 != 0 + CHECK_STACK_OFFSET 9, \stack_offset + sw \r9, 32(sp) + .endif + .if \r10 != 0 + CHECK_STACK_OFFSET 10, \stack_offset + sw \r10, 36(sp) + .endif + .if \r11 != 0 + CHECK_STACK_OFFSET 11, \stack_offset + sw \r11, 40(sp) + .endif + .if \r12 != 0 + CHECK_STACK_OFFSET 12, \stack_offset + sw \r12, 44(sp) + .endif + .if \r13 != 0 + CHECK_STACK_OFFSET 13, \stack_offset + sw \r13, 48(sp) + .endif + .if \r14 != 0 + CHECK_STACK_OFFSET 14, \stack_offset + sw \r14, 52(sp) + .endif +.endm + +/* + * Restores a set of registers from stack. Maximum number of registers that + * can be restored from stack is limited to 14 (a0-a3, v0-v1 and s0-s7). + * Stack offset is the number of bytes added to stack pointer (sp) + * after registers are restored (offset must be multiple of 4, and must + * be big enough, as described by CHECK_STACK_OFFSET macro). This macro is + * intended to be used in combination with SAVE_REGS_ON_STACK macro. + * Example: + * SAVE_REGS_ON_STACK 4, v0, v1, s0, s1 + * RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1 + */ +.macro RESTORE_REGS_FROM_STACK stack_offset = 0, r1, \ + r2 = 0, r3 = 0, r4 = 0, \ + r5 = 0, r6 = 0, r7 = 0, \ + r8 = 0, r9 = 0, r10 = 0, \ + r11 = 0, r12 = 0, r13 = 0, \ + r14 = 0 + .if (\stack_offset < 0) || (\stack_offset - (\stack_offset/4)*4) + .error "Stack offset must be positive and multiple of 4." 
+ .endif + lw \r1, 0(sp) + .if \r2 != 0 + lw \r2, 4(sp) + .endif + .if \r3 != 0 + lw \r3, 8(sp) + .endif + .if \r4 != 0 + lw \r4, 12(sp) + .endif + .if \r5 != 0 + CHECK_STACK_OFFSET 5, \stack_offset + lw \r5, 16(sp) + .endif + .if \r6 != 0 + CHECK_STACK_OFFSET 6, \stack_offset + lw \r6, 20(sp) + .endif + .if \r7 != 0 + CHECK_STACK_OFFSET 7, \stack_offset + lw \r7, 24(sp) + .endif + .if \r8 != 0 + CHECK_STACK_OFFSET 8, \stack_offset + lw \r8, 28(sp) + .endif + .if \r9 != 0 + CHECK_STACK_OFFSET 9, \stack_offset + lw \r9, 32(sp) + .endif + .if \r10 != 0 + CHECK_STACK_OFFSET 10, \stack_offset + lw \r10, 36(sp) + .endif + .if \r11 != 0 + CHECK_STACK_OFFSET 11, \stack_offset + lw \r11, 40(sp) + .endif + .if \r12 != 0 + CHECK_STACK_OFFSET 12, \stack_offset + lw \r12, 44(sp) + .endif + .if \r13 != 0 + CHECK_STACK_OFFSET 13, \stack_offset + lw \r13, 48(sp) + .endif + .if \r14 != 0 + CHECK_STACK_OFFSET 14, \stack_offset + lw \r14, 52(sp) + .endif + .if \stack_offset != 0 + addiu sp, sp, \stack_offset + .endif +.endm + #endif // QT_MIPS_ASM_DSP_H -- cgit v1.2.3