summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Thiago Macieira <thiago.macieira@intel.com> 2012-05-30 14:43:06 +0200
committer: Qt by Nokia <qt-info@nokia.com> 2012-05-31 17:55:43 +0200
commit: 05dc32ef6cc0d7279642839ea21100dee40a81c4 (patch)
tree: a83e6e2eebc9bdd5d522f6fb695bdde80c9ae9d6
parent: 398c8513b172d4605a27dfa6125045b55e7cb29e (diff)
Fix the confusion caused by the QT_ALWAYS_HAVE_xxx macros
The QT_ALWAYS_HAVE_xxx macros are gone: they were hard to use and not defined properly. They indicated that the compiler was producing code that required that particular instruction set, so we could use that instruction set in our code unconditionally. Instead, let's use the GCC-style __SSE2__ and __ARM_NEON__ macros. MSVC does not define the __SSE2__ macro, so let's define it on the compiler's behalf. Also, define __AVX__ and the macros for the technologies leading to it when we manage to detect an /arch:AVX build (currently not possible, see note in the header). ICC and MSVC allow one to use the intrinsics anywhere, but in Qt all uses of the intrinsics are either in specially-built files, protected by runtime checks, or unconditional (qstring.cpp). So we only use the intrinsics when the compiler was instructed to generate code for that instruction set anyway. Change-Id: If8382f30422cee0e5831d051b003acf036824abf Reviewed-by: Oswald Buddenhagen <oswald.buddenhagen@nokia.com>
-rw-r--r-- src/corelib/tools/qsimd.cpp 2
-rw-r--r-- src/corelib/tools/qsimd_p.h 26
-rw-r--r-- src/corelib/tools/qstring.cpp 8
-rw-r--r-- src/gui/painting/qdrawhelper.cpp 12
4 files changed, 36 insertions, 12 deletions
diff --git a/src/corelib/tools/qsimd.cpp b/src/corelib/tools/qsimd.cpp
index b07667b4f9..c399a5a527 100644
--- a/src/corelib/tools/qsimd.cpp
+++ b/src/corelib/tools/qsimd.cpp
@@ -139,7 +139,7 @@ static inline uint detectProcessorFeatures()
#if defined(QT_COMPILER_SUPPORTS_IWMMXT)
// runtime detection only available when running as a privileged process
features = IWMMXT;
-#elif defined(QT_ALWAYS_HAVE_NEON)
+#elif defined(__ARM_NEON__)
features = NEON;
#endif
diff --git a/src/corelib/tools/qsimd_p.h b/src/corelib/tools/qsimd_p.h
index f22558834b..a6ae680c3e 100644
--- a/src/corelib/tools/qsimd_p.h
+++ b/src/corelib/tools/qsimd_p.h
@@ -57,6 +57,8 @@ QT_BEGIN_HEADER
*
* We will try to include all headers possible under this configuration.
*
+ * MSVC does not define __SSE2__ & family, so we will define them.
+ *
* Supported XXX are:
* Flag | Arch | GCC | Intel CC | MSVC |
* NEON | ARM | I & C | None | ? |
@@ -86,6 +88,10 @@ QT_BEGIN_HEADER
#else
# include <emmintrin.h>
#endif
+#if defined(Q_CC_MSVC) && (defined(_M_X64) || _M_IX86_FP >= 2)
+# define __SSE__ 1
+# define __SSE2__ 1
+#endif
#endif
// SSE3 intrinsics
@@ -112,11 +118,29 @@ QT_BEGIN_HEADER
#if defined(__AVX__) || (defined(QT_COMPILER_SUPPORTS_AVX) && defined(Q_CC_MSVC))
// immintrin.h is the ultimate header, we don't need anything else after this
#include <immintrin.h>
+
+# if defined(Q_CC_MSVC) && defined(_M_AVX)
+// MS Visual Studio 2010 has no macro pre-defined to identify the use of /arch:AVX
+// See: http://connect.microsoft.com/VisualStudio/feedback/details/605858/arch-avx-should-define-a-predefined-macro-in-x64-and-set-a-unique-value-for-m-ix86-fp-in-win32
+// When such a macro exists, add it above, replacing _M_AVX as appropriate
+# define __SSE3__ 1
+# define __SSSE3__ 1
+// no Intel CPU supports SSE4a, so don't define it
+# define __SSE4_1__ 1
+# define __SSE4_2__ 1
+# define __AVX__ 1
+# ifdef _M_AVX2
+// replace the macro above with the proper MS macro when it exists
+// All processors with AVX2 will support BMI1 and FMA
+# define __AVX2__ 1
+# define __BMI__ 1
+# define __FMA__ 1
+# endif
+# endif
#endif
// NEON intrinsics
#if defined __ARM_NEON__
-#define QT_ALWAYS_HAVE_NEON
#include <arm_neon.h>
#endif
diff --git a/src/corelib/tools/qstring.cpp b/src/corelib/tools/qstring.cpp
index 8d8bda4644..a5a7badeac 100644
--- a/src/corelib/tools/qstring.cpp
+++ b/src/corelib/tools/qstring.cpp
@@ -3789,7 +3789,7 @@ bool QString::endsWith(QChar c, Qt::CaseSensitivity cs) const
}
-#if defined(QT_ALWAYS_HAVE_SSE2)
+#if defined(__SSE2__)
static inline __m128i mergeQuestionMarks(__m128i chunk)
{
const __m128i questionMark = _mm_set1_epi16('?');
@@ -3851,7 +3851,7 @@ static QByteArray toLatin1_helper(const QChar *data, int length)
ba.resize(length);
const ushort *src = reinterpret_cast<const ushort *>(data);
uchar *dst = (uchar*) ba.data();
-#if defined(QT_ALWAYS_HAVE_SSE2)
+#if defined(__SSE2__)
if (length >= 16) {
const int chunkCount = length >> 4; // divided by 16
@@ -3872,7 +3872,7 @@ static QByteArray toLatin1_helper(const QChar *data, int length)
}
length = length % 16;
}
-#elif defined(QT_ALWAYS_HAVE_NEON)
+#elif defined(__ARM_NEON__)
// Refer to the documentation of the SSE2 implementation
// this uses exactly the same method as for SSE except:
// 1) neon has unsigned comparison
@@ -4028,7 +4028,7 @@ QString::Data *QString::fromLatin1_helper(const char *str, int size)
* Unpacking with SSE has been shown to improve performance on recent CPUs
* The same method gives no improvement with NEON.
*/
-#if defined(QT_ALWAYS_HAVE_SSE2)
+#if defined(__SSE2__)
if (size >= 16) {
int chunkCount = size >> 4; // divided by 16
const __m128i nullMask = _mm_set1_epi32(0);
diff --git a/src/gui/painting/qdrawhelper.cpp b/src/gui/painting/qdrawhelper.cpp
index 8888883e51..a91f683a1c 100644
--- a/src/gui/painting/qdrawhelper.cpp
+++ b/src/gui/painting/qdrawhelper.cpp
@@ -751,7 +751,7 @@ static inline uint interpolate_4_pixels_16(uint tl, uint tr, uint bl, uint br, i
return (((tlrb + trrb + blrb + brrb) >> 8) & 0x00ff00ff) | ((tlag + trag + blag + brag) & 0xff00ff00);
}
-#if defined(QT_ALWAYS_HAVE_SSE2)
+#if defined(__SSE2__)
#define interpolate_4_pixels_16_sse2(tl, tr, bl, br, distx, disty, colorMask, v_256, b) \
{ \
const __m128i dxdy = _mm_mullo_epi16 (distx, disty); \
@@ -788,7 +788,7 @@ static inline uint interpolate_4_pixels_16(uint tl, uint tr, uint bl, uint br, i
}
#endif
-#if defined(QT_ALWAYS_HAVE_NEON)
+#if defined(__ARM_NEON__)
#define interpolate_4_pixels_16_neon(tl, tr, bl, br, distx, disty, disty_, colorMask, invColorMask, v_256, b) \
{ \
const int16x8_t dxdy = vmulq_s16(distx, disty); \
@@ -925,7 +925,7 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
}
if (blendType != BlendTransformedBilinearTiled) {
-#if defined(QT_ALWAYS_HAVE_SSE2)
+#if defined(__SSE2__)
const __m128i disty_ = _mm_set1_epi16(disty);
const __m128i idisty_ = _mm_set1_epi16(idisty);
const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
@@ -955,7 +955,7 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
rRB = _mm_srli_epi16(rRB, 8);
_mm_storeu_si128((__m128i*)(&intermediate_buffer[0][f]), rRB);
}
-#elif defined(QT_ALWAYS_HAVE_NEON)
+#elif defined(__ARM_NEON__)
const int16x8_t disty_ = vdupq_n_s16(disty);
const int16x8_t idisty_ = vdupq_n_s16(idisty);
const int16x8_t colorMask = vdupq_n_s16(0x00ff);
@@ -1077,7 +1077,7 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
boundedEnd = qMin(end, buffer + uint((image_x1 - (fx >> 16)) / data->m11)); \
boundedEnd -= 3;
-#if defined(QT_ALWAYS_HAVE_SSE2)
+#if defined(__SSE2__)
BILINEAR_DOWNSCALE_BOUNDS_PROLOG
const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
@@ -1117,7 +1117,7 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
v_fx.vect = _mm_add_epi32(v_fx.vect, v_fdx);
}
fx = v_fx.i[0];
-#elif defined(QT_ALWAYS_HAVE_NEON)
+#elif defined(__ARM_NEON__)
BILINEAR_DOWNSCALE_BOUNDS_PROLOG
const int16x8_t colorMask = vdupq_n_s16(0x00ff);