4 files changed, 36 insertions, 12 deletions
diff --git a/src/corelib/tools/qsimd.cpp b/src/corelib/tools/qsimd.cpp
index b07667b4f9..c399a5a527 100644
--- a/src/corelib/tools/qsimd.cpp
+++ b/src/corelib/tools/qsimd.cpp
@@ -139,7 +139,7 @@ static inline uint detectProcessorFeatures()
 #if defined(QT_COMPILER_SUPPORTS_IWMMXT)
     // runtime detection only available when running as a previlegied process
     features = IWMMXT;
-#elif defined(QT_ALWAYS_HAVE_NEON)
+#elif defined(__ARM_NEON__)
     features = NEON;
 #endif
 
diff --git a/src/corelib/tools/qsimd_p.h b/src/corelib/tools/qsimd_p.h
index f22558834b..a6ae680c3e 100644
--- a/src/corelib/tools/qsimd_p.h
+++ b/src/corelib/tools/qsimd_p.h
@@ -57,6 +57,8 @@ QT_BEGIN_HEADER
  *
  * We will try to include all headers possible under this configuration.
  *
+ * MSVC does not define __SSE2__ & family, so we will define them.
+ *
  * Supported XXX are:
  *   Flag  | Arch |  GCC  | Intel CC |  MSVC  |
  *  NEON   | ARM  | I & C | None     |   ?    |
@@ -86,6 +88,10 @@ QT_BEGIN_HEADER
 #else
 #  include <emmintrin.h>
 #endif
+#if defined(Q_CC_MSVC) && (defined(_M_X64) || _M_IX86_FP >= 2)
+#  define __SSE__ 1
+#  define __SSE2__ 1
+#endif
 #endif
 
 // SSE3 intrinsics
@@ -112,11 +118,29 @@ QT_BEGIN_HEADER
 #if defined(__AVX__) || (defined(QT_COMPILER_SUPPORTS_AVX) && defined(Q_CC_MSVC))
 // immintrin.h is the ultimate header, we don't need anything else after this
 #include <immintrin.h>
+
+#  if defined(Q_CC_MSVC) && defined(_M_AVX)
+// MS Visual Studio 2010 has no macro pre-defined to identify the use of /arch:AVX
+// See: http://connect.microsoft.com/VisualStudio/feedback/details/605858/arch-avx-should-define-a-predefined-macro-in-x64-and-set-a-unique-value-for-m-ix86-fp-in-win32
+// When such a macro exists, add it above, replacing _M_AVX as appropriate
+#    define __SSE3__ 1
+#    define __SSSE3__ 1
+// no Intel CPU supports SSE4a, so don't define it
+#    define __SSE4_1__ 1
+#    define __SSE4_2__ 1
+#    define __AVX__ 1
+#    ifdef _M_AVX2
+// replace the macro above with the proper MS macro when it exists
+// All processors with AVX2 will support BMI1 and FMA
+#      define __AVX2__ 1
+#      define __BMI__ 1
+#      define __FMA__ 1
+#   endif
+#  endif
 #endif
 
 // NEON intrinsics
 #if defined __ARM_NEON__
-#define QT_ALWAYS_HAVE_NEON
 #include <arm_neon.h>
 #endif
 
diff --git a/src/corelib/tools/qstring.cpp b/src/corelib/tools/qstring.cpp
index 8d8bda4644..a5a7badeac 100644
--- a/src/corelib/tools/qstring.cpp
+++ b/src/corelib/tools/qstring.cpp
@@ -3789,7 +3789,7 @@ bool QString::endsWith(QChar c, Qt::CaseSensitivity cs) const
 }
 
 
-#if defined(QT_ALWAYS_HAVE_SSE2)
+#if defined(__SSE2__)
 static inline __m128i mergeQuestionMarks(__m128i chunk)
 {
     const __m128i questionMark = _mm_set1_epi16('?');
@@ -3851,7 +3851,7 @@ static QByteArray toLatin1_helper(const QChar *data, int length)
         ba.resize(length);
         const ushort *src = reinterpret_cast<const ushort *>(data);
         uchar *dst = (uchar*) ba.data();
-#if defined(QT_ALWAYS_HAVE_SSE2)
+#if defined(__SSE2__)
         if (length >= 16) {
             const int chunkCount = length >> 4; // divided by 16
 
@@ -3872,7 +3872,7 @@ static QByteArray toLatin1_helper(const QChar *data, int length)
             }
             length = length % 16;
         }
-#elif defined(QT_ALWAYS_HAVE_NEON)
+#elif defined(__ARM_NEON__)
         // Refer to the documentation of the SSE2 implementation
         // this use eactly the same method as for SSE except:
         // 1) neon has unsigned comparison
@@ -4028,7 +4028,7 @@ QString::Data *QString::fromLatin1_helper(const char *str, int size)
          * Unpacking with SSE has been shown to improve performance on recent CPUs
          * The same method gives no improvement with NEON.
          */
-#if defined(QT_ALWAYS_HAVE_SSE2)
+#if defined(__SSE2__)
         if (size >= 16) {
             int chunkCount = size >> 4; // divided by 16
             const __m128i nullMask = _mm_set1_epi32(0);
diff --git a/src/gui/painting/qdrawhelper.cpp b/src/gui/painting/qdrawhelper.cpp
index 8888883e51..a91f683a1c 100644
--- a/src/gui/painting/qdrawhelper.cpp
+++ b/src/gui/painting/qdrawhelper.cpp
@@ -751,7 +751,7 @@ static inline uint interpolate_4_pixels_16(uint tl, uint tr, uint bl, uint br, i
     return (((tlrb + trrb + blrb + brrb) >> 8) & 0x00ff00ff) | ((tlag + trag + blag + brag) & 0xff00ff00);
 }
 
-#if defined(QT_ALWAYS_HAVE_SSE2)
+#if defined(__SSE2__)
 #define interpolate_4_pixels_16_sse2(tl, tr, bl, br, distx, disty, colorMask, v_256, b)  \
 { \
     const __m128i dxdy = _mm_mullo_epi16 (distx, disty); \
@@ -788,7 +788,7 @@ static inline uint interpolate_4_pixels_16(uint tl, uint tr, uint bl, uint br, i
 }
 #endif
 
-#if defined(QT_ALWAYS_HAVE_NEON)
+#if defined(__ARM_NEON__)
 #define interpolate_4_pixels_16_neon(tl, tr, bl, br, distx, disty, disty_, colorMask, invColorMask, v_256, b)  \
 { \
     const int16x8_t dxdy = vmulq_s16(distx, disty); \
@@ -925,7 +925,7 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
                 }
 
                 if (blendType != BlendTransformedBilinearTiled) {
-#if defined(QT_ALWAYS_HAVE_SSE2)
+#if defined(__SSE2__)
                     const __m128i disty_ = _mm_set1_epi16(disty);
                     const __m128i idisty_ = _mm_set1_epi16(idisty);
                     const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
@@ -955,7 +955,7 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
                         rRB = _mm_srli_epi16(rRB, 8);
                         _mm_storeu_si128((__m128i*)(&intermediate_buffer[0][f]), rRB);
                     }
-#elif defined(QT_ALWAYS_HAVE_NEON)
+#elif defined(__ARM_NEON__)
                     const int16x8_t disty_ = vdupq_n_s16(disty);
                     const int16x8_t idisty_ = vdupq_n_s16(idisty);
                     const int16x8_t colorMask = vdupq_n_s16(0x00ff);
@@ -1077,7 +1077,7 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
                         boundedEnd = qMin(end, buffer + uint((image_x1 - (fx >> 16)) / data->m11)); \
                     boundedEnd -= 3;
 
-#if defined(QT_ALWAYS_HAVE_SSE2)
+#if defined(__SSE2__)
                     BILINEAR_DOWNSCALE_BOUNDS_PROLOG
 
                     const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
@@ -1117,7 +1117,7 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
                         v_fx.vect = _mm_add_epi32(v_fx.vect, v_fdx);
                     }
                     fx = v_fx.i[0];
-#elif defined(QT_ALWAYS_HAVE_NEON)
+#elif defined(__ARM_NEON__)
                     BILINEAR_DOWNSCALE_BOUNDS_PROLOG
 
                     const int16x8_t colorMask = vdupq_n_s16(0x00ff);