diff options
Diffstat (limited to 'chromium/third_party/skia/bench/MemcpyBench.cpp')
-rw-r--r-- | chromium/third_party/skia/bench/MemcpyBench.cpp | 160 |
1 file changed, 160 insertions, 0 deletions
diff --git a/chromium/third_party/skia/bench/MemcpyBench.cpp b/chromium/third_party/skia/bench/MemcpyBench.cpp new file mode 100644 index 00000000000..f5501927519 --- /dev/null +++ b/chromium/third_party/skia/bench/MemcpyBench.cpp @@ -0,0 +1,160 @@ +/* + * Copyright 2014 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ + +#include "Benchmark.h" +#include "SkRandom.h" +#include "SkTemplates.h" +#include "SkUtils.h" + +template <typename Memcpy32> +class Memcpy32Bench : public Benchmark { +public: + explicit Memcpy32Bench(int count, Memcpy32 memcpy32, const char* name) + : fCount(count) + , fMemcpy32(memcpy32) + , fName(SkStringPrintf("%s_%d", name, count)) {} + + virtual const char* onGetName() SK_OVERRIDE { + return fName.c_str(); + } + + virtual bool isSuitableFor(Backend backend) SK_OVERRIDE { + return backend == kNonRendering_Backend; + } + + virtual void onPreDraw() SK_OVERRIDE { + fDst.reset(fCount); + fSrc.reset(fCount); + + SkRandom rand; + for (int i = 0; i < fCount; i++) { + fSrc[i] = rand.nextU(); + } + } + + virtual void onDraw(const int loops, SkCanvas*) SK_OVERRIDE { + for (int i = 0; i < loops; i++) { + fMemcpy32(fDst, fSrc, fCount); + } + } + +private: + SkAutoTMalloc<uint32_t> fDst, fSrc; + + int fCount; + Memcpy32 fMemcpy32; + const SkString fName; +}; + +template <typename Memcpy32> +static Memcpy32Bench<Memcpy32>* Bench(int count, Memcpy32 memcpy32, const char* name) { + return new Memcpy32Bench<Memcpy32>(count, memcpy32, name); +} +#define BENCH(memcpy32, count) DEF_BENCH(return Bench(count, memcpy32, #memcpy32); ) + + +// Let the libc developers do what they think is best. 
+static void memcpy32_memcpy(uint32_t* dst, const uint32_t* src, int count) { + memcpy(dst, src, sizeof(uint32_t) * count); +} +BENCH(memcpy32_memcpy, 10) +BENCH(memcpy32_memcpy, 100) +BENCH(memcpy32_memcpy, 1000) +BENCH(memcpy32_memcpy, 10000) +BENCH(memcpy32_memcpy, 100000) + +// Let the compiler's autovectorizer do what it thinks is best. +static void memcpy32_autovectorize(uint32_t* dst, const uint32_t* src, int count) { + while (count --> 0) { + *dst++ = *src++; + } +} +BENCH(memcpy32_autovectorize, 10) +BENCH(memcpy32_autovectorize, 100) +BENCH(memcpy32_autovectorize, 1000) +BENCH(memcpy32_autovectorize, 10000) +BENCH(memcpy32_autovectorize, 100000) + +#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 + +// Align dst to 16 bytes, then use aligned stores. src isn't algined, so use unaligned loads. +static void memcpy32_sse2_align(uint32_t* dst, const uint32_t* src, int count) { + if (count >= 16) { + while (uintptr_t(dst) & 0xF) { + *dst++ = *src++; + count--; + } + + __m128i* dst128 = reinterpret_cast<__m128i*>(dst); + const __m128i* src128 = reinterpret_cast<const __m128i*>(src); + dst += 16 * (count / 16); + src += 16 * (count / 16); + while (count >= 16) { + __m128i a = _mm_loadu_si128(src128++); + __m128i b = _mm_loadu_si128(src128++); + __m128i c = _mm_loadu_si128(src128++); + __m128i d = _mm_loadu_si128(src128++); + + _mm_store_si128(dst128++, a); + _mm_store_si128(dst128++, b); + _mm_store_si128(dst128++, c); + _mm_store_si128(dst128++, d); + + count -= 16; + } + } + + while (count --> 0) { + *dst++ = *src++; + } +} +BENCH(memcpy32_sse2_align, 10) +BENCH(memcpy32_sse2_align, 100) +BENCH(memcpy32_sse2_align, 1000) +BENCH(memcpy32_sse2_align, 10000) +BENCH(memcpy32_sse2_align, 100000) + +// Leave both dst and src unaliged, and so use unaligned stores for dst and unaligned loads for src. 
+static void memcpy32_sse2_unalign(uint32_t* dst, const uint32_t* src, int count) { + __m128i* dst128 = reinterpret_cast<__m128i*>(dst); + const __m128i* src128 = reinterpret_cast<const __m128i*>(src); + dst += 16 * (count / 16); + src += 16 * (count / 16); + while (count >= 16) { + __m128i a = _mm_loadu_si128(src128++); + __m128i b = _mm_loadu_si128(src128++); + __m128i c = _mm_loadu_si128(src128++); + __m128i d = _mm_loadu_si128(src128++); + + _mm_storeu_si128(dst128++, a); + _mm_storeu_si128(dst128++, b); + _mm_storeu_si128(dst128++, c); + _mm_storeu_si128(dst128++, d); + + count -= 16; + } + + while (count --> 0) { + *dst++ = *src++; + } +} +BENCH(memcpy32_sse2_unalign, 10) +BENCH(memcpy32_sse2_unalign, 100) +BENCH(memcpy32_sse2_unalign, 1000) +BENCH(memcpy32_sse2_unalign, 10000) +BENCH(memcpy32_sse2_unalign, 100000) + +// Test our chosen best, from SkUtils.h +BENCH(sk_memcpy32, 10) +BENCH(sk_memcpy32, 100) +BENCH(sk_memcpy32, 1000) +BENCH(sk_memcpy32, 10000) +BENCH(sk_memcpy32, 100000) + +#endif // SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 + +#undef BENCH |