Diffstat (limited to 'chromium/third_party/skia/bench/MemcpyBench.cpp')
-rw-r--r--  chromium/third_party/skia/bench/MemcpyBench.cpp | 160
1 file changed, 160 insertions(+), 0 deletions(-)
diff --git a/chromium/third_party/skia/bench/MemcpyBench.cpp b/chromium/third_party/skia/bench/MemcpyBench.cpp
new file mode 100644
index 00000000000..f5501927519
--- /dev/null
+++ b/chromium/third_party/skia/bench/MemcpyBench.cpp
@@ -0,0 +1,160 @@
+/*
+ * Copyright 2014 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "Benchmark.h"
+#include "SkRandom.h"
+#include "SkTemplates.h"
+#include "SkUtils.h"
+
+template <typename Memcpy32>
+class Memcpy32Bench : public Benchmark {
+public:
+ explicit Memcpy32Bench(int count, Memcpy32 memcpy32, const char* name)
+ : fCount(count)
+ , fMemcpy32(memcpy32)
+ , fName(SkStringPrintf("%s_%d", name, count)) {}
+
+ virtual const char* onGetName() SK_OVERRIDE {
+ return fName.c_str();
+ }
+
+ virtual bool isSuitableFor(Backend backend) SK_OVERRIDE {
+ return backend == kNonRendering_Backend;
+ }
+
+ virtual void onPreDraw() SK_OVERRIDE {
+ fDst.reset(fCount);
+ fSrc.reset(fCount);
+
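+        // Fill src with random values; the copy doesn't depend on the data, and dst is simply overwritten.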
+ SkRandom rand;
+ for (int i = 0; i < fCount; i++) {
+ fSrc[i] = rand.nextU();
+ }
+ }
+
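+    // Each timed iteration copies all fCount values once; the bench harness picks loops.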
+ virtual void onDraw(const int loops, SkCanvas*) SK_OVERRIDE {
+ for (int i = 0; i < loops; i++) {
+ fMemcpy32(fDst, fSrc, fCount);
+ }
+ }
+
+private:
+ SkAutoTMalloc<uint32_t> fDst, fSrc;
+
+ int fCount;
+ Memcpy32 fMemcpy32;
+ const SkString fName;
+};
+
+template <typename Memcpy32>
+static Memcpy32Bench<Memcpy32>* Bench(int count, Memcpy32 memcpy32, const char* name) {
+ return new Memcpy32Bench<Memcpy32>(count, memcpy32, name);
+}
+#define BENCH(memcpy32, count) DEF_BENCH(return Bench(count, memcpy32, #memcpy32); )
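+// For example, BENCH(memcpy32_memcpy, 1000) registers a benchmark that reports
+// as "memcpy32_memcpy_1000": the stringized function name plus the element count.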
+
+
+// Let the libc developers do what they think is best.
+static void memcpy32_memcpy(uint32_t* dst, const uint32_t* src, int count) {
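+    // memcpy counts bytes, so scale the element count by sizeof(uint32_t).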
+ memcpy(dst, src, sizeof(uint32_t) * count);
+}
+BENCH(memcpy32_memcpy, 10)
+BENCH(memcpy32_memcpy, 100)
+BENCH(memcpy32_memcpy, 1000)
+BENCH(memcpy32_memcpy, 10000)
+BENCH(memcpy32_memcpy, 100000)
+
+// Let the compiler's autovectorizer do what it thinks is best.
+static void memcpy32_autovectorize(uint32_t* dst, const uint32_t* src, int count) {
+    while (count --> 0) {  // the "goes to" idiom: equivalent to while (count-- > 0)
+ *dst++ = *src++;
+ }
+}
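+// (With SSE2 available, a vectorizing compiler is expected to turn this loop into
+// 16-byte loads and stores, much like memcpy32_sse2_unalign below; comparing the
+// two is the point of this benchmark.)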
+BENCH(memcpy32_autovectorize, 10)
+BENCH(memcpy32_autovectorize, 100)
+BENCH(memcpy32_autovectorize, 1000)
+BENCH(memcpy32_autovectorize, 10000)
+BENCH(memcpy32_autovectorize, 100000)
+
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
+
+#include <emmintrin.h>  // SSE2 intrinsics: _mm_loadu_si128, _mm_store_si128, etc.
+
+// Align dst to 16 bytes, then use aligned stores. src isn't aligned, so use unaligned loads.
+static void memcpy32_sse2_align(uint32_t* dst, const uint32_t* src, int count) {
+ if (count >= 16) {
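+        // dst is at least 4-byte aligned (it's a uint32_t*), so at most three
+        // scalar copies are needed to reach a 16-byte boundary.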
+ while (uintptr_t(dst) & 0xF) {
+ *dst++ = *src++;
+ count--;
+ }
+
+ __m128i* dst128 = reinterpret_cast<__m128i*>(dst);
+ const __m128i* src128 = reinterpret_cast<const __m128i*>(src);
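+        // Bump dst/src past the span the vector loop will copy (16 uint32_ts,
+        // i.e. 64 bytes, per iteration) so the scalar tail loop below resumes
+        // at the right place.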
+ dst += 16 * (count / 16);
+ src += 16 * (count / 16);
+ while (count >= 16) {
+ __m128i a = _mm_loadu_si128(src128++);
+ __m128i b = _mm_loadu_si128(src128++);
+ __m128i c = _mm_loadu_si128(src128++);
+ __m128i d = _mm_loadu_si128(src128++);
+
+ _mm_store_si128(dst128++, a);
+ _mm_store_si128(dst128++, b);
+ _mm_store_si128(dst128++, c);
+ _mm_store_si128(dst128++, d);
+
+ count -= 16;
+ }
+ }
+
+ while (count --> 0) {
+ *dst++ = *src++;
+ }
+}
+BENCH(memcpy32_sse2_align, 10)
+BENCH(memcpy32_sse2_align, 100)
+BENCH(memcpy32_sse2_align, 1000)
+BENCH(memcpy32_sse2_align, 10000)
+BENCH(memcpy32_sse2_align, 100000)
+
+// Leave both dst and src unaligned: the same 64-byte inner loop as above, but with unaligned stores as well as unaligned loads.
+static void memcpy32_sse2_unalign(uint32_t* dst, const uint32_t* src, int count) {
+ __m128i* dst128 = reinterpret_cast<__m128i*>(dst);
+ const __m128i* src128 = reinterpret_cast<const __m128i*>(src);
+ dst += 16 * (count / 16);
+ src += 16 * (count / 16);
+ while (count >= 16) {
+ __m128i a = _mm_loadu_si128(src128++);
+ __m128i b = _mm_loadu_si128(src128++);
+ __m128i c = _mm_loadu_si128(src128++);
+ __m128i d = _mm_loadu_si128(src128++);
+
+ _mm_storeu_si128(dst128++, a);
+ _mm_storeu_si128(dst128++, b);
+ _mm_storeu_si128(dst128++, c);
+ _mm_storeu_si128(dst128++, d);
+
+ count -= 16;
+ }
+
+ while (count --> 0) {
+ *dst++ = *src++;
+ }
+}
+BENCH(memcpy32_sse2_unalign, 10)
+BENCH(memcpy32_sse2_unalign, 100)
+BENCH(memcpy32_sse2_unalign, 1000)
+BENCH(memcpy32_sse2_unalign, 10000)
+BENCH(memcpy32_sse2_unalign, 100000)
+
+// Test our chosen best, from SkUtils.h
+BENCH(sk_memcpy32, 10)
+BENCH(sk_memcpy32, 100)
+BENCH(sk_memcpy32, 1000)
+BENCH(sk_memcpy32, 10000)
+BENCH(sk_memcpy32, 100000)
+
+#endif // SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
+
+#undef BENCH