summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Milian Wolff <milian.wolff@kdab.com> 2020-01-06 09:24:34 +0100
committer Milian Wolff <milian.wolff@kdab.com> 2020-01-09 19:24:03 +0000
commit 807cccbb89d58da774c73e9bc3c1bddc6e8e653d (patch)
tree 48051cc5b66a22f7c58ffd66252e4dee517c78ea
parent 37dce9049204ea8ed420686cb407aad0be3d7950 (diff)
Share per-DSO address cache across processes
When we profile a multi-process ensemble, it will often happen that we
encounter samples at the same relative address of a DSO in several
processes. In such cases, we can leverage a central cache to store the
information, instead of recomputing the same data for every process.

As an example, I wrote a shell script that runs the same process four
times in parallel. When I parse the resulting perf.data file, the
perf stat results are as follows:

before:
```
Performance counter stats for '/home/milian/projects/compiled/other/lib/libexec/hotspot-perfparser --input ./perf.data --output /dev/null':

4.240,50 msec task-clock:u # 0,956 CPUs utilized
0 context-switches:u # 0,000 K/sec
0 cpu-migrations:u # 0,000 K/sec
17.389 page-faults:u # 0,004 M/sec
11.195.771.907 cycles:u # 2,640 GHz
26.585.168.652 instructions:u # 2,37 insn per cycle
6.234.491.027 branches:u # 1470,227 M/sec
35.149.387 branch-misses:u # 0,56% of all branches

4,435152034 seconds time elapsed

3,732758000 seconds user
0,490148000 seconds sys
```

after:
```
Performance counter stats for '/home/milian/projects/compiled/other/lib/libexec/hotspot-perfparser --input ./perf.data --output /dev/null':

4.160,90 msec task-clock:u # 0,979 CPUs utilized
0 context-switches:u # 0,000 K/sec
0 cpu-migrations:u # 0,000 K/sec
15.476 page-faults:u # 0,004 M/sec
10.635.798.451 cycles:u # 2,556 GHz
16.616.035.720 instructions:u # 1,56 insn per cycle
3.838.148.777 branches:u # 922,433 M/sec
24.902.558 branch-misses:u # 0,65% of all branches

4,249408917 seconds time elapsed

3,612442000 seconds user
0,533933000 seconds sys
```

Note that the overall elapsed time doesn't change that much here, but
the number of instructions required is massively reduced. I bet there
are other situations where this patch will bring a more tangible
improvement to the overall time requirement.

Change-Id: I4531ec648af40dd44b9e4290fab7bbd2a89609da
Reviewed-by: Ulf Hermann <ulf.hermann@qt.io>
-rw-r--r-- app/perfaddresscache.cpp 26
-rw-r--r-- app/perfaddresscache.h 9
-rw-r--r-- app/perfsymboltable.cpp 6
-rw-r--r-- app/perfsymboltable.h 2
-rw-r--r-- app/perfunwind.h 3
-rw-r--r-- tests/auto/addresscache/tst_addresscache.cpp 17
6 files changed, 36 insertions, 27 deletions
diff --git a/app/perfaddresscache.cpp b/app/perfaddresscache.cpp
index 7f440b1..2bf05c0 100644
--- a/app/perfaddresscache.cpp
+++ b/app/perfaddresscache.cpp
@@ -22,28 +22,28 @@
namespace {
quint64 relativeAddress(const PerfElfMap::ElfInfo& elf, quint64 addr)
{
- if (!elf.isValid())
- return addr;
-
+ Q_ASSERT(elf.isValid());
Q_ASSERT(elf.addr <= addr);
Q_ASSERT((elf.addr + elf.length) > addr);
return addr - elf.addr;
}
}
-PerfAddressCache::AddressCacheEntry PerfAddressCache::find(const PerfElfMap::ElfInfo& elf,
- quint64 addr) const
+PerfAddressCache::AddressCacheEntry PerfAddressCache::find(const PerfElfMap::ElfInfo& elf, quint64 addr,
+ OffsetAddressCache *invalidAddressCache) const
{
- return m_cache.value(elf.originalPath).value(relativeAddress(elf, addr));
+ if (elf.isValid())
+ return m_cache.value(elf.originalPath).value(relativeAddress(elf, addr));
+ else
+ return invalidAddressCache->value(addr);
}
void PerfAddressCache::cache(const PerfElfMap::ElfInfo& elf, quint64 addr,
- const PerfAddressCache::AddressCacheEntry& entry)
-{
- m_cache[elf.originalPath][relativeAddress(elf, addr)] = entry;
-}
-
-void PerfAddressCache::clearInvalid()
+ const PerfAddressCache::AddressCacheEntry& entry,
+ OffsetAddressCache *invalidAddressCache)
{
- m_cache[{}].clear();
+ if (elf.isValid())
+ m_cache[elf.originalPath][relativeAddress(elf, addr)] = entry;
+ else
+ (*invalidAddressCache)[addr] = entry;
}
diff --git a/app/perfaddresscache.h b/app/perfaddresscache.h
index 92751f9..bfbb06e 100644
--- a/app/perfaddresscache.h
+++ b/app/perfaddresscache.h
@@ -36,13 +36,14 @@ public:
int locationId;
bool isInterworking;
};
+ using OffsetAddressCache = QHash<quint64, AddressCacheEntry>;
- AddressCacheEntry find(const PerfElfMap::ElfInfo& elf, quint64 addr) const;
+ AddressCacheEntry find(const PerfElfMap::ElfInfo& elf, quint64 addr,
+ OffsetAddressCache *invalidAddressCache) const;
void cache(const PerfElfMap::ElfInfo& elf, quint64 addr,
- const AddressCacheEntry& entry);
- void clearInvalid();
+ const AddressCacheEntry& entry, OffsetAddressCache *invalidAddressCache);
private:
- QHash<QByteArray, QHash<quint64, AddressCacheEntry>> m_cache;
+ QHash<QByteArray, OffsetAddressCache> m_cache;
};
#endif
diff --git a/app/perfsymboltable.cpp b/app/perfsymboltable.cpp
index 2482dd4..f260fd3 100644
--- a/app/perfsymboltable.cpp
+++ b/app/perfsymboltable.cpp
@@ -895,7 +895,7 @@ int PerfSymbolTable::lookupFrame(Dwarf_Addr ip, bool isKernel,
bool *isInterworking)
{
const auto& elf = findElf(ip);
- auto cached = m_addressCache.find(elf, ip);
+ auto cached = m_unwind->addressCache()->find(elf, ip, &m_invalidAddressCache);
if (cached.isValid()) {
*isInterworking = cached.isInterworking;
return cached.locationId;
@@ -1023,7 +1023,7 @@ int PerfSymbolTable::lookupFrame(Dwarf_Addr ip, bool isKernel,
int locationId = m_unwind->resolveLocation(addressLocation);
*isInterworking = (symname == "$a" || symname == "$t");
- m_addressCache.cache(elf, ip, {locationId, *isInterworking});
+ m_unwind->addressCache()->cache(elf, ip, {locationId, *isInterworking}, &m_invalidAddressCache);
return locationId;
}
@@ -1095,7 +1095,7 @@ Dwfl *PerfSymbolTable::attachDwfl(void *arg)
void PerfSymbolTable::clearCache()
{
- m_addressCache.clearInvalid();
+ m_invalidAddressCache.clear();
m_cuDieRanges.clear();
m_perfMap.clear();
if (m_perfMapFile.isOpen())
diff --git a/app/perfsymboltable.h b/app/perfsymboltable.h
index 9687763..d41b683 100644
--- a/app/perfsymboltable.h
+++ b/app/perfsymboltable.h
@@ -110,7 +110,7 @@ private:
ElfAndFile m_firstElf;
PerfElfMap m_elfs;
- PerfAddressCache m_addressCache;
+ PerfAddressCache::OffsetAddressCache m_invalidAddressCache;
QHash<Dwfl_Module*, CuDieRanges> m_cuDieRanges;
Dwfl_Callbacks *m_callbacks;
qint32 m_pid;
diff --git a/app/perfunwind.h b/app/perfunwind.h
index a400c80..cb4b9ed 100644
--- a/app/perfunwind.h
+++ b/app/perfunwind.h
@@ -24,6 +24,7 @@
#include "perfkallsyms.h"
#include "perfregisterinfo.h"
#include "perftracingdata.h"
+#include "perfaddresscache.h"
#include <libdwfl.h>
@@ -208,6 +209,7 @@ public:
void resolveSymbol(int locationId, const Symbol &symbol);
PerfKallsymEntry findKallsymEntry(quint64 address);
+ PerfAddressCache *addressCache() { return &m_addressCache; }
enum ErrorCode {
TimeOrderViolation = 1,
@@ -288,6 +290,7 @@ private:
QList<TaskEvent> m_taskEventsBuffer;
QHash<qint32, PerfSymbolTable *> m_symbolTables;
PerfKallsyms m_kallsyms;
+ PerfAddressCache m_addressCache;
PerfTracingData m_tracingData;
QHash<QByteArray, qint32> m_strings;
diff --git a/tests/auto/addresscache/tst_addresscache.cpp b/tests/auto/addresscache/tst_addresscache.cpp
index 2086410..604274e 100644
--- a/tests/auto/addresscache/tst_addresscache.cpp
+++ b/tests/auto/addresscache/tst_addresscache.cpp
@@ -37,24 +37,29 @@ private slots:
info_b.addr = 0x200;
PerfAddressCache cache;
+ PerfAddressCache::OffsetAddressCache invalidAddressCache;
PerfAddressCache::AddressCacheEntry entry{42, true};
- cache.cache(info_a, 0x110, entry);
- QCOMPARE(cache.find(info_a, 0x110).locationId, entry.locationId);
- QCOMPARE(cache.find(info_b, 0x210).locationId, entry.locationId);
+ cache.cache(info_a, 0x110, entry, &invalidAddressCache);
+ QCOMPARE(cache.find(info_a, 0x110, &invalidAddressCache).locationId, entry.locationId);
+ QCOMPARE(cache.find(info_b, 0x210, &invalidAddressCache).locationId, entry.locationId);
}
void testInvalid()
{
PerfAddressCache cache;
+ PerfAddressCache::OffsetAddressCache invalidAddressCache_a;
+ PerfAddressCache::OffsetAddressCache invalidAddressCache_b;
PerfAddressCache::AddressCacheEntry entry{42, true};
- cache.cache(PerfElfMap::ElfInfo{}, 0x110, entry);
- QCOMPARE(cache.find(PerfElfMap::ElfInfo{}, 0x110).locationId, entry.locationId);
+ cache.cache(PerfElfMap::ElfInfo{}, 0x110, entry, &invalidAddressCache_a);
+ QCOMPARE(cache.find(PerfElfMap::ElfInfo{}, 0x110, &invalidAddressCache_a).locationId, entry.locationId);
+ QCOMPARE(cache.find(PerfElfMap::ElfInfo{}, 0x110, &invalidAddressCache_b).locationId, -1);
}
void testEmpty()
{
PerfAddressCache cache;
- QCOMPARE(cache.find(PerfElfMap::ElfInfo{}, 0x123).locationId, -1);
+ PerfAddressCache::OffsetAddressCache invalidAddressCache;
+ QCOMPARE(cache.find(PerfElfMap::ElfInfo{}, 0x123, &invalidAddressCache).locationId, -1);
}
};