summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMilian Wolff <milian.wolff@kdab.com>2017-03-05 13:44:19 +0100
committerMilian Wolff <milian.wolff@kdab.com>2017-03-29 14:53:23 +0000
commite2f9a94944b151889ecc473fdae38f1f129d9e13 (patch)
treeee8ea3819d96249211f98a54a6a56be99724233a
parent74e76e24a474b7f181df1475b6a66167c1d287ee (diff)
Remove overwritten elf mappings from the PerfElfMap
Now that we handle the elf map events in a time-ordered fashion, we can greatly optimize the map by removing outdated information. This happens extremely often in the real-world, when the heap map grows over time when the perf client application allocates a lot of memory. Note that this patch could potentially result in no mapping getting returned for buggy samples that violate the time order across FINISHED_ROUND events. This patch dramatically improves the performance of perfparser for real-world applications for me. Before, I measured the following numbers for perfparser: Performance counter stats for './lib/hotspot/libexec/hotspot-perfparser --input perf.data.heaptrack.normal --output /dev/null': 73063.166128 task-clock:u (msec) # 0.998 CPUs utilized 0 context-switches:u # 0.000 K/sec 0 cpu-migrations:u # 0.000 K/sec 29,923 page-faults:u # 0.410 K/sec 223,352,922,905 cycles:u # 3.057 GHz 138,541,646,788 instructions:u # 0.62 insn per cycle 39,581,056,564 branches:u # 541.737 M/sec 242,935,966 branch-misses:u # 0.61% of all branches 73.242094659 seconds time elapsed Performance counter stats for './lib/hotspot/libexec/hotspot-perfparser --input perf.data.heaptrack.stream --output /dev/null': 137772.664268 task-clock:u (msec) # 0.999 CPUs utilized 0 context-switches:u # 0.000 K/sec 0 cpu-migrations:u # 0.000 K/sec 28,393 page-faults:u # 0.206 K/sec 398,086,328,511 cycles:u # 2.889 GHz 164,041,587,534 instructions:u # 0.41 insn per cycle 51,009,555,879 branches:u # 370.244 M/sec 350,270,354 branch-misses:u # 0.69% of all branches 137.855666379 seconds time elapsed Now, this goes down to: Performance counter stats for './lib/hotspot/libexec/hotspot-perfparser --input perf.data.heaptrack.normal --output /dev/null': 9253.921384 task-clock:u (msec) # 0.999 CPUs utilized 0 context-switches:u # 0.000 K/sec 0 cpu-migrations:u # 0.000 K/sec 29,627 page-faults:u # 0.003 M/sec 28,099,608,231 cycles:u # 3.037 GHz 73,635,879,583 instructions:u # 2.62 insn per cycle 17,114,401,461 branches:u # 1849.422 M/sec 96,904,616 branch-misses:u # 0.57% of all branches 9.266196437 seconds time elapsed Performance counter stats for './lib/hotspot/libexec/hotspot-perfparser --input perf.data.heaptrack.stream --output /dev/null': 8331.098618 task-clock:u (msec) # 0.999 CPUs utilized 0 context-switches:u # 0.000 K/sec 0 cpu-migrations:u # 0.000 K/sec 26,379 page-faults:u # 0.003 M/sec 25,206,589,319 cycles:u # 3.026 GHz 65,041,985,552 instructions:u # 2.58 insn per cycle 15,106,429,469 branches:u # 1813.258 M/sec 82,649,273 branch-misses:u # 0.55% of all branches 8.343114295 seconds time elapsed Similarly, the benchmarks also dramatically improve. Before I measured: ********* Start testing of TestElfMap ********* PASS : TestElfMap::benchRegisterElfExpanding(10) RESULT : TestElfMap::benchRegisterElfExpanding():"10": 0.0018 msecs per iteration (total: 59, iterations: 32768) PASS : TestElfMap::benchRegisterElfExpanding(100) RESULT : TestElfMap::benchRegisterElfExpanding():"100": 0.024 msecs per iteration (total: 51, iterations: 2048) PASS : TestElfMap::benchRegisterElfExpanding(1000) RESULT : TestElfMap::benchRegisterElfExpanding():"1000": 1.3 msecs per iteration (total: 85, iterations: 64) PASS : TestElfMap::benchRegisterElfExpanding(2000) RESULT : TestElfMap::benchRegisterElfExpanding():"2000": 5.6 msecs per iteration (total: 91, iterations: 16) PASS : TestElfMap::benchFindElfDisjunct(10) RESULT : TestElfMap::benchFindElfDisjunct():"10": 0.0028 msecs per iteration (total: 92, iterations: 32768) PASS : TestElfMap::benchFindElfDisjunct(100) RESULT : TestElfMap::benchFindElfDisjunct():"100": 0.031 msecs per iteration (total: 64, iterations: 2048) PASS : TestElfMap::benchFindElfDisjunct(1000) RESULT : TestElfMap::benchFindElfDisjunct():"1000": 0.38 msecs per iteration (total: 98, iterations: 256) PASS : TestElfMap::benchFindElfDisjunct(2000) RESULT : TestElfMap::benchFindElfDisjunct():"2000": 0.789 msecs per iteration (total: 101, iterations: 128) PASS : TestElfMap::benchFindElfOverlapping(10) RESULT : TestElfMap::benchFindElfOverlapping():"10": 0.0029 msecs per iteration (total: 98, iterations: 32768) PASS : TestElfMap::benchFindElfOverlapping(100) RESULT : TestElfMap::benchFindElfOverlapping():"100": 0.035 msecs per iteration (total: 72, iterations: 2048) PASS : TestElfMap::benchFindElfOverlapping(1000) RESULT : TestElfMap::benchFindElfOverlapping():"1000": 0.40 msecs per iteration (total: 52, iterations: 128) PASS : TestElfMap::benchFindElfOverlapping(2000) RESULT : TestElfMap::benchFindElfOverlapping():"2000": 0.82 msecs per iteration (total: 53, iterations: 64) PASS : TestElfMap::benchFindElfExpanding(10) RESULT : TestElfMap::benchFindElfExpanding():"10": 0.0034 msecs per iteration (total: 57, iterations: 16384) PASS : TestElfMap::benchFindElfExpanding(100) RESULT : TestElfMap::benchFindElfExpanding():"100": 0.11 msecs per iteration (total: 59, iterations: 512) PASS : TestElfMap::benchFindElfExpanding(1000) RESULT : TestElfMap::benchFindElfExpanding():"1000": 10 msecs per iteration (total: 80, iterations: 8) PASS : TestElfMap::benchFindElfExpanding(2000) RESULT : TestElfMap::benchFindElfExpanding():"2000": 52.0 msecs per iteration (total: 104, iterations: 2) Totals: 33 passed, 0 failed, 0 skipped, 0 blacklisted, 7198ms ********* Finished testing of TestElfMap ********* Now, this goes down to: ********* Start testing of TestElfMap ********* Config: Using QtTest library 5.8.0, Qt 5.8.0 (x86_64-little_endian-lp64 shared (dynamic) release build; by GCC 6.3.1 20170109) PASS : TestElfMap::benchRegisterElfDisjunct(10) RESULT : TestElfMap::benchRegisterElfDisjunct():"10": 0.0016 msecs per iteration (total: 54, iterations: 32768) PASS : TestElfMap::benchRegisterElfDisjunct(100) RESULT : TestElfMap::benchRegisterElfDisjunct():"100": 0.018 msecs per iteration (total: 74, iterations: 4096) PASS : TestElfMap::benchRegisterElfDisjunct(1000) RESULT : TestElfMap::benchRegisterElfDisjunct():"1000": 0.53 msecs per iteration (total: 68, iterations: 128) PASS : TestElfMap::benchRegisterElfDisjunct(2000) RESULT : TestElfMap::benchRegisterElfDisjunct():"2000": 1.9 msecs per iteration (total: 62, iterations: 32) PASS : TestElfMap::benchRegisterElfOverlapping(10) RESULT : TestElfMap::benchRegisterElfOverlapping():"10": 0.0023 msecs per iteration (total: 76, iterations: 32768) PASS : TestElfMap::benchRegisterElfOverlapping(100) RESULT : TestElfMap::benchRegisterElfOverlapping():"100": 0.025 msecs per iteration (total: 52, iterations: 2048) PASS : TestElfMap::benchRegisterElfOverlapping(1000) RESULT : TestElfMap::benchRegisterElfOverlapping():"1000": 0.59 msecs per iteration (total: 76, iterations: 128) PASS : TestElfMap::benchRegisterElfOverlapping(2000) RESULT : TestElfMap::benchRegisterElfOverlapping():"2000": 2.0 msecs per iteration (total: 66, iterations: 32) PASS : TestElfMap::benchRegisterElfExpanding(10) RESULT : TestElfMap::benchRegisterElfExpanding():"10": 0.0015 msecs per iteration (total: 52, iterations: 32768) PASS : TestElfMap::benchRegisterElfExpanding(100) RESULT : TestElfMap::benchRegisterElfExpanding():"100": 0.015 msecs per iteration (total: 65, iterations: 4096) PASS : TestElfMap::benchRegisterElfExpanding(1000) RESULT : TestElfMap::benchRegisterElfExpanding():"1000": 0.15 msecs per iteration (total: 81, iterations: 512) PASS : TestElfMap::benchRegisterElfExpanding(2000) RESULT : TestElfMap::benchRegisterElfExpanding():"2000": 0.31 msecs per iteration (total: 81, iterations: 256) PASS : TestElfMap::benchFindElfDisjunct(10) RESULT : TestElfMap::benchFindElfDisjunct():"10": 0.0028 msecs per iteration (total: 93, iterations: 32768) PASS : TestElfMap::benchFindElfDisjunct(100) RESULT : TestElfMap::benchFindElfDisjunct():"100": 0.031 msecs per iteration (total: 65, iterations: 2048) PASS : TestElfMap::benchFindElfDisjunct(1000) RESULT : TestElfMap::benchFindElfDisjunct():"1000": 0.38 msecs per iteration (total: 99, iterations: 256) PASS : TestElfMap::benchFindElfDisjunct(2000) RESULT : TestElfMap::benchFindElfDisjunct():"2000": 0.79 msecs per iteration (total: 51, iterations: 64) PASS : TestElfMap::benchFindElfOverlapping(10) RESULT : TestElfMap::benchFindElfOverlapping():"10": 0.0028 msecs per iteration (total: 93, iterations: 32768) PASS : TestElfMap::benchFindElfOverlapping(100) RESULT : TestElfMap::benchFindElfOverlapping():"100": 0.031 msecs per iteration (total: 64, iterations: 2048) PASS : TestElfMap::benchFindElfOverlapping(1000) RESULT : TestElfMap::benchFindElfOverlapping():"1000": 0.39 msecs per iteration (total: 51, iterations: 128) PASS : TestElfMap::benchFindElfOverlapping(2000) RESULT : TestElfMap::benchFindElfOverlapping():"2000": 0.79 msecs per iteration (total: 51, iterations: 64) PASS : TestElfMap::benchFindElfExpanding(10) RESULT : TestElfMap::benchFindElfExpanding():"10": 0.0032 msecs per iteration (total: 53, iterations: 16384) PASS : TestElfMap::benchFindElfExpanding(100) RESULT : TestElfMap::benchFindElfExpanding():"100": 0.032 msecs per iteration (total: 67, iterations: 2048) PASS : TestElfMap::benchFindElfExpanding(1000) RESULT : TestElfMap::benchFindElfExpanding():"1000": 0.32 msecs per iteration (total: 84, iterations: 256) PASS : TestElfMap::benchFindElfExpanding(2000) RESULT : TestElfMap::benchFindElfExpanding():"2000": 0.65 msecs per iteration (total: 84, iterations: 128) Totals: 33 passed, 0 failed, 0 skipped, 0 blacklisted, 6747ms ********* Finished testing of TestElfMap ********* Change-Id: I6eaca5d6561dcdb0cee0d3aed4eec8f0f6c9c9a3 Reviewed-by: Ulf Hermann <ulf.hermann@qt.io>
-rw-r--r--app/perfelfmap.cpp38
-rw-r--r--app/perfelfmap.h9
-rw-r--r--app/perfsymboltable.cpp12
-rw-r--r--app/perfsymboltable.h1
-rw-r--r--tests/auto/elfmap/tst_elfmap.cpp40
5 files changed, 48 insertions, 52 deletions
diff --git a/app/perfelfmap.cpp b/app/perfelfmap.cpp
index be7bad7..e891fa0 100644
--- a/app/perfelfmap.cpp
+++ b/app/perfelfmap.cpp
@@ -32,7 +32,7 @@ QDebug operator<<(QDebug stream, const PerfElfMap::ElfInfo& info)
<< "len=" << info.length << ", "
<< "pgoff=" << info.pgoff << ", "
<< "timeAdded=" << info.timeAdded << ", "
- << "timeOverwritten=" << info.timeOverwritten << "}";
+ << "}";
return stream.space();
}
@@ -55,32 +55,30 @@ bool PerfElfMap::registerElf(const quint64 addr, const quint64 len, quint64 pgof
const quint64 time, const QFileInfo &fullPath)
{
bool cacheInvalid = false;
- quint64 overwritten = std::numeric_limits<quint64>::max();
const quint64 addrEnd = addr + len;
const bool isFile = fullPath.isFile();
QVarLengthArray<ElfInfo, 8> newElfs;
+ QVarLengthArray<int, 8> removedElfs;
for (auto i = m_elfs.begin(), end = m_elfs.end(); i != end && i->addr < addrEnd; ++i) {
const quint64 iEnd = i->addr + i->length;
if (iEnd <= addr)
continue;
- if (i->timeOverwritten > time) {
- // Newly added elf overwrites existing one. Mark the existing one as overwritten and
- // reinsert any fragments of it that remain.
-
- if (i->addr < addr) {
- newElfs.push_back(ElfInfo(i->file, i->addr, addr - i->addr, i->pgoff,
- time, i->timeOverwritten));
- }
- if (iEnd > addrEnd) {
- newElfs.push_back(ElfInfo(i->file, addrEnd, iEnd - addrEnd,
- i->pgoff + addrEnd - i->addr, time,
- i->timeOverwritten));
- }
- i->timeOverwritten = time;
+ // Newly added elf overwrites existing one. Mark the existing one as overwritten and
+ // reinsert any fragments of it that remain.
+
+ if (i->addr < addr) {
+ newElfs.push_back(ElfInfo(i->file, i->addr, addr - i->addr,
+ i->pgoff, time));
+ }
+ if (iEnd > addrEnd) {
+ newElfs.push_back(ElfInfo(i->file, addrEnd, iEnd - addrEnd,
+ i->pgoff + addrEnd - i->addr, time));
}
+ removedElfs.push_back(std::distance(m_elfs.begin(), i));
+
// Overlapping module. Clear the cache, but only when the section is actually backed by a
// file. Otherwise, we will see tons of overlapping heap/anon sections which don't actually
// invalidate our caches
@@ -88,7 +86,11 @@ bool PerfElfMap::registerElf(const quint64 addr, const quint64 len, quint64 pgof
cacheInvalid = true;
}
- newElfs.push_back(ElfInfo(fullPath, addr, len, pgoff, time, overwritten));
+ // remove the overwritten elfs, iterate from the back to not invalidate the indices
+ for (auto it = removedElfs.rbegin(), end = removedElfs.rend(); it != end; ++it)
+ m_elfs.remove(*it);
+
+ newElfs.push_back(ElfInfo(fullPath, addr, len, pgoff, time));
for (const auto &elf : newElfs) {
auto it = std::lower_bound(m_elfs.begin(), m_elfs.end(),
@@ -110,7 +112,7 @@ PerfElfMap::ElfInfo PerfElfMap::findElf(quint64 ip, quint64 timestamp) const
}
while (true) {
- if (i->timeAdded <= timestamp && i->timeOverwritten > timestamp)
+ if (i->timeAdded <= timestamp)
return (i->addr + i->length > ip) ? *i : ElfInfo();
if (i == m_elfs.constBegin())
diff --git a/app/perfelfmap.h b/app/perfelfmap.h
index 88923d5..9e347b9 100644
--- a/app/perfelfmap.h
+++ b/app/perfelfmap.h
@@ -29,10 +29,9 @@ class PerfElfMap
public:
struct ElfInfo {
ElfInfo(const QFileInfo &file = QFileInfo(), quint64 addr = 0, quint64 length = 0,
- quint64 pgoff = 0, quint64 timeAdded = 0,
- quint64 timeOverwritten = std::numeric_limits<quint64>::max()) :
+ quint64 pgoff = 0, quint64 timeAdded = 0) :
file(file), addr(addr), length(length), pgoff(pgoff), timeAdded(timeAdded),
- timeOverwritten(timeOverwritten), found(file.isFile()) {}
+ found(file.isFile()) {}
bool isValid() const
{
@@ -46,8 +45,7 @@ public:
&& addr == rhs.addr
&& length == rhs.length
&& pgoff == rhs.pgoff
- && timeAdded == rhs.timeAdded
- && timeOverwritten == rhs.timeOverwritten;
+ && timeAdded == rhs.timeAdded;
}
QFileInfo file;
@@ -55,7 +53,6 @@ public:
quint64 length;
quint64 pgoff;
quint64 timeAdded;
- quint64 timeOverwritten;
bool found;
};
diff --git a/app/perfsymboltable.cpp b/app/perfsymboltable.cpp
index 48d6613..5c198ba 100644
--- a/app/perfsymboltable.cpp
+++ b/app/perfsymboltable.cpp
@@ -35,8 +35,7 @@
PerfSymbolTable::PerfSymbolTable(quint32 pid, Dwfl_Callbacks *callbacks, PerfUnwind *parent) :
m_perfMapFile(QString::fromLatin1("/tmp/perf-%1.map").arg(pid)),
- m_unwind(parent), m_lastMmapAddedTime(0),
- m_nextMmapOverwrittenTime(std::numeric_limits<quint64>::max()), m_callbacks(callbacks),
+ m_unwind(parent), m_lastMmapAddedTime(0), m_callbacks(callbacks),
m_pid(pid)
{
m_dwfl = dwfl_begin(m_callbacks);
@@ -364,8 +363,6 @@ Dwfl_Module *PerfSymbolTable::reportElf(const PerfElfMap::ElfInfo& info)
if (m_lastMmapAddedTime < info.timeAdded)
m_lastMmapAddedTime = info.timeAdded;
- if (m_nextMmapOverwrittenTime > info.timeOverwritten)
- m_nextMmapOverwrittenTime = info.timeOverwritten;
return ret;
}
@@ -379,7 +376,6 @@ int PerfSymbolTable::lookupFrame(Dwarf_Addr ip, quint64 timestamp, bool isKernel
bool *isInterworking)
{
Q_ASSERT(timestamp >= m_lastMmapAddedTime);
- Q_ASSERT(timestamp < m_nextMmapOverwrittenTime);
auto it = m_addressCache.constFind(ip);
if (it != m_addressCache.constEnd()) {
@@ -544,20 +540,19 @@ bool PerfSymbolTable::containsAddress(quint64 address) const
Dwfl *PerfSymbolTable::attachDwfl(quint64 timestamp, void *arg)
{
- if (timestamp < m_lastMmapAddedTime || timestamp >= m_nextMmapOverwrittenTime)
+ if (timestamp < m_lastMmapAddedTime)
clearCache();
else if (static_cast<pid_t>(m_pid) == dwfl_pid(m_dwfl))
return m_dwfl; // Already attached, nothing to do
// Report some random elf, so that dwfl guesses the target architecture.
for (const auto &elf : m_elfs) {
- if (!elf.found || elf.timeAdded > timestamp || elf.timeOverwritten <= timestamp)
+ if (!elf.found || elf.timeAdded > timestamp)
continue;
if (dwfl_report_elf(m_dwfl, elf.file.fileName().toLocal8Bit().constData(),
elf.file.absoluteFilePath().toLocal8Bit().constData(), -1,
elf.addr, false)) {
m_lastMmapAddedTime = elf.timeAdded;
- m_nextMmapOverwrittenTime = elf.timeOverwritten;
break;
}
}
@@ -573,7 +568,6 @@ Dwfl *PerfSymbolTable::attachDwfl(quint64 timestamp, void *arg)
void PerfSymbolTable::clearCache()
{
m_lastMmapAddedTime = 0;
- m_nextMmapOverwrittenTime = std::numeric_limits<quint64>::max();
m_addressCache.clear();
m_perfMap.clear();
if (m_perfMapFile.isOpen())
diff --git a/app/perfsymboltable.h b/app/perfsymboltable.h
index c60852e..8dff8e2 100644
--- a/app/perfsymboltable.h
+++ b/app/perfsymboltable.h
@@ -87,7 +87,6 @@ private:
Dwfl *m_dwfl;
quint64 m_lastMmapAddedTime;
- quint64 m_nextMmapOverwrittenTime;
PerfElfMap m_elfs;
Dwfl_Callbacks *m_callbacks;
diff --git a/tests/auto/elfmap/tst_elfmap.cpp b/tests/auto/elfmap/tst_elfmap.cpp
index 534dd78..44df515 100644
--- a/tests/auto/elfmap/tst_elfmap.cpp
+++ b/tests/auto/elfmap/tst_elfmap.cpp
@@ -108,30 +108,34 @@ private slots:
QFileInfo file2(tmpFile2.fileName());
QCOMPARE(file2.isFile(), secondIsFile);
- const PerfElfMap::ElfInfo first(file1, 95, 20, 0, 0, 1);
- const PerfElfMap::ElfInfo second(file1, 105, 20, 0, 1, 2);
- const PerfElfMap::ElfInfo third(file2, 100, 20, 0, 2);
-
PerfElfMap map;
- QCOMPARE(registerElf(&map, first), false);
- QCOMPARE(registerElf(&map, second), firstIsFile);
- QCOMPARE(registerElf(&map, third), firstIsFile || secondIsFile);
-
- QCOMPARE(map.findElf(110, 0), first);
- QCOMPARE(map.findElf(110, 1), second);
+ {
+ const PerfElfMap::ElfInfo first(file1, 95, 20, 0, 0);
+ QCOMPARE(registerElf(&map, first), false);
+ QCOMPARE(map.findElf(110, 0), first);
+ }
- QCOMPARE(map.findElf(110, 2), third);
- QCOMPARE(map.findElf(110, 3), third);
+ {
+ const PerfElfMap::ElfInfo second(file1, 105, 20, 0, 1);
+ QCOMPARE(registerElf(&map, second), firstIsFile);
+ QCOMPARE(map.findElf(110, 1), second);
- const PerfElfMap::ElfInfo fragment1(file1, 95, 10, 0, 1, 2);
- QCOMPARE(map.findElf(97, 1), fragment1);
+ const PerfElfMap::ElfInfo fragment1(file1, 95, 10, 0, 1);
+ QCOMPARE(map.findElf(97, 1), fragment1);
+ }
- const PerfElfMap::ElfInfo fragment2(file1, 120, 5, 15, 2);
- QCOMPARE(map.findElf(122, 2), fragment2);
+ {
+ const PerfElfMap::ElfInfo third(file2, 100, 20, 0, 2);
+ QCOMPARE(registerElf(&map, third), firstIsFile || secondIsFile);
+ QCOMPARE(map.findElf(110, 2), third);
+ QCOMPARE(map.findElf(110, 3), third);
- const PerfElfMap::ElfInfo fragment3(file1, 95, 5, 0, 2);
- QCOMPARE(map.findElf(97, 2), fragment3);
+ const PerfElfMap::ElfInfo fragment2(file1, 120, 5, 15, 2);
+ const PerfElfMap::ElfInfo fragment3(file1, 95, 5, 0, 2);
+ QCOMPARE(map.findElf(122, 2), fragment2);
+ QCOMPARE(map.findElf(97, 2), fragment3);
+ }
}
void testOverwrite_data()