From e2f9a94944b151889ecc473fdae38f1f129d9e13 Mon Sep 17 00:00:00 2001 From: Milian Wolff Date: Sun, 5 Mar 2017 13:44:19 +0100 Subject: Remove overwritten elf mappings from the PerfElfMap Now that we handle the elf map events in a time-ordered fashion, we can greatly optimize the map by removing outdated information. This happens extremely often in the real-world, when the heap map grows over time when the perf client application allocates a lot of memory. Note that this patch could potentially result in no mapping getting returned for buggy samples that violate the time order across FINISHED_ROUND events. This patch dramatically improves the performance of perfparser for real-world applications for me. Before, I measured the following numbers for perfparser: Performance counter stats for './lib/hotspot/libexec/hotspot-perfparser --input perf.data.heaptrack.normal --output /dev/null': 73063.166128 task-clock:u (msec) # 0.998 CPUs utilized 0 context-switches:u # 0.000 K/sec 0 cpu-migrations:u # 0.000 K/sec 29,923 page-faults:u # 0.410 K/sec 223,352,922,905 cycles:u # 3.057 GHz 138,541,646,788 instructions:u # 0.62 insn per cycle 39,581,056,564 branches:u # 541.737 M/sec 242,935,966 branch-misses:u # 0.61% of all branches 73.242094659 seconds time elapsed Performance counter stats for './lib/hotspot/libexec/hotspot-perfparser --input perf.data.heaptrack.stream --output /dev/null': 137772.664268 task-clock:u (msec) # 0.999 CPUs utilized 0 context-switches:u # 0.000 K/sec 0 cpu-migrations:u # 0.000 K/sec 28,393 page-faults:u # 0.206 K/sec 398,086,328,511 cycles:u # 2.889 GHz 164,041,587,534 instructions:u # 0.41 insn per cycle 51,009,555,879 branches:u # 370.244 M/sec 350,270,354 branch-misses:u # 0.69% of all branches 137.855666379 seconds time elapsed Now, this goes down to: Performance counter stats for './lib/hotspot/libexec/hotspot-perfparser --input perf.data.heaptrack.normal --output /dev/null': 9253.921384 task-clock:u (msec) # 0.999 CPUs utilized 0 context-switches:u # 0.000 K/sec 0 cpu-migrations:u # 0.000 K/sec 29,627 page-faults:u # 0.003 M/sec 28,099,608,231 cycles:u # 3.037 GHz 73,635,879,583 instructions:u # 2.62 insn per cycle 17,114,401,461 branches:u # 1849.422 M/sec 96,904,616 branch-misses:u # 0.57% of all branches 9.266196437 seconds time elapsed Performance counter stats for './lib/hotspot/libexec/hotspot-perfparser --input perf.data.heaptrack.stream --output /dev/null': 8331.098618 task-clock:u (msec) # 0.999 CPUs utilized 0 context-switches:u # 0.000 K/sec 0 cpu-migrations:u # 0.000 K/sec 26,379 page-faults:u # 0.003 M/sec 25,206,589,319 cycles:u # 3.026 GHz 65,041,985,552 instructions:u # 2.58 insn per cycle 15,106,429,469 branches:u # 1813.258 M/sec 82,649,273 branch-misses:u # 0.55% of all branches 8.343114295 seconds time elapsed Similarly, the benchmarks also dramatically improve. Before I measured: ********* Start testing of TestElfMap ********* PASS : TestElfMap::benchRegisterElfExpanding(10) RESULT : TestElfMap::benchRegisterElfExpanding():"10": 0.0018 msecs per iteration (total: 59, iterations: 32768) PASS : TestElfMap::benchRegisterElfExpanding(100) RESULT : TestElfMap::benchRegisterElfExpanding():"100": 0.024 msecs per iteration (total: 51, iterations: 2048) PASS : TestElfMap::benchRegisterElfExpanding(1000) RESULT : TestElfMap::benchRegisterElfExpanding():"1000": 1.3 msecs per iteration (total: 85, iterations: 64) PASS : TestElfMap::benchRegisterElfExpanding(2000) RESULT : TestElfMap::benchRegisterElfExpanding():"2000": 5.6 msecs per iteration (total: 91, iterations: 16) PASS : TestElfMap::benchFindElfDisjunct(10) RESULT : TestElfMap::benchFindElfDisjunct():"10": 0.0028 msecs per iteration (total: 92, iterations: 32768) PASS : TestElfMap::benchFindElfDisjunct(100) RESULT : TestElfMap::benchFindElfDisjunct():"100": 0.031 msecs per iteration (total: 64, iterations: 2048) PASS : TestElfMap::benchFindElfDisjunct(1000) RESULT : TestElfMap::benchFindElfDisjunct():"1000": 0.38 msecs per iteration (total: 98, iterations: 256) PASS : TestElfMap::benchFindElfDisjunct(2000) RESULT : TestElfMap::benchFindElfDisjunct():"2000": 0.789 msecs per iteration (total: 101, iterations: 128) PASS : TestElfMap::benchFindElfOverlapping(10) RESULT : TestElfMap::benchFindElfOverlapping():"10": 0.0029 msecs per iteration (total: 98, iterations: 32768) PASS : TestElfMap::benchFindElfOverlapping(100) RESULT : TestElfMap::benchFindElfOverlapping():"100": 0.035 msecs per iteration (total: 72, iterations: 2048) PASS : TestElfMap::benchFindElfOverlapping(1000) RESULT : TestElfMap::benchFindElfOverlapping():"1000": 0.40 msecs per iteration (total: 52, iterations: 128) PASS : TestElfMap::benchFindElfOverlapping(2000) RESULT : TestElfMap::benchFindElfOverlapping():"2000": 0.82 msecs per iteration (total: 53, iterations: 64) PASS : TestElfMap::benchFindElfExpanding(10) RESULT : TestElfMap::benchFindElfExpanding():"10": 0.0034 msecs per iteration (total: 57, iterations: 16384) PASS : TestElfMap::benchFindElfExpanding(100) RESULT : TestElfMap::benchFindElfExpanding():"100": 0.11 msecs per iteration (total: 59, iterations: 512) PASS : TestElfMap::benchFindElfExpanding(1000) RESULT : TestElfMap::benchFindElfExpanding():"1000": 10 msecs per iteration (total: 80, iterations: 8) PASS : TestElfMap::benchFindElfExpanding(2000) RESULT : TestElfMap::benchFindElfExpanding():"2000": 52.0 msecs per iteration (total: 104, iterations: 2) Totals: 33 passed, 0 failed, 0 skipped, 0 blacklisted, 7198ms ********* Finished testing of TestElfMap ********* Now, this goes down to: ********* Start testing of TestElfMap ********* Config: Using QtTest library 5.8.0, Qt 5.8.0 (x86_64-little_endian-lp64 shared (dynamic) release build; by GCC 6.3.1 20170109) PASS : TestElfMap::benchRegisterElfDisjunct(10) RESULT : TestElfMap::benchRegisterElfDisjunct():"10": 0.0016 msecs per iteration (total: 54, iterations: 32768) PASS : TestElfMap::benchRegisterElfDisjunct(100) RESULT : TestElfMap::benchRegisterElfDisjunct():"100": 0.018 msecs per iteration (total: 74, iterations: 4096) PASS : TestElfMap::benchRegisterElfDisjunct(1000) RESULT : TestElfMap::benchRegisterElfDisjunct():"1000": 0.53 msecs per iteration (total: 68, iterations: 128) PASS : TestElfMap::benchRegisterElfDisjunct(2000) RESULT : TestElfMap::benchRegisterElfDisjunct():"2000": 1.9 msecs per iteration (total: 62, iterations: 32) PASS : TestElfMap::benchRegisterElfOverlapping(10) RESULT : TestElfMap::benchRegisterElfOverlapping():"10": 0.0023 msecs per iteration (total: 76, iterations: 32768) PASS : TestElfMap::benchRegisterElfOverlapping(100) RESULT : TestElfMap::benchRegisterElfOverlapping():"100": 0.025 msecs per iteration (total: 52, iterations: 2048) PASS : TestElfMap::benchRegisterElfOverlapping(1000) RESULT : TestElfMap::benchRegisterElfOverlapping():"1000": 0.59 msecs per iteration (total: 76, iterations: 128) PASS : TestElfMap::benchRegisterElfOverlapping(2000) RESULT : TestElfMap::benchRegisterElfOverlapping():"2000": 2.0 msecs per iteration (total: 66, iterations: 32) PASS : TestElfMap::benchRegisterElfExpanding(10) RESULT : TestElfMap::benchRegisterElfExpanding():"10": 0.0015 msecs per iteration (total: 52, iterations: 32768) PASS : TestElfMap::benchRegisterElfExpanding(100) RESULT : TestElfMap::benchRegisterElfExpanding():"100": 0.015 msecs per iteration (total: 65, iterations: 4096) PASS : TestElfMap::benchRegisterElfExpanding(1000) RESULT : TestElfMap::benchRegisterElfExpanding():"1000": 0.15 msecs per iteration (total: 81, iterations: 512) PASS : TestElfMap::benchRegisterElfExpanding(2000) RESULT : TestElfMap::benchRegisterElfExpanding():"2000": 0.31 msecs per iteration (total: 81, iterations: 256) PASS : TestElfMap::benchFindElfDisjunct(10) RESULT : TestElfMap::benchFindElfDisjunct():"10": 0.0028 msecs per iteration (total: 93, iterations: 32768) PASS : TestElfMap::benchFindElfDisjunct(100) RESULT : TestElfMap::benchFindElfDisjunct():"100": 0.031 msecs per iteration (total: 65, iterations: 2048) PASS : TestElfMap::benchFindElfDisjunct(1000) RESULT : TestElfMap::benchFindElfDisjunct():"1000": 0.38 msecs per iteration (total: 99, iterations: 256) PASS : TestElfMap::benchFindElfDisjunct(2000) RESULT : TestElfMap::benchFindElfDisjunct():"2000": 0.79 msecs per iteration (total: 51, iterations: 64) PASS : TestElfMap::benchFindElfOverlapping(10) RESULT : TestElfMap::benchFindElfOverlapping():"10": 0.0028 msecs per iteration (total: 93, iterations: 32768) PASS : TestElfMap::benchFindElfOverlapping(100) RESULT : TestElfMap::benchFindElfOverlapping():"100": 0.031 msecs per iteration (total: 64, iterations: 2048) PASS : TestElfMap::benchFindElfOverlapping(1000) RESULT : TestElfMap::benchFindElfOverlapping():"1000": 0.39 msecs per iteration (total: 51, iterations: 128) PASS : TestElfMap::benchFindElfOverlapping(2000) RESULT : TestElfMap::benchFindElfOverlapping():"2000": 0.79 msecs per iteration (total: 51, iterations: 64) PASS : TestElfMap::benchFindElfExpanding(10) RESULT : TestElfMap::benchFindElfExpanding():"10": 0.0032 msecs per iteration (total: 53, iterations: 16384) PASS : TestElfMap::benchFindElfExpanding(100) RESULT : TestElfMap::benchFindElfExpanding():"100": 0.032 msecs per iteration (total: 67, iterations: 2048) PASS : TestElfMap::benchFindElfExpanding(1000) RESULT : TestElfMap::benchFindElfExpanding():"1000": 0.32 msecs per iteration (total: 84, iterations: 256) PASS : TestElfMap::benchFindElfExpanding(2000) RESULT : TestElfMap::benchFindElfExpanding():"2000": 0.65 msecs per iteration (total: 84, iterations: 128) Totals: 33 passed, 0 failed, 0 skipped, 0 blacklisted, 6747ms ********* Finished testing of TestElfMap ********* Change-Id: I6eaca5d6561dcdb0cee0d3aed4eec8f0f6c9c9a3 Reviewed-by: Ulf Hermann --- app/perfelfmap.cpp | 38 ++++++++++++++++++++------------------ app/perfelfmap.h | 9 +++------ app/perfsymboltable.cpp | 12 +++--------- app/perfsymboltable.h | 1 - tests/auto/elfmap/tst_elfmap.cpp | 40 ++++++++++++++++++++++------------------ 5 files changed, 48 insertions(+), 52 deletions(-) diff --git a/app/perfelfmap.cpp b/app/perfelfmap.cpp index be7bad7..e891fa0 100644 --- a/app/perfelfmap.cpp +++ b/app/perfelfmap.cpp @@ -32,7 +32,7 @@ QDebug operator<<(QDebug stream, const PerfElfMap::ElfInfo& info) << "len=" << info.length << ", " << "pgoff=" << info.pgoff << ", " << "timeAdded=" << info.timeAdded << ", " - << "timeOverwritten=" << info.timeOverwritten << "}"; + << "}"; return stream.space(); } @@ -55,32 +55,30 @@ bool PerfElfMap::registerElf(const quint64 addr, const quint64 len, quint64 pgof const quint64 time, const QFileInfo &fullPath) { bool cacheInvalid = false; - quint64 overwritten = std::numeric_limits::max(); const quint64 addrEnd = addr + len; const bool isFile = fullPath.isFile(); QVarLengthArray newElfs; + QVarLengthArray removedElfs; for (auto i = m_elfs.begin(), end = m_elfs.end(); i != end && i->addr < addrEnd; ++i) { const quint64 iEnd = i->addr + i->length; if (iEnd <= addr) continue; - if (i->timeOverwritten > time) { - // Newly added elf overwrites existing one. Mark the existing one as overwritten and - // reinsert any fragments of it that remain. - - if (i->addr < addr) { - newElfs.push_back(ElfInfo(i->file, i->addr, addr - i->addr, i->pgoff, - time, i->timeOverwritten)); - } - if (iEnd > addrEnd) { - newElfs.push_back(ElfInfo(i->file, addrEnd, iEnd - addrEnd, - i->pgoff + addrEnd - i->addr, time, - i->timeOverwritten)); - } - i->timeOverwritten = time; + // Newly added elf overwrites existing one. Mark the existing one as overwritten and + // reinsert any fragments of it that remain. + + if (i->addr < addr) { + newElfs.push_back(ElfInfo(i->file, i->addr, addr - i->addr, + i->pgoff, time)); + } + if (iEnd > addrEnd) { + newElfs.push_back(ElfInfo(i->file, addrEnd, iEnd - addrEnd, + i->pgoff + addrEnd - i->addr, time)); } + removedElfs.push_back(std::distance(m_elfs.begin(), i)); + // Overlapping module. Clear the cache, but only when the section is actually backed by a // file. Otherwise, we will see tons of overlapping heap/anon sections which don't actually // invalidate our caches @@ -88,7 +86,11 @@ bool PerfElfMap::registerElf(const quint64 addr, const quint64 len, quint64 pgof cacheInvalid = true; } - newElfs.push_back(ElfInfo(fullPath, addr, len, pgoff, time, overwritten)); + // remove the overwritten elfs, iterate from the back to not invalidate the indices + for (auto it = removedElfs.rbegin(), end = removedElfs.rend(); it != end; ++it) + m_elfs.remove(*it); + + newElfs.push_back(ElfInfo(fullPath, addr, len, pgoff, time)); for (const auto &elf : newElfs) { auto it = std::lower_bound(m_elfs.begin(), m_elfs.end(), @@ -110,7 +112,7 @@ PerfElfMap::ElfInfo PerfElfMap::findElf(quint64 ip, quint64 timestamp) const } while (true) { - if (i->timeAdded <= timestamp && i->timeOverwritten > timestamp) + if (i->timeAdded <= timestamp) return (i->addr + i->length > ip) ? *i : ElfInfo(); if (i == m_elfs.constBegin()) diff --git a/app/perfelfmap.h b/app/perfelfmap.h index 88923d5..9e347b9 100644 --- a/app/perfelfmap.h +++ b/app/perfelfmap.h @@ -29,10 +29,9 @@ class PerfElfMap public: struct ElfInfo { ElfInfo(const QFileInfo &file = QFileInfo(), quint64 addr = 0, quint64 length = 0, - quint64 pgoff = 0, quint64 timeAdded = 0, - quint64 timeOverwritten = std::numeric_limits::max()) : + quint64 pgoff = 0, quint64 timeAdded = 0) : file(file), addr(addr), length(length), pgoff(pgoff), timeAdded(timeAdded), - timeOverwritten(timeOverwritten), found(file.isFile()) {} + found(file.isFile()) {} bool isValid() const { @@ -46,8 +45,7 @@ public: && addr == rhs.addr && length == rhs.length && pgoff == rhs.pgoff - && timeAdded == rhs.timeAdded - && timeOverwritten == rhs.timeOverwritten; + && timeAdded == rhs.timeAdded; } QFileInfo file; @@ -55,7 +53,6 @@ public: quint64 length; quint64 pgoff; quint64 timeAdded; - quint64 timeOverwritten; bool found; }; diff --git a/app/perfsymboltable.cpp b/app/perfsymboltable.cpp index 48d6613..5c198ba 100644 --- a/app/perfsymboltable.cpp +++ b/app/perfsymboltable.cpp @@ -35,8 +35,7 @@ PerfSymbolTable::PerfSymbolTable(quint32 pid, Dwfl_Callbacks *callbacks, PerfUnwind *parent) : m_perfMapFile(QString::fromLatin1("/tmp/perf-%1.map").arg(pid)), - m_unwind(parent), m_lastMmapAddedTime(0), - m_nextMmapOverwrittenTime(std::numeric_limits::max()), m_callbacks(callbacks), + m_unwind(parent), m_lastMmapAddedTime(0), m_callbacks(callbacks), m_pid(pid) { m_dwfl = dwfl_begin(m_callbacks); @@ -364,8 +363,6 @@ Dwfl_Module *PerfSymbolTable::reportElf(const PerfElfMap::ElfInfo& info) if (m_lastMmapAddedTime < info.timeAdded) m_lastMmapAddedTime = info.timeAdded; - if (m_nextMmapOverwrittenTime > info.timeOverwritten) - m_nextMmapOverwrittenTime = info.timeOverwritten; return ret; } @@ -379,7 +376,6 @@ int PerfSymbolTable::lookupFrame(Dwarf_Addr ip, quint64 timestamp, bool isKernel bool *isInterworking) { Q_ASSERT(timestamp >= m_lastMmapAddedTime); - Q_ASSERT(timestamp < m_nextMmapOverwrittenTime); auto it = m_addressCache.constFind(ip); if (it != m_addressCache.constEnd()) { @@ -544,20 +540,19 @@ bool PerfSymbolTable::containsAddress(quint64 address) const Dwfl *PerfSymbolTable::attachDwfl(quint64 timestamp, void *arg) { - if (timestamp < m_lastMmapAddedTime || timestamp >= m_nextMmapOverwrittenTime) + if (timestamp < m_lastMmapAddedTime) clearCache(); else if (static_cast(m_pid) == dwfl_pid(m_dwfl)) return m_dwfl; // Already attached, nothing to do // Report some random elf, so that dwfl guesses the target architecture. for (const auto &elf : m_elfs) { - if (!elf.found || elf.timeAdded > timestamp || elf.timeOverwritten <= timestamp) + if (!elf.found || elf.timeAdded > timestamp) continue; if (dwfl_report_elf(m_dwfl, elf.file.fileName().toLocal8Bit().constData(), elf.file.absoluteFilePath().toLocal8Bit().constData(), -1, elf.addr, false)) { m_lastMmapAddedTime = elf.timeAdded; - m_nextMmapOverwrittenTime = elf.timeOverwritten; break; } } @@ -573,7 +568,6 @@ Dwfl *PerfSymbolTable::attachDwfl(quint64 timestamp, void *arg) void PerfSymbolTable::clearCache() { m_lastMmapAddedTime = 0; - m_nextMmapOverwrittenTime = std::numeric_limits::max(); m_addressCache.clear(); m_perfMap.clear(); if (m_perfMapFile.isOpen()) diff --git a/app/perfsymboltable.h b/app/perfsymboltable.h index c60852e..8dff8e2 100644 --- a/app/perfsymboltable.h +++ b/app/perfsymboltable.h @@ -87,7 +87,6 @@ private: Dwfl *m_dwfl; quint64 m_lastMmapAddedTime; - quint64 m_nextMmapOverwrittenTime; PerfElfMap m_elfs; Dwfl_Callbacks *m_callbacks; diff --git a/tests/auto/elfmap/tst_elfmap.cpp b/tests/auto/elfmap/tst_elfmap.cpp index 534dd78..44df515 100644 --- a/tests/auto/elfmap/tst_elfmap.cpp +++ b/tests/auto/elfmap/tst_elfmap.cpp @@ -108,30 +108,34 @@ private slots: QFileInfo file2(tmpFile2.fileName()); QCOMPARE(file2.isFile(), secondIsFile); - const PerfElfMap::ElfInfo first(file1, 95, 20, 0, 0, 1); - const PerfElfMap::ElfInfo second(file1, 105, 20, 0, 1, 2); - const PerfElfMap::ElfInfo third(file2, 100, 20, 0, 2); - PerfElfMap map; - QCOMPARE(registerElf(&map, first), false); - QCOMPARE(registerElf(&map, second), firstIsFile); - QCOMPARE(registerElf(&map, third), firstIsFile || secondIsFile); - - QCOMPARE(map.findElf(110, 0), first); - QCOMPARE(map.findElf(110, 1), second); + { + const PerfElfMap::ElfInfo first(file1, 95, 20, 0, 0); + QCOMPARE(registerElf(&map, first), false); + QCOMPARE(map.findElf(110, 0), first); + } - QCOMPARE(map.findElf(110, 2), third); - QCOMPARE(map.findElf(110, 3), third); + { + const PerfElfMap::ElfInfo second(file1, 105, 20, 0, 1); + QCOMPARE(registerElf(&map, second), firstIsFile); + QCOMPARE(map.findElf(110, 1), second); - const PerfElfMap::ElfInfo fragment1(file1, 95, 10, 0, 1, 2); - QCOMPARE(map.findElf(97, 1), fragment1); + const PerfElfMap::ElfInfo fragment1(file1, 95, 10, 0, 1); + QCOMPARE(map.findElf(97, 1), fragment1); + } - const PerfElfMap::ElfInfo fragment2(file1, 120, 5, 15, 2); - QCOMPARE(map.findElf(122, 2), fragment2); + { + const PerfElfMap::ElfInfo third(file2, 100, 20, 0, 2); + QCOMPARE(registerElf(&map, third), firstIsFile || secondIsFile); + QCOMPARE(map.findElf(110, 2), third); + QCOMPARE(map.findElf(110, 3), third); - const PerfElfMap::ElfInfo fragment3(file1, 95, 5, 0, 2); - QCOMPARE(map.findElf(97, 2), fragment3); + const PerfElfMap::ElfInfo fragment2(file1, 120, 5, 15, 2); + const PerfElfMap::ElfInfo fragment3(file1, 95, 5, 0, 2); + QCOMPARE(map.findElf(122, 2), fragment2); + QCOMPARE(map.findElf(97, 2), fragment3); + } } void testOverwrite_data() -- cgit v1.2.3