summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMilian Wolff <milian.wolff@kdab.com>2017-11-07 09:46:28 +0100
committerMilian Wolff <milian.wolff@kdab.com>2019-08-16 08:37:25 +0000
commit3508f6297a829027694a8d141945797e4923c748 (patch)
treec563674396b62ef128a824ae5418d35e9d8d4712
parent1d4f8dd4ac30acf02c2d774d7ecd51b1e92d40ee (diff)
Speed up perfparser when DWARF ranges are broken/missing in ELFs
The previous approach to handle the broken DWARF emitter from clang was to parse all CUs in the hope to find one that contains the requested address. This was done repeatedly for every address and often had to go through most of the CUs in a binary, which is a costly process. To reproduce, one can do the following: - build perfparser with clang - profile this build of perfparser while parsing some given workload - inspect the profile and notice the large overhead from find_fundie_by_pc Instead, the first time dwfl_module_addrdie fails, we now query for all CUs of an ELF and build a custom range map and use that for lookup. The initial build of this range map is roughly as costly as querying for two addresses using the old slow code path. As such, once we query three or more addresses in a given ELF this new approach is already yielding better performance. Some numbers from my test: Before: Performance counter stats for './perfparser --input perf.data --output /dev/null': 46576.925866 task-clock:u (msec) # 0.999 CPUs utilized 0 context-switches:u # 0.000 K/sec 0 cpu-migrations:u # 0.000 K/sec 60,868 page-faults:u # 0.001 M/sec 144,506,727,842 cycles:u # 3.103 GHz 472,211,018,102 instructions:u # 3.27 insn per cycle 116,423,898,539 branches:u # 2499.605 M/sec 488,663,345 branch-misses:u # 0.42% of all branches 46.611237448 seconds time elapsed After: Performance counter stats for './perfparser --input perf.data --output /dev/null': 17447.629837 task-clock:u (msec) # 0.995 CPUs utilized 0 context-switches:u # 0.000 K/sec 0 cpu-migrations:u # 0.000 K/sec 60,142 page-faults:u # 0.003 M/sec 53,150,847,387 cycles:u # 3.046 GHz 158,503,454,039 instructions:u # 2.98 insn per cycle 38,711,860,671 branches:u # 2218.746 M/sec 190,209,418 branch-misses:u # 0.49% of all branches 17.543916999 seconds time elapsed Change-Id: I9ca45ad7c8f77f91d0376f6dcae2f73c6e868404 Reviewed-by: Ulf Hermann <ulf.hermann@qt.io>
-rw-r--r--app/perfsymboltable.cpp230
-rw-r--r--app/perfsymboltable.h3
2 files changed, 161 insertions, 72 deletions
diff --git a/app/perfsymboltable.cpp b/app/perfsymboltable.cpp
index 445528c..3bc68bc 100644
--- a/app/perfsymboltable.cpp
+++ b/app/perfsymboltable.cpp
@@ -32,6 +32,7 @@
#include <QDir>
#include <QStack>
+#include <tuple>
#include <cstring>
#include <fcntl.h>
@@ -598,6 +599,150 @@ PerfElfMap::ElfInfo PerfSymbolTable::findElf(quint64 ip) const
return m_elfs.findElf(ip);
}
+struct AddrRange
+{
+ Dwarf_Addr low = 0;
+ Dwarf_Addr high = 0;
+
+ void setMinMax(const AddrRange range)
+ {
+ if (range.low && (low == 0 || low > range.low))
+ low = range.low;
+ if (range.high && (high == 0 || high < range.high))
+ high = range.high;
+ }
+
+ bool contains(Dwarf_Addr addr) const
+ {
+ return low <= addr && addr < high;
+ }
+
+ bool operator<(const AddrRange &rhs) const
+ {
+ return std::tie(low, high) < std::tie(rhs.low, high);
+ }
+};
+Q_DECLARE_TYPEINFO(AddrRange, Q_MOVABLE_TYPE);
+
+struct DieRangeMap
+{
+ DieRangeMap(Dwarf_Die *die = nullptr, Dwarf_Addr bias = 0)
+ : die(die)
+ , bias(bias)
+ {
+ if (die)
+ gatherRanges(die, bias);
+ }
+
+ bool contains(Dwarf_Addr addr) const
+ {
+ if (!range.contains(addr))
+ return false;
+ return std::any_of(ranges.begin(), ranges.end(),
+ [addr](AddrRange range) {
+ return range.contains(addr);
+ });
+ }
+
+ bool operator<(const DieRangeMap &rhs) const
+ {
+ return range < rhs.range;
+ }
+
+ Dwarf_Die *die = nullptr;
+ AddrRange range; // may be non-continuous, but allows quick checks and sorting
+ QVector<AddrRange> ranges;
+ Dwarf_Addr bias;
+
+private:
+ void gatherRanges(Dwarf_Die *parent_die, Dwarf_Addr bias)
+ {
+ Dwarf_Die die;
+ if (dwarf_child(parent_die, &die) != 0)
+ return;
+
+ do {
+ switch (dwarf_tag(&die)) {
+ case DW_TAG_subprogram:
+ case DW_TAG_inlined_subroutine:
+ addRanges(&die, bias);
+ break;
+ };
+ bool declaration = false;
+ Dwarf_Attribute attr_mem;
+ dwarf_formflag(dwarf_attr(&die, DW_AT_declaration, &attr_mem), &declaration);
+ if (!declaration) {
+ // let's be curious and look deeper in the tree,
+ // function are not necessarily at the first level, but
+ // might be nested inside a namespace, structure etc.
+ gatherRanges(&die, bias);
+ }
+ } while (dwarf_siblingof(&die, &die) == 0);
+ }
+
+ void addRanges(Dwarf_Die *die, Dwarf_Addr bias)
+ {
+ Dwarf_Addr low = 0, high = 0;
+ Dwarf_Addr base = 0;
+ ptrdiff_t offset = 0;
+ while ((offset = dwarf_ranges(die, offset, &base, &low, &high)) > 0) {
+ addRange(low, high, bias);
+ }
+ }
+
+ void addRange(Dwarf_Addr low, Dwarf_Addr high, Dwarf_Addr bias)
+ {
+ AddrRange ret;
+ ret.low = low + bias;
+ ret.high = high + bias;
+ range.setMinMax(ret);
+ ranges.push_back(ret);
+ }
+};
+Q_DECLARE_TYPEINFO(DieRangeMap, Q_MOVABLE_TYPE);
+
+class DieRangeMaps
+{
+public:
+ DieRangeMaps(Dwfl_Module *mod = nullptr)
+ {
+ if (!mod)
+ return;
+
+ Dwarf_Die *die = nullptr;
+ Dwarf_Addr bias = 0;
+ while ((die = dwfl_module_nextcu(mod, die, &bias))) {
+ DieRangeMap map(die, bias);
+ if (map.range.low == 0 && map.range.high == 0) {
+ // no range entries, skip
+ continue;
+ }
+ range.setMinMax(map.range);
+ maps.push_back(std::move(map));
+ }
+ }
+
+ Dwarf_Die *findDie(Dwarf_Addr addr, Dwarf_Addr *bias) const
+ {
+ if (!range.contains(addr))
+ return nullptr;
+
+ auto it = std::find_if(maps.begin(), maps.end(),
+ [addr](const DieRangeMap &map) {
+ return map.contains(addr);
+ });
+ if (it == maps.end())
+ return nullptr;
+
+ *bias = it->bias;
+ return it->die;
+ }
+public:
+ AddrRange range; // may be non-continuous, but allows quick checks
+ QVector<DieRangeMap> maps;
+};
+Q_DECLARE_TYPEINFO(DieRangeMaps, Q_MOVABLE_TYPE);
+
int symbolIndex(const Elf64_Rel &rel)
{
return ELF64_R_SYM(rel.r_info);
@@ -771,77 +916,6 @@ static QByteArray fakeSymbolFromSection(Dwfl_Module *mod, Dwarf_Addr addr)
return sym;
}
-// based on MIT licensed https://github.com/bombela/backward-cpp
-static bool die_has_pc(Dwarf_Die* die, Dwarf_Addr pc)
-{
- Dwarf_Addr low, high;
-
- // continuous range
- if (dwarf_hasattr(die, DW_AT_low_pc) && dwarf_hasattr(die, DW_AT_high_pc)) {
- if (dwarf_lowpc(die, &low) != 0)
- return false;
- if (dwarf_highpc(die, &high) != 0) {
- Dwarf_Attribute attr_mem;
- Dwarf_Attribute* attr = dwarf_attr(die, DW_AT_high_pc, &attr_mem);
- Dwarf_Word value;
- if (dwarf_formudata(attr, &value) != 0)
- return false;
- high = low + value;
- }
- return pc >= low && pc < high;
- }
-
- // non-continuous range.
- Dwarf_Addr base;
- ptrdiff_t offset = 0;
- while ((offset = dwarf_ranges(die, offset, &base, &low, &high)) > 0) {
- if (pc >= low && pc < high)
- return true;
- }
- return false;
-}
-
-static bool find_fundie_by_pc(Dwarf_Die* parent_die, Dwarf_Addr pc)
-{
- Dwarf_Die die;
- if (dwarf_child(parent_die, &die) != 0)
- return false;
-
- do {
- switch (dwarf_tag(&die)) {
- case DW_TAG_subprogram:
- case DW_TAG_inlined_subroutine:
- if (die_has_pc(&die, pc))
- return true;
- };
- bool declaration = false;
- Dwarf_Attribute attr_mem;
- dwarf_formflag(dwarf_attr(&die, DW_AT_declaration, &attr_mem), &declaration);
- if (!declaration) {
- // let's be curious and look deeper in the tree,
- // function are not necessarily at the first level, but
- // might be nested inside a namespace, structure etc.
- if (find_fundie_by_pc(&die, pc))
- return true;
- }
- } while (dwarf_siblingof(&die, &die) == 0);
- return false;
-}
-
-Dwarf_Die *find_die(Dwfl_Module *mod, Dwarf_Addr addr, Dwarf_Addr *bias)
-{
- auto die = dwfl_module_addrdie(mod, addr, bias);
- if (die)
- return die;
-
- while ((die = dwfl_module_nextcu(mod, die, bias))) {
- if (find_fundie_by_pc(die, addr - *bias))
- return die;
- }
-
- return nullptr;
-}
-
int PerfSymbolTable::lookupFrame(Dwarf_Addr ip, bool isKernel,
bool *isInterworking)
{
@@ -883,7 +957,18 @@ int PerfSymbolTable::lookupFrame(Dwarf_Addr ip, bool isKernel,
} else {
Dwarf_Addr bias = 0;
functionLocation.address -= off; // in case we don't find anything better
- Dwarf_Die *die = find_die(mod, addressLocation.address, &bias);
+
+ auto die = dwfl_module_addrdie(mod, addressLocation.address, &bias);
+ if (!die) {
+ // broken DWARF emitter by clang, e.g. no aranges
+ // cf.: https://sourceware.org/ml/elfutils-devel/2017-q2/msg00180.html
+ // build a custom lookup table and query that one
+ if (!m_dieRangeMaps.contains(mod)) {
+ m_dieRangeMaps[mod] = DieRangeMaps(mod);
+ }
+ const auto& maps = m_dieRangeMaps[mod];
+ die = maps.findDie(addressLocation.address, &bias);
+ }
if (die) {
auto srcloc = dwarf_getsrc_die(die, addressLocation.address - bias);
@@ -1036,6 +1121,7 @@ Dwfl *PerfSymbolTable::attachDwfl(void *arg)
void PerfSymbolTable::clearCache()
{
m_addressCache.clearInvalid();
+ m_dieRangeMaps.clear();
m_perfMap.clear();
if (m_perfMapFile.isOpen())
m_perfMapFile.reset();
diff --git a/app/perfsymboltable.h b/app/perfsymboltable.h
index 3fcd155..4d9318b 100644
--- a/app/perfsymboltable.h
+++ b/app/perfsymboltable.h
@@ -34,6 +34,8 @@
#include <QObject>
+class DieRangeMaps;
+
class PerfSymbolTable
{
public:
@@ -109,6 +111,7 @@ private:
PerfElfMap m_elfs;
PerfAddressCache m_addressCache;
+ QHash<Dwfl_Module*, DieRangeMaps> m_dieRangeMaps;
Dwfl_Callbacks *m_callbacks;
qint32 m_pid;