diff options
author | Milian Wolff <milian.wolff@kdab.com> | 2017-08-06 18:39:49 +0200 |
---|---|---|
committer | Ulf Hermann <ulf.hermann@qt.io> | 2019-05-02 14:39:16 +0000 |
commit | 6a41c2d90a43e1406f6ac27cacc1e5f6affe7108 (patch) | |
tree | c7674afc0aa28cbf1036bd641403ebcd589987ef | |
parent | f528071fa3b392e4e29f500746f799771939b96d (diff) |
Add support for PERF_RECORD_SWITCH events and forward them to clients
This enables accurate off-CPU profiling based on context switch data.
When perf data is recorded via
perf record --switch-events ...
then the switch in/out events of every thread is included. The time
differences allow us to measure how long a thread was off-CPU. Paired
with call stacks recorded for the switch-out event, we can attribute
the wait time to individual source code locations, i.e.:
perf record --switch-events -e sched:sched_switch
This then gives us event data such as:
lab_mandelbrot_ 6974 [003] 1888.320564: sched:sched_switch: lab_mandelbrot_:6974 [120] S ==> swapper/3:0 [120]
857516 __sched_text_start (/lib/modules/4.12.3-1-ARCH/build/vmlinux)
857b0d schedule (/lib/modules/4.12.3-1-ARCH/build/vmlinux)
85bdc7 schedule_hrtimeout_range_clock (/lib/modules/4.12.3-1-ARCH/build/vmlinux)
85bdf3 schedule_hrtimeout_range (/lib/modules/4.12.3-1-ARCH/build/vmlinux)
434d85 poll_schedule_timeout (/lib/modules/4.12.3-1-ARCH/build/vmlinux)
436150 do_sys_poll (/lib/modules/4.12.3-1-ARCH/build/vmlinux)
436aa0 sys_poll (/lib/modules/4.12.3-1-ARCH/build/vmlinux)
85d0f7 entry_SYSCALL_64_fastpath (/lib/modules/4.12.3-1-ARCH/build/vmlinux)
e3e80 __poll_nocancel (/usr/lib/libc-2.25.so)
...
lab_mandelbrot_ 6974 1888.320568: PERF_RECORD_SWITCH OUT
lab_mandelbrot_ 6974 1888.320593: PERF_RECORD_SWITCH IN
Note how you don't get the sched:sched_switch trace event for the
switch-in event, as that happens in the context of another process.
Additionally to enabling such kind of off-CPU profiling, we can use
the switch events for creating a cpu usage timeline, similar to what
other tools offer. I.e. a thread that is scheduled consumes one CPU
for the given timespan. Paired with other threads and based on the
zoom level of the timeline, this then leads to fractional CPU loads.
Change-Id: Ia2f4d07692e68a4c20244be6327791c6ceaed85c
Reviewed-by: Milian Wolff <milian.wolff@kdab.com>
-rw-r--r-- | app/perfdata.cpp | 16 | ||||
-rw-r--r-- | app/perfdata.h | 29 | ||||
-rw-r--r-- | app/perfunwind.cpp | 37 | ||||
-rw-r--r-- | app/perfunwind.h | 13 |
4 files changed, 92 insertions, 3 deletions
diff --git a/app/perfdata.cpp b/app/perfdata.cpp index f9407f6..2f68b1b 100644 --- a/app/perfdata.cpp +++ b/app/perfdata.cpp @@ -175,6 +175,12 @@ PerfData::ReadStatus PerfData::processEvents(QDataStream &stream) } break; } + case PERF_RECORD_SWITCH: { + PerfRecordContextSwitch switchEvent(&m_eventHeader, sampleType, sampleIdAll); + stream >> switchEvent; + m_destination->contextSwitch(switchEvent); + break; + } default: qWarning() << "unhandled event type" << m_eventHeader.type; @@ -628,3 +634,13 @@ QDataStream &operator>>(QDataStream &stream, PerfRecordFork &record) return stream >> record.m_pid >> record.m_ppid >> record.m_tid >> record.m_ptid >> record.m_time >> record.m_sampleId; } + +PerfRecordContextSwitch::PerfRecordContextSwitch(PerfEventHeader *header, quint64 sampleType, bool sampleIdAll) : + PerfRecord(header, sampleType, sampleIdAll) +{ +} + +QDataStream &operator>>(QDataStream &stream, PerfRecordContextSwitch &record) +{ + return stream >> record.m_sampleId; +} diff --git a/app/perfdata.h b/app/perfdata.h index 7daf794..9928c94 100644 --- a/app/perfdata.h +++ b/app/perfdata.h @@ -213,6 +213,18 @@ enum PerfEventType { */ PERF_RECORD_MMAP2 = 10, + /* + * Records a context switch in or out (flagged by + * PERF_RECORD_MISC_SWITCH_OUT). See also + * PERF_RECORD_SWITCH_CPU_WIDE. + * + * struct { + * struct perf_event_header header; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_SWITCH = 14, + PERF_RECORD_MAX, /* non-ABI */ PERF_RECORD_USER_TYPE_START = 64, @@ -224,6 +236,10 @@ enum PerfEventType { PERF_RECORD_HEADER_MAX }; +enum PERF_RECORD_MISC { + PERF_RECORD_MISC_SWITCH_OUT = (1 << 13), +}; + class PerfRecordSample; // Use first attribute for deciding if this is present, not the header! @@ -269,6 +285,7 @@ public: quint64 time() const { return m_sampleId.time(); } quint64 id() const { return m_sampleId.id(); } quint16 size() const { return m_header.size; } + quint16 misc() const { return m_header.misc; } protected: PerfRecord(const PerfEventHeader *header, quint64 sampleType, bool sampleIdAll); @@ -458,6 +475,18 @@ QDataStream &operator>>(QDataStream &stream, PerfRecordFork &record); typedef PerfRecordFork PerfRecordExit; +class PerfRecordContextSwitch : public PerfRecord +{ +public: + PerfRecordContextSwitch(PerfEventHeader *header = 0, quint64 sampleType = 0, + bool sampleIdAll = false); + +private: + friend QDataStream &operator>>(QDataStream &stream, PerfRecordContextSwitch &record); +}; + +QDataStream &operator>>(QDataStream &stream, PerfRecordContextSwitch &record); + class PerfUnwind; class PerfData : public QObject { diff --git a/app/perfunwind.cpp b/app/perfunwind.cpp index 3eef7d8..b6339c7 100644 --- a/app/perfunwind.cpp +++ b/app/perfunwind.cpp @@ -63,8 +63,10 @@ void PerfUnwind::Stats::finishedRound() maxSamplesPerRound = std::max(maxSamplesPerRound, numSamplesInRound); maxMmapsPerRound = std::max(maxMmapsPerRound, numMmapsInRound); + maxContextSwitchesPerRound = std::max(maxContextSwitchesPerRound, numContextSwitchesInRound); numSamplesInRound = 0; numMmapsInRound = 0; + numContextSwitchesInRound = 0; ++numRounds; maxTotalEventSizePerRound = std::max(maxTotalEventSizePerRound, @@ -147,8 +149,10 @@ PerfUnwind::~PerfUnwind() out << "mmaps time violations: " << m_stats.numTimeViolatingMmaps << "\n"; out << "max samples per round: " << m_stats.maxSamplesPerRound << "\n"; out << "max mmaps per round: " << m_stats.maxMmapsPerRound << "\n"; + out << "max context switches per round: " << m_stats.maxContextSwitchesPerRound << "\n"; out << "max samples per flush: " << m_stats.maxSamplesPerFlush << "\n"; out << "max mmaps per flush: " << m_stats.maxMmapsPerFlush << "\n"; + out << "max context switches per flush: " << m_stats.maxContextSwitchesPerFlush << "\n"; out << "max buffer size: " << m_stats.maxBufferSize << "\n"; out << "max total event size per round: " << m_stats.maxTotalEventSizePerRound << "\n"; out << "max time: " << m_stats.maxTime << "\n"; @@ -839,6 +843,7 @@ void PerfUnwind::flushEventBuffer(uint desiredBufferSize) // which we must not shuffle std::stable_sort(m_mmapBuffer.begin(), m_mmapBuffer.end(), sortByTime); std::stable_sort(m_sampleBuffer.begin(), m_sampleBuffer.end(), sortByTime); + std::stable_sort(m_contextSwitchBuffer.begin(), m_contextSwitchBuffer.end(), sortByTime); if (m_stats.enabled) { for (const auto &sample : m_sampleBuffer) { @@ -874,6 +879,10 @@ void PerfUnwind::flushEventBuffer(uint desiredBufferSize) auto sampleEnd = m_sampleBuffer.end(); uint bufferSize = m_eventBufferSize; + + auto contextSwitchIt = m_contextSwitchBuffer.begin(); + auto contextSwitchEnd = m_contextSwitchBuffer.end(); + for (; m_eventBufferSize > desiredBufferSize && sampleIt != sampleEnd; ++sampleIt) { const quint64 timestamp = sampleIt->time(); @@ -904,6 +913,14 @@ void PerfUnwind::flushEventBuffer(uint desiredBufferSize) forwardMmapBuffer(mmapIt, mmapEnd, timestamp); + for (; contextSwitchIt != contextSwitchEnd && contextSwitchIt->time() <= sampleIt->time(); + ++contextSwitchIt) { + if (!m_stats.enabled) { + sendContextSwitch(*contextSwitchIt); + } + m_eventBufferSize -= contextSwitchIt->size(); + } + analyze(*sampleIt); m_eventBufferSize -= sampleIt->size(); } @@ -918,10 +935,15 @@ void PerfUnwind::flushEventBuffer(uint desiredBufferSize) Q_ASSERT(mmaps >= 0 && mmaps < std::numeric_limits<uint>::max()); m_stats.maxMmapsPerFlush = std::max(static_cast<uint>(mmaps), m_stats.maxMmapsPerFlush); + const auto contextSwitches = std::distance(m_contextSwitchBuffer.begin(), contextSwitchIt); + Q_ASSERT(contextSwitches >= 0 && contextSwitches < std::numeric_limits<uint>::max()); + m_stats.maxContextSwitchesPerFlush = std::max(static_cast<uint>(contextSwitches), + m_stats.maxContextSwitchesPerFlush); } m_sampleBuffer.erase(m_sampleBuffer.begin(), sampleIt); m_mmapBuffer.erase(m_mmapBuffer.begin(), mmapIt); + m_contextSwitchBuffer.erase(m_contextSwitchBuffer.begin(), contextSwitchIt); if (!violatesTimeOrder) return; @@ -946,3 +968,18 @@ void PerfUnwind::flushEventBuffer(uint desiredBufferSize) qDebug() << "Increasing buffer size to" << bufferSize; setTargetEventBufferSize(bufferSize); } + +void PerfUnwind::contextSwitch(const PerfRecordContextSwitch& contextSwitch) +{ + bufferEvent(contextSwitch, &m_contextSwitchBuffer, &m_stats.numContextSwitchesInRound); +} + +void PerfUnwind::sendContextSwitch(const PerfRecordContextSwitch& contextSwitch) +{ + QByteArray buffer; + QDataStream(&buffer, QIODevice::WriteOnly) << static_cast<quint8>(ContextSwitchDefinition) + << contextSwitch.pid() << contextSwitch.tid() + << contextSwitch.time() + << bool(contextSwitch.misc() & PERF_RECORD_MISC_SWITCH_OUT); + sendBuffer(buffer); +} diff --git a/app/perfunwind.h b/app/perfunwind.h index 304534d..27e992a 100644 --- a/app/perfunwind.h +++ b/app/perfunwind.h @@ -60,6 +60,7 @@ public: TracePointFormat, TracePointSample, AttributesDefinition, + ContextSwitchDefinition, InvalidType }; @@ -107,9 +108,9 @@ public: Stats() : numSamples(0), numMmaps(0), numRounds(0), numBufferFlushes(0), numTimeViolatingSamples(0), numTimeViolatingMmaps(0), - numSamplesInRound(0), numMmapsInRound(0), - maxSamplesPerRound(0), maxMmapsPerRound(0), - maxSamplesPerFlush(0), maxMmapsPerFlush(0), + numSamplesInRound(0), numMmapsInRound(0), numContextSwitchesInRound(0), + maxSamplesPerRound(0), maxMmapsPerRound(0), maxContextSwitchesPerRound(0), + maxSamplesPerFlush(0), maxMmapsPerFlush(0), maxContextSwitchesPerFlush(0), maxBufferSize(0), maxTotalEventSizePerRound(0), maxTime(0), maxTimeBetweenRounds(0), maxReorderTime(0), lastRoundTime(0), totalEventSizePerRound(0), @@ -127,10 +128,13 @@ public: quint64 numTimeViolatingMmaps; uint numSamplesInRound; uint numMmapsInRound; + uint numContextSwitchesInRound; uint maxSamplesPerRound; uint maxMmapsPerRound; + uint maxContextSwitchesPerRound; uint maxSamplesPerFlush; uint maxMmapsPerFlush; + uint maxContextSwitchesPerFlush; uint maxBufferSize; uint maxTotalEventSizePerRound; quint64 maxTime; @@ -182,6 +186,7 @@ public: void features(const PerfFeatures &features); void tracing(const PerfTracingData &tracingData); void finishedRound(); + void contextSwitch(const PerfRecordContextSwitch &contextSwitch); bool ipIsInKernelSpace(quint64 ip) const; void sample(const PerfRecordSample &sample); @@ -273,6 +278,7 @@ private: QMultiMap<quint64, QByteArray> m_auxBuffer; QList<PerfRecordSample> m_sampleBuffer; QList<PerfRecordMmap> m_mmapBuffer; + QList<PerfRecordContextSwitch> m_contextSwitchBuffer; QHash<qint32, PerfSymbolTable *> m_symbolTables; PerfKallsyms m_kallsyms; PerfTracingData m_tracingData; @@ -305,6 +311,7 @@ private: void sendSymbol(qint32 id, const Symbol &symbol); void sendAttributes(qint32 id, const PerfEventAttributes &attributes, const QByteArray &name); void sendEventFormat(qint32 id, const EventFormat &format); + void sendContextSwitch(const PerfRecordContextSwitch &contextSwitch); template<typename Event> void bufferEvent(const Event &event, QList<Event> *buffer, uint *eventCounter); |