Add support for PERF_RECORD_SWITCH events and forward them to clients

This enables accurate off-CPU profiling based on context switch data. When perf data is recorded via perf record --switch-events ... then the switch in/out events of every thread is included. The time differences allow us to measure how long a thread was off-CPU. Paired with call stacks recorded for the switch-out event, we can attribute the wait time to individual source code locations, i.e.: perf record --switch-events -e sched:sched_switch This then gives us event data such as: lab_mandelbrot_ 6974 [003] 1888.320564: sched:sched_switch: lab_mandelbrot_:6974 [120] S ==> swapper/3:0 [120] 857516 __sched_text_start (/lib/modules/4.12.3-1-ARCH/build/vmlinux) 857b0d schedule (/lib/modules/4.12.3-1-ARCH/build/vmlinux) 85bdc7 schedule_hrtimeout_range_clock (/lib/modules/4.12.3-1-ARCH/build/vmlinux) 85bdf3 schedule_hrtimeout_range (/lib/modules/4.12.3-1-ARCH/build/vmlinux) 434d85 poll_schedule_timeout (/lib/modules/4.12.3-1-ARCH/build/vmlinux) 436150 do_sys_poll (/lib/modules/4.12.3-1-ARCH/build/vmlinux) 436aa0 sys_poll (/lib/modules/4.12.3-1-ARCH/build/vmlinux) 85d0f7 entry_SYSCALL_64_fastpath (/lib/modules/4.12.3-1-ARCH/build/vmlinux) e3e80 __poll_nocancel (/usr/lib/libc-2.25.so) ... lab_mandelbrot_ 6974 1888.320568: PERF_RECORD_SWITCH OUT lab_mandelbrot_ 6974 1888.320593: PERF_RECORD_SWITCH IN Note how you don't get the sched:sched_switch trace event for the switch-in event, as that happens in the context of another process. Additionally to enabling such kind of off-CPU profiling, we can use the switch events for creating a cpu usage timeline, similar to what other tools offer. I.e. a thread that is scheduled consumes one CPU for the given timespan. Paired with other threads and based on the zoom level of the timeline, this then leads to fractional CPU loads. Change-Id: Ia2f4d07692e68a4c20244be6327791c6ceaed85c Reviewed-by: Milian Wolff <milian.wolff@kdab.com>
author: Milian Wolff <milian.wolff@kdab.com> 2017-08-06 18:39:49 +0200
committer: Ulf Hermann <ulf.hermann@qt.io> 2019-05-02 14:39:16 +0000
commit: 6a41c2d90a43e1406f6ac27cacc1e5f6affe7108 (patch)
tree: c7674afc0aa28cbf1036bd641403ebcd589987ef
parent: f528071fa3b392e4e29f500746f799771939b96d (diff)
4 files changed, 92 insertions, 3 deletions
diff --git a/app/perfdata.cpp b/app/perfdata.cpp
index f9407f6..2f68b1b 100644
--- a/app/perfdata.cpp
+++ b/app/perfdata.cpp
@@ -175,6 +175,12 @@ PerfData::ReadStatus PerfData::processEvents(QDataStream &stream)
         }
         break;
     }
+    case PERF_RECORD_SWITCH: {
+        PerfRecordContextSwitch switchEvent(&m_eventHeader, sampleType, sampleIdAll);
+        stream >> switchEvent;
+        m_destination->contextSwitch(switchEvent);
+        break;
+    }
 
     default:
         qWarning() << "unhandled event type" << m_eventHeader.type;
@@ -628,3 +634,13 @@ QDataStream &operator>>(QDataStream &stream, PerfRecordFork &record)
     return stream >> record.m_pid >> record.m_ppid >> record.m_tid >> record.m_ptid >> record.m_time
                   >> record.m_sampleId;
 }
+
+PerfRecordContextSwitch::PerfRecordContextSwitch(PerfEventHeader *header, quint64 sampleType, bool sampleIdAll) :
+    PerfRecord(header, sampleType, sampleIdAll)
+{
+}
+
+QDataStream &operator>>(QDataStream &stream, PerfRecordContextSwitch &record)
+{
+    return stream >> record.m_sampleId;
+}
diff --git a/app/perfdata.h b/app/perfdata.h
index 7daf794..9928c94 100644
--- a/app/perfdata.h
+++ b/app/perfdata.h
@@ -213,6 +213,18 @@ enum PerfEventType {
      */
     PERF_RECORD_MMAP2             = 10,
 
+    /*
+     * Records a context switch in or out (flagged by
+     * PERF_RECORD_MISC_SWITCH_OUT). See also
+     * PERF_RECORD_SWITCH_CPU_WIDE.
+     *
+     * struct {
+     *    struct perf_event_header header;
+     *    struct sample_id         sample_id;
+     * };
+     */
+    PERF_RECORD_SWITCH            = 14,
+
     PERF_RECORD_MAX,              /* non-ABI */
 
     PERF_RECORD_USER_TYPE_START     = 64,
@@ -224,6 +236,10 @@ enum PerfEventType {
     PERF_RECORD_HEADER_MAX
 };
 
+enum PERF_RECORD_MISC {
+    PERF_RECORD_MISC_SWITCH_OUT = (1 << 13),
+};
+
 class PerfRecordSample;
 
 // Use first attribute for deciding if this is present, not the header!
@@ -269,6 +285,7 @@ public:
     quint64 time() const { return m_sampleId.time(); }
     quint64 id() const { return m_sampleId.id(); }
     quint16 size() const { return m_header.size; }
+    quint16 misc() const { return m_header.misc; }
 
 protected:
     PerfRecord(const PerfEventHeader *header, quint64 sampleType, bool sampleIdAll);
@@ -458,6 +475,18 @@ QDataStream &operator>>(QDataStream &stream, PerfRecordFork &record);
 
 typedef PerfRecordFork PerfRecordExit;
 
+class PerfRecordContextSwitch : public PerfRecord
+{
+public:
+    PerfRecordContextSwitch(PerfEventHeader *header = 0, quint64 sampleType = 0,
+                            bool sampleIdAll = false);
+
+private:
+    friend QDataStream &operator>>(QDataStream &stream, PerfRecordContextSwitch &record);
+};
+
+QDataStream &operator>>(QDataStream &stream, PerfRecordContextSwitch &record);
+
 class PerfUnwind;
 class PerfData : public QObject
 {
diff --git a/app/perfunwind.cpp b/app/perfunwind.cpp
index 3eef7d8..b6339c7 100644
--- a/app/perfunwind.cpp
+++ b/app/perfunwind.cpp
@@ -63,8 +63,10 @@ void PerfUnwind::Stats::finishedRound()
 
     maxSamplesPerRound = std::max(maxSamplesPerRound, numSamplesInRound);
     maxMmapsPerRound = std::max(maxMmapsPerRound, numMmapsInRound);
+    maxContextSwitchesPerRound = std::max(maxContextSwitchesPerRound, numContextSwitchesInRound);
     numSamplesInRound = 0;
     numMmapsInRound = 0;
+    numContextSwitchesInRound = 0;
     ++numRounds;
 
     maxTotalEventSizePerRound = std::max(maxTotalEventSizePerRound,
@@ -147,8 +149,10 @@ PerfUnwind::~PerfUnwind()
         out << "mmaps time violations: " << m_stats.numTimeViolatingMmaps << "\n";
         out << "max samples per round: " << m_stats.maxSamplesPerRound << "\n";
         out << "max mmaps per round: " << m_stats.maxMmapsPerRound << "\n";
+        out << "max context switches per round: " << m_stats.maxContextSwitchesPerRound << "\n";
         out << "max samples per flush: " << m_stats.maxSamplesPerFlush << "\n";
         out << "max mmaps per flush: " << m_stats.maxMmapsPerFlush << "\n";
+        out << "max context switches per flush: " << m_stats.maxContextSwitchesPerFlush << "\n";
         out << "max buffer size: " << m_stats.maxBufferSize << "\n";
         out << "max total event size per round: " << m_stats.maxTotalEventSizePerRound << "\n";
         out << "max time: " << m_stats.maxTime << "\n";
@@ -839,6 +843,7 @@ void PerfUnwind::flushEventBuffer(uint desiredBufferSize)
     // which we must not shuffle
     std::stable_sort(m_mmapBuffer.begin(), m_mmapBuffer.end(), sortByTime);
     std::stable_sort(m_sampleBuffer.begin(), m_sampleBuffer.end(), sortByTime);
+    std::stable_sort(m_contextSwitchBuffer.begin(), m_contextSwitchBuffer.end(), sortByTime);
 
     if (m_stats.enabled) {
         for (const auto &sample : m_sampleBuffer) {
@@ -874,6 +879,10 @@ void PerfUnwind::flushEventBuffer(uint desiredBufferSize)
     auto sampleEnd = m_sampleBuffer.end();
 
     uint bufferSize = m_eventBufferSize;
+
+    auto contextSwitchIt = m_contextSwitchBuffer.begin();
+    auto contextSwitchEnd = m_contextSwitchBuffer.end();
+
     for (; m_eventBufferSize > desiredBufferSize && sampleIt != sampleEnd; ++sampleIt) {
         const quint64 timestamp = sampleIt->time();
 
@@ -904,6 +913,14 @@ void PerfUnwind::flushEventBuffer(uint desiredBufferSize)
 
         forwardMmapBuffer(mmapIt, mmapEnd, timestamp);
 
+        for (; contextSwitchIt != contextSwitchEnd && contextSwitchIt->time() <= sampleIt->time();
+             ++contextSwitchIt) {
+            if (!m_stats.enabled) {
+                sendContextSwitch(*contextSwitchIt);
+            }
+            m_eventBufferSize -= contextSwitchIt->size();
+        }
+
         analyze(*sampleIt);
         m_eventBufferSize -= sampleIt->size();
     }
@@ -918,10 +935,15 @@ void PerfUnwind::flushEventBuffer(uint desiredBufferSize)
         Q_ASSERT(mmaps >= 0 && mmaps < std::numeric_limits<uint>::max());
         m_stats.maxMmapsPerFlush = std::max(static_cast<uint>(mmaps),
                                             m_stats.maxMmapsPerFlush);
+        const auto contextSwitches = std::distance(m_contextSwitchBuffer.begin(), contextSwitchIt);
+        Q_ASSERT(contextSwitches >= 0 && contextSwitches < std::numeric_limits<uint>::max());
+        m_stats.maxContextSwitchesPerFlush = std::max(static_cast<uint>(contextSwitches),
+                                                      m_stats.maxContextSwitchesPerFlush);
     }
 
     m_sampleBuffer.erase(m_sampleBuffer.begin(), sampleIt);
     m_mmapBuffer.erase(m_mmapBuffer.begin(), mmapIt);
+    m_contextSwitchBuffer.erase(m_contextSwitchBuffer.begin(), contextSwitchIt);
 
     if (!violatesTimeOrder)
         return;
@@ -946,3 +968,18 @@ void PerfUnwind::flushEventBuffer(uint desiredBufferSize)
     qDebug() << "Increasing buffer size to" << bufferSize;
     setTargetEventBufferSize(bufferSize);
 }
+
+void PerfUnwind::contextSwitch(const PerfRecordContextSwitch& contextSwitch)
+{
+    bufferEvent(contextSwitch, &m_contextSwitchBuffer, &m_stats.numContextSwitchesInRound);
+}
+
+void PerfUnwind::sendContextSwitch(const PerfRecordContextSwitch& contextSwitch)
+{
+    QByteArray buffer;
+    QDataStream(&buffer, QIODevice::WriteOnly) << static_cast<quint8>(ContextSwitchDefinition)
+                                               << contextSwitch.pid() << contextSwitch.tid()
+                                               << contextSwitch.time()
+                                               << bool(contextSwitch.misc() & PERF_RECORD_MISC_SWITCH_OUT);
+    sendBuffer(buffer);
+}
diff --git a/app/perfunwind.h b/app/perfunwind.h
index 304534d..27e992a 100644
--- a/app/perfunwind.h
+++ b/app/perfunwind.h
@@ -60,6 +60,7 @@ public:
         TracePointFormat,
         TracePointSample,
         AttributesDefinition,
+        ContextSwitchDefinition,
         InvalidType
     };
 
@@ -107,9 +108,9 @@ public:
         Stats()
             : numSamples(0), numMmaps(0), numRounds(0), numBufferFlushes(0),
             numTimeViolatingSamples(0), numTimeViolatingMmaps(0),
-            numSamplesInRound(0), numMmapsInRound(0),
-            maxSamplesPerRound(0), maxMmapsPerRound(0),
-            maxSamplesPerFlush(0), maxMmapsPerFlush(0),
+            numSamplesInRound(0), numMmapsInRound(0), numContextSwitchesInRound(0),
+            maxSamplesPerRound(0), maxMmapsPerRound(0), maxContextSwitchesPerRound(0),
+            maxSamplesPerFlush(0), maxMmapsPerFlush(0), maxContextSwitchesPerFlush(0),
             maxBufferSize(0), maxTotalEventSizePerRound(0),
             maxTime(0), maxTimeBetweenRounds(0), maxReorderTime(0),
             lastRoundTime(0), totalEventSizePerRound(0),
@@ -127,10 +128,13 @@ public:
         quint64 numTimeViolatingMmaps;
         uint numSamplesInRound;
         uint numMmapsInRound;
+        uint numContextSwitchesInRound;
         uint maxSamplesPerRound;
         uint maxMmapsPerRound;
+        uint maxContextSwitchesPerRound;
         uint maxSamplesPerFlush;
         uint maxMmapsPerFlush;
+        uint maxContextSwitchesPerFlush;
         uint maxBufferSize;
         uint maxTotalEventSizePerRound;
         quint64 maxTime;
@@ -182,6 +186,7 @@ public:
     void features(const PerfFeatures &features);
     void tracing(const PerfTracingData &tracingData);
     void finishedRound();
+    void contextSwitch(const PerfRecordContextSwitch &contextSwitch);
 
     bool ipIsInKernelSpace(quint64 ip) const;
     void sample(const PerfRecordSample &sample);
@@ -273,6 +278,7 @@ private:
     QMultiMap<quint64, QByteArray> m_auxBuffer;
     QList<PerfRecordSample> m_sampleBuffer;
     QList<PerfRecordMmap> m_mmapBuffer;
+    QList<PerfRecordContextSwitch> m_contextSwitchBuffer;
     QHash<qint32, PerfSymbolTable *> m_symbolTables;
     PerfKallsyms m_kallsyms;
     PerfTracingData m_tracingData;
@@ -305,6 +311,7 @@ private:
     void sendSymbol(qint32 id, const Symbol &symbol);
     void sendAttributes(qint32 id, const PerfEventAttributes &attributes, const QByteArray &name);
     void sendEventFormat(qint32 id, const EventFormat &format);
+    void sendContextSwitch(const PerfRecordContextSwitch &contextSwitch);
 
     template<typename Event>
     void bufferEvent(const Event &event, QList<Event> *buffer, uint *eventCounter);
author	Milian Wolff <milian.wolff@kdab.com>	2017-08-06 18:39:49 +0200
committer	Ulf Hermann <ulf.hermann@qt.io>	2019-05-02 14:39:16 +0000
commit	6a41c2d90a43e1406f6ac27cacc1e5f6affe7108 (patch)
tree	c7674afc0aa28cbf1036bd641403ebcd589987ef
parent	f528071fa3b392e4e29f500746f799771939b96d (diff)