From 60b3aa78fbe280b54ee1a38142c8e90c6d50bef7 Mon Sep 17 00:00:00 2001 From: Bill Chen Date: Wed, 31 Jul 2024 17:19:24 +0800 Subject: [PATCH] Use inline asm for more accurate perf collection. Added flag parameter in collect_scope_start/stop to control what threads to collect. Signed-off-by: Bill Chen --- collectors/perf.cpp | 141 ++++++++++++++++++++++++++++++++------------ collectors/perf.hpp | 16 ++++- interface.cpp | 8 +-- interface.hpp | 8 +-- test.cpp | 27 ++++++--- 5 files changed, 145 insertions(+), 55 deletions(-) diff --git a/collectors/perf.cpp b/collectors/perf.cpp index b9a4351..487bc36 100644 --- a/collectors/perf.cpp +++ b/collectors/perf.cpp @@ -443,59 +443,99 @@ bool PerfCollector::collect(int64_t now) return true; } -bool PerfCollector::collect_scope_start(int64_t now, uint16_t func_id) { +bool PerfCollector::collect_scope_start(int64_t now, uint16_t func_id, int32_t flags) { if (!mCollecting) return false; struct snapshot snap; - for (perf_thread& t : mReplayThreads) + if (flags & COLLECT_REPLAY_THREADS || flags & COLLECT_ALL_THREADS) { - t.eventCtx.collect_scope(now, func_id, false); + for (perf_thread &t : mReplayThreads) + { + t.eventCtx.collect_scope(now, func_id, false); + } } - for (perf_thread& t : mBgThreads) + if (flags & COLLECT_BG_THREADS || flags & COLLECT_ALL_THREADS) { - t.eventCtx.collect_scope(now, func_id, false); + for (perf_thread &t : mBgThreads) + { + t.eventCtx.collect_scope(now, func_id, false); + } } - for (perf_thread& t : mMultiPMUThreads) + if (flags & COLLECT_MULTI_PMU_THREADS || flags & COLLECT_ALL_THREADS) { - t.eventCtx.collect_scope(now, func_id, false); + for (perf_thread &t : mMultiPMUThreads) + { + t.eventCtx.collect_scope(now, func_id, false); + } } - for (perf_thread& t : mBookerThread) + if (flags & COLLECT_BOOKER_THREADS || flags & COLLECT_ALL_THREADS) { - t.eventCtx.collect_scope(now, func_id, false); + for (perf_thread &t : mBookerThread) + { + t.eventCtx.collect_scope(now, func_id, false); + } } - for (perf_thread& t : mCSPMUThreads) + if (flags & COLLECT_CSPMU_THREADS || flags & COLLECT_ALL_THREADS) { - t.eventCtx.collect_scope(now, func_id, false); + for (perf_thread &t : mCSPMUThreads) + { + t.eventCtx.collect_scope(now, func_id, false); + } } + last_collect_scope_flags = flags; return true; } -bool PerfCollector::collect_scope_stop(int64_t now, uint16_t func_id) { +bool PerfCollector::collect_scope_stop(int64_t now, uint16_t func_id, int32_t flags) { if (!mCollecting) return false; + if (last_collect_scope_flags != flags) { + DBG_LOG("Error: Could not find the corresponding collect_scope_start call for func_id %ud.\n", func_id); + return false; + } struct snapshot snap_start, snap_stop; - for (perf_thread &t : mReplayThreads) { - snap_start = t.eventCtx.last_snap; - snap_stop = t.eventCtx.collect_scope(now, func_id, true); - t.update_data_scope(func_id, snap_start, snap_stop); - } - for (perf_thread &t : mBgThreads) { - snap_start = t.eventCtx.last_snap; - snap_stop = t.eventCtx.collect_scope(now, func_id, true); - t.update_data_scope(func_id, snap_start, snap_stop); - } - for (perf_thread &t : mMultiPMUThreads) { - snap_start = t.eventCtx.last_snap; - snap_stop = t.eventCtx.collect_scope(now, func_id, true); - t.update_data_scope(func_id, snap_start, snap_stop); - } - for (perf_thread &t : mBookerThread) { - snap_start = t.eventCtx.last_snap; - snap_stop = t.eventCtx.collect_scope(now, func_id, true); - t.update_data_scope(func_id, snap_start, snap_stop); - } - for (perf_thread &t : mCSPMUThreads) { - snap_start = t.eventCtx.last_snap; - snap_stop = t.eventCtx.collect_scope(now, func_id, true); - t.update_data_scope(func_id, snap_start, snap_stop); + if (flags & COLLECT_REPLAY_THREADS || flags & COLLECT_ALL_THREADS) + { + for (perf_thread &t : mReplayThreads) + { + snap_start = t.eventCtx.last_snap; + snap_stop = t.eventCtx.collect_scope(now, func_id, true); + t.update_data_scope(func_id, snap_start, snap_stop); + } + } + if (flags & COLLECT_BG_THREADS || flags & COLLECT_ALL_THREADS) + { + for (perf_thread &t : mBgThreads) + { + snap_start = t.eventCtx.last_snap; + snap_stop = t.eventCtx.collect_scope(now, func_id, true); + t.update_data_scope(func_id, snap_start, snap_stop); + } + } + if (flags & COLLECT_MULTI_PMU_THREADS || flags & COLLECT_ALL_THREADS) + { + for (perf_thread &t : mMultiPMUThreads) + { + snap_start = t.eventCtx.last_snap; + snap_stop = t.eventCtx.collect_scope(now, func_id, true); + t.update_data_scope(func_id, snap_start, snap_stop); + } + } + if (flags & COLLECT_BOOKER_THREADS || flags & COLLECT_ALL_THREADS) + { + for (perf_thread &t : mBookerThread) + { + snap_start = t.eventCtx.last_snap; + snap_stop = t.eventCtx.collect_scope(now, func_id, true); + t.update_data_scope(func_id, snap_start, snap_stop); + } + } + if (flags & COLLECT_CSPMU_THREADS || flags & COLLECT_ALL_THREADS) + { + for (perf_thread &t : mCSPMUThreads) + { + snap_start = t.eventCtx.last_snap; + snap_stop = t.eventCtx.collect_scope(now, func_id, true); + t.update_data_scope(func_id, snap_start, snap_stop); + } } return false; } @@ -707,7 +747,23 @@ struct snapshot event_context::collect(int64_t now) return snap; } -struct snapshot event_context::collect_scope(int64_t now, uint16_t func_id, bool stopping) { +struct snapshot event_context::collect_scope(int64_t now, uint16_t func_id, bool stopping) +{ + +#if defined(__aarch64__) + // stop counters for arm64 + uint64_t PMCNTENSET_EL0_safe; + uint64_t PMCR_EL0_safe; + asm volatile("mrs %0, PMCR_EL0" : "=r" (PMCR_EL0_safe)); + asm volatile("msr PMCR_EL0, %0" : : "r" (PMCR_EL0_safe & 0xFFFFFFFFFFFFFFFE)); +#elif defined(__arm__) + // stop counters for arm32 + uint64_t PMCNTENSET_EL0_safe; + uint64_t PMCR_EL0_safe; + asm volatile("mrc p15, 0, %0, c9, c12, 0" : "=r"(PMCR_EL0_safe)); + asm volatile("mcr p15, 0, %0, c9, c12, 0" : : "r"(PMCR_EL0_safe & 0xFFFFFFFE)); +#endif + if (stopping && last_snap_func_id != func_id) { DBG_LOG("Error: Could not find the corresponding collect_scope_start call for func_id %ud.\n", func_id); } @@ -719,6 +775,17 @@ struct snapshot event_context::collect_scope(int64_t now, uint16_t func_id, bool last_snap_func_id = func_id; last_snap = snap; } + +#if defined(__aarch64__) + // start counters for arm64 + asm volatile("msr PMCNTENSET_EL0, %0" : : "r" (PMCNTENSET_EL0_safe)); + asm volatile("msr PMCR_EL0, %0" : : "r" (PMCR_EL0_safe)); +#elif defined(__arm__) + // start counters for arm32 + asm volatile("mcr p15, 0, %0, c9, c12, 1" : : "r"(PMCNTENSET_EL0_safe)); + asm volatile("mcr p15, 0, %0, c9, c12, 0" : : "r"(PMCR_EL0_safe)); +#endif + return snap; } diff --git a/collectors/perf.hpp b/collectors/perf.hpp index dc06d0e..df29639 100644 --- a/collectors/perf.hpp +++ b/collectors/perf.hpp @@ -36,6 +36,17 @@ enum cmn_node_type CMN_TYPE_WP = 0x7770, }; +enum collect_scope_flags: int32_t +{ + COLLECT_NOOP = 0x00, + COLLECT_ALL_THREADS = 0x01, + COLLECT_REPLAY_THREADS = 0x01 << 1, + COLLECT_BG_THREADS = 0x01 << 2, + COLLECT_MULTI_PMU_THREADS = 0x01 << 3, + COLLECT_BOOKER_THREADS = 0x01 << 4, + COLLECT_CSPMU_THREADS = 0x01 << 5, +}; + struct snapshot { snapshot() : size(0) {} @@ -147,8 +158,8 @@ class PerfCollector : public Collector virtual void summarize() override; /// Collector functions for perapi perf instrumentations. - virtual bool collect_scope_start(int64_t now, uint16_t func_id); - virtual bool collect_scope_stop(int64_t now, uint16_t func_id); + virtual bool collect_scope_start(int64_t now, uint16_t func_id, int32_t flags); + virtual bool collect_scope_stop(int64_t now, uint16_t func_id, int32_t flags); private: void create_perf_thread(); @@ -163,6 +174,7 @@ class PerfCollector : public Collector std::map> mMultiPMUEvents; std::map> mCSPMUEvents; std::map> mClocks; // device_name -> clock_vector + int last_collect_scope_flags = 0; struct perf_thread { diff --git a/interface.cpp b/interface.cpp index 3fcbdb2..6151e13 100644 --- a/interface.cpp +++ b/interface.cpp @@ -450,20 +450,20 @@ void Collection::collect(std::vector custom) } } -void Collection::collect_scope_start(uint16_t label) { +void Collection::collect_scope_start(uint16_t label, int32_t flags) { const int64_t now = getTime(); mScopeStartTime = now; for (Collector* c : mRunning) { if (!c->isThreaded()) { - c->collect_scope_start(now, label); + c->collect_scope_start(now, label, flags); } } mScopeStarted = true; } -void Collection::collect_scope_stop(uint16_t label) { +void Collection::collect_scope_stop(uint16_t label, int32_t flags) { // A collect_scope_start and collect_scope_end pair is considered as one sample. if (!mScopeStarted) { DBG_LOG("WARNING: collect_scope_stop called without a corresponding collect_scope_start.\n"); @@ -476,7 +476,7 @@ void Collection::collect_scope_stop(uint16_t label) { { if (!c->isThreaded()) { - c->collect_scope_stop(now, label); + c->collect_scope_stop(now, label, flags); } } mScopeStarted = false; diff --git a/interface.hpp b/interface.hpp index 573bf4d..dce31cb 100644 --- a/interface.hpp +++ b/interface.hpp @@ -90,8 +90,8 @@ class Collector virtual bool stop() { mCollecting = false; return true; } virtual bool postprocess(const std::vector& timing); virtual bool collect( int64_t ) = 0; - virtual bool collect_scope_start( int64_t now, uint16_t func_id) {return true; }; - virtual bool collect_scope_stop( int64_t now, uint16_t func_id) { return true; }; + virtual bool collect_scope_start( int64_t now, uint16_t func_id, int flags ) {return true; }; + virtual bool collect_scope_stop( int64_t now, uint16_t func_id, int flags ) { return true; }; virtual bool collecting() const { return mCollecting; } virtual const std::string& name() const { return mName; } virtual bool available() = 0; @@ -256,11 +256,11 @@ class Collection /// Sample periodical data for per API instrumentation. Call this method before the payload /// execution. Currently only used for perf collector. - void collect_scope_start(uint16_t label); + void collect_scope_start(uint16_t label, int32_t flags); /// Sample periodical data for per API instrumentation. Call this method after the payload /// execution. Currently only used for perf collector. - void collect_scope_stop(uint16_t label); + void collect_scope_stop(uint16_t label, int32_t flags); /// Get the results as JSON Json::Value results(); diff --git a/test.cpp b/test.cpp index f4e86b2..dc4220d 100644 --- a/test.cpp +++ b/test.cpp @@ -1,4 +1,5 @@ #include "interface.hpp" +#include "collectors/perf.hpp" #include #include @@ -6,6 +7,7 @@ #include #include #include +#include #include #include "json/writer.h" @@ -267,6 +269,10 @@ class Test8 { Test8() : test8_ready(false) {} + ~Test8() { + delete c; + } + void run() { printf("[test 8]: Testing collect_scope for the perf collector...\n"); std::vector threads; @@ -342,14 +348,18 @@ class Test8 { tmp *= rand(); }; - c->collect_scope_start(0 + scope_label_offset); - payload(10); - c->collect_scope_stop(0 + scope_label_offset); - c->collect_scope_start(5 + scope_label_offset); - payload(1000); - c->collect_scope_stop(5 + scope_label_offset); + if (strncmp(thread_name.c_str(), "patrace", 7) == 0) { + c->collect_scope_start(0 + scope_label_offset, COLLECT_REPLAY_THREADS); + payload(1000); + c->collect_scope_stop(0 + scope_label_offset, COLLECT_REPLAY_THREADS); + } + + if (strncmp(thread_name.c_str(), "mali", 4) == 0) { + c->collect_scope_start(1 + scope_label_offset, COLLECT_BG_THREADS); + payload(1000); + c->collect_scope_stop(1 + scope_label_offset, COLLECT_BG_THREADS); + } printf("Thread %s finished.\n", thread_name.c_str()); - // usleep(1e5); } Collection *c; @@ -369,7 +379,8 @@ int main() test5(); test6(); test7(); // summarized results - (new Test8())->run(); + auto test8 = std::unique_ptr(new Test8()); + test8->run(); printf("ALL DONE!\n"); return 0; }