-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathmulti_cpu.cpp
114 lines (95 loc) · 4.06 KB
/
multi_cpu.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#include "access_benchmark.h"
#include "perfcpp/event_counter.h"
#include <atomic>
#include <cstdint>
#include <iostream>
#include <numeric>
#include <stdexcept>
#include <thread>
#include <utility>
#include <vector>
int
main()
{
std::cout << "libperf-cpp example: Record performance counter for "
"random access to an in-memory array on all CPU cores."
<< std::endl;
std::cout << "We will record the counters per (logical) CPU core and merge the results "
"afterwards."
<< std::endl;
/// Create a list of cpus to record performance counters on (all available, in this example).
auto cpus_to_watch = std::vector<std::uint16_t>(std::thread::hardware_concurrency());
std::iota(cpus_to_watch.begin(), cpus_to_watch.end(), 0U);
std::cout << "Creating counters for CPUs: ";
for (auto cpu : cpus_to_watch) {
std::cout << std::int32_t(cpu) << " ";
}
std::cout << std::endl;
/// Initialize performance counters.
/// Note that the perf::CounterDefinition holds all counter names and must be
/// alive until the benchmark finishes.
auto counter_definitions = perf::CounterDefinition{};
auto multi_cpu_event_counter = perf::MultiCoreEventCounter{ counter_definitions, std::move(cpus_to_watch) };
/// Add all the performance counters we want to record.
try {
multi_cpu_event_counter.add({ "instructions",
"cycles",
"branches",
"cache-misses",
"dTLB-miss-ratio",
"L1-data-miss-ratio",
"cycles-per-instruction" });
} catch (std::runtime_error& e) {
std::cerr << e.what() << std::endl;
return 1;
}
/// Create random access benchmark.
auto benchmark = perf::example::AccessBenchmark{ /*randomize the accesses*/ true,
/* create benchmark of 1024 MB */ 1024U };
/// One event_counter instance for every thread.
constexpr auto count_threads = 2U;
const auto items_per_thread = benchmark.size() / count_threads;
auto threads = std::vector<std::thread>{};
auto thread_local_results = std::vector<std::uint64_t>(2U, 0U); /// Array to store the thread-local results.
/// Barrier for the threads to wait.
auto thread_barrier = std::atomic<bool>{ false };
for (auto thread_index = 0U; thread_index < count_threads; ++thread_index) {
threads.emplace_back([thread_index, items_per_thread, &thread_local_results, &benchmark, &thread_barrier]() {
auto local_value = 0ULL;
/// Wait for the barrier to become "true", i.e., all threads are spawned.
while (!thread_barrier)
;
/// Process the data.
for (auto index = 0U; index < items_per_thread; ++index) {
local_value += benchmark[(thread_index * items_per_thread) + index].value;
}
thread_local_results[thread_index] = local_value;
});
}
/// Start recording performance counter.
/// In contrast to the inherit-thread example (see inherit_thread.cpp), we
/// will record the performance counters on each logical CPU core.
try {
multi_cpu_event_counter.start();
} catch (std::runtime_error& exception) {
std::cerr << exception.what() << std::endl;
return 1;
}
/// Let threads start.
thread_barrier = true;
/// Wait for all threads to finish.
for (auto& thread : threads) {
thread.join();
}
/// Stop performance counter recording.
multi_cpu_event_counter.stop();
/// Add up the results so that the compiler does not get the idea of
/// optimizing away the accesses.
auto value = std::accumulate(thread_local_results.begin(), thread_local_results.end(), 0UL);
asm volatile("" : "+r,m"(value) : : "memory");
/// Get the result (normalized per cache line) from the
/// multithread_event_counter.
auto result = multi_cpu_event_counter.result(benchmark.size());
/// Print the performance counters.
std::cout << "\nResults:\n";
for (const auto& [counter_name, counter_value] : result) {
std::cout << counter_value << " " << counter_name << " / cache line" << std::endl;
}
return 0;
}