From 10325f440d0596eb7b301275fd85844f752752ef Mon Sep 17 00:00:00 2001 From: Achille Date: Fri, 12 May 2023 12:54:14 -0700 Subject: [PATCH] cpu: subtract sample time from parent call stack (#72) We misinterpreted the way pprof uses the sample time, and the test programs we used had too shallow or too similar stack traces to highlight the issue. Too much time was accounted in `main` because we didn't subtract the time spent in sub-stacks. pprof also miscalculated the total profile time, reporting 45s on a test that lasted for ~4s. With this fix, the profile looks a lot more accurate, and the total profile time also closely matches the test runtime. ``` Benchmark_speedtest1 20 234045385 ns/op 1164882 B/op 355 allocs/op PASS -- Speedtest1 for SQLite 3.41.2 2023-03-22 11:56:21 0d1fc92f94cb6b76bffe3ec34d69 100 - 10000 INSERTs into table with no index...................... 0.073s 110 - 10000 ordered INSERTS with one index/PK..................... 0.120s 120 - 10000 unordered INSERTS with one index/PK................... 0.128s 130 - 25 SELECTS, numeric BETWEEN, unindexed...................... 0.091s 140 - 10 SELECTS, LIKE, unindexed................................. 0.148s 142 - 10 SELECTS w/ORDER BY, unindexed............................ 0.204s 145 - 10 SELECTS w/ORDER BY and LIMIT, unindexed.................. 0.129s 150 - CREATE INDEX five times..................................... 0.169s 160 - 2000 SELECTS, numeric BETWEEN, indexed...................... 0.079s 161 - 2000 SELECTS, numeric BETWEEN, PK........................... 0.081s 170 - 2000 SELECTS, text BETWEEN, indexed......................... 0.175s 180 - 10000 INSERTS with three indexes............................ 0.160s 190 - DELETE and REFILL one table................................. 0.162s 200 - VACUUM...................................................... 0.145s 210 - ALTER TABLE ADD COLUMN, and query........................... 0.008s 230 - 2000 UPDATES, numeric BETWEEN, indexed...................... 0.048s 240 - 10000 UPDATES of individual rows............................ 0.094s 250 - One big UPDATE of the whole 10000-row table................. 0.022s 260 - Query added column after filling............................ 0.007s 270 - 2000 DELETEs, numeric BETWEEN, indexed...................... 0.104s 280 - 10000 DELETEs of individual rows............................ 0.096s 290 - Refill two 10000-row tables using REPLACE................... 0.428s 300 - Refill a 10000-row table using (b&1)==(a&1)................. 0.180s 310 - 2000 four-ways joins........................................ 0.278s 320 - subquery in result set...................................... 0.262s 400 - 14000 REPLACE ops on an IPK................................. 0.143s 410 - 14000 SELECTS on an IPK..................................... 0.105s 500 - 14000 REPLACE on TEXT PK.................................... 0.167s 510 - 14000 SELECTS on a TEXT PK.................................. 0.188s 520 - 14000 SELECT DISTINCT....................................... 0.137s 980 - PRAGMA integrity_check...................................... 0.484s 990 - ANALYZE..................................................... 0.065s TOTAL....................................................... 4.680s ok github.com/ncruces/go-sqlite3/internal/vfs/tests/speedtest1 5.405s ``` Fixes #71 Signed-off-by: Achille Roussel --- cpu.go | 13 +++---- cpu_test.go | 102 ++++++++++++++++++++++++++++++++++++++++++++++++++++ wzprof.go | 28 +++------------ 3 files changed, 113 insertions(+), 30 deletions(-) diff --git a/cpu.go b/cpu.go index a1912ad..195cecd 100644 --- a/cpu.go +++ b/cpu.go @@ -50,6 +50,7 @@ func TimeFunc(time func() int64) CPUProfilerOption { type cpuTimeFrame struct { start int64 + sub int64 trace stackTrace } @@ -99,11 +100,6 @@ func (p *CPUProfiler) StopProfile(sampleRate float64, symbols Symbolizer) *profi for k, sample := range samples { if sample.stack.host() { delete(samples, k) - for _, other := range samples { - if sample.stack.contains(other.stack) { - other.subtract(sample.total()) - } - } } } } @@ -220,9 +216,14 @@ func (p cpuProfiler) After(ctx context.Context, mod api.Module, def api.Function p.frames = p.frames[:i] if f.start != 0 { + duration := p.time() - f.start + if i := len(p.frames); i > 0 { + p.frames[i-1].sub += duration + } + duration -= f.sub p.mutex.Lock() if p.counts != nil { - p.counts.observe(f.trace, p.time()-f.start) + p.counts.observe(f.trace, duration) } p.mutex.Unlock() p.traces = append(p.traces, f.trace) diff --git a/cpu_test.go b/cpu_test.go index 684a9cb..5c2c35d 100644 --- a/cpu_test.go +++ b/cpu_test.go @@ -1,7 +1,12 @@ package wzprof import ( + "context" "testing" + + "github.com/tetratelabs/wazero/api" + "github.com/tetratelabs/wazero/experimental" + "github.com/tetratelabs/wazero/experimental/wazerotest" ) func BenchmarkCPUProfilerOn(b *testing.B) { @@ -14,3 +19,100 @@ func BenchmarkCPUProfilerOff(b *testing.B) { p := NewCPUProfiler() benchmarkFunctionListener(b, p) } + +func TestCPUProfilerTime(t *testing.T) { + currentTime := int64(0) + + p := NewCPUProfiler( + TimeFunc(func() int64 { return currentTime }), + ) + + module := wazerotest.NewModule(nil, + wazerotest.NewFunction(func(context.Context, api.Module) {}), + wazerotest.NewFunction(func(context.Context, api.Module) {}), + wazerotest.NewFunction(func(context.Context, api.Module) {}), + ) + + f0 := p.NewFunctionListener(module.Function(0).Definition()) + f1 := p.NewFunctionListener(module.Function(1).Definition()) + f2 := p.NewFunctionListener(module.Function(2).Definition()) + + stack0 := []experimental.StackFrame{ + {Function: module.Function(0)}, + } + + stack1 := []experimental.StackFrame{ + {Function: module.Function(0)}, + {Function: module.Function(1)}, + } + + stack2 := []experimental.StackFrame{ + {Function: module.Function(0)}, + {Function: module.Function(1)}, + {Function: module.Function(2)}, + } + + def0 := stack0[0].Function.Definition() + def1 := stack1[1].Function.Definition() + def2 := stack2[2].Function.Definition() + + ctx := context.Background() + + const ( + t0 int64 = 1 + t1 int64 = 10 + t2 int64 = 42 + t3 int64 = 100 + t4 int64 = 101 + t5 int64 = 102 + ) + + p.StartProfile() + + currentTime = t0 + f0.Before(ctx, module, def0, nil, experimental.NewStackIterator(stack0...)) + + currentTime = t1 + f1.Before(ctx, module, def1, nil, experimental.NewStackIterator(stack1...)) + + currentTime = t2 + f2.Before(ctx, module, def2, nil, experimental.NewStackIterator(stack2...)) + + currentTime = t3 + f2.After(ctx, module, def2, nil) + + currentTime = t4 + f1.After(ctx, module, def1, nil) + + currentTime = t5 + f0.After(ctx, module, def0, nil) + + trace0 := makeStackTraceFromFrames(stack0) + trace1 := makeStackTraceFromFrames(stack1) + trace2 := makeStackTraceFromFrames(stack2) + + d2 := t3 - t2 + d1 := t4 - (t1 + d2) + d0 := t5 - (t0 + d1 + d2) + + assertStackCount(t, p.counts, trace0, 1, d0) + assertStackCount(t, p.counts, trace1, 1, d1) + assertStackCount(t, p.counts, trace2, 1, d2) +} + +func assertStackCount(t *testing.T, counts stackCounterMap, trace stackTrace, count, total int64) { + t.Helper() + c := counts.lookup(trace) + + if c.count() != count { + t.Errorf("%sstack count mismatch: want=%d got=%d", trace, count, c.count()) + } + + if c.total() != total { + t.Errorf("%sstack total mismatch: want=%d got=%d", trace, total, c.total()) + } +} + +func makeStackTraceFromFrames(stackFrames []experimental.StackFrame) stackTrace { + return makeStackTrace(stackTrace{}, experimental.NewStackIterator(stackFrames...)) +} diff --git a/wzprof.go b/wzprof.go index 71c4cc9..49a0a9a 100644 --- a/wzprof.go +++ b/wzprof.go @@ -1,7 +1,6 @@ package wzprof import ( - "bytes" "fmt" "hash/maphash" "net/http" @@ -186,14 +185,6 @@ func (sc *stackCounter) total() int64 { return sc.value[1] } -func (sc *stackCounter) subtract(value int64) { - if total := sc.total(); total < value { - sc.value[1] = 0 - } else { - sc.value[1] -= value - } -} - func (sc *stackCounter) sampleLocation() stackTrace { return sc.stack } @@ -202,6 +193,10 @@ func (sc *stackCounter) sampleValue() []int64 { return sc.value[:] } +func (sc *stackCounter) String() string { + return fmt.Sprintf("{count:%d,total:%d}", sc.count(), sc.total()) +} + // Compile-time check that program counters are uint64 values. var _ = assertTypeIsUint64[experimental.ProgramCounter]() @@ -235,21 +230,6 @@ func (st stackTrace) host() bool { return len(st.fns) > 0 && st.fns[0].Definition().GoFunction() != nil } -func (st stackTrace) contains(sx stackTrace) bool { - if len(st.fns) < len(sx.fns) { - return false - } - n := len(st.fns) - len(sx.fns) - st.fns = st.fns[n:] - st.pcs = st.pcs[n:] - if st.fns[0] != sx.fns[0] { - return false - } - st.pcs = st.pcs[1:] - sx.pcs = sx.pcs[1:] - return bytes.Equal(st.bytes(), sx.bytes()) -} - func (st stackTrace) len() int { return len(st.pcs) }