Skip to content

Commit

Permalink
cpu: subtract sample time from parent call stack (#72)
Browse files Browse the repository at this point in the history
We misinterpreted the way pprof uses the sample time, and the test
programs we used had too shallow or too similar stack traces to
highlight the issue.

Too much time was accounted in `main` because we didn't subtract the
time spent in sub-stacks. pprof also miscalculated the total profile
time, reporting 45s on a test that lasted for ~4s.

With this fix, the profile looks a lot more accurate, and the total
profile time also closely matches the test runtime.

```
Benchmark_speedtest1 	      20	 234045385 ns/op	 1164882 B/op	     355 allocs/op
PASS
-- Speedtest1 for SQLite 3.41.2 2023-03-22 11:56:21 0d1fc92f94cb6b76bffe3ec34d69
 100 - 10000 INSERTs into table with no index......................    0.073s
 110 - 10000 ordered INSERTS with one index/PK.....................    0.120s
 120 - 10000 unordered INSERTS with one index/PK...................    0.128s
 130 - 25 SELECTS, numeric BETWEEN, unindexed......................    0.091s
 140 - 10 SELECTS, LIKE, unindexed.................................    0.148s
 142 - 10 SELECTS w/ORDER BY, unindexed............................    0.204s
 145 - 10 SELECTS w/ORDER BY and LIMIT, unindexed..................    0.129s
 150 - CREATE INDEX five times.....................................    0.169s
 160 - 2000 SELECTS, numeric BETWEEN, indexed......................    0.079s
 161 - 2000 SELECTS, numeric BETWEEN, PK...........................    0.081s
 170 - 2000 SELECTS, text BETWEEN, indexed.........................    0.175s
 180 - 10000 INSERTS with three indexes............................    0.160s
 190 - DELETE and REFILL one table.................................    0.162s
 200 - VACUUM......................................................    0.145s
 210 - ALTER TABLE ADD COLUMN, and query...........................    0.008s
 230 - 2000 UPDATES, numeric BETWEEN, indexed......................    0.048s
 240 - 10000 UPDATES of individual rows............................    0.094s
 250 - One big UPDATE of the whole 10000-row table.................    0.022s
 260 - Query added column after filling............................    0.007s
 270 - 2000 DELETEs, numeric BETWEEN, indexed......................    0.104s
 280 - 10000 DELETEs of individual rows............................    0.096s
 290 - Refill two 10000-row tables using REPLACE...................    0.428s
 300 - Refill a 10000-row table using (b&1)==(a&1).................    0.180s
 310 - 2000 four-ways joins........................................    0.278s
 320 - subquery in result set......................................    0.262s
 400 - 14000 REPLACE ops on an IPK.................................    0.143s
 410 - 14000 SELECTS on an IPK.....................................    0.105s
 500 - 14000 REPLACE on TEXT PK....................................    0.167s
 510 - 14000 SELECTS on a TEXT PK..................................    0.188s
 520 - 14000 SELECT DISTINCT.......................................    0.137s
 980 - PRAGMA integrity_check......................................    0.484s
 990 - ANALYZE.....................................................    0.065s
       TOTAL.......................................................    4.680s
ok  	github.com/ncruces/go-sqlite3/internal/vfs/tests/speedtest1	5.405s
```

Fixes #71

Signed-off-by: Achille Roussel <achille.roussel@gmail.com>
  • Loading branch information
achille-roussel authored May 12, 2023
1 parent 15bc886 commit 10325f4
Show file tree
Hide file tree
Showing 3 changed files with 113 additions and 30 deletions.
13 changes: 7 additions & 6 deletions cpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ func TimeFunc(time func() int64) CPUProfilerOption {

type cpuTimeFrame struct {
start int64
sub int64
trace stackTrace
}

Expand Down Expand Up @@ -99,11 +100,6 @@ func (p *CPUProfiler) StopProfile(sampleRate float64, symbols Symbolizer) *profi
for k, sample := range samples {
if sample.stack.host() {
delete(samples, k)
for _, other := range samples {
if sample.stack.contains(other.stack) {
other.subtract(sample.total())
}
}
}
}
}
Expand Down Expand Up @@ -220,9 +216,14 @@ func (p cpuProfiler) After(ctx context.Context, mod api.Module, def api.Function
p.frames = p.frames[:i]

if f.start != 0 {
duration := p.time() - f.start
if i := len(p.frames); i > 0 {
p.frames[i-1].sub += duration
}
duration -= f.sub
p.mutex.Lock()
if p.counts != nil {
p.counts.observe(f.trace, p.time()-f.start)
p.counts.observe(f.trace, duration)
}
p.mutex.Unlock()
p.traces = append(p.traces, f.trace)
Expand Down
102 changes: 102 additions & 0 deletions cpu_test.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
package wzprof

import (
"context"
"testing"

"github.com/tetratelabs/wazero/api"
"github.com/tetratelabs/wazero/experimental"
"github.com/tetratelabs/wazero/experimental/wazerotest"
)

func BenchmarkCPUProfilerOn(b *testing.B) {
Expand All @@ -14,3 +19,100 @@ func BenchmarkCPUProfilerOff(b *testing.B) {
p := NewCPUProfiler()
benchmarkFunctionListener(b, p)
}

func TestCPUProfilerTime(t *testing.T) {
currentTime := int64(0)

p := NewCPUProfiler(
TimeFunc(func() int64 { return currentTime }),
)

module := wazerotest.NewModule(nil,
wazerotest.NewFunction(func(context.Context, api.Module) {}),
wazerotest.NewFunction(func(context.Context, api.Module) {}),
wazerotest.NewFunction(func(context.Context, api.Module) {}),
)

f0 := p.NewFunctionListener(module.Function(0).Definition())
f1 := p.NewFunctionListener(module.Function(1).Definition())
f2 := p.NewFunctionListener(module.Function(2).Definition())

stack0 := []experimental.StackFrame{
{Function: module.Function(0)},
}

stack1 := []experimental.StackFrame{
{Function: module.Function(0)},
{Function: module.Function(1)},
}

stack2 := []experimental.StackFrame{
{Function: module.Function(0)},
{Function: module.Function(1)},
{Function: module.Function(2)},
}

def0 := stack0[0].Function.Definition()
def1 := stack1[1].Function.Definition()
def2 := stack2[2].Function.Definition()

ctx := context.Background()

const (
t0 int64 = 1
t1 int64 = 10
t2 int64 = 42
t3 int64 = 100
t4 int64 = 101
t5 int64 = 102
)

p.StartProfile()

currentTime = t0
f0.Before(ctx, module, def0, nil, experimental.NewStackIterator(stack0...))

currentTime = t1
f1.Before(ctx, module, def1, nil, experimental.NewStackIterator(stack1...))

currentTime = t2
f2.Before(ctx, module, def2, nil, experimental.NewStackIterator(stack2...))

currentTime = t3
f2.After(ctx, module, def2, nil)

currentTime = t4
f1.After(ctx, module, def1, nil)

currentTime = t5
f0.After(ctx, module, def0, nil)

trace0 := makeStackTraceFromFrames(stack0)
trace1 := makeStackTraceFromFrames(stack1)
trace2 := makeStackTraceFromFrames(stack2)

d2 := t3 - t2
d1 := t4 - (t1 + d2)
d0 := t5 - (t0 + d1 + d2)

assertStackCount(t, p.counts, trace0, 1, d0)
assertStackCount(t, p.counts, trace1, 1, d1)
assertStackCount(t, p.counts, trace2, 1, d2)
}

func assertStackCount(t *testing.T, counts stackCounterMap, trace stackTrace, count, total int64) {
t.Helper()
c := counts.lookup(trace)

if c.count() != count {
t.Errorf("%sstack count mismatch: want=%d got=%d", trace, count, c.count())
}

if c.total() != total {
t.Errorf("%sstack total mismatch: want=%d got=%d", trace, total, c.total())
}
}

func makeStackTraceFromFrames(stackFrames []experimental.StackFrame) stackTrace {
return makeStackTrace(stackTrace{}, experimental.NewStackIterator(stackFrames...))
}
28 changes: 4 additions & 24 deletions wzprof.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package wzprof

import (
"bytes"
"fmt"
"hash/maphash"
"net/http"
Expand Down Expand Up @@ -186,14 +185,6 @@ func (sc *stackCounter) total() int64 {
return sc.value[1]
}

func (sc *stackCounter) subtract(value int64) {
if total := sc.total(); total < value {
sc.value[1] = 0
} else {
sc.value[1] -= value
}
}

func (sc *stackCounter) sampleLocation() stackTrace {
return sc.stack
}
Expand All @@ -202,6 +193,10 @@ func (sc *stackCounter) sampleValue() []int64 {
return sc.value[:]
}

func (sc *stackCounter) String() string {
return fmt.Sprintf("{count:%d,total:%d}", sc.count(), sc.total())
}

// Compile-time check that program counters are uint64 values.
var _ = assertTypeIsUint64[experimental.ProgramCounter]()

Expand Down Expand Up @@ -235,21 +230,6 @@ func (st stackTrace) host() bool {
return len(st.fns) > 0 && st.fns[0].Definition().GoFunction() != nil
}

func (st stackTrace) contains(sx stackTrace) bool {
if len(st.fns) < len(sx.fns) {
return false
}
n := len(st.fns) - len(sx.fns)
st.fns = st.fns[n:]
st.pcs = st.pcs[n:]
if st.fns[0] != sx.fns[0] {
return false
}
st.pcs = st.pcs[1:]
sx.pcs = sx.pcs[1:]
return bytes.Equal(st.bytes(), sx.bytes())
}

func (st stackTrace) len() int {
return len(st.pcs)
}
Expand Down

0 comments on commit 10325f4

Please sign in to comment.