From 27158e62a19e144a3081be9507c81084b702c58e Mon Sep 17 00:00:00 2001 From: Ivan Yashchuk Date: Thu, 30 May 2024 21:01:42 +0300 Subject: [PATCH] Use PyTorch Benchmarking Timer by default with pytest-benchmark (#491) --- pyproject.toml | 2 ++ requirements/test.txt | 1 + thunder/benchmarks/einsum.py | 2 -- thunder/benchmarks/targets.py | 44 +---------------------------------- 4 files changed, 4 insertions(+), 45 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 98c74249ac..df50fa8503 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,6 +30,8 @@ addopts = [ "--doctest-modules", "--color=yes", "--disable-pytest-warnings", + "--benchmark-timer=torch.utils.benchmark.utils.timer.timer", + "--benchmark-warmup=on", ] markers = [ "standalone: mark a test as standalone", diff --git a/requirements/test.txt b/requirements/test.txt index 5d02771607..b7a3eb3936 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -1,5 +1,6 @@ coverage ==7.5.1 pytest ==8.1.1 +pytest-benchmark ==4.0.0 pytest-timeout ==2.3.1 pytest-cov ==4.1.0 pytest-xdist ==3.6.1 diff --git a/thunder/benchmarks/einsum.py b/thunder/benchmarks/einsum.py index a146a254fd..4063cfef9e 100644 --- a/thunder/benchmarks/einsum.py +++ b/thunder/benchmarks/einsum.py @@ -15,7 +15,6 @@ grad_executors_ids, thunder_gradv1, thunder_torchcompile_gradv1, - wrap_for_benchmark, ) @@ -33,7 +32,6 @@ def _instantiate_benchmark_env( setup = make_setup(bench) fn = executor(bench) - fn = wrap_for_benchmark(fn) return setup, fn diff --git a/thunder/benchmarks/targets.py b/thunder/benchmarks/targets.py index d1f4103dbc..3c34b5152c 100644 --- a/thunder/benchmarks/targets.py +++ b/thunder/benchmarks/targets.py @@ -8,7 +8,7 @@ import os import torch import thunder -from thunder.core.transforms import grad, clear_grads, populate_grads, get_grad, put_grad, put_grads +from thunder.core.transforms import grad, clear_grads, populate_grads from thunder.core.interpreter import interpret from thunder.benchmarks import ( @@ -28,7 +28,6 @@ torch_executor, torch_compile_executor, thunder_executor, - thunder_torch_executor, thunder_torch_compile_executor, thunder_apex_executor, thunder_apex_nvfuser_executor, @@ -36,7 +35,6 @@ thunder_cudnn_nvfuser_executor, thunder_cudnn_layer_norm_executor, thunder_cudnn_layer_norm_nvfuser_executor, - thunder_sdpa_executor, thunder_sdpa_torch_compile_nvfuser_executor, BatchNormBenchmark, ) @@ -69,16 +67,6 @@ def setup(): return setup -def wrap_for_benchmark(fn): - @wraps(fn) - def fn_(*args, **kwargs): - result = fn(*args, **kwargs) - torch.cuda.synchronize() - return result - - return fn_ - - def torch_fwd(b: Benchmark): module = b.fn() fn_ = torch_executor(module) @@ -349,7 +337,6 @@ def test_nanogpt_gelu_fwd(benchmark, executor: Callable): setup = make_setup(gelu_bench) fn = executor(gelu_bench) - fn = wrap_for_benchmark(fn) benchmark.pedantic(fn, setup=setup, rounds=40, warmup_rounds=1) @@ -366,7 +353,6 @@ def test_nanogpt_gelu_grad(benchmark, executor: Callable): setup = make_setup(gelu_bench) fn = executor(gelu_bench) - fn = wrap_for_benchmark(fn) benchmark.pedantic(fn, setup=setup, rounds=40, warmup_rounds=1) @@ -383,7 +369,6 @@ def test_batch_norm_fwd(benchmark, executor: Callable): setup = make_setup(bn_bench) fn = executor(bn_bench) - fn = wrap_for_benchmark(fn) benchmark.pedantic(fn, setup=setup, rounds=40, warmup_rounds=1) @@ -404,7 +389,6 @@ def test_batch_norm_grad(benchmark, executor: Callable): setup = make_setup(bn_bench) fn = executor(bn_bench) - fn = wrap_for_benchmark(fn) benchmark.pedantic(fn, setup=setup, rounds=200, warmup_rounds=20) @@ -426,7 +410,6 @@ def test_nanogpt_cross_entropy_fwd(benchmark, executor: None | Callable): setup = make_setup(bench) fn = executor(bench) - fn = wrap_for_benchmark(fn) benchmark.pedantic(fn, setup=setup, rounds=20, warmup_rounds=1) @@ -449,7 +432,6 @@ def test_nanogpt_cross_entropy_grad(benchmark, executor: None | Callable): setup = make_setup(bench) fn = executor(bench) - fn = wrap_for_benchmark(fn) benchmark.pedantic(fn, setup=setup, rounds=20, warmup_rounds=1) @@ -472,7 +454,6 @@ def test_nanogpt_layer_norm_fwd(benchmark, executor: None | Callable): setup = make_setup(bench) fn = executor(bench) - fn = wrap_for_benchmark(fn) benchmark.pedantic(fn, setup=setup, rounds=20, warmup_rounds=1) @@ -490,7 +471,6 @@ def test_nanogpt_sdpa_fwd(benchmark, executor: None | Callable): setup = make_setup(bench) fn = executor(bench) - fn = wrap_for_benchmark(fn) benchmark.pedantic(fn, setup=setup, rounds=20, warmup_rounds=1) @@ -508,7 +488,6 @@ def test_nanogpt_sdpa_grad(benchmark, executor: Callable): setup = make_setup(bench) fn = executor(bench) - fn = wrap_for_benchmark(fn) benchmark.pedantic(fn, setup=setup, rounds=20, warmup_rounds=1) @@ -525,7 +504,6 @@ def test_llama2_7b_sdpa_grad(benchmark, executor: Callable): setup = make_setup(bench) fn = executor(bench) - fn = wrap_for_benchmark(fn) benchmark.pedantic(fn, setup=setup, rounds=40, warmup_rounds=1) @@ -574,7 +552,6 @@ def test_litgpt_sdpa_grad(benchmark, executor: Callable, bs, config): setup = make_setup(bench) fn = thunder_fwd_bwd(bench, compile_fn=executor) - fn = wrap_for_benchmark(fn) benchmark.pedantic(fn, setup=setup, rounds=40, warmup_rounds=1) @@ -591,7 +568,6 @@ def test_nanogpt_mlp_fwd(benchmark, executor: Callable): setup = make_setup(bench) fn = executor(bench) - fn = wrap_for_benchmark(fn) benchmark.pedantic(fn, setup=setup, rounds=40, warmup_rounds=1) @@ -608,7 +584,6 @@ def test_nanogpt_mlp_grad(benchmark, executor: Callable): setup = make_setup(bench) fn = executor(bench) - fn = wrap_for_benchmark(fn) benchmark.pedantic(fn, setup=setup, rounds=40, warmup_rounds=1) @@ -629,7 +604,6 @@ def test_nanogpt_csa_fwd(benchmark, executor: Callable): setup = make_setup(bench) fn = executor(bench) - fn = wrap_for_benchmark(fn) benchmark.pedantic(fn, setup=setup, rounds=20, warmup_rounds=1) @@ -650,7 +624,6 @@ def test_nanogpt_csa_grad(benchmark, executor: Callable): setup = make_setup(bench) fn = executor(bench) - fn = wrap_for_benchmark(fn) benchmark.pedantic(fn, setup=setup, rounds=20, warmup_rounds=1) @@ -668,7 +641,6 @@ def test_nanogpt_block_fwd(benchmark, executor: Callable): setup = make_setup(bench) fn = executor(bench) - fn = wrap_for_benchmark(fn) benchmark.pedantic(fn, setup=setup, rounds=20, warmup_rounds=1) @@ -686,7 +658,6 @@ def test_nanogpt_block_grad(benchmark, executor: Callable): setup = make_setup(bench) fn = executor(bench) - fn = wrap_for_benchmark(fn) benchmark.pedantic(fn, setup=setup, rounds=20, warmup_rounds=1) @@ -708,7 +679,6 @@ def test_nanogpt_gpt2_fwd(benchmark, executor: Callable): setup = make_setup(bench) fn = executor(bench) - fn = wrap_for_benchmark(fn) benchmark.pedantic(fn, setup=setup, rounds=5, warmup_rounds=1) @@ -730,7 +700,6 @@ def test_nanogpt_gpt2_grad(benchmark, executor: Callable): setup = make_setup(bench) fn = executor(bench) - fn = wrap_for_benchmark(fn) benchmark.pedantic(fn, setup=setup, rounds=5, warmup_rounds=1) @@ -751,7 +720,6 @@ def test_nanogpt_gpt2xl_fwd(benchmark, executor: Callable): setup = make_setup(bench) fn = executor(bench) - fn = wrap_for_benchmark(fn) benchmark.pedantic(fn, setup=setup, rounds=5, warmup_rounds=1) @@ -773,7 +741,6 @@ def test_nanogpt_gpt2xl_grad(benchmark, executor: Callable): setup = make_setup(bench) fn = executor(bench) - fn = wrap_for_benchmark(fn) benchmark.pedantic(fn, setup=setup, rounds=5, warmup_rounds=1) @@ -792,7 +759,6 @@ def test_open_llama_7b_fwd(benchmark, executor: Callable): setup = make_setup(b) fn = executor(b) - fn = wrap_for_benchmark(fn) benchmark.pedantic(fn, setup=setup, rounds=5, warmup_rounds=1) @@ -806,7 +772,6 @@ def test_llama_2_7b_hf_fwd(benchmark, executor: Callable): setup = make_setup(b) fn = executor(b) - fn = wrap_for_benchmark(fn) benchmark.pedantic(fn, setup=setup, rounds=5, warmup_rounds=1) @@ -828,7 +793,6 @@ def test_llama_2_7b_grad(benchmark, executor: Callable): setup = make_setup(b) fn = executor(b) - fn = wrap_for_benchmark(fn) benchmark.pedantic(fn, setup=setup, rounds=20, warmup_rounds=1) @@ -845,7 +809,6 @@ def test_llama2_mlp_7b_grad(benchmark, executor: Callable): setup = make_setup(bench) fn = executor(bench) - fn = wrap_for_benchmark(fn) benchmark.pedantic(fn, setup=setup, rounds=40, warmup_rounds=1) @@ -862,7 +825,6 @@ def test_llama2_causal_self_attention_7b_grad(benchmark, executor: Callable): setup = make_setup(bench) fn = executor(bench) - fn = wrap_for_benchmark(fn) benchmark.pedantic(fn, setup=setup, rounds=40, warmup_rounds=1) @@ -879,7 +841,6 @@ def test_llama2_7b_rmsnorm_grad(benchmark, executor: Callable): setup = make_setup(bench) fn = executor(bench) - fn = wrap_for_benchmark(fn) benchmark.pedantic(fn, setup=setup, rounds=40, warmup_rounds=1) @@ -968,7 +929,6 @@ def test_litgpt_qkv_split_rope_train_forward(benchmark, executor: Callable, use_ setup = make_setup(bench) fn = executor(bench.fn()) - fn = wrap_for_benchmark(fn) benchmark.pedantic(fn, setup=setup, rounds=40, warmup_rounds=1) @@ -1032,7 +992,6 @@ def test_litgpt_qkv_split_rope_train_backward(benchmark, executor: Callable, use fw_setup = make_setup(bench) fn, bw_setup = backward_only(bench.fn(), executor, fw_setup) - fn = wrap_for_benchmark(fn) benchmark.pedantic(fn, setup=bw_setup, rounds=40, warmup_rounds=1) @@ -1054,6 +1013,5 @@ def test_interpreter_nanogpt_gpt2_fwd(benchmark, executor: Callable): setup = make_setup(bench) fn = executor(bench) - fn = wrap_for_benchmark(fn) benchmark.pedantic(fn, setup=setup, rounds=5, warmup_rounds=1)