From 27158e62a19e144a3081be9507c81084b702c58e Mon Sep 17 00:00:00 2001
From: Ivan Yashchuk <IvanYashchuk@users.noreply.github.com>
Date: Thu, 30 May 2024 21:01:42 +0300
Subject: [PATCH] Use PyTorch Benchmarking Timer by default with
 pytest-benchmark (#491)

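---
Notes:
    The removed wrap_for_benchmark() helper called torch.cuda.synchronize()
    after every benchmarked call. That is now covered by the timer selected
    with --benchmark-timer: torch.utils.benchmark.utils.timer.timer
    synchronizes CUDA (when it is available) before reading the clock, so
    the measured interval still includes outstanding GPU work.
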
 pyproject.toml                |  2 ++
 requirements/test.txt         |  1 +
 thunder/benchmarks/einsum.py  |  2 --
 thunder/benchmarks/targets.py | 44 +----------------------------------
 4 files changed, 4 insertions(+), 45 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 98c74249ac..df50fa8503 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -30,6 +30,8 @@ addopts = [
     "--doctest-modules",
     "--color=yes",
     "--disable-pytest-warnings",
+    "--benchmark-timer=torch.utils.benchmark.utils.timer.timer",
+    "--benchmark-warmup=on",
 ]
 markers = [
     "standalone: mark a test as standalone",
diff --git a/requirements/test.txt b/requirements/test.txt
index 5d02771607..b7a3eb3936 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -1,5 +1,6 @@
 coverage ==7.5.1
 pytest ==8.1.1
+pytest-benchmark ==4.0.0
 pytest-timeout ==2.3.1
 pytest-cov ==4.1.0
 pytest-xdist ==3.6.1
diff --git a/thunder/benchmarks/einsum.py b/thunder/benchmarks/einsum.py
index a146a254fd..4063cfef9e 100644
--- a/thunder/benchmarks/einsum.py
+++ b/thunder/benchmarks/einsum.py
@@ -15,7 +15,6 @@
     grad_executors_ids,
     thunder_gradv1,
     thunder_torchcompile_gradv1,
-    wrap_for_benchmark,
 )
 
 
@@ -33,7 +32,6 @@ def _instantiate_benchmark_env(
 
     setup = make_setup(bench)
     fn = executor(bench)
-    fn = wrap_for_benchmark(fn)
 
     return setup, fn
 
diff --git a/thunder/benchmarks/targets.py b/thunder/benchmarks/targets.py
index d1f4103dbc..3c34b5152c 100644
--- a/thunder/benchmarks/targets.py
+++ b/thunder/benchmarks/targets.py
@@ -8,7 +8,7 @@
 import os
 import torch
 import thunder
-from thunder.core.transforms import grad, clear_grads, populate_grads, get_grad, put_grad, put_grads
+from thunder.core.transforms import grad, clear_grads, populate_grads
 from thunder.core.interpreter import interpret
 
 from thunder.benchmarks import (
@@ -28,7 +28,6 @@
     torch_executor,
     torch_compile_executor,
     thunder_executor,
-    thunder_torch_executor,
     thunder_torch_compile_executor,
     thunder_apex_executor,
     thunder_apex_nvfuser_executor,
@@ -36,7 +35,6 @@
     thunder_cudnn_nvfuser_executor,
     thunder_cudnn_layer_norm_executor,
     thunder_cudnn_layer_norm_nvfuser_executor,
-    thunder_sdpa_executor,
     thunder_sdpa_torch_compile_nvfuser_executor,
     BatchNormBenchmark,
 )
@@ -69,16 +67,6 @@ def setup():
     return setup
 
 
-def wrap_for_benchmark(fn):
-    @wraps(fn)
-    def fn_(*args, **kwargs):
-        result = fn(*args, **kwargs)
-        torch.cuda.synchronize()
-        return result
-
-    return fn_
-
-
 def torch_fwd(b: Benchmark):
     module = b.fn()
     fn_ = torch_executor(module)
@@ -349,7 +337,6 @@ def test_nanogpt_gelu_fwd(benchmark, executor: Callable):
 
     setup = make_setup(gelu_bench)
     fn = executor(gelu_bench)
-    fn = wrap_for_benchmark(fn)
 
     benchmark.pedantic(fn, setup=setup, rounds=40, warmup_rounds=1)
 
@@ -366,7 +353,6 @@ def test_nanogpt_gelu_grad(benchmark, executor: Callable):
 
     setup = make_setup(gelu_bench)
     fn = executor(gelu_bench)
-    fn = wrap_for_benchmark(fn)
 
     benchmark.pedantic(fn, setup=setup, rounds=40, warmup_rounds=1)
 
@@ -383,7 +369,6 @@ def test_batch_norm_fwd(benchmark, executor: Callable):
 
     setup = make_setup(bn_bench)
     fn = executor(bn_bench)
-    fn = wrap_for_benchmark(fn)
 
     benchmark.pedantic(fn, setup=setup, rounds=40, warmup_rounds=1)
 
@@ -404,7 +389,6 @@ def test_batch_norm_grad(benchmark, executor: Callable):
 
     setup = make_setup(bn_bench)
     fn = executor(bn_bench)
-    fn = wrap_for_benchmark(fn)
     benchmark.pedantic(fn, setup=setup, rounds=200, warmup_rounds=20)
 
 
@@ -426,7 +410,6 @@ def test_nanogpt_cross_entropy_fwd(benchmark, executor: None | Callable):
 
     setup = make_setup(bench)
     fn = executor(bench)
-    fn = wrap_for_benchmark(fn)
 
     benchmark.pedantic(fn, setup=setup, rounds=20, warmup_rounds=1)
 
@@ -449,7 +432,6 @@ def test_nanogpt_cross_entropy_grad(benchmark, executor: None | Callable):
 
     setup = make_setup(bench)
     fn = executor(bench)
-    fn = wrap_for_benchmark(fn)
 
     benchmark.pedantic(fn, setup=setup, rounds=20, warmup_rounds=1)
 
@@ -472,7 +454,6 @@ def test_nanogpt_layer_norm_fwd(benchmark, executor: None | Callable):
 
     setup = make_setup(bench)
     fn = executor(bench)
-    fn = wrap_for_benchmark(fn)
 
     benchmark.pedantic(fn, setup=setup, rounds=20, warmup_rounds=1)
 
@@ -490,7 +471,6 @@ def test_nanogpt_sdpa_fwd(benchmark, executor: None | Callable):
 
     setup = make_setup(bench)
     fn = executor(bench)
-    fn = wrap_for_benchmark(fn)
 
     benchmark.pedantic(fn, setup=setup, rounds=20, warmup_rounds=1)
 
@@ -508,7 +488,6 @@ def test_nanogpt_sdpa_grad(benchmark, executor: Callable):
 
     setup = make_setup(bench)
     fn = executor(bench)
-    fn = wrap_for_benchmark(fn)
 
     benchmark.pedantic(fn, setup=setup, rounds=20, warmup_rounds=1)
 
@@ -525,7 +504,6 @@ def test_llama2_7b_sdpa_grad(benchmark, executor: Callable):
 
     setup = make_setup(bench)
     fn = executor(bench)
-    fn = wrap_for_benchmark(fn)
 
     benchmark.pedantic(fn, setup=setup, rounds=40, warmup_rounds=1)
 
@@ -574,7 +552,6 @@ def test_litgpt_sdpa_grad(benchmark, executor: Callable, bs, config):
 
     setup = make_setup(bench)
     fn = thunder_fwd_bwd(bench, compile_fn=executor)
-    fn = wrap_for_benchmark(fn)
 
     benchmark.pedantic(fn, setup=setup, rounds=40, warmup_rounds=1)
 
@@ -591,7 +568,6 @@ def test_nanogpt_mlp_fwd(benchmark, executor: Callable):
 
     setup = make_setup(bench)
     fn = executor(bench)
-    fn = wrap_for_benchmark(fn)
 
     benchmark.pedantic(fn, setup=setup, rounds=40, warmup_rounds=1)
 
@@ -608,7 +584,6 @@ def test_nanogpt_mlp_grad(benchmark, executor: Callable):
 
     setup = make_setup(bench)
     fn = executor(bench)
-    fn = wrap_for_benchmark(fn)
 
     benchmark.pedantic(fn, setup=setup, rounds=40, warmup_rounds=1)
 
@@ -629,7 +604,6 @@ def test_nanogpt_csa_fwd(benchmark, executor: Callable):
 
     setup = make_setup(bench)
     fn = executor(bench)
-    fn = wrap_for_benchmark(fn)
 
     benchmark.pedantic(fn, setup=setup, rounds=20, warmup_rounds=1)
 
@@ -650,7 +624,6 @@ def test_nanogpt_csa_grad(benchmark, executor: Callable):
 
     setup = make_setup(bench)
     fn = executor(bench)
-    fn = wrap_for_benchmark(fn)
 
     benchmark.pedantic(fn, setup=setup, rounds=20, warmup_rounds=1)
 
@@ -668,7 +641,6 @@ def test_nanogpt_block_fwd(benchmark, executor: Callable):
 
     setup = make_setup(bench)
     fn = executor(bench)
-    fn = wrap_for_benchmark(fn)
 
     benchmark.pedantic(fn, setup=setup, rounds=20, warmup_rounds=1)
 
@@ -686,7 +658,6 @@ def test_nanogpt_block_grad(benchmark, executor: Callable):
 
     setup = make_setup(bench)
     fn = executor(bench)
-    fn = wrap_for_benchmark(fn)
 
     benchmark.pedantic(fn, setup=setup, rounds=20, warmup_rounds=1)
 
@@ -708,7 +679,6 @@ def test_nanogpt_gpt2_fwd(benchmark, executor: Callable):
 
     setup = make_setup(bench)
     fn = executor(bench)
-    fn = wrap_for_benchmark(fn)
 
     benchmark.pedantic(fn, setup=setup, rounds=5, warmup_rounds=1)
 
@@ -730,7 +700,6 @@ def test_nanogpt_gpt2_grad(benchmark, executor: Callable):
 
     setup = make_setup(bench)
     fn = executor(bench)
-    fn = wrap_for_benchmark(fn)
 
     benchmark.pedantic(fn, setup=setup, rounds=5, warmup_rounds=1)
 
@@ -751,7 +720,6 @@ def test_nanogpt_gpt2xl_fwd(benchmark, executor: Callable):
 
     setup = make_setup(bench)
     fn = executor(bench)
-    fn = wrap_for_benchmark(fn)
 
     benchmark.pedantic(fn, setup=setup, rounds=5, warmup_rounds=1)
 
@@ -773,7 +741,6 @@ def test_nanogpt_gpt2xl_grad(benchmark, executor: Callable):
 
     setup = make_setup(bench)
     fn = executor(bench)
-    fn = wrap_for_benchmark(fn)
 
     benchmark.pedantic(fn, setup=setup, rounds=5, warmup_rounds=1)
 
@@ -792,7 +759,6 @@ def test_open_llama_7b_fwd(benchmark, executor: Callable):
 
     setup = make_setup(b)
     fn = executor(b)
-    fn = wrap_for_benchmark(fn)
 
     benchmark.pedantic(fn, setup=setup, rounds=5, warmup_rounds=1)
 
@@ -806,7 +772,6 @@ def test_llama_2_7b_hf_fwd(benchmark, executor: Callable):
 
     setup = make_setup(b)
     fn = executor(b)
-    fn = wrap_for_benchmark(fn)
 
     benchmark.pedantic(fn, setup=setup, rounds=5, warmup_rounds=1)
 
@@ -828,7 +793,6 @@ def test_llama_2_7b_grad(benchmark, executor: Callable):
 
     setup = make_setup(b)
     fn = executor(b)
-    fn = wrap_for_benchmark(fn)
 
     benchmark.pedantic(fn, setup=setup, rounds=20, warmup_rounds=1)
 
@@ -845,7 +809,6 @@ def test_llama2_mlp_7b_grad(benchmark, executor: Callable):
 
     setup = make_setup(bench)
     fn = executor(bench)
-    fn = wrap_for_benchmark(fn)
 
     benchmark.pedantic(fn, setup=setup, rounds=40, warmup_rounds=1)
 
@@ -862,7 +825,6 @@ def test_llama2_causal_self_attention_7b_grad(benchmark, executor: Callable):
 
     setup = make_setup(bench)
     fn = executor(bench)
-    fn = wrap_for_benchmark(fn)
 
     benchmark.pedantic(fn, setup=setup, rounds=40, warmup_rounds=1)
 
@@ -879,7 +841,6 @@ def test_llama2_7b_rmsnorm_grad(benchmark, executor: Callable):
 
     setup = make_setup(bench)
     fn = executor(bench)
-    fn = wrap_for_benchmark(fn)
 
     benchmark.pedantic(fn, setup=setup, rounds=40, warmup_rounds=1)
 
@@ -968,7 +929,6 @@ def test_litgpt_qkv_split_rope_train_forward(benchmark, executor: Callable, use_
 
     setup = make_setup(bench)
     fn = executor(bench.fn())
-    fn = wrap_for_benchmark(fn)
 
     benchmark.pedantic(fn, setup=setup, rounds=40, warmup_rounds=1)
 
@@ -1032,7 +992,6 @@ def test_litgpt_qkv_split_rope_train_backward(benchmark, executor: Callable, use
 
     fw_setup = make_setup(bench)
     fn, bw_setup = backward_only(bench.fn(), executor, fw_setup)
-    fn = wrap_for_benchmark(fn)
 
     benchmark.pedantic(fn, setup=bw_setup, rounds=40, warmup_rounds=1)
 
@@ -1054,6 +1013,5 @@ def test_interpreter_nanogpt_gpt2_fwd(benchmark, executor: Callable):
 
     setup = make_setup(bench)
     fn = executor(bench)
-    fn = wrap_for_benchmark(fn)
 
     benchmark.pedantic(fn, setup=setup, rounds=5, warmup_rounds=1)