diff --git a/.github/container/test-maxtext.sh b/.github/container/test-maxtext.sh index ebb2afcdc..1119b6724 100755 --- a/.github/container/test-maxtext.sh +++ b/.github/container/test-maxtext.sh @@ -228,7 +228,7 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1 export BASE_XLA_FLAGS=${BASE_XLA_FLAGS:---xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_triton_gemm=false - --xla_gpu_graph_level=0 + --xla_gpu_enable_command_buffer= --xla_gpu_all_reduce_combine_threshold_bytes=1073741824 --xla_gpu_all_gather_combine_threshold_bytes=1073741824 --xla_gpu_reduce_scatter_combine_threshold_bytes=134217728 diff --git a/rosetta/docs/PGLE.md b/rosetta/docs/PGLE.md index 2425ddffe..bc3ce5d1e 100644 --- a/rosetta/docs/PGLE.md +++ b/rosetta/docs/PGLE.md @@ -62,7 +62,7 @@ In order to get the best performance with PGLE, here is a list of all recommende ``` export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_triton_gemm=false ---xla_gpu_graph_level=0 +--xla_gpu_enable_command_buffer= --xla_gpu_all_reduce_combine_threshold_bytes=1073741824 --xla_gpu_all_gather_combine_threshold_bytes=1073741824 --xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824 diff --git a/rosetta/rosetta/projects/maxtext/README.md b/rosetta/rosetta/projects/maxtext/README.md index 44baa19ef..97eac185d 100644 --- a/rosetta/rosetta/projects/maxtext/README.md +++ b/rosetta/rosetta/projects/maxtext/README.md @@ -69,7 +69,7 @@ The [GPU Performance document](../../../docs/GPU_performance.md) provides a deta ``` XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_triton_gemm=false - --xla_gpu_graph_level=0 + --xla_gpu_enable_command_buffer= --xla_gpu_all_reduce_combine_threshold_bytes=1073741824 --xla_gpu_all_gather_combine_threshold_bytes=1073741824 --xla_gpu_reduce_scatter_combine_threshold_bytes=134217728 diff --git a/rosetta/rosetta/projects/maxtext/scripts/example_slurm.sub b/rosetta/rosetta/projects/maxtext/scripts/example_slurm.sub index 0ca3fd802..a9d62e55c 100644 --- a/rosetta/rosetta/projects/maxtext/scripts/example_slurm.sub +++ b/rosetta/rosetta/projects/maxtext/scripts/example_slurm.sub @@ -54,7 +54,7 @@ export NCCL_IB_SL=1 # Set XLA Flags export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_triton_gemm=false - --xla_gpu_graph_level=0 + --xla_gpu_enable_command_buffer= --xla_gpu_all_reduce_combine_threshold_bytes=1073741824 --xla_gpu_all_gather_combine_threshold_bytes=1073741824 --xla_gpu_reduce_scatter_combine_threshold_bytes=134217728 diff --git a/rosetta/rosetta/projects/maxtext/xla_flags/llama2-7b-1N8G.env b/rosetta/rosetta/projects/maxtext/xla_flags/llama2-7b-1N8G.env index d999f5b5e..3730855fc 100644 --- a/rosetta/rosetta/projects/maxtext/xla_flags/llama2-7b-1N8G.env +++ b/rosetta/rosetta/projects/maxtext/xla_flags/llama2-7b-1N8G.env @@ -5,7 +5,7 @@ THRESHOLD_BYTES=1073741824 export XLA_FLAGS="\ --xla_gpu_enable_latency_hiding_scheduler=true \ --xla_gpu_enable_triton_gemm=false \ - --xla_gpu_graph_level=0 \ + --xla_gpu_enable_command_buffer= \ --xla_gpu_enable_highest_priority_async_stream=true \ --xla_gpu_all_reduce_combine_threshold_bytes=${THRESHOLD_BYTES} \ --xla_gpu_all_gather_combine_threshold_bytes=$((THRESHOLD_BYTES/(NUM_NODES*NUM_GPUS))) \ diff --git a/rosetta/rosetta/projects/pax/README.md b/rosetta/rosetta/projects/pax/README.md index d1829b847..a249fd461 100644 --- a/rosetta/rosetta/projects/pax/README.md +++ b/rosetta/rosetta/projects/pax/README.md @@ -141,7 +141,7 @@ For the the 126M model, we recommend setting `--xla_gpu_all_reduce_combine_thres BASE_XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_triton_gemm=false --xla_gpu_all_reduce_combine_threshold_bytes=33554432 - --xla_gpu_graph_level=0" bash run_pile_multinode.sh ... + --xla_gpu_enable_command_buffer=" bash run_pile_multinode.sh ... ``` # Configs diff --git a/rosetta/rosetta/projects/pax/xla_flags/common.env b/rosetta/rosetta/projects/pax/xla_flags/common.env index 26c819143..139544734 100644 --- a/rosetta/rosetta/projects/pax/xla_flags/common.env +++ b/rosetta/rosetta/projects/pax/xla_flags/common.env @@ -6,7 +6,7 @@ export XLA_FLAGS="\ --xla_gpu_enable_highest_priority_async_stream=true \ --xla_gpu_enable_triton_softmax_fusion=false \ --xla_gpu_all_reduce_combine_threshold_bytes=${THRESHOLD_BYTES} \ - --xla_gpu_graph_level=0 \ + --xla_gpu_enable_command_buffer= \ " export XLA_PYTHON_CLIENT_MEM_FRACTION=0.8 unset THRESHOLD_BYTES diff --git a/rosetta/rosetta/projects/pax/xla_flags/gpt-126m.env b/rosetta/rosetta/projects/pax/xla_flags/gpt-126m.env index e5b97b466..15159305b 100644 --- a/rosetta/rosetta/projects/pax/xla_flags/gpt-126m.env +++ b/rosetta/rosetta/projects/pax/xla_flags/gpt-126m.env @@ -6,7 +6,7 @@ export XLA_FLAGS="\ --xla_gpu_enable_highest_priority_async_stream=true \ --xla_gpu_enable_triton_softmax_fusion=false \ --xla_gpu_all_reduce_combine_threshold_bytes=${THRESHOLD_BYTES} \ - --xla_gpu_graph_level=0 \ + --xla_gpu_enable_command_buffer= \ --xla_gpu_enable_cudnn_fmha=false \ " export XLA_PYTHON_CLIENT_MEM_FRACTION=0.8 diff --git a/rosetta/rosetta/projects/pax/xla_flags/grok-proxy.env b/rosetta/rosetta/projects/pax/xla_flags/grok-proxy.env index e48b76dcf..cc2ef61b6 100644 --- a/rosetta/rosetta/projects/pax/xla_flags/grok-proxy.env +++ b/rosetta/rosetta/projects/pax/xla_flags/grok-proxy.env @@ -8,7 +8,7 @@ export XLA_FLAGS="\ --xla_gpu_enable_highest_priority_async_stream=true \ --xla_gpu_enable_triton_softmax_fusion=false \ --xla_gpu_all_reduce_combine_threshold_bytes=${ALL_REDUCE_THRESHOLD_BYTES} \ - --xla_gpu_graph_level=0 \ + --xla_gpu_enable_command_buffer= \ --xla_gpu_all_gather_combine_threshold_bytes=${ALL_GATHER_THRESHOLD_BYTES} \ --xla_gpu_reduce_scatter_combine_threshold_bytes=${REDUCE_SCATTER_THRESHOLD_BYTES} \ --xla_gpu_enable_pipelined_all_gather=true \