Replace deprecated flag xla_gpu_graph_level. #1244

Open · wants to merge 1 commit into main
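This PR replaces the deprecated flag `--xla_gpu_graph_level=0` with `--xla_gpu_enable_command_buffer=` throughout the container test script, docs, and example configs. Upstream XLA deprecated `--xla_gpu_graph_level` in favor of `--xla_gpu_enable_command_buffer`, which takes a comma-separated list of command types to capture into command buffers (CUDA graphs); passing an empty list disables command buffers, preserving the behavior of the old `--xla_gpu_graph_level=0`.

A minimal before/after sketch (surrounding flags abbreviated; the full lists are in the diffs below):

# Before: deprecated flag; level 0 meant "do not use CUDA graphs"
export XLA_FLAGS="--xla_gpu_graph_level=0 ..."

# After: an empty command-buffer list likewise disables CUDA graph capture
export XLA_FLAGS="--xla_gpu_enable_command_buffer= ..."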
.github/container/test-maxtext.sh (1 addition, 1 deletion)
@@ -228,7 +228,7 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
 
 export BASE_XLA_FLAGS=${BASE_XLA_FLAGS:---xla_gpu_enable_latency_hiding_scheduler=true
 --xla_gpu_enable_triton_gemm=false
---xla_gpu_graph_level=0
+--xla_gpu_enable_command_buffer=
 --xla_gpu_all_reduce_combine_threshold_bytes=1073741824
 --xla_gpu_all_gather_combine_threshold_bytes=1073741824
 --xla_gpu_reduce_scatter_combine_threshold_bytes=134217728
rosetta/docs/PGLE.md (1 addition, 1 deletion)
@@ -62,7 +62,7 @@ In order to get the best performance with PGLE, here is a list of all recommende
 ```
 export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true
 --xla_gpu_enable_triton_gemm=false
---xla_gpu_graph_level=0
+--xla_gpu_enable_command_buffer=
 --xla_gpu_all_reduce_combine_threshold_bytes=1073741824
 --xla_gpu_all_gather_combine_threshold_bytes=1073741824
 --xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824
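As a quick sanity check (a sketch, not part of this PR; assumes a GPU-enabled JAX install), XLA rejects unrecognized entries in XLA_FLAGS at backend initialization, so forcing one small compile confirms that the container's XLA is new enough to accept the replacement flag:

XLA_FLAGS="--xla_gpu_enable_command_buffer=" \
  python -c "import jax; jax.numpy.zeros(1).block_until_ready()"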
rosetta/rosetta/projects/maxtext/README.md (1 addition, 1 deletion)
@@ -69,7 +69,7 @@ The [GPU Performance document](../../../docs/GPU_performance.md) provides a deta
 ```
 XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true
 --xla_gpu_enable_triton_gemm=false
---xla_gpu_graph_level=0
+--xla_gpu_enable_command_buffer=
 --xla_gpu_all_reduce_combine_threshold_bytes=1073741824
 --xla_gpu_all_gather_combine_threshold_bytes=1073741824
 --xla_gpu_reduce_scatter_combine_threshold_bytes=134217728
rosetta/rosetta/projects/maxtext/scripts/example_slurm.sub (1 addition, 1 deletion)
@@ -54,7 +54,7 @@ export NCCL_IB_SL=1
 # Set XLA Flags
 export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true
 --xla_gpu_enable_triton_gemm=false
---xla_gpu_graph_level=0
+--xla_gpu_enable_command_buffer=
 --xla_gpu_all_reduce_combine_threshold_bytes=1073741824
 --xla_gpu_all_gather_combine_threshold_bytes=1073741824
 --xla_gpu_reduce_scatter_combine_threshold_bytes=134217728
@@ -5,7 +5,7 @@ THRESHOLD_BYTES=1073741824
 export XLA_FLAGS="\
 --xla_gpu_enable_latency_hiding_scheduler=true \
 --xla_gpu_enable_triton_gemm=false \
---xla_gpu_graph_level=0 \
+--xla_gpu_enable_command_buffer= \
 --xla_gpu_enable_highest_priority_async_stream=true \
 --xla_gpu_all_reduce_combine_threshold_bytes=${THRESHOLD_BYTES} \
 --xla_gpu_all_gather_combine_threshold_bytes=$((THRESHOLD_BYTES/(NUM_NODES*NUM_GPUS))) \
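For context on the arithmetic in this hunk: the all-gather combine threshold is scaled down by the world size. With illustrative values (not taken from this PR) of THRESHOLD_BYTES=1073741824 on 8 nodes of 8 GPUs each:

NUM_NODES=8; NUM_GPUS=8; THRESHOLD_BYTES=1073741824
echo $((THRESHOLD_BYTES/(NUM_NODES*NUM_GPUS)))  # 16777216 bytes = 16 MiB per GPU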
rosetta/rosetta/projects/pax/README.md (1 addition, 1 deletion)
@@ -141,7 +141,7 @@ For the 126M model, we recommend setting `--xla_gpu_all_reduce_combine_thres
 BASE_XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true
 --xla_gpu_enable_triton_gemm=false
 --xla_gpu_all_reduce_combine_threshold_bytes=33554432
---xla_gpu_graph_level=0" bash run_pile_multinode.sh ...
+--xla_gpu_enable_command_buffer=" bash run_pile_multinode.sh ...
 ```
 
 # Configs
rosetta/rosetta/projects/pax/xla_flags/common.env (1 addition, 1 deletion)
@@ -6,7 +6,7 @@ export XLA_FLAGS="\
 --xla_gpu_enable_highest_priority_async_stream=true \
 --xla_gpu_enable_triton_softmax_fusion=false \
 --xla_gpu_all_reduce_combine_threshold_bytes=${THRESHOLD_BYTES} \
---xla_gpu_graph_level=0 \
+--xla_gpu_enable_command_buffer= \
 "
 export XLA_PYTHON_CLIENT_MEM_FRACTION=0.8
 unset THRESHOLD_BYTES
rosetta/rosetta/projects/pax/xla_flags/gpt-126m.env (1 addition, 1 deletion)
@@ -6,7 +6,7 @@ export XLA_FLAGS="\
 --xla_gpu_enable_highest_priority_async_stream=true \
 --xla_gpu_enable_triton_softmax_fusion=false \
 --xla_gpu_all_reduce_combine_threshold_bytes=${THRESHOLD_BYTES} \
---xla_gpu_graph_level=0 \
+--xla_gpu_enable_command_buffer= \
 --xla_gpu_enable_cudnn_fmha=false \
 "
 export XLA_PYTHON_CLIENT_MEM_FRACTION=0.8
rosetta/rosetta/projects/pax/xla_flags/grok-proxy.env (1 addition, 1 deletion)
@@ -8,7 +8,7 @@ export XLA_FLAGS="\
 --xla_gpu_enable_highest_priority_async_stream=true \
 --xla_gpu_enable_triton_softmax_fusion=false \
 --xla_gpu_all_reduce_combine_threshold_bytes=${ALL_REDUCE_THRESHOLD_BYTES} \
---xla_gpu_graph_level=0 \
+--xla_gpu_enable_command_buffer= \
 --xla_gpu_all_gather_combine_threshold_bytes=${ALL_GATHER_THRESHOLD_BYTES} \
 --xla_gpu_reduce_scatter_combine_threshold_bytes=${REDUCE_SCATTER_THRESHOLD_BYTES} \
 --xla_gpu_enable_pipelined_all_gather=true \