Add TPU step breakdown to OSS

PiperOrigin-RevId: 707790757
tensorflow · Dec 20, 2024 · 6efc3db · 6efc3db
1 parent 12e5c48
commit 6efc3db
Show file tree

Hide file tree

Showing 3 changed files with 413 additions and 32 deletions.
diff --git a/plugin/tensorboard_plugin_profile/convert/BUILD b/plugin/tensorboard_plugin_profile/convert/BUILD
@@ -56,6 +56,7 @@ py_library(
     deps = [
         ":diagnostics",
         requirement("gviz_api"),
+        "@org_xprof//plugin/tensorboard_plugin_profile/protobuf:protos_all_py_pb2",
     ],
 )
 

diff --git a/plugin/tensorboard_plugin_profile/convert/input_pipeline_proto_to_gviz.py b/plugin/tensorboard_plugin_profile/convert/input_pipeline_proto_to_gviz.py
@@ -22,12 +22,198 @@
 from __future__ import division
 from __future__ import print_function
 
+import warnings
+
 import gviz_api
 
 from tensorboard_plugin_profile.convert import diagnostics as diag
 from tensorboard_plugin_profile.protobuf import input_pipeline_pb2
+from tensorboard_plugin_profile.protobuf import tpu_input_pipeline_pb2
+
+
+def compute_time_ms(details: tpu_input_pipeline_pb2.PerTpuStepDetails):
+  """Returns the sum of the TensorCore and SparseCoreV0 compute times (ms)."""
+  tensor_core_ms = details.tc_compute_time_ms
+  sparse_core_v0_ms = details.scv0_compute_time_ms
+  return tensor_core_ms + sparse_core_v0_ms
+
+
+def infeed_time_ms(details: tpu_input_pipeline_pb2.PerTpuStepDetails):
+  """Returns the sum of the TensorCore and SparseCoreV0 infeed times (ms)."""
+  tensor_core_ms = details.tc_infeed_time_ms
+  sparse_core_v0_ms = details.scv0_infeed_time_ms
+  return tensor_core_ms + sparse_core_v0_ms
+
+
+def all_reduce_time_ms(details: tpu_input_pipeline_pb2.PerTpuStepDetails):
+  """Returns the sum of the all-reduce compute and sync times (ms)."""
+  reduce_compute_ms = details.all_reduce_compute_time_ms
+  reduce_sync_ms = details.all_reduce_sync_time_ms
+  return reduce_compute_ms + reduce_sync_ms
+
+
+def non_idle_time_ms(details: tpu_input_pipeline_pb2.PerTpuStepDetails):
+  """Returns compute + infeed + all-reduce + TensorCore outfeed time (ms)."""
+  # Accumulate in the same left-to-right order so float results are unchanged.
+  busy_ms = compute_time_ms(details)
+  busy_ms = busy_ms + infeed_time_ms(details)
+  busy_ms = busy_ms + all_reduce_time_ms(details)
+  busy_ms = busy_ms + details.tc_outfeed_time_ms
+  return busy_ms
+
+
+def step_time_ms(details: tpu_input_pipeline_pb2.PerTpuStepDetails):
+  """Returns the time spent by a training step on TPU (non-idle + idle), in ms."""
+  busy_ms = non_idle_time_ms(details)
+  idle_ms = details.tc_idle_time_ms
+  return busy_ms + idle_ms
 
 
+def get_step_breakdown_table_args_for_tpu(ipa):
+  """Creates step breakdown table arguments from a TPU Input Pipeline proto.
+
+  Args:
+    ipa: An input_pipeline_pb2.InputPipelineAnalysisResult. Its
+      step_time_breakdown, step_details entries and
+      recommendation.bottleneck_analysis are unpacked into the corresponding
+      tpu_input_pipeline_pb2 messages.
+
+  Returns:
+    A (table_description, data, custom_properties) tuple that can be passed to
+    gviz_api.DataTable.
+  """
+
+  # `step_time_breakdown` is unpacked like a packed Any message, so an
+  # isinstance() check against the concrete message type can never succeed.
+  # Use the boolean returned by Unpack() to learn whether the payload really is
+  # a TpuStepTimeBreakdown.
+  breakdown = tpu_input_pipeline_pb2.TpuStepTimeBreakdown()
+  has_breakdown = ipa.step_time_breakdown.Unpack(breakdown)
+  if not has_breakdown:
+    warnings.warn("Could not unpack to TpuStepBreakdown")
+  has_sc_summary_legacy = breakdown.HasField("sparse_core_step_summary")
+
+  table_description = [
+      ("stepnum", "number", "stepnum"),
+      ("tcComputeTimeMs", "number", "TensorCore compute (in ms)"),
+  ]
+  # The SparseCoreV0 columns are only emitted when the breakdown does not
+  # carry the legacy sparse_core_step_summary field.
+  if not has_sc_summary_legacy:
+    table_description += [
+        ("scv0ComputeTimeMs", "number", "SparseCoreV0 compute (in ms)"),
+        ("scv0InfeedTimeMs", "number", "SparseCoreV0 input (in ms)"),
+    ]
+  table_description += [
+      ("tcInfeedTimeMs", "number", "TensorCore input (in ms)"),
+      ("tcOutfeedTimeMs", "number", "TensorCore output (in ms)"),
+      ("tcIdleTimeMs", "number", "TensorCore idle (in ms)"),
+      ("hostTransferTimeMs", "number", "Host transfer (in ms)"),
+      ("tooltip", "string", "tooltip", {"role": "tooltip"}),
+      (
+          "infeedPercentAverage",
+          "number",
+          "number% step time waiting for input data",
+      ),
+      ("infeedPercentMin", "number", "Infeed percent min"),
+      ("infeedPercentMax", "number", "Infeed percent max"),
+  ]
+
+  # One row per step; the column order must match table_description above.
+  data = []
+  for step_details in ipa.step_details:
+    details = tpu_input_pipeline_pb2.PerTpuStepDetails()
+    step_details.Unpack(details)
+    # The step time shown in the tooltip excludes the all-reduce time.
+    tooltip = (
+        "step {}: \nTime waiting for input data = {:.3f} ms, Step time ="
+        " {:.3f} ms".format(
+            details.step_number,
+            infeed_time_ms(details),
+            step_time_ms(details) - all_reduce_time_ms(details),
+        )
+    )
+    row = [details.step_number, details.tc_compute_time_ms]
+    if not has_sc_summary_legacy:
+      row += [
+          details.scv0_compute_time_ms,
+          details.scv0_infeed_time_ms,
+      ]
+    row += [
+        details.tc_infeed_time_ms,
+        details.tc_outfeed_time_ms,
+        details.tc_idle_time_ms,
+        details.host_transfer_ms,
+        tooltip,
+        details.infeed_percent_average,
+        details.infeed_percent_minimum,
+        details.infeed_percent_maximum,
+    ]
+    data.append(row)
+
+  step_time_summary = ipa.step_time_summary
+  input_percent_summary = ipa.input_percent_summary
+  custom_properties = {
+      "steptime_ms_average": "{:.1f}".format(step_time_summary.average),
+      "steptime_ms_standard_deviation": "{:.1f}".format(
+          step_time_summary.standard_deviation
+      ),
+      "steptime_ms_minimum": "{:.1f}".format(step_time_summary.minimum),
+      "steptime_ms_maximum": "{:.1f}".format(step_time_summary.maximum),
+      "infeed_percent_average": "{:.1f}".format(input_percent_summary.average),
+      "infeed_percent_standard_deviation": "{:.1f}".format(
+          input_percent_summary.standard_deviation
+      ),
+      "infeed_percent_minimum": "{:.1f}".format(input_percent_summary.minimum),
+      "infeed_percent_maximum": "{:.1f}".format(input_percent_summary.maximum),
+  }
+
+  # Add TPU step time breakdown to table properties
+  if has_breakdown:
+    scv0_compute_summary = breakdown.scv0_compute_ms_summary
+    scv0_infeed_summary = breakdown.scv0_infeed_ms_summary
+    tc_compute_summary = breakdown.tc_compute_ms_summary
+    tc_infeed_summary = breakdown.tc_infeed_ms_summary
+    tc_outfeed_summary = breakdown.tc_outfeed_ms_summary
+    tc_idle_summary = breakdown.tc_idle_ms_summary
+    host_transfer_summary = breakdown.host_transfer_ms_summary
+    sc_summary = breakdown.sparse_core_step_summary
+    sc_compute_summary = sc_summary.sc_compute_ms_summary
+    sc_infeed_summary = sc_summary.sc_infeed_ms_summary
+    sc_outfeed_summary = sc_summary.sc_outfeed_ms_summary
+    sc_idle_summary = sc_summary.sc_idle_ms_summary
+    sc_step_summary = sc_summary.sc_step_time_ms_summary
+
+    if not has_sc_summary_legacy:
+      custom_properties.update({
+          "scv0_compute_ms_average": "{:.2f}".format(
+              scv0_compute_summary.average
+          ),
+          "scv0_infeed_ms_average": "{:.2f}".format(
+              scv0_infeed_summary.average
+          ),
+      })
+    custom_properties.update({
+        "tc_compute_ms_average": "{:.2f}".format(tc_compute_summary.average),
+        "tc_infeed_ms_average": "{:.2f}".format(tc_infeed_summary.average),
+        "tc_outfeed_ms_average": "{:.2f}".format(tc_outfeed_summary.average),
+        "tc_idle_ms_average": "{:.2f}".format(tc_idle_summary.average),
+        "host_transfer_ms_average": "{:.2f}".format(
+            host_transfer_summary.average
+        ),
+    })
+    # Only emit the SparseCore averages when the summary's minimum step time
+    # is positive.
+    if sc_step_summary.minimum > 0:
+      custom_properties.update({
+          "sc_compute_ms_average": "{:.2f}".format(sc_compute_summary.average),
+          "sc_infeed_ms_average": "{:.2f}".format(sc_infeed_summary.average),
+          "sc_outfeed_ms_average": "{:.2f}".format(sc_outfeed_summary.average),
+          "sc_idle_ms_average": "{:.2f}".format(sc_idle_summary.average),
+          "sc_step_time_ms_average": "{:.1f}".format(sc_step_summary.average),
+      })
+
+  # Add TPU bottleneck summary analysis to table properties. As above, the
+  # success of Unpack() (not isinstance) tells whether the payload matches.
+  bottleneck = tpu_input_pipeline_pb2.TpuBottleneckAnalysis()
+  has_bottleneck = ipa.recommendation.bottleneck_analysis.Unpack(bottleneck)
+  if has_bottleneck:
+    custom_properties.update({
+        "input_conclusion": bottleneck.input_statement,
+        "output_conclusion": bottleneck.output_statement,
+    })
+  else:
+    warnings.warn("Could not unpack to TpuBottleneckAnalysis")
+  custom_properties.update({
+      "summary_nextstep": ipa.recommendation.summary_next_step,
+  })
+
+  return (table_description, data, custom_properties)
+
+
+# Get generic step breakdown table
 def get_step_breakdown_table_args(ipa):
   """Creates a step breakdown from an Input Pipeline Analyzer proto.
 
@@ -39,7 +225,7 @@ def get_step_breakdown_table_args(ipa):
   """
 
   table_description = [
-      ("stepnum", "string", "Step number"),
+      ("stepname", "string", "Step Name"),
       ("deviceComputeTimeMs", "number", "Device compute"),
       ("deviceToDeviceTimeMs", "number", "Device to device"),
       ("deviceCollectivesTimeMs", "number", "Device collectives"),
@@ -90,12 +276,18 @@ def get_step_breakdown_table_args(ipa):
                    details.device_compute_ms)
 
     row = [
-        details.step_name, details.device_compute_ms,
-        details.device_to_device_ms, details.device_collectives_ms,
-        details.host_compute_ms, details.host_prepare_ms,
+        details.step_name,
+        details.device_compute_ms,
+        details.device_to_device_ms,
+        details.device_collectives_ms,
+        details.host_compute_ms,
+        details.host_prepare_ms,
         details.host_wait_input_ms + details.host_to_device_ms,
-        details.output_ms, details.host_compile_ms, details.unknown_time_ms,
-        tooltip, details.step_time_ms,
+        details.output_ms,
+        details.host_compile_ms,
+        details.unknown_time_ms,
+        tooltip,
+        details.step_time_ms,
     ]
     data.append(row)
 
@@ -280,9 +472,17 @@ def get_recommendation_table_args(ipa):
   return (table_description, data, None)
 
 
+def generate_step_breakdown_table_for_tpu(ipa):
+  """Builds the gviz DataTable holding the TPU step breakdown for `ipa`."""
+  table_args = get_step_breakdown_table_args_for_tpu(ipa)
+  description, rows, properties = table_args
+  return gviz_api.DataTable(description, rows, properties)
+
+
 def generate_step_breakdown_table(ipa):
-  (table_description, data,
-   custom_properties) = get_step_breakdown_table_args(ipa)
+  """Builds the gviz DataTable for the generic step breakdown of `ipa`."""
+  (table_description, data, custom_properties) = get_step_breakdown_table_args(
+      ipa
+  )
   return gviz_api.DataTable(table_description, data, custom_properties)
 
 
@@ -300,8 +500,14 @@ def generate_recommendation_table(ipa):
 def generate_all_chart_tables(ipa):
   """Generates a list of gviz tables from InputPipelineAnalysisResult."""
 
+  step_breakdown_table = (
+      generate_step_breakdown_table(ipa)
+      if ipa.tag
+      else generate_step_breakdown_table_for_tpu(ipa)
+  )
+
   return [
-      generate_step_breakdown_table(ipa),
+      step_breakdown_table,
       generate_input_op_table(ipa),
       generate_recommendation_table(ipa),
       diag.generate_diagnostics_table(ipa.diagnostics),