Skip to content

Commit

Permalink
Add TPU step breakdown to OSS
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 707790757
  • Loading branch information
zzzaries authored and copybara-github committed Dec 20, 2024
1 parent 12e5c48 commit 6efc3db
Show file tree
Hide file tree
Showing 3 changed files with 413 additions and 32 deletions.
1 change: 1 addition & 0 deletions plugin/tensorboard_plugin_profile/convert/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ py_library(
deps = [
":diagnostics",
requirement("gviz_api"),
"@org_xprof//plugin/tensorboard_plugin_profile/protobuf:protos_all_py_pb2",
],
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,198 @@
from __future__ import division
from __future__ import print_function

import warnings

import gviz_api

from tensorboard_plugin_profile.convert import diagnostics as diag
from tensorboard_plugin_profile.protobuf import input_pipeline_pb2
from tensorboard_plugin_profile.protobuf import tpu_input_pipeline_pb2


def compute_time_ms(details: tpu_input_pipeline_pb2.PerTpuStepDetails):
  """Returns the sum of the TensorCore and SparseCoreV0 compute times (ms)."""
  tensor_core_ms = details.tc_compute_time_ms
  sparse_core_v0_ms = details.scv0_compute_time_ms
  return tensor_core_ms + sparse_core_v0_ms


def infeed_time_ms(details: tpu_input_pipeline_pb2.PerTpuStepDetails):
  """Returns the sum of the TensorCore and SparseCoreV0 infeed times (ms)."""
  tensor_core_ms = details.tc_infeed_time_ms
  sparse_core_v0_ms = details.scv0_infeed_time_ms
  return tensor_core_ms + sparse_core_v0_ms


def all_reduce_time_ms(details: tpu_input_pipeline_pb2.PerTpuStepDetails):
  """Returns the sum of the all-reduce compute and sync times (ms)."""
  reduce_compute_ms = details.all_reduce_compute_time_ms
  reduce_sync_ms = details.all_reduce_sync_time_ms
  return reduce_compute_ms + reduce_sync_ms


def non_idle_time_ms(details: tpu_input_pipeline_pb2.PerTpuStepDetails):
  """Returns compute + infeed + all-reduce + TensorCore outfeed time (ms)."""
  # Accumulate left to right so the additions happen in the same order as a
  # single chained `+` expression would perform them.
  busy_ms = compute_time_ms(details)
  busy_ms = busy_ms + infeed_time_ms(details)
  busy_ms = busy_ms + all_reduce_time_ms(details)
  busy_ms = busy_ms + details.tc_outfeed_time_ms
  return busy_ms


def step_time_ms(details: tpu_input_pipeline_pb2.PerTpuStepDetails):
  """Returns the time spent by a training step on TPU (ms).

  This is the non-idle time of the step plus its TensorCore idle time.
  """
  busy_ms = non_idle_time_ms(details)
  idle_ms = details.tc_idle_time_ms
  return busy_ms + idle_ms


def get_step_breakdown_table_args_for_tpu(ipa):
  """Creates step breakdown table arguments from a TPU Input Pipeline proto.

  Args:
    ipa: An input_pipeline_pb2.InputPipelineAnalysisResult. Its
      `step_time_breakdown`, each entry of `step_details`, and
      `recommendation.bottleneck_analysis` are read through `Unpack()`.

  Returns:
    A `(table_description, data, custom_properties)` tuple that can be passed
    to `gviz_api.DataTable`. `data` holds one row per entry of
    `ipa.step_details`; `custom_properties` maps property names to
    pre-formatted strings.
  """

  # The breakdown is stored as a packed message, so the only reliable signal
  # that it holds a TpuStepTimeBreakdown is the boolean returned by Unpack().
  # An isinstance() check on the field itself never matches the target type.
  breakdown = tpu_input_pipeline_pb2.TpuStepTimeBreakdown()
  has_breakdown = ipa.step_time_breakdown.Unpack(breakdown)
  if not has_breakdown:
    warnings.warn("Could not unpack to TpuStepBreakdown")
  # When the legacy SparseCore summary is present, the SparseCoreV0 columns
  # and properties are left out.
  has_sc_summary_legacy = breakdown.HasField("sparse_core_step_summary")

  table_description = [
      ("stepnum", "number", "stepnum"),
      ("tcComputeTimeMs", "number", "TensorCore compute (in ms)"),
  ]
  if not has_sc_summary_legacy:
    table_description += [
        ("scv0ComputeTimeMs", "number", "SparseCoreV0 compute (in ms)"),
        ("scv0InfeedTimeMs", "number", "SparseCoreV0 input (in ms)"),
    ]
  table_description += [
      ("tcInfeedTimeMs", "number", "TensorCore input (in ms)"),
      ("tcOutfeedTimeMs", "number", "TensorCore output (in ms)"),
      ("tcIdleTimeMs", "number", "TensorCore idle (in ms)"),
      ("hostTransferTimeMs", "number", "Host transfer (in ms)"),
      ("tooltip", "string", "tooltip", {"role": "tooltip"}),
      (
          "infeedPercentAverage",
          "number",
          "number% step time waiting for input data",
      ),
      ("infeedPercentMin", "number", "Infeed percent min"),
      ("infeedPercentMax", "number", "Infeed percent max"),
  ]

  # Row layout must mirror `table_description`, including the optional
  # SparseCoreV0 columns.
  data = []
  for step_details in ipa.step_details:
    details = tpu_input_pipeline_pb2.PerTpuStepDetails()
    # NOTE(review): the Unpack() result is not checked here; on a type
    # mismatch `details` presumably keeps its default field values.
    step_details.Unpack(details)
    # The step time shown in the tooltip excludes all-reduce time.
    tooltip = (
        "step {}: \nTime waiting for input data = {:.3f} ms, Step time ="
        " {:.3f} ms".format(
            details.step_number,
            infeed_time_ms(details),
            step_time_ms(details) - all_reduce_time_ms(details),
        )
    )
    row = [details.step_number, details.tc_compute_time_ms]
    if not has_sc_summary_legacy:
      row += [
          details.scv0_compute_time_ms,
          details.scv0_infeed_time_ms,
      ]
    row += [
        details.tc_infeed_time_ms,
        details.tc_outfeed_time_ms,
        details.tc_idle_time_ms,
        details.host_transfer_ms,
        tooltip,
        details.infeed_percent_average,
        details.infeed_percent_minimum,
        details.infeed_percent_maximum,
    ]
    data.append(row)

  step_time_summary = ipa.step_time_summary
  input_percent_summary = ipa.input_percent_summary
  custom_properties = {
      "steptime_ms_average": "{:.1f}".format(step_time_summary.average),
      "steptime_ms_standard_deviation": "{:.1f}".format(
          step_time_summary.standard_deviation
      ),
      "steptime_ms_minimum": "{:.1f}".format(step_time_summary.minimum),
      "steptime_ms_maximum": "{:.1f}".format(step_time_summary.maximum),
      "infeed_percent_average": "{:.1f}".format(input_percent_summary.average),
      "infeed_percent_standard_deviation": "{:.1f}".format(
          input_percent_summary.standard_deviation
      ),
      "infeed_percent_minimum": "{:.1f}".format(input_percent_summary.minimum),
      "infeed_percent_maximum": "{:.1f}".format(input_percent_summary.maximum),
  }

  # Add TPU step time breakdown to table properties
  if has_breakdown:
    scv0_compute_summary = breakdown.scv0_compute_ms_summary
    scv0_infeed_summary = breakdown.scv0_infeed_ms_summary
    tc_compute_summary = breakdown.tc_compute_ms_summary
    tc_infeed_summary = breakdown.tc_infeed_ms_summary
    tc_outfeed_summary = breakdown.tc_outfeed_ms_summary
    tc_idle_summary = breakdown.tc_idle_ms_summary
    sc_summary = breakdown.sparse_core_step_summary
    sc_compute_summary = sc_summary.sc_compute_ms_summary
    sc_infeed_summary = sc_summary.sc_infeed_ms_summary
    sc_outfeed_summary = sc_summary.sc_outfeed_ms_summary
    sc_idle_summary = sc_summary.sc_idle_ms_summary
    sc_step_summary = sc_summary.sc_step_time_ms_summary
    host_transfer_summary = breakdown.host_transfer_ms_summary

    if not has_sc_summary_legacy:
      custom_properties.update({
          "scv0_compute_ms_average": "{:.2f}".format(
              scv0_compute_summary.average
          ),
          "scv0_infeed_ms_average": "{:.2f}".format(
              scv0_infeed_summary.average
          ),
      })
    custom_properties.update({
        "tc_compute_ms_average": "{:.2f}".format(tc_compute_summary.average),
        "tc_infeed_ms_average": "{:.2f}".format(tc_infeed_summary.average),
        "tc_outfeed_ms_average": "{:.2f}".format(tc_outfeed_summary.average),
        "tc_idle_ms_average": "{:.2f}".format(tc_idle_summary.average),
        "host_transfer_ms_average": "{:.2f}".format(
            host_transfer_summary.average
        ),
    })
    # SparseCore properties are only reported when the minimum SparseCore step
    # time is positive.
    if sc_step_summary.minimum > 0:
      custom_properties.update({
          "sc_compute_ms_average": "{:.2f}".format(sc_compute_summary.average),
          "sc_infeed_ms_average": "{:.2f}".format(sc_infeed_summary.average),
          "sc_outfeed_ms_average": "{:.2f}".format(sc_outfeed_summary.average),
          "sc_idle_ms_average": "{:.2f}".format(sc_idle_summary.average),
          "sc_step_time_ms_average": "{:.1f}".format(sc_step_summary.average),
      })

  # Add TPU bottleneck summary analysis to table properties. As with the
  # breakdown, success is determined by Unpack()'s return value.
  bottleneck = tpu_input_pipeline_pb2.TpuBottleneckAnalysis()
  has_bottleneck = ipa.recommendation.bottleneck_analysis.Unpack(bottleneck)
  if not has_bottleneck:
    warnings.warn("Could not unpack to TpuBottleneckAnalysis")
  if has_bottleneck:
    custom_properties.update({
        "input_conclusion": bottleneck.input_statement,
        "output_conclusion": bottleneck.output_statement,
    })
  custom_properties.update({
      "summary_nextstep": ipa.recommendation.summary_next_step,
  })

  return (table_description, data, custom_properties)


# Get generic step breakdown table
def get_step_breakdown_table_args(ipa):
"""Creates a step breakdown from an Input Pipeline Analyzer proto.
Expand All @@ -39,7 +225,7 @@ def get_step_breakdown_table_args(ipa):
"""

table_description = [
("stepnum", "string", "Step number"),
("stepname", "string", "Step Name"),
("deviceComputeTimeMs", "number", "Device compute"),
("deviceToDeviceTimeMs", "number", "Device to device"),
("deviceCollectivesTimeMs", "number", "Device collectives"),
Expand Down Expand Up @@ -90,12 +276,18 @@ def get_step_breakdown_table_args(ipa):
details.device_compute_ms)

row = [
details.step_name, details.device_compute_ms,
details.device_to_device_ms, details.device_collectives_ms,
details.host_compute_ms, details.host_prepare_ms,
details.step_name,
details.device_compute_ms,
details.device_to_device_ms,
details.device_collectives_ms,
details.host_compute_ms,
details.host_prepare_ms,
details.host_wait_input_ms + details.host_to_device_ms,
details.output_ms, details.host_compile_ms, details.unknown_time_ms,
tooltip, details.step_time_ms,
details.output_ms,
details.host_compile_ms,
details.unknown_time_ms,
tooltip,
details.step_time_ms,
]
data.append(row)

Expand Down Expand Up @@ -280,9 +472,17 @@ def get_recommendation_table_args(ipa):
return (table_description, data, None)


def generate_step_breakdown_table_for_tpu(ipa):
  """Builds the TPU step breakdown gviz DataTable for `ipa`."""
  table_args = get_step_breakdown_table_args_for_tpu(ipa)
  description, rows, properties = table_args
  return gviz_api.DataTable(description, rows, properties)


def generate_step_breakdown_table(ipa):
  """Builds the generic step breakdown gviz DataTable for `ipa`."""
  # The table arguments are computed once; the pre-change and post-change
  # forms of this assignment were both present and have been collapsed.
  (table_description, data, custom_properties) = get_step_breakdown_table_args(
      ipa
  )
  return gviz_api.DataTable(table_description, data, custom_properties)


Expand All @@ -300,8 +500,14 @@ def generate_recommendation_table(ipa):
def generate_all_chart_tables(ipa):
"""Generates a list of gviz tables from InputPipelineAnalysisResult."""

step_breakdown_table = (
generate_step_breakdown_table(ipa)
if ipa.tag
else generate_step_breakdown_table_for_tpu(ipa)
)

return [
generate_step_breakdown_table(ipa),
step_breakdown_table,
generate_input_op_table(ipa),
generate_recommendation_table(ipa),
diag.generate_diagnostics_table(ipa.diagnostics),
Expand Down
Loading

0 comments on commit 6efc3db

Please sign in to comment.