Skip to content

Commit

Permalink
Add pfc-timer-set to 500mS for pfcwd tests. (#16159)
Browse files Browse the repository at this point in the history
Description of PR
Summary:
Fixes the flakiness of pfc_gen in the pfcwd scripts for cisco-8000. We use a new debug CLI script to force the DUT to wait longer when PFC packets from the fanout are missed because of the pfc_gen script. So even if pfc_gen/the fanout misses a couple of PFC frames to the DUT, the DUT will still not send out data packets.

Approach
What is the motivation for this PR?
Flakiness of pfc-gen. Particularly with 400G links.

How did you do it?
We have added a new dshell based script that will force the DUT to wait before transmitting data in case of a miss in pfc pause frames.

How did you verify/test it?
Ran on our duts, with 100G and 400G.

Any platform specific information?
The new fix is specific to cisco-8000 only.

Co-authored-by: jianquanye@microsoft.com
  • Loading branch information
rraghav-cisco authored and mssonicbld committed Jan 15, 2025
1 parent 55575ba commit 1f42038
Show file tree
Hide file tree
Showing 6 changed files with 236 additions and 19 deletions.
79 changes: 79 additions & 0 deletions tests/pfcwd/cisco/default_pfc_time.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# Verified on Q200 @ 100G port speed. e.g. 687 is bit time to pause for 50ms (clock at 900Mhz).

def get_ifg_reg_list(slice_idx):
    ''' Return the ifg register node(s) of slice `slice_idx` as an indexable sequence.

    Gr2 does not have an ifg list, so its single ifg node is wrapped in a
    one-element list; other devices return the slice's ifg list as-is. '''
    if not is_graphene2:  # noqa: F821
        return tree.slice[slice_idx].ifg  # noqa: F821
    return [tree.slice[slice_idx].ifg]  # noqa: F821


def get_ifgb(ifg_root):
    ''' Return the ifgb register block of an ifg node.

    Takes tree.slice[slice_idx].ifg[ifg_idx]. The attribute holding the block
    differs per asic: `ifgbe_ra` on graphene2, `ifgbe_mac` on gr, and `ifgb`
    on everything else. '''
    if is_graphene2:  # noqa: F821
        return ifg_root.ifgbe_ra
    if is_gr:  # noqa: F821
        return ifg_root.ifgbe_mac
    return ifg_root.ifgb


def set_pfc_512bit_time(interface, bit_time, num_serdes_lanes):
    ''' Write `bit_time` into the port_512bit_time field of fc_port_cfg0.

    The register index starts at the serdes index resolved for `interface`
    and covers `num_serdes_lanes` consecutive entries. Each entry is updated
    with a read-modify-write so its other fields are preserved. '''
    lane_id = port_to_sai_lane_map[interface]  # noqa: F821
    slice_idx, ifg_idx, first_serdes = sai_lane_to_slice_ifg_pif(lane_id)  # noqa: F821
    for serdes in range(first_serdes, first_serdes + num_serdes_lanes):
        ifg_node = get_ifg_reg_list(slice_idx)[ifg_idx]
        cfg_reg = get_ifgb(ifg_node).fc_port_cfg0[serdes]
        value = dd0.read_register(cfg_reg)  # noqa: F821
        value.port_512bit_time = bit_time
        dd0.write_register(cfg_reg, value)  # noqa: F821


def compute_fractional_512bit_value(mac_freq_khz, port_gbps):
    ''' For G100 and G200.

    Compute how many MAC clock cycles it takes to transmit 512 bits at
    `port_gbps` and encode the result as a fixed-point number: the integer
    part is shifted left by 10 bits and the fraction, scaled by 1024 and
    truncated, occupies the low 10 bits. '''
    mac_freq_ghz = mac_freq_khz / 1000000.
    cycles = 512.0 * mac_freq_ghz / port_gbps
    print("Cycles per 512bits: {}".format(cycles))
    whole = int(cycles)
    fraction = cycles - whole
    print("Integer: {}".format(whole))
    print("Fraction: {}".format(fraction))
    return (whole << 10) + int(fraction * 1024)


# Restore the default 512-bit time for the port under test.
#
# The test fixture rewrites the port-name placeholder in this file (first
# occurrence on each line) before running it, so the placeholder must always
# appear inside a string literal; a bare placeholder would turn into an
# undefined identifier after substitution.
bit_time = None
if is_pac or is_gb:  # noqa: F821
    # These devices use a constant value.
    bit_time = 5
elif is_gr or is_graphene2:  # noqa: F821
    mac_freq_khz = d0.get_int_property(sdk.la_device_property_e_MAC_FREQUENCY)  # noqa: F821
    print("Mac frequency khz: {}".format(mac_freq_khz))

    # NOTE(review): assumes get_mac_port accepts the port name as a string,
    # like port_to_sai_lane_map does - confirm against the debug shell helpers.
    mac_port = get_mac_port("INTERFACE")  # noqa: F821
    mac_port_speed_enum_val = mac_port.get_speed()

    # Find matching speed enum: scan the port object's port_speed_e_E_*
    # attributes for the one equal to the current speed and keep its suffix.
    starter_str = "port_speed_e_E_"
    speed = None
    for field in dir(mac_port):
        if field.startswith(starter_str):
            poss_speed_enum_val = getattr(mac_port, field)
            if mac_port_speed_enum_val == poss_speed_enum_val:
                speed = field[len(starter_str):]
                break
    assert speed is not None, "Failed to find matching speed for mac port enum value {}".format(mac_port_speed_enum_val)
    print("Speed string: {}".format(speed))
    # The suffix is expected to look like "<digits>G"; strip the unit to get Gbps.
    assert speed[-1] == "G", "Unexpected speed, expected trailing 'G'"
    gbps_str = speed[:-1]
    assert gbps_str.isdigit(), "Non-digit speed {}".format(gbps_str)
    gbps = int(gbps_str)
    print("Port speed gbps: {}".format(gbps))
    bit_time = compute_fractional_512bit_value(mac_freq_khz, gbps)


# bit_time stays None when none of the device-type flags above matched.
assert bit_time is not None, "Failed to find an appropriate 512bit time on this device"
print("Setting 512bit register to normal value {}".format(bit_time))
set_pfc_512bit_time("INTERFACE", bit_time, 1)
print("Done")
70 changes: 70 additions & 0 deletions tests/pfcwd/cisco/set_pfc_time.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# Verified on Q200 @ 100G port speed. e.g. 687 is bit time to pause for 50ms (clock at 900Mhz).

import math


def get_ifg_reg_list(slice_idx):
    ''' Return the ifg register node(s) of slice `slice_idx` as an indexable sequence.

    Gr2 does not have an ifg list, so its single ifg node is wrapped in a
    one-element list; other devices return the slice's ifg list as-is. '''
    if not is_graphene2:  # noqa: F821
        return tree.slice[slice_idx].ifg  # noqa: F821
    return [tree.slice[slice_idx].ifg]  # noqa: F821


def get_ifgb(ifg_root):
    ''' Return the ifgb register block of an ifg node.

    Takes tree.slice[slice_idx].ifg[ifg_idx]. The attribute holding the block
    differs per asic: `ifgbe_ra` on graphene2, `ifgbe_mac` on gr, and `ifgb`
    on everything else. '''
    if is_graphene2:  # noqa: F821
        return ifg_root.ifgbe_ra
    if is_gr:  # noqa: F821
        return ifg_root.ifgbe_mac
    return ifg_root.ifgb


def set_pfc_512bit_time(interface, bit_time, num_serdes_lanes):
    ''' Write `bit_time` into the port_512bit_time field of fc_port_cfg0.

    The register index starts at the serdes index resolved for `interface`
    and covers `num_serdes_lanes` consecutive entries. Each entry is updated
    with a read-modify-write so its other fields are preserved. '''
    lane_id = port_to_sai_lane_map[interface]  # noqa: F821
    slice_idx, ifg_idx, first_serdes = sai_lane_to_slice_ifg_pif(lane_id)  # noqa: F821
    for serdes in range(first_serdes, first_serdes + num_serdes_lanes):
        ifg_node = get_ifg_reg_list(slice_idx)[ifg_idx]
        cfg_reg = get_ifgb(ifg_node).fc_port_cfg0[serdes]
        value = dd0.read_register(cfg_reg)  # noqa: F821
        value.port_512bit_time = bit_time
        dd0.write_register(cfg_reg, value)  # noqa: F821


def set_pfc512_bit_sec(interface, time_sec):
    ''' Program the 512-bit time of `interface` from a duration in seconds.

    The register value is `time_sec` divided by 65535 clock periods of the
    device (gb/pac) or MAC (gr/graphene2) clock.
    NOTE(review): 65535 is presumably the maximum PFC pause quanta value, so
    a full-length pause frame lasts `time_sec` - confirm.

    gb/pac take the value rounded up to a whole number of clocks.
    gr/graphene2 take it as fixed point (integer part shifted left by 10,
    fraction scaled by 1024 in the low bits), clamped to (2 ** 18) - 1.
    Any other device type fails an assertion. '''

    def clocks_for(freq_khz):
        # One clock period in seconds, then the number of clocks per 1/65535 of time_sec.
        period_sec = 1. / (freq_khz * 1000)
        return time_sec / (65535 * period_sec)

    if is_gb or is_pac:  # noqa: F821
        khz = d0.get_int_property(sdk.la_device_property_e_DEVICE_FREQUENCY)  # noqa: F821
        print("Device frequency khz: {}".format(khz))
        bit_time = math.ceil(clocks_for(khz))
    elif is_gr or is_graphene2:  # noqa: F821
        khz = d0.get_int_property(sdk.la_device_property_e_MAC_FREQUENCY)  # noqa: F821
        print("Mac frequency khz: {}".format(khz))
        clocks = clocks_for(khz)
        whole = int(clocks)
        fraction = clocks - whole
        print("Integer: {}".format(whole))
        print("Float: {}".format(fraction))
        bit_time = (whole << 10) + int(fraction * 1024)
        ceiling = (2 ** 18) - 1
        if bit_time > ceiling:
            print("Maxed out, setting bit time {} instead of {}".format(ceiling, bit_time))
            bit_time = ceiling
    else:
        assert False, "Unsupported device type"

    print("Setting bit_time (number of clocks) to {}".format(bit_time))
    set_pfc_512bit_time(interface, bit_time, num_serdes_lanes=1)


# Increase PFC pause time
num_ms = 50
print("Setting PFC frame time to {}ms".format(num_ms))
# Divide by a float literal, like the other divisions in this script, so the
# result stays 0.05 even where "/" on two ints is integer division.
set_pfc512_bit_sec("INTERFACE", num_ms / 1000.)
60 changes: 60 additions & 0 deletions tests/pfcwd/conftest.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import logging
import pytest
import os
import os.path

from tests.common.fixtures.conn_graph_facts import conn_graph_facts # noqa F401
from tests.common.fixtures.ptfhost_utils import copy_ptftests_directory # noqa F401
Expand Down Expand Up @@ -244,3 +246,61 @@ def pfcwd_pause_service(ptfhost):
needs_resume["garp_service"] = False

logger.debug("pause_service needs_resume {}".format(needs_resume))


@pytest.fixture(scope="function", autouse=False)
def set_pfc_time_cisco_8000(
        duthosts,
        enum_rand_one_per_hwsku_frontend_hostname,
        setup_pfc_test):
    """
    Lengthen the PFC pause time on every test port for the duration of a test.

    Only applies to cisco-8000 modular-chassis DUTs; on any other DUT the
    fixture yields immediately and changes nothing. Before the test, the
    "set" script is run for each port in setup_pfc_test['test_ports']; after
    the test, the "default" script is run for each port to restore it.

    Args:
        duthosts: collection of DUT hosts, indexed by hostname
        enum_rand_one_per_hwsku_frontend_hostname: hostname of the selected DUT
        setup_pfc_test(dict): PFC test setup info; only 'test_ports' is used
    """
    duthost = duthosts[enum_rand_one_per_hwsku_frontend_hostname]
    test_ports = setup_pfc_test['test_ports']

    # Lets limit this to cisco and T2 only.
    if not (duthost.facts['asic_type'] == "cisco-8000"
            and duthost.get_facts().get("modular_chassis")):
        yield
        return

    PFC_TIME_SET_SCRIPT = "pfcwd/cisco/set_pfc_time.py"
    PFC_TIME_RESET_SCRIPT = "pfcwd/cisco/default_pfc_time.py"

    def run_script_on_test_ports(script):
        # Run `script` once per test port, targeting the port's ASIC on
        # multi-asic DUTs ("" means no specific ASIC).
        for port in test_ports:
            asic_id = ""
            if duthost.sonichost.is_multi_asic:
                asic_id = duthost.get_port_asic_instance(port).asic_index
            set_pfc_timer_cisco_8000(
                duthost,
                asic_id,
                script,
                port)

    # try/finally so that a failure part-way through setup still restores
    # the ports that were already reprogrammed.
    try:
        run_script_on_test_ports(PFC_TIME_SET_SCRIPT)
        yield
    finally:
        run_script_on_test_ports(PFC_TIME_RESET_SCRIPT)


def set_pfc_timer_cisco_8000(duthost, asic_id, script, port):
    """
    Copy a PFC-time script to the DUT, bind it to a port and run it on the NPU.

    The script is copied to /tmp on the DUT, its INTERFACE placeholder is
    replaced with `port` (first occurrence on each line), the result is copied
    into the syncd container(s), and it is executed with
    "show platform npu script".

    Args:
        duthost: DUT host object used to copy files and run commands
        asic_id: ASIC index of the port, or "" when no ASIC is targeted
        script(str): path of the script to run
        port(str): port name substituted for the placeholder
    """
    script_name = os.path.basename(script)
    dut_script_path = f"/tmp/{script_name}"
    duthost.copy(src=script, dest=dut_script_path)
    duthost.shell(f"sed -i 's/INTERFACE/{port}/' {dut_script_path}")
    duthost.docker_copy_to_all_asics(
        container_name=f"syncd{asic_id}",
        src=dut_script_path,
        dst="/")

    # Compare against the "" sentinel instead of truthiness: ASIC index 0 is
    # a valid ASIC and must still get its "-n asic0" argument.
    asic_arg = ""
    if asic_id is not None and asic_id != "":
        asic_arg = f"-n asic{asic_id}"
    duthost.shell(f"show platform npu script {asic_arg} -s {script_name}")
3 changes: 2 additions & 1 deletion tests/pfcwd/test_pfcwd_all_port_storm.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,8 @@ def run_test(self, duthost, storm_hndle, expect_regex, syslog_marker, action):
time.sleep(5)

def test_all_port_storm_restore(self, duthosts, enum_rand_one_per_hwsku_frontend_hostname,
storm_test_setup_restore, setup_pfc_test, ptfhost):
storm_test_setup_restore, setup_pfc_test, ptfhost,
set_pfc_time_cisco_8000):
"""
Tests PFC storm/restore on all ports
Expand Down
41 changes: 24 additions & 17 deletions tests/pfcwd/test_pfcwd_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -504,8 +504,10 @@ def __init__(self, ptf, router_mac, tx_mac, pfc_params, is_dualtor):
ptf_params(dict) : all PFC test params specific to the DUT port
"""
self.ptf = ptf
self.router_mac = router_mac
self.tx_mac = tx_mac
self.tx_mac = router_mac
self.src_port_mac = router_mac
self.router_mac = tx_mac
self.dst_port_mac = tx_mac
self.pfc_queue_index = pfc_params['queue_index']
self.pfc_wd_test_pkt_count = pfc_params['test_pkt_count']
self.pfc_wd_rx_port_id = pfc_params['rx_port_id']
Expand Down Expand Up @@ -552,7 +554,7 @@ def verify_tx_egress(self, action):
if self.pfc_wd_test_port_vlan_id is not None:
ptf_params['port_dst_vlan_id'] = self.pfc_wd_test_port_vlan_id
log_format = datetime.datetime.now().strftime("%Y-%m-%d-%H:%M:%S")
log_file = "/tmp/pfc_wd.PfcWdTest.{}.log".format(log_format)
log_file = "/tmp/1.pfc_wd.PfcWdTest.{}.log".format(log_format)
ptf_runner(self.ptf, "ptftests", "pfc_wd.PfcWdTest", "ptftests", params=ptf_params,
log_file=log_file, is_python3=True)

Expand All @@ -569,8 +571,8 @@ def verify_rx_ingress(self, action):
dst_port = "".join(str(self.pfc_wd_rx_port_id)).replace(',', '')
else:
dst_port = "[ " + str(self.pfc_wd_rx_port_id) + " ]"
ptf_params = {'router_mac': self.tx_mac,
'vlan_mac': self.vlan_mac if self.is_dualtor else self.tx_mac,
ptf_params = {'router_mac': self.dst_port_mac,
'vlan_mac': self.vlan_mac if self.is_dualtor else self.dst_port_mac,
'queue_index': self.pfc_queue_index,
'pkt_count': self.pfc_wd_test_pkt_count,
'port_src': self.pfc_wd_test_port_id,
Expand All @@ -583,7 +585,7 @@ def verify_rx_ingress(self, action):
if self.pfc_wd_test_port_vlan_id is not None:
ptf_params['port_src_vlan_id'] = self.pfc_wd_test_port_vlan_id
log_format = datetime.datetime.now().strftime("%Y-%m-%d-%H:%M:%S")
log_file = "/tmp/pfc_wd.PfcWdTest.{}.log".format(log_format)
log_file = "/tmp/2.pfc_wd.PfcWdTest.{}.log".format(log_format)
ptf_runner(self.ptf, "ptftests", "pfc_wd.PfcWdTest", "ptftests", params=ptf_params,
log_file=log_file, is_python3=True)

Expand All @@ -603,7 +605,7 @@ def verify_other_pfc_queue(self):
other_queue = self.pfc_queue_index + 1

ptf_params = {'router_mac': self.router_mac,
'vlan_mac': self.vlan_mac,
'vlan_mac': self.src_port_mac,
'queue_index': other_queue,
'pkt_count': self.pfc_wd_test_pkt_count,
'port_src': self.pfc_wd_rx_port_id[0],
Expand All @@ -616,7 +618,7 @@ def verify_other_pfc_queue(self):
if self.pfc_wd_test_port_vlan_id is not None:
ptf_params['port_dst_vlan_id'] = self.pfc_wd_test_port_vlan_id
log_format = datetime.datetime.now().strftime("%Y-%m-%d-%H:%M:%S")
log_file = "/tmp/pfc_wd.PfcWdTest.{}.log".format(log_format)
log_file = "/tmp/3.pfc_wd.PfcWdTest.{}.log".format(log_format)
ptf_runner(self.ptf, "ptftests", "pfc_wd.PfcWdTest", "ptftests", params=ptf_params,
log_file=log_file, is_python3=True)

Expand All @@ -636,7 +638,7 @@ def verify_other_pfc_pg(self):
other_pg = self.pfc_queue_index + 1

ptf_params = {'router_mac': self.tx_mac,
'vlan_mac': self.vlan_mac if self.is_dualtor else self.tx_mac,
'vlan_mac': self.vlan_mac if self.is_dualtor else self.dst_port_mac,
'queue_index': other_pg,
'pkt_count': self.pfc_wd_test_pkt_count,
'port_src': self.pfc_wd_test_port_id,
Expand All @@ -649,7 +651,7 @@ def verify_other_pfc_pg(self):
if self.pfc_wd_test_port_vlan_id is not None:
ptf_params['port_src_vlan_id'] = self.pfc_wd_test_port_vlan_id
log_format = datetime.datetime.now().strftime("%Y-%m-%d-%H:%M:%S")
log_file = "/tmp/pfc_wd.PfcWdTest.{}.log".format(log_format)
log_file = "/tmp/4.pfc_wd.PfcWdTest.{}.log".format(log_format)
ptf_runner(self.ptf, "ptftests", "pfc_wd.PfcWdTest", "ptftests", params=ptf_params,
log_file=log_file, is_python3=True)

Expand All @@ -659,7 +661,7 @@ def fill_buffer(self):
"""
logger.info("Send packets to {} to fill up the buffer".format(self.pfc_wd_test_port))
ptf_params = {'router_mac': self.router_mac,
'vlan_mac': self.vlan_mac,
'vlan_mac': self.src_port_mac,
'queue_index': self.pfc_queue_index,
'pkt_count': self.pfc_wd_test_pkt_count,
'port_src': self.pfc_wd_rx_port_id[0],
Expand All @@ -672,7 +674,7 @@ def fill_buffer(self):
if self.pfc_wd_test_port_vlan_id is not None:
ptf_params['port_dst_vlan_id'] = self.pfc_wd_test_port_vlan_id
log_format = datetime.datetime.now().strftime("%Y-%m-%d-%H:%M:%S")
log_file = "/tmp/pfc_wd.PfcWdTest.{}.log".format(log_format)
log_file = "/tmp/5.pfc_wd.PfcWdTest.{}.log".format(log_format)
ptf_runner(self.ptf, "ptftests", "pfc_wd.PfcWdTest", "ptftests", params=ptf_params,
log_file=log_file, is_python3=True)

Expand Down Expand Up @@ -855,7 +857,8 @@ def set_traffic_action(self, duthost, action):
def test_pfcwd_actions(self, request, fake_storm, setup_pfc_test, setup_dut_test_params, enum_fanout_graph_facts, # noqa F811
ptfhost, duthosts, enum_rand_one_per_hwsku_frontend_hostname, fanouthosts,
setup_standby_ports_on_non_enum_rand_one_per_hwsku_frontend_host_m_unconditionally, # noqa F811
toggle_all_simulator_ports_to_enum_rand_one_per_hwsku_frontend_host_m): # noqa F811
toggle_all_simulator_ports_to_enum_rand_one_per_hwsku_frontend_host_m, # noqa F811
set_pfc_time_cisco_8000): # noqa F811
"""
PFCwd functional test
Expand Down Expand Up @@ -933,7 +936,8 @@ def test_pfcwd_actions(self, request, fake_storm, setup_pfc_test, setup_dut_test
def test_pfcwd_multi_port(self, request, fake_storm, setup_pfc_test, setup_dut_test_params, enum_fanout_graph_facts, # noqa F811
ptfhost, duthosts, enum_rand_one_per_hwsku_frontend_hostname, fanouthosts,
setup_standby_ports_on_non_enum_rand_one_per_hwsku_frontend_host_m_unconditionally, # noqa F811
toggle_all_simulator_ports_to_enum_rand_one_per_hwsku_frontend_host_m): # noqa F811
toggle_all_simulator_ports_to_enum_rand_one_per_hwsku_frontend_host_m, # noqa F811
set_pfc_time_cisco_8000): # noqa F811
"""
Tests pfcwd behavior when 2 ports are under pfc storm one after the other
Expand Down Expand Up @@ -1014,7 +1018,8 @@ def test_pfcwd_multi_port(self, request, fake_storm, setup_pfc_test, setup_dut_t
def test_pfcwd_mmu_change(self, request, fake_storm, setup_pfc_test, setup_dut_test_params, enum_fanout_graph_facts, # noqa F811
ptfhost, duthosts, enum_rand_one_per_hwsku_frontend_hostname, fanouthosts, dualtor_ports, # noqa F811
setup_standby_ports_on_non_enum_rand_one_per_hwsku_frontend_host_m_unconditionally, # noqa F811
toggle_all_simulator_ports_to_enum_rand_one_per_hwsku_frontend_host_m): # noqa F811
toggle_all_simulator_ports_to_enum_rand_one_per_hwsku_frontend_host_m, # noqa F811
set_pfc_time_cisco_8000): # noqa F811
"""
Tests if mmu changes impact Pfcwd functionality
Expand Down Expand Up @@ -1108,7 +1113,8 @@ def test_pfcwd_mmu_change(self, request, fake_storm, setup_pfc_test, setup_dut_t
def test_pfcwd_port_toggle(self, request, fake_storm, setup_pfc_test, setup_dut_test_params, enum_fanout_graph_facts, # noqa F811
tbinfo, ptfhost, duthosts, enum_rand_one_per_hwsku_frontend_hostname, fanouthosts,
setup_standby_ports_on_non_enum_rand_one_per_hwsku_frontend_host_m_unconditionally, # noqa F811
toggle_all_simulator_ports_to_enum_rand_one_per_hwsku_frontend_host_m): # noqa F811
toggle_all_simulator_ports_to_enum_rand_one_per_hwsku_frontend_host_m, # noqa F811
set_pfc_time_cisco_8000): # noqa F811
"""
Test PfCWD functionality after toggling port
Expand Down Expand Up @@ -1213,7 +1219,8 @@ def test_pfcwd_port_toggle(self, request, fake_storm, setup_pfc_test, setup_dut_

def test_pfcwd_no_traffic(
self, request, setup_pfc_test, setup_dut_test_params, enum_fanout_graph_facts, # noqa F811
ptfhost, duthosts, enum_rand_one_per_hwsku_frontend_hostname, fanouthosts):
ptfhost, duthosts, enum_rand_one_per_hwsku_frontend_hostname, fanouthosts,
set_pfc_time_cisco_8000): # noqa F811
"""
Verify the pfcwd is not triggered when no traffic is sent, even when pfc storm is active.
Args:
Expand Down
2 changes: 1 addition & 1 deletion tests/pfcwd/test_pfcwd_timer_accuracy.py
Original file line number Diff line number Diff line change
Expand Up @@ -353,7 +353,7 @@ def retrieve_timestamp(self, pattern):
return int(0)

def test_pfcwd_timer_accuracy(self, duthosts, ptfhost, enum_rand_one_per_hwsku_frontend_hostname,
pfcwd_timer_setup_restore, fanouthosts):
pfcwd_timer_setup_restore, fanouthosts, set_pfc_time_cisco_8000):
"""
Tests PFCwd timer accuracy
Expand Down

0 comments on commit 1f42038

Please sign in to comment.