From 399a1b62b031fdca43fdce12648fdccc32bf72e8 Mon Sep 17 00:00:00 2001 From: ansrajpu-git <113939367+ansrajpu-git@users.noreply.github.com> Date: Mon, 25 Nov 2024 22:46:17 -0500 Subject: [PATCH] [CHASSIS][Voq][QoS]Increasing LACP timer for lag ports for broadcom-dnx neighbor EOS host (#14469) Description of PR Intermittently, the testQosSaiLossyQueue test fails due to a port-channel flap on the broadcom-dnx T2 Voq chassis. The reason the port-channel goes down is that this test requires disabling TX on the egress port (which is a member of a port-channel). With the huge buffer size, it takes a longer time to send packets. This causes the TX LACP packets to stop egressing, so after 3 LACP packets are missed (~90s) on the server side the LAG is torn down. Issue # #11682 Summary: Fixes # (issue) What is the motivation for this PR? Intermittently, the testQosSaiLossyQueue test fails due to a port-channel flap. How did you do it? The LACP timer multiplier on the EOS host is configurable. By default, the timeout is 30 secs with a failure tolerance of 3. We changed the multiplier to an increased value to hold the connectivity for some time until all packets are sent. The changes are reverted after test case execution. How did you verify/test it? Executed the QoS test cases and verified the results. 
--- tests/common/devices/eos.py | 18 ++++++++++++++ tests/qos/qos_sai_base.py | 47 +++++++++++++++++++++++++++++++++++++ tests/qos/test_qos_sai.py | 12 +++++----- 3 files changed, 71 insertions(+), 6 deletions(-) diff --git a/tests/common/devices/eos.py b/tests/common/devices/eos.py index 35f28ab3e85..e2ce0bb06dc 100644 --- a/tests/common/devices/eos.py +++ b/tests/common/devices/eos.py @@ -556,3 +556,21 @@ def no_isis_metric(self, interface): lines=['no isis metric'], parents=['interface {}'.format(interface)]) return not self._has_cli_cmd_failed(out) + + def set_interface_lacp_time_multiplier(self, interface_name, multiplier): + out = self.eos_config( + lines=['lacp timer multiplier %d' % multiplier], + parents='interface %s' % interface_name) + + if out['failed'] is True or out['changed'] is False: + logging.warning("Unable to set interface [%s] lacp timer multiplier to [%d]" % (interface_name, multiplier)) + else: + logging.info("Set interface [%s] lacp timer to [%d]" % (interface_name, multiplier)) + return out + + def no_lacp_time_multiplier(self, interface_name): + out = self.eos_config( + lines=['no lacp timer multiplier'], + parents=['interface {}'.format(interface_name)]) + logging.info('Reset lacp timer to default for interface [%s]' % interface_name) + return out diff --git a/tests/qos/qos_sai_base.py b/tests/qos/qos_sai_base.py index 574dbc3c2a9..d5ba38e9218 100644 --- a/tests/qos/qos_sai_base.py +++ b/tests/qos/qos_sai_base.py @@ -27,6 +27,7 @@ from tests.common.system_utils import docker # noqa F401 from tests.common.errors import RunAnsibleModuleFail from tests.common import config_reload +from tests.common.devices.eos import EosHost logger = logging.getLogger(__name__) @@ -2577,3 +2578,49 @@ def isLonglink(self, dut_host): if cable_length >= 120000: return True return False + + @pytest.fixture(scope="function", autouse=False) + def change_lag_lacp_timer(self, duthosts, get_src_dst_asic_and_duts, tbinfo, nbrhosts, dutConfig, dutTestParams, + 
request): + if request.config.getoption("--neighbor_type") == "sonic": + yield + return + + if ('platform_asic' in dutTestParams["basicParams"] and + dutTestParams["basicParams"]["platform_asic"] == "broadcom-dnx"): + src_dut = get_src_dst_asic_and_duts['src_dut'] + dst_dut = get_src_dst_asic_and_duts['dst_dut'] + if src_dut.sonichost.is_multi_asic and dst_dut.sonichost.is_multi_asic: + dst_mgfacts = dst_dut.get_extended_minigraph_facts(tbinfo) + dst_port_id = dutConfig['testPorts']['dst_port_id'] + dst_interface = dutConfig['dutInterfaces'][dst_port_id] + lag_name = '' + for port_ch, port_intf in dst_mgfacts['minigraph_portchannels'].items(): + if dst_interface in port_intf['members']: + lag_name = port_ch + break + if lag_name == '': + yield + return + lag_facts = dst_dut.lag_facts(host=dst_dut.hostname)['ansible_facts']['lag_facts'] + po_interfaces = lag_facts['lags'][lag_name]['po_config']['ports'] + vm_neighbors = dst_mgfacts['minigraph_neighbors'] + neighbor_lag_intfs = [vm_neighbors[po_intf]['port'] for po_intf in po_interfaces] + neigh_intf = next(iter(po_interfaces.keys())) + peer_device = vm_neighbors[neigh_intf]['name'] + vm_host = nbrhosts[peer_device]['host'] + num = 600 + for neighbor_lag_member in neighbor_lag_intfs: + logger.info( + "Changing lacp timer multiplier to 600 for %s in %s" % (neighbor_lag_member, peer_device)) + if isinstance(vm_host, EosHost): + vm_host.set_interface_lacp_time_multiplier(neighbor_lag_member, num) + + yield + if ('platform_asic' in dutTestParams["basicParams"] and + dutTestParams["basicParams"]["platform_asic"] == "broadcom-dnx"): + if src_dut.sonichost.is_multi_asic and dst_dut.sonichost.is_multi_asic: + for neighbor_lag_member in neighbor_lag_intfs: + logger.info( + "Changing lacp timer multiplier to default for %s in %s" % (neighbor_lag_member, peer_device)) + vm_host.no_lacp_time_multiplier(neighbor_lag_member) diff --git a/tests/qos/test_qos_sai.py b/tests/qos/test_qos_sai.py index 3463fc09800..9c3ff343493 100644 
--- a/tests/qos/test_qos_sai.py +++ b/tests/qos/test_qos_sai.py @@ -69,7 +69,7 @@ def ignore_expected_loganalyzer_exception(get_src_dst_asic_and_duts, loganalyzer # The following error log is related to the bug of https://github.com/sonic-net/sonic-buildimage/issues/13265 ".*ERR lldp[0-9]*#lldpmgrd.*Command failed.*lldpcli.*configure.*ports.*unable to connect to socket.*", ".*ERR lldp[0-9]*#lldpmgrd.*Command failed.*lldpcli.*configure.*ports.*lldp.*unknown command from argument" - ".*configure.*command was failed.*times, disabling retry.*" + ".*configure.*command was failed.*times, disabling retry.*", # Error related to syncd socket-timeout intermittenly ".*ERR syncd[0-9]*#dsserve: _ds2tty broken pipe.*" ] @@ -325,7 +325,7 @@ def testParameter( def testQosSaiPfcXoffLimit( self, xoffProfile, duthosts, get_src_dst_asic_and_duts, ptfhost, dutTestParams, dutConfig, dutQosConfig, - ingressLosslessProfile, egressLosslessProfile + ingressLosslessProfile, egressLosslessProfile, change_lag_lacp_timer ): # NOTE: this test will be skipped for t2 cisco 8800 if it's not xoff_1 or xoff_2 """ @@ -1147,7 +1147,7 @@ def testQosSaiBufferPoolWatermark( def testQosSaiLossyQueue( self, ptfhost, get_src_dst_asic_and_duts, dutTestParams, dutConfig, dutQosConfig, - ingressLossyProfile, skip_src_dst_different_asic + ingressLossyProfile, skip_src_dst_different_asic, change_lag_lacp_timer ): """ Test QoS SAI Lossy queue, shared buffer dynamic allocation @@ -1591,7 +1591,7 @@ def testQosSaiDwrr( @pytest.mark.parametrize("pgProfile", ["wm_pg_shared_lossless", "wm_pg_shared_lossy"]) def testQosSaiPgSharedWatermark( self, pgProfile, ptfhost, get_src_dst_asic_and_duts, dutTestParams, dutConfig, dutQosConfig, - resetWatermark, _skip_watermark_multi_DUT, skip_src_dst_different_asic + resetWatermark, _skip_watermark_multi_DUT, skip_src_dst_different_asic, change_lag_lacp_timer ): """ Test QoS SAI PG shared watermark test for lossless/lossy traffic @@ -1683,7 +1683,7 @@ def 
testQosSaiPgSharedWatermark( def testQosSaiPgHeadroomWatermark( self, ptfhost, get_src_dst_asic_and_duts, dutTestParams, dutConfig, dutQosConfig, resetWatermark, - ): + change_lag_lacp_timer): """ Test QoS SAI PG headroom watermark test @@ -1793,7 +1793,7 @@ def testQosSaiPGDrop( @pytest.mark.parametrize("queueProfile", ["wm_q_shared_lossless", "wm_q_shared_lossy"]) def testQosSaiQSharedWatermark( self, get_src_dst_asic_and_duts, queueProfile, ptfhost, dutTestParams, dutConfig, dutQosConfig, - resetWatermark, _skip_watermark_multi_DUT, skip_pacific_dst_asic + resetWatermark, _skip_watermark_multi_DUT, skip_pacific_dst_asic, change_lag_lacp_timer ): """ Test QoS SAI Queue shared watermark test for lossless/lossy traffic