diff --git a/tests/bgp/bgp_helpers.py b/tests/bgp/bgp_helpers.py index d14770146c..a129bdbbd7 100644 --- a/tests/bgp/bgp_helpers.py +++ b/tests/bgp/bgp_helpers.py @@ -15,6 +15,7 @@ from tests.common.helpers.assertions import pytest_assert from tests.common.helpers.constants import UPSTREAM_NEIGHBOR_MAP, DOWNSTREAM_NEIGHBOR_MAP, DEFAULT_NAMESPACE, \ DEFAULT_ASIC_ID +from tests.common.helpers.multi_thread_utils import SafeThreadPoolExecutor from tests.common.helpers.parallel import reset_ansible_local_tmp from tests.common.helpers.parallel import parallel_run from tests.common.utilities import wait_until @@ -896,12 +897,16 @@ def initial_tsa_check_before_and_after_test(duthosts): pytest_assert('false' == get_tsa_chassisdb_config(duthost), "Supervisor {} tsa_enabled config is enabled".format(duthost.hostname)) - for linecard in duthosts.frontend_nodes: - # Issue TSB on the line card before proceeding further - if verify_dut_configdb_tsa_value(linecard) is not False or get_tsa_chassisdb_config(linecard) != 'false' or \ - get_traffic_shift_state(linecard, cmd='TSC no-stats') != TS_NORMAL: - linecard.shell('TSB') - linecard.shell('sudo config save -y') + def run_tsb_on_linecard_and_verify(lc): + if verify_dut_configdb_tsa_value(lc) is not False or get_tsa_chassisdb_config(lc) != 'false' or \ + get_traffic_shift_state(lc, cmd='TSC no-stats') != TS_NORMAL: + lc.shell('TSB') + lc.shell('sudo config save -y') # Ensure that the DUT is not in maintenance already before start of the test - pytest_assert(TS_NORMAL == get_traffic_shift_state(linecard, cmd='TSC no-stats'), + pytest_assert(TS_NORMAL == get_traffic_shift_state(lc, cmd='TSC no-stats'), "DUT is not in normal state") + + # Issue TSB on the line card before proceeding further + with SafeThreadPoolExecutor(max_workers=8) as executor: + for linecard in duthosts.frontend_nodes: + executor.submit(run_tsb_on_linecard_and_verify, linecard) diff --git a/tests/bgp/test_startup_tsa_tsb_service.py b/tests/bgp/test_startup_tsa_tsb_service.py index 5432b0d042..f3c592f273 100644 --- a/tests/bgp/test_startup_tsa_tsb_service.py +++ b/tests/bgp/test_startup_tsa_tsb_service.py @@ -1,7 +1,10 @@ import logging import datetime +import threading + import pytest from tests.common import reboot, config_reload +from tests.common.helpers.multi_thread_utils import SafeThreadPoolExecutor from tests.common.reboot import get_reboot_cause, SONIC_SSH_PORT, SONIC_SSH_REGEX, wait_for_startup from tests.common.helpers.assertions import pytest_assert from tests.common.utilities import wait_until @@ -12,6 +15,7 @@ from tests.bgp.route_checker import parse_routes_on_neighbors, check_and_log_routes_diff, \ verify_current_routes_announced_to_neighs, verify_only_loopback_routes_are_announced_to_neighs from tests.bgp.constants import TS_NORMAL, TS_MAINTENANCE +from tests.conftest import get_hosts_per_hwsku pytestmark = [ pytest.mark.topology('t2') @@ -30,6 +34,9 @@ SSH_STATE_ABSENT = "absent" SSH_STATE_STARTED = "started" +lock = threading.Lock() +_cached_frontend_nodes = None + @pytest.fixture(scope="module", autouse=True) def enable_disable_startup_tsa_tsb_service(duthosts): @@ -139,18 +146,19 @@ def check_tsa_tsb_service_run_time_diff(service_uptime, configured_service_timer return int(actual_service_timer) < configured_service_timer -def nbrhosts_to_dut(duthost, nbrhosts): +def nbrhosts_to_dut(duthost, nbrhosts, dut_nbrhosts): """ @summary: Fetch the neighbor hosts' details for duthost - @returns: dut_nbrhosts dict """ mg_facts = duthost.minigraph_facts(host=duthost.hostname)['ansible_facts'] - dut_nbrhosts = {} + all_nbhhosts = {} for host in nbrhosts.keys(): if host in mg_facts['minigraph_devices']: new_nbrhost = {host: nbrhosts[host]} - dut_nbrhosts.update(new_nbrhost) - return dut_nbrhosts + all_nbhhosts.update(new_nbrhost) + + with lock: + dut_nbrhosts[duthost] = all_nbhhosts def check_ssh_state(localhost, dut_ip, expected_state, timeout=60): @@ -174,268 +182,362 @@ def check_ssh_state(localhost, dut_ip, expected_state, timeout=60): return not res.is_failed and 'Timeout' not in res.get('msg', '') +def verify_route_on_neighbors(linecards, dut_nbrhosts, orig_v4_routes, orig_v6_routes): + for linecard in linecards: + # Wait until all routes are announced to neighbors + cur_v4_routes = {} + cur_v6_routes = {} + # Verify that all routes advertised to neighbor at the start of the test + if not wait_until(300, 3, 0, verify_current_routes_announced_to_neighs, linecard, dut_nbrhosts[linecard], + orig_v4_routes[linecard], cur_v4_routes, 4): + if not check_and_log_routes_diff(linecard, dut_nbrhosts[linecard], + orig_v4_routes[linecard], cur_v4_routes, 4): + pytest.fail("Not all ipv4 routes are announced to neighbors") + + if not wait_until(300, 3, 0, verify_current_routes_announced_to_neighs, linecard, dut_nbrhosts[linecard], + orig_v6_routes[linecard], cur_v6_routes, 6): + if not check_and_log_routes_diff(linecard, dut_nbrhosts[linecard], + orig_v6_routes[linecard], cur_v6_routes, 6): + pytest.fail("Not all ipv6 routes are announced to neighbors") + + +def get_frontend_nodes_per_hwsku(duthosts, request): + global _cached_frontend_nodes + if _cached_frontend_nodes is None: + _cached_frontend_nodes = [ + duthosts[hostname] for hostname in get_hosts_per_hwsku( + request, + [host.hostname for host in duthosts.frontend_nodes], + ) + ] + + return _cached_frontend_nodes + @pytest.mark.disable_loganalyzer -def test_tsa_tsb_service_with_dut_cold_reboot(duthosts, localhost, enum_rand_one_per_hwsku_frontend_hostname, ptfhost, - nbrhosts, traffic_shift_community, tbinfo): +def test_tsa_tsb_service_with_dut_cold_reboot(request, duthosts, localhost, nbrhosts, traffic_shift_community): """ Test startup TSA_TSB service after DUT cold reboot Verify startup_tsa_tsb.service started automatically when dut comes up Verify this service configures TSA and starts a timer and configures TSB once the timer is expired """ - duthost = duthosts[enum_rand_one_per_hwsku_frontend_hostname] - tsa_tsb_timer = get_startup_tsb_timer(duthost) - int_status_result, crit_process_check = True, True - if not tsa_tsb_timer: - pytest.skip("startup_tsa_tsb.service is not supported on the {}".format(duthost.hostname)) - dut_nbrhosts = nbrhosts_to_dut(duthost, nbrhosts) - if not check_tsa_persistence_support(duthost): - pytest.skip("TSA persistence not supported in the image") + frontend_nodes_per_hwsku = get_frontend_nodes_per_hwsku(duthosts, request) + for linecard in frontend_nodes_per_hwsku: + if not check_tsa_persistence_support(linecard): + pytest.skip("TSA persistence not supported in the image") + + tsa_tsb_timer, int_status_result, crit_process_check, up_bgp_neighbors = dict(), dict(), dict(), dict() + for linecard in frontend_nodes_per_hwsku: + tsa_tsb_timer[linecard] = get_startup_tsb_timer(linecard) + if not tsa_tsb_timer[linecard]: + pytest.skip("startup_tsa_tsb.service is not supported on the duts under {}".format(linecard.hostname)) + + int_status_result[linecard] = True + crit_process_check[linecard] = True + + dut_nbrhosts = dict() + with SafeThreadPoolExecutor(max_workers=8) as executor: + for linecard in frontend_nodes_per_hwsku: + executor.submit(nbrhosts_to_dut, linecard, nbrhosts, dut_nbrhosts) # Initially make sure both supervisor and line cards are in BGP operational normal state initial_tsa_check_before_and_after_test(duthosts) + for linecard in frontend_nodes_per_hwsku: + up_bgp_neighbors[linecard] = linecard.get_bgp_neighbors_per_asic("established") - up_bgp_neighbors = duthost.get_bgp_neighbors_per_asic("established") - + orig_v4_routes, orig_v6_routes = dict(), dict() try: # Get all routes on neighbors before doing reboot - orig_v4_routes = parse_routes_on_neighbors(duthost, nbrhosts, 4) - orig_v6_routes = parse_routes_on_neighbors(duthost, nbrhosts, 6) + for linecard in frontend_nodes_per_hwsku: + orig_v4_routes[linecard] = parse_routes_on_neighbors(linecard, dut_nbrhosts[linecard], 4) + orig_v6_routes[linecard] = parse_routes_on_neighbors(linecard, dut_nbrhosts[linecard], 6) - # Reboot dut and wait for startup_tsa_tsb service to start - logger.info("Cold reboot on node: %s", duthost.hostname) - reboot(duthost, localhost, wait=240) + def reboot_and_verify(lc): + # Reboot dut and wait for startup_tsa_tsb service to start + logger.info("Cold reboot on node: %s", lc.hostname) + reboot(lc, localhost, wait=240) - logger.info('Cold reboot finished on {}'.format(duthost.hostname)) - dut_uptime = duthost.get_up_time() - logger.info('DUT {} up since {}'.format(duthost.hostname, dut_uptime)) + logger.info('Cold reboot finished on {}'.format(lc.hostname)) + dut_uptime = lc.get_up_time() + logger.info('DUT {} up since {}'.format(lc.hostname, dut_uptime)) - # Ensure startup_tsa_tsb service is running after dut reboot - pytest_assert(wait_until(60, 5, 0, get_tsa_tsb_service_status, duthost, 'running'), - "startup_tsa_tsb service is not started after reboot") + # Ensure startup_tsa_tsb service is running after dut reboot + pytest_assert(wait_until(60, 5, 0, get_tsa_tsb_service_status, lc, 'running'), + "startup_tsa_tsb service is not started after reboot") - # Ensure startup_tsa_tsb service started on expected time since dut rebooted - dut_uptime = duthost.get_up_time() - logging.info('DUT {} up since {}'.format(duthost.hostname, dut_uptime)) - service_uptime = get_tsa_tsb_service_uptime(duthost) - time_diff = (service_uptime - dut_uptime).total_seconds() - pytest_assert(int(time_diff) < 160, - "startup_tsa_tsb service started much later than the expected time after dut reboot") + # Ensure startup_tsa_tsb service started on expected time since dut rebooted + dut_uptime = lc.get_up_time() + logging.info('DUT {} up since {}'.format(lc.hostname, dut_uptime)) + service_uptime = get_tsa_tsb_service_uptime(lc) + time_diff = (service_uptime - dut_uptime).total_seconds() + pytest_assert(int(time_diff) < 300, + "startup_tsa_tsb service started much later than the expected time after dut reboot") - # Verify DUT is in maintenance state. - pytest_assert(TS_MAINTENANCE == get_traffic_shift_state(duthost), - "DUT is not in maintenance state when startup_tsa_tsb service is running") + # Verify DUT is in maintenance state. + pytest_assert(TS_MAINTENANCE == get_traffic_shift_state(lc), + "DUT is not in maintenance state when startup_tsa_tsb service is running") - logging.info("Wait until all critical processes are fully started") - crit_process_check = wait_until(600, 20, 0, _all_critical_processes_healthy, duthost) - int_status_result = wait_until(1200, 20, 0, check_interface_status_of_up_ports, duthost) + logging.info("Wait until all critical processes are fully started") + crit_process_check_res = wait_until(600, 20, 0, _all_critical_processes_healthy, lc) + int_status_check_res = wait_until(1200, 20, 0, check_interface_status_of_up_ports, lc) + with lock: + crit_process_check[lc] = crit_process_check_res + int_status_result[lc] = int_status_check_res - # verify bgp sessions are established - pytest_assert( - wait_until(300, 10, 0, duthost.check_bgp_session_state_all_asics, up_bgp_neighbors, "established"), - "All BGP sessions are not up, no point in continuing the test") + # verify bgp sessions are established + pytest_assert( + wait_until( + 900, 10, 0, lc.check_bgp_session_state_all_asics, up_bgp_neighbors[lc], "established"), + "All BGP sessions are not up, no point in continuing the test") - pytest_assert(verify_only_loopback_routes_are_announced_to_neighs( - duthosts, duthost, dut_nbrhosts, traffic_shift_community), "Failed to verify routes on nbr in TSA") + with SafeThreadPoolExecutor(max_workers=8) as executor: + for linecard in frontend_nodes_per_hwsku: + executor.submit(reboot_and_verify, linecard) - # Verify startup_tsa_tsb service stopped after expected time - pytest_assert(wait_until(tsa_tsb_timer, 20, 0, get_tsa_tsb_service_status, duthost, 'exited'), - "startup_tsa_tsb service is not stopped even after configured timer expiry") + for linecard in frontend_nodes_per_hwsku: + pytest_assert(verify_only_loopback_routes_are_announced_to_neighs( + duthosts, linecard, dut_nbrhosts[linecard], traffic_shift_community), + "Failed to verify routes on nbr in TSA") - # Ensure dut comes back to normal state after timer expiry - if not get_tsa_tsb_service_status(duthost, 'running'): - # Verify TSB is configured on the dut after startup_tsa_tsb service is stopped - pytest_assert(TS_NORMAL == get_traffic_shift_state(duthost), - "DUT is not in normal state after startup_tsa_tsb service is stopped") + def further_verify_linecard(lc): + # Verify startup_tsa_tsb service stopped after expected time + pytest_assert(wait_until(tsa_tsb_timer[lc], 20, 0, get_tsa_tsb_service_status, lc, 'exited'), + "startup_tsa_tsb service is not stopped even after configured timer expiry") - # Wait until all routes are announced to neighbors - cur_v4_routes = {} - cur_v6_routes = {} - # Verify that all routes advertised to neighbor at the start of the test - if not wait_until(300, 3, 0, verify_current_routes_announced_to_neighs, duthost, nbrhosts, - orig_v4_routes, cur_v4_routes, 4): - if not check_and_log_routes_diff(duthost, nbrhosts, orig_v4_routes, cur_v4_routes, 4): - pytest.fail("Not all ipv4 routes are announced to neighbors") + # Ensure dut comes back to normal state after timer expiry + if not get_tsa_tsb_service_status(lc, 'running'): + # Verify TSB is configured on the dut after startup_tsa_tsb service is stopped + pytest_assert(TS_NORMAL == get_traffic_shift_state(lc), + "DUT is not in normal state after startup_tsa_tsb service is stopped") - if not wait_until(300, 3, 0, verify_current_routes_announced_to_neighs, duthost, nbrhosts, - orig_v6_routes, cur_v6_routes, 6): - if not check_and_log_routes_diff(duthost, nbrhosts, orig_v6_routes, cur_v6_routes, 6): - pytest.fail("Not all ipv6 routes are announced to neighbors") + with SafeThreadPoolExecutor(max_workers=8) as executor: + for linecard in frontend_nodes_per_hwsku: + executor.submit(further_verify_linecard, linecard) + verify_route_on_neighbors(frontend_nodes_per_hwsku, dut_nbrhosts, orig_v4_routes, orig_v6_routes) finally: # Bring back the supervisor and line cards to the BGP operational normal state initial_tsa_check_before_and_after_test(duthosts) - # Verify DUT is in normal state after cold reboot scenario. - if not (int_status_result and crit_process_check and TS_NORMAL == get_traffic_shift_state(duthost)): - logger.info("DUT's current interface status is {}, critical process check is {} " - "or traffic shift state is not {}".format(int_status_result, crit_process_check, TS_NORMAL)) - logging.info("DUT is not in normal state after cold reboot, doing config-reload") - config_reload(duthost, safe_reload=True, check_intf_up_ports=True) - # Make sure the dut's reboot cause is as expected - logger.info("Check reboot cause of the dut") - reboot_cause = get_reboot_cause(duthost) - pytest_assert(reboot_cause == COLD_REBOOT_CAUSE, - "Reboot cause {} did not match the trigger {}".format(reboot_cause, COLD_REBOOT_CAUSE)) + + def config_reload_linecard_if_unhealthy(lc): + # Verify DUT is in normal state after cold reboot scenario. + if not (int_status_result[lc] and crit_process_check[lc] and TS_NORMAL == get_traffic_shift_state(lc)): + logger.info( + "DUT's current interface status is {}, critical process check is {} " + "or traffic shift state is not {}".format( + int_status_result[lc], + crit_process_check[lc], + TS_NORMAL, + ) + ) + + logging.info("DUT is not in normal state after cold reboot, doing config-reload") + config_reload(lc, safe_reload=True, check_intf_up_ports=True) + + with SafeThreadPoolExecutor(max_workers=8) as executor: + for linecard in frontend_nodes_per_hwsku: + executor.submit(config_reload_linecard_if_unhealthy, linecard) + + for linecard in frontend_nodes_per_hwsku: + # Make sure the dut's reboot cause is as expected + logger.info("Check reboot cause of the dut") + reboot_cause = get_reboot_cause(linecard) + pytest_assert(reboot_cause == COLD_REBOOT_CAUSE, + "Reboot cause {} did not match the trigger {}".format(reboot_cause, COLD_REBOOT_CAUSE)) @pytest.mark.disable_loganalyzer -def test_tsa_tsb_service_with_dut_abnormal_reboot(duthosts, localhost, enum_rand_one_per_hwsku_frontend_hostname, - ptfhost, nbrhosts, traffic_shift_community, tbinfo): +def test_tsa_tsb_service_with_dut_abnormal_reboot(request, duthosts, localhost, nbrhosts, traffic_shift_community): """ Test startup TSA_TSB service after DUT abnormal reboot/crash Verify startup_tsa_tsb.service started automatically when dut comes up after crash Verify this service configures TSA and starts a timer and configures TSB once the timer is expired """ - duthost = duthosts[enum_rand_one_per_hwsku_frontend_hostname] - tsa_tsb_timer = get_startup_tsb_timer(duthost) - int_status_result, crit_process_check = True, True - if not tsa_tsb_timer: - pytest.skip("startup_tsa_tsb.service is not supported on the {}".format(duthost.hostname)) - dut_nbrhosts = nbrhosts_to_dut(duthost, nbrhosts) - dut_ip = duthost.mgmt_ip - if not check_tsa_persistence_support(duthost): - pytest.skip("TSA persistence not supported in the image") + frontend_nodes_per_hwsku = get_frontend_nodes_per_hwsku(duthosts, request) + for linecard in frontend_nodes_per_hwsku: + if not check_tsa_persistence_support(linecard): + pytest.skip("TSA persistence not supported in the image") + + tsa_tsb_timer, int_status_result, crit_process_check, up_bgp_neighbors = dict(), dict(), dict(), dict() + for linecard in frontend_nodes_per_hwsku: + tsa_tsb_timer[linecard] = get_startup_tsb_timer(linecard) + if not tsa_tsb_timer[linecard]: + pytest.skip("startup_tsa_tsb.service is not supported on the {}".format(linecard.hostname)) + + int_status_result[linecard] = True + crit_process_check[linecard] = True + + dut_nbrhosts = dict() + with SafeThreadPoolExecutor(max_workers=8) as executor: + for linecard in frontend_nodes_per_hwsku: + executor.submit(nbrhosts_to_dut, linecard, nbrhosts, dut_nbrhosts) # Initially make sure both supervisor and line cards are in BGP operational normal state initial_tsa_check_before_and_after_test(duthosts) + for linecard in frontend_nodes_per_hwsku: + up_bgp_neighbors[linecard] = linecard.get_bgp_neighbors_per_asic("established") - up_bgp_neighbors = duthost.get_bgp_neighbors_per_asic("established") - + orig_v4_routes, orig_v6_routes = dict(), dict() try: # Get all routes on neighbors before doing reboot - orig_v4_routes = parse_routes_on_neighbors(duthost, nbrhosts, 4) - orig_v6_routes = parse_routes_on_neighbors(duthost, nbrhosts, 6) + for linecard in frontend_nodes_per_hwsku: + orig_v4_routes[linecard] = parse_routes_on_neighbors(linecard, dut_nbrhosts[linecard], 4) + orig_v6_routes[linecard] = parse_routes_on_neighbors(linecard, dut_nbrhosts[linecard], 6) - # Our shell command is designed as 'nohup bash -c "sleep 5 && tail /dev/zero" &' because of: - # * `tail /dev/zero` is used to run out of memory completely. - # * Since `tail /dev/zero` will cause the DUT reboot, we need to run it in the background - # (using &) to avoid pytest getting stuck. `nohup` is also necessary to protect the - # background process. - # * Some DUTs with few free memory may reboot before ansible receive the result of shell - # command, so we add `sleep 5` to ensure ansible receive the result first. - cmd = 'nohup bash -c "sleep 5 && tail /dev/zero" &' - res = duthost.shell(cmd) - if not res.is_successful: - pytest.fail('DUT {} run command {} failed'.format(duthost.hostname, cmd)) + def abnormal_reboot_linecard_and_verify(lc): + # Our shell command is designed as 'nohup bash -c "sleep 5 && tail /dev/zero" &' because of: + # * `tail /dev/zero` is used to run out of memory completely. + # * Since `tail /dev/zero` will cause the DUT reboot, we need to run it in the background + # (using &) to avoid pytest getting stuck. `nohup` is also necessary to protect the + # background process. + # * Some DUTs with few free memory may reboot before ansible receive the result of shell + # command, so we add `sleep 5` to ensure ansible receive the result first. + cmd = 'nohup bash -c "sleep 5 && tail /dev/zero" &' + res = lc.shell(cmd) + if not res.is_successful: + pytest.fail('DUT {} run command {} failed'.format(lc.hostname, cmd)) + + # Waiting for SSH connection shutdown + dut_ip = lc.mgmt_ip + pytest_assert(check_ssh_state(localhost, dut_ip, SSH_STATE_ABSENT, SSH_SHUTDOWN_TIMEOUT), + 'DUT {} did not shutdown'.format(lc.hostname)) + # Waiting for SSH connection startup + pytest_assert(check_ssh_state(localhost, dut_ip, SSH_STATE_STARTED, SSH_STARTUP_TIMEOUT), + 'DUT {} did not startup'.format(lc.hostname)) - # Waiting for SSH connection shutdown - pytest_assert(check_ssh_state(localhost, dut_ip, SSH_STATE_ABSENT, SSH_SHUTDOWN_TIMEOUT), - 'DUT {} did not shutdown'.format(duthost.hostname)) - # Waiting for SSH connection startup - pytest_assert(check_ssh_state(localhost, dut_ip, SSH_STATE_STARTED, SSH_STARTUP_TIMEOUT), - 'DUT {} did not startup'.format(duthost.hostname)) - - # Ensure startup_tsa_tsb service is running after dut reboot - pytest_assert(wait_until(90, 5, 0, get_tsa_tsb_service_status, duthost, 'running'), - "startup_tsa_tsb service is not started after reboot") - - # Ensure startup_tsa_tsb service started on expected time since dut rebooted - dut_uptime = duthost.get_up_time() - logging.info('DUT {} up since {}'.format(duthost.hostname, dut_uptime)) - service_uptime = get_tsa_tsb_service_uptime(duthost) - time_diff = (service_uptime - dut_uptime).total_seconds() - logger.info("Time difference between dut up-time & tsa_tsb_service up-time is {}".format(int(time_diff))) - pytest_assert(int(time_diff) < 160, - "startup_tsa_tsb service started much later than the expected time after dut reboot") - - # Make sure BGP containers are running properly before verifying - pytest_assert(wait_until(90, 5, 0, check_tsc_command_error, duthost), - "TSC command still returns error even after startup_tsa_tsb service started") - - # Verify DUT is in maintenance state. - pytest_assert(TS_MAINTENANCE == get_traffic_shift_state(duthost), - "DUT is not in maintenance state when startup_tsa_tsb service is running") + # Ensure startup_tsa_tsb service is running after dut reboot + pytest_assert(wait_until(90, 5, 0, get_tsa_tsb_service_status, lc, 'running'), + "startup_tsa_tsb service is not started after reboot") - logging.info("Wait until all critical processes are fully started") - crit_process_check = wait_until(600, 20, 0, _all_critical_processes_healthy, duthost) - int_status_result = wait_until(1200, 20, 0, check_interface_status_of_up_ports, duthost) + # Ensure startup_tsa_tsb service started on expected time since dut rebooted + dut_uptime = lc.get_up_time() + logging.info('DUT {} up since {}'.format(lc.hostname, dut_uptime)) + service_uptime = get_tsa_tsb_service_uptime(lc) + time_diff = (service_uptime - dut_uptime).total_seconds() + logger.info("Time difference between dut up-time & tsa_tsb_service up-time is {}".format(int(time_diff))) + pytest_assert(int(time_diff) < 300, + "startup_tsa_tsb service started much later than the expected time after dut reboot") - # verify bgp sessions are established - pytest_assert( - wait_until(300, 10, 0, duthost.check_bgp_session_state_all_asics, up_bgp_neighbors, "established"), - "All BGP sessions are not up, no point in continuing the test") + # Make sure BGP containers are running properly before verifying + pytest_assert(wait_until(90, 5, 0, check_tsc_command_error, lc), + "TSC command still returns error even after startup_tsa_tsb service started") - pytest_assert(verify_only_loopback_routes_are_announced_to_neighs( - duthosts, duthost, dut_nbrhosts, traffic_shift_community), "Failed to verify routes on nbr in TSA") + # Verify DUT is in maintenance state. + pytest_assert(TS_MAINTENANCE == get_traffic_shift_state(lc), + "DUT is not in maintenance state when startup_tsa_tsb service is running") - # Verify startup_tsa_tsb service stopped after expected time - pytest_assert(wait_until(tsa_tsb_timer, 20, 0, get_tsa_tsb_service_status, duthost, 'exited'), - "startup_tsa_tsb service is not stopped even after configured timer expiry") + logging.info("Wait until all critical processes are fully started") + crit_process_check_res = wait_until(600, 20, 0, _all_critical_processes_healthy, lc) + int_status_check_res = wait_until(1200, 20, 0, check_interface_status_of_up_ports, lc) + with lock: + crit_process_check[lc] = crit_process_check_res + int_status_result[lc] = int_status_check_res - # Ensure dut comes back to normal state after timer expiry - if not get_tsa_tsb_service_status(duthost, 'running'): - # Verify TSB is configured on the dut after startup_tsa_tsb service is stopped - pytest_assert(TS_NORMAL == get_traffic_shift_state(duthost), - "DUT is not in normal state after startup_tsa_tsb service is stopped") + # verify bgp sessions are established + pytest_assert( + wait_until( + 900, 10, 0, lc.check_bgp_session_state_all_asics, up_bgp_neighbors[lc], "established"), + "All BGP sessions are not up, no point in continuing the test") - # Wait until all routes are announced to neighbors - cur_v4_routes = {} - cur_v6_routes = {} - # Verify that all routes advertised to neighbor at the start of the test - if not wait_until(300, 3, 0, verify_current_routes_announced_to_neighs, duthost, nbrhosts, - orig_v4_routes, cur_v4_routes, 4): - if not check_and_log_routes_diff(duthost, nbrhosts, orig_v4_routes, cur_v4_routes, 4): - pytest.fail("Not all ipv4 routes are announced to neighbors") + with SafeThreadPoolExecutor(max_workers=8) as executor: + for linecard in frontend_nodes_per_hwsku: + executor.submit(abnormal_reboot_linecard_and_verify, linecard) - if not wait_until(300, 3, 0, verify_current_routes_announced_to_neighs, duthost, nbrhosts, - orig_v6_routes, cur_v6_routes, 6): - if not check_and_log_routes_diff(duthost, nbrhosts, orig_v6_routes, cur_v6_routes, 6): - pytest.fail("Not all ipv6 routes are announced to neighbors") + for linecard in frontend_nodes_per_hwsku: + pytest_assert(verify_only_loopback_routes_are_announced_to_neighs( + duthosts, linecard, dut_nbrhosts[linecard], traffic_shift_community), + "Failed to verify routes on nbr in TSA") + + def further_verify_linecard(lc): + # Verify startup_tsa_tsb service stopped after expected time + pytest_assert(wait_until(tsa_tsb_timer[lc], 20, 0, get_tsa_tsb_service_status, lc, 'exited'), + "startup_tsa_tsb service is not stopped even after configured timer expiry") + # Ensure dut comes back to normal state after timer expiry + if not get_tsa_tsb_service_status(lc, 'running'): + # Verify TSB is configured on the dut after startup_tsa_tsb service is stopped + pytest_assert(TS_NORMAL == get_traffic_shift_state(lc), + "DUT is not in normal state after startup_tsa_tsb service is stopped") + + with SafeThreadPoolExecutor(max_workers=8) as executor: + for linecard in frontend_nodes_per_hwsku: + executor.submit(further_verify_linecard, linecard) + + verify_route_on_neighbors(frontend_nodes_per_hwsku, dut_nbrhosts, orig_v4_routes, orig_v6_routes) finally: # Bring back the supervisor and line cards to the BGP operational normal state initial_tsa_check_before_and_after_test(duthosts) - # Verify DUT is in normal state after abnormal reboot scenario. - if not (int_status_result and crit_process_check and TS_NORMAL == get_traffic_shift_state(duthost)): - logger.info("DUT's current interface status is {}, critical process check is {} " - "or traffic shift state is not {}".format(int_status_result, crit_process_check, TS_NORMAL)) - logging.info("DUT is not in normal state after abnormal reboot, doing config-reload") - config_reload(duthost, safe_reload=True, check_intf_up_ports=True) - # Make sure the dut's reboot cause is as expected - logger.info("Check reboot cause of the dut") - reboot_cause = get_reboot_cause(duthost) - out = duthost.command('show kdump config') - if "Enabled" not in out["stdout"]: - pytest_assert( - reboot_cause == UNKNOWN_REBOOT_CAUSE, - "Reboot cause {} did not match the trigger {}".format(reboot_cause, UNKNOWN_REBOOT_CAUSE) - ) - else: - pytest_assert( - reboot_cause == KERNEL_PANIC_REBOOT_CAUSE, - "Reboot cause {} did not match the trigger {}".format(reboot_cause, KERNEL_PANIC_REBOOT_CAUSE) - ) + + def config_reload_linecard_if_unhealthy(lc): + # Verify DUT is in normal state after abnormal reboot scenario. + if not (int_status_result[lc] and crit_process_check[lc] and TS_NORMAL == get_traffic_shift_state(lc)): + logger.info( + "DUT's current interface status is {}, critical process check is {} " + "or traffic shift state is not {}".format( + int_status_result[lc], + crit_process_check[lc], + TS_NORMAL, + ) + ) + + logging.info("DUT is not in normal state after abnormal reboot, doing config-reload") + config_reload(lc, safe_reload=True, check_intf_up_ports=True) + + with SafeThreadPoolExecutor(max_workers=8) as executor: + for linecard in frontend_nodes_per_hwsku: + executor.submit(config_reload_linecard_if_unhealthy, linecard) + + for linecard in frontend_nodes_per_hwsku: + # Make sure the dut's reboot cause is as expected + logger.info("Check reboot cause of the dut") + reboot_cause = get_reboot_cause(linecard) + out = linecard.command('show kdump config') + if "Enabled" not in out["stdout"]: + pytest_assert( + reboot_cause == UNKNOWN_REBOOT_CAUSE, + "Reboot cause {} did not match the trigger {}".format(reboot_cause, UNKNOWN_REBOOT_CAUSE) + ) + else: + pytest_assert( + reboot_cause == KERNEL_PANIC_REBOOT_CAUSE, + "Reboot cause {} did not match the trigger {}".format(reboot_cause, KERNEL_PANIC_REBOOT_CAUSE) + ) @pytest.mark.disable_loganalyzer -def test_tsa_tsb_service_with_supervisor_cold_reboot(duthosts, localhost, enum_supervisor_dut_hostname, ptfhost, - nbrhosts, traffic_shift_community, tbinfo): +def test_tsa_tsb_service_with_supervisor_cold_reboot(duthosts, localhost, enum_supervisor_dut_hostname, nbrhosts, + traffic_shift_community): """ Test startup TSA_TSB service after supervisor cold reboot - Verify startup_tsa_tsb.service started automatically on all linecards when they comes up + Verify startup_tsa_tsb.service started automatically on all linecards when they come up Verify this service configures TSA and starts a timer and configures TSB once the timer is expired on linecards """ suphost = duthosts[enum_supervisor_dut_hostname] - tsa_tsb_timer = dict() - dut_nbrhosts = dict() - up_bgp_neighbors = dict() - orig_v4_routes, orig_v6_routes = dict(), dict() - int_status_result, crit_process_check = dict(), dict() + tsa_tsb_timer, int_status_result, crit_process_check, up_bgp_neighbors = dict(), dict(), dict(), dict() for linecard in duthosts.frontend_nodes: + if not check_tsa_persistence_support(linecard): + pytest.skip("TSA persistence not supported in the image") + tsa_tsb_timer[linecard] = get_startup_tsb_timer(linecard) - int_status_result[linecard] = True - crit_process_check[linecard] = True if not tsa_tsb_timer[linecard]: pytest.skip("startup_tsa_tsb.service is not supported on the duts under {}".format(suphost.hostname)) - dut_nbrhosts[linecard] = nbrhosts_to_dut(linecard, nbrhosts) - if not check_tsa_persistence_support(linecard): - pytest.skip("TSA persistence not supported in the image") - up_bgp_neighbors[linecard] = linecard.get_bgp_neighbors_per_asic("established") + + int_status_result[linecard] = True + crit_process_check[linecard] = True + + dut_nbrhosts = dict() + with SafeThreadPoolExecutor(max_workers=8) as executor: + for linecard in duthosts.frontend_nodes: + executor.submit(nbrhosts_to_dut, linecard, nbrhosts, dut_nbrhosts) + # Initially make sure both supervisor and line cards are in BGP operational normal state initial_tsa_check_before_and_after_test(duthosts) + for linecard in duthosts.frontend_nodes: + up_bgp_neighbors[linecard] = linecard.get_bgp_neighbors_per_asic("established") + orig_v4_routes, orig_v6_routes = dict(), dict() try: + # Get all routes on neighbors before doing reboot for linecard in duthosts.frontend_nodes: - # Get all routes on neighbors before doing reboot orig_v4_routes[linecard] = parse_routes_on_neighbors(linecard, dut_nbrhosts[linecard], 4) orig_v6_routes[linecard] = parse_routes_on_neighbors(linecard, dut_nbrhosts[linecard], 6) @@ -452,76 +554,82 @@ def test_tsa_tsb_service_with_supervisor_cold_reboot(duthosts, localhost, enum_s rebooted = float(sup_uptime_before.strftime("%s")) != float(sup_uptime.strftime("%s")) assert rebooted, "Device {} did not reboot".format(suphost.hostname) - for linecard in duthosts.frontend_nodes: - wait_for_startup(linecard, localhost, delay=10, timeout=300) + def verify_linecard_after_sup_reboot(lc): + wait_for_startup(lc, localhost, delay=10, timeout=300) # Ensure startup_tsa_tsb service started on expected time since dut rebooted - dut_uptime = linecard.get_up_time() - logging.info('DUT {} up since {}'.format(linecard.hostname, dut_uptime)) - service_uptime = get_tsa_tsb_service_uptime(linecard) + dut_uptime = lc.get_up_time() + logging.info('DUT {} up since {}'.format(lc.hostname, dut_uptime)) + service_uptime = get_tsa_tsb_service_uptime(lc) time_diff = (service_uptime - dut_uptime).total_seconds() - pytest_assert(int(time_diff) < 160, + pytest_assert(int(time_diff) < 300, "startup_tsa_tsb service started much later than the expected time after dut reboot") # Verify DUT is in maintenance state. - pytest_assert(TS_MAINTENANCE == get_traffic_shift_state(linecard), + pytest_assert(TS_MAINTENANCE == get_traffic_shift_state(lc), "DUT is not in maintenance state when startup_tsa_tsb service is running") logging.info("Wait until all critical processes are fully started") - crit_process_check[linecard] = wait_until(600, 20, 0, _all_critical_processes_healthy, linecard) - int_status_result[linecard] = wait_until(1200, 20, 0, check_interface_status_of_up_ports, linecard) + crit_process_check_res = wait_until(600, 20, 0, _all_critical_processes_healthy, lc) + int_status_check_res = wait_until(1200, 20, 0, check_interface_status_of_up_ports, lc) + with lock: + crit_process_check[lc] = crit_process_check_res + int_status_result[lc] = int_status_check_res # verify bgp sessions are established pytest_assert( wait_until( - 300, 10, 0, linecard.check_bgp_session_state_all_asics, up_bgp_neighbors[linecard], "established"), + 900, 10, 0, lc.check_bgp_session_state_all_asics, up_bgp_neighbors[lc], "established"), "All BGP sessions are not up, no point in continuing the test") + with SafeThreadPoolExecutor(max_workers=8) as executor: + for linecard in duthosts.frontend_nodes: + executor.submit(verify_linecard_after_sup_reboot, linecard) + + for linecard in duthosts.frontend_nodes: pytest_assert(verify_only_loopback_routes_are_announced_to_neighs( duthosts, linecard, dut_nbrhosts[linecard], traffic_shift_community), "Failed to verify routes on nbr in TSA") # Once all line cards are in maintenance state, proceed further - for linecard in duthosts.frontend_nodes: + def further_verify_linecard(lc): # Verify startup_tsa_tsb service stopped after expected time - pytest_assert(wait_until(tsa_tsb_timer[linecard], 20, 0, get_tsa_tsb_service_status, linecard, 'exited'), + pytest_assert(wait_until(tsa_tsb_timer[lc], 20, 0, get_tsa_tsb_service_status, lc, 'exited'), "startup_tsa_tsb service is not stopped even after configured timer expiry") # Ensure dut comes back to normal state after timer expiry - if not get_tsa_tsb_service_status(linecard, 'running'): + if not get_tsa_tsb_service_status(lc, 'running'): # Verify TSB is configured on the dut after startup_tsa_tsb service is stopped - pytest_assert(TS_NORMAL == get_traffic_shift_state(linecard), + pytest_assert(TS_NORMAL == get_traffic_shift_state(lc), "DUT is not in normal state after startup_tsa_tsb service is stopped") - # Wait until all routes are announced to neighbors - cur_v4_routes = {} - cur_v6_routes = {} - # Verify that all routes advertised to neighbor at the start of the test - if not wait_until(300, 3, 0, verify_current_routes_announced_to_neighs, linecard, dut_nbrhosts[linecard], - orig_v4_routes[linecard], cur_v4_routes, 4): - if not check_and_log_routes_diff(linecard, dut_nbrhosts[linecard], - orig_v4_routes[linecard], cur_v4_routes, 4): - pytest.fail("Not all ipv4 routes are announced to neighbors") - - if not wait_until(300, 3, 0, verify_current_routes_announced_to_neighs, linecard, dut_nbrhosts[linecard], - orig_v6_routes[linecard], cur_v6_routes, 6): - if not check_and_log_routes_diff(linecard, dut_nbrhosts[linecard], - orig_v6_routes[linecard], cur_v6_routes, 6): - pytest.fail("Not all ipv6 routes are announced to neighbors") + with SafeThreadPoolExecutor(max_workers=8) as executor: + for linecard in duthosts.frontend_nodes: + executor.submit(further_verify_linecard, linecard) + verify_route_on_neighbors(duthosts.frontend_nodes, dut_nbrhosts, orig_v4_routes, orig_v6_routes) finally: # Bring back the supervisor and line cards to the BGP operational normal state initial_tsa_check_before_and_after_test(duthosts) - # Make sure DUT is in normal state after supervisor cold reboot - for linecard in duthosts.frontend_nodes: - if not (int_status_result[linecard] and crit_process_check[linecard] and - TS_NORMAL == get_traffic_shift_state(linecard)): - logger.info("DUT's current interface status is {}, critical process check is {} " - "or traffic shift state is not {}". - format(int_status_result[linecard], crit_process_check[linecard], TS_NORMAL)) + def config_reload_linecard_if_unhealthy(lc): + if not (int_status_result[lc] and crit_process_check[lc] and TS_NORMAL == get_traffic_shift_state(lc)): + logger.info( + "DUT's current interface status is {}, critical process check is {} " + "or traffic shift state is not {}".format( + int_status_result[lc], + crit_process_check[lc], + TS_NORMAL, + ) + ) + logging.info("DUT is not in normal state after supervisor cold reboot, doing config-reload") - config_reload(linecard, safe_reload=True, check_intf_up_ports=True) + config_reload(lc, safe_reload=True, check_intf_up_ports=True) + + # Make sure DUT is in normal state after supervisor cold reboot + with SafeThreadPoolExecutor(max_workers=8) as executor: + for linecard in duthosts.frontend_nodes: + executor.submit(config_reload_linecard_if_unhealthy, linecard) for linecard in duthosts.frontend_nodes: # Make sure the dut's reboot cause is as expected @@ -538,8 +646,8 @@ def test_tsa_tsb_service_with_supervisor_cold_reboot(duthosts, localhost, enum_s @pytest.mark.disable_loganalyzer -def test_tsa_tsb_service_with_supervisor_abnormal_reboot(duthosts, localhost, enum_supervisor_dut_hostname, ptfhost, - nbrhosts, traffic_shift_community, tbinfo): +def test_tsa_tsb_service_with_supervisor_abnormal_reboot(duthosts, localhost, enum_supervisor_dut_hostname, nbrhosts, + traffic_shift_community): """ Test startup TSA_TSB service after supervisor abnormal reboot Verify startup_tsa_tsb.service started automatically on all linecards when they come up @@ -547,28 +655,32 @@ def test_tsa_tsb_service_with_supervisor_abnormal_reboot(duthosts, localhost, en """ suphost = duthosts[enum_supervisor_dut_hostname] sup_ip = suphost.mgmt_ip - tsa_tsb_timer = dict() - dut_nbrhosts = dict() - up_bgp_neighbors = dict() - orig_v4_routes, orig_v6_routes = dict(), dict() - int_status_result, crit_process_check = dict(), dict() + tsa_tsb_timer, int_status_result, crit_process_check, up_bgp_neighbors = dict(), dict(), dict(), dict() for linecard in duthosts.frontend_nodes: + if not check_tsa_persistence_support(linecard): + pytest.skip("TSA persistence not supported in the image") + tsa_tsb_timer[linecard] = get_startup_tsb_timer(linecard) - int_status_result[linecard] = True - crit_process_check[linecard] = True if not tsa_tsb_timer[linecard]: pytest.skip("startup_tsa_tsb.service is not supported on the duts under {}".format(suphost.hostname)) - dut_nbrhosts[linecard] = nbrhosts_to_dut(linecard, nbrhosts) - if not check_tsa_persistence_support(linecard): - pytest.skip("TSA persistence not supported in the image") - up_bgp_neighbors[linecard] = linecard.get_bgp_neighbors_per_asic("established") + + int_status_result[linecard] = True + crit_process_check[linecard] = True + + dut_nbrhosts = dict() + with SafeThreadPoolExecutor(max_workers=8) as executor: + for linecard in duthosts.frontend_nodes: + executor.submit(nbrhosts_to_dut, linecard, nbrhosts, dut_nbrhosts) # Initially make sure both supervisor and line cards are in BGP operational normal state initial_tsa_check_before_and_after_test(duthosts) + for linecard in duthosts.frontend_nodes: + up_bgp_neighbors[linecard] = linecard.get_bgp_neighbors_per_asic("established") + orig_v4_routes, orig_v6_routes = dict(), dict() try: + # Get all routes on neighbors before doing reboot for linecard in duthosts.frontend_nodes: - # Get all routes on neighbors before doing reboot orig_v4_routes[linecard] = parse_routes_on_neighbors(linecard, dut_nbrhosts[linecard], 4) orig_v6_routes[linecard] = parse_routes_on_neighbors(linecard, dut_nbrhosts[linecard], 6) @@ -602,80 +714,86 @@ def test_tsa_tsb_service_with_supervisor_abnormal_reboot(duthosts, localhost, en rebooted = float(sup_uptime_before.strftime("%s")) != float(sup_uptime.strftime("%s")) assert rebooted, "Device {} did not reboot".format(suphost.hostname) - for linecard in duthosts.frontend_nodes: - wait_for_startup(linecard, localhost, delay=10, timeout=300) + def verify_linecard_after_sup_reboot(lc): + wait_for_startup(lc, localhost, delay=10, timeout=300) # Ensure startup_tsa_tsb service started on expected time since dut rebooted - dut_uptime = linecard.get_up_time() - logging.info('DUT {} up since {}'.format(linecard.hostname, dut_uptime)) - service_uptime = get_tsa_tsb_service_uptime(linecard) + dut_uptime = lc.get_up_time() + logging.info('DUT {} up since {}'.format(lc.hostname, dut_uptime)) + service_uptime = get_tsa_tsb_service_uptime(lc) time_diff = (service_uptime - dut_uptime).total_seconds() - pytest_assert(int(time_diff) < 160, + pytest_assert(int(time_diff) < 300, "startup_tsa_tsb service started much later than the expected time after dut reboot") # Make sure BGP containers are running properly before verifying - pytest_assert(wait_until(90, 5, 0, check_tsc_command_error, linecard), + pytest_assert(wait_until(90, 5, 0, check_tsc_command_error, lc), "TSC command still returns error even after startup_tsa_tsb service started") # Verify DUT is in maintenance state. - pytest_assert(TS_MAINTENANCE == get_traffic_shift_state(linecard), + pytest_assert(TS_MAINTENANCE == get_traffic_shift_state(lc), "DUT is not in maintenance state when startup_tsa_tsb service is running") logging.info("Wait until all critical processes are fully started") - crit_process_check[linecard] = wait_until(600, 20, 0, _all_critical_processes_healthy, linecard) - int_status_result[linecard] = wait_until(1200, 20, 0, check_interface_status_of_up_ports, linecard) + crit_process_check_res = wait_until(600, 20, 0, _all_critical_processes_healthy, lc) + int_status_check_res = wait_until(1200, 20, 0, check_interface_status_of_up_ports, lc) + with lock: + crit_process_check[lc] = crit_process_check_res + int_status_result[lc] = int_status_check_res # verify bgp sessions are established pytest_assert( wait_until( - 300, 10, 0, linecard.check_bgp_session_state_all_asics, up_bgp_neighbors[linecard], "established"), + 900, 10, 0, lc.check_bgp_session_state_all_asics, up_bgp_neighbors[lc], "established"), "All BGP sessions are not up, no point in continuing the test") + with SafeThreadPoolExecutor(max_workers=8) as executor: + for linecard in duthosts.frontend_nodes: + executor.submit(verify_linecard_after_sup_reboot, linecard) + + for linecard in duthosts.frontend_nodes: pytest_assert(verify_only_loopback_routes_are_announced_to_neighs( duthosts, linecard, dut_nbrhosts[linecard], traffic_shift_community), "Failed to verify routes on nbr in TSA") # Once all line cards are in maintenance state, proceed further - for linecard in duthosts.frontend_nodes: + def further_verify_linecard(lc): # Verify startup_tsa_tsb service stopped after expected time - pytest_assert(wait_until(tsa_tsb_timer[linecard], 20, 0, get_tsa_tsb_service_status, linecard, 'exited'), + pytest_assert(wait_until(tsa_tsb_timer[lc], 20, 0, get_tsa_tsb_service_status, lc, 'exited'), "startup_tsa_tsb service is not stopped even after configured timer expiry") # Ensure dut comes back to normal state after timer expiry - if not get_tsa_tsb_service_status(linecard, 'running'): + if not get_tsa_tsb_service_status(lc, 'running'): # Verify TSB is configured on the dut after startup_tsa_tsb service is stopped - pytest_assert(TS_NORMAL == get_traffic_shift_state(linecard), + pytest_assert(TS_NORMAL == get_traffic_shift_state(lc), "DUT is not in normal state after startup_tsa_tsb service is stopped") - # Wait until all routes are announced to neighbors - cur_v4_routes = {} - cur_v6_routes = {} - # Verify that all routes advertised to neighbor at the start of the test - if not wait_until(300, 3, 0, verify_current_routes_announced_to_neighs, linecard, dut_nbrhosts[linecard], - orig_v4_routes[linecard], cur_v4_routes, 4): - if not check_and_log_routes_diff(linecard, dut_nbrhosts[linecard], - orig_v4_routes[linecard], cur_v4_routes, 4): - pytest.fail("Not all ipv4 routes are announced to neighbors") - - if not wait_until(300, 3, 0, verify_current_routes_announced_to_neighs, linecard, dut_nbrhosts[linecard], - orig_v6_routes[linecard], cur_v6_routes, 6): - if not check_and_log_routes_diff(linecard, dut_nbrhosts[linecard], - orig_v6_routes[linecard], cur_v6_routes, 6): - pytest.fail("Not all ipv6 routes are announced to neighbors") + with SafeThreadPoolExecutor(max_workers=8) as executor: + for linecard in duthosts.frontend_nodes: + executor.submit(further_verify_linecard, linecard) + verify_route_on_neighbors(duthosts.frontend_nodes, dut_nbrhosts, orig_v4_routes, orig_v6_routes) finally: # Bring back the supervisor and line cards to the BGP operational normal state initial_tsa_check_before_and_after_test(duthosts) - # Make sure DUT is in normal state after supervisor abnormal reboot - for linecard in duthosts.frontend_nodes: - if not (int_status_result[linecard] and crit_process_check[linecard] and - TS_NORMAL == get_traffic_shift_state(linecard)): - logger.info("DUT's current interface status is {}, critical process check is {} " - "or traffic shift state is not {}". - format(int_status_result[linecard], crit_process_check[linecard], TS_NORMAL)) + def config_reload_linecard_if_unhealthy(lc): + if not (int_status_result[lc] and crit_process_check[lc] and TS_NORMAL == get_traffic_shift_state(lc)): + logger.info( + "DUT's current interface status is {}, critical process check is {} " + "or traffic shift state is not {}".format( + int_status_result[lc], + crit_process_check[lc], + TS_NORMAL, + ) + ) + logging.info("DUT is not in normal state after SUP abnormal reboot, doing config-reload") - config_reload(linecard, safe_reload=True, check_intf_up_ports=True) + config_reload(lc, safe_reload=True, check_intf_up_ports=True) + + # Make sure DUT is in normal state after supervisor abnormal reboot + with SafeThreadPoolExecutor(max_workers=8) as executor: + for linecard in duthosts.frontend_nodes: + executor.submit(config_reload_linecard_if_unhealthy, linecard) for linecard in duthosts.frontend_nodes: # Make sure the dut's reboot cause is as expected @@ -701,113 +819,123 @@ def test_tsa_tsb_service_with_supervisor_abnormal_reboot(duthosts, localhost, en @pytest.mark.disable_loganalyzer -def test_tsa_tsb_service_with_user_init_tsa(duthosts, localhost, enum_rand_one_per_hwsku_frontend_hostname, ptfhost, - nbrhosts, traffic_shift_community, tbinfo): +def test_tsa_tsb_service_with_user_init_tsa(request, duthosts, localhost, nbrhosts, traffic_shift_community): """ Initially, User initiates TSA on the DUT and saves the config on DUT. Test startup TSA_TSB service after DUT cold reboot Verify startup_tsa_tsb.service starts automatically when dut comes up Verify this service doesn't configure another TSA and retains the existing TSA config on DUT """ - duthost = duthosts[enum_rand_one_per_hwsku_frontend_hostname] - tsa_tsb_timer = get_startup_tsb_timer(duthost) - if not tsa_tsb_timer: - pytest.skip("startup_tsa_tsb.service is not supported on the {}".format(duthost.hostname)) - dut_nbrhosts = nbrhosts_to_dut(duthost, nbrhosts) - orig_v4_routes, orig_v6_routes = {}, {} - if not check_tsa_persistence_support(duthost): - pytest.skip("TSA persistence not supported in the image") + frontend_nodes_per_hwsku = get_frontend_nodes_per_hwsku(duthosts, request) + for linecard in frontend_nodes_per_hwsku: + if not check_tsa_persistence_support(linecard): + pytest.skip("TSA persistence not supported in the image") + + tsa_tsb_timer, up_bgp_neighbors = dict(), dict() + for linecard in frontend_nodes_per_hwsku: + tsa_tsb_timer[linecard] = get_startup_tsb_timer(linecard) + if not tsa_tsb_timer[linecard]: + pytest.skip("startup_tsa_tsb.service is not supported on the {}".format(linecard.hostname)) + + dut_nbrhosts = dict() + with SafeThreadPoolExecutor(max_workers=8) as executor: + for linecard in frontend_nodes_per_hwsku: + executor.submit(nbrhosts_to_dut, linecard, nbrhosts, dut_nbrhosts) # Initially make sure both supervisor and line cards are in BGP operational normal state initial_tsa_check_before_and_after_test(duthosts) + for linecard in frontend_nodes_per_hwsku: + up_bgp_neighbors[linecard] = linecard.get_bgp_neighbors_per_asic("established") - up_bgp_neighbors = duthost.get_bgp_neighbors_per_asic("established") - + orig_v4_routes, orig_v6_routes = dict(), dict() try: # Get all routes on neighbors before doing reboot - orig_v4_routes = parse_routes_on_neighbors(duthost, nbrhosts, 4) - orig_v6_routes = parse_routes_on_neighbors(duthost, nbrhosts, 6) + for linecard in frontend_nodes_per_hwsku: + orig_v4_routes[linecard] = parse_routes_on_neighbors(linecard, dut_nbrhosts[linecard], 4) + orig_v6_routes[linecard] = parse_routes_on_neighbors(linecard, dut_nbrhosts[linecard], 6) - # Issue TSA on DUT - duthost.shell("TSA") - duthost.shell('sudo config save -y') + def run_tsa_and_reboot_and_verify(lc): + # Issue TSA on DUT + lc.shell("TSA") + lc.shell('sudo config save -y') - # Reboot dut and wait for startup_tsa_tsb service to start - logger.info("Cold reboot on node: %s", duthost.hostname) - reboot(duthost, localhost, wait=240) + # Reboot dut and wait for startup_tsa_tsb service to start + logger.info("Cold reboot on node: %s", lc.hostname) + reboot(lc, localhost, wait=240) - logger.info('Cold reboot finished on {}'.format(duthost.hostname)) - dut_uptime = duthost.get_up_time() - logger.info('DUT {} up since {}'.format(duthost.hostname, dut_uptime)) + logger.info('Cold reboot finished on {}'.format(lc.hostname)) + dut_uptime = lc.get_up_time() + logger.info('DUT {} up since {}'.format(lc.hostname, dut_uptime)) - # Ensure startup_tsa_tsb service started on expected time since dut rebooted - dut_uptime = duthost.get_up_time() - logging.info('DUT {} up since {}'.format(duthost.hostname, dut_uptime)) - service_uptime = get_tsa_tsb_service_uptime(duthost) - time_diff = (service_uptime - dut_uptime).total_seconds() - pytest_assert(int(time_diff) < 160, - "startup_tsa_tsb service started much later than the expected time after dut reboot") + # Ensure startup_tsa_tsb service started on expected time since dut rebooted + dut_uptime = lc.get_up_time() + logging.info('DUT {} up since {}'.format(lc.hostname, dut_uptime)) + service_uptime = get_tsa_tsb_service_uptime(lc) + time_diff = (service_uptime - dut_uptime).total_seconds() + pytest_assert(int(time_diff) < 300, + "startup_tsa_tsb service started much later than the expected time after dut reboot") - # Ensure startup_tsa_tsb service is in exited state after dut reboot - pytest_assert(wait_until(60, 5, 0, get_tsa_tsb_service_status, duthost, 'exited'), - "startup_tsa_tsb service is not in exited state after reboot") + # Ensure startup_tsa_tsb service is in exited state after dut reboot + pytest_assert(wait_until(60, 5, 0, get_tsa_tsb_service_status, lc, 'exited'), + "startup_tsa_tsb service is not in exited state after reboot") - # Verify DUT continues to be in maintenance state. - pytest_assert(TS_MAINTENANCE == get_traffic_shift_state(duthost), - "DUT is not in maintenance state with saved TSA config after reboot") + # Verify DUT continues to be in maintenance state. + pytest_assert(TS_MAINTENANCE == get_traffic_shift_state(lc), + "DUT is not in maintenance state with saved TSA config after reboot") - logging.info("Wait until all critical processes are fully started") - wait_critical_processes(duthost) - pytest_assert(wait_until(1200, 20, 0, check_interface_status_of_up_ports, duthost), - "Not all ports that are admin up on are operationally up") + logging.info("Wait until all critical processes are fully started") + wait_critical_processes(lc) + pytest_assert(wait_until(1200, 20, 0, check_interface_status_of_up_ports, lc), + "Not all ports that are admin up on are operationally up") - # verify bgp sessions are established - pytest_assert( - wait_until(300, 10, 0, duthost.check_bgp_session_state_all_asics, up_bgp_neighbors, "established"), - "All BGP sessions are not up, no point in continuing the test") + # verify bgp sessions are established + pytest_assert( + wait_until( + 900, 10, 0, lc.check_bgp_session_state_all_asics, up_bgp_neighbors[lc], "established"), + "All BGP sessions are not up, no point in continuing the test") - pytest_assert(verify_only_loopback_routes_are_announced_to_neighs( - duthosts, duthost, dut_nbrhosts, traffic_shift_community), - "Failed to verify routes on nbr in TSA") + with SafeThreadPoolExecutor(max_workers=8) as executor: + for linecard in frontend_nodes_per_hwsku: + executor.submit(run_tsa_and_reboot_and_verify, linecard) + for linecard in frontend_nodes_per_hwsku: + pytest_assert(verify_only_loopback_routes_are_announced_to_neighs( + duthosts, linecard, dut_nbrhosts[linecard], traffic_shift_community), + "Failed to verify routes on nbr in TSA") finally: """ Test TSB after config save and config reload Verify all routes are announced back to neighbors """ - # Recover to Normal state - duthost.shell("TSB") - duthost.shell('sudo config save -y') - config_reload(duthost, safe_reload=True, check_intf_up_ports=True) + def run_tsb_and_config_reload(lc): + # Recover to Normal state + lc.shell("TSB") + lc.shell('sudo config save -y') + config_reload(lc, safe_reload=True, check_intf_up_ports=True) + + with SafeThreadPoolExecutor(max_workers=8) as executor: + for linecard in frontend_nodes_per_hwsku: + executor.submit(run_tsb_and_config_reload, linecard) # Verify DUT comes back to normal state after TSB. - pytest_assert(TS_NORMAL == get_traffic_shift_state(duthost), "DUT is not in normal state") - # Wait until all routes are announced to neighbors - cur_v4_routes = {} - cur_v6_routes = {} - # Verify that all routes advertised to neighbor at the start of the test - if not wait_until(300, 3, 0, verify_current_routes_announced_to_neighs, duthost, nbrhosts, - orig_v4_routes, cur_v4_routes, 4): - if not check_and_log_routes_diff(duthost, nbrhosts, orig_v4_routes, cur_v4_routes, 4): - pytest.fail("Not all ipv4 routes are announced to neighbors") + for linecard in frontend_nodes_per_hwsku: + pytest_assert(TS_NORMAL == get_traffic_shift_state(linecard), "DUT is not in normal state") - if not wait_until(300, 3, 0, verify_current_routes_announced_to_neighs, duthost, nbrhosts, - orig_v6_routes, cur_v6_routes, 6): - if not check_and_log_routes_diff(duthost, nbrhosts, orig_v6_routes, cur_v6_routes, 6): - pytest.fail("Not all ipv6 routes are announced to neighbors") + verify_route_on_neighbors(frontend_nodes_per_hwsku, dut_nbrhosts, orig_v4_routes, orig_v6_routes) # Make sure the dut's reboot cause is as expected - logger.info("Check reboot cause of the dut") - reboot_cause = get_reboot_cause(duthost) - pytest_assert(reboot_cause == COLD_REBOOT_CAUSE, - "Reboot cause {} did not match the trigger {}".format(reboot_cause, COLD_REBOOT_CAUSE)) + for linecard in frontend_nodes_per_hwsku: + logger.info("Check reboot cause of the dut") + reboot_cause = get_reboot_cause(linecard) + pytest_assert(reboot_cause == COLD_REBOOT_CAUSE, + "Reboot cause {} did not match the trigger {}".format(reboot_cause, COLD_REBOOT_CAUSE)) + # Bring back the supervisor and line cards to the BGP operational normal state initial_tsa_check_before_and_after_test(duthosts) @pytest.mark.disable_loganalyzer -def test_user_init_tsa_while_service_run_on_dut(duthosts, localhost, enum_rand_one_per_hwsku_frontend_hostname, ptfhost, - nbrhosts, traffic_shift_community, tbinfo): +def test_user_init_tsa_while_service_run_on_dut(request, duthosts, localhost, nbrhosts, traffic_shift_community): """ Test startup TSA_TSB service after DUT cold reboot @@ -816,115 +944,144 @@ def test_user_init_tsa_while_service_run_on_dut(duthosts, localhost, enum_rand_o Issue TSA while the service is running on dut, and make sure the TSA is configured Make sure TSA_TSB service is stopped and dut continues to be in maintenance mode """ - duthost = duthosts[enum_rand_one_per_hwsku_frontend_hostname] - tsa_tsb_timer = get_startup_tsb_timer(duthost) - int_status_result, crit_process_check = True, True - if not tsa_tsb_timer: - pytest.skip("startup_tsa_tsb.service is not supported on the {}".format(duthost.hostname)) - dut_nbrhosts = nbrhosts_to_dut(duthost, nbrhosts) - if not check_tsa_persistence_support(duthost): - pytest.skip("TSA persistence not supported in the image") + frontend_nodes_per_hwsku = get_frontend_nodes_per_hwsku(duthosts, request) + for linecard in frontend_nodes_per_hwsku: + if not check_tsa_persistence_support(linecard): + pytest.skip("TSA persistence not supported in the image") + + tsa_tsb_timer, int_status_result, crit_process_check, up_bgp_neighbors = dict(), dict(), dict(), dict() + for linecard in frontend_nodes_per_hwsku: + tsa_tsb_timer[linecard] = get_startup_tsb_timer(linecard) + if not tsa_tsb_timer[linecard]: + pytest.skip("startup_tsa_tsb.service is not supported on the {}".format(linecard.hostname)) + + int_status_result[linecard] = True + crit_process_check[linecard] = True + + dut_nbrhosts = dict() + with SafeThreadPoolExecutor(max_workers=8) as executor: + for linecard in frontend_nodes_per_hwsku: + executor.submit(nbrhosts_to_dut, linecard, nbrhosts, dut_nbrhosts) # Initially make sure both supervisor and line cards are in BGP operational normal state initial_tsa_check_before_and_after_test(duthosts) + for linecard in frontend_nodes_per_hwsku: + up_bgp_neighbors[linecard] = linecard.get_bgp_neighbors_per_asic("established") - up_bgp_neighbors = duthost.get_bgp_neighbors_per_asic("established") - + orig_v4_routes, orig_v6_routes = dict(), dict() try: # Get all routes on neighbors before doing reboot - orig_v4_routes = parse_routes_on_neighbors(duthost, nbrhosts, 4) - orig_v6_routes = parse_routes_on_neighbors(duthost, nbrhosts, 6) + for linecard in frontend_nodes_per_hwsku: + orig_v4_routes[linecard] = parse_routes_on_neighbors(linecard, dut_nbrhosts[linecard], 4) + orig_v6_routes[linecard] = parse_routes_on_neighbors(linecard, dut_nbrhosts[linecard], 6) - # Reboot dut and wait for startup_tsa_tsb service to start - logger.info("Cold reboot on node: %s", duthost.hostname) - reboot(duthost, localhost, wait=240) + def reboot_and_verify(lc): + # Reboot dut and wait for startup_tsa_tsb service to start + logger.info("Cold reboot on node: %s", lc.hostname) + reboot(lc, localhost, wait=240) - logger.info('Cold reboot finished on {}'.format(duthost.hostname)) - dut_uptime = duthost.get_up_time() - logger.info('DUT {} up since {}'.format(duthost.hostname, dut_uptime)) + logger.info('Cold reboot finished on {}'.format(lc.hostname)) + dut_uptime = lc.get_up_time() + logger.info('DUT {} up since {}'.format(lc.hostname, dut_uptime)) - # Ensure startup_tsa_tsb service is running after dut reboot - pytest_assert(wait_until(60, 5, 0, get_tsa_tsb_service_status, duthost, 'running'), - "startup_tsa_tsb service is not started after reboot") + # Ensure startup_tsa_tsb service is running after dut reboot + pytest_assert(wait_until(60, 5, 0, get_tsa_tsb_service_status, lc, 'running'), + "startup_tsa_tsb service is not started after reboot") - # Ensure startup_tsa_tsb service started on expected time since dut rebooted - dut_uptime = duthost.get_up_time() - logging.info('DUT {} up since {}'.format(duthost.hostname, dut_uptime)) - service_uptime = get_tsa_tsb_service_uptime(duthost) - time_diff = (service_uptime - dut_uptime).total_seconds() - pytest_assert(int(time_diff) < 160, - "startup_tsa_tsb service started much later than the expected time after dut reboot") + # Ensure startup_tsa_tsb service started on expected time since dut rebooted + dut_uptime = lc.get_up_time() + logging.info('DUT {} up since {}'.format(lc.hostname, dut_uptime)) + service_uptime = get_tsa_tsb_service_uptime(lc) + time_diff = (service_uptime - dut_uptime).total_seconds() + pytest_assert(int(time_diff) < 300, + "startup_tsa_tsb service started much later than the expected time after dut reboot") - # Verify DUT is in maintenance state. - pytest_assert(TS_MAINTENANCE == get_traffic_shift_state(duthost), - "DUT is not in maintenance state when startup_tsa_tsb service is running") + # Verify DUT is in maintenance state. + pytest_assert(TS_MAINTENANCE == get_traffic_shift_state(lc), + "DUT is not in maintenance state when startup_tsa_tsb service is running") - # Issue TSA on DUT - duthost.shell("TSA") - duthost.shell('sudo config save -y') + with SafeThreadPoolExecutor(max_workers=8) as executor: + for linecard in frontend_nodes_per_hwsku: + executor.submit(reboot_and_verify, linecard) - # Ensure startup_tsa_tsb service is in inactive state after user-initiated TSA - pytest_assert(wait_until(60, 5, 0, get_tsa_tsb_service_status, duthost, 'inactive'), - "startup_tsa_tsb service is not in inactive state after user init TSA") + def run_tsa_and_verify(lc): + # Issue TSA on DUT + lc.shell("TSA") + lc.shell('sudo config save -y') - # Verify DUT continues to be in maintenance state. - pytest_assert(TS_MAINTENANCE == get_traffic_shift_state(duthost), - "DUT is not in maintenance state with saved TSA config after reboot") + # Ensure startup_tsa_tsb service is in inactive state after user-initiated TSA + pytest_assert(wait_until(60, 5, 0, get_tsa_tsb_service_status, lc, 'inactive'), + "startup_tsa_tsb service is not in inactive state after user init TSA") - logging.info("Wait until all critical processes are fully started") - crit_process_check = wait_until(600, 20, 0, _all_critical_processes_healthy, duthost) - int_status_result = wait_until(1200, 20, 0, check_interface_status_of_up_ports, duthost) + # Verify DUT continues to be in maintenance state. + pytest_assert(TS_MAINTENANCE == get_traffic_shift_state(lc), + "DUT is not in maintenance state with saved TSA config after reboot") - # verify bgp sessions are established - pytest_assert( - wait_until(300, 10, 0, duthost.check_bgp_session_state_all_asics, up_bgp_neighbors, "established"), - "All BGP sessions are not up, no point in continuing the test") + logging.info("Wait until all critical processes are fully started") + crit_process_check_res = wait_until(600, 20, 0, _all_critical_processes_healthy, lc) + int_status_check_res = wait_until(1200, 20, 0, check_interface_status_of_up_ports, lc) + with lock: + crit_process_check[lc] = crit_process_check_res + int_status_result[lc] = int_status_check_res - pytest_assert(verify_only_loopback_routes_are_announced_to_neighs( - duthosts, duthost, dut_nbrhosts, traffic_shift_community), - "Failed to verify routes on nbr in TSA") + # verify bgp sessions are established + pytest_assert( + wait_until(900, 10, 0, lc.check_bgp_session_state_all_asics, up_bgp_neighbors[lc], "established"), + "All BGP sessions are not up, no point in continuing the test") + with SafeThreadPoolExecutor(max_workers=8) as executor: + for linecard in frontend_nodes_per_hwsku: + executor.submit(run_tsa_and_verify, linecard) + + for linecard in frontend_nodes_per_hwsku: + pytest_assert(verify_only_loopback_routes_are_announced_to_neighs( + duthosts, linecard, dut_nbrhosts[linecard], traffic_shift_community), + "Failed to verify routes on nbr in TSA") finally: """ Test TSB after config save and config reload Verify all routes are announced back to neighbors """ # Recover to Normal state - duthost.shell("TSB") - duthost.shell('sudo config save -y') - - # Verify DUT is in normal state after cold reboot scenario. - if not (int_status_result and crit_process_check and TS_NORMAL == get_traffic_shift_state(duthost)): - logger.info("DUT's current interface status is {}, critical process check is {} " - "or traffic shift state is not {}".format(int_status_result, crit_process_check, TS_NORMAL)) - logging.info("DUT is not in normal state after cold reboot, doing config-reload") - config_reload(duthost, safe_reload=True, check_intf_up_ports=True) - # Wait until all routes are announced to neighbors - cur_v4_routes = {} - cur_v6_routes = {} - # Verify that all routes advertised to neighbor at the start of the test - if not wait_until(300, 3, 0, verify_current_routes_announced_to_neighs, duthost, nbrhosts, - orig_v4_routes, cur_v4_routes, 4): - if not check_and_log_routes_diff(duthost, nbrhosts, orig_v4_routes, cur_v4_routes, 4): - pytest.fail("Not all ipv4 routes are announced to neighbors") + for linecard in frontend_nodes_per_hwsku: + linecard.shell("TSB") + linecard.shell('sudo config save -y') + + def config_reload_linecard_if_unhealthy(lc): + # Verify DUT is in normal state after cold reboot scenario. + if not (int_status_result[lc] and crit_process_check[lc] and TS_NORMAL == get_traffic_shift_state(lc)): + logger.info( + "DUT's current interface status is {}, critical process check is {} " + "or traffic shift state is not {}".format( + int_status_result[lc], + crit_process_check[lc], + TS_NORMAL, + ) + ) + + logging.info("DUT is not in normal state after cold reboot, doing config-reload") + config_reload(lc, safe_reload=True, check_intf_up_ports=True) + + with SafeThreadPoolExecutor(max_workers=8) as executor: + for linecard in frontend_nodes_per_hwsku: + executor.submit(config_reload_linecard_if_unhealthy, linecard) - if not wait_until(300, 3, 0, verify_current_routes_announced_to_neighs, duthost, nbrhosts, - orig_v6_routes, cur_v6_routes, 6): - if not check_and_log_routes_diff(duthost, nbrhosts, orig_v6_routes, cur_v6_routes, 6): - pytest.fail("Not all ipv6 routes are announced to neighbors") + # Wait until all routes are announced to neighbors + verify_route_on_neighbors(frontend_nodes_per_hwsku, dut_nbrhosts, orig_v4_routes, orig_v6_routes) # Make sure the dut's reboot cause is as expected - logger.info("Check reboot cause of the dut") - reboot_cause = get_reboot_cause(duthost) - pytest_assert(reboot_cause == COLD_REBOOT_CAUSE, - "Reboot cause {} did not match the trigger {}".format(reboot_cause, COLD_REBOOT_CAUSE)) + for linecard in frontend_nodes_per_hwsku: + logger.info("Check reboot cause of the dut") + reboot_cause = get_reboot_cause(linecard) + pytest_assert(reboot_cause == COLD_REBOOT_CAUSE, + "Reboot cause {} did not match the trigger {}".format(reboot_cause, COLD_REBOOT_CAUSE)) + # Bring back the supervisor and line cards to the BGP operational normal state initial_tsa_check_before_and_after_test(duthosts) @pytest.mark.disable_loganalyzer -def test_user_init_tsb_while_service_run_on_dut(duthosts, localhost, enum_rand_one_per_hwsku_frontend_hostname, ptfhost, - nbrhosts, traffic_shift_community, tbinfo): +def test_user_init_tsb_while_service_run_on_dut(request, duthosts, localhost, nbrhosts, traffic_shift_community): """ Test startup TSA_TSB service after DUT cold reboot @@ -933,105 +1090,132 @@ def test_user_init_tsb_while_service_run_on_dut(duthosts, localhost, enum_rand_o Issue TSB while the service is running on dut, and make sure the TSB is configured Make sure TSA_TSB service is stopped and dut continues to be in normal mode """ - duthost = duthosts[enum_rand_one_per_hwsku_frontend_hostname] - tsa_tsb_timer = get_startup_tsb_timer(duthost) - int_status_result, crit_process_check = True, True - if not tsa_tsb_timer: - pytest.skip("startup_tsa_tsb.service is not supported on the {}".format(duthost.hostname)) - if not check_tsa_persistence_support(duthost): - pytest.skip("TSA persistence not supported in the image") + frontend_nodes_per_hwsku = get_frontend_nodes_per_hwsku(duthosts, request) + for linecard in frontend_nodes_per_hwsku: + if not check_tsa_persistence_support(linecard): + pytest.skip("TSA persistence not supported in the image") + + tsa_tsb_timer, int_status_result, crit_process_check, up_bgp_neighbors = dict(), dict(), dict(), dict() + for linecard in frontend_nodes_per_hwsku: + tsa_tsb_timer[linecard] = get_startup_tsb_timer(linecard) + if not tsa_tsb_timer[linecard]: + pytest.skip("startup_tsa_tsb.service is not supported on the {}".format(linecard.hostname)) + + int_status_result[linecard] = True + crit_process_check[linecard] = True + + dut_nbrhosts = dict() + with SafeThreadPoolExecutor(max_workers=8) as executor: + for linecard in duthosts.frontend_nodes: + executor.submit(nbrhosts_to_dut, linecard, nbrhosts, dut_nbrhosts) # Initially make sure both supervisor and line cards are in BGP operational normal state initial_tsa_check_before_and_after_test(duthosts) + for linecard in frontend_nodes_per_hwsku: + up_bgp_neighbors[linecard] = linecard.get_bgp_neighbors_per_asic("established") - up_bgp_neighbors = duthost.get_bgp_neighbors_per_asic("established") - + orig_v4_routes, orig_v6_routes = dict(), dict() try: # Get all routes on neighbors before doing reboot - orig_v4_routes = parse_routes_on_neighbors(duthost, nbrhosts, 4) - orig_v6_routes = parse_routes_on_neighbors(duthost, nbrhosts, 6) - - # Reboot dut and wait for startup_tsa_tsb service to start - logger.info("Cold reboot on node: %s", duthost.hostname) - reboot(duthost, localhost, wait=240) - - logger.info('Cold reboot finished on {}'.format(duthost.hostname)) - dut_uptime = duthost.get_up_time() - logger.info('DUT {} up since {}'.format(duthost.hostname, dut_uptime)) - - # Ensure startup_tsa_tsb service is running after dut reboot - pytest_assert(wait_until(60, 5, 0, get_tsa_tsb_service_status, duthost, 'running'), - "startup_tsa_tsb service is not started after reboot") - - # Ensure startup_tsa_tsb service started on expected time since dut rebooted - dut_uptime = duthost.get_up_time() - logging.info('DUT {} up since {}'.format(duthost.hostname, dut_uptime)) - service_uptime = get_tsa_tsb_service_uptime(duthost) - time_diff = (service_uptime - dut_uptime).total_seconds() - pytest_assert(int(time_diff) < 160, - "startup_tsa_tsb service started much later than the expected time after dut reboot") - - # Verify DUT is in maintenance state. - pytest_assert(TS_MAINTENANCE == get_traffic_shift_state(duthost), - "DUT is not in maintenance state when startup_tsa_tsb service is running") - - # Issue TSB on DUT - duthost.shell("TSB") - duthost.shell('sudo config save -y') - - # Verify DUT comes back to normal state after TSB. - pytest_assert(TS_NORMAL == get_traffic_shift_state(duthost), "DUT is not in normal state") - - # Ensure startup_tsa_tsb service is in inactive state after user-initiated TSB - pytest_assert(wait_until(60, 5, 10, get_tsa_tsb_service_status, duthost, 'inactive'), - "startup_tsa_tsb service is not in inactive state after user init TSB") - - # Make sure DUT continues to be in good state after TSB - assert wait_until(300, 20, 2, duthost.critical_services_fully_started), \ - "Not all critical services are fully started on {}".format(duthost.hostname) - crit_process_check = wait_until(600, 20, 0, _all_critical_processes_healthy, duthost) - int_status_result = wait_until(1200, 20, 0, check_interface_status_of_up_ports, duthost) - - # verify bgp sessions are established - pytest_assert( - wait_until(300, 10, 0, duthost.check_bgp_session_state_all_asics, up_bgp_neighbors, "established"), - "All BGP sessions are not up, no point in continuing the test") + for linecard in duthosts.frontend_nodes: + orig_v4_routes[linecard] = parse_routes_on_neighbors(linecard, dut_nbrhosts[linecard], 4) + orig_v6_routes[linecard] = parse_routes_on_neighbors(linecard, dut_nbrhosts[linecard], 6) - # Wait until all routes are announced to neighbors - cur_v4_routes = {} - cur_v6_routes = {} - # Verify that all routes advertised to neighbor at the start of the test - if not wait_until(300, 3, 0, verify_current_routes_announced_to_neighs, duthost, nbrhosts, - orig_v4_routes, cur_v4_routes, 4): - if not check_and_log_routes_diff(duthost, nbrhosts, orig_v4_routes, cur_v4_routes, 4): - pytest.fail("Not all ipv4 routes are announced to neighbors") + def reboot_and_verify(lc): + # Reboot dut and wait for startup_tsa_tsb service to start + logger.info("Cold reboot on node: %s", lc.hostname) + reboot(lc, localhost, wait=240) - if not wait_until(300, 3, 0, verify_current_routes_announced_to_neighs, duthost, nbrhosts, - orig_v6_routes, cur_v6_routes, 6): - if not check_and_log_routes_diff(duthost, nbrhosts, orig_v6_routes, cur_v6_routes, 6): - pytest.fail("Not all ipv6 routes are announced to neighbors") + logger.info('Cold reboot finished on {}'.format(lc.hostname)) + dut_uptime = lc.get_up_time() + logger.info('DUT {} up since {}'.format(lc.hostname, dut_uptime)) + + # Ensure startup_tsa_tsb service is running after dut reboot + pytest_assert(wait_until(60, 5, 0, get_tsa_tsb_service_status, lc, 'running'), + "startup_tsa_tsb service is not started after reboot") + + # Ensure startup_tsa_tsb service started on expected time since dut rebooted + dut_uptime = lc.get_up_time() + logging.info('DUT {} up since {}'.format(lc.hostname, dut_uptime)) + service_uptime = get_tsa_tsb_service_uptime(lc) + time_diff = (service_uptime - dut_uptime).total_seconds() + pytest_assert(int(time_diff) < 300, + "startup_tsa_tsb service started much later than the expected time after dut reboot") + + # Verify DUT is in maintenance state. + pytest_assert(TS_MAINTENANCE == get_traffic_shift_state(lc), + "DUT is not in maintenance state when startup_tsa_tsb service is running") + + with SafeThreadPoolExecutor(max_workers=8) as executor: + for linecard in frontend_nodes_per_hwsku: + executor.submit(reboot_and_verify, linecard) + + def run_tsb_and_verify(lc): + # Issue TSB on DUT + lc.shell("TSB") + lc.shell('sudo config save -y') + # Verify DUT comes back to normal state after TSB. + pytest_assert(TS_NORMAL == get_traffic_shift_state(lc), "DUT is not in normal state") + + # Ensure startup_tsa_tsb service is in inactive state after user-initiated TSB + pytest_assert(wait_until(60, 5, 10, get_tsa_tsb_service_status, lc, 'inactive'), + "startup_tsa_tsb service is not in inactive state after user init TSB") + + # Make sure DUT continues to be in good state after TSB + assert wait_until(300, 20, 2, lc.critical_services_fully_started), \ + "Not all critical services are fully started on {}".format(lc.hostname) + crit_process_check_res = wait_until(600, 20, 0, _all_critical_processes_healthy, lc) + int_status_check_res = wait_until(1200, 20, 0, check_interface_status_of_up_ports, lc) + with lock: + crit_process_check[lc] = crit_process_check_res + int_status_result[lc] = int_status_check_res + + # verify bgp sessions are established + pytest_assert( + wait_until( + 900, 10, 0, lc.check_bgp_session_state_all_asics, up_bgp_neighbors[lc], "established"), + "All BGP sessions are not up, no point in continuing the test") + + with SafeThreadPoolExecutor(max_workers=8) as executor: + for linecard in frontend_nodes_per_hwsku: + executor.submit(run_tsb_and_verify, linecard) + + verify_route_on_neighbors(frontend_nodes_per_hwsku, dut_nbrhosts, orig_v4_routes, orig_v6_routes) finally: # Bring back the supervisor and line cards to the BGP operational normal state initial_tsa_check_before_and_after_test(duthosts) - # Verify DUT is in normal state after cold reboot scenario. - if not (int_status_result and crit_process_check and TS_NORMAL == get_traffic_shift_state(duthost)): - logger.info("DUT's current interface status is {}, critical process check is {} " - "or traffic shift state is not {}".format(int_status_result, crit_process_check, TS_NORMAL)) - logging.info("DUT is not in normal state after cold reboot, doing config-reload") - config_reload(duthost, safe_reload=True, check_intf_up_ports=True) + + def config_reload_linecard_if_unhealthy(lc): + # Verify DUT is in normal state after cold reboot scenario. + if not (int_status_result[lc] and crit_process_check[lc] and TS_NORMAL == get_traffic_shift_state(lc)): + logger.info( + "DUT's current interface status is {}, critical process check is {} " + "or traffic shift state is not {}".format( + int_status_result[lc], + crit_process_check[lc], + TS_NORMAL, + ) + ) + + logging.info("DUT is not in normal state after cold reboot, doing config-reload") + config_reload(lc, safe_reload=True, check_intf_up_ports=True) + + with SafeThreadPoolExecutor(max_workers=8) as executor: + for linecard in frontend_nodes_per_hwsku: + executor.submit(config_reload_linecard_if_unhealthy, linecard) # Make sure the dut's reboot cause is as expected - logger.info("Check reboot cause of the dut") - reboot_cause = get_reboot_cause(duthost) - pytest_assert(reboot_cause == COLD_REBOOT_CAUSE, - "Reboot cause {} did not match the trigger {}".format(reboot_cause, COLD_REBOOT_CAUSE)) + for linecard in frontend_nodes_per_hwsku: + logger.info("Check reboot cause of the dut") + reboot_cause = get_reboot_cause(linecard) + pytest_assert(reboot_cause == COLD_REBOOT_CAUSE, + "Reboot cause {} did not match the trigger {}".format(reboot_cause, COLD_REBOOT_CAUSE)) @pytest.mark.disable_loganalyzer -def test_user_init_tsb_on_sup_while_service_run_on_dut(duthosts, localhost, - enum_supervisor_dut_hostname, ptfhost, nbrhosts, - traffic_shift_community, creds, tbinfo): +def test_user_init_tsb_on_sup_while_service_run_on_dut(duthosts, localhost, enum_supervisor_dut_hostname, nbrhosts, + traffic_shift_community): """ Test startup TSA_TSB service after DUT cold reboot Verify startup_tsa_tsb.service started automatically when dut comes up @@ -1040,25 +1224,31 @@ def test_user_init_tsb_on_sup_while_service_run_on_dut(duthosts, localhost, Make sure TSA_TSB service is stopped and dut changes from maintenance mode to normal mode """ suphost = duthosts[enum_supervisor_dut_hostname] - int_status_result, crit_process_check = dict(), dict() - tsa_tsb_timer = dict() - dut_nbrhosts = dict() - up_bgp_neighbors = dict() - orig_v4_routes, orig_v6_routes = dict(), dict() + for linecard in duthosts.frontend_nodes: + if not check_tsa_persistence_support(linecard): + pytest.skip("TSA persistence not supported in the image") + + tsa_tsb_timer, int_status_result, crit_process_check, up_bgp_neighbors = dict(), dict(), dict(), dict() for linecard in duthosts.frontend_nodes: tsa_tsb_timer[linecard] = get_startup_tsb_timer(linecard) - int_status_result[linecard] = True - crit_process_check[linecard] = True if not tsa_tsb_timer[linecard]: pytest.skip("startup_tsa_tsb.service is not supported on the duts under {}".format(suphost.hostname)) - dut_nbrhosts[linecard] = nbrhosts_to_dut(linecard, nbrhosts) - if not check_tsa_persistence_support(linecard): - pytest.skip("TSA persistence not supported in the image") - up_bgp_neighbors[linecard] = linecard.get_bgp_neighbors_per_asic("established") + + int_status_result[linecard] = True + crit_process_check[linecard] = True + + dut_nbrhosts = dict() + with SafeThreadPoolExecutor(max_workers=8) as executor: + for linecard in duthosts.frontend_nodes: + executor.submit(nbrhosts_to_dut, linecard, nbrhosts, dut_nbrhosts) # Initially make sure both supervisor and line cards are in BGP operational normal state initial_tsa_check_before_and_after_test(duthosts) + for linecard in duthosts.frontend_nodes: + up_bgp_neighbors[linecard] = linecard.get_bgp_neighbors_per_asic("established") + service_up_times = dict() + orig_v4_routes, orig_v6_routes = dict(), dict() try: for linecard in duthosts.frontend_nodes: # Get all routes on neighbors before doing reboot @@ -1078,31 +1268,40 @@ def test_user_init_tsb_on_sup_while_service_run_on_dut(duthosts, localhost, rebooted = float(sup_uptime_before.strftime("%s")) != float(sup_uptime.strftime("%s")) assert rebooted, "Device {} did not reboot".format(suphost.hostname) - for linecard in duthosts.frontend_nodes: - wait_for_startup(linecard, localhost, delay=10, timeout=300) + def verify_linecard_after_sup_reboot(lc): + wait_for_startup(lc, localhost, delay=10, timeout=300) # Ensure startup_tsa_tsb service started on expected time since dut rebooted - dut_uptime = linecard.get_up_time() - logging.info('DUT {} up since {}'.format(linecard.hostname, dut_uptime)) - service_uptime = get_tsa_tsb_service_uptime(linecard) + dut_uptime = lc.get_up_time() + logging.info('DUT {} up since {}'.format(lc.hostname, dut_uptime)) + service_uptime = get_tsa_tsb_service_uptime(lc) time_diff = (service_uptime - dut_uptime).total_seconds() - pytest_assert(int(time_diff) < 160, + pytest_assert(int(time_diff) < 300, "startup_tsa_tsb service started much later than the expected time after dut reboot") # Verify DUT is in maintenance state. - pytest_assert(TS_MAINTENANCE == get_traffic_shift_state(linecard), + pytest_assert(TS_MAINTENANCE == get_traffic_shift_state(lc), "DUT is not in maintenance state when startup_tsa_tsb service is running") logging.info("Wait until all critical processes are fully started") - crit_process_check[linecard] = wait_until(600, 20, 0, _all_critical_processes_healthy, linecard) - int_status_result[linecard] = wait_until(1200, 20, 10, check_interface_status_of_up_ports, linecard) + crit_process_check_res = wait_until(600, 20, 0, _all_critical_processes_healthy, lc) + int_status_check_res = wait_until(1200, 20, 10, check_interface_status_of_up_ports, lc) + with lock: + service_up_times[lc] = service_uptime + crit_process_check[lc] = crit_process_check_res + int_status_result[lc] = int_status_check_res # verify bgp sessions are established pytest_assert( wait_until( - 300, 10, 0, linecard.check_bgp_session_state_all_asics, up_bgp_neighbors[linecard], "established"), + 900, 10, 0, lc.check_bgp_session_state_all_asics, up_bgp_neighbors[lc], "established"), "All BGP sessions are not up, no point in continuing the test") + with SafeThreadPoolExecutor(max_workers=8) as executor: + for linecard in duthosts.frontend_nodes: + executor.submit(verify_linecard_after_sup_reboot, linecard) + + for linecard in duthosts.frontend_nodes: pytest_assert(verify_only_loopback_routes_are_announced_to_neighs( duthosts, linecard, dut_nbrhosts[linecard], traffic_shift_community), "Failed to verify routes on nbr in TSA") @@ -1110,52 +1309,54 @@ def test_user_init_tsb_on_sup_while_service_run_on_dut(duthosts, localhost, # Issue user initiated TSB on the supervisor suphost.shell('TSB') - for linecard in duthosts.frontend_nodes: - if get_tsa_tsb_service_status(linecard, 'running') and \ - check_tsa_tsb_service_run_time_diff(service_uptime, tsa_tsb_timer[linecard]): + def verify_linecard_after_sup_tsb(lc): + if get_tsa_tsb_service_status(lc, 'running') and \ + check_tsa_tsb_service_run_time_diff(service_up_times[lc], tsa_tsb_timer[lc]): # Verify DUT continues to be in maintenance state if the timer is running. - pytest_assert(TS_MAINTENANCE == get_traffic_shift_state(linecard, cmd='TSC no-stats'), + pytest_assert(TS_MAINTENANCE == get_traffic_shift_state(lc, cmd='TSC no-stats'), "DUT is not in maintenance state when startup_tsa_tsb service is running") else: # Verify DUT continues came back to normal state after timer expiry. - pytest_assert(TS_NORMAL == get_traffic_shift_state(linecard, cmd='TSC no-stats'), + pytest_assert(TS_NORMAL == get_traffic_shift_state(lc, cmd='TSC no-stats'), "DUT is not in normal state when startup_tsa_tsb service is running") # Ensure startup_tsa_tsb service is in exited state after timer expiry - pytest_assert(wait_until(tsa_tsb_timer[linecard], 5, 0, get_tsa_tsb_service_status, linecard, 'exited'), + pytest_assert(wait_until(tsa_tsb_timer[lc], 5, 0, get_tsa_tsb_service_status, lc, 'exited'), "startup_tsa_tsb service is not in exited state after user init TSB from supervisor") - int_status_result[linecard] = wait_until(1200, 20, 0, check_interface_status_of_up_ports, linecard) - for linecard in duthosts.frontend_nodes: - # Wait until all routes are announced to neighbors - cur_v4_routes = {} - cur_v6_routes = {} - # Verify that all routes advertised to neighbor at the start of the test - if not wait_until(300, 3, 0, verify_current_routes_announced_to_neighs, linecard, dut_nbrhosts[linecard], - orig_v4_routes[linecard], cur_v4_routes, 4): - if not check_and_log_routes_diff(linecard, dut_nbrhosts[linecard], - orig_v4_routes[linecard], cur_v4_routes, 4): - pytest.fail("Not all ipv4 routes are announced to neighbors") - - if not wait_until(300, 3, 0, verify_current_routes_announced_to_neighs, linecard, dut_nbrhosts[linecard], - orig_v6_routes[linecard], cur_v6_routes, 6): - if not check_and_log_routes_diff(linecard, dut_nbrhosts[linecard], - orig_v6_routes[linecard], cur_v6_routes, 6): - pytest.fail("Not all ipv6 routes are announced to neighbors") + int_status_check_res = wait_until(1200, 20, 0, check_interface_status_of_up_ports, lc) + with lock: + int_status_result[lc] = int_status_check_res + + with SafeThreadPoolExecutor(max_workers=8) as executor: + for linecard in duthosts.frontend_nodes: + executor.submit(verify_linecard_after_sup_tsb, linecard) + verify_route_on_neighbors(duthosts.frontend_nodes, dut_nbrhosts, orig_v4_routes, orig_v6_routes) finally: # Bring back the supervisor and line cards to the BGP operational normal state initial_tsa_check_before_and_after_test(duthosts) - for linecard in duthosts.frontend_nodes: - # Make sure linecards are in Normal state and save the config to proceed further - if not (int_status_result[linecard] and crit_process_check[linecard] and - TS_NORMAL == get_traffic_shift_state(linecard)): - logger.info("DUT's current interface status is {}, critical process check is {} " - "or traffic shift state is not {}". - format(int_status_result[linecard], crit_process_check[linecard], TS_NORMAL)) + def config_reload_linecard_if_unhealthy(lc): + if not (int_status_result[lc] and crit_process_check[lc] and TS_NORMAL == get_traffic_shift_state(lc)): + logger.info( + "DUT's current interface status is {}, critical process check is {} " + "or traffic shift state is not {}".format( + int_status_result[lc], + crit_process_check[lc], + TS_NORMAL, + ) + ) + logging.info("DUT is not in normal state after supervisor cold reboot, doing config-reload") - config_reload(linecard, safe_reload=True, check_intf_up_ports=True) + config_reload(lc, safe_reload=True, check_intf_up_ports=True) + + # Make sure linecards are in Normal state and save the config to proceed further + with SafeThreadPoolExecutor(max_workers=8) as executor: + for linecard in duthosts.frontend_nodes: + executor.submit(config_reload_linecard_if_unhealthy, linecard) + + for linecard in duthosts.frontend_nodes: # Make sure the dut's reboot cause is as expected logger.info("Check reboot cause of the dut {}".format(linecard)) reboot_cause = get_reboot_cause(linecard) @@ -1170,143 +1371,173 @@ def test_user_init_tsb_on_sup_while_service_run_on_dut(duthosts, localhost, @pytest.mark.disable_loganalyzer -def test_tsa_tsb_timer_efficiency(duthosts, localhost, enum_rand_one_per_hwsku_frontend_hostname, ptfhost, - nbrhosts, traffic_shift_community, tbinfo): +def test_tsa_tsb_timer_efficiency(request, duthosts, localhost, nbrhosts, traffic_shift_community): """ Test startup TSA_TSB service after DUT cold reboot Verify the configured tsa_tsb_timer is sufficient for system to be stable Verify this service configures TSA and starts a timer and configures TSB once the timer is expired """ - duthost = duthosts[enum_rand_one_per_hwsku_frontend_hostname] - tsa_tsb_timer = get_startup_tsb_timer(duthost) - int_status_result, crit_process_check = True, True - if not tsa_tsb_timer: - pytest.skip("startup_tsa_tsb.service is not supported on the {}".format(duthost.hostname)) - if not check_tsa_persistence_support(duthost): - pytest.skip("TSA persistence not supported in the image") + frontend_nodes_per_hwsku = get_frontend_nodes_per_hwsku(duthosts, request) + for linecard in frontend_nodes_per_hwsku: + if not check_tsa_persistence_support(linecard): + pytest.skip("TSA persistence not supported in the image") + + tsa_tsb_timer, int_status_result, crit_process_check, up_bgp_neighbors = dict(), dict(), dict(), dict() + for linecard in frontend_nodes_per_hwsku: + tsa_tsb_timer[linecard] = get_startup_tsb_timer(linecard) + if not tsa_tsb_timer[linecard]: + pytest.skip("startup_tsa_tsb.service is not supported on the {}".format(linecard.hostname)) + + int_status_result[linecard] = True + crit_process_check[linecard] = True + + dut_nbrhosts = dict() + with SafeThreadPoolExecutor(max_workers=8) as executor: + for linecard in frontend_nodes_per_hwsku: + executor.submit(nbrhosts_to_dut, linecard, nbrhosts, dut_nbrhosts) # Initially make sure both supervisor and line cards are in BGP operational normal state initial_tsa_check_before_and_after_test(duthosts) + for linecard in frontend_nodes_per_hwsku: + up_bgp_neighbors[linecard] = linecard.get_bgp_neighbors_per_asic("established") + orig_v4_routes, orig_v6_routes = dict(), dict() try: # Get all routes on neighbors before doing reboot - orig_v4_routes = parse_routes_on_neighbors(duthost, nbrhosts, 4) - orig_v6_routes = parse_routes_on_neighbors(duthost, nbrhosts, 6) - - up_bgp_neighbors = duthost.get_bgp_neighbors_per_asic("established") - - # Reboot dut and wait for startup_tsa_tsb service to start - logger.info("Cold reboot on node: %s", duthost.hostname) - reboot(duthost, localhost, wait=240) + for linecard in frontend_nodes_per_hwsku: + orig_v4_routes[linecard] = parse_routes_on_neighbors(linecard, dut_nbrhosts[linecard], 4) + orig_v6_routes[linecard] = parse_routes_on_neighbors(linecard, dut_nbrhosts[linecard], 6) - logger.info('Cold reboot finished on {}'.format(duthost.hostname)) - dut_uptime = duthost.get_up_time() - logger.info('DUT {} up since {}'.format(duthost.hostname, dut_uptime)) + def reboot_and_verify(lc): + # Reboot dut and wait for startup_tsa_tsb service to start + logger.info("Cold reboot on node: %s", lc.hostname) + reboot(lc, localhost, wait=240) - # Ensure startup_tsa_tsb service is running after dut reboot - pytest_assert(wait_until(60, 5, 0, get_tsa_tsb_service_status, duthost, 'running'), - "startup_tsa_tsb service is not started after reboot") + logger.info('Cold reboot finished on {}'.format(lc.hostname)) + dut_uptime = lc.get_up_time() + logger.info('DUT {} up since {}'.format(lc.hostname, dut_uptime)) - # Ensure startup_tsa_tsb service started on expected time since dut rebooted - dut_uptime = duthost.get_up_time() - logging.info('DUT {} up since {}'.format(duthost.hostname, dut_uptime)) - service_uptime = get_tsa_tsb_service_uptime(duthost) - time_diff = (service_uptime - dut_uptime).total_seconds() - pytest_assert(int(time_diff) < 160, - "startup_tsa_tsb service started much later than the expected time after dut reboot") + # Ensure startup_tsa_tsb service is running after dut reboot + pytest_assert(wait_until(60, 5, 0, get_tsa_tsb_service_status, lc, 'running'), + "startup_tsa_tsb service is not started after reboot") - logging.info("Wait until all critical services are fully started") - pytest_assert(wait_until(300, 20, 2, duthost.critical_services_fully_started)), \ - "Not all critical services are fully started on {}".format(duthost.hostname) + # Ensure startup_tsa_tsb service started on expected time since dut rebooted + dut_uptime = lc.get_up_time() + logging.info('DUT {} up since {}'.format(lc.hostname, dut_uptime)) + service_uptime = get_tsa_tsb_service_uptime(lc) + time_diff = (service_uptime - dut_uptime).total_seconds() + pytest_assert(int(time_diff) < 300, + "startup_tsa_tsb service started much later than the expected time after dut reboot") - logging.info("Wait until all critical processes are fully started") - crit_process_check = wait_until(600, 20, 0, _all_critical_processes_healthy, duthost) - int_status_result = wait_until(1200, 20, 0, check_interface_status_of_up_ports, duthost) + logging.info("Wait until all critical services are fully started") + pytest_assert(wait_until(300, 20, 2, lc.critical_services_fully_started)), \ + "Not all critical services are fully started on {}".format(lc.hostname) - pytest_assert(wait_until(300, 10, 0, - duthost.check_bgp_session_state_all_asics, up_bgp_neighbors, "established")) + logging.info("Wait until all critical processes are fully started") + crit_process_check_res = wait_until(600, 20, 0, _all_critical_processes_healthy, lc) + int_status_check_res = wait_until(1200, 20, 0, check_interface_status_of_up_ports, lc) + with lock: + crit_process_check[lc] = crit_process_check_res + int_status_result[lc] = int_status_check_res - stability_check_time = datetime.datetime.now() - time_to_stabilize = (stability_check_time - service_uptime).total_seconds() - logging.info("Time taken for system stability : {}".format(time_to_stabilize)) + pytest_assert( + wait_until( + 900, 10, 0, lc.check_bgp_session_state_all_asics, up_bgp_neighbors[lc], "established")) - # Verify DUT is in maintenance state. - pytest_assert(TS_MAINTENANCE == get_traffic_shift_state(duthost), - "DUT is not in maintenance state when startup_tsa_tsb service is running") + stability_check_time = datetime.datetime.now() + time_to_stabilize = (stability_check_time - service_uptime).total_seconds() + logging.info("Time taken for system stability : {}".format(time_to_stabilize)) - # Verify startup_tsa_tsb service stopped after expected time - pytest_assert(wait_until(tsa_tsb_timer, 20, 0, get_tsa_tsb_service_status, duthost, 'exited'), - "startup_tsa_tsb service is not stopped even after configured timer expiry") + # Verify DUT is in maintenance state. + pytest_assert(TS_MAINTENANCE == get_traffic_shift_state(lc), + "DUT is not in maintenance state when startup_tsa_tsb service is running") - # Verify tsa_tsb_timer configured is sufficient - pytest_assert(time_to_stabilize < tsa_tsb_timer, - "Configured tsa_tsb_timer is not sufficient for the system to be stable") + # Verify startup_tsa_tsb service stopped after expected time + pytest_assert(wait_until(tsa_tsb_timer[lc], 20, 0, get_tsa_tsb_service_status, lc, 'exited'), + "startup_tsa_tsb service is not stopped even after configured timer expiry") - # Ensure dut comes back to normal state after timer expiry - if not get_tsa_tsb_service_status(duthost, 'running'): - # Verify TSB is configured on the dut after startup_tsa_tsb service is stopped - pytest_assert(TS_NORMAL == get_traffic_shift_state(duthost), - "DUT is not in normal state after startup_tsa_tsb service is stopped") + # Verify tsa_tsb_timer configured is sufficient + pytest_assert(time_to_stabilize < tsa_tsb_timer[lc], + "Configured tsa_tsb_timer is not sufficient for the system to be stable") - # Wait until all routes are announced to neighbors - cur_v4_routes = {} - cur_v6_routes = {} - # Verify that all routes advertised to neighbor at the start of the test - if not wait_until(300, 3, 0, verify_current_routes_announced_to_neighs, duthost, nbrhosts, - orig_v4_routes, cur_v4_routes, 4): - if not check_and_log_routes_diff(duthost, nbrhosts, orig_v4_routes, cur_v4_routes, 4): - pytest.fail("Not all ipv4 routes are announced to neighbors") + # Ensure dut comes back to normal state after timer expiry + if not get_tsa_tsb_service_status(lc, 'running'): + # Verify TSB is configured on the dut after startup_tsa_tsb service is stopped + pytest_assert(TS_NORMAL == get_traffic_shift_state(lc), + "DUT is not in normal state after startup_tsa_tsb service is stopped") - if not wait_until(300, 3, 0, verify_current_routes_announced_to_neighs, duthost, nbrhosts, - orig_v6_routes, cur_v6_routes, 6): - if not check_and_log_routes_diff(duthost, nbrhosts, orig_v6_routes, cur_v6_routes, 6): - pytest.fail("Not all ipv6 routes are announced to neighbors") + with SafeThreadPoolExecutor(max_workers=8) as executor: + for linecard in frontend_nodes_per_hwsku: + executor.submit(reboot_and_verify, linecard) + verify_route_on_neighbors(frontend_nodes_per_hwsku, dut_nbrhosts, orig_v4_routes, orig_v6_routes) finally: # Bring back the supervisor and line cards to the BGP operational normal state initial_tsa_check_before_and_after_test(duthosts) - # Verify DUT is in normal state after cold reboot scenario. - if not (int_status_result and crit_process_check and TS_NORMAL == get_traffic_shift_state(duthost)): - logger.info("DUT's current interface status is {}, critical process check is {} " - "or traffic shift state is not {}".format(int_status_result, crit_process_check, TS_NORMAL)) - logging.info("DUT is not in normal state after cold reboot, doing config-reload") - config_reload(duthost, safe_reload=True, check_intf_up_ports=True) + + def config_reload_linecard_if_unhealthy(lc): + # Verify DUT is in normal state after cold reboot scenario. + if not (int_status_result[lc] and crit_process_check[lc] and TS_NORMAL == get_traffic_shift_state(lc)): + logger.info( + "DUT's current interface status is {}, critical process check is {} " + "or traffic shift state is not {}".format( + int_status_result[lc], + crit_process_check[lc], + TS_NORMAL, + ) + ) + + logging.info("DUT is not in normal state after cold reboot, doing config-reload") + config_reload(lc, safe_reload=True, check_intf_up_ports=True) + + with SafeThreadPoolExecutor(max_workers=8) as executor: + for linecard in frontend_nodes_per_hwsku: + executor.submit(config_reload_linecard_if_unhealthy, linecard) + # Make sure the dut's reboot cause is as expected - logger.info("Check reboot cause of the dut") - reboot_cause = get_reboot_cause(duthost) - pytest_assert(reboot_cause == COLD_REBOOT_CAUSE, - "Reboot cause {} did not match the trigger {}".format(reboot_cause, COLD_REBOOT_CAUSE)) + for linecard in frontend_nodes_per_hwsku: + logger.info("Check reboot cause of the dut") + reboot_cause = get_reboot_cause(linecard) + pytest_assert(reboot_cause == COLD_REBOOT_CAUSE, + "Reboot cause {} did not match the trigger {}".format(reboot_cause, COLD_REBOOT_CAUSE)) @pytest.mark.disable_loganalyzer -def test_tsa_tsb_service_with_tsa_on_sup(duthosts, localhost, - enum_supervisor_dut_hostname, ptfhost, nbrhosts, - traffic_shift_community, creds, tbinfo): +def test_tsa_tsb_service_with_tsa_on_sup(duthosts, localhost, enum_supervisor_dut_hostname, nbrhosts, + traffic_shift_community): """ Test startup TSA_TSB service after supervisor cold reboot with TSA enabled on supervisor Verify startup_tsa_tsb.service started automatically when dut comes up Verify this service configures TSA and starts a timer and maintains TSA once the timer is expired on linecards """ suphost = duthosts[enum_supervisor_dut_hostname] - tsa_tsb_timer = dict() - dut_nbrhosts = dict() - up_bgp_neighbors = dict() - int_status_result, crit_process_check = dict(), dict() for linecard in duthosts.frontend_nodes: - up_bgp_neighbors[linecard] = linecard.get_bgp_neighbors_per_asic("established") + if not check_tsa_persistence_support(linecard): + pytest.skip("TSA persistence not supported in the image") + + tsa_tsb_timer, int_status_result, crit_process_check, up_bgp_neighbors = dict(), dict(), dict(), dict() + for linecard in duthosts.frontend_nodes: tsa_tsb_timer[linecard] = get_startup_tsb_timer(linecard) - int_status_result[linecard] = True - crit_process_check[linecard] = True if not tsa_tsb_timer[linecard]: pytest.skip("startup_tsa_tsb.service is not supported on the duts under {}".format(suphost.hostname)) - dut_nbrhosts[linecard] = nbrhosts_to_dut(linecard, nbrhosts) + + int_status_result[linecard] = True + crit_process_check[linecard] = True + # Ensure that the DUT is not in maintenance already before start of the test pytest_assert(TS_NORMAL == get_traffic_shift_state(linecard), "DUT is not in normal state") - if not check_tsa_persistence_support(linecard): - pytest.skip("TSA persistence not supported in the image") + + dut_nbrhosts = dict() + with SafeThreadPoolExecutor(max_workers=8) as executor: + for linecard in duthosts.frontend_nodes: + executor.submit(nbrhosts_to_dut, linecard, nbrhosts, dut_nbrhosts) + # Initially make sure both supervisor and line cards are in BGP operational normal state initial_tsa_check_before_and_after_test(duthosts) + for linecard in duthosts.frontend_nodes: + up_bgp_neighbors[linecard] = linecard.get_bgp_neighbors_per_asic("established") + try: # Execute user initiated TSA from supervisor card suphost.shell("TSA") @@ -1325,67 +1556,85 @@ def test_tsa_tsb_service_with_tsa_on_sup(duthosts, localhost, rebooted = float(sup_uptime_before.strftime("%s")) != float(sup_uptime.strftime("%s")) assert rebooted, "Device {} did not reboot".format(suphost.hostname) - for linecard in duthosts.frontend_nodes: - wait_for_startup(linecard, localhost, delay=10, timeout=300) + def verify_linecard_after_sup_reboot(lc): + wait_for_startup(lc, localhost, delay=10, timeout=300) # Ensure startup_tsa_tsb service is running after dut reboot - pytest_assert(wait_until(60, 5, 0, get_tsa_tsb_service_status, linecard, 'running'), + pytest_assert(wait_until(60, 5, 0, get_tsa_tsb_service_status, lc, 'running'), "startup_tsa_tsb service is not started after reboot") # Ensure startup_tsa_tsb service started on expected time since dut rebooted - dut_uptime = linecard.get_up_time() - logging.info('DUT {} up since {}'.format(linecard.hostname, dut_uptime)) - service_uptime = get_tsa_tsb_service_uptime(linecard) + dut_uptime = lc.get_up_time() + logging.info('DUT {} up since {}'.format(lc.hostname, dut_uptime)) + service_uptime = get_tsa_tsb_service_uptime(lc) time_diff = (service_uptime - dut_uptime).total_seconds() - pytest_assert(int(time_diff) < 160, + pytest_assert(int(time_diff) < 300, "startup_tsa_tsb service started much later than the expected time after dut reboot") # Verify DUT is in maintenance state. - pytest_assert(TS_MAINTENANCE == get_traffic_shift_state(linecard), + pytest_assert(TS_MAINTENANCE == get_traffic_shift_state(lc), "DUT is not in maintenance state when startup_tsa_tsb service is running") logging.info("Wait until all critical processes are fully started") - crit_process_check[linecard] = wait_until(600, 20, 0, _all_critical_processes_healthy, linecard) - int_status_result[linecard] = wait_until(1200, 20, 0, check_interface_status_of_up_ports, linecard) + crit_process_check_res = wait_until(600, 20, 0, _all_critical_processes_healthy, lc) + int_status_check_res = wait_until(1200, 20, 0, check_interface_status_of_up_ports, lc) + with lock: + crit_process_check[lc] = crit_process_check_res + int_status_result[lc] = int_status_check_res # Verify BGP sessions are established pytest_assert( wait_until( - 600, 10, 0, linecard.check_bgp_session_state_all_asics, up_bgp_neighbors[linecard], "established"), + 900, 10, 0, lc.check_bgp_session_state_all_asics, up_bgp_neighbors[lc], "established"), "All BGP sessions are not up. No point in continuing the test") + with SafeThreadPoolExecutor(max_workers=8) as executor: + for linecard in duthosts.frontend_nodes: + executor.submit(verify_linecard_after_sup_reboot, linecard) + + for linecard in duthosts.frontend_nodes: pytest_assert(verify_only_loopback_routes_are_announced_to_neighs( duthosts, linecard, dut_nbrhosts[linecard], traffic_shift_community), "Failed to verify routes on nbr in TSA") - for linecard in duthosts.frontend_nodes: + def further_verify_linecard(lc): # Verify startup_tsa_tsb service stopped after expected time - pytest_assert(wait_until(tsa_tsb_timer[linecard], 20, 0, get_tsa_tsb_service_status, linecard, 'exited'), + pytest_assert(wait_until(tsa_tsb_timer[lc], 20, 0, get_tsa_tsb_service_status, lc, 'exited'), "startup_tsa_tsb service is not stopped even after configured timer expiry") # Ensure dut comes back to maintenance state after timer expiry - if not get_tsa_tsb_service_status(linecard, 'running'): + if not get_tsa_tsb_service_status(lc, 'running'): # Verify TSA is configured on the dut after startup_tsa_tsb service is stopped - pytest_assert(TS_MAINTENANCE == get_traffic_shift_state(linecard), + pytest_assert(TS_MAINTENANCE == get_traffic_shift_state(lc), "DUT is not in maintenance state after startup_tsa_tsb service is stopped") - pytest_assert(verify_only_loopback_routes_are_announced_to_neighs( - duthosts, linecard, dut_nbrhosts[linecard], traffic_shift_community), - "Failed to verify routes on nbr in TSA") - + with SafeThreadPoolExecutor(max_workers=8) as executor: + for linecard in duthosts.frontend_nodes: + executor.submit(further_verify_linecard, linecard) finally: # Bring back the supervisor and line cards to the BGP operational normal state initial_tsa_check_before_and_after_test(duthosts) - for linecard in duthosts.frontend_nodes: - # Make sure linecards are in Normal state and save the config to proceed further - if not (int_status_result[linecard] and crit_process_check[linecard] and - TS_NORMAL == get_traffic_shift_state(linecard)): - logger.info("DUT's current interface status is {}, critical process check is {} " - "or traffic shift state is not {}". - format(int_status_result[linecard], crit_process_check[linecard], TS_NORMAL)) + # Make sure linecards are in Normal state and save the config to proceed further + def config_reload_linecard_if_unhealthy(lc): + if not (int_status_result[lc] and crit_process_check[lc] and TS_NORMAL == get_traffic_shift_state(lc)): + logger.info( + "DUT's current interface status is {}, critical process check is {} " + "or traffic shift state is not {}".format( + int_status_result[lc], + crit_process_check[lc], + TS_NORMAL, + ) + ) + logging.info("DUT is not in normal state after supervisor cold reboot, doing config-reload") - config_reload(linecard, safe_reload=True, check_intf_up_ports=True) + config_reload(lc, safe_reload=True, check_intf_up_ports=True) + + with SafeThreadPoolExecutor(max_workers=8) as executor: + for linecard in duthosts.frontend_nodes: + executor.submit(config_reload_linecard_if_unhealthy, linecard) + + for linecard in duthosts.frontend_nodes: # Make sure the dut's reboot cause is as expected logger.info("Check reboot cause of the dut {}".format(linecard)) reboot_cause = get_reboot_cause(linecard) diff --git a/tests/conftest.py b/tests/conftest.py index cab2abee36..8bea93b921 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1232,6 +1232,12 @@ def generate_params_hostname_rand_per_hwsku(request, frontend_only=False): hosts = get_specified_duts(request) if frontend_only: hosts = generate_params_frontend_hostname(request) + + hosts_per_hwsku = get_hosts_per_hwsku(request, hosts) + return hosts_per_hwsku + + +def get_hosts_per_hwsku(request, hosts): inv_files = get_inventory_files(request) # Create a list of hosts per hwsku host_hwskus = {}