diff --git a/tests/common/plugins/sanity_check/__init__.py b/tests/common/plugins/sanity_check/__init__.py index f9ce9c570af..b62be4ad7bc 100644 --- a/tests/common/plugins/sanity_check/__init__.py +++ b/tests/common/plugins/sanity_check/__init__.py @@ -181,7 +181,7 @@ def prepare_parallel_run(request, parallel_run_context): @pytest.fixture(scope="module") -def sanity_check_full(prepare_parallel_run, localhost, duthosts, request, fanouthosts, nbrhosts, tbinfo): +def sanity_check_full(ptfhost, prepare_parallel_run, localhost, duthosts, request, fanouthosts, nbrhosts, tbinfo): logger.info("Prepare sanity check") should_skip_sanity = prepare_parallel_run if should_skip_sanity: @@ -299,7 +299,7 @@ def sanity_check_full(prepare_parallel_run, localhost, duthosts, request, fanout pt_assert(False, "!!!!!!!!!!!!!!!!Pre-test sanity check failed: !!!!!!!!!!!!!!!!\n{}" .format(json.dumps(failed_results, indent=4, default=fallback_serializer))) else: - recover_on_sanity_check_failure(duthosts, failed_results, fanouthosts, localhost, nbrhosts, + recover_on_sanity_check_failure(ptfhost, duthosts, failed_results, fanouthosts, localhost, nbrhosts, pre_check_items, recover_method, request, tbinfo, STAGE_PRE_TEST) logger.info("Done pre-test sanity check") @@ -325,15 +325,16 @@ def sanity_check_full(prepare_parallel_run, localhost, duthosts, request, fanout pt_assert(False, "!!!!!!!!!!!!!!!! Post-test sanity check failed: !!!!!!!!!!!!!!!!\n{}" .format(json.dumps(post_failed_results, indent=4, default=fallback_serializer))) else: - recover_on_sanity_check_failure(duthosts, post_failed_results, fanouthosts, localhost, nbrhosts, - post_check_items, recover_method, request, tbinfo, STAGE_POST_TEST) + recover_on_sanity_check_failure(ptfhost, duthosts, post_failed_results, fanouthosts, localhost, + nbrhosts, post_check_items, recover_method, request, tbinfo, + STAGE_POST_TEST) logger.info("Done post-test sanity check") else: logger.info('No post-test sanity check item, skip post-test sanity check.') -def recover_on_sanity_check_failure(duthosts, failed_results, fanouthosts, localhost, nbrhosts, check_items, +def recover_on_sanity_check_failure(ptfhost, duthosts, failed_results, fanouthosts, localhost, nbrhosts, check_items, recover_method, request, tbinfo, sanity_check_stage: str): cache_key = "pre_sanity_check_failed" recovery_cache_key = "pre_sanity_recovered" @@ -363,7 +364,7 @@ def recover_on_sanity_check_failure(duthosts, failed_results, fanouthosts, local else: for dut_name, dut_results in list(dut_failed_results.items()): # Attempt to restore DUT state - recover(duthosts[dut_name], localhost, fanouthosts, nbrhosts, tbinfo, dut_results, + recover(ptfhost, duthosts[dut_name], localhost, fanouthosts, nbrhosts, tbinfo, dut_results, recover_method) except BaseException as e: diff --git a/tests/common/plugins/sanity_check/recover.py b/tests/common/plugins/sanity_check/recover.py index 42a4096d51e..bc85a7e6b0b 100644 --- a/tests/common/plugins/sanity_check/recover.py +++ b/tests/common/plugins/sanity_check/recover.py @@ -1,5 +1,6 @@ import json import logging +import time from tests.common import config_reload from tests.common.devices.sonic import SonicHost @@ -7,7 +8,7 @@ from tests.common.platform.device_utils import fanout_switch_port_lookup from tests.common.reboot import REBOOT_TYPE_WARM, REBOOT_TYPE_FAST, REBOOT_TYPE_COLD from tests.common.reboot import reboot -from tests.common.utilities import wait +from tests.common.utilities import wait, wait_until from . import constants from ...helpers.multi_thread_utils import SafeThreadPoolExecutor @@ -147,13 +148,49 @@ def _recover_with_command(dut, cmd, wait_time): wait(wait_time, msg="Wait {} seconds for system to be stable.".format(wait_time)) -def re_announce_routes(localhost, topo_name, ptf_ip): - localhost.announce_routes(topo_name=topo_name, ptf_ip=ptf_ip, action="withdraw", path="../ansible/") - localhost.announce_routes(topo_name=topo_name, ptf_ip=ptf_ip, action="announce", path="../ansible/") +def re_announce_routes(ptfhost, localhost, topo_name, ptf_ip, neighbor_number): + def _check_exabgp(): + # Get pid of exabgp processes + exabgp_pids = [] + output = ptfhost.shell("ps aux | grep exabgp/http_api |grep -v grep | awk '{print $2}'", + module_ignore_errors=True) + if output['rc'] != 0: + logger.warning("cmd to fetch exabgp pid returned with error: {}".format(output["stderr"])) + return False + for line in output["stdout_lines"]: + if len(line.strip()) == 0: + continue + exabgp_pids.append(line.strip()) + # Each bgp neighbor has 2 exabgp process, one is for v4 and another is for v6 + if len(exabgp_pids) != neighbor_number * 2: + logger.info("pids number for exabgp processes is incorrect, expected: {}, actual: {}" + .format(neighbor_number * 2, len(exabgp_pids))) + return False + # Check whether all sockets for exabgp are created + output = ptfhost.shell("ss -nltp | grep -E \"{}\"" + .format("|".join(["pid={}".format(pid) for pid in exabgp_pids])), + module_ignore_errors=True) + return output["rc"] == 0 and len(output["stdout_lines"]) == neighbor_number * 2 + + def _op_routes(action): + try: + localhost.announce_routes(topo_name=topo_name, ptf_ip=ptf_ip, action=action, path="../ansible/") + time.sleep(5) + except Exception as e: + logger.error("Failed to {} routes with error: {}".format(action, e)) + + ptfhost.shell("supervisorctl restart exabgpv4:*", module_ignore_errors=True) + ptfhost.shell("supervisorctl restart exabgpv6:*", module_ignore_errors=True) + # Wait exabgp to be ready + if not wait_until(120, 5, 0, _check_exabgp): + logger.error("Not all exabgp process are running") + + _op_routes("withdraw") + _op_routes("announce") return None -def adaptive_recover(dut, localhost, fanouthosts, nbrhosts, tbinfo, check_results, wait_time): +def adaptive_recover(ptfhost, dut, localhost, fanouthosts, nbrhosts, tbinfo, check_results, wait_time): outstanding_action = None for result in check_results: if result['failed']: @@ -169,7 +206,8 @@ def adaptive_recover(dut, localhost, fanouthosts, nbrhosts, tbinfo, check_result "no_v6_default_route" in result['bgp'] and len(result['bgp']) == 1 or ("no_v4_default_route" in result['bgp'] and "no_v6_default_route" in result['bgp'] and len(result['bgp']) == 2))): - action = re_announce_routes(localhost, tbinfo["topo"]["name"], tbinfo["ptf_ip"]) + action = re_announce_routes(ptfhost, localhost, tbinfo["topo"]["name"], tbinfo["ptf_ip"], + len(nbrhosts)) else: action = neighbor_vm_restore(dut, nbrhosts, tbinfo, result) elif result['check_item'] == "neighbor_macsec_empty": @@ -199,13 +237,13 @@ def adaptive_recover(dut, localhost, fanouthosts, nbrhosts, tbinfo, check_result _recover_with_command(dut, method['cmd'], wait_time) -def recover(dut, localhost, fanouthosts, nbrhosts, tbinfo, check_results, recover_method): +def recover(ptfhost, dut, localhost, fanouthosts, nbrhosts, tbinfo, check_results, recover_method): logger.warning("Try to recover %s using method %s" % (dut.hostname, recover_method)) method = constants.RECOVER_METHODS[recover_method] wait_time = method['recover_wait'] if method["adaptive"]: - adaptive_recover(dut, localhost, fanouthosts, nbrhosts, tbinfo, check_results, wait_time) + adaptive_recover(ptfhost, dut, localhost, fanouthosts, nbrhosts, tbinfo, check_results, wait_time) elif method["reload"]: config_reload(dut, config_source='running_golden_config', safe_reload=True, check_intf_up_ports=True, wait_for_bgp=True)