Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[sanity_check][bgp] Enhance sanity check recover for bgp default route missing #16357

Merged
merged 3 commits into from
Jan 8, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 7 additions & 6 deletions tests/common/plugins/sanity_check/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ def prepare_parallel_run(request, parallel_run_context):


@pytest.fixture(scope="module")
def sanity_check_full(prepare_parallel_run, localhost, duthosts, request, fanouthosts, nbrhosts, tbinfo):
def sanity_check_full(ptfhost, prepare_parallel_run, localhost, duthosts, request, fanouthosts, nbrhosts, tbinfo):
logger.info("Prepare sanity check")
should_skip_sanity = prepare_parallel_run
if should_skip_sanity:
Expand Down Expand Up @@ -299,7 +299,7 @@ def sanity_check_full(prepare_parallel_run, localhost, duthosts, request, fanout
pt_assert(False, "!!!!!!!!!!!!!!!!Pre-test sanity check failed: !!!!!!!!!!!!!!!!\n{}"
.format(json.dumps(failed_results, indent=4, default=fallback_serializer)))
else:
recover_on_sanity_check_failure(duthosts, failed_results, fanouthosts, localhost, nbrhosts,
recover_on_sanity_check_failure(ptfhost, duthosts, failed_results, fanouthosts, localhost, nbrhosts,
pre_check_items, recover_method, request, tbinfo, STAGE_PRE_TEST)

logger.info("Done pre-test sanity check")
Expand All @@ -325,15 +325,16 @@ def sanity_check_full(prepare_parallel_run, localhost, duthosts, request, fanout
pt_assert(False, "!!!!!!!!!!!!!!!! Post-test sanity check failed: !!!!!!!!!!!!!!!!\n{}"
.format(json.dumps(post_failed_results, indent=4, default=fallback_serializer)))
else:
recover_on_sanity_check_failure(duthosts, post_failed_results, fanouthosts, localhost, nbrhosts,
post_check_items, recover_method, request, tbinfo, STAGE_POST_TEST)
recover_on_sanity_check_failure(ptfhost, duthosts, post_failed_results, fanouthosts, localhost,
nbrhosts, post_check_items, recover_method, request, tbinfo,
STAGE_POST_TEST)

logger.info("Done post-test sanity check")
else:
logger.info('No post-test sanity check item, skip post-test sanity check.')


def recover_on_sanity_check_failure(duthosts, failed_results, fanouthosts, localhost, nbrhosts, check_items,
def recover_on_sanity_check_failure(ptfhost, duthosts, failed_results, fanouthosts, localhost, nbrhosts, check_items,
recover_method, request, tbinfo, sanity_check_stage: str):
cache_key = "pre_sanity_check_failed"
recovery_cache_key = "pre_sanity_recovered"
Expand Down Expand Up @@ -363,7 +364,7 @@ def recover_on_sanity_check_failure(duthosts, failed_results, fanouthosts, local
else:
for dut_name, dut_results in list(dut_failed_results.items()):
# Attempt to restore DUT state
recover(duthosts[dut_name], localhost, fanouthosts, nbrhosts, tbinfo, dut_results,
recover(ptfhost, duthosts[dut_name], localhost, fanouthosts, nbrhosts, tbinfo, dut_results,
recover_method)

except BaseException as e:
Expand Down
54 changes: 46 additions & 8 deletions tests/common/plugins/sanity_check/recover.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
import json
import logging
import time

from tests.common import config_reload
from tests.common.devices.sonic import SonicHost
from tests.common.helpers.parallel import parallel_run, reset_ansible_local_tmp
from tests.common.platform.device_utils import fanout_switch_port_lookup
from tests.common.reboot import REBOOT_TYPE_WARM, REBOOT_TYPE_FAST, REBOOT_TYPE_COLD
from tests.common.reboot import reboot
from tests.common.utilities import wait
from tests.common.utilities import wait, wait_until
from . import constants
from ...helpers.multi_thread_utils import SafeThreadPoolExecutor

Expand Down Expand Up @@ -147,13 +148,49 @@ def _recover_with_command(dut, cmd, wait_time):
wait(wait_time, msg="Wait {} seconds for system to be stable.".format(wait_time))


def re_announce_routes(localhost, topo_name, ptf_ip):
localhost.announce_routes(topo_name=topo_name, ptf_ip=ptf_ip, action="withdraw", path="../ansible/")
localhost.announce_routes(topo_name=topo_name, ptf_ip=ptf_ip, action="announce", path="../ansible/")
def re_announce_routes(ptfhost, localhost, topo_name, ptf_ip, neighbor_number):
def _check_exabgp():
# Get pid of exabgp processes
exabgp_pids = []
output = ptfhost.shell("ps aux | grep exabgp/http_api |grep -v grep | awk '{print $2}'",
module_ignore_errors=True)
if output['rc'] != 0:
logger.warning("cmd to fetch exabgp pid returned with error: {}".format(output["stderr"]))
return False
for line in output["stdout_lines"]:
if len(line.strip()) == 0:
continue
exabgp_pids.append(line.strip())
# Each bgp neighbor has 2 exabgp process, one is for v4 and another is for v6
if len(exabgp_pids) != neighbor_number * 2:
logger.info("pids number for exabgp processes is incorrect, expected: {}, actual: {}"
.format(neighbor_number * 2, len(exabgp_pids)))
return False
# Check whether all sockets for exabgp are created
output = ptfhost.shell("ss -nltp | grep -E \"{}\""
.format("|".join(["pid={}".format(pid) for pid in exabgp_pids])),
module_ignore_errors=True)
return output["rc"] == 0 and len(output["stdout_lines"]) == neighbor_number * 2

def _op_routes(action):
try:
localhost.announce_routes(topo_name=topo_name, ptf_ip=ptf_ip, action=action, path="../ansible/")
time.sleep(5)
except Exception as e:
logger.error("Failed to {} routes with error: {}".format(action, e))

ptfhost.shell("supervisorctl restart exabgpv4:*", module_ignore_errors=True)
ptfhost.shell("supervisorctl restart exabgpv6:*", module_ignore_errors=True)
# Wait exabgp to be ready
if not wait_until(120, 5, 0, _check_exabgp):
logger.error("Not all exabgp process are running")

_op_routes("withdraw")
_op_routes("announce")
return None


def adaptive_recover(dut, localhost, fanouthosts, nbrhosts, tbinfo, check_results, wait_time):
def adaptive_recover(ptfhost, dut, localhost, fanouthosts, nbrhosts, tbinfo, check_results, wait_time):
outstanding_action = None
for result in check_results:
if result['failed']:
Expand All @@ -169,7 +206,8 @@ def adaptive_recover(dut, localhost, fanouthosts, nbrhosts, tbinfo, check_result
"no_v6_default_route" in result['bgp'] and len(result['bgp']) == 1 or
("no_v4_default_route" in result['bgp'] and "no_v6_default_route" in result['bgp'] and
len(result['bgp']) == 2))):
action = re_announce_routes(localhost, tbinfo["topo"]["name"], tbinfo["ptf_ip"])
action = re_announce_routes(ptfhost, localhost, tbinfo["topo"]["name"], tbinfo["ptf_ip"],
len(nbrhosts))
else:
action = neighbor_vm_restore(dut, nbrhosts, tbinfo, result)
elif result['check_item'] == "neighbor_macsec_empty":
Expand Down Expand Up @@ -199,13 +237,13 @@ def adaptive_recover(dut, localhost, fanouthosts, nbrhosts, tbinfo, check_result
_recover_with_command(dut, method['cmd'], wait_time)


def recover(dut, localhost, fanouthosts, nbrhosts, tbinfo, check_results, recover_method):
def recover(ptfhost, dut, localhost, fanouthosts, nbrhosts, tbinfo, check_results, recover_method):
logger.warning("Try to recover %s using method %s" % (dut.hostname, recover_method))

method = constants.RECOVER_METHODS[recover_method]
wait_time = method['recover_wait']
if method["adaptive"]:
adaptive_recover(dut, localhost, fanouthosts, nbrhosts, tbinfo, check_results, wait_time)
adaptive_recover(ptfhost, dut, localhost, fanouthosts, nbrhosts, tbinfo, check_results, wait_time)
elif method["reload"]:
config_reload(dut, config_source='running_golden_config',
safe_reload=True, check_intf_up_ports=True, wait_for_bgp=True)
Expand Down
Loading