Skip to content

Commit

Permalink
[sanity_check][bgp] Enhance sanity check recover for bgp default rout…
Browse files Browse the repository at this point in the history
…e missing (sonic-net#16357)

What is the motivation for this PR?
Sometimes exabgp in ptf would be in incorrect status by stress testing, hence add restarting exabgp before re-announce routes in sanity check.

How did you do it?
Restart exabgp before re-announce routes
Add try catch to handle failed to re-announce issue
How did you verify/test it?
Run test with sanity check
  • Loading branch information
yaqiangz authored and selldinesh committed Jan 9, 2025
1 parent c8a7568 commit 8256c2a
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 14 deletions.
13 changes: 7 additions & 6 deletions tests/common/plugins/sanity_check/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ def prepare_parallel_run(request, parallel_run_context):


@pytest.fixture(scope="module")
def sanity_check_full(prepare_parallel_run, localhost, duthosts, request, fanouthosts, nbrhosts, tbinfo):
def sanity_check_full(ptfhost, prepare_parallel_run, localhost, duthosts, request, fanouthosts, nbrhosts, tbinfo):
logger.info("Prepare sanity check")
should_skip_sanity = prepare_parallel_run
if should_skip_sanity:
Expand Down Expand Up @@ -299,7 +299,7 @@ def sanity_check_full(prepare_parallel_run, localhost, duthosts, request, fanout
pt_assert(False, "!!!!!!!!!!!!!!!!Pre-test sanity check failed: !!!!!!!!!!!!!!!!\n{}"
.format(json.dumps(failed_results, indent=4, default=fallback_serializer)))
else:
recover_on_sanity_check_failure(duthosts, failed_results, fanouthosts, localhost, nbrhosts,
recover_on_sanity_check_failure(ptfhost, duthosts, failed_results, fanouthosts, localhost, nbrhosts,
pre_check_items, recover_method, request, tbinfo, STAGE_PRE_TEST)

logger.info("Done pre-test sanity check")
Expand All @@ -325,15 +325,16 @@ def sanity_check_full(prepare_parallel_run, localhost, duthosts, request, fanout
pt_assert(False, "!!!!!!!!!!!!!!!! Post-test sanity check failed: !!!!!!!!!!!!!!!!\n{}"
.format(json.dumps(post_failed_results, indent=4, default=fallback_serializer)))
else:
recover_on_sanity_check_failure(duthosts, post_failed_results, fanouthosts, localhost, nbrhosts,
post_check_items, recover_method, request, tbinfo, STAGE_POST_TEST)
recover_on_sanity_check_failure(ptfhost, duthosts, post_failed_results, fanouthosts, localhost,
nbrhosts, post_check_items, recover_method, request, tbinfo,
STAGE_POST_TEST)

logger.info("Done post-test sanity check")
else:
logger.info('No post-test sanity check item, skip post-test sanity check.')


def recover_on_sanity_check_failure(duthosts, failed_results, fanouthosts, localhost, nbrhosts, check_items,
def recover_on_sanity_check_failure(ptfhost, duthosts, failed_results, fanouthosts, localhost, nbrhosts, check_items,
recover_method, request, tbinfo, sanity_check_stage: str):
cache_key = "pre_sanity_check_failed"
recovery_cache_key = "pre_sanity_recovered"
Expand Down Expand Up @@ -363,7 +364,7 @@ def recover_on_sanity_check_failure(duthosts, failed_results, fanouthosts, local
else:
for dut_name, dut_results in list(dut_failed_results.items()):
# Attempt to restore DUT state
recover(duthosts[dut_name], localhost, fanouthosts, nbrhosts, tbinfo, dut_results,
recover(ptfhost, duthosts[dut_name], localhost, fanouthosts, nbrhosts, tbinfo, dut_results,
recover_method)

except BaseException as e:
Expand Down
54 changes: 46 additions & 8 deletions tests/common/plugins/sanity_check/recover.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
import json
import logging
import time

from tests.common import config_reload
from tests.common.devices.sonic import SonicHost
from tests.common.helpers.parallel import parallel_run, reset_ansible_local_tmp
from tests.common.platform.device_utils import fanout_switch_port_lookup
from tests.common.reboot import REBOOT_TYPE_WARM, REBOOT_TYPE_FAST, REBOOT_TYPE_COLD
from tests.common.reboot import reboot
from tests.common.utilities import wait
from tests.common.utilities import wait, wait_until
from . import constants
from ...helpers.multi_thread_utils import SafeThreadPoolExecutor

Expand Down Expand Up @@ -147,13 +148,49 @@ def _recover_with_command(dut, cmd, wait_time):
wait(wait_time, msg="Wait {} seconds for system to be stable.".format(wait_time))


def re_announce_routes(localhost, topo_name, ptf_ip):
localhost.announce_routes(topo_name=topo_name, ptf_ip=ptf_ip, action="withdraw", path="../ansible/")
localhost.announce_routes(topo_name=topo_name, ptf_ip=ptf_ip, action="announce", path="../ansible/")
def re_announce_routes(ptfhost, localhost, topo_name, ptf_ip, neighbor_number):
def _check_exabgp():
# Get pid of exabgp processes
exabgp_pids = []
output = ptfhost.shell("ps aux | grep exabgp/http_api |grep -v grep | awk '{print $2}'",
module_ignore_errors=True)
if output['rc'] != 0:
logger.warning("cmd to fetch exabgp pid returned with error: {}".format(output["stderr"]))
return False
for line in output["stdout_lines"]:
if len(line.strip()) == 0:
continue
exabgp_pids.append(line.strip())
# Each bgp neighbor has 2 exabgp process, one is for v4 and another is for v6
if len(exabgp_pids) != neighbor_number * 2:
logger.info("pids number for exabgp processes is incorrect, expected: {}, actual: {}"
.format(neighbor_number * 2, len(exabgp_pids)))
return False
# Check whether all sockets for exabgp are created
output = ptfhost.shell("ss -nltp | grep -E \"{}\""
.format("|".join(["pid={}".format(pid) for pid in exabgp_pids])),
module_ignore_errors=True)
return output["rc"] == 0 and len(output["stdout_lines"]) == neighbor_number * 2

def _op_routes(action):
try:
localhost.announce_routes(topo_name=topo_name, ptf_ip=ptf_ip, action=action, path="../ansible/")
time.sleep(5)
except Exception as e:
logger.error("Failed to {} routes with error: {}".format(action, e))

ptfhost.shell("supervisorctl restart exabgpv4:*", module_ignore_errors=True)
ptfhost.shell("supervisorctl restart exabgpv6:*", module_ignore_errors=True)
# Wait exabgp to be ready
if not wait_until(120, 5, 0, _check_exabgp):
logger.error("Not all exabgp process are running")

_op_routes("withdraw")
_op_routes("announce")
return None


def adaptive_recover(dut, localhost, fanouthosts, nbrhosts, tbinfo, check_results, wait_time):
def adaptive_recover(ptfhost, dut, localhost, fanouthosts, nbrhosts, tbinfo, check_results, wait_time):
outstanding_action = None
for result in check_results:
if result['failed']:
Expand All @@ -169,7 +206,8 @@ def adaptive_recover(dut, localhost, fanouthosts, nbrhosts, tbinfo, check_result
"no_v6_default_route" in result['bgp'] and len(result['bgp']) == 1 or
("no_v4_default_route" in result['bgp'] and "no_v6_default_route" in result['bgp'] and
len(result['bgp']) == 2))):
action = re_announce_routes(localhost, tbinfo["topo"]["name"], tbinfo["ptf_ip"])
action = re_announce_routes(ptfhost, localhost, tbinfo["topo"]["name"], tbinfo["ptf_ip"],
len(nbrhosts))
else:
action = neighbor_vm_restore(dut, nbrhosts, tbinfo, result)
elif result['check_item'] == "neighbor_macsec_empty":
Expand Down Expand Up @@ -199,13 +237,13 @@ def adaptive_recover(dut, localhost, fanouthosts, nbrhosts, tbinfo, check_result
_recover_with_command(dut, method['cmd'], wait_time)


def recover(dut, localhost, fanouthosts, nbrhosts, tbinfo, check_results, recover_method):
def recover(ptfhost, dut, localhost, fanouthosts, nbrhosts, tbinfo, check_results, recover_method):
logger.warning("Try to recover %s using method %s" % (dut.hostname, recover_method))

method = constants.RECOVER_METHODS[recover_method]
wait_time = method['recover_wait']
if method["adaptive"]:
adaptive_recover(dut, localhost, fanouthosts, nbrhosts, tbinfo, check_results, wait_time)
adaptive_recover(ptfhost, dut, localhost, fanouthosts, nbrhosts, tbinfo, check_results, wait_time)
elif method["reload"]:
config_reload(dut, config_source='running_golden_config',
safe_reload=True, check_intf_up_ports=True, wait_for_bgp=True)
Expand Down

0 comments on commit 8256c2a

Please sign in to comment.