Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SAI validation #15324

Open
wants to merge 34 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 28 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
96094a0
Initial commit with draft design - in progress
opcoder0 Aug 22, 2024
30727ce
Remove test category column and fix formatting
opcoder0 Aug 22, 2024
abb1b88
Add everflow; fix description
opcoder0 Aug 22, 2024
65d2e0e
Add tests/fdb, tests/fib
opcoder0 Aug 27, 2024
e0fbbbf
Update design document; Write pubsub based sonic database access library
opcoder0 Sep 18, 2024
f4bd2d7
Merge branch 'master' into feat/sai-validation
opcoder0 Sep 30, 2024
9e510de
Add logging to test bfd and sonic db library
opcoder0 Oct 2, 2024
7b41e62
Merge branch 'master' into feat/sai-validation
opcoder0 Oct 3, 2024
25ab88d
Add wait_for_n_keys; update usage in bfd and acl
opcoder0 Oct 3, 2024
c3624dc
Basic working:
opcoder0 Oct 4, 2024
7fd5761
Add to design doc; Keep count
opcoder0 Oct 4, 2024
a3767a0
refactor code to count rules
opcoder0 Oct 4, 2024
a1a3178
Reduce logging; Add SAI validation to TestIncrementalAcl
opcoder0 Oct 5, 2024
da86d65
add log to test_bfd.py and make basic test_basic_new
opcoder0 Oct 6, 2024
b2b410f
Add new subscription based API for watching Sonic DB keys
opcoder0 Oct 11, 2024
c7e1adc
Merge branch 'master' into feat/sai-validation
opcoder0 Oct 31, 2024
d7d0ea8
Fix failures; Move changes to test_bfd.py; remove test_bfd_new.py;
opcoder0 Oct 31, 2024
fc76650
Add socat files
opcoder0 Oct 31, 2024
778acb1
remove test_bfd_new.py
opcoder0 Oct 31, 2024
e456688
Expose redis via db fixtures; Assert ACL test;
opcoder0 Nov 1, 2024
7a55ec3
Merge branch 'master' into feat/sai-validation
opcoder0 Nov 1, 2024
c51dc9f
Remove vs config files from commit
opcoder0 Nov 1, 2024
6cbabd6
Remove extra debug log entry
opcoder0 Nov 1, 2024
b1fa4e9
Fix merge issues in conftest.py
opcoder0 Nov 1, 2024
6548a75
Remove debug log entries from test_acl.py
opcoder0 Nov 1, 2024
d6523c5
Get duthost mgmt IP from fixture, make expose port configurable
opcoder0 Nov 6, 2024
33bf4b2
Merge branch 'master' into feat/sai-validation
opcoder0 Nov 20, 2024
ca07542
Merge branch 'master' into feat/sai-validation
opcoder0 Nov 26, 2024
44e891b
Handle exceptions in `wait_until_*` functions;
opcoder0 Dec 11, 2024
31afed8
Remove socat package and get it from SONiC swss container
opcoder0 Dec 16, 2024
4188ba2
Merge branch 'master' into feat/sai-validation
opcoder0 Dec 17, 2024
1da54da
Add ACL rule validation functions
opcoder0 Dec 24, 2024
32f6c41
Merge branch 'master' into feat/sai-validation
opcoder0 Dec 24, 2024
3e5ea5b
Cancel future that timeout; Skip validation for egress stage
opcoder0 Dec 24, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file not shown.
Binary file not shown.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
170 changes: 170 additions & 0 deletions docs/sai_validation/sai_validation_design.md

Large diffs are not rendered by default.

52 changes: 34 additions & 18 deletions tests/acl/test_acl.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

from abc import ABCMeta, abstractmethod
from collections import defaultdict
from datetime import timedelta

from tests.common import reboot, port_toggle
from tests.common.helpers.assertions import pytest_require, pytest_assert
Expand All @@ -25,6 +26,7 @@
from tests.common.fixtures.conn_graph_facts import conn_graph_facts # noqa F401
from tests.common.platform.processes_utils import wait_critical_processes
from tests.common.platform.interface_utils import check_all_interface_information
from tests.common.database.sonic import start_db_monitor, await_monitor

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -623,7 +625,7 @@ class BaseAclTest(six.with_metaclass(ABCMeta, object)):
ACL_COUNTERS_UPDATE_INTERVAL_SECS = 10

@abstractmethod
def setup_rules(self, dut, acl_table, ip_version):
def setup_rules(self, dut, acl_table, ip_version, asic_db_connection):
"""Setup ACL rules for testing.

Args:
Expand Down Expand Up @@ -663,7 +665,7 @@ def teardown_rules(self, dut):

@pytest.fixture(scope="class", autouse=True)
def acl_rules(self, duthosts, localhost, setup, acl_table, populate_vlan_arp_entries, tbinfo,
ip_version, conn_graph_facts): # noqa F811
ip_version, conn_graph_facts, asic_db_connection): # noqa F811
"""Setup/teardown ACL rules for the current set of tests.

Args:
Expand All @@ -676,12 +678,11 @@ def acl_rules(self, duthosts, localhost, setup, acl_table, populate_vlan_arp_ent

"""
dut_to_analyzer_map = {}

with SafeThreadPoolExecutor(max_workers=8) as executor:
for duthost in duthosts:
executor.submit(self.set_up_acl_rules_single_dut, acl_table, conn_graph_facts,
dut_to_analyzer_map, duthost, ip_version, localhost,
populate_vlan_arp_entries, tbinfo)
populate_vlan_arp_entries, tbinfo, asic_db_connection)
logger.info("Set up acl_rules finished")

try:
Expand All @@ -705,7 +706,7 @@ def tear_down_acl_rule_single_dut(self, duthost, loganalyzer):
def set_up_acl_rules_single_dut(self, acl_table,
conn_graph_facts, dut_to_analyzer_map, duthost, # noqa F811
ip_version, localhost,
populate_vlan_arp_entries, tbinfo):
populate_vlan_arp_entries, tbinfo, asic_db_connection):
logger.info("{}: ACL rule application started".format(duthost.hostname))
if duthost.is_supervisor_node():
return
Expand All @@ -717,7 +718,7 @@ def set_up_acl_rules_single_dut(self, acl_table,
# Ignore any other errors to reduce noise
loganalyzer.ignore_regex = [r".*"]
with loganalyzer:
self.setup_rules(duthost, acl_table, ip_version)
self.setup_rules(duthost, acl_table, ip_version, asic_db_connection)
# Give the dut some time for the ACL rules to be applied and LOG message generated
wait_until(300, 20, 0, check_msg_in_syslog,
duthost, LOG_EXPECT_ACL_RULE_CREATE_RE)
Expand Down Expand Up @@ -1235,7 +1236,7 @@ def _verify_acl_traffic(self, setup, direction, ptfadapter, pkt, dropped, ip_ver
class TestBasicAcl(BaseAclTest):
"""Test Basic functionality of ACL rules (i.e. setup with full update on a running device)."""

def setup_rules(self, dut, acl_table, ip_version):
def setup_rules(self, dut, acl_table, ip_version, asic_db_connection):
"""Setup ACL rules for testing.

Args:
Expand All @@ -1247,13 +1248,22 @@ def setup_rules(self, dut, acl_table, ip_version):
dut.host.options["variable_manager"].extra_vars.update({"acl_table_name": table_name})

logger.info("Generating basic ACL rules config for ACL table \"{}\" on {}".format(table_name, dut))

dut_conf_file_path = os.path.join(DUT_TMP_DIR, "acl_rules_{}.json".format(table_name))
dut.template(src=os.path.join(TEMPLATE_DIR, ACL_RULES_FULL_TEMPLATE[ip_version]),
dest=dut_conf_file_path)

logger.info("Applying ACL rules config \"{}\"".format(dut_conf_file_path))
dut.command("config acl update full {}".format(dut_conf_file_path))
with SafeThreadPoolExecutor(max_workers=8) as executor:
logger.info('Start monitoring for ACL rules')
prefix = 'ASIC_STATE:SAI_OBJECT_TYPE_ACL_ENTRY:*'
rules = json.loads(dut.command(f'cat {dut_conf_file_path}')['stdout'])
n_rules = len(rules['acl']['acl-sets']['acl-set'][table_name]['acl-entries']['acl-entry'])
acl_rules_monitor = start_db_monitor(executor, asic_db_connection, n_rules, prefix)
logger.info("Applying ACL rules config \"{}\"".format(dut_conf_file_path))
dut.command("config acl update full {}".format(dut_conf_file_path))
events, actual_wait_secs = await_monitor(acl_rules_monitor, timedelta(minutes=5))
logger.debug(f'Received {len(events)} after waiting for {actual_wait_secs} seconds')
logger.debug(f'Events: {events}')
assert n_rules == len(events)


class TestIncrementalAcl(BaseAclTest):
Expand All @@ -1263,7 +1273,7 @@ class TestIncrementalAcl(BaseAclTest):
multiple parts.
"""

def setup_rules(self, dut, acl_table, ip_version):
def setup_rules(self, dut, acl_table, ip_version, asic_db_connection):
"""Setup ACL rules for testing.

Args:
Expand All @@ -1273,16 +1283,22 @@ def setup_rules(self, dut, acl_table, ip_version):
"""
table_name = acl_table["table_name"]
dut.host.options["variable_manager"].extra_vars.update({"acl_table_name": table_name})

logger.info("Generating incremental ACL rules config for ACL table \"{}\""
.format(table_name))

for part, config_file in enumerate(ACL_RULES_PART_TEMPLATES[ip_version]):
dut_conf_file_path = os.path.join(DUT_TMP_DIR, "acl_rules_{}_part_{}.json".format(table_name, part))
dut.template(src=os.path.join(TEMPLATE_DIR, config_file), dest=dut_conf_file_path)

logger.info("Applying ACL rules config \"{}\"".format(dut_conf_file_path))
dut.command("config acl update incremental {}".format(dut_conf_file_path))
prefix = 'ASIC_STATE:SAI_OBJECT_TYPE_ACL_ENTRY:*'
with SafeThreadPoolExecutor(max_workers=8) as executor:
for part, config_file in enumerate(ACL_RULES_PART_TEMPLATES[ip_version]):
logger.info('Start monitoring for ACL rules')
dut_conf_file_path = os.path.join(DUT_TMP_DIR, "acl_rules_{}_part_{}.json".format(table_name, part))
dut.template(src=os.path.join(TEMPLATE_DIR, config_file), dest=dut_conf_file_path)
rules = json.loads(dut.command(f'cat {dut_conf_file_path}')['stdout'])
n_rules = len(rules['acl']['acl-sets']['acl-set'][table_name]['acl-entries']['acl-entry'])
acl_rules_monitor = start_db_monitor(executor, asic_db_connection, n_rules, prefix)
logger.info("Applying ACL rules config \"{}\"".format(dut_conf_file_path))
dut.command("config acl update incremental {}".format(dut_conf_file_path))
events, actual_wait_secs = await_monitor(acl_rules_monitor, timedelta(minutes=5))
logger.debug(f'Received {len(events)} after waiting for {actual_wait_secs} seconds')


@pytest.mark.reboot
Expand Down
83 changes: 61 additions & 22 deletions tests/bfd/test_bfd.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,13 @@
import json
import logging

from datetime import timedelta

from tests.common.dualtor.mux_simulator_control import toggle_all_simulator_ports_to_rand_selected_tor_m # noqa F401
from tests.common.snappi_tests.common_helpers import get_egress_queue_count
from tests.common.database.sonic import start_subscribe, stop_subscribe
from tests.common.database.sonic import wait_until_condition, check_hash_key
from tests.common.utilities import wait_until

pytestmark = [
pytest.mark.topology('t1'),
Expand Down Expand Up @@ -217,16 +222,19 @@ def check_ptf_bfd_status(ptfhost, neighbor_addr, local_addr, expected_state):
for line in bfd_state:
field = line.split('=')[0].strip()
if field == "state":
assert expected_state in line.split('=')[1].strip()
return expected_state in line.split('=')[1].strip()


def check_dut_bfd_status(duthost, neighbor_addr, expected_state, max_attempts=12, retry_interval=10):
logger.info(f'in check_dut_bfd_status will run for {max_attempts} every {retry_interval} seconds')
for i in range(max_attempts + 1):
bfd_state = duthost.shell("sonic-db-cli STATE_DB HGET 'BFD_SESSION_TABLE|default|default|{}' 'state'"
.format(neighbor_addr), module_ignore_errors=False)['stdout_lines']
logger.info(f'Query state of BFD_SESSION_TABLE|default|default|{neighbor_addr} = {bfd_state}')
logger.info("BFD state check: {} - {}".format(neighbor_addr, bfd_state[0]))

if expected_state in bfd_state[0]:
logger.info('returning as bfd state matches expected state')
return # Success, no need to retry

logger.error("BFD state check failed: {} - {}".format(neighbor_addr, bfd_state[0]))
Expand Down Expand Up @@ -379,53 +387,84 @@ def verify_bfd_queue_counters(duthost, dut_intf):

@pytest.mark.parametrize('dut_init_first', [True, False], ids=['dut_init_first', 'ptf_init_first'])
@pytest.mark.parametrize('ipv6', [False, True], ids=['ipv4', 'ipv6'])
def test_bfd_basic(request, rand_selected_dut, ptfhost, tbinfo, ipv6, dut_init_first):
def test_bfd_basic(request, state_db_connection,
rand_selected_dut, ptfhost, tbinfo, ipv6, dut_init_first):
duthost = rand_selected_dut
bfd_session_cnt = int(request.config.getoption('--num_sessions'))
local_addrs, prefix_len, neighbor_addrs, neighbor_devs, neighbor_interfaces = get_neighbors(duthost, tbinfo, ipv6,
count=bfd_session_cnt)

prefix = 'BFD_SESSION_TABLE|default|default|'
key_pattern = f'{prefix}*'
q, ctx = start_subscribe(redis_conn=state_db_connection, key_pattern=key_pattern)

try:
add_dut_ip(duthost, neighbor_devs, local_addrs, prefix_len)
init_ptf_bfd(ptfhost)
add_ipaddr(ptfhost, neighbor_addrs, prefix_len, neighbor_interfaces, ipv6)
create_bfd_sessions(ptfhost, duthost, local_addrs, neighbor_addrs, dut_init_first)

time.sleep(1)
# check all STATE_DB BFD_SESSION_TABLE neighbors' state is Up
status, actual_wait = wait_until_condition(q, prefix, neighbor_addrs,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If an exception is raised in wait_until_condition, it would return None. Can this tuple assignment pass if wait_until_condition returns None? If not, then there is a problem here. And the other wait_until_* tools have a similar problem.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed. ThreadPoolExecutor.submit raises RuntimeError, TypeError, and BrokenThreadPool; none of these should be handled here. concurrent.futures.Future.result can raise TimeoutError, CancelledError, and Exception, all of which are now handled by logging and returning appropriate return values.

condition_cb=lambda k, v: v.get('state') == 'Up',
timeout=timedelta(minutes=2))
logger.debug(f'All up; Wait for {neighbor_addrs} to be Up completed with'
f' {status} and time taken {actual_wait} seconds')
assert status is True
for idx, neighbor_addr in enumerate(neighbor_addrs):
check_dut_bfd_status(duthost, neighbor_addr, "Up")
check_ptf_bfd_status(ptfhost, neighbor_addr, local_addrs[idx], "Up")
assert check_ptf_bfd_status(ptfhost, neighbor_addr, local_addrs[idx], "Up") is True

update_idx = random.choice(list(range(bfd_session_cnt)))
update_bfd_session_state(ptfhost, neighbor_addrs[update_idx], local_addrs[update_idx], "admin")
time.sleep(1)

# check all STATE_DB BFD_SESSION_TABLE neighbors' state is Up except
# the one on neighbor_addrs[update_idx] which has to be 'Admin_Down'
status, actual_wait = wait_until_condition(q, prefix,
[neighbor_addrs[update_idx]],
condition_cb=lambda k, v: neighbor_addrs[update_idx] in k and v.get('state') == 'Admin_Down', # noqa F501
timeout=timedelta(minutes=2)
)
logger.debug(f'Admin check; Wait for {neighbor_addrs[update_idx]}'
f' to be Admin_Down and others to be Up. Status {status}, actual time {actual_wait}')
assert status is True
for idx, neighbor_addr in enumerate(neighbor_addrs):
if idx == update_idx:
check_dut_bfd_status(duthost, neighbor_addr, "Admin_Down")
check_ptf_bfd_status(ptfhost, neighbor_addr, local_addrs[idx], "AdminDown")
assert check_ptf_bfd_status(ptfhost, neighbor_addr, local_addrs[idx], "AdminDown") is True
else:
check_dut_bfd_status(duthost, neighbor_addr, "Up")
check_ptf_bfd_status(ptfhost, neighbor_addr, local_addrs[idx], "Up")
logger.debug(f'checking hash key for {prefix}{neighbor_addr}')
assert check_hash_key(state_db_connection, f'{prefix}{neighbor_addr}', 'state', 'Up') is True
assert check_ptf_bfd_status(ptfhost, neighbor_addr, local_addrs[idx], "Up") is True

update_bfd_session_state(ptfhost, neighbor_addrs[update_idx], local_addrs[update_idx], "up")
time.sleep(1)

check_dut_bfd_status(duthost, neighbor_addrs[update_idx], "Up")
check_ptf_bfd_status(ptfhost, neighbor_addrs[update_idx], local_addrs[update_idx], "Up")
# check the STATE_DB BFD_SESSION_TABLE neighbor_addrs[update_idx] state is back
# to 'Up'.
status, actual_wait = wait_until_condition(q, prefix, [neighbor_addrs[update_idx]],
condition_cb=lambda k, v: v.get('state') == 'Up',
timeout=timedelta(minutes=2))
logger.debug(f'Reset to Up check; Wait for {neighbor_addrs[update_idx]}'
f' to be Up completed with {status} and time taken {actual_wait} seconds')
assert status is True
assert check_ptf_bfd_status(ptfhost, neighbor_addrs[update_idx], local_addrs[update_idx], "Up") is True

update_idx = random.choice(list(range(bfd_session_cnt)))
update_bfd_state(ptfhost, neighbor_addrs[update_idx], local_addrs[update_idx], "suspend")
time.sleep(5)

# check all STATE_DB BFD_SESSION_TABLE neighbors' state is Up except
# the one on neighbor_addrs[update_idx] which has to be 'Down'
status, actual_wait = wait_until_condition(q, prefix,
[neighbor_addrs[update_idx]],
condition_cb=lambda k, v: v.get('state') == 'Down',
timeout=timedelta(minutes=2))
logger.debug(f'Suspend check; Wait for {neighbor_addrs[update_idx]}'
f' to be Down and others to be Up. Status {status}, actual time {actual_wait}')
assert status is True
for idx, neighbor_addr in enumerate(neighbor_addrs):
if idx == update_idx:
check_dut_bfd_status(duthost, neighbor_addr, "Down")
check_ptf_bfd_status(ptfhost, neighbor_addr, local_addrs[idx], "Init")
wait_until(60, 5, 0, check_ptf_bfd_status, ptfhost, neighbor_addr, local_addrs[idx], "Init")
else:
check_dut_bfd_status(duthost, neighbor_addr, "Up")
check_ptf_bfd_status(ptfhost, neighbor_addr, local_addrs[idx], "Up")
logger.debug(f'checking hash key for {prefix}{neighbor_addr}')
assert check_hash_key(state_db_connection, f'{prefix}{neighbor_addr}', 'state', 'Up') is True
assert check_ptf_bfd_status(ptfhost, neighbor_addr, local_addrs[idx], "Up") is True

finally:
stop_subscribe(ctx)
stop_ptf_bfd(ptfhost)
del_ipaddr(ptfhost, neighbor_addrs, prefix_len, neighbor_interfaces, ipv6)
remove_bfd_sessions(duthost, neighbor_addrs)
Expand Down
Loading
Loading