3893890: Issue: Exclusion list CI testing for PDR plugin (#211)
vg12345 authored May 19, 2024
1 parent b2b889c commit 2904204
Showing 3 changed files with 178 additions and 59 deletions.
1 change: 1 addition & 0 deletions plugins/pdr_deterministic_plugin/.ci/ci_matrix.yaml
@@ -72,6 +72,7 @@
# export SERVER_HOST=$SERVER_HOST
# scp $WORKSPACE/plugins/pdr_deterministic_plugin/tests/simulation_telemetry.py root@$SERVER_HOST:/tmp
# scp $WORKSPACE/plugins/pdr_deterministic_plugin/.ci/run_simulation_test.sh root@$SERVER_HOST:/tmp
# scp -r $WORKSPACE/utils root@$SERVER_HOST:/tmp
# env
# ssh root@$SERVER_HOST '/tmp/run_simulation_test.sh'
# parallel: false
228 changes: 170 additions & 58 deletions plugins/pdr_deterministic_plugin/tests/simulation_telemetry.py
@@ -1,3 +1,15 @@
#
# Copyright © 2013-2024 NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
#
# This software product is a proprietary product of Nvidia Corporation and its affiliates
# (the "Company") and all right, title, and interest in and to the software
# product, including all associated intellectual property rights, are and
# shall remain exclusively with the Company.
#
# This software product is governed by the End User License Agreement
# provided with the software product.
#

import time
from http.server import HTTPServer, BaseHTTPRequestHandler
from threading import Thread
@@ -7,6 +19,9 @@
import random
from os import _exit
from os.path import exists
from collections import OrderedDict
import requests
from utils.utils import Utils

lock = Lock()

@@ -20,13 +35,22 @@
FEC_MODE = "fec_mode_active"
ENDPOINT_CONFIG = {}

EXCLUDE_PORT_LONG_TIME = "ExcludePortForLongTime"
EXCLUDE_PORT_SHORT_TIME = "ExcludePortForShortTime"
INCLUDE_PORT = "IncludePort"

EXCLUDE_LIST_TEST_NAMES = [
EXCLUDE_PORT_LONG_TIME,
EXCLUDE_PORT_SHORT_TIME,
INCLUDE_PORT
]

class CsvEndpointHandler(BaseHTTPRequestHandler):

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

def do_GET(self):
ENDPOINT_CONFIG["ITERATION_TIME"] += 1
self.send_response(200)
self.send_header('Content-type', 'text/plain')
self.end_headers()
@@ -37,6 +61,11 @@ def do_GET(self):
return

endpoint = ENDPOINT_CONFIG[path]
excluded_ports_simulation(endpoint)

# Increase iteration counter AFTER excluded ports simulation
ENDPOINT_CONFIG["ITERATION_TIME"] += 1

data = endpoint['data']
self.wfile.write(data.encode())

@@ -46,38 +75,69 @@ def do_GET(self):
RCV_PACKETS_COUNTER:"10000000",
}

ALL_DATA_TEST = {
# all positive tests
# iteration, row index, counter name = value
(1,0,PHY_SYMBOL_ERROR):0, # example, also negative test
(1,3,RCV_ERRORS_COUNTER):50,
# All positive tests (tested ports should be isolated)
# (iteration, row index, counter name): value
POSITIVE_DATA_TEST = {
(1, 3, RCV_ERRORS_COUNTER): 50,
# testing packet drop rate criteria
(2,3,RCV_ERRORS_COUNTER):500,
(2, 3, RCV_ERRORS_COUNTER): 500,

# testing temperature changes
(3,4,TEMP_COUNTER):90,
(3, 4, TEMP_COUNTER): 90,
# testing temperature max difference
(3,6,TEMP_COUNTER):25,
(3, 6, TEMP_COUNTER): 25,

(4,8,RCV_REMOTE_PHY_ERROR_COUNTER):50,
(4, 8, RCV_REMOTE_PHY_ERROR_COUNTER): 50,
# testing packet drop rate criteria from the second counter, because we look at the rate
(5,8,RCV_REMOTE_PHY_ERROR_COUNTER):500,

# testing link down
(4,2,LINK_DOWN_COUNTER):2,
(5,2,LINK_DOWN_COUNTER):3,
(6,2,LINK_DOWN_COUNTER):4,
(5, 8, RCV_REMOTE_PHY_ERROR_COUNTER): 500,

# negative tests
# testing ber calculation ( should not pass as not all are not equal to 0)
# testing link down
(4, 2, LINK_DOWN_COUNTER): 2,
(5, 2, LINK_DOWN_COUNTER): 3,
(6, 2, LINK_DOWN_COUNTER): 4,

# testing automatic removal of a port from the exclusion list
(0, 9, EXCLUDE_PORT_SHORT_TIME): 60, # add to exclusion list for 60 seconds
(8, 9, LINK_DOWN_COUNTER): 1, # by this point the port should already have been automatically removed from the exclusion list
(9, 9, LINK_DOWN_COUNTER): 2, # try to trigger an isolation issue
# testing forced removal of a port from the exclusion list
(0, 1, EXCLUDE_PORT_LONG_TIME): 0, # add to exclusion list forever
(1, 1, INCLUDE_PORT): -1, # remove the port from the exclusion list
(2, 1, LINK_DOWN_COUNTER): 1, # by this point the port should already have been removed from the exclusion list
(3, 1, LINK_DOWN_COUNTER): 2, # try to trigger an isolation issue

# testing ber calculation (should not pass, as not all values are non-zero)
}

# All negative tests (tested ports should not be isolated)
# (iteration, row index, counter name): value
NEGATIVE_DATA_TEST = {
# example, also negative test
(1, 0, PHY_SYMBOL_ERROR): 0,

# testing exclusion list
(0, 5, EXCLUDE_PORT_LONG_TIME): 0, # add to exclusion list forever
(1, 5, LINK_DOWN_COUNTER): 1,
(2, 5, LINK_DOWN_COUNTER): 2, # try to trigger an isolation issue (should be ignored)
(3, 5, LINK_DOWN_COUNTER): 3, # try to trigger an isolation issue (should be ignored)
(4, 5, LINK_DOWN_COUNTER): 4, # try to trigger an isolation issue (should be ignored)
(5, 5, LINK_DOWN_COUNTER): 5, # try to trigger an isolation issue (should be ignored)
(6, 5, LINK_DOWN_COUNTER): 6, # try to trigger an isolation issue (should be ignored)
(7, 5, LINK_DOWN_COUNTER): 7, # try to trigger an isolation issue (should be ignored)
(8, 5, LINK_DOWN_COUNTER): 8, # try to trigger an isolation issue (should be ignored)
(9, 5, LINK_DOWN_COUNTER): 9, # try to trigger an isolation issue (should be ignored)
}

def get_max_iteration_index(tests):
return max([test[0] for test in tests]) if tests else 0

# the highest iteration index used by the tests, plus 2
MAX_ITERATIONS = max([x[0] for x in ALL_DATA_TEST]) + 2
MAX_POSITIVE_ITERATION_INDEX = get_max_iteration_index(POSITIVE_DATA_TEST)
MAX_NEGATIVE_ITERATION_INDEX = get_max_iteration_index(NEGATIVE_DATA_TEST)
MAX_ITERATIONS = max(MAX_POSITIVE_ITERATION_INDEX, MAX_NEGATIVE_ITERATION_INDEX) + 2
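
# Worked example with the dictionaries above: the largest iteration index in
# POSITIVE_DATA_TEST is 9 (e.g. (9, 9, LINK_DOWN_COUNTER)) and in NEGATIVE_DATA_TEST
# it is also 9, so MAX_ITERATIONS = max(9, 9) + 2 = 11; the extra two iterations
# presumably leave slack for the last test values to be picked up before the
# simulation switches over to randomized data.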

# return a randomized value based on the counter name
def randomizeValues(counter_name:str,iteration:int):
def randomize_values(counter_name:str,iteration:int):
if counter_name == RCV_PACKETS_COUNTER:
return 1000000 + iteration * 10
if counter_name == TEMP_COUNTER:
@@ -90,12 +150,15 @@ def randomizeValues(counter_name:str,iteration:int):
return 0

# return the value if it is defined in our telemetry test data, else return the default value for that counter.
def findValue(row_index:int, counter_name:str, iteration:int):
def find_value(row_index:int, counter_name:str, iteration:int, default=0):
if counter_name == RCV_PACKETS_COUNTER:
return str(1000000 + iteration * 10)
return ALL_DATA_TEST.get((iteration,row_index,counter_name),
DIFFERENT_DEFAULT_VALUES.get(counter_name,0))

value = POSITIVE_DATA_TEST.get((iteration, row_index, counter_name), None)
if value is None:
value = NEGATIVE_DATA_TEST.get((iteration, row_index, counter_name),
DIFFERENT_DEFAULT_VALUES.get(counter_name, default))

return value
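
# Illustrative lookups against the test data above (for reference): a positive entry
# wins over a negative one, and a key that appears in neither dictionary falls back
# to DIFFERENT_DEFAULT_VALUES or the supplied default.
#   find_value(3, RCV_ERRORS_COUNTER, 2)            -> 500        (POSITIVE_DATA_TEST)
#   find_value(0, PHY_SYMBOL_ERROR, 1)              -> 0          (NEGATIVE_DATA_TEST)
#   find_value(5, EXCLUDE_PORT_LONG_TIME, 0, None)  -> 0          (NEGATIVE_DATA_TEST)
#   find_value(0, RCV_PACKETS_COUNTER, 4)           -> "1000040"  (monotonic special case)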

def start_server(port:str,changes_intervals:int, run_forever:bool):
server_address = ('', int(port))
@@ -130,31 +193,69 @@ def start_server(port:str,changes_intervals:int, run_forever:bool):
last_val = counter['last_val']
## here we set the value for the counters
if ENDPOINT_CONFIG["ITERATION_TIME"] < MAX_ITERATIONS:
counter['last_val'] = findValue(index,counters_names[i],ENDPOINT_CONFIG["ITERATION_TIME"])
counter['last_val'] = find_value(index,counters_names[i],ENDPOINT_CONFIG["ITERATION_TIME"], 0)
else:
counter['last_val'] = randomizeValues(counters_names[i],ENDPOINT_CONFIG["ITERATION_TIME"])
counter['last_val'] = randomize_values(counters_names[i],ENDPOINT_CONFIG["ITERATION_TIME"])
row_data.append(str(last_val))
data.append(row_data)

output = [header] + data
csv_data = '\n'.join([','.join(row) for row in output]) + '\n'
endpoint['data'] = csv_data
endpoint['data'] = csv_data
if not run_forever and ENDPOINT_CONFIG["ITERATION_TIME"] > MAX_ITERATIONS:
# after all the tests are done, we need to stop the simulator and check the logs
return
time.sleep(changes_intervals)

def excluded_ports_simulation(endpoint):
added_ports = []
removed_ports = []
rows = endpoint['row']
for port_index in range(len(rows)):
port_name = endpoint["Ports_names"][port_index]
iteration = ENDPOINT_CONFIG["ITERATION_TIME"]

# Process remove operation
if find_value(port_index, INCLUDE_PORT, iteration, None) is not None:
# Remove from exclusion list
removed_ports.append(f"\"{port_name}\"")

# Process add operation
ttl_seconds = find_value(port_index, EXCLUDE_PORT_LONG_TIME, iteration, None)
if ttl_seconds is None:
ttl_seconds = find_value(port_index, EXCLUDE_PORT_SHORT_TIME, iteration, None)

if ttl_seconds is not None:
# Add to exclusion list
if ttl_seconds == 0:
# Test optional parameter for infinite TTL
added_ports.append(f"[\"{port_name}\"]")
else:
# Test limited TTL value
added_ports.append(f"[\"{port_name}\",{ttl_seconds}]")

if added_ports or removed_ports:
plugin_port = Utils.get_plugin_port(port_conf_file='/config/pdr_deterministic_httpd_proxy.conf', default_port_value=8977)
url=f"http://127.0.0.1:{plugin_port}/excluded"
if added_ports:
added_ports_str = '[' + ','.join(added_ports) + ']'
requests.put(url=url, data=added_ports_str, timeout=5)

if removed_ports:
removed_ports_str = '[' + ','.join(removed_ports) + ']'
requests.delete(url=url, data=removed_ports_str, timeout=5)
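
# For reference, a minimal sketch of the payloads built above for the plugin's
# "/excluded" REST endpoint (the GUIDs are hypothetical examples):
#   PUT    body '[["1a2b3c4d5e6f7081_1",60],["1a2b3c4d5e6f7082_2"]]'
#          -> exclude the first port for 60 seconds, the second one with no TTL (forever)
#   DELETE body '["1a2b3c4d5e6f7081_1"]'
#          -> remove the first port from the exclusion list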

# create an array of ports of size ports_num
def create_ports(config:dict,ports_num: int):
ports_list = []
ports_names = []
for _ in range(ports_num):
port_str = '0x%016x' % random.randrange(16**16)
port_guid = f'0x{random.randrange(16**16):016x}'
# holds the prefix of each simulated csv row,
# and a list of counter structures (to be filled in further down)
ports_list.append([f"{port_str},,{port_str},{port_str},1", []])
ports_names.append(port_str)
port_num = random.randint(1, 99)
ports_list.append([f"{port_guid},,{port_guid},{port_guid},{port_num}", []])
ports_names.append(f"{port_guid[2:]}_{port_num}")
config["Ports_names"] = ports_names
return ports_list
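
# Example of one generated entry (hypothetical GUID): the CSV row prefix would be
#   "0x1a2b3c4d5e6f7081,,0x1a2b3c4d5e6f7081,0x1a2b3c4d5e6f7081,12"
# and the matching name stored in config["Ports_names"] would be "1a2b3c4d5e6f7081_12",
# i.e. the GUID without the leading "0x" plus the port number; this is the string
# check_logs() later searches for in the "Isolated port" log lines.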

@@ -181,9 +282,18 @@ def initialize_simulated_counters(endpoint_obj: dict):

def assert_equal(message, left_expr, right_expr, test_name="positive"):
if left_expr == right_expr:
print(f" - {test_name} test name: {message} -- PASS")
print(f" - {test_name} test: {message} -- PASS")
else:
print(f" - {test_name} test name: {message} -- FAIL (expected: {right_expr}, actual: {left_expr})")
print(f" - {test_name} test: {message} -- FAIL (expected: {right_expr}, actual: {left_expr})")

def validate_simulation_data():
positive_test_port_indexes = set([x[1] for x in POSITIVE_DATA_TEST])
negative_test_port_indexes = set([x[1] for x in NEGATIVE_DATA_TEST])
if not positive_test_port_indexes.isdisjoint(negative_test_port_indexes):
print("ERROR: same port can't participate in both positive and negative tests")
return False

return True

def check_logs(config):
lines=[]
@@ -199,41 +309,40 @@ def check_logs(config):
if len(lines) == 0:
print("Could not find log file in " + str(location_logs_can_be))
return 1
# if a you want to add more tests, please add more guids and test on other indeces.

ports_should_be_isoloated_indeces = list(set([x[1] for x in ALL_DATA_TEST]))
ports_shouldnt_be_isolated_indeces = [0]
# remove negative tests from the positive ones
ports_should_be_isoloated_indeces = [port for port in ports_should_be_isoloated_indeces if port not in ports_shouldnt_be_isolated_indeces]
# if you want to add more tests, please add more guids and test on other indices.

ports_should_be_isolated_indices = list(set([x[1] for x in POSITIVE_DATA_TEST]))
ports_should_not_be_isolated_indices = list(set([x[1] for x in NEGATIVE_DATA_TEST]))

number_of_tests_approved = len(ports_should_be_isoloated_indeces)
number_of_negative_tests = len(ports_shouldnt_be_isolated_indeces)
number_of_failed_positive_tests = 0
number_of_failed_negative_tests = 0
isolated_message="WARNING: Isolated port: "
for p in ports_should_be_isoloated_indeces:
for p in ports_should_be_isolated_indices:
found=False
port_name = config["Ports_names"][p][2:]
testedCounter = set([x[2] for x in ALL_DATA_TEST if x[1]==p])
port_name = config["Ports_names"][p]
tested_counter = list(OrderedDict.fromkeys([x[2] for x in POSITIVE_DATA_TEST if x[1] == p]))
for line in lines:
foundPort = isolated_message + port_name in line
if foundPort:
found_port = isolated_message + port_name in line
if found_port:
found = True
number_of_tests_approved -= 1 # it was found
break
assert_equal(f"{port_name} which check {testedCounter} changed and in the logs",found,True)

for p in ports_shouldnt_be_isolated_indeces:
if not found:
number_of_failed_positive_tests += 1
assert_equal(f"port {port_name} (index: {p}) which check {tested_counter} changed and should be in the logs", found, True)

for p in ports_should_not_be_isolated_indices:
found=False
port_name = config["Ports_names"][p][2:]
testedCounter = set([x[2] for x in ALL_DATA_TEST if x[1]==p])
port_name = config["Ports_names"][p]
tested_counter = list(OrderedDict.fromkeys([x[2] for x in NEGATIVE_DATA_TEST if x[1] == p]))
for line in lines:
foundPort = isolated_message + port_name in line
if foundPort:
found=True
number_of_negative_tests -= 1 # it was found, but it shouldnt
found_port = isolated_message + port_name in line
if found_port:
found = True
number_of_failed_negative_tests += 1
break
assert_equal(f"{port_name} changed and in the logs",found,False,"negative")
all_pass = number_of_tests_approved == 0 and number_of_negative_tests == len(ports_shouldnt_be_isolated_indeces)
assert_equal(f"port {port_name} (index: {p}) which check {tested_counter} should not be in the logs", found, False, "negative")

all_pass = number_of_failed_positive_tests == 0 and number_of_failed_negative_tests == 0
return 0 if all_pass else 1

# start a server which updates the counters every time
@@ -273,6 +382,9 @@ def main():
config['row'] = config['selected_row']
initialize_simulated_counters(config)

if not validate_simulation_data():
return 1

port = args.endpoint_port
url = f'http://0.0.0.0:{port}{args.url_suffix}'
print(f'---Starting endpoint {url}')
@@ -199,6 +199,7 @@ def __init__(self, ufm_client: UFMCommunicator, logger):
self.link_down_isolation = pdr_config.getboolean(Constants.CONF_ISOLATION,Constants.LINK_DOWN_ISOLATION)
self.switch_hca_isolation = pdr_config.getboolean(Constants.CONF_ISOLATION,Constants.SWITCH_TO_HOST_ISOLATION)
self.test_mode = pdr_config.getboolean(Constants.CONF_COMMON,Constants.TEST_MODE, fallback=False)
self.test_iteration = 0
self.dynamic_unresponsive_limit = pdr_config.getint(Constants.CONF_ISOLATION,Constants.DYNAMIC_UNRESPONSIVE_LIMIT, fallback=3)
# Take from Conf
self.logger = logger
@@ -885,7 +886,11 @@ def main_flow(self):
t_begin = time.time()
self.exclude_list.refresh()
self.get_isolation_state()
self.logger.info("Retrieving telemetry data to determine ports' states")
if not self.test_mode:
self.logger.info("Retrieving telemetry data to determine ports' states")
else:
self.logger.info(f"Retrieving test mode telemetry data to determine ports' states: iteration {self.test_iteration}")
self.test_iteration += 1
try:
issues = self.read_next_set_of_high_ber_or_pdr_ports(endpoint_port)
except DynamicTelemetryUnresponsive:
Expand All @@ -894,6 +899,7 @@ def main_flow(self):
self.logger.error(f"Dynamic telemetry is unresponsive for {dynamic_telemetry_unresponsive_count} times, restarting telemetry session...")
endpoint_port = self.restart_telemetry_session()
dynamic_telemetry_unresponsive_count = 0
self.test_iteration = 0
continue
if len(issues) > self.max_num_isolate:
# UFM send external event
