3893890: Issue: Exclusion list CI testing for PDR plugin (#211)
vg12345 authored May 19, 2024
1 parent b2b889c commit 2904204
Showing 3 changed files with 178 additions and 59 deletions.
1 change: 1 addition & 0 deletions plugins/pdr_deterministic_plugin/.ci/ci_matrix.yaml
@@ -72,6 +72,7 @@
# export SERVER_HOST=$SERVER_HOST
# scp $WORKSPACE/plugins/pdr_deterministic_plugin/tests/simulation_telemetry.py root@$SERVER_HOST:/tmp
# scp $WORKSPACE/plugins/pdr_deterministic_plugin/.ci/run_simulation_test.sh root@$SERVER_HOST:/tmp
# scp -r $WORKSPACE/utils root@$SERVER_HOST:/tmp
# env
# ssh root@$SERVER_HOST '/tmp/run_simulation_test.sh'
# parallel: false
228 changes: 170 additions & 58 deletions plugins/pdr_deterministic_plugin/tests/simulation_telemetry.py
@@ -1,3 +1,15 @@
#
# Copyright © 2013-2024 NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
#
# This software product is a proprietary product of Nvidia Corporation and its affiliates
# (the "Company") and all right, title, and interest in and to the software
# product, including all associated intellectual property rights, are and
# shall remain exclusively with the Company.
#
# This software product is governed by the End User License Agreement
# provided with the software product.
#

import time
from http.server import HTTPServer, BaseHTTPRequestHandler
from threading import Thread
@@ -7,6 +19,9 @@
import random
from os import _exit
from os.path import exists
from collections import OrderedDict
import requests
from utils.utils import Utils

lock = Lock()

@@ -20,13 +35,22 @@
FEC_MODE = "fec_mode_active"
ENDPOINT_CONFIG = {}

EXCLUDE_PORT_LONG_TIME = "ExcludePortForLongTime"
EXCLUDE_PORT_SHORT_TIME = "ExcludePortForShortTime"
INCLUDE_PORT = "IncludePort"

EXCLUDE_LIST_TEST_NAMES = [
EXCLUDE_PORT_LONG_TIME,
EXCLUDE_PORT_SHORT_TIME,
INCLUDE_PORT
]

class CsvEndpointHandler(BaseHTTPRequestHandler):

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

def do_GET(self):
ENDPOINT_CONFIG["ITERATION_TIME"] += 1
self.send_response(200)
self.send_header('Content-type', 'text/plain')
self.end_headers()
@@ -37,6 +61,11 @@ def do_GET(self):
return

endpoint = ENDPOINT_CONFIG[path]
excluded_ports_simulation(endpoint)

# Increase iteration counter AFTER excluded ports simulation
ENDPOINT_CONFIG["ITERATION_TIME"] += 1

data = endpoint['data']
self.wfile.write(data.encode())

@@ -46,38 +75,69 @@ def do_GET(self):
RCV_PACKETS_COUNTER:"10000000",
}

ALL_DATA_TEST = {
# all positive tests
# iteration, row index, counter name = value
(1,0,PHY_SYMBOL_ERROR):0, # example, also negative test
(1,3,RCV_ERRORS_COUNTER):50,
# All positive tests (tested ports should be isolated)
# (iteration, row index, counter name): value
POSITIVE_DATA_TEST = {
(1, 3, RCV_ERRORS_COUNTER): 50,
# testing packet drop rate criteria
(2,3,RCV_ERRORS_COUNTER):500,
(2, 3, RCV_ERRORS_COUNTER): 500,

# testing temperature changes
(3,4,TEMP_COUNTER):90,
(3, 4, TEMP_COUNTER): 90,
# testing temperature max difference
(3,6,TEMP_COUNTER):25,
(3, 6, TEMP_COUNTER): 25,

(4,8,RCV_REMOTE_PHY_ERROR_COUNTER):50,
(4, 8, RCV_REMOTE_PHY_ERROR_COUNTER): 50,
# testing packet drop rate criteria from the second counter, because we look at the rate
(5,8,RCV_REMOTE_PHY_ERROR_COUNTER):500,

# testing link down
(4,2,LINK_DOWN_COUNTER):2,
(5,2,LINK_DOWN_COUNTER):3,
(6,2,LINK_DOWN_COUNTER):4,
(5, 8, RCV_REMOTE_PHY_ERROR_COUNTER): 500,

# negative tests
# testing ber calculation ( should not pass as not all are not equal to 0)
# testing link down
(4, 2, LINK_DOWN_COUNTER): 2,
(5, 2, LINK_DOWN_COUNTER): 3,
(6, 2, LINK_DOWN_COUNTER): 4,

# testing automatic removal of a port from the exclusion list
(0, 9, EXCLUDE_PORT_SHORT_TIME): 60, # add to exclusion list for 60 seconds
(8, 9, LINK_DOWN_COUNTER): 1, # by this point the port should already have been automatically removed from the exclusion list
(9, 9, LINK_DOWN_COUNTER): 2, # try to trigger an isolation issue
# testing forced removal of a port from the exclusion list
(0, 1, EXCLUDE_PORT_LONG_TIME): 0, # add to exclusion list forever
(1, 1, INCLUDE_PORT): -1, # remove the port from the exclusion list
(2, 1, LINK_DOWN_COUNTER): 1, # by this point the port should already have been removed from the exclusion list
(3, 1, LINK_DOWN_COUNTER): 2, # try to trigger an isolation issue

# testing ber calculation (should not pass, as not all values are non-zero)
}

# All negative tests (tested ports should not be isolated)
# (iteration, row index, counter name): value
NEGATIVE_DATA_TEST = {
# example, also negative test
(1, 0, PHY_SYMBOL_ERROR): 0,

# testing exclusion list
(0, 5, EXCLUDE_PORT_LONG_TIME): 0, # add to exclusion list forever
(1, 5, LINK_DOWN_COUNTER): 1,
(2, 5, LINK_DOWN_COUNTER): 2, # try to trigger an isolation issue (should be ignored)
(3, 5, LINK_DOWN_COUNTER): 3, # try to trigger an isolation issue (should be ignored)
(4, 5, LINK_DOWN_COUNTER): 4, # try to trigger an isolation issue (should be ignored)
(5, 5, LINK_DOWN_COUNTER): 5, # try to trigger an isolation issue (should be ignored)
(6, 5, LINK_DOWN_COUNTER): 6, # try to trigger an isolation issue (should be ignored)
(7, 5, LINK_DOWN_COUNTER): 7, # try to trigger an isolation issue (should be ignored)
(8, 5, LINK_DOWN_COUNTER): 8, # try to trigger an isolation issue (should be ignored)
(9, 5, LINK_DOWN_COUNTER): 9, # try to trigger an isolation issue (should be ignored)
}

def get_max_iteration_index(tests):
return max([test[0] for test in tests]) if tests else 0

# the highest iteration index used by the tests, plus 2
MAX_ITERATIONS = max([x[0] for x in ALL_DATA_TEST]) + 2
MAX_POSITIVE_ITERATION_INDEX = get_max_iteration_index(POSITIVE_DATA_TEST)
MAX_NEGATIVE_ITERATION_INDEX = get_max_iteration_index(NEGATIVE_DATA_TEST)
MAX_ITERATIONS = max(MAX_POSITIVE_ITERATION_INDEX, MAX_NEGATIVE_ITERATION_INDEX) + 2
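
# Worked example with the dictionaries above: the largest iteration index in
# POSITIVE_DATA_TEST is 9 (e.g. (9, 9, LINK_DOWN_COUNTER)) and in NEGATIVE_DATA_TEST
# it is also 9, so MAX_ITERATIONS = max(9, 9) + 2 = 11; the extra two iterations
# presumably leave slack for the last test values to be picked up before the
# simulation switches over to randomized data.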

# return a randomized value based on the counter name
def randomizeValues(counter_name:str,iteration:int):
def randomize_values(counter_name:str,iteration:int):
if counter_name == RCV_PACKETS_COUNTER:
return 1000000 + iteration * 10
if counter_name == TEMP_COUNTER:
@@ -90,12 +150,15 @@ def randomizeValues(counter_name:str,iteration:int):
return 0

# return the value if it is defined in our telemetry test data, else return the default value for that counter.
def findValue(row_index:int, counter_name:str, iteration:int):
def find_value(row_index:int, counter_name:str, iteration:int, default=0):
if counter_name == RCV_PACKETS_COUNTER:
return str(1000000 + iteration * 10)
return ALL_DATA_TEST.get((iteration,row_index,counter_name),
DIFFERENT_DEFAULT_VALUES.get(counter_name,0))

value = POSITIVE_DATA_TEST.get((iteration, row_index, counter_name), None)
if value is None:
value = NEGATIVE_DATA_TEST.get((iteration, row_index, counter_name),
DIFFERENT_DEFAULT_VALUES.get(counter_name, default))

return value
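
# Illustrative lookups against the test data above (for reference): a positive entry
# wins over a negative one, and a key that appears in neither dictionary falls back
# to DIFFERENT_DEFAULT_VALUES or the supplied default.
#   find_value(3, RCV_ERRORS_COUNTER, 2)            -> 500        (POSITIVE_DATA_TEST)
#   find_value(0, PHY_SYMBOL_ERROR, 1)              -> 0          (NEGATIVE_DATA_TEST)
#   find_value(5, EXCLUDE_PORT_LONG_TIME, 0, None)  -> 0          (NEGATIVE_DATA_TEST)
#   find_value(0, RCV_PACKETS_COUNTER, 4)           -> "1000040"  (monotonic special case)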

def start_server(port:str,changes_intervals:int, run_forever:bool):
server_address = ('', int(port))
@@ -130,31 +193,69 @@ def start_server(port:str,changes_intervals:int, run_forever:bool):
last_val = counter['last_val']
## here we set the value for the counters
if ENDPOINT_CONFIG["ITERATION_TIME"] < MAX_ITERATIONS:
counter['last_val'] = findValue(index,counters_names[i],ENDPOINT_CONFIG["ITERATION_TIME"])
counter['last_val'] = find_value(index,counters_names[i],ENDPOINT_CONFIG["ITERATION_TIME"], 0)
else:
counter['last_val'] = randomizeValues(counters_names[i],ENDPOINT_CONFIG["ITERATION_TIME"])
counter['last_val'] = randomize_values(counters_names[i],ENDPOINT_CONFIG["ITERATION_TIME"])
row_data.append(str(last_val))
data.append(row_data)

output = [header] + data
csv_data = '\n'.join([','.join(row) for row in output]) + '\n'
endpoint['data'] = csv_data
endpoint['data'] = csv_data
if not run_forever and ENDPOINT_CONFIG["ITERATION_TIME"] > MAX_ITERATIONS:
# after all the tests are done, we need to stop the simulator and check the logs
return
time.sleep(changes_intervals)

def excluded_ports_simulation(endpoint):
added_ports = []
removed_ports = []
rows = endpoint['row']
for port_index in range(len(rows)):
port_name = endpoint["Ports_names"][port_index]
iteration = ENDPOINT_CONFIG["ITERATION_TIME"]

# Process remove operation
if find_value(port_index, INCLUDE_PORT, iteration, None) is not None:
# Remove from exclusion list
removed_ports.append(f"\"{port_name}\"")

# Process add operation
ttl_seconds = find_value(port_index, EXCLUDE_PORT_LONG_TIME, iteration, None)
if ttl_seconds is None:
ttl_seconds = find_value(port_index, EXCLUDE_PORT_SHORT_TIME, iteration, None)

if ttl_seconds is not None:
# Add to exclusion list
if ttl_seconds == 0:
# Test optional parameter for infinite TTL
added_ports.append(f"[\"{port_name}\"]")
else:
# Test limited TTL value
added_ports.append(f"[\"{port_name}\",{ttl_seconds}]")

if added_ports or removed_ports:
plugin_port = Utils.get_plugin_port(port_conf_file='/config/pdr_deterministic_httpd_proxy.conf', default_port_value=8977)
url=f"http://127.0.0.1:{plugin_port}/excluded"
if added_ports:
added_ports_str = '[' + ','.join(added_ports) + ']'
requests.put(url=url, data=added_ports_str, timeout=5)

if removed_ports:
removed_ports_str = '[' + ','.join(removed_ports) + ']'
requests.delete(url=url, data=removed_ports_str, timeout=5)
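
# For reference, a minimal sketch of the payloads built above for the plugin's
# "/excluded" REST endpoint (the GUIDs are hypothetical examples):
#   PUT    body '[["1a2b3c4d5e6f7081_1",60],["1a2b3c4d5e6f7082_2"]]'
#          -> exclude the first port for 60 seconds, the second one with no TTL (forever)
#   DELETE body '["1a2b3c4d5e6f7081_1"]'
#          -> remove the first port from the exclusion list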

# create an array of ports of size ports_num
def create_ports(config:dict,ports_num: int):
ports_list = []
ports_names = []
for _ in range(ports_num):
port_str = '0x%016x' % random.randrange(16**16)
port_guid = f'0x{random.randrange(16**16):016x}'
# holds the prefix of each simulated csv row,
# and a list of counter structures (to be filled in further down)
ports_list.append([f"{port_str},,{port_str},{port_str},1", []])
ports_names.append(port_str)
port_num = random.randint(1, 99)
ports_list.append([f"{port_guid},,{port_guid},{port_guid},{port_num}", []])
ports_names.append(f"{port_guid[2:]}_{port_num}")
config["Ports_names"] = ports_names
return ports_list
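
# Example of one generated entry (hypothetical GUID): the CSV row prefix would be
#   "0x1a2b3c4d5e6f7081,,0x1a2b3c4d5e6f7081,0x1a2b3c4d5e6f7081,12"
# and the matching name stored in config["Ports_names"] would be "1a2b3c4d5e6f7081_12",
# i.e. the GUID without the leading "0x" plus the port number; this is the string
# check_logs() later searches for in the "Isolated port" log lines.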

@@ -181,9 +282,18 @@ def initialize_simulated_counters(endpoint_obj: dict):

def assert_equal(message, left_expr, right_expr, test_name="positive"):
if left_expr == right_expr:
print(f" - {test_name} test name: {message} -- PASS")
print(f" - {test_name} test: {message} -- PASS")
else:
print(f" - {test_name} test name: {message} -- FAIL (expected: {right_expr}, actual: {left_expr})")
print(f" - {test_name} test: {message} -- FAIL (expected: {right_expr}, actual: {left_expr})")

def validate_simulation_data():
positive_test_port_indexes = set([x[1] for x in POSITIVE_DATA_TEST])
negative_test_port_indexes = set([x[1] for x in NEGATIVE_DATA_TEST])
if not positive_test_port_indexes.isdisjoint(negative_test_port_indexes):
print("ERROR: same port can't participate in both positive and negative tests")
return False

return True

def check_logs(config):
lines=[]
@@ -199,41 +309,40 @@ def check_logs(config):
if len(lines) == 0:
print("Could not find log file in " + str(location_logs_can_be))
return 1
# if a you want to add more tests, please add more guids and test on other indeces.

ports_should_be_isoloated_indeces = list(set([x[1] for x in ALL_DATA_TEST]))
ports_shouldnt_be_isolated_indeces = [0]
# remove negative tests from the positive ones
ports_should_be_isoloated_indeces = [port for port in ports_should_be_isoloated_indeces if port not in ports_shouldnt_be_isolated_indeces]
# if you want to add more tests, please add more guids and test on other indices.

ports_should_be_isolated_indices = list(set([x[1] for x in POSITIVE_DATA_TEST]))
ports_should_not_be_isolated_indices = list(set([x[1] for x in NEGATIVE_DATA_TEST]))

number_of_tests_approved = len(ports_should_be_isoloated_indeces)
number_of_negative_tests = len(ports_shouldnt_be_isolated_indeces)
number_of_failed_positive_tests = 0
number_of_failed_negative_tests = 0
isolated_message="WARNING: Isolated port: "
for p in ports_should_be_isoloated_indeces:
for p in ports_should_be_isolated_indices:
found=False
port_name = config["Ports_names"][p][2:]
testedCounter = set([x[2] for x in ALL_DATA_TEST if x[1]==p])
port_name = config["Ports_names"][p]
tested_counter = list(OrderedDict.fromkeys([x[2] for x in POSITIVE_DATA_TEST if x[1] == p]))
for line in lines:
foundPort = isolated_message + port_name in line
if foundPort:
found_port = isolated_message + port_name in line
if found_port:
found = True
number_of_tests_approved -= 1 # it was found
break
assert_equal(f"{port_name} which check {testedCounter} changed and in the logs",found,True)

for p in ports_shouldnt_be_isolated_indeces:
if not found:
number_of_failed_positive_tests += 1
assert_equal(f"port {port_name} (index: {p}) which check {tested_counter} changed and should be in the logs", found, True)

for p in ports_should_not_be_isolated_indices:
found=False
port_name = config["Ports_names"][p][2:]
testedCounter = set([x[2] for x in ALL_DATA_TEST if x[1]==p])
port_name = config["Ports_names"][p]
tested_counter = list(OrderedDict.fromkeys([x[2] for x in NEGATIVE_DATA_TEST if x[1] == p]))
for line in lines:
foundPort = isolated_message + port_name in line
if foundPort:
found=True
number_of_negative_tests -= 1 # it was found, but it shouldnt
found_port = isolated_message + port_name in line
if found_port:
found = True
number_of_failed_negative_tests += 1
break
assert_equal(f"{port_name} changed and in the logs",found,False,"negative")
all_pass = number_of_tests_approved == 0 and number_of_negative_tests == len(ports_shouldnt_be_isolated_indeces)
assert_equal(f"port {port_name} (index: {p}) which check {tested_counter} should not be in the logs", found, False, "negative")

all_pass = number_of_failed_positive_tests == 0 and number_of_failed_negative_tests == 0
return 0 if all_pass else 1

# start a server which updates the counters every time
@@ -273,6 +382,9 @@ def main():
config['row'] = config['selected_row']
initialize_simulated_counters(config)

if not validate_simulation_data():
return 1

port = args.endpoint_port
url = f'http://0.0.0.0:{port}{args.url_suffix}'
print(f'---Starting endpoint {url}')
@@ -199,6 +199,7 @@ def __init__(self, ufm_client: UFMCommunicator, logger):
self.link_down_isolation = pdr_config.getboolean(Constants.CONF_ISOLATION,Constants.LINK_DOWN_ISOLATION)
self.switch_hca_isolation = pdr_config.getboolean(Constants.CONF_ISOLATION,Constants.SWITCH_TO_HOST_ISOLATION)
self.test_mode = pdr_config.getboolean(Constants.CONF_COMMON,Constants.TEST_MODE, fallback=False)
self.test_iteration = 0
self.dynamic_unresponsive_limit = pdr_config.getint(Constants.CONF_ISOLATION,Constants.DYNAMIC_UNRESPONSIVE_LIMIT, fallback=3)
# Take from Conf
self.logger = logger
@@ -885,7 +886,11 @@ def main_flow(self):
t_begin = time.time()
self.exclude_list.refresh()
self.get_isolation_state()
self.logger.info("Retrieving telemetry data to determine ports' states")
if not self.test_mode:
self.logger.info("Retrieving telemetry data to determine ports' states")
else:
self.logger.info(f"Retrieving test mode telemetry data to determine ports' states: iteration {self.test_iteration}")
self.test_iteration += 1
try:
issues = self.read_next_set_of_high_ber_or_pdr_ports(endpoint_port)
except DynamicTelemetryUnresponsive:
Expand All @@ -894,6 +899,7 @@ def main_flow(self):
self.logger.error(f"Dynamic telemetry is unresponsive for {dynamic_telemetry_unresponsive_count} times, restarting telemetry session...")
endpoint_port = self.restart_telemetry_session()
dynamic_telemetry_unresponsive_count = 0
self.test_iteration = 0
continue
if len(issues) > self.max_num_isolate:
# UFM send external event
