Skip to content

Commit

Permalink
issue:3950870 PDR: send a reset REST API of a port after isolation (#257
Browse files Browse the repository at this point in the history
)
  • Loading branch information
vg12345 authored Sep 19, 2024
1 parent 1417129 commit ec69bc2
Show file tree
Hide file tree
Showing 6 changed files with 81 additions and 1 deletion.
3 changes: 3 additions & 0 deletions plugins/pdr_deterministic_plugin/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,9 @@ The following parameters are configurable via the plugin's configuration file. (
| DEISOLATE_CONSIDER_TIME | consideration time for port deisolation (in minutes) | 5 |
| DO_DEISOLATION | if set to false, the plugin will not perform deisolation | True |
| DYNAMIC_WAIT_TIME | Seconds to wait for the dynamic telemetry session to respond | 30 |
| MAX_PORT_RESET_NUM | max number of resets per port | 2 |
| PORT_RESET_INTERVAL_SECONDS | time interval in seconds since the last reset after which the reset counter is zeroed (re-enabling resets) | 604800 (1 week) |


## Calculating BER Counters

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,12 @@ AUTOMATIC_DEISOLATE=True
# if set to false, the plugin will not perform deisolation
DO_DEISOLATION=True

[Reset]
# max number of resets per port
MAX_PORT_RESET_NUM=2
# time interval in seconds since the last reset after which the reset counter is zeroed (re-enabling resets); the default is 1 week (604800 seconds)
PORT_RESET_INTERVAL_SECONDS=604800

[Metrics]
# in Celsius
TMAX=70
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,11 @@ class PDRConstants():
CONF_SAMPLING = "Sampling"
CONF_ISOLATION = "Isolation"
CONF_METRICS = "Metrics"
CONF_RESET = "Reset"
INTERVAL = "INTERVAL"
MAX_NUM_ISOLATE = "MAX_NUM_ISOLATE"
MAX_PORT_RESET_NUM = "MAX_PORT_RESET_NUM"
PORT_RESET_INTERVAL_SECONDS = "PORT_RESET_INTERVAL_SECONDS"
TMAX = "TMAX"
D_TMAX = "D_TMAX"
MAX_PDR = "MAX_PDR"
Expand All @@ -53,6 +56,7 @@ class PDRConstants():
GET_ISOLATED_PORTS = "/resources/isolated_ports"
GET_PORTS_REST = "/resources/ports"
GET_ACTIVE_PORTS_REST = "/resources/ports?active=true"
POST_ACTIONS_REST = "/actions"
API_HEALTHY_PORTS = "healthy_ports"
API_ISOLATED_PORTS = "isolated_ports"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
# This software product is governed by the End User License Agreement
# provided with the software product.
#
from datetime import datetime, timedelta
import traceback
import time
import http
Expand All @@ -22,6 +23,18 @@
from ufm_communication_mgr import UFMCommunicator
from data_store import DataStore

class PortReset:
    """
    Per-port reset bookkeeping: which port, how many resets, and when the last one happened.
    """

    def __init__(self, port_name):
        """
        Create an empty reset record for the given port.

        The counter starts at zero and the last-reset timestamp starts at
        datetime.min, i.e. "never reset".
        """
        # no reset has been recorded yet
        self.reset_count = 0
        self.reset_time = datetime.min
        self.port_name = port_name

#pylint: disable=too-many-instance-attributes
class IsolationMgr:
'''
Expand All @@ -34,6 +47,8 @@ def __init__(self, ufm_client: UFMCommunicator, logger):
self.isolated_ports = {}
# {port_name: telemetry_data}
self.ports_data = {}
# {port_name: PortReset}
self.ports_resets = {}
self.ufm_latest_isolation_state = []

pdr_config = configparser.ConfigParser()
Expand All @@ -46,6 +61,10 @@ def __init__(self, ufm_client: UFMCommunicator, logger):
self.do_deisolate = pdr_config.getboolean(Constants.CONF_ISOLATION, Constants.DO_DEISOLATION)
self.switch_hca_isolation = pdr_config.getboolean(Constants.CONF_ISOLATION, Constants.SWITCH_TO_HOST_ISOLATION)
self.test_mode = pdr_config.getboolean(Constants.CONF_COMMON, Constants.TEST_MODE, fallback=False)
self.max_port_reset_num = pdr_config.getint(Constants.CONF_RESET, Constants.MAX_PORT_RESET_NUM, fallback=2)
self.port_reset_interval = timedelta(seconds=pdr_config.getint(Constants.CONF_RESET,
Constants.PORT_RESET_INTERVAL_SECONDS,
fallback=7*24*3600)) # default is 1 week in seconds

self.test_iteration = 0
self.logger = logger
Expand Down Expand Up @@ -83,10 +102,13 @@ def eval_isolation(self, port_name, cause):
return

if not self.dry_run:
# Isolate port
ret = self.ufm_client.isolate_port(port_name)
if not ret or ret.status_code != http.HTTPStatus.OK:
self.logger.warning("Failed isolating port: %s with cause: %s... status_code= %s", port_name, cause, ret.status_code)
return
# Reset port
self.reset_port(port_name, port_obj.port_guid)
isolated_port = self.isolated_ports.get(port_name)
if not isolated_port:
self.isolated_ports[port_name] = IsolatedPort(port_name)
Expand All @@ -97,6 +119,36 @@ def eval_isolation(self, port_name, cause):
if not self.test_mode:
self.ufm_client.send_event(log_message, event_id=Constants.EXTERNAL_EVENT_ALERT, external_event_name="Isolating Port")

def reset_port(self, port_name, port_guid):
    """
    Reset a port unless its reset limit has been reached.

    A port's reset history (self.ports_resets[port_name]) is discarded once more
    than self.port_reset_interval has elapsed since its last recorded reset.
    Otherwise, when the recorded reset_count has reached self.max_port_reset_num,
    the reset is skipped and an info message is logged.

    On a successful reset the port's reset_count is incremented and its
    reset_time is set to the current time. On failure a warning is logged
    and the history is left without a new entry.

    :param port_name: name of the port, used as the key of the reset history
    :param port_guid: GUID passed through to the UFM client reset request
    """
    # Check if reset is allowed
    reset_history = self.ports_resets.get(port_name)
    if reset_history:
        if datetime.now() - reset_history.reset_time > self.port_reset_interval:
            # Too much time has passed since the last reset: clean reset history and allow resets
            del self.ports_resets[port_name]
            reset_history = None
        elif reset_history.reset_count >= self.max_port_reset_num:
            # Reset limit reached
            self.logger.info("Skipping reset of port: %s... reset limit exceeded", port_name)
            return

    # Perform reset
    ret = self.ufm_client.reset_port(port_name, port_guid)
    if not ret or ret.status_code != http.HTTPStatus.OK:
        # ret may be falsy without being a response object (e.g. None), so the
        # status code is read defensively instead of dereferencing ret directly
        status_code = getattr(ret, "status_code", None)
        self.logger.warning("Failed resetting port: %s... status_code= %s", port_name, status_code)
        return

    # Update port resets history
    if not reset_history:
        reset_history = PortReset(port_name)
        self.ports_resets[port_name] = reset_history

    reset_history.reset_count += 1
    reset_history.reset_time = datetime.now()
    self.logger.info("Resetting port: %s... reset_count= %s", port_name, reset_history.reset_count)

def eval_deisolate(self, port_name):
"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -433,7 +433,7 @@ def analyze_telemetry_data(self, ports_data, ports_counters):
issues[port_name] = ber_issue
# If out of operating conditions we'll overwrite the cause
if self.temp_check and self.is_out_of_operating_conf(port_name):
issues[port_name].cause = Constants.ISSUE_OONOC
issues[port_name] = Issue(port_obj.port_name, Constants.ISSUE_OONOC)
return list(issues.values())

def check_deisolation_conditions(self, isolated_port):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -104,3 +104,18 @@ def get_ports_metadata(self):

def get_port_metadata(self, port_name):
    """
    Request the ports resource of a single port.

    Issues a GET on Constants.GET_PORTS_REST with port_name appended as the
    last path segment and returns whatever get_request returns.
    """
    # no whitespace after the path separator: a stray space would become part of the requested path
    return self.get_request(f"{Constants.GET_PORTS_REST}/{port_name}")

def reset_port(self, port_name, port_guid):
    """
    Send a "reset" action for a port through the actions REST endpoint.

    port_name is sent as the "port_id" parameter and port_guid as the only
    entry of "object_ids". Returns whatever send_request returns.
    """
    # request body for POST /ufmRestV2/actions
    action_params = {"port_id": port_name}
    target_ids = [port_guid]
    payload = {
        "params": action_params,
        "action": "reset",
        "object_ids": target_ids,
        "object_type": "System",
        "description": "",
        "identifier": "id",
    }
    return self.send_request(Constants.POST_ACTIONS_REST, payload, method=Constants.POST_METHOD)

0 comments on commit ec69bc2

Please sign in to comment.