Skip to content

Commit

Permalink
Add debuggability for reboot function (#15868)
Browse files Browse the repository at this point in the history
* Add debuggability for reboot function

1. Add function to collect console log from starting reboot to dut up
2. When dut is not up, check if dut is pingable and collect the mgmt interface config

* fix pre-commit checker issue
  • Loading branch information
JibinBao authored Dec 20, 2024
1 parent 7d96724 commit fd0b20c
Show file tree
Hide file tree
Showing 3 changed files with 258 additions and 194 deletions.
201 changes: 200 additions & 1 deletion tests/common/helpers/dut_utils.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,32 @@
import logging
import allure
import os
import jinja2
import glob
import re
import yaml
from tests.common.helpers.assertions import pytest_assert
from tests.common.utilities import get_host_visible_vars
from tests.common.utilities import wait_until
from tests.common.errors import RunAnsibleModuleFail
from collections import defaultdict
from tests.common.connections.console_host import ConsoleHost
from tests.common.utilities import get_dut_current_passwd
from tests.common.connections.base_console_conn import (
CONSOLE_SSH_CISCO_CONFIG,
CONSOLE_SSH_DIGI_CONFIG,
CONSOLE_SSH_SONIC_CONFIG
)
import time

CONTAINER_CHECK_INTERVAL_SECS = 1
CONTAINER_RESTART_THRESHOLD_SECS = 180

# Ansible config files
LAB_CONNECTION_GRAPH_PATH = os.path.normpath((os.path.join(os.path.dirname(__file__), "../../../ansible/files")))

BASI_PATH = os.path.dirname(os.path.abspath(__file__))


logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -346,7 +361,7 @@ def get_sai_sdk_dump_file(duthost, dump_file_name):
cmd_gen_sdk_dump = f"docker exec syncd bash -c 'saisdkdump -f {full_path_dump_file}' "
duthost.shell(cmd_gen_sdk_dump)

cmd_copy_dmp_from_syncd_to_host = f"docker cp syncd:{full_path_dump_file} {full_path_dump_file}"
cmd_copy_dmp_from_syncd_to_host = f"docker cp syncd: {full_path_dump_file} {full_path_dump_file}"
duthost.shell(cmd_copy_dmp_from_syncd_to_host)

compressed_dump_file = f"/tmp/{dump_file_name}.tar.gz"
Expand Down Expand Up @@ -393,3 +408,187 @@ def is_mellanox_fanout(duthost, localhost):
return False

return True


def create_duthost_console(duthost,localhost, conn_graph_facts, creds): # noqa F811
dut_hostname = duthost.hostname
console_host = conn_graph_facts['device_console_info'][dut_hostname]['ManagementIp']
if "/" in console_host:
console_host = console_host.split("/")[0]
console_port = conn_graph_facts['device_console_link'][dut_hostname]['ConsolePort']['peerport']
console_type = conn_graph_facts['device_console_link'][dut_hostname]['ConsolePort']['type']
console_menu_type = conn_graph_facts['device_console_link'][dut_hostname]['ConsolePort']['menu_type']
console_username = conn_graph_facts['device_console_link'][dut_hostname]['ConsolePort']['proxy']

console_type = f"console_{console_type}"
console_menu_type = f"{console_type}_{console_menu_type}"

# console password and sonic_password are lists, which may contain more than one password
sonicadmin_alt_password = localhost.host.options['variable_manager']._hostvars[dut_hostname].get(
"ansible_altpassword")
sonic_password = [creds['sonicadmin_password'], sonicadmin_alt_password]

# Attempt to clear the console port
try:
duthost_clear_console_port(
menu_type=console_menu_type,
console_host=console_host,
console_port=console_port,
console_username=console_username,
console_password=creds['console_password'][console_type]
)
except Exception as e:
logger.warning(f"Issue trying to clear console port: {e}")

# Set up console host
host = None
for attempt in range(1, 4):
try:
host = ConsoleHost(console_type=console_type,
console_host=console_host,
console_port=console_port,
sonic_username=creds['sonicadmin_user'],
sonic_password=sonic_password,
console_username=console_username,
console_password=creds['console_password'][console_type])
break
except Exception as e:
logger.warning(f"Attempt {attempt}/3 failed: {e}")
continue
else:
raise Exception("Failed to set up connection to console port. See warning logs for details.")

return host


def creds_on_dut(duthost):
""" read credential information according to the dut inventory """
groups = duthost.host.options['inventory_manager'].get_host(duthost.hostname).get_vars()['group_names']
groups.append("fanout")
logger.info("dut {} belongs to groups {}".format(duthost.hostname, groups))
exclude_regex_patterns = [
r'topo_.*\.yml',
r'breakout_speed\.yml',
r'lag_fanout_ports_test_vars\.yml',
r'qos\.yml',
r'sku-sensors-data\.yml',
r'mux_simulator_http_port_map\.yml'
]
ansible_folder_path = os.path.join(BASI_PATH, "../../../ansible/")
files = glob.glob(os.path.join(ansible_folder_path, "group_vars/all/*.yml"))
files += glob.glob(os.path.join(ansible_folder_path, "vars/*.yml"))
for group in groups:
files += glob.glob(os.path.join(ansible_folder_path, f"group_vars/{group}/*.yml"))
filtered_files = [
f for f in files if not re.search('|'.join(exclude_regex_patterns), f)
]

creds = {}
for f in filtered_files:
with open(f) as stream:
v = yaml.safe_load(stream)
if v is not None:
creds.update(v)
else:
logging.info("skip empty var file {}".format(f))

cred_vars = [
"sonicadmin_user",
"sonicadmin_password",
"docker_registry_host",
"docker_registry_username",
"docker_registry_password",
"public_docker_registry_host"
]
hostvars = duthost.host.options['variable_manager']._hostvars[duthost.hostname]
for cred_var in cred_vars:
if cred_var in creds:
creds[cred_var] = jinja2.Template(creds[cred_var]).render(**hostvars)
# load creds for console
if "console_login" not in list(hostvars.keys()):
console_login_creds = {}
else:
console_login_creds = hostvars["console_login"]
creds["console_user"] = {}
creds["console_password"] = {}

creds["ansible_altpasswords"] = []

# If ansible_altpasswords is empty, add ansible_altpassword to it
if len(creds["ansible_altpasswords"]) == 0:
creds["ansible_altpasswords"].append(hostvars["ansible_altpassword"])

passwords = creds["ansible_altpasswords"] + [creds["sonicadmin_password"]]
creds['sonicadmin_password'] = get_dut_current_passwd(
duthost.mgmt_ip,
duthost.mgmt_ipv6,
creds['sonicadmin_user'],
passwords
)

for k, v in list(console_login_creds.items()):
creds["console_user"][k] = v["user"]
creds["console_password"][k] = v["passwd"]

return creds


def duthost_clear_console_port(
menu_type: str,
console_host: str,
console_port: str,
console_username: str,
console_password: str
):
"""
Helper function to clear the console port for a given DUT.
Useful when a device has an occupied console port, preventing dut_console tests from running.
Parameters:
menu_type: Connection type for the console's config menu (as expected by the ConsoleTypeMapper)
console_host: DUT host's console IP address
console_port: DUT host's console port, to be cleared
console_username: Username for the console account (overridden for Digi console)
console_password: Password for the console account
"""
if menu_type == "console_ssh_":
raise Exception("Device does not have a defined Console_menu_type.")

# Override console user if the configuration menu is Digi, as this requires admin login
console_user = 'admin' if menu_type == CONSOLE_SSH_DIGI_CONFIG else console_username

duthost_config_menu = ConsoleHost(
console_type=menu_type,
console_host=console_host,
console_port=console_port,
console_username=console_user,
console_password=console_password,
sonic_username=None,
sonic_password=None
)

# Command lists for each config menu type
# List of tuples, containing a command to execute, and an optional pattern to wait for
command_list = {
CONSOLE_SSH_DIGI_CONFIG: [
('2', None), # Enter serial port config
(console_port, None), # Choose DUT console port
('a', None), # Enter port management
('1', f'Port #{console_port} has been reset successfully.') # Reset chosen port
],
CONSOLE_SSH_SONIC_CONFIG: [
(f'sudo sonic-clear line {console_port}', None) # Clear DUT console port (requires sudo)
],
CONSOLE_SSH_CISCO_CONFIG: [
(f'clear line tty {console_port}', '[confirm]'), # Clear DUT console port
('', '[OK]') # Confirm selection
],
}

for command, wait_for_pattern in command_list[menu_type]:
duthost_config_menu.write_channel(command + duthost_config_menu.RETURN)
duthost_config_menu.read_until_prompt_or_pattern(wait_for_pattern)

duthost_config_menu.disconnect()
logger.info(f"Successfully cleared console port {console_port}, sleeping for 5 seconds")
time.sleep(5)
60 changes: 56 additions & 4 deletions tests/common/reboot.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@
from .platform.interface_utils import check_interface_status_of_up_ports
from .platform.processes_utils import wait_critical_processes
from .utilities import wait_until, get_plt_reboot_ctrl
from tests.common.helpers.dut_utils import ignore_t2_syslog_msgs
from tests.common.helpers.dut_utils import ignore_t2_syslog_msgs, create_duthost_console, creds_on_dut
from tests.common.fixtures.conn_graph_facts import get_graph_facts


logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -189,7 +191,8 @@ def wait_for_startup(duthost, localhost, delay, timeout):
timeout=timeout,
module_ignore_errors=True)
if res.is_failed or ('msg' in res and 'Timeout' in res['msg']):
raise Exception('DUT {} did not startup'.format(hostname))
collect_mgmt_config_by_console(duthost, localhost)
raise Exception(f'DUT {hostname} did not startup. res: {res}')

logger.info('ssh has started up on {}'.format(hostname))

Expand Down Expand Up @@ -266,7 +269,10 @@ def reboot(duthost, localhost, reboot_type='cold', delay=10,
# Create a temporary file in tmpfs before reboot
logger.info('DUT {} create a file /dev/shm/test_reboot before rebooting'.format(hostname))
duthost.command('sudo touch /dev/shm/test_reboot')

wait_conlsole_connection = 5
console_thread_res = pool.apply_async(
collect_console_log, args=(duthost, localhost, timeout + wait_conlsole_connection))
time.sleep(wait_conlsole_connection)
reboot_res, dut_datetime = perform_reboot(duthost, pool, reboot_command, reboot_helper, reboot_kwargs, reboot_type)

wait_for_shutdown(duthost, localhost, delay, timeout, reboot_res)
Expand All @@ -277,7 +283,12 @@ def reboot(duthost, localhost, reboot_type='cold', delay=10,
# if wait_for_ssh flag is False, do not wait for dut to boot up
if not wait_for_ssh:
return
wait_for_startup(duthost, localhost, delay, timeout)
try:
wait_for_startup(duthost, localhost, delay, timeout)
except Exception as err:
logger.error('collecting console log thread result: {} on {}'.format(console_thread_res.get(), hostname))
pool.terminate()
raise Exception(f"dut not start: {err}")

logger.info('waiting for switch {} to initialize'.format(hostname))
if safe_reboot:
Expand Down Expand Up @@ -516,3 +527,44 @@ def check_determine_reboot_cause_service(dut):
assert active_state == "active", f"Service 'determine-reboot-cause' is not active. Current state: {active_state}"
assert sub_state == "exited", f"Service 'determine-reboot-cause' did not exit cleanly. \
Current sub-state: {sub_state}"


def try_create_dut_console(duthost, localhost, conn_graph_facts, creds):
try:
dut_sonsole = create_duthost_console(duthost, localhost, conn_graph_facts, creds)
except Exception as err:
logger.warning(f"Fail to create dut console. Please check console config or if console works ro not. {err}")
return None
logger.info("creating dut console succeeds")
return dut_sonsole


def collect_console_log(duthost, localhost, timeout):
logger.info("start: collect console log")
creds = creds_on_dut(duthost)
conn_graph_facts = get_graph_facts(duthost, localhost, [duthost.hostname])
dut_console = try_create_dut_console(duthost, localhost, conn_graph_facts, creds)
if dut_console:
logger.info(f"sleep {timeout} to collect console log....")
time.sleep(timeout)
dut_console.disconnect()
logger.info('end: collect console log')
else:
logger.warning("dut console is not ready, we cannot get log by console")


def collect_mgmt_config_by_console(duthost, localhost):
logger.info("check if dut is pingable")
localhost.shell(f"ping -c 5 {duthost.mgmt_ip}", module_ignore_errors=True)

logger.info("Start: collect mgmt config by console")
creds = creds_on_dut(duthost)
conn_graph_facts = get_graph_facts(duthost, localhost, [duthost.hostname])
dut_console = try_create_dut_console(duthost, localhost, conn_graph_facts, creds)
if dut_console:
dut_console.send_command("ip a s eth0")
dut_console.send_command("show ip int")
dut_console.disconnect()
logger.info('End: collect mgmt config by console ...')
else:
logger.warning("dut console is not ready, we can get mgmt config by console")
Loading

0 comments on commit fd0b20c

Please sign in to comment.