task:4151977 Log analyzer: analyze telemetry logs (#277)
* Analyzing telemetry - for now

* Working, needs refactor + doc string

* Working for telemetry

* pylint

* pylint2

* fixed the output to pdf

* pylint

* minor changes

* Removing old text from readme
boazhaim authored Nov 18, 2024
1 parent b2bcde4 commit 418a64a
Showing 8 changed files with 324 additions and 79 deletions.
17 changes: 0 additions & 17 deletions plugins/ufm_log_analyzer_plugin/README.md
@@ -26,22 +26,6 @@ sudo yum install -y libjpeg-devel zlib-devel
```
Know your UFM sysdump location.

#### Running on a remote server
Since the tool generates graphs, you will need to set up X11 forwarding:

1. Mac - Install and run [XQuartz](https://www.xquartz.org/). Windows - Install and run [Xming](http://www.straightrunning.com).
2. On your remote server (Ubuntu/RedHat), make sure X11 forwarding is enabled:
```
vim /etc/ssh/sshd_config
# Enable X11
X11Forwarding yes
```
3. Restart the SSH service with `systemctl restart ssh` or `systemctl restart sshd`, depending on the OS.
4. Install `python3-tk` using `sudo yum install python3-tkinter` or `sudo apt-get install python3-tk`, depending on the OS.
5. When you SSH to the server, use the `-X` flag, for example `ssh -X root@my-vm`.

To verify that forwarding works, run `xclock &` once the connection is established. This should open a clock window on your machine.

### How to run
```
./log_analzer.sh [options] -l <path to dump>
@@ -89,7 +73,6 @@ This logic will show links that:
2. Thermal shutdown.
3. One side went down and the other side was not rebooted.


![Tool flow](img/loganalzer.png)
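
The link criteria above amount to a simple filter over the parsed link-event data. A minimal sketch, assuming a hypothetical pandas table with made-up column names (the real analyzer's data layout is not shown in this diff):

```
import pandas as pd

# Hypothetical link-event table; column names are illustrative only.
links = pd.DataFrame({
    "link": ["sw1/p1-sw2/p3", "sw1/p2-sw3/p1", "sw4/p5-sw5/p2"],
    "down_reason": ["thermal_shutdown", "admin_down", "unknown"],
    "peer_rebooted": [False, True, False],
})

# Keep links that went through a thermal shutdown, or where one side went
# down while the other side was not rebooted.
suspected = links[
    (links["down_reason"] == "thermal_shutdown")
    | ((links["down_reason"] != "admin_down") & ~links["peer_rebooted"])
]
print(suspected)
```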


4 changes: 3 additions & 1 deletion plugins/ufm_log_analyzer_plugin/src/loganalyze/.pylintrc
@@ -5,5 +5,7 @@ disable=missing-function-docstring,
too-few-public-methods,
logging-fstring-interpolation,


[DESIGN]
max-locals=20
max-locals=20
max-args=8
73 changes: 49 additions & 24 deletions plugins/ufm_log_analyzer_plugin/src/loganalyze/log_analyzer.py
@@ -44,6 +44,8 @@
from loganalyze.log_analyzers.console_log_analyzer import ConsoleLogAnalyzer
from loganalyze.log_analyzers.rest_api_log_analyzer import RestApiAnalyzer
from loganalyze.log_analyzers.link_flapping_analyzer import LinkFlappingAnalyzer
from loganalyze.log_analyzers.ibdiagnet2_port_counters_analyzer \
import Ibdiagnet2PortCountersAnalyzer

from loganalyze.pdf_creator import PDFCreator
from loganalyze.utils.common import delete_files_by_types
@@ -252,7 +254,8 @@ def create_analyzer(parsed_args, full_extracted_logs_list,
in the full report.
Returns the created analyzer
"""
if log_name in full_extracted_logs_list:
# Check the base name, since some logs in the list come with a directory prefix
if any(os.path.basename(log) == log_name for log in full_extracted_logs_list):
log_csvs = get_files_in_dest_by_type(parsed_args.destination,
log_name,
parsed_args.extract_level)
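
To illustrate why the base-name comparison is needed (a minimal sketch; the file names below are made up), a direct membership test misses entries that carry a directory prefix:

```
import os

full_extracted_logs_list = ["ufm_logs/event.log", "rest_api.log"]
log_name = "event.log"

print(log_name in full_extracted_logs_list)  # False - the prefixed entry is missed
print(any(os.path.basename(log) == log_name
          for log in full_extracted_logs_list))  # True - base names match
```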
Expand Down Expand Up @@ -305,7 +308,7 @@ def create_analyzer(parsed_args, full_extracted_logs_list,
log.LOGGER.debug("Starting analyzing the data")
partial_create_analyzer = partial(create_analyzer,
parsed_args=args,
full_extracted_logs_list=full_logs_list,
full_extracted_logs_list=logs_to_work_with,
ufm_top_analyzer_obj=ufm_top_analyzer)

# Creating the analyzer for each log
@@ -328,6 +331,15 @@ def create_analyzer(parsed_args, full_extracted_logs_list,

rest_api_log_analyzer = partial_create_analyzer(log_name="rest_api.log",
analyzer_clc=RestApiAnalyzer)

ibdianget_2_ports_primary_analyzer = partial_create_analyzer(
log_name="ufm_logs_ibdiagnet2_port_counters.log",
analyzer_clc=Ibdiagnet2PortCountersAnalyzer)

ibdianget_2_ports_secondary_analyzer = partial_create_analyzer(
log_name="secondary_telemetry_ibdiagnet2_port_counters.log",
analyzer_clc=Ibdiagnet2PortCountersAnalyzer)

second_telemetry_samples = get_files_in_dest_by_type(args.destination,
"secondary_",
1000,
@@ -358,36 +370,49 @@
)

used_ufm_version = console_log_analyzer.ufm_versions
text_to_show_in_pdf = f"Used ufm version in console log {used_ufm_version}"
fabric_info = "fabric info:" + os.linesep + str(ibdiagnet_analyzer.get_fabric_size()) \
if ibdiagnet_analyzer else "No Fabric Info found" # pylint: disable=invalid-name
text_to_show_in_pdf = f"Used ufm version in console log {used_ufm_version}{os.linesep}"

pdf = PDFCreator(pdf_path, pdf_header, png_images, text_to_show_in_pdf)
dataframes_for_pdf = []
fabric_info = ibdiagnet_analyzer.get_fabric_size() \
if ibdiagnet_analyzer else "No Fabric Info found"
dataframes_for_pdf.append(("Fabric info", fabric_info))
if links_flapping_analyzer:
link_flapping = links_flapping_analyzer.get_link_flapping_last_week() \
if links_flapping_analyzer else "No link flapping info"
text_to_show_in_pdf += os.linesep + str(fabric_info) + os.linesep + \
"Link Flapping:" + os.linesep + str(link_flapping)

critical_events_burst = event_log_analyzer.get_critical_event_bursts()
critical_events_text = "The minute event_type event count" # pylint: disable=invalid-name
for critical_event in critical_events_burst:
timestamp = critical_event['timestamp']
event_type = critical_event['event_type']
event = critical_event['event']
counter = critical_event['count']
event_text = f"{timestamp} {event_type} {event} {counter}"
critical_events_text = critical_events_text + os.linesep + event_text

text_to_show_in_pdf += os.linesep + os.linesep + "More than 5 events burst over a minute:" \
+ os.linesep + critical_events_text
dataframes_for_pdf.append(("Link Flapping past week",
links_flapping_analyzer.get_link_flapping_last_week()))
lists_to_add = []
critical_events_headers = ["timestamp", "event_type", "event", "count"]
lists_to_add.append((event_log_analyzer.get_critical_event_bursts(),
"More than 5 events burst over a minute",
critical_events_headers))

for cur_telemetry in \
[ibdianget_2_ports_primary_analyzer, ibdianget_2_ports_secondary_analyzer]:
dataframes_for_pdf.append((f"{cur_telemetry.telemetry_type} Telemetry iteration time",
cur_telemetry.get_last_iterations_time_stats()))
dataframes_for_pdf.append((f"{cur_telemetry.telemetry_type} "
"Telemetry iteration first and last timestamps",
cur_telemetry.get_first_last_iteration_timestamp()))
dataframes_for_pdf.append((f"{cur_telemetry.telemetry_type} Telemetry fabric size",
cur_telemetry.get_number_of_switches_and_ports()))
lists_to_add.append(([cur_telemetry.get_number_of_core_dumps()],
f"{cur_telemetry.telemetry_type} "
"number of core dumps found in the logs",
["Amount"]))


# The PDF creator gets all the images to add to the report
pdf = PDFCreator(pdf_path, pdf_header, png_images, text_to_show_in_pdf)
pdf.created_pdf()
pdf.create_pdf(dataframes_for_pdf, lists_to_add)
# Generate a report that is saved in the destination directory
log.LOGGER.info("Analysis is done, please see the following outputs:")
for image, title in images_and_title_to_present:
log.LOGGER.info(f"{title}: {image}")
log.LOGGER.info(f"Summary PDF was created! you can open here at {pdf_path}")

if args.interactive:
import IPython
IPython.embed()

# Clean up unneeded files created during the run
files_types_to_delete = set()
files_types_to_delete.add("png") #png images created for PDF report
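
For reference, the report data assembled above follows two shapes: `dataframes_for_pdf` is a list of `(title, DataFrame)` tuples and `lists_to_add` is a list of `(rows, title, headers)` tuples, both handed to `pdf.create_pdf`. A minimal sketch with made-up values (the real titles, columns, and the PDFCreator internals live elsewhere in the plugin):

```
import pandas as pd

# Table sections: (title, DataFrame) tuples.
fabric_info = pd.DataFrame([{"# of Switches": 12, "CAs": 200, "Routers": 0, "Ports": 480}])
dataframes_for_pdf = [("Fabric info", fabric_info)]

# List sections: (rows, title, headers) tuples, where each row is a dict keyed by the headers.
critical_events = [
    {"timestamp": "2024-11-18 10:00", "event_type": "hardware", "event": "link_down", "count": 7},
]
lists_to_add = [
    (critical_events, "More than 5 events burst over a minute",
     ["timestamp", "event_type", "event", "count"]),
]

# In the plugin these are handed to the PDF creator:
# pdf = PDFCreator(pdf_path, pdf_header, png_images, text_to_show_in_pdf)
# pdf.create_pdf(dataframes_for_pdf, lists_to_add)
```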
@@ -12,6 +12,7 @@
# pylint: disable=missing-function-docstring
# pylint: disable=missing-module-docstring

import logging
import os
import csv
import shutil
@@ -21,15 +22,18 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import matplotlib.dates as mdates

from loganalyze.log_analyzers.constants import DataConstants
import loganalyze.logger as log
# This makes sure the user does not see warnings from plotting
logging.getLogger('matplotlib').setLevel(logging.ERROR)
matplotlib.use('Agg')  # Allows running the tool on headless servers without a graphics card

pd.set_option("display.max_colwidth", None)
warnings.filterwarnings("ignore")


class BaseImageCreator:
# Setting the graph time interval to 1 hour
# This is out side of the constructor since
@@ -47,7 +51,7 @@ def __init__(self, dest_image_path):
self._funcs_for_analysis = set()

def _save_data_based_on_timestamp(
self, data_to_plot, x_label, y_label, title
self, data_to_plot, x_label, y_label, title, large_sample=False
):
with plt.ion():
log.LOGGER.debug(f"saving {title}")
@@ -61,7 +65,10 @@ def _save_data_based_on_timestamp(
# Set the locator to show ticks every hour and the formatter to
# include both date and time
ax = plt.gca()
ax.xaxis.set_major_locator(mdates.HourLocator())
if large_sample:
ax.xaxis.set_major_locator(mdates.HourLocator(interval=24))
else:
ax.xaxis.set_major_locator(mdates.HourLocator())
ax.xaxis.set_minor_locator(
mdates.MinuteLocator(interval=15)
) # Add minor ticks every 15 minutes
@@ -94,7 +101,7 @@ def _save_data_based_on_timestamp(
self._images_created.extend(images_list_with_title)
plt.close()

def _save_pivot_data_in_bars( # pylint: disable=too-many-arguments
def _save_pivot_data_in_bars(
self, pivoted_data, x_label, y_label, title, legend_title
):
if pivoted_data.empty:
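
The changes above (forcing the `Agg` backend, silencing matplotlib's logger, and widening the major tick locator for large samples) can be exercised in isolation. A minimal, self-contained sketch, not the plugin's actual plotting code:

```
import logging

import matplotlib
matplotlib.use("Agg")  # headless backend: select before any drawing happens
logging.getLogger("matplotlib").setLevel(logging.ERROR)  # hide matplotlib's own warnings

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import pandas as pd

# A week of hourly samples: the default HourLocator() would produce ~168 major ticks.
idx = pd.date_range("2024-11-01", periods=24 * 7, freq="h")
series = pd.Series(range(len(idx)), index=idx)

fig, ax = plt.subplots()
ax.plot(series.index, series.values)
large_sample = True
if large_sample:
    ax.xaxis.set_major_locator(mdates.HourLocator(interval=24))  # one major tick per day
else:
    ax.xaxis.set_major_locator(mdates.HourLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y-%m-%d %H:%M"))
fig.autofmt_xdate()
fig.savefig("iteration_time_example.png")
plt.close(fig)
```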
@@ -0,0 +1,179 @@
#
# Copyright © 2013-2024 NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
#
# This software product is a proprietary product of Nvidia Corporation and its affiliates
# (the "Company") and all right, title, and interest in and to the software
# product, including all associated intellectual property rights, are and
# shall remain exclusively with the Company.
#
# This software product is governed by the End User License Agreement
# provided with the software product.
#

from typing import List
import warnings
import pandas as pd
from loganalyze.log_analyzers.base_analyzer import BaseAnalyzer


class Ibdiagnet2PortCountersAnalyzer(BaseAnalyzer):
def __init__(self,
logs_csvs: List[str],
hours: int,
dest_image_path: str,
sort_timestamp=False):
super().__init__(logs_csvs, hours, dest_image_path, sort_timestamp)
self._iteration_time_data = None
self._iteration_time_stats = None
self.text_to_show_in_pdf = ""
# This will make sure all the extra columns are int
extra_columns = ['extra1', 'extra2', 'extra3', 'extra4', 'extra5']
for col in extra_columns:
self._log_data_sorted[col] = pd.to_numeric(
self._log_data_sorted[col],
errors='coerce'
).astype('Int64')
self._funcs_for_analysis = {self.plot_iteration_time_over_time}
# Based on the log path, decide if this is primary or secondary telemetry
if "ufm_logs" in logs_csvs[0]:
self.telemetry_type = "primary"
elif "secondary_telemetry" in logs_csvs[0]:
self.telemetry_type = "secondary"
else:
self.telemetry_type = "Unknown_telemetry_type"

self._first_timestamp_of_logs = None
self._last_timestamp_of_logs = None

def get_collectx_versions(self):
unique_collectx_versions = self._log_data_sorted[\
self._log_data_sorted['type'] == 'collectx_version']['data'].unique()
return unique_collectx_versions

def get_number_of_switches_and_ports(self):
"""
Generate summary statistics for 'total_devices_ports' data.
This function calculates the average, maximum, minimum
for switches, CAs, routers, and ports.
"""
filtered_data = self._log_data_sorted[\
self._log_data_sorted['type'] == 'total_devices_ports']

ports_numbers_columns = ['extra1', 'extra3', 'extra5']
filtered_data['extra135'] = pd.to_numeric(
filtered_data[ports_numbers_columns].stack(), errors='coerce'
).groupby(level=0).sum(min_count=1)

columns_of_interest = ['data', 'extra2', 'extra4', 'extra135']
column_mapping = {
'data': '# of Switches',
'extra2': 'CAs',
'extra4': 'Routers',
'extra135': 'Ports'
}

summary_stats = []

for col in columns_of_interest:
numeric_col = pd.to_numeric(filtered_data[col], errors='coerce')
non_zero_col = numeric_col[numeric_col != 0]

avg = round(non_zero_col.mean()) if not non_zero_col.empty else 0
max_val = int(non_zero_col.max()) if not non_zero_col.empty else 0
min_val = int(non_zero_col.min()) if not non_zero_col.empty else 0
count = int(non_zero_col.count())

summary_stats.append({
'Category': column_mapping.get(col, col),
'Average': avg,
'Maximum': max_val,
'Minimum': min_val,
'Total Rows (Non-Zero)': count
})

summary_df = pd.DataFrame(summary_stats)

return summary_df

def analyze_iteration_time(self, threshold=0.15):
"""
Analyze rows where 'type' is 'iteration_time'.
Keep only 'type', 'timestamp', and 'data' columns.
Calculate statistics for the 'data' column, including timestamps for max and min.
Values below the threshold are filtered out before the statistics are computed.
Parameters:
- threshold (float): Minimum value to consider for analysis. Default is 0.15 seconds.
"""
filtered_data = self._log_data_sorted[self._log_data_sorted['type'] == 'iteration_time']
filtered_data = filtered_data[['type', 'timestamp', 'data']]
filtered_data['data'] = pd.to_numeric(filtered_data['data'], errors='coerce')

filtered_data = filtered_data[filtered_data['data'] >= threshold]
filtered_data['timestamp'] = pd.to_datetime(filtered_data['timestamp'], errors='coerce')
filtered_data = filtered_data.dropna(subset=['timestamp'])

filtered_data = filtered_data.sort_values(by='timestamp').reset_index(drop=True)

if not filtered_data['data'].empty:
average = filtered_data['data'].mean()
max_value = filtered_data['data'].max()
min_value = filtered_data['data'].min()

max_timestamp = filtered_data.loc[filtered_data['data'] \
== max_value, 'timestamp'].iloc[0]
min_timestamp = filtered_data.loc[filtered_data['data'] \
== min_value, 'timestamp'].iloc[0]
first_timestamp = filtered_data['timestamp'].iloc[0]
last_timestamp = filtered_data['timestamp'].iloc[-1]

else:
average = max_value = min_value = 0.0
max_timestamp = min_timestamp = None
first_timestamp = last_timestamp = None

stats = {
'Average': average,
'Maximum': max_value,
'Max Timestamp': max_timestamp,
'Minimum': min_value,
'Min Timestamp': min_timestamp,
'Total Rows': filtered_data['data'].count()
}
stats_df = pd.DataFrame([stats])
self._iteration_time_data = filtered_data
self._iteration_time_stats = stats_df
self._first_timestamp_of_logs = first_timestamp
self._last_timestamp_of_logs = last_timestamp
return stats_df

def get_first_last_iteration_timestamp(self):
if not self._first_timestamp_of_logs or not self._last_timestamp_of_logs:
self.analyze_iteration_time()
times = {
'first': str(self._first_timestamp_of_logs),
'last': str(self._last_timestamp_of_logs)
}
return pd.DataFrame([times])

def get_last_iterations_time_stats(self):
return self._iteration_time_stats

def plot_iteration_time_over_time(self):
if self._iteration_time_data is None:
self.analyze_iteration_time()

self._iteration_time_data.set_index('timestamp', inplace=True)

with warnings.catch_warnings():
warnings.filterwarnings("ignore", ".*Locator attempting to generate.*")
self._save_data_based_on_timestamp(
data_to_plot=self._iteration_time_data['data'],
x_label='Timestamp',
y_label='Iteration Time (s)',
title=f'{self.telemetry_type} Iteration Time',
large_sample=True)

def get_number_of_core_dumps(self):
core_dumps = self._log_data_sorted[self._log_data_sorted['type'] == 'timeout_dump_core']
return {"Amount":len(core_dumps)}