task:4151977 Log analyzer: analyze telemetry logs (#277)
* Analyzing telemetry - for now

* Working, needs refactor + doc string

* Working for telemetry

* pylint

* pylint2

* fixed the output to pdf

* pylint

* minor changes

* Removing old text from readme
boazhaim authored Nov 18, 2024
1 parent b2bcde4 commit 418a64a
Showing 8 changed files with 324 additions and 79 deletions.
17 changes: 0 additions & 17 deletions plugins/ufm_log_analyzer_plugin/README.md
@@ -26,22 +26,6 @@ sudo yum install -y libjpeg-devel zlib-devel
```
Know your UFM sysdump location.

#### Running on a remote server
Since the tool generates graphs, you will need to set up X11 forwarding:

1. Mac - Install and run [XQuartz](https://www.xquartz.org/). Windows - Install and run [Xming](http://www.straightrunning.com).
2. On your remote server (Ubuntu/RedHat), make sure X11 forwarding is enabled:
```
vim /etc/ssh/sshd_config
# Enable X11
X11Forwarding yes
```
3. Restart the SSH service with `systemctl restart ssh` or `systemctl restart sshd`, depending on the OS.
4. Install `python3-tk` using `sudo yum install python3-tkinter` or `sudo apt-get install python3-tk`, depending on the OS.
5. When you SSH to the server, use the `-X` flag, for example `ssh -X root@my-vm`.

To verify that forwarding works, run `xclock &` once the connection is established. This should open a clock window on your machine.

### How to run
```
./log_analzer.sh [options] -l <path to dump>
@@ -89,7 +73,6 @@ This logic will show links that:
2. Thermal shutdown.
3. One side went down and the other side was not rebooted.


![Tool flow](img/loganalzer.png)
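
The link criteria above amount to a simple filter over the parsed link-event data. A minimal sketch, assuming a hypothetical pandas table with made-up column names (the real analyzer's data layout is not shown in this diff):

```
import pandas as pd

# Hypothetical link-event table; column names are illustrative only.
links = pd.DataFrame({
    "link": ["sw1/p1-sw2/p3", "sw1/p2-sw3/p1", "sw4/p5-sw5/p2"],
    "down_reason": ["thermal_shutdown", "admin_down", "unknown"],
    "peer_rebooted": [False, True, False],
})

# Keep links that went through a thermal shutdown, or where one side went
# down while the other side was not rebooted.
suspected = links[
    (links["down_reason"] == "thermal_shutdown")
    | ((links["down_reason"] != "admin_down") & ~links["peer_rebooted"])
]
print(suspected)
```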


4 changes: 3 additions & 1 deletion plugins/ufm_log_analyzer_plugin/src/loganalyze/.pylintrc
@@ -5,5 +5,7 @@ disable=missing-function-docstring,
too-few-public-methods,
logging-fstring-interpolation,


[DESIGN]
max-locals=20
max-locals=20
max-args=8
73 changes: 49 additions & 24 deletions plugins/ufm_log_analyzer_plugin/src/loganalyze/log_analyzer.py
@@ -44,6 +44,8 @@
from loganalyze.log_analyzers.console_log_analyzer import ConsoleLogAnalyzer
from loganalyze.log_analyzers.rest_api_log_analyzer import RestApiAnalyzer
from loganalyze.log_analyzers.link_flapping_analyzer import LinkFlappingAnalyzer
from loganalyze.log_analyzers.ibdiagnet2_port_counters_analyzer \
import Ibdiagnet2PortCountersAnalyzer

from loganalyze.pdf_creator import PDFCreator
from loganalyze.utils.common import delete_files_by_types
@@ -252,7 +254,8 @@ def create_analyzer(parsed_args, full_extracted_logs_list,
in the full report.
Returns the created analyzer
"""
if log_name in full_extracted_logs_list:
# Check the base name, since some logs in the list come with a directory prefix
if any(os.path.basename(log) == log_name for log in full_extracted_logs_list):
log_csvs = get_files_in_dest_by_type(parsed_args.destination,
log_name,
parsed_args.extract_level)
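
To illustrate why the base-name comparison is needed (a minimal sketch; the file names below are made up), a direct membership test misses entries that carry a directory prefix:

```
import os

full_extracted_logs_list = ["ufm_logs/event.log", "rest_api.log"]
log_name = "event.log"

print(log_name in full_extracted_logs_list)  # False - the prefixed entry is missed
print(any(os.path.basename(log) == log_name
          for log in full_extracted_logs_list))  # True - base names match
```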
Expand Down Expand Up @@ -305,7 +308,7 @@ def create_analyzer(parsed_args, full_extracted_logs_list,
log.LOGGER.debug("Starting analyzing the data")
partial_create_analyzer = partial(create_analyzer,
parsed_args=args,
full_extracted_logs_list=full_logs_list,
full_extracted_logs_list=logs_to_work_with,
ufm_top_analyzer_obj=ufm_top_analyzer)

# Creating the analyzer for each log
@@ -328,6 +331,15 @@ def create_analyzer(parsed_args, full_extracted_logs_list,

rest_api_log_analyzer = partial_create_analyzer(log_name="rest_api.log",
analyzer_clc=RestApiAnalyzer)

ibdianget_2_ports_primary_analyzer = partial_create_analyzer(
log_name="ufm_logs_ibdiagnet2_port_counters.log",
analyzer_clc=Ibdiagnet2PortCountersAnalyzer)

ibdianget_2_ports_secondary_analyzer = partial_create_analyzer(
log_name="secondary_telemetry_ibdiagnet2_port_counters.log",
analyzer_clc=Ibdiagnet2PortCountersAnalyzer)

second_telemetry_samples = get_files_in_dest_by_type(args.destination,
"secondary_",
1000,
@@ -358,36 +370,49 @@
)

used_ufm_version = console_log_analyzer.ufm_versions
text_to_show_in_pdf = f"Used ufm version in console log {used_ufm_version}"
fabric_info = "fabric info:" + os.linesep + str(ibdiagnet_analyzer.get_fabric_size()) \
if ibdiagnet_analyzer else "No Fabric Info found" # pylint: disable=invalid-name
text_to_show_in_pdf = f"Used ufm version in console log {used_ufm_version}{os.linesep}"

pdf = PDFCreator(pdf_path, pdf_header, png_images, text_to_show_in_pdf)
dataframes_for_pdf = []
fabric_info = ibdiagnet_analyzer.get_fabric_size() \
if ibdiagnet_analyzer else "No Fabric Info found"
dataframes_for_pdf.append(("Fabric info", fabric_info))
if links_flapping_analyzer:
link_flapping = links_flapping_analyzer.get_link_flapping_last_week() \
if links_flapping_analyzer else "No link flapping info"
text_to_show_in_pdf += os.linesep + str(fabric_info) + os.linesep + \
"Link Flapping:" + os.linesep + str(link_flapping)

critical_events_burst = event_log_analyzer.get_critical_event_bursts()
critical_events_text = "The minute event_type event count" # pylint: disable=invalid-name
for critical_event in critical_events_burst:
timestamp = critical_event['timestamp']
event_type = critical_event['event_type']
event = critical_event['event']
counter = critical_event['count']
event_text = f"{timestamp} {event_type} {event} {counter}"
critical_events_text = critical_events_text + os.linesep + event_text

text_to_show_in_pdf += os.linesep + os.linesep + "More than 5 events burst over a minute:" \
+ os.linesep + critical_events_text
dataframes_for_pdf.append(("Link Flapping past week",
links_flapping_analyzer.get_link_flapping_last_week()))
lists_to_add = []
critical_events_headers = ["timestamp", "event_type", "event", "count"]
lists_to_add.append((event_log_analyzer.get_critical_event_bursts(),
"More than 5 events burst over a minute",
critical_events_headers))

for cur_telemetry in \
[ibdianget_2_ports_primary_analyzer, ibdianget_2_ports_secondary_analyzer]:
dataframes_for_pdf.append((f"{cur_telemetry.telemetry_type} Telemetry iteration time",
cur_telemetry.get_last_iterations_time_stats()))
dataframes_for_pdf.append((f"{cur_telemetry.telemetry_type} "
"Telemetry iteration first and last timestamps",
cur_telemetry.get_first_last_iteration_timestamp()))
dataframes_for_pdf.append((f"{cur_telemetry.telemetry_type} Telemetry fabric size",
cur_telemetry.get_number_of_switches_and_ports()))
lists_to_add.append(([cur_telemetry.get_number_of_core_dumps()],
f"{cur_telemetry.telemetry_type} "
"number of core dumps found in the logs",
["Amount"]))


# The PDF creator gets all the images to add to the report
pdf = PDFCreator(pdf_path, pdf_header, png_images, text_to_show_in_pdf)
pdf.created_pdf()
pdf.create_pdf(dataframes_for_pdf, lists_to_add)
# Generate a report that is saved in the destination directory
log.LOGGER.info("Analysis is done, please see the following outputs:")
for image, title in images_and_title_to_present:
log.LOGGER.info(f"{title}: {image}")
log.LOGGER.info(f"Summary PDF was created! you can open here at {pdf_path}")

if args.interactive:
import IPython
IPython.embed()

# Clean up unneeded files created during the run
files_types_to_delete = set()
files_types_to_delete.add("png") #png images created for PDF report
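
For reference, the report data assembled above follows two shapes: `dataframes_for_pdf` is a list of `(title, DataFrame)` tuples and `lists_to_add` is a list of `(rows, title, headers)` tuples, both handed to `pdf.create_pdf`. A minimal sketch with made-up values (the real titles, columns, and the PDFCreator internals live elsewhere in the plugin):

```
import pandas as pd

# Table sections: (title, DataFrame) tuples.
fabric_info = pd.DataFrame([{"# of Switches": 12, "CAs": 200, "Routers": 0, "Ports": 480}])
dataframes_for_pdf = [("Fabric info", fabric_info)]

# List sections: (rows, title, headers) tuples, where each row is a dict keyed by the headers.
critical_events = [
    {"timestamp": "2024-11-18 10:00", "event_type": "hardware", "event": "link_down", "count": 7},
]
lists_to_add = [
    (critical_events, "More than 5 events burst over a minute",
     ["timestamp", "event_type", "event", "count"]),
]

# In the plugin these are handed to the PDF creator:
# pdf = PDFCreator(pdf_path, pdf_header, png_images, text_to_show_in_pdf)
# pdf.create_pdf(dataframes_for_pdf, lists_to_add)
```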
@@ -12,6 +12,7 @@
# pylint: disable=missing-function-docstring
# pylint: disable=missing-module-docstring

import logging
import os
import csv
import shutil
@@ -21,15 +22,18 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import matplotlib.dates as mdates

from loganalyze.log_analyzers.constants import DataConstants
import loganalyze.logger as log
# This makes sure the user does not see warnings from plotting
logging.getLogger('matplotlib').setLevel(logging.ERROR)
matplotlib.use('Agg')  # Allows running the tool on headless servers without a graphics card

pd.set_option("display.max_colwidth", None)
warnings.filterwarnings("ignore")


class BaseImageCreator:
# Setting the graph time interval to 1 hour
# This is out side of the constructor since
@@ -47,7 +51,7 @@ def __init__(self, dest_image_path):
self._funcs_for_analysis = set()

def _save_data_based_on_timestamp(
self, data_to_plot, x_label, y_label, title
self, data_to_plot, x_label, y_label, title, large_sample=False
):
with plt.ion():
log.LOGGER.debug(f"saving {title}")
@@ -61,7 +65,10 @@ def _save_data_based_on_timestamp(
# Set the locator to show ticks every hour and the formatter to
# include both date and time
ax = plt.gca()
ax.xaxis.set_major_locator(mdates.HourLocator())
if large_sample:
ax.xaxis.set_major_locator(mdates.HourLocator(interval=24))
else:
ax.xaxis.set_major_locator(mdates.HourLocator())
ax.xaxis.set_minor_locator(
mdates.MinuteLocator(interval=15)
) # Add minor ticks every 15 minutes
@@ -94,7 +101,7 @@ def _save_data_based_on_timestamp(
self._images_created.extend(images_list_with_title)
plt.close()

def _save_pivot_data_in_bars( # pylint: disable=too-many-arguments
def _save_pivot_data_in_bars(
self, pivoted_data, x_label, y_label, title, legend_title
):
if pivoted_data.empty:
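
The changes above (forcing the `Agg` backend, silencing matplotlib's logger, and widening the major tick locator for large samples) can be exercised in isolation. A minimal, self-contained sketch, not the plugin's actual plotting code:

```
import logging

import matplotlib
matplotlib.use("Agg")  # headless backend: select before any drawing happens
logging.getLogger("matplotlib").setLevel(logging.ERROR)  # hide matplotlib's own warnings

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import pandas as pd

# A week of hourly samples: the default HourLocator() would produce ~168 major ticks.
idx = pd.date_range("2024-11-01", periods=24 * 7, freq="h")
series = pd.Series(range(len(idx)), index=idx)

fig, ax = plt.subplots()
ax.plot(series.index, series.values)
large_sample = True
if large_sample:
    ax.xaxis.set_major_locator(mdates.HourLocator(interval=24))  # one major tick per day
else:
    ax.xaxis.set_major_locator(mdates.HourLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y-%m-%d %H:%M"))
fig.autofmt_xdate()
fig.savefig("iteration_time_example.png")
plt.close(fig)
```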
@@ -0,0 +1,179 @@
#
# Copyright © 2013-2024 NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
#
# This software product is a proprietary product of Nvidia Corporation and its affiliates
# (the "Company") and all right, title, and interest in and to the software
# product, including all associated intellectual property rights, are and
# shall remain exclusively with the Company.
#
# This software product is governed by the End User License Agreement
# provided with the software product.
#

from typing import List
import warnings
import pandas as pd
from loganalyze.log_analyzers.base_analyzer import BaseAnalyzer


class Ibdiagnet2PortCountersAnalyzer(BaseAnalyzer):
def __init__(self,
logs_csvs: List[str],
hours: int,
dest_image_path: str,
sort_timestamp=False):
super().__init__(logs_csvs, hours, dest_image_path, sort_timestamp)
self._iteration_time_data = None
self._iteration_time_stats = None
self.text_to_show_in_pdf = ""
# This will make sure all the extra columns are int
extra_columns = ['extra1', 'extra2', 'extra3', 'extra4', 'extra5']
for col in extra_columns:
self._log_data_sorted[col] = pd.to_numeric(
self._log_data_sorted[col],
errors='coerce'
).astype('Int64')
self._funcs_for_analysis = {self.plot_iteration_time_over_time}
# Based on the log path, decide if this is primary or secondary telemetry
if "ufm_logs" in logs_csvs[0]:
self.telemetry_type = "primary"
elif "secondary_telemetry" in logs_csvs[0]:
self.telemetry_type = "secondary"
else:
self.telemetry_type = "Unknown_telemetry_type"

self._first_timestamp_of_logs = None
self._last_timestamp_of_logs = None

def get_collectx_versions(self):
unique_collectx_versions = self._log_data_sorted[\
self._log_data_sorted['type'] == 'collectx_version']['data'].unique()
return unique_collectx_versions

def get_number_of_switches_and_ports(self):
"""
Generate summary statistics for 'total_devices_ports' data.
This function calculates the average, maximum, minimum
for switches, CAs, routers, and ports.
"""
filtered_data = self._log_data_sorted[\
self._log_data_sorted['type'] == 'total_devices_ports']

ports_numbers_columns = ['extra1', 'extra3', 'extra5']
filtered_data['extra135'] = pd.to_numeric(
filtered_data[ports_numbers_columns].stack(), errors='coerce'
).groupby(level=0).sum(min_count=1)

columns_of_interest = ['data', 'extra2', 'extra4', 'extra135']
column_mapping = {
'data': '# of Switches',
'extra2': 'CAs',
'extra4': 'Routers',
'extra135': 'Ports'
}

summary_stats = []

for col in columns_of_interest:
numeric_col = pd.to_numeric(filtered_data[col], errors='coerce')
non_zero_col = numeric_col[numeric_col != 0]

avg = round(non_zero_col.mean()) if not non_zero_col.empty else 0
max_val = int(non_zero_col.max()) if not non_zero_col.empty else 0
min_val = int(non_zero_col.min()) if not non_zero_col.empty else 0
count = int(non_zero_col.count())

summary_stats.append({
'Category': column_mapping.get(col, col),
'Average': avg,
'Maximum': max_val,
'Minimum': min_val,
'Total Rows (Non-Zero)': count
})

summary_df = pd.DataFrame(summary_stats)

return summary_df

def analyze_iteration_time(self, threshold=0.15):
"""
Analyze rows where 'type' is 'iteration_time'.
Keep only 'type', 'timestamp', and 'data' columns.
Calculate statistics for the 'data' column, including timestamps for max and min.
Values below the threshold are filtered out before the statistics are computed.
Parameters:
- threshold (float): Minimum value to consider for analysis. Default is 0.15 seconds.
"""
filtered_data = self._log_data_sorted[self._log_data_sorted['type'] == 'iteration_time']
filtered_data = filtered_data[['type', 'timestamp', 'data']]
filtered_data['data'] = pd.to_numeric(filtered_data['data'], errors='coerce')

filtered_data = filtered_data[filtered_data['data'] >= threshold]
filtered_data['timestamp'] = pd.to_datetime(filtered_data['timestamp'], errors='coerce')
filtered_data = filtered_data.dropna(subset=['timestamp'])

filtered_data = filtered_data.sort_values(by='timestamp').reset_index(drop=True)

if not filtered_data['data'].empty:
average = filtered_data['data'].mean()
max_value = filtered_data['data'].max()
min_value = filtered_data['data'].min()

max_timestamp = filtered_data.loc[filtered_data['data'] \
== max_value, 'timestamp'].iloc[0]
min_timestamp = filtered_data.loc[filtered_data['data'] \
== min_value, 'timestamp'].iloc[0]
first_timestamp = filtered_data['timestamp'].iloc[0]
last_timestamp = filtered_data['timestamp'].iloc[-1]

else:
average = max_value = min_value = 0.0
max_timestamp = min_timestamp = None
first_timestamp = last_timestamp = None

stats = {
'Average': average,
'Maximum': max_value,
'Max Timestamp': max_timestamp,
'Minimum': min_value,
'Min Timestamp': min_timestamp,
'Total Rows': filtered_data['data'].count()
}
stats_df = pd.DataFrame([stats])
self._iteration_time_data = filtered_data
self._iteration_time_stats = stats_df
self._first_timestamp_of_logs = first_timestamp
self._last_timestamp_of_logs = last_timestamp
return stats_df

def get_first_last_iteration_timestamp(self):
if not self._first_timestamp_of_logs or not self._last_timestamp_of_logs:
self.analyze_iteration_time()
times = {
'first': str(self._first_timestamp_of_logs),
'last': str(self._last_timestamp_of_logs)
}
return pd.DataFrame([times])

def get_last_iterations_time_stats(self):
return self._iteration_time_stats

def plot_iteration_time_over_time(self):
if self._iteration_time_data is None:
self.analyze_iteration_time()

self._iteration_time_data.set_index('timestamp', inplace=True)

with warnings.catch_warnings():
warnings.filterwarnings("ignore", ".*Locator attempting to generate.*")
self._save_data_based_on_timestamp(
data_to_plot=self._iteration_time_data['data'],
x_label='Timestamp',
y_label='Iteration Time (s)',
title=f'{self.telemetry_type} Iteration Time',
large_sample=True)

def get_number_of_core_dumps(self):
core_dumps = self._log_data_sorted[self._log_data_sorted['type'] == 'timeout_dump_core']
return {"Amount":len(core_dumps)}