issue:4050338 Add link flapping to log analyzer (#245)
* Extracting the files of second telemetry sampling

* Refactor the way we plot

* Printing link flapping

* Removing old files + pylint

* Adding link flapping to the PDF

* pylint + ruff

* code quality fixes

* Trying to fix pipeline

* Fix num 2

* dummy

* Fix num 3

* Dummy2

* try #4

* pylint

* fix #5

* Fixing a small bug in rest api logs + removing echo from ci

* CI to run on latest image

* PR Comments

* ruff + lint

* Fix for pylint

* PR comment

* PR comments

* Revert changes to pipeline

* More pr comments

* Fix bad merge from main

* Update magic numbers to be constants

* Fix bad merge
boazhaim authored Sep 9, 2024
1 parent 59d13a0 commit e14c8ef
Showing 21 changed files with 360 additions and 2,243 deletions.
28 changes: 20 additions & 8 deletions .github/workflows/ufm_log_analyzer_ci_workflow.yml
@@ -2,23 +2,35 @@ name: Ufm log analyzer CI Workflow
 
 on:
   push:
-    paths:
+    paths:
       - 'plugins/ufm_log_analyzer_plugin/**'
       - '.github/workflows/ufm_log_analyzer_ci_workflow.yml'
 
 jobs:
   pylint:
     runs-on: ubuntu-latest
 
     steps:
       - name: Checkout code
         uses: actions/checkout@main
 
       - name: Set up Python
         uses: actions/setup-python@main
         with:
-          python-version: 3.9 # Specify the Python version you want to use
-      - name: Install dependencies
+          python-version: 3.9
+
+      - name: Install dependencies and run PyLint
         run: |
-          pip install -r plugins/ufm_log_analyzer_plugin/src/loganalyze/requirements.txt
+          SCRIPT_DIR="plugins/ufm_log_analyzer_plugin"
+          # Set PYTHONPATH to include src directory and two levels up for utils
+          PYTHONPATH="$(realpath $SCRIPT_DIR/src):$(realpath $SCRIPT_DIR/../../)"
+          export PYTHONPATH
+          cd $SCRIPT_DIR
+          # Install dependencies
+          pip install -r src/loganalyze/requirements.txt
           pip install pylint
-      - name: Run PyLint
-        run: pylint --rcfile=plugins/ufm_log_analyzer_plugin/src/loganalyze/.pylintrc plugins/ufm_log_analyzer_plugin/src/loganalyze
+          # Run PyLint
+          pylint --rcfile=src/loganalyze/.pylintrc src/loganalyze
16 changes: 14 additions & 2 deletions plugins/ufm_log_analyzer_plugin/README.md
@@ -67,8 +67,8 @@ options:
 What is mandatory:
 1. `--location`.
 
-## Which logs are taken from the dump
-The following list: `event.log, ufmhealth.log, ufm.log, ibdiagnet2.log, console.log`
+## Which files are taken from the dump
+The following list: `event.log, ufmhealth.log, ufm.log, ibdiagnet2.log, console.log, rest_api.log` and the second telemetry samples
 
 Also, each log `tar` is taken, according to the `extract-level` flag.
 ## How it works
@@ -80,4 +80,16 @@ Also, each log `tar` is taken, according to the `extract-level` flag.
 6. A PDF file is created with a summary of the images and the fabric size.
 7. An interactive Python session is started, where the user can run pre-defined analysis functions on the parsed data, or run custom queries/manipulations to find the needed data.
 
+## Link flapping
+This logic uses the second telemetry counters to identify links that are flapping due to real issues.
+The input is the telemetry samples from the last week and from the last 5 minutes.
+The output is a list of links to check.
+The logic flags links where:
+1. Both sides of the link went down together.
+2. A thermal shutdown occurred.
+3. One side went down while the other side was not rebooted.
+
+
 ![Tool flow](img/loganalzer.png)
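The link-flapping rules described in the README section above can be pictured with a short sketch. This is only an illustration, not the code of `LinkFlappingAnalyzer`: the use of pandas and the column names (`node_guid`, `port`, `link_downed`, `thermal_shutdown`, `peer_reboots` and their peer-side counterparts) are assumptions about what the second telemetry samples expose.

```python
import pandas as pd


def find_flapping_links(week_sample: pd.DataFrame, recent_sample: pd.DataFrame) -> pd.DataFrame:
    """Illustrative sketch: flag links whose counters advanced between the
    weekly telemetry sample and the last 5-minute sample."""
    merged = week_sample.merge(recent_sample, on=["node_guid", "port"],
                               suffixes=("_week", "_now"))
    # 1. Both sides of the link went down together: the link_downed counter
    #    advanced on this port and on its peer.
    went_down = merged["link_downed_now"] > merged["link_downed_week"]
    peer_went_down = merged["peer_link_downed_now"] > merged["peer_link_downed_week"]
    both_sides_down = went_down & peer_went_down
    # 2. A thermal shutdown was recorded on the port.
    thermal = merged["thermal_shutdown_now"] > merged["thermal_shutdown_week"]
    # 3. One side went down while the other side was not rebooted.
    one_side_no_reboot = went_down & ~peer_went_down & (
        merged["peer_reboots_now"] == merged["peer_reboots_week"]
    )
    return merged[both_sides_down | thermal | one_side_no_reboot]
```

The actual analyzer works on the extracted `secondary_` telemetry `.gz` samples and exposes its result through `get_link_flapping_last_week()`, as seen in the `log_analyzer.py` diff below.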
2 changes: 1 addition & 1 deletion plugins/ufm_log_analyzer_plugin/log_analyzer.sh
@@ -16,6 +16,6 @@
 
 SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
 src_dir=$( realpath "${SCRIPT_DIR}/src" )
-export PYTHONPATH="${src_dir}"
+export PYTHONPATH="${src_dir}:../../" # The ../../ is to be able to use the utils
 
 python3 "${src_dir}/loganalyze/log_analyzer.py" "$@"
52 changes: 37 additions & 15 deletions plugins/ufm_log_analyzer_plugin/src/loganalyze/log_analyzer.py
@@ -29,7 +29,7 @@
 from typing import Callable, List, Set, Tuple
 
 
-from loganalyze.log_analyzers.base_analyzer import BaseAnalyzer
+from loganalyze.log_analyzers.base_analyzer import BaseImageCreator
 from loganalyze.logs_extraction.directory_extractor import DirectoryExtractor
 from loganalyze.log_analyzers.ufm_top_analyzer import UFMTopAnalyzer
 from loganalyze.logs_extraction.tar_extractor import DumpFilesExtractor
@@ -43,6 +43,7 @@
 from loganalyze.log_analyzers.events_log_analyzer import EventsLogAnalyzer
 from loganalyze.log_analyzers.console_log_analyzer import ConsoleLogAnalyzer
 from loganalyze.log_analyzers.rest_api_log_analyzer import RestApiAnalyzer
+from loganalyze.log_analyzers.link_flapping_analyzer import LinkFlappingAnalyzer
 
 from loganalyze.pdf_creator import PDFCreator
 from loganalyze.utils.common import delete_files_by_types
@@ -58,6 +59,10 @@
     "rest_api.log"
 ]
 
+DIRECTORIES_TO_EXTRACT = [
+    "telemetry_samples"
+]
+
 def run_both_functions(parser_func, action_func, save_func):
     parser_func(action_func)
     save_func()
@@ -136,17 +141,20 @@ def sorting_logs(log_path):
     return count
 
 
-def get_csvs_in_dest(location: str, base_name: str, extraction_level: int):
+def get_files_in_dest_by_type(location: str,
+                              base_name: str,
+                              extraction_level: int,
+                              file_type="csv"):
     """
-    Return a list of all the CSV files that were parsed and part of the current
+    Return a list of all the files by type that were parsed and part of the current
     extraction level requested
     """
-    csv_files = glob.glob(os.path.join(location, "*.csv"))
-    matched_files = [file for file in csv_files if base_name in os.path.basename(file)]
+    files_by_type = glob.glob(os.path.join(location, f"*.{file_type}"))
+    matched_files = [file for file in files_by_type if base_name in os.path.basename(file)]
     full_paths = [os.path.abspath(file) for file in matched_files]
-    sorted_csvs = sorted(full_paths, key=sorting_logs)
-    sliced_csvs = sorted_csvs[: (extraction_level + 1)]
-    return sliced_csvs
+    sorted_files = sorted(full_paths, key=sorting_logs)
+    sliced_files = sorted_files[: (extraction_level + 1)]
+    return sliced_files
 
 
 def parse_args():
@@ -238,7 +246,9 @@ def create_analyzer(parsed_args, full_extracted_logs_list,
     Returns the created analyzer
     """
     if log_name in full_extracted_logs_list:
-        log_csvs = get_csvs_in_dest(parsed_args.destination, log_name, parsed_args.extract_level)
+        log_csvs = get_files_in_dest_by_type(parsed_args.destination,
+                                             log_name,
+                                             parsed_args.extract_level)
         analyzer = analyzer_clc(log_csvs, parsed_args.hours, parsed_args.destination)
         ufm_top_analyzer_obj.add_analyzer(analyzer)
         return analyzer
@@ -266,7 +276,7 @@ def create_analyzer(parsed_args, full_extracted_logs_list,
 extractor = DumpFilesExtractor(args.location)
 
 logs_to_work_with, failed_extract = extractor.extract_files(
-    full_logs_list, args.destination
+    full_logs_list, DIRECTORIES_TO_EXTRACT, args.destination
 )
 
 if len(failed_extract) > 0:
@@ -281,7 +291,7 @@ def create_analyzer(parsed_args, full_extracted_logs_list,
 
 
 # Setting the time granularity for the graphs
-BaseAnalyzer.time_interval = args.interval
+BaseImageCreator.time_interval = args.interval
 
 # Analyze the CSV and be able to query the data
 start = time.perf_counter()
@@ -311,6 +321,13 @@ def create_analyzer(parsed_args, full_extracted_logs_list,
 
 rest_api_log_analyzer = partial_create_analyzer(log_name="rest_api.log",
                                                 analyzer_clc=RestApiAnalyzer)
+second_telemetry_samples = get_files_in_dest_by_type(args.destination,
+                                                     "secondary_",
+                                                     1000,
+                                                     "gz")
+links_flapping_analyzer = LinkFlappingAnalyzer(second_telemetry_samples,
+                                               args.destination)
+ufm_top_analyzer.add_analyzer(links_flapping_analyzer)
 end = time.perf_counter()
 log.LOGGER.debug(f"Took {end-start:.3f} to load the parsed data")
 
@@ -328,10 +345,14 @@ def create_analyzer(parsed_args, full_extracted_logs_list,
 pdf_header = (
     f"Dump analysis for {os.path.basename(args.location)}, hours={args.hours}"
 )
-fabric_info = ibdiagnet_analyzer.get_fabric_size() \
-    if ibdiagnet_analyzer else "No Fabric Info found"
+FABRIC_INFO = str(ibdiagnet_analyzer.get_fabric_size() \
+    if ibdiagnet_analyzer else "No Fabric Info found")
+
+LINK_FLAPPING = str(links_flapping_analyzer.get_link_flapping_last_week() \
+    if links_flapping_analyzer else "No link flapping info")
 # PDF creator gets all the images to add to the report
-pdf = PDFCreator(pdf_path, pdf_header, png_images, fabric_info)
+TEXT = FABRIC_INFO + os.linesep + "Link Flapping:" + os.linesep + LINK_FLAPPING
+pdf = PDFCreator(pdf_path, pdf_header, png_images, TEXT)
 pdf.created_pdf()
 # Generated a report that can be located in the destination
 log.LOGGER.info("Analysis is done, please see the following outputs:")
@@ -342,7 +363,8 @@ def create_analyzer(parsed_args, full_extracted_logs_list,
 files_types_to_delete = set()
 files_types_to_delete.add("png") #png images created for PDF report
 files_types_to_delete.add("log") #logs taken from the logs
-files_types_to_delete.add("gz") # Zipped logs taken from the logs
+files_types_to_delete.add("csv") #tmp csv + telemetry samples
+files_types_to_delete.add("gz") #gz files of logs and samples
 delete_files_by_types(args.destination, files_types_to_delete)
 if args.interactive:
     import IPython
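As the README notes, the run ends in an interactive session (the `import IPython` above). A minimal sketch of what a user might type there, assuming the analyzer objects created in `log_analyzer.py` are available and that the corresponding logs and samples were found in the dump:

```python
# Inside the IPython session opened by log_analyzer.py; both objects are
# created above, so these calls assume their logs/samples were present.
print(ibdiagnet_analyzer.get_fabric_size())                    # fabric size summary
print(links_flapping_analyzer.get_link_flapping_last_week())   # suspected flapping links
```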
