diff --git a/.ci/Jenkinsfile_nbuprod b/.ci/Jenkinsfile_nbuprod new file mode 100644 index 000000000..a96eb5856 --- /dev/null +++ b/.ci/Jenkinsfile_nbuprod @@ -0,0 +1,8 @@ +#!/usr/bin/groovy + +// load pipeline functions +// Requires pipeline-github-lib plugin to load library from github +@Library('github.com/Mellanox/ci-demo@master') +def matrix = new com.mellanox.cicd.Matrix() + +matrix.main() \ No newline at end of file diff --git a/.ci/Makefile b/.ci/Makefile new file mode 100644 index 000000000..d653e91fa --- /dev/null +++ b/.ci/Makefile @@ -0,0 +1,8 @@ + +jjb-blsm-release: + # IMPORTANT: configure the /etc/jenkins_jobs/jenkins_jobs.ini file to point to the relevant blossom + python3 -m jenkins_jobs --conf /etc/jenkins_jobs/jenkins_jobs.ini update $(shell pwd)/pipeline/proj_jjb_release_nbuprod.yaml + +jjb-blsm-release-test: + # IMPORTANT: configure the /etc/jenkins_jobs/jenkins_jobs.ini file to point to the relevant blossom + python3 -m jenkins_jobs --conf /etc/jenkins_jobs/jenkins_jobs.ini test $(shell pwd)/pipeline/proj_jjb_release_nbuprod.yaml diff --git a/.ci/dockerhub_uploader.py b/.ci/dockerhub_uploader.py index 761a6a59f..94dc1bc5e 100644 --- a/.ci/dockerhub_uploader.py +++ b/.ci/dockerhub_uploader.py @@ -60,6 +60,7 @@ def __init__(self, username:str, password:str) -> None: self.image = "" self.tag = "" self.image_path = "" + self.meta_data = None def login(self) -> None: """Login to docker-hub used for validation of image tags""" @@ -80,24 +81,22 @@ def login(self) -> None: def parse_image_tags(self): """Helper function for parsing image name and tag from manifest""" # If image name and tag were passed to the object don't parse it from the metadata - if self.image and self.tag: + if self.image: if '/' in self.image: self.repository = '/'.join(self.image.split('/')[:-1]) self.image = self.image.split('/')[-1] - return - meta_data = None + if not tarfile.is_tarfile(self.image_path): log_error(f'{self.image_path} is not a valid tar file') with 
tarfile.open(self.image_path, 'r') as t: # get the image manifest to determine image name and tag - meta_data = json.load(t.extractfile(r'manifest.json')) - if not meta_data: + self.meta_data = json.load(t.extractfile(r'manifest.json')) + if not self.meta_data: log_error(f"Could not extract manifest.json from {self.image_path}") try: # take the first tag of the first image, there should be only one anyway - image_name = meta_data[0]['RepoTags'][0] - # validate repo in image metadata - if '/' in image_name: + image_name = self.meta_data[0]['RepoTags'][0] + if '/' in image_name and not self.image: self.repository = '/'.join(image_name.split('/')[:-1]) image_name = image_name.split('/')[-1] if not self.image: @@ -143,61 +142,91 @@ def check_image_tag_docker_hub(self, tag=None) -> bool: break return False - def _check_image_locally(self, tag=None) -> bool: + def _check_image_locally(self,repo=None, image=None, tag=None) -> bool: """helper method to check if image already exists in local docker""" if not tag: tag = self.tag - log_debug(f'starting DockerHubUploader._check_image_locally with image:"{self.image}",tag:"{tag}"') + if not image: + image = self.image + if not repo: + repo = self.repository + log_debug(f'starting DockerHubUploader._check_image_locally with image:"{repo}/{image}",tag:"{tag}"') for img in self.client.images.list(): for img_name in img.attrs['RepoTags']: - if img_name == f"{self.repository}/{self.image}:{tag}": + if img_name == f"{repo}/{image}:{tag}": return True return False - def _delete_image_locally(self, tag=None) -> None: + def _delete_image_locally(self,repo = None, image=None, tag=None) -> None: """Helper method to remove images from local docker""" - log_debug(f'starting DockerHubUploader._delete_image_locally with image:"{self.image}",tag:"{tag}"') + log_debug(f'starting DockerHubUploader._delete_image_locally with repo="{repo}", image:"{image}",tag:"{tag}"') + if not repo: + repo = self.repository + if not image: + image = self.image 
if not tag: tag = self.tag try: - self.client.images.remove(image=f'{self.repository}/{self.image}:{tag}', force=True) + self.client.images.remove(image=f'{repo}/{image}:{tag}', force=True) except Exception as e: - log_error(f"could not remove {self.repository}/{self.image}:{tag}\n {e}") + log_error(f"could not remove {repo}/{image}:{tag}\n {e}") def load_docker_img(self): """Load a docker file to docker engine""" log_debug(f'starting DockerHubUploader.load_docker_img with image:"{self.image}",tag:"{self.tag}", path:"{self.image_path}"') # Check if the image:tag already exists if so try to delete it. - if self._check_image_locally(): + if self._check_image_locally(repo=self.repository, image=self.image, tag=self.tag): log_warning(f'{self.image}:{self.tag} was found locally removing it from docker engine.') try: - self._delete_image_locally() + self._delete_image_locally(repo=self.repository, image=self.image, tag=self.tag) except Exception as e: log_error(f'Could not delete {self.image}:{self.tag} from local docker.\n {e}') + # check if meta_data name and tag are different from given image and tag + meta_repo = '/'.join(self.meta_data[0]['RepoTags'][0].split('/')[:-1]) + meta_image = self.meta_data[0]['RepoTags'][0].split('/')[-1].split(':')[0] + meta_tag = self.meta_data[0]['RepoTags'][0].split(':')[-1] + # check if need to re-tag the image + new_tag_required = f'{meta_repo}/{meta_image}:{meta_tag}' != f'{self.repository}/{self.image}:{self.tag}' + if new_tag_required and self._check_image_locally(repo=meta_repo, image=meta_image, tag=meta_tag): + log_warning(f'new_tag_required={new_tag_required} and {meta_repo}/{meta_image}:{meta_tag} was found locally removing it from docker engine.') + try: + self._delete_image_locally(repo=meta_repo, image=meta_image, tag=meta_tag) + except Exception as e: + log_error(f'Could not delete {meta_repo}/{meta_image}:{meta_tag} from local docker.\n {e}') if not os.path.exists(self.image_path): log_error(f'{self.image_path} doesn\'t 
exists.') with open(self.image_path, 'rb') as img: try: _ = self.client.images.load(img) log_info(f'Loaded {self.repository}/{self.image}:{self.tag} to local docker engine.') + # re-tag the image if needed + if new_tag_required: + self.tag_image(new_tag=self.tag, old_image=meta_image, old_repo=meta_repo, old_tag=meta_tag) + self.cleanup(repo=meta_repo, image=meta_image,tag=meta_tag) except Exception as e: log_error(f'Could not load the {self.repository}/{self.image}:{self.tag} to docker.\n {e}') # tag as latest if latest flag exists - def tag_image(self, new_tag): + def tag_image(self, new_tag, old_repo=None, old_image=None, old_tag=None): """Load a docker file to docker engine""" - log_debug(f'starting DockerHubUploader.tag_image with image:"{self.image}",current_tag:"{self.tag}", new_tag:"{new_tag}"') + if not old_repo: + old_repo = self.repository + if not old_image: + old_image = self.image + if not old_tag: + old_tag = self.tag + log_debug(f'starting DockerHubUploader.tag_image with image:"{old_repo}/{old_image}",current_tag:"{old_tag}", new_tag:"{new_tag}"') # check if latest exists locally - if self._check_image_locally(new_tag): + if self._check_image_locally(repo=self.repository, image=self.image, tag=new_tag): log_warning(f'{self.repository}/{self.image}:{new_tag} was found locally removing it from docker engine.') try: - self._delete_image_locally(self.image, new_tag) + self._delete_image_locally(repo=self.repository, image=self.image, tag=new_tag) except Exception as e: log_error(f'Could not delete {self.repository}/{self.image}:{new_tag} from local docker.\n {e}') try: - img = self.client.images.get(f'{self.repository}/{self.image}:{self.tag}') + img = self.client.images.get(f'{old_repo}/{old_image}:{old_tag}') img.tag(f'{self.repository}/{self.image}', new_tag) - log_info(f'Tagged {self.repository}/{self.image}:{self.tag} as {self.repository}/{self.image}:{new_tag}.') + log_info(f'Tagged {old_repo}/{old_image}:{self.tag} as 
{self.repository}/{self.image}:{new_tag}.') except Exception as e: log_error(f'Could not tag {self.repository}/{self.image}:{self.tag} as {self.repository}/{self.image}:{new_tag}\n {e}') @@ -208,7 +237,7 @@ def push_image(self,tag=None) -> None: log_debug(f'starting DockerHubUploader.push_image with image:"{self.image}"tag:"{tag}"') try: result = self.client.images.push(repository=f'{self.repository}/{self.image}', tag=tag, - auth_config={'username': self.username, 'password': self.password},stream=True) + auth_config={'username': self.username, 'password': self.password},stream=True) for line in result: if 'error' in json.loads(line).keys(): log_error(f'Could not push {self.repository}/{self.image}:{tag}\nError from Docker:{line.decode("utf-8")}"') @@ -217,10 +246,16 @@ def push_image(self,tag=None) -> None: except Exception as e: log_error(f"Could not push {self.repository}/{self.image}:{tag} to Docker-Hub") - def cleanup(self, tag=None): + def cleanup(self,repo=None, image=None, tag=None): """Cleanup function to remove loaded docker images from host""" - if self._check_image_locally(tag=tag): - self._delete_image_locally(tag=tag) + if not repo: + repo = self.repository + if not image: + image = self.image + if not tag: + tag = self.tag + if self._check_image_locally(repo=repo, image=image, tag=tag): + self._delete_image_locally(repo=repo, image=image, tag=tag) if __name__ == '__main__': parser = argparse.ArgumentParser(prog='dockerhub_uploader') @@ -257,7 +292,7 @@ def cleanup(self, tag=None): dhu.load_docker_img() dhu.push_image() if args.latest: - dhu.tag_image('latest') + dhu.tag_image(new_tag='latest') dhu.push_image('latest') dhu.cleanup('latest') dhu.cleanup() \ No newline at end of file diff --git a/.ci/jenkinsfile_hub_uploader.groovy b/.ci/jenkinsfile_hub_uploader.groovy index 24baff9b0..18a517fec 100644 --- a/.ci/jenkinsfile_hub_uploader.groovy +++ b/.ci/jenkinsfile_hub_uploader.groovy @@ -18,7 +18,7 @@ pipeline{ 
withCredentials([usernamePassword(credentialsId: '0fbf63c0-4a61-4543-811d-a182df47711b', usernameVariable: 'DH_USER', passwordVariable: 'DH_TOKEN' )]){ wrap([$class: 'BuildUser']) { sh '''#!/bin/bash - authorized_users=( "bitkin" "afok" "kobib" "drorl" "tlerner" "omarj" "samerd" "atolikin" "atabachnik" "eylonk" "lennyv" ) + authorized_users=( "bitkin" "afok" "kobib" "drorl" "tlerner" "omarj" "samerd" "atolikin" "atabachnik" "eylonk" "lennyv" "asafb" "sspormas" "mkianovsky") if [[ ! "${authorized_users[*]}" == *"${BUILD_USER_ID}"* ]]; then echo "${BUILD_USER_ID} not authorized to upload images to docker hub" echo "Please contact one of the approved users to upload a container: ${authorized_users[*]}" @@ -46,4 +46,4 @@ pipeline{ } } } -} \ No newline at end of file +} diff --git a/.ci/matrix_job_release_nbuprod.yaml b/.ci/matrix_job_release_nbuprod.yaml new file mode 100644 index 000000000..c9014b6ee --- /dev/null +++ b/.ci/matrix_job_release_nbuprod.yaml @@ -0,0 +1,36 @@ +--- +job: ufm-plugins + +registry_host: harbor.mellanox.com +registry_path: /swx-storage/ci-demo +registry_auth: swx-storage +step_allow_single_selector: true + +credentials: + # harbor login/password + - {credentialsId: '425bb907-c357-4fde-92e0-67854a857b4f', usernameVariable: 'UFM_USER', passwordVariable: 'UFM_PASS'} + +runs_on_agents: + - nodeLabel: 'SWX-CI-DOCKER' + +steps: + - name: Build Plugin + agentSelector: "{nodeLabel: 'SWX-CI-DOCKER'}" + run: | + set -x + if [ -e "/auto/mswg/release/ufm/plugins/${PLUGIN_NAME}/ufm-plugin-${PLUGIN_NAME}_${PLUGIN_VERSION}*" ];then + echo -e "A path and plugin with this version already exist." + echo -e "Path: /auto/mswg/release/ufm/plugins/${PLUGIN_NAME}/ufm-plugin-${PLUGIN_NAME}_${PLUGIN_VERSION}*" + exit 1 + fi + mkdir -p /auto/mswg/release/ufm/plugins/${PLUGIN_NAME} + cd plugins/${PLUGIN_NAME}/build + bash -x ./docker_build.sh ${PLUGIN_VERSION} /auto/mswg/release/ufm/plugins/${PLUGIN_NAME} + BUILD_EXIT_CODE=$? 
+ ls /auto/mswg/release/ufm/plugins/${PLUGIN_NAME} + if [ $BUILD_EXIT_CODE -eq 1 ];then + echo -e "Error: Docker build failed in ./docker_build.sh" + fi + exit ${BUILD_EXIT_CODE} + + parallel: false diff --git a/.ci/pipeline/proj_jjb_release_nbuprod.yaml b/.ci/pipeline/proj_jjb_release_nbuprod.yaml new file mode 100644 index 000000000..73a0b3397 --- /dev/null +++ b/.ci/pipeline/proj_jjb_release_nbuprod.yaml @@ -0,0 +1,79 @@ +########## UFM APPLIANCE GEN3 jobs ################## +- project: + name: UFM_PLUGINS_SDK_RELEASE + jjb_owner: 'Mickey Kianosvky' + jobs: + - "UFM_PLUGINS_SDK_RELEASE": + jjb_git: git@github.com:Mellanox/ufm_sdk_3.0.git + jjb_email: 'mkianovsky@nvidia.com' + disabled_var: false + type: github + branch: main + +- job-template: + name: "UFM_PLUGINS_SDK_RELEASE" + project-type: pipeline + disabled: '{obj:disabled_var}' + properties: + - github: + url: "{jjb_git}" + - build-discarder: + days-to-keep: 15 + num-to-keep: 15 + - inject: + keep-system-variables: true + properties-content: | + REPOSITORY={jjb_git} + description: Do NOT edit this job through the Web GUI ! + concurrent: false + parameters: + - string: + name: "sha1" + default: '{branch}' + description: "What branch to take in ufm_sdk_3.0" + - string: + name: "PLUGIN_VERSION" + default: 'latest' + description: "Plugin docker version to create ex: 1.0/1.1/2.0 etc.

default is latest

" + - string: + name: "EMAIL" + default: "{jjb_email}" + description: "email notifications
" + - choice: + name: "PLUGIN_NAME" + choices: + - UFM_NDT_Plugin + - advanced_hello_world_plugin + - bright_plugin + - fluentd_telemetry_plugin + - grafana_infiniband_telemetry_plugin + - grpc_streamer_plugin + - hello_world_plugin + - pdr_deterministic_plugin + - snmp_receiver_plugin + - sysinfo_plugin + - ufm_consumer_plugin + - ufm_syslog_streaming_plugin + description: "On which project to run?" + - string: + name: "conf_file" + default: ".ci/matrix_job_release_nbuprod.yaml" + description: "Regex to select job config file. Do not change it" + pipeline-scm: + scm: + - git: + url: "{jjb_git}" + credentials-id: '0b0ea4b8-2b37-427b-bc3f-b68c41a341f7' + branches: ['$sha1'] + shallow-clone: true + refspec: "+refs/pull/*:refs/remotes/origin/pr/*" + browser: githubweb + browser-url: "{jjb_git}" + submodule: + recursive: true + tracking: true + parent-credentials: true + script-path: ".ci/Jenkinsfile_nbuprod" + wrappers: + - timeout: + timeout: 30 diff --git a/README.md b/README.md index ebd98090c..f1a08fbe1 100644 --- a/README.md +++ b/README.md @@ -83,3 +83,18 @@ If your plugin directory does not contain a `.ci` directory, the CI process will The CI pipeline gets triggered based on changes made to the plugins. If changes occur in multiple plugins, the pipeline will not trigger the individual `.ci` directories but instead trigger a default empty CI. + +# DRP Instructions: +### UFM_PLUGINS_SDK_RELEASE: +**Release job URL:** https://nbuprod.blsm.nvidia.com/swx-ufm/job/UFM_PLUGINS_SDK_RELEASE/. + +**Release job URL - DRP:** will be synced and updated soon. + +**Build instructions:** +- Go to url. +- Click on login (top right corner). +- Login using corp username(without '@nvidia.com') and password. +- Once logged in you now can build the job. +- Click on "Build With Parameters" +- Update the necessary parameter. 
+- Click on "Build" and the job will be executed \ No newline at end of file diff --git a/plugins/SLURM-Integration/README.md b/plugins/SLURM-Integration/README.md index 1fd3e2342..abc9be857 100644 --- a/plugins/SLURM-Integration/README.md +++ b/plugins/SLURM-Integration/README.md @@ -11,17 +11,16 @@ Prerequisites -------------------------------------------------------- -UFM 6.10 installed on a RH7x machine with sharp_enabled & enable_sharp_allocation true and running in management mode. -python 2.7 on SLURM controller. -UFM-SLURM Integration tar file. -Generate token_auth +UFM version installed and running on one of the nodes connected to the SLURM controller over TCP. +Python 3 installed on the SLURM controller. +Latest version of the UFM-SLURM Integration. Installing -------------------------------------------------------- -### 1) Using SLURM controller, extract UFM-SLURM Integration tar file: +### 1) Using SLURM controller, extract the UFM-SLURM Integration tar file: tar -xf ufm_slurm_integration.tar.gz ### 2) Run the installation script. @@ -38,7 +37,7 @@ If you set auth_type=token_auth in UFM SLURM’s config file, you must generate curl -H "X-Remote-User:admin" -XPOST http://127.0.0.1:8000/app/tokens -Then you must copy the generated token and paste it into the config file beside the token parameter. +Then you must copy the generated token and paste it into the UFM SLURM’s config file beside the token parameter. @@ -70,31 +69,31 @@ Deployment -------------------------------------------------------- -After installation, the configurations should be set and UFM machine should be running. -Several configurable settings need to be set to make the integration run. +After installation, the configurations should be set, and the UFM machine should be running. Several configurable +settings need to be adjusted to make the UFM-SLURM Integration function properly. 
-### 1) On SLURM controller open /etc/slurm/ufm_slurm.conf +### 1) On the SLURM controller open /etc/slurm/ufm_slurm.conf sudo vim /etc/slurm/ufm_slurm.conf ### 2) Set the following keys, then save the file: ufm_server: UFM server IP address to connect to - ufm_server_user: Username of UFM server used to connect to UFM if you set - auth_type=basic_auth. + ufm_server_user: Username of UFM server used to connect to UFM if you set auth_type=basic_auth. ufm_server_pass: UFM server user password. partially_alloc: Whether to allow or not allow partial allocation of nodes pkey: By default it will by default management Pkey 0x7fff - auth_type: One of (token_auth, basic_auth) by default it token_auth + auth_type: One of (token_auth, basic_auth, kerberos_auth) by default it is token_auth token: If you set auth_type to be token_auth you need to set - generated token. Please see Generate token_auth section. - log_file_name: The name of integration logging file + generated token. Please see Generate token_auth section. + principal_name: principal name to be used in kerberos authentication when you set auth_type to be kerberos_auth. + log_file_name: The path of the UFM-SLURM integration logging file ### 3) Run UFM in 'Management' mode. - On UFM machine, open the ufm config file /opt/ufm/files/conf/gv.cfg - In section [Server], set the key: "monitoring_mode" to no and then save the file. monitoring_mode=no - Start UFM - * HA mode: /etc/init.d/ufmha start - * SA mode: /etc/init.d/ufmd start + * HA mode: ufm_ha_cluster start + * SA mode: systemctl start ufm-enterprise @@ -102,18 +101,13 @@ Running -------------------------------------------------------- -After installation and deployment of UFM-SLURM integration, the integration should work for every submitted SLURM job automatically. - - - Using the Slurm controller submit a new SLURM job.
- for example: # sbatch -N2 batch1.sh - - In UFM side, a new SHArP reservation will be created based on job_id, - job nodes and set pkey in ufm_slurm.conf file. - - A new pkey will be created contains all the ports of job nodes to - allow the SHArP nodes to be communicated on top of it. - - After the SLURM job is completed, UFM deletes the created SHArP - reservation and pkey. - - From the time that a job is submitted by SLURM server until - completion, a log file called /tmp/ufm_slurm.log logs all the - actions and errors occurs during the execution. This log file - could be changed by modify log_file_name parameter in UFM_SLURM - config file /etc/slurm/ufm_slurm.conf. +After the installation and deployment of UFM-SLURM Integration, the integration should automatically handle every submitted SLURM job. + + - Use the SLURM controller to submit a new SLURM job, for example: sbatch -N2 batch1.sh. + - On the UFM side, a new SHArP reservation will be created based on the job ID and the job nodes if the + sharp_allocation parameter in the ufm_slurm.conf file is set to true. + - A new pkey containing all the ports of the job nodes will be created to allow the SHArP nodes to communicate on top of it. + - After the SLURM job is completed, UFM deletes the created SHArP reservation and pkey. + - From the time a job is submitted by the SLURM server until its completion, a log file called /var/log/slurm/ufm_slurm.log + records all actions and errors that occur during execution. This log file location can be changed by modifying the + log_file_name parameter in the UFM_SLURM configuration file located at /etc/slurm/ufm_slurm.conf. 
diff --git a/plugins/advanced_hello_world_plugin/.ci/ci_matrix.yaml b/plugins/advanced_hello_world_plugin/.ci/ci_matrix.yaml new file mode 100644 index 000000000..7eb5339e0 --- /dev/null +++ b/plugins/advanced_hello_world_plugin/.ci/ci_matrix.yaml @@ -0,0 +1,54 @@ +--- +job: ufm-advanced-hello-world-plugin + +registry_host: harbor.mellanox.com +registry_path: /swx-storage/ci-demo +registry_auth: swx-storage + +env: + plugin_dir: advanced_hello_world_plugin + plugin_name: ufm-plugin-advanced_hello_world + DOCKER_CLI_EXPERIMENTAL: enabled + +kubernetes: + cloud: swx-k8s-spray + +volumes: + - {mountPath: /var/run/docker.sock, hostPath: /var/run/docker.sock} + - {mountPath: /auto/UFM, hostPath: /auto/UFM } + + +runs_on_dockers: + - {file: '.ci/Dockerfile', arch: 'x86_64', name: 'plugin_worker', tag: 'latest'} + + +steps: + - name: Build Plugin + containerSelector: "{name: 'plugin_worker'}" + run: | + cd plugins/$plugin_dir/build + bash -x ./docker_build.sh latest / + ls -l / + cp /ufm-plugin* /auto/UFM/tmp/${JOB_NAME}/${BUILD_ID}/ + parallel: true + +pipeline_start: + run: | + mkdir -p /auto/UFM/tmp/${JOB_NAME}/${BUILD_ID} + + +pipeline_stop: + run: | + echo 'All done'; + #sudo rm -rf /auto/UFM/tmp/${JOB_NAME}/${BUILD_ID} + + + + + + + + + +# Fail job if one of the steps fails or continue +failFast: false diff --git a/plugins/advanced_hello_world_plugin/README.md b/plugins/advanced_hello_world_plugin/README.md index aea7454b1..8b160d291 100644 --- a/plugins/advanced_hello_world_plugin/README.md +++ b/plugins/advanced_hello_world_plugin/README.md @@ -31,20 +31,27 @@ Functions commonly added by optional UFM plugins include: * Each plugin should consist of the following files: - 1. _**init.sh**_ : Initialize script that should be placed in the root folder _**(/init.sh)**_ and have execute permission. It is being invoked by the UFM plugin manager upon plugin deployment (when adding a new plugin). 
The developer may copy the plugins configuration files to _**/config**_ folder which is mapped to the DRBD partition on the host (location on host: _/opt/ufm/files/plugins/{plugin name}_) + 1. **init.sh** : Initialize script that should be placed in the root folder **(/init.sh)** and have execute permission. It is being invoked by the UFM plugin manager upon plugin deployment (when adding a new plugin). The developer may copy the plugins configuration files to **/config** folder which is mapped to the DRBD partition on the host (location on host: _/opt/ufm/files/conf/plugins/{plugin name}_) - 2. _**deinit.sh**_ : De-initialize script that should be placed in the root folder _**(/deinit.sh)**_, have execute permission and return zero. It is being invoked by the UFM plugin manager upon plugin removal. The developer may clear files and folders that are placed on the host (e.g., log files) + 2. **deinit.sh** : De-initialize script that should be placed in the root folder **(/deinit.sh)**, have execute permission and return zero. It is being invoked by the UFM plugin manager upon plugin removal. The developer may clear files and folders that are placed on the host (e.g., log files) + + 3. **upgrade.sh** : This is the upgrade script, which should be placed in the root folder **(/upgrade.sh)** and granted execute permission. It is invoked by the UFM plugin manager upon plugin upgrade. During the upgrade stage, the developer may handle the upgrade of the plugin's configuration files. If the upgrade is successful (i.e., exits with zero), the new plugin's TAG (version) will be updated in the plugins' configuration file located at _/opt/ufm/files/conf/ufm_plugins.conf_. * Each plugin may have the following files: - 1. _**{plugin name}_shared_volumes.conf**_ : Contains a list of folders that are mapped between the host and the container. It consists of multiple lines in format “:” (e.g., /opt/ufm/files/log:/log). + 1. 
_**{plugin name}_shared_volumes.conf**_ : Contains a list of folders that are mapped between the host and the container. It consists of multiple lines in format “:” (e.g., /opt/ufm/files/conf:/opt/ufm/files/conf). * The following folders are shared between the host and the container by default: - 1. _**{UFM files path}/conf/plugins/{plugin name}:/config**_ : This folder should contain the plugin’s configuration files. It is managed by DRBD (in case of HA) and thus, it is replicated between master and standby nodes. - 2. _**/opt/ufm/ufm_plugins_data/{plugin name}:/data**_ : This folder may contain the plugin’s data files that should be persistent + 1. **{UFM files path}/conf/plugins/{plugin name}:/config** : This folder should contain the plugin’s configuration files. It is managed by DRBD (in case of HA) and thus, it is replicated between master and standby nodes. + 2. **{UFM files path}/log/plugins/{plugin name}:/log** : This folder should contain the plugin’s log files. It is managed by DRBD (in case of HA) and thus, it is replicated between master and standby nodes. + 3. **/opt/ufm/ufm_plugins_data/{plugin name}:/data** : This folder may contain the plugin’s data files that should be persistent. Please note that this folder is not managed by DRBD (in case of HA) and thus, it will not be replicated between the master and standby nodes on the failover. * **Note**: The default folders are being removed by the UFM plugin manager upon plugin’s removal, while the plugin is responsible to remove all the files and folders which are mapped to host and are listed in shared volumes configuration file (e.g., logs files that were written to /log folder in the container which is mapped to /opt/ufm/files/log folder on the host) - 2. _**{plugin name}_httpd_proxy.conf**_ : Contains the port that UFM may use to forward the plugin’s HTTP request. It consists of one line in format _“port={port number}”_. 
All the HTTP requests to the plugin are being forwarded by the UFM server (authentication is handled by UFM). + 2. **{plugin name}_httpd_proxy.conf** : Contains the port that UFM may use to forward the plugin’s HTTP request. It consists of one line in format _“port={port number}”_. All the HTTP requests to the plugin are being forwarded by the UFM server (authentication is handled by UFM). + + 3. **ufm_plugin_{plugin name}_httpd.conf** : This file contains the Apache configuration for the plugin, redirecting HTTP requests from Apache directly to the plugin's web server. These requests are not authenticated by UFM. + + 4. **{plugin name}_runtime_args.conf** : This file serves as the configuration file for the runtime plugin's resources. It consists of key-value arguments (e.g., cpus=1.5). Currently, only CPU limits are supported. * **Note**: The configuration files _{plugin name}_shared_volumes.conf_ , _{plugin name}_httpd_proxy.conf_ and any custom configurations files must be copied to _/config_ folder upon plugin deployment for UFM to manage the plugin (the plugins configuration is written to **{UFM files path}/conf/ufm_plugins.conf** @@ -84,13 +91,15 @@ Functions commonly added by optional UFM plugins include: ## Lifecycle The UFM plugins lifecycle is managed by UFM. Currently, It is the user responsibility to pull/load the plugin’s Docker container image on both master and standby nodes. -* **Add** : Upon addition, the plugin’s Docker container is started, and the _**/init.sh**_ script is invoked. Its configuration files must be copied to _**/config**_ folder. The container will exit once the init stage is done and it will be re-started upon UFM startup. In case UFM is already running when the plugin is deployed, it will be started automatically. +* **Add** : Upon addition, the plugin’s Docker container is started, and the **_/init.sh_** script is invoked. Its configuration files must be copied to **_/config_** folder. 
The container will exit once the init stage is done and it will be re-started upon UFM startup. In case UFM is already running when the plugin is deployed, it will be started automatically. * **Disable** : The plugin’s Docker container is stopped. However, its data is still accessible via the host. * **Enable** : The plugin’s Docker container is re-started. -* **Remove** : The plugin’s Docker container is stopped, and the _**/deinit.sh**_ script is being invoked. In this stage, all the plugin’s data is removed. +* **Remove** : The plugin’s Docker container is stopped, and the **_/deinit.sh_** script is being invoked. In this stage, all the plugin’s data is removed. + +* **Upgrade** : The plugin's **_/upgrade.sh_** script is invoked once the plugin is stopped, either manually before the upgrade or via the optional "force" flag. In this stage, all the plugin's data may be upgraded. It is up to the developer to decide whether the data needs to be upgraded. The **_upgrade.sh_** script receives the plugin's new TAG (version) as an argument ("-to_version {TAG}"). If the upgrade is successful (i.e., exits with zero), the new plugin's TAG (version) will be updated in the plugins' configuration file located at _/opt/ufm/files/conf/ufm_plugins.conf_. The developer should decide how to proceed in case the upgrade has failed (e.g., revert to the old configuration and return a non-zero value, or reset to the new configuration with default values and return zero). **Note**: The plugin’s Docker container is started/stopped upon UFM start/stop. In case UFM is already running when the plugin is added/enabled, it will be started. 
While, in case it is disabled/removed, it will be stopped @@ -133,9 +142,10 @@ Currently, the UFM supports extending the following areas: | hookInfo.order | False | The order of the added menu's item / tab | | hookInfo.icon | False | Fontawsome class to menu icons | + You can find [this sample json](./conf/advanced_hello_world_ui_conf.json) that contains all the supported flows -## Hello-world plugin example +## Hello-world plugin examples We are providing hello-world plugin example that contains E2E real examples about the configurations REST API that based on the python flask server and UI angular application with all the supported cases. and also it contains examples on the above configurations files. diff --git a/plugins/bright_plugin/build/docker_build.sh b/plugins/bright_plugin/build/docker_build.sh index c2cb5a125..162b838d1 100755 --- a/plugins/bright_plugin/build/docker_build.sh +++ b/plugins/bright_plugin/build/docker_build.sh @@ -69,7 +69,6 @@ function build_docker_image() image_with_prefix_and_version="${prefix}/${image_name}:${image_version}" pushd ${build_dir} - echo "docker build --network host --no-cache --pull -t ${image_with_prefix_and_version} . --compress" docker build --network host --no-cache --pull -t ${image_with_prefix_and_version} . --compress diff --git a/plugins/hello_world_plugin/README.md b/plugins/hello_world_plugin/README.md index c9e3225ef..2efae4c06 100644 --- a/plugins/hello_world_plugin/README.md +++ b/plugins/hello_world_plugin/README.md @@ -43,7 +43,7 @@ Functions commonly added by optional UFM plugins include: * The following folders are shared between the host and the container by default: 1. **{UFM files path}/conf/plugins/{plugin name}:/config** : This folder should contain the plugin’s configuration files. It is managed by DRBD (in case of HA) and thus, it is replicated between master and standby nodes. 2. **{UFM files path}/log/plugins/{plugin name}:/log** : This folder should contain the plugin’s log files. 
It is managed by DRBD (in case of HA) and thus, it is replicated between master and standby nodes. - 3. **/opt/ufm/ufm_plugins_data/{plugin name}:/data** : This folder may contain the plugin’s data files that should be persistent + 3. **/opt/ufm/ufm_plugins_data/{plugin name}:/data** : This folder may contain the plugin’s data files that should be persistent. Please note that this folder is not managed by DRBD (in case of HA) and thus, it will not be replicated between the master and standby nodes on the failover. * **Note**: The default folders are being removed by the UFM plugin manager upon plugin’s removal, while the plugin is responsible to remove all the files and folders which are mapped to host and are listed in shared volumes configuration file (e.g., logs files that were written to /log folder in the container which is mapped to /opt/ufm/files/log folder on the host) diff --git a/plugins/pdr_deterministic_plugin/.ci/do_add_plugin.sh b/plugins/pdr_deterministic_plugin/.ci/do_add_plugin.sh index 2d2b5b782..210c2ac42 100755 --- a/plugins/pdr_deterministic_plugin/.ci/do_add_plugin.sh +++ b/plugins/pdr_deterministic_plugin/.ci/do_add_plugin.sh @@ -1,7 +1,7 @@ #!/bin/bash -x export SERVER_HOST=$SERVER_HOST expect << EOF -spawn ssh admin@${SERVER_HOST} +spawn ssh -o StrictHostKeyChecking=no admin@${SERVER_HOST} expect "Password:*" send -- "admin\r" expect "> " diff --git a/plugins/pdr_deterministic_plugin/.ci/do_install_plugin_server.sh b/plugins/pdr_deterministic_plugin/.ci/do_install_plugin_server.sh index a8adc7ec9..994cfcd9e 100755 --- a/plugins/pdr_deterministic_plugin/.ci/do_install_plugin_server.sh +++ b/plugins/pdr_deterministic_plugin/.ci/do_install_plugin_server.sh @@ -4,7 +4,7 @@ namehost=$(echo $HOSTNAME) export SERVER_HOST=$SERVER_HOST export PASSWORD=$PASSWORD expect << EOF -spawn ssh admin@${SERVER_HOST} +spawn ssh -o StrictHostKeyChecking=no admin@${SERVER_HOST} expect "Password:*" send -- "admin\r" expect "> " diff --git 
a/plugins/pdr_deterministic_plugin/.ci/do_load_plugin.sh b/plugins/pdr_deterministic_plugin/.ci/do_load_plugin.sh index ee05057b6..ba8c9bae6 100755 --- a/plugins/pdr_deterministic_plugin/.ci/do_load_plugin.sh +++ b/plugins/pdr_deterministic_plugin/.ci/do_load_plugin.sh @@ -1,7 +1,7 @@ #!/bin/bash -x export SERVER_HOST=$SERVER_HOST expect << EOF -spawn ssh admin@${SERVER_HOST} +spawn ssh -o StrictHostKeyChecking=no admin@${SERVER_HOST} expect "Password:*" send -- "admin\r" expect "> " diff --git a/plugins/pdr_deterministic_plugin/.ci/do_remove_plugin.sh b/plugins/pdr_deterministic_plugin/.ci/do_remove_plugin.sh index ee77f586c..978445e75 100755 --- a/plugins/pdr_deterministic_plugin/.ci/do_remove_plugin.sh +++ b/plugins/pdr_deterministic_plugin/.ci/do_remove_plugin.sh @@ -1,7 +1,7 @@ #!/bin/bash -x export SERVER_HOST=$SERVER_HOST expect << EOF -spawn ssh admin@${SERVER_HOST} +spawn ssh -o StrictHostKeyChecking=no admin@${SERVER_HOST} expect "Password:*" send -- "admin\r" expect "> " diff --git a/plugins/pdr_deterministic_plugin/build/config/pdr_deterministic.conf b/plugins/pdr_deterministic_plugin/build/config/pdr_deterministic.conf index c0ca1b8ae..a4535b2ce 100644 --- a/plugins/pdr_deterministic_plugin/build/config/pdr_deterministic.conf +++ b/plugins/pdr_deterministic_plugin/build/config/pdr_deterministic.conf @@ -27,9 +27,6 @@ DEISOLATE_CONSIDER_TIME=300 AUTOMATIC_DEISOLATE=True # if set to false, the plugin will not perform deisolation DO_DEISOLATION=True -DYNAMIC_WAIT_TIME=30 -# number of times to check if a dynamic session is unresponsive before restarting it -DYNAMIC_UNRESPONSIVE_LIMIT=3 [Metrics] # in Celsius diff --git a/plugins/pdr_deterministic_plugin/tests/simulation_telemetry.py b/plugins/pdr_deterministic_plugin/tests/simulation_telemetry.py index 59023bf39..edaff7349 100755 --- a/plugins/pdr_deterministic_plugin/tests/simulation_telemetry.py +++ b/plugins/pdr_deterministic_plugin/tests/simulation_telemetry.py @@ -25,12 +25,12 @@ lock = Lock() 
PHY_EFF_ERROR = "phy_effective_errors" -PHY_SYMBOL_ERROR = "phy_symbol_errors" +PHY_SYMBOL_ERROR = "Symbol_Errors" RCV_PACKETS_COUNTER = "PortRcvPktsExtended" -RCV_ERRORS_COUNTER = "PortRcvErrorsExtended" -LINK_DOWN_COUNTER = "LinkDownedCounterExtended" -RCV_REMOTE_PHY_ERROR_COUNTER = "PortRcvRemotePhysicalErrorsExtended" -TEMP_COUNTER = "CableInfo.Temperature" +RCV_ERRORS_COUNTER = "PortRcvErrors" +LINK_DOWN_COUNTER = "Link_Down_IB" +RCV_REMOTE_PHY_ERROR_COUNTER = "PortRcvRemotePhysicalErrors" +TEMP_COUNTER = "Module_Temperature" FEC_MODE = "fec_mode_active" ENDPOINT_CONFIG = {} @@ -192,7 +192,7 @@ def start_server(port:str,changes_intervals:int, run_forever:bool): t.daemon = True t.start() counters_names = list(counters.keys()) - header = ['timestamp', 'source_id,tag,node_guid,port_guid,port_num'] + counters_names + header = ['timestamp', 'source_id,tag,Node_GUID,port_guid,Port_Number'] + counters_names endpoint['data'] = "" while True: # lock.acquire() diff --git a/plugins/pdr_deterministic_plugin/ufm_sim_web_service/constants.py b/plugins/pdr_deterministic_plugin/ufm_sim_web_service/constants.py index 02f07b557..b9a210f8a 100644 --- a/plugins/pdr_deterministic_plugin/ufm_sim_web_service/constants.py +++ b/plugins/pdr_deterministic_plugin/ufm_sim_web_service/constants.py @@ -43,7 +43,7 @@ class PDRConstants(object): SWITCH_TO_HOST_ISOLATION = "SWITCH_TO_HOST_ISOLATION" TEST_MODE = "TEST_MODE" TEST_MODE_PORT = 9090 - DYNAMIC_UNRESPONSIVE_LIMIT = "DYNAMIC_UNRESPONSIVE_LIMIT" + SECONDARY_TELEMETRY_PORT = 9002 GET_SESSION_DATA_REST = "/monitoring/session/0/data" POST_EVENT_REST = "/app/events/external_event" @@ -53,8 +53,7 @@ class PDRConstants(object): GET_ACTIVE_PORTS_REST = "/resources/ports?active=true" API_HEALTHY_PORTS = "healthy_ports" API_ISOLATED_PORTS = "isolated_ports" - DYNAMIC_SESSION_REST = "/app/telemetry/instances/%s" - STATUS_DYNAMIC_SESSION_REST = "/app/telemetry/instances/status" + SECONDARY_INSTANCE = "low_freq_debug" EXTERNAL_EVENT_ERROR = 554 
EXTERNAL_EVENT_ALERT = 553 @@ -70,12 +69,12 @@ class PDRConstants(object): CONF_USERNAME = 'admin' CONF_PASSWORD = 'password' - TEMP_COUNTER = "CableInfo.Temperature" - ERRORS_COUNTER = "errors" + ERRORS_COUNTER = "Symbol_Errors" RCV_PACKETS_COUNTER = "PortRcvPktsExtended" - RCV_ERRORS_COUNTER = "PortRcvErrorsExtended" - RCV_REMOTE_PHY_ERROR_COUNTER = "PortRcvRemotePhysicalErrorsExtended" - LNK_DOWNED_COUNTER = "LinkDownedCounterExtended" + RCV_ERRORS_COUNTER = "PortRcvErrors" + RCV_REMOTE_PHY_ERROR_COUNTER = "PortRcvRemotePhysicalErrors" + TEMP_COUNTER = "Module_Temperature" + LNK_DOWNED_COUNTER = "Link_Down_IB" PHY_RAW_ERROR_LANE0 = "phy_raw_errors_lane0" PHY_RAW_ERROR_LANE1 = "phy_raw_errors_lane1" @@ -98,6 +97,9 @@ class PDRConstants(object): NODE_TYPE_OTHER = "other" BER_TELEMETRY = "ber_telemetry" + NODE_GUID = "Node_GUID" + PORT_NUMBER = "Port_Number" + ISSUE_PDR = "pdr" ISSUE_BER = "ber" ISSUE_PDR_BER = "pdr&ber" @@ -105,6 +107,5 @@ class PDRConstants(object): ISSUE_INIT = "init" ISSUE_LINK_DOWN = "link_down" - PDR_DYNAMIC_NAME = "pdr_dynamic" # intervals in seconds for testing ber values and corresponding thresholds BER_THRESHOLDS_INTERVALS = [(125 * 60, 3), (12 * 60, 2.88)] diff --git a/plugins/pdr_deterministic_plugin/ufm_sim_web_service/isolation_mgr.py b/plugins/pdr_deterministic_plugin/ufm_sim_web_service/isolation_mgr.py index 3c271ada3..7e20ee364 100644 --- a/plugins/pdr_deterministic_plugin/ufm_sim_web_service/isolation_mgr.py +++ b/plugins/pdr_deterministic_plugin/ufm_sim_web_service/isolation_mgr.py @@ -22,17 +22,9 @@ from exclude_list import ExcludeList from constants import PDRConstants as Constants -from ufm_communication_mgr import DynamicSessionState, UFMCommunicator +from ufm_communication_mgr import UFMCommunicator # should actually be persistent and thread safe dictionary pf PortStates - -class DynamicTelemetryUnresponsive(Exception): - """ - Exception raised when the dynamic telemetry is unresponsive. 
- """ - pass - - class PortData(object): """ Represents the port data. @@ -182,13 +174,11 @@ def __init__(self, ufm_client: UFMCommunicator, logger): self.do_deisolate = pdr_config.getboolean(Constants.CONF_ISOLATION,Constants.DO_DEISOLATION) self.deisolate_consider_time = pdr_config.getint(Constants.CONF_ISOLATION,Constants.DEISOLATE_CONSIDER_TIME) self.automatic_deisolate = pdr_config.getboolean(Constants.CONF_ISOLATION,Constants.AUTOMATIC_DEISOLATE) - self.dynamic_wait_time = pdr_config.getint(Constants.CONF_ISOLATION,"DYNAMIC_WAIT_TIME") self.temp_check = pdr_config.getboolean(Constants.CONF_ISOLATION,Constants.CONFIGURED_TEMP_CHECK) self.link_down_isolation = pdr_config.getboolean(Constants.CONF_ISOLATION,Constants.LINK_DOWN_ISOLATION) self.switch_hca_isolation = pdr_config.getboolean(Constants.CONF_ISOLATION,Constants.SWITCH_TO_HOST_ISOLATION) self.test_mode = pdr_config.getboolean(Constants.CONF_COMMON,Constants.TEST_MODE, fallback=False) self.test_iteration = 0 - self.dynamic_unresponsive_limit = pdr_config.getint(Constants.CONF_ISOLATION,Constants.DYNAMIC_UNRESPONSIVE_LIMIT, fallback=3) # Take from Conf self.logger = logger self.ber_intervals = Constants.BER_THRESHOLDS_INTERVALS if not self.test_mode else [[0.5 * 60, 3]] @@ -215,13 +205,6 @@ def __init__(self, ufm_client: UFMCommunicator, logger): Constants.LNK_DOWNED_COUNTER, ] - # bring telemetry data on disabled ports - self.dynamic_extra_configuration = { - "plugin_env_CLX_EXPORT_API_ENABLE_DOWN_PORT_COUNTERS": "1", - "plugin_env_CLX_EXPORT_API_ENABLE_DOWN_PHY": "1", - "arg_11": "" - } - self.exclude_list = ExcludeList(self.logger) def calc_max_ber_wait_time(self, min_threshold): @@ -403,10 +386,10 @@ def find_peer_row_for_port(self, port_obj, ports_counters): return None peer_guid, peer_num = port_obj.peer.split('_') # Fix peer guid format for future search - if ports_counters['port_guid'].iloc[0].startswith('0x') and not peer_guid.startswith('0x'): + if 
ports_counters[Constants.NODE_GUID].iloc[0].startswith('0x') and not peer_guid.startswith('0x'): peer_guid = f'0x{peer_guid}' #TODO check for a way to save peer row in data structure for performance - peer_row_list = ports_counters.loc[(ports_counters['port_guid'] == peer_guid) & (ports_counters['port_num'] == int(peer_num))] + peer_row_list = ports_counters.loc[(ports_counters[Constants.NODE_GUID] == peer_guid) & (ports_counters[Constants.PORT_NUMBER] == int(peer_num))] if peer_row_list.empty: self.logger.warning(f"Peer port {port_obj.peer} not found in ports data") return None @@ -447,10 +430,8 @@ def check_temp_issue(self, port_obj, row, timestamp): if cable_temp is not None and not pd.isna(cable_temp): if cable_temp in ["NA", "N/A", "", "0C", "0"]: return None - # Get new and saved temperature values - cable_temp = int(cable_temp.split("C")[0]) if isinstance(cable_temp, str) else cable_temp - old_cable_temp = port_obj.counters_values.get(Constants.TEMP_COUNTER); - # Save new temperature value + cable_temp = int(cable_temp.split("C")[0]) if type(cable_temp) == str else cable_temp + old_cable_temp = port_obj.counters_values.get(Constants.TEMP_COUNTER, 0) port_obj.counters_values[Constants.TEMP_COUNTER] = cable_temp # Check temperature condition if cable_temp and (cable_temp > self.tmax): @@ -530,17 +511,17 @@ def check_ber_issue(self, port_obj, row, timestamp): return Issue(port_obj.port_name, Constants.ISSUE_BER) return None - def read_next_set_of_high_ber_or_pdr_ports(self, endpoint_port): + def read_next_set_of_high_ber_or_pdr_ports(self): """ Read the next set of ports and check if they have high BER, PDR, temperature or link downed issues """ issues = {} - ports_counters = self.ufm_client.get_telemetry(endpoint_port, Constants.PDR_DYNAMIC_NAME,self.test_mode) + ports_counters = self.ufm_client.get_telemetry(self.test_mode) if ports_counters is None: self.logger.error("Couldn't retrieve telemetry data") - raise DynamicTelemetryUnresponsive - for index, row in 
ports_counters.iterrows(): - port_name = f"{row.get('port_guid', '').split('x')[-1]}_{row.get('port_num', '')}" + return {} + for _, row in ports_counters.iterrows(): + port_name = f"{row.get(Constants.NODE_GUID, '').split('x')[-1]}_{row.get(Constants.PORT_NUMBER, '')}" if self.exclude_list.contains(port_name): # The port is excluded from analysis continue @@ -736,35 +717,6 @@ def get_isolation_state(self): isolated_port.update(Constants.ISSUE_OONOC) self.isolated_ports[port_name] = isolated_port - def start_telemetry_session(self): - """ - Starts a telemetry session. - - Returns: - str: The port number if the dynamic session is started successfully, False otherwise. - """ - self.logger.info("Starting telemetry session") - guids = self.get_requested_guids() - response = self.ufm_client.start_dynamic_session(Constants.PDR_DYNAMIC_NAME, self.telemetry_counters, self.interval, guids, self.dynamic_extra_configuration) - if response and response.status_code == http.HTTPStatus.ACCEPTED: - port = str(int(response.content)) - else: - self.logger.error(f"Failed to start dynamic session: {response}") - return False - return port - - def update_telemetry_session(self): - """ - Updates the telemetry session by requesting and updating the dynamic session with the specified interval and guids. - - Returns: - The response from the UFM client after updating the dynamic session. - """ - self.logger.info("Updating telemetry session") - guids = self.get_requested_guids() - response = self.ufm_client.update_dynamic_session(Constants.PDR_DYNAMIC_NAME, self.interval, guids) - return response - def get_requested_guids(self): """ Get the requested GUIDs and their corresponding ports. 
@@ -782,63 +734,13 @@ def get_requested_guids(self): requested_guids = [{"guid": sys_guid, "ports": ports} for sys_guid, ports in guids.items()] return requested_guids - # this function create dynamic telemetry and returns the port of this telemetry - def run_telemetry_get_port(self): - """ - Runs the telemetry and returns the endpoint port. - - If the test mode is enabled, it returns the test mode port. - Otherwise, it waits for the dynamic session to start, starts the telemetry session, - and retrieves the endpoint port. - - Returns: - int: The endpoint port for the telemetry. - - Raises: - Exception: If an error occurs during the process. - """ - if self.test_mode: - return Constants.TEST_MODE_PORT - try: - while True: - session_state = self.ufm_client.get_dynamic_session_state(Constants.PDR_DYNAMIC_NAME) - if session_state == DynamicSessionState.RUNNING: - # Telemetry session is running - break - if session_state == DynamicSessionState.NONE: - # Start new session - self.logger.info("Waiting for dynamic session to start") - endpoint_port = self.start_telemetry_session() - time.sleep(self.dynamic_wait_time) - else: - # Stop inactive session - self.logger.info("Waiting for inactive dynamic session to stop") - self.ufm_client.stop_dynamic_session(Constants.PDR_DYNAMIC_NAME) - time.sleep(self.dynamic_wait_time) - except Exception as e: - self.ufm_client.stop_dynamic_session(Constants.PDR_DYNAMIC_NAME) - time.sleep(self.dynamic_wait_time) - endpoint_port = self.ufm_client.dynamic_session_get_port(Constants.PDR_DYNAMIC_NAME) - return endpoint_port - - def restart_telemetry_session(self): - """ - Restart the dynamic telemetry session and return the new endpoint port - """ - self.logger.info("Restarting telemetry session") - self.ufm_client.stop_dynamic_session(Constants.PDR_DYNAMIC_NAME) - time.sleep(self.dynamic_wait_time) - endpoint_port = self.run_telemetry_get_port() - return endpoint_port - def main_flow(self): """ Executes the main flow of the Isolation Manager. 
This method synchronizes with the telemetry clock, retrieves ports metadata, - starts the telemetry session, and continuously retrieves telemetry data to - determine the states of the ports. It handles dynamic telemetry unresponsiveness, - skips isolation if too many ports are detected as unhealthy, and evaluates + and continuously retrieves telemetry data from the secondary telemetry to + determine the states of the ports. It skips isolation if too many ports are detected as unhealthy, and evaluates isolation and deisolation for reported issues and ports with specific causes. Args: @@ -850,9 +752,6 @@ def main_flow(self): self.logger.info("Isolation Manager initialized, starting isolation loop") self.get_ports_metadata() self.logger.info("Retrieved ports metadata") - endpoint_port = self.run_telemetry_get_port() - self.logger.info("telemetry session started") - dynamic_telemetry_unresponsive_count = 0 while(True): try: t_begin = time.time() @@ -864,15 +763,9 @@ def main_flow(self): self.logger.info(f"Retrieving test mode telemetry data to determine ports' states: iteration {self.test_iteration}") self.test_iteration += 1 try: - issues = self.read_next_set_of_high_ber_or_pdr_ports(endpoint_port) - except DynamicTelemetryUnresponsive: - dynamic_telemetry_unresponsive_count += 1 - if dynamic_telemetry_unresponsive_count > self.dynamic_unresponsive_limit: - self.logger.error(f"Dynamic telemetry is unresponsive for {dynamic_telemetry_unresponsive_count} times, restarting telemetry session...") - endpoint_port = self.restart_telemetry_session() - dynamic_telemetry_unresponsive_count = 0 - self.test_iteration = 0 - continue + issues = self.read_next_set_of_high_ber_or_pdr_ports() + except (KeyError,) as e: + self.logger.error(f"failed to read information with error {e}") if len(issues) > self.max_num_isolate: # UFM send external event event_msg = f"got too many ports detected as unhealthy: {len(issues)}, skipping isolation" diff --git
a/plugins/pdr_deterministic_plugin/ufm_sim_web_service/ufm_communication_mgr.py b/plugins/pdr_deterministic_plugin/ufm_sim_web_service/ufm_communication_mgr.py index 1b38a0928..3f2cfe33e 100644 --- a/plugins/pdr_deterministic_plugin/ufm_sim_web_service/ufm_communication_mgr.py +++ b/plugins/pdr_deterministic_plugin/ufm_sim_web_service/ufm_communication_mgr.py @@ -11,21 +11,14 @@ # from enum import Enum +import urllib.error from constants import PDRConstants as Constants import requests import logging -import copy +import urllib import http import pandas as pd -class DynamicSessionState(Enum): - """ - States of telemetry session instance - """ - NONE = 0 - INACTIVE = 1 - RUNNING = 2 - class UFMCommunicator: def __init__(self, host='127.0.0.1', ufm_port=8000): @@ -40,12 +33,14 @@ def get_request(self, uri, headers=None): request = self.ufm_protocol + '://' + self._host + uri if not headers: headers = self.headers - response = requests.get(request, verify=False, headers=headers) - logging.info("UFM API Request Status: {}, URL: {}".format(response.status_code, request)) - if response.status_code == http.client.OK: - return response.json() - else: - return + try: + response = requests.get(request, verify=False, headers=headers) + logging.info("UFM API Request Status: {}, URL: {}".format(response.status_code, request)) + if response.status_code == http.client.OK: + return response.json() + except ConnectionRefusedError as e: + logging.error(f"failed to get data from {request} with error {e}") + return def send_request(self, uri, data, method=Constants.POST_METHOD, headers=None): request = self.ufm_protocol + '://' + self._host + uri @@ -59,18 +54,23 @@ def send_request(self, uri, data, method=Constants.POST_METHOD, headers=None): response = requests.delete(url=request, verify=False, headers=headers) logging.info("UFM API Request Status: {}, URL: {}".format(response.status_code, request)) return response - - def get_telemetry(self, port, instance_name,test_mode): + + def 
get_telemetry(self,test_mode): + """ + Get the telemetry from the secondary telemetry instance; in test mode, get it from the simulation instead. + Return a DataFrame of the telemetry, or None if it could not be fetched. + """ if test_mode: url = f"http://127.0.0.1:9090/csv/xcset/simulated_telemetry" else: - url = f"http://127.0.0.1:{port}/csv/xcset/{instance_name}" + url = f"http://127.0.0.1:{Constants.SECONDARY_TELEMETRY_PORT}/csv/xcset/{Constants.SECONDARY_INSTANCE}" try: telemetry_data = pd.read_csv(url) - except Exception as e: + except (pd.errors.ParserError, pd.errors.EmptyDataError, urllib.error.URLError) as e: logging.error(f"Failed to get telemetry data from UFM, fetched url={url}. Error: {e}") telemetry_data = None return telemetry_data + def send_event(self, message, event_id=Constants.EXTERNAL_EVENT_NOTICE, external_event_name="PDR Plugin Event", external_event_type="PDR Plugin Event"): data = { @@ -119,41 +119,3 @@ def get_ports_metadata(self): def get_port_metadata(self, port_name): return self.get_request("%s/%s" % (Constants.GET_PORTS_REST, port_name)) - - def start_dynamic_session(self, instance_name, counters, sample_rate, guids, extra_configuration=None): - data = { - "counters": counters, - "sample_rate": sample_rate, - "requested_guids": guids, - "is_registered_discovery": False - } - if extra_configuration: - data["configuration"] = extra_configuration - return self.send_request(Constants.DYNAMIC_SESSION_REST % instance_name, data, method=Constants.POST_METHOD) - - def update_dynamic_session(self, instance_name, sample_rate, guids): - data = { - "sample_rate": sample_rate, - "requested_guids": guids - } - return self.send_request(Constants.DYNAMIC_SESSION_REST % instance_name, data, method=Constants.PUT_METHOD) - - def get_dynamic_session_state(self, instance_name): - response = self.get_request(Constants.STATUS_DYNAMIC_SESSION_REST) - if response: - instance_status = response.get(instance_name) - if instance_status: - if instance_status.get("status") == "running": - return
DynamicSessionState.RUNNING - else: - return DynamicSessionState.INACTIVE - return DynamicSessionState.NONE - - def stop_dynamic_session(self, instance_name): - data = {} - return self.send_request(Constants.DYNAMIC_SESSION_REST % instance_name, data, method=Constants.DELETE_METHOD) - - def dynamic_session_get_port(self, instance_name): - data = self.get_request(Constants.DYNAMIC_SESSION_REST % instance_name) - if data: - return data.get("endpoint_port") diff --git a/plugins/ufm_consumer_plugin/conf/ufm_consumer_ui_conf.json b/plugins/ufm_consumer_plugin/conf/ufm_consumer_ui_conf.json index f98f9e95d..e93c52769 100644 --- a/plugins/ufm_consumer_plugin/conf/ufm_consumer_ui_conf.json +++ b/plugins/ufm_consumer_plugin/conf/ufm_consumer_ui_conf.json @@ -5,7 +5,7 @@ "type": "leftMenu", "label": "UFM Consumer", "key": "ufm_consumer", - "route": ":APACHE_PORT", + "route": "/ufm_consumer_web", "icon": "fas fa-network-wired", "external": true } diff --git a/plugins/ufm_consumer_plugin/conf/ufm_plugin_ufm_consumer_httpd.conf b/plugins/ufm_consumer_plugin/conf/ufm_plugin_ufm_consumer_httpd.conf new file mode 100644 index 000000000..77647092b --- /dev/null +++ b/plugins/ufm_consumer_plugin/conf/ufm_plugin_ufm_consumer_httpd.conf @@ -0,0 +1,24 @@ +Alias /ufm_consumer_web /opt/ufm/ufm_plugins_data/ufm_consumer/media + + + Options Indexes MultiViews + Options Indexes FollowSymLinks + AllowOverride None + Require all granted + Header set X-Consumer-Plugin "yes" + + + + ProxyPassMatch http://127.0.0.1:@@CONSUMER_REST_PORT@@/$1 retry=1 Keepalive=On timeout=300 + ProxyPassReverse http://127.0.0.1:@@CONSUMER_REST_PORT@@/$1 + AuthType Basic + AuthName "UFM Consumer Plugin rest server" + WSGIPassAuthorization On + AuthBasicProvider wsgi + WSGIAuthUserScript /opt/ufm/scripts/ufm_authentication_scripts/auth.py + AuthBasicAuthoritative Off + ErrorDocument 401 /login + RequestHeader set X-Remote-User "%{REMOTE_USER}s" + RequestHeader unset X-Forwarded-Server + Require valid-user + diff 
--git a/plugins/ufm_consumer_plugin/scripts/config_consumer.sh b/plugins/ufm_consumer_plugin/scripts/config_consumer.sh index 695960b8c..d63665244 100755 --- a/plugins/ufm_consumer_plugin/scripts/config_consumer.sh +++ b/plugins/ufm_consumer_plugin/scripts/config_consumer.sh @@ -9,7 +9,8 @@ sqlite_conf=/config/sqlite sqlite_target=/opt/ufm/files/sqlite log_dir=/log auth_log_file_path=/opt/ufm/files/log/authentication_service.log - +ufm_media_original_path=/opt/ufm/media +consumer_media_path=/data/media keep_config_file() { @@ -118,5 +119,14 @@ fi chown -R ufmapp:ufmapp ${sqlite_target} ${sqlite_conf} ${log_dir} -echo "Consumer configuration completed succesfully." +# media directory of the UFM consumer should be shared with the host +# it will be served by the Host's apache, it should be accessible via the host +# /data default shared volume with the host's dir /opt/ufm/ufm_plugins_data/ufm_consumer/ +if [ ! -f ${consumer_media_path} ]; then + cp -r ${ufm_media_original_path} ${consumer_media_path} +fi +# update href base in the index.html of the UFM UI +sed -i "s/ufm_web/ufm_consumer_web/g" ${consumer_media_path}/index.html + +echo "Consumer configuration completed successfully." exit 0 diff --git a/plugins/ufm_consumer_plugin/scripts/init.sh b/plugins/ufm_consumer_plugin/scripts/init.sh index 518dff5c9..ebf3b72a3 100755 --- a/plugins/ufm_consumer_plugin/scripts/init.sh +++ b/plugins/ufm_consumer_plugin/scripts/init.sh @@ -21,9 +21,22 @@ PLUGIN_NAME=ufm_consumer SRC_DIR_PATH=/opt/ufm/ufm_plugin_${PLUGIN_NAME}/${PLUGIN_NAME}_plugin CONFIG_PATH=/config +update_http_apache_port() { + # update the plugin http port in the apache configurations + port=8997 #default port + . 
${CONFIG_PATH}/${PLUGIN_NAME}_httpd_proxy.conf + sed -i "s/@@CONSUMER_REST_PORT@@/${port}/g" ${CONFIG_PATH}/ufm_plugin_${PLUGIN_NAME}_httpd.conf +} + echo /opt/ufm/files/licenses:/opt/ufm/files/licenses > /config/${PLUGIN_NAME}_shared_volumes.conf -cp $SRC_DIR_PATH/conf/${PLUGIN_NAME}_httpd_proxy.conf $SRC_DIR_PATH/conf/${PLUGIN_NAME}_plugin.conf $SRC_DIR_PATH/conf/${PLUGIN_NAME}_ui_conf.json ${CONFIG_PATH} +cp $SRC_DIR_PATH/conf/${PLUGIN_NAME}_httpd_proxy.conf \ + $SRC_DIR_PATH/conf/${PLUGIN_NAME}_plugin.conf \ + $SRC_DIR_PATH/conf/${PLUGIN_NAME}_ui_conf.json \ + $SRC_DIR_PATH/conf/ufm_plugin_${PLUGIN_NAME}_httpd.conf \ + ${CONFIG_PATH} + +update_http_apache_port # UFM version test required_ufm_version=(6 14 0)