From 37fd36f9881e8c8e91f41661a7bc5f4aaf02e2ce Mon Sep 17 00:00:00 2001 From: sametd Date: Wed, 20 Dec 2023 17:07:45 +0100 Subject: [PATCH 01/11] custom kube_metric support is added to monitoring --- .../monitoring/aviso_monitoring/config.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/aviso-server/monitoring/aviso_monitoring/config.py b/aviso-server/monitoring/aviso_monitoring/config.py index cc9ae9b..4c248c5 100644 --- a/aviso-server/monitoring/aviso_monitoring/config.py +++ b/aviso-server/monitoring/aviso_monitoring/config.py @@ -27,6 +27,7 @@ def __init__( aviso_auth_reporter=None, etcd_reporter=None, prometheus_reporter=None, + kube_state_metrics=None, ): try: # we build the configuration in priority order from the lower to the higher @@ -41,6 +42,7 @@ def __init__( self.aviso_auth_reporter = aviso_auth_reporter self.etcd_reporter = etcd_reporter self.prometheus_reporter = prometheus_reporter + self.kube_state_metrics = kube_state_metrics logger.debug("Loading configuration completed") @@ -113,6 +115,8 @@ def _create_default_config() -> Dict: }, } + kube_state_metrics = {"ssl_enabled": False, "token": None} + # main config config = {} config["udp_server"] = udp_server @@ -121,6 +125,7 @@ def _create_default_config() -> Dict: config["aviso_auth_reporter"] = aviso_auth_reporter config["etcd_reporter"] = etcd_reporter config["prometheus_reporter"] = prometheus_reporter + config["kube_state_metrics"] = kube_state_metrics return config def _read_env_variables(self) -> Dict: @@ -246,6 +251,23 @@ def prometheus_reporter(self, prometheus_reporter): assert pr.get("port") is not None, "prometheus_reporter port has not been configured" self._prometheus_reporter = pr + @property + def kube_state_metrics(self): + return self._kube_state_metrics + + @kube_state_metrics.setter + def kube_state_metrics(self, kube_state_metrics): + ksm = self._config.get("kube_state_metrics") + if kube_state_metrics is not None and ksm is not None: + Config.deep_update(ksm, kube_state_metrics) + elif kube_state_metrics is not None: + ksm = kube_state_metrics + # verify is valid + assert ksm is not None, "kube_state_metrics has not been configured" + assert ksm.get("ssl_enabled") is not None, "kube_state_metrics ssl_enabled has not been configured" + assert ksm.get("token") is not None, "kube_state_metrics token has not been configured" + self._kube_state_metrics = ksm + def __str__(self): config_string = ( f"udp_server: {self.udp_server}" @@ -254,6 +276,7 @@ def __str__(self): + f", aviso_auth_reporter: {self.aviso_auth_reporter}" + f", etcd_reporter: {self.etcd_reporter}" + f", prometheus_reporter: {self.prometheus_reporter}" + + f", kube_state_metrics: {self.kube_state_metrics}" ) return config_string From 03aebe7d27910ba46d1b3da103de22565399e6eb Mon Sep 17 00:00:00 2001 From: sametd Date: Wed, 20 Dec 2023 17:19:45 +0100 Subject: [PATCH 02/11] token should only be checked if ssl is enabled --- aviso-server/monitoring/aviso_monitoring/config.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/aviso-server/monitoring/aviso_monitoring/config.py b/aviso-server/monitoring/aviso_monitoring/config.py index 4c248c5..a6ded4d 100644 --- a/aviso-server/monitoring/aviso_monitoring/config.py +++ b/aviso-server/monitoring/aviso_monitoring/config.py @@ -184,7 +184,7 @@ def aviso_rest_reporter(self, aviso_rest_reporter): assert ar is not None, "aviso_rest_reporter has not been configured" assert ar.get("tlms") is not None, "aviso_rest_reporter tlms has not been configured" assert ar.get("enabled") is not None, "aviso_rest_reporter enabled has not been configured" - if type(ar["enabled"]) is str: + if isinstance(ar["enabled"], str): ar["enabled"] = ar["enabled"].casefold() == "true".casefold() assert ar.get("frequency") is not None, "aviso_rest_reporter frequency has not been configured" self._aviso_rest_reporter = ar @@ -204,7 +204,7 @@ def aviso_auth_reporter(self, aviso_auth_reporter): assert aa is not None, "aviso_auth_reporter has not been configured" assert aa.get("tlms") is not None, "aviso_auth_reporter tlms has not been configured" assert aa.get("enabled") is not None, "aviso_auth_reporter enabled has not been configured" - if type(aa["enabled"]) is str: + if isinstance(aa["enabled"], str): aa["enabled"] = aa["enabled"].casefold() == "true".casefold() assert aa.get("frequency") is not None, "aviso_auth_reporter frequency has not been configured" self._aviso_auth_reporter = aa @@ -224,7 +224,7 @@ def etcd_reporter(self, etcd_reporter): assert e is not None, "etcd_reporter has not been configured" assert e.get("tlms") is not None, "etcd_reporter tlms has not been configured" assert e.get("enabled") is not None, "etcd_reporter enabled has not been configured" - if type(e["enabled"]) is str: + if isinstance(e["enabled"], str): e["enabled"] = e["enabled"].casefold() == "true".casefold() assert e.get("frequency") is not None, "etcd_reporter frequency has not been configured" assert e.get("member_urls") is not None, "etcd_reporter member_urls has not been configured" @@ -246,7 +246,7 @@ def prometheus_reporter(self, prometheus_reporter): assert pr is not None, "prometheus_reporter has not been configured" assert pr.get("host") is not None, "prometheus_reporter host has not been configured" assert pr.get("enabled") is not None, "prometheus_reporter enabled has not been configured" - if type(pr["enabled"]) is str: + if isinstance(pr["enabled"], str): pr["enabled"] = pr["enabled"].casefold() == "true".casefold() assert pr.get("port") is not None, "prometheus_reporter port has not been configured" self._prometheus_reporter = pr @@ -265,7 +265,8 @@ def kube_state_metrics(self, kube_state_metrics): # verify is valid assert ksm is not None, "kube_state_metrics has not been configured" assert ksm.get("ssl_enabled") is not None, "kube_state_metrics ssl_enabled has not been configured" - assert ksm.get("token") is not None, "kube_state_metrics token has not been configured" + if ksm["ssl_enabled"]: + assert ksm.get("token") is not None, "kube_state_metrics token has not been configured" self._kube_state_metrics = ksm def __str__(self): From 7032e597549423f0d0d8ef43d441100516222885 Mon Sep 17 00:00:00 2001 From: sametd Date: Wed, 20 Dec 2023 18:06:12 +0100 Subject: [PATCH 03/11] metric token support added for retrieve metrics function --- .../aviso_monitoring/reporter/opsview_reporter.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/aviso-server/monitoring/aviso_monitoring/reporter/opsview_reporter.py b/aviso-server/monitoring/aviso_monitoring/reporter/opsview_reporter.py index 09ba543..4dd81c9 100644 --- a/aviso-server/monitoring/aviso_monitoring/reporter/opsview_reporter.py +++ b/aviso-server/monitoring/aviso_monitoring/reporter/opsview_reporter.py @@ -24,6 +24,9 @@ def __init__(self, config: Config, msg_receiver=None): self.monitor_servers = config.monitor_servers self.msg_receiver = msg_receiver self.token = {} + if config.kube_state_metrics["ssl_enabled"]: + self.metric_ssl_enabled = True + self.metric_token = config.kube_state_metrics["token"] def ms_authenticate(self, m_server): """ @@ -199,7 +202,7 @@ def aggregate_unique_counter_tlms(tlms): } return agg_tlm - def retrieve_metrics(metric_servers, req_timeout): + def retrieve_metrics(self, metric_servers, req_timeout): """ This methods retrieves the metrics provided by specific metric servers using a Prometheus interface. """ @@ -207,8 +210,11 @@ def retrieve_metrics(metric_servers, req_timeout): for u in metric_servers: url = u + "/metrics" logger.debug(f"Retrieving metrics from {url}...") + headers = {} try: - resp = requests.get(url, verify=False, timeout=req_timeout) + if self.metric_ssl_enabled: + headers["Authorization"] = f"Bearer {self.metric_token}" + resp = requests.get(url, verify=False, timeout=req_timeout, headers=headers) except Exception as e: logger.exception(f"Not able to get metrics from {url}, error {e}") raw_tlms[u] = None From 4089bc243a1938d3dcb24b1dd44d04faba30a592 Mon Sep 17 00:00:00 2001 From: sametd Date: Wed, 20 Dec 2023 18:31:44 +0100 Subject: [PATCH 04/11] opsviewreporter instantiated when necessary to retrieve logs --- .../aviso_monitoring/reporter/aviso_auth_reporter.py | 5 +++-- .../aviso_monitoring/reporter/aviso_rest_reporter.py | 5 +++-- .../monitoring/aviso_monitoring/reporter/etcd_reporter.py | 3 ++- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/aviso-server/monitoring/aviso_monitoring/reporter/aviso_auth_reporter.py b/aviso-server/monitoring/aviso_monitoring/reporter/aviso_auth_reporter.py index 7a118f8..8d50cd9 100644 --- a/aviso-server/monitoring/aviso_monitoring/reporter/aviso_auth_reporter.py +++ b/aviso-server/monitoring/aviso_monitoring/reporter/aviso_auth_reporter.py @@ -198,6 +198,7 @@ def __init__(self, *args, **kwargs): self.critical_t = kwargs["critical_t"] self.req_timeout = kwargs["req_timeout"] self.metric_server_url = kwargs["metric_server_url"] + self.opsview_reporter = OpsviewReporter() super().__init__(*args, **kwargs) def metric(self): @@ -209,13 +210,13 @@ def metric(self): # fetch the cluster metrics if self.metric_server_url: - metrics = OpsviewReporter.retrieve_metrics([self.metric_server_url], self.req_timeout)[ + metrics = self.opsview_reporter.retrieve_metrics([self.metric_server_url], self.req_timeout)[ self.metric_server_url ] if metrics: logger.debug(f"Processing tlm {self.metric_name}...") - av_pod = OpsviewReporter.read_from_metrics(metrics, pattern) + av_pod = self.opsview_reporter.read_from_metrics(metrics, pattern) if av_pod: av_pod = int(av_pod) if av_pod <= self.critical_t: diff --git a/aviso-server/monitoring/aviso_monitoring/reporter/aviso_rest_reporter.py b/aviso-server/monitoring/aviso_monitoring/reporter/aviso_rest_reporter.py index da520a8..cfa1e66 100644 --- a/aviso-server/monitoring/aviso_monitoring/reporter/aviso_rest_reporter.py +++ b/aviso-server/monitoring/aviso_monitoring/reporter/aviso_rest_reporter.py @@ -179,6 +179,7 @@ def __init__(self, *args, **kwargs): self.critical_t = kwargs["critical_t"] self.req_timeout = kwargs["req_timeout"] self.metric_server_url = kwargs["metric_server_url"] + self.opsview_reporter = OpsviewReporter() super().__init__(*args, **kwargs) def metric(self): @@ -190,13 +191,13 @@ def metric(self): # fetch the cluster metrics if self.metric_server_url: - metrics = OpsviewReporter.retrieve_metrics([self.metric_server_url], self.req_timeout)[ + metrics = self.opsview_reporter.retrieve_metrics([self.metric_server_url], self.req_timeout)[ self.metric_server_url ] if metrics: logger.debug(f"Processing tlm {self.metric_name}...") - av_pod = OpsviewReporter.read_from_metrics(metrics, pattern) + av_pod = self.opsview_reporter.read_from_metrics(metrics, pattern) if av_pod: av_pod = int(av_pod) if av_pod <= self.critical_t: diff --git a/aviso-server/monitoring/aviso_monitoring/reporter/etcd_reporter.py b/aviso-server/monitoring/aviso_monitoring/reporter/etcd_reporter.py index 6d6900b..bb705c7 100644 --- a/aviso-server/monitoring/aviso_monitoring/reporter/etcd_reporter.py +++ b/aviso-server/monitoring/aviso_monitoring/reporter/etcd_reporter.py @@ -23,6 +23,7 @@ def __init__(self, config, *args, **kwargs): self.req_timeout = self.etcd_config["req_timeout"] self.member_urls = self.etcd_config["member_urls"] self.tlms = self.etcd_config["tlms"] + self.opsview_reporter = OpsviewReporter() super().__init__(config, *args, **kwargs) def process_messages(self): @@ -33,7 +34,7 @@ def process_messages(self): logger.debug("Etcd processing metrics...") # fetch the raw tlms provided by etcd - raw_tlms = OpsviewReporter.retrieve_metrics(self.member_urls, self.req_timeout) # noqa: F841 + raw_tlms = self.opsview_reporter.retrieve_metrics(self.member_urls, self.req_timeout) # noqa: F841 # array of metric to return metrics = [] From 637b5d49b3004c5d773515f2c1b03c5436e29156 Mon Sep 17 00:00:00 2001 From: sametd Date: Wed, 20 Dec 2023 18:40:15 +0100 Subject: [PATCH 05/11] Revert "opsviewreporter instantiated when necessary to retrieve logs" This reverts commit 4089bc243a1938d3dcb24b1dd44d04faba30a592. --- .../aviso_monitoring/reporter/aviso_auth_reporter.py | 5 ++--- .../aviso_monitoring/reporter/aviso_rest_reporter.py | 5 ++--- .../monitoring/aviso_monitoring/reporter/etcd_reporter.py | 3 +-- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/aviso-server/monitoring/aviso_monitoring/reporter/aviso_auth_reporter.py b/aviso-server/monitoring/aviso_monitoring/reporter/aviso_auth_reporter.py index 8d50cd9..7a118f8 100644 --- a/aviso-server/monitoring/aviso_monitoring/reporter/aviso_auth_reporter.py +++ b/aviso-server/monitoring/aviso_monitoring/reporter/aviso_auth_reporter.py @@ -198,7 +198,6 @@ def __init__(self, *args, **kwargs): self.critical_t = kwargs["critical_t"] self.req_timeout = kwargs["req_timeout"] self.metric_server_url = kwargs["metric_server_url"] - self.opsview_reporter = OpsviewReporter() super().__init__(*args, **kwargs) def metric(self): @@ -210,13 +209,13 @@ def metric(self): # fetch the cluster metrics if self.metric_server_url: - metrics = self.opsview_reporter.retrieve_metrics([self.metric_server_url], self.req_timeout)[ + metrics = OpsviewReporter.retrieve_metrics([self.metric_server_url], self.req_timeout)[ self.metric_server_url ] if metrics: logger.debug(f"Processing tlm {self.metric_name}...") - av_pod = self.opsview_reporter.read_from_metrics(metrics, pattern) + av_pod = OpsviewReporter.read_from_metrics(metrics, pattern) if av_pod: av_pod = int(av_pod) if av_pod <= self.critical_t: diff --git a/aviso-server/monitoring/aviso_monitoring/reporter/aviso_rest_reporter.py b/aviso-server/monitoring/aviso_monitoring/reporter/aviso_rest_reporter.py index cfa1e66..da520a8 100644 --- a/aviso-server/monitoring/aviso_monitoring/reporter/aviso_rest_reporter.py +++ b/aviso-server/monitoring/aviso_monitoring/reporter/aviso_rest_reporter.py @@ -179,7 +179,6 @@ def __init__(self, *args, **kwargs): self.critical_t = kwargs["critical_t"] self.req_timeout = kwargs["req_timeout"] self.metric_server_url = kwargs["metric_server_url"] - self.opsview_reporter = OpsviewReporter() super().__init__(*args, **kwargs) def metric(self): @@ -191,13 +190,13 @@ def metric(self): # fetch the cluster metrics if self.metric_server_url: - metrics = self.opsview_reporter.retrieve_metrics([self.metric_server_url], self.req_timeout)[ + metrics = OpsviewReporter.retrieve_metrics([self.metric_server_url], self.req_timeout)[ self.metric_server_url ] if metrics: logger.debug(f"Processing tlm {self.metric_name}...") - av_pod = self.opsview_reporter.read_from_metrics(metrics, pattern) + av_pod = OpsviewReporter.read_from_metrics(metrics, pattern) if av_pod: av_pod = int(av_pod) if av_pod <= self.critical_t: diff --git a/aviso-server/monitoring/aviso_monitoring/reporter/etcd_reporter.py b/aviso-server/monitoring/aviso_monitoring/reporter/etcd_reporter.py index bb705c7..6d6900b 100644 --- a/aviso-server/monitoring/aviso_monitoring/reporter/etcd_reporter.py +++ b/aviso-server/monitoring/aviso_monitoring/reporter/etcd_reporter.py @@ -23,7 +23,6 @@ def __init__(self, config, *args, **kwargs): self.req_timeout = self.etcd_config["req_timeout"] self.member_urls = self.etcd_config["member_urls"] self.tlms = self.etcd_config["tlms"] - self.opsview_reporter = OpsviewReporter() super().__init__(config, *args, **kwargs) def process_messages(self): @@ -34,7 +33,7 @@ def process_messages(self): logger.debug("Etcd processing metrics...") # fetch the raw tlms provided by etcd - raw_tlms = self.opsview_reporter.retrieve_metrics(self.member_urls, self.req_timeout) # noqa: F841 + raw_tlms = OpsviewReporter.retrieve_metrics(self.member_urls, self.req_timeout) # noqa: F841 # array of metric to return metrics = [] From 3a202f7b52fcde0847c0e628a9f73799e9fb02c6 Mon Sep 17 00:00:00 2001 From: sametd Date: Wed, 20 Dec 2023 19:43:26 +0100 Subject: [PATCH 06/11] metric token support added for retrieve metrics function --- .../aviso_monitoring/reporter/opsview_reporter.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/aviso-server/monitoring/aviso_monitoring/reporter/opsview_reporter.py b/aviso-server/monitoring/aviso_monitoring/reporter/opsview_reporter.py index 4dd81c9..14d7be6 100644 --- a/aviso-server/monitoring/aviso_monitoring/reporter/opsview_reporter.py +++ b/aviso-server/monitoring/aviso_monitoring/reporter/opsview_reporter.py @@ -202,7 +202,8 @@ def aggregate_unique_counter_tlms(tlms): } return agg_tlm - def retrieve_metrics(self, metric_servers, req_timeout): + @classmethod + def retrieve_metrics(cls, metric_servers, req_timeout): """ This methods retrieves the metrics provided by specific metric servers using a Prometheus interface. """ @@ -212,8 +213,8 @@ def retrieve_metrics(self, metric_servers, req_timeout): logger.debug(f"Retrieving metrics from {url}...") headers = {} try: - if self.metric_ssl_enabled: - headers["Authorization"] = f"Bearer {self.metric_token}" + if cls.metric_ssl_enabled: + headers["Authorization"] = f"Bearer {cls.metric_token}" resp = requests.get(url, verify=False, timeout=req_timeout, headers=headers) except Exception as e: logger.exception(f"Not able to get metrics from {url}, error {e}") From 1026913a6e19a58ad22564af545b3c6fdaf4ca2a Mon Sep 17 00:00:00 2001 From: sametd Date: Wed, 20 Dec 2023 20:02:49 +0100 Subject: [PATCH 07/11] mitigate metric variable installation --- .../reporter/aviso_rest_reporter.py | 2 ++ .../aviso_monitoring/reporter/opsview_reporter.py | 14 ++++++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/aviso-server/monitoring/aviso_monitoring/reporter/aviso_rest_reporter.py b/aviso-server/monitoring/aviso_monitoring/reporter/aviso_rest_reporter.py index da520a8..b0ec55b 100644 --- a/aviso-server/monitoring/aviso_monitoring/reporter/aviso_rest_reporter.py +++ b/aviso-server/monitoring/aviso_monitoring/reporter/aviso_rest_reporter.py @@ -18,6 +18,8 @@ def __init__(self, config, *args, **kwargs): self.frequency = aviso_rest_config["frequency"] self.enabled = aviso_rest_config["enabled"] self.tlms = aviso_rest_config["tlms"] + #configure the metric vars once only here + OpsviewReporter.configure_metric_vars(config) super().__init__(config, *args, **kwargs) def process_messages(self): diff --git a/aviso-server/monitoring/aviso_monitoring/reporter/opsview_reporter.py b/aviso-server/monitoring/aviso_monitoring/reporter/opsview_reporter.py index 14d7be6..1dc3b6a 100644 --- a/aviso-server/monitoring/aviso_monitoring/reporter/opsview_reporter.py +++ b/aviso-server/monitoring/aviso_monitoring/reporter/opsview_reporter.py @@ -19,14 +19,24 @@ class OpsviewReporter(ABC): + metric_ssl_enabled = False + metric_token = "" + def __init__(self, config: Config, msg_receiver=None): urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) self.monitor_servers = config.monitor_servers self.msg_receiver = msg_receiver self.token = {} + + @classmethod + def configure_metric_vars(cls, config): + """ + Configures the class attributes based on the provided config. + """ + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) if config.kube_state_metrics["ssl_enabled"]: - self.metric_ssl_enabled = True - self.metric_token = config.kube_state_metrics["token"] + cls.metric_ssl_enabled = True + cls.metric_token = config.kube_state_metrics["token"] def ms_authenticate(self, m_server): """ From 964d4c05ddb2a991da0e02db9587064dcc83d529 Mon Sep 17 00:00:00 2001 From: sametd Date: Wed, 20 Dec 2023 22:55:31 +0100 Subject: [PATCH 08/11] namespace is now taken from the pod instead of a fixed name --- .../reporter/aviso_rest_reporter.py | 38 ++++++++++++++++++- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/aviso-server/monitoring/aviso_monitoring/reporter/aviso_rest_reporter.py b/aviso-server/monitoring/aviso_monitoring/reporter/aviso_rest_reporter.py index b0ec55b..4b6752b 100644 --- a/aviso-server/monitoring/aviso_monitoring/reporter/aviso_rest_reporter.py +++ b/aviso-server/monitoring/aviso_monitoring/reporter/aviso_rest_reporter.py @@ -18,7 +18,7 @@ def __init__(self, config, *args, **kwargs): self.frequency = aviso_rest_config["frequency"] self.enabled = aviso_rest_config["enabled"] self.tlms = aviso_rest_config["tlms"] - #configure the metric vars once only here + # configure the metric vars once only here OpsviewReporter.configure_metric_vars(config) super().__init__(config, *args, **kwargs) @@ -184,7 +184,14 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) def metric(self): - pattern = r'kube_deployment_status_replicas{namespace="aviso",deployment="aviso-rest-\w+"}' + namespace = self.get_k8s_pod_namespace() + if namespace: + logger.info(f"The pod is running in the '{namespace}' namespace.") + else: + logger.warning("Could not determine the pod's namespace.") + namespace = "aviso" + + pattern = rf'kube_deployment_status_replicas{{namespace="{namespace}",deployment="aviso-rest-\w+"}}' # defaults status = 0 message = "All pods available" @@ -226,3 +233,30 @@ def metric(self): m_status = {"name": self.metric_name, "status": 1, "message": "Metric could not be retrieved"} logger.debug(f"{self.metric_name} metric: {m_status}") return m_status + + @staticmethod + def get_k8s_pod_namespace(): + """ + Retrieves the Kubernetes (k8s) namespace in which the current pod is running. + + This function reads the namespace name from a file that Kubernetes automatically + mounts inside the pod. This file is typically located at: + '/var/run/secrets/kubernetes.io/serviceaccount/namespace' + + Returns: + str: The namespace in which the pod is running. If the namespace cannot be determined + (e.g., the file doesn't exist or the pod is not running in a k8s environment), + the function returns None. + """ + namespace_file = "/var/run/secrets/kubernetes.io/serviceaccount/namespace" + try: + with open(namespace_file, "r") as file: + return file.read().strip() + except FileNotFoundError: + logger.error(f"Namespace file not found: {namespace_file}") + except IOError as e: + logger.error(f"I/O error occurred when reading namespace file: {e}") + except Exception as e: + logger.exception(f"Unexpected error occurred when reading namespace file: {e}") + + return None From 884fd4f9aa9c069e4c318ebc343f72af3b249f40 Mon Sep 17 00:00:00 2001 From: sametd Date: Wed, 20 Dec 2023 23:03:23 +0100 Subject: [PATCH 09/11] namespace is now taken from the pod instead of a fixed name --- .../reporter/aviso_auth_reporter.py | 36 ++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/aviso-server/monitoring/aviso_monitoring/reporter/aviso_auth_reporter.py b/aviso-server/monitoring/aviso_monitoring/reporter/aviso_auth_reporter.py index 7a118f8..2a2fc97 100644 --- a/aviso-server/monitoring/aviso_monitoring/reporter/aviso_auth_reporter.py +++ b/aviso-server/monitoring/aviso_monitoring/reporter/aviso_auth_reporter.py @@ -201,7 +201,14 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) def metric(self): - pattern = r'kube_deployment_status_replicas{namespace="aviso",deployment="aviso-auth-\w+"}' + namespace = self.get_k8s_pod_namespace() + if namespace: + logger.info(f"The pod is running in the '{namespace}' namespace.") + else: + logger.warning("Could not determine the pod's namespace.") + namespace = "aviso" + + pattern = rf'kube_deployment_status_replicas{{namespace="{namespace}",deployment="aviso-auth-\w+"}}' # defaults status = 0 message = "All pods available" @@ -243,3 +250,30 @@ def metric(self): m_status = {"name": self.metric_name, "status": 1, "message": "Metric could not be retrieved"} logger.debug(f"{self.metric_name} metric: {m_status}") return m_status + + @staticmethod + def get_k8s_pod_namespace(): + """ + Retrieves the Kubernetes (k8s) namespace in which the current pod is running. + + This function reads the namespace name from a file that Kubernetes automatically + mounts inside the pod. This file is typically located at: + '/var/run/secrets/kubernetes.io/serviceaccount/namespace' + + Returns: + str: The namespace in which the pod is running. If the namespace cannot be determined + (e.g., the file doesn't exist or the pod is not running in a k8s environment), + the function returns None. + """ + namespace_file = "/var/run/secrets/kubernetes.io/serviceaccount/namespace" + try: + with open(namespace_file, "r") as file: + return file.read().strip() + except FileNotFoundError: + logger.error(f"Namespace file not found: {namespace_file}") + except IOError as e: + logger.error(f"I/O error occurred when reading namespace file: {e}") + except Exception as e: + logger.exception(f"Unexpected error occurred when reading namespace file: {e}") + + return None From 7675c3c9eafb32f008d40c8812ba99517094662b Mon Sep 17 00:00:00 2001 From: sametd Date: Wed, 20 Dec 2023 23:28:27 +0100 Subject: [PATCH 10/11] name correction for the metric --- .../monitoring/aviso_monitoring/reporter/aviso_auth_reporter.py | 2 +- .../monitoring/aviso_monitoring/reporter/aviso_rest_reporter.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/aviso-server/monitoring/aviso_monitoring/reporter/aviso_auth_reporter.py b/aviso-server/monitoring/aviso_monitoring/reporter/aviso_auth_reporter.py index 2a2fc97..c34022b 100644 --- a/aviso-server/monitoring/aviso_monitoring/reporter/aviso_auth_reporter.py +++ b/aviso-server/monitoring/aviso_monitoring/reporter/aviso_auth_reporter.py @@ -208,7 +208,7 @@ def metric(self): logger.warning("Could not determine the pod's namespace.") namespace = "aviso" - pattern = rf'kube_deployment_status_replicas{{namespace="{namespace}",deployment="aviso-auth-\w+"}}' + pattern = rf'kube_deployment_status_replicas{{namespace="{namespace}",deployment="aviso-auth"}}' # defaults status = 0 message = "All pods available" diff --git a/aviso-server/monitoring/aviso_monitoring/reporter/aviso_rest_reporter.py b/aviso-server/monitoring/aviso_monitoring/reporter/aviso_rest_reporter.py index 4b6752b..91f2f45 100644 --- a/aviso-server/monitoring/aviso_monitoring/reporter/aviso_rest_reporter.py +++ b/aviso-server/monitoring/aviso_monitoring/reporter/aviso_rest_reporter.py @@ -191,7 +191,7 @@ def metric(self): logger.warning("Could not determine the pod's namespace.") namespace = "aviso" - pattern = rf'kube_deployment_status_replicas{{namespace="{namespace}",deployment="aviso-rest-\w+"}}' + pattern = rf'kube_deployment_status_replicas{{namespace="{namespace}",deployment="aviso-rest"}}' # defaults status = 0 message = "All pods available" From 9478e9ff63f4814952ef1925ed97d9a41d4e0227 Mon Sep 17 00:00:00 2001 From: sametd Date: Wed, 20 Dec 2023 23:34:29 +0100 Subject: [PATCH 11/11] unnecessary info log is removed, otherwise same log would be written over and over --- .../aviso_monitoring/reporter/aviso_auth_reporter.py | 4 +--- .../aviso_monitoring/reporter/aviso_rest_reporter.py | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/aviso-server/monitoring/aviso_monitoring/reporter/aviso_auth_reporter.py b/aviso-server/monitoring/aviso_monitoring/reporter/aviso_auth_reporter.py index c34022b..6526930 100644 --- a/aviso-server/monitoring/aviso_monitoring/reporter/aviso_auth_reporter.py +++ b/aviso-server/monitoring/aviso_monitoring/reporter/aviso_auth_reporter.py @@ -202,9 +202,7 @@ def __init__(self, *args, **kwargs): def metric(self): namespace = self.get_k8s_pod_namespace() - if namespace: - logger.info(f"The pod is running in the '{namespace}' namespace.") - else: + if not namespace: logger.warning("Could not determine the pod's namespace.") namespace = "aviso" diff --git a/aviso-server/monitoring/aviso_monitoring/reporter/aviso_rest_reporter.py b/aviso-server/monitoring/aviso_monitoring/reporter/aviso_rest_reporter.py index 91f2f45..384f59d 100644 --- a/aviso-server/monitoring/aviso_monitoring/reporter/aviso_rest_reporter.py +++ b/aviso-server/monitoring/aviso_monitoring/reporter/aviso_rest_reporter.py @@ -185,9 +185,7 @@ def __init__(self, *args, **kwargs): def metric(self): namespace = self.get_k8s_pod_namespace() - if namespace: - logger.info(f"The pod is running in the '{namespace}' namespace.") - else: + if not namespace: logger.warning("Could not determine the pod's namespace.") namespace = "aviso"