Skip to content

Commit

Permalink
Merge pull request #12 from ecmwf/hotfix/ssl_fix
Browse files Browse the repository at this point in the history
Hotfix/ssl fix
  • Loading branch information
sametd authored Dec 20, 2023
2 parents 42a4f50 + 9478e9f commit c0d0f2b
Show file tree
Hide file tree
Showing 4 changed files with 115 additions and 8 deletions.
32 changes: 28 additions & 4 deletions aviso-server/monitoring/aviso_monitoring/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def __init__(
aviso_auth_reporter=None,
etcd_reporter=None,
prometheus_reporter=None,
kube_state_metrics=None,
):
try:
# we build the configuration in priority order from the lower to the higher
Expand All @@ -41,6 +42,7 @@ def __init__(
self.aviso_auth_reporter = aviso_auth_reporter
self.etcd_reporter = etcd_reporter
self.prometheus_reporter = prometheus_reporter
self.kube_state_metrics = kube_state_metrics

logger.debug("Loading configuration completed")

Expand Down Expand Up @@ -113,6 +115,8 @@ def _create_default_config() -> Dict:
},
}

kube_state_metrics = {"ssl_enabled": False, "token": None}

# main config
config = {}
config["udp_server"] = udp_server
Expand All @@ -121,6 +125,7 @@ def _create_default_config() -> Dict:
config["aviso_auth_reporter"] = aviso_auth_reporter
config["etcd_reporter"] = etcd_reporter
config["prometheus_reporter"] = prometheus_reporter
config["kube_state_metrics"] = kube_state_metrics
return config

def _read_env_variables(self) -> Dict:
Expand Down Expand Up @@ -179,7 +184,7 @@ def aviso_rest_reporter(self, aviso_rest_reporter):
assert ar is not None, "aviso_rest_reporter has not been configured"
assert ar.get("tlms") is not None, "aviso_rest_reporter tlms has not been configured"
assert ar.get("enabled") is not None, "aviso_rest_reporter enabled has not been configured"
if type(ar["enabled"]) is str:
if isinstance(ar["enabled"], str):
ar["enabled"] = ar["enabled"].casefold() == "true".casefold()
assert ar.get("frequency") is not None, "aviso_rest_reporter frequency has not been configured"
self._aviso_rest_reporter = ar
Expand All @@ -199,7 +204,7 @@ def aviso_auth_reporter(self, aviso_auth_reporter):
assert aa is not None, "aviso_auth_reporter has not been configured"
assert aa.get("tlms") is not None, "aviso_auth_reporter tlms has not been configured"
assert aa.get("enabled") is not None, "aviso_auth_reporter enabled has not been configured"
if type(aa["enabled"]) is str:
if isinstance(aa["enabled"], str):
aa["enabled"] = aa["enabled"].casefold() == "true".casefold()
assert aa.get("frequency") is not None, "aviso_auth_reporter frequency has not been configured"
self._aviso_auth_reporter = aa
Expand All @@ -219,7 +224,7 @@ def etcd_reporter(self, etcd_reporter):
assert e is not None, "etcd_reporter has not been configured"
assert e.get("tlms") is not None, "etcd_reporter tlms has not been configured"
assert e.get("enabled") is not None, "etcd_reporter enabled has not been configured"
if type(e["enabled"]) is str:
if isinstance(e["enabled"], str):
e["enabled"] = e["enabled"].casefold() == "true".casefold()
assert e.get("frequency") is not None, "etcd_reporter frequency has not been configured"
assert e.get("member_urls") is not None, "etcd_reporter member_urls has not been configured"
Expand All @@ -241,11 +246,29 @@ def prometheus_reporter(self, prometheus_reporter):
assert pr is not None, "prometheus_reporter has not been configured"
assert pr.get("host") is not None, "prometheus_reporter host has not been configured"
assert pr.get("enabled") is not None, "prometheus_reporter enabled has not been configured"
if type(pr["enabled"]) is str:
if isinstance(pr["enabled"], str):
pr["enabled"] = pr["enabled"].casefold() == "true".casefold()
assert pr.get("port") is not None, "prometheus_reporter port has not been configured"
self._prometheus_reporter = pr

@property
def kube_state_metrics(self):
return self._kube_state_metrics

@kube_state_metrics.setter
def kube_state_metrics(self, kube_state_metrics):
ksm = self._config.get("kube_state_metrics")
if kube_state_metrics is not None and ksm is not None:
Config.deep_update(ksm, kube_state_metrics)
elif kube_state_metrics is not None:
ksm = kube_state_metrics
# verify is valid
assert ksm is not None, "kube_state_metrics has not been configured"
assert ksm.get("ssl_enabled") is not None, "kube_state_metrics ssl_enabled has not been configured"
if ksm["ssl_enabled"]:
assert ksm.get("token") is not None, "kube_state_metrics token has not been configured"
self._kube_state_metrics = ksm

def __str__(self):
config_string = (
f"udp_server: {self.udp_server}"
Expand All @@ -254,6 +277,7 @@ def __str__(self):
+ f", aviso_auth_reporter: {self.aviso_auth_reporter}"
+ f", etcd_reporter: {self.etcd_reporter}"
+ f", prometheus_reporter: {self.prometheus_reporter}"
+ f", kube_state_metrics: {self.kube_state_metrics}"
)
return config_string

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,12 @@ def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

def metric(self):
pattern = r'kube_deployment_status_replicas{namespace="aviso",deployment="aviso-auth-\w+"}'
namespace = self.get_k8s_pod_namespace()
if not namespace:
logger.warning("Could not determine the pod's namespace.")
namespace = "aviso"

pattern = rf'kube_deployment_status_replicas{{namespace="{namespace}",deployment="aviso-auth"}}'
# defaults
status = 0
message = "All pods available"
Expand Down Expand Up @@ -243,3 +248,30 @@ def metric(self):
m_status = {"name": self.metric_name, "status": 1, "message": "Metric could not be retrieved"}
logger.debug(f"{self.metric_name} metric: {m_status}")
return m_status

@staticmethod
def get_k8s_pod_namespace():
"""
Retrieves the Kubernetes (k8s) namespace in which the current pod is running.
This function reads the namespace name from a file that Kubernetes automatically
mounts inside the pod. This file is typically located at:
'/var/run/secrets/kubernetes.io/serviceaccount/namespace'
Returns:
str: The namespace in which the pod is running. If the namespace cannot be determined
(e.g., the file doesn't exist or the pod is not running in a k8s environment),
the function returns None.
"""
namespace_file = "/var/run/secrets/kubernetes.io/serviceaccount/namespace"
try:
with open(namespace_file, "r") as file:
return file.read().strip()
except FileNotFoundError:
logger.error(f"Namespace file not found: {namespace_file}")
except IOError as e:
logger.error(f"I/O error occurred when reading namespace file: {e}")
except Exception as e:
logger.exception(f"Unexpected error occurred when reading namespace file: {e}")

return None
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ def __init__(self, config, *args, **kwargs):
self.frequency = aviso_rest_config["frequency"]
self.enabled = aviso_rest_config["enabled"]
self.tlms = aviso_rest_config["tlms"]
# configure the metric vars once only here
OpsviewReporter.configure_metric_vars(config)
super().__init__(config, *args, **kwargs)

def process_messages(self):
Expand Down Expand Up @@ -182,7 +184,12 @@ def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

def metric(self):
pattern = r'kube_deployment_status_replicas{namespace="aviso",deployment="aviso-rest-\w+"}'
namespace = self.get_k8s_pod_namespace()
if not namespace:
logger.warning("Could not determine the pod's namespace.")
namespace = "aviso"

pattern = rf'kube_deployment_status_replicas{{namespace="{namespace}",deployment="aviso-rest"}}'
# defaults
status = 0
message = "All pods available"
Expand Down Expand Up @@ -224,3 +231,30 @@ def metric(self):
m_status = {"name": self.metric_name, "status": 1, "message": "Metric could not be retrieved"}
logger.debug(f"{self.metric_name} metric: {m_status}")
return m_status

@staticmethod
def get_k8s_pod_namespace():
"""
Retrieves the Kubernetes (k8s) namespace in which the current pod is running.
This function reads the namespace name from a file that Kubernetes automatically
mounts inside the pod. This file is typically located at:
'/var/run/secrets/kubernetes.io/serviceaccount/namespace'
Returns:
str: The namespace in which the pod is running. If the namespace cannot be determined
(e.g., the file doesn't exist or the pod is not running in a k8s environment),
the function returns None.
"""
namespace_file = "/var/run/secrets/kubernetes.io/serviceaccount/namespace"
try:
with open(namespace_file, "r") as file:
return file.read().strip()
except FileNotFoundError:
logger.error(f"Namespace file not found: {namespace_file}")
except IOError as e:
logger.error(f"I/O error occurred when reading namespace file: {e}")
except Exception as e:
logger.exception(f"Unexpected error occurred when reading namespace file: {e}")

return None
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,25 @@


class OpsviewReporter(ABC):
metric_ssl_enabled = False
metric_token = ""

def __init__(self, config: Config, msg_receiver=None):
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
self.monitor_servers = config.monitor_servers
self.msg_receiver = msg_receiver
self.token = {}

@classmethod
def configure_metric_vars(cls, config):
"""
Configures the class attributes based on the provided config.
"""
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
if config.kube_state_metrics["ssl_enabled"]:
cls.metric_ssl_enabled = True
cls.metric_token = config.kube_state_metrics["token"]

def ms_authenticate(self, m_server):
"""
This method authenticate to the monitoring server
Expand Down Expand Up @@ -199,16 +212,20 @@ def aggregate_unique_counter_tlms(tlms):
}
return agg_tlm

def retrieve_metrics(metric_servers, req_timeout):
@classmethod
def retrieve_metrics(cls, metric_servers, req_timeout):
"""
This methods retrieves the metrics provided by specific metric servers using a Prometheus interface.
"""
raw_tlms = {}
for u in metric_servers:
url = u + "/metrics"
logger.debug(f"Retrieving metrics from {url}...")
headers = {}
try:
resp = requests.get(url, verify=False, timeout=req_timeout)
if cls.metric_ssl_enabled:
headers["Authorization"] = f"Bearer {cls.metric_token}"
resp = requests.get(url, verify=False, timeout=req_timeout, headers=headers)
except Exception as e:
logger.exception(f"Not able to get metrics from {url}, error {e}")
raw_tlms[u] = None
Expand Down

0 comments on commit c0d0f2b

Please sign in to comment.