Azure Add loadbalancer healthprobe status detector (#527)

* Create detector * Improve backend_unhealthy_host_ratio detector * Improve backend_unhealthy_host_ratio detector * Add conf file for healthprobe * Add notes for LB SKU and healthprobe * Add note in readme for Healthprobe detector * typo * Update modules/integration_azure-load-balancer/conf/01-healthprobe.yaml Co-authored-by: Spi <BzSpi@users.noreply.github.com> * Change backend_unhealthy_host_ratio_aggregation_function to max * update module * integration_azure-load-balancer: Fix generated code --------- Co-authored-by: Spi <BzSpi@users.noreply.github.com> Co-authored-by: Jean-Baptiste Simillon <jb.simillon@fr.clara.net> Co-authored-by: Laurent Piroelle <laurent.piroelle@fr.clara.net>
claranet · Dec 13, 2024 · a9ebcc0 · a9ebcc0
1 parent 2376314
commit a9ebcc0
Show file tree

Hide file tree

Showing 7 changed files with 176 additions and 0 deletions.
diff --git a/docs/severity.md b/docs/severity.md
@@ -592,6 +592,7 @@
 |Detector|Critical|Major|Minor|Warning|Info|
 |---|---|---|---|---|---|
 |Azure Load Balancer heartbeat|X|-|-|-|-|
+|Azure Load Balancer backend unhealthy host ratio|X|X|-|-|-|
 
 
 ## integration_azure-mariadb

diff --git a/modules/integration_azure-load-balancer/README.md b/modules/integration_azure-load-balancer/README.md
@@ -8,6 +8,8 @@
 - [What are the available detectors in this module?](#what-are-the-available-detectors-in-this-module)
 - [How to collect required metrics?](#how-to-collect-required-metrics)
   - [Metrics](#metrics)
+- [Notes](#notes)
+  - [About Healthprobe detector](#about-healthprobe-detector)
 - [Related documentation](#related-documentation)
 
 <!-- END doctoc generated TOC please keep comment here to allow auto update -->
@@ -76,6 +78,7 @@ This module creates the following SignalFx detectors which could contain one or
 |Detector|Critical|Major|Minor|Warning|Info|
 |---|---|---|---|---|---|
 |Azure Load Balancer heartbeat|X|-|-|-|-|
+|Azure Load Balancer backend unhealthy host ratio|X|X|-|-|-|
 
 ## How to collect required metrics?
 
@@ -94,9 +97,14 @@ Check the [Related documentation](#related-documentation) section for more detai
 Here is the list of required metrics for detectors in this module.
 
 * `ByteCount`
+* `DipAvailability`
 
 
+## Notes
 
+### About Healthprobe detector
+
+The Healthprobe detector relies on the `DipAvailability` metric and is only available for load balancers with the Standard SKU. See the [documentation](https://learn.microsoft.com/en-us/azure/load-balancer/skus).
 
 ## Related documentation
 

diff --git a/modules/integration_azure-load-balancer/conf/01-healthprobe.yaml b/modules/integration_azure-load-balancer/conf/01-healthprobe.yaml
@@ -0,0 +1,21 @@
+---
+module: "Azure Load Balancer"
+name: backend unhealthy host ratio
+filtering: "filter('resource_type', 'Microsoft.Network/loadBalancers') and filter('primary_aggregation_type', 'true')"
+aggregation: ".max(by=['BackendIPAddress', 'azure_resource_name', 'azure_resource_group_name', 'azure_region'])"
+value_unit: "%"
+transformation: true
+signals:
+  signal:
+    metric: "DipAvailability"
+rules:
+  critical:
+    threshold: 50
+    comparator: "<"
+    lasting_duration: '10m'
+  major:
+    threshold: 100
+    comparator: "<"
+    lasting_duration: '10m'
+    dependency: critical
+...
diff --git a/modules/integration_azure-load-balancer/conf/readme.yaml b/modules/integration_azure-load-balancer/conf/readme.yaml
@@ -1,3 +1,8 @@
 documentations:
   - name: Azure Monitor metrics
     url: 'https://learn.microsoft.com/en-us/azure/azure-monitor/reference/supported-metrics/microsoft-network-loadbalancers-metrics'
+
+notes: |
+  ### About Healthprobe detector
+
+  The Healthprobe detector relies on the `DipAvailability` metric and is only available for load balancers with the Standard SKU. See the [documentation](https://learn.microsoft.com/en-us/azure/load-balancer/skus).
diff --git a/modules/integration_azure-load-balancer/detectors-gen.tf b/modules/integration_azure-load-balancer/detectors-gen.tf
@@ -27,3 +27,49 @@ EOF
   max_delay = var.heartbeat_max_delay
 }
 
+resource "signalfx_detector" "backend_unhealthy_host_ratio" {
+  name = format("%s %s", local.detector_name_prefix, "Azure Load Balancer backend unhealthy host ratio")
+
+  authorized_writer_teams = var.authorized_writer_teams
+  teams                   = try(coalescelist(var.teams, var.authorized_writer_teams), null)
+  tags                    = compact(concat(local.common_tags, local.tags, var.extra_tags))
+
+  viz_options {
+    label        = "signal"
+    value_suffix = "%"
+  }
+
+  program_text = <<-EOF
+    base_filtering = filter('resource_type', 'Microsoft.Network/loadBalancers') and filter('primary_aggregation_type', 'true')
+    signal = data('DipAvailability', filter=base_filtering and ${module.filtering.signalflow})${var.backend_unhealthy_host_ratio_aggregation_function}${var.backend_unhealthy_host_ratio_transformation_function}.publish('signal')
+    detect(when(signal < ${var.backend_unhealthy_host_ratio_threshold_critical}%{if var.backend_unhealthy_host_ratio_lasting_duration_critical != null}, lasting='${var.backend_unhealthy_host_ratio_lasting_duration_critical}', at_least=${var.backend_unhealthy_host_ratio_at_least_percentage_critical}%{endif})).publish('CRIT')
+    detect(when(signal < ${var.backend_unhealthy_host_ratio_threshold_major}%{if var.backend_unhealthy_host_ratio_lasting_duration_major != null}, lasting='${var.backend_unhealthy_host_ratio_lasting_duration_major}', at_least=${var.backend_unhealthy_host_ratio_at_least_percentage_major}%{endif}) and (not when(signal < ${var.backend_unhealthy_host_ratio_threshold_critical}%{if var.backend_unhealthy_host_ratio_lasting_duration_critical != null}, lasting='${var.backend_unhealthy_host_ratio_lasting_duration_critical}', at_least=${var.backend_unhealthy_host_ratio_at_least_percentage_critical}%{endif}))).publish('MAJOR')
+EOF
+
+  rule {
+    description           = "is too low < ${var.backend_unhealthy_host_ratio_threshold_critical}%"
+    severity              = "Critical"
+    detect_label          = "CRIT"
+    disabled              = coalesce(var.backend_unhealthy_host_ratio_disabled_critical, var.backend_unhealthy_host_ratio_disabled, var.detectors_disabled)
+    notifications         = try(coalescelist(lookup(var.backend_unhealthy_host_ratio_notifications, "critical", []), var.notifications.critical), null)
+    runbook_url           = try(coalesce(var.backend_unhealthy_host_ratio_runbook_url, var.runbook_url), "")
+    tip                   = var.backend_unhealthy_host_ratio_tip
+    parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
+    parameterized_body    = var.message_body == "" ? local.rule_body : var.message_body
+  }
+
+  rule {
+    description           = "is too low < ${var.backend_unhealthy_host_ratio_threshold_major}%"
+    severity              = "Major"
+    detect_label          = "MAJOR"
+    disabled              = coalesce(var.backend_unhealthy_host_ratio_disabled_major, var.backend_unhealthy_host_ratio_disabled, var.detectors_disabled)
+    notifications         = try(coalescelist(lookup(var.backend_unhealthy_host_ratio_notifications, "major", []), var.notifications.major), null)
+    runbook_url           = try(coalesce(var.backend_unhealthy_host_ratio_runbook_url, var.runbook_url), "")
+    tip                   = var.backend_unhealthy_host_ratio_tip
+    parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
+    parameterized_body    = var.message_body == "" ? local.rule_body : var.message_body
+  }
+
+  max_delay = var.backend_unhealthy_host_ratio_max_delay
+}
+
diff --git a/modules/integration_azure-load-balancer/outputs.tf b/modules/integration_azure-load-balancer/outputs.tf
@@ -1,3 +1,8 @@
+output "backend_unhealthy_host_ratio" {
+  description = "Detector resource for backend_unhealthy_host_ratio"
+  value       = signalfx_detector.backend_unhealthy_host_ratio
+}
+
 output "heartbeat" {
   description = "Detector resource for heartbeat"
   value       = signalfx_detector.heartbeat

diff --git a/modules/integration_azure-load-balancer/variables-gen.tf b/modules/integration_azure-load-balancer/variables-gen.tf
@@ -48,3 +48,93 @@ variable "heartbeat_timeframe" {
   default     = "25m"
 }
 
+# backend_unhealthy_host_ratio detector
+
+variable "backend_unhealthy_host_ratio_notifications" {
+  description = "Notification recipients list per severity overridden for backend_unhealthy_host_ratio detector"
+  type        = map(list(string))
+  default     = {}
+}
+
+variable "backend_unhealthy_host_ratio_aggregation_function" {
+  description = "Aggregation function and group by for backend_unhealthy_host_ratio detector (i.e. \".mean(by=['host'])\")"
+  type        = string
+  default     = ".max(by=['BackendIPAddress', 'azure_resource_name', 'azure_resource_group_name', 'azure_region'])"
+}
+
+variable "backend_unhealthy_host_ratio_transformation_function" {
+  description = "Transformation function for backend_unhealthy_host_ratio detector (i.e. \".mean(over='5m')\")"
+  type        = string
+  default     = ""
+}
+
+variable "backend_unhealthy_host_ratio_max_delay" {
+  description = "Enforce max delay for backend_unhealthy_host_ratio detector (use \"0\" or \"null\" for \"Auto\")"
+  type        = number
+  default     = null
+}
+
+variable "backend_unhealthy_host_ratio_tip" {
+  description = "Suggested first course of action or any note useful for incident handling"
+  type        = string
+  default     = ""
+}
+
+variable "backend_unhealthy_host_ratio_runbook_url" {
+  description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause"
+  type        = string
+  default     = ""
+}
+
+variable "backend_unhealthy_host_ratio_disabled" {
+  description = "Disable all alerting rules for backend_unhealthy_host_ratio detector"
+  type        = bool
+  default     = null
+}
+
+variable "backend_unhealthy_host_ratio_disabled_critical" {
+  description = "Disable critical alerting rule for backend_unhealthy_host_ratio detector"
+  type        = bool
+  default     = null
+}
+
+variable "backend_unhealthy_host_ratio_disabled_major" {
+  description = "Disable major alerting rule for backend_unhealthy_host_ratio detector"
+  type        = bool
+  default     = null
+}
+
+variable "backend_unhealthy_host_ratio_threshold_critical" {
+  description = "Critical threshold for backend_unhealthy_host_ratio detector in %"
+  type        = number
+  default     = 50
+}
+
+variable "backend_unhealthy_host_ratio_lasting_duration_critical" {
+  description = "Minimum duration that conditions must be true before raising alert"
+  type        = string
+  default     = "10m"
+}
+
+variable "backend_unhealthy_host_ratio_at_least_percentage_critical" {
+  description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)"
+  type        = number
+  default     = 1
+}
+variable "backend_unhealthy_host_ratio_threshold_major" {
+  description = "Major threshold for backend_unhealthy_host_ratio detector in %"
+  type        = number
+  default     = 100
+}
+
+variable "backend_unhealthy_host_ratio_lasting_duration_major" {
+  description = "Minimum duration that conditions must be true before raising alert"
+  type        = string
+  default     = "10m"
+}
+
+variable "backend_unhealthy_host_ratio_at_least_percentage_major" {
+  description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)"
+  type        = number
+  default     = 1
+}