Skip to content

Commit

Permalink
Azure Add loadbalancer healthprobe status detector (#527)
Browse files Browse the repository at this point in the history
* Create detector

* Improve backend_unhealthy_host_ratio detector

* Improve backend_unhealthy_host_ratio detector

* Add conf file for healthprobe

* Add notes for LB SKU and healthprobe

* Add note in readme for Healthprobe detector

* typo

* Update modules/integration_azure-load-balancer/conf/01-healthprobe.yaml

Co-authored-by: Spi <BzSpi@users.noreply.github.com>

* Change backend_unhealthy_host_ratio_aggregation_function to max

* update module

* integration_azure-load-balancer: Fix generated code

---------

Co-authored-by: Spi <BzSpi@users.noreply.github.com>
Co-authored-by: Jean-Baptiste Simillon <jb.simillon@fr.clara.net>
Co-authored-by: Laurent Piroelle <laurent.piroelle@fr.clara.net>
  • Loading branch information
4 people authored Dec 13, 2024
1 parent 2376314 commit a9ebcc0
Show file tree
Hide file tree
Showing 7 changed files with 176 additions and 0 deletions.
1 change: 1 addition & 0 deletions docs/severity.md
Original file line number Diff line number Diff line change
Expand Up @@ -592,6 +592,7 @@
|Detector|Critical|Major|Minor|Warning|Info|
|---|---|---|---|---|---|
|Azure Load Balancer heartbeat|X|-|-|-|-|
|Azure Load Balancer backend unhealthy host ratio|X|X|-|-|-|


## integration_azure-mariadb
Expand Down
8 changes: 8 additions & 0 deletions modules/integration_azure-load-balancer/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
- [What are the available detectors in this module?](#what-are-the-available-detectors-in-this-module)
- [How to collect required metrics?](#how-to-collect-required-metrics)
- [Metrics](#metrics)
- [Notes](#notes)
- [About Healthprobe detector](#about-healthprobe-detector)
- [Related documentation](#related-documentation)

<!-- END doctoc generated TOC please keep comment here to allow auto update -->
Expand Down Expand Up @@ -76,6 +78,7 @@ This module creates the following SignalFx detectors which could contain one or
|Detector|Critical|Major|Minor|Warning|Info|
|---|---|---|---|---|---|
|Azure Load Balancer heartbeat|X|-|-|-|-|
|Azure Load Balancer backend unhealthy host ratio|X|X|-|-|-|

## How to collect required metrics?

Expand All @@ -94,9 +97,14 @@ Check the [Related documentation](#related-documentation) section for more detai
Here is the list of required metrics for detectors in this module.

* `ByteCount`
* `DipAvailability`


## Notes

### About Healthprobe detector

The health probe detector (`backend unhealthy host ratio`) is only available for load balancers using the Standard SKU. See the [Azure Load Balancer SKU documentation](https://learn.microsoft.com/en-us/azure/load-balancer/skus).

## Related documentation

Expand Down
21 changes: 21 additions & 0 deletions modules/integration_azure-load-balancer/conf/01-healthprobe.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
---
# Generator input for the "backend unhealthy host ratio" detector of the
# Azure Load Balancer module.
# NOTE(review): the name says "unhealthy host ratio", but both rules fire when
# the signal is *below* a threshold, so the signal presumably reports the
# healthy/available percentage (DipAvailability) -- confirm the naming is intended.
module: "Azure Load Balancer"
name: backend unhealthy host ratio
# Base filter that is AND-ed with the module-level filtering in the generated program.
filtering: "filter('resource_type', 'Microsoft.Network/loadBalancers') and filter('primary_aggregation_type', 'true')"
# Default aggregation appended to the data() stream (one series per backend IP
# and load balancer resource).
aggregation: ".max(by=['BackendIPAddress', 'azure_resource_name', 'azure_resource_group_name', 'azure_region'])"
# Unit shown as value suffix in the detector and in rule descriptions.
value_unit: "%"
# presumably enables the optional *_transformation_function variable -- verify against the generator
transformation: true
signals:
signal:
metric: "DipAvailability"
rules:
# Critical: signal below 50 % for the whole 10 minute window.
critical:
threshold: 50
comparator: "<"
lasting_duration: '10m'
# Major: signal below 100 % for 10 minutes; `dependency: critical` makes the
# generated MAJOR condition exclude the cases already matched by critical.
major:
threshold: 100
comparator: "<"
lasting_duration: '10m'
dependency: critical
...
5 changes: 5 additions & 0 deletions modules/integration_azure-load-balancer/conf/readme.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
documentations:
- name: Azure Monitor metrics
url: 'https://learn.microsoft.com/en-us/azure/azure-monitor/reference/supported-metrics/microsoft-network-loadbalancers-metrics'

notes: |
### About Healthprobe detector
The health probe detector (`backend unhealthy host ratio`) is only available for load balancers using the Standard SKU. See the [Azure Load Balancer SKU documentation](https://learn.microsoft.com/en-us/azure/load-balancer/skus).
46 changes: 46 additions & 0 deletions modules/integration_azure-load-balancer/detectors-gen.tf
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,49 @@ EOF
max_delay = var.heartbeat_max_delay
}

# SignalFx detector built on the Azure `DipAvailability` metric, restricted to
# resources of type Microsoft.Network/loadBalancers. The SignalFlow program
# publishes two detect labels (CRIT and MAJOR); each one is bound to a rule below.
resource "signalfx_detector" "backend_unhealthy_host_ratio" {
name = format("%s %s", local.detector_name_prefix, "Azure Load Balancer backend unhealthy host ratio")

authorized_writer_teams = var.authorized_writer_teams
# Use var.teams when it is non-empty, otherwise fall back to the writer teams;
# coalescelist() errors when every list is empty, which try() turns into null.
teams = try(coalescelist(var.teams, var.authorized_writer_teams), null)
# Merge all tag lists and drop empty entries.
tags = compact(concat(local.common_tags, local.tags, var.extra_tags))

viz_options {
label = "signal"
value_suffix = "%"
}

# SignalFlow program:
# - the aggregation and transformation variables are appended verbatim to the
#   data() stream, so they must be valid SignalFlow method calls (or empty);
# - `lasting=` / `at_least=` are only emitted when the matching
#   *_lasting_duration_* variable is not null;
# - the MAJOR condition is AND-ed with the negation of the CRIT condition, so
#   MAJOR does not fire while CRIT matches.
program_text = <<-EOF
base_filtering = filter('resource_type', 'Microsoft.Network/loadBalancers') and filter('primary_aggregation_type', 'true')
signal = data('DipAvailability', filter=base_filtering and ${module.filtering.signalflow})${var.backend_unhealthy_host_ratio_aggregation_function}${var.backend_unhealthy_host_ratio_transformation_function}.publish('signal')
detect(when(signal < ${var.backend_unhealthy_host_ratio_threshold_critical}%{if var.backend_unhealthy_host_ratio_lasting_duration_critical != null}, lasting='${var.backend_unhealthy_host_ratio_lasting_duration_critical}', at_least=${var.backend_unhealthy_host_ratio_at_least_percentage_critical}%{endif})).publish('CRIT')
detect(when(signal < ${var.backend_unhealthy_host_ratio_threshold_major}%{if var.backend_unhealthy_host_ratio_lasting_duration_major != null}, lasting='${var.backend_unhealthy_host_ratio_lasting_duration_major}', at_least=${var.backend_unhealthy_host_ratio_at_least_percentage_major}%{endif}) and (not when(signal < ${var.backend_unhealthy_host_ratio_threshold_critical}%{if var.backend_unhealthy_host_ratio_lasting_duration_critical != null}, lasting='${var.backend_unhealthy_host_ratio_lasting_duration_critical}', at_least=${var.backend_unhealthy_host_ratio_at_least_percentage_critical}%{endif}))).publish('MAJOR')
EOF

# Critical rule, bound to the 'CRIT' detect label.
rule {
description = "is too low < ${var.backend_unhealthy_host_ratio_threshold_critical}%"
severity = "Critical"
detect_label = "CRIT"
# First non-null wins: severity-level flag, then detector-level, then module-wide.
disabled = coalesce(var.backend_unhealthy_host_ratio_disabled_critical, var.backend_unhealthy_host_ratio_disabled, var.detectors_disabled)
# Detector-specific recipients for this severity when set, else the module-wide
# ones; null when both lists are empty.
notifications = try(coalescelist(lookup(var.backend_unhealthy_host_ratio_notifications, "critical", []), var.notifications.critical), null)
# Detector-specific runbook when set, else the module-wide one, else "".
runbook_url = try(coalesce(var.backend_unhealthy_host_ratio_runbook_url, var.runbook_url), "")
tip = var.backend_unhealthy_host_ratio_tip
# An empty message_subject / message_body selects the module's local defaults.
parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
parameterized_body = var.message_body == "" ? local.rule_body : var.message_body
}

# Major rule, bound to the 'MAJOR' detect label; same fallback logic as above.
rule {
description = "is too low < ${var.backend_unhealthy_host_ratio_threshold_major}%"
severity = "Major"
detect_label = "MAJOR"
disabled = coalesce(var.backend_unhealthy_host_ratio_disabled_major, var.backend_unhealthy_host_ratio_disabled, var.detectors_disabled)
notifications = try(coalescelist(lookup(var.backend_unhealthy_host_ratio_notifications, "major", []), var.notifications.major), null)
runbook_url = try(coalesce(var.backend_unhealthy_host_ratio_runbook_url, var.runbook_url), "")
tip = var.backend_unhealthy_host_ratio_tip
parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
parameterized_body = var.message_body == "" ? local.rule_body : var.message_body
}

max_delay = var.backend_unhealthy_host_ratio_max_delay
}

5 changes: 5 additions & 0 deletions modules/integration_azure-load-balancer/outputs.tf
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# Exposes the complete backend_unhealthy_host_ratio detector resource to callers
# of this module.
output "backend_unhealthy_host_ratio" {
  value       = signalfx_detector.backend_unhealthy_host_ratio
  description = "Detector resource for backend_unhealthy_host_ratio"
}

output "heartbeat" {
description = "Detector resource for heartbeat"
value = signalfx_detector.heartbeat
Expand Down
90 changes: 90 additions & 0 deletions modules/integration_azure-load-balancer/variables-gen.tf
Original file line number Diff line number Diff line change
Expand Up @@ -48,3 +48,93 @@ variable "heartbeat_timeframe" {
default = "25m"
}

# backend_unhealthy_host_ratio detector
#
# Inputs consumed by the signalfx_detector.backend_unhealthy_host_ratio resource.
# Both rules of that detector compare the signal with `<`, i.e. they fire when
# the value drops below the configured threshold.

# Per-severity override; the detector looks up the "critical" and "major" keys
# and falls back to the module-wide notifications when a key is missing or empty.
variable "backend_unhealthy_host_ratio_notifications" {
description = "Notification recipients list per severity overridden for backend_unhealthy_host_ratio detector"
type = map(list(string))
default = {}
}

# Appended verbatim to the data() stream in the detector program, so the value
# must be a SignalFlow method call starting with "." (or an empty string).
variable "backend_unhealthy_host_ratio_aggregation_function" {
description = "Aggregation function and group by for backend_unhealthy_host_ratio detector (i.e. \".mean(by=['host'])\")"
type = string
default = ".max(by=['BackendIPAddress', 'azure_resource_name', 'azure_resource_group_name', 'azure_region'])"
}

# Appended right after the aggregation function; empty by default (no transformation).
variable "backend_unhealthy_host_ratio_transformation_function" {
description = "Transformation function for backend_unhealthy_host_ratio detector (i.e. \".mean(over='5m')\")"
type = string
default = ""
}

variable "backend_unhealthy_host_ratio_max_delay" {
description = "Enforce max delay for backend_unhealthy_host_ratio detector (use \"0\" or \"null\" for \"Auto\")"
type = number
default = null
}

variable "backend_unhealthy_host_ratio_tip" {
description = "Suggested first course of action or any note useful for incident handling"
type = string
default = ""
}

# When left empty the detector falls back to the module-wide runbook_url.
variable "backend_unhealthy_host_ratio_runbook_url" {
description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause"
type = string
default = ""
}

# The three *_disabled* flags default to null so that coalesce() in the detector
# can fall through: severity-level flag, then this detector-level flag, then the
# module-wide detectors_disabled.
variable "backend_unhealthy_host_ratio_disabled" {
description = "Disable all alerting rules for backend_unhealthy_host_ratio detector"
type = bool
default = null
}

variable "backend_unhealthy_host_ratio_disabled_critical" {
description = "Disable critical alerting rule for backend_unhealthy_host_ratio detector"
type = bool
default = null
}

variable "backend_unhealthy_host_ratio_disabled_major" {
description = "Disable major alerting rule for backend_unhealthy_host_ratio detector"
type = bool
default = null
}

# Critical rule fires when signal < this value.
variable "backend_unhealthy_host_ratio_threshold_critical" {
description = "Critical threshold for backend_unhealthy_host_ratio detector in %"
type = number
default = 50
}

# Setting this to null removes both `lasting=` and `at_least=` from the critical
# condition in the generated program.
variable "backend_unhealthy_host_ratio_lasting_duration_critical" {
description = "Minimum duration that conditions must be true before raising alert"
type = string
default = "10m"
}

# Only used when the critical lasting duration is not null.
variable "backend_unhealthy_host_ratio_at_least_percentage_critical" {
description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)"
type = number
default = 1
}
# Major rule fires when signal < this value and the critical condition is not met.
variable "backend_unhealthy_host_ratio_threshold_major" {
description = "Major threshold for backend_unhealthy_host_ratio detector in %"
type = number
default = 100
}

# Setting this to null removes both `lasting=` and `at_least=` from the major
# condition in the generated program.
variable "backend_unhealthy_host_ratio_lasting_duration_major" {
description = "Minimum duration that conditions must be true before raising alert"
type = string
default = "10m"
}

# Only used when the major lasting duration is not null.
variable "backend_unhealthy_host_ratio_at_least_percentage_major" {
description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)"
type = number
default = 1
}

0 comments on commit a9ebcc0

Please sign in to comment.