Skip to content

Commit

Permalink
AZ-1269: New detector for VPN Tunnels' FAME Query (#534)
Browse files Browse the repository at this point in the history
* AZ-1269: New detector for VPN Tunnels' FAME Query

* AZ-1269: Reflect changes done on the query in the RUN module

* Update modules/fame_azure-vpn/conf/02-tunnel-status.yaml

Co-authored-by: Spi <BzSpi@users.noreply.github.com>

* AZ-1269: Add duration into fill function

* AZ-1269: Remove transformation; it is no longer needed

* Disable total flow count detector - replaced by tunnel status

* FAME VPN: add dependency

---------

Co-authored-by: Spi <BzSpi@users.noreply.github.com>
Co-authored-by: Gauthier AMPE <gauthier.ampe@fr.clara.net>
Co-authored-by: Jean-Baptiste Simillon <jb.simillon@fr.clara.net>
  • Loading branch information
4 people authored Dec 13, 2024
1 parent a3bece2 commit 2376314
Show file tree
Hide file tree
Showing 8 changed files with 164 additions and 3 deletions.
1 change: 1 addition & 0 deletions docs/severity.md
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,7 @@
|---|---|---|---|---|---|
|Azure VPN heartbeat|X|-|-|-|-|
|Azure VPN total flow count|X|-|-|-|-|
|Azure VPN ipsec tunnel status|X|X|-|-|-|


## integration_aws-alb
Expand Down
2 changes: 2 additions & 0 deletions modules/fame_azure-vpn/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ This module creates the following SignalFx detectors which could contain one or
|---|---|---|---|---|---|
|Azure VPN heartbeat|X|-|-|-|-|
|Azure VPN total flow count|X|-|-|-|-|
|Azure VPN ipsec tunnel status|X|X|-|-|-|

## How to collect required metrics?

Expand All @@ -97,6 +98,7 @@ Check the [Related documentation](#related-documentation) section for more detai
Here is the list of required metrics for detectors in this module.

* `fame.azure.virtual_network_gateway.total_flow_count`
* `fame.azure.virtual_network_gateway.tunnel_status`



Expand Down
2 changes: 1 addition & 1 deletion modules/fame_azure-vpn/conf/00-heartbeat.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ aggregation: true

signals:
signal:
metric: fame.azure.virtual_network_gateway.total_flow_count
metric: fame.azure.virtual_network_gateway.tunnel_status

rules:
critical:
2 changes: 2 additions & 0 deletions modules/fame_azure-vpn/conf/01-total-flow-count.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ signals:
signal:
metric: fame.azure.virtual_network_gateway.total_flow_count

disabled: true

rules:
critical:
threshold: 0
Expand Down
21 changes: 21 additions & 0 deletions modules/fame_azure-vpn/conf/02-tunnel-status.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Detector definition for the IPSEC tunnel status of Azure VPN gateways.
# NOTE(review): this file appears to be the input from which the
# `tunnel_status` resource in detectors-gen.tf and the `tunnel_status_*`
# variables in variables-gen.tf are generated - confirm against the generator.
module: Azure VPN
name: IPSEC Tunnel Status
id: tunnel_status

# NOTE(review): presumably exposes a transformation-function variable whose
# default is empty (variables-gen.tf declares it with default "") - confirm.
transformation: true
# One time series per gateway (resource group + resource name) and remote peer IP.
aggregation: ".mean(by=['azure_resource_group', 'azure_resource_name', 'remote_ip'])"

signals:
signal:
metric: fame.azure.virtual_network_gateway.tunnel_status

# Both rules trigger on the same condition (signal == 0); they differ only in
# how long the condition must hold: 10 minutes for major, 20 minutes for critical.
rules:
critical:
threshold: 0
comparator: "=="
lasting_duration: '20m'
major:
threshold: 0
comparator: "=="
lasting_duration: '10m'
# In the generated detector the major condition is combined with the negation
# of the critical condition (see the MAJOR detect() in detectors-gen.tf).
dependency: critical
42 changes: 41 additions & 1 deletion modules/fame_azure-vpn/detectors-gen.tf
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ resource "signalfx_detector" "heartbeat" {

program_text = <<-EOF
from signalfx.detectors.not_reporting import not_reporting
signal = data('fame.azure.virtual_network_gateway.total_flow_count', filter=${module.filtering.signalflow})${var.heartbeat_aggregation_function}${var.heartbeat_transformation_function}.publish('signal')
signal = data('fame.azure.virtual_network_gateway.tunnel_status', filter=${module.filtering.signalflow})${var.heartbeat_aggregation_function}${var.heartbeat_transformation_function}.publish('signal')
not_reporting.detector(stream=signal, resource_identifier=None, duration='${var.heartbeat_timeframe}', auto_resolve_after='${local.heartbeat_auto_resolve_after}').publish('CRIT')
EOF

Expand Down Expand Up @@ -53,3 +53,43 @@ EOF
max_delay = var.totalflowcount_max_delay
}

# Detector on the FAME `tunnel_status` metric of Azure virtual network gateways.
# It publishes two alert rules (CRIT and MAJOR); thresholds, durations,
# notifications and disabling flags all come from the `tunnel_status_*` variables.
resource "signalfx_detector" "tunnel_status" {
name = format("%s %s", local.detector_name_prefix, "Azure VPN ipsec tunnel status")

authorized_writer_teams = var.authorized_writer_teams
# Use var.teams when non-empty, otherwise the writer teams; try() yields null
# when coalescelist() fails (e.g. every list is empty).
teams = try(coalescelist(var.teams, var.authorized_writer_teams), null)
tags = compact(concat(local.common_tags, local.tags, var.extra_tags))

# SignalFlow program (no comments inside the heredoc: it is sent as-is):
#   * signal: the tunnel_status metric, filtered by the module filtering policy,
#     then aggregated and transformed with the configurable function suffixes.
#   * CRIT:   signal == critical threshold; `lasting` and `at_least` are only
#     emitted when the critical lasting duration is not null.
#   * MAJOR:  signal == major threshold (same optional lasting/at_least) AND NOT
#     the critical condition, i.e. the critical condition is repeated and negated.
program_text = <<-EOF
signal = data('fame.azure.virtual_network_gateway.tunnel_status', filter=${module.filtering.signalflow})${var.tunnel_status_aggregation_function}${var.tunnel_status_transformation_function}.publish('signal')
detect(when(signal == ${var.tunnel_status_threshold_critical}%{if var.tunnel_status_lasting_duration_critical != null}, lasting='${var.tunnel_status_lasting_duration_critical}', at_least=${var.tunnel_status_at_least_percentage_critical}%{endif})).publish('CRIT')
detect(when(signal == ${var.tunnel_status_threshold_major}%{if var.tunnel_status_lasting_duration_major != null}, lasting='${var.tunnel_status_lasting_duration_major}', at_least=${var.tunnel_status_at_least_percentage_major}%{endif}) and (not when(signal == ${var.tunnel_status_threshold_critical}%{if var.tunnel_status_lasting_duration_critical != null}, lasting='${var.tunnel_status_lasting_duration_critical}', at_least=${var.tunnel_status_at_least_percentage_critical}%{endif}))).publish('MAJOR')
EOF

# Critical rule, bound to the 'CRIT' detect label published above.
rule {
description = "is == ${var.tunnel_status_threshold_critical}"
severity = "Critical"
detect_label = "CRIT"
# Most specific non-null flag wins: rule-level, then detector-level, then module-wide.
disabled = coalesce(var.tunnel_status_disabled_critical, var.tunnel_status_disabled, var.detectors_disabled)
# Detector-specific recipients for this severity override the module-wide ones;
# null when neither provides any recipient.
notifications = try(coalescelist(lookup(var.tunnel_status_notifications, "critical", []), var.notifications.critical), null)
# Detector-specific runbook first, then the module-wide one, else an empty string.
runbook_url = try(coalesce(var.tunnel_status_runbook_url, var.runbook_url), "")
tip = var.tunnel_status_tip
# An empty message variable means "use the module default subject/body".
parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
parameterized_body = var.message_body == "" ? local.rule_body : var.message_body
}

# Major rule, bound to the 'MAJOR' detect label published above; same wiring
# as the critical rule but with the major-specific variables.
rule {
description = "is == ${var.tunnel_status_threshold_major}"
severity = "Major"
detect_label = "MAJOR"
disabled = coalesce(var.tunnel_status_disabled_major, var.tunnel_status_disabled, var.detectors_disabled)
notifications = try(coalescelist(lookup(var.tunnel_status_notifications, "major", []), var.notifications.major), null)
runbook_url = try(coalesce(var.tunnel_status_runbook_url, var.runbook_url), "")
tip = var.tunnel_status_tip
parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
parameterized_body = var.message_body == "" ? local.rule_body : var.message_body
}

max_delay = var.tunnel_status_max_delay
}

5 changes: 5 additions & 0 deletions modules/fame_azure-vpn/outputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,8 @@ output "totalflowcount" {
value = signalfx_detector.totalflowcount
}

# Exposes the whole `signalfx_detector.tunnel_status` resource object to callers
# of this module.
output "tunnel_status" {
description = "Detector resource for tunnel_status"
value = signalfx_detector.tunnel_status
}

92 changes: 91 additions & 1 deletion modules/fame_azure-vpn/variables-gen.tf
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ variable "totalflowcount_runbook_url" {
variable "totalflowcount_disabled" {
description = "Disable all alerting rules for totalflowcount detector"
type = bool
default = null
default = true
}

variable "totalflowcount_threshold_critical" {
Expand All @@ -109,3 +109,93 @@ variable "totalflowcount_at_least_percentage_critical" {
type = number
default = 1
}
# tunnel_status detector
#
# Inputs consumed by the `signalfx_detector.tunnel_status` resource.

# Looked up by severity key ("critical" / "major") in the detector rules; an
# absent key falls back to the module-wide notifications for that severity.
variable "tunnel_status_notifications" {
description = "Notification recipients list per severity overridden for tunnel_status detector"
type = map(list(string))
default = {}
}

# Appended verbatim to the data(...) call of the detector's SignalFlow program.
variable "tunnel_status_aggregation_function" {
description = "Aggregation function and group by for tunnel_status detector (i.e. \".mean(by=['host'])\")"
type = string
default = ".mean(by=['azure_resource_group', 'azure_resource_name', 'remote_ip'])"
}

# Appended verbatim right after the aggregation function; empty by default,
# so no transformation is applied unless the caller sets one.
variable "tunnel_status_transformation_function" {
description = "Transformation function for tunnel_status detector (i.e. \".mean(over='5m')\")"
type = string
default = ""
}

variable "tunnel_status_max_delay" {
description = "Enforce max delay for tunnel_status detector (use \"0\" or \"null\" for \"Auto\")"
type = number
default = null
}

variable "tunnel_status_tip" {
description = "Suggested first course of action or any note useful for incident handling"
type = string
default = ""
}

variable "tunnel_status_runbook_url" {
description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause"
type = string
default = ""
}

# Disabling flags: a null value defers to the next, less specific flag
# (rule-level -> detector-level -> module-wide `detectors_disabled`).
variable "tunnel_status_disabled" {
description = "Disable all alerting rules for tunnel_status detector"
type = bool
default = null
}

variable "tunnel_status_disabled_critical" {
description = "Disable critical alerting rule for tunnel_status detector"
type = bool
default = null
}

variable "tunnel_status_disabled_major" {
description = "Disable major alerting rule for tunnel_status detector"
type = bool
default = null
}

# Critical rule settings: the detector compares the signal to this value with `==`.
variable "tunnel_status_threshold_critical" {
description = "Critical threshold for tunnel_status detector"
type = number
default = 0
}

# When null, the detector omits both `lasting` and `at_least` from the condition.
variable "tunnel_status_lasting_duration_critical" {
description = "Minimum duration that conditions must be true before raising alert"
type = string
default = "20m"
}

variable "tunnel_status_at_least_percentage_critical" {
description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)"
type = number
default = 1
}
# Major rule settings: same comparison (`==`) as critical, with its own threshold and duration.
variable "tunnel_status_threshold_major" {
description = "Major threshold for tunnel_status detector"
type = number
default = 0
}

# When null, the detector omits both `lasting` and `at_least` from the condition.
variable "tunnel_status_lasting_duration_major" {
description = "Minimum duration that conditions must be true before raising alert"
type = string
default = "10m"
}

variable "tunnel_status_at_least_percentage_major" {
description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)"
type = number
default = 1
}

0 comments on commit 2376314

Please sign in to comment.