From eb37e4989ef2c30f2c8320942ec6c5877b72abdc Mon Sep 17 00:00:00 2001 From: Jean-Philippe LAINE Date: Fri, 25 Jun 2021 17:54:56 +0200 Subject: [PATCH 1/2] Init pm2 detectors --- modules/smart-agent_pm2/README.md | 114 ++++++++++++++ modules/smart-agent_pm2/common-filters.tf | 1 + modules/smart-agent_pm2/common-locals.tf | 1 + modules/smart-agent_pm2/common-modules.tf | 1 + modules/smart-agent_pm2/common-variables.tf | 1 + modules/smart-agent_pm2/common-versions.tf | 1 + .../smart-agent_pm2/conf/00-heartbeat.yaml | 14 ++ modules/smart-agent_pm2/conf/01-up.yaml | 15 ++ modules/smart-agent_pm2/conf/02-restarts.yaml | 18 +++ modules/smart-agent_pm2/conf/readme.yaml | 3 + modules/smart-agent_pm2/detectors-gen.tf | 94 ++++++++++++ modules/smart-agent_pm2/outputs.tf | 15 ++ modules/smart-agent_pm2/tags.tf | 4 + modules/smart-agent_pm2/variables-gen.tf | 144 ++++++++++++++++++ 14 files changed, 426 insertions(+) create mode 100644 modules/smart-agent_pm2/README.md create mode 120000 modules/smart-agent_pm2/common-filters.tf create mode 120000 modules/smart-agent_pm2/common-locals.tf create mode 120000 modules/smart-agent_pm2/common-modules.tf create mode 120000 modules/smart-agent_pm2/common-variables.tf create mode 120000 modules/smart-agent_pm2/common-versions.tf create mode 100644 modules/smart-agent_pm2/conf/00-heartbeat.yaml create mode 100644 modules/smart-agent_pm2/conf/01-up.yaml create mode 100644 modules/smart-agent_pm2/conf/02-restarts.yaml create mode 100644 modules/smart-agent_pm2/conf/readme.yaml create mode 100644 modules/smart-agent_pm2/detectors-gen.tf create mode 100644 modules/smart-agent_pm2/outputs.tf create mode 100644 modules/smart-agent_pm2/tags.tf create mode 100644 modules/smart-agent_pm2/variables-gen.tf diff --git a/modules/smart-agent_pm2/README.md b/modules/smart-agent_pm2/README.md new file mode 100644 index 000000000..6fa91334f --- /dev/null +++ b/modules/smart-agent_pm2/README.md @@ -0,0 +1,114 @@ +# PM2 SignalFx detectors + + + +:link: 
**Contents** + +- [How to use this module?](#how-to-use-this-module) +- [What are the available detectors in this module?](#what-are-the-available-detectors-in-this-module) +- [How to collect required metrics?](#how-to-collect-required-metrics) + - [Metrics](#metrics) +- [Related documentation](#related-documentation) + + + +## How to use this module? + +This directory defines a [Terraform](https://www.terraform.io/) +[module](https://www.terraform.io/docs/modules/usage.html) you can use in your +existing [stack](https://github.com/claranet/terraform-signalfx-detectors/wiki/Getting-started#stack) by adding a +`module` configuration and setting its `source` parameter to the URL of this folder: + +```hcl +module "signalfx-detectors-smart-agent-pm2" { + source = "github.com/claranet/terraform-signalfx-detectors.git//modules/smart-agent_pm2?ref={revision}" + + environment = var.environment + notifications = local.notifications +} +``` + +Note the following parameters: + +* `source`: Use this parameter to specify the URL of the module. The double slash (`//`) is intentional and required. + Terraform uses it to specify subfolders within a Git repo (see [module + sources](https://www.terraform.io/docs/modules/sources.html)). The `ref` parameter specifies a specific Git tag in + this repository. It is recommended to use the latest "pinned" version in place of `{revision}`. Avoid using a branch + like `master` except for testing purposes. Note that all modules in this repository are available on the Terraform + [registry](https://registry.terraform.io/modules/claranet/detectors/signalfx) and we recommend using it as source + instead of `git` which is more flexible but less future-proof. + +* `environment`: Use this parameter to specify the + [environment](https://github.com/claranet/terraform-signalfx-detectors/wiki/Getting-started#environment) used by this + instance of the module. 
+ Its value will be added to the `prefixes` list at the start of the [detector + name](https://github.com/claranet/terraform-signalfx-detectors/wiki/Templating#example). + In general, it will also be used in the `filtering` internal sub-module to apply + [filters](https://github.com/claranet/terraform-signalfx-detectors/wiki/Guidance#filtering) based on our default + [tagging convention](https://github.com/claranet/terraform-signalfx-detectors/wiki/Tagging-convention). + +* `notifications`: Use this parameter to define where alerts should be sent depending on their severity. It consists + of a Terraform [object](https://www.terraform.io/docs/configuration/types.html#object-) where each key represents an + available [detector rule severity](https://docs.signalfx.com/en/latest/detect-alert/set-up-detectors.html#severity) + and its value is a list of recipients. Every recipient must respect the [detector notification + format](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/detector#notification-format). + Check the [notification binding](https://github.com/claranet/terraform-signalfx-detectors/wiki/Notifications-binding) + documentation to understand the recommended role of each severity. + +These 3 parameters along with all variables defined in [common-variables.tf](common-variables.tf) are common to all +[modules](../) in this repository. Other variables, specific to this module, are available in +[variables-gen.tf](variables-gen.tf). +In general, the default configuration "works" but all of these Terraform +[variables](https://www.terraform.io/docs/configuration/variables.html) make it possible to +customize the detectors' behavior to better fit your needs. 
+ +Most of them represent usual tips and rules detailed in the +[guidance](https://github.com/claranet/terraform-signalfx-detectors/wiki/Guidance) documentation and listed in the +common [variables](https://github.com/claranet/terraform-signalfx-detectors/wiki/Variables) dedicated documentation. + +Feel free to explore the [wiki](https://github.com/claranet/terraform-signalfx-detectors/wiki) for more information about +general usage of this repository. + +## What are the available detectors in this module? + +This module creates the following SignalFx detectors which could contain one or multiple alerting rules: + +|Detector|Critical|Major|Minor|Warning|Info| +|---|---|---|---|---|---| +|Pm2 heartbeat|X|-|-|-|-| +|Pm2 up application|X|-|-|-|-| +|Pm2 restarts counter|X|X|-|-|-| + +## How to collect required metrics? + +This module uses metrics available from +[monitors](https://docs.signalfx.com/en/latest/integrations/agent/monitors/_monitor-config.html) +available in the [SignalFx Smart +Agent](https://github.com/signalfx/signalfx-agent). Check the "Related documentation" section for more +information including the official documentation of this monitor. 
+ + + + +### Metrics + + +To filter only required metrics for the detectors of this module, add the +[datapointsToExclude](https://docs.signalfx.com/en/latest/integrations/agent/filtering.html) parameter to +the corresponding monitor configuration: + +```yaml + datapointsToExclude: + - metricNames: + - '*' + - '!pm2_restarts' + - '!pm2_up' + +``` + + + +## Related documentation + +* [Terraform SignalFx provider](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs) +* [Terraform SignalFx detector](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/detector) diff --git a/modules/smart-agent_pm2/common-filters.tf b/modules/smart-agent_pm2/common-filters.tf new file mode 120000 index 000000000..4df54e41e --- /dev/null +++ b/modules/smart-agent_pm2/common-filters.tf @@ -0,0 +1 @@ +../../common/module/filters-smart-agent.tf \ No newline at end of file diff --git a/modules/smart-agent_pm2/common-locals.tf b/modules/smart-agent_pm2/common-locals.tf new file mode 120000 index 000000000..5672d21ab --- /dev/null +++ b/modules/smart-agent_pm2/common-locals.tf @@ -0,0 +1 @@ +../../common/module/locals.tf \ No newline at end of file diff --git a/modules/smart-agent_pm2/common-modules.tf b/modules/smart-agent_pm2/common-modules.tf new file mode 120000 index 000000000..8c81ef377 --- /dev/null +++ b/modules/smart-agent_pm2/common-modules.tf @@ -0,0 +1 @@ +../../common/module/modules.tf \ No newline at end of file diff --git a/modules/smart-agent_pm2/common-variables.tf b/modules/smart-agent_pm2/common-variables.tf new file mode 120000 index 000000000..f3037a584 --- /dev/null +++ b/modules/smart-agent_pm2/common-variables.tf @@ -0,0 +1 @@ +../../common/module/variables.tf \ No newline at end of file diff --git a/modules/smart-agent_pm2/common-versions.tf b/modules/smart-agent_pm2/common-versions.tf new file mode 120000 index 000000000..fa7f5509f --- /dev/null +++ b/modules/smart-agent_pm2/common-versions.tf @@ -0,0 +1 @@ 
+../../common/module/versions.tf \ No newline at end of file diff --git a/modules/smart-agent_pm2/conf/00-heartbeat.yaml b/modules/smart-agent_pm2/conf/00-heartbeat.yaml new file mode 100644 index 000000000..c70e27bc4 --- /dev/null +++ b/modules/smart-agent_pm2/conf/00-heartbeat.yaml @@ -0,0 +1,14 @@ +## Example +module: pm2 +name: heartbeat + +transformation: false +aggregation: true +filtering: "(not filter('name', 'pm2-metrics'))" +exclude_not_running_vm: true + +signals: + signal: + metric: "pm2_up" +rules: + critical: diff --git a/modules/smart-agent_pm2/conf/01-up.yaml b/modules/smart-agent_pm2/conf/01-up.yaml new file mode 100644 index 000000000..ea1dd3b04 --- /dev/null +++ b/modules/smart-agent_pm2/conf/01-up.yaml @@ -0,0 +1,15 @@ +module: pm2 +name: "up application" + +id: up +transformation: ".min(over='5min')" +filtering: "(not filter('name', 'pm2-metrics'))" +exclude_not_running_vm: true + +signals: + signal: + metric: pm2_up +rules: + critical: + threshold: 1 + comparator: ">" diff --git a/modules/smart-agent_pm2/conf/02-restarts.yaml b/modules/smart-agent_pm2/conf/02-restarts.yaml new file mode 100644 index 000000000..6bfb52b9e --- /dev/null +++ b/modules/smart-agent_pm2/conf/02-restarts.yaml @@ -0,0 +1,18 @@ +module: pm2 +name: "restarts counter" + +id: restarts +transformation: ".min(over='5min')" +filtering: "(not filter('name', 'pm2-metrics'))" +exclude_not_running_vm: true + +signals: + signal: + metric: pm2_restarts +rules: + critical: + threshold: 5 + comparator: ">" + major: + threshold: 3 + comparator: ">=" diff --git a/modules/smart-agent_pm2/conf/readme.yaml b/modules/smart-agent_pm2/conf/readme.yaml new file mode 100644 index 000000000..9015fc41a --- /dev/null +++ b/modules/smart-agent_pm2/conf/readme.yaml @@ -0,0 +1,3 @@ +documentations: + +source_doc: diff --git a/modules/smart-agent_pm2/detectors-gen.tf b/modules/smart-agent_pm2/detectors-gen.tf new file mode 100644 index 000000000..d3c41b66a --- /dev/null +++ 
b/modules/smart-agent_pm2/detectors-gen.tf @@ -0,0 +1,94 @@ +resource "signalfx_detector" "heartbeat" { + name = format("%s %s", local.detector_name_prefix, "Pm2 heartbeat") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + max_delay = 900 + + program_text = <<-EOF + from signalfx.detectors.not_reporting import not_reporting + base_filtering = (not filter('name', 'pm2-metrics')) + signal = data('pm2_up', filter=${local.not_running_vm_filters} and base_filtering and ${module.filtering.signalflow})${var.heartbeat_aggregation_function}.publish('signal') + not_reporting.detector(stream=signal, resource_identifier=None, duration='${var.heartbeat_timeframe}', auto_resolve_after='${local.heartbeat_auto_resolve_after}').publish('CRIT') +EOF + + rule { + description = "has not reported in ${var.heartbeat_timeframe}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.heartbeat_disabled, var.detectors_disabled) + notifications = coalescelist(lookup(var.heartbeat_notifications, "critical", []), var.notifications.critical) + runbook_url = try(coalesce(var.heartbeat_runbook_url, var.runbook_url), "") + tip = var.heartbeat_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject_novalue : var.message_subject + parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body + } +} + +resource "signalfx_detector" "up" { + name = format("%s %s", local.detector_name_prefix, "Pm2 up application") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + base_filtering = (not filter('name', 'pm2-metrics')) + signal = data('pm2_up', filter=${local.not_running_vm_filters} and base_filtering and ${module.filtering.signalflow})${var.up_aggregation_function}${var.up_transformation_function}.publish('signal') + detect(when(signal > ${var.up_threshold_critical})).publish('CRIT') +EOF + + rule { + description = "is too high > ${var.up_threshold_critical}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.up_disabled, var.detectors_disabled) + notifications = coalescelist(lookup(var.up_notifications, "critical", []), var.notifications.critical) + runbook_url = try(coalesce(var.up_runbook_url, var.runbook_url), "") + tip = var.up_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body + } +} + +resource "signalfx_detector" "restarts" { + name = format("%s %s", local.detector_name_prefix, "Pm2 restarts counter") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + base_filtering = (not filter('name', 'pm2-metrics')) + signal = data('pm2_restarts', filter=${local.not_running_vm_filters} and base_filtering and ${module.filtering.signalflow})${var.restarts_aggregation_function}${var.restarts_transformation_function}.publish('signal') + detect(when(signal > ${var.restarts_threshold_critical})).publish('CRIT') + detect(when(signal >= ${var.restarts_threshold_major})).publish('MAJOR') +EOF + + rule { + description = "is too high > ${var.restarts_threshold_critical}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.restarts_disabled_critical, var.restarts_disabled, var.detectors_disabled) + notifications = coalescelist(lookup(var.restarts_notifications, "critical", []), var.notifications.critical) + runbook_url = try(coalesce(var.restarts_runbook_url, var.runbook_url), "") + tip = var.restarts_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + rule { + description = "is too high >= ${var.restarts_threshold_major}" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.restarts_disabled_major, var.restarts_disabled, var.detectors_disabled) + notifications = coalescelist(lookup(var.restarts_notifications, "major", []), var.notifications.major) + runbook_url = try(coalesce(var.restarts_runbook_url, var.runbook_url), "") + tip = var.restarts_tip + parameterized_subject = var.message_subject == "" ? 
local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } +} + diff --git a/modules/smart-agent_pm2/outputs.tf b/modules/smart-agent_pm2/outputs.tf new file mode 100644 index 000000000..0560fc237 --- /dev/null +++ b/modules/smart-agent_pm2/outputs.tf @@ -0,0 +1,15 @@ +output "heartbeat" { + description = "Detector resource for heartbeat" + value = signalfx_detector.heartbeat +} + +output "restarts" { + description = "Detector resource for restarts" + value = signalfx_detector.restarts +} + +output "up" { + description = "Detector resource for up" + value = signalfx_detector.up +} + diff --git a/modules/smart-agent_pm2/tags.tf b/modules/smart-agent_pm2/tags.tf new file mode 100644 index 000000000..a2c65d5b3 --- /dev/null +++ b/modules/smart-agent_pm2/tags.tf @@ -0,0 +1,4 @@ +locals { + tags = ["smart-agent", "pm2"] +} + diff --git a/modules/smart-agent_pm2/variables-gen.tf b/modules/smart-agent_pm2/variables-gen.tf new file mode 100644 index 000000000..971920b4d --- /dev/null +++ b/modules/smart-agent_pm2/variables-gen.tf @@ -0,0 +1,144 @@ +# heartbeat detector + +variable "heartbeat_notifications" { + description = "Notification recipients list per severity overridden for heartbeat detector" + type = map(list(string)) + default = {} +} + +variable "heartbeat_aggregation_function" { + description = "Aggregation function and group by for heartbeat detector (i.e. 
\".mean(by=['host'])\")" + type = string + default = "" +} + +variable "heartbeat_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "heartbeat_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "heartbeat_disabled" { + description = "Disable all alerting rules for heartbeat detector" + type = bool + default = null +} + +variable "heartbeat_timeframe" { + description = "Timeframe for heartbeat detector (i.e. \"10m\")" + type = string + default = "20m" +} + +# up detector + +variable "up_notifications" { + description = "Notification recipients list per severity overridden for up detector" + type = map(list(string)) + default = {} +} + +variable "up_aggregation_function" { + description = "Aggregation function and group by for up detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "up_transformation_function" { + description = "Transformation function for up detector (i.e. 
\".mean(over='5m')\")" + type = string + default = ".min(over='5min')" +} + +variable "up_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "up_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "up_disabled" { + description = "Disable all alerting rules for up detector" + type = bool + default = null +} + +variable "up_threshold_critical" { + description = "Critical threshold for up detector" + type = number + default = 1 +} + +# restarts detector + +variable "restarts_notifications" { + description = "Notification recipients list per severity overridden for restarts detector" + type = map(list(string)) + default = {} +} + +variable "restarts_aggregation_function" { + description = "Aggregation function and group by for restarts detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "restarts_transformation_function" { + description = "Transformation function for restarts detector (i.e. 
\".mean(over='5m')\")" + type = string + default = ".min(over='5min')" +} + +variable "restarts_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "restarts_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "restarts_disabled" { + description = "Disable all alerting rules for restarts detector" + type = bool + default = null +} + +variable "restarts_disabled_critical" { + description = "Disable critical alerting rule for restarts detector" + type = bool + default = null +} + +variable "restarts_disabled_major" { + description = "Disable major alerting rule for restarts detector" + type = bool + default = null +} + +variable "restarts_threshold_critical" { + description = "Critical threshold for restarts detector" + type = number + default = 5 +} + +variable "restarts_threshold_major" { + description = "Major threshold for restarts detector" + type = number + default = 3 +} + From 196138467225d32e9e2855e9f99fe0d65fa56608 Mon Sep 17 00:00:00 2001 From: Ignacio Rivas Mendez Date: Wed, 3 Nov 2021 17:31:03 +0100 Subject: [PATCH 2/2] smart-agent_pm2 - detect when we have more than X restarts on the last X minutes --- modules/smart-agent_pm2/detectors-gen.tf | 3 ++- modules/smart-agent_pm2/variables-gen.tf | 7 ++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/modules/smart-agent_pm2/detectors-gen.tf b/modules/smart-agent_pm2/detectors-gen.tf index d3c41b66a..82953e6ab 100644 --- a/modules/smart-agent_pm2/detectors-gen.tf +++ b/modules/smart-agent_pm2/detectors-gen.tf @@ -62,7 +62,8 @@ resource "signalfx_detector" "restarts" { program_text = <<-EOF base_filtering = (not filter('name', 'pm2-metrics')) - signal = data('pm2_restarts', filter=${local.not_running_vm_filters} and base_filtering and 
${module.filtering.signalflow})${var.restarts_aggregation_function}${var.restarts_transformation_function}.publish('signal') + A = data('pm2_restarts', filter=${local.not_running_vm_filters} and base_filtering and ${module.filtering.signalflow})${var.restarts_aggregation_function}${var.restarts_transformation_function} + signal = (A - A.timeshift("${var.restarts_counter_timeshift}")).publish('signal') detect(when(signal > ${var.restarts_threshold_critical})).publish('CRIT') detect(when(signal >= ${var.restarts_threshold_major})).publish('MAJOR') EOF diff --git a/modules/smart-agent_pm2/variables-gen.tf b/modules/smart-agent_pm2/variables-gen.tf index 971920b4d..1040b6e13 100644 --- a/modules/smart-agent_pm2/variables-gen.tf +++ b/modules/smart-agent_pm2/variables-gen.tf @@ -139,6 +139,11 @@ variable "restarts_threshold_critical" { variable "restarts_threshold_major" { description = "Major threshold for restarts detector" type = number - default = 3 + default = 2 } +variable "restarts_counter_timeshift" { + description = "Timeframe for the timeshift on the restart counter (i.e. \"10m\")" + type = string + default = "10m" +}