From 1a1ae8bf89d3e5ff3ba38e688e4505660f8500ce Mon Sep 17 00:00:00 2001 From: Hugues Lepesant Date: Sat, 22 Jun 2024 01:09:06 +0200 Subject: [PATCH] feat: prometheus-exporter_postfix --- docs/severity.md | 12 + modules/prometheus-exporter_postfix/README.md | 119 +++++ .../common-filters.tf | 1 + .../common-locals.tf | 1 + .../common-modules.tf | 1 + .../common-variables.tf | 1 + .../common-versions.tf | 1 + .../conf/00-heartbeat.yaml | 13 + .../conf/01-queue_deferred.yaml | 21 + .../conf/02-queue_hold.yaml | 21 + .../conf/03-queue_maildrop.yaml | 21 + .../conf/04-mail_delivery_delay.yaml | 21 + .../conf/readme.yaml | 3 + .../detectors-gen.tf | 192 ++++++++ .../prometheus-exporter_postfix/outputs.tf | 25 ++ modules/prometheus-exporter_postfix/tags.tf | 4 + .../variables-gen.tf | 410 ++++++++++++++++++ 17 files changed, 867 insertions(+) create mode 100644 modules/prometheus-exporter_postfix/README.md create mode 120000 modules/prometheus-exporter_postfix/common-filters.tf create mode 120000 modules/prometheus-exporter_postfix/common-locals.tf create mode 120000 modules/prometheus-exporter_postfix/common-modules.tf create mode 120000 modules/prometheus-exporter_postfix/common-variables.tf create mode 120000 modules/prometheus-exporter_postfix/common-versions.tf create mode 100644 modules/prometheus-exporter_postfix/conf/00-heartbeat.yaml create mode 100644 modules/prometheus-exporter_postfix/conf/01-queue_deferred.yaml create mode 100644 modules/prometheus-exporter_postfix/conf/02-queue_hold.yaml create mode 100644 modules/prometheus-exporter_postfix/conf/03-queue_maildrop.yaml create mode 100644 modules/prometheus-exporter_postfix/conf/04-mail_delivery_delay.yaml create mode 100644 modules/prometheus-exporter_postfix/conf/readme.yaml create mode 100644 modules/prometheus-exporter_postfix/detectors-gen.tf create mode 100644 modules/prometheus-exporter_postfix/outputs.tf create mode 100644 modules/prometheus-exporter_postfix/tags.tf create mode 100644 
modules/prometheus-exporter_postfix/variables-gen.tf diff --git a/docs/severity.md b/docs/severity.md index 22752ae50..9b4dbe2cb 100644 --- a/docs/severity.md +++ b/docs/severity.md @@ -81,6 +81,7 @@ - [prometheus-exporter_docker-state](#prometheus-exporter_docker-state) - [prometheus-exporter_kong](#prometheus-exporter_kong) - [prometheus-exporter_oracledb](#prometheus-exporter_oracledb) +- [prometheus-exporter_postfix](#prometheus-exporter_postfix) - [prometheus-exporter_squid](#prometheus-exporter_squid) - [prometheus-exporter_varnish](#prometheus-exporter_varnish) - [prometheus-exporter_wallix-bastion](#prometheus-exporter_wallix-bastion) @@ -889,6 +890,17 @@ |Oracle database status|X|-|-|-|-| +## prometheus-exporter_postfix + +|Detector|Critical|Major|Minor|Warning|Info| +|---|---|---|---|---|---| +|Postfix heartbeat|X|-|-|-|-| +|Postfix size postfix queue deferred|X|X|-|-|-| +|Postfix size postfix queue hold|X|X|-|-|-| +|Postfix size postfix queue maildrop|X|X|-|-|-| +|Postfix size postfix delivery delay|X|X|-|-|-| + + ## prometheus-exporter_squid |Detector|Critical|Major|Minor|Warning|Info| diff --git a/modules/prometheus-exporter_postfix/README.md b/modules/prometheus-exporter_postfix/README.md new file mode 100644 index 000000000..bf4ed14d4 --- /dev/null +++ b/modules/prometheus-exporter_postfix/README.md @@ -0,0 +1,119 @@ +# POSTFIX SignalFx detectors + + + +:link: **Contents** + +- [How to use this module?](#how-to-use-this-module) +- [What are the available detectors in this module?](#what-are-the-available-detectors-in-this-module) +- [How to collect required metrics?](#how-to-collect-required-metrics) + - [Metrics](#metrics) +- [Related documentation](#related-documentation) + + + +## How to use this module? 
+ +This directory defines a [Terraform](https://www.terraform.io/) +[module](https://www.terraform.io/language/modules/syntax) you can use in your +existing [stack](https://github.com/claranet/terraform-signalfx-detectors/wiki/Getting-started#stack) by adding a +`module` configuration and setting its `source` parameter to the URL of this folder: + +```hcl +module "signalfx-detectors-prometheus-exporter-postfix" { + source = "github.com/claranet/terraform-signalfx-detectors.git//modules/prometheus-exporter_postfix?ref={revision}" + + environment = var.environment + notifications = local.notifications +} +``` + +Note the following parameters: + +* `source`: Use this parameter to specify the URL of the module. The double slash (`//`) is intentional and required. + Terraform uses it to specify subfolders within a Git repo (see [module + sources](https://www.terraform.io/language/modules/sources)). The `ref` parameter specifies a specific Git tag in + this repository. It is recommended to use the latest "pinned" version in place of `{revision}`. Avoid using a branch + like `master` except for testing purposes. Note that all modules in this repository are available on the Terraform + [registry](https://registry.terraform.io/modules/claranet/detectors/signalfx) and we recommend using it as the source + instead of `git` which is more flexible but less future-proof. + +* `environment`: Use this parameter to specify the + [environment](https://github.com/claranet/terraform-signalfx-detectors/wiki/Getting-started#environment) used by this + instance of the module. + Its value will be added to the `prefixes` list at the start of the [detector + name](https://github.com/claranet/terraform-signalfx-detectors/wiki/Templating#example). 
+ In general, it will also be used in the `filtering` internal sub-module to [apply + filters](https://github.com/claranet/terraform-signalfx-detectors/wiki/Guidance#filtering) based on our default + [tagging convention](https://github.com/claranet/terraform-signalfx-detectors/wiki/Tagging-convention). + +* `notifications`: Use this parameter to define where alerts should be sent depending on their severity. It consists + of a Terraform [object](https://www.terraform.io/language/expressions/type-constraints#object) where each key represents an available + [detector rule severity](https://docs.splunk.com/observability/alerts-detectors-notifications/create-detectors-for-alerts.html#severity) + and its value is a list of recipients. Every recipient must respect the [detector notification + format](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/detector#notification-format). + Check the [notification binding](https://github.com/claranet/terraform-signalfx-detectors/wiki/Notifications-binding) + documentation to understand the recommended role of each severity. + +These 3 parameters along with all variables defined in [common-variables.tf](common-variables.tf) are common to all +[modules](../) in this repository. Other variables, specific to this module, are available in +[variables-gen.tf](variables-gen.tf). +In general, the default configuration "works" but all of these Terraform +[variables](https://www.terraform.io/language/values/variables) make it possible to +customize the detectors' behavior to better fit your needs. + +Most of them represent usual tips and rules detailed in the +[guidance](https://github.com/claranet/terraform-signalfx-detectors/wiki/Guidance) documentation and listed in the +common [variables](https://github.com/claranet/terraform-signalfx-detectors/wiki/Variables) dedicated documentation. 
+ +Feel free to explore the [wiki](https://github.com/claranet/terraform-signalfx-detectors/wiki) for more information about +general usage of this repository. + +## What are the available detectors in this module? + +This module creates the following SignalFx detectors which could contain one or multiple alerting rules: + +|Detector|Critical|Major|Minor|Warning|Info| +|---|---|---|---|---|---| +|Postfix heartbeat|X|-|-|-|-| +|Postfix size postfix queue deferred|X|X|-|-|-| +|Postfix size postfix queue hold|X|X|-|-|-| +|Postfix size postfix queue maildrop|X|X|-|-|-| +|Postfix size postfix delivery delay|X|X|-|-|-| + +## How to collect required metrics? + +This module deploys detectors using metrics reported by the +scraping of a server following the [OpenMetrics convention](https://openmetrics.io/) based on and compatible with [the Prometheus +exposition format](https://github.com/prometheus/docs/blob/main/content/docs/instrumenting/exposition_formats.md#openmetrics-text-format). + +They are generally called `Prometheus Exporters` which can be fetched by both the [SignalFx Smart Agent](https://github.com/signalfx/signalfx-agent) +thanks to its [prometheus exporter monitor](https://github.com/signalfx/signalfx-agent/blob/main/docs/monitors/prometheus-exporter.md) and the +[OpenTelemetry Collector](https://github.com/signalfx/splunk-otel-collector) using its [prometheus +receiver](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/prometheusreceiver) or its derivatives. + +These exporters could be embedded directly in the tool you want to monitor (e.g. nginx ingress) or must be installed next to it as +a separate program configured to connect, create metrics and expose them as a server. + + +Check the [Related documentation](#related-documentation) section for more detailed and specific information about this module's dependencies. + + + +### Metrics + + +Here is the list of required metrics for detectors in this module. 
+ +* `postfix_showq_message_size_bytes_count` +* `postfix_smtp_delivery_delay_seconds_count` +* `postfix_up` + + + + +## Related documentation + +* [Terraform SignalFx provider](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs) +* [Terraform SignalFx detector](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/detector) +* [Splunk Observability integrations](https://docs.splunk.com/Observability/gdi/get-data-in/integrations.html) diff --git a/modules/prometheus-exporter_postfix/common-filters.tf b/modules/prometheus-exporter_postfix/common-filters.tf new file mode 120000 index 000000000..51ac61525 --- /dev/null +++ b/modules/prometheus-exporter_postfix/common-filters.tf @@ -0,0 +1 @@ +../../common/module/filters-prometheus-exporter.tf \ No newline at end of file diff --git a/modules/prometheus-exporter_postfix/common-locals.tf b/modules/prometheus-exporter_postfix/common-locals.tf new file mode 120000 index 000000000..5672d21ab --- /dev/null +++ b/modules/prometheus-exporter_postfix/common-locals.tf @@ -0,0 +1 @@ +../../common/module/locals.tf \ No newline at end of file diff --git a/modules/prometheus-exporter_postfix/common-modules.tf b/modules/prometheus-exporter_postfix/common-modules.tf new file mode 120000 index 000000000..8c81ef377 --- /dev/null +++ b/modules/prometheus-exporter_postfix/common-modules.tf @@ -0,0 +1 @@ +../../common/module/modules.tf \ No newline at end of file diff --git a/modules/prometheus-exporter_postfix/common-variables.tf b/modules/prometheus-exporter_postfix/common-variables.tf new file mode 120000 index 000000000..f3037a584 --- /dev/null +++ b/modules/prometheus-exporter_postfix/common-variables.tf @@ -0,0 +1 @@ +../../common/module/variables.tf \ No newline at end of file diff --git a/modules/prometheus-exporter_postfix/common-versions.tf b/modules/prometheus-exporter_postfix/common-versions.tf new file mode 120000 index 000000000..fa7f5509f --- /dev/null +++ 
b/modules/prometheus-exporter_postfix/common-versions.tf @@ -0,0 +1 @@ +../../common/module/versions.tf \ No newline at end of file diff --git a/modules/prometheus-exporter_postfix/conf/00-heartbeat.yaml b/modules/prometheus-exporter_postfix/conf/00-heartbeat.yaml new file mode 100644 index 000000000..79a6c49e5 --- /dev/null +++ b/modules/prometheus-exporter_postfix/conf/00-heartbeat.yaml @@ -0,0 +1,13 @@ +## Example +module: postfix +name: heartbeat + +transformation: false +aggregation: true +exclude_not_running_vm: true + +signals: + signal: + metric: "postfix_up" +rules: + critical: diff --git a/modules/prometheus-exporter_postfix/conf/01-queue_deferred.yaml b/modules/prometheus-exporter_postfix/conf/01-queue_deferred.yaml new file mode 100644 index 000000000..6627879f8 --- /dev/null +++ b/modules/prometheus-exporter_postfix/conf/01-queue_deferred.yaml @@ -0,0 +1,21 @@ +module: postfix +name: "Size Postfix Queue Deferred" +id: "postfix_showq_message_size_bytes_count_deferred" + +transformation: ".min(over='30m')" +aggregation: true +filtering: "filter('queue', 'deferred')" + +signals: + signal: + metric: "postfix_showq_message_size_bytes_count" + +rules: + critical: + threshold: 600 + comparator: ">" + + major: + threshold: 300 + comparator: ">" + dependency: "critical" diff --git a/modules/prometheus-exporter_postfix/conf/02-queue_hold.yaml b/modules/prometheus-exporter_postfix/conf/02-queue_hold.yaml new file mode 100644 index 000000000..99052ccf1 --- /dev/null +++ b/modules/prometheus-exporter_postfix/conf/02-queue_hold.yaml @@ -0,0 +1,21 @@ +module: postfix +name: "Size Postfix Queue Hold" +id: "postfix_showq_message_size_bytes_count_hold" + +transformation: ".min(over='30m')" +aggregation: true +filtering: "filter('queue', 'hold')" + +signals: + signal: + metric: "postfix_showq_message_size_bytes_count" + +rules: + critical: + threshold: 600 + comparator: ">" + + major: + threshold: 300 + comparator: ">" + dependency: "critical" diff --git 
a/modules/prometheus-exporter_postfix/conf/03-queue_maildrop.yaml b/modules/prometheus-exporter_postfix/conf/03-queue_maildrop.yaml new file mode 100644 index 000000000..1fe7c10a4 --- /dev/null +++ b/modules/prometheus-exporter_postfix/conf/03-queue_maildrop.yaml @@ -0,0 +1,21 @@ +module: postfix +name: "Size Postfix Queue Maildrop" +id: "postfix_showq_message_size_bytes_count_maildrop" + +transformation: ".min(over='30m')" +aggregation: true +filtering: "filter('queue', 'maildrop')" + +signals: + signal: + metric: "postfix_showq_message_size_bytes_count" + +rules: + critical: + threshold: 600 + comparator: ">" + + major: + threshold: 300 + comparator: ">" + dependency: "critical" diff --git a/modules/prometheus-exporter_postfix/conf/04-mail_delivery_delay.yaml b/modules/prometheus-exporter_postfix/conf/04-mail_delivery_delay.yaml new file mode 100644 index 000000000..0a1297572 --- /dev/null +++ b/modules/prometheus-exporter_postfix/conf/04-mail_delivery_delay.yaml @@ -0,0 +1,21 @@ +module: postfix +name: "Size Postfix Delivery Delay" +id: "postfix_smtp_delivery_delay_seconds_count" + +transformation: ".min(over='30m')" +aggregation: true +filtering: "filter('queue', 'maildrop')" # NOTE(review): identical to the filter in 03-queue_maildrop.yaml but applied to a different metric; confirm postfix_smtp_delivery_delay_seconds_count really carries a 'queue' dimension, otherwise this detector matches no series and can never fire (the "Size ..." name also looks copied) + +signals: + signal: + metric: "postfix_smtp_delivery_delay_seconds_count" + +rules: + critical: + threshold: 60 + comparator: ">" + + major: + threshold: 45 + comparator: ">" + dependency: "critical" diff --git a/modules/prometheus-exporter_postfix/conf/readme.yaml b/modules/prometheus-exporter_postfix/conf/readme.yaml new file mode 100644 index 000000000..9015fc41a --- /dev/null +++ b/modules/prometheus-exporter_postfix/conf/readme.yaml @@ -0,0 +1,3 @@ +documentations: + +source_doc: diff --git a/modules/prometheus-exporter_postfix/detectors-gen.tf b/modules/prometheus-exporter_postfix/detectors-gen.tf new file mode 100644 index 000000000..d6df80533 --- /dev/null +++ b/modules/prometheus-exporter_postfix/detectors-gen.tf @@ -0,0 +1,192 @@ +resource "signalfx_detector" "heartbeat" { + 
name = format("%s %s", local.detector_name_prefix, "Postfix heartbeat") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + from signalfx.detectors.not_reporting import not_reporting + signal = data('postfix_up', filter=%{if var.heartbeat_exclude_not_running_vm}${local.not_running_vm_filters} and %{endif}${module.filtering.signalflow})${var.heartbeat_aggregation_function}.publish('signal') + not_reporting.detector(stream=signal, resource_identifier=None, duration='${var.heartbeat_timeframe}', auto_resolve_after='${local.heartbeat_auto_resolve_after}').publish('CRIT') +EOF + + rule { + description = "has not reported in ${var.heartbeat_timeframe}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.heartbeat_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.heartbeat_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.heartbeat_runbook_url, var.runbook_url), "") + tip = var.heartbeat_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject_novalue : var.message_subject + parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body + } + + max_delay = var.heartbeat_max_delay +} + +resource "signalfx_detector" "postfix_showq_message_size_bytes_count_deferred" { + name = format("%s %s", local.detector_name_prefix, "Postfix size postfix queue deferred") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + base_filtering = filter('queue', 'deferred') + signal = data('postfix_showq_message_size_bytes_count', filter=base_filtering and ${module.filtering.signalflow})${var.postfix_showq_message_size_bytes_count_deferred_aggregation_function}${var.postfix_showq_message_size_bytes_count_deferred_transformation_function}.publish('signal') + detect(when(signal > ${var.postfix_showq_message_size_bytes_count_deferred_threshold_critical}%{if var.postfix_showq_message_size_bytes_count_deferred_lasting_duration_critical != null}, lasting='${var.postfix_showq_message_size_bytes_count_deferred_lasting_duration_critical}', at_least=${var.postfix_showq_message_size_bytes_count_deferred_at_least_percentage_critical}%{endif})).publish('CRIT') + detect(when(signal > ${var.postfix_showq_message_size_bytes_count_deferred_threshold_major}%{if var.postfix_showq_message_size_bytes_count_deferred_lasting_duration_major != null}, lasting='${var.postfix_showq_message_size_bytes_count_deferred_lasting_duration_major}', at_least=${var.postfix_showq_message_size_bytes_count_deferred_at_least_percentage_major}%{endif}) and (not when(signal > ${var.postfix_showq_message_size_bytes_count_deferred_threshold_critical}%{if var.postfix_showq_message_size_bytes_count_deferred_lasting_duration_critical != null}, lasting='${var.postfix_showq_message_size_bytes_count_deferred_lasting_duration_critical}', at_least=${var.postfix_showq_message_size_bytes_count_deferred_at_least_percentage_critical}%{endif}))).publish('MAJOR') +EOF 
+ + rule { + description = "is too high > ${var.postfix_showq_message_size_bytes_count_deferred_threshold_critical}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.postfix_showq_message_size_bytes_count_deferred_disabled_critical, var.postfix_showq_message_size_bytes_count_deferred_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.postfix_showq_message_size_bytes_count_deferred_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.postfix_showq_message_size_bytes_count_deferred_runbook_url, var.runbook_url), "") + tip = var.postfix_showq_message_size_bytes_count_deferred_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + rule { + description = "is too high > ${var.postfix_showq_message_size_bytes_count_deferred_threshold_major}" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.postfix_showq_message_size_bytes_count_deferred_disabled_major, var.postfix_showq_message_size_bytes_count_deferred_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.postfix_showq_message_size_bytes_count_deferred_notifications, "major", []), var.notifications.major), null) + runbook_url = try(coalesce(var.postfix_showq_message_size_bytes_count_deferred_runbook_url, var.runbook_url), "") + tip = var.postfix_showq_message_size_bytes_count_deferred_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body + } + + max_delay = var.postfix_showq_message_size_bytes_count_deferred_max_delay +} + +resource "signalfx_detector" "postfix_showq_message_size_bytes_count_hold" { + name = format("%s %s", local.detector_name_prefix, "Postfix size postfix queue hold") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + base_filtering = filter('queue', 'hold') + signal = data('postfix_showq_message_size_bytes_count', filter=base_filtering and ${module.filtering.signalflow})${var.postfix_showq_message_size_bytes_count_hold_aggregation_function}${var.postfix_showq_message_size_bytes_count_hold_transformation_function}.publish('signal') + detect(when(signal > ${var.postfix_showq_message_size_bytes_count_hold_threshold_critical}%{if var.postfix_showq_message_size_bytes_count_hold_lasting_duration_critical != null}, lasting='${var.postfix_showq_message_size_bytes_count_hold_lasting_duration_critical}', at_least=${var.postfix_showq_message_size_bytes_count_hold_at_least_percentage_critical}%{endif})).publish('CRIT') + detect(when(signal > ${var.postfix_showq_message_size_bytes_count_hold_threshold_major}%{if var.postfix_showq_message_size_bytes_count_hold_lasting_duration_major != null}, lasting='${var.postfix_showq_message_size_bytes_count_hold_lasting_duration_major}', at_least=${var.postfix_showq_message_size_bytes_count_hold_at_least_percentage_major}%{endif}) and (not when(signal > ${var.postfix_showq_message_size_bytes_count_hold_threshold_critical}%{if var.postfix_showq_message_size_bytes_count_hold_lasting_duration_critical != null}, lasting='${var.postfix_showq_message_size_bytes_count_hold_lasting_duration_critical}', at_least=${var.postfix_showq_message_size_bytes_count_hold_at_least_percentage_critical}%{endif}))).publish('MAJOR') +EOF + + rule { + description = 
"is too high > ${var.postfix_showq_message_size_bytes_count_hold_threshold_critical}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.postfix_showq_message_size_bytes_count_hold_disabled_critical, var.postfix_showq_message_size_bytes_count_hold_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.postfix_showq_message_size_bytes_count_hold_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.postfix_showq_message_size_bytes_count_hold_runbook_url, var.runbook_url), "") + tip = var.postfix_showq_message_size_bytes_count_hold_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + rule { + description = "is too high > ${var.postfix_showq_message_size_bytes_count_hold_threshold_major}" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.postfix_showq_message_size_bytes_count_hold_disabled_major, var.postfix_showq_message_size_bytes_count_hold_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.postfix_showq_message_size_bytes_count_hold_notifications, "major", []), var.notifications.major), null) + runbook_url = try(coalesce(var.postfix_showq_message_size_bytes_count_hold_runbook_url, var.runbook_url), "") + tip = var.postfix_showq_message_size_bytes_count_hold_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body + } + + max_delay = var.postfix_showq_message_size_bytes_count_hold_max_delay +} + +resource "signalfx_detector" "postfix_showq_message_size_bytes_count_maildrop" { + name = format("%s %s", local.detector_name_prefix, "Postfix size postfix queue maildrop") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + base_filtering = filter('queue', 'maildrop') + signal = data('postfix_showq_message_size_bytes_count', filter=base_filtering and ${module.filtering.signalflow})${var.postfix_showq_message_size_bytes_count_maildrop_aggregation_function}${var.postfix_showq_message_size_bytes_count_maildrop_transformation_function}.publish('signal') + detect(when(signal > ${var.postfix_showq_message_size_bytes_count_maildrop_threshold_critical}%{if var.postfix_showq_message_size_bytes_count_maildrop_lasting_duration_critical != null}, lasting='${var.postfix_showq_message_size_bytes_count_maildrop_lasting_duration_critical}', at_least=${var.postfix_showq_message_size_bytes_count_maildrop_at_least_percentage_critical}%{endif})).publish('CRIT') + detect(when(signal > ${var.postfix_showq_message_size_bytes_count_maildrop_threshold_major}%{if var.postfix_showq_message_size_bytes_count_maildrop_lasting_duration_major != null}, lasting='${var.postfix_showq_message_size_bytes_count_maildrop_lasting_duration_major}', at_least=${var.postfix_showq_message_size_bytes_count_maildrop_at_least_percentage_major}%{endif}) and (not when(signal > ${var.postfix_showq_message_size_bytes_count_maildrop_threshold_critical}%{if var.postfix_showq_message_size_bytes_count_maildrop_lasting_duration_critical != null}, lasting='${var.postfix_showq_message_size_bytes_count_maildrop_lasting_duration_critical}', 
at_least=${var.postfix_showq_message_size_bytes_count_maildrop_at_least_percentage_critical}%{endif}))).publish('MAJOR') +EOF + + rule { + description = "is too high > ${var.postfix_showq_message_size_bytes_count_maildrop_threshold_critical}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.postfix_showq_message_size_bytes_count_maildrop_disabled_critical, var.postfix_showq_message_size_bytes_count_maildrop_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.postfix_showq_message_size_bytes_count_maildrop_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.postfix_showq_message_size_bytes_count_maildrop_runbook_url, var.runbook_url), "") + tip = var.postfix_showq_message_size_bytes_count_maildrop_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + rule { + description = "is too high > ${var.postfix_showq_message_size_bytes_count_maildrop_threshold_major}" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.postfix_showq_message_size_bytes_count_maildrop_disabled_major, var.postfix_showq_message_size_bytes_count_maildrop_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.postfix_showq_message_size_bytes_count_maildrop_notifications, "major", []), var.notifications.major), null) + runbook_url = try(coalesce(var.postfix_showq_message_size_bytes_count_maildrop_runbook_url, var.runbook_url), "") + tip = var.postfix_showq_message_size_bytes_count_maildrop_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body + } + + max_delay = var.postfix_showq_message_size_bytes_count_maildrop_max_delay +} + +resource "signalfx_detector" "postfix_smtp_delivery_delay_seconds_count" { + name = format("%s %s", local.detector_name_prefix, "Postfix size postfix delivery delay") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + base_filtering = filter('queue', 'maildrop') + signal = data('postfix_smtp_delivery_delay_seconds_count', filter=base_filtering and ${module.filtering.signalflow})${var.postfix_smtp_delivery_delay_seconds_count_aggregation_function}${var.postfix_smtp_delivery_delay_seconds_count_transformation_function}.publish('signal') + detect(when(signal > ${var.postfix_smtp_delivery_delay_seconds_count_threshold_critical}%{if var.postfix_smtp_delivery_delay_seconds_count_lasting_duration_critical != null}, lasting='${var.postfix_smtp_delivery_delay_seconds_count_lasting_duration_critical}', at_least=${var.postfix_smtp_delivery_delay_seconds_count_at_least_percentage_critical}%{endif})).publish('CRIT') + detect(when(signal > ${var.postfix_smtp_delivery_delay_seconds_count_threshold_major}%{if var.postfix_smtp_delivery_delay_seconds_count_lasting_duration_major != null}, lasting='${var.postfix_smtp_delivery_delay_seconds_count_lasting_duration_major}', at_least=${var.postfix_smtp_delivery_delay_seconds_count_at_least_percentage_major}%{endif}) and (not when(signal > ${var.postfix_smtp_delivery_delay_seconds_count_threshold_critical}%{if var.postfix_smtp_delivery_delay_seconds_count_lasting_duration_critical != null}, lasting='${var.postfix_smtp_delivery_delay_seconds_count_lasting_duration_critical}', at_least=${var.postfix_smtp_delivery_delay_seconds_count_at_least_percentage_critical}%{endif}))).publish('MAJOR') +EOF + + rule { + description = "is too high > 
${var.postfix_smtp_delivery_delay_seconds_count_threshold_critical}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.postfix_smtp_delivery_delay_seconds_count_disabled_critical, var.postfix_smtp_delivery_delay_seconds_count_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.postfix_smtp_delivery_delay_seconds_count_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.postfix_smtp_delivery_delay_seconds_count_runbook_url, var.runbook_url), "") + tip = var.postfix_smtp_delivery_delay_seconds_count_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + rule { + description = "is too high > ${var.postfix_smtp_delivery_delay_seconds_count_threshold_major}" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.postfix_smtp_delivery_delay_seconds_count_disabled_major, var.postfix_smtp_delivery_delay_seconds_count_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.postfix_smtp_delivery_delay_seconds_count_notifications, "major", []), var.notifications.major), null) + runbook_url = try(coalesce(var.postfix_smtp_delivery_delay_seconds_count_runbook_url, var.runbook_url), "") + tip = var.postfix_smtp_delivery_delay_seconds_count_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body
+  }
+
+  max_delay = var.postfix_smtp_delivery_delay_seconds_count_max_delay
+}
+
diff --git a/modules/prometheus-exporter_postfix/outputs.tf b/modules/prometheus-exporter_postfix/outputs.tf
new file mode 100644
index 000000000..c816c0800
--- /dev/null
+++ b/modules/prometheus-exporter_postfix/outputs.tf
@@ -0,0 +1,25 @@
+output "heartbeat" { # each output in this file returns the whole signalfx_detector resource object, not a single attribute
+  description = "Detector resource for heartbeat"
+  value       = signalfx_detector.heartbeat
+}
+
+output "postfix_showq_message_size_bytes_count_deferred" {
+  description = "Detector resource for postfix_showq_message_size_bytes_count_deferred"
+  value       = signalfx_detector.postfix_showq_message_size_bytes_count_deferred
+}
+
+output "postfix_showq_message_size_bytes_count_hold" {
+  description = "Detector resource for postfix_showq_message_size_bytes_count_hold"
+  value       = signalfx_detector.postfix_showq_message_size_bytes_count_hold
+}
+
+output "postfix_showq_message_size_bytes_count_maildrop" {
+  description = "Detector resource for postfix_showq_message_size_bytes_count_maildrop"
+  value       = signalfx_detector.postfix_showq_message_size_bytes_count_maildrop
+}
+
+output "postfix_smtp_delivery_delay_seconds_count" {
+  description = "Detector resource for postfix_smtp_delivery_delay_seconds_count"
+  value       = signalfx_detector.postfix_smtp_delivery_delay_seconds_count
+}
+
diff --git a/modules/prometheus-exporter_postfix/tags.tf b/modules/prometheus-exporter_postfix/tags.tf
new file mode 100644
index 000000000..d8c3398d2
--- /dev/null
+++ b/modules/prometheus-exporter_postfix/tags.tf
@@ -0,0 +1,4 @@
+locals {
+  tags = ["prometheus-exporter", "postfix"] # NOTE(review): presumably read as local.tags by the detectors of this module - confirm in detectors-gen.tf
+}
+
diff --git a/modules/prometheus-exporter_postfix/variables-gen.tf b/modules/prometheus-exporter_postfix/variables-gen.tf
new file mode 100644
index 000000000..3089c2425
--- /dev/null
+++ b/modules/prometheus-exporter_postfix/variables-gen.tf
@@ -0,0 +1,410 @@
+# heartbeat detector: notification overrides, aggregation, max delay, tip/runbook, disable switch, stopped-VM exclusion and timeframe
+
+variable "heartbeat_notifications" {
+  description = "Notification recipients list per severity overridden for heartbeat detector"
+  type        = map(list(string))
+  default     = {} # empty map: no heartbeat-specific recipients declared by default
+}
+
+variable "heartbeat_aggregation_function" {
+  description = "Aggregation function and group by for heartbeat detector (i.e. \".mean(by=['host'])\")"
+  type        = string
+  default     = ""
+}
+
+variable "heartbeat_max_delay" {
+  description = "Enforce max delay for heartbeat detector (use \"0\" or \"null\" for \"Auto\")"
+  type        = number
+  default     = null
+}
+
+variable "heartbeat_tip" {
+  description = "Suggested first course of action or any note useful for incident handling"
+  type        = string
+  default     = ""
+}
+
+variable "heartbeat_runbook_url" {
+  description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause"
+  type        = string
+  default     = ""
+}
+
+variable "heartbeat_disabled" {
+  description = "Disable all alerting rules for heartbeat detector"
+  type        = bool
+  default     = null # null = left unset here; how null is resolved is not visible in this file
+}
+
+variable "heartbeat_exclude_not_running_vm" {
+  description = "Don’t send alerts if associated VM is stopped or stopping (metadata provided by cloud provider integration). Can be useful for ephemeral infrastructure (such as auto scaling groups) as VM will be stopped and started regularly. Note that timeframe must be at least 25 minutes for the metadata to be available to the detector."
+  type        = bool
+  default     = true # on by default; the description above requires a timeframe of at least 25 minutes
+}
+
+variable "heartbeat_timeframe" {
+  description = "Timeframe for heartbeat detector (i.e. \"25m\"). Must be at least \"25m\" if \"heartbeat_exclude_not_running_vm\" is true"
+  type        = string
+  default     = "25m" # equals the minimum the description allows while heartbeat_exclude_not_running_vm is true
+}
+
+# postfix_showq_message_size_bytes_count_deferred detector: common knobs plus a transformation function and per-severity (critical/major) disable, threshold, lasting duration and at-least percentage
+
+variable "postfix_showq_message_size_bytes_count_deferred_notifications" {
+  description = "Notification recipients list per severity overridden for postfix_showq_message_size_bytes_count_deferred detector"
+  type        = map(list(string))
+  default     = {}
+}
+
+variable "postfix_showq_message_size_bytes_count_deferred_aggregation_function" {
+  description = "Aggregation function and group by for postfix_showq_message_size_bytes_count_deferred detector (i.e. \".mean(by=['host'])\")"
+  type        = string
+  default     = ""
+}
+
+variable "postfix_showq_message_size_bytes_count_deferred_transformation_function" {
+  description = "Transformation function for postfix_showq_message_size_bytes_count_deferred detector (i.e. \".mean(over='5m')\")"
+  type        = string
+  default     = ".min(over='30m')" # all four Postfix detectors in this file ship this same default
+}
+
+variable "postfix_showq_message_size_bytes_count_deferred_max_delay" {
+  description = "Enforce max delay for postfix_showq_message_size_bytes_count_deferred detector (use \"0\" or \"null\" for \"Auto\")"
+  type        = number
+  default     = null
+}
+
+variable "postfix_showq_message_size_bytes_count_deferred_tip" {
+  description = "Suggested first course of action or any note useful for incident handling"
+  type        = string
+  default     = ""
+}
+
+variable "postfix_showq_message_size_bytes_count_deferred_runbook_url" {
+  description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause"
+  type        = string
+  default     = ""
+}
+
+variable "postfix_showq_message_size_bytes_count_deferred_disabled" {
+  description = "Disable all alerting rules for postfix_showq_message_size_bytes_count_deferred detector"
+  type        = bool
+  default     = null # null = left unset here; how null is resolved is not visible in this file
+}
+
+variable "postfix_showq_message_size_bytes_count_deferred_disabled_critical" {
+  description = "Disable critical alerting rule for postfix_showq_message_size_bytes_count_deferred detector"
+  type        = bool
+  default     = null
+}
+
+variable "postfix_showq_message_size_bytes_count_deferred_disabled_major" {
+  description = "Disable major alerting rule for postfix_showq_message_size_bytes_count_deferred detector"
+  type        = bool
+  default     = null
+}
+
+variable "postfix_showq_message_size_bytes_count_deferred_threshold_critical" {
+  description = "Critical threshold for postfix_showq_message_size_bytes_count_deferred detector"
+  type        = number
+  default     = 600 # above the major default (300); NOTE(review): unit presumably a message count given the _count suffix - confirm
+}
+
+variable "postfix_showq_message_size_bytes_count_deferred_lasting_duration_critical" {
+  description = "Minimum duration that conditions must be true before raising alert"
+  type        = string
+  default     = null
+}
+
+variable "postfix_showq_message_size_bytes_count_deferred_at_least_percentage_critical" {
+  description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)"
+  type        = number
+  default     = 1 # 1 = top of the documented 0.0-1.0 range
+}
+variable "postfix_showq_message_size_bytes_count_deferred_threshold_major" {
+  description = "Major threshold for postfix_showq_message_size_bytes_count_deferred detector"
+  type        = number
+  default     = 300 # below the critical default (600)
+}
+
+variable "postfix_showq_message_size_bytes_count_deferred_lasting_duration_major" {
+  description = "Minimum duration that conditions must be true before raising alert"
+  type        = string
+  default     = null
+}
+
+variable "postfix_showq_message_size_bytes_count_deferred_at_least_percentage_major" {
+  description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)"
+  type        = number
+  default     = 1 # 1 = top of the documented 0.0-1.0 range
+}
+# postfix_showq_message_size_bytes_count_hold detector: common knobs plus a transformation function and per-severity (critical/major) disable, threshold, lasting duration and at-least percentage
+
+variable "postfix_showq_message_size_bytes_count_hold_notifications" {
+  description = "Notification recipients list per severity overridden for postfix_showq_message_size_bytes_count_hold detector"
+  type        = map(list(string))
+  default     = {}
+}
+
+variable "postfix_showq_message_size_bytes_count_hold_aggregation_function" {
+  description = "Aggregation function and group by for postfix_showq_message_size_bytes_count_hold detector (i.e. \".mean(by=['host'])\")"
+  type        = string
+  default     = ""
+}
+
+variable "postfix_showq_message_size_bytes_count_hold_transformation_function" {
+  description = "Transformation function for postfix_showq_message_size_bytes_count_hold detector (i.e. \".mean(over='5m')\")"
+  type        = string
+  default     = ".min(over='30m')" # all four Postfix detectors in this file ship this same default
+}
+
+variable "postfix_showq_message_size_bytes_count_hold_max_delay" {
+  description = "Enforce max delay for postfix_showq_message_size_bytes_count_hold detector (use \"0\" or \"null\" for \"Auto\")"
+  type        = number
+  default     = null
+}
+
+variable "postfix_showq_message_size_bytes_count_hold_tip" {
+  description = "Suggested first course of action or any note useful for incident handling"
+  type        = string
+  default     = ""
+}
+
+variable "postfix_showq_message_size_bytes_count_hold_runbook_url" {
+  description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause"
+  type        = string
+  default     = ""
+}
+
+variable "postfix_showq_message_size_bytes_count_hold_disabled" {
+  description = "Disable all alerting rules for postfix_showq_message_size_bytes_count_hold detector"
+  type        = bool
+  default     = null # null = left unset here; how null is resolved is not visible in this file
+}
+
+variable "postfix_showq_message_size_bytes_count_hold_disabled_critical" {
+  description = "Disable critical alerting rule for postfix_showq_message_size_bytes_count_hold detector"
+  type        = bool
+  default     = null
+}
+
+variable "postfix_showq_message_size_bytes_count_hold_disabled_major" {
+  description = "Disable major alerting rule for postfix_showq_message_size_bytes_count_hold detector"
+  type        = bool
+  default     = null
+}
+
+variable "postfix_showq_message_size_bytes_count_hold_threshold_critical" {
+  description = "Critical threshold for postfix_showq_message_size_bytes_count_hold detector"
+  type        = number
+  default     = 600 # above the major default (300); NOTE(review): unit presumably a message count given the _count suffix - confirm
+}
+
+variable "postfix_showq_message_size_bytes_count_hold_lasting_duration_critical" {
+  description = "Minimum duration that conditions must be true before raising alert"
+  type        = string
+  default     = null
+}
+
+variable "postfix_showq_message_size_bytes_count_hold_at_least_percentage_critical" {
+  description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)"
+  type        = number
+  default     = 1 # 1 = top of the documented 0.0-1.0 range
+}
+variable "postfix_showq_message_size_bytes_count_hold_threshold_major" {
+  description = "Major threshold for postfix_showq_message_size_bytes_count_hold detector"
+  type        = number
+  default     = 300 # below the critical default (600)
+}
+
+variable "postfix_showq_message_size_bytes_count_hold_lasting_duration_major" {
+  description = "Minimum duration that conditions must be true before raising alert"
+  type        = string
+  default     = null
+}
+
+variable "postfix_showq_message_size_bytes_count_hold_at_least_percentage_major" {
+  description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)"
+  type        = number
+  default     = 1 # 1 = top of the documented 0.0-1.0 range
+}
+# postfix_showq_message_size_bytes_count_maildrop detector: common knobs plus a transformation function and per-severity (critical/major) disable, threshold, lasting duration and at-least percentage
+
+variable "postfix_showq_message_size_bytes_count_maildrop_notifications" {
+  description = "Notification recipients list per severity overridden for postfix_showq_message_size_bytes_count_maildrop detector"
+  type        = map(list(string))
+  default     = {}
+}
+
+variable "postfix_showq_message_size_bytes_count_maildrop_aggregation_function" {
+  description = "Aggregation function and group by for postfix_showq_message_size_bytes_count_maildrop detector (i.e. \".mean(by=['host'])\")"
+  type        = string
+  default     = ""
+}
+
+variable "postfix_showq_message_size_bytes_count_maildrop_transformation_function" {
+  description = "Transformation function for postfix_showq_message_size_bytes_count_maildrop detector (i.e. \".mean(over='5m')\")"
+  type        = string
+  default     = ".min(over='30m')" # all four Postfix detectors in this file ship this same default
+}
+
+variable "postfix_showq_message_size_bytes_count_maildrop_max_delay" {
+  description = "Enforce max delay for postfix_showq_message_size_bytes_count_maildrop detector (use \"0\" or \"null\" for \"Auto\")"
+  type        = number
+  default     = null
+}
+
+variable "postfix_showq_message_size_bytes_count_maildrop_tip" {
+  description = "Suggested first course of action or any note useful for incident handling"
+  type        = string
+  default     = ""
+}
+
+variable "postfix_showq_message_size_bytes_count_maildrop_runbook_url" {
+  description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause"
+  type        = string
+  default     = ""
+}
+
+variable "postfix_showq_message_size_bytes_count_maildrop_disabled" {
+  description = "Disable all alerting rules for postfix_showq_message_size_bytes_count_maildrop detector"
+  type        = bool
+  default     = null # null = left unset here; how null is resolved is not visible in this file
+}
+
+variable "postfix_showq_message_size_bytes_count_maildrop_disabled_critical" {
+  description = "Disable critical alerting rule for postfix_showq_message_size_bytes_count_maildrop detector"
+  type        = bool
+  default     = null
+}
+
+variable "postfix_showq_message_size_bytes_count_maildrop_disabled_major" {
+  description = "Disable major alerting rule for postfix_showq_message_size_bytes_count_maildrop detector"
+  type        = bool
+  default     = null
+}
+
+variable "postfix_showq_message_size_bytes_count_maildrop_threshold_critical" {
+  description = "Critical threshold for postfix_showq_message_size_bytes_count_maildrop detector"
+  type        = number
+  default     = 600 # above the major default (300); NOTE(review): unit presumably a message count given the _count suffix - confirm
+}
+
+variable "postfix_showq_message_size_bytes_count_maildrop_lasting_duration_critical" {
+  description = "Minimum duration that conditions must be true before raising alert"
+  type        = string
+  default     = null
+}
+
+variable "postfix_showq_message_size_bytes_count_maildrop_at_least_percentage_critical" {
+  description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)"
+  type        = number
+  default     = 1 # 1 = top of the documented 0.0-1.0 range
+}
+variable "postfix_showq_message_size_bytes_count_maildrop_threshold_major" {
+  description = "Major threshold for postfix_showq_message_size_bytes_count_maildrop detector"
+  type        = number
+  default     = 300 # below the critical default (600)
+}
+
+variable "postfix_showq_message_size_bytes_count_maildrop_lasting_duration_major" {
+  description = "Minimum duration that conditions must be true before raising alert"
+  type        = string
+  default     = null
+}
+
+variable "postfix_showq_message_size_bytes_count_maildrop_at_least_percentage_major" {
+  description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)"
+  type        = number
+  default     = 1 # 1 = top of the documented 0.0-1.0 range
+}
+# postfix_smtp_delivery_delay_seconds_count detector: common knobs plus a transformation function and per-severity (critical/major) disable, threshold, lasting duration and at-least percentage
+
+variable "postfix_smtp_delivery_delay_seconds_count_notifications" {
+  description = "Notification recipients list per severity overridden for postfix_smtp_delivery_delay_seconds_count detector"
+  type        = map(list(string))
+  default     = {}
+}
+
+variable "postfix_smtp_delivery_delay_seconds_count_aggregation_function" {
+  description = "Aggregation function and group by for postfix_smtp_delivery_delay_seconds_count detector (i.e. \".mean(by=['host'])\")"
+  type        = string
+  default     = ""
+}
+
+variable "postfix_smtp_delivery_delay_seconds_count_transformation_function" {
+  description = "Transformation function for postfix_smtp_delivery_delay_seconds_count detector (i.e. \".mean(over='5m')\")"
+  type        = string
+  default     = ".min(over='30m')" # all four Postfix detectors in this file ship this same default
+}
+
+variable "postfix_smtp_delivery_delay_seconds_count_max_delay" {
+  description = "Enforce max delay for postfix_smtp_delivery_delay_seconds_count detector (use \"0\" or \"null\" for \"Auto\")"
+  type        = number
+  default     = null # assigned to a max_delay argument at the end of the preceding file in this patch
+}
+
+variable "postfix_smtp_delivery_delay_seconds_count_tip" {
+  description = "Suggested first course of action or any note useful for incident handling"
+  type        = string
+  default     = ""
+}
+
+variable "postfix_smtp_delivery_delay_seconds_count_runbook_url" {
+  description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause"
+  type        = string
+  default     = ""
+}
+
+variable "postfix_smtp_delivery_delay_seconds_count_disabled" {
+  description = "Disable all alerting rules for postfix_smtp_delivery_delay_seconds_count detector"
+  type        = bool
+  default     = null # null = left unset here; how null is resolved is not visible in this file
+}
+
+variable "postfix_smtp_delivery_delay_seconds_count_disabled_critical" {
+  description = "Disable critical alerting rule for postfix_smtp_delivery_delay_seconds_count detector"
+  type        = bool
+  default     = null
+}
+
+variable "postfix_smtp_delivery_delay_seconds_count_disabled_major" {
+  description = "Disable major alerting rule for postfix_smtp_delivery_delay_seconds_count detector"
+  type        = bool
+  default     = null
+}
+
+variable "postfix_smtp_delivery_delay_seconds_count_threshold_critical" {
+  description = "Critical threshold for postfix_smtp_delivery_delay_seconds_count detector"
+  type        = number
+  default     = 60 # above the major default (45); NOTE(review): name ends in _seconds_count - confirm whether this is compared to seconds or to a sample count
+}
+
+variable "postfix_smtp_delivery_delay_seconds_count_lasting_duration_critical" {
+  description = "Minimum duration that conditions must be true before raising alert"
+  type        = string
+  default     = null
+}
+
+variable "postfix_smtp_delivery_delay_seconds_count_at_least_percentage_critical" {
+  description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)"
+  type        = number
+  default     = 1 # 1 = top of the documented 0.0-1.0 range
+}
+variable "postfix_smtp_delivery_delay_seconds_count_threshold_major" {
+  description = "Major threshold for postfix_smtp_delivery_delay_seconds_count detector"
+  type        = number
+  default     = 45 # below the critical default (60)
+}
+
+variable "postfix_smtp_delivery_delay_seconds_count_lasting_duration_major" {
+  description = "Minimum duration that conditions must be true before raising alert"
+  type        = string
+  default     = null
+}
+
+variable "postfix_smtp_delivery_delay_seconds_count_at_least_percentage_major" {
+  description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)"
+  type        = number
+  default     = 1 # 1 = top of the documented 0.0-1.0 range
+}