diff --git a/docs/severity.md b/docs/severity.md index 36adaa1f4..149003026 100644 --- a/docs/severity.md +++ b/docs/severity.md @@ -66,6 +66,7 @@ - [integration_azure-virtual-machine-scaleset](#integration_azure-virtual-machine-scaleset) - [integration_azure-virtual-machine](#integration_azure-virtual-machine) - [integration_gcp-bigquery](#integration_gcp-bigquery) +- [integration_gcp-cloud-run](#integration_gcp-cloud-run) - [integration_gcp-cloud-sql-common](#integration_gcp-cloud-sql-common) - [integration_gcp-cloud-sql-failover](#integration_gcp-cloud-sql-failover) - [integration_gcp-cloud-sql-mysql](#integration_gcp-cloud-sql-mysql) @@ -79,9 +80,11 @@ - [organization_usage](#organization_usage) - [otel-collector_kubernetes-common](#otel-collector_kubernetes-common) - [prometheus-exporter_active-directory](#prometheus-exporter_active-directory) +- [prometheus-exporter_dnsmasq](#prometheus-exporter_dnsmasq) - [prometheus-exporter_docker-state](#prometheus-exporter_docker-state) - [prometheus-exporter_kong](#prometheus-exporter_kong) - [prometheus-exporter_oracledb](#prometheus-exporter_oracledb) +- [prometheus-exporter_postfix](#prometheus-exporter_postfix) - [prometheus-exporter_squid](#prometheus-exporter_squid) - [prometheus-exporter_varnish](#prometheus-exporter_varnish) - [prometheus-exporter_wallix-bastion](#prometheus-exporter_wallix-bastion) @@ -738,6 +741,15 @@ |GCP BigQuery uploaded bytes billed|X|X|-|-|-| +## integration_gcp-cloud-run + +|Detector|Critical|Major|Minor|Warning|Info| +|---|---|---|---|---|---| +|GCP Cloud Run container count|X|-|-|-|-| +|GCP Cloud Run cpu utilizations|X|X|-|-|-| +|GCP Cloud Run memory utilizations|X|X|-|-|-| + + ## integration_gcp-cloud-sql-common |Detector|Critical|Major|Minor|Warning|Info| @@ -871,6 +883,15 @@ |Active-directory active directory services|X|-|-|-|-| +## prometheus-exporter_dnsmasq + +|Detector|Critical|Major|Minor|Warning|Info| +|---|---|---|---|---|---| +|Dnsmasq heartbeat|X|-|-|-|-| +|Dnsmasq 
hits|X|-|-|-|-| +|Dnsmasq hit rate|-|X|X|-|-| + + ## prometheus-exporter_docker-state |Detector|Critical|Major|Minor|Warning|Info| @@ -897,6 +918,17 @@ |Oracle database status|X|-|-|-|-| +## prometheus-exporter_postfix + +|Detector|Critical|Major|Minor|Warning|Info| +|---|---|---|---|---|---| +|Postfix heartbeat|X|-|-|-|-| +|Postfix size postfix queue deferred|X|X|-|-|-| +|Postfix size postfix queue hold|X|X|-|-|-| +|Postfix size postfix queue maildrop|X|X|-|-|-| +|Postfix size postfix delivery delay|X|X|-|-|-| + + ## prometheus-exporter_squid |Detector|Critical|Major|Minor|Warning|Info| diff --git a/modules/integration_aws-elasticsearch/conf/01-jvm-memory-pressure.yaml b/modules/integration_aws-elasticsearch/conf/01-jvm-memory-pressure.yaml index a409edd83..b09233081 100644 --- a/modules/integration_aws-elasticsearch/conf/01-jvm-memory-pressure.yaml +++ b/modules/integration_aws-elasticsearch/conf/01-jvm-memory-pressure.yaml @@ -1,7 +1,6 @@ module: AWS Elasticsearch name: JVM Memory Pressure -transformation: ".min(over='15m')" aggregation: true filtering: "filter('namespace', 'AWS/ES') and filter('stat', 'upper') and filter('NodeId', '*')" value_unit: "%" @@ -15,7 +14,9 @@ rules: critical: threshold: 90 comparator: ">" + lasting_duration: "15m" major: threshold: 80 comparator: ">" dependency: critical + lasting_duration: "15m" diff --git a/modules/integration_aws-elasticsearch/conf/05-cluster-status.yaml b/modules/integration_aws-elasticsearch/conf/05-cluster-status.yaml index 4e6f7329c..8d3b3960f 100644 --- a/modules/integration_aws-elasticsearch/conf/05-cluster-status.yaml +++ b/modules/integration_aws-elasticsearch/conf/05-cluster-status.yaml @@ -1,7 +1,6 @@ module: AWS Elasticsearch name: Cluster status -aggregation: ".min(over='15m')" filtering: "filter('namespace', 'AWS/ES') and filter('stat', 'upper')" signals: @@ -18,8 +17,10 @@ rules: comparator: ">=" description: "is red" signal: red + lasting_duration: "15m" major: threshold: 1 comparator: ">=" 
description: "is yellow" signal: yellow + lasting_duration: "15m" diff --git a/modules/integration_aws-elasticsearch/conf/08-cluster-cpu.yaml b/modules/integration_aws-elasticsearch/conf/08-cluster-cpu.yaml index 9249b2cb5..b1800b639 100644 --- a/modules/integration_aws-elasticsearch/conf/08-cluster-cpu.yaml +++ b/modules/integration_aws-elasticsearch/conf/08-cluster-cpu.yaml @@ -2,7 +2,6 @@ module: AWS Elasticsearch id: "cluster_cpu" name: "CPU utilization" -transformation: ".min(over='45m')" aggregation: "" filtering: "filter('namespace', 'AWS/ES') and filter('stat', 'upper') and filter('NodeId', '*')" @@ -20,6 +19,8 @@ rules: threshold: 80 comparator: ">" dependency: critical + lasting_duration: "45m" critical: threshold: 90 comparator: ">" + lasting_duration: "45m" diff --git a/modules/integration_aws-elasticsearch/conf/09-master-cpu.yaml b/modules/integration_aws-elasticsearch/conf/09-master-cpu.yaml index 04385c8d1..ac054bc27 100644 --- a/modules/integration_aws-elasticsearch/conf/09-master-cpu.yaml +++ b/modules/integration_aws-elasticsearch/conf/09-master-cpu.yaml @@ -2,7 +2,6 @@ module: AWS Elasticsearch id: "master_cpu" name: "Master CPU utilization" -transformation: ".min(over='20m')" aggregation: "" filtering: "filter('namespace', 'AWS/ES') and filter('stat', 'upper') and filter('NodeId', '*')" @@ -15,6 +14,8 @@ rules: threshold: 60 comparator: ">" dependency: critical + lasting_duration: "20m" critical: threshold: 70 comparator: ">" + lasting_duration: "20m" diff --git a/modules/integration_aws-elasticsearch/variables-gen.tf b/modules/integration_aws-elasticsearch/variables-gen.tf index b9b5939f0..e5f406bd3 100644 --- a/modules/integration_aws-elasticsearch/variables-gen.tf +++ b/modules/integration_aws-elasticsearch/variables-gen.tf @@ -59,7 +59,7 @@ variable "jvm_memory_pressure_aggregation_function" { variable "jvm_memory_pressure_transformation_function" { description = "Transformation function for jvm_memory_pressure detector (i.e. 
\".mean(over='5m')\")" type = string - default = ".min(over='15m')" + default = "" } variable "jvm_memory_pressure_max_delay" { @@ -109,7 +109,7 @@ variable "jvm_memory_pressure_threshold_critical" { variable "jvm_memory_pressure_lasting_duration_critical" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "15m" } variable "jvm_memory_pressure_at_least_percentage_critical" { @@ -126,7 +126,7 @@ variable "jvm_memory_pressure_threshold_major" { variable "jvm_memory_pressure_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "15m" } variable "jvm_memory_pressure_at_least_percentage_major" { @@ -467,7 +467,7 @@ variable "cluster_status_notifications" { variable "cluster_status_aggregation_function" { description = "Aggregation function and group by for cluster_status detector (i.e. \".mean(by=['host'])\")" type = string - default = ".min(over='15m')" + default = "" } variable "cluster_status_transformation_function" { @@ -521,7 +521,7 @@ variable "cluster_status_threshold_critical" { variable "cluster_status_lasting_duration_critical" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "15m" } variable "cluster_status_at_least_percentage_critical" { @@ -538,7 +538,7 @@ variable "cluster_status_threshold_major" { variable "cluster_status_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "15m" } variable "cluster_status_at_least_percentage_major" { @@ -725,7 +725,7 @@ variable "cluster_cpu_notifications" { variable "cluster_cpu_transformation_function" { description = "Transformation function for cluster_cpu detector (i.e. 
\".mean(over='5m')\")" type = string - default = ".min(over='45m')" + default = "" } variable "cluster_cpu_max_delay" { @@ -773,7 +773,7 @@ variable "cluster_cpu_threshold_major" { variable "cluster_cpu_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "45m" } variable "cluster_cpu_at_least_percentage_major" { @@ -790,7 +790,7 @@ variable "cluster_cpu_threshold_critical" { variable "cluster_cpu_lasting_duration_critical" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "45m" } variable "cluster_cpu_at_least_percentage_critical" { @@ -809,7 +809,7 @@ variable "master_cpu_notifications" { variable "master_cpu_transformation_function" { description = "Transformation function for master_cpu detector (i.e. \".mean(over='5m')\")" type = string - default = ".min(over='20m')" + default = "" } variable "master_cpu_max_delay" { @@ -857,7 +857,7 @@ variable "master_cpu_threshold_major" { variable "master_cpu_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "20m" } variable "master_cpu_at_least_percentage_major" { @@ -874,7 +874,7 @@ variable "master_cpu_threshold_critical" { variable "master_cpu_lasting_duration_critical" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "20m" } variable "master_cpu_at_least_percentage_critical" { diff --git a/modules/integration_gcp-cloud-run/README.md b/modules/integration_gcp-cloud-run/README.md new file mode 100644 index 000000000..0077314a6 --- /dev/null +++ b/modules/integration_gcp-cloud-run/README.md @@ -0,0 +1,229 @@ +# GCP-CLOUD-RUN SignalFx detectors + + + +:link: **Contents** + +- [How to use this module?](#how-to-use-this-module) +- [What are the available detectors in this 
module?](#what-are-the-available-detectors-in-this-module) +- [How to collect required metrics?](#how-to-collect-required-metrics) + - [Metrics](#metrics) +- [Notes](#notes) + - [Metadata configuration for default filtering](#metadata-configuration-for-default-filtering) + - [CPU utilizations](#cpu-utilizations) + - [Memory utilizations](#memory-utilizations) +- [Related documentation](#related-documentation) + + + +## How to use this module? + +This directory defines a [Terraform](https://www.terraform.io/) +[module](https://www.terraform.io/language/modules/syntax) you can use in your +existing [stack](https://github.com/claranet/terraform-signalfx-detectors/wiki/Getting-started#stack) by adding a +`module` configuration and setting its `source` parameter to URL of this folder: + +```hcl +module "signalfx-detectors-integration-gcp-cloud-run" { + source = "github.com/claranet/terraform-signalfx-detectors.git//modules/integration_gcp-cloud-run?ref={revision}" + + environment = var.environment + notifications = local.notifications + gcp_project_id = "fillme" +} +``` + +Note the following parameters: + +* `source`: Use this parameter to specify the URL of the module. The double slash (`//`) is intentional and required. + Terraform uses it to specify subfolders within a Git repo (see [module + sources](https://www.terraform.io/language/modules/sources)). The `ref` parameter specifies a specific Git tag in + this repository. It is recommended to use the latest "pinned" version in place of `{revision}`. Avoid using a branch + like `master` except for testing purpose. Note that every modules in this repository are available on the Terraform + [registry](https://registry.terraform.io/modules/claranet/detectors/signalfx) and we recommend using it as source + instead of `git` which is more flexible but less future-proof. 
+ +* `environment`: Use this parameter to specify the + [environment](https://github.com/claranet/terraform-signalfx-detectors/wiki/Getting-started#environment) used by this + instance of the module. + Its value will be added to the `prefixes` list at the start of the [detector + name](https://github.com/claranet/terraform-signalfx-detectors/wiki/Templating#example). + In general, it will also be used in the `filtering` internal sub-module to [apply + filters](https://github.com/claranet/terraform-signalfx-detectors/wiki/Guidance#filtering) based on our default + [tagging convention](https://github.com/claranet/terraform-signalfx-detectors/wiki/Tagging-convention) by default. + +* `notifications`: Use this parameter to define where alerts should be sent depending on their severity. It consists + of a Terraform [object](https://www.terraform.io/language/expressions/type-constraints#object) where each key represents an available + [detector rule severity](https://docs.splunk.com/observability/alerts-detectors-notifications/create-detectors-for-alerts.html#severity) + and its value is a list of recipients. Every recipients must respect the [detector notification + format](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/detector#notification-format). + Check the [notification binding](https://github.com/claranet/terraform-signalfx-detectors/wiki/Notifications-binding) + documentation to understand the recommended role of each severity. + +These 3 parameters along with all variables defined in [common-variables.tf](common-variables.tf) are common to all +[modules](../) in this repository. Other variables, specific to this module, are available in +[variables.tf](variables.tf) and [variables-gen.tf](variables-gen.tf). 
+In general, the default configuration "works" but all of these Terraform +[variables](https://www.terraform.io/language/values/variables) make it possible to +customize the detectors behavior to better fit your needs. + +Most of them represent usual tips and rules detailed in the +[guidance](https://github.com/claranet/terraform-signalfx-detectors/wiki/Guidance) documentation and listed in the +common [variables](https://github.com/claranet/terraform-signalfx-detectors/wiki/Variables) dedicated documentation. + +Feel free to explore the [wiki](https://github.com/claranet/terraform-signalfx-detectors/wiki) for more information about +general usage of this repository. + +## What are the available detectors in this module? + +This module creates the following SignalFx detectors which could contain one or multiple alerting rules: + +|Detector|Critical|Major|Minor|Warning|Info| +|---|---|---|---|---|---| +|GCP Cloud Run container count|X|-|-|-|-| +|GCP Cloud Run cpu utilizations|X|X|-|-|-| +|GCP Cloud Run memory utilizations|X|X|-|-|-| + +## How to collect required metrics? + +This module deploys detectors using metrics reported by the +[GCP integration](https://docs.splunk.com/observability/en/gdi/get-data-in/connect/gcp/gcp-metrics.html) configurable +with [this Terraform module](https://github.com/claranet/terraform-signalfx-integrations/tree/master/cloud/gcp). + + +Check the [Related documentation](#related-documentation) section for more detailed and specific information about this module dependencies. + + + +### Metrics + + +Here is the list of required metrics for detectors in this module. 
+ +* `container/containers` +* `container/cpu/utilizations` +* `container/memory/utilizations` + + +## Notes + + +### Metadata configuration for default filtering + +label to use : + +sfx_env = true +sfx_monitored = true + +For example: + +via gcloud, at the Cloud Run level: +``` +gcloud run deploy hello \ +--image=us-docker.pkg.dev/cloudrun/container/hello \ +--allow-unauthenticated \ +--port=8080 \ +--service-account=123456789-compute@developer.gserviceaccount.com \ +--region=europe-west9 \ +--project=claranet-425413 \ +--labels=sfx_env=true,sfx_monitored=true +``` +via terraform, [at the Cloud Run level](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/cloud_run_service#nested_metadata) +```hcl +resource "google_cloud_run_service" "hello" { + name = "hello" + location = "europe-west9" + + template { + spec { + containers { + image = "us-docker.pkg.dev/cloudrun/container/hello" + resources { + limits = { + cpu = "1000m" // adjust based on your needs + memory = "512Mi" // adjust based on your needs + } + } + ports { + name = "http1" // This name is a standard identifier (http1 or h2c) for the protocol + container_port = 8080 + } + } + service_account_name = "123456789-compute@developer.gserviceaccount.com" + } + + metadata { + annotations = { + "run.googleapis.com/launch-stage" = "BETA" // adjust this according to the launch stage of your application + } + labels = { + sfx_env = "true" + sfx_monitored = "true" + } + } + } + autogenerate_revision_name = true + + traffic { + percent = 100 + latest_revision = true + } + + project = "claranet-425413" +} +``` +You also **need** to check if those metadata are in the metadata `includeList` in your [SignalFx GCP +integration](https://dev.splunk.com/observability/docs/integrations/gcp_integration_overview/#Optional-fields). + +### CPU utilizations + +Monitoring the CPU utilization helps in understanding the system's capability and efficiency. 
+ +```hcl +module "signalfx-detectors-integration_gcp-cloud-run" { + source = "github.com/claranet/terraform-signalfx-detectors.git//modules/integration_gcp-cloud-run" + + environment = var.environment + gcp_project_id = var.project_id + notifications = local.notifications + + # We keep default filtering policy here, we just want to append additional filter to it + filtering_append = true + # We define the additional filter + filtering_custom = "filter('service_name', '*service-name*')" + # We can configure the thresholds of the probes + cpu_usage_threshold_critical = 85 + cpu_usage_threshold_major = 80 +} +``` + +### Memory utilizations + +Accurate tracking of memory usage aids in optimizing and improving performance. + +```hcl +module "signalfx-detectors-integration_gcp-cloud-run" { + source = "github.com/claranet/terraform-signalfx-detectors.git//modules/integration_gcp-cloud-run" + + environment = var.environment + gcp_project_id = var.project_id + notifications = local.notifications + + # We keep default filtering policy here, we just want to append additional filter to it + filtering_append = true + # We define the additional filter + filtering_custom = "filter('service_name', '*service-name*')" + # We can configure the thresholds of the probes + memory_usage_threshold_critical = 85 + memory_usage_threshold_major = 80 +} +``` + + +## Related documentation + +* [Terraform SignalFx provider](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs) +* [Terraform SignalFx detector](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/detector) +* [Splunk Observability integrations](https://docs.splunk.com/Observability/gdi/get-data-in/integrations.html) +* [Stackdriver metrics for Memorystore for Redis](https://cloud.google.com/monitoring/api/metrics_gcp#gcp-run) +* [Splunk Observability metrics](https://docs.splunk.com/observability/en/gdi/get-data-in/connect/gcp/gcp.html) diff --git 
a/modules/integration_gcp-cloud-run/common-locals.tf b/modules/integration_gcp-cloud-run/common-locals.tf new file mode 120000 index 000000000..5672d21ab --- /dev/null +++ b/modules/integration_gcp-cloud-run/common-locals.tf @@ -0,0 +1 @@ +../../common/module/locals.tf \ No newline at end of file diff --git a/modules/integration_gcp-cloud-run/common-modules.tf b/modules/integration_gcp-cloud-run/common-modules.tf new file mode 120000 index 000000000..8c81ef377 --- /dev/null +++ b/modules/integration_gcp-cloud-run/common-modules.tf @@ -0,0 +1 @@ +../../common/module/modules.tf \ No newline at end of file diff --git a/modules/integration_gcp-cloud-run/common-variables.tf b/modules/integration_gcp-cloud-run/common-variables.tf new file mode 120000 index 000000000..f3037a584 --- /dev/null +++ b/modules/integration_gcp-cloud-run/common-variables.tf @@ -0,0 +1 @@ +../../common/module/variables.tf \ No newline at end of file diff --git a/modules/integration_gcp-cloud-run/common-versions.tf b/modules/integration_gcp-cloud-run/common-versions.tf new file mode 120000 index 000000000..fa7f5509f --- /dev/null +++ b/modules/integration_gcp-cloud-run/common-versions.tf @@ -0,0 +1 @@ +../../common/module/versions.tf \ No newline at end of file diff --git a/modules/integration_gcp-cloud-run/conf/00-containers.yaml b/modules/integration_gcp-cloud-run/conf/00-containers.yaml new file mode 100644 index 000000000..358765d98 --- /dev/null +++ b/modules/integration_gcp-cloud-run/conf/00-containers.yaml @@ -0,0 +1,15 @@ +module: "GCP Cloud Run" +name: "Container count" + +transformation: true +aggregation: true + +signals: + signal: + metric: "container/containers" + +rules: + critical: + threshold: 0 + comparator: "==" + diff --git a/modules/integration_gcp-cloud-run/conf/01-cpu_utilizations.yaml b/modules/integration_gcp-cloud-run/conf/01-cpu_utilizations.yaml new file mode 100644 index 000000000..45d339d66 --- /dev/null +++ 
b/modules/integration_gcp-cloud-run/conf/01-cpu_utilizations.yaml @@ -0,0 +1,19 @@ +module: "GCP Cloud Run" +name: "CPU utilizations" + +value_unit: "%" +transformation: ".min(over='30m')" + +signals: + signal: + metric: "container/cpu/utilizations" + +rules: + critical: + threshold: 90 + comparator: ">" + + major: + threshold: 85 + comparator: ">" + dependency: "critical" diff --git a/modules/integration_gcp-cloud-run/conf/02-memory_utilizations.yaml b/modules/integration_gcp-cloud-run/conf/02-memory_utilizations.yaml new file mode 100644 index 000000000..702a16086 --- /dev/null +++ b/modules/integration_gcp-cloud-run/conf/02-memory_utilizations.yaml @@ -0,0 +1,19 @@ +module: "GCP Cloud Run" +name: "Memory utilizations" + +value_unit: "%" +transformation: ".min(over='30m')" + +signals: + signal: + metric: "container/memory/utilizations" + +rules: + critical: + threshold: 95 + comparator: ">" + + major: + threshold: 90 + comparator: ">" + dependency: "critical" diff --git a/modules/integration_gcp-cloud-run/conf/readme.yaml b/modules/integration_gcp-cloud-run/conf/readme.yaml new file mode 100644 index 000000000..eba5fabdb --- /dev/null +++ b/modules/integration_gcp-cloud-run/conf/readme.yaml @@ -0,0 +1,118 @@ +documentations: + - name: Stackdriver metrics for Memorystore for Redis + url: https://cloud.google.com/monitoring/api/metrics_gcp#gcp-run + - name: Splunk Observability metrics + url: https://docs.splunk.com/observability/en/gdi/get-data-in/connect/gcp/gcp.html +notes: | + + ### Metadata configuration for default filtering + + label to use : + + sfx_env = true + sfx_monitored = true + + For example: + + via gcloud, at the Cloud Run level: + ``` + gcloud run deploy hello \ + --image=us-docker.pkg.dev/cloudrun/container/hello \ + --allow-unauthenticated \ + --port=8080 \ + --service-account=123456789-compute@developer.gserviceaccount.com \ + --region=europe-west9 \ + --project=claranet-425413 \ + --labels=sfx_env=true,sfx_monitored=true + ``` + via terraform, 
[at the Cloud Run level](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/cloud_run_service#nested_metadata) + ```hcl + resource "google_cloud_run_service" "hello" { + name = "hello" + location = "europe-west9" + + template { + spec { + containers { + image = "us-docker.pkg.dev/cloudrun/container/hello" + resources { + limits = { + cpu = "1000m" // adjust based on your needs + memory = "512Mi" // adjust based on your needs + } + } + ports { + name = "http1" // This name is a standard identifier (http1 or h2c) for the protocol + container_port = 8080 + } + } + service_account_name = "123456789-compute@developer.gserviceaccount.com" + } + + metadata { + annotations = { + "run.googleapis.com/launch-stage" = "BETA" // adjust this according to the launch stage of your application + } + labels = { + sfx_env = "true" + sfx_monitored = "true" + } + } + } + autogenerate_revision_name = true + + traffic { + percent = 100 + latest_revision = true + } + + project = "claranet-425413" + } + ``` + You also **need** to check if those metadata are in the metadata `includeList` in your [SignalFx GCP + integration](https://dev.splunk.com/observability/docs/integrations/gcp_integration_overview/#Optional-fields). + + ### CPU utilizations + + Monitoring the CPU utilization helps in understanding the system's capability and efficiency. 
+ + ```hcl + module "signalfx-detectors-integration_gcp-cloud-run" { + source = "github.com/claranet/terraform-signalfx-detectors.git//modules/integration_gcp-cloud-run" + + environment = var.environment + gcp_project_id = var.project_id + notifications = local.notifications + + # We keep default filtering policy here, we just want to append additional filter to it + filtering_append = true + # We define the additional filter + filtering_custom = "filter('service_name', '*service-name*')" + # We can configure the thresholds of the probes + cpu_usage_threshold_critical = 85 + cpu_usage_threshold_major = 80 + } + ``` + + ### Memory utilizations + + Accurate tracking of memory usage aids in optimizing and improving performance. + + ```hcl + module "signalfx-detectors-integration_gcp-cloud-run" { + source = "github.com/claranet/terraform-signalfx-detectors.git//modules/integration_gcp-cloud-run" + + environment = var.environment + gcp_project_id = var.project_id + notifications = local.notifications + + # We keep default filtering policy here, we just want to append additional filter to it + filtering_append = true + # We define the additional filter + filtering_custom = "filter('service_name', '*service-name*')" + # We can configure the thresholds of the probes + memory_usage_threshold_critical = 85 + memory_usage_threshold_major = 80 + } + ``` + diff --git a/modules/integration_gcp-cloud-run/detectors-gen.tf b/modules/integration_gcp-cloud-run/detectors-gen.tf new file mode 100644 index 000000000..fcb996be9 --- /dev/null +++ b/modules/integration_gcp-cloud-run/detectors-gen.tf @@ -0,0 +1,117 @@ +resource "signalfx_detector" "container_count" { + name = format("%s %s", local.detector_name_prefix, "GCP Cloud Run container count") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + signal = 
data('container/containers', filter=${module.filtering.signalflow})${var.container_count_aggregation_function}${var.container_count_transformation_function}.publish('signal') + detect(when(signal == ${var.container_count_threshold_critical}%{if var.container_count_lasting_duration_critical != null}, lasting='${var.container_count_lasting_duration_critical}', at_least=${var.container_count_at_least_percentage_critical}%{endif})).publish('CRIT') +EOF + + rule { + description = "is == ${var.container_count_threshold_critical}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.container_count_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.container_count_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.container_count_runbook_url, var.runbook_url), "") + tip = var.container_count_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body + } + + max_delay = var.container_count_max_delay +} + +resource "signalfx_detector" "cpu_utilizations" { + name = format("%s %s", local.detector_name_prefix, "GCP Cloud Run cpu utilizations") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + viz_options { + label = "signal" + value_suffix = "%" + } + + program_text = <<-EOF + signal = data('container/cpu/utilizations', filter=${module.filtering.signalflow})${var.cpu_utilizations_aggregation_function}${var.cpu_utilizations_transformation_function}.publish('signal') + detect(when(signal > ${var.cpu_utilizations_threshold_critical}%{if var.cpu_utilizations_lasting_duration_critical != null}, lasting='${var.cpu_utilizations_lasting_duration_critical}', at_least=${var.cpu_utilizations_at_least_percentage_critical}%{endif})).publish('CRIT') + detect(when(signal > ${var.cpu_utilizations_threshold_major}%{if var.cpu_utilizations_lasting_duration_major != null}, lasting='${var.cpu_utilizations_lasting_duration_major}', at_least=${var.cpu_utilizations_at_least_percentage_major}%{endif}) and (not when(signal > ${var.cpu_utilizations_threshold_critical}%{if var.cpu_utilizations_lasting_duration_critical != null}, lasting='${var.cpu_utilizations_lasting_duration_critical}', at_least=${var.cpu_utilizations_at_least_percentage_critical}%{endif}))).publish('MAJOR') +EOF + + rule { + description = "is too high > ${var.cpu_utilizations_threshold_critical}%" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.cpu_utilizations_disabled_critical, var.cpu_utilizations_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.cpu_utilizations_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.cpu_utilizations_runbook_url, var.runbook_url), "") + tip = 
var.cpu_utilizations_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + rule { + description = "is too high > ${var.cpu_utilizations_threshold_major}%" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.cpu_utilizations_disabled_major, var.cpu_utilizations_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.cpu_utilizations_notifications, "major", []), var.notifications.major), null) + runbook_url = try(coalesce(var.cpu_utilizations_runbook_url, var.runbook_url), "") + tip = var.cpu_utilizations_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + max_delay = var.cpu_utilizations_max_delay +} + +resource "signalfx_detector" "memory_utilizations" { + name = format("%s %s", local.detector_name_prefix, "GCP Cloud Run memory utilizations") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + viz_options { + label = "signal" + value_suffix = "%" + } + + program_text = <<-EOF + signal = data('container/memory/utilizations', filter=${module.filtering.signalflow})${var.memory_utilizations_aggregation_function}${var.memory_utilizations_transformation_function}.publish('signal') + detect(when(signal > ${var.memory_utilizations_threshold_critical}%{if var.memory_utilizations_lasting_duration_critical != null}, lasting='${var.memory_utilizations_lasting_duration_critical}', at_least=${var.memory_utilizations_at_least_percentage_critical}%{endif})).publish('CRIT') + detect(when(signal > ${var.memory_utilizations_threshold_major}%{if var.memory_utilizations_lasting_duration_major != null}, 
lasting='${var.memory_utilizations_lasting_duration_major}', at_least=${var.memory_utilizations_at_least_percentage_major}%{endif}) and (not when(signal > ${var.memory_utilizations_threshold_critical}%{if var.memory_utilizations_lasting_duration_critical != null}, lasting='${var.memory_utilizations_lasting_duration_critical}', at_least=${var.memory_utilizations_at_least_percentage_critical}%{endif}))).publish('MAJOR') +EOF + + rule { + description = "is too high > ${var.memory_utilizations_threshold_critical}%" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.memory_utilizations_disabled_critical, var.memory_utilizations_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.memory_utilizations_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.memory_utilizations_runbook_url, var.runbook_url), "") + tip = var.memory_utilizations_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + rule { + description = "is too high > ${var.memory_utilizations_threshold_major}%" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.memory_utilizations_disabled_major, var.memory_utilizations_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.memory_utilizations_notifications, "major", []), var.notifications.major), null) + runbook_url = try(coalesce(var.memory_utilizations_runbook_url, var.runbook_url), "") + tip = var.memory_utilizations_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body + } + + max_delay = var.memory_utilizations_max_delay +} + diff --git a/modules/integration_gcp-cloud-run/filters.tf b/modules/integration_gcp-cloud-run/filters.tf new file mode 100644 index 000000000..f396ead7e --- /dev/null +++ b/modules/integration_gcp-cloud-run/filters.tf @@ -0,0 +1,3 @@ +locals { + filters = "filter('project_id', '${var.gcp_project_id}')" +} diff --git a/modules/integration_gcp-cloud-run/outputs.tf b/modules/integration_gcp-cloud-run/outputs.tf new file mode 100644 index 000000000..d749c95b5 --- /dev/null +++ b/modules/integration_gcp-cloud-run/outputs.tf @@ -0,0 +1,15 @@ +output "container_count" { + description = "Detector resource for container_count" + value = signalfx_detector.container_count +} + +output "cpu_utilizations" { + description = "Detector resource for cpu_utilizations" + value = signalfx_detector.cpu_utilizations +} + +output "memory_utilizations" { + description = "Detector resource for memory_utilizations" + value = signalfx_detector.memory_utilizations +} + diff --git a/modules/integration_gcp-cloud-run/tags.tf b/modules/integration_gcp-cloud-run/tags.tf new file mode 100644 index 000000000..e5af37772 --- /dev/null +++ b/modules/integration_gcp-cloud-run/tags.tf @@ -0,0 +1,4 @@ +locals { + tags = ["integration", "gcp-cloud-run"] +} + diff --git a/modules/integration_gcp-cloud-run/variables-gen.tf b/modules/integration_gcp-cloud-run/variables-gen.tf new file mode 100644 index 000000000..ced3c21a7 --- /dev/null +++ b/modules/integration_gcp-cloud-run/variables-gen.tf @@ -0,0 +1,241 @@ +# container_count detector + +variable "container_count_notifications" { + description = "Notification recipients list per severity overridden for container_count detector" + type = map(list(string)) + default = {} +} + +variable "container_count_aggregation_function" { + description = "Aggregation function and group by for container_count detector (i.e. 
\".mean(by=['host'])\")" + type = string + default = "" +} + +variable "container_count_transformation_function" { + description = "Transformation function for container_count detector (i.e. \".mean(over='5m')\")" + type = string + default = "" +} + +variable "container_count_max_delay" { + description = "Enforce max delay for container_count detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "container_count_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "container_count_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "container_count_disabled" { + description = "Disable all alerting rules for container_count detector" + type = bool + default = null +} + +variable "container_count_threshold_critical" { + description = "Critical threshold for container_count detector" + type = number + default = 0 +} + +variable "container_count_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "container_count_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +# cpu_utilizations detector + +variable "cpu_utilizations_notifications" { + description = "Notification recipients list per severity overridden for cpu_utilizations detector" + type = map(list(string)) + default = {} +} + +variable "cpu_utilizations_aggregation_function" { + description = "Aggregation function and group by for cpu_utilizations detector (i.e. 
\".mean(by=['host'])\")" + type = string + default = "" +} + +variable "cpu_utilizations_transformation_function" { + description = "Transformation function for cpu_utilizations detector (i.e. \".mean(over='5m')\")" + type = string + default = ".min(over='30m')" +} + +variable "cpu_utilizations_max_delay" { + description = "Enforce max delay for cpu_utilizations detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "cpu_utilizations_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "cpu_utilizations_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "cpu_utilizations_disabled" { + description = "Disable all alerting rules for cpu_utilizations detector" + type = bool + default = null +} + +variable "cpu_utilizations_disabled_critical" { + description = "Disable critical alerting rule for cpu_utilizations detector" + type = bool + default = null +} + +variable "cpu_utilizations_disabled_major" { + description = "Disable major alerting rule for cpu_utilizations detector" + type = bool + default = null +} + +variable "cpu_utilizations_threshold_critical" { + description = "Critical threshold for cpu_utilizations detector in %" + type = number + default = 90 +} + +variable "cpu_utilizations_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "cpu_utilizations_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable "cpu_utilizations_threshold_major" { + description = "Major threshold for cpu_utilizations detector in %" + type = number + default = 85 +} + +variable 
"cpu_utilizations_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "cpu_utilizations_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +# memory_utilizations detector + +variable "memory_utilizations_notifications" { + description = "Notification recipients list per severity overridden for memory_utilizations detector" + type = map(list(string)) + default = {} +} + +variable "memory_utilizations_aggregation_function" { + description = "Aggregation function and group by for memory_utilizations detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "memory_utilizations_transformation_function" { + description = "Transformation function for memory_utilizations detector (i.e. \".mean(over='5m')\")" + type = string + default = ".min(over='30m')" +} + +variable "memory_utilizations_max_delay" { + description = "Enforce max delay for memory_utilizations detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "memory_utilizations_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "memory_utilizations_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "memory_utilizations_disabled" { + description = "Disable all alerting rules for memory_utilizations detector" + type = bool + default = null +} + +variable "memory_utilizations_disabled_critical" { + description = "Disable critical alerting rule for memory_utilizations detector" + type = bool + default = null +} + +variable "memory_utilizations_disabled_major" { + description = "Disable major alerting rule for 
memory_utilizations detector" + type = bool + default = null +} + +variable "memory_utilizations_threshold_critical" { + description = "Critical threshold for memory_utilizations detector in %" + type = number + default = 95 +} + +variable "memory_utilizations_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "memory_utilizations_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable "memory_utilizations_threshold_major" { + description = "Major threshold for memory_utilizations detector in %" + type = number + default = 90 +} + +variable "memory_utilizations_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "memory_utilizations_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} diff --git a/modules/integration_gcp-cloud-run/variables.tf b/modules/integration_gcp-cloud-run/variables.tf new file mode 100644 index 000000000..901d3ad46 --- /dev/null +++ b/modules/integration_gcp-cloud-run/variables.tf @@ -0,0 +1,4 @@ +variable "gcp_project_id" { + description = "GCP project id used for default filtering while labels are not synced" + type = string +} diff --git a/modules/prometheus-exporter_dnsmasq/README.md b/modules/prometheus-exporter_dnsmasq/README.md new file mode 100644 index 000000000..e5e5e43d8 --- /dev/null +++ b/modules/prometheus-exporter_dnsmasq/README.md @@ -0,0 +1,117 @@ +# DNSMASQ SignalFx detectors + + + +:link: **Contents** + +- [How to use this module?](#how-to-use-this-module) +- [What are the available detectors in this module?](#what-are-the-available-detectors-in-this-module) +- [How 
to collect required metrics?](#how-to-collect-required-metrics) + - [Metrics](#metrics) +- [Related documentation](#related-documentation) + + + +## How to use this module? + +This directory defines a [Terraform](https://www.terraform.io/) +[module](https://www.terraform.io/language/modules/syntax) you can use in your +existing [stack](https://github.com/claranet/terraform-signalfx-detectors/wiki/Getting-started#stack) by adding a +`module` configuration and setting its `source` parameter to the URL of this folder: + +```hcl +module "signalfx-detectors-prometheus-exporter-dnsmasq" { + source = "github.com/claranet/terraform-signalfx-detectors.git//modules/prometheus-exporter_dnsmasq?ref={revision}" + + environment = var.environment + notifications = local.notifications +} +``` + +Note the following parameters: + +* `source`: Use this parameter to specify the URL of the module. The double slash (`//`) is intentional and required. + Terraform uses it to specify subfolders within a Git repo (see [module + sources](https://www.terraform.io/language/modules/sources)). The `ref` parameter specifies a specific Git tag in + this repository. It is recommended to use the latest "pinned" version in place of `{revision}`. Avoid using a branch + like `master` except for testing purposes. Note that every module in this repository is available on the Terraform + [registry](https://registry.terraform.io/modules/claranet/detectors/signalfx) and we recommend using it as source + instead of `git` which is more flexible but less future-proof. + +* `environment`: Use this parameter to specify the + [environment](https://github.com/claranet/terraform-signalfx-detectors/wiki/Getting-started#environment) used by this + instance of the module. + Its value will be added to the `prefixes` list at the start of the [detector + name](https://github.com/claranet/terraform-signalfx-detectors/wiki/Templating#example). 
+ In general, it will also be used in the `filtering` internal sub-module to [apply + filters](https://github.com/claranet/terraform-signalfx-detectors/wiki/Guidance#filtering) based on our default + [tagging convention](https://github.com/claranet/terraform-signalfx-detectors/wiki/Tagging-convention) by default. + +* `notifications`: Use this parameter to define where alerts should be sent depending on their severity. It consists + of a Terraform [object](https://www.terraform.io/language/expressions/type-constraints#object) where each key represents an available + [detector rule severity](https://docs.splunk.com/observability/alerts-detectors-notifications/create-detectors-for-alerts.html#severity) + and its value is a list of recipients. Every recipient must respect the [detector notification + format](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/detector#notification-format). + Check the [notification binding](https://github.com/claranet/terraform-signalfx-detectors/wiki/Notifications-binding) + documentation to understand the recommended role of each severity. + +These 3 parameters along with all variables defined in [common-variables.tf](common-variables.tf) are common to all +[modules](../) in this repository. Other variables, specific to this module, are available in +[variables-gen.tf](variables-gen.tf). +In general, the default configuration "works" but all of these Terraform +[variables](https://www.terraform.io/language/values/variables) make it possible to +customize the detectors behavior to better fit your needs. + +Most of them represent usual tips and rules detailed in the +[guidance](https://github.com/claranet/terraform-signalfx-detectors/wiki/Guidance) documentation and listed in the +common [variables](https://github.com/claranet/terraform-signalfx-detectors/wiki/Variables) dedicated documentation. 
+ +Feel free to explore the [wiki](https://github.com/claranet/terraform-signalfx-detectors/wiki) for more information about +general usage of this repository. + +## What are the available detectors in this module? + +This module creates the following SignalFx detectors which could contain one or multiple alerting rules: + +|Detector|Critical|Major|Minor|Warning|Info| +|---|---|---|---|---|---| +|Dnsmasq heartbeat|X|-|-|-|-| +|Dnsmasq hits|X|-|-|-|-| +|Dnsmasq hit rate|-|X|X|-|-| + +## How to collect required metrics? + +This module deploys detectors using metrics reported by the +scraping of a server following the [OpenMetrics convention](https://openmetrics.io/) based on and compatible with [the Prometheus +exposition format](https://github.com/prometheus/docs/blob/main/content/docs/instrumenting/exposition_formats.md#openmetrics-text-format). + +They are generally called `Prometheus Exporters` which can be fetched by both the [SignalFx Smart Agent](https://github.com/signalfx/signalfx-agent) +thanks to its [prometheus exporter monitor](https://github.com/signalfx/signalfx-agent/blob/main/docs/monitors/prometheus-exporter.md) and the +[OpenTelemetry Collector](https://github.com/signalfx/splunk-otel-collector) using its [prometheus +receiver](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/prometheusreceiver) or its derivatives. + +These exporters could be embedded directly in the tool you want to monitor (e.g. nginx ingress) or must be installed next to it as +a separate program configured to connect, create metrics and expose them as server. + + +Check the [Related documentation](#related-documentation) section for more detailed and specific information about this module dependencies. + + + +### Metrics + + +Here is the list of required metrics for detectors in this module. 
+ +* `dnsmasq_cachesize` +* `dnsmasq_hits` +* `dnsmasq_misses` + + + + +## Related documentation + +* [Terraform SignalFx provider](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs) +* [Terraform SignalFx detector](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/detector) +* [Splunk Observability integrations](https://docs.splunk.com/Observability/gdi/get-data-in/integrations.html) diff --git a/modules/prometheus-exporter_dnsmasq/common-filters.tf b/modules/prometheus-exporter_dnsmasq/common-filters.tf new file mode 120000 index 000000000..51ac61525 --- /dev/null +++ b/modules/prometheus-exporter_dnsmasq/common-filters.tf @@ -0,0 +1 @@ +../../common/module/filters-prometheus-exporter.tf \ No newline at end of file diff --git a/modules/prometheus-exporter_dnsmasq/common-locals.tf b/modules/prometheus-exporter_dnsmasq/common-locals.tf new file mode 120000 index 000000000..5672d21ab --- /dev/null +++ b/modules/prometheus-exporter_dnsmasq/common-locals.tf @@ -0,0 +1 @@ +../../common/module/locals.tf \ No newline at end of file diff --git a/modules/prometheus-exporter_dnsmasq/common-modules.tf b/modules/prometheus-exporter_dnsmasq/common-modules.tf new file mode 120000 index 000000000..8c81ef377 --- /dev/null +++ b/modules/prometheus-exporter_dnsmasq/common-modules.tf @@ -0,0 +1 @@ +../../common/module/modules.tf \ No newline at end of file diff --git a/modules/prometheus-exporter_dnsmasq/common-variables.tf b/modules/prometheus-exporter_dnsmasq/common-variables.tf new file mode 120000 index 000000000..f3037a584 --- /dev/null +++ b/modules/prometheus-exporter_dnsmasq/common-variables.tf @@ -0,0 +1 @@ +../../common/module/variables.tf \ No newline at end of file diff --git a/modules/prometheus-exporter_dnsmasq/common-versions.tf b/modules/prometheus-exporter_dnsmasq/common-versions.tf new file mode 120000 index 000000000..fa7f5509f --- /dev/null +++ 
b/modules/prometheus-exporter_dnsmasq/common-versions.tf @@ -0,0 +1 @@ +../../common/module/versions.tf \ No newline at end of file diff --git a/modules/prometheus-exporter_dnsmasq/conf/00-heartbeat.yaml b/modules/prometheus-exporter_dnsmasq/conf/00-heartbeat.yaml new file mode 100644 index 000000000..c303d719a --- /dev/null +++ b/modules/prometheus-exporter_dnsmasq/conf/00-heartbeat.yaml @@ -0,0 +1,13 @@ +module: dnsmasq +name: heartbeat + +transformation: false +aggregation: true + +exclude_not_running_vm: true + +signals: + signal: + metric: "dnsmasq_cachesize" +rules: + critical: diff --git a/modules/prometheus-exporter_dnsmasq/conf/01-cachesize_limit.yaml b/modules/prometheus-exporter_dnsmasq/conf/01-cachesize_limit.yaml new file mode 100644 index 000000000..b4bc3b575 --- /dev/null +++ b/modules/prometheus-exporter_dnsmasq/conf/01-cachesize_limit.yaml @@ -0,0 +1,16 @@ +module: dnsmasq +name: hits +id: dnsmasq_hits + +transformation: true +aggregation: true + +signals: + signal: + metric: dnsmasq_hits + +rules: + critical: + threshold: 1 + comparator: "<=" + lasting_duration: '5m' diff --git a/modules/prometheus-exporter_dnsmasq/conf/02-hit-rate.yaml b/modules/prometheus-exporter_dnsmasq/conf/02-hit-rate.yaml new file mode 100644 index 000000000..fb05bcf33 --- /dev/null +++ b/modules/prometheus-exporter_dnsmasq/conf/02-hit-rate.yaml @@ -0,0 +1,24 @@ +module: dnsmasq +name: Hit Rate +id: dnsmasq_hit_rate + +transformation: ".min(over='5m')" +aggregation: true + +signals: + A: + metric: dnsmasq_hits + B: + metric: dnsmasq_misses + signal: + formula: (A/(A+B)).fill(0).scale(100) +rules: + minor: + threshold: 90 + comparator: "<" + lasting_duration: "5m" + dependency: major + major: + threshold: 80 + comparator: "<=" + lasting_duration: "5m" diff --git a/modules/prometheus-exporter_dnsmasq/conf/readme.yaml b/modules/prometheus-exporter_dnsmasq/conf/readme.yaml new file mode 100644 index 000000000..9015fc41a --- /dev/null +++ 
b/modules/prometheus-exporter_dnsmasq/conf/readme.yaml @@ -0,0 +1,3 @@ +documentations: + +source_doc: diff --git a/modules/prometheus-exporter_dnsmasq/detectors-gen.tf b/modules/prometheus-exporter_dnsmasq/detectors-gen.tf new file mode 100644 index 000000000..4f4212ab9 --- /dev/null +++ b/modules/prometheus-exporter_dnsmasq/detectors-gen.tf @@ -0,0 +1,97 @@ +resource "signalfx_detector" "heartbeat" { + name = format("%s %s", local.detector_name_prefix, "Dnsmasq heartbeat") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + from signalfx.detectors.not_reporting import not_reporting + signal = data('dnsmasq_cachesize', filter=%{if var.heartbeat_exclude_not_running_vm}${local.not_running_vm_filters} and %{endif}${module.filtering.signalflow})${var.heartbeat_aggregation_function}.publish('signal') + not_reporting.detector(stream=signal, resource_identifier=None, duration='${var.heartbeat_timeframe}', auto_resolve_after='${local.heartbeat_auto_resolve_after}').publish('CRIT') +EOF + + rule { + description = "has not reported in ${var.heartbeat_timeframe}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.heartbeat_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.heartbeat_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.heartbeat_runbook_url, var.runbook_url), "") + tip = var.heartbeat_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject_novalue : var.message_subject + parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body + } + + max_delay = var.heartbeat_max_delay +} + +resource "signalfx_detector" "dnsmasq_hits" { + name = format("%s %s", local.detector_name_prefix, "Dnsmasq hits") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + signal = data('dnsmasq_hits', filter=${module.filtering.signalflow})${var.dnsmasq_hits_aggregation_function}${var.dnsmasq_hits_transformation_function}.publish('signal') + detect(when(signal <= ${var.dnsmasq_hits_threshold_critical}%{if var.dnsmasq_hits_lasting_duration_critical != null}, lasting='${var.dnsmasq_hits_lasting_duration_critical}', at_least=${var.dnsmasq_hits_at_least_percentage_critical}%{endif})).publish('CRIT') +EOF + + rule { + description = "is too low <= ${var.dnsmasq_hits_threshold_critical}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.dnsmasq_hits_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.dnsmasq_hits_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.dnsmasq_hits_runbook_url, var.runbook_url), "") + tip = var.dnsmasq_hits_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body + } + + max_delay = var.dnsmasq_hits_max_delay +} + +resource "signalfx_detector" "dnsmasq_hit_rate" { + name = format("%s %s", local.detector_name_prefix, "Dnsmasq hit rate") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + A = data('dnsmasq_hits', filter=${module.filtering.signalflow})${var.dnsmasq_hit_rate_aggregation_function}${var.dnsmasq_hit_rate_transformation_function} + B = data('dnsmasq_misses', filter=${module.filtering.signalflow})${var.dnsmasq_hit_rate_aggregation_function}${var.dnsmasq_hit_rate_transformation_function} + signal = (A/(A+B)).fill(0).scale(100).publish('signal') + detect(when(signal < ${var.dnsmasq_hit_rate_threshold_minor}%{if var.dnsmasq_hit_rate_lasting_duration_minor != null}, lasting='${var.dnsmasq_hit_rate_lasting_duration_minor}', at_least=${var.dnsmasq_hit_rate_at_least_percentage_minor}%{endif}) and (not when(signal <= ${var.dnsmasq_hit_rate_threshold_major}%{if var.dnsmasq_hit_rate_lasting_duration_major != null}, lasting='${var.dnsmasq_hit_rate_lasting_duration_major}', at_least=${var.dnsmasq_hit_rate_at_least_percentage_major}%{endif}))).publish('MINOR') + detect(when(signal <= ${var.dnsmasq_hit_rate_threshold_major}%{if var.dnsmasq_hit_rate_lasting_duration_major != null}, lasting='${var.dnsmasq_hit_rate_lasting_duration_major}', at_least=${var.dnsmasq_hit_rate_at_least_percentage_major}%{endif})).publish('MAJOR') +EOF + + rule { + description = "is too low < ${var.dnsmasq_hit_rate_threshold_minor}" + severity = "Minor" + detect_label = "MINOR" + disabled = coalesce(var.dnsmasq_hit_rate_disabled_minor, var.dnsmasq_hit_rate_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.dnsmasq_hit_rate_notifications, "minor", []), var.notifications.minor), null) + runbook_url = 
try(coalesce(var.dnsmasq_hit_rate_runbook_url, var.runbook_url), "") + tip = var.dnsmasq_hit_rate_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + rule { + description = "is too low <= ${var.dnsmasq_hit_rate_threshold_major}" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.dnsmasq_hit_rate_disabled_major, var.dnsmasq_hit_rate_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.dnsmasq_hit_rate_notifications, "major", []), var.notifications.major), null) + runbook_url = try(coalesce(var.dnsmasq_hit_rate_runbook_url, var.runbook_url), "") + tip = var.dnsmasq_hit_rate_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + max_delay = var.dnsmasq_hit_rate_max_delay +} + diff --git a/modules/prometheus-exporter_dnsmasq/outputs.tf b/modules/prometheus-exporter_dnsmasq/outputs.tf new file mode 100644 index 000000000..4c3112430 --- /dev/null +++ b/modules/prometheus-exporter_dnsmasq/outputs.tf @@ -0,0 +1,15 @@ +output "dnsmasq_hit_rate" { + description = "Detector resource for dnsmasq_hit_rate" + value = signalfx_detector.dnsmasq_hit_rate +} + +output "dnsmasq_hits" { + description = "Detector resource for dnsmasq_hits" + value = signalfx_detector.dnsmasq_hits +} + +output "heartbeat" { + description = "Detector resource for heartbeat" + value = signalfx_detector.heartbeat +} + diff --git a/modules/prometheus-exporter_dnsmasq/tags.tf b/modules/prometheus-exporter_dnsmasq/tags.tf new file mode 100644 index 000000000..9c6615c89 --- /dev/null +++ b/modules/prometheus-exporter_dnsmasq/tags.tf @@ -0,0 +1,4 @@ +locals { + tags = ["prometheus-exporter", "dnsmasq"] +} + diff --git a/modules/prometheus-exporter_dnsmasq/variables-gen.tf 
b/modules/prometheus-exporter_dnsmasq/variables-gen.tf new file mode 100644 index 000000000..9027271c0 --- /dev/null +++ b/modules/prometheus-exporter_dnsmasq/variables-gen.tf @@ -0,0 +1,201 @@ +# heartbeat detector + +variable "heartbeat_notifications" { + description = "Notification recipients list per severity overridden for heartbeat detector" + type = map(list(string)) + default = {} +} + +variable "heartbeat_aggregation_function" { + description = "Aggregation function and group by for heartbeat detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "heartbeat_max_delay" { + description = "Enforce max delay for heartbeat detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "heartbeat_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "heartbeat_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "heartbeat_disabled" { + description = "Disable all alerting rules for heartbeat detector" + type = bool + default = null +} + +variable "heartbeat_exclude_not_running_vm" { + description = "Don’t send alerts if associated VM is stopped or stopping (metadata provided by cloud provider integration). Can be useful for ephemeral infrastructure (such as auto scaling groups) as VM will be stopped and started regularly. Note that timeframe must be at least 25 minutes for the metadata to be available to the detector." + type = bool + default = true +} + +variable "heartbeat_timeframe" { + description = "Timeframe for heartbeat detector (i.e. \"25m\"). 
Must be at least \"25m\" if \"heartbeat_exclude_not_running_vm\" is true" + type = string + default = "25m" +} + +# dnsmasq_hits detector + +variable "dnsmasq_hits_notifications" { + description = "Notification recipients list per severity overridden for dnsmasq_hits detector" + type = map(list(string)) + default = {} +} + +variable "dnsmasq_hits_aggregation_function" { + description = "Aggregation function and group by for dnsmasq_hits detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "dnsmasq_hits_transformation_function" { + description = "Transformation function for dnsmasq_hits detector (i.e. \".mean(over='5m')\")" + type = string + default = "" +} + +variable "dnsmasq_hits_max_delay" { + description = "Enforce max delay for dnsmasq_hits detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "dnsmasq_hits_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "dnsmasq_hits_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "dnsmasq_hits_disabled" { + description = "Disable all alerting rules for dnsmasq_hits detector" + type = bool + default = null +} + +variable "dnsmasq_hits_threshold_critical" { + description = "Critical threshold for dnsmasq_hits detector" + type = number + default = 1 +} + +variable "dnsmasq_hits_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "5m" +} + +variable "dnsmasq_hits_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +# dnsmasq_hit_rate detector + +variable "dnsmasq_hit_rate_notifications" { + description = "Notification recipients 
list per severity overridden for dnsmasq_hit_rate detector" + type = map(list(string)) + default = {} +} + +variable "dnsmasq_hit_rate_aggregation_function" { + description = "Aggregation function and group by for dnsmasq_hit_rate detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "dnsmasq_hit_rate_transformation_function" { + description = "Transformation function for dnsmasq_hit_rate detector (i.e. \".mean(over='5m')\")" + type = string + default = ".min(over='5m')" +} + +variable "dnsmasq_hit_rate_max_delay" { + description = "Enforce max delay for dnsmasq_hit_rate detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "dnsmasq_hit_rate_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "dnsmasq_hit_rate_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "dnsmasq_hit_rate_disabled" { + description = "Disable all alerting rules for dnsmasq_hit_rate detector" + type = bool + default = null +} + +variable "dnsmasq_hit_rate_disabled_minor" { + description = "Disable minor alerting rule for dnsmasq_hit_rate detector" + type = bool + default = null +} + +variable "dnsmasq_hit_rate_disabled_major" { + description = "Disable major alerting rule for dnsmasq_hit_rate detector" + type = bool + default = null +} + +variable "dnsmasq_hit_rate_threshold_minor" { + description = "Minor threshold for dnsmasq_hit_rate detector" + type = number + default = 90 +} + +variable "dnsmasq_hit_rate_lasting_duration_minor" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "5m" +} + +variable "dnsmasq_hit_rate_at_least_percentage_minor" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 
1.0)" + type = number + default = 1 +} +variable "dnsmasq_hit_rate_threshold_major" { + description = "Major threshold for dnsmasq_hit_rate detector" + type = number + default = 80 +} + +variable "dnsmasq_hit_rate_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "5m" +} + +variable "dnsmasq_hit_rate_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} diff --git a/modules/prometheus-exporter_postfix/README.md b/modules/prometheus-exporter_postfix/README.md new file mode 100644 index 000000000..bf4ed14d4 --- /dev/null +++ b/modules/prometheus-exporter_postfix/README.md @@ -0,0 +1,119 @@ +# POSTFIX SignalFx detectors + + + +:link: **Contents** + +- [How to use this module?](#how-to-use-this-module) +- [What are the available detectors in this module?](#what-are-the-available-detectors-in-this-module) +- [How to collect required metrics?](#how-to-collect-required-metrics) + - [Metrics](#metrics) +- [Related documentation](#related-documentation) + + + +## How to use this module? + +This directory defines a [Terraform](https://www.terraform.io/) +[module](https://www.terraform.io/language/modules/syntax) you can use in your +existing [stack](https://github.com/claranet/terraform-signalfx-detectors/wiki/Getting-started#stack) by adding a +`module` configuration and setting its `source` parameter to URL of this folder: + +```hcl +module "signalfx-detectors-prometheus-exporter-postfix" { + source = "github.com/claranet/terraform-signalfx-detectors.git//modules/prometheus-exporter_postfix?ref={revision}" + + environment = var.environment + notifications = local.notifications +} +``` + +Note the following parameters: + +* `source`: Use this parameter to specify the URL of the module. The double slash (`//`) is intentional and required. 
+ Terraform uses it to specify subfolders within a Git repo (see [module + sources](https://www.terraform.io/language/modules/sources)). The `ref` parameter specifies a specific Git tag in + this repository. It is recommended to use the latest "pinned" version in place of `{revision}`. Avoid using a branch + like `master` except for testing purpose. Note that every modules in this repository are available on the Terraform + [registry](https://registry.terraform.io/modules/claranet/detectors/signalfx) and we recommend using it as source + instead of `git` which is more flexible but less future-proof. + +* `environment`: Use this parameter to specify the + [environment](https://github.com/claranet/terraform-signalfx-detectors/wiki/Getting-started#environment) used by this + instance of the module. + Its value will be added to the `prefixes` list at the start of the [detector + name](https://github.com/claranet/terraform-signalfx-detectors/wiki/Templating#example). + In general, it will also be used in the `filtering` internal sub-module to [apply + filters](https://github.com/claranet/terraform-signalfx-detectors/wiki/Guidance#filtering) based on our default + [tagging convention](https://github.com/claranet/terraform-signalfx-detectors/wiki/Tagging-convention) by default. + +* `notifications`: Use this parameter to define where alerts should be sent depending on their severity. It consists + of a Terraform [object](https://www.terraform.io/language/expressions/type-constraints#object) where each key represents an available + [detector rule severity](https://docs.splunk.com/observability/alerts-detectors-notifications/create-detectors-for-alerts.html#severity) + and its value is a list of recipients. Every recipients must respect the [detector notification + format](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/detector#notification-format). 
+ Check the [notification binding](https://github.com/claranet/terraform-signalfx-detectors/wiki/Notifications-binding) + documentation to understand the recommended role of each severity. + +These 3 parameters along with all variables defined in [common-variables.tf](common-variables.tf) are common to all +[modules](../) in this repository. Other variables, specific to this module, are available in +[variables-gen.tf](variables-gen.tf). +In general, the default configuration "works" but all of these Terraform +[variables](https://www.terraform.io/language/values/variables) make it possible to +customize the detectors behavior to better fit your needs. + +Most of them represent usual tips and rules detailed in the +[guidance](https://github.com/claranet/terraform-signalfx-detectors/wiki/Guidance) documentation and listed in the +common [variables](https://github.com/claranet/terraform-signalfx-detectors/wiki/Variables) dedicated documentation. + +Feel free to explore the [wiki](https://github.com/claranet/terraform-signalfx-detectors/wiki) for more information about +general usage of this repository. + +## What are the available detectors in this module? + +This module creates the following SignalFx detectors which could contain one or multiple alerting rules: + +|Detector|Critical|Major|Minor|Warning|Info| +|---|---|---|---|---|---| +|Postfix heartbeat|X|-|-|-|-| +|Postfix size postfix queue deferred|X|X|-|-|-| +|Postfix size postfix queue hold|X|X|-|-|-| +|Postfix size postfix queue maildrop|X|X|-|-|-| +|Postfix size postfix delivery delay|X|X|-|-|-| + +## How to collect required metrics? + +This module deploys detectors using metrics reported by the +scraping of a server following the [OpenMetrics convention](https://openmetrics.io/) based on and compatible with [the Prometheus +exposition format](https://github.com/prometheus/docs/blob/main/content/docs/instrumenting/exposition_formats.md#openmetrics-text-format). 
+ +They are generally called `Prometheus Exporters` which can be fetched by both the [SignalFx Smart Agent](https://github.com/signalfx/signalfx-agent) +thanks to its [prometheus exporter monitor](https://github.com/signalfx/signalfx-agent/blob/main/docs/monitors/prometheus-exporter.md) and the +[OpenTelemetry Collector](https://github.com/signalfx/splunk-otel-collector) using its [prometheus +receiver](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/prometheusreceiver) or its derivatives. + +These exporters could be embedded directly in the tool you want to monitor (e.g. nginx ingress) or must be installed next to it as +a separate program configured to connect, create metrics and expose them as server. + + +Check the [Related documentation](#related-documentation) section for more detailed and specific information about this module dependencies. + + + +### Metrics + + +Here is the list of required metrics for detectors in this module. + +* `postfix_showq_message_size_bytes_count` +* `postfix_smtp_delivery_delay_seconds_count` +* `postfix_up` + + + + +## Related documentation + +* [Terraform SignalFx provider](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs) +* [Terraform SignalFx detector](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/detector) +* [Splunk Observability integrations](https://docs.splunk.com/Observability/gdi/get-data-in/integrations.html) diff --git a/modules/prometheus-exporter_postfix/common-filters.tf b/modules/prometheus-exporter_postfix/common-filters.tf new file mode 120000 index 000000000..51ac61525 --- /dev/null +++ b/modules/prometheus-exporter_postfix/common-filters.tf @@ -0,0 +1 @@ +../../common/module/filters-prometheus-exporter.tf \ No newline at end of file diff --git a/modules/prometheus-exporter_postfix/common-locals.tf b/modules/prometheus-exporter_postfix/common-locals.tf new file mode 120000 index 000000000..5672d21ab --- 
/dev/null +++ b/modules/prometheus-exporter_postfix/common-locals.tf @@ -0,0 +1 @@ +../../common/module/locals.tf \ No newline at end of file diff --git a/modules/prometheus-exporter_postfix/common-modules.tf b/modules/prometheus-exporter_postfix/common-modules.tf new file mode 120000 index 000000000..8c81ef377 --- /dev/null +++ b/modules/prometheus-exporter_postfix/common-modules.tf @@ -0,0 +1 @@ +../../common/module/modules.tf \ No newline at end of file diff --git a/modules/prometheus-exporter_postfix/common-variables.tf b/modules/prometheus-exporter_postfix/common-variables.tf new file mode 120000 index 000000000..f3037a584 --- /dev/null +++ b/modules/prometheus-exporter_postfix/common-variables.tf @@ -0,0 +1 @@ +../../common/module/variables.tf \ No newline at end of file diff --git a/modules/prometheus-exporter_postfix/common-versions.tf b/modules/prometheus-exporter_postfix/common-versions.tf new file mode 120000 index 000000000..fa7f5509f --- /dev/null +++ b/modules/prometheus-exporter_postfix/common-versions.tf @@ -0,0 +1 @@ +../../common/module/versions.tf \ No newline at end of file diff --git a/modules/prometheus-exporter_postfix/conf/00-heartbeat.yaml b/modules/prometheus-exporter_postfix/conf/00-heartbeat.yaml new file mode 100644 index 000000000..79a6c49e5 --- /dev/null +++ b/modules/prometheus-exporter_postfix/conf/00-heartbeat.yaml @@ -0,0 +1,13 @@ +## Example +module: postfix +name: heartbeat + +transformation: false +aggregation: true +exclude_not_running_vm: true + +signals: + signal: + metric: "postfix_up" +rules: + critical: diff --git a/modules/prometheus-exporter_postfix/conf/01-queue_deferred.yaml b/modules/prometheus-exporter_postfix/conf/01-queue_deferred.yaml new file mode 100644 index 000000000..6627879f8 --- /dev/null +++ b/modules/prometheus-exporter_postfix/conf/01-queue_deferred.yaml @@ -0,0 +1,21 @@ +module: postfix +name: "Size Postfix Queue Deferred" +id: "postfix_showq_message_size_bytes_count_deferred" + +transformation: 
".min(over='30m')" +aggregation: true +filtering: "filter('queue', 'deferred')" + +signals: + signal: + metric: "postfix_showq_message_size_bytes_count" + +rules: + critical: + threshold: 600 + comparator: ">" + + major: + threshold: 300 + comparator: ">" + dependency: "critical" diff --git a/modules/prometheus-exporter_postfix/conf/02-queue_hold.yaml b/modules/prometheus-exporter_postfix/conf/02-queue_hold.yaml new file mode 100644 index 000000000..99052ccf1 --- /dev/null +++ b/modules/prometheus-exporter_postfix/conf/02-queue_hold.yaml @@ -0,0 +1,21 @@ +module: postfix +name: "Size Postfix Queue Hold" +id: "postfix_showq_message_size_bytes_count_hold" + +transformation: ".min(over='30m')" +aggregation: true +filtering: "filter('queue', 'hold')" + +signals: + signal: + metric: "postfix_showq_message_size_bytes_count" + +rules: + critical: + threshold: 600 + comparator: ">" + + major: + threshold: 300 + comparator: ">" + dependency: "critical" diff --git a/modules/prometheus-exporter_postfix/conf/03-queue_maildrop.yaml b/modules/prometheus-exporter_postfix/conf/03-queue_maildrop.yaml new file mode 100644 index 000000000..1fe7c10a4 --- /dev/null +++ b/modules/prometheus-exporter_postfix/conf/03-queue_maildrop.yaml @@ -0,0 +1,21 @@ +module: postfix +name: "Size Postfix Queue Maildrop" +id: "postfix_showq_message_size_bytes_count_maildrop" + +transformation: ".min(over='30m')" +aggregation: true +filtering: "filter('queue', 'maildrop')" + +signals: + signal: + metric: "postfix_showq_message_size_bytes_count" + +rules: + critical: + threshold: 600 + comparator: ">" + + major: + threshold: 300 + comparator: ">" + dependency: "critical" diff --git a/modules/prometheus-exporter_postfix/conf/04-mail_delivery_delay.yaml b/modules/prometheus-exporter_postfix/conf/04-mail_delivery_delay.yaml new file mode 100644 index 000000000..0a1297572 --- /dev/null +++ b/modules/prometheus-exporter_postfix/conf/04-mail_delivery_delay.yaml @@ -0,0 +1,21 @@ +module: postfix +name: "Size 
Postfix Delivery Delay" +id: "postfix_smtp_delivery_delay_seconds_count" + +transformation: ".min(over='30m')" +aggregation: true +filtering: "filter('stage', 'before_delivery')" + +signals: + signal: + metric: "postfix_smtp_delivery_delay_seconds_count" + +rules: + critical: + threshold: 60 + comparator: ">" + + major: + threshold: 45 + comparator: ">" + dependency: "critical" diff --git a/modules/prometheus-exporter_postfix/conf/readme.yaml b/modules/prometheus-exporter_postfix/conf/readme.yaml new file mode 100644 index 000000000..9015fc41a --- /dev/null +++ b/modules/prometheus-exporter_postfix/conf/readme.yaml @@ -0,0 +1,3 @@ +documentations: + +source_doc: diff --git a/modules/prometheus-exporter_postfix/detectors-gen.tf b/modules/prometheus-exporter_postfix/detectors-gen.tf new file mode 100644 index 000000000..d6df80533 --- /dev/null +++ b/modules/prometheus-exporter_postfix/detectors-gen.tf @@ -0,0 +1,192 @@ +resource "signalfx_detector" "heartbeat" { + name = format("%s %s", local.detector_name_prefix, "Postfix heartbeat") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + from signalfx.detectors.not_reporting import not_reporting + signal = data('postfix_up', filter=%{if var.heartbeat_exclude_not_running_vm}${local.not_running_vm_filters} and %{endif}${module.filtering.signalflow})${var.heartbeat_aggregation_function}.publish('signal') + not_reporting.detector(stream=signal, resource_identifier=None, duration='${var.heartbeat_timeframe}', auto_resolve_after='${local.heartbeat_auto_resolve_after}').publish('CRIT') +EOF + + rule { + description = "has not reported in ${var.heartbeat_timeframe}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.heartbeat_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.heartbeat_notifications, 
"critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.heartbeat_runbook_url, var.runbook_url), "") + tip = var.heartbeat_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject_novalue : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + max_delay = var.heartbeat_max_delay +} + +resource "signalfx_detector" "postfix_showq_message_size_bytes_count_deferred" { + name = format("%s %s", local.detector_name_prefix, "Postfix size postfix queue deferred") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + base_filtering = filter('queue', 'deferred') + signal = data('postfix_showq_message_size_bytes_count', filter=base_filtering and ${module.filtering.signalflow})${var.postfix_showq_message_size_bytes_count_deferred_aggregation_function}${var.postfix_showq_message_size_bytes_count_deferred_transformation_function}.publish('signal') + detect(when(signal > ${var.postfix_showq_message_size_bytes_count_deferred_threshold_critical}%{if var.postfix_showq_message_size_bytes_count_deferred_lasting_duration_critical != null}, lasting='${var.postfix_showq_message_size_bytes_count_deferred_lasting_duration_critical}', at_least=${var.postfix_showq_message_size_bytes_count_deferred_at_least_percentage_critical}%{endif})).publish('CRIT') + detect(when(signal > ${var.postfix_showq_message_size_bytes_count_deferred_threshold_major}%{if var.postfix_showq_message_size_bytes_count_deferred_lasting_duration_major != null}, lasting='${var.postfix_showq_message_size_bytes_count_deferred_lasting_duration_major}', at_least=${var.postfix_showq_message_size_bytes_count_deferred_at_least_percentage_major}%{endif}) and (not when(signal > 
${var.postfix_showq_message_size_bytes_count_deferred_threshold_critical}%{if var.postfix_showq_message_size_bytes_count_deferred_lasting_duration_critical != null}, lasting='${var.postfix_showq_message_size_bytes_count_deferred_lasting_duration_critical}', at_least=${var.postfix_showq_message_size_bytes_count_deferred_at_least_percentage_critical}%{endif}))).publish('MAJOR') +EOF + + rule { + description = "is too high > ${var.postfix_showq_message_size_bytes_count_deferred_threshold_critical}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.postfix_showq_message_size_bytes_count_deferred_disabled_critical, var.postfix_showq_message_size_bytes_count_deferred_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.postfix_showq_message_size_bytes_count_deferred_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.postfix_showq_message_size_bytes_count_deferred_runbook_url, var.runbook_url), "") + tip = var.postfix_showq_message_size_bytes_count_deferred_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + rule { + description = "is too high > ${var.postfix_showq_message_size_bytes_count_deferred_threshold_major}" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.postfix_showq_message_size_bytes_count_deferred_disabled_major, var.postfix_showq_message_size_bytes_count_deferred_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.postfix_showq_message_size_bytes_count_deferred_notifications, "major", []), var.notifications.major), null) + runbook_url = try(coalesce(var.postfix_showq_message_size_bytes_count_deferred_runbook_url, var.runbook_url), "") + tip = var.postfix_showq_message_size_bytes_count_deferred_tip + parameterized_subject = var.message_subject == "" ? 
local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + max_delay = var.postfix_showq_message_size_bytes_count_deferred_max_delay +} + +resource "signalfx_detector" "postfix_showq_message_size_bytes_count_hold" { + name = format("%s %s", local.detector_name_prefix, "Postfix size postfix queue hold") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + base_filtering = filter('queue', 'hold') + signal = data('postfix_showq_message_size_bytes_count', filter=base_filtering and ${module.filtering.signalflow})${var.postfix_showq_message_size_bytes_count_hold_aggregation_function}${var.postfix_showq_message_size_bytes_count_hold_transformation_function}.publish('signal') + detect(when(signal > ${var.postfix_showq_message_size_bytes_count_hold_threshold_critical}%{if var.postfix_showq_message_size_bytes_count_hold_lasting_duration_critical != null}, lasting='${var.postfix_showq_message_size_bytes_count_hold_lasting_duration_critical}', at_least=${var.postfix_showq_message_size_bytes_count_hold_at_least_percentage_critical}%{endif})).publish('CRIT') + detect(when(signal > ${var.postfix_showq_message_size_bytes_count_hold_threshold_major}%{if var.postfix_showq_message_size_bytes_count_hold_lasting_duration_major != null}, lasting='${var.postfix_showq_message_size_bytes_count_hold_lasting_duration_major}', at_least=${var.postfix_showq_message_size_bytes_count_hold_at_least_percentage_major}%{endif}) and (not when(signal > ${var.postfix_showq_message_size_bytes_count_hold_threshold_critical}%{if var.postfix_showq_message_size_bytes_count_hold_lasting_duration_critical != null}, lasting='${var.postfix_showq_message_size_bytes_count_hold_lasting_duration_critical}', 
at_least=${var.postfix_showq_message_size_bytes_count_hold_at_least_percentage_critical}%{endif}))).publish('MAJOR') +EOF + + rule { + description = "is too high > ${var.postfix_showq_message_size_bytes_count_hold_threshold_critical}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.postfix_showq_message_size_bytes_count_hold_disabled_critical, var.postfix_showq_message_size_bytes_count_hold_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.postfix_showq_message_size_bytes_count_hold_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.postfix_showq_message_size_bytes_count_hold_runbook_url, var.runbook_url), "") + tip = var.postfix_showq_message_size_bytes_count_hold_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + rule { + description = "is too high > ${var.postfix_showq_message_size_bytes_count_hold_threshold_major}" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.postfix_showq_message_size_bytes_count_hold_disabled_major, var.postfix_showq_message_size_bytes_count_hold_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.postfix_showq_message_size_bytes_count_hold_notifications, "major", []), var.notifications.major), null) + runbook_url = try(coalesce(var.postfix_showq_message_size_bytes_count_hold_runbook_url, var.runbook_url), "") + tip = var.postfix_showq_message_size_bytes_count_hold_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body + } + + max_delay = var.postfix_showq_message_size_bytes_count_hold_max_delay +} + +resource "signalfx_detector" "postfix_showq_message_size_bytes_count_maildrop" { + name = format("%s %s", local.detector_name_prefix, "Postfix size postfix queue maildrop") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + base_filtering = filter('queue', 'maildrop') + signal = data('postfix_showq_message_size_bytes_count', filter=base_filtering and ${module.filtering.signalflow})${var.postfix_showq_message_size_bytes_count_maildrop_aggregation_function}${var.postfix_showq_message_size_bytes_count_maildrop_transformation_function}.publish('signal') + detect(when(signal > ${var.postfix_showq_message_size_bytes_count_maildrop_threshold_critical}%{if var.postfix_showq_message_size_bytes_count_maildrop_lasting_duration_critical != null}, lasting='${var.postfix_showq_message_size_bytes_count_maildrop_lasting_duration_critical}', at_least=${var.postfix_showq_message_size_bytes_count_maildrop_at_least_percentage_critical}%{endif})).publish('CRIT') + detect(when(signal > ${var.postfix_showq_message_size_bytes_count_maildrop_threshold_major}%{if var.postfix_showq_message_size_bytes_count_maildrop_lasting_duration_major != null}, lasting='${var.postfix_showq_message_size_bytes_count_maildrop_lasting_duration_major}', at_least=${var.postfix_showq_message_size_bytes_count_maildrop_at_least_percentage_major}%{endif}) and (not when(signal > ${var.postfix_showq_message_size_bytes_count_maildrop_threshold_critical}%{if var.postfix_showq_message_size_bytes_count_maildrop_lasting_duration_critical != null}, lasting='${var.postfix_showq_message_size_bytes_count_maildrop_lasting_duration_critical}', 
at_least=${var.postfix_showq_message_size_bytes_count_maildrop_at_least_percentage_critical}%{endif}))).publish('MAJOR') +EOF + + rule { + description = "is too high > ${var.postfix_showq_message_size_bytes_count_maildrop_threshold_critical}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.postfix_showq_message_size_bytes_count_maildrop_disabled_critical, var.postfix_showq_message_size_bytes_count_maildrop_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.postfix_showq_message_size_bytes_count_maildrop_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.postfix_showq_message_size_bytes_count_maildrop_runbook_url, var.runbook_url), "") + tip = var.postfix_showq_message_size_bytes_count_maildrop_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + rule { + description = "is too high > ${var.postfix_showq_message_size_bytes_count_maildrop_threshold_major}" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.postfix_showq_message_size_bytes_count_maildrop_disabled_major, var.postfix_showq_message_size_bytes_count_maildrop_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.postfix_showq_message_size_bytes_count_maildrop_notifications, "major", []), var.notifications.major), null) + runbook_url = try(coalesce(var.postfix_showq_message_size_bytes_count_maildrop_runbook_url, var.runbook_url), "") + tip = var.postfix_showq_message_size_bytes_count_maildrop_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body + } + + max_delay = var.postfix_showq_message_size_bytes_count_maildrop_max_delay +} + +resource "signalfx_detector" "postfix_smtp_delivery_delay_seconds_count" { + name = format("%s %s", local.detector_name_prefix, "Postfix size postfix delivery delay") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + base_filtering = filter('stage', 'before_delivery') + signal = data('postfix_smtp_delivery_delay_seconds_count', filter=base_filtering and ${module.filtering.signalflow})${var.postfix_smtp_delivery_delay_seconds_count_aggregation_function}${var.postfix_smtp_delivery_delay_seconds_count_transformation_function}.publish('signal') + detect(when(signal > ${var.postfix_smtp_delivery_delay_seconds_count_threshold_critical}%{if var.postfix_smtp_delivery_delay_seconds_count_lasting_duration_critical != null}, lasting='${var.postfix_smtp_delivery_delay_seconds_count_lasting_duration_critical}', at_least=${var.postfix_smtp_delivery_delay_seconds_count_at_least_percentage_critical}%{endif})).publish('CRIT') + detect(when(signal > ${var.postfix_smtp_delivery_delay_seconds_count_threshold_major}%{if var.postfix_smtp_delivery_delay_seconds_count_lasting_duration_major != null}, lasting='${var.postfix_smtp_delivery_delay_seconds_count_lasting_duration_major}', at_least=${var.postfix_smtp_delivery_delay_seconds_count_at_least_percentage_major}%{endif}) and (not when(signal > ${var.postfix_smtp_delivery_delay_seconds_count_threshold_critical}%{if var.postfix_smtp_delivery_delay_seconds_count_lasting_duration_critical != null}, lasting='${var.postfix_smtp_delivery_delay_seconds_count_lasting_duration_critical}', at_least=${var.postfix_smtp_delivery_delay_seconds_count_at_least_percentage_critical}%{endif}))).publish('MAJOR') +EOF + + rule { + description = "is too high > 
${var.postfix_smtp_delivery_delay_seconds_count_threshold_critical}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.postfix_smtp_delivery_delay_seconds_count_disabled_critical, var.postfix_smtp_delivery_delay_seconds_count_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.postfix_smtp_delivery_delay_seconds_count_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.postfix_smtp_delivery_delay_seconds_count_runbook_url, var.runbook_url), "") + tip = var.postfix_smtp_delivery_delay_seconds_count_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + rule { + description = "is too high > ${var.postfix_smtp_delivery_delay_seconds_count_threshold_major}" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.postfix_smtp_delivery_delay_seconds_count_disabled_major, var.postfix_smtp_delivery_delay_seconds_count_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.postfix_smtp_delivery_delay_seconds_count_notifications, "major", []), var.notifications.major), null) + runbook_url = try(coalesce(var.postfix_smtp_delivery_delay_seconds_count_runbook_url, var.runbook_url), "") + tip = var.postfix_smtp_delivery_delay_seconds_count_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body + } + + max_delay = var.postfix_smtp_delivery_delay_seconds_count_max_delay +} + diff --git a/modules/prometheus-exporter_postfix/outputs.tf b/modules/prometheus-exporter_postfix/outputs.tf new file mode 100644 index 000000000..c816c0800 --- /dev/null +++ b/modules/prometheus-exporter_postfix/outputs.tf @@ -0,0 +1,25 @@ +output "heartbeat" { + description = "Detector resource for heartbeat" + value = signalfx_detector.heartbeat +} + +output "postfix_showq_message_size_bytes_count_deferred" { + description = "Detector resource for postfix_showq_message_size_bytes_count_deferred" + value = signalfx_detector.postfix_showq_message_size_bytes_count_deferred +} + +output "postfix_showq_message_size_bytes_count_hold" { + description = "Detector resource for postfix_showq_message_size_bytes_count_hold" + value = signalfx_detector.postfix_showq_message_size_bytes_count_hold +} + +output "postfix_showq_message_size_bytes_count_maildrop" { + description = "Detector resource for postfix_showq_message_size_bytes_count_maildrop" + value = signalfx_detector.postfix_showq_message_size_bytes_count_maildrop +} + +output "postfix_smtp_delivery_delay_seconds_count" { + description = "Detector resource for postfix_smtp_delivery_delay_seconds_count" + value = signalfx_detector.postfix_smtp_delivery_delay_seconds_count +} + diff --git a/modules/prometheus-exporter_postfix/tags.tf b/modules/prometheus-exporter_postfix/tags.tf new file mode 100644 index 000000000..d8c3398d2 --- /dev/null +++ b/modules/prometheus-exporter_postfix/tags.tf @@ -0,0 +1,4 @@ +locals { + tags = ["prometheus-exporter", "postfix"] +} + diff --git a/modules/prometheus-exporter_postfix/variables-gen.tf b/modules/prometheus-exporter_postfix/variables-gen.tf new file mode 100644 index 000000000..3089c2425 --- /dev/null +++ b/modules/prometheus-exporter_postfix/variables-gen.tf @@ -0,0 +1,410 @@ +# heartbeat detector + +variable "heartbeat_notifications" { + description = "Notification 
recipients list per severity overridden for heartbeat detector" + type = map(list(string)) + default = {} +} + +variable "heartbeat_aggregation_function" { + description = "Aggregation function and group by for heartbeat detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "heartbeat_max_delay" { + description = "Enforce max delay for heartbeat detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "heartbeat_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "heartbeat_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "heartbeat_disabled" { + description = "Disable all alerting rules for heartbeat detector" + type = bool + default = null +} + +variable "heartbeat_exclude_not_running_vm" { + description = "Don’t send alerts if associated VM is stopped or stopping (metadata provided by cloud provider integration). Can be useful for ephemeral infrastructure (such as auto scaling groups) as VM will be stopped and started regularly. Note that timeframe must be at least 25 minutes for the metadata to be available to the detector." + type = bool + default = true +} + +variable "heartbeat_timeframe" { + description = "Timeframe for heartbeat detector (i.e. \"25m\"). 
Must be at least \"25m\" if \"heartbeat_exclude_not_running_vm\" is true" + type = string + default = "25m" +} + +# postfix_showq_message_size_bytes_count_deferred detector + +variable "postfix_showq_message_size_bytes_count_deferred_notifications" { + description = "Notification recipients list per severity overridden for postfix_showq_message_size_bytes_count_deferred detector" + type = map(list(string)) + default = {} +} + +variable "postfix_showq_message_size_bytes_count_deferred_aggregation_function" { + description = "Aggregation function and group by for postfix_showq_message_size_bytes_count_deferred detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "postfix_showq_message_size_bytes_count_deferred_transformation_function" { + description = "Transformation function for postfix_showq_message_size_bytes_count_deferred detector (i.e. \".mean(over='5m')\")" + type = string + default = ".min(over='30m')" +} + +variable "postfix_showq_message_size_bytes_count_deferred_max_delay" { + description = "Enforce max delay for postfix_showq_message_size_bytes_count_deferred detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "postfix_showq_message_size_bytes_count_deferred_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "postfix_showq_message_size_bytes_count_deferred_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "postfix_showq_message_size_bytes_count_deferred_disabled" { + description = "Disable all alerting rules for postfix_showq_message_size_bytes_count_deferred detector" + type = bool + default = null +} + +variable "postfix_showq_message_size_bytes_count_deferred_disabled_critical" { + description = "Disable critical alerting rule for 
postfix_showq_message_size_bytes_count_deferred detector" + type = bool + default = null +} + +variable "postfix_showq_message_size_bytes_count_deferred_disabled_major" { + description = "Disable major alerting rule for postfix_showq_message_size_bytes_count_deferred detector" + type = bool + default = null +} + +variable "postfix_showq_message_size_bytes_count_deferred_threshold_critical" { + description = "Critical threshold for postfix_showq_message_size_bytes_count_deferred detector" + type = number + default = 600 +} + +variable "postfix_showq_message_size_bytes_count_deferred_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "postfix_showq_message_size_bytes_count_deferred_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable "postfix_showq_message_size_bytes_count_deferred_threshold_major" { + description = "Major threshold for postfix_showq_message_size_bytes_count_deferred detector" + type = number + default = 300 +} + +variable "postfix_showq_message_size_bytes_count_deferred_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "postfix_showq_message_size_bytes_count_deferred_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +# postfix_showq_message_size_bytes_count_hold detector + +variable "postfix_showq_message_size_bytes_count_hold_notifications" { + description = "Notification recipients list per severity overridden for postfix_showq_message_size_bytes_count_hold detector" + type = map(list(string)) + default = {} +} + +variable 
"postfix_showq_message_size_bytes_count_hold_aggregation_function" { + description = "Aggregation function and group by for postfix_showq_message_size_bytes_count_hold detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "postfix_showq_message_size_bytes_count_hold_transformation_function" { + description = "Transformation function for postfix_showq_message_size_bytes_count_hold detector (i.e. \".mean(over='5m')\")" + type = string + default = ".min(over='30m')" +} + +variable "postfix_showq_message_size_bytes_count_hold_max_delay" { + description = "Enforce max delay for postfix_showq_message_size_bytes_count_hold detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "postfix_showq_message_size_bytes_count_hold_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "postfix_showq_message_size_bytes_count_hold_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "postfix_showq_message_size_bytes_count_hold_disabled" { + description = "Disable all alerting rules for postfix_showq_message_size_bytes_count_hold detector" + type = bool + default = null +} + +variable "postfix_showq_message_size_bytes_count_hold_disabled_critical" { + description = "Disable critical alerting rule for postfix_showq_message_size_bytes_count_hold detector" + type = bool + default = null +} + +variable "postfix_showq_message_size_bytes_count_hold_disabled_major" { + description = "Disable major alerting rule for postfix_showq_message_size_bytes_count_hold detector" + type = bool + default = null +} + +variable "postfix_showq_message_size_bytes_count_hold_threshold_critical" { + description = "Critical threshold for postfix_showq_message_size_bytes_count_hold detector" + type = number + default = 600 +} + +variable 
"postfix_showq_message_size_bytes_count_hold_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "postfix_showq_message_size_bytes_count_hold_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable "postfix_showq_message_size_bytes_count_hold_threshold_major" { + description = "Major threshold for postfix_showq_message_size_bytes_count_hold detector" + type = number + default = 300 +} + +variable "postfix_showq_message_size_bytes_count_hold_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "postfix_showq_message_size_bytes_count_hold_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +# postfix_showq_message_size_bytes_count_maildrop detector + +variable "postfix_showq_message_size_bytes_count_maildrop_notifications" { + description = "Notification recipients list per severity overridden for postfix_showq_message_size_bytes_count_maildrop detector" + type = map(list(string)) + default = {} +} + +variable "postfix_showq_message_size_bytes_count_maildrop_aggregation_function" { + description = "Aggregation function and group by for postfix_showq_message_size_bytes_count_maildrop detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "postfix_showq_message_size_bytes_count_maildrop_transformation_function" { + description = "Transformation function for postfix_showq_message_size_bytes_count_maildrop detector (i.e. 
\".mean(over='5m')\")" + type = string + default = ".min(over='30m')" +} + +variable "postfix_showq_message_size_bytes_count_maildrop_max_delay" { + description = "Enforce max delay for postfix_showq_message_size_bytes_count_maildrop detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "postfix_showq_message_size_bytes_count_maildrop_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "postfix_showq_message_size_bytes_count_maildrop_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "postfix_showq_message_size_bytes_count_maildrop_disabled" { + description = "Disable all alerting rules for postfix_showq_message_size_bytes_count_maildrop detector" + type = bool + default = null +} + +variable "postfix_showq_message_size_bytes_count_maildrop_disabled_critical" { + description = "Disable critical alerting rule for postfix_showq_message_size_bytes_count_maildrop detector" + type = bool + default = null +} + +variable "postfix_showq_message_size_bytes_count_maildrop_disabled_major" { + description = "Disable major alerting rule for postfix_showq_message_size_bytes_count_maildrop detector" + type = bool + default = null +} + +variable "postfix_showq_message_size_bytes_count_maildrop_threshold_critical" { + description = "Critical threshold for postfix_showq_message_size_bytes_count_maildrop detector" + type = number + default = 600 +} + +variable "postfix_showq_message_size_bytes_count_maildrop_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "postfix_showq_message_size_bytes_count_maildrop_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert 
(>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable "postfix_showq_message_size_bytes_count_maildrop_threshold_major" { + description = "Major threshold for postfix_showq_message_size_bytes_count_maildrop detector" + type = number + default = 300 +} + +variable "postfix_showq_message_size_bytes_count_maildrop_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "postfix_showq_message_size_bytes_count_maildrop_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +# postfix_smtp_delivery_delay_seconds_count detector + +variable "postfix_smtp_delivery_delay_seconds_count_notifications" { + description = "Notification recipients list per severity overridden for postfix_smtp_delivery_delay_seconds_count detector" + type = map(list(string)) + default = {} +} + +variable "postfix_smtp_delivery_delay_seconds_count_aggregation_function" { + description = "Aggregation function and group by for postfix_smtp_delivery_delay_seconds_count detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "postfix_smtp_delivery_delay_seconds_count_transformation_function" { + description = "Transformation function for postfix_smtp_delivery_delay_seconds_count detector (i.e. 
\".mean(over='5m')\")" + type = string + default = ".min(over='30m')" +} + +variable "postfix_smtp_delivery_delay_seconds_count_max_delay" { + description = "Enforce max delay for postfix_smtp_delivery_delay_seconds_count detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "postfix_smtp_delivery_delay_seconds_count_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "postfix_smtp_delivery_delay_seconds_count_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "postfix_smtp_delivery_delay_seconds_count_disabled" { + description = "Disable all alerting rules for postfix_smtp_delivery_delay_seconds_count detector" + type = bool + default = null +} + +variable "postfix_smtp_delivery_delay_seconds_count_disabled_critical" { + description = "Disable critical alerting rule for postfix_smtp_delivery_delay_seconds_count detector" + type = bool + default = null +} + +variable "postfix_smtp_delivery_delay_seconds_count_disabled_major" { + description = "Disable major alerting rule for postfix_smtp_delivery_delay_seconds_count detector" + type = bool + default = null +} + +variable "postfix_smtp_delivery_delay_seconds_count_threshold_critical" { + description = "Critical threshold for postfix_smtp_delivery_delay_seconds_count detector" + type = number + default = 60 +} + +variable "postfix_smtp_delivery_delay_seconds_count_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "postfix_smtp_delivery_delay_seconds_count_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable 
"postfix_smtp_delivery_delay_seconds_count_threshold_major" { + description = "Major threshold for postfix_smtp_delivery_delay_seconds_count detector" + type = number + default = 45 +} + +variable "postfix_smtp_delivery_delay_seconds_count_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "postfix_smtp_delivery_delay_seconds_count_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} diff --git a/modules/smart-agent_redis/README.md b/modules/smart-agent_redis/README.md index b71f9ec63..01baa38a2 100644 --- a/modules/smart-agent_redis/README.md +++ b/modules/smart-agent_redis/README.md @@ -140,8 +140,8 @@ parameter to the corresponding monitor configuration: - '!bytes.maxmemory' - '!bytes.total_system_memory' - '!bytes.used_memory' - - '!${var.use_otel_receiver ? "redis.client.blocked" : "gauge.blocked_clients"}' - - '!${var.use_otel_receiver ? "redis.client.connected" : "gauge.connected_clients"}' + - '!${var.use_otel_receiver ? "redis.clients.blocked" : "gauge.blocked_clients"}' + - '!${var.use_otel_receiver ? "redis.clients.connected" : "gauge.connected_clients"}' - '!${var.use_otel_receiver ? "redis.connections.rejected" : "counter.rejected_connections"}' - '!${var.use_otel_receiver ? "redis.db.keys" : "gauge.db0_keys"}' - '!${var.use_otel_receiver ? "redis.keys.evicted" : "counter.evicted_keys"}' diff --git a/modules/smart-agent_redis/conf/03-blocked-clients.yaml b/modules/smart-agent_redis/conf/03-blocked-clients.yaml index ed4b711c8..e7da685d3 100644 --- a/modules/smart-agent_redis/conf/03-blocked-clients.yaml +++ b/modules/smart-agent_redis/conf/03-blocked-clients.yaml @@ -5,9 +5,9 @@ value_unit: "%" signals: A: - metric: '${var.use_otel_receiver ? "redis.client.blocked" : "gauge.blocked_clients"}' + metric: '${var.use_otel_receiver ? 
"redis.clients.blocked" : "gauge.blocked_clients"}' B: - metric: '${var.use_otel_receiver ? "redis.client.connected" : "gauge.connected_clients"}' + metric: '${var.use_otel_receiver ? "redis.clients.connected" : "gauge.connected_clients"}' signal: formula: (A/B).scale(100) diff --git a/modules/smart-agent_redis/detectors-gen.tf b/modules/smart-agent_redis/detectors-gen.tf index 80c2012bb..51c2d80c4 100644 --- a/modules/smart-agent_redis/detectors-gen.tf +++ b/modules/smart-agent_redis/detectors-gen.tf @@ -121,8 +121,8 @@ resource "signalfx_detector" "blocked_over_connected_clients_ratio" { } program_text = <<-EOF - A = data('${var.use_otel_receiver ? "redis.client.blocked" : "gauge.blocked_clients"}', filter=${module.filtering.signalflow})${var.blocked_over_connected_clients_ratio_aggregation_function}${var.blocked_over_connected_clients_ratio_transformation_function} - B = data('${var.use_otel_receiver ? "redis.client.connected" : "gauge.connected_clients"}', filter=${module.filtering.signalflow})${var.blocked_over_connected_clients_ratio_aggregation_function}${var.blocked_over_connected_clients_ratio_transformation_function} + A = data('${var.use_otel_receiver ? "redis.clients.blocked" : "gauge.blocked_clients"}', filter=${module.filtering.signalflow})${var.blocked_over_connected_clients_ratio_aggregation_function}${var.blocked_over_connected_clients_ratio_transformation_function} + B = data('${var.use_otel_receiver ? 
"redis.clients.connected" : "gauge.connected_clients"}', filter=${module.filtering.signalflow})${var.blocked_over_connected_clients_ratio_aggregation_function}${var.blocked_over_connected_clients_ratio_transformation_function} signal = (A/B).scale(100).publish('signal') detect(when(signal > ${var.blocked_over_connected_clients_ratio_threshold_critical}%{if var.blocked_over_connected_clients_ratio_lasting_duration_critical != null}, lasting='${var.blocked_over_connected_clients_ratio_lasting_duration_critical}', at_least=${var.blocked_over_connected_clients_ratio_at_least_percentage_critical}%{endif})).publish('CRIT') detect(when(signal > ${var.blocked_over_connected_clients_ratio_threshold_major}%{if var.blocked_over_connected_clients_ratio_lasting_duration_major != null}, lasting='${var.blocked_over_connected_clients_ratio_lasting_duration_major}', at_least=${var.blocked_over_connected_clients_ratio_at_least_percentage_major}%{endif}) and (not when(signal > ${var.blocked_over_connected_clients_ratio_threshold_critical}%{if var.blocked_over_connected_clients_ratio_lasting_duration_critical != null}, lasting='${var.blocked_over_connected_clients_ratio_lasting_duration_critical}', at_least=${var.blocked_over_connected_clients_ratio_at_least_percentage_critical}%{endif}))).publish('MAJOR')