From 3f85d43e5b36fa7ecc4b433658796908ae94d51f Mon Sep 17 00:00:00 2001
From: nsd <2766897+NSenaud@users.noreply.github.com>
Date: Fri, 24 Mar 2023 10:31:11 +0100
Subject: [PATCH] Add AWS Backup detectors (#457)

* feat(aws_backup): Init detectors module

  Supersedes https://github.com/claranet/terraform-signalfx-detectors/pull/338

* feat(aws_backup): Add recovery points detectors

* refactor(aws_backup): improvements according to comments

* style(aws_backup): lowercase name

  Co-authored-by: Patrick Decat

---------

Co-authored-by: Nicolas VION
Co-authored-by: Patrick Decat
---
 docs/severity.md                                   |  13 +
 modules/integration_aws-backup/README.md           | 117 ++++++
 .../integration_aws-backup/common-filters.tf       |   3 +
 .../integration_aws-backup/common-locals.tf        |   1 +
 .../integration_aws-backup/common-modules.tf       |   1 +
 .../common-variables.tf                            |   1 +
 .../integration_aws-backup/common-versions.tf      |   1 +
 .../conf/00-aws-backup-failed.yaml                 |  16 +
 .../conf/01-aws-backup-job-expired.yaml            |  17 +
 .../conf/02-aws-backup-copyjob-failed.yaml         |  16 +
 .../conf/03-aws-backup-check.yaml                  |  23 ++
 .../conf/04-aws-backup-rp-partial.yaml             |  16 +
 .../conf/05-aws-backup-rp-expired.yaml             |  16 +
 .../integration_aws-backup/conf/readme.yaml        |   5 +
 .../integration_aws-backup/detectors-gen.tf        | 170 ++++
 modules/integration_aws-backup/outputs.tf          |  30 ++
 modules/integration_aws-backup/tags.tf             |   4 +
 .../integration_aws-backup/variables-gen.tf        | 366 ++++++
 18 files changed, 816 insertions(+)
 create mode 100644 modules/integration_aws-backup/README.md
 create mode 100644 modules/integration_aws-backup/common-filters.tf
 create mode 120000 modules/integration_aws-backup/common-locals.tf
 create mode 120000 modules/integration_aws-backup/common-modules.tf
 create mode 120000 modules/integration_aws-backup/common-variables.tf
 create mode 120000 modules/integration_aws-backup/common-versions.tf
 create mode 100644 modules/integration_aws-backup/conf/00-aws-backup-failed.yaml
 create mode 100644 modules/integration_aws-backup/conf/01-aws-backup-job-expired.yaml
 create mode 100644 modules/integration_aws-backup/conf/02-aws-backup-copyjob-failed.yaml
 create mode 100644 modules/integration_aws-backup/conf/03-aws-backup-check.yaml
 create mode 100644 modules/integration_aws-backup/conf/04-aws-backup-rp-partial.yaml
 create mode 100644 modules/integration_aws-backup/conf/05-aws-backup-rp-expired.yaml
 create mode 100644 modules/integration_aws-backup/conf/readme.yaml
 create mode 100644 modules/integration_aws-backup/detectors-gen.tf
 create mode 100644 modules/integration_aws-backup/outputs.tf
 create mode 100644 modules/integration_aws-backup/tags.tf
 create mode 100644 modules/integration_aws-backup/variables-gen.tf

diff --git a/docs/severity.md b/docs/severity.md
index 497370117..2c24b888d 100644
--- a/docs/severity.md
+++ b/docs/severity.md
@@ -9,6 +9,7 @@
 - [fame_azure-vpn](#fame_azure-vpn)
 - [integration_aws-alb](#integration_aws-alb)
 - [integration_aws-apigateway](#integration_aws-apigateway)
+- [integration_aws-backup](#integration_aws-backup)
 - [integration_aws-beanstalk](#integration_aws-beanstalk)
 - [integration_aws-ecs-cluster](#integration_aws-ecs-cluster)
 - [integration_aws-ecs-service](#integration_aws-ecs-service)
@@ -158,6 +159,18 @@
 |AWS APIGateway http 4xx error rate|X|X|X|-|-|
 
 
+## integration_aws-backup
+
+|Detector|Critical|Major|Minor|Warning|Info|
+|---|---|---|---|---|---|
+|AWS Backup failed|X|-|-|-|-|
+|AWS Backup job expired|X|-|-|-|-|
+|AWS Backup copy jobs failed|X|-|-|-|-|
+|AWS Backup check jobs completed successfully|X|-|-|-|-|
+|AWS Backup recovery point partial|-|-|X|-|-|
+|AWS Backup recovery point expired|-|X|-|-|-|
+
+
 ## integration_aws-beanstalk
 
 |Detector|Critical|Major|Minor|Warning|Info|
diff --git a/modules/integration_aws-backup/README.md b/modules/integration_aws-backup/README.md
new file mode 100644
index 000000000..7e3e67fd7
--- /dev/null
+++ b/modules/integration_aws-backup/README.md
@@ -0,0 +1,117 @@
+# AWS-BACKUP SignalFx detectors
+
+:link: **Contents**
+
+- [How to use this module?](#how-to-use-this-module)
+- [What are the available detectors in this module?](#what-are-the-available-detectors-in-this-module)
+- [How to collect required metrics?](#how-to-collect-required-metrics)
+  - [Metrics](#metrics)
+- [Related documentation](#related-documentation)
+
+## How to use this module?
+
+This directory defines a [Terraform](https://www.terraform.io/)
+[module](https://www.terraform.io/language/modules/syntax) you can use in your
+existing [stack](https://github.com/claranet/terraform-signalfx-detectors/wiki/Getting-started#stack) by adding a
+`module` configuration and setting its `source` parameter to the URL of this folder:
+
+```hcl
+module "signalfx-detectors-integration-aws-backup" {
+  source = "github.com/claranet/terraform-signalfx-detectors.git//modules/integration_aws-backup?ref={revision}"
+
+  environment   = var.environment
+  notifications = local.notifications
+}
+```
+
+Note the following parameters:
+
+* `source`: Use this parameter to specify the URL of the module. The double slash (`//`) is intentional and required.
+  Terraform uses it to specify subfolders within a Git repo (see [module
+  sources](https://www.terraform.io/language/modules/sources)). The `ref` parameter specifies a specific Git tag in
+  this repository. It is recommended to use the latest "pinned" version in place of `{revision}`. Avoid using a branch
+  like `master` except for testing purposes. Note that every module in this repository is available on the Terraform
+  [registry](https://registry.terraform.io/modules/claranet/detectors/signalfx) and we recommend using it as the source
+  instead of `git`, which is more flexible but less future-proof.
+
+* `environment`: Use this parameter to specify the
+  [environment](https://github.com/claranet/terraform-signalfx-detectors/wiki/Getting-started#environment) used by this
+  instance of the module.
+  Its value will be added to the `prefixes` list at the start of the [detector
+  name](https://github.com/claranet/terraform-signalfx-detectors/wiki/Templating#example).
+  In general, it will also be used in the `filtering` internal sub-module to [apply
+  filters](https://github.com/claranet/terraform-signalfx-detectors/wiki/Guidance#filtering) based on our default
+  [tagging convention](https://github.com/claranet/terraform-signalfx-detectors/wiki/Tagging-convention) by default.
+
+* `notifications`: Use this parameter to define where alerts should be sent depending on their severity. It consists
+  of a Terraform [object](https://www.terraform.io/language/expressions/type-constraints#object) where each key represents an available
+  [detector rule severity](https://docs.splunk.com/observability/alerts-detectors-notifications/create-detectors-for-alerts.html#severity)
+  and its value is a list of recipients. Every recipient must respect the [detector notification
+  format](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/detector#notification-format);
+  see the sketch after this list for an example.
+  Check the [notification binding](https://github.com/claranet/terraform-signalfx-detectors/wiki/Notifications-binding)
+  documentation to understand the recommended role of each severity.
+
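+For example, a minimal sketch of a matching `notifications` object; the credential ids, channel,
+and email address are hypothetical placeholders to replace with your own integrations:
+
+```hcl
+locals {
+  notifications = {
+    critical = ["PagerDuty,credentialId"]     # placeholder credential id
+    major    = ["Slack,credentialId,channel"] # placeholder credential id and channel
+    minor    = ["Slack,credentialId,channel"]
+    warning  = ["Email,ops@example.com"]      # placeholder address
+    info     = []
+  }
+}
+```
+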
+These 3 parameters, along with all variables defined in [common-variables.tf](common-variables.tf), are common to all
+[modules](../) in this repository. Other variables, specific to this module, are available in
+[variables-gen.tf](variables-gen.tf).
+In general, the default configuration "works" but all of these Terraform
+[variables](https://www.terraform.io/language/values/variables) make it possible to
+customize the detectors' behavior to better fit your needs.
+
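+For example, a minimal sketch disabling a single detector of this module while keeping the others
+(the override shown is illustrative, not a recommendation):
+
+```hcl
+module "signalfx-detectors-integration-aws-backup" {
+  source = "github.com/claranet/terraform-signalfx-detectors.git//modules/integration_aws-backup?ref={revision}"
+
+  environment   = var.environment
+  notifications = local.notifications
+
+  # turn off only the "recovery point partial" detector
+  backup_rp_partial_disabled = true
+}
+```
+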
+Most of them represent usual tips and rules detailed in the
+[guidance](https://github.com/claranet/terraform-signalfx-detectors/wiki/Guidance) documentation and listed in the
+common [variables](https://github.com/claranet/terraform-signalfx-detectors/wiki/Variables) dedicated documentation.
+
+Feel free to explore the [wiki](https://github.com/claranet/terraform-signalfx-detectors/wiki) for more information about
+the general usage of this repository.
+
+## What are the available detectors in this module?
+
+This module creates the following SignalFx detectors, each of which may contain one or more alerting rules:
+
+|Detector|Critical|Major|Minor|Warning|Info|
+|---|---|---|---|---|---|
+|AWS Backup failed|X|-|-|-|-|
+|AWS Backup job expired|X|-|-|-|-|
+|AWS Backup copy jobs failed|X|-|-|-|-|
+|AWS Backup check jobs completed successfully|X|-|-|-|-|
+|AWS Backup recovery point partial|-|-|X|-|-|
+|AWS Backup recovery point expired|-|X|-|-|-|
+
+## How to collect required metrics?
+
+This module deploys detectors using metrics reported by the
+[AWS integration](https://docs.splunk.com/Observability/gdi/get-data-in/connect/aws/aws.html) configurable
+with [this Terraform module](https://github.com/claranet/terraform-signalfx-integrations/tree/master/cloud/aws).
+
+Check the [Related documentation](#related-documentation) section for more detailed and specific information about this module's dependencies.
+
+### Metrics
+
+Here is the list of required metrics for detectors in this module.
+
+* `NumberOfBackupJobsCompleted`
+* `NumberOfBackupJobsCreated`
+* `NumberOfBackupJobsExpired`
+* `NumberOfBackupJobsFailed`
+* `NumberOfCopyJobsFailed`
+* `NumberOfRecoveryPointsExpired`
+* `NumberOfRecoveryPointsPartial`
+
+## Related documentation
+
+* [Terraform SignalFx provider](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs)
+* [Terraform SignalFx detector](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/detector)
+* [Splunk Observability integrations](https://docs.splunk.com/Observability/gdi/get-data-in/integrations.html)
+* [CloudWatch metrics](https://docs.aws.amazon.com/aws-backup/latest/devguide/cloudwatch.html)
diff --git a/modules/integration_aws-backup/common-filters.tf b/modules/integration_aws-backup/common-filters.tf
new file mode 100644
index 000000000..d1ac2ad87
--- /dev/null
+++ b/modules/integration_aws-backup/common-filters.tf
@@ -0,0 +1,3 @@
+locals {
+  filters = "filter('ResourceType', '*')"
+}
\ No newline at end of file
diff --git a/modules/integration_aws-backup/common-locals.tf b/modules/integration_aws-backup/common-locals.tf
new file mode 120000
index 000000000..5672d21ab
--- /dev/null
+++ b/modules/integration_aws-backup/common-locals.tf
@@ -0,0 +1 @@
+../../common/module/locals.tf
\ No newline at end of file
diff --git a/modules/integration_aws-backup/common-modules.tf b/modules/integration_aws-backup/common-modules.tf
new file mode 120000
index 000000000..8c81ef377
--- /dev/null
+++ b/modules/integration_aws-backup/common-modules.tf
@@ -0,0 +1 @@
+../../common/module/modules.tf
\ No newline at end of file
diff --git a/modules/integration_aws-backup/common-variables.tf b/modules/integration_aws-backup/common-variables.tf
new file mode 120000
index 000000000..f3037a584
--- /dev/null
+++ b/modules/integration_aws-backup/common-variables.tf
@@ -0,0 +1 @@
+../../common/module/variables.tf
\ No newline at end of file
diff --git a/modules/integration_aws-backup/common-versions.tf b/modules/integration_aws-backup/common-versions.tf
new file mode 120000
index 000000000..fa7f5509f
--- /dev/null
+++ b/modules/integration_aws-backup/common-versions.tf
@@ -0,0 +1 @@
+../../common/module/versions.tf
\ No newline at end of file
diff --git a/modules/integration_aws-backup/conf/00-aws-backup-failed.yaml b/modules/integration_aws-backup/conf/00-aws-backup-failed.yaml
new file mode 100644
index 000000000..9df275133
--- /dev/null
+++ b/modules/integration_aws-backup/conf/00-aws-backup-failed.yaml
@@ -0,0 +1,16 @@
+module: AWS Backup
+name: failed
+id: backup_failed
+
+transformation: ".max(over='1d').fill(0)"
+aggregation: true
+filtering: "filter('namespace', 'AWS/Backup') and filter('stat', 'sum')"
+
+signals:
+  signal:
+    metric: NumberOfBackupJobsFailed
+rules:
+  critical:
+    threshold: 0
+    comparator: ">"
+    lasting_duration: '1h'
diff --git a/modules/integration_aws-backup/conf/01-aws-backup-job-expired.yaml b/modules/integration_aws-backup/conf/01-aws-backup-job-expired.yaml
new file mode 100644
index 000000000..017403c6d
--- /dev/null
+++ b/modules/integration_aws-backup/conf/01-aws-backup-job-expired.yaml
@@ -0,0 +1,17 @@
+module: AWS Backup
+name: job expired
+id: backup_job_expired
+
+transformation: ".max(over='1d').fill(0)"
+aggregation: true
+filtering: "filter('namespace', 'AWS/Backup') and filter('stat', 'sum')"
+
+signals:
+  signal:
+    metric: NumberOfBackupJobsExpired
+    extrapolation: zero
+rules:
+  critical:
+    threshold: 0
+    comparator: ">"
+    lasting_duration: '1h'
diff --git a/modules/integration_aws-backup/conf/02-aws-backup-copyjob-failed.yaml b/modules/integration_aws-backup/conf/02-aws-backup-copyjob-failed.yaml
new file mode 100644
index 000000000..8837302ce
--- /dev/null
+++ b/modules/integration_aws-backup/conf/02-aws-backup-copyjob-failed.yaml
@@ -0,0 +1,16 @@
+module: AWS Backup
+name: copy jobs failed
+id: backup_copy_jobs_failed
+
+transformation: ".max(over='1d').fill(0)"
+aggregation: true
+filtering: "filter('namespace', 'AWS/Backup') and filter('stat', 'sum')"
+
+signals:
+  signal:
+    metric: NumberOfCopyJobsFailed
+rules:
+  critical:
+    threshold: 0
+    comparator: ">"
+    lasting_duration: '1h'
diff --git a/modules/integration_aws-backup/conf/03-aws-backup-check.yaml b/modules/integration_aws-backup/conf/03-aws-backup-check.yaml
new file mode 100644
index 000000000..bbf29602e
--- /dev/null
+++ b/modules/integration_aws-backup/conf/03-aws-backup-check.yaml
@@ -0,0 +1,23 @@
+module: AWS Backup
+name: check jobs completed successfully
+id: backup_successful
+
+transformation: ".min(over='23h')"
+aggregation: true
+filtering: "filter('namespace', 'AWS/Backup') and filter('stat', 'sum')"
+
+signals:
+  created:
+    metric: NumberOfBackupJobsCreated
+    extrapolation: zero
+  completed:
+    metric: NumberOfBackupJobsCompleted
+    extrapolation: zero
+  signal:
+    formula: (created-completed)
+rules:
+  critical:
+    threshold: 0
+    comparator: ">"
+    lasting_duration: 1d
+    lasting_at_least: 0.9
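With this last configuration's defaults (transformation `.min(over='23h')`, no aggregation function), the YAML above should expand to roughly the following SignalFlow once generated, where `<filtering>` stands in for the filters injected by the module's `filtering` sub-module:

```
base_filtering = filter('namespace', 'AWS/Backup') and filter('stat', 'sum')
created = data('NumberOfBackupJobsCreated', filter=base_filtering and <filtering>, extrapolation='zero').min(over='23h')
completed = data('NumberOfBackupJobsCompleted', filter=base_filtering and <filtering>, extrapolation='zero').min(over='23h')
signal = (created-completed).publish('signal')
detect(when(signal > 0, lasting='1d', at_least=0.9)).publish('CRIT')
```

Roughly speaking, a backup job that is created but never completes keeps `created-completed` above 0; if that holds for at least 90% of a day, the critical rule fires.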
diff --git a/modules/integration_aws-backup/conf/04-aws-backup-rp-partial.yaml b/modules/integration_aws-backup/conf/04-aws-backup-rp-partial.yaml
new file mode 100644
index 000000000..9d399db57
--- /dev/null
+++ b/modules/integration_aws-backup/conf/04-aws-backup-rp-partial.yaml
@@ -0,0 +1,16 @@
+module: AWS Backup
+name: recovery point partial
+id: backup_rp_partial
+
+transformation: ".max(over='1d').fill(0)"
+aggregation: true
+filtering: "filter('namespace', 'AWS/Backup') and filter('stat', 'sum')"
+
+signals:
+  signal:
+    metric: NumberOfRecoveryPointsPartial
+rules:
+  minor:
+    threshold: 0
+    comparator: ">"
+    lasting_duration: '1h'
diff --git a/modules/integration_aws-backup/conf/05-aws-backup-rp-expired.yaml b/modules/integration_aws-backup/conf/05-aws-backup-rp-expired.yaml
new file mode 100644
index 000000000..c0aa5484d
--- /dev/null
+++ b/modules/integration_aws-backup/conf/05-aws-backup-rp-expired.yaml
@@ -0,0 +1,16 @@
+module: AWS Backup
+name: recovery point expired
+id: backup_rp_expired
+
+transformation: ".max(over='1d').fill(0)"
+aggregation: true
+filtering: "filter('namespace', 'AWS/Backup') and filter('stat', 'sum')"
+
+signals:
+  signal:
+    metric: NumberOfRecoveryPointsExpired
+rules:
+  major:
+    threshold: 0
+    comparator: ">"
+    lasting_duration: '1h'
diff --git a/modules/integration_aws-backup/conf/readme.yaml b/modules/integration_aws-backup/conf/readme.yaml
new file mode 100644
index 000000000..8e926892a
--- /dev/null
+++ b/modules/integration_aws-backup/conf/readme.yaml
@@ -0,0 +1,5 @@
+documentations:
+  - name: CloudWatch metrics
+    url: 'https://docs.aws.amazon.com/aws-backup/latest/devguide/cloudwatch.html'
+
+source_doc:
\ No newline at end of file
diff --git a/modules/integration_aws-backup/detectors-gen.tf b/modules/integration_aws-backup/detectors-gen.tf
new file mode 100644
index 000000000..b30d6a32b
--- /dev/null
+++ b/modules/integration_aws-backup/detectors-gen.tf
@@ -0,0 +1,170 @@
+resource "signalfx_detector" "backup_failed" {
+  name = format("%s %s", local.detector_name_prefix, "AWS Backup failed")
+
+  authorized_writer_teams = var.authorized_writer_teams
+  teams                   = try(coalescelist(var.teams, var.authorized_writer_teams), null)
+  tags                    = compact(concat(local.common_tags, local.tags, var.extra_tags))
+
+  program_text = <<-EOF
+    base_filtering = filter('namespace', 'AWS/Backup') and filter('stat', 'sum')
+    signal = data('NumberOfBackupJobsFailed', filter=base_filtering and ${module.filtering.signalflow})${var.backup_failed_aggregation_function}${var.backup_failed_transformation_function}.publish('signal')
+    detect(when(signal > ${var.backup_failed_threshold_critical}, lasting=%{if var.backup_failed_lasting_duration_critical == null}None%{else}'${var.backup_failed_lasting_duration_critical}'%{endif}, at_least=${var.backup_failed_at_least_percentage_critical})).publish('CRIT')
+EOF
+
+  rule {
+    description           = "is too high > ${var.backup_failed_threshold_critical}"
+    severity              = "Critical"
+    detect_label          = "CRIT"
+    disabled              = coalesce(var.backup_failed_disabled, var.detectors_disabled)
+    notifications         = try(coalescelist(lookup(var.backup_failed_notifications, "critical", []), var.notifications.critical), null)
+    runbook_url           = try(coalesce(var.backup_failed_runbook_url, var.runbook_url), "")
+    tip                   = var.backup_failed_tip
+    parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
+    parameterized_body    = var.message_body == "" ? local.rule_body : var.message_body
+  }
+
+  max_delay = var.backup_failed_max_delay
+}
+
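+# With the module defaults (no aggregation function, transformation ".max(over='1d').fill(0)",
+# threshold 0, lasting duration '1h', at_least 1), the template above should render roughly to:
+#
+#   base_filtering = filter('namespace', 'AWS/Backup') and filter('stat', 'sum')
+#   signal = data('NumberOfBackupJobsFailed', filter=base_filtering and <filtering>).max(over='1d').fill(0).publish('signal')
+#   detect(when(signal > 0, lasting='1h', at_least=1)).publish('CRIT')
+#
+# where <filtering> stands for the SignalFlow emitted by the filtering sub-module.
+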
+resource "signalfx_detector" "backup_job_expired" {
+  name = format("%s %s", local.detector_name_prefix, "AWS Backup job expired")
+
+  authorized_writer_teams = var.authorized_writer_teams
+  teams                   = try(coalescelist(var.teams, var.authorized_writer_teams), null)
+  tags                    = compact(concat(local.common_tags, local.tags, var.extra_tags))
+
+  program_text = <<-EOF
+    base_filtering = filter('namespace', 'AWS/Backup') and filter('stat', 'sum')
+    signal = data('NumberOfBackupJobsExpired', filter=base_filtering and ${module.filtering.signalflow}, extrapolation='zero')${var.backup_job_expired_aggregation_function}${var.backup_job_expired_transformation_function}.publish('signal')
+    detect(when(signal > ${var.backup_job_expired_threshold_critical}, lasting=%{if var.backup_job_expired_lasting_duration_critical == null}None%{else}'${var.backup_job_expired_lasting_duration_critical}'%{endif}, at_least=${var.backup_job_expired_at_least_percentage_critical})).publish('CRIT')
+EOF
+
+  rule {
+    description           = "is too high > ${var.backup_job_expired_threshold_critical}"
+    severity              = "Critical"
+    detect_label          = "CRIT"
+    disabled              = coalesce(var.backup_job_expired_disabled, var.detectors_disabled)
+    notifications         = try(coalescelist(lookup(var.backup_job_expired_notifications, "critical", []), var.notifications.critical), null)
+    runbook_url           = try(coalesce(var.backup_job_expired_runbook_url, var.runbook_url), "")
+    tip                   = var.backup_job_expired_tip
+    parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
+    parameterized_body    = var.message_body == "" ? local.rule_body : var.message_body
+  }
+
+  max_delay = var.backup_job_expired_max_delay
+}
+
+resource "signalfx_detector" "backup_copy_jobs_failed" {
+  name = format("%s %s", local.detector_name_prefix, "AWS Backup copy jobs failed")
+
+  authorized_writer_teams = var.authorized_writer_teams
+  teams                   = try(coalescelist(var.teams, var.authorized_writer_teams), null)
+  tags                    = compact(concat(local.common_tags, local.tags, var.extra_tags))
+
+  program_text = <<-EOF
+    base_filtering = filter('namespace', 'AWS/Backup') and filter('stat', 'sum')
+    signal = data('NumberOfCopyJobsFailed', filter=base_filtering and ${module.filtering.signalflow})${var.backup_copy_jobs_failed_aggregation_function}${var.backup_copy_jobs_failed_transformation_function}.publish('signal')
+    detect(when(signal > ${var.backup_copy_jobs_failed_threshold_critical}, lasting=%{if var.backup_copy_jobs_failed_lasting_duration_critical == null}None%{else}'${var.backup_copy_jobs_failed_lasting_duration_critical}'%{endif}, at_least=${var.backup_copy_jobs_failed_at_least_percentage_critical})).publish('CRIT')
+EOF
+
+  rule {
+    description           = "is too high > ${var.backup_copy_jobs_failed_threshold_critical}"
+    severity              = "Critical"
+    detect_label          = "CRIT"
+    disabled              = coalesce(var.backup_copy_jobs_failed_disabled, var.detectors_disabled)
+    notifications         = try(coalescelist(lookup(var.backup_copy_jobs_failed_notifications, "critical", []), var.notifications.critical), null)
+    runbook_url           = try(coalesce(var.backup_copy_jobs_failed_runbook_url, var.runbook_url), "")
+    tip                   = var.backup_copy_jobs_failed_tip
+    parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
+    parameterized_body    = var.message_body == "" ? local.rule_body : var.message_body
+  }
+
+  max_delay = var.backup_copy_jobs_failed_max_delay
+}
+
+resource "signalfx_detector" "backup_successful" {
+  name = format("%s %s", local.detector_name_prefix, "AWS Backup check jobs completed successfully")
+
+  authorized_writer_teams = var.authorized_writer_teams
+  teams                   = try(coalescelist(var.teams, var.authorized_writer_teams), null)
+  tags                    = compact(concat(local.common_tags, local.tags, var.extra_tags))
+
+  program_text = <<-EOF
+    base_filtering = filter('namespace', 'AWS/Backup') and filter('stat', 'sum')
+    created = data('NumberOfBackupJobsCreated', filter=base_filtering and ${module.filtering.signalflow}, extrapolation='zero')${var.backup_successful_aggregation_function}${var.backup_successful_transformation_function}
+    completed = data('NumberOfBackupJobsCompleted', filter=base_filtering and ${module.filtering.signalflow}, extrapolation='zero')${var.backup_successful_aggregation_function}${var.backup_successful_transformation_function}
+    signal = (created-completed).publish('signal')
+    detect(when(signal > ${var.backup_successful_threshold_critical}, lasting=%{if var.backup_successful_lasting_duration_critical == null}None%{else}'${var.backup_successful_lasting_duration_critical}'%{endif}, at_least=${var.backup_successful_at_least_percentage_critical})).publish('CRIT')
+EOF
+
+  rule {
+    description           = "is too high > ${var.backup_successful_threshold_critical}"
+    severity              = "Critical"
+    detect_label          = "CRIT"
+    disabled              = coalesce(var.backup_successful_disabled, var.detectors_disabled)
+    notifications         = try(coalescelist(lookup(var.backup_successful_notifications, "critical", []), var.notifications.critical), null)
+    runbook_url           = try(coalesce(var.backup_successful_runbook_url, var.runbook_url), "")
+    tip                   = var.backup_successful_tip
+    parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
+    parameterized_body    = var.message_body == "" ? local.rule_body : var.message_body
+  }
+
+  max_delay = var.backup_successful_max_delay
+}
+
+resource "signalfx_detector" "backup_rp_partial" {
+  name = format("%s %s", local.detector_name_prefix, "AWS Backup recovery point partial")
+
+  authorized_writer_teams = var.authorized_writer_teams
+  teams                   = try(coalescelist(var.teams, var.authorized_writer_teams), null)
+  tags                    = compact(concat(local.common_tags, local.tags, var.extra_tags))
+
+  program_text = <<-EOF
+    base_filtering = filter('namespace', 'AWS/Backup') and filter('stat', 'sum')
+    signal = data('NumberOfRecoveryPointsPartial', filter=base_filtering and ${module.filtering.signalflow})${var.backup_rp_partial_aggregation_function}${var.backup_rp_partial_transformation_function}.publish('signal')
+    detect(when(signal > ${var.backup_rp_partial_threshold_minor}, lasting=%{if var.backup_rp_partial_lasting_duration_minor == null}None%{else}'${var.backup_rp_partial_lasting_duration_minor}'%{endif}, at_least=${var.backup_rp_partial_at_least_percentage_minor})).publish('MINOR')
+EOF
+
+  rule {
+    description           = "is too high > ${var.backup_rp_partial_threshold_minor}"
+    severity              = "Minor"
+    detect_label          = "MINOR"
+    disabled              = coalesce(var.backup_rp_partial_disabled, var.detectors_disabled)
+    notifications         = try(coalescelist(lookup(var.backup_rp_partial_notifications, "minor", []), var.notifications.minor), null)
+    runbook_url           = try(coalesce(var.backup_rp_partial_runbook_url, var.runbook_url), "")
+    tip                   = var.backup_rp_partial_tip
+    parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
+    parameterized_body    = var.message_body == "" ? local.rule_body : var.message_body
+  }
+
+  max_delay = var.backup_rp_partial_max_delay
+}
+
+resource "signalfx_detector" "backup_rp_expired" {
+  name = format("%s %s", local.detector_name_prefix, "AWS Backup recovery point expired")
+
+  authorized_writer_teams = var.authorized_writer_teams
+  teams                   = try(coalescelist(var.teams, var.authorized_writer_teams), null)
+  tags                    = compact(concat(local.common_tags, local.tags, var.extra_tags))
+
+  program_text = <<-EOF
+    base_filtering = filter('namespace', 'AWS/Backup') and filter('stat', 'sum')
+    signal = data('NumberOfRecoveryPointsExpired', filter=base_filtering and ${module.filtering.signalflow})${var.backup_rp_expired_aggregation_function}${var.backup_rp_expired_transformation_function}.publish('signal')
+    detect(when(signal > ${var.backup_rp_expired_threshold_major}, lasting=%{if var.backup_rp_expired_lasting_duration_major == null}None%{else}'${var.backup_rp_expired_lasting_duration_major}'%{endif}, at_least=${var.backup_rp_expired_at_least_percentage_major})).publish('MAJOR')
+EOF
+
+  rule {
+    description           = "is too high > ${var.backup_rp_expired_threshold_major}"
+    severity              = "Major"
+    detect_label          = "MAJOR"
+    disabled              = coalesce(var.backup_rp_expired_disabled, var.detectors_disabled)
+    notifications         = try(coalescelist(lookup(var.backup_rp_expired_notifications, "major", []), var.notifications.major), null)
+    runbook_url           = try(coalesce(var.backup_rp_expired_runbook_url, var.runbook_url), "")
+    tip                   = var.backup_rp_expired_tip
+    parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
+    parameterized_body    = var.message_body == "" ? local.rule_body : var.message_body
+  }
+
+  max_delay = var.backup_rp_expired_max_delay
+}
+
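Recipients can also be overridden per detector through the `*_notifications` maps defined in variables-gen.tf below. A minimal sketch, with a hypothetical placeholder address:

```hcl
module "signalfx-detectors-integration-aws-backup" {
  source = "github.com/claranet/terraform-signalfx-detectors.git//modules/integration_aws-backup?ref={revision}"

  environment   = var.environment
  notifications = local.notifications

  # route only this detector's Major alerts to a dedicated mailbox
  backup_rp_expired_notifications = {
    major = ["Email,backup-team@example.com"]
  }
}
```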
\".mean(over='5m')\")" + type = string + default = ".max(over='1d').fill(0)" +} + +variable "backup_failed_max_delay" { + description = "Enforce max delay for backup_failed detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "backup_failed_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "backup_failed_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "backup_failed_disabled" { + description = "Disable all alerting rules for backup_failed detector" + type = bool + default = null +} + +variable "backup_failed_threshold_critical" { + description = "Critical threshold for backup_failed detector" + type = number + default = 0 +} + +variable "backup_failed_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "1h" +} + +variable "backup_failed_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +# backup_job_expired detector + +variable "backup_job_expired_notifications" { + description = "Notification recipients list per severity overridden for backup_job_expired detector" + type = map(list(string)) + default = {} +} + +variable "backup_job_expired_aggregation_function" { + description = "Aggregation function and group by for backup_job_expired detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "backup_job_expired_transformation_function" { + description = "Transformation function for backup_job_expired detector (i.e. 
\".mean(over='5m')\")" + type = string + default = ".max(over='1d').fill(0)" +} + +variable "backup_job_expired_max_delay" { + description = "Enforce max delay for backup_job_expired detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "backup_job_expired_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "backup_job_expired_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "backup_job_expired_disabled" { + description = "Disable all alerting rules for backup_job_expired detector" + type = bool + default = null +} + +variable "backup_job_expired_threshold_critical" { + description = "Critical threshold for backup_job_expired detector" + type = number + default = 0 +} + +variable "backup_job_expired_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "1h" +} + +variable "backup_job_expired_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +# backup_copy_jobs_failed detector + +variable "backup_copy_jobs_failed_notifications" { + description = "Notification recipients list per severity overridden for backup_copy_jobs_failed detector" + type = map(list(string)) + default = {} +} + +variable "backup_copy_jobs_failed_aggregation_function" { + description = "Aggregation function and group by for backup_copy_jobs_failed detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "backup_copy_jobs_failed_transformation_function" { + description = "Transformation function for backup_copy_jobs_failed detector (i.e. 
\".mean(over='5m')\")" + type = string + default = ".max(over='1d').fill(0)" +} + +variable "backup_copy_jobs_failed_max_delay" { + description = "Enforce max delay for backup_copy_jobs_failed detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "backup_copy_jobs_failed_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "backup_copy_jobs_failed_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "backup_copy_jobs_failed_disabled" { + description = "Disable all alerting rules for backup_copy_jobs_failed detector" + type = bool + default = null +} + +variable "backup_copy_jobs_failed_threshold_critical" { + description = "Critical threshold for backup_copy_jobs_failed detector" + type = number + default = 0 +} + +variable "backup_copy_jobs_failed_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "1h" +} + +variable "backup_copy_jobs_failed_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +# backup_successful detector + +variable "backup_successful_notifications" { + description = "Notification recipients list per severity overridden for backup_successful detector" + type = map(list(string)) + default = {} +} + +variable "backup_successful_aggregation_function" { + description = "Aggregation function and group by for backup_successful detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "backup_successful_transformation_function" { + description = "Transformation function for backup_successful detector (i.e. 
\".mean(over='5m')\")" + type = string + default = ".min(over='23h')" +} + +variable "backup_successful_max_delay" { + description = "Enforce max delay for backup_successful detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "backup_successful_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "backup_successful_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "backup_successful_disabled" { + description = "Disable all alerting rules for backup_successful detector" + type = bool + default = null +} + +variable "backup_successful_threshold_critical" { + description = "Critical threshold for backup_successful detector" + type = number + default = 0 +} + +variable "backup_successful_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "1d" +} + +variable "backup_successful_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 0.9 +} +# backup_rp_partial detector + +variable "backup_rp_partial_notifications" { + description = "Notification recipients list per severity overridden for backup_rp_partial detector" + type = map(list(string)) + default = {} +} + +variable "backup_rp_partial_aggregation_function" { + description = "Aggregation function and group by for backup_rp_partial detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "backup_rp_partial_transformation_function" { + description = "Transformation function for backup_rp_partial detector (i.e. \".mean(over='5m')\")" + type = string + default = ".max(over='1d').fill(0)" +} + +variable "backup_rp_partial_max_delay" { + description = "Enforce max delay for backup_rp_partial detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "backup_rp_partial_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "backup_rp_partial_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "backup_rp_partial_disabled" { + description = "Disable all alerting rules for backup_rp_partial detector" + type = bool + default = null +} + +variable "backup_rp_partial_threshold_minor" { + description = "Minor threshold for backup_rp_partial detector" + type = number + default = 0 +} + +variable "backup_rp_partial_lasting_duration_minor" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "1h" +} + +variable "backup_rp_partial_at_least_percentage_minor" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +# backup_rp_expired detector + +variable "backup_rp_expired_notifications" { + description = "Notification recipients list per severity overridden for backup_rp_expired detector" + type = map(list(string)) + default = {} +} + +variable "backup_rp_expired_aggregation_function" { + description = "Aggregation function and group by for backup_rp_expired detector (i.e. 
\".mean(by=['host'])\")" + type = string + default = "" +} + +variable "backup_rp_expired_transformation_function" { + description = "Transformation function for backup_rp_expired detector (i.e. \".mean(over='5m')\")" + type = string + default = ".max(over='1d').fill(0)" +} + +variable "backup_rp_expired_max_delay" { + description = "Enforce max delay for backup_rp_expired detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "backup_rp_expired_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "backup_rp_expired_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "backup_rp_expired_disabled" { + description = "Disable all alerting rules for backup_rp_expired detector" + type = bool + default = null +} + +variable "backup_rp_expired_threshold_major" { + description = "Major threshold for backup_rp_expired detector" + type = number + default = 0 +} + +variable "backup_rp_expired_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "1h" +} + +variable "backup_rp_expired_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +}