Skip to content

Commit

Permalink
refactor(aws_backup): improvements according to comments
Browse files Browse the repository at this point in the history
  • Loading branch information
Nicolas Senaud committed Feb 16, 2023
1 parent 4797d2c commit 0429274
Show file tree
Hide file tree
Showing 11 changed files with 97 additions and 164 deletions.
2 changes: 1 addition & 1 deletion docs/severity.md
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@
|AWS Backup failed|X|-|-|-|-|
|AWS Backup job expired|X|-|-|-|-|
|AWS Backup copy jobs failed|X|-|-|-|-|
|AWS Backup check|X|-|-|-|-|
|AWS Backup check jobs completed successfully|X|-|-|-|-|
|AWS Backup recovery point partial|-|-|X|-|-|
|AWS Backup recovery point expired|-|X|-|-|-|

Expand Down
2 changes: 1 addition & 1 deletion modules/integration_aws-backup/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ This module creates the following SignalFx detectors which could contain one or
|AWS Backup failed|X|-|-|-|-|
|AWS Backup job expired|X|-|-|-|-|
|AWS Backup copy jobs failed|X|-|-|-|-|
|AWS Backup check|X|-|-|-|-|
|AWS Backup check jobs completed successfully|X|-|-|-|-|
|AWS Backup recovery point partial|-|-|X|-|-|
|AWS Backup recovery point expired|-|X|-|-|-|

Expand Down
14 changes: 4 additions & 10 deletions modules/integration_aws-backup/conf/00-aws-backup-failed.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,15 @@ module: AWS Backup
name: failed
id: backup_failed

transformation: ".min(over='1h')"
transformation: ".max(over='1d').fill(0)"
aggregation: true
filtering: "filter('namespace', 'AWS/Backup')"
value_unit: "count"
filtering: "filter('namespace', 'AWS/Backup') and filter('stat', 'sum')"

signals:
failed:
metric: NumberOfBackupJobsFailed
extrapolation: zero
rollup: sum
signal:
formula: failed
metric: NumberOfBackupJobsFailed
rules:
critical:
threshold: 0
comparator: ">"
lasting_duration: 2h
lasting_at_least: 0.9
lasting_duration: '1h'
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,16 @@ module: AWS Backup
name: job expired
id: backup_job_expired

transformation: ".min(over='1h')"
transformation: ".max(over='1d').fill(0)"
aggregation: true
filtering: "filter('namespace', 'AWS/Backup')"
value_unit: "count"
filtering: "filter('namespace', 'AWS/Backup') and filter('stat', 'sum')"

signals:
failed:
signal:
metric: NumberOfBackupJobsExpired
extrapolation: zero
rollup: sum
signal:
formula: failed
rules:
critical:
threshold: 0
comparator: ">"
lasting_duration: 2h
lasting_at_least: 0.9
lasting_duration: '1h'
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,15 @@ module: AWS Backup
name: copy jobs failed
id: backup_copy_jobs_failed

transformation: ".min(over='1h')"
transformation: ".max(over='1d').fill(0)"
aggregation: true
filtering: "filter('namespace', 'AWS/Backup')"
value_unit: "count"
filtering: "filter('namespace', 'AWS/Backup') and filter('stat', 'sum')"

signals:
failed:
metric: NumberOfCopyJobsFailed
extrapolation: zero
rollup: sum
signal:
formula: failed
metric: NumberOfCopyJobsFailed
rules:
critical:
threshold: 0
comparator: ">"
lasting_duration: 2h
lasting_at_least: 0.9
lasting_duration: '1h'
13 changes: 5 additions & 8 deletions modules/integration_aws-backup/conf/03-aws-backup-check.yaml
Original file line number Diff line number Diff line change
@@ -1,26 +1,23 @@
module: AWS Backup
name: Check
id: backup
name: Check jobs completed successfully
id: backup_successful

transformation: ".min(over='23h')"
aggregation: true
filtering: "filter('namespace', 'AWS/Backup')"
value_unit: "count"
filtering: "filter('namespace', 'AWS/Backup') and filter('stat', 'sum')"

signals:
created:
metric: NumberOfBackupJobsCreated
extrapolation: zero
rollup: sum
completed:
metric: NumberOfBackupJobsCompleted
extrapolation: zero
rollup: sum
signal:
formula: created - completed
formula: (created-completed)
rules:
critical:
threshold: 0
comparator: ">"
lasting_duration: 1d
lasting_at_least: 0.9
lasting_at_least: 0.9
14 changes: 4 additions & 10 deletions modules/integration_aws-backup/conf/04-aws-backup-rp-partial.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,15 @@ module: AWS Backup
name: recovery point partial
id: backup_rp_partial

transformation: ".min(over='1h')"
transformation: ".max(over='1d').fill(0)"
aggregation: true
filtering: "filter('namespace', 'AWS/Backup')"
value_unit: "count"
filtering: "filter('namespace', 'AWS/Backup') and filter('stat', 'sum')"

signals:
failed:
metric: NumberOfRecoveryPointsPartial
extrapolation: zero
rollup: sum
signal:
formula: failed
metric: NumberOfRecoveryPointsPartial
rules:
minor:
threshold: 0
comparator: ">"
lasting_duration: 2h
lasting_at_least: 0.9
lasting_duration: '1h'
14 changes: 4 additions & 10 deletions modules/integration_aws-backup/conf/05-aws-backup-rp-expired.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,15 @@ module: AWS Backup
name: recovery point expired
id: backup_rp_expired

transformation: ".min(over='1h')"
transformation: ".max(over='1d').fill(0)"
aggregation: true
filtering: "filter('namespace', 'AWS/Backup')"
value_unit: "count"
filtering: "filter('namespace', 'AWS/Backup') and filter('stat', 'sum')"

signals:
failed:
metric: NumberOfRecoveryPointsExpired
extrapolation: zero
rollup: sum
signal:
formula: failed
metric: NumberOfRecoveryPointsExpired
rules:
major:
threshold: 0
comparator: ">"
lasting_duration: 2h
lasting_at_least: 0.9
lasting_duration: '1h'
91 changes: 28 additions & 63 deletions modules/integration_aws-backup/detectors-gen.tf
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,14 @@ resource "signalfx_detector" "backup_failed" {
teams = try(coalescelist(var.teams, var.authorized_writer_teams), null)
tags = compact(concat(local.common_tags, local.tags, var.extra_tags))

viz_options {
label = "signal"
value_suffix = "count"
}

program_text = <<-EOF
base_filtering = filter('namespace', 'AWS/Backup')
failed = data('NumberOfBackupJobsFailed', filter=base_filtering and ${module.filtering.signalflow}, rollup='sum', extrapolation='zero')${var.backup_failed_aggregation_function}${var.backup_failed_transformation_function}
signal = failed.publish('signal')
base_filtering = filter('namespace', 'AWS/Backup') and filter('stat', 'sum')
signal = data('NumberOfBackupJobsFailed', filter=base_filtering and ${module.filtering.signalflow})${var.backup_failed_aggregation_function}${var.backup_failed_transformation_function}.publish('signal')
detect(when(signal > ${var.backup_failed_threshold_critical}, lasting=%{if var.backup_failed_lasting_duration_critical == null}None%{else}'${var.backup_failed_lasting_duration_critical}'%{endif}, at_least=${var.backup_failed_at_least_percentage_critical})).publish('CRIT')
EOF

rule {
description = "is too high > ${var.backup_failed_threshold_critical}count"
description = "is too high > ${var.backup_failed_threshold_critical}"
severity = "Critical"
detect_label = "CRIT"
disabled = coalesce(var.backup_failed_disabled, var.detectors_disabled)
Expand All @@ -39,20 +33,14 @@ resource "signalfx_detector" "backup_job_expired" {
teams = try(coalescelist(var.teams, var.authorized_writer_teams), null)
tags = compact(concat(local.common_tags, local.tags, var.extra_tags))

viz_options {
label = "signal"
value_suffix = "count"
}

program_text = <<-EOF
base_filtering = filter('namespace', 'AWS/Backup')
failed = data('NumberOfBackupJobsExpired', filter=base_filtering and ${module.filtering.signalflow}, rollup='sum', extrapolation='zero')${var.backup_job_expired_aggregation_function}${var.backup_job_expired_transformation_function}
signal = failed.publish('signal')
base_filtering = filter('namespace', 'AWS/Backup') and filter('stat', 'sum')
signal = data('NumberOfBackupJobsExpired', filter=base_filtering and ${module.filtering.signalflow}, extrapolation='zero')${var.backup_job_expired_aggregation_function}${var.backup_job_expired_transformation_function}.publish('signal')
detect(when(signal > ${var.backup_job_expired_threshold_critical}, lasting=%{if var.backup_job_expired_lasting_duration_critical == null}None%{else}'${var.backup_job_expired_lasting_duration_critical}'%{endif}, at_least=${var.backup_job_expired_at_least_percentage_critical})).publish('CRIT')
EOF

rule {
description = "is too high > ${var.backup_job_expired_threshold_critical}count"
description = "is too high > ${var.backup_job_expired_threshold_critical}"
severity = "Critical"
detect_label = "CRIT"
disabled = coalesce(var.backup_job_expired_disabled, var.detectors_disabled)
Expand All @@ -73,20 +61,14 @@ resource "signalfx_detector" "backup_copy_jobs_failed" {
teams = try(coalescelist(var.teams, var.authorized_writer_teams), null)
tags = compact(concat(local.common_tags, local.tags, var.extra_tags))

viz_options {
label = "signal"
value_suffix = "count"
}

program_text = <<-EOF
base_filtering = filter('namespace', 'AWS/Backup')
failed = data('NumberOfCopyJobsFailed', filter=base_filtering and ${module.filtering.signalflow}, rollup='sum', extrapolation='zero')${var.backup_copy_jobs_failed_aggregation_function}${var.backup_copy_jobs_failed_transformation_function}
signal = failed.publish('signal')
base_filtering = filter('namespace', 'AWS/Backup') and filter('stat', 'sum')
signal = data('NumberOfCopyJobsFailed', filter=base_filtering and ${module.filtering.signalflow})${var.backup_copy_jobs_failed_aggregation_function}${var.backup_copy_jobs_failed_transformation_function}.publish('signal')
detect(when(signal > ${var.backup_copy_jobs_failed_threshold_critical}, lasting=%{if var.backup_copy_jobs_failed_lasting_duration_critical == null}None%{else}'${var.backup_copy_jobs_failed_lasting_duration_critical}'%{endif}, at_least=${var.backup_copy_jobs_failed_at_least_percentage_critical})).publish('CRIT')
EOF

rule {
description = "is too high > ${var.backup_copy_jobs_failed_threshold_critical}count"
description = "is too high > ${var.backup_copy_jobs_failed_threshold_critical}"
severity = "Critical"
detect_label = "CRIT"
disabled = coalesce(var.backup_copy_jobs_failed_disabled, var.detectors_disabled)
Expand All @@ -100,39 +82,34 @@ EOF
max_delay = var.backup_copy_jobs_failed_max_delay
}

resource "signalfx_detector" "backup" {
name = format("%s %s", local.detector_name_prefix, "AWS Backup check")
resource "signalfx_detector" "backup_successful" {
name = format("%s %s", local.detector_name_prefix, "AWS Backup check jobs completed successfully")

authorized_writer_teams = var.authorized_writer_teams
teams = try(coalescelist(var.teams, var.authorized_writer_teams), null)
tags = compact(concat(local.common_tags, local.tags, var.extra_tags))

viz_options {
label = "signal"
value_suffix = "count"
}

program_text = <<-EOF
base_filtering = filter('namespace', 'AWS/Backup')
created = data('NumberOfBackupJobsCreated', filter=base_filtering and ${module.filtering.signalflow}, rollup='sum', extrapolation='zero')${var.backup_aggregation_function}${var.backup_transformation_function}
completed = data('NumberOfBackupJobsCompleted', filter=base_filtering and ${module.filtering.signalflow}, rollup='sum', extrapolation='zero')${var.backup_aggregation_function}${var.backup_transformation_function}
signal = created - completed.publish('signal')
detect(when(signal > ${var.backup_threshold_critical}, lasting=%{if var.backup_lasting_duration_critical == null}None%{else}'${var.backup_lasting_duration_critical}'%{endif}, at_least=${var.backup_at_least_percentage_critical})).publish('CRIT')
base_filtering = filter('namespace', 'AWS/Backup') and filter('stat', 'sum')
created = data('NumberOfBackupJobsCreated', filter=base_filtering and ${module.filtering.signalflow}, extrapolation='zero')${var.backup_successful_aggregation_function}${var.backup_successful_transformation_function}
completed = data('NumberOfBackupJobsCompleted', filter=base_filtering and ${module.filtering.signalflow}, extrapolation='zero')${var.backup_successful_aggregation_function}${var.backup_successful_transformation_function}
signal = (created-completed).publish('signal')
detect(when(signal > ${var.backup_successful_threshold_critical}, lasting=%{if var.backup_successful_lasting_duration_critical == null}None%{else}'${var.backup_successful_lasting_duration_critical}'%{endif}, at_least=${var.backup_successful_at_least_percentage_critical})).publish('CRIT')
EOF

rule {
description = "is too high > ${var.backup_threshold_critical}count"
description = "is too high > ${var.backup_successful_threshold_critical}"
severity = "Critical"
detect_label = "CRIT"
disabled = coalesce(var.backup_disabled, var.detectors_disabled)
notifications = try(coalescelist(lookup(var.backup_notifications, "critical", []), var.notifications.critical), null)
runbook_url = try(coalesce(var.backup_runbook_url, var.runbook_url), "")
tip = var.backup_tip
disabled = coalesce(var.backup_successful_disabled, var.detectors_disabled)
notifications = try(coalescelist(lookup(var.backup_successful_notifications, "critical", []), var.notifications.critical), null)
runbook_url = try(coalesce(var.backup_successful_runbook_url, var.runbook_url), "")
tip = var.backup_successful_tip
parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
parameterized_body = var.message_body == "" ? local.rule_body : var.message_body
}

max_delay = var.backup_max_delay
max_delay = var.backup_successful_max_delay
}

resource "signalfx_detector" "backup_rp_partial" {
Expand All @@ -142,20 +119,14 @@ resource "signalfx_detector" "backup_rp_partial" {
teams = try(coalescelist(var.teams, var.authorized_writer_teams), null)
tags = compact(concat(local.common_tags, local.tags, var.extra_tags))

viz_options {
label = "signal"
value_suffix = "count"
}

program_text = <<-EOF
base_filtering = filter('namespace', 'AWS/Backup')
failed = data('NumberOfRecoveryPointsPartial', filter=base_filtering and ${module.filtering.signalflow}, rollup='sum', extrapolation='zero')${var.backup_rp_partial_aggregation_function}${var.backup_rp_partial_transformation_function}
signal = failed.publish('signal')
base_filtering = filter('namespace', 'AWS/Backup') and filter('stat', 'sum')
signal = data('NumberOfRecoveryPointsPartial', filter=base_filtering and ${module.filtering.signalflow})${var.backup_rp_partial_aggregation_function}${var.backup_rp_partial_transformation_function}.publish('signal')
detect(when(signal > ${var.backup_rp_partial_threshold_minor}, lasting=%{if var.backup_rp_partial_lasting_duration_minor == null}None%{else}'${var.backup_rp_partial_lasting_duration_minor}'%{endif}, at_least=${var.backup_rp_partial_at_least_percentage_minor})).publish('MINOR')
EOF

rule {
description = "is too high > ${var.backup_rp_partial_threshold_minor}count"
description = "is too high > ${var.backup_rp_partial_threshold_minor}"
severity = "Minor"
detect_label = "MINOR"
disabled = coalesce(var.backup_rp_partial_disabled, var.detectors_disabled)
Expand All @@ -176,20 +147,14 @@ resource "signalfx_detector" "backup_rp_expired" {
teams = try(coalescelist(var.teams, var.authorized_writer_teams), null)
tags = compact(concat(local.common_tags, local.tags, var.extra_tags))

viz_options {
label = "signal"
value_suffix = "count"
}

program_text = <<-EOF
base_filtering = filter('namespace', 'AWS/Backup')
failed = data('NumberOfRecoveryPointsExpired', filter=base_filtering and ${module.filtering.signalflow}, rollup='sum', extrapolation='zero')${var.backup_rp_expired_aggregation_function}${var.backup_rp_expired_transformation_function}
signal = failed.publish('signal')
base_filtering = filter('namespace', 'AWS/Backup') and filter('stat', 'sum')
signal = data('NumberOfRecoveryPointsExpired', filter=base_filtering and ${module.filtering.signalflow})${var.backup_rp_expired_aggregation_function}${var.backup_rp_expired_transformation_function}.publish('signal')
detect(when(signal > ${var.backup_rp_expired_threshold_major}, lasting=%{if var.backup_rp_expired_lasting_duration_major == null}None%{else}'${var.backup_rp_expired_lasting_duration_major}'%{endif}, at_least=${var.backup_rp_expired_at_least_percentage_major})).publish('MAJOR')
EOF

rule {
description = "is too high > ${var.backup_rp_expired_threshold_major}count"
description = "is too high > ${var.backup_rp_expired_threshold_major}"
severity = "Major"
detect_label = "MAJOR"
disabled = coalesce(var.backup_rp_expired_disabled, var.detectors_disabled)
Expand Down
10 changes: 5 additions & 5 deletions modules/integration_aws-backup/outputs.tf
Original file line number Diff line number Diff line change
@@ -1,8 +1,3 @@
# Module output "backup": exposes the signalfx_detector.backup resource object to callers of this module.
output "backup" {
description = "Detector resource for backup"
value = signalfx_detector.backup
}

output "backup_copy_jobs_failed" {
description = "Detector resource for backup_copy_jobs_failed"
value = signalfx_detector.backup_copy_jobs_failed
Expand All @@ -28,3 +23,8 @@ output "backup_rp_partial" {
value = signalfx_detector.backup_rp_partial
}

# Module output "backup_successful": exposes the signalfx_detector.backup_successful resource object to callers of this module.
output "backup_successful" {
description = "Detector resource for backup_successful"
value = signalfx_detector.backup_successful
}

Loading

0 comments on commit 0429274

Please sign in to comment.