Skip to content

Commit

Permalink
TM-500: enable endpoint monitoring alerts (#8617)
Browse files (browse the repository at this point in the history)
* enable remaining endpoint alarms

* improvements to endpoint widgets

* fix
  • Loading branch information
drobinson-moj authored Nov 8, 2024
1 parent 107c7f3 commit d041177
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 20 deletions.
54 changes: 42 additions & 12 deletions terraform/environments/hmpps-oem/locals_cloudwatch_metric_alarms.tf
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,13 @@ locals {
type_instance = "hmppgw1.justice.gov.uk"
}
})

"endpoint-down-hmpps-domain-rdgateway-test" = merge(local.endpoint_down_alarm, {
dimensions = {
type = "exitcode"
type_instance = "rdgateway1.test.hmpps-domain.service.justice.gov.uk"
}
})
}

"preproduction" = {
Expand All @@ -71,6 +78,13 @@ locals {
}
})

"endpoint-down-nomis-reporting-pp" = merge(local.endpoint_down_alarm, {
dimensions = {
type = "exitcode"
type_instance = "reporting.pp-nomis.az.justice.gov.uk"
}
})

"endpoint-down-nomis-lsast" = merge(local.endpoint_down_alarm, {
dimensions = {
type = "exitcode"
Expand Down Expand Up @@ -146,17 +160,27 @@ locals {
type = "exitcode"
type_instance = "cafmwebx.pp.planetfm.service.justice.gov.uk"
}
alarm_actions = [] # TODO: remove when IP allow listing fixed
ok_actions = []
})

"endpoint-down-cafmtx-pp" = merge(local.endpoint_down_alarm, {
dimensions = {
type = "exitcode"
type_instance = "cafmtx.pp.planetfm.service.justice.gov.uk"
}
})

"endpoint-down-hpa-preprod" = merge(local.endpoint_down_alarm, {
dimensions = {
type = "exitcode"
type_instance = "hpa-preprod.service.hmpps.dsd.io"
}
alarm_actions = [] # TODO: remove when IP allow listing fixed
ok_actions = []
})

"endpoint-down-hmpps-domain-rdgateway-preproduction" = merge(local.endpoint_down_alarm, {
dimensions = {
type = "exitcode"
type_instance = "rdgateway1.preproduction.hmpps-domain.service.justice.gov.uk"
}
})
}

Expand All @@ -183,8 +207,6 @@ locals {
type = "exitcode"
type_instance = "oasys.az.justice.gov.uk"
}
alarm_actions = [] # TODO: remove when IP allow listing fixed
ok_actions = []
})

"endpoint-down-oasys-training" = merge(local.endpoint_down_alarm, {
Expand Down Expand Up @@ -257,13 +279,18 @@ locals {
}
})

"endpoint-down-cafmtx" = merge(local.endpoint_down_alarm, {
dimensions = {
type = "exitcode"
type_instance = "cafmtx.planetfm.service.justice.gov.uk"
}
})

"endpoint-down-cafmwebx2" = merge(local.endpoint_down_alarm, {
dimensions = {
type = "exitcode"
type_instance = "cafmwebx2.az.justice.gov.uk"
}
alarm_actions = [] # TODO: remove when IP allow listing fixed
ok_actions = []
})

"endpoint-down-cafmtrainweb" = merge(local.endpoint_down_alarm, {
Expand All @@ -278,17 +305,13 @@ locals {
type = "exitcode"
type_instance = "www.offloc.service.justice.gov.uk"
}
alarm_actions = [] # TODO: remove when IP allow listing fixed
ok_actions = []
})

"endpoint-down-hpa" = merge(local.endpoint_down_alarm, {
dimensions = {
type = "exitcode"
type_instance = "hpa.service.hmpps.dsd.io"
}
alarm_actions = [] # TODO: remove when IP allow listing fixed
ok_actions = []
})

"endpoint-down-hmpps-az-gw1-rdgateway" = merge(local.endpoint_down_alarm, {
Expand All @@ -297,6 +320,13 @@ locals {
type_instance = "hmpps-az-gw1.justice.gov.uk"
}
})

"endpoint-down-hmpps-domain-rdgateway" = merge(local.endpoint_down_alarm, {
dimensions = {
type = "exitcode"
type_instance = "rdgateway1.hmpps-domain.service.justice.gov.uk"
}
})
}
}

Expand Down
16 changes: 8 additions & 8 deletions terraform/modules/baseline_presets/cloudwatch_dashboards.tf
Original file line number Diff line number Diff line change
Expand Up @@ -442,15 +442,15 @@ locals {
}

ec2_instance_cwagent_collectd_endpoint_monitoring = {
endpoint-down = {
endpoint-status = {
type = "metric"
alarm_threshold = 1
expression = "SORT(SEARCH('{CWAgent,InstanceId,type,type_instance} MetricName=\"collectd_endpoint_status_value\"','Maximum'),MAX,DESC)"
properties = {
view = "timeSeries"
stacked = true
region = "eu-west-2"
title = "EC2 Endpoint Monitoring endpoint-down"
title = "endpoint-status"
stat = "Maximum"
yAxis = {
left = {
Expand All @@ -460,16 +460,16 @@ locals {
}
}
}
endpoint-cert-expires-soon = {
endpoint-cert-days-to-expiry = {
type = "metric"
alarm_threshold = local.cloudwatch_metric_alarms.ec2_instance_cwagent_collectd_endpoint_monitoring.endpoint-cert-expires-soon.threshold
expression = "SORT(SEARCH('{CWAgent,InstanceId,type,type_instance} MetricName=\"collectd_endpoint_cert_expiry_value\"','Minimum'),MIN,ASC)"
properties = {
view = "timeSeries"
view = "bar"
stacked = false
region = "eu-west-2"
title = "EC2 Endpoint Monitoring endpoint-cert-expires-soon"
stat = "Maximum"
title = "endpoint-cert-days-to-expiry"
stat = "Minimum"
yAxis = {
left = {
showUnits = false,
Expand Down Expand Up @@ -921,8 +921,8 @@ locals {
width = 8
height = 8
widgets = [
local.cloudwatch_dashboard_widgets.ec2_instance_cwagent_collectd_endpoint_monitoring.endpoint-down,
local.cloudwatch_dashboard_widgets.ec2_instance_cwagent_collectd_endpoint_monitoring.endpoint-cert-expires-soon,
local.cloudwatch_dashboard_widgets.ec2_instance_cwagent_collectd_endpoint_monitoring.endpoint-status,
local.cloudwatch_dashboard_widgets.ec2_instance_cwagent_collectd_endpoint_monitoring.endpoint-cert-days-to-expiry,
]
}

Expand Down

0 comments on commit d041177

Please sign in to comment.