From d041177c393aaef49d2b614c9898b51d63447999 Mon Sep 17 00:00:00 2001 From: Dominic Robinson <65237317+drobinson-moj@users.noreply.github.com> Date: Fri, 8 Nov 2024 16:16:36 +0000 Subject: [PATCH] TM-500: enable endpoint monitoring alerts (#8617) * enable remaining endpoint alarms * improvements to endpoint widgets * fix --- .../locals_cloudwatch_metric_alarms.tf | 54 ++++++++++++++----- .../baseline_presets/cloudwatch_dashboards.tf | 16 +++--- 2 files changed, 50 insertions(+), 20 deletions(-) diff --git a/terraform/environments/hmpps-oem/locals_cloudwatch_metric_alarms.tf b/terraform/environments/hmpps-oem/locals_cloudwatch_metric_alarms.tf index c7faceb83a7..4629216a4af 100644 --- a/terraform/environments/hmpps-oem/locals_cloudwatch_metric_alarms.tf +++ b/terraform/environments/hmpps-oem/locals_cloudwatch_metric_alarms.tf @@ -59,6 +59,13 @@ locals { type_instance = "hmppgw1.justice.gov.uk" } }) + + "endpoint-down-hmpps-domain-rdgateway-test" = merge(local.endpoint_down_alarm, { + dimensions = { + type = "exitcode" + type_instance = "rdgateway1.test.hmpps-domain.service.justice.gov.uk" + } + }) } "preproduction" = { @@ -71,6 +78,13 @@ locals { } }) + "endpoint-down-nomis-reporting-pp" = merge(local.endpoint_down_alarm, { + dimensions = { + type = "exitcode" + type_instance = "reporting.pp-nomis.az.justice.gov.uk" + } + }) + "endpoint-down-nomis-lsast" = merge(local.endpoint_down_alarm, { dimensions = { type = "exitcode" @@ -146,8 +160,13 @@ locals { type = "exitcode" type_instance = "cafmwebx.pp.planetfm.service.justice.gov.uk" } - alarm_actions = [] # TODO: remove when IP allow listing fixed - ok_actions = [] + }) + + "endpoint-down-cafmtx-pp" = merge(local.endpoint_down_alarm, { + dimensions = { + type = "exitcode" + type_instance = "cafmtx.pp.planetfm.service.justice.gov.uk" + } }) "endpoint-down-hpa-preprod" = merge(local.endpoint_down_alarm, { @@ -155,8 +174,13 @@ locals { type = "exitcode" type_instance = "hpa-preprod.service.hmpps.dsd.io" } - alarm_actions = [] # TODO: remove when IP allow listing fixed - ok_actions = [] + }) + + "endpoint-down-hmpps-domain-rdgateway-preproduction" = merge(local.endpoint_down_alarm, { + dimensions = { + type = "exitcode" + type_instance = "rdgateway1.preproduction.hmpps-domain.service.justice.gov.uk" + } }) } @@ -183,8 +207,6 @@ locals { type = "exitcode" type_instance = "oasys.az.justice.gov.uk" } - alarm_actions = [] # TODO: remove when IP allow listing fixed - ok_actions = [] }) "endpoint-down-oasys-training" = merge(local.endpoint_down_alarm, { @@ -257,13 +279,18 @@ locals { } }) + "endpoint-down-cafmtx" = merge(local.endpoint_down_alarm, { + dimensions = { + type = "exitcode" + type_instance = "cafmtx.planetfm.service.justice.gov.uk" + } + }) + "endpoint-down-cafmwebx2" = merge(local.endpoint_down_alarm, { dimensions = { type = "exitcode" type_instance = "cafmwebx2.az.justice.gov.uk" } - alarm_actions = [] # TODO: remove when IP allow listing fixed - ok_actions = [] }) "endpoint-down-cafmtrainweb" = merge(local.endpoint_down_alarm, { @@ -278,8 +305,6 @@ locals { type = "exitcode" type_instance = "www.offloc.service.justice.gov.uk" } - alarm_actions = [] # TODO: remove when IP allow listing fixed - ok_actions = [] }) "endpoint-down-hpa" = merge(local.endpoint_down_alarm, { @@ -287,8 +312,6 @@ locals { type = "exitcode" type_instance = "hpa.service.hmpps.dsd.io" } - alarm_actions = [] # TODO: remove when IP allow listing fixed - ok_actions = [] }) "endpoint-down-hmpps-az-gw1-rdgateway" = merge(local.endpoint_down_alarm, { @@ -297,6 +320,13 @@ locals { type_instance = "hmpps-az-gw1.justice.gov.uk" } }) + + "endpoint-down-hmpps-domain-rdgateway" = merge(local.endpoint_down_alarm, { + dimensions = { + type = "exitcode" + type_instance = "rdgateway1.hmpps-domain.service.justice.gov.uk" + } + }) } } diff --git a/terraform/modules/baseline_presets/cloudwatch_dashboards.tf b/terraform/modules/baseline_presets/cloudwatch_dashboards.tf index 632a203f57b..6a19ff3ae4d 100644 --- a/terraform/modules/baseline_presets/cloudwatch_dashboards.tf +++ b/terraform/modules/baseline_presets/cloudwatch_dashboards.tf @@ -442,7 +442,7 @@ locals { } ec2_instance_cwagent_collectd_endpoint_monitoring = { - endpoint-down = { + endpoint-status = { type = "metric" alarm_threshold = 1 expression = "SORT(SEARCH('{CWAgent,InstanceId,type,type_instance} MetricName=\"collectd_endpoint_status_value\"','Maximum'),MAX,DESC)" @@ -450,7 +450,7 @@ locals { view = "timeSeries" stacked = true region = "eu-west-2" - title = "EC2 Endpoint Monitoring endpoint-down" + title = "endpoint-status" stat = "Maximum" yAxis = { left = { @@ -460,16 +460,16 @@ locals { } } } - endpoint-cert-expires-soon = { + endpoint-cert-days-to-expiry = { type = "metric" alarm_threshold = local.cloudwatch_metric_alarms.ec2_instance_cwagent_collectd_endpoint_monitoring.endpoint-cert-expires-soon.threshold expression = "SORT(SEARCH('{CWAgent,InstanceId,type,type_instance} MetricName=\"collectd_endpoint_cert_expiry_value\"','Minimum'),MIN,ASC)" properties = { - view = "timeSeries" + view = "bar" stacked = false region = "eu-west-2" - title = "EC2 Endpoint Monitoring endpoint-cert-expires-soon" - stat = "Maximum" + title = "endpoint-cert-days-to-expiry" + stat = "Minimum" yAxis = { left = { showUnits = false, @@ -921,8 +921,8 @@ locals { width = 8 height = 8 widgets = [ - local.cloudwatch_dashboard_widgets.ec2_instance_cwagent_collectd_endpoint_monitoring.endpoint-down, - local.cloudwatch_dashboard_widgets.ec2_instance_cwagent_collectd_endpoint_monitoring.endpoint-cert-expires-soon, + local.cloudwatch_dashboard_widgets.ec2_instance_cwagent_collectd_endpoint_monitoring.endpoint-status, + local.cloudwatch_dashboard_widgets.ec2_instance_cwagent_collectd_endpoint_monitoring.endpoint-cert-days-to-expiry, ] }