-
Notifications
You must be signed in to change notification settings - Fork 245
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Alloy-Mixin: allow k8s cluster and alloy cluster disable, add logs da…
…shboard (#808)
- Loading branch information
Showing
17 changed files
with
965 additions
and
684 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +1,20 @@ | ||
local clusterAlerts = (import './alerts/clustering.libsonnet'); | ||
local controllerAlerts = (import './alerts/controller.libsonnet'); | ||
local openTelemetryAlerts = (import './alerts/opentelemetry.libsonnet'); | ||
|
||
{ | ||
// Clustering alert group, built with the enableK8sCluster setting; it is only | ||
// added to the output when $._config.enableAlloyCluster is true (see below). | ||
local alloyClusterAlerts = [clusterAlerts.newAlloyClusterAlertsGroup($._config.enableK8sCluster)], | ||
|
||
// Controller and OpenTelemetry alert groups, which are always emitted. | ||
local otherAlerts = [ | ||
controllerAlerts.newControllerAlertsGroup($._config.enableK8sCluster), | ||
openTelemetryAlerts.newOpenTelemetryAlertsGroup($._config.enableK8sCluster) | ||
], | ||
|
||
prometheusAlerts+: { | ||
groups+: [ | ||
(import './alerts/clustering.libsonnet'), | ||
(import './alerts/controller.libsonnet'), | ||
(import './alerts/opentelemetry.libsonnet'), | ||
], | ||
// Append the clustering group first, followed by the other groups, when | ||
// clustering is enabled; otherwise append only the other groups. | ||
groups+: | ||
if $._config.enableAlloyCluster then | ||
alloyClusterAlerts + otherAlerts | ||
else | ||
otherAlerts | ||
}, | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,67 +1,91 @@ | ||
local alert = import './utils/alert.jsonnet'; | ||
|
||
alert.newGroup( | ||
'alloy_clustering', | ||
[ | ||
// Cluster not converging. | ||
alert.newRule( | ||
'ClusterNotConverging', | ||
'stddev by (cluster, namespace) (sum without (state) (cluster_node_peers)) != 0', | ||
'Cluster is not converging: nodes report different number of peers in the cluster.', | ||
'10m', | ||
), | ||
{ | ||
newAlloyClusterAlertsGroup(enableK8sCluster=true):: | ||
alert.newGroup( | ||
'alloy_clustering', | ||
[ | ||
// Cluster not converging. | ||
alert.newRule( | ||
'ClusterNotConverging', | ||
if enableK8sCluster then | ||
'stddev by (cluster, namespace, job) (sum without (state) (cluster_node_peers)) != 0' | ||
else | ||
'stddev by (job) (sum without (state) (cluster_node_peers)) != 0', | ||
'Cluster is not converging: nodes report different number of peers in the cluster.', | ||
'10m', | ||
), | ||
|
||
alert.newRule( | ||
'ClusterNodeCountMismatch', | ||
// Assert that the number of known peers (regardless of state) reported by each | ||
// Alloy instance matches the number of running Alloy instances in the | ||
// same cluster and namespace as reported by a count of Prometheus | ||
// metrics. | ||
||| | ||
sum without (state) (cluster_node_peers) != | ||
on (cluster, namespace, job) group_left | ||
count by (cluster, namespace, job) (cluster_node_info) | ||
|||, | ||
'Nodes report different number of peers vs. the count of observed Alloy metrics. Some Alloy metrics may be missing or the cluster is in a split brain state.', | ||
'15m', | ||
), | ||
alert.newRule( | ||
'ClusterNodeCountMismatch', | ||
// Assert that the number of known peers (regardless of state) reported by each | ||
// Alloy instance matches the number of running Alloy instances in the | ||
// same group (cluster, namespace and job, or job alone when enableK8sCluster | ||
// is false) as reported by a count of Prometheus metrics. | ||
if enableK8sCluster then ||| | ||
sum without (state) (cluster_node_peers) != | ||
on (cluster, namespace, job) group_left | ||
count by (cluster, namespace, job) (cluster_node_info) | ||
||| else ||| | ||
sum without (state) (cluster_node_peers) != | ||
on (job) group_left | ||
count by (job) (cluster_node_info) | ||
||| | ||
, | ||
'Nodes report different number of peers vs. the count of observed Alloy metrics. Some Alloy metrics may be missing or the cluster is in a split brain state.', | ||
'15m', | ||
), | ||
// Nodes health score is not zero. | ||
alert.newRule( | ||
'ClusterNodeUnhealthy', | ||
||| | ||
cluster_node_gossip_health_score > 0 | ||
|||, | ||
'Cluster node is reporting a gossip protocol health score > 0.', | ||
'10m', | ||
), | ||
// Nodes health score is not zero. | ||
alert.newRule( | ||
'ClusterNodeUnhealthy', | ||
||| | ||
cluster_node_gossip_health_score > 0 | ||
|||, | ||
'Cluster node is reporting a gossip protocol health score > 0.', | ||
'10m', | ||
), | ||
// Node tried to join the cluster with an already-present node name. | ||
alert.newRule( | ||
'ClusterNodeNameConflict', | ||
'sum by (cluster, namespace) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) > 0', | ||
'A node tried to join the cluster with a name conflicting with an existing peer.', | ||
'10m', | ||
), | ||
// Node tried to join the cluster with an already-present node name. | ||
alert.newRule( | ||
'ClusterNodeNameConflict', | ||
if enableK8sCluster then | ||
'sum by (cluster, namespace, job) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) > 0' | ||
else | ||
'sum by (job) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) > 0' | ||
, | ||
'A node tried to join the cluster with a name conflicting with an existing peer.', | ||
'10m', | ||
), | ||
// Node stuck in Terminating state. | ||
alert.newRule( | ||
'ClusterNodeStuckTerminating', | ||
'sum by (cluster, namespace, instance) (cluster_node_peers{state="terminating"}) > 0', | ||
'Cluster node stuck in Terminating state.', | ||
'10m', | ||
), | ||
// Node stuck in Terminating state. | ||
alert.newRule( | ||
'ClusterNodeStuckTerminating', | ||
if enableK8sCluster then | ||
'sum by (cluster, namespace, job, instance) (cluster_node_peers{state="terminating"}) > 0' | ||
else | ||
'sum by (job, instance) (cluster_node_peers{state="terminating"}) > 0' | ||
, | ||
'Cluster node stuck in Terminating state.', | ||
'10m', | ||
), | ||
// Nodes are not using the same configuration file. | ||
alert.newRule( | ||
'ClusterConfigurationDrift', | ||
||| | ||
count without (sha256) ( | ||
max by (cluster, namespace, sha256, job) (alloy_config_hash and on(cluster, namespace) cluster_node_info) | ||
) > 1 | ||
|||, | ||
'Cluster nodes are not using the same configuration file.', | ||
'5m', | ||
), | ||
] | ||
) | ||
// Nodes are not using the same configuration file. | ||
alert.newRule( | ||
'ClusterConfigurationDrift', | ||
if enableK8sCluster then ||| | ||
count without (sha256) ( | ||
max by (cluster, namespace, sha256, job) (alloy_config_hash and on(cluster, namespace, job) cluster_node_info) | ||
) > 1 | ||
||| else ||| | ||
count without (sha256) ( | ||
max by (sha256, job) (alloy_config_hash and on(job) cluster_node_info) | ||
) > 1 | ||
||| | ||
, | ||
'Cluster nodes are not using the same configuration file.', | ||
'5m', | ||
), | ||
] | ||
) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,22 +1,33 @@ | ||
local alert = import './utils/alert.jsonnet'; | ||
|
||
alert.newGroup( | ||
'alloy_controller', | ||
[ | ||
// Component evaluations are taking too long, which can lead to e.g. stale targets. | ||
alert.newRule( | ||
'SlowComponentEvaluations', | ||
'sum by (cluster, namespace, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0', | ||
'Component evaluations are taking too long.', | ||
'15m', | ||
), | ||
{ | ||
// Builds the 'alloy_controller' alert group via alert.newGroup. | ||
// enableK8sCluster (default true): when true the rule expressions aggregate by | ||
// cluster, namespace and job (plus component labels); when false they aggregate | ||
// by job (plus component labels) only. | ||
// Declared as a hidden field (::), matching newAlloyClusterAlertsGroup in the | ||
// clustering alerts library, so the function is never part of manifested output. | ||
newControllerAlertsGroup(enableK8sCluster=true):: | ||
alert.newGroup( | ||
'alloy_controller', | ||
[ | ||
// Component evaluations are taking too long, which can lead to e.g. stale targets. | ||
alert.newRule( | ||
'SlowComponentEvaluations', | ||
if enableK8sCluster then | ||
'sum by (cluster, namespace, job, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0' | ||
else | ||
'sum by (job, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0' | ||
, | ||
'Component evaluations are taking too long.', | ||
'15m', | ||
), | ||
|
||
// Unhealthy components detected. | ||
alert.newRule( | ||
'UnhealthyComponents', | ||
'sum by (cluster, namespace) (alloy_component_controller_running_components{health_type!="healthy"}) > 0', | ||
'Unhealthy components detected.', | ||
'15m', | ||
), | ||
] | ||
) | ||
// Unhealthy components detected. | ||
alert.newRule( | ||
'UnhealthyComponents', | ||
if enableK8sCluster then | ||
'sum by (cluster, namespace, job) (alloy_component_controller_running_components{health_type!="healthy"}) > 0' | ||
else | ||
'sum by (job) (alloy_component_controller_running_components{health_type!="healthy"}) > 0' | ||
, | ||
'Unhealthy components detected.', | ||
'15m', | ||
), | ||
] | ||
) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,25 +1,36 @@ | ||
local alert = import './utils/alert.jsonnet'; | ||
|
||
alert.newGroup( | ||
'alloy_otelcol', | ||
[ | ||
// An otelcol.receiver component could not push some spans to the pipeline. | ||
// This could be due to reaching a limit such as the ones | ||
// imposed by otelcol.processor.memory_limiter. | ||
alert.newRule( | ||
'OtelcolReceiverRefusedSpans', | ||
'sum by (cluster, namespace) (rate(receiver_refused_spans_ratio_total{}[1m])) > 0', | ||
'The receiver could not push some spans to the pipeline.', | ||
'5m', | ||
), | ||
{ | ||
// Builds the 'alloy_otelcol' alert group via alert.newGroup. | ||
// enableK8sCluster (default true): when true the rule expressions aggregate by | ||
// cluster, namespace and job; when false they aggregate by job only. | ||
// Declared as a hidden field (::), matching newAlloyClusterAlertsGroup in the | ||
// clustering alerts library, so the function is never part of manifested output. | ||
newOpenTelemetryAlertsGroup(enableK8sCluster=true):: | ||
alert.newGroup( | ||
'alloy_otelcol', | ||
[ | ||
// An otelcol.receiver component could not push some spans to the pipeline. | ||
// This could be due to reaching a limit such as the ones | ||
// imposed by otelcol.processor.memory_limiter. | ||
alert.newRule( | ||
'OtelcolReceiverRefusedSpans', | ||
if enableK8sCluster then | ||
'sum by (cluster, namespace, job) (rate(receiver_refused_spans_ratio_total{}[1m])) > 0' | ||
else | ||
'sum by (job) (rate(receiver_refused_spans_ratio_total{}[1m])) > 0' | ||
, | ||
'The receiver could not push some spans to the pipeline.', | ||
'5m', | ||
), | ||
|
||
// The exporter failed to send spans to their destination. | ||
// There could be an issue with the payload or with the destination endpoint. | ||
alert.newRule( | ||
'OtelcolExporterFailedSpans', | ||
'sum by (cluster, namespace) (rate(exporter_send_failed_spans_ratio_total{}[1m])) > 0', | ||
'The exporter failed to send spans to their destination.', | ||
'5m', | ||
), | ||
] | ||
) | ||
// The exporter failed to send spans to their destination. | ||
// There could be an issue with the payload or with the destination endpoint. | ||
alert.newRule( | ||
'OtelcolExporterFailedSpans', | ||
if enableK8sCluster then | ||
'sum by (cluster, namespace, job) (rate(exporter_send_failed_spans_ratio_total{}[1m])) > 0' | ||
else | ||
'sum by (job) (rate(exporter_send_failed_spans_ratio_total{}[1m])) > 0' | ||
, | ||
'The exporter failed to send spans to their destination.', | ||
'5m', | ||
), | ||
] | ||
) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
{ | ||
// Mixin configuration. Hidden (::) and merged (+) so that users can override | ||
// individual fields. | ||
_config+:: { | ||
// When true, selectors are prefixed with k8sClusterSelector (see groupSelector); | ||
// the alert groups and the logs dashboard labels also receive this flag to | ||
// decide whether cluster/namespace labels are used. | ||
enableK8sCluster: true, | ||
// When true, the Alloy clustering alerts and clustering dashboards are included. | ||
enableAlloyCluster: true, | ||
// When true, the 'alloy-logs.json' dashboard is generated. | ||
enableLokiLogs: true, | ||
// Base label matcher; also passed to the logs dashboard as its filterSelector. | ||
filterSelector: 'job=~"$job"', | ||
// k8sClusterSelector + ', ' + filterSelector when enableK8sCluster is true, | ||
// otherwise just filterSelector. | ||
groupSelector: if self.enableK8sCluster then self.k8sClusterSelector + ', ' + self.filterSelector else self.filterSelector, | ||
// groupSelector narrowed further by the $instance variable. | ||
instanceSelector: self.groupSelector + ', instance=~"$instance"', | ||
// Matchers on the $cluster and $namespace variables. | ||
k8sClusterSelector: 'cluster=~"$cluster", namespace=~"$namespace"', | ||
// NOTE(review): presumably the tag applied to generated dashboards; its use is | ||
// not visible here, confirm against the dashboard libraries. | ||
dashboardTag: 'alloy-mixin' | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +1,21 @@ | ||
{ | ||
grafanaDashboards+: | ||
(import './dashboards/controller.libsonnet') + | ||
(import './dashboards/resources.libsonnet') + | ||
(import './dashboards/prometheus.libsonnet') + | ||
(import './dashboards/cluster-node.libsonnet') + | ||
(import './dashboards/opentelemetry.libsonnet') + | ||
(import './dashboards/cluster-overview.libsonnet'), | ||
// Dashboards included only when $._config.enableAlloyCluster is true. | ||
// config.libsonnet is merged in as well, presumably so that the dashboard | ||
// libraries can resolve $._config; verify against those libraries. | ||
local alloyClusterDashboards = | ||
(import './dashboards/cluster-node.libsonnet') + | ||
(import './dashboards/cluster-overview.libsonnet') + | ||
(import './config.libsonnet'); | ||
|
||
// Dashboards that are always included. | ||
local otherDashboards = | ||
(import './dashboards/resources.libsonnet') + | ||
(import './dashboards/controller.libsonnet') + | ||
(import './dashboards/prometheus.libsonnet') + | ||
(import './dashboards/opentelemetry.libsonnet') + | ||
(import './config.libsonnet'); | ||
|
||
// The logs dashboard library is merged with an object that adds the remaining | ||
// dashboards to grafanaDashboards. | ||
(import './dashboards/alloy-logs.libsonnet') + | ||
{ | ||
grafanaDashboards+: | ||
if $._config.enableAlloyCluster then | ||
alloyClusterDashboards + | ||
otherDashboards | ||
else | ||
otherDashboards | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
local g = import 'github.com/grafana/grafonnet/gen/grafonnet-v10.0.0/main.libsonnet'; | ||
local logsDashboard = import 'github.com/grafana/jsonnet-libs/logs-lib/logs/main.libsonnet'; | ||
|
||
{ | ||
|
||
// Labels handed to the logs dashboard; 'cluster' and 'namespace' are included | ||
// only when $._config.enableK8sCluster is true. | ||
local labels = if $._config.enableK8sCluster then ['cluster', 'namespace', 'job', 'instance', 'level'] else ['job', 'instance', 'level'], | ||
|
||
// Adds 'alloy-logs.json' when $._config.enableLokiLogs is true; otherwise adds nothing. | ||
grafanaDashboards+: | ||
if $._config.enableLokiLogs then { | ||
// Dashboard built with logs-lib's logsDashboard.new; the meaning of each | ||
// argument is defined by that library (not visible here). | ||
local alloyLogs = | ||
logsDashboard.new( | ||
'Alloy logs overview', | ||
datasourceName='loki_datasource', | ||
datasourceRegex='', | ||
filterSelector=$._config.filterSelector, | ||
labels=labels, | ||
formatParser=null, | ||
showLogsVolume=true | ||
) | ||
// Overrides layered on top of the generated panels and dashboards. | ||
{ | ||
panels+: | ||
{ | ||
logs+: | ||
// Alloy logs already have timestamp | ||
g.panel.logs.options.withShowTime(false), | ||
}, | ||
dashboards+: | ||
{ | ||
// Reuse the links of the 'alloy-resources.json' dashboard and set the | ||
// refresh interval to '10s'. | ||
logs+: g.dashboard.withLinksMixin($.grafanaDashboards['alloy-resources.json'].links) | ||
+ g.dashboard.withRefresh('10s'), | ||
}, | ||
}, | ||
'alloy-logs.json': alloyLogs.dashboards.logs, | ||
} else {}, | ||
} |
Oops, something went wrong.