From 5fb2b0a1f5702446513e8fac980a92d030dec35f Mon Sep 17 00:00:00 2001 From: Aniket Rao Date: Wed, 6 Oct 2021 19:32:31 +0530 Subject: [PATCH 1/2] update metrics for kube state: wip --- modules/prometheus/istio.hcl | 44 +++++------ modules/prometheus/k8s_via_kube_state.hcl | 92 +++++++++-------------- 2 files changed, 56 insertions(+), 80 deletions(-) diff --git a/modules/prometheus/istio.hcl b/modules/prometheus/istio.hcl index c8341db..e77bf30 100644 --- a/modules/prometheus/istio.hcl +++ b/modules/prometheus/istio.hcl @@ -41,8 +41,8 @@ ingester prometheus_istio_workload module { unit = "count" source prometheus "throughput" { - //query = "sum by (cluster, destination_canonical_service, destination_workload, destination_workload_namespace, destination_version, pod_name) (increase(istio_requests_total{reporter='source', source_canonical_service!='unknown', destination_service_name!='PassthroughCluster'}[1m]))" - query = "label_set(sum by (cluster, destination_canonical_service, destination_workload, destination_workload_namespace, destination_version, pod_name) (increase(istio_requests_total{reporter='source', source_canonical_service!='unknown', destination_service_name!='PassthroughCluster'}[1m])), 'cluster', '$input{cluster}')" + //query = "sum by (cluster, destination_canonical_service, destination_workload, destination_workload_namespace, destination_version, pod_name) (increase(istio_requests_total{reporter='destination', source_canonical_service!='unknown', destination_service_name!='PassthroughCluster'}[1m]))" + query = "label_set(sum by (cluster, destination_canonical_service, destination_workload, destination_workload_namespace, destination_version, pod_name) (increase(istio_requests_total{reporter='destination', source_canonical_service!='unknown', destination_service_name!='PassthroughCluster'}[1m])), 'cluster', '$input{cluster}')" join_on = { "$output{cluster}" = "$input{cluster}" @@ -54,7 +54,7 @@ ingester prometheus_istio_workload module { unit = "count" source prometheus "status_2xx" { - query = "label_set(sum by (cluster, destination_canonical_service, destination_workload, destination_workload_namespace, destination_version, pod_name) (increase(istio_requests_total{reporter='source', source_canonical_service!='unknown', response_code=~'^2.*', destination_service_name!='PassthroughCluster'}[1m])), 'cluster', '$input{cluster}')" + query = "label_set(sum by (cluster, destination_canonical_service, destination_workload, destination_workload_namespace, destination_version, pod_name) (increase(istio_requests_total{reporter='destination', source_canonical_service!='unknown', response_code=~'^2.*', destination_service_name!='PassthroughCluster'}[1m])), 'cluster', '$input{cluster}')" join_on = { "$output{cluster}" = "$input{cluster}" @@ -66,7 +66,7 @@ ingester prometheus_istio_workload module { unit = "count" source prometheus "status_3xx" { - query = "label_set(sum by (cluster, destination_canonical_service, destination_workload, destination_workload_namespace, destination_version, pod_name) (increase(istio_requests_total{reporter='source', source_canonical_service!='unknown', destination_service_name!='PassthroughCluster', response_code=~'^3.*'}[1m])), 'cluster', '$input{cluster}')" + query = "label_set(sum by (cluster, destination_canonical_service, destination_workload, destination_workload_namespace, destination_version, pod_name) (increase(istio_requests_total{reporter='destination', source_canonical_service!='unknown', destination_service_name!='PassthroughCluster', response_code=~'^3.*'}[1m])), 'cluster', '$input{cluster}')" join_on = { "$output{cluster}" = "$input{cluster}" @@ -78,7 +78,7 @@ ingester prometheus_istio_workload module { unit = "count" source prometheus "status_4xx" { - query = "label_set(sum by (cluster, destination_canonical_service, destination_workload, destination_workload_namespace, destination_version, pod_name) (increase(istio_requests_total{reporter='source', source_canonical_service!='unknown', destination_service_name!='PassthroughCluster', response_code=~'^4.*'}[1m])), 'cluster', '$input{cluster}')" + query = "label_set(sum by (cluster, destination_canonical_service, destination_workload, destination_workload_namespace, destination_version, pod_name) (increase(istio_requests_total{reporter='destination', source_canonical_service!='unknown', destination_service_name!='PassthroughCluster', response_code=~'^4.*'}[1m])), 'cluster', '$input{cluster}')" join_on = { "$output{cluster}" = "$input{cluster}" @@ -90,7 +90,7 @@ ingester prometheus_istio_workload module { unit = "count" source prometheus "status_5xx" { - query = "label_set(sum by (cluster, destination_canonical_service, destination_workload, destination_workload_namespace, destination_version, pod_name) (increase(istio_requests_total{reporter='source', source_canonical_service!='unknown', destination_service_name!='PassthroughCluster', response_code=~'^5.*'}[1m])), 'cluster', '$input{cluster}')" + query = "label_set(sum by (cluster, destination_canonical_service, destination_workload, destination_workload_namespace, destination_version, pod_name) (increase(istio_requests_total{reporter='destination', source_canonical_service!='unknown', destination_service_name!='PassthroughCluster', response_code=~'^5.*'}[1m])), 'cluster', '$input{cluster}')" join_on = { "$output{cluster}" = "$input{cluster}" @@ -140,7 +140,7 @@ ingester prometheus_istio_workload module { unit = "bytes" source prometheus "bytes_in" { - query = "label_set(sum by (cluster, destination_canonical_service, destination_workload, destination_workload_namespace, destination_version, pod_name) (increase(istio_request_bytes_sum{reporter='source', source_canonical_service!='unknown', destination_service_name!='PassthroughCluster'}[1m])), 'cluster', '$input{cluster}')" + query = "label_set(sum by (cluster, destination_canonical_service, destination_workload, destination_workload_namespace, destination_version, pod_name) (increase(istio_request_bytes_sum{reporter='destination', source_canonical_service!='unknown', destination_service_name!='PassthroughCluster'}[1m])), 'cluster', '$input{cluster}')" join_on = { "$output{cluster}" = "$input{cluster}" @@ -152,7 +152,7 @@ ingester prometheus_istio_workload module { unit = "bytes" source prometheus "bytes_out" { - query = "label_set(sum by (cluster, destination_canonical_service, destination_workload, destination_workload_namespace, destination_version, pod_name) (increase(istio_response_bytes_sum{reporter='source', source_canonical_service!='unknown', destination_service_name!='PassthroughCluster'}[1m])), 'cluster', '$input{cluster}')" + query = "label_set(sum by (cluster, destination_canonical_service, destination_workload, destination_workload_namespace, destination_version, pod_name) (increase(istio_response_bytes_sum{reporter='destination', source_canonical_service!='unknown', destination_service_name!='PassthroughCluster'}[1m])), 'cluster', '$input{cluster}')" join_on = { "$output{cluster}" = "$input{cluster}" @@ -199,7 +199,7 @@ ingester prometheus_istio_cluster module { unit = "count" source prometheus "throughput" { - query = "label_set(sum by (cluster) (increase(istio_requests_total{reporter='source'}[1m])), 'cluster', '$input{cluster}')" + query = "label_set(sum by (cluster) (increase(istio_requests_total{reporter='destination'}[1m])), 'cluster', '$input{cluster}')" join_on = { "$output{cluster}" = "$input{cluster}" @@ -211,7 +211,7 @@ ingester prometheus_istio_cluster module { unit = "count" source prometheus "status_2xx" { - query = "label_set(sum by (cluster) (increase(istio_requests_total{reporter='source', response_code=~'^2.*'}[1m])), 'cluster', '$input{cluster}')" + query = "label_set(sum by (cluster) (increase(istio_requests_total{reporter='destination', response_code=~'^2.*'}[1m])), 'cluster', '$input{cluster}')" join_on = { "$output{cluster}" = "$input{cluster}" @@ -223,7 +223,7 @@ ingester prometheus_istio_cluster module { unit = "count" source prometheus "status_3xx" { - query = "label_set(sum by (cluster) (increase(istio_requests_total{reporter='source', response_code=~'^3.*'}[1m])), 'cluster', '$input{cluster}')" + query = "label_set(sum by (cluster) (increase(istio_requests_total{reporter='destination', response_code=~'^3.*'}[1m])), 'cluster', '$input{cluster}')" join_on = { "$output{cluster}" = "$input{cluster}" @@ -235,7 +235,7 @@ ingester prometheus_istio_cluster module { unit = "count" source prometheus "status_4xx" { - query = "label_set(sum by (cluster) (increase(istio_requests_total{reporter='source', response_code=~'^4.*'}[1m])), 'cluster', '$input{cluster}')" + query = "label_set(sum by (cluster) (increase(istio_requests_total{reporter='destination', response_code=~'^4.*'}[1m])), 'cluster', '$input{cluster}')" join_on = { "$output{cluster}" = "$input{cluster}" @@ -247,7 +247,7 @@ ingester prometheus_istio_cluster module { unit = "count" source prometheus "status_5xx" { - query = "label_set(sum by (cluster) (increase(istio_requests_total{reporter='source', response_code=~'^5.*'}[1m])), 'cluster', '$input{cluster}')" + query = "label_set(sum by (cluster) (increase(istio_requests_total{reporter='destination', response_code=~'^5.*'}[1m])), 'cluster', '$input{cluster}')" join_on = { "$output{cluster}" = "$input{cluster}" @@ -259,7 +259,7 @@ ingester prometheus_istio_cluster module { unit = "bytes" source prometheus "bytes_in" { - query = "label_set(sum by (cluster) (increase(istio_request_bytes_sum{reporter='source'}[1m])), 'cluster', '$input{cluster}')" + query = "label_set(sum by (cluster) (increase(istio_request_bytes_sum{reporter='destination'}[1m])), 'cluster', '$input{cluster}')" join_on = { "$output{cluster}" = "$input{cluster}" @@ -271,7 +271,7 @@ ingester prometheus_istio_cluster module { unit = "bytes" source prometheus "bytes_out" { - query = "label_set(sum by (cluster) (increase(istio_response_bytes_sum{reporter='source'}[1m])), 'cluster', '$input{cluster}')" + query = "label_set(sum by (cluster) (increase(istio_response_bytes_sum{reporter='destination'}[1m])), 'cluster', '$input{cluster}')" join_on = { "$output{cluster}" = "$input{cluster}" @@ -338,7 +338,7 @@ ingester prometheus_istio_k8s_pod module { unit = "count" source prometheus "throughput" { - query = "label_set(sum by (cluster, pod_name, destination_canonical_service, destination_workload_namespace) (increase(istio_requests_total{reporter='source', destination_service_name!='PassthroughCluster', source_canonical_service!='unknown'}[1m])), 'cluster', '$input{cluster}')" + query = "label_set(sum by (cluster, pod_name, destination_canonical_service, destination_workload_namespace) (increase(istio_requests_total{reporter='destination', destination_service_name!='PassthroughCluster', source_canonical_service!='unknown'}[1m])), 'cluster', '$input{cluster}')" join_on = { "$output{cluster}" = "$input{cluster}" @@ -350,7 +350,7 @@ ingester prometheus_istio_k8s_pod module { unit = "count" source prometheus "status_2xx" { - query = "label_set(sum by (cluster, pod_name, destination_canonical_service, destination_workload_namespace) (increase(istio_requests_total{reporter='source', destination_service_name!='PassthroughCluster', source_canonical_service!='unknown', response_code=~'^2.*'}[1m])), 'cluster', '$input{cluster}')" + query = "label_set(sum by (cluster, pod_name, destination_canonical_service, destination_workload_namespace) (increase(istio_requests_total{reporter='destination', destination_service_name!='PassthroughCluster', source_canonical_service!='unknown', response_code=~'^2.*'}[1m])), 'cluster', '$input{cluster}')" join_on = { "$output{cluster}" = "$input{cluster}" @@ -362,7 +362,7 @@ ingester prometheus_istio_k8s_pod module { unit = "count" source prometheus "status_3xx" { - query = "label_set(sum by (cluster, pod_name, destination_canonical_service, destination_workload_namespace) (increase(istio_requests_total{reporter='source', destination_service_name!='PassthroughCluster', source_canonical_service!='unknown', response_code=~'^3.*'}[1m])), 'cluster', '$input{cluster}')" + query = "label_set(sum by (cluster, pod_name, destination_canonical_service, destination_workload_namespace) (increase(istio_requests_total{reporter='destination', destination_service_name!='PassthroughCluster', source_canonical_service!='unknown', response_code=~'^3.*'}[1m])), 'cluster', '$input{cluster}')" join_on = { "$output{cluster}" = "$input{cluster}" @@ -374,7 +374,7 @@ ingester prometheus_istio_k8s_pod module { unit = "count" source prometheus "status_4xx" { - query = "label_set(sum by (cluster, pod_name, destination_canonical_service, destination_workload_namespace) (increase(istio_requests_total{reporter='source', destination_service_name!='PassthroughCluster', source_canonical_service!='unknown', response_code=~'^4.*'}[1m])), 'cluster', '$input{cluster}')" + query = "label_set(sum by (cluster, pod_name, destination_canonical_service, destination_workload_namespace) (increase(istio_requests_total{reporter='destination', destination_service_name!='PassthroughCluster', source_canonical_service!='unknown', response_code=~'^4.*'}[1m])), 'cluster', '$input{cluster}')" join_on = { "$output{cluster}" = "$input{cluster}" @@ -386,7 +386,7 @@ ingester prometheus_istio_k8s_pod module { unit = "count" source prometheus "status_5xx" { - query = "label_set(sum by (cluster, pod_name, destination_canonical_service, destination_workload_namespace) (increase(istio_requests_total{reporter='source', destination_service_name!='PassthroughCluster', source_canonical_service!='unknown', response_code=~'^5.*'}[1m])), 'cluster', '$input{cluster}')" + query = "label_set(sum by (cluster, pod_name, destination_canonical_service, destination_workload_namespace) (increase(istio_requests_total{reporter='destination', destination_service_name!='PassthroughCluster', source_canonical_service!='unknown', response_code=~'^5.*'}[1m])), 'cluster', '$input{cluster}')" join_on = { "$output{cluster}" = "$input{cluster}" @@ -398,7 +398,7 @@ ingester prometheus_istio_k8s_pod module { unit = "bytes" source prometheus "bytes_in" { - query = "label_set(sum by (cluster, pod_name, destination_canonical_service, destination_workload_namespace) (increase(istio_request_bytes_sum{reporter='source', destination_service_name!='PassthroughCluster', source_canonical_service!='unknown'}[1m])), 'cluster', '$input{cluster}')" + query = "label_set(sum by (cluster, pod_name, destination_canonical_service, destination_workload_namespace) (increase(istio_request_bytes_sum{reporter='destination', destination_service_name!='PassthroughCluster', source_canonical_service!='unknown'}[1m])), 'cluster', '$input{cluster}')" join_on = { "$output{cluster}" = "$input{cluster}" @@ -410,7 +410,7 @@ ingester prometheus_istio_k8s_pod module { unit = "bytes" source prometheus "bytes_out" { - query = "label_set(sum by (cluster, pod_name, destination_canonical_service, destination_workload_namespace) (increase(istio_response_bytes_sum{reporter='source', destination_service_name!='PassthroughCluster', source_canonical_service!='unknown'}[1m])), 'cluster', '$input{cluster}')" + query = "label_set(sum by (cluster, pod_name, destination_canonical_service, destination_workload_namespace) (increase(istio_response_bytes_sum{reporter='destination', destination_service_name!='PassthroughCluster', source_canonical_service!='unknown'}[1m])), 'cluster', '$input{cluster}')" join_on = { "$output{cluster}" = "$input{cluster}" diff --git a/modules/prometheus/k8s_via_kube_state.hcl b/modules/prometheus/k8s_via_kube_state.hcl index 5b9a81c..d4f5616 100644 --- a/modules/prometheus/k8s_via_kube_state.hcl +++ b/modules/prometheus/k8s_via_kube_state.hcl @@ -121,7 +121,7 @@ ingester prometheus_kube_cluster module { } } - +# Done ingester prometheus_kube_cluster_with_namespace module { frequency = 600 lookback = 600 @@ -156,55 +156,44 @@ ingester prometheus_kube_cluster_with_namespace module { "default" : "$input{using}" } - gauge "total_memory_requested" { - unit = "bytes" - - source prometheus "total_memory_requested" { - query = "label_set(sum by (cluster, namespace)(kube_pod_container_resource_requests{resource='memory', unit='byte'}), 'cluster', '$input{cluster}')" - join_on = { - "$output{cluster}" = "$input{cluster}" - } - } - } - - gauge "total_cpu_requested" { + gauge "unscheduled_pods" { unit = "count" - source prometheus "total_cpu_requested" { - query = "label_set(sum by (cluster, namespace) (kube_pod_container_resource_requests{resource='cpu', unit='core'}), 'cluster', '$input{cluster}')" + source prometheus "unscheduled_pods" { + query = "sum by (cluster, namespace) (increase(kube_pod_status_unschedulable{}[1m]))" join_on = { "$output{cluster}" = "$input{cluster}" } } } - gauge "total_unscheduled_pods" { + gauge "desired_pods" { unit = "count" - source prometheus "total_unscheduled_pods" { - query = "label_set(sum by (cluster, namespace) (kube_pod_status_unschedulable{}), 'cluster', '$input{cluster}')" + source prometheus "desired_pods" { + query = "sum by (cluster, namespace) (increase(kube_pod_status_phase{}[1m]))" join_on = { "$output{cluster}" = "$input{cluster}" } } } - gauge "total_failed_and_unknown_pods" { + gauge "failed_and_unknown_pods" { unit = "count" - source prometheus "total_failed_and_unknown_pods" { - query = "label_set(sum by (cluster, namespace) (kube_pod_status_phase{phase=~'Failed|Unknown'}), 'cluster', '$input{cluster}')" + source prometheus "failed_and_unknown_pods" { + query = "sum by (cluster, namespace) (kube_pod_status_phase{phase=~'Failed|Unknown'})" join_on = { "$output{cluster}" = "$input{cluster}" } } } - gauge "total_container_restarts" { + gauge "container_restarts" { unit = "count" - source prometheus "total_container_restarts" { - query = "label_set(sum by (cluster, namespace) (kube_pod_container_status_restarts_total{}), 'cluster', '$input{cluster}')" + source prometheus "container_restarts" { + query = "sum by (cluster, namespace) (kube_pod_container_status_restarts_total{})" join_on = { "$output{cluster}" = "$input{cluster}" } @@ -252,66 +241,53 @@ ingester prometheus_kube_node module { "default" : "$input{using}" } - gauge "total_cpu_for_scheduling" { + gauge "disk_pressure" { unit = "count" - source prometheus "total_cpu_for_scheduling" { - query = "label_set(sum by (cluster, node) (kube_node_status_allocatable{resource='cpu', unit='core'}) - sum by (cluster, node) (kube_pod_container_resource_limits{resource='cpu', unit='core'}), 'cluster', '$input{cluster}')" - join_on = { - "$output{cluster}" = "$input{cluster}" - } - } - } - - gauge "total_memory_for_scheduling" { - unit = "bytes" - - source prometheus "total_memory_for_scheduling" { - query = "label_set(sum by (cluster, node) (kube_node_status_allocatable{resource='memory', unit='byte'}) - sum by (cluster, node) (kube_pod_container_resource_limits{resource='memory', unit='byte'}), 'cluster', '$input{cluster}')" - join_on = { - "$output{cluster}" = "$input{cluster}" - } - } - } - - gauge "out_of_pods" { - unit = "count" - - source prometheus "out_of_pods" { - query = "label_set(sum by (cluster, node) (kube_node_spec_unschedulable{}), 'cluster', '$input{cluster}')" + source prometheus "disk_pressure" { + query = < Date: Thu, 7 Oct 2021 10:53:17 +0530 Subject: [PATCH 2/2] update metrics for kube state ingesters --- modules/prometheus/k8s_via_kube_state.hcl | 157 ++++++++++++++++------ 1 file changed, 114 insertions(+), 43 deletions(-) diff --git a/modules/prometheus/k8s_via_kube_state.hcl b/modules/prometheus/k8s_via_kube_state.hcl index d4f5616..d54f201 100644 --- a/modules/prometheus/k8s_via_kube_state.hcl +++ b/modules/prometheus/k8s_via_kube_state.hcl @@ -121,7 +121,6 @@ ingester prometheus_kube_cluster module { } } -# Done ingester prometheus_kube_cluster_with_namespace module { frequency = 600 lookback = 600 @@ -160,7 +159,7 @@ ingester prometheus_kube_cluster_with_namespace module { unit = "count" source prometheus "unscheduled_pods" { - query = "sum by (cluster, namespace) (increase(kube_pod_status_unschedulable{}[1m]))" + query = "label_set(sum by (cluster, namespace) (increase(kube_pod_status_unschedulable{}[1m])))" join_on = { "$output{cluster}" = "$input{cluster}" } @@ -171,7 +170,7 @@ ingester prometheus_kube_cluster_with_namespace module { unit = "count" source prometheus "desired_pods" { - query = "sum by (cluster, namespace) (increase(kube_pod_status_phase{}[1m]))" + query = "label_set(sum by (cluster, namespace) (increase(kube_pod_status_phase{}[1m])))" join_on = { "$output{cluster}" = "$input{cluster}" } @@ -182,7 +181,7 @@ ingester prometheus_kube_cluster_with_namespace module { unit = "count" source prometheus "failed_and_unknown_pods" { - query = "sum by (cluster, namespace) (kube_pod_status_phase{phase=~'Failed|Unknown'})" + query = "label_set(sum by (cluster, namespace) (kube_pod_status_phase{phase=~'Failed|Unknown'}))" join_on = { "$output{cluster}" = "$input{cluster}" } @@ -193,7 +192,7 @@ ingester prometheus_kube_cluster_with_namespace module { unit = "count" source prometheus "container_restarts" { - query = "sum by (cluster, namespace) (kube_pod_container_status_restarts_total{})" + query = "label_set(sum by (cluster, namespace) (kube_pod_container_status_restarts_total{}))" join_on = { "$output{cluster}" = "$input{cluster}" } @@ -201,7 +200,6 @@ ingester prometheus_kube_cluster_with_namespace module { } } - ingester prometheus_kube_node module { frequency = 600 lookback = 600 @@ -246,8 +244,10 @@ ingester prometheus_kube_node module { source prometheus "disk_pressure" { query = <