Prometheus Rules and Alerts
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  generation: 1
  labels:
    app: prometheus-operator
    chart: prometheus-operator-0.1.26
    heritage: Tiller
    release: prom
  name: pod-node-rules
  namespace: default
spec:
  groups:
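  # Recording rules that pre-aggregate per-container cAdvisor metrics to the
  # namespace and pod level, and join usage and resource requests with
  # kube_pod_labels (via label_replace) so they can be grouped by pod label.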
  - name: pod-cpu.rules
    rules:
    - expr: |
        sum(rate(container_cpu_usage_seconds_total{job!="", image!="", container_name!=""}[5m])) by (namespace)
      record: namespace:pod_cpu_usage_seconds_total:sum_rate
    - expr: |
        sum by (namespace, pod_name, container_name) (
          rate(container_cpu_usage_seconds_total{job!="", image!="", container_name!=""}[5m])
        )
      record: namespace_pod_name_container_name:pod_cpu_usage_seconds_total:sum_rate
    - expr: |
        sum(container_memory_usage_bytes{job!="", image!="", container_name!=""}) by (namespace)
      record: namespace:pod_memory_usage_bytes:sum
    - expr: |
        sum by (namespace, label_name) (
          sum(rate(container_cpu_usage_seconds_total{job!="", image!="", container_name!=""}[5m])) by (namespace, pod_name)
          * on (namespace, pod_name) group_left(label_name)
          label_replace(kube_pod_labels{job!=""}, "pod_name", "$1", "pod", "(.*)")
        )
      record: namespace_name:pod_cpu_usage_seconds_total:sum_rate
    - expr: |
        sum by (namespace, label_name) (
          sum(container_memory_usage_bytes{job!="", image!="", container_name!=""}) by (pod_name, namespace)
          * on (namespace, pod_name) group_left(label_name)
          label_replace(kube_pod_labels{job!=""}, "pod_name", "$1", "pod", "(.*)")
        )
      record: namespace_name:pod_memory_usage_bytes:sum
    - expr: |
        sum by (namespace, label_name) (
          sum(kube_pod_container_resource_requests_memory_bytes{job!=""}) by (namespace, pod)
          * on (namespace, pod) group_left(label_name)
          label_replace(kube_pod_labels{job!=""}, "pod_name", "$1", "pod", "(.*)")
        )
      record: namespace_name:pod_container_resource_requests_memory_bytes:sum
    - expr: |
        sum by (namespace, label_name) (
          sum(kube_pod_container_resource_requests_cpu_cores{job!=""} and on(pod) kube_pod_status_scheduled{condition="true"}) by (namespace, pod)
          * on (namespace, pod) group_left(label_name)
          label_replace(kube_pod_labels{job!=""}, "pod_name", "$1", "pod", "(.*)")
        )
      record: namespace_name:pod_container_resource_requests_cpu_cores:sum
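  # Node-level utilisation and saturation rules. Most expressions attach a
  # node name to node-exporter series by joining on the
  # node_namespace_pod:kube_pod_info: info-style series recorded below.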
  - name: node.rules
    rules:
    - expr: sum(min(kube_pod_info) by (node))
      record: ':kube_pod_info_node_count:'
    - expr: |
        max(label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")) by (node, namespace, pod)
      record: 'node_namespace_pod:kube_pod_info:'
    - expr: |
        count by (node) (sum by (node, cpu) (
          node_cpu{job="node-exporter"}
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        ))
      record: node:node_num_cpu:sum
    - expr: |
        1 - avg(rate(node_cpu{job="node-exporter",mode="idle"}[1m]))
      record: :node_cpu_utilisation:avg1m
    - expr: |
        1 - avg by (node) (
          rate(node_cpu{job="node-exporter",mode="idle"}[1m])
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:)
      record: node:node_cpu_utilisation:avg1m
    - expr: |
        sum(node_load1{job="node-exporter"})
        /
        sum(node:node_num_cpu:sum)
      record: ':node_cpu_saturation_load1:'
    - expr: |
        sum by (node) (
          node_load1{job="node-exporter"}
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
        /
        node:node_num_cpu:sum
      record: 'node:node_cpu_saturation_load1:'
    - expr: |
        1 -
        sum(node_memory_MemFree{job="node-exporter"} + node_memory_Cached{job="node-exporter"} + node_memory_Buffers{job="node-exporter"})
        /
        sum(node_memory_MemTotal{job="node-exporter"})
      record: ':node_memory_utilisation:'
    - expr: |
        sum(node_memory_MemFree{job="node-exporter"} + node_memory_Cached{job="node-exporter"} + node_memory_Buffers{job="node-exporter"})
      record: :node_memory_MemFreeCachedBuffers:sum
    - expr: |
        sum(node_memory_MemTotal{job="node-exporter"})
      record: :node_memory_MemTotal:sum
    - expr: |
        sum by (node) (
          (node_memory_MemFree{job="node-exporter"} + node_memory_Cached{job="node-exporter"} + node_memory_Buffers{job="node-exporter"})
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: node:node_memory_bytes_available:sum
    - expr: |
        sum by (node) (
          node_memory_MemTotal{job="node-exporter"}
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: node:node_memory_bytes_total:sum
    - expr: |
        (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum)
        /
        scalar(sum(node:node_memory_bytes_total:sum))
      record: node:node_memory_utilisation:ratio
    - expr: |
        1e3 * sum(
          (rate(node_vmstat_pgpgin{job="node-exporter"}[1m])
          + rate(node_vmstat_pgpgout{job="node-exporter"}[1m]))
        )
      record: :node_memory_swap_io_bytes:sum_rate
    - expr: |
        1 -
        sum by (node) (
          (node_memory_MemFree{job="node-exporter"} + node_memory_Cached{job="node-exporter"} + node_memory_Buffers{job="node-exporter"})
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
        /
        sum by (node) (
          node_memory_MemTotal{job="node-exporter"}
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: 'node:node_memory_utilisation:'
    - expr: |
        1 - (node:node_memory_bytes_available:sum / node:node_memory_bytes_total:sum)
      record: 'node:node_memory_utilisation_2:'
    - expr: |
        1e3 * sum by (node) (
          (rate(node_vmstat_pgpgin{job="node-exporter"}[1m])
          + rate(node_vmstat_pgpgout{job="node-exporter"}[1m]))
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: node:node_memory_swap_io_bytes:sum_rate
    - expr: |
        avg(irate(node_disk_io_time_ms{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3)
      record: :node_disk_utilisation:avg_irate
    - expr: |
        avg by (node) (
          irate(node_disk_io_time_ms{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: node:node_disk_utilisation:avg_irate
    - expr: |
        avg(irate(node_disk_io_time_weighted{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3)
      record: :node_disk_saturation:avg_irate
    - expr: |
        avg by (node) (
          irate(node_disk_io_time_weighted{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: node:node_disk_saturation:avg_irate
    - expr: |
        max by (namespace, pod, device) ((node_filesystem_size{fstype=~"ext[234]|btrfs|xfs|zfs"}
        - node_filesystem_avail{fstype=~"ext[234]|btrfs|xfs|zfs"})
        / node_filesystem_size{fstype=~"ext[234]|btrfs|xfs|zfs"})
      record: 'node:node_filesystem_usage:'
    - expr: |
        max by (namespace, pod, device) (node_filesystem_avail{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size{fstype=~"ext[234]|btrfs|xfs|zfs"})
      record: 'node:node_filesystem_avail:'
    - expr: |
        sum(irate(node_network_receive_bytes{job="node-exporter",device="eth0"}[1m])) +
        sum(irate(node_network_transmit_bytes{job="node-exporter",device="eth0"}[1m]))
      record: :node_net_utilisation:sum_irate
    - expr: |
        sum by (node) (
          (irate(node_network_receive_bytes{job="node-exporter",device="eth0"}[1m]) +
          irate(node_network_transmit_bytes{job="node-exporter",device="eth0"}[1m]))
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: node:node_net_utilisation:sum_irate
    - expr: |
        sum(irate(node_network_receive_drop{job="node-exporter",device="eth0"}[1m])) +
        sum(irate(node_network_transmit_drop{job="node-exporter",device="eth0"}[1m]))
      record: :node_net_saturation:sum_irate
    - expr: |
        sum by (node) (
          (irate(node_network_receive_drop{job="node-exporter",device="eth0"}[1m]) +
          irate(node_network_transmit_drop{job="node-exporter",device="eth0"}[1m]))
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: node:node_net_saturation:sum_irate
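  # Workload health alerts driven by kube-state-metrics: crash loops,
  # non-ready Pods, stuck Deployment/StatefulSet/DaemonSet rollouts, and
  # long-running or failed CronJobs and Jobs.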
  - name: pod-apps
    rules:
    - alert: K8SPodCrashLooping
      annotations:
        message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / second.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
      expr: |
        rate(kube_pod_container_status_restarts_total{job!=""}[15m]) > 0
      for: 1h
      labels:
        severity: critical
    - alert: K8SPodNotReady
      annotations:
        message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than an hour.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
      expr: |
        sum by (namespace, pod) (kube_pod_status_phase{job!="", phase=~"Pending|Unknown"}) > 0
      for: 1h
      labels:
        severity: critical
    - alert: K8SDeploymentGenerationMismatch
      annotations:
        message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match; this indicates that the Deployment has failed but has not been rolled back.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch
      expr: |
        kube_deployment_status_observed_generation{job!=""}
        !=
        kube_deployment_metadata_generation{job!=""}
      for: 15m
      labels:
        severity: critical
    - alert: K8SDeploymentReplicasMismatch
      annotations:
        message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than an hour.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
      expr: |
        kube_deployment_spec_replicas{job!=""}
        !=
        kube_deployment_status_replicas_available{job!=""}
      for: 1h
      labels:
        severity: critical
    - alert: K8SKubeStatefulSetReplicasMismatch
      annotations:
        message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch
      expr: |
        kube_statefulset_status_replicas_ready{job!=""}
        !=
        kube_statefulset_status_replicas{job!=""}
      for: 15m
      labels:
        severity: critical
    - alert: K8SStatefulSetGenerationMismatch
      annotations:
        message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match; this indicates that the StatefulSet has failed but has not been rolled back.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch
      expr: |
        kube_statefulset_status_observed_generation{job!=""}
        !=
        kube_statefulset_metadata_generation{job!=""}
      for: 15m
      labels:
        severity: critical
    - alert: K8SStatefulSetUpdateNotRolledOut
      annotations:
        message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout
      expr: |
        max without (revision) (
          kube_statefulset_status_current_revision{job!=""}
          unless
          kube_statefulset_status_update_revision{job!=""}
        )
        *
        (
          kube_statefulset_replicas{job!=""}
          !=
          kube_statefulset_status_replicas_updated{job!=""}
        )
      for: 15m
      labels:
        severity: critical
    - alert: K8SDaemonSetRolloutStuck
      annotations:
        message: Only {{ $value }}% of the desired Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are scheduled and ready.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
      expr: |
        kube_daemonset_status_number_ready{job!=""}
        /
        kube_daemonset_status_desired_number_scheduled{job!=""} * 100 < 100
      for: 15m
      labels:
        severity: critical
    - alert: K8SDaemonSetNotScheduled
      annotations:
        message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.'
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled
      expr: |
        kube_daemonset_status_desired_number_scheduled{job!=""}
        -
        kube_daemonset_status_current_number_scheduled{job!=""} > 0
      for: 10m
      labels:
        severity: warning
    - alert: K8SDaemonSetMisScheduled
      annotations:
        message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.'
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled
      expr: |
        kube_daemonset_status_number_misscheduled{job!=""} > 0
      for: 10m
      labels:
        severity: warning
    - alert: K8SCronJobRunning
      annotations:
        message: CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecronjobrunning
      expr: |
        time() - kube_cronjob_next_schedule_time{job!=""} > 3600
      for: 1h
      labels:
        severity: warning
    - alert: K8SJobCompletion
      annotations:
        message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than one hour to complete.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion
      expr: |
        kube_job_spec_completions{job!=""} - kube_job_status_succeeded{job!=""} > 0
      for: 1h
      labels:
        severity: warning
    - alert: K8SJobFailed
      annotations:
        message: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed
      expr: |
        kube_job_status_failed{job!=""} > 0
      for: 1h
      labels:
        severity: warning
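  # Cluster capacity alerts: fire when aggregate resource requests (or
  # namespace quotas) exceed what the cluster could absorb after losing one
  # node, or when a namespace is near its quota.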
  - name: k8s-resources
    rules:
    - alert: K8SCPUOvercommit
      annotations:
        message: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
      expr: |
        sum(namespace_name:pod_container_resource_requests_cpu_cores:sum)
        /
        sum(node:node_num_cpu:sum)
        >
        (count(node:node_num_cpu:sum)-1) / count(node:node_num_cpu:sum)
      for: 5m
      labels:
        severity: warning
    - alert: K8SMemOvercommit
      annotations:
        message: Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
      expr: |
        sum(namespace_name:pod_container_resource_requests_memory_bytes:sum)
        /
        sum(node_memory_MemTotal{job="node-exporter"})
        >
        (count(node:node_num_cpu:sum)-1)
        /
        count(node:node_num_cpu:sum)
      for: 5m
      labels:
        severity: warning
    - alert: K8SCPUOvercommit
      annotations:
        message: Cluster has overcommitted CPU resource requests for Namespaces.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
      expr: |
        sum(kube_resourcequota{job!="", type="hard", resource="requests.cpu"})
        /
        sum(node:node_num_cpu:sum)
        > 1.5
      for: 5m
      labels:
        severity: warning
    - alert: K8SMemOvercommit
      annotations:
        message: Cluster has overcommitted memory resource requests for Namespaces.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
      expr: |
        sum(kube_resourcequota{job!="", type="hard", resource="requests.memory"})
        /
        sum(node_memory_MemTotal{job="node-exporter"})
        > 1.5
      for: 5m
      labels:
        severity: warning
    - alert: K8SQuotaExceeded
      annotations:
        message: Namespace {{ $labels.namespace }} is using {{ printf "%0.0f" $value }}% of its {{ $labels.resource }} quota.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded
      expr: |
        100 * kube_resourcequota{job!="", type="used"}
        / ignoring(instance, job, type)
        (kube_resourcequota{job!="", type="hard"} > 0)
        > 90
      for: 15m
      labels:
        severity: warning
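  # PersistentVolume alerts based on kubelet volume stats: low free space now,
  # plus linear-extrapolation predictions of filling up within 7 or 30 days.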
  - name: kubernetes-storage
    rules:
    - alert: K8SPersistentVolumeUsageCritical
      annotations:
        message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ printf "%0.0f" $value }}% free.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical
      expr: |
        100 * kubelet_volume_stats_available_bytes{job!=""}
        /
        kubelet_volume_stats_capacity_bytes{job!=""}
        < 20
      for: 1m
      labels:
        severity: critical
    - alert: K8SPersistentVolumeFullInSevenDays
      annotations:
        message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within seven days. Currently {{ $value }} bytes are available.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays
      expr: |
        kubelet_volume_stats_available_bytes{job!=""} and predict_linear(kubelet_volume_stats_available_bytes{job!=""}[6h], 7 * 24 * 3600) < 0
      for: 5m
      labels:
        severity: critical
    - alert: K8SPersistentVolumeFullInThirtyDays
      annotations:
        message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within thirty days. Currently {{ $value }} bytes are available.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays
      expr: |
        kubelet_volume_stats_available_bytes{job!=""} and predict_linear(kubelet_volume_stats_available_bytes{job!=""}[6h], 30 * 24 * 3600) < 0
      for: 5m
      labels:
        severity: warning
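    # Note: count() over present series always returns values >= 1, so the two
    # presence checks below (< 0 and <= 0) can never match as written;
    # absent() may be closer to the intent. The original expressions are kept
    # unchanged.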
    - alert: K8SPersistentVolumeBeingDeleted
      annotations:
        message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} appears to have been deleted.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays
      expr: count(kubelet_volume_stats_used_bytes{job!=""}) by (persistentvolumeclaim) < 0
      for: 5m
      labels:
        severity: critical
    - alert: K8SPersistentVolumeNonePresentInNamespace
      annotations:
        message: Based on recent sampling, no claimed PersistentVolumes are reporting usage in Namespace {{ $labels.namespace }}.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays
      expr: count(kubelet_volume_stats_used_bytes{job!=""}) by (namespace) <= 0
      for: 5m
      labels:
        severity: warning
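  # Control-plane and kubelet health: node readiness, component version skew,
  # API client error rates, and per-kubelet Pod counts.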
  - name: k8s-system
    rules:
    - alert: K8SNodeNotReady
      annotations:
        message: '{{ $labels.node }} has been unready for more than an hour.'
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready
      expr: |
        kube_node_status_condition{job!="",condition="Ready",status="true"} == 0
      for: 1h
      labels:
        severity: warning
    - alert: K8SVersionMismatch
      annotations:
        message: There are {{ $value }} different versions of Kubernetes components running.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch
      expr: |
        count(count(kubernetes_build_info{job!="kube-dns"}) by (gitVersion)) > 1
      for: 1h
      labels:
        severity: warning
    - alert: K8SClientErrors
      annotations:
        message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ printf "%0.0f" $value }}% errors.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
      expr: |
        (sum(rate(rest_client_requests_total{code!~"2..|404"}[5m])) by (instance, job)
        /
        sum(rate(rest_client_requests_total[5m])) by (instance, job))
        * 100 > 1
      for: 15m
      labels:
        severity: warning
    - alert: K8SClientErrors
      annotations:
        message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ printf "%0.0f" $value }} errors / second.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
      expr: |
        sum(rate(ksm_scrape_error_total{job!=""}[5m])) by (instance, job) > 0.1
      for: 15m
      labels:
        severity: warning
    - alert: K8STooManyPods
      annotations:
        message: Kubelet {{ $labels.instance }} is running {{ $value }} Pods, close to the limit of 110.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
      expr: |
        kubelet_running_pod_count{job!=""} > 110 * 0.9
      for: 15m
      labels:
        severity: warning
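To load these rules, apply the manifest to a cluster running the Prometheus Operator; the Operator mounts PrometheusRule objects into any Prometheus whose ruleSelector matches their labels (here, release: prom from the chart). A minimal check, assuming kubectl points at that cluster and the manifest is saved as pod-node-rules.yaml (the filename is illustrative):

    kubectl apply -f pod-node-rules.yaml
    kubectl get prometheusrule pod-node-rules -n default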