@jiang-wei
Last active October 22, 2019 18:25
Prometheus alert rules for GKE
$ cat alerts.yaml
"groups":
- "name": "kubernetes-absent"
"rules":
- "alert": "AlertmanagerDown"
"annotations":
"message": "Alertmanager has disappeared from Prometheus target discovery."
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerdown"
"expr": |
absent(up{job="alertmanager"} == 1)
# up{...} yields an instant vector
# vector == 1 filters out series whose value is not 1
# absent(v) returns vector(1) when v contains no series
# so the whole expression returns vector(1) only when no up{job="alertmanager"} series equals 1,
# i.e. the alert fires only when every instance is down
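# Worked example (hypothetical values): with two replicas reporting
# up{job="alertmanager",instance="a"} = 1 and up{job="alertmanager",instance="b"} = 0,
# the filtered vector still contains one series, so absent() returns nothing and the
# alert stays silent; only when both series are 0 (or disappear entirely) does
# absent() return vector(1) and the alert fire.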
"for": "15m"
"labels":
"severity": "critical"
- "alert": "CAdvisorDown"
"annotations":
"message": "CAdvisor has disappeared from Prometheus target discovery."
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cadvisordown"
"expr": |
absent(up{job="cadvisor"} == 1)
# cadvisor must run on every node,
# so absent(up{...} == 1) alone is not enough here (it only fires when every node is missing)
# better: up{job="cadvisor", instance="node_name"} == 0 or absent(up{job="cadvisor", instance="foo",} == 1), per node
# TODO: alternatively, take the full node list from kube_node_info and filter the up{...} results against it
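# Sketch of that TODO (assumptions: kube_node_info exposes a "node" label, and each
# cadvisor target's "instance" label is the node name):
#   kube_node_info{job="kube-state-metrics"}
#     unless on(node)
#   label_replace(up{job="cadvisor"} == 1, "node", "$1", "instance", "(.+)")
# This yields one series per node that has no healthy cadvisor target, so the alert
# carries the affected node name instead of firing only when every node is down.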
"for": "15m"
"labels":
"severity": "critical"
- "alert": "KubeAPIDown"
"annotations":
"message": "KubeAPI has disappeared from Prometheus target discovery."
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown"
"expr": |
absent(up{job="apiserver"} == 1)
"for": "15m"
"labels":
"severity": "critical"
- "alert": "KubeDNSDown"
"annotations":
"message": "KubeDNS has disappeared from Prometheus target discovery."
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubednsdown"
"expr": |
absent(up{job="kube-dns"} == 1)
"for": "15m"
"labels":
"severity": "critical"
- "alert": "KubeStateMetricsDown"
"annotations":
"message": "KubeStateMetrics has disappeared from Prometheus target discovery."
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricsdown"
"expr": |
absent(up{job="kube-state-metrics"} == 1)
"for": "15m"
"labels":
"severity": "critical"
- "alert": "KubeletDown"
"annotations":
"message": "Kubelet has disappeared from Prometheus target discovery."
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown"
"expr": |
absent(up{job="kubelet"} == 1)
"for": "15m"
"labels":
"severity": "critical"
- "alert": "NodeExporterDown"
"annotations":
"message": "NodeExporter has disappeared from Prometheus target discovery."
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeexporterdown"
"expr": |
absent(up{job="node-exporter"} == 1)
"for": "15m"
"labels":
"severity": "critical"
- "alert": "PrometheusDown"
"annotations":
"message": "Prometheus has disappeared from Prometheus target discovery."
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusdown"
"expr": |
absent(up{job="prometheus"} == 1)
"for": "15m"
"labels":
"severity": "critical"
- "name": "kubernetes-apps"
"rules":
- "alert": "KubePodCrashLooping"
"annotations":
"message": "{{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf \"%.2f\" $value }} / second"
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping"
"expr": |
rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) > 0
"for": "1h"
"labels":
"severity": "critical"
- "alert": "KubePodNotReady"
"annotations":
"message": "{{ $labels.namespace }}/{{ $labels.pod }} is not ready."
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready"
"expr": |
sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase!~"Running|Succeeded"}) > 0
"for": "1h"
"labels":
"severity": "critical"
- "alert": "KubeDeploymentGenerationMismatch"
"annotations":
"message": "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} generation mismatch"
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch"
"expr": |
kube_deployment_status_observed_generation{job="kube-state-metrics"}
!=
kube_deployment_metadata_generation{job="kube-state-metrics"}
"for": "15m"
"labels":
"severity": "critical"
- "alert": "KubeDeploymentReplicasMismatch"
"annotations":
"message": "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replica mismatch"
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch"
"expr": |
kube_deployment_spec_replicas{job="kube-state-metrics"}
!=
kube_deployment_status_replicas_available{job="kube-state-metrics"}
"for": "15m"
"labels":
"severity": "critical"
- "alert": "KubeStatefulSetReplicasMismatch"
"annotations":
"message": "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} replica mismatch"
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch"
"expr": |
kube_statefulset_status_replicas_ready{job="kube-state-metrics"}
!=
kube_statefulset_status_replicas{job="kube-state-metrics"}
"for": "15m"
"labels":
"severity": "critical"
- "alert": "KubeStatefulSetGenerationMismatch"
"annotations":
"message": "StatefulSet {{ $labels.namespace }}/{{ labels.statefulset }} generation mismatch"
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch"
"expr": |
kube_statefulset_status_observed_generation{job="kube-state-metrics"}
!=
kube_statefulset_metadata_generation{job="kube-state-metrics"}
"for": "15m"
"labels":
"severity": "critical"
- "alert": "KubeDaemonSetRolloutStuck"
"annotations":
"message": "Only {{$value}}% of desired pods scheduled and ready for daemon set {{$labels.namespace}}/{{$labels.daemonset}}"
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck"
"expr": |
kube_daemonset_status_number_ready{job="kube-state-metrics"}
/
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} * 100 < 100
"for": "15m"
"labels":
"severity": "critical"
- "alert": "KubeDaemonSetNotScheduled"
"annotations":
"message": "A number of pods of daemonset {{$labels.namespace}}/{{$labels.daemonset}} are not scheduled."
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled"
"expr": |
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
-
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0
"for": "10m"
"labels":
"severity": "warning"
- "alert": "KubeDaemonSetMisScheduled"
"annotations":
"message": "A number of pods of daemonset {{$labels.namespace}}/{{$labels.daemonset}} are running where they are not supposed to run."
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled"
"expr": |
kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0
"for": "10m"
"labels":
"severity": "warning"
- "alert": "KubeCronJobRunning"
"annotations":
"message": "CronJob {{ $labels.namespaces }}/{{ $labels.cronjob }} is taking more than 1h to complete."
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecronjobrunning"
"expr": |
time() - kube_cronjob_next_schedule_time{job="kube-state-metrics"} > 3600
"for": "1h"
"labels":
"severity": "warning"
- "alert": "KubeJobCompletion"
"annotations":
"message": "Job {{ $labels.namespaces }}/{{ $labels.job }} is taking more than 1h to complete."
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion"
"expr": |
kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0
"for": "1h"
"labels":
"severity": "warning"
- "alert": "KubeJobFailed"
"annotations":
"message": "Job {{ $labels.namespaces }}/{{ $labels.job }} failed to complete."
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed"
"expr": |
kube_job_status_failed{job="kube-state-metrics"} > 0
"for": "1h"
"labels":
"severity": "warning"
- "name": "kubernetes-resources"
"rules":
- "alert": "KubeCPUOvercommit"
"annotations":
"message": "Overcommited CPU resource requests on Pods, cannot tolerate node failure."
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit"
"expr": |
sum(namespace_name:kube_pod_container_resource_requests_cpu_cores:sum)
/
sum(node:node_num_cpu:sum)
>
(count(node:node_num_cpu:sum)-1) / count(node:node_num_cpu:sum)
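# The (n-1)/n threshold is the headroom needed to survive one node failure:
# e.g. with 4 nodes the ratio of requested to allocatable CPU must stay below 3/4,
# so the remaining 3 nodes could still host every requested core.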
"for": "5m"
"labels":
"severity": "warning"
- "alert": "KubeMemOvercommit"
"annotations":
"message": "Overcommited Memory resource requests on Pods, cannot tolerate node failure."
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit"
"expr": |
sum(namespace_name:kube_pod_container_resource_requests_memory_bytes:sum)
/
sum(node_memory_MemTotal)
>
(count(node:node_num_cpu:sum)-1)
/
count(node:node_num_cpu:sum)
"for": "5m"
"labels":
"severity": "warning"
- "alert": "KubeCPUOvercommit"
"annotations":
"message": "Overcommited CPU resource request quota on Namespaces."
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit"
"expr": |
sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.cpu"})
/
sum(node:node_num_cpu:sum)
> 1.5
"for": "5m"
"labels":
"severity": "warning"
- "alert": "KubeMemOvercommit"
"annotations":
"message": "Overcommited Memory resource request quota on Namespaces."
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit"
"expr": |
sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.memory"})
/
sum(node_memory_MemTotal{job="node-exporter"})
> 1.5
"for": "5m"
"labels":
"severity": "warning"
- "alert": "KubeQuotaExceeded"
"annotations":
"message": "{{ printf \"%0.0f\" $value }}% usage of {{ $labels.resource }} in namespace {{ $labels.namespace }}."
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded"
"expr": |
100 * kube_resourcequota{job="kube-state-metrics", type="used"}
/ ignoring(instance, job, type)
kube_resourcequota{job="kube-state-metrics", type="hard"}
> 90
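# ignoring(instance, job, type) drops those labels on both sides of the division so
# each "used" quota series pairs with its matching "hard" series by the remaining
# labels (namespace, resourcequota, resource).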
"for": "15m"
"labels":
"severity": "warning"
- "name": "kubernetes-storage"
"rules":
- "alert": "KubePersistentVolumeUsageCritical"
"annotations":
"message": "The persistent volume claimed by {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} has {{ printf \"%0.0f\" $value }}% free."
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical"
"expr": |
100 * kubelet_volume_stats_available_bytes{job="kubelet"}
/
kubelet_volume_stats_capacity_bytes{job="kubelet"}
< 3
"for": "1m"
"labels":
"severity": "critical"
- "alert": "KubePersistentVolumeFullInFourDays"
"annotations":
"message": "Based on recent sampling, the persistent volume claimed by {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} is expected to fill up within four days."
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays"
"expr": |
predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[1h], 4 * 24 * 3600) < 0
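# predict_linear fits a linear trend to the last hour of free-bytes samples and
# extrapolates it 4 * 24 * 3600 seconds (four days) ahead; a result below 0 means
# the volume is on track to run out of space within that window.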
"for": "5m"
"labels":
"severity": "critical"
- "name": "kubernetes-system"
"rules":
- "alert": "KubeNodeNotReady"
"annotations":
"message": "{{ $labels.node }} has been unready for more than an hour"
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready"
"expr": |
kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
"for": "1h"
"labels":
"severity": "warning"
- "alert": "KubeVersionMismatch"
"annotations":
"message": "There are {{ $value }} different versions of Kubernetes components running."
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch"
"expr": |
count(count(kubernetes_build_info{job!="kube-dns"}) by (gitVersion)) > 1
"for": "1h"
"labels":
"severity": "warning"
- "alert": "KubeClientErrors"
"annotations":
"message": "Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ printf \"%0.0f\" $value }}% errors.'"
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors"
"expr": |
sum(rate(rest_client_requests_total{code!~"2.."}[5m])) by (instance, job) * 100
/
sum(rate(rest_client_requests_total[5m])) by (instance, job)
> 1
"for": "15m"
"labels":
"severity": "warning"
- "alert": "KubeClientErrors"
"annotations":
"message": "Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ printf \"%0.0f\" $value }} errors / sec.'"
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors"
"expr": |
sum(rate(ksm_scrape_error_total{job="kube-state-metrics"}[5m])) by (instance, job) > 0.1
"for": "15m"
"labels":
"severity": "warning"
- "alert": "KubeletTooManyPods"
"annotations":
"message": "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110."
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods"
"expr": |
kubelet_running_pod_count{job="kubelet"} > 100
"for": "15m"
"labels":
"severity": "warning"
- "alert": "KubeAPILatencyHigh"
"annotations":
"message": "The API server has a 99th percentile latency of {{ $value }} seconds for {{$labels.verb}} {{$labels.resource}}."
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh"
"expr": |
cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1
"for": "10m"
"labels":
"severity": "warning"
- "alert": "KubeAPILatencyHigh"
"annotations":
"message": "The API server has a 99th percentile latency of {{ $value }} seconds for {{$labels.verb}} {{$labels.resource}}."
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh"
"expr": |
cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4
"for": "10m"
"labels":
"severity": "critical"
- "alert": "KubeAPIErrorsHigh"
"annotations":
"message": "API server is erroring for {{ $value }}% of requests."
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh"
"expr": |
sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod)
/
sum(rate(apiserver_request_count{job="apiserver"}[5m])) without(instance, pod) * 100 > 5
"for": "10m"
"labels":
"severity": "critical"
- "alert": "KubeAPIErrorsHigh"
"annotations":
"message": "API server is erroring for {{ $value }}% of requests."
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh"
"expr": |
sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod)
/
sum(rate(apiserver_request_count{job="apiserver"}[5m])) without(instance, pod) * 100 > 5
"for": "10m"
"labels":
"severity": "warning"
- "alert": "KubeClientCertificateExpiration"
"annotations":
"message": "Kubernetes API certificate is expiring in less than 7 days."
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration"
"expr": |
histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
"labels":
"severity": "warning"
- "alert": "KubeClientCertificateExpiration"
"annotations":
"message": "Kubernetes API certificate is expiring in less than 1 day."
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration"
"expr": |
histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
"labels":
"severity": "critical"
- "name": "alertmanager.rules"
"rules":
- "alert": "AlertmanagerFailedReload"
"annotations":
"description": "Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}."
"summary": "Alertmanager's configuration reload failed"
"expr": |
alertmanager_config_last_reload_successful{job="alertmanager"} == 0
"for": "10m"
"labels":
"severity": "warning"
- "name": "general.rules"
"rules":
- "alert": "TargetDown"
"annotations":
"description": "{{ $value }}% of {{ $labels.job }} targets are down."
"summary": "Targets are down"
"expr": "100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10"
"for": "10m"
"labels":
"severity": "warning"
- "alert": "DeadMansSwitch"
"annotations":
"description": "This is a DeadMansSwitch meant to ensure that the entire Alerting pipeline is functional."
"summary": "Alerting DeadMansSwitch"
"expr": "vector(1)"
"labels":
"severity": "none"
- "name": "kube-prometheus-node-alerting.rules"
"rules":
- "alert": "NodeDiskRunningFull"
"annotations":
"description": "device {{$labels.device}} on node {{$labels.instance}} is running full within the next 24 hours (mounted at {{$labels.mountpoint}})"
"summary": "Node disk is running full within 24 hours"
"expr": |
predict_linear(node_filesystem_free{job="node-exporter"}[6h], 3600 * 24) < 0
"for": "30m"
"labels":
"severity": "warning"
- "alert": "NodeDiskRunningFull"
"annotations":
"description": "device {{$labels.device}} on node {{$labels.instance}} is running full within the next 2 hours (mounted at {{$labels.mountpoint}})"
"summary": "Node disk is running full within 2 hours"
"expr": |
predict_linear(node_filesystem_free{job="node-exporter"}[30m], 3600 * 2) < 0
"for": "10m"
"labels":
"severity": "critical"
- "name": "prometheus.rules"
"rules":
- "alert": "PrometheusConfigReloadFailed"
"annotations":
"description": "Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}"
"summary": "Reloading Promehteus' configuration failed"
"expr": |
prometheus_config_last_reload_successful{job="prometheus"} == 0
"for": "10m"
"labels":
"severity": "warning"
- "alert": "PrometheusNotificationQueueRunningFull"
"annotations":
"description": "Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{ $labels.pod}}"
"summary": "Prometheus' alert notification queue is running full"
"expr": |
predict_linear(prometheus_notifications_queue_length{job="prometheus"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job="prometheus"}
"for": "10m"
"labels":
"severity": "warning"
- "alert": "PrometheusErrorSendingAlerts"
"annotations":
"description": "Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}}"
"summary": "Errors while sending alert from Prometheus"
"expr": |
rate(prometheus_notifications_errors_total{job="prometheus"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus"}[5m]) > 0.01
"for": "10m"
"labels":
"severity": "warning"
- "alert": "PrometheusErrorSendingAlerts"
"annotations":
"description": "Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}}"
"summary": "Errors while sending alerts from Prometheus"
"expr": |
rate(prometheus_notifications_errors_total{job="prometheus"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus"}[5m]) > 0.03
"for": "10m"
"labels":
"severity": "critical"
- "alert": "PrometheusNotConnectedToAlertmanagers"
"annotations":
"description": "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected to any Alertmanagers"
"summary": "Prometheus is not connected to any Alertmanagers"
"expr": |
prometheus_notifications_alertmanagers_discovered{job="prometheus"} < 1
"for": "10m"
"labels":
"severity": "warning"
- "alert": "PrometheusTSDBReloadsFailing"
"annotations":
"description": "{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} reload failures over the last four hours."
"summary": "Prometheus has issues reloading data blocks from disk"
"expr": |
increase(prometheus_tsdb_reloads_failures_total{job="prometheus"}[2h]) > 0
"for": "12h"
"labels":
"severity": "warning"
- "alert": "PrometheusTSDBCompactionsFailing"
"annotations":
"description": "{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} compaction failures over the last four hours."
"summary": "Prometheus has issues compacting sample blocks"
"expr": |
increase(prometheus_tsdb_compactions_failed_total{job="prometheus"}[2h]) > 0
"for": "12h"
"labels":
"severity": "warning"
- "alert": "PrometheusTSDBWALCorruptions"
"annotations":
"description": "{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead log (WAL)."
"summary": "Prometheus write-ahead log is corrupted"
"expr": |
tsdb_wal_corruptions_total{job="prometheus"} > 0
"for": "4h"
"labels":
"severity": "warning"
- "alert": "PrometheusNotIngestingSamples"
"annotations":
"description": "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples."
"summary": "Prometheus isn't ingesting samples"
"expr": |
rate(prometheus_tsdb_head_samples_appended_total{job="prometheus"}[5m]) <= 0
"for": "10m"
"labels":
"severity": "warning"
- "alert": "PrometheusTargetScapesDuplicate"
"annotations":
"description": "{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate timestamps but different values"
"summary": "Prometheus has many samples rejected"
"expr": |
increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus"}[5m]) > 0
"for": "10m"
"labels":
"severity": "warning"