@bastjan
Created November 15, 2021 15:23
Full diff between OCP 4.8 and 4.9 monitoring rules
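
To make a diff this large easier to review, here is a minimal sketch that summarises which alerts were added or removed between the two compiled files. It is only a rough helper under a few assumptions: the file paths are taken from the diff command below, PyYAML must be installed, and because the diff does not show the top-level manifest layout, the script walks the YAML generically and collects every mapping with an `alert` key instead of assuming a `spec.groups` structure.

import yaml  # assumption: PyYAML is available (pip install pyyaml)

def alert_names(path):
    """Recursively collect every 'alert:' name found in the YAML file."""
    names = set()

    def walk(node):
        if isinstance(node, dict):
            if 'alert' in node:
                names.add(node['alert'])
            for value in node.values():
                walk(value)
        elif isinstance(node, list):
            for item in node:
                walk(item)

    with open(path) as f:
        # the compiled file may contain more than one YAML document
        for doc in yaml.safe_load_all(f):
            walk(doc)
    return names

old = alert_names('compiled-4.8/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml')
new = alert_names('compiled/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml')
print('removed:', sorted(old - new))
print('added:  ', sorted(new - old))

The full diff follows.
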
diff -rub compiled-4.8/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml compiled/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml
--- compiled-4.8/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml 2021-11-15 16:12:31.000000000 +0100
+++ compiled/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml 2021-11-15 16:13:35.000000000 +0100
@@ -156,8 +156,10 @@
syn: 'true'
- alert: SYN_SamplesTBRInaccessibleOnBoot
annotations:
- message: 'Samples operator could not access ''registry.redhat.io'' during
- its initial installation and it bootstrapped as removed.
+ message: 'One of two situations has occurred. Either
+
+ samples operator could not access ''registry.redhat.io'' during its
+ initial installation and it bootstrapped as removed.
If this is expected, and stems from installing in a restricted network
environment, please note that if you
@@ -175,7 +177,11 @@
assist the mirroring process.
- '
+ Or, the use of allowed registries or blocked registries with global
+ imagestream configuration will not allow
+
+ samples operator to create imagestreams using the default image registry
+ ''registry.redhat.io''.'
syn_component: openshift4-monitoring
expr: openshift_samples_tbr_inaccessible_info == 1
for: 2d
@@ -250,6 +256,7 @@
annotations:
description: Configuration has failed to load for {{ $labels.namespace
}}/{{ $labels.pod}}.
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/AlertmanagerFailedReload.md
summary: Reloading an Alertmanager configuration has failed.
syn_component: openshift4-monitoring
expr: '# Without max_over_time, failed scrapes could create false negatives,
@@ -301,18 +308,6 @@
rules: []
- name: syn-cluster-machine-approver.rules
rules:
- - alert: SYN_ClusterMachineApproverDown
- annotations:
- message: ClusterMachineApprover has disappeared from Prometheus target
- discovery.
- syn_component: openshift4-monitoring
- expr: 'absent(up{job="machine-approver"} == 1)
-
- '
- for: 10m
- labels:
- severity: critical
- syn: 'true'
- alert: SYN_MachineApproverMaxPendingCSRsReached
annotations:
message: max pending CSRs threshold reached.
@@ -328,7 +323,7 @@
rules:
- alert: SYN_ClusterProxyApplySlow
annotations:
- message: The cluster is taking too long, on average, to apply kubernetes
+ summary: The cluster is taking too long, on average, to apply kubernetes
service rules to iptables.
syn_component: openshift4-monitoring
expr: 'histogram_quantile(0.95, sum(rate(kubeproxy_sync_proxy_rules_duration_seconds_bucket[5m]))
@@ -340,7 +335,7 @@
syn: 'true'
- alert: SYN_NodeProxyApplySlow
annotations:
- message: SDN pod {{"{{"}} $labels.pod {{"}}"}} on node {{"{{"}} $labels.node
+ summary: SDN pod {{"{{"}} $labels.pod {{"}}"}} on node {{"{{"}} $labels.node
{{"}}"}} is taking too long, on average, to apply kubernetes service
rules to iptables.
syn_component: openshift4-monitoring
@@ -352,7 +347,7 @@
syn: 'true'
- alert: SYN_NodeProxyApplyStale
annotations:
- message: SDN pod {{"{{"}} $labels.pod {{"}}"}} on node {{"{{"}} $labels.node
+ summary: SDN pod {{"{{"}} $labels.pod {{"}}"}} on node {{"{{"}} $labels.node
{{"}}"}} has stale kubernetes service rules in iptables.
syn_component: openshift4-monitoring
expr: '(kubeproxy_sync_proxy_rules_last_queued_timestamp_seconds - kubeproxy_sync_proxy_rules_last_timestamp_seconds)
@@ -368,10 +363,8 @@
syn: 'true'
- alert: SYN_NodeWithoutSDNPod
annotations:
- message: 'All nodes should be running an sdn pod, {{"{{"}} $labels.node
+ summary: All nodes should be running an sdn pod, {{"{{"}} $labels.node
{{"}}"}} is not.
-
- '
syn_component: openshift4-monitoring
expr: '(kube_node_info unless on(node) topk by (node) (1, kube_pod_info{namespace="openshift-sdn", pod=~"sdn.*"}))
> 0
@@ -383,7 +376,7 @@
syn: 'true'
- alert: SYN_SDNPodNotReady
annotations:
- message: SDN pod {{"{{"}} $labels.pod {{"}}"}} on node {{"{{"}} $labels.node
+ summary: SDN pod {{"{{"}} $labels.pod {{"}}"}} on node {{"{{"}} $labels.node
{{"}}"}} is not ready.
syn_component: openshift4-monitoring
expr: 'kube_pod_status_ready{namespace=''openshift-sdn'', condition=''true''}
@@ -398,16 +391,18 @@
rules:
- alert: SYN_ClusterNotUpgradeable
annotations:
- message: One or more cluster operators have been blocking minor version
- cluster upgrades for at least an hour for reason {{ with $cluster_operator_conditions
- := "cluster_operator_conditions" | query}}{{range $value := .}}{{if
- and (eq (label "name" $value) "version") (eq (label "condition" $value)
- "Upgradeable") (eq (label "endpoint" $value) "metrics") (eq (value $value)
- 0.0) (ne (len (label "reason" $value)) 0) }}{{label "reason" $value}}.{{end}}{{end}}{{end}}
- {{ with $console_url := "console_url" | query }}{{ if ne (len (label
- "url" (first $console_url ) ) ) 0}} For more information refer to {{
- label "url" (first $console_url ) }}/settings/cluster/.{{ end }}{{ end
- }}
+ description: In most cases, you will still be able to apply patch releases.
+ Reason {{ with $cluster_operator_conditions := "cluster_operator_conditions"
+ | query}}{{range $value := .}}{{if and (eq (label "name" $value) "version")
+ (eq (label "condition" $value) "Upgradeable") (eq (label "endpoint"
+ $value) "metrics") (eq (value $value) 0.0) (ne (len (label "reason"
+ $value)) 0) }}{{label "reason" $value}}.{{end}}{{end}}{{end}} For more
+ information refer to 'oc adm upgrade'{{ with $console_url := "console_url"
+ | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} or
+ {{ label "url" (first $console_url ) }}/settings/cluster/{{ end }}{{
+ end }}.
+ summary: One or more cluster operators have been blocking minor version
+ cluster upgrades for at least an hour.
syn_component: openshift4-monitoring
expr: 'max by (name, condition, endpoint) (cluster_operator_conditions{name="version",
condition="Upgradeable", endpoint="metrics"} == 0)
@@ -419,9 +414,14 @@
syn: 'true'
- alert: SYN_ClusterOperatorDegraded
annotations:
- message: Cluster operator {{ $labels.name }} has been degraded for 30
- minutes. Operator is degraded because {{ $labels.reason }} and cluster
- upgrades will be unstable.
+ description: The {{ $labels.name }} operator is degraded because {{ $labels.reason
+ }}, and the components it manages may have reduced quality of service. Cluster
+ upgrades may not complete. For more information refer to 'oc get -o
+ yaml clusteroperator {{ $labels.name }}'{{ with $console_url := "console_url"
+ | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} or
+ {{ label "url" (first $console_url ) }}/settings/cluster/{{ end }}{{
+ end }}.
+ summary: Cluster operator has been degraded for 30 minutes.
syn_component: openshift4-monitoring
expr: "(\n cluster_operator_conditions{job=\"cluster-version-operator\"\
, condition=\"Degraded\"}\n or on (name)\n group by (name) (cluster_operator_up{job=\"\
@@ -432,9 +432,14 @@
syn: 'true'
- alert: SYN_ClusterOperatorDown
annotations:
- message: Cluster operator {{ $labels.name }} has not been available for
- 10 minutes. Operator may be down or disabled, cluster will not be kept
- up to date and upgrades will not be possible.
+ description: The {{ $labels.name }} operator may be down or disabled,
+ and the components it manages may be unavailable or degraded. Cluster
+ upgrades may not complete. For more information refer to 'oc get -o
+ yaml clusteroperator {{ $labels.name }}'{{ with $console_url := "console_url"
+ | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} or
+ {{ label "url" (first $console_url ) }}/settings/cluster/{{ end }}{{
+ end }}.
+ summary: Cluster operator has not been available for 10 minutes.
syn_component: openshift4-monitoring
expr: 'cluster_operator_up{job="cluster-version-operator"} == 0
@@ -445,8 +450,12 @@
syn: 'true'
- alert: SYN_ClusterOperatorFlapping
annotations:
- message: Cluster operator {{ $labels.name }} up status is changing often.
- This might cause upgrades to be unstable.
+ description: The {{ $labels.name }} operator behavior might cause upgrades
+ to be unstable. For more information refer to 'oc get -o yaml clusteroperator
+ {{ $labels.name }}'{{ with $console_url := "console_url" | query }}{{
+ if ne (len (label "url" (first $console_url ) ) ) 0}} or {{ label "url"
+ (first $console_url ) }}/settings/cluster/{{ end }}{{ end }}.
+ summary: Cluster operator up status is changing often.
syn_component: openshift4-monitoring
expr: 'changes(cluster_operator_up{job="cluster-version-operator"}[2m])
> 2
@@ -460,8 +469,11 @@
rules:
- alert: SYN_CannotRetrieveUpdates
annotations:
- message: Cluster version operator has not retrieved updates in {{ $value
- | humanizeDuration }}. Failure reason {{ with $cluster_operator_conditions
+ description: Failure to retrieve updates means that cluster administrators
+ will need to monitor for available updates on their own or risk falling
+ behind on security or other bugfixes. If the failure is expected, you
+ can clear spec.channel in the ClusterVersion object to tell the cluster-version
+ operator to not retrieve updates. Failure reason {{ with $cluster_operator_conditions
:= "cluster_operator_conditions" | query}}{{range $value := .}}{{if
and (eq (label "name" $value) "version") (eq (label "condition" $value)
"RetrievedUpdates") (eq (label "endpoint" $value) "metrics") (eq (value
@@ -469,6 +481,8 @@
$console_url := "console_url" | query }}{{ if ne (len (label "url" (first
$console_url ) ) ) 0}} For more information refer to {{ label "url"
(first $console_url ) }}/settings/cluster/.{{ end }}{{ end }}
+ summary: Cluster version operator has not retrieved updates in {{ $value
+ | humanizeDuration }}.
syn_component: openshift4-monitoring
expr: '(time()-cluster_version_operator_update_retrieval_timestamp_seconds)
>= 3600 and ignoring(condition, name, reason) cluster_operator_conditions{name="version",
@@ -480,9 +494,15 @@
syn: 'true'
- alert: SYN_ClusterVersionOperatorDown
annotations:
- message: Cluster version operator has disappeared from Prometheus target
- discovery. Operator may be down or disabled, cluster will not be kept
- up to date and upgrades will not be possible.
+ description: The operator may be down or disabled. The cluster will not
+ be kept up to date and upgrades will not be possible. Inspect the openshift-cluster-version
+ namespace for events or changes to the cluster-version-operator deployment
+ or pods to diagnose and repair. {{ with $console_url := "console_url"
+ | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} For
+ more information refer to {{ label "url" (first $console_url ) }}/k8s/cluster/projects/openshift-cluster-version.{{
+ end }}{{ end }}
+ summary: Cluster version operator has disappeared from Prometheus target
+ discovery.
syn_component: openshift4-monitoring
expr: 'absent(up{job="cluster-version-operator"} == 1)
@@ -554,13 +574,14 @@
syn: 'true'
- alert: SYN_UpdateAvailable
annotations:
- message: Your upstream update recommendation service recommends you update
- your cluster. For more information refer to 'oc adm upgrade'{{ with
- $console_url := "console_url" | query }}{{ if ne (len (label "url" (first
- $console_url ) ) ) 0}} or {{ label "url" (first $console_url ) }}/settings/cluster/{{
+ description: For more information refer to 'oc adm upgrade'{{ with $console_url
+ := "console_url" | query }}{{ if ne (len (label "url" (first $console_url
+ ) ) ) 0}} or {{ label "url" (first $console_url ) }}/settings/cluster/{{
end }}{{ end }}.
+ summary: Your upstream update recommendation service recommends you update
+ your cluster.
syn_component: openshift4-monitoring
- expr: 'cluster_version_available_updates > 0
+ expr: 'sum by (channel,upstream) (cluster_version_available_updates) > 0
'
labels:
@@ -582,13 +603,14 @@
more CPU pressure is likely to cause a failover; increase available
CPU.
syn_component: openshift4-monitoring
- expr: '100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))
+ expr: '100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1m]))
* 100) > 90 AND on (instance) label_replace( kube_node_role{role="master"},
"instance", "$1", "node", "(.+)" )
'
for: 5m
labels:
+ namespace: openshift-kube-apiserver
severity: critical
syn: 'true'
- alert: SYN_HighOverallControlPlaneCPU
@@ -605,11 +627,12 @@
outage may cause a cascading failure; increase available CPU.
syn_component: openshift4-monitoring
expr: "sum(\n 100 - (avg by (instance) (rate(node_cpu_seconds_total{mode=\"\
- idle\"}[5m])) * 100)\n AND on (instance) label_replace( kube_node_role{role=\"\
+ idle\"}[1m])) * 100)\n AND on (instance) label_replace( kube_node_role{role=\"\
master\"}, \"instance\", \"$1\", \"node\", \"(.+)\" )\n)\n/\ncount(kube_node_role{role=\"\
master\"})\n> 60\n"
for: 10m
labels:
+ namespace: openshift-kube-apiserver
severity: warning
syn: 'true'
- name: syn-etcd
@@ -791,11 +814,12 @@
syn_component: openshift4-monitoring
expr: vector(1)
labels:
+ namespace: openshift-monitoring
severity: none
syn: 'true'
- name: syn-k8s.rules
rules: []
- - name: syn-kube-apiserver-slos
+ - name: syn-kube-apiserver-slos-basic
rules:
- alert: SYN_KubeAPIErrorBudgetBurn
annotations:
@@ -816,6 +840,7 @@
for: 2m
labels:
long: 1h
+ namespace: openshift-kube-apiserver
severity: critical
short: 5m
syn: 'true'
@@ -838,53 +863,10 @@
for: 15m
labels:
long: 6h
+ namespace: openshift-kube-apiserver
severity: critical
short: 30m
syn: 'true'
- - alert: SYN_KubeAPIErrorBudgetBurn
- annotations:
- description: The API server is burning too much error budget. This alert
- fires when too many requests are failing with high latency. Use the
- 'API Performance' monitoring dashboards to narrow down the request states
- and latency. The 'etcd' monitoring dashboards also provides metrics
- to help determine etcd stability and performance.
- summary: The API server is burning too much error budget.
- syn_component: openshift4-monitoring
- expr: 'sum(apiserver_request:burnrate1d) > (3.00 * 0.01000)
-
- and
-
- sum(apiserver_request:burnrate2h) > (3.00 * 0.01000)
-
- '
- for: 1h
- labels:
- long: 1d
- severity: warning
- short: 2h
- syn: 'true'
- - alert: SYN_KubeAPIErrorBudgetBurn
- annotations:
- description: The API server is burning too much error budget. This alert
- fires when too many requests are failing with high latency. Use the
- 'API Performance' monitoring dashboards to narrow down the request states
- and latency. The 'etcd' monitoring dashboards also provides metrics
- to help determine etcd stability and performance.
- summary: The API server is burning too much error budget.
- syn_component: openshift4-monitoring
- expr: 'sum(apiserver_request:burnrate3d) > (1.00 * 0.01000)
-
- and
-
- sum(apiserver_request:burnrate6h) > (1.00 * 0.01000)
-
- '
- for: 3h
- labels:
- long: 3d
- severity: warning
- short: 6h
- syn: 'true'
- name: syn-kube-apiserver.rules
rules: []
- name: syn-kube-prometheus-general.rules
@@ -933,7 +915,7 @@
$labels.container}} has been in waiting state for longer than 1 hour.
summary: Pod container waiting longer than 1 hour
syn_component: openshift4-monitoring
- expr: 'sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"})
+ expr: 'sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{namespace=~"(openshift-.*|kube-.*|default)",job="kube-state-metrics"})
> 0
'
@@ -947,7 +929,7 @@
$labels.daemonset }} are running where they are not supposed to run.'
summary: DaemonSet pods are misscheduled.
syn_component: openshift4-monitoring
- expr: 'kube_daemonset_status_number_misscheduled{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"}
+ expr: 'kube_daemonset_status_number_misscheduled{namespace=~"(openshift-.*|kube-.*|default)",job="kube-state-metrics"}
> 0
'
@@ -961,10 +943,9 @@
$labels.daemonset }} are not scheduled.'
summary: DaemonSet pods are not scheduled.
syn_component: openshift4-monitoring
- expr: "kube_daemonset_status_desired_number_scheduled{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\
+ expr: "kube_daemonset_status_desired_number_scheduled{namespace=~\"(openshift-.*|kube-.*|default)\"\
,job=\"kube-state-metrics\"}\n -\nkube_daemonset_status_current_number_scheduled{namespace=~\"\
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"} >\
- \ 0\n"
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"} > 0\n"
for: 10m
labels:
severity: warning
@@ -976,20 +957,18 @@
summary: DaemonSet rollout is stuck.
syn_component: openshift4-monitoring
expr: "(\n (\n kube_daemonset_status_current_number_scheduled{namespace=~\"\
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n\
- \ !=\n kube_daemonset_status_desired_number_scheduled{namespace=~\"\
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n\
- \ ) or (\n kube_daemonset_status_number_misscheduled{namespace=~\"\
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n\
- \ !=\n 0\n ) or (\n kube_daemonset_updated_number_scheduled{namespace=~\"\
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n\
- \ !=\n kube_daemonset_status_desired_number_scheduled{namespace=~\"\
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n\
- \ ) or (\n kube_daemonset_status_number_available{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\
- ,job=\"kube-state-metrics\"}\n !=\n kube_daemonset_status_desired_number_scheduled{namespace=~\"\
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n\
- \ )\n) and (\n changes(kube_daemonset_updated_number_scheduled{namespace=~\"\
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}[5m])\n\
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}\n !=\n\
+ \ kube_daemonset_status_desired_number_scheduled{namespace=~\"(openshift-.*|kube-.*|default)\"\
+ ,job=\"kube-state-metrics\"}\n ) or (\n kube_daemonset_status_number_misscheduled{namespace=~\"\
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}\n !=\n\
+ \ 0\n ) or (\n kube_daemonset_updated_number_scheduled{namespace=~\"\
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}\n !=\n\
+ \ kube_daemonset_status_desired_number_scheduled{namespace=~\"(openshift-.*|kube-.*|default)\"\
+ ,job=\"kube-state-metrics\"}\n ) or (\n kube_daemonset_status_number_available{namespace=~\"\
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}\n !=\n\
+ \ kube_daemonset_status_desired_number_scheduled{namespace=~\"(openshift-.*|kube-.*|default)\"\
+ ,job=\"kube-state-metrics\"}\n )\n) and (\n changes(kube_daemonset_updated_number_scheduled{namespace=~\"\
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}[5m])\n \
\ ==\n 0\n)\n"
for: 30m
labels:
@@ -1002,46 +981,44 @@
has not been rolled back.
summary: Deployment generation mismatch due to possible roll-back
syn_component: openshift4-monitoring
- expr: "kube_deployment_status_observed_generation{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\
+ expr: "kube_deployment_status_observed_generation{namespace=~\"(openshift-.*|kube-.*|default)\"\
,job=\"kube-state-metrics\"}\n !=\nkube_deployment_metadata_generation{namespace=~\"\
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n"
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}\n"
for: 15m
labels:
severity: warning
syn: 'true'
- alert: SYN_KubeHpaMaxedOut
annotations:
- description: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running
- at max replicas for longer than 15 minutes.
+ description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}
+ has been running at max replicas for longer than 15 minutes.
summary: HPA is running at max replicas
syn_component: openshift4-monitoring
expr: "kube_horizontalpodautoscaler_status_current_replicas{namespace=~\"\
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n\
- \ ==\nkube_horizontalpodautoscaler_spec_max_replicas{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\
- ,job=\"kube-state-metrics\"}\n"
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}\n ==\nkube_horizontalpodautoscaler_spec_max_replicas{namespace=~\"\
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}\n"
for: 15m
labels:
severity: warning
syn: 'true'
- alert: SYN_KubeHpaReplicasMismatch
annotations:
- description: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched
- the desired number of replicas for longer than 15 minutes.
+ description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}
+ has not matched the desired number of replicas for longer than 15 minutes.
summary: HPA has not matched descired number of replicas.
syn_component: openshift4-monitoring
expr: "(kube_horizontalpodautoscaler_status_desired_replicas{namespace=~\"\
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n\
- \ !=\nkube_horizontalpodautoscaler_status_current_replicas{namespace=~\"\
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"})\n\
- \ and\n(kube_horizontalpodautoscaler_status_current_replicas{namespace=~\"\
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n\
- \ >\nkube_horizontalpodautoscaler_spec_min_replicas{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\
- ,job=\"kube-state-metrics\"})\n and\n(kube_horizontalpodautoscaler_status_current_replicas{namespace=~\"\
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n\
- \ <\nkube_horizontalpodautoscaler_spec_max_replicas{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\
- ,job=\"kube-state-metrics\"})\n and\nchanges(kube_horizontalpodautoscaler_status_current_replicas{namespace=~\"\
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}[15m])\
- \ == 0\n"
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}\n !=\nkube_horizontalpodautoscaler_status_current_replicas{namespace=~\"\
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"})\n and\n\
+ (kube_horizontalpodautoscaler_status_current_replicas{namespace=~\"(openshift-.*|kube-.*|default)\"\
+ ,job=\"kube-state-metrics\"}\n >\nkube_horizontalpodautoscaler_spec_min_replicas{namespace=~\"\
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"})\n and\n\
+ (kube_horizontalpodautoscaler_status_current_replicas{namespace=~\"(openshift-.*|kube-.*|default)\"\
+ ,job=\"kube-state-metrics\"}\n <\nkube_horizontalpodautoscaler_spec_max_replicas{namespace=~\"\
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"})\n and\n\
+ changes(kube_horizontalpodautoscaler_status_current_replicas{namespace=~\"\
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}[15m]) ==\
+ \ 0\n"
for: 15m
labels:
severity: warning
@@ -1052,8 +1029,8 @@
more than 12 hours to complete.
summary: Job did not complete in time
syn_component: openshift4-monitoring
- expr: 'kube_job_spec_completions{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"}
- - kube_job_status_succeeded{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} >
+ expr: 'kube_job_spec_completions{namespace=~"(openshift-.*|kube-.*|default)",job="kube-state-metrics"}
+ - kube_job_status_succeeded{namespace=~"(openshift-.*|kube-.*|default)",job="kube-state-metrics"} >
0
'
@@ -1066,9 +1043,10 @@
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed
to complete. Removing failed job after investigation should clear this
alert.
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubeJobFailed.md
summary: Job failed to complete.
syn_component: openshift4-monitoring
- expr: 'kube_job_failed{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} >
+ expr: 'kube_job_failed{namespace=~"(openshift-.*|kube-.*|default)",job="kube-state-metrics"} >
0
'
@@ -1078,12 +1056,13 @@
syn: 'true'
- alert: SYN_KubePodCrashLooping
annotations:
- description: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
- }}) is restarting {{ printf "%.2f" $value }} times / 10 minutes.
+ description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
+ }}) is in waiting state (reason: "CrashLoopBackOff").'
summary: Pod is crash looping.
syn_component: openshift4-monitoring
- expr: 'rate(kube_pod_container_status_restarts_total{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"}[10m])
- * 60 * 5 > 0
+ expr: 'max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff",
+ namespace=~"(openshift-.*|kube-.*|default)",job="kube-state-metrics"}[5m])
+ >= 1
'
for: 15m
@@ -1094,10 +1073,11 @@
annotations:
description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in
a non-ready state for longer than 15 minutes.
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubePodNotReady.md
summary: Pod has been in a non-ready state for more than 15 minutes.
syn_component: openshift4-monitoring
expr: "sum by (namespace, pod) (\n max by(namespace, pod) (\n kube_pod_status_phase{namespace=~\"\
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\", phase=~\"\
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\", phase=~\"\
Pending|Unknown\"}\n ) * on(namespace, pod) group_left(owner_kind) topk\
\ by(namespace, pod) (\n 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!=\"\
Job\"})\n )\n) > 0\n"
@@ -1112,9 +1092,9 @@
has not been rolled back.
summary: StatefulSet generation mismatch due to possible roll-back
syn_component: openshift4-monitoring
- expr: "kube_statefulset_status_observed_generation{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\
+ expr: "kube_statefulset_status_observed_generation{namespace=~\"(openshift-.*|kube-.*|default)\"\
,job=\"kube-state-metrics\"}\n !=\nkube_statefulset_metadata_generation{namespace=~\"\
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n"
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}\n"
for: 15m
labels:
severity: warning
@@ -1126,12 +1106,11 @@
minutes.
summary: Deployment has not matched the expected number of replicas.
syn_component: openshift4-monitoring
- expr: "(\n kube_statefulset_status_replicas_ready{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\
+ expr: "(\n kube_statefulset_status_replicas_ready{namespace=~\"(openshift-.*|kube-.*|default)\"\
,job=\"kube-state-metrics\"}\n !=\n kube_statefulset_status_replicas{namespace=~\"\
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n\
- ) and (\n changes(kube_statefulset_status_replicas_updated{namespace=~\"\
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}[10m])\n\
- \ ==\n 0\n)\n"
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}\n) and (\n\
+ \ changes(kube_statefulset_status_replicas_updated{namespace=~\"(openshift-.*|kube-.*|default)\"\
+ ,job=\"kube-state-metrics\"}[10m])\n ==\n 0\n)\n"
for: 15m
labels:
severity: warning
@@ -1143,14 +1122,13 @@
summary: StatefulSet update has not been rolled out.
syn_component: openshift4-monitoring
expr: "(\n max without (revision) (\n kube_statefulset_status_current_revision{namespace=~\"\
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n\
- \ unless\n kube_statefulset_status_update_revision{namespace=~\"\
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n\
- \ )\n *\n (\n kube_statefulset_replicas{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\
- ,job=\"kube-state-metrics\"}\n !=\n kube_statefulset_status_replicas_updated{namespace=~\"\
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n\
- \ )\n) and (\n changes(kube_statefulset_status_replicas_updated{namespace=~\"\
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}[5m])\n\
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}\n unless\n\
+ \ kube_statefulset_status_update_revision{namespace=~\"(openshift-.*|kube-.*|default)\"\
+ ,job=\"kube-state-metrics\"}\n )\n *\n (\n kube_statefulset_replicas{namespace=~\"\
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}\n !=\n\
+ \ kube_statefulset_status_replicas_updated{namespace=~\"(openshift-.*|kube-.*|default)\"\
+ ,job=\"kube-state-metrics\"}\n )\n) and (\n changes(kube_statefulset_status_replicas_updated{namespace=~\"\
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}[5m])\n \
\ ==\n 0\n)\n"
for: 15m
labels:
@@ -1163,15 +1141,21 @@
- alert: SYN_KubeCPUOvercommit
annotations:
description: Cluster has overcommitted CPU resource requests for Pods
- and cannot tolerate node failure.
+ by {{ $value }} CPU shares and cannot tolerate node failure.
summary: Cluster has overcommitted CPU resource requests.
syn_component: openshift4-monitoring
- expr: "sum(namespace_cpu:kube_pod_container_resource_requests:sum{})\n \
- \ /\nsum(kube_node_status_allocatable{resource=\"cpu\"})\n >\n((count(kube_node_status_allocatable{resource=\"\
- cpu\"}) > 1) - 1) / count(kube_node_status_allocatable{resource=\"cpu\"\
- })\n"
- for: 5m
+ expr: 'sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) - (sum(kube_node_status_allocatable{resource="cpu"})
+ - max(kube_node_status_allocatable{resource="cpu"})) > 0
+
+ and
+
+ (sum(kube_node_status_allocatable{resource="cpu"}) - max(kube_node_status_allocatable{resource="cpu"}))
+ > 0
+
+ '
+ for: 10m
labels:
+ namespace: kube-system
severity: warning
syn: 'true'
- alert: SYN_KubeCPUQuotaOvercommit
@@ -1179,7 +1163,7 @@
description: Cluster has overcommitted CPU resource requests for Namespaces.
summary: Cluster has overcommitted CPU resource requests.
syn_component: openshift4-monitoring
- expr: "sum(kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\
+ expr: "sum(kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default)\"\
,job=\"kube-state-metrics\", type=\"hard\", resource=\"cpu\"})\n /\n\
sum(kube_node_status_allocatable{resource=\"cpu\"})\n > 1.5\n"
for: 5m
@@ -1189,15 +1173,22 @@
- alert: SYN_KubeMemoryOvercommit
annotations:
description: Cluster has overcommitted memory resource requests for Pods
- and cannot tolerate node failure.
+ by {{ $value }} bytes and cannot tolerate node failure.
summary: Cluster has overcommitted memory resource requests.
syn_component: openshift4-monitoring
- expr: "sum(namespace_memory:kube_pod_container_resource_requests:sum{})\n\
- \ /\nsum(kube_node_status_allocatable{resource=\"memory\"})\n >\n((count(kube_node_status_allocatable{resource=\"\
- memory\"}) > 1) - 1)\n /\ncount(kube_node_status_allocatable{resource=\"\
- memory\"})\n"
- for: 5m
+ expr: 'sum(namespace_memory:kube_pod_container_resource_requests:sum{})
+ - (sum(kube_node_status_allocatable{resource="memory"}) - max(kube_node_status_allocatable{resource="memory"}))
+ > 0
+
+ and
+
+ (sum(kube_node_status_allocatable{resource="memory"}) - max(kube_node_status_allocatable{resource="memory"}))
+ > 0
+
+ '
+ for: 10m
labels:
+ namespace: kube-system
severity: warning
syn: 'true'
- alert: SYN_KubeMemoryQuotaOvercommit
@@ -1205,7 +1196,7 @@
description: Cluster has overcommitted memory resource requests for Namespaces.
summary: Cluster has overcommitted memory resource requests.
syn_component: openshift4-monitoring
- expr: "sum(kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\
+ expr: "sum(kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default)\"\
,job=\"kube-state-metrics\", type=\"hard\", resource=\"memory\"})\n /\n\
sum(kube_node_status_allocatable{resource=\"memory\",job=\"kube-state-metrics\"\
})\n > 1.5\n"
@@ -1219,9 +1210,9 @@
}} of its {{ $labels.resource }} quota.
summary: Namespace quota is going to be full.
syn_component: openshift4-monitoring
- expr: "kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\
+ expr: "kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default)\"\
,job=\"kube-state-metrics\", type=\"used\"}\n / ignoring(instance, job,\
- \ type)\n(kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\
+ \ type)\n(kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default)\"\
,job=\"kube-state-metrics\", type=\"hard\"} > 0)\n > 0.9 < 1\n"
for: 15m
labels:
@@ -1233,9 +1224,9 @@
}} of its {{ $labels.resource }} quota.
summary: Namespace quota has exceeded the limits.
syn_component: openshift4-monitoring
- expr: "kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\
+ expr: "kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default)\"\
,job=\"kube-state-metrics\", type=\"used\"}\n / ignoring(instance, job,\
- \ type)\n(kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\
+ \ type)\n(kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default)\"\
,job=\"kube-state-metrics\", type=\"hard\"} > 0)\n > 1\n"
for: 15m
labels:
@@ -1247,9 +1238,9 @@
}} of its {{ $labels.resource }} quota.
summary: Namespace quota is fully used.
syn_component: openshift4-monitoring
- expr: "kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\
+ expr: "kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default)\"\
,job=\"kube-state-metrics\", type=\"used\"}\n / ignoring(instance, job,\
- \ type)\n(kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\
+ \ type)\n(kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default)\"\
,job=\"kube-state-metrics\", type=\"hard\"} > 0)\n == 1\n"
for: 15m
labels:
@@ -1263,7 +1254,7 @@
status {{ $labels.phase }}.
summary: PersistentVolume is having issues with provisioning.
syn_component: openshift4-monitoring
- expr: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending",namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"}
+ expr: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending",namespace=~"(openshift-.*|kube-.*|default)",job="kube-state-metrics"}
> 0
'
@@ -1276,14 +1267,14 @@
description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
}} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage
}} free.
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubePersistentVolumeFillingUp.md
summary: PersistentVolume is filling up.
syn_component: openshift4-monitoring
- expr: "(\n kubelet_volume_stats_available_bytes{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\
+ expr: "(\n kubelet_volume_stats_available_bytes{namespace=~\"(openshift-.*|kube-.*|default)\"\
,job=\"kubelet\", metrics_path=\"/metrics\"}\n /\n kubelet_volume_stats_capacity_bytes{namespace=~\"\
- (openshift-.*|kube-.*|default|logging)\",job=\"kubelet\", metrics_path=\"\
- /metrics\"}\n) < 0.03\nand\nkubelet_volume_stats_used_bytes{namespace=~\"\
- (openshift-.*|kube-.*|default|logging)\",job=\"kubelet\", metrics_path=\"\
- /metrics\"} > 0\n"
+ (openshift-.*|kube-.*|default)\",job=\"kubelet\", metrics_path=\"/metrics\"\
+ }\n) < 0.03\nand\nkubelet_volume_stats_used_bytes{namespace=~\"(openshift-.*|kube-.*|default)\"\
+ ,job=\"kubelet\", metrics_path=\"/metrics\"} > 0\n"
for: 1m
labels:
severity: critical
@@ -1294,16 +1285,16 @@
{{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace
}} is expected to fill up within four days. Currently {{ $value | humanizePercentage
}} is available.
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubePersistentVolumeFillingUp.md
summary: PersistentVolume is filling up.
syn_component: openshift4-monitoring
- expr: "(\n kubelet_volume_stats_available_bytes{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\
+ expr: "(\n kubelet_volume_stats_available_bytes{namespace=~\"(openshift-.*|kube-.*|default)\"\
,job=\"kubelet\", metrics_path=\"/metrics\"}\n /\n kubelet_volume_stats_capacity_bytes{namespace=~\"\
- (openshift-.*|kube-.*|default|logging)\",job=\"kubelet\", metrics_path=\"\
- /metrics\"}\n) < 0.15\nand\nkubelet_volume_stats_used_bytes{namespace=~\"\
- (openshift-.*|kube-.*|default|logging)\",job=\"kubelet\", metrics_path=\"\
- /metrics\"} > 0\nand\npredict_linear(kubelet_volume_stats_available_bytes{namespace=~\"\
- (openshift-.*|kube-.*|default|logging)\",job=\"kubelet\", metrics_path=\"\
- /metrics\"}[6h], 4 * 24 * 3600) < 0\n"
+ (openshift-.*|kube-.*|default)\",job=\"kubelet\", metrics_path=\"/metrics\"\
+ }\n) < 0.15\nand\nkubelet_volume_stats_used_bytes{namespace=~\"(openshift-.*|kube-.*|default)\"\
+ ,job=\"kubelet\", metrics_path=\"/metrics\"} > 0\nand\npredict_linear(kubelet_volume_stats_available_bytes{namespace=~\"\
+ (openshift-.*|kube-.*|default)\",job=\"kubelet\", metrics_path=\"/metrics\"\
+ }[6h], 4 * 24 * 3600) < 0\n"
for: 1h
labels:
severity: warning
@@ -1317,8 +1308,8 @@
summary: Kubernetes API server client is experiencing errors.
syn_component: openshift4-monitoring
expr: "(sum(rate(rest_client_requests_total{code=~\"5..\"}[5m])) by (instance,\
- \ job)\n /\nsum(rate(rest_client_requests_total[5m])) by (instance, job))\n\
- > 0.01\n"
+ \ job, namespace)\n /\nsum(rate(rest_client_requests_total[5m])) by (instance,\
+ \ job, namespace))\n> 0.01\n"
for: 15m
labels:
severity: warning
@@ -1332,7 +1323,7 @@
summary: An aggregated API is down.
syn_component: openshift4-monitoring
expr: '(1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m])))
- * 100 < 70
+ * 100 < 85
'
for: 5m
@@ -1356,6 +1347,7 @@
- alert: SYN_KubeAPIDown
annotations:
description: KubeAPI has disappeared from Prometheus target discovery.
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubeAPIDown.md
summary: Target disappeared from Prometheus target discovery.
syn_component: openshift4-monitoring
expr: 'absent(up{job="apiserver"} == 1)
@@ -1386,6 +1378,7 @@
- alert: SYN_KubeNodeNotReady
annotations:
description: '{{ $labels.node }} has been unready for more than 15 minutes.'
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubeNodeNotReady.md
summary: Node is not ready.
syn_component: openshift4-monitoring
expr: 'kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"}
@@ -1442,6 +1435,7 @@
- alert: SYN_KubeletDown
annotations:
description: Kubelet has disappeared from Prometheus target discovery.
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubeletDown.md
summary: Target disappeared from Prometheus target discovery.
syn_component: openshift4-monitoring
expr: 'absent(up{job="kubelet", metrics_path="/metrics"} == 1)
@@ -1449,6 +1443,7 @@
'
for: 15m
labels:
+ namespace: kube-system
severity: critical
syn: 'true'
- alert: SYN_KubeletPlegDurationHigh
@@ -1650,6 +1645,21 @@
labels:
severity: critical
syn: 'true'
+ - name: syn-machine-health-check-unterminated-short-circuit
+ rules:
+ - alert: SYN_MachineHealthCheckUnterminatedShortCircuit
+ annotation:
+ message: machine health check {{ $labels.name }} has been disabled by
+ short circuit for more than 30 minutes
+ annotations:
+ syn_component: openshift4-monitoring
+ expr: 'mapi_machinehealthcheck_short_circuit == 1
+
+ '
+ for: 30m
+ labels:
+ severity: warning
+ syn: 'true'
- name: syn-machine-not-yet-deleted
rules:
- alert: SYN_MachineNotYetDeleted
@@ -1692,6 +1702,27 @@
labels:
severity: warning
syn: 'true'
+ - name: syn-master-nodes-high-memory-usage
+ rules:
+ - alert: SYN_MasterNodesHighMemoryUsage
+ annotations:
+ message: Memory usage of {{ $value | humanize }} on {{ $labels.node }}
+ exceeds 90%. Master nodes starved of memory could result in degraded
+ performance of the control plane.
+ syn_component: openshift4-monitoring
+ expr: '((sum(node_memory_MemTotal_bytes AND on (instance) label_replace(
+ kube_node_role{role="master"}, "instance", "$1", "node", "(.+)" )) - sum(node_memory_MemFree_bytes
+ + node_memory_Buffers_bytes + node_memory_Cached_bytes AND on (instance)
+ label_replace( kube_node_role{role="master"}, "instance", "$1", "node",
+ "(.+)" ))) / sum(node_memory_MemTotal_bytes AND on (instance) label_replace(
+ kube_node_role{role="master"}, "instance", "$1", "node", "(.+)" )) * 100)
+ > 90
+
+ '
+ for: 15m
+ labels:
+ severity: warning
+ syn: 'true'
- name: syn-mcd-drain-error
rules:
- alert: SYN_MCDDrainError
@@ -1774,10 +1805,37 @@
labels:
severity: warning
syn: 'true'
+ - alert: SYN_NodeFileDescriptorLimit
+ annotations:
+ description: File descriptors limit at {{ $labels.instance }} is currently
+ at {{ printf "%.2f" $value }}%.
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeFileDescriptorLimit.md
+ summary: Kernel is predicted to exhaust file descriptors limit soon.
+ syn_component: openshift4-monitoring
+ expr: "(\n node_filefd_allocated{job=\"node-exporter\"} * 100 / node_filefd_maximum{job=\"\
+ node-exporter\"} > 70\n)\n"
+ for: 15m
+ labels:
+ severity: warning
+ syn: 'true'
+ - alert: SYN_NodeFileDescriptorLimit
+ annotations:
+ description: File descriptors limit at {{ $labels.instance }} is currently
+ at {{ printf "%.2f" $value }}%.
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeFileDescriptorLimit.md
+ summary: Kernel is predicted to exhaust file descriptors limit soon.
+ syn_component: openshift4-monitoring
+ expr: "(\n node_filefd_allocated{job=\"node-exporter\"} * 100 / node_filefd_maximum{job=\"\
+ node-exporter\"} > 90\n)\n"
+ for: 15m
+ labels:
+ severity: critical
+ syn: 'true'
- alert: SYN_NodeFilesystemAlmostOutOfFiles
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance
}} has only {{ printf "%.2f" $value }}% available inodes left.
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeFilesystemAlmostOutOfFiles.md
summary: Filesystem has less than 5% inodes left.
syn_component: openshift4-monitoring
expr: "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"\
@@ -1792,6 +1850,7 @@
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance
}} has only {{ printf "%.2f" $value }}% available inodes left.
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeFilesystemAlmostOutOfFiles.md
summary: Filesystem has less than 3% inodes left.
syn_component: openshift4-monitoring
expr: "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"\
@@ -1806,13 +1865,14 @@
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance
}} has only {{ printf "%.2f" $value }}% available space left.
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeFilesystemAlmostOutOfSpace.md
summary: Filesystem has less than 5% space left.
syn_component: openshift4-monitoring
expr: "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\
\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\"} *\
\ 100 < 5\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\
\"} == 0\n)\n"
- for: 1h
+ for: 30m
labels:
severity: warning
syn: 'true'
@@ -1820,13 +1880,14 @@
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance
}} has only {{ printf "%.2f" $value }}% available space left.
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeFilesystemAlmostOutOfSpace.md
summary: Filesystem has less than 3% space left.
syn_component: openshift4-monitoring
expr: "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\
\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\"} *\
\ 100 < 3\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\
\"} == 0\n)\n"
- for: 1h
+ for: 30m
labels:
severity: critical
syn: 'true'
@@ -1835,6 +1896,7 @@
description: Filesystem on {{ $labels.device }} at {{ $labels.instance
}} has only {{ printf "%.2f" $value }}% available inodes left and is
filling up.
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeFilesystemFilesFillingUp.md
summary: Filesystem is predicted to run out of inodes within the next
24 hours.
syn_component: openshift4-monitoring
@@ -1852,6 +1914,7 @@
description: Filesystem on {{ $labels.device }} at {{ $labels.instance
}} has only {{ printf "%.2f" $value }}% available inodes left and is
filling up fast.
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeFilesystemFilesFillingUp.md
summary: Filesystem is predicted to run out of inodes within the next
4 hours.
syn_component: openshift4-monitoring
@@ -1869,6 +1932,7 @@
description: Filesystem on {{ $labels.device }} at {{ $labels.instance
}} has only {{ printf "%.2f" $value }}% available space left and is
filling up.
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeFilesystemSpaceFillingUp.md
summary: Filesystem is predicted to run out of space within the next 24
hours.
syn_component: openshift4-monitoring
@@ -1886,6 +1950,7 @@
description: Filesystem on {{ $labels.device }} at {{ $labels.instance
}} has only {{ printf "%.2f" $value }}% available space left and is
filling up fast.
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeFilesystemSpaceFillingUp.md
summary: Filesystem is predicted to run out of space within the next 4
hours.
syn_component: openshift4-monitoring
@@ -1945,6 +2010,7 @@
description: RAID array '{{ $labels.device }}' on {{ $labels.instance
}} is in degraded state due to one or more disks failures. Number of
spare drives is insufficient to fix issue automatically.
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeRAIDDegraded.md
summary: RAID Array is degraded
syn_component: openshift4-monitoring
expr: 'node_md_disks_required - ignoring (state) (node_md_disks{state="active"})
@@ -1985,8 +2051,10 @@
rules:
- alert: SYN_NodeNetworkInterfaceFlapping
annotations:
- message: Network interface "{{ $labels.device }}" changing it's up status
- often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}
+ description: Network interface "{{ $labels.device }}" changing its up
+ status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod
+ }}
+ summary: Network interface is often changing its status
syn_component: openshift4-monitoring
expr: 'changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m])
> 2
@@ -2091,12 +2159,12 @@
syn_component: openshift4-monitoring
expr: "count without (node)\n(\n group by (node, workload, namespace)\n\
\ (\n kube_pod_info{node!=\"\"}\n * on(namespace,pod) group_left(workload)\n\
- \ (\n kube_pod_spec_volumes_persistentvolumeclaims_info\n \
+ \ (\n max by(namespace, pod, workload) (kube_pod_spec_volumes_persistentvolumeclaims_info)\n\
\ * on(namespace,pod) group_left(workload)\n (\n namespace_workload_pod:kube_pod_owner:relabel\n\
\ * on(namespace,workload,workload_type) group_left()\n \
\ (\n count without(pod) (namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"\
- (openshift-.*|kube-.*|default|logging)\"}) > 1\n )\n )\n \
- \ )\n )\n) == 1\n"
+ (openshift-.*|kube-.*|default)\"}) > 1\n )\n )\n )\n )\n\
+ ) == 1\n"
for: 1h
labels:
severity: warning
@@ -2174,21 +2242,24 @@
rules:
- alert: SYN_AlertmanagerReceiversNotConfigured
annotations:
- message: Alerts are not configured to be sent to a notification system,
+ description: Alerts are not configured to be sent to a notification system,
meaning that you may not be notified in a timely fashion when important
failures occur. Check the OpenShift documentation to learn how to configure
notifications with Alertmanager.
+ summary: Receivers (notification integrations) are not configured on Alertmanager
syn_component: openshift4-monitoring
- expr: cluster:alertmanager_routing_enabled:max == 0
+ expr: cluster:alertmanager_integrations:max == 0
for: 10m
labels:
+ namespace: openshift-monitoring
severity: warning
syn: 'true'
- alert: SYN_ClusterMonitoringOperatorReconciliationErrors
annotations:
- message: Cluster Monitoring Operator is experiencing unexpected reconciliation
- errors. Inspect the cluster-monitoring-operator log for potential root
- causes.
+ description: Errors are occurring during reconciliation cycles. Inspect
+ the cluster-monitoring-operator log for potential root causes.
+ summary: Cluster Monitoring Operator is experiencing unexpected reconciliation
+ errors.
syn_component: openshift4-monitoring
expr: max_over_time(cluster_monitoring_operator_last_reconciliation_successful[5m])
== 0
@@ -2207,28 +2278,32 @@
this may indicate a new version of a cluster component cannot start
due to a bug or configuration error. Assess the pods for this deployment
to verify they are running on healthy nodes and then contact support.
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubeDeploymentReplicasMismatch.md
summary: Deployment has not matched the expected number of replicas
syn_component: openshift4-monitoring
- expr: "(\n kube_deployment_spec_replicas{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\
- ,job=\"kube-state-metrics\"}\n !=\n kube_deployment_status_replicas_available{namespace=~\"\
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n\
- ) and (\n changes(kube_deployment_status_replicas_updated{namespace=~\"\
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}[5m])\n\
- \ ==\n 0\n) and cluster:control_plane:all_nodes_ready\n"
+ expr: "(((\n kube_deployment_spec_replicas{namespace=~\"(openshift-.*|kube-.*|default)\"\
+ ,job=\"kube-state-metrics\"}\n >\n kube_deployment_status_replicas_available{namespace=~\"\
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}\n) and (\n\
+ \ changes(kube_deployment_status_replicas_updated{namespace=~\"(openshift-.*|kube-.*|default)\"\
+ ,job=\"kube-state-metrics\"}[5m])\n ==\n 0\n)) * on() group_left cluster:control_plane:all_nodes_ready)\
+ \ > 0\n"
for: 15m
labels:
severity: warning
syn: 'true'
- alert: SYN_MultipleContainersOOMKilled
annotations:
- message: Multiple containers were out of memory killed within the past
- 15 minutes.
+ description: Multiple containers were out of memory killed within the
+ past 15 minutes. There are many potential causes of OOM errors, however
+ issues on a specific node or containers breaching their limits is common.
+ summary: Containers are being killed due to OOM
syn_component: openshift4-monitoring
expr: sum(max by(namespace, container, pod) (increase(kube_pod_container_status_restarts_total[12m]))
and max by(namespace, container, pod) (kube_pod_container_status_last_terminated_reason{reason="OOMKilled"})
== 1) > 5
for: 15m
labels:
+ namespace: kube-system
severity: info
syn: 'true'
- name: syn-openshift-monitoring.rules
@@ -2253,6 +2328,7 @@
'
for: 1h
labels:
+ namespace: openshift-kube-apiserver
severity: info
syn: 'true'
- alert: SYN_APIRemovedInNextReleaseInUse
@@ -2264,13 +2340,14 @@
{{ $labels.resource }}.{{ $labels.version }}.{{ $labels.group }} -o
yaml` to identify the workload.
syn_component: openshift4-monitoring
- expr: 'group(apiserver_requested_deprecated_apis{removed_release="1.22"})
+ expr: 'group(apiserver_requested_deprecated_apis{removed_release="1.23"})
by (group,version,resource) and (sum by(group,version,resource) (rate(apiserver_request_total{system_client!="kube-controller-manager",system_client!="cluster-policy-controller"}[4h])))
> 0
'
for: 1h
labels:
+ namespace: openshift-kube-apiserver
severity: info
syn: 'true'
- name: syn-prometheus
@@ -2325,6 +2402,22 @@
labels:
severity: warning
syn: 'true'
+ - alert: SYN_PrometheusLabelLimitHit
+ annotations:
+ description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped
+ {{ printf "%.0f" $value }} targets because some samples exceeded the
+ configured label_limit, label_name_length_limit or label_value_length_limit.
+ summary: Prometheus has dropped targets because some scrape configs have
+ exceeded the labels limit.
+ syn_component: openshift4-monitoring
+ expr: 'increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m])
+ > 0
+
+ '
+ for: 15m
+ labels:
+ severity: warning
+ syn: 'true'
- alert: SYN_PrometheusMissingRuleEvaluations
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed
@@ -2518,6 +2611,21 @@
labels:
severity: warning
syn: 'true'
+ - alert: SYN_PrometheusTargetSyncFailure
+ annotations:
+ description: '{{ printf "%.0f" $value }} targets in Prometheus {{$labels.namespace}}/{{$labels.pod}}
+ have failed to sync because invalid configuration was supplied.'
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/PrometheusTargetSyncFailure.md
+ summary: Prometheus has failed to sync targets.
+ syn_component: openshift4-monitoring
+ expr: 'increase(prometheus_target_sync_failed_total{job=~"prometheus-k8s|prometheus-user-workload"}[30m])
+ > 0
+
+ '
+ for: 5m
+ labels:
+ severity: critical
+ syn: 'true'
- name: syn-prometheus-operator
rules:
- alert: SYN_PrometheusOperatorListErrors
@@ -2623,6 +2731,20 @@
labels:
severity: warning
syn: 'true'
+ - name: syn-scheduler-legacy-policy-deprecated
+ rules:
+ - alert: SYN_SchedulerLegacyPolicySet
+ annotations:
+ message: The scheduler is currently configured to use a legacy scheduler
+ policy API. Use of the policy API is deprecated and removed in 4.10.
+ syn_component: openshift4-monitoring
+ expr: 'cluster_legacy_scheduler_policy > 0
+
+ '
+ for: 60m
+ labels:
+ severity: warning
+ syn: 'true'
- name: syn-system-memory-exceeds-reservation
rules:
- alert: SYN_SystemMemoryExceedsReservation
@@ -2637,11 +2759,7 @@
change or at steady state).
syn_component: openshift4-monitoring
expr: 'sum by (node) (container_memory_rss{id="/system.slice"}) > ((sum
- by (node) (kube_node_status_capacity{resource="memory"}) - sum by (node)
- (kube_node_status_capacity{resource="hugepages_1Gi"}) - sum by (node)
- (kube_node_status_capacity{resource="hugepages_2Mi"}) - sum by (node)
- (kube_node_status_allocatable{resource="memory"}) - sum by (node) (kube_node_status_allocatable{resource="hugepages_1Gi"})
- - sum by (node) (kube_node_status_allocatable{resource="hugepages_2Mi"}))
+ by (node) (kube_node_status_capacity{resource="memory"} - kube_node_status_allocatable{resource="memory"}))
* 0.95)
'
@@ -2653,12 +2771,12 @@
rules:
- alert: SYN_ThanosQueryGrpcClientErrorRate
annotations:
- description: Thanos Query {{$labels.job}} is failing to send {{ $value
- | humanize }}% of requests.
+ description: Thanos Query {{$labels.job}} is failing to send {{$value
+ | humanize}}% of requests.
summary: Thanos Query is failing to send requests.
syn_component: openshift4-monitoring
- expr: "(\n sum by (job) (rate(grpc_client_handled_total{grpc_code!=\"OK\"\
- , job=\"thanos-querier\"}[5m]))\n/\n sum by (job) (rate(grpc_client_started_total{job=\"\
+ expr: "(\n sum by (job, namespace) (rate(grpc_client_handled_total{grpc_code!=\"\
+ OK\", job=\"thanos-querier\"}[5m]))\n/\n sum by (job, namespace) (rate(grpc_client_started_total{job=\"\
thanos-querier\"}[5m]))\n) * 100 > 5\n"
for: 1h
labels:
@@ -2666,12 +2784,13 @@
syn: 'true'
- alert: SYN_ThanosQueryGrpcServerErrorRate
annotations:
- description: Thanos Query {{$labels.job}} is failing to handle {{ $value
- | humanize }}% of requests.
+ description: Thanos Query {{$labels.job}} is failing to handle {{$value
+ | humanize}}% of requests.
summary: Thanos Query is failing to handle requests.
syn_component: openshift4-monitoring
- expr: "(\n sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded\"\
- , job=\"thanos-querier\"}[5m]))\n/\n sum by (job) (rate(grpc_server_started_total{job=\"\
+ expr: "(\n sum by (job, namespace) (rate(grpc_server_handled_total{grpc_code=~\"\
+ Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded\"\
+ , job=\"thanos-querier\"}[5m]))\n/\n sum by (job, namespace) (rate(grpc_server_started_total{job=\"\
thanos-querier\"}[5m]))\n* 100 > 5\n)\n"
for: 1h
labels:
@@ -2679,12 +2798,12 @@
syn: 'true'
- alert: SYN_ThanosQueryHighDNSFailures
annotations:
- description: Thanos Query {{$labels.job}} have {{ $value | humanize }}%
+ description: Thanos Query {{$labels.job}} have {{$value | humanize}}%
of failing DNS queries for store endpoints.
summary: Thanos Query is having high number of DNS failures.
syn_component: openshift4-monitoring
- expr: "(\n sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=\"\
- thanos-querier\"}[5m]))\n/\n sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=\"\
+ expr: "(\n sum by (job, namespace) (rate(thanos_query_store_apis_dns_failures_total{job=\"\
+ thanos-querier\"}[5m]))\n/\n sum by (job, namespace) (rate(thanos_query_store_apis_dns_lookups_total{job=\"\
thanos-querier\"}[5m]))\n) * 100 > 1\n"
for: 1h
labels:
@@ -2692,26 +2811,28 @@
syn: 'true'
- alert: SYN_ThanosQueryHttpRequestQueryErrorRateHigh
annotations:
- description: Thanos Query {{$labels.job}} is failing to handle {{ $value
- | humanize }}% of "query" requests.
+ description: Thanos Query {{$labels.job}} is failing to handle {{$value
+ | humanize}}% of "query" requests.
summary: Thanos Query is failing to handle requests.
syn_component: openshift4-monitoring
- expr: "(\n sum(rate(http_requests_total{code=~\"5..\", job=\"thanos-querier\"\
- , handler=\"query\"}[5m]))\n/\n sum(rate(http_requests_total{job=\"thanos-querier\"\
- , handler=\"query\"}[5m]))\n) * 100 > 5\n"
+ expr: "(\n sum by (job, namespace) (rate(http_requests_total{code=~\"5..\"\
+ , job=\"thanos-querier\", handler=\"query\"}[5m]))\n/\n sum by (job,\
+ \ namespace) (rate(http_requests_total{job=\"thanos-querier\", handler=\"\
+ query\"}[5m]))\n) * 100 > 5\n"
for: 1h
labels:
severity: warning
syn: 'true'
- alert: SYN_ThanosQueryHttpRequestQueryRangeErrorRateHigh
annotations:
- description: Thanos Query {{$labels.job}} is failing to handle {{ $value
- | humanize }}% of "query_range" requests.
+ description: Thanos Query {{$labels.job}} is failing to handle {{$value
+ | humanize}}% of "query_range" requests.
summary: Thanos Query is failing to handle requests.
syn_component: openshift4-monitoring
- expr: "(\n sum(rate(http_requests_total{code=~\"5..\", job=\"thanos-querier\"\
- , handler=\"query_range\"}[5m]))\n/\n sum(rate(http_requests_total{job=\"\
- thanos-querier\", handler=\"query_range\"}[5m]))\n) * 100 > 5\n"
+ expr: "(\n sum by (job, namespace) (rate(http_requests_total{code=~\"5..\"\
+ , job=\"thanos-querier\", handler=\"query_range\"}[5m]))\n/\n sum by\
+ \ (job, namespace) (rate(http_requests_total{job=\"thanos-querier\", handler=\"\
+ query_range\"}[5m]))\n) * 100 > 5\n"
for: 1h
labels:
severity: warning
@@ -2720,24 +2841,25 @@
rules:
- alert: SYN_ThanosNoRuleEvaluations
annotations:
- description: Thanos Rule {{$labels.job}} did not perform any rule evaluations
- in the past 2 minutes.
+ description: Thanos Rule {{$labels.instance}} did not perform any rule
+ evaluations in the past 10 minutes.
summary: Thanos Rule did not perform any rule evaluations.
syn_component: openshift4-monitoring
- expr: "sum(rate(prometheus_rule_evaluations_total{job=\"thanos-ruler\"}[2m]))\
- \ <= 0\n and\nsum(thanos_rule_loaded_rules{job=\"thanos-ruler\"}) > 0\n"
- for: 3m
+ expr: "sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=\"\
+ thanos-ruler\"}[5m])) <= 0\n and\nsum by (job, instance) (thanos_rule_loaded_rules{job=\"\
+ thanos-ruler\"}) > 0\n"
+ for: 5m
labels:
severity: warning
syn: 'true'
- alert: SYN_ThanosRuleAlertmanagerHighDNSFailures
annotations:
- description: Thanos Rule {{$labels.job}} has {{ $value | humanize }}%
+ description: Thanos Rule {{$labels.instance}} has {{$value | humanize}}%
of failing DNS queries for Alertmanager endpoints.
summary: Thanos Rule is having high number of DNS failures.
syn_component: openshift4-monitoring
- expr: "(\n sum by (job) (rate(thanos_rule_alertmanagers_dns_failures_total{job=\"\
- thanos-ruler\"}[5m]))\n/\n sum by (job) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=\"\
+ expr: "(\n sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_failures_total{job=\"\
+ thanos-ruler\"}[5m]))\n/\n sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=\"\
thanos-ruler\"}[5m]))\n* 100 > 1\n)\n"
for: 15m
labels:
@@ -2749,20 +2871,21 @@
configuration.
summary: Thanos Rule has not been able to reload configuration.
syn_component: openshift4-monitoring
- expr: avg(thanos_rule_config_last_reload_successful{job="thanos-ruler"})
- by (job) != 1
+ expr: avg by (job, instance) (thanos_rule_config_last_reload_successful{job="thanos-ruler"})
+ != 1
for: 5m
labels:
severity: info
syn: 'true'
- alert: SYN_ThanosRuleGrpcErrorRate
annotations:
- description: Thanos Rule {{$labels.job}} is failing to handle {{ $value
- | humanize }}% of requests.
+ description: Thanos Rule {{$labels.job}} is failing to handle {{$value
+ | humanize}}% of requests.
summary: Thanos Rule is failing to handle grpc requests.
syn_component: openshift4-monitoring
- expr: "(\n sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded\"\
- , job=\"thanos-ruler\"}[5m]))\n/\n sum by (job) (rate(grpc_server_started_total{job=\"\
+ expr: "(\n sum by (job, instance) (rate(grpc_server_handled_total{grpc_code=~\"\
+ Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded\"\
+ , job=\"thanos-ruler\"}[5m]))\n/\n sum by (job, instance) (rate(grpc_server_started_total{job=\"\
thanos-ruler\"}[5m]))\n* 100 > 5\n)\n"
for: 5m
labels:
@@ -2770,12 +2893,11 @@
syn: 'true'
- alert: SYN_ThanosRuleHighRuleEvaluationFailures
annotations:
- description: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to
- evaluate rules.
+ description: Thanos Rule {{$labels.instance}} is failing to evaluate rules.
summary: Thanos Rule is failing to evaluate rules.
syn_component: openshift4-monitoring
- expr: "(\n sum by (job) (rate(prometheus_rule_evaluation_failures_total{job=\"\
- thanos-ruler\"}[5m]))\n/\n sum by (job) (rate(prometheus_rule_evaluations_total{job=\"\
+ expr: "(\n sum by (job, instance) (rate(prometheus_rule_evaluation_failures_total{job=\"\
+ thanos-ruler\"}[5m]))\n/\n sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=\"\
thanos-ruler\"}[5m]))\n* 100 > 5\n)\n"
for: 5m
labels:
@@ -2783,11 +2905,11 @@
syn: 'true'
- alert: SYN_ThanosRuleHighRuleEvaluationWarnings
annotations:
- description: Thanos Rule {{$labels.job}} {{$labels.pod}} has high number
- of evaluation warnings.
+ description: Thanos Rule {{$labels.instance}} has high number of evaluation
+ warnings.
summary: Thanos Rule has high number of evaluation warnings.
syn_component: openshift4-monitoring
- expr: 'sum by (job) (rate(thanos_rule_evaluation_with_warnings_total{job="thanos-ruler"}[5m]))
+ expr: 'sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total{job="thanos-ruler"}[5m]))
> 0
'
@@ -2797,16 +2919,15 @@
syn: 'true'
- alert: SYN_ThanosRuleNoEvaluationFor10Intervals
annotations:
- description: Thanos Rule {{$labels.job}} has {{ $value | humanize }}%
- rule groups that did not evaluate for at least 10x of their expected
- interval.
+ description: Thanos Rule {{$labels.job}} has {{$value | humanize}}% rule
+ groups that did not evaluate for at least 10x of their expected interval.
summary: Thanos Rule has rule groups that did not evaluate for 10 intervals.
syn_component: openshift4-monitoring
- expr: 'time() - max by (job, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job="thanos-ruler"})
+ expr: 'time() - max by (job, instance, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job="thanos-ruler"})
>
- 10 * max by (job, group) (prometheus_rule_group_interval_seconds{job="thanos-ruler"})
+ 10 * max by (job, instance, group) (prometheus_rule_group_interval_seconds{job="thanos-ruler"})
'
for: 5m
@@ -2815,12 +2936,12 @@
syn: 'true'
- alert: SYN_ThanosRuleQueryHighDNSFailures
annotations:
- description: Thanos Rule {{$labels.job}} has {{ $value | humanize }}%
- of failing DNS queries for query endpoints.
+ description: Thanos Rule {{$labels.job}} has {{$value | humanize}}% of
+ failing DNS queries for query endpoints.
summary: Thanos Rule is having high number of DNS failures.
syn_component: openshift4-monitoring
- expr: "(\n sum by (job) (rate(thanos_rule_query_apis_dns_failures_total{job=\"\
- thanos-ruler\"}[5m]))\n/\n sum by (job) (rate(thanos_rule_query_apis_dns_lookups_total{job=\"\
+ expr: "(\n sum by (job, instance) (rate(thanos_rule_query_apis_dns_failures_total{job=\"\
+ thanos-ruler\"}[5m]))\n/\n sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=\"\
thanos-ruler\"}[5m]))\n* 100 > 1\n)\n"
for: 15m
labels:
@@ -2828,11 +2949,11 @@
syn: 'true'
- alert: SYN_ThanosRuleQueueIsDroppingAlerts
annotations:
- description: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to
- queue alerts.
+ description: Thanos Rule {{$labels.instance}} is failing to queue alerts.
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/ThanosRuleQueueIsDroppingAlerts.md
summary: Thanos Rule is failing to queue alerts.
syn_component: openshift4-monitoring
- expr: 'sum by (job) (rate(thanos_alert_queue_alerts_dropped_total{job="thanos-ruler"}[5m]))
+ expr: 'sum by (job, instance) (rate(thanos_alert_queue_alerts_dropped_total{job="thanos-ruler"}[5m]))
> 0
'
@@ -2842,12 +2963,12 @@
syn: 'true'
- alert: SYN_ThanosRuleRuleEvaluationLatencyHigh
annotations:
- description: Thanos Rule {{$labels.job}}/{{$labels.pod}} has higher evaluation
- latency than interval for {{$labels.rule_group}}.
+ description: Thanos Rule {{$labels.instance}} has higher evaluation latency
+ than interval for {{$labels.rule_group}}.
summary: Thanos Rule has high rule evaluation latency.
syn_component: openshift4-monitoring
- expr: "(\n sum by (job, pod, rule_group) (prometheus_rule_group_last_duration_seconds{job=\"\
- thanos-ruler\"})\n>\n sum by (job, pod, rule_group) (prometheus_rule_group_interval_seconds{job=\"\
+ expr: "(\n sum by (job, instance, rule_group) (prometheus_rule_group_last_duration_seconds{job=\"\
+ thanos-ruler\"})\n>\n sum by (job, instance, rule_group) (prometheus_rule_group_interval_seconds{job=\"\
thanos-ruler\"})\n)\n"
for: 5m
labels:
@@ -2855,11 +2976,11 @@
syn: 'true'
- alert: SYN_ThanosRuleSenderIsFailingAlerts
annotations:
- description: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to
- send alerts to alertmanager.
+ description: Thanos Rule {{$labels.instance}} is failing to send alerts
+ to alertmanager.
summary: Thanos Rule is failing to send alerts to alertmanager.
syn_component: openshift4-monitoring
- expr: 'sum by (job) (rate(thanos_alert_sender_alerts_dropped_total{job="thanos-ruler"}[5m]))
+ expr: 'sum by (job, instance) (rate(thanos_alert_sender_alerts_dropped_total{job="thanos-ruler"}[5m]))
> 0
'
@@ -2867,47 +2988,3 @@
labels:
severity: warning
syn: 'true'
- - name: syn-thanos-sidecar
- rules:
- - alert: SYN_ThanosSidecarBucketOperationsFailed
- annotations:
- description: Thanos Sidecar {{$labels.job}} {{$labels.pod}} bucket operations
- are failing
- summary: Thanos Sidecar bucket operations are failing
- syn_component: openshift4-monitoring
- expr: 'rate(thanos_objstore_bucket_operation_failures_total{job=~"prometheus-(k8s|user-workload)-thanos-sidecar"}[5m])
- > 0
-
- '
- for: 1h
- labels:
- severity: warning
- syn: 'true'
- - alert: SYN_ThanosSidecarPrometheusDown
- annotations:
- description: Thanos Sidecar {{$labels.job}} {{$labels.pod}} cannot connect
- to Prometheus.
- summary: Thanos Sidecar cannot connect to Prometheus
- syn_component: openshift4-monitoring
- expr: 'sum by (job, pod) (thanos_sidecar_prometheus_up{job=~"prometheus-(k8s|user-workload)-thanos-sidecar"}
- == 0)
-
- '
- for: 1h
- labels:
- severity: warning
- syn: 'true'
- - alert: SYN_ThanosSidecarUnhealthy
- annotations:
- description: Thanos Sidecar {{$labels.job}} {{$labels.pod}} is unhealthy
- for {{ $value }} seconds.
- summary: Thanos Sidecar is unhealthy.
- syn_component: openshift4-monitoring
- expr: 'time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"prometheus-(k8s|user-workload)-thanos-sidecar"})
- by (job,pod) >= 240
-
- '
- for: 1h
- labels:
- severity: warning
- syn: 'true'
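
A pattern that recurs throughout the Thanos hunks above: the 4.9 rules aggregate by additional labels (namespace for thanos-querier, instance instead of pod for thanos-ruler), and several annotations are renamed from message to summary or gain a runbook_url. As a minimal illustration, here is the expression of SYN_ThanosRuleHighRuleEvaluationWarnings from the hunk above, reflowed onto single lines for readability; this restates what the diff already shows and adds no further change:

  # OCP 4.8: one series per job, so a single alert for the whole thanos-ruler deployment
  expr: sum by (job) (rate(thanos_rule_evaluation_with_warnings_total{job="thanos-ruler"}[5m])) > 0

  # OCP 4.9: one series per job and instance, so each ruler replica can alert on its own
  expr: sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total{job="thanos-ruler"}[5m])) > 0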