sum(ALERTS_FOR_STATE) by (alertname)
{namespace="kube-system"} |~ "took too long .* to execute"
counted:
count_over_time({namespace="kube-system"} |~ "took too long .* to execute" [60m])
count_over_time({namespace="kube-system"} |~ "ClientConn switching balancer to \"pick_first" [60m])
{component=~"kube-apiserver|etcd",namespace="kube-system"}
{namespace="kube-system", container_name="k8s-event-logger"} |~ "nodeport-proxy-envoy.*failed"
counted:
count_over_time({namespace="kube-system", container_name="k8s-event-logger"} |~ "nodeport-proxy-envoy.*failed"[1h]) > 0
{namespace="kube-system", container_name="k8s-event-logger"} |~ "SystemOOM"
counted:
count_over_time({namespace="kube-system", container_name="k8s-event-logger"} |~ "SystemOOM"[4h]) > 0
# Lists SystemOOM events from the k8s-event-logger container and reformats each
# line to show the involved object, the killed ("victim") process and the message.
{namespace="kube-system", container_name="k8s-event-logger"}
# Split the raw line: everything up to and including "stdout F " goes into `pre`,
# the remainder of the line goes into `json`.
|regexp `(?P<pre>(?s)(.+?).*stdout F )(?P<json>(?s)(.+?)$)`
# Replace the line with the captured payload, parse it as JSON and keep only
# entries whose `reason` field equals "SystemOOM".
| line_format "{{.json}}" |json | reason="SystemOOM"
# Capture the text between "victim process: " and ", pid" as the `victim` label.
|regexp `(?P<pre2>(?s)(.+?).*victim process: )(?P<victim>(?s)(.+?))\, pid`
# Final output: involved object name, victim process name and the event message.
| line_format "{{.involvedObject_name}} {{.victim}} {{.message}}"
Tools
etcdperf:
# Deployment that runs the etcd-perf image on a node carrying the
# "node-role.kubernetes.io/master" label, in the "monitoring" namespace.
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    run: etcdperf
  name: etcdperf
  namespace: monitoring
spec:
  # The selector must match the pod template labels below.
  selector:
    matchLabels:
      run: etcdperf
  template:
    metadata:
      labels:
        run: etcdperf
    spec:
      # Schedule only onto nodes labelled as master (label value is empty).
      nodeSelector:
        node-role.kubernetes.io/master: ""
      containers:
      - image: quay.io/openshift-scale/etcd-perf
        name: etcdperf
      # Tolerate the master NoSchedule taint so the nodeSelector above can be
      # satisfied. Tolerations belong to the pod spec, not to a container.
      tolerations:
      - key: node-role.kubernetes.io/master
        effect: NoSchedule
# CronJob that starts an etcd-perf container at minute 10 of every hour.
# NOTE(review): no namespace is set here, so the object is created in whatever
# namespace is current when it is applied - confirm this is intended.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: etcdperf-cron
spec:
  # Standard cron syntax: minute 10, every hour, every day.
  schedule: "10 * * * *"
  jobTemplate:
    spec:
      template:
        spec:
          containers:
          - name: etcdperf
            image: quay.io/openshift-scale/etcd-perf:latest
            imagePullPolicy: IfNotPresent
            # NOTE(review): this command only prints the date and the string
            # "FIO job"; it looks like a placeholder rather than the actual
            # benchmark invocation - confirm.
            command:
            - /bin/sh
            - -c
            - date; echo FIO job
          # Restart the container in place if the command exits non-zero.
          restartPolicy: OnFailure