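# alerts.yml (forked from rmeleromira/alerts.yml)
# Prometheus alerting rules covering HAProxy, OpenContrail, OpenStack, Docker Swarm
# and supporting services (InfluxDB, Elasticsearch, Kafka, RabbitMQ, ...). Every rule
# carries "route", "severity" and "service" labels used for Alertmanager routing.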
- name: alert.rules
rules:
- alert: HAproxyMysqlClusterHTTPResponse5xx
expr: >-
rate(haproxy_http_response_5xx{sv="FRONTEND",proxy="mysql_cluster"}[1m]) > 1
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "Too many 5xx HTTP errors have been detected on the '{{ $labels.proxy }}' proxy for the last 2 minutes ({{ $value }} error(s) per second)"
summary: "HTTP 5xx responses on '{{ $labels.proxy }}' proxy (host {{ $labels.host }})"
- alert: ContrailNamedProcessWarning
expr: >-
count(procstat_running{process_name="contrail-named"} == 0) >= count(procstat_running{process_name="contrail-named"}) *0.3
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-named"
annotations:
description: "More than 30.0% of '{{ $labels.service }}' is down"
summary: "More than 30.0% of '{{ $labels.service }}' is down"
- alert: DockerServiceDockerRegistryCriticalReplicasNumber
expr: >-
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="docker_registry"}[1m])) <= 3 * 0.4
for: 2m
labels:
route: "email,salesforce"
severity: "critical"
service: "docker_registry"
annotations:
description: "{{ $value }}/3 replicas are running for the Docker Swarn service 'docker_registry' for 2 minutes."
summary: "Docker Swarm service docker_registry invalid number of replicas for 2 minutes"
- alert: HAproxyNovaApiBackendCritical
expr: >-
(max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="nova_api"}[12h])) by (proxy)
- min (haproxy_active_servers{sv="BACKEND",proxy="nova_api"}) by (proxy)
) / max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="nova_api"}[12h])) by (proxy) * 100 >= 50
for: 5m
labels:
route: "email,salesforce"
severity: "critical"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "{{ $value }}% of backends are down for the '{{ $labels.proxy }}' proxy"
summary: "Less than 50% of backends are up for the '{{ $labels.proxy }}' proxy for the last 5 minutes"
- alert: DockerServiceMonitoringServerWarningReplicasNumber
expr: >-
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="monitoring_server"}[1m])) <= 2 * 0.7
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "monitoring_server"
annotations:
description: "{{ $value }}/2 replicas are running for the Docker Swarn service 'monitoring_server' for 2 minutes."
summary: "Docker Swarm service monitoring_server invalid number of replicas for 2 minutes"
- alert: ContrailWebServerProcessCritical
expr: >-
count(procstat_running{process_name="contrail-web-server"} == 0) >= count(procstat_running{process_name="contrail-web-server"}) *0.6
labels:
route: "email,salesforce"
severity: "critical"
service: "contrail-web-server"
annotations:
description: "More than 60.0% of '{{ $labels.service }}' is down"
summary: "More than 60.0% of '{{ $labels.service }}' is down"
- alert: HAproxyContrailApiHTTPResponse5xx
expr: >-
rate(haproxy_http_response_5xx{sv="FRONTEND",proxy="contrail_api"}[1m]) > 1
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "Too many 5xx HTTP errors have been detected on the '{{ $labels.proxy }}' proxy for the last 2 minutes ({{ $value }} error(s) per second)"
summary: "HTTP 5xx responses on '{{ $labels.proxy }}' proxy (host {{ $labels.host }})"
- alert: ContrailControlProcessWarning
expr: >-
count(procstat_running{process_name="contrail-control"} == 0) >= count(procstat_running{process_name="contrail-control"}) *0.3
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-control"
annotations:
description: "More than 30.0% of '{{ $labels.service }}' is down"
summary: "More than 30.0% of '{{ $labels.service }}' is down"
- alert: HAproxyHeatApiBackendDown
expr: >-
max(haproxy_active_servers{sv="BACKEND",proxy="heat_api"}) by (proxy) + max(haproxy_backup_servers{sv="BACKEND",proxy="heat_api"}) by (proxy) == 0
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "The proxy '{{ $labels.proxy }}' has no active backend"
summary: "All backends are down for the '{{ $labels.proxy }}' proxy"
- alert: ContrailIrondProcessCritical
expr: >-
count(procstat_running{process_name="contrail-irond"} == 0) >= count(procstat_running{process_name="contrail-irond"}) *0.6
labels:
route: "email,salesforce"
severity: "critical"
service: "contrail-irond"
annotations:
description: "More than 60.0% of '{{ $labels.service }}' is down"
summary: "More than 60.0% of '{{ $labels.service }}' is down"
- alert: ContrailJobServerProcessDown
expr: >-
count(procstat_running{process_name="contrail-job-server"} == 0) == count(procstat_running{process_name="contrail-job-server"})
labels:
route: "email,salesforce"
severity: "down"
service: "contrail-job-server"
annotations:
description: "All '{{ $labels.service }}' services are down"
summary: "All '{{ $labels.service }}' services are down"
- alert: CassandraServerProcessInfo
expr: >-
procstat_running{process_name="cassandra-server"} == 0
labels:
route: "email,salesforce"
severity: "info"
service: "cassandra-server"
annotations:
description: "{{ $labels.service }} service is down on node {{ $labels.host }}"
summary: "{{ $labels.service }} service is down"
- alert: ContrailNodemgrConfigProcessWarning
expr: >-
count(procstat_running{process_name="contrail-nodemgr-config"} == 0) >= count(procstat_running{process_name="contrail-nodemgr-config"}) *0.3
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-nodemgr-config"
annotations:
description: "More than 30.0% of '{{ $labels.service }}' is down"
summary: "More than 30.0% of '{{ $labels.service }}' is down"
- alert: ContrailSupervisordAnalyticsProcessWarning
expr: >-
count(procstat_running{process_name="contrail-supervisord-analytics"} == 0) >= count(procstat_running{process_name="contrail-supervisord-analytics"}) *0.3
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-supervisord-analytics"
annotations:
description: "More than 30.0% of '{{ $labels.service }}' is down"
summary: "More than 30.0% of '{{ $labels.service }}' is down"
- alert: DockerServiceElasticsearchElasticsearchclusterReplicasDown
expr: >-
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="elasticsearch_elasticsearch-cluster"}[1m])) == 0 or absent(docker_container_cpu_usage_percent{com_docker_swarm_service_name="elasticsearch_elasticsearch-cluster"}) == 1
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "elasticsearch_elasticsearch-cluster"
annotations:
description: "No replicas are running for the Docker Swarn service 'elasticsearch_elasticsearch-cluster'. for 2 minutes"
summary: "Docker Swarm service elasticsearch_elasticsearch-cluster down for 2 minutes"
- alert: HAproxyRabbitmqClusterBackendWarning
expr: >-
max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="rabbitmq_cluster"}[12h])) by (proxy) - min(haproxy_active_servers{sv="BACKEND",proxy="rabbitmq_cluster"}) by (proxy) >= 1
for: 5m
labels:
route: "email,salesforce"
severity: "warning"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "{{ $value }} of backends are down for the '{{ $labels.proxy }}' proxy"
summary: "At least one backend is down for '{{ $labels.proxy }}' proxy for the last 5 minutes"
- alert: ContrailSupervisordConfigProcessInfo
expr: >-
procstat_running{process_name="contrail-supervisord-config"} == 0
labels:
route: "email,salesforce"
severity: "info"
service: "contrail-supervisord-config"
annotations:
description: "{{ $labels.service }} service is down on node {{ $labels.host }}"
summary: "{{ $labels.service }} service is down"
- alert: ContrailVrouterAgentProcessInfo
expr: >-
procstat_running{process_name="contrail-vrouter-agent"} == 0
labels:
route: "email,salesforce"
severity: "info"
service: "contrail-vrouter-agent"
annotations:
description: "{{ $labels.service }} service is down on node {{ $labels.host }}"
summary: "{{ $labels.service }} service is down"
- alert: KibanaProcessWarning
expr: >-
count(procstat_running{process_name="kibana"} == 0) >= count(procstat_running{process_name="kibana"}) * 0.3
labels:
route: "email,salesforce"
severity: "warning"
service: "kibana"
annotations:
description: "More than 30.0% of Kibana services are down"
summary: "More than 30.0% of Kibana services are down"
- alert: ContrailDiscoveryProcessCritical
expr: >-
count(procstat_running{process_name="contrail-discovery"} == 0) >= count(procstat_running{process_name="contrail-discovery"}) *0.6
labels:
route: "email,salesforce"
severity: "critical"
service: "contrail-discovery"
annotations:
description: "More than 60.0% of '{{ $labels.service }}' is down"
summary: "More than 60.0% of '{{ $labels.service }}' is down"
- alert: HAproxyKeystoneAdminApiHTTPResponse5xx
expr: >-
rate(haproxy_http_response_5xx{sv="FRONTEND",proxy="keystone_admin_api"}[1m]) > 1
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "Too many 5xx HTTP errors have been detected on the '{{ $labels.proxy }}' proxy for the last 2 minutes ({{ $value }} error(s) per second)"
summary: "HTTP 5xx responses on '{{ $labels.proxy }}' proxy (host {{ $labels.host }})"
- alert: CinderServicesInfo
expr: >-
openstack_cinder_service == 1
for: 2m
labels:
route: "email,salesforce"
severity: "info"
service: "{{ $labels.service }}"
annotations:
description: "'{{ $labels.service }}' is down on {{ $labels.hostname }} for the last 2 minutes."
summary: "'{{ $labels.service }}' is down"
- alert: DockerServiceMonitoringPushgatewayReplicasDown
expr: >-
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="monitoring_pushgateway"}[1m])) == 0 or absent(docker_container_cpu_usage_percent{com_docker_swarm_service_name="monitoring_pushgateway"}) == 1
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "monitoring_pushgateway"
annotations:
description: "No replicas are running for the Docker Swarn service 'monitoring_pushgateway'. for 2 minutes"
summary: "Docker Swarm service monitoring_pushgateway down for 2 minutes"
- alert: SystemMemoryAvailableTooLow
expr: >-
avg_over_time(mem_available_percent[5m]) < 5.0
labels:
route: "email,salesforce"
severity: "critical"
service: "system"
annotations:
description: "The percentage of free memory is too low on node {{ $labels.host }} (current value={{ $value }}%, threshold=5.0%)."
summary: "Free memory too low on {{ $labels.host }}"
- alert: HAproxyGlanceRegistryApiBackendWarning
expr: >-
max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="glance_registry_api"}[12h])) by (proxy) - min(haproxy_active_servers{sv="BACKEND",proxy="glance_registry_api"}) by (proxy) >= 1
for: 5m
labels:
route: "email,salesforce"
severity: "warning"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "{{ $value }} of backends are down for the '{{ $labels.proxy }}' proxy"
summary: "At least one backend is down for '{{ $labels.proxy }}' proxy for the last 5 minutes"
- alert: InfluxdbCritical
expr: >-
count(influxdb_up == 0) >= count(influxdb_up) * 0.6
labels:
route: "email,salesforce"
severity: "critical"
service: "influxdb"
annotations:
description: "More than 60.0% of InfluxDB services are down"
summary: "More than 60.0% of InfluxDB services are down"
- alert: DockerServiceMonitoringAlertmanagerReplicasDown
expr: >-
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="monitoring_alertmanager"}[1m])) == 0 or absent(docker_container_cpu_usage_percent{com_docker_swarm_service_name="monitoring_alertmanager"}) == 1
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "monitoring_alertmanager"
annotations:
description: "No replicas are running for the Docker Swarn service 'monitoring_alertmanager'. for 2 minutes"
summary: "Docker Swarm service monitoring_alertmanager down for 2 minutes"
- alert: NovaAPIDown
expr: >-
openstack_api_check_status{service=~"nova.*|placement"} == 0
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "{{ $labels.service }}"
annotations:
description: "Endpoint check for '{{ $labels.service }}' is down for the last 2 minutes"
summary: "Endpoint check for '{{ $labels.service }}' is down"
- alert: DockerServiceMonitoringAlertmanagerCriticalReplicasNumber
expr: >-
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="monitoring_alertmanager"}[1m])) <= 2 * 0.4
for: 2m
labels:
route: "email,salesforce"
severity: "critical"
service: "monitoring_alertmanager"
annotations:
description: "{{ $value }}/2 replicas are running for the Docker Swarn service 'monitoring_alertmanager' for 2 minutes."
summary: "Docker Swarm service monitoring_alertmanager invalid number of replicas for 2 minutes"
- alert: RedisServerProcessDown
expr: >-
count(procstat_running{process_name="redis-server"} == 0) == count(procstat_running{process_name="redis-server"})
labels:
route: "email,salesforce"
severity: "down"
service: "redis-server"
annotations:
description: "All '{{ $labels.service }}' services are down"
summary: "All '{{ $labels.service }}' services are down"
- alert: HAproxyInfluxdbRelayBackendCritical
expr: >-
(max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="influxdb_relay"}[12h])) by (proxy)
- min (haproxy_active_servers{sv="BACKEND",proxy="influxdb_relay"}) by (proxy)
) / max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="influxdb_relay"}[12h])) by (proxy) * 100 >= 50
for: 5m
labels:
route: "email,salesforce"
severity: "critical"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "{{ $value }}% of backends are down for the '{{ $labels.proxy }}' proxy"
summary: "Less than 50% of backends are up for the '{{ $labels.proxy }}' proxy for the last 5 minutes"
- alert: ContrailCollectorAPIDown
expr: >-
count(http_response_status{service=~"contrail.collector"} == 0) by (service) == count(http_response_status{service=~"contrail.collector"}) by (service)
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "{{ $labels.service }}"
annotations:
description: "All '{{ $labels.service }}' APIs are down"
summary: "All '{{ $labels.service }}' APIs are down"
- alert: KafkaServerProcessDown
expr: >-
count(procstat_running{process_name="kafka-server"} == 0) == count(procstat_running{process_name="kafka-server"})
labels:
route: "email,salesforce"
severity: "down"
service: "kafka-server"
annotations:
description: "All '{{ $labels.service }}' services are down"
summary: "All '{{ $labels.service }}' services are down"
- alert: ContrailVrouterAgentProcessCritical
expr: >-
count(procstat_running{process_name="contrail-vrouter-agent"} == 0) >= count(procstat_running{process_name="contrail-vrouter-agent"}) *0.6
labels:
route: "email,salesforce"
severity: "critical"
service: "contrail-vrouter-agent"
annotations:
description: "More than 60.0% of '{{ $labels.service }}' is down"
summary: "More than 60.0% of '{{ $labels.service }}' is down"
- alert: ContrailNodemgrControlProcessWarning
expr: >-
count(procstat_running{process_name="contrail-nodemgr-control"} == 0) >= count(procstat_running{process_name="contrail-nodemgr-control"}) *0.3
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-nodemgr-control"
annotations:
description: "More than 30.0% of '{{ $labels.service }}' is down"
summary: "More than 30.0% of '{{ $labels.service }}' is down"
- alert: HAproxyInfluxdbRelayBackendDown
expr: >-
max(haproxy_active_servers{sv="BACKEND",proxy="influxdb_relay"}) by (proxy) + max(haproxy_backup_servers{sv="BACKEND",proxy="influxdb_relay"}) by (proxy) == 0
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "The proxy '{{ $labels.proxy }}' has no active backend"
summary: "All backends are down for the '{{ $labels.proxy }}' proxy"
- alert: HAproxyNovaApiBackendDown
expr: >-
max(haproxy_active_servers{sv="BACKEND",proxy="nova_api"}) by (proxy) + max(haproxy_backup_servers{sv="BACKEND",proxy="nova_api"}) by (proxy) == 0
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "The proxy '{{ $labels.proxy }}' has no active backend"
summary: "All backends are down for the '{{ $labels.proxy }}' proxy"
- alert: HeatAPIDown
expr: >-
openstack_api_check_status{service=~"heat.*"} == 0
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "{{ $labels.service }}"
annotations:
description: "Endpoint check for '{{ $labels.service }}' is down for 2 minutes"
summary: "Endpoint check for '{{ $labels.service }}' is down"
- alert: HAproxyKibanaBackendDown
expr: >-
max(haproxy_active_servers{sv="BACKEND",proxy="kibana"}) by (proxy) + max(haproxy_backup_servers{sv="BACKEND",proxy="kibana"}) by (proxy) == 0
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "The proxy '{{ $labels.proxy }}' has no active backend"
summary: "All backends are down for the '{{ $labels.proxy }}' proxy"
- alert: ContrailXMPPSessionsTooManyVariations
expr: >-
abs(delta(contrail_xmpp_session_count[2m])) >= 100
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-control"
annotations:
description: "There are too many XMPP sessions changes on node {{ $labels.host }} (current value={{ $value }}, threshold=100)"
summary: "Number of XMPP sessions changed between checks is too high"
- alert: ContrailWebServerProcessWarning
expr: >-
count(procstat_running{process_name="contrail-web-server"} == 0) >= count(procstat_running{process_name="contrail-web-server"}) *0.3
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-web-server"
annotations:
description: "More than 30.0% of '{{ $labels.service }}' is down"
summary: "More than 30.0% of '{{ $labels.service }}' is down"
- alert: InfluxdbHTTPPointsWrittenFail
expr: >-
rate(influxdb_httpd_pointsWrittenFail[2m]) / rate(influxdb_httpd_pointsWrittenOK[2m]) * 100 > 5
labels:
route: "email,salesforce"
severity: "warning"
service: "influxdb"
annotations:
description: "{{ printf `%.1f` $value }}% of written points have failed on {{ $labels.host }} (threshold=5)."
summary: "Influxdb too many failed writes"
- alert: ContrailCollectorAPICritical
expr: >-
count(http_response_status{service=~"contrail.collector"} == 0) by (service) >= count(http_response_status{service=~"contrail.collector"}) by (service) *0.6
for: 2m
labels:
route: "email,salesforce"
severity: "critical"
service: "{{ $labels.service }}"
annotations:
description: "More than 60.0% of '{{ $labels.service }}' is down"
summary: "More than 60.0% of '{{ $labels.service }}' is down"
- alert: NovaServicesCritical
expr: >-
openstack_nova_services{state="down",service=~"nova-cert|nova-conductor|nova-consoleauth|nova-scheduler"} >= on (service) sum(openstack_nova_services{service=~"nova-cert|nova-conductor|nova-consoleauth|nova-scheduler"}) by (service) * 0.6
for: 2m
labels:
route: "email,salesforce"
severity: "critical"
service: "{{ $labels.service }}"
annotations:
description: "More than 60.0% of {{ $labels.service }} services are down for the last 2 minutes"
summary: "More than 60.0% of {{ $labels.service }} services are down"
- alert: DockerServiceAptlyPublicCriticalReplicasNumber
expr: >-
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="aptly_public"}[1m])) <= 3 * 0.4
for: 2m
labels:
route: "email,salesforce"
severity: "critical"
service: "aptly_public"
annotations:
description: "{{ $value }}/3 replicas are running for the Docker Swarn service 'aptly_public' for 2 minutes."
summary: "Docker Swarm service aptly_public invalid number of replicas for 2 minutes"
- alert: NovaTotalFreeMemoryShortage
expr: >-
(100.0 * openstack_nova_total_free_ram) / (openstack_nova_total_free_ram + openstack_nova_total_used_ram) < 2.0
for: 1m
labels:
route: "email,salesforce"
severity: "critical"
service: "nova"
annotations:
description: "Memory shortage for 1 minutes"
summary: "Memory shortage for new instances"
- alert: ContrailVrouterDNSXMPPSessionsNone
expr: >-
max(contrail_vrouter_dns_xmpp) by (host) == 0
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-compute"
annotations:
description: "There are no vRouter DNS-XMPP sessions on node {{ $labels.host }}"
summary: "No vRouter DNS-XMPP sessions"
- alert: ContrailNamedProcessCritical
expr: >-
count(procstat_running{process_name="contrail-named"} == 0) >= count(procstat_running{process_name="contrail-named"}) *0.6
labels:
route: "email,salesforce"
severity: "critical"
service: "contrail-named"
annotations:
description: "More than 60.0% of '{{ $labels.service }}' is down"
summary: "More than 60.0% of '{{ $labels.service }}' is down"
- alert: RabbitMQTooManyMessages
expr: >-
rabbitmq_overview_messages > 1048576
labels:
route: "email,salesforce"
severity: "warning"
service: "rabbitmq"
annotations:
description: "The number of outstanding messages in RabbitMQ is too high on node {{ $labels.host }} (current value={{ $value }}, threshold=1048576)."
summary: "Too many messages in RabbitMQ"
- alert: SystemMemoryAvailableLow
expr: >-
avg_over_time(mem_available_percent[5m]) < 10.0
labels:
route: "email,salesforce"
severity: "warning"
service: "system"
annotations:
description: "The percentage of free memory is low on node {{ $labels.host }} (current value={{ $value }}%, threshold=10.0%)."
summary: "Free memory low on {{ $labels.host }}"
- alert: DockerServiceDockerRegistryWarningReplicasNumber
expr: >-
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="docker_registry"}[1m])) <= 3 * 0.7
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "docker_registry"
annotations:
description: "{{ $value }}/3 replicas are running for the Docker Swarn service 'docker_registry' for 2 minutes."
summary: "Docker Swarm service docker_registry invalid number of replicas for 2 minutes"
- alert: GlusterFSDown
expr: >-
glusterfs_up != 1
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "glusterfs"
annotations:
description: "GlusterFS service is down on node {{ $labels.host }}"
summary: "GlusterFS service down"
- alert: ContrailJobServerProcessCritical
expr: >-
count(procstat_running{process_name="contrail-job-server"} == 0) >= count(procstat_running{process_name="contrail-job-server"}) *0.6
labels:
route: "email,salesforce"
severity: "critical"
service: "contrail-job-server"
annotations:
description: "More than 60.0% of '{{ $labels.service }}' is down"
summary: "More than 60.0% of '{{ $labels.service }}' is down"
- alert: ContrailXMPPSessionsNoneUp
expr: >-
max(contrail_xmpp_session_up_count) by (host) == 0
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-control"
annotations:
description: "There are no active XMPP sessions on node {{ $labels.host }}"
summary: "no active XMPP sessions"
- alert: ZookeeperCritical
expr: >-
count(zookeeper_up == 0) >= count(zookeeper_up) * 0.6
for: 2m
labels:
route: "email,salesforce"
severity: "critical"
service: "zookeeper"
annotations:
description: "More than 60.0% of Zookeeper services are down"
summary: "More than 60.0% of Zookeeper services are down"
- alert: SystemRxPacketsDroppedTooHigh
expr: >-
rate(net_drop_in[1m]) > 100
labels:
route: "email,salesforce"
severity: "critical"
service: "system"
annotations:
description: "The rate of received packets which are dropped is too high on node {{ $labels.host }} for interface {{ $labels.interface }} (current value={{ $value }}/sec, threshold=100/sec)"
summary: "Too many received packets dropped on {{ $labels.host }} for interface {{ $labels.interface }}"
- alert: ContrailSchemaProcessDown
expr: >-
count(procstat_running{process_name="contrail-schema"} == 0) == count(procstat_running{process_name="contrail-schema"})
labels:
route: "email,salesforce"
severity: "down"
service: "contrail-schema"
annotations:
description: "All '{{ $labels.service }}' services are down"
summary: "All '{{ $labels.service }}' services are down"
- alert: HAproxyElasticsearchBackendDown
expr: >-
max(haproxy_active_servers{sv="BACKEND",proxy="elasticsearch"}) by (proxy) + max(haproxy_backup_servers{sv="BACKEND",proxy="elasticsearch"}) by (proxy) == 0
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "The proxy '{{ $labels.proxy }}' has no active backend"
summary: "All backends are down for the '{{ $labels.proxy }}' proxy"
- alert: KeystoneErrorLogsTooHigh
expr: >-
sum(rate(log_messages{service="keystone",level=~"(?i:(error|emergency|fatal))"}[5m])) without (level) > 0.2
labels:
route: "email,salesforce"
severity: "warning"
service: "{{ $labels.service }}"
annotations:
description: "The rate of errors in {{ $labels.service }} logs over the last 5 minutes is too high on node {{ $labels.host }} (current value={{ $value }}, threshold=0.2)."
summary: "Too many errors in {{ $labels.service }} logs"
- alert: HAproxyContrailAnalyticsBackendWarning
expr: >-
max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="contrail_analytics"}[12h])) by (proxy) - min(haproxy_active_servers{sv="BACKEND",proxy="contrail_analytics"}) by (proxy) >= 1
for: 5m
labels:
route: "email,salesforce"
severity: "warning"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "{{ $value }} of backends are down for the '{{ $labels.proxy }}' proxy"
summary: "At least one backend is down for '{{ $labels.proxy }}' proxy for the last 5 minutes"
- alert: HAproxyHeatCloudwatchApiBackendDown
expr: >-
max(haproxy_active_servers{sv="BACKEND",proxy="heat_cloudwatch_api"}) by (proxy) + max(haproxy_backup_servers{sv="BACKEND",proxy="heat_cloudwatch_api"}) by (proxy) == 0
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "The proxy '{{ $labels.proxy }}' has no active backend"
summary: "All backends are down for the '{{ $labels.proxy }}' proxy"
- alert: ContrailSvcMonitorProcessInfo
expr: >-
procstat_running{process_name="contrail-svc-monitor"} == 0
labels:
route: "email,salesforce"
severity: "info"
service: "contrail-svc-monitor"
annotations:
description: "{{ $labels.service }} service is down on node {{ $labels.host }}"
summary: "{{ $labels.service }} service is down"
- alert: NovaTotalFreeVCPUsShortage
expr: >-
(100.0 * openstack_nova_total_free_vcpus) / (openstack_nova_total_free_vcpus + openstack_nova_total_used_vcpus) < 2.0
for: 1m
labels:
route: "email,salesforce"
severity: "critical"
service: "nova"
annotations:
description: "VPCU shortage for 1 minutes"
summary: "VCPU shortage for new instances"
- alert: ContrailFlowsInvalidLabelTooMany
expr: >-
min(contrail_vrouter_flows_invalid_label) by (host) >= 100
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-compute"
annotations:
description: "There are too many vRouter flows with invalid label on node {{ $labels.host }} (current value={{ $value }}, threshold=100)"
summary: "Too many vRouter flows with invalid label"
- alert: SystemFreeOpenFilesTooLow
expr: >-
predict_linear(linux_sysctl_fs_file_nr[1h], 8*3600) > linux_sysctl_fs_file_max
labels:
route: "email,salesforce"
severity: "warning"
service: "system"
annotations:
description: "Host {{ $labels.host }}) will run out of free open files in less than 8 hours."
summary: "Free open files for {{ $labels.path }} too low on {{ $labels.host }}"
- alert: SaltMasterProcessDown
expr: >-
procstat_running{process_name="salt-master"} == 0
labels:
route: "email,salesforce"
severity: "warning"
service: "salt-master"
annotations:
description: "Salt-master service is down on node {{ $labels.host }}"
summary: "Salt-master service is down"
- alert: DockerServiceMonitoringPushgatewayCriticalReplicasNumber
expr: >-
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="monitoring_pushgateway"}[1m])) <= 2 * 0.4
for: 2m
labels:
route: "email,salesforce"
severity: "critical"
service: "monitoring_pushgateway"
annotations:
description: "{{ $value }}/2 replicas are running for the Docker Swarn service 'monitoring_pushgateway' for 2 minutes."
summary: "Docker Swarm service monitoring_pushgateway invalid number of replicas for 2 minutes"
- alert: ContrailNodemgrProcessDown
expr: >-
count(procstat_running{process_name="contrail-nodemgr"} == 0) == count(procstat_running{process_name="contrail-nodemgr"})
labels:
route: "email,salesforce"
severity: "down"
service: "contrail-nodemgr"
annotations:
description: "All '{{ $labels.service }}' services are down"
summary: "All '{{ $labels.service }}' services are down"
- alert: ContrailCollectorAPIWarning
expr: >-
count(http_response_status{service=~"contrail.collector"} == 0) by (service) >= count(http_response_status{service=~"contrail.collector"}) by (service) *0.3
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "{{ $labels.service }}"
annotations:
description: "More than 30.0% of '{{ $labels.service }}' is down"
summary: "More than 30.0% of '{{ $labels.service }}' is down"
- alert: HAproxyNovaMetadataApiBackendCritical
expr: >-
(max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="nova_metadata_api"}[12h])) by (proxy)
- min (haproxy_active_servers{sv="BACKEND",proxy="nova_metadata_api"}) by (proxy)
) / max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="nova_metadata_api"}[12h])) by (proxy) * 100 >= 50
for: 5m
labels:
route: "email,salesforce"
severity: "critical"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "{{ $value }}% of backends are down for the '{{ $labels.proxy }}' proxy"
summary: "Less than 50% of backends are up for the '{{ $labels.proxy }}' proxy for the last 5 minutes"
- alert: SshFailedLoginsTooHigh
expr: >-
rate(failed_logins_total[5m]) > 0.2
labels:
route: "email,salesforce"
severity: "warning"
service: "ssh"
annotations:
description: "The rate of failed logins is too high on node {{ $labels.host }} (current value={{ $value }}, threshold=0.2)."
summary: "Too many failed SSH logins"
- alert: HAproxyInfluxdbRelayHTTPResponse5xx
expr: >-
rate(haproxy_http_response_5xx{sv="FRONTEND",proxy="influxdb_relay"}[1m]) > 1
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "Too many 5xx HTTP errors have been detected on the '{{ $labels.proxy }}' proxy for the last 2 minutes ({{ $value }} error(s) per second)"
summary: "HTTP 5xx responses on '{{ $labels.proxy }}' proxy (host {{ $labels.host }})"
- alert: HAproxyHeatCfnApiHTTPResponse5xx
expr: >-
rate(haproxy_http_response_5xx{sv="FRONTEND",proxy="heat_cfn_api"}[1m]) > 1
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "Too many 5xx HTTP errors have been detected on the '{{ $labels.proxy }}' proxy for the last 2 minutes ({{ $value }} error(s) per second)"
summary: "HTTP 5xx responses on '{{ $labels.proxy }}' proxy (host {{ $labels.host }})"
- alert: DockerServiceRundeckRundeckapiReplicasDown
expr: >-
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="rundeck_rundeck-api"}[1m])) == 0 or absent(docker_container_cpu_usage_percent{com_docker_swarm_service_name="rundeck_rundeck-api"}) == 1
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "rundeck_rundeck-api"
annotations:
description: "No replicas are running for the Docker Swarn service 'rundeck_rundeck-api'. for 2 minutes"
summary: "Docker Swarm service rundeck_rundeck-api down for 2 minutes"
- alert: ContrailTopologyProcessInfo
expr: >-
procstat_running{process_name="contrail-topology"} == 0
labels:
route: "email,salesforce"
severity: "info"
service: "contrail-topology"
annotations:
description: "{{ $labels.service }} service is down on node {{ $labels.host }}"
summary: "{{ $labels.service }} service is down"
- alert: ContrailBGPSessionsSomeDown
expr: >-
min(contrail_bgp_session_down_count) by (host) > 0
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-control"
annotations:
description: "There are inactive BGP sessions on node {{ $labels.host }}"
summary: "inactive BGP sessions"
- alert: NovaServicesDown
expr: >-
openstack_nova_services{state="up",service=~"nova-cert|nova-conductor|nova-consoleauth|nova-scheduler"} == 0
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "{{ $labels.service }}"
annotations:
description: "All '{{ $labels.service }}' services are down for the last 2 minutes"
summary: "All {{ $labels.service }} services down"
- alert: ContrailSchemaProcessInfo
expr: >-
procstat_running{process_name="contrail-schema"} == 0
labels:
route: "email,salesforce"
severity: "info"
service: "contrail-schema"
annotations:
description: "{{ $labels.service }} service is down on node {{ $labels.host }}"
summary: "{{ $labels.service }} service is down"
- alert: RedisServerProcessCritical
expr: >-
count(procstat_running{process_name="redis-server"} == 0) >= count(procstat_running{process_name="redis-server"}) *0.6
labels:
route: "email,salesforce"
severity: "critical"
service: "redis-server"
annotations:
description: "More than 60.0% of '{{ $labels.service }}' is down"
summary: "More than 60.0% of '{{ $labels.service }}' is down"
- alert: ContrailNodemgrProcessInfo
expr: >-
procstat_running{process_name="contrail-nodemgr"} == 0
labels:
route: "email,salesforce"
severity: "info"
service: "contrail-nodemgr"
annotations:
description: "{{ $labels.service }} service is down on node {{ $labels.host }}"
summary: "{{ $labels.service }} service is down"
- alert: HAproxyContrailAnalyticsBackendDown
expr: >-
max(haproxy_active_servers{sv="BACKEND",proxy="contrail_analytics"}) by (proxy) + max(haproxy_backup_servers{sv="BACKEND",proxy="contrail_analytics"}) by (proxy) == 0
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "The proxy '{{ $labels.proxy }}' has no active backend"
summary: "All backends are down for the '{{ $labels.proxy }}' proxy"
- alert: ContrailSvcMonitorProcessCritical
expr: >-
count(procstat_running{process_name="contrail-svc-monitor"} == 0) >= count(procstat_running{process_name="contrail-svc-monitor"}) *0.6
labels:
route: "email,salesforce"
severity: "critical"
service: "contrail-svc-monitor"
annotations:
description: "More than 60.0% of '{{ $labels.service }}' is down"
summary: "More than 60.0% of '{{ $labels.service }}' is down"
- alert: InfluxdbHTTPClientErrors
expr: >-
rate(influxdb_httpd_clientError[2m]) / rate(influxdb_httpd_req[2m]) * 100 > 5
labels:
route: "email,salesforce"
severity: "warning"
service: "influxdb"
annotations:
description: "{{ printf `%.1f` $value }}% of client requests are in error on {{ $labels.host }} (threshold=5)."
summary: "Influxdb number of client errors is high"
- alert: HAproxyNovaMetadataApiBackendDown
expr: >-
max(haproxy_active_servers{sv="BACKEND",proxy="nova_metadata_api"}) by (proxy) + max(haproxy_backup_servers{sv="BACKEND",proxy="nova_metadata_api"}) by (proxy) == 0
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "The proxy '{{ $labels.proxy }}' has no active backend"
summary: "All backends are down for the '{{ $labels.proxy }}' proxy"
- alert: CinderServicesWarning
expr: >-
openstack_cinder_services{service=~"cinder-volume|cinder-scheduler", state="down"} >= on (service) sum(openstack_cinder_services{service=~"cinder-volume|cinder-scheduler"}) by (service) * 0.3
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "{{ $labels.service }}"
annotations:
description: "{{ $value }} {{ $labels.service }} services are down for the last 2 minutes (More than 30.0%)"
summary: "More than 30.0% of {{ $labels.service }} services are down"
- alert: HAproxyGlanceApiHTTPResponse5xx
expr: >-
rate(haproxy_http_response_5xx{sv="FRONTEND",proxy="glance_api"}[1m]) > 1
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "Too many 5xx HTTP errors have been detected on the '{{ $labels.proxy }}' proxy for the last 2 minutes ({{ $value }} error(s) per second)"
summary: "HTTP 5xx responses on '{{ $labels.proxy }}' proxy (host {{ $labels.host }})"
- alert: ContrailVrouterLLSSessionsTooMany
expr: >-
min(contrail_vrouter_lls) by (host) >= 10
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-compute"
annotations:
description: "There are too many vRouter LLS sessions on node {{ $labels.host }} (current value={{ $value }}, threshold=10)"
summary: "Too many vRouter LLS sessions"
- alert: ContrailQueryEngineProcessCritical
expr: >-
count(procstat_running{process_name="contrail-query-engine"} == 0) >= count(procstat_running{process_name="contrail-query-engine"}) *0.6
labels:
route: "email,salesforce"
severity: "critical"
service: "contrail-query-engine"
annotations:
description: "More than 60.0% of '{{ $labels.service }}' is down"
summary: "More than 60.0% of '{{ $labels.service }}' is down"
- alert: HAproxyElasticsearchBackendWarning
expr: >-
max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="elasticsearch"}[12h])) by (proxy) - min(haproxy_active_servers{sv="BACKEND",proxy="elasticsearch"}) by (proxy) >= 1
for: 5m
labels:
route: "email,salesforce"
severity: "warning"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "{{ $value }} of backends are down for the '{{ $labels.proxy }}' proxy"
summary: "At least one backend is down for '{{ $labels.proxy }}' proxy for the last 5 minutes"
- alert: ContrailAnalyticsApiProcessInfo
expr: >-
procstat_running{process_name="contrail-analytics-api"} == 0
labels:
route: "email,salesforce"
severity: "info"
service: "contrail-analytics-api"
annotations:
description: "{{ $labels.service }} service is down on node {{ $labels.host }}"
summary: "{{ $labels.service }} service is down"
- alert: ContrailTopologyProcessDown
expr: >-
count(procstat_running{process_name="contrail-topology"} == 0) == count(procstat_running{process_name="contrail-topology"})
labels:
route: "email,salesforce"
severity: "down"
service: "contrail-topology"
annotations:
description: "All '{{ $labels.service }}' services are down"
summary: "All '{{ $labels.service }}' services are down"
- alert: CinderServicesDown
expr: >-
openstack_cinder_services{state="up",service=~"cinder-volume|cinder-scheduler"} == 0
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "{{ $labels.service }}"
annotations:
description: "All {{ $labels.service }} services are down for the last 2 minutes"
summary: "All {{ $labels.service }} services are down"
- alert: HAproxyElasticsearchBinaryHTTPResponse5xx
expr: >-
rate(haproxy_http_response_5xx{sv="FRONTEND",proxy="elasticsearch_binary"}[1m]) > 1
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "Too many 5xx HTTP errors have been detected on the '{{ $labels.proxy }}' proxy for the last 2 minutes ({{ $value }} error(s) per second)"
summary: "HTTP 5xx responses on '{{ $labels.proxy }}' proxy (host {{ $labels.host }})"
- alert: NovaComputesCritical
expr: >-
openstack_nova_services_percent{state="down",service=~"nova-compute"} >= on (service) sum(openstack_nova_services{service=~"nova-compute"}) by (service) * 0.5
for: 2m
labels:
route: "email,salesforce"
severity: "critical"
service: "{{ $labels.service }}"
annotations:
description: "More than 50.0% of {{ $labels.service }} services are down for the last 2 minutes"
summary: "More than 50.0% of {{ $labels.service }} services are down"
- alert: ContrailSchemaProcessWarning
expr: >-
count(procstat_running{process_name="contrail-schema"} == 0) >= count(procstat_running{process_name="contrail-schema"}) *0.3
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-schema"
annotations:
description: "More than 30.0% of '{{ $labels.service }}' is down"
summary: "More than 30.0% of '{{ $labels.service }}' is down"
- alert: GlanceErrorLogsTooHigh
expr: >-
sum(rate(log_messages{service="glance",level=~"(?i:(error|emergency|fatal))"}[5m])) without (level) > 0.2
labels:
route: "email,salesforce"
severity: "warning"
service: "{{ $labels.service }}"
annotations:
description: "The rate of errors in {{ $labels.service }} logs over the last 5 minutes is too high on node {{ $labels.host }} (current value={{ $value }}, threshold=0.2)."
summary: "Too many errors in {{ $labels.service }} logs"
- alert: DockerServiceMonitoringServerCriticalReplicasNumber
expr: >-
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="monitoring_server"}[1m])) <= 2 * 0.4
for: 2m
labels:
route: "email,salesforce"
severity: "critical"
service: "monitoring_server"
annotations:
description: "{{ $value }}/2 replicas are running for the Docker Swarn service 'monitoring_server' for 2 minutes."
summary: "Docker Swarm service monitoring_server invalid number of replicas for 2 minutes"
- alert: InfluxdbRelayFailedRequests
expr: >-
rate(influxdb_relay_failed_requests_total[5m]) / rate(influxdb_relay_requests_total[5m]) * 100 > 5
labels:
route: "email,salesforce"
severity: "warning"
service: "influxdb-relay"
annotations:
description: "{{ printf `%.1f` $value }}% of requests have been dropped on {{ $labels.instance }} (threshold=5)."
summary: "InfluxDB Relay too many failed requests"
- alert: ContrailVrouterDNSXMPPSessionsTooMany
expr: >-
min(contrail_vrouter_dns_xmpp) by (host) >= 10
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-compute"
annotations:
description: "There are too many vRouter DNS-XMPP sessions on node {{ $labels.host }} (current value={{ $value }}, threshold=10)"
summary: "Too many vRouter DNS-XMPP sessions"
- alert: ContrailAPICritical
expr: >-
count(http_response_status{service=~"contrail.api"} == 0) by (service) >= count(http_response_status{service=~"contrail.api"}) by (service) *0.6
for: 2m
labels:
route: "email,salesforce"
severity: "critical"
service: "{{ $labels.service }}"
annotations:
description: "More than 60.0% of '{{ $labels.service }}' is down"
summary: "More than 60.0% of '{{ $labels.service }}' is down"
- alert: ContrailSupervisordAnalyticsProcessDown
expr: >-
count(procstat_running{process_name="contrail-supervisord-analytics"} == 0) == count(procstat_running{process_name="contrail-supervisord-analytics"})
labels:
route: "email,salesforce"
severity: "down"
service: "contrail-supervisord-analytics"
annotations:
description: "All '{{ $labels.service }}' services are down"
summary: "All '{{ $labels.service }}' services are down"
- alert: NovaAPIServiceDown
expr: >-
http_response_status{service=~"nova-api"} == 0
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "{{ $labels.service }}"
annotations:
description: "The HTTP check for '{{ $labels.service }}' is down on {{ $labels.host }} for the last 2 minutes."
summary: "HTTP check for '{{ $labels.service }}' down"
- alert: ContrailNodemgrDatabaseProcessCritical
expr: >-
count(procstat_running{process_name="contrail-nodemgr-database"} == 0) >= count(procstat_running{process_name="contrail-nodemgr-database"}) *0.6
labels:
route: "email,salesforce"
severity: "critical"
service: "contrail-nodemgr-database"
annotations:
description: "More than 60.0% of '{{ $labels.service }}' is down"
summary: "More than 60.0% of '{{ $labels.service }}' is down"
- alert: ContrailSupervisordConfigProcessDown
expr: >-
count(procstat_running{process_name="contrail-supervisord-config"} == 0) == count(procstat_running{process_name="contrail-supervisord-config"})
labels:
route: "email,salesforce"
severity: "down"
service: "contrail-supervisord-config"
annotations:
description: "All '{{ $labels.service }}' services are down"
summary: "All '{{ $labels.service }}' services are down"
- alert: HAproxyGlanceRegistryApiBackendDown
expr: >-
max(haproxy_active_servers{sv="BACKEND",proxy="glance_registry_api"}) by (proxy) + max(haproxy_backup_servers{sv="BACKEND",proxy="glance_registry_api"}) by (proxy) == 0
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "The proxy '{{ $labels.proxy }}' has no active backend"
summary: "All backends are down for the '{{ $labels.proxy }}' proxy"
- alert: HAproxyKibanaBackendCritical
expr: >-
(max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="kibana"}[12h])) by (proxy)
- min (haproxy_active_servers{sv="BACKEND",proxy="kibana"}) by (proxy)
) / max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="kibana"}[12h])) by (proxy) * 100 >= 50
for: 5m
labels:
route: "email,salesforce"
severity: "critical"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "{{ $value }}% of backends are down for the '{{ $labels.proxy }}' proxy"
summary: "Less than 50% of backends are up for the '{{ $labels.proxy }}' proxy for the last 5 minutes"
- alert: ContrailNodeManagerAPIWarning
expr: >-
count(http_response_status{service=~"contrail.node.manager"} == 0) by (service) >= count(http_response_status{service=~"contrail.node.manager"}) by (service) *0.3
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "{{ $labels.service }}"
annotations:
description: "More than 30.0% of '{{ $labels.service }}' is down"
summary: "More than 30.0% of '{{ $labels.service }}' is down"
- alert: KibanaProcessDown
expr: >-
count(procstat_running{process_name="kibana"} == 0) == count(procstat_running{process_name="kibana"})
labels:
route: "email,salesforce"
severity: "down"
service: "kibana"
annotations:
description: "All Kibana services are down"
summary: "All Kibana services are down"
- alert: HAproxyNovaMetadataApiBackendWarning
expr: >-
max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="nova_metadata_api"}[12h])) by (proxy) - min(haproxy_active_servers{sv="BACKEND",proxy="nova_metadata_api"}) by (proxy) >= 1
for: 5m
labels:
route: "email,salesforce"
severity: "warning"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "{{ $value }} of backends are down for the '{{ $labels.proxy }}' proxy"
summary: "At least one backend is down for '{{ $labels.proxy }}' proxy for the last 5 minutes"
- alert: NovaTotalFreeMemoryLow
expr: >-
(100.0 * openstack_nova_total_free_ram) / (openstack_nova_total_free_ram + openstack_nova_total_used_ram) < 10.0
for: 1m
labels:
route: "email,salesforce"
severity: "warning"
service: "nova"
annotations:
description: "Memory low limit for 1 minutes"
summary: "Memory low limit for new instances"
- alert: ElasticsearchInfo
expr: >-
elasticsearch_up{host=~'.*'} == 0
labels:
route: "email,salesforce"
severity: "info"
service: "elasticsearch"
annotations:
description: "Elasticsearch service is down on node {{ $labels.host }}"
summary: "Elasticsearch service is down"
- alert: KafkaServerProcessWarning
expr: >-
count(procstat_running{process_name="kafka-server"} == 0) >= count(procstat_running{process_name="kafka-server"}) *0.3
labels:
route: "email,salesforce"
severity: "warning"
service: "kafka-server"
annotations:
description: "More than 30.0% of '{{ $labels.service }}' is down"
summary: "More than 30.0% of '{{ $labels.service }}' is down"
- alert: ContrailSupervisordControlProcessWarning
expr: >-
count(procstat_running{process_name="contrail-supervisord-control"} == 0) >= count(procstat_running{process_name="contrail-supervisord-control"}) *0.3
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-supervisord-control"
annotations:
description: "More than 30.0% of '{{ $labels.service }}' is down"
summary: "More than 30.0% of '{{ $labels.service }}' is down"
- alert: DockerServiceDevopsportalFrontendReplicasDown
expr: >-
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="devops-portal_frontend"}[1m])) == 0 or absent(docker_container_cpu_usage_percent{com_docker_swarm_service_name="devops-portal_frontend"}) == 1
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "devops-portal_frontend"
annotations:
description: "No replicas are running for the Docker Swarn service 'devops-portal_frontend'. for 2 minutes"
summary: "Docker Swarm service devops-portal_frontend down for 2 minutes"
- alert: ContrailVrouterAPIDown
expr: >-
count(http_response_status{service=~"contrail.vrouter"} == 0) by (service) == count(http_response_status{service=~"contrail.vrouter"}) by (service)
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "{{ $labels.service }}"
annotations:
description: "All '{{ $labels.service }}' APIs are down"
summary: "All '{{ $labels.service }}' APIs are down"
- alert: NovaComputesWarning
expr: >-
openstack_nova_services{state="down",service=~"nova-compute"} >= on (service) sum(openstack_nova_services{service=~"nova-compute"}) by (service) * 0.25
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "{{ $labels.service }}"
annotations:
description: "More than 25.0% of {{ $labels.service }} services are down for the last 2 minutes"
summary: "More than 25.0% of {{ $labels.service }} services are down"
- alert: NovaErrorLogsTooHigh
expr: >-
sum(rate(log_messages{service="nova",level=~"(?i:(error|emergency|fatal))"}[5m])) without (level) > 0.2
labels:
route: "email,salesforce"
severity: "warning"
service: "{{ $labels.service }}"
annotations:
description: "The rate of errors in {{ $labels.service }} logs over the last 5 minutes is too high on node {{ $labels.host }} (current value={{ $value }}, threshold=0.2)."
summary: "Too many errors in {{ $labels.service }} logs"
- alert: NtpOffset
expr: >-
ntpq_offset >= 250
labels:
route: "email,salesforce"
severity: "warning"
service: "ntp"
annotations:
description: "NTP offset is higher than 250ms on node {{ $labels.host }}"
summary: "NTP offset is too high"
- alert: KafkaServerProcessCritical
expr: >-
count(procstat_running{process_name="kafka-server"} == 0) >= count(procstat_running{process_name="kafka-server"}) *0.6
labels:
route: "email,salesforce"
severity: "critical"
service: "kafka-server"
annotations:
description: "More than 60.0% of '{{ $labels.service }}' is down"
summary: "More than 60.0% of '{{ $labels.service }}' is down"
- alert: ContrailSupervisordControlProcessCritical
expr: >-
count(procstat_running{process_name="contrail-supervisord-control"} == 0) >= count(procstat_running{process_name="contrail-supervisord-control"}) *0.6
labels:
route: "email,salesforce"
severity: "critical"
service: "contrail-supervisord-control"
annotations:
description: "More than 60.0% of '{{ $labels.service }}' is down"
summary: "More than 60.0% of '{{ $labels.service }}' is down"
- alert: ContrailNodemgrVrouterProcessCritical
expr: >-
count(procstat_running{process_name="contrail-nodemgr-vrouter"} == 0) >= count(procstat_running{process_name="contrail-nodemgr-vrouter"}) *0.6
labels:
route: "email,salesforce"
severity: "critical"
service: "contrail-nodemgr-vrouter"
annotations:
description: "More than 60.0% of '{{ $labels.service }}' is down"
summary: "More than 60.0% of '{{ $labels.service }}' is down"
- alert: ContrailNamedProcessDown
expr: >-
count(procstat_running{process_name="contrail-named"} == 0) == count(procstat_running{process_name="contrail-named"})
labels:
route: "email,salesforce"
severity: "down"
service: "contrail-named"
annotations:
description: "All '{{ $labels.service }}' services are down"
summary: "All '{{ $labels.service }}' services are down"
- alert: ContrailSnmpCollectorProcessDown
expr: >-
count(procstat_running{process_name="contrail-snmp-collector"} == 0) == count(procstat_running{process_name="contrail-snmp-collector"})
labels:
route: "email,salesforce"
severity: "down"
service: "contrail-snmp-collector"
annotations:
description: "All '{{ $labels.service }}' services are down"
summary: "All '{{ $labels.service }}' services are down"
- alert: ContrailDeviceManagerProcessInfo
expr: >-
procstat_running{process_name="contrail-device-manager"} == 0
labels:
route: "email,salesforce"
severity: "info"
service: "contrail-device-manager"
annotations:
description: "{{ $labels.service }} service is down on node {{ $labels.host }}"
summary: "{{ $labels.service }} service is down"
- alert: CassandraServerProcessWarning
expr: >-
count(procstat_running{process_name="cassandra-server"} == 0) >= count(procstat_running{process_name="cassandra-server"}) *0.3
labels:
route: "email,salesforce"
severity: "warning"
service: "cassandra-server"
annotations:
description: "More than 30.0% of '{{ $labels.service }}' is down"
summary: "More than 30.0% of '{{ $labels.service }}' is down"
- alert: ContrailVrouterXMPPSessionsTooManyVariations
expr: >-
abs(delta(contrail_vrouter_xmpp[2m])) >= 5
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-compute"
annotations:
description: "There are too many vRouter XMPP sessions changes on node {{ $labels.host }} (current value={{ $value }}, threshold=5)"
summary: "Number of vRouter XMPP sessions changed between checks is too high"
- alert: KeystoneAPIDown
expr: >-
openstack_api_check_status{service=~"keystone.*"} == 0
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "{{ $labels.service }}"
annotations:
description: "Endpoint check for '{{ $labels.service }}' is down for 2 minutes"
summary: "Endpoint check for '{{ $labels.service }}' is down"
- alert: HAproxyNovaNovncHTTPResponse5xx
expr: >-
rate(haproxy_http_response_5xx{sv="FRONTEND",proxy="nova_novnc"}[1m]) > 1
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "Too many 5xx HTTP errors have been detected on the '{{ $labels.proxy }}' proxy for the last 2 minutes ({{ $value }} error(s) per second)"
summary: "HTTP 5xx responses on '{{ $labels.proxy }}' proxy (host {{ $labels.host }})"
- alert: ContrailFlowsDiscardTooMany
expr: >-
rate(contrail_vrouter_flows_discard[5m]) >= 0.1
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-compute"
annotations:
description: "There are too many discarded vRouter flows on node {{ $labels.host }} (current value={{ $value }}, threshold=0.1)"
summary: "Too many vRouter discarded flows"
- alert: HAproxyElasticsearchBackendCritical
expr: >-
(max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="elasticsearch"}[12h])) by (proxy)
- min (haproxy_active_servers{sv="BACKEND",proxy="elasticsearch"}) by (proxy)
) / max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="elasticsearch"}[12h])) by (proxy) * 100 >= 50
for: 5m
labels:
route: "email,salesforce"
severity: "critical"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "{{ $value }}% of backends are down for the '{{ $labels.proxy }}' proxy"
summary: "Less than 50% of backends are up for the '{{ $labels.proxy }}' proxy for the last 5 minutes"
- alert: HAproxyCinderApiHTTPResponse5xx
expr: >-
rate(haproxy_http_response_5xx{sv="FRONTEND",proxy="cinder_api"}[1m]) > 1
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "Too many 5xx HTTP errors have been detected on the '{{ $labels.proxy }}' proxy for the last 2 minutes ({{ $value }} error(s) per second)"
summary: "HTTP 5xx responses on '{{ $labels.proxy }}' proxy (host {{ $labels.host }})"
- alert: HAproxyContrailAnalyticsBackendCritical
expr: >-
(max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="contrail_analytics"}[12h])) by (proxy)
- min (haproxy_active_servers{sv="BACKEND",proxy="contrail_analytics"}) by (proxy)
) / max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="contrail_analytics"}[12h])) by (proxy) * 100 >= 50
for: 5m
labels:
route: "email,salesforce"
severity: "critical"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "{{ $value }}% of backends are down for the '{{ $labels.proxy }}' proxy"
summary: "Less than 50% of backends are up for the '{{ $labels.proxy }}' proxy for the last 5 minutes"
- alert: ContrailFlowsQueueLimitExceededTooMany
expr: >-
rate(contrail_vrouter_flows_flow_queue_limit_exceeded[5m]) >= 0.1
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-compute"
annotations:
description: "There are too many vRouter flows with queue limit exceeded on node {{ $labels.host }} (current value={{ $value }}, threshold=0.1)"
summary: "Too many vRouter flows with queue limit exceeded"
- alert: ElasticsearchDown
expr: >-
count(elasticsearch_up{host=~'.*'} == 0) == count(elasticsearch_up{host=~'.*'})
labels:
route: "email,salesforce"
severity: "down"
service: "elasticsearch"
annotations:
description: "All Elasticsearch services are down"
summary: "All Elasticsearch services are down"
- alert: ContrailSnmpCollectorProcessInfo
expr: >-
procstat_running{process_name="contrail-snmp-collector"} == 0
labels:
route: "email,salesforce"
severity: "info"
service: "contrail-snmp-collector"
annotations:
description: "{{ $labels.service }} service is down on node {{ $labels.host }}"
summary: "{{ $labels.service }} service is down"
- alert: ElasticsearchClusterHealthStatusRed
expr: >-
elasticsearch_cluster_health_status == 3
labels:
route: "email,salesforce"
severity: "critical"
service: "elasticsearch"
annotations:
description: "The Elasticsearch cluster status is RED for the last 5 minutes."
summary: "Elasticsearch cluster status is RED"
- alert: HAproxyElasticsearchHTTPResponse5xx
expr: >-
rate(haproxy_http_response_5xx{sv="FRONTEND",proxy="elasticsearch"}[1m]) > 1
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "Too many 5xx HTTP errors have been detected on the '{{ $labels.proxy }}' proxy for the last 2 minutes ({{ $value }} error(s) per second)"
summary: "HTTP 5xx responses on '{{ $labels.proxy }}' proxy (host {{ $labels.host }})"
- alert: ContrailFlowsFragErrTooMany
expr: >-
min(contrail_vrouter_flows_frag_err) by (host) >= 100
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-compute"
annotations:
description: "There are too many vRouter flows with fragment errors on node {{ $labels.host }} (current value={{ $value }}, threshold=100)"
summary: "Too many vRouter flows with fragment errors"
- alert: ContrailApiProcessInfo
expr: >-
procstat_running{process_name="contrail-api"} == 0
labels:
route: "email,salesforce"
severity: "info"
service: "contrail-api"
annotations:
description: "{{ $labels.service }} service is down on node {{ $labels.host }}"
summary: "{{ $labels.service }} service is down"
- alert: HAproxyMysqlClusterBackendWarning
expr: >-
max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="mysql_cluster"}[12h])) by (proxy) - min(haproxy_active_servers{sv="BACKEND",proxy="mysql_cluster"}) by (proxy) >= 1
for: 5m
labels:
route: "email,salesforce"
severity: "warning"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "{{ $value }} of backends are down for the '{{ $labels.proxy }}' proxy"
summary: "At least one backend is down for '{{ $labels.proxy }}' proxy for the last 5 minutes"
- alert: HAproxyCinderApiBackendCritical
expr: >-
(max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="cinder_api"}[12h])) by (proxy)
- min (haproxy_active_servers{sv="BACKEND",proxy="cinder_api"}) by (proxy)
) / max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="cinder_api"}[12h])) by (proxy) * 100 >= 50
for: 5m
labels:
route: "email,salesforce"
severity: "critical"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "{{ $value }}% of backends are down for the '{{ $labels.proxy }}' proxy"
summary: "Less than 50% of backends are up for the '{{ $labels.proxy }}' proxy for the last 5 minutes"
- alert: ContrailSupervisordConfigProcessWarning
expr: >-
count(procstat_running{process_name="contrail-supervisord-config"} == 0) >= count(procstat_running{process_name="contrail-supervisord-config"}) *0.3
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-supervisord-config"
annotations:
description: "More than 30.0% of '{{ $labels.service }}' is down"
summary: "More than 30.0% of '{{ $labels.service }}' is down"
- alert: HAproxyAodhApiBackendDown
expr: >-
max(haproxy_active_servers{sv="BACKEND",proxy="aodh-api"}) by (proxy) + max(haproxy_backup_servers{sv="BACKEND",proxy="aodh-api"}) by (proxy) == 0
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "The proxy '{{ $labels.proxy }}' has no active backend"
summary: "All backends are down for the '{{ $labels.proxy }}' proxy"
- alert: HAproxyHeatCfnApiBackendWarning
expr: >-
max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="heat_cfn_api"}[12h])) by (proxy) - min(haproxy_active_servers{sv="BACKEND",proxy="heat_cfn_api"}) by (proxy) >= 1
for: 5m
labels:
route: "email,salesforce"
severity: "warning"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "{{ $value }} of backends are down for the '{{ $labels.proxy }}' proxy"
summary: "At least one backend is down for '{{ $labels.proxy }}' proxy for the last 5 minutes"
- alert: DockerServiceJanitorMonkeyCleanupservicemongodbReplicasDown
expr: >-
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="janitor_monkey_cleanup-service-mongodb"}[1m])) == 0 or absent(docker_container_cpu_usage_percent{com_docker_swarm_service_name="janitor_monkey_cleanup-service-mongodb"}) == 1
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "janitor_monkey_cleanup-service-mongodb"
annotations:
description: "No replicas are running for the Docker Swarn service 'janitor_monkey_cleanup-service-mongodb'. for 2 minutes"
summary: "Docker Swarm service janitor_monkey_cleanup-service-mongodb down for 2 minutes"
- alert: InfluxdbHTTPPointsWrittenDropped
expr: >-
rate(influxdb_httpd_pointsWrittenDropped[2m]) / rate(influxdb_httpd_pointsWrittenOK[2m]) * 100 > 5
labels:
route: "email,salesforce"
severity: "warning"
service: "influxdb"
annotations:
description: "{{ printf `%.1f` $value }}% of written points have been dropped on {{ $labels.host }} (threshold=5)."
summary: "Influxdb too many dropped writes"
- alert: MemcachedProcessDown
expr: >-
procstat_running{process_name="memcached"} == 0
labels:
route: "email,salesforce"
severity: "warning"
service: "memcached"
annotations:
description: "Memcached service is down on node {{ $labels.host }}"
summary: "Memcached service is down"
- alert: InfluxdbRelayBufferNearFull
expr: >-
influxdb_relay_backend_buffer_bytes > 536870912.0 * 70 / 100
labels:
route: "email,salesforce"
severity: "warning"
service: "influxdb-relay"
annotations:
description: "The buffer size for the {{ $labels.instance }}/{{ $labels.backend }} backend is getting full (current value={{ $value }} bytes, threshold=375809638.4)."
summary: "InfluxDB Relay buffer almost full"
- alert: HeatAPIServicesInfo
expr: >-
http_response_status{service=~"heat.*-api"} == 0
for: 2m
labels:
route: "email,salesforce"
severity: "info"
service: "{{ $labels.service }}"
annotations:
description: "The HTTP check for '{{ $labels.service }}' is down on {{ $labels.host }} for the last 2 minutes."
summary: "HTTP check for '{{ $labels.service }}' down"
- alert: ContrailFlowsActiveTooMany
expr: >-
deriv(contrail_vrouter_flows_active[5m]) >= 100
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-compute"
annotations:
description: "There are too many active vRouter flows on node {{ $labels.host }} (current value={{ $value }}, threshold=100)"
summary: "Too many vRouter active flows"
- alert: ContrailNodemgrControlProcessInfo
expr: >-
procstat_running{process_name="contrail-nodemgr-control"} == 0
labels:
route: "email,salesforce"
severity: "info"
service: "contrail-nodemgr-control"
annotations:
description: "{{ $labels.service }} service is down on node {{ $labels.host }}"
summary: "{{ $labels.service }} service is down"
- alert: NovaTotalFreeVCPUsLow
expr: >-
(100.0 * openstack_nova_total_free_vcpus) / (openstack_nova_total_free_vcpus + openstack_nova_total_used_vcpus) < 10.0
for: 1m
labels:
route: "email,salesforce"
severity: "warning"
service: "nova"
annotations:
description: "VPCU low limit for 1 minutes"
summary: "VCPU low limit for new instances"
- alert: ContrailTopologyProcessCritical
expr: >-
count(procstat_running{process_name="contrail-topology"} == 0) >= count(procstat_running{process_name="contrail-topology"}) *0.6
labels:
route: "email,salesforce"
severity: "critical"
service: "contrail-topology"
annotations:
description: "More than 60.0% of '{{ $labels.service }}' is down"
summary: "More than 60.0% of '{{ $labels.service }}' is down"
- alert: ContrailNodemgrVrouterProcessDown
expr: >-
count(procstat_running{process_name="contrail-nodemgr-vrouter"} == 0) == count(procstat_running{process_name="contrail-nodemgr-vrouter"})
labels:
route: "email,salesforce"
severity: "down"
service: "contrail-nodemgr-vrouter"
annotations:
description: "All '{{ $labels.service }}' services are down"
summary: "All '{{ $labels.service }}' services are down"
- alert: ContrailVrouterXMPPSessionsTooMany
expr: >-
min(contrail_vrouter_xmpp) by (host) >= 10
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-compute"
annotations:
description: "There are too many vRouter XMPP sessions on node {{ $labels.host }} (current value={{ $value }}, threshold=10)"
summary: "Too many vRouter XMPP sessions"
- alert: ZookeeperWarning
expr: >-
count(zookeeper_up == 0) >= count(zookeeper_up) * 0.3
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "zookeeper"
annotations:
description: "More than 30.0% of Zookeeper services are down"
summary: "More than 30.0% of Zookeeper services are down"
- alert: ContrailAlarmGenProcessCritical
expr: >-
count(procstat_running{process_name="contrail-alarm-gen"} == 0) >= count(procstat_running{process_name="contrail-alarm-gen"}) *0.6
labels:
route: "email,salesforce"
severity: "critical"
service: "contrail-alarm-gen"
annotations:
description: "More than 60.0% of '{{ $labels.service }}' is down"
summary: "More than 60.0% of '{{ $labels.service }}' is down"
- alert: SystemDiskInodesFull
expr: >-
disk_inodes_used / disk_inodes_total >= 0.99
labels:
route: "email,salesforce"
severity: "critical"
service: "system"
annotations:
description: "The disk inodes ({{ $labels.path }}) are used at {{ $value }}% on {{ $labels.host }}."
summary: "Inodes for {{ $labels.path }} full on {{ $labels.host }}"
- alert: ElasticsearchClusterDiskLowWaterMark
expr: >-
(max(elasticsearch_fs_total_total_in_bytes) by (host, instance) - max(elasticsearch_fs_total_available_in_bytes) by (host, instance)) / max(elasticsearch_fs_total_total_in_bytes) by (host, instance) * 100.0 >= 85
for: 5m
labels:
route: "email,salesforce"
severity: "warning"
service: "elasticsearch"
annotations:
description: "Elasticsearch will not allocate new shards to node {{ $labels.host }}"
summary: "Elasticsearch low disk watermark [85%] exceeded on node {{ $labels.host}} instance {{ $labels.instance }}"
- alert: ContrailQueryEngineProcessInfo
expr: >-
procstat_running{process_name="contrail-query-engine"} == 0
labels:
route: "email,salesforce"
severity: "info"
service: "contrail-query-engine"
annotations:
description: "{{ $labels.service }} service is down on node {{ $labels.host }}"
summary: "{{ $labels.service }} service is down"
- alert: ContrailDiscoveryAPIWarning
expr: >-
count(http_response_status{service=~"contrail.discovery"} == 0) by (service) >= count(http_response_status{service=~"contrail.discovery"}) by (service) *0.3
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "{{ $labels.service }}"
annotations:
description: "More than 30.0% of '{{ $labels.service }}' is down"
summary: "More than 30.0% of '{{ $labels.service }}' is down"
- alert: ContrailVrouterLLSSessionsTooManyVariations
expr: >-
abs(delta(contrail_vrouter_lls[2m])) >= 5
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-compute"
annotations:
description: "There are too many vRouter LLS sessions changes on node {{ $labels.host }} (current value={{ $value }}, threshold=5)"
summary: "Number of vRouter LLS sessions changed between checks is too high"
- alert: ContrailAnalyticsApiProcessDown
expr: >-
count(procstat_running{process_name="contrail-analytics-api"} == 0) == count(procstat_running{process_name="contrail-analytics-api"})
labels:
route: "email,salesforce"
severity: "down"
service: "contrail-analytics-api"
annotations:
description: "All '{{ $labels.service }}' services are down"
summary: "All '{{ $labels.service }}' services are down"
- alert: HAproxyNovaMetadataApiHTTPResponse5xx
expr: >-
rate(haproxy_http_response_5xx{sv="FRONTEND",proxy="nova_metadata_api"}[1m]) > 1
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "Too many 5xx HTTP errors have been detected on the '{{ $labels.proxy }}' proxy for the last 2 minutes ({{ $value }} error(s) per second)"
summary: "HTTP 5xx responses on '{{ $labels.proxy }}' proxy (host {{ $labels.host }})"
- alert: RabbitMQDown
expr: >-
rabbitmq_up != 1
labels:
route: "email,salesforce"
severity: "warning"
service: "rabbitmq"
annotations:
description: "RabbitMQ service is down on node {{ $labels.host }}"
summary: "RabbitMQ service down"
- alert: GlanceAPIDown
expr: >-
max(openstack_api_check_status{service=~"glance.*"}) by (service) == 0
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "{{ $labels.service }}"
annotations:
description: "Endpoint check for '{{ $labels.service }}' is down for 2 minutes"
summary: "Endpoint check for '{{ $labels.service }}' is down"
- alert: ContrailVrouterAgentProcessWarning
expr: >-
count(procstat_running{process_name="contrail-vrouter-agent"} == 0) >= count(procstat_running{process_name="contrail-vrouter-agent"}) *0.3
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-vrouter-agent"
annotations:
description: "More than 30.0% of '{{ $labels.service }}' is down"
summary: "More than 30.0% of '{{ $labels.service }}' is down"
- alert: ContrailXMPPSessionsNone
expr: >-
max(contrail_xmpp_session_count) by (host) == 0
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-control"
annotations:
description: "There are no XMPP sessions on node {{ $labels.host }}"
summary: "No XMPP sessions"
- alert: HAproxyInfluxdbRelayBackendWarning
expr: >-
max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="influxdb_relay"}[12h])) by (proxy) - min(haproxy_active_servers{sv="BACKEND",proxy="influxdb_relay"}) by (proxy) >= 1
for: 5m
labels:
route: "email,salesforce"
severity: "warning"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "{{ $value }} of backends are down for the '{{ $labels.proxy }}' proxy"
summary: "At least one backend is down for '{{ $labels.proxy }}' proxy for the last 5 minutes"
- alert: ContrailIrondProcessWarning
expr: >-
count(procstat_running{process_name="contrail-irond"} == 0) >= count(procstat_running{process_name="contrail-irond"}) *0.3
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-irond"
annotations:
description: "More than 30.0% of '{{ $labels.service }}' is down"
summary: "More than 30.0% of '{{ $labels.service }}' is down"
- alert: ContrailDiscoveryProcessInfo
expr: >-
procstat_running{process_name="contrail-discovery"} == 0
labels:
route: "email,salesforce"
severity: "info"
service: "contrail-discovery"
annotations:
description: "{{ $labels.service }} service is down on node {{ $labels.host }}"
summary: "{{ $labels.service }} service is down"
- alert: ContrailAPIWarning
expr: >-
count(http_response_status{service=~"contrail.api"} == 0) by (service) >= count(http_response_status{service=~"contrail.api"}) by (service) *0.3
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "{{ $labels.service }}"
annotations:
description: "More than 30.0% of '{{ $labels.service }}' is down"
summary: "More than 30.0% of '{{ $labels.service }}' is down"
- alert: ContrailApiProcessCritical
expr: >-
count(procstat_running{process_name="contrail-api"} == 0) >= count(procstat_running{process_name="contrail-api"}) *0.6
labels:
route: "email,salesforce"
severity: "critical"
service: "contrail-api"
annotations:
description: "More than 60.0% of '{{ $labels.service }}' is down"
summary: "More than 60.0% of '{{ $labels.service }}' is down"
- alert: HAproxyHeatApiBackendCritical
expr: >-
(max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="heat_api"}[12h])) by (proxy)
- min (haproxy_active_servers{sv="BACKEND",proxy="heat_api"}) by (proxy)
) / max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="heat_api"}[12h])) by (proxy) * 100 >= 50
for: 5m
labels:
route: "email,salesforce"
severity: "critical"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "{{ $value }}% of backends are down for the '{{ $labels.proxy }}' proxy"
summary: "Less than 50% of backends are up for the '{{ $labels.proxy }}' proxy for the last 5 minutes"
- alert: HeatAPIServicesWarning
expr: >-
count(http_response_status{service=~"heat.*-api"} == 0) by (service) >= on (service) count(http_response_status{service=~"heat.*-api"}) by (service) * 0.3
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "{{ $labels.service }}"
annotations:
description: "{{ $value }} {{ $labels.service }} services are down for the last 2 minutes (More than 30.0%)"
summary: "More than 30.0% of {{ $labels.service }} services are down"
- alert: RabbitMQDiskLow
expr: >-
predict_linear(rabbitmq_node_disk_free[8h], 8*3600) <= rabbitmq_node_disk_free_limit
labels:
route: "email,salesforce"
severity: "warning"
service: "rabbitmq"
annotations:
description: "The RabbitMQ disk partition will be full in less than 8 hours on node {{ $labels.host }}."
summary: "RabbitMQ disk free space too low"
- alert: ApacheIdleWorkersShortage
expr: >-
apache_IdleWorkers == 0
labels:
route: "email,salesforce"
severity: "warning"
service: "apache"
annotations:
description: "Apache idle workers shortage on node {{ $labels.host }}"
summary: "Apache idle workers shortage"
- alert: InfluxdbInfo
expr: >-
influxdb_up == 0
labels:
route: "email,salesforce"
severity: "info"
service: "influxdb"
annotations:
description: "InfluxDB service is down on node {{ $labels.host }}"
summary: "InfluxDB service down"
- alert: ContrailAPIInfo
expr: >-
http_response_status{service=~"contrail.api"} == 0
for: 2m
labels:
route: "email,salesforce"
severity: "info"
service: "{{ $labels.service }}"
annotations:
description: "Endpoint check for '{{ $labels.service }}' is failed for 2 minutes on node {{ $labels.host }}"
summary: "Endpoint check for '{{ $labels.service }}' is failed"
- alert: InfluxdbWarning
expr: >-
count(influxdb_up == 0) >= count(influxdb_up) * 0.3
labels:
route: "email,salesforce"
severity: "warning"
service: "influxdb"
annotations:
description: "More than 30.0% of InfluxDB services are down"
summary: "More than 30.0% of InfluxDB services are down"
- alert: ContrailDnsProcessDown
expr: >-
count(procstat_running{process_name="contrail-dns"} == 0) == count(procstat_running{process_name="contrail-dns"})
labels:
route: "email,salesforce"
severity: "down"
service: "contrail-dns"
annotations:
description: "All '{{ $labels.service }}' services are down"
summary: "All '{{ $labels.service }}' services are down"
- alert: ContrailDnsProcessCritical
expr: >-
count(procstat_running{process_name="contrail-dns"} == 0) >= count(procstat_running{process_name="contrail-dns"}) *0.6
labels:
route: "email,salesforce"
severity: "critical"
service: "contrail-dns"
annotations:
description: "More than 60.0% of '{{ $labels.service }}' is down"
summary: "More than 60.0% of '{{ $labels.service }}' is down"
- alert: SystemTxPacketsDroppedTooHigh
expr: >-
rate(net_drop_out[1m]) > 100
labels:
route: "email,salesforce"
severity: "critical"
service: "system"
annotations:
description: "The rate of transmitted packets which are dropped is too high on node {{ $labels.host }} for interface {{ $labels.interface }} (current value={{ $value }}/sec, threshold=100/sec)"
summary: "Too many transmitted packets dropped on {{ $labels.host }} for interface {{ $labels.interface }}"
- alert: ContrailFlowsDropTooMany
expr: >-
rate(contrail_vrouter_flows_flow_action_drop[5m]) >= 0.2
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-compute"
annotations:
description: "There are too many dropped vRouter flows on node {{ $labels.host }} (current value={{ $value }} flows/s, threshold=0.2 flows/s)"
summary: "Too many vRouter dropped flows"
- alert: ContrailNodeManagerAPIInfo
expr: >-
http_response_status{service=~"contrail.node.manager"} == 0
for: 2m
labels:
route: "email,salesforce"
severity: "info"
service: "{{ $labels.service }}"
annotations:
description: "Endpoint check for '{{ $labels.service }}' is failed for 2 minutes on node {{ $labels.host }}"
summary: "Endpoint check for '{{ $labels.service }}' is failed"
- alert: ContrailJobServerProcessInfo
expr: >-
procstat_running{process_name="contrail-job-server"} == 0
labels:
route: "email,salesforce"
severity: "info"
service: "contrail-job-server"
annotations:
description: "{{ $labels.service }} service is down on node {{ $labels.host }}"
summary: "{{ $labels.service }} service is down"
- alert: ContrailCollectorProcessCritical
expr: >-
count(procstat_running{process_name="contrail-collector"} == 0) >= count(procstat_running{process_name="contrail-collector"}) *0.6
labels:
route: "email,salesforce"
severity: "critical"
service: "contrail-collector"
annotations:
description: "More than 60.0% of '{{ $labels.service }}' is down"
summary: "More than 60.0% of '{{ $labels.service }}' is down"
- alert: HAproxyGlanceRegistryApiBackendCritical
expr: >-
(max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="glance_registry_api"}[12h])) by (proxy)
- min (haproxy_active_servers{sv="BACKEND",proxy="glance_registry_api"}) by (proxy)
) / max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="glance_registry_api"}[12h])) by (proxy) * 100 >= 50
for: 5m
labels:
route: "email,salesforce"
severity: "critical"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "{{ $value }}% of backends are down for the '{{ $labels.proxy }}' proxy"
summary: "Less than 50% of backends are up for the '{{ $labels.proxy }}' proxy for the last 5 minutes"
- alert: ContrailDeviceManagerProcessWarning
expr: >-
count(procstat_running{process_name="contrail-device-manager"} == 0) >= count(procstat_running{process_name="contrail-device-manager"}) *0.3
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-device-manager"
annotations:
description: "More than 30.0% of '{{ $labels.service }}' is down"
summary: "More than 30.0% of '{{ $labels.service }}' is down"
- alert: ContrailSupervisordControlProcessInfo
expr: >-
procstat_running{process_name="contrail-supervisord-control"} == 0
labels:
route: "email,salesforce"
severity: "info"
service: "contrail-supervisord-control"
annotations:
description: "{{ $labels.service }} service is down on node {{ $labels.host }}"
summary: "{{ $labels.service }} service is down"
- alert: ContrailNodemgrControlProcessCritical
expr: >-
count(procstat_running{process_name="contrail-nodemgr-control"} == 0) >= count(procstat_running{process_name="contrail-nodemgr-control"}) *0.6
labels:
route: "email,salesforce"
severity: "critical"
service: "contrail-nodemgr-control"
annotations:
description: "More than 60.0% of '{{ $labels.service }}' is down"
summary: "More than 60.0% of '{{ $labels.service }}' is down"
- alert: ContrailAnalyticsApiProcessCritical
expr: >-
count(procstat_running{process_name="contrail-analytics-api"} == 0) >= count(procstat_running{process_name="contrail-analytics-api"}) *0.6
labels:
route: "email,salesforce"
severity: "critical"
service: "contrail-analytics-api"
annotations:
description: "More than 60.0% of '{{ $labels.service }}' is down"
summary: "More than 60.0% of '{{ $labels.service }}' is down"
- alert: NovaComputesDown
expr: >-
openstack_nova_services{state="up",service=~"nova-compute"} == 0
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "{{ $labels.service }}"
annotations:
description: "All '{{ $labels.service }}' services are down for the last 2 minutes"
summary: "All {{ $labels.service }} services are down"
- alert: ContrailSupervisordDatabaseProcessCritical
expr: >-
count(procstat_running{process_name="contrail-supervisord-database"} == 0) >= count(procstat_running{process_name="contrail-supervisord-database"}) *0.6
labels:
route: "email,salesforce"
severity: "critical"
service: "contrail-supervisord-database"
annotations:
description: "More than 60.0% of '{{ $labels.service }}' is down"
summary: "More than 60.0% of '{{ $labels.service }}' is down"
- alert: ContrailSupervisordVrouterProcessInfo
expr: >-
procstat_running{process_name="contrail-supervisord-vrouter"} == 0
labels:
route: "email,salesforce"
severity: "info"
service: "contrail-supervisord-vrouter"
annotations:
description: "{{ $labels.service }} service is down on node {{ $labels.host }}"
summary: "{{ $labels.service }} service is down"
- alert: ContrailAlarmGenProcessInfo
expr: >-
procstat_running{process_name="contrail-alarm-gen"} == 0
labels:
route: "email,salesforce"
severity: "info"
service: "contrail-alarm-gen"
annotations:
description: "{{ $labels.service }} service is down on node {{ $labels.host }}"
summary: "{{ $labels.service }} service is down"
- alert: ContrailNodemgrProcessWarning
expr: >-
count(procstat_running{process_name="contrail-nodemgr"} == 0) >= count(procstat_running{process_name="contrail-nodemgr"}) *0.3
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-nodemgr"
annotations:
description: "More than 30.0% of '{{ $labels.service }}' is down"
summary: "More than 30.0% of '{{ $labels.service }}' is down"
- alert: ContrailDnsProcessWarning
expr: >-
count(procstat_running{process_name="contrail-dns"} == 0) >= count(procstat_running{process_name="contrail-dns"}) *0.3
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-dns"
annotations:
description: "More than 30.0% of '{{ $labels.service }}' is down"
summary: "More than 30.0% of '{{ $labels.service }}' is down"
- alert: RabbitMQMemoryLow
expr: >-
(rabbitmq_node_mem_limit - rabbitmq_node_mem_used) <= 104857600
labels:
route: "email,salesforce"
severity: "warning"
service: "rabbitmq"
annotations:
description: "The amount of free memory is too low on node {{ $labels.host }} (current value={{ $value }}B, threshold=104857600B)."
summary: "RabbitMQ free memory too low"
- alert: GaleraNodeNotReady
expr: >-
mysql_wsrep_ready != 1
for: 1m
labels:
route: "email,salesforce"
severity: "warning"
service: "mysql"
annotations:
description: "The Galera service on {{ $labels.host }} is not ready to serve queries."
summary: "Galera on {{ $labels.host }} not ready"
- alert: HAproxyNovaNovncBackendDown
expr: >-
max(haproxy_active_servers{sv="BACKEND",proxy="nova_novnc"}) by (proxy) + max(haproxy_backup_servers{sv="BACKEND",proxy="nova_novnc"}) by (proxy) == 0
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "The proxy '{{ $labels.proxy }}' has no active backend"
summary: "All backends are down for the '{{ $labels.proxy }}' proxy"
- alert: ContrailNamedProcessInfo
expr: >-
procstat_running{process_name="contrail-named"} == 0
labels:
route: "email,salesforce"
severity: "info"
service: "contrail-named"
annotations:
description: "{{ $labels.service }} service is down on node {{ $labels.host }}"
summary: "{{ $labels.service }} service is down"
- alert: HAproxyHeatCfnApiBackendCritical
expr: >-
(max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="heat_cfn_api"}[12h])) by (proxy)
- min (haproxy_active_servers{sv="BACKEND",proxy="heat_cfn_api"}) by (proxy)
) / max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="heat_cfn_api"}[12h])) by (proxy) * 100 >= 50
for: 5m
labels:
route: "email,salesforce"
severity: "critical"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "{{ $value }}% of backends are down for the '{{ $labels.proxy }}' proxy"
summary: "Less than 50% of backends are up for the '{{ $labels.proxy }}' proxy for the last 5 minutes"
- alert: CassandraServerProcessDown
expr: >-
count(procstat_running{process_name="cassandra-server"} == 0) == count(procstat_running{process_name="cassandra-server"})
labels:
route: "email,salesforce"
severity: "down"
service: "cassandra-server"
annotations:
description: "All '{{ $labels.service }}' services are down"
summary: "All '{{ $labels.service }}' services are down"
- alert: HAproxyKeystoneAdminApiBackendCritical
expr: >-
(max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="keystone_admin_api"}[12h])) by (proxy)
- min (haproxy_active_servers{sv="BACKEND",proxy="keystone_admin_api"}) by (proxy)
) / max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="keystone_admin_api"}[12h])) by (proxy) * 100 >= 50
for: 5m
labels:
route: "email,salesforce"
severity: "critical"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "{{ $value }}% of backends are down for the '{{ $labels.proxy }}' proxy"
summary: "Less than 50% of backends are up for the '{{ $labels.proxy }}' proxy for the last 5 minutes"
- alert: ContrailDiscoveryProcessDown
expr: >-
count(procstat_running{process_name="contrail-discovery"} == 0) == count(procstat_running{process_name="contrail-discovery"})
labels:
route: "email,salesforce"
severity: "down"
service: "contrail-discovery"
annotations:
description: "All '{{ $labels.service }}' services are down"
summary: "All '{{ $labels.service }}' services are down"
- alert: HAproxyGlanceRegistryApiHTTPResponse5xx
expr: >-
rate(haproxy_http_response_5xx{sv="FRONTEND",proxy="glance_registry_api"}[1m]) > 1
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "Too many 5xx HTTP errors have been detected on the '{{ $labels.proxy }}' proxy for the last 2 minutes ({{ $value }} error(s) per second)"
summary: "HTTP 5xx responses on '{{ $labels.proxy }}' proxy (host {{ $labels.host }})"
- alert: RabbitMQMemoryFull
expr: >-
rabbitmq_node_mem_used >= rabbitmq_node_mem_limit
labels:
route: "email,salesforce"
severity: "critical"
service: "rabbitmq"
annotations:
description: "All producers are blocked because the memory is full on node {{ $labels.host }}."
summary: "RabbitMQ producers blocked due to full memory"
- alert: HAproxyContrailApiBackendWarning
expr: >-
max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="contrail_api"}[12h])) by (proxy) - min(haproxy_active_servers{sv="BACKEND",proxy="contrail_api"}) by (proxy) >= 1
for: 5m
labels:
route: "email,salesforce"
severity: "warning"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "{{ $value }} of backends are down for the '{{ $labels.proxy }}' proxy"
summary: "At least one backend is down for '{{ $labels.proxy }}' proxy for the last 5 minutes"
- alert: KeystoneAPIServiceDown
expr: >-
http_response_status{service=~"keystone.*"} == 0
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "{{ $labels.service }}"
annotations:
description: "The HTTP check for '{{ $labels.service }}' is down on {{ $labels.host }} for 2 minutes."
summary: "HTTP check for '{{ $labels.service }}' down"
- alert: ContrailIfmapServerProcessDown
expr: >-
count(procstat_running{process_name="contrail-ifmap-server"} == 0) == count(procstat_running{process_name="contrail-ifmap-server"})
labels:
route: "email,salesforce"
severity: "down"
service: "contrail-ifmap-server"
annotations:
description: "All '{{ $labels.service }}' services are down"
summary: "All '{{ $labels.service }}' services are down"
- alert: NovaAggregatesFreeMemoryShortage
expr: >-
(100.0 * openstack_nova_aggregate_free_ram) / (openstack_nova_aggregate_free_ram + openstack_nova_aggregate_used_ram) < 2.0
for: 1m
labels:
aggregate: "{{ $labels.aggregate }}"
route: "email,salesforce"
severity: "critical"
service: "nova"
annotations:
description: "Memory shortage for 1 minutes on aggregate {{ $labels.aggregate }}"
summary: "Memory shortage for new instances on aggregate {{ $labels.aggregate }}"
- alert: HAproxyNovaApiBackendWarning
expr: >-
max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="nova_api"}[12h])) by (proxy) - min(haproxy_active_servers{sv="BACKEND",proxy="nova_api"}) by (proxy) >= 1
for: 5m
labels:
route: "email,salesforce"
severity: "warning"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "{{ $value }} of backends are down for the '{{ $labels.proxy }}' proxy"
summary: "At least one backend is down for '{{ $labels.proxy }}' proxy for the last 5 minutes"
- alert: DockerServiceMonitoringAlertmanagerWarningReplicasNumber
expr: >-
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="monitoring_alertmanager"}[1m])) <= 2 * 0.7
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "monitoring_alertmanager"
annotations:
description: "{{ $value }}/2 replicas are running for the Docker Swarn service 'monitoring_alertmanager' for 2 minutes."
summary: "Docker Swarm service monitoring_alertmanager invalid number of replicas for 2 minutes"
- alert: HAproxyKeystoneAdminApiBackendDown
expr: >-
max(haproxy_active_servers{sv="BACKEND",proxy="keystone_admin_api"}) by (proxy) + max(haproxy_backup_servers{sv="BACKEND",proxy="keystone_admin_api"}) by (proxy) == 0
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "The proxy '{{ $labels.proxy }}' has no active backend"
summary: "All backends are down for the '{{ $labels.proxy }}' proxy"
- alert: HAproxyHeatCfnApiBackendDown
expr: >-
max(haproxy_active_servers{sv="BACKEND",proxy="heat_cfn_api"}) by (proxy) + max(haproxy_backup_servers{sv="BACKEND",proxy="heat_cfn_api"}) by (proxy) == 0
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "The proxy '{{ $labels.proxy }}' has no active backend"
summary: "All backends are down for the '{{ $labels.proxy }}' proxy"
- alert: ElasticsearchCritical
expr: >-
count(elasticsearch_up{host=~'.*'} == 0) >= count(elasticsearch_up{host=~'.*'}) * 0.6
labels:
route: "email,salesforce"
severity: "critical"
service: "elasticsearch"
annotations:
description: "More than 60.0% of Elasticsearch services are down"
summary: "More than 60.0% of Elasticsearch services are down"
- alert: SystemDiskErrors
expr: >-
increase(hdd_errors_total[5m]) > 0
labels:
route: "email,salesforce"
severity: "critical"
service: "system"
annotations:
description: "The disk ({{ $labels.device }}) is reporting errors on {{ $labels.host }}."
summary: "Disk {{ $labels.device }} is failing"
- alert: ContrailApiProcessDown
expr: >-
count(procstat_running{process_name="contrail-api"} == 0) == count(procstat_running{process_name="contrail-api"})
labels:
route: "email,salesforce"
severity: "down"
service: "contrail-api"
annotations:
description: "All '{{ $labels.service }}' services are down"
summary: "All '{{ $labels.service }}' services are down"
- alert: ContrailIfmapServerProcessCritical
expr: >-
count(procstat_running{process_name="contrail-ifmap-server"} == 0) >= count(procstat_running{process_name="contrail-ifmap-server"}) *0.6
labels:
route: "email,salesforce"
severity: "critical"
service: "contrail-ifmap-server"
annotations:
description: "More than 60.0% of '{{ $labels.service }}' is down"
summary: "More than 60.0% of '{{ $labels.service }}' is down"
- alert: HAproxyKeystonePublicApiBackendDown
expr: >-
max(haproxy_active_servers{sv="BACKEND",proxy="keystone_public_api"}) by (proxy) + max(haproxy_backup_servers{sv="BACKEND",proxy="keystone_public_api"}) by (proxy) == 0
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "The proxy '{{ $labels.proxy }}' has no active backend"
summary: "All backends are down for the '{{ $labels.proxy }}' proxy"
- alert: ContrailCollectorProcessInfo
expr: >-
procstat_running{process_name="contrail-collector"} == 0
labels:
route: "email,salesforce"
severity: "info"
service: "contrail-collector"
annotations:
description: "{{ $labels.service }} service is down on node {{ $labels.host }}"
summary: "{{ $labels.service }} service is down"
- alert: ElasticsearchWarning
expr: >-
count(elasticsearch_up{host=~'.*'} == 0) >= count(elasticsearch_up{host=~'.*'}) * 0.3
labels:
route: "email,salesforce"
severity: "warning"
service: "elasticsearch"
annotations:
description: "More than 30.0% of Elasticsearch services are down"
summary: "More than 30.0% of Elasticsearch services are down"
- alert: ContrailVrouterAPIInfo
expr: >-
http_response_status{service=~"contrail.vrouter"} == 0
for: 2m
labels:
route: "email,salesforce"
severity: "info"
service: "{{ $labels.service }}"
annotations:
description: "Endpoint check for '{{ $labels.service }}' is failed for 2 minutes on node {{ $labels.host }}"
summary: "Endpoint check for '{{ $labels.service }}' is failed"
- alert: SystemDiskSpaceFull
expr: >-
disk_used_percent >= 99 and disk_inodes_total > 0
labels:
route: "email,salesforce"
severity: "critical"
service: "system"
annotations:
description: "The disk partition ({{ $labels.path }}) is used at {{ $value }}% on {{ $labels.host }}."
summary: "Disk partition {{ $labels.path }} full on {{ $labels.host }}"
- alert: ElasticsearchClusterHealthStatusYellow
expr: >-
elasticsearch_cluster_health_status == 2
labels:
route: "email,salesforce"
severity: "warning"
service: "elasticsearch"
annotations:
description: "The Elasticsearch cluster status is YELLOW for the last 5 minutes."
summary: "Elasticsearch cluster status is YELLOW"
- alert: HAproxyKeystonePublicApiBackendCritical
expr: >-
(max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="keystone_public_api"}[12h])) by (proxy)
- min (haproxy_active_servers{sv="BACKEND",proxy="keystone_public_api"}) by (proxy)
) / max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="keystone_public_api"}[12h])) by (proxy) * 100 >= 50
for: 5m
labels:
route: "email,salesforce"
severity: "critical"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "{{ $value }}% of backends are down for the '{{ $labels.proxy }}' proxy"
summary: "Less than 50% of backends are up for the '{{ $labels.proxy }}' proxy for the last 5 minutes"
- alert: HAproxyKeystoneAdminApiBackendWarning
expr: >-
max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="keystone_admin_api"}[12h])) by (proxy) - min(haproxy_active_servers{sv="BACKEND",proxy="keystone_admin_api"}) by (proxy) >= 1
for: 5m
labels:
route: "email,salesforce"
severity: "warning"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "{{ $value }} of backends are down for the '{{ $labels.proxy }}' proxy"
summary: "At least one backend is down for '{{ $labels.proxy }}' proxy for the last 5 minutes"
- alert: DockerServicePostgresqlPostgresqldbReplicasDown
expr: >-
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="postgresql_postgresql-db"}[1m])) == 0 or absent(docker_container_cpu_usage_percent{com_docker_swarm_service_name="postgresql_postgresql-db"}) == 1
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "postgresql_postgresql-db"
annotations:
description: "No replicas are running for the Docker Swarn service 'postgresql_postgresql-db'. for 2 minutes"
summary: "Docker Swarm service postgresql_postgresql-db down for 2 minutes"
- alert: HAproxyNovaNovncBackendWarning
expr: >-
max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="nova_novnc"}[12h])) by (proxy) - min(haproxy_active_servers{sv="BACKEND",proxy="nova_novnc"}) by (proxy) >= 1
for: 5m
labels:
route: "email,salesforce"
severity: "warning"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "{{ $value }} of backends are down for the '{{ $labels.proxy }}' proxy"
summary: "At least one backend is down for '{{ $labels.proxy }}' proxy for the last 5 minutes"
- alert: HAproxyMysqlClusterBackendCritical
expr: >-
(max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="mysql_cluster"}[12h])) by (proxy)
- min (haproxy_active_servers{sv="BACKEND",proxy="mysql_cluster"}) by (proxy)
) / max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="mysql_cluster"}[12h])) by (proxy) * 100 >= 50
for: 5m
labels:
route: "email,salesforce"
severity: "critical"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "{{ $value }}% of backends are down for the '{{ $labels.proxy }}' proxy"
summary: "Less than 50% of backends are up for the '{{ $labels.proxy }}' proxy for the last 5 minutes"
- alert: ContrailControlProcessCritical
expr: >-
count(procstat_running{process_name="contrail-control"} == 0) >= count(procstat_running{process_name="contrail-control"}) *0.6
labels:
route: "email,salesforce"
severity: "critical"
service: "contrail-control"
annotations:
description: "More than 60.0% of '{{ $labels.service }}' is down"
summary: "More than 60.0% of '{{ $labels.service }}' is down"
- alert: ContrailXMPPSessionsTooMany
expr: >-
min(contrail_xmpp_session_count) by (host) >= 500
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-control"
annotations:
description: "There are too many XMPP sessions on node {{ $labels.host }} (current value={{ $value }}, threshold=500)"
summary: "Too many XMPP sessions"
- alert: HAproxyContrailApiBackendDown
expr: >-
max(haproxy_active_servers{sv="BACKEND",proxy="contrail_api"}) by (proxy) + max(haproxy_backup_servers{sv="BACKEND",proxy="contrail_api"}) by (proxy) == 0
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "The proxy '{{ $labels.proxy }}' has no active backend"
summary: "All backends are down for the '{{ $labels.proxy }}' proxy"
- alert: ContrailFlowsInvalidNHTooMany
expr: >-
rate(contrail_vrouter_flows_invalid_nh[5m]) >= 0.1
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-compute"
annotations:
description: "There are too many vRouter flows with invalid next hop on node {{ $labels.host }} (current value={{ $value }}, threshold=0.1)"
summary: "Too many vRouter flows with invalid next hop"
- alert: HAproxyHeatApiBackendWarning
expr: >-
max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="heat_api"}[12h])) by (proxy) - min(haproxy_active_servers{sv="BACKEND",proxy="heat_api"}) by (proxy) >= 1
for: 5m
labels:
route: "email,salesforce"
severity: "warning"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "{{ $value }} of backends are down for the '{{ $labels.proxy }}' proxy"
summary: "At least one backend is down for '{{ $labels.proxy }}' proxy for the last 5 minutes"
- alert: ContrailSupervisordVrouterProcessWarning
expr: >-
count(procstat_running{process_name="contrail-supervisord-vrouter"} == 0) >= count(procstat_running{process_name="contrail-supervisord-vrouter"}) *0.3
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-supervisord-vrouter"
annotations:
description: "More than 30.0% of '{{ $labels.service }}' is down"
summary: "More than 30.0% of '{{ $labels.service }}' is down"
- alert: SystemSwapIn
expr: >-
rate(swap_in[2m]) > 1048576
labels:
route: "email,salesforce"
severity: "warning"
service: "system"
annotations:
description: "The rate of swap input bytes is too high on node {{ $labels.host }} (current value={{ $value }}b/s, threshold=1048576b/s)."
summary: "Swap input throughput too high on {{ $labels.host }}"
- alert: ContrailNodemgrDatabaseProcessDown
expr: >-
count(procstat_running{process_name="contrail-nodemgr-database"} == 0) == count(procstat_running{process_name="contrail-nodemgr-database"})
labels:
route: "email,salesforce"
severity: "down"
service: "contrail-nodemgr-database"
annotations:
description: "All '{{ $labels.service }}' services are down"
summary: "All '{{ $labels.service }}' services are down"
- alert: ContrailVrouterAPICritical
expr: >-
count(http_response_status{service=~"contrail.vrouter"} == 0) by (service) >= count(http_response_status{service=~"contrail.vrouter"}) by (service) *0.6
for: 2m
labels:
route: "email,salesforce"
severity: "critical"
service: "{{ $labels.service }}"
annotations:
description: "More than 60.0% of '{{ $labels.service }}' is down"
summary: "More than 60.0% of '{{ $labels.service }}' is down"
- alert: HAproxyContrailDiscoveryBackendCritical
expr: >-
(max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="contrail_discovery"}[12h])) by (proxy)
- min (haproxy_active_servers{sv="BACKEND",proxy="contrail_discovery"}) by (proxy)
) / max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="contrail_discovery"}[12h])) by (proxy) * 100 >= 50
for: 5m
labels:
route: "email,salesforce"
severity: "critical"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "{{ $value }}% of backends are down for the '{{ $labels.proxy }}' proxy"
summary: "Less than 50% of backends are up for the '{{ $labels.proxy }}' proxy for the last 5 minutes"
- alert: ContrailNodemgrDatabaseProcessInfo
expr: >-
procstat_running{process_name="contrail-nodemgr-database"} == 0
labels:
route: "email,salesforce"
severity: "info"
service: "contrail-nodemgr-database"
annotations:
description: "{{ $labels.service }} service is down on node {{ $labels.host }}"
summary: "{{ $labels.service }} service is down"
- alert: OutOfMemoryTooHigh
expr: >-
rate(out_of_memory_total[5m]) > 0.0011
labels:
route: "email,salesforce"
severity: "warning"
service: "system"
annotations:
description: "The rate of out-of-memory errors is too high on node {{ $labels.host }} (current value={{ $value }}, threshold=0.0011)."
summary: "Too many out-of-memory errors"
- alert: ContrailSnmpCollectorProcessCritical
expr: >-
count(procstat_running{process_name="contrail-snmp-collector"} == 0) >= count(procstat_running{process_name="contrail-snmp-collector"}) *0.6
labels:
route: "email,salesforce"
severity: "critical"
service: "contrail-snmp-collector"
annotations:
description: "More than 60.0% of '{{ $labels.service }}' is down"
summary: "More than 60.0% of '{{ $labels.service }}' is down"
- alert: ContrailCollectorProcessDown
expr: >-
count(procstat_running{process_name="contrail-collector"} == 0) == count(procstat_running{process_name="contrail-collector"})
labels:
route: "email,salesforce"
severity: "down"
service: "contrail-collector"
annotations:
description: "All '{{ $labels.service }}' services are down"
summary: "All '{{ $labels.service }}' services are down"
- alert: InfluxdbDown
expr: >-
count(influxdb_up == 0) == count(influxdb_up)
labels:
route: "email,salesforce"
severity: "down"
service: "influxdb"
annotations:
description: "All InfluxDB services are down"
summary: "All InfluxDB services are down"
- alert: DockerServiceMonitoringPushgatewayWarningReplicasNumber
expr: >-
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="monitoring_pushgateway"}[1m])) <= 2 * 0.7
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "monitoring_pushgateway"
annotations:
description: "{{ $value }}/2 replicas are running for the Docker Swarn service 'monitoring_pushgateway' for 2 minutes."
summary: "Docker Swarm service monitoring_pushgateway invalid number of replicas for 2 minutes"
- alert: HAproxyNovaNovncBackendCritical
expr: >-
(max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="nova_novnc"}[12h])) by (proxy)
- min (haproxy_active_servers{sv="BACKEND",proxy="nova_novnc"}) by (proxy)
) / max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="nova_novnc"}[12h])) by (proxy) * 100 >= 50
for: 5m
labels:
route: "email,salesforce"
severity: "critical"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "{{ $value }}% of backends are down for the '{{ $labels.proxy }}' proxy"
summary: "Less than 50% of backends are up for the '{{ $labels.proxy }}' proxy for the last 5 minutes"
- alert: ContrailSupervisordConfigProcessCritical
expr: >-
count(procstat_running{process_name="contrail-supervisord-config"} == 0) >= count(procstat_running{process_name="contrail-supervisord-config"}) *0.6
labels:
route: "email,salesforce"
severity: "critical"
service: "contrail-supervisord-config"
annotations:
description: "More than 60.0% of '{{ $labels.service }}' is down"
summary: "More than 60.0% of '{{ $labels.service }}' is down"
- alert: DockerServiceSecurityMonkeySecurityauditschedulerReplicasDown
expr: >-
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="security_monkey_security-audit-scheduler"}[1m])) == 0 or absent(docker_container_cpu_usage_percent{com_docker_swarm_service_name="security_monkey_security-audit-scheduler"}) == 1
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "security_monkey_security-audit-scheduler"
annotations:
description: "No replicas are running for the Docker Swarn service 'security_monkey_security-audit-scheduler'. for 2 minutes"
summary: "Docker Swarm service security_monkey_security-audit-scheduler down for 2 minutes"
- alert: HAproxyAodhApiBackendCritical
expr: >-
(max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="aodh-api"}[12h])) by (proxy)
- min (haproxy_active_servers{sv="BACKEND",proxy="aodh-api"}) by (proxy)
) / max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="aodh-api"}[12h])) by (proxy) * 100 >= 50
for: 5m
labels:
route: "email,salesforce"
severity: "critical"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "{{ $value }}% of backends are down for the '{{ $labels.proxy }}' proxy"
summary: "Less than 50% of backends are up for the '{{ $labels.proxy }}' proxy for the last 5 minutes"
- alert: ContrailSnmpCollectorProcessWarning
expr: >-
count(procstat_running{process_name="contrail-snmp-collector"} == 0) >= count(procstat_running{process_name="contrail-snmp-collector"}) *0.3
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-snmp-collector"
annotations:
description: "More than 30.0% of '{{ $labels.service }}' is down"
summary: "More than 30.0% of '{{ $labels.service }}' is down"
- alert: HAproxyGlanceApiBackendDown
expr: >-
max(haproxy_active_servers{sv="BACKEND",proxy="glance_api"}) by (proxy) + max(haproxy_backup_servers{sv="BACKEND",proxy="glance_api"}) by (proxy) == 0
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "The proxy '{{ $labels.proxy }}' has no active backend"
summary: "All backends are down for the '{{ $labels.proxy }}' proxy"
- alert: ContrailApiProcessWarning
expr: >-
count(procstat_running{process_name="contrail-api"} == 0) >= count(procstat_running{process_name="contrail-api"}) *0.3
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-api"
annotations:
description: "More than 30.0% of '{{ $labels.service }}' is down"
summary: "More than 30.0% of '{{ $labels.service }}' is down"
- alert: DockerServicePushkinPushkinapiReplicasDown
expr: >-
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="pushkin_pushkin-api"}[1m])) == 0 or absent(docker_container_cpu_usage_percent{com_docker_swarm_service_name="pushkin_pushkin-api"}) == 1
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "pushkin_pushkin-api"
annotations:
description: "No replicas are running for the Docker Swarn service 'pushkin_pushkin-api'. for 2 minutes"
summary: "Docker Swarm service pushkin_pushkin-api down for 2 minutes"
- alert: HaproxyDown
expr: >-
haproxy_up != 1
labels:
route: "email,salesforce"
severity: "warning"
service: "haproxy"
annotations:
description: "Haproxy service is down on node {{ $labels.host }}"
summary: "Haproxy service down"
- alert: HAproxyKeystonePublicApiBackendWarning
expr: >-
max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="keystone_public_api"}[12h])) by (proxy) - min(haproxy_active_servers{sv="BACKEND",proxy="keystone_public_api"}) by (proxy) >= 1
for: 5m
labels:
route: "email,salesforce"
severity: "warning"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "{{ $value }} of backends are down for the '{{ $labels.proxy }}' proxy"
summary: "At least one backend is down for '{{ $labels.proxy }}' proxy for the last 5 minutes"
- alert: SystemCpuIdleTooLow
expr: >-
avg_over_time(cpu_usage_idle{cpu="cpu-total"}[5m]) < 10.0
labels:
route: "email,salesforce"
severity: "warning"
service: "system"
annotations:
description: "The average idle CPU usage is too low on node {{ $labels.host }} (current value={{ $value }}%, threshold=10.0%)."
summary: "Idle CPU usage too low on {{ $labels.host }}"
- alert: CassandraServerProcessCritical
expr: >-
count(procstat_running{process_name="cassandra-server"} == 0) >= count(procstat_running{process_name="cassandra-server"}) *0.6
labels:
route: "email,salesforce"
severity: "critical"
service: "cassandra-server"
annotations:
description: "More than 60.0% of '{{ $labels.service }}' is down"
summary: "More than 60.0% of '{{ $labels.service }}' is down"
- alert: ContrailFlowsTableFullTooMany
expr: >-
min(contrail_vrouter_flows_flow_table_full) by (host) >= 100
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-compute"
annotations:
description: "There are too many vRouter flows with table full on node {{ $labels.host }} (current value={{ $value }}, threshold=100)"
summary: "Too many vRouter flows with table full"
- alert: HeatErrorLogsTooHigh
expr: >-
sum(rate(log_messages{service="heat",level=~"(?i:(error|emergency|fatal))"}[5m])) without (level) > 0.2
labels:
route: "email,salesforce"
severity: "warning"
service: "{{ $labels.service }}"
annotations:
description: "The rate of errors in {{ $labels.service }} logs over the last 5 minutes is too high on node {{ $labels.host }} (current value={{ $value }}, threshold=0.2)."
summary: "Too many errors in {{ $labels.service }} logs"
- alert: DockerServiceJanitorMonkeyCleanupserviceapiReplicasDown
expr: >-
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="janitor_monkey_cleanup-service-api"}[1m])) == 0 or absent(docker_container_cpu_usage_percent{com_docker_swarm_service_name="janitor_monkey_cleanup-service-api"}) == 1
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "janitor_monkey_cleanup-service-api"
annotations:
description: "No replicas are running for the Docker Swarn service 'janitor_monkey_cleanup-service-api'. for 2 minutes"
summary: "Docker Swarm service janitor_monkey_cleanup-service-api down for 2 minutes"
- alert: ContrailCollectorProcessWarning
expr: >-
count(procstat_running{process_name="contrail-collector"} == 0) >= count(procstat_running{process_name="contrail-collector"}) *0.3
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-collector"
annotations:
description: "More than 30.0% of '{{ $labels.service }}' is down"
summary: "More than 30.0% of '{{ $labels.service }}' is down"
- alert: HAproxyNeutronApiBackendDown
expr: >-
max(haproxy_active_servers{sv="BACKEND",proxy="neutron_api"}) by (proxy) + max(haproxy_backup_servers{sv="BACKEND",proxy="neutron_api"}) by (proxy) == 0
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "The proxy '{{ $labels.proxy }}' has no active backend"
summary: "All backends are down for the '{{ $labels.proxy }}' proxy"
- alert: ContrailFlowsInvalidITFTooMany
expr: >-
rate(contrail_vrouter_flows_composite_invalid_interface[5m]) >= 0.05
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-compute"
annotations:
description: "There are too many vRouter flows with composite invalid interface on node {{ $labels.host }} (current value={{ $value }}, threshold=0.05)"
summary: "Too many vRouter flows with composite invalid interface"
- alert: DockerServiceAptlyPublicReplicasDown
expr: >-
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="aptly_public"}[1m])) == 0 or absent(docker_container_cpu_usage_percent{com_docker_swarm_service_name="aptly_public"}) == 1
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "aptly_public"
annotations:
description: "No replicas are running for the Docker Swarn service 'aptly_public'. for 2 minutes"
summary: "Docker Swarm service aptly_public down for 2 minutes"
- alert: NovaAggregatesFreeMemoryLow
expr: >-
(100.0 * openstack_nova_aggregate_free_ram) / (openstack_nova_aggregate_free_ram + openstack_nova_aggregate_used_ram) < 10.0
for: 1m
labels:
aggregate: "{{ $labels.aggregate }}"
route: "email,salesforce"
severity: "warning"
service: "nova"
annotations:
description: "Memory low limit for 1 minutes on aggregate {{ $labels.aggregate }}"
summary: "Memory low limit for new instances on aggregate {{ $labels.aggregate }}"
- alert: KeystoneFailedAuthsTooHigh
expr: >-
rate(authentications_total_failed[5m]) > rate(authentications_total_all[5m]) * 50 / 100 and rate(authentications_total_all[5m]) > 0.1
labels:
route: "email,salesforce"
severity: "warning"
service: "keystone"
annotations:
description: "The rate of failed authentications in Keystone over the last 5 minutes is too high (current value={{ $value }}, threshold=50)."
summary: "Too many failed authentications in Keystone"
- alert: NovaAggregatesFreeVCPUsLow
expr: >-
(100.0 * openstack_nova_aggregate_free_vcpus) / (openstack_nova_aggregate_free_vcpus + openstack_nova_aggregate_used_vcpus) < 10.0
for: 1m
labels:
aggregate: "{{ $labels.aggregate }}"
route: "email,salesforce"
severity: "warning"
service: "nova"
annotations:
description: "VPCU low limit for 1 minutes on aggregate {{ $labels.aggregate }}"
summary: "VCPU low limit for new instances on aggregate {{ $labels.aggregate }}"
- alert: CinderAPIServiceInfo
expr: >-
http_response_status{service=~"cinder-api"} == 0
for: 2m
labels:
route: "email,salesforce"
severity: "info"
service: "{{ $labels.service }}"
annotations:
description: "The HTTP check for '{{ $labels.service }}' is down on {{ $labels.host }} for the last 2 minutes."
summary: "HTTP check for '{{ $labels.service }}' down"
- alert: HAproxyNeutronApiBackendCritical
expr: >-
(max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="neutron_api"}[12h])) by (proxy)
- min (haproxy_active_servers{sv="BACKEND",proxy="neutron_api"}) by (proxy)
) / max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="neutron_api"}[12h])) by (proxy) * 100 >= 50
for: 5m
labels:
route: "email,salesforce"
severity: "critical"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "{{ $value }}% of backends are down for the '{{ $labels.proxy }}' proxy"
summary: "Less than 50% of backends are up for the '{{ $labels.proxy }}' proxy for the last 5 minutes"
- alert: ContrailAlarmGenProcessWarning
expr: >-
count(procstat_running{process_name="contrail-alarm-gen"} == 0) >= count(procstat_running{process_name="contrail-alarm-gen"}) *0.3
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-alarm-gen"
annotations:
description: "More than 30.0% of '{{ $labels.service }}' is down"
summary: "More than 30.0% of '{{ $labels.service }}' is down"
- alert: PrometheusRemoteStorageQueue
expr: >-
prometheus_remote_storage_queue_length / prometheus_remote_storage_queue_capacity * 100 > 75.0
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "prometheus"
annotations:
description: "The Prometheus {{ $labels.instance }} remote storage queue almost full (current value={{ $value }}%, threshold=75.0%)"
summary: "Prometheus {{ $labels.instance }} remote storage queue is filling"
- alert: HAproxyContrailApiBackendCritical
expr: >-
(max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="contrail_api"}[12h])) by (proxy)
- min (haproxy_active_servers{sv="BACKEND",proxy="contrail_api"}) by (proxy)
) / max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="contrail_api"}[12h])) by (proxy) * 100 >= 50
for: 5m
labels:
route: "email,salesforce"
severity: "critical"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "{{ $value }}% of backends are down for the '{{ $labels.proxy }}' proxy"
summary: "Less than 50% of backends are up for the '{{ $labels.proxy }}' proxy for the last 5 minutes"
- alert: HAproxyKeystonePublicApiHTTPResponse5xx
expr: >-
rate(haproxy_http_response_5xx{sv="FRONTEND",proxy="keystone_public_api"}[1m]) > 1
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "Too many 5xx HTTP errors have been detected on the '{{ $labels.proxy }}' proxy for the last 2 minutes ({{ $value }} error(s) per second)"
summary: "HTTP 5xx responses on '{{ $labels.proxy }}' proxy (host {{ $labels.host }})"
- alert: DockerServiceMonitoringRemoteAgentReplicasDown
expr: >-
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="monitoring_remote_agent"}[1m])) == 0 or absent(docker_container_cpu_usage_percent{com_docker_swarm_service_name="monitoring_remote_agent"}) == 1
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "monitoring_remote_agent"
annotations:
description: "No replicas are running for the Docker Swarn service 'monitoring_remote_agent'. for 2 minutes"
summary: "Docker Swarm service monitoring_remote_agent down for 2 minutes"
- alert: ContrailAPIDown
expr: >-
count(http_response_status{service=~"contrail.api"} == 0) by (service) == count(http_response_status{service=~"contrail.api"}) by (service)
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "{{ $labels.service }}"
annotations:
description: "All '{{ $labels.service }}' APIs are down"
summary: "All '{{ $labels.service }}' APIs are down"
- alert: DockerServiceMonitoringRemoteStorageAdapterReplicasDown
expr: >-
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="monitoring_remote_storage_adapter"}[1m])) == 0 or absent(docker_container_cpu_usage_percent{com_docker_swarm_service_name="monitoring_remote_storage_adapter"}) == 1
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "monitoring_remote_storage_adapter"
annotations:
description: "No replicas are running for the Docker Swarn service 'monitoring_remote_storage_adapter'. for 2 minutes"
summary: "Docker Swarm service monitoring_remote_storage_adapter down for 2 minutes"
- alert: HAproxyElasticsearchBinaryBackendWarning
expr: >-
max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="elasticsearch_binary"}[12h])) by (proxy) - min(haproxy_active_servers{sv="BACKEND",proxy="elasticsearch_binary"}) by (proxy) >= 1
for: 5m
labels:
route: "email,salesforce"
severity: "warning"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "{{ $value }} of backends are down for the '{{ $labels.proxy }}' proxy"
summary: "At least one backend is down for '{{ $labels.proxy }}' proxy for the last 5 minutes"
- alert: ContrailJobServerProcessWarning
expr: >-
count(procstat_running{process_name="contrail-job-server"} == 0) >= count(procstat_running{process_name="contrail-job-server"}) *0.3
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-job-server"
annotations:
description: "More than 30.0% of '{{ $labels.service }}' is down"
summary: "More than 30.0% of '{{ $labels.service }}' is down"
- alert: GlanceAPIServiceDown
expr: >-
http_response_status{service=~"glance-api"} == 0
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "{{ $labels.service }}"
annotations:
description: "The HTTP check for '{{ $labels.service }}' is down on {{ $labels.host }} for 2 minutes."
summary: "HTTP check for '{{ $labels.service }}' down"
- alert: ContrailQueryEngineProcessWarning
expr: >-
count(procstat_running{process_name="contrail-query-engine"} == 0) >= count(procstat_running{process_name="contrail-query-engine"}) *0.3
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-query-engine"
annotations:
description: "More than 30.0% of '{{ $labels.service }}' is down"
summary: "More than 30.0% of '{{ $labels.service }}' is down"
- alert: DockerdProcessDown
expr: >-
procstat_running{process_name="dockerd"} == 0
labels:
route: "email,salesforce"
severity: "warning"
service: "docker"
annotations:
description: "Dockerd service is down on node {{ $labels.host }}"
summary: "Dockerd service is down"
- alert: ContrailVrouterXMPPSessionsNone
expr: >-
max(contrail_vrouter_xmpp) by (host) == 0
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-compute"
annotations:
description: "There are no vRouter XMPP sessions on node {{ $labels.host }}"
summary: "No vRouter XMPP sessions"
- alert: ContrailQueryEngineProcessDown
expr: >-
count(procstat_running{process_name="contrail-query-engine"} == 0) == count(procstat_running{process_name="contrail-query-engine"})
labels:
route: "email,salesforce"
severity: "down"
service: "contrail-query-engine"
annotations:
description: "All '{{ $labels.service }}' services are down"
summary: "All '{{ $labels.service }}' services are down"
- alert: HAproxyHeatCloudwatchApiBackendCritical
expr: >-
(max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="heat_cloudwatch_api"}[12h])) by (proxy)
- min (haproxy_active_servers{sv="BACKEND",proxy="heat_cloudwatch_api"}) by (proxy)
) / max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="heat_cloudwatch_api"}[12h])) by (proxy) * 100 >= 50
for: 5m
labels:
route: "email,salesforce"
severity: "critical"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "{{ $value }}% of backends are down for the '{{ $labels.proxy }}' proxy"
summary: "Less than 50% of backends are up for the '{{ $labels.proxy }}' proxy for the last 5 minutes"
- alert: DockerServiceMonitoringRelayCriticalReplicasNumber
expr: >-
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="monitoring_relay"}[1m])) <= 2 * 0.4
for: 2m
labels:
route: "email,salesforce"
severity: "critical"
service: "monitoring_relay"
annotations:
description: "{{ $value }}/2 replicas are running for the Docker Swarn service 'monitoring_relay' for 2 minutes."
summary: "Docker Swarm service monitoring_relay invalid number of replicas for 2 minutes"
- alert: NovaLibvirtDown
expr: >-
max(libvirt_up) by (host) == 0
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "libvirt"
annotations:
description: "libvirt check on '{{ $labels.host }}' is down for 2 minutes"
summary: "libvirt check on '{{ $labels.host }}' is down"
- alert: HAproxyContrailAnalyticsHTTPResponse5xx
expr: >-
rate(haproxy_http_response_5xx{sv="FRONTEND",proxy="contrail_analytics"}[1m]) > 1
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "Too many 5xx HTTP errors have been detected on the '{{ $labels.proxy }}' proxy for the last 2 minutes ({{ $value }} error(s) per second)"
summary: "HTTP 5xx responses on '{{ $labels.proxy }}' proxy (host {{ $labels.host }})"
- alert: NovaAggregatesFreeVCPUsShortage
expr: >-
(100.0 * openstack_nova_aggregate_free_vcpus) / (openstack_nova_aggregate_free_vcpus + openstack_nova_aggregate_used_vcpus) < 2.0
for: 1m
labels:
aggregate: "{{ $labels.aggregate }}"
route: "email,salesforce"
severity: "critical"
service: "nova"
annotations:
description: "VPCU shortage for 1 minutes on aggregate {{ $labels.aggregate }}"
summary: "VCPU shortage for new instances on aggregate {{ $labels.aggregate }}"
- alert: ContrailDiscoveryProcessWarning
expr: >-
count(procstat_running{process_name="contrail-discovery"} == 0) >= count(procstat_running{process_name="contrail-discovery"}) *0.3
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-discovery"
annotations:
description: "More than 30.0% of '{{ $labels.service }}' is down"
summary: "More than 30.0% of '{{ $labels.service }}' is down"
- alert: ContrailSupervisordControlProcessDown
expr: >-
count(procstat_running{process_name="contrail-supervisord-control"} == 0) == count(procstat_running{process_name="contrail-supervisord-control"})
labels:
route: "email,salesforce"
severity: "down"
service: "contrail-supervisord-control"
annotations:
description: "All '{{ $labels.service }}' services are down"
summary: "All '{{ $labels.service }}' services are down"
- alert: ContrailSvcMonitorProcessWarning
expr: >-
count(procstat_running{process_name="contrail-svc-monitor"} == 0) >= count(procstat_running{process_name="contrail-svc-monitor"}) *0.3
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-svc-monitor"
annotations:
description: "More than 30.0% of '{{ $labels.service }}' is down"
summary: "More than 30.0% of '{{ $labels.service }}' is down"
- alert: ZookeeperDown
expr: >-
count(zookeeper_up == 0) == count(zookeeper_up)
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "zookeeper"
annotations:
description: "All Zookeeper services are down"
summary: "All Zookeeper services are down"
- alert: ContrailAlarmGenProcessDown
expr: >-
count(procstat_running{process_name="contrail-alarm-gen"} == 0) == count(procstat_running{process_name="contrail-alarm-gen"})
labels:
route: "email,salesforce"
severity: "down"
service: "contrail-alarm-gen"
annotations:
description: "All '{{ $labels.service }}' services are down"
summary: "All '{{ $labels.service }}' services are down"
- alert: CinderAPIDown
expr: >-
max(openstack_api_check_status{service=~"cinder.*"}) by (service) == 0
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "{{ $labels.service }}"
annotations:
description: "Endpoint check for '{{ $labels.service }}' is down for the last 2 minutes"
summary: "Endpoint check for '{{ $labels.service }}' is down"
- alert: HAproxyRabbitmqClusterBackendDown
expr: >-
max(haproxy_active_servers{sv="BACKEND",proxy="rabbitmq_cluster"}) by (proxy) + max(haproxy_backup_servers{sv="BACKEND",proxy="rabbitmq_cluster"}) by (proxy) == 0
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "The proxy '{{ $labels.proxy }}' has no active backend"
summary: "All backends are down for the '{{ $labels.proxy }}' proxy"
- alert: ContrailControlProcessDown
expr: >-
count(procstat_running{process_name="contrail-control"} == 0) == count(procstat_running{process_name="contrail-control"})
labels:
route: "email,salesforce"
severity: "down"
service: "contrail-control"
annotations:
description: "All '{{ $labels.service }}' services are down"
summary: "All '{{ $labels.service }}' services are down"
- alert: CinderServicesCritical
expr: >-
openstack_cinder_services{service=~"cinder-volume|cinder-scheduler", state="down"} >= on (service) sum(openstack_cinder_services{service=~"cinder-volume|cinder-scheduler"}) by (service) * 0.6
for: 2m
labels:
route: "email,salesforce"
severity: "critical"
service: "{{ $labels.service }}"
annotations:
description: "{{ $value }} {{ $labels.service }} services are down for the last 2 minutes (More than 60.0%)"
summary: "More than 60.0% of {{ $labels.service }} services are down"
- alert: RedisServerProcessInfo
expr: >-
procstat_running{process_name="redis-server"} == 0
labels:
route: "email,salesforce"
severity: "info"
service: "redis-server"
annotations:
description: "{{ $labels.service }} service is down on node {{ $labels.host }}"
summary: "{{ $labels.service }} service is down"
- alert: CinderErrorLogsTooHigh
expr: >-
sum(rate(log_messages{service="cinder",level=~"(?i:(error|emergency|fatal))"}[5m])) without (level) > 0.2
labels:
route: "email,salesforce"
severity: "warning"
service: "{{ $labels.service }}"
annotations:
description: "The rate of errors in {{ $labels.service }} logs over the last 5 minutes is too high on node {{ $labels.host }} (current value={{ $value }}, threshold=0.2)."
summary: "Too many errors in {{ $labels.service }} logs"
- alert: AlertmanagerNotificationFailed
expr: >-
rate(alertmanager_notifications_failed_total[5m]) > 0.3
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "alertmanager"
annotations:
description: "Alertmanager {{ $labels.instance }} failed notifications for {{ $labels.integration }} (current value={{ $value }}, threshold=0.3)"
summary: "Alertmanager {{ $labels.instance }} failed notifications"
- alert: RedisServerProcessWarning
expr: >-
count(procstat_running{process_name="redis-server"} == 0) >= count(procstat_running{process_name="redis-server"}) *0.3
labels:
route: "email,salesforce"
severity: "warning"
service: "redis-server"
annotations:
description: "More than 30.0% of '{{ $labels.service }}' is down"
summary: "More than 30.0% of '{{ $labels.service }}' is down"
- alert: NeutronAPIDown
expr: >-
openstack_api_check_status{service=~"neutron.*"} == 0
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "{{ $labels.service }}"
annotations:
description: "Endpoint check for '{{ $labels.service }}' is down for 2 minutes"
summary: "Endpoint check for '{{ $labels.service }}' is down"
- alert: GlanceRegistryServiceDown
expr: >-
http_response_status{service=~"glance-registry"} == 0
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "{{ $labels.service }}"
annotations:
description: "The HTTP check for '{{ $labels.service }}' is down on {{ $labels.host }} for 2 minutes."
summary: "HTTP check for '{{ $labels.service }}' down"
- alert: ContrailIfmapServerProcessInfo
expr: >-
procstat_running{process_name="contrail-ifmap-server"} == 0
labels:
route: "email,salesforce"
severity: "info"
service: "contrail-ifmap-server"
annotations:
description: "{{ $labels.service }} service is down on node {{ $labels.host }}"
summary: "{{ $labels.service }} service is down"
- alert: HAproxyElasticsearchBinaryBackendDown
expr: >-
max(haproxy_active_servers{sv="BACKEND",proxy="elasticsearch_binary"}) by (proxy) + max(haproxy_backup_servers{sv="BACKEND",proxy="elasticsearch_binary"}) by (proxy) == 0
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "The proxy '{{ $labels.proxy }}' has no active backend"
summary: "All backends are down for the '{{ $labels.proxy }}' proxy"
- alert: ContrailWebServerProcessInfo
expr: >-
procstat_running{process_name="contrail-web-server"} == 0
labels:
route: "email,salesforce"
severity: "info"
service: "contrail-web-server"
annotations:
description: "{{ $labels.service }} service is down on node {{ $labels.host }}"
summary: "{{ $labels.service }} service is down"
- alert: ContrailWebServerProcessDown
expr: >-
count(procstat_running{process_name="contrail-web-server"} == 0) == count(procstat_running{process_name="contrail-web-server"})
labels:
route: "email,salesforce"
severity: "down"
service: "contrail-web-server"
annotations:
description: "All '{{ $labels.service }}' services are down"
summary: "All '{{ $labels.service }}' services are down"
- alert: SystemDiskSpaceTooLow
expr: >-
predict_linear(disk_free[1h], 8*3600) < 0
for: 15m
labels:
route: "email,salesforce"
severity: "warning"
service: "system"
annotations:
description: "The disk partition ({{ $labels.path }}) will be full in less than 8 hours on {{ $labels.host }}."
summary: "Free space for {{ $labels.path }} too low on {{ $labels.host }}"
- alert: ContrailSupervisordAnalyticsProcessCritical
expr: >-
count(procstat_running{process_name="contrail-supervisord-analytics"} == 0) >= count(procstat_running{process_name="contrail-supervisord-analytics"}) *0.6
labels:
route: "email,salesforce"
severity: "critical"
service: "contrail-supervisord-analytics"
annotations:
description: "More than 60.0% of '{{ $labels.service }}' is down"
summary: "More than 60.0% of '{{ $labels.service }}' is down"
- alert: ContrailIrondProcessDown
expr: >-
count(procstat_running{process_name="contrail-irond"} == 0) == count(procstat_running{process_name="contrail-irond"})
labels:
route: "email,salesforce"
severity: "down"
service: "contrail-irond"
annotations:
description: "All '{{ $labels.service }}' services are down"
summary: "All '{{ $labels.service }}' services are down"
- alert: ContrailNodemgrConfigProcessDown
expr: >-
count(procstat_running{process_name="contrail-nodemgr-config"} == 0) == count(procstat_running{process_name="contrail-nodemgr-config"})
labels:
route: "email,salesforce"
severity: "down"
service: "contrail-nodemgr-config"
annotations:
description: "All '{{ $labels.service }}' services are down"
summary: "All '{{ $labels.service }}' services are down"
- alert: PrometheusTargetDown
expr: >-
up != 1
for: 2m
labels:
route: "email,salesforce"
severity: "critical"
service: "prometheus"
annotations:
description: "The Prometheus target {{ $labels.instance }} is down for the job {{ $labels.job }}."
summary: "Prometheus endpoint {{ $labels.instance }} down"
- alert: ContrailIfmapServerProcessWarning
expr: >-
count(procstat_running{process_name="contrail-ifmap-server"} == 0) >= count(procstat_running{process_name="contrail-ifmap-server"}) *0.3
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-ifmap-server"
annotations:
description: "More than 30.0% of '{{ $labels.service }}' is down"
summary: "More than 30.0% of '{{ $labels.service }}' is down"
- alert: HAproxyKibanaBackendWarning
expr: >-
max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="kibana"}[12h])) by (proxy) - min(haproxy_active_servers{sv="BACKEND",proxy="kibana"}) by (proxy) >= 1
for: 5m
labels:
route: "email,salesforce"
severity: "warning"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "{{ $value }} of backends are down for the '{{ $labels.proxy }}' proxy"
summary: "At least one backend is down for '{{ $labels.proxy }}' proxy for the last 5 minutes"
- alert: HAproxyContrailDiscoveryHTTPResponse5xx
expr: >-
rate(haproxy_http_response_5xx{sv="FRONTEND",proxy="contrail_discovery"}[1m]) > 1
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "Too many 5xx HTTP errors have been detected on the '{{ $labels.proxy }}' proxy for the last 2 minutes ({{ $value }} error(s) per second)"
summary: "HTTP 5xx responses on '{{ $labels.proxy }}' proxy (host {{ $labels.host }})"
- alert: KibanaProcessCritical
expr: >-
count(procstat_running{process_name="kibana"} == 0) >= count(procstat_running{process_name="kibana"}) * 0.6
labels:
route: "email,salesforce"
severity: "critical"
service: "kibana"
annotations:
description: "More than 60.0% of Kibana services are down"
summary: "More than 60.0% of Kibana services are down"
- alert: ContrailSupervisordDatabaseProcessDown
expr: >-
count(procstat_running{process_name="contrail-supervisord-database"} == 0) == count(procstat_running{process_name="contrail-supervisord-database"})
labels:
route: "email,salesforce"
severity: "down"
service: "contrail-supervisord-database"
annotations:
description: "All '{{ $labels.service }}' services are down"
summary: "All '{{ $labels.service }}' services are down"
- alert: DockerServiceAptlyPublicWarningReplicasNumber
expr: >-
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="aptly_public"}[1m])) <= 3 * 0.7
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "aptly_public"
annotations:
description: "{{ $value }}/3 replicas are running for the Docker Swarn service 'aptly_public' for 2 minutes."
summary: "Docker Swarm service aptly_public invalid number of replicas for 2 minutes"
- alert: ContrailDeviceManagerProcessCritical
expr: >-
count(procstat_running{process_name="contrail-device-manager"} == 0) >= count(procstat_running{process_name="contrail-device-manager"}) *0.6
labels:
route: "email,salesforce"
severity: "critical"
service: "contrail-device-manager"
annotations:
description: "More than 60.0% of '{{ $labels.service }}' is down"
summary: "More than 60.0% of '{{ $labels.service }}' is down"
- alert: ContrailNodemgrConfigProcessInfo
expr: >-
procstat_running{process_name="contrail-nodemgr-config"} == 0
labels:
route: "email,salesforce"
severity: "info"
service: "contrail-nodemgr-config"
annotations:
description: "{{ $labels.service }} service is down on node {{ $labels.host }}"
summary: "{{ $labels.service }} service is down"
- alert: HAproxyRabbitmqClusterHTTPResponse5xx
expr: >-
rate(haproxy_http_response_5xx{sv="FRONTEND",proxy="rabbitmq_cluster"}[1m]) > 1
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "Too many 5xx HTTP errors have been detected on the '{{ $labels.proxy }}' proxy for the last 2 minutes ({{ $value }} error(s) per second)"
summary: "HTTP 5xx responses on '{{ $labels.proxy }}' proxy (host {{ $labels.host }})"
- alert: ContrailNodeManagerAPIDown
expr: >-
count(http_response_status{service=~"contrail.node.manager"} == 0) by (service) == count(http_response_status{service=~"contrail.node.manager"}) by (service)
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "{{ $labels.service }}"
annotations:
description: "All '{{ $labels.service }}' APIs are down"
summary: "All '{{ $labels.service }}' APIs are down"
- alert: HeatAPIServicesCritical
expr: >-
count(http_response_status{service=~"heat.*-api"} == 0) by (service) >= on (service) count(http_response_status{service=~"heat.*-api"}) by (service) * 0.6
for: 2m
labels:
route: "email,salesforce"
severity: "critical"
service: "{{ $labels.service }}"
annotations:
description: "{{ $value }} {{ $labels.service }} services are down for the last 2 minutes (More than 60.0%)"
summary: "More than 60.0% of {{ $labels.service }} services are down"
- alert: SaltMinionProcessDown
expr: >-
procstat_running{process_name="salt-minion"} == 0
labels:
route: "email,salesforce"
severity: "warning"
service: "salt-minion"
annotations:
description: "Salt-minion service is down on node {{ $labels.host }}"
summary: "Salt-minion service is down"
- alert: ContrailXMPPSessionsSomeDown
expr: >-
min(contrail_xmpp_session_down_count) by (host) > 0
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-control"
annotations:
description: "There are inactive XMPP sessions on node {{ $labels.host }}"
summary: "inactive XMPP sessions"
- alert: ContrailVrouterAPIWarning
expr: >-
count(http_response_status{service=~"contrail.vrouter"} == 0) by (service) >= count(http_response_status{service=~"contrail.vrouter"}) by (service) *0.3
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "{{ $labels.service }}"
annotations:
description: "More than 30.0% of '{{ $labels.service }}' is down"
summary: "More than 30.0% of '{{ $labels.service }}' is down"
- alert: HAproxyContrailDiscoveryBackendDown
expr: >-
max(haproxy_active_servers{sv="BACKEND",proxy="contrail_discovery"}) by (proxy) + max(haproxy_backup_servers{sv="BACKEND",proxy="contrail_discovery"}) by (proxy) == 0
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "The proxy '{{ $labels.proxy }}' has no active backend"
summary: "All backends are down for the '{{ $labels.proxy }}' proxy"
- alert: DockerServiceDashboardGrafanaReplicasDown
expr: >-
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="dashboard_grafana"}[1m])) == 0 or absent(docker_container_cpu_usage_percent{com_docker_swarm_service_name="dashboard_grafana"}) == 1
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "dashboard_grafana"
annotations:
description: "No replicas are running for the Docker Swarn service 'dashboard_grafana'. for 2 minutes"
summary: "Docker Swarm service dashboard_grafana down for 2 minutes"
- alert: HAproxyNeutronApiBackendWarning
expr: >-
max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="neutron_api"}[12h])) by (proxy) - min(haproxy_active_servers{sv="BACKEND",proxy="neutron_api"}) by (proxy) >= 1
for: 5m
labels:
route: "email,salesforce"
severity: "warning"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "{{ $value }} of backends are down for the '{{ $labels.proxy }}' proxy"
summary: "At least one backend is down for '{{ $labels.proxy }}' proxy for the last 5 minutes"
- alert: InfluxdbSeriesNumberTooHigh
expr: >-
influxdb_database_numSeries >= 1000000
labels:
route: "email,salesforce"
severity: "critical"
service: "influxdb"
annotations:
description: "The InfluxDB {{ $labels.database }} database has exceeded the maximum number of series (value={{ $value }},threshold=1000000)."
summary: "InfluxDB too many series for {{ $labels.database }}"
- alert: KafkaServerProcessInfo
expr: >-
procstat_running{process_name="kafka-server"} == 0
labels:
route: "email,salesforce"
severity: "info"
service: "kafka-server"
annotations:
description: "{{ $labels.service }} service is down on node {{ $labels.host }}"
summary: "{{ $labels.service }} service is down"
- alert: HAproxyGlanceApiBackendWarning
expr: >-
max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="glance_api"}[12h])) by (proxy) - min(haproxy_active_servers{sv="BACKEND",proxy="glance_api"}) by (proxy) >= 1
for: 5m
labels:
route: "email,salesforce"
severity: "warning"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "{{ $value }} of backends are down for the '{{ $labels.proxy }}' proxy"
summary: "At least one backend is down for '{{ $labels.proxy }}' proxy for the last 5 minutes"
- alert: DockerServiceMonitoringRelayReplicasDown
expr: >-
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="monitoring_relay"}[1m])) == 0 or absent(docker_container_cpu_usage_percent{com_docker_swarm_service_name="monitoring_relay"}) == 1
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "monitoring_relay"
annotations:
description: "No replicas are running for the Docker Swarn service 'monitoring_relay'. for 2 minutes"
summary: "Docker Swarm service monitoring_relay down for 2 minutes"
- alert: DockerServiceMonitoringRemoteCollectorReplicasDown
expr: >-
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="monitoring_remote_collector"}[1m])) == 0 or absent(docker_container_cpu_usage_percent{com_docker_swarm_service_name="monitoring_remote_collector"}) == 1
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "monitoring_remote_collector"
annotations:
description: "No replicas are running for the Docker Swarn service 'monitoring_remote_collector'. for 2 minutes"
summary: "Docker Swarm service monitoring_remote_collector down for 2 minutes"
- alert: HAproxyNeutronApiHTTPResponse5xx
expr: >-
rate(haproxy_http_response_5xx{sv="FRONTEND",proxy="neutron_api"}[1m]) > 1
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "Too many 5xx HTTP errors have been detected on the '{{ $labels.proxy }}' proxy for the last 2 minutes ({{ $value }} error(s) per second)"
summary: "HTTP 5xx responses on '{{ $labels.proxy }}' proxy (host {{ $labels.host }})"
- alert: HAproxyCinderApiBackendDown
expr: >-
max(haproxy_active_servers{sv="BACKEND",proxy="cinder_api"}) by (proxy) + max(haproxy_backup_servers{sv="BACKEND",proxy="cinder_api"}) by (proxy) == 0
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "The proxy '{{ $labels.proxy }}' has no active backend"
summary: "All backends are down for the '{{ $labels.proxy }}' proxy"
- alert: HAproxyAodhApiBackendWarning
expr: >-
max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="aodh-api"}[12h])) by (proxy) - min(haproxy_active_servers{sv="BACKEND",proxy="aodh-api"}) by (proxy) >= 1
for: 5m
labels:
route: "email,salesforce"
severity: "warning"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "{{ $value }} of backends are down for the '{{ $labels.proxy }}' proxy"
summary: "At least one backend is down for '{{ $labels.proxy }}' proxy for the last 5 minutes"
- alert: ContrailTopologyProcessWarning
expr: >-
count(procstat_running{process_name="contrail-topology"} == 0) >= count(procstat_running{process_name="contrail-topology"}) *0.3
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-topology"
annotations:
description: "More than 30.0% of '{{ $labels.service }}' is down"
summary: "More than 30.0% of '{{ $labels.service }}' is down"
- alert: ContrailDiscoveryAPICritical
expr: >-
count(http_response_status{service=~"contrail.discovery"} == 0) by (service) >= count(http_response_status{service=~"contrail.discovery"}) by (service) *0.6
for: 2m
labels:
route: "email,salesforce"
severity: "critical"
service: "{{ $labels.service }}"
annotations:
description: "More than 60.0% of '{{ $labels.service }}' is down"
summary: "More than 60.0% of '{{ $labels.service }}' is down"
- alert: KibanaProcessInfo
expr: >-
procstat_running{process_name="kibana"} == 0
labels:
route: "email,salesforce"
severity: "info"
service: "kibana"
annotations:
description: "Kibana service is down on node {{ $labels.host }}"
summary: "Kibana service is down"
- alert: HAproxyCinderApiBackendWarning
expr: >-
max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="cinder_api"}[12h])) by (proxy) - min(haproxy_active_servers{sv="BACKEND",proxy="cinder_api"}) by (proxy) >= 1
for: 5m
labels:
route: "email,salesforce"
severity: "warning"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "{{ $value }} of backends are down for the '{{ $labels.proxy }}' proxy"
summary: "At least one backend is down for '{{ $labels.proxy }}' proxy for the last 5 minutes"
- alert: SystemDiskInodesTooLow
expr: >-
predict_linear(disk_inodes_free[1h], 8*3600) < 0
for: 15m
labels:
route: "email,salesforce"
severity: "warning"
service: "system"
annotations:
description: "The disk inodes ({{ $labels.path }}) will be full in less than 8 hours on {{ $labels.host }}."
summary: "Free inodes for {{ $labels.path }} too low on {{ $labels.host }}"
- alert: KeystoneAPITooSlow
expr: >-
max by(host) (openstack_http_response_times{service='keystone',quantile="0.9",http_method=~"^(GET|POST)$",http_status=~"^2..$"}) >= 3.0
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "keystone"
annotations:
description: "The 90th percentile of the Keystone API response times for GET and POST requests is too high on node {{ $labels.host }} (current value={{ $value }}s, threshold=3.0s)."
summary: "Keystone API too slow"
- alert: ContrailNodemgrControlProcessDown
expr: >-
count(procstat_running{process_name="contrail-nodemgr-control"} == 0) == count(procstat_running{process_name="contrail-nodemgr-control"})
labels:
route: "email,salesforce"
severity: "down"
service: "contrail-nodemgr-control"
annotations:
description: "All '{{ $labels.service }}' services are down"
summary: "All '{{ $labels.service }}' services are down"
- alert: RemoteStorageAdapterSendingTooSlow
expr: >-
100.0 - (100.0 * sent_samples_total{job="remote_storage_adapter"} / on (job, instance) received_samples_total) > 10.0
labels:
route: "email,salesforce"
severity: "warning"
service: "remote_storage_adapter"
annotations:
description: "Remote storage adapter can not ingest samples fast enough on {{ $labels.instance }} (current value={{ $value }}%, threshold=10.0%)."
summary: "Remote storage adapter too slow on {{ $labels.instance }}"
- alert: HeatAPIServicesDown
expr: >-
count(http_response_status{service=~"heat.*-api"} == 0) by (service) == on (service) count(http_response_status{service=~"heat.*-api"}) by (service)
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "{{ $labels.service }}"
annotations:
description: "All {{ $labels.service }} services are down for the last 2 minutes"
summary: "All {{ $labels.service }} services are down"
- alert: HAproxyMysqlClusterBackendDown
expr: >-
max(haproxy_active_servers{sv="BACKEND",proxy="mysql_cluster"}) by (proxy) + max(haproxy_backup_servers{sv="BACKEND",proxy="mysql_cluster"}) by (proxy) == 0
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "The proxy '{{ $labels.proxy }}' has no active backend"
summary: "All backends are down for the '{{ $labels.proxy }}' proxy"
- alert: NeutronAPIServiceDown
expr: >-
http_response_status{service=~"neutron-api"} == 0
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "{{ $labels.service }}"
annotations:
description: "The HTTP check for '{{ $labels.service }}' is down on {{ $labels.host }} for 2 minutes."
summary: "HTTP check for '{{ $labels.service }}' down"
- alert: ContrailNodeManagerAPICritical
expr: >-
count(http_response_status{service=~"contrail.node.manager"} == 0) by (service) >= count(http_response_status{service=~"contrail.node.manager"}) by (service) *0.6
for: 2m
labels:
route: "email,salesforce"
severity: "critical"
service: "{{ $labels.service }}"
annotations:
description: "More than 60.0% of '{{ $labels.service }}' is down"
summary: "More than 60.0% of '{{ $labels.service }}' is down"
- alert: HAproxyKibanaHTTPResponse5xx
expr: >-
rate(haproxy_http_response_5xx{sv="FRONTEND",proxy="kibana"}[1m]) > 1
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "Too many 5xx HTTP errors have been detected on the '{{ $labels.proxy }}' proxy for the last 2 minutes ({{ $value }} error(s) per second)"
summary: "HTTP 5xx responses on '{{ $labels.proxy }}' proxy (host {{ $labels.host }})"
- alert: ContrailBGPSessionsNoneUp
expr: >-
max(contrail_bgp_session_up_count) by (host) == 0
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-control"
annotations:
description: "There are no active BGP sessions on node {{ $labels.host }}"
summary: "no active BGP sessions"
- alert: ContrailSchemaProcessCritical
expr: >-
count(procstat_running{process_name="contrail-schema"} == 0) >= count(procstat_running{process_name="contrail-schema"}) *0.6
labels:
route: "email,salesforce"
severity: "critical"
service: "contrail-schema"
annotations:
description: "More than 60.0% of '{{ $labels.service }}' is down"
summary: "More than 60.0% of '{{ $labels.service }}' is down"
- alert: ZookeeperInfo
expr: >-
zookeeper_up != 1
for: 2m
labels:
route: "email,salesforce"
severity: "info"
service: "zookeeper"
annotations:
description: "Zookeeper service is down on node {{ $labels.host }}."
summary: "Zookeeper service down"
- alert: DockerServiceMonitoringServerReplicasDown
expr: >-
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="monitoring_server"}[1m])) == 0 or absent(docker_container_cpu_usage_percent{com_docker_swarm_service_name="monitoring_server"}) == 1
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "monitoring_server"
annotations:
description: "No replicas are running for the Docker Swarn service 'monitoring_server'. for 2 minutes"
summary: "Docker Swarm service monitoring_server down for 2 minutes"
- alert: ContrailNodemgrVrouterProcessInfo
expr: >-
procstat_running{process_name="contrail-nodemgr-vrouter"} == 0
labels:
route: "email,salesforce"
severity: "info"
service: "contrail-nodemgr-vrouter"
annotations:
description: "{{ $labels.service }} service is down on node {{ $labels.host }}"
summary: "{{ $labels.service }} service is down"
- alert: RemoteStorageAdapterIgnoredTooHigh
expr: >-
100.0 * prometheus_influxdb_ignored_samples_total{job="remote_storage_adapter"} / on (job, instance) sent_samples_total > 5.0
labels:
route: "email,salesforce"
severity: "warning"
service: "remote_storage_adapter"
annotations:
description: "Remote storage adapter is receiving too many invalid metrics on {{ $labels.instance }} (current value={{ $value }}%, threshold=5.0%)."
summary: "Remote storage adapter receiving too many invalid metrics on {{ $labels.instance }}"
- alert: ContrailNodemgrDatabaseProcessWarning
expr: >-
count(procstat_running{process_name="contrail-nodemgr-database"} == 0) >= count(procstat_running{process_name="contrail-nodemgr-database"}) *0.3
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-nodemgr-database"
annotations:
description: "More than 30.0% of '{{ $labels.service }}' is down"
summary: "More than 30.0% of '{{ $labels.service }}' is down"
- alert: ContrailCollectorAPIInfo
expr: >-
http_response_status{service=~"contrail.collector"} == 0
for: 2m
labels:
route: "email,salesforce"
severity: "info"
service: "{{ $labels.service }}"
annotations:
description: "Endpoint check for '{{ $labels.service }}' is failed for 2 minutes on node {{ $labels.host }}"
summary: "Endpoint check for '{{ $labels.service }}' is failed"
- alert: ContrailNodemgrConfigProcessCritical
expr: >-
count(procstat_running{process_name="contrail-nodemgr-config"} == 0) >= count(procstat_running{process_name="contrail-nodemgr-config"}) *0.6
labels:
route: "email,salesforce"
severity: "critical"
service: "contrail-nodemgr-config"
annotations:
description: "More than 60.0% of '{{ $labels.service }}' is down"
summary: "More than 60.0% of '{{ $labels.service }}' is down"
- alert: ContrailAnalyticsApiProcessWarning
expr: >-
count(procstat_running{process_name="contrail-analytics-api"} == 0) >= count(procstat_running{process_name="contrail-analytics-api"}) *0.3
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-analytics-api"
annotations:
description: "More than 30.0% of '{{ $labels.service }}' is down"
summary: "More than 30.0% of '{{ $labels.service }}' is down"
- alert: DockerServiceDockerRegistryReplicasDown
expr: >-
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="docker_registry"}[1m])) == 0 or absent(docker_container_cpu_usage_percent{com_docker_swarm_service_name="docker_registry"}) == 1
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "docker_registry"
annotations:
description: "No replicas are running for the Docker Swarn service 'docker_registry'. for 2 minutes"
summary: "Docker Swarm service docker_registry down for 2 minutes"
- alert: ContrailIrondProcessInfo
expr: >-
procstat_running{process_name="contrail-irond"} == 0
labels:
route: "email,salesforce"
severity: "info"
service: "contrail-irond"
annotations:
description: "{{ $labels.service }} service is down on node {{ $labels.host }}"
summary: "{{ $labels.service }} service is down"
- alert: RabbitMQDiskFull
expr: >-
rabbitmq_node_disk_free <= rabbitmq_node_disk_free_limit
labels:
route: "email,salesforce"
severity: "critical"
service: "rabbitmq"
annotations:
description: "All producers are blocked because the RabbitMQ disk partition is full on node {{ $labels.host }}."
summary: "RabbitMQ producers blocked due to full disk"
- alert: ContrailSupervisordVrouterProcessCritical
expr: >-
count(procstat_running{process_name="contrail-supervisord-vrouter"} == 0) >= count(procstat_running{process_name="contrail-supervisord-vrouter"}) *0.6
labels:
route: "email,salesforce"
severity: "critical"
service: "contrail-supervisord-vrouter"
annotations:
description: "More than 60.0% of '{{ $labels.service }}' is down"
summary: "More than 60.0% of '{{ $labels.service }}' is down"
- alert: ContrailDiscoveryAPIDown
expr: >-
count(http_response_status{service=~"contrail.discovery"} == 0) by (service) == count(http_response_status{service=~"contrail.discovery"}) by (service)
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "{{ $labels.service }}"
annotations:
description: "All '{{ $labels.service }}' APIs are down"
summary: "All '{{ $labels.service }}' APIs are down"
- alert: ContrailVrouterDNSXMPPSessionsTooManyVariations
expr: >-
abs(delta(contrail_vrouter_dns_xmpp[2m])) >= 5
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-compute"
annotations:
description: "There are too many vRouter DNS-XMPP sessions changes on node {{ $labels.host }} (current value={{ $value }}, threshold=5)"
summary: "Number of vRouter DNS-XMPP sessions changed between checks is too high"
- alert: DockerServiceSecurityMonkeySecurityauditapiReplicasDown
expr: >-
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="security_monkey_security-audit-api"}[1m])) == 0 or absent(docker_container_cpu_usage_percent{com_docker_swarm_service_name="security_monkey_security-audit-api"}) == 1
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "security_monkey_security-audit-api"
annotations:
description: "No replicas are running for the Docker Swarn service 'security_monkey_security-audit-api'. for 2 minutes"
summary: "Docker Swarm service security_monkey_security-audit-api down for 2 minutes"
- alert: GaleraServiceDown
expr: >-
mysql_up != 1
labels:
route: "email,salesforce"
severity: "warning"
service: "mysql"
annotations:
description: "Galera service is down on node {{ $labels.host }}"
summary: "Galera service down"
- alert: ContrailNodemgrProcessCritical
expr: >-
count(procstat_running{process_name="contrail-nodemgr"} == 0) >= count(procstat_running{process_name="contrail-nodemgr"}) *0.6
labels:
route: "email,salesforce"
severity: "critical"
service: "contrail-nodemgr"
annotations:
description: "More than 60.0% of '{{ $labels.service }}' is down"
summary: "More than 60.0% of '{{ $labels.service }}' is down"
- alert: ContrailSupervisordAnalyticsProcessInfo
expr: >-
procstat_running{process_name="contrail-supervisord-analytics"} == 0
labels:
route: "email,salesforce"
severity: "info"
service: "contrail-supervisord-analytics"
annotations:
description: "{{ $labels.service }} service is down on node {{ $labels.host }}"
summary: "{{ $labels.service }} service is down"
- alert: GaleraNodeNotConnected
expr: >-
mysql_wsrep_connected != 1
for: 1m
labels:
route: "email,salesforce"
severity: "warning"
service: "mysql"
annotations:
description: "The Galera service on {{ $labels.host }} is not connected to the cluster."
summary: "Galera on {{ $labels.host }} not connected"
- alert: ContrailDeviceManagerProcessDown
expr: >-
count(procstat_running{process_name="contrail-device-manager"} == 0) == count(procstat_running{process_name="contrail-device-manager"})
labels:
route: "email,salesforce"
severity: "down"
service: "contrail-device-manager"
annotations:
description: "All '{{ $labels.service }}' services are down"
summary: "All '{{ $labels.service }}' services are down"
- alert: ContrailSvcMonitorProcessDown
expr: >-
count(procstat_running{process_name="contrail-svc-monitor"} == 0) == count(procstat_running{process_name="contrail-svc-monitor"})
labels:
route: "email,salesforce"
severity: "down"
service: "contrail-svc-monitor"
annotations:
description: "All '{{ $labels.service }}' services are down"
summary: "All '{{ $labels.service }}' services are down"
- alert: DockerServiceMonitoringRelayWarningReplicasNumber
expr: >-
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="monitoring_relay"}[1m])) <= 2 * 0.7
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "monitoring_relay"
annotations:
description: "{{ $value }}/2 replicas are running for the Docker Swarn service 'monitoring_relay' for 2 minutes."
summary: "Docker Swarm service monitoring_relay invalid number of replicas for 2 minutes"
- alert: HAproxyContrailDiscoveryBackendWarning
expr: >-
max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="contrail_discovery"}[12h])) by (proxy) - min(haproxy_active_servers{sv="BACKEND",proxy="contrail_discovery"}) by (proxy) >= 1
for: 5m
labels:
route: "email,salesforce"
severity: "warning"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "{{ $value }} of backends are down for the '{{ $labels.proxy }}' proxy"
summary: "At least one backend is down for '{{ $labels.proxy }}' proxy for the last 5 minutes"
- alert: ContrailControlProcessInfo
expr: >-
procstat_running{process_name="contrail-control"} == 0
labels:
route: "email,salesforce"
severity: "info"
service: "contrail-control"
annotations:
description: "{{ $labels.service }} service is down on node {{ $labels.host }}"
summary: "{{ $labels.service }} service is down"
- alert: ContrailSupervisordDatabaseProcessInfo
expr: >-
procstat_running{process_name="contrail-supervisord-database"} == 0
labels:
route: "email,salesforce"
severity: "info"
service: "contrail-supervisord-database"
annotations:
description: "{{ $labels.service }} service is down on node {{ $labels.host }}"
summary: "{{ $labels.service }} service is down"
- alert: ElasticsearchClusterDiskHighWaterMark
expr: >-
(max(elasticsearch_fs_total_total_in_bytes) by (host, instance) - max(elasticsearch_fs_total_available_in_bytes) by (host, instance)) / max(elasticsearch_fs_total_total_in_bytes) by (host, instance) * 100.0 >= 90
for: 5m
labels:
route: "email,salesforce"
severity: "critical"
service: "elasticsearch"
annotations:
description: "Elasticsearch will not allocate new shards to node {{ $labels.host }} and will attempt to relocate shards to another node"
summary: "Elasticsearch high disk watermark [90%] exceeded on node {{ $labels.host}} instance {{ $labels.instance }}"
- alert: HAproxyHeatCloudwatchApiBackendWarning
expr: >-
max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="heat_cloudwatch_api"}[12h])) by (proxy) - min(haproxy_active_servers{sv="BACKEND",proxy="heat_cloudwatch_api"}) by (proxy) >= 1
for: 5m
labels:
route: "email,salesforce"
severity: "warning"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "{{ $value }} of backends are down for the '{{ $labels.proxy }}' proxy"
summary: "At least one backend is down for '{{ $labels.proxy }}' proxy for the last 5 minutes"
- alert: HAproxyGlanceApiBackendCritical
expr: >-
(max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="glance_api"}[12h])) by (proxy)
- min (haproxy_active_servers{sv="BACKEND",proxy="glance_api"}) by (proxy)
) / max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="glance_api"}[12h])) by (proxy) * 100 >= 50
for: 5m
labels:
route: "email,salesforce"
severity: "critical"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "{{ $value }}% of backends are down for the '{{ $labels.proxy }}' proxy"
summary: "Less than 50% of backends are up for the '{{ $labels.proxy }}' proxy for the last 5 minutes"
- alert: ContrailDiscoveryAPIInfo
expr: >-
http_response_status{service=~"contrail.discovery"} == 0
for: 2m
labels:
route: "email,salesforce"
severity: "info"
service: "{{ $labels.service }}"
annotations:
description: "Endpoint check for '{{ $labels.service }}' is failed for 2 minutes on node {{ $labels.host }}"
summary: "Endpoint check for '{{ $labels.service }}' is failed"
- alert: ContrailDnsProcessInfo
expr: >-
procstat_running{process_name="contrail-dns"} == 0
labels:
route: "email,salesforce"
severity: "info"
service: "contrail-dns"
annotations:
description: "{{ $labels.service }} service is down on node {{ $labels.host }}"
summary: "{{ $labels.service }} service is down"
- alert: DockerServiceHceHceapiReplicasDown
expr: >-
count(count_over_time(docker_container_cpu_usage_percent{com_docker_swarm_service_name="hce_hce-api"}[1m])) == 0 or absent(docker_container_cpu_usage_percent{com_docker_swarm_service_name="hce_hce-api"}) == 1
for: 2m
labels:
route: "email,salesforce"
severity: "down"
service: "hce_hce-api"
annotations:
description: "No replicas are running for the Docker Swarn service 'hce_hce-api'. for 2 minutes"
summary: "Docker Swarm service hce_hce-api down for 2 minutes"
- alert: HAproxyElasticsearchBinaryBackendCritical
expr: >-
(max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="elasticsearch_binary"}[12h])) by (proxy)
- min (haproxy_active_servers{sv="BACKEND",proxy="elasticsearch_binary"}) by (proxy)
) / max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="elasticsearch_binary"}[12h])) by (proxy) * 100 >= 50
for: 5m
labels:
route: "email,salesforce"
severity: "critical"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "{{ $value }}% of backends are down for the '{{ $labels.proxy }}' proxy"
summary: "Less than 50% of backends are up for the '{{ $labels.proxy }}' proxy for the last 5 minutes"
- alert: ContrailVrouterAgentProcessDown
expr: >-
count(procstat_running{process_name="contrail-vrouter-agent"} == 0) == count(procstat_running{process_name="contrail-vrouter-agent"})
labels:
route: "email,salesforce"
severity: "down"
service: "contrail-vrouter-agent"
annotations:
description: "All '{{ $labels.service }}' services are down"
summary: "All '{{ $labels.service }}' services are down"
- alert: NovaServicesWarning
expr: >-
openstack_nova_services{state="down",service=~"nova-cert|nova-conductor|nova-consoleauth|nova-scheduler"} >= on (service) sum(openstack_nova_services{service=~"nova-cert|nova-conductor|nova-consoleauth|nova-scheduler"}) by (service) * 0.3
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "{{ $labels.service }}"
annotations:
description: "More than 30.0% of {{ $labels.service }} services are down for the last 2 minutes"
summary: "More than 30.0% of {{ $labels.service }} services are down"
- alert: HAproxyAodhApiHTTPResponse5xx
expr: >-
rate(haproxy_http_response_5xx{sv="FRONTEND",proxy="aodh-api"}[1m]) > 1
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "Too many 5xx HTTP errors have been detected on the '{{ $labels.proxy }}' proxy for the last 2 minutes ({{ $value }} error(s) per second)"
summary: "HTTP 5xx responses on '{{ $labels.proxy }}' proxy (host {{ $labels.host }})"
- alert: HAproxyNovaApiHTTPResponse5xx
expr: >-
rate(haproxy_http_response_5xx{sv="FRONTEND",proxy="nova_api"}[1m]) > 1
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "Too many 5xx HTTP errors have been detected on the '{{ $labels.proxy }}' proxy for the last 2 minutes ({{ $value }} error(s) per second)"
summary: "HTTP 5xx responses on '{{ $labels.proxy }}' proxy (host {{ $labels.host }})"
- alert: NginxDown
expr: >-
nginx_up != 1
labels:
route: "email,salesforce"
severity: "warning"
service: "nginx"
annotations:
description: "Nginx service is down on node {{ $labels.host }}"
summary: "Nginx service down"
- alert: ContrailBGPSessionsNone
expr: >-
max(contrail_bgp_session_count) by (host) == 0
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-control"
annotations:
description: "There are no BGP sessions on node {{ $labels.host }}"
summary: "No BGP sessions"
- alert: HAproxyHeatApiHTTPResponse5xx
expr: >-
rate(haproxy_http_response_5xx{sv="FRONTEND",proxy="heat_api"}[1m]) > 1
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "Too many 5xx HTTP errors have been detected on the '{{ $labels.proxy }}' proxy for the last 2 minutes ({{ $value }} error(s) per second)"
summary: "HTTP 5xx responses on '{{ $labels.proxy }}' proxy (host {{ $labels.host }})"
- alert: KeepalivedProcessDown
expr: >-
procstat_running{process_name="keepalived"} == 0
labels:
route: "email,salesforce"
severity: "warning"
service: "keepalived"
annotations:
description: "Keepalived service is down on node {{ $labels.host }}"
summary: "Keepalived service is down"
- alert: InfluxdbSeriesNumberHigh
expr: >-
influxdb_database_numSeries >= 950000.0
labels:
route: "email,salesforce"
severity: "warning"
service: "influxdb"
annotations:
description: "The InfluxDB {{ $labels.database }} database is getting close to the maximum number of series (value={{ $value }},threshold=950000.0)."
summary: "InfluxDB high number of series for {{ $labels.database }}"
- alert: SystemSwapOut
expr: >-
rate(swap_out[2m]) > 1048576
labels:
route: "email,salesforce"
severity: "warning"
service: "system"
annotations:
description: "The rate of swap output bytes is too high on node {{ $labels.host }} (current value={{ $value }}b/s, threshold=1048576b/s)."
summary: "Swap output throughput too high on {{ $labels.host }}"
- alert: HAproxyHeatCloudwatchApiHTTPResponse5xx
expr: >-
rate(haproxy_http_response_5xx{sv="FRONTEND",proxy="heat_cloudwatch_api"}[1m]) > 1
for: 2m
labels:
route: "email,salesforce"
severity: "warning"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "Too many 5xx HTTP errors have been detected on the '{{ $labels.proxy }}' proxy for the last 2 minutes ({{ $value }} error(s) per second)"
summary: "HTTP 5xx responses on '{{ $labels.proxy }}' proxy (host {{ $labels.host }})"
- alert: SystemLoad5TooHigh
expr: >-
system_load5 / system_n_cpus > 3
labels:
route: "email,salesforce"
severity: "warning"
service: "system"
annotations:
description: "The 5-minutes system load is too high on node {{ $labels.host }} (current value={{ $value }}, threshold=3)."
summary: "High system load (5m) on {{ $labels.host }}"
- alert: HAproxyRabbitmqClusterBackendCritical
expr: >-
(max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="rabbitmq_cluster"}[12h])) by (proxy)
- min (haproxy_active_servers{sv="BACKEND",proxy="rabbitmq_cluster"}) by (proxy)
) / max(max_over_time(haproxy_active_servers{sv="BACKEND",proxy="rabbitmq_cluster"}[12h])) by (proxy) * 100 >= 50
for: 5m
labels:
route: "email,salesforce"
severity: "critical"
service: "haproxy/{{ $labels.proxy }}"
annotations:
description: "{{ $value }}% of backends are down for the '{{ $labels.proxy }}' proxy"
summary: "Less than 50% of backends are up for the '{{ $labels.proxy }}' proxy for the last 5 minutes"
- alert: ContrailSupervisordDatabaseProcessWarning
expr: >-
count(procstat_running{process_name="contrail-supervisord-database"} == 0) >= count(procstat_running{process_name="contrail-supervisord-database"}) *0.3
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-supervisord-database"
annotations:
description: "More than 30.0% of '{{ $labels.service }}' is down"
summary: "More than 30.0% of '{{ $labels.service }}' is down"
- alert: ContrailNodemgrVrouterProcessWarning
expr: >-
count(procstat_running{process_name="contrail-nodemgr-vrouter"} == 0) >= count(procstat_running{process_name="contrail-nodemgr-vrouter"}) *0.3
labels:
route: "email,salesforce"
severity: "warning"
service: "contrail-nodemgr-vrouter"
annotations:
description: "More than 30.0% of '{{ $labels.service }}' is down"
summary: "More than 30.0% of '{{ $labels.service }}' is down"
- alert: ApacheDown
expr: >-
apache_up != 1
labels:
route: "email,salesforce"
severity: "warning"
service: "apache"
annotations:
description: "Apache service is down on node {{ $labels.host }}"
summary: "Apache service down"
- alert: ContrailSupervisordVrouterProcessDown
expr: >-
count(procstat_running{process_name="contrail-supervisord-vrouter"} == 0) == count(procstat_running{process_name="contrail-supervisord-vrouter"})
labels:
route: "email,salesforce"
severity: "down"
service: "contrail-supervisord-vrouter"
annotations:
description: "All '{{ $labels.service }}' services are down"
summary: "All '{{ $labels.service }}' services are down"