/etc/prometheus/rules/prometheus-prometheus-prometheus-oper-prometheus-rulefiles-0/laika-infrastructure-prometheus-prometheus-oper-alertmanager.rules.yaml > alertmanager.rules
alert: AlertmanagerConfigInconsistent
expr: |
  count_values by(service) ("config_hash", alertmanager_config_hash{job="prometheus-prometheus-oper-alertmanager",namespace="laika-infrastructure"})
    / on(service) group_left()
  label_replace(
    max by(name, job, namespace, controller) (prometheus_operator_spec_replicas{controller="alertmanager",job="prometheus-prometheus-oper-operator",namespace="laika-infrastructure"}),
    "service", "$1", "name", "(.*)")
  != 1
for: 5m
labels:
  severity: critical
annotations:
  message: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync.
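To check how far apart the replicas are, the numerator of the expression above can be run on its own; a single series per service, with a value equal to the replica count, means all replicas agree on one config hash:

  count_values by(service) ("config_hash",
    alertmanager_config_hash{job="prometheus-prometheus-oper-alertmanager",namespace="laika-infrastructure"})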
/etc/prometheus/rules/prometheus-prometheus-prometheus-oper-prometheus-rulefiles-0/laika-infrastructure-prometheus-prometheus-oper-general.rules.yaml > general.rules
Labels: {alertname="TargetDown", job="kube-proxy", namespace="kube-system", service="prometheus-prometheus-oper-kube-proxy", severity="warning"}
State: firing
Active Since: 2020-05-18 21:17:44 +0000 UTC
Value: 100
Annotations:
  message: 100% of the kube-proxy/prometheus-prometheus-oper-kube-proxy targets in kube-system namespace are down.

Labels: {alertname="TargetDown", job="assets", namespace="apps", service="assets", severity="warning"}
State: firing
Active Since: 2023-11-23 07:01:44.437701098 +0000 UTC
Value: 28.846153846153843
Annotations:
  message: 28.85% of the assets/assets targets in apps namespace are down.

Labels: {alertname="TargetDown", job="life", namespace="apps", service="life", severity="warning"}
State: firing
Active Since: 2024-06-24 05:27:44.437701098 +0000 UTC
Value: 50
Annotations:
  message: 50% of the life/life targets in apps namespace are down.

Labels: {alertname="TargetDown", job="sharingui", namespace="apps", service="sharingui", severity="warning"}
State: firing
Active Since: 2021-03-11 18:09:14 +0000 UTC
Value: 43.24324324324324
Annotations:
  message: 43.24% of the sharingui/sharingui targets in apps namespace are down.

Labels: {alertname="TargetDown", job="nginx-ingress-controller-metrics", namespace="nginx-ingress", service="nginx-ingress-controller-metrics", severity="warning"}
State: firing
Active Since: 2023-09-06 10:23:44.437701098 +0000 UTC
Value: 50
Annotations:
  message: 50% of the nginx-ingress-controller-metrics/nginx-ingress-controller-metrics targets in nginx-ingress namespace are down.
alert: Watchdog
expr: vector(1)
labels:
  severity: none
annotations:
  message: |
    This is an alert meant to ensure that the entire alerting pipeline is functional.
    This alert is always firing, therefore it should always be firing in Alertmanager
    and always fire against a receiver. There are integrations with various notification
    mechanisms that send a notification when this alert is not firing. For example the
    "DeadMansSnitch" integration in PagerDuty.
Labels: {alertname="Watchdog", severity="none"}
State: firing
Active Since: 2023-01-19 10:24:14.437701098 +0000 UTC
Value: 1
Annotations:
  message: This is an alert meant to ensure that the entire alerting pipeline is functional. This alert is always firing, therefore it should always be firing in Alertmanager and always fire against a receiver. There are integrations with various notification mechanisms that send a notification when this alert is not firing. For example the "DeadMansSnitch" integration in PagerDuty.
/etc/prometheus/rules/prometheus-prometheus-prometheus-oper-prometheus-rulefiles-0/laika-infrastructure-prometheus-prometheus-oper-kube-apiserver-slos.yaml > kube-apiserver-slos
/etc/prometheus/rules/prometheus-prometheus-prometheus-oper-prometheus-rulefiles-0/laika-infrastructure-prometheus-prometheus-oper-kube-state-metrics.yaml > kube-state-metrics
/etc/prometheus/rules/prometheus-prometheus-prometheus-oper-prometheus-rulefiles-0/laika-infrastructure-prometheus-prometheus-oper-kubernetes-apps.yaml > kubernetes-apps
Labels: {alertname="KubeCronJobRunning", cronjob="similarityrecognition", endpoint="http", instance="172.44.14.221:8080", job="kube-state-metrics", namespace="apps", pod="prometheus-kube-state-metrics-78b7d687c5-wms4l", service="prometheus-kube-state-metrics", severity="warning"}
State: firing
Active Since: 2022-09-13 09:17:56 +0000 UTC
Value: 1.1549891666700006e+08
Annotations:
  message: CronJob apps/similarityrecognition is taking more than 1h to complete.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecronjobrunning
Labels: {alertname="KubeDaemonSetRolloutStuck", daemonset="nginx-ingress-controller", endpoint="http", instance="172.44.14.221:8080", job="kube-state-metrics", namespace="nginx-ingress", pod="prometheus-kube-state-metrics-78b7d687c5-wms4l", service="prometheus-kube-state-metrics", severity="critical"}
State: firing
Active Since: 2023-09-06 10:23:56.66725083 +0000 UTC
Value: 0.5
Annotations:
  message: Only 50% of the desired Pods of DaemonSet nginx-ingress/nginx-ingress-controller are scheduled and ready.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
Labels: {alertname="KubeHpaMaxedOut", endpoint="http", hpa="data", instance="172.44.14.221:8080", job="kube-state-metrics", namespace="apps", pod="prometheus-kube-state-metrics-78b7d687c5-wms4l", service="prometheus-kube-state-metrics", severity="warning"}
State: firing
Active Since: 2023-06-23 18:18:56.66725083 +0000 UTC
Value: 4
Annotations:
  message: HPA apps/data has been running at max replicas for longer than 15 minutes.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpamaxedout

Labels: {alertname="KubeHpaMaxedOut", endpoint="http", hpa="media", instance="172.44.14.221:8080", job="kube-state-metrics", namespace="apps", pod="prometheus-kube-state-metrics-78b7d687c5-wms4l", service="prometheus-kube-state-metrics", severity="warning"}
State: firing
Active Since: 2023-06-23 18:19:56.66725083 +0000 UTC
Value: 4
Annotations:
  message: HPA apps/media has been running at max replicas for longer than 15 minutes.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpamaxedout
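The value (4) is the replica count at which each HPA is pinned. To list every HPA currently sitting at its ceiling, current and max replicas can be compared; a sketch assuming the older kube_hpa_* metric names from kube-state-metrics (newer releases renamed these to kube_horizontalpodautoscaler_*):

  kube_hpa_status_current_replicas == kube_hpa_spec_max_replicas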
Labels: {alertname="KubeJobCompletion", endpoint="http", instance="172.44.14.221:8080", job="kube-state-metrics", job_name="similarityrecognition-1604358000", namespace="apps", pod="prometheus-kube-state-metrics-78b7d687c5-wms4l", service="prometheus-kube-state-metrics", severity="warning"}
State: firing
Active Since: 2022-09-13 09:17:56 +0000 UTC
Value: 1
Annotations:
  message: Job apps/similarityrecognition-1604358000 is taking more than one hour to complete.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion
Labels: {alertname="KubePodCrashLooping", container="nginx-ingress-controller", endpoint="http", instance="172.44.14.221:8080", job="kube-state-metrics", namespace="nginx-ingress", pod="nginx-ingress-controller-kgt7z", service="prometheus-kube-state-metrics", severity="critical"}
State: firing
Active Since: 2023-09-06 10:24:26.66725083 +0000 UTC
Value: 1.0344827586206897
Annotations:
  message: Pod nginx-ingress/nginx-ingress-controller-kgt7z (nginx-ingress-controller) is restarting 1.03 times / 5 minutes.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping

Labels: {alertname="KubePodCrashLooping", container="life", endpoint="http", instance="172.44.14.221:8080", job="kube-state-metrics", namespace="apps", pod="life-5b7dcc96b6-xpxzn", service="prometheus-kube-state-metrics", severity="critical"}
State: firing
Active Since: 2024-06-27 05:15:26.66725083 +0000 UTC
Value: 1.3793103448275863
Annotations:
  message: Pod apps/life-5b7dcc96b6-xpxzn (life) is restarting 1.38 times / 5 minutes.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping

Labels: {alertname="KubePodCrashLooping", container="nginx-ingress-controller", endpoint="http", instance="172.44.14.221:8080", job="kube-state-metrics", namespace="nginx-ingress", pod="nginx-ingress-controller-vztpf", service="prometheus-kube-state-metrics", severity="critical"}
State: firing
Active Since: 2023-09-06 11:55:26.66725083 +0000 UTC
Value: 1.0344827586206897
Annotations:
  message: Pod nginx-ingress/nginx-ingress-controller-vztpf (nginx-ingress-controller) is restarting 1.03 times / 5 minutes.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
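The value here is restarts per five minutes, derived from the container restart counter. To see raw restart counts over a longer window when triaging which pods are flapping, a sketch:

  # total container restarts per pod over the past hour
  sum by (namespace, pod) (increase(kube_pod_container_status_restarts_total[1h])) > 0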
Labels: {alertname="KubeContainerWaiting", container="nginx-ingress-controller", namespace="nginx-ingress", pod="nginx-ingress-controller-kgt7z", severity="warning"}
State: pending
Active Since: 2024-07-02 03:15:56.66725083 +0000 UTC
Value: 1
Annotations:
  message: Pod nginx-ingress/nginx-ingress-controller-kgt7z container nginx-ingress-controller has been in waiting state for longer than 1 hour.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting

Labels: {alertname="KubeContainerWaiting", container="life", namespace="apps", pod="life-5b7dcc96b6-xpxzn", severity="warning"}
State: pending
Active Since: 2024-07-02 03:20:26.66725083 +0000 UTC
Value: 1
Annotations:
  message: Pod apps/life-5b7dcc96b6-xpxzn container life has been in waiting state for longer than 1 hour.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting

Labels: {alertname="KubeContainerWaiting", container="nginx-ingress-controller", namespace="nginx-ingress", pod="nginx-ingress-controller-vztpf", severity="warning"}
State: pending
Active Since: 2024-07-02 03:21:56.66725083 +0000 UTC
Value: 1
Annotations:
  message: Pod nginx-ingress/nginx-ingress-controller-vztpf container nginx-ingress-controller has been in waiting state for longer than 1 hour.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting
/etc/prometheus/rules/prometheus-prometheus-prometheus-oper-prometheus-rulefiles-0/laika-infrastructure-prometheus-prometheus-oper-kubernetes-resources.yaml > kubernetes-resources
Labels: {alertname="CPUThrottlingHigh", container="life", namespace="apps", pod="life-5b7dcc96b6-xpxzn", severity="warning"}
State: pending
Active Since: 2024-07-02 03:18:16.656134993 +0000 UTC
Value: 0.3263456090651558
Annotations:
  message: 32.63% throttling of CPU in namespace apps for container life in pod life-5b7dcc96b6-xpxzn.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
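The value is the fraction of CFS scheduling periods in which the container was throttled. The upstream mixin derives it from cAdvisor's CFS counters; a sketch of the underlying ratio (the window and grouping labels are assumptions, since the deployed expression is not shown, and older cAdvisor versions label the container as container_name):

  sum by (container, pod, namespace) (increase(container_cpu_cfs_throttled_periods_total[5m]))
    /
  sum by (container, pod, namespace) (increase(container_cpu_cfs_periods_total[5m]))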
/etc/prometheus/rules/prometheus-prometheus-prometheus-oper-prometheus-rulefiles-0/laika-infrastructure-prometheus-prometheus-oper-kubernetes-storage.yaml > kubernetes-storage
/etc/prometheus/rules/prometheus-prometheus-prometheus-oper-prometheus-rulefiles-0/laika-infrastructure-prometheus-prometheus-oper-kubernetes-system-apiserver.yaml > kubernetes-system-apiserver
alert: AggregatedAPIErrors
expr: sum by(name, namespace) (increase(aggregator_unavailable_apiservice_count[5m])) > 2
labels:
  severity: warning
annotations:
  message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. The number of errors have increased for it in the past five minutes. High values indicate that the availability of the service changes too often.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapierrors
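Dropping the > 2 threshold from the expression shows the current per-API error increase directly, which helps gauge how close each aggregated API is to alerting:

  sum by(name, namespace) (increase(aggregator_unavailable_apiservice_count[5m]))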
/etc/prometheus/rules/prometheus-prometheus-prometheus-oper-prometheus-rulefiles-0/laika-infrastructure-prometheus-prometheus-oper-kubernetes-system-kubelet.yaml > kubernetes-system-kubelet
/etc/prometheus/rules/prometheus-prometheus-prometheus-oper-prometheus-rulefiles-0/laika-infrastructure-prometheus-prometheus-oper-kubernetes-system-scheduler.yaml > kubernetes-system-scheduler
alert: KubeSchedulerDown
expr: absent(up{job="kube-scheduler"} == 1)
for: 15m
labels:
  severity: critical
annotations:
  message: KubeScheduler has disappeared from Prometheus target discovery.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown
Labels: {alertname="KubeSchedulerDown", severity="critical"}
State: firing
Active Since: 2020-05-18 21:17:24 +0000 UTC
Value: 1
Annotations:
  message: KubeScheduler has disappeared from Prometheus target discovery.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown
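absent(up{job="kube-scheduler"} == 1) returns 1 exactly when no kube-scheduler target is both discovered and up, which is the firing value shown above. To tell a missing target apart from a failing scrape, query the series directly:

  up{job="kube-scheduler"}
  # empty result: no kube-scheduler target is discovered at all
  # value 0: the target is discovered but its scrapes are failing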
/etc/prometheus/rules/prometheus-prometheus-prometheus-oper-prometheus-rulefiles-0/laika-infrastructure-prometheus-prometheus-oper-kubernetes-system.yaml > kubernetes-system
/etc/prometheus/rules/prometheus-prometheus-prometheus-oper-prometheus-rulefiles-0/laika-infrastructure-prometheus-prometheus-oper-node-exporter.yaml > node-exporter
alert: NodeClockNotSynchronising
expr: min_over_time(node_timex_sync_status[5m]) == 0
for: 10m
labels:
  severity: warning
annotations:
  message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeclocknotsynchronising
  summary: Clock not synchronising.
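node_timex_sync_status reports whether the kernel considers its clock synchronised to a reliable source (1) or not (0), so this rule fires when a node has reported 0 for an entire 5-minute window, sustained for 10 minutes. A sketch for inspecting a single node (the instance value is a placeholder):

  min_over_time(node_timex_sync_status{instance="<node>:9100"}[5m])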
alert: NodeNetworkReceiveErrs
expr: increase(node_network_receive_errs_total[2m]) > 10
for: 1h
labels:
  severity: warning
annotations:
  description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodenetworkreceiveerrs
  summary: Network interface is reporting many receive errors.
alert: NodeNetworkTransmitErrs
expr: increase(node_network_transmit_errs_total[2m]) > 10
for: 1h
labels:
  severity: warning
annotations:
  description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodenetworktransmiterrs
  summary: Network interface is reporting many transmit errors.
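Both rules share the same shape: a counter increase over a two-minute window that must stay above 10 for a full hour before firing. A sketch for checking both directions per interface at once:

  # receive and transmit errors per interface over the last 2m
  sum by (instance, device) (increase(node_network_receive_errs_total[2m]))
  sum by (instance, device) (increase(node_network_transmit_errs_total[2m]))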
/etc/prometheus/rules/prometheus-prometheus-prometheus-oper-prometheus-rulefiles-0/laika-infrastructure-prometheus-prometheus-oper-node-network.yaml > node-network
/etc/prometheus/rules/prometheus-prometheus-prometheus-oper-prometheus-rulefiles-0/laika-infrastructure-prometheus-prometheus-oper-prometheus-operator.yaml > prometheus-operator
/etc/prometheus/rules/prometheus-prometheus-prometheus-oper-prometheus-rulefiles-0/laika-infrastructure-prometheus-prometheus-oper-prometheus.yaml > prometheus
Labels: {alertname="PrometheusNotConnectedToAlertmanagers", endpoint="web", instance="172.44.12.88:9090", job="prometheus-prometheus-oper-prometheus", namespace="laika-infrastructure", pod="prometheus-prometheus-prometheus-oper-prometheus-0", service="prometheus-prometheus-oper-prometheus", severity="warning"}
State: firing
Active Since: 2023-01-19 10:24:04.24694101 +0000 UTC
Value: 0
Annotations:
  description: Prometheus laika-infrastructure/prometheus-prometheus-prometheus-oper-prometheus-0 is not connected to any Alertmanagers.
  summary: Prometheus is not connected to any Alertmanagers.
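Prometheus exports the number of Alertmanagers it has discovered as a gauge; the firing value of 0 above corresponds to that series. A direct check:

  prometheus_notifications_alertmanagers_discovered{job="prometheus-prometheus-oper-prometheus"}

A value of 0 here, with the Watchdog alert still firing inside Prometheus, would mean alerts are being evaluated but never delivered to any receiver.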