# Alerting rule group for Kubernetes node, pod and deployment health, laid out
# in the Prometheus rule-file structure (groups -> rules -> alert/expr/for/...).
#
# NOTE(review): several rules below set an `alertname` label whose value differs
# from the rule's `alert:` name. Prometheus is expected to set `alertname` from
# `alert:`; confirm whether these label overrides have any effect in this setup.
groups:
  - name: "kubernetes"
    rules:
      # Node: the Ready condition's status="true" series reads 0 for 1 minute.
      - alert: "kube_node状态异常告警"
        expr: kube_node_status_condition{condition="Ready",status="true"} == 0
        for: 1m
        labels:
          severity: "critical"
        annotations:
          description: "node节点:{{$labels.node}}状态异常"
          summary: "kube_node 状态异常!"

      # Node: carries the node.kubernetes.io/unreachable NoSchedule taint for 5m.
      - alert: "pod不可调度告警"
        expr: kube_node_spec_taint{key="node.kubernetes.io/unreachable",effect="NoSchedule"} == 1
        for: 5m
        labels:
          severity: "critical"
        annotations:
          description: "node节点:{{$labels.node}}不可调度"
          summary: "pod不可调度"

      # Node: running pods as a percentage of the node's pod capacity is >= 80.
      # The description text states the same 80% threshold as the expression.
      # NOTE(review): the numerator keeps `dept` while the denominator does not;
      # if `dept` is non-empty the division may find no matching series - verify.
      - alert: "node可运行pod数量不足告警"
        expr: >-
          count by(node,project,env,dept) (
          (kube_pod_status_phase{name=~".+",phase="Running"} == 1)
          * on(instance,pod,namespace,name,dept) group_left(node)
          topk by(instance,pod,namespace,name) (1, kube_pod_info{name=~".+",})
          )
          / max by(node,project,env) (kube_node_status_capacity{name=~".+",resource="pods"} != 1)
          * 100 >= 80
        for: 5m
        labels:
          severity: "critical"
        annotations:
          description: "{{$labels.node}}上运行pod数量超过pod数量上限的80%,监控值为:{{ $value| printf `%.f` }}"
          summary: "node运行pod数量超过pod数量上限"

      # Node: the Ready condition changed value more than twice within 15 minutes.
      - alert: "node 状态抖动告警"
        expr: sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (project,node,name,env) > 2
        for: 5m
        labels:
          alertname: "kube_node状态抖动告警"
          severity: "warning"
        annotations:
          description: "{{$labels.project}}-{{$labels.node}}15分钟内存在抖动"
          summary: "kube_node 状态抖动"

      # Pod: phase is Failed or Unknown for 5 minutes.
      - alert: "pod运行状态异常告警"
        expr: sum by (name,namespace,pod,project,env,dept) (kube_pod_status_phase{ phase=~"Failed|Unknown"}) > 0
        for: 5m
        labels:
          alertname: "pod运行状态异常告警"
          severity: "critical"
        annotations:
          description: "pod:{{$labels.name}}-{{$labels.namespace}}-{{$labels.pod}}运行状态为Unknown|Failed"
          summary: "pod运行状态为Unknown|Failed"

      # Pod: a container has been in the waiting state for 5 minutes.
      - alert: "pod等待状态告警"
        expr: kube_pod_container_status_waiting == 1
        for: 5m
        labels:
          alertname: "pod等待状态告警"
          severity: "critical"
        annotations:
          description: "pod:{{$labels.name}}-{{$labels.namespace}}-{{$labels.pod}} 运行处于等待状态!"
          summary: "pod运行处于等待状态!"

      # Pod: a container has been terminated for 5 minutes; the namespaces
      # kube-system, ingress-nginx, default and crontab are excluded.
      - alert: "pod终止状态告警"
        expr: kube_pod_container_status_terminated{namespace!~"kube-system|ingress-nginx|default|crontab"} == 1
        for: 5m
        labels:
          alertname: "pod终止状态告警"
          severity: "critical"
        annotations:
          description: "pod:{{$labels.name}}-{{$labels.namespace}}-{{$labels.pod}}运行处于终止状态"
          summary: "pod运行处于终止状态"

      # Pod: last-terminated reason OOMKilled rose within the last 5 minutes.
      # There is no `for:` clause on this rule.
      - alert: "pod5分钟内发生过OOM告警"
        expr: increase(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[5m]) > 0
        labels:
          alertname: "pod发生OOM告警"
          severity: "critical"
        annotations:
          description: "pod:{{$labels.name}}-{{$labels.namespace}}-{{$labels.pod}} 发生OOM!"

      # Pod: restart counter grew by more than 1 over 5 minutes (kube-system excluded).
      - alert: "pod发生重启告警"
        expr: increase(kube_pod_container_status_restarts_total{namespace!="kube-system"}[5m]) > 1
        for: 5m
        labels:
          alertname: "pod发生重启告警"
          severity: "critical"
        annotations:
          description: "{{$labels.pod}}发生{{ $value| printf `%.f` }}次重启!"

      # Deployment: desired replicas differ from available replicas for 5 minutes.
      - alert: "Deployment副本数异常告警"
        expr: kube_deployment_spec_replicas != kube_deployment_status_replicas_available
        for: 5m
        labels:
          severity: "critical"
        annotations:
          description: "deployment:{{ $labels.deployment }}副本数小于预期"

      # Node: any of the listed pressure/problem conditions is true for 5 minutes.
      # The description is prefixed with "node:" because the series describe nodes.
      - alert: "node资源异常告警"
        expr: >-
          kube_node_status_condition{
          condition=~"DiskPressure|FrequentContainerdRestart|FrequentDockerRestart|FrequentKubeletRestart|KUBELETProblem|KUBEPROXYProblem|MemoryPressure|PIDPressure|NTPProblem",
          status="true"
          } == 1
        for: 5m
        labels:
          alertname: "node资源异常告警"
          severity: "critical"
        annotations:
          description: "node:{{$labels.name}}-{{$labels.node}}发生{{$labels.condition}}"

      # Pod: summed container filesystem usage exceeds 200 (bytes / 1024^3).
      # There is no `for:` clause on this rule.
      - alert: "pod占用磁盘空间过高"
        expr: >-
          sum by(pod_name, namespace,pod,node_ip,project,app,env,dept) (container_fs_usage_bytes{image!=""})
          / 1024 / 1024 / 1024 > 200
        labels:
          severity: "warning"
        annotations:
          description: "{{$labels.pod}}占用磁盘空间量为:{{ $value| printf `%.f` }} G"

      # Pod: RSS as a percentage of the memory limit is above 96 for 5 minutes.
      # `!= +Inf` drops results where the division yields +Inf
      # (presumably pods with a zero/unset limit - confirm).
      - alert: "pod内存使用率告警"
        expr: >-
          sum by(pod_name, namespace,project,pod,env,dept) (container_memory_rss{image!=""})
          / sum by(pod_name, namespace,project,pod,env,dept) (container_spec_memory_limit_bytes{image!=""})
          * 100 != +Inf > 96
        for: 5m
        labels:
          alertname: "pod 内存使用率告警"
          severity: "warning"
        annotations:
          description: "{{$labels.pod}} 内存使用率为:{{ $value| printf `%.2f` }} %"
          summary: "内存使用率告警超过 96 %"

      # Pod: CPU usage rate as a percentage of (cpu quota / 100000) is above 90
      # for 5 minutes. The description text states the same 90% threshold.
      - alert: "pod CPU使用率告警"
        expr: >-
          sum by(pod_name, namespace,project,env,pod,dept) (rate(container_cpu_usage_seconds_total{image!=""}[5m]))
          / (sum by(pod_name, namespace,project,env,pod,dept) (container_spec_cpu_quota{image!=""} / 100000))
          * 100 > 90
        for: 5m
        labels:
          severity: "critical"
        annotations:
          summary: "{{$labels.pod}} CPU使用率为:{{ $value| printf `%.2f` }} %"
          description: "CPU使用率超过90%"

      # Pod: transmit rate (bytes / 1024^2, rounded to 3 decimals) above 100 for 5m.
      # `pod` is part of the grouping so that {{$labels.pod}} in the summary is
      # populated; labels not listed in `by (...)` are dropped by the aggregation.
      - alert: "pod 出网带宽大于100MB"
        expr: >-
          round(
          sum by (namespace,job,name,pod,project,env,dept) (irate(container_network_transmit_bytes_total{image!=""}[5m]))
          / 1024 / 1024 * 1000
          ) / 1000 > 100
        for: 5m
        labels:
          severity: "warning"
        annotations:
          summary: "{{$labels.pod}}出网带宽为:{{ $value| printf `%.2f` }} MB"

      # Pod: receive rate (bytes / 1024^2, rounded to 3 decimals) above 100 for 3m.
      # `pod` is part of the grouping so that {{$labels.pod}} in the summary is
      # populated.
      # NOTE(review): the alert name says 10MB but the threshold is 100; both are
      # left as they were - confirm which one is intended.
      - alert: "pod 入网带宽大于10MB"
        expr: >-
          round(
          sum by (namespace,job,name,pod,project,env,dept) (irate(container_network_receive_bytes_total{image!=""}[5m]))
          / 1024 / 1024 * 1000
          ) / 1000 > 100
        for: 3m
        labels:
          severity: "warning"
        annotations:
          summary: "{{$labels.pod}}入网带宽为: {{ $value| printf `%.2f` }} MB"

      # Node: kubelet reports more than 30 running pods for 10 minutes.
      # `unit` and `values` are kept as extra annotations
      # (assumed to be annotations because they follow the annotation keys - confirm).
      - alert: "node节点分配的pod过多告警"
        expr: kubelet_running_pod_count > 30
        for: 10m
        labels:
          severity: "critical"
        annotations:
          description: "node节点:{{$labels.instance}}运行pod数量为:{{$value}}"
          summary: "node节点分配的pod过多"
          unit: "number"
          values: '{{ $value }}'
|