告警模板参数大致解析：

# Annotated example of an alerting-rule file: one group holding one rule.
groups:
  - name: example  # group name
    # List of rules that can trigger alerts.
    rules:
      - alert: HighErrorRate  # alert name
        # Trigger condition for the alert.
        expr: job:request_latency_seconds:mean5m{job="myjob"} > 0.5
        # How long the condition must keep holding before the alert is sent.
        for: 10m
        # Extra labels attached to the alert.
        labels:
          severity: page
        # Extra annotations attached to the alert.
        annotations:
          summary: High request latency

node_alived.yml（实例存活告警规则）

# Instance liveness rule: alerts once `up == 0` has held for 1 minute.
groups:
  - name: 实例存活告警规则
    rules:
      - alert: 实例存活告警
        expr: up == 0
        # Must match the "more than one minute" wording in the description.
        for: 1m
        labels:
          user: prometheus
          severity: warning
        annotations:
          summary: "主机宕机 !!!"
          description: "该实例主机已经宕机超过一分钟了。"

memory_over.yml（内存报警规则）

# Memory usage rule: alerts when the used-memory percentage,
# computed as (1 - MemAvailable / MemTotal) * 100, stays above 50 for 1 minute.
groups:
  - name: 内存报警规则
    rules:
      - alert: 内存使用率告警
        expr: (1 - (node_memory_MemAvailable_bytes / (node_memory_MemTotal_bytes))) * 100 > 50
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "服务器可用内存不足。"
          # {{ $value }} is filled in with the value of the expression above.
          description: "内存使用率已超过50%(当前值:{{ $value }}%)"

cpu_over.yml（CPU报警规则）

# CPU usage rule: per instance, takes 100 minus the averaged idle-mode rate
# (1m window) times 100, and alerts when that stays above 50 for 1 minute.
groups:
  - name: CPU报警规则
    rules:
      - alert: CPU使用率告警
        # `*` binds tighter than `-`, and `>` is applied last, so this reads
        # as (100 - idle_pct) > 50.
        # NOTE(review): the [1m] window needs at least two samples per
        # series; confirm the scrape interval is well under one minute.
        expr: 100 - (avg by (instance)(irate(node_cpu_seconds_total{mode="idle"}[1m]) )) * 100 > 50
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "CPU使用率正在飙升。"
          description: "CPU使用率超过50%(当前值:{{ $value }}%)"

disk_over.yml（磁盘使用率报警规则）

# Disk usage rule: for xfs/ext4 filesystems, alerts when
# 100 - free / size * 100 stays above 80 for 20 minutes.
groups:
  - name: 磁盘使用率报警规则
    rules:
      - alert: 磁盘使用率告警
        # `/` and `*` are evaluated before `-`, and `>` last, so this reads
        # as (100 - free_pct) > 80.
        expr: 100 - node_filesystem_free_bytes{fstype=~"xfs|ext4"} / node_filesystem_size_bytes{fstype=~"xfs|ext4"} * 100 > 80
        for: 20m
        labels:
          severity: warning
        annotations:
          summary: "硬盘分区使用率过高"
          description: "分区使用大于80%(当前值:{{ $value }}%)"