添加钉钉机器人
1. 部署 alertmanager
1.1 下载软件包
wget https://github.com/prometheus/alertmanager/releases/download/v0.26.0/alertmanager-0.26.0.linux-amd64.tar.gz
网址: https://github.com/prometheus/alertmanager/releases
1.2 解压软件包
mkdir -pv /app/tools/
tar xf alertmanager-0.26.0.linux-amd64.tar.gz -C /app/tools/
1.3 创建符号链接
cd /app/tools/ && ln -svf alertmanager-0.26.0.linux-amd64 alertmanager
1.4 修改 alertmanager 的配置文件
vim alertmanager.yml
global:
resolve_timeout: 5m
route:
group_by: ['alertname']
group_wait: 5s
group_interval: 5s
repeat_interval: 2m
receiver: 'web.hook'
receivers:
- name: 'web.hook'
webhook_configs:
# prometheus-webhook-dingtalk的地址
#这里只需要修改IP地址及端口号即可
#第一次启动webhook-dingtalk的时候也会提示这个地址(需要直接使用webhook命令执行)
- url: 'http://localhost:8060/dingtalk/webhook1/send'
send_resolved: true
inhibit_rules:
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname', 'dev', 'instance']
1.5 启动alertmanager
vim /usr/lib/systemd/system/alertmanager.service
[Unit]
Description=The prometheus alertmanager
After=network.target
[Service]
WorkingDirectory=/app/tools/alertmanager-0.26.0.linux-amd64/
ExecStart=/app/tools/alertmanager-0.26.0.linux-amd64/alertmanager \
--config.file=/app/tools/alertmanager-0.26.0.linux-amd64/alertmanager.yml
[Install]
WantedBy=multi-user.target
systemctl daemon-reload
systemctl enable --now alertmanager.service
1.6 登录WebUI
http://192.168.99.181:9093/
有页面则部署完成
2. 部署 prometheus-webhook-dingtalk
2.1 下载 prometheus-webhook-dingtalk 软件包
wget https://github.com/timonwong/prometheus-webhook-dingtalk/releases/download/v2.1.0/prometheus-webhook-dingtalk-2.1.0.linux-amd64.tar.gz
2.2 解压软件包
tar xf prometheus-webhook-dingtalk-2.1.0.linux-amd64.tar.gz -C /app/tools
2.3 编写配置文件
cat config.example.yml
## Targets, previously was known as "profiles"
#指定告警模板配置文件
templates:
# - /root/prometheus-webhook-dingtalk-2.1.0.linux-amd64/templates/default.tmpl
- /app/tools/prometheus-webhook-dingtalk-2.1.0.linux-amd64/templates/test.tmpl
targets:
webhook1:
#机器人的地址
url: https://oapi.dingtalk.com/robot/send?access_token=<你的机器人access_token>
# secret for signature
#机器人的认证标签
secret: <你的机器人加签密钥,以SEC开头>
message:
## 指定了消息的标题,使用模板 `ding.link.title` 来生成。
title: '{{ template "ding.link.title" . }}'
# 指定了消息的正文内容,使用模板 `ding.link.content` 来生成。
text: '{{ template "ding.link.content" . }}'
2.4 编写告警模板
cat templates/test.tmpl
{{ define "__subject" }}
[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}]
{{ end }}
{{ define "__alert_list" }}{{ range . }}
---
{{ if .Labels.owner }}@{{ .Labels.owner }}{{ end }}
---
**告警名称**: {{ .Labels.alertname }}
**告警主机**: {{ .Labels.instance }}
**告警级别**: {{ .Labels.severity }}
#description 这里引用的是rules文件中的字段
**告警描述**: {{ index .Annotations "description" }}
#时间不要动
**告警时间**: {{ dateInZone "2006.01.02 15:04:05" (.StartsAt) "Asia/Shanghai" }}
#可填写查看详情的网址
**查看详情**: http://192.168.99.181:9093
{{ end }}{{ end }}
{{ define "__resolved_list" }}{{ range . }}
---
**告警名称**: {{ .Labels.alertname }}
**告警主机**: {{ .Labels.instance }}
**告警级别**: {{ .Labels.severity }}
**告警描述**: {{ index .Annotations "description" }}
**告警时间**: {{ dateInZone "2006.01.02 15:04:05" (.StartsAt) "Asia/Shanghai" }}
**恢复时间**: {{ dateInZone "2006.01.02 15:04:05" (.EndsAt) "Asia/Shanghai" }}
{{ end }}{{ end }}
{{ define "default.title" }}
{{ template "__subject" . }}
{{ end }}
{{ define "default.content" }}
{{ if gt (len .Alerts.Firing) 0 }}
**<font color="#FF0000">======侦测到{{ .Alerts.Firing | len }}个告警======</font>**
{{ template "__alert_list" .Alerts.Firing }}
---
{{ end }}
{{ if gt (len .Alerts.Resolved) 0 }}
**<font color="green">======恢复{{ .Alerts.Resolved | len }}个故障======</font>**
{{ template "__resolved_list" .Alerts.Resolved }}
{{ end }}
{{ end }}
{{ define "ding.link.title" }}{{ template "default.title" . }}{{ end }}
{{ define "ding.link.content" }}{{ template "default.content" . }}{{ end }}
{{ template "default.title" . }}
{{ template "default.content" . }}
2.5 启动webhook-dingtalk
cat /usr/lib/systemd/system/webhook-dingtalk.service
[Unit]
Description=The prometheus webhook dingtalk
After=network.target
[Service]
WorkingDirectory=/app/tools/prometheus-webhook-dingtalk-2.1.0.linux-amd64/
ExecStart=/app/tools/prometheus-webhook-dingtalk-2.1.0.linux-amd64/prometheus-webhook-dingtalk \
--config.file=config.example.yml \
--web.enable-lifecycle \
--web.enable-ui
[Install]
WantedBy=multi-user.target
systemctl daemon-reload
systemctl enable --now webhook-dingtalk.service
3. Prometheus
3.1 配置rules规则文件
[root@prometheus-server31 /app/tools/prometheus]# cat rules/test.yaml
# 相关的规则设置定义在一个group下。在每一个group中我们可以定义多个告警规则(rule)
groups:
# 组名,报警规则组名称
- name: 内存告警
rules:
- alert: 内存使用率超过80%告警
# expr:基于PromQL表达式告警触发条件,用于计算是否有时间序列满足该条件。
#expr: floor(100 * (1 - (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes) / node_memory_MemTotal_bytes)) > 80
expr: floor(100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))) > 80
# for:评估等待时间,可选参数。用于表示只有当触发条件持续一段时间后才发送告警。在等待期间新产生告警的状态为pending。
# for语句会使 Prometheus 服务等待指定的时间, 然后执行查询表达式。(for 表示告警持续的时长,若持续时长小于该时间就不发给alertmanager了,大于该时间再发。for的值不要小于prometheus中的scrape_interval,例如scrape_interval为30s,for为15s,如果触发告警规则,则再经过for时长后也一定会告警,这是因为最新的度量指标还没有拉取,在15s时仍会用原来值进行计算。另外,要注意的是只有在第一次触发告警时才会等待(for)时长。)
for: 1m
# labels:自定义标签,允许用户指定要附加到告警上的一组附加标签。
labels:
# severity: 指定告警级别,有三种等级,分别为:警告、严重、紧急,严重等级依次递增。
severity: '<font color="#0000FF">警告</font>'
# annotations: 附加信息,比如用于描述告警详细信息的文字等,annotations的内容在告警产生时会一同作为参数发送到Alertmanager。
annotations:
servername: "{{ $labels.server_name }}"
server_ip: "{{ $labels.server_ip }}"
value: "{{ $value }}"
description: "**{{ $labels.server_name }}**内存使用率持续1分钟超过**80%**, 请及时处理! 当前值**{{ $value }}%**。"
- alert: 内存使用率超过90%告警
expr: floor(100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))) > 90
for: 1m
labels:
severity: '<font color="#FFA500">严重</font>'
annotations:
servername: "{{ $labels.server_name }}"
server_ip: "{{ $labels.server_ip }}"
value: "{{ $value }}"
description: "**{{ $labels.server_name }}**内存使用率持续1分钟超过**90%**, 请马上处理! 当前值**{{ $value }}%**。"
- name: CPU告警
rules:
- alert: CPU使用率超过80%告警
expr: floor(100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[1m])) * 100)) > 80
for: 1m
labels:
severity: '<font color="#0000FF">警告</font>'
annotations:
servername: "{{ $labels.server_name }}"
server_ip: "{{ $labels.server_ip }}"
value: "{{ $value }}"
description: "**{{ $labels.server_name }}**CPU使用率持续1分钟超过**80%**, 请及时处理! 当前值**{{ $value }}%**。"
- alert: CPU使用率超过90%告警
expr: floor(100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[1m])) * 100)) > 90
for: 1m
labels:
severity: '<font color="#FFA500">严重</font>'
annotations:
servername: "{{ $labels.server_name }}"
server_ip: "{{ $labels.server_ip }}"
value: "{{ $value }}"
description: "**{{ $labels.server_name }}**CPU使用率持续1分钟超过**90%**, 请马上处理! 当前值**{{ $value }}%**。"
- name: 磁盘告警
rules:
- alert: 磁盘 **/** 目录使用率超过90%告警
expr: floor(100 * ((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_avail_bytes{mountpoint="/"}) / node_filesystem_size_bytes{mountpoint="/"})) >90
for: 30m
labels:
severity: '<font color="#0000FF">警告</font>'
annotations:
servername: "{{ $labels.server_name }}"
server_ip: "{{ $labels.server_ip }}"
value: "{{ $value }}"
description: "**{{ $labels.server_name }}**磁盘 **/** 目录使用率已经超过阈值**90%**, 请及时处理! 当前值**{{ $value }}%**。"
- alert: 磁盘 **/home** 目录使用率超过90%告警
expr: floor(100 * ((node_filesystem_size_bytes{mountpoint="/home"} - node_filesystem_avail_bytes{mountpoint="/home"}) / node_filesystem_size_bytes{mountpoint="/home"})) >90
for: 30m
labels:
severity: '<font color="#0000FF">警告</font>'
annotations:
servername: "{{ $labels.server_name }}"
server_ip: "{{ $labels.server_ip }}"
value: "{{ $value }}"
description: "**{{ $labels.server_name }}**磁盘 **/home** 目录使用率已经超过阈值**90%**, 请及时处理! 当前值**{{ $value }}%**。"
- alert: 磁盘 **/data** 目录使用率超过90%告警
expr: floor(100 * ((node_filesystem_size_bytes{mountpoint="/data"} - node_filesystem_avail_bytes{mountpoint="/data"}) / node_filesystem_size_bytes{mountpoint="/data"})) >90
for: 30m
labels:
severity: '<font color="#0000FF">警告</font>'
annotations:
servername: "{{ $labels.server_name }}"
server_ip: "{{ $labels.server_ip }}"
value: "{{ $value }}"
description: "**{{ $labels.server_name }}**磁盘 **/data** 目录使用率已经超过阈值**90%**, 请及时处理! 当前值**{{ $value }}%**。"
- alert: 磁盘 **/data1** 目录使用率超过90%告警
expr: floor(100 * ((node_filesystem_size_bytes{mountpoint="/data1"} - node_filesystem_avail_bytes{mountpoint="/data1"}) / node_filesystem_size_bytes{mountpoint="/data1"})) >90
for: 30m
labels:
severity: '<font color="#0000FF">警告</font>'
annotations:
servername: "{{ $labels.server_name }}"
server_ip: "{{ $labels.server_ip }}"
value: "{{ $value }}"
description: "**{{ $labels.server_name }}**磁盘 **/data1** 目录使用率已经超过阈值**90%**, 请及时处理! 当前值**{{ $value }}%**。"
- alert: 磁盘 **/postgres** 目录使用率超过90%告警
expr: floor(100 * ((node_filesystem_size_bytes{mountpoint="/postgres"} - node_filesystem_avail_bytes{mountpoint="/postgres"}) / node_filesystem_size_bytes{mountpoint="/postgres"})) >90
for: 30m
labels:
severity: '<font color="#0000FF">警告</font>'
annotations:
servername: "{{ $labels.server_name }}"
server_ip: "{{ $labels.server_ip }}"
value: "{{ $value }}"
description: "**{{ $labels.server_name }}**磁盘 **/postgres** 目录使用率已经超过阈值**90%**, 请及时处理! 当前值**{{ $value }}%**。"
- name: 离线告警
rules:
- alert: 服务器离线告警
expr: up{instance =~ ".*:(9100|9200)"} == 0
for: 30s
labels:
severity: '<font color="#FF0000">紧急</font>'
annotations:
servername: "{{ $labels.server_name }}"
server_ip: "{{ $labels.server_ip }}"
value: "{{ $value }}"
description: "**{{ $labels.server_name }}**离线了,可能出现宕机情况, 请立即检查处理! "
3.2 配置 prometheus.yml 文件
vim /app/tools/prometheus/prometheus.yml
....
#指定alertmanager
alerting:
alertmanagers:
- static_configs:
- targets:
- 192.168.99.181:9093
.....
rule_files:
#指定 rules 配置文件
- "/app/tools/prometheus/rules/test.yaml"
.....
3.3 重启prometheus
systemctl reload prometheus.service
4. 测试
根据自己定义的规则关闭一个监控的节点,查看钉钉是否有收到消息