版本:centos7.9 python3.9.5 alertmanager0.25.0 prometheus2.46.0
安装alertmanager prometheus 配置webhook
# Unpack both release tarballs, then rename each extracted directory to the
# bare product name (alertmanager / prometheus).
for pkg in alertmanager-0.25.0 prometheus-2.46.0; do
  tar -xvf "${pkg}.linux-amd64.tar.gz"
done
for pkg in alertmanager-0.25.0 prometheus-2.46.0; do
  # ${pkg%%-*} strips everything from the first "-" on, leaving the name.
  mv "${pkg}.linux-amd64" "${pkg%%-*}"
done
# Install Python from source.
# All build dependencies are installed up front, before ./configure runs.
yum install -y zlib-devel bzip2-devel openssl-devel ncurses-devel sqlite-devel readline-devel tk-devel gcc make libffi-devel
cd /app
# One version variable keeps the download URL, the tarball name and the source
# directory in sync (the original fetched 3.8.0 but unpacked 3.9.0).
# 3.9.5 is the version stated at the top of this document.
PY_VERSION=3.9.5
wget "https://www.python.org/ftp/python/${PY_VERSION}/Python-${PY_VERSION}.tgz"
tar -xvf "Python-${PY_VERSION}.tgz"
cd "Python-${PY_VERSION}"
./configure --prefix=/usr/local/python3
make && make install
# Expose the freshly built interpreter and pip on the default PATH.
ln -s /usr/local/python3/bin/python3.9 /usr/bin/python3
ln -s /usr/local/python3/bin/pip3.9 /usr/bin/pip3
pip3 install -U pip
# Configure the webhook
pwd   # printed /app/jiankong when these notes were taken
cd /app
mkdir -p webhook
cd webhook
# openssl11 comes from EPEL, so the EPEL repository has to be enabled first.
yum install -y epel-release
yum install -y openssl11 openssl11-devel
pip3 install urllib3==1.26.15
pip3 install --upgrade cryptography
pip3 install --upgrade pyopenssl
pip3 install --upgrade requests
pip3 install flask
# Create the webhook program; its content follows below.
vim /app/webhook/main.py
#!/usr/local/bin/python3
# coding: utf-8
import json
from datetime import datetime
import requests
from requests.exceptions import RequestException
from flask import Flask
from flask import request
app = Flask(__name__)
@app.route('/', methods=['POST'])
def send_wechat():
    """Relay every alert of a posted JSON document to ``webchat``.

    The request body is decoded as UTF-8 JSON; each element of its
    ``alerts`` list is handed to ``webchat`` in order.

    Returns:
        The string ``"success\\n"`` after all alerts have been processed.
    """
    # The route is registered for POST only, so no method check is required.
    payload = json.loads(request.get_data().decode('utf-8'))
    # A body without "alerts" is treated as an empty batch instead of raising
    # TypeError while iterating over None.
    for alert in payload.get('alerts') or []:
        webchat(alert)
    return "success\n"
@app.route('/dingtalk', methods=['POST'])
def send_dingtalk():
    """Format each posted alert and send it to a fixed DingTalk robot.

    The request body is decoded as UTF-8 JSON; each element of its
    ``alerts`` list is rendered with ``dingtalk_msgformat`` and delivered
    with ``dingding_sendmsg``.

    Returns:
        The string ``"success\\n"`` after all alerts have been processed.
    """
    # The route is registered for POST only, so no method check is required.
    payload = json.loads(request.get_data().decode('utf-8'))
    # NOTE(review): the robot token is hard-coded; consider loading it from
    # configuration so it does not live in source control.
    access_token = 'dxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxb'
    # A body without "alerts" is treated as an empty batch instead of raising
    # TypeError while iterating over None.
    for alert in payload.get('alerts') or []:
        content = dingtalk_msgformat(alert)
        dingding_sendmsg(access_token, content)
    return "success\n"
@app.route('/prometheus_dingtalk', methods=['POST'])
def send_prodingtalk():
    """Format each posted alert and send it to the Prometheus DingTalk robot.

    The request body is decoded as UTF-8 JSON; each element of its
    ``alerts`` list is rendered with ``dingtalk_msgformat`` and delivered
    with ``dingding_sendmsg``.

    Returns:
        The string ``"success\\n"`` after all alerts have been processed.
    """
    # The route is registered for POST only, so no method check is required.
    payload = json.loads(request.get_data().decode('utf-8'))
    # Access token of the DingTalk robot that receives these alerts.
    # NOTE(review): hard-coded secret; consider loading it from configuration.
    access_token = '8xxxxxxxxxxxxxxxxxxxxxxxxxxx0'
    # A body without "alerts" is treated as an empty batch instead of raising
    # TypeError while iterating over None.
    for alert in payload.get('alerts') or []:
        content = dingtalk_msgformat(alert)
        dingding_sendmsg(access_token, content)
    return "success\n"
@app.route('/dingding_send', methods=['POST'])
def dingding_send():
    """Send caller-supplied text to a caller-supplied DingTalk robot.

    The request body must be a UTF-8 JSON object with the keys
    ``content`` (message text) and ``access_token`` (robot token).

    Returns:
        ``"ok"`` once the message was passed to ``dingding_sendmsg``, or a
        ``(message, 400)`` tuple when a required field is missing.
    """
    # The route is registered for POST only, so no method check is required.
    body = json.loads(request.get_data().decode('utf-8'))
    try:
        content = body["content"]
        access_token = body["access_token"]
    except (KeyError, TypeError):
        # Answer with a client error instead of an unhandled exception when
        # the body is not an object or lacks one of the two fields.
        return "missing 'content' or 'access_token'\n", 400
    dingding_sendmsg(access_token, content)
    return "ok"
def webchat(data):
    """Post one alert as a text message to the HTTP send gateway.

    Recipient display names are read from ``annotations.sendUsers``
    (comma-separated) and translated to account names through the mapping
    file ``/app/webhook/users``, which holds one ``name:account`` pair per
    line. Names without a mapping are skipped.

    Args:
        data: A single alert dict (status, labels, annotations, startsAt,
            endsAt).
    """
    # NOTE(review): "92.168.60.xxx" may be a typo for "192.168.60.xxx" - confirm.
    url = 'http://92.168.60.xxx:4567/send'
    annotations = data.get('annotations') or {}
    labels = data.get('labels') or {}

    usernames = {}
    with open('/app/webhook/users', encoding='utf-8') as f:
        for raw_line in f:
            entry = raw_line.strip()
            # Lines read from a file keep their newline and are therefore
            # always truthy; test the stripped text so blank lines are skipped.
            if not entry:
                continue
            # partition() tolerates extra or missing colons, where
            # split(':') would make dict() raise ValueError.
            display_name, _, account = entry.partition(':')
            usernames[display_name] = account

    users_cn = annotations.get('sendUsers')
    # A missing sendUsers annotation yields no recipients instead of crashing.
    users_list = [
        usernames[name]
        for name in (users_cn or '').split(',')
        if usernames.get(name)
    ]
    users = ','.join(users_list)

    template = '\n'.join([
        'status: %s',
        'alertlevel: %s',
        'alertname: %s',
        'message: %s',
        'startsAt: %s',
        'endsAt: %s',
        '消息发送时间: %s',
        '消息发送给: %s',
    ])
    message = template % (
        data.get('status'),
        annotations.get('severity'),
        labels.get('alertname'),
        annotations.get('message'),
        data.get('startsAt'),
        data.get('endsAt'),
        datetime.now().isoformat(),
        users_cn,
    )
    params = {'tos': users, 'content': message}
    # Bound the call so an unresponsive gateway cannot block the request forever.
    requests.post(url=url, data=params, timeout=10)
def dingtalk_msgformat(data):
    """Render one alert dict as the multi-line text sent to DingTalk.

    Args:
        data: A single alert dict. ``status``, ``startsAt`` and ``endsAt`` are
            read from the top level, ``alertname`` from ``labels``, and
            ``severity``, ``message`` and ``sendUsers`` from ``annotations``.

    Returns:
        An eight-line string. Any value absent from ``data`` is rendered as
        ``None``; the seventh line carries the current local time in ISO
        format.
    """
    # Fall back to empty dicts so an alert without labels/annotations is
    # rendered with "None" values instead of raising AttributeError.
    annotations = data.get('annotations') or {}
    labels = data.get('labels') or {}
    lines = [
        f"status: {data.get('status')}",
        f"alertlevel: {annotations.get('severity')}",
        f"alertname: {labels.get('alertname')}",
        f"message: {annotations.get('message')}",
        f"startsAt: {data.get('startsAt')}",
        f"endsAt: {data.get('endsAt')}",
        f"消息发送时间:{datetime.now().isoformat()}",
        f"消息发送给:{annotations.get('sendUsers')}",
    ]
    return '\n'.join(lines)
def dingding_sendmsg(access_token, content):
    """Send a plain-text message through a DingTalk robot webhook.

    Args:
        access_token: Token of the target robot; appended to the webhook URL.
        content: Message text.

    Raises:
        requests.exceptions.RequestException: On connection errors, timeouts,
            or a non-2xx HTTP status (via ``raise_for_status``).
    """
    headers = {
        'content-type': 'application/json',
        'Accept': 'application/json;charset=utf-8',
    }
    payload = {
        "msgtype": "text",
        "text": {
            "content": content
        },
        "at": {
            # A list of mobile numbers; empty because nobody is @-mentioned.
            "atMobiles": [],
            "isAtAll": False,
        },
    }
    webhook_url = 'https://oapi.dingtalk.com/robot/send?access_token=%s' % access_token
    # Request errors propagate unchanged to the caller; the timeout keeps a
    # stalled connection from blocking the webhook forever.
    response = requests.post(
        webhook_url,
        data=json.dumps(payload),
        headers=headers,
        timeout=10,
    )
    response.raise_for_status()
if __name__ == '__main__':
    # Serve on every interface, port 5000, when run directly as a script.
    app.run(host='0.0.0.0', port=5000)
# cat /etc/systemd/system/webhook.service
# Run the webhook as a systemd service
[Unit]
Description=Webhook wechat for prometheus
After=network.target
[Service]
#Restart=always
#RestartSec=30
#Type=simple
WorkingDirectory=/app/webhook
ExecStart=/usr/local/python3/bin/python3.9 /app/webhook/main.py
[Install]
WantedBy=multi-user.target
配置alertmanager prometheus
# 配置数据存储目录
mkdir -p /data/prometheus/prometheus /data/prometheus/alertmanager
[root@rabbit4-64 prometheus]# ls
alertmanager prometheus
[root@rabbit4-64 data]# useradd prometheus
[root@rabbit4-64 data]# chown -R prometheus.prometheus /data/prometheus
[root@rabbit4-64 prometheus]# ll
总用量 0
drwxr-xr-x. 2 prometheus prometheus 6 5月 31 10:25 alertmanager
drwxr-xr-x. 2 prometheus prometheus 6 5月 31 10:25 prometheus
[root@rabbit4-64 data]#
# alertmanager配置
[root@rabbit3-63 alertmanager]# cat /app/jiankong/alertmanager/alertmanager.yml
# Alertmanager configuration: every alert is routed to a single webhook receiver.
global:
  resolve_timeout: 5m
route:
  group_by: ['alertname', 'cluster', 'service', 'instance']
  group_wait: 10s
  group_interval: 5s
  repeat_interval: 1h
  receiver: 'wechat'
receivers:
  # The receiver is named "wechat", but it posts to the webhook's
  # /prometheus_dingtalk endpoint.
  - name: 'wechat'
    webhook_configs:
      - url: 'http://192.168.xxxxx:5000/prometheus_dingtalk'
inhibit_rules:
  # Mute "warning" alerts while a "critical" alert with the same
  # alertname/dev/instance labels is firing.
  # NOTE(review): the rule files in this document label alerts with
  # "severity: page", so this rule matches none of them - confirm intent.
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
# 设置成服务
[root@rabbit3-63 alertmanager]# cat /etc/systemd/system/alertmanager.service
[Unit]
Description=Prometheus Alert Manager
Documentation=https://prometheus.io/docs/introduction/overview/
# After= only orders this unit behind network-online.target; Wants= is what
# actually pulls that target into the boot transaction.
Wants=network-online.target
After=network-online.target
[Service]
User=prometheus
LimitNOFILE=65535
WorkingDirectory=/app/jiankong/alertmanager
ExecStart=/app/jiankong/alertmanager/alertmanager \
--config.file=/app/jiankong/alertmanager/alertmanager.yml \
--storage.path=/data/prometheus/alertmanager
# SIGHUP is sent on "systemctl reload alertmanager".
ExecReload=/bin/kill -HUP $MAINPID
Restart=on-failure
RestartSec=60s
[Install]
WantedBy=multi-user.target
# Prometheus configuration: create the rules directory and hand it to the
# prometheus user.
# -p makes the step safe to repeat when the directory already exists.
mkdir -p /app/jiankong/prometheus/rules
cd /app/jiankong/prometheus/
# Use the standard "owner:group" separator.
chown -R prometheus:prometheus rules/
[root@rabbit4-64 prometheus]# cat prometheus.yml
global:
  scrape_interval: 60s      # Scrape targets every 60 seconds (same as the 1-minute default).
  evaluation_interval: 60s  # Evaluate rules every 60 seconds (same as the 1-minute default).
  scrape_timeout: 55s
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 192.168.70.xx:9093  # Address of the Alertmanager instance
rule_files:
  - '/app/jiankong/prometheus/rules/*.rules'
scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']
  - job_name: "node"
    static_configs:
      - targets:
          - "192.168.x0.xx:9100"
          - "192.168.x0.xx:9100"
# 做成服务
[root@localhost alertmanager]# cat /etc/systemd/system/prometheus.service
[Unit]
Description=Prometheus Server
Documentation=https://prometheus.io/docs/introduction/overview/
# After= only orders this unit behind network-online.target; Wants= is what
# actually pulls that target into the boot transaction.
Wants=network-online.target
After=network-online.target
[Service]
User=prometheus
LimitNOFILE=65535
WorkingDirectory=/app/jiankong/prometheus
ExecStart=/app/jiankong/prometheus/prometheus --log.level=info \
--config.file=/app/jiankong/prometheus/prometheus.yml \
--storage.tsdb.retention.time=10d \
--storage.tsdb.path=/data/prometheus/prometheus
# SIGHUP is sent on "systemctl reload prometheus".
ExecReload=/bin/kill -HUP $MAINPID
Restart=on-failure
RestartSec=60s
[Install]
WantedBy=multi-user.target
# 配置相关规则
[root@rabbit4-64 rules]# cat up.rules
groups:
  - name: instance_status
    rules:
      - alert: 系统或服务异常请运维紧急查看!!!
        expr: up == 0
        # NOTE(review): the alert fires after 30s, while the message below
        # says "more than five minutes" - align one with the other.
        for: 30s
        labels:
          severity: page
        annotations:
          sendUsers: "李处长"
          message: "{{$labels.instance}} 来自于 job {{$labels.job}} 已经采集失败超过五分钟。"
          severity: "Warning"
[root@rabbit4-64 rules]# ls
linux_hosts.rules up.rules
[root@rabbit4-64 rules]# cat up.rules
groups:
  - name: instance_status
    rules:
      - alert: 系统或服务异常请运维紧急查看!!!
        expr: up == 0
        # NOTE(review): the alert fires after 30s, while the message below
        # says "more than five minutes" - align one with the other.
        for: 30s
        labels:
          severity: page
        annotations:
          sendUsers: "李处长"
          message: "{{$labels.instance}} 来自于 job {{$labels.job}} 已经采集失败超过五分钟。"
          # The webhook prints annotations.severity as "alertlevel"; without
          # this annotation that line shows "None".
          severity: "Warning"
[root@rabbit4-64 rules]# cat linux_hosts.rules
groups:
  - name: linux_host_status
    rules:
      # CPU usage above 60% for five minutes.
      - alert: data_node_cpu_too_load_high
        # rate() averages over the whole 5m window; irate() looks only at the
        # last two samples and is meant for graphing, not for alerting.
        expr: (1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100 > 60
        for: 5m
        labels:
          severity: page
        annotations:
          sendUsers: "李处长,陈处长"
          message: "{{ $labels.instance }} 来自于job {{ $labels.job }} CPU 使用率连续五分钟超过 60%,当前值: {{ $value }} ,请检查主机应用!"
          severity: "Warning"
      # Less than 15% of a filesystem free for five minutes.
      # NOTE(review): "hign" in the alert name looks like a typo for "high";
      # left unchanged because the name becomes the alertname label.
      # NOTE(review): $value here is the free-space ratio, not the usage
      # percentage the message talks about.
      - alert: node_filesystem_usage_hign
        expr: node_filesystem_free_bytes{device !~'tmpfs', fstype!~'rootfs'} / node_filesystem_size_bytes < 0.15
        for: 5m
        labels:
          severity: page
        annotations:
          sendUsers: "李处长,陈处长"
          message: "{{ $labels.instance }} 来自于job {{ $labels.job }} 磁盘 {{$labels.device}} 挂载点 {{$labels.mountpoint}} 使用率超过 85%,当前值: {{ $value }} ,请检查主机磁盘!"
          severity: "Warning"
      # Memory usage (total minus free, buffers and cache) above 95% for five minutes.
      - alert: data_node_memory_too_usage_high
        expr: (node_memory_MemTotal_bytes - node_memory_MemFree_bytes - node_memory_Buffers_bytes - node_memory_Cached_bytes) / node_memory_MemTotal_bytes * 100 > 95
        for: 5m
        labels:
          severity: page
        annotations:
          sendUsers: "李处长,陈处长"
          message: "{{ $labels.instance }} 来自于job {{ $labels.job }} 内存使用率连续五分钟超过 95%, 当前值: {{ $value }} ,请检查主机应用!"
          severity: "Warning"
      # Resident memory of a process group in job "yunbanji_web" above 4000 MiB for five minutes.
      - alert: yunclassroom_process_memory_use_high
        expr: namedprocess_namegroup_memory_bytes{job="yunbanji_web",memtype="resident"}/1024/1024 > 4000
        for: 5m
        labels:
          severity: page
        annotations:
          sendUsers: "李处长,陈处长"
          message: "{{ $labels.instance }} 来自于进程 {{ $labels.groupname }} 内存使用率连续五分钟超过 4000M, 当前值: {{ $value }} ,请检查主机应用!"
          severity: "Warning"
启动相关服务
# Pick up the newly written unit files.
systemctl daemon-reload
# "enable --now" starts each service immediately and also enables it at boot
# (the units declare WantedBy=multi-user.target).
systemctl enable --now webhook
systemctl enable --now alertmanager
systemctl enable --now prometheus
systemctl status xxx  # check status; xxx stands for the service name
[root@localhost jiankong]# netstat -nltp
Active Internet connections (only servers)
Proto Recv-Q Send-Q Local Address Foreign Address State PID/Program name
tcp 0 0 0.0.0.0:5000 0.0.0.0:* LISTEN 60670/python3.9
tcp 0 0 0.0.0.0:22 0.0.0.0:* LISTEN 1014/sshd
tcp6 0 0 :::9093 :::* LISTEN 60467/alertmanager
tcp6 0 0 :::9094 :::* LISTEN 60467/alertmanager
tcp6 0 0 :::9100 :::* LISTEN 1013/node_exporter
tcp6 0 0 :::22 :::* LISTEN 1014/sshd
tcp6 0 0 :::9090 :::* LISTEN 60500/prometheus
访问:ip:9090(Prometheus) ip:9093(Alertmanager;9094 是 Alertmanager 的集群通信端口,不是 Web 页面)
测试:当其中一台挂掉了实现告警