一、prometheus 实现钉钉和企业微信告警

基础流程
alertmanager流程

1.1 钉钉通知

alertmanager基础设置可以参照: https://editor.csdn.net/md/?articleId=121845743

钉钉群设置
群设置 -> 智能群助手 -> 添加机器人 -> 自定义 -> 添加 -> 保存生成的webhook地址
在这里插入图片描述

1.1.1 测试发送信息 - 关键字认证

root@prometheus:~# mkdir data/scripts -p
root@prometheus:~# cd data/scripts/
root@prometheus:~/data/scripts# vim dinngding-keyworlds.sh
#!/bin/bash
# Send a plain-text message to a DingTalk group robot webhook.
# Usage: bash dinngding-keyworlds.sh "<message>"
#   $1 - message text. It is spliced into the JSON body unchanged, so JSON
#        escapes such as \n in the argument reach the server as written;
#        it must not contain raw double quotes.
source /etc/profile
MESSAGE=$1
# The expansion is double-quoted so a message containing spaces stays inside
# the single -d argument instead of being split into extra curl arguments.
curl -X "POST" '你生成的Webhook地址' \
  -H 'Content-Type:application/json' \
  -d '{ "msgtype" : "text",
  "text" : {
    "content":"'"${MESSAGE}"'"
  }
}'
root@prometheus:~/data/scripts# chmod +x dinngding-keyworlds.sh
root@prometheus:~/data/scripts# bash dinngding-keyworlds.sh "namespace=defalt\npod=pod1\ncpu=90%\n持续时间=8s\nalertname=pod"
{"errcode":0,"errmsg":"ok"}

在这里插入图片描述

1.1.1.1 部署webhook-dingtalk

github地址 : https://github.com/timonwong/prometheus-webhook-dingtalk

root@prometheus:/apps# wget https://github.com/timonwong/prometheus-webhook-dingtalk/releases/download/v1.4.0/prometheus-webhook-dingtalk-1.4.0.linux-amd64.tar.gz
root@prometheus:/apps# tar xf prometheus-webhook-dingtalk-1.4.0.linux-amd64.tar.gz
root@prometheus:/apps/prometheus-webhook-dingtalk-1.4.0.linux-amd64# ./prometheus-webhook-dingtalk --web.listen-address="0.0.0.0:8060" --ding.profile="alertname=你的webhook地址"
level=info ts=2022-02-10T03:49:21.015Z caller=main.go:62 msg="Starting prometheus-webhook-dingtalk" version="(version=1.4.0, branch=HEAD, revision=02fe8265a98ab4caaa78ebbed209d3f06b87b4a6)"
level=info ts=2022-02-10T03:49:21.016Z caller=main.go:63 msg="Build context" (gogo1.13.5,userroot@eb9f8d8f0437,date20191211-03:00:38)=(MISSING)
level=warn ts=2022-02-10T03:49:21.016Z caller=main.go:105 msg="DEPRECATION: Detected one of the following flags: --ding.profile, --ding.timeout, --template.file"
level=warn ts=2022-02-10T03:49:21.016Z caller=main.go:106 msg="DEPRECATION: Now working in compatibility mode, please consider upgrading your configurations"
level=info ts=2022-02-10T03:49:21.016Z caller=main.go:117 component=configuration msg="Loading templates" templates=
ts=2022-02-10T03:49:21.016Z caller=main.go:133 component=configuration msg="Webhook urls for prometheus alertmanager" urls=http://0.0.0.0:8060/dingtalk/alertname/send
level=info ts=2022-02-10T03:49:21.016Z caller=web.go:210 component=web msg="Start listening for connections" address=0.0.0.0:8060
level=info ts=2022-02-10T03:49:21.428Z caller=entry.go:22 component=web http_scheme=http http_proto=HTTP/1.1 http_method=POST remote_addr=10.0.0.61:55076 user_agent=Alertmanager/0.23.0 uri=http://10.0.0.61:8060/dingtalk/alertname/send resp_status=200 resp_bytes_length=2 resp_elapsed_ms=184.460895 msg="request complete"

#测试一下
root@prometheus:~# telnet 10.0.0.61 8060
Trying 10.0.0.61...
Connected to 10.0.0.61.
Escape character is '^]'.

HTTP/1.1 400 Bad Request
Content-Type: text/plain; charset=utf-8
Connection: close

400 Bad RequestConnection closed by foreign host.

1.1.1.2 配置alertmanager

root@prometheus:/apps/alertmanager# vim alertmanager.yml
---
route:
  # Change the default receiver
  receiver: 'dingding'
receivers:
# Add the DingTalk receiver
- name: dingding
  webhook_configs:
  # The path segment after /dingtalk/ must equal the profile name given to
  # --ding.profile when prometheus-webhook-dingtalk was started ("alertname").
  - url: 'http://10.0.0.61:8060/dingtalk/alertname/send'
    send_resolved: true

1.1.1.3 验证

在这里插入图片描述

1.1.2 测试发送信息 - 加签认证

1.1.2.1 配置加签

在这里插入图片描述

1.1.2.2 加签认证-获取认证

root@prometheus:/apps/alertmanager# apt install python2
root@prometheus:~# vim data/scripts/dingding-label-sign.py 
#!/usr/bin/python2.7
# -*- coding: utf-8 -*-
"""Print the values needed to sign a DingTalk robot request.

Output is two lines: the current Unix time in milliseconds, then the
URL-encoded Base64 HMAC-SHA256 of "<timestamp>\\n<secret>" keyed with the
secret. The two-line layout is what dingding-label-send.sh parses.

Runs unchanged on Python 2.7 and Python 3.
"""
import base64
import hashlib
import hmac
import time

try:
    from urllib import quote_plus  # Python 2
except ImportError:
    from urllib.parse import quote_plus  # Python 3

# Scale to milliseconds first and round afterwards, so the value is the real
# millisecond clock rather than whole seconds padded with "000".
timestamp = int(round(time.time() * 1000))
# Unicode literal: encoding a non-ASCII byte string on Python 2 would raise.
secret = u'你的加签生成的秘钥'
secret_enc = secret.encode('utf-8')
string_to_sign = u'{}\n{}'.format(timestamp, secret)
string_to_sign_enc = string_to_sign.encode('utf-8')
hmac_code = hmac.new(secret_enc, string_to_sign_enc, digestmod=hashlib.sha256).digest()
# quote_plus makes the Base64 text ('+', '/', '=') safe as a query parameter.
sign = quote_plus(base64.b64encode(hmac_code))
print(timestamp)
print(sign)
#生成时间戳和认证
root@prometheus:~# python2.7 data/scripts/dingding-label-sign.py 

1.1.2.3 消息发送脚本

#测试脚本可用
root@prometheus:~# vim /root/data/scripts/dingding-label-send.sh
#!/bin/bash
# Send a plain-text message to a DingTalk robot that uses signature
# authentication.
# Usage: bash dingding-label-send.sh "<message>"
#   $1 - message text, spliced into the JSON body unchanged (no raw double
#        quotes allowed).
# The signing secret is configured in dingding-label-sign.py, not here.
source /etc/profile
MESSAGE=$1

# The helper prints the millisecond timestamp on line 1 and the URL-encoded
# signature on line 2. Stop if it fails, otherwise an unsigned request
# would be sent.
getkey=$(python2.7 /root/data/scripts/dingding-label-sign.py) || exit 1

# Read the two lines by position instead of slicing at fixed character offsets.
{
  IFS= read -r timestamp
  IFS= read -r sign
} <<<"${getkey}"

# MESSAGE is double-quoted so a message with spaces stays in one -d argument.
curl -X "POST" "你的webhook地址&timestamp=${timestamp}&sign=${sign}" \
  -H 'Content-Type:application/json' \
  -d '{ "msgtype" : "text",
  "text" : {
    "content":"'"${MESSAGE}"'"
  }
}'

root@prometheus:~# bash /root/data/scripts/dingding-label-send.sh sss
{"errcode":0,"errmsg":"ok"}

在这里插入图片描述

1.1.2.4 webhook启动

#先获取当前时间戳和认证秘钥
root@prometheus:~# python2.7 /root/data/scripts/dingding-label-sign.py 
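#启动webhook的dingtalk(注意:钉钉要求timestamp与实际请求时间相差不超过1小时,写死在URL里的timestamp和sign过期后需要重新生成并重启服务;长期运行建议改用配置文件中的secret字段,由程序在每次请求时自动签名)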
root@prometheus:/apps/prometheus-webhook-dingtalk-1.4.0.linux-amd64# ./prometheus-webhook-dingtalk --web.listen-address="0.0.0.0:8060" --ding.profile="alertname=你的webhook地址&timestamp=生成的时间戳&sign=生成的认证秘钥"
level=info ts=2022-02-10T05:22:45.778Z caller=main.go:62 msg="Starting prometheus-webhook-dingtalk" version="(version=1.4.0, branch=HEAD, revision=02fe8265a98ab4caaa78ebbed209d3f06b87b4a6)"
level=info ts=2022-02-10T05:22:45.778Z caller=main.go:63 msg="Build context" (gogo1.13.5,userroot@eb9f8d8f0437,date20191211-03:00:38)=(MISSING)
level=warn ts=2022-02-10T05:22:45.779Z caller=main.go:105 msg="DEPRECATION: Detected one of the following flags: --ding.profile, --ding.timeout, --template.file"
level=warn ts=2022-02-10T05:22:45.779Z caller=main.go:106 msg="DEPRECATION: Now working in compatibility mode, please consider upgrading your configurations"
level=info ts=2022-02-10T05:22:45.779Z caller=main.go:117 component=configuration msg="Loading templates" templates=
ts=2022-02-10T05:22:45.779Z caller=main.go:133 component=configuration msg="Webhook urls for prometheus alertmanager" urls=http://0.0.0.0:8060/dingtalk/alertname/send
level=info ts=2022-02-10T05:22:45.779Z caller=web.go:210 component=web msg="Start listening for connections" address=0.0.0.0:8060
level=info ts=2022-02-10T05:22:46.788Z caller=entry.go:22 component=web http_scheme=http http_proto=HTTP/1.1 http_method=POST remote_addr=10.0.0.61:59396 user_agent=Alertmanager/0.23.0 uri=http://10.0.0.61:8060/dingtalk/alertname/send resp_status=200 resp_bytes_length=2 resp_elapsed_ms=908.904779 msg="request complete"

1.1.2.5 进行验证

在这里插入图片描述

1.2 企业微信通知

1.2.1 创建应用

登录pc的企业微信 -> 应用管理 ->创建应用
在这里插入图片描述

1.2.2 测试发送信息

在这里插入图片描述

1.2.3 验证测试信息

在这里插入图片描述

1.2.4 alertmanager配置

#修改配置文件
root@prometheus:/apps/alertmanager# vim alertmanager.yml 
---
route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 2s
  repeat_interval: 2m
  #receiver: 'web.hook'
  #receiver: dingding
  # Default receiver: the WeChat Work receiver defined under receivers.
  receiver: wechat
receivers:
- name: wechat
  wechat_configs:
  - corp_id: 你的企业ID
    to_user: '@all' # send to every member
    agent_id: 你的应用id
    api_secret: 你的应用秘钥
    send_resolved: true

1.2.5 验证信息

在这里插入图片描述

1.2.6 消息发送给指定组

1.2.6.1 获取部门ID

在这里插入图片描述

1.2.6.2 alertmanager配置

root@prometheus:/apps/alertmanager# vim alertmanager.yml 
# WeChat Work receiver that addresses a department (to_party) instead of
# users; the previous to_user setting is left commented out.
- name: wechat
  wechat_configs:
  - corp_id: 你的企业ID
    #to_user: '@all'
    to_party: 1 # department ID to notify
    agent_id: 你的应用ID
    api_secret: 你的应用secret
    send_resolved: true
root@prometheus:/apps/alertmanager# systemctl restart alertmanager

1.2.6.3 验证信息

在这里插入图片描述

1.3 消息分类发送

根据消息中的属性信息设置规则,将消息分类发送,如将severity级别为critical的通知信息发送到邮箱,其他发送到微信

1.3.1 alertmanager设置

# Routing tree: alerts go to the wechat receiver unless a child route matches.
route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 2s
  repeat_interval: 2m
  #receiver: 'web.hook'
  #receiver: dingding
  receiver: wechat
  routes: # add child routes
  - receiver: web.hook  # alerts with severity critical go here; NOTE(review): the article says this receiver delivers email, but its definition is not shown - confirm
    group_wait: 10s
    # match_re treats the value as a regular expression on the label value
    match_re:
      severity: critical

1.3.2 验证信息

在这里插入图片描述

1.4 自定义消息模板

默认的消息内容需要调整,而且信息是连接在一起的

1.4.1 定义模板

root@prometheus:/apps/alertmanager# vim alertmanager-wechat.tmpl
{{/* Overrides "wechat.default.message". Renders one block per firing alert, then one block per resolved alert. Each section iterates only its own subset (.Alerts.Firing / .Alerts.Resolved), so a group holding both kinds does not print every alert twice. Times are shifted by 28800e9 ns (+8h) before formatting. */}}
{{ define "wechat.default.message" }}
{{- if gt (len .Alerts.Firing) 0 -}}
{{- range $index, $alert := .Alerts.Firing -}}
 
=========  监控告警 =========
告警程序:     Alertmanager
告警类型:    {{ $alert.Labels.alertname }}
告警级别:    {{ $alert.Labels.severity }} 级
告警状态:    {{ .Status }}
故障主机:    {{ $alert.Labels.instance }} {{ $alert.Labels.device }}
告警主题:    {{ .Annotations.summary }}
告警详情:    {{ $alert.Annotations.message }}{{ $alert.Annotations.description}}
主机标签:    {{ range .Labels.SortedPairs  }}  [{{ .Name }}: {{ .Value  | html }} ] {{- end }}
故障时间:    {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
========= = end =  =========
{{- end }}
{{- end }}
 
{{- if gt (len .Alerts.Resolved) 0 -}}
{{- range $index, $alert := .Alerts.Resolved -}}
 
========= 告警恢复 =========
告警程序:     Alertmanager
告警主题:    {{ $alert.Annotations.summary }}
告警主机:    {{ .Labels.instance }}
告警类型:    {{ .Labels.alertname }}
告警级别:    {{ $alert.Labels.severity }} 级
告警状态:    {{   .Status }}
告警详情:    {{ $alert.Annotations.message }}{{ $alert.Annotations.description}}
故障时间:    {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
恢复时间:    {{ ($alert.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
========= = end =  =========
{{- end }}
{{- end }}
{{- end }}

1.4.2 alertmanager引用模板

root@prometheus:/apps/alertmanager# vim alertmanager.yml
---
# Add the custom notification template file
templates:
  - /apps/alertmanager/alertmanager-wechat.tmpl

root@prometheus:/apps/alertmanager# systemctl restart alertmanager

在这里插入图片描述

1.5 告警抑制和静默

1.5.1 告警抑制

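基于告警规则,超过80%就不再发60%的告警,即由60%的表达式触发的告警被抑制了(下面的示例为了便于触发,把两个阈值故意写小为30%和20%;抑制依赖alertmanager.yml中的inhibit_rules,例如让severity=critical的告警抑制同名的severity=warning告警)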

root@prometheus:/apps/prometheus# vim rules.yml
# Two rules share one alert name and the same filesystem-usage expression;
# they differ only in threshold (30 vs 20) and severity label
# (critical vs warning).
groups:
- name: altermanager_pod.rules
  rules:
  - alert: 磁盘容量
    expr: 100-(node_filesystem_free_bytes{fstype=~"ext4|xfs"}/node_filesystem_size_bytes{fstype=~"ext4|xfs"}*100)>30 # threshold deliberately set low
    for: 2s
    labels:
      severity: critical
    annotations:
      description: "{{$labels.mountpoint}} 磁盘分区使用大于30%(目前使用:{{$value}}%)"
      summary: "{{$labels.mountpoint}} 磁盘分区使用率过高!"
  - alert: 磁盘容量
    expr: 100-(node_filesystem_free_bytes{fstype=~"ext4|xfs"}/node_filesystem_size_bytes{fstype=~"ext4|xfs"}*100)>20 # threshold deliberately set low
    for: 2s
    labels:
      severity: warning
    annotations:
      description: "{{$labels.mountpoint}} 磁盘分区使用大于20%(目前使用:{{$value}}%)"
      summary: "{{$labels.mountpoint}} 磁盘分区使用率过高!"
root@prometheus:/apps/prometheus# systemctl restart prometheus.service 
root@prometheus:/apps/prometheus# systemctl restart alertmanager.service 

进行验证
在这里插入图片描述

1.5.2 手动静默

先找到要静默的告警事件,然后手动静默指定的事件

1.5.2.1 点击静默

在这里插入图片描述

1.5.2.2 填写信息并创建

在这里插入图片描述

1.5.2.3 查看并验证

在这里插入图片描述
进行验证
在这里插入图片描述

Logo

华为云1024程序员节送福利,参与活动赢单人4000元礼包,更有热门技术干货免费学习

更多推荐