需求
因为目前服务器规模较小,使用 Zabbix、Nagios 等开源监控系统的必要性并不高,再加上配置和维护需要花费的时间成本,所以决定通过自己编写的脚本,配合 SaltStack 来处理。
监控原理很简单:agent 端负责收集信息,并统一发送到 server 端;server 端负责处理监控信息,超过阈值时通过微信接口发送告警。
服务器端
脚本目录
├── weixin.py
├── __init__.py
└── main.py
main.py
#!/usr/bin/python2.7
# -*- coding: utf-8 -*-
import time, socket, threading,json
from weixin import senddata,gettoken
def tcplink(sock, addr):
    """Serve one agent connection: read its message and pass it to handler().

    Used as the target of a per-connection thread. Reads until the peer
    closes the connection (or sends the literal ``exit``), then hands
    everything that was received to ``handler``.

    :param sock: connected socket returned by ``accept()``.
    :param addr: ``(host, port)`` tuple of the peer, used in the log lines.
    :return: the raw payload that was received (empty if nothing was sent).
    """
    print('New Connection from %s:%s...' % addr)
    chunks = []
    try:
        while True:
            data = sock.recv(1024)
            if not data or data == b'exit':
                break
            # TCP is a byte stream: one message may arrive split across
            # several recv() calls, so keep every piece rather than only
            # the most recent one.
            chunks.append(data)
        res = b''.join(chunks)
        # Nothing to parse when the peer connected and left without data.
        if res:
            handler(res)
    finally:
        # Release the socket even when recv() or handler() raises.
        sock.close()
        print('Connection from %s:%s closed.' % addr)
    return res
# Error reporting
def report(data):
    """Send the alert lines as one message through the weixin helpers.

    :param data: list of strings; each one becomes a line of the message.
    """
    # One line per entry, each terminated by a newline.
    content = ''.join(line + "\n" for line in data)
    print(content)
    corpid = 'xxxxxxxxxxxx'
    corpsecret = 'xxxxxxxxxxxxxxxxx'
    # A token is requested for every report before the message is sent.
    msg = senddata(gettoken(corpid, corpsecret), content)
    print(msg)
    print(data)
# Handle a client message and compare the reported values with the thresholds
def handler(res):
    """Parse one agent message and send an alert when it looks unhealthy.

    ``res`` is the raw JSON text sent by an agent. Two message types are
    understood:

    * ``type == 1`` - host resources (``ip``, ``name``, ``cpu_use``,
      ``cpu_load``, ``mem_use``, ``disk_use``). Every metric strictly above
      its threshold is added to the alert.
    * ``type == 2`` - service status; an alert is sent when ``status == 1``.

    :param res: JSON document received from the agent.
    :return: True when the message was understood and processed, False when
        it was malformed or of an unknown type.
    """
    try:
        data = json.loads(res)
    except Exception as e:
        print(e)
        print("Data type wrong.")
        return False
    # Valid JSON that is not an object (a list, a number, ...) has no fields.
    if not isinstance(data, dict):
        print("Data type wrong.")
        return False

    m_type = data.get('type')
    # Host resource monitoring
    if m_type == 1:
        # (field, threshold), in the order the values are logged and reported.
        # The cpu_load limit could be improved by deriving it from the
        # number of CPU cores.
        limits = (
            ('cpu_use', 95),
            ('cpu_load', 3),
            ('mem_use', 85),
            ('disk_use', 75),
        )
        try:
            ip = data['ip']
            name = data['name']
            # float() makes a metric that arrived as a string compare
            # numerically; a non-numeric value is rejected below.
            readings = [(key, data[key], float(data[key]), limit)
                        for key, limit in limits]
        except (KeyError, TypeError, ValueError) as e:
            print("Invalid resource message: %r" % (e,))
            return False
        message = ["ip: %s" % (ip,), "name: %s" % (name,)]
        print("%s %s %s %s %s" % ((ip,) + tuple(r[1] for r in readings)))
        for key, raw, value, limit in readings:
            if value > limit:
                message.append("%s: %s" % (key, raw))
        # The first two entries are always ip and name; anything beyond
        # them is a metric that crossed its threshold.
        if len(message) > 2:
            report(message)
        return True
    # Service monitoring
    elif m_type == 2:
        print("service eyes...")
        print(data)
        message = ["oops some service down!"]
        if data.get("status") == 1:
            message.append("message: %s" % (data,))
            report(message)
        return True
    print("Unknown message type: %r" % (m_type,))
    return False
if __name__=="__main__":
    # Listen for agent connections and serve each one in its own thread.
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    # Allow an immediate restart while sockets of the previous run are
    # still lingering; otherwise bind() can fail with "address in use".
    s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    s.bind(('0.0.0.0', 9999))
    s.listen(5)
    print("Monitor Service Listening on 9999 port.")
    try:
        while True:
            sock, addr = s.accept()
            t = threading.Thread(target=tcplink, args=(sock, addr))
            t.start()
    finally:
        # Release the listening port when the loop is interrupted.
        s.close()
weixin.py
import requests
import json
import sys
def gettoken(corp_id, corp_secret):
    """Request an access token from the qyapi.weixin.qq.com gettoken endpoint.

    :param corp_id: value sent as the ``corpid`` query parameter.
    :param corp_secret: value sent as the ``corpsecret`` query parameter.
    :return: the ``access_token`` value from the JSON response.
    :raises requests.RequestException: on a network failure, a timeout or a
        non-success HTTP status.
    :raises ValueError: when the response is not JSON or carries no
        ``access_token``.
    """
    gettoken_url = 'https://qyapi.weixin.qq.com/cgi-bin/gettoken'
    try:
        # params= URL-encodes the credentials; the timeout keeps a stalled
        # request from blocking the calling thread indefinitely.
        response = requests.get(
            gettoken_url,
            params={'corpid': corp_id, 'corpsecret': corp_secret},
            timeout=10
        )
        # requests does not raise for 4xx/5xx responses unless asked to.
        response.raise_for_status()
    except requests.RequestException as e:
        print(e)
        raise
    token_json = response.json()
    if 'access_token' not in token_json:
        # Include the response body so the reported error is visible.
        raise ValueError('gettoken failed: %s' % (token_json,))
    return token_json['access_token']
def senddata(access_token, content,
             touser="187xxxxxxxx|185xxxxxxxx", agentid="17"):
    """Post a text message to the qyapi.weixin.qq.com message/send endpoint.

    :param access_token: token obtained from ``gettoken``.
    :param content: message text (text, or UTF-8 encoded bytes).
    :param touser: recipients, separated by ``|``.
    :param agentid: value sent as the ``agentid`` field.
    :return: the response body as text.
    :raises requests.RequestException: on a network failure or timeout.
    """
    send_url = 'https://qyapi.weixin.qq.com/cgi-bin/message/send'
    # Decode bytes up front so building the JSON body below does not depend
    # on the interpreter's default encoding.
    if isinstance(content, bytes):
        content = content.decode('utf-8')
    send_values = {
        "touser": touser,
        "msgtype": "text",
        "agentid": agentid,
        "text": {
            "content": content
        },
        "safe": "0"
    }
    # ensure_ascii=False keeps non-ASCII text readable in the payload.
    send_data = json.dumps(send_values, ensure_ascii=False).encode('utf-8')
    response = requests.post(
        send_url,
        params={'access_token': access_token},
        data=send_data,
        timeout=10
    )
    return response.text
# Python 2 only: make UTF-8 the interpreter-wide default encoding used for
# implicit str/unicode conversions in this module.
default_encoding = 'utf-8'
if not sys.getdefaultencoding() == default_encoding:
    # reload(sys) comes first because sys.setdefaultencoding is normally not
    # available after start-up (see the Python 2 sys/site documentation).
    reload(sys)
    sys.setdefaultencoding(default_encoding)
客户端 1
# monitor.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import division
import socket
import psutil
import os
# Collect host resource metrics
def getMonitor():
    """Collect host metrics and return them as a JSON string.

    The message has ``type`` 1 and the keys ``ip``, ``name``, ``cpu_load``,
    ``cpu_use``, ``mem_use`` and ``disk_use``. The call takes about five
    seconds because CPU utilisation is sampled over that interval.

    :return: JSON text describing the current host state.
    """
    # Local import: this script's import block does not include json.
    import json

    # Host information
    name = socket.getfqdn(socket.gethostname())
    ip = socket.gethostbyname(name)
    # Memory: percentage in use, i.e. everything that is not available.
    mem = psutil.virtual_memory()
    mem_use = int((mem.total - mem.available) * 100.0 / mem.total)
    # CPU load: the 5-minute load average
    cpu_load = os.getloadavg()[1]
    # CPU utilisation: average of the per-core percentages
    per_core = psutil.cpu_percent(interval=5, percpu=True)
    cpu_use = sum(per_core) / float(len(per_core))
    # Disk: usage of the root filesystem
    disk_use = psutil.disk_usage('/').percent
    data = {
        "type": 1,
        "ip": ip,
        "name": name,
        "cpu_load": cpu_load,
        "cpu_use": cpu_use,
        "mem_use": mem_use,
        "disk_use": disk_use,
    }
    print(str(data))
    # json.dumps quotes and escapes correctly; rewriting the dict repr by
    # swapping quote characters breaks on any value containing a quote.
    return json.dumps(data)
# Collect before connecting: getMonitor() samples the CPU for several
# seconds and the connection should not sit idle on the server meanwhile.
data = getMonitor()
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
try:
    # Connect to the monitoring server ('server_ip' is a placeholder).
    s.connect(('server_ip', 9999))
    # sendall() keeps writing until the whole message has been sent.
    s.sendall(data)
except Exception as e:
    print(e)
finally:
    s.close()
客户端 2
#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import division
import socket
import os,commands,json
# Query the state of a system service managed through systemctl
def check_status(service_name):
    """Run ``sudo systemctl status`` for one service.

    :param service_name: unit name, e.g. ``config.service``.
    :return: the value returned by ``os.system``; 0 means the command
        succeeded.
    """
    # Standard output is discarded; only the exit status matters here.
    command = ' '.join(['sudo systemctl status', service_name, '> /dev/null'])
    return os.system(command)
# Services to monitor
service_lists = ['config.service','xxx.service','xxx.service']


def get_status(service_lists):
    """Check every listed service and build the service-status message.

    Message types used by the monitor:
        type == 1  hardware monitoring
        type == 2  service monitoring
        type == x  xxxxxx

    :param service_lists: iterable of service names to check.
    :return: JSON text with ``type`` 2 and ``status`` 0 (all checks passed)
        or 1; each failing service is added as ``<name>: "down"``.
    """
    data = {"type": 2, "status": 0}
    for service in service_lists:
        # A non-zero result from check_status means the check failed.
        if check_status(service) != 0:
            data[service] = "down"
            data["status"] = 1
    print(str(data))
    # json.dumps quotes and escapes correctly; rewriting the dict repr by
    # swapping quote characters breaks on any name containing a quote.
    return json.dumps(data)
data = get_status(service_lists)
# Only contact the server when at least one service check failed.
if json.loads(data)["status"] == 1:
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        # Connect to the monitoring server ('server_ip' is a placeholder).
        s.connect(('server_ip', 9999))
        # sendall() keeps writing until the whole message has been sent.
        s.sendall(data)
    except Exception as e:
        print(e)
    finally:
        s.close()
运行方式
客户端
在 SaltStack 服务器(master)上通过 crontab 定时执行监控脚本:
*/5 * * * * salt '*' cmd.script salt://scripts/monitor.py python_shell=true
*/5 * * * * salt '*' cmd.script salt://scripts/monitor_service_status.py python_shell=true
服务器
将 main.py 作为常驻的系统进程运行,侦听 TCP 9999 端口。