#coding=utf-8
# Common diagnose module, called both by the CLI command and by the
# background auto-run tool.
# - The foreground "diagnose" CLI command does not write syslog.
# - The background monitoring script writes syslog (the APP FM function).
# - Syslog output is limited to app-related events; cpu and mem are handled
#   by the snmp module.

import sys
import time
import json
import logging
import syslog
import prettytable
from sys import path
path.append(r'../py_common')  # add the directory that holds the modules
path.append(r'../py_cmd')     # add the directory that holds the modules
from common_telegraf import *
from common_args import *
from common_influxdb import *
from common_whoami import *
from common_system_cmd import *
from tsg_software_reboot import *

# syslog levels:
#   KERN_EMERG   "<0>"  system is unusable
#   KERN_ALERT   "<1>"  action must be taken immediately
#   KERN_CRIT    "<2>"  critical conditions
#   KERN_ERR     "<3>"  error conditions
#   KERN_WARNING "<4>"  warning conditions
#   KERN_NOTICE  "<5>"  normal but significant condition
#   KERN_INFO    "<6>"  informational
#   KERN_DEBUG   "<7>"  debug-level messages
#
# The log_level parameter means "only output results at or above this
# priority"; a smaller syslog value is a higher priority, i.e. a result is
# output when log_level <= diagnose_level. If the level is higher than ERR,
# it is also written to syslog.
# If there are no warnings or errors at all, "normal" is shown.
# No check may abort midway: even on error, report the error of the current
# check and continue, so that every check is executed once.

# For testing: added to every measured rate to force the expected alarms.
debug_crit_rate_plus = 0


def tsg_set_log_level(old_level, new_level):
    """Return the more severe (numerically smaller) of two syslog levels."""
    return min(old_level, new_level)


def _tsg_rank_by_thresholds(value, crit, warning, notice):
    """Map *value* to a syslog level using three descending thresholds.

    Returns LOG_CRIT, LOG_WARNING or LOG_NOTICE for the highest threshold
    that *value* reaches, and LOG_INFO when it reaches none of them.
    """
    if value >= crit:
        return syslog.LOG_CRIT
    if value >= warning:
        return syslog.LOG_WARNING
    if value >= notice:
        return syslog.LOG_NOTICE
    return syslog.LOG_INFO


def _tsg_drop_rate_percent(drop_pkt_num, tot_pkt_num):
    """Return the drop rate in percent, plus the debug offset.

    A zero (or negative) total would otherwise raise ZeroDivisionError; in
    that case drops without any counted traffic are reported as 100% and no
    drops as 0%.
    """
    dropped = float(drop_pkt_num)
    total = float(tot_pkt_num)
    if total > 0:
        rate = 100.0 * dropped / total
    elif dropped > 0:
        rate = 100.0
    else:
        rate = 0.0
    return rate + debug_crit_rate_plus


def tsg_diagnose_write_log(log_level, event_source, user_msg):
    """Report one diagnose result.

    In CLI mode (g_is_cli_cmd != 0) the result is appended as a row to the
    global pretty table. Otherwise it is written to syslog, but only when
    log_level is at least as severe as g_log_level.
    """
    if g_is_cli_cmd != 0:
        g_ptable.add_row([g_local_sled_name, event_source, G_SYS_LOG_STRING[log_level], user_msg])
    else:
        if log_level <= g_log_level:
            syslog_msg = "[%s] %s" % (event_source, user_msg)
            syslog.syslog(log_level, syslog_msg)


def tsg_diagnose_for_app():
    """Check that every module of the local sled is running.

    Returns 1 when the module list cannot be obtained, otherwise the number
    of modules for which a problem was reported (0 when all are fine).
    """
    cur_level = syslog.LOG_INFO
    err_code = 0

    module_array = tsg_get_local_sled_modules(g_local_sled_name)
    if not module_array:
        cur_level = tsg_set_log_level(cur_level, syslog.LOG_ERR)
        tsg_diagnose_write_log(cur_level, "app", "can't get local sled modules")
        # Without the module list the app check cannot proceed.
        return 1

    for module_name in module_array:
        module_operator = tsg_get_module_opertor(module_name)
        if not module_operator:
            cur_level = tsg_set_log_level(cur_level, syslog.LOG_ERR)
            err_msg = "can't get local sled module operator for %s" % (module_name)
            tsg_diagnose_write_log(cur_level, "app", err_msg)
            err_code += 1
            # Nothing to look up for this module; move on to the next one
            # instead of indexing an empty operator below.
            continue

        ret, start_func, stop_func, check_func = tsg_get_operator_by_config(module_operator)
        if ret != 0:
            cur_level = tsg_set_log_level(cur_level, syslog.LOG_ERR)
            err_msg = "can't get operator for %s" % (module_operator[TSG_OP_MODULE_NAME_INDEX])
            tsg_diagnose_write_log(cur_level, "app", err_msg)
            err_code += 1
            # check_func is not usable when the lookup failed.
            continue

        running_flag = check_func(module_operator[TSG_OP_MODULE_NAME_INDEX])
        if running_flag == 0:
            cur_level = tsg_set_log_level(cur_level, syslog.LOG_ALERT)
            err_msg = "module %s is not running" % (module_name)
            tsg_diagnose_write_log(cur_level, "app", err_msg)
            err_code += 1
        # TODO: check for restart time

    if cur_level >= syslog.LOG_INFO:
        tsg_diagnose_write_log(cur_level, "app", "normal")

    return err_code


def tsg_diagnose_for_cpu():
    """Report the latest CPU usage read from InfluxDB.

    Returns 0 when the query succeeded, 1 when it failed.
    """
    cur_level = syslog.LOG_INFO
    cpu_sql_str = "select usage_user+usage_system as cpu_usage from cpu order by time desc limit 1"
    ret, points, msg = tsg_influxb_query(g_influxdb_client, cpu_sql_str)
    if ret != 0:
        cur_level = tsg_set_log_level(cur_level, syslog.LOG_ERR)
        tsg_diagnose_write_log(cur_level, "cpu", "sql query error")
        return 1

    for point in points:
        cpu_usage = point['cpu_usage'] + debug_crit_rate_plus
        # NOTE(review): the warning/notice thresholds (0.02, 0.01) look like
        # leftover test values -- confirm the intended production limits.
        point_level = _tsg_rank_by_thresholds(cpu_usage, 90, 0.02, 0.01)
        cur_level = tsg_set_log_level(cur_level, point_level)
        if point_level < syslog.LOG_INFO:
            msg = "cpu usage more than %.2f%%" % (cpu_usage)
        else:
            msg = "normal"
        tsg_diagnose_write_log(cur_level, "cpu", msg)
    return 0


def tsg_diagnose_for_mem():
    """Report the latest memory usage read from InfluxDB.

    Returns 0 when the query succeeded, 1 when it failed.
    """
    cur_level = syslog.LOG_INFO
    mem_sql_str = "select 100*used/total as mem_usage from mem order by time desc limit 1"
    ret, points, msg = tsg_influxb_query(g_influxdb_client, mem_sql_str)
    if ret != 0:
        cur_level = tsg_set_log_level(cur_level, syslog.LOG_ERR)
        tsg_diagnose_write_log(cur_level, "memory", "sql query error")
        return 1

    for point in points:
        mem_usage = point['mem_usage'] + debug_crit_rate_plus
        point_level = _tsg_rank_by_thresholds(mem_usage, 90.0, 20.0, 1.0)
        cur_level = tsg_set_log_level(cur_level, point_level)
        if point_level < syslog.LOG_INFO:
            msg = "memory usage more than %.2f%%" % (mem_usage)
        else:
            msg = "normal"
        tsg_diagnose_write_log(cur_level, "memory", msg)
    return 0


def tsg_diagnose_for_disk_by_influxdb():
    """Report the latest disk usage read from InfluxDB.

    Abnormal readings are logged individually; "normal" is logged once when
    nothing abnormal was found. Returns 0 when normal, 1 otherwise
    (including a failed query).
    """
    cur_level = syslog.LOG_INFO
    disk_sql_str = "select 100*used/total as disk_usage from disk order by time desc limit 1"
    ret, points, msg = tsg_influxb_query(g_influxdb_client, disk_sql_str)
    if ret == 0:
        for point in points:
            # Same debug offset as the df-based check.
            disk_usage = point['disk_usage'] + debug_crit_rate_plus
            point_level = _tsg_rank_by_thresholds(disk_usage, 10.0, 5.0, 3.0)
            cur_level = tsg_set_log_level(cur_level, point_level)
            if point_level < syslog.LOG_INFO:
                msg = "disk usage more than %.2f%%" % (disk_usage)
                tsg_diagnose_write_log(cur_level, "disk", msg)
    else:
        cur_level = tsg_set_log_level(cur_level, syslog.LOG_ERR)
        tsg_diagnose_write_log(cur_level, "disk", "sql query error")

    if cur_level >= syslog.LOG_INFO:
        tsg_diagnose_write_log(cur_level, "disk", "normal")
        return 0
    return 1


def tsg_diagnose_for_disk_by_df_cmd_ranking(filesystem_name, disk_usage):
    """Rank one filesystem's usage and log it when a threshold is reached.

    Nothing is logged for a filesystem below every threshold; the caller
    reports "normal" once for the whole check. Returns the syslog level
    assigned to this filesystem (LOG_INFO when it is fine).
    """
    disk_usage += debug_crit_rate_plus
    cur_level = _tsg_rank_by_thresholds(disk_usage, 10.0, 5.0, 3.0)
    if cur_level < syslog.LOG_INFO:
        msg = "filesystem '%s' usage more than %.2f%%" % (filesystem_name, disk_usage)
        tsg_diagnose_write_log(cur_level, "disk", msg)
    return cur_level


def tsg_diagnose_for_disk_by_df_cmd():
    """Check local filesystem usage using the output of `df`.

    Always returns 0.
    """
    disk_level = syslog.LOG_INFO
    # Ignore tmpfs, nfs and similar filesystems; only ext3, ext4 and xfs count.
    ret, cmd_res = system_cmd_run("df -t ext3 -t ext4 -t xfs | awk {'print $6,$5'}")
    df_info = cmd_res.split()
    # Fields come in (mount point, "NN%") pairs; the first pair is the
    # header line ("Mounted Use%") and is skipped.
    for mount_point, use_percent in zip(df_info[2::2], df_info[3::2]):
        fs_level = tsg_diagnose_for_disk_by_df_cmd_ranking(mount_point, float(use_percent.split("%")[0]))
        disk_level = tsg_set_log_level(disk_level, fs_level)

    if disk_level >= syslog.LOG_INFO:
        tsg_diagnose_write_log(disk_level, "disk", "normal")
    return 0


def tsg_diagnose_for_disk():
    """Check local disk usage. Always returns 0."""
    # Use df to get the local disk usage.
    tsg_diagnose_for_disk_by_df_cmd()
    # Alternative: read the local disk usage from InfluxDB.
    #tsg_diagnose_for_disk_by_influxdb()
    return 0


def tsg_diagnose_interface_drop_rate_ranking(device, dir, drop_pkt_num, tot_pkt_num):
    """Rank the drop rate of one device/direction and log it when abnormal.

    Returns the syslog level assigned (LOG_INFO when the rate is below every
    threshold, in which case nothing is logged).
    """
    drop_rate = _tsg_drop_rate_percent(drop_pkt_num, tot_pkt_num)
    cur_level = _tsg_rank_by_thresholds(drop_rate, 0.01, 0.005, 0.001)
    if cur_level >= syslog.LOG_INFO:
        return syslog.LOG_INFO

    msg = "device '%s' %s drop rate more than %.2f%%" % (device, dir, drop_rate)
    tsg_diagnose_write_log(cur_level, "interface", msg)
    return cur_level


def tsg_diagnose_for_interface():
    """Check RX/TX drop rates of the local sled's physical interfaces.

    Returns 1 when the InfluxDB query failed, 0 otherwise.
    """
    interface_level = syslog.LOG_INFO
    sql_str = "select device, PhyRXError,PhyRXMissed,PhyRXNoBUF,PhyRXFrame,PhyTXError, PhyTXFrame from interface where sled = '%s' and PhyRXBits+PhyRXError+PhyRXMissed+PhyRXNoBUF > 0 and time > now() -5d group by device limit 1" % (g_local_sled_name)
    ret, points, msg = tsg_influxb_query(g_influxdb_client, sql_str)
    if ret != 0:
        interface_level = tsg_set_log_level(interface_level, syslog.LOG_ERR)
        tsg_diagnose_write_log(interface_level, "interface", "sql query error")
        return 1

    for point in points:
        rx_drops = int(point['PhyRXError']) + int(point['PhyRXMissed']) + int(point['PhyRXNoBUF'])
        device_level = tsg_diagnose_interface_drop_rate_ranking(point['device'], "RX", rx_drops, int(point['PhyRXFrame']))
        interface_level = tsg_set_log_level(interface_level, device_level)

        device_level = tsg_diagnose_interface_drop_rate_ranking(point['device'], "TX", int(point['PhyTXError']), int(point['PhyTXFrame']))
        interface_level = tsg_set_log_level(interface_level, device_level)

    if interface_level >= syslog.LOG_INFO:
        tsg_diagnose_write_log(interface_level, "interface", "normal")
    return 0


def tsg_diagnose_app_drop_rate_ranking(dir, drop_pkt_num, tot_pkt_num):
    """Rank the kni drop rate for one direction and log it when abnormal.

    Returns the syslog level assigned (LOG_INFO when the rate is below every
    threshold, in which case nothing is logged).
    """
    drop_rate = _tsg_drop_rate_percent(drop_pkt_num, tot_pkt_num)
    cur_level = _tsg_rank_by_thresholds(drop_rate, 0.01, 0.005, 0.001)
    if cur_level >= syslog.LOG_INFO:
        return syslog.LOG_INFO

    msg = "module kni %s drop rate more than %.2f%%" % (dir, drop_rate)
    tsg_diagnose_write_log(cur_level, "app network", msg)
    return cur_level


def tsg_diagnose_for_app_stream():
    """Check RX/TX drop rates of the application traffic.

    Returns 0 when everything is normal, 1 when the query failed or an
    abnormal drop rate was reported.
    """
    app_net_level = syslog.LOG_INFO
    app_sql_str = "select * from app where RxDrops + TxDrops > 0 and time > now() -5d limit 1"
    ret, points, msg = tsg_influxb_query(g_influxdb_client, app_sql_str)
    if ret != 0:
        app_net_level = tsg_set_log_level(app_net_level, syslog.LOG_ERR)
        tsg_diagnose_write_log(app_net_level, "app network", "sql query error")
        return 1

    for point in points:
        cur_level = tsg_diagnose_app_drop_rate_ranking("RX", int(point['RxDrops']), int(point['RxPkts']))
        app_net_level = tsg_set_log_level(app_net_level, cur_level)

        cur_level = tsg_diagnose_app_drop_rate_ranking("TX", int(point['TxDrops']), int(point['TxPkts']))
        app_net_level = tsg_set_log_level(app_net_level, cur_level)

    if app_net_level >= syslog.LOG_INFO:
        tsg_diagnose_write_log(app_net_level, "app network", "normal")
        return 0
    return 1


def tsg_common_diagnose_init(is_cli_cmd):
    """Initialise the module globals used by all diagnose checks.

    is_cli_cmd: non-zero when invoked as a foreground CLI command (results
    go to a pretty table), zero for the background polling script (results
    go to syslog).

    Exits the process with status 1 when the local sled name cannot be
    determined. Returns 0 otherwise.
    """
    global g_ptable
    global g_influxdb_client
    global g_local_sled_name
    global g_log_level
    global g_is_cli_cmd
    global syslog_handle

    g_is_cli_cmd = is_cli_cmd
    if is_cli_cmd != 0:
        g_log_level = syslog.LOG_INFO
    else:
        g_log_level = syslog.LOG_NOTICE

    # syslog.openlog() returns None, so there is no handle object to keep;
    # the global is retained only for backward compatibility.
    syslog_handle = None

    _sled_type, _sled_id, g_local_sled_name = tsg_whoami()
    if g_local_sled_name == "":
        if g_is_cli_cmd != 0:
            print("can't get local sled name")
        else:
            syslog.openlog()
            syslog_msg = "[%s] %s" % ("diagnose", "can't get local sled name")
            syslog.syslog(syslog.LOG_ERR, syslog_msg)
        sys.exit(1)

    # The background polling script writes syslog; open it before anything
    # below can log, so messages carry the sled name as ident.
    if g_is_cli_cmd == 0:
        syslog.openlog(g_local_sled_name)

    # pretty table init
    g_ptable = prettytable.PrettyTable()
    g_ptable.field_names = ["Sled", "Type", "Level", "Status"]

    # InfluxDB init
    # NOTE(review): credentials are hard-coded here -- consider moving them
    # to configuration.
    influxdb_server_ip = "127.0.0.1"
    influxdb_server_port = 8086
    ret, g_influxdb_client, msg = tsg_influxdb_init(influxdb_server_ip, influxdb_server_port, 'admin', 'tsg2019', 'tsg_stat')
    if ret != 0:
        err_msg = "can't connect influxDB server %s:%d" % (influxdb_server_ip, influxdb_server_port)
        tsg_diagnose_write_log(syslog.LOG_ERR, "diagnose", err_msg)

    return 0


# The CLI command does not affect syslog; it only prints the results. Any
# warning or error it shows will also be detected by the automatic polling
# script. The background polling command is "syslog-effect", i.e. it does
# write syslog.
# The is_cli_cmd parameter tells whether this is a CLI command invoked by a
# user in the foreground.
def tsg_common_diagnose(is_cli_cmd):
    """Run every diagnose check once and, in CLI mode, print the result table."""
    if debug_crit_rate_plus > 0:
        print("######### this is a debug fake crit alarm version!")

    tsg_common_diagnose_init(is_cli_cmd)

    # Check that the app processes exist (and, eventually, whether they
    # restarted within the last n minutes).
    tsg_diagnose_for_app()

    # Check CPU usage.
    tsg_diagnose_for_cpu()

    # Check memory usage.
    tsg_diagnose_for_mem()

    # Check disk usage.
    tsg_diagnose_for_disk()

    # Check the physical network.
    tsg_diagnose_for_interface()

    # Check the application traffic.
    tsg_diagnose_for_app_stream()

    if is_cli_cmd != 0:
        print(g_ptable)


if __name__ == '__main__':
    tsg_common_diagnose(1)