summaryrefslogtreecommitdiff
path: root/py_common/common_diagnose.py
diff options
context:
space:
mode:
Diffstat (limited to 'py_common/common_diagnose.py')
-rw-r--r--py_common/common_diagnose.py323
1 files changed, 239 insertions, 84 deletions
diff --git a/py_common/common_diagnose.py b/py_common/common_diagnose.py
index 1dc1758..ca3e9f8 100644
--- a/py_common/common_diagnose.py
+++ b/py_common/common_diagnose.py
@@ -13,9 +13,7 @@ from sys import path
path.append(r'../py_common') #将存放module的路径添加进来
path.append(r'../py_cmd') #将存放module的路径添加进来
from common_telegraf import *
-from common_logger import *
from common_args import *
-from common_logger import *
from common_influxdb import *
from common_whoami import *
from common_system_cmd import *
@@ -36,11 +34,8 @@ from tsg_software_reboot import *
#如果没有任何警告和错误, 显示normal
#所有检测项不能中途退出, 即使有错误, 显示当前检测项的错误后, 继续执行, 保证全检测一遍
-
-def tsg_diagnose_syslog(event_source, user_msg):
- if syslog_enable != 0 and user_level <= g_log_level:
- sys_msg = "[%s] %s" %(event_source, user_msg)
- syslog.syslog(user_level, sys_msg)
+#测试用, 所有比例增加50%, 用于产生期望的日志
+debug_crit_rate_plus = 0
#根据优先级, 设置新的日志等级
def tsg_set_log_level(old_level, new_level):
@@ -49,22 +44,23 @@ def tsg_set_log_level(old_level, new_level):
else:
return old_level
def tsg_diagnose_write_log(log_level, event_source, user_msg):
    """Report one diagnose result to the table (CLI) or to syslog (poller).

    Args:
        log_level: syslog priority of the message (e.g. ``syslog.LOG_ERR``).
        event_source: short name of the check that produced the message.
        user_msg: human readable status text.
    """
    if g_is_cli_cmd == 0:
        # Background polling: only messages at least as severe as the
        # configured level reach syslog (lower number = more severe).
        if log_level > g_log_level:
            return
        syslog.syslog(log_level, "[%s] %s" % (event_source, user_msg))
        return

    # Foreground CLI command: collect the result as a row of the output table.
    row = [g_local_sled_name, event_source, G_SYS_LOG_STRING[log_level], user_msg]
    g_ptable.add_row(row)
def tsg_diagnose_for_app():
cur_level = syslog.LOG_INFO
err_code = 0
- sled_type,sled_id,sled_name = tsg_whoami()
- if sled_name == "":
- cur_level = syslog.LOG_ERR
- ptable.add_row(["", "application", G_SYS_LOG_STRING[cur_level], "can't get local sled name"])
- return 1
-
- module_array = tsg_get_local_sled_modules(sled_name)
+ module_array = tsg_get_local_sled_modules(g_local_sled_name)
if len(module_array) <= 0:
cur_level = tsg_set_log_level(cur_level, syslog.LOG_ERR)
- ptable.add_row([g_local_sled_name, "application", G_SYS_LOG_STRING[cur_level],"can't get local sled modules"])
- #print("can't get local sled modules")
+ tsg_diagnose_write_log(cur_level, "app", "can't get local sled modules")
+ #获取不到模块列表, app检测终止!
return 1
for module_name in module_array:
@@ -72,144 +68,301 @@ def tsg_diagnose_for_app():
if len(module_operator) <= 0:
cur_level = tsg_set_log_level(cur_level, syslog.LOG_ERR)
err_msg = "can't get local sled module operator for %s" %(module_name)
- ptable.add_row([g_local_sled_name,"application", G_SYS_LOG_STRING[cur_level], err_msg])
- #print(err_msg)
+ tsg_diagnose_write_log(cur_level, "app", err_msg)
err_code += 1
ret, start_func, stop_func, check_func = tsg_get_operator_by_config(module_operator)
if ret != 0:
cur_level = tsg_set_log_level(cur_level, syslog.LOG_ERR)
err_msg = "can't get operator for %s" %(module_operator[TSG_OP_MODULE_NAME_INDEX])
- ptable.add_row([g_local_sled_name,"application", G_SYS_LOG_STRING[cur_level],err_msg])
- #print(err_msg)
+ tsg_diagnose_write_log(cur_level, "app", err_msg)
err_code += 1
running_flag = check_func(module_operator[TSG_OP_MODULE_NAME_INDEX])
if running_flag == 0:
- cur_level = tsg_set_log_level(cur_level, syslog.LOG_EMERG)
+ cur_level = tsg_set_log_level(cur_level, syslog.LOG_ALERT)
err_msg = "module %s is not running" %(module_name)
- ptable.add_row([g_local_sled_name,"application", G_SYS_LOG_STRING[cur_level],err_msg])
+ tsg_diagnose_write_log(cur_level, "app", err_msg)
err_code += 1
#todo, check for restart time
if cur_level >= syslog.LOG_INFO:
- ptable.add_row([g_local_sled_name, "application", G_SYS_LOG_STRING[cur_level], "normal"])
+ tsg_diagnose_write_log(cur_level, "app", "normal")
return err_code
def tsg_diagnose_for_cpu():
    """Check the most recent CPU usage sample stored in InfluxDB.

    Reads the latest ``usage_user + usage_system`` value from the ``cpu``
    measurement, adds the ``debug_crit_rate_plus`` debug offset, ranks it
    against the usage thresholds and reports the outcome through
    ``tsg_diagnose_write_log``.

    Returns:
        0 when the query succeeded, 1 when the InfluxDB query failed.
    """
    # (threshold in percent, syslog level), most severe first.
    # NOTE(review): 0.02 / 0.01 look like test thresholds - confirm the
    # intended production values.
    usage_ranks = (
        (90, syslog.LOG_CRIT),
        (0.02, syslog.LOG_WARNING),
        (0.01, syslog.LOG_NOTICE),
    )
    cur_level = syslog.LOG_INFO
    cpu_sql_str = "select usage_user+usage_system as cpu_usage from cpu order by time desc limit 1"

    ret, points, msg = tsg_influxb_query(g_influxdb_client, cpu_sql_str)
    if ret != 0:
        cur_level = tsg_set_log_level(cur_level, syslog.LOG_ERR)
        tsg_diagnose_write_log(cur_level, "cpu", "sql query error")
        return 1

    for point in points:
        cpu_usage = point['cpu_usage'] + debug_crit_rate_plus
        msg = "normal"
        # Only the first (most severe) matching rank applies.
        for threshold, level in usage_ranks:
            if cpu_usage >= threshold:
                cur_level = tsg_set_log_level(cur_level, level)
                msg = "cpu usage more than %.2f%%" %(cpu_usage)
                break
        tsg_diagnose_write_log(cur_level, "cpu", msg)

    return 0
def tsg_diagnose_for_mem():
    """Check the most recent memory usage sample stored in InfluxDB.

    Reads the latest ``100*used/total`` value from the ``mem`` measurement,
    adds the ``debug_crit_rate_plus`` debug offset, ranks it against the
    usage thresholds and reports the outcome through
    ``tsg_diagnose_write_log``.

    Returns:
        0 when the query succeeded, 1 when the InfluxDB query failed.
    """
    # (threshold in percent, syslog level), most severe first.
    # NOTE(review): 20.0 / 1.0 look like test thresholds - confirm the
    # intended production values.
    usage_ranks = (
        (90.0, syslog.LOG_CRIT),
        (20.0, syslog.LOG_WARNING),
        (1.0, syslog.LOG_NOTICE),
    )
    cur_level = syslog.LOG_INFO
    mem_sql_str = "select 100*used/total as mem_usage from mem order by time desc limit 1"

    ret, points, msg = tsg_influxb_query(g_influxdb_client, mem_sql_str)
    if ret != 0:
        cur_level = tsg_set_log_level(cur_level, syslog.LOG_ERR)
        tsg_diagnose_write_log(cur_level, "memory", "sql query error")
        return 1

    for point in points:
        mem_usage = point['mem_usage'] + debug_crit_rate_plus
        msg = "normal"
        # Only the first (most severe) matching rank applies.
        for threshold, level in usage_ranks:
            if mem_usage >= threshold:
                cur_level = tsg_set_log_level(cur_level, level)
                msg = "memory usage more than %.2f%%" %(mem_usage)
                break
        tsg_diagnose_write_log(cur_level, "memory", msg)

    return 0
-def tsg_diagnose_disk_usage_check(file_system, usage):
- cur_level = syslog.LOG_DEBUG
- msg = "Filesystem %s useage more than %d%%" %(file_system, usage)
def tsg_diagnose_for_disk_by_influxdb():
    """Check the most recent disk usage sample stored in InfluxDB.

    Reads the latest ``100*used/total`` value from the ``disk`` measurement
    and reports every sample that reaches a usage threshold. A single
    "normal" entry is written when nothing exceeded a threshold (this also
    covers an empty query result).

    Returns:
        0 when disk usage is normal, 1 when an alarm was raised or the
        InfluxDB query failed.
    """
    # (threshold in percent, syslog level), most severe first.
    # NOTE(review): 10 / 5 / 3 look like test thresholds - confirm the
    # intended production values.
    usage_ranks = (
        (10.0, syslog.LOG_CRIT),
        (5.0, syslog.LOG_WARNING),
        (3.0, syslog.LOG_NOTICE),
    )
    cur_level = syslog.LOG_INFO
    disk_sql_str = "select 100*used/total as disk_usage from disk order by time desc limit 1"

    ret, points, msg = tsg_influxb_query(g_influxdb_client, disk_sql_str)
    if ret != 0:
        cur_level = tsg_set_log_level(cur_level, syslog.LOG_ERR)
        tsg_diagnose_write_log(cur_level, "disk", "sql query error")
        return 1

    for point in points:
        disk_usage = point['disk_usage']
        # Only the first (most severe) matching rank applies; normal samples
        # are not logged here so that "normal" is reported exactly once below.
        for threshold, level in usage_ranks:
            if disk_usage >= threshold:
                cur_level = tsg_set_log_level(cur_level, level)
                msg = "disk usage more than %.2f%%" %(disk_usage)
                tsg_diagnose_write_log(cur_level, "disk", msg)
                break

    if cur_level >= syslog.LOG_INFO:
        tsg_diagnose_write_log(cur_level, "disk", "normal")
        return 0

    return 1
+
def tsg_diagnose_for_disk_by_df_cmd_ranking(filesystem_name, disk_usage):
    """Rank the usage of one filesystem and log it when it is not normal.

    Args:
        filesystem_name: name shown in the log message (the caller passes
            the first column of its ``df | awk`` output).
        disk_usage: usage in percent.

    Returns:
        The syslog level assigned to this filesystem; ``syslog.LOG_INFO``
        when the usage is below every threshold (nothing is logged then, the
        caller reports the overall "normal" state).
    """
    disk_usage += debug_crit_rate_plus

    # NOTE(review): 10 / 5 / 3 look like test thresholds - confirm the
    # intended production values.
    if disk_usage >= 10.0:
        new_level = syslog.LOG_CRIT
    elif disk_usage >= 5.0:
        new_level = syslog.LOG_WARNING
    elif disk_usage >= 3.0:
        new_level = syslog.LOG_NOTICE
    else:
        # Below every threshold: there is no message to write. Returning here
        # avoids using an unassigned message for healthy filesystems.
        return syslog.LOG_INFO

    cur_level = tsg_set_log_level(syslog.LOG_INFO, new_level)
    msg = "filesystem '%s' usage more than %.2f%%" %(filesystem_name, disk_usage)
    tsg_diagnose_write_log(cur_level, "disk", msg)

    return cur_level
-def tsg_diagnose_for_disk():
- cur_level = syslog.LOG_INFO
def tsg_diagnose_for_disk_by_df_cmd():
    """Check local filesystem usage using the output of ``df``.

    Every ext3/ext4/xfs filesystem is ranked by
    ``tsg_diagnose_for_disk_by_df_cmd_ranking``; a single "normal" entry is
    written when no filesystem was ranked above ``syslog.LOG_INFO``.

    Returns:
        Always 0.
    """
    disk_level = syslog.LOG_INFO

    # Ignore tmpfs, nfs and similar filesystems; only ext3, ext4 and xfs count.
    # NOTE(review): the command status `ret` is not checked - confirm what
    # system_cmd_run reports on failure before acting on it.
    ret, cmd_res = system_cmd_run("df -t ext3 -t ext4 -t xfs | awk {'print $6,$5'}")
    df_info = cmd_res.split()

    # Tokens come in (mount point, "NN%") pairs; the first pair is the header
    # line printed by df, so start at index 2.
    for fs_name, fs_usage in zip(df_info[2::2], df_info[3::2]):
        fs_level = tsg_diagnose_for_disk_by_df_cmd_ranking(fs_name, float(fs_usage.split("%")[0]))
        disk_level = tsg_set_log_level(disk_level, fs_level)

    if disk_level >= syslog.LOG_INFO:
        # Report with the accumulated level of this function.
        tsg_diagnose_write_log(disk_level, "disk", "normal")

    return 0
+
+
def tsg_diagnose_for_disk():
    """Run the disk usage check; always returns 0."""
    # Local disk usage is taken from `df`. The InfluxDB based variant,
    # tsg_diagnose_for_disk_by_influxdb(), is currently not called.
    tsg_diagnose_for_disk_by_df_cmd()
    return 0
def tsg_diagnose_interface_drop_rate_ranking(device, dir, drop_pkt_num, tot_pkt_num):
    """Rank the packet drop rate of one interface direction and log alarms.

    Args:
        device: interface name used in the log message.
        dir: direction label used in the log message ("RX" or "TX" in the
            visible caller).
        drop_pkt_num: number of dropped packets.
        tot_pkt_num: total number of packets the drops are compared against.

    Returns:
        The syslog level assigned to the drop rate; ``syslog.LOG_INFO`` when
        it is below every threshold (nothing is logged in that case).
    """
    total = float(tot_pkt_num)
    if total != 0:
        base_rate = 100.0 * float(drop_pkt_num) / total
    else:
        # No traffic counted: a rate cannot be computed, so treat it as zero
        # instead of dividing by zero.
        base_rate = 0.0
    drop_rate = base_rate + debug_crit_rate_plus

    if drop_rate >= 0.01:
        new_level = syslog.LOG_CRIT
    elif drop_rate >= 0.005:
        new_level = syslog.LOG_WARNING
    elif drop_rate >= 0.001:
        new_level = syslog.LOG_NOTICE
    else:
        return syslog.LOG_INFO

    cur_level = tsg_set_log_level(syslog.LOG_INFO, new_level)
    # NOTE(review): "%.2f" shows rates below 0.005% as 0.00% - consider a
    # higher precision if log consumers allow it.
    msg = "device '%s' %s drop rate more than %.2f%%" %(device, dir, drop_rate)
    tsg_diagnose_write_log(cur_level, "interface", msg)

    return cur_level
+
def tsg_diagnose_for_interface():
    """Check RX/TX drop rates of the local sled's interfaces via InfluxDB.

    Returns:
        1 when the InfluxDB query failed, otherwise 0.
    """
    interface_level = syslog.LOG_INFO
    sql_str = "select device, PhyRXError,PhyRXMissed,PhyRXNoBUF,PhyRXFrame,PhyTXError, PhyTXFrame from interface where sled = '%s' and PhyRXBits+PhyRXError+PhyRXMissed+PhyRXNoBUF > 0 and time > now() -5d group by device limit 1" %(g_local_sled_name)

    ret, points, msg = tsg_influxb_query(g_influxdb_client, sql_str)
    if ret != 0:
        interface_level = tsg_set_log_level(interface_level, syslog.LOG_ERR)
        tsg_diagnose_write_log(interface_level, "interface", "sql query error")
        return 1

    for point in points:
        # Receive side: errors + missed + no-buffer counters against RX frames.
        rx_level = tsg_diagnose_interface_drop_rate_ranking(
            point['device'],
            "RX",
            int(point['PhyRXError']) + int(point['PhyRXMissed']) + int(point['PhyRXNoBUF']),
            int(point['PhyRXFrame']),
        )
        interface_level = tsg_set_log_level(interface_level, rx_level)

        # Transmit side: error counter against TX frames.
        tx_level = tsg_diagnose_interface_drop_rate_ranking(
            point['device'],
            "TX",
            int(point['PhyTXError']),
            int(point['PhyTXFrame']),
        )
        interface_level = tsg_set_log_level(interface_level, tx_level)

    # No device raised the level: report one overall "normal" entry.
    if interface_level >= syslog.LOG_INFO:
        tsg_diagnose_write_log(interface_level, "interface", "normal")

    return 0
+
def tsg_diagnose_app_drop_rate_ranking(dir, drop_pkt_num, tot_pkt_num):
    """Rank the packet drop rate of the kni module for one direction.

    Args:
        dir: direction label used in the log message ("RX" or "TX" in the
            visible caller).
        drop_pkt_num: number of dropped packets.
        tot_pkt_num: total number of packets the drops are compared against.

    Returns:
        The syslog level assigned to the drop rate; ``syslog.LOG_INFO`` when
        it is below every threshold (nothing is logged in that case).
    """
    total = float(tot_pkt_num)
    if total != 0:
        base_rate = 100.0 * float(drop_pkt_num) / total
    else:
        # No traffic counted: a rate cannot be computed, so treat it as zero
        # instead of dividing by zero.
        base_rate = 0.0
    drop_rate = base_rate + debug_crit_rate_plus

    if drop_rate >= 0.01:
        new_level = syslog.LOG_CRIT
    elif drop_rate >= 0.005:
        new_level = syslog.LOG_WARNING
    elif drop_rate >= 0.001:
        new_level = syslog.LOG_NOTICE
    else:
        return syslog.LOG_INFO

    cur_level = tsg_set_log_level(syslog.LOG_INFO, new_level)
    # NOTE(review): "%.2f" shows rates below 0.005% as 0.00% - consider a
    # higher precision if log consumers allow it.
    msg = "module kni %s drop rate more than %.2f%%" %(dir, drop_rate)
    tsg_diagnose_write_log(cur_level, "app network", msg)

    return cur_level
+
def tsg_diagnose_for_app_stream():
    """Check RX/TX drop rates of application traffic via InfluxDB.

    Returns:
        0 when the application traffic is normal, 1 when a drop alarm was
        raised or the InfluxDB query failed.
    """
    app_net_level = syslog.LOG_INFO
    app_sql_str = "select * from app where RxDrops + TxDrops > 0 and time > now() -5d limit 1"

    ret, points, msg = tsg_influxb_query(g_influxdb_client, app_sql_str)
    if ret != 0:
        app_net_level = tsg_set_log_level(app_net_level, syslog.LOG_ERR)
        tsg_diagnose_write_log(app_net_level, "app network", "sql query error")
        return 1

    for point in points:
        rx_level = tsg_diagnose_app_drop_rate_ranking("RX", int(point['RxDrops']), int(point['RxPkts']))
        app_net_level = tsg_set_log_level(app_net_level, rx_level)

        tx_level = tsg_diagnose_app_drop_rate_ranking("TX", int(point['TxDrops']), int(point['TxPkts']))
        app_net_level = tsg_set_log_level(app_net_level, tx_level)

    # An alarm was raised (and already logged) by the ranking helper.
    if app_net_level < syslog.LOG_INFO:
        return 1

    tsg_diagnose_write_log(app_net_level, "app network", "normal")
    return 0
-def tsg_diagnose_for_app_stream():
- cur_level = syslog.LOG_INFO
- ptable.add_row([g_local_sled_name,"app network", G_SYS_LOG_STRING[cur_level],"normal"])
- return 0
-
-def tsg_common_diagnose_init(manual_cmd):
- global ptable
- global influxdb_client
def tsg_common_diagnose_init(is_cli_cmd):
    """Initialise the module globals used by the diagnose checks.

    Sets the output mode and log threshold, resolves the local sled name,
    creates the result table and connects to InfluxDB.

    Args:
        is_cli_cmd: non-zero when called from the foreground CLI command
            (results go to the table); 0 when called by the background
            polling script (results go to syslog).

    Returns:
        Always 0. The process exits with status 1 when the local sled name
        cannot be determined.
    """
    global g_ptable
    global g_influxdb_client
    global g_local_sled_name
    global g_log_level
    global g_is_cli_cmd
    global syslog_handle

    g_is_cli_cmd = is_cli_cmd

    if is_cli_cmd != 0:
        g_log_level = syslog.LOG_INFO
    else:
        g_log_level = syslog.LOG_NOTICE

    _sled_type, _sled_id, g_local_sled_name = tsg_whoami()
    if g_local_sled_name == "":
        if g_is_cli_cmd != 0:
            print("can't get local sled name")
        else:
            # syslog.openlog() returns None, so messages must be sent through
            # the syslog module itself rather than through its return value.
            syslog_handle = syslog.openlog()
            syslog_msg = "[%s] %s" %("diagnose", "can't get local sled name")
            syslog.syslog(syslog.LOG_ERR, syslog_msg)
        sys.exit(1)

    # Background polling script writes to syslog. Open it before any check
    # can log, so that every message carries the sled name as ident.
    if g_is_cli_cmd == 0:
        syslog_handle = syslog.openlog(g_local_sled_name)

    # pretty table init
    g_ptable = prettytable.PrettyTable()
    g_ptable.field_names = ["Sled", "Type", "Level", "Status"]

    # InfluxDB init
    # NOTE(review): credentials are hard-coded here - consider moving them to
    # configuration.
    influxdb_server_ip = "127.0.0.1"
    influxdb_server_port = 8086
    ret, g_influxdb_client, msg = tsg_influxdb_init(influxdb_server_ip, influxdb_server_port, 'admin', 'tsg2019', 'tsg_stat')
    if ret != 0:
        err_msg = "can't connect influxDB server %s:%d" %(influxdb_server_ip, influxdb_server_port)
        tsg_diagnose_write_log(syslog.LOG_ERR, "diagnose", err_msg)

    return 0
#cli 命令不影响syslog, 只是输出各种消息, 即便有warning, error, 自动轮询脚本也会执行检测到
#后台自动轮询命令增加 syslog-effect, 表示影响syslog,
-#参数manual_cmd表示是否是用户前台调用的cli命令
-def tsg_common_diagnose(manual_cmd):
- tsg_common_diagnose_init(manual_cmd)
+#参数is_cli_cmd表示是否是用户前台调用的cli命令
+def tsg_common_diagnose(is_cli_cmd):
+ if debug_crit_rate_plus > 0:
+ print("######### this is a debug fake crit alarm version!")
+
+ tsg_common_diagnose_init(is_cli_cmd)
#检查app进程是否存在, 是否最近n分钟内重启过
tsg_diagnose_for_app()
@@ -229,7 +382,9 @@ def tsg_common_diagnose(manual_cmd):
#检查应用流量
tsg_diagnose_for_app_stream()
+ if is_cli_cmd != 0:
+ print(g_ptable)
if __name__ == '__main__':
tsg_common_diagnose(1)
- print(ptable) \ No newline at end of file
+ \ No newline at end of file