diff options
| author | lijia <[email protected]> | 2019-08-11 23:20:35 +0800 |
|---|---|---|
| committer | lijia <[email protected]> | 2019-08-11 23:20:35 +0800 |
| commit | 78c989e3a3b7c475ae790436776c62e6fc7bbf7a (patch) | |
| tree | 027944580877e70275c5fd5fa15c0b5a3e30bb87 | |
| parent | c5820eb4b36985c9f0f5ea44eb57c13568e7c791 (diff) | |
完善相关命令, 增加diagnose命令.
29 files changed, 869 insertions, 162 deletions
@@ -1 +1,2 @@
 py_bin/*
+py_temp/*
diff --git a/deploy_doc/deploy.md b/deploy_doc/deploy.md
deleted file mode 100644
index aa7c573..0000000
--- a/deploy_doc/deploy.md
+++ /dev/null
@@ -1,22 +0,0 @@
-1.交换板文件
- /opt/tsg/bin
- /opt/tsg/etc
-
- 更新telegraf.conf的global tags, 本机sn.
-
-2.计算板文件(4台)
-
-
-3.服务
-启动交换板crontab服务:
-service crond start
-chkconfig crond on
-crontab -e, 输入以下内容:
-* * * * * /opt/tsg/bin/tsg_update_tags
-* * * * * sleep 10; /opt/tsg/bin/tsg_update_tags
-* * * * * sleep 20; /opt/tsg/bin/tsg_update_tags
-* * * * * sleep 30; /opt/tsg/bin/tsg_update_tags
-* * * * * sleep 40; /opt/tsg/bin/tsg_update_tags
-* * * * * sleep 50; /opt/tsg/bin/tsg_update_tags
-
-修改后要执行: service crond restart
\ No newline at end of file
diff --git a/deploy_doc/cli部署.txt b/deploy_doc/oam_cli_deply.md
index 5e4a4d6..5e4a4d6 100644
--- a/deploy_doc/cli部署.txt
+++ b/deploy_doc/oam_cli_deply.md
diff --git a/deploy_doc/tsg_cli_deploy.md b/deploy_doc/tsg_cli_deploy.md
new file mode 100644
index 0000000..cacaf5e
--- /dev/null
+++ b/deploy_doc/tsg_cli_deploy.md
@@ -0,0 +1,25 @@
+1.交换板文件
+ /opt/tsg/bin
+ /opt/tsg/tools
+ /opt/tsg/etc
+
+ 更新telegraf.conf的global tags, 本机sn.
+
+2.计算板文件(4台)
+
+
+3.服务
+启动交换板crontab服务:
+service crond start
+chkconfig crond on
+crontab -e, 输入以下内容:
+* * * * * /opt/tsg/tools/tsg_update_tags
+* * * * * /opt/tsg/tools/tsg_monit_interface
+* * * * * /opt/tsg/tools/tsg_monit_stream
+* * * * * sleep 10; /opt/tsg/tools/tsg_update_tags
+* * * * * sleep 20; /opt/tsg/tools/tsg_update_tags
+* * * * * sleep 30; /opt/tsg/tools/tsg_update_tags
+* * * * * sleep 40; /opt/tsg/tools/tsg_update_tags
+* * * * * sleep 50; /opt/tsg/tools/tsg_update_tags
+
+修改后要执行: service crond restart
\ No newline at end of file
diff --git a/deploy_etc/tsg_module_deploy.json b/deploy_etc/tsg_module_deploy.json
new file mode 100644
index 0000000..0afcdb3
--- /dev/null
+++ b/deploy_etc/tsg_module_deploy.json
@@ -0,0 +1,12 @@
+{
+    "modules_deploy": {
+        "mcn0": ["kni", "a.out"],
+        "mcn1": ["tfe", "a1.out"],
+        "mcn2": ["tfe", "a2.out"],
+        "mcn3": ["tfe", "a3.out"]
+    },
+    "modules_operator": {
+        "kni": ["sapp", "r3 sapp", "/home/tsg/kni", "r2", "killall", "exec", "ps"],
+        "telegraf": ["teleraf", "#", "#", "#", "systemctl_stop", "systemctl_start", "systemctl_status"]
+    }
+}
\ No newline at end of file
diff --git a/deploy_etc/tsg_modules_operator.json.bak b/deploy_etc/tsg_modules_operator.json.bak
new file mode 100644
index 0000000..273e9fa
--- /dev/null
+++ b/deploy_etc/tsg_modules_operator.json.bak
@@ -0,0 +1,6 @@
+{
+    "modules_operator": {
+        "sapp": ["sapp", "r3", "/home/tsg/kni", "r2", "killall", "exec", "ps"],
+        "telegraf": ["teleraf", "#", "#", "#", "systemctl_stop", "systemctl_start", "systemctl_status"]
+    }
+}
\ No newline at end of file
diff --git a/py_cmd/.gitignore b/py_cmd/.gitignore
new file mode 100644
index 0000000..bee8a64
--- /dev/null
+++ b/py_cmd/.gitignore
@@ -0,0 +1 @@
+__pycache__
diff --git a/py_cmd/common_modules_deploy.json b/py_cmd/common_modules_deploy.json
deleted file mode 100644
index 786644f..0000000
--- a/py_cmd/common_modules_deploy.json
+++ /dev/null
@@ -1,12 +0,0 @@
-#此文件定义四块计算板分别运行什么模块
-#先根据tsg_chassis_ip.json, 根据当前设备的ip地址, 知道当前设备的sled名称
-#然后根据设备类型名称, 找到当前运行的所有模块(应用)名称
-#然后根据模块名称, 到common_modules_operator.json查找每个模块的操作方法
-{
- "modules_dispatch":
-
- "mcn0": ["kni"]
- "mcn1": ["tfe"]
- "mcn2": ["tfe"]
- "mcn3": ["tfe"]
-}
\ No newline at end of file
diff --git a/py_cmd/common_modules_operator.json b/py_cmd/common_modules_operator.json
deleted file mode 100644
index 1bd6a16..0000000
--- a/py_cmd/common_modules_operator.json
+++ /dev/null
@@ -1,26 +0,0 @@
-#此文件定义, 每个模块的启动、停止、检查运行状态的方法
-{
- "modules":[
- {sapp:
- start:
- "cwd"
- "exe"
- stop:
- "killall -9 r3 sapp"
- check healthy:
- "ps -ef | grep sapp "
- }
-
- {telegraf:
- start:
- "cwd" : ""
- "exe" : "service influxdb restart"
- stop:
- "service influxdb stop"
- check healthy:
- "service influxdb status | grep Active "
- }
-
- ]
-
-}
\ No newline at end of file diff --git a/py_cmd/tsg_monit_device.py b/py_cmd/tsg_monit_device.py.bak index e8d0e63..e8d0e63 100644 --- a/py_cmd/tsg_monit_device.py +++ b/py_cmd/tsg_monit_device.py.bak diff --git a/py_cmd/tsg_monit_device_by_telegraf.py b/py_cmd/tsg_monit_device_by_telegraf.py.bak index f2366a2..f2366a2 100644 --- a/py_cmd/tsg_monit_device_by_telegraf.py +++ b/py_cmd/tsg_monit_device_by_telegraf.py.bak diff --git a/py_cmd/tsg_monit_stream.py b/py_cmd/tsg_monit_stream.py.bak index c34f217..c34f217 100644 --- a/py_cmd/tsg_monit_stream.py +++ b/py_cmd/tsg_monit_stream.py.bak diff --git a/py_cmd/tsg_software_reboot.py b/py_cmd/tsg_software_reboot.py index efb592c..6320e83 100644 --- a/py_cmd/tsg_software_reboot.py +++ b/py_cmd/tsg_software_reboot.py @@ -5,6 +5,11 @@ import syslog import subprocess import time import re +from sys import path +path.append(r'../py_common') #将存放module的路径添加进来 +from common_modules_operator import * +from common_whoami import * +from common_modules_deploy import * ##define KERN_EMERG "<0>" /* system is unusable */ ##define KERN_ALERT "<1>" /* action must be taken immediately */ @@ -15,7 +20,7 @@ import re ##define KERN_INFO "<6>" /* informational */ ##define KERN_DEBUG "<7>" /* debug-level messages */ -MSG_PREFIX = ['EMERG', 'ALERT', 'CRIT', 'ERR', 'WARNING', 'NOTICE', 'INFO', 'DEBUG'] +G_SYS_LOG_STRING = ['EMERG', 'ALERT', 'CRIT', 'ERR', 'WARNING', 'NOTICE', 'INFO', 'DEBUG'] G_LOCAL_NODE_NAME = "TSG_MXN" @@ -23,18 +28,18 @@ class CommandException(Exception): pass def tsg_restart_err_log(error_num, user_msg): - msg = "[%s] %s" %(MSG_PREFIX[syslog.LOG_ERR], user_msg) + msg = "[%s] %s" %(G_SYS_LOG_STRING[syslog.LOG_ERR], user_msg) syslog.syslog(syslog.LOG_ERR, msg) print (msg) - msg = "[%s] %s" %(MSG_PREFIX[syslog.LOG_ERR], "tsg software reboot error") + msg = "[%s] %s" %(G_SYS_LOG_STRING[syslog.LOG_ERR], "tsg software reboot error") syslog.syslog(syslog.LOG_ERR, msg) print (msg) sys.exit(error_num) def tsg_restart_succ_log(): - 
msg = "[%s] %s" %(MSG_PREFIX[syslog.LOG_NOTICE], "tsg software reboot success") + msg = "[%s] %s" %(G_SYS_LOG_STRING[syslog.LOG_NOTICE], "tsg software reboot success") syslog.syslog(syslog.LOG_NOTICE, msg) print (msg) sys.exit(0) @@ -97,36 +102,20 @@ def tsg_kill_app_process_by_killall(module_name, extra_progs): return 0 def tsg_kill_app_process_by_systemctl_stop(module_name): - print("TODO") + print("tsg_kill_app_process_by_systemctl_stop TODO!!!") sys.exit(1) return 0 -def tsg_stop_app_process(module_name, extra_progs, stop_method, check_method): +def tsg_stop_app_process(module_name, extra_progs, stop_func, check_func): res_code = 0 running_flag = 0 - if check_method == 'ps': - check_func = tsg_check_process_health_by_ps - elif check_method == 'systemctl_status': - check_func = tsg_check_process_health_by_systemctl_status - else: - print("not support check method:%s" %(check_method)) - sys.exit(1) - running_flag = check_func(module_name) if running_flag == 0: - #print("%s is not running, start it..." %(module_name)) + logger.debug("%s is not running, start it..." 
%(module_name)) return 0 #ready to stop progcess, retry for 3 times - - if stop_method == 'killall': - stop_func = tsg_kill_app_process_by_killall - elif stop_method == 'systemctl_stop': - stop_func = tsg_kill_app_process_by_systemctl_stop - else: - print("not support stop method:%s" %(check_method)) - sys.exit(1) for times in range(3): stop_func(module_name, extra_progs) @@ -140,55 +129,42 @@ def tsg_stop_app_process(module_name, extra_progs, stop_method, check_method): if res_code != 0: errmsg = "can't stop process %s" %(module_name) + logger.error(errmsg) tsg_restart_err_log(res_code, errmsg) return res_code -def tsg_start_app_process_by_exec_call(module_name, module_cwd, module_exe, check_method): +def tsg_start_app_process_by_exec_call(module_name, module_cwd, module_exe): + logger.debug("try cd to dir:%s" %(module_cwd)) try: os.chdir(module_cwd) except Exception as e: print("%s" %(e)) return 1 + logger.debug("try call exec :%s" %(module_exe)) cmd_str = "./%s" %(module_exe) ret_code, output = system_cmd_run(cmd_str) if ret_code != 0: errmsg = "start program %s error, call %s/%s failed" %(module_name, module_cwd, module_exe) tsg_restart_err_log(ret_code, errmsg) - - if check_method == 'ps': - check_func = tsg_check_process_health_by_ps - elif check_method == 'systemctl_status': - check_func = tsg_check_process_health_by_systemctl_status - else: - print("not support check method:%s" %(check_method)) - return 1 - - running_flag = check_func(module_name) - if running_flag == 0: - errmsg = "start process %s error" %(module_name) - tsg_restart_err_log(ret_code, errmsg) - + return 0 def tsg_start_app_process_by_systemctl_start(module_name, module_cwd, module_exe, check_method): - print("TODO") + print("tsg_start_app_process_by_systemctl_start TODO!!!!!") sys.exit(1) -def tsg_start_app_process(module_name, module_cwd, module_exe, start_method, check_method): - if start_method == 'exec_call': - start_func = tsg_start_app_process_by_exec_call - elif start_method == 
'systemctl_start': - start_func = tsg_start_app_process_by_systemctl_start - else: - print("not support start method:%s" %(start_method)) - sys.exit(1) - - ret = start_func(module_name, module_cwd, module_exe, check_method) +def tsg_start_app_process(module_name, module_cwd, module_exe, start_func, check_func): + ret = start_func(module_name, module_cwd, module_exe) if ret != 0: - sys.exit(1) + return 1 + running_flag = check_func(module_name) + if running_flag == 0: + errmsg = "start process %s error" %(module_name) + return 1 + return 0 #参数说明: @@ -201,25 +177,82 @@ def tsg_start_app_process(module_name, module_cwd, module_exe, start_method, che # start_method: 启动应用方法 # check_method: 检测应用是否运行方法 # -def tsg_restart_app_process(module_name, extra_progs, module_cwd, module_exe, stop_method, start_method, check_method): +def tsg_restart_app_process(module_name, extra_progs, module_cwd, module_exe, start_func, stop_func, check_func): res_code = 0 - res_code = tsg_stop_app_process(module_name, extra_progs, stop_method, check_method) + res_code = tsg_stop_app_process(module_name, extra_progs, stop_func, check_func) if res_code != 0: return res_code - res_code = tsg_start_app_process(module_name, module_cwd, module_exe, start_method, check_method) + res_code = tsg_start_app_process(module_name, module_cwd, module_exe, start_func, check_func) if res_code != 0: return res_code return 0 + +#根据配置文件的参数, 选择用那种操作函数继续下一步 +#返回值, 函数指针: +#ret_code, start_fun, stop_fun, check_health_fun +def tsg_get_operator_by_config(module_operator): + if module_operator[TSG_OP_MODULE_START_INDEX] == 'exec': + start_func = tsg_start_app_process_by_exec_call + elif module_operator[TSG_OP_MODULE_START_INDEX] == 'systemctl_start': + start_func = tsg_start_app_process_by_systemctl_start + else: + errmsg = "not support start method:%s, only be [exec, systemctl_start]" %(module_operator[TSG_OP_MODULE_START_INDEX]) + tsg_restart_err_log(1, errmsg) + + if module_operator[TSG_OP_MODULE_STOP_INDEX] == 
'killall': + stop_func = tsg_kill_app_process_by_killall + elif module_operator[TSG_OP_MODULE_STOP_INDEX] == 'systemctl_stop': + stop_func = tsg_kill_app_process_by_systemctl_stop + else: + errmsg = "not support stop method:%s, only be [killall, systemctl_stop]" %(module_operator[TSG_OP_MODULE_STOP_INDEX]) + tsg_restart_err_log(1, errmsg) + + if module_operator[TSG_OP_MODULE_STATUS_INDEX] == 'ps': + check_func = tsg_check_process_health_by_ps + elif module_operator[TSG_OP_MODULE_STATUS_INDEX] == 'systemctl_status': + check_func = tsg_check_process_health_by_systemctl_status + else: + errmsg = "not support check method:%s, only be [ps, systemctl_status]" %(module_operator[TSG_OP_MODULE_STATUS_INDEX]) + tsg_restart_err_log(1, errmsg) + + return 0, start_func, stop_func, check_func def tsg_software_reboot(): #G_LOCAL_NODE_NAME = get_local_node_name() log_handle = syslog.openlog(G_LOCAL_NODE_NAME) - - tsg_restart_app_process("sapp", "r3", "/home/tsg/kni", "r2", "killall", "exec_call", "ps") + + sled_type,sled_id,sled_name = tsg_whoami() + if sled_name == "": + tsg_restart_err_log(1, "can't get local sled name") + sys.exit(1) + + module_array = tsg_get_local_sled_modules(sled_name) + if len(module_array) <= 0: + tsg_restart_err_log(1, "can't get local sled modules") + sys.exit(1) + + logger.debug("len(module_array) = %d" %(len(module_array))) + for module_name in module_array: + module_operator = tsg_get_module_opertor(module_name) + if len(module_operator) <= 0: + tsg_restart_err_log(1, "can't get local sled module operator for %s" %(module_name)) + sys.exit(1) + + ret, start_func, stop_func, check_func = tsg_get_operator_by_config(module_operator) + if ret != 0: + tsg_restart_err_log(1, "can't get operator for %s" %(module_operator[TSG_OP_MODULE_NAME_INDEX])) + sys.exit(1) + + tsg_restart_app_process(module_operator[TSG_OP_MODULE_NAME_INDEX], module_operator[TSG_OP_MODULE_EXTRA_INDEX], + module_operator[TSG_OP_MODULE_CWD_INDEX], + 
module_operator[TSG_OP_MODULE_EXE_INDEX], + start_func, stop_func, check_func) + + #tsg_restart_app_process("sapp", "r3", "/home/tsg/kni", "r2", "killall", "exec_call", "ps") #tsg_restart_app_process("telegraf", "systemctl_stop", "systemctl_start", "systemctl_status") #tsg_restart_app_process("marsio", "systemctl_stop", "systemctl_start", "systemctl_status") #tsg_restart_app_process("influxd", "systemctl_stop", "systemctl_start", "systemctl_status") @@ -227,4 +260,11 @@ def tsg_software_reboot(): tsg_restart_succ_log() if __name__ == '__main__': + global logger + + if len(sys.argv) >= 2 and sys.argv[1] == "debug": + logger = logger_init(logging.DEBUG) + else: + logger = logger_init(logging.CRITICAL) + tsg_software_reboot() diff --git a/py_cmd/tsg_software_reboot.py.bak b/py_cmd/tsg_software_reboot.py.bak new file mode 100644 index 0000000..efb592c --- /dev/null +++ b/py_cmd/tsg_software_reboot.py.bak @@ -0,0 +1,230 @@ +#coding=utf-8 +import os +import sys +import syslog +import subprocess +import time +import re + +##define KERN_EMERG "<0>" /* system is unusable */ +##define KERN_ALERT "<1>" /* action must be taken immediately */ +##define KERN_CRIT "<2>" /* critical conditions */ +##define KERN_ERR "<3>" /* error conditions */ +##define KERN_WARNING "<4>" /* warning conditions */ +##define KERN_NOTICE "<5>" /* normal but significant condition */ +##define KERN_INFO "<6>" /* informational */ +##define KERN_DEBUG "<7>" /* debug-level messages */ + +MSG_PREFIX = ['EMERG', 'ALERT', 'CRIT', 'ERR', 'WARNING', 'NOTICE', 'INFO', 'DEBUG'] + +G_LOCAL_NODE_NAME = "TSG_MXN" + +class CommandException(Exception): + pass + +def tsg_restart_err_log(error_num, user_msg): + msg = "[%s] %s" %(MSG_PREFIX[syslog.LOG_ERR], user_msg) + syslog.syslog(syslog.LOG_ERR, msg) + print (msg) + + msg = "[%s] %s" %(MSG_PREFIX[syslog.LOG_ERR], "tsg software reboot error") + syslog.syslog(syslog.LOG_ERR, msg) + print (msg) + + sys.exit(error_num) + +def tsg_restart_succ_log(): + msg = "[%s] %s" 
%(MSG_PREFIX[syslog.LOG_NOTICE], "tsg software reboot success") + syslog.syslog(syslog.LOG_NOTICE, msg) + print (msg) + sys.exit(0) + +#return exitcode value + output message: +# 0: succ +# 1: error +def system_cmd_run(cmd_str): + dangerous_cmd = {"rm", "mv", "poweroff", "shutdown"} + + for cmd in dangerous_cmd: + pattern = "\s*%s" %(cmd) + match_str = re.match(pattern, cmd_str) + if not match_str is None: + print("can't run this cmd:%s" %(cmd_str)) + sys.exit(1) + + try: + exitcode, output = subprocess.getstatusoutput(cmd_str) + except Exception as e: + print(e) + print("###### %s" %(e.message)) + #if exitcode != 0: + # output = "" + return 1, e.message + + return exitcode, output + +#return value: +# 1: progcess of prog_name is exist +# 0: progcess of prog_name is not exist +def tsg_check_process_health_by_ps(module_name): + cmd_str = "ps -afx | grep %s | grep -v grep" %(module_name) + exitcode, output = system_cmd_run(cmd_str) + if exitcode == 0: + return 1 + + return 0 + +#return value: +# 1: progcess of prog_name is exist +# 0: progcess of prog_name is not exist +def tsg_check_process_health_by_systemctl_status(module_name): + print("systemctl_status check method TODO!") + sys.exit(1) + return 0 + +def tsg_kill_app_process_by_killall(module_name, extra_progs): + #todo , stop sapp, xxx, check process exist or not, maybe zombie, maybe very slow + command = "killall -9 %s %s" %(module_name, extra_progs) + try: + exitcode, output = subprocess.getstatusoutput(command) + #print("%d" %(exitcode)) + except Exception as e: + pass + + if exitcode != 0: + return 1 + + return 0 + +def tsg_kill_app_process_by_systemctl_stop(module_name): + print("TODO") + sys.exit(1) + return 0 + +def tsg_stop_app_process(module_name, extra_progs, stop_method, check_method): + res_code = 0 + running_flag = 0 + + if check_method == 'ps': + check_func = tsg_check_process_health_by_ps + elif check_method == 'systemctl_status': + check_func = tsg_check_process_health_by_systemctl_status + 
else: + print("not support check method:%s" %(check_method)) + sys.exit(1) + + running_flag = check_func(module_name) + + if running_flag == 0: + #print("%s is not running, start it..." %(module_name)) + return 0 + #ready to stop progcess, retry for 3 times + + if stop_method == 'killall': + stop_func = tsg_kill_app_process_by_killall + elif stop_method == 'systemctl_stop': + stop_func = tsg_kill_app_process_by_systemctl_stop + else: + print("not support stop method:%s" %(check_method)) + sys.exit(1) + + for times in range(3): + stop_func(module_name, extra_progs) + #此处不判断stop_func的返回值, 可能程序不存在, 可能守护不存在,等原因 + #直接用check()方法检测stop()的成功 + res_code = tsg_check_process_health_by_ps(module_name) + if res_code != 0: + continue + else: + break + + if res_code != 0: + errmsg = "can't stop process %s" %(module_name) + tsg_restart_err_log(res_code, errmsg) + + return res_code + +def tsg_start_app_process_by_exec_call(module_name, module_cwd, module_exe, check_method): + try: + os.chdir(module_cwd) + except Exception as e: + print("%s" %(e)) + return 1 + + cmd_str = "./%s" %(module_exe) + ret_code, output = system_cmd_run(cmd_str) + if ret_code != 0: + errmsg = "start program %s error, call %s/%s failed" %(module_name, module_cwd, module_exe) + tsg_restart_err_log(ret_code, errmsg) + + if check_method == 'ps': + check_func = tsg_check_process_health_by_ps + elif check_method == 'systemctl_status': + check_func = tsg_check_process_health_by_systemctl_status + else: + print("not support check method:%s" %(check_method)) + return 1 + + running_flag = check_func(module_name) + if running_flag == 0: + errmsg = "start process %s error" %(module_name) + tsg_restart_err_log(ret_code, errmsg) + + return 0 + +def tsg_start_app_process_by_systemctl_start(module_name, module_cwd, module_exe, check_method): + print("TODO") + sys.exit(1) + +def tsg_start_app_process(module_name, module_cwd, module_exe, start_method, check_method): + if start_method == 'exec_call': + start_func = 
tsg_start_app_process_by_exec_call + elif start_method == 'systemctl_start': + start_func = tsg_start_app_process_by_systemctl_start + else: + print("not support start method:%s" %(start_method)) + sys.exit(1) + + ret = start_func(module_name, module_cwd, module_exe, check_method) + if ret != 0: + sys.exit(1) + + return 0 + +#参数说明: +# +# module_name: 模块名称 +# extra_progs: 其他需要kill的附加程序, 如sapp的r3守护, 需要杀掉, 否则后台可能会重复启动sapp +# module_cwd: 应用的绝对路径 +# module_exe: 启动应用的名称, 可能跟module_name不一样, 比如用r2启动sapp +# stop_method: 停止应用方法 +# start_method: 启动应用方法 +# check_method: 检测应用是否运行方法 +# +def tsg_restart_app_process(module_name, extra_progs, module_cwd, module_exe, stop_method, start_method, check_method): + res_code = 0 + + res_code = tsg_stop_app_process(module_name, extra_progs, stop_method, check_method) + if res_code != 0: + return res_code + + res_code = tsg_start_app_process(module_name, module_cwd, module_exe, start_method, check_method) + if res_code != 0: + return res_code + + return 0 + +def tsg_software_reboot(): + #G_LOCAL_NODE_NAME = get_local_node_name() + + log_handle = syslog.openlog(G_LOCAL_NODE_NAME) + + tsg_restart_app_process("sapp", "r3", "/home/tsg/kni", "r2", "killall", "exec_call", "ps") + #tsg_restart_app_process("telegraf", "systemctl_stop", "systemctl_start", "systemctl_status") + #tsg_restart_app_process("marsio", "systemctl_stop", "systemctl_start", "systemctl_status") + #tsg_restart_app_process("influxd", "systemctl_stop", "systemctl_start", "systemctl_status") + + tsg_restart_succ_log() + +if __name__ == '__main__': + tsg_software_reboot() diff --git a/py_tools/common_args.py b/py_common/common_args.py index 52b93de..406a5b4 100644 --- a/py_tools/common_args.py +++ b/py_common/common_args.py @@ -1,16 +1,16 @@ -#coding=utf-8
-import argparse
-
-def setup_common_args():
- parser = argparse.ArgumentParser(description='TSG OAM Argument Parser')
-
- parser.add_argument('-g', '--debug', help = 'debug mode, default is disable',
- action='store_true', default = 0)
- parser.add_argument('-l', '--log-level', help = 'debug log level, support:10,20,30,40,50, default is:30',
- type=int, default = 30)
- parser.add_argument('--telegraf-ip', help = 'send log to telegraf ip address, default is:192.168.200.5',
- type=str, default = '192.168.200.5')
- parser.add_argument('--telegraf-port', help = 'send log to telegraf port, default is:8126',
- type=int, default = 8126)
-
+#coding=utf-8
+import argparse
+
+def setup_common_args():
+    parser = argparse.ArgumentParser(description='TSG OAM Argument Parser')
+
+    parser.add_argument('-g', '--debug', help = 'debug mode, default is disable',
+                        action='store_true', default = 0)
+    parser.add_argument('-l', '--log-level', help = 'debug log level, support:10,20,30,40,50, default is:30',
+                        type=int, default = 30)
+    parser.add_argument('--telegraf-ip', help = 'send log to telegraf ip address, default is:192.168.200.5',
+                        type=str, default = '192.168.200.5')
+    parser.add_argument('--telegraf-port', help = 'send log to telegraf port, default is:8126',
+                        type=int, default = 8126)
+
     return parser
\ No newline at end of file diff --git a/py_common/common_diagnose.py b/py_common/common_diagnose.py new file mode 100644 index 0000000..25f8234 --- /dev/null +++ b/py_common/common_diagnose.py @@ -0,0 +1,171 @@ +# coding: utf-8 +#diagnose公共模块, 供cli命令和backgroud后台自动运行工具调用 +import sys +import time +import json +import logging +import syslog +import prettytable +from sys import path +path.append(r'../py_common') #将存放module的路径添加进来 +path.append(r'../py_cmd') #将存放module的路径添加进来 +from common_telegraf import * +from common_logger import * +from common_args import * +from common_logger import * +from common_influxdb import * +from common_whoami import * +from common_system_cmd import * +from tsg_software_reboot import * + +#syslog 级别: +#define KERN_EMERG "<0>" /* system is unusable */ +#define KERN_ALERT "<1>" /* action must be taken immediately */ +#define KERN_CRIT "<2>" /* critical conditions */ +#define KERN_ERR "<3>" /* error conditions */ +#define KERN_WARNING "<4>" /* warning conditions */ +#define KERN_NOTICE "<5>" /* normal but significant condition */ +#define KERN_INFO "<6>" /* informational */ +#define KERN_DEBUG "<7>" /* debug-level messages */ +#参数log_level表示高于此级别的才输出, syslog值越小优先级越高, +#即log_level <= diagnose_level时输出结果, 如果级别高于ERR, 同时写入syslog + +#如果没有任何警告和错误, 显示normal +#所有检测项不能中途退出, 即使有错误, 显示当前检测项的错误后, 继续, 保证全检测一遍 + +#根据优先级, 设置新的日志等级 +def tsg_set_log_level(old_level, new_level): + if new_level < old_level: + return new_level + else: + return old_level + + +def tsg_diagnose_for_app(log_level): + cur_level = syslog.LOG_INFO + err_code = 0 + + sled_type,sled_id,sled_name = tsg_whoami() + if sled_name == "": + print("can't get local sled name") + return 1 + + module_array = tsg_get_local_sled_modules(sled_name) + if len(module_array) <= 0: + cur_level = tsg_set_log_level(cur_level, syslog.LOG_ERR) + ptable.add_row([g_local_sled_name,"common", G_SYS_LOG_STRING[cur_level],"can't get local sled modules"]) + #print("can't get local sled modules") + return 1 + + for 
module_name in module_array: + module_operator = tsg_get_module_opertor(module_name) + if len(module_operator) <= 0: + cur_level = tsg_set_log_level(cur_level, syslog.LOG_ERR) + err_msg = "can't get local sled module operator for %s" %(module_name) + ptable.add_row([g_local_sled_name,"common", G_SYS_LOG_STRING[cur_level], err_msg]) + #print(err_msg) + err_code += 1 + + ret, start_func, stop_func, check_func = tsg_get_operator_by_config(module_operator) + if ret != 0: + cur_level = tsg_set_log_level(cur_level, syslog.LOG_ERR) + err_msg = "can't get operator for %s" %(module_operator[TSG_OP_MODULE_NAME_INDEX]) + ptable.add_row([g_local_sled_name,"common", G_SYS_LOG_STRING[cur_level],err_msg]) + #print(err_msg) + err_code += 1 + + running_flag = check_func(module_operator[TSG_OP_MODULE_NAME_INDEX]) + if running_flag == 0: + cur_level = tsg_set_log_level(cur_level, syslog.LOG_EMERG) + err_msg = "app module %s is not running" %(module_name) + ptable.add_row([g_local_sled_name,"app", G_SYS_LOG_STRING[cur_level],err_msg]) + err_code += 1 + + #todo, check for restart time + if cur_level >= syslog.LOG_INFO: + ptable.add_row([g_local_sled_name,"app", G_SYS_LOG_STRING[cur_level], "normal"]) + + return err_code + +def tsg_diagnose_for_cpu(log_level): + cur_level = syslog.LOG_INFO + ptable.add_row([g_local_sled_name,"cpu", G_SYS_LOG_STRING[cur_level],"normal"]) + return 0 + +def tsg_diagnose_for_mem(log_level): + cur_level = syslog.LOG_INFO + ptable.add_row([g_local_sled_name,"memory", G_SYS_LOG_STRING[cur_level],"normal"]) + return 0 + +def tsg_diagnose_for_disk(log_level): + cur_level = syslog.LOG_INFO + + ret, cmd_res = system_cmd_run("df | awk {'print $1,$5'}") + + print(cmd_res.split()) + + print(len(cmd_res.split())) + print(cmd_res.split()[0]) + print(cmd_res.split()[1]) + print(cmd_res.split()[2]) + + ptable.add_row([g_local_sled_name,"disk", G_SYS_LOG_STRING[cur_level],"normal"]) + return 0 + +def tsg_diagnose_for_interface(log_level): + cur_level = syslog.LOG_INFO + 
sql_str = "select * from interface where PhyRXBits+PhyRXError+PhyRXMissed+PhyRXNoBUF > 0 and time > now() -5m limit 1" + + ret, points, msg = tsg_influxb_query(influxdb_client, sql_str) + if ret == 0: + for point in points: + cur_level = tsg_set_log_level(cur_level, syslog.LOG_WARNING) + msg = "%s has droped %d packets" %(point['device'], int(point['PhyRXBits']) + int(point['PhyRXMissed']) + int(point['PhyRXNoBUF']) ) + ptable.add_row([g_local_sled_name,"physical network", G_SYS_LOG_STRING[cur_level], msg]) + + if cur_level >= syslog.LOG_INFO: + ptable.add_row([g_local_sled_name,"physical network", G_SYS_LOG_STRING[cur_level], "normal"]) + return 0 + + return 1 + +def tsg_diagnose_for_app_stream(log_level): + cur_level = syslog.LOG_INFO + ptable.add_row([g_local_sled_name,"app network", G_SYS_LOG_STRING[cur_level],"normal"]) + return 0 + + +def tsg_common_diagnose(log_level): + #检查app进程是否存在, 是否最近n分钟内重启过 + tsg_diagnose_for_app(log_level) + + #检查CPU占用率 + tsg_diagnose_for_cpu(log_level) + + #检查mem占用率 + tsg_diagnose_for_mem(log_level) + + #检查磁盘占用率 + tsg_diagnose_for_disk(log_level) + + #检查物理网络情况 + tsg_diagnose_for_interface(log_level) + + #检查应用流量 + tsg_diagnose_for_app_stream(log_level) + +if __name__ == '__main__': + global ptable + global influxdb_client + global g_local_sled_name + type, id, g_local_sled_name = tsg_whoami() + + ptable = prettytable.PrettyTable() + ptable.field_names = ["Sled", "Type", "Level", "Status"] + + ret, influxdb_client, msg = tsg_influxdb_init('127.0.0.1', 8086, 'admin', 'tsg2019', 'tsg_stat') + if ret != 0: + ptable.add_row([g_local_sled_name,"common", G_SYS_LOG_STRING[syslog.LOG_ERR],"can't connect influxDB server"]) + + tsg_common_diagnose(syslog.LOG_DEBUG) + print(ptable)
\ No newline at end of file diff --git a/py_tools/common_get_tags.py b/py_common/common_get_tags.py index f347171..7e3d148 100644 --- a/py_tools/common_get_tags.py +++ b/py_common/common_get_tags.py @@ -1,30 +1,30 @@ -import json
-
-SN_JSON_PATH = '/opt/tsg/etc/tsg_tags.json'
-
-def tsg_get_tags_json():
- try:
- with open(SN_JSON_PATH) as json_fp:
- json_dict = json.load(json_fp)
- return json_dict['tags']
- except IOError:
- return ""
-
-def tsg_get_tags():
- tags = {}
- json_fp = tsg_get_tags_json()
- if len(json_fp) > 0:
- for tag_item in json_fp:
- #print(tag_item)
- tags[tag_item['tag']] = tag_item['value']
-
- return tags
-
-def main():
- tags = {}
-
- tags = tsg_get_tags()
- print(tags)
-
-if __name__ == '__main__':
+import json
+
+SN_JSON_PATH = '/opt/tsg/etc/tsg_tags.json'
+
+def tsg_get_tags_json():
+    try:
+        with open(SN_JSON_PATH) as json_fp:
+            json_dict = json.load(json_fp)
+            return json_dict['tags']
+    except IOError:
+        return ""
+
+def tsg_get_tags():
+    tags = {}
+    json_fp = tsg_get_tags_json()
+    if len(json_fp) > 0:
+        for tag_item in json_fp:
+            #print(tag_item)
+            tags[tag_item['tag']] = tag_item['value']
+
+    return tags
+
+def main():
+    tags = {}
+
+    tags = tsg_get_tags()
+    print(tags)
+
+if __name__ == '__main__':
     main()
\ No newline at end of file diff --git a/py_common/common_influxdb.py b/py_common/common_influxdb.py new file mode 100644 index 0000000..c8ca799 --- /dev/null +++ b/py_common/common_influxdb.py @@ -0,0 +1,41 @@ +# coding: utf-8 + +import sys +from influxdb import InfluxDBClient + + +#return value +#ret, result_array, message +def tsg_influxb_query(influxdb_client, sql_str): + try: + result = influxdb_client.query(sql_str) + except Exception as e: + return 1, {}, e + + return 0, result.get_points(), "succ" + +#return value: +#ret, client, error_msg +def tsg_influxdb_init(arg_host,arg_port,arg_username,arg_password,arg_db): + try: + influxdb_client = InfluxDBClient(host=arg_host,port=arg_port,username=arg_username,password=arg_password,database=arg_db, timeout=3) + except Exception as e: + print("InfluxDBClient error: %s" %(e)) + return 1, None, e + + return 0, influxdb_client, "succ" + +if __name__ == '__main__': + ret, influxdb_client, msg = tsg_influxdb_init('127.0.0.1', 8086, 'admin', 'tsg2019', 'tsg_stat') + if ret != 0: + sys.exit(1) + + ret, points,msg = tsg_influxb_query(influxdb_client, "select * from interface order by time desc limit 1") + if ret == 0: + print("--for xxx in points:-----------------") + for point in points: + print("--------%s, %s" %(point['PhyRXBits'], point['PhyRXMissed'])) + + else: + print("query error, %s" %(res)) + diff --git a/py_common/common_json.py b/py_common/common_json.py new file mode 100644 index 0000000..61aec60 --- /dev/null +++ b/py_common/common_json.py @@ -0,0 +1,20 @@ +# coding: utf-8 + +import sys +import psutil +import time +import json + +#return value: +# 0, error msg, json_dict : succ +# 1, error msg, "" : error +def tsg_json_parse(file_name): + try: + with open(file_name) as json_fp: + try: + json_dict = json.load(json_fp) + return 0, "", json_dict + except Exception as e: + return 1, e, "" + except Exception as e: + return 1, e , ""
\ No newline at end of file diff --git a/py_tools/common_logger.py b/py_common/common_logger.py index e6e8f25..e6e8f25 100644 --- a/py_tools/common_logger.py +++ b/py_common/common_logger.py diff --git a/py_common/common_modules_deploy.py b/py_common/common_modules_deploy.py new file mode 100644 index 0000000..301fae5 --- /dev/null +++ b/py_common/common_modules_deploy.py @@ -0,0 +1,48 @@ +#coding=utf-8 +import os +import sys +import syslog +import subprocess +import time +import re +import logging +import logging.handlers +from common_system_cmd import * +from common_logger import * +from common_json import * +from common_modules_deploy import * + +# coding: utf-8 +#此文件定义四块计算板分别运行什么模块 +#先根据tsg_chassis_ip.json, 根据当前设备的ip地址, 知道当前设备的sled名称 +#然后根据设备类型名称, 找到当前运行的所有模块(应用)名称 +#然后根据模块名称, 到common_modules_operator.json查找每个模块的操作方法 +#{ +# "modules_deploy": { +# "mcn0": ["kni", "a.out"], +# "mcn1": ["tfe", "a1.out"], +# "mcn2": ["tfe", "a2.out"], +# "mcn3": ["tfe", "a3.out"] +# } +#} + +G_MODULE_DEPLOY_JSON = "/opt/tsg/etc/tsg_module_deploy.json" + +#返回当前板卡运行的所有模块, 数组形式 +def tsg_get_local_sled_modules(sled_name): + ret, err_msg, json_dict = tsg_json_parse(G_MODULE_DEPLOY_JSON) + if ret != 0: + return {} + + module_list = json_dict['modules_deploy'] + if len(module_list) <= 0: + return {} + + return module_list[sled_name] + + +if __name__ == '__main__': + global logger + logger = logger_init(10) + module_list = tsg_get_local_sled_modules("mcn0") + print(module_list)
#coding=utf-8
# --- py_common/common_modules_operator.py (new file in this commit) ---
import os
import sys
import syslog
import subprocess
import time
import re
import logging
import logging.handlers
from common_system_cmd import *
from common_logger import *
from common_json import *

# First, tsg_chassis_ip.json and the local IP address give the sled name of
# the current device.
# Then the sled name gives the names of all modules (applications) that run
# on it.
# Then each module name is looked up in common_modules_operator.json to find
# how that module is operated.
# This file defines how each module is started, stopped and status-checked.
# Meaning of the entries in a module's operator list:
#
#   prog_name:    name of the process that actually runs, e.g. the kni
#                 module actually runs sapp
#   extra_progs:  additional programs that must be killed too, e.g. sapp's
#                 r3 watchdog, otherwise sapp may be restarted repeatedly in
#                 the background
#   module_cwd:   absolute path of the application
#   module_exe:   name used to launch the application; may differ from the
#                 module name, e.g. sapp is launched through r2
#   stop_method:  how to stop the application
#   start_method: how to start the application
#   check_method: how to check whether the application is running
#
# Example: "kni": ["sapp", "r3", "/home/tsg/kni", "r2", "killall", "exec", "ps"],

# NOTE(review): the comment above mentions common_modules_operator.json, but
# the operator table is read from the deploy JSON below - confirm which file
# is intended.
G_MODULE_DEPLOY_JSON = "/opt/tsg/etc/tsg_module_deploy.json"

# Positions of the fields inside one module's operator list.
TSG_OP_MODULE_NAME_INDEX = 0
TSG_OP_MODULE_EXTRA_INDEX = 1
TSG_OP_MODULE_CWD_INDEX = 2
TSG_OP_MODULE_EXE_INDEX = 3
TSG_OP_MODULE_STOP_INDEX = 4
TSG_OP_MODULE_START_INDEX = 5
TSG_OP_MODULE_STATUS_INDEX = 6


def tsg_get_module_opertor(module_name):
    """Return the operator list configured for *module_name*.

    The table is the "modules_operator" object of G_MODULE_DEPLOY_JSON.
    An empty list is returned (after printing a diagnostic where applicable)
    when the file cannot be parsed, when "modules_operator" is missing or
    empty, or when *module_name* has no entry. The original raised KeyError
    when the "modules_operator" key was absent.
    """
    ret, err_msg, json_dict = tsg_json_parse(G_MODULE_DEPLOY_JSON)
    if ret != 0:
        print("open or parse json file %s error, %s" %(G_MODULE_DEPLOY_JSON, err_msg))
        return []

    operator_table = json_dict.get('modules_operator')
    if not operator_table:
        print("can't get modules_operator from %s" %(G_MODULE_DEPLOY_JSON))
        return []

    # Direct lookup replaces the original scan over every key.
    return operator_table.get(module_name, [])


if __name__ == '__main__':
    # Manual smoke test: print the operator list of the "kni" module.
    logger = logger_init(10)
    module_operator = tsg_get_module_opertor("kni")
    print(module_operator)
#coding=utf-8
# --- py_common/common_system_cmd.py (new file in this commit) ---
import os
import re
import subprocess
import sys

# Commands that must never be launched through system_cmd_run().
_DANGEROUS_CMDS = ("rm", "mv", "poweroff", "shutdown")


def system_cmd_run(cmd_str):
    """Run *cmd_str* through the shell and return ``(exitcode, output)``.

    ``exitcode`` is 0 on success; ``output`` is the command's combined
    stdout/stderr as returned by ``subprocess.getstatusoutput``.
    If launching the command raises, ``(1, str(error))`` is returned.

    A command that starts (after optional leading whitespace) with one of
    the names in ``_DANGEROUS_CMDS`` is refused: a message is printed and
    the process exits with status 1.

    The original used ``re`` and ``subprocess`` without importing them and
    read ``e.message``, which Python 3 exceptions do not have.
    """
    for cmd in _DANGEROUS_CMDS:
        # Prefix match, as in the original: "rm", "rm -rf x", "  rm x" ...
        if re.match(r"\s*%s" % re.escape(cmd), cmd_str) is not None:
            print("can't run this cmd:%s" %(cmd_str))
            sys.exit(1)

    try:
        exitcode, output = subprocess.getstatusoutput(cmd_str)
    except Exception as e:
        print(e)
        print("###### %s" %(e))
        return 1, str(e)

    return exitcode, output
\ No newline at end of file diff --git a/py_tools/common_telegraf.py b/py_common/common_telegraf.py index e1cdc43..e3e5744 100644 --- a/py_tools/common_telegraf.py +++ b/py_common/common_telegraf.py @@ -6,4 +6,5 @@ def telegraf_init(arg_host, arg_port, arg_tags): #global telegraf_client #global_tags = {'host' : "lijia", 'local_ip_addr': "127.0.0.1"} telegraf_client = telegraf.TelegrafClient(host = arg_host, port = arg_port, tags = arg_tags) - return telegraf_client
#coding=utf-8
# --- tail of the py_common/common_telegraf.py hunk: the patch re-adds
# --- "return telegraf_client" so that the file ends with a newline
# --- py_common/common_whoami.py (new file in this commit) ---
import os
import sys
import syslog
import subprocess
import time
import re
import logging
import logging.handlers
from common_system_cmd import *
from common_logger import *
from common_json import *
# The original also star-imported this very module (common_whoami); that
# self-import brought nothing into scope and has been dropped.

G_CHASSIS_IP_JSON = "/opt/tsg/etc/tsg_chassis_ip.json"


def _log_critical(message):
    """Log *message* at CRITICAL level.

    Uses the module-level ``logger`` when one exists (it is created in the
    ``__main__`` block below, or may come from a star import), otherwise
    falls back to a standard-library logger so that importing callers do
    not hit a NameError.
    """
    log = globals().get("logger") or logging.getLogger(__name__)
    log.critical(message)


def tsg_whoami():
    """Identify the local board by matching its IP addresses against the
    "tsg_chassis_ip" list of G_CHASSIS_IP_JSON.

    Returns ``(type, id, name)`` taken from the first matching entry, for
    example ``("mcn", 1, "mcn1")``, or ``("", -1, "")`` when the addresses
    cannot be listed, the JSON cannot be read, or nothing matches.
    """
    # NOTE(review): the original called tsg_sys_cmd_run(), which none of the
    # visible modules defines; common_system_cmd provides system_cmd_run()
    # with the same (exitcode, output) result - confirm this is the helper
    # that was meant.
    ret, iplist = system_cmd_run("hostname -I")
    if ret != 0:
        return "", -1, ""

    # Split on whitespace only; the original also split on ':', which broke
    # IPv6 addresses into fragments.
    local_ips = iplist.split()

    ret, err_msg, json_dict = tsg_json_parse(G_CHASSIS_IP_JSON)
    if ret != 0:
        return "", -1, ""

    sleds = json_dict.get('tsg_chassis_ip')
    if not sleds:
        _log_critical("can't get tsg_chassis_ip from json file %s!" %(G_CHASSIS_IP_JSON))
        return "", -1, ""

    # Keep the original precedence: first local address, then first sled.
    for ipaddr in local_ips:
        for sled in sleds:
            if ipaddr == sled.get('ip'):
                return sled['type'], sled['id'], sled['name']

    _log_critical("local ip address can't match any item in json file %s!" %(G_CHASSIS_IP_JSON))
    return "", -1, ""


if __name__ == '__main__':
    # Manual smoke test: print the identity of the local board.
    logger = logger_init(10)
    sled_type, sled_id, sled_name = tsg_whoami()
    print(sled_type, sled_id, sled_name)
\ No newline at end of file diff --git a/py_tools/get_traffic_by_proc.py b/py_tools/get_traffic_by_proc.py index fec12bb..ddaffd8 100644 --- a/py_tools/get_traffic_by_proc.py +++ b/py_tools/get_traffic_by_proc.py @@ -20,7 +20,7 @@ Transmit_packets_index = 10 Transmit_err_index = 11
Transmit_drop_index = 12
-#return exitcode value + output message:
+#return exitcode value, output message:
# 0: succ
# 1: error
def system_cmd_run(cmd_str):
diff --git a/py_tools/tsg_diagnose_background.py b/py_tools/tsg_diagnose_background.py new file mode 100644 index 0000000..bcdd037 --- /dev/null +++ b/py_tools/tsg_diagnose_background.py @@ -0,0 +1,23 @@ +# coding: utf-8 +#与提供给用户执行的tsg_diagnose命令区别是: +#此文件后台周期性自动运行 +#tsg_diagnose 最低级别是warning, +import sys +import time +import json +import logging +from common_telegraf import * +from common_logger import * +from common_args import * +from common_logger import * +from common_influxdb import * + + + +tsg_ + + + +if __name__ == '__main__': + + diff --git a/py_cmd/tsg_get_sn.py b/py_tools/tsg_get_sn.py index e0bf4f3..e0bf4f3 100644 --- a/py_cmd/tsg_get_sn.py +++ b/py_tools/tsg_get_sn.py diff --git a/py_cmd/tsg_update_tags.py b/py_tools/tsg_update_tags.py index 9650429..c56ed12 100644 --- a/py_cmd/tsg_update_tags.py +++ b/py_tools/tsg_update_tags.py @@ -20,7 +20,7 @@ def tsg_get_device_list_info_by_sn(sn): def tsg_get_tags_by_sn(sn): dev_list_info = tsg_get_device_list_info_by_sn(sn) if len(dev_list_info) <= 0: - #写个空json + #返回空json return "{}" json_dict = json.loads(dev_list_info) |
