diff options
| author | lijia <[email protected]> | 2019-08-09 18:11:11 +0800 |
|---|---|---|
| committer | lijia <[email protected]> | 2019-08-09 18:11:11 +0800 |
| commit | c5820eb4b36985c9f0f5ea44eb57c13568e7c791 (patch) | |
| tree | e87a08dc70f64b82256737dc8577978379b764f2 | |
| parent | d5f4683fcc139c4e2b25c2f62548cd4215f3c51b (diff) | |
增加根据每块计算板的不同, 灵活判断应该重启哪些应用
| -rw-r--r-- | py_cmd/common_modules_deploy.json | 12 | ||||
| -rw-r--r-- | py_cmd/common_modules_operator.json | 26 | ||||
| -rw-r--r-- | py_tools/get_traffic_by_marsio.py | 2 | ||||
| -rw-r--r-- | py_tools/get_traffic_by_proc.py | 115 | ||||
| -rw-r--r-- | py_tools/get_traffic_by_psutil.py | 24 | ||||
| -rw-r--r-- | py_tools/tsg_monit_interface.py | 77 |
6 files changed, 219 insertions, 37 deletions
diff --git a/py_cmd/common_modules_deploy.json b/py_cmd/common_modules_deploy.json new file mode 100644 index 0000000..786644f --- /dev/null +++ b/py_cmd/common_modules_deploy.json @@ -0,0 +1,12 @@ +#此文件定义四块计算板分别运行什么模块
+#先根据tsg_chassis_ip.json, 根据当前设备的ip地址, 知道当前设备的sled名称
+#然后根据设备类型名称, 找到当前运行的所有模块(应用)名称
+#然后根据模块名称, 到common_modules_operator.json查找每个模块的操作方法
+{
+ "modules_dispatch":
+
+ "mcn0": ["kni"]
+ "mcn1": ["tfe"]
+ "mcn2": ["tfe"]
+ "mcn3": ["tfe"]
+}
\ No newline at end of file diff --git a/py_cmd/common_modules_operator.json b/py_cmd/common_modules_operator.json new file mode 100644 index 0000000..1bd6a16 --- /dev/null +++ b/py_cmd/common_modules_operator.json @@ -0,0 +1,26 @@ +#此文件定义, 每个模块的启动、停止、检查运行状态的方法
+{
+ "modules":[
+ {sapp:
+ start:
+ "cwd"
+ "exe"
+ stop:
+ "killall -9 r3 sapp"
+ check healthy:
+ "ps -ef | grep sapp "
+ }
+
+ {telegraf:
+ start:
+ "cwd" : ""
+ "exe" : "service influxdb restart"
+ stop:
+ "service influxdb stop"
+ check healthy:
+ "service influxdb status | grep Active "
+ }
+
+ ]
+
+}
\ No newline at end of file diff --git a/py_tools/get_traffic_by_marsio.py b/py_tools/get_traffic_by_marsio.py index 8bf1282..7dbc53f 100644 --- a/py_tools/get_traffic_by_marsio.py +++ b/py_tools/get_traffic_by_marsio.py @@ -117,7 +117,7 @@ def get_and_send_marsio_traffic(logger, json_fp, telegraf_client, devsym, arg_fl if value == "": logger.critical("can't get device:%s traffic!" %(devsym)) sys.exit(1) - __metric_dict_speed[item] = value + __metric_dict_speed[item] = int(value) telegraf_client.metric('interface', __metric_dict_speed, tags = metric_tag) #logger.info("telegraf_client send metric") diff --git a/py_tools/get_traffic_by_proc.py b/py_tools/get_traffic_by_proc.py new file mode 100644 index 0000000..fec12bb --- /dev/null +++ b/py_tools/get_traffic_by_proc.py @@ -0,0 +1,115 @@ +#coding=utf-8
+import sys
+import re
+import syslog
+import subprocess
+import time
+import telegraf
+import logging
+import logging.handlers
+from common_telegraf import *
+from common_logger import *
+
+# Positions of the counters inside one /proc/net/dev line once
+# get_traffic_from_proc() has split it on ':' and whitespace.
+# Index 0 of that split is the interface name, so the receive
+# counters start at 1.
+Receive_bytes_index = 1
+Receive_packets_index = 2
+Receive_err_index = 3
+Receive_drop_index = 4
+
+# Positions of the transmit counters in the same split line.
+Transmit_bytes_index = 9
+Transmit_packets_index = 10
+Transmit_err_index = 11
+Transmit_drop_index = 12
+
+def system_cmd_run(cmd_str):
+    """Run a shell command and return its exit status and output.
+
+    A command that begins (after optional whitespace) with one of the
+    blacklisted names is refused: a message is printed and the process
+    exits with status 1.
+
+    Args:
+        cmd_str: the command line, executed through the shell.
+
+    Returns:
+        (exitcode, output):
+            0, <command output>  : success
+            non-zero, <output>   : the command itself failed
+            1, <error message>   : the command could not be launched
+    """
+    dangerous_cmd = ("rm", "mv", "poweroff", "shutdown")
+
+    for cmd in dangerous_cmd:
+        # Raw string: "\s" inside a plain literal is an invalid escape
+        # sequence and triggers a warning on current Python versions.
+        if re.match(r"\s*%s" %(cmd), cmd_str) is not None:
+            print("can't run this cmd:%s" %(cmd_str))
+            sys.exit(1)
+
+    try:
+        exitcode, output = subprocess.getstatusoutput(cmd_str)
+    except Exception as e:
+        print(e)
+        # Hand back text rather than the exception object so the second
+        # element is always a string, as on the success path.
+        return 1, str(e)
+
+    return exitcode, output
+
+def get_traffic_from_proc(logger, device_name):
+    """Read the raw traffic counters of one interface from /proc/net/dev.
+
+    The file is read directly instead of piping `cat | grep` through a
+    shell: that avoids interpolating device_name into a shell command and
+    lets the interface name be compared exactly (a grep for "eth0:" would
+    also match "veth0:").
+
+    Args:
+        logger: logger used for debug/error output.
+        device_name: interface name as it appears in /proc/net/dev.
+
+    Returns:
+        An 8-tuple of counter strings, in this order:
+        (packets_recv, bytes_recv, errin, dropin,
+         packets_sent, bytes_sent, errout, dropout).
+        When the counters cannot be obtained, a tuple of eight empty
+        strings, so callers can always unpack the result and then test
+        the first element for emptiness.
+    """
+    no_data = ("",) * 8
+
+    try:
+        with open("/proc/net/dev") as proc_fp:
+            lines = proc_fp.readlines()
+    except OSError as e:
+        logger.error("can't read /proc/net/dev: %s" %(e))
+        return no_data
+
+    for line in lines:
+        name, sep, counters = line.partition(":")
+        # The two header lines have no matching "<name>:" prefix.
+        if not sep or name.strip() != device_name:
+            continue
+
+        logger.debug(line.strip())
+
+        # Keep the name at index 0 so the module-level *_index constants
+        # keep their meaning.
+        sections_array = [name.strip()] + counters.split()
+        if len(sections_array) <= Transmit_drop_index:
+            logger.error("unexpected /proc/net/dev line for %s" %(device_name))
+            return no_data
+
+        packets_recv = sections_array[Receive_packets_index]
+        bytes_recv = sections_array[Receive_bytes_index]
+        errin = sections_array[Receive_err_index]
+        dropin = sections_array[Receive_drop_index]
+
+        packets_sent = sections_array[Transmit_packets_index]
+        bytes_sent = sections_array[Transmit_bytes_index]
+        errout = sections_array[Transmit_err_index]
+        dropout = sections_array[Transmit_drop_index]
+
+        return packets_recv,bytes_recv,errin,dropin,packets_sent,bytes_sent,errout,dropout
+
+    # Device not listed.
+    return no_data
+
+def send_metrics_by_proc(logger, telegraf_client, dev_name, arg_flow_type, arg_node_name):
+    """Sample /proc/net/dev twice, one second apart, and send the deltas.
+
+    The differences between the two samples are sent to telegraf as the
+    'interface' measurement, tagged with the device, flow type and node.
+
+    Args:
+        logger: logger used for debug/error output.
+        telegraf_client: client whose metric() method sends the values.
+        dev_name: interface name to sample.
+        arg_flow_type: value for the 'flow_type' tag.
+        arg_node_name: value for the 'node' tag.
+
+    Returns:
+        0 when the metric was sent, 1 when the counters could not be read.
+    """
+    metric_tag = {'device': dev_name, 'flow_type':arg_flow_type, 'node':arg_node_name}
+
+    old_stats = get_traffic_from_proc(logger, dev_name)
+    # Check for failure BEFORE unpacking: an empty result cannot be
+    # unpacked into eight names.
+    if not old_stats or len(old_stats[0]) <= 0:
+        logger.error("can't get %s status" %(dev_name))
+        return 1
+    time.sleep(1)
+    new_stats = get_traffic_from_proc(logger, dev_name)
+    if not new_stats or len(new_stats[0]) <= 0:
+        logger.error("can't get %s status" %(dev_name))
+        return 1
+
+    old_packets_recv,old_bytes_recv,old_errin,old_dropin,old_packets_sent,old_bytes_sent,old_errout,old_dropout = [int(v) for v in old_stats]
+    new_packets_recv,new_bytes_recv,new_errin,new_dropin,new_packets_sent,new_bytes_sent,new_errout,new_dropout = [int(v) for v in new_stats]
+
+    metrict_val = {}
+    metrict_val['PhyRXBits'] = 8 * (new_bytes_recv - old_bytes_recv)
+    metrict_val['PhyRXError'] = new_errin - old_errin
+    metrict_val['PhyRXFrame'] = new_packets_recv - old_packets_recv
+    metrict_val['PhyRXMissed'] = 0
+    metrict_val['PhyRXNoBUF'] = 0
+
+    # TX metrics come from the transmit counters (they were previously
+    # computed from the receive counters by mistake).
+    metrict_val['PhyTXBits'] = 8 * (new_bytes_sent - old_bytes_sent)
+    metrict_val['PhyTXError'] = new_errout - old_errout
+    metrict_val['PhyTXFrame'] = new_packets_sent - old_packets_sent
+
+    metrict_val['UsrRXDrops'] = new_dropin - old_dropin
+    metrict_val['UsrTXDrops'] = new_dropout - old_dropout
+
+    telegraf_client.metric('interface', metrict_val, tags = metric_tag)
+
+    logger.debug(metrict_val)
+    return 0
+
+if __name__ == '__main__':
+    # Stand-alone run: parse the optional interval argument, set up the
+    # telegraf client and logger, then send one sample for a fixed device.
+    if len(sys.argv) > 1:
+        time_interval = int(sys.argv[1])
+    else:
+        print("Usage: %s timeinterval" %(sys.argv[0]))
+        time_interval = 5
+
+    tele_tags = {}
+    telegraf_client = telegraf_init('127.0.0.1', 8126, tele_tags)
+    logger = logger_init(10)
+
+    send_metrics_by_proc(logger, telegraf_client, 'ens33', 'inline', 'mcn1')
\ No newline at end of file diff --git a/py_tools/get_traffic_by_psutil.py b/py_tools/get_traffic_by_psutil.py index 9fb77c2..263414e 100644 --- a/py_tools/get_traffic_by_psutil.py +++ b/py_tools/get_traffic_by_psutil.py @@ -4,7 +4,11 @@ import sys import psutil import time import telegraf - +import logging +import logging.handlers +from common_telegraf import * +from common_logger import * + def get_stats_for_device(dev_name): devlist = psutil.net_io_counters(pernic=True).keys() @@ -38,23 +42,23 @@ def send_metrics_by_psutil(logger, telegraf_client, dev_name, arg_flow_type, arg metric_tag = {'device': dev_name, 'flow_type':arg_flow_type, 'node':arg_node_name} old_packets_recv,old_bytes_recv,old_errin,old_dropin,old_packets_sent,old_bytes_sent,old_errout,old_dropout = get_stats_for_device(dev_name) - time.sleep(3) + time.sleep(1) new_packets_recv,new_bytes_recv,new_errin,new_dropin,new_packets_sent,new_bytes_sent,new_errout,new_dropout = get_stats_for_device(dev_name) metrict_val = {} metrict_val['PhyRXBits'] = int(8 *(new_bytes_recv - old_bytes_recv)) metrict_val['PhyRXError'] = int(new_errin-old_errin) metrict_val['PhyRXFrame'] = int(new_packets_recv - old_packets_recv) - metrict_val['PhyRXMissed'] = 0 - metrict_val['PhyRXNoBUF'] = 0 + metrict_val['PhyRXMissed'] = int('0') + metrict_val['PhyRXNoBUF'] = int('0') metrict_val['PhyTXBits'] = int(8 *(new_bytes_recv - old_bytes_recv)) metrict_val['PhyTXError'] = int(new_errin-old_errin) metrict_val['PhyTXFrame'] = int(new_packets_recv - old_packets_recv) - metrict_val['PhyRXMissed'] = 0 - metrict_val['PhyRXNoBUF'] = 0 + metrict_val['PhyRXMissed'] = int('0') + metrict_val['PhyRXNoBUF'] = int('0') - metrict_val['UsrRXDrops'] = int(new_dropin - new_dropin) + metrict_val['UsrRXDrops'] = int(new_dropin - old_dropin) metrict_val['UsrTXDrops'] = int(new_dropout - old_dropout) telegraf_client.metric('interface', metrict_val, tags = metric_tag) @@ -71,8 +75,8 @@ if __name__ == '__main__': else: time_interval = int(sys.argv[1]) 
- telegraf_client = telegraf_init('127.0.0.1', 8126) + tele_tags = {} + telegraf_client = telegraf_init('127.0.0.1', 8126, tele_tags) logger = logger_init(10) - telegraf_client = telegraf_init() - send_metrics_by_psutil(logger, telegraf_client, 'ens33', 'inline', 'mcn_1') + send_metrics_by_psutil(logger, telegraf_client, 'ens33', 'inline', 'mcn1') diff --git a/py_tools/tsg_monit_interface.py b/py_tools/tsg_monit_interface.py index 64a3729..62ccfbe 100644 --- a/py_tools/tsg_monit_interface.py +++ b/py_tools/tsg_monit_interface.py @@ -5,8 +5,10 @@ import psutil import time import json import telegraf +import logging from get_traffic_by_psutil import * from get_traffic_by_marsio import * +from get_traffic_by_proc import * from common_telegraf import * from common_logger import * from common_args import * @@ -16,37 +18,60 @@ from common_get_tags import * INTERFACE_JSON_PATH = '/opt/tsg/etc/tsg_chassis_interface.json' -def get_local_node(): +#return value: +# 0, error msg, json_dict : succ +# 1, error msg, "" : error +def tsg_json_parse(file_name): try: - with open(INTERFACE_JSON_PATH) as json_fp: - json_dict = json.load(json_fp) - return json_dict['local_chassis_node'] - except IOError: - return "" + with open(file_name) as json_fp: + try: + json_dict = json.load(json_fp) + return 0, "", json_dict + except Exception as e: + return 1, e, "" + except Exception as e: + return 1, e , "" + +def get_local_node(): + ret, msg, json_dict = tsg_json_parse(INTERFACE_JSON_PATH) + + if ret == 0: + return json_dict['local_chassis_node'] + else: + logger.critical("%s!" 
%(msg)) + sys.exit(1) def get_interface_list(): - try: - with open(INTERFACE_JSON_PATH) as json_fp: - json_dict = json.load(json_fp) - interface_list = json_dict['interface_list'] - #print(interface_list) - for dev in interface_list: - if 'marsio' == dev['dev_type']: - logger.debug("get traffic stat by %s for: %s" %(dev['dev_type'], dev['dev_name'])) - # call function in get_traffic_by_marsio.py - send_metrics_by_marsio(logger, telegraf_client, dev['dev_name'], dev['flow_type'], node_name) - elif 'pcap' == dev['dev_type']: - logger.debug("get traffic stat by %s for: %s, please wait for 1s..." %(dev['dev_type'], dev['dev_name'])) - # call function in get_traffic_by_psutil.py - send_metrics_by_psutil(logger, telegraf_client, dev['dev_name'], dev['flow_type'], node_name) - else: - logger.critical("not support driver type %s for: %s" %(dev['dev_type'], dev['dev_name'])) - sys.exit(1) - except IOError: - logger.critical("can't open file %s!" %(INTERFACE_JSON_PATH)) + ret, msg, json_dict = tsg_json_parse(INTERFACE_JSON_PATH) + + if ret == 0: + interface_list = json_dict['interface_list'] + else: + logger.critical("%s!" %(msg)) sys.exit(1) - return "" + + if len(interface_list) <= 0: + logger.critical("can't get interface_list from json file %s!" %(INTERFACE_JSON_PATH)) + sys.exit(1) + + #print(interface_list) + for dev in interface_list: + if 'marsio' == dev['dev_type']: + logger.debug("get traffic stat by %s for: %s" %(dev['dev_type'], dev['dev_name'])) + # call function in get_traffic_by_marsio.py + send_metrics_by_marsio(logger, telegraf_client, dev['dev_name'], dev['flow_type'], node_name) + elif 'pcap' == dev['dev_type']: + logger.debug("get traffic stat by %s for: %s, please wait for 1s..." 
%(dev['dev_type'], dev['dev_name'])) + # call function in get_traffic_by_psutil.py + #send_metrics_by_psutil(logger, telegraf_client, dev['dev_name'], dev['flow_type'], node_name) + + #The most primitive approach can also be used: cat /proc/net/dev + send_metrics_by_proc(logger, telegraf_client, dev['dev_name'], dev['flow_type'], node_name) + else: + logger.critical("not support driver type %s for: %s" %(dev['dev_type'], dev['dev_name'])) + sys.exit(1) + if __name__ == '__main__': |
