diff options
| author | zhangshuai <[email protected]> | 2023-05-15 14:48:58 +0800 |
|---|---|---|
| committer | zhangshuai <[email protected]> | 2023-05-15 14:48:58 +0800 |
| commit | 050f0dd3eba5d8a95c227f9b5140a415ed0733e7 (patch) | |
| tree | ed885d1660a4268751e55360cdad6a450600e74f | |
| parent | 9671dddbd4395ea157f0390cfb3b997e42fb5e2d (diff) | |
fix: NEZ-2822 修复 asset ping,endpoint state 正常,故障诊断显示 agent 状态异常问题
3 files changed, 165 insertions, 117 deletions
diff --git a/nz-admin/src/main/java/com/nis/modules/diagnose/chain/asset/AgentStateCheckOfPingInterceptor.java b/nz-admin/src/main/java/com/nis/modules/diagnose/chain/asset/AgentStateCheckOfPingInterceptor.java index a00c9c4b..2fbd457b 100644 --- a/nz-admin/src/main/java/com/nis/modules/diagnose/chain/asset/AgentStateCheckOfPingInterceptor.java +++ b/nz-admin/src/main/java/com/nis/modules/diagnose/chain/asset/AgentStateCheckOfPingInterceptor.java @@ -1,6 +1,7 @@ package com.nis.modules.diagnose.chain.asset; +import cn.hutool.core.util.StrUtil; import cn.hutool.log.Log; import com.baomidou.mybatisplus.core.conditions.update.LambdaUpdateWrapper; import com.nis.common.utils.Constant; @@ -86,55 +87,86 @@ public class AgentStateCheckOfPingInterceptor implements DiagnoseInterceptor { String assetPingFromConf = sysConfService.getValueOrDefault(Constant.SYSCONFIG_KEY_ASSET_PING_FROM, Constant.AssetPingFrom.GLOBAL.getValue()); log.info("[checkAgentStateOfPing] [asset_ping_from: {}]", assetPingFromConf); - List<Agent> totalAgentList = Tool.ListUtil.list(true); - List<Agent> downAgentList = Tool.ListUtil.list(true); + // federation enable + boolean federationEnabled = StrUtil.equals("1", sysConfService.getValueOrDefault(Constant.SYSCONFIG_KEY_PROM_FEDER_ENABLED, "1")); - // ping source is global - if (Tool.StrUtil.equals(Constant.AssetPingFrom.GLOBAL.getValue(), assetPingFromConf)) { - totalAgentList = agentService.list(new LambdaUpdateWrapper<Agent>() - .eq(Agent::getType, Constant.AgentType.GLOBAL.getValue())); + List<Agent> healthyDatacenterList = Tool.ListUtil.list(true); - downAgentList = totalAgentList.stream().filter(agent -> Tool.ObjectUtil.notEqual(1, agent.getStatus())).collect(Collectors.toList()); - log.info("[checkAgentStateOfPing] [asset ping from global] [total agent size: {}] [down agent size: {}]", totalAgentList.size(), downAgentList.size()); - } - - // ping source is per-datacenter - if (Tool.StrUtil.equals(Constant.AssetPingFrom.PER_DATACENTER.getValue(), assetPingFromConf)) { - Integer dcId = asset.getDcId(); - // per-datacenter agent list - totalAgentList = agentService.list(new LambdaUpdateWrapper<Agent>() - .eq(Agent::getType, Constant.AgentType.PER_DATACENTER.getValue()) - .eq(Agent::getDcId, dcId)); + // global agent list + // ping_from is global || federation is false global scrape endpoint job + List<Agent> healthyGlobalList = agentService.list(new LambdaUpdateWrapper<Agent>() + .eq(Agent::getType, Constant.AgentType.GLOBAL.getValue()) + .eq(Agent::getStatus, "1")); - List<Agent> downPerDcAgents = totalAgentList.stream().filter(agent -> Tool.ObjectUtil.notEqual(1, agent.getStatus())).collect(Collectors.toList()); + log.info("[checkAgentStateOfPing] [total healthy global agent size: {}]", healthyGlobalList.size()); - // per-datacenter scrape asset_ping job, query prom_api by global agent, so down agent include global agent - List<Agent> downGlobalAgents = agentService.list(new LambdaUpdateWrapper<Agent>().eq(Agent::getType, Constant.AgentType.GLOBAL.getValue()).ne(Agent::getStatus, 1)); - log.info("[checkAgentStateOfPing] [asset ping from per-datacenter] [total per-datacenter agent size: {}] [down per-datacenter agent size: {}] [down global agent size: {}]", totalAgentList.size(), downPerDcAgents.size(), downGlobalAgents.size()); - - // down agent list include per-datacenter and global agent - downAgentList.addAll(downPerDcAgents); - downAgentList.addAll(downGlobalAgents); - } + List<Agent> totalAgentLists = agentService.list(); + //result Map<Object, Object> resultMap = Tool.MapUtil.builder() .put("item", "diagnose.asset.item.agent.status") .put("resolution", Tool.StrUtil.EMPTY) .map(); - if (Tool.BooleanUtil.and(Tool.CollUtil.isNotEmpty(totalAgentList), - Tool.CollUtil.isNotEmpty(downAgentList))) { - // has down agent - resultMap.put("state", DiagnoseState.DID_NOT_PASS.getValue()); - resultMap.put("resolution", "diagnose.asset.resolution.agent.status"); - } else if (Tool.BooleanUtil.and(Tool.CollUtil.isNotEmpty(totalAgentList), - Tool.CollUtil.isEmpty(downAgentList))) { - // PASS - resultMap.put("state", DiagnoseState.PASS.getValue()); - } else if (Tool.CollUtil.isEmpty(totalAgentList)) { - // 没有配置 Agent 信息 - resultMap.put("state", DiagnoseState.DID_NOT_PASS.getValue()); - resultMap.put("resolution", "diagnose.asset.resolution.agent.config"); + + if (federationEnabled) { + // federation 模式关闭 ,Global scrape endpoint jobs + log.info("[agentStateCheckOfMetrics] [federation is not enabled. get global agents]"); + + // ping source is per-datacenter + if (Tool.StrUtil.equals(Constant.AssetPingFrom.PER_DATACENTER.getValue(), assetPingFromConf)) { + Integer dcId = asset.getDcId(); + + // per-datacenter agent list + healthyDatacenterList = agentService.list(new LambdaUpdateWrapper<Agent>() + .eq(Agent::getType, Constant.AgentType.PER_DATACENTER.getValue()) + .eq(Agent::getDcId, dcId) + .eq(Agent::getStatus, "1") + .last("limit 1")); + + log.info("[checkAgentStateOfPing] [asset ping from per-datacenter] [healthy global agent size: {}] [healthy per-datacenter agent size: {}]", healthyGlobalList.size(), healthyDatacenterList.size()); + + if (Tool.BooleanUtil.and(Tool.CollUtil.isNotEmpty(healthyGlobalList), Tool.CollUtil.isNotEmpty(healthyDatacenterList))) { + // PASS + resultMap.put("state", DiagnoseState.PASS.getValue()); + } else if (Tool.CollUtil.isNotEmpty(totalAgentLists) && (Tool.CollUtil.isEmpty(healthyGlobalList) || Tool.CollUtil.isEmpty(healthyDatacenterList))) { + // has down agent + resultMap.put("state", DiagnoseState.DID_NOT_PASS.getValue()); + resultMap.put("resolution", "diagnose.asset.resolution.agent.status"); + } else if (Tool.CollUtil.isEmpty(totalAgentLists)) { + // 没有配置 Agent 信息 + resultMap.put("state", DiagnoseState.DID_NOT_PASS.getValue()); + resultMap.put("resolution", "diagnose.asset.resolution.agent.config"); + } + } else { + if (Tool.CollUtil.isNotEmpty(healthyGlobalList)) { + // PASS + resultMap.put("state", DiagnoseState.PASS.getValue()); + } else if (Tool.CollUtil.isNotEmpty(totalAgentLists) && Tool.CollUtil.isEmpty(healthyGlobalList)) { + // has down agent + resultMap.put("state", DiagnoseState.DID_NOT_PASS.getValue()); + resultMap.put("resolution", "diagnose.asset.resolution.agent.status"); + } + if (Tool.CollUtil.isEmpty(totalAgentLists)) { + // 没有配置 Agent 信息 + resultMap.put("state", DiagnoseState.DID_NOT_PASS.getValue()); + resultMap.put("resolution", "diagnose.asset.resolution.agent.config"); + } + } + } else { + if (Tool.CollUtil.isNotEmpty(healthyGlobalList)) { + // PASS + resultMap.put("state", DiagnoseState.PASS.getValue()); + } else if (Tool.CollUtil.isNotEmpty(totalAgentLists) && Tool.CollUtil.isEmpty(healthyGlobalList)) { + // has down agent + resultMap.put("state", DiagnoseState.DID_NOT_PASS.getValue()); + resultMap.put("resolution", "diagnose.asset.resolution.agent.status"); + } + if (Tool.CollUtil.isEmpty(totalAgentLists)) { + // 没有配置 Agent 信息 + resultMap.put("state", DiagnoseState.DID_NOT_PASS.getValue()); + resultMap.put("resolution", "diagnose.asset.resolution.agent.config"); + } } return resultMap; } diff --git a/nz-admin/src/main/java/com/nis/modules/diagnose/chain/endpoint/logs/AgentStateCheckOfLogsEndpointInterceptor.java b/nz-admin/src/main/java/com/nis/modules/diagnose/chain/endpoint/logs/AgentStateCheckOfLogsEndpointInterceptor.java index 0cbf3b3a..cd458037 100644 --- a/nz-admin/src/main/java/com/nis/modules/diagnose/chain/endpoint/logs/AgentStateCheckOfLogsEndpointInterceptor.java +++ b/nz-admin/src/main/java/com/nis/modules/diagnose/chain/endpoint/logs/AgentStateCheckOfLogsEndpointInterceptor.java @@ -1,6 +1,7 @@ package com.nis.modules.diagnose.chain.endpoint.logs; +import cn.hutool.core.collection.CollUtil; import cn.hutool.core.util.StrUtil; import cn.hutool.log.Log; import com.baomidou.mybatisplus.core.conditions.update.LambdaUpdateWrapper; @@ -78,15 +79,37 @@ public class AgentStateCheckOfLogsEndpointInterceptor implements DiagnoseInterce String federationEnabled = sysConfService.getValueOrDefault(Constant.SYSCONFIG_KEY_PROM_FEDER_ENABLED, "1"); log.info("[agentStateCheckOfLogs] [prometheus_federation_enabled] [value: {}]", federationEnabled); - List<Agent> totalAgentList = Tool.ListUtil.list(true); - List<Agent> downAgentList = Tool.ListUtil.list(true); + List<Agent> healthyDatacenterList = Tool.ListUtil.list(true); + + List<Agent> healthyClobalList = agentService.list(new LambdaUpdateWrapper<Agent>() + .eq(Agent::getType, Constant.AgentType.GLOBAL.getValue()) + .eq(Agent::getStatus, "1")); + log.info("[agentStateCheckOfLogs] [global agent receive log data] [healthy global agent size: {}]", healthyClobalList.size()); + + List<Agent> totalAgentLists = agentService.list(); + + //result + Map<Object, Object> resultMap = Tool.MapUtil.builder() + .put("item", "logs.diagnose.endpoint.item.agent.status") + .put("resolution", Tool.StrUtil.EMPTY) + .map(); if (StrUtil.equals("0", federationEnabled)) { // federation 模式关闭 ,Global scrape endpoint jobs - log.info("[agentStateCheckOfLogs] [federation is not enabled. get global agents]"); - totalAgentList = agentService.list(new LambdaUpdateWrapper<Agent>().eq(Agent::getType, Constant.AgentType.GLOBAL.getValue())); - downAgentList = totalAgentList.stream().filter(agent -> Tool.ObjectUtil.notEqual(1, agent.getStatus())).collect(Collectors.toList()); - log.info("[agentStateCheckOfLogs] [global agent receive log data] [total agent size: {}] [down agent size: {}]", totalAgentList.size(), downAgentList.size()); + log.info("[agentStateCheckOfMetrics] [federation is not enabled. get global agents]"); + + if (Tool.CollUtil.isNotEmpty(totalAgentLists) && Tool.CollUtil.isEmpty(healthyClobalList)) { + // has down agent + resultMap.put("state", DiagnoseState.DID_NOT_PASS.getValue()); + resultMap.put("resolution", "diagnose.endpoint.resolution.agent.status"); + } else if (Tool.CollUtil.isNotEmpty(healthyClobalList)) { + // PASS + resultMap.put("state", DiagnoseState.PASS.getValue()); + } else if (CollUtil.isEmpty(totalAgentLists)) { + // 没有配置 Agent 信息 + resultMap.put("state", DiagnoseState.DID_NOT_PASS.getValue()); + resultMap.put("resolution", "diagnose.endpoint.resolution.agent.config"); + } } if (StrUtil.equals("1", federationEnabled)) { @@ -95,40 +118,26 @@ public class AgentStateCheckOfLogsEndpointInterceptor implements DiagnoseInterce AssetAsset asset = assetService.getById(assetId); // per-datacenter agent list - totalAgentList = agentService.list(new LambdaUpdateWrapper<Agent>() + healthyDatacenterList = agentService.list(new LambdaUpdateWrapper<Agent>() .eq(Agent::getType, Constant.AgentType.PER_DATACENTER.getValue()) - .eq(Agent::getDcId, asset.getDcId())); - - List<Agent> downPerDcAgents = totalAgentList.stream().filter(agent -> Tool.ObjectUtil.notEqual(1, agent.getStatus())).collect(Collectors.toList()); - - // per-datacenter receive log data, query prom_api by global agent, so down agent include global agent - List<Agent> downGlobalAgents = agentService.list(new LambdaUpdateWrapper<Agent>().eq(Agent::getType, Constant.AgentType.GLOBAL.getValue()).ne(Agent::getStatus, 1)); - - // down agent list include per-datacenter and global agent - downAgentList.addAll(downPerDcAgents); - downAgentList.addAll(downGlobalAgents); - - log.info("[agentStateCheckOfLogs] [per-datacenter agent receive log data] [total per-datacenter agent size: {}] [down per-datacenter agent size: {}] [down global agent size: {}]", totalAgentList.size(), downPerDcAgents.size(), downGlobalAgents.size()); - } - - Map<Object, Object> resultMap = Tool.MapUtil.builder() - .put("item", "logs.diagnose.endpoint.item.agent.status") - .put("resolution", Tool.StrUtil.EMPTY) - .map(); - - if (Tool.BooleanUtil.and(Tool.CollUtil.isNotEmpty(totalAgentList), - Tool.CollUtil.isNotEmpty(downAgentList))) { - // has down agent - resultMap.put("state", DiagnoseState.DID_NOT_PASS.getValue()); - resultMap.put("resolution", "diagnose.endpoint.resolution.agent.status"); - } else if (Tool.BooleanUtil.and(Tool.CollUtil.isNotEmpty(totalAgentList), - Tool.CollUtil.isEmpty(downAgentList))) { - // PASS - resultMap.put("state", DiagnoseState.PASS.getValue()); - } else if (Tool.CollUtil.isEmpty(totalAgentList)) { - // 没有配置 Agent 信息 - resultMap.put("state", DiagnoseState.DID_NOT_PASS.getValue()); - resultMap.put("resolution", "diagnose.endpoint.resolution.agent.config"); + .eq(Agent::getDcId, asset.getDcId()) + .eq(Agent::getStatus, "1")); + + log.info("[agentStateCheckOfLogs] [per-datacenter agent receive log data] [total healthy per-datacenter agent size: {}] [total healthy global agent size: {}]", healthyDatacenterList.size(), healthyClobalList.size()); + + if (Tool.CollUtil.isNotEmpty(totalAgentLists) && (Tool.CollUtil.isEmpty(healthyClobalList) || Tool.CollUtil.isEmpty(healthyDatacenterList))) { + // has down agent + resultMap.put("state", DiagnoseState.DID_NOT_PASS.getValue()); + resultMap.put("resolution", "diagnose.endpoint.resolution.agent.status"); + } else if (Tool.BooleanUtil.and(Tool.CollUtil.isNotEmpty(healthyClobalList), + Tool.CollUtil.isNotEmpty(healthyDatacenterList))) { + // PASS + resultMap.put("state", DiagnoseState.PASS.getValue()); + } else if (Tool.CollUtil.isEmpty(totalAgentLists)) { + // 没有配置 Agent 信息 + resultMap.put("state", DiagnoseState.DID_NOT_PASS.getValue()); + resultMap.put("resolution", "diagnose.endpoint.resolution.agent.config"); + } } return resultMap; } diff --git a/nz-admin/src/main/java/com/nis/modules/diagnose/chain/endpoint/metrics/AgentStateCheckOfMetricsEndpointInterceptor.java b/nz-admin/src/main/java/com/nis/modules/diagnose/chain/endpoint/metrics/AgentStateCheckOfMetricsEndpointInterceptor.java index 5f2a40d7..a679cdab 100644 --- a/nz-admin/src/main/java/com/nis/modules/diagnose/chain/endpoint/metrics/AgentStateCheckOfMetricsEndpointInterceptor.java +++ b/nz-admin/src/main/java/com/nis/modules/diagnose/chain/endpoint/metrics/AgentStateCheckOfMetricsEndpointInterceptor.java @@ -78,57 +78,64 @@ public class AgentStateCheckOfMetricsEndpointInterceptor implements DiagnoseInte String federationEnabled = sysConfService.getValueOrDefault(Constant.SYSCONFIG_KEY_PROM_FEDER_ENABLED, "1"); log.info("[agentStateCheckOfMetrics] [prometheus_federation_enabled] [value: {}]", federationEnabled); - List<Agent> totalAgentList = Tool.ListUtil.list(true); - List<Agent> downAgentList = Tool.ListUtil.list(true); + List<Agent> healthyGlobalList = agentService.list(new LambdaUpdateWrapper<Agent>() + .eq(Agent::getType, Constant.AgentType.GLOBAL.getValue()) + .eq(Agent::getStatus, "1")); + + log.info("[agentStateCheckOfMetrics] [query prom_api by global agent] [healthy global agent size: {}]", healthyGlobalList.size()); + + List<Agent> healthyDatacenterList = Tool.ListUtil.list(true); + List<Agent> totalAgentList = agentService.list(); + + //result + Map<Object, Object> resultMap = Tool.MapUtil.builder() + .put("item", "metric.diagnose.endpoint.item.agent.status") + .put("resolution", Tool.StrUtil.EMPTY) + .map(); if (StrUtil.equals("0", federationEnabled)) { // federation 模式关闭 ,Global scrape endpoint jobs log.info("[agentStateCheckOfMetrics] [federation is not enabled. get global agents]"); - totalAgentList = agentService.list(new LambdaUpdateWrapper<Agent>().eq(Agent::getType, Constant.AgentType.GLOBAL.getValue())); - downAgentList = totalAgentList.stream().filter(agent -> Tool.ObjectUtil.notEqual(1, agent.getStatus())).collect(Collectors.toList()); - log.info("[agentStateCheckOfMetrics] [global agent scrapes endpoint metrics] [total agent size: {}] [down agent size: {}]", totalAgentList.size(), downAgentList.size()); - } + if (Tool.BooleanUtil.and(Tool.CollUtil.isNotEmpty(totalAgentList), Tool.CollUtil.isEmpty(healthyGlobalList))) { + // has down agent + resultMap.put("state", DiagnoseState.DID_NOT_PASS.getValue()); + resultMap.put("resolution", "diagnose.endpoint.resolution.agent.status"); + } else if (Tool.CollUtil.isNotEmpty(healthyGlobalList)) { + // PASS + resultMap.put("state", DiagnoseState.PASS.getValue()); + } else if (Tool.CollUtil.isEmpty(totalAgentList)) { + // 没有配置 Agent 信息 + resultMap.put("state", DiagnoseState.DID_NOT_PASS.getValue()); + resultMap.put("resolution", "diagnose.asset.resolution.agent.config"); + } + + } if (StrUtil.equals("1", federationEnabled)) { // federation 模式开启 ,Per-datacenter scrape endpoint jobs Integer assetId = endpoint.getAssetId(); AssetAsset asset = assetService.getById(assetId); // per-datacenter agent list - totalAgentList = agentService.list(new LambdaUpdateWrapper<Agent>() + healthyDatacenterList = agentService.list(new LambdaUpdateWrapper<Agent>() .eq(Agent::getType, Constant.AgentType.PER_DATACENTER.getValue()) - .eq(Agent::getDcId, asset.getDcId())); - - List<Agent> downPerDcAgents = totalAgentList.stream().filter(agent -> Tool.ObjectUtil.notEqual(1, agent.getStatus())).collect(Collectors.toList()); - - // per-datacenter scrape metrics endpoint job, query prom_api by global agent, so down agent include global agent - List<Agent> downGlobalAgents = agentService.list(new LambdaUpdateWrapper<Agent>().eq(Agent::getType, Constant.AgentType.GLOBAL.getValue()).ne(Agent::getStatus, 1)); - - // down agent list include per-datacenter and global agent - downAgentList.addAll(downPerDcAgents); - downAgentList.addAll(downGlobalAgents); - - log.info("[agentStateCheckOfMetrics] [per-datacenter agent scrapes endpoint metrics] [total per-datacenter agent size: {}] [down per-datacenter agent size: {}] [down global agent size: {}]", totalAgentList.size(), downPerDcAgents.size(), downGlobalAgents.size()); - } - - Map<Object, Object> resultMap = Tool.MapUtil.builder() - .put("item", "metric.diagnose.endpoint.item.agent.status") - .put("resolution", Tool.StrUtil.EMPTY) - .map(); - - if (Tool.BooleanUtil.and(Tool.CollUtil.isNotEmpty(totalAgentList), - Tool.CollUtil.isNotEmpty(downAgentList))) { - // has down agent - resultMap.put("state", DiagnoseState.DID_NOT_PASS.getValue()); - resultMap.put("resolution", "diagnose.endpoint.resolution.agent.status"); - } else if (Tool.BooleanUtil.and(Tool.CollUtil.isNotEmpty(totalAgentList), - Tool.CollUtil.isEmpty(downAgentList))) { - // PASS - resultMap.put("state", DiagnoseState.PASS.getValue()); - } else if (Tool.CollUtil.isEmpty(totalAgentList)) { - // 没有配置 Agent 信息 - resultMap.put("state", DiagnoseState.DID_NOT_PASS.getValue()); - resultMap.put("resolution", "diagnose.endpoint.resolution.agent.config"); + .eq(Agent::getDcId, asset.getDcId()) + .eq(Agent::getStatus, "1")); + + log.info("[agentStateCheckOfMetrics] [per-datacenter agent scrapes endpoint metrics] [healthy per-datacenter agent size: {}] [healthy global agent size: {}]", healthyDatacenterList.size(), healthyGlobalList.size()); + + if (Tool.BooleanUtil.and(Tool.CollUtil.isNotEmpty(healthyGlobalList), Tool.CollUtil.isNotEmpty(healthyDatacenterList))) { + // PASS + resultMap.put("state", DiagnoseState.PASS.getValue()); + } else if (Tool.CollUtil.isNotEmpty(totalAgentList) && (Tool.CollUtil.isEmpty(healthyGlobalList) || Tool.CollUtil.isEmpty(healthyDatacenterList))) { + // has down agent + resultMap.put("state", DiagnoseState.DID_NOT_PASS.getValue()); + resultMap.put("resolution", "diagnose.asset.resolution.agent.status"); + } else if (Tool.CollUtil.isEmpty(totalAgentList)) { + // 没有配置 Agent 信息 + resultMap.put("state", DiagnoseState.DID_NOT_PASS.getValue()); + resultMap.put("resolution", "diagnose.asset.resolution.agent.config"); + } } return resultMap; } |
