summary | refs | log | tree | commit | diff
path: root/node
diff options
context:
space:
mode:
author: Grant Limberg <[email protected]> 2023-05-04 07:58:02 -0700
committer: GitHub <[email protected]> 2023-05-04 07:58:02 -0700
commit: 74dc41c7c73669f5575851c830050747e332e38d (patch)
tree: 60eafe0cd08a09b836b4aea08113b89da731feea /node
parent: 925599cab0600ec99dd56dabb08c4dd889f2cfd5 (diff)
Peer metrics (#1995)
* Adding peer metrics still need to be wired up for use
* per peer packet metrics
* Fix crash from bad instantiation of histogram
* separate alive & dead path counts
* Add peer metric update block
* add peer latency values in doPingAndKeepalive
* prevent deadlock
* peer latency histogram actually works now
* cleanup
* capture counts of packets to specific peers

---------

Co-authored-by: Joseph Henry <[email protected]>
Diffstat (limited to 'node')
-rw-r--r-- node/Metrics.cpp 16
-rw-r--r-- node/Metrics.hpp 7
-rw-r--r-- node/Peer.cpp 105
-rw-r--r-- node/Peer.hpp 7
4 files changed, 92 insertions, 43 deletions
diff --git a/node/Metrics.cpp b/node/Metrics.cpp
index 7c10540e..e20f06c3 100644
--- a/node/Metrics.cpp
+++ b/node/Metrics.cpp
@@ -176,6 +176,22 @@ namespace ZeroTier {
prometheus::simpleapi::counter_family_t network_outgoing_packets
{ "zt_network_outgoing_packets", "number of outgoing packets per network" };
+ // PeerMetrics
+ prometheus::CustomFamily<prometheus::Histogram<uint64_t>> &peer_latency =
+ prometheus::Builder<prometheus::Histogram<uint64_t>>()
+ .Name("zt_peer_latency")
+ .Help("peer latency (ms)")
+ .Register(prometheus::simpleapi::registry);
+
+ prometheus::simpleapi::gauge_family_t peer_path_count
+ { "zt_peer_path_count", "number of paths to peer" };
+ prometheus::simpleapi::counter_family_t peer_incoming_packets
+ { "zt_peer_incoming_packets", "number of incoming packets from a peer" };
+ prometheus::simpleapi::counter_family_t peer_outgoing_packets
+ { "zt_peer_outgoing_packets", "number of outgoing packets to a peer" };
+ prometheus::simpleapi::counter_family_t peer_packet_errors
+ { "zt_peer_packet_errors" , "number of incoming packet errors from a peer" };
+
// General Controller Metrics
prometheus::simpleapi::gauge_metric_t network_count
{"controller_network_count", "number of networks the controller is serving"};
diff --git a/node/Metrics.hpp b/node/Metrics.hpp
index a3efcc28..f78a0f15 100644
--- a/node/Metrics.hpp
+++ b/node/Metrics.hpp
@@ -107,6 +107,13 @@ namespace ZeroTier {
extern prometheus::simpleapi::counter_family_t network_incoming_packets;
extern prometheus::simpleapi::counter_family_t network_outgoing_packets;
+ // Peer Metrics
+ extern prometheus::CustomFamily<prometheus::Histogram<uint64_t>> &peer_latency;
+ extern prometheus::simpleapi::gauge_family_t peer_path_count;
+ extern prometheus::simpleapi::counter_family_t peer_incoming_packets;
+ extern prometheus::simpleapi::counter_family_t peer_outgoing_packets;
+ extern prometheus::simpleapi::counter_family_t peer_packet_errors;
+
// General Controller Metrics
extern prometheus::simpleapi::gauge_metric_t network_count;
extern prometheus::simpleapi::gauge_metric_t member_count;
diff --git a/node/Peer.cpp b/node/Peer.cpp
index c46bdf9d..a08bebbf 100644
--- a/node/Peer.cpp
+++ b/node/Peer.cpp
@@ -28,11 +28,6 @@ namespace ZeroTier {
static unsigned char s_freeRandomByteCounter = 0;
-char * peerIDString(const Identity &id) {
- char out[16];
- return id.address().toString(out);
-}
-
Peer::Peer(const RuntimeEnvironment *renv,const Identity &myIdentity,const Identity &peerIdentity) :
RR(renv),
_lastReceive(0),
@@ -55,7 +50,13 @@ Peer::Peer(const RuntimeEnvironment *renv,const Identity &myIdentity,const Ident
_directPathPushCutoffCount(0),
_echoRequestCutoffCount(0),
_localMultipathSupported(false),
- _lastComputedAggregateMeanLatency(0)
+ _lastComputedAggregateMeanLatency(0),
+ _peer_latency{Metrics::peer_latency.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())}}, std::vector<uint64_t>{1,3,6,10,30,60,100,300,600,1000})},
+ _alive_path_count{Metrics::peer_path_count.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())},{"status","alive"}})},
+ _dead_path_count{Metrics::peer_path_count.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())},{"status","dead"}})},
+ _incoming_packet{Metrics::peer_incoming_packets.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())}})},
+ _outgoing_packet{Metrics::peer_outgoing_packets.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())}})},
+ _packet_errors{Metrics::peer_packet_errors.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())}})}
{
if (!myIdentity.agree(peerIdentity,_key)) {
throw ZT_EXCEPTION_INVALID_ARGUMENT;
@@ -96,7 +97,7 @@ void Peer::received(
default:
break;
}
-
+ _incoming_packet++;
recordIncomingPacket(path, packetId, payloadLength, verb, flowId, now);
if (trustEstablished) {
@@ -519,54 +520,70 @@ void Peer::performMultipathStateCheck(void *tPtr, int64_t now)
unsigned int Peer::doPingAndKeepalive(void *tPtr,int64_t now)
{
unsigned int sent = 0;
- Mutex::Lock _l(_paths_m);
+ {
+ Mutex::Lock _l(_paths_m);
- performMultipathStateCheck(tPtr, now);
+ performMultipathStateCheck(tPtr, now);
- const bool sendFullHello = ((now - _lastSentFullHello) >= ZT_PEER_PING_PERIOD);
- if (sendFullHello) {
- _lastSentFullHello = now;
- }
+ const bool sendFullHello = ((now - _lastSentFullHello) >= ZT_PEER_PING_PERIOD);
+ if (sendFullHello) {
+ _lastSentFullHello = now;
+ }
- // Right now we only keep pinging links that have the maximum priority. The
- // priority is used to track cluster redirections, meaning that when a cluster
- // redirects us its redirect target links override all other links and we
- // let those old links expire.
- long maxPriority = 0;
- for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
- if (_paths[i].p) {
- maxPriority = std::max(_paths[i].priority,maxPriority);
- } else {
- break;
+ // Right now we only keep pinging links that have the maximum priority. The
+ // priority is used to track cluster redirections, meaning that when a cluster
+ // redirects us its redirect target links override all other links and we
+ // let those old links expire.
+ long maxPriority = 0;
+ for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
+ if (_paths[i].p) {
+ maxPriority = std::max(_paths[i].priority,maxPriority);
+ } else {
+ break;
+ }
}
- }
- bool deletionOccurred = false;
- for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
- if (_paths[i].p) {
- // Clean expired and reduced priority paths
- if ( ((now - _paths[i].lr) < ZT_PEER_PATH_EXPIRATION) && (_paths[i].priority == maxPriority) ) {
- if ((sendFullHello)||(_paths[i].p->needsHeartbeat(now))) {
- attemptToContactAt(tPtr,_paths[i].p->localSocket(),_paths[i].p->address(),now,sendFullHello);
- _paths[i].p->sent(now);
- sent |= (_paths[i].p->address().ss_family == AF_INET) ? 0x1 : 0x2;
+ bool deletionOccurred = false;
+ for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
+ if (_paths[i].p) {
+ // Clean expired and reduced priority paths
+ if ( ((now - _paths[i].lr) < ZT_PEER_PATH_EXPIRATION) && (_paths[i].priority == maxPriority) ) {
+ if ((sendFullHello)||(_paths[i].p->needsHeartbeat(now))) {
+ attemptToContactAt(tPtr,_paths[i].p->localSocket(),_paths[i].p->address(),now,sendFullHello);
+ _paths[i].p->sent(now);
+ sent |= (_paths[i].p->address().ss_family == AF_INET) ? 0x1 : 0x2;
+ }
+ } else {
+ _paths[i] = _PeerPath();
+ deletionOccurred = true;
}
- } else {
- _paths[i] = _PeerPath();
- deletionOccurred = true;
+ }
+ if (!_paths[i].p || deletionOccurred) {
+ for(unsigned int j=i;j<ZT_MAX_PEER_NETWORK_PATHS;++j) {
+ if (_paths[j].p && i != j) {
+ _paths[i] = _paths[j];
+ _paths[j] = _PeerPath();
+ break;
+ }
+ }
+ deletionOccurred = false;
}
}
- if (!_paths[i].p || deletionOccurred) {
- for(unsigned int j=i;j<ZT_MAX_PEER_NETWORK_PATHS;++j) {
- if (_paths[j].p && i != j) {
- _paths[i] = _paths[j];
- _paths[j] = _PeerPath();
- break;
+ uint16_t alive_path_count_tmp = 0, dead_path_count_tmp = 0;
+ for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
+ if (_paths[i].p) {
+ if (_paths[i].p->alive(now)) {
+ alive_path_count_tmp++;
+ }
+ else {
+ dead_path_count_tmp++;
}
}
- deletionOccurred = false;
}
+ _alive_path_count = alive_path_count_tmp;
+ _dead_path_count = dead_path_count_tmp;
}
+ _peer_latency.Observe(latency(now));
return sent;
}
@@ -641,6 +658,7 @@ void Peer::resetWithinScope(void *tPtr,InetAddress::IpScope scope,int inetAddres
void Peer::recordOutgoingPacket(const SharedPtr<Path> &path, const uint64_t packetId,
uint16_t payloadLength, const Packet::Verb verb, const int32_t flowId, int64_t now)
{
+ _outgoing_packet++;
if (_localMultipathSupported && _bond) {
_bond->recordOutgoingPacket(path, packetId, payloadLength, verb, flowId, now);
}
@@ -648,6 +666,7 @@ void Peer::recordOutgoingPacket(const SharedPtr<Path> &path, const uint64_t pack
void Peer::recordIncomingInvalidPacket(const SharedPtr<Path>& path)
{
+ _packet_errors++;
if (_localMultipathSupported && _bond) {
_bond->recordIncomingInvalidPacket(path);
}
diff --git a/node/Peer.hpp b/node/Peer.hpp
index 427e78a5..cd6b871f 100644
--- a/node/Peer.hpp
+++ b/node/Peer.hpp
@@ -598,6 +598,13 @@ private:
int32_t _lastComputedAggregateMeanLatency;
SharedPtr<Bond> _bond;
+
+ prometheus::Histogram<uint64_t> &_peer_latency;
+ prometheus::simpleapi::gauge_metric_t _alive_path_count;
+ prometheus::simpleapi::gauge_metric_t _dead_path_count;
+ prometheus::simpleapi::counter_metric_t _incoming_packet;
+ prometheus::simpleapi::counter_metric_t _outgoing_packet;
+ prometheus::simpleapi::counter_metric_t _packet_errors;
};
} // namespace ZeroTier