1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
|
#include <arpa/inet.h>
#include <netinet/tcp.h>
#include <event2/bufferevent.h>
#include <event2/event.h>
#include <event2/buffer.h>
#include <unistd.h>
#include <assert.h>
#include <sys/prctl.h>
#include <stdlib.h>
#include <systemd/sd-daemon.h>
#include <proxy.h>
#include <platform.h>
#include <tfe_utils.h>
#include <watchdog_tfe.h>
#include <MESA/MESA_prof_load.h>
/*
 * Runtime state of the TFE watchdog.
 *
 * One instance is built by watchdog_tfe_create() and handed to the watchdog
 * thread. The timer callback uses it to count worker-thread timeouts inside
 * a sliding statistics window.
 */
struct watchdog_tfe
{
	/* Handles supplied by the creator or owned by the watchdog. */
	struct tfe_proxy *proxy;               /* proxy whose worker threads are inspected */
	struct event_base *ev_base;            /* event loop dispatched by the watchdog thread */
	pthread_t pthread;                     /* watchdog thread handle */
	const char *profile;                   /* profile path the settings are loaded from */
	void *logger;                          /* opaque logger handle passed to TFE_LOG_* */

	/* Settings loaded from the "watchdog_tfe" profile entries. */
	unsigned int enable;                   /* non-zero: the watchdog thread is started */
	unsigned int timeout_seconds;          /* allowed age of a worker timestamp */
	unsigned int statistics_window;        /* length of one counting window, in seconds */
	unsigned int timeout_cnt_as_fail;      /* timeouts per window that count as failure */
	unsigned int timeout_debug;            /* non-zero: abort() on the first timeout */

	/* Bookkeeping for the window currently being counted. */
	unsigned int cur_time_window_fail_cnt; /* timeouts seen in the current window */
	time_t cur_time_window_begin;          /* window start (CLOCK_MONOTONIC seconds) */
	time_t cur_time_window_end;            /* window end (CLOCK_MONOTONIC seconds) */
};
/*
 * Entry point of the watchdog thread.
 *
 * Names the thread "watchdog:tfe" and then keeps dispatching the watchdog's
 * event base. The loop only ends when event_base_dispatch() reports an error
 * (negative return), at which point DIE() is invoked.
 *
 * @param arg  The struct watchdog_tfe created by watchdog_tfe_create().
 * @return     NULL; only reached if DIE() returns.
 */
void *watchdog_tfe_thread(void *arg)
{
	struct watchdog_tfe *ctx = (struct watchdog_tfe *)arg;
	char thread_name[16];

	snprintf(thread_name, sizeof(thread_name), "watchdog:tfe");

	/*
	 * prctl() takes its optional arguments as unsigned long. Passing an
	 * unsigned long long (or a pointer-typed NULL) has the wrong width on
	 * 32-bit ABIs, so cast explicitly and pass integer zeros.
	 */
	prctl(PR_SET_NAME, (unsigned long)thread_name, 0UL, 0UL, 0UL);

	/* Re-enter the loop for any non-error return value. */
	while (event_base_dispatch(ctx->ev_base) >= 0)
	{
	}

	DIE("Watchdog TFE thread is terminated.");

	/* Keeps the non-void function well-formed should DIE() ever return. */
	return NULL;
}
/*
 * Timer callback run on the watchdog thread's event base.
 *
 * Compares every worker thread's last recorded timestamp with the current
 * CLOCK_MONOTONIC time. A worker whose timestamp is older than
 * timeout_seconds counts as one timeout:
 *   - with timeout_debug set, the process aborts immediately;
 *   - otherwise the timeout is counted in the current statistics window,
 *     and once the count reaches timeout_cnt_as_fail the callback reports
 *     "WATCHDOG=trigger" instead of the keep-alive "WATCHDOG=1".
 *
 * NOTE(review): the comparison assumes the workers store CLOCK_MONOTONIC
 * seconds in `lastime` - confirm against the worker code.
 *
 * @param fd    Unused (timer events carry no descriptor).
 * @param what  Unused event flags.
 * @param arg   The struct tfe_proxy registered with the event.
 */
static void watchdog_tfe_thread_handle(evutil_socket_t fd, short what, void *arg)
{
	struct tfe_proxy *proxy = (struct tfe_proxy *)arg;
	struct watchdog_tfe *ctx = proxy->watchdog_tfe;
	const char *check_result = "WATCHDOG=1";
	struct timespec now;
	time_t last_seen;

	(void)fd;
	(void)what;

	clock_gettime(CLOCK_MONOTONIC, &now);

	/* Open a fresh statistics window once the current one has expired. */
	if (now.tv_sec > ctx->cur_time_window_end)
	{
		ctx->cur_time_window_begin = now.tv_sec;
		ctx->cur_time_window_end = now.tv_sec + ctx->statistics_window;
		ctx->cur_time_window_fail_cnt = 0;
	}

	for (unsigned int i = 0; i < proxy->nr_work_threads; i++)
	{
		last_seen = ATOMIC_READ(&(proxy->work_threads[i]->lastime));
		if (last_seen + ctx->timeout_seconds >= now.tv_sec)
		{
			/* Worker refreshed its timestamp recently enough. */
			continue;
		}

		if (ctx->timeout_debug)
		{
			/* Debug mode: die on the spot so the stuck state can be inspected. */
			TFE_LOG_ERROR(ctx->logger, "Current timestamp is %ld, Worker thread[%d] tid %d timestamp is %ld, Worker thread timeout, Exit !!!",
						  (long)now.tv_sec, proxy->work_threads[i]->thread_id, proxy->work_threads[i]->readable_tid, (long)last_seen);
			abort();
		}

		ctx->cur_time_window_fail_cnt++;
		/* The counter is unsigned, so it is printed with %u. */
		TFE_LOG_ERROR(ctx->logger, "Current timestamp is %ld, Worker thread[%d] tid %d timestamp is %ld, Worker thread timeout, fail count %u !!!",
					  (long)now.tv_sec, proxy->work_threads[i]->thread_id, proxy->work_threads[i]->readable_tid, (long)last_seen, ctx->cur_time_window_fail_cnt);

		if (ctx->cur_time_window_fail_cnt >= ctx->timeout_cnt_as_fail)
		{
			TFE_LOG_ERROR(ctx->logger, "Frome %ld to %ld, there are %u timeouts of the worker threads, Ready to Exit !!!",
						  (long)ctx->cur_time_window_begin, (long)ctx->cur_time_window_end, ctx->cur_time_window_fail_cnt);
			check_result = "WATCHDOG=trigger";
		}
	}

	/*
	 * sd_watchdog_enabled() returns > 0 when keep-alives are expected, 0 when
	 * they are not, and a negative errno-style value on failure. Only the
	 * positive case means there is a watchdog to notify.
	 */
	if (sd_watchdog_enabled(0, NULL) > 0)
	{
		sd_notify(0, check_result);
	}
}
/*
 * Create the TFE watchdog and, if enabled, start its thread.
 *
 * Settings are loaded from the "watchdog_tfe" entries of `profile`; the last
 * argument of each MESA_load_profile_uint_def() call is the fallback value.
 * When `enable` is zero the context is returned with the settings filled in
 * but no event base, timer or thread is created.
 *
 * Otherwise a dedicated event base is created, a persistent 1-second timer
 * running watchdog_tfe_thread_handle() is armed on it, and a thread running
 * watchdog_tfe_thread() is started to dispatch that base.
 *
 * @param proxy    Proxy instance; passed to the timer callback as its argument.
 * @param profile  Profile path; stored, not copied.
 * @param logger   Opaque logger handle; stored, not copied.
 * @return         The new context, or NULL on failure. Everything acquired
 *                 before the failure is released.
 */
struct watchdog_tfe *watchdog_tfe_create(struct tfe_proxy *proxy, const char *profile, void *logger)
{
	struct watchdog_tfe *ctx = ALLOC(struct watchdog_tfe, 1);
	struct event *ev = NULL;
	struct timespec now;
	int ret = 0;
	// The worker thread updates the timestamp every two seconds
	// The watchdog thread checks the timestamp every second
	struct timeval timer_delay = {1, 0};

	if (ctx == NULL)
	{
		return NULL;
	}

	ctx->proxy = proxy;
	ctx->profile = profile;
	ctx->logger = logger;
	MESA_load_profile_uint_def(profile, "watchdog_tfe", "enable", &(ctx->enable), 1);
	MESA_load_profile_uint_def(profile, "watchdog_tfe", "timeout_seconds", &(ctx->timeout_seconds), 5);
	MESA_load_profile_uint_def(profile, "watchdog_tfe", "statistics_window", &(ctx->statistics_window), 20);
	MESA_load_profile_uint_def(profile, "watchdog_tfe", "timeout_cnt_as_fail", &(ctx->timeout_cnt_as_fail), 3);
	MESA_load_profile_uint_def(profile, "watchdog_tfe", "timeout_debug", &(ctx->timeout_debug), 0);

	if (!ctx->enable)
	{
		/* Disabled: hand back the configured context without starting anything. */
		return ctx;
	}

	/* Open the first statistics window at the current monotonic time. */
	clock_gettime(CLOCK_MONOTONIC, &now);
	ctx->cur_time_window_begin = now.tv_sec;
	ctx->cur_time_window_end = now.tv_sec + ctx->statistics_window;
	ctx->cur_time_window_fail_cnt = 0;

	ctx->ev_base = event_base_new();
	if (!ctx->ev_base)
	{
		TFE_LOG_ERROR(ctx->logger, "Fail to create event base: %s", strerror(errno));
		errno = 0;
		goto errout;
	}

	/* fd -1 with EV_PERSIST plus a timeout gives a repeating timer. */
	ev = event_new(ctx->ev_base, -1, EV_PERSIST, watchdog_tfe_thread_handle, proxy);
	if (unlikely(ev == NULL))
	{
		TFE_LOG_ERROR(ctx->logger, "Fail to create tfe watchdog event");
		errno = 0;
		goto errout;
	}

	if (unlikely(evtimer_add(ev, &timer_delay) != 0))
	{
		TFE_LOG_ERROR(ctx->logger, "Fail to add tfe watchdog timer event");
		errno = 0;
		goto errout;
	}

	/*
	 * pthread_create() reports failure through a positive return value and
	 * does not set errno, so test for non-zero and decode `ret` itself.
	 */
	ret = pthread_create(&ctx->pthread, NULL, watchdog_tfe_thread, (void *)ctx);
	if (unlikely(ret != 0))
	{
		TFE_LOG_ERROR(ctx->logger, "Fail to create tfe watchdog thread: %s", strerror(ret));
		errno = 0;
		goto errout;
	}

	TFE_LOG_INFO(ctx->logger, "Watchdog TFE module init successfully.");
	return ctx;

errout:
	/* Release in reverse order of acquisition; the event goes before its base. */
	if (ev != NULL)
	{
		event_free(ev);
	}
	if (ctx->ev_base != NULL)
	{
		event_base_free(ctx->ev_base);
	}
	/* NOTE(review): assumes ALLOC() is malloc/calloc-backed - confirm in tfe_utils.h. */
	free(ctx);
	return NULL;
}
|