summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- README_zh.md 109
-rw-r--r-- helloworld.c 2
-rw-r--r-- monitor_kernel.c 3
-rw-r--r-- monitor_kernel.h 2
-rw-r--r-- monitor_kernel_lib.c 25
-rw-r--r-- monitor_user.h 2
6 files changed, 89 insertions, 54 deletions
diff --git a/README_zh.md b/README_zh.md
index db691ae..f2c6cfa 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -1,5 +1,15 @@
## Variable Monitor
+changelog
+
+```log
+11.9 多个变量监控支持
+11.10 按照 pid 区分不同内核结构, 支持每个进程单独申请取消自己的监控.
+11.13 用户接口 cancel_all_watch -> cancel_watch, 每个进程互不干扰.
+```
+
+## 说明
+
监控 数值变量(给定 地址,长度), 超过设定条件打印系统堆栈信息.
同时监控数量
@@ -8,61 +18,67 @@
- 定时器数量全局最多 128 个.
- 以上数量限制定义在 `watch_module.h` 头部宏.
-目前限制监控 在同一个应用程序下,暂不支持多个应用程序同时调用.
-- 多个应用程序如果只有一个程序调用 `cancel_all_watch();`, 那么也可以正常工作.
-
## 使用
示例如 helloworld.c
- 添加 `#include "watch.h"`
- 对每个需要监控的变量 设置: 名称 && 地址 && 长度, 设置阈值, 比较方式, 定时器间隔(ns) 等.
- `start_watch(watch_arg);` 启动监控
-- 需要取消监控时调用 `cancel_all_watch();`
+- 需要取消监控时调用 `cancel_watch();`
超出设定条件时,打印系统堆栈信息, `dmesg` 查看,如下示例:
- 一个定时器内,多个变量超过阈值,堆栈信息不会重复输出;
- 打印堆栈后定时器再启动时间为 1s, 1s 后开始下一个轮次监控.
```log
-[86245.364861] -------------------------------------
-[86245.364864] -------------watch monitor-----------
-[86245.364865] Threshold reached:
- name: temp0, threshold: 150
-[86245.364866] Timestamp (ns): 1699589000606300743
-[86245.364867] Recent Load: 116.65, 126.83, 151.17
-[86245.365669] task: name lcore-worker-4, pid 803327
-[86245.365672] task: name lcore-worker-5, pid 803328
-[86245.365673] task: name lcore-worker-6, pid 803329
-[86245.365674] task: name lcore-worker-7, pid 803330
-[86245.365676] task: name lcore-worker-8, pid 803331
-[86245.365677] task: name lcore-worker-9, pid 803332
-[86245.365679] task: name lcore-worker-10, pid 803333
-[86245.365681] task: name lcore-worker-11, pid 803334
-[86245.365682] task: name lcore-worker-68, pid 803335
-[86245.365683] task: name lcore-worker-69, pid 803336
-[86245.365684] task: name lcore-worker-70, pid 803337
-[86245.365685] task: name lcore-worker-71, pid 803338
-[86245.365686] task: name lcore-worker-72, pid 803339
-[86245.365687] task: name lcore-worker-73, pid 803340
-[86245.365688] task: name lcore-worker-74, pid 803341
-[86245.365689] task: name lcore-worker-75, pid 803342
-[86245.365694] task: name pkt:worker-0, pid 803638
-[86245.365702] hrtimer_nanosleep+0x8d/0x120
-[86245.365709] __x64_sys_nanosleep+0x96/0xd0
-[86245.365711] do_syscall_64+0x37/0x80
-[86245.365716] entry_SYSCALL_64_after_hwframe+0x44/0xae
-[86245.365718] task: name pkt:worker-1, pid 803639
-[86245.365721] hrtimer_nanosleep+0x8d/0x120
-[86245.365724] __x64_sys_nanosleep+0x96/0xd0
-[86245.365726] do_syscall_64+0x37/0x80
-[86245.365728] entry_SYSCALL_64_after_hwframe+0x44/0xae
-[86245.365730] task: name pkt:worker-2, pid 803640
-[86245.365732] hrtimer_nanosleep+0x8d/0x120
-[86245.365734] __x64_sys_nanosleep+0x96/0xd0
-[86245.365737] do_syscall_64+0x37/0x80
-[86245.365739] entry_SYSCALL_64_after_hwframe+0x44/0xae
-[86245.365740] task: name pkt:worker-3, pid 803641
-[86245.365743] hrtimer_nanosleep+0x8d/0x120
+[ 713.225894] -------------------------------------
+[ 713.225900] -------------watch monitor-----------
+[ 713.225900] Threshold reached:
+[ 713.225901] name: temp0, threshold: 150, pid: 4261
+[ 713.225902] name: temp1, threshold: 151, pid: 4261
+[ 713.225903] name: temp2, threshold: 152, pid: 4261
+[ 713.225904] name: temp3, threshold: 153, pid: 4261
+[ 713.225904] name: temp4, threshold: 154, pid: 4261
+[ 713.225905] name: temp5, threshold: 155, pid: 4261
+[ 713.225905] name: temp6, threshold: 156, pid: 4261
+[ 713.225906] name: temp7, threshold: 157, pid: 4261
+[ 713.225906] name: temp8, threshold: 158, pid: 4261
+[ 713.225907] name: temp9, threshold: 159, pid: 4261
+[ 713.225907] name: temp10, threshold: 160, pid: 4261
+[ 713.225908] name: temp11, threshold: 161, pid: 4261
+[ 713.225908] name: temp12, threshold: 162, pid: 4261
+[ 713.225909] name: temp13, threshold: 163, pid: 4261
+[ 713.225909] name: temp14, threshold: 164, pid: 4261
+[ 713.225910] name: temp15, threshold: 165, pid: 4261
+[ 713.225910] name: temp16, threshold: 166, pid: 4261
+[ 713.225911] name: temp17, threshold: 167, pid: 4261
+[ 713.225911] name: temp18, threshold: 168, pid: 4261
+[ 713.225912] name: temp19, threshold: 169, pid: 4261
+[ 713.225912] name: temp20, threshold: 170, pid: 4261
+[ 713.225913] name: temp21, threshold: 171, pid: 4261
+[ 713.225913] name: temp22, threshold: 172, pid: 4261
+[ 713.225914] name: temp23, threshold: 173, pid: 4261
+[ 713.225914] name: temp24, threshold: 174, pid: 4261
+[ 713.225915] name: temp25, threshold: 175, pid: 4261
+[ 713.225915] name: temp26, threshold: 176, pid: 4261
+[ 713.225916] name: temp27, threshold: 177, pid: 4261
+[ 713.225916] name: temp28, threshold: 178, pid: 4261
+[ 713.225916] name: temp29, threshold: 179, pid: 4261
+[ 713.225917] name: temp30, threshold: 180, pid: 4261
+[ 713.225917] name: temp31, threshold: 181, pid: 4261
+[ 713.225918] Timestamp (ns): 1699846710299420862
+[ 713.225919] Recent Load: 0.05, 0.12, 0.08
+[ 713.225921] task: name rcu_gp, pid 3, state 1026
+[ 713.225926] rescuer_thread+0x290/0x390
+[ 713.225931] kthread+0xd7/0x100
+[ 713.225932] ret_from_fork+0x1f/0x30
+[ 713.225935] task: name rcu_par_gp, pid 4, state 1026
+[ 713.225936] rescuer_thread+0x290/0x390
+[ 713.225937] kthread+0xd7/0x100
+[ 713.225938] ret_from_fork+0x1f/0x30
+[ 713.225940] task: name netns, pid 5, state 1026
+[ 713.225941] rescuer_thread+0x290/0x390
+[ 713.225942] kthread+0xd7/0x100
```
### 参数说明
@@ -137,8 +153,13 @@ rmmod watch_module.ko && make clean
- `get_user_pages_remote`/ `kmap` 会增加对应的计数,需要对等的 `put_page`/`kunmap`.
- 一个模块内全局链表 `watch_local_memory_list` 存储每一个成功挂载的变量对应的 page 和 kt,执行字符设备的 close 操作时,遍历并卸载.
+variable monitor 添加/删除
+- kernel_watch_arg 数据结构中有 pid 的成员变量,但添加变量监控时,不按照进程区分.
+- 删除时遍历全部监控变量,比较 pid.
+- 删除造成的缺位,将最后的变量移动到空位, sentinel--; hrTimer 同理.
+
堆栈输出条件: 条件参考自 [diagnose-tools::load.c](https://github.com/alibaba/diagnose-tools/blob/e285bc4626a7d207eabd4a69cb276e1a3b1b7c76/SOURCE/module/kernel/load.c#L209)
-- `TASK` 要满足 TASK_RUNNING 和 `__task_contributes_to_load`.
+- `TASK` 满足以下任一条件即输出堆栈: `TASK_RUNNING`、`__task_contributes_to_load` 或 `TASK_IDLE`(可能有阻塞进程).
- `__task_contributes_to_load` 对应内核宏 `task_contributes_to_load`.
```c
diff --git a/helloworld.c b/helloworld.c
index 1416d5a..7386f84 100644
--- a/helloworld.c
+++ b/helloworld.c
@@ -3,7 +3,7 @@
#include <unistd.h>
#include <string.h>
-#define NUM_VARS 2049
+#define NUM_VARS 256
int main()
{
diff --git a/monitor_kernel.c b/monitor_kernel.c
index 098bb80..7737150 100644
--- a/monitor_kernel.c
+++ b/monitor_kernel.c
@@ -20,8 +20,8 @@ struct my_device_data
static int device_open(struct inode *inode, struct file *file)
{
- // printk(KERN_INFO "%s\n", __FUNCTION__);
struct my_device_data *data;
+ printk(KERN_INFO "%s: with pid %d\n", __FUNCTION__, current->pid);
// save pid
data = kmalloc(sizeof(*data), GFP_KERNEL);
if (!data)
@@ -38,6 +38,7 @@ static int device_release(struct inode *inode, struct file *file)
struct my_device_data *data = file->private_data;
// clear watch with pid
clear_watch(data->pid);
+ kfree(data); // free data memory
return 0;
}
diff --git a/monitor_kernel.h b/monitor_kernel.h
index d476e48..1c80be0 100644
--- a/monitor_kernel.h
+++ b/monitor_kernel.h
@@ -155,7 +155,7 @@ static void print_task_stack(void)
{
if (p->__state == TASK_RUNNING || __task_contributes_to_load(p) || p->__state == TASK_IDLE)
{
- printk("task: name %s, pid %d\n", p->comm, p->pid);
+ printk("task: name %s, pid %d, state %d\n", p->comm, p->pid, p->__state);
nr_bt = orig_stack_trace_save_tsk(p, backtrace, BACKTRACE_DEPTH, 0);
stack_trace_print(backtrace, nr_bt, 0); // print
}
diff --git a/monitor_kernel_lib.c b/monitor_kernel_lib.c
index c486a14..c7e76cb 100644
--- a/monitor_kernel_lib.c
+++ b/monitor_kernel_lib.c
@@ -209,15 +209,17 @@ enum hrtimer_restart check_variable_cb(struct hrtimer *timer)
{
kernel_watch_timer *k_watch_timer = container_of(timer, kernel_watch_timer, hr_timer);
int i = 0, j = 0;
- char buffer[1024]; // Buffer to store the messages
+ int buffer[TIMER_MAX_WATCH_NUM]; // Buffer to store the messages
// check all watched kernel_watch_arg
for (i = 0; i < k_watch_timer->sentinel; i++)
{
if (read_and_compare(&k_watch_timer->k_watch_args[i]))
{
- snprintf(buffer + strlen(buffer), sizeof(buffer) - strlen(buffer), " name: %s, threshold: %lld \n",
- k_watch_timer->k_watch_args[i].name, k_watch_timer->k_watch_args[i].threshold);
+ // snprintf(buffer + strlen(buffer), sizeof(buffer) - strlen(buffer), " name: %s, threshold: %lld, pid: %d\n",
+ // k_watch_timer->k_watch_args[i].name, k_watch_timer->k_watch_args[i].threshold,
+ // k_watch_timer->k_watch_args[i].task_id);
+ buffer[j] = i;
j++;
// printk(KERN_INFO "j: name %s, threshold: %lld\n", k_watch_timer->k_watch_args[i].name,
@@ -229,7 +231,13 @@ enum hrtimer_restart check_variable_cb(struct hrtimer *timer)
{
printk("-------------------------------------\n");
printk("-------------watch monitor-----------\n");
- printk("Threshold reached:\n %s", buffer);
+ printk("Threshold reached:\n");
+
+ for (i = 0; i < j; i++)
+ {
+ printk(" name: %s, threshold: %lld, pid: %d\n", k_watch_timer->k_watch_args[buffer[i]].name,
+ k_watch_timer->k_watch_args[buffer[i]].threshold, k_watch_timer->k_watch_args[buffer[i]].task_id);
+ }
print_task_stack();
// restart timer after 1s
hrtimer_forward(timer, timer->base->get_time(), ktime_set(1, 0));
@@ -270,6 +278,7 @@ void start_all_hrTimer(void)
timer = &(kernel_wtimer_list[i]);
TIMER_START(timer);
}
+ printk("HrTimer start,module keep %d hrtimer for now\n", kernel_wtimer_num);
}
/// @brief cancel hrTimer
@@ -284,8 +293,7 @@ void cancel_all_hrTimer(void)
TIMER_CANCEL(timer);
}
- // hrtimer_cancel(&hr_timer);
- printk("HrTimer End\n");
+ printk("HrTimer cancel,module keep %d hrtimer for now\n", kernel_wtimer_num);
}
// for read_and_compare
@@ -389,6 +397,9 @@ unsigned char del_all_kwarg_by_pid(pid_t pid)
{
int i = 0;
kernel_watch_timer *timer = NULL;
+
+ printk(KERN_INFO "del kwarg...");
+
for (i = 0; i < kernel_wtimer_num; i++)
{
timer = &(kernel_wtimer_list[i]);
@@ -414,6 +425,7 @@ unsigned char del_all_kwarg_by_pid(pid_t pid)
/// @param pid
void clear_watch(pid_t pid)
{
+ printk(KERN_INFO "clear pid %d 's watch variable\n", pid);
cancel_all_hrTimer(); // just in case
del_all_kwarg_by_pid(pid); // delete all kwarg with pid
free_page_list(pid); // free page with pid
@@ -424,6 +436,7 @@ void clear_watch(pid_t pid)
/// @param
void clear_all_watch(void)
{
+ printk(KERN_INFO "clear all watch variable\n");
// unmap and release the page
free_all_page_list();
// cancel timer
diff --git a/monitor_user.h b/monitor_user.h
index ebf2c0a..a40080d 100644
--- a/monitor_user.h
+++ b/monitor_user.h
@@ -17,5 +17,5 @@ typedef struct
// start watch
int start_watch(watch_arg w_arg);
-// cancel all watch
+// cancel watch
int cancel_watch();
\ No newline at end of file