summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Makefile35
-rw-r--r--kernel/Makefile3
-rw-r--r--kernel/monitor_kernel.c (renamed from monitor_kernel.c)6
-rw-r--r--kernel/monitor_kernel.h (renamed from monitor_kernel.h)7
-rw-r--r--kernel/monitor_kernel_lib.c (renamed from monitor_kernel_lib.c)0
-rw-r--r--kernel/monitor_kernel_task.c377
-rw-r--r--kernel/monitor_kernel_task.h98
-rw-r--r--testcase/helloworld.c (renamed from helloworld.c)2
-rw-r--r--testcase/hptest.c (renamed from hptest.c)2
-rw-r--r--user/monitor_user.c (renamed from monitor_user.c)0
-rw-r--r--user/monitor_user.h (renamed from monitor_user.h)0
11 files changed, 517 insertions, 13 deletions
diff --git a/Makefile b/Makefile
index 2815b94..7870185 100644
--- a/Makefile
+++ b/Makefile
@@ -1,23 +1,36 @@
CC = gcc
CFLAGS = -Wall
+
PROG = helloworld
HPTEST = hptest
-KMOD = variable_monitor
-obj-m := $(KMOD).o
-$(KMOD)-objs := monitor_kernel.o
+
+UDIR = $(PWD)/user
+MDIR := $(PWD)/kernel
+# Kernel source tree. The comment lives on its own line because make keeps the
+# whitespace that precedes a trailing comment as part of the variable's value.
+KDIR := $(PWD)/linux-5.17.15
+TDIR := $(PWD)/testcase
+
+BUILD_DIR := $(PWD)/build
+OUTPUT_DIR = $(PWD)/build
+
+
+# KMOD = variable_monitor
+# obj-m := kernel/$(KMOD).o
+# $(KMOD)-objs := kernel/monitor_kernel.o
all: $(PROG) $(HPTEST) module
-$(PROG): helloworld.c
- $(CC) $(CFLAGS) -o $(PROG) helloworld.c monitor_user.c
+$(PROG): $(TDIR)/helloworld.c
+ $(CC) $(CFLAGS) -o $(OUTPUT_DIR)/$(PROG) $(TDIR)/helloworld.c $(UDIR)/monitor_user.c
-$(HPTEST): hptest.c
- $(CC) $(CFLAGS) -o $(HPTEST) hptest.c monitor_user.c
+# hptest is compiled from hptest.c, so that file (not helloworld.c) is the
+# prerequisite that must trigger a rebuild.
+$(HPTEST): $(TDIR)/hptest.c
+ $(CC) $(CFLAGS) -o $(OUTPUT_DIR)/$(HPTEST) $(TDIR)/hptest.c $(UDIR)/monitor_user.c
module:
- make -C linux-5.17.15 M=$(PWD) modules
+ make -C $(KDIR) M=$(MDIR) modules
+
+# module:
+# make -C linux-5.17.15 M=$(PWD)/kernel modules
clean:
- rm -f $(PROG)
- rm -f $(HPTEST)
- make -C linux-5.17.15 M=$(PWD) clean \ No newline at end of file
+ rm -f $(OUTPUT_DIR)/*
+ make -C $(KDIR) M=$(MDIR) clean \ No newline at end of file
diff --git a/kernel/Makefile b/kernel/Makefile
new file mode 100644
index 0000000..71abe65
--- /dev/null
+++ b/kernel/Makefile
@@ -0,0 +1,3 @@
+# kbuild file for the out-of-tree module; paths are relative to this directory
+# (the M= directory). monitor_kernel.c #includes the other .c files itself.
+KMOD = variable_monitor
+obj-m := $(KMOD).o
+$(KMOD)-objs := monitor_kernel.o
diff --git a/monitor_kernel.c b/kernel/monitor_kernel.c
index 38360d2..1d17d64 100644
--- a/monitor_kernel.c
+++ b/kernel/monitor_kernel.c
@@ -5,6 +5,7 @@
#include <linux/module.h>
#include "monitor_kernel_lib.c"
+#include "monitor_kernel_task.c"
#define DEVICE_NAME "variable_monitor"
@@ -133,6 +134,11 @@ int init_module(void) {
fn_kallsyms_lookup_name_init(); // init kallsyms_lookup_name
LOOKUP_SYMS(stack_trace_save_tsk); // stack_trace_save_tsk
LOOKUP_SYMS(show_stack); // show_stack
+ LOOKUP_SYMS(idle_sched_class); // idle_sched_class
+ LOOKUP_SYMS(access_remote_vm); // access_remote_vm
+
+ LOOKUP_SYMS_NORET(get_task_type); // get_task_type
+ LOOKUP_SYMS_NORET(kernfs_name); // kernfs_name
return 0;
}
diff --git a/monitor_kernel.h b/kernel/monitor_kernel.h
index 852d003..47e36d0 100644
--- a/monitor_kernel.h
+++ b/kernel/monitor_kernel.h
@@ -121,6 +121,13 @@ int fn_kallsyms_lookup_name_init(void); // init kallsyms_lookup_name
} \
} while (0)
+/*
+ * Resolve the kernel symbol `name` with diag_kallsyms_lookup_name() and store
+ * its address in orig_<name>. When the lookup yields NULL an error is logged
+ * with pr_err(); the macro itself never returns from the enclosing function,
+ * so orig_<name> may stay NULL and users must check it before calling.
+ */
+#define LOOKUP_SYMS_NORET(name) \
+ do { \
+ orig_##name = (void *)diag_kallsyms_lookup_name(#name); \
+ if (!orig_##name) \
+ pr_err("kallsyms_lookup_name: %s\n", #name); \
+ } while (0)
+
#define BACKTRACE_DEPTH 20 // max stack depth
// LOOKUP_SYMS(stack_trace_save_tsk);
diff --git a/monitor_kernel_lib.c b/kernel/monitor_kernel_lib.c
index 3dea0cd..3dea0cd 100644
--- a/monitor_kernel_lib.c
+++ b/kernel/monitor_kernel_lib.c
diff --git a/kernel/monitor_kernel_task.c b/kernel/monitor_kernel_task.c
new file mode 100644
index 0000000..3b57152
--- /dev/null
+++ b/kernel/monitor_kernel_task.c
@@ -0,0 +1,377 @@
+#include "monitor_kernel_task.h"
+#include <asm/processor.h>
+#include <asm/ptrace.h>
+#include <asm/syscall.h> // for syscall_get_nr
+#include <linux/irq.h>
+#include <linux/sched/mm.h> // for get_task_mm
+#include <linux/syscalls.h>
+#include <linux/tracehook.h>
+
+/*
+ * Output buffer descriptor used by the user-stack walkers in this file.
+ * The walkers append addresses at entries[nr_entries] and stop once
+ * nr_entries reaches max_entries.
+ */
+struct stack_trace {
+ /* nr_entries: slots already filled; max_entries: capacity of entries[] */
+ unsigned int nr_entries, max_entries;
+ /* caller-provided storage for the captured addresses */
+ unsigned long *entries;
+ int skip; /* input argument: How many entries to skip */
+};
+
+/*
+ * Layout the walkers read from user memory at each frame pointer: the saved
+ * frame pointer of the next frame, followed by the return address that is
+ * recorded in the trace.
+ */
+struct stack_frame_user {
+ const void __user *next_fp;
+ unsigned long ret_addr;
+};
+
+/*
+ * Return orig_get_task_type() for the task's scheduling entity, or 0 when
+ * that symbol was not resolved.
+ */
+static inline int diag_get_task_type(struct task_struct *tsk) {
+  return orig_get_task_type ? orig_get_task_type(&tsk->se) : 0;
+}
+
+/*
+ * Write the kernfs name of cgrp into buf (at most buflen bytes) through
+ * orig_kernfs_name() and return its result. Returns 0 without touching buf
+ * when the symbol is unresolved, cgrp is NULL, or cgrp has no kernfs node.
+ */
+static inline int orig_diag_cgroup_name(struct cgroup *cgrp, char *buf,
+                                        size_t buflen) {
+  if (!orig_kernfs_name || !cgrp || !cgrp->kn)
+    return 0;
+
+  return orig_kernfs_name(cgrp->kn, buf, buflen);
+}
+
+/*
+ * Look up the mm_info registered for mm in the radix tree, which is indexed
+ * by the mm_struct address. Returns NULL for a NULL mm or when no entry is
+ * present.
+ */
+static inline mm_info *find_mm_info(mm_tree *mm_tree, struct mm_struct *mm) {
+  if (!mm)
+    return NULL;
+
+  return radix_tree_lookup(&mm_tree->mm_tree, (unsigned long)mm);
+}
+
+/*
+ * Zero buf[0..count) and then, if tsk has a cgroup attached for the selected
+ * controller, let orig_diag_cgroup_name() write that cgroup's name into buf.
+ * cgroup == 1 selects cpuset_cgrp_id; any other value selects
+ * cpuacct_cgrp_id.
+ */
+static void __diag_cgroup_name(struct task_struct *tsk, char *buf,
+                               unsigned int count, int cgroup) {
+  const int subsys_id = (cgroup == 1) ? cpuset_cgrp_id : cpuacct_cgrp_id;
+
+  memset(buf, 0, count);
+
+  if (!tsk || !tsk->cgroups || !tsk->cgroups->subsys)
+    return;
+  if (!tsk->cgroups->subsys[subsys_id] ||
+      !tsk->cgroups->subsys[subsys_id]->cgroup)
+    return;
+
+  orig_diag_cgroup_name(tsk->cgroups->subsys[subsys_id]->cgroup, buf, count);
+}
+
+/*
+ * Thin wrapper that forwards all arguments unchanged to __diag_cgroup_name():
+ * buf is cleared for count bytes and then filled with the task's cgroup name
+ * (cgroup == 1 -> cpuset controller, otherwise cpuacct controller).
+ */
+static void diag_cgroup_name(struct task_struct *tsk, char *buf,
+ unsigned int count, int cgroup) {
+ __diag_cgroup_name(tsk, buf, count, cgroup);
+}
+
+/*
+ * Copy one stack_frame_user from the current task's user memory at fp.
+ *
+ * fp comes from user-controlled registers / saved frame pointers, so the
+ * range is validated with access_ok() first: __copy_from_user_inatomic()
+ * performs no such check itself and would otherwise happily read a kernel
+ * address. Page faults are disabled around the copy so a non-resident page
+ * makes the copy fail instead of sleeping.
+ *
+ * Returns 1 when the whole frame was copied, 0 otherwise.
+ */
+static int copy_stack_frame(const void __user *fp,
+                            struct stack_frame_user *frame) {
+  unsigned long uncopied;
+
+  if (!access_ok(fp, sizeof(*frame)))
+    return 0;
+
+  pagefault_disable();
+  uncopied = __copy_from_user_inatomic(frame, fp, sizeof(*frame));
+  pagefault_enable();
+
+  return uncopied == 0;
+}
+
+/*
+ * Copy one stack_frame_user from another task's user memory at fp.
+ *
+ * A reference on the task's mm is taken with get_task_mm() and dropped with
+ * mmput() once the read is done. orig_access_remote_vm() reports how many
+ * bytes it transferred; a short read would leave part of *frame stale, so
+ * only a full-size read counts as success.
+ *
+ * Returns 1 when the whole frame was copied, 0 otherwise (including when the
+ * task has no mm).
+ */
+static int copy_stack_frame_remote(struct task_struct *tsk,
+                                   const void __user *fp,
+                                   struct stack_frame_user *frame) {
+  struct mm_struct *mm;
+  int copied;
+
+  mm = get_task_mm(tsk);
+  if (!mm)
+    return 0;
+
+  copied =
+      orig_access_remote_vm(mm, (unsigned long)fp, frame, sizeof(*frame), 0);
+  mmput(mm);
+
+  return copied == (int)sizeof(*frame);
+}
+
+/*
+ * Walk the frame-pointer chain of another task's user stack and append the
+ * addresses to trace. The saved instruction pointer is recorded first, then
+ * one return address per frame read with copy_stack_frame_remote().
+ *
+ * Does nothing when called in atomic context or with interrupts disabled.
+ * The walk ends when the trace is full, a frame cannot be read, the frame
+ * pointer lies below the saved stack pointer, a frame has a zero return
+ * address, or the chain points back at itself.
+ */
+static inline void save_stack_trace_user_remote(struct task_struct *tsk,
+                                                struct stack_trace *trace) {
+  const struct pt_regs *regs = task_pt_regs(tsk);
+  const void __user *fp = (const void __user *)regs->bp;
+  int count = 0;
+
+  if (in_atomic() || irqs_disabled())
+    return;
+
+  if (trace->nr_entries < trace->max_entries)
+    trace->entries[trace->nr_entries++] = regs->ip;
+
+  while (trace->nr_entries < trace->max_entries) {
+    struct stack_frame_user frame = {.next_fp = NULL, .ret_addr = 0};
+
+    if (!copy_stack_frame_remote(tsk, fp, &frame))
+      break;
+    if ((unsigned long)fp < regs->sp)
+      break;
+    if (!frame.ret_addr)
+      break;
+
+    trace->entries[trace->nr_entries++] = frame.ret_addr;
+
+    if (frame.next_fp == fp)
+      break;
+    fp = frame.next_fp;
+
+    /*
+     * A hard lockup was observed here in production, so the walk is
+     * forcibly bounded.
+     */
+    if (++count >= trace->max_entries || count >= 100)
+      break;
+  }
+}
+
+/*
+ * Walk the frame-pointer chain of the current task's user stack and append
+ * the addresses to trace. The saved instruction pointer is recorded first,
+ * then the return address of every frame read with copy_stack_frame().
+ *
+ * Unlike the remote variant, a frame with a zero return address is skipped
+ * rather than ending the walk. The walk ends when the trace is full, a frame
+ * cannot be read, the frame pointer lies below the saved stack pointer, or
+ * the chain points back at itself.
+ */
+static inline void __save_stack_trace_user(struct stack_trace *trace) {
+  const struct pt_regs *regs = task_pt_regs(current);
+  const void __user *fp = (const void __user *)regs->bp;
+  int count = 0;
+
+  if (trace->nr_entries < trace->max_entries)
+    trace->entries[trace->nr_entries++] = regs->ip;
+
+  while (trace->nr_entries < trace->max_entries) {
+    struct stack_frame_user frame = {.next_fp = NULL, .ret_addr = 0};
+
+    if (!copy_stack_frame(fp, &frame))
+      break;
+    if ((unsigned long)fp < regs->sp)
+      break;
+
+    if (frame.ret_addr)
+      trace->entries[trace->nr_entries++] = frame.ret_addr;
+
+    if (frame.next_fp == fp)
+      break;
+    fp = frame.next_fp;
+
+    /*
+     * A hard lockup was observed here in production, so the walk is
+     * forcibly bounded.
+     */
+    if (++count >= trace->max_entries || count >= 100)
+      break;
+  }
+}
+
+/*
+ * Capture the current task's user stack into trace and, when room is left,
+ * append ULONG_MAX as an end marker. Kernel threads (no mm) only get the
+ * marker.
+ */
+void perfect_save_stack_trace_user(struct stack_trace *trace) {
+  /* Only tasks with an mm have a user stack to walk. */
+  if (current->mm)
+    __save_stack_trace_user(trace);
+
+  if (trace->nr_entries >= trace->max_entries)
+    return;
+
+  trace->entries[trace->nr_entries++] = ULONG_MAX;
+}
+
+/*
+ * Fill backtrace (which must hold BACKTRACE_DEPTH2 entries) with the current
+ * task's user-space call chain. The buffer is zeroed first, so unused slots
+ * read as 0.
+ */
+void diagnose_save_stack_trace_user(unsigned long *backtrace) {
+  struct stack_trace trace = {
+      .nr_entries = 0,
+      .max_entries = BACKTRACE_DEPTH2,
+      .entries = backtrace,
+      .skip = 0,
+  };
+
+  memset(backtrace, 0, BACKTRACE_DEPTH2 * sizeof(unsigned long));
+  perfect_save_stack_trace_user(&trace);
+}
+
+/*
+ * Fill backtrace (which must hold BACKTRACE_DEPTH2 entries) with the
+ * user-space call chain of tsk. The buffer is zeroed first; when room is left
+ * after the walk, ULONG_MAX is appended as an end marker.
+ */
+void diagnose_save_stack_trace_user_remote(struct task_struct *tsk,
+                                           unsigned long *backtrace) {
+  struct stack_trace trace = {
+      .nr_entries = 0,
+      .max_entries = BACKTRACE_DEPTH2,
+      .entries = backtrace,
+      .skip = 0,
+  };
+
+  memset(backtrace, 0, BACKTRACE_DEPTH2 * sizeof(unsigned long));
+
+  /* Only tasks with an mm have a user stack to walk. */
+  if (tsk->mm)
+    save_stack_trace_user_remote(tsk, &trace);
+
+  if (trace.nr_entries >= trace.max_entries)
+    return;
+
+  trace.entries[trace.nr_entries++] = ULONG_MAX;
+}
+
+/*
+ * Fill *detail with a summary of tsk: ids (host and pid-namespace view),
+ * state, task type, command name, cgroup names, and - only when tsk is the
+ * current task - whether it was in user mode and which syscall it is in.
+ *
+ * detail is zeroed first. It is left all-zero when tsk is NULL, when tsk or
+ * its thread-group leader is a zombie, or when the leader is NULL. A NULL
+ * detail makes the call a no-op.
+ */
+void diag_task_brief(struct task_struct *tsk, task_detail *detail) {
+ struct pid_namespace *ns;
+ struct pt_regs *task_regs;
+ struct task_struct *leader;
+ struct pt_regs *irq_regs;
+
+ if (!detail)
+ return;
+
+ memset(detail, 0, sizeof(task_detail));
+
+ if (!tsk || tsk->exit_state == EXIT_ZOMBIE) // zombie
+ return;
+ leader = tsk->group_leader;
+ if (!leader || leader->exit_state == EXIT_ZOMBIE) {
+ return;
+ }
+
+ /*
+ * user_mode / syscallno are only determined for the current task; for any
+ * other task they are set to -1 (stored in unsigned long fields).
+ */
+ if (tsk != current) { // not current task
+ detail->user_mode = -1;
+ detail->syscallno = -1;
+ } else if (!tsk->mm) { // current task but kernel thread
+ detail->user_mode = 0;
+ detail->syscallno = -1;
+ } else { // current task and user thread
+ irq_regs = get_irq_regs(); // get current irq regs
+ task_regs = task_pt_regs(tsk);
+
+ /* user mode if either register snapshot says so */
+ if ((irq_regs && user_mode(irq_regs)) ||
+ (task_regs && user_mode(task_regs))) {
+ detail->user_mode = 1; // user mode
+ } else {
+ detail->user_mode = 0; // kernel mode
+ }
+
+ /* syscallno stays 0 (from the memset) when task_regs is NULL */
+ if (task_regs) {
+ detail->syscallno = syscall_get_nr(tsk, task_regs); // get syscall no
+ }
+ }
+
+ /* sys_task: 2 = idle task, 1 = kernel thread (no mm), 0 = everything else */
+ if (tsk->sched_class == orig_idle_sched_class) // idle task
+ detail->sys_task = 2;
+ else if (!tsk->mm) // kernel thread
+ detail->sys_task = 1;
+ else
+ detail->sys_task = 0;
+
+ detail->pid = tsk->pid; // pid
+ detail->tgid = tsk->tgid; // tgid
+ detail->state = tsk->__state; // state
+ detail->task_type = diag_get_task_type(tsk); // task type
+ /*
+ * Ids as seen from the task's own pid namespace; fall back to the host ids
+ * when the task lives in init_pid_ns or has no namespace.
+ */
+ ns = task_active_pid_ns(tsk); // container pid
+ if (ns && ns != &init_pid_ns) {
+ detail->container_pid = task_pid_nr_ns(tsk, ns);
+ detail->container_tgid = task_tgid_nr_ns(tsk, ns);
+ } else {
+ detail->container_pid = tsk->pid;
+ detail->container_tgid = tsk->tgid;
+ }
+ /* strncpy does not guarantee termination, hence the explicit NUL below */
+ strncpy(detail->comm, tsk->comm, TASK_COMM_LEN);
+ detail->comm[TASK_COMM_LEN - 1] = 0; // comm name
+ /* last argument: 0 -> cpuacct controller, 1 -> cpuset controller */
+ diag_cgroup_name(tsk, detail->cgroup_buf, CGROUP_NAME_LEN, 0);
+ diag_cgroup_name(tsk, detail->cgroup_cpuset, CGROUP_NAME_LEN, 1);
+
+ detail->cgroup_buf[CGROUP_NAME_LEN - 1] = 0; // cgroup name
+ detail->cgroup_cpuset[CGROUP_NAME_LEN - 1] = 0; // cgroup cpuset name
+}
+
+/*
+ * Capture the user-space register snapshot and user stack backtrace of tsk
+ * into *detail.
+ *
+ * detail->stack[0] is cleared up front. Nothing else is written when tsk is
+ * NULL, has no mm (kernel thread), or when its thread-group leader is
+ * missing, has no mm, or is a zombie. A NULL detail makes the call a no-op.
+ *
+ * When task_pt_regs() yields NULL, the register fields are zeroed and the
+ * backtrace is skipped: the stack walkers dereference those registers too,
+ * so continuing would only move the NULL dereference elsewhere.
+ */
+void diag_task_user_stack(struct task_struct *tsk, user_stack_detail *detail) {
+  struct pt_regs *regs;
+  struct task_struct *leader;
+
+  if (!detail)
+    return;
+
+  detail->stack[0] = 0;
+  if (!tsk || !tsk->mm)
+    return;
+
+  leader = tsk->group_leader;
+  if (!leader || !leader->mm || leader->exit_state == EXIT_ZOMBIE)
+    return;
+
+  regs = task_pt_regs(tsk);
+  if (!regs) {
+    memset(&detail->regs, 0, sizeof(detail->regs));
+    detail->sp = 0;
+    detail->ip = 0;
+    detail->bp = 0;
+    return;
+  }
+
+#if defined(DIAG_ARM64)
+  detail->regs = regs->user_regs;
+  detail->ip = regs->pc;
+  detail->bp = regs->sp;
+#else
+  detail->regs = *regs;
+  detail->ip = regs->ip;
+  detail->bp = regs->bp;
+#endif
+  detail->sp = regs->sp;
+
+  /* The current task can be read directly; others go through their mm. */
+  if (tsk == current)
+    diagnose_save_stack_trace_user(detail->stack);
+  else
+    diagnose_save_stack_trace_user_remote(tsk, detail->stack);
+}
+
+/*
+ * Capture up to BACKTRACE_DEPTH2 kernel stack entries of tsk into
+ * detail->stack via orig_stack_trace_save_tsk().
+ *
+ * The buffer is zeroed first so slots beyond the captured depth read as 0
+ * instead of stale data. A NULL detail makes the call a no-op; a NULL tsk or
+ * an unresolved orig_stack_trace_save_tsk leaves the zeroed buffer.
+ */
+void diag_task_kern_stack(struct task_struct *tsk, kern_stack_detail *detail) {
+  if (!detail)
+    return;
+
+  memset(detail->stack, 0, sizeof(detail->stack));
+  if (!tsk || !orig_stack_trace_save_tsk)
+    return;
+
+  orig_stack_trace_save_tsk(tsk, detail->stack, BACKTRACE_DEPTH2, 0);
+}
+
+/*
+ * Record the ancestry of tsk in *detail: starting at tsk's thread-group
+ * leader and following real_parent, store for each process its pid and either
+ * the cached argv from mm_tree (full_argv = 1) or, when no mm_info is
+ * registered, the task's comm (full_argv = 0). At most PROCESS_CHAINS_COUNT
+ * levels are recorded and the walk stops at the task with pid 0.
+ *
+ * All slots of detail are reset first. Nothing more is recorded when style is
+ * 0, when tsk is NULL or has no mm, or when its thread-group leader is
+ * missing, has no mm, or is a zombie. A NULL detail makes the call a no-op.
+ */
+void dump_proc_chains_argv(int style, struct task_struct *tsk, mm_tree *mm_tree,
+                           proc_chains_detail *detail) {
+  struct task_struct *walker;
+  struct task_struct *leader;
+  mm_info *info;
+  int cnt = 0;
+  int i;
+
+  if (!detail)
+    return;
+
+  for (i = 0; i < PROCESS_CHAINS_COUNT; i++) {
+    detail->chains[i][0] = 0;
+    detail->tgid[i] = 0;
+    detail->full_argv[i] = 0;
+  }
+  if (style == 0)
+    return;
+
+  if (!tsk || !tsk->mm)
+    return;
+
+  leader = tsk->group_leader;
+  if (!leader || !leader->mm ||
+      leader->exit_state == EXIT_ZOMBIE) { // leader is zombie or no mm
+    return;
+  }
+
+  rcu_read_lock();
+  walker = tsk;
+
+  while (walker->pid > 0) {
+    if (!thread_group_leader(walker))
+      walker = rcu_dereference(walker->group_leader);
+    info = find_mm_info(mm_tree, walker->mm);
+    if (info) {
+      /* Fill the cached cgroup name on first use. */
+      if (info->cgroup_buf[0] == 0)
+        diag_cgroup_name(walker, info->cgroup_buf, 255, 0);
+      /*
+       * argv may be longer than a chain slot; strncpy() does not terminate
+       * on truncation, so terminate explicitly.
+       */
+      strncpy(detail->chains[cnt], info->argv, PROCESS_ARGV_LEN);
+      detail->chains[cnt][PROCESS_ARGV_LEN - 1] = 0;
+      detail->full_argv[cnt] = 1;
+    } else {
+      strncpy(detail->chains[cnt], walker->comm, TASK_COMM_LEN);
+      detail->chains[cnt][TASK_COMM_LEN - 1] = 0;
+      detail->full_argv[cnt] = 0;
+    }
+    detail->tgid[cnt] = walker->pid;
+    walker = rcu_dereference(walker->real_parent);
+    cnt++;
+    if (cnt >= PROCESS_CHAINS_COUNT)
+      break;
+  }
+  rcu_read_unlock();
+}
diff --git a/kernel/monitor_kernel_task.h b/kernel/monitor_kernel_task.h
new file mode 100644
index 0000000..62e501c
--- /dev/null
+++ b/kernel/monitor_kernel_task.h
@@ -0,0 +1,98 @@
+#include <linux/kernfs.h>
+#include <linux/sched.h>
+
+#define CGROUP_NAME_LEN 32 // max length of cgroup name
+#define TASK_COMM_LEN 16 // max length of task name
+
+#define BACKTRACE_DEPTH2 30 // max depth of backtrace
+
+#define PROCESS_CHAINS_COUNT 10 // max count of process chains
+#define PROCESS_ARGV_LEN 128 // max length of process argv
+
+// from
+// https://github.com/alibaba/diagnose-tools/blob/8cd905a1c17f2201e460a2d607413a1303757a32/SOURCE/uapi/ali_diagnose.h
+
+/* Per-task summary filled in by diag_task_brief(). */
+typedef struct {
+ /* cgroup name for the cpuacct controller, NUL-terminated */
+ char cgroup_buf[CGROUP_NAME_LEN];
+ /* cgroup name for the cpuset controller, NUL-terminated */
+ char cgroup_cpuset[CGROUP_NAME_LEN];
+ /* pid / tgid copied from the task_struct */
+ int pid;
+ int tgid;
+ /* pid / tgid as seen from the task's own pid namespace */
+ int container_pid;
+ int container_tgid;
+ /* copy of the task's __state */
+ long state;
+ /* result of get_task_type(), 0 if that symbol is unavailable */
+ int task_type;
+ /* syscall number for the current task; -1 when not determined */
+ unsigned long syscallno;
+ /**
+ * 0->user 1->sys 2->idle
+ */
+ unsigned long sys_task;
+ /**
+ * 1->user mode 0->sys mode -1->unknown
+ */
+ unsigned long user_mode;
+ /* task command name, NUL-terminated */
+ char comm[TASK_COMM_LEN];
+} task_detail;
+
+/* Kernel stack backtrace filled in by diag_task_kern_stack(). */
+typedef struct {
+ unsigned long stack[BACKTRACE_DEPTH2];
+} kern_stack_detail;
+
+/* User-space register snapshot and backtrace filled in by diag_task_user_stack(). */
+typedef struct {
+ /* copy of the task's saved user registers */
+ struct pt_regs regs;
+ /* instruction, frame and stack pointer taken from those registers */
+ unsigned long ip;
+ unsigned long bp;
+ unsigned long sp;
+ /* user return addresses; ULONG_MAX marks the end when there is room */
+ unsigned long stack[BACKTRACE_DEPTH2];
+} user_stack_detail;
+
+/*
+ * Process ancestry filled in by dump_proc_chains_argv(); index 0 is the task's
+ * own process, higher indexes are successive parents.
+ */
+typedef struct {
+ unsigned int full_argv[PROCESS_CHAINS_COUNT]; // 1: chains[i] holds cached argv, 0: holds comm
+ char chains[PROCESS_CHAINS_COUNT][PROCESS_ARGV_LEN]; // process chains argv
+ unsigned int tgid[PROCESS_CHAINS_COUNT]; // process chains tgid
+} proc_chains_detail;
+
+// most important struct
+/*
+ * Aggregate record combining everything the diag_task_* helpers collect for
+ * one task.
+ * NOTE(review): et_type, id and tv are not written anywhere in the visible
+ * code; presumably event type, record id and timestamp - confirm with callers.
+ */
+typedef struct {
+ int et_type;
+ unsigned long id;
+ unsigned long long tv;
+ task_detail task; // brief
+ user_stack_detail user_stack; // user stack
+ kern_stack_detail kern_stack; // kernel stack
+ proc_chains_detail proc_chains; // process chains argv
+} variable_monitor_task;
+
+/*
+ * Cached per-address-space data looked up by find_mm_info().
+ * NOTE(review): the code that allocates and populates these entries is not in
+ * this file; rcu_head suggests RCU-deferred freeing - confirm.
+ */
+typedef struct {
+ struct rcu_head rcu_head;
+ pid_t pid;
+ struct mm_struct *mm;
+ /* cpuacct cgroup name; filled lazily by dump_proc_chains_argv() when empty */
+ char cgroup_buf[256];
+ /* command line copied into proc_chains_detail.chains[] */
+ char argv[256];
+} mm_info;
+
+/* Radix tree of mm_info entries indexed by the mm_struct address. */
+typedef struct {
+ struct radix_tree_root mm_tree;
+ /* NOTE(review): not taken anywhere in the visible code - confirm usage */
+ spinlock_t mm_tree_lock;
+} mm_tree;
+
+/* Public entry points implemented in monitor_kernel_task.c. */
+void diag_task_brief(struct task_struct *tsk,
+ task_detail *detail); // get task brief
+void diag_task_user_stack(struct task_struct *tsk,
+ user_stack_detail *detail); // get task user stack
+void diag_task_kern_stack(struct task_struct *tsk,
+ kern_stack_detail *detail); // get task kernel stack
+void dump_proc_chains_argv(
+ int style, struct task_struct *tsk, mm_tree *mm_tree,
+ proc_chains_detail *detail); // get process chains argv
+
+// orig_X
+/*
+ * Addresses of non-exported kernel symbols, filled in at module init through
+ * the LOOKUP_SYMS / LOOKUP_SYMS_NORET macros.
+ * NOTE(review): all but the last are definitions, not extern declarations, so
+ * this header must end up in exactly one translation unit.
+ */
+struct sched_class *orig_idle_sched_class;
+int (*orig_get_task_type)(struct sched_entity *se);
+int (*orig_kernfs_name)(struct kernfs_node *kn, char *buf, size_t buflen);
+int (*orig_access_remote_vm)(struct mm_struct *mm, unsigned long addr,
+ void *buf, int len, unsigned int gup_flags);
+extern unsigned int (*orig_stack_trace_save_tsk)(struct task_struct *task,
+ unsigned long *store,
+ unsigned int size,
+ unsigned int skipnr);
+
diff --git a/helloworld.c b/testcase/helloworld.c
index 1416d5a..554b258 100644
--- a/helloworld.c
+++ b/testcase/helloworld.c
@@ -1,4 +1,4 @@
-#include "monitor_user.h"
+#include "../user/monitor_user.h"
#include <stdio.h>
#include <unistd.h>
#include <string.h>
diff --git a/hptest.c b/testcase/hptest.c
index 4a7e494..ab2f0a8 100644
--- a/hptest.c
+++ b/testcase/hptest.c
@@ -1,4 +1,4 @@
-#include "monitor_user.h"
+#include "../user/monitor_user.h"
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
diff --git a/monitor_user.c b/user/monitor_user.c
index 91da839..91da839 100644
--- a/monitor_user.c
+++ b/user/monitor_user.c
diff --git a/monitor_user.h b/user/monitor_user.h
index f4d9df1..f4d9df1 100644
--- a/monitor_user.h
+++ b/user/monitor_user.h