diff options
| -rw-r--r-- | Makefile | 35 | ||||
| -rw-r--r-- | kernel/Makefile | 3 | ||||
| -rw-r--r-- | kernel/monitor_kernel.c (renamed from monitor_kernel.c) | 6 | ||||
| -rw-r--r-- | kernel/monitor_kernel.h (renamed from monitor_kernel.h) | 7 | ||||
| -rw-r--r-- | kernel/monitor_kernel_lib.c (renamed from monitor_kernel_lib.c) | 0 | ||||
| -rw-r--r-- | kernel/monitor_kernel_task.c | 377 | ||||
| -rw-r--r-- | kernel/monitor_kernel_task.h | 98 | ||||
| -rw-r--r-- | testcase/helloworld.c (renamed from helloworld.c) | 2 | ||||
| -rw-r--r-- | testcase/hptest.c (renamed from hptest.c) | 2 | ||||
| -rw-r--r-- | user/monitor_user.c (renamed from monitor_user.c) | 0 | ||||
| -rw-r--r-- | user/monitor_user.h (renamed from monitor_user.h) | 0 |
11 files changed, 517 insertions, 13 deletions
@@ -1,23 +1,40 @@
 CC = gcc
 CFLAGS = -Wall
+
 PROG = helloworld
 HPTEST = hptest
-KMOD = variable_monitor
-obj-m := $(KMOD).o
-$(KMOD)-objs := monitor_kernel.o
+
+UDIR = $(PWD)/user
+MDIR := $(PWD)/kernel
+# Kernel source tree. Kept on its own line: a trailing comment would leave
+# the whitespace in front of it inside the value of KDIR.
+KDIR := $(PWD)/linux-5.17.15
+TDIR := $(PWD)/testcase
+
+BUILD_DIR := $(PWD)/build
+OUTPUT_DIR = $(PWD)/build
+
+
+# KMOD = variable_monitor
+# obj-m := kernel/$(KMOD).o
+# $(KMOD)-objs := kernel/monitor_kernel.o
 
 all: $(PROG) $(HPTEST) module
 
-$(PROG): helloworld.c
-	$(CC) $(CFLAGS) -o $(PROG) helloworld.c monitor_user.c
+$(PROG): $(TDIR)/helloworld.c
+	@mkdir -p $(OUTPUT_DIR)
+	$(CC) $(CFLAGS) -o $(OUTPUT_DIR)/$(PROG) $(TDIR)/helloworld.c $(UDIR)/monitor_user.c
 
-$(HPTEST): hptest.c
-	$(CC) $(CFLAGS) -o $(HPTEST) hptest.c monitor_user.c
+$(HPTEST): $(TDIR)/hptest.c
+	@mkdir -p $(OUTPUT_DIR)
+	$(CC) $(CFLAGS) -o $(OUTPUT_DIR)/$(HPTEST) $(TDIR)/hptest.c $(UDIR)/monitor_user.c
 
 module:
-	make -C linux-5.17.15 M=$(PWD) modules
+	$(MAKE) -C $(KDIR) M=$(MDIR) modules
+
+# module:
+# make -C linux-5.17.15 M=$(PWD)/kernel modules
 
 clean:
-	rm -f $(PROG)
-	rm -f $(HPTEST)
-	make -C linux-5.17.15 M=$(PWD) clean
\ No newline at end of file
+	rm -f $(OUTPUT_DIR)/*
+	$(MAKE) -C $(KDIR) M=$(MDIR) clean
\ No newline at end of file
diff --git a/kernel/Makefile b/kernel/Makefile
new file mode 100644
index 0000000..71abe65
--- /dev/null
+++ b/kernel/Makefile
@@ -0,0 +1,4 @@
+# Kbuild reads this file via M=<this dir>; object paths are relative to it.
+KMOD = variable_monitor
+obj-m := $(KMOD).o
+$(KMOD)-objs := monitor_kernel.o
diff --git a/monitor_kernel.c b/kernel/monitor_kernel.c
index 38360d2..1d17d64 100644
--- a/monitor_kernel.c
+++ b/kernel/monitor_kernel.c
@@ -5,6 +5,7 @@
 #include <linux/module.h>
 
 #include "monitor_kernel_lib.c"
+#include "monitor_kernel_task.c"
 
 #define DEVICE_NAME "variable_monitor"
 
@@ -133,6 +134,11 @@ int init_module(void) {
   fn_kallsyms_lookup_name_init(); // init kallsyms_lookup_name
   LOOKUP_SYMS(stack_trace_save_tsk); // stack_trace_save_tsk
   LOOKUP_SYMS(show_stack); // show_stack
+  LOOKUP_SYMS(idle_sched_class); // idle_sched_class
+  LOOKUP_SYMS(access_remote_vm); // access_remote_vm
+
+  LOOKUP_SYMS_NORET(get_task_type); // get_task_type
+  LOOKUP_SYMS_NORET(kernfs_name); // kernfs_name
   return 0;
 }
 
diff --git a/monitor_kernel.h b/kernel/monitor_kernel.h
index 852d003..47e36d0 100644
--- a/monitor_kernel.h
+++ b/kernel/monitor_kernel.h
@@ -121,6 +121,14 @@ int fn_kallsyms_lookup_name_init(void); // init kallsyms_lookup_name
     } \
   } while (0)
 
+/* Resolve orig_<name> through kallsyms; a failed lookup is only logged. */
+#define LOOKUP_SYMS_NORET(name) \
+  do { \
+    orig_##name = (void *)diag_kallsyms_lookup_name(#name); \
+    if (!orig_##name) \
+      pr_err("kallsyms_lookup_name: %s\n", #name); \
+  } while (0)
+
 #define BACKTRACE_DEPTH 20 // max stack depth
 
 // LOOKUP_SYMS(stack_trace_save_tsk);
diff --git a/monitor_kernel_lib.c b/kernel/monitor_kernel_lib.c
index 3dea0cd..3dea0cd 100644
--- a/monitor_kernel_lib.c
+++ b/kernel/monitor_kernel_lib.c
diff --git a/kernel/monitor_kernel_task.c b/kernel/monitor_kernel_task.c
new file mode 100644
index 0000000..3b57152
--- /dev/null
+++ b/kernel/monitor_kernel_task.c
@@ -0,0 +1,433 @@
+#include "monitor_kernel_task.h"
+#include <asm/processor.h>
+#include <asm/ptrace.h>
+#include <asm/syscall.h> // for syscall_get_nr
+#include <linux/irq.h>
+#include <linux/sched/mm.h> // for get_task_mm
+#include <linux/syscalls.h>
+#include <linux/tracehook.h>
+
+/* Hard cap on frames walked per user stack, whatever the buffer size. */
+#define DIAG_MAX_USER_FRAMES 100
+
+/* Cursor for a stack walk: entries[0..nr_entries) are filled so far. */
+struct stack_trace {
+  unsigned int nr_entries, max_entries;
+  unsigned long *entries;
+  int skip; /* input argument: How many entries to skip */
+};
+
+/* Frame record read from a user stack at the frame pointer. */
+struct stack_frame_user {
+  const void __user *next_fp;
+  unsigned long ret_addr;
+};
+
+/* Task type from the looked-up get_task_type(); 0 if it was not resolved. */
+static inline int diag_get_task_type(struct task_struct *tsk) {
+  if (orig_get_task_type)
+    return orig_get_task_type(&tsk->se);
+  return 0;
+}
+
+/* Write the kernfs name of @cgrp into @buf; returns 0 if it cannot. */
+static inline int orig_diag_cgroup_name(struct cgroup *cgrp, char *buf,
+                                        size_t buflen) {
+  if (orig_kernfs_name && cgrp && cgrp->kn) {
+    return orig_kernfs_name(cgrp->kn, buf, buflen);
+  } else {
+    return 0;
+  }
+}
+
+/* Look up the mm_info stored under the @mm pointer; NULL if there is none. */
+static inline mm_info *find_mm_info(mm_tree *mm_tree, struct mm_struct *mm) {
+  if (!mm_tree || !mm)
+    return NULL;
+  return radix_tree_lookup(&mm_tree->mm_tree, (unsigned long)mm);
+}
+
+/*
+ * Zero @buf (@count bytes) and write the name of @tsk's cgroup into it.
+ * @cgroup selects the controller: 1 -> cpuset, any other value -> cpuacct.
+ */
+static void __diag_cgroup_name(struct task_struct *tsk, char *buf,
+                               unsigned int count, int cgroup) {
+  int cgroup_id = cpuacct_cgrp_id;
+
+  memset(buf, 0, count);
+
+  if (cgroup == 1) {
+    cgroup_id = cpuset_cgrp_id;
+  }
+
+  if (tsk && tsk->cgroups && tsk->cgroups->subsys &&
+      tsk->cgroups->subsys[cgroup_id] &&
+      tsk->cgroups->subsys[cgroup_id]->cgroup) {
+    orig_diag_cgroup_name(tsk->cgroups->subsys[cgroup_id]->cgroup, buf, count);
+  }
+}
+
+/* Thin wrapper around __diag_cgroup_name(). */
+static void diag_cgroup_name(struct task_struct *tsk, char *buf,
+                             unsigned int count, int cgroup) {
+  __diag_cgroup_name(tsk, buf, count, cgroup);
+}
+
+/*
+ * Read one frame record of the current task from user address @fp with page
+ * faults disabled. Returns 1 on success, 0 if the memory cannot be read.
+ */
+static int copy_stack_frame(const void __user *fp,
+                            struct stack_frame_user *frame) {
+  int ret;
+
+  /* @fp comes from a user register: never let it name a kernel address. */
+  if (!access_ok(fp, sizeof(*frame)))
+    return 0;
+
+  ret = 1;
+  pagefault_disable();
+  if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
+    ret = 0;
+  pagefault_enable();
+
+  return ret;
+}
+
+/*
+ * Read one frame record at @fp from the address space of @tsk.
+ * Returns 1 only when the whole record was read, 0 otherwise.
+ */
+static int copy_stack_frame_remote(struct task_struct *tsk,
+                                   const void __user *fp,
+                                   struct stack_frame_user *frame) {
+  int copied;
+  struct mm_struct *mm;
+
+  mm = get_task_mm(tsk);
+  if (!mm)
+    return 0;
+
+  copied =
+      orig_access_remote_vm(mm, (unsigned long)fp, frame, sizeof(*frame), 0);
+  mmput(mm);
+
+  /* The return value is a byte count; a short read leaves @frame partial. */
+  return copied == (int)sizeof(*frame);
+}
+
+/*
+ * Walk the frame-pointer chain of another task @tsk into @trace.
+ * Does nothing when called in atomic context or with IRQs disabled.
+ */
+static inline void save_stack_trace_user_remote(struct task_struct *tsk,
+                                                struct stack_trace *trace) {
+  const struct pt_regs *regs = task_pt_regs(tsk);
+  const void __user *fp = (const void __user *)regs->bp;
+  unsigned int count = 0;
+
+  if (in_atomic() || irqs_disabled()) {
+    return;
+  }
+
+  if (trace->nr_entries < trace->max_entries)
+    trace->entries[trace->nr_entries++] = regs->ip;
+
+  while (trace->nr_entries < trace->max_entries) {
+    struct stack_frame_user frame;
+
+    frame.next_fp = NULL;
+    frame.ret_addr = 0;
+
+    if (!copy_stack_frame_remote(tsk, fp, &frame)) {
+      break;
+    }
+
+    if ((unsigned long)fp < regs->sp)
+      break;
+
+    if (frame.ret_addr) {
+      trace->entries[trace->nr_entries++] = frame.ret_addr;
+    } else
+      break;
+
+    if (fp == frame.next_fp)
+      break;
+    fp = frame.next_fp;
+
+    count++;
+    /* A hard lockup was seen here in production, so force the loop to end. */
+    if (count >= trace->max_entries || count >= DIAG_MAX_USER_FRAMES)
+      break;
+  }
+}
+
+/* Walk the frame-pointer chain of the current task into @trace. */
+static inline void __save_stack_trace_user(struct stack_trace *trace) {
+  const struct pt_regs *regs = task_pt_regs(current);
+  const void __user *fp = (const void __user *)regs->bp;
+  unsigned int count = 0;
+
+  if (trace->nr_entries < trace->max_entries)
+    trace->entries[trace->nr_entries++] = regs->ip;
+
+  while (trace->nr_entries < trace->max_entries) {
+    struct stack_frame_user frame;
+
+    frame.next_fp = NULL;
+    frame.ret_addr = 0;
+    if (!copy_stack_frame(fp, &frame))
+      break;
+    if ((unsigned long)fp < regs->sp)
+      break;
+    if (frame.ret_addr) {
+      trace->entries[trace->nr_entries++] = frame.ret_addr;
+    }
+    if (fp == frame.next_fp)
+      break;
+    fp = frame.next_fp;
+    count++;
+    /* A hard lockup was seen here in production, so force the loop to end. */
+    if (count >= trace->max_entries || count >= DIAG_MAX_USER_FRAMES)
+      break;
+  }
+}
+
+/* Trace the current task's user stack and append a ULONG_MAX end marker. */
+void perfect_save_stack_trace_user(struct stack_trace *trace) {
+  /*
+   * Trace user stack if we are not a kernel thread
+   */
+  if (current->mm) {
+    __save_stack_trace_user(trace);
+  }
+  if (trace->nr_entries < trace->max_entries)
+    trace->entries[trace->nr_entries++] = ULONG_MAX;
+}
+
+/* Fill @backtrace (BACKTRACE_DEPTH2 slots) with the current user stack. */
+void diagnose_save_stack_trace_user(unsigned long *backtrace) {
+  struct stack_trace trace;
+
+  memset(&trace, 0, sizeof(trace));
+  memset(backtrace, 0, BACKTRACE_DEPTH2 * sizeof(unsigned long));
+  trace.max_entries = BACKTRACE_DEPTH2;
+  trace.entries = backtrace;
+  perfect_save_stack_trace_user(&trace);
+}
+
+/* Fill @backtrace (BACKTRACE_DEPTH2 slots) with the user stack of @tsk. */
+void diagnose_save_stack_trace_user_remote(struct task_struct *tsk,
+                                           unsigned long *backtrace) {
+  struct stack_trace trace;
+
+  memset(&trace, 0, sizeof(trace));
+  memset(backtrace, 0, BACKTRACE_DEPTH2 * sizeof(unsigned long));
+  trace.max_entries = BACKTRACE_DEPTH2;
+  trace.entries = backtrace;
+
+  /*
+   * Trace user stack if we are not a kernel thread
+   */
+  if (tsk->mm) {
+    save_stack_trace_user_remote(tsk, &trace);
+  }
+  if (trace.nr_entries < trace.max_entries)
+    trace.entries[trace.nr_entries++] = ULONG_MAX;
+}
+
+/*
+ * Fill @detail with a summary of @tsk: ids, state, comm, cgroup names and,
+ * for the current task only, user/kernel mode and syscall number.
+ * @detail is left zeroed when @tsk or its group leader is NULL or a zombie.
+ */
+void diag_task_brief(struct task_struct *tsk, task_detail *detail) {
+  struct pid_namespace *ns;
+  struct pt_regs *task_regs;
+  struct task_struct *leader;
+  struct pt_regs *irq_regs;
+
+  if (!detail)
+    return;
+
+  memset(detail, 0, sizeof(task_detail));
+
+  if (!tsk || tsk->exit_state == EXIT_ZOMBIE) // zombie
+    return;
+  leader = tsk->group_leader;
+  if (!leader || leader->exit_state == EXIT_ZOMBIE) {
+    return;
+  }
+
+  if (tsk != current) { // not current task
+    detail->user_mode = -1;
+    detail->syscallno = -1;
+  } else if (!tsk->mm) { // current task but kernel thread
+    detail->user_mode = 0;
+    detail->syscallno = -1;
+  } else { // current task and user thread
+    irq_regs = get_irq_regs(); // get current irq regs
+    task_regs = task_pt_regs(tsk);
+
+    if ((irq_regs && user_mode(irq_regs)) ||
+        (task_regs && user_mode(task_regs))) {
+      detail->user_mode = 1; // user mode
+    } else {
+      detail->user_mode = 0; // kernel mode
+    }
+
+    if (task_regs) {
+      detail->syscallno = syscall_get_nr(tsk, task_regs); // get syscall no
+    }
+  }
+
+  if (tsk->sched_class == orig_idle_sched_class) // idle task
+    detail->sys_task = 2;
+  else if (!tsk->mm) // kernel thread
+    detail->sys_task = 1;
+  else
+    detail->sys_task = 0;
+
+  detail->pid = tsk->pid; // pid
+  detail->tgid = tsk->tgid; // tgid
+  detail->state = tsk->__state; // state
+  detail->task_type = diag_get_task_type(tsk); // task type
+  ns = task_active_pid_ns(tsk); // container pid
+  if (ns && ns != &init_pid_ns) {
+    detail->container_pid = task_pid_nr_ns(tsk, ns);
+    detail->container_tgid = task_tgid_nr_ns(tsk, ns);
+  } else {
+    detail->container_pid = tsk->pid;
+    detail->container_tgid = tsk->tgid;
+  }
+  strncpy(detail->comm, tsk->comm, TASK_COMM_LEN);
+  detail->comm[TASK_COMM_LEN - 1] = 0; // comm name
+  diag_cgroup_name(tsk, detail->cgroup_buf, CGROUP_NAME_LEN, 0);
+  diag_cgroup_name(tsk, detail->cgroup_cpuset, CGROUP_NAME_LEN, 1);
+
+  detail->cgroup_buf[CGROUP_NAME_LEN - 1] = 0; // cgroup name
+  detail->cgroup_cpuset[CGROUP_NAME_LEN - 1] = 0; // cgroup cpuset name
+}
+
+/*
+ * Capture the saved user registers and the user-space backtrace of @tsk.
+ * Returns with stack[0] == 0 for kernel threads and dead thread groups.
+ */
+void diag_task_user_stack(struct task_struct *tsk, user_stack_detail *detail) {
+  struct pt_regs *regs;
+  struct task_struct *leader;
+
+  if (!detail)
+    return;
+
+  detail->stack[0] = 0;
+  if (!tsk || !tsk->mm)
+    return;
+
+  leader = tsk->group_leader;
+  if (!leader || !leader->mm || leader->exit_state == EXIT_ZOMBIE) {
+    return;
+  }
+
+  detail->sp = 0;
+  detail->ip = 0;
+  detail->bp = 0;
+  regs = task_pt_regs(tsk);
+  /* Without saved registers there is nothing to copy or to unwind from. */
+  if (!regs)
+    return;
+
+  detail->sp = regs->sp;
+#if defined(DIAG_ARM64)
+  detail->ip = regs->pc;
+  detail->bp = regs->sp;
+  detail->regs = regs->user_regs;
+#else
+  detail->ip = regs->ip;
+  detail->bp = regs->bp;
+  detail->regs = *regs;
+#endif
+
+  if (tsk == current) {
+    diagnose_save_stack_trace_user(detail->stack);
+  } else {
+    diagnose_save_stack_trace_user_remote(tsk, detail->stack);
+  }
+}
+
+/*
+ * Save the kernel stack of @tsk into @detail->stack; unused slots are zero.
+ */
+void diag_task_kern_stack(struct task_struct *tsk, kern_stack_detail *detail) {
+  if (!detail)
+    return;
+
+  /* Clear first so stale entries never follow a shorter trace. */
+  memset(detail->stack, 0, sizeof(detail->stack));
+  if (!tsk || !orig_stack_trace_save_tsk)
+    return;
+
+  orig_stack_trace_save_tsk(tsk, detail->stack, BACKTRACE_DEPTH2, 0);
+}
+
+/*
+ * Record the ancestry of @tsk (thread-group leader, then each real_parent)
+ * in @detail, at most PROCESS_CHAINS_COUNT levels. A level uses the argv
+ * cached in @mm_tree when present (full_argv = 1), else the task comm.
+ * With @style == 0 the chain is only cleared.
+ */
+void dump_proc_chains_argv(int style, struct task_struct *tsk, mm_tree *mm_tree,
+                           proc_chains_detail *detail) {
+  struct task_struct *walker;
+  mm_info *info;
+  int cnt = 0;
+  int i = 0;
+  struct task_struct *leader;
+
+  if (!detail)
+    return;
+
+  for (i = 0; i < PROCESS_CHAINS_COUNT; i++) {
+    detail->chains[i][0] = 0;
+    detail->tgid[i] = 0;
+    detail->full_argv[i] = 0; /* do not leave flags from an earlier dump */
+  }
+  if (style == 0)
+    return;
+
+  if (!tsk || !tsk->mm)
+    return;
+
+  leader = tsk->group_leader;
+  if (!leader || !leader->mm ||
+      leader->exit_state == EXIT_ZOMBIE) { // leader is zombie or no mm
+    return;
+  }
+
+  rcu_read_lock();
+  walker = tsk;
+
+  while (walker->pid > 0) {
+    if (!thread_group_leader(walker))
+      walker = rcu_dereference(walker->group_leader);
+    info = find_mm_info(mm_tree, walker->mm);
+    if (info) {
+      if (info->cgroup_buf[0] == 0)
+        diag_cgroup_name(walker, info->cgroup_buf, 255, 0);
+      strncpy(detail->chains[cnt], info->argv, PROCESS_ARGV_LEN);
+      /* argv[] is larger than chains[][]: strncpy may not terminate it. */
+      detail->chains[cnt][PROCESS_ARGV_LEN - 1] = 0;
+      detail->full_argv[cnt] = 1;
+    } else {
+      strncpy(detail->chains[cnt], walker->comm, TASK_COMM_LEN);
+      detail->full_argv[cnt] = 0;
+    }
+    detail->tgid[cnt] = walker->pid;
+    walker = rcu_dereference(walker->real_parent);
+    cnt++;
+    if (cnt >= PROCESS_CHAINS_COUNT)
+      break;
+  }
+  rcu_read_unlock();
+}
\ No newline at end of file
diff --git a/kernel/monitor_kernel_task.h b/kernel/monitor_kernel_task.h
new file mode 100644
index 0000000..62e501c
--- /dev/null
+++ b/kernel/monitor_kernel_task.h
@@ -0,0 +1,109 @@
+#ifndef MONITOR_KERNEL_TASK_H
+#define MONITOR_KERNEL_TASK_H
+
+#include <asm/ptrace.h>       // struct pt_regs, embedded by value below
+#include <linux/kernfs.h>
+#include <linux/radix-tree.h> // struct radix_tree_root
+#include <linux/sched.h>
+#include <linux/spinlock.h>   // spinlock_t
+#include <linux/types.h>      // struct rcu_head, pid_t
+
+#define CGROUP_NAME_LEN 32 // max length of cgroup name
+#define TASK_COMM_LEN 16 // max length of task name
+
+#define BACKTRACE_DEPTH2 30 // max depth of backtrace
+
+#define PROCESS_CHAINS_COUNT 10 // max count of process chains
+#define PROCESS_ARGV_LEN 128 // max length of process argv
+
+// from
+// https://github.com/alibaba/diagnose-tools/blob/8cd905a1c17f2201e460a2d607413a1303757a32/SOURCE/uapi/ali_diagnose.h
+
+// brief description of one task, filled by diag_task_brief()
+typedef struct {
+  char cgroup_buf[CGROUP_NAME_LEN];
+  char cgroup_cpuset[CGROUP_NAME_LEN];
+  int pid;
+  int tgid;
+  int container_pid;
+  int container_tgid;
+  long state;
+  int task_type;
+  unsigned long syscallno;
+  /**
+   * 0->user 1->sys 2->idle
+   */
+  unsigned long sys_task;
+  /**
+   * 1->user mode 0->sys mode -1->unknown
+   */
+  unsigned long user_mode;
+  char comm[TASK_COMM_LEN];
+} task_detail;
+
+typedef struct {
+  unsigned long stack[BACKTRACE_DEPTH2];
+} kern_stack_detail;
+
+typedef struct {
+  struct pt_regs regs;
+  unsigned long ip;
+  unsigned long bp;
+  unsigned long sp;
+  unsigned long stack[BACKTRACE_DEPTH2];
+} user_stack_detail;
+
+typedef struct {
+  unsigned int full_argv[PROCESS_CHAINS_COUNT]; // 1: chains[i] is argv, 0: comm
+  char chains[PROCESS_CHAINS_COUNT][PROCESS_ARGV_LEN]; // process chains argv
+  unsigned int tgid[PROCESS_CHAINS_COUNT]; // process chains tgid
+} proc_chains_detail;
+
+// most important struct
+typedef struct {
+  int et_type;
+  unsigned long id;
+  unsigned long long tv;
+  task_detail task; // brief
+  user_stack_detail user_stack; // user stack
+  kern_stack_detail kern_stack; // kernel stack
+  proc_chains_detail proc_chains; // process chains argv
+} variable_monitor_task;
+
+typedef struct {
+  struct rcu_head rcu_head;
+  pid_t pid;
+  struct mm_struct *mm;
+  char cgroup_buf[256];
+  char argv[256];
+} mm_info;
+
+typedef struct {
+  struct radix_tree_root mm_tree;
+  spinlock_t mm_tree_lock;
+} mm_tree;
+
+void diag_task_brief(struct task_struct *tsk,
+                     task_detail *detail); // get task brief
+void diag_task_user_stack(struct task_struct *tsk,
+                          user_stack_detail *detail); // get task user stack
+void diag_task_kern_stack(struct task_struct *tsk,
+                          kern_stack_detail *detail); // get task kernel stack
+void dump_proc_chains_argv(
+    int style, struct task_struct *tsk, mm_tree *mm_tree,
+    proc_chains_detail *detail); // get process chains argv
+
+// orig_X: kernel symbols resolved at module init through kallsyms.
+// NOTE(review): these are definitions, not declarations, so this header can
+// only be included from a single translation unit.
+struct sched_class *orig_idle_sched_class;
+int (*orig_get_task_type)(struct sched_entity *se);
+int (*orig_kernfs_name)(struct kernfs_node *kn, char *buf, size_t buflen);
+int (*orig_access_remote_vm)(struct mm_struct *mm, unsigned long addr,
+                             void *buf, int len, unsigned int gup_flags);
+extern unsigned int (*orig_stack_trace_save_tsk)(struct task_struct *task,
+                                                 unsigned long *store,
+                                                 unsigned int size,
+                                                 unsigned int skipnr);
+
+#endif /* MONITOR_KERNEL_TASK_H */
diff --git a/helloworld.c b/testcase/helloworld.c
index 1416d5a..554b258 100644
--- a/helloworld.c
+++ b/testcase/helloworld.c
@@ -1,4 +1,4 @@
-#include "monitor_user.h"
+#include "../user/monitor_user.h"
 #include <stdio.h>
 #include <unistd.h>
 #include <string.h>
diff --git a/hptest.c b/testcase/hptest.c
index 4a7e494..ab2f0a8 100644
--- a/hptest.c
+++ b/testcase/hptest.c
@@ -1,4 +1,4 @@
-#include "monitor_user.h"
+#include "../user/monitor_user.h"
 #include <fcntl.h>
 #include <stdio.h>
 #include <sys/mman.h>
diff --git a/monitor_user.c b/user/monitor_user.c
index 91da839..91da839 100644
--- a/monitor_user.c
+++ b/user/monitor_user.c
diff --git a/monitor_user.h b/user/monitor_user.h
index f4d9df1..f4d9df1 100644
--- a/monitor_user.h
+++ b/user/monitor_user.h
|
