summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorzy <[email protected]>2023-11-16 13:11:45 +0800
committerzy <[email protected]>2023-11-16 13:11:45 +0800
commit4c63686513c0cce1b64f7aca2d6a9f2a2a379e98 (patch)
treed6e6d5c09c01e0d1f539cd1941f1e65bbe36c7e0 /kernel
parentb0365d12e761d268e47881c4a218681e78da3221 (diff)
Reorganize the source codeHEADmaster
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile3
-rw-r--r--kernel/monitor_kernel.c157
-rw-r--r--kernel/monitor_kernel.h179
-rw-r--r--kernel/monitor_kernel_lib.c427
-rw-r--r--kernel/monitor_kernel_task.c377
-rw-r--r--kernel/monitor_kernel_task.h98
6 files changed, 1241 insertions, 0 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
new file mode 100644
index 0000000..71abe65
--- /dev/null
+++ b/kernel/Makefile
@@ -0,0 +1,3 @@
+# KMOD = variable_monitor
+# obj-m := kernel/$(KMOD).o
+# $(KMOD)-objs := kernel/monitor_kernel.o
diff --git a/kernel/monitor_kernel.c b/kernel/monitor_kernel.c
new file mode 100644
index 0000000..1d17d64
--- /dev/null
+++ b/kernel/monitor_kernel.c
@@ -0,0 +1,157 @@
+#include <linux/device.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+
+#include "monitor_kernel_lib.c"
+#include "monitor_kernel_task.c"
+
+#define DEVICE_NAME "variable_monitor"
+
+// for character device
+static dev_t dev_num;
+static struct cdev *watch_cdev;
+static struct class *watch_class;
+
+struct my_device_data {
+ pid_t pid;
+};
+
+static int device_open(struct inode *inode, struct file *file) {
+ struct my_device_data *data;
+ printk(KERN_INFO "%s: with pid %d\n", __FUNCTION__, current->pid);
+ // save pid
+ data = kmalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+ data->pid = current->pid;
+ file->private_data = data;
+ return 0;
+}
+
+static int device_release(struct inode *inode, struct file *file) {
+ // printk(KERN_INFO "%s\n", __FUNCTION__);
+ // load pid
+ struct my_device_data *data = file->private_data;
+ // clear watch with pid
+ clear_watch(data->pid);
+ kfree(data); // free data memory
+ return 0;
+}
+
+static long device_ioctl(struct file *file, unsigned int ioctl_num,
+ unsigned long ioctl_param) {
+ watch_arg warg;
+ void *kptr;
+ kernel_watch_timer *timer = NULL;
+ kernel_watch_arg k_watch_arg;
+ // copy watch_arg
+ if (copy_from_user(&warg, (watch_arg *)ioctl_param, sizeof(warg))) {
+ return -EACCES;
+ }
+
+ printk(KERN_INFO "Watch_arg: task_id=%d, name=%s, ptr=%p, length_byte=%d, "
+ "time_ns=%ld, threshold=%lld\n",
+ warg.task_id, warg.name, warg.ptr, warg.length_byte, warg.time_ns,
+ warg.threshold);
+ // user space address to kernel space address
+ kptr = convert_user_space_ptr(warg.task_id, (unsigned long)warg.ptr);
+ if (kptr == NULL) {
+ printk(KERN_ERR "Cannot access user space\n");
+ return -EACCES;
+ }
+ // check length
+ if (warg.length_byte != 1 && warg.length_byte != 2 && warg.length_byte != 4 &&
+ warg.length_byte != 8) {
+ printk(KERN_ERR "Invalid length %d\n", warg.length_byte);
+ return -EINVAL;
+ }
+ // k_watch_arg init
+ w_arg2k_w_arg(kptr, warg, &k_watch_arg);
+ timer = get_timer(warg.time_ns); // get a valuable timer
+
+ printk(KERN_INFO "ptr transform kptr: %p\n", kptr);
+ printk(KERN_INFO "timer: %p\n", timer);
+ printk(KERN_INFO "timer->sentinel: %d, timer->time_ns: %lld\n",
+ timer->sentinel, timer->time_ns);
+ printk(KERN_INFO "timer->hr_timer: %p\n", &timer->hr_timer);
+
+ TIMER_CANCEL(timer); // just in case
+ timer_add_watch(timer, k_watch_arg);
+ TIMER_START(timer);
+
+ printk(KERN_INFO "Start watching var: %s\n", warg.name);
+ return 0;
+}
+
+static struct file_operations fops = {
+ .open = device_open,
+ .release = device_release,
+ .unlocked_ioctl = device_ioctl,
+};
+
+int init_module(void) {
+ printk(KERN_INFO "%s\n", __FUNCTION__);
+ if (alloc_chrdev_region(&dev_num, 0, 1, DEVICE_NAME) < 0) {
+ printk(KERN_ALERT "Failed to register device number\n");
+ return -1;
+ }
+
+ if ((watch_cdev = cdev_alloc()) == NULL) {
+ printk(KERN_ALERT "Failed to allocate cdev structure\n");
+ unregister_chrdev_region(dev_num, 1);
+ return -1;
+ }
+
+ cdev_init(watch_cdev, &fops);
+ if (cdev_add(watch_cdev, dev_num, 1) == -1) {
+ printk(KERN_ALERT "Failed to add cdev structure\n");
+ device_destroy(watch_class, dev_num);
+ class_destroy(watch_class);
+ unregister_chrdev_region(dev_num, 1);
+ return -1;
+ }
+
+ if ((watch_class = class_create(THIS_MODULE, DEVICE_NAME)) == NULL) {
+ printk(KERN_ALERT "Failed to create class\n");
+ cdev_del(watch_cdev);
+ unregister_chrdev_region(dev_num, 1);
+ return -1;
+ }
+
+ if (device_create(watch_class, NULL, dev_num, NULL, DEVICE_NAME) == NULL) {
+ printk(KERN_ALERT "Failed to create device\n");
+ class_destroy(watch_class);
+ cdev_del(watch_cdev);
+ unregister_chrdev_region(dev_num, 1);
+ return -1;
+ }
+
+ printk(KERN_INFO "dev number: %d\n", dev_num);
+ printk(KERN_INFO "path: /dev/%s %d\n", DEVICE_NAME, dev_num);
+
+ fn_kallsyms_lookup_name_init(); // init kallsyms_lookup_name
+ LOOKUP_SYMS(stack_trace_save_tsk); // stack_trace_save_tsk
+ LOOKUP_SYMS(show_stack); // show_stack
+ LOOKUP_SYMS(idle_sched_class); // idle_sched_class
+ LOOKUP_SYMS(access_remote_vm); // access_remote_vm
+
+ LOOKUP_SYMS_NORET(get_task_type); // get_task_type
+ LOOKUP_SYMS_NORET(kernfs_name); // kernfs_name
+
+ return 0;
+}
+
+void cleanup_module(void) {
+ printk(KERN_INFO "%s\n", __FUNCTION__);
+ // clear all timer and page list
+ clear_all_watch();
+ // unmount
+ device_destroy(watch_class, dev_num);
+ class_destroy(watch_class);
+ cdev_del(watch_cdev);
+ unregister_chrdev_region(dev_num, 1);
+}
+
+MODULE_LICENSE("GPL"); \ No newline at end of file
diff --git a/kernel/monitor_kernel.h b/kernel/monitor_kernel.h
new file mode 100644
index 0000000..47e36d0
--- /dev/null
+++ b/kernel/monitor_kernel.h
@@ -0,0 +1,179 @@
+#include <linux/hrtimer.h>
+#include <linux/kprobes.h>
+#include <linux/ktime.h>
+#include <linux/list.h>
+#include <linux/slab.h> /* for kmalloc */
+#include <linux/string.h>
+
+#include <asm/uaccess.h>
+#include <linux/cdev.h>
+#include <linux/highmem.h>
+#include <linux/sched.h>
+#include <linux/sched/loadavg.h> /* for avenrun, LOAD_* */
+#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/stacktrace.h> /* for stack_trace_print */
+
+#define MAX_TIMER_NUM (128) // max timer number
+#define TIMER_MAX_WATCH_NUM (32) // A timer max watch number at once time
+#define MAX_NAME_LEN (15) // max name length
+typedef struct {
+ pid_t task_id; // current process id
+ char name[MAX_NAME_LEN + 1]; // name
+ void *ptr; // virtual address
+ int length_byte; // byte
+ long long threshold; // threshold value
+ unsigned char unsigned_flag; // unsigned flag (true: unsigned, false: signed)
+ unsigned char greater_flag; // reverse flag (true: >, false: <)
+ unsigned long time_ns; // timer interval (ns)
+} watch_arg;
+
+typedef struct {
+ pid_t task_id; // current process id
+ char name[MAX_NAME_LEN + 2]; // name, last char automatically add '\0'
+ void *kptr; // kernel address + offset
+ int length_byte; // byte
+ long long threshold; // threshold value
+ unsigned char unsigned_flag; // unsigned flag (true: unsigned, false: signed)
+ unsigned char greater_flag; // reverse flag (true: >, false: <)
+} kernel_watch_arg;
+
+typedef struct {
+ unsigned long long time_ns; // hrTimer time interval (ns)
+ struct hrtimer hr_timer; // hrTimer
+ ktime_t kt; // hrTimer time
+ unsigned sentinel; // sentinel
+ kernel_watch_arg
+ k_watch_args[TIMER_MAX_WATCH_NUM]; // all watched kernel_watch_arg
+} kernel_watch_timer;
+
+#define TIMER_FILLED(timer) ((timer)->sentinel >= TIMER_MAX_WATCH_NUM)
+#define TIMER_EMPTY(timer) (!((timer)->time_ns | (timer)->sentinel))
+#define TIMER_NO_KWARG(timer) ((timer)->sentinel == 0)
+
+#define TIMER_START(timer) \
+ (hrtimer_start(&timer->hr_timer, timer->kt, HRTIMER_MODE_REL))
+#define TIMER_CANCEL(timer) (hrtimer_cancel(&timer->hr_timer))
+
+kernel_watch_timer kernel_wtimer_list[MAX_TIMER_NUM] = {
+ 0}; // all kernel_watch_timer
+int kernel_wtimer_num = 0; // current kernel_watch_timer number
+
+EXPORT_SYMBOL(kernel_wtimer_list); // export kernel_watch_timer_list
+EXPORT_SYMBOL(kernel_wtimer_num); // export kernel_watch_timer_num
+
+// Helper function
+unsigned char w_arg2k_w_arg(void *ptr, watch_arg warg,
+ kernel_watch_arg *k_watch_arg);
+
+// for timer
+kernel_watch_timer *get_timer(unsigned long long time_ns);
+unsigned char timer_add_watch(kernel_watch_timer *timer,
+ kernel_watch_arg k_watch_arg);
+unsigned char timer_del_watch_by_pid(kernel_watch_timer *timer, pid_t pid);
+
+// for memory access
+typedef struct {
+ pid_t task_id; // current process id
+ struct page *page;
+ void *kaddr;
+ struct list_head entry;
+} watch_local_memory;
+
+static LIST_HEAD(watch_local_memory_list);
+
+void free_page_list(pid_t task_id);
+void free_all_page_list(void);
+
+// static struct page *page = NULL;
+// static void *kaddr = NULL;
+
+void *convert_user_space_ptr(pid_t pid, unsigned long kaddr);
+
+// for timer
+// #define US2NS (1000) // Interval in microseconds
+// static struct hrtimer hr_timer;
+// static ktime_t kt;
+
+// hrTimer
+enum hrtimer_restart check_variable_cb(struct hrtimer *timer);
+void start_all_hrTimer(void);
+void cancel_all_hrTimer(void);
+
+unsigned char read_and_compare(kernel_watch_arg *k_arg);
+
+// for diag_kallsyms_lookup_name
+unsigned long (*diag_kallsyms_lookup_name)(const char *name);
+static struct kprobe kprobe_kallsyms_lookup_name = {.symbol_name =
+ "kallsyms_lookup_name"};
+
+int fn_kallsyms_lookup_name_init(void); // init kallsyms_lookup_name
+
+// form
+// https://github.com/alibaba/diagnose-tools/blob/8cd905a1c17f2201e460a2d607413a1303757a32/SOURCE/module/internal.h#L65
+// look for current function address, all the function with prefix "orig_" are
+#define LOOKUP_SYMS(name) \
+ do { \
+ orig_##name = (void *)diag_kallsyms_lookup_name(#name); \
+ if (!orig_##name) { \
+ printk(KERN_ERR "kallsyms_lookup_name: %s\n", #name); \
+ return -EINVAL; \
+ } \
+ } while (0)
+
+#define LOOKUP_SYMS_NORET(name) \
+ do { \
+ orig_##name = (void *)diag_kallsyms_lookup_name(#name); \
+ if (!orig_##name) \
+ pr_err("kallsyms_lookup_name: %s\n", #name); \
+ } while (0)
+
+#define BACKTRACE_DEPTH 20 // max stack depth
+
+// LOOKUP_SYMS(stack_trace_save_tsk);
+unsigned int (*orig_stack_trace_save_tsk)(struct task_struct *task,
+ unsigned long *store,
+ unsigned int size,
+ unsigned int skipnr);
+// LOOKUP_SYMS(show_stack);
+void (*orig_show_stack)(struct task_struct *task, unsigned long *sp,
+ const char *loglvl);
+
+// https://www.spinics.net/lists/kernel/msg3582022.html
+// remove from 5.8.rc3,but it still work
+// whether the task contributes to the load
+#define __task_contributes_to_load(task) \
+ ((READ_ONCE(task->__state) & TASK_UNINTERRUPTIBLE) != 0 && \
+ (task->flags & PF_FROZEN) == 0 && \
+ (READ_ONCE(task->__state) & TASK_NOLOAD) == 0)
+
+/// @brief print all task stack
+/// @param
+static void print_task_stack(void) {
+ struct task_struct *g, *p; // g: task group; p: task
+ unsigned long backtrace[BACKTRACE_DEPTH]; // save stack
+ unsigned int nr_bt; // stack depth
+ unsigned long long current_time; // last time
+ current_time = ktime_get_real();
+ printk("Timestamp (ns): %lld\n", current_time);
+ printk("Recent Load: %lu.%02lu, %lu.%02lu, %lu.%02lu\n", // recent load
+ LOAD_INT(avenrun[0]), LOAD_FRAC(avenrun[0]), LOAD_INT(avenrun[1]),
+ LOAD_FRAC(avenrun[1]), LOAD_INT(avenrun[2]), LOAD_FRAC(avenrun[2]));
+ rcu_read_lock(); // lock run queue
+ // printk("Running task\n");
+ do_each_thread(g, p) {
+ if (p->__state == TASK_RUNNING || __task_contributes_to_load(p) ||
+ p->__state == TASK_IDLE) {
+ printk("task: %s, pid %d, state %d\n", p->comm, p->pid,
+ p->__state); //! todo
+ nr_bt = orig_stack_trace_save_tsk(p, backtrace, BACKTRACE_DEPTH, 0);
+ stack_trace_print(backtrace, nr_bt, 0); // print
+ }
+ }
+ while_each_thread(g, p);
+ rcu_read_unlock(); // unlock run queue
+}
+
+unsigned char del_all_kwarg_by_pid(pid_t pid);
+void clear_watch(pid_t pid);
+void clear_all_watch(void); \ No newline at end of file
diff --git a/kernel/monitor_kernel_lib.c b/kernel/monitor_kernel_lib.c
new file mode 100644
index 0000000..3dea0cd
--- /dev/null
+++ b/kernel/monitor_kernel_lib.c
@@ -0,0 +1,427 @@
+#include "monitor_kernel.h"
+
+unsigned char w_arg2k_w_arg(void *ptr, watch_arg warg,
+ kernel_watch_arg *k_watch_arg) {
+ // k_watch_arg init
+ k_watch_arg->task_id = warg.task_id;
+ strncpy(k_watch_arg->name, warg.name, MAX_NAME_LEN + 1); // name
+ k_watch_arg->name[MAX_NAME_LEN + 1] = '\0'; // just in case
+ k_watch_arg->kptr = ptr;
+ k_watch_arg->length_byte = warg.length_byte;
+ k_watch_arg->threshold = warg.threshold;
+ k_watch_arg->unsigned_flag = warg.unsigned_flag;
+ k_watch_arg->greater_flag = warg.greater_flag;
+ return 0;
+}
+
+/// @brief get a valuable timer
+/// @param time_ns
+/// @return kernel_watch_timer *, NULL means fail
+kernel_watch_timer *get_timer(unsigned long long time_ns) {
+ int i = 0;
+ kernel_watch_timer *timer = NULL;
+ // chose a timer
+ for (i = 0; i < kernel_wtimer_num; i++) {
+ timer = &kernel_wtimer_list[i];
+
+ if (TIMER_EMPTY(timer)) {
+ break;
+ }
+ if ((timer->time_ns == time_ns) && (!TIMER_FILLED(timer))) {
+ break;
+ }
+ }
+ // if all timer is full
+ if (i >= MAX_TIMER_NUM) {
+ printk(KERN_ERR "No timer available\n");
+ return NULL;
+ }
+ // if a new timer, init it
+ if (i > kernel_wtimer_num - 1) {
+ printk(KERN_INFO "New timer\n");
+
+ kernel_wtimer_list[i].time_ns = time_ns;
+ kernel_wtimer_list[i].sentinel = 0;
+
+ kernel_wtimer_list[i].kt = ktime_set(0, (unsigned long)time_ns); // ns
+ // CLOCK_MONOTONIC: time since boot | HRTIMER_MODE_REL : relative time
+ hrtimer_init(&(kernel_wtimer_list[i].hr_timer), CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL);
+ kernel_wtimer_list[i].hr_timer.function =
+ check_variable_cb; // callback function
+
+ kernel_wtimer_num = i + 1;
+ }
+ printk(KERN_INFO "now, we have %d timers\n", kernel_wtimer_num);
+ return &kernel_wtimer_list[i];
+}
+
+/// @brief hrTimer add watch
+/// @param timer
+/// @param k_watch_arg
+/// @return 0 is success
+unsigned char timer_add_watch(kernel_watch_timer *timer,
+ kernel_watch_arg k_watch_arg) {
+ if (TIMER_FILLED(timer)) {
+ printk(KERN_ERR "Timer is full\n");
+ return -1;
+ }
+ memcpy(&timer->k_watch_args[timer->sentinel], &k_watch_arg,
+ sizeof(k_watch_arg));
+ // timer->k_watch_args[timer->sentinel] = k_watch_arg;
+ timer->sentinel++;
+ return 0;
+}
+
+unsigned char timer_del_watch_by_pid(kernel_watch_timer *timer, pid_t pid) {
+ int i = 0;
+ for (i = 0; i < timer->sentinel; i++) {
+ // if pid match, delete it and move the last one to this position, check
+ // again
+ if (timer->k_watch_args[i].task_id == pid) {
+ if (i != timer->sentinel - 1) {
+ memcpy(&timer->k_watch_args[i],
+ &timer->k_watch_args[timer->sentinel - 1],
+ sizeof(kernel_watch_arg));
+ }
+ timer->sentinel--;
+ i--;
+ }
+ }
+ return 0;
+}
+
+/// @brief transfer user space address to kernel space address
+/// change static global "kaddr" and "page" value
+/// @param pid: process id
+/// @param kaddr: user space address
+/// @return kernel space address + offset
+void *convert_user_space_ptr(pid_t pid, unsigned long addr) {
+ struct task_struct *task;
+ struct mm_struct *mm;
+ int ret;
+
+ // unsigned long aligned_addr = 0;
+ // unsigned long offset = 0;
+
+ watch_local_memory *node;
+
+ // if (addr < TASK_SIZE || addr > -PAGE_SIZE)
+ // {
+ // printk(KERN_ERR "Invalid address\n");
+ // return NULL;
+ // }
+
+ // for get_user_pages_remote
+ unsigned long aligned_addr = addr & PAGE_MASK;
+ unsigned long offset = addr & ~PAGE_MASK;
+
+ printk(KERN_INFO "%s\n", __FUNCTION__);
+
+ node = kmalloc(sizeof(watch_local_memory), GFP_KERNEL);
+ node->task_id = pid;
+
+ // Find the task with pid
+ rcu_read_lock();
+ task = pid_task(find_vpid(pid), PIDTYPE_PID);
+ rcu_read_unlock();
+
+ if (!task) {
+ printk(KERN_ERR "Cannot find task for PID %d\n", pid);
+ kfree(node); // careful there is kfree
+ return NULL;
+ }
+ // Get memory descriptor
+ mm = get_task_mm(task);
+ if (!mm) {
+ printk(KERN_ERR "Cannot get memory descriptor\n");
+ kfree(node); // careful there is kfree
+ return NULL;
+ }
+ down_read(&task->mm->mmap_lock);
+ ret = get_user_pages_remote(task->mm, aligned_addr, 1, FOLL_FORCE,
+ &(node->page), NULL, NULL);
+ up_read(&task->mm->mmap_lock);
+
+ if (ret != 1) {
+ printk(KERN_ERR "Cannot get user page\n");
+ kfree(node); // careful there is kfree
+ return NULL;
+ }
+ // Map the page to kernel space
+ node->kaddr = kmap(node->page);
+ list_add_tail(&node->entry, &watch_local_memory_list); // add to list
+ // printk(KERN_INFO "node->kaddr: %p, aligned_addr: %ld, offset: %ld\n",
+ // node->kaddr, aligned_addr, offset);
+ return (void *)((unsigned long)(node->kaddr) + offset);
+}
+
+/// @brief free page in watch_local_memory_list with task_id
+/// @param task_id
+void free_page_list(pid_t task_id) {
+ watch_local_memory *node, *next;
+ list_for_each_entry_safe(node, next, &watch_local_memory_list, entry) {
+ if (node == NULL)
+ break;
+ if (node->task_id == task_id) {
+ // unmap and release the page
+ if (node->kaddr)
+ kunmap(node->kaddr);
+ if (node->page)
+ put_page(node->page);
+ list_del(&node->entry);
+ kfree(node); // careful there is kfree
+ }
+ }
+}
+
+/// @brief free all page in watch_local_memory_list
+/// @param
+void free_all_page_list(void) {
+ watch_local_memory *node, *next;
+ list_for_each_entry_safe(node, next, &watch_local_memory_list, entry) {
+ if (node == NULL)
+ break;
+ // unmap and release the page
+ if (node->kaddr)
+ kunmap(node->kaddr);
+ if (node->page)
+ put_page(node->page);
+ list_del(&node->entry);
+ kfree(node); // careful there is kfree
+ }
+}
+
+/// @brief hrTimer handler
+enum hrtimer_restart check_variable_cb(struct hrtimer *timer) {
+ kernel_watch_timer *k_watch_timer =
+ container_of(timer, kernel_watch_timer, hr_timer);
+ int i = 0, j = 0;
+ int buffer[TIMER_MAX_WATCH_NUM]; // Buffer to store the messages
+
+ // check all watched kernel_watch_arg
+ for (i = 0; i < k_watch_timer->sentinel; i++) {
+ if (read_and_compare(&k_watch_timer->k_watch_args[i])) {
+ // snprintf(buffer + strlen(buffer), sizeof(buffer) - strlen(buffer), "
+ // name: %s, threshold: %lld, pid: %d\n",
+ // k_watch_timer->k_watch_args[i].name,
+ // k_watch_timer->k_watch_args[i].threshold,
+ // k_watch_timer->k_watch_args[i].task_id);
+ buffer[j] = i;
+ j++;
+
+ // printk(KERN_INFO "j: name %s, threshold: %lld\n",
+ // k_watch_timer->k_watch_args[i].name,
+ // k_watch_timer->k_watch_args[i].threshold);
+ // printk(KERN_INFO "j: %d\n", j);
+ }
+ }
+ if (j > 0) // if any threshold reached
+ {
+ printk("-------------------------------------\n");
+ printk("-------------watch monitor-----------\n");
+ printk("Threshold reached:\n");
+
+ for (i = 0; i < j; i++) {
+ printk(" name: %s, threshold: %lld, pid: %d\n",
+ k_watch_timer->k_watch_args[buffer[i]].name, //! todo
+ k_watch_timer->k_watch_args[buffer[i]].threshold,
+ k_watch_timer->k_watch_args[buffer[i]].task_id);
+ }
+ print_task_stack();
+ // restart timer after 1s
+ hrtimer_forward(timer, timer->base->get_time(), ktime_set(1, 0)); //! todo
+ printk("-------------------------------------\n");
+ } else {
+ // keep frequency
+ hrtimer_forward(timer, timer->base->get_time(), k_watch_timer->kt);
+ }
+ return HRTIMER_RESTART; // restart timer
+}
+
+/// @brief start hrTimer
+/// @param timeout: timeout in us
+/// @return 0 is success
+// int start_hrTimer(unsigned long timeout)
+// {
+// printk("HrTimer Start\n");
+
+// kt = ktime_set(0, (unsigned long)timeout); // us -> ns
+// // CLOCK_MONOTONIC: time since boot | HRTIMER_MODE_REL : relative time
+// hrtimer_init(&hr_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+// hr_timer.function = check_variable_cb;
+// // mode the same as hrtimer_init
+// hrtimer_start(&hr_timer, kt, HRTIMER_MODE_REL);
+// return 0;
+// }
+
+/// @brief start all hrTimer
+/// @param
+void start_all_hrTimer(void) {
+ int i = 0;
+ kernel_watch_timer *timer = NULL;
+ for (i = 0; i < kernel_wtimer_num; i++) {
+ timer = &(kernel_wtimer_list[i]);
+ TIMER_START(timer);
+ }
+ printk("HrTimer start,module keep %d hrtimer for now\n", kernel_wtimer_num);
+}
+
+/// @brief cancel hrTimer
+/// @param
+void cancel_all_hrTimer(void) {
+ int i = 0;
+ kernel_watch_timer *timer = NULL;
+ for (i = 0; i < kernel_wtimer_num; i++) {
+ timer = &(kernel_wtimer_list[i]);
+ TIMER_CANCEL(timer);
+ }
+
+ printk("HrTimer cancel,module keep %d hrtimer for now\n", kernel_wtimer_num);
+}
+
+// for read_and_compare
+typedef unsigned char (*compare_func)(void *, long long);
+
+unsigned char compare_1_byte_signed(void *ptr, long long threshold) {
+ // printk("compare_1_byte_signed: value %d, biss: %lld\n", *(char *)ptr,
+ // threshold);
+ return *(char *)ptr > threshold;
+}
+unsigned char compare_1_byte_unsigned(void *ptr, long long threshold) {
+ // printk("compare_1_byte_unsigned: value %d, biss: %lld\n", *(unsigned char
+ // *)ptr, threshold);
+ return *(unsigned char *)ptr > threshold;
+}
+unsigned char compare_2_byte_signed(void *ptr, long long threshold) {
+ // printk("compare_2_byte_signed: value %d, biss: %lld\n", *(short int *)ptr,
+ // threshold);
+ return *(short int *)ptr > threshold;
+}
+unsigned char compare_2_byte_unsigned(void *ptr, long long threshold) {
+ // printk("compare_2_byte_unsigned: value %d, biss: %lld\n", *(unsigned short
+ // int *)ptr, threshold);
+ return *(unsigned short int *)ptr > threshold;
+}
+unsigned char compare_4_byte_signed(void *ptr, long long threshold) {
+ // printk("compare_4_byte_signed: value %d, biss: %lld\n", *(int *)ptr,
+ // threshold);
+ return *(int *)ptr > threshold;
+}
+unsigned char compare_4_byte_unsigned(void *ptr, long long threshold) {
+ // printk("compare_4_byte_unsigned: value %d, biss: %lld\n", *(unsigned int
+ // *)ptr, threshold);
+ return *(unsigned int *)ptr > threshold;
+}
+unsigned char compare_8_byte_signed(void *ptr, long long threshold) {
+ // printk("compare_8_byte_signed: value %lld, biss: %lld\n", *(long long
+ // *)ptr, threshold);
+ return *(long long *)ptr > threshold;
+}
+unsigned char compare_8_byte_unsigned(void *ptr, long long threshold) {
+ // printk("compare_8_byte_unsigned: value %lld, biss: %lld\n", *(unsigned long
+ // long *)ptr, threshold);
+ return *(unsigned long long *)ptr > threshold;
+}
+// list of compare functions
+static compare_func compare_funcs[8] = {
+ compare_1_byte_signed, compare_2_byte_signed, compare_4_byte_signed,
+ compare_8_byte_signed, compare_1_byte_unsigned, compare_2_byte_unsigned,
+ compare_4_byte_unsigned, compare_8_byte_unsigned};
+
+static int func_indices[2][9] = {{0, 0, 1, 0, 2, 0, 0, 0, 3},
+ {0, 4, 5, 0, 6, 0, 0, 0, 7}};
+
+/// @brief read k_arg->kptr and compare with threshold
+/// @param k_arg
+/// @return result of compare
+unsigned char read_and_compare(kernel_watch_arg *k_arg) {
+ void *ptr = k_arg->kptr;
+ int len = k_arg->length_byte;
+ unsigned char is_unsigned = k_arg->unsigned_flag;
+ long long threshold = k_arg->threshold;
+
+ unsigned char result = 0;
+
+ // if (len != 1 && len != 2 && len != 4 && len != 8)
+ // {
+ // printk(KERN_ERR "Invalid length\n");
+ // return 0;
+ // }
+
+ result = compare_funcs[func_indices[is_unsigned][len]](ptr, threshold);
+
+ // printk(KERN_INFO "read_and_compare: name %s, value %d, biss: %lld, result:
+ // %d \n", k_arg->name, *(int *)ptr,
+ // threshold, result);
+
+ if (k_arg->greater_flag)
+ return result;
+ else
+ return !result;
+}
+
+/// @brief init kallsyms_lookup_name
+/// @param
+/// @return 0 is success
+int fn_kallsyms_lookup_name_init(void) {
+ register_kprobe(&kprobe_kallsyms_lookup_name);
+ diag_kallsyms_lookup_name = (void *)kprobe_kallsyms_lookup_name.addr;
+ unregister_kprobe(&kprobe_kallsyms_lookup_name);
+
+ printk("xby-debug, diag_kallsyms_lookup_name is %p\n",
+ diag_kallsyms_lookup_name);
+
+ if (!diag_kallsyms_lookup_name) {
+ return -EINVAL;
+ }
+ return 0;
+}
+
+unsigned char del_all_kwarg_by_pid(pid_t pid) {
+ int i = 0;
+ kernel_watch_timer *timer = NULL;
+
+ printk(KERN_INFO "del kwarg...");
+
+ for (i = 0; i < kernel_wtimer_num; i++) {
+ timer = &(kernel_wtimer_list[i]);
+ timer_del_watch_by_pid(timer, pid);
+ }
+ for (i = 0; i < kernel_wtimer_num; i++) {
+ timer = &(kernel_wtimer_list[i]);
+ if (TIMER_NO_KWARG(timer)) // no available kwarg
+ {
+ if (i != kernel_wtimer_num - 1) {
+ memcpy(timer, &kernel_wtimer_list[kernel_wtimer_num - 1],
+ sizeof(kernel_watch_timer));
+ }
+ kernel_wtimer_num--;
+ i--;
+ }
+ }
+ return 0;
+}
+
+/// @brief clear watch with pid
+/// @param pid
+void clear_watch(pid_t pid) {
+ printk(KERN_INFO "clear pid %d 's watch variable\n", pid);
+ cancel_all_hrTimer(); // just in case
+ del_all_kwarg_by_pid(pid); // delete all kwarg with pid
+ free_page_list(pid); // free page with pid
+ start_all_hrTimer(); // restart timer
+}
+
+/// @brief clear all watch and reset kernel_wtimer_list/kernel_wtimer_num
+/// @param
+void clear_all_watch(void) {
+ printk(KERN_INFO "clear all watch variable\n");
+ // unmap and release the page
+ free_all_page_list();
+ // cancel timer
+ cancel_all_hrTimer();
+ // clear timer
+ kernel_wtimer_num = 0;
+ memset(kernel_wtimer_list, 0, sizeof(kernel_wtimer_list));
+} \ No newline at end of file
diff --git a/kernel/monitor_kernel_task.c b/kernel/monitor_kernel_task.c
new file mode 100644
index 0000000..3b57152
--- /dev/null
+++ b/kernel/monitor_kernel_task.c
@@ -0,0 +1,377 @@
+#include "monitor_kernel_task.h"
+#include <asm/processor.h>
+#include <asm/ptrace.h>
+#include <asm/syscall.h> // for syscall_get_nr
+#include <linux/irq.h>
+#include <linux/sched/mm.h> // for get_task_mm
+#include <linux/syscalls.h>
+#include <linux/tracehook.h>
+
+struct stack_trace {
+ unsigned int nr_entries, max_entries;
+ unsigned long *entries;
+ int skip; /* input argument: How many entries to skip */
+};
+
+struct stack_frame_user {
+ const void __user *next_fp;
+ unsigned long ret_addr;
+};
+
+static inline int diag_get_task_type(struct task_struct *tsk) {
+ if (orig_get_task_type)
+ return orig_get_task_type(&tsk->se);
+ return 0;
+}
+
+static inline int orig_diag_cgroup_name(struct cgroup *cgrp, char *buf,
+ size_t buflen) {
+ if (orig_kernfs_name && cgrp && cgrp->kn) {
+ return orig_kernfs_name(cgrp->kn, buf, buflen);
+ } else {
+ return 0;
+ }
+}
+
+static inline mm_info *find_mm_info(mm_tree *mm_tree, struct mm_struct *mm) {
+ mm_info *info;
+ if (mm == NULL)
+ return NULL;
+ info = radix_tree_lookup(&mm_tree->mm_tree, (unsigned long)mm);
+ return info;
+}
+
+static void __diag_cgroup_name(struct task_struct *tsk, char *buf,
+ unsigned int count, int cgroup) {
+ int cgroup_id = cpuacct_cgrp_id;
+
+ memset(buf, 0, count);
+
+ if (cgroup == 1) {
+ cgroup_id = cpuset_cgrp_id;
+ }
+
+ if (tsk && tsk->cgroups && tsk->cgroups->subsys &&
+ tsk->cgroups->subsys[cgroup_id] &&
+ tsk->cgroups->subsys[cgroup_id]->cgroup) {
+ orig_diag_cgroup_name(tsk->cgroups->subsys[cgroup_id]->cgroup, buf, count);
+ }
+}
+
+static void diag_cgroup_name(struct task_struct *tsk, char *buf,
+ unsigned int count, int cgroup) {
+ __diag_cgroup_name(tsk, buf, count, cgroup);
+}
+
+static int copy_stack_frame(const void __user *fp,
+ struct stack_frame_user *frame) {
+ int ret;
+
+ ret = 1;
+ pagefault_disable();
+ if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
+ ret = 0;
+ pagefault_enable();
+
+ return ret;
+}
+
+static int copy_stack_frame_remote(struct task_struct *tsk,
+ const void __user *fp,
+ struct stack_frame_user *frame) {
+ int ret;
+ struct mm_struct *mm;
+
+ mm = get_task_mm(tsk);
+ if (!mm)
+ return 0;
+
+ ret = orig_access_remote_vm(mm, (unsigned long)fp, frame, sizeof(*frame), 0);
+ mmput(mm);
+
+ return ret;
+}
+
+static inline void save_stack_trace_user_remote(struct task_struct *tsk,
+ struct stack_trace *trace) {
+ const struct pt_regs *regs = task_pt_regs(tsk);
+ const void __user *fp = (const void __user *)regs->bp;
+ int count = 0;
+
+ if (in_atomic() || irqs_disabled()) {
+ return;
+ }
+
+ if (trace->nr_entries < trace->max_entries)
+ trace->entries[trace->nr_entries++] = regs->ip;
+
+ while (trace->nr_entries < trace->max_entries) {
+ struct stack_frame_user frame;
+
+ frame.next_fp = NULL;
+ frame.ret_addr = 0;
+
+ if (!copy_stack_frame_remote(tsk, fp, &frame)) {
+ break;
+ }
+
+ if ((unsigned long)fp < regs->sp)
+ break;
+
+ if (frame.ret_addr) {
+ trace->entries[trace->nr_entries++] = frame.ret_addr;
+ } else
+ break;
+
+ if (fp == frame.next_fp)
+ break;
+ fp = frame.next_fp;
+
+ count++;
+ /**
+ * 线上环境发现这里有hardlockup,这里强制退出
+ */
+ if (count >= trace->max_entries || count >= 100)
+ break;
+ }
+}
+
+static inline void __save_stack_trace_user(struct stack_trace *trace) {
+ const struct pt_regs *regs = task_pt_regs(current);
+ const void __user *fp = (const void __user *)regs->bp;
+ int count = 0;
+
+ if (trace->nr_entries < trace->max_entries)
+ trace->entries[trace->nr_entries++] = regs->ip;
+
+ while (trace->nr_entries < trace->max_entries) {
+ struct stack_frame_user frame;
+
+ frame.next_fp = NULL;
+ frame.ret_addr = 0;
+ if (!copy_stack_frame(fp, &frame))
+ break;
+ if ((unsigned long)fp < regs->sp)
+ break;
+ if (frame.ret_addr) {
+ trace->entries[trace->nr_entries++] = frame.ret_addr;
+ }
+ if (fp == frame.next_fp)
+ break;
+ fp = frame.next_fp;
+ count++;
+ /**
+ * 线上环境发现这里有hardlockup,这里强制退出
+ */
+ if (count >= trace->max_entries || count >= 100)
+ break;
+ }
+}
+
+void perfect_save_stack_trace_user(struct stack_trace *trace) {
+ /*
+ * Trace user stack if we are not a kernel thread
+ */
+ if (current->mm) {
+ __save_stack_trace_user(trace);
+ }
+ if (trace->nr_entries < trace->max_entries)
+ trace->entries[trace->nr_entries++] = ULONG_MAX;
+}
+
+void diagnose_save_stack_trace_user(unsigned long *backtrace) {
+ struct stack_trace trace;
+
+ memset(&trace, 0, sizeof(trace));
+ memset(backtrace, 0, BACKTRACE_DEPTH2 * sizeof(unsigned long));
+ trace.max_entries = BACKTRACE_DEPTH2;
+ trace.entries = backtrace;
+ perfect_save_stack_trace_user(&trace);
+}
+
+void diagnose_save_stack_trace_user_remote(struct task_struct *tsk,
+ unsigned long *backtrace) {
+ struct stack_trace trace;
+
+ memset(&trace, 0, sizeof(trace));
+ memset(backtrace, 0, BACKTRACE_DEPTH2 * sizeof(unsigned long));
+ trace.max_entries = BACKTRACE_DEPTH2;
+ trace.entries = backtrace;
+
+ /*
+ * Trace user stack if we are not a kernel thread
+ */
+ if (tsk->mm) {
+ save_stack_trace_user_remote(tsk, &trace);
+ }
+ if (trace.nr_entries < trace.max_entries)
+ trace.entries[trace.nr_entries++] = ULONG_MAX;
+}
+
+void diag_task_brief(struct task_struct *tsk, task_detail *detail) {
+ struct pid_namespace *ns;
+ struct pt_regs *task_regs;
+ struct task_struct *leader;
+ struct pt_regs *irq_regs;
+
+ if (!detail)
+ return;
+
+ memset(detail, 0, sizeof(task_detail));
+
+ if (!tsk || tsk->exit_state == EXIT_ZOMBIE) // zombie
+ return;
+ leader = tsk->group_leader;
+ if (!leader || leader->exit_state == EXIT_ZOMBIE) {
+ return;
+ }
+
+ if (tsk != current) { // not current task
+ detail->user_mode = -1;
+ detail->syscallno = -1;
+ } else if (!tsk->mm) { // current task but kernel thread
+ detail->user_mode = 0;
+ detail->syscallno = -1;
+ } else { // current task and user thread
+ irq_regs = get_irq_regs(); // get current irq regs
+ task_regs = task_pt_regs(tsk);
+
+ if ((irq_regs && user_mode(irq_regs)) ||
+ (task_regs && user_mode(task_regs))) {
+ detail->user_mode = 1; // user mode
+ } else {
+ detail->user_mode = 0; // kernel mode
+ }
+
+ if (task_regs) {
+ detail->syscallno = syscall_get_nr(tsk, task_regs); // get syscall no
+ }
+ }
+
+ if (tsk->sched_class == orig_idle_sched_class) // idle task
+ detail->sys_task = 2;
+ else if (!tsk->mm) // kernel thread
+ detail->sys_task = 1;
+ else
+ detail->sys_task = 0;
+
+ detail->pid = tsk->pid; // pid
+ detail->tgid = tsk->tgid; // tgid
+ detail->state = tsk->__state; // state
+ detail->task_type = diag_get_task_type(tsk); // task type
+ ns = task_active_pid_ns(tsk); // container pid
+ if (ns && ns != &init_pid_ns) {
+ detail->container_pid = task_pid_nr_ns(tsk, ns);
+ detail->container_tgid = task_tgid_nr_ns(tsk, ns);
+ } else {
+ detail->container_pid = tsk->pid;
+ detail->container_tgid = tsk->tgid;
+ }
+ strncpy(detail->comm, tsk->comm, TASK_COMM_LEN);
+ detail->comm[TASK_COMM_LEN - 1] = 0; // comm name
+ diag_cgroup_name(tsk, detail->cgroup_buf, CGROUP_NAME_LEN, 0);
+ diag_cgroup_name(tsk, detail->cgroup_cpuset, CGROUP_NAME_LEN, 1);
+
+ detail->cgroup_buf[CGROUP_NAME_LEN - 1] = 0; // cgroup name
+ detail->cgroup_cpuset[CGROUP_NAME_LEN - 1] = 0; // cgroup cpuset name
+}
+
+void diag_task_user_stack(struct task_struct *tsk, user_stack_detail *detail) {
+ struct pt_regs *regs;
+ unsigned long sp, ip, bp;
+ struct task_struct *leader;
+
+ if (!detail)
+ return;
+
+ detail->stack[0] = 0;
+ if (!tsk || !tsk->mm)
+ return;
+
+ leader = tsk->group_leader;
+ if (!leader || !leader->mm || leader->exit_state == EXIT_ZOMBIE) {
+ return;
+ }
+
+ sp = 0;
+ ip = 0;
+ bp = 0;
+ regs = task_pt_regs(tsk);
+ if (regs) {
+ sp = regs->sp;
+#if defined(DIAG_ARM64)
+ ip = regs->pc;
+ bp = regs->sp;
+#else
+ ip = regs->ip;
+ bp = regs->bp;
+#endif
+ }
+#if defined(DIAG_ARM64)
+ detail->regs = regs->user_regs;
+#else
+ detail->regs = *regs;
+#endif
+ detail->sp = sp;
+ detail->ip = ip;
+ detail->bp = bp;
+
+ if (tsk == current) {
+ diagnose_save_stack_trace_user(detail->stack);
+ } else {
+ diagnose_save_stack_trace_user_remote(tsk, detail->stack);
+ }
+}
+
+void diag_task_kern_stack(struct task_struct *tsk, kern_stack_detail *detail) {
+ orig_stack_trace_save_tsk(tsk, detail->stack, BACKTRACE_DEPTH2, 0);
+}
+
+void dump_proc_chains_argv(int style, struct task_struct *tsk, mm_tree *mm_tree,
+ proc_chains_detail *detail) {
+ struct task_struct *walker;
+ mm_info *mm_info;
+ int cnt = 0;
+ int i = 0;
+ struct task_struct *leader;
+
+ for (i = 0; i < PROCESS_CHAINS_COUNT; i++) {
+ detail->chains[i][0] = 0;
+ detail->tgid[i] = 0;
+ }
+ if (style == 0)
+ return;
+
+ if (!tsk || !tsk->mm)
+ return;
+
+ leader = tsk->group_leader;
+ if (!leader || !leader->mm ||
+ leader->exit_state == EXIT_ZOMBIE) { // leader is zombie or no mm
+ return;
+ }
+
+ rcu_read_lock();
+ walker = tsk;
+
+ while (walker->pid > 0) {
+ if (!thread_group_leader(walker))
+ walker = rcu_dereference(walker->group_leader);
+ mm_info = find_mm_info(mm_tree, walker->mm);
+ if (mm_info) {
+ if (mm_info->cgroup_buf[0] == 0)
+ diag_cgroup_name(walker, mm_info->cgroup_buf, 255, 0);
+ strncpy(detail->chains[cnt], mm_info->argv, PROCESS_ARGV_LEN);
+ detail->full_argv[cnt] = 1;
+ } else {
+ strncpy(detail->chains[cnt], walker->comm, TASK_COMM_LEN);
+ detail->full_argv[cnt] = 0;
+ }
+ detail->tgid[cnt] = walker->pid;
+ walker = rcu_dereference(walker->real_parent);
+ cnt++;
+ if (cnt >= PROCESS_CHAINS_COUNT)
+ break;
+ }
+ rcu_read_unlock();
+} \ No newline at end of file
diff --git a/kernel/monitor_kernel_task.h b/kernel/monitor_kernel_task.h
new file mode 100644
index 0000000..62e501c
--- /dev/null
+++ b/kernel/monitor_kernel_task.h
@@ -0,0 +1,98 @@
+#include <linux/kernfs.h>
+#include <linux/sched.h>
+
+// NOTE(review): no include guard / #pragma once; safe only while this header
+// is included from exactly one place — consider adding a guard.
+#define CGROUP_NAME_LEN 32 // max length of cgroup name
+// NOTE(review): linux/sched.h also defines TASK_COMM_LEN (currently 16);
+// an identical redefinition is legal C but breaks if the kernel value changes.
+#define TASK_COMM_LEN 16 // max length of task name
+
+#define BACKTRACE_DEPTH2 30 // max depth of backtrace
+
+#define PROCESS_CHAINS_COUNT 10 // max count of process chains
+#define PROCESS_ARGV_LEN 128 // max length of process argv
+
+// Struct layouts below are adapted from
+// https://github.com/alibaba/diagnose-tools/blob/8cd905a1c17f2201e460a2d607413a1303757a32/SOURCE/uapi/ali_diagnose.h
+
+// Per-task identity snapshot: who the task is, where it runs (cgroup), and
+// what mode/state it was in when sampled.  Filled by diag_task_brief().
+typedef struct {
+ char cgroup_buf[CGROUP_NAME_LEN]; // cgroup name — presumably via diag_cgroup_name(); confirm hierarchy
+ char cgroup_cpuset[CGROUP_NAME_LEN]; // cpuset cgroup name (by field name — confirm at fill site)
+ int pid; // thread id
+ int tgid; // thread-group (process) id
+ int container_pid; // NOTE(review): presumably pid inside the pid namespace — confirm
+ int container_tgid; // NOTE(review): presumably tgid inside the pid namespace — confirm
+ long state; // task state (task_struct state value)
+ int task_type; // NOTE(review): likely from orig_get_task_type() — semantics defined elsewhere
+ unsigned long syscallno; // syscall number in progress, if any
+ /**
+ * 0->user 1->sys 2->idle
+ */
+ unsigned long sys_task;
+ /**
+ * 1->user mode 0->sys mode -1->unknown
+ */
+ unsigned long user_mode;
+ char comm[TASK_COMM_LEN]; // task name (task_struct comm)
+} task_detail;
+
+// Kernel-mode backtrace: up to BACKTRACE_DEPTH2 return addresses,
+// captured by diag_task_kern_stack().
+typedef struct {
+ unsigned long stack[BACKTRACE_DEPTH2];
+} kern_stack_detail;
+
+// User-mode stack context: the full register file plus the key unwinding
+// registers (ip/bp/sp) and the captured backtrace addresses.
+// Filled by diag_task_user_stack().
+typedef struct {
+ struct pt_regs regs; // user register set at sample time (user_regs on arm64)
+ unsigned long ip; // user instruction pointer (pc on arm64)
+ unsigned long bp; // frame pointer (sp is reused on arm64, which has no bp)
+ unsigned long sp; // user stack pointer
+ unsigned long stack[BACKTRACE_DEPTH2]; // user backtrace addresses
+} user_stack_detail;
+
+// Ancestor-process chain for a task: index 0 is the task itself, increasing
+// indices walk toward init.  Filled by dump_proc_chains_argv().
+typedef struct {
+ unsigned int full_argv[PROCESS_CHAINS_COUNT]; // 1 = chains[i] holds the cached full argv, 0 = only comm
+ char chains[PROCESS_CHAINS_COUNT][PROCESS_ARGV_LEN]; // process chains argv
+ unsigned int tgid[PROCESS_CHAINS_COUNT]; // process chains tgid
+} proc_chains_detail;
+
+// most important struct: one complete monitoring record for a task,
+// aggregating identity, both stacks and the process-ancestry chain.
+typedef struct {
+ int et_type; // record/event type tag — semantics defined by the consumer
+ unsigned long id; // record id — NOTE(review): producer not visible here
+ unsigned long long tv; // timestamp — presumably ns or us; confirm at the fill site
+ task_detail task; // brief
+ user_stack_detail user_stack; // user stack
+ kern_stack_detail kern_stack; // kernel stack
+ proc_chains_detail proc_chains; // process chains argv
+} variable_monitor_task;
+
+// Cached per-mm data (command line and cgroup name), looked up through
+// find_mm_info() and freed via RCU (rcu_head).
+typedef struct {
+ struct rcu_head rcu_head; // for deferred (RCU) freeing
+ pid_t pid; // pid the entry was created for
+ struct mm_struct *mm; // the address space this entry describes
+ char cgroup_buf[256]; // cached cgroup name; empty string = not yet resolved
+ char argv[256]; // cached command line
+} mm_info;
+
+// Container for the mm_info cache: a radix tree of entries plus the spinlock
+// guarding updates to it.
+typedef struct {
+ struct radix_tree_root mm_tree; // NOTE(review): presumably keyed by mm_struct pointer — confirm in find_mm_info
+ spinlock_t mm_tree_lock; // protects insert/remove on mm_tree
+} mm_tree;
+
+// Entry points implemented in monitor_kernel_task.c.
+void diag_task_brief(struct task_struct *tsk,
+ task_detail *detail); // get task brief
+void diag_task_user_stack(struct task_struct *tsk,
+ user_stack_detail *detail); // get task user stack
+void diag_task_kern_stack(struct task_struct *tsk,
+ kern_stack_detail *detail); // get task kernel stack
+void dump_proc_chains_argv(
+ int style, struct task_struct *tsk, mm_tree *mm_tree,
+ proc_chains_detail *detail); // get process chains argv
+
+// orig_X: pointers to kernel symbols resolved at runtime (the originals
+// are not exported to modules).
+// NOTE(review): the four pointers below are tentative *definitions*, not
+// declarations — including this header from more than one translation unit
+// would produce duplicate symbols.  They should be `extern` here (like
+// orig_stack_trace_save_tsk) with a single definition in a .c file; left
+// unchanged because the defining .c is not visible from this chunk.
+struct sched_class *orig_idle_sched_class;
+int (*orig_get_task_type)(struct sched_entity *se);
+int (*orig_kernfs_name)(struct kernfs_node *kn, char *buf, size_t buflen);
+int (*orig_access_remote_vm)(struct mm_struct *mm, unsigned long addr,
+ void *buf, int len, unsigned int gup_flags);
+extern unsigned int (*orig_stack_trace_save_tsk)(struct task_struct *task,
+ unsigned long *store,
+ unsigned int size,
+ unsigned int skipnr);
+