| author | Xiongwei Jiang <[email protected]> | 2021-11-10 14:26:01 +0800 |
|---|---|---|
| committer | Xiongwei Jiang <[email protected]> | 2021-11-10 14:26:01 +0800 |
| commit | 4a950c984e2730fc35aaca319bcb28b51ce260b6 (patch) | |
| tree | 9c41045eeae71454d97bb093df22b602d8594860 /SOURCE | |
| parent | 19c8cbdf483bb14b4dae02cee41bc657bf98909c (diff) | |
sched: throttle-delay kernel space
Diffstat (limited to 'SOURCE')
| mode | path | lines |
|---|---|---|
| -rw-r--r-- | SOURCE/diagnose-tools/throttle_delay.cc | 411 |
| -rwxr-xr-x | SOURCE/module/Makefile | 2 |
| -rwxr-xr-x | SOURCE/module/internal.h | 3 |
| -rw-r--r-- | SOURCE/module/kernel/throttle_delay.c | 958 |
| -rw-r--r-- | SOURCE/uapi/ali_diagnose.h | 12 |
| -rw-r--r-- | SOURCE/uapi/throttle_delay.h | 62 |
6 files changed, 1445 insertions, 3 deletions
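
For orientation before the diff: the patch adds a `throttle-delay` subcommand handled by `throttle_delay_main()`, which activates kernel-side tracking of tasks delayed by CFS bandwidth throttling beyond a threshold (50 ms by default) and dumps the captured stacks. A minimal usage sketch follows, assuming the usual `diagnose-tools <feature>` front end dispatches to this subcommand; the tgid value is only illustrative.

```sh
# Activate monitoring for one process group with a 100 ms threshold
# (threshold is in milliseconds and defaults to 50 when omitted).
diagnose-tools throttle-delay --activate='threshold=100 tgid=1234'

# Show the current settings and dump the captured delay events as text.
diagnose-tools throttle-delay --settings
diagnose-tools throttle-delay --report

# Turn the feature back off.
diagnose-tools throttle-delay --deactivate
```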
diff --git a/SOURCE/diagnose-tools/throttle_delay.cc b/SOURCE/diagnose-tools/throttle_delay.cc new file mode 100644 index 0000000..5968565 --- /dev/null +++ b/SOURCE/diagnose-tools/throttle_delay.cc @@ -0,0 +1,411 @@ +/* + * Linux内核诊断工具--用户态throttle-delay功能实现 + * + * Copyright (C) 2020 Alibaba Ltd. + * + * 作者: Baoyou Xie <[email protected]> + * + * License terms: GNU General Public License (GPL) version 3 + * + */ + +#include <sched.h> +#include <stdio.h> +#include <string.h> +#include <stdlib.h> +#include <unistd.h> +#include <errno.h> +#include <getopt.h> + +#include <sys/time.h> +#include <string.h> +#include <stdio.h> /* for printf */ +#include <stdlib.h> /* for exit */ + +#include <set> + +#include "internal.h" +#include "symbol.h" +#include "json/json.h" +#include <iostream> +#include <fstream> + +#include "uapi/throttle_delay.h" +#include "params_parse.h" +#include <syslog.h> + +using namespace std; + +static char sls_file[256]; +static int syslog_enabled; + +void usage_throttle_delay(void) +{ + printf(" throttle-delay usage:\n"); + printf(" --help throttle-delay help info\n"); + printf(" --activate\n"); + printf(" verbose VERBOSE\n"); + printf(" threshold THRESHOLD(MS)\n"); + printf(" tgid process group monitored\n"); + printf(" pid thread id that monitored\n"); + printf(" comm comm that monitored\n"); + printf(" --deactivate\n"); + printf(" --report dump log with text.\n"); +} + +static void do_activate(const char *arg) +{ + int ret = 0; + struct params_parser parse(arg); + struct diag_throttle_delay_settings settings; + string str; + + memset(&settings, 0, sizeof(struct diag_throttle_delay_settings)); + + settings.verbose = parse.int_value("verbose"); + settings.tgid = parse.int_value("tgid"); + settings.pid = parse.int_value("pid"); + settings.bvt = parse.int_value("bvt"); + settings.threshold_ms = parse.int_value("threshold"); + + if (0 == settings.threshold_ms) + { + settings.threshold_ms = 50; + } + + str = parse.string_value("comm"); + if (str.length() > 0) { + strncpy(settings.comm, str.c_str(), TASK_COMM_LEN); + settings.comm[TASK_COMM_LEN - 1] = 0; + } + + if (run_in_host) { + ret = diag_call_ioctl(DIAG_IOCTL_THROTTLE_DELAY_SET, (long)&settings); + } else { + ret = -ENOSYS; + syscall(DIAG_THROTTLE_DELAY_SET, &ret, &settings, sizeof(struct diag_throttle_delay_settings)); + } + + printf("功能设置%s,返回值:%d\n", ret ? 
"失败" : "成功", ret); + printf(" 进程ID:\t%d\n", settings.tgid); + printf(" 线程ID:\t%d\n", settings.pid); + printf(" 进程名称:\t%s\n", settings.comm); + printf(" 监控阈值(ms):\t%d\n", settings.threshold_ms); + printf(" 输出级别:\t%d\n", settings.verbose); + if (ret) + return; + + ret = diag_activate("throttle-delay"); + if (ret == 1) { + printf("throttle-delay activated\n"); + } else { + printf("throttle-delay is not activated, ret %d\n", ret); + } +} + +static void do_deactivate(void) +{ + int ret = 0; + + ret = diag_deactivate("throttle-delay"); + if (ret == 0) { + printf("throttle-delay is not activated\n"); + } else { + printf("deactivate throttle-delay fail, ret is %d\n", ret); + } +} + +static void do_settings(const char *arg) +{ + struct diag_throttle_delay_settings settings; + int ret; + int enable_json = 0; + Json::Value root; + struct params_parser parse(arg); + enable_json = parse.int_value("json"); + + if (run_in_host) { + ret = diag_call_ioctl(DIAG_IOCTL_THROTTLE_DELAY_SETTINGS, (long)&settings); + } else { + ret = -ENOSYS; + syscall(DIAG_THROTTLE_DELAY_SETTINGS, &ret, &settings, + sizeof(struct diag_throttle_delay_settings)); + } + + if (ret == 0) { + if (1 != enable_json) + { + printf("功能设置:\n"); + printf(" 是否激活:\t%s\n", settings.activated ? "√" : "×"); + printf(" 进程ID:\t%d\n", settings.tgid); + printf(" 线程ID:\t%d\n", settings.pid); + printf(" 进程名称:\t%s\n", settings.comm); + printf(" 监控阈值(ms):\t%d\n", settings.threshold_ms); + printf(" 输出级别:\t%d\n", settings.verbose); + } + else + { + root["activated"] = Json::Value(settings.activated); + root["tgid"] = Json::Value(settings.tgid); + root["pid"] = Json::Value(settings.pid); + root["comm"] = Json::Value(settings.comm); + root["threshold"] = Json::Value(settings.threshold_ms); + root["verbose"] = Json::Value(settings.verbose); + } + } else { + if (1 != enable_json) + { + printf("获取throttle-delay设置失败,请确保正确安装了diagnose-tools工具\n"); + } + else + { + root["err"]=Json::Value("found throttle-delay settings failed, please check diagnose-tools installed or not\n"); + } + } + + if (1 == enable_json) + { + std::string str_log; + str_log.append(root.toStyledString()); + printf("%s", str_log.c_str()); + } +} + +static int throttle_delay_extract(void *buf, unsigned int len, void *) +{ + int *et_type; + struct throttle_delay_dither *dither; + struct throttle_delay_rq *rq; + static int seq = 0; + + if (len == 0) + return 0; + + et_type = (int *)buf; + switch (*et_type) { + case et_throttle_delay_dither: + if (len < sizeof(struct throttle_delay_dither)) + break; + dither = (struct throttle_delay_dither *)buf; + + printf("警告:调度被延迟 %lu ms,NOW: %lu, QUEUED: %lu, 当前时间:[%lu:%lu]\n", + dither->delay_ms, + dither->now, + dither->dequeued, + dither->tv.tv_sec, + dither->tv.tv_usec); + + printf("##CGROUP:[%s] %d [%03d] 采样命中\n", + dither->task.cgroup_buf, + dither->task.pid, + seq); + seq++; + + diag_printf_kern_stack(&dither->kern_stack); + diag_printf_user_stack(dither->task.tgid, + dither->task.container_tgid, + dither->task.comm, + &dither->user_stack); + printf("#* 0xffffffffffffff %s (UNKNOWN)\n", + dither->task.comm); + diag_printf_proc_chains(&dither->proc_chains); + printf("##\n"); + + break; + case et_throttle_delay_rq: + if (len < sizeof(struct throttle_delay_rq)) + break; + rq = (struct throttle_delay_rq *)buf; + + printf("\tCPU %d,nr_running:%d\n", + rq->cpu, rq->nr_running); + + break; + default: + break; + } + + return 0; +} + +static void do_extract(char *buf, int len) +{ + extract_variant_buffer(buf, len, throttle_delay_extract, NULL); +} + +static void 
do_dump(const char *arg) +{ + static char variant_buf[140 * 1024 * 1024]; + int len; + int ret = 0; + struct diag_ioctl_dump_param dump_param = { + .user_ptr_len = &len, + .user_buf_len = 4 * 1024 * 1024, + .user_buf = variant_buf, + }; + + memset(variant_buf, 0, 4 * 1024 * 1024); + if (run_in_host) { + ret = diag_call_ioctl(DIAG_IOCTL_THROTTLE_DELAY_DUMP, (long)&dump_param); + } else { + ret = -ENOSYS; + syscall(DIAG_THROTTLE_DELAY_DUMP, &ret, &len, variant_buf, 4 * 1024 * 1024); + } + + if (ret == 0 && len > 0) { + do_extract(variant_buf, len); + } +} + +static int sls_extract(void *buf, unsigned int len, void *) +{ + int *et_type; + struct throttle_delay_dither *dither; + struct throttle_delay_rq *rq; + symbol sym; + + Json::Value root; + Json::Value task; + Json::Value kern_stack; + Json::Value user_stack; + Json::Value proc_chains; + + if (len == 0) + return 0; + + et_type = (int *)buf; + switch (*et_type) { + case et_throttle_delay_dither: + if (len < sizeof(struct throttle_delay_dither)) + break; + dither = (struct throttle_delay_dither *)buf; + root["id"] = dither->id; + root["seq"] = dither->seq; + root["delay_ms"] = Json::Value(dither->delay_ms); + root["now"] = Json::Value(dither->now); + root["queued"] = Json::Value(dither->dequeued); + diag_sls_time(&dither->tv, root); + diag_sls_task(&dither->task, task); + diag_sls_kern_stack(&dither->kern_stack, task); + diag_sls_user_stack(dither->task.tgid, + dither->task.container_tgid, + dither->task.comm, + &dither->user_stack, task, 0); + diag_sls_proc_chains(&dither->proc_chains, task); + root["task"] = task; + + write_file(sls_file, "throttle-delay-dither", &dither->tv, dither->id, dither->seq, root); + write_syslog(syslog_enabled, "throttle-delay-dither", &dither->tv, dither->id, dither->seq, root); + break; + case et_throttle_delay_rq: + if (len < sizeof(struct throttle_delay_rq)) + break; + rq = (struct throttle_delay_rq *)buf; + root["id"] = rq->id; + root["seq"] = rq->seq; + diag_sls_time(&rq->tv, root); + root["cpu"] = rq->cpu; + root["nr_running"] = rq->nr_running; + write_file(sls_file, "throttle-delay-rq", &rq->tv, rq->id, rq->seq, root); + write_syslog(syslog_enabled, "throttle-delay-rq", &rq->tv, rq->id, rq->seq, root); + break; + default: + break; + } + + return 0; +} + +static void do_sls(char *arg) +{ + int ret; + static char variant_buf[4 * 1024 * 1024]; + int len; + int jiffies_sls = 0; + struct diag_ioctl_dump_param dump_param = { + .user_ptr_len = &len, + .user_buf_len = 4 * 1024 * 1024, + .user_buf = variant_buf, + }; + + ret = log_config(arg, sls_file, &syslog_enabled); + if (ret != 1) + return; + + java_attach_once(); + while (1) { + if (run_in_host) { + ret = diag_call_ioctl(DIAG_IOCTL_THROTTLE_DELAY_DUMP, (long)&dump_param); + } else { + ret = -ENOSYS; + syscall(DIAG_THROTTLE_DELAY_DUMP, &ret, &len, variant_buf, 4 * 1024 * 1024); + } + + if (ret == 0 && len > 0) { + /** + * 10 min + */ + if (jiffies_sls >= 60) { + jiffies_sls = 0; + clear_symbol_info(pid_cmdline, g_symbol_parser.get_java_procs(), 1); + java_attach_once(); + } + + extract_variant_buffer(variant_buf, len, sls_extract, NULL); + } + + sleep(10); + jiffies_sls++; + } +} + +int throttle_delay_main(int argc, char **argv) +{ + static struct option long_options[] = { + {"help", no_argument, 0, 0 }, + {"activate", optional_argument, 0, 0 }, + {"deactivate", no_argument, 0, 0 }, + {"settings", optional_argument, 0, 0 }, + {"report", optional_argument, 0, 0 }, + {"log", required_argument, 0, 0 }, + {0, 0, 0, 0 } + }; + int c; + + if (argc <= 1) { + 
usage_throttle_delay(); + return 0; + } + while (1) { + int option_index = -1; + + c = getopt_long_only(argc, argv, "", long_options, &option_index); + if (c == -1) + break; + switch (option_index) { + case 0: + usage_throttle_delay(); + break; + case 1: + do_activate(optarg ? optarg : ""); + break; + case 2: + do_deactivate(); + break; + case 3: + do_settings(optarg ? optarg : ""); + break; + case 4: + do_dump(optarg ? optarg : ""); + break; + case 5: + do_sls(optarg); + break; + default: + usage_throttle_delay(); + break; + } + } + + return 0; +} diff --git a/SOURCE/module/Makefile b/SOURCE/module/Makefile index 298c20f..0838616 100755 --- a/SOURCE/module/Makefile +++ b/SOURCE/module/Makefile @@ -214,7 +214,7 @@ ifneq ($(KERNELRELEASE),) kernel/exec.o kernel/perf.o kernel/run_trace.o kernel/irq_trace.o \ kernel/kprobe.o kernel/utilization.o kernel/sched_delay.o kernel/reboot.o \ kernel/uprobe.o kernel/sys_cost.o kernel/sig_info.o kernel/task_monitor.o \ - kernel/rw_sem.o + kernel/rw_sem.o kernel/throttle_delay.o $(TARGET)-objs += mm/mm_entry.o mm/alloc_page.o mm/alloc_top.o mm/high_order.o mm/rss_monitor.o mm/memcg_stats.o $(TARGET)-objs += io/io_entry.o diff --git a/SOURCE/module/internal.h b/SOURCE/module/internal.h index adb0372..1de3842 100755 --- a/SOURCE/module/internal.h +++ b/SOURCE/module/internal.h @@ -57,7 +57,7 @@ static inline void __percpu_counter_add(struct percpu_counter *fbc, #include "uapi/rss_monitor.h" #include "pub/variant_buffer.h" #include "pub/stack.h" - +#include "uapi/throttle_delay.h" /** * 手工替换函数相关的宏 */ @@ -429,6 +429,7 @@ struct diag_percpu_context { struct event_run_trace_raw event_run_trace_raw; struct sys_delay_detail sys_delay_detail; struct sched_delay_dither sched_delay_dither; + struct throttle_delay_dither throttle_delay_dither; struct { struct uprobe_detail uprobe_detail; diff --git a/SOURCE/module/kernel/throttle_delay.c b/SOURCE/module/kernel/throttle_delay.c new file mode 100644 index 0000000..578710a --- /dev/null +++ b/SOURCE/module/kernel/throttle_delay.c @@ -0,0 +1,958 @@ +/* + * Linux内核诊断工具--内核态throttle-delay功能 + * + * Copyright (C) 2020 Alibaba Ltd. + * + * 作者: Xiongwei Jiang <[email protected]> + * + * License terms: GNU General Public License (GPL) version 3 + * + */ + +#include <linux/module.h> +#include <linux/stacktrace.h> +#include <linux/hrtimer.h> +#include <linux/kernel.h> +#include <linux/kallsyms.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/timex.h> +#include <linux/tracepoint.h> +#include <trace/events/irq.h> +#include <linux/proc_fs.h> +#include <linux/init.h> +#include <linux/sysctl.h> +#include <trace/events/napi.h> +#include <linux/rtc.h> +#include <linux/time.h> +#include <linux/version.h> +#include <linux/net.h> +#include <linux/ip.h> +#include <linux/udp.h> +#include <linux/tcp.h> +#include <linux/icmp.h> +#include <linux/netfilter.h> +#include <net/tcp.h> +#include <linux/stop_machine.h> +#include <linux/smp.h> +#include <asm/thread_info.h> + +#include "internal.h" +#include "mm_tree.h" +#include "kern_internal.h" +#include "pub/trace_file.h" +#include "pub/trace_point.h" + +#include "uapi/throttle_delay.h" + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 32) && \ + LINUX_VERSION_CODE <= KERNEL_VERSION(4, 20, 0) \ + && !defined(UBUNTU_1604) + +#if defined(ALIOS_4000_009) +static unsigned long *get_last_dequeued_addr(struct task_struct *p) +{ + /** + * task_stack_page, but not end_of_stack !! 
+ */ + return task_stack_page(p) + sizeof(struct thread_info) + 32; +} +#else +#if defined(CENTOS_8U) +#define diag_last_dequeued rh_reserved2 +#elif KERNEL_VERSION(4, 9, 0) <= LINUX_VERSION_CODE +#define diag_last_dequeued ali_reserved3 +#elif KERNEL_VERSION(3, 10, 0) <= LINUX_VERSION_CODE +#define diag_last_dequeued rh_reserved3 +#else +#define diag_last_dequeued rh_reserved[0] +#endif + +static unsigned long *get_last_dequeued_addr(struct task_struct *p) +{ + return &p->diag_last_dequeued; +} + +#endif + +#define entity_is_task(se) (!se->my_q) + +//static struct kprobe kprobe_dequeue_entity; +//static int (*orig_throttle_cfs_rq)(struct cfs_rq *cfs_rq); + + +/* task group related information */ +struct rt_bandwidth { + /* nests inside the rq lock: */ + raw_spinlock_t rt_runtime_lock; + ktime_t rt_period; + u64 rt_runtime; + struct hrtimer rt_period_timer; + unsigned int rt_period_active; +}; +struct cfs_bandwidth { +#ifdef CONFIG_CFS_BANDWIDTH + raw_spinlock_t lock; + ktime_t period; + u64 quota, runtime; + s64 hierarchical_quota; + u64 runtime_expires; + int expires_seq; + + u8 idle; + u8 period_active; + u8 slack_started; + struct hrtimer period_timer, slack_timer; + struct list_head throttled_cfs_rq; + + /* statistics */ + int nr_periods, nr_throttled; + u64 throttled_time; +#endif +}; + + + +struct task_group { + struct cgroup_subsys_state css; + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* schedulable entities of this group on each cpu */ + struct sched_entity **se; + /* runqueue "owned" by this group on each cpu */ + struct cfs_rq **cfs_rq; + unsigned long shares; + int bvt; +#ifdef CONFIG_SMP + /* + * load_avg can be heavily contended at clock tick time, so put + * it in its own cacheline separated from the fields above which + * will also be accessed at each tick. + */ + atomic_long_t load_avg ____cacheline_aligned; +#endif +#endif + +#ifdef CONFIG_RT_GROUP_SCHED + struct sched_rt_entity **rt_se; + struct rt_rq **rt_rq; + + struct rt_bandwidth rt_bandwidth; +#endif + + struct rcu_head rcu; + struct list_head list; + + struct task_group *parent; + struct list_head siblings; + struct list_head children; + +#ifdef CONFIG_SCHED_AUTOGROUP + struct autogroup *autogroup; +#endif + + struct cfs_bandwidth cfs_bandwidth; + + ALI_HOTFIX_RESERVE(1) + ALI_HOTFIX_RESERVE(2) + ALI_HOTFIX_RESERVE(3) + ALI_HOTFIX_RESERVE(4) +}; + +/* CFS-related fields in a runqueue */ +struct cfs_rq { + struct load_weight load; + unsigned int nr_running, h_nr_running; + + u64 exec_clock; + u64 min_vruntime; +#ifndef CONFIG_64BIT + u64 min_vruntime_copy; +#endif + + struct rb_root tasks_timeline; + struct rb_node *rb_leftmost; + + /* + * 'curr' points to currently running entity on this cfs_rq. + * It is set to NULL otherwise (i.e when none are currently running). + */ + struct sched_entity *curr, *next, *last, *skip; + + /* Effective bvt type */ + int ebvt; + +#ifdef CONFIG_SCHED_DEBUG + unsigned int nr_spread_over; +#endif + +#ifdef CONFIG_SMP + /* + * CFS load tracking + */ + struct sched_avg avg; + u64 runnable_load_sum; + unsigned long runnable_load_avg; +#ifdef CONFIG_FAIR_GROUP_SCHED + unsigned long tg_load_avg_contrib; +#endif + atomic_long_t removed_load_avg, removed_util_avg; +#ifndef CONFIG_64BIT + u64 load_last_update_time_copy; +#endif + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* + * h_load = weight * f(tg) + * + * Where f(tg) is the recursive weight fraction assigned to + * this group. 
+ */ + unsigned long h_load; + u64 last_h_load_update; + struct sched_entity *h_load_next; +#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_FAIR_GROUP_SCHED + struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ + + /* + * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in + * a hierarchy). Non-leaf lrqs hold other higher schedulable entities + * (like users, containers etc.) + * + * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This + * list is used during load balance. + */ + int on_list; + struct list_head leaf_cfs_rq_list; + struct task_group *tg; /* group that "owns" this runqueue */ + struct list_head batch_node; + unsigned int nr_batch_running; /* only tasks, no group se */ + +#ifdef CONFIG_CFS_BANDWIDTH + int runtime_enabled; + int expires_seq; + u64 runtime_expires; + s64 runtime_remaining; + + u64 throttled_clock, throttled_clock_task; + u64 throttled_clock_task_time; + int throttled, throttle_count; + struct list_head throttled_list; +#endif /* CONFIG_CFS_BANDWIDTH */ + +#ifdef CONFIG_CFS_BVT + u64 kick_delay_nc; + u64 throttled_clock_nc; + u64 throttled_time_nc; /* total time */ + u64 throttled_time_nc_max; /* single max time */ + int throttled_nc; + struct list_head throttled_node_nc; +#endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ + + unsigned long nr_uninterruptible; + + ALI_HOTFIX_RESERVE(1) + ALI_HOTFIX_RESERVE(2) + ALI_HOTFIX_RESERVE(3) + ALI_HOTFIX_RESERVE(4) +}; + +/* + * This is the priority-queue data structure of the RT scheduling class: + */ +struct rt_prio_array { + DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ + struct list_head queue[MAX_RT_PRIO]; +}; + +/* Real-Time classes' related field in a runqueue: */ +struct rt_rq { + struct rt_prio_array active; + unsigned int rt_nr_running; + unsigned int rr_nr_running; +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED + struct { + int curr; /* highest queued rt task prio */ +#ifdef CONFIG_SMP + int next; /* next highest */ +#endif + } highest_prio; +#endif +#ifdef CONFIG_SMP + unsigned long rt_nr_migratory; + unsigned long rt_nr_total; + int overloaded; + struct plist_head pushable_tasks; +#endif /* CONFIG_SMP */ + int rt_queued; + + int rt_throttled; + u64 rt_time; + u64 rt_runtime; + /* Nests inside the rq lock: */ + raw_spinlock_t rt_runtime_lock; + +#ifdef CONFIG_RT_GROUP_SCHED + unsigned long rt_nr_boosted; + + struct rq *rq; + struct task_group *tg; +#endif + + unsigned long nr_uninterruptible; +}; + +/* Deadline class' related fields in a runqueue */ +struct dl_rq { + /* runqueue is an rbtree, ordered by deadline */ + struct rb_root rb_root; + struct rb_node *rb_leftmost; + + unsigned long dl_nr_running; + +#ifdef CONFIG_SMP + /* + * Deadline values of the currently executing and the + * earliest ready task on this rq. Caching these facilitates + * the decision wether or not a ready but not running task + * should migrate somewhere else. + */ + struct { + u64 curr; + u64 next; + } earliest_dl; + + unsigned long dl_nr_migratory; + int overloaded; + + /* + * Tasks on this rq that can be pushed away. They are kept in + * an rb-tree, ordered by tasks' deadlines, with caching + * of the leftmost (earliest deadline) element. 
+ */ + struct rb_root pushable_dl_tasks_root; + struct rb_node *pushable_dl_tasks_leftmost; +#else + struct dl_bw dl_bw; +#endif +}; + +#if 0 +typedef void (*smp_call_func_t)(void *info); +struct call_single_data { + struct llist_node llist; + smp_call_func_t func; + void *info; + unsigned int flags; +}; +#endif + +/* + * This is the main, per-CPU runqueue data structure. + * + * Locking rule: those places that want to lock multiple runqueues + * (such as the load balancing or the thread migration code), lock + * acquire operations must be ordered by ascending &runqueue. + */ +struct rq { + /* runqueue lock: */ + raw_spinlock_t lock; + + /* + * nr_running and cpu_load should be in the same cacheline because + * remote CPUs use both these fields when doing load calculation. + */ + unsigned int nr_running; +#ifdef CONFIG_NUMA_BALANCING + unsigned int nr_numa_running; + unsigned int nr_preferred_running; +#endif + #define CPU_LOAD_IDX_MAX 5 + unsigned long cpu_load[CPU_LOAD_IDX_MAX]; +#ifdef CONFIG_NO_HZ_COMMON +#ifdef CONFIG_SMP + unsigned long last_load_update_tick; +#endif /* CONFIG_SMP */ + unsigned long nohz_flags; +#endif /* CONFIG_NO_HZ_COMMON */ +#ifdef CONFIG_NO_HZ_FULL + unsigned long last_sched_tick; +#endif + /* capture load from *all* tasks on this cpu: */ + struct load_weight load; + unsigned long nr_load_updates; + u64 nr_switches; + + struct cfs_rq cfs; + struct rt_rq rt; + struct dl_rq dl; + + u64 kick_start_nc; +#ifdef CONFIG_FAIR_GROUP_SCHED + /* list of leaf cfs_rq on this cpu: */ + struct list_head leaf_cfs_rq_list; +#ifdef CONFIG_CFS_BVT + struct list_head throttled_list_nc; +#endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ + + /* + * This is part of a global counter where only the total sum + * over all CPUs matters. A task can increase this counter on + * one CPU and if it got migrated afterwards it may decrease + * it on another CPU. 
Always updated under the runqueue lock: + */ + unsigned long nr_uninterruptible; + + struct task_struct *curr, *idle, *stop; + unsigned long next_balance; + struct mm_struct *prev_mm; + + unsigned int clock_skip_update; + u64 clock; + u64 clock_task; + + atomic_t nr_iowait; + +#ifdef CONFIG_SMP + struct root_domain *rd; + struct sched_domain *sd; + + unsigned long cpu_capacity; + unsigned long cpu_capacity_orig; + + struct callback_head *balance_callback; + + unsigned char idle_balance; + /* For active balancing */ + int active_balance; + int push_cpu; + struct cpu_stop_work active_balance_work; + /* cpu of this runqueue: */ + int cpu; + int online; + + struct list_head cfs_tasks; +#ifdef CONFIG_CFS_BVT + unsigned int nr_active_batch; + unsigned int nr_ls_tasks; + atomic_t curr_task_type; + int cpu_sibling; + unsigned int nr_deactive_batchq; + struct list_head batchqs; + u64 throttled_clock_nc; + s64 exempt_quota_nc; +#endif + + u64 rt_avg; + u64 age_stamp; + u64 idle_stamp; + u64 avg_idle; + + /* This is used to determine avg_idle's max value */ + u64 max_idle_balance_cost; +#endif + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + u64 prev_irq_time; +#endif +#ifdef CONFIG_PARAVIRT + u64 prev_steal_time; +#endif +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING + u64 prev_steal_time_rq; +#endif + + /* calc_load related fields */ + unsigned long calc_load_update; + long calc_load_active; + long calc_load_active_r; + +#ifdef CONFIG_SCHED_HRTICK +#ifdef CONFIG_SMP + int hrtick_csd_pending; + struct call_single_data hrtick_csd; +#endif + struct hrtimer hrtick_timer; +#endif + +#ifdef CONFIG_SCHEDSTATS + /* latency stats */ + struct sched_info rq_sched_info; + unsigned long long rq_cpu_time; + /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ + + /* sys_sched_yield() stats */ + unsigned int yld_count; + + /* schedule() stats */ + unsigned int sched_count; + unsigned int sched_goidle; + + /* try_to_wake_up() stats */ + unsigned int ttwu_count; + unsigned int ttwu_local; +#endif + +#ifdef CONFIG_SMP + struct llist_head wake_list; +#endif + +#ifdef CONFIG_CPU_IDLE + /* Must be inspected within a rcu lock section */ + struct cpuidle_state *idle_state; +#endif + + ALI_HOTFIX_RESERVE(1) + ALI_HOTFIX_RESERVE(2) + ALI_HOTFIX_RESERVE(3) + ALI_HOTFIX_RESERVE(4) + ALI_HOTFIX_RESERVE(5) + ALI_HOTFIX_RESERVE(6) + ALI_HOTFIX_RESERVE(7) + ALI_HOTFIX_RESERVE(8) +}; + +typedef int (*tg_visitor)(struct task_group *, void *); + +__maybe_unused static atomic64_t diag_nr_running = ATOMIC64_INIT(0); +struct diag_throttle_delay_settings throttle_delay_settings = { + .threshold_ms = 50, +}; + +static int throttle_delay_alloced; +static int diag_throttle_delay_id; +static int throttle_delay_seq; +static struct diag_variant_buffer throttle_delay_variant_buffer; + +DEFINE_ORIG_FUNC(void, throttle_cfs_rq, 1, + struct cfs_rq *, cfs_rq); + +static inline int cpu_of(struct rq *rq) +{ +#ifdef CONFIG_SMP + return rq->cpu; +#else + return 0; +#endif +} + +static inline struct rq *rq_of(struct cfs_rq *cfs_rq) +{ + return cfs_rq->rq; +} + +int walk_tg_tree_from(struct task_group *from, + tg_visitor down, tg_visitor up, void *data) +{ + struct task_group *parent, *child; + int ret; + + parent = from; + +down: + ret = (*down)(parent, data); + if (ret) + goto out; + list_for_each_entry_rcu(child, &parent->children, siblings) { + parent = child; + goto down; + +up: + continue; + } + ret = (*up)(parent, data); + if (ret || parent == from) + goto out; + + child = parent; + parent = parent->parent; + if (parent) + goto up; +out: + return 
ret; +} + +static inline struct task_struct *task_of(struct sched_entity *se) +{ + return container_of(se, struct task_struct, se); +} + +int tg_nop(struct task_group *tg, void *data) +{ + return 0; +} + +static unsigned long read_last_dequeued(struct task_struct *p) +{ + unsigned long *ptr = get_last_dequeued_addr(p); + + if (ptr) { + return *ptr; + } else { + return 0; + } +} + + +static void update_last_dequeued(struct task_struct *p, unsigned long stamp) +{ + unsigned long *ptr = get_last_dequeued_addr(p); + + if (ptr) { + *ptr = stamp; + } +} + + +static int tg_throttle_down(struct task_group *tg, void *data) +{ + struct rq *rq = data; + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + struct rb_node *node; + struct sched_entity *se; + + if (!throttle_delay_settings.activated) + return 0; + + for (node = rb_first(&cfs_rq->tasks_timeline); node; node = rb_next(node)) { + se = rb_entry(node, struct sched_entity, run_node); + if (entity_is_task(se)) { + struct task_struct *p = task_of(se); + update_last_dequeued(p, ktime_to_ms(ktime_get())); + } + + } + return 0; +} + +static void diag_throttle_cfs_rq(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + + rcu_read_lock(); + walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); + rcu_read_unlock(); + + orig_throttle_cfs_rq(cfs_rq); + +} + + +static void new_throttle_cfs_rq(struct cfs_rq *cfs_rq) +{ + atomic64_inc_return(&diag_nr_running); + diag_throttle_cfs_rq(cfs_rq); + atomic64_dec_return(&diag_nr_running); +} + +static int lookup_syms(void) +{ + LOOKUP_SYMS(throttle_cfs_rq); + return 0; +} + +static void jump_init(void) +{ + JUMP_INIT(throttle_cfs_rq); + +} + +static int kprobe_dequeue_entity_pre(struct kprobe *p, struct pt_regs *regs) +{ + struct sched_entity *se = (void *)ORIG_PARAM2(regs); + int *flags = (void *)ORIG_PARAM3(regs); + struct task_struct *task; + + if (!throttle_delay_settings.activated) + return 0; + + + return 0; +} + +#if KERNEL_VERSION(4, 9, 0) <= LINUX_VERSION_CODE +static void trace_sched_switch_hit(void *__data, bool preempt, + struct task_struct *prev, struct task_struct *next) +#elif KERNEL_VERSION(3, 10, 0) <= LINUX_VERSION_CODE +static void trace_sched_switch_hit(void *__data, + struct task_struct *prev, struct task_struct *next) +#else +static void trace_sched_switch_hit(struct rq *rq, struct task_struct *prev, + struct task_struct *next) +#endif +{ + unsigned long long t_dequeued; + unsigned long long delta = 0; + unsigned long long delta_ms; + unsigned long long now = ktime_to_ms(ktime_get()); + + struct task_struct *leader = next->group_leader ? 
next->group_leader : next; + + if (throttle_delay_settings.bvt == 0 && diag_get_task_type(next) < 0) + return; + + if (throttle_delay_settings.comm[0] && (strcmp("none", throttle_delay_settings.comm) != 0)) { + if (strcmp(leader->comm, throttle_delay_settings.comm) != 0) + return; + } + + if (throttle_delay_settings.tgid && leader->pid != throttle_delay_settings.tgid) { + return; + } + + if (throttle_delay_settings.pid && next->pid != throttle_delay_settings.pid) { + return; + } + + t_dequeued = read_last_dequeued(next); + update_last_dequeued(next, 0); + if (t_dequeued <= 0) + return; + + delta = now - t_dequeued; + delta_ms = delta; + + if (delta_ms >= throttle_delay_settings.threshold_ms) { + struct throttle_delay_dither *dither; + unsigned long flags; + + if (strcmp(leader->comm, "qemu-kvm") == 0) + return; + + dither = &diag_percpu_context[smp_processor_id()]->throttle_delay_dither; + dither->et_type = et_throttle_delay_dither; + dither->id = diag_throttle_delay_id; + do_diag_gettimeofday(&dither->tv); + dither->seq = throttle_delay_seq; + throttle_delay_seq++; + dither->now = now; + dither->dequeued = t_dequeued; + dither->delay_ms = delta_ms; + diag_task_brief(next, &dither->task); + diag_task_kern_stack(next, &dither->kern_stack); + diag_task_user_stack(next, &dither->user_stack); + dump_proc_chains_simple(next, &dither->proc_chains); + + diag_variant_buffer_spin_lock(&throttle_delay_variant_buffer, flags); + diag_variant_buffer_reserve(&throttle_delay_variant_buffer, sizeof(struct throttle_delay_dither)); + diag_variant_buffer_write_nolock(&throttle_delay_variant_buffer, dither, sizeof(struct throttle_delay_dither)); + diag_variant_buffer_seal(&throttle_delay_variant_buffer); + diag_variant_buffer_spin_unlock(&throttle_delay_variant_buffer, flags); + } +} + +static int __activate_throttle_delay(void) +{ + int ret = 0; + + ret = alloc_diag_variant_buffer(&throttle_delay_variant_buffer); + if (ret) + goto out_variant_buffer; + throttle_delay_alloced = 1; + + JUMP_CHECK(throttle_cfs_rq); + + hook_tracepoint("sched_switch", trace_sched_switch_hit, NULL); +// hook_kprobe(&kprobe_dequeue_entity, "dequeue_entity", +// kprobe_dequeue_entity_pre, NULL); + JUMP_INSTALL(throttle_cfs_rq); + return 1; +out_variant_buffer: + return 0; +} + +int activate_throttle_delay(void) +{ + if (!throttle_delay_settings.activated) + throttle_delay_settings.activated = __activate_throttle_delay(); + + return throttle_delay_settings.activated; +} + +static void __deactivate_throttle_delay(void) +{ + unhook_tracepoint("sched_switch", trace_sched_switch_hit, NULL); + + JUMP_REMOVE(throttle_cfs_rq); + + msleep(20); + while (atomic64_read(&diag_nr_running) > 0) + { + msleep(10); + } +} + +int deactivate_throttle_delay(void) +{ + if (throttle_delay_settings.activated) + __deactivate_throttle_delay(); + throttle_delay_settings.activated = 0; + + return 0; +} + +static void dump_data(void) +{ + struct throttle_delay_rq rq; + unsigned long flags; + int cpu; + + rq.et_type = et_throttle_delay_rq; + rq.id = diag_throttle_delay_id; + do_diag_gettimeofday(&rq.tv); + + for_each_online_cpu(cpu) + { + rq.seq = throttle_delay_seq; + throttle_delay_seq++; + rq.cpu = cpu; + + diag_variant_buffer_spin_lock(&throttle_delay_variant_buffer, flags); + diag_variant_buffer_reserve(&throttle_delay_variant_buffer, sizeof(struct throttle_delay_rq)); + diag_variant_buffer_write_nolock(&throttle_delay_variant_buffer, &rq, sizeof(struct throttle_delay_rq)); + diag_variant_buffer_seal(&throttle_delay_variant_buffer); + 
diag_variant_buffer_spin_unlock(&throttle_delay_variant_buffer, flags); + } + + +} + +int throttle_delay_syscall(struct pt_regs *regs, long id) +{ + int __user *user_ptr_len; + size_t __user user_buf_len; + void __user *user_buf; + int ret = 0; + static struct diag_throttle_delay_settings settings; + + switch (id) { + case DIAG_THROTTLE_DELAY_SET: + user_buf = (void __user *)SYSCALL_PARAM1(regs); + user_buf_len = (size_t)SYSCALL_PARAM2(regs); + + if (user_buf_len != sizeof(struct diag_throttle_delay_settings)) { + ret = -EINVAL; + } else if (throttle_delay_settings.activated) { + ret = -EBUSY; + } else { + ret = copy_from_user(&settings, user_buf, user_buf_len); + if (!ret) { + throttle_delay_settings = settings; + } + } + break; + case DIAG_THROTTLE_DELAY_SETTINGS: + user_buf = (void __user *)SYSCALL_PARAM1(regs); + user_buf_len = (size_t)SYSCALL_PARAM2(regs); + + if (user_buf_len != sizeof(struct diag_throttle_delay_settings)) { + ret = -EINVAL; + } else { + settings = throttle_delay_settings; + ret = copy_to_user(user_buf, &settings, user_buf_len); + } + break; + case DIAG_THROTTLE_DELAY_DUMP: + user_ptr_len = (void __user *)SYSCALL_PARAM1(regs); + user_buf = (void __user *)SYSCALL_PARAM2(regs); + user_buf_len = (size_t)SYSCALL_PARAM3(regs); + + if (!throttle_delay_alloced) { + ret = -EINVAL; + } else { + dump_data(); + ret = copy_to_user_variant_buffer(&throttle_delay_variant_buffer, + user_ptr_len, user_buf, user_buf_len); + diag_throttle_delay_id++; + record_dump_cmd("throttle-delay"); + } + break; + default: + ret = -ENOSYS; + break; + } + + return ret; +} + +long diag_ioctl_throttle_delay(unsigned int cmd, unsigned long arg) +{ + struct diag_ioctl_dump_param dump_param; + int ret = 0; + static struct diag_throttle_delay_settings settings; + + switch (cmd) { + case CMD_THROTTLE_DELAY_SET: + if (throttle_delay_settings.activated) { + ret = -EBUSY; + } else { + ret = copy_from_user(&settings, (void *)arg, sizeof(struct diag_throttle_delay_settings)); + if (!ret) { + throttle_delay_settings = settings; + } + } + break; + case CMD_THROTTLE_DELAY_SETTINGS: + settings = throttle_delay_settings; + ret = copy_to_user((void *)arg, &settings, sizeof(struct diag_throttle_delay_settings)); + break; + case CMD_THROTTLE_DELAY_DUMP: + ret = copy_from_user(&dump_param, (void *)arg, sizeof(struct diag_ioctl_dump_param)); + if (!throttle_delay_alloced) { + ret = -EINVAL; + } else if (!ret) { + dump_data(); + ret = copy_to_user_variant_buffer(&throttle_delay_variant_buffer, + dump_param.user_ptr_len, dump_param.user_buf, dump_param.user_buf_len); + diag_throttle_delay_id++; + record_dump_cmd("throttle-delay"); + } + break; + default: + ret = -ENOSYS; + break; + } + + return ret; +} + +int diag_throttle_delay_init(void) +{ + if (lookup_syms()) + return -EINVAL; + + init_diag_variant_buffer(&throttle_delay_variant_buffer, 4 * 1024 * 1024); + jump_init(); + + if (throttle_delay_settings.activated) + throttle_delay_settings.activated = __activate_throttle_delay(); + + return 0; + +} + +void diag_throttle_delay_exit(void) +{ + if (throttle_delay_settings.activated) + __deactivate_throttle_delay(); + throttle_delay_settings.activated = 0; + + destroy_diag_variant_buffer(&throttle_delay_variant_buffer); +} +#else +int diag_throttle_delay_init(void) +{ + return 0; +} + +void diag_throttle_delay_exit(void) +{ + +} +#endif diff --git a/SOURCE/uapi/ali_diagnose.h b/SOURCE/uapi/ali_diagnose.h index dc7f2d7..f0f06e5 100644 --- a/SOURCE/uapi/ali_diagnose.h +++ b/SOURCE/uapi/ali_diagnose.h @@ -99,8 +99,9 @@ extern 
unsigned long debug_mode;
 #define DIAG_IOCTL_TYPE_RW_SEM (DIAG_IOCTL_TYPE_TASK_MONITOR + 1)
 #define DIAG_IOCTL_TYPE_RSS_MONITOR (DIAG_IOCTL_TYPE_RW_SEM + 1)
 #define DIAG_IOCTL_TYPE_MEMCG_STATS (DIAG_IOCTL_TYPE_RSS_MONITOR + 1)
+#define DIAG_IOCTL_TYPE_THROTTLE_DELAY (DIAG_IOCTL_TYPE_MEMCG_STATS + 1)

-#define DIAG_IOCTL_TYPE_END (DIAG_IOCTL_TYPE_MEMCG_STATS + 1)
+#define DIAG_IOCTL_TYPE_END (DIAG_IOCTL_TYPE_THROTTLE_DELAY + 1)

 long diag_ioctl_sys_delay(unsigned int cmd, unsigned long arg);
 long diag_ioctl_sys_cost(unsigned int cmd, unsigned long arg);
@@ -343,6 +344,11 @@ struct diag_ioctl_dump_param_cycle {
 #define DIAG_BASE_SYSCALL_MEMCG_STATS \
	(DIAG_BASE_SYSCALL_PING_DELAY6 + DIAG_SYSCALL_INTERVAL)

+/// 1900
+#define DIAG_BASE_SYSCALL_THROTTLE_DELAY \
+	(DIAG_BASE_SYSCALL_PING_DELAY6 + DIAG_SYSCALL_INTERVAL)
+
+
 #define DIAG_SYSCALL_END (DIAG_BASE_SYSCALL + DIAG_SYSCALL_INTERVAL * 1000)

 enum diag_record_id {
@@ -511,6 +517,10 @@ enum diag_record_id {
 	et_memcg_stats_summary,
 	et_memcg_stats_detail,

+	et_throttle_delay_base = et_rss_monitor_base + DIAG_EVENT_TYPE_INTERVAL,
+	et_throttle_delay_dither,
+	et_throttle_delay_rq,
+
 	et_count
 };

diff --git a/SOURCE/uapi/throttle_delay.h b/SOURCE/uapi/throttle_delay.h
new file mode 100644
index 0000000..f304030
--- /dev/null
+++ b/SOURCE/uapi/throttle_delay.h
@@ -0,0 +1,62 @@
+/*
+ * Linux kernel diagnostic tool -- user-space API for throttle-delay
+ *
+ * Copyright (C) 2020 Alibaba Ltd.
+ *
+ * Author: Xiongwei Jiang <[email protected]>
+ *
+ * License terms: GNU General Public License (GPL) version 3
+ *
+ */
+
+#ifndef UAPI_THROTTLE_DELAY_H
+#define UAPI_THROTTLE_DELAY_H
+
+#include <linux/ioctl.h>
+
+int throttle_delay_syscall(struct pt_regs *regs, long id);
+
+#define DIAG_THROTTLE_DELAY_SET (DIAG_BASE_SYSCALL_THROTTLE_DELAY)
+#define DIAG_THROTTLE_DELAY_SETTINGS (DIAG_THROTTLE_DELAY_SET + 1)
+#define DIAG_THROTTLE_DELAY_DUMP (DIAG_THROTTLE_DELAY_SETTINGS + 1)
+
+struct diag_throttle_delay_settings {
+	unsigned int activated;
+	unsigned int verbose;
+	unsigned int tgid;
+	unsigned int pid;
+	unsigned int bvt;
+	char comm[TASK_COMM_LEN];
+	unsigned int threshold_ms;
+};
+
+struct throttle_delay_rq {
+	int et_type;
+	unsigned long id;
+	unsigned long seq;
+	struct diag_timespec tv;
+	int cpu;
+	int nr_running;
+};
+
+struct throttle_delay_dither {
+	int et_type;
+	unsigned long id;
+	unsigned long seq;
+	struct diag_timespec tv;
+	unsigned long delay_ms;
+	unsigned long now, dequeued;
+	struct diag_task_detail task;
+	struct diag_kern_stack_detail kern_stack;
+	struct diag_user_stack_detail user_stack;
+	struct diag_proc_chains_detail proc_chains;
+};
+
+#define CMD_THROTTLE_DELAY_SET (0)
+#define CMD_THROTTLE_DELAY_SETTINGS (CMD_THROTTLE_DELAY_SET + 1)
+#define CMD_THROTTLE_DELAY_DUMP (CMD_THROTTLE_DELAY_SETTINGS + 1)
+#define DIAG_IOCTL_THROTTLE_DELAY_SET _IOR(DIAG_IOCTL_TYPE_THROTTLE_DELAY, CMD_THROTTLE_DELAY_SET, struct diag_throttle_delay_settings)
+#define DIAG_IOCTL_THROTTLE_DELAY_SETTINGS _IOW(DIAG_IOCTL_TYPE_THROTTLE_DELAY, CMD_THROTTLE_DELAY_SETTINGS, struct diag_throttle_delay_settings)
+#define DIAG_IOCTL_THROTTLE_DELAY_DUMP _IOR(DIAG_IOCTL_TYPE_THROTTLE_DELAY, CMD_THROTTLE_DELAY_DUMP, struct diag_ioctl_dump_param)
+
+#endif /* UAPI_THROTTLE_DELAY_H */