| author | Xiongwei Jiang <[email protected]> | 2021-11-10 14:26:01 +0800 |
|---|---|---|
| committer | Xiongwei Jiang <[email protected]> | 2021-11-10 14:26:01 +0800 |
| commit | 4a950c984e2730fc35aaca319bcb28b51ce260b6 (patch) | |
| tree | 9c41045eeae71454d97bb093df22b602d8594860 /SOURCE | |
| parent | 19c8cbdf483bb14b4dae02cee41bc657bf98909c (diff) | |
sched: throttle-delay kernel space
Diffstat (limited to 'SOURCE')
| mode | path | lines |
|---|---|---|
| -rw-r--r-- | SOURCE/diagnose-tools/throttle_delay.cc | 411 |
| -rwxr-xr-x | SOURCE/module/Makefile | 2 |
| -rwxr-xr-x | SOURCE/module/internal.h | 3 |
| -rw-r--r-- | SOURCE/module/kernel/throttle_delay.c | 958 |
| -rw-r--r-- | SOURCE/uapi/ali_diagnose.h | 12 |
| -rw-r--r-- | SOURCE/uapi/throttle_delay.h | 62 |
6 files changed, 1445 insertions, 3 deletions
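
For orientation before the diff: the patch adds a `throttle-delay` subcommand handled by `throttle_delay_main()`, which activates kernel-side tracking of tasks delayed by CFS bandwidth throttling beyond a threshold (50 ms by default) and dumps the captured stacks. A minimal usage sketch follows, assuming the usual `diagnose-tools <feature>` front end dispatches to this subcommand; the tgid value is only illustrative.

```sh
# Activate monitoring for one process group with a 100 ms threshold
# (threshold is in milliseconds and defaults to 50 when omitted).
diagnose-tools throttle-delay --activate='threshold=100 tgid=1234'

# Show the current settings and dump the captured delay events as text.
diagnose-tools throttle-delay --settings
diagnose-tools throttle-delay --report

# Turn the feature back off.
diagnose-tools throttle-delay --deactivate
```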
diff --git a/SOURCE/diagnose-tools/throttle_delay.cc b/SOURCE/diagnose-tools/throttle_delay.cc new file mode 100644 index 0000000..5968565 --- /dev/null +++ b/SOURCE/diagnose-tools/throttle_delay.cc @@ -0,0 +1,411 @@ +/* + * Linux内核诊断工具--用户态throttle-delay功能实现 + * + * Copyright (C) 2020 Alibaba Ltd. + * + * 作者: Baoyou Xie <[email protected]> + * + * License terms: GNU General Public License (GPL) version 3 + * + */ + +#include <sched.h> +#include <stdio.h> +#include <string.h> +#include <stdlib.h> +#include <unistd.h> +#include <errno.h> +#include <getopt.h> + +#include <sys/time.h> +#include <string.h> +#include <stdio.h> /* for printf */ +#include <stdlib.h> /* for exit */ + +#include <set> + +#include "internal.h" +#include "symbol.h" +#include "json/json.h" +#include <iostream> +#include <fstream> + +#include "uapi/throttle_delay.h" +#include "params_parse.h" +#include <syslog.h> + +using namespace std; + +static char sls_file[256]; +static int syslog_enabled; + +void usage_throttle_delay(void) +{ + printf(" throttle-delay usage:\n"); + printf(" --help throttle-delay help info\n"); + printf(" --activate\n"); + printf(" verbose VERBOSE\n"); + printf(" threshold THRESHOLD(MS)\n"); + printf(" tgid process group monitored\n"); + printf(" pid thread id that monitored\n"); + printf(" comm comm that monitored\n"); + printf(" --deactivate\n"); + printf(" --report dump log with text.\n"); +} + +static void do_activate(const char *arg) +{ + int ret = 0; + struct params_parser parse(arg); + struct diag_throttle_delay_settings settings; + string str; + + memset(&settings, 0, sizeof(struct diag_throttle_delay_settings)); + + settings.verbose = parse.int_value("verbose"); + settings.tgid = parse.int_value("tgid"); + settings.pid = parse.int_value("pid"); + settings.bvt = parse.int_value("bvt"); + settings.threshold_ms = parse.int_value("threshold"); + + if (0 == settings.threshold_ms) + { + settings.threshold_ms = 50; + } + + str = parse.string_value("comm"); + if (str.length() > 0) { + strncpy(settings.comm, str.c_str(), TASK_COMM_LEN); + settings.comm[TASK_COMM_LEN - 1] = 0; + } + + if (run_in_host) { + ret = diag_call_ioctl(DIAG_IOCTL_THROTTLE_DELAY_SET, (long)&settings); + } else { + ret = -ENOSYS; + syscall(DIAG_THROTTLE_DELAY_SET, &ret, &settings, sizeof(struct diag_throttle_delay_settings)); + } + + printf("功能设置%s,返回值:%d\n", ret ? 
"失败" : "成功", ret); + printf(" 进程ID:\t%d\n", settings.tgid); + printf(" 线程ID:\t%d\n", settings.pid); + printf(" 进程名称:\t%s\n", settings.comm); + printf(" 监控阈值(ms):\t%d\n", settings.threshold_ms); + printf(" 输出级别:\t%d\n", settings.verbose); + if (ret) + return; + + ret = diag_activate("throttle-delay"); + if (ret == 1) { + printf("throttle-delay activated\n"); + } else { + printf("throttle-delay is not activated, ret %d\n", ret); + } +} + +static void do_deactivate(void) +{ + int ret = 0; + + ret = diag_deactivate("throttle-delay"); + if (ret == 0) { + printf("throttle-delay is not activated\n"); + } else { + printf("deactivate throttle-delay fail, ret is %d\n", ret); + } +} + +static void do_settings(const char *arg) +{ + struct diag_throttle_delay_settings settings; + int ret; + int enable_json = 0; + Json::Value root; + struct params_parser parse(arg); + enable_json = parse.int_value("json"); + + if (run_in_host) { + ret = diag_call_ioctl(DIAG_IOCTL_THROTTLE_DELAY_SETTINGS, (long)&settings); + } else { + ret = -ENOSYS; + syscall(DIAG_THROTTLE_DELAY_SETTINGS, &ret, &settings, + sizeof(struct diag_throttle_delay_settings)); + } + + if (ret == 0) { + if (1 != enable_json) + { + printf("功能设置:\n"); + printf(" 是否激活:\t%s\n", settings.activated ? "√" : "×"); + printf(" 进程ID:\t%d\n", settings.tgid); + printf(" 线程ID:\t%d\n", settings.pid); + printf(" 进程名称:\t%s\n", settings.comm); + printf(" 监控阈值(ms):\t%d\n", settings.threshold_ms); + printf(" 输出级别:\t%d\n", settings.verbose); + } + else + { + root["activated"] = Json::Value(settings.activated); + root["tgid"] = Json::Value(settings.tgid); + root["pid"] = Json::Value(settings.pid); + root["comm"] = Json::Value(settings.comm); + root["threshold"] = Json::Value(settings.threshold_ms); + root["verbose"] = Json::Value(settings.verbose); + } + } else { + if (1 != enable_json) + { + printf("获取throttle-delay设置失败,请确保正确安装了diagnose-tools工具\n"); + } + else + { + root["err"]=Json::Value("found throttle-delay settings failed, please check diagnose-tools installed or not\n"); + } + } + + if (1 == enable_json) + { + std::string str_log; + str_log.append(root.toStyledString()); + printf("%s", str_log.c_str()); + } +} + +static int throttle_delay_extract(void *buf, unsigned int len, void *) +{ + int *et_type; + struct throttle_delay_dither *dither; + struct throttle_delay_rq *rq; + static int seq = 0; + + if (len == 0) + return 0; + + et_type = (int *)buf; + switch (*et_type) { + case et_throttle_delay_dither: + if (len < sizeof(struct throttle_delay_dither)) + break; + dither = (struct throttle_delay_dither *)buf; + + printf("警告:调度被延迟 %lu ms,NOW: %lu, QUEUED: %lu, 当前时间:[%lu:%lu]\n", + dither->delay_ms, + dither->now, + dither->dequeued, + dither->tv.tv_sec, + dither->tv.tv_usec); + + printf("##CGROUP:[%s] %d [%03d] 采样命中\n", + dither->task.cgroup_buf, + dither->task.pid, + seq); + seq++; + + diag_printf_kern_stack(&dither->kern_stack); + diag_printf_user_stack(dither->task.tgid, + dither->task.container_tgid, + dither->task.comm, + &dither->user_stack); + printf("#* 0xffffffffffffff %s (UNKNOWN)\n", + dither->task.comm); + diag_printf_proc_chains(&dither->proc_chains); + printf("##\n"); + + break; + case et_throttle_delay_rq: + if (len < sizeof(struct throttle_delay_rq)) + break; + rq = (struct throttle_delay_rq *)buf; + + printf("\tCPU %d,nr_running:%d\n", + rq->cpu, rq->nr_running); + + break; + default: + break; + } + + return 0; +} + +static void do_extract(char *buf, int len) +{ + extract_variant_buffer(buf, len, throttle_delay_extract, NULL); +} + +static void 
do_dump(const char *arg) +{ + static char variant_buf[140 * 1024 * 1024]; + int len; + int ret = 0; + struct diag_ioctl_dump_param dump_param = { + .user_ptr_len = &len, + .user_buf_len = 4 * 1024 * 1024, + .user_buf = variant_buf, + }; + + memset(variant_buf, 0, 4 * 1024 * 1024); + if (run_in_host) { + ret = diag_call_ioctl(DIAG_IOCTL_THROTTLE_DELAY_DUMP, (long)&dump_param); + } else { + ret = -ENOSYS; + syscall(DIAG_THROTTLE_DELAY_DUMP, &ret, &len, variant_buf, 4 * 1024 * 1024); + } + + if (ret == 0 && len > 0) { + do_extract(variant_buf, len); + } +} + +static int sls_extract(void *buf, unsigned int len, void *) +{ + int *et_type; + struct throttle_delay_dither *dither; + struct throttle_delay_rq *rq; + symbol sym; + + Json::Value root; + Json::Value task; + Json::Value kern_stack; + Json::Value user_stack; + Json::Value proc_chains; + + if (len == 0) + return 0; + + et_type = (int *)buf; + switch (*et_type) { + case et_throttle_delay_dither: + if (len < sizeof(struct throttle_delay_dither)) + break; + dither = (struct throttle_delay_dither *)buf; + root["id"] = dither->id; + root["seq"] = dither->seq; + root["delay_ms"] = Json::Value(dither->delay_ms); + root["now"] = Json::Value(dither->now); + root["queued"] = Json::Value(dither->dequeued); + diag_sls_time(&dither->tv, root); + diag_sls_task(&dither->task, task); + diag_sls_kern_stack(&dither->kern_stack, task); + diag_sls_user_stack(dither->task.tgid, + dither->task.container_tgid, + dither->task.comm, + &dither->user_stack, task, 0); + diag_sls_proc_chains(&dither->proc_chains, task); + root["task"] = task; + + write_file(sls_file, "throttle-delay-dither", &dither->tv, dither->id, dither->seq, root); + write_syslog(syslog_enabled, "throttle-delay-dither", &dither->tv, dither->id, dither->seq, root); + break; + case et_throttle_delay_rq: + if (len < sizeof(struct throttle_delay_rq)) + break; + rq = (struct throttle_delay_rq *)buf; + root["id"] = rq->id; + root["seq"] = rq->seq; + diag_sls_time(&rq->tv, root); + root["cpu"] = rq->cpu; + root["nr_running"] = rq->nr_running; + write_file(sls_file, "throttle-delay-rq", &rq->tv, rq->id, rq->seq, root); + write_syslog(syslog_enabled, "throttle-delay-rq", &rq->tv, rq->id, rq->seq, root); + break; + default: + break; + } + + return 0; +} + +static void do_sls(char *arg) +{ + int ret; + static char variant_buf[4 * 1024 * 1024]; + int len; + int jiffies_sls = 0; + struct diag_ioctl_dump_param dump_param = { + .user_ptr_len = &len, + .user_buf_len = 4 * 1024 * 1024, + .user_buf = variant_buf, + }; + + ret = log_config(arg, sls_file, &syslog_enabled); + if (ret != 1) + return; + + java_attach_once(); + while (1) { + if (run_in_host) { + ret = diag_call_ioctl(DIAG_IOCTL_THROTTLE_DELAY_DUMP, (long)&dump_param); + } else { + ret = -ENOSYS; + syscall(DIAG_THROTTLE_DELAY_DUMP, &ret, &len, variant_buf, 4 * 1024 * 1024); + } + + if (ret == 0 && len > 0) { + /** + * 10 min + */ + if (jiffies_sls >= 60) { + jiffies_sls = 0; + clear_symbol_info(pid_cmdline, g_symbol_parser.get_java_procs(), 1); + java_attach_once(); + } + + extract_variant_buffer(variant_buf, len, sls_extract, NULL); + } + + sleep(10); + jiffies_sls++; + } +} + +int throttle_delay_main(int argc, char **argv) +{ + static struct option long_options[] = { + {"help", no_argument, 0, 0 }, + {"activate", optional_argument, 0, 0 }, + {"deactivate", no_argument, 0, 0 }, + {"settings", optional_argument, 0, 0 }, + {"report", optional_argument, 0, 0 }, + {"log", required_argument, 0, 0 }, + {0, 0, 0, 0 } + }; + int c; + + if (argc <= 1) { + 
usage_throttle_delay(); + return 0; + } + while (1) { + int option_index = -1; + + c = getopt_long_only(argc, argv, "", long_options, &option_index); + if (c == -1) + break; + switch (option_index) { + case 0: + usage_throttle_delay(); + break; + case 1: + do_activate(optarg ? optarg : ""); + break; + case 2: + do_deactivate(); + break; + case 3: + do_settings(optarg ? optarg : ""); + break; + case 4: + do_dump(optarg ? optarg : ""); + break; + case 5: + do_sls(optarg); + break; + default: + usage_throttle_delay(); + break; + } + } + + return 0; +} diff --git a/SOURCE/module/Makefile b/SOURCE/module/Makefile index 298c20f..0838616 100755 --- a/SOURCE/module/Makefile +++ b/SOURCE/module/Makefile @@ -214,7 +214,7 @@ ifneq ($(KERNELRELEASE),) kernel/exec.o kernel/perf.o kernel/run_trace.o kernel/irq_trace.o \ kernel/kprobe.o kernel/utilization.o kernel/sched_delay.o kernel/reboot.o \ kernel/uprobe.o kernel/sys_cost.o kernel/sig_info.o kernel/task_monitor.o \ - kernel/rw_sem.o + kernel/rw_sem.o kernel/throttle_delay.o $(TARGET)-objs += mm/mm_entry.o mm/alloc_page.o mm/alloc_top.o mm/high_order.o mm/rss_monitor.o mm/memcg_stats.o $(TARGET)-objs += io/io_entry.o diff --git a/SOURCE/module/internal.h b/SOURCE/module/internal.h index adb0372..1de3842 100755 --- a/SOURCE/module/internal.h +++ b/SOURCE/module/internal.h @@ -57,7 +57,7 @@ static inline void __percpu_counter_add(struct percpu_counter *fbc, #include "uapi/rss_monitor.h" #include "pub/variant_buffer.h" #include "pub/stack.h" - +#include "uapi/throttle_delay.h" /** * 手工替换函数相关的宏 */ @@ -429,6 +429,7 @@ struct diag_percpu_context { struct event_run_trace_raw event_run_trace_raw; struct sys_delay_detail sys_delay_detail; struct sched_delay_dither sched_delay_dither; + struct throttle_delay_dither throttle_delay_dither; struct { struct uprobe_detail uprobe_detail; diff --git a/SOURCE/module/kernel/throttle_delay.c b/SOURCE/module/kernel/throttle_delay.c new file mode 100644 index 0000000..578710a --- /dev/null +++ b/SOURCE/module/kernel/throttle_delay.c @@ -0,0 +1,958 @@ +/* + * Linux内核诊断工具--内核态throttle-delay功能 + * + * Copyright (C) 2020 Alibaba Ltd. + * + * 作者: Xiongwei Jiang <[email protected]> + * + * License terms: GNU General Public License (GPL) version 3 + * + */ + +#include <linux/module.h> +#include <linux/stacktrace.h> +#include <linux/hrtimer.h> +#include <linux/kernel.h> +#include <linux/kallsyms.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/timex.h> +#include <linux/tracepoint.h> +#include <trace/events/irq.h> +#include <linux/proc_fs.h> +#include <linux/init.h> +#include <linux/sysctl.h> +#include <trace/events/napi.h> +#include <linux/rtc.h> +#include <linux/time.h> +#include <linux/version.h> +#include <linux/net.h> +#include <linux/ip.h> +#include <linux/udp.h> +#include <linux/tcp.h> +#include <linux/icmp.h> +#include <linux/netfilter.h> +#include <net/tcp.h> +#include <linux/stop_machine.h> +#include <linux/smp.h> +#include <asm/thread_info.h> + +#include "internal.h" +#include "mm_tree.h" +#include "kern_internal.h" +#include "pub/trace_file.h" +#include "pub/trace_point.h" + +#include "uapi/throttle_delay.h" + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 32) && \ + LINUX_VERSION_CODE <= KERNEL_VERSION(4, 20, 0) \ + && !defined(UBUNTU_1604) + +#if defined(ALIOS_4000_009) +static unsigned long *get_last_dequeued_addr(struct task_struct *p) +{ + /** + * task_stack_page, but not end_of_stack !! 
+ */ + return task_stack_page(p) + sizeof(struct thread_info) + 32; +} +#else +#if defined(CENTOS_8U) +#define diag_last_dequeued rh_reserved2 +#elif KERNEL_VERSION(4, 9, 0) <= LINUX_VERSION_CODE +#define diag_last_dequeued ali_reserved3 +#elif KERNEL_VERSION(3, 10, 0) <= LINUX_VERSION_CODE +#define diag_last_dequeued rh_reserved3 +#else +#define diag_last_dequeued rh_reserved[0] +#endif + +static unsigned long *get_last_dequeued_addr(struct task_struct *p) +{ + return &p->diag_last_dequeued; +} + +#endif + +#define entity_is_task(se) (!se->my_q) + +//static struct kprobe kprobe_dequeue_entity; +//static int (*orig_throttle_cfs_rq)(struct cfs_rq *cfs_rq); + + +/* task group related information */ +struct rt_bandwidth { + /* nests inside the rq lock: */ + raw_spinlock_t rt_runtime_lock; + ktime_t rt_period; + u64 rt_runtime; + struct hrtimer rt_period_timer; + unsigned int rt_period_active; +}; +struct cfs_bandwidth { +#ifdef CONFIG_CFS_BANDWIDTH + raw_spinlock_t lock; + ktime_t period; + u64 quota, runtime; + s64 hierarchical_quota; + u64 runtime_expires; + int expires_seq; + + u8 idle; + u8 period_active; + u8 slack_started; + struct hrtimer period_timer, slack_timer; + struct list_head throttled_cfs_rq; + + /* statistics */ + int nr_periods, nr_throttled; + u64 throttled_time; +#endif +}; + + + +struct task_group { + struct cgroup_subsys_state css; + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* schedulable entities of this group on each cpu */ + struct sched_entity **se; + /* runqueue "owned" by this group on each cpu */ + struct cfs_rq **cfs_rq; + unsigned long shares; + int bvt; +#ifdef CONFIG_SMP + /* + * load_avg can be heavily contended at clock tick time, so put + * it in its own cacheline separated from the fields above which + * will also be accessed at each tick. + */ + atomic_long_t load_avg ____cacheline_aligned; +#endif +#endif + +#ifdef CONFIG_RT_GROUP_SCHED + struct sched_rt_entity **rt_se; + struct rt_rq **rt_rq; + + struct rt_bandwidth rt_bandwidth; +#endif + + struct rcu_head rcu; + struct list_head list; + + struct task_group *parent; + struct list_head siblings; + struct list_head children; + +#ifdef CONFIG_SCHED_AUTOGROUP + struct autogroup *autogroup; +#endif + + struct cfs_bandwidth cfs_bandwidth; + + ALI_HOTFIX_RESERVE(1) + ALI_HOTFIX_RESERVE(2) + ALI_HOTFIX_RESERVE(3) + ALI_HOTFIX_RESERVE(4) +}; + +/* CFS-related fields in a runqueue */ +struct cfs_rq { + struct load_weight load; + unsigned int nr_running, h_nr_running; + + u64 exec_clock; + u64 min_vruntime; +#ifndef CONFIG_64BIT + u64 min_vruntime_copy; +#endif + + struct rb_root tasks_timeline; + struct rb_node *rb_leftmost; + + /* + * 'curr' points to currently running entity on this cfs_rq. + * It is set to NULL otherwise (i.e when none are currently running). + */ + struct sched_entity *curr, *next, *last, *skip; + + /* Effective bvt type */ + int ebvt; + +#ifdef CONFIG_SCHED_DEBUG + unsigned int nr_spread_over; +#endif + +#ifdef CONFIG_SMP + /* + * CFS load tracking + */ + struct sched_avg avg; + u64 runnable_load_sum; + unsigned long runnable_load_avg; +#ifdef CONFIG_FAIR_GROUP_SCHED + unsigned long tg_load_avg_contrib; +#endif + atomic_long_t removed_load_avg, removed_util_avg; +#ifndef CONFIG_64BIT + u64 load_last_update_time_copy; +#endif + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* + * h_load = weight * f(tg) + * + * Where f(tg) is the recursive weight fraction assigned to + * this group. 
+ */ + unsigned long h_load; + u64 last_h_load_update; + struct sched_entity *h_load_next; +#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_FAIR_GROUP_SCHED + struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ + + /* + * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in + * a hierarchy). Non-leaf lrqs hold other higher schedulable entities + * (like users, containers etc.) + * + * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This + * list is used during load balance. + */ + int on_list; + struct list_head leaf_cfs_rq_list; + struct task_group *tg; /* group that "owns" this runqueue */ + struct list_head batch_node; + unsigned int nr_batch_running; /* only tasks, no group se */ + +#ifdef CONFIG_CFS_BANDWIDTH + int runtime_enabled; + int expires_seq; + u64 runtime_expires; + s64 runtime_remaining; + + u64 throttled_clock, throttled_clock_task; + u64 throttled_clock_task_time; + int throttled, throttle_count; + struct list_head throttled_list; +#endif /* CONFIG_CFS_BANDWIDTH */ + +#ifdef CONFIG_CFS_BVT + u64 kick_delay_nc; + u64 throttled_clock_nc; + u64 throttled_time_nc; /* total time */ + u64 throttled_time_nc_max; /* single max time */ + int throttled_nc; + struct list_head throttled_node_nc; +#endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ + + unsigned long nr_uninterruptible; + + ALI_HOTFIX_RESERVE(1) + ALI_HOTFIX_RESERVE(2) + ALI_HOTFIX_RESERVE(3) + ALI_HOTFIX_RESERVE(4) +}; + +/* + * This is the priority-queue data structure of the RT scheduling class: + */ +struct rt_prio_array { + DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ + struct list_head queue[MAX_RT_PRIO]; +}; + +/* Real-Time classes' related field in a runqueue: */ +struct rt_rq { + struct rt_prio_array active; + unsigned int rt_nr_running; + unsigned int rr_nr_running; +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED + struct { + int curr; /* highest queued rt task prio */ +#ifdef CONFIG_SMP + int next; /* next highest */ +#endif + } highest_prio; +#endif +#ifdef CONFIG_SMP + unsigned long rt_nr_migratory; + unsigned long rt_nr_total; + int overloaded; + struct plist_head pushable_tasks; +#endif /* CONFIG_SMP */ + int rt_queued; + + int rt_throttled; + u64 rt_time; + u64 rt_runtime; + /* Nests inside the rq lock: */ + raw_spinlock_t rt_runtime_lock; + +#ifdef CONFIG_RT_GROUP_SCHED + unsigned long rt_nr_boosted; + + struct rq *rq; + struct task_group *tg; +#endif + + unsigned long nr_uninterruptible; +}; + +/* Deadline class' related fields in a runqueue */ +struct dl_rq { + /* runqueue is an rbtree, ordered by deadline */ + struct rb_root rb_root; + struct rb_node *rb_leftmost; + + unsigned long dl_nr_running; + +#ifdef CONFIG_SMP + /* + * Deadline values of the currently executing and the + * earliest ready task on this rq. Caching these facilitates + * the decision wether or not a ready but not running task + * should migrate somewhere else. + */ + struct { + u64 curr; + u64 next; + } earliest_dl; + + unsigned long dl_nr_migratory; + int overloaded; + + /* + * Tasks on this rq that can be pushed away. They are kept in + * an rb-tree, ordered by tasks' deadlines, with caching + * of the leftmost (earliest deadline) element. 
+ */ + struct rb_root pushable_dl_tasks_root; + struct rb_node *pushable_dl_tasks_leftmost; +#else + struct dl_bw dl_bw; +#endif +}; + +#if 0 +typedef void (*smp_call_func_t)(void *info); +struct call_single_data { + struct llist_node llist; + smp_call_func_t func; + void *info; + unsigned int flags; +}; +#endif + +/* + * This is the main, per-CPU runqueue data structure. + * + * Locking rule: those places that want to lock multiple runqueues + * (such as the load balancing or the thread migration code), lock + * acquire operations must be ordered by ascending &runqueue. + */ +struct rq { + /* runqueue lock: */ + raw_spinlock_t lock; + + /* + * nr_running and cpu_load should be in the same cacheline because + * remote CPUs use both these fields when doing load calculation. + */ + unsigned int nr_running; +#ifdef CONFIG_NUMA_BALANCING + unsigned int nr_numa_running; + unsigned int nr_preferred_running; +#endif + #define CPU_LOAD_IDX_MAX 5 + unsigned long cpu_load[CPU_LOAD_IDX_MAX]; +#ifdef CONFIG_NO_HZ_COMMON +#ifdef CONFIG_SMP + unsigned long last_load_update_tick; +#endif /* CONFIG_SMP */ + unsigned long nohz_flags; +#endif /* CONFIG_NO_HZ_COMMON */ +#ifdef CONFIG_NO_HZ_FULL + unsigned long last_sched_tick; +#endif + /* capture load from *all* tasks on this cpu: */ + struct load_weight load; + unsigned long nr_load_updates; + u64 nr_switches; + + struct cfs_rq cfs; + struct rt_rq rt; + struct dl_rq dl; + + u64 kick_start_nc; +#ifdef CONFIG_FAIR_GROUP_SCHED + /* list of leaf cfs_rq on this cpu: */ + struct list_head leaf_cfs_rq_list; +#ifdef CONFIG_CFS_BVT + struct list_head throttled_list_nc; +#endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ + + /* + * This is part of a global counter where only the total sum + * over all CPUs matters. A task can increase this counter on + * one CPU and if it got migrated afterwards it may decrease + * it on another CPU. 
Always updated under the runqueue lock: + */ + unsigned long nr_uninterruptible; + + struct task_struct *curr, *idle, *stop; + unsigned long next_balance; + struct mm_struct *prev_mm; + + unsigned int clock_skip_update; + u64 clock; + u64 clock_task; + + atomic_t nr_iowait; + +#ifdef CONFIG_SMP + struct root_domain *rd; + struct sched_domain *sd; + + unsigned long cpu_capacity; + unsigned long cpu_capacity_orig; + + struct callback_head *balance_callback; + + unsigned char idle_balance; + /* For active balancing */ + int active_balance; + int push_cpu; + struct cpu_stop_work active_balance_work; + /* cpu of this runqueue: */ + int cpu; + int online; + + struct list_head cfs_tasks; +#ifdef CONFIG_CFS_BVT + unsigned int nr_active_batch; + unsigned int nr_ls_tasks; + atomic_t curr_task_type; + int cpu_sibling; + unsigned int nr_deactive_batchq; + struct list_head batchqs; + u64 throttled_clock_nc; + s64 exempt_quota_nc; +#endif + + u64 rt_avg; + u64 age_stamp; + u64 idle_stamp; + u64 avg_idle; + + /* This is used to determine avg_idle's max value */ + u64 max_idle_balance_cost; +#endif + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + u64 prev_irq_time; +#endif +#ifdef CONFIG_PARAVIRT + u64 prev_steal_time; +#endif +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING + u64 prev_steal_time_rq; +#endif + + /* calc_load related fields */ + unsigned long calc_load_update; + long calc_load_active; + long calc_load_active_r; + +#ifdef CONFIG_SCHED_HRTICK +#ifdef CONFIG_SMP + int hrtick_csd_pending; + struct call_single_data hrtick_csd; +#endif + struct hrtimer hrtick_timer; +#endif + +#ifdef CONFIG_SCHEDSTATS + /* latency stats */ + struct sched_info rq_sched_info; + unsigned long long rq_cpu_time; + /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ + + /* sys_sched_yield() stats */ + unsigned int yld_count; + + /* schedule() stats */ + unsigned int sched_count; + unsigned int sched_goidle; + + /* try_to_wake_up() stats */ + unsigned int ttwu_count; + unsigned int ttwu_local; +#endif + +#ifdef CONFIG_SMP + struct llist_head wake_list; +#endif + +#ifdef CONFIG_CPU_IDLE + /* Must be inspected within a rcu lock section */ + struct cpuidle_state *idle_state; +#endif + + ALI_HOTFIX_RESERVE(1) + ALI_HOTFIX_RESERVE(2) + ALI_HOTFIX_RESERVE(3) + ALI_HOTFIX_RESERVE(4) + ALI_HOTFIX_RESERVE(5) + ALI_HOTFIX_RESERVE(6) + ALI_HOTFIX_RESERVE(7) + ALI_HOTFIX_RESERVE(8) +}; + +typedef int (*tg_visitor)(struct task_group *, void *); + +__maybe_unused static atomic64_t diag_nr_running = ATOMIC64_INIT(0); +struct diag_throttle_delay_settings throttle_delay_settings = { + .threshold_ms = 50, +}; + +static int throttle_delay_alloced; +static int diag_throttle_delay_id; +static int throttle_delay_seq; +static struct diag_variant_buffer throttle_delay_variant_buffer; + +DEFINE_ORIG_FUNC(void, throttle_cfs_rq, 1, + struct cfs_rq *, cfs_rq); + +static inline int cpu_of(struct rq *rq) +{ +#ifdef CONFIG_SMP + return rq->cpu; +#else + return 0; +#endif +} + +static inline struct rq *rq_of(struct cfs_rq *cfs_rq) +{ + return cfs_rq->rq; +} + +int walk_tg_tree_from(struct task_group *from, + tg_visitor down, tg_visitor up, void *data) +{ + struct task_group *parent, *child; + int ret; + + parent = from; + +down: + ret = (*down)(parent, data); + if (ret) + goto out; + list_for_each_entry_rcu(child, &parent->children, siblings) { + parent = child; + goto down; + +up: + continue; + } + ret = (*up)(parent, data); + if (ret || parent == from) + goto out; + + child = parent; + parent = parent->parent; + if (parent) + goto up; +out: + return 
ret; +} + +static inline struct task_struct *task_of(struct sched_entity *se) +{ + return container_of(se, struct task_struct, se); +} + +int tg_nop(struct task_group *tg, void *data) +{ + return 0; +} + +static unsigned long read_last_dequeued(struct task_struct *p) +{ + unsigned long *ptr = get_last_dequeued_addr(p); + + if (ptr) { + return *ptr; + } else { + return 0; + } +} + + +static void update_last_dequeued(struct task_struct *p, unsigned long stamp) +{ + unsigned long *ptr = get_last_dequeued_addr(p); + + if (ptr) { + *ptr = stamp; + } +} + + +static int tg_throttle_down(struct task_group *tg, void *data) +{ + struct rq *rq = data; + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + struct rb_node *node; + struct sched_entity *se; + + if (!throttle_delay_settings.activated) + return 0; + + for (node = rb_first(&cfs_rq->tasks_timeline); node; node = rb_next(node)) { + se = rb_entry(node, struct sched_entity, run_node); + if (entity_is_task(se)) { + struct task_struct *p = task_of(se); + update_last_dequeued(p, ktime_to_ms(ktime_get())); + } + + } + return 0; +} + +static void diag_throttle_cfs_rq(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + + rcu_read_lock(); + walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); + rcu_read_unlock(); + + orig_throttle_cfs_rq(cfs_rq); + +} + + +static void new_throttle_cfs_rq(struct cfs_rq *cfs_rq) +{ + atomic64_inc_return(&diag_nr_running); + diag_throttle_cfs_rq(cfs_rq); + atomic64_dec_return(&diag_nr_running); +} + +static int lookup_syms(void) +{ + LOOKUP_SYMS(throttle_cfs_rq); + return 0; +} + +static void jump_init(void) +{ + JUMP_INIT(throttle_cfs_rq); + +} + +static int kprobe_dequeue_entity_pre(struct kprobe *p, struct pt_regs *regs) +{ + struct sched_entity *se = (void *)ORIG_PARAM2(regs); + int *flags = (void *)ORIG_PARAM3(regs); + struct task_struct *task; + + if (!throttle_delay_settings.activated) + return 0; + + + return 0; +} + +#if KERNEL_VERSION(4, 9, 0) <= LINUX_VERSION_CODE +static void trace_sched_switch_hit(void *__data, bool preempt, + struct task_struct *prev, struct task_struct *next) +#elif KERNEL_VERSION(3, 10, 0) <= LINUX_VERSION_CODE +static void trace_sched_switch_hit(void *__data, + struct task_struct *prev, struct task_struct *next) +#else +static void trace_sched_switch_hit(struct rq *rq, struct task_struct *prev, + struct task_struct *next) +#endif +{ + unsigned long long t_dequeued; + unsigned long long delta = 0; + unsigned long long delta_ms; + unsigned long long now = ktime_to_ms(ktime_get()); + + struct task_struct *leader = next->group_leader ? 
next->group_leader : next; + + if (throttle_delay_settings.bvt == 0 && diag_get_task_type(next) < 0) + return; + + if (throttle_delay_settings.comm[0] && (strcmp("none", throttle_delay_settings.comm) != 0)) { + if (strcmp(leader->comm, throttle_delay_settings.comm) != 0) + return; + } + + if (throttle_delay_settings.tgid && leader->pid != throttle_delay_settings.tgid) { + return; + } + + if (throttle_delay_settings.pid && next->pid != throttle_delay_settings.pid) { + return; + } + + t_dequeued = read_last_dequeued(next); + update_last_dequeued(next, 0); + if (t_dequeued <= 0) + return; + + delta = now - t_dequeued; + delta_ms = delta; + + if (delta_ms >= throttle_delay_settings.threshold_ms) { + struct throttle_delay_dither *dither; + unsigned long flags; + + if (strcmp(leader->comm, "qemu-kvm") == 0) + return; + + dither = &diag_percpu_context[smp_processor_id()]->throttle_delay_dither; + dither->et_type = et_throttle_delay_dither; + dither->id = diag_throttle_delay_id; + do_diag_gettimeofday(&dither->tv); + dither->seq = throttle_delay_seq; + throttle_delay_seq++; + dither->now = now; + dither->dequeued = t_dequeued; + dither->delay_ms = delta_ms; + diag_task_brief(next, &dither->task); + diag_task_kern_stack(next, &dither->kern_stack); + diag_task_user_stack(next, &dither->user_stack); + dump_proc_chains_simple(next, &dither->proc_chains); + + diag_variant_buffer_spin_lock(&throttle_delay_variant_buffer, flags); + diag_variant_buffer_reserve(&throttle_delay_variant_buffer, sizeof(struct throttle_delay_dither)); + diag_variant_buffer_write_nolock(&throttle_delay_variant_buffer, dither, sizeof(struct throttle_delay_dither)); + diag_variant_buffer_seal(&throttle_delay_variant_buffer); + diag_variant_buffer_spin_unlock(&throttle_delay_variant_buffer, flags); + } +} + +static int __activate_throttle_delay(void) +{ + int ret = 0; + + ret = alloc_diag_variant_buffer(&throttle_delay_variant_buffer); + if (ret) + goto out_variant_buffer; + throttle_delay_alloced = 1; + + JUMP_CHECK(throttle_cfs_rq); + + hook_tracepoint("sched_switch", trace_sched_switch_hit, NULL); +// hook_kprobe(&kprobe_dequeue_entity, "dequeue_entity", +// kprobe_dequeue_entity_pre, NULL); + JUMP_INSTALL(throttle_cfs_rq); + return 1; +out_variant_buffer: + return 0; +} + +int activate_throttle_delay(void) +{ + if (!throttle_delay_settings.activated) + throttle_delay_settings.activated = __activate_throttle_delay(); + + return throttle_delay_settings.activated; +} + +static void __deactivate_throttle_delay(void) +{ + unhook_tracepoint("sched_switch", trace_sched_switch_hit, NULL); + + JUMP_REMOVE(throttle_cfs_rq); + + msleep(20); + while (atomic64_read(&diag_nr_running) > 0) + { + msleep(10); + } +} + +int deactivate_throttle_delay(void) +{ + if (throttle_delay_settings.activated) + __deactivate_throttle_delay(); + throttle_delay_settings.activated = 0; + + return 0; +} + +static void dump_data(void) +{ + struct throttle_delay_rq rq; + unsigned long flags; + int cpu; + + rq.et_type = et_throttle_delay_rq; + rq.id = diag_throttle_delay_id; + do_diag_gettimeofday(&rq.tv); + + for_each_online_cpu(cpu) + { + rq.seq = throttle_delay_seq; + throttle_delay_seq++; + rq.cpu = cpu; + + diag_variant_buffer_spin_lock(&throttle_delay_variant_buffer, flags); + diag_variant_buffer_reserve(&throttle_delay_variant_buffer, sizeof(struct throttle_delay_rq)); + diag_variant_buffer_write_nolock(&throttle_delay_variant_buffer, &rq, sizeof(struct throttle_delay_rq)); + diag_variant_buffer_seal(&throttle_delay_variant_buffer); + 
diag_variant_buffer_spin_unlock(&throttle_delay_variant_buffer, flags); + } + + +} + +int throttle_delay_syscall(struct pt_regs *regs, long id) +{ + int __user *user_ptr_len; + size_t __user user_buf_len; + void __user *user_buf; + int ret = 0; + static struct diag_throttle_delay_settings settings; + + switch (id) { + case DIAG_THROTTLE_DELAY_SET: + user_buf = (void __user *)SYSCALL_PARAM1(regs); + user_buf_len = (size_t)SYSCALL_PARAM2(regs); + + if (user_buf_len != sizeof(struct diag_throttle_delay_settings)) { + ret = -EINVAL; + } else if (throttle_delay_settings.activated) { + ret = -EBUSY; + } else { + ret = copy_from_user(&settings, user_buf, user_buf_len); + if (!ret) { + throttle_delay_settings = settings; + } + } + break; + case DIAG_THROTTLE_DELAY_SETTINGS: + user_buf = (void __user *)SYSCALL_PARAM1(regs); + user_buf_len = (size_t)SYSCALL_PARAM2(regs); + + if (user_buf_len != sizeof(struct diag_throttle_delay_settings)) { + ret = -EINVAL; + } else { + settings = throttle_delay_settings; + ret = copy_to_user(user_buf, &settings, user_buf_len); + } + break; + case DIAG_THROTTLE_DELAY_DUMP: + user_ptr_len = (void __user *)SYSCALL_PARAM1(regs); + user_buf = (void __user *)SYSCALL_PARAM2(regs); + user_buf_len = (size_t)SYSCALL_PARAM3(regs); + + if (!throttle_delay_alloced) { + ret = -EINVAL; + } else { + dump_data(); + ret = copy_to_user_variant_buffer(&throttle_delay_variant_buffer, + user_ptr_len, user_buf, user_buf_len); + diag_throttle_delay_id++; + record_dump_cmd("throttle-delay"); + } + break; + default: + ret = -ENOSYS; + break; + } + + return ret; +} + +long diag_ioctl_throttle_delay(unsigned int cmd, unsigned long arg) +{ + struct diag_ioctl_dump_param dump_param; + int ret = 0; + static struct diag_throttle_delay_settings settings; + + switch (cmd) { + case CMD_THROTTLE_DELAY_SET: + if (throttle_delay_settings.activated) { + ret = -EBUSY; + } else { + ret = copy_from_user(&settings, (void *)arg, sizeof(struct diag_throttle_delay_settings)); + if (!ret) { + throttle_delay_settings = settings; + } + } + break; + case CMD_THROTTLE_DELAY_SETTINGS: + settings = throttle_delay_settings; + ret = copy_to_user((void *)arg, &settings, sizeof(struct diag_throttle_delay_settings)); + break; + case CMD_THROTTLE_DELAY_DUMP: + ret = copy_from_user(&dump_param, (void *)arg, sizeof(struct diag_ioctl_dump_param)); + if (!throttle_delay_alloced) { + ret = -EINVAL; + } else if (!ret) { + dump_data(); + ret = copy_to_user_variant_buffer(&throttle_delay_variant_buffer, + dump_param.user_ptr_len, dump_param.user_buf, dump_param.user_buf_len); + diag_throttle_delay_id++; + record_dump_cmd("throttle-delay"); + } + break; + default: + ret = -ENOSYS; + break; + } + + return ret; +} + +int diag_throttle_delay_init(void) +{ + if (lookup_syms()) + return -EINVAL; + + init_diag_variant_buffer(&throttle_delay_variant_buffer, 4 * 1024 * 1024); + jump_init(); + + if (throttle_delay_settings.activated) + throttle_delay_settings.activated = __activate_throttle_delay(); + + return 0; + +} + +void diag_throttle_delay_exit(void) +{ + if (throttle_delay_settings.activated) + __deactivate_throttle_delay(); + throttle_delay_settings.activated = 0; + + destroy_diag_variant_buffer(&throttle_delay_variant_buffer); +} +#else +int diag_throttle_delay_init(void) +{ + return 0; +} + +void diag_throttle_delay_exit(void) +{ + +} +#endif diff --git a/SOURCE/uapi/ali_diagnose.h b/SOURCE/uapi/ali_diagnose.h index dc7f2d7..f0f06e5 100644 --- a/SOURCE/uapi/ali_diagnose.h +++ b/SOURCE/uapi/ali_diagnose.h @@ -99,8 +99,9 @@ extern 
unsigned long debug_mode;
 #define DIAG_IOCTL_TYPE_RW_SEM (DIAG_IOCTL_TYPE_TASK_MONITOR + 1)
 #define DIAG_IOCTL_TYPE_RSS_MONITOR (DIAG_IOCTL_TYPE_RW_SEM + 1)
 #define DIAG_IOCTL_TYPE_MEMCG_STATS (DIAG_IOCTL_TYPE_RSS_MONITOR + 1)
+#define DIAG_IOCTL_TYPE_THROTTLE_DELAY (DIAG_IOCTL_TYPE_MEMCG_STATS + 1)

-#define DIAG_IOCTL_TYPE_END (DIAG_IOCTL_TYPE_MEMCG_STATS + 1)
+#define DIAG_IOCTL_TYPE_END (DIAG_IOCTL_TYPE_THROTTLE_DELAY + 1)

 long diag_ioctl_sys_delay(unsigned int cmd, unsigned long arg);
 long diag_ioctl_sys_cost(unsigned int cmd, unsigned long arg);
@@ -343,6 +344,11 @@ struct diag_ioctl_dump_param_cycle {
 #define DIAG_BASE_SYSCALL_MEMCG_STATS \
	(DIAG_BASE_SYSCALL_PING_DELAY6 + DIAG_SYSCALL_INTERVAL)

+/// 1900
+#define DIAG_BASE_SYSCALL_THROTTLE_DELAY \
+	(DIAG_BASE_SYSCALL_PING_DELAY6 + DIAG_SYSCALL_INTERVAL)
+
+
 #define DIAG_SYSCALL_END (DIAG_BASE_SYSCALL + DIAG_SYSCALL_INTERVAL * 1000)

 enum diag_record_id {
@@ -511,6 +517,10 @@ enum diag_record_id {
 	et_memcg_stats_summary,
 	et_memcg_stats_detail,

+	et_throttle_delay_base = et_rss_monitor_base + DIAG_EVENT_TYPE_INTERVAL,
+	et_throttle_delay_dither,
+	et_throttle_delay_rq,
+
 	et_count
 };

diff --git a/SOURCE/uapi/throttle_delay.h b/SOURCE/uapi/throttle_delay.h
new file mode 100644
index 0000000..f304030
--- /dev/null
+++ b/SOURCE/uapi/throttle_delay.h
@@ -0,0 +1,62 @@
+/*
+ * Linux kernel diagnostic tool -- user-space API for throttle-delay
+ *
+ * Copyright (C) 2020 Alibaba Ltd.
+ *
+ * Author: Xiongwei Jiang <[email protected]>
+ *
+ * License terms: GNU General Public License (GPL) version 3
+ *
+ */
+
+#ifndef UAPI_THROTTLE_DELAY_H
+#define UAPI_THROTTLE_DELAY_H
+
+#include <linux/ioctl.h>
+
+int throttle_delay_syscall(struct pt_regs *regs, long id);
+
+#define DIAG_THROTTLE_DELAY_SET (DIAG_BASE_SYSCALL_THROTTLE_DELAY)
+#define DIAG_THROTTLE_DELAY_SETTINGS (DIAG_THROTTLE_DELAY_SET + 1)
+#define DIAG_THROTTLE_DELAY_DUMP (DIAG_THROTTLE_DELAY_SETTINGS + 1)
+
+struct diag_throttle_delay_settings {
+	unsigned int activated;
+	unsigned int verbose;
+	unsigned int tgid;
+	unsigned int pid;
+	unsigned int bvt;
+	char comm[TASK_COMM_LEN];
+	unsigned int threshold_ms;
+};
+
+struct throttle_delay_rq {
+	int et_type;
+	unsigned long id;
+	unsigned long seq;
+	struct diag_timespec tv;
+	int cpu;
+	int nr_running;
+};
+
+struct throttle_delay_dither {
+	int et_type;
+	unsigned long id;
+	unsigned long seq;
+	struct diag_timespec tv;
+	unsigned long delay_ms;
+	unsigned long now, dequeued;
+	struct diag_task_detail task;
+	struct diag_kern_stack_detail kern_stack;
+	struct diag_user_stack_detail user_stack;
+	struct diag_proc_chains_detail proc_chains;
+};
+
+#define CMD_THROTTLE_DELAY_SET (0)
+#define CMD_THROTTLE_DELAY_SETTINGS (CMD_THROTTLE_DELAY_SET + 1)
+#define CMD_THROTTLE_DELAY_DUMP (CMD_THROTTLE_DELAY_SETTINGS + 1)
+#define DIAG_IOCTL_THROTTLE_DELAY_SET _IOR(DIAG_IOCTL_TYPE_THROTTLE_DELAY, CMD_THROTTLE_DELAY_SET, struct diag_throttle_delay_settings)
+#define DIAG_IOCTL_THROTTLE_DELAY_SETTINGS _IOW(DIAG_IOCTL_TYPE_THROTTLE_DELAY, CMD_THROTTLE_DELAY_SETTINGS, struct diag_throttle_delay_settings)
+#define DIAG_IOCTL_THROTTLE_DELAY_DUMP _IOR(DIAG_IOCTL_TYPE_THROTTLE_DELAY, CMD_THROTTLE_DELAY_DUMP, struct diag_ioctl_dump_param)
+
+#endif /* UAPI_THROTTLE_DELAY_H */