| author | Wen Yang <[email protected]> | 2022-07-31 18:00:15 +0800 |
|---|---|---|
| committer | Wen Yang <[email protected]> | 2022-07-31 21:50:14 +0800 |
| commit | 8cd905a1c17f2201e460a2d607413a1303757a32 | |
| tree | e1a9eb8a5628786fa3db83e0bb1f62b456f74c54 | |
| parent | eebb3c8f25883e2402a24ddeb65b80d5701f2f55 | |
diagnose: support the pmu metrics
| -rw-r--r-- | SOURCE/diagnose-tools/internal.h | 3 |
| -rw-r--r-- | SOURCE/diagnose-tools/main.cc | 1 |
| -rw-r--r-- | SOURCE/diagnose-tools/pmu.cc | 451 |
| -rwxr-xr-x | SOURCE/module/Makefile | 1 |
| -rw-r--r-- | SOURCE/module/chr_dev.c | 5 |
| -rw-r--r-- | SOURCE/module/entry.c | 15 |
| -rwxr-xr-x | SOURCE/module/internal.h | 11 |
| -rw-r--r-- | SOURCE/module/pmu/debug.c | 158 |
| -rw-r--r-- | SOURCE/module/pmu/debug.h | 120 |
| -rw-r--r-- | SOURCE/module/pmu/entry.c | 558 |
| -rw-r--r-- | SOURCE/module/pmu/pmu.c | 601 |
| -rw-r--r-- | SOURCE/module/pmu/pmu.h | 225 |
| -rw-r--r-- | SOURCE/module/pub/cgroup.c | 8 |
| -rw-r--r-- | SOURCE/module/pub/cgroup.h | 3 |
| -rwxr-xr-x | SOURCE/module/stub.c | 7 |
| -rw-r--r-- | SOURCE/uapi/ali_diagnose.h | 8 |
| -rw-r--r-- | SOURCE/uapi/pmu.h | 67 |
17 files changed, 2237 insertions(+), 5 deletions(-)
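The diff below wires a new `pmu` subcommand into diagnose-tools. Control flows from userspace through an ioctl on the diagnose character device: the tool packs a `struct diag_pmu_settings` (defined in `SOURCE/uapi/pmu.h`) and issues `DIAG_IOCTL_PMU_SET`, then switches the feature on by name. A minimal sketch of that control path follows, assuming a hypothetical `/dev/diagnose` device node (the tool itself goes through its `diag_call_ioctl()` wrapper rather than opening the device directly):

```c
/*
 * Minimal sketch of the new PMU control path. /dev/diagnose is a
 * hypothetical node name; diagnose-tools uses diag_call_ioctl().
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

#include "uapi/pmu.h"

int main(void)
{
	struct diag_pmu_settings settings;
	int fd;

	fd = open("/dev/diagnose", O_RDWR);	/* hypothetical device node */
	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&settings, 0, sizeof(settings));
	settings.sample = 1;			/* sample the PMU registers */
	settings.conf_fixed_counters = 1;	/* instructions/cycles/ref-cycles */
	settings.conf_raw_pmu_event1 = 0x3f24;	/* raw eventsel+umask, in hex */

	if (ioctl(fd, DIAG_IOCTL_PMU_SET, &settings))
		perror("DIAG_IOCTL_PMU_SET");

	close(fd);
	return 0;
}
```

Setting parameters and enabling the feature are separate steps: after the ioctl succeeds, `do_activate()` still calls `diag_activate("pmu")`, which lands in the controller-file handler added to `SOURCE/module/entry.c`.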
diff --git a/SOURCE/diagnose-tools/internal.h b/SOURCE/diagnose-tools/internal.h index 3a1e879..4ec70ba 100644 --- a/SOURCE/diagnose-tools/internal.h +++ b/SOURCE/diagnose-tools/internal.h @@ -177,6 +177,9 @@ void usage_fs_cache(void); int high_order_main(int argc, char *argv[]); void usage_high_order(void); +int pmu_main(int argc, char **argv); +void usage_pmu(void); + int testcase_main(int argc, char *argv[]); struct timeval; diff --git a/SOURCE/diagnose-tools/main.cc b/SOURCE/diagnose-tools/main.cc index af30bbd..c276046 100644 --- a/SOURCE/diagnose-tools/main.cc +++ b/SOURCE/diagnose-tools/main.cc @@ -249,6 +249,7 @@ static struct diagnose_func all_funcs[] { {"task-monitor", task_monitor_main, 0}, {"rw-sem", rw_sem_main, 0}, {"rss-monitor", rss_monitor_main, 0}, + {"pmu", pmu_main, 0}, {"test", testcase_main, 0}, }; diff --git a/SOURCE/diagnose-tools/pmu.cc b/SOURCE/diagnose-tools/pmu.cc new file mode 100644 index 0000000..9773a2d --- /dev/null +++ b/SOURCE/diagnose-tools/pmu.cc @@ -0,0 +1,451 @@ +/* + * Linux内核诊断工具--用户态pmu功能实现 + * + * Copyright (C) 2020 Alibaba Ltd. + * + * 作者: Wen Yang <[email protected]> + * + * License terms: GNU General Public License (GPL) version 3 + * + */ +#include <assert.h> +#include <dirent.h> +#include <errno.h> +#include <getopt.h> +#include <sched.h> +#include <stdio.h> +#include <string.h> +#include <stdlib.h> +#include <stdarg.h> +#include <unistd.h> + +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <fstream> +#include <iostream> +#include <iomanip> +#include <sstream> + +#include "json/json.h" +#include "internal.h" +#include "params_parse.h" +#include "uapi/pmu.h" + +using namespace std; + +#define NSEC_PER_SEC 1000000000L + +#define PMC_INSTRUCTIONS "instructions" +#define PMC_CYCLES "cycles" +#define PMC_REF_CYCLES "ref-cycles" +#define PMC_FIXED_COUNTERS "fixed-counters" +#define PMC_BRANCH_MISSES "branch-misses" +#define PMC_LAST_CACHE_MISSES "llc-misses" +#define PMC_RAW_PMU_EVENT1 "raw-pmu-event1" +#define PMC_RAW_PMU_EVENT2 "raw-pmu-event2" + +extern unsigned long debug_mode; + +static unsigned long instructions_sum; +static unsigned long cycles_sum; +static unsigned long ref_cycles_sum; +static unsigned long branch_misses_sum; +static unsigned long last_cache_misses_sum; +static unsigned long raw_pmu_event1_sum; +static unsigned long raw_pmu_event2_sum; +static std::stringstream ss_cpu; + +void usage_pmu(void) +{ + printf(" pmu usage:\n"); + printf(" --activate\n"); + printf(" style: whether to use hrtimers to probe long-running processes\n"); + printf(" sample: whether to sample the PMU registers\n"); + printf(" %s: whether to enable the collection of the %s, default is true\n", + PMC_FIXED_COUNTERS, PMC_FIXED_COUNTERS); + printf(" %s: whether to enable the collection of the %s\n", + PMC_BRANCH_MISSES, PMC_BRANCH_MISSES); + printf(" %s: whether to enable the collection of the %s\n", + PMC_LAST_CACHE_MISSES, PMC_LAST_CACHE_MISSES); + printf(" %s: a raw PMU event (eventsel+umask) in the form of NNN where NNN is a hexadecimal event descriptor.\n", + PMC_RAW_PMU_EVENT1); + printf(" %s: a raw PMU event (eventsel+umask) in the form of NNN where NNN is a hexadecimal event descriptor.\n", + PMC_RAW_PMU_EVENT2); + printf(" --deactivate\n"); + printf(" --settings print settings.\n"); + printf(" --report dump log with text.\n"); + printf(" --record\n"); + printf(" sls=/tmp/1.json stored core events in the specified file\n"); + printf(" --sample stop sample if it is 0\n"); +} + +static void print_settings(struct 
diag_pmu_settings *settings, int is_activate_oper) +{ + printf("SETTINGS: \n"); + + if (!is_activate_oper) + printf(" ACTIVE: %s\n", settings->activated ? "Yes" : "No"); + + printf(" STYLE: %d\n", settings->style); + printf(" SAMPLE: %d\n", settings->sample); + printf(" FIXED-COUNTERS: %d\n", settings->conf_fixed_counters); + printf(" BRANCH-MISSES: %d\n", settings->conf_branch_misses); + printf(" LLC-MISSES: %d\n", settings->conf_last_cache_misses); + printf(" RAW-PMU-EVENT1: 0X%04X\n", settings->conf_raw_pmu_event1); + printf(" RAW-PMU-EVENT2: 0X%04X\n", settings->conf_raw_pmu_event2); +} + +static int do_activate(const char *arg) +{ + int ret = 0; + struct params_parser parse(arg); + struct diag_pmu_settings settings; + string str; + + memset(&settings, 0, sizeof(struct diag_pmu_settings)); + + settings.style = parse.int_value("style"); + settings.sample = parse.bool_value("sample"); + settings.conf_fixed_counters = (parse.string_value(PMC_FIXED_COUNTERS).length() + == 0) ? 1 : parse.bool_value(PMC_FIXED_COUNTERS); + settings.conf_branch_misses = parse.bool_value(PMC_BRANCH_MISSES); + settings.conf_last_cache_misses = parse.bool_value(PMC_LAST_CACHE_MISSES); + settings.conf_raw_pmu_event1 = parse.int_value(PMC_RAW_PMU_EVENT1, 16); + settings.conf_raw_pmu_event2 = parse.int_value(PMC_RAW_PMU_EVENT2, 16); + + ret = diag_call_ioctl(DIAG_IOCTL_PMU_SET, (long)&settings); + printf("Operation %s, return: %d\n", ret ? "failed" : "successful", ret); + print_settings(&settings, 1); + if (ret) + return ret; + + ret = diag_activate("pmu"); + if (ret == 1) { + printf("pmu activated\n"); + ret = 0; + } else { + printf("pmu is not activated, ret %d\n", ret); + } + + return ret; +} + +static int do_deactivate(void) +{ + int ret = 0; + + ret = diag_deactivate("pmu"); + if (ret == 0) { + printf("pmu is not activated\n"); + } else { + printf("deactivate pmu fail, ret is %d\n", ret); + } + return ret; +} + +static int do_settings(const char *arg) +{ + struct diag_pmu_settings settings; + int ret; + + memset(&settings, 0, sizeof(struct diag_pmu_settings)); + ret = diag_call_ioctl(DIAG_IOCTL_PMU_SETTINGS, (long)&settings); + if (ret == 0) { + print_settings(&settings, 0); + } else { + printf("The operation failed!\n"); + printf("Make sure that the diagnose tool is installed correctly.\n"); + } + return ret; +} + +static inline char *ns_to_timespec_str(long long nsec) +{ + static char str[128] = {0}; + struct timespec ts; + + if (!nsec) { + ts.tv_sec = 0; + ts.tv_nsec = 0; + } else { + ts.tv_sec = nsec / NSEC_PER_SEC; + ts.tv_nsec = nsec % NSEC_PER_SEC; + } + + snprintf(str, sizeof(str)-1, "%ld.%ld", ts.tv_sec, ts.tv_nsec); + return str; +} + +static int pmu_extract(void *buf, unsigned int len, void *unused) +{ + int *et_type; + + if (len == 0) + return 0; + + et_type = (int *)buf; + switch (*et_type) { + case et_pmu_detail: + struct diag_pmu_detail *detail; + + if (len < sizeof(struct diag_pmu_detail)) + break; + + detail = (struct diag_pmu_detail *)buf; + if (!detail) + break; + + instructions_sum += detail->instructions; + cycles_sum += detail->cycles; + ref_cycles_sum += detail->ref_cycles; + branch_misses_sum += detail->branch_misses; + last_cache_misses_sum += detail->last_cache_misses; + raw_pmu_event1_sum += detail->raw_pmu_event1; + raw_pmu_event2_sum += detail->raw_pmu_event2; + + ss_cpu << detail->cgrp_buf << "; " + << detail->cpu << "; " + << detail->instructions << "; " + << detail->cycles << "; " + << detail->ref_cycles << "; " + << detail->branch_misses << "; " + << detail->last_cache_misses << 
"; " + << detail->raw_pmu_event1 << "; " + << detail->raw_pmu_event2 << endl; + break; + + default: + break; + } + + return 0; +} + +static void print_columns_core_events() +{ + printf("cgroup; cpu; instructions; cycles; ref-cycles; branch_misses; " + "last_cache_misses; raw_pmu_event1; raw_pmu_event2\n"); + printf("-----------------------------------------------------------" + "-------------------------------------------------\n"); +} + +static void do_extract(char *buf, int len) +{ + struct timespec ts; + + instructions_sum = 0; + cycles_sum = 0; + ref_cycles_sum = 0; + branch_misses_sum = 0; + last_cache_misses_sum = 0; + raw_pmu_event1_sum = 0; + raw_pmu_event2_sum = 0; + ss_cpu.str(""); + + extract_variant_buffer(buf, len, pmu_extract, NULL); + print_columns_core_events(); + printf("%s\n", ss_cpu.str().c_str()); + + clock_gettime(CLOCK_REALTIME, &ts); + printf("time: %lu.%lu, the core events are summarized as follows, " + "instructions: %lu, cycles: %lu, ref_cycles: %lu, " + "branch_misses: %lu, llc_misses: %lu, raw_pmu_event1: %lu, " + "raw_pmu_event2: %lu.\n", + ts.tv_sec, ts.tv_nsec, + instructions_sum, cycles_sum, ref_cycles_sum, + branch_misses_sum, last_cache_misses_sum, raw_pmu_event1_sum, + raw_pmu_event2_sum); + + printf("\n"); +} + +static int do_dump(void) +{ + static char variant_buf[DIAG_PMU_VARIANT_BUF_LEN]; + int len; + int ret = 0; + struct diag_ioctl_dump_param dump_param = { + .user_ptr_len = &len, + .user_buf_len = DIAG_PMU_VARIANT_BUF_LEN, + .user_buf = variant_buf, + }; + + memset(variant_buf, 0, DIAG_PMU_VARIANT_BUF_LEN); + ret = diag_call_ioctl(DIAG_IOCTL_PMU_DUMP, (long)&dump_param); + if (ret == 0) { + do_extract(variant_buf, len); + } + return ret; +} + +void write_json_file(const char *sls_file, Json::Value &root) +{ + ofstream os; + Json::StreamWriterBuilder builder; + builder.settings_["indentation"] = " "; + std::unique_ptr<Json::StreamWriter> writer(builder.newStreamWriter()); + writer->newline = false; + + if (*sls_file == '\0') { + return; + } + + os.open(sls_file, std::ios::out); + if (1 != os.is_open()) { + return; + } + + writer->write(root, &os); + os << endl; + + return; +} + +static int sls_extract(void *buf, unsigned int len, void *param) +{ + Json::Value *root = (Json::Value *)param; + struct diag_pmu_detail *detail; + Json::Value event; + static int i = 0; + int *et_type; + + if (len == 0) + return 0; + + et_type = (int *)buf; + switch (*et_type) { + case et_pmu_detail: + if (len < sizeof(struct diag_pmu_detail)) + break; + + detail = (struct diag_pmu_detail *)buf; + if (!detail) + break; + + event["cgroup"] = Json::Value(detail->cgrp_buf); + event["cpu"] = Json::Value(detail->cpu); + event[PMC_INSTRUCTIONS] = Json::Value(detail->instructions); + event[PMC_CYCLES] = Json::Value(detail->cycles); + event[PMC_REF_CYCLES] = Json::Value(detail->ref_cycles); + event[PMC_BRANCH_MISSES] = Json::Value(detail->branch_misses); + event[PMC_LAST_CACHE_MISSES] = Json::Value(detail->last_cache_misses); + event[PMC_RAW_PMU_EVENT1] = Json::Value(detail->raw_pmu_event1); + event[PMC_RAW_PMU_EVENT2] = Json::Value(detail->raw_pmu_event2); + (*root)[i++] = event; + break; + + default: + break; + } + + return 0; +} + +static int do_sls(char *arg) +{ + static char variant_buf[20 * 1024 * 1024]; + int ret, len; + struct diag_ioctl_dump_param dump_param = { + .user_ptr_len = &len, + .user_buf_len = 20 * 1024 * 1024, + .user_buf = variant_buf, + }; + + struct params_parser parse(arg); + string sls_file = parse.string_value("sls"); + string imc_file = 
parse.string_value("imc"); + if (!sls_file.length() && !imc_file.length()) + return -EINVAL; + + if (debug_mode) { + printf("sls=%s, imc=%s\n", sls_file.c_str(), imc_file.c_str()); + } + + ret = diag_call_ioctl(DIAG_IOCTL_PMU_DUMP, (long)&dump_param); + if (ret == 0 && len > 0) { +#if 0 + clock_gettime(CLOCK_REALTIME, &ts); + root["tv_sec"] = Json::Value(ts.tv_sec); + root["tv_nsec"] = Json::Value(ts.tv_nsec); +#endif + Json::Value root_core; + Json::Value root_imc; + + extract_variant_buffer(variant_buf, len, sls_extract, &root_core); + write_json_file(sls_file.c_str(), root_core); + write_json_file(imc_file.c_str(), root_imc); + } + + return ret; +} + +static int do_sample(char *arg) +{ + int ret; + unsigned int sample; + + ret = sscanf(arg, "%d", &sample); + if (ret < 1) + return -EINVAL; + + ret = diag_call_ioctl(DIAG_IOCTL_PMU_SAMPLE, (long)&sample); + printf("set sample for pmu: %d, ret is %d\n", sample, ret); + return ret; +} + +int pmu_main(int argc, char **argv) +{ + static struct option long_options[] = { + {"help", no_argument, 0, 0 }, + {"activate", optional_argument, 0, 0 }, + {"deactivate",no_argument, 0, 0 }, + {"settings", optional_argument, 0, 0 }, + {"report", no_argument, 0, 0 }, + {"record", required_argument, 0, 0 }, + {"sample", required_argument, 0, 0 }, + {0, 0, 0, 0 } + }; + int ret = -EINVAL; + int c; + + if (argc <= 1) { + usage_pmu(); + return -EINVAL; + } + while (1) { + int option_index = -1; + + c = getopt_long_only(argc, argv, "", long_options, &option_index); + if (c == -1) + break; + switch (option_index) { + case 0: + usage_pmu(); + break; + case 1: + ret = do_activate(optarg ? optarg : ""); + break; + case 2: + ret = do_deactivate(); + break; + case 3: + ret = do_settings(optarg ? optarg : ""); + break; + case 4: + ret = do_dump(); + break; + case 5: + ret = do_sls(optarg); + break; + case 6: + ret = do_sample(optarg); + break; + default: + usage_pmu(); + break; + } + } + + return ret; +} diff --git a/SOURCE/module/Makefile b/SOURCE/module/Makefile index c3dadc8..56b19e9 100755 --- a/SOURCE/module/Makefile +++ b/SOURCE/module/Makefile @@ -229,6 +229,7 @@ ifneq ($(KERNELRELEASE),) $(TARGET)-objs += fs/fs_entry.o fs/orphan.o fs/shm.o fs/rw_top.o $(TARGET)-objs += net/net_entry.o net/tcp_retrans.o net/drop_packet.o net/ping_delay.o net/ping_delay6.o \ net/net_bandwidth.o net/tcp_connect.o + $(TARGET)-objs += pmu/entry.o pmu/pmu.o pmu/debug.o ifeq ($(EXPERIENTIAL),1) $(TARGET)-objs += test/test.o diff --git a/SOURCE/module/chr_dev.c b/SOURCE/module/chr_dev.c index 5ba9771..b630df3 100644 --- a/SOURCE/module/chr_dev.c +++ b/SOURCE/module/chr_dev.c @@ -54,6 +54,8 @@ struct diag_dev { struct cdev cdev; }; +extern long diag_ioctl_pmu(unsigned int cmd, unsigned long arg); + static long diag_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { int ret = -EINVAL; @@ -187,6 +189,9 @@ static long diag_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case DIAG_IOCTL_TYPE_TCP_CONNECT: ret = diag_ioctl_tcp_connect(nr, arg); break; + case DIAG_IOCTL_TYPE_PMU: + ret = diag_ioctl_pmu(nr, arg); + break; default: break; } diff --git a/SOURCE/module/entry.c b/SOURCE/module/entry.c index 268077d..e592edd 100644 --- a/SOURCE/module/entry.c +++ b/SOURCE/module/entry.c @@ -183,6 +183,8 @@ static ssize_t controller_file_write(struct diag_trace_file *trace_file, activate_throttle_delay(); } else if (strcmp(func, "tcp-connect") == 0) { activate_tcp_connect(); + } else if (strcmp(func, "pmu") == 0) { + activate_pmu(); } up(&controller_sem); @@ -264,6 +266,8 @@ 
static ssize_t controller_file_write(struct diag_trace_file *trace_file, deactivate_throttle_delay(); } else if (strcmp(func, "tcp-connect") == 0) { deactivate_tcp_connect(); + } else if (strcmp(func, "pmu") == 0) { + deactivate_pmu(); } up(&controller_sem); @@ -564,6 +568,12 @@ static int __init diagnosis_init(void) goto out_fs; } + ret = diag_pmu_init(); + if (ret) { + pr_err("diag_pmu_init failed.\n"); + goto out_pmu; + } + ret = diag_xby_test_init(); if (ret) { pr_err("diag_xhy_test_init failed.\n"); @@ -587,6 +597,8 @@ static int __init diagnosis_init(void) out_dev: diag_xby_test_exit(); out_xby_test: + diag_pmu_exit(); +out_pmu: diag_fs_exit(); out_fs: diag_pupil_exit(); @@ -659,6 +671,9 @@ static void __exit diagnosis_exit(void) diag_fs_exit(); msleep(20); + diag_pmu_exit(); + msleep(20); + alidiagnose_symbols_exit(); msleep(20); diff --git a/SOURCE/module/internal.h b/SOURCE/module/internal.h index 801bcd6..af549fc 100755 --- a/SOURCE/module/internal.h +++ b/SOURCE/module/internal.h @@ -58,6 +58,7 @@ static inline void __percpu_counter_add(struct percpu_counter *fbc, #include "pub/variant_buffer.h" #include "pub/stack.h" #include "uapi/throttle_delay.h" +#include "uapi/pmu.h" /** * 手工替换函数相关的宏 */ @@ -478,6 +479,11 @@ struct diag_percpu_context { struct rss_monitor_detail rss_monitor_detail; struct rss_monitor_raw_stack_detail rss_monitor_raw_stack_detail; } rss_monitor; + + struct { + struct perf_event *events[PMU_INDEX_MAX]; + struct diag_pmu_detail detail; + } pmu; }; extern struct diag_percpu_context *diag_percpu_context[NR_CPUS]; @@ -890,6 +896,11 @@ int deactivate_memcg_stats(void); int diag_memcg_stats_init(void); void diag_memcg_stats_exit(void); +int activate_pmu(void); +int deactivate_pmu(void); +int diag_pmu_init(void); +void diag_pmu_exit(void); + int diag_dev_init(void); void diag_dev_cleanup(void); diff --git a/SOURCE/module/pmu/debug.c b/SOURCE/module/pmu/debug.c new file mode 100644 index 0000000..72f50fd --- /dev/null +++ b/SOURCE/module/pmu/debug.c @@ -0,0 +1,158 @@ +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include "uapi/pmu.h" + +#if defined(PMU_DEBUG) && PMU_DEBUG > 0 + +#define PMU_DEBUG_FILE "ali-linux/diag/diag_pmu_costs" + +DEFINE_PER_CPU(struct pmu_cost, diag_pmu_costs); + +static int diag_pmu_nr_cgroup = 0; + +void pmu_debug_init(void) +{ + struct pmu_cost *cost; + unsigned int cpu; + + for_each_online_cpu(cpu) { + cost = &per_cpu(diag_pmu_costs, cpu); + memset(cost, 0, sizeof(struct pmu_cost)); + } +} + +void pmu_debug_context_switch(cycles_t cycles_begin, + cycles_t cycles_mm_task_prev, + cycles_t cycles_mm_task_next, + cycles_t cycles_update_pmu_prev, + cycles_t cycles_update_pmu_next, + cycles_t cycles_end) + +{ + struct pmu_cost *cost; + + cost = this_cpu_ptr(&diag_pmu_costs); + cost->nr_switch +=1; + cost->cycles_switch += cycles_end - cycles_begin; + cost->cycles_find_record += cycles_mm_task_prev ? + cycles_mm_task_prev - cycles_begin : 0; + cost->cycles_update_record += cycles_update_pmu_prev ? + cycles_update_pmu_prev - cycles_mm_task_prev : 0; + cost->cycles_find_record += cycles_mm_task_next ? + (cycles_update_pmu_prev ? cycles_mm_task_next - cycles_update_pmu_prev : 0) : 0; + cost->cycles_update_record += cycles_mm_task_next ? 
+ cycles_end - cycles_mm_task_next : 0; +} + +void pmu_debug_cgroup_rmdir(cycles_t cycles_begin, + cycles_t cycles_dump, + cycles_t cycles_detach) +{ + struct pmu_cost *cost; + + cost = &per_cpu(diag_pmu_costs, smp_processor_id()); + cost->nr_exit += 1; + cost->cycles_exit += cycles_detach - cycles_begin; + cost->cycles_dump_record += cycles_dump - cycles_begin; + cost->cycles_detach_record += cycles_detach - cycles_dump; +} + +void pmu_debug_cgroup_mkdir(cycles_t cycles_begin, + cycles_t cycles_end) +{ + struct pmu_cost *cost; + + cost = this_cpu_ptr(&diag_pmu_costs); + cost->nr_fork += 1; + cost->cycles_fork += cycles_end - cycles_begin; + cost->cycles_attach_record += cycles_end - cycles_begin; +} + +void pmu_debug_in_timer(cycles_t cycles_begin, + cycles_t cycles_find_record, + cycles_t cycles_update_record, + cycles_t cycles_end) +{ + struct pmu_cost *cost; + + cost = this_cpu_ptr(&diag_pmu_costs); + cost->nr_timer += 1; + cost->cycles_timer += cycles_end - cycles_begin; + cost->cycles_find_record += cycles_find_record - cycles_begin; + cost->cycles_update_record += cycles_update_record - cycles_find_record; +} + +void pmu_debug_nr_cgroup_inc(void) +{ + diag_pmu_nr_cgroup += 1; +} + +void pmu_debug_nr_cgroup_dec(void) +{ + diag_pmu_nr_cgroup -= 1; +} + + +static int pmu_cost_show(struct seq_file *m, void *v) +{ + struct pmu_cost *cost; + int cpu; + + for_each_online_cpu(cpu) { + cost = &per_cpu(diag_pmu_costs, cpu); + seq_printf(m, "cpu[%d] nr_switch %llu ->cycles_switch %llu " + "nr_timer %llu ->cycles_timer %llu " + "nr_fork %llu ->cycles_fork %llu " + "nr_exit %llu ->cycles_exit %llu " + "| cycles_find_record %llu cycles_update_record %llu " + "cycles_attach_record %llu cycles_detach_record %llu\n", + cpu, cost->nr_switch, cost->cycles_switch, + cost->nr_timer, cost->cycles_timer, + cost->nr_fork, cost->cycles_fork, + cost->nr_exit, cost->cycles_exit, + cost->cycles_find_record, cost->cycles_update_record, + cost->cycles_attach_record, cost->cycles_detach_record); + } + + seq_printf(m, "-----------------------------\n"); + seq_printf(m, "nr_cgroups: %d\n", diag_pmu_nr_cgroup); + + return 0; +} + +static int pmu_cost_open(struct inode *inode, struct file *file) +{ + return single_open(file, pmu_cost_show, NULL); +} + +static const struct file_operations pmu_cost_fops = +{ + .owner = THIS_MODULE, + .open = pmu_cost_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +int pmu_debug_proc_create(void) +{ + struct proc_dir_entry *pe; + int ret = 0; + + pe = proc_create(PMU_DEBUG_FILE, S_IFREG | 0444, NULL, + &pmu_cost_fops); + + if (!pe) { + ret = -ENOMEM; + } + + return ret; +} + +void pmu_debug_proc_destroy(void) +{ + remove_proc_entry(PMU_DEBUG_FILE, NULL); + +} + +#endif diff --git a/SOURCE/module/pmu/debug.h b/SOURCE/module/pmu/debug.h new file mode 100644 index 0000000..30f6c0f --- /dev/null +++ b/SOURCE/module/pmu/debug.h @@ -0,0 +1,120 @@ +#ifndef APROF_PMU_DEBUG_H +#define APROF_PMU_DEBUG_H + +#if defined(PMU_DEBUG) && PMU_DEBUG > 0 + + +/** + * 调试PMU相关模块的性能 + */ +struct pmu_cost { + unsigned long long nr_switch; + unsigned long long nr_fork; + unsigned long long nr_exit; + unsigned long long nr_timer; + unsigned long long cycles_switch; + unsigned long long cycles_fork; + unsigned long long cycles_exit; + unsigned long long cycles_timer; + unsigned long long cycles_find_record; + unsigned long long cycles_init_record; + unsigned long long cycles_update_record; + unsigned long long cycles_dump_record; + unsigned long long cycles_attach_record; + 
unsigned long long cycles_detach_record; +}; + +DECLARE_PER_CPU(struct pmu_cost, diag_pmu_costs); + +extern void pmu_debug_init(void); +extern void pmu_debug_context_switch(cycles_t cycles_begin, + cycles_t cycles_mm_task_prev, + cycles_t cycles_mm_task_next, + cycles_t cycles_update_pmu_prev, + cycles_t cycles_update_pmu_next, + cycles_t cycles_end); +extern void pmu_debug_cgroup_rmdir(cycles_t cycles_begin, + cycles_t cycles_dump, + cycles_t cycles_detach); +extern void pmu_debug_cgroup_mkdir(cycles_t cycles_begin, + cycles_t cycles_end); +extern void pmu_debug_in_timer(cycles_t cycles_begin, + cycles_t cycles_find_record, + cycles_t cycles_update_record, + cycles_t cycles_end); +#define pmu_debug_get_cycles(v) \ + do { \ + v = get_cycles(); \ + } while (0) + +extern int pmu_debug_proc_create(void); +extern void pmu_debug_proc_destroy(void); + +extern void pmu_debug_nr_cgroup_inc(void); +extern void pmu_debug_nr_cgroup_dec(void); + +#else +static inline void pmu_debug_init(void) +{ + // +} + +#define pmu_debug_get_cycles(v) \ + do { \ + } while (0) + +static inline void pmu_debug_context_switch(cycles_t cycles_begin, + cycles_t cycles_mm_task_prev, + cycles_t cycles_mm_task_next, + cycles_t cycles_update_pmu_prev, + cycles_t cycles_update_pmu_next, + cycles_t cycles_end) +{ + // +} + +static inline void pmu_debug_cgroup_rmdir(cycles_t cycles_begin, + cycles_t cycles_dump, + cycles_t cycles_detach) +{ + // +} + +static inline void pmu_debug_cgroup_mkdir(cycles_t cycles_begin, + cycles_t cycles_end) +{ + // +} + +static inline void pmu_debug_in_timer(cycles_t cycles_begin, + cycles_t cycles_find_record, + cycles_t cycles_update_record, + cycles_t cycles_end) +{ + // +} + +static inline int pmu_debug_proc_create(void) +{ + return 0; +} + +static inline void pmu_debug_proc_destroy(void) +{ + // +} + +static inline void pmu_debug_nr_cgroup_inc(void) +{ + // +} + +static inline void pmu_debug_nr_cgroup_dec(void) +{ + // +} + +#endif /* APROF_DEBUG */ + +#endif + diff --git a/SOURCE/module/pmu/entry.c b/SOURCE/module/pmu/entry.c new file mode 100644 index 0000000..d4bbd51 --- /dev/null +++ b/SOURCE/module/pmu/entry.c @@ -0,0 +1,558 @@ +/* + * Linux内核诊断工具--内核态pmu功能 + * + * Copyright (C) 2020 Alibaba Ltd. 
+ * + * 作者: Wen Yang <[email protected]> + * + * License terms: GNU General Public License (GPL) version 3 + * + */ + +#include <linux/version.h> +#include <linux/delay.h> +#include <linux/kallsyms.h> +#include <linux/module.h> + +#include "internal.h" +#include "pub/cgroup.h" +#include "pub/trace_point.h" +#include "pub/variant_buffer.h" + +#include "uapi/pmu.h" +#include "pmu/pmu.h" +#include "pmu/debug.h" +#include "pub/mem_pool.h" +#include "pub/kprobe.h" + +#if !defined(ALIOS_7U) && !defined(UBUNTU) +long diag_ioctl_pmu(unsigned int cmd, unsigned long arg) +{ + return -ENOSYS; +} + +int activate_pmu(void) +{ + return 0; +} + +int deactivate_pmu(void) +{ + return 0; +} + +int diag_pmu_init(void) +{ + return 0; +} + +int diag_pmu_exit(void) +{ + return 0; +} + +#else + +extern struct diag_variant_buffer pmu_variant_buffer; +extern struct diag_pmu_settings pmu_settings; +extern struct ali_mem_pool mem_pool; + +#define CGROUP_BUFFER_MAX_COUNT 5000 +#define CGROUP_BUFFER_INIT_COUNT 1000 +static unsigned long diag_pmu_buffer_curr = 0; +static unsigned long diag_pmu_buffer_grow = 0; + +enum { + DIAG_PMU_NOT_LOAD = 0, + DIAG_PMU_LOADING, + DIAG_PMU_LOADED, + DIAG_PMU_EXITING, + DIAG_PMU_EXITED, +}; + +static int diag_pmu_module_state = DIAG_PMU_NOT_LOAD; +static DEFINE_SEMAPHORE(diag_pmu_sem); + +__maybe_unused static struct kprobe kprobe_cgroup_destroy_locked; +__maybe_unused static struct kprobe kprobe_cgroup_populate_dir; + +#if KERNEL_VERSION(4, 9, 0) <= LINUX_VERSION_CODE +static void trace_sched_switch_hit(void *__data, bool preempt, + struct task_struct *prev, struct task_struct *next) +#elif KERNEL_VERSION(3, 10, 0) <= LINUX_VERSION_CODE +static void trace_sched_switch_hit(void *__data, + struct task_struct *prev, struct task_struct *next) +#else +static void trace_sched_switch_hit(struct rq *rq, struct task_struct *prev, + struct task_struct *next) +#endif +{ + struct pmu_percpu *record; + struct pmu_registers data = {0}; + + __maybe_unused cycles_t cycles_begin = 0; + __maybe_unused cycles_t cycles_mm_task_prev = 0; + __maybe_unused cycles_t cycles_mm_task_next = 0; + __maybe_unused cycles_t cycles_update_pmu_prev= 0; + __maybe_unused cycles_t cycles_update_pmu_next= 0; + __maybe_unused cycles_t cycles_end; + + if (!pmu_settings.activated || !pmu_settings.sample) + return; + + pmu_debug_get_cycles(cycles_begin); + pmu_read_core_registers(&data); + + record = pmu_find_record(prev); + if (record) { + pmu_debug_get_cycles(cycles_mm_task_prev); + if (record->flags == PMU_FLAG_SCHED_IN) { + pmu_acc_delta(record, &data); + } + record->flags = PMU_FLAG_SCHED_OUT; + pmu_debug_get_cycles(cycles_update_pmu_prev); + } + + record = pmu_find_record(next); + if (record) { + pmu_debug_get_cycles(cycles_mm_task_next); + pmu_refresh_counters(record, &data); + record->flags = PMU_FLAG_SCHED_IN; + pmu_debug_get_cycles(cycles_update_pmu_next); + } + + pmu_debug_get_cycles(cycles_end); + pmu_debug_context_switch(cycles_begin, + cycles_mm_task_prev, + cycles_mm_task_next, + cycles_update_pmu_prev, + cycles_update_pmu_next, + cycles_end); + return; +} + +static void pmu_cgroup_rmdir(struct cgroup *cgrp) +{ + struct diag_pmu_detail *detail; + struct pmu_cgroup *pmu_cgrp; + struct pmu_registers data; + struct pmu_percpu *record; + unsigned long flags; + int cpu; + __maybe_unused cycles_t cycles_begin = 0; + __maybe_unused cycles_t cycles_dump = 0; + __maybe_unused cycles_t cycles_detach = 0; + + if (!cgrp) + return; + + pmu_cgrp = pmu_find_cgroup(cgrp); + if (unlikely(!pmu_cgrp)) + return; + + 
get_online_cpus(); + preempt_disable(); + + pmu_debug_get_cycles(cycles_begin); + + pmu_cgrp = pmu_find_cgroup(cgrp); + if (!pmu_cgrp) { + put_online_cpus(); + return; + } + + for_each_online_cpu(cpu) { + record = (struct pmu_percpu*)&(pmu_cgrp->percpu_data[cpu]); + if (record) { + memset(&data, 0, sizeof(data)); + pmu_read_and_clear_record(&data, record); + if (!data.instructions && !data.cycles && !data.ref_cycles) + continue; + + detail = &get_percpu_context()->pmu.detail; + pmu_fill_core_detail(detail, &data, cpu); + + diag_variant_buffer_spin_lock(&pmu_variant_buffer, flags); + diag_variant_buffer_reserve(&pmu_variant_buffer, sizeof(*detail)); + diag_variant_buffer_write_nolock(&pmu_variant_buffer, detail, + sizeof(*detail)); + diag_variant_buffer_seal(&pmu_variant_buffer); + diag_variant_buffer_spin_unlock(&pmu_variant_buffer, flags); + } + } + put_online_cpus(); + + pmu_debug_get_cycles(cycles_dump); + + pmu_detach_cgroup(cgrp); + + pmu_debug_get_cycles(cycles_detach); + + pmu_debug_cgroup_rmdir(cycles_begin, cycles_dump, cycles_detach); + + preempt_enable(); +} + +#if KERNEL_VERSION(4, 19, 0) <= LINUX_VERSION_CODE +__maybe_unused static void trace_cgroup_rmdir_hit(void *__data, + struct cgroup *cgrp, const char *path) +{ + pmu_cgroup_rmdir(cgrp); +} +#elif KERNEL_VERSION(4, 9, 0) <= LINUX_VERSION_CODE +__maybe_unused static void trace_cgroup_rmdir_hit(void *__data, + struct cgroup *cgrp) +{ + pmu_cgroup_rmdir(cgrp); +} +#endif + +static void pmu_cgroup_mkdir(struct cgroup *cgrp) +{ + __maybe_unused cycles_t cycles_begin = 0; + __maybe_unused cycles_t cycles_end; + + if (unlikely(!cgrp)) + return; + +#if KERNEL_VERSION(4, 8, 0) < LINUX_VERSION_CODE + if (!(cgrp->subtree_ss_mask & (1 << cpuacct_cgrp_id))) + return; +#else + if (!cgrp->subsys[cpuacct_subsys_id]) + return; +#endif + + preempt_disable(); + + pmu_debug_get_cycles(cycles_begin); + pmu_attach_cgroup(cgrp); + pmu_debug_get_cycles(cycles_end); + + preempt_enable(); + + pmu_debug_cgroup_mkdir(cycles_begin, cycles_end); +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 19, 0) +__maybe_unused static void trace_cgroup_mkdir_hit(void *__data, + struct cgroup *cgrp, const char *path) +{ + pmu_cgroup_mkdir(cgrp); +} +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(4, 9, 0) +__maybe_unused static void trace_cgroup_mkdir_hit(void *__data, + struct cgroup *cgrp) +{ + pmu_cgroup_mkdir(cgrp); +} +#else + +__maybe_unused static int kprobe_cgroup_populate_dir_pre(struct kprobe *p, + struct pt_regs *regs) +{ + struct cgroup * cgrp = (void *)ORIG_PARAM1(regs); + + pmu_cgroup_mkdir(cgrp); + return 0; +} + +__maybe_unused static int kprobe_cgroup_destroy_locked_pre(struct kprobe *p, + struct pt_regs *regs) +{ + struct cgroup * cgrp = (void *)ORIG_PARAM1(regs); + + pmu_cgroup_rmdir(cgrp); + return 0; +} + +#endif + +static void pmu_unhook_cgroup_create(void) +{ +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 9, 0) + unhook_kprobe(&kprobe_cgroup_populate_dir); +#else + unhook_tracepoint("cgroup_mkdir", trace_cgroup_mkdir_hit, NULL); +#endif +} + +static void pmu_unhook_cgroup_destroy(void) +{ +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 9, 0) + unhook_kprobe(&kprobe_cgroup_destroy_locked); +#else + unhook_tracepoint("cgroup_rmdir", trace_cgroup_rmdir_hit, NULL); +#endif +} + +static int pmu_hook_cgroup_create(void) +{ + int ret; + +#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) + ret = hook_tracepoint("cgroup_mkdir", trace_cgroup_mkdir_hit, NULL); +#else + ret = hook_kprobe(&kprobe_cgroup_populate_dir, "cgroup_populate_dir", + 
kprobe_cgroup_populate_dir_pre, NULL); +#endif + + if (ret) + pr_err("pmu: failed to hook cgroup_mkdir, ret=%d\n", ret); + + return ret; +} + +static int pmu_hook_cgroup_destroy(void) +{ + int ret; + +#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) + ret = hook_tracepoint("cgroup_rmdir", trace_cgroup_rmdir_hit, NULL); +#else + ret = hook_kprobe(&kprobe_cgroup_destroy_locked, "cgroup_destroy_locked", + kprobe_cgroup_destroy_locked_pre, NULL); +#endif + + if (ret) + pr_err("pmu: failed to hook cgroup_rmdir, ret=%d\n", ret); + + return ret; +} + +static int __activate_pmu(void) +{ + int ret = 0; + + pmu_clean_data(); + pmu_attach_all_cgroups(); + + ret = pmu_create_all_events(); + if (ret) { + pr_err("pmu: failed to activate pmu, ret=%d\n", ret); + goto err_out; + } + + ret = pmu_hook_cgroup_create(); + if (ret) + goto err_detach; + + ret = pmu_hook_cgroup_destroy(); + if (ret) { + goto err_unhook_cgroup_mkdir; + } + + ret = hook_tracepoint("sched_switch", trace_sched_switch_hit, NULL); + if (ret) { + pr_err("pmu: failed to hook sched_switch, ret=%d\n", ret); + goto err_unhook_cgroup_rmdir; + } + + pmu_settings.activated = 1; + return 1; + +err_unhook_cgroup_rmdir: + pmu_unhook_cgroup_destroy(); + +err_unhook_cgroup_mkdir: + pmu_unhook_cgroup_create(); + +err_detach: + synchronize_sched(); + + pmu_detach_all_cgroups(); + pmu_destroy_all_events(); + +err_out: + return 0; +} + +int activate_pmu(void) +{ + int ret = 0; + + down(&diag_pmu_sem); + if (!pmu_settings.activated) + ret = __activate_pmu(); + up(&diag_pmu_sem); + + return ret; +} + +static int __deactivate_pmu(void) +{ + int ret = 0; + + unhook_tracepoint("sched_switch", trace_sched_switch_hit, NULL); + pmu_unhook_cgroup_create(); + pmu_unhook_cgroup_destroy(); + + synchronize_sched(); + msleep(10); + pmu_destroy_all_events(); + pmu_detach_all_cgroups(); + + return ret; +} + +int deactivate_pmu(void) +{ + int ret = 0; + + down(&diag_pmu_sem); + if (pmu_settings.activated) { + __deactivate_pmu(); + } else { + ret = -EAGAIN; + } + pmu_settings.activated = 0; + up(&diag_pmu_sem); + + return ret; +} + +long diag_ioctl_pmu(unsigned int cmd, unsigned long arg) +{ + int ret = -EINVAL; + int sample; + static struct diag_pmu_settings settings; + struct diag_ioctl_dump_param dump_param; + + switch (cmd) { + case CMD_PMU_SET: + down(&diag_pmu_sem); + if (pmu_settings.activated) { + ret = -EBUSY; + } else { + ret = copy_from_user(&settings, (void *)arg, sizeof(struct diag_pmu_settings)); + if (!ret) { + pmu_settings = settings; + pmu_settings.activated = 0; + } + } + up(&diag_pmu_sem); + + break; + case CMD_PMU_SETTINGS: + settings = pmu_settings; + ret = copy_to_user((void *)arg, &settings, sizeof(struct diag_pmu_settings)); + + break; + case CMD_PMU_DUMP: + ret = copy_from_user(&dump_param, (void *)arg, sizeof(struct diag_ioctl_dump_param)); + if (!ret) { + pmu_do_dump(); + ret = copy_to_user_variant_buffer(&pmu_variant_buffer, + dump_param.user_ptr_len, dump_param.user_buf, dump_param.user_buf_len); + } + + break; + case CMD_PMU_SAMPLE: + ret = copy_from_user(&sample, (void *)arg, sizeof(int)); + if (!ret) { + pmu_settings.sample = sample; + } + + break; + default: + break; + } + + return ret; +} + +static int diag_pmu_mem_pool_grow(unsigned int num) +{ + int ret; + + if (diag_pmu_buffer_curr + num > CGROUP_BUFFER_MAX_COUNT) + return -EINVAL; + + ret = ali_mem_pool_putin(&mem_pool, num); + if (ret) { + pr_err("pmu: grow mem_pool failed, ret=%d, num=%u\n", + ret, num); + return ret; + } + + diag_pmu_buffer_grow = num; + diag_pmu_buffer_curr += 
num; + + return 0; +} + +static int pmu_lookup_syms(void) +{ +#if defined(DIAG_ARM64) + LOOKUP_SYMS(armpmu_read); +#else + LOOKUP_SYMS(x86_perf_event_update); +#endif + + return 0; +} + +int diag_pmu_init(void) +{ + int ret; + + WRITE_ONCE(diag_pmu_module_state, DIAG_PMU_LOADING); + diag_pmu_pool_init(); + diag_pmu_radix_init(); + ret = init_diag_variant_buffer(&pmu_variant_buffer, DIAG_PMU_VARIANT_BUF_LEN); + if (ret) { + pr_err("pmu: init variant_buffer failed, ret=%d\n", ret); + return ret; + } + + ret = alloc_diag_variant_buffer(&pmu_variant_buffer); + if (ret) { + pr_err("pmu: alloc variant_buffer failed, ret=%d\n", ret); + goto out_destroy_variant_buffer; + } + + ret = diag_pmu_mem_pool_grow(CGROUP_BUFFER_INIT_COUNT); + if (ret) { + goto out_destroy_variant_buffer; + } + + if (pmu_lookup_syms()) { + ret = -EINVAL; + goto out_destroy_mem_pool; + } + + diag_pmu_init_wq(); + + ret = pmu_debug_proc_create(); + if (ret) { + goto out_destroy_mem_pool; + } + + WRITE_ONCE(diag_pmu_module_state, DIAG_PMU_LOADED); + return 0; + +out_destroy_mem_pool: + ali_mem_pool_destroy(&mem_pool); +out_destroy_variant_buffer: + destroy_diag_variant_buffer(&pmu_variant_buffer); + return ret; +} + +void diag_pmu_exit(void) +{ + down(&diag_pmu_sem); + WRITE_ONCE(diag_pmu_module_state, DIAG_PMU_EXITING); + pmu_debug_proc_destroy(); + + if (pmu_settings.activated) { + __deactivate_pmu(); + } + + destroy_diag_variant_buffer(&pmu_variant_buffer); + + ali_mem_pool_destroy(&mem_pool); + WRITE_ONCE(diag_pmu_module_state, DIAG_PMU_EXITED); + up(&diag_pmu_sem); +} + +#endif diff --git a/SOURCE/module/pmu/pmu.c b/SOURCE/module/pmu/pmu.c new file mode 100644 index 0000000..8495dcd --- /dev/null +++ b/SOURCE/module/pmu/pmu.c @@ -0,0 +1,601 @@ +/* + * Linux内核诊断工具--内核态pmu功能 + * + * Copyright (C) 2020 Alibaba Ltd. 
+ * + * 作者: Wen Yang <[email protected]> + * + * License terms: GNU General Public License (GPL) version 3 + * + */ + +#include <linux/version.h> +#include <linux/hrtimer.h> +#include <linux/kernel.h> +#include <linux/kallsyms.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/timex.h> +#include <linux/tracepoint.h> +#include <trace/events/irq.h> +#include <linux/proc_fs.h> +#include <linux/init.h> +#include <linux/sysctl.h> +#include <trace/events/napi.h> +#include <linux/rtc.h> +#include <linux/time.h> +#include <linux/radix-tree.h> +#include <linux/cpu.h> +#include <linux/syscalls.h> +#include <linux/vmalloc.h> +#include <asm/irq_regs.h> + +#include "uapi/pmu.h" +#include "pub/trace_file.h" +#include "pub/variant_buffer.h" +#include "pub/trace_point.h" +#include "pub/cgroup.h" +#include "pub/mem_pool.h" +#include "pmu/pmu.h" +#include "pmu/debug.h" + +atomic64_t pmu_nr_running = ATOMIC64_INIT(0); +struct diag_pmu_settings effective_pmu_settings = {0}; +struct diag_pmu_settings pmu_settings = {0}; + +struct ali_mem_pool mem_pool; +struct diag_variant_buffer pmu_variant_buffer; + +static DEFINE_SPINLOCK(tree_lock); +struct radix_tree_root pmu_cgroup_tree; + +DEFINE_PER_CPU(struct work_struct, dump_pmu_works); + +#if defined (APROF_ARM64) +void (*orig_armpmu_read)(struct perf_event *event) = NULL; +#else +u64 (*orig_x86_perf_event_update)(struct perf_event *event) = NULL; +#endif + +static struct perf_event_attr pmu_attrs[PMU_INDEX_MAX] = +{ + [PMU_INDEX_CYCLES] = { + .type = PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_CPU_CYCLES, + .size = sizeof(struct perf_event_attr), + .pinned = 1, + .disabled = 0, + }, + [PMU_INDEX_INSTRUCTIONS] = { + .type = PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_INSTRUCTIONS, + .size = sizeof(struct perf_event_attr), + .pinned = 1, + .disabled = 0, + }, + [PMU_INDEX_REF_CYCLES] = { + .type = PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_REF_CPU_CYCLES, + .size = sizeof(struct perf_event_attr), + .pinned = 1, + .disabled = 0, + }, + [PMU_INDEX_BRANCH_MISSES] = { + .type = PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_BRANCH_MISSES, + .size = sizeof(struct perf_event_attr), + .pinned = 1, + .disabled = 0, + }, + +#if defined (APROF_ARM64) + [PMU_INDEX_RAW_EVENT1] = { + .type = PERF_TYPE_RAW, + .config = 16395, + .size = sizeof(struct perf_event_attr), + .pinned = 1, + .disabled = 0, + }, +#else + [PMU_INDEX_LLC_MISSES] = { + .type = PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_CACHE_MISSES, + .size = sizeof(struct perf_event_attr), + .pinned = 1, + .disabled = 0, + }, +#endif + + [PMU_INDEX_RAW_EVENT1] = { + .type = PERF_TYPE_RAW, + .config = 0x151, + .size = sizeof(struct perf_event_attr), + .pinned = 0, + .disabled = 0, + }, + [PMU_INDEX_RAW_EVENT2] = { + .type = PERF_TYPE_RAW, + .config = 0x3f24, + .size = sizeof(struct perf_event_attr), + .pinned = 0, + .disabled = 0, + }, +}; + +void diag_pmu_radix_init(void) +{ + unsigned long flags; + + spin_lock_irqsave(&tree_lock, flags); + INIT_RADIX_TREE(&pmu_cgroup_tree, GFP_ATOMIC); + spin_unlock_irqrestore(&tree_lock, flags); +} + +void diag_pmu_pool_init(void) +{ + int size; + + size = sizeof(struct pmu_cgroup) + sizeof(struct pmu_percpu) * num_possible_cpus(); + ali_mem_pool_init(&mem_pool, size); +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,12,0) + +static int pmu_cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen) +{ + memset(buf, 0, buflen); + + if (orig_kernfs_name && cgrp) { + return orig_kernfs_name(cgrp->kn, buf, buflen); + } else { + return 0; 
+ } +} + +#else + +static int pmu_cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen) +{ + const char *name; + memset(buf, 0, buflen); + + if (cgrp) { + name = cgroup_name(cgrp); + strncpy(buf, name, buflen); + buf[buflen - 1] = 0; + + return strlen(buf); + } + + return 0; +} +#endif + +void pmu_attach_cgroup(struct cgroup *cgrp) +{ + unsigned long flags; + struct pmu_cgroup *info; + struct pmu_cgroup *tmp; + + info = radix_tree_lookup(&pmu_cgroup_tree, (unsigned long)cgrp); + if (info) + return; + + tmp = ali_mem_pool_alloc(&mem_pool); + if (tmp) { + tmp->cgrp = cgrp; + + spin_lock_irqsave(&tree_lock, flags); + + info = radix_tree_lookup(&pmu_cgroup_tree, (unsigned long)cgrp); + if (info) { + ali_mem_pool_free(&mem_pool, tmp); + } else { + radix_tree_insert(&pmu_cgroup_tree, (unsigned long)cgrp, tmp); + info = tmp; + pmu_debug_nr_cgroup_inc(); + } + + spin_unlock_irqrestore(&tree_lock, flags); + + pmu_cgroup_name(cgrp, info->cgrp_buf, CGROUP_NAME_LEN); + info->cgrp_buf[CGROUP_NAME_LEN - 1] = 0; + } +} + +void pmu_detach_cgroup(struct cgroup *cgrp) +{ + unsigned long flags; + struct pmu_cgroup *info; + + if (!cgrp) + return; + + spin_lock_irqsave(&tree_lock, flags); + + info = radix_tree_lookup(&pmu_cgroup_tree, (unsigned long)cgrp); + if (info) { + info->cgrp = NULL; + radix_tree_delete(&pmu_cgroup_tree, (unsigned long)cgrp); + pmu_debug_nr_cgroup_dec(); + } + + spin_unlock_irqrestore(&tree_lock, flags); + + if (info) { + ali_mem_pool_free(&mem_pool, info); + } +} + +static void pmu_release_perf_event(struct perf_event **event) +{ + if (event && *event) { + printk_ratelimited(KERN_DEBUG "pmu: release perf_event(type=%d," + "config=0x%llx) on cpu[%d]\n", (*event)->attr.type, + (*event)->attr.config, (*event)->cpu); + + perf_event_disable(*event); + perf_event_release_kernel(*event); + *event = NULL; + } +} + +static int pmu_destroy_counter(unsigned int cpu) +{ + struct diag_percpu_context *context = get_percpu_context_cpu(cpu); + int index; + + for (index = 0; index < PMU_INDEX_MAX; ++index) + pmu_release_perf_event(&context->pmu.events[index]); + + return 0; +} + +static struct perf_event *pmu_create_perf_event(struct perf_event_attr *attr, + int cpu) +{ + struct perf_event *event; + + event = perf_event_create_kernel_counter(attr, cpu, NULL, NULL, NULL); + if (IS_ERR(event)) { + printk_ratelimited(KERN_ERR "pmu: failed to create perf_event(type=%d," + "config=0x%llx) on cpu[%d], ret=%ld\n", attr->type, attr->config, + cpu, PTR_ERR(event)); + goto err_out; + } + + printk_ratelimited(KERN_DEBUG "pmu: create perf_event(%d/0x%llx) on cpu[%d]" + " successful, state=%d\n", attr->type, attr->config, cpu, event->state); + + perf_event_enable(event); + + return event; + +err_out: + return NULL; +} + +static int _pmu_create_counter(int conf, int replace_config, int cpu, + struct perf_event_attr *attr, struct perf_event **event) +{ + if (!conf || !event || *event) + return 0; + + if (replace_config) + attr->config = conf; + + *event = pmu_create_perf_event(attr, cpu); + return *event ? 
0 : -EAGAIN; +} + +#if defined(PMU_DEBUG) && PMU_DEBUG > 0 + #if defined(DIAG_ARM64) + #define APROF_FIXED_COUNTERS 2 + #else + #define APROF_FIXED_COUNTERS 3 + #endif +#else + #define APROF_FIXED_COUNTERS 2 +#endif + +static int pmu_create_core_events(int cpu) +{ + struct diag_percpu_context *context = get_percpu_context_cpu(cpu); + int index; + int ret; + + for (index = 0; index < APROF_FIXED_COUNTERS; ++index) { + ret = _pmu_create_counter(pmu_settings.conf_fixed_counters, 0, cpu, + &pmu_attrs[index], &context->pmu.events[index]); + if (ret) + goto err_out; + } + + ret = _pmu_create_counter(pmu_settings.conf_branch_misses, 0, cpu, + &pmu_attrs[PMU_INDEX_BRANCH_MISSES], + &context->pmu.events[PMU_INDEX_BRANCH_MISSES]); + if (ret) + goto err_out; + + ret = _pmu_create_counter(pmu_settings.conf_last_cache_misses, 0, cpu, + &pmu_attrs[PMU_INDEX_LLC_MISSES], + &context->pmu.events[PMU_INDEX_LLC_MISSES]); + if (ret) + goto err_out; + + ret = _pmu_create_counter(pmu_settings.conf_raw_pmu_event1, 1, cpu, + &pmu_attrs[PMU_INDEX_RAW_EVENT1], + &context->pmu.events[PMU_INDEX_RAW_EVENT1]); + if (ret) + goto err_out; + + ret = _pmu_create_counter(pmu_settings.conf_raw_pmu_event2, 1, cpu, + &pmu_attrs[PMU_INDEX_RAW_EVENT2], + &context->pmu.events[PMU_INDEX_RAW_EVENT2]); + if (ret) + goto err_out; + + return 0; + +err_out: + pmu_destroy_counter(cpu); + return ret; +} + +void pmu_destroy_all_events(void) +{ + unsigned int cpu; + + get_online_cpus(); + for_each_online_cpu(cpu) + pmu_destroy_counter(cpu); + put_online_cpus(); +} + +int pmu_create_all_events(void) +{ + int cpu; + int ret; + + get_online_cpus(); + + for_each_online_cpu(cpu) { + ret = pmu_create_core_events(cpu); + if (ret) { + put_online_cpus(); + goto err_out; + } + } + + put_online_cpus(); + + return 0; + +err_out: + pmu_destroy_all_events(); + return ret; +} + +struct cpuacct_impl { + struct cgroup_subsys_state css; + char internal[0]; +}; + +static struct cpuacct * cb_attach_cpuacct_cgrp(struct cpuacct *acct, void *data) +{ + struct cpuacct_impl *impl; + + if (acct) { + impl = (void *)acct; + pmu_attach_cgroup(impl->css.cgroup); + } + + return NULL; +} + +static struct cpuacct * cb_detach_cpuacct_cgrp(struct cpuacct *acct, void *data) +{ + struct cpuacct_impl *impl; + + if (acct) { + impl = (void *)acct; + pmu_detach_cgroup(impl->css.cgroup); + } + + return NULL; +} + +void pmu_attach_all_cgroups(void) +{ + cpuacct_cgroup_walk_tree(cb_attach_cpuacct_cgrp, NULL); +} + +void pmu_detach_all_cgroups(void) +{ + cpuacct_cgroup_walk_tree(cb_detach_cpuacct_cgrp, NULL); +} + +static void pmu_walk_pmu_cgroup_tree(void (*callback)(struct pmu_cgroup *)) +{ + struct pmu_cgroup *pmu_cgrps[NR_BATCH]; + struct pmu_cgroup *pmu_cgrp; + unsigned long pos = 0; + int nr_found; + int i; + + rcu_read_lock(); + + do { + nr_found = radix_tree_gang_lookup(&pmu_cgroup_tree, (void **)pmu_cgrps, pos, NR_BATCH); + + for (i = 0; i < nr_found; i++) { + pmu_cgrp = pmu_cgrps[i]; + callback(pmu_cgrp); + pos = (unsigned long)pmu_cgrp->cgrp + 1; + } + } while (nr_found > 0); + + rcu_read_unlock(); +} + +static void pmu_clean_percpu_data(struct pmu_cgroup *pmu_cgrp) +{ + if (!pmu_cgrp) + return; + + memset(&pmu_cgrp->percpu_data[0], 0, + sizeof(struct pmu_percpu) * num_possible_cpus()); +} + +void pmu_clean_data(void) +{ + pmu_debug_init(); + pmu_walk_pmu_cgroup_tree(pmu_clean_percpu_data); +} + +void pmu_read_and_clear_record(struct pmu_registers *data, + struct pmu_percpu *record) +{ + data->instructions = record->sum.instructions; + data->cycles = record->sum.cycles; + 
data->ref_cycles = record->sum.ref_cycles; + data->branch_misses = record->sum.branch_misses; + data->last_cache_misses = record->sum.last_cache_misses; + data->raw_pmu_event1 = record->sum.raw_pmu_event1; + data->raw_pmu_event2 = record->sum.raw_pmu_event2; + + record->sum.instructions = 0; + record->sum.cycles = 0; + record->sum.ref_cycles = 0; + record->sum.branch_misses = 0; + record->sum.last_cache_misses = 0; + record->sum.raw_pmu_event1 = 0; + record->sum.raw_pmu_event2 = 0; +} + +void pmu_fill_core_detail(struct diag_pmu_detail *detail, + const struct pmu_registers *data, int cpu) +{ + detail->et_type = et_pmu_detail; + detail->cpu = cpu; + detail->instructions = data->instructions; + detail->ref_cycles = data->ref_cycles; + detail->cycles = data->cycles; + detail->branch_misses = data->branch_misses; + detail->last_cache_misses = data->last_cache_misses; + detail->raw_pmu_event1 = data->raw_pmu_event1; + detail->raw_pmu_event2 = data->raw_pmu_event2; +} + +static void pmu_dump_local_core(struct pmu_cgroup *pmu_cgrp) +{ + struct pmu_registers data = {0}; + struct diag_pmu_detail *detail; + struct pmu_percpu *record; + unsigned long flags; + + if (!pmu_cgrp) + return; + + preempt_disable(); + record = (struct pmu_percpu*)&(pmu_cgrp->percpu_data[smp_processor_id()]); + pmu_read_and_clear_record(&data, record); + if (pmu_settings.conf_fixed_counters && !data.instructions && + !data.cycles && !data.ref_cycles) + goto out; + + detail = &get_percpu_context()->pmu.detail; + pmu_fill_core_detail(detail, &data, smp_processor_id()); + memcpy(&detail->cgrp_buf, &pmu_cgrp->cgrp_buf, CGROUP_NAME_LEN); + + diag_variant_buffer_spin_lock(&pmu_variant_buffer, flags); + diag_variant_buffer_reserve(&pmu_variant_buffer, sizeof(*detail)); + diag_variant_buffer_write_nolock(&pmu_variant_buffer, detail, sizeof(*detail)); + diag_variant_buffer_seal(&pmu_variant_buffer); + diag_variant_buffer_spin_unlock(&pmu_variant_buffer, flags); + +out: + preempt_enable(); + return; +} + +static void pmu_dump_local(struct work_struct *work) +{ + pmu_walk_pmu_cgroup_tree(pmu_dump_local_core); +} + +void diag_pmu_init_wq(void) +{ + int i; + struct work_struct *dump_work; + + for_each_possible_cpu(i) { + dump_work = per_cpu_ptr(&dump_pmu_works, i); + INIT_WORK(dump_work, pmu_dump_local); + } +} + +static void dump_pmu_all(void) +{ + unsigned int cpu; + + if (!pmu_settings.activated) + return; + + atomic64_inc_return(&pmu_nr_running); + get_online_cpus(); + + for_each_online_cpu(cpu) + queue_work_on(cpu, system_wq, per_cpu_ptr(&dump_pmu_works, cpu)); + + for_each_online_cpu(cpu) + flush_work(per_cpu_ptr(&dump_pmu_works, cpu)); + + put_online_cpus(); + atomic64_dec_return(&pmu_nr_running); +} + +void diag_pmu_timer(struct diag_percpu_context *context) +{ + struct pmu_percpu *record; + __maybe_unused cycles_t cycles_begin; + __maybe_unused cycles_t cycles_find_record; + __maybe_unused cycles_t cycles_update_record; + __maybe_unused cycles_t cycles_end; + + if (!pmu_settings.activated || + !pmu_settings.sample || + !pmu_settings.style) + return; + + pmu_debug_get_cycles(cycles_begin); + + record = pmu_find_record(current); + if (record) { + struct pmu_registers data = {0}; + + pmu_debug_get_cycles(cycles_find_record); + + pmu_read_core_registers(&data); + if (record->flags == PMU_FLAG_SCHED_IN) { + pmu_acc_delta(record, &data); + } + + pmu_debug_get_cycles(cycles_update_record); + pmu_debug_get_cycles(cycles_end); + pmu_debug_in_timer(cycles_begin, cycles_find_record, cycles_update_record, cycles_end); + } +} + +void 
pmu_do_dump(void) +{ + static DEFINE_MUTEX(mutex); + + if (!pmu_settings.activated || !pmu_settings.sample) + return; + + mutex_lock(&mutex); + dump_pmu_all(); + mutex_unlock(&mutex); +} diff --git a/SOURCE/module/pmu/pmu.h b/SOURCE/module/pmu/pmu.h new file mode 100644 index 0000000..70f0afd --- /dev/null +++ b/SOURCE/module/pmu/pmu.h @@ -0,0 +1,225 @@ +#ifndef APROF_PMU_H +#define APROF_PMU_H + +#include <linux/list.h> +#include <linux/cache.h> +#include "internal.h" + +#define PMU_FLAG_UNKNOWN 0 +#define PMU_FLAG_SCHED_IN 1 +#define PMU_FLAG_SCHED_OUT 2 + + +struct pmu_registers { + unsigned long instructions; + unsigned long cycles; + unsigned long ref_cycles; + unsigned long branch_misses; + unsigned long last_cache_misses; + unsigned long raw_pmu_event1; + unsigned long raw_pmu_event2; +}; + +struct pmu_percpu { + struct pmu_registers last; + struct pmu_registers sum; + int flags; + unsigned long __pad __attribute__ ((aligned (32))); +}; + +struct pmu_cgroup { + struct cgroup *cgrp; + unsigned long cpu_count; + char cgrp_buf[CGROUP_NAME_LEN]; + struct pmu_percpu percpu_data[0] __attribute__ ((aligned (64))); +}; + +void pmu_read_and_clear_record(struct pmu_registers *data, + struct pmu_percpu *record); + +void diag_pmu_pool_init(void); +void diag_pmu_radix_init(void); +void diag_pmu_init_wq(void); + +void pmu_do_dump(void); + +int pmu_create_all_events(void); +void pmu_destroy_all_events(void); + +void pmu_clean_data(void); +void pmu_attach_all_cgroups(void); +void pmu_detach_all_cgroups(void); + +void pmu_fill_core_detail(struct diag_pmu_detail *detail, + const struct pmu_registers *data, int cpu); + +void pmu_detach_cgroup(struct cgroup *tsk); +void pmu_attach_cgroup(struct cgroup *tsk); + +int pmu_cpuhp_register(void); +void pmu_cpuhp_unregister(void); + +struct perf_event; + +#if defined(APROF_ARM64) +extern void (*orig_armpmu_read)(struct perf_event *event); +#else +extern u64 (*orig_x86_perf_event_update)(struct perf_event *event); +#endif + +extern struct diag_pmu_settings pmu_settings; + +static inline void pmu_refresh_counters(struct pmu_percpu *record, + const struct pmu_registers *data) +{ + record->last.instructions = data->instructions; + record->last.cycles = data->cycles; + record->last.ref_cycles = data->ref_cycles; + record->last.branch_misses = data->branch_misses; + record->last.last_cache_misses = data->last_cache_misses; + record->last.raw_pmu_event1 = data->raw_pmu_event1; + record->last.raw_pmu_event2 = data->raw_pmu_event2; +} + +static inline void handle_delta(unsigned long curr, unsigned long* prev, + unsigned long *sum, char * prefix) +{ + signed long delta = curr - *prev; + + *prev = curr; + if (likely(delta > 0)) { + *sum += delta; + } +} + +static inline void pmu_acc_delta(struct pmu_percpu *record, + const struct pmu_registers *data) +{ + handle_delta(data->instructions, &record->last.instructions, + &record->sum.instructions, "instructions"); + handle_delta(data->cycles, &record->last.cycles, + &record->sum.cycles, "cycles"); + handle_delta(data->ref_cycles, &record->last.ref_cycles, + &record->sum.ref_cycles, "ref_cycles"); + handle_delta(data->branch_misses, &record->last.branch_misses, + &record->sum.branch_misses, "branch_misses"); + handle_delta(data->last_cache_misses, &record->last.last_cache_misses, + &record->sum.last_cache_misses, "last_chche_misses"); + handle_delta(data->raw_pmu_event1, &record->last.raw_pmu_event1, + &record->sum.raw_pmu_event1, "raw_pmu_event1"); + handle_delta(data->raw_pmu_event2, &record->last.raw_pmu_event2, + 
&record->sum.raw_pmu_event2, "raw_pmu_event2"); +} + +static inline unsigned long pmu_read_core_event(struct perf_event *event) +{ + unsigned long flags; + + if (!event) + return 0; + +#if defined (APROF_ARM64) + if (!orig_armpmu_read) + return 0; +#else + if (!orig_x86_perf_event_update) + return 0; +#endif + + if (event->state == PERF_EVENT_STATE_ACTIVE) { + local_irq_save(flags); +#if defined (APROF_ARM64) + orig_armpmu_read(event); +#else + orig_x86_perf_event_update(event); +#endif + local_irq_restore(flags); + } + + return local64_read(&event->count); +} + +static inline void pmu_read_core_registers(struct pmu_registers *data) +{ + struct diag_percpu_context *ctx = get_percpu_context(); + + if (unlikely(!ctx)) + return; + + if (pmu_settings.conf_fixed_counters) { + data->cycles = pmu_read_core_event(ctx->pmu.events[PMU_INDEX_CYCLES]); + data->instructions = pmu_read_core_event(ctx->pmu.events[PMU_INDEX_INSTRUCTIONS]); + data->ref_cycles = pmu_read_core_event(ctx->pmu.events[PMU_INDEX_REF_CYCLES]); + } + + if (pmu_settings.conf_branch_misses) + data->branch_misses = pmu_read_core_event(ctx->pmu.events[PMU_INDEX_BRANCH_MISSES]); + + if (pmu_settings.conf_last_cache_misses) { + data->last_cache_misses = pmu_read_core_event(ctx->pmu.events[PMU_INDEX_LLC_MISSES]); + } + + if (pmu_settings.conf_raw_pmu_event1) + data->raw_pmu_event1 = pmu_read_core_event(ctx->pmu.events[PMU_INDEX_RAW_EVENT1]); + + if (pmu_settings.conf_raw_pmu_event2) + data->raw_pmu_event2 = pmu_read_core_event(ctx->pmu.events[PMU_INDEX_RAW_EVENT2]); +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,12,0) +#define ACCT_CGRP_ID cpuacct_cgrp_id +#else +#define ACCT_CGRP_ID cpuacct_subsys_id +#endif + +struct radix_tree_root; +extern struct radix_tree_root pmu_cgroup_tree; + +static inline struct pmu_cgroup *pmu_pick_cgroup_by_task(struct task_struct *task) +{ + struct cgroup *cgrp = NULL; + struct pmu_cgroup *info = NULL; + + if (task && task->cgroups && + task->cgroups->subsys && + task->cgroups->subsys[ACCT_CGRP_ID] && + task->cgroups->subsys[ACCT_CGRP_ID]->cgroup) + cgrp = task->cgroups->subsys[ACCT_CGRP_ID]->cgroup; + else + goto out; + + rcu_read_lock(); + info = radix_tree_lookup(&pmu_cgroup_tree, (unsigned long)cgrp); + rcu_read_unlock(); + +out: + return info; +} + +static inline struct pmu_percpu *pmu_find_record(struct task_struct *task) +{ + struct pmu_cgroup *pmu_cgrp; + + pmu_cgrp = pmu_pick_cgroup_by_task(task); + if (!pmu_cgrp) + return NULL; + + return (struct pmu_percpu*)&(pmu_cgrp->percpu_data[smp_processor_id()]); +} + +static inline struct pmu_cgroup *pmu_find_cgroup(struct cgroup *cgrp) +{ + struct pmu_cgroup *info = NULL; + + if (!cgrp) + goto out; + + rcu_read_lock(); + info = radix_tree_lookup(&pmu_cgroup_tree, (unsigned long)cgrp); + rcu_read_unlock(); + +out: + return info; +} + +#endif /* APROF_PMU_H */ diff --git a/SOURCE/module/pub/cgroup.c b/SOURCE/module/pub/cgroup.c index 285ccc6..5114ea9 100644 --- a/SOURCE/module/pub/cgroup.c +++ b/SOURCE/module/pub/cgroup.c @@ -12,7 +12,7 @@ #include "pub/cgroup.h" #include "../symbol.h" -#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) || LINUX_VERSION_CODE > KERNEL_VERSION(4,10,0) +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) || LINUX_VERSION_CODE > KERNEL_VERSION(5,16,0) struct cgroup * cpuacct_to_cgroup(struct cpuacct *acct) { return NULL; @@ -34,8 +34,6 @@ void diag_cpuacct_cgroup_name_tsk(struct task_struct *tsk, char *buf, unsigned i #else -typedef struct cpuacct *(*match_cpuacct)(struct cpuacct *acct, void *data); - #if LINUX_VERSION_CODE >= 
KERNEL_VERSION(3,12,0) #define diag_css_for_each_descendant_pre(pos, css) \ @@ -98,7 +96,7 @@ struct cgroup *diag_cpuacct_cgroup_tsk(struct task_struct *tsk) return ret; } -static struct cpuacct * cpuacct_cgroup_walk_tree(match_cpuacct match_cpuacct, void *data) +struct cpuacct * cpuacct_cgroup_walk_tree(match_cpuacct match_cpuacct, void *data) { struct cpuacct *root = orig_root_cpuacct; struct cgroup_subsys_state *css; @@ -173,7 +171,7 @@ static inline int diag_cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen return 0; } -static struct cpuacct * cpuacct_cgroup_walk_tree(match_cpuacct match_cpuacct, void *data) +struct cpuacct * cpuacct_cgroup_walk_tree(match_cpuacct match_cpuacct, void *data) { struct cpuacct *root = orig_root_cpuacct; struct cgroup_subsys_state *css; diff --git a/SOURCE/module/pub/cgroup.h b/SOURCE/module/pub/cgroup.h index ab4f7fa..205db6b 100644 --- a/SOURCE/module/pub/cgroup.h +++ b/SOURCE/module/pub/cgroup.h @@ -21,5 +21,8 @@ struct cgroup *diag_cpuacct_cgroup_tsk(struct task_struct *tsk); void diag_cpuacct_cgroup_name_tsk(struct task_struct *tsk, char *buf, unsigned int count); struct cgroup * cpuacct_to_cgroup(struct cpuacct *acct); +typedef struct cpuacct *(*match_cpuacct)(struct cpuacct *acct, void *data); +struct cpuacct * cpuacct_cgroup_walk_tree(match_cpuacct match_cpuacct, void *data); + #endif /* __DIAG_PUB_CGROUP_H */ diff --git a/SOURCE/module/stub.c b/SOURCE/module/stub.c index 2104444..821954d 100755 --- a/SOURCE/module/stub.c +++ b/SOURCE/module/stub.c @@ -260,6 +260,13 @@ int __weak memcg_stats_syscall(struct pt_regs *regs, long id) return -ENOSYS; } +DIAG_WEAK_FUNC_INIT_EXIT(pmu) +DIAG_WEAK_FUNC_ACT_DEACT_IOCTL(pmu) +int __weak pmu_syscall(struct pt_regs *regs, long id) +{ + return -ENOSYS; +} + void __weak sys_loop_timer(struct diag_percpu_context *context) { // diff --git a/SOURCE/uapi/ali_diagnose.h b/SOURCE/uapi/ali_diagnose.h index d5c8cdb..d94e413 100644 --- a/SOURCE/uapi/ali_diagnose.h +++ b/SOURCE/uapi/ali_diagnose.h @@ -101,6 +101,7 @@ extern unsigned long debug_mode; #define DIAG_IOCTL_TYPE_MEMCG_STATS (DIAG_IOCTL_TYPE_RSS_MONITOR + 1) #define DIAG_IOCTL_TYPE_THROTTLE_DELAY (DIAG_IOCTL_TYPE_MEMCG_STATS + 1) #define DIAG_IOCTL_TYPE_TCP_CONNECT (DIAG_IOCTL_TYPE_THROTTLE_DELAY + 1) +#define DIAG_IOCTL_TYPE_PMU (DIAG_IOCTL_TYPE_TCP_CONNECT + 1) #define DIAG_IOCTL_TYPE_END (DIAG_IOCTL_TYPE_THROTTLE_DELAY + 1) @@ -355,6 +356,10 @@ struct diag_ioctl_dump_param_cycle { #define DIAG_BASE_SYSCALL_TCP_CONNECT \ (DIAG_BASE_SYSCALL_THROTTLE_DELAY + DIAG_SYSCALL_INTERVAL) +//2050 +#define DIAG_BASE_SYSCALL_PMU \ + (DIAG_BASE_SYSCALL_TCP_CONNECT + DIAG_SYSCALL_INTERVAL) + #define DIAG_SYSCALL_END (DIAG_BASE_SYSCALL + DIAG_SYSCALL_INTERVAL * 1000) enum diag_record_id { @@ -530,6 +535,9 @@ enum diag_record_id { et_tcp_connect_base = et_throttle_delay_base + DIAG_EVENT_TYPE_INTERVAL, et_tcp_connect_detail, + et_pmu_base = et_tcp_connect_base + DIAG_EVENT_TYPE_INTERVAL, + et_pmu_detail, + et_count }; diff --git a/SOURCE/uapi/pmu.h b/SOURCE/uapi/pmu.h new file mode 100644 index 0000000..46189c7 --- /dev/null +++ b/SOURCE/uapi/pmu.h @@ -0,0 +1,67 @@ +/* + * Linux内核诊断工具--用户接口API + * + * Copyright (C) 2020 Alibaba Ltd. 
+ * + * 作者: Wen Yang <[email protected]> + * + * License terms: GNU General Public License (GPL) version 3 + * + */ + +#ifndef UAPI_CGROUP_STAT_H +#define UAPI_CGROUP_STAT_H + +#include <linux/ioctl.h> +#include "ali_diagnose.h" + +#define DIAG_PMU_VARIANT_BUF_LEN (20 * 1024 * 1024) + +enum pmu_counters +{ + PMU_INDEX_CYCLES = 0, + PMU_INDEX_INSTRUCTIONS, + PMU_INDEX_REF_CYCLES, + PMU_INDEX_BRANCH_MISSES, + PMU_INDEX_LLC_MISSES, + PMU_INDEX_RAW_EVENT1, + PMU_INDEX_RAW_EVENT2, + PMU_INDEX_MAX, +}; + +struct diag_pmu_settings { + unsigned int activated; + unsigned int verbose; + unsigned int style; + unsigned int sample; + unsigned int conf_fixed_counters; + unsigned int conf_branch_misses; + unsigned int conf_last_cache_misses; + unsigned int conf_raw_pmu_event1; + unsigned int conf_raw_pmu_event2; +}; + +struct diag_pmu_detail { + int et_type; + int cpu; + char cgrp_buf[CGROUP_NAME_LEN]; + unsigned long instructions; + unsigned long cycles; + unsigned long ref_cycles; + unsigned long branch_misses; + unsigned long last_cache_misses; + unsigned long raw_pmu_event1; + unsigned long raw_pmu_event2; +}; + +#define CMD_PMU_SET (0) +#define CMD_PMU_SETTINGS (CMD_PMU_SET + 1) +#define CMD_PMU_DUMP (CMD_PMU_SETTINGS + 1) +#define CMD_PMU_ISOLATE (CMD_PMU_DUMP + 1) +#define CMD_PMU_SAMPLE (CMD_PMU_ISOLATE + 1) +#define DIAG_IOCTL_PMU_SET _IOWR(DIAG_IOCTL_TYPE_PMU, CMD_PMU_SET, struct diag_pmu_settings) +#define DIAG_IOCTL_PMU_SETTINGS _IOWR(DIAG_IOCTL_TYPE_PMU, CMD_PMU_SETTINGS, struct diag_pmu_settings) +#define DIAG_IOCTL_PMU_DUMP _IOWR(DIAG_IOCTL_TYPE_PMU, CMD_PMU_DUMP, struct diag_ioctl_dump_param) +#define DIAG_IOCTL_PMU_SAMPLE _IOWR(DIAG_IOCTL_TYPE_PMU, CMD_PMU_SAMPLE, int) + +#endif /* UAPI_CGROUP_STAT_H */ |
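The heart of the per-cgroup accounting added here is the snapshot/delta scheme in `handle_delta()`/`pmu_acc_delta()` (`SOURCE/module/pmu/pmu.h`): the perf counters are free-running, so at every context switch and timer tick the current reading is diffed against the previous snapshot and only positive deltas are folded into the cgroup's per-CPU sums. Because the subtraction happens in unsigned arithmetic, a 64-bit counter wrap still yields the correct positive delta, while a counter reset or reprogramming shows up as a negative value and is simply dropped. A self-contained demonstration, with sample values invented for illustration:

```c
/*
 * Standalone illustration of the delta-accumulation scheme used by
 * pmu_acc_delta(): diff each free-running counter reading against the
 * previous snapshot and accumulate only positive deltas.
 */
#include <stdio.h>

static void handle_delta(unsigned long curr, unsigned long *prev,
			 unsigned long *sum)
{
	signed long delta = curr - *prev;	/* unsigned wrap stays positive */

	*prev = curr;
	if (delta > 0)
		*sum += delta;
}

int main(void)
{
	/* invented readings; the 1700 -> 300 step models a counter reset */
	unsigned long samples[] = { 1000, 1500, 1700, 300, 900 };
	unsigned long prev = 0, sum = 0;
	size_t i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		handle_delta(samples[i], &prev, &sum);
		printf("sample=%lu sum=%lu\n", samples[i], sum);
	}

	/* the negative 1700 -> 300 delta is discarded, so sum ends at 2300 */
	return 0;
}
```

This is also why `pmu_refresh_counters()` only re-snapshots on sched-in without touching the sums: a task is charged solely for the counter growth observed while it was actually scheduled in.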
