author	Wen Yang <[email protected]>	2022-07-31 18:00:15 +0800
committer	Wen Yang <[email protected]>	2022-07-31 21:50:14 +0800
commit	8cd905a1c17f2201e460a2d607413a1303757a32 (patch)
tree	e1a9eb8a5628786fa3db83e0bb1f62b456f74c54
parent	eebb3c8f25883e2402a24ddeb65b80d5701f2f55 (diff)
diagnose: support the pmu metrics
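
Collect per-cgroup core PMU counters (instructions, cycles, ref-cycles,
branch misses, LLC misses and up to two raw events): the counters are
read on every sched_switch and the delta is charged to the outgoing
task's cpuacct cgroup, while cgroup mkdir/rmdir hooks attach and detach
the per-cgroup records. A new "pmu" command is added to diagnose-tools,
e.g.:

    diagnose-tools pmu --activate='sample=1'
    diagnose-tools pmu --report
    diagnose-tools pmu --deactivate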
-rw-r--r--SOURCE/diagnose-tools/internal.h3
-rw-r--r--SOURCE/diagnose-tools/main.cc1
-rw-r--r--SOURCE/diagnose-tools/pmu.cc451
-rwxr-xr-xSOURCE/module/Makefile1
-rw-r--r--SOURCE/module/chr_dev.c5
-rw-r--r--SOURCE/module/entry.c15
-rwxr-xr-xSOURCE/module/internal.h11
-rw-r--r--SOURCE/module/pmu/debug.c158
-rw-r--r--SOURCE/module/pmu/debug.h120
-rw-r--r--SOURCE/module/pmu/entry.c565
-rw-r--r--SOURCE/module/pmu/pmu.c615
-rw-r--r--SOURCE/module/pmu/pmu.h237
-rw-r--r--SOURCE/module/pub/cgroup.c8
-rw-r--r--SOURCE/module/pub/cgroup.h3
-rwxr-xr-xSOURCE/module/stub.c7
-rw-r--r--SOURCE/uapi/ali_diagnose.h10
-rw-r--r--SOURCE/uapi/pmu.h67
17 files changed, 2271 insertions, 6 deletions
diff --git a/SOURCE/diagnose-tools/internal.h b/SOURCE/diagnose-tools/internal.h
index 3a1e879..4ec70ba 100644
--- a/SOURCE/diagnose-tools/internal.h
+++ b/SOURCE/diagnose-tools/internal.h
@@ -177,6 +177,9 @@ void usage_fs_cache(void);
int high_order_main(int argc, char *argv[]);
void usage_high_order(void);
+int pmu_main(int argc, char **argv);
+void usage_pmu(void);
+
int testcase_main(int argc, char *argv[]);
struct timeval;
diff --git a/SOURCE/diagnose-tools/main.cc b/SOURCE/diagnose-tools/main.cc
index af30bbd..c276046 100644
--- a/SOURCE/diagnose-tools/main.cc
+++ b/SOURCE/diagnose-tools/main.cc
@@ -249,6 +249,7 @@ static struct diagnose_func all_funcs[] {
{"task-monitor", task_monitor_main, 0},
{"rw-sem", rw_sem_main, 0},
{"rss-monitor", rss_monitor_main, 0},
+ {"pmu", pmu_main, 0},
{"test", testcase_main, 0},
};
diff --git a/SOURCE/diagnose-tools/pmu.cc b/SOURCE/diagnose-tools/pmu.cc
new file mode 100644
index 0000000..9773a2d
--- /dev/null
+++ b/SOURCE/diagnose-tools/pmu.cc
@@ -0,0 +1,451 @@
+/*
+ * Linux kernel diagnostic tool -- user-space pmu implementation
+ *
+ * Copyright (C) 2020 Alibaba Ltd.
+ *
+ * Author: Wen Yang <[email protected]>
+ *
+ * License terms: GNU General Public License (GPL) version 3
+ *
+ */
+#include <assert.h>
+#include <dirent.h>
+#include <errno.h>
+#include <getopt.h>
+#include <sched.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <unistd.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <fstream>
+#include <iostream>
+#include <iomanip>
+#include <sstream>
+
+#include "json/json.h"
+#include "internal.h"
+#include "params_parse.h"
+#include "uapi/pmu.h"
+
+using namespace std;
+
+#define NSEC_PER_SEC 1000000000L
+
+#define PMC_INSTRUCTIONS "instructions"
+#define PMC_CYCLES "cycles"
+#define PMC_REF_CYCLES "ref-cycles"
+#define PMC_FIXED_COUNTERS "fixed-counters"
+#define PMC_BRANCH_MISSES "branch-misses"
+#define PMC_LAST_CACHE_MISSES "llc-misses"
+#define PMC_RAW_PMU_EVENT1 "raw-pmu-event1"
+#define PMC_RAW_PMU_EVENT2 "raw-pmu-event2"
+
+extern unsigned long debug_mode;
+
+static unsigned long instructions_sum;
+static unsigned long cycles_sum;
+static unsigned long ref_cycles_sum;
+static unsigned long branch_misses_sum;
+static unsigned long last_cache_misses_sum;
+static unsigned long raw_pmu_event1_sum;
+static unsigned long raw_pmu_event2_sum;
+static std::stringstream ss_cpu;
+
+void usage_pmu(void)
+{
+ printf(" pmu usage:\n");
+ printf(" --activate\n");
+ printf(" style: whether to use hrtimers to probe long-running processes\n");
+ printf(" sample: whether to sample the PMU registers\n");
+ printf(" %s: whether to enable the collection of the %s, default is true\n",
+ PMC_FIXED_COUNTERS, PMC_FIXED_COUNTERS);
+ printf(" %s: whether to enable the collection of the %s\n",
+ PMC_BRANCH_MISSES, PMC_BRANCH_MISSES);
+ printf(" %s: whether to enable the collection of the %s\n",
+ PMC_LAST_CACHE_MISSES, PMC_LAST_CACHE_MISSES);
+ printf(" %s: a raw PMU event (eventsel+umask) in the form of NNN where NNN is a hexadecimal event descriptor.\n",
+ PMC_RAW_PMU_EVENT1);
+ printf(" %s: a raw PMU event (eventsel+umask) in the form of NNN where NNN is a hexadecimal event descriptor.\n",
+ PMC_RAW_PMU_EVENT2);
+ printf(" --deactivate\n");
+ printf(" --settings print settings.\n");
+ printf(" --report dump log with text.\n");
+ printf(" --record\n");
+ printf(" sls=/tmp/1.json store core events in the specified file\n");
+ printf(" --sample stop sampling if set to 0\n");
+}
+
+static void print_settings(struct diag_pmu_settings *settings, int is_activate_oper)
+{
+ printf("SETTINGS: \n");
+
+ if (!is_activate_oper)
+ printf(" ACTIVE: %s\n", settings->activated ? "Yes" : "No");
+
+ printf(" STYLE: %d\n", settings->style);
+ printf(" SAMPLE: %d\n", settings->sample);
+ printf(" FIXED-COUNTERS: %d\n", settings->conf_fixed_counters);
+ printf(" BRANCH-MISSES: %d\n", settings->conf_branch_misses);
+ printf(" LLC-MISSES: %d\n", settings->conf_last_cache_misses);
+ printf(" RAW-PMU-EVENT1: 0x%04X\n", settings->conf_raw_pmu_event1);
+ printf(" RAW-PMU-EVENT2: 0x%04X\n", settings->conf_raw_pmu_event2);
+}
+
+static int do_activate(const char *arg)
+{
+ int ret = 0;
+ struct params_parser parse(arg);
+ struct diag_pmu_settings settings;
+ string str;
+
+ memset(&settings, 0, sizeof(struct diag_pmu_settings));
+
+ settings.style = parse.int_value("style");
+ settings.sample = parse.bool_value("sample");
+ settings.conf_fixed_counters = (parse.string_value(PMC_FIXED_COUNTERS).length()
+ == 0) ? 1 : parse.bool_value(PMC_FIXED_COUNTERS);
+ settings.conf_branch_misses = parse.bool_value(PMC_BRANCH_MISSES);
+ settings.conf_last_cache_misses = parse.bool_value(PMC_LAST_CACHE_MISSES);
+ settings.conf_raw_pmu_event1 = parse.int_value(PMC_RAW_PMU_EVENT1, 16);
+ settings.conf_raw_pmu_event2 = parse.int_value(PMC_RAW_PMU_EVENT2, 16);
+
+ ret = diag_call_ioctl(DIAG_IOCTL_PMU_SET, (long)&settings);
+ printf("Operation %s, return: %d\n", ret ? "failed" : "successful", ret);
+ print_settings(&settings, 1);
+ if (ret)
+ return ret;
+
+ ret = diag_activate("pmu");
+ if (ret == 1) {
+ printf("pmu activated\n");
+ ret = 0;
+ } else {
+ printf("pmu is not activated, ret %d\n", ret);
+ }
+
+ return ret;
+}
+
+static int do_deactivate(void)
+{
+ int ret = 0;
+
+ ret = diag_deactivate("pmu");
+ if (ret == 0) {
+ printf("pmu is not activated\n");
+ } else {
+ printf("deactivate pmu fail, ret is %d\n", ret);
+ }
+ return ret;
+}
+
+static int do_settings(const char *arg)
+{
+ struct diag_pmu_settings settings;
+ int ret;
+
+ memset(&settings, 0, sizeof(struct diag_pmu_settings));
+ ret = diag_call_ioctl(DIAG_IOCTL_PMU_SETTINGS, (long)&settings);
+ if (ret == 0) {
+ print_settings(&settings, 0);
+ } else {
+ printf("The operation failed!\n");
+ printf("Make sure that the diagnose tool is installed correctly.\n");
+ }
+ return ret;
+}
+
+static inline char *ns_to_timespec_str(long long nsec)
+{
+ static char str[128] = {0};
+ struct timespec ts;
+
+ if (!nsec) {
+ ts.tv_sec = 0;
+ ts.tv_nsec = 0;
+ } else {
+ ts.tv_sec = nsec / NSEC_PER_SEC;
+ ts.tv_nsec = nsec % NSEC_PER_SEC;
+ }
+
+ snprintf(str, sizeof(str)-1, "%ld.%09ld", ts.tv_sec, ts.tv_nsec);
+ return str;
+}
+
+static int pmu_extract(void *buf, unsigned int len, void *unused)
+{
+ int *et_type;
+
+ if (len == 0)
+ return 0;
+
+ et_type = (int *)buf;
+ switch (*et_type) {
+ case et_pmu_detail:
+ struct diag_pmu_detail *detail;
+
+ if (len < sizeof(struct diag_pmu_detail))
+ break;
+
+ detail = (struct diag_pmu_detail *)buf;
+ if (!detail)
+ break;
+
+ instructions_sum += detail->instructions;
+ cycles_sum += detail->cycles;
+ ref_cycles_sum += detail->ref_cycles;
+ branch_misses_sum += detail->branch_misses;
+ last_cache_misses_sum += detail->last_cache_misses;
+ raw_pmu_event1_sum += detail->raw_pmu_event1;
+ raw_pmu_event2_sum += detail->raw_pmu_event2;
+
+ ss_cpu << detail->cgrp_buf << "; "
+ << detail->cpu << "; "
+ << detail->instructions << "; "
+ << detail->cycles << "; "
+ << detail->ref_cycles << "; "
+ << detail->branch_misses << "; "
+ << detail->last_cache_misses << "; "
+ << detail->raw_pmu_event1 << "; "
+ << detail->raw_pmu_event2 << endl;
+ break;
+
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+static void print_columns_core_events()
+{
+ printf("cgroup; cpu; instructions; cycles; ref-cycles; branch_misses; "
+ "last_cache_misses; raw_pmu_event1; raw_pmu_event2\n");
+ printf("-----------------------------------------------------------"
+ "-------------------------------------------------\n");
+}
+
+static void do_extract(char *buf, int len)
+{
+ struct timespec ts;
+
+ instructions_sum = 0;
+ cycles_sum = 0;
+ ref_cycles_sum = 0;
+ branch_misses_sum = 0;
+ last_cache_misses_sum = 0;
+ raw_pmu_event1_sum = 0;
+ raw_pmu_event2_sum = 0;
+ ss_cpu.str("");
+
+ extract_variant_buffer(buf, len, pmu_extract, NULL);
+ print_columns_core_events();
+ printf("%s\n", ss_cpu.str().c_str());
+
+ clock_gettime(CLOCK_REALTIME, &ts);
+ printf("time: %lu.%09lu, the core events are summarized as follows, "
+ "instructions: %lu, cycles: %lu, ref_cycles: %lu, "
+ "branch_misses: %lu, llc_misses: %lu, raw_pmu_event1: %lu, "
+ "raw_pmu_event2: %lu.\n",
+ ts.tv_sec, ts.tv_nsec,
+ instructions_sum, cycles_sum, ref_cycles_sum,
+ branch_misses_sum, last_cache_misses_sum, raw_pmu_event1_sum,
+ raw_pmu_event2_sum);
+
+ printf("\n");
+}
+
+static int do_dump(void)
+{
+ static char variant_buf[DIAG_PMU_VARIANT_BUF_LEN];
+ int len;
+ int ret = 0;
+ struct diag_ioctl_dump_param dump_param = {
+ .user_ptr_len = &len,
+ .user_buf_len = DIAG_PMU_VARIANT_BUF_LEN,
+ .user_buf = variant_buf,
+ };
+
+ memset(variant_buf, 0, DIAG_PMU_VARIANT_BUF_LEN);
+ ret = diag_call_ioctl(DIAG_IOCTL_PMU_DUMP, (long)&dump_param);
+ if (ret == 0) {
+ do_extract(variant_buf, len);
+ }
+ return ret;
+}
+
+void write_json_file(const char *sls_file, Json::Value &root)
+{
+ ofstream os;
+ Json::StreamWriterBuilder builder;
+ builder.settings_["indentation"] = " ";
+ std::unique_ptr<Json::StreamWriter> writer(builder.newStreamWriter());
+ writer->newline = false;
+
+ if (*sls_file == '\0') {
+ return;
+ }
+
+ os.open(sls_file, std::ios::out);
+ if (!os.is_open()) {
+ return;
+ }
+
+ writer->write(root, &os);
+ os << endl;
+
+ return;
+}
+
+static int sls_extract(void *buf, unsigned int len, void *param)
+{
+ Json::Value *root = (Json::Value *)param;
+ struct diag_pmu_detail *detail;
+ Json::Value event;
+ static int i = 0;
+ int *et_type;
+
+ if (len == 0)
+ return 0;
+
+ et_type = (int *)buf;
+ switch (*et_type) {
+ case et_pmu_detail:
+ if (len < sizeof(struct diag_pmu_detail))
+ break;
+
+ detail = (struct diag_pmu_detail *)buf;
+ if (!detail)
+ break;
+
+ event["cgroup"] = Json::Value(detail->cgrp_buf);
+ event["cpu"] = Json::Value(detail->cpu);
+ event[PMC_INSTRUCTIONS] = Json::Value(detail->instructions);
+ event[PMC_CYCLES] = Json::Value(detail->cycles);
+ event[PMC_REF_CYCLES] = Json::Value(detail->ref_cycles);
+ event[PMC_BRANCH_MISSES] = Json::Value(detail->branch_misses);
+ event[PMC_LAST_CACHE_MISSES] = Json::Value(detail->last_cache_misses);
+ event[PMC_RAW_PMU_EVENT1] = Json::Value(detail->raw_pmu_event1);
+ event[PMC_RAW_PMU_EVENT2] = Json::Value(detail->raw_pmu_event2);
+ (*root)[i++] = event;
+ break;
+
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+static int do_sls(char *arg)
+{
+ static char variant_buf[DIAG_PMU_VARIANT_BUF_LEN];
+ int ret, len;
+ struct diag_ioctl_dump_param dump_param = {
+ .user_ptr_len = &len,
+ .user_buf_len = DIAG_PMU_VARIANT_BUF_LEN,
+ .user_buf = variant_buf,
+ };
+
+ struct params_parser parse(arg);
+ string sls_file = parse.string_value("sls");
+ string imc_file = parse.string_value("imc");
+ if (!sls_file.length() && !imc_file.length())
+ return -EINVAL;
+
+ if (debug_mode) {
+ printf("sls=%s, imc=%s\n", sls_file.c_str(), imc_file.c_str());
+ }
+
+ ret = diag_call_ioctl(DIAG_IOCTL_PMU_DUMP, (long)&dump_param);
+ if (ret == 0 && len > 0) {
+#if 0
+ clock_gettime(CLOCK_REALTIME, &ts);
+ root["tv_sec"] = Json::Value(ts.tv_sec);
+ root["tv_nsec"] = Json::Value(ts.tv_nsec);
+#endif
+ Json::Value root_core;
+ Json::Value root_imc;
+
+ extract_variant_buffer(variant_buf, len, sls_extract, &root_core);
+ write_json_file(sls_file.c_str(), root_core);
+ write_json_file(imc_file.c_str(), root_imc);
+ }
+
+ return ret;
+}
+
+static int do_sample(char *arg)
+{
+ int ret;
+ unsigned int sample;
+
+ ret = sscanf(arg, "%u", &sample);
+ if (ret < 1)
+ return -EINVAL;
+
+ ret = diag_call_ioctl(DIAG_IOCTL_PMU_SAMPLE, (long)&sample);
+ printf("set sample for pmu: %u, ret is %d\n", sample, ret);
+ return ret;
+}
+
+int pmu_main(int argc, char **argv)
+{
+ static struct option long_options[] = {
+ {"help", no_argument, 0, 0 },
+ {"activate", optional_argument, 0, 0 },
+ {"deactivate", no_argument, 0, 0 },
+ {"settings", optional_argument, 0, 0 },
+ {"report", no_argument, 0, 0 },
+ {"record", required_argument, 0, 0 },
+ {"sample", required_argument, 0, 0 },
+ {0, 0, 0, 0 }
+ };
+ int ret = -EINVAL;
+ int c;
+
+ if (argc <= 1) {
+ usage_pmu();
+ return -EINVAL;
+ }
+ while (1) {
+ int option_index = -1;
+
+ c = getopt_long_only(argc, argv, "", long_options, &option_index);
+ if (c == -1)
+ break;
+ switch (option_index) {
+ case 0:
+ usage_pmu();
+ break;
+ case 1:
+ ret = do_activate(optarg ? optarg : "");
+ break;
+ case 2:
+ ret = do_deactivate();
+ break;
+ case 3:
+ ret = do_settings(optarg ? optarg : "");
+ break;
+ case 4:
+ ret = do_dump();
+ break;
+ case 5:
+ ret = do_sls(optarg);
+ break;
+ case 6:
+ ret = do_sample(optarg);
+ break;
+ default:
+ usage_pmu();
+ break;
+ }
+ }
+
+ return ret;
+}
diff --git a/SOURCE/module/Makefile b/SOURCE/module/Makefile
index c3dadc8..56b19e9 100755
--- a/SOURCE/module/Makefile
+++ b/SOURCE/module/Makefile
@@ -229,6 +229,7 @@ ifneq ($(KERNELRELEASE),)
$(TARGET)-objs += fs/fs_entry.o fs/orphan.o fs/shm.o fs/rw_top.o
$(TARGET)-objs += net/net_entry.o net/tcp_retrans.o net/drop_packet.o net/ping_delay.o net/ping_delay6.o \
net/net_bandwidth.o net/tcp_connect.o
+ $(TARGET)-objs += pmu/entry.o pmu/pmu.o pmu/debug.o
ifeq ($(EXPERIENTIAL),1)
$(TARGET)-objs += test/test.o
diff --git a/SOURCE/module/chr_dev.c b/SOURCE/module/chr_dev.c
index 5ba9771..b630df3 100644
--- a/SOURCE/module/chr_dev.c
+++ b/SOURCE/module/chr_dev.c
@@ -54,6 +54,8 @@ struct diag_dev {
struct cdev cdev;
};
+extern long diag_ioctl_pmu(unsigned int cmd, unsigned long arg);
+
static long diag_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
int ret = -EINVAL;
@@ -187,6 +189,9 @@ static long diag_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
case DIAG_IOCTL_TYPE_TCP_CONNECT:
ret = diag_ioctl_tcp_connect(nr, arg);
break;
+ case DIAG_IOCTL_TYPE_PMU:
+ ret = diag_ioctl_pmu(nr, arg);
+ break;
default:
break;
}
diff --git a/SOURCE/module/entry.c b/SOURCE/module/entry.c
index 268077d..e592edd 100644
--- a/SOURCE/module/entry.c
+++ b/SOURCE/module/entry.c
@@ -183,6 +183,8 @@ static ssize_t controller_file_write(struct diag_trace_file *trace_file,
activate_throttle_delay();
} else if (strcmp(func, "tcp-connect") == 0) {
activate_tcp_connect();
+ } else if (strcmp(func, "pmu") == 0) {
+ activate_pmu();
}
up(&controller_sem);
@@ -264,6 +266,8 @@ static ssize_t controller_file_write(struct diag_trace_file *trace_file,
deactivate_throttle_delay();
} else if (strcmp(func, "tcp-connect") == 0) {
deactivate_tcp_connect();
+ } else if (strcmp(func, "pmu") == 0) {
+ deactivate_pmu();
}
up(&controller_sem);
@@ -564,6 +568,12 @@ static int __init diagnosis_init(void)
goto out_fs;
}
+ ret = diag_pmu_init();
+ if (ret) {
+ pr_err("diag_pmu_init failed.\n");
+ goto out_pmu;
+ }
+
ret = diag_xby_test_init();
if (ret) {
pr_err("diag_xby_test_init failed.\n");
@@ -587,6 +597,8 @@ static int __init diagnosis_init(void)
out_dev:
diag_xby_test_exit();
out_xby_test:
+ diag_pmu_exit();
+out_pmu:
diag_fs_exit();
out_fs:
diag_pupil_exit();
@@ -659,6 +671,9 @@ static void __exit diagnosis_exit(void)
diag_fs_exit();
msleep(20);
+ diag_pmu_exit();
+ msleep(20);
+
alidiagnose_symbols_exit();
msleep(20);
diff --git a/SOURCE/module/internal.h b/SOURCE/module/internal.h
index 801bcd6..af549fc 100755
--- a/SOURCE/module/internal.h
+++ b/SOURCE/module/internal.h
@@ -58,6 +58,7 @@ static inline void __percpu_counter_add(struct percpu_counter *fbc,
#include "pub/variant_buffer.h"
#include "pub/stack.h"
#include "uapi/throttle_delay.h"
+#include "uapi/pmu.h"
/**
* Macros for manual function replacement
*/
@@ -478,6 +479,11 @@ struct diag_percpu_context {
struct rss_monitor_detail rss_monitor_detail;
struct rss_monitor_raw_stack_detail rss_monitor_raw_stack_detail;
} rss_monitor;
+
+ struct {
+ struct perf_event *events[PMU_INDEX_MAX];
+ struct diag_pmu_detail detail;
+ } pmu;
};
extern struct diag_percpu_context *diag_percpu_context[NR_CPUS];
@@ -890,6 +896,11 @@ int deactivate_memcg_stats(void);
int diag_memcg_stats_init(void);
void diag_memcg_stats_exit(void);
+int activate_pmu(void);
+int deactivate_pmu(void);
+int diag_pmu_init(void);
+void diag_pmu_exit(void);
+
int diag_dev_init(void);
void diag_dev_cleanup(void);
diff --git a/SOURCE/module/pmu/debug.c b/SOURCE/module/pmu/debug.c
new file mode 100644
index 0000000..72f50fd
--- /dev/null
+++ b/SOURCE/module/pmu/debug.c
@@ -0,0 +1,158 @@
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include "uapi/pmu.h"
+
+#if defined(PMU_DEBUG) && PMU_DEBUG > 0
+
+#define PMU_DEBUG_FILE "ali-linux/diag/diag_pmu_costs"
+
+DEFINE_PER_CPU(struct pmu_cost, diag_pmu_costs);
+
+static int diag_pmu_nr_cgroup = 0;
+
+void pmu_debug_init(void)
+{
+ struct pmu_cost *cost;
+ unsigned int cpu;
+
+ for_each_online_cpu(cpu) {
+ cost = &per_cpu(diag_pmu_costs, cpu);
+ memset(cost, 0, sizeof(struct pmu_cost));
+ }
+}
+
+void pmu_debug_context_switch(cycles_t cycles_begin,
+ cycles_t cycles_mm_task_prev,
+ cycles_t cycles_mm_task_next,
+ cycles_t cycles_update_pmu_prev,
+ cycles_t cycles_update_pmu_next,
+ cycles_t cycles_end)
+
+{
+ struct pmu_cost *cost;
+
+ cost = this_cpu_ptr(&diag_pmu_costs);
+ cost->nr_switch +=1;
+ cost->cycles_switch += cycles_end - cycles_begin;
+ cost->cycles_find_record += cycles_mm_task_prev ?
+ cycles_mm_task_prev - cycles_begin : 0;
+ cost->cycles_update_record += cycles_update_pmu_prev ?
+ cycles_update_pmu_prev - cycles_mm_task_prev : 0;
+ cost->cycles_find_record += cycles_mm_task_next ?
+ (cycles_update_pmu_prev ? cycles_mm_task_next - cycles_update_pmu_prev : 0) : 0;
+ cost->cycles_update_record += cycles_mm_task_next ?
+ cycles_end - cycles_mm_task_next : 0;
+}
+
+void pmu_debug_cgroup_rmdir(cycles_t cycles_begin,
+ cycles_t cycles_dump,
+ cycles_t cycles_detach)
+{
+ struct pmu_cost *cost;
+
+ cost = &per_cpu(diag_pmu_costs, smp_processor_id());
+ cost->nr_exit += 1;
+ cost->cycles_exit += cycles_detach - cycles_begin;
+ cost->cycles_dump_record += cycles_dump - cycles_begin;
+ cost->cycles_detach_record += cycles_detach - cycles_dump;
+}
+
+void pmu_debug_cgroup_mkdir(cycles_t cycles_begin,
+ cycles_t cycles_end)
+{
+ struct pmu_cost *cost;
+
+ cost = this_cpu_ptr(&diag_pmu_costs);
+ cost->nr_fork += 1;
+ cost->cycles_fork += cycles_end - cycles_begin;
+ cost->cycles_attach_record += cycles_end - cycles_begin;
+}
+
+void pmu_debug_in_timer(cycles_t cycles_begin,
+ cycles_t cycles_find_record,
+ cycles_t cycles_update_record,
+ cycles_t cycles_end)
+{
+ struct pmu_cost *cost;
+
+ cost = this_cpu_ptr(&diag_pmu_costs);
+ cost->nr_timer += 1;
+ cost->cycles_timer += cycles_end - cycles_begin;
+ cost->cycles_find_record += cycles_find_record - cycles_begin;
+ cost->cycles_update_record += cycles_update_record - cycles_find_record;
+}
+
+void pmu_debug_nr_cgroup_inc(void)
+{
+ diag_pmu_nr_cgroup += 1;
+}
+
+void pmu_debug_nr_cgroup_dec(void)
+{
+ diag_pmu_nr_cgroup -= 1;
+}
+
+
+static int pmu_cost_show(struct seq_file *m, void *v)
+{
+ struct pmu_cost *cost;
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ cost = &per_cpu(diag_pmu_costs, cpu);
+ seq_printf(m, "cpu[%d] nr_switch %llu ->cycles_switch %llu "
+ "nr_timer %llu ->cycles_timer %llu "
+ "nr_fork %llu ->cycles_fork %llu "
+ "nr_exit %llu ->cycles_exit %llu "
+ "| cycles_find_record %llu cycles_update_record %llu "
+ "cycles_attach_record %llu cycles_detach_record %llu\n",
+ cpu, cost->nr_switch, cost->cycles_switch,
+ cost->nr_timer, cost->cycles_timer,
+ cost->nr_fork, cost->cycles_fork,
+ cost->nr_exit, cost->cycles_exit,
+ cost->cycles_find_record, cost->cycles_update_record,
+ cost->cycles_attach_record, cost->cycles_detach_record);
+ }
+
+ seq_printf(m, "-----------------------------\n");
+ seq_printf(m, "nr_cgroups: %d\n", diag_pmu_nr_cgroup);
+
+ return 0;
+}
+
+static int pmu_cost_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, pmu_cost_show, NULL);
+}
+
+static const struct file_operations pmu_cost_fops =
+{
+ .owner = THIS_MODULE,
+ .open = pmu_cost_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+int pmu_debug_proc_create(void)
+{
+ struct proc_dir_entry *pe;
+ int ret = 0;
+
+ pe = proc_create(PMU_DEBUG_FILE, S_IFREG | 0444, NULL,
+ &pmu_cost_fops);
+
+ if (!pe) {
+ ret = -ENOMEM;
+ }
+
+ return ret;
+}
+
+void pmu_debug_proc_destroy(void)
+{
+ remove_proc_entry(PMU_DEBUG_FILE, NULL);
+
+}
+
+#endif
diff --git a/SOURCE/module/pmu/debug.h b/SOURCE/module/pmu/debug.h
new file mode 100644
index 0000000..30f6c0f
--- /dev/null
+++ b/SOURCE/module/pmu/debug.h
@@ -0,0 +1,120 @@
+#ifndef APROF_PMU_DEBUG_H
+#define APROF_PMU_DEBUG_H
+
+#if defined(PMU_DEBUG) && PMU_DEBUG > 0
+
+
+/**
+ * Performance instrumentation for the PMU module
+ */
+struct pmu_cost {
+ unsigned long long nr_switch;
+ unsigned long long nr_fork;
+ unsigned long long nr_exit;
+ unsigned long long nr_timer;
+ unsigned long long cycles_switch;
+ unsigned long long cycles_fork;
+ unsigned long long cycles_exit;
+ unsigned long long cycles_timer;
+ unsigned long long cycles_find_record;
+ unsigned long long cycles_init_record;
+ unsigned long long cycles_update_record;
+ unsigned long long cycles_dump_record;
+ unsigned long long cycles_attach_record;
+ unsigned long long cycles_detach_record;
+};
+
+DECLARE_PER_CPU(struct pmu_cost, diag_pmu_costs);
+
+extern void pmu_debug_init(void);
+extern void pmu_debug_context_switch(cycles_t cycles_begin,
+ cycles_t cycles_mm_task_prev,
+ cycles_t cycles_mm_task_next,
+ cycles_t cycles_update_pmu_prev,
+ cycles_t cycles_update_pmu_next,
+ cycles_t cycles_end);
+extern void pmu_debug_cgroup_rmdir(cycles_t cycles_begin,
+ cycles_t cycles_dump,
+ cycles_t cycles_detach);
+extern void pmu_debug_cgroup_mkdir(cycles_t cycles_begin,
+ cycles_t cycles_end);
+extern void pmu_debug_in_timer(cycles_t cycles_begin,
+ cycles_t cycles_find_record,
+ cycles_t cycles_update_record,
+ cycles_t cycles_end);
+#define pmu_debug_get_cycles(v) \
+ do { \
+ v = get_cycles(); \
+ } while (0)
+
+extern int pmu_debug_proc_create(void);
+extern void pmu_debug_proc_destroy(void);
+
+extern void pmu_debug_nr_cgroup_inc(void);
+extern void pmu_debug_nr_cgroup_dec(void);
+
+#else
+static inline void pmu_debug_init(void)
+{
+ //
+}
+
+#define pmu_debug_get_cycles(v) \
+ do { \
+ } while (0)
+
+static inline void pmu_debug_context_switch(cycles_t cycles_begin,
+ cycles_t cycles_mm_task_prev,
+ cycles_t cycles_mm_task_next,
+ cycles_t cycles_update_pmu_prev,
+ cycles_t cycles_update_pmu_next,
+ cycles_t cycles_end)
+{
+ //
+}
+
+static inline void pmu_debug_cgroup_rmdir(cycles_t cycles_begin,
+ cycles_t cycles_dump,
+ cycles_t cycles_detach)
+{
+ //
+}
+
+static inline void pmu_debug_cgroup_mkdir(cycles_t cycles_begin,
+ cycles_t cycles_end)
+{
+ //
+}
+
+static inline void pmu_debug_in_timer(cycles_t cycles_begin,
+ cycles_t cycles_find_record,
+ cycles_t cycles_update_record,
+ cycles_t cycles_end)
+{
+ //
+}
+
+static inline int pmu_debug_proc_create(void)
+{
+ return 0;
+}
+
+static inline void pmu_debug_proc_destroy(void)
+{
+ //
+}
+
+static inline void pmu_debug_nr_cgroup_inc(void)
+{
+ //
+}
+
+static inline void pmu_debug_nr_cgroup_dec(void)
+{
+ //
+}
+
+#endif /* PMU_DEBUG */
+
+#endif
+
diff --git a/SOURCE/module/pmu/entry.c b/SOURCE/module/pmu/entry.c
new file mode 100644
index 0000000..d4bbd51
--- /dev/null
+++ b/SOURCE/module/pmu/entry.c
@@ -0,0 +1,565 @@
+/*
+ * Linux kernel diagnostic tool -- kernel-side pmu support
+ *
+ * Copyright (C) 2020 Alibaba Ltd.
+ *
+ * Author: Wen Yang <[email protected]>
+ *
+ * License terms: GNU General Public License (GPL) version 3
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/delay.h>
+#include <linux/kallsyms.h>
+#include <linux/module.h>
+
+#include "internal.h"
+#include "pub/cgroup.h"
+#include "pub/trace_point.h"
+#include "pub/variant_buffer.h"
+
+#include "uapi/pmu.h"
+#include "pmu/pmu.h"
+#include "pmu/debug.h"
+#include "pub/mem_pool.h"
+#include "pub/kprobe.h"
+
+#if !defined(ALIOS_7U) && !defined(UBUNTU)
+long diag_ioctl_pmu(unsigned int cmd, unsigned long arg)
+{
+ return -ENOSYS;
+}
+
+int activate_pmu(void)
+{
+ return 0;
+}
+
+int deactivate_pmu(void)
+{
+ return 0;
+}
+
+int diag_pmu_init(void)
+{
+ return 0;
+}
+
+void diag_pmu_exit(void)
+{
+ /* stub build: nothing to tear down */
+}
+
+#else
+
+extern struct diag_variant_buffer pmu_variant_buffer;
+extern struct diag_pmu_settings pmu_settings;
+extern struct ali_mem_pool mem_pool;
+
+#define CGROUP_BUFFER_MAX_COUNT 5000
+#define CGROUP_BUFFER_INIT_COUNT 1000
+static unsigned long diag_pmu_buffer_curr = 0;
+static unsigned long diag_pmu_buffer_grow = 0;
+
+enum {
+ DIAG_PMU_NOT_LOAD = 0,
+ DIAG_PMU_LOADING,
+ DIAG_PMU_LOADED,
+ DIAG_PMU_EXITING,
+ DIAG_PMU_EXITED,
+};
+
+static int diag_pmu_module_state = DIAG_PMU_NOT_LOAD;
+static DEFINE_SEMAPHORE(diag_pmu_sem);
+
+__maybe_unused static struct kprobe kprobe_cgroup_destroy_locked;
+__maybe_unused static struct kprobe kprobe_cgroup_populate_dir;
+
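+/*
+ * sched_switch hook: read the core PMU counters once per switch, charge
+ * the delta since the outgoing task's last update to its cgroup record,
+ * and store the current counter values as the incoming task's baseline.
+ */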
+#if KERNEL_VERSION(4, 9, 0) <= LINUX_VERSION_CODE
+static void trace_sched_switch_hit(void *__data, bool preempt,
+ struct task_struct *prev, struct task_struct *next)
+#elif KERNEL_VERSION(3, 10, 0) <= LINUX_VERSION_CODE
+static void trace_sched_switch_hit(void *__data,
+ struct task_struct *prev, struct task_struct *next)
+#else
+static void trace_sched_switch_hit(struct rq *rq, struct task_struct *prev,
+ struct task_struct *next)
+#endif
+{
+ struct pmu_percpu *record;
+ struct pmu_registers data = {0};
+
+ __maybe_unused cycles_t cycles_begin = 0;
+ __maybe_unused cycles_t cycles_mm_task_prev = 0;
+ __maybe_unused cycles_t cycles_mm_task_next = 0;
+ __maybe_unused cycles_t cycles_update_pmu_prev = 0;
+ __maybe_unused cycles_t cycles_update_pmu_next = 0;
+ __maybe_unused cycles_t cycles_end;
+
+ if (!pmu_settings.activated || !pmu_settings.sample)
+ return;
+
+ pmu_debug_get_cycles(cycles_begin);
+ pmu_read_core_registers(&data);
+
+ record = pmu_find_record(prev);
+ if (record) {
+ pmu_debug_get_cycles(cycles_mm_task_prev);
+ if (record->flags == PMU_FLAG_SCHED_IN) {
+ pmu_acc_delta(record, &data);
+ }
+ record->flags = PMU_FLAG_SCHED_OUT;
+ pmu_debug_get_cycles(cycles_update_pmu_prev);
+ }
+
+ record = pmu_find_record(next);
+ if (record) {
+ pmu_debug_get_cycles(cycles_mm_task_next);
+ pmu_refresh_counters(record, &data);
+ record->flags = PMU_FLAG_SCHED_IN;
+ pmu_debug_get_cycles(cycles_update_pmu_next);
+ }
+
+ pmu_debug_get_cycles(cycles_end);
+ pmu_debug_context_switch(cycles_begin,
+ cycles_mm_task_prev,
+ cycles_mm_task_next,
+ cycles_update_pmu_prev,
+ cycles_update_pmu_next,
+ cycles_end);
+ return;
+}
+
+static void pmu_cgroup_rmdir(struct cgroup *cgrp)
+{
+ struct diag_pmu_detail *detail;
+ struct pmu_cgroup *pmu_cgrp;
+ struct pmu_registers data;
+ struct pmu_percpu *record;
+ unsigned long flags;
+ int cpu;
+ __maybe_unused cycles_t cycles_begin = 0;
+ __maybe_unused cycles_t cycles_dump = 0;
+ __maybe_unused cycles_t cycles_detach = 0;
+
+ if (!cgrp)
+ return;
+
+ pmu_cgrp = pmu_find_cgroup(cgrp);
+ if (unlikely(!pmu_cgrp))
+ return;
+
+ get_online_cpus();
+ preempt_disable();
+
+ pmu_debug_get_cycles(cycles_begin);
+
+ pmu_cgrp = pmu_find_cgroup(cgrp);
+ if (!pmu_cgrp) {
+ preempt_enable();
+ put_online_cpus();
+ return;
+ }
+
+ for_each_online_cpu(cpu) {
+ record = (struct pmu_percpu*)&(pmu_cgrp->percpu_data[cpu]);
+ if (record) {
+ memset(&data, 0, sizeof(data));
+ pmu_read_and_clear_record(&data, record);
+ if (!data.instructions && !data.cycles && !data.ref_cycles)
+ continue;
+
+ detail = &get_percpu_context()->pmu.detail;
+ pmu_fill_core_detail(detail, &data, cpu);
+ memcpy(&detail->cgrp_buf, &pmu_cgrp->cgrp_buf, CGROUP_NAME_LEN);
+
+ diag_variant_buffer_spin_lock(&pmu_variant_buffer, flags);
+ diag_variant_buffer_reserve(&pmu_variant_buffer, sizeof(*detail));
+ diag_variant_buffer_write_nolock(&pmu_variant_buffer, detail,
+ sizeof(*detail));
+ diag_variant_buffer_seal(&pmu_variant_buffer);
+ diag_variant_buffer_spin_unlock(&pmu_variant_buffer, flags);
+ }
+ }
+
+ pmu_debug_get_cycles(cycles_dump);
+
+ pmu_detach_cgroup(cgrp);
+
+ pmu_debug_get_cycles(cycles_detach);
+
+ pmu_debug_cgroup_rmdir(cycles_begin, cycles_dump, cycles_detach);
+
+ preempt_enable();
+ put_online_cpus();
+}
+
+#if KERNEL_VERSION(4, 19, 0) <= LINUX_VERSION_CODE
+__maybe_unused static void trace_cgroup_rmdir_hit(void *__data,
+ struct cgroup *cgrp, const char *path)
+{
+ pmu_cgroup_rmdir(cgrp);
+}
+#elif KERNEL_VERSION(4, 9, 0) <= LINUX_VERSION_CODE
+__maybe_unused static void trace_cgroup_rmdir_hit(void *__data,
+ struct cgroup *cgrp)
+{
+ pmu_cgroup_rmdir(cgrp);
+}
+#endif
+
+static void pmu_cgroup_mkdir(struct cgroup *cgrp)
+{
+ __maybe_unused cycles_t cycles_begin = 0;
+ __maybe_unused cycles_t cycles_end;
+
+ if (unlikely(!cgrp))
+ return;
+
+#if KERNEL_VERSION(4, 8, 0) < LINUX_VERSION_CODE
+ if (!(cgrp->subtree_ss_mask & (1 << cpuacct_cgrp_id)))
+ return;
+#else
+ if (!cgrp->subsys[cpuacct_subsys_id])
+ return;
+#endif
+
+ preempt_disable();
+
+ pmu_debug_get_cycles(cycles_begin);
+ pmu_attach_cgroup(cgrp);
+ pmu_debug_get_cycles(cycles_end);
+
+ preempt_enable();
+
+ pmu_debug_cgroup_mkdir(cycles_begin, cycles_end);
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 19, 0)
+__maybe_unused static void trace_cgroup_mkdir_hit(void *__data,
+ struct cgroup *cgrp, const char *path)
+{
+ pmu_cgroup_mkdir(cgrp);
+}
+#elif LINUX_VERSION_CODE >= KERNEL_VERSION(4, 9, 0)
+__maybe_unused static void trace_cgroup_mkdir_hit(void *__data,
+ struct cgroup *cgrp)
+{
+ pmu_cgroup_mkdir(cgrp);
+}
+#else
+
+__maybe_unused static int kprobe_cgroup_populate_dir_pre(struct kprobe *p,
+ struct pt_regs *regs)
+{
+ struct cgroup * cgrp = (void *)ORIG_PARAM1(regs);
+
+ pmu_cgroup_mkdir(cgrp);
+ return 0;
+}
+
+__maybe_unused static int kprobe_cgroup_destroy_locked_pre(struct kprobe *p,
+ struct pt_regs *regs)
+{
+ struct cgroup * cgrp = (void *)ORIG_PARAM1(regs);
+
+ pmu_cgroup_rmdir(cgrp);
+ return 0;
+}
+
+#endif
+
+static void pmu_unhook_cgroup_create(void)
+{
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 9, 0)
+ unhook_kprobe(&kprobe_cgroup_populate_dir);
+#else
+ unhook_tracepoint("cgroup_mkdir", trace_cgroup_mkdir_hit, NULL);
+#endif
+}
+
+static void pmu_unhook_cgroup_destroy(void)
+{
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 9, 0)
+ unhook_kprobe(&kprobe_cgroup_destroy_locked);
+#else
+ unhook_tracepoint("cgroup_rmdir", trace_cgroup_rmdir_hit, NULL);
+#endif
+}
+
+static int pmu_hook_cgroup_create(void)
+{
+ int ret;
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 9, 0)
+ ret = hook_tracepoint("cgroup_mkdir", trace_cgroup_mkdir_hit, NULL);
+#else
+ ret = hook_kprobe(&kprobe_cgroup_populate_dir, "cgroup_populate_dir",
+ kprobe_cgroup_populate_dir_pre, NULL);
+#endif
+
+ if (ret)
+ pr_err("pmu: failed to hook cgroup_mkdir, ret=%d\n", ret);
+
+ return ret;
+}
+
+static int pmu_hook_cgroup_destroy(void)
+{
+ int ret;
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 9, 0)
+ ret = hook_tracepoint("cgroup_rmdir", trace_cgroup_rmdir_hit, NULL);
+#else
+ ret = hook_kprobe(&kprobe_cgroup_destroy_locked, "cgroup_destroy_locked",
+ kprobe_cgroup_destroy_locked_pre, NULL);
+#endif
+
+ if (ret)
+ pr_err("pmu: failed to hook cgroup_rmdir, ret=%d\n", ret);
+
+ return ret;
+}
+
+static int __activate_pmu(void)
+{
+ int ret = 0;
+
+ pmu_clean_data();
+ pmu_attach_all_cgroups();
+
+ ret = pmu_create_all_events();
+ if (ret) {
+ pr_err("pmu: failed to activate pmu, ret=%d\n", ret);
+ goto err_out;
+ }
+
+ ret = pmu_hook_cgroup_create();
+ if (ret)
+ goto err_detach;
+
+ ret = pmu_hook_cgroup_destroy();
+ if (ret) {
+ goto err_unhook_cgroup_mkdir;
+ }
+
+ ret = hook_tracepoint("sched_switch", trace_sched_switch_hit, NULL);
+ if (ret) {
+ pr_err("pmu: failed to hook sched_switch, ret=%d\n", ret);
+ goto err_unhook_cgroup_rmdir;
+ }
+
+ pmu_settings.activated = 1;
+ return 1;
+
+err_unhook_cgroup_rmdir:
+ pmu_unhook_cgroup_destroy();
+
+err_unhook_cgroup_mkdir:
+ pmu_unhook_cgroup_create();
+
+err_detach:
+ synchronize_sched();
+
+ pmu_detach_all_cgroups();
+ pmu_destroy_all_events();
+
+err_out:
+ return 0;
+}
+
+int activate_pmu(void)
+{
+ int ret = 0;
+
+ down(&diag_pmu_sem);
+ if (!pmu_settings.activated)
+ ret = __activate_pmu();
+ up(&diag_pmu_sem);
+
+ return ret;
+}
+
+static int __deactivate_pmu(void)
+{
+ int ret = 0;
+
+ unhook_tracepoint("sched_switch", trace_sched_switch_hit, NULL);
+ pmu_unhook_cgroup_create();
+ pmu_unhook_cgroup_destroy();
+
+ synchronize_sched();
+ msleep(10);
+ pmu_destroy_all_events();
+ pmu_detach_all_cgroups();
+
+ return ret;
+}
+
+int deactivate_pmu(void)
+{
+ int ret = 0;
+
+ down(&diag_pmu_sem);
+ if (pmu_settings.activated) {
+ __deactivate_pmu();
+ } else {
+ ret = -EAGAIN;
+ }
+ pmu_settings.activated = 0;
+ up(&diag_pmu_sem);
+
+ return ret;
+}
+
+long diag_ioctl_pmu(unsigned int cmd, unsigned long arg)
+{
+ int ret = -EINVAL;
+ int sample;
+ static struct diag_pmu_settings settings;
+ struct diag_ioctl_dump_param dump_param;
+
+ switch (cmd) {
+ case CMD_PMU_SET:
+ down(&diag_pmu_sem);
+ if (pmu_settings.activated) {
+ ret = -EBUSY;
+ } else {
+ ret = copy_from_user(&settings, (void *)arg, sizeof(struct diag_pmu_settings));
+ if (!ret) {
+ pmu_settings = settings;
+ pmu_settings.activated = 0;
+ }
+ }
+ up(&diag_pmu_sem);
+
+ break;
+ case CMD_PMU_SETTINGS:
+ settings = pmu_settings;
+ ret = copy_to_user((void *)arg, &settings, sizeof(struct diag_pmu_settings));
+
+ break;
+ case CMD_PMU_DUMP:
+ ret = copy_from_user(&dump_param, (void *)arg, sizeof(struct diag_ioctl_dump_param));
+ if (!ret) {
+ pmu_do_dump();
+ ret = copy_to_user_variant_buffer(&pmu_variant_buffer,
+ dump_param.user_ptr_len, dump_param.user_buf, dump_param.user_buf_len);
+ }
+
+ break;
+ case CMD_PMU_SAMPLE:
+ ret = copy_from_user(&sample, (void *)arg, sizeof(int));
+ if (!ret) {
+ pmu_settings.sample = sample;
+ }
+
+ break;
+ default:
+ break;
+ }
+
+ return ret;
+}
+
+static int diag_pmu_mem_pool_grow(unsigned int num)
+{
+ int ret;
+
+ if (diag_pmu_buffer_curr + num > CGROUP_BUFFER_MAX_COUNT)
+ return -EINVAL;
+
+ ret = ali_mem_pool_putin(&mem_pool, num);
+ if (ret) {
+ pr_err("pmu: grow mem_pool failed, ret=%d, num=%u\n",
+ ret, num);
+ return ret;
+ }
+
+ diag_pmu_buffer_grow = num;
+ diag_pmu_buffer_curr += num;
+
+ return 0;
+}
+
+static int pmu_lookup_syms(void)
+{
+#if defined(DIAG_ARM64)
+ LOOKUP_SYMS(armpmu_read);
+#else
+ LOOKUP_SYMS(x86_perf_event_update);
+#endif
+
+ return 0;
+}
+
+int diag_pmu_init(void)
+{
+ int ret;
+
+ WRITE_ONCE(diag_pmu_module_state, DIAG_PMU_LOADING);
+ diag_pmu_pool_init();
+ diag_pmu_radix_init();
+ ret = init_diag_variant_buffer(&pmu_variant_buffer, DIAG_PMU_VARIANT_BUF_LEN);
+ if (ret) {
+ pr_err("pmu: init variant_buffer failed, ret=%d\n", ret);
+ return ret;
+ }
+
+ ret = alloc_diag_variant_buffer(&pmu_variant_buffer);
+ if (ret) {
+ pr_err("pmu: alloc variant_buffer failed, ret=%d\n", ret);
+ goto out_destroy_variant_buffer;
+ }
+
+ ret = diag_pmu_mem_pool_grow(CGROUP_BUFFER_INIT_COUNT);
+ if (ret) {
+ goto out_destroy_variant_buffer;
+ }
+
+ if (pmu_lookup_syms()) {
+ ret = -EINVAL;
+ goto out_destroy_mem_pool;
+ }
+
+ diag_pmu_init_wq();
+
+ ret = pmu_debug_proc_create();
+ if (ret) {
+ goto out_destroy_mem_pool;
+ }
+
+ WRITE_ONCE(diag_pmu_module_state, DIAG_PMU_LOADED);
+ return 0;
+
+out_destroy_mem_pool:
+ ali_mem_pool_destroy(&mem_pool);
+out_destroy_variant_buffer:
+ destroy_diag_variant_buffer(&pmu_variant_buffer);
+ return ret;
+}
+
+void diag_pmu_exit(void)
+{
+ down(&diag_pmu_sem);
+ WRITE_ONCE(diag_pmu_module_state, DIAG_PMU_EXITING);
+ pmu_debug_proc_destroy();
+
+ if (pmu_settings.activated) {
+ __deactivate_pmu();
+ }
+
+ destroy_diag_variant_buffer(&pmu_variant_buffer);
+
+ ali_mem_pool_destroy(&mem_pool);
+ WRITE_ONCE(diag_pmu_module_state, DIAG_PMU_EXITED);
+ up(&diag_pmu_sem);
+}
+
+#endif
diff --git a/SOURCE/module/pmu/pmu.c b/SOURCE/module/pmu/pmu.c
new file mode 100644
index 0000000..8495dcd
--- /dev/null
+++ b/SOURCE/module/pmu/pmu.c
@@ -0,0 +1,615 @@
+/*
+ * Linux kernel diagnostic tool -- kernel-side pmu support
+ *
+ * Copyright (C) 2020 Alibaba Ltd.
+ *
+ * Author: Wen Yang <[email protected]>
+ *
+ * License terms: GNU General Public License (GPL) version 3
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/hrtimer.h>
+#include <linux/kernel.h>
+#include <linux/kallsyms.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/timex.h>
+#include <linux/tracepoint.h>
+#include <trace/events/irq.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <trace/events/napi.h>
+#include <linux/rtc.h>
+#include <linux/time.h>
+#include <linux/radix-tree.h>
+#include <linux/cpu.h>
+#include <linux/syscalls.h>
+#include <linux/vmalloc.h>
+#include <asm/irq_regs.h>
+
+#include "uapi/pmu.h"
+#include "pub/trace_file.h"
+#include "pub/variant_buffer.h"
+#include "pub/trace_point.h"
+#include "pub/cgroup.h"
+#include "pub/mem_pool.h"
+#include "pmu/pmu.h"
+#include "pmu/debug.h"
+
+atomic64_t pmu_nr_running = ATOMIC64_INIT(0);
+struct diag_pmu_settings effective_pmu_settings = {0};
+struct diag_pmu_settings pmu_settings = {0};
+
+struct ali_mem_pool mem_pool;
+struct diag_variant_buffer pmu_variant_buffer;
+
+static DEFINE_SPINLOCK(tree_lock);
+struct radix_tree_root pmu_cgroup_tree;
+
+DEFINE_PER_CPU(struct work_struct, dump_pmu_works);
+
+#if defined (DIAG_ARM64)
+void (*orig_armpmu_read)(struct perf_event *event) = NULL;
+#else
+u64 (*orig_x86_perf_event_update)(struct perf_event *event) = NULL;
+#endif
+
+static struct perf_event_attr pmu_attrs[PMU_INDEX_MAX] =
+{
+ [PMU_INDEX_CYCLES] = {
+ .type = PERF_TYPE_HARDWARE,
+ .config = PERF_COUNT_HW_CPU_CYCLES,
+ .size = sizeof(struct perf_event_attr),
+ .pinned = 1,
+ .disabled = 0,
+ },
+ [PMU_INDEX_INSTRUCTIONS] = {
+ .type = PERF_TYPE_HARDWARE,
+ .config = PERF_COUNT_HW_INSTRUCTIONS,
+ .size = sizeof(struct perf_event_attr),
+ .pinned = 1,
+ .disabled = 0,
+ },
+ [PMU_INDEX_REF_CYCLES] = {
+ .type = PERF_TYPE_HARDWARE,
+ .config = PERF_COUNT_HW_REF_CPU_CYCLES,
+ .size = sizeof(struct perf_event_attr),
+ .pinned = 1,
+ .disabled = 0,
+ },
+ [PMU_INDEX_BRANCH_MISSES] = {
+ .type = PERF_TYPE_HARDWARE,
+ .config = PERF_COUNT_HW_BRANCH_MISSES,
+ .size = sizeof(struct perf_event_attr),
+ .pinned = 1,
+ .disabled = 0,
+ },
+
+#if defined (DIAG_ARM64)
+ [PMU_INDEX_LLC_MISSES] = {
+ .type = PERF_TYPE_RAW,
+ .config = 16395,
+ .size = sizeof(struct perf_event_attr),
+ .pinned = 1,
+ .disabled = 0,
+ },
+#else
+ [PMU_INDEX_LLC_MISSES] = {
+ .type = PERF_TYPE_HARDWARE,
+ .config = PERF_COUNT_HW_CACHE_MISSES,
+ .size = sizeof(struct perf_event_attr),
+ .pinned = 1,
+ .disabled = 0,
+ },
+#endif
+
+ [PMU_INDEX_RAW_EVENT1] = {
+ .type = PERF_TYPE_RAW,
+ .config = 0x151,
+ .size = sizeof(struct perf_event_attr),
+ .pinned = 0,
+ .disabled = 0,
+ },
+ [PMU_INDEX_RAW_EVENT2] = {
+ .type = PERF_TYPE_RAW,
+ .config = 0x3f24,
+ .size = sizeof(struct perf_event_attr),
+ .pinned = 0,
+ .disabled = 0,
+ },
+};
+
+void diag_pmu_radix_init(void)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&tree_lock, flags);
+ INIT_RADIX_TREE(&pmu_cgroup_tree, GFP_ATOMIC);
+ spin_unlock_irqrestore(&tree_lock, flags);
+}
+
+void diag_pmu_pool_init(void)
+{
+ int size;
+
+ size = sizeof(struct pmu_cgroup) + sizeof(struct pmu_percpu) * num_possible_cpus();
+ ali_mem_pool_init(&mem_pool, size);
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,12,0)
+
+static int pmu_cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen)
+{
+ memset(buf, 0, buflen);
+
+ if (orig_kernfs_name && cgrp) {
+ return orig_kernfs_name(cgrp->kn, buf, buflen);
+ } else {
+ return 0;
+ }
+}
+
+#else
+
+static int pmu_cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen)
+{
+ const char *name;
+ memset(buf, 0, buflen);
+
+ if (cgrp) {
+ name = cgroup_name(cgrp);
+ strncpy(buf, name, buflen);
+ buf[buflen - 1] = 0;
+
+ return strlen(buf);
+ }
+
+ return 0;
+}
+#endif
+
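+/*
+ * Allocate the per-cgroup record outside of tree_lock, then re-check the
+ * radix tree under the lock; if another CPU attached this cgroup first,
+ * return the spare record to the memory pool.
+ */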
+void pmu_attach_cgroup(struct cgroup *cgrp)
+{
+ unsigned long flags;
+ struct pmu_cgroup *info;
+ struct pmu_cgroup *tmp;
+
+ info = radix_tree_lookup(&pmu_cgroup_tree, (unsigned long)cgrp);
+ if (info)
+ return;
+
+ tmp = ali_mem_pool_alloc(&mem_pool);
+ if (tmp) {
+ tmp->cgrp = cgrp;
+
+ spin_lock_irqsave(&tree_lock, flags);
+
+ info = radix_tree_lookup(&pmu_cgroup_tree, (unsigned long)cgrp);
+ if (info) {
+ ali_mem_pool_free(&mem_pool, tmp);
+ } else {
+ radix_tree_insert(&pmu_cgroup_tree, (unsigned long)cgrp, tmp);
+ info = tmp;
+ pmu_debug_nr_cgroup_inc();
+ }
+
+ spin_unlock_irqrestore(&tree_lock, flags);
+
+ pmu_cgroup_name(cgrp, info->cgrp_buf, CGROUP_NAME_LEN);
+ info->cgrp_buf[CGROUP_NAME_LEN - 1] = 0;
+ }
+}
+
+void pmu_detach_cgroup(struct cgroup *cgrp)
+{
+ unsigned long flags;
+ struct pmu_cgroup *info;
+
+ if (!cgrp)
+ return;
+
+ spin_lock_irqsave(&tree_lock, flags);
+
+ info = radix_tree_lookup(&pmu_cgroup_tree, (unsigned long)cgrp);
+ if (info) {
+ info->cgrp = NULL;
+ radix_tree_delete(&pmu_cgroup_tree, (unsigned long)cgrp);
+ pmu_debug_nr_cgroup_dec();
+ }
+
+ spin_unlock_irqrestore(&tree_lock, flags);
+
+ if (info) {
+ ali_mem_pool_free(&mem_pool, info);
+ }
+}
+
+static void pmu_release_perf_event(struct perf_event **event)
+{
+ if (event && *event) {
+ printk_ratelimited(KERN_DEBUG "pmu: release perf_event(type=%d,"
+ "config=0x%llx) on cpu[%d]\n", (*event)->attr.type,
+ (*event)->attr.config, (*event)->cpu);
+
+ perf_event_disable(*event);
+ perf_event_release_kernel(*event);
+ *event = NULL;
+ }
+}
+
+static int pmu_destroy_counter(unsigned int cpu)
+{
+ struct diag_percpu_context *context = get_percpu_context_cpu(cpu);
+ int index;
+
+ for (index = 0; index < PMU_INDEX_MAX; ++index)
+ pmu_release_perf_event(&context->pmu.events[index]);
+
+ return 0;
+}
+
+static struct perf_event *pmu_create_perf_event(struct perf_event_attr *attr,
+ int cpu)
+{
+ struct perf_event *event;
+
+ event = perf_event_create_kernel_counter(attr, cpu, NULL, NULL, NULL);
+ if (IS_ERR(event)) {
+ printk_ratelimited(KERN_ERR "pmu: failed to create perf_event(type=%d,"
+ "config=0x%llx) on cpu[%d], ret=%ld\n", attr->type, attr->config,
+ cpu, PTR_ERR(event));
+ goto err_out;
+ }
+
+ printk_ratelimited(KERN_DEBUG "pmu: create perf_event(%d/0x%llx) on cpu[%d]"
+ " successful, state=%d\n", attr->type, attr->config, cpu, event->state);
+
+ perf_event_enable(event);
+
+ return event;
+
+err_out:
+ return NULL;
+}
+
+static int _pmu_create_counter(int conf, int replace_config, int cpu,
+ struct perf_event_attr *attr, struct perf_event **event)
+{
+ if (!conf || !event || *event)
+ return 0;
+
+ if (replace_config)
+ attr->config = conf;
+
+ *event = pmu_create_perf_event(attr, cpu);
+ return *event ? 0 : -EAGAIN;
+}
+
+#if defined(PMU_DEBUG) && PMU_DEBUG > 0
+ #if defined(DIAG_ARM64)
+ #define APROF_FIXED_COUNTERS 2
+ #else
+ #define APROF_FIXED_COUNTERS 3
+ #endif
+#else
+ #define APROF_FIXED_COUNTERS 2
+#endif
+
+static int pmu_create_core_events(int cpu)
+{
+ struct diag_percpu_context *context = get_percpu_context_cpu(cpu);
+ int index;
+ int ret;
+
+ for (index = 0; index < APROF_FIXED_COUNTERS; ++index) {
+ ret = _pmu_create_counter(pmu_settings.conf_fixed_counters, 0, cpu,
+ &pmu_attrs[index], &context->pmu.events[index]);
+ if (ret)
+ goto err_out;
+ }
+
+ ret = _pmu_create_counter(pmu_settings.conf_branch_misses, 0, cpu,
+ &pmu_attrs[PMU_INDEX_BRANCH_MISSES],
+ &context->pmu.events[PMU_INDEX_BRANCH_MISSES]);
+ if (ret)
+ goto err_out;
+
+ ret = _pmu_create_counter(pmu_settings.conf_last_cache_misses, 0, cpu,
+ &pmu_attrs[PMU_INDEX_LLC_MISSES],
+ &context->pmu.events[PMU_INDEX_LLC_MISSES]);
+ if (ret)
+ goto err_out;
+
+ ret = _pmu_create_counter(pmu_settings.conf_raw_pmu_event1, 1, cpu,
+ &pmu_attrs[PMU_INDEX_RAW_EVENT1],
+ &context->pmu.events[PMU_INDEX_RAW_EVENT1]);
+ if (ret)
+ goto err_out;
+
+ ret = _pmu_create_counter(pmu_settings.conf_raw_pmu_event2, 1, cpu,
+ &pmu_attrs[PMU_INDEX_RAW_EVENT2],
+ &context->pmu.events[PMU_INDEX_RAW_EVENT2]);
+ if (ret)
+ goto err_out;
+
+ return 0;
+
+err_out:
+ pmu_destroy_counter(cpu);
+ return ret;
+}
+
+void pmu_destroy_all_events(void)
+{
+ unsigned int cpu;
+
+ get_online_cpus();
+ for_each_online_cpu(cpu)
+ pmu_destroy_counter(cpu);
+ put_online_cpus();
+}
+
+int pmu_create_all_events(void)
+{
+ int cpu;
+ int ret;
+
+ get_online_cpus();
+
+ for_each_online_cpu(cpu) {
+ ret = pmu_create_core_events(cpu);
+ if (ret) {
+ put_online_cpus();
+ goto err_out;
+ }
+ }
+
+ put_online_cpus();
+
+ return 0;
+
+err_out:
+ pmu_destroy_all_events();
+ return ret;
+}
+
+struct cpuacct_impl {
+ struct cgroup_subsys_state css;
+ char internal[0];
+};
+
+static struct cpuacct * cb_attach_cpuacct_cgrp(struct cpuacct *acct, void *data)
+{
+ struct cpuacct_impl *impl;
+
+ if (acct) {
+ impl = (void *)acct;
+ pmu_attach_cgroup(impl->css.cgroup);
+ }
+
+ return NULL;
+}
+
+static struct cpuacct * cb_detach_cpuacct_cgrp(struct cpuacct *acct, void *data)
+{
+ struct cpuacct_impl *impl;
+
+ if (acct) {
+ impl = (void *)acct;
+ pmu_detach_cgroup(impl->css.cgroup);
+ }
+
+ return NULL;
+}
+
+void pmu_attach_all_cgroups(void)
+{
+ cpuacct_cgroup_walk_tree(cb_attach_cpuacct_cgrp, NULL);
+}
+
+void pmu_detach_all_cgroups(void)
+{
+ cpuacct_cgroup_walk_tree(cb_detach_cpuacct_cgrp, NULL);
+}
+
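+/*
+ * Walk all attached cgroups under RCU, fetching up to NR_BATCH records per
+ * radix-tree gang lookup and resuming the scan after the last key seen.
+ */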
+static void pmu_walk_pmu_cgroup_tree(void (*callback)(struct pmu_cgroup *))
+{
+ struct pmu_cgroup *pmu_cgrps[NR_BATCH];
+ struct pmu_cgroup *pmu_cgrp;
+ unsigned long pos = 0;
+ int nr_found;
+ int i;
+
+ rcu_read_lock();
+
+ do {
+ nr_found = radix_tree_gang_lookup(&pmu_cgroup_tree, (void **)pmu_cgrps, pos, NR_BATCH);
+
+ for (i = 0; i < nr_found; i++) {
+ pmu_cgrp = pmu_cgrps[i];
+ callback(pmu_cgrp);
+ pos = (unsigned long)pmu_cgrp->cgrp + 1;
+ }
+ } while (nr_found > 0);
+
+ rcu_read_unlock();
+}
+
+static void pmu_clean_percpu_data(struct pmu_cgroup *pmu_cgrp)
+{
+ if (!pmu_cgrp)
+ return;
+
+ memset(&pmu_cgrp->percpu_data[0], 0,
+ sizeof(struct pmu_percpu) * num_possible_cpus());
+}
+
+void pmu_clean_data(void)
+{
+ pmu_debug_init();
+ pmu_walk_pmu_cgroup_tree(pmu_clean_percpu_data);
+}
+
+void pmu_read_and_clear_record(struct pmu_registers *data,
+ struct pmu_percpu *record)
+{
+ data->instructions = record->sum.instructions;
+ data->cycles = record->sum.cycles;
+ data->ref_cycles = record->sum.ref_cycles;
+ data->branch_misses = record->sum.branch_misses;
+ data->last_cache_misses = record->sum.last_cache_misses;
+ data->raw_pmu_event1 = record->sum.raw_pmu_event1;
+ data->raw_pmu_event2 = record->sum.raw_pmu_event2;
+
+ record->sum.instructions = 0;
+ record->sum.cycles = 0;
+ record->sum.ref_cycles = 0;
+ record->sum.branch_misses = 0;
+ record->sum.last_cache_misses = 0;
+ record->sum.raw_pmu_event1 = 0;
+ record->sum.raw_pmu_event2 = 0;
+}
+
+void pmu_fill_core_detail(struct diag_pmu_detail *detail,
+ const struct pmu_registers *data, int cpu)
+{
+ detail->et_type = et_pmu_detail;
+ detail->cpu = cpu;
+ detail->instructions = data->instructions;
+ detail->ref_cycles = data->ref_cycles;
+ detail->cycles = data->cycles;
+ detail->branch_misses = data->branch_misses;
+ detail->last_cache_misses = data->last_cache_misses;
+ detail->raw_pmu_event1 = data->raw_pmu_event1;
+ detail->raw_pmu_event2 = data->raw_pmu_event2;
+}
+
+static void pmu_dump_local_core(struct pmu_cgroup *pmu_cgrp)
+{
+ struct pmu_registers data = {0};
+ struct diag_pmu_detail *detail;
+ struct pmu_percpu *record;
+ unsigned long flags;
+
+ if (!pmu_cgrp)
+ return;
+
+ preempt_disable();
+ record = (struct pmu_percpu*)&(pmu_cgrp->percpu_data[smp_processor_id()]);
+ pmu_read_and_clear_record(&data, record);
+ if (pmu_settings.conf_fixed_counters && !data.instructions &&
+ !data.cycles && !data.ref_cycles)
+ goto out;
+
+ detail = &get_percpu_context()->pmu.detail;
+ pmu_fill_core_detail(detail, &data, smp_processor_id());
+ memcpy(&detail->cgrp_buf, &pmu_cgrp->cgrp_buf, CGROUP_NAME_LEN);
+
+ diag_variant_buffer_spin_lock(&pmu_variant_buffer, flags);
+ diag_variant_buffer_reserve(&pmu_variant_buffer, sizeof(*detail));
+ diag_variant_buffer_write_nolock(&pmu_variant_buffer, detail, sizeof(*detail));
+ diag_variant_buffer_seal(&pmu_variant_buffer);
+ diag_variant_buffer_spin_unlock(&pmu_variant_buffer, flags);
+
+out:
+ preempt_enable();
+ return;
+}
+
+static void pmu_dump_local(struct work_struct *work)
+{
+ pmu_walk_pmu_cgroup_tree(pmu_dump_local_core);
+}
+
+void diag_pmu_init_wq(void)
+{
+ int i;
+ struct work_struct *dump_work;
+
+ for_each_possible_cpu(i) {
+ dump_work = per_cpu_ptr(&dump_pmu_works, i);
+ INIT_WORK(dump_work, pmu_dump_local);
+ }
+}
+
+static void dump_pmu_all(void)
+{
+ unsigned int cpu;
+
+ if (!pmu_settings.activated)
+ return;
+
+ atomic64_inc_return(&pmu_nr_running);
+ get_online_cpus();
+
+ for_each_online_cpu(cpu)
+ queue_work_on(cpu, system_wq, per_cpu_ptr(&dump_pmu_works, cpu));
+
+ for_each_online_cpu(cpu)
+ flush_work(per_cpu_ptr(&dump_pmu_works, cpu));
+
+ put_online_cpus();
+ atomic64_dec_return(&pmu_nr_running);
+}
+
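+/*
+ * Called from the per-cpu timer when style != 0: fold the counter delta
+ * accumulated so far into the current task's record, so that long-running
+ * tasks show up without waiting for a context switch.
+ */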
+void diag_pmu_timer(struct diag_percpu_context *context)
+{
+ struct pmu_percpu *record;
+ __maybe_unused cycles_t cycles_begin;
+ __maybe_unused cycles_t cycles_find_record;
+ __maybe_unused cycles_t cycles_update_record;
+ __maybe_unused cycles_t cycles_end;
+
+ if (!pmu_settings.activated ||
+ !pmu_settings.sample ||
+ !pmu_settings.style)
+ return;
+
+ pmu_debug_get_cycles(cycles_begin);
+
+ record = pmu_find_record(current);
+ if (record) {
+ struct pmu_registers data = {0};
+
+ pmu_debug_get_cycles(cycles_find_record);
+
+ pmu_read_core_registers(&data);
+ if (record->flags == PMU_FLAG_SCHED_IN) {
+ pmu_acc_delta(record, &data);
+ }
+
+ pmu_debug_get_cycles(cycles_update_record);
+ pmu_debug_get_cycles(cycles_end);
+ pmu_debug_in_timer(cycles_begin, cycles_find_record, cycles_update_record, cycles_end);
+ }
+}
+
+void pmu_do_dump(void)
+{
+ static DEFINE_MUTEX(mutex);
+
+ if (!pmu_settings.activated || !pmu_settings.sample)
+ return;
+
+ mutex_lock(&mutex);
+ dump_pmu_all();
+ mutex_unlock(&mutex);
+}
diff --git a/SOURCE/module/pmu/pmu.h b/SOURCE/module/pmu/pmu.h
new file mode 100644
index 0000000..70f0afd
--- /dev/null
+++ b/SOURCE/module/pmu/pmu.h
@@ -0,0 +1,237 @@
+#ifndef APROF_PMU_H
+#define APROF_PMU_H
+
+#include <linux/list.h>
+#include <linux/cache.h>
+#include "internal.h"
+
+#define PMU_FLAG_UNKNOWN 0
+#define PMU_FLAG_SCHED_IN 1
+#define PMU_FLAG_SCHED_OUT 2
+
+
+struct pmu_registers {
+ unsigned long instructions;
+ unsigned long cycles;
+ unsigned long ref_cycles;
+ unsigned long branch_misses;
+ unsigned long last_cache_misses;
+ unsigned long raw_pmu_event1;
+ unsigned long raw_pmu_event2;
+};
+
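+/*
+ * Per-cpu slot of one cgroup: 'last' holds the counter values sampled at
+ * the most recent update, 'sum' accumulates the deltas charged here.
+ */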
+struct pmu_percpu {
+ struct pmu_registers last;
+ struct pmu_registers sum;
+ int flags;
+ unsigned long __pad __attribute__ ((aligned (32)));
+};
+
+struct pmu_cgroup {
+ struct cgroup *cgrp;
+ unsigned long cpu_count;
+ char cgrp_buf[CGROUP_NAME_LEN];
+ struct pmu_percpu percpu_data[0] __attribute__ ((aligned (64)));
+};
+
+void pmu_read_and_clear_record(struct pmu_registers *data,
+ struct pmu_percpu *record);
+
+void diag_pmu_pool_init(void);
+void diag_pmu_radix_init(void);
+void diag_pmu_init_wq(void);
+
+void pmu_do_dump(void);
+
+int pmu_create_all_events(void);
+void pmu_destroy_all_events(void);
+
+void pmu_clean_data(void);
+void pmu_attach_all_cgroups(void);
+void pmu_detach_all_cgroups(void);
+
+void pmu_fill_core_detail(struct diag_pmu_detail *detail,
+ const struct pmu_registers *data, int cpu);
+
+void pmu_detach_cgroup(struct cgroup *cgrp);
+void pmu_attach_cgroup(struct cgroup *cgrp);
+
+int pmu_cpuhp_register(void);
+void pmu_cpuhp_unregister(void);
+
+struct perf_event;
+
+#if defined(DIAG_ARM64)
+extern void (*orig_armpmu_read)(struct perf_event *event);
+#else
+extern u64 (*orig_x86_perf_event_update)(struct perf_event *event);
+#endif
+
+extern struct diag_pmu_settings pmu_settings;
+
+static inline void pmu_refresh_counters(struct pmu_percpu *record,
+ const struct pmu_registers *data)
+{
+ record->last.instructions = data->instructions;
+ record->last.cycles = data->cycles;
+ record->last.ref_cycles = data->ref_cycles;
+ record->last.branch_misses = data->branch_misses;
+ record->last.last_cache_misses = data->last_cache_misses;
+ record->last.raw_pmu_event1 = data->raw_pmu_event1;
+ record->last.raw_pmu_event2 = data->raw_pmu_event2;
+}
+
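+/*
+ * Accumulate curr - *prev into *sum and advance *prev; negative deltas
+ * (counter reset or reprogrammed) are dropped rather than charged.
+ */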
+static inline void handle_delta(unsigned long curr, unsigned long* prev,
+ unsigned long *sum, char * prefix)
+{
+ signed long delta = curr - *prev;
+
+ *prev = curr;
+ if (likely(delta > 0)) {
+ *sum += delta;
+ }
+}
+
+static inline void pmu_acc_delta(struct pmu_percpu *record,
+ const struct pmu_registers *data)
+{
+ handle_delta(data->instructions, &record->last.instructions,
+ &record->sum.instructions, "instructions");
+ handle_delta(data->cycles, &record->last.cycles,
+ &record->sum.cycles, "cycles");
+ handle_delta(data->ref_cycles, &record->last.ref_cycles,
+ &record->sum.ref_cycles, "ref_cycles");
+ handle_delta(data->branch_misses, &record->last.branch_misses,
+ &record->sum.branch_misses, "branch_misses");
+ handle_delta(data->last_cache_misses, &record->last.last_cache_misses,
+ &record->sum.last_cache_misses, "last_cache_misses");
+ handle_delta(data->raw_pmu_event1, &record->last.raw_pmu_event1,
+ &record->sum.raw_pmu_event1, "raw_pmu_event1");
+ handle_delta(data->raw_pmu_event2, &record->last.raw_pmu_event2,
+ &record->sum.raw_pmu_event2, "raw_pmu_event2");
+}
+
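+/*
+ * Refresh event->count from the hardware counter while the event is
+ * active on the local cpu, then return the cached count.
+ */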
+static inline unsigned long pmu_read_core_event(struct perf_event *event)
+{
+ unsigned long flags;
+
+ if (!event)
+ return 0;
+
+#if defined (DIAG_ARM64)
+ if (!orig_armpmu_read)
+ return 0;
+#else
+ if (!orig_x86_perf_event_update)
+ return 0;
+#endif
+
+ if (event->state == PERF_EVENT_STATE_ACTIVE) {
+ local_irq_save(flags);
+#if defined (DIAG_ARM64)
+ orig_armpmu_read(event);
+#else
+ orig_x86_perf_event_update(event);
+#endif
+ local_irq_restore(flags);
+ }
+
+ return local64_read(&event->count);
+}
+
+static inline void pmu_read_core_registers(struct pmu_registers *data)
+{
+ struct diag_percpu_context *ctx = get_percpu_context();
+
+ if (unlikely(!ctx))
+ return;
+
+ if (pmu_settings.conf_fixed_counters) {
+ data->cycles = pmu_read_core_event(ctx->pmu.events[PMU_INDEX_CYCLES]);
+ data->instructions = pmu_read_core_event(ctx->pmu.events[PMU_INDEX_INSTRUCTIONS]);
+ data->ref_cycles = pmu_read_core_event(ctx->pmu.events[PMU_INDEX_REF_CYCLES]);
+ }
+
+ if (pmu_settings.conf_branch_misses)
+ data->branch_misses = pmu_read_core_event(ctx->pmu.events[PMU_INDEX_BRANCH_MISSES]);
+
+ if (pmu_settings.conf_last_cache_misses) {
+ data->last_cache_misses = pmu_read_core_event(ctx->pmu.events[PMU_INDEX_LLC_MISSES]);
+ }
+
+ if (pmu_settings.conf_raw_pmu_event1)
+ data->raw_pmu_event1 = pmu_read_core_event(ctx->pmu.events[PMU_INDEX_RAW_EVENT1]);
+
+ if (pmu_settings.conf_raw_pmu_event2)
+ data->raw_pmu_event2 = pmu_read_core_event(ctx->pmu.events[PMU_INDEX_RAW_EVENT2]);
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,12,0)
+#define ACCT_CGRP_ID cpuacct_cgrp_id
+#else
+#define ACCT_CGRP_ID cpuacct_subsys_id
+#endif
+
+struct radix_tree_root;
+extern struct radix_tree_root pmu_cgroup_tree;
+
+static inline struct pmu_cgroup *pmu_pick_cgroup_by_task(struct task_struct *task)
+{
+ struct cgroup *cgrp = NULL;
+ struct pmu_cgroup *info = NULL;
+
+ if (task && task->cgroups &&
+ task->cgroups->subsys &&
+ task->cgroups->subsys[ACCT_CGRP_ID] &&
+ task->cgroups->subsys[ACCT_CGRP_ID]->cgroup)
+ cgrp = task->cgroups->subsys[ACCT_CGRP_ID]->cgroup;
+ else
+ goto out;
+
+ rcu_read_lock();
+ info = radix_tree_lookup(&pmu_cgroup_tree, (unsigned long)cgrp);
+ rcu_read_unlock();
+
+out:
+ return info;
+}
+
+static inline struct pmu_percpu *pmu_find_record(struct task_struct *task)
+{
+ struct pmu_cgroup *pmu_cgrp;
+
+ pmu_cgrp = pmu_pick_cgroup_by_task(task);
+ if (!pmu_cgrp)
+ return NULL;
+
+ return (struct pmu_percpu*)&(pmu_cgrp->percpu_data[smp_processor_id()]);
+}
+
+static inline struct pmu_cgroup *pmu_find_cgroup(struct cgroup *cgrp)
+{
+ struct pmu_cgroup *info = NULL;
+
+ if (!cgrp)
+ goto out;
+
+ rcu_read_lock();
+ info = radix_tree_lookup(&pmu_cgroup_tree, (unsigned long)cgrp);
+ rcu_read_unlock();
+
+out:
+ return info;
+}
+
+#endif /* APROF_PMU_H */
diff --git a/SOURCE/module/pub/cgroup.c b/SOURCE/module/pub/cgroup.c
index 285ccc6..5114ea9 100644
--- a/SOURCE/module/pub/cgroup.c
+++ b/SOURCE/module/pub/cgroup.c
@@ -12,7 +12,7 @@
#include "pub/cgroup.h"
#include "../symbol.h"
-#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) || LINUX_VERSION_CODE > KERNEL_VERSION(4,10,0)
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) || LINUX_VERSION_CODE > KERNEL_VERSION(5,16,0)
struct cgroup * cpuacct_to_cgroup(struct cpuacct *acct)
{
return NULL;
@@ -34,8 +34,6 @@ void diag_cpuacct_cgroup_name_tsk(struct task_struct *tsk, char *buf, unsigned i
#else
-typedef struct cpuacct *(*match_cpuacct)(struct cpuacct *acct, void *data);
-
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,12,0)
#define diag_css_for_each_descendant_pre(pos, css) \
@@ -98,7 +96,7 @@ struct cgroup *diag_cpuacct_cgroup_tsk(struct task_struct *tsk)
return ret;
}
-static struct cpuacct * cpuacct_cgroup_walk_tree(match_cpuacct match_cpuacct, void *data)
+struct cpuacct * cpuacct_cgroup_walk_tree(match_cpuacct match_cpuacct, void *data)
{
struct cpuacct *root = orig_root_cpuacct;
struct cgroup_subsys_state *css;
@@ -173,7 +171,7 @@ static inline int diag_cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen
return 0;
}
-static struct cpuacct * cpuacct_cgroup_walk_tree(match_cpuacct match_cpuacct, void *data)
+struct cpuacct * cpuacct_cgroup_walk_tree(match_cpuacct match_cpuacct, void *data)
{
struct cpuacct *root = orig_root_cpuacct;
struct cgroup_subsys_state *css;
diff --git a/SOURCE/module/pub/cgroup.h b/SOURCE/module/pub/cgroup.h
index ab4f7fa..205db6b 100644
--- a/SOURCE/module/pub/cgroup.h
+++ b/SOURCE/module/pub/cgroup.h
@@ -21,5 +21,8 @@ struct cgroup *diag_cpuacct_cgroup_tsk(struct task_struct *tsk);
void diag_cpuacct_cgroup_name_tsk(struct task_struct *tsk, char *buf, unsigned int count);
struct cgroup * cpuacct_to_cgroup(struct cpuacct *acct);
+typedef struct cpuacct *(*match_cpuacct)(struct cpuacct *acct, void *data);
+struct cpuacct * cpuacct_cgroup_walk_tree(match_cpuacct match_cpuacct, void *data);
+
#endif /* __DIAG_PUB_CGROUP_H */
diff --git a/SOURCE/module/stub.c b/SOURCE/module/stub.c
index 2104444..821954d 100755
--- a/SOURCE/module/stub.c
+++ b/SOURCE/module/stub.c
@@ -260,6 +260,13 @@ int __weak memcg_stats_syscall(struct pt_regs *regs, long id)
return -ENOSYS;
}
+DIAG_WEAK_FUNC_INIT_EXIT(pmu)
+DIAG_WEAK_FUNC_ACT_DEACT_IOCTL(pmu)
+int __weak pmu_syscall(struct pt_regs *regs, long id)
+{
+ return -ENOSYS;
+}
+
void __weak sys_loop_timer(struct diag_percpu_context *context)
{
//
diff --git a/SOURCE/uapi/ali_diagnose.h b/SOURCE/uapi/ali_diagnose.h
index d5c8cdb..d94e413 100644
--- a/SOURCE/uapi/ali_diagnose.h
+++ b/SOURCE/uapi/ali_diagnose.h
@@ -101,6 +101,7 @@ extern unsigned long debug_mode;
#define DIAG_IOCTL_TYPE_MEMCG_STATS (DIAG_IOCTL_TYPE_RSS_MONITOR + 1)
#define DIAG_IOCTL_TYPE_THROTTLE_DELAY (DIAG_IOCTL_TYPE_MEMCG_STATS + 1)
#define DIAG_IOCTL_TYPE_TCP_CONNECT (DIAG_IOCTL_TYPE_THROTTLE_DELAY + 1)
+#define DIAG_IOCTL_TYPE_PMU (DIAG_IOCTL_TYPE_TCP_CONNECT + 1)
-#define DIAG_IOCTL_TYPE_END (DIAG_IOCTL_TYPE_THROTTLE_DELAY + 1)
+#define DIAG_IOCTL_TYPE_END (DIAG_IOCTL_TYPE_PMU + 1)
@@ -355,6 +356,10 @@ struct diag_ioctl_dump_param_cycle {
#define DIAG_BASE_SYSCALL_TCP_CONNECT \
(DIAG_BASE_SYSCALL_THROTTLE_DELAY + DIAG_SYSCALL_INTERVAL)
+//2050
+#define DIAG_BASE_SYSCALL_PMU \
+ (DIAG_BASE_SYSCALL_TCP_CONNECT + DIAG_SYSCALL_INTERVAL)
+
#define DIAG_SYSCALL_END (DIAG_BASE_SYSCALL + DIAG_SYSCALL_INTERVAL * 1000)
enum diag_record_id {
@@ -530,6 +535,9 @@ enum diag_record_id {
et_tcp_connect_base = et_throttle_delay_base + DIAG_EVENT_TYPE_INTERVAL,
et_tcp_connect_detail,
+ et_pmu_base = et_tcp_connect_base + DIAG_EVENT_TYPE_INTERVAL,
+ et_pmu_detail,
+
et_count
};
diff --git a/SOURCE/uapi/pmu.h b/SOURCE/uapi/pmu.h
new file mode 100644
index 0000000..46189c7
--- /dev/null
+++ b/SOURCE/uapi/pmu.h
@@ -0,0 +1,67 @@
+/*
+ * Linux kernel diagnostic tool -- user interface API
+ *
+ * Copyright (C) 2020 Alibaba Ltd.
+ *
+ * Author: Wen Yang <[email protected]>
+ *
+ * License terms: GNU General Public License (GPL) version 3
+ *
+ */
+
+#ifndef UAPI_PMU_H
+#define UAPI_PMU_H
+
+#include <linux/ioctl.h>
+#include "ali_diagnose.h"
+
+#define DIAG_PMU_VARIANT_BUF_LEN (20 * 1024 * 1024)
+
+enum pmu_counters
+{
+ PMU_INDEX_CYCLES = 0,
+ PMU_INDEX_INSTRUCTIONS,
+ PMU_INDEX_REF_CYCLES,
+ PMU_INDEX_BRANCH_MISSES,
+ PMU_INDEX_LLC_MISSES,
+ PMU_INDEX_RAW_EVENT1,
+ PMU_INDEX_RAW_EVENT2,
+ PMU_INDEX_MAX,
+};
+
+struct diag_pmu_settings {
+ unsigned int activated;
+ unsigned int verbose;
+ unsigned int style;
+ unsigned int sample;
+ unsigned int conf_fixed_counters;
+ unsigned int conf_branch_misses;
+ unsigned int conf_last_cache_misses;
+ unsigned int conf_raw_pmu_event1;
+ unsigned int conf_raw_pmu_event2;
+};
+
+struct diag_pmu_detail {
+ int et_type;
+ int cpu;
+ char cgrp_buf[CGROUP_NAME_LEN];
+ unsigned long instructions;
+ unsigned long cycles;
+ unsigned long ref_cycles;
+ unsigned long branch_misses;
+ unsigned long last_cache_misses;
+ unsigned long raw_pmu_event1;
+ unsigned long raw_pmu_event2;
+};
+
+#define CMD_PMU_SET (0)
+#define CMD_PMU_SETTINGS (CMD_PMU_SET + 1)
+#define CMD_PMU_DUMP (CMD_PMU_SETTINGS + 1)
+#define CMD_PMU_ISOLATE (CMD_PMU_DUMP + 1)
+#define CMD_PMU_SAMPLE (CMD_PMU_ISOLATE + 1)
+#define DIAG_IOCTL_PMU_SET _IOWR(DIAG_IOCTL_TYPE_PMU, CMD_PMU_SET, struct diag_pmu_settings)
+#define DIAG_IOCTL_PMU_SETTINGS _IOWR(DIAG_IOCTL_TYPE_PMU, CMD_PMU_SETTINGS, struct diag_pmu_settings)
+#define DIAG_IOCTL_PMU_DUMP _IOWR(DIAG_IOCTL_TYPE_PMU, CMD_PMU_DUMP, struct diag_ioctl_dump_param)
+#define DIAG_IOCTL_PMU_SAMPLE _IOWR(DIAG_IOCTL_TYPE_PMU, CMD_PMU_SAMPLE, int)
+
+#endif /* UAPI_PMU_H */