diff options
Diffstat (limited to 'kernel/events')
| -rw-r--r-- | kernel/events/callchain.c | 4 | ||||
| -rw-r--r-- | kernel/events/core.c | 724 | ||||
| -rw-r--r-- | kernel/events/hw_breakpoint.c | 3 | ||||
| -rw-r--r-- | kernel/events/ring_buffer.c | 2 | ||||
| -rw-r--r-- | kernel/events/uprobes.c | 44 |
5 files changed, 567 insertions, 210 deletions
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index b9c7e00725d6..9d24b6e0c91f 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -2,7 +2,7 @@ /* * Performance events callchain code, extracted from core.c: * - * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> + * Copyright (C) 2008 Linutronix GmbH, Thomas Gleixner <tglx@kernel.org> * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> @@ -246,7 +246,7 @@ get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, if (user && !crosstask) { if (!user_mode(regs)) { - if (current->flags & (PF_KTHREAD | PF_USER_WORKER)) + if (!is_user_task(current)) goto exit_put; regs = task_pt_regs(current); } diff --git a/kernel/events/core.c b/kernel/events/core.c index dad0d3d2e85f..6d1f8bad7e1c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2,7 +2,7 @@ /* * Performance events core code: * - * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> + * Copyright (C) 2008 Linutronix GmbH, Thomas Gleixner <tglx@kernel.org> * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> @@ -57,6 +57,7 @@ #include <linux/task_work.h> #include <linux/percpu-rwsem.h> #include <linux/unwind_deferred.h> +#include <linux/kvm_types.h> #include "internal.h" @@ -166,6 +167,18 @@ enum event_type_t { EVENT_CPU = 0x10, EVENT_CGROUP = 0x20, + /* + * EVENT_GUEST is set when scheduling in/out events between the host + * and a guest with a mediated vPMU. Among other things, EVENT_GUEST + * is used: + * + * - In for_each_epc() to skip PMUs that don't support events in a + * MEDIATED_VPMU guest, i.e. don't need to be context switched. + * - To indicate the start/end point of the events in a guest. Guest + * running time is deducted for host-only (exclude_guest) events. + */ + EVENT_GUEST = 0x40, + EVENT_FLAGS = EVENT_CGROUP | EVENT_GUEST, /* compound helpers */ EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN, @@ -458,6 +471,20 @@ static cpumask_var_t perf_online_pkg_mask; static cpumask_var_t perf_online_sys_mask; static struct kmem_cache *perf_event_cache; +#ifdef CONFIG_PERF_GUEST_MEDIATED_PMU +static DEFINE_PER_CPU(bool, guest_ctx_loaded); + +static __always_inline bool is_guest_mediated_pmu_loaded(void) +{ + return __this_cpu_read(guest_ctx_loaded); +} +#else +static __always_inline bool is_guest_mediated_pmu_loaded(void) +{ + return false; +} +#endif + /* * perf event paranoia level: * -1 - not paranoid at all @@ -779,33 +806,97 @@ do { \ ___p; \ }) -#define for_each_epc(_epc, _ctx, _pmu, _cgroup) \ +static bool perf_skip_pmu_ctx(struct perf_event_pmu_context *pmu_ctx, + enum event_type_t event_type) +{ + if ((event_type & EVENT_CGROUP) && !pmu_ctx->nr_cgroups) + return true; + if ((event_type & EVENT_GUEST) && + !(pmu_ctx->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU)) + return true; + return false; +} + +#define for_each_epc(_epc, _ctx, _pmu, _event_type) \ list_for_each_entry(_epc, &((_ctx)->pmu_ctx_list), pmu_ctx_entry) \ - if (_cgroup && !_epc->nr_cgroups) \ + if (perf_skip_pmu_ctx(_epc, _event_type)) \ continue; \ else if (_pmu && _epc->pmu != _pmu) \ continue; \ else -static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup) +static void perf_ctx_disable(struct perf_event_context *ctx, + enum event_type_t event_type) { struct perf_event_pmu_context *pmu_ctx; - for_each_epc(pmu_ctx, ctx, NULL, cgroup) + for_each_epc(pmu_ctx, ctx, NULL, event_type) perf_pmu_disable(pmu_ctx->pmu); } -static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup) +static void perf_ctx_enable(struct perf_event_context *ctx, + enum event_type_t event_type) { struct perf_event_pmu_context *pmu_ctx; - for_each_epc(pmu_ctx, ctx, NULL, cgroup) + for_each_epc(pmu_ctx, ctx, NULL, event_type) perf_pmu_enable(pmu_ctx->pmu); } static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type); static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type); +static inline void update_perf_time_ctx(struct perf_time_ctx *time, u64 now, bool adv) +{ + if (adv) + time->time += now - time->stamp; + time->stamp = now; + + /* + * The above: time' = time + (now - timestamp), can be re-arranged + * into: time` = now + (time - timestamp), which gives a single value + * offset to compute future time without locks on. + * + * See perf_event_time_now(), which can be used from NMI context where + * it's (obviously) not possible to acquire ctx->lock in order to read + * both the above values in a consistent manner. + */ + WRITE_ONCE(time->offset, time->time - time->stamp); +} + +static_assert(offsetof(struct perf_event_context, timeguest) - + offsetof(struct perf_event_context, time) == + sizeof(struct perf_time_ctx)); + +#define T_TOTAL 0 +#define T_GUEST 1 + +static inline u64 __perf_event_time_ctx(struct perf_event *event, + struct perf_time_ctx *times) +{ + u64 time = times[T_TOTAL].time; + + if (event->attr.exclude_guest) + time -= times[T_GUEST].time; + + return time; +} + +static inline u64 __perf_event_time_ctx_now(struct perf_event *event, + struct perf_time_ctx *times, + u64 now) +{ + if (is_guest_mediated_pmu_loaded() && event->attr.exclude_guest) { + /* + * (now + times[total].offset) - (now + times[guest].offset) := + * times[total].offset - times[guest].offset + */ + return READ_ONCE(times[T_TOTAL].offset) - READ_ONCE(times[T_GUEST].offset); + } + + return now + READ_ONCE(times[T_TOTAL].offset); +} + #ifdef CONFIG_CGROUP_PERF static inline bool @@ -842,12 +933,16 @@ static inline int is_cgroup_event(struct perf_event *event) return event->cgrp != NULL; } +static_assert(offsetof(struct perf_cgroup_info, timeguest) - + offsetof(struct perf_cgroup_info, time) == + sizeof(struct perf_time_ctx)); + static inline u64 perf_cgroup_event_time(struct perf_event *event) { struct perf_cgroup_info *t; t = per_cpu_ptr(event->cgrp->info, event->cpu); - return t->time; + return __perf_event_time_ctx(event, &t->time); } static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) @@ -856,20 +951,21 @@ static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) t = per_cpu_ptr(event->cgrp->info, event->cpu); if (!__load_acquire(&t->active)) - return t->time; - now += READ_ONCE(t->timeoffset); - return now; + return __perf_event_time_ctx(event, &t->time); + + return __perf_event_time_ctx_now(event, &t->time, now); } -static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv) +static inline void __update_cgrp_guest_time(struct perf_cgroup_info *info, u64 now, bool adv) { - if (adv) - info->time += now - info->timestamp; - info->timestamp = now; - /* - * see update_context_time() - */ - WRITE_ONCE(info->timeoffset, info->time - info->timestamp); + update_perf_time_ctx(&info->timeguest, now, adv); +} + +static inline void update_cgrp_time(struct perf_cgroup_info *info, u64 now) +{ + update_perf_time_ctx(&info->time, now, true); + if (is_guest_mediated_pmu_loaded()) + __update_cgrp_guest_time(info, now, true); } static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final) @@ -885,7 +981,7 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, cgrp = container_of(css, struct perf_cgroup, css); info = this_cpu_ptr(cgrp->info); - __update_cgrp_time(info, now, true); + update_cgrp_time(info, now); if (final) __store_release(&info->active, 0); } @@ -908,11 +1004,11 @@ static inline void update_cgrp_time_from_event(struct perf_event *event) * Do not update time when cgroup is not active */ if (info->active) - __update_cgrp_time(info, perf_clock(), true); + update_cgrp_time(info, perf_clock()); } static inline void -perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) +perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx, bool guest) { struct perf_event_context *ctx = &cpuctx->ctx; struct perf_cgroup *cgrp = cpuctx->cgrp; @@ -932,8 +1028,12 @@ perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) for (css = &cgrp->css; css; css = css->parent) { cgrp = container_of(css, struct perf_cgroup, css); info = this_cpu_ptr(cgrp->info); - __update_cgrp_time(info, ctx->timestamp, false); - __store_release(&info->active, 1); + if (guest) { + __update_cgrp_guest_time(info, ctx->time.stamp, false); + } else { + update_perf_time_ctx(&info->time, ctx->time.stamp, false); + __store_release(&info->active, 1); + } } } @@ -964,8 +1064,7 @@ static void perf_cgroup_switch(struct task_struct *task) return; WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); - - perf_ctx_disable(&cpuctx->ctx, true); + perf_ctx_disable(&cpuctx->ctx, EVENT_CGROUP); ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); /* @@ -981,7 +1080,7 @@ static void perf_cgroup_switch(struct task_struct *task) */ ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); - perf_ctx_enable(&cpuctx->ctx, true); + perf_ctx_enable(&cpuctx->ctx, EVENT_CGROUP); } static int perf_cgroup_ensure_storage(struct perf_event *event, @@ -1138,7 +1237,7 @@ static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, } static inline void -perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) +perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx, bool guest) { } @@ -1550,29 +1649,24 @@ static void perf_unpin_context(struct perf_event_context *ctx) */ static void __update_context_time(struct perf_event_context *ctx, bool adv) { - u64 now = perf_clock(); - lockdep_assert_held(&ctx->lock); - if (adv) - ctx->time += now - ctx->timestamp; - ctx->timestamp = now; + update_perf_time_ctx(&ctx->time, perf_clock(), adv); +} - /* - * The above: time' = time + (now - timestamp), can be re-arranged - * into: time` = now + (time - timestamp), which gives a single value - * offset to compute future time without locks on. - * - * See perf_event_time_now(), which can be used from NMI context where - * it's (obviously) not possible to acquire ctx->lock in order to read - * both the above values in a consistent manner. - */ - WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp); +static void __update_context_guest_time(struct perf_event_context *ctx, bool adv) +{ + lockdep_assert_held(&ctx->lock); + + /* must be called after __update_context_time(); */ + update_perf_time_ctx(&ctx->timeguest, ctx->time.stamp, adv); } static void update_context_time(struct perf_event_context *ctx) { __update_context_time(ctx, true); + if (is_guest_mediated_pmu_loaded()) + __update_context_guest_time(ctx, true); } static u64 perf_event_time(struct perf_event *event) @@ -1585,7 +1679,7 @@ static u64 perf_event_time(struct perf_event *event) if (is_cgroup_event(event)) return perf_cgroup_event_time(event); - return ctx->time; + return __perf_event_time_ctx(event, &ctx->time); } static u64 perf_event_time_now(struct perf_event *event, u64 now) @@ -1599,10 +1693,9 @@ static u64 perf_event_time_now(struct perf_event *event, u64 now) return perf_cgroup_event_time_now(event, now); if (!(__load_acquire(&ctx->is_active) & EVENT_TIME)) - return ctx->time; + return __perf_event_time_ctx(event, &ctx->time); - now += READ_ONCE(ctx->timeoffset); - return now; + return __perf_event_time_ctx_now(event, &ctx->time, now); } static enum event_type_t get_event_type(struct perf_event *event) @@ -2422,20 +2515,23 @@ group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx) } static inline void -__ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, bool final) +__ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, + bool final, enum event_type_t event_type) { if (ctx->is_active & EVENT_TIME) { if (ctx->is_active & EVENT_FROZEN) return; + update_context_time(ctx); - update_cgrp_time_from_cpuctx(cpuctx, final); + /* vPMU should not stop time */ + update_cgrp_time_from_cpuctx(cpuctx, !(event_type & EVENT_GUEST) && final); } } static inline void ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { - __ctx_time_update(cpuctx, ctx, false); + __ctx_time_update(cpuctx, ctx, false, 0); } /* @@ -2861,14 +2957,15 @@ static void task_ctx_sched_out(struct perf_event_context *ctx, static void perf_event_sched_in(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, - struct pmu *pmu) + struct pmu *pmu, + enum event_type_t event_type) { - ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED); + ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED | event_type); if (ctx) - ctx_sched_in(ctx, pmu, EVENT_PINNED); - ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE); + ctx_sched_in(ctx, pmu, EVENT_PINNED | event_type); + ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE | event_type); if (ctx) - ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE); + ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE | event_type); } /* @@ -2902,11 +2999,11 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, event_type &= EVENT_ALL; - for_each_epc(epc, &cpuctx->ctx, pmu, false) + for_each_epc(epc, &cpuctx->ctx, pmu, 0) perf_pmu_disable(epc->pmu); if (task_ctx) { - for_each_epc(epc, task_ctx, pmu, false) + for_each_epc(epc, task_ctx, pmu, 0) perf_pmu_disable(epc->pmu); task_ctx_sched_out(task_ctx, pmu, event_type); @@ -2924,13 +3021,13 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, else if (event_type & EVENT_PINNED) ctx_sched_out(&cpuctx->ctx, pmu, EVENT_FLEXIBLE); - perf_event_sched_in(cpuctx, task_ctx, pmu); + perf_event_sched_in(cpuctx, task_ctx, pmu, 0); - for_each_epc(epc, &cpuctx->ctx, pmu, false) + for_each_epc(epc, &cpuctx->ctx, pmu, 0) perf_pmu_enable(epc->pmu); if (task_ctx) { - for_each_epc(epc, task_ctx, pmu, false) + for_each_epc(epc, task_ctx, pmu, 0) perf_pmu_enable(epc->pmu); } } @@ -3479,11 +3576,10 @@ static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + enum event_type_t active_type = event_type & ~EVENT_FLAGS; struct perf_event_pmu_context *pmu_ctx; int is_active = ctx->is_active; - bool cgroup = event_type & EVENT_CGROUP; - event_type &= ~EVENT_CGROUP; lockdep_assert_held(&ctx->lock); @@ -3507,14 +3603,14 @@ ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t * * would only update time for the pinned events. */ - __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx); + __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx, event_type); /* * CPU-release for the below ->is_active store, * see __load_acquire() in perf_event_time_now() */ barrier(); - ctx->is_active &= ~event_type; + ctx->is_active &= ~active_type; if (!(ctx->is_active & EVENT_ALL)) { /* @@ -3533,9 +3629,20 @@ ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t cpuctx->task_ctx = NULL; } - is_active ^= ctx->is_active; /* changed bits */ + if (event_type & EVENT_GUEST) { + /* + * Schedule out all exclude_guest events of PMU + * with PERF_PMU_CAP_MEDIATED_VPMU. + */ + is_active = EVENT_ALL; + __update_context_guest_time(ctx, false); + perf_cgroup_set_timestamp(cpuctx, true); + barrier(); + } else { + is_active ^= ctx->is_active; /* changed bits */ + } - for_each_epc(pmu_ctx, ctx, pmu, cgroup) + for_each_epc(pmu_ctx, ctx, pmu, event_type) __pmu_ctx_sched_out(pmu_ctx, is_active); } @@ -3691,7 +3798,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next) raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { - perf_ctx_disable(ctx, false); + perf_ctx_disable(ctx, 0); /* PMIs are disabled; ctx->nr_no_switch_fast is stable. */ if (local_read(&ctx->nr_no_switch_fast) || @@ -3715,7 +3822,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next) perf_ctx_sched_task_cb(ctx, task, false); - perf_ctx_enable(ctx, false); + perf_ctx_enable(ctx, 0); /* * RCU_INIT_POINTER here is safe because we've not @@ -3739,13 +3846,13 @@ unlock: if (do_switch) { raw_spin_lock(&ctx->lock); - perf_ctx_disable(ctx, false); + perf_ctx_disable(ctx, 0); inside_switch: perf_ctx_sched_task_cb(ctx, task, false); task_ctx_sched_out(ctx, NULL, EVENT_ALL); - perf_ctx_enable(ctx, false); + perf_ctx_enable(ctx, 0); raw_spin_unlock(&ctx->lock); } } @@ -3992,10 +4099,15 @@ static inline void group_update_userpage(struct perf_event *group_event) event_update_userpage(event); } +struct merge_sched_data { + int can_add_hw; + enum event_type_t event_type; +}; + static int merge_sched_in(struct perf_event *event, void *data) { struct perf_event_context *ctx = event->ctx; - int *can_add_hw = data; + struct merge_sched_data *msd = data; if (event->state <= PERF_EVENT_STATE_OFF) return 0; @@ -4003,13 +4115,22 @@ static int merge_sched_in(struct perf_event *event, void *data) if (!event_filter_match(event)) return 0; - if (group_can_go_on(event, *can_add_hw)) { + /* + * Don't schedule in any host events from PMU with + * PERF_PMU_CAP_MEDIATED_VPMU, while a guest is running. + */ + if (is_guest_mediated_pmu_loaded() && + event->pmu_ctx->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU && + !(msd->event_type & EVENT_GUEST)) + return 0; + + if (group_can_go_on(event, msd->can_add_hw)) { if (!group_sched_in(event, ctx)) list_add_tail(&event->active_list, get_event_list(event)); } if (event->state == PERF_EVENT_STATE_INACTIVE) { - *can_add_hw = 0; + msd->can_add_hw = 0; if (event->attr.pinned) { perf_cgroup_event_disable(event, ctx); perf_event_set_state(event, PERF_EVENT_STATE_ERROR); @@ -4017,7 +4138,8 @@ static int merge_sched_in(struct perf_event *event, void *data) if (*perf_event_fasync(event)) event->pending_kill = POLL_ERR; - perf_event_wakeup(event); + event->pending_wakeup = 1; + irq_work_queue(&event->pending_irq); } else { struct perf_cpu_pmu_context *cpc = this_cpc(event->pmu_ctx->pmu); @@ -4032,11 +4154,15 @@ static int merge_sched_in(struct perf_event *event, void *data) static void pmu_groups_sched_in(struct perf_event_context *ctx, struct perf_event_groups *groups, - struct pmu *pmu) + struct pmu *pmu, + enum event_type_t event_type) { - int can_add_hw = 1; + struct merge_sched_data msd = { + .can_add_hw = 1, + .event_type = event_type, + }; visit_groups_merge(ctx, groups, smp_processor_id(), pmu, - merge_sched_in, &can_add_hw); + merge_sched_in, &msd); } static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx, @@ -4045,20 +4171,18 @@ static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx, struct perf_event_context *ctx = pmu_ctx->ctx; if (event_type & EVENT_PINNED) - pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu); + pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu, event_type); if (event_type & EVENT_FLEXIBLE) - pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu); + pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu, event_type); } static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + enum event_type_t active_type = event_type & ~EVENT_FLAGS; struct perf_event_pmu_context *pmu_ctx; int is_active = ctx->is_active; - bool cgroup = event_type & EVENT_CGROUP; - - event_type &= ~EVENT_CGROUP; lockdep_assert_held(&ctx->lock); @@ -4066,9 +4190,11 @@ ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t return; if (!(is_active & EVENT_TIME)) { + /* EVENT_TIME should be active while the guest runs */ + WARN_ON_ONCE(event_type & EVENT_GUEST); /* start ctx time */ __update_context_time(ctx, false); - perf_cgroup_set_timestamp(cpuctx); + perf_cgroup_set_timestamp(cpuctx, false); /* * CPU-release for the below ->is_active store, * see __load_acquire() in perf_event_time_now() @@ -4076,7 +4202,7 @@ ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t barrier(); } - ctx->is_active |= (event_type | EVENT_TIME); + ctx->is_active |= active_type | EVENT_TIME; if (ctx->task) { if (!(is_active & EVENT_ALL)) cpuctx->task_ctx = ctx; @@ -4084,21 +4210,37 @@ ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t WARN_ON_ONCE(cpuctx->task_ctx != ctx); } - is_active ^= ctx->is_active; /* changed bits */ + if (event_type & EVENT_GUEST) { + /* + * Schedule in the required exclude_guest events of PMU + * with PERF_PMU_CAP_MEDIATED_VPMU. + */ + is_active = event_type & EVENT_ALL; + + /* + * Update ctx time to set the new start time for + * the exclude_guest events. + */ + update_context_time(ctx); + update_cgrp_time_from_cpuctx(cpuctx, false); + barrier(); + } else { + is_active ^= ctx->is_active; /* changed bits */ + } /* * First go through the list and put on any pinned groups * in order to give them the best chance of going on. */ if (is_active & EVENT_PINNED) { - for_each_epc(pmu_ctx, ctx, pmu, cgroup) - __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED); + for_each_epc(pmu_ctx, ctx, pmu, event_type) + __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED | (event_type & EVENT_GUEST)); } /* Then walk through the lower prio flexible groups */ if (is_active & EVENT_FLEXIBLE) { - for_each_epc(pmu_ctx, ctx, pmu, cgroup) - __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE); + for_each_epc(pmu_ctx, ctx, pmu, event_type) + __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE | (event_type & EVENT_GUEST)); } } @@ -4114,11 +4256,11 @@ static void perf_event_context_sched_in(struct task_struct *task) if (cpuctx->task_ctx == ctx) { perf_ctx_lock(cpuctx, ctx); - perf_ctx_disable(ctx, false); + perf_ctx_disable(ctx, 0); perf_ctx_sched_task_cb(ctx, task, true); - perf_ctx_enable(ctx, false); + perf_ctx_enable(ctx, 0); perf_ctx_unlock(cpuctx, ctx); goto rcu_unlock; } @@ -4131,7 +4273,7 @@ static void perf_event_context_sched_in(struct task_struct *task) if (!ctx->nr_events) goto unlock; - perf_ctx_disable(ctx, false); + perf_ctx_disable(ctx, 0); /* * We want to keep the following priority order: * cpu pinned (that don't need to move), task pinned, @@ -4141,18 +4283,18 @@ static void perf_event_context_sched_in(struct task_struct *task) * events, no need to flip the cpuctx's events around. */ if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) { - perf_ctx_disable(&cpuctx->ctx, false); + perf_ctx_disable(&cpuctx->ctx, 0); ctx_sched_out(&cpuctx->ctx, NULL, EVENT_FLEXIBLE); } - perf_event_sched_in(cpuctx, ctx, NULL); + perf_event_sched_in(cpuctx, ctx, NULL, 0); perf_ctx_sched_task_cb(cpuctx->task_ctx, task, true); if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) - perf_ctx_enable(&cpuctx->ctx, false); + perf_ctx_enable(&cpuctx->ctx, 0); - perf_ctx_enable(ctx, false); + perf_ctx_enable(ctx, 0); unlock: perf_ctx_unlock(cpuctx, ctx); @@ -4671,7 +4813,7 @@ static void __perf_event_read(void *info) struct perf_event *sub, *event = data->event; struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); - struct pmu *pmu = event->pmu; + struct pmu *pmu; /* * If this is a task context, we need to check whether it is @@ -4683,7 +4825,7 @@ static void __perf_event_read(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - raw_spin_lock(&ctx->lock); + guard(raw_spinlock)(&ctx->lock); ctx_time_update_event(ctx, event); perf_event_update_time(event); @@ -4691,25 +4833,22 @@ static void __perf_event_read(void *info) perf_event_update_sibling_time(event); if (event->state != PERF_EVENT_STATE_ACTIVE) - goto unlock; + return; if (!data->group) { - pmu->read(event); + perf_pmu_read(event); data->ret = 0; - goto unlock; + return; } + pmu = event->pmu_ctx->pmu; pmu->start_txn(pmu, PERF_PMU_TXN_READ); - pmu->read(event); - + perf_pmu_read(event); for_each_sibling_event(sub, event) perf_pmu_read(sub); data->ret = pmu->commit_txn(pmu); - -unlock: - raw_spin_unlock(&ctx->lock); } static inline u64 perf_event_count(struct perf_event *event, bool self) @@ -4917,7 +5056,7 @@ alloc_perf_context(struct task_struct *task) { struct perf_event_context *ctx; - ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); + ctx = kzalloc_obj(struct perf_event_context); if (!ctx) return NULL; @@ -5057,7 +5196,7 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx, return epc; } - new = kzalloc(sizeof(*epc), GFP_KERNEL); + new = kzalloc_obj(*epc); if (!new) return ERR_PTR(-ENOMEM); @@ -5229,15 +5368,15 @@ static void unaccount_freq_event(void) static struct perf_ctx_data * -alloc_perf_ctx_data(struct kmem_cache *ctx_cache, bool global) +alloc_perf_ctx_data(struct kmem_cache *ctx_cache, bool global, gfp_t gfp_flags) { struct perf_ctx_data *cd; - cd = kzalloc(sizeof(*cd), GFP_KERNEL); + cd = kzalloc_obj(*cd, gfp_flags); if (!cd) return NULL; - cd->data = kmem_cache_zalloc(ctx_cache, GFP_KERNEL); + cd->data = kmem_cache_zalloc(ctx_cache, gfp_flags); if (!cd->data) { kfree(cd); return NULL; @@ -5271,18 +5410,29 @@ static inline void perf_free_ctx_data_rcu(struct perf_ctx_data *cd) static int attach_task_ctx_data(struct task_struct *task, struct kmem_cache *ctx_cache, - bool global) + bool global, gfp_t gfp_flags) { struct perf_ctx_data *cd, *old = NULL; - cd = alloc_perf_ctx_data(ctx_cache, global); + cd = alloc_perf_ctx_data(ctx_cache, global, gfp_flags); if (!cd) return -ENOMEM; for (;;) { - if (try_cmpxchg((struct perf_ctx_data **)&task->perf_ctx_data, &old, cd)) { + if (try_cmpxchg(&task->perf_ctx_data, &old, cd)) { if (old) perf_free_ctx_data_rcu(old); + /* + * Above try_cmpxchg() pairs with try_cmpxchg() from + * detach_task_ctx_data() such that + * if we race with perf_event_exit_task(), we must + * observe PF_EXITING. + */ + if (task->flags & PF_EXITING) { + /* detach_task_ctx_data() may free it already */ + if (try_cmpxchg(&task->perf_ctx_data, &cd, NULL)) + perf_free_ctx_data_rcu(cd); + } return 0; } @@ -5328,6 +5478,8 @@ again: /* Allocate everything */ scoped_guard (rcu) { for_each_process_thread(g, p) { + if (p->flags & PF_EXITING) + continue; cd = rcu_dereference(p->perf_ctx_data); if (cd && !cd->global) { cd->global = 1; @@ -5335,6 +5487,12 @@ again: cd = NULL; } if (!cd) { + /* + * Try to allocate context quickly before + * traversing the whole thread list again. + */ + if (!attach_task_ctx_data(p, ctx_cache, true, GFP_NOWAIT)) + continue; get_task_struct(p); goto alloc; } @@ -5345,7 +5503,7 @@ again: return 0; alloc: - ret = attach_task_ctx_data(p, ctx_cache, true); + ret = attach_task_ctx_data(p, ctx_cache, true, GFP_KERNEL); put_task_struct(p); if (ret) { __detach_global_ctx_data(); @@ -5365,7 +5523,7 @@ attach_perf_ctx_data(struct perf_event *event) return -ENOMEM; if (task) - return attach_task_ctx_data(task, ctx_cache, false); + return attach_task_ctx_data(task, ctx_cache, false, GFP_KERNEL); ret = attach_global_ctx_data(ctx_cache); if (ret) @@ -5400,22 +5558,15 @@ static void __detach_global_ctx_data(void) struct task_struct *g, *p; struct perf_ctx_data *cd; -again: scoped_guard (rcu) { for_each_process_thread(g, p) { cd = rcu_dereference(p->perf_ctx_data); - if (!cd || !cd->global) - continue; - cd->global = 0; - get_task_struct(p); - goto detach; + if (cd && cd->global) { + cd->global = 0; + detach_task_ctx_data(p); + } } } - return; -detach: - detach_task_ctx_data(p); - put_task_struct(p); - goto again; } static void detach_global_ctx_data(void) @@ -5594,6 +5745,8 @@ static void __free_event(struct perf_event *event) { struct pmu *pmu = event->pmu; + security_perf_event_free(event); + if (event->attach_state & PERF_ATTACH_CALLCHAIN) put_callchain_buffers(); @@ -5647,6 +5800,8 @@ static void __free_event(struct perf_event *event) call_rcu(&event->rcu_head, free_event_rcu); } +static void mediated_pmu_unaccount_event(struct perf_event *event); + DEFINE_FREE(__free_event, struct perf_event *, if (_T) __free_event(_T)) /* vs perf_event_alloc() success */ @@ -5656,8 +5811,7 @@ static void _free_event(struct perf_event *event) irq_work_sync(&event->pending_disable_irq); unaccount_event(event); - - security_perf_event_free(event); + mediated_pmu_unaccount_event(event); if (event->rb) { /* @@ -6180,6 +6334,138 @@ u64 perf_event_pause(struct perf_event *event, bool reset) } EXPORT_SYMBOL_GPL(perf_event_pause); +#ifdef CONFIG_PERF_GUEST_MEDIATED_PMU +static atomic_t nr_include_guest_events __read_mostly; + +static atomic_t nr_mediated_pmu_vms __read_mostly; +static DEFINE_MUTEX(perf_mediated_pmu_mutex); + +/* !exclude_guest event of PMU with PERF_PMU_CAP_MEDIATED_VPMU */ +static inline bool is_include_guest_event(struct perf_event *event) +{ + if ((event->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU) && + !event->attr.exclude_guest) + return true; + + return false; +} + +static int mediated_pmu_account_event(struct perf_event *event) +{ + if (!is_include_guest_event(event)) + return 0; + + if (atomic_inc_not_zero(&nr_include_guest_events)) + return 0; + + guard(mutex)(&perf_mediated_pmu_mutex); + if (atomic_read(&nr_mediated_pmu_vms)) + return -EOPNOTSUPP; + + atomic_inc(&nr_include_guest_events); + return 0; +} + +static void mediated_pmu_unaccount_event(struct perf_event *event) +{ + if (!is_include_guest_event(event)) + return; + + if (WARN_ON_ONCE(!atomic_read(&nr_include_guest_events))) + return; + + atomic_dec(&nr_include_guest_events); +} + +/* + * Currently invoked at VM creation to + * - Check whether there are existing !exclude_guest events of PMU with + * PERF_PMU_CAP_MEDIATED_VPMU + * - Set nr_mediated_pmu_vms to prevent !exclude_guest event creation on + * PMUs with PERF_PMU_CAP_MEDIATED_VPMU + * + * No impact for the PMU without PERF_PMU_CAP_MEDIATED_VPMU. The perf + * still owns all the PMU resources. + */ +int perf_create_mediated_pmu(void) +{ + if (atomic_inc_not_zero(&nr_mediated_pmu_vms)) + return 0; + + guard(mutex)(&perf_mediated_pmu_mutex); + if (atomic_read(&nr_include_guest_events)) + return -EBUSY; + + atomic_inc(&nr_mediated_pmu_vms); + return 0; +} +EXPORT_SYMBOL_FOR_KVM(perf_create_mediated_pmu); + +void perf_release_mediated_pmu(void) +{ + if (WARN_ON_ONCE(!atomic_read(&nr_mediated_pmu_vms))) + return; + + atomic_dec(&nr_mediated_pmu_vms); +} +EXPORT_SYMBOL_FOR_KVM(perf_release_mediated_pmu); + +/* When loading a guest's mediated PMU, schedule out all exclude_guest events. */ +void perf_load_guest_context(void) +{ + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + + lockdep_assert_irqs_disabled(); + + guard(perf_ctx_lock)(cpuctx, cpuctx->task_ctx); + + if (WARN_ON_ONCE(__this_cpu_read(guest_ctx_loaded))) + return; + + perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST); + ctx_sched_out(&cpuctx->ctx, NULL, EVENT_GUEST); + if (cpuctx->task_ctx) { + perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST); + task_ctx_sched_out(cpuctx->task_ctx, NULL, EVENT_GUEST); + } + + perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST); + if (cpuctx->task_ctx) + perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST); + + __this_cpu_write(guest_ctx_loaded, true); +} +EXPORT_SYMBOL_GPL(perf_load_guest_context); + +void perf_put_guest_context(void) +{ + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + + lo |
