Diffstat (limited to 'kernel/events')
-rw-r--r--  kernel/events/callchain.c      |   4
-rw-r--r--  kernel/events/core.c           | 724
-rw-r--r--  kernel/events/hw_breakpoint.c  |   3
-rw-r--r--  kernel/events/ring_buffer.c    |   2
-rw-r--r--  kernel/events/uprobes.c        |  44
5 files changed, 567 insertions(+), 210 deletions(-)
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index b9c7e00725d6..9d24b6e0c91f 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -2,7 +2,7 @@
/*
* Performance events callchain code, extracted from core.c:
*
- * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ * Copyright (C) 2008 Linutronix GmbH, Thomas Gleixner <tglx@kernel.org>
* Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
* Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
* Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
@@ -246,7 +246,7 @@ get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
if (user && !crosstask) {
if (!user_mode(regs)) {
- if (current->flags & (PF_KTHREAD | PF_USER_WORKER))
+ if (!is_user_task(current))
goto exit_put;
regs = task_pt_regs(current);
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index dad0d3d2e85f..6d1f8bad7e1c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2,7 +2,7 @@
/*
* Performance events core code:
*
- * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ * Copyright (C) 2008 Linutronix GmbH, Thomas Gleixner <tglx@kernel.org>
* Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
* Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
* Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
@@ -57,6 +57,7 @@
#include <linux/task_work.h>
#include <linux/percpu-rwsem.h>
#include <linux/unwind_deferred.h>
+#include <linux/kvm_types.h>
#include "internal.h"
@@ -166,6 +167,18 @@ enum event_type_t {
EVENT_CPU = 0x10,
EVENT_CGROUP = 0x20,
+ /*
+ * EVENT_GUEST is set when scheduling in/out events between the host
+ * and a guest with a mediated vPMU. Among other things, EVENT_GUEST
+ * is used:
+ *
+ * - In for_each_epc() to skip PMUs that don't support events in a
+ * MEDIATED_VPMU guest, i.e. don't need to be context switched.
+ * - To indicate the start/end point of the events in a guest. Guest
+ * running time is deducted for host-only (exclude_guest) events.
+ */
+ EVENT_GUEST = 0x40,
+ EVENT_FLAGS = EVENT_CGROUP | EVENT_GUEST,
/* compound helpers */
EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN,
@@ -458,6 +471,20 @@ static cpumask_var_t perf_online_pkg_mask;
static cpumask_var_t perf_online_sys_mask;
static struct kmem_cache *perf_event_cache;
+#ifdef CONFIG_PERF_GUEST_MEDIATED_PMU
+static DEFINE_PER_CPU(bool, guest_ctx_loaded);
+
+static __always_inline bool is_guest_mediated_pmu_loaded(void)
+{
+ return __this_cpu_read(guest_ctx_loaded);
+}
+#else
+static __always_inline bool is_guest_mediated_pmu_loaded(void)
+{
+ return false;
+}
+#endif
+
/*
* perf event paranoia level:
* -1 - not paranoid at all
@@ -779,33 +806,97 @@ do { \
___p; \
})
-#define for_each_epc(_epc, _ctx, _pmu, _cgroup) \
+static bool perf_skip_pmu_ctx(struct perf_event_pmu_context *pmu_ctx,
+ enum event_type_t event_type)
+{
+ if ((event_type & EVENT_CGROUP) && !pmu_ctx->nr_cgroups)
+ return true;
+ if ((event_type & EVENT_GUEST) &&
+ !(pmu_ctx->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU))
+ return true;
+ return false;
+}
+
+#define for_each_epc(_epc, _ctx, _pmu, _event_type) \
list_for_each_entry(_epc, &((_ctx)->pmu_ctx_list), pmu_ctx_entry) \
- if (_cgroup && !_epc->nr_cgroups) \
+ if (perf_skip_pmu_ctx(_epc, _event_type)) \
continue; \
else if (_pmu && _epc->pmu != _pmu) \
continue; \
else
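
For reference, a standalone userspace model (not kernel code) of the filtering that perf_skip_pmu_ctx() applies on every for_each_epc() iteration. The struct layout and the PERF_PMU_CAP_MEDIATED_VPMU bit value are simplified stand-ins; only the two checks themselves are taken from the hunk above. With EVENT_GUEST set, a PMU without the mediated-vPMU capability is never visited, so it is not context switched around guest entry.

	/* Standalone model of the for_each_epc()/perf_skip_pmu_ctx() filtering. */
	#include <stdbool.h>
	#include <stdio.h>

	#define EVENT_CGROUP			0x20
	#define EVENT_GUEST			0x40
	#define PERF_PMU_CAP_MEDIATED_VPMU	0x0400	/* bit value is illustrative */

	struct pmu_model	{ unsigned int capabilities; };
	struct pmu_ctx_model	{ struct pmu_model *pmu; int nr_cgroups; };

	/* Mirrors the two checks in perf_skip_pmu_ctx(). */
	static bool skip_pmu_ctx(struct pmu_ctx_model *epc, unsigned int event_type)
	{
		if ((event_type & EVENT_CGROUP) && !epc->nr_cgroups)
			return true;
		if ((event_type & EVENT_GUEST) &&
		    !(epc->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU))
			return true;
		return false;
	}

	int main(void)
	{
		struct pmu_model core	= { .capabilities = PERF_PMU_CAP_MEDIATED_VPMU };
		struct pmu_model uncore	= { .capabilities = 0 };
		struct pmu_ctx_model epcs[] = {
			{ .pmu = &core,   .nr_cgroups = 0 },
			{ .pmu = &uncore, .nr_cgroups = 0 },
		};

		/* With EVENT_GUEST, only the mediated-vPMU-capable PMU is visited. */
		for (unsigned int i = 0; i < 2; i++)
			printf("epc %u: %s\n", i,
			       skip_pmu_ctx(&epcs[i], EVENT_GUEST) ? "skipped" : "visited");
		return 0;
	}
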
-static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup)
+static void perf_ctx_disable(struct perf_event_context *ctx,
+ enum event_type_t event_type)
{
struct perf_event_pmu_context *pmu_ctx;
- for_each_epc(pmu_ctx, ctx, NULL, cgroup)
+ for_each_epc(pmu_ctx, ctx, NULL, event_type)
perf_pmu_disable(pmu_ctx->pmu);
}
-static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup)
+static void perf_ctx_enable(struct perf_event_context *ctx,
+ enum event_type_t event_type)
{
struct perf_event_pmu_context *pmu_ctx;
- for_each_epc(pmu_ctx, ctx, NULL, cgroup)
+ for_each_epc(pmu_ctx, ctx, NULL, event_type)
perf_pmu_enable(pmu_ctx->pmu);
}
static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type);
static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type);
+static inline void update_perf_time_ctx(struct perf_time_ctx *time, u64 now, bool adv)
+{
+ if (adv)
+ time->time += now - time->stamp;
+ time->stamp = now;
+
+ /*
+ * The above: time' = time + (now - timestamp), can be re-arranged
+ * into: time` = now + (time - timestamp), which gives a single value
+ * offset to compute future time without locks on.
+ *
+ * See perf_event_time_now(), which can be used from NMI context where
+ * it's (obviously) not possible to acquire ctx->lock in order to read
+ * both the above values in a consistent manner.
+ */
+ WRITE_ONCE(time->offset, time->time - time->stamp);
+}
+
+static_assert(offsetof(struct perf_event_context, timeguest) -
+ offsetof(struct perf_event_context, time) ==
+ sizeof(struct perf_time_ctx));
+
+#define T_TOTAL 0
+#define T_GUEST 1
+
+static inline u64 __perf_event_time_ctx(struct perf_event *event,
+ struct perf_time_ctx *times)
+{
+ u64 time = times[T_TOTAL].time;
+
+ if (event->attr.exclude_guest)
+ time -= times[T_GUEST].time;
+
+ return time;
+}
+
+static inline u64 __perf_event_time_ctx_now(struct perf_event *event,
+ struct perf_time_ctx *times,
+ u64 now)
+{
+ if (is_guest_mediated_pmu_loaded() && event->attr.exclude_guest) {
+ /*
+ * (now + times[total].offset) - (now + times[guest].offset) :=
+ * times[total].offset - times[guest].offset
+ */
+ return READ_ONCE(times[T_TOTAL].offset) - READ_ONCE(times[T_GUEST].offset);
+ }
+
+ return now + READ_ONCE(times[T_TOTAL].offset);
+}
+
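
As a sanity check of the arithmetic in update_perf_time_ctx() and __perf_event_time_ctx_now(), here is a standalone model (not kernel code, timestamps made up). It shows that an NMI-context reader needs only "now" plus the published offsets, and that for an exclude_guest event while a guest is loaded the two offsets cancel "now", so host-only time stays frozen for as long as the guest runs.

	/* Standalone arithmetic check for the lock-free time snapshot. */
	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	struct time_model { uint64_t time, stamp, offset; };

	/* Mirrors update_perf_time_ctx(): advance, restamp, publish the offset. */
	static void update_time(struct time_model *t, uint64_t now)
	{
		t->time  += now - t->stamp;
		t->stamp  = now;
		t->offset = t->time - t->stamp;	/* so time' = now + offset */
	}

	int main(void)
	{
		/* Context starts timing at t=1000. */
		struct time_model total = { .time = 0, .stamp = 1000 };
		struct time_model guest = { 0 };

		/* Host runs alone until t=1300, then a mediated-vPMU guest is loaded. */
		update_time(&total, 1300);		/* total.time = 300		*/
		guest.stamp  = total.stamp;		/* guest clock starts now	*/
		guest.offset = guest.time - guest.stamp;

		/* While the guest runs, a reader at now=1500 uses only the offsets. */
		uint64_t now       = 1500;
		uint64_t total_now = now + total.offset;		/* 500 ns	*/
		uint64_t host_only = total.offset - guest.offset;	/* 300 ns, frozen */

		assert(total_now == total.time + (now - total.stamp));
		assert(total_now == 500 && host_only == 300);
		printf("total=%llu host_only=%llu\n",
		       (unsigned long long)total_now, (unsigned long long)host_only);
		return 0;
	}
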
#ifdef CONFIG_CGROUP_PERF
static inline bool
@@ -842,12 +933,16 @@ static inline int is_cgroup_event(struct perf_event *event)
return event->cgrp != NULL;
}
+static_assert(offsetof(struct perf_cgroup_info, timeguest) -
+ offsetof(struct perf_cgroup_info, time) ==
+ sizeof(struct perf_time_ctx));
+
static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
struct perf_cgroup_info *t;
t = per_cpu_ptr(event->cgrp->info, event->cpu);
- return t->time;
+ return __perf_event_time_ctx(event, &t->time);
}
static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
@@ -856,20 +951,21 @@ static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
t = per_cpu_ptr(event->cgrp->info, event->cpu);
if (!__load_acquire(&t->active))
- return t->time;
- now += READ_ONCE(t->timeoffset);
- return now;
+ return __perf_event_time_ctx(event, &t->time);
+
+ return __perf_event_time_ctx_now(event, &t->time, now);
}
-static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv)
+static inline void __update_cgrp_guest_time(struct perf_cgroup_info *info, u64 now, bool adv)
{
- if (adv)
- info->time += now - info->timestamp;
- info->timestamp = now;
- /*
- * see update_context_time()
- */
- WRITE_ONCE(info->timeoffset, info->time - info->timestamp);
+ update_perf_time_ctx(&info->timeguest, now, adv);
+}
+
+static inline void update_cgrp_time(struct perf_cgroup_info *info, u64 now)
+{
+ update_perf_time_ctx(&info->time, now, true);
+ if (is_guest_mediated_pmu_loaded())
+ __update_cgrp_guest_time(info, now, true);
}
static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final)
@@ -885,7 +981,7 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx,
cgrp = container_of(css, struct perf_cgroup, css);
info = this_cpu_ptr(cgrp->info);
- __update_cgrp_time(info, now, true);
+ update_cgrp_time(info, now);
if (final)
__store_release(&info->active, 0);
}
@@ -908,11 +1004,11 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
* Do not update time when cgroup is not active
*/
if (info->active)
- __update_cgrp_time(info, perf_clock(), true);
+ update_cgrp_time(info, perf_clock());
}
static inline void
-perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
+perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx, bool guest)
{
struct perf_event_context *ctx = &cpuctx->ctx;
struct perf_cgroup *cgrp = cpuctx->cgrp;
@@ -932,8 +1028,12 @@ perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
for (css = &cgrp->css; css; css = css->parent) {
cgrp = container_of(css, struct perf_cgroup, css);
info = this_cpu_ptr(cgrp->info);
- __update_cgrp_time(info, ctx->timestamp, false);
- __store_release(&info->active, 1);
+ if (guest) {
+ __update_cgrp_guest_time(info, ctx->time.stamp, false);
+ } else {
+ update_perf_time_ctx(&info->time, ctx->time.stamp, false);
+ __store_release(&info->active, 1);
+ }
}
}
@@ -964,8 +1064,7 @@ static void perf_cgroup_switch(struct task_struct *task)
return;
WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
-
- perf_ctx_disable(&cpuctx->ctx, true);
+ perf_ctx_disable(&cpuctx->ctx, EVENT_CGROUP);
ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
/*
@@ -981,7 +1080,7 @@ static void perf_cgroup_switch(struct task_struct *task)
*/
ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
- perf_ctx_enable(&cpuctx->ctx, true);
+ perf_ctx_enable(&cpuctx->ctx, EVENT_CGROUP);
}
static int perf_cgroup_ensure_storage(struct perf_event *event,
@@ -1138,7 +1237,7 @@ static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
}
static inline void
-perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
+perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx, bool guest)
{
}
@@ -1550,29 +1649,24 @@ static void perf_unpin_context(struct perf_event_context *ctx)
*/
static void __update_context_time(struct perf_event_context *ctx, bool adv)
{
- u64 now = perf_clock();
-
lockdep_assert_held(&ctx->lock);
- if (adv)
- ctx->time += now - ctx->timestamp;
- ctx->timestamp = now;
+ update_perf_time_ctx(&ctx->time, perf_clock(), adv);
+}
- /*
- * The above: time' = time + (now - timestamp), can be re-arranged
- * into: time` = now + (time - timestamp), which gives a single value
- * offset to compute future time without locks on.
- *
- * See perf_event_time_now(), which can be used from NMI context where
- * it's (obviously) not possible to acquire ctx->lock in order to read
- * both the above values in a consistent manner.
- */
- WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp);
+static void __update_context_guest_time(struct perf_event_context *ctx, bool adv)
+{
+ lockdep_assert_held(&ctx->lock);
+
+ /* must be called after __update_context_time(); */
+ update_perf_time_ctx(&ctx->timeguest, ctx->time.stamp, adv);
}
static void update_context_time(struct perf_event_context *ctx)
{
__update_context_time(ctx, true);
+ if (is_guest_mediated_pmu_loaded())
+ __update_context_guest_time(ctx, true);
}
static u64 perf_event_time(struct perf_event *event)
@@ -1585,7 +1679,7 @@ static u64 perf_event_time(struct perf_event *event)
if (is_cgroup_event(event))
return perf_cgroup_event_time(event);
- return ctx->time;
+ return __perf_event_time_ctx(event, &ctx->time);
}
static u64 perf_event_time_now(struct perf_event *event, u64 now)
@@ -1599,10 +1693,9 @@ static u64 perf_event_time_now(struct perf_event *event, u64 now)
return perf_cgroup_event_time_now(event, now);
if (!(__load_acquire(&ctx->is_active) & EVENT_TIME))
- return ctx->time;
+ return __perf_event_time_ctx(event, &ctx->time);
- now += READ_ONCE(ctx->timeoffset);
- return now;
+ return __perf_event_time_ctx_now(event, &ctx->time, now);
}
static enum event_type_t get_event_type(struct perf_event *event)
@@ -2422,20 +2515,23 @@ group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx)
}
static inline void
-__ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, bool final)
+__ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx,
+ bool final, enum event_type_t event_type)
{
if (ctx->is_active & EVENT_TIME) {
if (ctx->is_active & EVENT_FROZEN)
return;
+
update_context_time(ctx);
- update_cgrp_time_from_cpuctx(cpuctx, final);
+ /* vPMU should not stop time */
+ update_cgrp_time_from_cpuctx(cpuctx, !(event_type & EVENT_GUEST) && final);
}
}
static inline void
ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
{
- __ctx_time_update(cpuctx, ctx, false);
+ __ctx_time_update(cpuctx, ctx, false, 0);
}
/*
@@ -2861,14 +2957,15 @@ static void task_ctx_sched_out(struct perf_event_context *ctx,
static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx,
- struct pmu *pmu)
+ struct pmu *pmu,
+ enum event_type_t event_type)
{
- ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED);
+ ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED | event_type);
if (ctx)
- ctx_sched_in(ctx, pmu, EVENT_PINNED);
- ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
+ ctx_sched_in(ctx, pmu, EVENT_PINNED | event_type);
+ ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE | event_type);
if (ctx)
- ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE);
+ ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE | event_type);
}
/*
@@ -2902,11 +2999,11 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
event_type &= EVENT_ALL;
- for_each_epc(epc, &cpuctx->ctx, pmu, false)
+ for_each_epc(epc, &cpuctx->ctx, pmu, 0)
perf_pmu_disable(epc->pmu);
if (task_ctx) {
- for_each_epc(epc, task_ctx, pmu, false)
+ for_each_epc(epc, task_ctx, pmu, 0)
perf_pmu_disable(epc->pmu);
task_ctx_sched_out(task_ctx, pmu, event_type);
@@ -2924,13 +3021,13 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
else if (event_type & EVENT_PINNED)
ctx_sched_out(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
- perf_event_sched_in(cpuctx, task_ctx, pmu);
+ perf_event_sched_in(cpuctx, task_ctx, pmu, 0);
- for_each_epc(epc, &cpuctx->ctx, pmu, false)
+ for_each_epc(epc, &cpuctx->ctx, pmu, 0)
perf_pmu_enable(epc->pmu);
if (task_ctx) {
- for_each_epc(epc, task_ctx, pmu, false)
+ for_each_epc(epc, task_ctx, pmu, 0)
perf_pmu_enable(epc->pmu);
}
}
@@ -3479,11 +3576,10 @@ static void
ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ enum event_type_t active_type = event_type & ~EVENT_FLAGS;
struct perf_event_pmu_context *pmu_ctx;
int is_active = ctx->is_active;
- bool cgroup = event_type & EVENT_CGROUP;
- event_type &= ~EVENT_CGROUP;
lockdep_assert_held(&ctx->lock);
@@ -3507,14 +3603,14 @@ ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t
*
* would only update time for the pinned events.
*/
- __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx);
+ __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx, event_type);
/*
* CPU-release for the below ->is_active store,
* see __load_acquire() in perf_event_time_now()
*/
barrier();
- ctx->is_active &= ~event_type;
+ ctx->is_active &= ~active_type;
if (!(ctx->is_active & EVENT_ALL)) {
/*
@@ -3533,9 +3629,20 @@ ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t
cpuctx->task_ctx = NULL;
}
- is_active ^= ctx->is_active; /* changed bits */
+ if (event_type & EVENT_GUEST) {
+ /*
+ * Schedule out all exclude_guest events of PMU
+ * with PERF_PMU_CAP_MEDIATED_VPMU.
+ */
+ is_active = EVENT_ALL;
+ __update_context_guest_time(ctx, false);
+ perf_cgroup_set_timestamp(cpuctx, true);
+ barrier();
+ } else {
+ is_active ^= ctx->is_active; /* changed bits */
+ }
- for_each_epc(pmu_ctx, ctx, pmu, cgroup)
+ for_each_epc(pmu_ctx, ctx, pmu, event_type)
__pmu_ctx_sched_out(pmu_ctx, is_active);
}
@@ -3691,7 +3798,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
if (context_equiv(ctx, next_ctx)) {
- perf_ctx_disable(ctx, false);
+ perf_ctx_disable(ctx, 0);
/* PMIs are disabled; ctx->nr_no_switch_fast is stable. */
if (local_read(&ctx->nr_no_switch_fast) ||
@@ -3715,7 +3822,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
perf_ctx_sched_task_cb(ctx, task, false);
- perf_ctx_enable(ctx, false);
+ perf_ctx_enable(ctx, 0);
/*
* RCU_INIT_POINTER here is safe because we've not
@@ -3739,13 +3846,13 @@ unlock:
if (do_switch) {
raw_spin_lock(&ctx->lock);
- perf_ctx_disable(ctx, false);
+ perf_ctx_disable(ctx, 0);
inside_switch:
perf_ctx_sched_task_cb(ctx, task, false);
task_ctx_sched_out(ctx, NULL, EVENT_ALL);
- perf_ctx_enable(ctx, false);
+ perf_ctx_enable(ctx, 0);
raw_spin_unlock(&ctx->lock);
}
}
@@ -3992,10 +4099,15 @@ static inline void group_update_userpage(struct perf_event *group_event)
event_update_userpage(event);
}
+struct merge_sched_data {
+ int can_add_hw;
+ enum event_type_t event_type;
+};
+
static int merge_sched_in(struct perf_event *event, void *data)
{
struct perf_event_context *ctx = event->ctx;
- int *can_add_hw = data;
+ struct merge_sched_data *msd = data;
if (event->state <= PERF_EVENT_STATE_OFF)
return 0;
@@ -4003,13 +4115,22 @@ static int merge_sched_in(struct perf_event *event, void *data)
if (!event_filter_match(event))
return 0;
- if (group_can_go_on(event, *can_add_hw)) {
+ /*
+ * Don't schedule in any host events from PMU with
+ * PERF_PMU_CAP_MEDIATED_VPMU, while a guest is running.
+ */
+ if (is_guest_mediated_pmu_loaded() &&
+ event->pmu_ctx->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU &&
+ !(msd->event_type & EVENT_GUEST))
+ return 0;
+
+ if (group_can_go_on(event, msd->can_add_hw)) {
if (!group_sched_in(event, ctx))
list_add_tail(&event->active_list, get_event_list(event));
}
if (event->state == PERF_EVENT_STATE_INACTIVE) {
- *can_add_hw = 0;
+ msd->can_add_hw = 0;
if (event->attr.pinned) {
perf_cgroup_event_disable(event, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
@@ -4017,7 +4138,8 @@ static int merge_sched_in(struct perf_event *event, void *data)
if (*perf_event_fasync(event))
event->pending_kill = POLL_ERR;
- perf_event_wakeup(event);
+ event->pending_wakeup = 1;
+ irq_work_queue(&event->pending_irq);
} else {
struct perf_cpu_pmu_context *cpc = this_cpc(event->pmu_ctx->pmu);
@@ -4032,11 +4154,15 @@ static int merge_sched_in(struct perf_event *event, void *data)
static void pmu_groups_sched_in(struct perf_event_context *ctx,
struct perf_event_groups *groups,
- struct pmu *pmu)
+ struct pmu *pmu,
+ enum event_type_t event_type)
{
- int can_add_hw = 1;
+ struct merge_sched_data msd = {
+ .can_add_hw = 1,
+ .event_type = event_type,
+ };
visit_groups_merge(ctx, groups, smp_processor_id(), pmu,
- merge_sched_in, &can_add_hw);
+ merge_sched_in, &msd);
}
static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx,
@@ -4045,20 +4171,18 @@ static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx,
struct perf_event_context *ctx = pmu_ctx->ctx;
if (event_type & EVENT_PINNED)
- pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu);
+ pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu, event_type);
if (event_type & EVENT_FLEXIBLE)
- pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu);
+ pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu, event_type);
}
static void
ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ enum event_type_t active_type = event_type & ~EVENT_FLAGS;
struct perf_event_pmu_context *pmu_ctx;
int is_active = ctx->is_active;
- bool cgroup = event_type & EVENT_CGROUP;
-
- event_type &= ~EVENT_CGROUP;
lockdep_assert_held(&ctx->lock);
@@ -4066,9 +4190,11 @@ ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t
return;
if (!(is_active & EVENT_TIME)) {
+ /* EVENT_TIME should be active while the guest runs */
+ WARN_ON_ONCE(event_type & EVENT_GUEST);
/* start ctx time */
__update_context_time(ctx, false);
- perf_cgroup_set_timestamp(cpuctx);
+ perf_cgroup_set_timestamp(cpuctx, false);
/*
* CPU-release for the below ->is_active store,
* see __load_acquire() in perf_event_time_now()
@@ -4076,7 +4202,7 @@ ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t
barrier();
}
- ctx->is_active |= (event_type | EVENT_TIME);
+ ctx->is_active |= active_type | EVENT_TIME;
if (ctx->task) {
if (!(is_active & EVENT_ALL))
cpuctx->task_ctx = ctx;
@@ -4084,21 +4210,37 @@ ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t
WARN_ON_ONCE(cpuctx->task_ctx != ctx);
}
- is_active ^= ctx->is_active; /* changed bits */
+ if (event_type & EVENT_GUEST) {
+ /*
+ * Schedule in the required exclude_guest events of PMU
+ * with PERF_PMU_CAP_MEDIATED_VPMU.
+ */
+ is_active = event_type & EVENT_ALL;
+
+ /*
+ * Update ctx time to set the new start time for
+ * the exclude_guest events.
+ */
+ update_context_time(ctx);
+ update_cgrp_time_from_cpuctx(cpuctx, false);
+ barrier();
+ } else {
+ is_active ^= ctx->is_active; /* changed bits */
+ }
/*
* First go through the list and put on any pinned groups
* in order to give them the best chance of going on.
*/
if (is_active & EVENT_PINNED) {
- for_each_epc(pmu_ctx, ctx, pmu, cgroup)
- __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED);
+ for_each_epc(pmu_ctx, ctx, pmu, event_type)
+ __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED | (event_type & EVENT_GUEST));
}
/* Then walk through the lower prio flexible groups */
if (is_active & EVENT_FLEXIBLE) {
- for_each_epc(pmu_ctx, ctx, pmu, cgroup)
- __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE);
+ for_each_epc(pmu_ctx, ctx, pmu, event_type)
+ __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE | (event_type & EVENT_GUEST));
}
}
@@ -4114,11 +4256,11 @@ static void perf_event_context_sched_in(struct task_struct *task)
if (cpuctx->task_ctx == ctx) {
perf_ctx_lock(cpuctx, ctx);
- perf_ctx_disable(ctx, false);
+ perf_ctx_disable(ctx, 0);
perf_ctx_sched_task_cb(ctx, task, true);
- perf_ctx_enable(ctx, false);
+ perf_ctx_enable(ctx, 0);
perf_ctx_unlock(cpuctx, ctx);
goto rcu_unlock;
}
@@ -4131,7 +4273,7 @@ static void perf_event_context_sched_in(struct task_struct *task)
if (!ctx->nr_events)
goto unlock;
- perf_ctx_disable(ctx, false);
+ perf_ctx_disable(ctx, 0);
/*
* We want to keep the following priority order:
* cpu pinned (that don't need to move), task pinned,
@@ -4141,18 +4283,18 @@ static void perf_event_context_sched_in(struct task_struct *task)
* events, no need to flip the cpuctx's events around.
*/
if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
- perf_ctx_disable(&cpuctx->ctx, false);
+ perf_ctx_disable(&cpuctx->ctx, 0);
ctx_sched_out(&cpuctx->ctx, NULL, EVENT_FLEXIBLE);
}
- perf_event_sched_in(cpuctx, ctx, NULL);
+ perf_event_sched_in(cpuctx, ctx, NULL, 0);
perf_ctx_sched_task_cb(cpuctx->task_ctx, task, true);
if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
- perf_ctx_enable(&cpuctx->ctx, false);
+ perf_ctx_enable(&cpuctx->ctx, 0);
- perf_ctx_enable(ctx, false);
+ perf_ctx_enable(ctx, 0);
unlock:
perf_ctx_unlock(cpuctx, ctx);
@@ -4671,7 +4813,7 @@ static void __perf_event_read(void *info)
struct perf_event *sub, *event = data->event;
struct perf_event_context *ctx = event->ctx;
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
- struct pmu *pmu = event->pmu;
+ struct pmu *pmu;
/*
* If this is a task context, we need to check whether it is
@@ -4683,7 +4825,7 @@ static void __perf_event_read(void *info)
if (ctx->task && cpuctx->task_ctx != ctx)
return;
- raw_spin_lock(&ctx->lock);
+ guard(raw_spinlock)(&ctx->lock);
ctx_time_update_event(ctx, event);
perf_event_update_time(event);
@@ -4691,25 +4833,22 @@ static void __perf_event_read(void *info)
perf_event_update_sibling_time(event);
if (event->state != PERF_EVENT_STATE_ACTIVE)
- goto unlock;
+ return;
if (!data->group) {
- pmu->read(event);
+ perf_pmu_read(event);
data->ret = 0;
- goto unlock;
+ return;
}
+ pmu = event->pmu_ctx->pmu;
pmu->start_txn(pmu, PERF_PMU_TXN_READ);
- pmu->read(event);
-
+ perf_pmu_read(event);
for_each_sibling_event(sub, event)
perf_pmu_read(sub);
data->ret = pmu->commit_txn(pmu);
-
-unlock:
- raw_spin_unlock(&ctx->lock);
}
static inline u64 perf_event_count(struct perf_event *event, bool self)
@@ -4917,7 +5056,7 @@ alloc_perf_context(struct task_struct *task)
{
struct perf_event_context *ctx;
- ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
+ ctx = kzalloc_obj(struct perf_event_context);
if (!ctx)
return NULL;
@@ -5057,7 +5196,7 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx,
return epc;
}
- new = kzalloc(sizeof(*epc), GFP_KERNEL);
+ new = kzalloc_obj(*epc);
if (!new)
return ERR_PTR(-ENOMEM);
@@ -5229,15 +5368,15 @@ static void unaccount_freq_event(void)
static struct perf_ctx_data *
-alloc_perf_ctx_data(struct kmem_cache *ctx_cache, bool global)
+alloc_perf_ctx_data(struct kmem_cache *ctx_cache, bool global, gfp_t gfp_flags)
{
struct perf_ctx_data *cd;
- cd = kzalloc(sizeof(*cd), GFP_KERNEL);
+ cd = kzalloc_obj(*cd, gfp_flags);
if (!cd)
return NULL;
- cd->data = kmem_cache_zalloc(ctx_cache, GFP_KERNEL);
+ cd->data = kmem_cache_zalloc(ctx_cache, gfp_flags);
if (!cd->data) {
kfree(cd);
return NULL;
@@ -5271,18 +5410,29 @@ static inline void perf_free_ctx_data_rcu(struct perf_ctx_data *cd)
static int
attach_task_ctx_data(struct task_struct *task, struct kmem_cache *ctx_cache,
- bool global)
+ bool global, gfp_t gfp_flags)
{
struct perf_ctx_data *cd, *old = NULL;
- cd = alloc_perf_ctx_data(ctx_cache, global);
+ cd = alloc_perf_ctx_data(ctx_cache, global, gfp_flags);
if (!cd)
return -ENOMEM;
for (;;) {
- if (try_cmpxchg((struct perf_ctx_data **)&task->perf_ctx_data, &old, cd)) {
+ if (try_cmpxchg(&task->perf_ctx_data, &old, cd)) {
if (old)
perf_free_ctx_data_rcu(old);
+ /*
+ * Above try_cmpxchg() pairs with try_cmpxchg() from
+ * detach_task_ctx_data() such that
+ * if we race with perf_event_exit_task(), we must
+ * observe PF_EXITING.
+ */
+ if (task->flags & PF_EXITING) {
+ /* detach_task_ctx_data() may free it already */
+ if (try_cmpxchg(&task->perf_ctx_data, &cd, NULL))
+ perf_free_ctx_data_rcu(cd);
+ }
return 0;
}
@@ -5328,6 +5478,8 @@ again:
/* Allocate everything */
scoped_guard (rcu) {
for_each_process_thread(g, p) {
+ if (p->flags & PF_EXITING)
+ continue;
cd = rcu_dereference(p->perf_ctx_data);
if (cd && !cd->global) {
cd->global = 1;
@@ -5335,6 +5487,12 @@ again:
cd = NULL;
}
if (!cd) {
+ /*
+ * Try to allocate context quickly before
+ * traversing the whole thread list again.
+ */
+ if (!attach_task_ctx_data(p, ctx_cache, true, GFP_NOWAIT))
+ continue;
get_task_struct(p);
goto alloc;
}
@@ -5345,7 +5503,7 @@ again:
return 0;
alloc:
- ret = attach_task_ctx_data(p, ctx_cache, true);
+ ret = attach_task_ctx_data(p, ctx_cache, true, GFP_KERNEL);
put_task_struct(p);
if (ret) {
__detach_global_ctx_data();
@@ -5365,7 +5523,7 @@ attach_perf_ctx_data(struct perf_event *event)
return -ENOMEM;
if (task)
- return attach_task_ctx_data(task, ctx_cache, false);
+ return attach_task_ctx_data(task, ctx_cache, false, GFP_KERNEL);
ret = attach_global_ctx_data(ctx_cache);
if (ret)
@@ -5400,22 +5558,15 @@ static void __detach_global_ctx_data(void)
struct task_struct *g, *p;
struct perf_ctx_data *cd;
-again:
scoped_guard (rcu) {
for_each_process_thread(g, p) {
cd = rcu_dereference(p->perf_ctx_data);
- if (!cd || !cd->global)
- continue;
- cd->global = 0;
- get_task_struct(p);
- goto detach;
+ if (cd && cd->global) {
+ cd->global = 0;
+ detach_task_ctx_data(p);
+ }
}
}
- return;
-detach:
- detach_task_ctx_data(p);
- put_task_struct(p);
- goto again;
}
static void detach_global_ctx_data(void)
@@ -5594,6 +5745,8 @@ static void __free_event(struct perf_event *event)
{
struct pmu *pmu = event->pmu;
+ security_perf_event_free(event);
+
if (event->attach_state & PERF_ATTACH_CALLCHAIN)
put_callchain_buffers();
@@ -5647,6 +5800,8 @@ static void __free_event(struct perf_event *event)
call_rcu(&event->rcu_head, free_event_rcu);
}
+static void mediated_pmu_unaccount_event(struct perf_event *event);
+
DEFINE_FREE(__free_event, struct perf_event *, if (_T) __free_event(_T))
/* vs perf_event_alloc() success */
@@ -5656,8 +5811,7 @@ static void _free_event(struct perf_event *event)
irq_work_sync(&event->pending_disable_irq);
unaccount_event(event);
-
- security_perf_event_free(event);
+ mediated_pmu_unaccount_event(event);
if (event->rb) {
/*
@@ -6180,6 +6334,138 @@ u64 perf_event_pause(struct perf_event *event, bool reset)
}
EXPORT_SYMBOL_GPL(perf_event_pause);
+#ifdef CONFIG_PERF_GUEST_MEDIATED_PMU
+static atomic_t nr_include_guest_events __read_mostly;
+
+static atomic_t nr_mediated_pmu_vms __read_mostly;
+static DEFINE_MUTEX(perf_mediated_pmu_mutex);
+
+/* !exclude_guest event of PMU with PERF_PMU_CAP_MEDIATED_VPMU */
+static inline bool is_include_guest_event(struct perf_event *event)
+{
+ if ((event->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU) &&
+ !event->attr.exclude_guest)
+ return true;
+
+ return false;
+}
+
+static int mediated_pmu_account_event(struct perf_event *event)
+{
+ if (!is_include_guest_event(event))
+ return 0;
+
+ if (atomic_inc_not_zero(&nr_include_guest_events))
+ return 0;
+
+ guard(mutex)(&perf_mediated_pmu_mutex);
+ if (atomic_read(&nr_mediated_pmu_vms))
+ return -EOPNOTSUPP;
+
+ atomic_inc(&nr_include_guest_events);
+ return 0;
+}
+
+static void mediated_pmu_unaccount_event(struct perf_event *event)
+{
+ if (!is_include_guest_event(event))
+ return;
+
+ if (WARN_ON_ONCE(!atomic_read(&nr_include_guest_events)))
+ return;
+
+ atomic_dec(&nr_include_guest_events);
+}
+
+/*
+ * Currently invoked at VM creation to
+ * - Check whether there are existing !exclude_guest events of PMU with
+ * PERF_PMU_CAP_MEDIATED_VPMU
+ * - Set nr_mediated_pmu_vms to prevent !exclude_guest event creation on
+ * PMUs with PERF_PMU_CAP_MEDIATED_VPMU
+ *
+ * No impact for the PMU without PERF_PMU_CAP_MEDIATED_VPMU. The perf
+ * still owns all the PMU resources.
+ */
+int perf_create_mediated_pmu(void)
+{
+ if (atomic_inc_not_zero(&nr_mediated_pmu_vms))
+ return 0;
+
+ guard(mutex)(&perf_mediated_pmu_mutex);
+ if (atomic_read(&nr_include_guest_events))
+ return -EBUSY;
+
+ atomic_inc(&nr_mediated_pmu_vms);
+ return 0;
+}
+EXPORT_SYMBOL_FOR_KVM(perf_create_mediated_pmu);
+
+void perf_release_mediated_pmu(void)
+{
+ if (WARN_ON_ONCE(!atomic_read(&nr_mediated_pmu_vms)))
+ return;
+
+ atomic_dec(&nr_mediated_pmu_vms);
+}
+EXPORT_SYMBOL_FOR_KVM(perf_release_mediated_pmu);
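
A hedged sketch of how a hypervisor module might pair these two calls around VM creation and destruction; everything other than the perf_{create,release}_mediated_pmu() calls is a hypothetical placeholder, not part of this patch.

	static int example_vm_create(void)
	{
		int ret;

		/*
		 * Fails with -EBUSY if a !exclude_guest event already exists on a
		 * PMU with PERF_PMU_CAP_MEDIATED_VPMU; on success, creating such
		 * events is refused until the VM releases its reference.
		 */
		ret = perf_create_mediated_pmu();
		if (ret)
			return ret;

		/* ... set up the mediated vPMU for the VM ... */
		return 0;
	}

	static void example_vm_destroy(void)
	{
		/* Drops the nr_mediated_pmu_vms reference taken at creation. */
		perf_release_mediated_pmu();
	}
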
+
+/* When loading a guest's mediated PMU, schedule out all exclude_guest events. */
+void perf_load_guest_context(void)
+{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+
+ lockdep_assert_irqs_disabled();
+
+ guard(perf_ctx_lock)(cpuctx, cpuctx->task_ctx);
+
+ if (WARN_ON_ONCE(__this_cpu_read(guest_ctx_loaded)))
+ return;
+
+ perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
+ ctx_sched_out(&cpuctx->ctx, NULL, EVENT_GUEST);
+ if (cpuctx->task_ctx) {
+ perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
+ task_ctx_sched_out(cpuctx->task_ctx, NULL, EVENT_GUEST);
+ }
+
+ perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
+ if (cpuctx->task_ctx)
+ perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
+
+ __this_cpu_write(guest_ctx_loaded, true);
+}
+EXPORT_SYMBOL_GPL(perf_load_guest_context);
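
And a hedged sketch of the per-entry pairing with perf_put_guest_context() below, assuming the caller sits on the vCPU run path with interrupts disabled, as lockdep_assert_irqs_disabled() requires; the surrounding function is hypothetical.

	static void example_vcpu_enter_exit(void)
	{
		local_irq_disable();

		/* Host exclude_guest events stop counting; guest time starts. */
		perf_load_guest_context();

		/* ... load guest PMU state and enter the guest ... */

		/* Guest time stops; host exclude_guest events are rescheduled. */
		perf_put_guest_context();

		local_irq_enable();
	}
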
+
+void perf_put_guest_context(void)
+{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+
+ lo