Re: [PATCH v2 2/2] perf/core: Remove perf_cpu_context::unique_pmu

From: Peter Zijlstra
Date: Wed Jan 25 2017 - 10:24:16 EST


On Fri, Jan 20, 2017 at 12:30:38PM -0800, David Carrillo-Cisneros wrote:
> On Fri, Jan 20, 2017 at 1:20 AM, Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
> > On Wed, Jan 18, 2017 at 11:24:54AM -0800, David Carrillo-Cisneros wrote:
> >> cpuctx->unique_pmu was originally introduced as a way to identify cpuctxs
> >> with shared pmus in order to avoid visiting the same cpuctx more than once
> >> in a for_each_pmu loop.
> >>
> >> cpuctx->unique_pmu == cpuctx->pmu in non-software task contexts since they
> >> have only one pmu per cpuctx. Since perf_pmu_sched_task is only called in
> >> hw contexts, this patch replaces cpuctx->unique_pmu by cpuctx->pmu in it.
> >>
> >> The change above, together with the previous patch in this series, removed
> >> the remaining uses of cpuctx->unique_pmu, so we remove it altogether.
> >>
> >> Signed-off-by: David Carrillo-Cisneros <davidcc@xxxxxxxxxx>
> >> Acked-by: Mark Rutland <mark.rutland@xxxxxxx>
> >

> >
> > This very much relies on us never calling perf_pmu_unregister() on the
> > software PMUs afaict. A condition not mentioned in the Changelog.
> >
> What's a good way to solve this? Update the Changelog or add code to
> update ctx->pmu?

I think just update the Changelog and maybe put a comment near
perf_pmu_register() and/or the sw pmu abuse that relies on this.
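
For concreteness, such a note could look something like this (purely
illustrative, not part of any patch here); it just records the invariant
the shared-context trick depends on:

/*
 * NOTE: the software PMUs (perf_swevent, perf_cpu_clock, perf_task_clock
 * and, under CONFIG_EVENT_TRACING, perf_tracepoint) all use
 * perf_sw_context and therefore share one perf_cpu_context via
 * find_pmu_context().  None of them is ever perf_pmu_unregister()'ed,
 * so the pmu recorded in that shared cpuctx stays valid for the
 * lifetime of the kernel.  Code using cpuctx->pmu for the software
 * context relies on this.
 */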

> This issue would go away cleanly if we were to remove the context
> sharing across pmu's. Would you support work in that direction?

It's something that I've considered; the trivial solution is folding it
all into the one swevent pmu by adding a switch in all the
add/del/start/stop/read methods. It's a wee bit ugly but straightforward.

I've not really found anything less ugly though, and I have to fully
admit the current situation is rather vile.
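
As a rough sketch of that direction (hypothetical, not what the patch
below does): the existing per-type callbacks such as perf_trace_add(),
cpu_clock_event_add(), task_clock_event_add() and perf_swevent_add()
would sit behind a single struct pmu whose methods dispatch on the
event, roughly:

/*
 * Hypothetical sketch only: one combined software pmu.  The callees are
 * the existing helpers visible in the patch below; the dispatcher and
 * the perf_sw name are invented for illustration.
 */
static int perf_sw_add(struct perf_event *event, int flags)
{
	switch (event->attr.type) {
	case PERF_TYPE_TRACEPOINT:
		return perf_trace_add(event, flags);
	case PERF_TYPE_SOFTWARE:
		switch (event->attr.config) {
		case PERF_COUNT_SW_CPU_CLOCK:
			return cpu_clock_event_add(event, flags);
		case PERF_COUNT_SW_TASK_CLOCK:
			return task_clock_event_add(event, flags);
		default:
			return perf_swevent_add(event, flags);
		}
	default:
		return -ENOENT;
	}
}

/* del/start/stop/read/event_init would dispatch the same way. */
static struct pmu perf_sw = {
	.task_ctx_nr	= perf_sw_context,
	.capabilities	= PERF_PMU_CAP_NO_NMI,
	.add		= perf_sw_add,
};

That keeps a single pmu (and hence a single cpu context) for everything
in perf_sw_context, at the cost of an extra indirection in every method.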


I also just found the below patch that I've had bitrotting since 2015.

---
Subject: perf: Move all software PMUs into their own file
From: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Date: Fri Apr 17 19:52:17 CEST 2015


Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
---
 kernel/events/Makefile   |    2 
 kernel/events/core.c     | 1280 +++++------------------------------------------
 kernel/events/internal.h |   13 
 kernel/events/software.c | 1021 +++++++++++++++++++++++++++++++++++++
 4 files changed, 1184 insertions(+), 1132 deletions(-)

--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -2,7 +2,7 @@ ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE)
endif

-obj-y := core.o ring_buffer.o callchain.o
+obj-y := core.o software.o ring_buffer.o callchain.o

obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
obj-$(CONFIG_UPROBES) += uprobes.o
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -36,14 +36,11 @@
#include <linux/kernel_stat.h>
#include <linux/cgroup.h>
#include <linux/perf_event.h>
-#include <linux/ftrace_event.h>
#include <linux/hw_breakpoint.h>
#include <linux/mm_types.h>
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/compat.h>
-#include <linux/bpf.h>
-#include <linux/filter.h>

#include "internal.h"

@@ -1828,8 +1825,6 @@ static void perf_set_shadow_time(struct
event->shadow_ctx_time = tstamp - ctx->timestamp;
}

-#define MAX_INTERRUPTS (~0ULL)
-
static void perf_log_throttle(struct perf_event *event, int enable);
static void perf_log_itrace_start(struct perf_event *event);

@@ -3411,9 +3406,6 @@ find_get_context(struct pmu *pmu, struct
return ERR_PTR(err);
}

-static void perf_event_free_filter(struct perf_event *event);
-static void perf_event_free_bpf_prog(struct perf_event *event);
-
static void free_event_rcu(struct rcu_head *head)
{
struct perf_event *event;
@@ -4020,8 +4012,6 @@ static inline int perf_fget_light(int fd

static int perf_event_set_output(struct perf_event *event,
struct perf_event *output_event);
-static int perf_event_set_filter(struct perf_event *event, void __user *arg);
-static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);

static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
{
@@ -6036,9 +6026,9 @@ static void perf_log_itrace_start(struct
* Generic event overflow handling, sampling.
*/

-static int __perf_event_overflow(struct perf_event *event,
- int throttle, struct perf_sample_data *data,
- struct pt_regs *regs)
+int __perf_event_overflow(struct perf_event *event,
+ int throttle, struct perf_sample_data *data,
+ struct pt_regs *regs)
{
int events = atomic_read(&event->event_limit);
struct hw_perf_event *hwc = &event->hw;
@@ -6111,1155 +6101,223 @@ int perf_event_overflow(struct perf_even
return __perf_event_overflow(event, 1, data, regs);
}

-/*
- * Generic software event infrastructure
- */
-
-struct swevent_htable {
- struct swevent_hlist *swevent_hlist;
- struct mutex hlist_mutex;
- int hlist_refcount;
-
- /* Recursion avoidance in each contexts */
- int recursion[PERF_NR_CONTEXTS];
-
- /* Keeps track of cpu being initialized/exited */
- bool online;
-};
-
-static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
-
-/*
- * We directly increment event->count and keep a second value in
- * event->hw.period_left to count intervals. This period event
- * is kept in the range [-sample_period, 0] so that we can use the
- * sign as trigger.
- */
-
-u64 perf_swevent_set_period(struct perf_event *event)
+static void perf_pmu_nop_void(struct pmu *pmu)
{
- struct hw_perf_event *hwc = &event->hw;
- u64 period = hwc->last_period;
- u64 nr, offset;
- s64 old, val;
-
- hwc->last_period = hwc->sample_period;
-
-again:
- old = val = local64_read(&hwc->period_left);
- if (val < 0)
- return 0;
-
- nr = div64_u64(period + val, period);
- offset = nr * period;
- val -= offset;
- if (local64_cmpxchg(&hwc->period_left, old, val) != old)
- goto again;
-
- return nr;
}

-static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
- struct perf_sample_data *data,
- struct pt_regs *regs)
+static int perf_pmu_nop_int(struct pmu *pmu)
{
- struct hw_perf_event *hwc = &event->hw;
- int throttle = 0;
-
- if (!overflow)
- overflow = perf_swevent_set_period(event);
-
- if (hwc->interrupts == MAX_INTERRUPTS)
- return;
-
- for (; overflow; overflow--) {
- if (__perf_event_overflow(event, throttle,
- data, regs)) {
- /*
- * We inhibit the overflow from happening when
- * hwc->interrupts == MAX_INTERRUPTS.
- */
- break;
- }
- throttle = 1;
- }
+ return 0;
}

-static void perf_swevent_event(struct perf_event *event, u64 nr,
- struct perf_sample_data *data,
- struct pt_regs *regs)
+static void perf_pmu_start_txn(struct pmu *pmu)
{
- struct hw_perf_event *hwc = &event->hw;
-
- local64_add(nr, &event->count);
-
- if (!regs)
- return;
-
- if (!is_sampling_event(event))
- return;
-
- if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
- data->period = nr;
- return perf_swevent_overflow(event, 1, data, regs);
- } else
- data->period = event->hw.last_period;
-
- if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
- return perf_swevent_overflow(event, 1, data, regs);
-
- if (local64_add_negative(nr, &hwc->period_left))
- return;
-
- perf_swevent_overflow(event, 0, data, regs);
+ perf_pmu_disable(pmu);
}

-static int perf_exclude_event(struct perf_event *event,
- struct pt_regs *regs)
+static int perf_pmu_commit_txn(struct pmu *pmu)
{
- if (event->hw.state & PERF_HES_STOPPED)
- return 1;
-
- if (regs) {
- if (event->attr.exclude_user && user_mode(regs))
- return 1;
-
- if (event->attr.exclude_kernel && !user_mode(regs))
- return 1;
- }
-
+ perf_pmu_enable(pmu);
return 0;
}

-static int perf_swevent_match(struct perf_event *event,
- enum perf_type_id type,
- u32 event_id,
- struct perf_sample_data *data,
- struct pt_regs *regs)
-{
- if (event->attr.type != type)
- return 0;
-
- if (event->attr.config != event_id)
- return 0;
-
- if (perf_exclude_event(event, regs))
- return 0;
-
- return 1;
-}
-
-static inline u64 swevent_hash(u64 type, u32 event_id)
+static void perf_pmu_cancel_txn(struct pmu *pmu)
{
- u64 val = event_id | (type << 32);
-
- return hash_64(val, SWEVENT_HLIST_BITS);
+ perf_pmu_enable(pmu);
}

-static inline struct hlist_head *
-__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
+static int perf_event_idx_default(struct perf_event *event)
{
- u64 hash = swevent_hash(type, event_id);
-
- return &hlist->heads[hash];
+ return 0;
}

-/* For the read side: events when they trigger */
-static inline struct hlist_head *
-find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
+/*
+ * Ensures all contexts with the same task_ctx_nr have the same
+ * pmu_cpu_context too.
+ */
+static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
{
- struct swevent_hlist *hlist;
+ struct pmu *pmu;

- hlist = rcu_dereference(swhash->swevent_hlist);
- if (!hlist)
+ if (ctxn < 0)
return NULL;

- return __find_swevent_head(hlist, type, event_id);
+ list_for_each_entry(pmu, &pmus, entry) {
+ if (pmu->task_ctx_nr == ctxn)
+ return pmu->pmu_cpu_context;
+ }
+
+ return NULL;
}

-/* For the event head insertion and removal in the hlist */
-static inline struct hlist_head *
-find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
+static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
{
- struct swevent_hlist *hlist;
- u32 event_id = event->attr.config;
- u64 type = event->attr.type;
-
- /*
- * Event scheduling is always serialized against hlist allocation
- * and release. Which makes the protected version suitable here.
- * The context lock guarantees that.
- */
- hlist = rcu_dereference_protected(swhash->swevent_hlist,
- lockdep_is_held(&event->ctx->lock));
- if (!hlist)
- return NULL;
+ int cpu;

- return __find_swevent_head(hlist, type, event_id);
-}
+ for_each_possible_cpu(cpu) {
+ struct perf_cpu_context *cpuctx;

-static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
- u64 nr,
- struct perf_sample_data *data,
- struct pt_regs *regs)
-{
- struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
- struct perf_event *event;
- struct hlist_head *head;
+ cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);

- rcu_read_lock();
- head = find_swevent_head_rcu(swhash, type, event_id);
- if (!head)
- goto end;
-
- hlist_for_each_entry_rcu(event, head, hlist_entry) {
- if (perf_swevent_match(event, type, event_id, data, regs))
- perf_swevent_event(event, nr, data, regs);
+ if (cpuctx->unique_pmu == old_pmu)
+ cpuctx->unique_pmu = pmu;
}
-end:
- rcu_read_unlock();
}

-DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
-
-int perf_swevent_get_recursion_context(void)
+static void free_pmu_context(struct pmu *pmu)
{
- struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
-
- return get_recursion_context(swhash->recursion);
-}
-EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
+ struct pmu *i;

-inline void perf_swevent_put_recursion_context(int rctx)
-{
- struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
+ mutex_lock(&pmus_lock);
+ /*
+ * Like a real lame refcount.
+ */
+ list_for_each_entry(i, &pmus, entry) {
+ if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
+ update_pmu_context(i, pmu);
+ goto out;
+ }
+ }

- put_recursion_context(swhash->recursion, rctx);
+ free_percpu(pmu->pmu_cpu_context);
+out:
+ mutex_unlock(&pmus_lock);
}
+static struct idr pmu_idr;

-void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
+static ssize_t
+type_show(struct device *dev, struct device_attribute *attr, char *page)
{
- struct perf_sample_data data;
-
- if (WARN_ON_ONCE(!regs))
- return;
+ struct pmu *pmu = dev_get_drvdata(dev);

- perf_sample_data_init(&data, addr, 0);
- do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
+ return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
}
+static DEVICE_ATTR_RO(type);

-void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
+static ssize_t
+perf_event_mux_interval_ms_show(struct device *dev,
+ struct device_attribute *attr,
+ char *page)
{
- int rctx;
-
- preempt_disable_notrace();
- rctx = perf_swevent_get_recursion_context();
- if (unlikely(rctx < 0))
- goto fail;
-
- ___perf_sw_event(event_id, nr, regs, addr);
+ struct pmu *pmu = dev_get_drvdata(dev);

- perf_swevent_put_recursion_context(rctx);
-fail:
- preempt_enable_notrace();
+ return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
}

-static void perf_swevent_read(struct perf_event *event)
+static ssize_t
+perf_event_mux_interval_ms_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
{
-}
+ struct pmu *pmu = dev_get_drvdata(dev);
+ int timer, cpu, ret;

-static int perf_swevent_add(struct perf_event *event, int flags)
-{
- struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
- struct hw_perf_event *hwc = &event->hw;
- struct hlist_head *head;
+ ret = kstrtoint(buf, 0, &timer);
+ if (ret)
+ return ret;

- if (is_sampling_event(event)) {
- hwc->last_period = hwc->sample_period;
- perf_swevent_set_period(event);
- }
+ if (timer < 1)
+ return -EINVAL;

- hwc->state = !(flags & PERF_EF_START);
+ /* same value, noting to do */
+ if (timer == pmu->hrtimer_interval_ms)
+ return count;

- head = find_swevent_head(swhash, event);
- if (!head) {
- /*
- * We can race with cpu hotplug code. Do not
- * WARN if the cpu just got unplugged.
- */
- WARN_ON_ONCE(swhash->online);
- return -EINVAL;
- }
+ pmu->hrtimer_interval_ms = timer;

- hlist_add_head_rcu(&event->hlist_entry, head);
- perf_event_update_userpage(event);
+ /* update all cpuctx for this PMU */
+ for_each_possible_cpu(cpu) {
+ struct perf_cpu_context *cpuctx;
+ cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+ cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);

- return 0;
-}
+ if (hrtimer_active(&cpuctx->hrtimer))
+ hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval);
+ }

-static void perf_swevent_del(struct perf_event *event, int flags)
-{
- hlist_del_rcu(&event->hlist_entry);
+ return count;
}
+static DEVICE_ATTR_RW(perf_event_mux_interval_ms);

-static void perf_swevent_start(struct perf_event *event, int flags)
-{
- event->hw.state = 0;
-}
+static struct attribute *pmu_dev_attrs[] = {
+ &dev_attr_type.attr,
+ &dev_attr_perf_event_mux_interval_ms.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(pmu_dev);

-static void perf_swevent_stop(struct perf_event *event, int flags)
-{
- event->hw.state = PERF_HES_STOPPED;
-}
+static int pmu_bus_running;
+static struct bus_type pmu_bus = {
+ .name = "event_source",
+ .dev_groups = pmu_dev_groups,
+};

-/* Deref the hlist from the update side */
-static inline struct swevent_hlist *
-swevent_hlist_deref(struct swevent_htable *swhash)
+static void pmu_dev_release(struct device *dev)
{
- return rcu_dereference_protected(swhash->swevent_hlist,
- lockdep_is_held(&swhash->hlist_mutex));
+ kfree(dev);
}

-static void swevent_hlist_release(struct swevent_htable *swhash)
+static int pmu_dev_alloc(struct pmu *pmu)
{
- struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
-
- if (!hlist)
- return;
+ int ret = -ENOMEM;

- RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
- kfree_rcu(hlist, rcu_head);
-}
+ pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
+ if (!pmu->dev)
+ goto out;

-static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
-{
- struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
+ pmu->dev->groups = pmu->attr_groups;
+ device_initialize(pmu->dev);
+ ret = dev_set_name(pmu->dev, "%s", pmu->name);
+ if (ret)
+ goto free_dev;

- mutex_lock(&swhash->hlist_mutex);
+ dev_set_drvdata(pmu->dev, pmu);
+ pmu->dev->bus = &pmu_bus;
+ pmu->dev->release = pmu_dev_release;
+ ret = device_add(pmu->dev);
+ if (ret)
+ goto free_dev;

- if (!--swhash->hlist_refcount)
- swevent_hlist_release(swhash);
+out:
+ return ret;

- mutex_unlock(&swhash->hlist_mutex);
+free_dev:
+ put_device(pmu->dev);
+ goto out;
}

-static void swevent_hlist_put(struct perf_event *event)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- swevent_hlist_put_cpu(event, cpu);
-}
+static struct lock_class_key cpuctx_mutex;
+static struct lock_class_key cpuctx_lock;

-static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
+int perf_pmu_register(struct pmu *pmu, const char *name, int type)
{
- struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
- int err = 0;
+ int cpu, ret;

- mutex_lock(&swhash->hlist_mutex);
+ mutex_lock(&pmus_lock);
+ ret = -ENOMEM;
+ pmu->pmu_disable_count = alloc_percpu(int);
+ if (!pmu->pmu_disable_count)
+ goto unlock;

- if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
- struct swevent_hlist *hlist;
+ pmu->type = -1;
+ if (!name)
+ goto skip_type;
+ pmu->name = name;

- hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
- if (!hlist) {
- err = -ENOMEM;
- goto exit;
+ if (type < 0) {
+ type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
+ if (type < 0) {
+ ret = type;
+ goto free_pdc;
}
- rcu_assign_pointer(swhash->swevent_hlist, hlist);
}
- swhash->hlist_refcount++;
-exit:
- mutex_unlock(&swhash->hlist_mutex);
+ pmu->type = type;

- return err;
-}
-
-static int swevent_hlist_get(struct perf_event *event)
-{
- int err;
- int cpu, failed_cpu;
-
- get_online_cpus();
- for_each_possible_cpu(cpu) {
- err = swevent_hlist_get_cpu(event, cpu);
- if (err) {
- failed_cpu = cpu;
- goto fail;
- }
- }
- put_online_cpus();
-
- return 0;
-fail:
- for_each_possible_cpu(cpu) {
- if (cpu == failed_cpu)
- break;
- swevent_hlist_put_cpu(event, cpu);
- }
-
- put_online_cpus();
- return err;
-}
-
-struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
-
-static void sw_perf_event_destroy(struct perf_event *event)
-{
- u64 event_id = event->attr.config;
-
- WARN_ON(event->parent);
-
- static_key_slow_dec(&perf_swevent_enabled[event_id]);
- swevent_hlist_put(event);
-}
-
-static int perf_swevent_init(struct perf_event *event)
-{
- u64 event_id = event->attr.config;
-
- if (event->attr.type != PERF_TYPE_SOFTWARE)
- return -ENOENT;
-
- /*
- * no branch sampling for software events
- */
- if (has_branch_stack(event))
- return -EOPNOTSUPP;
-
- switch (event_id) {
- case PERF_COUNT_SW_CPU_CLOCK:
- case PERF_COUNT_SW_TASK_CLOCK:
- return -ENOENT;
-
- default:
- break;
- }
-
- if (event_id >= PERF_COUNT_SW_MAX)
- return -ENOENT;
-
- if (!event->parent) {
- int err;
-
- err = swevent_hlist_get(event);
- if (err)
- return err;
-
- static_key_slow_inc(&perf_swevent_enabled[event_id]);
- event->destroy = sw_perf_event_destroy;
- }
-
- return 0;
-}
-
-static struct pmu perf_swevent = {
- .task_ctx_nr = perf_sw_context,
-
- .capabilities = PERF_PMU_CAP_NO_NMI,
-
- .event_init = perf_swevent_init,
- .add = perf_swevent_add,
- .del = perf_swevent_del,
- .start = perf_swevent_start,
- .stop = perf_swevent_stop,
- .read = perf_swevent_read,
-};
-
-#ifdef CONFIG_EVENT_TRACING
-
-static int perf_tp_filter_match(struct perf_event *event,
- struct perf_sample_data *data)
-{
- void *record = data->raw->data;
-
- if (likely(!event->filter) || filter_match_preds(event->filter, record))
- return 1;
- return 0;
-}
-
-static int perf_tp_event_match(struct perf_event *event,
- struct perf_sample_data *data,
- struct pt_regs *regs)
-{
- if (event->hw.state & PERF_HES_STOPPED)
- return 0;
- /*
- * All tracepoints are from kernel-space.
- */
- if (event->attr.exclude_kernel)
- return 0;
-
- if (!perf_tp_filter_match(event, data))
- return 0;
-
- return 1;
-}
-
-void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
- struct pt_regs *regs, struct hlist_head *head, int rctx,
- struct task_struct *task)
-{
- struct perf_sample_data data;
- struct perf_event *event;
-
- struct perf_raw_record raw = {
- .size = entry_size,
- .data = record,
- };
-
- perf_sample_data_init(&data, addr, 0);
- data.raw = &raw;
-
- hlist_for_each_entry_rcu(event, head, hlist_entry) {
- if (perf_tp_event_match(event, &data, regs))
- perf_swevent_event(event, count, &data, regs);
- }
-
- /*
- * If we got specified a target task, also iterate its context and
- * deliver this event there too.
- */
- if (task && task != current) {
- struct perf_event_context *ctx;
- struct trace_entry *entry = record;
-
- rcu_read_lock();
- ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
- if (!ctx)
- goto unlock;
-
- list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
- if (event->attr.type != PERF_TYPE_TRACEPOINT)
- continue;
- if (event->attr.config != entry->type)
- continue;
- if (perf_tp_event_match(event, &data, regs))
- perf_swevent_event(event, count, &data, regs);
- }
-unlock:
- rcu_read_unlock();
- }
-
- perf_swevent_put_recursion_context(rctx);
-}
-EXPORT_SYMBOL_GPL(perf_tp_event);
-
-static void tp_perf_event_destroy(struct perf_event *event)
-{
- perf_trace_destroy(event);
-}
-
-static int perf_tp_event_init(struct perf_event *event)
-{
- int err;
-
- if (event->attr.type != PERF_TYPE_TRACEPOINT)
- return -ENOENT;
-
- /*
- * no branch sampling for tracepoint events
- */
- if (has_branch_stack(event))
- return -EOPNOTSUPP;
-
- err = perf_trace_init(event);
- if (err)
- return err;
-
- event->destroy = tp_perf_event_destroy;
-
- return 0;
-}
-
-static struct pmu perf_tracepoint = {
- .task_ctx_nr = perf_sw_context,
-
- .event_init = perf_tp_event_init,
- .add = perf_trace_add,
- .del = perf_trace_del,
- .start = perf_swevent_start,
- .stop = perf_swevent_stop,
- .read = perf_swevent_read,
-};
-
-static inline void perf_tp_register(void)
-{
- perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
-}
-
-static int perf_event_set_filter(struct perf_event *event, void __user *arg)
-{
- char *filter_str;
- int ret;
-
- if (event->attr.type != PERF_TYPE_TRACEPOINT)
- return -EINVAL;
-
- filter_str = strndup_user(arg, PAGE_SIZE);
- if (IS_ERR(filter_str))
- return PTR_ERR(filter_str);
-
- ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
-
- kfree(filter_str);
- return ret;
-}
-
-static void perf_event_free_filter(struct perf_event *event)
-{
- ftrace_profile_free_filter(event);
-}
-
-static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
-{
- struct bpf_prog *prog;
-
- if (event->attr.type != PERF_TYPE_TRACEPOINT)
- return -EINVAL;
-
- if (event->tp_event->prog)
- return -EEXIST;
-
- if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE))
- /* bpf programs can only be attached to kprobes */
- return -EINVAL;
-
- prog = bpf_prog_get(prog_fd);
- if (IS_ERR(prog))
- return PTR_ERR(prog);
-
- if (prog->type != BPF_PROG_TYPE_KPROBE) {
- /* valid fd, but invalid bpf program type */
- bpf_prog_put(prog);
- return -EINVAL;
- }
-
- event->tp_event->prog = prog;
-
- return 0;
-}
-
-static void perf_event_free_bpf_prog(struct perf_event *event)
-{
- struct bpf_prog *prog;
-
- if (!event->tp_event)
- return;
-
- prog = event->tp_event->prog;
- if (prog) {
- event->tp_event->prog = NULL;
- bpf_prog_put(prog);
- }
-}
-
-#else
-
-static inline void perf_tp_register(void)
-{
-}
-
-static int perf_event_set_filter(struct perf_event *event, void __user *arg)
-{
- return -ENOENT;
-}
-
-static void perf_event_free_filter(struct perf_event *event)
-{
-}
-
-static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
-{
- return -ENOENT;
-}
-
-static void perf_event_free_bpf_prog(struct perf_event *event)
-{
-}
-#endif /* CONFIG_EVENT_TRACING */
-
-#ifdef CONFIG_HAVE_HW_BREAKPOINT
-void perf_bp_event(struct perf_event *bp, void *data)
-{
- struct perf_sample_data sample;
- struct pt_regs *regs = data;
-
- perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
-
- if (!bp->hw.state && !perf_exclude_event(bp, regs))
- perf_swevent_event(bp, 1, &sample, regs);
-}
-#endif
-
-/*
- * hrtimer based swevent callback
- */
-
-static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
-{
- enum hrtimer_restart ret = HRTIMER_RESTART;
- struct perf_sample_data data;
- struct pt_regs *regs;
- struct perf_event *event;
- u64 period;
-
- event = container_of(hrtimer, struct perf_event, hw.hrtimer);
-
- if (event->state != PERF_EVENT_STATE_ACTIVE)
- return HRTIMER_NORESTART;
-
- event->pmu->read(event);
-
- perf_sample_data_init(&data, 0, event->hw.last_period);
- regs = get_irq_regs();
-
- if (regs && !perf_exclude_event(event, regs)) {
- if (!(event->attr.exclude_idle && is_idle_task(current)))
- if (__perf_event_overflow(event, 1, &data, regs))
- ret = HRTIMER_NORESTART;
- }
-
- period = max_t(u64, 10000, event->hw.sample_period);
- hrtimer_forward_now(hrtimer, ns_to_ktime(period));
-
- return ret;
-}
-
-static void perf_swevent_start_hrtimer(struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- s64 period;
-
- if (!is_sampling_event(event))
- return;
-
- period = local64_read(&hwc->period_left);
- if (period) {
- if (period < 0)
- period = 10000;
-
- local64_set(&hwc->period_left, 0);
- } else {
- period = max_t(u64, 10000, hwc->sample_period);
- }
- __hrtimer_start_range_ns(&hwc->hrtimer,
- ns_to_ktime(period), 0,
- HRTIMER_MODE_REL_PINNED, 0);
-}
-
-static void perf_swevent_cancel_hrtimer(struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
-
- if (is_sampling_event(event)) {
- ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
- local64_set(&hwc->period_left, ktime_to_ns(remaining));
-
- hrtimer_cancel(&hwc->hrtimer);
- }
-}
-
-static void perf_swevent_init_hrtimer(struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
-
- if (!is_sampling_event(event))
- return;
-
- hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- hwc->hrtimer.function = perf_swevent_hrtimer;
-
- /*
- * Since hrtimers have a fixed rate, we can do a static freq->period
- * mapping and avoid the whole period adjust feedback stuff.
- */
- if (event->attr.freq) {
- long freq = event->attr.sample_freq;
-
- event->attr.sample_period = NSEC_PER_SEC / freq;
- hwc->sample_period = event->attr.sample_period;
- local64_set(&hwc->period_left, hwc->sample_period);
- hwc->last_period = hwc->sample_period;
- event->attr.freq = 0;
- }
-}
-
-/*
- * Software event: cpu wall time clock
- */
-
-static void cpu_clock_event_update(struct perf_event *event)
-{
- s64 prev;
- u64 now;
-
- now = local_clock();
- prev = local64_xchg(&event->hw.prev_count, now);
- local64_add(now - prev, &event->count);
-}
-
-static void cpu_clock_event_start(struct perf_event *event, int flags)
-{
- local64_set(&event->hw.prev_count, local_clock());
- perf_swevent_start_hrtimer(event);
-}
-
-static void cpu_clock_event_stop(struct perf_event *event, int flags)
-{
- perf_swevent_cancel_hrtimer(event);
- cpu_clock_event_update(event);
-}
-
-static int cpu_clock_event_add(struct perf_event *event, int flags)
-{
- if (flags & PERF_EF_START)
- cpu_clock_event_start(event, flags);
- perf_event_update_userpage(event);
-
- return 0;
-}
-
-static void cpu_clock_event_del(struct perf_event *event, int flags)
-{
- cpu_clock_event_stop(event, flags);
-}
-
-static void cpu_clock_event_read(struct perf_event *event)
-{
- cpu_clock_event_update(event);
-}
-
-static int cpu_clock_event_init(struct perf_event *event)
-{
- if (event->attr.type != PERF_TYPE_SOFTWARE)
- return -ENOENT;
-
- if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
- return -ENOENT;
-
- /*
- * no branch sampling for software events
- */
- if (has_branch_stack(event))
- return -EOPNOTSUPP;
-
- perf_swevent_init_hrtimer(event);
-
- return 0;
-}
-
-static struct pmu perf_cpu_clock = {
- .task_ctx_nr = perf_sw_context,
-
- .capabilities = PERF_PMU_CAP_NO_NMI,
-
- .event_init = cpu_clock_event_init,
- .add = cpu_clock_event_add,
- .del = cpu_clock_event_del,
- .start = cpu_clock_event_start,
- .stop = cpu_clock_event_stop,
- .read = cpu_clock_event_read,
-};
-
-/*
- * Software event: task time clock
- */
-
-static void task_clock_event_update(struct perf_event *event, u64 now)
-{
- u64 prev;
- s64 delta;
-
- prev = local64_xchg(&event->hw.prev_count, now);
- delta = now - prev;
- local64_add(delta, &event->count);
-}
-
-static void task_clock_event_start(struct perf_event *event, int flags)
-{
- local64_set(&event->hw.prev_count, event->ctx->time);
- perf_swevent_start_hrtimer(event);
-}
-
-static void task_clock_event_stop(struct perf_event *event, int flags)
-{
- perf_swevent_cancel_hrtimer(event);
- task_clock_event_update(event, event->ctx->time);
-}
-
-static int task_clock_event_add(struct perf_event *event, int flags)
-{
- if (flags & PERF_EF_START)
- task_clock_event_start(event, flags);
- perf_event_update_userpage(event);
-
- return 0;
-}
-
-static void task_clock_event_del(struct perf_event *event, int flags)
-{
- task_clock_event_stop(event, PERF_EF_UPDATE);
-}
-
-static void task_clock_event_read(struct perf_event *event)
-{
- u64 now = perf_clock();
- u64 delta = now - event->ctx->timestamp;
- u64 time = event->ctx->time + delta;
-
- task_clock_event_update(event, time);
-}
-
-static int task_clock_event_init(struct perf_event *event)
-{
- if (event->attr.type != PERF_TYPE_SOFTWARE)
- return -ENOENT;
-
- if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
- return -ENOENT;
-
- /*
- * no branch sampling for software events
- */
- if (has_branch_stack(event))
- return -EOPNOTSUPP;
-
- perf_swevent_init_hrtimer(event);
-
- return 0;
-}
-
-static struct pmu perf_task_clock = {
- .task_ctx_nr = perf_sw_context,
-
- .capabilities = PERF_PMU_CAP_NO_NMI,
-
- .event_init = task_clock_event_init,
- .add = task_clock_event_add,
- .del = task_clock_event_del,
- .start = task_clock_event_start,
- .stop = task_clock_event_stop,
- .read = task_clock_event_read,
-};
-
-static void perf_pmu_nop_void(struct pmu *pmu)
-{
-}
-
-static int perf_pmu_nop_int(struct pmu *pmu)
-{
- return 0;
-}
-
-static void perf_pmu_start_txn(struct pmu *pmu)
-{
- perf_pmu_disable(pmu);
-}
-
-static int perf_pmu_commit_txn(struct pmu *pmu)
-{
- perf_pmu_enable(pmu);
- return 0;
-}
-
-static void perf_pmu_cancel_txn(struct pmu *pmu)
-{
- perf_pmu_enable(pmu);
-}
-
-static int perf_event_idx_default(struct perf_event *event)
-{
- return 0;
-}
-
-/*
- * Ensures all contexts with the same task_ctx_nr have the same
- * pmu_cpu_context too.
- */
-static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
-{
- struct pmu *pmu;
-
- if (ctxn < 0)
- return NULL;
-
- list_for_each_entry(pmu, &pmus, entry) {
- if (pmu->task_ctx_nr == ctxn)
- return pmu->pmu_cpu_context;
- }
-
- return NULL;
-}
-
-static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct perf_cpu_context *cpuctx;
-
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
-
- if (cpuctx->unique_pmu == old_pmu)
- cpuctx->unique_pmu = pmu;
- }
-}
-
-static void free_pmu_context(struct pmu *pmu)
-{
- struct pmu *i;
-
- mutex_lock(&pmus_lock);
- /*
- * Like a real lame refcount.
- */
- list_for_each_entry(i, &pmus, entry) {
- if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
- update_pmu_context(i, pmu);
- goto out;
- }
- }
-
- free_percpu(pmu->pmu_cpu_context);
-out:
- mutex_unlock(&pmus_lock);
-}
-static struct idr pmu_idr;
-
-static ssize_t
-type_show(struct device *dev, struct device_attribute *attr, char *page)
-{
- struct pmu *pmu = dev_get_drvdata(dev);
-
- return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
-}
-static DEVICE_ATTR_RO(type);
-
-static ssize_t
-perf_event_mux_interval_ms_show(struct device *dev,
- struct device_attribute *attr,
- char *page)
-{
- struct pmu *pmu = dev_get_drvdata(dev);
-
- return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
-}
-
-static ssize_t
-perf_event_mux_interval_ms_store(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t count)
-{
- struct pmu *pmu = dev_get_drvdata(dev);
- int timer, cpu, ret;
-
- ret = kstrtoint(buf, 0, &timer);
- if (ret)
- return ret;
-
- if (timer < 1)
- return -EINVAL;
-
- /* same value, noting to do */
- if (timer == pmu->hrtimer_interval_ms)
- return count;
-
- pmu->hrtimer_interval_ms = timer;
-
- /* update all cpuctx for this PMU */
- for_each_possible_cpu(cpu) {
- struct perf_cpu_context *cpuctx;
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
- cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
-
- if (hrtimer_active(&cpuctx->hrtimer))
- hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval);
- }
-
- return count;
-}
-static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
-
-static struct attribute *pmu_dev_attrs[] = {
- &dev_attr_type.attr,
- &dev_attr_perf_event_mux_interval_ms.attr,
- NULL,
-};
-ATTRIBUTE_GROUPS(pmu_dev);
-
-static int pmu_bus_running;
-static struct bus_type pmu_bus = {
- .name = "event_source",
- .dev_groups = pmu_dev_groups,
-};
-
-static void pmu_dev_release(struct device *dev)
-{
- kfree(dev);
-}
-
-static int pmu_dev_alloc(struct pmu *pmu)
-{
- int ret = -ENOMEM;
-
- pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
- if (!pmu->dev)
- goto out;
-
- pmu->dev->groups = pmu->attr_groups;
- device_initialize(pmu->dev);
- ret = dev_set_name(pmu->dev, "%s", pmu->name);
- if (ret)
- goto free_dev;
-
- dev_set_drvdata(pmu->dev, pmu);
- pmu->dev->bus = &pmu_bus;
- pmu->dev->release = pmu_dev_release;
- ret = device_add(pmu->dev);
- if (ret)
- goto free_dev;
-
-out:
- return ret;
-
-free_dev:
- put_device(pmu->dev);
- goto out;
-}
-
-static struct lock_class_key cpuctx_mutex;
-static struct lock_class_key cpuctx_lock;
-
-int perf_pmu_register(struct pmu *pmu, const char *name, int type)
-{
- int cpu, ret;
-
- mutex_lock(&pmus_lock);
- ret = -ENOMEM;
- pmu->pmu_disable_count = alloc_percpu(int);
- if (!pmu->pmu_disable_count)
- goto unlock;
-
- pmu->type = -1;
- if (!name)
- goto skip_type;
- pmu->name = name;
-
- if (type < 0) {
- type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
- if (type < 0) {
- ret = type;
- goto free_pdc;
- }
- }
- pmu->type = type;
-
- if (pmu_bus_running) {
- ret = pmu_dev_alloc(pmu);
- if (ret)
- goto free_idr;
- }
+ if (pmu_bus_running) {
+ ret = pmu_dev_alloc(pmu);
+ if (ret)
+ goto free_idr;
+ }

skip_type:
pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
@@ -8808,30 +7866,10 @@ int perf_event_init_task(struct task_str

static void __init perf_event_init_all_cpus(void)
{
- struct swevent_htable *swhash;
int cpu;

- for_each_possible_cpu(cpu) {
- swhash = &per_cpu(swevent_htable, cpu);
- mutex_init(&swhash->hlist_mutex);
+ for_each_possible_cpu(cpu)
INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
- }
-}
-
-static void perf_event_init_cpu(int cpu)
-{
- struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
-
- mutex_lock(&swhash->hlist_mutex);
- swhash->online = true;
- if (swhash->hlist_refcount > 0) {
- struct swevent_hlist *hlist;
-
- hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
- WARN_ON(!hlist);
- rcu_assign_pointer(swhash->swevent_hlist, hlist);
- }
- mutex_unlock(&swhash->hlist_mutex);
}

#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
@@ -8862,20 +7900,8 @@ static void perf_event_exit_cpu_context(
}
srcu_read_unlock(&pmus_srcu, idx);
}
-
-static void perf_event_exit_cpu(int cpu)
-{
- struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
-
- perf_event_exit_cpu_context(cpu);
-
- mutex_lock(&swhash->hlist_mutex);
- swhash->online = false;
- swevent_hlist_release(swhash);
- mutex_unlock(&swhash->hlist_mutex);
-}
#else
-static inline void perf_event_exit_cpu(int cpu) { }
+static inline void perf_event_exit_cpu_context(int cpu) { }
#endif

static int
@@ -8884,7 +7910,7 @@ perf_reboot(struct notifier_block *notif
int cpu;

for_each_online_cpu(cpu)
- perf_event_exit_cpu(cpu);
+ perf_event_exit_cpu_context(cpu);

return NOTIFY_OK;
}
@@ -8905,14 +7931,9 @@ perf_cpu_notify(struct notifier_block *s

switch (action & ~CPU_TASKS_FROZEN) {

- case CPU_UP_PREPARE:
- case CPU_DOWN_FAILED:
- perf_event_init_cpu(cpu);
- break;
-
case CPU_UP_CANCELED:
case CPU_DOWN_PREPARE:
- perf_event_exit_cpu(cpu);
+ perf_event_exit_cpu_context(cpu);
break;
default:
break;
@@ -8929,10 +7950,7 @@ void __init perf_event_init(void)

perf_event_init_all_cpus();
init_srcu_struct(&pmus_srcu);
- perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
- perf_pmu_register(&perf_cpu_clock, NULL, -1);
- perf_pmu_register(&perf_task_clock, NULL, -1);
- perf_tp_register();
+ perf_swevent_register();
perf_cpu_notifier(perf_cpu_notify);
register_reboot_notifier(&perf_reboot_notifier);

--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -228,4 +228,17 @@ static inline bool arch_perf_have_user_s
#define perf_user_stack_pointer(regs) 0
#endif /* CONFIG_HAVE_PERF_USER_STACK_DUMP */

+#define MAX_INTERRUPTS (~0ULL)
+
+extern int __perf_event_overflow(struct perf_event *event,
+ int throttle, struct perf_sample_data *data,
+ struct pt_regs *regs);
+
+extern void perf_event_free_filter(struct perf_event *event);
+extern void perf_event_free_bpf_prog(struct perf_event *event);
+extern int perf_event_set_filter(struct perf_event *event, void __user *arg);
+extern int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
+
+extern void perf_swevent_register(void);
+
#endif /* _KERNEL_EVENTS_INTERNAL_H */
--- /dev/null
+++ b/kernel/events/software.c
@@ -0,0 +1,1021 @@
+
+#include <linux/perf_event.h>
+#include <linux/rculist.h>
+#include <linux/hash.h>
+#include <linux/slab.h>
+#include <linux/ftrace_event.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+
+#include "internal.h"
+
+/*
+ * Generic software event infrastructure
+ */
+
+struct swevent_htable {
+ struct swevent_hlist *swevent_hlist;
+ struct mutex hlist_mutex;
+ int hlist_refcount;
+
+ /* Recursion avoidance in each contexts */
+ int recursion[PERF_NR_CONTEXTS];
+
+ /* Keeps track of cpu being initialized/exited */
+ bool online;
+};
+
+static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
+
+/*
+ * We directly increment event->count and keep a second value in
+ * event->hw.period_left to count intervals. This period event
+ * is kept in the range [-sample_period, 0] so that we can use the
+ * sign as trigger.
+ */
+
+u64 perf_swevent_set_period(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ u64 period = hwc->last_period;
+ u64 nr, offset;
+ s64 old, val;
+
+ hwc->last_period = hwc->sample_period;
+
+again:
+ old = val = local64_read(&hwc->period_left);
+ if (val < 0)
+ return 0;
+
+ nr = div64_u64(period + val, period);
+ offset = nr * period;
+ val -= offset;
+ if (local64_cmpxchg(&hwc->period_left, old, val) != old)
+ goto again;
+
+ return nr;
+}
+
+static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ int throttle = 0;
+
+ if (!overflow)
+ overflow = perf_swevent_set_period(event);
+
+ if (hwc->interrupts == MAX_INTERRUPTS)
+ return;
+
+ for (; overflow; overflow--) {
+ if (__perf_event_overflow(event, throttle,
+ data, regs)) {
+ /*
+ * We inhibit the overflow from happening when
+ * hwc->interrupts == MAX_INTERRUPTS.
+ */
+ break;
+ }
+ throttle = 1;
+ }
+}
+
+static void perf_swevent_event(struct perf_event *event, u64 nr,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ local64_add(nr, &event->count);
+
+ if (!regs)
+ return;
+
+ if (!is_sampling_event(event))
+ return;
+
+ if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
+ data->period = nr;
+ return perf_swevent_overflow(event, 1, data, regs);
+ } else
+ data->period = event->hw.last_period;
+
+ if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
+ return perf_swevent_overflow(event, 1, data, regs);
+
+ if (local64_add_negative(nr, &hwc->period_left))
+ return;
+
+ perf_swevent_overflow(event, 0, data, regs);
+}
+
+static int perf_exclude_event(struct perf_event *event,
+ struct pt_regs *regs)
+{
+ if (event->hw.state & PERF_HES_STOPPED)
+ return 1;
+
+ if (regs) {
+ if (event->attr.exclude_user && user_mode(regs))
+ return 1;
+
+ if (event->attr.exclude_kernel && !user_mode(regs))
+ return 1;
+ }
+
+ return 0;
+}
+
+
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+void perf_bp_event(struct perf_event *bp, void *data)
+{
+ struct perf_sample_data sample;
+ struct pt_regs *regs = data;
+
+ perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
+
+ if (!bp->hw.state && !perf_exclude_event(bp, regs))
+ perf_swevent_event(bp, 1, &sample, regs);
+}
+#endif
+
+static int perf_swevent_match(struct perf_event *event,
+ enum perf_type_id type,
+ u32 event_id,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ if (event->attr.type != type)
+ return 0;
+
+ if (event->attr.config != event_id)
+ return 0;
+
+ if (perf_exclude_event(event, regs))
+ return 0;
+
+ return 1;
+}
+
+static inline u64 swevent_hash(u64 type, u32 event_id)
+{
+ u64 val = event_id | (type << 32);
+
+ return hash_64(val, SWEVENT_HLIST_BITS);
+}
+
+static inline struct hlist_head *
+__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
+{
+ u64 hash = swevent_hash(type, event_id);
+
+ return &hlist->heads[hash];
+}
+
+/* For the read side: events when they trigger */
+static inline struct hlist_head *
+find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
+{
+ struct swevent_hlist *hlist;
+
+ hlist = rcu_dereference(swhash->swevent_hlist);
+ if (!hlist)
+ return NULL;
+
+ return __find_swevent_head(hlist, type, event_id);
+}
+
+/* For the event head insertion and removal in the hlist */
+static inline struct hlist_head *
+find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
+{
+ struct swevent_hlist *hlist;
+ u32 event_id = event->attr.config;
+ u64 type = event->attr.type;
+
+ /*
+ * Event scheduling is always serialized against hlist allocation
+ * and release. Which makes the protected version suitable here.
+ * The context lock guarantees that.
+ */
+ hlist = rcu_dereference_protected(swhash->swevent_hlist,
+ lockdep_is_held(&event->ctx->lock));
+ if (!hlist)
+ return NULL;
+
+ return __find_swevent_head(hlist, type, event_id);
+}
+
+static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
+ u64 nr,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
+ struct perf_event *event;
+ struct hlist_head *head;
+
+ rcu_read_lock();
+ head = find_swevent_head_rcu(swhash, type, event_id);
+ if (!head)
+ goto end;
+
+ hlist_for_each_entry_rcu(event, head, hlist_entry) {
+ if (perf_swevent_match(event, type, event_id, data, regs))
+ perf_swevent_event(event, nr, data, regs);
+ }
+end:
+ rcu_read_unlock();
+}
+
+DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
+
+int perf_swevent_get_recursion_context(void)
+{
+ struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
+
+ return get_recursion_context(swhash->recursion);
+}
+EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
+
+inline void perf_swevent_put_recursion_context(int rctx)
+{
+ struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
+
+ put_recursion_context(swhash->recursion, rctx);
+}
+
+void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
+{
+ struct perf_sample_data data;
+
+ if (WARN_ON_ONCE(!regs))
+ return;
+
+ perf_sample_data_init(&data, addr, 0);
+ do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
+}
+
+void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
+{
+ int rctx;
+
+ preempt_disable_notrace();
+ rctx = perf_swevent_get_recursion_context();
+ if (unlikely(rctx < 0))
+ goto fail;
+
+ ___perf_sw_event(event_id, nr, regs, addr);
+
+ perf_swevent_put_recursion_context(rctx);
+fail:
+ preempt_enable_notrace();
+}
+
+static void perf_swevent_read(struct perf_event *event)
+{
+}
+
+static int perf_swevent_add(struct perf_event *event, int flags)
+{
+ struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
+ struct hw_perf_event *hwc = &event->hw;
+ struct hlist_head *head;
+
+ if (is_sampling_event(event)) {
+ hwc->last_period = hwc->sample_period;
+ perf_swevent_set_period(event);
+ }
+
+ hwc->state = !(flags & PERF_EF_START);
+
+ head = find_swevent_head(swhash, event);
+ if (!head) {
+ /*
+ * We can race with cpu hotplug code. Do not
+ * WARN if the cpu just got unplugged.
+ */
+ WARN_ON_ONCE(swhash->online);
+ return -EINVAL;
+ }
+
+ hlist_add_head_rcu(&event->hlist_entry, head);
+ perf_event_update_userpage(event);
+
+ return 0;
+}
+
+static void perf_swevent_del(struct perf_event *event, int flags)
+{
+ hlist_del_rcu(&event->hlist_entry);
+}
+
+static void perf_swevent_start(struct perf_event *event, int flags)
+{
+ event->hw.state = 0;
+}
+
+static void perf_swevent_stop(struct perf_event *event, int flags)
+{
+ event->hw.state = PERF_HES_STOPPED;
+}
+
+/* Deref the hlist from the update side */
+static inline struct swevent_hlist *
+swevent_hlist_deref(struct swevent_htable *swhash)
+{
+ return rcu_dereference_protected(swhash->swevent_hlist,
+ lockdep_is_held(&swhash->hlist_mutex));
+}
+
+static void swevent_hlist_release(struct swevent_htable *swhash)
+{
+ struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
+
+ if (!hlist)
+ return;
+
+ RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
+ kfree_rcu(hlist, rcu_head);
+}
+
+static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
+{
+ struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
+
+ mutex_lock(&swhash->hlist_mutex);
+
+ if (!--swhash->hlist_refcount)
+ swevent_hlist_release(swhash);
+
+ mutex_unlock(&swhash->hlist_mutex);
+}
+
+static void swevent_hlist_put(struct perf_event *event)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ swevent_hlist_put_cpu(event, cpu);
+}
+
+static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
+{
+ struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
+ int err = 0;
+
+ mutex_lock(&swhash->hlist_mutex);
+
+ if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
+ struct swevent_hlist *hlist;
+
+ hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
+ if (!hlist) {
+ err = -ENOMEM;
+ goto exit;
+ }
+ rcu_assign_pointer(swhash->swevent_hlist, hlist);
+ }
+ swhash->hlist_refcount++;
+exit:
+ mutex_unlock(&swhash->hlist_mutex);
+
+ return err;
+}
+
+static int swevent_hlist_get(struct perf_event *event)
+{
+ int err;
+ int cpu, failed_cpu;
+
+ get_online_cpus();
+ for_each_possible_cpu(cpu) {
+ err = swevent_hlist_get_cpu(event, cpu);
+ if (err) {
+ failed_cpu = cpu;
+ goto fail;
+ }
+ }
+ put_online_cpus();
+
+ return 0;
+fail:
+ for_each_possible_cpu(cpu) {
+ if (cpu == failed_cpu)
+ break;
+ swevent_hlist_put_cpu(event, cpu);
+ }
+
+ put_online_cpus();
+ return err;
+}
+
+struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
+
+static void sw_perf_event_destroy(struct perf_event *event)
+{
+ u64 event_id = event->attr.config;
+
+ WARN_ON(event->parent);
+
+ static_key_slow_dec(&perf_swevent_enabled[event_id]);
+ swevent_hlist_put(event);
+}
+
+static int perf_swevent_init(struct perf_event *event)
+{
+ u64 event_id = event->attr.config;
+
+ if (event->attr.type != PERF_TYPE_SOFTWARE)
+ return -ENOENT;
+
+ /*
+ * no branch sampling for software events
+ */
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
+ switch (event_id) {
+ case PERF_COUNT_SW_CPU_CLOCK:
+ case PERF_COUNT_SW_TASK_CLOCK:
+ return -ENOENT;
+
+ default:
+ break;
+ }
+
+ if (event_id >= PERF_COUNT_SW_MAX)
+ return -ENOENT;
+
+ if (!event->parent) {
+ int err;
+
+ err = swevent_hlist_get(event);
+ if (err)
+ return err;
+
+ static_key_slow_inc(&perf_swevent_enabled[event_id]);
+ event->destroy = sw_perf_event_destroy;
+ }
+
+ return 0;
+}
+
+static struct pmu perf_swevent = {
+ .task_ctx_nr = perf_sw_context,
+
+ .capabilities = PERF_PMU_CAP_NO_NMI,
+
+ .event_init = perf_swevent_init,
+ .add = perf_swevent_add,
+ .del = perf_swevent_del,
+ .start = perf_swevent_start,
+ .stop = perf_swevent_stop,
+ .read = perf_swevent_read,
+};
+
+#ifdef CONFIG_EVENT_TRACING
+
+static int perf_tp_filter_match(struct perf_event *event,
+ struct perf_sample_data *data)
+{
+ void *record = data->raw->data;
+
+ if (likely(!event->filter) || filter_match_preds(event->filter, record))
+ return 1;
+ return 0;
+}
+
+static int perf_tp_event_match(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ if (event->hw.state & PERF_HES_STOPPED)
+ return 0;
+ /*
+ * All tracepoints are from kernel-space.
+ */
+ if (event->attr.exclude_kernel)
+ return 0;
+
+ if (!perf_tp_filter_match(event, data))
+ return 0;
+
+ return 1;
+}
+
+void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
+ struct pt_regs *regs, struct hlist_head *head, int rctx,
+ struct task_struct *task)
+{
+ struct perf_sample_data data;
+ struct perf_event *event;
+
+ struct perf_raw_record raw = {
+ .size = entry_size,
+ .data = record,
+ };
+
+ perf_sample_data_init(&data, addr, 0);
+ data.raw = &raw;
+
+ hlist_for_each_entry_rcu(event, head, hlist_entry) {
+ if (perf_tp_event_match(event, &data, regs))
+ perf_swevent_event(event, count, &data, regs);
+ }
+
+ /*
+ * If we got specified a target task, also iterate its context and
+ * deliver this event there too.
+ */
+ if (task && task != current) {
+ struct perf_event_context *ctx;
+ struct trace_entry *entry = record;
+
+ rcu_read_lock();
+ ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
+ if (!ctx)
+ goto unlock;
+
+ list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+ if (event->attr.type != PERF_TYPE_TRACEPOINT)
+ continue;
+ if (event->attr.config != entry->type)
+ continue;
+ if (perf_tp_event_match(event, &data, regs))
+ perf_swevent_event(event, count, &data, regs);
+ }
+unlock:
+ rcu_read_unlock();
+ }
+
+ perf_swevent_put_recursion_context(rctx);
+}
+EXPORT_SYMBOL_GPL(perf_tp_event);
+
+static void tp_perf_event_destroy(struct perf_event *event)
+{
+ perf_trace_destroy(event);
+}
+
+static int perf_tp_event_init(struct perf_event *event)
+{
+ int err;
+
+ if (event->attr.type != PERF_TYPE_TRACEPOINT)
+ return -ENOENT;
+
+ /*
+ * no branch sampling for tracepoint events
+ */
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
+ err = perf_trace_init(event);
+ if (err)
+ return err;
+
+ event->destroy = tp_perf_event_destroy;
+
+ return 0;
+}
+
+static struct pmu perf_tracepoint = {
+ .task_ctx_nr = perf_sw_context,
+
+ .event_init = perf_tp_event_init,
+ .add = perf_trace_add,
+ .del = perf_trace_del,
+ .start = perf_swevent_start,
+ .stop = perf_swevent_stop,
+ .read = perf_swevent_read,
+};
+
+static inline void perf_tp_register(void)
+{
+ perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
+}
+
+int perf_event_set_filter(struct perf_event *event, void __user *arg)
+{
+ char *filter_str;
+ int ret;
+
+ if (event->attr.type != PERF_TYPE_TRACEPOINT)
+ return -EINVAL;
+
+ filter_str = strndup_user(arg, PAGE_SIZE);
+ if (IS_ERR(filter_str))
+ return PTR_ERR(filter_str);
+
+ ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
+
+ kfree(filter_str);
+ return ret;
+}
+
+void perf_event_free_filter(struct perf_event *event)
+{
+ ftrace_profile_free_filter(event);
+}
+
+int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
+{
+ struct bpf_prog *prog;
+
+ if (event->attr.type != PERF_TYPE_TRACEPOINT)
+ return -EINVAL;
+
+ if (event->tp_event->prog)
+ return -EEXIST;
+
+ if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE))
+ /* bpf programs can only be attached to kprobes */
+ return -EINVAL;
+
+ prog = bpf_prog_get(prog_fd);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ if (prog->type != BPF_PROG_TYPE_KPROBE) {
+ /* valid fd, but invalid bpf program type */
+ bpf_prog_put(prog);
+ return -EINVAL;
+ }
+
+ event->tp_event->prog = prog;
+
+ return 0;
+}
+
+void perf_event_free_bpf_prog(struct perf_event *event)
+{
+ struct bpf_prog *prog;
+
+ if (!event->tp_event)
+ return;
+
+ prog = event->tp_event->prog;
+ if (prog) {
+ event->tp_event->prog = NULL;
+ bpf_prog_put(prog);
+ }
+}
+
+#else
+
+static inline void perf_tp_register(void)
+{
+}
+
+int perf_event_set_filter(struct perf_event *event, void __user *arg)
+{
+ return -ENOENT;
+}
+
+void perf_event_free_filter(struct perf_event *event)
+{
+}
+
+int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
+{
+ return -ENOENT;
+}
+
+void perf_event_free_bpf_prog(struct perf_event *event)
+{
+}
+#endif /* CONFIG_EVENT_TRACING */
+
+/*
+ * hrtimer based swevent callback
+ */
+
+static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
+{
+ enum hrtimer_restart ret = HRTIMER_RESTART;
+ struct perf_sample_data data;
+ struct pt_regs *regs;
+ struct perf_event *event;
+ u64 period;
+
+ event = container_of(hrtimer, struct perf_event, hw.hrtimer);
+
+ if (event->state != PERF_EVENT_STATE_ACTIVE)
+ return HRTIMER_NORESTART;
+
+ event->pmu->read(event);
+
+ perf_sample_data_init(&data, 0, event->hw.last_period);
+ regs = get_irq_regs();
+
+ if (regs && !perf_exclude_event(event, regs)) {
+ if (!(event->attr.exclude_idle && is_idle_task(current)))
+ if (__perf_event_overflow(event, 1, &data, regs))
+ ret = HRTIMER_NORESTART;
+ }
+
+ period = max_t(u64, 10000, event->hw.sample_period);
+ hrtimer_forward_now(hrtimer, ns_to_ktime(period));
+
+ return ret;
+}
+
+static void perf_swevent_start_hrtimer(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ s64 period;
+
+ if (!is_sampling_event(event))
+ return;
+
+ period = local64_read(&hwc->period_left);
+ if (period) {
+ if (period < 0)
+ period = 10000;
+
+ local64_set(&hwc->period_left, 0);
+ } else {
+ period = max_t(u64, 10000, hwc->sample_period);
+ }
+ __hrtimer_start_range_ns(&hwc->hrtimer,
+ ns_to_ktime(period), 0,
+ HRTIMER_MODE_REL_PINNED, 0);
+}
+
+static void perf_swevent_cancel_hrtimer(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (is_sampling_event(event)) {
+ ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
+ local64_set(&hwc->period_left, ktime_to_ns(remaining));
+
+ hrtimer_cancel(&hwc->hrtimer);
+ }
+}
+
+static void perf_swevent_init_hrtimer(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (!is_sampling_event(event))
+ return;
+
+ hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hwc->hrtimer.function = perf_swevent_hrtimer;
+
+ /*
+ * Since hrtimers have a fixed rate, we can do a static freq->period
+ * mapping and avoid the whole period adjust feedback stuff.
+ */
+ if (event->attr.freq) {
+ long freq = event->attr.sample_freq;
+
+ event->attr.sample_period = NSEC_PER_SEC / freq;
+ hwc->sample_period = event->attr.sample_period;
+ local64_set(&hwc->period_left, hwc->sample_period);
+ hwc->last_period = hwc->sample_period;
+ event->attr.freq = 0;
+ }
+}
+
+/*
+ * Software event: cpu wall time clock
+ */
+
+static void cpu_clock_event_update(struct perf_event *event)
+{
+ s64 prev;
+ u64 now;
+
+ now = local_clock();
+ prev = local64_xchg(&event->hw.prev_count, now);
+ local64_add(now - prev, &event->count);
+}
+
+static void cpu_clock_event_start(struct perf_event *event, int flags)
+{
+ local64_set(&event->hw.prev_count, local_clock());
+ perf_swevent_start_hrtimer(event);
+}
+
+static void cpu_clock_event_stop(struct perf_event *event, int flags)
+{
+ perf_swevent_cancel_hrtimer(event);
+ cpu_clock_event_update(event);
+}
+
+static int cpu_clock_event_add(struct perf_event *event, int flags)
+{
+ if (flags & PERF_EF_START)
+ cpu_clock_event_start(event, flags);
+ perf_event_update_userpage(event);
+
+ return 0;
+}
+
+static void cpu_clock_event_del(struct perf_event *event, int flags)
+{
+ cpu_clock_event_stop(event, flags);
+}
+
+static void cpu_clock_event_read(struct perf_event *event)
+{
+ cpu_clock_event_update(event);
+}
+
+static int cpu_clock_event_init(struct perf_event *event)
+{
+ if (event->attr.type != PERF_TYPE_SOFTWARE)
+ return -ENOENT;
+
+ if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
+ return -ENOENT;
+
+ /*
+ * no branch sampling for software events
+ */
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
+ perf_swevent_init_hrtimer(event);
+
+ return 0;
+}
+
+static struct pmu perf_cpu_clock = {
+ .task_ctx_nr = perf_sw_context,
+
+ .capabilities = PERF_PMU_CAP_NO_NMI,
+
+ .event_init = cpu_clock_event_init,
+ .add = cpu_clock_event_add,
+ .del = cpu_clock_event_del,
+ .start = cpu_clock_event_start,
+ .stop = cpu_clock_event_stop,
+ .read = cpu_clock_event_read,
+};
+
+/*
+ * Software event: task time clock
+ */
+
+static void task_clock_event_update(struct perf_event *event, u64 now)
+{
+ u64 prev;
+ s64 delta;
+
+ prev = local64_xchg(&event->hw.prev_count, now);
+ delta = now - prev;
+ local64_add(delta, &event->count);
+}
+
+static void task_clock_event_start(struct perf_event *event, int flags)
+{
+ local64_set(&event->hw.prev_count, event->ctx->time);
+ perf_swevent_start_hrtimer(event);
+}
+
+static void task_clock_event_stop(struct perf_event *event, int flags)
+{
+ perf_swevent_cancel_hrtimer(event);
+ task_clock_event_update(event, event->ctx->time);
+}
+
+static int task_clock_event_add(struct perf_event *event, int flags)
+{
+ if (flags & PERF_EF_START)
+ task_clock_event_start(event, flags);
+ perf_event_update_userpage(event);
+
+ return 0;
+}
+
+static void task_clock_event_del(struct perf_event *event, int flags)
+{
+ task_clock_event_stop(event, PERF_EF_UPDATE);
+}
+
+static void task_clock_event_read(struct perf_event *event)
+{
+ u64 now = local_clock(); /* XXX */
+ u64 delta = now - event->ctx->timestamp;
+ u64 time = event->ctx->time + delta;
+
+ task_clock_event_update(event, time);
+}
+
+static int task_clock_event_init(struct perf_event *event)
+{
+ if (event->attr.type != PERF_TYPE_SOFTWARE)
+ return -ENOENT;
+
+ if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
+ return -ENOENT;
+
+ /*
+ * no branch sampling for software events
+ */
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
+ perf_swevent_init_hrtimer(event);
+
+ return 0;
+}
+
+static struct pmu perf_task_clock = {
+ .task_ctx_nr = perf_sw_context,
+
+ .capabilities = PERF_PMU_CAP_NO_NMI,
+
+ .event_init = task_clock_event_init,
+ .add = task_clock_event_add,
+ .del = task_clock_event_del,
+ .start = task_clock_event_start,
+ .stop = task_clock_event_stop,
+ .read = task_clock_event_read,
+};
+
+static void __init perf_swevent_init_all_cpus(void)
+{
+ struct swevent_htable *swhash;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ swhash = &per_cpu(swevent_htable, cpu);
+ mutex_init(&swhash->hlist_mutex);
+ }
+}
+
+static void perf_swevent_init_cpu(int cpu)
+{
+ struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
+
+ mutex_lock(&swhash->hlist_mutex);
+ swhash->online = true;
+ if (swhash->hlist_refcount > 0) {
+ struct swevent_hlist *hlist;
+
+ hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
+ WARN_ON(!hlist);
+ rcu_assign_pointer(swhash->swevent_hlist, hlist);
+ }
+ mutex_unlock(&swhash->hlist_mutex);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void perf_swevent_exit_cpu(int cpu)
+{
+ struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
+
+ mutex_lock(&swhash->hlist_mutex);
+ swhash->online = false;
+ swevent_hlist_release(swhash);
+ mutex_unlock(&swhash->hlist_mutex);
+}
+#else
+static inline void perf_swevent_exit_cpu(int cpu) { }
+#endif
+
+static int
+perf_swevent_notify(struct notifier_block *self, unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (long)hcpu;
+
+ switch (action & ~CPU_TASKS_FROZEN) {
+
+ case CPU_UP_PREPARE:
+ case CPU_DOWN_FAILED:
+ perf_swevent_init_cpu(cpu);
+ break;
+
+ case CPU_UP_CANCELED:
+ case CPU_DOWN_PREPARE:
+ perf_swevent_exit_cpu(cpu);
+ break;
+
+ default:
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+__init void perf_swevent_register(void)
+{
+ perf_swevent_init_all_cpus();
+
+ perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
+ perf_pmu_register(&perf_cpu_clock, NULL, -1);
+ perf_pmu_register(&perf_task_clock, NULL, -1);
+ perf_tp_register();
+
+ perf_cpu_notifier(perf_swevent_notify);
+}
+