diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 45fce7497178..c7a28d1c7bc7 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -833,6 +833,7 @@ struct perf_event {
 	struct irq_work			pending_disable_irq;
 	struct callback_head		pending_task;
 	unsigned int			pending_work;
+	struct rcuwait			pending_work_wait;
 
 	atomic_t			event_limit;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 01cc0f41b411..b6ffd995a51e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5313,6 +5313,30 @@ static bool exclusive_event_installable(struct perf_event *event,
 static void perf_addr_filters_splice(struct perf_event *event,
 				       struct list_head *head);
 
+static void perf_pending_task_sync(struct perf_event *event)
+{
+	struct callback_head *head = &event->pending_task;
+
+	if (!event->pending_work)
+		return;
+	/*
+	 * If the task is queued to the current task's queue, we
+	 * obviously can't wait for it to complete. Simply cancel it.
+	 */
+	if (task_work_cancel(current, head)) {
+		event->pending_work = 0;
+		local_dec(&event->ctx->nr_no_switch_fast);
+		return;
+	}
+
+	/*
+	 * All accesses related to the event are within the same RCU section in
+	 * perf_pending_task(). The RCU grace period before the event is freed
+	 * will make sure all those accesses are complete by then.
+	 */
+	rcuwait_wait_event(&event->pending_work_wait, !event->pending_work, TASK_UNINTERRUPTIBLE);
+}
+
 /* vs perf_event_alloc() error */
 static void __free_event(struct perf_event *event)
 {
@@ -5365,6 +5389,7 @@ static void _free_event(struct perf_event *event)
 {
 	irq_work_sync(&event->pending_irq);
 	irq_work_sync(&event->pending_disable_irq);
+	perf_pending_task_sync(event);
 
 	unaccount_event(event);
 
@@ -5457,17 +5482,10 @@ static void perf_remove_from_owner(struct perf_event *event)
 
 static void put_event(struct perf_event *event)
 {
-	struct perf_event *parent;
-
 	if (!atomic_long_dec_and_test(&event->refcount))
 		return;
 
-	parent = event->parent;
 	_free_event(event);
-
-	/* Matches the refcount bump in inherit_event() */
-	if (parent)
-		put_event(parent);
 }
 
 /*
@@ -5551,6 +5569,11 @@ again:
 		if (tmp == child) {
 			perf_remove_from_context(child, DETACH_GROUP);
 			list_move(&child->child_list, &free_list);
+			/*
+			 * This matches the refcount bump in inherit_event();
+			 * this can't be the last reference.
+			 */
+			put_event(event);
 		} else {
 			var = &ctx->refcount;
 		}
@@ -5576,8 +5599,7 @@ again:
 		void *var = &child->ctx->refcount;
 
 		list_del(&child->child_list);
-		/* Last reference unless ->pending_task work is pending */
-		put_event(child);
+		free_event(child);
 
 		/*
 		 * Wake any perf_event_free_task() waiting for this event to be
@@ -5588,11 +5610,7 @@ again:
 	}
 
 no_ctx:
-	/*
-	 * Last reference unless ->pending_task work is pending on this event
-	 * or any of its children.
-	 */
-	put_event(event);
+	put_event(event); /* Must be the 'last' reference */
 	return 0;
 }
 EXPORT_SYMBOL_GPL(perf_event_release_kernel);
@@ -6977,6 +6995,12 @@ static void perf_pending_task(struct callback_head *head)
 	struct perf_event *event = container_of(head, struct perf_event, pending_task);
 	int rctx;
 
+	/*
+	 * All accesses to the event must belong to the same implicit RCU read-side
+	 * critical section as the ->pending_work reset. See comment in
+	 * perf_pending_task_sync().
+	 */
+	rcu_read_lock();
 	/*
 	 * If we 'fail' here, that's OK, it means recursion is already disabled
 	 * and we won't recurse 'further'.
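For readers less familiar with rcuwait, the handshake that perf_pending_task_sync() sets up above can be modelled in userspace. The sketch below is an illustrative analogue, not kernel code: a pthread mutex/condvar pair stands in for the rcuwait plus the implicit RCU read-side section, a worker thread plays the role of the queued task work, and the task_work_cancel() fast path is omitted. All names in it (pending_event, sync_pending_work, ...) are made up for this sketch.

	/* Build with: cc -pthread sketch.c */
	#include <pthread.h>
	#include <stdio.h>

	struct pending_event {
		pthread_mutex_t lock;
		pthread_cond_t  work_done;	/* stands in for pending_work_wait */
		int             pending_work;	/* stands in for event->pending_work */
	};

	/* The deferred-work side: mirrors perf_pending_task(). */
	static void *worker(void *arg)
	{
		struct pending_event *ev = arg;

		pthread_mutex_lock(&ev->lock);
		/* ... deliver the sigtrap-equivalent here ... */
		ev->pending_work = 0;			/* work is done */
		pthread_cond_signal(&ev->work_done);	/* rcuwait_wake_up() */
		pthread_mutex_unlock(&ev->lock);
		return NULL;
	}

	/* The teardown side: mirrors perf_pending_task_sync(). */
	static void sync_pending_work(struct pending_event *ev)
	{
		pthread_mutex_lock(&ev->lock);
		while (ev->pending_work)		/* rcuwait_wait_event() */
			pthread_cond_wait(&ev->work_done, &ev->lock);
		pthread_mutex_unlock(&ev->lock);
		/* Only now is it safe to free the event. */
	}

	int main(void)
	{
		struct pending_event ev = {
			.lock = PTHREAD_MUTEX_INITIALIZER,
			.work_done = PTHREAD_COND_INITIALIZER,
			.pending_work = 1,
		};
		pthread_t t;

		pthread_create(&t, NULL, worker, &ev);
		sync_pending_work(&ev);	/* blocks until the worker clears the flag */
		pthread_join(&t, NULL);
		puts("pending work drained; safe to free");
		return 0;
	}

The property mirrored here is the one the patch relies on: the freeing side blocks until the work itself clears the flag it is watching, so the task work never touches a freed event. In the kernel there is no shared lock to pair with; instead, per the comment in perf_pending_task_sync(), the waker's accesses are contained in an RCU read-side section and the grace period before the event is freed guarantees they have completed.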
@@ -6987,8 +7011,9 @@ static void perf_pending_task(struct callback_head *head)
 		event->pending_work = 0;
 		perf_sigtrap(event);
 		local_dec(&event->ctx->nr_no_switch_fast);
+		rcuwait_wake_up(&event->pending_work_wait);
 	}
-	put_event(event);
+	rcu_read_unlock();
 
 	if (rctx >= 0)
 		perf_swevent_put_recursion_context(rctx);
@@ -9896,7 +9921,6 @@ static int __perf_event_overflow(struct perf_event *event,
 		    !task_work_add(current, &event->pending_task, notify_mode)) {
 			event->pending_work = pending_id;
 			local_inc(&event->ctx->nr_no_switch_fast);
-			WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
 
 			event->pending_addr = 0;
 			if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR))
@@ -12245,6 +12269,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	init_irq_work(&event->pending_irq, perf_pending_irq);
 	event->pending_disable_irq = IRQ_WORK_INIT_HARD(perf_pending_disable);
 	init_task_work(&event->pending_task, perf_pending_task);
+	rcuwait_init(&event->pending_work_wait);
 
 	mutex_init(&event->mmap_mutex);
 	raw_spin_lock_init(&event->addr_filters.lock);
@@ -13387,7 +13412,8 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
 		 * Kick perf_poll() for is_event_hup();
 		 */
 		perf_event_wakeup(parent_event);
-		put_event(event);
+		free_event(event);
+		put_event(parent_event);
 		return;
 	}
 
@@ -13505,11 +13531,13 @@ static void perf_free_event(struct perf_event *event,
 	list_del_init(&event->child_list);
 	mutex_unlock(&parent->child_mutex);
 
+	put_event(parent);
+
 	raw_spin_lock_irq(&ctx->lock);
 	perf_group_detach(event);
 	list_del_event(event, ctx);
 	raw_spin_unlock_irq(&ctx->lock);
-	put_event(event);
+	free_event(event);
 }
 
 /*
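The refcount rework in the second half of the patch changes who drops the child-to-parent reference taken in inherit_event(): put_event() no longer walks to the parent, and the task work no longer pins the event; instead, the sites that detach a child (perf_event_release_kernel(), perf_event_exit_event(), perf_free_event()) drop the parent reference explicitly and release the child directly. Below is a minimal userspace model of that ownership rule, with illustrative names, C11 atomics in place of atomic_long_t, and plain put_event() where the kernel uses free_event() on a known-last reference.

	/* Build with: cc -std=c11 sketch.c */
	#include <assert.h>
	#include <stdatomic.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct event {
		atomic_long  refcount;
		struct event *parent;
	};

	static struct event *get_event(struct event *e)
	{
		atomic_fetch_add(&e->refcount, 1);
		return e;
	}

	/* Like the reworked put_event(): frees this event only, never the parent. */
	static void put_event(struct event *e)
	{
		if (atomic_fetch_sub(&e->refcount, 1) != 1)
			return;
		free(e);
	}

	/* Like inherit_event(): the child pins its parent. */
	static struct event *inherit_event(struct event *parent)
	{
		struct event *child = calloc(1, sizeof(*child));

		atomic_init(&child->refcount, 1);
		child->parent = get_event(parent);
		return child;
	}

	/*
	 * Like perf_free_event()/perf_event_exit_event(): drop the parent pin
	 * explicitly at detach time, then release the child itself.
	 */
	static void detach_and_free_child(struct event *child)
	{
		put_event(child->parent);
		put_event(child);	/* the caller's last reference: frees it */
	}

	int main(void)
	{
		struct event *parent = calloc(1, sizeof(*parent));
		struct event *child;

		atomic_init(&parent->refcount, 1);
		child = inherit_event(parent);		/* parent refcount: 2 */
		detach_and_free_child(child);		/* parent refcount: 1 */
		assert(atomic_load(&parent->refcount) == 1);
		put_event(parent);
		puts("lifetimes balanced");
		return 0;
	}

With parent puts made explicit at the detach sites, put_event() can stay a plain "drop my own count" primitive, which is what lets the patch delete the WARN_ON_ONCE() refcount bump in __perf_event_overflow() and replace the put_event() in perf_pending_task() with the rcuwait handshake shown earlier.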