Re: [PATCH v2] perf/core: restore __perf_remove_from_context when DETACH_EXIT not set

From: David Wang
Date: Tue Jun 03 2025 - 09:51:13 EST

At 2025-06-03 21:03:55, "David Wang" <00107082@xxxxxxx> wrote:
>
>At 2025-06-03 20:54:40, "Peter Zijlstra" <peterz@xxxxxxxxxxxxx> wrote:
>>On Tue, Jun 03, 2025 at 02:50:56PM +0200, Peter Zijlstra wrote:
>>> On Tue, Jun 03, 2025 at 06:44:58PM +0800, David Wang wrote:
>>>
>>>
>>> > (As yeoreum.yun@xxxxxxx pointed out, the change in perf_remove_from_context() made
>>> > perf_event_set_state() happen before list_del_event(), resulting in perf_cgroup_event_disable()
>>> > not being called.)
>>>
>>> Aah, d'0h. Let me see what we should do there.
>>
>>Does this help? This way event_sched_out() will call
>>perf_cgroup_event_disable().
>>
>>
>>diff --git a/kernel/events/core.c b/kernel/events/core.c
>>index f34c99f8ce8f..adbb0372825f 100644
>>--- a/kernel/events/core.c
>>+++ b/kernel/events/core.c
>>@@ -2494,9 +2494,9 @@ __perf_remove_from_context(struct perf_event *event,
>> 	if (flags & DETACH_REVOKE)
>> 		state = PERF_EVENT_STATE_REVOKED;
>> 	if (flags & DETACH_DEAD) {
>>-		event->pending_disable = 1;
>> 		state = PERF_EVENT_STATE_DEAD;
>> 	}
>>+	event->pending_disable = 1;
>> 	event_sched_out(event, ctx);
>> 	perf_event_set_state(event, min(event->state, state));
>>
>
>Ok, I will give it a try and update later.

Sadly no, I caught a kernel panic on the first round....

I tried to use perf to reproduce this, but no luck so far. The following is the code I used to reproduce it.

(The code is silly, but valid, I think....)
To reproduce, I use the following steps (a consolidated script sketch follows the list):
Open two terminals:
1. In terminal A
mkdir /sys/fs/cgroup/mytest
echo $$ > /sys/fs/cgroup/mytest/cgroup.procs
2. In terminal B
[compile the code below first, if not done yet: g++ -o profiler xx.cpp]
./profiler mytest
3. Do something in terminal A; usually I run the following command under a kernel source tree:
for i in {1..200}; do find ./ -name nottobefound > /dev/null; done
4. Wait for 5~10 minutes.
5. In terminal B, press ctrl-C to stop the profiler.
6. Reboot.
(On my system, with 6.15, at most 4 rounds of testing would catch a kernel panic.)
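
For convenience, here is a rough single-session sketch of the steps above (only a sketch: it assumes running as root, cgroup v2 mounted at /sys/fs/cgroup, the reproducer already built as ./profiler, and a kernel source tree as the working directory):

#!/bin/sh
set -e
mkdir -p /sys/fs/cgroup/mytest                       # step 1
./profiler mytest &                                  # step 2
PROFILER=$!
# steps 1+3: move a child shell into the cgroup and generate some load there
sh -c 'echo $$ > /sys/fs/cgroup/mytest/cgroup.procs
       for i in $(seq 1 200); do find ./ -name nottobefound > /dev/null; done'
sleep 600                                            # step 4: wait 5~10 minutes
kill -INT "$PROFILER"                                # step 5: ctrl-C equivalent
wait "$PROFILER" || true
# step 6: reboot manually and repeat if no panic was observed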

I could not reproduce it in my KVM guest; maybe I need more trials.
I am not sure whether anyone else can reproduce this.


---
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/perf_event.h>
#include <asm/unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <poll.h>
#include <signal.h>
#include <fcntl.h>
#include <elf.h>

#include <vector>
#include <string>
#include <map>
#include <unordered_map>
#include <unordered_set>
#include <algorithm>
using namespace std;


#define MAXN 512
#define MAXCPU 128
#define error(msg) do { perror(msg); exit(1); } while(0)

// Thin wrapper for the perf_event_open(2) syscall (glibc provides no wrapper).
static long perf_event_open(struct perf_event_attr *perf_event,
                            pid_t pid, int cpu, int group_fd, unsigned long flags) {
    return syscall(__NR_perf_event_open, perf_event,
                   pid, cpu, group_fd, flags);
}

struct pollfd polls[MAXCPU];
// res: fd -> {mmap base, last processed data_head}, kept for cleanup in int_exit()
static long long psize;
map<int, pair<void*, long long>> res;
static long long eventc = 0;

// SIGINT/SIGTERM handler: unmap the ring buffers, close the fds and report.
void int_exit(int _) {
    for (auto x: res) {
        auto y = x.second;
        void* addr = y.first;
        munmap(addr, (1+MAXN)*psize);
        close(x.first);
    }
    res.clear();
    printf("total %lld events collected\n", eventc);
    exit(0);
}
// Count one record and return its size so the caller can advance its read offset.
int process_event(char *base, unsigned long long size, unsigned long long offset) {
    struct perf_event_header* p = NULL;
    offset %= size;
    p = (struct perf_event_header*) (base + offset);
    eventc++;
    return p->size;
}

int main(int argc, char *argv[]) {
    if (argc < 2) { printf("Need cgroup name\n"); return 1; }
    char xb[256];
    snprintf(xb, sizeof(xb), "/sys/fs/cgroup/%s", argv[1]);
    int cgroup_id = open(xb, O_CLOEXEC);
    if (cgroup_id <= 0) error("error open cgroup dir");
    int cpu_num = sysconf(_SC_NPROCESSORS_ONLN);
    psize = sysconf(_SC_PAGE_SIZE); // getpagesize();
    struct perf_event_attr attr;
    memset(&attr, 0, sizeof(attr));
    attr.type = PERF_TYPE_SOFTWARE;
    attr.size = sizeof(attr);
    attr.config = PERF_COUNT_SW_CPU_CLOCK;
    attr.sample_freq = 9999; //777; // adjust it
    attr.freq = 1;
    attr.wakeup_events = 16;
    attr.sample_type = PERF_SAMPLE_CALLCHAIN;
    attr.sample_max_stack = 32;
    attr.exclude_callchain_user = 1;
    // Start one cgroup-scoped perf event per online CPU.
    int i, k, fd;
    void* addr;
    for (i = 0, k = 0; i < cpu_num && i < MAXCPU; i++) {
        printf("attaching cpu %d\n", i);
        fd = perf_event_open(&attr, cgroup_id, i, -1, PERF_FLAG_FD_CLOEXEC|PERF_FLAG_PID_CGROUP);
        if (fd < 0) error("fail to open perf event");
        // Read-only mapping: the ring buffer runs in overwrite mode.
        addr = mmap(NULL, (1+MAXN)*psize, PROT_READ, MAP_SHARED, fd, 0);
        if (addr == MAP_FAILED) error("mmap failed");
        res[fd] = {addr, 0};
        polls[k].fd = fd;
        polls[k].events = POLLIN;
        polls[k].revents = 0;
        k++;
    }
    signal(SIGINT, int_exit);
    signal(SIGTERM, int_exit);

    unsigned long long head;
    int event_size;
    struct perf_event_mmap_page *mp;
    while (poll(polls, k, -1) > 0) {
        for (i = 0; i < k; i++) {
            if ((polls[i].revents & POLLIN) == 0) continue;
            fd = polls[i].fd;
            addr = res[fd].first;
            mp = (struct perf_event_mmap_page *)addr;
            head = res[fd].second;
            // Pause the ring buffer while reading it (the read-only mapping
            // means data_tail is never updated, so records may get overwritten).
            ioctl(fd, PERF_EVENT_IOC_PAUSE_OUTPUT, 1);
            if (head > mp->data_head) head = mp->data_head;
            head = mp->data_head - ((mp->data_head - head) % mp->data_size);
            while (head < mp->data_head) {
                head += process_event((char*)addr + mp->data_offset, mp->data_size, head);
            }
            res[fd].second = mp->data_head;
            ioctl(fd, PERF_EVENT_IOC_PAUSE_OUTPUT, 0);
        }
    }
    int_exit(0);
    return 0;
}