[PATCH] s390/cpum_sf: add READ_ONCE() semantics to compare and swap loops

From: Heiko Carstens
Date: Thu Jan 05 2023 - 09:44:20 EST


The current cmpxchg_double() loops within the perf hw sampling code do not
have READ_ONCE() semantics to read the old value from memory. This allows
the compiler to generate code which reads the "old" value several times
from memory, which again allows for inconsistencies.

For example:

/* Reset trailer (using compare-double-and-swap) */
do {
te_flags = te->flags & ~SDB_TE_BUFFER_FULL_MASK;
te_flags |= SDB_TE_ALERT_REQ_MASK;
} while (!cmpxchg_double(&te->flags, &te->overflow,
te->flags, te->overflow,
te_flags, 0ULL));

The compiler could generate code where te->flags used within the
cmpxchg_double() call may be refetched from memory and which is not
necessarily identical to the previous read version which was used to
generate te_flags. Which in turn means that an incorrect update could
happen.

Fix this by adding READ_ONCE() semantics to all cmpxchg_double()
loops. Given that READ_ONCE() cannot generate code on s390 which atomically
reads 16 bytes, use a private compare-and-swap-double implementation to
achieve that.

Also replace cmpxchg_double() with the private implementation to be able to
re-use the old value within the loops.

As a side effect this converts the whole code to only use bit fields
to read and modify bits within the hws trailer header.

Reported-by: Alexander Gordeev <agordeev@xxxxxxxxxxxxx>
Acked-by: Alexander Gordeev <agordeev@xxxxxxxxxxxxx>
Acked-by: Hendrik Brueckner <brueckner@xxxxxxxxxxxxx>
Signed-off-by: Heiko Carstens <hca@xxxxxxxxxxxxx>
---
arch/s390/include/asm/cpu_mf.h | 31 +++++-----
arch/s390/kernel/perf_cpum_sf.c | 101 ++++++++++++++++++++------------
2 files changed, 77 insertions(+), 55 deletions(-)

diff --git a/arch/s390/include/asm/cpu_mf.h b/arch/s390/include/asm/cpu_mf.h
index feaba12dbecb..efa103b52a1a 100644
--- a/arch/s390/include/asm/cpu_mf.h
+++ b/arch/s390/include/asm/cpu_mf.h
@@ -131,19 +131,21 @@ struct hws_combined_entry {
struct hws_diag_entry diag; /* Diagnostic-sampling data entry */
} __packed;

-struct hws_trailer_entry {
- union {
- struct {
- unsigned int f:1; /* 0 - Block Full Indicator */
- unsigned int a:1; /* 1 - Alert request control */
- unsigned int t:1; /* 2 - Timestamp format */
- unsigned int :29; /* 3 - 31: Reserved */
- unsigned int bsdes:16; /* 32-47: size of basic SDE */
- unsigned int dsdes:16; /* 48-63: size of diagnostic SDE */
- };
- unsigned long long flags; /* 0 - 63: All indicators */
+union hws_trailer_header {
+ struct {
+ unsigned int f:1; /* 0 - Block Full Indicator */
+ unsigned int a:1; /* 1 - Alert request control */
+ unsigned int t:1; /* 2 - Timestamp format */
+ unsigned int :29; /* 3 - 31: Reserved */
+ unsigned int bsdes:16; /* 32-47: size of basic SDE */
+ unsigned int dsdes:16; /* 48-63: size of diagnostic SDE */
+ unsigned long long overflow; /* 64 - Overflow Count */
};
- unsigned long long overflow; /* 64 - sample Overflow count */
+ __uint128_t val;
+};
+
+struct hws_trailer_entry {
+ union hws_trailer_header header; /* 0 - 15 Flags + Overflow Count */
unsigned char timestamp[16]; /* 16 - 31 timestamp */
unsigned long long reserved1; /* 32 -Reserved */
unsigned long long reserved2; /* */
@@ -290,14 +292,11 @@ static inline unsigned long sample_rate_to_freq(struct hws_qsi_info_block *qsi,
return USEC_PER_SEC * qsi->cpu_speed / rate;
}

-#define SDB_TE_ALERT_REQ_MASK 0x4000000000000000UL
-#define SDB_TE_BUFFER_FULL_MASK 0x8000000000000000UL
-
/* Return TOD timestamp contained in an trailer entry */
static inline unsigned long long trailer_timestamp(struct hws_trailer_entry *te)
{
/* TOD in STCKE format */
- if (te->t)
+ if (te->header.t)
return *((unsigned long long *) &te->timestamp[1]);

/* TOD in STCK format */
diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c
index 332a49965130..ce886a03545a 100644
--- a/arch/s390/kernel/perf_cpum_sf.c
+++ b/arch/s390/kernel/perf_cpum_sf.c
@@ -163,14 +163,15 @@ static void free_sampling_buffer(struct sf_buffer *sfb)

static int alloc_sample_data_block(unsigned long *sdbt, gfp_t gfp_flags)
{
- unsigned long sdb, *trailer;
+ struct hws_trailer_entry *te;
+ unsigned long sdb;

/* Allocate and initialize sample-data-block */
sdb = get_zeroed_page(gfp_flags);
if (!sdb)
return -ENOMEM;
- trailer = trailer_entry_ptr(sdb);
- *trailer = SDB_TE_ALERT_REQ_MASK;
+ te = (struct hws_trailer_entry *)trailer_entry_ptr(sdb);
+ te->header.a = 1;

/* Link SDB into the sample-data-block-table */
*sdbt = sdb;
@@ -1206,7 +1207,7 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt,
"%s: Found unknown"
" sampling data entry: te->f %i"
" basic.def %#4x (%p)\n", __func__,
- te->f, sample->def, sample);
+ te->header.f, sample->def, sample);
/* Sample slot is not yet written or other record.
*
* This condition can occur if the buffer was reused
@@ -1217,7 +1218,7 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt,
* that are not full. Stop processing if the first
* invalid format was detected.
*/
- if (!te->f)
+ if (!te->header.f)
break;
}

@@ -1227,6 +1228,16 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt,
}
}

+static inline __uint128_t __cdsg(__uint128_t *ptr, __uint128_t old, __uint128_t new)
+{
+ asm volatile(
+ " cdsg %[old],%[new],%[ptr]\n"
+ : [old] "+d" (old), [ptr] "+QS" (*ptr)
+ : [new] "d" (new)
+ : "memory", "cc");
+ return old;
+}
+
/* hw_perf_event_update() - Process sampling buffer
* @event: The perf event
* @flush_all: Flag to also flush partially filled sample-data-blocks
@@ -1243,10 +1254,11 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt,
*/
static void hw_perf_event_update(struct perf_event *event, int flush_all)
{
+ unsigned long long event_overflow, sampl_overflow, num_sdb;
+ union hws_trailer_header old, prev, new;
struct hw_perf_event *hwc = &event->hw;
struct hws_trailer_entry *te;
unsigned long *sdbt;
- unsigned long long event_overflow, sampl_overflow, num_sdb, te_flags;
int done;

/*
@@ -1266,25 +1278,25 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all)
te = (struct hws_trailer_entry *) trailer_entry_ptr(*sdbt);

/* Leave loop if no more work to do (block full indicator) */
- if (!te->f) {
+ if (!te->header.f) {
done = 1;
if (!flush_all)
break;
}

/* Check the sample overflow count */
- if (te->overflow)
+ if (te->header.overflow)
/* Account sample overflows and, if a particular limit
* is reached, extend the sampling buffer.
* For details, see sfb_account_overflows().
*/
- sampl_overflow += te->overflow;
+ sampl_overflow += te->header.overflow;

/* Timestamps are valid for full sample-data-blocks only */
debug_sprintf_event(sfdbg, 6, "%s: sdbt %#lx "
"overflow %llu timestamp %#llx\n",
- __func__, (unsigned long)sdbt, te->overflow,
- (te->f) ? trailer_timestamp(te) : 0ULL);
+ __func__, (unsigned long)sdbt, te->header.overflow,
+ (te->header.f) ? trailer_timestamp(te) : 0ULL);

/* Collect all samples from a single sample-data-block and
* flag if an (perf) event overflow happened. If so, the PMU
@@ -1294,12 +1306,16 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all)
num_sdb++;

/* Reset trailer (using compare-double-and-swap) */
+ /* READ_ONCE() 16 byte header */
+ prev.val = __cdsg(&te->header.val, 0, 0);
do {
- te_flags = te->flags & ~SDB_TE_BUFFER_FULL_MASK;
- te_flags |= SDB_TE_ALERT_REQ_MASK;
- } while (!cmpxchg_double(&te->flags, &te->overflow,
- te->flags, te->overflow,
- te_flags, 0ULL));
+ old.val = prev.val;
+ new.val = prev.val;
+ new.f = 0;
+ new.a = 1;
+ new.overflow = 0;
+ prev.val = __cdsg(&te->header.val, old.val, new.val);
+ } while (prev.val != old.val);

/* Advance to next sample-data-block */
sdbt++;
@@ -1384,7 +1400,7 @@ static void aux_output_end(struct perf_output_handle *handle)
range_scan = AUX_SDB_NUM_ALERT(aux);
for (i = 0, idx = aux->head; i < range_scan; i++, idx++) {
te = aux_sdb_trailer(aux, idx);
- if (!(te->flags & SDB_TE_BUFFER_FULL_MASK))
+ if (!te->header.f)
break;
}
/* i is num of SDBs which are full */
@@ -1392,7 +1408,7 @@ static void aux_output_end(struct perf_output_handle *handle)

/* Remove alert indicators in the buffer */
te = aux_sdb_trailer(aux, aux->alert_mark);
- te->flags &= ~SDB_TE_ALERT_REQ_MASK;
+ te->header.a = 0;

debug_sprintf_event(sfdbg, 6, "%s: SDBs %ld range %ld head %ld\n",
__func__, i, range_scan, aux->head);
@@ -1437,9 +1453,9 @@ static int aux_output_begin(struct perf_output_handle *handle,
idx = aux->empty_mark + 1;
for (i = 0; i < range_scan; i++, idx++) {
te = aux_sdb_trailer(aux, idx);
- te->flags &= ~(SDB_TE_BUFFER_FULL_MASK |
- SDB_TE_ALERT_REQ_MASK);
- te->overflow = 0;
+ te->header.f = 0;
+ te->header.a = 0;
+ te->header.overflow = 0;
}
/* Save the position of empty SDBs */
aux->empty_mark = aux->head + range - 1;
@@ -1448,7 +1464,7 @@ static int aux_output_begin(struct perf_output_handle *handle,
/* Set alert indicator */
aux->alert_mark = aux->head + range/2 - 1;
te = aux_sdb_trailer(aux, aux->alert_mark);
- te->flags = te->flags | SDB_TE_ALERT_REQ_MASK;
+ te->header.a = 1;

/* Reset hardware buffer head */
head = AUX_SDB_INDEX(aux, aux->head);
@@ -1475,14 +1491,17 @@ static int aux_output_begin(struct perf_output_handle *handle,
static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index,
unsigned long long *overflow)
{
- unsigned long long orig_overflow, orig_flags, new_flags;
+ union hws_trailer_header old, prev, new;
struct hws_trailer_entry *te;

te = aux_sdb_trailer(aux, alert_index);
+ /* READ_ONCE() 16 byte header */
+ prev.val = __cdsg(&te->header.val, 0, 0);
do {
- orig_flags = te->flags;
- *overflow = orig_overflow = te->overflow;
- if (orig_flags & SDB_TE_BUFFER_FULL_MASK) {
+ old.val = prev.val;
+ new.val = prev.val;
+ *overflow = old.overflow;
+ if (old.f) {
/*
* SDB is already set by hardware.
* Abort and try to set somewhere
@@ -1490,10 +1509,10 @@ static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index,
*/
return false;
}
- new_flags = orig_flags | SDB_TE_ALERT_REQ_MASK;
- } while (!cmpxchg_double(&te->flags, &te->overflow,
- orig_flags, orig_overflow,
- new_flags, 0ULL));
+ new.a = 1;
+ new.overflow = 0;
+ prev.val = __cdsg(&te->header.val, old.val, new.val);
+ } while (prev.val != old.val);
return true;
}

@@ -1522,8 +1541,9 @@ static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index,
static bool aux_reset_buffer(struct aux_buffer *aux, unsigned long range,
unsigned long long *overflow)
{
- unsigned long long orig_overflow, orig_flags, new_flags;
unsigned long i, range_scan, idx, idx_old;
+ union hws_trailer_header old, prev, new;
+ unsigned long long orig_overflow;
struct hws_trailer_entry *te;

debug_sprintf_event(sfdbg, 6, "%s: range %ld head %ld alert %ld "
@@ -1554,17 +1574,20 @@ static bool aux_reset_buffer(struct aux_buffer *aux, unsigned long range,
idx_old = idx = aux->empty_mark + 1;
for (i = 0; i < range_scan; i++, idx++) {
te = aux_sdb_trailer(aux, idx);
+ /* READ_ONCE() 16 byte header */
+ prev.val = __cdsg(&te->header.val, 0, 0);
do {
- orig_flags = te->flags;
- orig_overflow = te->overflow;
- new_flags = orig_flags & ~SDB_TE_BUFFER_FULL_MASK;
+ old.val = prev.val;
+ new.val = prev.val;
+ orig_overflow = old.overflow;
+ new.f = 0;
+ new.overflow = 0;
if (idx == aux->alert_mark)
- new_flags |= SDB_TE_ALERT_REQ_MASK;
+ new.a = 1;
else
- new_flags &= ~SDB_TE_ALERT_REQ_MASK;
- } while (!cmpxchg_double(&te->flags, &te->overflow,
- orig_flags, orig_overflow,
- new_flags, 0ULL));
+ new.a = 0;
+ prev.val = __cdsg(&te->header.val, old.val, new.val);
+ } while (prev.val != old.val);
*overflow += orig_overflow;
}

--
2.34.1