[PATCH v2] tracing: Add test for user space strings when filtering on string pointers

From: Steven Rostedt
Date: Mon Jan 10 2022 - 11:55:39 EST


From: Steven Rostedt <rostedt@xxxxxxxxxxx>

Pingfan reported that the following causes a fault:

echo "filename ~ \"cpu\"" > events/syscalls/sys_enter_openat/filter
echo 1 > events/syscalls/sys_enter_at/enable

The reason is that trace event filter treats the user space pointer
defined by "filename" as a normal pointer to compare against the "cpu"
string. If the string is not loaded into memory yet, it will trigger a
fault in kernel space:

kvm-03-guest16 login: [72198.026181] BUG: unable to handle page fault for address: 00007fffaae8ef60
#PF: supervisor read access in kernel mode
#PF: error_code(0x0001) - permissions violation
PGD 80000001008b7067 P4D 80000001008b7067 PUD 2393f1067 PMD 2393ec067 PTE 8000000108f47867
Oops: 0001 [#1] PREEMPT SMP PTI
CPU: 1 PID: 1 Comm: systemd Kdump: loaded Not tainted 5.14.0-32.el9.x86_64 #1
Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
RIP: 0010:strlen+0x0/0x20
Code: 48 89 f9 74 09 48 83 c1 01 80 39 00 75 f7 31 d2 44 0f b6 04 16 44 88 04 11
48 83 c2 01 45 84 c0 75 ee c3 0f 1f 80 00 00 00 00 <80> 3f 00 74 10 48 89 f8
48 83 c0 01 80 38 00 75 f7 48 29 f8 c3 31
RSP: 0018:ffffb5b900013e48 EFLAGS: 00010246
RAX: 0000000000000018 RBX: ffff8fc1c49ede00 RCX: 0000000000000000
RDX: 0000000000000020 RSI: ffff8fc1c02d601c RDI: 00007fffaae8ef60
RBP: 00007fffaae8ef60 R08: 0005034f4ddb8ea4 R09: 0000000000000000
R10: ffff8fc1c02d601c R11: 0000000000000000 R12: ffff8fc1c8a6e380
R13: 0000000000000000 R14: ffff8fc1c02d6010 R15: ffff8fc1c00453c0
FS: 00007fa86123db40(0000) GS:ffff8fc2ffd00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007fffaae8ef60 CR3: 0000000102880001 CR4: 00000000007706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
PKRU: 55555554
Call Trace:
filter_pred_pchar+0x18/0x40
filter_match_preds+0x31/0x70
ftrace_syscall_enter+0x27a/0x2c0
syscall_trace_enter.constprop.0+0x1aa/0x1d0
do_syscall_64+0x16/0x90
entry_SYSCALL_64_after_hwframe+0x44/0xae
RIP: 0033:0x7fa861d88664

To be even more robust, test both kernel and user space strings. If the
string fails to read, then simply have the filter fail.

Link: https://lore.kernel.org/all/20220107044951.22080-1-kernelfans@xxxxxxxxx/

Cc: stable@xxxxxxxxxxxxxxx
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Cc: Masami Hiramatsu <mhiramat@xxxxxxxxxx>
Cc: Tom Zanussi <zanussi@xxxxxxxxxx>
Reported-by: Pingfan Liu <kernelfans@xxxxxxxxx>
Fixes: 87a342f5db69d ("tracing/filters: Support filtering for char * strings")
Signed-off-by: Steven Rostedt <rostedt@xxxxxxxxxxx>
---
Changes since v1: https://lkml.kernel.org/r/20220107225840.003487216@xxxxxxxxxxx

- Up the size of temp buffer from 512 to 1024
Still less than PATH_MAX, but should be good enough in real systems.
If this is an issue, then we can add a way for users to updated it.

- Add documentation about the limit of comparing to string pointers.

- Add typecast from str to ustr to denote adding "__user"

- Allocate buffer only when comparing with string pointers.

- Only do the test on string pointers and not all string compares.

Documentation/trace/events.rst | 10 +++++
kernel/trace/trace_events_filter.c | 59 ++++++++++++++++++++++++++++--
2 files changed, 66 insertions(+), 3 deletions(-)

diff --git a/Documentation/trace/events.rst b/Documentation/trace/events.rst
index 8ddb9b09451c..45e66a60a816 100644
--- a/Documentation/trace/events.rst
+++ b/Documentation/trace/events.rst
@@ -230,6 +230,16 @@ Currently the caret ('^') for an error always appears at the beginning of
the filter string; the error message should still be useful though
even without more accurate position info.

+5.2.1 Filter limitations
+------------------------
+
+If a filter is placed on a string pointer ``(char *)`` that does not point
+to a string on the ring buffer, but instead points to kernel or user space
+memory, then, for safety reasons, at most 1024 bytes of the content is
+copied onto a temporary buffer to do the compare. If the copy of the memory
+faults (the pointer points to memory that should not be accessed), then the
+string compare will be treated as not matching.
+
5.3 Clearing filters
--------------------

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 996920ed1812..91352a64be09 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -5,6 +5,7 @@
* Copyright (C) 2009 Tom Zanussi <tzanussi@xxxxxxxxx>
*/

+#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/ctype.h>
#include <linux/mutex.h>
@@ -654,6 +655,40 @@ DEFINE_EQUALITY_PRED(32);
DEFINE_EQUALITY_PRED(16);
DEFINE_EQUALITY_PRED(8);

+/* user space strings temp buffer */
+#define USTRING_BUF_SIZE 1024
+
+struct ustring_buffer {
+ char buffer[USTRING_BUF_SIZE];
+};
+
+static __percpu struct ustring_buffer *ustring_per_cpu;
+
+static __always_inline char *test_string(char *str)
+{
+ struct ustring_buffer *ubuf;
+ char __user *ustr;
+ char *kstr;
+
+ if (!ustring_per_cpu)
+ return NULL;
+
+ ubuf = this_cpu_ptr(ustring_per_cpu);
+ kstr = ubuf->buffer;
+
+ if (likely((unsigned long)str >= TASK_SIZE)) {
+ /* For safety, do not trust the string pointer */
+ if (!strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE))
+ return NULL;
+ } else {
+ /* user space address? */
+ ustr = (char __user *)str;
+ if (!strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE))
+ return NULL;
+ }
+ return kstr;
+}
+
/* Filter predicate for fixed sized arrays of characters */
static int filter_pred_string(struct filter_pred *pred, void *event)
{
@@ -671,10 +706,16 @@ static int filter_pred_string(struct filter_pred *pred, void *event)
static int filter_pred_pchar(struct filter_pred *pred, void *event)
{
char **addr = (char **)(event + pred->offset);
+ char *str;
int cmp, match;
- int len = strlen(*addr) + 1; /* including tailing '\0' */
+ int len;

- cmp = pred->regex.match(*addr, &pred->regex, len);
+ str = test_string(*addr);
+ if (!str)
+ return 0;
+
+ len = strlen(str) + 1; /* including tailing '\0' */
+ cmp = pred->regex.match(str, &pred->regex, len);

match = cmp ^ pred->not;

@@ -1348,8 +1389,17 @@ static int parse_pred(const char *str, void *data,
pred->fn = filter_pred_strloc;
} else if (field->filter_type == FILTER_RDYN_STRING)
pred->fn = filter_pred_strrelloc;
- else
+ else {
+
+ if (!ustring_per_cpu) {
+ /* Once allocated, keep it around for good */
+ ustring_per_cpu = alloc_percpu(struct ustring_buffer);
+ if (!ustring_per_cpu)
+ goto err_mem;
+ }
+
pred->fn = filter_pred_pchar;
+ }
/* go past the last quote */
i++;

@@ -1415,6 +1465,9 @@ static int parse_pred(const char *str, void *data,
err_free:
kfree(pred);
return -EINVAL;
+err_mem:
+ kfree(pred);
+ return -ENOMEM;
}

enum {
--
2.33.0