Re: [PATCH v7 02/13] user_events: Add minimal support for trace_event into ftrace

From: Masami Hiramatsu
Date: Fri Dec 10 2021 - 05:44:07 EST


Hi Beau,

Thanks for updating the patch! I have some comments below.

On Thu, 9 Dec 2021 14:31:59 -0800
Beau Belgrave <beaub@xxxxxxxxxxxxxxxxxxx> wrote:

[..]
> +#define USER_EVENTS_PREFIX_LEN (sizeof(USER_EVENTS_PREFIX)-1)
> +
> +#define FIELD_DEPTH_TYPE 0
> +#define FIELD_DEPTH_NAME 1
> +#define FIELD_DEPTH_SIZE 2
> +
> +/*
> + * Limits how many trace_event calls user processes can create:
> + * Must be multiple of PAGE_SIZE.
> + */
> +#define MAX_PAGES 1
> +#define MAX_EVENTS (MAX_PAGES * PAGE_SIZE)
> +
> +/* Limit how long of an event name plus args within the subsystem. */
> +#define MAX_EVENT_DESC 512
> +#define EVENT_NAME(user_event) ((user_event)->tracepoint.name)
> +#define MAX_FIELD_ARRAY_SIZE (2 * PAGE_SIZE)

I don't recommend to record the event which size is more than a page size...
Maybe 256 entries?
It is also better to limit the total size of the event and the number
of fields (arguments).

Steve, can we write such a big event data on the trace buffer?

[..]
> +
> +static int user_field_array_size(const char *type)
> +{
> + const char *start = strchr(type, '[');
> + char val[8];
> + int size = 0;
> +
> + if (start == NULL)
> + return -EINVAL;
> +
> + start++;
> +
> + while (*start != ']' && size < (sizeof(val) - 1))
> + val[size++] = *start++;
> +
> + if (*start != ']')
> + return -EINVAL;
> +
> + val[size] = 0;

It's '\0', not 0.

If I were you, I just use strlcpy(val, start, sizeof(val)), and
strchr(val, ']'). Sometimes using standard libc function will
be easer to understand what it does. :)

> +
> + if (kstrtouint(val, 0, &size))
> + return -EINVAL;
> +
> + if (size > MAX_FIELD_ARRAY_SIZE)
> + return -EINVAL;
> +
> + return size;
> +}
> +
> +static int user_field_size(const char *type)
> +{
> + /* long is not allowed from a user, since it's ambigious in size */
> + if (strcmp(type, "s64") == 0)
> + return sizeof(s64);
> + if (strcmp(type, "u64") == 0)
> + return sizeof(u64);
> + if (strcmp(type, "s32") == 0)
> + return sizeof(s32);
> + if (strcmp(type, "u32") == 0)
> + return sizeof(u32);
> + if (strcmp(type, "int") == 0)
> + return sizeof(int);
> + if (strcmp(type, "unsigned int") == 0)
> + return sizeof(unsigned int);
> + if (strcmp(type, "s16") == 0)
> + return sizeof(s16);
> + if (strcmp(type, "u16") == 0)
> + return sizeof(u16);
> + if (strcmp(type, "short") == 0)
> + return sizeof(short);
> + if (strcmp(type, "unsigned short") == 0)
> + return sizeof(unsigned short);
> + if (strcmp(type, "s8") == 0)
> + return sizeof(s8);
> + if (strcmp(type, "u8") == 0)
> + return sizeof(u8);
> + if (strcmp(type, "char") == 0)
> + return sizeof(char);
> + if (strcmp(type, "unsigned char") == 0)
> + return sizeof(unsigned char);
> + if (str_has_prefix(type, "char["))
> + return user_field_array_size(type);
> + if (str_has_prefix(type, "unsigned char["))
> + return user_field_array_size(type);
> + if (str_has_prefix(type, "__data_loc "))
> + return sizeof(u32);
> + if (str_has_prefix(type, "__rel_loc "))
> + return sizeof(u32);
> +
> + /* Uknown basic type, error */
> + return -EINVAL;
> +}
> +
> +static void user_event_destroy_fields(struct user_event *user)
> +{
> + struct ftrace_event_field *field, *next;
> + struct list_head *head = &user->fields;
> +
> + list_for_each_entry_safe(field, next, head, link) {
> + list_del(&field->link);
> + kfree(field);
> + }
> +}
> +
> +static int user_event_add_field(struct user_event *user, const char *type,
> + const char *name, int offset, int size,
> + int is_signed, int filter_type)
> +{
> + struct ftrace_event_field *field;
> +
> + field = kmalloc(sizeof(*field), GFP_KERNEL);
> +
> + if (!field)
> + return -ENOMEM;
> +
> + field->type = type;
> + field->name = name;
> + field->offset = offset;
> + field->size = size;
> + field->is_signed = is_signed;
> + field->filter_type = filter_type;
> +
> + list_add(&field->link, &user->fields);

I recommend to use list_add_tail() here so that when accessing the
list of field without reverse order. (I found this in [4/13])

> +
> + return 0;
> +}
> +
> +/*
> + * Parses the values of a field within the description
> + * Format: type name [size]

Hmm, don't you accept redundant spaces and tabs?
If this accepts the redundant spaces/tabs, I recommend you to use
argv_split() instead of strpbrk() etc. e.g.

int argc, name_idx = 0, size;
int ret = -EINVAL;
char **argv;

argv = argv_split(GFP_KERNEL, field, &argc);
if (!argv)
return -ENOMEM;

if (!strcmp(argv[pos], "__data_loc") ||
!strcmp(argv[pos], "__rel_loc")) {
if (++pos >= argc)
goto error;
}
if (!strcmp(argv[pos], "unsigned")) {
if (++pos >= argc)
goto error;
} else if (!strcmp(argv[pos], "struct")) {
is_struct = true;
if (++pos >= argc)
goto error;
}
if (++pos >= argc)
goto error;
name_idx = pos++;
if (pos < argc) { // size
if (!is_struct)
goto error;
if (kstrtou32(argv[pos++], 10, &size))
goto error;
} else
size = user_field_size(argv[name_idx - 1]);

if (pos != argc)
goto error;

// note that type index is always 0 and size must be converted.
user_event_add_field(user, argv, name_idx, saved_offset, size,
type[0] != 'u', FILTER_OTHER);

ret = 0;
error:
argv_free(argv);
return ret;

(This also requires to simplify user_field_size() and remove FIELD_DEPTH_*)
What would you think?

> + */
> +static int user_event_parse_field(char *field, struct user_event *user,
> + u32 *offset)
> +{
> + char *part, *type, *name;
> + u32 depth = 0, saved_offset = *offset;
> + int len, size = -EINVAL;
> + bool is_struct = false;
> +
> + field = skip_spaces(field);
> +
> + if (*field == 0)
> + return 0;
> +
> + /* Handle types that have a space within */
> + len = str_has_prefix(field, "unsigned ");
> + if (len)
> + goto skip_next;
> +
> + len = str_has_prefix(field, "struct ");
> + if (len) {
> + is_struct = true;
> + goto skip_next;
> + }
> +
> + len = str_has_prefix(field, "__data_loc unsigned ");
> + if (len)
> + goto skip_next;
> +
> + len = str_has_prefix(field, "__data_loc ");
> + if (len)
> + goto skip_next;
> +
> + len = str_has_prefix(field, "__rel_loc unsigned ");
> + if (len)
> + goto skip_next;
> +
> + len = str_has_prefix(field, "__rel_loc ");
> + if (len)
> + goto skip_next;
> +
> + goto parse;
> +skip_next:
> + type = field;
> + field = strpbrk(field + len, " ");
> +
> + if (field == NULL)
> + return -EINVAL;
> +
> + *field++ = 0;
> + depth++;
> +parse:
> + while ((part = strsep(&field, " ")) != NULL) {
> + switch (depth++) {
> + case FIELD_DEPTH_TYPE:
> + type = part;
> + break;
> + case FIELD_DEPTH_NAME:
> + name = part;
> + break;
> + case FIELD_DEPTH_SIZE:
> + if (!is_struct)
> + return -EINVAL;
> +
> + if (kstrtou32(part, 10, &size))
> + return -EINVAL;
> + break;
> + default:
> + return -EINVAL;
> + }
> + }
> +
> + if (depth < FIELD_DEPTH_SIZE)
> + return -EINVAL;
> +
> + if (depth == FIELD_DEPTH_SIZE)
> + size = user_field_size(type);
> +
> + if (size == 0)
> + return -EINVAL;
> +
> + if (size < 0)
> + return size;
> +
> + *offset = saved_offset + size;
> +
> + return user_event_add_field(user, type, name, saved_offset, size,
> + type[0] != 'u', FILTER_OTHER);
> +}

[..]
> +
> +/*
> + * Register callback for our events from tracing sub-systems.
> + */
> +static int user_event_reg(struct trace_event_call *call,
> + enum trace_reg type,
> + void *data)
> +{
> + struct user_event *user = (struct user_event *)call->data;
> + int ret = 0;
> +
> + if (!user)
> + return -ENOENT;
> +
> + switch (type) {
> + case TRACE_REG_REGISTER:
> + ret = tracepoint_probe_register(call->tp,
> + call->class->probe,
> + data);
> + if (!ret)
> + goto inc;
> + break;
> +
> + case TRACE_REG_UNREGISTER:
> + tracepoint_probe_unregister(call->tp,
> + call->class->probe,
> + data);
> + goto dec;
> +
> +#ifdef CONFIG_PERF_EVENTS
> + case TRACE_REG_PERF_REGISTER:
> + case TRACE_REG_PERF_UNREGISTER:
> + case TRACE_REG_PERF_OPEN:
> + case TRACE_REG_PERF_CLOSE:
> + case TRACE_REG_PERF_ADD:
> + case TRACE_REG_PERF_DEL:
> + break;
> +#endif

At this moment (in this patch), you can just add a default case,
or just ignore it, because it does nothing.

> + }
> +
> + return ret;
> +inc:
> + atomic_inc(&user->refcnt);
> + update_reg_page_for(user);
> + return 0;
> +dec:
> + update_reg_page_for(user);
> + atomic_dec(&user->refcnt);
> + return 0;
> +}
> +

[..]
> +/*
> + * Validates the user payload and writes via iterator.
> + */
> +static ssize_t user_events_write_core(struct file *file, struct iov_iter *i)
> +{
> + struct user_event_refs *refs;
> + struct user_event *user = NULL;
> + struct tracepoint *tp;
> + ssize_t ret = i->count;
> + int idx;
> +
> + if (unlikely(copy_from_iter(&idx, sizeof(idx), i) != sizeof(idx)))
> + return -EFAULT;
> +
> + rcu_read_lock_sched();
> +
> + refs = rcu_dereference_sched(file->private_data);
> +
> + /*
> + * The refs->events array is protected by RCU, and new items may be
> + * added. But the user retrieved from indexing into the events array
> + * shall be immutable while the file is opened.
> + */
> + if (likely(refs && idx < refs->count))
> + user = refs->events[idx];
> +
> + rcu_read_unlock_sched();
> +
> + if (unlikely(user == NULL))
> + return -ENOENT;
> +
> + tp = &user->tracepoint;
> +
> + /*
> + * It's possible key.enabled disables after this check, however
> + * we don't mind if a few events are included in this condition.
> + */
> + if (likely(atomic_read(&tp->key.enabled) > 0)) {
> + struct tracepoint_func *probe_func_ptr;
> + user_event_func_t probe_func;
> + void *tpdata;
> + void *kdata;
> + u32 datalen;
> +
> + kdata = kmalloc(i->count, GFP_KERNEL);
> +
> + if (unlikely(!kdata))
> + return -ENOMEM;
> +
> + datalen = copy_from_iter(kdata, i->count, i);

Don't we need to add this datalen to ret?

> +
> + rcu_read_lock_sched();
> +
> + probe_func_ptr = rcu_dereference_sched(tp->funcs);
> +
> + if (probe_func_ptr) {
> + do {
> + probe_func = probe_func_ptr->func;
> + tpdata = probe_func_ptr->data;
> + probe_func(user, kdata, datalen, tpdata);
> + } while ((++probe_func_ptr)->func);
> + }
> +
> + rcu_read_unlock_sched();
> +
> + kfree(kdata);
> + }
> +
> + return ret;
> +}

Thank you,



--
Masami Hiramatsu <mhiramat@xxxxxxxxxx>