diff options
Diffstat (limited to 'kernel/trace/trace_syscalls.c')
| -rw-r--r-- | kernel/trace/trace_syscalls.c | 436 |
1 files changed, 417 insertions, 19 deletions
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 0f932b22f9ec..528ac90eda5d 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <trace/syscall.h> #include <trace/events/syscalls.h> +#include <linux/kernel_stat.h> #include <linux/syscalls.h> #include <linux/slab.h> #include <linux/kernel.h> @@ -123,6 +124,9 @@ const char *get_syscall_name(int syscall) return entry->name; } +/* Added to user strings when max limit is reached */ +#define EXTRA "..." + static enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags, struct trace_event *event) @@ -132,7 +136,9 @@ print_syscall_enter(struct trace_iterator *iter, int flags, struct trace_entry *ent = iter->ent; struct syscall_trace_enter *trace; struct syscall_metadata *entry; - int i, syscall; + int i, syscall, val; + unsigned char *ptr; + int len; trace = (typeof(trace))ent; syscall = trace->nr; @@ -167,6 +173,19 @@ print_syscall_enter(struct trace_iterator *iter, int flags, else trace_seq_printf(s, "%s: 0x%lx", entry->args[i], trace->args[i]); + + if (!(BIT(i) & entry->user_mask)) + continue; + + /* This arg points to a user space string */ + ptr = (void *)trace->args + sizeof(long) * entry->nb_args; + val = *(int *)ptr; + + /* The value is a dynamic string (len << 16 | offset) */ + ptr = (void *)ent + (val & 0xffff); + len = val >> 16; + + trace_seq_printf(s, " \"%.*s\"", len, ptr); } trace_seq_putc(s, ')'); @@ -223,15 +242,27 @@ __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); for (i = 0; i < entry->nb_args; i++) { - pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s", - entry->args[i], sizeof(unsigned long), - i == entry->nb_args - 1 ? "" : ", "); + if (i) + pos += snprintf(buf + pos, LEN_OR_ZERO, ", "); + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx", + entry->args[i], sizeof(unsigned long)); + + if (!(BIT(i) & entry->user_mask)) + continue; + + /* Add the format for the user space string */ + pos += snprintf(buf + pos, LEN_OR_ZERO, " \\\"%%s\\\""); } pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); for (i = 0; i < entry->nb_args; i++) { pos += snprintf(buf + pos, LEN_OR_ZERO, ", ((unsigned long)(REC->%s))", entry->args[i]); + if (!(BIT(i) & entry->user_mask)) + continue; + /* The user space string for arg has name __<arg>_val */ + pos += snprintf(buf + pos, LEN_OR_ZERO, ", __get_str(__%s_val)", + entry->args[i]); } #undef LEN_OR_ZERO @@ -277,8 +308,12 @@ static int __init syscall_enter_define_fields(struct trace_event_call *call) { struct syscall_trace_enter trace; struct syscall_metadata *meta = call->data; + unsigned long mask; + char *arg; int offset = offsetof(typeof(trace), args); + int idx; int ret = 0; + int len; int i; for (i = 0; i < meta->nb_args; i++) { @@ -291,9 +326,148 @@ static int __init syscall_enter_define_fields(struct trace_event_call *call) offset += sizeof(unsigned long); } + if (ret || !meta->user_mask) + return ret; + + mask = meta->user_mask; + idx = ffs(mask) - 1; + + /* + * User space strings are faulted into a temporary buffer and then + * added as a dynamic string to the end of the event. + * The user space string name for the arg pointer is "__<arg>_val". + */ + len = strlen(meta->args[idx]) + sizeof("___val"); + arg = kmalloc(len, GFP_KERNEL); + if (WARN_ON_ONCE(!arg)) { + meta->user_mask = 0; + return -ENOMEM; + } + + snprintf(arg, len, "__%s_val", meta->args[idx]); + + ret = trace_define_field(call, "__data_loc char[]", + arg, offset, sizeof(int), 0, + FILTER_OTHER); + if (ret) + kfree(arg); return ret; } +#define SYSCALL_FAULT_BUF_SZ 512 + +/* Use the tracing per CPU buffer infrastructure to copy from user space */ +struct syscall_user_buffer { + struct trace_user_buf_info buf; + struct rcu_head rcu; +}; + +static struct syscall_user_buffer *syscall_buffer; + +static int syscall_fault_buffer_enable(void) +{ + struct syscall_user_buffer *sbuf; + int ret; + + lockdep_assert_held(&syscall_trace_lock); + + if (syscall_buffer) { + trace_user_fault_get(&syscall_buffer->buf); + return 0; + } + + sbuf = kmalloc(sizeof(*sbuf), GFP_KERNEL); + if (!sbuf) + return -ENOMEM; + + ret = trace_user_fault_init(&sbuf->buf, SYSCALL_FAULT_BUF_SZ); + if (ret < 0) { + kfree(sbuf); + return ret; + } + + WRITE_ONCE(syscall_buffer, sbuf); + + return 0; +} + +static void rcu_free_syscall_buffer(struct rcu_head *rcu) +{ + struct syscall_user_buffer *sbuf = + container_of(rcu, struct syscall_user_buffer, rcu); + + trace_user_fault_destroy(&sbuf->buf); + kfree(sbuf); +} + + +static void syscall_fault_buffer_disable(void) +{ + struct syscall_user_buffer *sbuf = syscall_buffer; + + lockdep_assert_held(&syscall_trace_lock); + + if (trace_user_fault_put(&sbuf->buf)) + return; + + WRITE_ONCE(syscall_buffer, NULL); + call_rcu_tasks_trace(&sbuf->rcu, rcu_free_syscall_buffer); +} + +static int syscall_copy_user(char *buf, const char __user *ptr, + size_t size, void *data) +{ + unsigned long *ret_size = data; + int ret; + + ret = strncpy_from_user(buf, ptr, size); + if (ret < 0) + return 1; + *ret_size = ret; + return 0; +} + +static char *sys_fault_user(struct syscall_metadata *sys_data, + struct syscall_user_buffer *sbuf, + unsigned long *args, unsigned int *data_size) +{ + unsigned long size = SYSCALL_FAULT_BUF_SZ - 1; + unsigned long mask = sys_data->user_mask; + int idx = ffs(mask) - 1; + char *ptr; + char *buf; + + /* Get the pointer to user space memory to read */ + ptr = (char *)args[idx]; + *data_size = 0; + + buf = trace_user_fault_read(&sbuf->buf, ptr, size, + syscall_copy_user, &size); + if (!buf) + return NULL; + + /* Replace any non-printable characters with '.' */ + for (int i = 0; i < size; i++) { + if (!isprint(buf[i])) + buf[i] = '.'; + } + + /* + * If the text was truncated due to our max limit, add "..." to + * the string. + */ + if (size > SYSCALL_FAULT_BUF_SZ - sizeof(EXTRA)) { + strscpy(buf + SYSCALL_FAULT_BUF_SZ - sizeof(EXTRA), + EXTRA, sizeof(EXTRA)); + size = SYSCALL_FAULT_BUF_SZ; + } else { + buf[size++] = '\0'; + } + + *data_size = size; + return buf; +} + static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) { struct trace_array *tr = data; @@ -302,15 +476,17 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) struct syscall_metadata *sys_data; struct trace_event_buffer fbuffer; unsigned long args[6]; + char *user_ptr; + int user_size = 0; int syscall_nr; - int size; + int size = 0; + bool mayfault; /* * Syscall probe called with preemption enabled, but the ring * buffer and per-cpu data require preemption to be disabled. */ might_fault(); - guard(preempt_notrace)(); syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0 || syscall_nr >= NR_syscalls) @@ -327,7 +503,32 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) if (!sys_data) return; - size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; + /* Check if this syscall event faults in user space memory */ + mayfault = sys_data->user_mask != 0; + + guard(preempt_notrace)(); + + syscall_get_arguments(current, regs, args); + + if (mayfault) { + struct syscall_user_buffer *sbuf; + + /* If the syscall_buffer is NULL, tracing is being shutdown */ + sbuf = READ_ONCE(syscall_buffer); + if (!sbuf) + return; + + user_ptr = sys_fault_user(sys_data, sbuf, args, &user_size); + /* + * user_size is the amount of data to append. + * Need to add 4 for the meta field that points to + * the user memory at the end of the event and also + * stores its size. + */ + size = 4 + user_size; + } + + size += sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; entry = trace_event_buffer_reserve(&fbuffer, trace_file, size); if (!entry) @@ -335,9 +536,36 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) entry = ring_buffer_event_data(fbuffer.event); entry->nr = syscall_nr; - syscall_get_arguments(current, regs, args); + memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args); + if (mayfault) { + void *ptr; + int val; + + /* + * Set the pointer to point to the meta data of the event + * that has information about the stored user space memory. + */ + ptr = (void *)entry->args + sizeof(unsigned long) * sys_data->nb_args; + + /* + * The meta data will store the offset of the user data from + * the beginning of the event. + */ + val = (ptr - (void *)entry) + 4; + + /* Store the offset and the size into the meta data */ + *(int *)ptr = val | (user_size << 16); + + /* Nothing to do if the user space was empty or faulted */ + if (user_size) { + /* Now store the user space data into the event */ + ptr += 4; + memcpy(ptr, user_ptr, user_size); + } + } + trace_event_buffer_commit(&fbuffer); } @@ -386,39 +614,50 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) static int reg_event_syscall_enter(struct trace_event_file *file, struct trace_event_call *call) { + struct syscall_metadata *sys_data = call->data; struct trace_array *tr = file->tr; int ret = 0; int num; - num = ((struct syscall_metadata *)call->data)->syscall_nr; + num = sys_data->syscall_nr; if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return -ENOSYS; - mutex_lock(&syscall_trace_lock); - if (!tr->sys_refcount_enter) + guard(mutex)(&syscall_trace_lock); + if (sys_data->user_mask) { + ret = syscall_fault_buffer_enable(); + if (ret < 0) + return ret; + } + if (!tr->sys_refcount_enter) { ret = register_trace_sys_enter(ftrace_syscall_enter, tr); - if (!ret) { - WRITE_ONCE(tr->enter_syscall_files[num], file); - tr->sys_refcount_enter++; + if (ret < 0) { + if (sys_data->user_mask) + syscall_fault_buffer_disable(); + return ret; + } } - mutex_unlock(&syscall_trace_lock); - return ret; + WRITE_ONCE(tr->enter_syscall_files[num], file); + tr->sys_refcount_enter++; + return 0; } static void unreg_event_syscall_enter(struct trace_event_file *file, struct trace_event_call *call) { + struct syscall_metadata *sys_data = call->data; struct trace_array *tr = file->tr; int num; - num = ((struct syscall_metadata *)call->data)->syscall_nr; + num = sys_data->syscall_nr; if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return; - mutex_lock(&syscall_trace_lock); + guard(mutex)(&syscall_trace_lock); tr->sys_refcount_enter--; WRITE_ONCE(tr->enter_syscall_files[num], NULL); if (!tr->sys_refcount_enter) unregister_trace_sys_enter(ftrace_syscall_enter, tr); - mutex_unlock(&syscall_trace_lock); + if (sys_data->user_mask) + syscall_fault_buffer_disable(); } static int reg_event_syscall_exit(struct trace_event_file *file, @@ -459,6 +698,163 @@ static void unreg_event_syscall_exit(struct trace_event_file *file, mutex_unlock(&syscall_trace_lock); } +/* + * For system calls that reference user space memory that can + * be recorded into the event, set the system call meta data's user_mask + * to the "args" index that points to the user space memory to retrieve. + */ +static void check_faultable_syscall(struct trace_event_call *call, int nr) +{ + struct syscall_metadata *sys_data = call->data; + + /* Only work on entry */ + if (sys_data->enter_event != call) + return; + + switch (nr) { + /* user arg at position 0 */ +#ifdef __NR_access + case __NR_access: +#endif + case __NR_acct: + case __NR_add_key: /* Just _type. TODO add _description */ + case __NR_chdir: +#ifdef __NR_chown + case __NR_chown: +#endif +#ifdef __NR_chmod + case __NR_chmod: +#endif + case __NR_chroot: +#ifdef __NR_creat + case __NR_creat: +#endif + case __NR_delete_module: + case __NR_execve: + case __NR_fsopen: + case __NR_getxattr: /* Just pathname, TODO add name */ +#ifdef __NR_lchown + case __NR_lchown: +#endif + case __NR_lgetxattr: /* Just pathname, TODO add name */ + case __NR_lremovexattr: /* Just pathname, TODO add name */ +#ifdef __NR_link + case __NR_link: /* Just oldname. TODO add newname */ +#endif + case __NR_listxattr: /* Just pathname, TODO add list */ + case __NR_llistxattr: /* Just pathname, TODO add list */ + case __NR_lsetxattr: /* Just pathname, TODO add list */ +#ifdef __NR_open + case __NR_open: +#endif + case __NR_memfd_create: + case __NR_mount: /* Just dev_name, TODO add dir_name and type */ +#ifdef __NR_mkdir + case __NR_mkdir: +#endif +#ifdef __NR_mknod + case __NR_mknod: +#endif + case __NR_mq_open: + case __NR_mq_unlink: + case __NR_pivot_root: /* Just new_root, TODO add old_root */ +#ifdef __NR_readlink + case __NR_readlink: +#endif + case __NR_removexattr: /* Just pathname, TODO add name */ +#ifdef __NR_rename + case __NR_rename: /* Just oldname. TODO add newname */ +#endif + case __NR_request_key: /* Just _type. TODO add _description */ +#ifdef __NR_rmdir + case __NR_rmdir: +#endif + case __NR_setxattr: /* Just pathname, TODO add list */ + case __NR_shmdt: +#ifdef __NR_statfs + case __NR_statfs: +#endif + case __NR_swapon: + case __NR_swapoff: +#ifdef __NR_symlink + case __NR_symlink: /* Just oldname. TODO add newname */ +#endif +#ifdef __NR_truncate + case __NR_truncate: +#endif +#ifdef __NR_unlink + case __NR_unlink: +#endif + case __NR_umount2: +#ifdef __NR_utime + case __NR_utime: +#endif +#ifdef __NR_utimes + case __NR_utimes: +#endif + sys_data->user_mask = BIT(0); + break; + /* user arg at position 1 */ + case __NR_execveat: + case __NR_faccessat: + case __NR_faccessat2: + case __NR_finit_module: + case __NR_fchmodat: + case __NR_fchmodat2: + case __NR_fchownat: + case __NR_fgetxattr: + case __NR_flistxattr: + case __NR_fsetxattr: + case __NR_fspick: + case __NR_fremovexattr: +#ifdef __NR_futimesat + case __NR_futimesat: +#endif + case __NR_getxattrat: /* Just pathname, TODO add name */ + case __NR_inotify_add_watch: + case __NR_linkat: /* Just oldname. TODO add newname */ + case __NR_listxattrat: /* Just pathname, TODO add list */ + case __NR_mkdirat: + case __NR_mknodat: + case __NR_mount_setattr: + case __NR_move_mount: /* Just from_pathname, TODO add to_pathname */ + case __NR_name_to_handle_at: +#ifdef __NR_newfstatat + case __NR_newfstatat: +#endif + case __NR_openat: + case __NR_openat2: + case __NR_open_tree: + case __NR_open_tree_attr: + case __NR_readlinkat: +#ifdef __NR_renameat + case __NR_renameat: /* Just oldname. TODO add newname */ +#endif + case __NR_renameat2: /* Just oldname. TODO add newname */ + case __NR_removexattrat: /* Just pathname, TODO add name */ + case __NR_quotactl: + case __NR_setxattrat: /* Just pathname, TODO add list */ + case __NR_syslog: + case __NR_symlinkat: /* Just oldname. TODO add newname */ + case __NR_statx: + case __NR_unlinkat: + case __NR_utimensat: + sys_data->user_mask = BIT(1); + break; + /* user arg at position 2 */ + case __NR_init_module: + case __NR_fsconfig: + sys_data->user_mask = BIT(2); + break; + /* user arg at position 4 */ + case __NR_fanotify_mark: + sys_data->user_mask = BIT(4); + break; + default: + sys_data->user_mask = 0; + } +} + static int __init init_syscall_trace(struct trace_event_call *call) { int id; @@ -471,6 +867,8 @@ static int __init init_syscall_trace(struct trace_event_call *call) return -ENOSYS; } + check_faultable_syscall(call, num); + if (set_syscall_print_fmt(call) < 0) return -ENOMEM; |