Diffstat (limited to 'fs')
-rw-r--r--  fs/coredump.c  |   5
-rw-r--r--  fs/d_path.c    |   8
-rw-r--r--  fs/exec.c      |   4
-rw-r--r--  fs/fhandle.c   |  62
-rw-r--r--  fs/fs_struct.c |  36
-rw-r--r--  fs/internal.h  |   4
-rw-r--r--  fs/libfs.c     |  34
-rw-r--r--  fs/namei.c     |   8
-rw-r--r--  fs/pidfs.c     | 436
9 files changed, 333 insertions, 264 deletions
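
Reading aid (not part of the patch): the fs/d_path.c, fs/exec.c, fs/fs_struct.c and fs/namei.c hunks below replace fs_struct's spinlock_t + seqcount_spinlock_t pair with a single seqlock_t. The following is a minimal sketch of the seqlock reader/writer pattern the converted code relies on; the struct and function names (example_fs, example_get_root, example_set_root) are illustrative and do not appear in the patch.

/* Sketch only: the seqlock_t pattern used after the conversion. */
#include <linux/seqlock.h>
#include <linux/path.h>

struct example_fs {
	seqlock_t seq;		/* stands in for the converted fs->seq */
	struct path root;
	struct path pwd;
};

/* Lockless reader: retry until a stable (unchanged) snapshot is seen. */
static void example_get_root(struct example_fs *fs, struct path *root)
{
	unsigned seq;

	do {
		seq = read_seqbegin(&fs->seq);
		*root = fs->root;
	} while (read_seqretry(&fs->seq, seq));
}

/* Writer: serializes against other writers and invalidates readers. */
static void example_set_root(struct example_fs *fs, const struct path *path)
{
	write_seqlock(&fs->seq);
	fs->root = *path;
	write_sequnlock(&fs->seq);
}

Where the old code only needed mutual exclusion (not a retry loop), the patch uses the non-retrying read_seqlock_excl()/read_sequnlock_excl() variants, which take the seqlock's embedded spinlock without bumping the sequence.
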
diff --git a/fs/coredump.c b/fs/coredump.c
index fadf9d4be2e1..fedbead956ed 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -710,11 +710,6 @@ static bool coredump_sock_connect(struct core_name *cn, struct coredump_params *
 
 	retval = kernel_connect(socket, (struct sockaddr *)(&addr), addr_len,
 				O_NONBLOCK | SOCK_COREDUMP);
-	/*
-	 * ... Make sure to only put our reference after connect() took
-	 * its own reference keeping the pidfs entry alive ...
-	 */
-	pidfs_put_pid(cprm->pid);
 
 	if (retval) {
 		if (retval == -EAGAIN)
diff --git a/fs/d_path.c b/fs/d_path.c
index 5f4da5c8d5db..bb365511066b 100644
--- a/fs/d_path.c
+++ b/fs/d_path.c
@@ -241,9 +241,9 @@ static void get_fs_root_rcu(struct fs_struct *fs, struct path *root)
 	unsigned seq;
 
 	do {
-		seq = read_seqcount_begin(&fs->seq);
+		seq = read_seqbegin(&fs->seq);
 		*root = fs->root;
-	} while (read_seqcount_retry(&fs->seq, seq));
+	} while (read_seqretry(&fs->seq, seq));
 }
 
 /**
@@ -385,10 +385,10 @@ static void get_fs_root_and_pwd_rcu(struct fs_struct *fs, struct path *root,
 	unsigned seq;
 
 	do {
-		seq = read_seqcount_begin(&fs->seq);
+		seq = read_seqbegin(&fs->seq);
 		*root = fs->root;
 		*pwd = fs->pwd;
-	} while (read_seqcount_retry(&fs->seq, seq));
+	} while (read_seqretry(&fs->seq, seq));
 }
 
 /*
diff --git a/fs/exec.c b/fs/exec.c
index ba400aafd640..fe895e47f1dd 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1515,7 +1515,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
 	 * state is protected by cred_guard_mutex we hold.
 	 */
 	n_fs = 1;
-	spin_lock(&p->fs->lock);
+	read_seqlock_excl(&p->fs->seq);
 	rcu_read_lock();
 	for_other_threads(p, t) {
 		if (t->fs == p->fs)
@@ -1528,7 +1528,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
 		bprm->unsafe |= LSM_UNSAFE_SHARE;
 	else
 		p->fs->in_exec = 1;
-	spin_unlock(&p->fs->lock);
+	read_sequnlock_excl(&p->fs->seq);
 }
 
 static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file)
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 3e092ae6d142..7c236f64cdea 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -88,7 +88,7 @@ static long do_sys_name_to_handle(const struct path *path,
 		if (fh_flags & EXPORT_FH_CONNECTABLE) {
 			handle->handle_type |= FILEID_IS_CONNECTABLE;
 			if (d_is_dir(path->dentry))
-				fh_flags |= FILEID_IS_DIR;
+				handle->handle_type |= FILEID_IS_DIR;
 		}
 		retval = 0;
 	}
@@ -168,23 +168,28 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name,
 	return err;
 }
 
-static int get_path_from_fd(int fd, struct path *root)
+static int get_path_anchor(int fd, struct path *root)
 {
-	if (fd == AT_FDCWD) {
-		struct fs_struct *fs = current->fs;
-		spin_lock(&fs->lock);
-		*root = fs->pwd;
-		path_get(root);
-		spin_unlock(&fs->lock);
-	} else {
+	if (fd >= 0) {
 		CLASS(fd, f)(fd);
 		if (fd_empty(f))
 			return -EBADF;
 		*root = fd_file(f)->f_path;
 		path_get(root);
+		return 0;
 	}
-	return 0;
+
+	if (fd == AT_FDCWD) {
+		get_fs_pwd(current->fs, root);
+		return 0;
+	}
+
+	if (fd == FD_PIDFS_ROOT) {
+		pidfs_get_root(root);
+		return 0;
+	}
+
+	return -EBADF;
 }
 
 static int vfs_dentry_acceptable(void *context, struct dentry *dentry)
@@ -323,13 +328,24 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
 {
 	int retval = 0;
 	struct file_handle f_handle;
-	struct file_handle *handle = NULL;
+	struct file_handle *handle __free(kfree) = NULL;
 	struct handle_to_path_ctx ctx = {};
 	const struct export_operations *eops;
 
-	retval = get_path_from_fd(mountdirfd, &ctx.root);
+	if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle)))
+		return -EFAULT;
+
+	if ((f_handle.handle_bytes > MAX_HANDLE_SZ) ||
+	    (f_handle.handle_bytes == 0))
+		return -EINVAL;
+
+	if (f_handle.handle_type < 0 ||
+	    FILEID_USER_FLAGS(f_handle.handle_type) & ~FILEID_VALID_USER_FLAGS)
+		return -EINVAL;
+
+	retval = get_path_anchor(mountdirfd, &ctx.root);
 	if (retval)
-		goto out_err;
+		return retval;
 
 	eops = ctx.root.mnt->mnt_sb->s_export_op;
 	if (eops && eops->permission)
@@ -339,21 +355,6 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
 	if (retval)
 		goto out_path;
 
-	if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) {
-		retval = -EFAULT;
-		goto out_path;
-	}
-	if ((f_handle.handle_bytes > MAX_HANDLE_SZ) ||
-	    (f_handle.handle_bytes == 0)) {
-		retval = -EINVAL;
-		goto out_path;
-	}
-	if (f_handle.handle_type < 0 ||
-	    FILEID_USER_FLAGS(f_handle.handle_type) & ~FILEID_VALID_USER_FLAGS) {
-		retval = -EINVAL;
-		goto out_path;
-	}
-
 	handle = kmalloc(struct_size(handle, f_handle, f_handle.handle_bytes),
 			 GFP_KERNEL);
 	if (!handle) {
@@ -366,7 +367,7 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
 			   &ufh->f_handle,
 			   f_handle.handle_bytes)) {
 		retval = -EFAULT;
-		goto out_handle;
+		goto out_path;
 	}
 
 	/*
@@ -384,11 +385,8 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
 	handle->handle_type &= ~FILEID_USER_FLAGS_MASK;
 	retval = do_handle_to_path(handle, path, &ctx);
 
-out_handle:
-	kfree(handle);
 out_path:
 	path_put(&ctx.root);
-out_err:
 	return retval;
 }
 
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index 64c2d0814ed6..28be762ac1c6 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -17,12 +17,10 @@ void set_fs_root(struct fs_struct *fs, const struct path *path)
 	struct path old_root;
 
 	path_get(path);
-	spin_lock(&fs->lock);
-	write_seqcount_begin(&fs->seq);
+	write_seqlock(&fs->seq);
 	old_root = fs->root;
 	fs->root = *path;
-	write_seqcount_end(&fs->seq);
-	spin_unlock(&fs->lock);
+	write_sequnlock(&fs->seq);
 	if (old_root.dentry)
 		path_put(&old_root);
 }
@@ -36,12 +34,10 @@ void set_fs_pwd(struct fs_struct *fs, const struct path *path)
 	struct path old_pwd;
 
 	path_get(path);
-	spin_lock(&fs->lock);
-	write_seqcount_begin(&fs->seq);
+	write_seqlock(&fs->seq);
 	old_pwd = fs->pwd;
 	fs->pwd = *path;
-	write_seqcount_end(&fs->seq);
-	spin_unlock(&fs->lock);
+	write_sequnlock(&fs->seq);
 
 	if (old_pwd.dentry)
 		path_put(&old_pwd);
@@ -67,16 +63,14 @@ void chroot_fs_refs(const struct path *old_root, const struct path *new_root)
 			fs = p->fs;
 			if (fs) {
 				int hits = 0;
-				spin_lock(&fs->lock);
-				write_seqcount_begin(&fs->seq);
+				write_seqlock(&fs->seq);
 				hits += replace_path(&fs->root, old_root, new_root);
 				hits += replace_path(&fs->pwd, old_root, new_root);
-				write_seqcount_end(&fs->seq);
 				while (hits--) {
 					count++;
 					path_get(new_root);
 				}
-				spin_unlock(&fs->lock);
+				write_sequnlock(&fs->seq);
 			}
 			task_unlock(p);
 		}
@@ -99,10 +93,10 @@ void exit_fs(struct task_struct *tsk)
 	if (fs) {
 		int kill;
 		task_lock(tsk);
-		spin_lock(&fs->lock);
+		read_seqlock_excl(&fs->seq);
 		tsk->fs = NULL;
 		kill = !--fs->users;
-		spin_unlock(&fs->lock);
+		read_sequnlock_excl(&fs->seq);
 		task_unlock(tsk);
 		if (kill)
 			free_fs_struct(fs);
@@ -116,16 +110,15 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
 	if (fs) {
 		fs->users = 1;
 		fs->in_exec = 0;
-		spin_lock_init(&fs->lock);
-		seqcount_spinlock_init(&fs->seq, &fs->lock);
+		seqlock_init(&fs->seq);
 		fs->umask = old->umask;
 
-		spin_lock(&old->lock);
+		read_seqlock_excl(&old->seq);
 		fs->root = old->root;
 		path_get(&fs->root);
 		fs->pwd = old->pwd;
 		path_get(&fs->pwd);
-		spin_unlock(&old->lock);
+		read_sequnlock_excl(&old->seq);
 	}
 	return fs;
 }
@@ -140,10 +133,10 @@ int unshare_fs_struct(void)
 		return -ENOMEM;
 
 	task_lock(current);
-	spin_lock(&fs->lock);
+	read_seqlock_excl(&fs->seq);
 	kill = !--fs->users;
 	current->fs = new_fs;
-	spin_unlock(&fs->lock);
+	read_sequnlock_excl(&fs->seq);
 	task_unlock(current);
 
 	if (kill)
@@ -162,7 +155,6 @@ EXPORT_SYMBOL(current_umask);
 /* to be mentioned only in INIT_TASK */
 struct fs_struct init_fs = {
 	.users		= 1,
-	.lock		= __SPIN_LOCK_UNLOCKED(init_fs.lock),
-	.seq		= SEQCNT_SPINLOCK_ZERO(init_fs.seq, &init_fs.lock),
+	.seq		= __SEQLOCK_UNLOCKED(init_fs.seq),
 	.umask		= 0022,
 };
diff --git a/fs/internal.h b/fs/internal.h
index d733d8bb3d1f..38e8aab27bbd 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -323,12 +323,15 @@ struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns);
 struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap);
 void mnt_idmap_put(struct mnt_idmap *idmap);
 struct stashed_operations {
+	struct dentry *(*stash_dentry)(struct dentry **stashed,
+				       struct dentry *dentry);
 	void (*put_data)(void *data);
 	int (*init_inode)(struct inode *inode, void *data);
 };
 int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data,
 		      struct path *path);
 void stashed_dentry_prune(struct dentry *dentry);
+struct dentry *stash_dentry(struct dentry **stashed, struct dentry *dentry);
 struct dentry *stashed_dentry_get(struct dentry **stashed);
 /**
  * path_mounted - check whether path is mounted
@@ -351,3 +354,4 @@ int anon_inode_getattr(struct mnt_idmap *idmap, const struct path *path,
 		       unsigned int query_flags);
 int anon_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
 		       struct iattr *attr);
+void pidfs_get_root(struct path *path);
diff --git a/fs/libfs.c b/fs/libfs.c
index 67bd8eea4af1..ce8c496a6940 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -2143,6 +2143,8 @@ struct dentry *stashed_dentry_get(struct dentry **stashed)
 	dentry = rcu_dereference(*stashed);
 	if (!dentry)
 		return NULL;
+	if (IS_ERR(dentry))
+		return dentry;
 	if (!lockref_get_not_dead(&dentry->d_lockref))
 		return NULL;
 	return dentry;
@@ -2175,7 +2177,6 @@ static struct dentry *prepare_anon_dentry(struct dentry **stashed,
 
 	/* Notice when this is changed. */
 	WARN_ON_ONCE(!S_ISREG(inode->i_mode));
-	WARN_ON_ONCE(!IS_IMMUTABLE(inode));
 
 	dentry = d_alloc_anon(sb);
 	if (!dentry) {
@@ -2191,8 +2192,7 @@ static struct dentry *prepare_anon_dentry(struct dentry **stashed,
 	return dentry;
 }
 
-static struct dentry *stash_dentry(struct dentry **stashed,
-				   struct dentry *dentry)
+struct dentry *stash_dentry(struct dentry **stashed, struct dentry *dentry)
 {
 	guard(rcu)();
 	for (;;) {
@@ -2233,14 +2233,16 @@ static struct dentry *stash_dentry(struct dentry **stashed,
 int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data,
 		      struct path *path)
 {
-	struct dentry *dentry;
+	struct dentry *dentry, *res;
 	const struct stashed_operations *sops = mnt->mnt_sb->s_fs_info;
 
 	/* See if dentry can be reused. */
-	path->dentry = stashed_dentry_get(stashed);
-	if (path->dentry) {
+	res = stashed_dentry_get(stashed);
+	if (IS_ERR(res))
+		return PTR_ERR(res);
+	if (res) {
 		sops->put_data(data);
-		goto out_path;
+		goto make_path;
 	}
 
 	/* Allocate a new dentry. */
@@ -2249,14 +2251,22 @@ int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data,
 		return PTR_ERR(dentry);
 
 	/* Added a new dentry. @data is now owned by the filesystem. */
-	path->dentry = stash_dentry(stashed, dentry);
-	if (path->dentry != dentry)
+	if (sops->stash_dentry)
+		res = sops->stash_dentry(stashed, dentry);
+	else
+		res = stash_dentry(stashed, dentry);
+	if (IS_ERR(res)) {
+		dput(dentry);
+		return PTR_ERR(res);
+	}
+	if (res != dentry)
 		dput(dentry);
 
-out_path:
-	WARN_ON_ONCE(path->dentry->d_fsdata != stashed);
-	WARN_ON_ONCE(d_inode(path->dentry)->i_private != data);
+make_path:
+	path->dentry = res;
 	path->mnt = mntget(mnt);
+	VFS_WARN_ON_ONCE(path->dentry->d_fsdata != stashed);
+	VFS_WARN_ON_ONCE(d_inode(path->dentry)->i_private != data);
 	return 0;
 }
 
diff --git a/fs/namei.c b/fs/namei.c
index a1cd129a2e8b..cd43ff89fbaa 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1012,10 +1012,10 @@ static int set_root(struct nameidata *nd)
 		unsigned seq;
 
 		do {
-			seq = read_seqcount_begin(&fs->seq);
+			seq = read_seqbegin(&fs->seq);
 			nd->root = fs->root;
 			nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
-		} while (read_seqcount_retry(&fs->seq, seq));
+		} while (read_seqretry(&fs->seq, seq));
 	} else {
 		get_fs_root(fs, &nd->root);
 		nd->state |= ND_ROOT_GRABBED;
@@ -2571,11 +2571,11 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
 			unsigned seq;
 
 			do {
-				seq = read_seqcount_begin(&fs->seq);
+				seq = read_seqbegin(&fs->seq);
 				nd->path = fs->pwd;
 				nd->inode = nd->path.dentry->d_inode;
 				nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
-			} while (read_seqcount_retry(&fs->seq, seq));
+			} while (read_seqretry(&fs->seq, seq));
 		} else {
 			get_fs_pwd(current->fs, &nd->path);
 			nd->inode = nd->path.dentry->d_inode;
diff --git a/fs/pidfs.c b/fs/pidfs.c
index 4625e097e3a0..edc35522d75c 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -21,11 +21,23 @@
 #include <linux/utsname.h>
 #include <net/net_namespace.h>
 #include <linux/coredump.h>
+#include <linux/xattr.h>
 
 #include "internal.h"
 #include "mount.h"
 
-static struct kmem_cache *pidfs_cachep __ro_after_init;
+#define PIDFS_PID_DEAD ERR_PTR(-ESRCH)
+
+static struct kmem_cache *pidfs_attr_cachep __ro_after_init;
+static struct kmem_cache *pidfs_xattr_cachep __ro_after_init;
+
+static struct path pidfs_root_path = {};
+
+void pidfs_get_root(struct path *path)
+{
+	*path = pidfs_root_path;
+	path_get(path);
+}
 
 /*
  * Stashes information that userspace needs to access even after the
@@ -37,17 +49,12 @@ struct pidfs_exit_info {
 	__u32 coredump_mask;
 };
 
-struct pidfs_inode {
+struct pidfs_attr {
+	struct simple_xattrs *xattrs;
 	struct pidfs_exit_info __pei;
 	struct pidfs_exit_info *exit_info;
-	struct inode vfs_inode;
 };
 
-static inline struct pidfs_inode *pidfs_i(struct inode *inode)
-{
-	return container_of(inode, struct pidfs_inode, vfs_inode);
-}
-
 static struct rb_root pidfs_ino_tree = RB_ROOT;
 
 #if BITS_PER_LONG == 32
@@ -125,6 +132,7 @@ void pidfs_add_pid(struct pid *pid)
 
 	pid->ino = pidfs_ino_nr;
 	pid->stashed = NULL;
+	pid->attr = NULL;
 	pidfs_ino_nr++;
 
 	write_seqcount_begin(&pidmap_lock_seq);
@@ -139,6 +147,33 @@ void pidfs_remove_pid(struct pid *pid)
 	write_seqcount_end(&pidmap_lock_seq);
 }
 
+void pidfs_free_pid(struct pid *pid)
+{
+	struct pidfs_attr *attr __free(kfree) = no_free_ptr(pid->attr);
+	struct simple_xattrs *xattrs __free(kfree) = NULL;
+
+	/*
+	 * Any dentry must've been wiped from the pid by now.
+	 * Otherwise there's a reference count bug.
+	 */
+	VFS_WARN_ON_ONCE(pid->stashed);
+
+	/*
+	 * This if an error occurred during e.g., task creation that
+	 * causes us to never go through the exit path.
+	 */
+	if (unlikely(!attr))
+		return;
+
+	/* This never had a pidfd created. */
+	if (IS_ERR(attr))
+		return;
+
+	xattrs = no_free_ptr(attr->xattrs);
+	if (xattrs)
+		simple_xattrs_free(xattrs, NULL);
+}
+
 #ifdef CONFIG_PROC_FS
 /**
  * pidfd_show_fdinfo - print information about a pidfd
@@ -261,13 +296,13 @@ static __u32 pidfs_coredump_mask(unsigned long mm_flags)
 static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg;
-	struct inode *inode = file_inode(file);
 	struct pid *pid = pidfd_pid(file);
 	size_t usize = _IOC_SIZE(cmd);
 	struct pidfd_info kinfo = {};
 	struct pidfs_exit_info *exit_info;
 	struct user_namespace *user_ns;
 	struct task_struct *task;
+	struct pidfs_attr *attr;
 	const struct cred *c;
 	__u64 mask;
 
@@ -286,8 +321,9 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
 	if (!pid_in_current_pidns(pid))
 		return -ESRCH;
 
+	attr = READ_ONCE(pid->attr);
 	if (mask & PIDFD_INFO_EXIT) {
-		exit_info = READ_ONCE(pidfs_i(inode)->exit_info);
+		exit_info = READ_ONCE(attr->exit_info);
 		if (exit_info) {
 			kinfo.mask |= PIDFD_INFO_EXIT;
 #ifdef CONFIG_CGROUPS
@@ -300,7 +336,7 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
 
 	if (mask & PIDFD_INFO_COREDUMP) {
 		kinfo.mask |= PIDFD_INFO_COREDUMP;
-		kinfo.coredump_mask = READ_ONCE(pidfs_i(inode)->__pei.coredump_mask);
+		kinfo.coredump_mask = READ_ONCE(attr->__pei.coredump_mask);
 	}
 
 	task = get_pid_task(pid, PIDTYPE_PID);
@@ -552,41 +588,61 @@ struct pid *pidfd_pid(const struct file *file)
 * task has been reaped which cannot happen until we're out of
 * release_task().
 *
- * If this struct pid is referred to by a pidfd then
- * stashed_dentry_get() will return the dentry and inode for that struct
- * pid. Since we've taken a reference on it there's now an additional
- * reference from the exit path on it. Which is fine. We're going to put
- * it again in a second and we know that the pid is kept alive anyway.
+ * If this struct pid has at least once been referred to by a pidfd then
+ * pid->attr will be allocated. If not we mark the struct pid as dead so
+ * anyone who is trying to register it with pidfs will fail to do so.
+ * Otherwise we would hand out pidfs for reaped tasks without having
+ * exit information available.
 *
- * Worst case is that we've filled in the info and immediately free the
- * dentry and inode afterwards since the pidfd has been closed. Since
+ * Worst case is that we've filled in the info and the pid gets freed
+ * right away in free_pid() when no one holds a pidfd anymore. Since
 * pidfs_exit() currently is placed after exit_task_work() we know that
- * it cannot be us aka the exiting task holding a pidfd to ourselves.
+ * it cannot be us aka the exiting task holding a pidfd to itself.
 */
 void pidfs_exit(struct task_struct *tsk)
 {
-	struct dentry *dentry;
+	struct pid *pid = task_pid(tsk);
+	struct pidfs_attr *attr;
+	struct pidfs_exit_info *exit_info;
+#ifdef CONFIG_CGROUPS
+	struct cgroup *cgrp;
+#endif
 
 	might_sleep();
 
-	dentry = stashed_dentry_get(&task_pid(tsk)->stashed);
-	if (dentry) {
-		struct inode *inode = d_inode(dentry);
-		struct pidfs_exit_info *exit_info = &pidfs_i(inode)->__pei;
-#ifdef CONFIG_CGROUPS
-		struct cgroup *cgrp;
+	guard(spinlock_irq)(&pid->wait_pidfd.lock);
+	attr = pid->attr;
+	if (!attr) {
+		/*
+		 * No one ever held a pidfd for this struct pid.
+		 * Mark it as dead so no one can add a pidfs
+		 * entry anymore. We're about to be reaped and
+		 * so no exit information would be available.
+		 */
+		pid->attr = PIDFS_PID_DEAD;
+		return;
+	}
 
-		rcu_read_lock();
-		cgrp = task_dfl_cgroup(tsk);
-		exit_info->cgroupid = cgroup_id(cgrp);
-		rcu_read_unlock();
+	/*
+	 * If @pid->attr is set someone might still legitimately hold a
+	 * pidfd to @pid or someone might concurrently still be getting
+	 * a reference to an already stashed dentry from @pid->stashed.
+	 * So defer cleaning @pid->attr until the last reference to @pid
+	 * is put
+	 */
+
+	exit_info = &attr->__pei;
+
+#ifdef CONFIG_CGROUPS
+	rcu_read_lock();
+	cgrp = task_dfl_cgroup(tsk);
+	exit_info->cgroupid = cgroup_id(cgrp);
+	rcu_read_unlock();
 #endif
-		exit_info->exit_code = tsk->exit_code;
+	exit_info->exit_code = tsk->exit_code;
 
-		/* Ensure that PIDFD_GET_INFO sees either all or nothing. */
-		smp_store_release(&pidfs_i(inode)->exit_info, &pidfs_i(inode)->__pei);
-		dput(dentry);
-	}
+	/* Ensure that PIDFD_GET_INFO sees either all or nothing. */
+	smp_store_release(&attr->exit_info, &attr->__pei);
 }
 
 #ifdef CONFIG_COREDUMP
@@ -594,16 +650,15 @@ void pidfs_coredump(const struct coredump_params *cprm)
 {
 	struct pid *pid = cprm->pid;
 	struct pidfs_exit_info *exit_info;
-	struct dentry *dentry;
-	struct inode *inode;
+	struct pidfs_attr *attr;
 	__u32 coredump_mask = 0;
 
-	dentry = pid->stashed;
-	if (WARN_ON_ONCE(!dentry))
-		return;
+	attr = READ_ONCE(pid->attr);
 
-	inode = d_inode(dentry);
-	exit_info = &pidfs_i(inode)->__pei;
+	VFS_WARN_ON_ONCE(!attr);
+	VFS_WARN_ON_ONCE(attr == PIDFS_PID_DEAD);
+
+	exit_info = &attr->__pei;
 	/* Note how we were coredumped. */
 	coredump_mask = pidfs_coredump_mask(cprm->mm_flags);
 	/* Note that we actually did coredump. */
@@ -634,9 +689,24 @@ static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path,
 	return anon_inode_getattr(idmap, path, stat, request_mask, query_flags);
 }
 
+static ssize_t pidfs_listxattr(struct dentry *dentry, char *buf, size_t size)
+{
+	struct inode *inode = d_inode(dentry);
+	struct pid *pid = inode->i_private;
+	struct pidfs_attr *attr = pid->attr;
+	struct simple_xattrs *xattrs;
+
+	xattrs = READ_ONCE(attr->xattrs);
+	if (!xattrs)
+		return 0;
+
+	return simple_xattr_list(inode, xattrs, buf, size);
+}
+
 static const struct inode_operations pidfs_inode_operations = {
-	.getattr = pidfs_getattr,
-	.setattr = pidfs_setattr,
+	.getattr	= pidfs_getattr,
+	.setattr	= pidfs_setattr,
+	.listxattr	= pidfs_listxattr,
 };
 
 static void pidfs_evict_inode(struct inode *inode)
@@ -647,30 +717,9 @@ static void pidfs_evict_inode(struct inode *inode)
 	put_pid(pid);
 }
 
-static struct inode *pidfs_alloc_inode(struct super_block *sb)
-{
-	struct pidfs_inode *pi;
-
-	pi = alloc_inode_sb(sb, pidfs_cachep, GFP_KERNEL);
-	if (!pi)
-		return NULL;
-
-	memset(&pi->__pei, 0, sizeof(pi->__pei));
-	pi->exit_info = NULL;
-
-	return &pi->vfs_inode;
-}
-
-static void pidfs_free_inode(struct inode *inode)
-{
-	kmem_cache_free(pidfs_cachep, pidfs_i(inode));
-}
-
 static const struct super_operations pidfs_sops = {
-	.alloc_inode	= pidfs_alloc_inode,
 	.drop_inode	= generic_delete_inode,
 	.evict_inode	= pidfs_evict_inode,
-	.free_inode	= pidfs_free_inode,
 	.statfs		= simple_statfs,
 };
 
@@ -770,6 +819,8 @@ static struct dentry *pidfs_fh_to_dentry(struct super_block *sb,
 	if (ret < 0)
 		return ERR_PTR(ret);
 
+	VFS_WARN_ON_ONCE(!pid->attr);
+
 	mntput(path.mnt);
 	return path.dentry;
 }
@@ -796,53 +847,8 @@ static int pidfs_export_permission(struct handle_to_path_ctx *ctx,
 	return 0;
 }
 
-static inline bool pidfs_pid_valid(struct pid *pid, const struct path *path,
-				   unsigned int flags)
-{
-	enum pid_type type;
-
-	if (flags & PIDFD_STALE)
-		return true;
-
-	/*
-	 * Make sure that if a pidfd is created PIDFD_INFO_EXIT
-	 * information will be available. So after an inode for the
-	 * pidfd has been allocated perform another check that the pid
-	 * is still alive. If it is exit information is available even
-	 * if the task gets reaped before the pidfd is returned to
-	 * userspace. The only exception are indicated by PIDFD_STALE:
-	 *
-	 * (1) The kernel is in the middle of task creation and thus no
-	 *     task linkage has been established yet.
-	 * (2) The caller knows @pid has been registered in pidfs at a
-	 *     time when the task was still alive.
-	 *
-	 * In both cases exit information will have been reported.
-	 */
-	if (flags & PIDFD_THREAD)
-		type = PIDTYPE_PID;
-	else
-		type = PIDTYPE_TGID;
-
-	/*
-	 * Since pidfs_exit() is called before struct pid's task linkage
-	 * is removed the case where the task got reaped but a dentry
-	 * was already attached to struct pid and exit information was
-	 * recorded and published can be handled correctly.
-	 */
-	if (unlikely(!pid_has_task(pid, type))) {
-		struct inode *inode = d_inode(path->dentry);
-		return !!READ_ONCE(pidfs_i(inode)->exit_info);
-	}
-
-	return true;
-}
-
 static struct file *pidfs_export_open(struct path *path, unsigned int oflags)
 {
-	if (!pidfs_pid_valid(d_inode(path->dentry)->i_private, path, oflags))
-		return ERR_PTR(-ESRCH);
-
 	/*
 	 * Clear O_LARGEFILE as open_by_handle_at() forces it and raise
 	 * O_RDWR as pidfds always are.
@@ -864,6 +870,8 @@ static int pidfs_init_inode(struct inode *inode, void *data)
 
 	inode->i_private = data;
 	inode->i_flags |= S_PRIVATE | S_ANON_INODE;
+	/* We allow to set xattrs. */
+	inode->i_flags &= ~S_IMMUTABLE;
 	inode->i_mode |= S_IRWXU;
 	inode->i_op = &pidfs_inode_operations;
 	inode->i_fop = &pidfs_file_operations;
@@ -878,9 +886,127 @@ static void pidfs_put_data(void *data)
 	put_pid(pid);
 }
 
+/**
+ * pidfs_register_pid - register a struct pid in pidfs
+ * @pid: pid to pin
+ *
+ * Register a struct pid in pidfs.
+ *
+ * Return: On success zero, on error a negative error code is returned.
+ */
+int pidfs_register_pid(struct pid *pid)
+{
+	struct pidfs_attr *new_attr __free(kfree) = NULL;
+	struct pidfs_attr *attr;
+
+	might_sleep();
+
+	if (!pid)
+		return 0;
+
+	attr = READ_ONCE(pid->attr);
+	if (unlikely(attr == PIDFS_PID_DEAD))
+		return PTR_ERR(PIDFS_PID_DEAD);
+	if (attr)
+		return 0;
+
+	new_attr = kmem_cache_zalloc(pidfs_attr_cachep, GFP_KERNEL);
+	if (!new_attr)
+		return -ENOMEM;
+
+	/* Synchronize with pidfs_exit(). */
+	guard(spinlock_irq)(&pid->wait_pidfd.lock);
+
+	attr = pid->attr;
+	if (unlikely(attr == PIDFS_PID_DEAD))
+		return PTR_ERR(PIDFS_PID_DEAD);
+	if (unlikely(attr))
+		return 0;
+
+	pid->attr = no_free_ptr(new_attr);
+	return 0;
+}
+
+static struct dentry *pidfs_stash_dentry(struct dentry **stashed,
+					 struct dentry *dentry)
+{
+	int ret;
+	struct pid *pid = d_inode(dentry)->i_private;
+
+	VFS_WARN_ON_ONCE(stashed != &pid->stashed);
+
+	ret = pidfs_register_pid(pid);
+	if (ret)
+		return ERR_PTR(ret);
+
+	return stash_dentry(stashed, dentry);
+}
+
 static const struct stashed_operations pidfs_stashed_ops = {
-	.init_inode = pidfs_init_inode,
-	.put_data = pidfs_put_data,
+	.stash_dentry	= pidfs_stash_dentry,
+	.init_inode	= pidfs_init_inode,
+	.put_data	= pidfs_put_data,
+};
+
+static int pidfs_xattr_get(const struct xattr_handler *handler,
+			   struct dentry *unused, struct inode *inode,
+			   const char *suffix, void *value, size_t size)
+{
+	struct pid *pid = inode->i_private;
+	struct pidfs_attr *attr = pid->attr;
+	const char *name;
+	struct simple_xattrs *xattrs;
+
+	xattrs = READ_ONCE(attr->xattrs);
+	if (!xattrs)
+		return 0;
+
+	name = xattr_full_name(handler, suffix);
+	return simple_xattr_get(xattrs, name, value, size);
+}
+
+static int pidfs_xattr_set(const struct xattr_handler *handler,
+			   struct mnt_idmap *idmap, struct dentry *unused,
+			   struct inode *inode, const char *suffix,
+			   const void *value, size_t size, int flags)
+{
+	struct pid *pid = inode->i_private;
+	struct pidfs_attr *attr = pid->attr;
+	const char *name;
+	struct simple_xattrs *xattrs;
+	struct simple_xattr *old_xattr;
+
+	/* Ensure we're the only one to set @attr->xattrs. */
+	WARN_ON_ONCE(!inode_is_locked(inode));
+
+	xattrs = READ_ONCE(attr->xattrs);
+	if (!xattrs) {
+		xattrs = kmem_cache_zalloc(pidfs_xattr_cachep, GFP_KERNEL);
+		if (!xattrs)
+			return -ENOMEM;
+
+		simple_xattrs_init(xattrs);
+		smp_store_release(&pid->attr->xattrs, xattrs);
+	}
+
+	name = xattr_full_name(handler, suffix);
+	old_xattr = simple_xattr_set(xattrs, name, value, size, flags);
+	if (IS_ERR(old_xattr))
+		return PTR_ERR(old_xattr);
+
+	simple_xattr_free(old_xattr);
+	return 0;
+}
+
+static const struct xattr_handler pidfs_trusted_xattr_handler = {
+	.prefix = XATTR_TRUSTED_PREFIX,
+	.get = pidfs_xattr_get,
+	.set = pidfs_xattr_set,
+};
+
+static const struct xattr_handler *const pidfs_xattr_handlers[] = {
+	&pidfs_trusted_xattr_handler,
+	NULL
 };
 
 static int pidfs_init_fs_context(struct fs_context *fc)
@@ -891,9 +1017,12 @@ static int pidfs_init_fs_context(struct fs_context *fc)
 	if (!ctx)
 		return -ENOMEM;
 
+	fc->s_iflags |= SB_I_NOEXEC;
+	fc->s_iflags |= SB_I_NODEV;
 	ctx->ops = &pidfs_sops;
 	ctx->eops = &pidfs_export_operations;
 	ctx->dops = &pidfs_dentry_operations;
+	ctx->xattr = pidfs_xattr_handlers;
 	fc->s_fs_info = (void *)&pidfs_stashed_ops;
 	return 0;
 }
@@ -921,8 +1050,7 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
 	if (ret < 0)
 		return ERR_PTR(ret);
 
-	if (!pidfs_pid_valid(pid, &path, flags))
-		return ERR_PTR(-ESRCH);
+	VFS_WARN_ON_ONCE(!pid->attr);
 
 	flags &= ~PIDFD_STALE;
 	flags |= O_RDWR;
@@ -934,79 +1062,21 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
 	return pidfd_file;
 }
 
-/**
- * pidfs_register_pid - register a struct pid in pidfs
- * @pid: pid to pin
- *
- * Register a struct pid in pidfs. Needs to be paired with
- * pidfs_put_pid() to not risk leaking the pidfs dentry and inode.
- *
- * Return: On success zero, on error a negative error code is returned.
- */
-int pidfs_register_pid(struct pid *pid)
-{
-	struct path path __free(path_put) = {};
-	int ret;
-
-	might_sleep();
-
-	if (!pid)
-		return 0;
-
-	ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path);
-	if (unlikely(ret))
-		return ret;
-	/* Keep the dentry and only put the reference to the mount. */
-	path.dentry = NULL;
-	return 0;
-}
-
-/**
- * pidfs_get_pid - pin a struct pid through pidfs
- * @pid: pid to pin
- *
- * Similar to pidfs_register_pid() but only valid if the caller knows
- * there's a reference to the @pid through a dentry already that can't
- * go away.
- */
-void pidfs_get_pid(struct pid *pid)
-{
-	if (!pid)
-		return;
-	WARN_ON_ONCE(!stashed_dentry_get(&pid->stashed));
-}
-
-/**
- * pidfs_put_pid - drop a pidfs reference
- * @pid: pid to drop
- *
- * Drop a reference to @pid via pidfs. This is only safe if the
- * reference has been taken via pidfs_get_pid().
- */
-void pidfs_put_pid(struct pid *pid)
-{
-	might_sleep();
-
-	if (!pid)
-		return;
-	VFS_WARN_ON_ONCE(!pid->stashed);
-	dput(pid->stashed);
-}
-
-static void pidfs_inode_init_once(void *data)
-{
-	struct pidfs_inode *pi = data;
-
-	inode_init_once(&pi->vfs_inode);
-}
-
 void __init pidfs_init(void)
 {
-	pidfs_cachep = kmem_cache_create("pidfs_cache", sizeof(struct pidfs_inode), 0,
+	pidfs_attr_cachep = kmem_cache_create("pidfs_attr_cache", sizeof(struct pidfs_attr), 0,
 					 (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
-					  SLAB_ACCOUNT | SLAB_PANIC),
-					 pidfs_inode_init_once);
+					  SLAB_ACCOUNT | SLAB_PANIC), NULL);
+
+	pidfs_xattr_cachep = kmem_cache_create("pidfs_xattr_cache",
					       sizeof(struct simple_xattrs), 0,
+					       (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
+						SLAB_ACCOUNT | SLAB_PANIC), NULL);
+
 	pidfs_mnt = kern_mount(&pidfs_type);
 	if (IS_ERR(pidfs_mnt))
 		panic("Failed to mount pidfs pseudo filesystem");
+
+	pidfs_root_path.mnt = pidfs_mnt;
+	pidfs_root_path.dentry = pidfs_mnt->mnt_root;
 }
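
Taken together, the fs/pidfs.c changes move exit and coredump information out of the pidfs inode into a struct pidfs_attr hung off struct pid, wire up a trusted.* xattr handler on pidfd inodes (S_IMMUTABLE is cleared in pidfs_init_inode() for exactly this reason), and export a pidfs root path that fs/fhandle.c can use as the anchor for FD_PIDFS_ROOT in get_path_anchor(). The following is a hedged userspace sketch of what the xattr side enables; it is not part of the patch. It assumes glibc's <sys/pidfd.h> wrapper for pidfd_open(), a kernel carrying this change, and "trusted.example" is a made-up attribute name (the trusted.* namespace requires CAP_SYS_ADMIN).

/* Illustrative sketch: tag a pidfd with a trusted.* xattr. */
#include <stdio.h>
#include <sys/xattr.h>
#include <sys/pidfd.h>	/* pidfd_open(); assumes glibc >= 2.36 */
#include <unistd.h>

int main(void)
{
	int pidfd = pidfd_open(getpid(), 0);
	if (pidfd < 0) {
		perror("pidfd_open");
		return 1;
	}

	/* Hypothetical tag; needs CAP_SYS_ADMIN for trusted.* names. */
	if (fsetxattr(pidfd, "trusted.example", "tag", 3, 0) < 0)
		perror("fsetxattr");

	char buf[16];
	ssize_t n = fgetxattr(pidfd, "trusted.example", buf, sizeof(buf));
	if (n >= 0)
		printf("trusted.example=%.*s\n", (int)n, buf);

	close(pidfd);
	return 0;
}

Run with CAP_SYS_ADMIN the tag lives in the struct pidfs_attr for as long as the struct pid itself; unprivileged runs simply see fsetxattr() fail.
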