summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/coredump.c5
-rw-r--r--fs/d_path.c8
-rw-r--r--fs/exec.c4
-rw-r--r--fs/fhandle.c62
-rw-r--r--fs/fs_struct.c36
-rw-r--r--fs/internal.h4
-rw-r--r--fs/libfs.c34
-rw-r--r--fs/namei.c8
-rw-r--r--fs/pidfs.c436
9 files changed, 333 insertions, 264 deletions
diff --git a/fs/coredump.c b/fs/coredump.c
index fadf9d4be2e1..fedbead956ed 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -710,11 +710,6 @@ static bool coredump_sock_connect(struct core_name *cn, struct coredump_params *
retval = kernel_connect(socket, (struct sockaddr *)(&addr), addr_len,
O_NONBLOCK | SOCK_COREDUMP);
- /*
- * ... Make sure to only put our reference after connect() took
- * its own reference keeping the pidfs entry alive ...
- */
- pidfs_put_pid(cprm->pid);
if (retval) {
if (retval == -EAGAIN)
diff --git a/fs/d_path.c b/fs/d_path.c
index 5f4da5c8d5db..bb365511066b 100644
--- a/fs/d_path.c
+++ b/fs/d_path.c
@@ -241,9 +241,9 @@ static void get_fs_root_rcu(struct fs_struct *fs, struct path *root)
unsigned seq;
do {
- seq = read_seqcount_begin(&fs->seq);
+ seq = read_seqbegin(&fs->seq);
*root = fs->root;
- } while (read_seqcount_retry(&fs->seq, seq));
+ } while (read_seqretry(&fs->seq, seq));
}
/**
@@ -385,10 +385,10 @@ static void get_fs_root_and_pwd_rcu(struct fs_struct *fs, struct path *root,
unsigned seq;
do {
- seq = read_seqcount_begin(&fs->seq);
+ seq = read_seqbegin(&fs->seq);
*root = fs->root;
*pwd = fs->pwd;
- } while (read_seqcount_retry(&fs->seq, seq));
+ } while (read_seqretry(&fs->seq, seq));
}
/*
diff --git a/fs/exec.c b/fs/exec.c
index ba400aafd640..fe895e47f1dd 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1515,7 +1515,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
* state is protected by cred_guard_mutex we hold.
*/
n_fs = 1;
- spin_lock(&p->fs->lock);
+ read_seqlock_excl(&p->fs->seq);
rcu_read_lock();
for_other_threads(p, t) {
if (t->fs == p->fs)
@@ -1528,7 +1528,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
bprm->unsafe |= LSM_UNSAFE_SHARE;
else
p->fs->in_exec = 1;
- spin_unlock(&p->fs->lock);
+ read_sequnlock_excl(&p->fs->seq);
}
static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file)
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 3e092ae6d142..7c236f64cdea 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -88,7 +88,7 @@ static long do_sys_name_to_handle(const struct path *path,
if (fh_flags & EXPORT_FH_CONNECTABLE) {
handle->handle_type |= FILEID_IS_CONNECTABLE;
if (d_is_dir(path->dentry))
- fh_flags |= FILEID_IS_DIR;
+ handle->handle_type |= FILEID_IS_DIR;
}
retval = 0;
}
@@ -168,23 +168,28 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name,
return err;
}
-static int get_path_from_fd(int fd, struct path *root)
+static int get_path_anchor(int fd, struct path *root)
{
- if (fd == AT_FDCWD) {
- struct fs_struct *fs = current->fs;
- spin_lock(&fs->lock);
- *root = fs->pwd;
- path_get(root);
- spin_unlock(&fs->lock);
- } else {
+ if (fd >= 0) {
CLASS(fd, f)(fd);
if (fd_empty(f))
return -EBADF;
*root = fd_file(f)->f_path;
path_get(root);
+ return 0;
}
- return 0;
+ if (fd == AT_FDCWD) {
+ get_fs_pwd(current->fs, root);
+ return 0;
+ }
+
+ if (fd == FD_PIDFS_ROOT) {
+ pidfs_get_root(root);
+ return 0;
+ }
+
+ return -EBADF;
}
static int vfs_dentry_acceptable(void *context, struct dentry *dentry)
@@ -323,13 +328,24 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
{
int retval = 0;
struct file_handle f_handle;
- struct file_handle *handle = NULL;
+ struct file_handle *handle __free(kfree) = NULL;
struct handle_to_path_ctx ctx = {};
const struct export_operations *eops;
- retval = get_path_from_fd(mountdirfd, &ctx.root);
+ if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle)))
+ return -EFAULT;
+
+ if ((f_handle.handle_bytes > MAX_HANDLE_SZ) ||
+ (f_handle.handle_bytes == 0))
+ return -EINVAL;
+
+ if (f_handle.handle_type < 0 ||
+ FILEID_USER_FLAGS(f_handle.handle_type) & ~FILEID_VALID_USER_FLAGS)
+ return -EINVAL;
+
+ retval = get_path_anchor(mountdirfd, &ctx.root);
if (retval)
- goto out_err;
+ return retval;
eops = ctx.root.mnt->mnt_sb->s_export_op;
if (eops && eops->permission)
@@ -339,21 +355,6 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
if (retval)
goto out_path;
- if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) {
- retval = -EFAULT;
- goto out_path;
- }
- if ((f_handle.handle_bytes > MAX_HANDLE_SZ) ||
- (f_handle.handle_bytes == 0)) {
- retval = -EINVAL;
- goto out_path;
- }
- if (f_handle.handle_type < 0 ||
- FILEID_USER_FLAGS(f_handle.handle_type) & ~FILEID_VALID_USER_FLAGS) {
- retval = -EINVAL;
- goto out_path;
- }
-
handle = kmalloc(struct_size(handle, f_handle, f_handle.handle_bytes),
GFP_KERNEL);
if (!handle) {
@@ -366,7 +367,7 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
&ufh->f_handle,
f_handle.handle_bytes)) {
retval = -EFAULT;
- goto out_handle;
+ goto out_path;
}
/*
@@ -384,11 +385,8 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
handle->handle_type &= ~FILEID_USER_FLAGS_MASK;
retval = do_handle_to_path(handle, path, &ctx);
-out_handle:
- kfree(handle);
out_path:
path_put(&ctx.root);
-out_err:
return retval;
}
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index 64c2d0814ed6..28be762ac1c6 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -17,12 +17,10 @@ void set_fs_root(struct fs_struct *fs, const struct path *path)
struct path old_root;
path_get(path);
- spin_lock(&fs->lock);
- write_seqcount_begin(&fs->seq);
+ write_seqlock(&fs->seq);
old_root = fs->root;
fs->root = *path;
- write_seqcount_end(&fs->seq);
- spin_unlock(&fs->lock);
+ write_sequnlock(&fs->seq);
if (old_root.dentry)
path_put(&old_root);
}
@@ -36,12 +34,10 @@ void set_fs_pwd(struct fs_struct *fs, const struct path *path)
struct path old_pwd;
path_get(path);
- spin_lock(&fs->lock);
- write_seqcount_begin(&fs->seq);
+ write_seqlock(&fs->seq);
old_pwd = fs->pwd;
fs->pwd = *path;
- write_seqcount_end(&fs->seq);
- spin_unlock(&fs->lock);
+ write_sequnlock(&fs->seq);
if (old_pwd.dentry)
path_put(&old_pwd);
@@ -67,16 +63,14 @@ void chroot_fs_refs(const struct path *old_root, const struct path *new_root)
fs = p->fs;
if (fs) {
int hits = 0;
- spin_lock(&fs->lock);
- write_seqcount_begin(&fs->seq);
+ write_seqlock(&fs->seq);
hits += replace_path(&fs->root, old_root, new_root);
hits += replace_path(&fs->pwd, old_root, new_root);
- write_seqcount_end(&fs->seq);
while (hits--) {
count++;
path_get(new_root);
}
- spin_unlock(&fs->lock);
+ write_sequnlock(&fs->seq);
}
task_unlock(p);
}
@@ -99,10 +93,10 @@ void exit_fs(struct task_struct *tsk)
if (fs) {
int kill;
task_lock(tsk);
- spin_lock(&fs->lock);
+ read_seqlock_excl(&fs->seq);
tsk->fs = NULL;
kill = !--fs->users;
- spin_unlock(&fs->lock);
+ read_sequnlock_excl(&fs->seq);
task_unlock(tsk);
if (kill)
free_fs_struct(fs);
@@ -116,16 +110,15 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
if (fs) {
fs->users = 1;
fs->in_exec = 0;
- spin_lock_init(&fs->lock);
- seqcount_spinlock_init(&fs->seq, &fs->lock);
+ seqlock_init(&fs->seq);
fs->umask = old->umask;
- spin_lock(&old->lock);
+ read_seqlock_excl(&old->seq);
fs->root = old->root;
path_get(&fs->root);
fs->pwd = old->pwd;
path_get(&fs->pwd);
- spin_unlock(&old->lock);
+ read_sequnlock_excl(&old->seq);
}
return fs;
}
@@ -140,10 +133,10 @@ int unshare_fs_struct(void)
return -ENOMEM;
task_lock(current);
- spin_lock(&fs->lock);
+ read_seqlock_excl(&fs->seq);
kill = !--fs->users;
current->fs = new_fs;
- spin_unlock(&fs->lock);
+ read_sequnlock_excl(&fs->seq);
task_unlock(current);
if (kill)
@@ -162,7 +155,6 @@ EXPORT_SYMBOL(current_umask);
/* to be mentioned only in INIT_TASK */
struct fs_struct init_fs = {
.users = 1,
- .lock = __SPIN_LOCK_UNLOCKED(init_fs.lock),
- .seq = SEQCNT_SPINLOCK_ZERO(init_fs.seq, &init_fs.lock),
+ .seq = __SEQLOCK_UNLOCKED(init_fs.seq),
.umask = 0022,
};
diff --git a/fs/internal.h b/fs/internal.h
index d733d8bb3d1f..38e8aab27bbd 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -323,12 +323,15 @@ struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns);
struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap);
void mnt_idmap_put(struct mnt_idmap *idmap);
struct stashed_operations {
+ struct dentry *(*stash_dentry)(struct dentry **stashed,
+ struct dentry *dentry);
void (*put_data)(void *data);
int (*init_inode)(struct inode *inode, void *data);
};
int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data,
struct path *path);
void stashed_dentry_prune(struct dentry *dentry);
+struct dentry *stash_dentry(struct dentry **stashed, struct dentry *dentry);
struct dentry *stashed_dentry_get(struct dentry **stashed);
/**
* path_mounted - check whether path is mounted
@@ -351,3 +354,4 @@ int anon_inode_getattr(struct mnt_idmap *idmap, const struct path *path,
unsigned int query_flags);
int anon_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
struct iattr *attr);
+void pidfs_get_root(struct path *path);
diff --git a/fs/libfs.c b/fs/libfs.c
index 67bd8eea4af1..ce8c496a6940 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -2143,6 +2143,8 @@ struct dentry *stashed_dentry_get(struct dentry **stashed)
dentry = rcu_dereference(*stashed);
if (!dentry)
return NULL;
+ if (IS_ERR(dentry))
+ return dentry;
if (!lockref_get_not_dead(&dentry->d_lockref))
return NULL;
return dentry;
@@ -2175,7 +2177,6 @@ static struct dentry *prepare_anon_dentry(struct dentry **stashed,
/* Notice when this is changed. */
WARN_ON_ONCE(!S_ISREG(inode->i_mode));
- WARN_ON_ONCE(!IS_IMMUTABLE(inode));
dentry = d_alloc_anon(sb);
if (!dentry) {
@@ -2191,8 +2192,7 @@ static struct dentry *prepare_anon_dentry(struct dentry **stashed,
return dentry;
}
-static struct dentry *stash_dentry(struct dentry **stashed,
- struct dentry *dentry)
+struct dentry *stash_dentry(struct dentry **stashed, struct dentry *dentry)
{
guard(rcu)();
for (;;) {
@@ -2233,14 +2233,16 @@ static struct dentry *stash_dentry(struct dentry **stashed,
int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data,
struct path *path)
{
- struct dentry *dentry;
+ struct dentry *dentry, *res;
const struct stashed_operations *sops = mnt->mnt_sb->s_fs_info;
/* See if dentry can be reused. */
- path->dentry = stashed_dentry_get(stashed);
- if (path->dentry) {
+ res = stashed_dentry_get(stashed);
+ if (IS_ERR(res))
+ return PTR_ERR(res);
+ if (res) {
sops->put_data(data);
- goto out_path;
+ goto make_path;
}
/* Allocate a new dentry. */
@@ -2249,14 +2251,22 @@ int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data,
return PTR_ERR(dentry);
/* Added a new dentry. @data is now owned by the filesystem. */
- path->dentry = stash_dentry(stashed, dentry);
- if (path->dentry != dentry)
+ if (sops->stash_dentry)
+ res = sops->stash_dentry(stashed, dentry);
+ else
+ res = stash_dentry(stashed, dentry);
+ if (IS_ERR(res)) {
+ dput(dentry);
+ return PTR_ERR(res);
+ }
+ if (res != dentry)
dput(dentry);
-out_path:
- WARN_ON_ONCE(path->dentry->d_fsdata != stashed);
- WARN_ON_ONCE(d_inode(path->dentry)->i_private != data);
+make_path:
+ path->dentry = res;
path->mnt = mntget(mnt);
+ VFS_WARN_ON_ONCE(path->dentry->d_fsdata != stashed);
+ VFS_WARN_ON_ONCE(d_inode(path->dentry)->i_private != data);
return 0;
}
diff --git a/fs/namei.c b/fs/namei.c
index a1cd129a2e8b..cd43ff89fbaa 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1012,10 +1012,10 @@ static int set_root(struct nameidata *nd)
unsigned seq;
do {
- seq = read_seqcount_begin(&fs->seq);
+ seq = read_seqbegin(&fs->seq);
nd->root = fs->root;
nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
- } while (read_seqcount_retry(&fs->seq, seq));
+ } while (read_seqretry(&fs->seq, seq));
} else {
get_fs_root(fs, &nd->root);
nd->state |= ND_ROOT_GRABBED;
@@ -2571,11 +2571,11 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
unsigned seq;
do {
- seq = read_seqcount_begin(&fs->seq);
+ seq = read_seqbegin(&fs->seq);
nd->path = fs->pwd;
nd->inode = nd->path.dentry->d_inode;
nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
- } while (read_seqcount_retry(&fs->seq, seq));
+ } while (read_seqretry(&fs->seq, seq));
} else {
get_fs_pwd(current->fs, &nd->path);
nd->inode = nd->path.dentry->d_inode;
diff --git a/fs/pidfs.c b/fs/pidfs.c
index 4625e097e3a0..edc35522d75c 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -21,11 +21,23 @@
#include <linux/utsname.h>
#include <net/net_namespace.h>
#include <linux/coredump.h>
+#include <linux/xattr.h>
#include "internal.h"
#include "mount.h"
-static struct kmem_cache *pidfs_cachep __ro_after_init;
+#define PIDFS_PID_DEAD ERR_PTR(-ESRCH)
+
+static struct kmem_cache *pidfs_attr_cachep __ro_after_init;
+static struct kmem_cache *pidfs_xattr_cachep __ro_after_init;
+
+static struct path pidfs_root_path = {};
+
+void pidfs_get_root(struct path *path)
+{
+ *path = pidfs_root_path;
+ path_get(path);
+}
/*
* Stashes information that userspace needs to access even after the
@@ -37,17 +49,12 @@ struct pidfs_exit_info {
__u32 coredump_mask;
};
-struct pidfs_inode {
+struct pidfs_attr {
+ struct simple_xattrs *xattrs;
struct pidfs_exit_info __pei;
struct pidfs_exit_info *exit_info;
- struct inode vfs_inode;
};
-static inline struct pidfs_inode *pidfs_i(struct inode *inode)
-{
- return container_of(inode, struct pidfs_inode, vfs_inode);
-}
-
static struct rb_root pidfs_ino_tree = RB_ROOT;
#if BITS_PER_LONG == 32
@@ -125,6 +132,7 @@ void pidfs_add_pid(struct pid *pid)
pid->ino = pidfs_ino_nr;
pid->stashed = NULL;
+ pid->attr = NULL;
pidfs_ino_nr++;
write_seqcount_begin(&pidmap_lock_seq);
@@ -139,6 +147,33 @@ void pidfs_remove_pid(struct pid *pid)
write_seqcount_end(&pidmap_lock_seq);
}
+void pidfs_free_pid(struct pid *pid)
+{
+ struct pidfs_attr *attr __free(kfree) = no_free_ptr(pid->attr);
+ struct simple_xattrs *xattrs __free(kfree) = NULL;
+
+ /*
+ * Any dentry must've been wiped from the pid by now.
+ * Otherwise there's a reference count bug.
+ */
+ VFS_WARN_ON_ONCE(pid->stashed);
+
+ /*
+ * This if an error occurred during e.g., task creation that
+ * causes us to never go through the exit path.
+ */
+ if (unlikely(!attr))
+ return;
+
+ /* This never had a pidfd created. */
+ if (IS_ERR(attr))
+ return;
+
+ xattrs = no_free_ptr(attr->xattrs);
+ if (xattrs)
+ simple_xattrs_free(xattrs, NULL);
+}
+
#ifdef CONFIG_PROC_FS
/**
* pidfd_show_fdinfo - print information about a pidfd
@@ -261,13 +296,13 @@ static __u32 pidfs_coredump_mask(unsigned long mm_flags)
static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
{
struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg;
- struct inode *inode = file_inode(file);
struct pid *pid = pidfd_pid(file);
size_t usize = _IOC_SIZE(cmd);
struct pidfd_info kinfo = {};
struct pidfs_exit_info *exit_info;
struct user_namespace *user_ns;
struct task_struct *task;
+ struct pidfs_attr *attr;
const struct cred *c;
__u64 mask;
@@ -286,8 +321,9 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
if (!pid_in_current_pidns(pid))
return -ESRCH;
+ attr = READ_ONCE(pid->attr);
if (mask & PIDFD_INFO_EXIT) {
- exit_info = READ_ONCE(pidfs_i(inode)->exit_info);
+ exit_info = READ_ONCE(attr->exit_info);
if (exit_info) {
kinfo.mask |= PIDFD_INFO_EXIT;
#ifdef CONFIG_CGROUPS
@@ -300,7 +336,7 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
if (mask & PIDFD_INFO_COREDUMP) {
kinfo.mask |= PIDFD_INFO_COREDUMP;
- kinfo.coredump_mask = READ_ONCE(pidfs_i(inode)->__pei.coredump_mask);
+ kinfo.coredump_mask = READ_ONCE(attr->__pei.coredump_mask);
}
task = get_pid_task(pid, PIDTYPE_PID);
@@ -552,41 +588,61 @@ struct pid *pidfd_pid(const struct file *file)
* task has been reaped which cannot happen until we're out of
* release_task().
*
- * If this struct pid is referred to by a pidfd then
- * stashed_dentry_get() will return the dentry and inode for that struct
- * pid. Since we've taken a reference on it there's now an additional
- * reference from the exit path on it. Which is fine. We're going to put
- * it again in a second and we know that the pid is kept alive anyway.
+ * If this struct pid has at least once been referred to by a pidfd then
+ * pid->attr will be allocated. If not we mark the struct pid as dead so
+ * anyone who is trying to register it with pidfs will fail to do so.
+ * Otherwise we would hand out pidfs for reaped tasks without having
+ * exit information available.
*
- * Worst case is that we've filled in the info and immediately free the
- * dentry and inode afterwards since the pidfd has been closed. Since
+ * Worst case is that we've filled in the info and the pid gets freed
+ * right away in free_pid() when no one holds a pidfd anymore. Since
* pidfs_exit() currently is placed after exit_task_work() we know that
- * it cannot be us aka the exiting task holding a pidfd to ourselves.
+ * it cannot be us aka the exiting task holding a pidfd to itself.
*/
void pidfs_exit(struct task_struct *tsk)
{
- struct dentry *dentry;
+ struct pid *pid = task_pid(tsk);
+ struct pidfs_attr *attr;
+ struct pidfs_exit_info *exit_info;
+#ifdef CONFIG_CGROUPS
+ struct cgroup *cgrp;
+#endif
might_sleep();
- dentry = stashed_dentry_get(&task_pid(tsk)->stashed);
- if (dentry) {
- struct inode *inode = d_inode(dentry);
- struct pidfs_exit_info *exit_info = &pidfs_i(inode)->__pei;
-#ifdef CONFIG_CGROUPS
- struct cgroup *cgrp;
+ guard(spinlock_irq)(&pid->wait_pidfd.lock);
+ attr = pid->attr;
+ if (!attr) {
+ /*
+ * No one ever held a pidfd for this struct pid.
+ * Mark it as dead so no one can add a pidfs
+ * entry anymore. We're about to be reaped and
+ * so no exit information would be available.
+ */
+ pid->attr = PIDFS_PID_DEAD;
+ return;
+ }
- rcu_read_lock();
- cgrp = task_dfl_cgroup(tsk);
- exit_info->cgroupid = cgroup_id(cgrp);
- rcu_read_unlock();
+ /*
+ * If @pid->attr is set someone might still legitimately hold a
+ * pidfd to @pid or someone might concurrently still be getting
+ * a reference to an already stashed dentry from @pid->stashed.
+ * So defer cleaning @pid->attr until the last reference to @pid
+ * is put
+ */
+
+ exit_info = &attr->__pei;
+
+#ifdef CONFIG_CGROUPS
+ rcu_read_lock();
+ cgrp = task_dfl_cgroup(tsk);
+ exit_info->cgroupid = cgroup_id(cgrp);
+ rcu_read_unlock();
#endif
- exit_info->exit_code = tsk->exit_code;
+ exit_info->exit_code = tsk->exit_code;
- /* Ensure that PIDFD_GET_INFO sees either all or nothing. */
- smp_store_release(&pidfs_i(inode)->exit_info, &pidfs_i(inode)->__pei);
- dput(dentry);
- }
+ /* Ensure that PIDFD_GET_INFO sees either all or nothing. */
+ smp_store_release(&attr->exit_info, &attr->__pei);
}
#ifdef CONFIG_COREDUMP
@@ -594,16 +650,15 @@ void pidfs_coredump(const struct coredump_params *cprm)
{
struct pid *pid = cprm->pid;
struct pidfs_exit_info *exit_info;
- struct dentry *dentry;
- struct inode *inode;
+ struct pidfs_attr *attr;
__u32 coredump_mask = 0;
- dentry = pid->stashed;
- if (WARN_ON_ONCE(!dentry))
- return;
+ attr = READ_ONCE(pid->attr);
- inode = d_inode(dentry);
- exit_info = &pidfs_i(inode)->__pei;
+ VFS_WARN_ON_ONCE(!attr);
+ VFS_WARN_ON_ONCE(attr == PIDFS_PID_DEAD);
+
+ exit_info = &attr->__pei;
/* Note how we were coredumped. */
coredump_mask = pidfs_coredump_mask(cprm->mm_flags);
/* Note that we actually did coredump. */
@@ -634,9 +689,24 @@ static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path,
return anon_inode_getattr(idmap, path, stat, request_mask, query_flags);
}
+static ssize_t pidfs_listxattr(struct dentry *dentry, char *buf, size_t size)
+{
+ struct inode *inode = d_inode(dentry);
+ struct pid *pid = inode->i_private;
+ struct pidfs_attr *attr = pid->attr;
+ struct simple_xattrs *xattrs;
+
+ xattrs = READ_ONCE(attr->xattrs);
+ if (!xattrs)
+ return 0;
+
+ return simple_xattr_list(inode, xattrs, buf, size);
+}
+
static const struct inode_operations pidfs_inode_operations = {
- .getattr = pidfs_getattr,
- .setattr = pidfs_setattr,
+ .getattr = pidfs_getattr,
+ .setattr = pidfs_setattr,
+ .listxattr = pidfs_listxattr,
};
static void pidfs_evict_inode(struct inode *inode)
@@ -647,30 +717,9 @@ static void pidfs_evict_inode(struct inode *inode)
put_pid(pid);
}
-static struct inode *pidfs_alloc_inode(struct super_block *sb)
-{
- struct pidfs_inode *pi;
-
- pi = alloc_inode_sb(sb, pidfs_cachep, GFP_KERNEL);
- if (!pi)
- return NULL;
-
- memset(&pi->__pei, 0, sizeof(pi->__pei));
- pi->exit_info = NULL;
-
- return &pi->vfs_inode;
-}
-
-static void pidfs_free_inode(struct inode *inode)
-{
- kmem_cache_free(pidfs_cachep, pidfs_i(inode));
-}
-
static const struct super_operations pidfs_sops = {
- .alloc_inode = pidfs_alloc_inode,
.drop_inode = generic_delete_inode,
.evict_inode = pidfs_evict_inode,
- .free_inode = pidfs_free_inode,
.statfs = simple_statfs,
};
@@ -770,6 +819,8 @@ static struct dentry *pidfs_fh_to_dentry(struct super_block *sb,
if (ret < 0)
return ERR_PTR(ret);
+ VFS_WARN_ON_ONCE(!pid->attr);
+
mntput(path.mnt);
return path.dentry;
}
@@ -796,53 +847,8 @@ static int pidfs_export_permission(struct handle_to_path_ctx *ctx,
return 0;
}
-static inline bool pidfs_pid_valid(struct pid *pid, const struct path *path,
- unsigned int flags)
-{
- enum pid_type type;
-
- if (flags & PIDFD_STALE)
- return true;
-
- /*
- * Make sure that if a pidfd is created PIDFD_INFO_EXIT
- * information will be available. So after an inode for the
- * pidfd has been allocated perform another check that the pid
- * is still alive. If it is exit information is available even
- * if the task gets reaped before the pidfd is returned to
- * userspace. The only exception are indicated by PIDFD_STALE:
- *
- * (1) The kernel is in the middle of task creation and thus no
- * task linkage has been established yet.
- * (2) The caller knows @pid has been registered in pidfs at a
- * time when the task was still alive.
- *
- * In both cases exit information will have been reported.
- */
- if (flags & PIDFD_THREAD)
- type = PIDTYPE_PID;
- else
- type = PIDTYPE_TGID;
-
- /*
- * Since pidfs_exit() is called before struct pid's task linkage
- * is removed the case where the task got reaped but a dentry
- * was already attached to struct pid and exit information was
- * recorded and published can be handled correctly.
- */
- if (unlikely(!pid_has_task(pid, type))) {
- struct inode *inode = d_inode(path->dentry);
- return !!READ_ONCE(pidfs_i(inode)->exit_info);
- }
-
- return true;
-}
-
static struct file *pidfs_export_open(struct path *path, unsigned int oflags)
{
- if (!pidfs_pid_valid(d_inode(path->dentry)->i_private, path, oflags))
- return ERR_PTR(-ESRCH);
-
/*
* Clear O_LARGEFILE as open_by_handle_at() forces it and raise
* O_RDWR as pidfds always are.
@@ -864,6 +870,8 @@ static int pidfs_init_inode(struct inode *inode, void *data)
inode->i_private = data;
inode->i_flags |= S_PRIVATE | S_ANON_INODE;
+ /* We allow to set xattrs. */
+ inode->i_flags &= ~S_IMMUTABLE;
inode->i_mode |= S_IRWXU;
inode->i_op = &pidfs_inode_operations;
inode->i_fop = &pidfs_file_operations;
@@ -878,9 +886,127 @@ static void pidfs_put_data(void *data)
put_pid(pid);
}
+/**
+ * pidfs_register_pid - register a struct pid in pidfs
+ * @pid: pid to pin
+ *
+ * Register a struct pid in pidfs.
+ *
+ * Return: On success zero, on error a negative error code is returned.
+ */
+int pidfs_register_pid(struct pid *pid)
+{
+ struct pidfs_attr *new_attr __free(kfree) = NULL;
+ struct pidfs_attr *attr;
+
+ might_sleep();
+
+ if (!pid)
+ return 0;
+
+ attr = READ_ONCE(pid->attr);
+ if (unlikely(attr == PIDFS_PID_DEAD))
+ return PTR_ERR(PIDFS_PID_DEAD);
+ if (attr)
+ return 0;
+
+ new_attr = kmem_cache_zalloc(pidfs_attr_cachep, GFP_KERNEL);
+ if (!new_attr)
+ return -ENOMEM;
+
+ /* Synchronize with pidfs_exit(). */
+ guard(spinlock_irq)(&pid->wait_pidfd.lock);
+
+ attr = pid->attr;
+ if (unlikely(attr == PIDFS_PID_DEAD))
+ return PTR_ERR(PIDFS_PID_DEAD);
+ if (unlikely(attr))
+ return 0;
+
+ pid->attr = no_free_ptr(new_attr);
+ return 0;
+}
+
+static struct dentry *pidfs_stash_dentry(struct dentry **stashed,
+ struct dentry *dentry)
+{
+ int ret;
+ struct pid *pid = d_inode(dentry)->i_private;
+
+ VFS_WARN_ON_ONCE(stashed != &pid->stashed);
+
+ ret = pidfs_register_pid(pid);
+ if (ret)
+ return ERR_PTR(ret);
+
+ return stash_dentry(stashed, dentry);
+}
+
static const struct stashed_operations pidfs_stashed_ops = {
- .init_inode = pidfs_init_inode,
- .put_data = pidfs_put_data,
+ .stash_dentry = pidfs_stash_dentry,
+ .init_inode = pidfs_init_inode,
+ .put_data = pidfs_put_data,
+};
+
+static int pidfs_xattr_get(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *suffix, void *value, size_t size)
+{
+ struct pid *pid = inode->i_private;
+ struct pidfs_attr *attr = pid->attr;
+ const char *name;
+ struct simple_xattrs *xattrs;
+
+ xattrs = READ_ONCE(attr->xattrs);
+ if (!xattrs)
+ return 0;
+
+ name = xattr_full_name(handler, suffix);
+ return simple_xattr_get(xattrs, name, value, size);
+}
+
+static int pidfs_xattr_set(const struct xattr_handler *handler,
+ struct mnt_idmap *idmap, struct dentry *unused,
+ struct inode *inode, const char *suffix,
+ const void *value, size_t size, int flags)
+{
+ struct pid *pid = inode->i_private;
+ struct pidfs_attr *attr = pid->attr;
+ const char *name;
+ struct simple_xattrs *xattrs;
+ struct simple_xattr *old_xattr;
+
+ /* Ensure we're the only one to set @attr->xattrs. */
+ WARN_ON_ONCE(!inode_is_locked(inode));
+
+ xattrs = READ_ONCE(attr->xattrs);
+ if (!xattrs) {
+ xattrs = kmem_cache_zalloc(pidfs_xattr_cachep, GFP_KERNEL);
+ if (!xattrs)
+ return -ENOMEM;
+
+ simple_xattrs_init(xattrs);
+ smp_store_release(&pid->attr->xattrs, xattrs);
+ }
+
+ name = xattr_full_name(handler, suffix);
+ old_xattr = simple_xattr_set(xattrs, name, value, size, flags);
+ if (IS_ERR(old_xattr))
+ return PTR_ERR(old_xattr);
+
+ simple_xattr_free(old_xattr);
+ return 0;
+}
+
+static const struct xattr_handler pidfs_trusted_xattr_handler = {
+ .prefix = XATTR_TRUSTED_PREFIX,
+ .get = pidfs_xattr_get,
+ .set = pidfs_xattr_set,
+};
+
+static const struct xattr_handler *const pidfs_xattr_handlers[] = {
+ &pidfs_trusted_xattr_handler,
+ NULL
};
static int pidfs_init_fs_context(struct fs_context *fc)
@@ -891,9 +1017,12 @@ static int pidfs_init_fs_context(struct fs_context *fc)
if (!ctx)
return -ENOMEM;
+ fc->s_iflags |= SB_I_NOEXEC;
+ fc->s_iflags |= SB_I_NODEV;
ctx->ops = &pidfs_sops;
ctx->eops = &pidfs_export_operations;
ctx->dops = &pidfs_dentry_operations;
+ ctx->xattr = pidfs_xattr_handlers;
fc->s_fs_info = (void *)&pidfs_stashed_ops;
return 0;
}
@@ -921,8 +1050,7 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
if (ret < 0)
return ERR_PTR(ret);
- if (!pidfs_pid_valid(pid, &path, flags))
- return ERR_PTR(-ESRCH);
+ VFS_WARN_ON_ONCE(!pid->attr);
flags &= ~PIDFD_STALE;
flags |= O_RDWR;
@@ -934,79 +1062,21 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
return pidfd_file;
}
-/**
- * pidfs_register_pid - register a struct pid in pidfs
- * @pid: pid to pin
- *
- * Register a struct pid in pidfs. Needs to be paired with
- * pidfs_put_pid() to not risk leaking the pidfs dentry and inode.
- *
- * Return: On success zero, on error a negative error code is returned.
- */
-int pidfs_register_pid(struct pid *pid)
-{
- struct path path __free(path_put) = {};
- int ret;
-
- might_sleep();
-
- if (!pid)
- return 0;
-
- ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path);
- if (unlikely(ret))
- return ret;
- /* Keep the dentry and only put the reference to the mount. */
- path.dentry = NULL;
- return 0;
-}
-
-/**
- * pidfs_get_pid - pin a struct pid through pidfs
- * @pid: pid to pin
- *
- * Similar to pidfs_register_pid() but only valid if the caller knows
- * there's a reference to the @pid through a dentry already that can't
- * go away.
- */
-void pidfs_get_pid(struct pid *pid)
-{
- if (!pid)
- return;
- WARN_ON_ONCE(!stashed_dentry_get(&pid->stashed));
-}
-
-/**
- * pidfs_put_pid - drop a pidfs reference
- * @pid: pid to drop
- *
- * Drop a reference to @pid via pidfs. This is only safe if the
- * reference has been taken via pidfs_get_pid().
- */
-void pidfs_put_pid(struct pid *pid)
-{
- might_sleep();
-
- if (!pid)
- return;
- VFS_WARN_ON_ONCE(!pid->stashed);
- dput(pid->stashed);
-}
-
-static void pidfs_inode_init_once(void *data)
-{
- struct pidfs_inode *pi = data;
-
- inode_init_once(&pi->vfs_inode);
-}
-
void __init pidfs_init(void)
{
- pidfs_cachep = kmem_cache_create("pidfs_cache", sizeof(struct pidfs_inode), 0,
+ pidfs_attr_cachep = kmem_cache_create("pidfs_attr_cache", sizeof(struct pidfs_attr), 0,
(SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
- SLAB_ACCOUNT | SLAB_PANIC),
- pidfs_inode_init_once);
+ SLAB_ACCOUNT | SLAB_PANIC), NULL);
+
+ pidfs_xattr_cachep = kmem_cache_create("pidfs_xattr_cache",
+ sizeof(struct simple_xattrs), 0,
+ (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
+ SLAB_ACCOUNT | SLAB_PANIC), NULL);
+
pidfs_mnt = kern_mount(&pidfs_type);
if (IS_ERR(pidfs_mnt))
panic("Failed to mount pidfs pseudo filesystem");
+
+ pidfs_root_path.mnt = pidfs_mnt;
+ pidfs_root_path.dentry = pidfs_mnt->mnt_root;
}