diff options
| author | Christian Brauner <brauner@kernel.org> | 2025-10-29 13:20:32 +0100 |
|---|---|---|
| committer | Christian Brauner <brauner@kernel.org> | 2025-11-03 17:41:18 +0100 |
| commit | 76b6f5dfb3fda76fce1f9990d6fa58adc711122b (patch) | |
| tree | f808b164b58924e4111639f6ff5b13b71a849e1f /kernel/nstree.c | |
| parent | 560e25e70fa40ec69f97f14207bde9bc18bec9b8 (diff) | |
nstree: add listns()
Add a new listns() system call that allows userspace to iterate through
namespaces in the system. This provides a programmatic interface to
discover and inspect namespaces, enhancing the existing namespace APIs.
Currently, there is no direct way for userspace to enumerate namespaces
in the system. Applications must resort to scanning /proc/<pid>/ns/
across all processes, which is:
1. Inefficient - requires iterating over all processes
2. Incomplete - misses namespaces that aren't attached to any
running process but are kept alive by file descriptors, bind mounts,
or parent namespace references
3. Permission-heavy - requires access to /proc for many processes
4. Unstructured - exposes no ordering or ownership information
5. Unfiltered - offers no filtering per namespace type; callers must
always iterate and check all namespaces
The list goes on. The listns() system call solves these problems by
providing direct kernel-level enumeration of namespaces. It is similar
to listmount() but obviously tailored to namespaces.
/*
 * listns() - enumerate the IDs of namespaces visible to the caller
 *
 * @req: Pointer to struct ns_id_req specifying search parameters
 * @ns_ids: User buffer to receive namespace IDs
 * @nr_ns_ids: Size of ns_ids buffer (maximum number of IDs to return);
 *             values above 1000000 are rejected with -EOVERFLOW
 * @flags: Reserved for future use (must be 0, otherwise -EINVAL)
 */
ssize_t listns(const struct ns_id_req *req, u64 *ns_ids,
size_t nr_ns_ids, unsigned int flags);
Returns:
- On success: Number of namespace IDs written to ns_ids
- On error: Negative error code
/*
 * struct ns_id_req - search parameters passed to listns()
 *
 * @size: Structure size. Sizes smaller than the first published version
 *        of the structure are rejected with -EINVAL, sizes larger than
 *        PAGE_SIZE with -E2BIG.
 * @spare: Reserved, must be 0 (a non-zero value is rejected with -EINVAL)
 * @ns_id: Starting point for iteration; use 0 for first call, then
 *         use the last returned ID for subsequent calls to paginate.
 *         The cursor is exclusive: a non-zero value resumes at the
 *         smallest namespace ID greater than @ns_id, and the call fails
 *         with -ENOENT if no such namespace can be found.
 * @ns_type: Bitmask of namespace types to include (from enum ns_type):
 *           0: Return all namespace types
 *           MNT_NS: Mount namespaces
 *           NET_NS: Network namespaces
 *           USER_NS: User namespaces
 *           etc. Can be OR'd together. Bits outside the known namespace
 *           types are rejected with -EOPNOTSUPP.
 * @spare2: Reserved, callers should pass 0
 * @user_ns_id: Filter results to namespaces owned by this user namespace:
 *              0: Return all namespaces (subject to permission checks)
 *              LISTNS_CURRENT_USER: Namespaces owned by caller's user namespace
 *              Other value: Namespaces owned by the specified user namespace ID
 *              (-EINVAL if that user namespace cannot be looked up)
 */
struct ns_id_req {
__u32 size; /* sizeof(struct ns_id_req) */
__u32 spare; /* Reserved, must be 0 */
__u64 ns_id; /* Last seen namespace ID (exclusive pagination cursor) */
__u32 ns_type; /* Filter by namespace type(s); 0 selects every type */
__u32 spare2; /* Reserved, must be 0 */
__u64 user_ns_id; /* Filter by owning user namespace; 0 disables the filter */
};
Example 1: List all namespaces
/*
 * Print the ID of every namespace the caller is allowed to see.
 *
 * The buffer is refilled until a batch comes back short. Once a cursor
 * (req.ns_id) is set, the kernel answers -ENOENT when nothing lies beyond
 * it, so that error is treated as the normal end of the enumeration
 * rather than a failure. Needs <errno.h> for errno/ENOENT.
 */
void list_all_namespaces(void)
{
	struct ns_id_req req = {
		.size = sizeof(req),
		.ns_id = 0, /* Start from beginning */
		.ns_type = 0, /* All types */
		.user_ns_id = 0, /* All user namespaces */
	};
	uint64_t ids[100];
	const size_t capacity = sizeof(ids) / sizeof(ids[0]);
	ssize_t ret;

	printf("All namespaces in the system:\n");
	do {
		ret = listns(&req, ids, capacity, 0);
		if (ret < 0) {
			/* Previous batch ended exactly on the last namespace. */
			if (req.ns_id && errno == ENOENT)
				break;
			perror("listns");
			break;
		}

		for (ssize_t i = 0; i < ret; i++)
			printf(" Namespace ID: %llu\n", (unsigned long long)ids[i]);

		/* Continue from last seen ID */
		if (ret > 0)
			req.ns_id = ids[ret - 1];
	} while ((size_t)ret == capacity); /* Buffer was full, more may exist */
}
Example 2: List network namespaces only
void list_network_namespaces(void)
{
struct ns_id_req req = {
.size = sizeof(req),
.ns_id = 0,
.ns_type = NET_NS, /* Only network namespaces */
.user_ns_id = 0,
};
uint64_t ids[100];
ssize_t ret;
ret = listns(&req, ids, 100, 0);
if (ret < 0) {
perror("listns");
return;
}
printf("Network namespaces: %zd found\n", ret);
for (ssize_t i = 0; i < ret; i++)
printf(" netns ID: %llu\n", (unsigned long long)ids[i]);
}
Example 3: List namespaces owned by current user namespace
void list_owned_namespaces(void)
{
struct ns_id_req req = {
.size = sizeof(req),
.ns_id = 0,
.ns_type = 0, /* All types */
.user_ns_id = LISTNS_CURRENT_USER, /* Current userns */
};
uint64_t ids[100];
ssize_t ret;
ret = listns(&req, ids, 100, 0);
if (ret < 0) {
perror("listns");
return;
}
printf("Namespaces owned by my user namespace: %zd\n", ret);
for (ssize_t i = 0; i < ret; i++)
printf(" ns ID: %llu\n", (unsigned long long)ids[i]);
}
Example 4: List multiple namespace types
/*
 * Count the network and mount namespaces visible to the caller (single
 * batch of at most 100 entries).
 *
 * Several namespace types are selected by OR-ing their bits in ns_type.
 * A failing listns() is reported via perror() instead of being printed
 * as a (negative) count.
 */
void list_network_and_mount_namespaces(void)
{
	struct ns_id_req req = {
		.size = sizeof(req),
		.ns_id = 0,
		.ns_type = NET_NS | MNT_NS, /* Network and mount */
		.user_ns_id = 0,
	};
	uint64_t ids[100];
	ssize_t ret;

	ret = listns(&req, ids, 100, 0);
	if (ret < 0) {
		perror("listns");
		return;
	}

	printf("Network and mount namespaces: %zd found\n", ret);
}
Example 5: Pagination through large namespace sets
/*
 * Walk all visible namespaces in batches of 50 and print per-batch and
 * total counts.
 *
 * The last ID of each batch becomes the (exclusive) cursor of the next
 * call. Enumeration ends on a short batch, on an empty batch, or when
 * the kernel reports -ENOENT for a non-zero cursor, which means no
 * namespace lies beyond it. Needs <errno.h> for errno/ENOENT.
 */
void list_all_with_pagination(void)
{
	struct ns_id_req req = {
		.size = sizeof(req),
		.ns_id = 0,
		.ns_type = 0,
		.user_ns_id = 0,
	};
	uint64_t ids[50];
	const size_t batch = sizeof(ids) / sizeof(ids[0]);
	size_t total = 0;
	ssize_t ret;

	printf("Enumerating all namespaces with pagination:\n");
	for (;;) {
		ret = listns(&req, ids, batch, 0);
		if (ret < 0) {
			/* Cursor already points at the last namespace: done. */
			if (req.ns_id && errno == ENOENT)
				break;
			perror("listns");
			break;
		}
		if (ret == 0)
			break; /* No more namespaces */

		total += ret;
		printf(" Batch: %zd namespaces\n", ret);

		/* Last ID in this batch becomes start of next batch */
		req.ns_id = ids[ret - 1];

		if ((size_t)ret < batch)
			break; /* Partial batch = end of results */
	}
	printf("Total: %zu namespaces\n", total);
}
Permission Model
listns() respects namespace isolation and capabilities:
(1) Global listing (user_ns_id = 0):
- Requires CAP_SYS_ADMIN in the namespace's owning user namespace
- OR the namespace must be in the caller's namespace context (e.g.,
a namespace the caller is currently using)
- User namespaces additionally allow listing if the caller has
CAP_SYS_ADMIN in that user namespace itself
(2) Owner-filtered listing (user_ns_id != 0):
- Requires CAP_SYS_ADMIN in the specified owner user namespace
- OR the namespace must be in the caller's namespace context
- This allows unprivileged processes to enumerate namespaces they own
(3) Visibility:
- Only "active" namespaces are listed
- A namespace is active if it has a non-zero __ns_ref_active count
- This includes namespaces used by running processes, held by open
file descriptors, or kept active by bind mounts
- Inactive namespaces (kept alive only by internal kernel
references) are not visible via listns()
Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-19-2e6f823ebdc0@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
Diffstat (limited to 'kernel/nstree.c')
| -rw-r--r-- | kernel/nstree.c | 397 |
1 files changed, 397 insertions, 0 deletions
diff --git a/kernel/nstree.c b/kernel/nstree.c index dcad6a308547..4a8838683b6b 100644 --- a/kernel/nstree.c +++ b/kernel/nstree.c @@ -5,6 +5,7 @@ #include <linux/proc_ns.h> #include <linux/rculist.h> #include <linux/vfsdebug.h> +#include <linux/syscalls.h> #include <linux/user_namespace.h> static __cacheline_aligned_in_smp DEFINE_SEQLOCK(ns_tree_lock); @@ -359,3 +360,399 @@ u64 __ns_tree_gen_id(struct ns_common *ns, u64 id) ns->ns_id = atomic64_inc_return(&namespace_cookie); return ns->ns_id; } + +struct klistns { + u64 __user *uns_ids; + u32 nr_ns_ids; + u64 last_ns_id; + u64 user_ns_id; + u32 ns_type; + struct user_namespace *user_ns; + bool userns_capable; + struct ns_common *first_ns; +}; + +static void __free_klistns_free(const struct klistns *kls) +{ + if (kls->user_ns_id != LISTNS_CURRENT_USER) + put_user_ns(kls->user_ns); + if (kls->first_ns && kls->first_ns->ops) + kls->first_ns->ops->put(kls->first_ns); +} + +#define NS_ALL (PID_NS | USER_NS | MNT_NS | UTS_NS | IPC_NS | NET_NS | CGROUP_NS | TIME_NS) + +static int copy_ns_id_req(const struct ns_id_req __user *req, + struct ns_id_req *kreq) +{ + int ret; + size_t usize; + + BUILD_BUG_ON(sizeof(struct ns_id_req) != NS_ID_REQ_SIZE_VER0); + + ret = get_user(usize, &req->size); + if (ret) + return -EFAULT; + if (unlikely(usize > PAGE_SIZE)) + return -E2BIG; + if (unlikely(usize < NS_ID_REQ_SIZE_VER0)) + return -EINVAL; + memset(kreq, 0, sizeof(*kreq)); + ret = copy_struct_from_user(kreq, sizeof(*kreq), req, usize); + if (ret) + return ret; + if (kreq->spare != 0) + return -EINVAL; + if (kreq->ns_type & ~NS_ALL) + return -EOPNOTSUPP; + return 0; +} + +static inline int prepare_klistns(struct klistns *kls, struct ns_id_req *kreq, + u64 __user *ns_ids, size_t nr_ns_ids) +{ + kls->last_ns_id = kreq->ns_id; + kls->user_ns_id = kreq->user_ns_id; + kls->nr_ns_ids = nr_ns_ids; + kls->ns_type = kreq->ns_type; + kls->uns_ids = ns_ids; + return 0; +} + +/* + * Lookup a namespace owned by owner with id >= ns_id. 
+ * Returns the namespace with the smallest id that is >= ns_id. + */ +static struct ns_common *lookup_ns_owner_at(u64 ns_id, struct ns_common *owner) +{ + struct ns_common *ret = NULL; + struct rb_node *node; + + VFS_WARN_ON_ONCE(owner->ns_type != CLONE_NEWUSER); + + read_seqlock_excl(&ns_tree_lock); + node = owner->ns_owner_tree.rb_node; + + while (node) { + struct ns_common *ns; + + ns = node_to_ns_owner(node); + if (ns_id <= ns->ns_id) { + ret = ns; + if (ns_id == ns->ns_id) + break; + node = node->rb_left; + } else { + node = node->rb_right; + } + } + + if (ret) + ret = ns_get_unless_inactive(ret); + read_sequnlock_excl(&ns_tree_lock); + return ret; +} + +static struct ns_common *lookup_ns_id(u64 mnt_ns_id, int ns_type) +{ + struct ns_common *ns; + + guard(rcu)(); + ns = ns_tree_lookup_rcu(mnt_ns_id, ns_type); + if (!ns) + return NULL; + + if (!ns_get_unless_inactive(ns)) + return NULL; + + return ns; +} + +static inline bool __must_check ns_requested(const struct klistns *kls, + const struct ns_common *ns) +{ + return !kls->ns_type || (kls->ns_type & ns->ns_type); +} + +static inline bool __must_check may_list_ns(const struct klistns *kls, + struct ns_common *ns) +{ + if (kls->user_ns) { + if (kls->userns_capable) + return true; + } else { + struct ns_common *owner; + struct user_namespace *user_ns; + + owner = ns_owner(ns); + if (owner) + user_ns = to_user_ns(owner); + else + user_ns = &init_user_ns; + if (ns_capable_noaudit(user_ns, CAP_SYS_ADMIN)) + return true; + } + + if (is_current_namespace(ns)) + return true; + + if (ns->ns_type != CLONE_NEWUSER) + return false; + + if (ns_capable_noaudit(to_user_ns(ns), CAP_SYS_ADMIN)) + return true; + + return false; +} + +static void __ns_put(struct ns_common *ns) +{ + if (ns->ops) + ns->ops->put(ns); +} + +DEFINE_FREE(ns_put, struct ns_common *, if (!IS_ERR_OR_NULL(_T)) __ns_put(_T)) + +static inline struct ns_common *__must_check legitimize_ns(const struct klistns *kls, + struct ns_common *candidate) +{ + struct 
ns_common *ns __free(ns_put) = NULL; + + if (!ns_requested(kls, candidate)) + return NULL; + + ns = ns_get_unless_inactive(candidate); + if (!ns) + return NULL; + + if (!may_list_ns(kls, ns)) + return NULL; + + return no_free_ptr(ns); +} + +static ssize_t do_listns_userns(struct klistns *kls) +{ + u64 __user *ns_ids = kls->uns_ids; + size_t nr_ns_ids = kls->nr_ns_ids; + struct ns_common *ns = NULL, *first_ns = NULL; + const struct list_head *head; + ssize_t ret; + + VFS_WARN_ON_ONCE(!kls->user_ns_id); + + if (kls->user_ns_id == LISTNS_CURRENT_USER) + ns = to_ns_common(current_user_ns()); + else if (kls->user_ns_id) + ns = lookup_ns_id(kls->user_ns_id, CLONE_NEWUSER); + if (!ns) + return -EINVAL; + kls->user_ns = to_user_ns(ns); + + /* + * Use the rbtree to find the first namespace we care about and + * then use it's list entry to iterate from there. + */ + if (kls->last_ns_id) { + kls->first_ns = lookup_ns_owner_at(kls->last_ns_id + 1, ns); + if (!kls->first_ns) + return -ENOENT; + first_ns = kls->first_ns; + } + + ret = 0; + head = &to_ns_common(kls->user_ns)->ns_owner; + kls->userns_capable = ns_capable_noaudit(kls->user_ns, CAP_SYS_ADMIN); + + rcu_read_lock(); + + if (!first_ns) + first_ns = list_entry_rcu(head->next, typeof(*ns), ns_owner_entry); + for (ns = first_ns; &ns->ns_owner_entry != head && nr_ns_ids; + ns = list_entry_rcu(ns->ns_owner_entry.next, typeof(*ns), ns_owner_entry)) { + struct ns_common *valid __free(ns_put); + + valid = legitimize_ns(kls, ns); + if (!valid) + continue; + + rcu_read_unlock(); + + if (put_user(valid->ns_id, ns_ids + ret)) + return -EINVAL; + nr_ns_ids--; + ret++; + + rcu_read_lock(); + } + + rcu_read_unlock(); + return ret; +} + +/* + * Lookup a namespace with id >= ns_id in either the unified tree or a type-specific tree. + * Returns the namespace with the smallest id that is >= ns_id. 
+ */ +static struct ns_common *lookup_ns_id_at(u64 ns_id, int ns_type) +{ + struct ns_common *ret = NULL; + struct ns_tree *ns_tree = NULL; + struct rb_node *node; + + if (ns_type) { + ns_tree = ns_tree_from_type(ns_type); + if (!ns_tree) + return NULL; + } + + read_seqlock_excl(&ns_tree_lock); + if (ns_tree) + node = ns_tree->ns_tree.rb_node; + else + node = ns_unified_tree.rb_node; + + while (node) { + struct ns_common *ns; + + if (ns_type) + ns = node_to_ns(node); + else + ns = node_to_ns_unified(node); + + if (ns_id <= ns->ns_id) { + if (ns_type) + ret = node_to_ns(node); + else + ret = node_to_ns_unified(node); + if (ns_id == ns->ns_id) + break; + node = node->rb_left; + } else { + node = node->rb_right; + } + } + + if (ret) + ret = ns_get_unless_inactive(ret); + read_sequnlock_excl(&ns_tree_lock); + return ret; +} + +static inline struct ns_common *first_ns_common(const struct list_head *head, + struct ns_tree *ns_tree) +{ + if (ns_tree) + return list_entry_rcu(head->next, struct ns_common, ns_list_node); + return list_entry_rcu(head->next, struct ns_common, ns_unified_list_node); +} + +static inline struct ns_common *next_ns_common(struct ns_common *ns, + struct ns_tree *ns_tree) +{ + if (ns_tree) + return list_entry_rcu(ns->ns_list_node.next, struct ns_common, ns_list_node); + return list_entry_rcu(ns->ns_unified_list_node.next, struct ns_common, ns_unified_list_node); +} + +static inline bool ns_common_is_head(struct ns_common *ns, + const struct list_head *head, + struct ns_tree *ns_tree) +{ + if (ns_tree) + return &ns->ns_list_node == head; + return &ns->ns_unified_list_node == head; +} + +static ssize_t do_listns(struct klistns *kls) +{ + u64 __user *ns_ids = kls->uns_ids; + size_t nr_ns_ids = kls->nr_ns_ids; + struct ns_common *ns, *first_ns = NULL; + struct ns_tree *ns_tree = NULL; + const struct list_head *head; + u32 ns_type; + ssize_t ret; + + if (hweight32(kls->ns_type) == 1) + ns_type = kls->ns_type; + else + ns_type = 0; + + if (ns_type) { + 
ns_tree = ns_tree_from_type(ns_type); + if (!ns_tree) + return -EINVAL; + } + + if (kls->last_ns_id) { + kls->first_ns = lookup_ns_id_at(kls->last_ns_id + 1, ns_type); + if (!kls->first_ns) + return -ENOENT; + first_ns = kls->first_ns; + } + + ret = 0; + if (ns_tree) + head = &ns_tree->ns_list; + else + head = &ns_unified_list; + + rcu_read_lock(); + + if (!first_ns) + first_ns = first_ns_common(head, ns_tree); + + for (ns = first_ns; !ns_common_is_head(ns, head, ns_tree) && nr_ns_ids; + ns = next_ns_common(ns, ns_tree)) { + struct ns_common *valid __free(ns_put); + + valid = legitimize_ns(kls, ns); + if (!valid) + continue; + + rcu_read_unlock(); + + if (put_user(valid->ns_id, ns_ids + ret)) + return -EINVAL; + + nr_ns_ids--; + ret++; + + rcu_read_lock(); + } + + rcu_read_unlock(); + return ret; +} + +SYSCALL_DEFINE4(listns, const struct ns_id_req __user *, req, + u64 __user *, ns_ids, size_t, nr_ns_ids, unsigned int, flags) +{ + struct klistns klns __free(klistns_free) = {}; + const size_t maxcount = 1000000; + struct ns_id_req kreq; + ssize_t ret; + + if (flags) + return -EINVAL; + + if (unlikely(nr_ns_ids > maxcount)) + return -EOVERFLOW; + + if (!access_ok(ns_ids, nr_ns_ids * sizeof(*ns_ids))) + return -EFAULT; + + ret = copy_ns_id_req(req, &kreq); + if (ret) + return ret; + + ret = prepare_klistns(&klns, &kreq, ns_ids, nr_ns_ids); + if (ret) + return ret; + + if (kreq.user_ns_id) + return do_listns_userns(&klns); + + return do_listns(&klns); +} |