Diffstat (limited to 'tools')
 tools/sched_ext/Makefile                           |   4
 tools/sched_ext/include/scx/common.bpf.h           |  15
 tools/sched_ext/include/scx/compat.bpf.h           | 314
 tools/sched_ext/include/scx/compat.h               |  14
 tools/sched_ext/scx_cpu0.bpf.c                     |  88
 tools/sched_ext/scx_cpu0.c                         | 106
 tools/sched_ext/scx_flatcg.bpf.c                   |  10
 tools/sched_ext/scx_qmap.bpf.c                     |  52
 tools/testing/selftests/sched_ext/Makefile         |   1
 tools/testing/selftests/sched_ext/peek_dsq.bpf.c   | 251
 tools/testing/selftests/sched_ext/peek_dsq.c       | 224
 11 files changed, 950 insertions(+), 129 deletions(-)
diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile
index d68780e2e03d..e4bda2474060 100644
--- a/tools/sched_ext/Makefile
+++ b/tools/sched_ext/Makefile
@@ -133,6 +133,7 @@ $(MAKE_DIRS):
 	$(call msg,MKDIR,,$@)
 	$(Q)mkdir -p $@
 
+ifneq ($(CROSS_COMPILE),)
 $(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \
 	   $(APIDIR)/linux/bpf.h \
 	   | $(OBJ_DIR)/libbpf
@@ -141,6 +142,7 @@ $(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \
 		    EXTRA_CFLAGS='-g -O0 -fPIC' \
 		    LDFLAGS="$(LDFLAGS)" \
 		    DESTDIR=$(OUTPUT_DIR) prefix= all install_headers
+endif
 
 $(HOST_BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \
 	   $(APIDIR)/linux/bpf.h \
@@ -187,7 +189,7 @@ $(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BP
 
 SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR)
 
-c-sched-targets = scx_simple scx_qmap scx_central scx_flatcg
+c-sched-targets = scx_simple scx_cpu0 scx_qmap scx_central scx_flatcg
 
 $(addprefix $(BINDIR)/,$(c-sched-targets)): \
 	$(BINDIR)/%: \
diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h
index 06e2551033cb..821d5791bd42 100644
--- a/tools/sched_ext/include/scx/common.bpf.h
+++ b/tools/sched_ext/include/scx/common.bpf.h
@@ -60,21 +60,15 @@ static inline void ___vmlinux_h_sanity_check___(void)
 s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym;
 s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) __ksym;
-s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
-			   const struct cpumask *cpus_allowed, u64 flags) __ksym __weak;
-void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak;
-void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak;
+s32 __scx_bpf_select_cpu_and(struct task_struct *p, const struct cpumask *cpus_allowed,
+			     struct scx_bpf_select_cpu_and_args *args) __ksym __weak;
+bool __scx_bpf_dsq_insert_vtime(struct task_struct *p, struct scx_bpf_dsq_insert_vtime_args *args) __ksym __weak;
 u32 scx_bpf_dispatch_nr_slots(void) __ksym;
 void scx_bpf_dispatch_cancel(void) __ksym;
-bool scx_bpf_dsq_move_to_local(u64 dsq_id) __ksym __weak;
-void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak;
-void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak;
-bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
-bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
-u32 scx_bpf_reenqueue_local(void) __ksym;
 void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym;
 s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym;
 void scx_bpf_destroy_dsq(u64 dsq_id) __ksym;
+struct task_struct *scx_bpf_dsq_peek(u64 dsq_id) __ksym __weak;
 int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, u64 flags) __ksym __weak;
 struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) __ksym __weak;
 void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) __ksym __weak;
@@ -105,7 +99,6 @@ s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym;
 struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym;
 struct rq *scx_bpf_locked_rq(void) __ksym;
 struct task_struct *scx_bpf_cpu_curr(s32 cpu) __ksym __weak;
-struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym __weak;
 u64 scx_bpf_now(void) __ksym __weak;
 void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __weak;
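Note: the declarations above are tagged __ksym __weak, so a program still loads on
kernels where a given kfunc is missing; bpf_ksym_exists() then resolves to a constant
at load time and the verifier dead-code-eliminates the untaken branch. A minimal
illustration of the pattern (not part of this patch):

	/* guard an optional kfunc; the else branch is compiled out on v6.19+ */
	if (bpf_ksym_exists(scx_bpf_dsq_peek))
		p = scx_bpf_dsq_peek(dsq_id);
	else
		p = NULL;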
diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h
index dd9144624dc9..f2969c3061a7 100644
--- a/tools/sched_ext/include/scx/compat.bpf.h
+++ b/tools/sched_ext/include/scx/compat.bpf.h
@@ -16,119 +16,92 @@
 })
 
 /* v6.12: 819513666966 ("sched_ext: Add cgroup support") */
-#define __COMPAT_scx_bpf_task_cgroup(p) \
-	(bpf_ksym_exists(scx_bpf_task_cgroup) ? \
-	 scx_bpf_task_cgroup((p)) : NULL)
+struct cgroup *scx_bpf_task_cgroup___new(struct task_struct *p) __ksym __weak;
+
+#define scx_bpf_task_cgroup(p) \
+	(bpf_ksym_exists(scx_bpf_task_cgroup___new) ? \
+	 scx_bpf_task_cgroup___new((p)) : NULL)
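Note: the renamed macro keeps the kfunc's semantics — the caller owns the returned
cgroup reference and must release it — and on pre-v6.12 kernels it evaluates to NULL.
A minimal sketch of a caller, using a hypothetical ops.runnable callback that is not
part of this patch:

	void BPF_STRUCT_OPS(example_runnable, struct task_struct *p, u64 enq_flags)
	{
		/* NULL on kernels without sched_ext cgroup support */
		struct cgroup *cgrp = scx_bpf_task_cgroup(p);

		if (!cgrp)
			return;
		/* ... consult per-cgroup state ... */
		bpf_cgroup_release(cgrp);
	}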
 
 /*
  * v6.13: The verb `dispatch` was too overloaded and confusing. kfuncs are
  * renamed to unload the verb.
  *
- * Build error is triggered if old names are used. New binaries work with both
- * new and old names. The compat macros will be removed on v6.15 release.
- *
  * scx_bpf_dispatch_from_dsq() and friends were added during v6.12 by
  * 4c30f5ce4f7a ("sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq()").
- * Preserve __COMPAT macros until v6.15.
  */
-void scx_bpf_dispatch___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak;
-void scx_bpf_dispatch_vtime___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak;
-bool scx_bpf_consume___compat(u64 dsq_id) __ksym __weak;
-void scx_bpf_dispatch_from_dsq_set_slice___compat(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak;
-void scx_bpf_dispatch_from_dsq_set_vtime___compat(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak;
-bool scx_bpf_dispatch_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
-bool scx_bpf_dispatch_vtime_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
-int bpf_cpumask_populate(struct cpumask *dst, void *src, size_t src__sz) __ksym __weak;
-
-#define scx_bpf_dsq_insert(p, dsq_id, slice, enq_flags) \
-	(bpf_ksym_exists(scx_bpf_dsq_insert) ? \
-	 scx_bpf_dsq_insert((p), (dsq_id), (slice), (enq_flags)) : \
-	 scx_bpf_dispatch___compat((p), (dsq_id), (slice), (enq_flags)))
-
-#define scx_bpf_dsq_insert_vtime(p, dsq_id, slice, vtime, enq_flags) \
-	(bpf_ksym_exists(scx_bpf_dsq_insert_vtime) ? \
-	 scx_bpf_dsq_insert_vtime((p), (dsq_id), (slice), (vtime), (enq_flags)) : \
-	 scx_bpf_dispatch_vtime___compat((p), (dsq_id), (slice), (vtime), (enq_flags)))
+bool scx_bpf_dsq_move_to_local___new(u64 dsq_id) __ksym __weak;
+void scx_bpf_dsq_move_set_slice___new(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak;
+void scx_bpf_dsq_move_set_vtime___new(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak;
+bool scx_bpf_dsq_move___new(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
+bool scx_bpf_dsq_move_vtime___new(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
+
+bool scx_bpf_consume___old(u64 dsq_id) __ksym __weak;
+void scx_bpf_dispatch_from_dsq_set_slice___old(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak;
+void scx_bpf_dispatch_from_dsq_set_vtime___old(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak;
+bool scx_bpf_dispatch_from_dsq___old(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
+bool scx_bpf_dispatch_vtime_from_dsq___old(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
 
 #define scx_bpf_dsq_move_to_local(dsq_id) \
-	(bpf_ksym_exists(scx_bpf_dsq_move_to_local) ? \
-	 scx_bpf_dsq_move_to_local((dsq_id)) : \
-	 scx_bpf_consume___compat((dsq_id)))
-
-#define __COMPAT_scx_bpf_dsq_move_set_slice(it__iter, slice) \
-	(bpf_ksym_exists(scx_bpf_dsq_move_set_slice) ? \
-	 scx_bpf_dsq_move_set_slice((it__iter), (slice)) : \
-	 (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_slice___compat) ? \
-	  scx_bpf_dispatch_from_dsq_set_slice___compat((it__iter), (slice)) : \
+	(bpf_ksym_exists(scx_bpf_dsq_move_to_local___new) ? \
+	 scx_bpf_dsq_move_to_local___new((dsq_id)) : \
+	 scx_bpf_consume___old((dsq_id)))
+
+#define scx_bpf_dsq_move_set_slice(it__iter, slice) \
+	(bpf_ksym_exists(scx_bpf_dsq_move_set_slice___new) ? \
+	 scx_bpf_dsq_move_set_slice___new((it__iter), (slice)) : \
+	 (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_slice___old) ? \
+	  scx_bpf_dispatch_from_dsq_set_slice___old((it__iter), (slice)) : \
+	  (void)0))
+
+#define scx_bpf_dsq_move_set_vtime(it__iter, vtime) \
+	(bpf_ksym_exists(scx_bpf_dsq_move_set_vtime___new) ? \
+	 scx_bpf_dsq_move_set_vtime___new((it__iter), (vtime)) : \
+	 (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_vtime___old) ? \
+	  scx_bpf_dispatch_from_dsq_set_vtime___old((it__iter), (vtime)) : \
 	  (void)0))
 
-#define __COMPAT_scx_bpf_dsq_move_set_vtime(it__iter, vtime) \
-	(bpf_ksym_exists(scx_bpf_dsq_move_set_vtime) ? \
-	 scx_bpf_dsq_move_set_vtime((it__iter), (vtime)) : \
-	 (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_vtime___compat) ? \
-	  scx_bpf_dispatch_from_dsq_set_vtime___compat((it__iter), (vtime)) : \
-	  (void) 0))
-
-#define __COMPAT_scx_bpf_dsq_move(it__iter, p, dsq_id, enq_flags) \
-	(bpf_ksym_exists(scx_bpf_dsq_move) ? \
-	 scx_bpf_dsq_move((it__iter), (p), (dsq_id), (enq_flags)) : \
-	 (bpf_ksym_exists(scx_bpf_dispatch_from_dsq___compat) ? \
-	  scx_bpf_dispatch_from_dsq___compat((it__iter), (p), (dsq_id), (enq_flags)) : \
+#define scx_bpf_dsq_move(it__iter, p, dsq_id, enq_flags) \
+	(bpf_ksym_exists(scx_bpf_dsq_move___new) ? \
+	 scx_bpf_dsq_move___new((it__iter), (p), (dsq_id), (enq_flags)) : \
+	 (bpf_ksym_exists(scx_bpf_dispatch_from_dsq___old) ? \
+	  scx_bpf_dispatch_from_dsq___old((it__iter), (p), (dsq_id), (enq_flags)) : \
 	  false))
 
-#define __COMPAT_scx_bpf_dsq_move_vtime(it__iter, p, dsq_id, enq_flags) \
-	(bpf_ksym_exists(scx_bpf_dsq_move_vtime) ? \
-	 scx_bpf_dsq_move_vtime((it__iter), (p), (dsq_id), (enq_flags)) : \
-	 (bpf_ksym_exists(scx_bpf_dispatch_vtime_from_dsq___compat) ? \
-	  scx_bpf_dispatch_vtime_from_dsq___compat((it__iter), (p), (dsq_id), (enq_flags)) : \
+#define scx_bpf_dsq_move_vtime(it__iter, p, dsq_id, enq_flags) \
+	(bpf_ksym_exists(scx_bpf_dsq_move_vtime___new) ? \
+	 scx_bpf_dsq_move_vtime___new((it__iter), (p), (dsq_id), (enq_flags)) : \
+	 (bpf_ksym_exists(scx_bpf_dispatch_vtime_from_dsq___old) ? \
+	  scx_bpf_dispatch_vtime_from_dsq___old((it__iter), (p), (dsq_id), (enq_flags)) : \
 	  false))
 
+/*
+ * v6.15: 950ad93df2fc ("bpf: add kfunc for populating cpumask bits")
+ *
+ * Compat macro will be dropped on v6.19 release.
+ */
+int bpf_cpumask_populate(struct cpumask *dst, void *src, size_t src__sz) __ksym __weak;
+
 #define __COMPAT_bpf_cpumask_populate(cpumask, src, size__sz) \
 	(bpf_ksym_exists(bpf_cpumask_populate) ? \
 	 (bpf_cpumask_populate(cpumask, src, size__sz)) : -EOPNOTSUPP)
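Note: with these macros a scheduler can use the v6.13 names unconditionally on both
old and new kernels; the ___new/___old ksym checks are resolved at load time. A
minimal sketch of a dispatch-side helper built on the iterator-based move
(hypothetical function, not part of this patch):

	/* move the first movable task from @dsq_id to @cpu's local DSQ */
	static bool move_one(u64 dsq_id, s32 cpu)
	{
		struct task_struct *p;

		bpf_for_each(scx_dsq, p, dsq_id, 0) {
			scx_bpf_dsq_move_set_slice(BPF_FOR_EACH_ITER, SCX_SLICE_DFL);
			if (scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p,
					     SCX_DSQ_LOCAL_ON | cpu, 0))
				return true;
		}
		return false;
	}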
 
-#define scx_bpf_dispatch(p, dsq_id, slice, enq_flags) \
-	_Static_assert(false, "scx_bpf_dispatch() renamed to scx_bpf_dsq_insert()")
-
-#define scx_bpf_dispatch_vtime(p, dsq_id, slice, vtime, enq_flags) \
-	_Static_assert(false, "scx_bpf_dispatch_vtime() renamed to scx_bpf_dsq_insert_vtime()")
-
-#define scx_bpf_consume(dsq_id) ({ \
-	_Static_assert(false, "scx_bpf_consume() renamed to scx_bpf_dsq_move_to_local()"); \
-	false; \
-})
-
-#define scx_bpf_dispatch_from_dsq_set_slice(it__iter, slice) \
-	_Static_assert(false, "scx_bpf_dispatch_from_dsq_set_slice() renamed to scx_bpf_dsq_move_set_slice()")
-
-#define scx_bpf_dispatch_from_dsq_set_vtime(it__iter, vtime) \
-	_Static_assert(false, "scx_bpf_dispatch_from_dsq_set_vtime() renamed to scx_bpf_dsq_move_set_vtime()")
-
-#define scx_bpf_dispatch_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \
-	_Static_assert(false, "scx_bpf_dispatch_from_dsq() renamed to scx_bpf_dsq_move()"); \
-	false; \
-})
-
-#define scx_bpf_dispatch_vtime_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \
-	_Static_assert(false, "scx_bpf_dispatch_vtime_from_dsq() renamed to scx_bpf_dsq_move_vtime()"); \
-	false; \
-})
-
-#define __COMPAT_scx_bpf_dispatch_from_dsq_set_slice(it__iter, slice) \
-	_Static_assert(false, "__COMPAT_scx_bpf_dispatch_from_dsq_set_slice() renamed to __COMPAT_scx_bpf_dsq_move_set_slice()")
-
-#define __COMPAT_scx_bpf_dispatch_from_dsq_set_vtime(it__iter, vtime) \
-	_Static_assert(false, "__COMPAT_scx_bpf_dispatch_from_dsq_set_vtime() renamed to __COMPAT_scx_bpf_dsq_move_set_vtime()")
-
-#define __COMPAT_scx_bpf_dispatch_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \
-	_Static_assert(false, "__COMPAT_scx_bpf_dispatch_from_dsq() renamed to __COMPAT_scx_bpf_dsq_move()"); \
-	false; \
-})
-
-#define __COMPAT_scx_bpf_dispatch_vtime_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \
-	_Static_assert(false, "__COMPAT_scx_bpf_dispatch_vtime_from_dsq() renamed to __COMPAT_scx_bpf_dsq_move_vtime()"); \
-	false; \
-})
+/*
+ * v6.19: Introduce lockless peek API for user DSQs.
+ *
+ * Preserve the following macro until v6.21.
+ */
+static inline struct task_struct *__COMPAT_scx_bpf_dsq_peek(u64 dsq_id)
+{
+	struct task_struct *p = NULL;
+	struct bpf_iter_scx_dsq it;
+
+	if (bpf_ksym_exists(scx_bpf_dsq_peek))
+		return scx_bpf_dsq_peek(dsq_id);
+	if (!bpf_iter_scx_dsq_new(&it, dsq_id, 0))
+		p = bpf_iter_scx_dsq_next(&it);
+	bpf_iter_scx_dsq_destroy(&it);
+	return p;
+}
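Note: a sketch of how the helper might be used from a dispatch path. The peek is only
a hint — the head task can be claimed concurrently — so the subsequent move still has
to be checked (hypothetical function, not part of this patch):

	static bool consume_if_usable(u64 dsq_id, s32 cpu)
	{
		/* lockless on v6.19+; iterator probe on older kernels */
		struct task_struct *p = __COMPAT_scx_bpf_dsq_peek(dsq_id);

		if (!p || !bpf_cpumask_test_cpu(cpu, p->cpus_ptr))
			return false;
		return scx_bpf_dsq_move_to_local(dsq_id);
	}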
 
 /**
  * __COMPAT_is_enq_cpu_selected - Test if SCX_ENQ_CPU_SELECTED is on
@@ -248,6 +221,161 @@ static inline struct task_struct *__COMPAT_scx_bpf_cpu_curr(int cpu)
 }
 
 /*
+ * v6.19: To work around BPF maximum parameter limit, the following kfuncs are
+ * replaced with variants that pack scalar arguments in a struct. Wrappers are
+ * provided to maintain source compatibility.
+ *
+ * v6.13: scx_bpf_dsq_insert_vtime() renaming is also handled here. See the
+ * block on dispatch renaming above for more details.
+ *
+ * The kernel will carry the compat variants until v6.23 to maintain binary
+ * compatibility. After v6.23 release, remove the compat handling and move the
+ * wrappers to common.bpf.h.
+ */
+s32 scx_bpf_select_cpu_and___compat(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
+				    const struct cpumask *cpus_allowed, u64 flags) __ksym __weak;
+void scx_bpf_dispatch_vtime___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak;
+void scx_bpf_dsq_insert_vtime___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak;
+
+/**
+ * scx_bpf_select_cpu_and - Pick an idle CPU usable by task @p
+ * @p: task_struct to select a CPU for
+ * @prev_cpu: CPU @p was on previously
+ * @wake_flags: %SCX_WAKE_* flags
+ * @cpus_allowed: cpumask of allowed CPUs
+ * @flags: %SCX_PICK_IDLE* flags
+ *
+ * Inline wrapper that packs scalar arguments into a struct and calls
+ * __scx_bpf_select_cpu_and(). See __scx_bpf_select_cpu_and() for details.
+ */
+static inline s32
+scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
+		       const struct cpumask *cpus_allowed, u64 flags)
+{
+	if (bpf_core_type_exists(struct scx_bpf_select_cpu_and_args)) {
+		struct scx_bpf_select_cpu_and_args args = {
+			.prev_cpu = prev_cpu,
+			.wake_flags = wake_flags,
+			.flags = flags,
+		};
+
+		return __scx_bpf_select_cpu_and(p, cpus_allowed, &args);
+	} else {
+		return scx_bpf_select_cpu_and___compat(p, prev_cpu, wake_flags,
+						       cpus_allowed, flags);
+	}
+}
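Note: from the caller's side the struct packing is invisible. A sketch of the wrapper
in an ops.select_cpu implementation (hypothetical callback, not part of this patch):

	s32 BPF_STRUCT_OPS(example_select_cpu, struct task_struct *p,
			   s32 prev_cpu, u64 wake_flags)
	{
		s32 cpu = scx_bpf_select_cpu_and(p, prev_cpu, wake_flags,
						 p->cpus_ptr, 0);

		/* negative return means no idle CPU was found */
		return cpu >= 0 ? cpu : prev_cpu;
	}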
+
+/**
+ * scx_bpf_dsq_insert_vtime - Insert a task into the vtime priority queue of a DSQ
+ * @p: task_struct to insert
+ * @dsq_id: DSQ to insert into
+ * @slice: duration @p can run for in nsecs, 0 to keep the current value
+ * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ
+ * @enq_flags: SCX_ENQ_*
+ *
+ * Inline wrapper that packs scalar arguments into a struct and calls
+ * __scx_bpf_dsq_insert_vtime(). See __scx_bpf_dsq_insert_vtime() for details.
+ */
+static inline bool
+scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime,
+			 u64 enq_flags)
+{
+	if (bpf_core_type_exists(struct scx_bpf_dsq_insert_vtime_args)) {
+		struct scx_bpf_dsq_insert_vtime_args args = {
+			.dsq_id = dsq_id,
+			.slice = slice,
+			.vtime = vtime,
+			.enq_flags = enq_flags,
+		};
+
+		return __scx_bpf_dsq_insert_vtime(p, &args);
+	} else if (bpf_ksym_exists(scx_bpf_dsq_insert_vtime___compat)) {
+		scx_bpf_dsq_insert_vtime___compat(p, dsq_id, slice, vtime,
+						  enq_flags);
+		return true;
+	} else {
+		scx_bpf_dispatch_vtime___compat(p, dsq_id, slice, vtime,
+						enq_flags);
+		return true;
+	}
+}
+
+/*
+ * v6.19: scx_bpf_dsq_insert() now returns bool instead of void. Move
+ * scx_bpf_dsq_insert() decl to common.bpf.h and drop compat helper after v6.22.
+ * The extra ___compat suffix works around libbpf not ignoring ___SUFFIX on the
+ * kernel side. The entire suffix can be dropped later.
+ *
+ * v6.13: scx_bpf_dsq_insert() renaming is also handled here. See the block on
+ * dispatch renaming above for more details.
+ */
+bool scx_bpf_dsq_insert___v2___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak;
+void scx_bpf_dsq_insert___v1(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak;
+void scx_bpf_dispatch___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak;
+
+static inline bool
+scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags)
+{
+	if (bpf_ksym_exists(scx_bpf_dsq_insert___v2___compat)) {
+		return scx_bpf_dsq_insert___v2___compat(p, dsq_id, slice, enq_flags);
+	} else if (bpf_ksym_exists(scx_bpf_dsq_insert___v1)) {
+		scx_bpf_dsq_insert___v1(p, dsq_id, slice, enq_flags);
+		return true;
+	} else {
+		scx_bpf_dispatch___compat(p, dsq_id, slice, enq_flags);
+		return true;
+	}
+}
+
+/*
+ * v6.19: scx_bpf_task_set_slice() and scx_bpf_task_set_dsq_vtime() added for
+ * sub-sched authority checks. Drop the wrappers and move the decls to
+ * common.bpf.h after v6.22.
+ */
+bool scx_bpf_task_set_slice___new(struct task_struct *p, u64 slice) __ksym __weak;
+bool scx_bpf_task_set_dsq_vtime___new(struct task_struct *p, u64 vtime) __ksym __weak;
+
+static inline void scx_bpf_task_set_slice(struct task_struct *p, u64 slice)
+{
+	if (bpf_ksym_exists(scx_bpf_task_set_slice___new))
+		scx_bpf_task_set_slice___new(p, slice);
+	else
+		p->scx.slice = slice;
+}
+
+static inline void scx_bpf_task_set_dsq_vtime(struct task_struct *p, u64 vtime)
+{
+	if (bpf_ksym_exists(scx_bpf_task_set_dsq_vtime___new))
+		scx_bpf_task_set_dsq_vtime___new(p, vtime);
+	else
+		p->scx.dsq_vtime = vtime;
+}
+
+/*
+ * v6.19: The new void variant can be called from anywhere while the older v1
+ * variant can only be called from ops.cpu_release(). The doubled ___ suffixes
+ * on the v2 variant need to be removed once libbpf is updated to ignore the
+ * ___ suffix on the kernel side. Drop the wrapper and move the decl to
+ * common.bpf.h after v6.22.
+ */
+u32 scx_bpf_reenqueue_local___v1(void) __ksym __weak;
+void scx_bpf_reenqueue_local___v2___compat(void) __ksym __weak;
+
+static inline bool __COMPAT_scx_bpf_reenqueue_local_from_anywhere(void)
+{
+	return bpf_ksym_exists(scx_bpf_reenqueue_local___v2___compat);
+}
+
+static inline void scx_bpf_reenqueue_local(void)
+{
+	if (__COMPAT_scx_bpf_reenqueue_local_from_anywhere())
+		scx_bpf_reenqueue_local___v2___compat();
+	else
+		scx_bpf_reenqueue_local___v1();
+}
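Note: the setter wrappers slot into scx_simple-style vtime accounting; on kernels
without the new kfuncs they fall back to writing p->scx directly. A sketch, using a
hypothetical ops.stopping callback that is not part of this patch:

	void BPF_STRUCT_OPS(example_stopping, struct task_struct *p, bool runnable)
	{
		/* charge the consumed portion of the slice, weight-scaled */
		scx_bpf_task_set_dsq_vtime(p, p->scx.dsq_vtime +
					   (SCX_SLICE_DFL - p->scx.slice) *
					   100 / p->scx.weight);
	}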
+
+/*
  * Define sched_ext_ops. This may be expanded to define multiple variants for
  * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH().
  */
diff --git a/tools/sched_ext/include/scx/compat.h b/tools/sched_ext/include/scx/compat.h
index 35c67c5174ac..8b4897fc8b99 100644
--- a/tools/sched_ext/include/scx/compat.h
+++ b/tools/sched_ext/include/scx/compat.h
@@ -151,6 +151,10 @@ static inline long scx_hotplug_seq(void)
  *
  * ec7e3b0463e1 ("implement-ops") in https://github.com/sched-ext/sched_ext is
  * the current minimum required kernel version.
+ *
+ * COMPAT:
+ * - v6.17: ops.cgroup_set_bandwidth()
+ * - v6.19: ops.cgroup_set_idle()
  */
 #define SCX_OPS_OPEN(__ops_name, __scx_name) ({ \
 	struct __scx_name *__skel; \
@@ -162,6 +166,16 @@ static inline long scx_hotplug_seq(void)
 	SCX_BUG_ON(!__skel, "Could not open " #__scx_name); \
 	__skel->struct_ops.__ops_name->hotplug_seq = scx_hotplug_seq(); \
 	SCX_ENUM_INIT(__skel); \
+	if (__skel->struct_ops.__ops_name->cgroup_set_bandwidth && \
+	    !__COMPAT_struct_has_field("sched_ext_ops", "cgroup_set_bandwidth")) { \
+		fprintf(stderr, "WARNING: kernel doesn't support ops.cgroup_set_bandwidth()\n"); \
+		__skel->struct_ops.__ops_name->cgroup_set_bandwidth = NULL; \
+	} \
+	if (__skel->struct_ops.__ops_name->cgroup_set_idle && \
+	    !__COMPAT_struct_has_field("sched_ext_ops", "cgroup_set_idle")) { \
+		fprintf(stderr, "WARNING: kernel doesn't support ops.cgroup_set_idle()\n"); \
+		__skel->struct_ops.__ops_name->cgroup_set_idle = NULL; \
+	} \
 	__skel; \
 })
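Note: the guards rely on the pre-existing __COMPAT_struct_has_field() helper, which
probes the running kernel's BTF. Schedulers can use the same probe directly for their
own feature gates, e.g. (illustrative only):

	if (!__COMPAT_struct_has_field("sched_ext_ops", "cgroup_set_idle"))
		fprintf(stderr, "cgroup idle notifications unavailable, continuing without\n");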
diff --git a/tools/sched_ext/scx_cpu0.bpf.c b/tools/sched_ext/scx_cpu0.bpf.c
new file mode 100644
index 000000000000..6326ce598c8e
--- /dev/null
+++ b/tools/sched_ext/scx_cpu0.bpf.c
@@ -0,0 +1,88 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * A CPU0 scheduler.
+ *
+ * This scheduler queues all tasks to a shared DSQ and only dispatches them on
+ * CPU0 in FIFO order. This is useful for testing bypass behavior when many
+ * tasks are concentrated on a single CPU. If the load balancer doesn't work,
+ * bypass mode can trigger task hangs or RCU stalls, as the queue is long and
+ * there's only one CPU working on it.
+ *
+ * - Statistics tracking how many tasks are queued to local and CPU0 DSQs.
+ * - Termination notification for userspace.
+ *
+ * Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2025 Tejun Heo <tj@kernel.org>
+ */
+#include <scx/common.bpf.h>
+
+char _license[] SEC("license") = "GPL";
+
+const volatile u32 nr_cpus = 32;	/* !0 for veristat, set during init */
+
+UEI_DEFINE(uei);
+
+/*
+ * We create a custom DSQ with ID 0 that we dispatch to and consume from on
+ * CPU0.
+ */
+#define DSQ_CPU0 0
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__uint(key_size, sizeof(u32));
+	__uint(value_size, sizeof(u64));
+	__uint(max_entries, 2);		/* [local, cpu0] */
+} stats SEC(".maps");
+
+static void stat_inc(u32 idx)
+{
+	u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx);
+	if (cnt_p)
+		(*cnt_p)++;
+}
+
+s32 BPF_STRUCT_OPS(cpu0_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
+{
+	return 0;
+}
+
+void BPF_STRUCT_OPS(cpu0_enqueue, struct task_struct *p, u64 enq_flags)
+{
+	/*
+	 * select_cpu() always picks CPU0. If @p is still not on CPU0, it
+	 * can't run there. Queue it on whichever CPU it's currently on.
+	 */
+	if (scx_bpf_task_cpu(p) != 0) {
+		stat_inc(0);	/* count local queueing */
+		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
+		return;
+	}
+
+	stat_inc(1);	/* count cpu0 queueing */
+	scx_bpf_dsq_insert(p, DSQ_CPU0, SCX_SLICE_DFL, enq_flags);
+}
+
+void BPF_STRUCT_OPS(cpu0_dispatch, s32 cpu, struct task_struct *prev)
+{
+	if (cpu == 0)
+		scx_bpf_dsq_move_to_local(DSQ_CPU0);
+}
+
+s32 BPF_STRUCT_OPS_SLEEPABLE(cpu0_init)
+{
+	return scx_bpf_create_dsq(DSQ_CPU0, -1);
+}
+
+void BPF_STRUCT_OPS(cpu0_exit, struct scx_exit_info *ei)
+{
+	UEI_RECORD(uei, ei);
+}
+
+SCX_OPS_DEFINE(cpu0_ops,
+	       .select_cpu	= (void *)cpu0_select_cpu,
+	       .enqueue		= (void *)cpu0_enqueue,
+	       .dispatch	= (void *)cpu0_dispatch,
+	       .init		= (void *)cpu0_init,
+	       .exit		= (void *)cpu0_exit,
+	       .name		= "cpu0");
diff --git a/tools/sched_ext/scx_cpu0.c b/tools/sched_ext/scx_cpu0.c
new file mode 100644
index 000000000000..1e4fa4ab8da9
--- /dev/null
+++ b/tools/sched_ext/scx_cpu0.c
@@ -0,0 +1,106 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2025 Tejun Heo <tj@kernel.org>
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include <signal.h>
+#include <assert.h>
+#include <libgen.h>
+#include <bpf/bpf.h>
+#include <scx/common.h>
+#include "scx_cpu0.bpf.skel.h"
+
+const char help_fmt[] =
+"A cpu0 sched_ext scheduler.\n"
+"\n"
+"See the top-level comment in .bpf.c for more details.\n"
+"\n"
+"Usage: %s [-v]\n"
+"\n"
+"  -v            Print libbpf debug messages\n"
+"  -h            Display this help and exit\n";
+
+static bool verbose;
+static volatile int exit_req;
+
+static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
+{
+	if (level == LIBBPF_DEBUG && !verbose)
+		return 0;
+	return vfprintf(stderr, format, args);
+}
+
+static void sigint_handler(int sig)
+{
+	exit_req = 1;
+}
+
+static void read_stats(struct scx_cpu0 *skel, __u64 *stats)
+{
+	int nr_cpus = libbpf_num_possible_cpus();
+	assert(nr_cpus > 0);
+	__u64 cnts[2][nr_cpus];
+	__u32 idx;
+
+	memset(stats, 0, sizeof(stats[0]) * 2);
+
+	for (idx = 0; idx < 2; idx++) {
+		int ret, cpu;
+
+		ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats),
+					  &idx, cnts[idx]);
+		if (ret < 0)
+			continue;
+		for (cpu = 0; cpu < nr_cpus; cpu++)
+			stats[idx] += cnts[idx][cpu];
+	}
+}
+
+int main(int argc, char **argv)
+{
+	struct scx_cpu0 *skel;
+	struct bpf_link *link;
+	__u32 opt;
+	__u64 ecode;
+
+	libbpf_set_print(libbpf_print_fn);
+	signal(SIGINT, sigint_handler);
+	signal(SIGTERM, sigint_handler);
+restart:
+	skel = SCX_OPS_OPEN(cpu0_ops, scx_cpu0);
+
+	skel->rodata->nr_cpus = libbpf_num_possible_cpus();
+
+	while ((opt = getopt(argc, argv, "vh")) != -1) {
+		switch (opt) {
+		case 'v':
+			verbose = true;
+			break;
+		default:
+			fprintf(stderr, help_fmt, basename(argv[0]));
+			return opt != 'h';
+		}
+	}
+
+	SCX_OPS_LOAD(skel, cpu0_ops, scx_cpu0, uei);
+	link = SCX_OPS_ATTACH(skel, cpu0_ops, scx_cpu0);
+
+	while (!exit_req && !UEI_EXITED(skel, uei)) {
+		__u64 stats[2];
+
+		read_stats(skel, stats);
+		printf("local=%llu cpu0=%llu\n", stats[0], stats[1]);
+		fflush(stdout);
+		sleep(1);
+	}
+
+	bpf_link__destroy(link);
+	ecode = UEI_REPORT(skel, uei);
+	scx_cpu0__destroy(skel);
+
+	if (UEI_ECODE_RESTART(ecode))
+		goto restart;
+	return 0;
+}
diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c
index 2c720e3ecad5..43126858b8e4 100644
--- a/tools/sched_ext/scx_flatcg.bpf.c
+++ b/tools/sched_ext/scx_flatcg.bpf.c
@@ -382,7 +382,7 @@ void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags)
 		return;
 	}
 
-	cgrp = __COMPAT_scx_bpf_task_cgroup(p);
+	cgrp = scx_bpf_task_cgroup(p);
 	cgc = find_cgrp_ctx(cgrp);
 	if (!cgc)
 		goto out_release;
@@ -508,7 +508,7 @@ void BPF_STRUCT_OPS(fcg_runnable, struct task_struct *p, u64 enq_flags)
 {
 	struct cgroup *cgrp;
 
-	cgrp = __COMPAT_scx_bpf_task_cgroup(p);
+	cgrp = scx_bpf_task_cgroup(p);
 	update_active_weight_sums(cgrp, true);
 	bpf_cgroup_release(cgrp);
 }
@@ -521,7 +521,7 @@ void BPF_STRUCT_OPS(fcg_running, struct task_struct *p)
 	if (fifo_sched)
 		return;
 
-	cgrp = __COMPAT_scx_bpf_task_cgroup(p);
+	cgrp = scx_bpf_task_cgroup(p);
 	cgc = find_cgrp_ctx(cgrp);
 	if (cgc) {
 		/*
@@ -564,7 +564,7 @@ void BPF_STRUCT_OPS(fcg_stopping, struct task_struct *p, bool runnable)
 	if (!taskc->bypassed_at)
 		return;
 
-	cgrp = __COMPAT_scx_bpf_task_cgroup(p);
+	cgrp = scx_bpf_task_cgroup(p);
 	cgc = find_cgrp_ctx(cgrp);
 	if (cgc) {
 		__sync_fetch_and_add(&cgc->cvtime_delta,
@@ -578,7 +578,7 @@ void BPF_STRUCT_OPS(fcg_quiescent, struct task_struct *p, u64 deq_flags)
 {
 	struct cgroup *cgrp;
 
-	cgrp = __COMPAT_scx_bpf_task_cgroup(p);
+	cgrp = scx_bpf_task_cgroup(p);
 	update_active_weight_sums(cgrp, false);
 	bpf_cgroup_release(cgrp);
 }
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index 3072b593f898..df21fad0c438 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -202,6 +202,9 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
 	void *ring;
 	s32 cpu;
 
+	if (enq_flags & SCX_ENQ_REENQ)
+		__sync_fetch_and_add(&nr_reenqueued, 1);
+
 	if (p->flags & PF_KTHREAD) {
 		if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth))
 			return;
@@ -320,12 +323,9 @@ static bool dispatch_highpri(bool from_timer)
 
 		if (tctx->highpri) {
 			/* exercise the set_*() and vtime interface too */
-			__COMPAT_scx_bpf_dsq_move_set_slice(
-				BPF_FOR_EACH_ITER, slice_ns * 2);
-			__COMPAT_scx_bpf_dsq_move_set_vtime(
-				BPF_FOR_EACH_ITER, highpri_seq++);
-			__COMPAT_scx_bpf_dsq_move_vtime(
-				BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0);
+			scx_bpf_dsq_move_set_slice(BPF_FOR_EACH_ITER, slice_ns * 2);
+			scx_bpf_dsq_move_set_vtime(BPF_FOR_EACH_ITER, highpri_seq++);
+			scx_bpf_dsq_move_vtime(BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0);
 		}
 	}
 
@@ -342,9 +342,8 @@ static bool dispatch_highpri(bool from_timer)
 		else
 			cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);
 
-		if (__COMPAT_scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p,
-					      SCX_DSQ_LOCAL_ON | cpu,
-					      SCX_ENQ_PREEMPT)) {
+		if (scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, SCX_DSQ_LOCAL_ON | cpu,
+				     SCX_ENQ_PREEMPT)) {
 			if (cpu == this_cpu) {
 				dispatched = true;
 				__sync_fetch_and_add(&nr_expedited_local, 1);
@@ -533,20 +532,35 @@ bool BPF_STRUCT_OPS(qmap_core_sched_before,
 	return task_qdist(a) > task_qdist(b);
 }
 
-void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
+SEC("tp_btf/sched_switch")
+int BPF_PROG(qmap_sched_switch, bool preempt, struct task_struct *prev,
+	     struct task_struct *next, unsigned long prev_state)
 {
-	u32 cnt;
+	if (!__COMPAT_scx_bpf_reenqueue_local_from_anywhere())
+		return 0;
 
 	/*
-	 * Called when @cpu is taken by a higher priority scheduling class. This
-	 * makes @cpu no longer available for executing sched_ext tasks. As we
-	 * don't want the tasks in @cpu's local dsq to sit there until @cpu
-	 * becomes available again, re-enqueue them into the global dsq. See
-	 * %SCX_ENQ_REENQ handling in qmap_enqueue().
+	 * If @cpu is taken by a higher priority scheduling class, it is no
+	 * longer available for executing sched_ext tasks. As we don't want the
+	 * tasks in @cpu's local dsq to sit there until @cpu becomes available
+	 * again, re-enqueue them into the global dsq. See %SCX_ENQ_REENQ
+	 * handling in qmap_enqueue().
 	 */
-	cnt = scx_bpf_reenqueue_local();
-	if (cnt)
-		__sync_fetch_and_add(&nr_reenqueued, cnt);
+	switch (next->policy) {
+	case 1:	/* SCHED_FIFO */
+	case 2:	/* SCHED_RR */
+	case 6:	/* SCHED_DEADLINE */
+		scx_bpf_reenqueue_local();
+	}
+
+	return 0;
+}
+
+void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
+{
+	/* see qmap_sched_switch() to learn how to do this on newer kernels */
+	if (!__COMPAT_scx_bpf_reenqueue_local_from_anywhere())
+		scx_bpf_reenqueue_local();
 }
 
 s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile
index 9d9d6b4c38b0..5fe45f9c5f8f 100644
--- a/tools/testing/selftests/sched_ext/Makefile
+++ b/tools/testing/selftests/sched_ext/Makefile
@@ -174,6 +174,7 @@ auto-test-targets := \
 	minimal \
 	numa \
 	allowed_cpus \
+	peek_dsq \
 	prog_run \
 	reload_loop \
 	select_cpu_dfl \
diff --git a/tools/testing/selftests/sched_ext/peek_dsq.bpf.c b/tools/testing/selftests/sched_ext/peek_dsq.bpf.c
new file mode 100644
index 000000000000..a3faf5bb49d6
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/peek_dsq.bpf.c
@@ -0,0 +1,251 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A BPF program for testing DSQ operations and peek in particular.
+ *
+ * Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2025 Ryan Newton <ryan.newton@alum.mit.edu>
+ */
+
+#include <scx/common.bpf.h>
+#include <scx/compat.bpf.h>
+
+char _license[] SEC("license") = "GPL";
+
+UEI_DEFINE(uei);	/* Error handling */
+
+#define MAX_SAMPLES 100
+#define MAX_CPUS 512
+#define DSQ_POOL_SIZE 8
+int max_samples = MAX_SAMPLES;
+int max_cpus = MAX_CPUS;
+int dsq_pool_size = DSQ_POOL_SIZE;
+
+/* Global variables to store test results */
+int dsq_peek_result1 = -1;
+long dsq_inserted_pid = -1;
+int insert_test_cpu = -1;	/* Set to the cpu that performs the test */
+long dsq_peek_result2 = -1;
+long dsq_peek_result2_pid = -1;
+long dsq_peek_result2_expected = -1;
+int test_dsq_id = 1234;		/* Use a simple ID like create_dsq example */
+int real_dsq_id = 1235;		/* DSQ for normal operation */
+int enqueue_count = -1;
+int dispatch_count = -1;
+bool debug_ksym_exists;
+
+/* DSQ pool for stress testing */
+int dsq_pool_base_id = 2000;
+int phase1_complete = -1;
+long total_peek_attempts = -1;
+long successful_peeks = -1;
+
+/* BPF map for sharing peek results with userspace */
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__uint(max_entries, MAX_SAMPLES);
+	__type(key, u32);
+	__type(value, long);
+} peek_results SEC(".maps");
+
+static int get_random_dsq_id(void)
+{
+	u64 time = bpf_ktime_get_ns();
+
+	return dsq_pool_base_id + (time % DSQ_POOL_SIZE);
+}
+
+static void record_peek_result(long pid)
+{
+	u32 slot_key;
+	long *slot_pid_ptr;
+	int ix;
+
+	if (pid <= 0)
+		return;
+
+	/* Find an empty slot or one with the same PID */
+	bpf_for(ix, 0, 10) {
+		slot_key = (pid + ix) % MAX_SAMPLES;
+		slot_pid_ptr = bpf_map_lookup_elem(&peek_results, &slot_key);
+		if (!slot_pid_ptr)
+			continue;
+
+		if (*slot_pid_ptr == -1 || *slot_pid_ptr == pid) {
+			*slot_pid_ptr = pid;
+			break;
+		}
+	}
+}
+
+/* Scan all DSQs in the pool and try to move a task to local */
+static int scan_dsq_pool(void)
+{
+	struct task_struct *task;
+	int moved = 0;
+	int i;
+
+	bpf_for(i, 0, DSQ_POOL_SIZE) {
+		int dsq_id = dsq_pool_base_id + i;
+
+		total_peek_attempts++;
+
+		task = __COMPAT_scx_bpf_dsq_peek(dsq_id);
+		if (task) {
+			successful_peeks++;
+			record_peek_result(task->pid);
+
+			/* Try to move this task to local */
+			if (!moved && scx_bpf_dsq_move_to_local(dsq_id)) {
+				moved = 1;
+				break;
+			}
+		}
+	}
+	return moved;
+}
+
+/* Struct_ops scheduler for testing DSQ peek operations */
+void BPF_STRUCT_OPS(peek_dsq_enqueue, struct task_struct *p, u64 enq_flags)
+{
+	struct task_struct *peek_result;
+	int last_insert_test_cpu, cpu;
+
+	enqueue_count++;
+	cpu = bpf_get_smp_processor_id();
+	last_insert_test_cpu = __sync_val_compare_and_swap(&insert_test_cpu, -1, cpu);
+
+	/* Phase 1: Simple insert-then-peek test (only on first task) */
+	if (last_insert_test_cpu == -1) {
+		bpf_printk("peek_dsq_enqueue beginning phase 1 peek test on cpu %d", cpu);
+
+		/* Test 1: Peek empty DSQ - should return NULL */
+		peek_result = __COMPAT_scx_bpf_dsq_peek(test_dsq_id);
+		dsq_peek_result1 = (long)peek_result;	/* Should be 0 (NULL) */
+
+		/* Test 2: Insert task into test DSQ for testing in dispatch callback */
+		dsq_inserted_pid = p->pid;
+		scx_bpf_dsq_insert(p, test_dsq_id, 0, enq_flags);
+		dsq_peek_result2_expected = (long)p;	/* Expected the task we just inserted */
+	} else if (!phase1_complete) {
+		/* Still in phase 1, use real DSQ */
+		scx_bpf_dsq_insert(p, real_dsq_id, 0, enq_flags);
+	} else {
+		/* Phase 2: Random DSQ insertion for stress testing */
+		int random_dsq_id = get_random_dsq_id();
+
+		scx_bpf_dsq_insert(p, random_dsq_id, 0, enq_flags);
+	}
+}
+
+void BPF_STRUCT_OPS(peek_dsq_dispatch, s32 cpu, struct task_struct *prev)
+{
+	dispatch_count++;
+
+	/* Phase 1: Complete the simple peek test if we inserted a task but
+	 * haven't tested peek yet
+	 */
+	if (insert_test_cpu == cpu && dsq_peek_result2 == -1) {
+		struct task_struct *peek_result;
+
+		bpf_printk("peek_dsq_dispatch completing phase 1 peek test on cpu %d", cpu);
+
+		/* Test 3: Peek DSQ after insert - should return the task we inserted */
+		peek_result = __COMPAT_scx_bpf_dsq_peek(test_dsq_id);
+		/* Store the PID of the peeked task for comparison */
+		dsq_peek_result2 = (long)peek_result;
+		dsq_peek_result2_pid = peek_result ? peek_result->pid : -1;
+
+		/* Now consume the task since we've peeked at it */
+		scx_bpf_dsq_move_to_local(test_dsq_id);
+
+		/* Mark phase 1 as complete */
+		phase1_complete = 1;
+		bpf_printk("Phase 1 complete, starting phase 2 stress testing");
+	} else if (!phase1_complete) {
+		/* Still in phase 1, use real DSQ */
+		scx_bpf_dsq_move_to_local(real_dsq_id);
+	} else {
+		/* Phase 2: Scan all DSQs in the pool and try to move a task */
+		if (!scan_dsq_pool()) {
+			/* No tasks found in DSQ pool, fall back to real DSQ */
+			scx_bpf_dsq_move_to_local(real_dsq_id);
+		}
+	}
+}
+
+s32 BPF_STRUCT_OPS_SLEEPABLE(peek_dsq_init)
+{
+	s32 err;
+	int i;
+
+	/* Always set debug values so we can see which version we're using */
+	debug_ksym_exists = bpf_ksym_exists(scx_bpf_dsq_peek) ? 1 : 0;
+
+	/* Initialize state first */
+	insert_test_cpu = -1;
+	enqueue_count = 0;
+	dispatch_count = 0;
+	phase1_complete = 0;
+	total_peek_attempts = 0;
+	successful_peeks = 0;
+
+	/* Create the test and real DSQs */
+	err = scx_bpf_create_dsq(test_dsq_id, -1);
+	if (err) {
+		scx_bpf_error("Failed to create DSQ %d: %d", test_dsq_id, err);
+		return err;
+	}
+	err = scx_bpf_create_dsq(real_dsq_id, -1);
+	if (err) {
+		scx_bpf_error("Failed to create DSQ %d: %d", real_dsq_id, err);
+		return err;
+	}
+
+	/* Create the DSQ pool for stress testing */
+	bpf_for(i, 0, DSQ_POOL_SIZE) {
+		int dsq_id = dsq_pool_base_id + i;
+
+		err = scx_bpf_create_dsq(dsq_id, -1);
+		if (err) {
+			scx_bpf_error("Failed to create DSQ pool entry %d: %d", dsq_id, err);
+			return err;
+		}
+	}
+
+	/* Initialize the peek results map */
+	bpf_for(i, 0, MAX_SAMPLES) {
+		u32 key = i;
+		long pid = -1;
+
+		bpf_map_update_elem(&peek_results, &key, &pid, BPF_ANY);
+	}
+
+	return 0;
+}
+
+void BPF_STRUCT_OPS(peek_dsq_exit, struct scx_exit_info *ei)
+{
+	int i;
+
+	/* Destroy the primary DSQs */
+	scx_bpf_destroy_dsq(test_dsq_id);
+	scx_bpf_destroy_dsq(real_dsq_id);
+
+	/* Destroy the DSQ pool */
+	bpf_for(i, 0, DSQ_POOL_SIZE) {
+		int dsq_id = dsq_pool_base_id + i;
+
+		scx_bpf_destroy_dsq(dsq_id);
+	}
+
+	UEI_RECORD(uei, ei);
+}
+
+SEC(".struct_ops.link")
+struct sched_ext_ops peek_dsq_ops = {
+	.enqueue	= (void *)peek_dsq_enqueue,
+	.dispatch	= (void *)peek_dsq_dispatch,
+	.init		= (void *)peek_dsq_init,
+	.exit		= (void *)peek_dsq_exit,
+	.name		= "peek_dsq",
+};
diff --git a/tools/testing/selftests/sched_ext/peek_dsq.c b/tools/testing/selftests/sched_ext/peek_dsq.c
new file mode 100644
index 000000000000..a717384a3224
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/peek_dsq.c
@@ -0,0 +1,224 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test for DSQ operations including create, destroy, and peek operations.
+ *
+ * Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2025 Ryan Newton <ryan.newton@alum.mit.edu>
+ */
+#include <bpf/bpf.h>
+#include <scx/common.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <string.h>
+#include <sched.h>
+#include "peek_dsq.bpf.skel.h"
+#include "scx_test.h"
+
+#define NUM_WORKERS 4
+
+static bool workload_running = true;
+static pthread_t workload_threads[NUM_WORKERS];
+
+/**
+ * Background workload thread that sleeps and wakes rapidly to exercise the
+ * scheduler's enqueue operations and ensure DSQ operations get tested.
+ */
+static void *workload_thread_fn(void *arg)
+{
+	while (workload_running) {
+		/* Sleep for a very short time to trigger scheduler activity */
+		usleep(1000);	/* 1ms sleep */
+		/* Yield to ensure we go through the scheduler */
+		sched_yield();
+	}
+	return NULL;
+}
+
+static enum scx_test_status setup(void **ctx)
+{
+	struct peek_dsq *skel;
+
+	skel = peek_dsq__open();
+	SCX_FAIL_IF(!skel, "Failed to open");
+	SCX_ENUM_INIT(skel);
+	SCX_FAIL_IF(peek_dsq__load(skel), "Failed to load skel");
+
+	*ctx = skel;
+
+	return SCX_TEST_PASS;
+}
+
+static int print_observed_pids(struct bpf_map *map, int max_samples, const char *dsq_name)
+{
+	long count = 0;
+
+	printf("Observed %s DSQ peek pids:\n", dsq_name);
+	for (int i = 0; i < max_samples; i++) {
+		long pid;
+		int err;
+
+		err = bpf_map_lookup_elem(bpf_map__fd(map), &i, &pid);
+		if (err == 0) {
+			if (pid == 0) {
+				printf("  Sample %d: NULL peek\n", i);
+			} else if (pid > 0) {
+				printf("  Sample %d: pid %ld\n", i, pid);
+				count++;
+			}
+		} else {
+			printf("  Sample %d: error reading pid (err=%d)\n", i, err);
+		}
+	}
+	printf("Observed ~%ld pids in the %s DSQ(s)\n", count, dsq_name);
+	return count;
+}
+
+static enum scx_test_status run(void *ctx)
+{
+	struct peek_dsq *skel = ctx;
+	bool failed = false;
+	int seconds = 3;
+	int err;
+
+	/* Enable the scheduler to test DSQ operations */
+	printf("Enabling scheduler to test DSQ insert operations...\n");
+
+	struct bpf_link *link =
+		bpf_map__attach_struct_ops(skel->maps.peek_dsq_ops);
+
+	if (!link) {
+		SCX_ERR("Failed to attach struct_ops");
+		return SCX_TEST_FAIL;
+	}
+
+	printf("Starting %d background workload threads...\n", NUM_WORKERS);
+	workload_running = true;
+	for (int i = 0; i < NUM_WORKERS; i++) {
+		err = pthread_create(&workload_threads[i], NULL, workload_thread_fn, NULL);
+		if (err) {
+			SCX_ERR("Failed to create workload thread %d: %s", i, strerror(err));
+			/* Stop already created threads */
+			workload_running = false;
+			for (int j = 0; j < i; j++)
+				pthread_join(workload_threads[j], NULL);
+			bpf_link__destroy(link);
+			return SCX_TEST_FAIL;
+		}
+	}
+
+	printf("Waiting for enqueue events.\n");
+	sleep(seconds);
+	while (skel->data->enqueue_count <= 0) {
+		printf(".");
+		fflush(stdout);
+		sleep(1);
+		seconds++;
+		if (seconds >= 30) {
+			printf("\n✗ Timeout waiting for enqueue events\n");
+			/* Stop workload threads and cleanup */
+			workload_running = false;
+			for (int i = 0; i < NUM_WORKERS; i++)
+				pthread_join(workload_threads[i], NULL);
+			bpf_link__destroy(link);
+			return SCX_TEST_FAIL;
+		}
+	}
+
+	workload_running = false;
+	for (int i = 0; i < NUM_WORKERS; i++) {
+		err = pthread_join(workload_threads[i], NULL);
+		if (err) {
+			SCX_ERR("Failed to join workload thread %d: %s", i, strerror(err));
+			bpf_link__destroy(link);
+			return SCX_TEST_FAIL;
+		}
+	}
+	printf("Background workload threads stopped.\n");
+
+	SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_NONE));
+
+	/* Detach the scheduler */
+	bpf_link__destroy(link);
+
+	printf("Enqueue/dispatch count over %d seconds: %d / %d\n", seconds,
+	       skel->data->enqueue_count, skel->data->dispatch_count);
+	printf("Debug: ksym_exists=%d\n",
+	       skel->bss->debug_ksym_exists);
+
+	/* Check DSQ insert result */
+	printf("DSQ insert test done on cpu: %d\n", skel->data->insert_test_cpu);
+	if (skel->data->insert_test_cpu != -1)
+		printf("✓ DSQ insert succeeded!\n");
+	else {
+		printf("✗ DSQ insert failed or not attempted\n");
+		failed = true;
+	}
+
+	/* Check DSQ peek results */
+	printf(" DSQ peek result 1 (before insert): %d\n",
+	       skel->data->dsq_peek_result1);
+	if (skel->data->dsq_peek_result1 == 0)
+		printf("✓ DSQ peek verification success: peek returned NULL!\n");
+	else {
+		printf("✗ DSQ peek verification failed\n");
+		failed = true;
+	}
+
+	printf(" DSQ peek result 2 (after insert): %ld\n",
+	       skel->data->dsq_peek_result2);
+	printf(" DSQ peek result 2, expected: %ld\n",
+	       skel->data->dsq_peek_result2_expected);
+	if (skel->data->dsq_peek_result2 ==
+	    skel->data->dsq_peek_result2_expected)
+		printf("✓ DSQ peek verification success: peek returned the inserted task!\n");
+	else {
+		printf("✗ DSQ peek verification failed\n");
+		failed = true;
+	}
+
+	printf(" Inserted test task -> pid: %ld\n", skel->data->dsq_inserted_pid);
+	printf(" DSQ peek result 2 -> pid: %ld\n", skel->data->dsq_peek_result2_pid);
+
+	int pid_count;
+
+	pid_count = print_observed_pids(skel->maps.peek_results,
+					skel->data->max_samples, "DSQ pool");
+	printf("Total non-null peek observations: %ld out of %ld\n",
+	       skel->data->successful_peeks, skel->data->total_peek_attempts);
+
+	if (skel->bss->debug_ksym_exists && pid_count == 0) {
+		printf("✗ DSQ pool test failed: no successful peeks in native mode\n");
+		failed = true;
+	}
+	if (skel->bss->debug_ksym_exists && pid_count > 0)
+		printf("✓ DSQ pool test success: observed successful peeks in native mode\n");
+
+	if (failed)
+		return SCX_TEST_FAIL;
+	else
+		return SCX_TEST_PASS;
+}
+
+static void cleanup(void *ctx)
+{
+	struct peek_dsq *skel = ctx;
+
+	if (workload_running) {
+		workload_running = false;
+		for (int i = 0; i < NUM_WORKERS; i++)
+			pthread_join(workload_threads[i], NULL);
+	}
+
+	peek_dsq__destroy(skel);
+}
+
+struct scx_test peek_dsq = {
+	.name = "peek_dsq",
+	.description = "Test DSQ create/destroy operations and future peek functionality",
+	.setup = setup,
+	.run = run,
+	.cleanup = cleanup,
+};
+REGISTER_SCX_TEST(&peek_dsq)