summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2025-09-29 11:34:40 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2025-09-29 11:34:40 -0700
commit263e777ee3e00d628ac2660f68c82aeab14707b3 (patch)
tree9860bb4d064d31532c20799958475465f884ce02 /include
parent18b19abc3709b109676ffd1f48dcd332c2e477d4 (diff)
parent9426414f0d42f824892ecd4dccfebf8987084a41 (diff)
Merge tag 'vfs-6.18-rc1.writeback' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull vfs writeback updates from Christian Brauner: "This contains work adressing lockups reported by users when a systemd unit reading lots of files from a filesystem mounted with the lazytime mount option exits. With the lazytime mount option enabled we can be switching many dirty inodes on cgroup exit to the parent cgroup. The numbers observed in practice when systemd slice of a large cron job exits can easily reach hundreds of thousands or millions. The logic in inode_do_switch_wbs() which sorts the inode into appropriate place in b_dirty list of the target wb however has linear complexity in the number of dirty inodes thus overall time complexity of switching all the inodes is quadratic leading to workers being pegged for hours consuming 100% of the CPU and switching inodes to the parent wb. Simple reproducer of the issue: FILES=10000 # Filesystem mounted with lazytime mount option MNT=/mnt/ echo "Creating files and switching timestamps" for (( j = 0; j < 50; j ++ )); do mkdir $MNT/dir$j for (( i = 0; i < $FILES; i++ )); do echo "foo" >$MNT/dir$j/file$i done touch -a -t 202501010000 $MNT/dir$j/file* done wait echo "Syncing and flushing" sync echo 3 >/proc/sys/vm/drop_caches echo "Reading all files from a cgroup" mkdir /sys/fs/cgroup/unified/mycg1 || exit echo $$ >/sys/fs/cgroup/unified/mycg1/cgroup.procs || exit for (( j = 0; j < 50; j ++ )); do cat /mnt/dir$j/file* >/dev/null & done wait echo "Switching wbs" # Now rmdir the cgroup after the script exits This can be solved by: - Avoiding contention on the wb->list_lock when switching inodes by running a single work item per wb and managing a queue of items switching to the wb - Allowing rescheduling when switching inodes over to a different cgroup to avoid softlockups - Maintaining the b_dirty list ordering instead of sorting it" * tag 'vfs-6.18-rc1.writeback' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: writeback: Add tracepoint to track pending inode switches writeback: Avoid excessively long inode switching times writeback: Avoid softlockup when switching many inodes writeback: Avoid contention on wb->list_lock when switching inodes
Diffstat (limited to 'include')
-rw-r--r--include/linux/backing-dev-defs.h4
-rw-r--r--include/linux/writeback.h2
-rw-r--r--include/trace/events/writeback.h29
3 files changed, 35 insertions, 0 deletions
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index 2ad261082bba..c5c9d89c73ed 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -152,6 +152,10 @@ struct bdi_writeback {
struct list_head blkcg_node; /* anchored at blkcg->cgwb_list */
struct list_head b_attached; /* attached inodes, protected by list_lock */
struct list_head offline_node; /* anchored at offline_cgwbs */
+ struct work_struct switch_work; /* work used to perform inode switching
+ * to this wb */
+ struct llist_head switch_wbs_ctxs; /* queued contexts for
+ * writeback switching */
union {
struct work_struct release_work;
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index a2848d731a46..15a4bc4ab819 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -265,6 +265,8 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
bio_associate_blkg_from_css(bio, wbc->wb->blkcg_css);
}
+void inode_switch_wbs_work_fn(struct work_struct *work);
+
#else /* CONFIG_CGROUP_WRITEBACK */
static inline void inode_attach_wb(struct inode *inode, struct folio *folio)
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 1e23919c0da9..c08aff044e80 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -213,6 +213,35 @@ TRACE_EVENT(inode_foreign_history,
)
);
+TRACE_EVENT(inode_switch_wbs_queue,
+
+ TP_PROTO(struct bdi_writeback *old_wb, struct bdi_writeback *new_wb,
+ unsigned int count),
+
+ TP_ARGS(old_wb, new_wb, count),
+
+ TP_STRUCT__entry(
+ __array(char, name, 32)
+ __field(ino_t, old_cgroup_ino)
+ __field(ino_t, new_cgroup_ino)
+ __field(unsigned int, count)
+ ),
+
+ TP_fast_assign(
+ strscpy_pad(__entry->name, bdi_dev_name(old_wb->bdi), 32);
+ __entry->old_cgroup_ino = __trace_wb_assign_cgroup(old_wb);
+ __entry->new_cgroup_ino = __trace_wb_assign_cgroup(new_wb);
+ __entry->count = count;
+ ),
+
+ TP_printk("bdi %s: old_cgroup_ino=%lu new_cgroup_ino=%lu count=%u",
+ __entry->name,
+ (unsigned long)__entry->old_cgroup_ino,
+ (unsigned long)__entry->new_cgroup_ino,
+ __entry->count
+ )
+);
+
TRACE_EVENT(inode_switch_wbs,
TP_PROTO(struct inode *inode, struct bdi_writeback *old_wb,