summaryrefslogtreecommitdiff
path: root/fs/xfs/xfs_zone_alloc.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2025-12-01 09:20:51 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2025-12-01 09:20:51 -0800
commitebaeabfa5ab711a9b69b686d58329e258fdae75f (patch)
treec4b0915701c65abed0bc3fe9f351f42daaaa9089 /fs/xfs/xfs_zone_alloc.c
parent9368f0f9419cde028a6e58331065900ff089bc36 (diff)
parent4952f35f0545f3b53dab8d5fd727c4827c2a2778 (diff)
Merge tag 'vfs-6.19-rc1.writeback' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull writeback updates from Christian Brauner: "Features: - Allow file systems to increase the minimum writeback chunk size. The relatively low minimal writeback size of 4MiB means that written back inodes on rotational media are switched a lot. Besides introducing additional seeks, this also can lead to extreme file fragmentation on zoned devices when a lot of files are cached relative to the available writeback bandwidth. This adds a superblock field that allows the file system to override the default size, and sets it to the zone size for zoned XFS. - Add logging for slow writeback when it exceeds sysctl_hung_task_timeout_secs. This helps identify tasks waiting for a long time and pinpoint potential issues. Recording the starting jiffies is also useful when debugging a crashed vmcore. - Wake up waiting tasks when finishing the writeback of a chunk Cleanups: - filemap_* writeback interface cleanups. Adding filemap_fdatawrite_wbc ended up being a mistake, as all but the original btrfs caller should be using better high level interfaces instead. This series removes all these low-level interfaces, switches btrfs to a more specific interface, and cleans up other too low-level interfaces. With this the writeback_control that is passed to the writeback code is only initialized in three places. - Remove __filemap_fdatawrite, __filemap_fdatawrite_range, and filemap_fdatawrite_wbc - Add filemap_flush_nr helper for btrfs - Push struct writeback_control into start_delalloc_inodes in btrfs - Rename filemap_fdatawrite_range_kick to filemap_flush_range - Stop opencoding filemap_fdatawrite_range in 9p, ocfs2, and mm - Make wbc_to_tag() inline and use it in fs" * tag 'vfs-6.19-rc1.writeback' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: fs: Make wbc_to_tag() inline and use it in fs. xfs: set s_min_writeback_pages for zoned file systems writeback: allow the file system to override MIN_WRITEBACK_PAGES writeback: cleanup writeback_chunk_size mm: rename filemap_fdatawrite_range_kick to filemap_flush_range mm: remove __filemap_fdatawrite_range mm: remove filemap_fdatawrite_wbc mm: remove __filemap_fdatawrite mm,btrfs: add a filemap_flush_nr helper btrfs: push struct writeback_control into start_delalloc_inodes btrfs: use the local tmp_inode variable in start_delalloc_inodes ocfs2: don't opencode filemap_fdatawrite_range in ocfs2_journal_submit_inode_data_buffers 9p: don't opencode filemap_fdatawrite_range in v9fs_mmap_vm_close mm: don't opencode filemap_fdatawrite_range in filemap_invalidate_inode writeback: Add logging for slow writeback (exceeds sysctl_hung_task_timeout_secs) writeback: Wake up waiting tasks when finishing the writeback of a chunk.
Diffstat (limited to 'fs/xfs/xfs_zone_alloc.c')
-rw-r--r--fs/xfs/xfs_zone_alloc.c28
1 files changed, 26 insertions, 2 deletions
diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c
index ef7a931ebde5..8dde444596f1 100644
--- a/fs/xfs/xfs_zone_alloc.c
+++ b/fs/xfs/xfs_zone_alloc.c
@@ -1204,6 +1204,7 @@ xfs_mount_zones(
.mp = mp,
};
struct xfs_buftarg *bt = mp->m_rtdev_targp;
+ xfs_extlen_t zone_blocks = mp->m_groups[XG_TYPE_RTG].blocks;
int error;
if (!bt) {
@@ -1234,10 +1235,33 @@ xfs_mount_zones(
return -ENOMEM;
xfs_info(mp, "%u zones of %u blocks (%u max open zones)",
- mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks,
- mp->m_max_open_zones);
+ mp->m_sb.sb_rgcount, zone_blocks, mp->m_max_open_zones);
trace_xfs_zones_mount(mp);
+ /*
+ * The writeback code switches between inodes regularly to provide
+ * fairness. The default lower bound is 4MiB, but for zoned file
+ * systems we want to increase that both to reduce seeks, but also more
+ * importantly so that workloads that writes files in a multiple of the
+ * zone size do not get fragmented and require garbage collection when
+ * they shouldn't. Increase is to the zone size capped by the max
+ * extent len.
+ *
+ * Note that because s_min_writeback_pages is a superblock field, this
+ * value also get applied to non-zoned files on the data device if
+ * there are any. On typical zoned setup all data is on the RT device
+ * because using the more efficient sequential write required zones
+ * is the reason for using the zone allocator, and either the RT device
+ * and the (meta)data device are on the same block device, or the
+ * (meta)data device is on a fast SSD while the data on the RT device
+ * is on a SMR HDD. In any combination of the above cases enforcing
+ * the higher min_writeback_pages for non-RT inodes is either a noop
+ * or beneficial.
+ */
+ mp->m_super->s_min_writeback_pages =
+ XFS_FSB_TO_B(mp, min(zone_blocks, XFS_MAX_BMBT_EXTLEN)) >>
+ PAGE_SHIFT;
+
if (bdev_is_zoned(bt->bt_bdev)) {
error = blkdev_report_zones(bt->bt_bdev,
XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart),