diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2025-12-01 09:20:51 -0800 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2025-12-01 09:20:51 -0800 |
| commit | ebaeabfa5ab711a9b69b686d58329e258fdae75f (patch) | |
| tree | c4b0915701c65abed0bc3fe9f351f42daaaa9089 /fs | |
| parent | 9368f0f9419cde028a6e58331065900ff089bc36 (diff) | |
| parent | 4952f35f0545f3b53dab8d5fd727c4827c2a2778 (diff) | |
Merge tag 'vfs-6.19-rc1.writeback' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull writeback updates from Christian Brauner:
"Features:
- Allow file systems to increase the minimum writeback chunk size.
The relatively low minimal writeback size of 4MiB means that
written back inodes on rotational media are switched a lot. Besides
introducing additional seeks, this also can lead to extreme file
fragmentation on zoned devices when a lot of files are cached
relative to the available writeback bandwidth.
This adds a superblock field that allows the file system to
override the default size, and sets it to the zone size for zoned
XFS.
- Add logging for slow writeback when it exceeds
sysctl_hung_task_timeout_secs. This helps identify tasks waiting
for a long time and pinpoint potential issues. Recording the
starting jiffies is also useful when debugging a crashed vmcore.
- Wake up waiting tasks when finishing the writeback of a chunk
Cleanups:
- filemap_* writeback interface cleanups.
Adding filemap_fdatawrite_wbc ended up being a mistake, as all but
the original btrfs caller should be using better high level
interfaces instead.
This series removes all these low-level interfaces, switches btrfs
to a more specific interface, and cleans up other too low-level
interfaces. With this the writeback_control that is passed to the
writeback code is only initialized in three places.
- Remove __filemap_fdatawrite, __filemap_fdatawrite_range, and
filemap_fdatawrite_wbc
- Add filemap_flush_nr helper for btrfs
- Push struct writeback_control into start_delalloc_inodes in btrfs
- Rename filemap_fdatawrite_range_kick to filemap_flush_range
- Stop opencoding filemap_fdatawrite_range in 9p, ocfs2, and mm
- Make wbc_to_tag() inline and use it in fs"
* tag 'vfs-6.19-rc1.writeback' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
fs: Make wbc_to_tag() inline and use it in fs.
xfs: set s_min_writeback_pages for zoned file systems
writeback: allow the file system to override MIN_WRITEBACK_PAGES
writeback: cleanup writeback_chunk_size
mm: rename filemap_fdatawrite_range_kick to filemap_flush_range
mm: remove __filemap_fdatawrite_range
mm: remove filemap_fdatawrite_wbc
mm: remove __filemap_fdatawrite
mm,btrfs: add a filemap_flush_nr helper
btrfs: push struct writeback_control into start_delalloc_inodes
btrfs: use the local tmp_inode variable in start_delalloc_inodes
ocfs2: don't opencode filemap_fdatawrite_range in ocfs2_journal_submit_inode_data_buffers
9p: don't opencode filemap_fdatawrite_range in v9fs_mmap_vm_close
mm: don't opencode filemap_fdatawrite_range in filemap_invalidate_inode
writeback: Add logging for slow writeback (exceeds sysctl_hung_task_timeout_secs)
writeback: Wake up waiting tasks when finishing the writeback of a chunk.
Diffstat (limited to 'fs')
| -rw-r--r-- | fs/9p/vfs_file.c | 17 | ||||
| -rw-r--r-- | fs/btrfs/extent_io.c | 5 | ||||
| -rw-r--r-- | fs/btrfs/inode.c | 46 | ||||
| -rw-r--r-- | fs/ceph/addr.c | 6 | ||||
| -rw-r--r-- | fs/ext4/inode.c | 5 | ||||
| -rw-r--r-- | fs/f2fs/data.c | 5 | ||||
| -rw-r--r-- | fs/fs-writeback.c | 55 | ||||
| -rw-r--r-- | fs/gfs2/aops.c | 5 | ||||
| -rw-r--r-- | fs/ocfs2/journal.c | 11 | ||||
| -rw-r--r-- | fs/super.c | 1 | ||||
| -rw-r--r-- | fs/sync.c | 10 | ||||
| -rw-r--r-- | fs/xfs/xfs_zone_alloc.c | 28 |
12 files changed, 90 insertions, 104 deletions
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index eb0b083da269..612a230bc012 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -483,24 +483,15 @@ v9fs_vm_page_mkwrite(struct vm_fault *vmf) static void v9fs_mmap_vm_close(struct vm_area_struct *vma) { - struct inode *inode; - - struct writeback_control wbc = { - .nr_to_write = LONG_MAX, - .sync_mode = WB_SYNC_ALL, - .range_start = (loff_t)vma->vm_pgoff * PAGE_SIZE, - /* absolute end, byte at end included */ - .range_end = (loff_t)vma->vm_pgoff * PAGE_SIZE + - (vma->vm_end - vma->vm_start - 1), - }; - if (!(vma->vm_flags & VM_SHARED)) return; p9_debug(P9_DEBUG_VFS, "9p VMA close, %p, flushing", vma); - inode = file_inode(vma->vm_file); - filemap_fdatawrite_wbc(inode->i_mapping, &wbc); + filemap_fdatawrite_range(file_inode(vma->vm_file)->i_mapping, + (loff_t)vma->vm_pgoff * PAGE_SIZE, + (loff_t)vma->vm_pgoff * PAGE_SIZE + + (vma->vm_end - vma->vm_start - 1)); } static const struct vm_operations_struct v9fs_mmap_file_vm_ops = { diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 23273d0e6f22..b925da738afd 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2468,10 +2468,7 @@ static int extent_write_cache_pages(struct address_space *mapping, &BTRFS_I(inode)->runtime_flags)) wbc->tagged_writepages = 1; - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag = PAGECACHE_TAG_TOWRITE; - else - tag = PAGECACHE_TAG_DIRTY; + tag = wbc_to_tag(wbc); retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, index, end); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 264d87e9eb6a..2d0e5086dad8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8715,15 +8715,13 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode * some fairly slow code that needs optimization. This walks the list * of all the inodes with pending delalloc and forces them to disk. */ -static int start_delalloc_inodes(struct btrfs_root *root, - struct writeback_control *wbc, bool snapshot, - bool in_reclaim_context) +static int start_delalloc_inodes(struct btrfs_root *root, long *nr_to_write, + bool snapshot, bool in_reclaim_context) { struct btrfs_delalloc_work *work, *next; LIST_HEAD(works); LIST_HEAD(splice); int ret = 0; - bool full_flush = wbc->nr_to_write == LONG_MAX; mutex_lock(&root->delalloc_mutex); spin_lock(&root->delalloc_lock); @@ -8749,10 +8747,10 @@ static int start_delalloc_inodes(struct btrfs_root *root, if (snapshot) set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, &inode->runtime_flags); - if (full_flush) { - work = btrfs_alloc_delalloc_work(&inode->vfs_inode); + if (nr_to_write == NULL) { + work = btrfs_alloc_delalloc_work(tmp_inode); if (!work) { - iput(&inode->vfs_inode); + iput(tmp_inode); ret = -ENOMEM; goto out; } @@ -8760,9 +8758,11 @@ static int start_delalloc_inodes(struct btrfs_root *root, btrfs_queue_work(root->fs_info->flush_workers, &work->work); } else { - ret = filemap_fdatawrite_wbc(inode->vfs_inode.i_mapping, wbc); + ret = filemap_flush_nr(tmp_inode->i_mapping, + nr_to_write); btrfs_add_delayed_iput(inode); - if (ret || wbc->nr_to_write <= 0) + + if (ret || *nr_to_write <= 0) goto out; } cond_resched(); @@ -8788,29 +8788,17 @@ out: int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context) { - struct writeback_control wbc = { - .nr_to_write = LONG_MAX, - .sync_mode = WB_SYNC_NONE, - .range_start = 0, - .range_end = LLONG_MAX, - }; struct btrfs_fs_info *fs_info = root->fs_info; if (BTRFS_FS_ERROR(fs_info)) return -EROFS; - - return start_delalloc_inodes(root, &wbc, true, in_reclaim_context); + return start_delalloc_inodes(root, NULL, true, in_reclaim_context); } int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, bool in_reclaim_context) { - struct writeback_control wbc = { - .nr_to_write = nr, - .sync_mode = WB_SYNC_NONE, - .range_start = 0, - .range_end = LLONG_MAX, - }; + long *nr_to_write = nr == LONG_MAX ? NULL : &nr; struct btrfs_root *root; LIST_HEAD(splice); int ret; @@ -8822,13 +8810,6 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, spin_lock(&fs_info->delalloc_root_lock); list_splice_init(&fs_info->delalloc_roots, &splice); while (!list_empty(&splice)) { - /* - * Reset nr_to_write here so we know that we're doing a full - * flush. - */ - if (nr == LONG_MAX) - wbc.nr_to_write = LONG_MAX; - root = list_first_entry(&splice, struct btrfs_root, delalloc_root); root = btrfs_grab_root(root); @@ -8837,9 +8818,10 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, &fs_info->delalloc_roots); spin_unlock(&fs_info->delalloc_root_lock); - ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context); + ret = start_delalloc_inodes(root, nr_to_write, false, + in_reclaim_context); btrfs_put_root(root); - if (ret < 0 || wbc.nr_to_write <= 0) + if (ret < 0 || nr <= 0) goto out; spin_lock(&fs_info->delalloc_root_lock); } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 322ed268f14a..63b75d214210 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1045,11 +1045,7 @@ void ceph_init_writeback_ctl(struct address_space *mapping, ceph_wbc->index = ceph_wbc->start_index; ceph_wbc->end = -1; - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) { - ceph_wbc->tag = PAGECACHE_TAG_TOWRITE; - } else { - ceph_wbc->tag = PAGECACHE_TAG_DIRTY; - } + ceph_wbc->tag = wbc_to_tag(wbc); ceph_wbc->op_idx = -1; ceph_wbc->num_ops = 0; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e2b15236bab7..57dc1ff91df2 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2618,10 +2618,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) handle_t *handle = NULL; int bpp = ext4_journal_blocks_per_folio(mpd->inode); - if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) - tag = PAGECACHE_TAG_TOWRITE; - else - tag = PAGECACHE_TAG_DIRTY; + tag = wbc_to_tag(mpd->wbc); mpd->map.m_len = 0; mpd->next_pos = mpd->start_pos; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a8989f143c89..8bf4feda42b0 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2986,10 +2986,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; } - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag = PAGECACHE_TAG_TOWRITE; - else - tag = PAGECACHE_TAG_DIRTY; + tag = wbc_to_tag(wbc); retry: retry = 0; if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 917a450c503b..6800886c4d10 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -14,6 +14,7 @@ * Additions for address_space-based writeback */ +#include <linux/sched/sysctl.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/spinlock.h> @@ -32,11 +33,6 @@ #include "internal.h" /* - * 4MB minimal write chunk size - */ -#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_SHIFT - 10)) - -/* * Passed into wb_writeback(), essentially a subset of writeback_control */ struct wb_writeback_work { @@ -200,6 +196,19 @@ static void wb_queue_work(struct bdi_writeback *wb, spin_unlock_irq(&wb->work_lock); } +static bool wb_wait_for_completion_cb(struct wb_completion *done) +{ + unsigned long waited_secs = (jiffies - done->wait_start) / HZ; + + done->progress_stamp = jiffies; + if (waited_secs > sysctl_hung_task_timeout_secs) + pr_info("INFO: The task %s:%d has been waiting for writeback " + "completion for more than %lu seconds.", + current->comm, current->pid, waited_secs); + + return !atomic_read(&done->cnt); +} + /** * wb_wait_for_completion - wait for completion of bdi_writeback_works * @done: target wb_completion @@ -212,8 +221,9 @@ static void wb_queue_work(struct bdi_writeback *wb, */ void wb_wait_for_completion(struct wb_completion *done) { + done->wait_start = jiffies; atomic_dec(&done->cnt); /* put down the initial count */ - wait_event(*done->waitq, !atomic_read(&done->cnt)); + wait_event(*done->waitq, wb_wait_for_completion_cb(done)); } #ifdef CONFIG_CGROUP_WRITEBACK @@ -808,9 +818,9 @@ static void wbc_attach_and_unlock_inode(struct writeback_control *wbc, * @wbc: writeback_control of interest * @inode: target inode * - * This function is to be used by __filemap_fdatawrite_range(), which is an - * alternative entry point into writeback code, and first ensures @inode is - * associated with a bdi_writeback and attaches it to @wbc. + * This function is to be used by filemap_writeback(), which is an alternative + * entry point into writeback code, and first ensures @inode is associated with + * a bdi_writeback and attaches it to @wbc. */ void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, struct inode *inode) @@ -1882,8 +1892,8 @@ out: return ret; } -static long writeback_chunk_size(struct bdi_writeback *wb, - struct wb_writeback_work *work) +static long writeback_chunk_size(struct super_block *sb, + struct bdi_writeback *wb, struct wb_writeback_work *work) { long pages; @@ -1901,16 +1911,13 @@ static long writeback_chunk_size(struct bdi_writeback *wb, * (maybe slowly) sync all tagged pages */ if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages) - pages = LONG_MAX; - else { - pages = min(wb->avg_write_bandwidth / 2, - global_wb_domain.dirty_limit / DIRTY_SCOPE); - pages = min(pages, work->nr_pages); - pages = round_down(pages + MIN_WRITEBACK_PAGES, - MIN_WRITEBACK_PAGES); - } + return LONG_MAX; - return pages; + pages = min(wb->avg_write_bandwidth / 2, + global_wb_domain.dirty_limit / DIRTY_SCOPE); + pages = min(pages, work->nr_pages); + return round_down(pages + sb->s_min_writeback_pages, + sb->s_min_writeback_pages); } /* @@ -2012,7 +2019,7 @@ static long writeback_sb_inodes(struct super_block *sb, inode_state_set(inode, I_SYNC); wbc_attach_and_unlock_inode(&wbc, inode); - write_chunk = writeback_chunk_size(wb, work); + write_chunk = writeback_chunk_size(inode->i_sb, wb, work); wbc.nr_to_write = write_chunk; wbc.pages_skipped = 0; @@ -2022,6 +2029,12 @@ static long writeback_sb_inodes(struct super_block *sb, */ __writeback_single_inode(inode, &wbc); + /* Report progress to inform the hung task detector of the progress. */ + if (work->done && work->done->progress_stamp && + (jiffies - work->done->progress_stamp) > HZ * + sysctl_hung_task_timeout_secs / 2) + wake_up_all(work->done->waitq); + wbc_detach_inode(&wbc); work->nr_pages -= write_chunk - wbc.nr_to_write; wrote = write_chunk - wbc.nr_to_write - wbc.pages_skipped; diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 38d4f343187a..e2b1c860664d 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -311,10 +311,7 @@ static int gfs2_write_cache_jdata(struct address_space *mapping, range_whole = 1; cycled = 1; /* ignore range_cyclic tests */ } - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag = PAGECACHE_TAG_TOWRITE; - else - tag = PAGECACHE_TAG_DIRTY; + tag = wbc_to_tag(wbc); retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index e5f58ff2175f..85239807dec7 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -902,15 +902,8 @@ bail: static int ocfs2_journal_submit_inode_data_buffers(struct jbd2_inode *jinode) { - struct address_space *mapping = jinode->i_vfs_inode->i_mapping; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = mapping->nrpages * 2, - .range_start = jinode->i_dirty_start, - .range_end = jinode->i_dirty_end, - }; - - return filemap_fdatawrite_wbc(mapping, &wbc); + return filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping, + jinode->i_dirty_start, jinode->i_dirty_end); } int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty) diff --git a/fs/super.c b/fs/super.c index 277b84e5c279..7c66b96b59be 100644 --- a/fs/super.c +++ b/fs/super.c @@ -389,6 +389,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, goto fail; if (list_lru_init_memcg(&s->s_inode_lru, s->s_shrink)) goto fail; + s->s_min_writeback_pages = MIN_WRITEBACK_PAGES; return s; fail: diff --git a/fs/sync.c b/fs/sync.c index f161092a28a5..431fc5f5be06 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -281,14 +281,12 @@ int sync_file_range(struct file *file, loff_t offset, loff_t nbytes, } if (flags & SYNC_FILE_RANGE_WRITE) { - int sync_mode = WB_SYNC_NONE; - if ((flags & SYNC_FILE_RANGE_WRITE_AND_WAIT) == SYNC_FILE_RANGE_WRITE_AND_WAIT) - sync_mode = WB_SYNC_ALL; - - ret = __filemap_fdatawrite_range(mapping, offset, endbyte, - sync_mode); + ret = filemap_fdatawrite_range(mapping, offset, + endbyte); + else + ret = filemap_flush_range(mapping, offset, endbyte); if (ret < 0) goto out; } diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c index ef7a931ebde5..8dde444596f1 100644 --- a/fs/xfs/xfs_zone_alloc.c +++ b/fs/xfs/xfs_zone_alloc.c @@ -1204,6 +1204,7 @@ xfs_mount_zones( .mp = mp, }; struct xfs_buftarg *bt = mp->m_rtdev_targp; + xfs_extlen_t zone_blocks = mp->m_groups[XG_TYPE_RTG].blocks; int error; if (!bt) { @@ -1234,10 +1235,33 @@ xfs_mount_zones( return -ENOMEM; xfs_info(mp, "%u zones of %u blocks (%u max open zones)", - mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks, - mp->m_max_open_zones); + mp->m_sb.sb_rgcount, zone_blocks, mp->m_max_open_zones); trace_xfs_zones_mount(mp); + /* + * The writeback code switches between inodes regularly to provide + * fairness. The default lower bound is 4MiB, but for zoned file + * systems we want to increase that both to reduce seeks, but also more + * importantly so that workloads that writes files in a multiple of the + * zone size do not get fragmented and require garbage collection when + * they shouldn't. Increase is to the zone size capped by the max + * extent len. + * + * Note that because s_min_writeback_pages is a superblock field, this + * value also get applied to non-zoned files on the data device if + * there are any. On typical zoned setup all data is on the RT device + * because using the more efficient sequential write required zones + * is the reason for using the zone allocator, and either the RT device + * and the (meta)data device are on the same block device, or the + * (meta)data device is on a fast SSD while the data on the RT device + * is on a SMR HDD. In any combination of the above cases enforcing + * the higher min_writeback_pages for non-RT inodes is either a noop + * or beneficial. + */ + mp->m_super->s_min_writeback_pages = + XFS_FSB_TO_B(mp, min(zone_blocks, XFS_MAX_BMBT_EXTLEN)) >> + PAGE_SHIFT; + if (bdev_is_zoned(bt->bt_bdev)) { error = blkdev_report_zones(bt->bt_bdev, XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart), |