summaryrefslogtreecommitdiff
path: root/fs/btrfs/ordered-data.c
diff options
context:
space:
mode:
authorIngo Molnar <mingo@kernel.org>2024-07-11 10:42:33 +0200
committerIngo Molnar <mingo@kernel.org>2024-07-11 10:42:33 +0200
commit011b1134b82c2750d83a299a1369c678845de45a (patch)
treea6a2ba6bfa62b02cb7b00a67c6f449d965140e5a /fs/btrfs/ordered-data.c
parentd329605287020c3d1c3b0dadc63d8208e7251382 (diff)
parentddae0ca2a8fe12d0e24ab10ba759c3fbd755ada8 (diff)
Merge branch 'sched/urgent' into sched/core, to pick up fixes and refresh the branch
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'fs/btrfs/ordered-data.c')
-rw-r--r--fs/btrfs/ordered-data.c31
1 files changed, 31 insertions, 0 deletions
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index c5bdd674f55c..35a413ce935d 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -388,6 +388,37 @@ bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
ret = can_finish_ordered_extent(ordered, page, file_offset, len, uptodate);
spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
+ /*
+ * If this is a COW write it means we created new extent maps for the
+ * range and they point to unwritten locations if we got an error either
+ * before submitting a bio or during IO.
+ *
+ * We have marked the ordered extent with BTRFS_ORDERED_IOERR, and we
+ * are queuing its completion below. During completion, at
+ * btrfs_finish_one_ordered(), we will drop the extent maps for the
+ * unwritten extents.
+ *
+ * However because completion runs in a work queue we can end up having
+ * a fast fsync running before that. In the case of direct IO, once we
+ * unlock the inode the fsync might start, and we queue the completion
+ * before unlocking the inode. In the case of buffered IO when writeback
+ * finishes (end_bbio_data_write()) we queue the completion, so if the
+ * writeback was triggered by a fast fsync, the fsync might start
+ * logging before ordered extent completion runs in the work queue.
+ *
+ * The fast fsync will log file extent items based on the extent maps it
+ * finds, so if by the time it collects extent maps the ordered extent
+ * completion didn't happen yet, it will log file extent items that
+ * point to unwritten extents, resulting in a corruption if a crash
+ * happens and the log tree is replayed. Note that a fast fsync does not
+ * wait for completion of ordered extents in order to reduce latency.
+ *
+ * Set a flag in the inode so that the next fast fsync will wait for
+ * ordered extents to complete before starting to log.
+ */
+ if (!uptodate && !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
+ set_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags);
+
if (ret)
btrfs_queue_ordered_fn(ordered);
return ret;