summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/acl.c1
-rw-r--r--fs/9p/vfs_dentry.c10
-rw-r--r--fs/9p/vfs_file.c17
-rw-r--r--fs/9p/vfs_inode.c10
-rw-r--r--fs/9p/vfs_inode_dotl.c10
-rw-r--r--fs/Makefile2
-rw-r--r--fs/affs/inode.c2
-rw-r--r--fs/afs/cell.c121
-rw-r--r--fs/afs/dir.c4
-rw-r--r--fs/afs/dynroot.c9
-rw-r--r--fs/afs/inode.c8
-rw-r--r--fs/afs/internal.h13
-rw-r--r--fs/afs/mntpt.c3
-rw-r--r--fs/afs/proc.c3
-rw-r--r--fs/afs/security.c49
-rw-r--r--fs/afs/super.c2
-rw-r--r--fs/afs/vl_alias.c3
-rw-r--r--fs/aio.c6
-rw-r--r--fs/backing-file.c153
-rw-r--r--fs/befs/linuxvfs.c2
-rw-r--r--fs/bfs/inode.c21
-rw-r--r--fs/binfmt_misc.c11
-rw-r--r--fs/btrfs/block-group.c10
-rw-r--r--fs/btrfs/compression.h4
-rw-r--r--fs/btrfs/defrag.c14
-rw-r--r--fs/btrfs/delayed-inode.c2
-rw-r--r--fs/btrfs/delayed-inode.h7
-rw-r--r--fs/btrfs/extent_io.c31
-rw-r--r--fs/btrfs/file.c19
-rw-r--r--fs/btrfs/free-space-tree.c15
-rw-r--r--fs/btrfs/inode.c89
-rw-r--r--fs/btrfs/ioctl.c2
-rw-r--r--fs/btrfs/misc.h5
-rw-r--r--fs/btrfs/ordered-data.c2
-rw-r--r--fs/btrfs/qgroup.c4
-rw-r--r--fs/btrfs/ref-verify.c2
-rw-r--r--fs/btrfs/relocation.c13
-rw-r--r--fs/btrfs/scrub.c6
-rw-r--r--fs/btrfs/send.c60
-rw-r--r--fs/btrfs/subpage.c5
-rw-r--r--fs/btrfs/super.c11
-rw-r--r--fs/btrfs/tree-checker.c2
-rw-r--r--fs/btrfs/tree-log.c5
-rw-r--r--fs/btrfs/volumes.c20
-rw-r--r--fs/btrfs/zoned.c62
-rw-r--r--fs/buffer.c6
-rw-r--r--fs/ceph/addr.c6
-rw-r--r--fs/ceph/cache.c2
-rw-r--r--fs/ceph/crypto.c4
-rw-r--r--fs/ceph/file.c4
-rw-r--r--fs/ceph/inode.c28
-rw-r--r--fs/ceph/super.c2
-rw-r--r--fs/coda/cnode.c4
-rw-r--r--fs/coredump.c144
-rw-r--r--fs/cramfs/inode.c2
-rw-r--r--fs/crypto/inline_crypt.c3
-rw-r--r--fs/crypto/keyring.c2
-rw-r--r--fs/crypto/keysetup.c2
-rw-r--r--fs/dax.c32
-rw-r--r--fs/dcache.c37
-rw-r--r--fs/drop_caches.c2
-rw-r--r--fs/ecryptfs/Kconfig2
-rw-r--r--fs/ecryptfs/crypto.c90
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h13
-rw-r--r--fs/ecryptfs/inode.c13
-rw-r--r--fs/ecryptfs/keystore.c65
-rw-r--r--fs/ecryptfs/main.c7
-rw-r--r--fs/ecryptfs/super.c5
-rw-r--r--fs/efivarfs/super.c1
-rw-r--r--fs/efs/inode.c2
-rw-r--r--fs/erofs/data.c5
-rw-r--r--fs/erofs/decompressor_zstd.c11
-rw-r--r--fs/erofs/fileio.c6
-rw-r--r--fs/erofs/inode.c2
-rw-r--r--fs/erofs/zmap.c59
-rw-r--r--fs/exec.c2
-rw-r--r--fs/exfat/exfat_fs.h1
-rw-r--r--fs/exfat/file.c7
-rw-r--r--fs/exfat/namei.c8
-rw-r--r--fs/exfat/nls.c3
-rw-r--r--fs/exfat/super.c5
-rw-r--r--fs/ext2/inode.c2
-rw-r--r--fs/ext4/ext4_jbd2.c11
-rw-r--r--fs/ext4/inode.c36
-rw-r--r--fs/ext4/mmp.c8
-rw-r--r--fs/ext4/orphan.c8
-rw-r--r--fs/f2fs/acl.c1
-rw-r--r--fs/f2fs/compress.c2
-rw-r--r--fs/f2fs/data.c9
-rw-r--r--fs/f2fs/inode.c2
-rw-r--r--fs/f2fs/namei.c4
-rw-r--r--fs/f2fs/super.c4
-rw-r--r--fs/fat/inode.c7
-rw-r--r--fs/file.c35
-rw-r--r--fs/file_attr.c20
-rw-r--r--fs/file_table.c2
-rw-r--r--fs/freevxfs/vxfs_inode.c2
-rw-r--r--fs/fs-writeback.c187
-rw-r--r--fs/fs_dirent.c (renamed from fs/fs_types.c)2
-rw-r--r--fs/fs_struct.c6
-rw-r--r--fs/fuse/dir.c2
-rw-r--r--fs/fuse/file.c286
-rw-r--r--fs/fuse/fuse_i.h8
-rw-r--r--fs/fuse/inode.c17
-rw-r--r--fs/fuse/ioctl.c4
-rw-r--r--fs/fuse/virtio_fs.c2
-rw-r--r--fs/gfs2/aops.c14
-rw-r--r--fs/gfs2/file.c2
-rw-r--r--fs/gfs2/glock.c2
-rw-r--r--fs/gfs2/glops.c2
-rw-r--r--fs/gfs2/inode.c4
-rw-r--r--fs/gfs2/ops_fstype.c2
-rw-r--r--fs/hfs/btree.c2
-rw-r--r--fs/hfs/inode.c2
-rw-r--r--fs/hfsplus/options.c1
-rw-r--r--fs/hfsplus/super.c2
-rw-r--r--fs/hostfs/hostfs_kern.c31
-rw-r--r--fs/hpfs/dir.c2
-rw-r--r--fs/hpfs/inode.c2
-rw-r--r--fs/hpfs/super.c1
-rw-r--r--fs/hugetlbfs/inode.c9
-rw-r--r--fs/inode.c317
-rw-r--r--fs/iomap/Makefile3
-rw-r--r--fs/iomap/bio.c88
-rw-r--r--fs/iomap/buffered-io.c646
-rw-r--r--fs/iomap/direct-io.c268
-rw-r--r--fs/iomap/internal.h12
-rw-r--r--fs/iomap/ioend.c2
-rw-r--r--fs/iomap/iter.c20
-rw-r--r--fs/iomap/seek.c8
-rw-r--r--fs/iomap/trace.h7
-rw-r--r--fs/isofs/inode.c7
-rw-r--r--fs/jbd2/transaction.c13
-rw-r--r--fs/jffs2/fs.c4
-rw-r--r--fs/jfs/file.c4
-rw-r--r--fs/jfs/inode.c2
-rw-r--r--fs/jfs/jfs_incore.h6
-rw-r--r--fs/jfs/jfs_txnmgr.c2
-rw-r--r--fs/kernfs/inode.c2
-rw-r--r--fs/libfs.c7
-rw-r--r--fs/minix/inode.c2
-rw-r--r--fs/mount.h3
-rw-r--r--fs/namei.c150
-rw-r--r--fs/namespace.c97
-rw-r--r--fs/netfs/buffered_write.c2
-rw-r--r--fs/netfs/misc.c10
-rw-r--r--fs/netfs/read_single.c6
-rw-r--r--fs/nfs/client.c8
-rw-r--r--fs/nfs/dir.c7
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c35
-rw-r--r--fs/nfs/inode.c20
-rw-r--r--fs/nfs/localio.c273
-rw-r--r--fs/nfs/nfs3client.c14
-rw-r--r--fs/nfs/nfs4client.c15
-rw-r--r--fs/nfs/nfs4idmap.c7
-rw-r--r--fs/nfs/nfs4proc.c22
-rw-r--r--fs/nfs/pnfs.c2
-rw-r--r--fs/nfs/pnfs_nfs.c66
-rw-r--r--fs/nfs/sysfs.c1
-rw-r--r--fs/nfs/write.c3
-rw-r--r--fs/nfsd/flexfilelayout.c8
-rw-r--r--fs/nfsd/nfs4proc.c21
-rw-r--r--fs/nfsd/nfs4state.c69
-rw-r--r--fs/nfsd/nfs4xdr.c26
-rw-r--r--fs/nfsd/nfsd.h4
-rw-r--r--fs/nfsd/nfsfh.c6
-rw-r--r--fs/nfsd/vfs.c2
-rw-r--r--fs/nfsd/xdr4.h4
-rw-r--r--fs/nilfs2/cpfile.c2
-rw-r--r--fs/nilfs2/dat.c2
-rw-r--r--fs/nilfs2/ifile.c2
-rw-r--r--fs/nilfs2/inode.c10
-rw-r--r--fs/nilfs2/nilfs.h1
-rw-r--r--fs/nilfs2/segment.c7
-rw-r--r--fs/nilfs2/sufile.c2
-rw-r--r--fs/notify/fdinfo.c6
-rw-r--r--fs/notify/fsnotify.c2
-rw-r--r--fs/nsfs.c103
-rw-r--r--fs/ntfs3/inode.c2
-rw-r--r--fs/ntfs3/super.c1
-rw-r--r--fs/ocfs2/acl.c1
-rw-r--r--fs/ocfs2/alloc.c2
-rw-r--r--fs/ocfs2/dlmglue.c2
-rw-r--r--fs/ocfs2/inode.c27
-rw-r--r--fs/ocfs2/inode.h1
-rw-r--r--fs/ocfs2/journal.c11
-rw-r--r--fs/ocfs2/move_extents.c5
-rw-r--r--fs/ocfs2/ocfs2_trace.h2
-rw-r--r--fs/ocfs2/super.c2
-rw-r--r--fs/omfs/inode.c3
-rw-r--r--fs/open.c15
-rw-r--r--fs/openpromfs/inode.c2
-rw-r--r--fs/orangefs/inode.c6
-rw-r--r--fs/orangefs/orangefs-utils.c6
-rw-r--r--fs/overlayfs/copy_up.c2
-rw-r--r--fs/overlayfs/dir.c2
-rw-r--r--fs/overlayfs/file.c5
-rw-r--r--fs/overlayfs/inode.c11
-rw-r--r--fs/overlayfs/util.c14
-rw-r--r--fs/pidfs.c189
-rw-r--r--fs/pipe.c2
-rw-r--r--fs/proc/generic.c12
-rw-r--r--fs/qnx4/inode.c2
-rw-r--r--fs/qnx6/inode.c2
-rw-r--r--fs/quota/dquot.c2
-rw-r--r--fs/resctrl/monitor.c16
-rw-r--r--fs/romfs/super.c2
-rw-r--r--fs/smb/client/Kconfig7
-rw-r--r--fs/smb/client/cached_dir.c51
-rw-r--r--fs/smb/client/cifs_spnego.c6
-rw-r--r--fs/smb/client/cifsacl.c5
-rw-r--r--fs/smb/client/cifsencrypt.c201
-rw-r--r--fs/smb/client/cifsfs.c8
-rw-r--r--fs/smb/client/cifsglob.h26
-rw-r--r--fs/smb/client/cifsproto.h13
-rw-r--r--fs/smb/client/cifssmb.c30
-rw-r--r--fs/smb/client/connect.c47
-rw-r--r--fs/smb/client/dfs_cache.c55
-rw-r--r--fs/smb/client/file.c1
-rw-r--r--fs/smb/client/fs_context.c8
-rw-r--r--fs/smb/client/inode.c26
-rw-r--r--fs/smb/client/link.c31
-rw-r--r--fs/smb/client/misc.c17
-rw-r--r--fs/smb/client/sess.c2
-rw-r--r--fs/smb/client/smb1ops.c1
-rw-r--r--fs/smb/client/smb2inode.c2
-rw-r--r--fs/smb/client/smb2misc.c53
-rw-r--r--fs/smb/client/smb2ops.c15
-rw-r--r--fs/smb/client/smb2pdu.c7
-rw-r--r--fs/smb/client/smb2proto.h14
-rw-r--r--fs/smb/client/smb2transport.c182
-rw-r--r--fs/smb/client/smbdirect.c427
-rw-r--r--fs/smb/client/smbdirect.h2
-rw-r--r--fs/smb/client/trace.c1
-rw-r--r--fs/smb/client/transport.c2
-rw-r--r--fs/smb/client/xattr.c1
-rw-r--r--fs/smb/common/cifsglob.h30
-rw-r--r--fs/smb/common/smbdirect/smbdirect_socket.h24
-rw-r--r--fs/smb/server/mgmt/user_session.c7
-rw-r--r--fs/smb/server/smb2pdu.c11
-rw-r--r--fs/smb/server/smb_common.h14
-rw-r--r--fs/smb/server/transport_ipc.c20
-rw-r--r--fs/smb/server/transport_rdma.c449
-rw-r--r--fs/smb/server/transport_tcp.c5
-rw-r--r--fs/splice.c2
-rw-r--r--fs/squashfs/inode.c2
-rw-r--r--fs/super.c14
-rw-r--r--fs/sync.c19
-rw-r--r--fs/sysfs/group.c26
-rw-r--r--fs/ubifs/file.c2
-rw-r--r--fs/ubifs/super.c2
-rw-r--r--fs/udf/inode.c2
-rw-r--r--fs/ufs/inode.c2
-rw-r--r--fs/utimes.c1
-rw-r--r--fs/xfs/Kconfig11
-rw-r--r--fs/xfs/libxfs/xfs_errortag.h6
-rw-r--r--fs/xfs/libxfs/xfs_rtgroup.h6
-rw-r--r--fs/xfs/scrub/common.c2
-rw-r--r--fs/xfs/scrub/inode_repair.c2
-rw-r--r--fs/xfs/scrub/nlinks.c34
-rw-r--r--fs/xfs/scrub/parent.c2
-rw-r--r--fs/xfs/scrub/symlink_repair.c2
-rw-r--r--fs/xfs/scrub/xfarray.c2
-rw-r--r--fs/xfs/xfs_aops.c7
-rw-r--r--fs/xfs/xfs_bmap_util.c2
-rw-r--r--fs/xfs/xfs_buf.c2
-rw-r--r--fs/xfs/xfs_buf.h1
-rw-r--r--fs/xfs/xfs_discard.c4
-rw-r--r--fs/xfs/xfs_file.c50
-rw-r--r--fs/xfs/xfs_health.c4
-rw-r--r--fs/xfs/xfs_icache.c6
-rw-r--r--fs/xfs/xfs_inode.c6
-rw-r--r--fs/xfs/xfs_inode_item.c4
-rw-r--r--fs/xfs/xfs_ioctl.c6
-rw-r--r--fs/xfs/xfs_iomap.c120
-rw-r--r--fs/xfs/xfs_iops.c2
-rw-r--r--fs/xfs/xfs_mount.h1
-rw-r--r--fs/xfs/xfs_reflink.h2
-rw-r--r--fs/xfs/xfs_super.c58
-rw-r--r--fs/xfs/xfs_zone_alloc.c190
-rw-r--r--fs/xfs/xfs_zone_gc.c108
-rw-r--r--fs/xfs/xfs_zone_priv.h2
-rw-r--r--fs/zonefs/file.c5
-rw-r--r--fs/zonefs/super.c4
284 files changed, 4331 insertions, 3162 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index eed551d8555f..633da5e37299 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -6,6 +6,7 @@
#include <linux/module.h>
#include <linux/fs.h>
+#include <linux/fs_struct.h>
#include <net/9p/9p.h>
#include <net/9p/client.h>
#include <linux/slab.h>
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index f3248a3e5402..c1acbc98465d 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -66,7 +66,6 @@ static int __v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
struct p9_fid *fid;
struct inode *inode;
struct v9fs_inode *v9inode;
- unsigned int cached;
if (flags & LOOKUP_RCU)
return -ECHILD;
@@ -76,11 +75,7 @@ static int __v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
goto out_valid;
v9inode = V9FS_I(inode);
- struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
-
- cached = v9ses->cache & (CACHE_META | CACHE_LOOSE);
-
- if (!cached || v9inode->cache_validity & V9FS_INO_INVALID_ATTR) {
+ if (v9inode->cache_validity & V9FS_INO_INVALID_ATTR) {
int retval;
struct v9fs_session_info *v9ses;
@@ -114,6 +109,7 @@ static int __v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
p9_debug(P9_DEBUG_VFS,
"refresh inode: dentry = %pd (%p), got error %pe\n",
dentry, dentry, ERR_PTR(retval));
+ if (retval < 0)
return retval;
}
}
@@ -150,8 +146,6 @@ const struct dentry_operations v9fs_cached_dentry_operations = {
};
const struct dentry_operations v9fs_dentry_operations = {
- .d_revalidate = v9fs_lookup_revalidate,
- .d_weak_revalidate = __v9fs_lookup_revalidate,
.d_release = v9fs_dentry_release,
.d_unalias_trylock = v9fs_dentry_unalias_trylock,
.d_unalias_unlock = v9fs_dentry_unalias_unlock,
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index eb0b083da269..612a230bc012 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -483,24 +483,15 @@ v9fs_vm_page_mkwrite(struct vm_fault *vmf)
static void v9fs_mmap_vm_close(struct vm_area_struct *vma)
{
- struct inode *inode;
-
- struct writeback_control wbc = {
- .nr_to_write = LONG_MAX,
- .sync_mode = WB_SYNC_ALL,
- .range_start = (loff_t)vma->vm_pgoff * PAGE_SIZE,
- /* absolute end, byte at end included */
- .range_end = (loff_t)vma->vm_pgoff * PAGE_SIZE +
- (vma->vm_end - vma->vm_start - 1),
- };
-
if (!(vma->vm_flags & VM_SHARED))
return;
p9_debug(P9_DEBUG_VFS, "9p VMA close, %p, flushing", vma);
- inode = file_inode(vma->vm_file);
- filemap_fdatawrite_wbc(inode->i_mapping, &wbc);
+ filemap_fdatawrite_range(file_inode(vma->vm_file)->i_mapping,
+ (loff_t)vma->vm_pgoff * PAGE_SIZE,
+ (loff_t)vma->vm_pgoff * PAGE_SIZE +
+ (vma->vm_end - vma->vm_start - 1));
}
static const struct vm_operations_struct v9fs_mmap_file_vm_ops = {
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 69f378a83775..8666c9c62258 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -422,7 +422,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb,
inode = iget5_locked(sb, QID2INO(qid), test, v9fs_set_inode, st);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
/*
* initialize the inode with the stat info
@@ -1339,14 +1339,8 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode)
* Don't update inode if the file type is different
*/
umode = p9mode2unixmode(v9ses, st, &rdev);
- if (inode_wrong_type(inode, umode)) {
- /*
- * Do this as a way of letting the caller know the inode should not
- * be reused
- */
- v9fs_invalidate_inode_attr(inode);
+ if (inode_wrong_type(inode, umode))
goto out;
- }
/*
* We don't want to refresh inode->i_size,
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 0b404e8484d2..1661a25f2772 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -112,7 +112,7 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb,
inode = iget5_locked(sb, QID2INO(qid), test, v9fs_set_inode_dotl, st);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
/*
* initialize the inode with the stat info
@@ -897,14 +897,8 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
/*
* Don't update inode if the file type is different
*/
- if (inode_wrong_type(inode, st->st_mode)) {
- /*
- * Do this as a way of letting the caller know the inode should not
- * be reused
- */
- v9fs_invalidate_inode_attr(inode);
+ if (inode_wrong_type(inode, st->st_mode))
goto out;
- }
/*
* We don't want to refresh inode->i_size,
diff --git a/fs/Makefile b/fs/Makefile
index e3523ab2e587..a04274a3c854 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -14,7 +14,7 @@ obj-y := open.o read_write.o file_table.o super.o \
seq_file.o xattr.o libfs.o fs-writeback.o \
pnode.o splice.o sync.o utimes.o d_path.o \
stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
- fs_types.o fs_context.o fs_parser.o fsopen.o init.o \
+ fs_dirent.o fs_context.o fs_parser.o fsopen.o init.o \
kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o \
file_attr.o
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 0210df8d3500..0bfc7d151dcd 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -29,7 +29,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
inode = iget_locked(sb, ino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
pr_debug("affs_iget(%lu)\n", inode->i_ino);
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index f31359922e98..71c10a05cebe 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -140,7 +140,9 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
return ERR_PTR(-ENOMEM);
}
- cell->name = kmalloc(1 + namelen + 1, GFP_KERNEL);
+ /* Allocate the cell name and the key name in one go. */
+ cell->name = kmalloc(1 + namelen + 1 +
+ 4 + namelen + 1, GFP_KERNEL);
if (!cell->name) {
kfree(cell);
return ERR_PTR(-ENOMEM);
@@ -151,7 +153,11 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
cell->name_len = namelen;
for (i = 0; i < namelen; i++)
cell->name[i] = tolower(name[i]);
- cell->name[i] = 0;
+ cell->name[i++] = 0;
+
+ cell->key_desc = cell->name + i;
+ memcpy(cell->key_desc, "afs@", 4);
+ memcpy(cell->key_desc + 4, cell->name, cell->name_len + 1);
cell->net = net;
refcount_set(&cell->ref, 1);
@@ -229,7 +235,7 @@ error:
* @name: The name of the cell.
* @namesz: The strlen of the cell name.
* @vllist: A colon/comma separated list of numeric IP addresses or NULL.
- * @excl: T if an error should be given if the cell name already exists.
+ * @reason: The reason we're doing the lookup
* @trace: The reason to be logged if the lookup is successful.
*
* Look up a cell record by name and query the DNS for VL server addresses if
@@ -239,7 +245,8 @@ error:
*/
struct afs_cell *afs_lookup_cell(struct afs_net *net,
const char *name, unsigned int namesz,
- const char *vllist, bool excl,
+ const char *vllist,
+ enum afs_lookup_cell_for reason,
enum afs_cell_trace trace)
{
struct afs_cell *cell, *candidate, *cursor;
@@ -247,12 +254,18 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net,
enum afs_cell_state state;
int ret, n;
- _enter("%s,%s", name, vllist);
+ _enter("%s,%s,%u", name, vllist, reason);
- if (!excl) {
+ if (reason != AFS_LOOKUP_CELL_PRELOAD) {
cell = afs_find_cell(net, name, namesz, trace);
- if (!IS_ERR(cell))
+ if (!IS_ERR(cell)) {
+ if (reason == AFS_LOOKUP_CELL_DYNROOT)
+ goto no_wait;
+ if (cell->state == AFS_CELL_SETTING_UP ||
+ cell->state == AFS_CELL_UNLOOKED)
+ goto lookup_cell;
goto wait_for_cell;
+ }
}
/* Assume we're probably going to create a cell and preallocate and
@@ -298,26 +311,69 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net,
rb_insert_color(&cell->net_node, &net->cells);
up_write(&net->cells_lock);
- afs_queue_cell(cell, afs_cell_trace_queue_new);
+lookup_cell:
+ if (reason != AFS_LOOKUP_CELL_PRELOAD &&
+ reason != AFS_LOOKUP_CELL_ROOTCELL) {
+ set_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags);
+ afs_queue_cell(cell, afs_cell_trace_queue_new);
+ }
wait_for_cell:
- _debug("wait_for_cell");
state = smp_load_acquire(&cell->state); /* vs error */
- if (state != AFS_CELL_ACTIVE &&
- state != AFS_CELL_DEAD) {
+ switch (state) {
+ case AFS_CELL_ACTIVE:
+ case AFS_CELL_DEAD:
+ break;
+ case AFS_CELL_UNLOOKED:
+ default:
+ if (reason == AFS_LOOKUP_CELL_PRELOAD ||
+ reason == AFS_LOOKUP_CELL_ROOTCELL)
+ break;
+ _debug("wait_for_cell");
afs_see_cell(cell, afs_cell_trace_wait);
wait_var_event(&cell->state,
({
state = smp_load_acquire(&cell->state); /* vs error */
state == AFS_CELL_ACTIVE || state == AFS_CELL_DEAD;
}));
+ _debug("waited_for_cell %d %d", cell->state, cell->error);
}
+no_wait:
/* Check the state obtained from the wait check. */
+ state = smp_load_acquire(&cell->state); /* vs error */
if (state == AFS_CELL_DEAD) {
ret = cell->error;
goto error;
}
+ if (state == AFS_CELL_ACTIVE) {
+ switch (cell->dns_status) {
+ case DNS_LOOKUP_NOT_DONE:
+ if (cell->dns_source == DNS_RECORD_FROM_CONFIG) {
+ ret = 0;
+ break;
+ }
+ fallthrough;
+ default:
+ ret = -EIO;
+ goto error;
+ case DNS_LOOKUP_GOOD:
+ case DNS_LOOKUP_GOOD_WITH_BAD:
+ ret = 0;
+ break;
+ case DNS_LOOKUP_GOT_NOT_FOUND:
+ ret = -ENOENT;
+ goto error;
+ case DNS_LOOKUP_BAD:
+ ret = -EREMOTEIO;
+ goto error;
+ case DNS_LOOKUP_GOT_LOCAL_FAILURE:
+ case DNS_LOOKUP_GOT_TEMP_FAILURE:
+ case DNS_LOOKUP_GOT_NS_FAILURE:
+ ret = -EDESTADDRREQ;
+ goto error;
+ }
+ }
_leave(" = %p [cell]", cell);
return cell;
@@ -325,7 +381,7 @@ wait_for_cell:
cell_already_exists:
_debug("cell exists");
cell = cursor;
- if (excl) {
+ if (reason == AFS_LOOKUP_CELL_PRELOAD) {
ret = -EEXIST;
} else {
afs_use_cell(cursor, trace);
@@ -384,7 +440,8 @@ int afs_cell_init(struct afs_net *net, const char *rootcell)
return -EINVAL;
/* allocate a cell record for the root/workstation cell */
- new_root = afs_lookup_cell(net, rootcell, len, vllist, false,
+ new_root = afs_lookup_cell(net, rootcell, len, vllist,
+ AFS_LOOKUP_CELL_ROOTCELL,
afs_cell_trace_use_lookup_ws);
if (IS_ERR(new_root)) {
_leave(" = %ld", PTR_ERR(new_root));
@@ -660,33 +717,6 @@ void afs_set_cell_timer(struct afs_cell *cell, unsigned int delay_secs)
}
/*
- * Allocate a key to use as a placeholder for anonymous user security.
- */
-static int afs_alloc_anon_key(struct afs_cell *cell)
-{
- struct key *key;
- char keyname[4 + AFS_MAXCELLNAME + 1], *cp, *dp;
-
- /* Create a key to represent an anonymous user. */
- memcpy(keyname, "afs@", 4);
- dp = keyname + 4;
- cp = cell->name;
- do {
- *dp++ = tolower(*cp);
- } while (*cp++);
-
- key = rxrpc_get_null_key(keyname);
- if (IS_ERR(key))
- return PTR_ERR(key);
-
- cell->anonymous_key = key;
-
- _debug("anon key %p{%x}",
- cell->anonymous_key, key_serial(cell->anonymous_key));
- return 0;
-}
-
-/*
* Activate a cell.
*/
static int afs_activate_cell(struct afs_net *net, struct afs_cell *cell)
@@ -695,12 +725,6 @@ static int afs_activate_cell(struct afs_net *net, struct afs_cell *cell)
struct afs_cell *pcell;
int ret;
- if (!cell->anonymous_key) {
- ret = afs_alloc_anon_key(cell);
- if (ret < 0)
- return ret;
- }
-
ret = afs_proc_cell_setup(cell);
if (ret < 0)
return ret;
@@ -777,6 +801,7 @@ static bool afs_manage_cell(struct afs_cell *cell)
switch (cell->state) {
case AFS_CELL_SETTING_UP:
goto set_up_cell;
+ case AFS_CELL_UNLOOKED:
case AFS_CELL_ACTIVE:
goto cell_is_active;
case AFS_CELL_REMOVING:
@@ -797,7 +822,7 @@ set_up_cell:
goto remove_cell;
}
- afs_set_cell_state(cell, AFS_CELL_ACTIVE);
+ afs_set_cell_state(cell, AFS_CELL_UNLOOKED);
cell_is_active:
if (afs_has_cell_expired(cell, &next_manage))
@@ -807,6 +832,8 @@ cell_is_active:
ret = afs_update_cell(cell);
if (ret < 0)
cell->error = ret;
+ if (cell->state == AFS_CELL_UNLOOKED)
+ afs_set_cell_state(cell, AFS_CELL_ACTIVE);
}
if (next_manage < TIME64_MAX && cell->net->live) {
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 89d36e3e5c79..f4e9e12373ac 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -779,7 +779,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry)
struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode;
struct inode *inode = NULL, *ti;
afs_dataversion_t data_version = READ_ONCE(dvnode->status.data_version);
- bool supports_ibulk;
+ bool supports_ibulk, isnew;
long ret;
int i;
@@ -850,7 +850,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry)
* callback counters.
*/
ti = ilookup5_nowait(dir->i_sb, vp->fid.vnode,
- afs_ilookup5_test_by_fid, &vp->fid);
+ afs_ilookup5_test_by_fid, &vp->fid, &isnew);
if (!IS_ERR_OR_NULL(ti)) {
vnode = AFS_FS_I(ti);
vp->dv_before = vnode->status.data_version;
diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c
index 8c6130789fde..aa56e8951e03 100644
--- a/fs/afs/dynroot.c
+++ b/fs/afs/dynroot.c
@@ -64,7 +64,7 @@ static struct inode *afs_iget_pseudo_dir(struct super_block *sb, ino_t ino)
vnode = AFS_FS_I(inode);
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
netfs_inode_init(&vnode->netfs, NULL, false);
simple_inode_init_ts(inode);
set_nlink(inode, 2);
@@ -108,7 +108,8 @@ static struct dentry *afs_dynroot_lookup_cell(struct inode *dir, struct dentry *
dotted = true;
}
- cell = afs_lookup_cell(net, name, len, NULL, false,
+ cell = afs_lookup_cell(net, name, len, NULL,
+ AFS_LOOKUP_CELL_DYNROOT,
afs_cell_trace_use_lookup_dynroot);
if (IS_ERR(cell)) {
ret = PTR_ERR(cell);
@@ -258,7 +259,7 @@ static struct dentry *afs_lookup_atcell(struct inode *dir, struct dentry *dentry
vnode = AFS_FS_I(inode);
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
netfs_inode_init(&vnode->netfs, NULL, false);
simple_inode_init_ts(inode);
set_nlink(inode, 1);
@@ -383,7 +384,7 @@ struct inode *afs_dynroot_iget_root(struct super_block *sb)
vnode = AFS_FS_I(inode);
/* there shouldn't be an existing inode */
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
netfs_inode_init(&vnode->netfs, NULL, false);
simple_inode_init_ts(inode);
set_nlink(inode, 2);
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index e1cb17b85791..dde1857fcabb 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -427,7 +427,7 @@ static void afs_fetch_status_success(struct afs_operation *op)
struct afs_vnode *vnode = vp->vnode;
int ret;
- if (vnode->netfs.inode.i_state & I_NEW) {
+ if (inode_state_read_once(&vnode->netfs.inode) & I_NEW) {
ret = afs_inode_init_from_status(op, vp, vnode);
afs_op_set_error(op, ret);
if (ret == 0)
@@ -579,7 +579,7 @@ struct inode *afs_iget(struct afs_operation *op, struct afs_vnode_param *vp)
inode, vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
/* deal with an existing inode */
- if (!(inode->i_state & I_NEW)) {
+ if (!(inode_state_read_once(inode) & I_NEW)) {
_leave(" = %p", inode);
return inode;
}
@@ -639,7 +639,7 @@ struct inode *afs_root_iget(struct super_block *sb, struct key *key)
_debug("GOT ROOT INODE %p { vl=%llx }", inode, as->volume->vid);
- BUG_ON(!(inode->i_state & I_NEW));
+ BUG_ON(!(inode_state_read_once(inode) & I_NEW));
vnode = AFS_FS_I(inode);
vnode->cb_v_check = atomic_read(&as->volume->cb_v_break);
@@ -748,7 +748,7 @@ void afs_evict_inode(struct inode *inode)
if ((S_ISDIR(inode->i_mode) ||
S_ISLNK(inode->i_mode)) &&
- (inode->i_state & I_DIRTY) &&
+ (inode_state_read_once(inode) & I_DIRTY) &&
!sbi->dyn_root) {
struct writeback_control wbc = {
.sync_mode = WB_SYNC_ALL,
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index a45ae5c2ef8a..009064b8d661 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -343,6 +343,7 @@ extern const char afs_init_sysname[];
enum afs_cell_state {
AFS_CELL_SETTING_UP,
+ AFS_CELL_UNLOOKED,
AFS_CELL_ACTIVE,
AFS_CELL_REMOVING,
AFS_CELL_DEAD,
@@ -412,6 +413,7 @@ struct afs_cell {
u8 name_len; /* Length of name */
char *name; /* Cell name, case-flattened and NUL-padded */
+ char *key_desc; /* Authentication key description */
};
/*
@@ -1049,9 +1051,18 @@ static inline bool afs_cb_is_broken(unsigned int cb_break,
extern int afs_cell_init(struct afs_net *, const char *);
extern struct afs_cell *afs_find_cell(struct afs_net *, const char *, unsigned,
enum afs_cell_trace);
+enum afs_lookup_cell_for {
+ AFS_LOOKUP_CELL_DYNROOT,
+ AFS_LOOKUP_CELL_MOUNTPOINT,
+ AFS_LOOKUP_CELL_DIRECT_MOUNT,
+ AFS_LOOKUP_CELL_PRELOAD,
+ AFS_LOOKUP_CELL_ROOTCELL,
+ AFS_LOOKUP_CELL_ALIAS_CHECK,
+};
struct afs_cell *afs_lookup_cell(struct afs_net *net,
const char *name, unsigned int namesz,
- const char *vllist, bool excl,
+ const char *vllist,
+ enum afs_lookup_cell_for reason,
enum afs_cell_trace trace);
extern struct afs_cell *afs_use_cell(struct afs_cell *, enum afs_cell_trace);
void afs_unuse_cell(struct afs_cell *cell, enum afs_cell_trace reason);
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 1ad048e6e164..57c204a3c04e 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -107,7 +107,8 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt)
if (size > AFS_MAXCELLNAME)
return -ENAMETOOLONG;
- cell = afs_lookup_cell(ctx->net, p, size, NULL, false,
+ cell = afs_lookup_cell(ctx->net, p, size, NULL,
+ AFS_LOOKUP_CELL_MOUNTPOINT,
afs_cell_trace_use_lookup_mntpt);
if (IS_ERR(cell)) {
pr_err("kAFS: unable to lookup cell '%pd'\n", mntpt);
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 40e879c8ca77..44520549b509 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -122,7 +122,8 @@ static int afs_proc_cells_write(struct file *file, char *buf, size_t size)
if (strcmp(buf, "add") == 0) {
struct afs_cell *cell;
- cell = afs_lookup_cell(net, name, strlen(name), args, true,
+ cell = afs_lookup_cell(net, name, strlen(name), args,
+ AFS_LOOKUP_CELL_PRELOAD,
afs_cell_trace_use_lookup_add);
if (IS_ERR(cell)) {
ret = PTR_ERR(cell);
diff --git a/fs/afs/security.c b/fs/afs/security.c
index 6a7744c9e2a2..55ddce94af03 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -16,6 +16,31 @@
static DEFINE_HASHTABLE(afs_permits_cache, 10);
static DEFINE_SPINLOCK(afs_permits_lock);
+static DEFINE_MUTEX(afs_key_lock);
+
+/*
+ * Allocate a key to use as a placeholder for anonymous user security.
+ */
+static int afs_alloc_anon_key(struct afs_cell *cell)
+{
+ struct key *key;
+
+ mutex_lock(&afs_key_lock);
+ key = cell->anonymous_key;
+ if (!key) {
+ key = rxrpc_get_null_key(cell->key_desc);
+ if (!IS_ERR(key))
+ cell->anonymous_key = key;
+ }
+ mutex_unlock(&afs_key_lock);
+
+ if (IS_ERR(key))
+ return PTR_ERR(key);
+
+ _debug("anon key %p{%x}",
+ cell->anonymous_key, key_serial(cell->anonymous_key));
+ return 0;
+}
/*
* get a key
@@ -23,11 +48,12 @@ static DEFINE_SPINLOCK(afs_permits_lock);
struct key *afs_request_key(struct afs_cell *cell)
{
struct key *key;
+ int ret;
- _enter("{%x}", key_serial(cell->anonymous_key));
+ _enter("{%s}", cell->key_desc);
- _debug("key %s", cell->anonymous_key->description);
- key = request_key_net(&key_type_rxrpc, cell->anonymous_key->description,
+ _debug("key %s", cell->key_desc);
+ key = request_key_net(&key_type_rxrpc, cell->key_desc,
cell->net->net, NULL);
if (IS_ERR(key)) {
if (PTR_ERR(key) != -ENOKEY) {
@@ -35,6 +61,12 @@ struct key *afs_request_key(struct afs_cell *cell)
return key;
}
+ if (!cell->anonymous_key) {
+ ret = afs_alloc_anon_key(cell);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ }
+
/* act as anonymous user */
_leave(" = {%x} [anon]", key_serial(cell->anonymous_key));
return key_get(cell->anonymous_key);
@@ -52,11 +84,10 @@ struct key *afs_request_key_rcu(struct afs_cell *cell)
{
struct key *key;
- _enter("{%x}", key_serial(cell->anonymous_key));
+ _enter("{%s}", cell->key_desc);
- _debug("key %s", cell->anonymous_key->description);
- key = request_key_net_rcu(&key_type_rxrpc,
- cell->anonymous_key->description,
+ _debug("key %s", cell->key_desc);
+ key = request_key_net_rcu(&key_type_rxrpc, cell->key_desc,
cell->net->net);
if (IS_ERR(key)) {
if (PTR_ERR(key) != -ENOKEY) {
@@ -65,6 +96,8 @@ struct key *afs_request_key_rcu(struct afs_cell *cell)
}
/* act as anonymous user */
+ if (!cell->anonymous_key)
+ return NULL; /* Need to allocate */
_leave(" = {%x} [anon]", key_serial(cell->anonymous_key));
return key_get(cell->anonymous_key);
} else {
@@ -408,7 +441,7 @@ int afs_permission(struct mnt_idmap *idmap, struct inode *inode,
if (mask & MAY_NOT_BLOCK) {
key = afs_request_key_rcu(vnode->volume->cell);
- if (IS_ERR(key))
+ if (IS_ERR_OR_NULL(key))
return -ECHILD;
ret = -ECHILD;
diff --git a/fs/afs/super.c b/fs/afs/super.c
index da407f2d6f0d..d672b7ab57ae 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -290,7 +290,7 @@ static int afs_parse_source(struct fs_context *fc, struct fs_parameter *param)
/* lookup the cell record */
if (cellname) {
cell = afs_lookup_cell(ctx->net, cellname, cellnamesz,
- NULL, false,
+ NULL, AFS_LOOKUP_CELL_DIRECT_MOUNT,
afs_cell_trace_use_lookup_mount);
if (IS_ERR(cell)) {
pr_err("kAFS: unable to lookup cell '%*.*s'\n",
diff --git a/fs/afs/vl_alias.c b/fs/afs/vl_alias.c
index 709b4cdb723e..fc9676abd252 100644
--- a/fs/afs/vl_alias.c
+++ b/fs/afs/vl_alias.c
@@ -269,7 +269,8 @@ static int yfs_check_canonical_cell_name(struct afs_cell *cell, struct key *key)
if (!name_len || name_len > AFS_MAXCELLNAME)
master = ERR_PTR(-EOPNOTSUPP);
else
- master = afs_lookup_cell(cell->net, cell_name, name_len, NULL, false,
+ master = afs_lookup_cell(cell->net, cell_name, name_len, NULL,
+ AFS_LOOKUP_CELL_ALIAS_CHECK,
afs_cell_trace_use_lookup_canonical);
kfree(cell_name);
if (IS_ERR(master))
diff --git a/fs/aio.c b/fs/aio.c
index 5bc133386407..0a23a8c0717f 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1640,10 +1640,10 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb,
static void aio_fsync_work(struct work_struct *work)
{
struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, fsync.work);
- const struct cred *old_cred = override_creds(iocb->fsync.creds);
- iocb->ki_res.res = vfs_fsync(iocb->fsync.file, iocb->fsync.datasync);
- revert_creds(old_cred);
+ scoped_with_creds(iocb->fsync.creds)
+ iocb->ki_res.res = vfs_fsync(iocb->fsync.file, iocb->fsync.datasync);
+
put_cred(iocb->fsync.creds);
iocb_put(iocb);
}
diff --git a/fs/backing-file.c b/fs/backing-file.c
index 15a7f8031084..45da8600d564 100644
--- a/fs/backing-file.c
+++ b/fs/backing-file.c
@@ -157,13 +157,37 @@ static int backing_aio_init_wq(struct kiocb *iocb)
return sb_init_dio_done_wq(sb);
}
+static int do_backing_file_read_iter(struct file *file, struct iov_iter *iter,
+ struct kiocb *iocb, int flags)
+{
+ struct backing_aio *aio = NULL;
+ int ret;
+
+ if (is_sync_kiocb(iocb)) {
+ rwf_t rwf = iocb_to_rw_flags(flags);
+
+ return vfs_iter_read(file, iter, &iocb->ki_pos, rwf);
+ }
+
+ aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL);
+ if (!aio)
+ return -ENOMEM;
+
+ aio->orig_iocb = iocb;
+ kiocb_clone(&aio->iocb, iocb, get_file(file));
+ aio->iocb.ki_complete = backing_aio_rw_complete;
+ refcount_set(&aio->ref, 2);
+ ret = vfs_iocb_iter_read(file, &aio->iocb, iter);
+ backing_aio_put(aio);
+ if (ret != -EIOCBQUEUED)
+ backing_aio_cleanup(aio, ret);
+ return ret;
+}
ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter,
struct kiocb *iocb, int flags,
struct backing_file_ctx *ctx)
{
- struct backing_aio *aio = NULL;
- const struct cred *old_cred;
ssize_t ret;
if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)))
@@ -176,41 +200,57 @@ ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter,
!(file->f_mode & FMODE_CAN_ODIRECT))
return -EINVAL;
- old_cred = override_creds(ctx->cred);
+ scoped_with_creds(ctx->cred)
+ ret = do_backing_file_read_iter(file, iter, iocb, flags);
+
+ if (ctx->accessed)
+ ctx->accessed(iocb->ki_filp);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(backing_file_read_iter);
+
+static int do_backing_file_write_iter(struct file *file, struct iov_iter *iter,
+ struct kiocb *iocb, int flags,
+ void (*end_write)(struct kiocb *, ssize_t))
+{
+ struct backing_aio *aio;
+ int ret;
+
if (is_sync_kiocb(iocb)) {
rwf_t rwf = iocb_to_rw_flags(flags);
- ret = vfs_iter_read(file, iter, &iocb->ki_pos, rwf);
- } else {
- ret = -ENOMEM;
- aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL);
- if (!aio)
- goto out;
-
- aio->orig_iocb = iocb;
- kiocb_clone(&aio->iocb, iocb, get_file(file));
- aio->iocb.ki_complete = backing_aio_rw_complete;
- refcount_set(&aio->ref, 2);
- ret = vfs_iocb_iter_read(file, &aio->iocb, iter);
- backing_aio_put(aio);
- if (ret != -EIOCBQUEUED)
- backing_aio_cleanup(aio, ret);
+ ret = vfs_iter_write(file, iter, &iocb->ki_pos, rwf);
+ if (end_write)
+ end_write(iocb, ret);
+ return ret;
}
-out:
- revert_creds(old_cred);
- if (ctx->accessed)
- ctx->accessed(iocb->ki_filp);
+ ret = backing_aio_init_wq(iocb);
+ if (ret)
+ return ret;
+
+ aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL);
+ if (!aio)
+ return -ENOMEM;
+ aio->orig_iocb = iocb;
+ aio->end_write = end_write;
+ kiocb_clone(&aio->iocb, iocb, get_file(file));
+ aio->iocb.ki_flags = flags;
+ aio->iocb.ki_complete = backing_aio_queue_completion;
+ refcount_set(&aio->ref, 2);
+ ret = vfs_iocb_iter_write(file, &aio->iocb, iter);
+ backing_aio_put(aio);
+ if (ret != -EIOCBQUEUED)
+ backing_aio_cleanup(aio, ret);
return ret;
}
-EXPORT_SYMBOL_GPL(backing_file_read_iter);
ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter,
struct kiocb *iocb, int flags,
struct backing_file_ctx *ctx)
{
- const struct cred *old_cred;
ssize_t ret;
if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)))
@@ -227,46 +267,8 @@ ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter,
!(file->f_mode & FMODE_CAN_ODIRECT))
return -EINVAL;
- /*
- * Stacked filesystems don't support deferred completions, don't copy
- * this property in case it is set by the issuer.
- */
- flags &= ~IOCB_DIO_CALLER_COMP;
-
- old_cred = override_creds(ctx->cred);
- if (is_sync_kiocb(iocb)) {
- rwf_t rwf = iocb_to_rw_flags(flags);
-
- ret = vfs_iter_write(file, iter, &iocb->ki_pos, rwf);
- if (ctx->end_write)
- ctx->end_write(iocb, ret);
- } else {
- struct backing_aio *aio;
-
- ret = backing_aio_init_wq(iocb);
- if (ret)
- goto out;
-
- ret = -ENOMEM;
- aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL);
- if (!aio)
- goto out;
-
- aio->orig_iocb = iocb;
- aio->end_write = ctx->end_write;
- kiocb_clone(&aio->iocb, iocb, get_file(file));
- aio->iocb.ki_flags = flags;
- aio->iocb.ki_complete = backing_aio_queue_completion;
- refcount_set(&aio->ref, 2);
- ret = vfs_iocb_iter_write(file, &aio->iocb, iter);
- backing_aio_put(aio);
- if (ret != -EIOCBQUEUED)
- backing_aio_cleanup(aio, ret);
- }
-out:
- revert_creds(old_cred);
-
- return ret;
+ scoped_with_creds(ctx->cred)
+ return do_backing_file_write_iter(file, iter, iocb, flags, ctx->end_write);
}
EXPORT_SYMBOL_GPL(backing_file_write_iter);
@@ -275,15 +277,13 @@ ssize_t backing_file_splice_read(struct file *in, struct kiocb *iocb,
unsigned int flags,
struct backing_file_ctx *ctx)
{
- const struct cred *old_cred;
ssize_t ret;
if (WARN_ON_ONCE(!(in->f_mode & FMODE_BACKING)))
return -EIO;
- old_cred = override_creds(ctx->cred);
- ret = vfs_splice_read(in, &iocb->ki_pos, pipe, len, flags);
- revert_creds(old_cred);
+ scoped_with_creds(ctx->cred)
+ ret = vfs_splice_read(in, &iocb->ki_pos, pipe, len, flags);
if (ctx->accessed)
ctx->accessed(iocb->ki_filp);
@@ -297,7 +297,6 @@ ssize_t backing_file_splice_write(struct pipe_inode_info *pipe,
size_t len, unsigned int flags,
struct backing_file_ctx *ctx)
{
- const struct cred *old_cred;
ssize_t ret;
if (WARN_ON_ONCE(!(out->f_mode & FMODE_BACKING)))
@@ -310,11 +309,11 @@ ssize_t backing_file_splice_write(struct pipe_inode_info *pipe,
if (ret)
return ret;
- old_cred = override_creds(ctx->cred);
- file_start_write(out);
- ret = out->f_op->splice_write(pipe, out, &iocb->ki_pos, len, flags);
- file_end_write(out);
- revert_creds(old_cred);
+ scoped_with_creds(ctx->cred) {
+ file_start_write(out);
+ ret = out->f_op->splice_write(pipe, out, &iocb->ki_pos, len, flags);
+ file_end_write(out);
+ }
if (ctx->end_write)
ctx->end_write(iocb, ret);
@@ -326,7 +325,6 @@ EXPORT_SYMBOL_GPL(backing_file_splice_write);
int backing_file_mmap(struct file *file, struct vm_area_struct *vma,
struct backing_file_ctx *ctx)
{
- const struct cred *old_cred;
struct file *user_file = vma->vm_file;
int ret;
@@ -338,9 +336,8 @@ int backing_file_mmap(struct file *file, struct vm_area_struct *vma,
vma_set_file(vma, file);
- old_cred = override_creds(ctx->cred);
- ret = vfs_mmap(vma->vm_file, vma);
- revert_creds(old_cred);
+ scoped_with_creds(ctx->cred)
+ ret = vfs_mmap(vma->vm_file, vma);
if (ctx->accessed)
ctx->accessed(user_file);
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 8f430ff8e445..9fcfdd6b8189 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -307,7 +307,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
inode = iget_locked(sb, ino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
befs_ino = BEFS_I(inode);
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 1d41ce477df5..ce6f83234b67 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -42,7 +42,7 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino)
inode = iget_locked(sb, ino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
if ((ino < BFS_ROOT_INO) || (ino > BFS_SB(inode->i_sb)->si_lasti)) {
@@ -61,7 +61,19 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino)
off = (ino - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK;
di = (struct bfs_inode *)bh->b_data + off;
- inode->i_mode = 0x0000FFFF & le32_to_cpu(di->i_mode);
+ /*
+ * https://martin.hinner.info/fs/bfs/bfs-structure.html explains that
+ * BFS in SCO UnixWare environment used only lower 9 bits of di->i_mode
+ * value. This means that, although bfs_write_inode() saves whole
+ * inode->i_mode bits (which include S_IFMT bits and S_IS{UID,GID,VTX}
+ * bits), middle 7 bits of di->i_mode value can be garbage when these
+ * bits were not saved by bfs_write_inode().
+ * Since we can't tell whether middle 7 bits are garbage, use only
+ * lower 12 bits (i.e. tolerate S_IS{UID,GID,VTX} bits possibly being
+ * garbage) and reconstruct S_IFMT bits for Linux environment from
+ * di->i_vtype value.
+ */
+ inode->i_mode = 0x00000FFF & le32_to_cpu(di->i_mode);
if (le32_to_cpu(di->i_vtype) == BFS_VDIR) {
inode->i_mode |= S_IFDIR;
inode->i_op = &bfs_dir_inops;
@@ -71,6 +83,11 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino)
inode->i_op = &bfs_file_inops;
inode->i_fop = &bfs_file_operations;
inode->i_mapping->a_ops = &bfs_aops;
+ } else {
+ brelse(bh);
+ printf("Unknown vtype=%u %s:%08lx\n",
+ le32_to_cpu(di->i_vtype), inode->i_sb->s_id, ino);
+ goto error;
}
BFS_I(inode)->i_sblock = le32_to_cpu(di->i_sblock);
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index a839f960cd4a..d7aec5b87c2b 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -782,8 +782,6 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
return PTR_ERR(e);
if (e->flags & MISC_FMT_OPEN_FILE) {
- const struct cred *old_cred;
-
/*
* Now that we support unprivileged binfmt_misc mounts make
* sure we use the credentials that the register @file was
@@ -791,9 +789,8 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
* didn't matter much as only a privileged process could open
* the register file.
*/
- old_cred = override_creds(file->f_cred);
- f = open_exec(e->interpreter);
- revert_creds(old_cred);
+ scoped_with_creds(file->f_cred)
+ f = open_exec(e->interpreter);
if (IS_ERR(f)) {
pr_notice("register: failed to install interpreter file %s\n",
e->interpreter);
@@ -837,8 +834,10 @@ out:
inode_unlock(d_inode(root));
if (err) {
- if (f)
+ if (f) {
+ exe_file_allow_write_access(f);
filp_close(f, NULL);
+ }
kfree(e);
return err;
}
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 5322ef2ae015..08cdda47509f 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1850,12 +1850,10 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
if (!btrfs_should_reclaim(fs_info))
return;
- sb_start_write(fs_info->sb);
+ guard(super_write)(fs_info->sb);
- if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
- sb_end_write(fs_info->sb);
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
return;
- }
/*
* Long running balances can keep us blocked here for eternity, so
@@ -1863,7 +1861,6 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
*/
if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) {
btrfs_exclop_finish(fs_info);
- sb_end_write(fs_info->sb);
return;
}
@@ -1947,7 +1944,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
/*
* Get out fast, in case we're read-only or unmounting the
* filesystem. It is OK to drop block groups from the list even
- * for the read-only case. As we did sb_start_write(),
+ * for the read-only case. As we did take the super write lock,
* "mount -o remount,ro" won't happen and read-only filesystem
* means it is forced read-only due to a fatal error. So, it
* never gets back to read-write to let us reclaim again.
@@ -2030,7 +2027,6 @@ end:
list_splice_tail(&retry_list, &fs_info->reclaim_bgs);
spin_unlock(&fs_info->unused_bgs_lock);
btrfs_exclop_finish(fs_info);
- sb_end_write(fs_info->sb);
}
void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index eba188a9e3bb..aee1fd21cdd6 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -85,8 +85,8 @@ static inline u32 btrfs_calc_input_length(struct folio *folio, u64 range_end, u6
{
/* @cur must be inside the folio. */
ASSERT(folio_pos(folio) <= cur);
- ASSERT(cur < folio_end(folio));
- return min(range_end, folio_end(folio)) - cur;
+ ASSERT(cur < folio_next_pos(folio));
+ return umin(range_end, folio_next_pos(folio)) - cur;
}
int btrfs_alloc_compress_wsm(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index 7b277934f66f..a7f20f048398 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -254,10 +254,9 @@ again:
range.extent_thresh = defrag->extent_thresh;
file_ra_state_init(ra, inode->vfs_inode.i_mapping);
- sb_start_write(fs_info->sb);
- ret = btrfs_defrag_file(inode, ra, &range, defrag->transid,
- BTRFS_DEFRAG_BATCH);
- sb_end_write(fs_info->sb);
+ scoped_guard(super_write, fs_info->sb)
+ ret = btrfs_defrag_file(inode, ra, &range,
+ defrag->transid, BTRFS_DEFRAG_BATCH);
iput(&inode->vfs_inode);
if (ret < 0)
@@ -886,7 +885,7 @@ again:
}
lock_start = folio_pos(folio);
- lock_end = folio_end(folio) - 1;
+ lock_end = folio_next_pos(folio) - 1;
/* Wait for any existing ordered extent in the range */
while (1) {
struct btrfs_ordered_extent *ordered;
@@ -1178,7 +1177,8 @@ static int defrag_one_locked_target(struct btrfs_inode *inode,
if (!folio)
break;
- if (start >= folio_end(folio) || start + len <= folio_pos(folio))
+ if (start >= folio_next_pos(folio) ||
+ start + len <= folio_pos(folio))
continue;
btrfs_folio_clamp_clear_checked(fs_info, folio, start, len);
btrfs_folio_clamp_set_dirty(fs_info, folio, start, len);
@@ -1219,7 +1219,7 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
folios[i] = NULL;
goto free_folios;
}
- cur = folio_end(folios[i]);
+ cur = folio_next_pos(folios[i]);
}
for (int i = 0; i < nr_pages; i++) {
if (!folios[i])
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 41e37f7f67cc..3df7b9d7fbe8 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -2110,9 +2110,9 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
for (int i = 0; i < count; i++) {
__btrfs_kill_delayed_node(delayed_nodes[i]);
+ btrfs_delayed_node_ref_tracker_dir_print(delayed_nodes[i]);
btrfs_release_delayed_node(delayed_nodes[i],
&delayed_node_trackers[i]);
- btrfs_delayed_node_ref_tracker_dir_print(delayed_nodes[i]);
}
}
}
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 0d949edc0caf..b09d4ec8c77d 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -219,6 +219,13 @@ static inline void btrfs_delayed_node_ref_tracker_dir_print(struct btrfs_delayed
if (!btrfs_test_opt(node->root->fs_info, REF_TRACKER))
return;
+ /*
+ * Only print if there are leaked references. The caller is
+ * holding one reference, so if refs == 1 there is no leak.
+ */
+ if (refcount_read(&node->refs) == 1)
+ return;
+
ref_tracker_dir_print(&node->ref_dir.dir,
BTRFS_DELAYED_NODE_REF_TRACKER_DISPLAY_LIMIT);
}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c123a3ef154a..7361d5d890d2 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -333,7 +333,7 @@ static noinline int lock_delalloc_folios(struct inode *inode,
goto out;
}
range_start = max_t(u64, folio_pos(folio), start);
- range_len = min_t(u64, folio_end(folio), end + 1) - range_start;
+ range_len = min_t(u64, folio_next_pos(folio), end + 1) - range_start;
btrfs_folio_set_lock(fs_info, folio, range_start, range_len);
processed_end = range_start + range_len - 1;
@@ -387,7 +387,7 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
ASSERT(orig_end > orig_start);
/* The range should at least cover part of the folio */
- ASSERT(!(orig_start >= folio_end(locked_folio) ||
+ ASSERT(!(orig_start >= folio_next_pos(locked_folio) ||
orig_end <= folio_pos(locked_folio)));
again:
/* step one, find a bunch of delalloc bytes starting at start */
@@ -493,7 +493,7 @@ static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 le
struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
ASSERT(folio_pos(folio) <= start &&
- start + len <= folio_end(folio));
+ start + len <= folio_next_pos(folio));
if (uptodate && btrfs_verify_folio(folio, start, len))
btrfs_folio_set_uptodate(fs_info, folio, start, len);
@@ -973,7 +973,7 @@ static void btrfs_readahead_expand(struct readahead_control *ractl,
{
const u64 ra_pos = readahead_pos(ractl);
const u64 ra_end = ra_pos + readahead_length(ractl);
- const u64 em_end = em->start + em->ram_bytes;
+ const u64 em_end = em->start + em->len;
/* No expansion for holes and inline extents. */
if (em->disk_bytenr > EXTENT_MAP_LAST_BYTE)
@@ -1201,7 +1201,7 @@ static bool can_skip_one_ordered_range(struct btrfs_inode *inode,
* finished our folio read and unlocked the folio.
*/
if (btrfs_folio_test_dirty(fs_info, folio, cur, blocksize)) {
- u64 range_len = min(folio_end(folio),
+ u64 range_len = umin(folio_next_pos(folio),
ordered->file_offset + ordered->num_bytes) - cur;
ret = true;
@@ -1223,7 +1223,7 @@ static bool can_skip_one_ordered_range(struct btrfs_inode *inode,
* So we return true and update @next_ret to the OE/folio boundary.
*/
if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) {
- u64 range_len = min(folio_end(folio),
+ u64 range_len = umin(folio_next_pos(folio),
ordered->file_offset + ordered->num_bytes) - cur;
/*
@@ -2215,7 +2215,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
for (int i = 0; i < num_extent_folios(eb); i++) {
struct folio *folio = eb->folios[i];
u64 range_start = max_t(u64, eb->start, folio_pos(folio));
- u32 range_len = min_t(u64, folio_end(folio),
+ u32 range_len = min_t(u64, folio_next_pos(folio),
eb->start + eb->len) - range_start;
folio_lock(folio);
@@ -2228,6 +2228,14 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
wbc_account_cgroup_owner(wbc, folio, range_len);
folio_unlock(folio);
}
+ /*
+ * If the fs is already in error status, do not submit any writeback
+ * but immediately finish it.
+ */
+ if (unlikely(BTRFS_FS_ERROR(fs_info))) {
+ btrfs_bio_end_io(bbio, errno_to_blk_status(BTRFS_FS_ERROR(fs_info)));
+ return;
+ }
btrfs_submit_bbio(bbio, 0);
}
@@ -2460,10 +2468,7 @@ static int extent_write_cache_pages(struct address_space *mapping,
&BTRFS_I(inode)->runtime_flags))
wbc->tagged_writepages = 1;
- if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
- tag = PAGECACHE_TAG_TOWRITE;
- else
- tag = PAGECACHE_TAG_DIRTY;
+ tag = wbc_to_tag(wbc);
retry:
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag_pages_for_writeback(mapping, index, end);
@@ -2619,7 +2624,7 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f
continue;
}
- cur_end = min_t(u64, folio_end(folio) - 1, end);
+ cur_end = min_t(u64, folio_next_pos(folio) - 1, end);
cur_len = cur_end + 1 - cur;
ASSERT(folio_test_locked(folio));
@@ -3860,7 +3865,7 @@ int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num,
for (int i = 0; i < num_extent_folios(eb); i++) {
struct folio *folio = eb->folios[i];
u64 range_start = max_t(u64, eb->start, folio_pos(folio));
- u32 range_len = min_t(u64, folio_end(folio),
+ u32 range_len = min_t(u64, folio_next_pos(folio),
eb->start + eb->len) - range_start;
bio_add_folio_nofail(&bbio->bio, folio, range_len,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 7efd1f8a1912..e7453f992e1e 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -89,7 +89,8 @@ int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos
num_bytes = round_up(write_bytes + pos - start_pos,
fs_info->sectorsize);
ASSERT(num_bytes <= U32_MAX);
- ASSERT(folio_pos(folio) <= pos && folio_end(folio) >= pos + write_bytes);
+ ASSERT(folio_pos(folio) <= pos &&
+ folio_next_pos(folio) >= pos + write_bytes);
end_of_last_block = start_pos + num_bytes - 1;
@@ -799,7 +800,7 @@ static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64
u64 len)
{
u64 clamp_start = max_t(u64, pos, folio_pos(folio));
- u64 clamp_end = min_t(u64, pos + len, folio_end(folio));
+ u64 clamp_end = min_t(u64, pos + len, folio_next_pos(folio));
const u32 blocksize = inode_to_fs_info(inode)->sectorsize;
int ret = 0;
@@ -1254,8 +1255,8 @@ again:
* The reserved range goes beyond the current folio, shrink the reserved
* space to the folio boundary.
*/
- if (reserved_start + reserved_len > folio_end(folio)) {
- const u64 last_block = folio_end(folio);
+ if (reserved_start + reserved_len > folio_next_pos(folio)) {
+ const u64 last_block = folio_next_pos(folio);
shrink_reserved_space(inode, *data_reserved, reserved_start,
reserved_len, last_block - reserved_start,
@@ -2854,12 +2855,22 @@ static int btrfs_fallocate_update_isize(struct inode *inode,
{
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 range_start;
+ u64 range_end;
int ret;
int ret2;
if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
return 0;
+ range_start = round_down(i_size_read(inode), root->fs_info->sectorsize);
+ range_end = round_up(end, root->fs_info->sectorsize);
+
+ ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), range_start,
+ range_end - range_start);
+ if (ret)
+ return ret;
+
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans))
return PTR_ERR(trans);
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index dad0b492a663..d86541073d42 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -1106,14 +1106,15 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
* If ret is 1 (no key found), it means this is an empty block group,
* without any extents allocated from it and there's no block group
* item (key BTRFS_BLOCK_GROUP_ITEM_KEY) located in the extent tree
- * because we are using the block group tree feature, so block group
- * items are stored in the block group tree. It also means there are no
- * extents allocated for block groups with a start offset beyond this
- * block group's end offset (this is the last, highest, block group).
+ * because we are using the block group tree feature (so block group
+ * items are stored in the block group tree) or this is a new block
+ * group created in the current transaction and its block group item
+ * was not yet inserted in the extent tree (that happens in
+ * btrfs_create_pending_block_groups() -> insert_block_group_item()).
+ * It also means there are no extents allocated for block groups with a
+ * start offset beyond this block group's end offset (this is the last,
+ * highest, block group).
*/
- if (!btrfs_fs_compat_ro(trans->fs_info, BLOCK_GROUP_TREE))
- ASSERT(ret == 0);
-
start = block_group->start;
end = block_group->start + block_group->length;
while (ret == 0) {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3b1b3a0553ee..9c6ca87b3d56 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9,6 +9,7 @@
#include <linux/blk-cgroup.h>
#include <linux/file.h>
#include <linux/fs.h>
+#include <linux/fs_struct.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
@@ -177,8 +178,10 @@ static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
return ret;
}
ret = paths_from_inode(inum, ipath);
- if (ret < 0)
+ if (ret < 0) {
+ btrfs_put_root(local_root);
goto err;
+ }
/*
* We deliberately ignore the bit ipath might have been too small to
@@ -409,7 +412,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
continue;
}
- index = folio_end(folio) >> PAGE_SHIFT;
+ index = folio_next_index(folio);
/*
* Here we just clear all Ordered bits for every page in the
* range, then btrfs_mark_ordered_io_finished() will handle
@@ -2336,7 +2339,8 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol
* The range must cover part of the @locked_folio, or a return of 1
* can confuse the caller.
*/
- ASSERT(!(end <= folio_pos(locked_folio) || start >= folio_end(locked_folio)));
+ ASSERT(!(end <= folio_pos(locked_folio) ||
+ start >= folio_next_pos(locked_folio)));
if (should_nocow(inode, start, end)) {
ret = run_delalloc_nocow(inode, locked_folio, start, end);
@@ -2743,7 +2747,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
struct btrfs_inode *inode = fixup->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
u64 page_start = folio_pos(folio);
- u64 page_end = folio_end(folio) - 1;
+ u64 page_end = folio_next_pos(folio) - 1;
int ret = 0;
bool free_delalloc_space = true;
@@ -3884,7 +3888,7 @@ static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc)
ASSERT(ret != -ENOMEM);
return ret;
} else if (existing) {
- WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING)));
+ WARN_ON(!(inode_state_read_once(&existing->vfs_inode) & (I_WILL_FREE | I_FREEING)));
}
return 0;
@@ -4855,7 +4859,7 @@ again:
*/
zero_start = max_t(u64, folio_pos(folio), start);
- zero_end = folio_end(folio);
+ zero_end = folio_next_pos(folio);
folio_zero_range(folio, zero_start - folio_pos(folio),
zero_end - zero_start);
@@ -5038,7 +5042,7 @@ again:
* not reach disk, it still affects our page caches.
*/
zero_start = max_t(u64, folio_pos(folio), start);
- zero_end = min_t(u64, folio_end(folio) - 1, end);
+ zero_end = min_t(u64, folio_next_pos(folio) - 1, end);
} else {
zero_start = max_t(u64, block_start, start);
zero_end = min_t(u64, block_end, end);
@@ -5361,7 +5365,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct rb_node *node;
- ASSERT(inode->i_state & I_FREEING);
+ ASSERT(inode_state_read_once(inode) & I_FREEING);
truncate_inode_pages_final(&inode->i_data);
btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
@@ -5799,7 +5803,7 @@ struct btrfs_inode *btrfs_iget_path(u64 ino, struct btrfs_root *root,
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->vfs_inode.i_state & I_NEW))
+ if (!(inode_state_read_once(&inode->vfs_inode) & I_NEW))
return inode;
ret = btrfs_read_locked_inode(inode, path);
@@ -5823,7 +5827,7 @@ struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root)
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->vfs_inode.i_state & I_NEW))
+ if (!(inode_state_read_once(&inode->vfs_inode) & I_NEW))
return inode;
path = btrfs_alloc_path();
@@ -5837,6 +5841,8 @@ struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root)
if (ret)
return ERR_PTR(ret);
+ if (S_ISDIR(inode->vfs_inode.i_mode))
+ inode->vfs_inode.i_opflags |= IOP_FASTPERM_MAY_EXEC;
unlock_new_inode(&inode->vfs_inode);
return inode;
}
@@ -6289,8 +6295,8 @@ static int btrfs_dirty_inode(struct btrfs_inode *inode)
}
/*
- * This is a copy of file_update_time. We need this so we can return error on
- * ENOSPC for updating the inode in the case of file write and mmap writes.
+ * We need our own ->update_time so that we can return error on ENOSPC for
+ * updating the inode in the case of file write and mmap writes.
*/
static int btrfs_update_time(struct inode *inode, int flags)
{
@@ -6788,8 +6794,11 @@ static int btrfs_create_common(struct inode *dir, struct dentry *dentry,
}
ret = btrfs_create_new_inode(trans, &new_inode_args);
- if (!ret)
+ if (!ret) {
+ if (S_ISDIR(inode->i_mode))
+ inode->i_opflags |= IOP_FASTPERM_MAY_EXEC;
d_instantiate_new(dentry, inode);
+ }
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
@@ -6873,7 +6882,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
BTRFS_I(inode)->dir_index = 0ULL;
inode_inc_iversion(inode);
inode_set_ctime_current(inode);
- set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
&fname.disk_name, 1, index);
@@ -7480,7 +7488,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
u64 page_start = folio_pos(folio);
u64 page_end = page_start + folio_size(folio) - 1;
u64 cur;
- int inode_evicting = inode->vfs_inode.i_state & I_FREEING;
+ int inode_evicting = inode_state_read_once(&inode->vfs_inode) & I_FREEING;
/*
* We have folio locked so no new ordered extent can be created on this
@@ -8709,15 +8717,13 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode
* some fairly slow code that needs optimization. This walks the list
* of all the inodes with pending delalloc and forces them to disk.
*/
-static int start_delalloc_inodes(struct btrfs_root *root,
- struct writeback_control *wbc, bool snapshot,
- bool in_reclaim_context)
+static int start_delalloc_inodes(struct btrfs_root *root, long *nr_to_write,
+ bool snapshot, bool in_reclaim_context)
{
struct btrfs_delalloc_work *work, *next;
LIST_HEAD(works);
LIST_HEAD(splice);
int ret = 0;
- bool full_flush = wbc->nr_to_write == LONG_MAX;
mutex_lock(&root->delalloc_mutex);
spin_lock(&root->delalloc_lock);
@@ -8743,10 +8749,10 @@ static int start_delalloc_inodes(struct btrfs_root *root,
if (snapshot)
set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, &inode->runtime_flags);
- if (full_flush) {
- work = btrfs_alloc_delalloc_work(&inode->vfs_inode);
+ if (nr_to_write == NULL) {
+ work = btrfs_alloc_delalloc_work(tmp_inode);
if (!work) {
- iput(&inode->vfs_inode);
+ iput(tmp_inode);
ret = -ENOMEM;
goto out;
}
@@ -8754,9 +8760,11 @@ static int start_delalloc_inodes(struct btrfs_root *root,
btrfs_queue_work(root->fs_info->flush_workers,
&work->work);
} else {
- ret = filemap_fdatawrite_wbc(inode->vfs_inode.i_mapping, wbc);
+ ret = filemap_flush_nr(tmp_inode->i_mapping,
+ nr_to_write);
btrfs_add_delayed_iput(inode);
- if (ret || wbc->nr_to_write <= 0)
+
+ if (ret || *nr_to_write <= 0)
goto out;
}
cond_resched();
@@ -8782,29 +8790,17 @@ out:
int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context)
{
- struct writeback_control wbc = {
- .nr_to_write = LONG_MAX,
- .sync_mode = WB_SYNC_NONE,
- .range_start = 0,
- .range_end = LLONG_MAX,
- };
struct btrfs_fs_info *fs_info = root->fs_info;
if (BTRFS_FS_ERROR(fs_info))
return -EROFS;
-
- return start_delalloc_inodes(root, &wbc, true, in_reclaim_context);
+ return start_delalloc_inodes(root, NULL, true, in_reclaim_context);
}
int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
bool in_reclaim_context)
{
- struct writeback_control wbc = {
- .nr_to_write = nr,
- .sync_mode = WB_SYNC_NONE,
- .range_start = 0,
- .range_end = LLONG_MAX,
- };
+ long *nr_to_write = nr == LONG_MAX ? NULL : &nr;
struct btrfs_root *root;
LIST_HEAD(splice);
int ret;
@@ -8816,13 +8812,6 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
spin_lock(&fs_info->delalloc_root_lock);
list_splice_init(&fs_info->delalloc_roots, &splice);
while (!list_empty(&splice)) {
- /*
- * Reset nr_to_write here so we know that we're doing a full
- * flush.
- */
- if (nr == LONG_MAX)
- wbc.nr_to_write = LONG_MAX;
-
root = list_first_entry(&splice, struct btrfs_root,
delalloc_root);
root = btrfs_grab_root(root);
@@ -8831,9 +8820,10 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
&fs_info->delalloc_roots);
spin_unlock(&fs_info->delalloc_root_lock);
- ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context);
+ ret = start_delalloc_inodes(root, nr_to_write, false,
+ in_reclaim_context);
btrfs_put_root(root);
- if (ret < 0 || wbc.nr_to_write <= 0)
+ if (ret < 0 || nr <= 0)
goto out;
spin_lock(&fs_info->delalloc_root_lock);
}
@@ -9169,6 +9159,11 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
min_size, actual_len, alloc_hint, trans);
}
+/*
+ * NOTE: in case you are adding MAY_EXEC check for directories:
+ * we are marking them with IOP_FASTPERM_MAY_EXEC, allowing path lookup to
+ * elide calls here.
+ */
static int btrfs_permission(struct mnt_idmap *idmap,
struct inode *inode, int mask)
{
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 185bef0df1c2..8cb7d5a462ef 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3740,7 +3740,7 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL);
if (!prealloc) {
ret = -ENOMEM;
- goto drop_write;
+ goto out;
}
}
diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h
index 60f9b000d644..17b71e1285e5 100644
--- a/fs/btrfs/misc.h
+++ b/fs/btrfs/misc.h
@@ -209,9 +209,4 @@ static inline bool bitmap_test_range_all_zero(const unsigned long *addr,
return (found_set == start + nbits);
}
-static inline u64 folio_end(struct folio *folio)
-{
- return folio_pos(folio) + folio_size(folio);
-}
-
#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 2829f20d7bb5..7fedebbee558 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -359,7 +359,7 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
if (folio) {
ASSERT(folio->mapping);
ASSERT(folio_pos(folio) <= file_offset);
- ASSERT(file_offset + len <= folio_end(folio));
+ ASSERT(file_offset + len <= folio_next_pos(folio));
/*
* Ordered flag indicates whether we still have
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 1175b8192cd7..31ad8580322a 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1539,8 +1539,10 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst
ASSERT(prealloc);
/* Check the level of src and dst first */
- if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
+ if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst)) {
+ kfree(prealloc);
return -EINVAL;
+ }
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (!fs_info->quota_root) {
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index de4cb0f3fbd0..e9224145d754 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -982,7 +982,7 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
extent_root = btrfs_extent_root(fs_info, 0);
/* If the extent tree is damaged we cannot ignore it (IGNOREBADROOTS). */
- if (IS_ERR(extent_root)) {
+ if (!extent_root) {
btrfs_warn(fs_info, "ref-verify: extent tree not available, disabling");
btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
return 0;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 8dd8de6b9fb8..0765e06d00b8 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3780,6 +3780,7 @@ out:
/*
* Mark start of chunk relocation that is cancellable. Check if the cancellation
* has been requested meanwhile and don't start in that case.
+ * NOTE: if this returns an error, reloc_chunk_end() must not be called.
*
* Return:
* 0 success
@@ -3796,10 +3797,8 @@ static int reloc_chunk_start(struct btrfs_fs_info *fs_info)
if (atomic_read(&fs_info->reloc_cancel_req) > 0) {
btrfs_info(fs_info, "chunk relocation canceled on start");
- /*
- * On cancel, clear all requests but let the caller mark
- * the end after cleanup operations.
- */
+ /* On cancel, clear all requests. */
+ clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags);
atomic_set(&fs_info->reloc_cancel_req, 0);
return -ECANCELED;
}
@@ -3808,9 +3807,11 @@ static int reloc_chunk_start(struct btrfs_fs_info *fs_info)
/*
* Mark end of chunk relocation that is cancellable and wake any waiters.
+ * NOTE: call only if a previous call to reloc_chunk_start() succeeded.
*/
static void reloc_chunk_end(struct btrfs_fs_info *fs_info)
{
+ ASSERT(test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags));
/* Requested after start, clear bit first so any waiters can continue */
if (atomic_read(&fs_info->reloc_cancel_req) > 0)
btrfs_info(fs_info, "chunk relocation canceled during operation");
@@ -4023,9 +4024,9 @@ out:
if (err && rw)
btrfs_dec_block_group_ro(rc->block_group);
iput(rc->data_inode);
+ reloc_chunk_end(fs_info);
out_put_bg:
btrfs_put_block_group(bg);
- reloc_chunk_end(fs_info);
free_reloc_control(rc);
return err;
}
@@ -4208,8 +4209,8 @@ out_clean:
ret = ret2;
out_unset:
unset_reloc_control(rc);
-out_end:
reloc_chunk_end(fs_info);
+out_end:
free_reloc_control(rc);
out:
free_reloc_roots(&reloc_roots);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 4691d0bdb2e8..ba20d9286a34 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -694,7 +694,7 @@ static void *scrub_stripe_get_kaddr(struct scrub_stripe *stripe, int sector_nr)
/* stripe->folios[] is allocated by us and no highmem is allowed. */
ASSERT(folio);
- ASSERT(!folio_test_partial_kmap(folio));
+ ASSERT(!folio_test_highmem(folio));
return folio_address(folio) + offset_in_folio(folio, offset);
}
@@ -707,7 +707,7 @@ static phys_addr_t scrub_stripe_get_paddr(struct scrub_stripe *stripe, int secto
/* stripe->folios[] is allocated by us and no highmem is allowed. */
ASSERT(folio);
- ASSERT(!folio_test_partial_kmap(folio));
+ ASSERT(!folio_test_highmem(folio));
/* And the range must be contained inside the folio. */
ASSERT(offset_in_folio(folio, offset) + fs_info->sectorsize <= folio_size(folio));
return page_to_phys(folio_page(folio, 0)) + offset_in_folio(folio, offset);
@@ -2203,6 +2203,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, full_stripe_start,
&length, &bioc, NULL, NULL);
if (ret < 0) {
+ bio_put(bio);
btrfs_put_bioc(bioc);
btrfs_bio_counter_dec(fs_info);
goto out;
@@ -2212,6 +2213,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
btrfs_put_bioc(bioc);
if (!rbio) {
ret = -ENOMEM;
+ bio_put(bio);
btrfs_bio_counter_dec(fs_info);
goto out;
}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 9230e5066fc6..96a030d28e09 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -178,7 +178,6 @@ struct send_ctx {
u64 cur_inode_rdev;
u64 cur_inode_last_extent;
u64 cur_inode_next_write_offset;
- struct fs_path cur_inode_path;
bool cur_inode_new;
bool cur_inode_new_gen;
bool cur_inode_deleted;
@@ -305,6 +304,9 @@ struct send_ctx {
struct btrfs_lru_cache dir_created_cache;
struct btrfs_lru_cache dir_utimes_cache;
+
+ /* Must be last as it ends in a flexible-array member. */
+ struct fs_path cur_inode_path;
};
struct pending_dir_move {
@@ -4100,6 +4102,48 @@ out:
return ret;
}
+static int rbtree_check_dir_ref_comp(const void *k, const struct rb_node *node)
+{
+ const struct recorded_ref *data = k;
+ const struct recorded_ref *ref = rb_entry(node, struct recorded_ref, node);
+
+ if (data->dir > ref->dir)
+ return 1;
+ if (data->dir < ref->dir)
+ return -1;
+ if (data->dir_gen > ref->dir_gen)
+ return 1;
+ if (data->dir_gen < ref->dir_gen)
+ return -1;
+ return 0;
+}
+
+static bool rbtree_check_dir_ref_less(struct rb_node *node, const struct rb_node *parent)
+{
+ const struct recorded_ref *entry = rb_entry(node, struct recorded_ref, node);
+
+ return rbtree_check_dir_ref_comp(entry, parent) < 0;
+}
+
+static int record_check_dir_ref_in_tree(struct rb_root *root,
+ struct recorded_ref *ref, struct list_head *list)
+{
+ struct recorded_ref *tmp_ref;
+ int ret;
+
+ if (rb_find(ref, root, rbtree_check_dir_ref_comp))
+ return 0;
+
+ ret = dup_ref(ref, list);
+ if (ret < 0)
+ return ret;
+
+ tmp_ref = list_last_entry(list, struct recorded_ref, list);
+ rb_add(&tmp_ref->node, root, rbtree_check_dir_ref_less);
+ tmp_ref->root = root;
+ return 0;
+}
+
static int rename_current_inode(struct send_ctx *sctx,
struct fs_path *current_path,
struct fs_path *new_path)
@@ -4127,11 +4171,11 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
struct recorded_ref *cur;
struct recorded_ref *cur2;
LIST_HEAD(check_dirs);
+ struct rb_root rbtree_check_dirs = RB_ROOT;
struct fs_path *valid_path = NULL;
u64 ow_inode = 0;
u64 ow_gen;
u64 ow_mode;
- u64 last_dir_ino_rm = 0;
bool did_overwrite = false;
bool is_orphan = false;
bool can_rename = true;
@@ -4435,7 +4479,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
goto out;
}
}
- ret = dup_ref(cur, &check_dirs);
+ ret = record_check_dir_ref_in_tree(&rbtree_check_dirs, cur, &check_dirs);
if (ret < 0)
goto out;
}
@@ -4463,7 +4507,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
}
list_for_each_entry(cur, &sctx->deleted_refs, list) {
- ret = dup_ref(cur, &check_dirs);
+ ret = record_check_dir_ref_in_tree(&rbtree_check_dirs, cur, &check_dirs);
if (ret < 0)
goto out;
}
@@ -4473,7 +4517,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
* We have a moved dir. Add the old parent to check_dirs
*/
cur = list_first_entry(&sctx->deleted_refs, struct recorded_ref, list);
- ret = dup_ref(cur, &check_dirs);
+ ret = record_check_dir_ref_in_tree(&rbtree_check_dirs, cur, &check_dirs);
if (ret < 0)
goto out;
} else if (!S_ISDIR(sctx->cur_inode_mode)) {
@@ -4507,7 +4551,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
if (is_current_inode_path(sctx, cur->full_path))
fs_path_reset(&sctx->cur_inode_path);
}
- ret = dup_ref(cur, &check_dirs);
+ ret = record_check_dir_ref_in_tree(&rbtree_check_dirs, cur, &check_dirs);
if (ret < 0)
goto out;
}
@@ -4550,8 +4594,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
ret = cache_dir_utimes(sctx, cur->dir, cur->dir_gen);
if (ret < 0)
goto out;
- } else if (ret == inode_state_did_delete &&
- cur->dir != last_dir_ino_rm) {
+ } else if (ret == inode_state_did_delete) {
ret = can_rmdir(sctx, cur->dir, cur->dir_gen);
if (ret < 0)
goto out;
@@ -4563,7 +4606,6 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
ret = send_rmdir(sctx, valid_path);
if (ret < 0)
goto out;
- last_dir_ino_rm = cur->dir;
}
}
}
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index 5ca8d4db6722..a7ba868e9372 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -186,7 +186,8 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info,
* unmapped page like dummy extent buffer pages.
*/
if (folio->mapping)
- ASSERT(folio_pos(folio) <= start && start + len <= folio_end(folio),
+ ASSERT(folio_pos(folio) <= start &&
+ start + len <= folio_next_pos(folio),
"start=%llu len=%u folio_pos=%llu folio_size=%zu",
start, len, folio_pos(folio), folio_size(folio));
}
@@ -217,7 +218,7 @@ static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len)
if (folio_pos(folio) >= orig_start + orig_len)
*len = 0;
else
- *len = min_t(u64, folio_end(folio), orig_start + orig_len) - *start;
+ *len = min_t(u64, folio_next_pos(folio), orig_start + orig_len) - *start;
}
static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index d6e496436539..430e7419349c 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1900,8 +1900,6 @@ static int btrfs_get_tree_super(struct fs_context *fc)
return PTR_ERR(sb);
}
- set_device_specific_options(fs_info);
-
if (sb->s_root) {
/*
* Not the first mount of the fs thus got an existing super block.
@@ -1946,6 +1944,7 @@ static int btrfs_get_tree_super(struct fs_context *fc)
deactivate_locked_super(sb);
return -EACCES;
}
+ set_device_specific_options(fs_info);
bdev = fs_devices->latest_dev->bdev;
snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev);
shrinker_debugfs_rename(sb->s_shrink, "sb-btrfs:%s", sb->s_id);
@@ -2069,7 +2068,13 @@ static int btrfs_get_tree_subvol(struct fs_context *fc)
fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
if (!fs_info->super_copy || !fs_info->super_for_commit) {
- btrfs_free_fs_info(fs_info);
+ /*
+ * Dont call btrfs_free_fs_info() to free it as it's still
+ * initialized partially.
+ */
+ kfree(fs_info->super_copy);
+ kfree(fs_info->super_for_commit);
+ kvfree(fs_info);
return -ENOMEM;
}
btrfs_init_fs_info(fs_info);
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index ca30b15ea452..c10b4c242acf 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -1797,7 +1797,7 @@ static int check_inode_extref(struct extent_buffer *leaf,
struct btrfs_inode_extref *extref = (struct btrfs_inode_extref *)ptr;
u16 namelen;
- if (unlikely(ptr + sizeof(*extref)) > end) {
+ if (unlikely(ptr + sizeof(*extref) > end)) {
inode_ref_err(leaf, slot,
"inode extref overflow, ptr %lu end %lu inode_extref size %zu",
ptr, end, sizeof(*extref));
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 621e0df097e3..30f3c3b849c1 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -7122,7 +7122,7 @@ log_extents:
* a power failure unless the log was synced as part of an fsync
* against any other unrelated inode.
*/
- if (inode_only != LOG_INODE_EXISTS)
+ if (!ctx->logging_new_name && inode_only != LOG_INODE_EXISTS)
inode->last_log_commit = inode->last_sub_trans;
spin_unlock(&inode->lock);
@@ -7910,6 +7910,9 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
bool log_pinned = false;
int ret;
+ /* The inode has a new name (ref/extref), so make sure we log it. */
+ set_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
+
btrfs_init_log_ctx(&ctx, inode);
ctx.logging_new_name = true;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 2bec544d8ba3..cc8aa4a04348 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2002,14 +2002,11 @@ out:
static void update_dev_time(const char *device_path)
{
struct path path;
- int ret;
-
- ret = kern_path(device_path, LOOKUP_FOLLOW, &path);
- if (ret)
- return;
- inode_update_time(d_inode(path.dentry), S_MTIME | S_CTIME | S_VERSION);
- path_put(&path);
+ if (!kern_path(device_path, LOOKUP_FOLLOW, &path)) {
+ vfs_utimes(&path, NULL);
+ path_put(&path);
+ }
}
static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans,
@@ -4660,12 +4657,12 @@ static int balance_kthread(void *data)
struct btrfs_fs_info *fs_info = data;
int ret = 0;
- sb_start_write(fs_info->sb);
+ guard(super_write)(fs_info->sb);
+
mutex_lock(&fs_info->balance_mutex);
if (fs_info->balance_ctl)
ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
mutex_unlock(&fs_info->balance_mutex);
- sb_end_write(fs_info->sb);
return ret;
}
@@ -8177,12 +8174,12 @@ static int relocating_repair_kthread(void *data)
target = cache->start;
btrfs_put_block_group(cache);
- sb_start_write(fs_info->sb);
+ guard(super_write)(fs_info->sb);
+
if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
btrfs_info(fs_info,
"zoned: skip relocating block group %llu to repair: EBUSY",
target);
- sb_end_write(fs_info->sb);
return -EBUSY;
}
@@ -8210,7 +8207,6 @@ out:
btrfs_put_block_group(cache);
mutex_unlock(&fs_info->reclaim_bgs_lock);
btrfs_exclop_finish(fs_info);
- sb_end_write(fs_info->sb);
return ret;
}
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index e00036672f33..d1db7fa1fe58 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -1317,6 +1317,7 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
if (!btrfs_dev_is_sequential(device, info->physical)) {
up_read(&dev_replace->rwsem);
info->alloc_offset = WP_CONVENTIONAL;
+ info->capacity = device->zone_info->zone_size;
return 0;
}
@@ -1522,6 +1523,8 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
u64 last_alloc)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
+ u64 stripe_nr = 0, stripe_offset = 0;
+ u32 stripe_index = 0;
if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
@@ -1529,28 +1532,26 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
return -EINVAL;
}
+ if (last_alloc) {
+ u32 factor = map->num_stripes;
+
+ stripe_nr = last_alloc >> BTRFS_STRIPE_LEN_SHIFT;
+ stripe_offset = last_alloc & BTRFS_STRIPE_LEN_MASK;
+ stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
+ }
+
for (int i = 0; i < map->num_stripes; i++) {
if (zone_info[i].alloc_offset == WP_MISSING_DEV)
continue;
if (zone_info[i].alloc_offset == WP_CONVENTIONAL) {
- u64 stripe_nr, full_stripe_nr;
- u64 stripe_offset;
- int stripe_index;
- stripe_nr = div64_u64(last_alloc, map->stripe_size);
- stripe_offset = stripe_nr * map->stripe_size;
- full_stripe_nr = div_u64(stripe_nr, map->num_stripes);
- div_u64_rem(stripe_nr, map->num_stripes, &stripe_index);
-
- zone_info[i].alloc_offset =
- full_stripe_nr * map->stripe_size;
+ zone_info[i].alloc_offset = btrfs_stripe_nr_to_offset(stripe_nr);
if (stripe_index > i)
- zone_info[i].alloc_offset += map->stripe_size;
+ zone_info[i].alloc_offset += BTRFS_STRIPE_LEN;
else if (stripe_index == i)
- zone_info[i].alloc_offset +=
- (last_alloc - stripe_offset);
+ zone_info[i].alloc_offset += stripe_offset;
}
if (test_bit(0, active) != test_bit(i, active)) {
@@ -1574,6 +1575,8 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
u64 last_alloc)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
+ u64 stripe_nr = 0, stripe_offset = 0;
+ u32 stripe_index = 0;
if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
@@ -1581,6 +1584,14 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
return -EINVAL;
}
+ if (last_alloc) {
+ u32 factor = map->num_stripes / map->sub_stripes;
+
+ stripe_nr = last_alloc >> BTRFS_STRIPE_LEN_SHIFT;
+ stripe_offset = last_alloc & BTRFS_STRIPE_LEN_MASK;
+ stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
+ }
+
for (int i = 0; i < map->num_stripes; i++) {
if (zone_info[i].alloc_offset == WP_MISSING_DEV)
continue;
@@ -1594,26 +1605,12 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
}
if (zone_info[i].alloc_offset == WP_CONVENTIONAL) {
- u64 stripe_nr, full_stripe_nr;
- u64 stripe_offset;
- int stripe_index;
-
- stripe_nr = div64_u64(last_alloc, map->stripe_size);
- stripe_offset = stripe_nr * map->stripe_size;
- full_stripe_nr = div_u64(stripe_nr,
- map->num_stripes / map->sub_stripes);
- div_u64_rem(stripe_nr,
- (map->num_stripes / map->sub_stripes),
- &stripe_index);
-
- zone_info[i].alloc_offset =
- full_stripe_nr * map->stripe_size;
+ zone_info[i].alloc_offset = btrfs_stripe_nr_to_offset(stripe_nr);
if (stripe_index > (i / map->sub_stripes))
- zone_info[i].alloc_offset += map->stripe_size;
+ zone_info[i].alloc_offset += BTRFS_STRIPE_LEN;
else if (stripe_index == (i / map->sub_stripes))
- zone_info[i].alloc_offset +=
- (last_alloc - stripe_offset);
+ zone_info[i].alloc_offset += stripe_offset;
}
if ((i % map->sub_stripes) == 0) {
@@ -1683,8 +1680,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
set_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags);
if (num_conventional > 0) {
- /* Zone capacity is always zone size in emulation */
- cache->zone_capacity = cache->length;
ret = calculate_alloc_pointer(cache, &last_alloc, new);
if (ret) {
btrfs_err(fs_info,
@@ -1693,6 +1688,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
goto out;
} else if (map->num_stripes == num_conventional) {
cache->alloc_offset = last_alloc;
+ cache->zone_capacity = cache->length;
set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags);
goto out;
}
@@ -1753,7 +1749,7 @@ out:
!fs_info->stripe_root) {
btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
btrfs_bg_type_to_raid_name(map->type));
- return -EINVAL;
+ ret = -EINVAL;
}
if (unlikely(cache->alloc_offset > cache->zone_capacity)) {
diff --git a/fs/buffer.c b/fs/buffer.c
index 6a8752f7bbed..838c0c571022 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -611,9 +611,9 @@ int generic_buffers_fsync_noflush(struct file *file, loff_t start, loff_t end,
return err;
ret = sync_mapping_buffers(inode->i_mapping);
- if (!(inode->i_state & I_DIRTY_ALL))
+ if (!(inode_state_read_once(inode) & I_DIRTY_ALL))
goto out;
- if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
+ if (datasync && !(inode_state_read_once(inode) & I_DIRTY_DATASYNC))
goto out;
err = sync_inode_metadata(inode, 1);
@@ -2732,7 +2732,7 @@ int block_write_full_folio(struct folio *folio, struct writeback_control *wbc,
loff_t i_size = i_size_read(inode);
/* Is the folio fully inside i_size? */
- if (folio_pos(folio) + folio_size(folio) <= i_size)
+ if (folio_next_pos(folio) <= i_size)
return __block_write_full_folio(inode, folio, get_block, wbc);
/* Is the folio fully outside i_size? (truncate in progress) */
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 322ed268f14a..63b75d214210 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1045,11 +1045,7 @@ void ceph_init_writeback_ctl(struct address_space *mapping,
ceph_wbc->index = ceph_wbc->start_index;
ceph_wbc->end = -1;
- if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) {
- ceph_wbc->tag = PAGECACHE_TAG_TOWRITE;
- } else {
- ceph_wbc->tag = PAGECACHE_TAG_DIRTY;
- }
+ ceph_wbc->tag = wbc_to_tag(wbc);
ceph_wbc->op_idx = -1;
ceph_wbc->num_ops = 0;
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 930fbd54d2c8..f678bab189d8 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -26,7 +26,7 @@ void ceph_fscache_register_inode_cookie(struct inode *inode)
return;
/* Only new inodes! */
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return;
WARN_ON_ONCE(ci->netfs.cache);
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
index 7026e794813c..928746b92512 100644
--- a/fs/ceph/crypto.c
+++ b/fs/ceph/crypto.c
@@ -329,7 +329,7 @@ int ceph_encode_encrypted_dname(struct inode *parent, char *buf, int elen)
out:
kfree(cryptbuf);
if (dir != parent) {
- if ((dir->i_state & I_NEW))
+ if ((inode_state_read_once(dir) & I_NEW))
discard_new_inode(dir);
else
iput(dir);
@@ -438,7 +438,7 @@ out:
fscrypt_fname_free_buffer(&_tname);
out_inode:
if (dir != fname->dir) {
- if ((dir->i_state & I_NEW))
+ if ((inode_state_read_once(dir) & I_NEW))
discard_new_inode(dir);
else
iput(dir);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 99b30f784ee2..983390069f73 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -740,7 +740,7 @@ static int ceph_finish_async_create(struct inode *dir, struct inode *inode,
vino.ino, ceph_ino(dir), dentry->d_name.name);
ceph_dir_clear_ordered(dir);
ceph_init_inode_acls(inode, as_ctx);
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
/*
* If it's not I_NEW, then someone created this before
* we got here. Assume the server is aware of it at
@@ -901,7 +901,7 @@ retry:
new_inode = NULL;
goto out_req;
}
- WARN_ON_ONCE(!(new_inode->i_state & I_NEW));
+ WARN_ON_ONCE(!(inode_state_read_once(new_inode) & I_NEW));
spin_lock(&dentry->d_lock);
di->flags |= CEPH_DENTRY_ASYNC_CREATE;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index a6e260d9e420..37d3a2477c17 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -132,7 +132,7 @@ struct inode *ceph_new_inode(struct inode *dir, struct dentry *dentry,
goto out_err;
}
- inode->i_state = 0;
+ inode_state_assign_raw(inode, 0);
inode->i_mode = *mode;
err = ceph_security_init_secctx(dentry, *mode, as_ctx);
@@ -201,7 +201,7 @@ struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino,
doutc(cl, "on %llx=%llx.%llx got %p new %d\n",
ceph_present_inode(inode), ceph_vinop(inode), inode,
- !!(inode->i_state & I_NEW));
+ !!(inode_state_read_once(inode) & I_NEW));
return inode;
}
@@ -228,7 +228,7 @@ struct inode *ceph_get_snapdir(struct inode *parent)
goto err;
}
- if (!(inode->i_state & I_NEW) && !S_ISDIR(inode->i_mode)) {
+ if (!(inode_state_read_once(inode) & I_NEW) && !S_ISDIR(inode->i_mode)) {
pr_warn_once_client(cl, "bad snapdir inode type (mode=0%o)\n",
inode->i_mode);
goto err;
@@ -261,7 +261,7 @@ struct inode *ceph_get_snapdir(struct inode *parent)
}
}
#endif
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
inode->i_op = &ceph_snapdir_iops;
inode->i_fop = &ceph_snapdir_fops;
ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
@@ -270,7 +270,7 @@ struct inode *ceph_get_snapdir(struct inode *parent)
return inode;
err:
- if ((inode->i_state & I_NEW))
+ if ((inode_state_read_once(inode) & I_NEW))
discard_new_inode(inode);
else
iput(inode);
@@ -744,7 +744,7 @@ void ceph_evict_inode(struct inode *inode)
netfs_wait_for_outstanding_io(inode);
truncate_inode_pages_final(&inode->i_data);
- if (inode->i_state & I_PINNING_NETFS_WB)
+ if (inode_state_read_once(inode) & I_PINNING_NETFS_WB)
ceph_fscache_unuse_cookie(inode, true);
clear_inode(inode);
@@ -1013,7 +1013,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
le64_to_cpu(info->version), ci->i_version);
/* Once I_NEW is cleared, we can't change type or dev numbers */
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
inode->i_mode = mode;
} else {
if (inode_wrong_type(inode, mode)) {
@@ -1090,7 +1090,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
#ifdef CONFIG_FS_ENCRYPTION
if (iinfo->fscrypt_auth_len &&
- ((inode->i_state & I_NEW) || (ci->fscrypt_auth_len == 0))) {
+ ((inode_state_read_once(inode) & I_NEW) || (ci->fscrypt_auth_len == 0))) {
kfree(ci->fscrypt_auth);
ci->fscrypt_auth_len = iinfo->fscrypt_auth_len;
ci->fscrypt_auth = iinfo->fscrypt_auth;
@@ -1692,13 +1692,13 @@ retry_lookup:
pr_err_client(cl, "badness %p %llx.%llx\n", in,
ceph_vinop(in));
req->r_target_inode = NULL;
- if (in->i_state & I_NEW)
+ if (inode_state_read_once(in) & I_NEW)
discard_new_inode(in);
else
iput(in);
goto done;
}
- if (in->i_state & I_NEW)
+ if (inode_state_read_once(in) & I_NEW)
unlock_new_inode(in);
}
@@ -1898,11 +1898,11 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
pr_err_client(cl, "inode badness on %p got %d\n", in,
rc);
err = rc;
- if (in->i_state & I_NEW) {
+ if (inode_state_read_once(in) & I_NEW) {
ihold(in);
discard_new_inode(in);
}
- } else if (in->i_state & I_NEW) {
+ } else if (inode_state_read_once(in) & I_NEW) {
unlock_new_inode(in);
}
@@ -2114,7 +2114,7 @@ retry_lookup:
pr_err_client(cl, "badness on %p %llx.%llx\n", in,
ceph_vinop(in));
if (d_really_is_negative(dn)) {
- if (in->i_state & I_NEW) {
+ if (inode_state_read_once(in) & I_NEW) {
ihold(in);
discard_new_inode(in);
}
@@ -2124,7 +2124,7 @@ retry_lookup:
err = ret;
goto next_item;
}
- if (in->i_state & I_NEW)
+ if (inode_state_read_once(in) & I_NEW)
unlock_new_inode(in);
if (d_really_is_negative(dn)) {
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index ad0cf177e75a..f6bf24b5c683 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -1149,7 +1149,7 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
const char *path = fsc->mount_options->server_path ?
fsc->mount_options->server_path + 1 : "";
- err = __ceph_open_session(fsc->client, started);
+ err = __ceph_open_session(fsc->client);
if (err < 0)
goto out;
diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c
index 62a3d2565c26..70bb0579b40c 100644
--- a/fs/coda/cnode.c
+++ b/fs/coda/cnode.c
@@ -70,7 +70,7 @@ retry:
if (!inode)
return ERR_PTR(-ENOMEM);
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
cii = ITOC(inode);
/* we still need to set i_ino for things like stat(2) */
inode->i_ino = hash;
@@ -148,7 +148,7 @@ struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb)
/* we should never see newly created inodes because we intentionally
* fail in the initialization callback */
- BUG_ON(inode->i_state & I_NEW);
+ BUG_ON(inode_state_read_once(inode) & I_NEW);
return inode;
}
diff --git a/fs/coredump.c b/fs/coredump.c
index b5fc06a092a4..fe4099e0530b 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -1036,7 +1036,7 @@ static bool coredump_pipe(struct core_name *cn, struct coredump_params *cprm,
static bool coredump_write(struct core_name *cn,
struct coredump_params *cprm,
- struct linux_binfmt *binfmt)
+ const struct linux_binfmt *binfmt)
{
if (dump_interrupted())
@@ -1086,119 +1086,119 @@ static inline bool coredump_skip(const struct coredump_params *cprm,
return false;
}
-void vfs_coredump(const kernel_siginfo_t *siginfo)
+static void do_coredump(struct core_name *cn, struct coredump_params *cprm,
+ size_t **argv, int *argc, const struct linux_binfmt *binfmt)
{
- struct cred *cred __free(put_cred) = NULL;
- size_t *argv __free(kfree) = NULL;
- struct core_state core_state;
- struct core_name cn;
- struct mm_struct *mm = current->mm;
- struct linux_binfmt *binfmt = mm->binfmt;
- const struct cred *old_cred;
- int argc = 0;
- struct coredump_params cprm = {
- .siginfo = siginfo,
- .limit = rlimit(RLIMIT_CORE),
- /*
- * We must use the same mm->flags while dumping core to avoid
- * inconsistency of bit flags, since this flag is not protected
- * by any locks.
- *
- * Note that we only care about MMF_DUMP* flags.
- */
- .mm_flags = __mm_flags_get_dumpable(mm),
- .vma_meta = NULL,
- .cpu = raw_smp_processor_id(),
- };
-
- audit_core_dumps(siginfo->si_signo);
-
- if (coredump_skip(&cprm, binfmt))
- return;
-
- cred = prepare_creds();
- if (!cred)
- return;
- /*
- * We cannot trust fsuid as being the "true" uid of the process
- * nor do we know its entire history. We only know it was tainted
- * so we dump it as root in mode 2, and only into a controlled
- * environment (pipe handler or fully qualified path).
- */
- if (coredump_force_suid_safe(&cprm))
- cred->fsuid = GLOBAL_ROOT_UID;
-
- if (coredump_wait(siginfo->si_signo, &core_state) < 0)
- return;
-
- old_cred = override_creds(cred);
-
- if (!coredump_parse(&cn, &cprm, &argv, &argc)) {
+ if (!coredump_parse(cn, cprm, argv, argc)) {
coredump_report_failure("format_corename failed, aborting core");
- goto close_fail;
+ return;
}
- switch (cn.core_type) {
+ switch (cn->core_type) {
case COREDUMP_FILE:
- if (!coredump_file(&cn, &cprm, binfmt))
- goto close_fail;
+ if (!coredump_file(cn, cprm, binfmt))
+ return;
break;
case COREDUMP_PIPE:
- if (!coredump_pipe(&cn, &cprm, argv, argc))
- goto close_fail;
+ if (!coredump_pipe(cn, cprm, *argv, *argc))
+ return;
break;
case COREDUMP_SOCK_REQ:
fallthrough;
case COREDUMP_SOCK:
- if (!coredump_socket(&cn, &cprm))
- goto close_fail;
+ if (!coredump_socket(cn, cprm))
+ return;
break;
default:
WARN_ON_ONCE(true);
- goto close_fail;
+ return;
}
/* Don't even generate the coredump. */
- if (cn.mask & COREDUMP_REJECT)
- goto close_fail;
+ if (cn->mask & COREDUMP_REJECT)
+ return;
/* get us an unshared descriptor table; almost always a no-op */
/* The cell spufs coredump code reads the file descriptor tables */
if (unshare_files())
- goto close_fail;
+ return;
- if ((cn.mask & COREDUMP_KERNEL) && !coredump_write(&cn, &cprm, binfmt))
- goto close_fail;
+ if ((cn->mask & COREDUMP_KERNEL) && !coredump_write(cn, cprm, binfmt))
+ return;
- coredump_sock_shutdown(cprm.file);
+ coredump_sock_shutdown(cprm->file);
/* Let the parent know that a coredump was generated. */
- if (cn.mask & COREDUMP_USERSPACE)
- cn.core_dumped = true;
+ if (cn->mask & COREDUMP_USERSPACE)
+ cn->core_dumped = true;
/*
* When core_pipe_limit is set we wait for the coredump server
* or usermodehelper to finish before exiting so it can e.g.,
* inspect /proc/<pid>.
*/
- if (cn.mask & COREDUMP_WAIT) {
- switch (cn.core_type) {
+ if (cn->mask & COREDUMP_WAIT) {
+ switch (cn->core_type) {
case COREDUMP_PIPE:
- wait_for_dump_helpers(cprm.file);
+ wait_for_dump_helpers(cprm->file);
break;
case COREDUMP_SOCK_REQ:
fallthrough;
case COREDUMP_SOCK:
- coredump_sock_wait(cprm.file);
+ coredump_sock_wait(cprm->file);
break;
default:
break;
}
}
+}
+
+void vfs_coredump(const kernel_siginfo_t *siginfo)
+{
+ size_t *argv __free(kfree) = NULL;
+ struct core_state core_state;
+ struct core_name cn;
+ const struct mm_struct *mm = current->mm;
+ const struct linux_binfmt *binfmt = mm->binfmt;
+ int argc = 0;
+ struct coredump_params cprm = {
+ .siginfo = siginfo,
+ .limit = rlimit(RLIMIT_CORE),
+ /*
+ * We must use the same mm->flags while dumping core to avoid
+ * inconsistency of bit flags, since this flag is not protected
+ * by any locks.
+ *
+ * Note that we only care about MMF_DUMP* flags.
+ */
+ .mm_flags = __mm_flags_get_dumpable(mm),
+ .vma_meta = NULL,
+ .cpu = raw_smp_processor_id(),
+ };
+
+ audit_core_dumps(siginfo->si_signo);
+
+ if (coredump_skip(&cprm, binfmt))
+ return;
+
+ CLASS(prepare_creds, cred)();
+ if (!cred)
+ return;
+ /*
+ * We cannot trust fsuid as being the "true" uid of the process
+ * nor do we know its entire history. We only know it was tainted
+ * so we dump it as root in mode 2, and only into a controlled
+ * environment (pipe handler or fully qualified path).
+ */
+ if (coredump_force_suid_safe(&cprm))
+ cred->fsuid = GLOBAL_ROOT_UID;
+
+ if (coredump_wait(siginfo->si_signo, &core_state) < 0)
+ return;
-close_fail:
+ scoped_with_creds(cred)
+ do_coredump(&cn, &cprm, &argv, &argc, binfmt);
coredump_cleanup(&cn, &cprm);
- revert_creds(old_cred);
return;
}
@@ -1468,7 +1468,7 @@ static int proc_dostring_coredump(const struct ctl_table *table, int write,
ssize_t retval;
char old_core_pattern[CORENAME_MAX_SIZE];
- if (write)
+ if (!write)
return proc_dostring(table, write, buffer, lenp, ppos);
retval = strscpy(old_core_pattern, core_pattern, CORENAME_MAX_SIZE);
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index ca54bf24b719..e54ebe402df7 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -95,7 +95,7 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
inode = iget_locked(sb, cramino(cramfs_inode, offset));
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
switch (cramfs_inode->mode & S_IFMT) {
diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c
index 5dee7c498bc8..ed6e926226b5 100644
--- a/fs/crypto/inline_crypt.c
+++ b/fs/crypto/inline_crypt.c
@@ -333,8 +333,7 @@ static bool bh_get_inode_and_lblk_num(const struct buffer_head *bh,
inode = mapping->host;
*inode_ret = inode;
- *lblk_num_ret = ((u64)folio->index << (PAGE_SHIFT - inode->i_blkbits)) +
- (bh_offset(bh) >> inode->i_blkbits);
+ *lblk_num_ret = (folio_pos(folio) + bh_offset(bh)) >> inode->i_blkbits;
return true;
}
diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c
index 3adbd7167055..5e939ea3ac28 100644
--- a/fs/crypto/keyring.c
+++ b/fs/crypto/keyring.c
@@ -945,7 +945,7 @@ static void evict_dentries_for_decrypted_inodes(struct fscrypt_master_key *mk)
list_for_each_entry(ci, &mk->mk_decrypted_inodes, ci_master_key_link) {
inode = ci->ci_inode;
spin_lock(&inode->i_lock);
- if (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) {
+ if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) {
spin_unlock(&inode->i_lock);
continue;
}
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
index 4bd3918f50e3..40fa05688d3a 100644
--- a/fs/crypto/keysetup.c
+++ b/fs/crypto/keysetup.c
@@ -834,7 +834,7 @@ int fscrypt_drop_inode(struct inode *inode)
* userspace is still using the files, inodes can be dirtied between
* then and now. We mustn't lose any writes, so skip dirty inodes here.
*/
- if (inode->i_state & I_DIRTY_ALL)
+ if (inode_state_read(inode) & I_DIRTY_ALL)
return 0;
/*
diff --git a/fs/dax.c b/fs/dax.c
index 89f071ba7b10..38fae11ee419 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1507,7 +1507,7 @@ static int dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
/* already zeroed? we're done. */
if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
- return iomap_iter_advance(iter, &length);
+ return iomap_iter_advance(iter, length);
/*
* invalidate the pages whose sharing state is to be changed
@@ -1536,10 +1536,10 @@ static int dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
if (ret < 0)
return ret;
- ret = iomap_iter_advance(iter, &length);
+ ret = iomap_iter_advance(iter, length);
if (ret)
return ret;
- } while (length > 0);
+ } while ((length = iomap_length(iter)) > 0);
if (did_zero)
*did_zero = true;
@@ -1597,7 +1597,7 @@ static int dax_iomap_iter(struct iomap_iter *iomi, struct iov_iter *iter)
if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) {
done = iov_iter_zero(min(length, end - pos), iter);
- return iomap_iter_advance(iomi, &done);
+ return iomap_iter_advance(iomi, done);
}
}
@@ -1681,12 +1681,12 @@ static int dax_iomap_iter(struct iomap_iter *iomi, struct iov_iter *iter)
xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
map_len, iter);
- length = xfer;
- ret = iomap_iter_advance(iomi, &length);
+ ret = iomap_iter_advance(iomi, xfer);
if (!ret && xfer == 0)
ret = -EFAULT;
if (xfer < map_len)
break;
+ length = iomap_length(iomi);
}
dax_read_unlock(id);
@@ -1725,7 +1725,7 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
if (iov_iter_rw(iter) == WRITE) {
lockdep_assert_held_write(&iomi.inode->i_rwsem);
iomi.flags |= IOMAP_WRITE;
- } else {
+ } else if (!sb_rdonly(iomi.inode->i_sb)) {
lockdep_assert_held(&iomi.inode->i_rwsem);
}
@@ -1919,10 +1919,8 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, unsigned long *pfnp,
ret |= VM_FAULT_MAJOR;
}
- if (!(ret & VM_FAULT_ERROR)) {
- u64 length = PAGE_SIZE;
- iter.status = iomap_iter_advance(&iter, &length);
- }
+ if (!(ret & VM_FAULT_ERROR))
+ iter.status = iomap_iter_advance(&iter, PAGE_SIZE);
}
if (iomap_errp)
@@ -2034,10 +2032,8 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, unsigned long *pfnp,
continue; /* actually breaks out of the loop */
ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, true);
- if (ret != VM_FAULT_FALLBACK) {
- u64 length = PMD_SIZE;
- iter.status = iomap_iter_advance(&iter, &length);
- }
+ if (ret != VM_FAULT_FALLBACK)
+ iter.status = iomap_iter_advance(&iter, PMD_SIZE);
}
unlock_entry:
@@ -2163,7 +2159,6 @@ static int dax_range_compare_iter(struct iomap_iter *it_src,
const struct iomap *smap = &it_src->iomap;
const struct iomap *dmap = &it_dest->iomap;
loff_t pos1 = it_src->pos, pos2 = it_dest->pos;
- u64 dest_len;
void *saddr, *daddr;
int id, ret;
@@ -2196,10 +2191,9 @@ static int dax_range_compare_iter(struct iomap_iter *it_src,
dax_read_unlock(id);
advance:
- dest_len = len;
- ret = iomap_iter_advance(it_src, &len);
+ ret = iomap_iter_advance(it_src, len);
if (!ret)
- ret = iomap_iter_advance(it_dest, &dest_len);
+ ret = iomap_iter_advance(it_dest, len);
return ret;
out_unlock:
diff --git a/fs/dcache.c b/fs/dcache.c
index a067fa0a965a..9143fd502def 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -86,7 +86,8 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
EXPORT_SYMBOL(rename_lock);
-static struct kmem_cache *dentry_cache __ro_after_init;
+static struct kmem_cache *__dentry_cache __ro_after_init;
+#define dentry_cache runtime_const_ptr(__dentry_cache)
const struct qstr empty_name = QSTR_INIT("", 0);
EXPORT_SYMBOL(empty_name);
@@ -794,7 +795,7 @@ void d_mark_dontcache(struct inode *inode)
de->d_flags |= DCACHE_DONTCACHE;
spin_unlock(&de->d_lock);
}
- inode->i_state |= I_DONTCACHE;
+ inode_state_set(inode, I_DONTCACHE);
spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(d_mark_dontcache);
@@ -1073,7 +1074,7 @@ struct dentry *d_find_alias_rcu(struct inode *inode)
spin_lock(&inode->i_lock);
// ->i_dentry and ->i_rcu are colocated, but the latter won't be
// used without having I_FREEING set, which means no aliases left
- if (likely(!(inode->i_state & I_FREEING) && !hlist_empty(l))) {
+ if (likely(!(inode_state_read(inode) & I_FREEING) && !hlist_empty(l))) {
if (S_ISDIR(inode->i_mode)) {
de = hlist_entry(l->first, struct dentry, d_u.d_alias);
} else {
@@ -1980,14 +1981,8 @@ void d_instantiate_new(struct dentry *entry, struct inode *inode)
security_d_instantiate(entry, inode);
spin_lock(&inode->i_lock);
__d_instantiate(entry, inode);
- WARN_ON(!(inode->i_state & I_NEW));
- inode->i_state &= ~I_NEW & ~I_CREATING;
- /*
- * Pairs with the barrier in prepare_to_wait_event() to make sure
- * ___wait_var_event() either sees the bit cleared or
- * waitqueue_active() check in wake_up_var() sees the waiter.
- */
- smp_mb();
+ WARN_ON(!(inode_state_read(inode) & I_NEW));
+ inode_state_clear(inode, I_NEW | I_CREATING);
inode_wake_up_bit(inode, __I_NEW);
spin_unlock(&inode->i_lock);
}
@@ -2306,11 +2301,20 @@ struct dentry *__d_lookup_rcu(const struct dentry *parent,
seq = raw_seqcount_begin(&dentry->d_seq);
if (dentry->d_parent != parent)
continue;
- if (d_unhashed(dentry))
- continue;
if (dentry->d_name.hash_len != hashlen)
continue;
- if (dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0)
+ if (unlikely(dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0))
+ continue;
+ /*
+ * Check for the dentry being unhashed.
+ *
+ * As tempting as it is, we *can't* skip it because of a race window
+ * between us finding the dentry before it gets unhashed and loading
+ * the sequence counter after unhashing is finished.
+ *
+ * We can at least predict on it.
+ */
+ if (unlikely(d_unhashed(dentry)))
continue;
*seqp = seq;
return dentry;
@@ -2557,6 +2561,8 @@ struct dentry *d_alloc_parallel(struct dentry *parent,
spin_lock(&parent->d_lock);
new->d_parent = dget_dlock(parent);
hlist_add_head(&new->d_sib, &parent->d_children);
+ if (parent->d_flags & DCACHE_DISCONNECTED)
+ new->d_flags |= DCACHE_DISCONNECTED;
spin_unlock(&parent->d_lock);
retry:
@@ -3220,9 +3226,10 @@ static void __init dcache_init(void)
* but it is probably not worth it because of the cache nature
* of the dcache.
*/
- dentry_cache = KMEM_CACHE_USERCOPY(dentry,
+ __dentry_cache = KMEM_CACHE_USERCOPY(dentry,
SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_ACCOUNT,
d_shortname.string);
+ runtime_const_init(ptr, __dentry_cache);
/* Hash may have been set up in dcache_init_early */
if (!hashdist)
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 019a8b4eaaf9..49f56a598ecb 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -28,7 +28,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
* inodes without pages but we deliberately won't in case
* we need to reschedule to avoid softlockups.
*/
- if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
+ if ((inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) ||
(mapping_empty(inode->i_mapping) && !need_resched())) {
spin_unlock(&inode->i_lock);
continue;
diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig
index 1bdeaa6d5790..c2f4fb41b4e6 100644
--- a/fs/ecryptfs/Kconfig
+++ b/fs/ecryptfs/Kconfig
@@ -4,7 +4,7 @@ config ECRYPT_FS
depends on KEYS && CRYPTO && (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n)
select CRYPTO_ECB
select CRYPTO_CBC
- select CRYPTO_MD5
+ select CRYPTO_LIB_MD5
help
Encrypted filesystem that operates on the VFS layer. See
<file:Documentation/filesystems/ecryptfs.rst> to learn more about
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 69536cacdea8..260f8a4938b0 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -9,7 +9,6 @@
* Michael C. Thompson <mcthomps@us.ibm.com>
*/
-#include <crypto/hash.h>
#include <crypto/skcipher.h>
#include <linux/fs.h>
#include <linux/mount.h>
@@ -48,32 +47,6 @@ void ecryptfs_from_hex(char *dst, char *src, int dst_size)
}
}
-/**
- * ecryptfs_calculate_md5 - calculates the md5 of @src
- * @dst: Pointer to 16 bytes of allocated memory
- * @crypt_stat: Pointer to crypt_stat struct for the current inode
- * @src: Data to be md5'd
- * @len: Length of @src
- *
- * Uses the allocated crypto context that crypt_stat references to
- * generate the MD5 sum of the contents of src.
- */
-static int ecryptfs_calculate_md5(char *dst,
- struct ecryptfs_crypt_stat *crypt_stat,
- char *src, int len)
-{
- int rc = crypto_shash_tfm_digest(crypt_stat->hash_tfm, src, len, dst);
-
- if (rc) {
- printk(KERN_ERR
- "%s: Error computing crypto hash; rc = [%d]\n",
- __func__, rc);
- goto out;
- }
-out:
- return rc;
-}
-
static int ecryptfs_crypto_api_algify_cipher_name(char **algified_name,
char *cipher_name,
char *chaining_modifier)
@@ -104,13 +77,10 @@ out:
*
* Generate the initialization vector from the given root IV and page
* offset.
- *
- * Returns zero on success; non-zero on error.
*/
-int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
- loff_t offset)
+void ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
+ loff_t offset)
{
- int rc = 0;
char dst[MD5_DIGEST_SIZE];
char src[ECRYPTFS_MAX_IV_BYTES + 16];
@@ -129,20 +99,12 @@ int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
ecryptfs_printk(KERN_DEBUG, "source:\n");
ecryptfs_dump_hex(src, (crypt_stat->iv_bytes + 16));
}
- rc = ecryptfs_calculate_md5(dst, crypt_stat, src,
- (crypt_stat->iv_bytes + 16));
- if (rc) {
- ecryptfs_printk(KERN_WARNING, "Error attempting to compute "
- "MD5 while generating IV for a page\n");
- goto out;
- }
+ md5(src, crypt_stat->iv_bytes + 16, dst);
memcpy(iv, dst, crypt_stat->iv_bytes);
if (unlikely(ecryptfs_verbosity > 0)) {
ecryptfs_printk(KERN_DEBUG, "derived iv:\n");
ecryptfs_dump_hex(iv, crypt_stat->iv_bytes);
}
-out:
- return rc;
}
/**
@@ -151,29 +113,14 @@ out:
*
* Initialize the crypt_stat structure.
*/
-int ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat)
+void ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat)
{
- struct crypto_shash *tfm;
- int rc;
-
- tfm = crypto_alloc_shash(ECRYPTFS_DEFAULT_HASH, 0, 0);
- if (IS_ERR(tfm)) {
- rc = PTR_ERR(tfm);
- ecryptfs_printk(KERN_ERR, "Error attempting to "
- "allocate crypto context; rc = [%d]\n",
- rc);
- return rc;
- }
-
memset((void *)crypt_stat, 0, sizeof(struct ecryptfs_crypt_stat));
INIT_LIST_HEAD(&crypt_stat->keysig_list);
mutex_init(&crypt_stat->keysig_list_mutex);
mutex_init(&crypt_stat->cs_mutex);
mutex_init(&crypt_stat->cs_tfm_mutex);
- crypt_stat->hash_tfm = tfm;
crypt_stat->flags |= ECRYPTFS_STRUCT_INITIALIZED;
-
- return 0;
}
/**
@@ -187,7 +134,6 @@ void ecryptfs_destroy_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat)
struct ecryptfs_key_sig *key_sig, *key_sig_tmp;
crypto_free_skcipher(crypt_stat->tfm);
- crypto_free_shash(crypt_stat->hash_tfm);
list_for_each_entry_safe(key_sig, key_sig_tmp,
&crypt_stat->keysig_list, crypt_stat_list) {
list_del(&key_sig->crypt_stat_list);
@@ -361,14 +307,7 @@ static int crypt_extent(struct ecryptfs_crypt_stat *crypt_stat,
int rc;
extent_base = (((loff_t)page_index) * (PAGE_SIZE / extent_size));
- rc = ecryptfs_derive_iv(extent_iv, crypt_stat,
- (extent_base + extent_offset));
- if (rc) {
- ecryptfs_printk(KERN_ERR, "Error attempting to derive IV for "
- "extent [0x%.16llx]; rc = [%d]\n",
- (unsigned long long)(extent_base + extent_offset), rc);
- goto out;
- }
+ ecryptfs_derive_iv(extent_iv, crypt_stat, extent_base + extent_offset);
sg_init_table(&src_sg, 1);
sg_init_table(&dst_sg, 1);
@@ -609,31 +548,20 @@ void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat)
*/
int ecryptfs_compute_root_iv(struct ecryptfs_crypt_stat *crypt_stat)
{
- int rc = 0;
char dst[MD5_DIGEST_SIZE];
BUG_ON(crypt_stat->iv_bytes > MD5_DIGEST_SIZE);
BUG_ON(crypt_stat->iv_bytes <= 0);
if (!(crypt_stat->flags & ECRYPTFS_KEY_VALID)) {
- rc = -EINVAL;
ecryptfs_printk(KERN_WARNING, "Session key not valid; "
"cannot generate root IV\n");
- goto out;
- }
- rc = ecryptfs_calculate_md5(dst, crypt_stat, crypt_stat->key,
- crypt_stat->key_size);
- if (rc) {
- ecryptfs_printk(KERN_WARNING, "Error attempting to compute "
- "MD5 while generating root IV\n");
- goto out;
- }
- memcpy(crypt_stat->root_iv, dst, crypt_stat->iv_bytes);
-out:
- if (rc) {
memset(crypt_stat->root_iv, 0, crypt_stat->iv_bytes);
crypt_stat->flags |= ECRYPTFS_SECURITY_WARNING;
+ return -EINVAL;
}
- return rc;
+ md5(crypt_stat->key, crypt_stat->key_size, dst);
+ memcpy(crypt_stat->root_iv, dst, crypt_stat->iv_bytes);
+ return 0;
}
static void ecryptfs_generate_new_key(struct ecryptfs_crypt_stat *crypt_stat)
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 9e6ab0b41337..62a2ea7f59ed 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -14,6 +14,7 @@
#ifndef ECRYPTFS_KERNEL_H
#define ECRYPTFS_KERNEL_H
+#include <crypto/md5.h>
#include <crypto/skcipher.h>
#include <keys/user-type.h>
#include <keys/encrypted-type.h>
@@ -137,8 +138,6 @@ ecryptfs_get_key_payload_data(struct key *key)
+ MAGIC_ECRYPTFS_MARKER_SIZE_BYTES)
#define ECRYPTFS_DEFAULT_CIPHER "aes"
#define ECRYPTFS_DEFAULT_KEY_BYTES 16
-#define ECRYPTFS_DEFAULT_HASH "md5"
-#define ECRYPTFS_TAG_70_DIGEST ECRYPTFS_DEFAULT_HASH
#define ECRYPTFS_TAG_1_PACKET_TYPE 0x01
#define ECRYPTFS_TAG_3_PACKET_TYPE 0x8C
#define ECRYPTFS_TAG_11_PACKET_TYPE 0xED
@@ -163,8 +162,6 @@ ecryptfs_get_key_payload_data(struct key *key)
* ECRYPTFS_MAX_IV_BYTES */
#define ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES 16
#define ECRYPTFS_NON_NULL 0x42 /* A reasonable substitute for NULL */
-#define MD5_DIGEST_SIZE 16
-#define ECRYPTFS_TAG_70_DIGEST_SIZE MD5_DIGEST_SIZE
#define ECRYPTFS_TAG_70_MIN_METADATA_SIZE (1 + ECRYPTFS_MIN_PKT_LEN_SIZE \
+ ECRYPTFS_SIG_SIZE + 1 + 1)
#define ECRYPTFS_TAG_70_MAX_METADATA_SIZE (1 + ECRYPTFS_MAX_PKT_LEN_SIZE \
@@ -237,8 +234,6 @@ struct ecryptfs_crypt_stat {
unsigned int extent_mask;
struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
struct crypto_skcipher *tfm;
- struct crypto_shash *hash_tfm; /* Crypto context for generating
- * the initialization vectors */
unsigned char cipher[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1];
unsigned char key[ECRYPTFS_MAX_KEY_BYTES];
unsigned char root_iv[ECRYPTFS_MAX_IV_BYTES];
@@ -558,7 +553,7 @@ int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg,
int sg_size);
int ecryptfs_compute_root_iv(struct ecryptfs_crypt_stat *crypt_stat);
void ecryptfs_rotate_iv(unsigned char *iv);
-int ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat);
+void ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat);
void ecryptfs_destroy_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat);
void ecryptfs_destroy_mount_crypt_stat(
struct ecryptfs_mount_crypt_stat *mount_crypt_stat);
@@ -693,8 +688,8 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
char *data, size_t max_packet_size);
int ecryptfs_set_f_namelen(long *namelen, long lower_namelen,
struct ecryptfs_mount_crypt_stat *mount_crypt_stat);
-int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
- loff_t offset);
+void ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
+ loff_t offset);
extern const struct xattr_handler * const ecryptfs_xattr_handlers[];
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index ed1394da8d6b..fc6d37419753 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -95,7 +95,7 @@ static struct inode *__ecryptfs_get_inode(struct inode *lower_inode,
iput(lower_inode);
return ERR_PTR(-EACCES);
}
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
iput(lower_inode);
return inode;
@@ -106,7 +106,7 @@ struct inode *ecryptfs_get_inode(struct inode *lower_inode,
{
struct inode *inode = __ecryptfs_get_inode(lower_inode, sb);
- if (!IS_ERR(inode) && (inode->i_state & I_NEW))
+ if (!IS_ERR(inode) && (inode_state_read_once(inode) & I_NEW))
unlock_new_inode(inode);
return inode;
@@ -364,7 +364,7 @@ static struct dentry *ecryptfs_lookup_interpose(struct dentry *dentry,
}
}
- if (inode->i_state & I_NEW)
+ if (inode_state_read_once(inode) & I_NEW)
unlock_new_inode(inode);
return d_splice_alias(inode, dentry);
}
@@ -903,11 +903,8 @@ static int ecryptfs_setattr(struct mnt_idmap *idmap,
struct ecryptfs_crypt_stat *crypt_stat;
crypt_stat = &ecryptfs_inode_to_private(d_inode(dentry))->crypt_stat;
- if (!(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED)) {
- rc = ecryptfs_init_crypt_stat(crypt_stat);
- if (rc)
- return rc;
- }
+ if (!(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED))
+ ecryptfs_init_crypt_stat(crypt_stat);
inode = d_inode(dentry);
lower_inode = ecryptfs_inode_to_lower(inode);
lower_dentry = ecryptfs_dentry_to_lower(dentry);
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 7f9f68c00ef6..bbf8603242fa 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -11,7 +11,6 @@
* Trevor S. Highland <trevor.highland@gmail.com>
*/
-#include <crypto/hash.h>
#include <crypto/skcipher.h>
#include <linux/string.h>
#include <linux/pagemap.h>
@@ -601,10 +600,7 @@ struct ecryptfs_write_tag_70_packet_silly_stack {
struct crypto_skcipher *skcipher_tfm;
struct skcipher_request *skcipher_req;
char iv[ECRYPTFS_MAX_IV_BYTES];
- char hash[ECRYPTFS_TAG_70_DIGEST_SIZE];
- char tmp_hash[ECRYPTFS_TAG_70_DIGEST_SIZE];
- struct crypto_shash *hash_tfm;
- struct shash_desc *hash_desc;
+ char hash[MD5_DIGEST_SIZE];
};
/*
@@ -741,51 +737,15 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
"password tokens\n", __func__);
goto out_free_unlock;
}
- s->hash_tfm = crypto_alloc_shash(ECRYPTFS_TAG_70_DIGEST, 0, 0);
- if (IS_ERR(s->hash_tfm)) {
- rc = PTR_ERR(s->hash_tfm);
- printk(KERN_ERR "%s: Error attempting to "
- "allocate hash crypto context; rc = [%d]\n",
- __func__, rc);
- goto out_free_unlock;
- }
-
- s->hash_desc = kmalloc(sizeof(*s->hash_desc) +
- crypto_shash_descsize(s->hash_tfm), GFP_KERNEL);
- if (!s->hash_desc) {
- rc = -ENOMEM;
- goto out_release_free_unlock;
- }
- s->hash_desc->tfm = s->hash_tfm;
-
- rc = crypto_shash_digest(s->hash_desc,
- (u8 *)s->auth_tok->token.password.session_key_encryption_key,
- s->auth_tok->token.password.session_key_encryption_key_bytes,
- s->hash);
- if (rc) {
- printk(KERN_ERR
- "%s: Error computing crypto hash; rc = [%d]\n",
- __func__, rc);
- goto out_release_free_unlock;
- }
+ md5(s->auth_tok->token.password.session_key_encryption_key,
+ s->auth_tok->token.password.session_key_encryption_key_bytes,
+ s->hash);
for (s->j = 0; s->j < (s->num_rand_bytes - 1); s->j++) {
s->block_aligned_filename[s->j] =
- s->hash[(s->j % ECRYPTFS_TAG_70_DIGEST_SIZE)];
- if ((s->j % ECRYPTFS_TAG_70_DIGEST_SIZE)
- == (ECRYPTFS_TAG_70_DIGEST_SIZE - 1)) {
- rc = crypto_shash_digest(s->hash_desc, (u8 *)s->hash,
- ECRYPTFS_TAG_70_DIGEST_SIZE,
- s->tmp_hash);
- if (rc) {
- printk(KERN_ERR
- "%s: Error computing crypto hash; "
- "rc = [%d]\n", __func__, rc);
- goto out_release_free_unlock;
- }
- memcpy(s->hash, s->tmp_hash,
- ECRYPTFS_TAG_70_DIGEST_SIZE);
- }
+ s->hash[s->j % MD5_DIGEST_SIZE];
+ if ((s->j % MD5_DIGEST_SIZE) == (MD5_DIGEST_SIZE - 1))
+ md5(s->hash, MD5_DIGEST_SIZE, s->hash);
if (s->block_aligned_filename[s->j] == '\0')
s->block_aligned_filename[s->j] = ECRYPTFS_NON_NULL;
}
@@ -798,7 +758,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
"convert filename memory to scatterlist; rc = [%d]. "
"block_aligned_filename_size = [%zd]\n", __func__, rc,
s->block_aligned_filename_size);
- goto out_release_free_unlock;
+ goto out_free_unlock;
}
rc = virt_to_scatterlist(&dest[s->i], s->block_aligned_filename_size,
s->dst_sg, 2);
@@ -807,7 +767,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
"convert encrypted filename memory to scatterlist; "
"rc = [%d]. block_aligned_filename_size = [%zd]\n",
__func__, rc, s->block_aligned_filename_size);
- goto out_release_free_unlock;
+ goto out_free_unlock;
}
/* The characters in the first block effectively do the job
* of the IV here, so we just use 0's for the IV. Note the
@@ -825,7 +785,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
rc,
s->auth_tok->token.password.session_key_encryption_key,
mount_crypt_stat->global_default_fn_cipher_key_bytes);
- goto out_release_free_unlock;
+ goto out_free_unlock;
}
skcipher_request_set_crypt(s->skcipher_req, s->src_sg, s->dst_sg,
s->block_aligned_filename_size, s->iv);
@@ -833,13 +793,11 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
if (rc) {
printk(KERN_ERR "%s: Error attempting to encrypt filename; "
"rc = [%d]\n", __func__, rc);
- goto out_release_free_unlock;
+ goto out_free_unlock;
}
s->i += s->block_aligned_filename_size;
(*packet_size) = s->i;
(*remaining_bytes) -= (*packet_size);
-out_release_free_unlock:
- crypto_free_shash(s->hash_tfm);
out_free_unlock:
kfree_sensitive(s->block_aligned_filename);
out_unlock:
@@ -850,7 +808,6 @@ out:
key_put(auth_tok_key);
}
skcipher_request_free(s->skcipher_req);
- kfree_sensitive(s->hash_desc);
kfree(s);
return rc;
}
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 16ea14dd2c62..c12dc680f8fe 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -12,6 +12,7 @@
#include <linux/dcache.h>
#include <linux/file.h>
+#include <linux/fips.h>
#include <linux/module.h>
#include <linux/namei.h>
#include <linux/skbuff.h>
@@ -454,6 +455,12 @@ static int ecryptfs_get_tree(struct fs_context *fc)
goto out;
}
+ if (fips_enabled) {
+ rc = -EINVAL;
+ err = "eCryptfs support is disabled due to FIPS";
+ goto out;
+ }
+
s = sget_fc(fc, NULL, set_anon_super_fc);
if (IS_ERR(s)) {
rc = PTR_ERR(s);
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index e7b7f426fecf..3bc21d677564 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -41,10 +41,7 @@ static struct inode *ecryptfs_alloc_inode(struct super_block *sb)
inode_info = alloc_inode_sb(sb, ecryptfs_inode_info_cache, GFP_KERNEL);
if (unlikely(!inode_info))
goto out;
- if (ecryptfs_init_crypt_stat(&inode_info->crypt_stat)) {
- kmem_cache_free(ecryptfs_inode_info_cache, inode_info);
- goto out;
- }
+ ecryptfs_init_crypt_stat(&inode_info->crypt_stat);
mutex_init(&inode_info->lower_file_mutex);
atomic_set(&inode_info->lower_file_count, 0);
inode_info->lower_file = NULL;
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 1f4d8ce56667..6de97565d5f7 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -533,6 +533,7 @@ static struct file_system_type efivarfs_type = {
.init_fs_context = efivarfs_init_fs_context,
.kill_sb = efivarfs_kill_sb,
.parameters = efivarfs_parameters,
+ .fs_flags = FS_POWER_FREEZE,
};
static __init int efivarfs_init(void)
diff --git a/fs/efs/inode.c b/fs/efs/inode.c
index 462619e59766..28407578f83a 100644
--- a/fs/efs/inode.c
+++ b/fs/efs/inode.c
@@ -62,7 +62,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino)
inode = iget_locked(super, ino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
in = INODE_INFO(inode);
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 8ca29962a3dd..bb13c4cb8455 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -371,7 +371,8 @@ static int erofs_read_folio(struct file *file, struct folio *folio)
{
trace_erofs_read_folio(folio, true);
- return iomap_read_folio(folio, &erofs_iomap_ops);
+ iomap_bio_read_folio(folio, &erofs_iomap_ops);
+ return 0;
}
static void erofs_readahead(struct readahead_control *rac)
@@ -379,7 +380,7 @@ static void erofs_readahead(struct readahead_control *rac)
trace_erofs_readahead(rac->mapping->host, readahead_index(rac),
readahead_count(rac), true);
- return iomap_readahead(rac, &erofs_iomap_ops);
+ iomap_bio_readahead(rac, &erofs_iomap_ops);
}
static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
diff --git a/fs/erofs/decompressor_zstd.c b/fs/erofs/decompressor_zstd.c
index b4bfe14229f9..e38d93bb2104 100644
--- a/fs/erofs/decompressor_zstd.c
+++ b/fs/erofs/decompressor_zstd.c
@@ -172,7 +172,6 @@ static int z_erofs_zstd_decompress(struct z_erofs_decompress_req *rq,
dctx.bounce = strm->bounce;
do {
- dctx.avail_out = out_buf.size - out_buf.pos;
dctx.inbuf_sz = in_buf.size;
dctx.inbuf_pos = in_buf.pos;
err = z_erofs_stream_switch_bufs(&dctx, &out_buf.dst,
@@ -188,14 +187,18 @@ static int z_erofs_zstd_decompress(struct z_erofs_decompress_req *rq,
in_buf.pos = dctx.inbuf_pos;
zerr = zstd_decompress_stream(stream, &out_buf, &in_buf);
- if (zstd_is_error(zerr) || (!zerr && rq->outputsize)) {
+ dctx.avail_out = out_buf.size - out_buf.pos;
+ if (zstd_is_error(zerr) ||
+ ((rq->outputsize + dctx.avail_out) && (!zerr || (zerr > 0 &&
+ !(rq->inputsize + in_buf.size - in_buf.pos))))) {
erofs_err(sb, "failed to decompress in[%u] out[%u]: %s",
rq->inputsize, rq->outputsize,
- zerr ? zstd_get_error_name(zerr) : "unexpected end of stream");
+ zstd_is_error(zerr) ? zstd_get_error_name(zerr) :
+ "unexpected end of stream");
err = -EFSCORRUPTED;
break;
}
- } while (rq->outputsize || out_buf.pos < out_buf.size);
+ } while (rq->outputsize + dctx.avail_out);
if (dctx.kout)
kunmap_local(dctx.kout);
diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c
index b7b3432a9882..d27938435b2f 100644
--- a/fs/erofs/fileio.c
+++ b/fs/erofs/fileio.c
@@ -47,7 +47,6 @@ static void erofs_fileio_ki_complete(struct kiocb *iocb, long ret)
static void erofs_fileio_rq_submit(struct erofs_fileio_rq *rq)
{
- const struct cred *old_cred;
struct iov_iter iter;
int ret;
@@ -61,9 +60,8 @@ static void erofs_fileio_rq_submit(struct erofs_fileio_rq *rq)
rq->iocb.ki_flags = IOCB_DIRECT;
iov_iter_bvec(&iter, ITER_DEST, rq->bvecs, rq->bio.bi_vcnt,
rq->bio.bi_iter.bi_size);
- old_cred = override_creds(rq->iocb.ki_filp->f_cred);
- ret = vfs_iocb_iter_read(rq->iocb.ki_filp, &rq->iocb, &iter);
- revert_creds(old_cred);
+ scoped_with_creds(rq->iocb.ki_filp->f_cred)
+ ret = vfs_iocb_iter_read(rq->iocb.ki_filp, &rq->iocb, &iter);
if (ret != -EIOCBQUEUED)
erofs_fileio_ki_complete(&rq->iocb, ret);
}
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index cb780c095d28..bce98c845a18 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -295,7 +295,7 @@ struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid)
if (!inode)
return ERR_PTR(-ENOMEM);
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
int err = erofs_fill_inode(inode);
if (err) {
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index e5581dbeb4c2..c8d8e129eb4b 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -55,10 +55,6 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m,
} else {
m->partialref = !!(advise & Z_EROFS_LI_PARTIAL_REF);
m->clusterofs = le16_to_cpu(di->di_clusterofs);
- if (m->clusterofs >= 1 << vi->z_lclusterbits) {
- DBG_BUGON(1);
- return -EFSCORRUPTED;
- }
m->pblk = le32_to_cpu(di->di_u.blkaddr);
}
return 0;
@@ -240,21 +236,29 @@ static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m,
static int z_erofs_load_lcluster_from_disk(struct z_erofs_maprecorder *m,
unsigned int lcn, bool lookahead)
{
+ struct erofs_inode *vi = EROFS_I(m->inode);
+ int err;
+
+ if (vi->datalayout == EROFS_INODE_COMPRESSED_COMPACT) {
+ err = z_erofs_load_compact_lcluster(m, lcn, lookahead);
+ } else {
+ DBG_BUGON(vi->datalayout != EROFS_INODE_COMPRESSED_FULL);
+ err = z_erofs_load_full_lcluster(m, lcn);
+ }
+ if (err)
+ return err;
+
if (m->type >= Z_EROFS_LCLUSTER_TYPE_MAX) {
erofs_err(m->inode->i_sb, "unknown type %u @ lcn %u of nid %llu",
- m->type, lcn, EROFS_I(m->inode)->nid);
+ m->type, lcn, EROFS_I(m->inode)->nid);
DBG_BUGON(1);
return -EOPNOTSUPP;
+ } else if (m->type != Z_EROFS_LCLUSTER_TYPE_NONHEAD &&
+ m->clusterofs >= (1 << vi->z_lclusterbits)) {
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
}
-
- switch (EROFS_I(m->inode)->datalayout) {
- case EROFS_INODE_COMPRESSED_FULL:
- return z_erofs_load_full_lcluster(m, lcn);
- case EROFS_INODE_COMPRESSED_COMPACT:
- return z_erofs_load_compact_lcluster(m, lcn, lookahead);
- default:
- return -EINVAL;
- }
+ return 0;
}
static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m,
@@ -268,20 +272,19 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m,
unsigned long lcn = m->lcn - lookback_distance;
int err;
+ if (!lookback_distance)
+ break;
+
err = z_erofs_load_lcluster_from_disk(m, lcn, false);
if (err)
return err;
-
if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) {
lookback_distance = m->delta[0];
- if (!lookback_distance)
- break;
continue;
- } else {
- m->headtype = m->type;
- m->map->m_la = (lcn << lclusterbits) | m->clusterofs;
- return 0;
}
+ m->headtype = m->type;
+ m->map->m_la = (lcn << lclusterbits) | m->clusterofs;
+ return 0;
}
erofs_err(sb, "bogus lookback distance %u @ lcn %lu of nid %llu",
lookback_distance, m->lcn, vi->nid);
@@ -431,13 +434,6 @@ static int z_erofs_map_blocks_fo(struct inode *inode,
end = inode->i_size;
} else {
if (m.type != Z_EROFS_LCLUSTER_TYPE_NONHEAD) {
- /* m.lcn should be >= 1 if endoff < m.clusterofs */
- if (!m.lcn) {
- erofs_err(sb, "invalid logical cluster 0 at nid %llu",
- vi->nid);
- err = -EFSCORRUPTED;
- goto unmap_out;
- }
end = (m.lcn << lclusterbits) | m.clusterofs;
map->m_flags |= EROFS_MAP_FULL_MAPPED;
m.delta[0] = 1;
@@ -596,7 +592,7 @@ static int z_erofs_map_blocks_ext(struct inode *inode,
vi->z_fragmentoff = map->m_plen;
if (recsz > offsetof(struct z_erofs_extent, pstart_lo))
vi->z_fragmentoff |= map->m_pa << 32;
- } else if (map->m_plen) {
+ } else if (map->m_plen & Z_EROFS_EXTENT_PLEN_MASK) {
map->m_flags |= EROFS_MAP_MAPPED |
EROFS_MAP_FULL_MAPPED | EROFS_MAP_ENCODED;
fmt = map->m_plen >> Z_EROFS_EXTENT_PLEN_FMT_BIT;
@@ -715,6 +711,7 @@ static int z_erofs_map_sanity_check(struct inode *inode,
struct erofs_map_blocks *map)
{
struct erofs_sb_info *sbi = EROFS_I_SB(inode);
+ u64 pend;
if (!(map->m_flags & EROFS_MAP_ENCODED))
return 0;
@@ -732,6 +729,10 @@ static int z_erofs_map_sanity_check(struct inode *inode,
if (unlikely(map->m_plen > Z_EROFS_PCLUSTER_MAX_SIZE ||
map->m_llen > Z_EROFS_PCLUSTER_MAX_DSIZE))
return -EOPNOTSUPP;
+ /* Filesystems beyond 48-bit physical block addresses are invalid */
+ if (unlikely(check_add_overflow(map->m_pa, map->m_plen, &pend) ||
+ (pend >> sbi->blkszbits) >= BIT_ULL(48)))
+ return -EFSCORRUPTED;
return 0;
}
diff --git a/fs/exec.c b/fs/exec.c
index 6b70c6726d31..4298e7e08d5d 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -2048,7 +2048,7 @@ static int proc_dointvec_minmax_coredump(const struct ctl_table *table, int writ
{
int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- if (!error && !write)
+ if (!error && write)
validate_coredump_safety();
return error;
}
diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
index 329697c89d09..38210fb6901c 100644
--- a/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@ -29,7 +29,6 @@ enum exfat_error_mode {
enum {
NLS_NAME_NO_LOSSY = 0, /* no lossy */
NLS_NAME_LOSSY = 1 << 0, /* just detected incorrect filename(s) */
- NLS_NAME_OVERLEN = 1 << 1, /* the length is over than its limit */
};
#define EXFAT_HASH_BITS 8
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index f246cf439588..adc37b4d7fc2 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -509,8 +509,8 @@ static int exfat_ioctl_get_volume_label(struct super_block *sb, unsigned long ar
static int exfat_ioctl_set_volume_label(struct super_block *sb,
unsigned long arg)
{
- int ret = 0, lossy;
- char label[FSLABEL_MAX];
+ int ret = 0, lossy, label_len;
+ char label[FSLABEL_MAX] = {0};
struct exfat_uni_name uniname;
if (!capable(CAP_SYS_ADMIN))
@@ -520,8 +520,9 @@ static int exfat_ioctl_set_volume_label(struct super_block *sb,
return -EFAULT;
memset(&uniname, 0, sizeof(uniname));
+ label_len = strnlen(label, FSLABEL_MAX - 1);
if (label[0]) {
- ret = exfat_nls_to_utf16(sb, label, FSLABEL_MAX,
+ ret = exfat_nls_to_utf16(sb, label, label_len,
&uniname, &lossy);
if (ret < 0)
return ret;
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index 7eb9c67fd35f..745dce29ddb5 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -442,7 +442,7 @@ static int __exfat_resolve_path(struct inode *inode, const unsigned char *path,
return namelen; /* return error value */
if ((lossy && !lookup) || !namelen)
- return (lossy & NLS_NAME_OVERLEN) ? -ENAMETOOLONG : -EINVAL;
+ return -EINVAL;
return 0;
}
@@ -642,10 +642,14 @@ static int exfat_find(struct inode *dir, const struct qstr *qname,
info->type = exfat_get_entry_type(ep);
info->attr = le16_to_cpu(ep->dentry.file.attr);
- info->size = le64_to_cpu(ep2->dentry.stream.valid_size);
info->valid_size = le64_to_cpu(ep2->dentry.stream.valid_size);
info->size = le64_to_cpu(ep2->dentry.stream.size);
+ if (info->valid_size < 0) {
+ exfat_fs_error(sb, "data valid size is invalid(%lld)", info->valid_size);
+ return -EIO;
+ }
+
if (unlikely(EXFAT_B_TO_CLU_ROUND_UP(info->size, sbi) > sbi->used_clusters)) {
exfat_fs_error(sb, "data size is invalid(%lld)", info->size);
return -EIO;
diff --git a/fs/exfat/nls.c b/fs/exfat/nls.c
index 8243d94ceaf4..57db08a5271c 100644
--- a/fs/exfat/nls.c
+++ b/fs/exfat/nls.c
@@ -616,9 +616,6 @@ static int exfat_nls_to_ucs2(struct super_block *sb,
unilen++;
}
- if (p_cstring[i] != '\0')
- lossy |= NLS_NAME_OVERLEN;
-
*uniname = '\0';
p_uniname->name_len = unilen;
p_uniname->name_hash = exfat_calc_chksum16(upname, unilen << 1, 0,
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
index 7f9592856bf7..74d451f732c7 100644
--- a/fs/exfat/super.c
+++ b/fs/exfat/super.c
@@ -433,7 +433,10 @@ static int exfat_read_boot_sector(struct super_block *sb)
struct exfat_sb_info *sbi = EXFAT_SB(sb);
/* set block size to read super block */
- sb_min_blocksize(sb, 512);
+ if (!sb_min_blocksize(sb, 512)) {
+ exfat_err(sb, "unable to set blocksize");
+ return -EINVAL;
+ }
/* read boot sector */
sbi->boot_bh = sb_bread(sb, 0);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index e10c376843d7..dbfe9098a124 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1398,7 +1398,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
inode = iget_locked(sb, ino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
ei = EXT2_I(inode);
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index b3e9b7bd7978..a0e66bc10093 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -280,9 +280,16 @@ int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
bh, is_metadata, inode->i_mode,
test_opt(inode->i_sb, DATA_FLAGS));
- /* In the no journal case, we can just do a bforget and return */
+ /*
+ * In the no journal case, we should wait for the ongoing buffer
+ * to complete and do a forget.
+ */
if (!ext4_handle_valid(handle)) {
- bforget(bh);
+ if (bh) {
+ clear_buffer_dirty(bh);
+ wait_on_buffer(bh);
+ __bforget(bh);
+ }
return 0;
}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f9e4ac87211e..78ea864fa8cd 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -202,8 +202,7 @@ void ext4_evict_inode(struct inode *inode)
* the inode. Flush worker is ignoring it because of I_FREEING flag but
* we still need to remove the inode from the writeback lists.
*/
- if (!list_empty_careful(&inode->i_io_list))
- inode_io_list_del(inode);
+ inode_io_list_del(inode);
/*
* Protect us against freezing - iput() caller didn't have to have any
@@ -425,7 +424,7 @@ void ext4_check_map_extents_env(struct inode *inode)
if (!S_ISREG(inode->i_mode) ||
IS_NOQUOTA(inode) || IS_VERITY(inode) ||
is_special_ino(inode->i_sb, inode->i_ino) ||
- (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) ||
+ (inode_state_read_once(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) ||
ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) ||
ext4_verity_in_progress(inode))
return;
@@ -1319,8 +1318,8 @@ retry_grab:
if (IS_ERR(folio))
return PTR_ERR(folio);
- if (pos + len > folio_pos(folio) + folio_size(folio))
- len = folio_pos(folio) + folio_size(folio) - pos;
+ if (len > folio_next_pos(folio) - pos)
+ len = folio_next_pos(folio) - pos;
from = offset_in_folio(folio, pos);
to = from + len;
@@ -2619,10 +2618,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
handle_t *handle = NULL;
int bpp = ext4_journal_blocks_per_folio(mpd->inode);
- if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
- tag = PAGECACHE_TAG_TOWRITE;
- else
- tag = PAGECACHE_TAG_DIRTY;
+ tag = wbc_to_tag(mpd->wbc);
mpd->map.m_len = 0;
mpd->next_pos = mpd->start_pos;
@@ -2704,7 +2700,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
if (mpd->map.m_len == 0)
mpd->start_pos = folio_pos(folio);
- mpd->next_pos = folio_pos(folio) + folio_size(folio);
+ mpd->next_pos = folio_next_pos(folio);
/*
* Writeout when we cannot modify metadata is simple.
* Just submit the page. For data=journal mode we
@@ -3146,8 +3142,8 @@ retry:
if (IS_ERR(folio))
return PTR_ERR(folio);
- if (pos + len > folio_pos(folio) + folio_size(folio))
- len = folio_pos(folio) + folio_size(folio) - pos;
+ if (len > folio_next_pos(folio) - pos)
+ len = folio_next_pos(folio) - pos;
ret = ext4_block_write_begin(NULL, folio, pos, len,
ext4_da_get_block_prep);
@@ -3473,7 +3469,7 @@ static bool ext4_inode_datasync_dirty(struct inode *inode)
/* Any metadata buffers to write? */
if (!list_empty(&inode->i_mapping->i_private_list))
return true;
- return inode->i_state & I_DIRTY_DATASYNC;
+ return inode_state_read_once(inode) & I_DIRTY_DATASYNC;
}
static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
@@ -4552,7 +4548,7 @@ int ext4_truncate(struct inode *inode)
* or it's a completely new inode. In those cases we might not
* have i_rwsem locked because it's not necessary.
*/
- if (!(inode->i_state & (I_NEW|I_FREEING)))
+ if (!(inode_state_read_once(inode) & (I_NEW | I_FREEING)))
WARN_ON(!inode_is_locked(inode));
trace_ext4_truncate_enter(inode);
@@ -5210,7 +5206,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
inode = iget_locked(sb, ino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW)) {
+ if (!(inode_state_read_once(inode) & I_NEW)) {
ret = check_igot_inode(inode, flags, function, line);
if (ret) {
iput(inode);
@@ -5319,6 +5315,14 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
}
ei->i_flags = le32_to_cpu(raw_inode->i_flags);
ext4_set_inode_flags(inode, true);
+ /* Detect invalid flag combination - can't have both inline data and extents */
+ if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) &&
+ ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+ ext4_error_inode(inode, function, line, 0,
+ "inode has both inline data and extents flags");
+ ret = -EFSCORRUPTED;
+ goto bad_inode;
+ }
inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
if (ext4_has_feature_64bit(sb))
@@ -5541,7 +5545,7 @@ static void __ext4_update_other_inode_time(struct super_block *sb,
if (inode_is_dirtytime_only(inode)) {
struct ext4_inode_info *ei = EXT4_I(inode);
- inode->i_state &= ~I_DIRTY_TIME;
+ inode_state_clear(inode, I_DIRTY_TIME);
spin_unlock(&inode->i_lock);
spin_lock(&ei->i_raw_lock);
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index ab1ff51302fb..6f57c181ff77 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -57,16 +57,12 @@ static int write_mmp_block_thawed(struct super_block *sb,
static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
{
- int err;
-
/*
* We protect against freezing so that we don't create dirty buffers
* on frozen filesystem.
*/
- sb_start_write(sb);
- err = write_mmp_block_thawed(sb, bh);
- sb_end_write(sb);
- return err;
+ scoped_guard(super_write, sb)
+ return write_mmp_block_thawed(sb, bh);
}
/*
diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c
index 33c3a89396b1..5fd54adf0c88 100644
--- a/fs/ext4/orphan.c
+++ b/fs/ext4/orphan.c
@@ -107,7 +107,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
if (!sbi->s_journal || is_bad_inode(inode))
return 0;
- WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
+ WARN_ON_ONCE(!(inode_state_read_once(inode) & (I_NEW | I_FREEING)) &&
!inode_is_locked(inode));
if (ext4_inode_orphan_tracked(inode))
return 0;
@@ -232,7 +232,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
if (!sbi->s_journal && !(sbi->s_mount_state & EXT4_ORPHAN_FS))
return 0;
- WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
+ WARN_ON_ONCE(!(inode_state_read_once(inode) & (I_NEW | I_FREEING)) &&
!inode_is_locked(inode));
if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE))
return ext4_orphan_file_del(handle, inode);
@@ -513,7 +513,7 @@ void ext4_release_orphan_info(struct super_block *sb)
return;
for (i = 0; i < oi->of_blocks; i++)
brelse(oi->of_binfo[i].ob_bh);
- kfree(oi->of_binfo);
+ kvfree(oi->of_binfo);
}
static struct ext4_orphan_block_tail *ext4_orphan_block_tail(
@@ -637,7 +637,7 @@ int ext4_init_orphan_info(struct super_block *sb)
out_free:
for (i--; i >= 0; i--)
brelse(oi->of_binfo[i].ob_bh);
- kfree(oi->of_binfo);
+ kvfree(oi->of_binfo);
out_put:
iput(inode);
return ret;
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index d4d7f329d23f..fa8d81a30fb9 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -9,6 +9,7 @@
*
* Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
*/
+#include <linux/fs_struct.h>
#include <linux/f2fs_fs.h>
#include "f2fs.h"
#include "xattr.h"
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index 6ad8d3bc6df7..be53e06caf3d 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -1329,7 +1329,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
}
folio = page_folio(cc->rpages[last_index]);
- psize = folio_pos(folio) + folio_size(folio);
+ psize = folio_next_pos(folio);
err = f2fs_get_node_info(fio.sbi, dn.nid, &ni, false);
if (err)
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index ef38e62cda8f..8bf4feda42b0 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1497,8 +1497,8 @@ static bool f2fs_map_blocks_cached(struct inode *inode,
struct f2fs_dev_info *dev = &sbi->devs[bidx];
map->m_bdev = dev->bdev;
- map->m_pblk -= dev->start_blk;
map->m_len = min(map->m_len, dev->end_blk + 1 - map->m_pblk);
+ map->m_pblk -= dev->start_blk;
} else {
map->m_bdev = inode->i_sb->s_bdev;
}
@@ -2986,10 +2986,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
}
- if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
- tag = PAGECACHE_TAG_TOWRITE;
- else
- tag = PAGECACHE_TAG_DIRTY;
+ tag = wbc_to_tag(wbc);
retry:
retry = 0;
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
@@ -4222,7 +4219,7 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
if (map.m_flags & F2FS_MAP_NEW)
iomap->flags |= IOMAP_F_NEW;
- if ((inode->i_state & I_DIRTY_DATASYNC) ||
+ if ((inode_state_read_once(inode) & I_DIRTY_DATASYNC) ||
offset + length > i_size_read(inode))
iomap->flags |= IOMAP_F_DIRTY;
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 8c4eafe9ffac..f1cda1900658 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -569,7 +569,7 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW)) {
+ if (!(inode_state_read_once(inode) & I_NEW)) {
if (is_meta_ino(sbi, ino)) {
f2fs_err(sbi, "inaccessible inode: %lu, run fsck to repair", ino);
set_sbi_flag(sbi, SBI_NEED_FSCK);
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index b882771e4699..af40282a6948 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -844,7 +844,7 @@ static int __f2fs_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
f2fs_i_links_write(inode, false);
spin_lock(&inode->i_lock);
- inode->i_state |= I_LINKABLE;
+ inode_state_set(inode, I_LINKABLE);
spin_unlock(&inode->i_lock);
} else {
if (file)
@@ -1057,7 +1057,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
goto put_out_dir;
spin_lock(&whiteout->i_lock);
- whiteout->i_state &= ~I_LINKABLE;
+ inode_state_clear(whiteout, I_LINKABLE);
spin_unlock(&whiteout->i_lock);
iput(whiteout);
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index fd8e7b0b2166..47489d48f2b9 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1798,7 +1798,7 @@ static int f2fs_drop_inode(struct inode *inode)
* - f2fs_gc -> iput -> evict
* - inode_wait_for_writeback(inode)
*/
- if ((!inode_unhashed(inode) && inode->i_state & I_SYNC)) {
+ if ((!inode_unhashed(inode) && inode_state_read(inode) & I_SYNC)) {
if (!inode->i_nlink && !is_bad_inode(inode)) {
/* to avoid evict_inode call simultaneously */
__iget(inode);
@@ -1820,7 +1820,7 @@ static int f2fs_drop_inode(struct inode *inode)
sb_end_intwrite(inode->i_sb);
spin_lock(&inode->i_lock);
- iput(inode);
+ atomic_dec(&inode->i_count);
}
trace_f2fs_drop_inode(inode, 0);
return 0;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 9648ed097816..0b6009cd1844 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -22,6 +22,7 @@
#include <linux/unaligned.h>
#include <linux/random.h>
#include <linux/iversion.h>
+#include <linux/fs_struct.h>
#include "fat.h"
#ifndef CONFIG_FAT_DEFAULT_IOCHARSET
@@ -1595,8 +1596,12 @@ int fat_fill_super(struct super_block *sb, struct fs_context *fc,
setup(sb); /* flavour-specific stuff that needs options */
+ error = -EINVAL;
+ if (!sb_min_blocksize(sb, 512)) {
+ fat_msg(sb, KERN_ERR, "unable to set blocksize");
+ goto out_fail;
+ }
error = -EIO;
- sb_min_blocksize(sb, 512);
bh = sb_bread(sb, 0);
if (bh == NULL) {
fat_msg(sb, KERN_ERR, "unable to read boot sector");
diff --git a/fs/file.c b/fs/file.c
index 28743b742e3c..3f56890068aa 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -641,6 +641,34 @@ void put_unused_fd(unsigned int fd)
EXPORT_SYMBOL(put_unused_fd);
+/*
+ * Install a file pointer in the fd array while it is being resized.
+ *
+ * We need to make sure our update to the array does not get lost as the resizing
+ * thread can be copying the content as we modify it.
+ *
+ * We have two ways to do it:
+ * - go off CPU waiting for resize_in_progress to clear
+ * - take the spin lock
+ *
+ * The latter is trivial to implement and saves us from having to might_sleep()
+ * for debugging purposes.
+ *
+ * This is moved out of line from fd_install() to convince gcc to optimize that
+ * routine better.
+ */
+static void noinline fd_install_slowpath(unsigned int fd, struct file *file)
+{
+ struct files_struct *files = current->files;
+ struct fdtable *fdt;
+
+ spin_lock(&files->file_lock);
+ fdt = files_fdtable(files);
+ VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL);
+ rcu_assign_pointer(fdt->fd[fd], file);
+ spin_unlock(&files->file_lock);
+}
+
/**
* fd_install - install a file pointer in the fd array
* @fd: file descriptor to install the file in
@@ -658,14 +686,9 @@ void fd_install(unsigned int fd, struct file *file)
return;
rcu_read_lock_sched();
-
if (unlikely(files->resize_in_progress)) {
rcu_read_unlock_sched();
- spin_lock(&files->file_lock);
- fdt = files_fdtable(files);
- VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL);
- rcu_assign_pointer(fdt->fd[fd], file);
- spin_unlock(&files->file_lock);
+ fd_install_slowpath(fd, file);
return;
}
/* coupled with smp_wmb() in expand_fdtable() */
diff --git a/fs/file_attr.c b/fs/file_attr.c
index 12424d4945d0..4c4916632f11 100644
--- a/fs/file_attr.c
+++ b/fs/file_attr.c
@@ -84,7 +84,7 @@ int vfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
int error;
if (!inode->i_op->fileattr_get)
- return -EOPNOTSUPP;
+ return -ENOIOCTLCMD;
error = security_inode_file_getattr(dentry, fa);
if (error)
@@ -270,7 +270,7 @@ int vfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry,
int err;
if (!inode->i_op->fileattr_set)
- return -EOPNOTSUPP;
+ return -ENOIOCTLCMD;
if (!inode_owner_or_capable(idmap, inode))
return -EPERM;
@@ -312,13 +312,10 @@ int ioctl_getflags(struct file *file, unsigned int __user *argp)
int err;
err = vfs_fileattr_get(file->f_path.dentry, &fa);
- if (err == -EOPNOTSUPP)
- err = -ENOIOCTLCMD;
if (!err)
err = put_user(fa.flags, argp);
return err;
}
-EXPORT_SYMBOL(ioctl_getflags);
int ioctl_setflags(struct file *file, unsigned int __user *argp)
{
@@ -335,13 +332,10 @@ int ioctl_setflags(struct file *file, unsigned int __user *argp)
fileattr_fill_flags(&fa, flags);
err = vfs_fileattr_set(idmap, dentry, &fa);
mnt_drop_write_file(file);
- if (err == -EOPNOTSUPP)
- err = -ENOIOCTLCMD;
}
}
return err;
}
-EXPORT_SYMBOL(ioctl_setflags);
int ioctl_fsgetxattr(struct file *file, void __user *argp)
{
@@ -349,14 +343,11 @@ int ioctl_fsgetxattr(struct file *file, void __user *argp)
int err;
err = vfs_fileattr_get(file->f_path.dentry, &fa);
- if (err == -EOPNOTSUPP)
- err = -ENOIOCTLCMD;
if (!err)
err = copy_fsxattr_to_user(&fa, argp);
return err;
}
-EXPORT_SYMBOL(ioctl_fsgetxattr);
int ioctl_fssetxattr(struct file *file, void __user *argp)
{
@@ -371,13 +362,10 @@ int ioctl_fssetxattr(struct file *file, void __user *argp)
if (!err) {
err = vfs_fileattr_set(idmap, dentry, &fa);
mnt_drop_write_file(file);
- if (err == -EOPNOTSUPP)
- err = -ENOIOCTLCMD;
}
}
return err;
}
-EXPORT_SYMBOL(ioctl_fssetxattr);
SYSCALL_DEFINE5(file_getattr, int, dfd, const char __user *, filename,
struct file_attr __user *, ufattr, size_t, usize,
@@ -424,6 +412,8 @@ SYSCALL_DEFINE5(file_getattr, int, dfd, const char __user *, filename,
}
error = vfs_fileattr_get(filepath.dentry, &fa);
+ if (error == -ENOIOCTLCMD || error == -ENOTTY)
+ error = -EOPNOTSUPP;
if (error)
return error;
@@ -491,6 +481,8 @@ SYSCALL_DEFINE5(file_setattr, int, dfd, const char __user *, filename,
if (!error) {
error = vfs_fileattr_set(mnt_idmap(filepath.mnt),
filepath.dentry, &fa);
+ if (error == -ENOIOCTLCMD || error == -ENOTTY)
+ error = -EOPNOTSUPP;
mnt_drop_write(filepath.mnt);
}
diff --git a/fs/file_table.c b/fs/file_table.c
index b223d873e48b..cd4a3db4659a 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -192,7 +192,7 @@ static int init_file(struct file *f, int flags, const struct cred *cred)
f->f_sb_err = 0;
/*
- * We're SLAB_TYPESAFE_BY_RCU so initialize f_count last. While
+ * We're SLAB_TYPESAFE_BY_RCU so initialize f_ref last. While
* fget-rcu pattern users need to be able to handle spurious
* refcount bumps we should reinitialize the reused file first.
*/
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index 20600e9ea202..21fc94b98209 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -258,7 +258,7 @@ vxfs_iget(struct super_block *sbp, ino_t ino)
ip = iget_locked(sbp, ino);
if (!ip)
return ERR_PTR(-ENOMEM);
- if (!(ip->i_state & I_NEW))
+ if (!(inode_state_read_once(ip) & I_NEW))
return ip;
vip = VXFS_INO(ip);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 2b35e80037fe..6800886c4d10 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -14,6 +14,7 @@
* Additions for address_space-based writeback
*/
+#include <linux/sched/sysctl.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/spinlock.h>
@@ -32,11 +33,6 @@
#include "internal.h"
/*
- * 4MB minimal write chunk size
- */
-#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_SHIFT - 10))
-
-/*
* Passed into wb_writeback(), essentially a subset of writeback_control
*/
struct wb_writeback_work {
@@ -121,7 +117,7 @@ static bool inode_io_list_move_locked(struct inode *inode,
{
assert_spin_locked(&wb->list_lock);
assert_spin_locked(&inode->i_lock);
- WARN_ON_ONCE(inode->i_state & I_FREEING);
+ WARN_ON_ONCE(inode_state_read(inode) & I_FREEING);
list_move(&inode->i_io_list, head);
@@ -200,6 +196,19 @@ static void wb_queue_work(struct bdi_writeback *wb,
spin_unlock_irq(&wb->work_lock);
}
+static bool wb_wait_for_completion_cb(struct wb_completion *done)
+{
+ unsigned long waited_secs = (jiffies - done->wait_start) / HZ;
+
+ done->progress_stamp = jiffies;
+ if (waited_secs > sysctl_hung_task_timeout_secs)
+ pr_info("INFO: The task %s:%d has been waiting for writeback "
+ "completion for more than %lu seconds.",
+ current->comm, current->pid, waited_secs);
+
+ return !atomic_read(&done->cnt);
+}
+
/**
* wb_wait_for_completion - wait for completion of bdi_writeback_works
* @done: target wb_completion
@@ -212,8 +221,9 @@ static void wb_queue_work(struct bdi_writeback *wb,
*/
void wb_wait_for_completion(struct wb_completion *done)
{
+ done->wait_start = jiffies;
atomic_dec(&done->cnt); /* put down the initial count */
- wait_event(*done->waitq, !atomic_read(&done->cnt));
+ wait_event(*done->waitq, wb_wait_for_completion_cb(done));
}
#ifdef CONFIG_CGROUP_WRITEBACK
@@ -304,9 +314,9 @@ static void inode_cgwb_move_to_attached(struct inode *inode,
{
assert_spin_locked(&wb->list_lock);
assert_spin_locked(&inode->i_lock);
- WARN_ON_ONCE(inode->i_state & I_FREEING);
+ WARN_ON_ONCE(inode_state_read(inode) & I_FREEING);
- inode->i_state &= ~I_SYNC_QUEUED;
+ inode_state_clear(inode, I_SYNC_QUEUED);
if (wb != &wb->bdi->wb)
list_move(&inode->i_io_list, &wb->b_attached);
else
@@ -408,7 +418,7 @@ static bool inode_do_switch_wbs(struct inode *inode,
* Once I_FREEING or I_WILL_FREE are visible under i_lock, the eviction
* path owns the inode and we shouldn't modify ->i_io_list.
*/
- if (unlikely(inode->i_state & (I_FREEING | I_WILL_FREE)))
+ if (unlikely(inode_state_read(inode) & (I_FREEING | I_WILL_FREE)))
goto skip_switch;
trace_inode_switch_wbs(inode, old_wb, new_wb);
@@ -451,7 +461,7 @@ static bool inode_do_switch_wbs(struct inode *inode,
if (!list_empty(&inode->i_io_list)) {
inode->i_wb = new_wb;
- if (inode->i_state & I_DIRTY_ALL) {
+ if (inode_state_read(inode) & I_DIRTY_ALL) {
/*
* We need to keep b_dirty list sorted by
* dirtied_time_when. However properly sorting the
@@ -476,10 +486,11 @@ static bool inode_do_switch_wbs(struct inode *inode,
switched = true;
skip_switch:
/*
- * Paired with load_acquire in unlocked_inode_to_wb_begin() and
+ * Paired with an acquire fence in unlocked_inode_to_wb_begin() and
* ensures that the new wb is visible if they see !I_WB_SWITCH.
*/
- smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH);
+ smp_wmb();
+ inode_state_clear(inode, I_WB_SWITCH);
xa_unlock_irq(&mapping->i_pages);
spin_unlock(&inode->i_lock);
@@ -600,12 +611,12 @@ static bool inode_prepare_wbs_switch(struct inode *inode,
/* while holding I_WB_SWITCH, no one else can update the association */
spin_lock(&inode->i_lock);
if (!(inode->i_sb->s_flags & SB_ACTIVE) ||
- inode->i_state & (I_WB_SWITCH | I_FREEING | I_WILL_FREE) ||
+ inode_state_read(inode) & (I_WB_SWITCH | I_FREEING | I_WILL_FREE) ||
inode_to_wb(inode) == new_wb) {
spin_unlock(&inode->i_lock);
return false;
}
- inode->i_state |= I_WB_SWITCH;
+ inode_state_set(inode, I_WB_SWITCH);
__iget(inode);
spin_unlock(&inode->i_lock);
@@ -635,7 +646,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
struct bdi_writeback *new_wb = NULL;
/* noop if seems to be already in progress */
- if (inode->i_state & I_WB_SWITCH)
+ if (inode_state_read_once(inode) & I_WB_SWITCH)
return;
/* avoid queueing a new switch if too many are already in flight */
@@ -807,9 +818,9 @@ static void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
* @wbc: writeback_control of interest
* @inode: target inode
*
- * This function is to be used by __filemap_fdatawrite_range(), which is an
- * alternative entry point into writeback code, and first ensures @inode is
- * associated with a bdi_writeback and attaches it to @wbc.
+ * This function is to be used by filemap_writeback(), which is an alternative
+ * entry point into writeback code, and first ensures @inode is associated with
+ * a bdi_writeback and attaches it to @wbc.
*/
void wbc_attach_fdatawrite_inode(struct writeback_control *wbc,
struct inode *inode)
@@ -1236,9 +1247,9 @@ static void inode_cgwb_move_to_attached(struct inode *inode,
{
assert_spin_locked(&wb->list_lock);
assert_spin_locked(&inode->i_lock);
- WARN_ON_ONCE(inode->i_state & I_FREEING);
+ WARN_ON_ONCE(inode_state_read(inode) & I_FREEING);
- inode->i_state &= ~I_SYNC_QUEUED;
+ inode_state_clear(inode, I_SYNC_QUEUED);
list_del_init(&inode->i_io_list);
wb_io_lists_depopulated(wb);
}
@@ -1348,10 +1359,17 @@ void inode_io_list_del(struct inode *inode)
{
struct bdi_writeback *wb;
+ /*
+ * FIXME: ext4 can call here from ext4_evict_inode() after evict() already
+ * unlinked the inode.
+ */
+ if (list_empty_careful(&inode->i_io_list))
+ return;
+
wb = inode_to_wb_and_lock_list(inode);
spin_lock(&inode->i_lock);
- inode->i_state &= ~I_SYNC_QUEUED;
+ inode_state_clear(inode, I_SYNC_QUEUED);
list_del_init(&inode->i_io_list);
wb_io_lists_depopulated(wb);
@@ -1409,13 +1427,13 @@ static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb)
{
assert_spin_locked(&inode->i_lock);
- inode->i_state &= ~I_SYNC_QUEUED;
+ inode_state_clear(inode, I_SYNC_QUEUED);
/*
* When the inode is being freed just don't bother with dirty list
* tracking. Flush worker will ignore this inode anyway and it will
* trigger assertions in inode_io_list_move_locked().
*/
- if (inode->i_state & I_FREEING) {
+ if (inode_state_read(inode) & I_FREEING) {
list_del_init(&inode->i_io_list);
wb_io_lists_depopulated(wb);
return;
@@ -1449,9 +1467,9 @@ static void inode_sync_complete(struct inode *inode)
{
assert_spin_locked(&inode->i_lock);
- inode->i_state &= ~I_SYNC;
+ inode_state_clear(inode, I_SYNC);
/* If inode is clean an unused, put it into LRU now... */
- inode_add_lru(inode);
+ inode_lru_list_add(inode);
/* Called with inode->i_lock which ensures memory ordering. */
inode_wake_up_bit(inode, __I_SYNC);
}
@@ -1493,7 +1511,7 @@ static int move_expired_inodes(struct list_head *delaying_queue,
spin_lock(&inode->i_lock);
list_move(&inode->i_io_list, &tmp);
moved++;
- inode->i_state |= I_SYNC_QUEUED;
+ inode_state_set(inode, I_SYNC_QUEUED);
spin_unlock(&inode->i_lock);
if (sb_is_blkdev_sb(inode->i_sb))
continue;
@@ -1579,14 +1597,14 @@ void inode_wait_for_writeback(struct inode *inode)
assert_spin_locked(&inode->i_lock);
- if (!(inode->i_state & I_SYNC))
+ if (!(inode_state_read(inode) & I_SYNC))
return;
wq_head = inode_bit_waitqueue(&wqe, inode, __I_SYNC);
for (;;) {
prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE);
/* Checking I_SYNC with inode->i_lock guarantees memory ordering. */
- if (!(inode->i_state & I_SYNC))
+ if (!(inode_state_read(inode) & I_SYNC))
break;
spin_unlock(&inode->i_lock);
schedule();
@@ -1612,7 +1630,7 @@ static void inode_sleep_on_writeback(struct inode *inode)
wq_head = inode_bit_waitqueue(&wqe, inode, __I_SYNC);
prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE);
/* Checking I_SYNC with inode->i_lock guarantees memory ordering. */
- sleep = !!(inode->i_state & I_SYNC);
+ sleep = !!(inode_state_read(inode) & I_SYNC);
spin_unlock(&inode->i_lock);
if (sleep)
schedule();
@@ -1631,7 +1649,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
struct writeback_control *wbc,
unsigned long dirtied_before)
{
- if (inode->i_state & I_FREEING)
+ if (inode_state_read(inode) & I_FREEING)
return;
/*
@@ -1639,7 +1657,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
* shot. If still dirty, it will be redirty_tail()'ed below. Update
* the dirty time to prevent enqueue and sync it again.
*/
- if ((inode->i_state & I_DIRTY) &&
+ if ((inode_state_read(inode) & I_DIRTY) &&
(wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
inode->dirtied_when = jiffies;
@@ -1650,7 +1668,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
* is odd for clean inodes, it can happen for some
* filesystems so handle that gracefully.
*/
- if (inode->i_state & I_DIRTY_ALL)
+ if (inode_state_read(inode) & I_DIRTY_ALL)
redirty_tail_locked(inode, wb);
else
inode_cgwb_move_to_attached(inode, wb);
@@ -1676,17 +1694,17 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
*/
redirty_tail_locked(inode, wb);
}
- } else if (inode->i_state & I_DIRTY) {
+ } else if (inode_state_read(inode) & I_DIRTY) {
/*
* Filesystems can dirty the inode during writeback operations,
* such as delayed allocation during submission or metadata
* updates after data IO completion.
*/
redirty_tail_locked(inode, wb);
- } else if (inode->i_state & I_DIRTY_TIME) {
+ } else if (inode_state_read(inode) & I_DIRTY_TIME) {
inode->dirtied_when = jiffies;
inode_io_list_move_locked(inode, wb, &wb->b_dirty_time);
- inode->i_state &= ~I_SYNC_QUEUED;
+ inode_state_clear(inode, I_SYNC_QUEUED);
} else {
/* The inode is clean. Remove from writeback lists. */
inode_cgwb_move_to_attached(inode, wb);
@@ -1712,7 +1730,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
unsigned dirty;
int ret;
- WARN_ON(!(inode->i_state & I_SYNC));
+ WARN_ON(!(inode_state_read_once(inode) & I_SYNC));
trace_writeback_single_inode_start(inode, wbc, nr_to_write);
@@ -1736,7 +1754,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
* mark_inode_dirty_sync() to notify the filesystem about it and to
* change I_DIRTY_TIME into I_DIRTY_SYNC.
*/
- if ((inode->i_state & I_DIRTY_TIME) &&
+ if ((inode_state_read_once(inode) & I_DIRTY_TIME) &&
(wbc->sync_mode == WB_SYNC_ALL ||
time_after(jiffies, inode->dirtied_time_when +
dirtytime_expire_interval * HZ))) {
@@ -1751,8 +1769,8 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
* after handling timestamp expiration, as that may dirty the inode too.
*/
spin_lock(&inode->i_lock);
- dirty = inode->i_state & I_DIRTY;
- inode->i_state &= ~dirty;
+ dirty = inode_state_read(inode) & I_DIRTY;
+ inode_state_clear(inode, dirty);
/*
* Paired with smp_mb() in __mark_inode_dirty(). This allows
@@ -1768,10 +1786,10 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
smp_mb();
if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
- inode->i_state |= I_DIRTY_PAGES;
- else if (unlikely(inode->i_state & I_PINNING_NETFS_WB)) {
- if (!(inode->i_state & I_DIRTY_PAGES)) {
- inode->i_state &= ~I_PINNING_NETFS_WB;
+ inode_state_set(inode, I_DIRTY_PAGES);
+ else if (unlikely(inode_state_read(inode) & I_PINNING_NETFS_WB)) {
+ if (!(inode_state_read(inode) & I_DIRTY_PAGES)) {
+ inode_state_clear(inode, I_PINNING_NETFS_WB);
wbc->unpinned_netfs_wb = true;
dirty |= I_PINNING_NETFS_WB; /* Cause write_inode */
}
@@ -1807,11 +1825,11 @@ static int writeback_single_inode(struct inode *inode,
spin_lock(&inode->i_lock);
if (!icount_read(inode))
- WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
+ WARN_ON(!(inode_state_read(inode) & (I_WILL_FREE | I_FREEING)));
else
- WARN_ON(inode->i_state & I_WILL_FREE);
+ WARN_ON(inode_state_read(inode) & I_WILL_FREE);
- if (inode->i_state & I_SYNC) {
+ if (inode_state_read(inode) & I_SYNC) {
/*
* Writeback is already running on the inode. For WB_SYNC_NONE,
* that's enough and we can just return. For WB_SYNC_ALL, we
@@ -1822,7 +1840,7 @@ static int writeback_single_inode(struct inode *inode,
goto out;
inode_wait_for_writeback(inode);
}
- WARN_ON(inode->i_state & I_SYNC);
+ WARN_ON(inode_state_read(inode) & I_SYNC);
/*
* If the inode is already fully clean, then there's nothing to do.
*
@@ -1830,11 +1848,11 @@ static int writeback_single_inode(struct inode *inode,
* still under writeback, e.g. due to prior WB_SYNC_NONE writeback. If
* there are any such pages, we'll need to wait for them.
*/
- if (!(inode->i_state & I_DIRTY_ALL) &&
+ if (!(inode_state_read(inode) & I_DIRTY_ALL) &&
(wbc->sync_mode != WB_SYNC_ALL ||
!mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
goto out;
- inode->i_state |= I_SYNC;
+ inode_state_set(inode, I_SYNC);
wbc_attach_and_unlock_inode(wbc, inode);
ret = __writeback_single_inode(inode, wbc);
@@ -1847,18 +1865,18 @@ static int writeback_single_inode(struct inode *inode,
* If the inode is freeing, its i_io_list shoudn't be updated
* as it can be finally deleted at this moment.
*/
- if (!(inode->i_state & I_FREEING)) {
+ if (!(inode_state_read(inode) & I_FREEING)) {
/*
* If the inode is now fully clean, then it can be safely
* removed from its writeback list (if any). Otherwise the
* flusher threads are responsible for the writeback lists.
*/
- if (!(inode->i_state & I_DIRTY_ALL))
+ if (!(inode_state_read(inode) & I_DIRTY_ALL))
inode_cgwb_move_to_attached(inode, wb);
- else if (!(inode->i_state & I_SYNC_QUEUED)) {
- if ((inode->i_state & I_DIRTY))
+ else if (!(inode_state_read(inode) & I_SYNC_QUEUED)) {
+ if ((inode_state_read(inode) & I_DIRTY))
redirty_tail_locked(inode, wb);
- else if (inode->i_state & I_DIRTY_TIME) {
+ else if (inode_state_read(inode) & I_DIRTY_TIME) {
inode->dirtied_when = jiffies;
inode_io_list_move_locked(inode,
wb,
@@ -1874,8 +1892,8 @@ out:
return ret;
}
-static long writeback_chunk_size(struct bdi_writeback *wb,
- struct wb_writeback_work *work)
+static long writeback_chunk_size(struct super_block *sb,
+ struct bdi_writeback *wb, struct wb_writeback_work *work)
{
long pages;
@@ -1893,16 +1911,13 @@ static long writeback_chunk_size(struct bdi_writeback *wb,
* (maybe slowly) sync all tagged pages
*/
if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
- pages = LONG_MAX;
- else {
- pages = min(wb->avg_write_bandwidth / 2,
- global_wb_domain.dirty_limit / DIRTY_SCOPE);
- pages = min(pages, work->nr_pages);
- pages = round_down(pages + MIN_WRITEBACK_PAGES,
- MIN_WRITEBACK_PAGES);
- }
+ return LONG_MAX;
- return pages;
+ pages = min(wb->avg_write_bandwidth / 2,
+ global_wb_domain.dirty_limit / DIRTY_SCOPE);
+ pages = min(pages, work->nr_pages);
+ return round_down(pages + sb->s_min_writeback_pages,
+ sb->s_min_writeback_pages);
}
/*
@@ -1967,12 +1982,12 @@ static long writeback_sb_inodes(struct super_block *sb,
* kind writeout is handled by the freer.
*/
spin_lock(&inode->i_lock);
- if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
+ if (inode_state_read(inode) & (I_NEW | I_FREEING | I_WILL_FREE)) {
redirty_tail_locked(inode, wb);
spin_unlock(&inode->i_lock);
continue;
}
- if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
+ if ((inode_state_read(inode) & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
/*
* If this inode is locked for writeback and we are not
* doing writeback-for-data-integrity, move it to
@@ -1994,17 +2009,17 @@ static long writeback_sb_inodes(struct super_block *sb,
* are doing WB_SYNC_NONE writeback. So this catches only the
* WB_SYNC_ALL case.
*/
- if (inode->i_state & I_SYNC) {
+ if (inode_state_read(inode) & I_SYNC) {
/* Wait for I_SYNC. This function drops i_lock... */
inode_sleep_on_writeback(inode);
/* Inode may be gone, start again */
spin_lock(&wb->list_lock);
continue;
}
- inode->i_state |= I_SYNC;
+ inode_state_set(inode, I_SYNC);
wbc_attach_and_unlock_inode(&wbc, inode);
- write_chunk = writeback_chunk_size(wb, work);
+ write_chunk = writeback_chunk_size(inode->i_sb, wb, work);
wbc.nr_to_write = write_chunk;
wbc.pages_skipped = 0;
@@ -2014,6 +2029,12 @@ static long writeback_sb_inodes(struct super_block *sb,
*/
__writeback_single_inode(inode, &wbc);
+ /* Report progress to inform the hung task detector of the progress. */
+ if (work->done && work->done->progress_stamp &&
+ (jiffies - work->done->progress_stamp) > HZ *
+ sysctl_hung_task_timeout_secs / 2)
+ wake_up_all(work->done->waitq);
+
wbc_detach_inode(&wbc);
work->nr_pages -= write_chunk - wbc.nr_to_write;
wrote = write_chunk - wbc.nr_to_write - wbc.pages_skipped;
@@ -2039,7 +2060,7 @@ static long writeback_sb_inodes(struct super_block *sb,
*/
tmp_wb = inode_to_wb_and_lock_list(inode);
spin_lock(&inode->i_lock);
- if (!(inode->i_state & I_DIRTY_ALL))
+ if (!(inode_state_read(inode) & I_DIRTY_ALL))
total_wrote++;
requeue_inode(inode, tmp_wb, &wbc, dirtied_before);
inode_sync_complete(inode);
@@ -2545,10 +2566,10 @@ void __mark_inode_dirty(struct inode *inode, int flags)
* We tell ->dirty_inode callback that timestamps need to
* be updated by setting I_DIRTY_TIME in flags.
*/
- if (inode->i_state & I_DIRTY_TIME) {
+ if (inode_state_read_once(inode) & I_DIRTY_TIME) {
spin_lock(&inode->i_lock);
- if (inode->i_state & I_DIRTY_TIME) {
- inode->i_state &= ~I_DIRTY_TIME;
+ if (inode_state_read(inode) & I_DIRTY_TIME) {
+ inode_state_clear(inode, I_DIRTY_TIME);
flags |= I_DIRTY_TIME;
}
spin_unlock(&inode->i_lock);
@@ -2585,16 +2606,16 @@ void __mark_inode_dirty(struct inode *inode, int flags)
*/
smp_mb();
- if ((inode->i_state & flags) == flags)
+ if ((inode_state_read_once(inode) & flags) == flags)
return;
spin_lock(&inode->i_lock);
- if ((inode->i_state & flags) != flags) {
- const int was_dirty = inode->i_state & I_DIRTY;
+ if ((inode_state_read(inode) & flags) != flags) {
+ const int was_dirty = inode_state_read(inode) & I_DIRTY;
inode_attach_wb(inode, NULL);
- inode->i_state |= flags;
+ inode_state_set(inode, flags);
/*
* Grab inode's wb early because it requires dropping i_lock and we
@@ -2613,7 +2634,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
* the inode it will place it on the appropriate superblock
* list, based upon its state.
*/
- if (inode->i_state & I_SYNC_QUEUED)
+ if (inode_state_read(inode) & I_SYNC_QUEUED)
goto out_unlock;
/*
@@ -2624,7 +2645,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
if (inode_unhashed(inode))
goto out_unlock;
}
- if (inode->i_state & I_FREEING)
+ if (inode_state_read(inode) & I_FREEING)
goto out_unlock;
/*
@@ -2639,7 +2660,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
if (dirtytime)
inode->dirtied_time_when = jiffies;
- if (inode->i_state & I_DIRTY)
+ if (inode_state_read(inode) & I_DIRTY)
dirty_list = &wb->b_dirty;
else
dirty_list = &wb->b_dirty_time;
@@ -2736,7 +2757,7 @@ static void wait_sb_inodes(struct super_block *sb)
spin_unlock_irq(&sb->s_inode_wblist_lock);
spin_lock(&inode->i_lock);
- if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
+ if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) {
spin_unlock(&inode->i_lock);
spin_lock_irq(&sb->s_inode_wblist_lock);
diff --git a/fs/fs_types.c b/fs/fs_dirent.c
index 78365e5dc08c..e5e08f213816 100644
--- a/fs/fs_types.c
+++ b/fs/fs_dirent.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-#include <linux/fs.h>
+#include <linux/fs_dirent.h>
#include <linux/export.h>
/*
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index 28be762ac1c6..b8c46c5a38a0 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -146,12 +146,6 @@ int unshare_fs_struct(void)
}
EXPORT_SYMBOL_GPL(unshare_fs_struct);
-int current_umask(void)
-{
- return current->fs->umask;
-}
-EXPORT_SYMBOL(current_umask);
-
/* to be mentioned only in INIT_TASK */
struct fs_struct init_fs = {
.users = 1,
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index ecaec0fea3a1..316922d5dd13 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1192,7 +1192,7 @@ static void fuse_fillattr(struct mnt_idmap *idmap, struct inode *inode,
if (attr->blksize != 0)
blkbits = ilog2(attr->blksize);
else
- blkbits = fc->blkbits;
+ blkbits = inode->i_sb->s_blocksize_bits;
stat->blksize = 1 << blkbits;
}
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index f1ef77a0be05..7bcb650a9f26 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -834,23 +834,142 @@ static int fuse_do_readfolio(struct file *file, struct folio *folio,
return 0;
}
+static int fuse_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+ unsigned int flags, struct iomap *iomap,
+ struct iomap *srcmap)
+{
+ iomap->type = IOMAP_MAPPED;
+ iomap->length = length;
+ iomap->offset = offset;
+ return 0;
+}
+
+static const struct iomap_ops fuse_iomap_ops = {
+ .iomap_begin = fuse_iomap_begin,
+};
+
+struct fuse_fill_read_data {
+ struct file *file;
+
+ /* Fields below are used if sending the read request asynchronously */
+ struct fuse_conn *fc;
+ struct fuse_io_args *ia;
+ unsigned int nr_bytes;
+};
+
+/* forward declarations */
+static bool fuse_folios_need_send(struct fuse_conn *fc, loff_t pos,
+ unsigned len, struct fuse_args_pages *ap,
+ unsigned cur_bytes, bool write);
+static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file,
+ unsigned int count, bool async);
+
+static int fuse_handle_readahead(struct folio *folio,
+ struct readahead_control *rac,
+ struct fuse_fill_read_data *data, loff_t pos,
+ size_t len)
+{
+ struct fuse_io_args *ia = data->ia;
+ size_t off = offset_in_folio(folio, pos);
+ struct fuse_conn *fc = data->fc;
+ struct fuse_args_pages *ap;
+ unsigned int nr_pages;
+
+ if (ia && fuse_folios_need_send(fc, pos, len, &ia->ap, data->nr_bytes,
+ false)) {
+ fuse_send_readpages(ia, data->file, data->nr_bytes,
+ fc->async_read);
+ data->nr_bytes = 0;
+ data->ia = NULL;
+ ia = NULL;
+ }
+ if (!ia) {
+ if (fc->num_background >= fc->congestion_threshold &&
+ rac->ra->async_size >= readahead_count(rac))
+ /*
+ * Congested and only async pages left, so skip the
+ * rest.
+ */
+ return -EAGAIN;
+
+ nr_pages = min(fc->max_pages, readahead_count(rac));
+ data->ia = fuse_io_alloc(NULL, nr_pages);
+ if (!data->ia)
+ return -ENOMEM;
+ ia = data->ia;
+ }
+ folio_get(folio);
+ ap = &ia->ap;
+ ap->folios[ap->num_folios] = folio;
+ ap->descs[ap->num_folios].offset = off;
+ ap->descs[ap->num_folios].length = len;
+ data->nr_bytes += len;
+ ap->num_folios++;
+
+ return 0;
+}
+
+static int fuse_iomap_read_folio_range_async(const struct iomap_iter *iter,
+ struct iomap_read_folio_ctx *ctx,
+ size_t len)
+{
+ struct fuse_fill_read_data *data = ctx->read_ctx;
+ struct folio *folio = ctx->cur_folio;
+ loff_t pos = iter->pos;
+ size_t off = offset_in_folio(folio, pos);
+ struct file *file = data->file;
+ int ret;
+
+ if (ctx->rac) {
+ ret = fuse_handle_readahead(folio, ctx->rac, data, pos, len);
+ } else {
+ /*
+ * for non-readahead read requests, do reads synchronously
+ * since it's not guaranteed that the server can handle
+ * out-of-order reads
+ */
+ ret = fuse_do_readfolio(file, folio, off, len);
+ if (!ret)
+ iomap_finish_folio_read(folio, off, len, ret);
+ }
+ return ret;
+}
+
+static void fuse_iomap_read_submit(struct iomap_read_folio_ctx *ctx)
+{
+ struct fuse_fill_read_data *data = ctx->read_ctx;
+
+ if (data->ia)
+ fuse_send_readpages(data->ia, data->file, data->nr_bytes,
+ data->fc->async_read);
+}
+
+static const struct iomap_read_ops fuse_iomap_read_ops = {
+ .read_folio_range = fuse_iomap_read_folio_range_async,
+ .submit_read = fuse_iomap_read_submit,
+};
+
static int fuse_read_folio(struct file *file, struct folio *folio)
{
struct inode *inode = folio->mapping->host;
- int err;
+ struct fuse_fill_read_data data = {
+ .file = file,
+ };
+ struct iomap_read_folio_ctx ctx = {
+ .cur_folio = folio,
+ .ops = &fuse_iomap_read_ops,
+ .read_ctx = &data,
- err = -EIO;
- if (fuse_is_bad(inode))
- goto out;
+ };
- err = fuse_do_readfolio(file, folio, 0, folio_size(folio));
- if (!err)
- folio_mark_uptodate(folio);
+ if (fuse_is_bad(inode)) {
+ folio_unlock(folio);
+ return -EIO;
+ }
+ iomap_read_folio(&fuse_iomap_ops, &ctx);
fuse_invalidate_atime(inode);
- out:
- folio_unlock(folio);
- return err;
+ return 0;
}
static int fuse_iomap_read_folio_range(const struct iomap_iter *iter,
@@ -887,7 +1006,8 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args,
fuse_invalidate_atime(inode);
for (i = 0; i < ap->num_folios; i++) {
- folio_end_read(ap->folios[i], !err);
+ iomap_finish_folio_read(ap->folios[i], ap->descs[i].offset,
+ ap->descs[i].length, err);
folio_put(ap->folios[i]);
}
if (ia->ff)
@@ -897,7 +1017,7 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args,
}
static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file,
- unsigned int count)
+ unsigned int count, bool async)
{
struct fuse_file *ff = file->private_data;
struct fuse_mount *fm = ff->fm;
@@ -919,7 +1039,7 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file,
fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
ia->read.attr_ver = fuse_get_attr_version(fm->fc);
- if (fm->fc->async_read) {
+ if (async) {
ia->ff = fuse_file_get(ff);
ap->args.end = fuse_readpages_end;
err = fuse_simple_background(fm, &ap->args, GFP_KERNEL);
@@ -936,81 +1056,20 @@ static void fuse_readahead(struct readahead_control *rac)
{
struct inode *inode = rac->mapping->host;
struct fuse_conn *fc = get_fuse_conn(inode);
- unsigned int max_pages, nr_pages;
- struct folio *folio = NULL;
+ struct fuse_fill_read_data data = {
+ .file = rac->file,
+ .fc = fc,
+ };
+ struct iomap_read_folio_ctx ctx = {
+ .ops = &fuse_iomap_read_ops,
+ .rac = rac,
+ .read_ctx = &data
+ };
if (fuse_is_bad(inode))
return;
- max_pages = min_t(unsigned int, fc->max_pages,
- fc->max_read / PAGE_SIZE);
-
- /*
- * This is only accurate the first time through, since readahead_folio()
- * doesn't update readahead_count() from the previous folio until the
- * next call. Grab nr_pages here so we know how many pages we're going
- * to have to process. This means that we will exit here with
- * readahead_count() == folio_nr_pages(last_folio), but we will have
- * consumed all of the folios, and read_pages() will call
- * readahead_folio() again which will clean up the rac.
- */
- nr_pages = readahead_count(rac);
-
- while (nr_pages) {
- struct fuse_io_args *ia;
- struct fuse_args_pages *ap;
- unsigned cur_pages = min(max_pages, nr_pages);
- unsigned int pages = 0;
-
- if (fc->num_background >= fc->congestion_threshold &&
- rac->ra->async_size >= readahead_count(rac))
- /*
- * Congested and only async pages left, so skip the
- * rest.
- */
- break;
-
- ia = fuse_io_alloc(NULL, cur_pages);
- if (!ia)
- break;
- ap = &ia->ap;
-
- while (pages < cur_pages) {
- unsigned int folio_pages;
-
- /*
- * This returns a folio with a ref held on it.
- * The ref needs to be held until the request is
- * completed, since the splice case (see
- * fuse_try_move_page()) drops the ref after it's
- * replaced in the page cache.
- */
- if (!folio)
- folio = __readahead_folio(rac);
-
- folio_pages = folio_nr_pages(folio);
- if (folio_pages > cur_pages - pages) {
- /*
- * Large folios belonging to fuse will never
- * have more pages than max_pages.
- */
- WARN_ON(!pages);
- break;
- }
-
- ap->folios[ap->num_folios] = folio;
- ap->descs[ap->num_folios].length = folio_size(folio);
- ap->num_folios++;
- pages += folio_pages;
- folio = NULL;
- }
- fuse_send_readpages(ia, rac->file, pages << PAGE_SHIFT);
- nr_pages -= pages;
- }
- if (folio) {
- folio_end_read(folio, false);
- folio_put(folio);
- }
+ iomap_readahead(&fuse_iomap_ops, &ctx);
}
static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to)
@@ -1397,20 +1456,6 @@ static const struct iomap_write_ops fuse_iomap_write_ops = {
.read_folio_range = fuse_iomap_read_folio_range,
};
-static int fuse_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
- unsigned int flags, struct iomap *iomap,
- struct iomap *srcmap)
-{
- iomap->type = IOMAP_MAPPED;
- iomap->length = length;
- iomap->offset = offset;
- return 0;
-}
-
-static const struct iomap_ops fuse_iomap_ops = {
- .iomap_begin = fuse_iomap_begin,
-};
-
static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
@@ -1834,7 +1879,8 @@ static void fuse_writepage_finish(struct fuse_writepage_args *wpa)
* scope of the fi->lock alleviates xarray lock
* contention and noticeably improves performance.
*/
- iomap_finish_folio_write(inode, ap->folios[i], 1);
+ iomap_finish_folio_write(inode, ap->folios[i],
+ ap->descs[i].length);
wake_up(&fi->page_waitq);
}
@@ -2047,7 +2093,7 @@ struct fuse_fill_wb_data {
struct fuse_file *ff;
unsigned int max_folios;
/*
- * nr_bytes won't overflow since fuse_writepage_need_send() caps
+ * nr_bytes won't overflow since fuse_folios_need_send() caps
* wb requests to never exceed fc->max_pages (which has an upper bound
* of U16_MAX).
*/
@@ -2092,14 +2138,15 @@ static void fuse_writepages_send(struct inode *inode,
spin_unlock(&fi->lock);
}
-static bool fuse_writepage_need_send(struct fuse_conn *fc, loff_t pos,
- unsigned len, struct fuse_args_pages *ap,
- struct fuse_fill_wb_data *data)
+static bool fuse_folios_need_send(struct fuse_conn *fc, loff_t pos,
+ unsigned len, struct fuse_args_pages *ap,
+ unsigned cur_bytes, bool write)
{
struct folio *prev_folio;
struct fuse_folio_desc prev_desc;
- unsigned bytes = data->nr_bytes + len;
+ unsigned bytes = cur_bytes + len;
loff_t prev_pos;
+ size_t max_bytes = write ? fc->max_write : fc->max_read;
WARN_ON(!ap->num_folios);
@@ -2107,8 +2154,7 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, loff_t pos,
if ((bytes + PAGE_SIZE - 1) >> PAGE_SHIFT > fc->max_pages)
return true;
- /* Reached max write bytes */
- if (bytes > fc->max_write)
+ if (bytes > max_bytes)
return true;
/* Discontinuity */
@@ -2118,11 +2164,6 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, loff_t pos,
if (prev_pos != pos)
return true;
- /* Need to grow the pages array? If so, did the expansion fail? */
- if (ap->num_folios == data->max_folios &&
- !fuse_pages_realloc(data, fc->max_pages))
- return true;
-
return false;
}
@@ -2146,10 +2187,24 @@ static ssize_t fuse_iomap_writeback_range(struct iomap_writepage_ctx *wpc,
return -EIO;
}
- if (wpa && fuse_writepage_need_send(fc, pos, len, ap, data)) {
- fuse_writepages_send(inode, data);
- data->wpa = NULL;
- data->nr_bytes = 0;
+ if (wpa) {
+ bool send = fuse_folios_need_send(fc, pos, len, ap,
+ data->nr_bytes, true);
+
+ if (!send) {
+ /*
+ * Need to grow the pages array? If so, did the
+ * expansion fail?
+ */
+ send = (ap->num_folios == data->max_folios) &&
+ !fuse_pages_realloc(data, fc->max_pages);
+ }
+
+ if (send) {
+ fuse_writepages_send(inode, data);
+ data->wpa = NULL;
+ data->nr_bytes = 0;
+ }
}
if (data->wpa == NULL) {
@@ -2161,7 +2216,6 @@ static ssize_t fuse_iomap_writeback_range(struct iomap_writepage_ctx *wpc,
ap = &wpa->ia.ap;
}
- iomap_start_folio_write(inode, folio, 1);
fuse_writepage_args_page_fill(wpa, folio, ap->num_folios,
offset, len);
data->nr_bytes += len;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index c2f2a48156d6..f616c1991fed 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -981,14 +981,6 @@ struct fuse_conn {
/* Request timeout (in jiffies). 0 = no timeout */
unsigned int req_timeout;
} timeout;
-
- /*
- * This is a workaround until fuse uses iomap for reads.
- * For fuseblk servers, this represents the blocksize passed in at
- * mount time and for regular fuse servers, this is equivalent to
- * inode->i_blkbits.
- */
- u8 blkbits;
};
/*
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index d1babf56f254..1a397be53f49 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -160,7 +160,7 @@ static void fuse_evict_inode(struct inode *inode)
struct fuse_inode *fi = get_fuse_inode(inode);
/* Will write inode on close/munmap and in all other dirtiers */
- WARN_ON(inode->i_state & I_DIRTY_INODE);
+ WARN_ON(inode_state_read_once(inode) & I_DIRTY_INODE);
if (FUSE_IS_DAX(inode))
dax_break_layout_final(inode);
@@ -291,7 +291,7 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
if (attr->blksize)
fi->cached_i_blkbits = ilog2(attr->blksize);
else
- fi->cached_i_blkbits = fc->blkbits;
+ fi->cached_i_blkbits = inode->i_sb->s_blocksize_bits;
/*
* Don't set the sticky bit in i_mode, unless we want the VFS
@@ -505,7 +505,7 @@ retry:
if (!inode)
return NULL;
- if ((inode->i_state & I_NEW)) {
+ if ((inode_state_read_once(inode) & I_NEW)) {
inode->i_flags |= S_NOATIME;
if (!fc->writeback_cache || !S_ISREG(attr->mode))
inode->i_flags |= S_NOCMTIME;
@@ -1838,22 +1838,11 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
err = -EINVAL;
if (!sb_set_blocksize(sb, ctx->blksize))
goto err;
- /*
- * This is a workaround until fuse hooks into iomap for reads.
- * Use PAGE_SIZE for the blocksize else if the writeback cache
- * is enabled, buffered writes go through iomap and a read may
- * overwrite partially written data if blocksize < PAGE_SIZE
- */
- fc->blkbits = sb->s_blocksize_bits;
- if (ctx->blksize != PAGE_SIZE &&
- !sb_set_blocksize(sb, PAGE_SIZE))
- goto err;
#endif
fc->sync_fs = 1;
} else {
sb->s_blocksize = PAGE_SIZE;
sb->s_blocksize_bits = PAGE_SHIFT;
- fc->blkbits = sb->s_blocksize_bits;
}
sb->s_subtype = ctx->subtype;
diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c
index 57032eadca6c..fdc175e93f74 100644
--- a/fs/fuse/ioctl.c
+++ b/fs/fuse/ioctl.c
@@ -536,8 +536,6 @@ int fuse_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
cleanup:
fuse_priv_ioctl_cleanup(inode, ff);
- if (err == -ENOTTY)
- err = -EOPNOTSUPP;
return err;
}
@@ -574,7 +572,5 @@ int fuse_fileattr_set(struct mnt_idmap *idmap,
cleanup:
fuse_priv_ioctl_cleanup(inode, ff);
- if (err == -ENOTTY)
- err = -EOPNOTSUPP;
return err;
}
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 6bc7c97b017d..b2f6486fe1d5 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -373,7 +373,7 @@ static int virtio_fs_add_queues_sysfs(struct virtio_fs *fs)
sprintf(buff, "%d", i);
fsvq->kobj = kobject_create_and_add(buff, fs->mqs_kobj);
- if (!fs->mqs_kobj) {
+ if (!fsvq->kobj) {
ret = -ENOMEM;
goto out_del;
}
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 47d74afd63ac..ff1cf335449a 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -81,8 +81,7 @@ static int gfs2_write_jdata_folio(struct folio *folio,
* the page size, the remaining memory is zeroed when mapped, and
* writes to that region are not written out to the file."
*/
- if (folio_pos(folio) < i_size &&
- i_size < folio_pos(folio) + folio_size(folio))
+ if (folio_pos(folio) < i_size && i_size < folio_next_pos(folio))
folio_zero_segment(folio, offset_in_folio(folio, i_size),
folio_size(folio));
@@ -311,10 +310,7 @@ static int gfs2_write_cache_jdata(struct address_space *mapping,
range_whole = 1;
cycled = 1; /* ignore range_cyclic tests */
}
- if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
- tag = PAGECACHE_TAG_TOWRITE;
- else
- tag = PAGECACHE_TAG_DIRTY;
+ tag = wbc_to_tag(wbc);
retry:
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
@@ -424,11 +420,11 @@ static int gfs2_read_folio(struct file *file, struct folio *folio)
struct inode *inode = folio->mapping->host;
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
- int error;
+ int error = 0;
if (!gfs2_is_jdata(ip) ||
(i_blocksize(inode) == PAGE_SIZE && !folio_buffers(folio))) {
- error = iomap_read_folio(folio, &gfs2_iomap_ops);
+ iomap_bio_read_folio(folio, &gfs2_iomap_ops);
} else if (gfs2_is_stuffed(ip)) {
error = stuffed_read_folio(ip, folio);
} else {
@@ -503,7 +499,7 @@ static void gfs2_readahead(struct readahead_control *rac)
else if (gfs2_is_jdata(ip))
mpage_readahead(rac, gfs2_block_map);
else
- iomap_readahead(rac, &gfs2_iomap_ops);
+ iomap_bio_readahead(rac, &gfs2_iomap_ops);
}
/**
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index bc67fa058c84..ee92f5910ae1 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -744,7 +744,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
{
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
- int sync_state = inode->i_state & I_DIRTY;
+ int sync_state = inode_state_read_once(inode) & I_DIRTY;
struct gfs2_inode *ip = GFS2_I(inode);
int ret = 0, ret1 = 0;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index b677c0e6b9ab..c9712235e7a0 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -957,7 +957,7 @@ static struct gfs2_inode *gfs2_grab_existing_inode(struct gfs2_glock *gl)
ip = NULL;
spin_unlock(&gl->gl_lockref.lock);
if (ip) {
- wait_on_inode(&ip->i_inode);
+ wait_on_new_inode(&ip->i_inode);
if (is_bad_inode(&ip->i_inode)) {
iput(&ip->i_inode);
ip = NULL;
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 0c0a80b3baca..c94e42b0c94d 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -394,7 +394,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
u16 height, depth;
umode_t mode = be32_to_cpu(str->di_mode);
struct inode *inode = &ip->i_inode;
- bool is_new = inode->i_state & I_NEW;
+ bool is_new = inode_state_read_once(inode) & I_NEW;
if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr))) {
gfs2_consist_inode(ip);
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 8a7ed80d9f2d..890c87e3e365 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -127,7 +127,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
ip = GFS2_I(inode);
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
struct gfs2_sbd *sdp = GFS2_SB(inode);
struct gfs2_glock *io_gl;
int extra_flags = 0;
@@ -924,7 +924,7 @@ fail_gunlock:
gfs2_dir_no_add(&da);
gfs2_glock_dq_uninit(&d_gh);
if (!IS_ERR_OR_NULL(inode)) {
- if (inode->i_state & I_NEW)
+ if (inode_state_read_once(inode) & I_NEW)
iget_failed(inode);
else
iput(inode);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index aa15183f9a16..889682f051ea 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1751,7 +1751,7 @@ static void gfs2_evict_inodes(struct super_block *sb)
spin_lock(&sb->s_inode_list_lock);
list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
spin_lock(&inode->i_lock);
- if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) &&
+ if ((inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) &&
!need_resched()) {
spin_unlock(&inode->i_lock);
continue;
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 22e62fe7448b..54c20d01c342 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -42,7 +42,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
tree->inode = iget_locked(sb, id);
if (!tree->inode)
goto free_tree;
- BUG_ON(!(tree->inode->i_state & I_NEW));
+ BUG_ON(!(inode_state_read_once(tree->inode) & I_NEW));
{
struct hfs_mdb *mdb = HFS_SB(sb)->mdb;
HFS_I(tree->inode)->flags = 0;
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 9cd449913dc8..81ad93e6312f 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -412,7 +412,7 @@ struct inode *hfs_iget(struct super_block *sb, struct hfs_cat_key *key, hfs_cat_
return NULL;
}
inode = iget5_locked(sb, cnid, hfs_test_inode, hfs_read_inode, &data);
- if (inode && (inode->i_state & I_NEW))
+ if (inode && (inode_state_read_once(inode) & I_NEW))
unlock_new_inode(inode);
return inode;
}
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index a66a09a56bf7..9b377481f397 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -12,6 +12,7 @@
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/sched.h>
+#include <linux/fs_struct.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include <linux/nls.h>
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 16bc4abc67e0..54e85e25a259 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -65,7 +65,7 @@ struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
inode = iget_locked(sb, ino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
atomic_set(&HFSPLUS_I(inode)->opencnt, 0);
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 1e1acf5775ab..51d26aa2b93e 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -581,7 +581,7 @@ static struct inode *hostfs_iget(struct super_block *sb, char *name)
if (!inode)
return ERR_PTR(-ENOMEM);
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
unlock_new_inode(inode);
} else {
spin_lock(&inode->i_lock);
@@ -979,7 +979,7 @@ static int hostfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
struct hostfs_fs_info *fsi = fc->s_fs_info;
struct fs_parse_result result;
- char *host_root;
+ char *host_root, *tmp_root;
int opt;
opt = fs_parse(fc, hostfs_param_specs, param, &result);
@@ -990,11 +990,13 @@ static int hostfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
case Opt_hostfs:
host_root = param->string;
if (!*host_root)
- host_root = "";
- fsi->host_root_path =
- kasprintf(GFP_KERNEL, "%s/%s", root_ino, host_root);
- if (fsi->host_root_path == NULL)
+ break;
+ tmp_root = kasprintf(GFP_KERNEL, "%s%s",
+ fsi->host_root_path, host_root);
+ if (!tmp_root)
return -ENOMEM;
+ kfree(fsi->host_root_path);
+ fsi->host_root_path = tmp_root;
break;
}
@@ -1004,17 +1006,17 @@ static int hostfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
static int hostfs_parse_monolithic(struct fs_context *fc, void *data)
{
struct hostfs_fs_info *fsi = fc->s_fs_info;
- char *host_root = (char *)data;
+ char *tmp_root, *host_root = (char *)data;
/* NULL is printed as '(null)' by printf(): avoid that. */
if (host_root == NULL)
- host_root = "";
+ return 0;
- fsi->host_root_path =
- kasprintf(GFP_KERNEL, "%s/%s", root_ino, host_root);
- if (fsi->host_root_path == NULL)
+ tmp_root = kasprintf(GFP_KERNEL, "%s%s", fsi->host_root_path, host_root);
+ if (!tmp_root)
return -ENOMEM;
-
+ kfree(fsi->host_root_path);
+ fsi->host_root_path = tmp_root;
return 0;
}
@@ -1049,6 +1051,11 @@ static int hostfs_init_fs_context(struct fs_context *fc)
if (!fsi)
return -ENOMEM;
+ fsi->host_root_path = kasprintf(GFP_KERNEL, "%s/", root_ino);
+ if (!fsi->host_root_path) {
+ kfree(fsi);
+ return -ENOMEM;
+ }
fc->s_fs_info = fsi;
fc->ops = &hostfs_context_ops;
return 0;
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 49dd585c2b17..ceb50b2dc91a 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -247,7 +247,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, unsigned in
result = ERR_PTR(-ENOMEM);
goto bail1;
}
- if (result->i_state & I_NEW) {
+ if (inode_state_read_once(result) & I_NEW) {
hpfs_init_inode(result);
if (de->directory)
hpfs_read_inode(result);
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 34008442ee26..93d528f4f4f2 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -196,7 +196,7 @@ void hpfs_write_inode(struct inode *i)
parent = iget_locked(i->i_sb, hpfs_inode->i_parent_dir);
if (parent) {
hpfs_inode->i_dirty = 0;
- if (parent->i_state & I_NEW) {
+ if (inode_state_read_once(parent) & I_NEW) {
hpfs_init_inode(parent);
hpfs_read_inode(parent);
unlock_new_inode(parent);
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 8ab85e7ac91e..371aa6de8075 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -9,6 +9,7 @@
#include "hpfs_fn.h"
#include <linux/module.h>
+#include <linux/fs_struct.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include <linux/init.h>
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 9c94ed8c3ab0..f42548ee9083 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -478,14 +478,6 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end,
if (!hugetlb_vma_trylock_write(vma))
continue;
- /*
- * Skip VMAs without shareable locks. Per the design in commit
- * 40549ba8f8e0, these will be handled by remove_inode_hugepages()
- * called after this function with proper locking.
- */
- if (!__vma_shareable_lock(vma))
- goto skip;
-
v_start = vma_offset_start(vma, start);
v_end = vma_offset_end(vma, end);
@@ -496,7 +488,6 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end,
* vmas. Therefore, lock is not held when calling
* unmap_hugepage_range for private vmas.
*/
-skip:
hugetlb_vma_unlock_write(vma);
}
}
diff --git a/fs/inode.c b/fs/inode.c
index ec9339024ac3..cc8265cfe80e 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -233,7 +233,7 @@ int inode_init_always_gfp(struct super_block *sb, struct inode *inode, gfp_t gfp
inode->i_sb = sb;
inode->i_blkbits = sb->s_blocksize_bits;
inode->i_flags = 0;
- inode->i_state = 0;
+ inode_state_assign_raw(inode, 0);
atomic64_set(&inode->i_sequence, 0);
atomic_set(&inode->i_count, 1);
inode->i_op = &empty_iops;
@@ -471,7 +471,7 @@ EXPORT_SYMBOL(set_nlink);
void inc_nlink(struct inode *inode)
{
if (unlikely(inode->i_nlink == 0)) {
- WARN_ON(!(inode->i_state & I_LINKABLE));
+ WARN_ON(!(inode_state_read_once(inode) & I_LINKABLE));
atomic_long_dec(&inode->i_sb->s_remove_count);
}
@@ -530,9 +530,48 @@ void ihold(struct inode *inode)
}
EXPORT_SYMBOL(ihold);
-static void __inode_add_lru(struct inode *inode, bool rotate)
+struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe,
+ struct inode *inode, u32 bit)
+{
+ void *bit_address;
+
+ bit_address = inode_state_wait_address(inode, bit);
+ init_wait_var_entry(wqe, bit_address, 0);
+ return __var_waitqueue(bit_address);
+}
+EXPORT_SYMBOL(inode_bit_waitqueue);
+
+void wait_on_new_inode(struct inode *inode)
+{
+ struct wait_bit_queue_entry wqe;
+ struct wait_queue_head *wq_head;
+
+ spin_lock(&inode->i_lock);
+ if (!(inode_state_read(inode) & I_NEW)) {
+ spin_unlock(&inode->i_lock);
+ return;
+ }
+
+ wq_head = inode_bit_waitqueue(&wqe, inode, __I_NEW);
+ for (;;) {
+ prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE);
+ if (!(inode_state_read(inode) & I_NEW))
+ break;
+ spin_unlock(&inode->i_lock);
+ schedule();
+ spin_lock(&inode->i_lock);
+ }
+ finish_wait(wq_head, &wqe.wq_entry);
+ WARN_ON(inode_state_read(inode) & I_NEW);
+ spin_unlock(&inode->i_lock);
+}
+EXPORT_SYMBOL(wait_on_new_inode);
+
+static void __inode_lru_list_add(struct inode *inode, bool rotate)
{
- if (inode->i_state & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE))
+ lockdep_assert_held(&inode->i_lock);
+
+ if (inode_state_read(inode) & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE))
return;
if (icount_read(inode))
return;
@@ -544,32 +583,22 @@ static void __inode_add_lru(struct inode *inode, bool rotate)
if (list_lru_add_obj(&inode->i_sb->s_inode_lru, &inode->i_lru))
this_cpu_inc(nr_unused);
else if (rotate)
- inode->i_state |= I_REFERENCED;
-}
-
-struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe,
- struct inode *inode, u32 bit)
-{
- void *bit_address;
-
- bit_address = inode_state_wait_address(inode, bit);
- init_wait_var_entry(wqe, bit_address, 0);
- return __var_waitqueue(bit_address);
+ inode_state_set(inode, I_REFERENCED);
}
-EXPORT_SYMBOL(inode_bit_waitqueue);
/*
* Add inode to LRU if needed (inode is unused and clean).
- *
- * Needs inode->i_lock held.
*/
-void inode_add_lru(struct inode *inode)
+void inode_lru_list_add(struct inode *inode)
{
- __inode_add_lru(inode, false);
+ __inode_lru_list_add(inode, false);
}
static void inode_lru_list_del(struct inode *inode)
{
+ if (list_empty(&inode->i_lru))
+ return;
+
if (list_lru_del_obj(&inode->i_sb->s_inode_lru, &inode->i_lru))
this_cpu_dec(nr_unused);
}
@@ -577,15 +606,15 @@ static void inode_lru_list_del(struct inode *inode)
static void inode_pin_lru_isolating(struct inode *inode)
{
lockdep_assert_held(&inode->i_lock);
- WARN_ON(inode->i_state & (I_LRU_ISOLATING | I_FREEING | I_WILL_FREE));
- inode->i_state |= I_LRU_ISOLATING;
+ WARN_ON(inode_state_read(inode) & (I_LRU_ISOLATING | I_FREEING | I_WILL_FREE));
+ inode_state_set(inode, I_LRU_ISOLATING);
}
static void inode_unpin_lru_isolating(struct inode *inode)
{
spin_lock(&inode->i_lock);
- WARN_ON(!(inode->i_state & I_LRU_ISOLATING));
- inode->i_state &= ~I_LRU_ISOLATING;
+ WARN_ON(!(inode_state_read(inode) & I_LRU_ISOLATING));
+ inode_state_clear(inode, I_LRU_ISOLATING);
/* Called with inode->i_lock which ensures memory ordering. */
inode_wake_up_bit(inode, __I_LRU_ISOLATING);
spin_unlock(&inode->i_lock);
@@ -597,7 +626,7 @@ static void inode_wait_for_lru_isolating(struct inode *inode)
struct wait_queue_head *wq_head;
lockdep_assert_held(&inode->i_lock);
- if (!(inode->i_state & I_LRU_ISOLATING))
+ if (!(inode_state_read(inode) & I_LRU_ISOLATING))
return;
wq_head = inode_bit_waitqueue(&wqe, inode, __I_LRU_ISOLATING);
@@ -607,14 +636,14 @@ static void inode_wait_for_lru_isolating(struct inode *inode)
* Checking I_LRU_ISOLATING with inode->i_lock guarantees
* memory ordering.
*/
- if (!(inode->i_state & I_LRU_ISOLATING))
+ if (!(inode_state_read(inode) & I_LRU_ISOLATING))
break;
spin_unlock(&inode->i_lock);
schedule();
spin_lock(&inode->i_lock);
}
finish_wait(wq_head, &wqe.wq_entry);
- WARN_ON(inode->i_state & I_LRU_ISOLATING);
+ WARN_ON(inode_state_read(inode) & I_LRU_ISOLATING);
}
/**
@@ -761,11 +790,11 @@ void clear_inode(struct inode *inode)
*/
xa_unlock_irq(&inode->i_data.i_pages);
BUG_ON(!list_empty(&inode->i_data.i_private_list));
- BUG_ON(!(inode->i_state & I_FREEING));
- BUG_ON(inode->i_state & I_CLEAR);
+ BUG_ON(!(inode_state_read_once(inode) & I_FREEING));
+ BUG_ON(inode_state_read_once(inode) & I_CLEAR);
BUG_ON(!list_empty(&inode->i_wb_list));
/* don't need i_lock here, no concurrent mods to i_state */
- inode->i_state = I_FREEING | I_CLEAR;
+ inode_state_assign_raw(inode, I_FREEING | I_CLEAR);
}
EXPORT_SYMBOL(clear_inode);
@@ -786,12 +815,10 @@ static void evict(struct inode *inode)
{
const struct super_operations *op = inode->i_sb->s_op;
- BUG_ON(!(inode->i_state & I_FREEING));
+ BUG_ON(!(inode_state_read_once(inode) & I_FREEING));
BUG_ON(!list_empty(&inode->i_lru));
- if (!list_empty(&inode->i_io_list))
- inode_io_list_del(inode);
-
+ inode_io_list_del(inode);
inode_sb_list_del(inode);
spin_lock(&inode->i_lock);
@@ -829,7 +856,7 @@ static void evict(struct inode *inode)
* This also means we don't need any fences for the call below.
*/
inode_wake_up_bit(inode, __I_NEW);
- BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
+ BUG_ON(inode_state_read_once(inode) != (I_FREEING | I_CLEAR));
destroy_inode(inode);
}
@@ -879,12 +906,12 @@ again:
spin_unlock(&inode->i_lock);
continue;
}
- if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
+ if (inode_state_read(inode) & (I_NEW | I_FREEING | I_WILL_FREE)) {
spin_unlock(&inode->i_lock);
continue;
}
- inode->i_state |= I_FREEING;
+ inode_state_set(inode, I_FREEING);
inode_lru_list_del(inode);
spin_unlock(&inode->i_lock);
list_add(&inode->i_lru, &dispose);
@@ -938,7 +965,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
* sync, or the last page cache deletion will requeue them.
*/
if (icount_read(inode) ||
- (inode->i_state & ~I_REFERENCED) ||
+ (inode_state_read(inode) & ~I_REFERENCED) ||
!mapping_shrinkable(&inode->i_data)) {
list_lru_isolate(lru, &inode->i_lru);
spin_unlock(&inode->i_lock);
@@ -947,8 +974,8 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
}
/* Recently referenced inodes get one more pass */
- if (inode->i_state & I_REFERENCED) {
- inode->i_state &= ~I_REFERENCED;
+ if (inode_state_read(inode) & I_REFERENCED) {
+ inode_state_clear(inode, I_REFERENCED);
spin_unlock(&inode->i_lock);
return LRU_ROTATE;
}
@@ -975,8 +1002,8 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
return LRU_RETRY;
}
- WARN_ON(inode->i_state & I_NEW);
- inode->i_state |= I_FREEING;
+ WARN_ON(inode_state_read(inode) & I_NEW);
+ inode_state_set(inode, I_FREEING);
list_lru_isolate_move(lru, &inode->i_lru, freeable);
spin_unlock(&inode->i_lock);
@@ -1008,7 +1035,8 @@ static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_lock
static struct inode *find_inode(struct super_block *sb,
struct hlist_head *head,
int (*test)(struct inode *, void *),
- void *data, bool is_inode_hash_locked)
+ void *data, bool is_inode_hash_locked,
+ bool *isnew)
{
struct inode *inode = NULL;
@@ -1025,16 +1053,17 @@ repeat:
if (!test(inode, data))
continue;
spin_lock(&inode->i_lock);
- if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
+ if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE)) {
__wait_on_freeing_inode(inode, is_inode_hash_locked);
goto repeat;
}
- if (unlikely(inode->i_state & I_CREATING)) {
+ if (unlikely(inode_state_read(inode) & I_CREATING)) {
spin_unlock(&inode->i_lock);
rcu_read_unlock();
return ERR_PTR(-ESTALE);
}
__iget(inode);
+ *isnew = !!(inode_state_read(inode) & I_NEW);
spin_unlock(&inode->i_lock);
rcu_read_unlock();
return inode;
@@ -1049,7 +1078,7 @@ repeat:
*/
static struct inode *find_inode_fast(struct super_block *sb,
struct hlist_head *head, unsigned long ino,
- bool is_inode_hash_locked)
+ bool is_inode_hash_locked, bool *isnew)
{
struct inode *inode = NULL;
@@ -1066,16 +1095,17 @@ repeat:
if (inode->i_sb != sb)
continue;
spin_lock(&inode->i_lock);
- if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
+ if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE)) {
__wait_on_freeing_inode(inode, is_inode_hash_locked);
goto repeat;
}
- if (unlikely(inode->i_state & I_CREATING)) {
+ if (unlikely(inode_state_read(inode) & I_CREATING)) {
spin_unlock(&inode->i_lock);
rcu_read_unlock();
return ERR_PTR(-ESTALE);
}
__iget(inode);
+ *isnew = !!(inode_state_read(inode) & I_NEW);
spin_unlock(&inode->i_lock);
rcu_read_unlock();
return inode;
@@ -1180,14 +1210,8 @@ void unlock_new_inode(struct inode *inode)
{
lockdep_annotate_inode_mutex_key(inode);
spin_lock(&inode->i_lock);
- WARN_ON(!(inode->i_state & I_NEW));
- inode->i_state &= ~I_NEW & ~I_CREATING;
- /*
- * Pairs with the barrier in prepare_to_wait_event() to make sure
- * ___wait_var_event() either sees the bit cleared or
- * waitqueue_active() check in wake_up_var() sees the waiter.
- */
- smp_mb();
+ WARN_ON(!(inode_state_read(inode) & I_NEW));
+ inode_state_clear(inode, I_NEW | I_CREATING);
inode_wake_up_bit(inode, __I_NEW);
spin_unlock(&inode->i_lock);
}
@@ -1197,14 +1221,8 @@ void discard_new_inode(struct inode *inode)
{
lockdep_annotate_inode_mutex_key(inode);
spin_lock(&inode->i_lock);
- WARN_ON(!(inode->i_state & I_NEW));
- inode->i_state &= ~I_NEW;
- /*
- * Pairs with the barrier in prepare_to_wait_event() to make sure
- * ___wait_var_event() either sees the bit cleared or
- * waitqueue_active() check in wake_up_var() sees the waiter.
- */
- smp_mb();
+ WARN_ON(!(inode_state_read(inode) & I_NEW));
+ inode_state_clear(inode, I_NEW);
inode_wake_up_bit(inode, __I_NEW);
spin_unlock(&inode->i_lock);
iput(inode);
@@ -1260,6 +1278,7 @@ EXPORT_SYMBOL(unlock_two_nondirectories);
* @test: callback used for comparisons between inodes
* @set: callback used to initialize a new struct inode
* @data: opaque data pointer to pass to @test and @set
+ * @isnew: pointer to a bool which will indicate whether I_NEW is set
*
* Search for the inode specified by @hashval and @data in the inode cache,
* and if present return it with an increased reference count. This is a
@@ -1278,12 +1297,13 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
{
struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval);
struct inode *old;
+ bool isnew;
might_sleep();
again:
spin_lock(&inode_hash_lock);
- old = find_inode(inode->i_sb, head, test, data, true);
+ old = find_inode(inode->i_sb, head, test, data, true, &isnew);
if (unlikely(old)) {
/*
* Uhhuh, somebody else created the same inode under us.
@@ -1292,7 +1312,8 @@ again:
spin_unlock(&inode_hash_lock);
if (IS_ERR(old))
return NULL;
- wait_on_inode(old);
+ if (unlikely(isnew))
+ wait_on_new_inode(old);
if (unlikely(inode_unhashed(old))) {
iput(old);
goto again;
@@ -1310,7 +1331,7 @@ again:
* caller is responsible for filling in the contents
*/
spin_lock(&inode->i_lock);
- inode->i_state |= I_NEW;
+ inode_state_set(inode, I_NEW);
hlist_add_head_rcu(&inode->i_hash, head);
spin_unlock(&inode->i_lock);
@@ -1383,15 +1404,17 @@ struct inode *iget5_locked_rcu(struct super_block *sb, unsigned long hashval,
{
struct hlist_head *head = inode_hashtable + hash(sb, hashval);
struct inode *inode, *new;
+ bool isnew;
might_sleep();
again:
- inode = find_inode(sb, head, test, data, false);
+ inode = find_inode(sb, head, test, data, false, &isnew);
if (inode) {
if (IS_ERR(inode))
return NULL;
- wait_on_inode(inode);
+ if (unlikely(isnew))
+ wait_on_new_inode(inode);
if (unlikely(inode_unhashed(inode))) {
iput(inode);
goto again;
@@ -1426,15 +1449,17 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino)
{
struct hlist_head *head = inode_hashtable + hash(sb, ino);
struct inode *inode;
+ bool isnew;
might_sleep();
again:
- inode = find_inode_fast(sb, head, ino, false);
+ inode = find_inode_fast(sb, head, ino, false, &isnew);
if (inode) {
if (IS_ERR(inode))
return NULL;
- wait_on_inode(inode);
+ if (unlikely(isnew))
+ wait_on_new_inode(inode);
if (unlikely(inode_unhashed(inode))) {
iput(inode);
goto again;
@@ -1448,11 +1473,11 @@ again:
spin_lock(&inode_hash_lock);
/* We released the lock, so.. */
- old = find_inode_fast(sb, head, ino, true);
+ old = find_inode_fast(sb, head, ino, true, &isnew);
if (!old) {
inode->i_ino = ino;
spin_lock(&inode->i_lock);
- inode->i_state = I_NEW;
+ inode_state_assign(inode, I_NEW);
hlist_add_head_rcu(&inode->i_hash, head);
spin_unlock(&inode->i_lock);
spin_unlock(&inode_hash_lock);
@@ -1474,7 +1499,8 @@ again:
if (IS_ERR(old))
return NULL;
inode = old;
- wait_on_inode(inode);
+ if (unlikely(isnew))
+ wait_on_new_inode(inode);
if (unlikely(inode_unhashed(inode))) {
iput(inode);
goto again;
@@ -1545,7 +1571,7 @@ EXPORT_SYMBOL(iunique);
struct inode *igrab(struct inode *inode)
{
spin_lock(&inode->i_lock);
- if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) {
+ if (!(inode_state_read(inode) & (I_FREEING | I_WILL_FREE))) {
__iget(inode);
spin_unlock(&inode->i_lock);
} else {
@@ -1578,13 +1604,13 @@ EXPORT_SYMBOL(igrab);
* Note2: @test is called with the inode_hash_lock held, so can't sleep.
*/
struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
- int (*test)(struct inode *, void *), void *data)
+ int (*test)(struct inode *, void *), void *data, bool *isnew)
{
struct hlist_head *head = inode_hashtable + hash(sb, hashval);
struct inode *inode;
spin_lock(&inode_hash_lock);
- inode = find_inode(sb, head, test, data, true);
+ inode = find_inode(sb, head, test, data, true, isnew);
spin_unlock(&inode_hash_lock);
return IS_ERR(inode) ? NULL : inode;
@@ -1612,13 +1638,15 @@ struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
int (*test)(struct inode *, void *), void *data)
{
struct inode *inode;
+ bool isnew;
might_sleep();
again:
- inode = ilookup5_nowait(sb, hashval, test, data);
+ inode = ilookup5_nowait(sb, hashval, test, data, &isnew);
if (inode) {
- wait_on_inode(inode);
+ if (unlikely(isnew))
+ wait_on_new_inode(inode);
if (unlikely(inode_unhashed(inode))) {
iput(inode);
goto again;
@@ -1640,16 +1668,18 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino)
{
struct hlist_head *head = inode_hashtable + hash(sb, ino);
struct inode *inode;
+ bool isnew;
might_sleep();
again:
- inode = find_inode_fast(sb, head, ino, false);
+ inode = find_inode_fast(sb, head, ino, false, &isnew);
if (inode) {
if (IS_ERR(inode))
return NULL;
- wait_on_inode(inode);
+ if (unlikely(isnew))
+ wait_on_new_inode(inode);
if (unlikely(inode_unhashed(inode))) {
iput(inode);
goto again;
@@ -1741,7 +1771,7 @@ struct inode *find_inode_rcu(struct super_block *sb, unsigned long hashval,
hlist_for_each_entry_rcu(inode, head, i_hash) {
if (inode->i_sb == sb &&
- !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE)) &&
+ !(inode_state_read_once(inode) & (I_FREEING | I_WILL_FREE)) &&
test(inode, data))
return inode;
}
@@ -1780,7 +1810,7 @@ struct inode *find_inode_by_ino_rcu(struct super_block *sb,
hlist_for_each_entry_rcu(inode, head, i_hash) {
if (inode->i_ino == ino &&
inode->i_sb == sb &&
- !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE)))
+ !(inode_state_read_once(inode) & (I_FREEING | I_WILL_FREE)))
return inode;
}
return NULL;
@@ -1792,6 +1822,7 @@ int insert_inode_locked(struct inode *inode)
struct super_block *sb = inode->i_sb;
ino_t ino = inode->i_ino;
struct hlist_head *head = inode_hashtable + hash(sb, ino);
+ bool isnew;
might_sleep();
@@ -1804,7 +1835,7 @@ int insert_inode_locked(struct inode *inode)
if (old->i_sb != sb)
continue;
spin_lock(&old->i_lock);
- if (old->i_state & (I_FREEING|I_WILL_FREE)) {
+ if (inode_state_read(old) & (I_FREEING | I_WILL_FREE)) {
spin_unlock(&old->i_lock);
continue;
}
@@ -1812,21 +1843,23 @@ int insert_inode_locked(struct inode *inode)
}
if (likely(!old)) {
spin_lock(&inode->i_lock);
- inode->i_state |= I_NEW | I_CREATING;
+ inode_state_set(inode, I_NEW | I_CREATING);
hlist_add_head_rcu(&inode->i_hash, head);
spin_unlock(&inode->i_lock);
spin_unlock(&inode_hash_lock);
return 0;
}
- if (unlikely(old->i_state & I_CREATING)) {
+ if (unlikely(inode_state_read(old) & I_CREATING)) {
spin_unlock(&old->i_lock);
spin_unlock(&inode_hash_lock);
return -EBUSY;
}
__iget(old);
+ isnew = !!(inode_state_read(old) & I_NEW);
spin_unlock(&old->i_lock);
spin_unlock(&inode_hash_lock);
- wait_on_inode(old);
+ if (isnew)
+ wait_on_new_inode(old);
if (unlikely(!inode_unhashed(old))) {
iput(old);
return -EBUSY;
@@ -1843,7 +1876,7 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
might_sleep();
- inode->i_state |= I_CREATING;
+ inode_state_set_raw(inode, I_CREATING);
old = inode_insert5(inode, hashval, test, NULL, data);
if (old != inode) {
@@ -1875,10 +1908,10 @@ static void iput_final(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
const struct super_operations *op = inode->i_sb->s_op;
- unsigned long state;
int drop;
- WARN_ON(inode->i_state & I_NEW);
+ WARN_ON(inode_state_read(inode) & I_NEW);
+ VFS_BUG_ON_INODE(atomic_read(&inode->i_count) != 0, inode);
if (op->drop_inode)
drop = op->drop_inode(inode);
@@ -1886,29 +1919,33 @@ static void iput_final(struct inode *inode)
drop = inode_generic_drop(inode);
if (!drop &&
- !(inode->i_state & I_DONTCACHE) &&
+ !(inode_state_read(inode) & I_DONTCACHE) &&
(sb->s_flags & SB_ACTIVE)) {
- __inode_add_lru(inode, true);
+ __inode_lru_list_add(inode, true);
spin_unlock(&inode->i_lock);
return;
}
- state = inode->i_state;
- if (!drop) {
- WRITE_ONCE(inode->i_state, state | I_WILL_FREE);
+ /*
+ * Re-check ->i_count in case the ->drop_inode() hooks played games.
+ * Note we only execute this if the verdict was to drop the inode.
+ */
+ VFS_BUG_ON_INODE(atomic_read(&inode->i_count) != 0, inode);
+
+ if (drop) {
+ inode_state_set(inode, I_FREEING);
+ } else {
+ inode_state_set(inode, I_WILL_FREE);
spin_unlock(&inode->i_lock);
write_inode_now(inode, 1);
spin_lock(&inode->i_lock);
- state = inode->i_state;
- WARN_ON(state & I_NEW);
- state &= ~I_WILL_FREE;
+ WARN_ON(inode_state_read(inode) & I_NEW);
+ inode_state_replace(inode, I_WILL_FREE, I_FREEING);
}
- WRITE_ONCE(inode->i_state, state | I_FREEING);
- if (!list_empty(&inode->i_lru))
- inode_lru_list_del(inode);
+ inode_lru_list_del(inode);
spin_unlock(&inode->i_lock);
evict(inode);
@@ -1931,7 +1968,7 @@ void iput(struct inode *inode)
retry:
lockdep_assert_not_held(&inode->i_lock);
- VFS_BUG_ON_INODE(inode->i_state & I_CLEAR, inode);
+ VFS_BUG_ON_INODE(inode_state_read_once(inode) & I_CLEAR, inode);
/*
* Note this assert is technically racy as if the count is bogusly
* equal to one, then two CPUs racing to further drop it can both
@@ -1942,14 +1979,14 @@ retry:
if (atomic_add_unless(&inode->i_count, -1, 1))
return;
- if ((inode->i_state & I_DIRTY_TIME) && inode->i_nlink) {
+ if ((inode_state_read_once(inode) & I_DIRTY_TIME) && inode->i_nlink) {
trace_writeback_lazytime_iput(inode);
mark_inode_dirty_sync(inode);
goto retry;
}
spin_lock(&inode->i_lock);
- if (unlikely((inode->i_state & I_DIRTY_TIME) && inode->i_nlink)) {
+ if (unlikely((inode_state_read(inode) & I_DIRTY_TIME) && inode->i_nlink)) {
spin_unlock(&inode->i_lock);
goto retry;
}
@@ -1967,6 +2004,18 @@ retry:
}
EXPORT_SYMBOL(iput);
+/**
+ * iput_not_last - put an inode assuming this is not the last reference
+ * @inode: inode to put
+ */
+void iput_not_last(struct inode *inode)
+{
+ VFS_BUG_ON_INODE(atomic_read(&inode->i_count) < 2, inode);
+
+ WARN_ON(atomic_sub_return(1, &inode->i_count) == 0);
+}
+EXPORT_SYMBOL(iput_not_last);
+
#ifdef CONFIG_BLOCK
/**
* bmap - find a block number in a file
@@ -2310,42 +2359,40 @@ out:
}
EXPORT_SYMBOL(current_time);
-static int inode_needs_update_time(struct inode *inode)
+static int file_update_time_flags(struct file *file, unsigned int flags)
{
+ struct inode *inode = file_inode(file);
struct timespec64 now, ts;
- int sync_it = 0;
+ int sync_mode = 0;
+ int ret = 0;
/* First try to exhaust all avenues to not sync */
if (IS_NOCMTIME(inode))
return 0;
+ if (unlikely(file->f_mode & FMODE_NOCMTIME))
+ return 0;
now = current_time(inode);
ts = inode_get_mtime(inode);
if (!timespec64_equal(&ts, &now))
- sync_it |= S_MTIME;
-
+ sync_mode |= S_MTIME;
ts = inode_get_ctime(inode);
if (!timespec64_equal(&ts, &now))
- sync_it |= S_CTIME;
-
+ sync_mode |= S_CTIME;
if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode))
- sync_it |= S_VERSION;
+ sync_mode |= S_VERSION;
- return sync_it;
-}
-
-static int __file_update_time(struct file *file, int sync_mode)
-{
- int ret = 0;
- struct inode *inode = file_inode(file);
+ if (!sync_mode)
+ return 0;
- /* try to update time settings */
- if (!mnt_get_write_access_file(file)) {
- ret = inode_update_time(inode, sync_mode);
- mnt_put_write_access_file(file);
- }
+ if (flags & IOCB_NOWAIT)
+ return -EAGAIN;
+ if (mnt_get_write_access_file(file))
+ return 0;
+ ret = inode_update_time(inode, sync_mode);
+ mnt_put_write_access_file(file);
return ret;
}
@@ -2365,14 +2412,7 @@ static int __file_update_time(struct file *file, int sync_mode)
*/
int file_update_time(struct file *file)
{
- int ret;
- struct inode *inode = file_inode(file);
-
- ret = inode_needs_update_time(inode);
- if (ret <= 0)
- return ret;
-
- return __file_update_time(file, ret);
+ return file_update_time_flags(file, 0);
}
EXPORT_SYMBOL(file_update_time);
@@ -2394,7 +2434,6 @@ EXPORT_SYMBOL(file_update_time);
static int file_modified_flags(struct file *file, int flags)
{
int ret;
- struct inode *inode = file_inode(file);
/*
* Clear the security bits if the process is not being run by root.
@@ -2403,17 +2442,7 @@ static int file_modified_flags(struct file *file, int flags)
ret = file_remove_privs_flags(file, flags);
if (ret)
return ret;
-
- if (unlikely(file->f_mode & FMODE_NOCMTIME))
- return 0;
-
- ret = inode_needs_update_time(inode);
- if (ret <= 0)
- return ret;
- if (flags & IOCB_NOWAIT)
- return -EAGAIN;
-
- return __file_update_time(file, ret);
+ return file_update_time_flags(file, flags);
}
/**
@@ -2970,7 +2999,7 @@ void dump_inode(struct inode *inode, const char *reason)
pr_warn("%s encountered for inode %px\n"
"fs %s mode %ho opflags 0x%hx flags 0x%x state 0x%x count %d\n",
reason, inode, sb->s_type->name, inode->i_mode, inode->i_opflags,
- inode->i_flags, inode->i_state, atomic_read(&inode->i_count));
+ inode->i_flags, inode_state_read_once(inode), atomic_read(&inode->i_count));
}
EXPORT_SYMBOL(dump_inode);
diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile
index f7e1c8534c46..a572b8808524 100644
--- a/fs/iomap/Makefile
+++ b/fs/iomap/Makefile
@@ -14,5 +14,6 @@ iomap-y += trace.o \
iomap-$(CONFIG_BLOCK) += direct-io.o \
ioend.o \
fiemap.o \
- seek.o
+ seek.o \
+ bio.o
iomap-$(CONFIG_SWAP) += swapfile.o
diff --git a/fs/iomap/bio.c b/fs/iomap/bio.c
new file mode 100644
index 000000000000..fc045f2e4c45
--- /dev/null
+++ b/fs/iomap/bio.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2010 Red Hat, Inc.
+ * Copyright (C) 2016-2023 Christoph Hellwig.
+ */
+#include <linux/iomap.h>
+#include <linux/pagemap.h>
+#include "internal.h"
+#include "trace.h"
+
+static void iomap_read_end_io(struct bio *bio)
+{
+ int error = blk_status_to_errno(bio->bi_status);
+ struct folio_iter fi;
+
+ bio_for_each_folio_all(fi, bio)
+ iomap_finish_folio_read(fi.folio, fi.offset, fi.length, error);
+ bio_put(bio);
+}
+
+static void iomap_bio_submit_read(struct iomap_read_folio_ctx *ctx)
+{
+ struct bio *bio = ctx->read_ctx;
+
+ if (bio)
+ submit_bio(bio);
+}
+
+static int iomap_bio_read_folio_range(const struct iomap_iter *iter,
+ struct iomap_read_folio_ctx *ctx, size_t plen)
+{
+ struct folio *folio = ctx->cur_folio;
+ const struct iomap *iomap = &iter->iomap;
+ loff_t pos = iter->pos;
+ size_t poff = offset_in_folio(folio, pos);
+ loff_t length = iomap_length(iter);
+ sector_t sector;
+ struct bio *bio = ctx->read_ctx;
+
+ sector = iomap_sector(iomap, pos);
+ if (!bio || bio_end_sector(bio) != sector ||
+ !bio_add_folio(bio, folio, plen, poff)) {
+ gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL);
+ gfp_t orig_gfp = gfp;
+ unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE);
+
+ if (bio)
+ submit_bio(bio);
+
+ if (ctx->rac) /* same as readahead_gfp_mask */
+ gfp |= __GFP_NORETRY | __GFP_NOWARN;
+ bio = bio_alloc(iomap->bdev, bio_max_segs(nr_vecs), REQ_OP_READ,
+ gfp);
+ /*
+ * If the bio_alloc fails, try it again for a single page to
+ * avoid having to deal with partial page reads. This emulates
+ * what do_mpage_read_folio does.
+ */
+ if (!bio)
+ bio = bio_alloc(iomap->bdev, 1, REQ_OP_READ, orig_gfp);
+ if (ctx->rac)
+ bio->bi_opf |= REQ_RAHEAD;
+ bio->bi_iter.bi_sector = sector;
+ bio->bi_end_io = iomap_read_end_io;
+ bio_add_folio_nofail(bio, folio, plen, poff);
+ ctx->read_ctx = bio;
+ }
+ return 0;
+}
+
+const struct iomap_read_ops iomap_bio_read_ops = {
+ .read_folio_range = iomap_bio_read_folio_range,
+ .submit_read = iomap_bio_submit_read,
+};
+EXPORT_SYMBOL_GPL(iomap_bio_read_ops);
+
+int iomap_bio_read_folio_range_sync(const struct iomap_iter *iter,
+ struct folio *folio, loff_t pos, size_t len)
+{
+ const struct iomap *srcmap = iomap_iter_srcmap(iter);
+ struct bio_vec bvec;
+ struct bio bio;
+
+ bio_init(&bio, srcmap->bdev, &bvec, 1, REQ_OP_READ);
+ bio.bi_iter.bi_sector = iomap_sector(srcmap, pos);
+ bio_add_folio_nofail(&bio, folio, len, offset_in_folio(folio, pos));
+ return submit_bio_wait(&bio);
+}
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 8b847a1e27f1..e5c1ca440d93 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -8,6 +8,7 @@
#include <linux/writeback.h>
#include <linux/swap.h>
#include <linux/migrate.h>
+#include "internal.h"
#include "trace.h"
#include "../internal.h"
@@ -37,10 +38,28 @@ static inline bool ifs_is_fully_uptodate(struct folio *folio,
return bitmap_full(ifs->state, i_blocks_per_folio(inode, folio));
}
-static inline bool ifs_block_is_uptodate(struct iomap_folio_state *ifs,
- unsigned int block)
+/*
+ * Find the next uptodate block in the folio. end_blk is inclusive.
+ * If no uptodate block is found, this will return end_blk + 1.
+ */
+static unsigned ifs_next_uptodate_block(struct folio *folio,
+ unsigned start_blk, unsigned end_blk)
+{
+ struct iomap_folio_state *ifs = folio->private;
+
+ return find_next_bit(ifs->state, end_blk + 1, start_blk);
+}
+
+/*
+ * Find the next non-uptodate block in the folio. end_blk is inclusive.
+ * If no non-uptodate block is found, this will return end_blk + 1.
+ */
+static unsigned ifs_next_nonuptodate_block(struct folio *folio,
+ unsigned start_blk, unsigned end_blk)
{
- return test_bit(block, ifs->state);
+ struct iomap_folio_state *ifs = folio->private;
+
+ return find_next_zero_bit(ifs->state, end_blk + 1, start_blk);
}
static bool ifs_set_range_uptodate(struct folio *folio,
@@ -75,13 +94,34 @@ static void iomap_set_range_uptodate(struct folio *folio, size_t off,
folio_mark_uptodate(folio);
}
-static inline bool ifs_block_is_dirty(struct folio *folio,
- struct iomap_folio_state *ifs, int block)
+/*
+ * Find the next dirty block in the folio. end_blk is inclusive.
+ * If no dirty block is found, this will return end_blk + 1.
+ */
+static unsigned ifs_next_dirty_block(struct folio *folio,
+ unsigned start_blk, unsigned end_blk)
{
+ struct iomap_folio_state *ifs = folio->private;
struct inode *inode = folio->mapping->host;
- unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
+ unsigned int blks = i_blocks_per_folio(inode, folio);
+
+ return find_next_bit(ifs->state, blks + end_blk + 1,
+ blks + start_blk) - blks;
+}
+
+/*
+ * Find the next clean block in the folio. end_blk is inclusive.
+ * If no clean block is found, this will return end_blk + 1.
+ */
+static unsigned ifs_next_clean_block(struct folio *folio,
+ unsigned start_blk, unsigned end_blk)
+{
+ struct iomap_folio_state *ifs = folio->private;
+ struct inode *inode = folio->mapping->host;
+ unsigned int blks = i_blocks_per_folio(inode, folio);
- return test_bit(block + blks_per_folio, ifs->state);
+ return find_next_zero_bit(ifs->state, blks + end_blk + 1,
+ blks + start_blk) - blks;
}
static unsigned ifs_find_dirty_range(struct folio *folio,
@@ -92,18 +132,17 @@ static unsigned ifs_find_dirty_range(struct folio *folio,
offset_in_folio(folio, *range_start) >> inode->i_blkbits;
unsigned end_blk = min_not_zero(
offset_in_folio(folio, range_end) >> inode->i_blkbits,
- i_blocks_per_folio(inode, folio));
- unsigned nblks = 1;
+ i_blocks_per_folio(inode, folio)) - 1;
+ unsigned nblks;
- while (!ifs_block_is_dirty(folio, ifs, start_blk))
- if (++start_blk == end_blk)
- return 0;
-
- while (start_blk + nblks < end_blk) {
- if (!ifs_block_is_dirty(folio, ifs, start_blk + nblks))
- break;
- nblks++;
- }
+ start_blk = ifs_next_dirty_block(folio, start_blk, end_blk);
+ if (start_blk > end_blk)
+ return 0;
+ if (start_blk == end_blk)
+ nblks = 1;
+ else
+ nblks = ifs_next_clean_block(folio, start_blk + 1, end_blk) -
+ start_blk;
*range_start = folio_pos(folio) + (start_blk << inode->i_blkbits);
return nblks << inode->i_blkbits;
@@ -218,6 +257,22 @@ static void ifs_free(struct folio *folio)
}
/*
+ * Calculate how many bytes to truncate based off the number of blocks to
+ * truncate and the end position to start truncating from.
+ */
+static size_t iomap_bytes_to_truncate(loff_t end_pos, unsigned block_bits,
+ unsigned blocks_truncated)
+{
+ unsigned block_size = 1 << block_bits;
+ unsigned block_offset = end_pos & (block_size - 1);
+
+ if (!block_offset)
+ return blocks_truncated << block_bits;
+
+ return ((blocks_truncated - 1) << block_bits) + block_offset;
+}
+
+/*
* Calculate the range inside the folio that we actually need to read.
*/
static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
@@ -240,24 +295,29 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
* to avoid reading in already uptodate ranges.
*/
if (ifs) {
- unsigned int i;
-
- /* move forward for each leading block marked uptodate */
- for (i = first; i <= last; i++) {
- if (!ifs_block_is_uptodate(ifs, i))
- break;
- *pos += block_size;
- poff += block_size;
- plen -= block_size;
- first++;
+ unsigned int next, blocks_skipped;
+
+ next = ifs_next_nonuptodate_block(folio, first, last);
+ blocks_skipped = next - first;
+
+ if (blocks_skipped) {
+ unsigned long block_offset = *pos & (block_size - 1);
+ unsigned bytes_skipped =
+ (blocks_skipped << block_bits) - block_offset;
+
+ *pos += bytes_skipped;
+ poff += bytes_skipped;
+ plen -= bytes_skipped;
}
+ first = next;
/* truncate len if we find any trailing uptodate block(s) */
- while (++i <= last) {
- if (ifs_block_is_uptodate(ifs, i)) {
- plen -= (last - i + 1) * block_size;
- last = i - 1;
- break;
+ if (++next <= last) {
+ next = ifs_next_uptodate_block(folio, next, last);
+ if (next <= last) {
+ plen -= iomap_bytes_to_truncate(*pos + plen,
+ block_bits, last - next + 1);
+ last = next - 1;
}
}
}
@@ -271,7 +331,8 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
unsigned end = offset_in_folio(folio, isize - 1) >> block_bits;
if (first <= end && last > end)
- plen -= (last - end) * block_size;
+ plen -= iomap_bytes_to_truncate(*pos + plen, block_bits,
+ last - end);
}
*offp = poff;
@@ -320,9 +381,8 @@ static int iomap_read_inline_data(const struct iomap_iter *iter,
return 0;
}
-#ifdef CONFIG_BLOCK
-static void iomap_finish_folio_read(struct folio *folio, size_t off,
- size_t len, int error)
+void iomap_finish_folio_read(struct folio *folio, size_t off, size_t len,
+ int error)
{
struct iomap_folio_state *ifs = folio->private;
bool uptodate = !error;
@@ -342,169 +402,201 @@ static void iomap_finish_folio_read(struct folio *folio, size_t off,
if (finished)
folio_end_read(folio, uptodate);
}
+EXPORT_SYMBOL_GPL(iomap_finish_folio_read);
-static void iomap_read_end_io(struct bio *bio)
+static void iomap_read_init(struct folio *folio)
{
- int error = blk_status_to_errno(bio->bi_status);
- struct folio_iter fi;
+ struct iomap_folio_state *ifs = folio->private;
- bio_for_each_folio_all(fi, bio)
- iomap_finish_folio_read(fi.folio, fi.offset, fi.length, error);
- bio_put(bio);
+ if (ifs) {
+ size_t len = folio_size(folio);
+
+ /*
+ * ifs->read_bytes_pending is used to track how many bytes are
+ * read in asynchronously by the IO helper. We need to track
+ * this so that we can know when the IO helper has finished
+ * reading in all the necessary ranges of the folio and can end
+ * the read.
+ *
+ * Increase ->read_bytes_pending by the folio size to start, and
+ * add a +1 bias. We'll subtract the bias and any uptodate /
+ * zeroed ranges that did not require IO in iomap_read_end()
+ * after we're done processing the folio.
+ *
+ * We do this because otherwise, we would have to increment
+ * ifs->read_bytes_pending every time a range in the folio needs
+ * to be read in, which can get expensive since the spinlock
+ * needs to be held whenever modifying ifs->read_bytes_pending.
+ *
+ * We add the bias to ensure the read has not been ended on the
+ * folio when iomap_read_end() is called, even if the IO helper
+ * has already finished reading in the entire folio.
+ */
+ spin_lock_irq(&ifs->state_lock);
+ WARN_ON_ONCE(ifs->read_bytes_pending != 0);
+ ifs->read_bytes_pending = len + 1;
+ spin_unlock_irq(&ifs->state_lock);
+ }
}
-struct iomap_readpage_ctx {
- struct folio *cur_folio;
- bool cur_folio_in_bio;
- struct bio *bio;
- struct readahead_control *rac;
-};
+/*
+ * This ends IO if no bytes were submitted to an IO helper.
+ *
+ * Otherwise, this calibrates ifs->read_bytes_pending to represent only the
+ * submitted bytes (see comment in iomap_read_init()). If all bytes submitted
+ * have already been completed by the IO helper, then this will end the read.
+ * Else the IO helper will end the read after all submitted ranges have been
+ * read.
+ */
+static void iomap_read_end(struct folio *folio, size_t bytes_submitted)
+{
+ struct iomap_folio_state *ifs = folio->private;
-static int iomap_readpage_iter(struct iomap_iter *iter,
- struct iomap_readpage_ctx *ctx)
+ if (ifs) {
+ bool end_read, uptodate;
+
+ spin_lock_irq(&ifs->state_lock);
+ if (!ifs->read_bytes_pending) {
+ WARN_ON_ONCE(bytes_submitted);
+ spin_unlock_irq(&ifs->state_lock);
+ folio_unlock(folio);
+ return;
+ }
+
+ /*
+ * Subtract any bytes that were initially accounted to
+ * read_bytes_pending but skipped for IO. The +1 accounts for
+ * the bias we added in iomap_read_init().
+ */
+ ifs->read_bytes_pending -=
+ (folio_size(folio) + 1 - bytes_submitted);
+
+ /*
+ * If !ifs->read_bytes_pending, this means all pending reads by
+ * the IO helper have already completed, which means we need to
+ * end the folio read here. If ifs->read_bytes_pending != 0,
+ * the IO helper will end the folio read.
+ */
+ end_read = !ifs->read_bytes_pending;
+ if (end_read)
+ uptodate = ifs_is_fully_uptodate(folio, ifs);
+ spin_unlock_irq(&ifs->state_lock);
+ if (end_read)
+ folio_end_read(folio, uptodate);
+ } else if (!bytes_submitted) {
+ /*
+ * If there were no bytes submitted, this means we are
+ * responsible for unlocking the folio here, since no IO helper
+ * has taken ownership of it. If there were bytes submitted,
+ * then the IO helper will end the read via
+ * iomap_finish_folio_read().
+ */
+ folio_unlock(folio);
+ }
+}
+
+static int iomap_read_folio_iter(struct iomap_iter *iter,
+ struct iomap_read_folio_ctx *ctx, size_t *bytes_submitted)
{
const struct iomap *iomap = &iter->iomap;
loff_t pos = iter->pos;
loff_t length = iomap_length(iter);
struct folio *folio = ctx->cur_folio;
- struct iomap_folio_state *ifs;
size_t poff, plen;
- sector_t sector;
+ loff_t pos_diff;
int ret;
if (iomap->type == IOMAP_INLINE) {
ret = iomap_read_inline_data(iter, folio);
if (ret)
return ret;
- return iomap_iter_advance(iter, &length);
+ return iomap_iter_advance(iter, length);
}
- /* zero post-eof blocks as the page may be mapped */
- ifs = ifs_alloc(iter->inode, folio, iter->flags);
- iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen);
- if (plen == 0)
- goto done;
+ ifs_alloc(iter->inode, folio, iter->flags);
- if (iomap_block_needs_zeroing(iter, pos)) {
- folio_zero_range(folio, poff, plen);
- iomap_set_range_uptodate(folio, poff, plen);
- goto done;
- }
+ length = min_t(loff_t, length,
+ folio_size(folio) - offset_in_folio(folio, pos));
+ while (length) {
+ iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff,
+ &plen);
- ctx->cur_folio_in_bio = true;
- if (ifs) {
- spin_lock_irq(&ifs->state_lock);
- ifs->read_bytes_pending += plen;
- spin_unlock_irq(&ifs->state_lock);
- }
+ pos_diff = pos - iter->pos;
+ if (WARN_ON_ONCE(pos_diff + plen > length))
+ return -EIO;
- sector = iomap_sector(iomap, pos);
- if (!ctx->bio ||
- bio_end_sector(ctx->bio) != sector ||
- !bio_add_folio(ctx->bio, folio, plen, poff)) {
- gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL);
- gfp_t orig_gfp = gfp;
- unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE);
-
- if (ctx->bio)
- submit_bio(ctx->bio);
-
- if (ctx->rac) /* same as readahead_gfp_mask */
- gfp |= __GFP_NORETRY | __GFP_NOWARN;
- ctx->bio = bio_alloc(iomap->bdev, bio_max_segs(nr_vecs),
- REQ_OP_READ, gfp);
- /*
- * If the bio_alloc fails, try it again for a single page to
- * avoid having to deal with partial page reads. This emulates
- * what do_mpage_read_folio does.
- */
- if (!ctx->bio) {
- ctx->bio = bio_alloc(iomap->bdev, 1, REQ_OP_READ,
- orig_gfp);
- }
- if (ctx->rac)
- ctx->bio->bi_opf |= REQ_RAHEAD;
- ctx->bio->bi_iter.bi_sector = sector;
- ctx->bio->bi_end_io = iomap_read_end_io;
- bio_add_folio_nofail(ctx->bio, folio, plen, poff);
- }
+ ret = iomap_iter_advance(iter, pos_diff);
+ if (ret)
+ return ret;
-done:
- /*
- * Move the caller beyond our range so that it keeps making progress.
- * For that, we have to include any leading non-uptodate ranges, but
- * we can skip trailing ones as they will be handled in the next
- * iteration.
- */
- length = pos - iter->pos + plen;
- return iomap_iter_advance(iter, &length);
-}
+ if (plen == 0)
+ return 0;
-static int iomap_read_folio_iter(struct iomap_iter *iter,
- struct iomap_readpage_ctx *ctx)
-{
- int ret;
+ /* zero post-eof blocks as the page may be mapped */
+ if (iomap_block_needs_zeroing(iter, pos)) {
+ folio_zero_range(folio, poff, plen);
+ iomap_set_range_uptodate(folio, poff, plen);
+ } else {
+ if (!*bytes_submitted)
+ iomap_read_init(folio);
+ ret = ctx->ops->read_folio_range(iter, ctx, plen);
+ if (ret)
+ return ret;
+ *bytes_submitted += plen;
+ }
- while (iomap_length(iter)) {
- ret = iomap_readpage_iter(iter, ctx);
+ ret = iomap_iter_advance(iter, plen);
if (ret)
return ret;
+ length -= pos_diff + plen;
+ pos = iter->pos;
}
-
return 0;
}
-int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
+void iomap_read_folio(const struct iomap_ops *ops,
+ struct iomap_read_folio_ctx *ctx)
{
+ struct folio *folio = ctx->cur_folio;
struct iomap_iter iter = {
.inode = folio->mapping->host,
.pos = folio_pos(folio),
.len = folio_size(folio),
};
- struct iomap_readpage_ctx ctx = {
- .cur_folio = folio,
- };
+ size_t bytes_submitted = 0;
int ret;
trace_iomap_readpage(iter.inode, 1);
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.status = iomap_read_folio_iter(&iter, &ctx);
+ iter.status = iomap_read_folio_iter(&iter, ctx,
+ &bytes_submitted);
- if (ctx.bio) {
- submit_bio(ctx.bio);
- WARN_ON_ONCE(!ctx.cur_folio_in_bio);
- } else {
- WARN_ON_ONCE(ctx.cur_folio_in_bio);
- folio_unlock(folio);
- }
+ if (ctx->ops->submit_read)
+ ctx->ops->submit_read(ctx);
- /*
- * Just like mpage_readahead and block_read_full_folio, we always
- * return 0 and just set the folio error flag on errors. This
- * should be cleaned up throughout the stack eventually.
- */
- return 0;
+ iomap_read_end(folio, bytes_submitted);
}
EXPORT_SYMBOL_GPL(iomap_read_folio);
static int iomap_readahead_iter(struct iomap_iter *iter,
- struct iomap_readpage_ctx *ctx)
+ struct iomap_read_folio_ctx *ctx, size_t *cur_bytes_submitted)
{
int ret;
while (iomap_length(iter)) {
if (ctx->cur_folio &&
offset_in_folio(ctx->cur_folio, iter->pos) == 0) {
- if (!ctx->cur_folio_in_bio)
- folio_unlock(ctx->cur_folio);
+ iomap_read_end(ctx->cur_folio, *cur_bytes_submitted);
ctx->cur_folio = NULL;
}
if (!ctx->cur_folio) {
ctx->cur_folio = readahead_folio(ctx->rac);
- ctx->cur_folio_in_bio = false;
+ if (WARN_ON_ONCE(!ctx->cur_folio))
+ return -EINVAL;
+ *cur_bytes_submitted = 0;
}
- ret = iomap_readpage_iter(iter, ctx);
+ ret = iomap_read_folio_iter(iter, ctx, cur_bytes_submitted);
if (ret)
return ret;
}
@@ -514,8 +606,8 @@ static int iomap_readahead_iter(struct iomap_iter *iter,
/**
* iomap_readahead - Attempt to read pages from a file.
- * @rac: Describes the pages to be read.
* @ops: The operations vector for the filesystem.
+ * @ctx: The ctx used for issuing readahead.
*
* This function is for filesystems to call to implement their readahead
* address_space operation.
@@ -527,51 +619,30 @@ static int iomap_readahead_iter(struct iomap_iter *iter,
* function is called with memalloc_nofs set, so allocations will not cause
* the filesystem to be reentered.
*/
-void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops)
+void iomap_readahead(const struct iomap_ops *ops,
+ struct iomap_read_folio_ctx *ctx)
{
+ struct readahead_control *rac = ctx->rac;
struct iomap_iter iter = {
.inode = rac->mapping->host,
.pos = readahead_pos(rac),
.len = readahead_length(rac),
};
- struct iomap_readpage_ctx ctx = {
- .rac = rac,
- };
+ size_t cur_bytes_submitted;
trace_iomap_readahead(rac->mapping->host, readahead_count(rac));
while (iomap_iter(&iter, ops) > 0)
- iter.status = iomap_readahead_iter(&iter, &ctx);
+ iter.status = iomap_readahead_iter(&iter, ctx,
+ &cur_bytes_submitted);
- if (ctx.bio)
- submit_bio(ctx.bio);
- if (ctx.cur_folio) {
- if (!ctx.cur_folio_in_bio)
- folio_unlock(ctx.cur_folio);
- }
-}
-EXPORT_SYMBOL_GPL(iomap_readahead);
-
-static int iomap_read_folio_range(const struct iomap_iter *iter,
- struct folio *folio, loff_t pos, size_t len)
-{
- const struct iomap *srcmap = iomap_iter_srcmap(iter);
- struct bio_vec bvec;
- struct bio bio;
+ if (ctx->ops->submit_read)
+ ctx->ops->submit_read(ctx);
- bio_init(&bio, srcmap->bdev, &bvec, 1, REQ_OP_READ);
- bio.bi_iter.bi_sector = iomap_sector(srcmap, pos);
- bio_add_folio_nofail(&bio, folio, len, offset_in_folio(folio, pos));
- return submit_bio_wait(&bio);
+ if (ctx->cur_folio)
+ iomap_read_end(ctx->cur_folio, cur_bytes_submitted);
}
-#else
-static int iomap_read_folio_range(const struct iomap_iter *iter,
- struct folio *folio, loff_t pos, size_t len)
-{
- WARN_ON_ONCE(1);
- return -EIO;
-}
-#endif /* CONFIG_BLOCK */
+EXPORT_SYMBOL_GPL(iomap_readahead);
/*
* iomap_is_partially_uptodate checks whether blocks within a folio are
@@ -584,7 +655,7 @@ bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
{
struct iomap_folio_state *ifs = folio->private;
struct inode *inode = folio->mapping->host;
- unsigned first, last, i;
+ unsigned first, last;
if (!ifs)
return false;
@@ -596,10 +667,7 @@ bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
first = from >> inode->i_blkbits;
last = (from + count - 1) >> inode->i_blkbits;
- for (i = first; i <= last; i++)
- if (!ifs_block_is_uptodate(ifs, i))
- return false;
- return true;
+ return ifs_next_nonuptodate_block(folio, first, last) > last;
}
EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
@@ -707,7 +775,7 @@ static int __iomap_write_begin(const struct iomap_iter *iter,
* are not changing pagecache contents.
*/
if (!(iter->flags & IOMAP_UNSHARE) && pos <= folio_pos(folio) &&
- pos + len >= folio_pos(folio) + folio_size(folio))
+ pos + len >= folio_next_pos(folio))
return 0;
ifs = ifs_alloc(iter->inode, folio, iter->flags);
@@ -723,9 +791,12 @@ static int __iomap_write_begin(const struct iomap_iter *iter,
if (plen == 0)
break;
- if (!(iter->flags & IOMAP_UNSHARE) &&
- (from <= poff || from >= poff + plen) &&
- (to <= poff || to >= poff + plen))
+ /*
+ * If the read range will be entirely overwritten by the write,
+ * we can skip having to zero/read it in.
+ */
+ if (!(iter->flags & IOMAP_UNSHARE) && from <= poff &&
+ to >= poff + plen)
continue;
if (iomap_block_needs_zeroing(iter, block_start)) {
@@ -742,7 +813,7 @@ static int __iomap_write_begin(const struct iomap_iter *iter,
status = write_ops->read_folio_range(iter,
folio, block_start, plen);
else
- status = iomap_read_folio_range(iter,
+ status = iomap_bio_read_folio_range_sync(iter,
folio, block_start, plen);
if (status)
return status;
@@ -761,6 +832,28 @@ static struct folio *__iomap_get_folio(struct iomap_iter *iter,
if (!mapping_large_folio_support(iter->inode->i_mapping))
len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos));
+ if (iter->fbatch) {
+ struct folio *folio = folio_batch_next(iter->fbatch);
+
+ if (!folio)
+ return NULL;
+
+ /*
+ * The folio mapping generally shouldn't have changed based on
+ * fs locks, but be consistent with filemap lookup and retry
+ * the iter if it does.
+ */
+ folio_lock(folio);
+ if (unlikely(folio->mapping != iter->inode->i_mapping)) {
+ iter->iomap.flags |= IOMAP_F_STALE;
+ folio_unlock(folio);
+ return NULL;
+ }
+
+ folio_get(folio);
+ return folio;
+ }
+
if (write_ops && write_ops->get_folio)
return write_ops->get_folio(iter, pos, len);
return iomap_get_folio(iter, pos, len);
@@ -815,15 +908,14 @@ static int iomap_write_begin(struct iomap_iter *iter,
size_t *poffset, u64 *plen)
{
const struct iomap *srcmap = iomap_iter_srcmap(iter);
- loff_t pos = iter->pos;
+ loff_t pos;
u64 len = min_t(u64, SIZE_MAX, iomap_length(iter));
struct folio *folio;
int status = 0;
len = min_not_zero(len, *plen);
- BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length);
- if (srcmap != &iter->iomap)
- BUG_ON(pos + len > srcmap->offset + srcmap->length);
+ *foliop = NULL;
+ *plen = 0;
if (fatal_signal_pending(current))
return -EINTR;
@@ -833,6 +925,15 @@ static int iomap_write_begin(struct iomap_iter *iter,
return PTR_ERR(folio);
/*
+ * No folio means we're done with a batch. We still have range to
+ * process so return and let the caller iterate and refill the batch.
+ */
+ if (!folio) {
+ WARN_ON_ONCE(!iter->fbatch);
+ return 0;
+ }
+
+ /*
* Now we have a locked folio, before we do anything with it we need to
* check that the iomap we have cached is not stale. The inode extent
* mapping can change due to concurrent IO in flight (e.g.
@@ -852,6 +953,22 @@ static int iomap_write_begin(struct iomap_iter *iter,
}
}
+ /*
+ * The folios in a batch may not be contiguous. If we've skipped
+ * forward, advance the iter to the pos of the current folio. If the
+ * folio starts beyond the end of the mapping, it may have been trimmed
+ * since the lookup for whatever reason. Return a NULL folio to
+ * terminate the op.
+ */
+ if (folio_pos(folio) > iter->pos) {
+ len = min_t(u64, folio_pos(folio) - iter->pos,
+ iomap_length(iter));
+ status = iomap_iter_advance(iter, len);
+ len = iomap_length(iter);
+ if (status || !len)
+ goto out_unlock;
+ }
+
pos = iomap_trim_folio_range(iter, folio, poffset, &len);
if (srcmap->type == IOMAP_INLINE)
@@ -1041,7 +1158,7 @@ retry:
}
} else {
total_written += written;
- iomap_iter_advance(iter, &written);
+ iomap_iter_advance(iter, written);
}
} while (iov_iter_count(i) && iomap_length(iter));
@@ -1082,7 +1199,7 @@ static void iomap_write_delalloc_ifs_punch(struct inode *inode,
struct folio *folio, loff_t start_byte, loff_t end_byte,
struct iomap *iomap, iomap_punch_t punch)
{
- unsigned int first_blk, last_blk, i;
+ unsigned int first_blk, last_blk;
loff_t last_byte;
u8 blkbits = inode->i_blkbits;
struct iomap_folio_state *ifs;
@@ -1097,14 +1214,14 @@ static void iomap_write_delalloc_ifs_punch(struct inode *inode,
if (!ifs)
return;
- last_byte = min_t(loff_t, end_byte - 1,
- folio_pos(folio) + folio_size(folio) - 1);
+ last_byte = min_t(loff_t, end_byte - 1, folio_next_pos(folio) - 1);
first_blk = offset_in_folio(folio, start_byte) >> blkbits;
last_blk = offset_in_folio(folio, last_byte) >> blkbits;
- for (i = first_blk; i <= last_blk; i++) {
- if (!ifs_block_is_dirty(folio, ifs, i))
- punch(inode, folio_pos(folio) + (i << blkbits),
- 1 << blkbits, iomap);
+ while ((first_blk = ifs_next_clean_block(folio, first_blk, last_blk))
+ <= last_blk) {
+ punch(inode, folio_pos(folio) + (first_blk << blkbits),
+ 1 << blkbits, iomap);
+ first_blk++;
}
}
@@ -1129,8 +1246,7 @@ static void iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
* Make sure the next punch start is correctly bound to
* the end of this data range, not the end of the folio.
*/
- *punch_start_byte = min_t(loff_t, end_byte,
- folio_pos(folio) + folio_size(folio));
+ *punch_start_byte = min_t(loff_t, end_byte, folio_next_pos(folio));
}
/*
@@ -1170,7 +1286,7 @@ static void iomap_write_delalloc_scan(struct inode *inode,
start_byte, end_byte, iomap, punch);
/* move offset to start of next folio in range */
- start_byte = folio_pos(folio) + folio_size(folio);
+ start_byte = folio_next_pos(folio);
folio_unlock(folio);
folio_put(folio);
}
@@ -1310,7 +1426,7 @@ static int iomap_unshare_iter(struct iomap_iter *iter,
int status;
if (!iomap_want_unshare_iter(iter))
- return iomap_iter_advance(iter, &bytes);
+ return iomap_iter_advance(iter, bytes);
do {
struct folio *folio;
@@ -1334,10 +1450,10 @@ static int iomap_unshare_iter(struct iomap_iter *iter,
balance_dirty_pages_ratelimited(iter->inode->i_mapping);
- status = iomap_iter_advance(iter, &bytes);
+ status = iomap_iter_advance(iter, bytes);
if (status)
break;
- } while (bytes > 0);
+ } while ((bytes = iomap_length(iter)) > 0);
return status;
}
@@ -1398,6 +1514,12 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero,
if (iter->iomap.flags & IOMAP_F_STALE)
break;
+ /* a NULL folio means we're done with a folio batch */
+ if (!folio) {
+ status = iomap_iter_advance_full(iter);
+ break;
+ }
+
/* warn about zeroing folios beyond eof that won't write back */
WARN_ON_ONCE(folio_pos(folio) > iter->inode->i_size);
@@ -1412,16 +1534,36 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero,
if (WARN_ON_ONCE(!ret))
return -EIO;
- status = iomap_iter_advance(iter, &bytes);
+ status = iomap_iter_advance(iter, bytes);
if (status)
break;
- } while (bytes > 0);
+ } while ((bytes = iomap_length(iter)) > 0);
if (did_zero)
*did_zero = true;
return status;
}
+loff_t
+iomap_fill_dirty_folios(
+ struct iomap_iter *iter,
+ loff_t offset,
+ loff_t length)
+{
+ struct address_space *mapping = iter->inode->i_mapping;
+ pgoff_t start = offset >> PAGE_SHIFT;
+ pgoff_t end = (offset + length - 1) >> PAGE_SHIFT;
+
+ iter->fbatch = kmalloc(sizeof(struct folio_batch), GFP_KERNEL);
+ if (!iter->fbatch)
+ return offset + length;
+ folio_batch_init(iter->fbatch);
+
+ filemap_get_folios_dirty(mapping, &start, end, iter->fbatch);
+ return (start << PAGE_SHIFT);
+}
+EXPORT_SYMBOL_GPL(iomap_fill_dirty_folios);
+
int
iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
const struct iomap_ops *ops,
@@ -1435,46 +1577,26 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
.private = private,
};
struct address_space *mapping = inode->i_mapping;
- unsigned int blocksize = i_blocksize(inode);
- unsigned int off = pos & (blocksize - 1);
- loff_t plen = min_t(loff_t, len, blocksize - off);
int ret;
bool range_dirty;
/*
- * Zero range can skip mappings that are zero on disk so long as
- * pagecache is clean. If pagecache was dirty prior to zero range, the
- * mapping converts on writeback completion and so must be zeroed.
- *
- * The simplest way to deal with this across a range is to flush
- * pagecache and process the updated mappings. To avoid excessive
- * flushing on partial eof zeroing, special case it to zero the
- * unaligned start portion if already dirty in pagecache.
- */
- if (off &&
- filemap_range_needs_writeback(mapping, pos, pos + plen - 1)) {
- iter.len = plen;
- while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.status = iomap_zero_iter(&iter, did_zero,
- write_ops);
-
- iter.len = len - (iter.pos - pos);
- if (ret || !iter.len)
- return ret;
- }
-
- /*
* To avoid an unconditional flush, check pagecache state and only flush
* if dirty and the fs returns a mapping that might convert on
* writeback.
*/
- range_dirty = filemap_range_needs_writeback(inode->i_mapping,
- iter.pos, iter.pos + iter.len - 1);
+ range_dirty = filemap_range_needs_writeback(mapping, iter.pos,
+ iter.pos + iter.len - 1);
while ((ret = iomap_iter(&iter, ops)) > 0) {
const struct iomap *srcmap = iomap_iter_srcmap(&iter);
- if (srcmap->type == IOMAP_HOLE ||
- srcmap->type == IOMAP_UNWRITTEN) {
+ if (WARN_ON_ONCE(iter.fbatch &&
+ srcmap->type != IOMAP_UNWRITTEN))
+ return -EIO;
+
+ if (!iter.fbatch &&
+ (srcmap->type == IOMAP_HOLE ||
+ srcmap->type == IOMAP_UNWRITTEN)) {
s64 status;
if (range_dirty) {
@@ -1526,7 +1648,7 @@ static int iomap_folio_mkwrite_iter(struct iomap_iter *iter,
folio_mark_dirty(folio);
}
- return iomap_iter_advance(iter, &length);
+ return iomap_iter_advance(iter, length);
}
vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops,
@@ -1559,16 +1681,25 @@ out_unlock:
}
EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
-void iomap_start_folio_write(struct inode *inode, struct folio *folio,
- size_t len)
+static void iomap_writeback_init(struct inode *inode, struct folio *folio)
{
struct iomap_folio_state *ifs = folio->private;
WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs);
- if (ifs)
- atomic_add(len, &ifs->write_bytes_pending);
+ if (ifs) {
+ WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending) != 0);
+ /*
+ * Set this to the folio size. After processing the folio for
+ * writeback in iomap_writeback_folio(), we'll subtract any
+ * ranges not written back.
+ *
+ * We do this because otherwise, we would have to atomically
+ * increment ifs->write_bytes_pending every time a range in the
+ * folio needs to be written back.
+ */
+ atomic_set(&ifs->write_bytes_pending, folio_size(folio));
+ }
}
-EXPORT_SYMBOL_GPL(iomap_start_folio_write);
void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
size_t len)
@@ -1585,7 +1716,7 @@ EXPORT_SYMBOL_GPL(iomap_finish_folio_write);
static int iomap_writeback_range(struct iomap_writepage_ctx *wpc,
struct folio *folio, u64 pos, u32 rlen, u64 end_pos,
- bool *wb_pending)
+ size_t *bytes_submitted)
{
do {
ssize_t ret;
@@ -1599,11 +1730,11 @@ static int iomap_writeback_range(struct iomap_writepage_ctx *wpc,
pos += ret;
/*
- * Holes are not be written back by ->writeback_range, so track
+ * Holes are not written back by ->writeback_range, so track
* if we did handle anything that is not a hole here.
*/
if (wpc->iomap.type != IOMAP_HOLE)
- *wb_pending = true;
+ *bytes_submitted += ret;
} while (rlen);
return 0;
@@ -1674,7 +1805,7 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio)
u64 pos = folio_pos(folio);
u64 end_pos = pos + folio_size(folio);
u64 end_aligned = 0;
- bool wb_pending = false;
+ size_t bytes_submitted = 0;
int error = 0;
u32 rlen;
@@ -1694,14 +1825,7 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio)
iomap_set_range_dirty(folio, 0, end_pos - pos);
}
- /*
- * Keep the I/O completion handler from clearing the writeback
- * bit until we have submitted all blocks by adding a bias to
- * ifs->write_bytes_pending, which is dropped after submitting
- * all blocks.
- */
- WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending) != 0);
- iomap_start_folio_write(inode, folio, 1);
+ iomap_writeback_init(inode, folio);
}
/*
@@ -1716,13 +1840,13 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio)
end_aligned = round_up(end_pos, i_blocksize(inode));
while ((rlen = iomap_find_dirty_range(folio, &pos, end_aligned))) {
error = iomap_writeback_range(wpc, folio, pos, rlen, end_pos,
- &wb_pending);
+ &bytes_submitted);
if (error)
break;
pos += rlen;
}
- if (wb_pending)
+ if (bytes_submitted)
wpc->nr_folios++;
/*
@@ -1740,12 +1864,20 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio)
* bit ourselves right after unlocking the page.
*/
if (ifs) {
- if (atomic_dec_and_test(&ifs->write_bytes_pending))
- folio_end_writeback(folio);
- } else {
- if (!wb_pending)
- folio_end_writeback(folio);
+ /*
+ * Subtract any bytes that were initially accounted to
+ * write_bytes_pending but skipped for writeback.
+ */
+ size_t bytes_not_submitted = folio_size(folio) -
+ bytes_submitted;
+
+ if (bytes_not_submitted)
+ iomap_finish_folio_write(inode, folio,
+ bytes_not_submitted);
+ } else if (!bytes_submitted) {
+ folio_end_writeback(folio);
}
+
mapping_set_error(inode->i_mapping, error);
return error;
}
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 5d5d63efbd57..8e273408453a 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -16,21 +16,13 @@
* Private flags for iomap_dio, must not overlap with the public ones in
* iomap.h:
*/
-#define IOMAP_DIO_NO_INVALIDATE (1U << 25)
-#define IOMAP_DIO_CALLER_COMP (1U << 26)
-#define IOMAP_DIO_INLINE_COMP (1U << 27)
+#define IOMAP_DIO_NO_INVALIDATE (1U << 26)
+#define IOMAP_DIO_COMP_WORK (1U << 27)
#define IOMAP_DIO_WRITE_THROUGH (1U << 28)
#define IOMAP_DIO_NEED_SYNC (1U << 29)
#define IOMAP_DIO_WRITE (1U << 30)
#define IOMAP_DIO_DIRTY (1U << 31)
-/*
- * Used for sub block zeroing in iomap_dio_zero()
- */
-#define IOMAP_ZERO_PAGE_SIZE (SZ_64K)
-#define IOMAP_ZERO_PAGE_ORDER (get_order(IOMAP_ZERO_PAGE_SIZE))
-static struct page *zero_page;
-
struct iomap_dio {
struct kiocb *iocb;
const struct iomap_dio_ops *dops;
@@ -140,11 +132,6 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio)
}
EXPORT_SYMBOL_GPL(iomap_dio_complete);
-static ssize_t iomap_dio_deferred_complete(void *data)
-{
- return iomap_dio_complete(data);
-}
-
static void iomap_dio_complete_work(struct work_struct *work)
{
struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
@@ -179,33 +166,33 @@ static void iomap_dio_done(struct iomap_dio *dio)
WRITE_ONCE(dio->submit.waiter, NULL);
blk_wake_io_task(waiter);
- } else if (dio->flags & IOMAP_DIO_INLINE_COMP) {
- WRITE_ONCE(iocb->private, NULL);
- iomap_dio_complete_work(&dio->aio.work);
- } else if (dio->flags & IOMAP_DIO_CALLER_COMP) {
- /*
- * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then
- * schedule our completion that way to avoid an async punt to a
- * workqueue.
- */
- /* only polled IO cares about private cleared */
- iocb->private = dio;
- iocb->dio_complete = iomap_dio_deferred_complete;
+ return;
+ }
- /*
- * Invoke ->ki_complete() directly. We've assigned our
- * dio_complete callback handler, and since the issuer set
- * IOCB_DIO_CALLER_COMP, we know their ki_complete handler will
- * notice ->dio_complete being set and will defer calling that
- * handler until it can be done from a safe task context.
- *
- * Note that the 'res' being passed in here is not important
- * for this case. The actual completion value of the request
- * will be gotten from dio_complete when that is run by the
- * issuer.
- */
- iocb->ki_complete(iocb, 0);
- } else {
+ /*
+ * Always run error completions in user context. These are not
+ * performance critical and some code relies on taking sleeping locks
+ * for error handling.
+ */
+ if (dio->error)
+ dio->flags |= IOMAP_DIO_COMP_WORK;
+
+ /*
+ * Never invalidate pages from this context to avoid deadlocks with
+ * buffered I/O completions when called from the ioend workqueue,
+ * or avoid sleeping when called directly from ->bi_end_io.
+ * Tough luck if you hit the tiny race with someone dirtying the range
+ * right between this check and the actual completion.
+ */
+ if ((dio->flags & IOMAP_DIO_WRITE) &&
+ !(dio->flags & IOMAP_DIO_COMP_WORK)) {
+ if (dio->iocb->ki_filp->f_mapping->nrpages)
+ dio->flags |= IOMAP_DIO_COMP_WORK;
+ else
+ dio->flags |= IOMAP_DIO_NO_INVALIDATE;
+ }
+
+ if (dio->flags & IOMAP_DIO_COMP_WORK) {
struct inode *inode = file_inode(iocb->ki_filp);
/*
@@ -216,7 +203,11 @@ static void iomap_dio_done(struct iomap_dio *dio)
*/
INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
+ return;
}
+
+ WRITE_ONCE(iocb->private, NULL);
+ iomap_dio_complete_work(&dio->aio.work);
}
void iomap_dio_bio_end_io(struct bio *bio)
@@ -252,16 +243,9 @@ u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend)
/*
* Try to avoid another context switch for the completion given
* that we are already called from the ioend completion
- * workqueue, but never invalidate pages from this thread to
- * avoid deadlocks with buffered I/O completions. Tough luck if
- * you hit the tiny race with someone dirtying the range now
- * between this check and the actual completion.
+ * workqueue.
*/
- if (!dio->iocb->ki_filp->f_mapping->nrpages) {
- dio->flags |= IOMAP_DIO_INLINE_COMP;
- dio->flags |= IOMAP_DIO_NO_INVALIDATE;
- }
- dio->flags &= ~IOMAP_DIO_CALLER_COMP;
+ dio->flags &= ~IOMAP_DIO_COMP_WORK;
iomap_dio_done(dio);
}
@@ -285,42 +269,36 @@ static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
{
struct inode *inode = file_inode(dio->iocb->ki_filp);
struct bio *bio;
+ struct folio *zero_folio = largest_zero_folio();
+ int nr_vecs = max(1, i_blocksize(inode) / folio_size(zero_folio));
if (!len)
return 0;
+
/*
- * Max block size supported is 64k
+ * This limit shall never be reached as most filesystems have a
+ * maximum blocksize of 64k.
*/
- if (WARN_ON_ONCE(len > IOMAP_ZERO_PAGE_SIZE))
+ if (WARN_ON_ONCE(nr_vecs > BIO_MAX_VECS))
return -EINVAL;
- bio = iomap_dio_alloc_bio(iter, dio, 1, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE);
+ bio = iomap_dio_alloc_bio(iter, dio, nr_vecs,
+ REQ_OP_WRITE | REQ_SYNC | REQ_IDLE);
fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
GFP_KERNEL);
bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos);
bio->bi_private = dio;
bio->bi_end_io = iomap_dio_bio_end_io;
- __bio_add_page(bio, zero_page, len, 0);
+ while (len > 0) {
+ unsigned int io_len = min(len, folio_size(zero_folio));
+
+ bio_add_folio_nofail(bio, zero_folio, io_len, 0);
+ len -= io_len;
+ }
iomap_dio_submit_bio(iter, dio, bio, pos);
- return 0;
-}
-/*
- * Use a FUA write if we need datasync semantics and this is a pure data I/O
- * that doesn't require any metadata updates (including after I/O completion
- * such as unwritten extent conversion) and the underlying device either
- * doesn't have a volatile write cache or supports FUA.
- * This allows us to avoid cache flushes on I/O completion.
- */
-static inline bool iomap_dio_can_use_fua(const struct iomap *iomap,
- struct iomap_dio *dio)
-{
- if (iomap->flags & (IOMAP_F_SHARED | IOMAP_F_DIRTY))
- return false;
- if (!(dio->flags & IOMAP_DIO_WRITE_THROUGH))
- return false;
- return !bdev_write_cache(iomap->bdev) || bdev_fua(iomap->bdev);
+ return 0;
}
static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
@@ -336,12 +314,39 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
int nr_pages, ret = 0;
u64 copied = 0;
size_t orig_count;
+ unsigned int alignment;
+
+ /*
+ * File systems that write out of place and always allocate new blocks
+ * need each bio to be block aligned as that's the unit of allocation.
+ */
+ if (dio->flags & IOMAP_DIO_FSBLOCK_ALIGNED)
+ alignment = fs_block_size;
+ else
+ alignment = bdev_logical_block_size(iomap->bdev);
- if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1))
+ if ((pos | length) & (alignment - 1))
return -EINVAL;
if (dio->flags & IOMAP_DIO_WRITE) {
- bio_opf |= REQ_OP_WRITE;
+ bool need_completion_work = true;
+
+ switch (iomap->type) {
+ case IOMAP_MAPPED:
+ /*
+ * Directly mapped I/O does not inherently need to do
+ * work at I/O completion time. But there are various
+ * cases below where this will get set again.
+ */
+ need_completion_work = false;
+ break;
+ case IOMAP_UNWRITTEN:
+ dio->flags |= IOMAP_DIO_UNWRITTEN;
+ need_zeroout = true;
+ break;
+ default:
+ break;
+ }
if (iomap->flags & IOMAP_F_ATOMIC_BIO) {
/*
@@ -354,35 +359,54 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
bio_opf |= REQ_ATOMIC;
}
- if (iomap->type == IOMAP_UNWRITTEN) {
- dio->flags |= IOMAP_DIO_UNWRITTEN;
- need_zeroout = true;
- }
-
- if (iomap->flags & IOMAP_F_SHARED)
+ if (iomap->flags & IOMAP_F_SHARED) {
+ /*
+ * Unsharing of needs to update metadata at I/O
+ * completion time.
+ */
+ need_completion_work = true;
dio->flags |= IOMAP_DIO_COW;
+ }
- if (iomap->flags & IOMAP_F_NEW)
+ if (iomap->flags & IOMAP_F_NEW) {
+ /*
+ * Newly allocated blocks might need recording in
+ * metadata at I/O completion time.
+ */
+ need_completion_work = true;
need_zeroout = true;
- else if (iomap->type == IOMAP_MAPPED &&
- iomap_dio_can_use_fua(iomap, dio))
- bio_opf |= REQ_FUA;
+ }
- if (!(bio_opf & REQ_FUA))
- dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
+ /*
+ * Use a FUA write if we need datasync semantics and this is a
+ * pure overwrite that doesn't require any metadata updates.
+ *
+ * This allows us to avoid cache flushes on I/O completion.
+ */
+ if (dio->flags & IOMAP_DIO_WRITE_THROUGH) {
+ if (!need_completion_work &&
+ !(iomap->flags & IOMAP_F_DIRTY) &&
+ (!bdev_write_cache(iomap->bdev) ||
+ bdev_fua(iomap->bdev)))
+ bio_opf |= REQ_FUA;
+ else
+ dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
+ }
/*
- * We can only do deferred completion for pure overwrites that
+ * We can only do inline completion for pure overwrites that
* don't require additional I/O at completion time.
*
- * This rules out writes that need zeroing or extent conversion,
- * extend the file size, or issue metadata I/O or cache flushes
- * during completion processing.
+ * This rules out writes that need zeroing or metdata updates to
+ * convert unwritten or shared extents.
+ *
+ * Writes that extend i_size are also not supported, but this is
+ * handled in __iomap_dio_rw().
*/
- if (need_zeroout || (pos >= i_size_read(inode)) ||
- ((dio->flags & IOMAP_DIO_NEED_SYNC) &&
- !(bio_opf & REQ_FUA)))
- dio->flags &= ~IOMAP_DIO_CALLER_COMP;
+ if (need_completion_work)
+ dio->flags |= IOMAP_DIO_COMP_WORK;
+
+ bio_opf |= REQ_OP_WRITE;
} else {
bio_opf |= REQ_OP_READ;
}
@@ -403,7 +427,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
* ones we set for inline and deferred completions. If none of those
* are available for this IO, clear the polled flag.
*/
- if (!(dio->flags & (IOMAP_DIO_INLINE_COMP|IOMAP_DIO_CALLER_COMP)))
+ if (dio->flags & IOMAP_DIO_COMP_WORK)
dio->iocb->ki_flags &= ~IOCB_HIPRI;
if (need_zeroout) {
@@ -434,7 +458,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
bio->bi_end_io = iomap_dio_bio_end_io;
ret = bio_iov_iter_get_pages(bio, dio->submit.iter,
- bdev_logical_block_size(iomap->bdev) - 1);
+ alignment - 1);
if (unlikely(ret)) {
/*
* We have to stop part way through an IO. We must fall
@@ -496,7 +520,7 @@ out:
/* Undo iter limitation to current extent */
iov_iter_reexpand(dio->submit.iter, orig_count - copied);
if (copied)
- return iomap_iter_advance(iter, &copied);
+ return iomap_iter_advance(iter, copied);
return ret;
}
@@ -507,7 +531,7 @@ static int iomap_dio_hole_iter(struct iomap_iter *iter, struct iomap_dio *dio)
dio->size += length;
if (!length)
return -EFAULT;
- return iomap_iter_advance(iter, &length);
+ return iomap_iter_advance(iter, length);
}
static int iomap_dio_inline_iter(struct iomap_iter *iomi, struct iomap_dio *dio)
@@ -542,7 +566,7 @@ static int iomap_dio_inline_iter(struct iomap_iter *iomi, struct iomap_dio *dio)
dio->size += copied;
if (!copied)
return -EFAULT;
- return iomap_iter_advance(iomi, &copied);
+ return iomap_iter_advance(iomi, copied);
}
static int iomap_dio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
@@ -639,10 +663,10 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (iocb->ki_flags & IOCB_NOWAIT)
iomi.flags |= IOMAP_NOWAIT;
- if (iov_iter_rw(iter) == READ) {
- /* reads can always complete inline */
- dio->flags |= IOMAP_DIO_INLINE_COMP;
+ if (dio_flags & IOMAP_DIO_FSBLOCK_ALIGNED)
+ dio->flags |= IOMAP_DIO_FSBLOCK_ALIGNED;
+ if (iov_iter_rw(iter) == READ) {
if (iomi.pos >= dio->i_size)
goto out_free_dio;
@@ -656,15 +680,6 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
iomi.flags |= IOMAP_WRITE;
dio->flags |= IOMAP_DIO_WRITE;
- /*
- * Flag as supporting deferred completions, if the issuer
- * groks it. This can avoid a workqueue punt for writes.
- * We may later clear this flag if we need to do other IO
- * as part of this IO completion.
- */
- if (iocb->ki_flags & IOCB_DIO_CALLER_COMP)
- dio->flags |= IOMAP_DIO_CALLER_COMP;
-
if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) {
ret = -EAGAIN;
if (iomi.pos >= dio->i_size ||
@@ -694,6 +709,12 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
}
/*
+ * i_size updates must to happen from process context.
+ */
+ if (iomi.pos + iomi.len > dio->i_size)
+ dio->flags |= IOMAP_DIO_COMP_WORK;
+
+ /*
* Try to invalidate cache pages for the range we are writing.
* If this invalidation fails, let the caller fall back to
* buffered I/O.
@@ -717,12 +738,12 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
}
goto out_free_dio;
}
+ }
- if (!wait_for_completion && !inode->i_sb->s_dio_done_wq) {
- ret = sb_init_dio_done_wq(inode->i_sb);
- if (ret < 0)
- goto out_free_dio;
- }
+ if (!wait_for_completion && !inode->i_sb->s_dio_done_wq) {
+ ret = sb_init_dio_done_wq(inode->i_sb);
+ if (ret < 0)
+ goto out_free_dio;
}
inode_dio_begin(inode);
@@ -765,9 +786,14 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
* If all the writes we issued were already written through to the
* media, we don't need to flush the cache on IO completion. Clear the
* sync flag for this case.
+ *
+ * Otherwise clear the inline completion flag if any sync work is
+ * needed, as that needs to be performed from process context.
*/
if (dio->flags & IOMAP_DIO_WRITE_THROUGH)
dio->flags &= ~IOMAP_DIO_NEED_SYNC;
+ else if (dio->flags & IOMAP_DIO_NEED_SYNC)
+ dio->flags |= IOMAP_DIO_COMP_WORK;
/*
* We are about to drop our additional submission reference, which
@@ -825,15 +851,3 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
return iomap_dio_complete(dio);
}
EXPORT_SYMBOL_GPL(iomap_dio_rw);
-
-static int __init iomap_dio_init(void)
-{
- zero_page = alloc_pages(GFP_KERNEL | __GFP_ZERO,
- IOMAP_ZERO_PAGE_ORDER);
-
- if (!zero_page)
- return -ENOMEM;
-
- return 0;
-}
-fs_initcall(iomap_dio_init);
diff --git a/fs/iomap/internal.h b/fs/iomap/internal.h
index d05cb3aed96e..3a4e4aad2bd1 100644
--- a/fs/iomap/internal.h
+++ b/fs/iomap/internal.h
@@ -6,4 +6,16 @@
u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend);
+#ifdef CONFIG_BLOCK
+int iomap_bio_read_folio_range_sync(const struct iomap_iter *iter,
+ struct folio *folio, loff_t pos, size_t len);
+#else
+static inline int iomap_bio_read_folio_range_sync(const struct iomap_iter *iter,
+ struct folio *folio, loff_t pos, size_t len)
+{
+ WARN_ON_ONCE(1);
+ return -EIO;
+}
+#endif /* CONFIG_BLOCK */
+
#endif /* _IOMAP_INTERNAL_H */
diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c
index b49fa75eab26..86f44922ed3b 100644
--- a/fs/iomap/ioend.c
+++ b/fs/iomap/ioend.c
@@ -194,8 +194,6 @@ new_ioend:
if (!bio_add_folio(&ioend->io_bio, folio, map_len, poff))
goto new_ioend;
- iomap_start_folio_write(wpc->inode, folio, map_len);
-
/*
* Clamp io_offset and io_size to the incore EOF so that ondisk
* file size updates in the ioend completion are byte-accurate.
diff --git a/fs/iomap/iter.c b/fs/iomap/iter.c
index cef77ca0c20b..8692e5e41c6d 100644
--- a/fs/iomap/iter.c
+++ b/fs/iomap/iter.c
@@ -8,22 +8,24 @@
static inline void iomap_iter_reset_iomap(struct iomap_iter *iter)
{
+ if (iter->fbatch) {
+ folio_batch_release(iter->fbatch);
+ kfree(iter->fbatch);
+ iter->fbatch = NULL;
+ }
+
iter->status = 0;
memset(&iter->iomap, 0, sizeof(iter->iomap));
memset(&iter->srcmap, 0, sizeof(iter->srcmap));
}
-/*
- * Advance the current iterator position and output the length remaining for the
- * current mapping.
- */
-int iomap_iter_advance(struct iomap_iter *iter, u64 *count)
+/* Advance the current iterator position and decrement the remaining length */
+int iomap_iter_advance(struct iomap_iter *iter, u64 count)
{
- if (WARN_ON_ONCE(*count > iomap_length(iter)))
+ if (WARN_ON_ONCE(count > iomap_length(iter)))
return -EIO;
- iter->pos += *count;
- iter->len -= *count;
- *count = iomap_length(iter);
+ iter->pos += count;
+ iter->len -= count;
return 0;
}
diff --git a/fs/iomap/seek.c b/fs/iomap/seek.c
index 56db2dd4b10d..6cbc587c93da 100644
--- a/fs/iomap/seek.c
+++ b/fs/iomap/seek.c
@@ -16,13 +16,13 @@ static int iomap_seek_hole_iter(struct iomap_iter *iter,
*hole_pos = mapping_seek_hole_data(iter->inode->i_mapping,
iter->pos, iter->pos + length, SEEK_HOLE);
if (*hole_pos == iter->pos + length)
- return iomap_iter_advance(iter, &length);
+ return iomap_iter_advance(iter, length);
return 0;
case IOMAP_HOLE:
*hole_pos = iter->pos;
return 0;
default:
- return iomap_iter_advance(iter, &length);
+ return iomap_iter_advance(iter, length);
}
}
@@ -59,12 +59,12 @@ static int iomap_seek_data_iter(struct iomap_iter *iter,
switch (iter->iomap.type) {
case IOMAP_HOLE:
- return iomap_iter_advance(iter, &length);
+ return iomap_iter_advance(iter, length);
case IOMAP_UNWRITTEN:
*hole_pos = mapping_seek_hole_data(iter->inode->i_mapping,
iter->pos, iter->pos + length, SEEK_DATA);
if (*hole_pos < 0)
- return iomap_iter_advance(iter, &length);
+ return iomap_iter_advance(iter, length);
return 0;
default:
*hole_pos = iter->pos;
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
index a61c1dae4742..532787277b16 100644
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -122,9 +122,10 @@ DEFINE_RANGE_EVENT(iomap_zero_iter);
#define IOMAP_DIO_STRINGS \
- {IOMAP_DIO_FORCE_WAIT, "DIO_FORCE_WAIT" }, \
- {IOMAP_DIO_OVERWRITE_ONLY, "DIO_OVERWRITE_ONLY" }, \
- {IOMAP_DIO_PARTIAL, "DIO_PARTIAL" }
+ {IOMAP_DIO_FORCE_WAIT, "DIO_FORCE_WAIT" }, \
+ {IOMAP_DIO_OVERWRITE_ONLY, "DIO_OVERWRITE_ONLY" }, \
+ {IOMAP_DIO_PARTIAL, "DIO_PARTIAL" }, \
+ {IOMAP_DIO_FSBLOCK_ALIGNED, "DIO_FSBLOCK_ALIGNED" }
DECLARE_EVENT_CLASS(iomap_class,
TP_PROTO(struct inode *inode, struct iomap *iomap),
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 6f0e6b19383c..b7cbe126faf3 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -610,6 +610,11 @@ static int isofs_fill_super(struct super_block *s, struct fs_context *fc)
goto out_freesbi;
}
opt->blocksize = sb_min_blocksize(s, opt->blocksize);
+ if (!opt->blocksize) {
+ printk(KERN_ERR
+ "ISOFS: unable to set blocksize\n");
+ goto out_freesbi;
+ }
sbi->s_high_sierra = 0; /* default is iso9660 */
sbi->s_session = opt->session;
@@ -1515,7 +1520,7 @@ struct inode *__isofs_iget(struct super_block *sb,
if (!inode)
return ERR_PTR(-ENOMEM);
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
ret = isofs_read_inode(inode, relocated);
if (ret < 0) {
iget_failed(inode);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index c7867139af69..3e510564de6e 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1659,6 +1659,7 @@ int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh)
int drop_reserve = 0;
int err = 0;
int was_modified = 0;
+ int wait_for_writeback = 0;
if (is_handle_aborted(handle))
return -EROFS;
@@ -1782,18 +1783,22 @@ int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh)
}
/*
- * The buffer is still not written to disk, we should
- * attach this buffer to current transaction so that the
- * buffer can be checkpointed only after the current
- * transaction commits.
+ * The buffer has not yet been written to disk. We should
+ * either clear the buffer or ensure that the ongoing I/O
+ * is completed, and attach this buffer to current
+ * transaction so that the buffer can be checkpointed only
+ * after the current transaction commits.
*/
clear_buffer_dirty(bh);
+ wait_for_writeback = 1;
__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
spin_unlock(&journal->j_list_lock);
}
drop:
__brelse(bh);
spin_unlock(&jh->b_state_lock);
+ if (wait_for_writeback)
+ wait_on_buffer(bh);
jbd2_journal_put_journal_head(jh);
if (drop_reserve) {
/* no need to reserve log space for this block -bzzz */
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index d175cccb7c55..764bba8ba999 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -265,7 +265,7 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino)
inode = iget_locked(sb, ino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
f = JFFS2_INODE_INFO(inode);
@@ -373,7 +373,7 @@ void jffs2_dirty_inode(struct inode *inode, int flags)
{
struct iattr iattr;
- if (!(inode->i_state & I_DIRTY_DATASYNC)) {
+ if (!(inode_state_read_once(inode) & I_DIRTY_DATASYNC)) {
jffs2_dbg(2, "%s(): not calling setattr() for ino #%lu\n",
__func__, inode->i_ino);
return;
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 2a4a288b821c..87ad042221e7 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -26,8 +26,8 @@ int jfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
return rc;
inode_lock(inode);
- if (!(inode->i_state & I_DIRTY_ALL) ||
- (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) {
+ if (!(inode_state_read_once(inode) & I_DIRTY_ALL) ||
+ (datasync && !(inode_state_read_once(inode) & I_DIRTY_DATASYNC))) {
/* Make sure committed changes hit the disk */
jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1);
inode_unlock(inode);
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 21f3d029da7d..4709762713ef 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -29,7 +29,7 @@ struct inode *jfs_iget(struct super_block *sb, unsigned long ino)
inode = iget_locked(sb, ino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
ret = diRead(inode);
diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h
index 10934f9a11be..5aaafedb8fbc 100644
--- a/fs/jfs/jfs_incore.h
+++ b/fs/jfs/jfs_incore.h
@@ -76,14 +76,14 @@ struct jfs_inode_info {
struct {
unchar _unused[16]; /* 16: */
dxd_t _dxd; /* 16: */
- /* _inline may overflow into _inline_ea when needed */
+ /* _inline_sym may overflow into _inline_ea when needed */
/* _inline_ea may overlay the last part of
* file._xtroot if maxentry = XTROOTINITSLOT
*/
union {
struct {
/* 128: inline symlink */
- unchar _inline[128];
+ unchar _inline_sym[128];
/* 128: inline extended attr */
unchar _inline_ea[128];
};
@@ -101,7 +101,7 @@ struct jfs_inode_info {
#define i_imap u.file._imap
#define i_dirtable u.dir._table
#define i_dtroot u.dir._dtroot
-#define i_inline u.link._inline
+#define i_inline u.link._inline_sym
#define i_inline_ea u.link._inline_ea
#define i_inline_all u.link._inline_all
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index 7840a03e5bcb..c16578af3a77 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -1287,7 +1287,7 @@ int txCommit(tid_t tid, /* transaction identifier */
* to verify this, only a trivial s/I_LOCK/I_SYNC/ was done.
* Joern
*/
- if (tblk->u.ip->i_state & I_SYNC)
+ if (inode_state_read_once(tblk->u.ip) & I_SYNC)
tblk->xflag &= ~COMMIT_LAZY;
}
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 457f91c412d4..a36aaee98dce 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -251,7 +251,7 @@ struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn)
struct inode *inode;
inode = iget_locked(sb, kernfs_ino(kn));
- if (inode && (inode->i_state & I_NEW))
+ if (inode && (inode_state_read_once(inode) & I_NEW))
kernfs_init_inode(kn, inode);
return inode;
diff --git a/fs/libfs.c b/fs/libfs.c
index ce8c496a6940..1661dcb7d983 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -680,6 +680,7 @@ static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc)
s->s_export_op = ctx->eops;
s->s_xattr = ctx->xattr;
s->s_time_gran = 1;
+ s->s_d_flags |= ctx->s_d_flags;
root = new_inode(s);
if (!root)
return -ENOMEM;
@@ -1542,9 +1543,9 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end,
inode_lock(inode);
ret = sync_mapping_buffers(inode->i_mapping);
- if (!(inode->i_state & I_DIRTY_ALL))
+ if (!(inode_state_read_once(inode) & I_DIRTY_ALL))
goto out;
- if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
+ if (datasync && !(inode_state_read_once(inode) & I_DIRTY_DATASYNC))
goto out;
err = sync_inode_metadata(inode, 1);
@@ -1664,7 +1665,7 @@ struct inode *alloc_anon_inode(struct super_block *s)
* list because mark_inode_dirty() will think
* that it already _is_ on the dirty list.
*/
- inode->i_state = I_DIRTY;
+ inode_state_assign_raw(inode, I_DIRTY);
/*
* Historically anonymous inodes don't have a type at all and
* userspace has come to rely on this.
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 7897f5123b3d..51ea9bdc813f 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -605,7 +605,7 @@ struct inode *minix_iget(struct super_block *sb, unsigned long ino)
inode = iget_locked(sb, ino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
if (INODE_VERSION(inode) == MINIX_V1)
diff --git a/fs/mount.h b/fs/mount.h
index f13a28752d0b..2d28ef2a3aed 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -27,6 +27,7 @@ struct mnt_namespace {
unsigned int nr_mounts; /* # of mounts in the namespace */
unsigned int pending_mounts;
refcount_t passive; /* number references not pinning @mounts */
+ bool is_anon;
} __randomize_layout;
struct mnt_pcp {
@@ -175,7 +176,7 @@ static inline bool is_local_mountpoint(const struct dentry *dentry)
static inline bool is_anon_ns(struct mnt_namespace *ns)
{
- return ns->ns.ns_id == 0;
+ return ns->is_anon;
}
static inline bool anon_ns_root(const struct mount *m)
diff --git a/fs/namei.c b/fs/namei.c
index 7377020a2cba..eaeb9278ffee 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -282,7 +282,7 @@ void putname(struct filename *name)
return;
refcnt = atomic_read(&name->refcnt);
- if (refcnt != 1) {
+ if (unlikely(refcnt != 1)) {
if (WARN_ON_ONCE(!refcnt))
return;
@@ -290,7 +290,7 @@ void putname(struct filename *name)
return;
}
- if (name->name != name->iname) {
+ if (unlikely(name->name != name->iname)) {
__putname(name->name);
kfree(name);
} else
@@ -540,10 +540,13 @@ static inline int do_inode_permission(struct mnt_idmap *idmap,
* @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
*
* Separate out file-system wide checks from inode-specific permission checks.
+ *
+ * Note: lookup_inode_permission_may_exec() does not call here. If you add
+ * MAY_EXEC checks, adjust it.
*/
static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
{
- if (unlikely(mask & MAY_WRITE)) {
+ if (mask & MAY_WRITE) {
umode_t mode = inode->i_mode;
/* Nobody gets write access to a read-only fs. */
@@ -574,7 +577,7 @@ int inode_permission(struct mnt_idmap *idmap,
if (unlikely(retval))
return retval;
- if (unlikely(mask & MAY_WRITE)) {
+ if (mask & MAY_WRITE) {
/*
* Nobody gets write access to an immutable file.
*/
@@ -602,6 +605,42 @@ int inode_permission(struct mnt_idmap *idmap,
}
EXPORT_SYMBOL(inode_permission);
+/*
+ * lookup_inode_permission_may_exec - Check traversal right for given inode
+ *
+ * This is a special case routine for may_lookup() making assumptions specific
+ * to path traversal. Use inode_permission() if you are doing something else.
+ *
+ * Work is shaved off compared to inode_permission() as follows:
+ * - we know for a fact there is no MAY_WRITE to worry about
+ * - it is an invariant the inode is a directory
+ *
+ * Since majority of real-world traversal happens on inodes which grant it for
+ * everyone, we check it upfront and only resort to more expensive work if it
+ * fails.
+ *
+ * Filesystems which have their own ->permission hook and consequently miss out
+ * on IOP_FASTPERM can still get the optimization if they set IOP_FASTPERM_MAY_EXEC
+ * on their directory inodes.
+ */
+static __always_inline int lookup_inode_permission_may_exec(struct mnt_idmap *idmap,
+ struct inode *inode, int mask)
+{
+ /* Lookup already checked this to return -ENOTDIR */
+ VFS_BUG_ON_INODE(!S_ISDIR(inode->i_mode), inode);
+ VFS_BUG_ON((mask & ~MAY_NOT_BLOCK) != 0);
+
+ mask |= MAY_EXEC;
+
+ if (unlikely(!(inode->i_opflags & (IOP_FASTPERM | IOP_FASTPERM_MAY_EXEC))))
+ return inode_permission(idmap, inode, mask);
+
+ if (unlikely(((inode->i_mode & 0111) != 0111) || !no_acl_inode(inode)))
+ return inode_permission(idmap, inode, mask);
+
+ return security_inode_permission(inode, mask);
+}
+
/**
* path_get - get a reference to a path
* @path: path to get the reference to
@@ -746,7 +785,8 @@ static void leave_rcu(struct nameidata *nd)
static void terminate_walk(struct nameidata *nd)
{
- drop_links(nd);
+ if (unlikely(nd->depth))
+ drop_links(nd);
if (!(nd->flags & LOOKUP_RCU)) {
int i;
path_put(&nd->path);
@@ -843,7 +883,7 @@ static bool try_to_unlazy(struct nameidata *nd)
BUG_ON(!(nd->flags & LOOKUP_RCU));
- if (unlikely(!legitimize_links(nd)))
+ if (unlikely(nd->depth && !legitimize_links(nd)))
goto out1;
if (unlikely(!legitimize_path(nd, &nd->path, nd->seq)))
goto out;
@@ -878,7 +918,7 @@ static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry)
int res;
BUG_ON(!(nd->flags & LOOKUP_RCU));
- if (unlikely(!legitimize_links(nd)))
+ if (unlikely(nd->depth && !legitimize_links(nd)))
goto out2;
res = __legitimize_mnt(nd->path.mnt, nd->m_seq);
if (unlikely(res)) {
@@ -951,8 +991,8 @@ static int complete_walk(struct nameidata *nd)
* We don't want to zero nd->root for scoped-lookups or
* externally-managed nd->root.
*/
- if (!(nd->state & ND_ROOT_PRESET))
- if (!(nd->flags & LOOKUP_IS_SCOPED))
+ if (likely(!(nd->state & ND_ROOT_PRESET)))
+ if (likely(!(nd->flags & LOOKUP_IS_SCOPED)))
nd->root.mnt = NULL;
nd->flags &= ~LOOKUP_CACHED;
if (!try_to_unlazy(nd))
@@ -1034,7 +1074,7 @@ static int nd_jump_root(struct nameidata *nd)
}
if (!nd->root.mnt) {
int error = set_root(nd);
- if (error)
+ if (unlikely(error))
return error;
}
if (nd->flags & LOOKUP_RCU) {
@@ -1632,13 +1672,15 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
path->dentry = dentry;
if (nd->flags & LOOKUP_RCU) {
unsigned int seq = nd->next_seq;
+ if (likely(!d_managed(dentry)))
+ return 0;
if (likely(__follow_mount_rcu(nd, path)))
return 0;
// *path and nd->next_seq might've been clobbered
path->mnt = nd->path.mnt;
path->dentry = dentry;
nd->next_seq = seq;
- if (!try_to_unlazy_next(nd, dentry))
+ if (unlikely(!try_to_unlazy_next(nd, dentry)))
return -ECHILD;
}
ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags);
@@ -1823,7 +1865,7 @@ again:
return dentry;
}
-static struct dentry *lookup_slow(const struct qstr *name,
+static noinline struct dentry *lookup_slow(const struct qstr *name,
struct dentry *dir,
unsigned int flags)
{
@@ -1855,7 +1897,7 @@ static inline int may_lookup(struct mnt_idmap *idmap,
int err, mask;
mask = nd->flags & LOOKUP_RCU ? MAY_NOT_BLOCK : 0;
- err = inode_permission(idmap, nd->inode, mask | MAY_EXEC);
+ err = lookup_inode_permission_may_exec(idmap, nd->inode, mask);
if (likely(!err))
return 0;
@@ -1870,7 +1912,7 @@ static inline int may_lookup(struct mnt_idmap *idmap,
if (err != -ECHILD) // hard error
return err;
- return inode_permission(idmap, nd->inode, MAY_EXEC);
+ return lookup_inode_permission_may_exec(idmap, nd->inode, 0);
}
static int reserve_stack(struct nameidata *nd, struct path *link)
@@ -1901,13 +1943,23 @@ static int reserve_stack(struct nameidata *nd, struct path *link)
enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4};
-static const char *pick_link(struct nameidata *nd, struct path *link,
+static noinline const char *pick_link(struct nameidata *nd, struct path *link,
struct inode *inode, int flags)
{
struct saved *last;
const char *res;
- int error = reserve_stack(nd, link);
+ int error;
+
+ if (nd->flags & LOOKUP_RCU) {
+ /* make sure that d_is_symlink from step_into_slowpath() matches the inode */
+ if (read_seqcount_retry(&link->dentry->d_seq, nd->next_seq))
+ return ERR_PTR(-ECHILD);
+ } else {
+ if (link->mnt == nd->path.mnt)
+ mntget(link->mnt);
+ }
+ error = reserve_stack(nd, link);
if (unlikely(error)) {
if (!(nd->flags & LOOKUP_RCU))
path_put(link);
@@ -1981,14 +2033,15 @@ all_done: // pure jump
*
* NOTE: dentry must be what nd->next_seq had been sampled from.
*/
-static const char *step_into(struct nameidata *nd, int flags,
+static noinline const char *step_into_slowpath(struct nameidata *nd, int flags,
struct dentry *dentry)
{
struct path path;
struct inode *inode;
- int err = handle_mounts(nd, dentry, &path);
+ int err;
- if (err < 0)
+ err = handle_mounts(nd, dentry, &path);
+ if (unlikely(err < 0))
return ERR_PTR(err);
inode = path.dentry->d_inode;
if (likely(!d_is_symlink(path.dentry)) ||
@@ -2010,15 +2063,32 @@ static const char *step_into(struct nameidata *nd, int flags,
nd->seq = nd->next_seq;
return NULL;
}
- if (nd->flags & LOOKUP_RCU) {
- /* make sure that d_is_symlink above matches inode */
- if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq))
+ return pick_link(nd, &path, inode, flags);
+}
+
+static __always_inline const char *step_into(struct nameidata *nd, int flags,
+ struct dentry *dentry)
+{
+ /*
+ * In the common case we are in rcu-walk and traversing over a non-mounted on
+ * directory (as opposed to e.g., a symlink).
+ *
+ * We can handle that and negative entries with the checks below.
+ */
+ if (likely((nd->flags & LOOKUP_RCU) &&
+ !d_managed(dentry) && !d_is_symlink(dentry))) {
+ struct inode *inode = dentry->d_inode;
+ if (read_seqcount_retry(&dentry->d_seq, nd->next_seq))
return ERR_PTR(-ECHILD);
- } else {
- if (path.mnt == nd->path.mnt)
- mntget(path.mnt);
+ if (unlikely(!inode))
+ return ERR_PTR(-ENOENT);
+ nd->path.dentry = dentry;
+ /* nd->path.mnt is retained on purpose */
+ nd->inode = inode;
+ nd->seq = nd->next_seq;
+ return NULL;
}
- return pick_link(nd, &path, inode, flags);
+ return step_into_slowpath(nd, flags, dentry);
}
static struct dentry *follow_dotdot_rcu(struct nameidata *nd)
@@ -2101,7 +2171,7 @@ static const char *handle_dots(struct nameidata *nd, int type)
if (!nd->root.mnt) {
error = ERR_PTR(set_root(nd));
- if (error)
+ if (unlikely(error))
return error;
}
if (nd->flags & LOOKUP_RCU)
@@ -2131,7 +2201,7 @@ static const char *handle_dots(struct nameidata *nd, int type)
return NULL;
}
-static const char *walk_component(struct nameidata *nd, int flags)
+static __always_inline const char *walk_component(struct nameidata *nd, int flags)
{
struct dentry *dentry;
/*
@@ -2140,7 +2210,7 @@ static const char *walk_component(struct nameidata *nd, int flags)
* parent relationships.
*/
if (unlikely(nd->last_type != LAST_NORM)) {
- if (!(flags & WALK_MORE) && nd->depth)
+ if (unlikely(nd->depth) && !(flags & WALK_MORE))
put_link(nd);
return handle_dots(nd, nd->last_type);
}
@@ -2152,7 +2222,7 @@ static const char *walk_component(struct nameidata *nd, int flags)
if (IS_ERR(dentry))
return ERR_CAST(dentry);
}
- if (!(flags & WALK_MORE) && nd->depth)
+ if (unlikely(nd->depth) && !(flags & WALK_MORE))
put_link(nd);
return step_into(nd, flags, dentry);
}
@@ -2505,7 +2575,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
if (unlikely(!*name)) {
OK:
/* pathname or trailing symlink, done */
- if (!depth) {
+ if (likely(!depth)) {
nd->dir_vfsuid = i_uid_into_vfsuid(idmap, nd->inode);
nd->dir_mode = nd->inode->i_mode;
nd->flags &= ~LOOKUP_PARENT;
@@ -2543,10 +2613,10 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
const char *s = nd->pathname;
/* LOOKUP_CACHED requires RCU, ask caller to retry */
- if ((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED)
+ if (unlikely((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED))
return ERR_PTR(-EAGAIN);
- if (!*s)
+ if (unlikely(!*s))
flags &= ~LOOKUP_RCU;
if (flags & LOOKUP_RCU)
rcu_read_lock();
@@ -2560,7 +2630,7 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount);
smp_rmb();
- if (nd->state & ND_ROOT_PRESET) {
+ if (unlikely(nd->state & ND_ROOT_PRESET)) {
struct dentry *root = nd->root.dentry;
struct inode *inode = root->d_inode;
if (*s && unlikely(!d_can_lookup(root)))
@@ -2579,7 +2649,7 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
nd->root.mnt = NULL;
/* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */
- if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) {
+ if (*s == '/' && likely(!(flags & LOOKUP_IN_ROOT))) {
error = nd_jump_root(nd);
if (unlikely(error))
return ERR_PTR(error);
@@ -2632,7 +2702,7 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
}
/* For scoped-lookups we need to set the root to the dirfd as well. */
- if (flags & LOOKUP_IS_SCOPED) {
+ if (unlikely(flags & LOOKUP_IS_SCOPED)) {
nd->root = nd->path;
if (flags & LOOKUP_RCU) {
nd->root_seq = nd->seq;
@@ -4036,7 +4106,7 @@ int vfs_tmpfile(struct mnt_idmap *idmap,
inode = file_inode(file);
if (!(open_flag & O_EXCL)) {
spin_lock(&inode->i_lock);
- inode->i_state |= I_LINKABLE;
+ inode_state_set(inode, I_LINKABLE);
spin_unlock(&inode->i_lock);
}
security_inode_post_create_tmpfile(idmap, inode);
@@ -4931,7 +5001,7 @@ int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap,
inode_lock(inode);
/* Make sure we don't allow creating hardlink to an unlinked file */
- if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
+ if (inode->i_nlink == 0 && !(inode_state_read_once(inode) & I_LINKABLE))
error = -ENOENT;
else if (max_links && inode->i_nlink >= max_links)
error = -EMLINK;
@@ -4941,9 +5011,9 @@ int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap,
error = dir->i_op->link(old_dentry, dir, new_dentry);
}
- if (!error && (inode->i_state & I_LINKABLE)) {
+ if (!error && (inode_state_read_once(inode) & I_LINKABLE)) {
spin_lock(&inode->i_lock);
- inode->i_state &= ~I_LINKABLE;
+ inode_state_clear(inode, I_LINKABLE);
spin_unlock(&inode->i_lock);
}
inode_unlock(inode);
diff --git a/fs/namespace.c b/fs/namespace.c
index d82910f33dc4..d7afac807ac3 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -132,16 +132,6 @@ EXPORT_SYMBOL_GPL(fs_kobj);
*/
__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
-static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
-{
- struct ns_common *ns;
-
- if (!node)
- return NULL;
- ns = rb_entry(node, struct ns_common, ns_tree_node);
- return container_of(ns, struct mnt_namespace, ns);
-}
-
static void mnt_ns_release(struct mnt_namespace *ns)
{
/* keep alive for {list,stat}mount() */
@@ -151,7 +141,8 @@ static void mnt_ns_release(struct mnt_namespace *ns)
kfree(ns);
}
}
-DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))
+DEFINE_FREE(mnt_ns_release, struct mnt_namespace *,
+ if (!IS_ERR(_T)) mnt_ns_release(_T))
static void mnt_ns_release_rcu(struct rcu_head *rcu)
{
@@ -1345,26 +1336,12 @@ static void delayed_mntput(struct work_struct *unused)
}
static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput);
-static void mntput_no_expire(struct mount *mnt)
+static void noinline mntput_no_expire_slowpath(struct mount *mnt)
{
LIST_HEAD(list);
int count;
- rcu_read_lock();
- if (likely(READ_ONCE(mnt->mnt_ns))) {
- /*
- * Since we don't do lock_mount_hash() here,
- * ->mnt_ns can change under us. However, if it's
- * non-NULL, then there's a reference that won't
- * be dropped until after an RCU delay done after
- * turning ->mnt_ns NULL. So if we observe it
- * non-NULL under rcu_read_lock(), the reference
- * we are dropping is not the final one.
- */
- mnt_add_count(mnt, -1);
- rcu_read_unlock();
- return;
- }
+ VFS_BUG_ON(mnt->mnt_ns);
lock_mount_hash();
/*
* make sure that if __legitimize_mnt() has not seen us grab
@@ -1415,6 +1392,26 @@ static void mntput_no_expire(struct mount *mnt)
cleanup_mnt(mnt);
}
+static void mntput_no_expire(struct mount *mnt)
+{
+ rcu_read_lock();
+ if (likely(READ_ONCE(mnt->mnt_ns))) {
+ /*
+ * Since we don't do lock_mount_hash() here,
+ * ->mnt_ns can change under us. However, if it's
+ * non-NULL, then there's a reference that won't
+ * be dropped until after an RCU delay done after
+ * turning ->mnt_ns NULL. So if we observe it
+ * non-NULL under rcu_read_lock(), the reference
+ * we are dropping is not the final one.
+ */
+ mnt_add_count(mnt, -1);
+ rcu_read_unlock();
+ return;
+ }
+ mntput_no_expire_slowpath(mnt);
+}
+
void mntput(struct vfsmount *mnt)
{
if (mnt) {
@@ -4093,8 +4090,9 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
dec_mnt_namespaces(ucounts);
return ERR_PTR(ret);
}
- if (!anon)
- ns_tree_gen_id(&new_ns->ns);
+ ns_tree_gen_id(new_ns);
+
+ new_ns->is_anon = anon;
refcount_set(&new_ns->passive, 1);
new_ns->mounts = RB_ROOT;
init_waitqueue_head(&new_ns->poll);
@@ -5454,11 +5452,11 @@ static int statmount_string(struct kstatmount *s, u64 flag)
ret = statmount_sb_source(s, seq);
break;
case STATMOUNT_MNT_UIDMAP:
- sm->mnt_uidmap = start;
+ offp = &sm->mnt_uidmap;
ret = statmount_mnt_uidmap(s, seq);
break;
case STATMOUNT_MNT_GIDMAP:
- sm->mnt_gidmap = start;
+ offp = &sm->mnt_gidmap;
ret = statmount_mnt_gidmap(s, seq);
break;
default:
@@ -5736,7 +5734,7 @@ static int copy_mnt_id_req(const struct mnt_id_req __user *req,
ret = copy_struct_from_user(kreq, sizeof(*kreq), req, usize);
if (ret)
return ret;
- if (kreq->spare != 0)
+ if (kreq->mnt_ns_fd != 0 && kreq->mnt_ns_id)
return -EINVAL;
/* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */
if (kreq->mnt_id <= MNT_UNIQUE_ID_OFFSET)
@@ -5753,16 +5751,14 @@ static struct mnt_namespace *grab_requested_mnt_ns(const struct mnt_id_req *kreq
{
struct mnt_namespace *mnt_ns;
- if (kreq->mnt_ns_id && kreq->spare)
- return ERR_PTR(-EINVAL);
-
- if (kreq->mnt_ns_id)
- return lookup_mnt_ns(kreq->mnt_ns_id);
-
- if (kreq->spare) {
+ if (kreq->mnt_ns_id) {
+ mnt_ns = lookup_mnt_ns(kreq->mnt_ns_id);
+ if (!mnt_ns)
+ return ERR_PTR(-ENOENT);
+ } else if (kreq->mnt_ns_fd) {
struct ns_common *ns;
- CLASS(fd, f)(kreq->spare);
+ CLASS(fd, f)(kreq->mnt_ns_fd);
if (fd_empty(f))
return ERR_PTR(-EBADF);
@@ -5774,11 +5770,12 @@ static struct mnt_namespace *grab_requested_mnt_ns(const struct mnt_id_req *kreq
return ERR_PTR(-EINVAL);
mnt_ns = to_mnt_ns(ns);
+ refcount_inc(&mnt_ns->passive);
} else {
mnt_ns = current->nsproxy->mnt_ns;
+ refcount_inc(&mnt_ns->passive);
}
- refcount_inc(&mnt_ns->passive);
return mnt_ns;
}
@@ -5801,8 +5798,8 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
return ret;
ns = grab_requested_mnt_ns(&kreq);
- if (!ns)
- return -ENOENT;
+ if (IS_ERR(ns))
+ return PTR_ERR(ns);
if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) &&
!ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
@@ -5912,8 +5909,8 @@ static void __free_klistmount_free(const struct klistmount *kls)
static inline int prepare_klistmount(struct klistmount *kls, struct mnt_id_req *kreq,
size_t nr_mnt_ids)
{
-
u64 last_mnt_id = kreq->param;
+ struct mnt_namespace *ns;
/* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */
if (last_mnt_id != 0 && last_mnt_id <= MNT_UNIQUE_ID_OFFSET)
@@ -5927,9 +5924,10 @@ static inline int prepare_klistmount(struct klistmount *kls, struct mnt_id_req *
if (!kls->kmnt_ids)
return -ENOMEM;
- kls->ns = grab_requested_mnt_ns(kreq);
- if (!kls->ns)
- return -ENOENT;
+ ns = grab_requested_mnt_ns(kreq);
+ if (IS_ERR(ns))
+ return PTR_ERR(ns);
+ kls->ns = ns;
kls->mnt_parent_id = kreq->mnt_id;
return 0;
@@ -5985,11 +5983,8 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
}
struct mnt_namespace init_mnt_ns = {
- .ns.inum = ns_init_inum(&init_mnt_ns),
- .ns.ops = &mntns_operations,
+ .ns = NS_COMMON_INIT(init_mnt_ns),
.user_ns = &init_user_ns,
- .ns.__ns_ref = REFCOUNT_INIT(1),
- .ns.ns_type = ns_common_type(&init_mnt_ns),
.passive = REFCOUNT_INIT(1),
.mounts = RB_ROOT,
.poll = __WAIT_QUEUE_HEAD_INITIALIZER(init_mnt_ns.poll),
diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
index 09394ac2c180..f9d62abef2ac 100644
--- a/fs/netfs/buffered_write.c
+++ b/fs/netfs/buffered_write.c
@@ -535,7 +535,7 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr
folio_unlock(folio);
err = filemap_fdatawrite_range(mapping,
folio_pos(folio),
- folio_pos(folio) + folio_size(folio));
+ folio_next_pos(folio));
switch (err) {
case 0:
ret = VM_FAULT_RETRY;
diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c
index 486166460e17..6df89c92b10b 100644
--- a/fs/netfs/misc.c
+++ b/fs/netfs/misc.c
@@ -147,10 +147,10 @@ bool netfs_dirty_folio(struct address_space *mapping, struct folio *folio)
if (!fscache_cookie_valid(cookie))
return true;
- if (!(inode->i_state & I_PINNING_NETFS_WB)) {
+ if (!(inode_state_read_once(inode) & I_PINNING_NETFS_WB)) {
spin_lock(&inode->i_lock);
- if (!(inode->i_state & I_PINNING_NETFS_WB)) {
- inode->i_state |= I_PINNING_NETFS_WB;
+ if (!(inode_state_read(inode) & I_PINNING_NETFS_WB)) {
+ inode_state_set(inode, I_PINNING_NETFS_WB);
need_use = true;
}
spin_unlock(&inode->i_lock);
@@ -192,7 +192,7 @@ void netfs_clear_inode_writeback(struct inode *inode, const void *aux)
{
struct fscache_cookie *cookie = netfs_i_cookie(netfs_inode(inode));
- if (inode->i_state & I_PINNING_NETFS_WB) {
+ if (inode_state_read_once(inode) & I_PINNING_NETFS_WB) {
loff_t i_size = i_size_read(inode);
fscache_unuse_cookie(cookie, aux, &i_size);
}
@@ -298,7 +298,7 @@ bool netfs_release_folio(struct folio *folio, gfp_t gfp)
if (folio_test_dirty(folio))
return false;
- end = umin(folio_pos(folio) + folio_size(folio), i_size_read(&ctx->inode));
+ end = umin(folio_next_pos(folio), i_size_read(&ctx->inode));
if (end > ctx->zero_point)
ctx->zero_point = end;
diff --git a/fs/netfs/read_single.c b/fs/netfs/read_single.c
index 5c0dc4efc792..8e6264f62a8f 100644
--- a/fs/netfs/read_single.c
+++ b/fs/netfs/read_single.c
@@ -36,12 +36,12 @@ void netfs_single_mark_inode_dirty(struct inode *inode)
mark_inode_dirty(inode);
- if (caching && !(inode->i_state & I_PINNING_NETFS_WB)) {
+ if (caching && !(inode_state_read_once(inode) & I_PINNING_NETFS_WB)) {
bool need_use = false;
spin_lock(&inode->i_lock);
- if (!(inode->i_state & I_PINNING_NETFS_WB)) {
- inode->i_state |= I_PINNING_NETFS_WB;
+ if (!(inode_state_read(inode) & I_PINNING_NETFS_WB)) {
+ inode_state_set(inode, I_PINNING_NETFS_WB);
need_use = true;
}
spin_unlock(&inode->i_lock);
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 4e3dcc157a83..54699299d5b1 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -338,6 +338,14 @@ again:
/* Match the xprt security policy */
if (clp->cl_xprtsec.policy != data->xprtsec.policy)
continue;
+ if (clp->cl_xprtsec.policy == RPC_XPRTSEC_TLS_X509) {
+ if (clp->cl_xprtsec.cert_serial !=
+ data->xprtsec.cert_serial)
+ continue;
+ if (clp->cl_xprtsec.privkey_serial !=
+ data->xprtsec.privkey_serial)
+ continue;
+ }
refcount_inc(&clp->cl_count);
return clp;
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 46d9c65d50f8..ea9f6ca8f30f 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2268,11 +2268,12 @@ int nfs_atomic_open_v23(struct inode *dir, struct dentry *dentry,
return -ENAMETOOLONG;
if (open_flags & O_CREAT) {
- file->f_mode |= FMODE_CREATED;
error = nfs_do_create(dir, dentry, mode, open_flags);
- if (error)
+ if (!error) {
+ file->f_mode |= FMODE_CREATED;
+ return finish_open(file, dentry, NULL);
+ } else if (error != -EEXIST || open_flags & O_EXCL)
return error;
- return finish_open(file, dentry, NULL);
}
if (d_in_lookup(dentry)) {
/* The only flags nfs_lookup considers are
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index df01d2876b68..9056f05a67dc 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -270,19 +270,31 @@ ff_layout_remove_mirror(struct nfs4_ff_layout_mirror *mirror)
mirror->layout = NULL;
}
-static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags)
+static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(u32 dss_count,
+ gfp_t gfp_flags)
{
struct nfs4_ff_layout_mirror *mirror;
- u32 dss_id;
mirror = kzalloc(sizeof(*mirror), gfp_flags);
- if (mirror != NULL) {
- spin_lock_init(&mirror->lock);
- refcount_set(&mirror->ref, 1);
- INIT_LIST_HEAD(&mirror->mirrors);
- for (dss_id = 0; dss_id < mirror->dss_count; dss_id++)
- nfs_localio_file_init(&mirror->dss[dss_id].nfl);
+ if (mirror == NULL)
+ return NULL;
+
+ spin_lock_init(&mirror->lock);
+ refcount_set(&mirror->ref, 1);
+ INIT_LIST_HEAD(&mirror->mirrors);
+
+ mirror->dss_count = dss_count;
+ mirror->dss =
+ kcalloc(dss_count, sizeof(struct nfs4_ff_layout_ds_stripe),
+ gfp_flags);
+ if (mirror->dss == NULL) {
+ kfree(mirror);
+ return NULL;
}
+
+ for (u32 dss_id = 0; dss_id < mirror->dss_count; dss_id++)
+ nfs_localio_file_init(&mirror->dss[dss_id].nfl);
+
return mirror;
}
@@ -507,17 +519,12 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
if (dss_count > 1 && stripe_unit == 0)
goto out_err_free;
- fls->mirror_array[i] = ff_layout_alloc_mirror(gfp_flags);
+ fls->mirror_array[i] = ff_layout_alloc_mirror(dss_count, gfp_flags);
if (fls->mirror_array[i] == NULL) {
rc = -ENOMEM;
goto out_err_free;
}
- fls->mirror_array[i]->dss_count = dss_count;
- fls->mirror_array[i]->dss =
- kcalloc(dss_count, sizeof(struct nfs4_ff_layout_ds_stripe),
- gfp_flags);
-
for (dss_id = 0; dss_id < dss_count; dss_id++) {
dss_info = &fls->mirror_array[i]->dss[dss_id];
dss_info->mirror = fls->mirror_array[i];
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 18b57c7c2f97..f76fe406937a 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -475,7 +475,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
goto out_no_inode;
}
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
struct nfs_inode *nfsi = NFS_I(inode);
unsigned long now = jiffies;
@@ -718,6 +718,8 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
struct nfs_fattr *fattr;
loff_t oldsize = i_size_read(inode);
int error = 0;
+ kuid_t task_uid = current_fsuid();
+ kuid_t owner_uid = inode->i_uid;
nfs_inc_stats(inode, NFSIOS_VFSSETATTR);
@@ -739,9 +741,11 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
if (nfs_have_delegated_mtime(inode) && attr->ia_valid & ATTR_MTIME) {
spin_lock(&inode->i_lock);
if (attr->ia_valid & ATTR_MTIME_SET) {
- nfs_set_timestamps_to_ts(inode, attr);
- attr->ia_valid &= ~(ATTR_MTIME|ATTR_MTIME_SET|
+ if (uid_eq(task_uid, owner_uid)) {
+ nfs_set_timestamps_to_ts(inode, attr);
+ attr->ia_valid &= ~(ATTR_MTIME|ATTR_MTIME_SET|
ATTR_ATIME|ATTR_ATIME_SET);
+ }
} else {
nfs_update_timestamps(inode, attr->ia_valid);
attr->ia_valid &= ~(ATTR_MTIME|ATTR_ATIME);
@@ -751,10 +755,12 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
attr->ia_valid & ATTR_ATIME &&
!(attr->ia_valid & ATTR_MTIME)) {
if (attr->ia_valid & ATTR_ATIME_SET) {
- spin_lock(&inode->i_lock);
- nfs_set_timestamps_to_ts(inode, attr);
- spin_unlock(&inode->i_lock);
- attr->ia_valid &= ~(ATTR_ATIME|ATTR_ATIME_SET);
+ if (uid_eq(task_uid, owner_uid)) {
+ spin_lock(&inode->i_lock);
+ nfs_set_timestamps_to_ts(inode, attr);
+ spin_unlock(&inode->i_lock);
+ attr->ia_valid &= ~(ATTR_ATIME|ATTR_ATIME_SET);
+ }
} else {
nfs_update_delegated_atime(inode);
attr->ia_valid &= ~ATTR_ATIME;
diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c
index 2c0455e91571..49ed90c6b9f2 100644
--- a/fs/nfs/localio.c
+++ b/fs/nfs/localio.c
@@ -42,10 +42,9 @@ struct nfs_local_kiocb {
/* Begin mostly DIO-specific members */
size_t end_len;
short int end_iter_index;
- short int n_iters;
+ atomic_t n_iters;
bool iter_is_dio_aligned[NFSLOCAL_MAX_IOS];
- loff_t offset[NFSLOCAL_MAX_IOS] ____cacheline_aligned;
- struct iov_iter iters[NFSLOCAL_MAX_IOS];
+ struct iov_iter iters[NFSLOCAL_MAX_IOS] ____cacheline_aligned;
/* End mostly DIO-specific members */
};
@@ -314,7 +313,9 @@ nfs_local_iocb_alloc(struct nfs_pgio_header *hdr,
init_sync_kiocb(&iocb->kiocb, file);
iocb->hdr = hdr;
+ iocb->kiocb.ki_pos = hdr->args.offset;
iocb->kiocb.ki_flags &= ~IOCB_APPEND;
+ iocb->kiocb.ki_complete = NULL;
iocb->aio_complete_work = NULL;
iocb->end_iter_index = -1;
@@ -388,13 +389,24 @@ static bool nfs_iov_iter_aligned_bvec(const struct iov_iter *i,
return true;
}
+static void
+nfs_local_iter_setup(struct iov_iter *iter, int rw, struct bio_vec *bvec,
+ unsigned int nvecs, unsigned long total,
+ size_t start, size_t len)
+{
+ iov_iter_bvec(iter, rw, bvec, nvecs, total);
+ if (start)
+ iov_iter_advance(iter, start);
+ iov_iter_truncate(iter, len);
+}
+
/*
* Setup as many as 3 iov_iter based on extents described by @local_dio.
* Returns the number of iov_iter that were setup.
*/
static int
nfs_local_iters_setup_dio(struct nfs_local_kiocb *iocb, int rw,
- unsigned int nvecs, size_t len,
+ unsigned int nvecs, unsigned long total,
struct nfs_local_dio *local_dio)
{
int n_iters = 0;
@@ -402,39 +414,17 @@ nfs_local_iters_setup_dio(struct nfs_local_kiocb *iocb, int rw,
/* Setup misaligned start? */
if (local_dio->start_len) {
- iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len);
- iters[n_iters].count = local_dio->start_len;
- iocb->offset[n_iters] = iocb->hdr->args.offset;
- iocb->iter_is_dio_aligned[n_iters] = false;
+ nfs_local_iter_setup(&iters[n_iters], rw, iocb->bvec,
+ nvecs, total, 0, local_dio->start_len);
++n_iters;
}
- /* Setup misaligned end?
- * If so, the end is purposely setup to be issued using buffered IO
- * before the middle (which will use DIO, if DIO-aligned, with AIO).
- * This creates problems if/when the end results in a partial write.
- * So must save index and length of end to handle this corner case.
- */
- if (local_dio->end_len) {
- iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len);
- iocb->offset[n_iters] = local_dio->end_offset;
- iov_iter_advance(&iters[n_iters],
- local_dio->start_len + local_dio->middle_len);
- iocb->iter_is_dio_aligned[n_iters] = false;
- /* Save index and length of end */
- iocb->end_iter_index = n_iters;
- iocb->end_len = local_dio->end_len;
- ++n_iters;
- }
-
- /* Setup DIO-aligned middle to be issued last, to allow for
- * DIO with AIO completion (see nfs_local_call_{read,write}).
+ /*
+ * Setup DIO-aligned middle, if there is no misaligned end (below)
+ * then AIO completion is used, see nfs_local_call_{read,write}
*/
- iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len);
- if (local_dio->start_len)
- iov_iter_advance(&iters[n_iters], local_dio->start_len);
- iters[n_iters].count -= local_dio->end_len;
- iocb->offset[n_iters] = local_dio->middle_offset;
+ nfs_local_iter_setup(&iters[n_iters], rw, iocb->bvec, nvecs,
+ total, local_dio->start_len, local_dio->middle_len);
iocb->iter_is_dio_aligned[n_iters] =
nfs_iov_iter_aligned_bvec(&iters[n_iters],
@@ -442,12 +432,22 @@ nfs_local_iters_setup_dio(struct nfs_local_kiocb *iocb, int rw,
if (unlikely(!iocb->iter_is_dio_aligned[n_iters])) {
trace_nfs_local_dio_misaligned(iocb->hdr->inode,
- iocb->hdr->args.offset, len, local_dio);
+ local_dio->start_len, local_dio->middle_len, local_dio);
return 0; /* no DIO-aligned IO possible */
}
+ iocb->end_iter_index = n_iters;
++n_iters;
- iocb->n_iters = n_iters;
+ /* Setup misaligned end? */
+ if (local_dio->end_len) {
+ nfs_local_iter_setup(&iters[n_iters], rw, iocb->bvec,
+ nvecs, total, local_dio->start_len +
+ local_dio->middle_len, local_dio->end_len);
+ iocb->end_iter_index = n_iters;
+ ++n_iters;
+ }
+
+ atomic_set(&iocb->n_iters, n_iters);
return n_iters;
}
@@ -473,18 +473,26 @@ nfs_local_iters_init(struct nfs_local_kiocb *iocb, int rw)
}
len = hdr->args.count - total;
+ /*
+ * For each iocb, iocb->n_iters is always at least 1 and we always
+ * end io after first nfs_local_pgio_done call unless misaligned DIO.
+ */
+ atomic_set(&iocb->n_iters, 1);
+
if (test_bit(NFS_IOHDR_ODIRECT, &hdr->flags)) {
struct nfs_local_dio local_dio;
if (nfs_is_local_dio_possible(iocb, rw, len, &local_dio) &&
- nfs_local_iters_setup_dio(iocb, rw, v, len, &local_dio) != 0)
+ nfs_local_iters_setup_dio(iocb, rw, v, len, &local_dio) != 0) {
+ /* Ensure DIO WRITE's IO on stable storage upon completion */
+ if (rw == ITER_SOURCE)
+ iocb->kiocb.ki_flags |= IOCB_DSYNC|IOCB_SYNC;
return; /* is DIO-aligned */
+ }
}
/* Use buffered IO */
- iocb->offset[0] = hdr->args.offset;
iov_iter_bvec(&iocb->iters[0], rw, iocb->bvec, v, len);
- iocb->n_iters = 1;
}
static void
@@ -504,9 +512,11 @@ nfs_local_pgio_init(struct nfs_pgio_header *hdr,
hdr->task.tk_start = ktime_get();
}
-static void
-nfs_local_pgio_done(struct nfs_pgio_header *hdr, long status)
+static bool
+nfs_local_pgio_done(struct nfs_local_kiocb *iocb, long status, bool force)
{
+ struct nfs_pgio_header *hdr = iocb->hdr;
+
/* Must handle partial completions */
if (status >= 0) {
hdr->res.count += status;
@@ -517,6 +527,12 @@ nfs_local_pgio_done(struct nfs_pgio_header *hdr, long status)
hdr->res.op_status = nfs_localio_errno_to_nfs4_stat(status);
hdr->task.tk_status = status;
}
+
+ if (force)
+ return true;
+
+ BUG_ON(atomic_read(&iocb->n_iters) <= 0);
+ return atomic_dec_and_test(&iocb->n_iters);
}
static void
@@ -547,11 +563,11 @@ static inline void nfs_local_pgio_aio_complete(struct nfs_local_kiocb *iocb)
queue_work(nfsiod_workqueue, &iocb->work);
}
-static void
-nfs_local_read_done(struct nfs_local_kiocb *iocb, long status)
+static void nfs_local_read_done(struct nfs_local_kiocb *iocb)
{
struct nfs_pgio_header *hdr = iocb->hdr;
struct file *filp = iocb->kiocb.ki_filp;
+ long status = hdr->task.tk_status;
if ((iocb->kiocb.ki_flags & IOCB_DIRECT) && status == -EINVAL) {
/* Underlying FS will return -EINVAL if misaligned DIO is attempted. */
@@ -564,20 +580,27 @@ nfs_local_read_done(struct nfs_local_kiocb *iocb, long status)
*/
hdr->res.replen = 0;
- if (hdr->res.count != hdr->args.count ||
- hdr->args.offset + hdr->res.count >= i_size_read(file_inode(filp)))
+ /* nfs_readpage_result() handles short read */
+
+ if (hdr->args.offset + hdr->res.count >= i_size_read(file_inode(filp)))
hdr->res.eof = true;
dprintk("%s: read %ld bytes eof %d.\n", __func__,
status > 0 ? status : 0, hdr->res.eof);
}
+static inline void nfs_local_read_iocb_done(struct nfs_local_kiocb *iocb)
+{
+ nfs_local_read_done(iocb);
+ nfs_local_pgio_release(iocb);
+}
+
static void nfs_local_read_aio_complete_work(struct work_struct *work)
{
struct nfs_local_kiocb *iocb =
container_of(work, struct nfs_local_kiocb, work);
- nfs_local_pgio_release(iocb);
+ nfs_local_read_iocb_done(iocb);
}
static void nfs_local_read_aio_complete(struct kiocb *kiocb, long ret)
@@ -585,43 +608,51 @@ static void nfs_local_read_aio_complete(struct kiocb *kiocb, long ret)
struct nfs_local_kiocb *iocb =
container_of(kiocb, struct nfs_local_kiocb, kiocb);
- nfs_local_pgio_done(iocb->hdr, ret);
- nfs_local_read_done(iocb, ret);
+ /* AIO completion of DIO read should always be last to complete */
+ if (unlikely(!nfs_local_pgio_done(iocb, ret, false)))
+ return;
+
nfs_local_pgio_aio_complete(iocb); /* Calls nfs_local_read_aio_complete_work */
}
-static void nfs_local_call_read(struct work_struct *work)
+static void do_nfs_local_call_read(struct nfs_local_kiocb *iocb, struct file *filp)
{
- struct nfs_local_kiocb *iocb =
- container_of(work, struct nfs_local_kiocb, work);
- struct file *filp = iocb->kiocb.ki_filp;
- const struct cred *save_cred;
+ bool force_done = false;
ssize_t status;
+ int n_iters;
- save_cred = override_creds(filp->f_cred);
-
- for (int i = 0; i < iocb->n_iters ; i++) {
+ n_iters = atomic_read(&iocb->n_iters);
+ for (int i = 0; i < n_iters ; i++) {
if (iocb->iter_is_dio_aligned[i]) {
iocb->kiocb.ki_flags |= IOCB_DIRECT;
- iocb->kiocb.ki_complete = nfs_local_read_aio_complete;
- iocb->aio_complete_work = nfs_local_read_aio_complete_work;
- }
+ /* Only use AIO completion if DIO-aligned segment is last */
+ if (i == iocb->end_iter_index) {
+ iocb->kiocb.ki_complete = nfs_local_read_aio_complete;
+ iocb->aio_complete_work = nfs_local_read_aio_complete_work;
+ }
+ } else
+ iocb->kiocb.ki_flags &= ~IOCB_DIRECT;
- iocb->kiocb.ki_pos = iocb->offset[i];
status = filp->f_op->read_iter(&iocb->kiocb, &iocb->iters[i]);
if (status != -EIOCBQUEUED) {
- nfs_local_pgio_done(iocb->hdr, status);
- if (iocb->hdr->task.tk_status)
+ if (unlikely(status >= 0 && status < iocb->iters[i].count))
+ force_done = true; /* Partial read */
+ if (nfs_local_pgio_done(iocb, status, force_done)) {
+ nfs_local_read_iocb_done(iocb);
break;
+ }
}
}
+}
- revert_creds(save_cred);
+static void nfs_local_call_read(struct work_struct *work)
+{
+ struct nfs_local_kiocb *iocb =
+ container_of(work, struct nfs_local_kiocb, work);
+ struct file *filp = iocb->kiocb.ki_filp;
- if (status != -EIOCBQUEUED) {
- nfs_local_read_done(iocb, status);
- nfs_local_pgio_release(iocb);
- }
+ scoped_with_creds(filp->f_cred)
+ do_nfs_local_call_read(iocb, filp);
}
static int
@@ -736,11 +767,10 @@ static void nfs_local_vfs_getattr(struct nfs_local_kiocb *iocb)
fattr->du.nfs3.used = stat.blocks << 9;
}
-static void
-nfs_local_write_done(struct nfs_local_kiocb *iocb, long status)
+static void nfs_local_write_done(struct nfs_local_kiocb *iocb)
{
struct nfs_pgio_header *hdr = iocb->hdr;
- struct inode *inode = hdr->inode;
+ long status = hdr->task.tk_status;
dprintk("%s: wrote %ld bytes.\n", __func__, status > 0 ? status : 0);
@@ -759,10 +789,17 @@ nfs_local_write_done(struct nfs_local_kiocb *iocb, long status)
nfs_set_pgio_error(hdr, -ENOSPC, hdr->args.offset);
status = -ENOSPC;
/* record -ENOSPC in terms of nfs_local_pgio_done */
- nfs_local_pgio_done(hdr, status);
+ (void) nfs_local_pgio_done(iocb, status, true);
}
if (hdr->task.tk_status < 0)
- nfs_reset_boot_verifier(inode);
+ nfs_reset_boot_verifier(hdr->inode);
+}
+
+static inline void nfs_local_write_iocb_done(struct nfs_local_kiocb *iocb)
+{
+ nfs_local_write_done(iocb);
+ nfs_local_vfs_getattr(iocb);
+ nfs_local_pgio_release(iocb);
}
static void nfs_local_write_aio_complete_work(struct work_struct *work)
@@ -770,8 +807,7 @@ static void nfs_local_write_aio_complete_work(struct work_struct *work)
struct nfs_local_kiocb *iocb =
container_of(work, struct nfs_local_kiocb, work);
- nfs_local_vfs_getattr(iocb);
- nfs_local_pgio_release(iocb);
+ nfs_local_write_iocb_done(iocb);
}
static void nfs_local_write_aio_complete(struct kiocb *kiocb, long ret)
@@ -779,75 +815,62 @@ static void nfs_local_write_aio_complete(struct kiocb *kiocb, long ret)
struct nfs_local_kiocb *iocb =
container_of(kiocb, struct nfs_local_kiocb, kiocb);
- nfs_local_pgio_done(iocb->hdr, ret);
- nfs_local_write_done(iocb, ret);
+ /* AIO completion of DIO write should always be last to complete */
+ if (unlikely(!nfs_local_pgio_done(iocb, ret, false)))
+ return;
+
nfs_local_pgio_aio_complete(iocb); /* Calls nfs_local_write_aio_complete_work */
}
-static void nfs_local_call_write(struct work_struct *work)
+static ssize_t do_nfs_local_call_write(struct nfs_local_kiocb *iocb,
+ struct file *filp)
{
- struct nfs_local_kiocb *iocb =
- container_of(work, struct nfs_local_kiocb, work);
- struct file *filp = iocb->kiocb.ki_filp;
- unsigned long old_flags = current->flags;
- const struct cred *save_cred;
+ bool force_done = false;
ssize_t status;
-
- current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
- save_cred = override_creds(filp->f_cred);
+ int n_iters;
file_start_write(filp);
- for (int i = 0; i < iocb->n_iters ; i++) {
+ n_iters = atomic_read(&iocb->n_iters);
+ for (int i = 0; i < n_iters ; i++) {
if (iocb->iter_is_dio_aligned[i]) {
iocb->kiocb.ki_flags |= IOCB_DIRECT;
- iocb->kiocb.ki_complete = nfs_local_write_aio_complete;
- iocb->aio_complete_work = nfs_local_write_aio_complete_work;
- }
-retry:
- iocb->kiocb.ki_pos = iocb->offset[i];
+ /* Only use AIO completion if DIO-aligned segment is last */
+ if (i == iocb->end_iter_index) {
+ iocb->kiocb.ki_complete = nfs_local_write_aio_complete;
+ iocb->aio_complete_work = nfs_local_write_aio_complete_work;
+ }
+ } else
+ iocb->kiocb.ki_flags &= ~IOCB_DIRECT;
+
status = filp->f_op->write_iter(&iocb->kiocb, &iocb->iters[i]);
if (status != -EIOCBQUEUED) {
- if (unlikely(status >= 0 && status < iocb->iters[i].count)) {
- /* partial write */
- if (i == iocb->end_iter_index) {
- /* Must not account partial end, otherwise, due
- * to end being issued before middle: the partial
- * write accounting in nfs_local_write_done()
- * would incorrectly advance hdr->args.offset
- */
- status = 0;
- } else {
- /* Partial write at start or buffered middle,
- * exit early.
- */
- nfs_local_pgio_done(iocb->hdr, status);
- break;
- }
- } else if (unlikely(status == -ENOTBLK &&
- (iocb->kiocb.ki_flags & IOCB_DIRECT))) {
- /* VFS will return -ENOTBLK if DIO WRITE fails to
- * invalidate the page cache. Retry using buffered IO.
- */
- iocb->kiocb.ki_flags &= ~IOCB_DIRECT;
- iocb->kiocb.ki_complete = NULL;
- iocb->aio_complete_work = NULL;
- goto retry;
- }
- nfs_local_pgio_done(iocb->hdr, status);
- if (iocb->hdr->task.tk_status)
+ if (unlikely(status >= 0 && status < iocb->iters[i].count))
+ force_done = true; /* Partial write */
+ if (nfs_local_pgio_done(iocb, status, force_done)) {
+ nfs_local_write_iocb_done(iocb);
break;
+ }
}
}
file_end_write(filp);
- revert_creds(save_cred);
- current->flags = old_flags;
+ return status;
+}
- if (status != -EIOCBQUEUED) {
- nfs_local_write_done(iocb, status);
- nfs_local_vfs_getattr(iocb);
- nfs_local_pgio_release(iocb);
- }
+static void nfs_local_call_write(struct work_struct *work)
+{
+ struct nfs_local_kiocb *iocb =
+ container_of(work, struct nfs_local_kiocb, work);
+ struct file *filp = iocb->kiocb.ki_filp;
+ unsigned long old_flags = current->flags;
+ ssize_t status;
+
+ current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
+
+ scoped_with_creds(filp->f_cred)
+ status = do_nfs_local_call_write(iocb, filp);
+
+ current->flags = old_flags;
}
static int
diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c
index 0d7310c1ee0c..5d97c1d38bb6 100644
--- a/fs/nfs/nfs3client.c
+++ b/fs/nfs/nfs3client.c
@@ -2,6 +2,7 @@
#include <linux/nfs_fs.h>
#include <linux/nfs_mount.h>
#include <linux/sunrpc/addr.h>
+#include <net/handshake.h>
#include "internal.h"
#include "nfs3_fs.h"
#include "netns.h"
@@ -98,7 +99,11 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv,
.net = mds_clp->cl_net,
.timeparms = &ds_timeout,
.cred = mds_srv->cred,
- .xprtsec = mds_clp->cl_xprtsec,
+ .xprtsec = {
+ .policy = RPC_XPRTSEC_NONE,
+ .cert_serial = TLS_NO_CERT,
+ .privkey_serial = TLS_NO_PRIVKEY,
+ },
.connect_timeout = connect_timeout,
.reconnect_timeout = connect_timeout,
};
@@ -111,9 +116,14 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv,
cl_init.hostname = buf;
switch (ds_proto) {
+ case XPRT_TRANSPORT_TCP_TLS:
+ if (mds_clp->cl_xprtsec.policy != RPC_XPRTSEC_NONE)
+ cl_init.xprtsec = mds_clp->cl_xprtsec;
+ else
+ ds_proto = XPRT_TRANSPORT_TCP;
+ fallthrough;
case XPRT_TRANSPORT_RDMA:
case XPRT_TRANSPORT_TCP:
- case XPRT_TRANSPORT_TCP_TLS:
if (mds_clp->cl_nconnect > 1)
cl_init.nconnect = mds_clp->cl_nconnect;
}
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 6fddf43d729c..3a4baed993c9 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -11,6 +11,7 @@
#include <linux/sunrpc/xprt.h>
#include <linux/sunrpc/bc_xprt.h>
#include <linux/sunrpc/rpc_pipe_fs.h>
+#include <net/handshake.h>
#include "internal.h"
#include "callback.h"
#include "delegation.h"
@@ -222,6 +223,7 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init)
clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED;
clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion];
clp->cl_mig_gen = 1;
+ clp->cl_last_renewal = jiffies;
#if IS_ENABLED(CONFIG_NFS_V4_1)
init_waitqueue_head(&clp->cl_lock_waitq);
#endif
@@ -982,7 +984,11 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
.net = mds_clp->cl_net,
.timeparms = &ds_timeout,
.cred = mds_srv->cred,
- .xprtsec = mds_srv->nfs_client->cl_xprtsec,
+ .xprtsec = {
+ .policy = RPC_XPRTSEC_NONE,
+ .cert_serial = TLS_NO_CERT,
+ .privkey_serial = TLS_NO_PRIVKEY,
+ },
};
char buf[INET6_ADDRSTRLEN + 1];
@@ -991,9 +997,14 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
cl_init.hostname = buf;
switch (ds_proto) {
+ case XPRT_TRANSPORT_TCP_TLS:
+ if (mds_srv->nfs_client->cl_xprtsec.policy != RPC_XPRTSEC_NONE)
+ cl_init.xprtsec = mds_srv->nfs_client->cl_xprtsec;
+ else
+ ds_proto = XPRT_TRANSPORT_TCP;
+ fallthrough;
case XPRT_TRANSPORT_RDMA:
case XPRT_TRANSPORT_TCP:
- case XPRT_TRANSPORT_TCP_TLS:
if (mds_clp->cl_nconnect > 1) {
cl_init.nconnect = mds_clp->cl_nconnect;
cl_init.max_connect = NFS_MAX_TRANSPORTS;
diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index 00932500fce4..9e1c48c5c0b8 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -306,15 +306,12 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t namelen,
const char *type, void *data,
size_t data_size, struct idmap *idmap)
{
- const struct cred *saved_cred;
struct key *rkey;
const struct user_key_payload *payload;
ssize_t ret;
- saved_cred = override_creds(id_resolver_cache);
- rkey = nfs_idmap_request_key(name, namelen, type, idmap);
- revert_creds(saved_cred);
-
+ scoped_with_creds(id_resolver_cache)
+ rkey = nfs_idmap_request_key(name, namelen, type, idmap);
if (IS_ERR(rkey)) {
ret = PTR_ERR(rkey);
goto out;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index f58098417142..93c6ce04332b 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3636,6 +3636,7 @@ struct nfs4_closedata {
} lr;
struct nfs_fattr fattr;
unsigned long timestamp;
+ unsigned short retrans;
};
static void nfs4_free_closedata(void *data)
@@ -3664,6 +3665,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
.state = state,
.inode = calldata->inode,
.stateid = &calldata->arg.stateid,
+ .retrans = calldata->retrans,
};
if (!nfs4_sequence_done(task, &calldata->res.seq_res))
@@ -3711,6 +3713,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
default:
task->tk_status = nfs4_async_handle_exception(task,
server, task->tk_status, &exception);
+ calldata->retrans = exception.retrans;
if (exception.retry)
goto out_restart;
}
@@ -4712,16 +4715,19 @@ static int _nfs4_proc_lookupp(struct inode *inode,
};
unsigned short task_flags = 0;
- if (NFS_SERVER(inode)->flags & NFS_MOUNT_SOFTREVAL)
+ if (server->flags & NFS_MOUNT_SOFTREVAL)
task_flags |= RPC_TASK_TIMEOUT;
+ if (server->caps & NFS_CAP_MOVEABLE)
+ task_flags |= RPC_TASK_MOVEABLE;
args.bitmask = nfs4_bitmask(server, fattr->label);
nfs_fattr_init(fattr);
+ nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 0);
dprintk("NFS call lookupp ino=0x%lx\n", inode->i_ino);
- status = nfs4_call_sync(clnt, server, &msg, &args.seq_args,
- &res.seq_res, task_flags);
+ status = nfs4_do_call_sync(clnt, server, &msg, &args.seq_args,
+ &res.seq_res, task_flags);
dprintk("NFS reply lookupp: %d\n", status);
return status;
}
@@ -5593,9 +5599,11 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr)
.inode = hdr->inode,
.state = hdr->args.context->state,
.stateid = &hdr->args.stateid,
+ .retrans = hdr->retrans,
};
task->tk_status = nfs4_async_handle_exception(task,
server, task->tk_status, &exception);
+ hdr->retrans = exception.retrans;
if (exception.retry) {
rpc_restart_call_prepare(task);
return -EAGAIN;
@@ -5709,10 +5717,12 @@ static int nfs4_write_done_cb(struct rpc_task *task,
.inode = hdr->inode,
.state = hdr->args.context->state,
.stateid = &hdr->args.stateid,
+ .retrans = hdr->retrans,
};
task->tk_status = nfs4_async_handle_exception(task,
NFS_SERVER(inode), task->tk_status,
&exception);
+ hdr->retrans = exception.retrans;
if (exception.retry) {
rpc_restart_call_prepare(task);
return -EAGAIN;
@@ -6726,6 +6736,7 @@ struct nfs4_delegreturndata {
struct nfs_fh fh;
nfs4_stateid stateid;
unsigned long timestamp;
+ unsigned short retrans;
struct {
struct nfs4_layoutreturn_args arg;
struct nfs4_layoutreturn_res res;
@@ -6746,6 +6757,7 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
.inode = data->inode,
.stateid = &data->stateid,
.task_is_privileged = data->args.seq_args.sa_privileged,
+ .retrans = data->retrans,
};
if (!nfs4_sequence_done(task, &data->res.seq_res))
@@ -6817,6 +6829,7 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
task->tk_status = nfs4_async_handle_exception(task,
data->res.server, task->tk_status,
&exception);
+ data->retrans = exception.retrans;
if (exception.retry)
goto out_restart;
}
@@ -7093,6 +7106,7 @@ struct nfs4_unlockdata {
struct file_lock fl;
struct nfs_server *server;
unsigned long timestamp;
+ unsigned short retrans;
};
static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
@@ -7147,6 +7161,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
struct nfs4_exception exception = {
.inode = calldata->lsp->ls_state->inode,
.stateid = &calldata->arg.stateid,
+ .retrans = calldata->retrans,
};
if (!nfs4_sequence_done(task, &calldata->res.seq_res))
@@ -7180,6 +7195,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
task->tk_status = nfs4_async_handle_exception(task,
calldata->server, task->tk_status,
&exception);
+ calldata->retrans = exception.retrans;
if (exception.retry)
rpc_restart_call_prepare(task);
}
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index a3135b5af7ee..f157d43d1312 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -317,7 +317,7 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n");
pnfs_detach_layout_hdr(lo);
/* Notify pnfs_destroy_layout_final() that we're done */
- if (inode->i_state & (I_FREEING | I_CLEAR))
+ if (inode_state_read(inode) & (I_FREEING | I_CLEAR))
wake_up_var_locked(lo, &inode->i_lock);
spin_unlock(&inode->i_lock);
pnfs_free_layout_hdr(lo);
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 7b32afb29782..9976cc16b689 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -809,8 +809,11 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv,
unsigned int retrans)
{
struct nfs_client *clp = ERR_PTR(-EIO);
+ struct nfs_client *mds_clp = mds_srv->nfs_client;
+ enum xprtsec_policies xprtsec_policy = mds_clp->cl_xprtsec.policy;
struct nfs4_pnfs_ds_addr *da;
unsigned long connect_timeout = timeo * (retrans + 1) * HZ / 10;
+ int ds_proto;
int status = 0;
dprintk("--> %s DS %s\n", __func__, ds->ds_remotestr);
@@ -834,27 +837,28 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv,
.xprtsec = clp->cl_xprtsec,
};
- if (da->da_transport != clp->cl_proto &&
- clp->cl_proto != XPRT_TRANSPORT_TCP_TLS)
- continue;
- if (da->da_transport == XPRT_TRANSPORT_TCP &&
- mds_srv->nfs_client->cl_proto == XPRT_TRANSPORT_TCP_TLS)
+ if (xprt_args.ident == XPRT_TRANSPORT_TCP &&
+ clp->cl_proto == XPRT_TRANSPORT_TCP_TLS)
xprt_args.ident = XPRT_TRANSPORT_TCP_TLS;
- if (da->da_addr.ss_family != clp->cl_addr.ss_family)
+ if (xprt_args.ident != clp->cl_proto)
+ continue;
+ if (xprt_args.dstaddr->sa_family !=
+ clp->cl_addr.ss_family)
continue;
/* Add this address as an alias */
rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args,
- rpc_clnt_test_and_add_xprt, NULL);
+ rpc_clnt_test_and_add_xprt, NULL);
continue;
}
- if (da->da_transport == XPRT_TRANSPORT_TCP &&
- mds_srv->nfs_client->cl_proto == XPRT_TRANSPORT_TCP_TLS)
- da->da_transport = XPRT_TRANSPORT_TCP_TLS;
- clp = get_v3_ds_connect(mds_srv,
- &da->da_addr,
- da->da_addrlen, da->da_transport,
- timeo, retrans);
+
+ ds_proto = da->da_transport;
+ if (ds_proto == XPRT_TRANSPORT_TCP &&
+ xprtsec_policy != RPC_XPRTSEC_NONE)
+ ds_proto = XPRT_TRANSPORT_TCP_TLS;
+
+ clp = get_v3_ds_connect(mds_srv, &da->da_addr, da->da_addrlen,
+ ds_proto, timeo, retrans);
if (IS_ERR(clp))
continue;
clp->cl_rpcclient->cl_softerr = 0;
@@ -880,7 +884,10 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
u32 minor_version)
{
struct nfs_client *clp = ERR_PTR(-EIO);
+ struct nfs_client *mds_clp = mds_srv->nfs_client;
+ enum xprtsec_policies xprtsec_policy = mds_clp->cl_xprtsec.policy;
struct nfs4_pnfs_ds_addr *da;
+ int ds_proto;
int status = 0;
dprintk("--> %s DS %s\n", __func__, ds->ds_remotestr);
@@ -908,12 +915,8 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
.data = &xprtdata,
};
- if (da->da_transport != clp->cl_proto &&
- clp->cl_proto != XPRT_TRANSPORT_TCP_TLS)
- continue;
- if (da->da_transport == XPRT_TRANSPORT_TCP &&
- mds_srv->nfs_client->cl_proto ==
- XPRT_TRANSPORT_TCP_TLS) {
+ if (xprt_args.ident == XPRT_TRANSPORT_TCP &&
+ clp->cl_proto == XPRT_TRANSPORT_TCP_TLS) {
struct sockaddr *addr =
(struct sockaddr *)&da->da_addr;
struct sockaddr_in *sin =
@@ -944,7 +947,10 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
xprt_args.ident = XPRT_TRANSPORT_TCP_TLS;
xprt_args.servername = servername;
}
- if (da->da_addr.ss_family != clp->cl_addr.ss_family)
+ if (xprt_args.ident != clp->cl_proto)
+ continue;
+ if (xprt_args.dstaddr->sa_family !=
+ clp->cl_addr.ss_family)
continue;
/**
@@ -958,15 +964,14 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
if (xprtdata.cred)
put_cred(xprtdata.cred);
} else {
- if (da->da_transport == XPRT_TRANSPORT_TCP &&
- mds_srv->nfs_client->cl_proto ==
- XPRT_TRANSPORT_TCP_TLS)
- da->da_transport = XPRT_TRANSPORT_TCP_TLS;
- clp = nfs4_set_ds_client(mds_srv,
- &da->da_addr,
- da->da_addrlen,
- da->da_transport, timeo,
- retrans, minor_version);
+ ds_proto = da->da_transport;
+ if (ds_proto == XPRT_TRANSPORT_TCP &&
+ xprtsec_policy != RPC_XPRTSEC_NONE)
+ ds_proto = XPRT_TRANSPORT_TCP_TLS;
+
+ clp = nfs4_set_ds_client(mds_srv, &da->da_addr,
+ da->da_addrlen, ds_proto,
+ timeo, retrans, minor_version);
if (IS_ERR(clp))
continue;
@@ -977,7 +982,6 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
clp = ERR_PTR(-EIO);
continue;
}
-
}
}
diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c
index 545148d42dcc..ea6e6168092b 100644
--- a/fs/nfs/sysfs.c
+++ b/fs/nfs/sysfs.c
@@ -189,6 +189,7 @@ static struct nfs_netns_client *nfs_netns_client_alloc(struct kobject *parent,
return p;
kobject_put(&p->kobject);
+ kobject_put(&p->nfs_net_kobj);
}
return NULL;
}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 0fb6905736d5..336c510f3750 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1535,7 +1535,8 @@ static int nfs_writeback_done(struct rpc_task *task,
/* Deal with the suid/sgid bit corner case */
if (nfs_should_remove_suid(inode)) {
spin_lock(&inode->i_lock);
- nfs_set_cache_invalid(inode, NFS_INO_INVALID_MODE);
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_MODE
+ | NFS_INO_REVAL_FORCED);
spin_unlock(&inode->i_lock);
}
return 0;
diff --git a/fs/nfsd/flexfilelayout.c b/fs/nfsd/flexfilelayout.c
index c318cf74e388..0f1a35400cd5 100644
--- a/fs/nfsd/flexfilelayout.c
+++ b/fs/nfsd/flexfilelayout.c
@@ -125,6 +125,13 @@ nfsd4_ff_proc_getdeviceinfo(struct super_block *sb, struct svc_rqst *rqstp,
return 0;
}
+static __be32
+nfsd4_ff_proc_layoutcommit(struct inode *inode, struct svc_rqst *rqstp,
+ struct nfsd4_layoutcommit *lcp)
+{
+ return nfs_ok;
+}
+
const struct nfsd4_layout_ops ff_layout_ops = {
.notify_types =
NOTIFY_DEVICEID4_DELETE | NOTIFY_DEVICEID4_CHANGE,
@@ -133,4 +140,5 @@ const struct nfsd4_layout_ops ff_layout_ops = {
.encode_getdeviceinfo = nfsd4_ff_encode_getdeviceinfo,
.proc_layoutget = nfsd4_ff_proc_layoutget,
.encode_layoutget = nfsd4_ff_encode_layoutget,
+ .proc_layoutcommit = nfsd4_ff_proc_layoutcommit,
};
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index e466cf52d7d7..7f7e6bb23a90 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -988,10 +988,11 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
static void
nfsd4_read_release(union nfsd4_op_u *u)
{
- if (u->read.rd_nf)
+ if (u->read.rd_nf) {
+ trace_nfsd_read_done(u->read.rd_rqstp, u->read.rd_fhp,
+ u->read.rd_offset, u->read.rd_length);
nfsd_file_put(u->read.rd_nf);
- trace_nfsd_read_done(u->read.rd_rqstp, u->read.rd_fhp,
- u->read.rd_offset, u->read.rd_length);
+ }
}
static __be32
@@ -2892,10 +2893,20 @@ nfsd4_proc_compound(struct svc_rqst *rqstp)
rqstp->rq_lease_breaker = (void **)&cstate->clp;
- trace_nfsd_compound(rqstp, args->tag, args->taglen, args->opcnt);
+ trace_nfsd_compound(rqstp, args->tag, args->taglen, args->client_opcnt);
while (!status && resp->opcnt < args->opcnt) {
op = &args->ops[resp->opcnt++];
+ if (unlikely(resp->opcnt == NFSD_MAX_OPS_PER_COMPOUND)) {
+ /* If there are still more operations to process,
+ * stop here and report NFS4ERR_RESOURCE. */
+ if (cstate->minorversion == 0 &&
+ args->client_opcnt > resp->opcnt) {
+ op->status = nfserr_resource;
+ goto encode_op;
+ }
+ }
+
/*
* The XDR decode routines may have pre-set op->status;
* for example, if there is a miscellaneous XDR error
@@ -2972,7 +2983,7 @@ encode_op:
status = op->status;
}
- trace_nfsd_compound_status(args->opcnt, resp->opcnt,
+ trace_nfsd_compound_status(args->client_opcnt, resp->opcnt,
status, nfsd4_op_name(op->opnum));
nfsd4_cstate_clear_replay(cstate);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 81fa7cc6c77b..8a6960500217 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1542,7 +1542,8 @@ static void nfs4_free_ol_stateid(struct nfs4_stid *stid)
release_all_access(stp);
if (stp->st_stateowner)
nfs4_put_stateowner(stp->st_stateowner);
- WARN_ON(!list_empty(&stid->sc_cp_list));
+ if (!list_empty(&stid->sc_cp_list))
+ nfs4_free_cpntf_statelist(stid->sc_client->net, stid);
kmem_cache_free(stateid_slab, stid);
}
@@ -3486,7 +3487,20 @@ nfsd4_store_cache_entry(struct nfsd4_compoundres *resp)
struct nfsd4_slot *slot = resp->cstate.slot;
unsigned int base;
- dprintk("--> %s slot %p\n", __func__, slot);
+ /*
+ * RFC 5661 Section 2.10.6.1.2:
+ *
+ * Any time SEQUENCE ... returns an error ... [t]he replier MUST NOT
+ * modify the reply cache entry for the slot whenever an error is
+ * returned from SEQUENCE ...
+ *
+ * Because nfsd4_store_cache_entry is called only by
+ * nfsd4_sequence_done(), nfsd4_store_cache_entry() is called only
+ * when a SEQUENCE operation was part of the COMPOUND.
+ * nfs41_check_op_ordering() ensures SEQUENCE is the first op.
+ */
+ if (resp->opcnt == 1 && resp->cstate.status != nfs_ok)
+ return;
slot->sl_flags |= NFSD4_SLOT_INITIALIZED;
slot->sl_opcnt = resp->opcnt;
@@ -3902,6 +3916,7 @@ static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs *ca, struct nfs
ca->headerpadsz = 0;
ca->maxreq_sz = min_t(u32, ca->maxreq_sz, maxrpc);
ca->maxresp_sz = min_t(u32, ca->maxresp_sz, maxrpc);
+ ca->maxops = min_t(u32, ca->maxops, NFSD_MAX_OPS_PER_COMPOUND);
ca->maxresp_cached = min_t(u32, ca->maxresp_cached,
NFSD_SLOT_CACHE_SIZE + NFSD_MIN_HDR_SEQ_SZ);
ca->maxreqs = min_t(u32, ca->maxreqs, NFSD_MAX_SLOTS_PER_SESSION);
@@ -4348,6 +4363,36 @@ static bool replay_matches_cache(struct svc_rqst *rqstp,
return true;
}
+/*
+ * Note that the response is constructed here both for the case
+ * of a new SEQUENCE request and for a replayed SEQUENCE request.
+ * We do not cache SEQUENCE responses as SEQUENCE is idempotent.
+ */
+static void nfsd4_construct_sequence_response(struct nfsd4_session *session,
+ struct nfsd4_sequence *seq)
+{
+ struct nfs4_client *clp = session->se_client;
+
+ seq->maxslots_response = max(session->se_target_maxslots,
+ seq->maxslots);
+ seq->target_maxslots = session->se_target_maxslots;
+
+ switch (clp->cl_cb_state) {
+ case NFSD4_CB_DOWN:
+ seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN;
+ break;
+ case NFSD4_CB_FAULT:
+ seq->status_flags = SEQ4_STATUS_BACKCHANNEL_FAULT;
+ break;
+ default:
+ seq->status_flags = 0;
+ }
+ if (!list_empty(&clp->cl_revoked))
+ seq->status_flags |= SEQ4_STATUS_RECALLABLE_STATE_REVOKED;
+ if (atomic_read(&clp->cl_admin_revoked))
+ seq->status_flags |= SEQ4_STATUS_ADMIN_STATE_REVOKED;
+}
+
__be32
nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
union nfsd4_op_u *u)
@@ -4397,6 +4442,9 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
dprintk("%s: slotid %d\n", __func__, seq->slotid);
trace_nfsd_slot_seqid_sequence(clp, seq, slot);
+
+ nfsd4_construct_sequence_response(session, seq);
+
status = check_slot_seqid(seq->seqid, slot->sl_seqid, slot->sl_flags);
if (status == nfserr_replay_cache) {
status = nfserr_seq_misordered;
@@ -4494,23 +4542,6 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
}
out:
- seq->maxslots = max(session->se_target_maxslots, seq->maxslots);
- seq->target_maxslots = session->se_target_maxslots;
-
- switch (clp->cl_cb_state) {
- case NFSD4_CB_DOWN:
- seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN;
- break;
- case NFSD4_CB_FAULT:
- seq->status_flags = SEQ4_STATUS_BACKCHANNEL_FAULT;
- break;
- default:
- seq->status_flags = 0;
- }
- if (!list_empty(&clp->cl_revoked))
- seq->status_flags |= SEQ4_STATUS_RECALLABLE_STATE_REVOKED;
- if (atomic_read(&clp->cl_admin_revoked))
- seq->status_flags |= SEQ4_STATUS_ADMIN_STATE_REVOKED;
trace_nfsd_seq4_status(rqstp, seq);
out_no_session:
if (conn)
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index c0a3c6a7c8bb..67bb9c0b9fcb 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2488,8 +2488,10 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
if (xdr_stream_decode_u32(argp->xdr, &argp->minorversion) < 0)
return false;
- if (xdr_stream_decode_u32(argp->xdr, &argp->opcnt) < 0)
+ if (xdr_stream_decode_u32(argp->xdr, &argp->client_opcnt) < 0)
return false;
+ argp->opcnt = min_t(u32, argp->client_opcnt,
+ NFSD_MAX_OPS_PER_COMPOUND);
if (argp->opcnt > ARRAY_SIZE(argp->iops)) {
argp->ops = vcalloc(argp->opcnt, sizeof(*argp->ops));
@@ -2628,10 +2630,8 @@ static __be32 nfsd4_encode_components_esc(struct xdr_stream *xdr, char sep,
__be32 *p;
__be32 pathlen;
int pathlen_offset;
- int strlen, count=0;
char *str, *end, *next;
-
- dprintk("nfsd4_encode_components(%s)\n", components);
+ int count = 0;
pathlen_offset = xdr->buf->len;
p = xdr_reserve_space(xdr, 4);
@@ -2658,9 +2658,8 @@ static __be32 nfsd4_encode_components_esc(struct xdr_stream *xdr, char sep,
for (; *end && (*end != sep); end++)
/* find sep or end of string */;
- strlen = end - str;
- if (strlen) {
- if (xdr_stream_encode_opaque(xdr, str, strlen) < 0)
+ if (end > str) {
+ if (xdr_stream_encode_opaque(xdr, str, end - str) < 0)
return nfserr_resource;
count++;
} else
@@ -2939,6 +2938,12 @@ struct nfsd4_fattr_args {
typedef __be32(*nfsd4_enc_attr)(struct xdr_stream *xdr,
const struct nfsd4_fattr_args *args);
+static __be32 nfsd4_encode_fattr4__inval(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfserr_inval;
+}
+
static __be32 nfsd4_encode_fattr4__noop(struct xdr_stream *xdr,
const struct nfsd4_fattr_args *args)
{
@@ -3560,6 +3565,8 @@ static const nfsd4_enc_attr nfsd4_enc_fattr4_encode_ops[] = {
[FATTR4_MODE_UMASK] = nfsd4_encode_fattr4__noop,
[FATTR4_XATTR_SUPPORT] = nfsd4_encode_fattr4_xattr_support,
+ [FATTR4_TIME_DELEG_ACCESS] = nfsd4_encode_fattr4__inval,
+ [FATTR4_TIME_DELEG_MODIFY] = nfsd4_encode_fattr4__inval,
[FATTR4_OPEN_ARGUMENTS] = nfsd4_encode_fattr4_open_arguments,
};
@@ -5066,7 +5073,7 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr,
return nfserr;
/* Note slotid's are numbered from zero: */
/* sr_highest_slotid */
- nfserr = nfsd4_encode_slotid4(xdr, seq->maxslots - 1);
+ nfserr = nfsd4_encode_slotid4(xdr, seq->maxslots_response - 1);
if (nfserr != nfs_ok)
return nfserr;
/* sr_target_highest_slotid */
@@ -5918,8 +5925,7 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
*/
warn_on_nonidempotent_op(op);
xdr_truncate_encode(xdr, op_status_offset + XDR_UNIT);
- }
- if (so) {
+ } else if (so) {
int len = xdr->buf->len - (op_status_offset + XDR_UNIT);
so->so_replay.rp_status = op->status;
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index ea87b42894dd..b752433c3c2c 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -57,6 +57,9 @@ struct readdir_cd {
__be32 err; /* 0, nfserr, or nfserr_eof */
};
+/* Maximum number of operations per session compound */
+#define NFSD_MAX_OPS_PER_COMPOUND 200
+
struct nfsd_genl_rqstp {
struct sockaddr rq_daddr;
struct sockaddr rq_saddr;
@@ -455,6 +458,7 @@ enum {
#define NFSD4_2_SUPPORTED_ATTRS_WORD2 \
(NFSD4_1_SUPPORTED_ATTRS_WORD2 | \
FATTR4_WORD2_MODE_UMASK | \
+ FATTR4_WORD2_CLONE_BLKSIZE | \
NFSD4_2_SECURITY_ATTRS | \
FATTR4_WORD2_XATTR_SUPPORT | \
FATTR4_WORD2_TIME_DELEG_ACCESS | \
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 3eb724ec9566..ed85dd43da18 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -269,9 +269,6 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct net *net,
dentry);
}
- fhp->fh_dentry = dentry;
- fhp->fh_export = exp;
-
switch (fhp->fh_maxsize) {
case NFS4_FHSIZE:
if (dentry->d_sb->s_export_op->flags & EXPORT_OP_NOATOMIC_ATTR)
@@ -293,6 +290,9 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct net *net,
goto out;
}
+ fhp->fh_dentry = dentry;
+ fhp->fh_export = exp;
+
return 0;
out:
exp_put(exp);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 9cb20d4aeab1..cf4062ac092a 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1159,7 +1159,7 @@ static int wait_for_concurrent_writes(struct file *file)
dprintk("nfsd: write resume %d\n", task_pid_nr(current));
}
- if (inode->i_state & I_DIRTY) {
+ if (inode_state_read_once(inode) & I_DIRTY) {
dprintk("nfsd: write sync %d\n", task_pid_nr(current));
err = vfs_fsync(file, 0);
}
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index d4b48602b2b0..1ce8e12ae335 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -574,8 +574,9 @@ struct nfsd4_sequence {
struct nfs4_sessionid sessionid; /* request/response */
u32 seqid; /* request/response */
u32 slotid; /* request/response */
- u32 maxslots; /* request/response */
+ u32 maxslots; /* request */
u32 cachethis; /* request */
+ u32 maxslots_response; /* response */
u32 target_maxslots; /* response */
u32 status_flags; /* response */
};
@@ -903,6 +904,7 @@ struct nfsd4_compoundargs {
char * tag;
u32 taglen;
u32 minorversion;
+ u32 client_opcnt;
u32 opcnt;
bool splice_ok;
struct nfsd4_op *ops;
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index bcc7d76269ac..4bbdc832d7f2 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -1148,7 +1148,7 @@ int nilfs_cpfile_read(struct super_block *sb, size_t cpsize,
cpfile = nilfs_iget_locked(sb, NULL, NILFS_CPFILE_INO);
if (unlikely(!cpfile))
return -ENOMEM;
- if (!(cpfile->i_state & I_NEW))
+ if (!(inode_state_read_once(cpfile) & I_NEW))
goto out;
err = nilfs_mdt_init(cpfile, NILFS_MDT_GFP, 0);
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index c664daba56ae..674380837ab9 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -506,7 +506,7 @@ int nilfs_dat_read(struct super_block *sb, size_t entry_size,
dat = nilfs_iget_locked(sb, NULL, NILFS_DAT_INO);
if (unlikely(!dat))
return -ENOMEM;
- if (!(dat->i_state & I_NEW))
+ if (!(inode_state_read_once(dat) & I_NEW))
goto out;
err = nilfs_mdt_init(dat, NILFS_MDT_GFP, sizeof(*di));
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index c4cd4a4dedd0..99eb8a59009e 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -188,7 +188,7 @@ int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root,
ifile = nilfs_iget_locked(sb, root, NILFS_IFILE_INO);
if (unlikely(!ifile))
return -ENOMEM;
- if (!(ifile->i_state & I_NEW))
+ if (!(inode_state_read_once(ifile) & I_NEW))
goto out;
err = nilfs_mdt_init(ifile, NILFS_MDT_GFP,
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 87ddde159f0c..51bde45d5865 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -365,7 +365,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
failed_after_creation:
clear_nlink(inode);
- if (inode->i_state & I_NEW)
+ if (inode_state_read_once(inode) & I_NEW)
unlock_new_inode(inode);
iput(inode); /*
* raw_inode will be deleted through
@@ -562,7 +562,7 @@ struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root,
if (unlikely(!inode))
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW)) {
+ if (!(inode_state_read_once(inode) & I_NEW)) {
if (!inode->i_nlink) {
iput(inode);
return ERR_PTR(-ESTALE);
@@ -591,7 +591,7 @@ struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino,
inode = iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args);
if (unlikely(!inode))
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
err = nilfs_init_gcinode(inode);
@@ -631,7 +631,7 @@ int nilfs_attach_btree_node_cache(struct inode *inode)
nilfs_iget_set, &args);
if (unlikely(!btnc_inode))
return -ENOMEM;
- if (btnc_inode->i_state & I_NEW) {
+ if (inode_state_read_once(btnc_inode) & I_NEW) {
nilfs_init_btnc_inode(btnc_inode);
unlock_new_inode(btnc_inode);
}
@@ -686,7 +686,7 @@ struct inode *nilfs_iget_for_shadow(struct inode *inode)
nilfs_iget_set, &args);
if (unlikely(!s_inode))
return ERR_PTR(-ENOMEM);
- if (!(s_inode->i_state & I_NEW))
+ if (!(inode_state_read_once(s_inode) & I_NEW))
return inode;
NILFS_I(s_inode)->i_flags = 0;
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index f466daa39440..b7e3d91b6243 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -14,6 +14,7 @@
#include <linux/buffer_head.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
+#include <linux/fs_struct.h>
#include <linux/nilfs2_api.h>
#include <linux/nilfs2_ondisk.h>
#include "the_nilfs.h"
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index f15ca6fc400d..deee16bc9d4e 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -2768,7 +2768,12 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
if (sci->sc_task) {
wake_up(&sci->sc_wait_daemon);
- kthread_stop(sci->sc_task);
+ if (kthread_stop(sci->sc_task)) {
+ spin_lock(&sci->sc_state_lock);
+ sci->sc_task = NULL;
+ timer_shutdown_sync(&sci->sc_timer);
+ spin_unlock(&sci->sc_state_lock);
+ }
}
spin_lock(&sci->sc_state_lock);
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 330f269abedf..83f93337c01b 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -1226,7 +1226,7 @@ int nilfs_sufile_read(struct super_block *sb, size_t susize,
sufile = nilfs_iget_locked(sb, NULL, NILFS_SUFILE_INO);
if (unlikely(!sufile))
return -ENOMEM;
- if (!(sufile->i_state & I_NEW))
+ if (!(inode_state_read_once(sufile) & I_NEW))
goto out;
err = nilfs_mdt_init(sufile, NILFS_MDT_GFP, sizeof(*sui));
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index 1161eabf11ee..9cc7eb863643 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -17,6 +17,7 @@
#include "fanotify/fanotify.h"
#include "fdinfo.h"
#include "fsnotify.h"
+#include "../internal.h"
#if defined(CONFIG_PROC_FS)
@@ -46,7 +47,12 @@ static void show_mark_fhandle(struct seq_file *m, struct inode *inode)
size = f->handle_bytes >> 2;
+ if (!super_trylock_shared(inode->i_sb))
+ return;
+
ret = exportfs_encode_fid(inode, (struct fid *)f->f_handle, &size);
+ up_read(&inode->i_sb->s_umount);
+
if ((ret == FILEID_INVALID) || (ret < 0))
return;
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 46bfc543f946..d27ff5e5f165 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -52,7 +52,7 @@ static void fsnotify_unmount_inodes(struct super_block *sb)
* the inode cannot have any associated watches.
*/
spin_lock(&inode->i_lock);
- if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
+ if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) {
spin_unlock(&inode->i_lock);
continue;
}
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 648dc59bef7f..a80f8d2a4122 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -58,6 +58,8 @@ const struct dentry_operations ns_dentry_operations = {
static void nsfs_evict(struct inode *inode)
{
struct ns_common *ns = inode->i_private;
+
+ __ns_ref_active_put(ns);
clear_inode(inode);
ns->ops->put(ns);
}
@@ -408,6 +410,7 @@ static const struct super_operations nsfs_ops = {
.statfs = simple_statfs,
.evict_inode = nsfs_evict,
.show_path = nsfs_show_path,
+ .drop_inode = inode_just_drop,
};
static int nsfs_init_inode(struct inode *inode, void *data)
@@ -418,6 +421,16 @@ static int nsfs_init_inode(struct inode *inode, void *data)
inode->i_mode |= S_IRUGO;
inode->i_fop = &ns_file_operations;
inode->i_ino = ns->inum;
+
+ /*
+ * Bring the namespace subtree back to life if we have to. This
+ * can happen when e.g., all processes using a network namespace
+ * and all namespace files or namespace file bind-mounts have
+ * died but there are still sockets pinning it. The SIOCGSKNS
+ * ioctl on such a socket will resurrect the relevant namespace
+ * subtree.
+ */
+ __ns_ref_active_get(ns);
return 0;
}
@@ -458,6 +471,45 @@ static int nsfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
return FILEID_NSFS;
}
+bool is_current_namespace(struct ns_common *ns)
+{
+ switch (ns->ns_type) {
+#ifdef CONFIG_CGROUPS
+ case CLONE_NEWCGROUP:
+ return current_in_namespace(to_cg_ns(ns));
+#endif
+#ifdef CONFIG_IPC_NS
+ case CLONE_NEWIPC:
+ return current_in_namespace(to_ipc_ns(ns));
+#endif
+ case CLONE_NEWNS:
+ return current_in_namespace(to_mnt_ns(ns));
+#ifdef CONFIG_NET_NS
+ case CLONE_NEWNET:
+ return current_in_namespace(to_net_ns(ns));
+#endif
+#ifdef CONFIG_PID_NS
+ case CLONE_NEWPID:
+ return current_in_namespace(to_pid_ns(ns));
+#endif
+#ifdef CONFIG_TIME_NS
+ case CLONE_NEWTIME:
+ return current_in_namespace(to_time_ns(ns));
+#endif
+#ifdef CONFIG_USER_NS
+ case CLONE_NEWUSER:
+ return current_in_namespace(to_user_ns(ns));
+#endif
+#ifdef CONFIG_UTS_NS
+ case CLONE_NEWUTS:
+ return current_in_namespace(to_uts_ns(ns));
+#endif
+ default:
+ VFS_WARN_ON_ONCE(true);
+ return false;
+ }
+}
+
static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
int fh_len, int fh_type)
{
@@ -483,16 +535,35 @@ static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
return NULL;
}
+ if (!fid->ns_id)
+ return NULL;
+ /* Either both are set or both are unset. */
+ if (!fid->ns_inum != !fid->ns_type)
+ return NULL;
+
scoped_guard(rcu) {
ns = ns_tree_lookup_rcu(fid->ns_id, fid->ns_type);
if (!ns)
return NULL;
VFS_WARN_ON_ONCE(ns->ns_id != fid->ns_id);
- VFS_WARN_ON_ONCE(ns->ns_type != fid->ns_type);
- VFS_WARN_ON_ONCE(ns->inum != fid->ns_inum);
- if (!__ns_ref_get(ns))
+ if (fid->ns_inum && (fid->ns_inum != ns->inum))
+ return NULL;
+ if (fid->ns_type && (fid->ns_type != ns->ns_type))
+ return NULL;
+
+ /*
+ * This is racy because we're not actually taking an
+ * active reference. IOW, it could happen that the
+ * namespace becomes inactive after this check.
+ * We don't care because nsfs_init_inode() will just
+ * resurrect the relevant namespace tree for us. If it
+ * has been active here we just allow it's resurrection.
+ * We could try to take an active reference here and
+ * then drop it again. But really, why bother.
+ */
+ if (!ns_get_unless_inactive(ns))
return NULL;
}
@@ -588,6 +659,8 @@ static int nsfs_init_fs_context(struct fs_context *fc)
struct pseudo_fs_context *ctx = init_pseudo(fc, NSFS_MAGIC);
if (!ctx)
return -ENOMEM;
+ fc->s_iflags |= SB_I_NOEXEC | SB_I_NODEV;
+ ctx->s_d_flags |= DCACHE_DONTCACHE;
ctx->ops = &nsfs_ops;
ctx->eops = &nsfs_export_operations;
ctx->dops = &ns_dentry_operations;
@@ -610,3 +683,27 @@ void __init nsfs_init(void)
nsfs_root_path.mnt = nsfs_mnt;
nsfs_root_path.dentry = nsfs_mnt->mnt_root;
}
+
+void nsproxy_ns_active_get(struct nsproxy *ns)
+{
+ ns_ref_active_get(ns->mnt_ns);
+ ns_ref_active_get(ns->uts_ns);
+ ns_ref_active_get(ns->ipc_ns);
+ ns_ref_active_get(ns->pid_ns_for_children);
+ ns_ref_active_get(ns->cgroup_ns);
+ ns_ref_active_get(ns->net_ns);
+ ns_ref_active_get(ns->time_ns);
+ ns_ref_active_get(ns->time_ns_for_children);
+}
+
+void nsproxy_ns_active_put(struct nsproxy *ns)
+{
+ ns_ref_active_put(ns->mnt_ns);
+ ns_ref_active_put(ns->uts_ns);
+ ns_ref_active_put(ns->ipc_ns);
+ ns_ref_active_put(ns->pid_ns_for_children);
+ ns_ref_active_put(ns->cgroup_ns);
+ ns_ref_active_put(ns->net_ns);
+ ns_ref_active_put(ns->time_ns);
+ ns_ref_active_put(ns->time_ns_for_children);
+}
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index 3959f23c487a..08266adc42ba 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -537,7 +537,7 @@ struct inode *ntfs_iget5(struct super_block *sb, const struct MFT_REF *ref,
return ERR_PTR(-ENOMEM);
/* If this is a freshly allocated inode, need to read it now. */
- if (inode->i_state & I_NEW)
+ if (inode_state_read_once(inode) & I_NEW)
inode = ntfs_read_mft(inode, name, ref);
else if (ref->seq != ntfs_i(inode)->mi.mrec->seq) {
/*
diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c
index ddff94c091b8..8d09dfec970a 100644
--- a/fs/ntfs3/super.c
+++ b/fs/ntfs3/super.c
@@ -51,6 +51,7 @@
#include <linux/buffer_head.h>
#include <linux/exportfs.h>
#include <linux/fs.h>
+#include <linux/fs_struct.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include <linux/log2.h>
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 62464d194da3..af1e2cedb217 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -13,6 +13,7 @@
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/string.h>
+#include <linux/fs_struct.h>
#include <cluster/masklog.h>
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 162711cc5b20..b267ec580da9 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6892,7 +6892,7 @@ static void ocfs2_zero_cluster_folios(struct inode *inode, loff_t start,
ocfs2_map_and_dirty_folio(inode, handle, from, to, folio, 1,
&phys);
- start = folio_next_index(folio) << PAGE_SHIFT;
+ start = folio_next_pos(folio);
}
out:
if (folios)
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 92a6149da9c1..619ff03b15d6 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2487,7 +2487,7 @@ update:
* which hasn't been populated yet, so clear the refresh flag
* and let the caller handle it.
*/
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
status = 0;
if (lockres)
ocfs2_complete_lock_res_refresh(lockres, 0);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index fcc89856ab95..78f81950c9ee 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -152,8 +152,8 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
mlog_errno(PTR_ERR(inode));
goto bail;
}
- trace_ocfs2_iget5_locked(inode->i_state);
- if (inode->i_state & I_NEW) {
+ trace_ocfs2_iget5_locked(inode_state_read_once(inode));
+ if (inode_state_read_once(inode) & I_NEW) {
rc = ocfs2_read_locked_inode(inode, &args);
unlock_new_inode(inode);
}
@@ -1290,6 +1290,8 @@ static void ocfs2_clear_inode(struct inode *inode)
void ocfs2_evict_inode(struct inode *inode)
{
+ write_inode_now(inode, 1);
+
if (!inode->i_nlink ||
(OCFS2_I(inode)->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)) {
ocfs2_delete_inode(inode);
@@ -1299,27 +1301,6 @@ void ocfs2_evict_inode(struct inode *inode)
ocfs2_clear_inode(inode);
}
-/* Called under inode_lock, with no more references on the
- * struct inode, so it's safe here to check the flags field
- * and to manipulate i_nlink without any other locks. */
-int ocfs2_drop_inode(struct inode *inode)
-{
- struct ocfs2_inode_info *oi = OCFS2_I(inode);
-
- trace_ocfs2_drop_inode((unsigned long long)oi->ip_blkno,
- inode->i_nlink, oi->ip_flags);
-
- assert_spin_locked(&inode->i_lock);
- inode->i_state |= I_WILL_FREE;
- spin_unlock(&inode->i_lock);
- write_inode_now(inode, 1);
- spin_lock(&inode->i_lock);
- WARN_ON(inode->i_state & I_NEW);
- inode->i_state &= ~I_WILL_FREE;
-
- return 1;
-}
-
/*
* This is called from our getattr.
*/
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index accf03d4765e..07bd838e7843 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -116,7 +116,6 @@ static inline struct ocfs2_caching_info *INODE_CACHE(struct inode *inode)
}
void ocfs2_evict_inode(struct inode *inode);
-int ocfs2_drop_inode(struct inode *inode);
/* Flags for ocfs2_iget() */
#define OCFS2_FI_FLAG_SYSFILE 0x1
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index e5f58ff2175f..85239807dec7 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -902,15 +902,8 @@ bail:
static int ocfs2_journal_submit_inode_data_buffers(struct jbd2_inode *jinode)
{
- struct address_space *mapping = jinode->i_vfs_inode->i_mapping;
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_ALL,
- .nr_to_write = mapping->nrpages * 2,
- .range_start = jinode->i_dirty_start,
- .range_end = jinode->i_dirty_end,
- };
-
- return filemap_fdatawrite_wbc(mapping, &wbc);
+ return filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping,
+ jinode->i_dirty_start, jinode->i_dirty_end);
}
int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty)
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 86f2631e6360..10923bf7c8b8 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -867,6 +867,11 @@ static int __ocfs2_move_extents_range(struct buffer_head *di_bh,
mlog_errno(ret);
goto out;
}
+ /*
+ * Invalidate extent cache after moving/defragging to prevent
+ * stale cached data with outdated extent flags.
+ */
+ ocfs2_extent_map_trunc(inode, cpos);
context->clusters_moved += alloc_size;
next:
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 54ed1495de9a..4b32fb5658ad 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -1569,8 +1569,6 @@ DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_delete_inode);
DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_clear_inode);
-DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_drop_inode);
-
TRACE_EVENT(ocfs2_inode_revalidate,
TP_PROTO(void *inode, unsigned long long ino,
unsigned int flags),
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 53daa4482406..2c7ba1480f7a 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -129,7 +129,7 @@ static const struct super_operations ocfs2_sops = {
.statfs = ocfs2_statfs,
.alloc_inode = ocfs2_alloc_inode,
.free_inode = ocfs2_free_inode,
- .drop_inode = ocfs2_drop_inode,
+ .drop_inode = inode_just_drop,
.evict_inode = ocfs2_evict_inode,
.sync_fs = ocfs2_sync_fs,
.put_super = ocfs2_put_super,
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 135c49c5d848..701ed85d9831 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -14,6 +14,7 @@
#include <linux/writeback.h>
#include <linux/seq_file.h>
#include <linux/crc-itu-t.h>
+#include <linux/fs_struct.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include "omfs.h"
@@ -212,7 +213,7 @@ struct inode *omfs_iget(struct super_block *sb, ino_t ino)
inode = iget_locked(sb, ino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
bh = omfs_bread(inode->i_sb, ino);
diff --git a/fs/open.c b/fs/open.c
index 3d64372ecc67..2a2cf9007900 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -191,12 +191,9 @@ int do_ftruncate(struct file *file, loff_t length, int small)
if (error)
return error;
- sb_start_write(inode->i_sb);
- error = do_truncate(file_mnt_idmap(file), dentry, length,
- ATTR_MTIME | ATTR_CTIME, file);
- sb_end_write(inode->i_sb);
-
- return error;
+ scoped_guard(super_write, inode->i_sb)
+ return do_truncate(file_mnt_idmap(file), dentry, length,
+ ATTR_MTIME | ATTR_CTIME, file);
}
int do_sys_ftruncate(unsigned int fd, loff_t length, int small)
@@ -940,7 +937,7 @@ static int do_dentry_open(struct file *f,
}
error = security_file_open(f);
- if (error)
+ if (unlikely(error))
goto cleanup_all;
/*
@@ -950,11 +947,11 @@ static int do_dentry_open(struct file *f,
* pseudo file, this call will not change the mode.
*/
error = fsnotify_open_perm_and_set_mode(f);
- if (error)
+ if (unlikely(error))
goto cleanup_all;
error = break_lease(file_inode(f), f->f_flags);
- if (error)
+ if (unlikely(error))
goto cleanup_all;
/* normally all 3 are set; ->open() can clear them if needed */
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 26ecda0e4d19..fb8d84bdedfb 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -236,7 +236,7 @@ found:
mutex_unlock(&op_mutex);
if (IS_ERR(inode))
return ERR_CAST(inode);
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
simple_inode_init_ts(inode);
ent_oi = OP_I(inode);
ent_oi->type = ent_type;
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index a01400cd41fd..d7275990ffa4 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -878,7 +878,9 @@ int orangefs_update_time(struct inode *inode, int flags)
gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_update_time: %pU\n",
get_khandle_from_ino(inode));
- flags = generic_update_time(inode, flags);
+
+ flags = inode_update_timestamps(inode, flags);
+
memset(&iattr, 0, sizeof iattr);
if (flags & S_ATIME)
iattr.ia_valid |= ATTR_ATIME;
@@ -1041,7 +1043,7 @@ struct inode *orangefs_iget(struct super_block *sb,
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
error = orangefs_inode_getattr(inode, ORANGEFS_GETATTR_NEW);
diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c
index 0fdceb00ca07..9ab1119ebd28 100644
--- a/fs/orangefs/orangefs-utils.c
+++ b/fs/orangefs/orangefs-utils.c
@@ -247,7 +247,7 @@ again:
spin_lock(&inode->i_lock);
/* Must have all the attributes in the mask and be within cache time. */
if ((!flags && time_before(jiffies, orangefs_inode->getattr_time)) ||
- orangefs_inode->attr_valid || inode->i_state & I_DIRTY_PAGES) {
+ orangefs_inode->attr_valid || inode_state_read(inode) & I_DIRTY_PAGES) {
if (orangefs_inode->attr_valid) {
spin_unlock(&inode->i_lock);
write_inode_now(inode, 1);
@@ -281,13 +281,13 @@ again2:
spin_lock(&inode->i_lock);
/* Must have all the attributes in the mask and be within cache time. */
if ((!flags && time_before(jiffies, orangefs_inode->getattr_time)) ||
- orangefs_inode->attr_valid || inode->i_state & I_DIRTY_PAGES) {
+ orangefs_inode->attr_valid || inode_state_read(inode) & I_DIRTY_PAGES) {
if (orangefs_inode->attr_valid) {
spin_unlock(&inode->i_lock);
write_inode_now(inode, 1);
goto again2;
}
- if (inode->i_state & I_DIRTY_PAGES) {
+ if (inode_state_read(inode) & I_DIRTY_PAGES) {
ret = 0;
goto out_unlock;
}
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index aac7e34f56c1..604a82acd164 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -178,7 +178,7 @@ static int ovl_copy_fileattr(struct inode *inode, const struct path *old,
err = ovl_real_fileattr_get(old, &oldfa);
if (err) {
/* Ntfs-3g returns -EINVAL for "no fileattr support" */
- if (err == -EOPNOTSUPP || err == -EINVAL)
+ if (err == -ENOTTY || err == -EINVAL)
return 0;
pr_warn("failed to retrieve lower fileattr (%pd2, err=%i)\n",
old->dentry, err);
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index a5e9ddf3023b..83b955a1d55c 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -686,7 +686,7 @@ static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
goto out_drop_write;
spin_lock(&inode->i_lock);
- inode->i_state |= I_CREATING;
+ inode_state_set(inode, I_CREATING);
spin_unlock(&inode->i_lock);
inode_init_owner(&nop_mnt_idmap, inode, dentry->d_parent->d_inode, mode);
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index fc52c796061d..7ab2c9daffd0 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -369,11 +369,6 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
if (!ovl_should_sync(OVL_FS(inode->i_sb)))
ifl &= ~(IOCB_DSYNC | IOCB_SYNC);
- /*
- * Overlayfs doesn't support deferred completions, don't copy
- * this property in case it is set by the issuer.
- */
- ifl &= ~IOCB_DIO_CALLER_COMP;
ret = backing_file_write_iter(realfile, iter, iocb, ifl, &ctx);
out_unlock:
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index aaa4cf579561..1af700668089 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -720,7 +720,10 @@ int ovl_real_fileattr_get(const struct path *realpath, struct file_kattr *fa)
if (err)
return err;
- return vfs_fileattr_get(realpath->dentry, fa);
+ err = vfs_fileattr_get(realpath->dentry, fa);
+ if (err == -ENOIOCTLCMD)
+ err = -ENOTTY;
+ return err;
}
int ovl_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
@@ -1149,7 +1152,7 @@ struct inode *ovl_get_trap_inode(struct super_block *sb, struct dentry *dir)
if (!trap)
return ERR_PTR(-ENOMEM);
- if (!(trap->i_state & I_NEW)) {
+ if (!(inode_state_read_once(trap) & I_NEW)) {
/* Conflicting layer roots? */
iput(trap);
return ERR_PTR(-ELOOP);
@@ -1240,7 +1243,7 @@ struct inode *ovl_get_inode(struct super_block *sb,
inode = ovl_iget5(sb, oip->newinode, key);
if (!inode)
goto out_err;
- if (!(inode->i_state & I_NEW)) {
+ if (!(inode_state_read_once(inode) & I_NEW)) {
/*
* Verify that the underlying files stored in the inode
* match those in the dentry.
@@ -1300,7 +1303,7 @@ struct inode *ovl_get_inode(struct super_block *sb,
if (upperdentry)
ovl_check_protattr(inode, upperdentry);
- if (inode->i_state & I_NEW)
+ if (inode_state_read_once(inode) & I_NEW)
unlock_new_inode(inode);
out:
return inode;
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index f76672f2e686..312eb4529caf 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -1019,8 +1019,8 @@ bool ovl_inuse_trylock(struct dentry *dentry)
bool locked = false;
spin_lock(&inode->i_lock);
- if (!(inode->i_state & I_OVL_INUSE)) {
- inode->i_state |= I_OVL_INUSE;
+ if (!(inode_state_read(inode) & I_OVL_INUSE)) {
+ inode_state_set(inode, I_OVL_INUSE);
locked = true;
}
spin_unlock(&inode->i_lock);
@@ -1034,8 +1034,8 @@ void ovl_inuse_unlock(struct dentry *dentry)
struct inode *inode = d_inode(dentry);
spin_lock(&inode->i_lock);
- WARN_ON(!(inode->i_state & I_OVL_INUSE));
- inode->i_state &= ~I_OVL_INUSE;
+ WARN_ON(!(inode_state_read(inode) & I_OVL_INUSE));
+ inode_state_clear(inode, I_OVL_INUSE);
spin_unlock(&inode->i_lock);
}
}
@@ -1046,7 +1046,7 @@ bool ovl_is_inuse(struct dentry *dentry)
bool inuse;
spin_lock(&inode->i_lock);
- inuse = (inode->i_state & I_OVL_INUSE);
+ inuse = (inode_state_read(inode) & I_OVL_INUSE);
spin_unlock(&inode->i_lock);
return inuse;
@@ -1234,9 +1234,9 @@ int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *work,
goto err;
if (trap)
goto err_unlock;
- if (work && work->d_parent != workdir)
+ if (work && (work->d_parent != workdir || d_unhashed(work)))
goto err_unlock;
- if (upper && upper->d_parent != upperdir)
+ if (upper && (upper->d_parent != upperdir || d_unhashed(upper)))
goto err_unlock;
return 0;
diff --git a/fs/pidfs.c b/fs/pidfs.c
index 0ef5b47d796a..dba703d4ce4a 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -39,20 +39,20 @@ void pidfs_get_root(struct path *path)
path_get(path);
}
-/*
- * Stashes information that userspace needs to access even after the
- * process has been reaped.
- */
-struct pidfs_exit_info {
- __u64 cgroupid;
- __s32 exit_code;
- __u32 coredump_mask;
+enum pidfs_attr_mask_bits {
+ PIDFS_ATTR_BIT_EXIT = 0,
+ PIDFS_ATTR_BIT_COREDUMP = 1,
};
struct pidfs_attr {
+ unsigned long attr_mask;
struct simple_xattrs *xattrs;
- struct pidfs_exit_info __pei;
- struct pidfs_exit_info *exit_info;
+ struct /* exit info */ {
+ __u64 cgroupid;
+ __s32 exit_code;
+ };
+ __u32 coredump_mask;
+ __u32 coredump_signal;
};
static struct rb_root pidfs_ino_tree = RB_ROOT;
@@ -293,6 +293,15 @@ static __u32 pidfs_coredump_mask(unsigned long mm_flags)
return 0;
}
+/* This must be updated whenever a new flag is added */
+#define PIDFD_INFO_SUPPORTED (PIDFD_INFO_PID | \
+ PIDFD_INFO_CREDS | \
+ PIDFD_INFO_CGROUPID | \
+ PIDFD_INFO_EXIT | \
+ PIDFD_INFO_COREDUMP | \
+ PIDFD_INFO_SUPPORTED_MASK | \
+ PIDFD_INFO_COREDUMP_SIGNAL)
+
static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
{
struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg;
@@ -300,12 +309,13 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
struct pid *pid = pidfd_pid(file);
size_t usize = _IOC_SIZE(cmd);
struct pidfd_info kinfo = {};
- struct pidfs_exit_info *exit_info;
struct user_namespace *user_ns;
struct pidfs_attr *attr;
const struct cred *c;
__u64 mask;
+ BUILD_BUG_ON(sizeof(struct pidfd_info) != PIDFD_INFO_SIZE_VER2);
+
if (!uinfo)
return -EINVAL;
if (usize < PIDFD_INFO_SIZE_VER0)
@@ -323,20 +333,24 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
attr = READ_ONCE(pid->attr);
if (mask & PIDFD_INFO_EXIT) {
- exit_info = READ_ONCE(attr->exit_info);
- if (exit_info) {
+ if (test_bit(PIDFS_ATTR_BIT_EXIT, &attr->attr_mask)) {
+ smp_rmb();
kinfo.mask |= PIDFD_INFO_EXIT;
#ifdef CONFIG_CGROUPS
- kinfo.cgroupid = exit_info->cgroupid;
+ kinfo.cgroupid = attr->cgroupid;
kinfo.mask |= PIDFD_INFO_CGROUPID;
#endif
- kinfo.exit_code = exit_info->exit_code;
+ kinfo.exit_code = attr->exit_code;
}
}
if (mask & PIDFD_INFO_COREDUMP) {
- kinfo.mask |= PIDFD_INFO_COREDUMP;
- kinfo.coredump_mask = READ_ONCE(attr->__pei.coredump_mask);
+ if (test_bit(PIDFS_ATTR_BIT_COREDUMP, &attr->attr_mask)) {
+ smp_rmb();
+ kinfo.mask |= PIDFD_INFO_COREDUMP | PIDFD_INFO_COREDUMP_SIGNAL;
+ kinfo.coredump_mask = attr->coredump_mask;
+ kinfo.coredump_signal = attr->coredump_signal;
+ }
}
task = get_pid_task(pid, PIDTYPE_PID);
@@ -355,14 +369,15 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
if (!c)
return -ESRCH;
- if ((kinfo.mask & PIDFD_INFO_COREDUMP) && !(kinfo.coredump_mask)) {
- task_lock(task);
+ if ((mask & PIDFD_INFO_COREDUMP) && !kinfo.coredump_mask) {
+ guard(task_lock)(task);
if (task->mm) {
unsigned long flags = __mm_flags_get_dumpable(task->mm);
kinfo.coredump_mask = pidfs_coredump_mask(flags);
+ kinfo.mask |= PIDFD_INFO_COREDUMP;
+ /* No coredump actually took place, so no coredump signal. */
}
- task_unlock(task);
}
/* Unconditionally return identifiers and credentials, the rest only on request */
@@ -409,6 +424,13 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
return -ESRCH;
copy_out:
+ if (mask & PIDFD_INFO_SUPPORTED_MASK) {
+ kinfo.mask |= PIDFD_INFO_SUPPORTED_MASK;
+ kinfo.supported_mask = PIDFD_INFO_SUPPORTED;
+ }
+
+ /* Are there bits in the return mask not present in PIDFD_INFO_SUPPORTED? */
+ WARN_ON_ONCE(~PIDFD_INFO_SUPPORTED & kinfo.mask);
/*
* If userspace and the kernel have the same struct size it can just
* be copied. If userspace provides an older struct, only the bits that
@@ -454,7 +476,6 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
struct task_struct *task __free(put_task) = NULL;
struct nsproxy *nsp __free(put_nsproxy) = NULL;
struct ns_common *ns_common = NULL;
- struct pid_namespace *pid_ns;
if (!pidfs_ioctl_valid(cmd))
return -ENOIOCTLCMD;
@@ -496,66 +517,64 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
switch (cmd) {
/* Namespaces that hang of nsproxy. */
case PIDFD_GET_CGROUP_NAMESPACE:
- if (IS_ENABLED(CONFIG_CGROUPS)) {
- get_cgroup_ns(nsp->cgroup_ns);
- ns_common = to_ns_common(nsp->cgroup_ns);
- }
+ if (!ns_ref_get(nsp->cgroup_ns))
+ break;
+ ns_common = to_ns_common(nsp->cgroup_ns);
break;
case PIDFD_GET_IPC_NAMESPACE:
- if (IS_ENABLED(CONFIG_IPC_NS)) {
- get_ipc_ns(nsp->ipc_ns);
- ns_common = to_ns_common(nsp->ipc_ns);
- }
+ if (!ns_ref_get(nsp->ipc_ns))
+ break;
+ ns_common = to_ns_common(nsp->ipc_ns);
break;
case PIDFD_GET_MNT_NAMESPACE:
- get_mnt_ns(nsp->mnt_ns);
+ if (!ns_ref_get(nsp->mnt_ns))
+ break;
ns_common = to_ns_common(nsp->mnt_ns);
break;
case PIDFD_GET_NET_NAMESPACE:
- if (IS_ENABLED(CONFIG_NET_NS)) {
- ns_common = to_ns_common(nsp->net_ns);
- get_net_ns(ns_common);
- }
+ if (!ns_ref_get(nsp->net_ns))
+ break;
+ ns_common = to_ns_common(nsp->net_ns);
break;
case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE:
- if (IS_ENABLED(CONFIG_PID_NS)) {
- get_pid_ns(nsp->pid_ns_for_children);
- ns_common = to_ns_common(nsp->pid_ns_for_children);
- }
+ if (!ns_ref_get(nsp->pid_ns_for_children))
+ break;
+ ns_common = to_ns_common(nsp->pid_ns_for_children);
break;
case PIDFD_GET_TIME_NAMESPACE:
- if (IS_ENABLED(CONFIG_TIME_NS)) {
- get_time_ns(nsp->time_ns);
- ns_common = to_ns_common(nsp->time_ns);
- }
+ if (!ns_ref_get(nsp->time_ns))
+ break;
+ ns_common = to_ns_common(nsp->time_ns);
break;
case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE:
- if (IS_ENABLED(CONFIG_TIME_NS)) {
- get_time_ns(nsp->time_ns_for_children);
- ns_common = to_ns_common(nsp->time_ns_for_children);
- }
+ if (!ns_ref_get(nsp->time_ns_for_children))
+ break;
+ ns_common = to_ns_common(nsp->time_ns_for_children);
break;
case PIDFD_GET_UTS_NAMESPACE:
- if (IS_ENABLED(CONFIG_UTS_NS)) {
- get_uts_ns(nsp->uts_ns);
- ns_common = to_ns_common(nsp->uts_ns);
- }
+ if (!ns_ref_get(nsp->uts_ns))
+ break;
+ ns_common = to_ns_common(nsp->uts_ns);
break;
/* Namespaces that don't hang of nsproxy. */
case PIDFD_GET_USER_NAMESPACE:
- if (IS_ENABLED(CONFIG_USER_NS)) {
- rcu_read_lock();
- ns_common = to_ns_common(get_user_ns(task_cred_xxx(task, user_ns)));
- rcu_read_unlock();
+ scoped_guard(rcu) {
+ struct user_namespace *user_ns;
+
+ user_ns = task_cred_xxx(task, user_ns);
+ if (!ns_ref_get(user_ns))
+ break;
+ ns_common = to_ns_common(user_ns);
}
break;
case PIDFD_GET_PID_NAMESPACE:
- if (IS_ENABLED(CONFIG_PID_NS)) {
- rcu_read_lock();
+ scoped_guard(rcu) {
+ struct pid_namespace *pid_ns;
+
pid_ns = task_active_pid_ns(task);
- if (pid_ns)
- ns_common = to_ns_common(get_pid_ns(pid_ns));
- rcu_read_unlock();
+ if (!ns_ref_get(pid_ns))
+ break;
+ ns_common = to_ns_common(pid_ns);
}
break;
default:
@@ -606,24 +625,25 @@ void pidfs_exit(struct task_struct *tsk)
{
struct pid *pid = task_pid(tsk);
struct pidfs_attr *attr;
- struct pidfs_exit_info *exit_info;
#ifdef CONFIG_CGROUPS
struct cgroup *cgrp;
#endif
might_sleep();
- guard(spinlock_irq)(&pid->wait_pidfd.lock);
- attr = pid->attr;
- if (!attr) {
- /*
- * No one ever held a pidfd for this struct pid.
- * Mark it as dead so no one can add a pidfs
- * entry anymore. We're about to be reaped and
- * so no exit information would be available.
- */
- pid->attr = PIDFS_PID_DEAD;
- return;
+ /* Synchronize with pidfs_register_pid(). */
+ scoped_guard(spinlock_irq, &pid->wait_pidfd.lock) {
+ attr = pid->attr;
+ if (!attr) {
+ /*
+ * No one ever held a pidfd for this struct pid.
+ * Mark it as dead so no one can add a pidfs
+ * entry anymore. We're about to be reaped and
+ * so no exit information would be available.
+ */
+ pid->attr = PIDFS_PID_DEAD;
+ return;
+ }
}
/*
@@ -634,41 +654,39 @@ void pidfs_exit(struct task_struct *tsk)
* is put
*/
- exit_info = &attr->__pei;
-
#ifdef CONFIG_CGROUPS
rcu_read_lock();
cgrp = task_dfl_cgroup(tsk);
- exit_info->cgroupid = cgroup_id(cgrp);
+ attr->cgroupid = cgroup_id(cgrp);
rcu_read_unlock();
#endif
- exit_info->exit_code = tsk->exit_code;
+ attr->exit_code = tsk->exit_code;
/* Ensure that PIDFD_GET_INFO sees either all or nothing. */
- smp_store_release(&attr->exit_info, &attr->__pei);
+ smp_wmb();
+ set_bit(PIDFS_ATTR_BIT_EXIT, &attr->attr_mask);
}
#ifdef CONFIG_COREDUMP
void pidfs_coredump(const struct coredump_params *cprm)
{
struct pid *pid = cprm->pid;
- struct pidfs_exit_info *exit_info;
struct pidfs_attr *attr;
- __u32 coredump_mask = 0;
attr = READ_ONCE(pid->attr);
VFS_WARN_ON_ONCE(!attr);
VFS_WARN_ON_ONCE(attr == PIDFS_PID_DEAD);
- exit_info = &attr->__pei;
- /* Note how we were coredumped. */
- coredump_mask = pidfs_coredump_mask(cprm->mm_flags);
- /* Note that we actually did coredump. */
- coredump_mask |= PIDFD_COREDUMPED;
+ /* Note how we were coredumped and that we coredumped. */
+ attr->coredump_mask = pidfs_coredump_mask(cprm->mm_flags) |
+ PIDFD_COREDUMPED;
/* If coredumping is set to skip we should never end up here. */
- VFS_WARN_ON_ONCE(coredump_mask & PIDFD_COREDUMP_SKIP);
- smp_store_release(&exit_info->coredump_mask, coredump_mask);
+ VFS_WARN_ON_ONCE(attr->coredump_mask & PIDFD_COREDUMP_SKIP);
+ /* Expose the signal number that caused the coredump. */
+ attr->coredump_signal = cprm->siginfo->si_signo;
+ smp_wmb();
+ set_bit(PIDFS_ATTR_BIT_COREDUMP, &attr->attr_mask);
}
#endif
@@ -1022,6 +1040,7 @@ static int pidfs_init_fs_context(struct fs_context *fc)
fc->s_iflags |= SB_I_NOEXEC;
fc->s_iflags |= SB_I_NODEV;
+ ctx->s_d_flags |= DCACHE_DONTCACHE;
ctx->ops = &pidfs_sops;
ctx->eops = &pidfs_export_operations;
ctx->dops = &pidfs_dentry_operations;
diff --git a/fs/pipe.c b/fs/pipe.c
index 42fead1efe52..2d0fed2ecbfd 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -908,7 +908,7 @@ static struct inode * get_pipe_inode(void)
* list because "mark_inode_dirty()" will think
* that it already _is_ on the dirty list.
*/
- inode->i_state = I_DIRTY;
+ inode_state_assign_raw(inode, I_DIRTY);
inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 176281112273..501889856461 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -698,6 +698,12 @@ void pde_put(struct proc_dir_entry *pde)
}
}
+static void pde_erase(struct proc_dir_entry *pde, struct proc_dir_entry *parent)
+{
+ rb_erase(&pde->subdir_node, &parent->subdir);
+ RB_CLEAR_NODE(&pde->subdir_node);
+}
+
/*
* Remove a /proc entry and free it if it's not currently in use.
*/
@@ -720,7 +726,7 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
WARN(1, "removing permanent /proc entry '%s'", de->name);
de = NULL;
} else {
- rb_erase(&de->subdir_node, &parent->subdir);
+ pde_erase(de, parent);
if (S_ISDIR(de->mode))
parent->nlink--;
}
@@ -764,7 +770,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
root->parent->name, root->name);
return -EINVAL;
}
- rb_erase(&root->subdir_node, &parent->subdir);
+ pde_erase(root, parent);
de = root;
while (1) {
@@ -776,7 +782,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
next->parent->name, next->name);
return -EINVAL;
}
- rb_erase(&next->subdir_node, &de->subdir);
+ pde_erase(next, de);
de = next;
continue;
}
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index e399e2dd3a12..31d78da203ea 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -290,7 +290,7 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino)
inode = iget_locked(sb, ino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
qnx4_inode = qnx4_raw_inode(inode);
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index 3310d1ad4d0e..88d285005083 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -521,7 +521,7 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino)
inode = iget_locked(sb, ino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
ei = QNX6_I(inode);
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 6c4a6ee1fa2b..376739f6420e 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1033,7 +1033,7 @@ static int add_dquot_ref(struct super_block *sb, int type)
spin_lock(&sb->s_inode_list_lock);
list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
spin_lock(&inode->i_lock);
- if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
+ if ((inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) ||
!atomic_read(&inode->i_writecount) ||
!dqinit_needed(inode, type)) {
spin_unlock(&inode->i_lock);
diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c
index 4076336fbba6..572a9925bd6c 100644
--- a/fs/resctrl/monitor.c
+++ b/fs/resctrl/monitor.c
@@ -1782,15 +1782,13 @@ int resctrl_mon_resource_init(void)
mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID;
if (r->mon.mbm_cntr_assignable) {
- if (!resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID))
- resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID);
- if (!resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID))
- resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID);
- mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask;
- mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask &
- (READS_TO_LOCAL_MEM |
- READS_TO_LOCAL_S_MEM |
- NON_TEMP_WRITE_TO_LOCAL_MEM);
+ if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID))
+ mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask;
+ if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID))
+ mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask &
+ (READS_TO_LOCAL_MEM |
+ READS_TO_LOCAL_S_MEM |
+ NON_TEMP_WRITE_TO_LOCAL_MEM);
r->mon.mbm_assign_on_mkdir = true;
resctrl_file_fflags_init("num_mbm_cntrs",
RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 0addcc849ff2..360b00854115 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -302,7 +302,7 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos)
if (!i)
return ERR_PTR(-ENOMEM);
- if (!(i->i_state & I_NEW))
+ if (!(inode_state_read_once(i) & I_NEW))
return i;
/* precalculate the data offset */
diff --git a/fs/smb/client/Kconfig b/fs/smb/client/Kconfig
index a4c02199fef4..17bd368574e9 100644
--- a/fs/smb/client/Kconfig
+++ b/fs/smb/client/Kconfig
@@ -5,17 +5,16 @@ config CIFS
select NLS
select NLS_UCS2_UTILS
select CRYPTO
- select CRYPTO_MD5
- select CRYPTO_SHA256
- select CRYPTO_SHA512
select CRYPTO_CMAC
- select CRYPTO_HMAC
select CRYPTO_AEAD2
select CRYPTO_CCM
select CRYPTO_GCM
select CRYPTO_ECB
select CRYPTO_AES
select CRYPTO_LIB_ARC4
+ select CRYPTO_LIB_MD5
+ select CRYPTO_LIB_SHA256
+ select CRYPTO_LIB_SHA512
select KEYS
select DNS_RESOLVER
select ASN1
diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c
index b8ac7b7faf61..e3ea6fe7edb4 100644
--- a/fs/smb/client/cached_dir.c
+++ b/fs/smb/client/cached_dir.c
@@ -16,6 +16,7 @@ static struct cached_fid *init_cached_dir(const char *path);
static void free_cached_dir(struct cached_fid *cfid);
static void smb2_close_cached_fid(struct kref *ref);
static void cfids_laundromat_worker(struct work_struct *work);
+static void close_cached_dir_locked(struct cached_fid *cfid);
struct cached_dir_dentry {
struct list_head entry;
@@ -388,11 +389,11 @@ out:
* lease. Release one here, and the second below.
*/
cfid->has_lease = false;
- kref_put(&cfid->refcount, smb2_close_cached_fid);
+ close_cached_dir_locked(cfid);
}
spin_unlock(&cfids->cfid_list_lock);
- kref_put(&cfid->refcount, smb2_close_cached_fid);
+ close_cached_dir(cfid);
} else {
*ret_cfid = cfid;
atomic_inc(&tcon->num_remote_opens);
@@ -438,12 +439,14 @@ int open_cached_dir_by_dentry(struct cifs_tcon *tcon,
static void
smb2_close_cached_fid(struct kref *ref)
+__releases(&cfid->cfids->cfid_list_lock)
{
struct cached_fid *cfid = container_of(ref, struct cached_fid,
refcount);
int rc;
- spin_lock(&cfid->cfids->cfid_list_lock);
+ lockdep_assert_held(&cfid->cfids->cfid_list_lock);
+
if (cfid->on_list) {
list_del(&cfid->entry);
cfid->on_list = false;
@@ -478,15 +481,49 @@ void drop_cached_dir_by_name(const unsigned int xid, struct cifs_tcon *tcon,
spin_lock(&cfid->cfids->cfid_list_lock);
if (cfid->has_lease) {
cfid->has_lease = false;
- kref_put(&cfid->refcount, smb2_close_cached_fid);
+ close_cached_dir_locked(cfid);
}
spin_unlock(&cfid->cfids->cfid_list_lock);
close_cached_dir(cfid);
}
-
+/**
+ * close_cached_dir - drop a reference of a cached dir
+ *
+ * The release function will be called with cfid_list_lock held to remove the
+ * cached dirs from the list before any other thread can take another @cfid
+ * ref. Must not be called with cfid_list_lock held; use
+ * close_cached_dir_locked() called instead.
+ *
+ * @cfid: cached dir
+ */
void close_cached_dir(struct cached_fid *cfid)
{
+ lockdep_assert_not_held(&cfid->cfids->cfid_list_lock);
+ kref_put_lock(&cfid->refcount, smb2_close_cached_fid, &cfid->cfids->cfid_list_lock);
+}
+
+/**
+ * close_cached_dir_locked - put a reference of a cached dir with
+ * cfid_list_lock held
+ *
+ * Calling close_cached_dir() with cfid_list_lock held has the potential effect
+ * of causing a deadlock if the invariant of refcount >= 2 is false.
+ *
+ * This function is used in paths that hold cfid_list_lock and expect at least
+ * two references. If that invariant is violated, WARNs and returns without
+ * dropping a reference; the final put must still go through
+ * close_cached_dir().
+ *
+ * @cfid: cached dir
+ */
+static void close_cached_dir_locked(struct cached_fid *cfid)
+{
+ lockdep_assert_held(&cfid->cfids->cfid_list_lock);
+
+ if (WARN_ON(kref_read(&cfid->refcount) < 2))
+ return;
+
kref_put(&cfid->refcount, smb2_close_cached_fid);
}
@@ -596,7 +633,7 @@ cached_dir_offload_close(struct work_struct *work)
WARN_ON(cfid->on_list);
- kref_put(&cfid->refcount, smb2_close_cached_fid);
+ close_cached_dir(cfid);
cifs_put_tcon(tcon, netfs_trace_tcon_ref_put_cached_close);
}
@@ -762,7 +799,7 @@ static void cfids_laundromat_worker(struct work_struct *work)
* Drop the ref-count from above, either the lease-ref (if there
* was one) or the extra one acquired.
*/
- kref_put(&cfid->refcount, smb2_close_cached_fid);
+ close_cached_dir(cfid);
}
queue_delayed_work(cfid_put_wq, &cfids->laundromat_work,
dir_cache_timeout * HZ);
diff --git a/fs/smb/client/cifs_spnego.c b/fs/smb/client/cifs_spnego.c
index 9891f55bac1e..da935bd1ce87 100644
--- a/fs/smb/client/cifs_spnego.c
+++ b/fs/smb/client/cifs_spnego.c
@@ -90,7 +90,6 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo,
size_t desc_len;
struct key *spnego_key;
const char *hostname = server->hostname;
- const struct cred *saved_cred;
/* length of fields (with semicolons): ver=0xyz ip4=ipaddress
host=hostname sec=mechanism uid=0xFF user=username */
@@ -158,9 +157,8 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo,
dp += sprintf(dp, ";upcall_target=app");
cifs_dbg(FYI, "key description = %s\n", description);
- saved_cred = override_creds(spnego_cred);
- spnego_key = request_key(&cifs_spnego_key_type, description, "");
- revert_creds(saved_cred);
+ scoped_with_creds(spnego_cred)
+ spnego_key = request_key(&cifs_spnego_key_type, description, "");
#ifdef CONFIG_CIFS_DEBUG2
if (cifsFYI && !IS_ERR(spnego_key)) {
diff --git a/fs/smb/client/cifsacl.c b/fs/smb/client/cifsacl.c
index 63b3b1290bed..ce2ebc213a1d 100644
--- a/fs/smb/client/cifsacl.c
+++ b/fs/smb/client/cifsacl.c
@@ -339,7 +339,6 @@ int
sid_to_id(struct cifs_sb_info *cifs_sb, struct smb_sid *psid,
struct cifs_fattr *fattr, uint sidtype)
{
- int rc = 0;
struct key *sidkey;
char *sidstr;
const struct cred *saved_cred;
@@ -446,12 +445,12 @@ out_revert_creds:
* fails then we just fall back to using the ctx->linux_uid/linux_gid.
*/
got_valid_id:
- rc = 0;
if (sidtype == SIDOWNER)
fattr->cf_uid = fuid;
else
fattr->cf_gid = fgid;
- return rc;
+
+ return 0;
}
int
diff --git a/fs/smb/client/cifsencrypt.c b/fs/smb/client/cifsencrypt.c
index 7b7c8c38fdd0..801824825ecf 100644
--- a/fs/smb/client/cifsencrypt.c
+++ b/fs/smb/client/cifsencrypt.c
@@ -24,14 +24,43 @@
#include <linux/iov_iter.h>
#include <crypto/aead.h>
#include <crypto/arc4.h>
+#include <crypto/md5.h>
+#include <crypto/sha2.h>
-static size_t cifs_shash_step(void *iter_base, size_t progress, size_t len,
- void *priv, void *priv2)
+static int cifs_sig_update(struct cifs_calc_sig_ctx *ctx,
+ const u8 *data, size_t len)
{
- struct shash_desc *shash = priv;
+ if (ctx->md5) {
+ md5_update(ctx->md5, data, len);
+ return 0;
+ }
+ if (ctx->hmac) {
+ hmac_sha256_update(ctx->hmac, data, len);
+ return 0;
+ }
+ return crypto_shash_update(ctx->shash, data, len);
+}
+
+static int cifs_sig_final(struct cifs_calc_sig_ctx *ctx, u8 *out)
+{
+ if (ctx->md5) {
+ md5_final(ctx->md5, out);
+ return 0;
+ }
+ if (ctx->hmac) {
+ hmac_sha256_final(ctx->hmac, out);
+ return 0;
+ }
+ return crypto_shash_final(ctx->shash, out);
+}
+
+static size_t cifs_sig_step(void *iter_base, size_t progress, size_t len,
+ void *priv, void *priv2)
+{
+ struct cifs_calc_sig_ctx *ctx = priv;
int ret, *pret = priv2;
- ret = crypto_shash_update(shash, iter_base, len);
+ ret = cifs_sig_update(ctx, iter_base, len);
if (ret < 0) {
*pret = ret;
return len;
@@ -42,21 +71,20 @@ static size_t cifs_shash_step(void *iter_base, size_t progress, size_t len,
/*
* Pass the data from an iterator into a hash.
*/
-static int cifs_shash_iter(const struct iov_iter *iter, size_t maxsize,
- struct shash_desc *shash)
+static int cifs_sig_iter(const struct iov_iter *iter, size_t maxsize,
+ struct cifs_calc_sig_ctx *ctx)
{
struct iov_iter tmp_iter = *iter;
int err = -EIO;
- if (iterate_and_advance_kernel(&tmp_iter, maxsize, shash, &err,
- cifs_shash_step) != maxsize)
+ if (iterate_and_advance_kernel(&tmp_iter, maxsize, ctx, &err,
+ cifs_sig_step) != maxsize)
return err;
return 0;
}
-int __cifs_calc_signature(struct smb_rqst *rqst,
- struct TCP_Server_Info *server, char *signature,
- struct shash_desc *shash)
+int __cifs_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server,
+ char *signature, struct cifs_calc_sig_ctx *ctx)
{
int i;
ssize_t rc;
@@ -82,8 +110,7 @@ int __cifs_calc_signature(struct smb_rqst *rqst,
return -EIO;
}
- rc = crypto_shash_update(shash,
- iov[i].iov_base, iov[i].iov_len);
+ rc = cifs_sig_update(ctx, iov[i].iov_base, iov[i].iov_len);
if (rc) {
cifs_dbg(VFS, "%s: Could not update with payload\n",
__func__);
@@ -91,11 +118,11 @@ int __cifs_calc_signature(struct smb_rqst *rqst,
}
}
- rc = cifs_shash_iter(&rqst->rq_iter, iov_iter_count(&rqst->rq_iter), shash);
+ rc = cifs_sig_iter(&rqst->rq_iter, iov_iter_count(&rqst->rq_iter), ctx);
if (rc < 0)
return rc;
- rc = crypto_shash_final(shash, signature);
+ rc = cifs_sig_final(ctx, signature);
if (rc)
cifs_dbg(VFS, "%s: Could not generate hash\n", __func__);
@@ -112,29 +139,22 @@ int __cifs_calc_signature(struct smb_rqst *rqst,
static int cifs_calc_signature(struct smb_rqst *rqst,
struct TCP_Server_Info *server, char *signature)
{
- int rc;
+ struct md5_ctx ctx;
if (!rqst->rq_iov || !signature || !server)
return -EINVAL;
-
- rc = cifs_alloc_hash("md5", &server->secmech.md5);
- if (rc)
- return -1;
-
- rc = crypto_shash_init(server->secmech.md5);
- if (rc) {
- cifs_dbg(VFS, "%s: Could not init md5\n", __func__);
- return rc;
+ if (fips_enabled) {
+ cifs_dbg(VFS,
+ "MD5 signature support is disabled due to FIPS\n");
+ return -EOPNOTSUPP;
}
- rc = crypto_shash_update(server->secmech.md5,
- server->session_key.response, server->session_key.len);
- if (rc) {
- cifs_dbg(VFS, "%s: Could not update with response\n", __func__);
- return rc;
- }
+ md5_init(&ctx);
+ md5_update(&ctx, server->session_key.response, server->session_key.len);
- return __cifs_calc_signature(rqst, server, signature, server->secmech.md5);
+ return __cifs_calc_signature(
+ rqst, server, signature,
+ &(struct cifs_calc_sig_ctx){ .md5 = &ctx });
}
/* must be called with server->srv_mutex held */
@@ -405,11 +425,11 @@ static __le64 find_timestamp(struct cifs_ses *ses)
}
static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
- const struct nls_table *nls_cp, struct shash_desc *hmacmd5)
+ const struct nls_table *nls_cp)
{
- int rc = 0;
int len;
char nt_hash[CIFS_NTHASH_SIZE];
+ struct hmac_md5_ctx hmac_ctx;
__le16 *user;
wchar_t *domain;
wchar_t *server;
@@ -417,17 +437,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
/* calculate md4 hash of password */
E_md4hash(ses->password, nt_hash, nls_cp);
- rc = crypto_shash_setkey(hmacmd5->tfm, nt_hash, CIFS_NTHASH_SIZE);
- if (rc) {
- cifs_dbg(VFS, "%s: Could not set NT hash as a key, rc=%d\n", __func__, rc);
- return rc;
- }
-
- rc = crypto_shash_init(hmacmd5);
- if (rc) {
- cifs_dbg(VFS, "%s: Could not init HMAC-MD5, rc=%d\n", __func__, rc);
- return rc;
- }
+ hmac_md5_init_usingrawkey(&hmac_ctx, nt_hash, CIFS_NTHASH_SIZE);
/* convert ses->user_name to unicode */
len = ses->user_name ? strlen(ses->user_name) : 0;
@@ -442,12 +452,8 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
*(u16 *)user = 0;
}
- rc = crypto_shash_update(hmacmd5, (char *)user, 2 * len);
+ hmac_md5_update(&hmac_ctx, (const u8 *)user, 2 * len);
kfree(user);
- if (rc) {
- cifs_dbg(VFS, "%s: Could not update with user, rc=%d\n", __func__, rc);
- return rc;
- }
/* convert ses->domainName to unicode and uppercase */
if (ses->domainName) {
@@ -459,12 +465,8 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
len = cifs_strtoUTF16((__le16 *)domain, ses->domainName, len,
nls_cp);
- rc = crypto_shash_update(hmacmd5, (char *)domain, 2 * len);
+ hmac_md5_update(&hmac_ctx, (const u8 *)domain, 2 * len);
kfree(domain);
- if (rc) {
- cifs_dbg(VFS, "%s: Could not update with domain, rc=%d\n", __func__, rc);
- return rc;
- }
} else {
/* We use ses->ip_addr if no domain name available */
len = strlen(ses->ip_addr);
@@ -474,25 +476,16 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
return -ENOMEM;
len = cifs_strtoUTF16((__le16 *)server, ses->ip_addr, len, nls_cp);
- rc = crypto_shash_update(hmacmd5, (char *)server, 2 * len);
+ hmac_md5_update(&hmac_ctx, (const u8 *)server, 2 * len);
kfree(server);
- if (rc) {
- cifs_dbg(VFS, "%s: Could not update with server, rc=%d\n", __func__, rc);
- return rc;
- }
}
- rc = crypto_shash_final(hmacmd5, ntlmv2_hash);
- if (rc)
- cifs_dbg(VFS, "%s: Could not generate MD5 hash, rc=%d\n", __func__, rc);
-
- return rc;
+ hmac_md5_final(&hmac_ctx, ntlmv2_hash);
+ return 0;
}
-static int
-CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash, struct shash_desc *hmacmd5)
+static void CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash)
{
- int rc;
struct ntlmv2_resp *ntlmv2 = (struct ntlmv2_resp *)
(ses->auth_key.response + CIFS_SESS_KEY_SIZE);
unsigned int hash_len;
@@ -501,35 +494,15 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash, struct shash_
hash_len = ses->auth_key.len - (CIFS_SESS_KEY_SIZE +
offsetof(struct ntlmv2_resp, challenge.key[0]));
- rc = crypto_shash_setkey(hmacmd5->tfm, ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
- if (rc) {
- cifs_dbg(VFS, "%s: Could not set NTLMv2 hash as a key, rc=%d\n", __func__, rc);
- return rc;
- }
-
- rc = crypto_shash_init(hmacmd5);
- if (rc) {
- cifs_dbg(VFS, "%s: Could not init HMAC-MD5, rc=%d\n", __func__, rc);
- return rc;
- }
-
if (ses->server->negflavor == CIFS_NEGFLAVOR_EXTENDED)
memcpy(ntlmv2->challenge.key, ses->ntlmssp->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
else
memcpy(ntlmv2->challenge.key, ses->server->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
- rc = crypto_shash_update(hmacmd5, ntlmv2->challenge.key, hash_len);
- if (rc) {
- cifs_dbg(VFS, "%s: Could not update with response, rc=%d\n", __func__, rc);
- return rc;
- }
-
- /* Note that the MD5 digest over writes anon.challenge_key.key */
- rc = crypto_shash_final(hmacmd5, ntlmv2->ntlmv2_hash);
- if (rc)
- cifs_dbg(VFS, "%s: Could not generate MD5 hash, rc=%d\n", __func__, rc);
-
- return rc;
+ /* Note that the HMAC-MD5 value overwrites ntlmv2->challenge.key */
+ hmac_md5_usingrawkey(ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE,
+ ntlmv2->challenge.key, hash_len,
+ ntlmv2->ntlmv2_hash);
}
/*
@@ -586,7 +559,6 @@ out:
int
setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
{
- struct shash_desc *hmacmd5 = NULL;
unsigned char *tiblob = NULL; /* target info blob */
struct ntlmv2_resp *ntlmv2;
char ntlmv2_hash[16];
@@ -657,51 +629,29 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
ntlmv2->client_chal = cc;
ntlmv2->reserved2 = 0;
- rc = cifs_alloc_hash("hmac(md5)", &hmacmd5);
- if (rc) {
- cifs_dbg(VFS, "Could not allocate HMAC-MD5, rc=%d\n", rc);
+ if (fips_enabled) {
+ cifs_dbg(VFS, "NTLMv2 support is disabled due to FIPS\n");
+ rc = -EOPNOTSUPP;
goto unlock;
}
/* calculate ntlmv2_hash */
- rc = calc_ntlmv2_hash(ses, ntlmv2_hash, nls_cp, hmacmd5);
+ rc = calc_ntlmv2_hash(ses, ntlmv2_hash, nls_cp);
if (rc) {
cifs_dbg(VFS, "Could not get NTLMv2 hash, rc=%d\n", rc);
goto unlock;
}
/* calculate first part of the client response (CR1) */
- rc = CalcNTLMv2_response(ses, ntlmv2_hash, hmacmd5);
- if (rc) {
- cifs_dbg(VFS, "Could not calculate CR1, rc=%d\n", rc);
- goto unlock;
- }
+ CalcNTLMv2_response(ses, ntlmv2_hash);
/* now calculate the session key for NTLMv2 */
- rc = crypto_shash_setkey(hmacmd5->tfm, ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
- if (rc) {
- cifs_dbg(VFS, "%s: Could not set NTLMv2 hash as a key, rc=%d\n", __func__, rc);
- goto unlock;
- }
-
- rc = crypto_shash_init(hmacmd5);
- if (rc) {
- cifs_dbg(VFS, "%s: Could not init HMAC-MD5, rc=%d\n", __func__, rc);
- goto unlock;
- }
-
- rc = crypto_shash_update(hmacmd5, ntlmv2->ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
- if (rc) {
- cifs_dbg(VFS, "%s: Could not update with response, rc=%d\n", __func__, rc);
- goto unlock;
- }
-
- rc = crypto_shash_final(hmacmd5, ses->auth_key.response);
- if (rc)
- cifs_dbg(VFS, "%s: Could not generate MD5 hash, rc=%d\n", __func__, rc);
+ hmac_md5_usingrawkey(ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE,
+ ntlmv2->ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE,
+ ses->auth_key.response);
+ rc = 0;
unlock:
cifs_server_unlock(ses->server);
- cifs_free_hash(&hmacmd5);
setup_ntlmv2_rsp_ret:
kfree_sensitive(tiblob);
@@ -743,9 +693,6 @@ void
cifs_crypto_secmech_release(struct TCP_Server_Info *server)
{
cifs_free_hash(&server->secmech.aes_cmac);
- cifs_free_hash(&server->secmech.hmacsha256);
- cifs_free_hash(&server->secmech.md5);
- cifs_free_hash(&server->secmech.sha512);
if (server->secmech.enc) {
crypto_free_aead(server->secmech.enc);
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index 05b1fa76e8cc..1d4958ff4d4a 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -173,7 +173,7 @@ module_param(enable_oplocks, bool, 0644);
MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks. Default: y/Y/1");
module_param(enable_gcm_256, bool, 0644);
-MODULE_PARM_DESC(enable_gcm_256, "Enable requesting strongest (256 bit) GCM encryption. Default: y/Y/0");
+MODULE_PARM_DESC(enable_gcm_256, "Enable requesting strongest (256 bit) GCM encryption. Default: y/Y/1");
module_param(require_gcm_256, bool, 0644);
MODULE_PARM_DESC(require_gcm_256, "Require strongest (256 bit) GCM encryption. Default: n/N/0");
@@ -500,7 +500,7 @@ cifs_evict_inode(struct inode *inode)
{
netfs_wait_for_outstanding_io(inode);
truncate_inode_pages_final(&inode->i_data);
- if (inode->i_state & I_PINNING_NETFS_WB)
+ if (inode_state_read_once(inode) & I_PINNING_NETFS_WB)
cifs_fscache_unuse_inode_cookie(inode, true);
cifs_fscache_release_inode_cookie(inode);
clear_inode(inode);
@@ -2139,13 +2139,9 @@ MODULE_DESCRIPTION
"also older servers complying with the SNIA CIFS Specification)");
MODULE_VERSION(CIFS_VERSION);
MODULE_SOFTDEP("ecb");
-MODULE_SOFTDEP("hmac");
-MODULE_SOFTDEP("md5");
MODULE_SOFTDEP("nls");
MODULE_SOFTDEP("aes");
MODULE_SOFTDEP("cmac");
-MODULE_SOFTDEP("sha256");
-MODULE_SOFTDEP("sha512");
MODULE_SOFTDEP("aead2");
MODULE_SOFTDEP("ccm");
MODULE_SOFTDEP("gcm");
diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index 8f6f567d7474..203e2aaa3c25 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -24,6 +24,7 @@
#include "cifsacl.h"
#include <crypto/internal/hash.h>
#include <uapi/linux/cifs/cifs_mount.h>
+#include "../common/cifsglob.h"
#include "../common/smb2pdu.h"
#include "smb2pdu.h"
#include <linux/filelock.h>
@@ -221,9 +222,6 @@ struct session_key {
/* crypto hashing related structure/fields, not specific to a sec mech */
struct cifs_secmech {
- struct shash_desc *md5; /* md5 hash function, for CIFS/SMB1 signatures */
- struct shash_desc *hmacsha256; /* hmac-sha256 hash function, for SMB2 signatures */
- struct shash_desc *sha512; /* sha512 hash function, for SMB3.1.1 preauth hash */
struct shash_desc *aes_cmac; /* block-cipher based MAC function, for SMB3 signatures */
struct crypto_aead *enc; /* smb3 encryption AEAD TFM (AES-CCM and AES-GCM) */
@@ -536,8 +534,6 @@ struct smb_version_operations {
void (*new_lease_key)(struct cifs_fid *);
int (*generate_signingkey)(struct cifs_ses *ses,
struct TCP_Server_Info *server);
- int (*calc_signature)(struct smb_rqst *, struct TCP_Server_Info *,
- bool allocate_crypto);
int (*set_integrity)(const unsigned int, struct cifs_tcon *tcon,
struct cifsFileInfo *src_file);
int (*enum_snapshots)(const unsigned int xid, struct cifs_tcon *tcon,
@@ -702,12 +698,6 @@ get_rfc1002_length(void *buf)
return be32_to_cpu(*((__be32 *)buf)) & 0xffffff;
}
-static inline void
-inc_rfc1001_len(void *buf, int count)
-{
- be32_add_cpu((__be32 *)buf, count);
-}
-
struct TCP_Server_Info {
struct list_head tcp_ses_list;
struct list_head smb_ses_list;
@@ -740,7 +730,7 @@ struct TCP_Server_Info {
bool nosharesock;
bool tcp_nodelay;
bool terminate;
- unsigned int credits; /* send no more requests at once */
+ int credits; /* send no more requests at once */
unsigned int max_credits; /* can override large 32000 default at mnt */
unsigned int in_flight; /* number of requests on the wire to server */
unsigned int max_in_flight; /* max number of requests that were on wire */
@@ -1021,8 +1011,6 @@ compare_mid(__u16 mid, const struct smb_hdr *smb)
#define CIFS_MAX_RFC1002_WSIZE ((1<<17) - 1 - sizeof(WRITE_REQ) + 4)
#define CIFS_MAX_RFC1002_RSIZE ((1<<17) - 1 - sizeof(READ_RSP) + 4)
-#define CIFS_DEFAULT_IOSIZE (1024 * 1024)
-
/*
* Windows only supports a max of 60kb reads and 65535 byte writes. Default to
* those values when posix extensions aren't in force. In actuality here, we
@@ -2148,30 +2136,20 @@ extern mempool_t cifs_io_request_pool;
extern mempool_t cifs_io_subrequest_pool;
/* Operations for different SMB versions */
-#define SMB1_VERSION_STRING "1.0"
-#define SMB20_VERSION_STRING "2.0"
#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
extern struct smb_version_operations smb1_operations;
extern struct smb_version_values smb1_values;
extern struct smb_version_operations smb20_operations;
extern struct smb_version_values smb20_values;
#endif /* CIFS_ALLOW_INSECURE_LEGACY */
-#define SMB21_VERSION_STRING "2.1"
extern struct smb_version_operations smb21_operations;
extern struct smb_version_values smb21_values;
-#define SMBDEFAULT_VERSION_STRING "default"
extern struct smb_version_values smbdefault_values;
-#define SMB3ANY_VERSION_STRING "3"
extern struct smb_version_values smb3any_values;
-#define SMB30_VERSION_STRING "3.0"
extern struct smb_version_operations smb30_operations;
extern struct smb_version_values smb30_values;
-#define SMB302_VERSION_STRING "3.02"
-#define ALT_SMB302_VERSION_STRING "3.0.2"
/*extern struct smb_version_operations smb302_operations;*/ /* not needed yet */
extern struct smb_version_values smb302_values;
-#define SMB311_VERSION_STRING "3.1.1"
-#define ALT_SMB311_VERSION_STRING "3.11"
extern struct smb_version_operations smb311_operations;
extern struct smb_version_values smb311_values;
diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h
index e8fba98690ce..3528c365a452 100644
--- a/fs/smb/client/cifsproto.h
+++ b/fs/smb/client/cifsproto.h
@@ -9,6 +9,7 @@
#define _CIFSPROTO_H
#include <linux/nls.h>
#include <linux/ctype.h>
+#include "cifsglob.h"
#include "trace.h"
#ifdef CONFIG_CIFS_DFS_UPCALL
#include "dfs_cache.h"
@@ -615,6 +616,8 @@ extern int E_md4hash(const unsigned char *passwd, unsigned char *p16,
extern struct TCP_Server_Info *
cifs_find_tcp_session(struct smb3_fs_context *ctx);
+struct cifs_tcon *cifs_setup_ipc(struct cifs_ses *ses, bool seal);
+
void __cifs_put_smb_ses(struct cifs_ses *ses);
extern struct cifs_ses *
@@ -632,9 +635,13 @@ int cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb,
const unsigned char *path, char *pbuf,
unsigned int *pbytes_written);
-int __cifs_calc_signature(struct smb_rqst *rqst,
- struct TCP_Server_Info *server, char *signature,
- struct shash_desc *shash);
+struct cifs_calc_sig_ctx {
+ struct md5_ctx *md5;
+ struct hmac_sha256_ctx *hmac;
+ struct shash_desc *shash;
+};
+int __cifs_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server,
+ char *signature, struct cifs_calc_sig_ctx *ctx);
enum securityEnum cifs_select_sectype(struct TCP_Server_Info *,
enum securityEnum);
diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c
index 2881efcbe09a..dcc50a2bfa4b 100644
--- a/fs/smb/client/cifssmb.c
+++ b/fs/smb/client/cifssmb.c
@@ -1311,6 +1311,8 @@ cifs_readv_callback(struct mid_q_entry *mid)
.rreq_debug_id = rdata->rreq->debug_id,
.rreq_debug_index = rdata->subreq.debug_index,
};
+ unsigned int rreq_debug_id = rdata->rreq->debug_id;
+ unsigned int subreq_debug_index = rdata->subreq.debug_index;
cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%zu\n",
__func__, mid->mid, mid->mid_state, rdata->result,
@@ -1361,6 +1363,14 @@ do_retry:
if (rdata->result == -ENODATA) {
rdata->result = 0;
__set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags);
+ trace_smb3_read_err(rdata->rreq->debug_id,
+ rdata->subreq.debug_index,
+ rdata->xid,
+ rdata->req->cfile->fid.persistent_fid,
+ tcon->tid, tcon->ses->Suid,
+ rdata->subreq.start + rdata->subreq.transferred,
+ rdata->subreq.len - rdata->subreq.transferred,
+ rdata->result);
} else {
size_t trans = rdata->subreq.transferred + rdata->got_bytes;
if (trans < rdata->subreq.len &&
@@ -1372,8 +1382,18 @@ do_retry:
}
if (rdata->got_bytes)
__set_bit(NETFS_SREQ_MADE_PROGRESS, &rdata->subreq.flags);
+ trace_smb3_read_done(rdata->rreq->debug_id,
+ rdata->subreq.debug_index,
+ rdata->xid,
+ rdata->req->cfile->fid.persistent_fid,
+ tcon->tid, tcon->ses->Suid,
+ rdata->subreq.start + rdata->subreq.transferred,
+ rdata->got_bytes);
}
+ trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, rdata->credits.value,
+ server->credits, server->in_flight,
+ 0, cifs_trace_rw_credits_read_response_clear);
rdata->credits.value = 0;
rdata->subreq.error = rdata->result;
rdata->subreq.transferred += rdata->got_bytes;
@@ -1381,6 +1401,9 @@ do_retry:
netfs_read_subreq_terminated(&rdata->subreq);
release_mid(mid);
add_credits(server, &credits, 0);
+ trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, 0,
+ server->credits, server->in_flight,
+ credits.value, cifs_trace_rw_credits_read_response_add);
}
/* cifs_async_readv - send an async write, and set up mid to handle result */
@@ -1437,6 +1460,13 @@ cifs_async_readv(struct cifs_io_subrequest *rdata)
rdata->iov[1].iov_base = (char *)smb + 4;
rdata->iov[1].iov_len = get_rfc1002_length(smb);
+ trace_smb3_read_enter(rdata->rreq->debug_id,
+ rdata->subreq.debug_index,
+ rdata->xid,
+ rdata->req->cfile->fid.netfid,
+ tcon->tid, tcon->ses->Suid,
+ rdata->subreq.start, rdata->subreq.len);
+
rc = cifs_call_async(tcon->ses->server, &rqst, cifs_readv_receive,
cifs_readv_callback, NULL, rdata, 0, NULL);
diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c
index dd12f3eb61dc..2f94d93b95e9 100644
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -310,6 +310,8 @@ cifs_abort_connection(struct TCP_Server_Info *server)
server->ssocket->flags);
sock_release(server->ssocket);
server->ssocket = NULL;
+ } else if (cifs_rdma_enabled(server)) {
+ smbd_destroy(server);
}
server->sequence_number = 0;
server->session_estab = false;
@@ -338,12 +340,6 @@ cifs_abort_connection(struct TCP_Server_Info *server)
mid_execute_callback(mid);
release_mid(mid);
}
-
- if (cifs_rdma_enabled(server)) {
- cifs_server_lock(server);
- smbd_destroy(server);
- cifs_server_unlock(server);
- }
}
static bool cifs_tcp_ses_needs_reconnect(struct TCP_Server_Info *server, int num_targets)
@@ -2015,39 +2011,31 @@ static int match_session(struct cifs_ses *ses,
/**
* cifs_setup_ipc - helper to setup the IPC tcon for the session
* @ses: smb session to issue the request on
- * @ctx: the superblock configuration context to use for building the
- * new tree connection for the IPC (interprocess communication RPC)
+ * @seal: if encryption is requested
*
* A new IPC connection is made and stored in the session
* tcon_ipc. The IPC tcon has the same lifetime as the session.
*/
-static int
-cifs_setup_ipc(struct cifs_ses *ses, struct smb3_fs_context *ctx)
+struct cifs_tcon *cifs_setup_ipc(struct cifs_ses *ses, bool seal)
{
int rc = 0, xid;
struct cifs_tcon *tcon;
char unc[SERVER_NAME_LENGTH + sizeof("//x/IPC$")] = {0};
- bool seal = false;
struct TCP_Server_Info *server = ses->server;
/*
* If the mount request that resulted in the creation of the
* session requires encryption, force IPC to be encrypted too.
*/
- if (ctx->seal) {
- if (server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION)
- seal = true;
- else {
- cifs_server_dbg(VFS,
- "IPC: server doesn't support encryption\n");
- return -EOPNOTSUPP;
- }
+ if (seal && !(server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION)) {
+ cifs_server_dbg(VFS, "IPC: server doesn't support encryption\n");
+ return ERR_PTR(-EOPNOTSUPP);
}
/* no need to setup directory caching on IPC share, so pass in false */
tcon = tcon_info_alloc(false, netfs_trace_tcon_ref_new_ipc);
if (tcon == NULL)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
spin_lock(&server->srv_lock);
scnprintf(unc, sizeof(unc), "\\\\%s\\IPC$", server->hostname);
@@ -2057,13 +2045,13 @@ cifs_setup_ipc(struct cifs_ses *ses, struct smb3_fs_context *ctx)
tcon->ses = ses;
tcon->ipc = true;
tcon->seal = seal;
- rc = server->ops->tree_connect(xid, ses, unc, tcon, ctx->local_nls);
+ rc = server->ops->tree_connect(xid, ses, unc, tcon, ses->local_nls);
free_xid(xid);
if (rc) {
- cifs_server_dbg(VFS, "failed to connect to IPC (rc=%d)\n", rc);
+ cifs_server_dbg(VFS | ONCE, "failed to connect to IPC (rc=%d)\n", rc);
tconInfoFree(tcon, netfs_trace_tcon_ref_free_ipc_fail);
- goto out;
+ return ERR_PTR(rc);
}
cifs_dbg(FYI, "IPC tcon rc=%d ipc tid=0x%x\n", rc, tcon->tid);
@@ -2071,9 +2059,7 @@ cifs_setup_ipc(struct cifs_ses *ses, struct smb3_fs_context *ctx)
spin_lock(&tcon->tc_lock);
tcon->status = TID_GOOD;
spin_unlock(&tcon->tc_lock);
- ses->tcon_ipc = tcon;
-out:
- return rc;
+ return tcon;
}
static struct cifs_ses *
@@ -2347,6 +2333,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx)
{
struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr;
struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr;
+ struct cifs_tcon *ipc;
struct cifs_ses *ses;
unsigned int xid;
int retries = 0;
@@ -2525,7 +2512,12 @@ retry_new_session:
list_add(&ses->smb_ses_list, &server->smb_ses_list);
spin_unlock(&cifs_tcp_ses_lock);
- cifs_setup_ipc(ses, ctx);
+ ipc = cifs_setup_ipc(ses, ctx->seal);
+ spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&ses->ses_lock);
+ ses->tcon_ipc = !IS_ERR(ipc) ? ipc : NULL;
+ spin_unlock(&ses->ses_lock);
+ spin_unlock(&cifs_tcp_ses_lock);
free_xid(xid);
@@ -4459,6 +4451,7 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
out:
kfree(ctx->username);
+ kfree(ctx->domainname);
kfree_sensitive(ctx->password);
kfree(origin_fullpath);
kfree(ctx);
diff --git a/fs/smb/client/dfs_cache.c b/fs/smb/client/dfs_cache.c
index 4dada26d56b5..f2ad0ccd08a7 100644
--- a/fs/smb/client/dfs_cache.c
+++ b/fs/smb/client/dfs_cache.c
@@ -1120,24 +1120,63 @@ static bool target_share_equal(struct cifs_tcon *tcon, const char *s1)
return match;
}
-static bool is_ses_good(struct cifs_ses *ses)
+static bool is_ses_good(struct cifs_tcon *tcon, struct cifs_ses *ses)
{
struct TCP_Server_Info *server = ses->server;
- struct cifs_tcon *tcon = ses->tcon_ipc;
+ struct cifs_tcon *ipc = NULL;
bool ret;
+ spin_lock(&cifs_tcp_ses_lock);
spin_lock(&ses->ses_lock);
spin_lock(&ses->chan_lock);
+
ret = !cifs_chan_needs_reconnect(ses, server) &&
- ses->ses_status == SES_GOOD &&
- !tcon->need_reconnect;
+ ses->ses_status == SES_GOOD;
+
spin_unlock(&ses->chan_lock);
+
+ if (!ret)
+ goto out;
+
+ if (likely(ses->tcon_ipc)) {
+ if (ses->tcon_ipc->need_reconnect) {
+ ret = false;
+ goto out;
+ }
+ } else {
+ spin_unlock(&ses->ses_lock);
+ spin_unlock(&cifs_tcp_ses_lock);
+
+ ipc = cifs_setup_ipc(ses, tcon->seal);
+
+ spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&ses->ses_lock);
+ if (!IS_ERR(ipc)) {
+ if (!ses->tcon_ipc) {
+ ses->tcon_ipc = ipc;
+ ipc = NULL;
+ }
+ } else {
+ ret = false;
+ ipc = NULL;
+ }
+ }
+
+out:
spin_unlock(&ses->ses_lock);
+ spin_unlock(&cifs_tcp_ses_lock);
+ if (ipc && server->ops->tree_disconnect) {
+ unsigned int xid = get_xid();
+
+ (void)server->ops->tree_disconnect(xid, ipc);
+ _free_xid(xid);
+ }
+ tconInfoFree(ipc, netfs_trace_tcon_ref_free_ipc);
return ret;
}
/* Refresh dfs referral of @ses */
-static void refresh_ses_referral(struct cifs_ses *ses)
+static void refresh_ses_referral(struct cifs_tcon *tcon, struct cifs_ses *ses)
{
struct cache_entry *ce;
unsigned int xid;
@@ -1153,7 +1192,7 @@ static void refresh_ses_referral(struct cifs_ses *ses)
}
ses = CIFS_DFS_ROOT_SES(ses);
- if (!is_ses_good(ses)) {
+ if (!is_ses_good(tcon, ses)) {
cifs_dbg(FYI, "%s: skip cache refresh due to disconnected ipc\n",
__func__);
goto out;
@@ -1241,7 +1280,7 @@ static void refresh_tcon_referral(struct cifs_tcon *tcon, bool force_refresh)
up_read(&htable_rw_lock);
ses = CIFS_DFS_ROOT_SES(ses);
- if (!is_ses_good(ses)) {
+ if (!is_ses_good(tcon, ses)) {
cifs_dbg(FYI, "%s: skip cache refresh due to disconnected ipc\n",
__func__);
goto out;
@@ -1309,7 +1348,7 @@ void dfs_cache_refresh(struct work_struct *work)
tcon = container_of(work, struct cifs_tcon, dfs_cache_work.work);
list_for_each_entry(ses, &tcon->dfs_ses_list, dlist)
- refresh_ses_referral(ses);
+ refresh_ses_referral(tcon, ses);
refresh_tcon_referral(tcon, false);
queue_delayed_work(dfscache_wq, &tcon->dfs_cache_work,
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index 474dadeb1593..9dc0a968ec89 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -9,6 +9,7 @@
*
*/
#include <linux/fs.h>
+#include <linux/fs_struct.h>
#include <linux/filelock.h>
#include <linux/backing-dev.h>
#include <linux/stat.h>
diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c
index e60927b2a7c8..2a0d8b87bd8e 100644
--- a/fs/smb/client/fs_context.c
+++ b/fs/smb/client/fs_context.c
@@ -1435,12 +1435,14 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
cifs_errorf(fc, "Unknown error parsing devname\n");
goto cifs_parse_mount_err;
}
+ kfree(ctx->source);
ctx->source = smb3_fs_context_fullpath(ctx, '/');
if (IS_ERR(ctx->source)) {
ctx->source = NULL;
cifs_errorf(fc, "OOM when copying UNC string\n");
goto cifs_parse_mount_err;
}
+ kfree(fc->source);
fc->source = kstrdup(ctx->source, GFP_KERNEL);
if (fc->source == NULL) {
cifs_errorf(fc, "OOM when copying UNC string\n");
@@ -1468,7 +1470,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
break;
}
- if (strnlen(param->string, CIFS_MAX_USERNAME_LEN) >
+ if (strnlen(param->string, CIFS_MAX_USERNAME_LEN) ==
CIFS_MAX_USERNAME_LEN) {
pr_warn("username too long\n");
goto cifs_parse_mount_err;
@@ -1832,6 +1834,10 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
ctx->password = NULL;
kfree_sensitive(ctx->password2);
ctx->password2 = NULL;
+ kfree(ctx->source);
+ ctx->source = NULL;
+ kfree(fc->source);
+ fc->source = NULL;
return -EINVAL;
}
diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c
index 239dd84a336f..b75482730912 100644
--- a/fs/smb/client/inode.c
+++ b/fs/smb/client/inode.c
@@ -6,6 +6,7 @@
*
*/
#include <linux/fs.h>
+#include <linux/fs_struct.h>
#include <linux/stat.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
@@ -101,7 +102,7 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
cifs_dbg(FYI, "%s: revalidating inode %llu\n",
__func__, cifs_i->uniqueid);
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
cifs_dbg(FYI, "%s: inode %llu is new\n",
__func__, cifs_i->uniqueid);
return;
@@ -146,7 +147,7 @@ cifs_nlink_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
*/
if (fattr->cf_flags & CIFS_FATTR_UNKNOWN_NLINK) {
/* only provide fake values on a new inode */
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
if (fattr->cf_cifsattrs & ATTR_DIRECTORY)
set_nlink(inode, 2);
else
@@ -167,12 +168,12 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr,
struct cifsInodeInfo *cifs_i = CIFS_I(inode);
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
- if (!(inode->i_state & I_NEW) &&
+ if (!(inode_state_read_once(inode) & I_NEW) &&
unlikely(inode_wrong_type(inode, fattr->cf_mode))) {
CIFS_I(inode)->time = 0; /* force reval */
return -ESTALE;
}
- if (inode->i_state & I_NEW)
+ if (inode_state_read_once(inode) & I_NEW)
CIFS_I(inode)->netfs.zero_point = fattr->cf_eof;
cifs_revalidate_cache(inode, fattr);
@@ -194,7 +195,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr,
inode->i_gid = fattr->cf_gid;
/* if dynperm is set, don't clobber existing mode */
- if (inode->i_state & I_NEW ||
+ if (inode_state_read(inode) & I_NEW ||
!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM))
inode->i_mode = fattr->cf_mode;
@@ -236,7 +237,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr,
if (fattr->cf_flags & CIFS_FATTR_JUNCTION)
inode->i_flags |= S_AUTOMOUNT;
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
cifs_set_netfs_context(inode);
cifs_set_ops(inode);
}
@@ -1638,7 +1639,7 @@ retry_iget5_locked:
cifs_fattr_to_inode(inode, fattr, false);
if (sb->s_flags & SB_NOATIME)
inode->i_flags |= S_NOATIME | S_NOCMTIME;
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
inode->i_ino = hash;
cifs_fscache_get_inode_cookie(inode);
unlock_new_inode(inode);
@@ -2431,8 +2432,10 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry,
tcon = tlink_tcon(tlink);
server = tcon->ses->server;
- if (!server->ops->rename)
- return -ENOSYS;
+ if (!server->ops->rename) {
+ rc = -ENOSYS;
+ goto do_rename_exit;
+ }
/* try path-based rename first */
rc = server->ops->rename(xid, tcon, from_dentry,
@@ -2482,11 +2485,8 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry,
}
#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
do_rename_exit:
- if (rc == 0) {
+ if (rc == 0)
d_move(from_dentry, to_dentry);
- /* Force a new lookup */
- d_drop(from_dentry);
- }
cifs_put_tlink(tlink);
return rc;
}
diff --git a/fs/smb/client/link.c b/fs/smb/client/link.c
index fe80e711cd75..70f3c0c67eeb 100644
--- a/fs/smb/client/link.c
+++ b/fs/smb/client/link.c
@@ -5,6 +5,7 @@
* Author(s): Steve French (sfrench@us.ibm.com)
*
*/
+#include <crypto/md5.h>
#include <linux/fs.h>
#include <linux/stat.h>
#include <linux/slab.h>
@@ -37,23 +38,6 @@
#define CIFS_MF_SYMLINK_MD5_ARGS(md5_hash) md5_hash
static int
-symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash)
-{
- int rc;
- struct shash_desc *md5 = NULL;
-
- rc = cifs_alloc_hash("md5", &md5);
- if (rc)
- return rc;
-
- rc = crypto_shash_digest(md5, link_str, link_len, md5_hash);
- if (rc)
- cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__);
- cifs_free_hash(&md5);
- return rc;
-}
-
-static int
parse_mf_symlink(const u8 *buf, unsigned int buf_len, unsigned int *_link_len,
char **_link_str)
{
@@ -77,11 +61,7 @@ parse_mf_symlink(const u8 *buf, unsigned int buf_len, unsigned int *_link_len,
if (link_len > CIFS_MF_SYMLINK_LINK_MAXLEN)
return -EINVAL;
- rc = symlink_hash(link_len, link_str, md5_hash);
- if (rc) {
- cifs_dbg(FYI, "%s: MD5 hash failure: %d\n", __func__, rc);
- return rc;
- }
+ md5(link_str, link_len, md5_hash);
scnprintf(md5_str2, sizeof(md5_str2),
CIFS_MF_SYMLINK_MD5_FORMAT,
@@ -103,7 +83,6 @@ parse_mf_symlink(const u8 *buf, unsigned int buf_len, unsigned int *_link_len,
static int
format_mf_symlink(u8 *buf, unsigned int buf_len, const char *link_str)
{
- int rc;
unsigned int link_len;
unsigned int ofs;
u8 md5_hash[16];
@@ -116,11 +95,7 @@ format_mf_symlink(u8 *buf, unsigned int buf_len, const char *link_str)
if (link_len > CIFS_MF_SYMLINK_LINK_MAXLEN)
return -ENAMETOOLONG;
- rc = symlink_hash(link_len, link_str, md5_hash);
- if (rc) {
- cifs_dbg(FYI, "%s: MD5 hash failure: %d\n", __func__, rc);
- return rc;
- }
+ md5(link_str, link_len, md5_hash);
scnprintf(buf, buf_len,
CIFS_MF_SYMLINK_LEN_FORMAT CIFS_MF_SYMLINK_MD5_FORMAT,
diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c
index dda6dece802a..e10123d8cd7d 100644
--- a/fs/smb/client/misc.c
+++ b/fs/smb/client/misc.c
@@ -916,6 +916,14 @@ parse_dfs_referrals(struct get_dfs_referral_rsp *rsp, u32 rsp_size,
char *data_end;
struct dfs_referral_level_3 *ref;
+ if (rsp_size < sizeof(*rsp)) {
+ cifs_dbg(VFS | ONCE,
+ "%s: header is malformed (size is %u, must be %zu)\n",
+ __func__, rsp_size, sizeof(*rsp));
+ rc = -EINVAL;
+ goto parse_DFS_referrals_exit;
+ }
+
*num_of_nodes = le16_to_cpu(rsp->NumberOfReferrals);
if (*num_of_nodes < 1) {
@@ -925,6 +933,15 @@ parse_dfs_referrals(struct get_dfs_referral_rsp *rsp, u32 rsp_size,
goto parse_DFS_referrals_exit;
}
+ if (sizeof(*rsp) + *num_of_nodes * sizeof(REFERRAL3) > rsp_size) {
+ cifs_dbg(VFS | ONCE,
+ "%s: malformed buffer (size is %u, must be at least %zu)\n",
+ __func__, rsp_size,
+ sizeof(*rsp) + *num_of_nodes * sizeof(REFERRAL3));
+ rc = -EINVAL;
+ goto parse_DFS_referrals_exit;
+ }
+
ref = (struct dfs_referral_level_3 *) &(rsp->referrals);
if (ref->VersionNumber != cpu_to_le16(3)) {
cifs_dbg(VFS, "Referrals of V%d version are not supported, should be V3\n",
diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c
index 0a8c2fcc9ded..ef3b498b0a02 100644
--- a/fs/smb/client/sess.c
+++ b/fs/smb/client/sess.c
@@ -584,7 +584,7 @@ cifs_ses_add_channel(struct cifs_ses *ses,
* to sign packets before we generate the channel signing key
* (we sign with the session key)
*/
- rc = smb311_crypto_shash_allocate(chan->server);
+ rc = smb3_crypto_shash_allocate(chan->server);
if (rc) {
cifs_dbg(VFS, "%s: crypto alloc failed\n", __func__);
mutex_unlock(&ses->session_mutex);
diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c
index ca8f3dd7ff63..78650527d4bb 100644
--- a/fs/smb/client/smb1ops.c
+++ b/fs/smb/client/smb1ops.c
@@ -7,6 +7,7 @@
#include <linux/pagemap.h>
#include <linux/vfs.h>
+#include <linux/fs_struct.h>
#include <uapi/linux/magic.h>
#include "cifsglob.h"
#include "cifsproto.h"
diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c
index 09e3fc81d7cb..69cb81fa0d3a 100644
--- a/fs/smb/client/smb2inode.c
+++ b/fs/smb/client/smb2inode.c
@@ -1294,6 +1294,8 @@ static int smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon,
smb2_to_name = cifs_convert_path_to_utf16(to_name, cifs_sb);
if (smb2_to_name == NULL) {
rc = -ENOMEM;
+ if (cfile)
+ cifsFileInfo_put(cfile);
goto smb2_rename_path;
}
in_iov.iov_base = smb2_to_name;
diff --git a/fs/smb/client/smb2misc.c b/fs/smb/client/smb2misc.c
index 89d933b4a8bc..96bfe4c63ccf 100644
--- a/fs/smb/client/smb2misc.c
+++ b/fs/smb/client/smb2misc.c
@@ -7,6 +7,7 @@
* Pavel Shilovsky (pshilovsky@samba.org) 2012
*
*/
+#include <crypto/sha2.h>
#include <linux/ctype.h>
#include "cifsglob.h"
#include "cifsproto.h"
@@ -888,13 +889,13 @@ smb2_handle_cancelled_mid(struct mid_q_entry *mid, struct TCP_Server_Info *serve
* @iov: array containing the SMB request we will send to the server
* @nvec: number of array entries for the iov
*/
-int
+void
smb311_update_preauth_hash(struct cifs_ses *ses, struct TCP_Server_Info *server,
struct kvec *iov, int nvec)
{
- int i, rc;
+ int i;
struct smb2_hdr *hdr;
- struct shash_desc *sha512 = NULL;
+ struct sha512_ctx sha_ctx;
hdr = (struct smb2_hdr *)iov[0].iov_base;
/* neg prot are always taken */
@@ -907,52 +908,22 @@ smb311_update_preauth_hash(struct cifs_ses *ses, struct TCP_Server_Info *server,
* and we can test it. Preauth requires 3.1.1 for now.
*/
if (server->dialect != SMB311_PROT_ID)
- return 0;
+ return;
if (hdr->Command != SMB2_SESSION_SETUP)
- return 0;
+ return;
/* skip last sess setup response */
if ((hdr->Flags & SMB2_FLAGS_SERVER_TO_REDIR)
&& (hdr->Status == NT_STATUS_OK
|| (hdr->Status !=
cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))))
- return 0;
+ return;
ok:
- rc = smb311_crypto_shash_allocate(server);
- if (rc)
- return rc;
-
- sha512 = server->secmech.sha512;
- rc = crypto_shash_init(sha512);
- if (rc) {
- cifs_dbg(VFS, "%s: Could not init sha512 shash\n", __func__);
- return rc;
- }
-
- rc = crypto_shash_update(sha512, ses->preauth_sha_hash,
- SMB2_PREAUTH_HASH_SIZE);
- if (rc) {
- cifs_dbg(VFS, "%s: Could not update sha512 shash\n", __func__);
- return rc;
- }
-
- for (i = 0; i < nvec; i++) {
- rc = crypto_shash_update(sha512, iov[i].iov_base, iov[i].iov_len);
- if (rc) {
- cifs_dbg(VFS, "%s: Could not update sha512 shash\n",
- __func__);
- return rc;
- }
- }
-
- rc = crypto_shash_final(sha512, ses->preauth_sha_hash);
- if (rc) {
- cifs_dbg(VFS, "%s: Could not finalize sha512 shash\n",
- __func__);
- return rc;
- }
-
- return 0;
+ sha512_init(&sha_ctx);
+ sha512_update(&sha_ctx, ses->preauth_sha_hash, SMB2_PREAUTH_HASH_SIZE);
+ for (i = 0; i < nvec; i++)
+ sha512_update(&sha_ctx, iov[i].iov_base, iov[i].iov_len);
+ sha512_final(&sha_ctx, ses->preauth_sha_hash);
}
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index 7c392cf5940b..1e39f2165e42 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -2799,11 +2799,12 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_fid fid;
int rc;
__le16 *utf16_path;
- struct cached_fid *cfid = NULL;
+ struct cached_fid *cfid;
int retries = 0, cur_sleep = 1;
replay_again:
/* reinitialize for possible replay */
+ cfid = NULL;
flags = CIFS_CP_CREATE_CLOSE_OP;
oplock = SMB2_OPLOCK_LEVEL_NONE;
server = cifs_pick_channel(ses);
@@ -3212,8 +3213,7 @@ get_smb2_acl_by_path(struct cifs_sb_info *cifs_sb,
utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
if (!utf16_path) {
rc = -ENOMEM;
- free_xid(xid);
- return ERR_PTR(rc);
+ goto put_tlink;
}
oparms = (struct cifs_open_parms) {
@@ -3245,6 +3245,7 @@ get_smb2_acl_by_path(struct cifs_sb_info *cifs_sb,
SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
}
+put_tlink:
cifs_put_tlink(tlink);
free_xid(xid);
@@ -3285,8 +3286,7 @@ set_smb2_acl(struct smb_ntsd *pnntsd, __u32 acllen,
utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
if (!utf16_path) {
rc = -ENOMEM;
- free_xid(xid);
- return rc;
+ goto put_tlink;
}
oparms = (struct cifs_open_parms) {
@@ -3307,6 +3307,7 @@ set_smb2_acl(struct smb_ntsd *pnntsd, __u32 acllen,
SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
}
+put_tlink:
cifs_put_tlink(tlink);
free_xid(xid);
return rc;
@@ -5446,7 +5447,6 @@ struct smb_version_operations smb20_operations = {
.get_lease_key = smb2_get_lease_key,
.set_lease_key = smb2_set_lease_key,
.new_lease_key = smb2_new_lease_key,
- .calc_signature = smb2_calc_signature,
.is_read_op = smb2_is_read_op,
.set_oplock_level = smb2_set_oplock_level,
.create_lease_buf = smb2_create_lease_buf,
@@ -5550,7 +5550,6 @@ struct smb_version_operations smb21_operations = {
.get_lease_key = smb2_get_lease_key,
.set_lease_key = smb2_set_lease_key,
.new_lease_key = smb2_new_lease_key,
- .calc_signature = smb2_calc_signature,
.is_read_op = smb21_is_read_op,
.set_oplock_level = smb21_set_oplock_level,
.create_lease_buf = smb2_create_lease_buf,
@@ -5660,7 +5659,6 @@ struct smb_version_operations smb30_operations = {
.set_lease_key = smb2_set_lease_key,
.new_lease_key = smb2_new_lease_key,
.generate_signingkey = generate_smb30signingkey,
- .calc_signature = smb3_calc_signature,
.set_integrity = smb3_set_integrity,
.is_read_op = smb21_is_read_op,
.set_oplock_level = smb3_set_oplock_level,
@@ -5777,7 +5775,6 @@ struct smb_version_operations smb311_operations = {
.set_lease_key = smb2_set_lease_key,
.new_lease_key = smb2_new_lease_key,
.generate_signingkey = generate_smb311signingkey,
- .calc_signature = smb3_calc_signature,
.set_integrity = smb3_set_integrity,
.is_read_op = smb21_is_read_op,
.set_oplock_level = smb3_set_oplock_level,
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index b0739a2661bf..8b4a4573e9c3 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -4054,9 +4054,12 @@ replay_again:
smb_rsp = (struct smb2_change_notify_rsp *)rsp_iov.iov_base;
- smb2_validate_iov(le16_to_cpu(smb_rsp->OutputBufferOffset),
- le32_to_cpu(smb_rsp->OutputBufferLength), &rsp_iov,
+ rc = smb2_validate_iov(le16_to_cpu(smb_rsp->OutputBufferOffset),
+ le32_to_cpu(smb_rsp->OutputBufferLength),
+ &rsp_iov,
sizeof(struct file_notify_information));
+ if (rc)
+ goto cnotify_exit;
*out_data = kmemdup((char *)smb_rsp + le16_to_cpu(smb_rsp->OutputBufferOffset),
le32_to_cpu(smb_rsp->OutputBufferLength), GFP_KERNEL);
diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h
index b3f1398c9f79..5241daaae543 100644
--- a/fs/smb/client/smb2proto.h
+++ b/fs/smb/client/smb2proto.h
@@ -39,12 +39,6 @@ extern struct mid_q_entry *smb2_setup_async_request(
struct TCP_Server_Info *server, struct smb_rqst *rqst);
extern struct cifs_tcon *smb2_find_smb_tcon(struct TCP_Server_Info *server,
__u64 ses_id, __u32 tid);
-extern int smb2_calc_signature(struct smb_rqst *rqst,
- struct TCP_Server_Info *server,
- bool allocate_crypto);
-extern int smb3_calc_signature(struct smb_rqst *rqst,
- struct TCP_Server_Info *server,
- bool allocate_crypto);
extern void smb2_echo_request(struct work_struct *work);
extern __le32 smb2_get_lease_state(struct cifsInodeInfo *cinode);
extern bool smb2_is_valid_oplock_break(char *buffer,
@@ -295,10 +289,10 @@ extern int smb2_validate_and_copy_iov(unsigned int offset,
extern void smb2_copy_fs_info_to_kstatfs(
struct smb2_fs_full_size_info *pfs_inf,
struct kstatfs *kst);
-extern int smb311_crypto_shash_allocate(struct TCP_Server_Info *server);
-extern int smb311_update_preauth_hash(struct cifs_ses *ses,
- struct TCP_Server_Info *server,
- struct kvec *iov, int nvec);
+extern int smb3_crypto_shash_allocate(struct TCP_Server_Info *server);
+extern void smb311_update_preauth_hash(struct cifs_ses *ses,
+ struct TCP_Server_Info *server,
+ struct kvec *iov, int nvec);
extern int smb2_query_info_compound(const unsigned int xid,
struct cifs_tcon *tcon,
const char *path, u32 desired_access,
diff --git a/fs/smb/client/smb2transport.c b/fs/smb/client/smb2transport.c
index 33f33013b392..6a9b80385b86 100644
--- a/fs/smb/client/smb2transport.c
+++ b/fs/smb/client/smb2transport.c
@@ -19,6 +19,7 @@
#include <linux/mempool.h>
#include <linux/highmem.h>
#include <crypto/aead.h>
+#include <crypto/sha2.h>
#include "cifsglob.h"
#include "cifsproto.h"
#include "smb2proto.h"
@@ -26,53 +27,14 @@
#include "../common/smb2status.h"
#include "smb2glob.h"
-static int
+int
smb3_crypto_shash_allocate(struct TCP_Server_Info *server)
{
struct cifs_secmech *p = &server->secmech;
- int rc;
- rc = cifs_alloc_hash("hmac(sha256)", &p->hmacsha256);
- if (rc)
- goto err;
-
- rc = cifs_alloc_hash("cmac(aes)", &p->aes_cmac);
- if (rc)
- goto err;
-
- return 0;
-err:
- cifs_free_hash(&p->hmacsha256);
- return rc;
+ return cifs_alloc_hash("cmac(aes)", &p->aes_cmac);
}
-int
-smb311_crypto_shash_allocate(struct TCP_Server_Info *server)
-{
- struct cifs_secmech *p = &server->secmech;
- int rc = 0;
-
- rc = cifs_alloc_hash("hmac(sha256)", &p->hmacsha256);
- if (rc)
- return rc;
-
- rc = cifs_alloc_hash("cmac(aes)", &p->aes_cmac);
- if (rc)
- goto err;
-
- rc = cifs_alloc_hash("sha512", &p->sha512);
- if (rc)
- goto err;
-
- return 0;
-
-err:
- cifs_free_hash(&p->aes_cmac);
- cifs_free_hash(&p->hmacsha256);
- return rc;
-}
-
-
static
int smb3_get_sign_key(__u64 ses_id, struct TCP_Server_Info *server, u8 *key)
{
@@ -247,16 +209,15 @@ smb2_find_smb_tcon(struct TCP_Server_Info *server, __u64 ses_id, __u32 tid)
return tcon;
}
-int
+static int
smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server,
- bool allocate_crypto)
+ bool allocate_crypto)
{
int rc;
unsigned char smb2_signature[SMB2_HMACSHA256_SIZE];
- unsigned char *sigptr = smb2_signature;
struct kvec *iov = rqst->rq_iov;
struct smb2_hdr *shdr = (struct smb2_hdr *)iov[0].iov_base;
- struct shash_desc *shash = NULL;
+ struct hmac_sha256_ctx hmac_ctx;
struct smb_rqst drqst;
__u64 sid = le64_to_cpu(shdr->SessionId);
u8 key[SMB2_NTLMV2_SESSKEY_SIZE];
@@ -271,30 +232,7 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server,
memset(smb2_signature, 0x0, SMB2_HMACSHA256_SIZE);
memset(shdr->Signature, 0x0, SMB2_SIGNATURE_SIZE);
- if (allocate_crypto) {
- rc = cifs_alloc_hash("hmac(sha256)", &shash);
- if (rc) {
- cifs_server_dbg(VFS,
- "%s: sha256 alloc failed\n", __func__);
- goto out;
- }
- } else {
- shash = server->secmech.hmacsha256;
- }
-
- rc = crypto_shash_setkey(shash->tfm, key, sizeof(key));
- if (rc) {
- cifs_server_dbg(VFS,
- "%s: Could not update with response\n",
- __func__);
- goto out;
- }
-
- rc = crypto_shash_init(shash);
- if (rc) {
- cifs_server_dbg(VFS, "%s: Could not init sha256", __func__);
- goto out;
- }
+ hmac_sha256_init_usingrawkey(&hmac_ctx, key, sizeof(key));
/*
* For SMB2+, __cifs_calc_signature() expects to sign only the actual
@@ -305,25 +243,17 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server,
*/
drqst = *rqst;
if (drqst.rq_nvec >= 2 && iov[0].iov_len == 4) {
- rc = crypto_shash_update(shash, iov[0].iov_base,
- iov[0].iov_len);
- if (rc) {
- cifs_server_dbg(VFS,
- "%s: Could not update with payload\n",
- __func__);
- goto out;
- }
+ hmac_sha256_update(&hmac_ctx, iov[0].iov_base, iov[0].iov_len);
drqst.rq_iov++;
drqst.rq_nvec--;
}
- rc = __cifs_calc_signature(&drqst, server, sigptr, shash);
+ rc = __cifs_calc_signature(
+ &drqst, server, smb2_signature,
+ &(struct cifs_calc_sig_ctx){ .hmac = &hmac_ctx });
if (!rc)
- memcpy(shdr->Signature, sigptr, SMB2_SIGNATURE_SIZE);
+ memcpy(shdr->Signature, smb2_signature, SMB2_SIGNATURE_SIZE);
-out:
- if (allocate_crypto)
- cifs_free_hash(&shash);
return rc;
}
@@ -336,8 +266,8 @@ static int generate_key(struct cifs_ses *ses, struct kvec label,
__u8 L256[4] = {0, 0, 1, 0};
int rc = 0;
unsigned char prfhash[SMB2_HMACSHA256_SIZE];
- unsigned char *hashptr = prfhash;
struct TCP_Server_Info *server = ses->server;
+ struct hmac_sha256_ctx hmac_ctx;
memset(prfhash, 0x0, SMB2_HMACSHA256_SIZE);
memset(key, 0x0, key_size);
@@ -345,67 +275,26 @@ static int generate_key(struct cifs_ses *ses, struct kvec label,
rc = smb3_crypto_shash_allocate(server);
if (rc) {
cifs_server_dbg(VFS, "%s: crypto alloc failed\n", __func__);
- goto smb3signkey_ret;
- }
-
- rc = crypto_shash_setkey(server->secmech.hmacsha256->tfm,
- ses->auth_key.response, SMB2_NTLMV2_SESSKEY_SIZE);
- if (rc) {
- cifs_server_dbg(VFS, "%s: Could not set with session key\n", __func__);
- goto smb3signkey_ret;
- }
-
- rc = crypto_shash_init(server->secmech.hmacsha256);
- if (rc) {
- cifs_server_dbg(VFS, "%s: Could not init sign hmac\n", __func__);
- goto smb3signkey_ret;
- }
-
- rc = crypto_shash_update(server->secmech.hmacsha256, i, 4);
- if (rc) {
- cifs_server_dbg(VFS, "%s: Could not update with n\n", __func__);
- goto smb3signkey_ret;
- }
-
- rc = crypto_shash_update(server->secmech.hmacsha256, label.iov_base, label.iov_len);
- if (rc) {
- cifs_server_dbg(VFS, "%s: Could not update with label\n", __func__);
- goto smb3signkey_ret;
+ return rc;
}
- rc = crypto_shash_update(server->secmech.hmacsha256, &zero, 1);
- if (rc) {
- cifs_server_dbg(VFS, "%s: Could not update with zero\n", __func__);
- goto smb3signkey_ret;
- }
-
- rc = crypto_shash_update(server->secmech.hmacsha256, context.iov_base, context.iov_len);
- if (rc) {
- cifs_server_dbg(VFS, "%s: Could not update with context\n", __func__);
- goto smb3signkey_ret;
- }
+ hmac_sha256_init_usingrawkey(&hmac_ctx, ses->auth_key.response,
+ SMB2_NTLMV2_SESSKEY_SIZE);
+ hmac_sha256_update(&hmac_ctx, i, 4);
+ hmac_sha256_update(&hmac_ctx, label.iov_base, label.iov_len);
+ hmac_sha256_update(&hmac_ctx, &zero, 1);
+ hmac_sha256_update(&hmac_ctx, context.iov_base, context.iov_len);
if ((server->cipher_type == SMB2_ENCRYPTION_AES256_CCM) ||
(server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) {
- rc = crypto_shash_update(server->secmech.hmacsha256, L256, 4);
+ hmac_sha256_update(&hmac_ctx, L256, 4);
} else {
- rc = crypto_shash_update(server->secmech.hmacsha256, L128, 4);
- }
- if (rc) {
- cifs_server_dbg(VFS, "%s: Could not update with L\n", __func__);
- goto smb3signkey_ret;
+ hmac_sha256_update(&hmac_ctx, L128, 4);
}
+ hmac_sha256_final(&hmac_ctx, prfhash);
- rc = crypto_shash_final(server->secmech.hmacsha256, hashptr);
- if (rc) {
- cifs_server_dbg(VFS, "%s: Could not generate sha256 hash\n", __func__);
- goto smb3signkey_ret;
- }
-
- memcpy(key, hashptr, key_size);
-
-smb3signkey_ret:
- return rc;
+ memcpy(key, prfhash, key_size);
+ return 0;
}
struct derivation {
@@ -576,19 +465,21 @@ generate_smb311signingkey(struct cifs_ses *ses,
return generate_smb3signingkey(ses, server, &triplet);
}
-int
+static int
smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server,
- bool allocate_crypto)
+ bool allocate_crypto)
{
int rc;
unsigned char smb3_signature[SMB2_CMACAES_SIZE];
- unsigned char *sigptr = smb3_signature;
struct kvec *iov = rqst->rq_iov;
struct smb2_hdr *shdr = (struct smb2_hdr *)iov[0].iov_base;
struct shash_desc *shash = NULL;
struct smb_rqst drqst;
u8 key[SMB3_SIGN_KEY_SIZE];
+ if (server->vals->protocol_id <= SMB21_PROT_ID)
+ return smb2_calc_signature(rqst, server, allocate_crypto);
+
rc = smb3_get_sign_key(le64_to_cpu(shdr->SessionId), server, key);
if (unlikely(rc)) {
cifs_server_dbg(FYI, "%s: Could not get signing key\n", __func__);
@@ -643,9 +534,11 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server,
drqst.rq_nvec--;
}
- rc = __cifs_calc_signature(&drqst, server, sigptr, shash);
+ rc = __cifs_calc_signature(
+ &drqst, server, smb3_signature,
+ &(struct cifs_calc_sig_ctx){ .shash = shash });
if (!rc)
- memcpy(shdr->Signature, sigptr, SMB2_SIGNATURE_SIZE);
+ memcpy(shdr->Signature, smb3_signature, SMB2_SIGNATURE_SIZE);
out:
if (allocate_crypto)
@@ -657,7 +550,6 @@ out:
static int
smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server)
{
- int rc = 0;
struct smb2_hdr *shdr;
struct smb2_sess_setup_req *ssr;
bool is_binding;
@@ -684,9 +576,7 @@ smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server)
return 0;
}
- rc = server->ops->calc_signature(rqst, server, false);
-
- return rc;
+ return smb3_calc_signature(rqst, server, false);
}
int
@@ -722,7 +612,7 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
memset(shdr->Signature, 0, SMB2_SIGNATURE_SIZE);
- rc = server->ops->calc_signature(rqst, server, true);
+ rc = smb3_calc_signature(rqst, server, true);
if (rc)
return rc;
diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c
index 316f398c70f4..c6c428c2e08d 100644
--- a/fs/smb/client/smbdirect.c
+++ b/fs/smb/client/smbdirect.c
@@ -172,6 +172,7 @@ static void smbd_disconnect_wake_up_all(struct smbdirect_socket *sc)
* in order to notice the broken connection.
*/
wake_up_all(&sc->status_wait);
+ wake_up_all(&sc->send_io.lcredits.wait_queue);
wake_up_all(&sc->send_io.credits.wait_queue);
wake_up_all(&sc->send_io.pending.dec_wait_queue);
wake_up_all(&sc->send_io.pending.zero_wait_queue);
@@ -289,6 +290,9 @@ static void smbd_disconnect_rdma_connection(struct smbdirect_socket *sc)
break;
case SMBDIRECT_SOCKET_CREATED:
+ sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
+ break;
+
case SMBDIRECT_SOCKET_CONNECTED:
sc->status = SMBDIRECT_SOCKET_ERROR;
break;
@@ -495,6 +499,7 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc)
struct smbdirect_send_io *request =
container_of(wc->wr_cqe, struct smbdirect_send_io, cqe);
struct smbdirect_socket *sc = request->socket;
+ int lcredits = 0;
log_rdma_send(INFO, "smbdirect_send_io 0x%p completed wc->status=%s\n",
request, ib_wc_status_msg(wc->status));
@@ -504,22 +509,24 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc)
request->sge[i].addr,
request->sge[i].length,
DMA_TO_DEVICE);
+ mempool_free(request, sc->send_io.mem.pool);
+ lcredits += 1;
if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_SEND) {
if (wc->status != IB_WC_WR_FLUSH_ERR)
log_rdma_send(ERR, "wc->status=%s wc->opcode=%d\n",
ib_wc_status_msg(wc->status), wc->opcode);
- mempool_free(request, sc->send_io.mem.pool);
smbd_disconnect_rdma_connection(sc);
return;
}
+ atomic_add(lcredits, &sc->send_io.lcredits.count);
+ wake_up(&sc->send_io.lcredits.wait_queue);
+
if (atomic_dec_and_test(&sc->send_io.pending.count))
wake_up(&sc->send_io.pending.zero_wait_queue);
wake_up(&sc->send_io.pending.dec_wait_queue);
-
- mempool_free(request, sc->send_io.mem.pool);
}
static void dump_smbdirect_negotiate_resp(struct smbdirect_negotiate_resp *resp)
@@ -567,6 +574,7 @@ static bool process_negotiation_response(
log_rdma_event(ERR, "error: credits_granted==0\n");
return false;
}
+ atomic_set(&sc->send_io.lcredits.count, sp->send_credit_target);
atomic_set(&sc->send_io.credits.count, le16_to_cpu(packet->credits_granted));
if (le32_to_cpu(packet->preferred_send_size) > sp->max_recv_size) {
@@ -1114,6 +1122,24 @@ static int smbd_post_send_iter(struct smbdirect_socket *sc,
struct smbdirect_data_transfer *packet;
int new_credits = 0;
+wait_lcredit:
+ /* Wait for local send credits */
+ rc = wait_event_interruptible(sc->send_io.lcredits.wait_queue,
+ atomic_read(&sc->send_io.lcredits.count) > 0 ||
+ sc->status != SMBDIRECT_SOCKET_CONNECTED);
+ if (rc)
+ goto err_wait_lcredit;
+
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
+ log_outgoing(ERR, "disconnected not sending on wait_credit\n");
+ rc = -EAGAIN;
+ goto err_wait_lcredit;
+ }
+ if (unlikely(atomic_dec_return(&sc->send_io.lcredits.count) < 0)) {
+ atomic_inc(&sc->send_io.lcredits.count);
+ goto wait_lcredit;
+ }
+
wait_credit:
/* Wait for send credits. A SMBD packet needs one credit */
rc = wait_event_interruptible(sc->send_io.credits.wait_queue,
@@ -1132,23 +1158,6 @@ wait_credit:
goto wait_credit;
}
-wait_send_queue:
- wait_event(sc->send_io.pending.dec_wait_queue,
- atomic_read(&sc->send_io.pending.count) < sp->send_credit_target ||
- sc->status != SMBDIRECT_SOCKET_CONNECTED);
-
- if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
- log_outgoing(ERR, "disconnected not sending on wait_send_queue\n");
- rc = -EAGAIN;
- goto err_wait_send_queue;
- }
-
- if (unlikely(atomic_inc_return(&sc->send_io.pending.count) >
- sp->send_credit_target)) {
- atomic_dec(&sc->send_io.pending.count);
- goto wait_send_queue;
- }
-
request = mempool_alloc(sc->send_io.mem.pool, GFP_KERNEL);
if (!request) {
rc = -ENOMEM;
@@ -1229,10 +1238,21 @@ wait_send_queue:
le32_to_cpu(packet->data_length),
le32_to_cpu(packet->remaining_data_length));
+ /*
+ * Now that we got a local and a remote credit
+ * we add us as pending
+ */
+ atomic_inc(&sc->send_io.pending.count);
+
rc = smbd_post_send(sc, request);
if (!rc)
return 0;
+ if (atomic_dec_and_test(&sc->send_io.pending.count))
+ wake_up(&sc->send_io.pending.zero_wait_queue);
+
+ wake_up(&sc->send_io.pending.dec_wait_queue);
+
err_dma:
for (i = 0; i < request->num_sge; i++)
if (request->sge[i].addr)
@@ -1246,14 +1266,14 @@ err_dma:
atomic_sub(new_credits, &sc->recv_io.credits.count);
err_alloc:
- if (atomic_dec_and_test(&sc->send_io.pending.count))
- wake_up(&sc->send_io.pending.zero_wait_queue);
-
-err_wait_send_queue:
- /* roll back send credits and pending */
atomic_inc(&sc->send_io.credits.count);
+ wake_up(&sc->send_io.credits.wait_queue);
err_wait_credit:
+ atomic_inc(&sc->send_io.lcredits.count);
+ wake_up(&sc->send_io.lcredits.wait_queue);
+
+err_wait_lcredit:
return rc;
}
@@ -1575,12 +1595,12 @@ void smbd_destroy(struct TCP_Server_Info *server)
disable_work_sync(&sc->disconnect_work);
log_rdma_event(INFO, "destroying rdma session\n");
- if (sc->status < SMBDIRECT_SOCKET_DISCONNECTING) {
+ if (sc->status < SMBDIRECT_SOCKET_DISCONNECTING)
smbd_disconnect_rdma_work(&sc->disconnect_work);
+ if (sc->status < SMBDIRECT_SOCKET_DISCONNECTED) {
log_rdma_event(INFO, "wait for transport being disconnected\n");
- wait_event_interruptible(
- sc->status_wait,
- sc->status == SMBDIRECT_SOCKET_DISCONNECTED);
+ wait_event(sc->status_wait, sc->status == SMBDIRECT_SOCKET_DISCONNECTED);
+ log_rdma_event(INFO, "waited for transport being disconnected\n");
}
/*
@@ -1624,19 +1644,7 @@ void smbd_destroy(struct TCP_Server_Info *server)
log_rdma_event(INFO, "free receive buffers\n");
destroy_receive_buffers(sc);
- /*
- * For performance reasons, memory registration and deregistration
- * are not locked by srv_mutex. It is possible some processes are
- * blocked on transport srv_mutex while holding memory registration.
- * Release the transport srv_mutex to allow them to hit the failure
- * path when sending data, and then release memory registrations.
- */
log_rdma_event(INFO, "freeing mr list\n");
- while (atomic_read(&sc->mr_io.used.count)) {
- cifs_server_unlock(server);
- msleep(1000);
- cifs_server_lock(server);
- }
destroy_mr_list(sc);
ib_free_cq(sc->ib.send_cq);
@@ -1779,6 +1787,7 @@ static struct smbd_connection *_smbd_get_connection(
struct smbdirect_socket *sc;
struct smbdirect_socket_parameters *sp;
struct rdma_conn_param conn_param;
+ struct ib_qp_cap qp_cap;
struct ib_qp_init_attr qp_attr;
struct sockaddr_in *addr_in = (struct sockaddr_in *) dstaddr;
struct ib_port_immutable port_immutable;
@@ -1850,6 +1859,25 @@ static struct smbd_connection *_smbd_get_connection(
goto config_failed;
}
+ sp->responder_resources =
+ min_t(u8, sp->responder_resources,
+ sc->ib.dev->attrs.max_qp_rd_atom);
+ log_rdma_mr(INFO, "responder_resources=%d\n",
+ sp->responder_resources);
+
+ /*
+ * We use allocate sp->responder_resources * 2 MRs
+ * and each MR needs WRs for REG and INV, so
+ * we use '* 4'.
+ *
+ * +1 for ib_drain_qp()
+ */
+ memset(&qp_cap, 0, sizeof(qp_cap));
+ qp_cap.max_send_wr = sp->send_credit_target + sp->responder_resources * 4 + 1;
+ qp_cap.max_recv_wr = sp->recv_credit_max + 1;
+ qp_cap.max_send_sge = SMBDIRECT_SEND_IO_MAX_SGE;
+ qp_cap.max_recv_sge = SMBDIRECT_RECV_IO_MAX_SGE;
+
sc->ib.pd = ib_alloc_pd(sc->ib.dev, 0);
if (IS_ERR(sc->ib.pd)) {
rc = PTR_ERR(sc->ib.pd);
@@ -1860,7 +1888,7 @@ static struct smbd_connection *_smbd_get_connection(
sc->ib.send_cq =
ib_alloc_cq_any(sc->ib.dev, sc,
- sp->send_credit_target, IB_POLL_SOFTIRQ);
+ qp_cap.max_send_wr, IB_POLL_SOFTIRQ);
if (IS_ERR(sc->ib.send_cq)) {
sc->ib.send_cq = NULL;
goto alloc_cq_failed;
@@ -1868,7 +1896,7 @@ static struct smbd_connection *_smbd_get_connection(
sc->ib.recv_cq =
ib_alloc_cq_any(sc->ib.dev, sc,
- sp->recv_credit_max, IB_POLL_SOFTIRQ);
+ qp_cap.max_recv_wr, IB_POLL_SOFTIRQ);
if (IS_ERR(sc->ib.recv_cq)) {
sc->ib.recv_cq = NULL;
goto alloc_cq_failed;
@@ -1877,11 +1905,7 @@ static struct smbd_connection *_smbd_get_connection(
memset(&qp_attr, 0, sizeof(qp_attr));
qp_attr.event_handler = smbd_qp_async_error_upcall;
qp_attr.qp_context = sc;
- qp_attr.cap.max_send_wr = sp->send_credit_target;
- qp_attr.cap.max_recv_wr = sp->recv_credit_max;
- qp_attr.cap.max_send_sge = SMBDIRECT_SEND_IO_MAX_SGE;
- qp_attr.cap.max_recv_sge = SMBDIRECT_RECV_IO_MAX_SGE;
- qp_attr.cap.max_inline_data = 0;
+ qp_attr.cap = qp_cap;
qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
qp_attr.qp_type = IB_QPT_RC;
qp_attr.send_cq = sc->ib.send_cq;
@@ -1895,12 +1919,6 @@ static struct smbd_connection *_smbd_get_connection(
}
sc->ib.qp = sc->rdma.cm_id->qp;
- sp->responder_resources =
- min_t(u8, sp->responder_resources,
- sc->ib.dev->attrs.max_qp_rd_atom);
- log_rdma_mr(INFO, "responder_resources=%d\n",
- sp->responder_resources);
-
memset(&conn_param, 0, sizeof(conn_param));
conn_param.initiator_depth = sp->initiator_depth;
conn_param.responder_resources = sp->responder_resources;
@@ -2352,18 +2370,84 @@ static void smbd_mr_recovery_work(struct work_struct *work)
}
}
+static void smbd_mr_disable_locked(struct smbdirect_mr_io *mr)
+{
+ struct smbdirect_socket *sc = mr->socket;
+
+ lockdep_assert_held(&mr->mutex);
+
+ if (mr->state == SMBDIRECT_MR_DISABLED)
+ return;
+
+ if (mr->mr)
+ ib_dereg_mr(mr->mr);
+ if (mr->sgt.nents)
+ ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir);
+ kfree(mr->sgt.sgl);
+
+ mr->mr = NULL;
+ mr->sgt.sgl = NULL;
+ mr->sgt.nents = 0;
+
+ mr->state = SMBDIRECT_MR_DISABLED;
+}
+
+static void smbd_mr_free_locked(struct kref *kref)
+{
+ struct smbdirect_mr_io *mr =
+ container_of(kref, struct smbdirect_mr_io, kref);
+
+ lockdep_assert_held(&mr->mutex);
+
+ /*
+ * smbd_mr_disable_locked() should already be called!
+ */
+ if (WARN_ON_ONCE(mr->state != SMBDIRECT_MR_DISABLED))
+ smbd_mr_disable_locked(mr);
+
+ mutex_unlock(&mr->mutex);
+ mutex_destroy(&mr->mutex);
+ kfree(mr);
+}
+
static void destroy_mr_list(struct smbdirect_socket *sc)
{
struct smbdirect_mr_io *mr, *tmp;
+ LIST_HEAD(all_list);
+ unsigned long flags;
disable_work_sync(&sc->mr_io.recovery_work);
- list_for_each_entry_safe(mr, tmp, &sc->mr_io.all.list, list) {
- if (mr->state == SMBDIRECT_MR_INVALIDATED)
- ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl,
- mr->sgt.nents, mr->dir);
- ib_dereg_mr(mr->mr);
- kfree(mr->sgt.sgl);
- kfree(mr);
+
+ spin_lock_irqsave(&sc->mr_io.all.lock, flags);
+ list_splice_tail_init(&sc->mr_io.all.list, &all_list);
+ spin_unlock_irqrestore(&sc->mr_io.all.lock, flags);
+
+ list_for_each_entry_safe(mr, tmp, &all_list, list) {
+ mutex_lock(&mr->mutex);
+
+ smbd_mr_disable_locked(mr);
+ list_del(&mr->list);
+ mr->socket = NULL;
+
+ /*
+ * No kref_put_mutex() as it's already locked.
+ *
+ * If smbd_mr_free_locked() is called
+ * and the mutex is unlocked and mr is gone,
+ * in that case kref_put() returned 1.
+ *
+ * If kref_put() returned 0 we know that
+ * smbd_mr_free_locked() didn't
+ * run. Not by us nor by anyone else, as we
+ * still hold the mutex, so we need to unlock.
+ *
+ * If the mr is still registered it will
+ * be dangling (detached from the connection
+ * waiting for smbd_deregister_mr() to be
+ * called in order to free the memory.
+ */
+ if (!kref_put(&mr->kref, smbd_mr_free_locked))
+ mutex_unlock(&mr->mutex);
}
}
@@ -2377,10 +2461,9 @@ static void destroy_mr_list(struct smbdirect_socket *sc)
static int allocate_mr_list(struct smbdirect_socket *sc)
{
struct smbdirect_socket_parameters *sp = &sc->parameters;
- int i;
- struct smbdirect_mr_io *smbdirect_mr, *tmp;
-
- INIT_WORK(&sc->mr_io.recovery_work, smbd_mr_recovery_work);
+ struct smbdirect_mr_io *mr;
+ int ret;
+ u32 i;
if (sp->responder_resources == 0) {
log_rdma_mr(ERR, "responder_resources negotiated as 0\n");
@@ -2389,42 +2472,52 @@ static int allocate_mr_list(struct smbdirect_socket *sc)
/* Allocate more MRs (2x) than hardware responder_resources */
for (i = 0; i < sp->responder_resources * 2; i++) {
- smbdirect_mr = kzalloc(sizeof(*smbdirect_mr), GFP_KERNEL);
- if (!smbdirect_mr)
- goto cleanup_entries;
- smbdirect_mr->mr = ib_alloc_mr(sc->ib.pd, sc->mr_io.type,
- sp->max_frmr_depth);
- if (IS_ERR(smbdirect_mr->mr)) {
+ mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+ if (!mr) {
+ ret = -ENOMEM;
+ goto kzalloc_mr_failed;
+ }
+
+ kref_init(&mr->kref);
+ mutex_init(&mr->mutex);
+
+ mr->mr = ib_alloc_mr(sc->ib.pd,
+ sc->mr_io.type,
+ sp->max_frmr_depth);
+ if (IS_ERR(mr->mr)) {
+ ret = PTR_ERR(mr->mr);
log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x max_frmr_depth=%x\n",
sc->mr_io.type, sp->max_frmr_depth);
- goto out;
+ goto ib_alloc_mr_failed;
}
- smbdirect_mr->sgt.sgl = kcalloc(sp->max_frmr_depth,
- sizeof(struct scatterlist),
- GFP_KERNEL);
- if (!smbdirect_mr->sgt.sgl) {
+
+ mr->sgt.sgl = kcalloc(sp->max_frmr_depth,
+ sizeof(struct scatterlist),
+ GFP_KERNEL);
+ if (!mr->sgt.sgl) {
+ ret = -ENOMEM;
log_rdma_mr(ERR, "failed to allocate sgl\n");
- ib_dereg_mr(smbdirect_mr->mr);
- goto out;
+ goto kcalloc_sgl_failed;
}
- smbdirect_mr->state = SMBDIRECT_MR_READY;
- smbdirect_mr->socket = sc;
+ mr->state = SMBDIRECT_MR_READY;
+ mr->socket = sc;
- list_add_tail(&smbdirect_mr->list, &sc->mr_io.all.list);
+ list_add_tail(&mr->list, &sc->mr_io.all.list);
atomic_inc(&sc->mr_io.ready.count);
}
+
+ INIT_WORK(&sc->mr_io.recovery_work, smbd_mr_recovery_work);
+
return 0;
-out:
- kfree(smbdirect_mr);
-cleanup_entries:
- list_for_each_entry_safe(smbdirect_mr, tmp, &sc->mr_io.all.list, list) {
- list_del(&smbdirect_mr->list);
- ib_dereg_mr(smbdirect_mr->mr);
- kfree(smbdirect_mr->sgt.sgl);
- kfree(smbdirect_mr);
- }
- return -ENOMEM;
+kcalloc_sgl_failed:
+ ib_dereg_mr(mr->mr);
+ib_alloc_mr_failed:
+ mutex_destroy(&mr->mutex);
+ kfree(mr);
+kzalloc_mr_failed:
+ destroy_mr_list(sc);
+ return ret;
}
/*
@@ -2458,6 +2551,7 @@ again:
list_for_each_entry(ret, &sc->mr_io.all.list, list) {
if (ret->state == SMBDIRECT_MR_READY) {
ret->state = SMBDIRECT_MR_REGISTERED;
+ kref_get(&ret->kref);
spin_unlock_irqrestore(&sc->mr_io.all.lock, flags);
atomic_dec(&sc->mr_io.ready.count);
atomic_inc(&sc->mr_io.used.count);
@@ -2504,9 +2598,8 @@ struct smbdirect_mr_io *smbd_register_mr(struct smbd_connection *info,
{
struct smbdirect_socket *sc = &info->socket;
struct smbdirect_socket_parameters *sp = &sc->parameters;
- struct smbdirect_mr_io *smbdirect_mr;
+ struct smbdirect_mr_io *mr;
int rc, num_pages;
- enum dma_data_direction dir;
struct ib_reg_wr *reg_wr;
num_pages = iov_iter_npages(iter, sp->max_frmr_depth + 1);
@@ -2517,49 +2610,47 @@ struct smbdirect_mr_io *smbd_register_mr(struct smbd_connection *info,
return NULL;
}
- smbdirect_mr = get_mr(sc);
- if (!smbdirect_mr) {
+ mr = get_mr(sc);
+ if (!mr) {
log_rdma_mr(ERR, "get_mr returning NULL\n");
return NULL;
}
- dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
- smbdirect_mr->dir = dir;
- smbdirect_mr->need_invalidate = need_invalidate;
- smbdirect_mr->sgt.nents = 0;
- smbdirect_mr->sgt.orig_nents = 0;
+ mutex_lock(&mr->mutex);
+
+ mr->dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+ mr->need_invalidate = need_invalidate;
+ mr->sgt.nents = 0;
+ mr->sgt.orig_nents = 0;
log_rdma_mr(INFO, "num_pages=0x%x count=0x%zx depth=%u\n",
num_pages, iov_iter_count(iter), sp->max_frmr_depth);
- smbd_iter_to_mr(iter, &smbdirect_mr->sgt, sp->max_frmr_depth);
+ smbd_iter_to_mr(iter, &mr->sgt, sp->max_frmr_depth);
- rc = ib_dma_map_sg(sc->ib.dev, smbdirect_mr->sgt.sgl,
- smbdirect_mr->sgt.nents, dir);
+ rc = ib_dma_map_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir);
if (!rc) {
log_rdma_mr(ERR, "ib_dma_map_sg num_pages=%x dir=%x rc=%x\n",
- num_pages, dir, rc);
+ num_pages, mr->dir, rc);
goto dma_map_error;
}
- rc = ib_map_mr_sg(smbdirect_mr->mr, smbdirect_mr->sgt.sgl,
- smbdirect_mr->sgt.nents, NULL, PAGE_SIZE);
- if (rc != smbdirect_mr->sgt.nents) {
+ rc = ib_map_mr_sg(mr->mr, mr->sgt.sgl, mr->sgt.nents, NULL, PAGE_SIZE);
+ if (rc != mr->sgt.nents) {
log_rdma_mr(ERR,
- "ib_map_mr_sg failed rc = %d nents = %x\n",
- rc, smbdirect_mr->sgt.nents);
+ "ib_map_mr_sg failed rc = %d nents = %x\n",
+ rc, mr->sgt.nents);
goto map_mr_error;
}
- ib_update_fast_reg_key(smbdirect_mr->mr,
- ib_inc_rkey(smbdirect_mr->mr->rkey));
- reg_wr = &smbdirect_mr->wr;
+ ib_update_fast_reg_key(mr->mr, ib_inc_rkey(mr->mr->rkey));
+ reg_wr = &mr->wr;
reg_wr->wr.opcode = IB_WR_REG_MR;
- smbdirect_mr->cqe.done = register_mr_done;
- reg_wr->wr.wr_cqe = &smbdirect_mr->cqe;
+ mr->cqe.done = register_mr_done;
+ reg_wr->wr.wr_cqe = &mr->cqe;
reg_wr->wr.num_sge = 0;
reg_wr->wr.send_flags = IB_SEND_SIGNALED;
- reg_wr->mr = smbdirect_mr->mr;
- reg_wr->key = smbdirect_mr->mr->rkey;
+ reg_wr->mr = mr->mr;
+ reg_wr->key = mr->mr->rkey;
reg_wr->access = writing ?
IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
IB_ACCESS_REMOTE_READ;
@@ -2570,24 +2661,51 @@ struct smbdirect_mr_io *smbd_register_mr(struct smbd_connection *info,
* on the next ib_post_send when we actually send I/O to remote peer
*/
rc = ib_post_send(sc->ib.qp, &reg_wr->wr, NULL);
- if (!rc)
- return smbdirect_mr;
+ if (!rc) {
+ /*
+ * get_mr() gave us a reference
+ * via kref_get(&mr->kref), we keep that and let
+ * the caller use smbd_deregister_mr()
+ * to remove it again.
+ */
+ mutex_unlock(&mr->mutex);
+ return mr;
+ }
log_rdma_mr(ERR, "ib_post_send failed rc=%x reg_wr->key=%x\n",
rc, reg_wr->key);
/* If all failed, attempt to recover this MR by setting it SMBDIRECT_MR_ERROR*/
map_mr_error:
- ib_dma_unmap_sg(sc->ib.dev, smbdirect_mr->sgt.sgl,
- smbdirect_mr->sgt.nents, smbdirect_mr->dir);
+ ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir);
dma_map_error:
- smbdirect_mr->state = SMBDIRECT_MR_ERROR;
+ mr->sgt.nents = 0;
+ mr->state = SMBDIRECT_MR_ERROR;
if (atomic_dec_and_test(&sc->mr_io.used.count))
wake_up(&sc->mr_io.cleanup.wait_queue);
smbd_disconnect_rdma_connection(sc);
+ /*
+ * get_mr() gave us a reference
+ * via kref_get(&mr->kref), we need to remove it again
+ * on error.
+ *
+ * No kref_put_mutex() as it's already locked.
+ *
+ * If smbd_mr_free_locked() is called
+ * and the mutex is unlocked and mr is gone,
+ * in that case kref_put() returned 1.
+ *
+ * If kref_put() returned 0 we know that
+ * smbd_mr_free_locked() didn't
+ * run. Not by us nor by anyone else, as we
+ * still hold the mutex, so we need to unlock.
+ */
+ if (!kref_put(&mr->kref, smbd_mr_free_locked))
+ mutex_unlock(&mr->mutex);
+
return NULL;
}
@@ -2612,44 +2730,55 @@ static void local_inv_done(struct ib_cq *cq, struct ib_wc *wc)
* and we have to locally invalidate the buffer to prevent data is being
* modified by remote peer after upper layer consumes it
*/
-int smbd_deregister_mr(struct smbdirect_mr_io *smbdirect_mr)
+void smbd_deregister_mr(struct smbdirect_mr_io *mr)
{
- struct ib_send_wr *wr;
- struct smbdirect_socket *sc = smbdirect_mr->socket;
- int rc = 0;
+ struct smbdirect_socket *sc = mr->socket;
+
+ mutex_lock(&mr->mutex);
+ if (mr->state == SMBDIRECT_MR_DISABLED)
+ goto put_kref;
+
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
+ smbd_mr_disable_locked(mr);
+ goto put_kref;
+ }
+
+ if (mr->need_invalidate) {
+ struct ib_send_wr *wr = &mr->inv_wr;
+ int rc;
- if (smbdirect_mr->need_invalidate) {
/* Need to finish local invalidation before returning */
- wr = &smbdirect_mr->inv_wr;
wr->opcode = IB_WR_LOCAL_INV;
- smbdirect_mr->cqe.done = local_inv_done;
- wr->wr_cqe = &smbdirect_mr->cqe;
+ mr->cqe.done = local_inv_done;
+ wr->wr_cqe = &mr->cqe;
wr->num_sge = 0;
- wr->ex.invalidate_rkey = smbdirect_mr->mr->rkey;
+ wr->ex.invalidate_rkey = mr->mr->rkey;
wr->send_flags = IB_SEND_SIGNALED;
- init_completion(&smbdirect_mr->invalidate_done);
+ init_completion(&mr->invalidate_done);
rc = ib_post_send(sc->ib.qp, wr, NULL);
if (rc) {
log_rdma_mr(ERR, "ib_post_send failed rc=%x\n", rc);
+ smbd_mr_disable_locked(mr);
smbd_disconnect_rdma_connection(sc);
goto done;
}
- wait_for_completion(&smbdirect_mr->invalidate_done);
- smbdirect_mr->need_invalidate = false;
+ wait_for_completion(&mr->invalidate_done);
+ mr->need_invalidate = false;
} else
/*
* For remote invalidation, just set it to SMBDIRECT_MR_INVALIDATED
* and defer to mr_recovery_work to recover the MR for next use
*/
- smbdirect_mr->state = SMBDIRECT_MR_INVALIDATED;
+ mr->state = SMBDIRECT_MR_INVALIDATED;
- if (smbdirect_mr->state == SMBDIRECT_MR_INVALIDATED) {
- ib_dma_unmap_sg(
- sc->ib.dev, smbdirect_mr->sgt.sgl,
- smbdirect_mr->sgt.nents,
- smbdirect_mr->dir);
- smbdirect_mr->state = SMBDIRECT_MR_READY;
+ if (mr->sgt.nents) {
+ ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir);
+ mr->sgt.nents = 0;
+ }
+
+ if (mr->state == SMBDIRECT_MR_INVALIDATED) {
+ mr->state = SMBDIRECT_MR_READY;
if (atomic_inc_return(&sc->mr_io.ready.count) == 1)
wake_up(&sc->mr_io.ready.wait_queue);
} else
@@ -2663,7 +2792,23 @@ done:
if (atomic_dec_and_test(&sc->mr_io.used.count))
wake_up(&sc->mr_io.cleanup.wait_queue);
- return rc;
+put_kref:
+ /*
+ * No kref_put_mutex() as it's already locked.
+ *
+ * If smbd_mr_free_locked() is called
+ * and the mutex is unlocked and mr is gone,
+ * in that case kref_put() returned 1.
+ *
+ * If kref_put() returned 0 we know that
+ * smbd_mr_free_locked() didn't
+ * run. Not by us nor by anyone else, as we
+ * still hold the mutex, so we need to unlock
+ * and keep the mr in SMBDIRECT_MR_READY or
+ * SMBDIRECT_MR_ERROR state.
+ */
+ if (!kref_put(&mr->kref, smbd_mr_free_locked))
+ mutex_unlock(&mr->mutex);
}
static bool smb_set_sge(struct smb_extract_to_rdma *rdma,
diff --git a/fs/smb/client/smbdirect.h b/fs/smb/client/smbdirect.h
index d67ac5ddaff4..577d37dbeb8a 100644
--- a/fs/smb/client/smbdirect.h
+++ b/fs/smb/client/smbdirect.h
@@ -60,7 +60,7 @@ int smbd_send(struct TCP_Server_Info *server,
struct smbdirect_mr_io *smbd_register_mr(
struct smbd_connection *info, struct iov_iter *iter,
bool writing, bool need_invalidate);
-int smbd_deregister_mr(struct smbdirect_mr_io *mr);
+void smbd_deregister_mr(struct smbdirect_mr_io *mr);
#else
#define cifs_rdma_enabled(server) 0
diff --git a/fs/smb/client/trace.c b/fs/smb/client/trace.c
index 465483787193..16b0e719731f 100644
--- a/fs/smb/client/trace.c
+++ b/fs/smb/client/trace.c
@@ -4,5 +4,6 @@
*
* Author(s): Steve French <stfrench@microsoft.com>
*/
+#include "cifsglob.h"
#define CREATE_TRACE_POINTS
#include "trace.h"
diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c
index 051cd9dbba13..915cedde5d66 100644
--- a/fs/smb/client/transport.c
+++ b/fs/smb/client/transport.c
@@ -830,7 +830,7 @@ struct TCP_Server_Info *cifs_pick_channel(struct cifs_ses *ses)
if (!server || server->terminate)
continue;
- if (CIFS_CHAN_NEEDS_RECONNECT(ses, i))
+ if (CIFS_CHAN_NEEDS_RECONNECT(ses, cur))
continue;
/*
diff --git a/fs/smb/client/xattr.c b/fs/smb/client/xattr.c
index b88fa04f5792..029910d56c22 100644
--- a/fs/smb/client/xattr.c
+++ b/fs/smb/client/xattr.c
@@ -178,7 +178,6 @@ static int cifs_xattr_set(const struct xattr_handler *handler,
memcpy(pacl, value, size);
if (pTcon->ses->server->ops->set_acl) {
int aclflags = 0;
- rc = 0;
switch (handler->flags) {
case XATTR_CIFS_NTSD_FULL:
diff --git a/fs/smb/common/cifsglob.h b/fs/smb/common/cifsglob.h
new file mode 100644
index 000000000000..00fd215e3eb5
--- /dev/null
+++ b/fs/smb/common/cifsglob.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: LGPL-2.1 */
+/*
+ *
+ * Copyright (C) International Business Machines Corp., 2002,2008
+ * Author(s): Steve French (sfrench@us.ibm.com)
+ * Jeremy Allison (jra@samba.org)
+ *
+ */
+#ifndef _COMMON_CIFS_GLOB_H
+#define _COMMON_CIFS_GLOB_H
+
+static inline void inc_rfc1001_len(void *buf, int count)
+{
+ be32_add_cpu((__be32 *)buf, count);
+}
+
+#define SMB1_VERSION_STRING "1.0"
+#define SMB20_VERSION_STRING "2.0"
+#define SMB21_VERSION_STRING "2.1"
+#define SMBDEFAULT_VERSION_STRING "default"
+#define SMB3ANY_VERSION_STRING "3"
+#define SMB30_VERSION_STRING "3.0"
+#define SMB302_VERSION_STRING "3.02"
+#define ALT_SMB302_VERSION_STRING "3.0.2"
+#define SMB311_VERSION_STRING "3.1.1"
+#define ALT_SMB311_VERSION_STRING "3.11"
+
+#define CIFS_DEFAULT_IOSIZE (1024 * 1024)
+
+#endif /* _COMMON_CIFS_GLOB_H */
diff --git a/fs/smb/common/smbdirect/smbdirect_socket.h b/fs/smb/common/smbdirect/smbdirect_socket.h
index db22a1d0546b..ee5a90d691c8 100644
--- a/fs/smb/common/smbdirect/smbdirect_socket.h
+++ b/fs/smb/common/smbdirect/smbdirect_socket.h
@@ -142,7 +142,15 @@ struct smbdirect_socket {
} mem;
/*
- * The credit state for the send side
+ * The local credit state for ib_post_send()
+ */
+ struct {
+ atomic_t count;
+ wait_queue_head_t wait_queue;
+ } lcredits;
+
+ /*
+ * The remote credit state for the send side
*/
struct {
atomic_t count;
@@ -337,6 +345,9 @@ static __always_inline void smbdirect_socket_init(struct smbdirect_socket *sc)
INIT_DELAYED_WORK(&sc->idle.timer_work, __smbdirect_socket_disabled_work);
disable_delayed_work_sync(&sc->idle.timer_work);
+ atomic_set(&sc->send_io.lcredits.count, 0);
+ init_waitqueue_head(&sc->send_io.lcredits.wait_queue);
+
atomic_set(&sc->send_io.credits.count, 0);
init_waitqueue_head(&sc->send_io.credits.wait_queue);
@@ -437,13 +448,22 @@ enum smbdirect_mr_state {
SMBDIRECT_MR_READY,
SMBDIRECT_MR_REGISTERED,
SMBDIRECT_MR_INVALIDATED,
- SMBDIRECT_MR_ERROR
+ SMBDIRECT_MR_ERROR,
+ SMBDIRECT_MR_DISABLED
};
struct smbdirect_mr_io {
struct smbdirect_socket *socket;
struct ib_cqe cqe;
+ /*
+ * We can have up to two references:
+ * 1. by the connection
+ * 2. by the registration
+ */
+ struct kref kref;
+ struct mutex mutex;
+
struct list_head list;
enum smbdirect_mr_state state;
diff --git a/fs/smb/server/mgmt/user_session.c b/fs/smb/server/mgmt/user_session.c
index 6fa025374f2f..1c181ef99929 100644
--- a/fs/smb/server/mgmt/user_session.c
+++ b/fs/smb/server/mgmt/user_session.c
@@ -147,14 +147,11 @@ void ksmbd_session_rpc_close(struct ksmbd_session *sess, int id)
int ksmbd_session_rpc_method(struct ksmbd_session *sess, int id)
{
struct ksmbd_session_rpc *entry;
- int method;
- down_read(&sess->rpc_lock);
+ lockdep_assert_held(&sess->rpc_lock);
entry = xa_load(&sess->rpc_handle_list, id);
- method = entry ? entry->method : 0;
- up_read(&sess->rpc_lock);
- return method;
+ return entry ? entry->method : 0;
}
void ksmbd_session_destroy(struct ksmbd_session *sess)
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index ab1d45fcebde..f901ae18e68a 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -1806,6 +1806,7 @@ int smb2_sess_setup(struct ksmbd_work *work)
if (ksmbd_conn_need_reconnect(conn)) {
rc = -EFAULT;
+ ksmbd_user_session_put(sess);
sess = NULL;
goto out_err;
}
@@ -4625,8 +4626,15 @@ static int smb2_get_info_file_pipe(struct ksmbd_session *sess,
* pipe without opening it, checking error condition here
*/
id = req->VolatileFileId;
- if (!ksmbd_session_rpc_method(sess, id))
+
+ lockdep_assert_not_held(&sess->rpc_lock);
+
+ down_read(&sess->rpc_lock);
+ if (!ksmbd_session_rpc_method(sess, id)) {
+ up_read(&sess->rpc_lock);
return -ENOENT;
+ }
+ up_read(&sess->rpc_lock);
ksmbd_debug(SMB, "FileInfoClass %u, FileId 0x%llx\n",
req->FileInfoClass, req->VolatileFileId);
@@ -6824,6 +6832,7 @@ int smb2_read(struct ksmbd_work *work)
nbytes = ksmbd_vfs_read(work, fp, length, &offset, aux_payload_buf);
if (nbytes < 0) {
+ kvfree(aux_payload_buf);
err = nbytes;
goto out;
}
diff --git a/fs/smb/server/smb_common.h b/fs/smb/server/smb_common.h
index d742ba754348..863716207a0d 100644
--- a/fs/smb/server/smb_common.h
+++ b/fs/smb/server/smb_common.h
@@ -10,6 +10,7 @@
#include "glob.h"
#include "nterr.h"
+#include "../common/cifsglob.h"
#include "../common/smb2pdu.h"
#include "smb2pdu.h"
@@ -26,16 +27,8 @@
#define SMB311_PROT 6
#define BAD_PROT 0xFFFF
-#define SMB1_VERSION_STRING "1.0"
-#define SMB20_VERSION_STRING "2.0"
-#define SMB21_VERSION_STRING "2.1"
-#define SMB30_VERSION_STRING "3.0"
-#define SMB302_VERSION_STRING "3.02"
-#define SMB311_VERSION_STRING "3.1.1"
-
#define SMB_ECHO_INTERVAL (60 * HZ)
-#define CIFS_DEFAULT_IOSIZE (64 * 1024)
#define MAX_CIFS_SMALL_BUFFER_SIZE 448 /* big enough for most */
#define MAX_STREAM_PROT_LEN 0x00FFFFFF
@@ -464,9 +457,4 @@ static inline unsigned int get_rfc1002_len(void *buf)
{
return be32_to_cpu(*((__be32 *)buf)) & 0xffffff;
}
-
-static inline void inc_rfc1001_len(void *buf, int count)
-{
- be32_add_cpu((__be32 *)buf, count);
-}
#endif /* __SMB_COMMON_H__ */
diff --git a/fs/smb/server/transport_ipc.c b/fs/smb/server/transport_ipc.c
index 2aa1b29bea08..2c08cccfa680 100644
--- a/fs/smb/server/transport_ipc.c
+++ b/fs/smb/server/transport_ipc.c
@@ -263,10 +263,16 @@ static void ipc_msg_handle_free(int handle)
static int handle_response(int type, void *payload, size_t sz)
{
- unsigned int handle = *(unsigned int *)payload;
+ unsigned int handle;
struct ipc_msg_table_entry *entry;
int ret = 0;
+ /* Prevent 4-byte read beyond declared payload size */
+ if (sz < sizeof(unsigned int))
+ return -EINVAL;
+
+ handle = *(unsigned int *)payload;
+
ipc_update_last_active();
down_read(&ipc_msg_table_lock);
hash_for_each_possible(ipc_msg_table, entry, ipc_table_hlist, handle) {
@@ -825,6 +831,9 @@ struct ksmbd_rpc_command *ksmbd_rpc_write(struct ksmbd_session *sess, int handle
if (!msg)
return NULL;
+ lockdep_assert_not_held(&sess->rpc_lock);
+
+ down_read(&sess->rpc_lock);
msg->type = KSMBD_EVENT_RPC_REQUEST;
req = (struct ksmbd_rpc_command *)msg->payload;
req->handle = handle;
@@ -833,6 +842,7 @@ struct ksmbd_rpc_command *ksmbd_rpc_write(struct ksmbd_session *sess, int handle
req->flags |= KSMBD_RPC_WRITE_METHOD;
req->payload_sz = payload_sz;
memcpy(req->payload, payload, payload_sz);
+ up_read(&sess->rpc_lock);
resp = ipc_msg_send_request(msg, req->handle);
ipc_msg_free(msg);
@@ -849,6 +859,9 @@ struct ksmbd_rpc_command *ksmbd_rpc_read(struct ksmbd_session *sess, int handle)
if (!msg)
return NULL;
+ lockdep_assert_not_held(&sess->rpc_lock);
+
+ down_read(&sess->rpc_lock);
msg->type = KSMBD_EVENT_RPC_REQUEST;
req = (struct ksmbd_rpc_command *)msg->payload;
req->handle = handle;
@@ -856,6 +869,7 @@ struct ksmbd_rpc_command *ksmbd_rpc_read(struct ksmbd_session *sess, int handle)
req->flags |= rpc_context_flags(sess);
req->flags |= KSMBD_RPC_READ_METHOD;
req->payload_sz = 0;
+ up_read(&sess->rpc_lock);
resp = ipc_msg_send_request(msg, req->handle);
ipc_msg_free(msg);
@@ -876,6 +890,9 @@ struct ksmbd_rpc_command *ksmbd_rpc_ioctl(struct ksmbd_session *sess, int handle
if (!msg)
return NULL;
+ lockdep_assert_not_held(&sess->rpc_lock);
+
+ down_read(&sess->rpc_lock);
msg->type = KSMBD_EVENT_RPC_REQUEST;
req = (struct ksmbd_rpc_command *)msg->payload;
req->handle = handle;
@@ -884,6 +901,7 @@ struct ksmbd_rpc_command *ksmbd_rpc_ioctl(struct ksmbd_session *sess, int handle
req->flags |= KSMBD_RPC_IOCTL_METHOD;
req->payload_sz = payload_sz;
memcpy(req->payload, payload, payload_sz);
+ up_read(&sess->rpc_lock);
resp = ipc_msg_send_request(msg, req->handle);
ipc_msg_free(msg);
diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c
index b3077766d6ec..e2be9a496154 100644
--- a/fs/smb/server/transport_rdma.c
+++ b/fs/smb/server/transport_rdma.c
@@ -219,6 +219,7 @@ static void smb_direct_disconnect_wake_up_all(struct smbdirect_socket *sc)
* in order to notice the broken connection.
*/
wake_up_all(&sc->status_wait);
+ wake_up_all(&sc->send_io.lcredits.wait_queue);
wake_up_all(&sc->send_io.credits.wait_queue);
wake_up_all(&sc->send_io.pending.zero_wait_queue);
wake_up_all(&sc->recv_io.reassembly.wait_queue);
@@ -333,6 +334,9 @@ smb_direct_disconnect_rdma_connection(struct smbdirect_socket *sc)
break;
case SMBDIRECT_SOCKET_CREATED:
+ sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
+ break;
+
case SMBDIRECT_SOCKET_CONNECTED:
sc->status = SMBDIRECT_SOCKET_ERROR;
break;
@@ -417,9 +421,6 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id)
sc->ib.dev = sc->rdma.cm_id->device;
- INIT_WORK(&sc->recv_io.posted.refill_work,
- smb_direct_post_recv_credits);
- INIT_WORK(&sc->idle.immediate_work, smb_direct_send_immediate_work);
INIT_DELAYED_WORK(&sc->idle.timer_work, smb_direct_idle_connection_timer);
conn = ksmbd_conn_alloc();
@@ -450,11 +451,10 @@ static void free_transport(struct smb_direct_transport *t)
struct smbdirect_recv_io *recvmsg;
disable_work_sync(&sc->disconnect_work);
- if (sc->status < SMBDIRECT_SOCKET_DISCONNECTING) {
+ if (sc->status < SMBDIRECT_SOCKET_DISCONNECTING)
smb_direct_disconnect_rdma_work(&sc->disconnect_work);
- wait_event_interruptible(sc->status_wait,
- sc->status == SMBDIRECT_SOCKET_DISCONNECTED);
- }
+ if (sc->status < SMBDIRECT_SOCKET_DISCONNECTED)
+ wait_event(sc->status_wait, sc->status == SMBDIRECT_SOCKET_DISCONNECTED);
/*
* Wake up all waiters in all wait queues
@@ -469,9 +469,11 @@ static void free_transport(struct smb_direct_transport *t)
disable_delayed_work_sync(&sc->idle.timer_work);
disable_work_sync(&sc->idle.immediate_work);
+ if (sc->rdma.cm_id)
+ rdma_lock_handler(sc->rdma.cm_id);
+
if (sc->ib.qp) {
ib_drain_qp(sc->ib.qp);
- ib_mr_pool_destroy(sc->ib.qp, &sc->ib.qp->rdma_mrs);
sc->ib.qp = NULL;
rdma_destroy_qp(sc->rdma.cm_id);
}
@@ -498,8 +500,10 @@ static void free_transport(struct smb_direct_transport *t)
ib_free_cq(sc->ib.recv_cq);
if (sc->ib.pd)
ib_dealloc_pd(sc->ib.pd);
- if (sc->rdma.cm_id)
+ if (sc->rdma.cm_id) {
+ rdma_unlock_handler(sc->rdma.cm_id);
rdma_destroy_id(sc->rdma.cm_id);
+ }
smb_direct_destroy_pools(sc);
ksmbd_conn_free(KSMBD_TRANS(t)->conn);
@@ -524,6 +528,12 @@ static void smb_direct_free_sendmsg(struct smbdirect_socket *sc,
{
int i;
+ /*
+ * The list needs to be empty!
+ * The caller should take care of it.
+ */
+ WARN_ON_ONCE(!list_empty(&msg->sibling_list));
+
if (msg->num_sge > 0) {
ib_dma_unmap_single(sc->ib.dev,
msg->sge[0].addr, msg->sge[0].length,
@@ -909,9 +919,9 @@ static void smb_direct_post_recv_credits(struct work_struct *work)
static void send_done(struct ib_cq *cq, struct ib_wc *wc)
{
- struct smbdirect_send_io *sendmsg, *sibling;
+ struct smbdirect_send_io *sendmsg, *sibling, *next;
struct smbdirect_socket *sc;
- struct list_head *pos, *prev, *end;
+ int lcredits = 0;
sendmsg = container_of(wc->wr_cqe, struct smbdirect_send_io, cqe);
sc = sendmsg->socket;
@@ -920,27 +930,31 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc)
ib_wc_status_msg(wc->status), wc->status,
wc->opcode);
+ /*
+ * Free possible siblings and then the main send_io
+ */
+ list_for_each_entry_safe(sibling, next, &sendmsg->sibling_list, sibling_list) {
+ list_del_init(&sibling->sibling_list);
+ smb_direct_free_sendmsg(sc, sibling);
+ lcredits += 1;
+ }
+ /* Note this frees wc->wr_cqe, but not wc */
+ smb_direct_free_sendmsg(sc, sendmsg);
+ lcredits += 1;
+
if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_SEND) {
pr_err("Send error. status='%s (%d)', opcode=%d\n",
ib_wc_status_msg(wc->status), wc->status,
wc->opcode);
smb_direct_disconnect_rdma_connection(sc);
+ return;
}
+ atomic_add(lcredits, &sc->send_io.lcredits.count);
+ wake_up(&sc->send_io.lcredits.wait_queue);
+
if (atomic_dec_and_test(&sc->send_io.pending.count))
wake_up(&sc->send_io.pending.zero_wait_queue);
-
- /* iterate and free the list of messages in reverse. the list's head
- * is invalid.
- */
- for (pos = &sendmsg->sibling_list, prev = pos->prev, end = sendmsg->sibling_list.next;
- prev != end; pos = prev, prev = prev->prev) {
- sibling = container_of(pos, struct smbdirect_send_io, sibling_list);
- smb_direct_free_sendmsg(sc, sibling);
- }
-
- sibling = container_of(pos, struct smbdirect_send_io, sibling_list);
- smb_direct_free_sendmsg(sc, sibling);
}
static int manage_credits_prior_sending(struct smbdirect_socket *sc)
@@ -988,8 +1002,6 @@ static int smb_direct_post_send(struct smbdirect_socket *sc,
ret = ib_post_send(sc->ib.qp, wr, NULL);
if (ret) {
pr_err("failed to post send: %d\n", ret);
- if (atomic_dec_and_test(&sc->send_io.pending.count))
- wake_up(&sc->send_io.pending.zero_wait_queue);
smb_direct_disconnect_rdma_connection(sc);
}
return ret;
@@ -1032,19 +1044,29 @@ static int smb_direct_flush_send_list(struct smbdirect_socket *sc,
last->wr.send_flags = IB_SEND_SIGNALED;
last->wr.wr_cqe = &last->cqe;
+ /*
+ * Remove last from send_ctx->msg_list
+ * and splice the rest of send_ctx->msg_list
+ * to last->sibling_list.
+ *
+ * send_ctx->msg_list is a valid empty list
+ * at the end.
+ */
+ list_del_init(&last->sibling_list);
+ list_splice_tail_init(&send_ctx->msg_list, &last->sibling_list);
+ send_ctx->wr_cnt = 0;
+
ret = smb_direct_post_send(sc, &first->wr);
- if (!ret) {
- smb_direct_send_ctx_init(send_ctx,
- send_ctx->need_invalidate_rkey,
- send_ctx->remote_key);
- } else {
- atomic_add(send_ctx->wr_cnt, &sc->send_io.credits.count);
- wake_up(&sc->send_io.credits.wait_queue);
- list_for_each_entry_safe(first, last, &send_ctx->msg_list,
- sibling_list) {
- smb_direct_free_sendmsg(sc, first);
+ if (ret) {
+ struct smbdirect_send_io *sibling, *next;
+
+ list_for_each_entry_safe(sibling, next, &last->sibling_list, sibling_list) {
+ list_del_init(&sibling->sibling_list);
+ smb_direct_free_sendmsg(sc, sibling);
}
+ smb_direct_free_sendmsg(sc, last);
}
+
return ret;
}
@@ -1070,6 +1092,23 @@ static int wait_for_credits(struct smbdirect_socket *sc,
} while (true);
}
+static int wait_for_send_lcredit(struct smbdirect_socket *sc,
+ struct smbdirect_send_batch *send_ctx)
+{
+ if (send_ctx && (atomic_read(&sc->send_io.lcredits.count) <= 1)) {
+ int ret;
+
+ ret = smb_direct_flush_send_list(sc, send_ctx, false);
+ if (ret)
+ return ret;
+ }
+
+ return wait_for_credits(sc,
+ &sc->send_io.lcredits.wait_queue,
+ &sc->send_io.lcredits.count,
+ 1);
+}
+
static int wait_for_send_credits(struct smbdirect_socket *sc,
struct smbdirect_send_batch *send_ctx)
{
@@ -1257,9 +1296,13 @@ static int smb_direct_post_send_data(struct smbdirect_socket *sc,
int data_length;
struct scatterlist sg[SMBDIRECT_SEND_IO_MAX_SGE - 1];
+ ret = wait_for_send_lcredit(sc, send_ctx);
+ if (ret)
+ goto lcredit_failed;
+
ret = wait_for_send_credits(sc, send_ctx);
if (ret)
- return ret;
+ goto credit_failed;
data_length = 0;
for (i = 0; i < niov; i++)
@@ -1267,10 +1310,8 @@ static int smb_direct_post_send_data(struct smbdirect_socket *sc,
ret = smb_direct_create_header(sc, data_length, remaining_data_length,
&msg);
- if (ret) {
- atomic_inc(&sc->send_io.credits.count);
- return ret;
- }
+ if (ret)
+ goto header_failed;
for (i = 0; i < niov; i++) {
struct ib_sge *sge;
@@ -1308,7 +1349,11 @@ static int smb_direct_post_send_data(struct smbdirect_socket *sc,
return 0;
err:
smb_direct_free_sendmsg(sc, msg);
+header_failed:
atomic_inc(&sc->send_io.credits.count);
+credit_failed:
+ atomic_inc(&sc->send_io.lcredits.count);
+lcredit_failed:
return ret;
}
@@ -1574,18 +1619,14 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t,
get_buf_page_count(desc_buf, desc_buf_len),
msg->sg_list, SG_CHUNK_SIZE);
if (ret) {
- kfree(msg);
ret = -ENOMEM;
- goto out;
+ goto free_msg;
}
ret = get_sg_list(desc_buf, desc_buf_len,
msg->sgt.sgl, msg->sgt.orig_nents);
- if (ret < 0) {
- sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE);
- kfree(msg);
- goto out;
- }
+ if (ret < 0)
+ goto free_table;
ret = rdma_rw_ctx_init(&msg->rdma_ctx, sc->ib.qp, sc->ib.qp->port,
msg->sgt.sgl,
@@ -1596,9 +1637,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t,
is_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
if (ret < 0) {
pr_err("failed to init rdma_rw_ctx: %d\n", ret);
- sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE);
- kfree(msg);
- goto out;
+ goto free_table;
}
list_add_tail(&msg->list, &msg_list);
@@ -1630,6 +1669,12 @@ out:
atomic_add(credits_needed, &sc->rw_io.credits.count);
wake_up(&sc->rw_io.credits.wait_queue);
return ret;
+
+free_table:
+ sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE);
+free_msg:
+ kfree(msg);
+ goto out;
}
static int smb_direct_rdma_write(struct ksmbd_transport *t,
@@ -1687,10 +1732,10 @@ static int smb_direct_cm_handler(struct rdma_cm_id *cm_id,
}
case RDMA_CM_EVENT_DEVICE_REMOVAL:
case RDMA_CM_EVENT_DISCONNECTED: {
- ib_drain_qp(sc->ib.qp);
-
sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
smb_direct_disconnect_rdma_work(&sc->disconnect_work);
+ if (sc->ib.qp)
+ ib_drain_qp(sc->ib.qp);
break;
}
case RDMA_CM_EVENT_CONNECT_ERROR: {
@@ -1841,6 +1886,7 @@ static int smb_direct_accept_client(struct smbdirect_socket *sc)
static int smb_direct_prepare_negotiation(struct smbdirect_socket *sc)
{
struct smbdirect_recv_io *recvmsg;
+ bool recv_posted = false;
int ret;
WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_CREATED);
@@ -1857,6 +1903,7 @@ static int smb_direct_prepare_negotiation(struct smbdirect_socket *sc)
pr_err("Can't post recv: %d\n", ret);
goto out_err;
}
+ recv_posted = true;
ret = smb_direct_accept_client(sc);
if (ret) {
@@ -1864,27 +1911,24 @@ static int smb_direct_prepare_negotiation(struct smbdirect_socket *sc)
goto out_err;
}
- smb_direct_post_recv_credits(&sc->recv_io.posted.refill_work);
return 0;
out_err:
- put_recvmsg(sc, recvmsg);
+ /*
+ * If the recv was never posted, return it to the free list.
+ * If it was posted, leave it alone so disconnect teardown can
+ * drain the QP and complete it (flush) and the completion path
+ * will unmap it exactly once.
+ */
+ if (!recv_posted)
+ put_recvmsg(sc, recvmsg);
return ret;
}
-static unsigned int smb_direct_get_max_fr_pages(struct smbdirect_socket *sc)
-{
- return min_t(unsigned int,
- sc->ib.dev->attrs.max_fast_reg_page_list_len,
- 256);
-}
-
-static int smb_direct_init_params(struct smbdirect_socket *sc,
- struct ib_qp_cap *cap)
+static int smb_direct_init_params(struct smbdirect_socket *sc)
{
struct smbdirect_socket_parameters *sp = &sc->parameters;
- struct ib_device *device = sc->ib.dev;
- int max_send_sges, max_rw_wrs, max_send_wrs;
- unsigned int max_sge_per_wr, wrs_per_credit;
+ int max_send_sges;
+ unsigned int maxpages;
/* need 3 more sge. because a SMB_DIRECT header, SMB2 header,
* SMB2 response could be mapped.
@@ -1895,67 +1939,20 @@ static int smb_direct_init_params(struct smbdirect_socket *sc,
return -EINVAL;
}
- /* Calculate the number of work requests for RDMA R/W.
- * The maximum number of pages which can be registered
- * with one Memory region can be transferred with one
- * R/W credit. And at least 4 work requests for each credit
- * are needed for MR registration, RDMA R/W, local & remote
- * MR invalidation.
- */
- sc->rw_io.credits.num_pages = smb_direct_get_max_fr_pages(sc);
- sc->rw_io.credits.max = DIV_ROUND_UP(sp->max_read_write_size,
- (sc->rw_io.credits.num_pages - 1) *
- PAGE_SIZE);
-
- max_sge_per_wr = min_t(unsigned int, device->attrs.max_send_sge,
- device->attrs.max_sge_rd);
- max_sge_per_wr = max_t(unsigned int, max_sge_per_wr,
- max_send_sges);
- wrs_per_credit = max_t(unsigned int, 4,
- DIV_ROUND_UP(sc->rw_io.credits.num_pages,
- max_sge_per_wr) + 1);
- max_rw_wrs = sc->rw_io.credits.max * wrs_per_credit;
-
- max_send_wrs = sp->send_credit_target + max_rw_wrs;
- if (max_send_wrs > device->attrs.max_cqe ||
- max_send_wrs > device->attrs.max_qp_wr) {
- pr_err("consider lowering send_credit_target = %d\n",
- sp->send_credit_target);
- pr_err("Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n",
- device->attrs.max_cqe, device->attrs.max_qp_wr);
- return -EINVAL;
- }
+ atomic_set(&sc->send_io.lcredits.count, sp->send_credit_target);
- if (sp->recv_credit_max > device->attrs.max_cqe ||
- sp->recv_credit_max > device->attrs.max_qp_wr) {
- pr_err("consider lowering receive_credit_max = %d\n",
- sp->recv_credit_max);
- pr_err("Possible CQE overrun, device reporting max_cpe %d max_qp_wr %d\n",
- device->attrs.max_cqe, device->attrs.max_qp_wr);
- return -EINVAL;
- }
-
- if (device->attrs.max_send_sge < SMBDIRECT_SEND_IO_MAX_SGE) {
- pr_err("warning: device max_send_sge = %d too small\n",
- device->attrs.max_send_sge);
- return -EINVAL;
- }
- if (device->attrs.max_recv_sge < SMBDIRECT_RECV_IO_MAX_SGE) {
- pr_err("warning: device max_recv_sge = %d too small\n",
- device->attrs.max_recv_sge);
- return -EINVAL;
- }
+ maxpages = DIV_ROUND_UP(sp->max_read_write_size, PAGE_SIZE);
+ sc->rw_io.credits.max = rdma_rw_mr_factor(sc->ib.dev,
+ sc->rdma.cm_id->port_num,
+ maxpages);
+ sc->rw_io.credits.num_pages = DIV_ROUND_UP(maxpages, sc->rw_io.credits.max);
+ /* add one extra in order to handle unaligned pages */
+ sc->rw_io.credits.max += 1;
sc->recv_io.credits.target = 1;
atomic_set(&sc->rw_io.credits.count, sc->rw_io.credits.max);
- cap->max_send_wr = max_send_wrs;
- cap->max_recv_wr = sp->recv_credit_max;
- cap->max_send_sge = SMBDIRECT_SEND_IO_MAX_SGE;
- cap->max_recv_sge = SMBDIRECT_RECV_IO_MAX_SGE;
- cap->max_inline_data = 0;
- cap->max_rdma_ctxs = sc->rw_io.credits.max;
return 0;
}
@@ -2029,13 +2026,129 @@ err:
return -ENOMEM;
}
-static int smb_direct_create_qpair(struct smbdirect_socket *sc,
- struct ib_qp_cap *cap)
+static u32 smb_direct_rdma_rw_send_wrs(struct ib_device *dev, const struct ib_qp_init_attr *attr)
+{
+ /*
+ * This could be split out of rdma_rw_init_qp()
+ * and be a helper function next to rdma_rw_mr_factor()
+ *
+ * We can't check unlikely(rdma_rw_force_mr) here,
+ * but that is most likely 0 anyway.
+ */
+ u32 factor;
+
+ WARN_ON_ONCE(attr->port_num == 0);
+
+ /*
+ * Each context needs at least one RDMA READ or WRITE WR.
+ *
+ * For some hardware we might need more, eventually we should ask the
+ * HCA driver for a multiplier here.
+ */
+ factor = 1;
+
+ /*
+ * If the device needs MRs to perform RDMA READ or WRITE operations,
+ * we'll need two additional MRs for the registrations and the
+ * invalidation.
+ */
+ if (rdma_protocol_iwarp(dev, attr->port_num) || dev->attrs.max_sgl_rd)
+ factor += 2; /* inv + reg */
+
+ return factor * attr->cap.max_rdma_ctxs;
+}
+
+static int smb_direct_create_qpair(struct smbdirect_socket *sc)
{
struct smbdirect_socket_parameters *sp = &sc->parameters;
int ret;
+ struct ib_qp_cap qp_cap;
struct ib_qp_init_attr qp_attr;
- int pages_per_rw;
+ u32 max_send_wr;
+ u32 rdma_send_wr;
+
+ /*
+ * Note that {rdma,ib}_create_qp() will call
+ * rdma_rw_init_qp() if cap->max_rdma_ctxs is not 0.
+ * It will adjust cap->max_send_wr to the required
+ * number of additional WRs for the RDMA RW operations.
+ * It will cap cap->max_send_wr to the device limit.
+ *
+ * +1 for ib_drain_qp
+ */
+ qp_cap.max_send_wr = sp->send_credit_target + 1;
+ qp_cap.max_recv_wr = sp->recv_credit_max + 1;
+ qp_cap.max_send_sge = SMBDIRECT_SEND_IO_MAX_SGE;
+ qp_cap.max_recv_sge = SMBDIRECT_RECV_IO_MAX_SGE;
+ qp_cap.max_inline_data = 0;
+ qp_cap.max_rdma_ctxs = sc->rw_io.credits.max;
+
+ /*
+ * Find out the number of max_send_wr
+ * after rdma_rw_init_qp() adjusted it.
+ *
+ * We only do it on a temporary variable,
+ * as rdma_create_qp() will trigger
+ * rdma_rw_init_qp() again.
+ */
+ memset(&qp_attr, 0, sizeof(qp_attr));
+ qp_attr.cap = qp_cap;
+ qp_attr.port_num = sc->rdma.cm_id->port_num;
+ rdma_send_wr = smb_direct_rdma_rw_send_wrs(sc->ib.dev, &qp_attr);
+ max_send_wr = qp_cap.max_send_wr + rdma_send_wr;
+
+ if (qp_cap.max_send_wr > sc->ib.dev->attrs.max_cqe ||
+ qp_cap.max_send_wr > sc->ib.dev->attrs.max_qp_wr) {
+ pr_err("Possible CQE overrun: max_send_wr %d\n",
+ qp_cap.max_send_wr);
+ pr_err("device %.*s reporting max_cqe %d max_qp_wr %d\n",
+ IB_DEVICE_NAME_MAX,
+ sc->ib.dev->name,
+ sc->ib.dev->attrs.max_cqe,
+ sc->ib.dev->attrs.max_qp_wr);
+ pr_err("consider lowering send_credit_target = %d\n",
+ sp->send_credit_target);
+ return -EINVAL;
+ }
+
+ if (qp_cap.max_rdma_ctxs &&
+ (max_send_wr >= sc->ib.dev->attrs.max_cqe ||
+ max_send_wr >= sc->ib.dev->attrs.max_qp_wr)) {
+ pr_err("Possible CQE overrun: rdma_send_wr %d + max_send_wr %d = %d\n",
+ rdma_send_wr, qp_cap.max_send_wr, max_send_wr);
+ pr_err("device %.*s reporting max_cqe %d max_qp_wr %d\n",
+ IB_DEVICE_NAME_MAX,
+ sc->ib.dev->name,
+ sc->ib.dev->attrs.max_cqe,
+ sc->ib.dev->attrs.max_qp_wr);
+ pr_err("consider lowering send_credit_target = %d, max_rdma_ctxs = %d\n",
+ sp->send_credit_target, qp_cap.max_rdma_ctxs);
+ return -EINVAL;
+ }
+
+ if (qp_cap.max_recv_wr > sc->ib.dev->attrs.max_cqe ||
+ qp_cap.max_recv_wr > sc->ib.dev->attrs.max_qp_wr) {
+ pr_err("Possible CQE overrun: max_recv_wr %d\n",
+ qp_cap.max_recv_wr);
+ pr_err("device %.*s reporting max_cqe %d max_qp_wr %d\n",
+ IB_DEVICE_NAME_MAX,
+ sc->ib.dev->name,
+ sc->ib.dev->attrs.max_cqe,
+ sc->ib.dev->attrs.max_qp_wr);
+ pr_err("consider lowering receive_credit_max = %d\n",
+ sp->recv_credit_max);
+ return -EINVAL;
+ }
+
+ if (qp_cap.max_send_sge > sc->ib.dev->attrs.max_send_sge ||
+ qp_cap.max_recv_sge > sc->ib.dev->attrs.max_recv_sge) {
+ pr_err("device %.*s max_send_sge/max_recv_sge = %d/%d too small\n",
+ IB_DEVICE_NAME_MAX,
+ sc->ib.dev->name,
+ sc->ib.dev->attrs.max_send_sge,
+ sc->ib.dev->attrs.max_recv_sge);
+ return -EINVAL;
+ }
sc->ib.pd = ib_alloc_pd(sc->ib.dev, 0);
if (IS_ERR(sc->ib.pd)) {
@@ -2046,8 +2159,7 @@ static int smb_direct_create_qpair(struct smbdirect_socket *sc,
}
sc->ib.send_cq = ib_alloc_cq_any(sc->ib.dev, sc,
- sp->send_credit_target +
- cap->max_rdma_ctxs,
+ max_send_wr,
IB_POLL_WORKQUEUE);
if (IS_ERR(sc->ib.send_cq)) {
pr_err("Can't create RDMA send CQ\n");
@@ -2057,7 +2169,7 @@ static int smb_direct_create_qpair(struct smbdirect_socket *sc,
}
sc->ib.recv_cq = ib_alloc_cq_any(sc->ib.dev, sc,
- sp->recv_credit_max,
+ qp_cap.max_recv_wr,
IB_POLL_WORKQUEUE);
if (IS_ERR(sc->ib.recv_cq)) {
pr_err("Can't create RDMA recv CQ\n");
@@ -2066,10 +2178,18 @@ static int smb_direct_create_qpair(struct smbdirect_socket *sc,
goto err;
}
+ /*
+ * We reset completely here!
+ * As the above use was just temporary
+ * to calc max_send_wr and rdma_send_wr.
+ *
+ * rdma_create_qp() will trigger rdma_rw_init_qp()
+ * again if max_rdma_ctxs is not 0.
+ */
memset(&qp_attr, 0, sizeof(qp_attr));
qp_attr.event_handler = smb_direct_qpair_handler;
qp_attr.qp_context = sc;
- qp_attr.cap = *cap;
+ qp_attr.cap = qp_cap;
qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
qp_attr.qp_type = IB_QPT_RC;
qp_attr.send_cq = sc->ib.send_cq;
@@ -2085,18 +2205,6 @@ static int smb_direct_create_qpair(struct smbdirect_socket *sc,
sc->ib.qp = sc->rdma.cm_id->qp;
sc->rdma.cm_id->event_handler = smb_direct_cm_handler;
- pages_per_rw = DIV_ROUND_UP(sp->max_read_write_size, PAGE_SIZE) + 1;
- if (pages_per_rw > sc->ib.dev->attrs.max_sgl_rd) {
- ret = ib_mr_pool_init(sc->ib.qp, &sc->ib.qp->rdma_mrs,
- sc->rw_io.credits.max, IB_MR_TYPE_MEM_REG,
- sc->rw_io.credits.num_pages, 0);
- if (ret) {
- pr_err("failed to init mr pool count %zu pages %zu\n",
- sc->rw_io.credits.max, sc->rw_io.credits.num_pages);
- goto err;
- }
- }
-
return 0;
err:
if (sc->ib.qp) {
@@ -2154,8 +2262,8 @@ static int smb_direct_prepare(struct ksmbd_transport *t)
return -ECONNABORTED;
ret = smb_direct_check_recvmsg(recvmsg);
- if (ret == -ECONNABORTED)
- goto out;
+ if (ret)
+ goto put;
req = (struct smbdirect_negotiate_req *)recvmsg->packet;
sp->max_recv_size = min_t(int, sp->max_recv_size,
@@ -2170,23 +2278,46 @@ static int smb_direct_prepare(struct ksmbd_transport *t)
sc->recv_io.credits.target = min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max);
sc->recv_io.credits.target = max_t(u16, sc->recv_io.credits.target, 1);
- ret = smb_direct_send_negotiate_response(sc, ret);
-out:
+put:
spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
sc->recv_io.reassembly.queue_length--;
list_del(&recvmsg->list);
spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);
put_recvmsg(sc, recvmsg);
+ if (ret == -ECONNABORTED)
+ return ret;
+
+ if (ret)
+ goto respond;
+
+ /*
+ * We negotiated with success, so we need to refill the recv queue.
+ * We do that with sc->idle.immediate_work still being disabled
+ * via smbdirect_socket_init(), so that queue_work(sc->workqueue,
+ * &sc->idle.immediate_work) in smb_direct_post_recv_credits()
+ * is a no-op.
+ *
+ * The message that grants the credits to the client is
+ * the negotiate response.
+ */
+ INIT_WORK(&sc->recv_io.posted.refill_work, smb_direct_post_recv_credits);
+ smb_direct_post_recv_credits(&sc->recv_io.posted.refill_work);
+ if (unlikely(sc->first_error))
+ return sc->first_error;
+ INIT_WORK(&sc->idle.immediate_work, smb_direct_send_immediate_work);
+
+respond:
+ ret = smb_direct_send_negotiate_response(sc, ret);
+
return ret;
}
static int smb_direct_connect(struct smbdirect_socket *sc)
{
- struct ib_qp_cap qp_cap;
int ret;
- ret = smb_direct_init_params(sc, &qp_cap);
+ ret = smb_direct_init_params(sc);
if (ret) {
pr_err("Can't configure RDMA parameters\n");
return ret;
@@ -2198,7 +2329,7 @@ static int smb_direct_connect(struct smbdirect_socket *sc)
return ret;
}
- ret = smb_direct_create_qpair(sc, &qp_cap);
+ ret = smb_direct_create_qpair(sc);
if (ret) {
pr_err("Can't accept RDMA client: %d\n", ret);
return ret;
@@ -2487,7 +2618,7 @@ void ksmbd_rdma_destroy(void)
}
}
-bool ksmbd_rdma_capable_netdev(struct net_device *netdev)
+static bool ksmbd_find_rdma_capable_netdev(struct net_device *netdev)
{
struct smb_direct_device *smb_dev;
int i;
@@ -2529,6 +2660,28 @@ out:
return rdma_capable;
}
+bool ksmbd_rdma_capable_netdev(struct net_device *netdev)
+{
+ struct net_device *lower_dev;
+ struct list_head *iter;
+
+ if (ksmbd_find_rdma_capable_netdev(netdev))
+ return true;
+
+ /* check if netdev is bridge or VLAN */
+ if (netif_is_bridge_master(netdev) ||
+ netdev->priv_flags & IFF_802_1Q_VLAN)
+ netdev_for_each_lower_dev(netdev, lower_dev, iter)
+ if (ksmbd_find_rdma_capable_netdev(lower_dev))
+ return true;
+
+ /* check if netdev is IPoIB safely without layer violation */
+ if (netdev->type == ARPHRD_INFINIBAND)
+ return true;
+
+ return false;
+}
+
static const struct ksmbd_transport_ops ksmbd_smb_direct_transport_ops = {
.prepare = smb_direct_prepare,
.disconnect = smb_direct_disconnect,
diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c
index 7a1e3dcc2cde..d2e391c29464 100644
--- a/fs/smb/server/transport_tcp.c
+++ b/fs/smb/server/transport_tcp.c
@@ -290,8 +290,11 @@ static int ksmbd_kthread_fn(void *p)
}
}
up_read(&conn_list_lock);
- if (ret == -EAGAIN)
+ if (ret == -EAGAIN) {
+ /* Per-IP limit hit: release the just-accepted socket. */
+ sock_release(client_sk);
continue;
+ }
skip_max_ip_conns_limit:
if (server_conf.max_connections &&
diff --git a/fs/splice.c b/fs/splice.c
index f5094b6d00a0..d338fe56b50b 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1498,7 +1498,7 @@ static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
/*
* For lack of a better implementation, implement vmsplice() to userspace
- * as a simple copy of the pipes pages to the user iov.
+ * as a simple copy of the pipe's pages to the user iov.
*/
static ssize_t vmsplice_to_user(struct file *file, struct iov_iter *iter,
unsigned int flags)
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index cceae3b78698..82b687414e65 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -86,7 +86,7 @@ struct inode *squashfs_iget(struct super_block *sb, long long ino,
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
err = squashfs_read_inode(inode, ino);
diff --git a/fs/super.c b/fs/super.c
index 5bab94fb7e03..7c66b96b59be 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -389,6 +389,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
goto fail;
if (list_lru_init_memcg(&s->s_inode_lru, s->s_shrink))
goto fail;
+ s->s_min_writeback_pages = MIN_WRITEBACK_PAGES;
return s;
fail:
@@ -1183,11 +1184,14 @@ static inline bool get_active_super(struct super_block *sb)
static const char *filesystems_freeze_ptr = "filesystems_freeze";
-static void filesystems_freeze_callback(struct super_block *sb, void *unused)
+static void filesystems_freeze_callback(struct super_block *sb, void *freeze_all_ptr)
{
if (!sb->s_op->freeze_fs && !sb->s_op->freeze_super)
return;
+ if (freeze_all_ptr && !(sb->s_type->fs_flags & FS_POWER_FREEZE))
+ return;
+
if (!get_active_super(sb))
return;
@@ -1201,9 +1205,13 @@ static void filesystems_freeze_callback(struct super_block *sb, void *unused)
deactivate_super(sb);
}
-void filesystems_freeze(void)
+void filesystems_freeze(bool freeze_all)
{
- __iterate_supers(filesystems_freeze_callback, NULL,
+ void *freeze_all_ptr = NULL;
+
+ if (freeze_all)
+ freeze_all_ptr = &freeze_all;
+ __iterate_supers(filesystems_freeze_callback, freeze_all_ptr,
SUPER_ITER_UNLOCKED | SUPER_ITER_REVERSE);
}
diff --git a/fs/sync.c b/fs/sync.c
index 2955cd4c77a3..431fc5f5be06 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -117,16 +117,17 @@ SYSCALL_DEFINE0(sync)
static void do_sync_work(struct work_struct *work)
{
int nowait = 0;
+ int wait = 1;
/*
* Sync twice to reduce the possibility we skipped some inodes / pages
* because they were temporarily locked
*/
- iterate_supers(sync_inodes_one_sb, &nowait);
+ iterate_supers(sync_inodes_one_sb, NULL);
iterate_supers(sync_fs_one_sb, &nowait);
sync_bdevs(false);
- iterate_supers(sync_inodes_one_sb, &nowait);
- iterate_supers(sync_fs_one_sb, &nowait);
+ iterate_supers(sync_inodes_one_sb, NULL);
+ iterate_supers(sync_fs_one_sb, &wait);
sync_bdevs(false);
printk("Emergency Sync complete\n");
kfree(work);
@@ -182,7 +183,7 @@ int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
if (!file->f_op->fsync)
return -EINVAL;
- if (!datasync && (inode->i_state & I_DIRTY_TIME))
+ if (!datasync && (inode_state_read_once(inode) & I_DIRTY_TIME))
mark_inode_dirty_sync(inode);
return file->f_op->fsync(file, start, end, datasync);
}
@@ -280,14 +281,12 @@ int sync_file_range(struct file *file, loff_t offset, loff_t nbytes,
}
if (flags & SYNC_FILE_RANGE_WRITE) {
- int sync_mode = WB_SYNC_NONE;
-
if ((flags & SYNC_FILE_RANGE_WRITE_AND_WAIT) ==
SYNC_FILE_RANGE_WRITE_AND_WAIT)
- sync_mode = WB_SYNC_ALL;
-
- ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
- sync_mode);
+ ret = filemap_fdatawrite_range(mapping, offset,
+ endbyte);
+ else
+ ret = filemap_flush_range(mapping, offset, endbyte);
if (ret < 0)
goto out;
}
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 2d78e94072a0..e142bac4f9f8 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -498,17 +498,26 @@ int compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj,
}
EXPORT_SYMBOL_GPL(compat_only_sysfs_link_entry_to_kobj);
-static int sysfs_group_attrs_change_owner(struct kernfs_node *grp_kn,
+static int sysfs_group_attrs_change_owner(struct kobject *kobj,
+ struct kernfs_node *grp_kn,
const struct attribute_group *grp,
struct iattr *newattrs)
{
struct kernfs_node *kn;
- int error;
+ int error, i;
+ umode_t mode;
if (grp->attrs) {
struct attribute *const *attr;
- for (attr = grp->attrs; *attr; attr++) {
+ for (i = 0, attr = grp->attrs; *attr; i++, attr++) {
+ if (grp->is_visible) {
+ mode = grp->is_visible(kobj, *attr, i);
+ if (mode & SYSFS_GROUP_INVISIBLE)
+ break;
+ if (!mode)
+ continue;
+ }
kn = kernfs_find_and_get(grp_kn, (*attr)->name);
if (!kn)
return -ENOENT;
@@ -523,7 +532,14 @@ static int sysfs_group_attrs_change_owner(struct kernfs_node *grp_kn,
if (grp->bin_attrs) {
const struct bin_attribute *const *bin_attr;
- for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) {
+ for (i = 0, bin_attr = grp->bin_attrs; *bin_attr; i++, bin_attr++) {
+ if (grp->is_bin_visible) {
+ mode = grp->is_bin_visible(kobj, *bin_attr, i);
+ if (mode & SYSFS_GROUP_INVISIBLE)
+ break;
+ if (!mode)
+ continue;
+ }
kn = kernfs_find_and_get(grp_kn, (*bin_attr)->attr.name);
if (!kn)
return -ENOENT;
@@ -573,7 +589,7 @@ int sysfs_group_change_owner(struct kobject *kobj,
error = kernfs_setattr(grp_kn, &newattrs);
if (!error)
- error = sysfs_group_attrs_change_owner(grp_kn, grp, &newattrs);
+ error = sysfs_group_attrs_change_owner(kobj, grp_kn, grp, &newattrs);
kernfs_put(grp_kn);
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index ca41ce8208c4..c3265b8804f5 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1323,7 +1323,7 @@ int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
inode_lock(inode);
/* Synchronize the inode unless this is a 'datasync()' call. */
- if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) {
+ if (!datasync || (inode_state_read_once(inode) & I_DIRTY_DATASYNC)) {
err = inode->i_sb->s_op->write_inode(inode, NULL);
if (err)
goto out;
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 46952a33c4e6..f453c37cee37 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -114,7 +114,7 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum)
inode = iget_locked(sb, inum);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
ui = ubifs_inode(inode);
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index a79d73f28aa7..7fae8002344a 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1962,7 +1962,7 @@ struct inode *__udf_iget(struct super_block *sb, struct kernel_lb_addr *ino,
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW)) {
+ if (!(inode_state_read_once(inode) & I_NEW)) {
if (UDF_I(inode)->i_hidden != hidden_inode) {
iput(inode);
return ERR_PTR(-EFSCORRUPTED);
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 8361c00e8fa6..e2b0a35de2a7 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -655,7 +655,7 @@ struct inode *ufs_iget(struct super_block *sb, unsigned long ino)
inode = iget_locked(sb, ino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
ufsi = UFS_I(inode);
diff --git a/fs/utimes.c b/fs/utimes.c
index c7c7958e57b2..3e7156396230 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -76,6 +76,7 @@ retry_deleg:
out:
return error;
}
+EXPORT_SYMBOL_GPL(vfs_utimes);
static int do_utimes_path(int dfd, const char __user *filename,
struct timespec64 *times, int flags)
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 8930d5254e1d..b99da294e9a3 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -119,6 +119,15 @@ config XFS_RT
See the xfs man page in section 5 for additional information.
+ This option is mandatory to support zoned block devices. For these
+ devices, the realtime subvolume must be backed by a zoned block
+ device and a regular block device used as the main device (for
+ metadata). If the zoned block device is a host-managed SMR hard-disk
+ containing conventional zones at the beginning of its address space,
+ XFS will use the disk conventional zones as the main device and the
+ remaining sequential write required zones as the backing storage for
+ the realtime subvolume.
+
If unsure, say N.
config XFS_DRAIN_INTENTS
@@ -156,7 +165,7 @@ config XFS_ONLINE_SCRUB_STATS
bool "XFS online metadata check usage data collection"
default y
depends on XFS_ONLINE_SCRUB
- select DEBUG_FS
+ depends on DEBUG_FS
help
If you say Y here, the kernel will gather usage data about
the online metadata check subsystem. This includes the number
diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h
index de840abc0bcd..57e47077c75a 100644
--- a/fs/xfs/libxfs/xfs_errortag.h
+++ b/fs/xfs/libxfs/xfs_errortag.h
@@ -73,7 +73,8 @@
#define XFS_ERRTAG_WRITE_DELAY_MS 43
#define XFS_ERRTAG_EXCHMAPS_FINISH_ONE 44
#define XFS_ERRTAG_METAFILE_RESV_CRITICAL 45
-#define XFS_ERRTAG_MAX 46
+#define XFS_ERRTAG_FORCE_ZERO_RANGE 46
+#define XFS_ERRTAG_MAX 47
/*
* Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -133,7 +134,8 @@ XFS_ERRTAG(ATTR_LEAF_TO_NODE, attr_leaf_to_node, 1) \
XFS_ERRTAG(WB_DELAY_MS, wb_delay_ms, 3000) \
XFS_ERRTAG(WRITE_DELAY_MS, write_delay_ms, 3000) \
XFS_ERRTAG(EXCHMAPS_FINISH_ONE, exchmaps_finish_one, 1) \
-XFS_ERRTAG(METAFILE_RESV_CRITICAL, metafile_resv_crit, 4)
+XFS_ERRTAG(METAFILE_RESV_CRITICAL, metafile_resv_crit, 4) \
+XFS_ERRTAG(FORCE_ZERO_RANGE, force_zero_range, 4)
#endif /* XFS_ERRTAG */
#endif /* __XFS_ERRORTAG_H_ */
diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h
index d36a6ae0abe5..d4fcf591e63d 100644
--- a/fs/xfs/libxfs/xfs_rtgroup.h
+++ b/fs/xfs/libxfs/xfs_rtgroup.h
@@ -50,6 +50,12 @@ struct xfs_rtgroup {
uint8_t *rtg_rsum_cache;
struct xfs_open_zone *rtg_open_zone;
};
+
+ /*
+ * Count of outstanding GC operations for zoned XFS. Any RTG with a
+ * non-zero rtg_gccount will not be picked as new GC victim.
+ */
+ atomic_t rtg_gccount;
};
/*
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 2ef7742be7d3..7bfa37c99480 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -1249,7 +1249,7 @@ xchk_irele(
* hits do not clear DONTCACHE, so we must do it here.
*/
spin_lock(&VFS_I(ip)->i_lock);
- VFS_I(ip)->i_state &= ~I_DONTCACHE;
+ inode_state_clear(VFS_I(ip), I_DONTCACHE);
spin_unlock(&VFS_I(ip)->i_lock);
}
diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c
index a90a011c7e5f..4f7040c9ddf0 100644
--- a/fs/xfs/scrub/inode_repair.c
+++ b/fs/xfs/scrub/inode_repair.c
@@ -1933,7 +1933,7 @@ xrep_inode_pptr(
* Unlinked inodes that cannot be added to the directory tree will not
* have a parent pointer.
*/
- if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
+ if (inode->i_nlink == 0 && !(inode_state_read_once(inode) & I_LINKABLE))
return 0;
/* Children of the superblock do not have parent pointers. */
diff --git a/fs/xfs/scrub/nlinks.c b/fs/xfs/scrub/nlinks.c
index 26721fab5cab..091c79e432e5 100644
--- a/fs/xfs/scrub/nlinks.c
+++ b/fs/xfs/scrub/nlinks.c
@@ -376,6 +376,36 @@ out_incomplete:
return error;
}
+static uint
+xchk_nlinks_ilock_dir(
+ struct xfs_inode *ip)
+{
+ uint lock_mode = XFS_ILOCK_SHARED;
+
+ /*
+ * We're going to scan the directory entries, so we must be ready to
+ * pull the data fork mappings into memory if they aren't already.
+ */
+ if (xfs_need_iread_extents(&ip->i_df))
+ lock_mode = XFS_ILOCK_EXCL;
+
+ /*
+ * We're going to scan the parent pointers, so we must be ready to
+ * pull the attr fork mappings into memory if they aren't already.
+ */
+ if (xfs_has_parent(ip->i_mount) && xfs_inode_has_attr_fork(ip) &&
+ xfs_need_iread_extents(&ip->i_af))
+ lock_mode = XFS_ILOCK_EXCL;
+
+ /*
+ * Take the IOLOCK so that other threads cannot start a directory
+ * update while we're scanning.
+ */
+ lock_mode |= XFS_IOLOCK_SHARED;
+ xfs_ilock(ip, lock_mode);
+ return lock_mode;
+}
+
/* Walk a directory to bump the observed link counts of the children. */
STATIC int
xchk_nlinks_collect_dir(
@@ -394,8 +424,7 @@ xchk_nlinks_collect_dir(
return 0;
/* Prevent anyone from changing this directory while we walk it. */
- xfs_ilock(dp, XFS_IOLOCK_SHARED);
- lock_mode = xfs_ilock_data_map_shared(dp);
+ lock_mode = xchk_nlinks_ilock_dir(dp);
/*
* The dotdot entry of an unlinked directory still points to the last
@@ -452,7 +481,6 @@ out_abort:
xchk_iscan_abort(&xnc->collect_iscan);
out_unlock:
xfs_iunlock(dp, lock_mode);
- xfs_iunlock(dp, XFS_IOLOCK_SHARED);
return error;
}
diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c
index 3b692c4acc1e..11d5de10fd56 100644
--- a/fs/xfs/scrub/parent.c
+++ b/fs/xfs/scrub/parent.c
@@ -915,7 +915,7 @@ xchk_pptr_looks_zapped(
* Temporary files that cannot be linked into the directory tree do not
* have attr forks because they cannot ever have parents.
*/
- if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
+ if (inode->i_nlink == 0 && !(inode_state_read_once(inode) & I_LINKABLE))
return false;
/*
diff --git a/fs/xfs/scrub/symlink_repair.c b/fs/xfs/scrub/symlink_repair.c
index 5902398185a8..df629892462f 100644
--- a/fs/xfs/scrub/symlink_repair.c
+++ b/fs/xfs/scrub/symlink_repair.c
@@ -184,7 +184,7 @@ xrep_symlink_salvage_inline(
sc->ip->i_disk_size == 1 && old_target[0] == '?')
return 0;
- nr = min(XFS_SYMLINK_MAXLEN, xfs_inode_data_fork_size(ip));
+ nr = min(XFS_SYMLINK_MAXLEN, ifp->if_bytes);
memcpy(target_buf, ifp->if_data, nr);
return nr;
}
diff --git a/fs/xfs/scrub/xfarray.c b/fs/xfs/scrub/xfarray.c
index cdd13ed9c569..ed2e8c64b1a8 100644
--- a/fs/xfs/scrub/xfarray.c
+++ b/fs/xfs/scrub/xfarray.c
@@ -834,7 +834,7 @@ xfarray_sort_scan(
si->first_folio_idx = xfarray_idx(si->array,
folio_pos(si->folio) + si->array->obj_size - 1);
- next_pos = folio_pos(si->folio) + folio_size(si->folio);
+ next_pos = folio_next_pos(si->folio);
si->last_folio_idx = xfarray_idx(si->array, next_pos - 1);
if (xfarray_pos(si->array, si->last_folio_idx + 1) > next_pos)
si->last_folio_idx--;
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index a26f79815533..56a544638491 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -271,7 +271,7 @@ xfs_discard_folio(
* folio itself and not the start offset that is passed in.
*/
xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos,
- folio_pos(folio) + folio_size(folio), NULL);
+ folio_next_pos(folio), NULL);
}
/*
@@ -742,14 +742,15 @@ xfs_vm_read_folio(
struct file *unused,
struct folio *folio)
{
- return iomap_read_folio(folio, &xfs_read_iomap_ops);
+ iomap_bio_read_folio(folio, &xfs_read_iomap_ops);
+ return 0;
}
STATIC void
xfs_vm_readahead(
struct readahead_control *rac)
{
- iomap_readahead(rac, &xfs_read_iomap_ops);
+ iomap_bio_readahead(rac, &xfs_read_iomap_ops);
}
static int
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 06ca11731e43..2208a720ec3f 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -514,7 +514,7 @@ xfs_can_free_eofblocks(
* Caller must either hold the exclusive io lock; or be inactivating
* the inode, which guarantees there are no other users of the inode.
*/
- if (!(VFS_I(ip)->i_state & I_FREEING))
+ if (!(inode_state_read_once(VFS_I(ip)) & I_FREEING))
xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL);
/* prealloc/delalloc exists only on regular files */
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 773d959965dc..47edf3041631 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1751,7 +1751,7 @@ xfs_init_buftarg(
const char *descr)
{
/* The maximum size of the buftarg is only known once the sb is read. */
- btp->bt_nr_sectors = (xfs_daddr_t)-1;
+ btp->bt_nr_sectors = XFS_BUF_DADDR_MAX;
/* Set up device logical sector size mask */
btp->bt_logical_sectorsize = logical_sectorsize;
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 8fa7bdf59c91..e25cd2a160f3 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -22,6 +22,7 @@ extern struct kmem_cache *xfs_buf_cache;
*/
struct xfs_buf;
+#define XFS_BUF_DADDR_MAX ((xfs_daddr_t) S64_MAX)
#define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL))
#define XBF_READ (1u << 0) /* buffer intended for reading from device */
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index ee49f20875af..6917de832191 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -726,8 +726,10 @@ xfs_trim_rtgroup_extents(
break;
}
- if (!tr.queued)
+ if (!tr.queued) {
+ kfree(tr.extents);
break;
+ }
/*
* We hand the extent list to the discard function here so the
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 2702fef2c90c..6108612182e2 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -27,6 +27,8 @@
#include "xfs_file.h"
#include "xfs_aops.h"
#include "xfs_zone_alloc.h"
+#include "xfs_error.h"
+#include "xfs_errortag.h"
#include <linux/dax.h>
#include <linux/falloc.h>
@@ -674,8 +676,17 @@ xfs_file_dio_write_aligned(
struct xfs_zone_alloc_ctx *ac)
{
unsigned int iolock = XFS_IOLOCK_SHARED;
+ unsigned int dio_flags = 0;
ssize_t ret;
+ /*
+ * For always COW inodes, each bio must be aligned to the file system
+ * block size and not just the device sector size because we need to
+ * allocate a block-aligned amount of space for each write.
+ */
+ if (xfs_is_always_cow_inode(ip))
+ dio_flags |= IOMAP_DIO_FSBLOCK_ALIGNED;
+
ret = xfs_ilock_iocb_for_write(iocb, &iolock);
if (ret)
return ret;
@@ -693,7 +704,7 @@ xfs_file_dio_write_aligned(
iolock = XFS_IOLOCK_SHARED;
}
trace_xfs_file_direct_write(iocb, from);
- ret = iomap_dio_rw(iocb, from, ops, dops, 0, ac, 0);
+ ret = iomap_dio_rw(iocb, from, ops, dops, dio_flags, ac, 0);
out_unlock:
xfs_iunlock(ip, iolock);
return ret;
@@ -890,15 +901,7 @@ xfs_file_dio_write(
if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
return -EINVAL;
- /*
- * For always COW inodes we also must check the alignment of each
- * individual iovec segment, as they could end up with different
- * I/Os due to the way bio_iov_iter_get_pages works, and we'd
- * then overwrite an already written block.
- */
- if (((iocb->ki_pos | count) & ip->i_mount->m_blockmask) ||
- (xfs_is_always_cow_inode(ip) &&
- (iov_iter_alignment(from) & ip->i_mount->m_blockmask)))
+ if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
return xfs_file_dio_write_unaligned(ip, iocb, from);
if (xfs_is_zoned_inode(ip))
return xfs_file_dio_write_zoned(ip, iocb, from);
@@ -1254,23 +1257,36 @@ xfs_falloc_zero_range(
struct xfs_zone_alloc_ctx *ac)
{
struct inode *inode = file_inode(file);
+ struct xfs_inode *ip = XFS_I(inode);
unsigned int blksize = i_blocksize(inode);
loff_t new_size = 0;
int error;
- trace_xfs_zero_file_space(XFS_I(inode));
+ trace_xfs_zero_file_space(ip);
error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
if (error)
return error;
- error = xfs_free_file_space(XFS_I(inode), offset, len, ac);
- if (error)
- return error;
+ /*
+ * Zero range implements a full zeroing mechanism but is only used in
+ * limited situations. It is more efficient to allocate unwritten
+ * extents than to perform zeroing here, so use an errortag to randomly
+ * force zeroing on DEBUG kernels for added test coverage.
+ */
+ if (XFS_TEST_ERROR(ip->i_mount,
+ XFS_ERRTAG_FORCE_ZERO_RANGE)) {
+ error = xfs_zero_range(ip, offset, len, ac, NULL);
+ } else {
+ error = xfs_free_file_space(ip, offset, len, ac);
+ if (error)
+ return error;
- len = round_up(offset + len, blksize) - round_down(offset, blksize);
- offset = round_down(offset, blksize);
- error = xfs_alloc_file_space(XFS_I(inode), offset, len);
+ len = round_up(offset + len, blksize) -
+ round_down(offset, blksize);
+ offset = round_down(offset, blksize);
+ error = xfs_alloc_file_space(ip, offset, len);
+ }
if (error)
return error;
return xfs_falloc_setsize(file, new_size);
diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c
index 7c541fb373d5..3c1557fb1cf0 100644
--- a/fs/xfs/xfs_health.c
+++ b/fs/xfs/xfs_health.c
@@ -285,7 +285,7 @@ xfs_inode_mark_sick(
* is not the case here.
*/
spin_lock(&VFS_I(ip)->i_lock);
- VFS_I(ip)->i_state &= ~I_DONTCACHE;
+ inode_state_clear(VFS_I(ip), I_DONTCACHE);
spin_unlock(&VFS_I(ip)->i_lock);
}
@@ -309,7 +309,7 @@ xfs_inode_mark_corrupt(
* is not the case here.
*/
spin_lock(&VFS_I(ip)->i_lock);
- VFS_I(ip)->i_state &= ~I_DONTCACHE;
+ inode_state_clear(VFS_I(ip), I_DONTCACHE);
spin_unlock(&VFS_I(ip)->i_lock);
}
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index e44040206851..f3fc4d21bfe1 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -334,7 +334,7 @@ xfs_reinit_inode(
dev_t dev = inode->i_rdev;
kuid_t uid = inode->i_uid;
kgid_t gid = inode->i_gid;
- unsigned long state = inode->i_state;
+ unsigned long state = inode_state_read_once(inode);
error = inode_init_always(mp->m_super, inode);
@@ -345,7 +345,7 @@ xfs_reinit_inode(
inode->i_rdev = dev;
inode->i_uid = uid;
inode->i_gid = gid;
- inode->i_state = state;
+ inode_state_assign_raw(inode, state);
mapping_set_folio_min_order(inode->i_mapping,
M_IGEO(mp)->min_folio_order);
return error;
@@ -411,7 +411,7 @@ xfs_iget_recycle(
ip->i_flags |= XFS_INEW;
xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
XFS_ICI_RECLAIM_TAG);
- inode->i_state = I_NEW;
+ inode_state_assign_raw(inode, I_NEW);
spin_unlock(&ip->i_flags_lock);
spin_unlock(&pag->pag_ici_lock);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 36b39539e561..f1f88e48fe22 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1580,7 +1580,7 @@ xfs_iunlink_reload_next(
next_ip->i_prev_unlinked = prev_agino;
trace_xfs_iunlink_reload_next(next_ip);
rele:
- ASSERT(!(VFS_I(next_ip)->i_state & I_DONTCACHE));
+ ASSERT(!(inode_state_read_once(VFS_I(next_ip)) & I_DONTCACHE));
if (xfs_is_quotacheck_running(mp) && next_ip)
xfs_iflags_set(next_ip, XFS_IQUOTAUNCHECKED);
xfs_irele(next_ip);
@@ -2111,7 +2111,7 @@ xfs_rename_alloc_whiteout(
*/
xfs_setup_iops(tmpfile);
xfs_finish_inode_setup(tmpfile);
- VFS_I(tmpfile)->i_state |= I_LINKABLE;
+ inode_state_set_raw(VFS_I(tmpfile), I_LINKABLE);
*wip = tmpfile;
return 0;
@@ -2330,7 +2330,7 @@ retry:
* flag from the inode so it doesn't accidentally get misused in
* future.
*/
- VFS_I(du_wip.ip)->i_state &= ~I_LINKABLE;
+ inode_state_clear_raw(VFS_I(du_wip.ip), I_LINKABLE);
}
out_commit:
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 1bd411a1114c..2eb0c6011a2e 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -113,9 +113,9 @@ xfs_inode_item_precommit(
* to log the timestamps, or will clear already cleared fields in the
* worst case.
*/
- if (inode->i_state & I_DIRTY_TIME) {
+ if (inode_state_read_once(inode) & I_DIRTY_TIME) {
spin_lock(&inode->i_lock);
- inode->i_state &= ~I_DIRTY_TIME;
+ inode_state_clear(inode, I_DIRTY_TIME);
spin_unlock(&inode->i_lock);
}
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index a6bb7ee7a27a..59eaad774371 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1408,10 +1408,8 @@ xfs_file_ioctl(
trace_xfs_ioc_free_eofblocks(mp, &icw, _RET_IP_);
- sb_start_write(mp->m_super);
- error = xfs_blockgc_free_space(mp, &icw);
- sb_end_write(mp->m_super);
- return error;
+ guard(super_write)(mp->m_super);
+ return xfs_blockgc_free_space(mp, &icw);
}
case XFS_IOC_EXCHANGE_RANGE:
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index d3f6e3e42a11..04f39ea15898 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -1091,6 +1091,29 @@ const struct iomap_ops xfs_zoned_direct_write_iomap_ops = {
};
#endif /* CONFIG_XFS_RT */
+#ifdef DEBUG
+static void
+xfs_check_atomic_cow_conversion(
+ struct xfs_inode *ip,
+ xfs_fileoff_t offset_fsb,
+ xfs_filblks_t count_fsb,
+ const struct xfs_bmbt_irec *cmap)
+{
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec cmap2 = { };
+
+ if (xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap2))
+ xfs_trim_extent(&cmap2, offset_fsb, count_fsb);
+
+ ASSERT(cmap2.br_startoff == cmap->br_startoff);
+ ASSERT(cmap2.br_blockcount == cmap->br_blockcount);
+ ASSERT(cmap2.br_startblock == cmap->br_startblock);
+ ASSERT(cmap2.br_state == cmap->br_state);
+}
+#else
+# define xfs_check_atomic_cow_conversion(...) ((void)0)
+#endif
+
static int
xfs_atomic_write_cow_iomap_begin(
struct inode *inode,
@@ -1102,9 +1125,10 @@ xfs_atomic_write_cow_iomap_begin(
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
- const xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
- xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length);
- xfs_filblks_t count_fsb = end_fsb - offset_fsb;
+ const xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ const xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + length);
+ const xfs_filblks_t count_fsb = end_fsb - offset_fsb;
+ xfs_filblks_t hole_count_fsb;
int nmaps = 1;
xfs_filblks_t resaligned;
struct xfs_bmbt_irec cmap;
@@ -1130,7 +1154,7 @@ xfs_atomic_write_cow_iomap_begin(
return -EAGAIN;
trace_xfs_iomap_atomic_write_cow(ip, offset, length);
-
+retry:
xfs_ilock(ip, XFS_ILOCK_EXCL);
if (!ip->i_cowfp) {
@@ -1141,14 +1165,22 @@ xfs_atomic_write_cow_iomap_begin(
if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap))
cmap.br_startoff = end_fsb;
if (cmap.br_startoff <= offset_fsb) {
+ if (isnullstartblock(cmap.br_startblock))
+ goto convert_delay;
+
+ /*
+ * cmap could extend outside the write range due to previous
+ * speculative preallocations. We must trim cmap to the write
+ * range because the cow fork treats written mappings to mean
+ * "write in progress".
+ */
xfs_trim_extent(&cmap, offset_fsb, count_fsb);
goto found;
}
- end_fsb = cmap.br_startoff;
- count_fsb = end_fsb - offset_fsb;
+ hole_count_fsb = cmap.br_startoff - offset_fsb;
- resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb,
+ resaligned = xfs_aligned_fsb_count(offset_fsb, hole_count_fsb,
xfs_get_cowextsz_hint(ip));
xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -1169,8 +1201,10 @@ xfs_atomic_write_cow_iomap_begin(
if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap))
cmap.br_startoff = end_fsb;
if (cmap.br_startoff <= offset_fsb) {
- xfs_trim_extent(&cmap, offset_fsb, count_fsb);
xfs_trans_cancel(tp);
+ if (isnullstartblock(cmap.br_startblock))
+ goto convert_delay;
+ xfs_trim_extent(&cmap, offset_fsb, count_fsb);
goto found;
}
@@ -1182,7 +1216,7 @@ xfs_atomic_write_cow_iomap_begin(
* atomic writes to that same range will be aligned (and don't require
* this COW-based method).
*/
- error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
+ error = xfs_bmapi_write(tp, ip, offset_fsb, hole_count_fsb,
XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC |
XFS_BMAPI_EXTSZALIGN, 0, &cmap, &nmaps);
if (error) {
@@ -1195,21 +1229,43 @@ xfs_atomic_write_cow_iomap_begin(
if (error)
goto out_unlock;
+ /*
+ * cmap could map more blocks than the range we passed into bmapi_write
+ * because of EXTSZALIGN or adjacent pre-existing unwritten mappings
+ * that were merged. Trim cmap to the original write range so that we
+ * don't convert more than we were asked to do for this write.
+ */
+ xfs_trim_extent(&cmap, offset_fsb, count_fsb);
+
found:
if (cmap.br_state != XFS_EXT_NORM) {
- error = xfs_reflink_convert_cow_locked(ip, offset_fsb,
- count_fsb);
+ error = xfs_reflink_convert_cow_locked(ip, cmap.br_startoff,
+ cmap.br_blockcount);
if (error)
goto out_unlock;
cmap.br_state = XFS_EXT_NORM;
+ xfs_check_atomic_cow_conversion(ip, offset_fsb, count_fsb,
+ &cmap);
}
- length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount);
- trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap);
+ trace_xfs_iomap_found(ip, offset, length, XFS_COW_FORK, &cmap);
seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq);
+convert_delay:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ error = xfs_bmapi_convert_delalloc(ip, XFS_COW_FORK, offset, iomap,
+ NULL);
+ if (error)
+ return error;
+
+ /*
+ * Try the lookup again, because the delalloc conversion might have
+ * turned the COW mapping into unwritten, but we need it to be in
+ * written state.
+ */
+ goto retry;
out_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
@@ -1702,6 +1758,8 @@ xfs_buffered_write_iomap_begin(
struct iomap *iomap,
struct iomap *srcmap)
{
+ struct iomap_iter *iter = container_of(iomap, struct iomap_iter,
+ iomap);
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
@@ -1767,21 +1825,41 @@ xfs_buffered_write_iomap_begin(
}
/*
- * For zeroing, trim a delalloc extent that extends beyond the EOF
- * block. If it starts beyond the EOF block, convert it to an
+ * For zeroing, trim extents that extend beyond the EOF block. If a
+ * delalloc extent starts beyond the EOF block, convert it to an
* unwritten extent.
*/
- if ((flags & IOMAP_ZERO) && imap.br_startoff <= offset_fsb &&
- isnullstartblock(imap.br_startblock)) {
+ if (flags & IOMAP_ZERO) {
xfs_fileoff_t eof_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
+ u64 end;
- if (offset_fsb >= eof_fsb)
+ if (isnullstartblock(imap.br_startblock) &&
+ offset_fsb >= eof_fsb)
goto convert_delay;
- if (end_fsb > eof_fsb) {
+ if (offset_fsb < eof_fsb && end_fsb > eof_fsb)
end_fsb = eof_fsb;
- xfs_trim_extent(&imap, offset_fsb,
- end_fsb - offset_fsb);
+
+ /*
+ * Look up dirty folios for unwritten mappings within EOF.
+ * Providing this bypasses the flush iomap uses to trigger
+ * extent conversion when unwritten mappings have dirty
+ * pagecache in need of zeroing.
+ *
+ * Trim the mapping to the end pos of the lookup, which in turn
+ * was trimmed to the end of the batch if it became full before
+ * the end of the mapping.
+ */
+ if (imap.br_state == XFS_EXT_UNWRITTEN &&
+ offset_fsb < eof_fsb) {
+ loff_t len = min(count,
+ XFS_FSB_TO_B(mp, imap.br_blockcount));
+
+ end = iomap_fill_dirty_folios(iter, offset, len);
+ end_fsb = min_t(xfs_fileoff_t, end_fsb,
+ XFS_B_TO_FSB(mp, end));
}
+
+ xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
}
/*
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index caff0125faea..ad94fbf55014 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -1420,7 +1420,7 @@ xfs_setup_inode(
bool is_meta = xfs_is_internal_inode(ip);
inode->i_ino = ip->i_ino;
- inode->i_state |= I_NEW;
+ inode_state_set_raw(inode, I_NEW);
inode_sb_list_add(inode);
/* make the inode look hashed for the writeback code */
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index f046d1215b04..b871dfde372b 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -236,7 +236,6 @@ typedef struct xfs_mount {
bool m_update_sb; /* sb needs update in mount */
unsigned int m_max_open_zones;
unsigned int m_zonegc_low_space;
- struct xfs_mru_cache *m_zone_cache; /* Inode to open zone cache */
/* max_atomic_write mount option value */
unsigned long long m_awu_max_bytes;
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index 36cda724da89..9d1ed9bb0bee 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -17,7 +17,7 @@ xfs_can_free_cowblocks(struct xfs_inode *ip)
{
struct inode *inode = VFS_I(ip);
- if ((inode->i_state & I_DIRTY_PAGES) ||
+ if ((inode_state_read_once(inode) & I_DIRTY_PAGES) ||
mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY) ||
mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK) ||
atomic_read(&inode->i_dio_count))
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index e85a156dc17d..bc71aa9dcee8 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -102,7 +102,7 @@ static const struct constant_table dax_param_enums[] = {
* Table driven mount option parser.
*/
enum {
- Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev,
+ Op_deprecated, Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev,
Opt_wsync, Opt_noalign, Opt_swalloc, Opt_sunit, Opt_swidth, Opt_nouuid,
Opt_grpid, Opt_nogrpid, Opt_bsdgroups, Opt_sysvgroups,
Opt_allocsize, Opt_norecovery, Opt_inode64, Opt_inode32,
@@ -114,7 +114,21 @@ enum {
Opt_lifetime, Opt_nolifetime, Opt_max_atomic_write,
};
+#define fsparam_dead(NAME) \
+ __fsparam(NULL, (NAME), Op_deprecated, fs_param_deprecated, NULL)
+
static const struct fs_parameter_spec xfs_fs_parameters[] = {
+ /*
+ * These mount options were supposed to be deprecated in September 2025
+ * but the deprecation warning was buggy, so not all users were
+ * notified. The deprecation is now obnoxiously loud and postponed to
+ * September 2030.
+ */
+ fsparam_dead("attr2"),
+ fsparam_dead("noattr2"),
+ fsparam_dead("ikeep"),
+ fsparam_dead("noikeep"),
+
fsparam_u32("logbufs", Opt_logbufs),
fsparam_string("logbsize", Opt_logbsize),
fsparam_string("logdev", Opt_logdev),
@@ -786,6 +800,12 @@ xfs_fs_evict_inode(
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
+
+ if (IS_ENABLED(CONFIG_XFS_RT) &&
+ S_ISREG(inode->i_mode) && inode->i_private) {
+ xfs_open_zone_put(inode->i_private);
+ inode->i_private = NULL;
+ }
}
static void
@@ -1373,16 +1393,25 @@ suffix_kstrtoull(
static inline void
xfs_fs_warn_deprecated(
struct fs_context *fc,
- struct fs_parameter *param,
- uint64_t flag,
- bool value)
+ struct fs_parameter *param)
{
- /* Don't print the warning if reconfiguring and current mount point
- * already had the flag set
+ /*
+ * Always warn about someone passing in a deprecated mount option.
+ * Previously we wouldn't print the warning if we were reconfiguring
+ * and current mount point already had the flag set, but that was not
+ * the right thing to do.
+ *
+ * Many distributions mount the root filesystem with no options in the
+ * initramfs and rely on mount -a to remount the root fs with the
+ * options in fstab. However, the old behavior meant that there would
+ * never be a warning about deprecated mount options for the root fs in
+ * /etc/fstab. On a single-fs system, that means no warning at all.
+ *
+ * Compounding this problem are distribution scripts that copy
+ * /proc/mounts to fstab, which means that we can't remove mount
+ * options unless we're 100% sure they have only ever been advertised
+ * in /proc/mounts in response to explicitly provided mount options.
*/
- if ((fc->purpose & FS_CONTEXT_FOR_RECONFIGURE) &&
- !!(XFS_M(fc->root->d_sb)->m_features & flag) == value)
- return;
xfs_warn(fc->s_fs_info, "%s mount option is deprecated.", param->key);
}
@@ -1408,6 +1437,9 @@ xfs_fs_parse_param(
return opt;
switch (opt) {
+ case Op_deprecated:
+ xfs_fs_warn_deprecated(fc, param);
+ return 0;
case Opt_logbufs:
parsing_mp->m_logbufs = result.uint_32;
return 0;
@@ -1528,7 +1560,6 @@ xfs_fs_parse_param(
xfs_mount_set_dax_mode(parsing_mp, result.uint_32);
return 0;
#endif
- /* Following mount options will be removed in September 2025 */
case Opt_max_open_zones:
parsing_mp->m_max_open_zones = result.uint_32;
return 0;
@@ -1662,7 +1693,10 @@ xfs_fs_fill_super(
if (error)
return error;
- sb_min_blocksize(sb, BBSIZE);
+ if (!sb_min_blocksize(sb, BBSIZE)) {
+ xfs_err(mp, "unable to set blocksize");
+ return -EINVAL;
+ }
sb->s_xattr = xfs_xattr_handlers;
sb->s_export_op = &xfs_export_operations;
#ifdef CONFIG_XFS_QUOTA
@@ -2221,7 +2255,7 @@ xfs_init_fs_context(
struct xfs_mount *mp;
int i;
- mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL | __GFP_NOFAIL);
+ mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL);
if (!mp)
return -ENOMEM;
diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c
index 1147bacb2da8..8dde444596f1 100644
--- a/fs/xfs/xfs_zone_alloc.c
+++ b/fs/xfs/xfs_zone_alloc.c
@@ -26,14 +26,22 @@
#include "xfs_trace.h"
#include "xfs_mru_cache.h"
+static void
+xfs_open_zone_free_rcu(
+ struct callback_head *cb)
+{
+ struct xfs_open_zone *oz = container_of(cb, typeof(*oz), oz_rcu);
+
+ xfs_rtgroup_rele(oz->oz_rtg);
+ kfree(oz);
+}
+
void
xfs_open_zone_put(
struct xfs_open_zone *oz)
{
- if (atomic_dec_and_test(&oz->oz_ref)) {
- xfs_rtgroup_rele(oz->oz_rtg);
- kfree(oz);
- }
+ if (atomic_dec_and_test(&oz->oz_ref))
+ call_rcu(&oz->oz_rcu, xfs_open_zone_free_rcu);
}
static inline uint32_t
@@ -238,6 +246,14 @@ xfs_zoned_map_extent(
* If a data write raced with this GC write, keep the existing data in
* the data fork, mark our newly written GC extent as reclaimable, then
* move on to the next extent.
+ *
+ * Note that this can also happen when racing with operations that do
+ * not actually invalidate the data, but just move it to a different
+ * inode (XFS_IOC_EXCHANGE_RANGE), or to a different offset inside the
+ * inode (FALLOC_FL_COLLAPSE_RANGE / FALLOC_FL_INSERT_RANGE). If the
+ * data was just moved around, GC fails to free the zone, but the zone
+ * becomes a GC candidate again as soon as all previous GC I/O has
+ * finished and these blocks will be moved out eventually.
*/
if (old_startblock != NULLFSBLOCK &&
old_startblock != data.br_startblock)
@@ -599,7 +615,7 @@ xfs_select_open_zone_mru(
lockdep_assert_held(&zi->zi_open_zones_lock);
list_for_each_entry_reverse(oz, &zi->zi_open_zones, oz_entry)
- if (xfs_try_use_zone(zi, file_hint, oz, false))
+ if (xfs_try_use_zone(zi, file_hint, oz, XFS_ZONE_ALLOC_OK))
return oz;
cond_resched_lock(&zi->zi_open_zones_lock);
@@ -614,14 +630,25 @@ static inline enum rw_hint xfs_inode_write_hint(struct xfs_inode *ip)
}
/*
- * Try to pack inodes that are written back after they were closed tight instead
- * of trying to open new zones for them or spread them to the least recently
- * used zone. This optimizes the data layout for workloads that untar or copy
- * a lot of small files. Right now this does not separate multiple such
+ * Try to tightly pack small files that are written back after they were closed
+ * instead of trying to open new zones for them or spread them to the least
+ * recently used zone. This optimizes the data layout for workloads that untar
+ * or copy a lot of small files. Right now this does not separate multiple such
* streams.
*/
static inline bool xfs_zoned_pack_tight(struct xfs_inode *ip)
{
+ struct xfs_mount *mp = ip->i_mount;
+ size_t zone_capacity =
+ XFS_FSB_TO_B(mp, mp->m_groups[XG_TYPE_RTG].blocks);
+
+ /*
+ * Do not pack write files that are already using a full zone to avoid
+ * fragmentation.
+ */
+ if (i_size_read(VFS_I(ip)) >= zone_capacity)
+ return false;
+
return !inode_is_open_for_write(VFS_I(ip)) &&
!(ip->i_diflags & XFS_DIFLAG_APPEND);
}
@@ -746,97 +773,54 @@ xfs_mark_rtg_boundary(
}
/*
- * Cache the last zone written to for an inode so that it is considered first
- * for subsequent writes.
- */
-struct xfs_zone_cache_item {
- struct xfs_mru_cache_elem mru;
- struct xfs_open_zone *oz;
-};
-
-static inline struct xfs_zone_cache_item *
-xfs_zone_cache_item(struct xfs_mru_cache_elem *mru)
-{
- return container_of(mru, struct xfs_zone_cache_item, mru);
-}
-
-static void
-xfs_zone_cache_free_func(
- void *data,
- struct xfs_mru_cache_elem *mru)
-{
- struct xfs_zone_cache_item *item = xfs_zone_cache_item(mru);
-
- xfs_open_zone_put(item->oz);
- kfree(item);
-}
-
-/*
* Check if we have a cached last open zone available for the inode and
* if yes return a reference to it.
*/
static struct xfs_open_zone *
-xfs_cached_zone(
- struct xfs_mount *mp,
- struct xfs_inode *ip)
+xfs_get_cached_zone(
+ struct xfs_inode *ip)
{
- struct xfs_mru_cache_elem *mru;
- struct xfs_open_zone *oz;
+ struct xfs_open_zone *oz;
- mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino);
- if (!mru)
- return NULL;
- oz = xfs_zone_cache_item(mru)->oz;
+ rcu_read_lock();
+ oz = VFS_I(ip)->i_private;
if (oz) {
/*
* GC only steals open zones at mount time, so no GC zones
* should end up in the cache.
*/
ASSERT(!oz->oz_is_gc);
- ASSERT(atomic_read(&oz->oz_ref) > 0);
- atomic_inc(&oz->oz_ref);
+ if (!atomic_inc_not_zero(&oz->oz_ref))
+ oz = NULL;
}
- xfs_mru_cache_done(mp->m_zone_cache);
+ rcu_read_unlock();
+
return oz;
}
/*
- * Update the last used zone cache for a given inode.
+ * Stash our zone in the inode so that is is reused for future allocations.
*
- * The caller must have a reference on the open zone.
+ * The open_zone structure will be pinned until either the inode is freed or
+ * until the cached open zone is replaced with a different one because the
+ * current one was full when we tried to use it. This means we keep any
+ * open zone around forever as long as any inode that used it for the last
+ * write is cached, which slightly increases the memory use of cached inodes
+ * that were every written to, but significantly simplifies the cached zone
+ * lookup. Because the open_zone is clearly marked as full when all data
+ * in the underlying RTG was written, the caching is always safe.
*/
static void
-xfs_zone_cache_create_association(
- struct xfs_inode *ip,
- struct xfs_open_zone *oz)
+xfs_set_cached_zone(
+ struct xfs_inode *ip,
+ struct xfs_open_zone *oz)
{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_zone_cache_item *item = NULL;
- struct xfs_mru_cache_elem *mru;
+ struct xfs_open_zone *old_oz;
- ASSERT(atomic_read(&oz->oz_ref) > 0);
atomic_inc(&oz->oz_ref);
-
- mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino);
- if (mru) {
- /*
- * If we have an association already, update it to point to the
- * new zone.
- */
- item = xfs_zone_cache_item(mru);
- xfs_open_zone_put(item->oz);
- item->oz = oz;
- xfs_mru_cache_done(mp->m_zone_cache);
- return;
- }
-
- item = kmalloc(sizeof(*item), GFP_KERNEL);
- if (!item) {
- xfs_open_zone_put(oz);
- return;
- }
- item->oz = oz;
- xfs_mru_cache_insert(mp->m_zone_cache, ip->i_ino, &item->mru);
+ old_oz = xchg(&VFS_I(ip)->i_private, oz);
+ if (old_oz)
+ xfs_open_zone_put(old_oz);
}
static void
@@ -880,15 +864,14 @@ xfs_zone_alloc_and_submit(
* the inode is still associated with a zone and use that if so.
*/
if (!*oz)
- *oz = xfs_cached_zone(mp, ip);
+ *oz = xfs_get_cached_zone(ip);
if (!*oz) {
select_zone:
*oz = xfs_select_zone(mp, write_hint, pack_tight);
if (!*oz)
goto out_error;
-
- xfs_zone_cache_create_association(ip, *oz);
+ xfs_set_cached_zone(ip, *oz);
}
alloc_len = xfs_zone_alloc_blocks(*oz, XFS_B_TO_FSB(mp, ioend->io_size),
@@ -966,6 +949,12 @@ xfs_free_open_zones(
xfs_open_zone_put(oz);
}
spin_unlock(&zi->zi_open_zones_lock);
+
+ /*
+ * Wait for all open zones to be freed so that they drop the group
+ * references:
+ */
+ rcu_barrier();
}
struct xfs_init_zones {
@@ -1215,6 +1204,7 @@ xfs_mount_zones(
.mp = mp,
};
struct xfs_buftarg *bt = mp->m_rtdev_targp;
+ xfs_extlen_t zone_blocks = mp->m_groups[XG_TYPE_RTG].blocks;
int error;
if (!bt) {
@@ -1245,10 +1235,33 @@ xfs_mount_zones(
return -ENOMEM;
xfs_info(mp, "%u zones of %u blocks (%u max open zones)",
- mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks,
- mp->m_max_open_zones);
+ mp->m_sb.sb_rgcount, zone_blocks, mp->m_max_open_zones);
trace_xfs_zones_mount(mp);
+ /*
+ * The writeback code switches between inodes regularly to provide
+ * fairness. The default lower bound is 4MiB, but for zoned file
+ * systems we want to increase that both to reduce seeks, but also more
+ * importantly so that workloads that writes files in a multiple of the
+ * zone size do not get fragmented and require garbage collection when
+ * they shouldn't. Increase is to the zone size capped by the max
+ * extent len.
+ *
+ * Note that because s_min_writeback_pages is a superblock field, this
+ * value also get applied to non-zoned files on the data device if
+ * there are any. On typical zoned setup all data is on the RT device
+ * because using the more efficient sequential write required zones
+ * is the reason for using the zone allocator, and either the RT device
+ * and the (meta)data device are on the same block device, or the
+ * (meta)data device is on a fast SSD while the data on the RT device
+ * is on a SMR HDD. In any combination of the above cases enforcing
+ * the higher min_writeback_pages for non-RT inodes is either a noop
+ * or beneficial.
+ */
+ mp->m_super->s_min_writeback_pages =
+ XFS_FSB_TO_B(mp, min(zone_blocks, XFS_MAX_BMBT_EXTLEN)) >>
+ PAGE_SHIFT;
+
if (bdev_is_zoned(bt->bt_bdev)) {
error = blkdev_report_zones(bt->bt_bdev,
XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart),
@@ -1260,8 +1273,10 @@ xfs_mount_zones(
while ((rtg = xfs_rtgroup_next(mp, rtg))) {
error = xfs_init_zone(&iz, rtg, NULL);
- if (error)
+ if (error) {
+ xfs_rtgroup_rele(rtg);
goto out_free_zone_info;
+ }
}
}
@@ -1279,14 +1294,6 @@ xfs_mount_zones(
error = xfs_zone_gc_mount(mp);
if (error)
goto out_free_zone_info;
-
- /*
- * Set up a mru cache to track inode to open zone for data placement
- * purposes. The magic values for group count and life time is the
- * same as the defaults for file streams, which seems sane enough.
- */
- xfs_mru_cache_create(&mp->m_zone_cache, mp,
- 5000, 10, xfs_zone_cache_free_func);
return 0;
out_free_zone_info:
@@ -1300,5 +1307,4 @@ xfs_unmount_zones(
{
xfs_zone_gc_unmount(mp);
xfs_free_zone_info(mp->m_zone_info);
- xfs_mru_cache_destroy(mp->m_zone_cache);
}
diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c
index 064cd1a857a0..4ade54445532 100644
--- a/fs/xfs/xfs_zone_gc.c
+++ b/fs/xfs/xfs_zone_gc.c
@@ -114,6 +114,8 @@ struct xfs_gc_bio {
/* Open Zone being written to */
struct xfs_open_zone *oz;
+ struct xfs_rtgroup *victim_rtg;
+
/* Bio used for reads and writes, including the bvec used by it */
struct bio_vec bv;
struct bio bio; /* must be last */
@@ -264,6 +266,7 @@ xfs_zone_gc_iter_init(
iter->rec_count = 0;
iter->rec_idx = 0;
iter->victim_rtg = victim_rtg;
+ atomic_inc(&victim_rtg->rtg_gccount);
}
/*
@@ -362,6 +365,7 @@ xfs_zone_gc_query(
return 0;
done:
+ atomic_dec(&iter->victim_rtg->rtg_gccount);
xfs_rtgroup_rele(iter->victim_rtg);
iter->victim_rtg = NULL;
return 0;
@@ -451,6 +455,20 @@ xfs_zone_gc_pick_victim_from(
if (!rtg)
continue;
+ /*
+ * If the zone is already undergoing GC, don't pick it again.
+ *
+ * This prevents us from picking one of the zones for which we
+ * already submitted GC I/O, but for which the remapping hasn't
+ * concluded yet. This won't cause data corruption, but
+ * increases write amplification and slows down GC, so this is
+ * a bad thing.
+ */
+ if (atomic_read(&rtg->rtg_gccount)) {
+ xfs_rtgroup_rele(rtg);
+ continue;
+ }
+
/* skip zones that are just waiting for a reset */
if (rtg_rmap(rtg)->i_used_blocks == 0 ||
rtg_rmap(rtg)->i_used_blocks >= victim_used) {
@@ -491,21 +509,6 @@ xfs_zone_gc_select_victim(
struct xfs_rtgroup *victim_rtg = NULL;
unsigned int bucket;
- if (xfs_is_shutdown(mp))
- return false;
-
- if (iter->victim_rtg)
- return true;
-
- /*
- * Don't start new work if we are asked to stop or park.
- */
- if (kthread_should_stop() || kthread_should_park())
- return false;
-
- if (!xfs_zoned_need_gc(mp))
- return false;
-
spin_lock(&zi->zi_used_buckets_lock);
for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) {
victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket);
@@ -703,6 +706,9 @@ xfs_zone_gc_start_chunk(
chunk->scratch = &data->scratch[data->scratch_idx];
chunk->data = data;
chunk->oz = oz;
+ chunk->victim_rtg = iter->victim_rtg;
+ atomic_inc(&chunk->victim_rtg->rtg_group.xg_active_ref);
+ atomic_inc(&chunk->victim_rtg->rtg_gccount);
bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
bio->bi_end_io = xfs_zone_gc_end_io;
@@ -725,6 +731,8 @@ static void
xfs_zone_gc_free_chunk(
struct xfs_gc_bio *chunk)
{
+ atomic_dec(&chunk->victim_rtg->rtg_gccount);
+ xfs_rtgroup_rele(chunk->victim_rtg);
list_del(&chunk->entry);
xfs_open_zone_put(chunk->oz);
xfs_irele(chunk->ip);
@@ -785,6 +793,10 @@ xfs_zone_gc_split_write(
split_chunk->oz = chunk->oz;
atomic_inc(&chunk->oz->oz_ref);
+ split_chunk->victim_rtg = chunk->victim_rtg;
+ atomic_inc(&chunk->victim_rtg->rtg_group.xg_active_ref);
+ atomic_inc(&chunk->victim_rtg->rtg_gccount);
+
chunk->offset += split_len;
chunk->len -= split_len;
chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len);
@@ -975,6 +987,27 @@ xfs_zone_gc_reset_zones(
} while (next);
}
+static bool
+xfs_zone_gc_should_start_new_work(
+ struct xfs_zone_gc_data *data)
+{
+ if (xfs_is_shutdown(data->mp))
+ return false;
+ if (!xfs_zone_gc_space_available(data))
+ return false;
+
+ if (!data->iter.victim_rtg) {
+ if (kthread_should_stop() || kthread_should_park())
+ return false;
+ if (!xfs_zoned_need_gc(data->mp))
+ return false;
+ if (!xfs_zone_gc_select_victim(data))
+ return false;
+ }
+
+ return true;
+}
+
/*
* Handle the work to read and write data for GC and to reset the zones,
* including handling all completions.
@@ -982,7 +1015,7 @@ xfs_zone_gc_reset_zones(
* Note that the order of the chunks is preserved so that we don't undo the
* optimal order established by xfs_zone_gc_query().
*/
-static bool
+static void
xfs_zone_gc_handle_work(
struct xfs_zone_gc_data *data)
{
@@ -996,30 +1029,22 @@ xfs_zone_gc_handle_work(
zi->zi_reset_list = NULL;
spin_unlock(&zi->zi_reset_list_lock);
- if (!xfs_zone_gc_select_victim(data) ||
- !xfs_zone_gc_space_available(data)) {
- if (list_empty(&data->reading) &&
- list_empty(&data->writing) &&
- list_empty(&data->resetting) &&
- !reset_list)
- return false;
- }
-
- __set_current_state(TASK_RUNNING);
- try_to_freeze();
-
- if (reset_list)
+ if (reset_list) {
+ set_current_state(TASK_RUNNING);
xfs_zone_gc_reset_zones(data, reset_list);
+ }
list_for_each_entry_safe(chunk, next, &data->resetting, entry) {
if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
break;
+ set_current_state(TASK_RUNNING);
xfs_zone_gc_finish_reset(chunk);
}
list_for_each_entry_safe(chunk, next, &data->writing, entry) {
if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
break;
+ set_current_state(TASK_RUNNING);
xfs_zone_gc_finish_chunk(chunk);
}
@@ -1027,15 +1052,18 @@ xfs_zone_gc_handle_work(
list_for_each_entry_safe(chunk, next, &data->reading, entry) {
if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
break;
+ set_current_state(TASK_RUNNING);
xfs_zone_gc_write_chunk(chunk);
}
blk_finish_plug(&plug);
- blk_start_plug(&plug);
- while (xfs_zone_gc_start_chunk(data))
- ;
- blk_finish_plug(&plug);
- return true;
+ if (xfs_zone_gc_should_start_new_work(data)) {
+ set_current_state(TASK_RUNNING);
+ blk_start_plug(&plug);
+ while (xfs_zone_gc_start_chunk(data))
+ ;
+ blk_finish_plug(&plug);
+ }
}
/*
@@ -1059,8 +1087,18 @@ xfs_zoned_gcd(
for (;;) {
set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
xfs_set_zonegc_running(mp);
- if (xfs_zone_gc_handle_work(data))
+
+ xfs_zone_gc_handle_work(data);
+
+ /*
+ * Only sleep if nothing set the state to running. Else check for
+ * work again as someone might have queued up more work and woken
+ * us in the meantime.
+ */
+ if (get_current_state() == TASK_RUNNING) {
+ try_to_freeze();
continue;
+ }
if (list_empty(&data->reading) &&
list_empty(&data->writing) &&
diff --git a/fs/xfs/xfs_zone_priv.h b/fs/xfs/xfs_zone_priv.h
index 35e6de3d25ed..4322e26dd99a 100644
--- a/fs/xfs/xfs_zone_priv.h
+++ b/fs/xfs/xfs_zone_priv.h
@@ -44,6 +44,8 @@ struct xfs_open_zone {
* the life time of an open zone.
*/
struct xfs_rtgroup *oz_rtg;
+
+ struct rcu_head oz_rcu;
};
/*
diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
index 90e2ad8ee5f4..c1e5e30e90a0 100644
--- a/fs/zonefs/file.c
+++ b/fs/zonefs/file.c
@@ -112,12 +112,13 @@ static const struct iomap_ops zonefs_write_iomap_ops = {
static int zonefs_read_folio(struct file *unused, struct folio *folio)
{
- return iomap_read_folio(folio, &zonefs_read_iomap_ops);
+ iomap_bio_read_folio(folio, &zonefs_read_iomap_ops);
+ return 0;
}
static void zonefs_readahead(struct readahead_control *rac)
{
- iomap_readahead(rac, &zonefs_read_iomap_ops);
+ iomap_bio_readahead(rac, &zonefs_read_iomap_ops);
}
/*
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 70be0b3dda49..086a31269198 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -644,7 +644,7 @@ static struct inode *zonefs_get_file_inode(struct inode *dir,
inode = iget_locked(sb, ino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW)) {
+ if (!(inode_state_read_once(inode) & I_NEW)) {
WARN_ON_ONCE(inode->i_private != z);
return inode;
}
@@ -683,7 +683,7 @@ static struct inode *zonefs_get_zgroup_inode(struct super_block *sb,
inode = iget_locked(sb, ino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
inode->i_ino = ino;