From 10f0740234f0b157b41bdc7e9c3555a9b86c1599 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 9 Oct 2024 16:28:06 +1100 Subject: [PATCH 1/7] sunrpc: handle -ENOTCONN in xs_tcp_setup_socket() xs_tcp_finish_connecting() can return -ENOTCONN but the switch statement in xs_tcp_setup_socket() treats that as an unhandled error. If we treat it as a known error it would propagate back to call_connect_status() which does handle that error code. This appears to be the intention of the commit (given below) which added -ENOTCONN as a return status for xs_tcp_finish_connecting(). So add -ENOTCONN to the switch statement as an error to pass through to the caller. Link: https://bugzilla.suse.com/show_bug.cgi?id=1231050 Link: https://access.redhat.com/discussions/3434091 Fixes: 01d37c428ae0 ("SUNRPC: xprt_connect() don't abort the task if the transport isn't bound") Signed-off-by: NeilBrown Reviewed-by: Benjamin Coddington Signed-off-by: Anna Schumaker --- net/sunrpc/xprtsock.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 0e1691316f42..1326fbf45a34 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2459,6 +2459,7 @@ static void xs_tcp_setup_socket(struct work_struct *work) case -EHOSTUNREACH: case -EADDRINUSE: case -ENOBUFS: + case -ENOTCONN: break; default: printk("%s: connect returned unhandled error %d\n", From 6e2a10343ecb71c4457bc16be05758f9c7aae7d9 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 4 Oct 2024 11:07:23 +1000 Subject: [PATCH 2/7] NFSv3: only use NFS timeout for MOUNT when protocols are compatible If a timeout is specified in the mount options, it currently applies to both the NFS protocol and (with v3) the MOUNT protocol. This is sensible when they both use the same underlying protocol, or those protocols are compatible w.r.t timeouts as RDMA and TCP are. However if, for example, NFS is using TCP and MOUNT is using UDP then using the same timeout doesn't make much sense. If you mount -o vers=3,proto=tcp,mountproto=udp,timeo=600,retrans=5 \ server:/path /mountpoint then the timeo=600 which was intended for the NFS/TCP request will apply to the MOUNT/UDP requests with the result that there will only be one request sent (because UDP has a maximum timeout of 60 seconds). This is not what a reasonable person might expect. This patch disables the sharing of timeout information in cases where the underlying protocols are not compatible. Fixes: c9301cb35b59 ("nfs: hornor timeo and retrans option when mounting NFSv3") Signed-off-by: NeilBrown Signed-off-by: Anna Schumaker --- fs/nfs/super.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 9723b6c53397..ae5c5e39afa0 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -885,7 +885,15 @@ static int nfs_request_mount(struct fs_context *fc, * Now ask the mount server to map our export path * to a file handle. */ - status = nfs_mount(&request, ctx->timeo, ctx->retrans); + if ((request.protocol == XPRT_TRANSPORT_UDP) == + !(ctx->flags & NFS_MOUNT_TCP)) + /* + * NFS protocol and mount protocol are both UDP or neither UDP + * so timeouts are compatible. Use NFS timeouts for MOUNT + */ + status = nfs_mount(&request, ctx->timeo, ctx->retrans); + else + status = nfs_mount(&request, NFS_UNSPEC_TIMEO, NFS_UNSPEC_RETRANS); if (status != 0) { dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n", request.hostname, status); From dc270d7159699ad6d11decadfce9633f0f71c1db Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Fri, 25 Oct 2024 16:03:27 +0200 Subject: [PATCH 3/7] nfs: Fix KMSAN warning in decode_getfattr_attrs() Fix the following KMSAN warning: CPU: 1 UID: 0 PID: 7651 Comm: cp Tainted: G B Tainted: [B]=BAD_PAGE Hardware name: QEMU Standard PC (Q35 + ICH9, 2009) ===================================================== ===================================================== BUG: KMSAN: uninit-value in decode_getfattr_attrs+0x2d6d/0x2f90 decode_getfattr_attrs+0x2d6d/0x2f90 decode_getfattr_generic+0x806/0xb00 nfs4_xdr_dec_getattr+0x1de/0x240 rpcauth_unwrap_resp_decode+0xab/0x100 rpcauth_unwrap_resp+0x95/0xc0 call_decode+0x4ff/0xb50 __rpc_execute+0x57b/0x19d0 rpc_execute+0x368/0x5e0 rpc_run_task+0xcfe/0xee0 nfs4_proc_getattr+0x5b5/0x990 __nfs_revalidate_inode+0x477/0xd00 nfs_access_get_cached+0x1021/0x1cc0 nfs_do_access+0x9f/0xae0 nfs_permission+0x1e4/0x8c0 inode_permission+0x356/0x6c0 link_path_walk+0x958/0x1330 path_lookupat+0xce/0x6b0 filename_lookup+0x23e/0x770 vfs_statx+0xe7/0x970 vfs_fstatat+0x1f2/0x2c0 __se_sys_newfstatat+0x67/0x880 __x64_sys_newfstatat+0xbd/0x120 x64_sys_call+0x1826/0x3cf0 do_syscall_64+0xd0/0x1b0 entry_SYSCALL_64_after_hwframe+0x77/0x7f The KMSAN warning is triggered in decode_getfattr_attrs(), when calling decode_attr_mdsthreshold(). It appears that fattr->mdsthreshold is not initialized. Fix the issue by initializing fattr->mdsthreshold to NULL in nfs_fattr_init(). Cc: stable@vger.kernel.org # v3.5.x Fixes: 88034c3d88c2 ("NFSv4.1 mdsthreshold attribute xdr") Signed-off-by: Roberto Sassu Signed-off-by: Anna Schumaker --- fs/nfs/inode.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 542c7d97b235..1e71b029da58 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1633,6 +1633,7 @@ void nfs_fattr_init(struct nfs_fattr *fattr) fattr->gencount = nfs_inc_attr_generation_counter(); fattr->owner_name = NULL; fattr->group_name = NULL; + fattr->mdsthreshold = NULL; } EXPORT_SYMBOL_GPL(nfs_fattr_init); From d054c5eb2890633935c23c371f45fb2d6b3b4b64 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 23 Oct 2024 09:35:43 -0400 Subject: [PATCH 4/7] NFS: Fix attribute delegation behaviour on exclusive create When the client does an exclusive create and the server decides to store the verifier in the timestamps, a SETATTR is subsequently sent to fix up those timestamps. When that is the case, suppress the exceptions for attribute delegations in nfs4_bitmap_copy_adjust(). Fixes: 32215c1f893a ("NFSv4: Don't request atime/mtime/size if they are delegated to us") Signed-off-by: Trond Myklebust Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index cd2fbde2e6d7..9d40319e063d 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3452,6 +3452,10 @@ static int nfs4_do_setattr(struct inode *inode, const struct cred *cred, adjust_flags |= NFS_INO_INVALID_MODE; if (sattr->ia_valid & (ATTR_UID | ATTR_GID)) adjust_flags |= NFS_INO_INVALID_OTHER; + if (sattr->ia_valid & ATTR_ATIME) + adjust_flags |= NFS_INO_INVALID_ATIME; + if (sattr->ia_valid & ATTR_MTIME) + adjust_flags |= NFS_INO_INVALID_MTIME; do { nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, fattr->label), From 40f45ab3814f2aff1ddada629c910aad982fc8e1 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 23 Oct 2024 17:05:48 -0400 Subject: [PATCH 5/7] NFS: Further fixes to attribute delegation a/mtime changes When asked to set both an atime and an mtime to the current system time, ensure that the setting is atomic by calling inode_update_timestamps() only once with the appropriate flags. Fixes: e12912d94137 ("NFSv4: Add support for delegated atime and mtime attributes") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/inode.c | 49 +++++++++++++++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 18 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 1e71b029da58..9e884e1ce5d8 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -628,23 +628,35 @@ nfs_fattr_fixup_delegated(struct inode *inode, struct nfs_fattr *fattr) } } +static void nfs_update_timestamps(struct inode *inode, unsigned int ia_valid) +{ + enum file_time_flags time_flags = 0; + unsigned int cache_flags = 0; + + if (ia_valid & ATTR_MTIME) { + time_flags |= S_MTIME | S_CTIME; + cache_flags |= NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME; + } + if (ia_valid & ATTR_ATIME) { + time_flags |= S_ATIME; + cache_flags |= NFS_INO_INVALID_ATIME; + } + inode_update_timestamps(inode, time_flags); + NFS_I(inode)->cache_validity &= ~cache_flags; +} + void nfs_update_delegated_atime(struct inode *inode) { spin_lock(&inode->i_lock); - if (nfs_have_delegated_atime(inode)) { - inode_update_timestamps(inode, S_ATIME); - NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_ATIME; - } + if (nfs_have_delegated_atime(inode)) + nfs_update_timestamps(inode, ATTR_ATIME); spin_unlock(&inode->i_lock); } void nfs_update_delegated_mtime_locked(struct inode *inode) { - if (nfs_have_delegated_mtime(inode)) { - inode_update_timestamps(inode, S_CTIME | S_MTIME); - NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_CTIME | - NFS_INO_INVALID_MTIME); - } + if (nfs_have_delegated_mtime(inode)) + nfs_update_timestamps(inode, ATTR_MTIME); } void nfs_update_delegated_mtime(struct inode *inode) @@ -682,15 +694,16 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, attr->ia_valid &= ~ATTR_SIZE; } - if (nfs_have_delegated_mtime(inode)) { - if (attr->ia_valid & ATTR_MTIME) { - nfs_update_delegated_mtime(inode); - attr->ia_valid &= ~ATTR_MTIME; - } - if (attr->ia_valid & ATTR_ATIME) { - nfs_update_delegated_atime(inode); - attr->ia_valid &= ~ATTR_ATIME; - } + if (nfs_have_delegated_mtime(inode) && attr->ia_valid & ATTR_MTIME) { + spin_lock(&inode->i_lock); + nfs_update_timestamps(inode, attr->ia_valid); + spin_unlock(&inode->i_lock); + attr->ia_valid &= ~(ATTR_MTIME | ATTR_ATIME); + } else if (nfs_have_delegated_atime(inode) && + attr->ia_valid & ATTR_ATIME && + !(attr->ia_valid & ATTR_MTIME)) { + nfs_update_delegated_atime(inode); + attr->ia_valid &= ~ATTR_ATIME; } /* Optimization: if the end result is no change, don't RPC */ From bc2940869508b7b956a757a26d3b1ebf9546790e Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 23 Oct 2024 16:34:42 -0400 Subject: [PATCH 6/7] nfs_common: fix localio to cope with racing nfs_local_probe() Fix the possibility of racing nfs_local_probe() resulting in: list_add double add: new=ffff8b99707f9f58, prev=ffff8b99707f9f58, next=ffffffffc0f30000. ------------[ cut here ]------------ kernel BUG at lib/list_debug.c:35! Add nfs_uuid_init() to properly initialize all nfs_uuid_t members (particularly its list_head). Switch to returning bool from nfs_uuid_begin(), returns false if nfs_uuid_t is already in-use (its list_head is on a list). Update nfs_local_probe() to return early if the nfs_client's cl_uuid (nfs_uuid_t) is in-use. Also, switch nfs_uuid_begin() from using list_add_tail_rcu() to list_add_tail() -- rculist was used in an earlier version of the localio code that had a lockless nfs_uuid_lookup interface. Signed-off-by: Mike Snitzer Signed-off-by: Anna Schumaker --- fs/nfs/client.c | 3 +-- fs/nfs/localio.c | 3 ++- fs/nfs_common/nfslocalio.c | 23 ++++++++++++++++++----- include/linux/nfslocalio.h | 3 ++- 4 files changed, 23 insertions(+), 9 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 114282398716..03ecc7765615 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -181,8 +181,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) #if IS_ENABLED(CONFIG_NFS_LOCALIO) seqlock_init(&clp->cl_boot_lock); ktime_get_real_ts64(&clp->cl_nfssvc_boot); - clp->cl_uuid.net = NULL; - clp->cl_uuid.dom = NULL; + nfs_uuid_init(&clp->cl_uuid); spin_lock_init(&clp->cl_localio_lock); #endif /* CONFIG_NFS_LOCALIO */ diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index d0aa680ec816..8f0ce82a677e 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -205,7 +205,8 @@ void nfs_local_probe(struct nfs_client *clp) nfs_local_disable(clp); } - nfs_uuid_begin(&clp->cl_uuid); + if (!nfs_uuid_begin(&clp->cl_uuid)) + return; if (nfs_server_uuid_is_local(clp)) nfs_local_enable(clp); nfs_uuid_end(&clp->cl_uuid); diff --git a/fs/nfs_common/nfslocalio.c b/fs/nfs_common/nfslocalio.c index 5c8ce5066c16..09404d142d1a 100644 --- a/fs/nfs_common/nfslocalio.c +++ b/fs/nfs_common/nfslocalio.c @@ -5,7 +5,7 @@ */ #include -#include +#include #include #include @@ -20,15 +20,27 @@ static DEFINE_SPINLOCK(nfs_uuid_lock); */ static LIST_HEAD(nfs_uuids); -void nfs_uuid_begin(nfs_uuid_t *nfs_uuid) +void nfs_uuid_init(nfs_uuid_t *nfs_uuid) { nfs_uuid->net = NULL; nfs_uuid->dom = NULL; - uuid_gen(&nfs_uuid->uuid); + INIT_LIST_HEAD(&nfs_uuid->list); +} +EXPORT_SYMBOL_GPL(nfs_uuid_init); +bool nfs_uuid_begin(nfs_uuid_t *nfs_uuid) +{ spin_lock(&nfs_uuid_lock); - list_add_tail_rcu(&nfs_uuid->list, &nfs_uuids); + /* Is this nfs_uuid already in use? */ + if (!list_empty(&nfs_uuid->list)) { + spin_unlock(&nfs_uuid_lock); + return false; + } + uuid_gen(&nfs_uuid->uuid); + list_add_tail(&nfs_uuid->list, &nfs_uuids); spin_unlock(&nfs_uuid_lock); + + return true; } EXPORT_SYMBOL_GPL(nfs_uuid_begin); @@ -36,7 +48,8 @@ void nfs_uuid_end(nfs_uuid_t *nfs_uuid) { if (nfs_uuid->net == NULL) { spin_lock(&nfs_uuid_lock); - list_del_init(&nfs_uuid->list); + if (nfs_uuid->net == NULL) + list_del_init(&nfs_uuid->list); spin_unlock(&nfs_uuid_lock); } } diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h index b0dd9b1eef4f..3982fea79919 100644 --- a/include/linux/nfslocalio.h +++ b/include/linux/nfslocalio.h @@ -32,7 +32,8 @@ typedef struct { struct auth_domain *dom; /* auth_domain for localio */ } nfs_uuid_t; -void nfs_uuid_begin(nfs_uuid_t *); +void nfs_uuid_init(nfs_uuid_t *); +bool nfs_uuid_begin(nfs_uuid_t *); void nfs_uuid_end(nfs_uuid_t *); void nfs_uuid_is_local(const uuid_t *, struct list_head *, struct net *, struct auth_domain *, struct module *); From 867da60d463bb2a3e28c9235c487e56e96cffa00 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 18 Oct 2024 17:15:41 -0400 Subject: [PATCH 7/7] nfs: avoid i_lock contention in nfs_clear_invalid_mapping Multi-threaded buffered reads to the same file exposed significant inode spinlock contention in nfs_clear_invalid_mapping(). Eliminate this spinlock contention by checking flags without locking, instead using smp_rmb and smp_load_acquire accordingly, but then take spinlock and double-check these inode flags. Also refactor nfs_set_cache_invalid() slightly to use smp_store_release() to pair with nfs_clear_invalid_mapping()'s smp_load_acquire(). While this fix is beneficial for all multi-threaded buffered reads issued by an NFS client, this issue was identified in the context of surprisingly low LOCALIO performance with 4K multi-threaded buffered read IO. This fix dramatically speeds up LOCALIO performance: before: read: IOPS=1583k, BW=6182MiB/s (6482MB/s)(121GiB/20002msec) after: read: IOPS=3046k, BW=11.6GiB/s (12.5GB/s)(232GiB/20001msec) Fixes: 17dfeb911339 ("NFS: Fix races in nfs_revalidate_mapping") Signed-off-by: Mike Snitzer Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- fs/nfs/inode.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 9e884e1ce5d8..596f35170137 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -205,12 +205,15 @@ void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) nfs_fscache_invalidate(inode, 0); flags &= ~NFS_INO_REVAL_FORCED; - nfsi->cache_validity |= flags; + flags |= nfsi->cache_validity; + if (inode->i_mapping->nrpages == 0) + flags &= ~NFS_INO_INVALID_DATA; - if (inode->i_mapping->nrpages == 0) { - nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; - nfs_ooo_clear(nfsi); - } else if (nfsi->cache_validity & NFS_INO_INVALID_DATA) { + /* pairs with nfs_clear_invalid_mapping()'s smp_load_acquire() */ + smp_store_release(&nfsi->cache_validity, flags); + + if (inode->i_mapping->nrpages == 0 || + nfsi->cache_validity & NFS_INO_INVALID_DATA) { nfs_ooo_clear(nfsi); } trace_nfs_set_cache_invalid(inode, 0); @@ -1421,6 +1424,13 @@ int nfs_clear_invalid_mapping(struct address_space *mapping) TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); if (ret) goto out; + smp_rmb(); /* pairs with smp_wmb() below */ + if (test_bit(NFS_INO_INVALIDATING, bitlock)) + continue; + /* pairs with nfs_set_cache_invalid()'s smp_store_release() */ + if (!(smp_load_acquire(&nfsi->cache_validity) & NFS_INO_INVALID_DATA)) + goto out; + /* Slow-path that double-checks with spinlock held */ spin_lock(&inode->i_lock); if (test_bit(NFS_INO_INVALIDATING, bitlock)) { spin_unlock(&inode->i_lock);