From 65d284a38c0ded391ae04a29640c34221c934cd2 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Thu, 6 Jun 2024 18:41:57 +0200 Subject: [PATCH 1/7] ceph: use cap_wait_list only if debugfs is enabled Only debugfs uses this list. By omitting it, we save some memory and reduce lock contention on `caps_list_lock`. Signed-off-by: Max Kellermann Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 6 ++++++ fs/ceph/mds_client.c | 2 ++ fs/ceph/mds_client.h | 6 ++++++ 3 files changed, 14 insertions(+) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index c4941ba245ac..e98aa8219303 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -3067,10 +3067,13 @@ int __ceph_get_caps(struct inode *inode, struct ceph_file_info *fi, int need, flags, &_got); WARN_ON_ONCE(ret == -EAGAIN); if (!ret) { +#ifdef CONFIG_DEBUG_FS struct ceph_mds_client *mdsc = fsc->mdsc; struct cap_wait cw; +#endif DEFINE_WAIT_FUNC(wait, woken_wake_function); +#ifdef CONFIG_DEBUG_FS cw.ino = ceph_ino(inode); cw.tgid = current->tgid; cw.need = need; @@ -3079,6 +3082,7 @@ int __ceph_get_caps(struct inode *inode, struct ceph_file_info *fi, int need, spin_lock(&mdsc->caps_list_lock); list_add(&cw.list, &mdsc->cap_wait_list); spin_unlock(&mdsc->caps_list_lock); +#endif /* make sure used fmode not timeout */ ceph_get_fmode(ci, flags, FMODE_WAIT_BIAS); @@ -3097,9 +3101,11 @@ int __ceph_get_caps(struct inode *inode, struct ceph_file_info *fi, int need, remove_wait_queue(&ci->i_cap_wq, &wait); ceph_put_fmode(ci, flags, FMODE_WAIT_BIAS); +#ifdef CONFIG_DEBUG_FS spin_lock(&mdsc->caps_list_lock); list_del(&cw.list); spin_unlock(&mdsc->caps_list_lock); +#endif if (ret == -EAGAIN) continue; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index c2157f6e0c69..62238f3e6e19 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -5505,7 +5505,9 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work); mdsc->last_renew_caps = jiffies; INIT_LIST_HEAD(&mdsc->cap_delay_list); +#ifdef CONFIG_DEBUG_FS INIT_LIST_HEAD(&mdsc->cap_wait_list); +#endif spin_lock_init(&mdsc->cap_delay_lock); INIT_LIST_HEAD(&mdsc->cap_unlink_delay_list); INIT_LIST_HEAD(&mdsc->snap_flush_list); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index cfa18cf915a0..9bcc7f181bfe 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -416,6 +416,8 @@ struct ceph_quotarealm_inode { struct inode *inode; }; +#ifdef CONFIG_DEBUG_FS + struct cap_wait { struct list_head list; u64 ino; @@ -424,6 +426,8 @@ struct cap_wait { int want; }; +#endif + enum { CEPH_MDSC_STOPPING_BEGIN = 1, CEPH_MDSC_STOPPING_FLUSHING = 2, @@ -512,7 +516,9 @@ struct ceph_mds_client { spinlock_t caps_list_lock; struct list_head caps_list; /* unused (reserved or unreserved) */ +#ifdef CONFIG_DEBUG_FS struct list_head cap_wait_list; +#endif int caps_total_count; /* total caps allocated */ int caps_use_count; /* in use */ int caps_use_max; /* max used caps */ From 77bb4a501a7756e6f5428b72fb0e420deb7ae562 Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Tue, 9 Jul 2024 14:44:00 +0800 Subject: [PATCH 2/7] ceph: convert comma to semicolon in __ceph_dentry_dir_lease_touch() Replace a comma between expression statements by a semicolon. Signed-off-by: Chen Ni Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 82a2e2a06a65..e4e7a856e9c2 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1589,7 +1589,7 @@ void __ceph_dentry_dir_lease_touch(struct ceph_dentry_info *di) } spin_lock(&mdsc->dentry_list_lock); - __dentry_dir_lease_touch(mdsc, di), + __dentry_dir_lease_touch(mdsc, di); spin_unlock(&mdsc->dentry_list_lock); } From 578eb54c4a16713d99eed736660e30ae6eb1877f Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Wed, 10 Jul 2024 20:16:54 +0800 Subject: [PATCH 3/7] ceph: periodically flush the cap releases The MDS could be waiting the caps releases infinitely in some corner case and then reporting the caps revoke stuck warning. To fix this we should periodically flush the cap releases. Link: https://tracker.ceph.com/issues/57244 Signed-off-by: Xiubo Li Reviewed-by: Venky Shankar Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 62238f3e6e19..276e34ab3e2c 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -5446,6 +5446,8 @@ static void delayed_work(struct work_struct *work) } mutex_unlock(&mdsc->mutex); + ceph_flush_cap_releases(mdsc, s); + mutex_lock(&s->s_mutex); if (renew_caps) send_renew_caps(mdsc, s); From 03230edb0bd831662a7c08b6fef66b2a9a817774 Mon Sep 17 00:00:00 2001 From: ethanwu Date: Thu, 11 Jul 2024 14:47:56 +0800 Subject: [PATCH 4/7] ceph: fix incorrect kmalloc size of pagevec mempool The kmalloc size of pagevec mempool is incorrectly calculated. It misses the size of page pointer and only accounts the number for the array. Fixes: a0102bda5bc0 ("ceph: move sb->wb_pagevec_pool to be a global mempool") Signed-off-by: ethanwu Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 885cb5d4e771..0cdf84cd1791 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -961,7 +961,8 @@ static int __init init_caches(void) if (!ceph_mds_request_cachep) goto bad_mds_req; - ceph_wb_pagevec_pool = mempool_create_kmalloc_pool(10, CEPH_MAX_WRITE_SIZE >> PAGE_SHIFT); + ceph_wb_pagevec_pool = mempool_create_kmalloc_pool(10, + (CEPH_MAX_WRITE_SIZE >> PAGE_SHIFT) * sizeof(struct page *)); if (!ceph_wb_pagevec_pool) goto bad_pagevec_pool; From f5c466a0fdb2d9f3650d2e3911b0735f17ba00cf Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 23 Jul 2024 17:54:39 +0200 Subject: [PATCH 5/7] rbd: rename RBD_LOCK_STATE_RELEASING and releasing_wait ... to RBD_LOCK_STATE_QUIESCING and quiescing_wait to recognize that this state and the associated completion are backing rbd_quiesce_lock(), which isn't specific to releasing the lock. While exclusive lock does get quiesced before it's released, it also gets quiesced before an attempt to update the cookie is made and there the lock is not released as long as ceph_cls_set_cookie() succeeds. Signed-off-by: Ilya Dryomov Reviewed-by: Dongsheng Yang --- drivers/block/rbd.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 26ff5cd2bf0a..c30d227753d7 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -362,7 +362,7 @@ enum rbd_watch_state { enum rbd_lock_state { RBD_LOCK_STATE_UNLOCKED, RBD_LOCK_STATE_LOCKED, - RBD_LOCK_STATE_RELEASING, + RBD_LOCK_STATE_QUIESCING, }; /* WatchNotify::ClientId */ @@ -422,7 +422,7 @@ struct rbd_device { struct list_head running_list; struct completion acquire_wait; int acquire_err; - struct completion releasing_wait; + struct completion quiescing_wait; spinlock_t object_map_lock; u8 *object_map; @@ -525,7 +525,7 @@ static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev) lockdep_assert_held(&rbd_dev->lock_rwsem); return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED || - rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING; + rbd_dev->lock_state == RBD_LOCK_STATE_QUIESCING; } static bool rbd_is_lock_owner(struct rbd_device *rbd_dev) @@ -3458,12 +3458,12 @@ static void rbd_lock_del_request(struct rbd_img_request *img_req) spin_lock(&rbd_dev->lock_lists_lock); if (!list_empty(&img_req->lock_item)) { list_del_init(&img_req->lock_item); - need_wakeup = (rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING && + need_wakeup = (rbd_dev->lock_state == RBD_LOCK_STATE_QUIESCING && list_empty(&rbd_dev->running_list)); } spin_unlock(&rbd_dev->lock_lists_lock); if (need_wakeup) - complete(&rbd_dev->releasing_wait); + complete(&rbd_dev->quiescing_wait); } static int rbd_img_exclusive_lock(struct rbd_img_request *img_req) @@ -4181,16 +4181,16 @@ static bool rbd_quiesce_lock(struct rbd_device *rbd_dev) /* * Ensure that all in-flight IO is flushed. */ - rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING; - rbd_assert(!completion_done(&rbd_dev->releasing_wait)); + rbd_dev->lock_state = RBD_LOCK_STATE_QUIESCING; + rbd_assert(!completion_done(&rbd_dev->quiescing_wait)); if (list_empty(&rbd_dev->running_list)) return true; up_write(&rbd_dev->lock_rwsem); - wait_for_completion(&rbd_dev->releasing_wait); + wait_for_completion(&rbd_dev->quiescing_wait); down_write(&rbd_dev->lock_rwsem); - if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING) + if (rbd_dev->lock_state != RBD_LOCK_STATE_QUIESCING) return false; rbd_assert(list_empty(&rbd_dev->running_list)); @@ -5383,7 +5383,7 @@ static struct rbd_device *__rbd_dev_create(struct rbd_spec *spec) INIT_LIST_HEAD(&rbd_dev->acquiring_list); INIT_LIST_HEAD(&rbd_dev->running_list); init_completion(&rbd_dev->acquire_wait); - init_completion(&rbd_dev->releasing_wait); + init_completion(&rbd_dev->quiescing_wait); spin_lock_init(&rbd_dev->object_map_lock); From 2237ceb71f89837ac47c5dce2aaa2c2b3a337a3c Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 23 Jul 2024 18:07:59 +0200 Subject: [PATCH 6/7] rbd: don't assume RBD_LOCK_STATE_LOCKED for exclusive mappings Every time a watch is reestablished after getting lost, we need to update the cookie which involves quiescing exclusive lock. For this, we transition from RBD_LOCK_STATE_LOCKED to RBD_LOCK_STATE_QUIESCING roughly for the duration of rbd_reacquire_lock() call. If the mapping is exclusive and I/O happens to arrive in this time window, it's failed with EROFS (later translated to EIO) based on the wrong assumption in rbd_img_exclusive_lock() -- "lock got released?" check there stopped making sense with commit a2b1da09793d ("rbd: lock should be quiesced on reacquire"). To make it worse, any such I/O is added to the acquiring list before EROFS is returned and this sets up for violating rbd_lock_del_request() precondition that the request is either on the running list or not on any list at all -- see commit ded080c86b3f ("rbd: don't move requests to the running list on errors"). rbd_lock_del_request() ends up processing these requests as if they were on the running list which screws up quiescing_wait completion counter and ultimately leads to rbd_assert(!completion_done(&rbd_dev->quiescing_wait)); being triggered on the next watch error. Cc: stable@vger.kernel.org # 06ef84c4e9c4: rbd: rename RBD_LOCK_STATE_RELEASING and releasing_wait Cc: stable@vger.kernel.org Fixes: 637cd060537d ("rbd: new exclusive lock wait/wake code") Signed-off-by: Ilya Dryomov Reviewed-by: Dongsheng Yang --- drivers/block/rbd.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index c30d227753d7..ea6c592e015c 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3457,6 +3457,7 @@ static void rbd_lock_del_request(struct rbd_img_request *img_req) lockdep_assert_held(&rbd_dev->lock_rwsem); spin_lock(&rbd_dev->lock_lists_lock); if (!list_empty(&img_req->lock_item)) { + rbd_assert(!list_empty(&rbd_dev->running_list)); list_del_init(&img_req->lock_item); need_wakeup = (rbd_dev->lock_state == RBD_LOCK_STATE_QUIESCING && list_empty(&rbd_dev->running_list)); @@ -3476,11 +3477,6 @@ static int rbd_img_exclusive_lock(struct rbd_img_request *img_req) if (rbd_lock_add_request(img_req)) return 1; - if (rbd_dev->opts->exclusive) { - WARN_ON(1); /* lock got released? */ - return -EROFS; - } - /* * Note the use of mod_delayed_work() in rbd_acquire_lock() * and cancel_delayed_work() in wake_lock_waiters(). @@ -4601,6 +4597,10 @@ static void rbd_reacquire_lock(struct rbd_device *rbd_dev) rbd_warn(rbd_dev, "failed to update lock cookie: %d", ret); + if (rbd_dev->opts->exclusive) + rbd_warn(rbd_dev, + "temporarily releasing lock on exclusive mapping"); + /* * Lock cookie cannot be updated on older OSDs, so do * a manual release and queue an acquire. From 3ceccb14f5576e02b81cc8b105ab81f224bd87f6 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 23 Jul 2024 18:08:08 +0200 Subject: [PATCH 7/7] rbd: don't assume rbd_is_lock_owner() for exclusive mappings Expanding on the previous commit, assuming that rbd_is_lock_owner() always returns true (i.e. that we are either in RBD_LOCK_STATE_LOCKED or RBD_LOCK_STATE_QUIESCING) if the mapping is exclusive is wrong too. In case ceph_cls_set_cookie() fails, the lock would be temporarily released even if the mapping is exclusive, meaning that we can end up even in RBD_LOCK_STATE_UNLOCKED. IOW, exclusive mappings are really "just" about disabling automatic lock transitions (as documented in the man page), not about grabbing the lock and holding on to it whatever it takes. Cc: stable@vger.kernel.org Fixes: 637cd060537d ("rbd: new exclusive lock wait/wake code") Signed-off-by: Ilya Dryomov Reviewed-by: Dongsheng Yang --- drivers/block/rbd.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index ea6c592e015c..da22ce38c039 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -6589,11 +6589,6 @@ static int rbd_add_acquire_lock(struct rbd_device *rbd_dev) if (ret) return ret; - /* - * The lock may have been released by now, unless automatic lock - * transitions are disabled. - */ - rbd_assert(!rbd_dev->opts->exclusive || rbd_is_lock_owner(rbd_dev)); return 0; }