diff --git a/MAINTAINERS b/MAINTAINERS index 70e629dad6e4..8c84584b5a1e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3813,10 +3813,9 @@ F: Documentation/filesystems/befs.rst F: fs/befs/ BFQ I/O SCHEDULER -M: Paolo Valente -M: Jens Axboe +M: Yu Kuai L: linux-block@vger.kernel.org -S: Maintained +S: Odd Fixes F: Documentation/block/bfq-iosched.rst F: block/bfq-* diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index b758693697c0..e831aedb4643 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -679,12 +679,7 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); bfqg_and_blkg_put(old_parent); - if (entity->parent && - entity->parent->last_bfqq_created == bfqq) - entity->parent->last_bfqq_created = NULL; - else if (bfqd->last_bfqq_created == bfqq) - bfqd->last_bfqq_created = NULL; - + bfq_reassign_last_bfqq(bfqq, NULL); entity->parent = bfqg->my_entity; entity->sched_data = &bfqg->sched_data; /* pin down bfqg and its associated blkg */ @@ -741,7 +736,6 @@ static void bfq_sync_bfqq_move(struct bfq_data *bfqd, */ bfq_put_cooperator(sync_bfqq); bic_set_bfqq(bic, NULL, true, act_idx); - bfq_release_process_ref(bfqd, sync_bfqq); } } diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 36a4998c4b37..0747d9d0e48c 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2911,8 +2911,12 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[a_idx]; /* if a merge has already been setup, then proceed with that first */ - if (bfqq->new_bfqq) - return bfqq->new_bfqq; + new_bfqq = bfqq->new_bfqq; + if (new_bfqq) { + while (new_bfqq->new_bfqq) + new_bfqq = new_bfqq->new_bfqq; + return new_bfqq; + } /* * Check delayed stable merge for rotational or non-queueing @@ -3093,8 +3097,8 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) } -static void -bfq_reassign_last_bfqq(struct bfq_queue *cur_bfqq, struct bfq_queue *new_bfqq) +void bfq_reassign_last_bfqq(struct bfq_queue *cur_bfqq, + struct bfq_queue *new_bfqq) { if (cur_bfqq->entity.parent && cur_bfqq->entity.parent->last_bfqq_created == cur_bfqq) @@ -3125,10 +3129,12 @@ void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_put_queue(bfqq); } -static void -bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, - struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) +static struct bfq_queue *bfq_merge_bfqqs(struct bfq_data *bfqd, + struct bfq_io_cq *bic, + struct bfq_queue *bfqq) { + struct bfq_queue *new_bfqq = bfqq->new_bfqq; + bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", (unsigned long)new_bfqq->pid); /* Save weight raising and idle window of the merged queues */ @@ -3222,6 +3228,8 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, bfq_reassign_last_bfqq(bfqq, new_bfqq); bfq_release_process_ref(bfqd, bfqq); + + return new_bfqq; } static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, @@ -3257,14 +3265,8 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, * fulfilled, i.e., bic can be redirected to new_bfqq * and bfqq can be put. */ - bfq_merge_bfqqs(bfqd, bfqd->bio_bic, bfqq, - new_bfqq); - /* - * If we get here, bio will be queued into new_queue, - * so use new_bfqq to decide whether bio and rq can be - * merged. - */ - bfqq = new_bfqq; + while (bfqq != new_bfqq) + bfqq = bfq_merge_bfqqs(bfqd, bfqd->bio_bic, bfqq); /* * Change also bqfd->bio_bfqq, as @@ -5432,6 +5434,8 @@ void bfq_put_cooperator(struct bfq_queue *bfqq) bfq_put_queue(__bfqq); __bfqq = next; } + + bfq_release_process_ref(bfqq->bfqd, bfqq); } static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) @@ -5444,8 +5448,6 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref); bfq_put_cooperator(bfqq); - - bfq_release_process_ref(bfqd, bfqq); } static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync, @@ -5701,9 +5703,7 @@ bfq_do_early_stable_merge(struct bfq_data *bfqd, struct bfq_queue *bfqq, * state before killing it. */ bfqq->bic = bic; - bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); - - return new_bfqq; + return bfq_merge_bfqqs(bfqd, bic, bfqq); } /* @@ -6158,6 +6158,7 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) bool waiting, idle_timer_disabled = false; if (new_bfqq) { + struct bfq_queue *old_bfqq = bfqq; /* * Release the request's reference to the old bfqq * and make sure one is taken to the shared queue. @@ -6174,18 +6175,18 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) * new_bfqq. */ if (bic_to_bfqq(RQ_BIC(rq), true, - bfq_actuator_index(bfqd, rq->bio)) == bfqq) - bfq_merge_bfqqs(bfqd, RQ_BIC(rq), - bfqq, new_bfqq); + bfq_actuator_index(bfqd, rq->bio)) == bfqq) { + while (bfqq != new_bfqq) + bfqq = bfq_merge_bfqqs(bfqd, RQ_BIC(rq), bfqq); + } - bfq_clear_bfqq_just_created(bfqq); + bfq_clear_bfqq_just_created(old_bfqq); /* * rq is about to be enqueued into new_bfqq, * release rq reference on bfqq */ - bfq_put_queue(bfqq); + bfq_put_queue(old_bfqq); rq->elv.priv[1] = new_bfqq; - bfqq = new_bfqq; } bfq_update_io_thinktime(bfqd, bfqq); @@ -6723,7 +6724,7 @@ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) { bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); - if (bfqq_process_refs(bfqq) == 1) { + if (bfqq_process_refs(bfqq) == 1 && !bfqq->new_bfqq) { bfqq->pid = current->pid; bfq_clear_bfqq_coop(bfqq); bfq_clear_bfqq_split_coop(bfqq); @@ -6733,16 +6734,13 @@ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) bic_set_bfqq(bic, NULL, true, bfqq->actuator_idx); bfq_put_cooperator(bfqq); - - bfq_release_process_ref(bfqq->bfqd, bfqq); return NULL; } -static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, - struct bfq_io_cq *bic, - struct bio *bio, - bool split, bool is_sync, - bool *new_queue) +static struct bfq_queue * +__bfq_get_bfqq_handle_split(struct bfq_data *bfqd, struct bfq_io_cq *bic, + struct bio *bio, bool split, bool is_sync, + bool *new_queue) { unsigned int act_idx = bfq_actuator_index(bfqd, bio); struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync, act_idx); @@ -6821,6 +6819,84 @@ static void bfq_prepare_request(struct request *rq) rq->elv.priv[0] = rq->elv.priv[1] = NULL; } +static struct bfq_queue *bfq_waker_bfqq(struct bfq_queue *bfqq) +{ + struct bfq_queue *new_bfqq = bfqq->new_bfqq; + struct bfq_queue *waker_bfqq = bfqq->waker_bfqq; + + if (!waker_bfqq) + return NULL; + + while (new_bfqq) { + if (new_bfqq == waker_bfqq) { + /* + * If waker_bfqq is in the merge chain, and current + * is the only procress. + */ + if (bfqq_process_refs(waker_bfqq) == 1) + return NULL; + break; + } + + new_bfqq = new_bfqq->new_bfqq; + } + + return waker_bfqq; +} + +static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, + struct bfq_io_cq *bic, + struct bio *bio, + unsigned int idx, + bool is_sync) +{ + struct bfq_queue *waker_bfqq; + struct bfq_queue *bfqq; + bool new_queue = false; + + bfqq = __bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync, + &new_queue); + if (unlikely(new_queue)) + return bfqq; + + /* If the queue was seeky for too long, break it apart. */ + if (!bfq_bfqq_coop(bfqq) || !bfq_bfqq_split_coop(bfqq) || + bic->bfqq_data[idx].stably_merged) + return bfqq; + + waker_bfqq = bfq_waker_bfqq(bfqq); + + /* Update bic before losing reference to bfqq */ + if (bfq_bfqq_in_large_burst(bfqq)) + bic->bfqq_data[idx].saved_in_large_burst = true; + + bfqq = bfq_split_bfqq(bic, bfqq); + if (bfqq) { + bfq_bfqq_resume_state(bfqq, bfqd, bic, true); + return bfqq; + } + + bfqq = __bfq_get_bfqq_handle_split(bfqd, bic, bio, true, is_sync, NULL); + if (unlikely(bfqq == &bfqd->oom_bfqq)) + return bfqq; + + bfq_bfqq_resume_state(bfqq, bfqd, bic, false); + bfqq->waker_bfqq = waker_bfqq; + bfqq->tentative_waker_bfqq = NULL; + + /* + * If the waker queue disappears, then new_bfqq->waker_bfqq must be + * reset. So insert new_bfqq into the + * woken_list of the waker. See + * bfq_check_waker for details. + */ + if (waker_bfqq) + hlist_add_head(&bfqq->woken_list_node, + &bfqq->waker_bfqq->woken_list); + + return bfqq; +} + /* * If needed, init rq, allocate bfq data structures associated with * rq, and increment reference counters in the destination bfq_queue @@ -6852,8 +6928,6 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) struct bfq_io_cq *bic; const int is_sync = rq_is_sync(rq); struct bfq_queue *bfqq; - bool new_queue = false; - bool bfqq_already_existing = false, split = false; unsigned int a_idx = bfq_actuator_index(bfqd, bio); if (unlikely(!rq->elv.icq)) @@ -6870,54 +6944,9 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) return RQ_BFQQ(rq); bic = icq_to_bic(rq->elv.icq); - bfq_check_ioprio_change(bic, bio); - bfq_bic_update_cgroup(bic, bio); - - bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync, - &new_queue); - - if (likely(!new_queue)) { - /* If the queue was seeky for too long, break it apart. */ - if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq) && - !bic->bfqq_data[a_idx].stably_merged) { - struct bfq_queue *old_bfqq = bfqq; - - /* Update bic before losing reference to bfqq */ - if (bfq_bfqq_in_large_burst(bfqq)) - bic->bfqq_data[a_idx].saved_in_large_burst = - true; - - bfqq = bfq_split_bfqq(bic, bfqq); - split = true; - - if (!bfqq) { - bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, - true, is_sync, - NULL); - if (unlikely(bfqq == &bfqd->oom_bfqq)) - bfqq_already_existing = true; - } else - bfqq_already_existing = true; - - if (!bfqq_already_existing) { - bfqq->waker_bfqq = old_bfqq->waker_bfqq; - bfqq->tentative_waker_bfqq = NULL; - - /* - * If the waker queue disappears, then - * new_bfqq->waker_bfqq must be - * reset. So insert new_bfqq into the - * woken_list of the waker. See - * bfq_check_waker for details. - */ - if (bfqq->waker_bfqq) - hlist_add_head(&bfqq->woken_list_node, - &bfqq->waker_bfqq->woken_list); - } - } - } + bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, a_idx, is_sync); bfqq_request_allocated(bfqq); bfqq->ref++; @@ -6934,18 +6963,9 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) * addition, if the queue has also just been split, we have to * resume its state. */ - if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { + if (likely(bfqq != &bfqd->oom_bfqq) && !bfqq->new_bfqq && + bfqq_process_refs(bfqq) == 1) bfqq->bic = bic; - if (split) { - /* - * The queue has just been split from a shared - * queue: restore the idle window and the - * possible weight raising period. - */ - bfq_bfqq_resume_state(bfqq, bfqd, bic, - bfqq_already_existing); - } - } /* * Consider bfqq as possibly belonging to a burst of newly diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 08ddf2cfae5b..687a3a7ba784 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -1156,6 +1156,8 @@ void bfq_del_bfqq_busy(struct bfq_queue *bfqq, bool expiration); void bfq_add_bfqq_busy(struct bfq_queue *bfqq); void bfq_add_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq); void bfq_del_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq); +void bfq_reassign_last_bfqq(struct bfq_queue *cur_bfqq, + struct bfq_queue *new_bfqq); /* --------------- end of interface of B-WF2Q+ ---------------- */ @@ -1183,11 +1185,6 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq); "%s " fmt, pid_str, ##args); \ } while (0) -#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ - blk_add_cgroup_trace_msg((bfqd)->queue, \ - &bfqg_to_blkg(bfqg)->blkcg->css, fmt, ##args); \ -} while (0) - #else /* CONFIG_BFQ_GROUP_IOSCHED */ #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ @@ -1197,7 +1194,6 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq); bfq_bfqq_name((bfqq), pid_str, MAX_BFQQ_NAME_LENGTH); \ blk_add_trace_msg((bfqd)->queue, "%s " fmt, pid_str, ##args); \ } while (0) -#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) #endif /* CONFIG_BFQ_GROUP_IOSCHED */ diff --git a/block/bio.c b/block/bio.c index 8b827a194c22..5ff0b66e47a4 100644 --- a/block/bio.c +++ b/block/bio.c @@ -934,7 +934,8 @@ static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page, if (!zone_device_pages_have_same_pgmap(bv->bv_page, page)) return false; - *same_page = ((vec_end_addr & PAGE_MASK) == page_addr); + *same_page = ((vec_end_addr & PAGE_MASK) == ((page_addr + off) & + PAGE_MASK)); if (!*same_page) { if (IS_ENABLED(CONFIG_KMSAN)) return false; @@ -1019,6 +1020,29 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, return len; } +/** + * bio_add_hw_folio - attempt to add a folio to a bio with hw constraints + * @q: the target queue + * @bio: destination bio + * @folio: folio to add + * @len: vec entry length + * @offset: vec entry offset in the folio + * @max_sectors: maximum number of sectors that can be added + * @same_page: return if the segment has been merged inside the same folio + * + * Add a folio to a bio while respecting the hardware max_sectors, max_segment + * and gap limitations. + */ +int bio_add_hw_folio(struct request_queue *q, struct bio *bio, + struct folio *folio, size_t len, size_t offset, + unsigned int max_sectors, bool *same_page) +{ + if (len > UINT_MAX || offset > UINT_MAX) + return 0; + return bio_add_hw_page(q, bio, folio_page(folio, 0), len, offset, + max_sectors, same_page); +} + /** * bio_add_pc_page - attempt to add page to passthrough bio * @q: the target queue @@ -1169,7 +1193,6 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty) struct folio_iter fi; bio_for_each_folio_all(fi, bio) { - struct page *page; size_t nr_pages; if (mark_dirty) { @@ -1177,12 +1200,9 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty) folio_mark_dirty(fi.folio); folio_unlock(fi.folio); } - page = folio_page(fi.folio, fi.offset / PAGE_SIZE); nr_pages = (fi.offset + fi.length - 1) / PAGE_SIZE - fi.offset / PAGE_SIZE + 1; - do { - bio_release_page(bio, page++); - } while (--nr_pages != 0); + unpin_user_folio(fi.folio, nr_pages); } } EXPORT_SYMBOL_GPL(__bio_release_pages); @@ -1207,8 +1227,8 @@ void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) bio_set_flag(bio, BIO_CLONED); } -static int bio_iov_add_page(struct bio *bio, struct page *page, - unsigned int len, unsigned int offset) +static int bio_iov_add_folio(struct bio *bio, struct folio *folio, size_t len, + size_t offset) { bool same_page = false; @@ -1217,30 +1237,61 @@ static int bio_iov_add_page(struct bio *bio, struct page *page, if (bio->bi_vcnt > 0 && bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1], - page, len, offset, &same_page)) { + folio_page(folio, 0), len, offset, + &same_page)) { bio->bi_iter.bi_size += len; - if (same_page) - bio_release_page(bio, page); + if (same_page && bio_flagged(bio, BIO_PAGE_PINNED)) + unpin_user_folio(folio, 1); return 0; } - __bio_add_page(bio, page, len, offset); + bio_add_folio_nofail(bio, folio, len, offset); return 0; } -static int bio_iov_add_zone_append_page(struct bio *bio, struct page *page, - unsigned int len, unsigned int offset) +static int bio_iov_add_zone_append_folio(struct bio *bio, struct folio *folio, + size_t len, size_t offset) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); bool same_page = false; - if (bio_add_hw_page(q, bio, page, len, offset, + if (bio_add_hw_folio(q, bio, folio, len, offset, queue_max_zone_append_sectors(q), &same_page) != len) return -EINVAL; - if (same_page) - bio_release_page(bio, page); + if (same_page && bio_flagged(bio, BIO_PAGE_PINNED)) + unpin_user_folio(folio, 1); return 0; } +static unsigned int get_contig_folio_len(unsigned int *num_pages, + struct page **pages, unsigned int i, + struct folio *folio, size_t left, + size_t offset) +{ + size_t bytes = left; + size_t contig_sz = min_t(size_t, PAGE_SIZE - offset, bytes); + unsigned int j; + + /* + * We might COW a single page in the middle of + * a large folio, so we have to check that all + * pages belong to the same folio. + */ + bytes -= contig_sz; + for (j = i + 1; j < i + *num_pages; j++) { + size_t next = min_t(size_t, PAGE_SIZE, bytes); + + if (page_folio(pages[j]) != folio || + pages[j] != pages[j - 1] + 1) { + break; + } + contig_sz += next; + bytes -= next; + } + *num_pages = j - i; + + return contig_sz; +} + #define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *)) /** @@ -1260,9 +1311,9 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt; struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; struct page **pages = (struct page **)bv; - ssize_t size, left; - unsigned len, i = 0; - size_t offset; + ssize_t size; + unsigned int num_pages, i = 0; + size_t offset, folio_offset, left, len; int ret = 0; /* @@ -1302,17 +1353,28 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) goto out; } - for (left = size, i = 0; left > 0; left -= len, i++) { + for (left = size, i = 0; left > 0; left -= len, i += num_pages) { struct page *page = pages[i]; + struct folio *folio = page_folio(page); + + folio_offset = ((size_t)folio_page_idx(folio, page) << + PAGE_SHIFT) + offset; + + len = min(folio_size(folio) - folio_offset, left); + + num_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); + + if (num_pages > 1) + len = get_contig_folio_len(&num_pages, pages, i, + folio, left, offset); - len = min_t(size_t, PAGE_SIZE - offset, left); if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - ret = bio_iov_add_zone_append_page(bio, page, len, - offset); + ret = bio_iov_add_zone_append_folio(bio, folio, len, + folio_offset); if (ret) break; } else - bio_iov_add_page(bio, page, len, offset); + bio_iov_add_folio(bio, folio, len, folio_offset); offset = 0; } diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 69e70964398c..e68c725cf8d9 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1458,7 +1458,6 @@ int blkcg_init_disk(struct gendisk *disk) struct request_queue *q = disk->queue; struct blkcg_gq *new_blkg, *blkg; bool preloaded; - int ret; new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL); if (!new_blkg) @@ -1478,15 +1477,8 @@ int blkcg_init_disk(struct gendisk *disk) if (preloaded) radix_tree_preload_end(); - ret = blk_ioprio_init(disk); - if (ret) - goto err_destroy_all; - return 0; -err_destroy_all: - blkg_destroy_all(disk); - return ret; err_unlock: spin_unlock_irq(&q->queue_lock); if (preloaded) @@ -1554,6 +1546,14 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol) if (blkcg_policy_enabled(q, pol)) return 0; + /* + * Policy is allowed to be registered without pd_alloc_fn/pd_free_fn, + * for example, ioprio. Such policy will work on blkcg level, not disk + * level, and don't need to be activated. + */ + if (WARN_ON_ONCE(!pol->pd_alloc_fn || !pol->pd_free_fn)) + return -EINVAL; + if (queue_is_mq(q)) blk_mq_freeze_queue(q); retry: @@ -1733,9 +1733,12 @@ int blkcg_policy_register(struct blkcg_policy *pol) goto err_unlock; } - /* Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs */ + /* + * Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs, and policy + * without pd_alloc_fn/pd_free_fn can't be activated. + */ if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) || - (!pol->pd_alloc_fn ^ !pol->pd_free_fn)) + (!pol->pd_alloc_fn ^ !pol->pd_free_fn)) goto err_unlock; /* register @pol */ diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 864fad4a850b..b9e3265c1eb3 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -485,7 +485,6 @@ static inline void blkcg_deactivate_policy(struct gendisk *disk, static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, struct blkcg_policy *pol) { return NULL; } static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } -static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; } static inline void blkg_get(struct blkcg_gq *blkg) { } static inline void blkg_put(struct blkcg_gq *blkg) { } static inline void blkcg_bio_issue_init(struct bio *bio) { } diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 690ca99dfaca..9dc9323f84ac 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -648,7 +648,7 @@ static const struct ioc_params autop[] = { * vrate adjust percentages indexed by ioc->busy_level. We adjust up on * vtime credit shortage and down on device saturation. */ -static u32 vrate_adj_pct[] = +static const u32 vrate_adj_pct[] = { 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, @@ -2076,7 +2076,7 @@ static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors, struct ioc_now *now) { struct ioc_gq *iocg; - u64 dur, usage_pct, nr_cycles; + u64 dur, usage_pct, nr_cycles, nr_cycles_shift; /* if no debtor, reset the cycle */ if (!nr_debtors) { @@ -2138,10 +2138,12 @@ static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors, old_debt = iocg->abs_vdebt; old_delay = iocg->delay; + nr_cycles_shift = min_t(u64, nr_cycles, BITS_PER_LONG - 1); if (iocg->abs_vdebt) - iocg->abs_vdebt = iocg->abs_vdebt >> nr_cycles ?: 1; + iocg->abs_vdebt = iocg->abs_vdebt >> nr_cycles_shift ?: 1; + if (iocg->delay) - iocg->delay = iocg->delay >> nr_cycles ?: 1; + iocg->delay = iocg->delay >> nr_cycles_shift ?: 1; iocg_kick_waitq(iocg, true, now); diff --git a/block/blk-ioprio.c b/block/blk-ioprio.c index 4051fada01f1..8fff7ccc0ac7 100644 --- a/block/blk-ioprio.c +++ b/block/blk-ioprio.c @@ -49,14 +49,6 @@ static const char *policy_name[] = { static struct blkcg_policy ioprio_policy; -/** - * struct ioprio_blkg - Per (cgroup, request queue) data. - * @pd: blkg_policy_data structure. - */ -struct ioprio_blkg { - struct blkg_policy_data pd; -}; - /** * struct ioprio_blkcg - Per cgroup data. * @cpd: blkcg_policy_data structure. @@ -67,11 +59,6 @@ struct ioprio_blkcg { enum prio_policy prio_policy; }; -static inline struct ioprio_blkg *pd_to_ioprio(struct blkg_policy_data *pd) -{ - return pd ? container_of(pd, struct ioprio_blkg, pd) : NULL; -} - static struct ioprio_blkcg *blkcg_to_ioprio_blkcg(struct blkcg *blkcg) { return container_of(blkcg_to_cpd(blkcg, &ioprio_policy), @@ -84,16 +71,6 @@ ioprio_blkcg_from_css(struct cgroup_subsys_state *css) return blkcg_to_ioprio_blkcg(css_to_blkcg(css)); } -static struct ioprio_blkcg *ioprio_blkcg_from_bio(struct bio *bio) -{ - struct blkg_policy_data *pd = blkg_to_pd(bio->bi_blkg, &ioprio_policy); - - if (!pd) - return NULL; - - return blkcg_to_ioprio_blkcg(pd->blkg->blkcg); -} - static int ioprio_show_prio_policy(struct seq_file *sf, void *v) { struct ioprio_blkcg *blkcg = ioprio_blkcg_from_css(seq_css(sf)); @@ -118,25 +95,6 @@ static ssize_t ioprio_set_prio_policy(struct kernfs_open_file *of, char *buf, return nbytes; } -static struct blkg_policy_data * -ioprio_alloc_pd(struct gendisk *disk, struct blkcg *blkcg, gfp_t gfp) -{ - struct ioprio_blkg *ioprio_blkg; - - ioprio_blkg = kzalloc(sizeof(*ioprio_blkg), gfp); - if (!ioprio_blkg) - return NULL; - - return &ioprio_blkg->pd; -} - -static void ioprio_free_pd(struct blkg_policy_data *pd) -{ - struct ioprio_blkg *ioprio_blkg = pd_to_ioprio(pd); - - kfree(ioprio_blkg); -} - static struct blkcg_policy_data *ioprio_alloc_cpd(gfp_t gfp) { struct ioprio_blkcg *blkcg; @@ -179,14 +137,11 @@ static struct blkcg_policy ioprio_policy = { .cpd_alloc_fn = ioprio_alloc_cpd, .cpd_free_fn = ioprio_free_cpd, - - .pd_alloc_fn = ioprio_alloc_pd, - .pd_free_fn = ioprio_free_pd, }; void blkcg_set_ioprio(struct bio *bio) { - struct ioprio_blkcg *blkcg = ioprio_blkcg_from_bio(bio); + struct ioprio_blkcg *blkcg = blkcg_to_ioprio_blkcg(bio->bi_blkg->blkcg); u16 prio; if (!blkcg || blkcg->prio_policy == POLICY_NO_CHANGE) @@ -219,16 +174,6 @@ void blkcg_set_ioprio(struct bio *bio) bio->bi_ioprio = prio; } -void blk_ioprio_exit(struct gendisk *disk) -{ - blkcg_deactivate_policy(disk, &ioprio_policy); -} - -int blk_ioprio_init(struct gendisk *disk) -{ - return blkcg_activate_policy(disk, &ioprio_policy); -} - static int __init ioprio_init(void) { return blkcg_policy_register(&ioprio_policy); diff --git a/block/blk-ioprio.h b/block/blk-ioprio.h index b6afb8e80de0..9265143f9bc9 100644 --- a/block/blk-ioprio.h +++ b/block/blk-ioprio.h @@ -9,17 +9,8 @@ struct request_queue; struct bio; #ifdef CONFIG_BLK_CGROUP_IOPRIO -int blk_ioprio_init(struct gendisk *disk); -void blk_ioprio_exit(struct gendisk *disk); void blkcg_set_ioprio(struct bio *bio); #else -static inline int blk_ioprio_init(struct gendisk *disk) -{ - return 0; -} -static inline void blk_ioprio_exit(struct gendisk *disk) -{ -} static inline void blkcg_set_ioprio(struct bio *bio) { } diff --git a/block/blk-merge.c b/block/blk-merge.c index de5281bcadc5..56769c4bcd79 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -105,9 +105,33 @@ static unsigned int bio_allowed_max_sectors(const struct queue_limits *lim) return round_down(UINT_MAX, lim->logical_block_size) >> SECTOR_SHIFT; } -static struct bio *bio_split_discard(struct bio *bio, - const struct queue_limits *lim, - unsigned *nsegs, struct bio_set *bs) +static struct bio *bio_submit_split(struct bio *bio, int split_sectors) +{ + if (unlikely(split_sectors < 0)) { + bio->bi_status = errno_to_blk_status(split_sectors); + bio_endio(bio); + return NULL; + } + + if (split_sectors) { + struct bio *split; + + split = bio_split(bio, split_sectors, GFP_NOIO, + &bio->bi_bdev->bd_disk->bio_split); + split->bi_opf |= REQ_NOMERGE; + blkcg_bio_issue_init(split); + bio_chain(split, bio); + trace_block_split(split, bio->bi_iter.bi_sector); + WARN_ON_ONCE(bio_zone_write_plugging(bio)); + submit_bio_noacct(bio); + return split; + } + + return bio; +} + +struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim, + unsigned *nsegs) { unsigned int max_discard_sectors, granularity; sector_t tmp; @@ -121,10 +145,10 @@ static struct bio *bio_split_discard(struct bio *bio, min(lim->max_discard_sectors, bio_allowed_max_sectors(lim)); max_discard_sectors -= max_discard_sectors % granularity; if (unlikely(!max_discard_sectors)) - return NULL; + return bio; if (bio_sectors(bio) <= max_discard_sectors) - return NULL; + return bio; split_sectors = max_discard_sectors; @@ -139,19 +163,18 @@ static struct bio *bio_split_discard(struct bio *bio, if (split_sectors > tmp) split_sectors -= tmp; - return bio_split(bio, split_sectors, GFP_NOIO, bs); + return bio_submit_split(bio, split_sectors); } -static struct bio *bio_split_write_zeroes(struct bio *bio, - const struct queue_limits *lim, - unsigned *nsegs, struct bio_set *bs) +struct bio *bio_split_write_zeroes(struct bio *bio, + const struct queue_limits *lim, unsigned *nsegs) { *nsegs = 0; if (!lim->max_write_zeroes_sectors) - return NULL; + return bio; if (bio_sectors(bio) <= lim->max_write_zeroes_sectors) - return NULL; - return bio_split(bio, lim->max_write_zeroes_sectors, GFP_NOIO, bs); + return bio; + return bio_submit_split(bio, lim->max_write_zeroes_sectors); } static inline unsigned int blk_boundary_sectors(const struct queue_limits *lim, @@ -274,27 +297,19 @@ static bool bvec_split_segs(const struct queue_limits *lim, } /** - * bio_split_rw - split a bio in two bios + * bio_split_rw_at - check if and where to split a read/write bio * @bio: [in] bio to be split * @lim: [in] queue limits to split based on * @segs: [out] number of segments in the bio with the first half of the sectors - * @bs: [in] bio set to allocate the clone from * @max_bytes: [in] maximum number of bytes per bio * - * Clone @bio, update the bi_iter of the clone to represent the first sectors - * of @bio and update @bio->bi_iter to represent the remaining sectors. The - * following is guaranteed for the cloned bio: - * - That it has at most @max_bytes worth of data - * - That it has at most queue_max_segments(@q) segments. - * - * Except for discard requests the cloned bio will point at the bi_io_vec of - * the original bio. It is the responsibility of the caller to ensure that the - * original bio is not freed before the cloned bio. The caller is also - * responsible for ensuring that @bs is only destroyed after processing of the - * split bio has finished. + * Find out if @bio needs to be split to fit the queue limits in @lim and a + * maximum size of @max_bytes. Returns a negative error number if @bio can't be + * split, 0 if the bio doesn't have to be split, or a positive sector offset if + * @bio needs to be split. */ -struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, - unsigned *segs, struct bio_set *bs, unsigned max_bytes) +int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim, + unsigned *segs, unsigned max_bytes) { struct bio_vec bv, bvprv, *bvprvp = NULL; struct bvec_iter iter; @@ -324,22 +339,17 @@ struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, } *segs = nsegs; - return NULL; + return 0; split: - if (bio->bi_opf & REQ_ATOMIC) { - bio->bi_status = BLK_STS_INVAL; - bio_endio(bio); - return ERR_PTR(-EINVAL); - } + if (bio->bi_opf & REQ_ATOMIC) + return -EINVAL; + /* * We can't sanely support splitting for a REQ_NOWAIT bio. End it * with EAGAIN if splitting is required and return an error pointer. */ - if (bio->bi_opf & REQ_NOWAIT) { - bio->bi_status = BLK_STS_AGAIN; - bio_endio(bio); - return ERR_PTR(-EAGAIN); - } + if (bio->bi_opf & REQ_NOWAIT) + return -EAGAIN; *segs = nsegs; @@ -356,58 +366,36 @@ split: * big IO can be trival, disable iopoll when split needed. */ bio_clear_polled(bio); - return bio_split(bio, bytes >> SECTOR_SHIFT, GFP_NOIO, bs); + return bytes >> SECTOR_SHIFT; } -EXPORT_SYMBOL_GPL(bio_split_rw); +EXPORT_SYMBOL_GPL(bio_split_rw_at); -/** - * __bio_split_to_limits - split a bio to fit the queue limits - * @bio: bio to be split - * @lim: queue limits to split based on - * @nr_segs: returns the number of segments in the returned bio - * - * Check if @bio needs splitting based on the queue limits, and if so split off - * a bio fitting the limits from the beginning of @bio and return it. @bio is - * shortened to the remainder and re-submitted. - * - * The split bio is allocated from @q->bio_split, which is provided by the - * block layer. - */ -struct bio *__bio_split_to_limits(struct bio *bio, - const struct queue_limits *lim, - unsigned int *nr_segs) +struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, + unsigned *nr_segs) { - struct bio_set *bs = &bio->bi_bdev->bd_disk->bio_split; - struct bio *split; + return bio_submit_split(bio, + bio_split_rw_at(bio, lim, nr_segs, + get_max_io_size(bio, lim) << SECTOR_SHIFT)); +} - switch (bio_op(bio)) { - case REQ_OP_DISCARD: - case REQ_OP_SECURE_ERASE: - split = bio_split_discard(bio, lim, nr_segs, bs); - break; - case REQ_OP_WRITE_ZEROES: - split = bio_split_write_zeroes(bio, lim, nr_segs, bs); - break; - default: - split = bio_split_rw(bio, lim, nr_segs, bs, - get_max_io_size(bio, lim) << SECTOR_SHIFT); - if (IS_ERR(split)) - return NULL; - break; - } +/* + * REQ_OP_ZONE_APPEND bios must never be split by the block layer. + * + * But we want the nr_segs calculation provided by bio_split_rw_at, and having + * a good sanity check that the submitter built the bio correctly is nice to + * have as well. + */ +struct bio *bio_split_zone_append(struct bio *bio, + const struct queue_limits *lim, unsigned *nr_segs) +{ + unsigned int max_sectors = queue_limits_max_zone_append_sectors(lim); + int split_sectors; - if (split) { - /* there isn't chance to merge the split bio */ - split->bi_opf |= REQ_NOMERGE; - - blkcg_bio_issue_init(split); - bio_chain(split, bio); - trace_block_split(split, bio->bi_iter.bi_sector); - WARN_ON_ONCE(bio_zone_write_plugging(bio)); - submit_bio_noacct(bio); - return split; - } - return bio; + split_sectors = bio_split_rw_at(bio, lim, nr_segs, + max_sectors << SECTOR_SHIFT); + if (WARN_ON_ONCE(split_sectors > 0)) + split_sectors = -EINVAL; + return bio_submit_split(bio, split_sectors); } /** @@ -426,9 +414,7 @@ struct bio *bio_split_to_limits(struct bio *bio) const struct queue_limits *lim = &bdev_get_queue(bio->bi_bdev)->limits; unsigned int nr_segs; - if (bio_may_exceed_limits(bio, lim)) - return __bio_split_to_limits(bio, lim, &nr_segs); - return bio; + return __bio_split_to_limits(bio, lim, &nr_segs); } EXPORT_SYMBOL(bio_split_to_limits); diff --git a/block/blk-mq.c b/block/blk-mq.c index e3c3c0c21b55..3f1f7d0b3ff3 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2753,6 +2753,7 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched) void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) { struct request *rq; + unsigned int depth; /* * We may have been called recursively midway through handling @@ -2763,6 +2764,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) */ if (plug->rq_count == 0) return; + depth = plug->rq_count; plug->rq_count = 0; if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) { @@ -2770,6 +2772,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) rq = rq_list_peek(&plug->mq_list); q = rq->q; + trace_block_unplug(q, depth, true); /* * Peek first request and see if we have a ->queue_rqs() hook. @@ -2939,7 +2942,7 @@ void blk_mq_submit_bio(struct bio *bio) struct blk_plug *plug = current->plug; const int is_sync = op_is_sync(bio->bi_opf); struct blk_mq_hw_ctx *hctx; - unsigned int nr_segs = 1; + unsigned int nr_segs; struct request *rq; blk_status_t ret; @@ -2981,11 +2984,10 @@ void blk_mq_submit_bio(struct bio *bio) goto queue_exit; } - if (unlikely(bio_may_exceed_limits(bio, &q->limits))) { - bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); - if (!bio) - goto queue_exit; - } + bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); + if (!bio) + goto queue_exit; + if (!bio_integrity_prep(bio)) goto queue_exit; diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index dd7310c94713..2cfb297d9a62 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -263,7 +263,7 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, has_sleeper = !prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE); do { - /* The memory barrier in set_task_state saves us here. */ + /* The memory barrier in set_current_state saves us here. */ if (data.got_token) break; if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) { diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 6943ec720f39..2c4192e12efa 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1584,6 +1584,22 @@ void blk_throtl_cancel_bios(struct gendisk *disk) spin_unlock_irq(&q->queue_lock); } +static bool tg_within_limit(struct throtl_grp *tg, struct bio *bio, bool rw) +{ + /* throtl is FIFO - if bios are already queued, should queue */ + if (tg->service_queue.nr_queued[rw]) + return false; + + return tg_may_dispatch(tg, bio, NULL); +} + +static void tg_dispatch_in_debt(struct throtl_grp *tg, struct bio *bio, bool rw) +{ + if (!bio_flagged(bio, BIO_BPS_THROTTLED)) + tg->carryover_bytes[rw] -= throtl_bio_data_size(bio); + tg->carryover_ios[rw]--; +} + bool __blk_throtl_bio(struct bio *bio) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); @@ -1600,34 +1616,35 @@ bool __blk_throtl_bio(struct bio *bio) sq = &tg->service_queue; while (true) { - if (tg->last_low_overflow_time[rw] == 0) - tg->last_low_overflow_time[rw] = jiffies; - /* throtl is FIFO - if bios are already queued, should queue */ - if (sq->nr_queued[rw]) - break; + if (tg_within_limit(tg, bio, rw)) { + /* within limits, let's charge and dispatch directly */ + throtl_charge_bio(tg, bio); - /* if above limits, break to queue */ - if (!tg_may_dispatch(tg, bio, NULL)) { - tg->last_low_overflow_time[rw] = jiffies; + /* + * We need to trim slice even when bios are not being + * queued otherwise it might happen that a bio is not + * queued for a long time and slice keeps on extending + * and trim is not called for a long time. Now if limits + * are reduced suddenly we take into account all the IO + * dispatched so far at new low rate and * newly queued + * IO gets a really long dispatch time. + * + * So keep on trimming slice even if bio is not queued. + */ + throtl_trim_slice(tg, rw); + } else if (bio_issue_as_root_blkg(bio)) { + /* + * IOs which may cause priority inversions are + * dispatched directly, even if they're over limit. + * Debts are handled by carryover_bytes/ios while + * calculating wait time. + */ + tg_dispatch_in_debt(tg, bio, rw); + } else { + /* if above limits, break to queue */ break; } - /* within limits, let's charge and dispatch directly */ - throtl_charge_bio(tg, bio); - - /* - * We need to trim slice even when bios are not being queued - * otherwise it might happen that a bio is not queued for - * a long time and slice keeps on extending and trim is not - * called for a long time. Now if limits are reduced suddenly - * we take into account all the IO dispatched so far at new - * low rate and * newly queued IO gets a really long dispatch - * time. - * - * So keep on trimming slice even if bio is not queued. - */ - throtl_trim_slice(tg, rw); - /* * @bio passed through this layer without being throttled. * Climb up the ladder. If we're already at the top, it @@ -1650,8 +1667,6 @@ bool __blk_throtl_bio(struct bio *bio) tg->io_disp[rw], tg_iops_limit(tg, rw), sq->nr_queued[READ], sq->nr_queued[WRITE]); - tg->last_low_overflow_time[rw] = jiffies; - td->nr_queued[rw]++; throtl_add_bio_tg(bio, qn, tg); throttled = true; diff --git a/block/blk-throttle.h b/block/blk-throttle.h index 4d9ef5abdf21..1a36d1278eea 100644 --- a/block/blk-throttle.h +++ b/block/blk-throttle.h @@ -106,8 +106,6 @@ struct throtl_grp { /* Number of bio's dispatched in current slice */ unsigned int io_disp[2]; - unsigned long last_low_overflow_time[2]; - uint64_t last_bytes_disp[2]; unsigned int last_io_disp[2]; diff --git a/block/blk.h b/block/blk.h index e180863f918b..c718e4291db0 100644 --- a/block/blk.h +++ b/block/blk.h @@ -331,33 +331,67 @@ ssize_t part_timeout_show(struct device *, struct device_attribute *, char *); ssize_t part_timeout_store(struct device *, struct device_attribute *, const char *, size_t); -static inline bool bio_may_exceed_limits(struct bio *bio, - const struct queue_limits *lim) -{ - switch (bio_op(bio)) { - case REQ_OP_DISCARD: - case REQ_OP_SECURE_ERASE: - case REQ_OP_WRITE_ZEROES: - return true; /* non-trivial splitting decisions */ - default: - break; - } +struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim, + unsigned *nsegs); +struct bio *bio_split_write_zeroes(struct bio *bio, + const struct queue_limits *lim, unsigned *nsegs); +struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, + unsigned *nr_segs); +struct bio *bio_split_zone_append(struct bio *bio, + const struct queue_limits *lim, unsigned *nr_segs); - /* - * All drivers must accept single-segments bios that are <= PAGE_SIZE. - * This is a quick and dirty check that relies on the fact that - * bi_io_vec[0] is always valid if a bio has data. The check might - * lead to occasional false negatives when bios are cloned, but compared - * to the performance impact of cloned bios themselves the loop below - * doesn't matter anyway. - */ +/* + * All drivers must accept single-segments bios that are smaller than PAGE_SIZE. + * + * This is a quick and dirty check that relies on the fact that bi_io_vec[0] is + * always valid if a bio has data. The check might lead to occasional false + * positives when bios are cloned, but compared to the performance impact of + * cloned bios themselves the loop below doesn't matter anyway. + */ +static inline bool bio_may_need_split(struct bio *bio, + const struct queue_limits *lim) +{ return lim->chunk_sectors || bio->bi_vcnt != 1 || bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > PAGE_SIZE; } -struct bio *__bio_split_to_limits(struct bio *bio, - const struct queue_limits *lim, - unsigned int *nr_segs); +/** + * __bio_split_to_limits - split a bio to fit the queue limits + * @bio: bio to be split + * @lim: queue limits to split based on + * @nr_segs: returns the number of segments in the returned bio + * + * Check if @bio needs splitting based on the queue limits, and if so split off + * a bio fitting the limits from the beginning of @bio and return it. @bio is + * shortened to the remainder and re-submitted. + * + * The split bio is allocated from @q->bio_split, which is provided by the + * block layer. + */ +static inline struct bio *__bio_split_to_limits(struct bio *bio, + const struct queue_limits *lim, unsigned int *nr_segs) +{ + switch (bio_op(bio)) { + case REQ_OP_READ: + case REQ_OP_WRITE: + if (bio_may_need_split(bio, lim)) + return bio_split_rw(bio, lim, nr_segs); + *nr_segs = 1; + return bio; + case REQ_OP_ZONE_APPEND: + return bio_split_zone_append(bio, lim, nr_segs); + case REQ_OP_DISCARD: + case REQ_OP_SECURE_ERASE: + return bio_split_discard(bio, lim, nr_segs); + case REQ_OP_WRITE_ZEROES: + return bio_split_write_zeroes(bio, lim, nr_segs); + default: + /* other operations can't be split */ + *nr_segs = 0; + return bio; + } +} + int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs); bool blk_attempt_req_merge(struct request_queue *q, struct request *rq, @@ -540,6 +574,10 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, unsigned int max_sectors, bool *same_page); +int bio_add_hw_folio(struct request_queue *q, struct bio *bio, + struct folio *folio, size_t len, size_t offset, + unsigned int max_sectors, bool *same_page); + /* * Clean up a page appropriately, where the page may be pinned, may have a * ref taken on it or neither. @@ -571,6 +609,7 @@ blk_mode_t file_to_blk_mode(struct file *file); int truncate_bdev_range(struct block_device *bdev, blk_mode_t mode, loff_t lstart, loff_t lend); long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg); +int blkdev_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags); long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg); extern const struct address_space_operations def_blk_aops; diff --git a/block/fops.c b/block/fops.c index 35c47a304727..e69deb6d8635 100644 --- a/block/fops.c +++ b/block/fops.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "blk.h" static inline struct inode *bdev_file_inode(struct file *file) @@ -865,6 +866,7 @@ const struct file_operations def_blk_fops = { .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, .fallocate = blkdev_fallocate, + .uring_cmd = blkdev_uring_cmd, .fop_flags = FOP_BUFFER_RASYNC, }; diff --git a/block/ioctl.c b/block/ioctl.c index e8e4a4190f18..6554b728bae6 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -11,6 +11,9 @@ #include #include #include +#include +#include +#include #include "blk.h" static int blkpg_do_ioctl(struct block_device *bdev, @@ -92,38 +95,51 @@ static int compat_blkpg_ioctl(struct block_device *bdev, } #endif +/* + * Check that [start, start + len) is a valid range from the block device's + * perspective, including verifying that it can be correctly translated into + * logical block addresses. + */ +static int blk_validate_byte_range(struct block_device *bdev, + uint64_t start, uint64_t len) +{ + unsigned int bs_mask = bdev_logical_block_size(bdev) - 1; + uint64_t end; + + if ((start | len) & bs_mask) + return -EINVAL; + if (!len) + return -EINVAL; + if (check_add_overflow(start, len, &end) || end > bdev_nr_bytes(bdev)) + return -EINVAL; + + return 0; +} + static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode, unsigned long arg) { - unsigned int bs_mask = bdev_logical_block_size(bdev) - 1; - uint64_t range[2], start, len, end; + uint64_t range[2], start, len; struct bio *prev = NULL, *bio; sector_t sector, nr_sects; struct blk_plug plug; int err; - if (!(mode & BLK_OPEN_WRITE)) - return -EBADF; - - if (!bdev_max_discard_sectors(bdev)) - return -EOPNOTSUPP; - if (bdev_read_only(bdev)) - return -EPERM; - if (copy_from_user(range, (void __user *)arg, sizeof(range))) return -EFAULT; - start = range[0]; len = range[1]; - if (!len) - return -EINVAL; - if ((start | len) & bs_mask) - return -EINVAL; + if (!bdev_max_discard_sectors(bdev)) + return -EOPNOTSUPP; - if (check_add_overflow(start, len, &end) || - end > bdev_nr_bytes(bdev)) - return -EINVAL; + if (!(mode & BLK_OPEN_WRITE)) + return -EBADF; + if (bdev_read_only(bdev)) + return -EPERM; + err = blk_validate_byte_range(bdev, start, len); + if (err) + return err; filemap_invalidate_lock(bdev->bd_mapping); err = truncate_bdev_range(bdev, mode, start, start + len - 1); @@ -163,7 +179,7 @@ fail: static int blk_ioctl_secure_erase(struct block_device *bdev, blk_mode_t mode, void __user *argp) { - uint64_t start, len; + uint64_t start, len, end; uint64_t range[2]; int err; @@ -178,11 +194,12 @@ static int blk_ioctl_secure_erase(struct block_device *bdev, blk_mode_t mode, len = range[1]; if ((start & 511) || (len & 511)) return -EINVAL; - if (start + len > bdev_nr_bytes(bdev)) + if (check_add_overflow(start, len, &end) || + end > bdev_nr_bytes(bdev)) return -EINVAL; filemap_invalidate_lock(bdev->bd_mapping); - err = truncate_bdev_range(bdev, mode, start, start + len - 1); + err = truncate_bdev_range(bdev, mode, start, end - 1); if (!err) err = blkdev_issue_secure_erase(bdev, start >> 9, len >> 9, GFP_KERNEL); @@ -734,3 +751,112 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) return ret; } #endif + +struct blk_iou_cmd { + int res; + bool nowait; +}; + +static void blk_cmd_complete(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd); + + if (bic->res == -EAGAIN && bic->nowait) + io_uring_cmd_issue_blocking(cmd); + else + io_uring_cmd_done(cmd, bic->res, 0, issue_flags); +} + +static void bio_cmd_bio_end_io(struct bio *bio) +{ + struct io_uring_cmd *cmd = bio->bi_private; + struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd); + + if (unlikely(bio->bi_status) && !bic->res) + bic->res = blk_status_to_errno(bio->bi_status); + + io_uring_cmd_do_in_task_lazy(cmd, blk_cmd_complete); + bio_put(bio); +} + +static int blkdev_cmd_discard(struct io_uring_cmd *cmd, + struct block_device *bdev, + uint64_t start, uint64_t len, bool nowait) +{ + struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd); + gfp_t gfp = nowait ? GFP_NOWAIT : GFP_KERNEL; + sector_t sector = start >> SECTOR_SHIFT; + sector_t nr_sects = len >> SECTOR_SHIFT; + struct bio *prev = NULL, *bio; + int err; + + if (!bdev_max_discard_sectors(bdev)) + return -EOPNOTSUPP; + if (!(file_to_blk_mode(cmd->file) & BLK_OPEN_WRITE)) + return -EBADF; + if (bdev_read_only(bdev)) + return -EPERM; + err = blk_validate_byte_range(bdev, start, len); + if (err) + return err; + + err = filemap_invalidate_pages(bdev->bd_mapping, start, + start + len - 1, nowait); + if (err) + return err; + + while (true) { + bio = blk_alloc_discard_bio(bdev, §or, &nr_sects, gfp); + if (!bio) + break; + if (nowait) { + /* + * Don't allow multi-bio non-blocking submissions as + * subsequent bios may fail but we won't get a direct + * indication of that. Normally, the caller should + * retry from a blocking context. + */ + if (unlikely(nr_sects)) { + bio_put(bio); + return -EAGAIN; + } + bio->bi_opf |= REQ_NOWAIT; + } + + prev = bio_chain_and_submit(prev, bio); + } + if (unlikely(!prev)) + return -EAGAIN; + if (unlikely(nr_sects)) + bic->res = -EAGAIN; + + prev->bi_private = cmd; + prev->bi_end_io = bio_cmd_bio_end_io; + submit_bio(prev); + return -EIOCBQUEUED; +} + +int blkdev_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + struct block_device *bdev = I_BDEV(cmd->file->f_mapping->host); + struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd); + const struct io_uring_sqe *sqe = cmd->sqe; + u32 cmd_op = cmd->cmd_op; + uint64_t start, len; + + if (unlikely(sqe->ioprio || sqe->__pad1 || sqe->len || + sqe->rw_flags || sqe->file_index)) + return -EINVAL; + + bic->res = 0; + bic->nowait = issue_flags & IO_URING_F_NONBLOCK; + + start = READ_ONCE(sqe->addr); + len = READ_ONCE(sqe->addr3); + + switch (cmd_op) { + case BLOCK_URING_CMD_DISCARD: + return blkdev_cmd_discard(cmd, bdev, start, len, bic->nowait); + } + return -EINVAL; +} diff --git a/block/partitions/core.c b/block/partitions/core.c index ab76e64f0f6c..5bd7a603092e 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -555,9 +555,11 @@ static bool blk_add_partition(struct gendisk *disk, part = add_partition(disk, p, from, size, state->parts[p].flags, &state->parts[p].info); - if (IS_ERR(part) && PTR_ERR(part) != -ENXIO) { - printk(KERN_ERR " %s: p%d could not be added: %pe\n", - disk->disk_name, p, part); + if (IS_ERR(part)) { + if (PTR_ERR(part) != -ENXIO) { + printk(KERN_ERR " %s: p%d could not be added: %pe\n", + disk->disk_name, p, part); + } return true; } diff --git a/block/t10-pi.c b/block/t10-pi.c index 425e2836f3e1..e7052a728966 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -8,7 +8,6 @@ #include #include #include -#include #include #include #include "blk.h" @@ -240,9 +239,9 @@ static void ext_pi_crc64_generate(struct blk_integrity_iter *iter, } } -static bool ext_pi_ref_escape(u8 *ref_tag) +static bool ext_pi_ref_escape(const u8 ref_tag[6]) { - static u8 ref_escape[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; + static const u8 ref_escape[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; return memcmp(ref_tag, ref_escape, sizeof(ref_escape)) == 0; } @@ -472,6 +471,3 @@ void blk_integrity_complete(struct request *rq, unsigned int nr_bytes) else t10_pi_type1_complete(rq, nr_bytes); } - -MODULE_DESCRIPTION("T10 Protection Information module"); -MODULE_LICENSE("GPL"); diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 94dc0a235919..2a05d955e30b 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -297,10 +297,6 @@ struct drbd_epoch { unsigned long flags; }; -/* Prototype declaration of function defined in drbd_receiver.c */ -int drbdd_init(struct drbd_thread *); -int drbd_asender(struct drbd_thread *); - /* drbd_epoch flag bits */ enum { DE_HAVE_BARRIER_NUMBER, @@ -864,7 +860,6 @@ struct drbd_device { struct list_head read_ee; /* [RS]P_DATA_REQUEST being read */ struct list_head net_ee; /* zero-copy network send in progress */ - int next_barrier_nr; struct list_head resync_reads; atomic_t pp_in_use; /* allocated from page pool */ atomic_t pp_in_use_by_net; /* sendpage()d, still referenced by tcp */ @@ -1390,9 +1385,6 @@ extern void conn_free_crypto(struct drbd_connection *connection); extern void do_submit(struct work_struct *ws); extern void __drbd_make_request(struct drbd_device *, struct bio *); void drbd_submit_bio(struct bio *bio); -extern int drbd_read_remote(struct drbd_device *device, struct drbd_request *req); -extern int is_valid_ar_handle(struct drbd_request *, sector_t); - /* drbd_nl.c */ @@ -1474,7 +1466,6 @@ extern int w_resync_timer(struct drbd_work *, int); extern int w_send_write_hint(struct drbd_work *, int); extern int w_send_dblock(struct drbd_work *, int); extern int w_send_read_req(struct drbd_work *, int); -extern int w_e_reissue(struct drbd_work *, int); extern int w_restart_disk_io(struct drbd_work *, int); extern int w_send_out_of_sync(struct drbd_work *, int); @@ -1488,7 +1479,6 @@ extern int drbd_issue_discard_or_zero_out(struct drbd_device *device, sector_t start, unsigned int nr_sectors, int flags); extern int drbd_receiver(struct drbd_thread *thi); extern int drbd_ack_receiver(struct drbd_thread *thi); -extern void drbd_send_ping_wf(struct work_struct *ws); extern void drbd_send_acks_wf(struct work_struct *ws); extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device); extern bool drbd_rs_should_slow_down(struct drbd_peer_device *peer_device, sector_t sector, @@ -1504,7 +1494,6 @@ extern void __drbd_free_peer_req(struct drbd_device *, struct drbd_peer_request #define drbd_free_peer_req(m,e) __drbd_free_peer_req(m, e, 0) #define drbd_free_net_peer_req(m,e) __drbd_free_peer_req(m, e, 1) extern struct page *drbd_alloc_pages(struct drbd_peer_device *, unsigned int, bool); -extern void drbd_set_recv_tcq(struct drbd_device *device, int tcq_enabled); extern void _drbd_clear_done_ee(struct drbd_device *device, struct list_head *to_be_freed); extern int drbd_connected(struct drbd_peer_device *); diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index a9e49b212341..449123eb54bf 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1550,7 +1550,7 @@ static int _drbd_send_page(struct drbd_peer_device *peer_device, struct page *pa * put_page(); and would cause either a VM_BUG directly, or * __page_cache_release a page that would actually still be referenced * by someone, leading to some obscure delayed Oops somewhere else. */ - if (!drbd_disable_sendpage && sendpage_ok(page)) + if (!drbd_disable_sendpage && sendpages_ok(page, len, offset)) msg.msg_flags |= MSG_NOSIGNAL | MSG_SPLICE_PAGES; drbd_update_congested(peer_device->connection); diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c index e858e7e0383f..c2b6c4d9729d 100644 --- a/drivers/block/drbd/drbd_state.c +++ b/drivers/block/drbd/drbd_state.c @@ -876,7 +876,7 @@ is_valid_state(struct drbd_device *device, union drbd_state ns) ns.disk == D_OUTDATED) rv = SS_CONNECTED_OUTDATES; - else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && + else if (nc && (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && (nc->verify_alg[0] == 0)) rv = SS_NO_VERIFY_ALG; diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index c6ef0546ffc9..11901f2812ad 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -2269,25 +2269,12 @@ static const struct file_operations mtip_flags_fops = { .llseek = no_llseek, }; -static int mtip_hw_debugfs_init(struct driver_data *dd) +static void mtip_hw_debugfs_init(struct driver_data *dd) { - if (!dfs_parent) - return -1; - dd->dfs_node = debugfs_create_dir(dd->disk->disk_name, dfs_parent); - if (IS_ERR_OR_NULL(dd->dfs_node)) { - dev_warn(&dd->pdev->dev, - "Error creating node %s under debugfs\n", - dd->disk->disk_name); - dd->dfs_node = NULL; - return -1; - } - debugfs_create_file("flags", 0444, dd->dfs_node, dd, &mtip_flags_fops); debugfs_create_file("registers", 0444, dd->dfs_node, dd, &mtip_regs_fops); - - return 0; } static void mtip_hw_debugfs_exit(struct driver_data *dd) @@ -4043,10 +4030,6 @@ static int __init mtip_init(void) mtip_major = error; dfs_parent = debugfs_create_dir("rssd", NULL); - if (IS_ERR_OR_NULL(dfs_parent)) { - pr_warn("Error creating debugfs parent\n"); - dfs_parent = NULL; - } /* Register our PCI operations. */ error = pci_register_driver(&mtip_pci_driver); diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 41a90150b501..b852050d8a96 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -181,6 +181,17 @@ static void nbd_requeue_cmd(struct nbd_cmd *cmd) { struct request *req = blk_mq_rq_from_pdu(cmd); + lockdep_assert_held(&cmd->lock); + + /* + * Clear INFLIGHT flag so that this cmd won't be completed in + * normal completion path + * + * INFLIGHT flag will be set when the cmd is queued to nbd next + * time. + */ + __clear_bit(NBD_CMD_INFLIGHT, &cmd->flags); + if (!test_and_set_bit(NBD_CMD_REQUEUED, &cmd->flags)) blk_mq_requeue_request(req, true); } @@ -339,7 +350,7 @@ static int __nbd_set_size(struct nbd_device *nbd, loff_t bytesize, lim = queue_limits_start_update(nbd->disk->queue); if (nbd->config->flags & NBD_FLAG_SEND_TRIM) - lim.max_hw_discard_sectors = UINT_MAX; + lim.max_hw_discard_sectors = UINT_MAX >> SECTOR_SHIFT; else lim.max_hw_discard_sectors = 0; if (!(nbd->config->flags & NBD_FLAG_SEND_FLUSH)) { @@ -350,6 +361,11 @@ static int __nbd_set_size(struct nbd_device *nbd, loff_t bytesize, lim.features |= BLK_FEAT_WRITE_CACHE; lim.features &= ~BLK_FEAT_FUA; } + if (nbd->config->flags & NBD_FLAG_ROTATIONAL) + lim.features |= BLK_FEAT_ROTATIONAL; + if (nbd->config->flags & NBD_FLAG_SEND_WRITE_ZEROES) + lim.max_write_zeroes_sectors = UINT_MAX >> SECTOR_SHIFT; + lim.logical_block_size = blksize; lim.physical_block_size = blksize; error = queue_limits_commit_update(nbd->disk->queue, &lim); @@ -418,6 +434,8 @@ static u32 req_to_nbd_cmd_type(struct request *req) return NBD_CMD_WRITE; case REQ_OP_READ: return NBD_CMD_READ; + case REQ_OP_WRITE_ZEROES: + return NBD_CMD_WRITE_ZEROES; default: return U32_MAX; } @@ -488,8 +506,8 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req) nbd_mark_nsock_dead(nbd, nsock, 1); mutex_unlock(&nsock->tx_lock); } - mutex_unlock(&cmd->lock); nbd_requeue_cmd(cmd); + mutex_unlock(&cmd->lock); nbd_config_put(nbd); return BLK_EH_DONE; } @@ -634,6 +652,8 @@ static blk_status_t nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, if (req->cmd_flags & REQ_FUA) nbd_cmd_flags |= NBD_CMD_FLAG_FUA; + if ((req->cmd_flags & REQ_NOUNMAP) && (type == NBD_CMD_WRITE_ZEROES)) + nbd_cmd_flags |= NBD_CMD_FLAG_NO_HOLE; /* We did a partial send previously, and we at least sent the whole * request struct, so just go and send the rest of the pages in the @@ -1703,6 +1723,10 @@ static int nbd_dbg_flags_show(struct seq_file *s, void *unused) seq_puts(s, "NBD_FLAG_SEND_FUA\n"); if (flags & NBD_FLAG_SEND_TRIM) seq_puts(s, "NBD_FLAG_SEND_TRIM\n"); + if (flags & NBD_FLAG_SEND_WRITE_ZEROES) + seq_puts(s, "NBD_FLAG_SEND_WRITE_ZEROES\n"); + if (flags & NBD_FLAG_ROTATIONAL) + seq_puts(s, "NBD_FLAG_ROTATIONAL\n"); return 0; } diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 7cece5884b9c..3edb37a41312 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -498,8 +498,6 @@ static void pkt_debugfs_dev_new(struct pktcdvd_device *pd) if (!pkt_debugfs_root) return; pd->dfs_d_root = debugfs_create_dir(pd->disk->disk_name, pkt_debugfs_root); - if (!pd->dfs_d_root) - return; pd->dfs_f_info = debugfs_create_file("info", 0444, pd->dfs_d_root, pd, &pkt_seq_fops); diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index f6e3a3c4b76c..08ce6d96d04c 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -149,15 +149,22 @@ static int process_rdma(struct rnbd_srv_session *srv_sess, rnbd_to_bio_flags(le32_to_cpu(msg->rw)), GFP_KERNEL); if (bio_add_page(bio, virt_to_page(data), datalen, offset_in_page(data)) != datalen) { - rnbd_srv_err(sess_dev, "Failed to map data to bio\n"); + rnbd_srv_err_rl(sess_dev, "Failed to map data to bio\n"); err = -EINVAL; goto bio_put; } + bio->bi_opf = rnbd_to_bio_flags(le32_to_cpu(msg->rw)); + if (bio_has_data(bio) && + bio->bi_iter.bi_size != le32_to_cpu(msg->bi_size)) { + rnbd_srv_err_rl(sess_dev, "Datalen mismatch: bio bi_size (%u), bi_size (%u)\n", + bio->bi_iter.bi_size, msg->bi_size); + err = -EINVAL; + goto bio_put; + } bio->bi_end_io = rnbd_dev_bi_end_io; bio->bi_private = priv; bio->bi_iter.bi_sector = le64_to_cpu(msg->sector); - bio->bi_iter.bi_size = le32_to_cpu(msg->bi_size); prio = srv_sess->ver < RNBD_PROTO_VER_MAJOR || usrlen < sizeof(*msg) ? 0 : le16_to_cpu(msg->prio); bio_set_prio(bio, prio); diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 1d53a3f48a0e..bca06bfb4bc3 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -71,9 +71,6 @@ struct ublk_rq_data { struct llist_node node; struct kref ref; - __u64 sector; - __u32 operation; - __u32 nr_zones; }; struct ublk_uring_cmd_pdu { @@ -214,6 +211,33 @@ static inline bool ublk_queue_is_zoned(struct ublk_queue *ubq) #ifdef CONFIG_BLK_DEV_ZONED +struct ublk_zoned_report_desc { + __u64 sector; + __u32 operation; + __u32 nr_zones; +}; + +static DEFINE_XARRAY(ublk_zoned_report_descs); + +static int ublk_zoned_insert_report_desc(const struct request *req, + struct ublk_zoned_report_desc *desc) +{ + return xa_insert(&ublk_zoned_report_descs, (unsigned long)req, + desc, GFP_KERNEL); +} + +static struct ublk_zoned_report_desc *ublk_zoned_erase_report_desc( + const struct request *req) +{ + return xa_erase(&ublk_zoned_report_descs, (unsigned long)req); +} + +static struct ublk_zoned_report_desc *ublk_zoned_get_report_desc( + const struct request *req) +{ + return xa_load(&ublk_zoned_report_descs, (unsigned long)req); +} + static int ublk_get_nr_zones(const struct ublk_device *ub) { const struct ublk_param_basic *p = &ub->params.basic; @@ -308,7 +332,7 @@ static int ublk_report_zones(struct gendisk *disk, sector_t sector, unsigned int zones_in_request = min_t(unsigned int, remaining_zones, max_zones_per_request); struct request *req; - struct ublk_rq_data *pdu; + struct ublk_zoned_report_desc desc; blk_status_t status; memset(buffer, 0, buffer_length); @@ -319,20 +343,23 @@ static int ublk_report_zones(struct gendisk *disk, sector_t sector, goto out; } - pdu = blk_mq_rq_to_pdu(req); - pdu->operation = UBLK_IO_OP_REPORT_ZONES; - pdu->sector = sector; - pdu->nr_zones = zones_in_request; + desc.operation = UBLK_IO_OP_REPORT_ZONES; + desc.sector = sector; + desc.nr_zones = zones_in_request; + ret = ublk_zoned_insert_report_desc(req, &desc); + if (ret) + goto free_req; ret = blk_rq_map_kern(disk->queue, req, buffer, buffer_length, GFP_KERNEL); - if (ret) { - blk_mq_free_request(req); - goto out; - } + if (ret) + goto erase_desc; status = blk_execute_rq(req, 0); ret = blk_status_to_errno(status); +erase_desc: + ublk_zoned_erase_report_desc(req); +free_req: blk_mq_free_request(req); if (ret) goto out; @@ -366,7 +393,7 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq, { struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag); struct ublk_io *io = &ubq->ios[req->tag]; - struct ublk_rq_data *pdu = blk_mq_rq_to_pdu(req); + struct ublk_zoned_report_desc *desc; u32 ublk_op; switch (req_op(req)) { @@ -389,12 +416,15 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq, ublk_op = UBLK_IO_OP_ZONE_RESET_ALL; break; case REQ_OP_DRV_IN: - ublk_op = pdu->operation; + desc = ublk_zoned_get_report_desc(req); + if (!desc) + return BLK_STS_IOERR; + ublk_op = desc->operation; switch (ublk_op) { case UBLK_IO_OP_REPORT_ZONES: iod->op_flags = ublk_op | ublk_req_build_flags(req); - iod->nr_zones = pdu->nr_zones; - iod->start_sector = pdu->sector; + iod->nr_zones = desc->nr_zones; + iod->start_sector = desc->sector; return BLK_STS_OK; default: return BLK_STS_IOERR; diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index efcb8d9d274c..84cd62efac0f 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -59,17 +59,17 @@ static int zram_read_page(struct zram *zram, struct page *page, u32 index, static int zram_slot_trylock(struct zram *zram, u32 index) { - return bit_spin_trylock(ZRAM_LOCK, &zram->table[index].flags); + return spin_trylock(&zram->table[index].lock); } static void zram_slot_lock(struct zram *zram, u32 index) { - bit_spin_lock(ZRAM_LOCK, &zram->table[index].flags); + spin_lock(&zram->table[index].lock); } static void zram_slot_unlock(struct zram *zram, u32 index) { - bit_spin_unlock(ZRAM_LOCK, &zram->table[index].flags); + spin_unlock(&zram->table[index].lock); } static inline bool init_done(struct zram *zram) @@ -1211,7 +1211,7 @@ static void zram_meta_free(struct zram *zram, u64 disksize) static bool zram_meta_alloc(struct zram *zram, u64 disksize) { - size_t num_pages; + size_t num_pages, index; num_pages = disksize >> PAGE_SHIFT; zram->table = vzalloc(array_size(num_pages, sizeof(*zram->table))); @@ -1226,6 +1226,9 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) if (!huge_class_size) huge_class_size = zs_huge_class_size(zram->mem_pool); + + for (index = 0; index < num_pages; index++) + spin_lock_init(&zram->table[index].lock); return true; } @@ -1283,7 +1286,7 @@ out: zram_set_handle(zram, index, 0); zram_set_obj_size(zram, index, 0); WARN_ON_ONCE(zram->table[index].flags & - ~(1UL << ZRAM_LOCK | 1UL << ZRAM_UNDER_WB)); + ~(1UL << ZRAM_UNDER_WB)); } /* @@ -2401,9 +2404,10 @@ static void destroy_devices(void) static int __init zram_init(void) { + struct zram_table_entry zram_te; int ret; - BUILD_BUG_ON(__NR_ZRAM_PAGEFLAGS > BITS_PER_LONG); + BUILD_BUG_ON(__NR_ZRAM_PAGEFLAGS > sizeof(zram_te.flags) * 8); ret = cpuhp_setup_state_multi(CPUHP_ZCOMP_PREPARE, "block/zram:prepare", zcomp_cpu_up_prepare, zcomp_cpu_dead); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 35e322144629..531cefc66668 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -45,9 +45,7 @@ /* Flags for zram pages (table[page_no].flags) */ enum zram_pageflags { - /* zram slot is locked */ - ZRAM_LOCK = ZRAM_FLAG_SHIFT, - ZRAM_SAME, /* Page consists the same element */ + ZRAM_SAME = ZRAM_FLAG_SHIFT, /* Page consists the same element */ ZRAM_WB, /* page is stored on backing_device */ ZRAM_UNDER_WB, /* page is under writeback */ ZRAM_HUGE, /* Incompressible page */ @@ -68,7 +66,8 @@ struct zram_table_entry { unsigned long handle; unsigned long element; }; - unsigned long flags; + unsigned int flags; + spinlock_t lock; #ifdef CONFIG_ZRAM_TRACK_ENTRY_ACTIME ktime_t ac_time; #endif diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 0c3323e0adb2..63682d27fc8d 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3949,7 +3949,9 @@ static int __load_dirty_region_bitmap(struct raid_set *rs) /* Try loading the bitmap unless "raid0", which does not have one */ if (!rs_is_raid0(rs) && !test_and_set_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags)) { - r = md_bitmap_load(&rs->md); + struct mddev *mddev = &rs->md; + + r = mddev->bitmap_ops->load(mddev); if (r) DMERR("Failed to load bitmap"); } @@ -4066,7 +4068,8 @@ static int raid_preresume(struct dm_target *ti) mddev->bitmap_info.chunksize != to_bytes(rs->requested_bitmap_chunk_sectors)))) { int chunksize = to_bytes(rs->requested_bitmap_chunk_sectors) ?: mddev->bitmap_info.chunksize; - r = md_bitmap_resize(mddev->bitmap, mddev->dev_sectors, chunksize, 0); + r = mddev->bitmap_ops->resize(mddev, mddev->dev_sectors, + chunksize, false); if (r) DMERR("Failed to resize bitmap"); } diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index db5330d97348..29da10e6f703 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -32,11 +32,210 @@ #include "md.h" #include "md-bitmap.h" +#define BITMAP_MAJOR_LO 3 +/* version 4 insists the bitmap is in little-endian order + * with version 3, it is host-endian which is non-portable + * Version 5 is currently set only for clustered devices + */ +#define BITMAP_MAJOR_HI 4 +#define BITMAP_MAJOR_CLUSTERED 5 +#define BITMAP_MAJOR_HOSTENDIAN 3 + +/* + * in-memory bitmap: + * + * Use 16 bit block counters to track pending writes to each "chunk". + * The 2 high order bits are special-purpose, the first is a flag indicating + * whether a resync is needed. The second is a flag indicating whether a + * resync is active. + * This means that the counter is actually 14 bits: + * + * +--------+--------+------------------------------------------------+ + * | resync | resync | counter | + * | needed | active | | + * | (0-1) | (0-1) | (0-16383) | + * +--------+--------+------------------------------------------------+ + * + * The "resync needed" bit is set when: + * a '1' bit is read from storage at startup. + * a write request fails on some drives + * a resync is aborted on a chunk with 'resync active' set + * It is cleared (and resync-active set) when a resync starts across all drives + * of the chunk. + * + * + * The "resync active" bit is set when: + * a resync is started on all drives, and resync_needed is set. + * resync_needed will be cleared (as long as resync_active wasn't already set). + * It is cleared when a resync completes. + * + * The counter counts pending write requests, plus the on-disk bit. + * When the counter is '1' and the resync bits are clear, the on-disk + * bit can be cleared as well, thus setting the counter to 0. + * When we set a bit, or in the counter (to start a write), if the fields is + * 0, we first set the disk bit and set the counter to 1. + * + * If the counter is 0, the on-disk bit is clear and the stripe is clean + * Anything that dirties the stripe pushes the counter to 2 (at least) + * and sets the on-disk bit (lazily). + * If a periodic sweep find the counter at 2, it is decremented to 1. + * If the sweep find the counter at 1, the on-disk bit is cleared and the + * counter goes to zero. + * + * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block + * counters as a fallback when "page" memory cannot be allocated: + * + * Normal case (page memory allocated): + * + * page pointer (32-bit) + * + * [ ] ------+ + * | + * +-------> [ ][ ]..[ ] (4096 byte page == 2048 counters) + * c1 c2 c2048 + * + * Hijacked case (page memory allocation failed): + * + * hijacked page pointer (32-bit) + * + * [ ][ ] (no page memory allocated) + * counter #1 (16-bit) counter #2 (16-bit) + * + */ + +#define PAGE_BITS (PAGE_SIZE << 3) +#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3) + +#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK) +#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK) +#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX) + +/* how many counters per page? */ +#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS) +/* same, except a shift value for more efficient bitops */ +#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT) +/* same, except a mask value for more efficient bitops */ +#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1) + +#define BITMAP_BLOCK_SHIFT 9 + +/* + * bitmap structures: + */ + +/* the in-memory bitmap is represented by bitmap_pages */ +struct bitmap_page { + /* + * map points to the actual memory page + */ + char *map; + /* + * in emergencies (when map cannot be alloced), hijack the map + * pointer and use it as two counters itself + */ + unsigned int hijacked:1; + /* + * If any counter in this page is '1' or '2' - and so could be + * cleared then that page is marked as 'pending' + */ + unsigned int pending:1; + /* + * count of dirty bits on the page + */ + unsigned int count:30; +}; + +/* the main bitmap structure - one per mddev */ +struct bitmap { + + struct bitmap_counts { + spinlock_t lock; + struct bitmap_page *bp; + /* total number of pages in the bitmap */ + unsigned long pages; + /* number of pages not yet allocated */ + unsigned long missing_pages; + /* chunksize = 2^chunkshift (for bitops) */ + unsigned long chunkshift; + /* total number of data chunks for the array */ + unsigned long chunks; + } counts; + + struct mddev *mddev; /* the md device that the bitmap is for */ + + __u64 events_cleared; + int need_sync; + + struct bitmap_storage { + /* backing disk file */ + struct file *file; + /* cached copy of the bitmap file superblock */ + struct page *sb_page; + unsigned long sb_index; + /* list of cache pages for the file */ + struct page **filemap; + /* attributes associated filemap pages */ + unsigned long *filemap_attr; + /* number of pages in the file */ + unsigned long file_pages; + /* total bytes in the bitmap */ + unsigned long bytes; + } storage; + + unsigned long flags; + + int allclean; + + atomic_t behind_writes; + /* highest actual value at runtime */ + unsigned long behind_writes_used; + + /* + * the bitmap daemon - periodically wakes up and sweeps the bitmap + * file, cleaning up bits and flushing out pages to disk as necessary + */ + unsigned long daemon_lastrun; /* jiffies of last run */ + /* + * when we lasted called end_sync to update bitmap with resync + * progress. + */ + unsigned long last_end_sync; + + /* pending writes to the bitmap file */ + atomic_t pending_writes; + wait_queue_head_t write_wait; + wait_queue_head_t overflow_wait; + wait_queue_head_t behind_wait; + + struct kernfs_node *sysfs_can_clear; + /* slot offset for clustered env */ + int cluster_slot; +}; + +static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks, + int chunksize, bool init); + static inline char *bmname(struct bitmap *bitmap) { return bitmap->mddev ? mdname(bitmap->mddev) : "mdX"; } +static bool __bitmap_enabled(struct bitmap *bitmap) +{ + return bitmap->storage.filemap && + !test_bit(BITMAP_STALE, &bitmap->flags); +} + +static bool bitmap_enabled(struct mddev *mddev) +{ + struct bitmap *bitmap = mddev->bitmap; + + if (!bitmap) + return false; + + return __bitmap_enabled(bitmap); +} + /* * check a page and, if necessary, allocate it (or hijack it if the alloc fails) * @@ -472,9 +671,10 @@ static void md_bitmap_wait_writes(struct bitmap *bitmap) /* update the event counter and sync the superblock to disk */ -void md_bitmap_update_sb(struct bitmap *bitmap) +static void bitmap_update_sb(void *data) { bitmap_super_t *sb; + struct bitmap *bitmap = data; if (!bitmap || !bitmap->mddev) /* no bitmap for this array */ return; @@ -510,10 +710,8 @@ void md_bitmap_update_sb(struct bitmap *bitmap) write_sb_page(bitmap, bitmap->storage.sb_index, bitmap->storage.sb_page, 1); } -EXPORT_SYMBOL(md_bitmap_update_sb); -/* print out the bitmap file superblock */ -void md_bitmap_print_sb(struct bitmap *bitmap) +static void bitmap_print_sb(struct bitmap *bitmap) { bitmap_super_t *sb; @@ -760,7 +958,7 @@ out_no_sb: bitmap->mddev->bitmap_info.space > sectors_reserved) bitmap->mddev->bitmap_info.space = sectors_reserved; } else { - md_bitmap_print_sb(bitmap); + bitmap_print_sb(bitmap); if (bitmap->cluster_slot < 0) md_cluster_stop(bitmap->mddev); } @@ -893,7 +1091,7 @@ static void md_bitmap_file_unmap(struct bitmap_storage *store) static void md_bitmap_file_kick(struct bitmap *bitmap) { if (!test_and_set_bit(BITMAP_STALE, &bitmap->flags)) { - md_bitmap_update_sb(bitmap); + bitmap_update_sb(bitmap); if (bitmap->storage.file) { pr_warn("%s: kicking failed bitmap file %pD4 from array!\n", @@ -1028,13 +1226,13 @@ static int md_bitmap_file_test_bit(struct bitmap *bitmap, sector_t block) /* this gets called when the md device is ready to unplug its underlying * (slave) device queues -- before we let any writes go down, we need to * sync the dirty pages of the bitmap file to disk */ -void md_bitmap_unplug(struct bitmap *bitmap) +static void __bitmap_unplug(struct bitmap *bitmap) { unsigned long i; int dirty, need_write; int writing = 0; - if (!md_bitmap_enabled(bitmap)) + if (!__bitmap_enabled(bitmap)) return; /* look at each page to see if there are any set bits that need to be @@ -1060,7 +1258,6 @@ void md_bitmap_unplug(struct bitmap *bitmap) if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) md_bitmap_file_kick(bitmap); } -EXPORT_SYMBOL(md_bitmap_unplug); struct bitmap_unplug_work { struct work_struct work; @@ -1073,11 +1270,11 @@ static void md_bitmap_unplug_fn(struct work_struct *work) struct bitmap_unplug_work *unplug_work = container_of(work, struct bitmap_unplug_work, work); - md_bitmap_unplug(unplug_work->bitmap); + __bitmap_unplug(unplug_work->bitmap); complete(unplug_work->done); } -void md_bitmap_unplug_async(struct bitmap *bitmap) +static void bitmap_unplug_async(struct bitmap *bitmap) { DECLARE_COMPLETION_ONSTACK(done); struct bitmap_unplug_work unplug_work; @@ -1089,7 +1286,19 @@ void md_bitmap_unplug_async(struct bitmap *bitmap) queue_work(md_bitmap_wq, &unplug_work.work); wait_for_completion(&done); } -EXPORT_SYMBOL(md_bitmap_unplug_async); + +static void bitmap_unplug(struct mddev *mddev, bool sync) +{ + struct bitmap *bitmap = mddev->bitmap; + + if (!bitmap) + return; + + if (sync) + __bitmap_unplug(bitmap); + else + bitmap_unplug_async(bitmap); +} static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed); @@ -1226,22 +1435,21 @@ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) return ret; } -void md_bitmap_write_all(struct bitmap *bitmap) +/* just flag bitmap pages as needing to be written. */ +static void bitmap_write_all(struct mddev *mddev) { - /* We don't actually write all bitmap blocks here, - * just flag them as needing to be written - */ int i; + struct bitmap *bitmap = mddev->bitmap; if (!bitmap || !bitmap->storage.filemap) return; + + /* Only one copy, so nothing needed */ if (bitmap->storage.file) - /* Only one copy, so nothing needed */ return; for (i = 0; i < bitmap->storage.file_pages; i++) - set_page_attr(bitmap, i, - BITMAP_PAGE_NEEDWRITE); + set_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE); bitmap->allclean = 0; } @@ -1290,7 +1498,7 @@ out: * bitmap daemon -- periodically wakes up to clean bits and flush pages * out to disk */ -void md_bitmap_daemon_work(struct mddev *mddev) +static void bitmap_daemon_work(struct mddev *mddev) { struct bitmap *bitmap; unsigned long j; @@ -1461,8 +1669,11 @@ __acquires(bitmap->lock) &(bitmap->bp[page].map[pageoff]); } -int md_bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, int behind) +static int bitmap_startwrite(struct mddev *mddev, sector_t offset, + unsigned long sectors, bool behind) { + struct bitmap *bitmap = mddev->bitmap; + if (!bitmap) return 0; @@ -1523,13 +1734,15 @@ int md_bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long s } return 0; } -EXPORT_SYMBOL(md_bitmap_startwrite); -void md_bitmap_endwrite(struct bitmap *bitmap, sector_t offset, - unsigned long sectors, int success, int behind) +static void bitmap_endwrite(struct mddev *mddev, sector_t offset, + unsigned long sectors, bool success, bool behind) { + struct bitmap *bitmap = mddev->bitmap; + if (!bitmap) return; + if (behind) { if (atomic_dec_and_test(&bitmap->behind_writes)) wake_up(&bitmap->behind_wait); @@ -1576,26 +1789,27 @@ void md_bitmap_endwrite(struct bitmap *bitmap, sector_t offset, sectors = 0; } } -EXPORT_SYMBOL(md_bitmap_endwrite); -static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, - int degraded) +static bool __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, + sector_t *blocks, bool degraded) { bitmap_counter_t *bmc; - int rv; + bool rv; + if (bitmap == NULL) {/* FIXME or bitmap set as 'failed' */ *blocks = 1024; - return 1; /* always resync if no bitmap */ + return true; /* always resync if no bitmap */ } spin_lock_irq(&bitmap->counts.lock); + + rv = false; bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0); - rv = 0; if (bmc) { /* locked */ - if (RESYNC(*bmc)) - rv = 1; - else if (NEEDED(*bmc)) { - rv = 1; + if (RESYNC(*bmc)) { + rv = true; + } else if (NEEDED(*bmc)) { + rv = true; if (!degraded) { /* don't set/clear bits if degraded */ *bmc |= RESYNC_MASK; *bmc &= ~NEEDED_MASK; @@ -1603,11 +1817,12 @@ static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t } } spin_unlock_irq(&bitmap->counts.lock); + return rv; } -int md_bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, - int degraded) +static bool bitmap_start_sync(struct mddev *mddev, sector_t offset, + sector_t *blocks, bool degraded) { /* bitmap_start_sync must always report on multiples of whole * pages, otherwise resync (which is very PAGE_SIZE based) will @@ -1616,21 +1831,22 @@ int md_bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *block * At least PAGE_SIZE>>9 blocks are covered. * Return the 'or' of the result. */ - int rv = 0; + bool rv = false; sector_t blocks1; *blocks = 0; while (*blocks < (PAGE_SIZE>>9)) { - rv |= __bitmap_start_sync(bitmap, offset, + rv |= __bitmap_start_sync(mddev->bitmap, offset, &blocks1, degraded); offset += blocks1; *blocks += blocks1; } + return rv; } -EXPORT_SYMBOL(md_bitmap_start_sync); -void md_bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted) +static void __bitmap_end_sync(struct bitmap *bitmap, sector_t offset, + sector_t *blocks, bool aborted) { bitmap_counter_t *bmc; unsigned long flags; @@ -1659,9 +1875,14 @@ void md_bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks unlock: spin_unlock_irqrestore(&bitmap->counts.lock, flags); } -EXPORT_SYMBOL(md_bitmap_end_sync); -void md_bitmap_close_sync(struct bitmap *bitmap) +static void bitmap_end_sync(struct mddev *mddev, sector_t offset, + sector_t *blocks) +{ + __bitmap_end_sync(mddev->bitmap, offset, blocks, true); +} + +static void bitmap_close_sync(struct mddev *mddev) { /* Sync has finished, and any bitmap chunks that weren't synced * properly have been aborted. It remains to us to clear the @@ -1669,19 +1890,23 @@ void md_bitmap_close_sync(struct bitmap *bitmap) */ sector_t sector = 0; sector_t blocks; + struct bitmap *bitmap = mddev->bitmap; + if (!bitmap) return; + while (sector < bitmap->mddev->resync_max_sectors) { - md_bitmap_end_sync(bitmap, sector, &blocks, 0); + __bitmap_end_sync(bitmap, sector, &blocks, false); sector += blocks; } } -EXPORT_SYMBOL(md_bitmap_close_sync); -void md_bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force) +static void bitmap_cond_end_sync(struct mddev *mddev, sector_t sector, + bool force) { sector_t s = 0; sector_t blocks; + struct bitmap *bitmap = mddev->bitmap; if (!bitmap) return; @@ -1700,34 +1925,32 @@ void md_bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force) sector &= ~((1ULL << bitmap->counts.chunkshift) - 1); s = 0; while (s < sector && s < bitmap->mddev->resync_max_sectors) { - md_bitmap_end_sync(bitmap, s, &blocks, 0); + __bitmap_end_sync(bitmap, s, &blocks, false); s += blocks; } bitmap->last_end_sync = jiffies; sysfs_notify_dirent_safe(bitmap->mddev->sysfs_completed); } -EXPORT_SYMBOL(md_bitmap_cond_end_sync); -void md_bitmap_sync_with_cluster(struct mddev *mddev, - sector_t old_lo, sector_t old_hi, - sector_t new_lo, sector_t new_hi) +static void bitmap_sync_with_cluster(struct mddev *mddev, + sector_t old_lo, sector_t old_hi, + sector_t new_lo, sector_t new_hi) { struct bitmap *bitmap = mddev->bitmap; sector_t sector, blocks = 0; for (sector = old_lo; sector < new_lo; ) { - md_bitmap_end_sync(bitmap, sector, &blocks, 0); + __bitmap_end_sync(bitmap, sector, &blocks, false); sector += blocks; } WARN((blocks > new_lo) && old_lo, "alignment is not correct for lo\n"); for (sector = old_hi; sector < new_hi; ) { - md_bitmap_start_sync(bitmap, sector, &blocks, 0); + bitmap_start_sync(mddev, sector, &blocks, false); sector += blocks; } WARN((blocks > new_hi) && old_hi, "alignment is not correct for hi\n"); } -EXPORT_SYMBOL(md_bitmap_sync_with_cluster); static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed) { @@ -1756,12 +1979,18 @@ static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, in } /* dirty the memory and file bits for bitmap chunks "s" to "e" */ -void md_bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e) +static void bitmap_dirty_bits(struct mddev *mddev, unsigned long s, + unsigned long e) { unsigned long chunk; + struct bitmap *bitmap = mddev->bitmap; + + if (!bitmap) + return; for (chunk = s; chunk <= e; chunk++) { sector_t sec = (sector_t)chunk << bitmap->counts.chunkshift; + md_bitmap_set_memory_bits(bitmap, sec, 1); md_bitmap_file_set_bit(bitmap, sec); if (sec < bitmap->mddev->recovery_cp) @@ -1773,10 +2002,7 @@ void md_bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long } } -/* - * flush out any pending updates - */ -void md_bitmap_flush(struct mddev *mddev) +static void bitmap_flush(struct mddev *mddev) { struct bitmap *bitmap = mddev->bitmap; long sleep; @@ -1789,23 +2015,21 @@ void md_bitmap_flush(struct mddev *mddev) */ sleep = mddev->bitmap_info.daemon_sleep * 2; bitmap->daemon_lastrun -= sleep; - md_bitmap_daemon_work(mddev); + bitmap_daemon_work(mddev); bitmap->daemon_lastrun -= sleep; - md_bitmap_daemon_work(mddev); + bitmap_daemon_work(mddev); bitmap->daemon_lastrun -= sleep; - md_bitmap_daemon_work(mddev); + bitmap_daemon_work(mddev); if (mddev->bitmap_info.external) md_super_wait(mddev); - md_bitmap_update_sb(bitmap); + bitmap_update_sb(bitmap); } -/* - * free memory that was allocated - */ -void md_bitmap_free(struct bitmap *bitmap) +static void md_bitmap_free(void *data) { unsigned long k, pages; struct bitmap_page *bp; + struct bitmap *bitmap = data; if (!bitmap) /* there was no bitmap */ return; @@ -1836,9 +2060,8 @@ void md_bitmap_free(struct bitmap *bitmap) kfree(bp); kfree(bitmap); } -EXPORT_SYMBOL(md_bitmap_free); -void md_bitmap_wait_behind_writes(struct mddev *mddev) +static void bitmap_wait_behind_writes(struct mddev *mddev) { struct bitmap *bitmap = mddev->bitmap; @@ -1852,14 +2075,14 @@ void md_bitmap_wait_behind_writes(struct mddev *mddev) } } -void md_bitmap_destroy(struct mddev *mddev) +static void bitmap_destroy(struct mddev *mddev) { struct bitmap *bitmap = mddev->bitmap; if (!bitmap) /* there was no bitmap */ return; - md_bitmap_wait_behind_writes(mddev); + bitmap_wait_behind_writes(mddev); if (!mddev->serialize_policy) mddev_destroy_serial_pool(mddev, NULL); @@ -1878,7 +2101,7 @@ void md_bitmap_destroy(struct mddev *mddev) * if this returns an error, bitmap_destroy must be called to do clean up * once mddev->bitmap is set */ -struct bitmap *md_bitmap_create(struct mddev *mddev, int slot) +static struct bitmap *__bitmap_create(struct mddev *mddev, int slot) { struct bitmap *bitmap; sector_t blocks = mddev->resync_max_sectors; @@ -1948,7 +2171,8 @@ struct bitmap *md_bitmap_create(struct mddev *mddev, int slot) goto error; bitmap->daemon_lastrun = jiffies; - err = md_bitmap_resize(bitmap, blocks, mddev->bitmap_info.chunksize, 1); + err = __bitmap_resize(bitmap, blocks, mddev->bitmap_info.chunksize, + true); if (err) goto error; @@ -1965,7 +2189,18 @@ struct bitmap *md_bitmap_create(struct mddev *mddev, int slot) return ERR_PTR(err); } -int md_bitmap_load(struct mddev *mddev) +static int bitmap_create(struct mddev *mddev, int slot) +{ + struct bitmap *bitmap = __bitmap_create(mddev, slot); + + if (IS_ERR(bitmap)) + return PTR_ERR(bitmap); + + mddev->bitmap = bitmap; + return 0; +} + +static int bitmap_load(struct mddev *mddev) { int err = 0; sector_t start = 0; @@ -1989,10 +2224,10 @@ int md_bitmap_load(struct mddev *mddev) */ while (sector < mddev->resync_max_sectors) { sector_t blocks; - md_bitmap_start_sync(bitmap, sector, &blocks, 0); + bitmap_start_sync(mddev, sector, &blocks, false); sector += blocks; } - md_bitmap_close_sync(bitmap); + bitmap_close_sync(mddev); if (mddev->degraded == 0 || bitmap->events_cleared == mddev->events) @@ -2014,22 +2249,21 @@ int md_bitmap_load(struct mddev *mddev) mddev_set_timeout(mddev, mddev->bitmap_info.daemon_sleep, true); md_wakeup_thread(mddev->thread); - md_bitmap_update_sb(bitmap); + bitmap_update_sb(bitmap); if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) err = -EIO; out: return err; } -EXPORT_SYMBOL_GPL(md_bitmap_load); /* caller need to free returned bitmap with md_bitmap_free() */ -struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot) +static void *bitmap_get_from_slot(struct mddev *mddev, int slot) { int rv = 0; struct bitmap *bitmap; - bitmap = md_bitmap_create(mddev, slot); + bitmap = __bitmap_create(mddev, slot); if (IS_ERR(bitmap)) { rv = PTR_ERR(bitmap); return ERR_PTR(rv); @@ -2043,20 +2277,19 @@ struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot) return bitmap; } -EXPORT_SYMBOL(get_bitmap_from_slot); /* Loads the bitmap associated with slot and copies the resync information * to our bitmap */ -int md_bitmap_copy_from_slot(struct mddev *mddev, int slot, - sector_t *low, sector_t *high, bool clear_bits) +static int bitmap_copy_from_slot(struct mddev *mddev, int slot, sector_t *low, + sector_t *high, bool clear_bits) { int rv = 0, i, j; sector_t block, lo = 0, hi = 0; struct bitmap_counts *counts; struct bitmap *bitmap; - bitmap = get_bitmap_from_slot(mddev, slot); + bitmap = bitmap_get_from_slot(mddev, slot); if (IS_ERR(bitmap)) { pr_err("%s can't get bitmap from slot %d\n", __func__, slot); return -1; @@ -2076,53 +2309,59 @@ int md_bitmap_copy_from_slot(struct mddev *mddev, int slot, } if (clear_bits) { - md_bitmap_update_sb(bitmap); + bitmap_update_sb(bitmap); /* BITMAP_PAGE_PENDING is set, but bitmap_unplug needs * BITMAP_PAGE_DIRTY or _NEEDWRITE to write ... */ for (i = 0; i < bitmap->storage.file_pages; i++) if (test_page_attr(bitmap, i, BITMAP_PAGE_PENDING)) set_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE); - md_bitmap_unplug(bitmap); + __bitmap_unplug(bitmap); } - md_bitmap_unplug(mddev->bitmap); + __bitmap_unplug(mddev->bitmap); *low = lo; *high = hi; md_bitmap_free(bitmap); return rv; } -EXPORT_SYMBOL_GPL(md_bitmap_copy_from_slot); - -void md_bitmap_status(struct seq_file *seq, struct bitmap *bitmap) +static void bitmap_set_pages(void *data, unsigned long pages) { - unsigned long chunk_kb; - struct bitmap_counts *counts; + struct bitmap *bitmap = data; - if (!bitmap) - return; - - counts = &bitmap->counts; - - chunk_kb = bitmap->mddev->bitmap_info.chunksize >> 10; - seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " - "%lu%s chunk", - counts->pages - counts->missing_pages, - counts->pages, - (counts->pages - counts->missing_pages) - << (PAGE_SHIFT - 10), - chunk_kb ? chunk_kb : bitmap->mddev->bitmap_info.chunksize, - chunk_kb ? "KB" : "B"); - if (bitmap->storage.file) { - seq_printf(seq, ", file: "); - seq_file_path(seq, bitmap->storage.file, " \t\n"); - } - - seq_printf(seq, "\n"); + bitmap->counts.pages = pages; } -int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, - int chunksize, int init) +static int bitmap_get_stats(void *data, struct md_bitmap_stats *stats) +{ + struct bitmap_storage *storage; + struct bitmap_counts *counts; + struct bitmap *bitmap = data; + bitmap_super_t *sb; + + if (!bitmap) + return -ENOENT; + + sb = kmap_local_page(bitmap->storage.sb_page); + stats->sync_size = le64_to_cpu(sb->sync_size); + kunmap_local(sb); + + counts = &bitmap->counts; + stats->missing_pages = counts->missing_pages; + stats->pages = counts->pages; + + storage = &bitmap->storage; + stats->file_pages = storage->file_pages; + stats->file = storage->file; + + stats->behind_writes = atomic_read(&bitmap->behind_writes); + stats->behind_wait = wq_has_sleeper(&bitmap->behind_wait); + stats->events_cleared = bitmap->events_cleared; + return 0; +} + +static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks, + int chunksize, bool init) { /* If chunk_size is 0, choose an appropriate chunk size. * Then possibly allocate new storage space. @@ -2320,14 +2559,24 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, spin_unlock_irq(&bitmap->counts.lock); if (!init) { - md_bitmap_unplug(bitmap); + __bitmap_unplug(bitmap); bitmap->mddev->pers->quiesce(bitmap->mddev, 0); } ret = 0; err: return ret; } -EXPORT_SYMBOL_GPL(md_bitmap_resize); + +static int bitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize, + bool init) +{ + struct bitmap *bitmap = mddev->bitmap; + + if (!bitmap) + return 0; + + return __bitmap_resize(bitmap, blocks, chunksize, init); +} static ssize_t location_show(struct mddev *mddev, char *page) @@ -2367,7 +2616,7 @@ location_store(struct mddev *mddev, const char *buf, size_t len) goto out; } - md_bitmap_destroy(mddev); + bitmap_destroy(mddev); mddev->bitmap_info.offset = 0; if (mddev->bitmap_info.file) { struct file *f = mddev->bitmap_info.file; @@ -2377,7 +2626,6 @@ location_store(struct mddev *mddev, const char *buf, size_t len) } else { /* No bitmap, OK to set a location */ long long offset; - struct bitmap *bitmap; if (strncmp(buf, "none", 4) == 0) /* nothing to be done */; @@ -2404,17 +2652,14 @@ location_store(struct mddev *mddev, const char *buf, size_t len) } mddev->bitmap_info.offset = offset; - bitmap = md_bitmap_create(mddev, -1); - if (IS_ERR(bitmap)) { - rv = PTR_ERR(bitmap); + rv = bitmap_create(mddev, -1); + if (rv) goto out; - } - mddev->bitmap = bitmap; - rv = md_bitmap_load(mddev); + rv = bitmap_load(mddev); if (rv) { mddev->bitmap_info.offset = 0; - md_bitmap_destroy(mddev); + bitmap_destroy(mddev); goto out; } } @@ -2450,6 +2695,7 @@ space_show(struct mddev *mddev, char *page) static ssize_t space_store(struct mddev *mddev, const char *buf, size_t len) { + struct bitmap *bitmap; unsigned long sectors; int rv; @@ -2460,8 +2706,8 @@ space_store(struct mddev *mddev, const char *buf, size_t len) if (sectors == 0) return -EINVAL; - if (mddev->bitmap && - sectors < (mddev->bitmap->storage.bytes + 511) >> 9) + bitmap = mddev->bitmap; + if (bitmap && sectors < (bitmap->storage.bytes + 511) >> 9) return -EFBIG; /* Bitmap is too big for this small space */ /* could make sure it isn't too big, but that isn't really @@ -2569,7 +2815,7 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len) mddev_create_serial_pool(mddev, rdev); } if (old_mwb != backlog) - md_bitmap_update_sb(mddev->bitmap); + bitmap_update_sb(mddev->bitmap); mddev_unlock_and_resume(mddev); return len; @@ -2638,10 +2884,13 @@ __ATTR(metadata, S_IRUGO|S_IWUSR, metadata_show, metadata_store); static ssize_t can_clear_show(struct mddev *mddev, char *page) { int len; + struct bitmap *bitmap; + spin_lock(&mddev->lock); - if (mddev->bitmap) - len = sprintf(page, "%s\n", (mddev->bitmap->need_sync ? - "false" : "true")); + bitmap = mddev->bitmap; + if (bitmap) + len = sprintf(page, "%s\n", (bitmap->need_sync ? "false" : + "true")); else len = sprintf(page, "\n"); spin_unlock(&mddev->lock); @@ -2650,17 +2899,24 @@ static ssize_t can_clear_show(struct mddev *mddev, char *page) static ssize_t can_clear_store(struct mddev *mddev, const char *buf, size_t len) { - if (mddev->bitmap == NULL) + struct bitmap *bitmap = mddev->bitmap; + + if (!bitmap) return -ENOENT; - if (strncmp(buf, "false", 5) == 0) - mddev->bitmap->need_sync = 1; - else if (strncmp(buf, "true", 4) == 0) { + + if (strncmp(buf, "false", 5) == 0) { + bitmap->need_sync = 1; + return len; + } + + if (strncmp(buf, "true", 4) == 0) { if (mddev->degraded) return -EBUSY; - mddev->bitmap->need_sync = 0; - } else - return -EINVAL; - return len; + bitmap->need_sync = 0; + return len; + } + + return -EINVAL; } static struct md_sysfs_entry bitmap_can_clear = @@ -2670,21 +2926,26 @@ static ssize_t behind_writes_used_show(struct mddev *mddev, char *page) { ssize_t ret; + struct bitmap *bitmap; + spin_lock(&mddev->lock); - if (mddev->bitmap == NULL) + bitmap = mddev->bitmap; + if (!bitmap) ret = sprintf(page, "0\n"); else - ret = sprintf(page, "%lu\n", - mddev->bitmap->behind_writes_used); + ret = sprintf(page, "%lu\n", bitmap->behind_writes_used); spin_unlock(&mddev->lock); + return ret; } static ssize_t behind_writes_used_reset(struct mddev *mddev, const char *buf, size_t len) { - if (mddev->bitmap) - mddev->bitmap->behind_writes_used = 0; + struct bitmap *bitmap = mddev->bitmap; + + if (bitmap) + bitmap->behind_writes_used = 0; return len; } @@ -2707,3 +2968,38 @@ const struct attribute_group md_bitmap_group = { .name = "bitmap", .attrs = md_bitmap_attrs, }; + +static struct bitmap_operations bitmap_ops = { + .enabled = bitmap_enabled, + .create = bitmap_create, + .resize = bitmap_resize, + .load = bitmap_load, + .destroy = bitmap_destroy, + .flush = bitmap_flush, + .write_all = bitmap_write_all, + .dirty_bits = bitmap_dirty_bits, + .unplug = bitmap_unplug, + .daemon_work = bitmap_daemon_work, + .wait_behind_writes = bitmap_wait_behind_writes, + + .startwrite = bitmap_startwrite, + .endwrite = bitmap_endwrite, + .start_sync = bitmap_start_sync, + .end_sync = bitmap_end_sync, + .cond_end_sync = bitmap_cond_end_sync, + .close_sync = bitmap_close_sync, + + .update_sb = bitmap_update_sb, + .get_stats = bitmap_get_stats, + + .sync_with_cluster = bitmap_sync_with_cluster, + .get_from_slot = bitmap_get_from_slot, + .copy_from_slot = bitmap_copy_from_slot, + .set_pages = bitmap_set_pages, + .free = md_bitmap_free, +}; + +void mddev_set_bitmap_ops(struct mddev *mddev) +{ + mddev->bitmap_ops = &bitmap_ops; +} diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index bb9eb418780a..662e6fc141a7 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -7,81 +7,7 @@ #ifndef BITMAP_H #define BITMAP_H 1 -#define BITMAP_MAJOR_LO 3 -/* version 4 insists the bitmap is in little-endian order - * with version 3, it is host-endian which is non-portable - * Version 5 is currently set only for clustered devices - */ -#define BITMAP_MAJOR_HI 4 -#define BITMAP_MAJOR_CLUSTERED 5 -#define BITMAP_MAJOR_HOSTENDIAN 3 - -/* - * in-memory bitmap: - * - * Use 16 bit block counters to track pending writes to each "chunk". - * The 2 high order bits are special-purpose, the first is a flag indicating - * whether a resync is needed. The second is a flag indicating whether a - * resync is active. - * This means that the counter is actually 14 bits: - * - * +--------+--------+------------------------------------------------+ - * | resync | resync | counter | - * | needed | active | | - * | (0-1) | (0-1) | (0-16383) | - * +--------+--------+------------------------------------------------+ - * - * The "resync needed" bit is set when: - * a '1' bit is read from storage at startup. - * a write request fails on some drives - * a resync is aborted on a chunk with 'resync active' set - * It is cleared (and resync-active set) when a resync starts across all drives - * of the chunk. - * - * - * The "resync active" bit is set when: - * a resync is started on all drives, and resync_needed is set. - * resync_needed will be cleared (as long as resync_active wasn't already set). - * It is cleared when a resync completes. - * - * The counter counts pending write requests, plus the on-disk bit. - * When the counter is '1' and the resync bits are clear, the on-disk - * bit can be cleared as well, thus setting the counter to 0. - * When we set a bit, or in the counter (to start a write), if the fields is - * 0, we first set the disk bit and set the counter to 1. - * - * If the counter is 0, the on-disk bit is clear and the stripe is clean - * Anything that dirties the stripe pushes the counter to 2 (at least) - * and sets the on-disk bit (lazily). - * If a periodic sweep find the counter at 2, it is decremented to 1. - * If the sweep find the counter at 1, the on-disk bit is cleared and the - * counter goes to zero. - * - * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block - * counters as a fallback when "page" memory cannot be allocated: - * - * Normal case (page memory allocated): - * - * page pointer (32-bit) - * - * [ ] ------+ - * | - * +-------> [ ][ ]..[ ] (4096 byte page == 2048 counters) - * c1 c2 c2048 - * - * Hijacked case (page memory allocation failed): - * - * hijacked page pointer (32-bit) - * - * [ ][ ] (no page memory allocated) - * counter #1 (16-bit) counter #2 (16-bit) - * - */ - -#ifdef __KERNEL__ - -#define PAGE_BITS (PAGE_SIZE << 3) -#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3) +#define BITMAP_MAGIC 0x6d746962 typedef __u16 bitmap_counter_t; #define COUNTER_BITS 16 @@ -91,26 +17,6 @@ typedef __u16 bitmap_counter_t; #define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1))) #define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2))) #define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1) -#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK) -#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK) -#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX) - -/* how many counters per page? */ -#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS) -/* same, except a shift value for more efficient bitops */ -#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT) -/* same, except a mask value for more efficient bitops */ -#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1) - -#define BITMAP_BLOCK_SHIFT 9 - -#endif - -/* - * bitmap structures: - */ - -#define BITMAP_MAGIC 0x6d746962 /* use these for bitmap->flags and bitmap->sb->state bit-fields */ enum bitmap_state { @@ -152,136 +58,58 @@ typedef struct bitmap_super_s { * devices. For raid10 it is the size of the array. */ -#ifdef __KERNEL__ +struct md_bitmap_stats { + u64 events_cleared; + int behind_writes; + bool behind_wait; -/* the in-memory bitmap is represented by bitmap_pages */ -struct bitmap_page { - /* - * map points to the actual memory page - */ - char *map; - /* - * in emergencies (when map cannot be alloced), hijack the map - * pointer and use it as two counters itself - */ - unsigned int hijacked:1; - /* - * If any counter in this page is '1' or '2' - and so could be - * cleared then that page is marked as 'pending' - */ - unsigned int pending:1; - /* - * count of dirty bits on the page - */ - unsigned int count:30; + unsigned long missing_pages; + unsigned long file_pages; + unsigned long sync_size; + unsigned long pages; + struct file *file; }; -/* the main bitmap structure - one per mddev */ -struct bitmap { +struct bitmap_operations { + bool (*enabled)(struct mddev *mddev); + int (*create)(struct mddev *mddev, int slot); + int (*resize)(struct mddev *mddev, sector_t blocks, int chunksize, + bool init); - struct bitmap_counts { - spinlock_t lock; - struct bitmap_page *bp; - unsigned long pages; /* total number of pages - * in the bitmap */ - unsigned long missing_pages; /* number of pages - * not yet allocated */ - unsigned long chunkshift; /* chunksize = 2^chunkshift - * (for bitops) */ - unsigned long chunks; /* Total number of data - * chunks for the array */ - } counts; + int (*load)(struct mddev *mddev); + void (*destroy)(struct mddev *mddev); + void (*flush)(struct mddev *mddev); + void (*write_all)(struct mddev *mddev); + void (*dirty_bits)(struct mddev *mddev, unsigned long s, + unsigned long e); + void (*unplug)(struct mddev *mddev, bool sync); + void (*daemon_work)(struct mddev *mddev); + void (*wait_behind_writes)(struct mddev *mddev); - struct mddev *mddev; /* the md device that the bitmap is for */ + int (*startwrite)(struct mddev *mddev, sector_t offset, + unsigned long sectors, bool behind); + void (*endwrite)(struct mddev *mddev, sector_t offset, + unsigned long sectors, bool success, bool behind); + bool (*start_sync)(struct mddev *mddev, sector_t offset, + sector_t *blocks, bool degraded); + void (*end_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks); + void (*cond_end_sync)(struct mddev *mddev, sector_t sector, bool force); + void (*close_sync)(struct mddev *mddev); - __u64 events_cleared; - int need_sync; + void (*update_sb)(void *data); + int (*get_stats)(void *data, struct md_bitmap_stats *stats); - struct bitmap_storage { - struct file *file; /* backing disk file */ - struct page *sb_page; /* cached copy of the bitmap - * file superblock */ - unsigned long sb_index; - struct page **filemap; /* list of cache pages for - * the file */ - unsigned long *filemap_attr; /* attributes associated - * w/ filemap pages */ - unsigned long file_pages; /* number of pages in the file*/ - unsigned long bytes; /* total bytes in the bitmap */ - } storage; - - unsigned long flags; - - int allclean; - - atomic_t behind_writes; - unsigned long behind_writes_used; /* highest actual value at runtime */ - - /* - * the bitmap daemon - periodically wakes up and sweeps the bitmap - * file, cleaning up bits and flushing out pages to disk as necessary - */ - unsigned long daemon_lastrun; /* jiffies of last run */ - unsigned long last_end_sync; /* when we lasted called end_sync to - * update bitmap with resync progress */ - - atomic_t pending_writes; /* pending writes to the bitmap file */ - wait_queue_head_t write_wait; - wait_queue_head_t overflow_wait; - wait_queue_head_t behind_wait; - - struct kernfs_node *sysfs_can_clear; - int cluster_slot; /* Slot offset for clustered env */ + void (*sync_with_cluster)(struct mddev *mddev, + sector_t old_lo, sector_t old_hi, + sector_t new_lo, sector_t new_hi); + void *(*get_from_slot)(struct mddev *mddev, int slot); + int (*copy_from_slot)(struct mddev *mddev, int slot, sector_t *lo, + sector_t *hi, bool clear_bits); + void (*set_pages)(void *data, unsigned long pages); + void (*free)(void *data); }; /* the bitmap API */ - -/* these are used only by md/bitmap */ -struct bitmap *md_bitmap_create(struct mddev *mddev, int slot); -int md_bitmap_load(struct mddev *mddev); -void md_bitmap_flush(struct mddev *mddev); -void md_bitmap_destroy(struct mddev *mddev); - -void md_bitmap_print_sb(struct bitmap *bitmap); -void md_bitmap_update_sb(struct bitmap *bitmap); -void md_bitmap_status(struct seq_file *seq, struct bitmap *bitmap); - -int md_bitmap_setallbits(struct bitmap *bitmap); -void md_bitmap_write_all(struct bitmap *bitmap); - -void md_bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e); - -/* these are exported */ -int md_bitmap_startwrite(struct bitmap *bitmap, sector_t offset, - unsigned long sectors, int behind); -void md_bitmap_endwrite(struct bitmap *bitmap, sector_t offset, - unsigned long sectors, int success, int behind); -int md_bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int degraded); -void md_bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted); -void md_bitmap_close_sync(struct bitmap *bitmap); -void md_bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force); -void md_bitmap_sync_with_cluster(struct mddev *mddev, - sector_t old_lo, sector_t old_hi, - sector_t new_lo, sector_t new_hi); - -void md_bitmap_unplug(struct bitmap *bitmap); -void md_bitmap_unplug_async(struct bitmap *bitmap); -void md_bitmap_daemon_work(struct mddev *mddev); - -int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, - int chunksize, int init); -struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot); -int md_bitmap_copy_from_slot(struct mddev *mddev, int slot, - sector_t *lo, sector_t *hi, bool clear_bits); -void md_bitmap_free(struct bitmap *bitmap); -void md_bitmap_wait_behind_writes(struct mddev *mddev); - -static inline bool md_bitmap_enabled(struct bitmap *bitmap) -{ - return bitmap && bitmap->storage.filemap && - !test_bit(BITMAP_STALE, &bitmap->flags); -} - -#endif +void mddev_set_bitmap_ops(struct mddev *mddev); #endif diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 1d0db62f0351..6595f89becdb 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -317,7 +317,7 @@ static void recover_bitmaps(struct md_thread *thread) str, ret); goto clear_bit; } - ret = md_bitmap_copy_from_slot(mddev, slot, &lo, &hi, true); + ret = mddev->bitmap_ops->copy_from_slot(mddev, slot, &lo, &hi, true); if (ret) { pr_err("md-cluster: Could not copy data from bitmap %d\n", slot); goto clear_bit; @@ -497,8 +497,8 @@ static void process_suspend_info(struct mddev *mddev, * we don't want to trigger lots of WARN. */ if (sb && !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) - md_bitmap_sync_with_cluster(mddev, cinfo->sync_low, - cinfo->sync_hi, lo, hi); + mddev->bitmap_ops->sync_with_cluster(mddev, cinfo->sync_low, + cinfo->sync_hi, lo, hi); cinfo->sync_low = lo; cinfo->sync_hi = hi; @@ -628,8 +628,9 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) break; case BITMAP_RESIZE: if (le64_to_cpu(msg->high) != mddev->pers->size(mddev, 0, 0)) - ret = md_bitmap_resize(mddev->bitmap, - le64_to_cpu(msg->high), 0, 0); + ret = mddev->bitmap_ops->resize(mddev, + le64_to_cpu(msg->high), + 0, false); break; default: ret = -1; @@ -856,7 +857,7 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots) } /* Read the disk bitmap sb and check if it needs recovery */ - ret = md_bitmap_copy_from_slot(mddev, i, &lo, &hi, false); + ret = mddev->bitmap_ops->copy_from_slot(mddev, i, &lo, &hi, false); if (ret) { pr_warn("md-cluster: Could not gather bitmaps from slot %d", i); lockres_free(bm_lockres); @@ -1143,13 +1144,16 @@ static int update_bitmap_size(struct mddev *mddev, sector_t size) static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsize) { - struct bitmap_counts *counts; - char str[64]; - struct dlm_lock_resource *bm_lockres; - struct bitmap *bitmap = mddev->bitmap; - unsigned long my_pages = bitmap->counts.pages; + void *bitmap = mddev->bitmap; + struct md_bitmap_stats stats; + unsigned long my_pages; int i, rv; + rv = mddev->bitmap_ops->get_stats(bitmap, &stats); + if (rv) + return rv; + + my_pages = stats.pages; /* * We need to ensure all the nodes can grow to a larger * bitmap size before make the reshaping. @@ -1159,17 +1163,22 @@ static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsiz return rv; for (i = 0; i < mddev->bitmap_info.nodes; i++) { + struct dlm_lock_resource *bm_lockres; + char str[64]; + if (i == md_cluster_ops->slot_number(mddev)) continue; - bitmap = get_bitmap_from_slot(mddev, i); + bitmap = mddev->bitmap_ops->get_from_slot(mddev, i); if (IS_ERR(bitmap)) { pr_err("can't get bitmap from slot %d\n", i); bitmap = NULL; goto out; } - counts = &bitmap->counts; + rv = mddev->bitmap_ops->get_stats(bitmap, &stats); + if (rv) + goto out; /* * If we can hold the bitmap lock of one node then * the slot is not occupied, update the pages. @@ -1183,21 +1192,21 @@ static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsiz bm_lockres->flags |= DLM_LKF_NOQUEUE; rv = dlm_lock_sync(bm_lockres, DLM_LOCK_PW); if (!rv) - counts->pages = my_pages; + mddev->bitmap_ops->set_pages(bitmap, my_pages); lockres_free(bm_lockres); - if (my_pages != counts->pages) + if (my_pages != stats.pages) /* * Let's revert the bitmap size if one node * can't resize bitmap */ goto out; - md_bitmap_free(bitmap); + mddev->bitmap_ops->free(bitmap); } return 0; out: - md_bitmap_free(bitmap); + mddev->bitmap_ops->free(bitmap); update_bitmap_size(mddev, oldsize); return -1; } @@ -1207,24 +1216,27 @@ out: */ static int cluster_check_sync_size(struct mddev *mddev) { - int i, rv; - bitmap_super_t *sb; - unsigned long my_sync_size, sync_size = 0; - int node_num = mddev->bitmap_info.nodes; int current_slot = md_cluster_ops->slot_number(mddev); - struct bitmap *bitmap = mddev->bitmap; - char str[64]; + int node_num = mddev->bitmap_info.nodes; struct dlm_lock_resource *bm_lockres; + struct md_bitmap_stats stats; + void *bitmap = mddev->bitmap; + unsigned long sync_size = 0; + unsigned long my_sync_size; + char str[64]; + int i, rv; - sb = kmap_atomic(bitmap->storage.sb_page); - my_sync_size = sb->sync_size; - kunmap_atomic(sb); + rv = mddev->bitmap_ops->get_stats(bitmap, &stats); + if (rv) + return rv; + + my_sync_size = stats.sync_size; for (i = 0; i < node_num; i++) { if (i == current_slot) continue; - bitmap = get_bitmap_from_slot(mddev, i); + bitmap = mddev->bitmap_ops->get_from_slot(mddev, i); if (IS_ERR(bitmap)) { pr_err("can't get bitmap from slot %d\n", i); return -1; @@ -1238,25 +1250,28 @@ static int cluster_check_sync_size(struct mddev *mddev) bm_lockres = lockres_init(mddev, str, NULL, 1); if (!bm_lockres) { pr_err("md-cluster: Cannot initialize %s\n", str); - md_bitmap_free(bitmap); + mddev->bitmap_ops->free(bitmap); return -1; } bm_lockres->flags |= DLM_LKF_NOQUEUE; rv = dlm_lock_sync(bm_lockres, DLM_LOCK_PW); if (!rv) - md_bitmap_update_sb(bitmap); + mddev->bitmap_ops->update_sb(bitmap); lockres_free(bm_lockres); - sb = kmap_atomic(bitmap->storage.sb_page); - if (sync_size == 0) - sync_size = sb->sync_size; - else if (sync_size != sb->sync_size) { - kunmap_atomic(sb); - md_bitmap_free(bitmap); + rv = mddev->bitmap_ops->get_stats(bitmap, &stats); + if (rv) { + mddev->bitmap_ops->free(bitmap); + return rv; + } + + if (sync_size == 0) { + sync_size = stats.sync_size; + } else if (sync_size != stats.sync_size) { + mddev->bitmap_ops->free(bitmap); return -1; } - kunmap_atomic(sb); - md_bitmap_free(bitmap); + mddev->bitmap_ops->free(bitmap); } return (my_sync_size == sync_size) ? 0 : -1; @@ -1585,7 +1600,7 @@ static int gather_bitmaps(struct md_rdev *rdev) for (sn = 0; sn < mddev->bitmap_info.nodes; sn++) { if (sn == (cinfo->slot_number - 1)) continue; - err = md_bitmap_copy_from_slot(mddev, sn, &lo, &hi, false); + err = mddev->bitmap_ops->copy_from_slot(mddev, sn, &lo, &hi, false); if (err) { pr_warn("md-cluster: Could not gather bitmaps from slot %d", sn); goto out; diff --git a/drivers/md/md.c b/drivers/md/md.c index d3a837506a36..179ee4afe937 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -546,137 +546,30 @@ static int mddev_set_closing_and_sync_blockdev(struct mddev *mddev, int opener_n return 0; } -/* - * Generic flush handling for md - */ - -static void md_end_flush(struct bio *bio) -{ - struct md_rdev *rdev = bio->bi_private; - struct mddev *mddev = rdev->mddev; - - bio_put(bio); - - rdev_dec_pending(rdev, mddev); - - if (atomic_dec_and_test(&mddev->flush_pending)) - /* The pre-request flush has finished */ - queue_work(md_wq, &mddev->flush_work); -} - -static void md_submit_flush_data(struct work_struct *ws); - -static void submit_flushes(struct work_struct *ws) -{ - struct mddev *mddev = container_of(ws, struct mddev, flush_work); - struct md_rdev *rdev; - - mddev->start_flush = ktime_get_boottime(); - INIT_WORK(&mddev->flush_work, md_submit_flush_data); - atomic_set(&mddev->flush_pending, 1); - rcu_read_lock(); - rdev_for_each_rcu(rdev, mddev) - if (rdev->raid_disk >= 0 && - !test_bit(Faulty, &rdev->flags)) { - struct bio *bi; - - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - bi = bio_alloc_bioset(rdev->bdev, 0, - REQ_OP_WRITE | REQ_PREFLUSH, - GFP_NOIO, &mddev->bio_set); - bi->bi_end_io = md_end_flush; - bi->bi_private = rdev; - atomic_inc(&mddev->flush_pending); - submit_bio(bi); - rcu_read_lock(); - } - rcu_read_unlock(); - if (atomic_dec_and_test(&mddev->flush_pending)) - queue_work(md_wq, &mddev->flush_work); -} - -static void md_submit_flush_data(struct work_struct *ws) -{ - struct mddev *mddev = container_of(ws, struct mddev, flush_work); - struct bio *bio = mddev->flush_bio; - - /* - * must reset flush_bio before calling into md_handle_request to avoid a - * deadlock, because other bios passed md_handle_request suspend check - * could wait for this and below md_handle_request could wait for those - * bios because of suspend check - */ - spin_lock_irq(&mddev->lock); - mddev->prev_flush_start = mddev->start_flush; - mddev->flush_bio = NULL; - spin_unlock_irq(&mddev->lock); - wake_up(&mddev->sb_wait); - - if (bio->bi_iter.bi_size == 0) { - /* an empty barrier - all done */ - bio_endio(bio); - } else { - bio->bi_opf &= ~REQ_PREFLUSH; - - /* - * make_requst() will never return error here, it only - * returns error in raid5_make_request() by dm-raid. - * Since dm always splits data and flush operation into - * two separate io, io size of flush submitted by dm - * always is 0, make_request() will not be called here. - */ - if (WARN_ON_ONCE(!mddev->pers->make_request(mddev, bio))) - bio_io_error(bio); - } - - /* The pair is percpu_ref_get() from md_flush_request() */ - percpu_ref_put(&mddev->active_io); -} - -/* - * Manages consolidation of flushes and submitting any flushes needed for - * a bio with REQ_PREFLUSH. Returns true if the bio is finished or is - * being finished in another context. Returns false if the flushing is - * complete but still needs the I/O portion of the bio to be processed. - */ bool md_flush_request(struct mddev *mddev, struct bio *bio) { - ktime_t req_start = ktime_get_boottime(); - spin_lock_irq(&mddev->lock); - /* flush requests wait until ongoing flush completes, - * hence coalescing all the pending requests. + struct md_rdev *rdev; + struct bio *new; + + /* + * md_flush_reqeust() should be called under md_handle_request() and + * 'active_io' is already grabbed. Hence it's safe to get rdev directly + * without rcu protection. */ - wait_event_lock_irq(mddev->sb_wait, - !mddev->flush_bio || - ktime_before(req_start, mddev->prev_flush_start), - mddev->lock); - /* new request after previous flush is completed */ - if (ktime_after(req_start, mddev->prev_flush_start)) { - WARN_ON(mddev->flush_bio); - /* - * Grab a reference to make sure mddev_suspend() will wait for - * this flush to be done. - * - * md_flush_reqeust() is called under md_handle_request() and - * 'active_io' is already grabbed, hence percpu_ref_is_zero() - * won't pass, percpu_ref_tryget_live() can't be used because - * percpu_ref_kill() can be called by mddev_suspend() - * concurrently. - */ - WARN_ON(percpu_ref_is_zero(&mddev->active_io)); - percpu_ref_get(&mddev->active_io); - mddev->flush_bio = bio; - spin_unlock_irq(&mddev->lock); - INIT_WORK(&mddev->flush_work, submit_flushes); - queue_work(md_wq, &mddev->flush_work); - return true; + WARN_ON(percpu_ref_is_zero(&mddev->active_io)); + + rdev_for_each(rdev, mddev) { + if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags)) + continue; + + new = bio_alloc_bioset(rdev->bdev, 0, + REQ_OP_WRITE | REQ_PREFLUSH, GFP_NOIO, + &mddev->bio_set); + bio_chain(new, bio); + submit_bio(new); } - /* flush was performed for some other bio while we waited. */ - spin_unlock_irq(&mddev->lock); - if (bio->bi_iter.bi_size == 0) { - /* pure flush without data - all done */ + if (bio_sectors(bio) == 0) { bio_endio(bio); return true; } @@ -763,7 +656,6 @@ int mddev_init(struct mddev *mddev) atomic_set(&mddev->openers, 0); atomic_set(&mddev->sync_seq, 0); spin_lock_init(&mddev->lock); - atomic_set(&mddev->flush_pending, 0); init_waitqueue_head(&mddev->sb_wait); init_waitqueue_head(&mddev->recovery_wait); mddev->reshape_position = MaxSector; @@ -772,6 +664,7 @@ int mddev_init(struct mddev *mddev) mddev->resync_min = 0; mddev->resync_max = MaxSector; mddev->level = LEVEL_NONE; + mddev_set_bitmap_ops(mddev); INIT_WORK(&mddev->sync_work, md_start_sync); INIT_WORK(&mddev->del_work, mddev_delayed_delete); @@ -1372,6 +1265,18 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor return ret; } +static u64 md_bitmap_events_cleared(struct mddev *mddev) +{ + struct md_bitmap_stats stats; + int err; + + err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); + if (err) + return 0; + + return stats.events_cleared; +} + /* * validate_super for 0.90.0 * note: we are not using "freshest" for 0.9 superblock @@ -1464,7 +1369,7 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, stru /* if adding to array with a bitmap, then we can accept an * older device ... but not too old. */ - if (ev1 < mddev->bitmap->events_cleared) + if (ev1 < md_bitmap_events_cleared(mddev)) return 0; if (ev1 < mddev->events) set_bit(Bitmap_sync, &rdev->flags); @@ -1991,7 +1896,7 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struc /* If adding to array with a bitmap, then we can accept an * older device, but not too old. */ - if (ev1 < mddev->bitmap->events_cleared) + if (ev1 < md_bitmap_events_cleared(mddev)) return 0; if (ev1 < mddev->events) set_bit(Bitmap_sync, &rdev->flags); @@ -2323,7 +2228,6 @@ super_1_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset) { /* All necessary checks on new >= old have been done */ - struct bitmap *bitmap; if (new_offset >= rdev->data_offset) return 1; @@ -2340,11 +2244,18 @@ super_1_allow_new_offset(struct md_rdev *rdev, */ if (rdev->sb_start + (32+4)*2 > new_offset) return 0; - bitmap = rdev->mddev->bitmap; - if (bitmap && !rdev->mddev->bitmap_info.file && - rdev->sb_start + rdev->mddev->bitmap_info.offset + - bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset) - return 0; + + if (!rdev->mddev->bitmap_info.file) { + struct mddev *mddev = rdev->mddev; + struct md_bitmap_stats stats; + int err; + + err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); + if (!err && rdev->sb_start + mddev->bitmap_info.offset + + stats.file_pages * (PAGE_SIZE >> 9) > new_offset) + return 0; + } + if (rdev->badblocks.sector + rdev->badblocks.size > new_offset) return 0; @@ -2820,7 +2731,7 @@ repeat: mddev_add_trace_msg(mddev, "md md_update_sb"); rewrite: - md_bitmap_update_sb(mddev->bitmap); + mddev->bitmap_ops->update_sb(mddev->bitmap); rdev_for_each(rdev, mddev) { if (rdev->sb_loaded != 1) continue; /* no noise on spare devices */ @@ -4141,6 +4052,34 @@ out_unlock: static struct md_sysfs_entry md_level = __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); +static ssize_t +new_level_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%d\n", mddev->new_level); +} + +static ssize_t +new_level_store(struct mddev *mddev, const char *buf, size_t len) +{ + unsigned int n; + int err; + + err = kstrtouint(buf, 10, &n); + if (err < 0) + return err; + err = mddev_lock(mddev); + if (err) + return err; + + mddev->new_level = n; + md_update_sb(mddev, 1); + + mddev_unlock(mddev); + return len; +} +static struct md_sysfs_entry md_new_level = +__ATTR(new_level, 0664, new_level_show, new_level_store); + static ssize_t layout_show(struct mddev *mddev, char *page) { @@ -4680,17 +4619,23 @@ bitmap_store(struct mddev *mddev, const char *buf, size_t len) /* buf should be ... or - ... (range) */ while (*buf) { chunk = end_chunk = simple_strtoul(buf, &end, 0); - if (buf == end) break; + if (buf == end) + break; + if (*end == '-') { /* range */ buf = end + 1; end_chunk = simple_strtoul(buf, &end, 0); - if (buf == end) break; + if (buf == end) + break; } - if (*end && !isspace(*end)) break; - md_bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); + + if (*end && !isspace(*end)) + break; + + mddev->bitmap_ops->dirty_bits(mddev, chunk, end_chunk); buf = skip_spaces(end); } - md_bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ + mddev->bitmap_ops->unplug(mddev, true); /* flush the bits to disk */ out: mddev_unlock(mddev); return len; @@ -5666,6 +5611,7 @@ __ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show, static struct attribute *md_default_attrs[] = { &md_level.attr, + &md_new_level.attr, &md_layout.attr, &md_raid_disks.attr, &md_uuid.attr, @@ -6206,16 +6152,10 @@ int md_run(struct mddev *mddev) } if (err == 0 && pers->sync_request && (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { - struct bitmap *bitmap; - - bitmap = md_bitmap_create(mddev, -1); - if (IS_ERR(bitmap)) { - err = PTR_ERR(bitmap); + err = mddev->bitmap_ops->create(mddev, -1); + if (err) pr_warn("%s: failed to create bitmap (%d)\n", mdname(mddev), err); - } else - mddev->bitmap = bitmap; - } if (err) goto bitmap_abort; @@ -6285,7 +6225,7 @@ bitmap_abort: pers->free(mddev, mddev->private); mddev->private = NULL; module_put(pers->owner); - md_bitmap_destroy(mddev); + mddev->bitmap_ops->destroy(mddev); abort: bioset_exit(&mddev->io_clone_set); exit_sync_set: @@ -6304,9 +6244,10 @@ int do_md_run(struct mddev *mddev) err = md_run(mddev); if (err) goto out; - err = md_bitmap_load(mddev); + + err = mddev->bitmap_ops->load(mddev); if (err) { - md_bitmap_destroy(mddev); + mddev->bitmap_ops->destroy(mddev); goto out; } @@ -6450,7 +6391,8 @@ static void __md_stop_writes(struct mddev *mddev) mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); } - md_bitmap_flush(mddev); + + mddev->bitmap_ops->flush(mddev); if (md_is_rdwr(mddev) && ((!mddev->in_sync && !mddev_is_clustered(mddev)) || @@ -6477,7 +6419,7 @@ EXPORT_SYMBOL_GPL(md_stop_writes); static void mddev_detach(struct mddev *mddev) { - md_bitmap_wait_behind_writes(mddev); + mddev->bitmap_ops->wait_behind_writes(mddev); if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) { mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); @@ -6492,7 +6434,8 @@ static void mddev_detach(struct mddev *mddev) static void __md_stop(struct mddev *mddev) { struct md_personality *pers = mddev->pers; - md_bitmap_destroy(mddev); + + mddev->bitmap_ops->destroy(mddev); mddev_detach(mddev); spin_lock(&mddev->lock); mddev->pers = NULL; @@ -7270,22 +7213,19 @@ static int set_bitmap_file(struct mddev *mddev, int fd) err = 0; if (mddev->pers) { if (fd >= 0) { - struct bitmap *bitmap; + err = mddev->bitmap_ops->create(mddev, -1); + if (!err) + err = mddev->bitmap_ops->load(mddev); - bitmap = md_bitmap_create(mddev, -1); - if (!IS_ERR(bitmap)) { - mddev->bitmap = bitmap; - err = md_bitmap_load(mddev); - } else - err = PTR_ERR(bitmap); if (err) { - md_bitmap_destroy(mddev); + mddev->bitmap_ops->destroy(mddev); fd = -1; } } else if (fd < 0) { - md_bitmap_destroy(mddev); + mddev->bitmap_ops->destroy(mddev); } } + if (fd < 0) { struct file *f = mddev->bitmap_info.file; if (f) { @@ -7554,7 +7494,6 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) goto err; } if (info->state & (1<bitmap) { rv = -EEXIST; @@ -7568,24 +7507,24 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->bitmap_info.default_offset; mddev->bitmap_info.space = mddev->bitmap_info.default_space; - bitmap = md_bitmap_create(mddev, -1); - if (!IS_ERR(bitmap)) { - mddev->bitmap = bitmap; - rv = md_bitmap_load(mddev); - } else - rv = PTR_ERR(bitmap); + rv = mddev->bitmap_ops->create(mddev, -1); + if (!rv) + rv = mddev->bitmap_ops->load(mddev); + if (rv) - md_bitmap_destroy(mddev); + mddev->bitmap_ops->destroy(mddev); } else { - /* remove the bitmap */ - if (!mddev->bitmap) { - rv = -ENOENT; + struct md_bitmap_stats stats; + + rv = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); + if (rv) goto err; - } - if (mddev->bitmap->storage.file) { + + if (stats.file) { rv = -EINVAL; goto err; } + if (mddev->bitmap_info.nodes) { /* hold PW on all the bitmap lock */ if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) { @@ -7600,7 +7539,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) module_put(md_cluster_mod); mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; } - md_bitmap_destroy(mddev); + mddev->bitmap_ops->destroy(mddev); mddev->bitmap_info.offset = 0; } } @@ -8370,6 +8309,33 @@ static void md_seq_stop(struct seq_file *seq, void *v) spin_unlock(&all_mddevs_lock); } +static void md_bitmap_status(struct seq_file *seq, struct mddev *mddev) +{ + struct md_bitmap_stats stats; + unsigned long used_pages; + unsigned long chunk_kb; + int err; + + err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); + if (err) + return; + + chunk_kb = mddev->bitmap_info.chunksize >> 10; + used_pages = stats.pages - stats.missing_pages; + + seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], %lu%s chunk", + used_pages, stats.pages, used_pages << (PAGE_SHIFT - 10), + chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize, + chunk_kb ? "KB" : "B"); + + if (stats.file) { + seq_puts(seq, ", file: "); + seq_file_path(seq, stats.file, " \t\n"); + } + + seq_putc(seq, '\n'); +} + static int md_seq_show(struct seq_file *seq, void *v) { struct mddev *mddev; @@ -8390,14 +8356,19 @@ static int md_seq_show(struct seq_file *seq, void *v) spin_unlock(&all_mddevs_lock); spin_lock(&mddev->lock); if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { - seq_printf(seq, "%s : %sactive", mdname(mddev), - mddev->pers ? "" : "in"); + seq_printf(seq, "%s : ", mdname(mddev)); if (mddev->pers) { + if (test_bit(MD_BROKEN, &mddev->flags)) + seq_printf(seq, "broken"); + else + seq_printf(seq, "active"); if (mddev->ro == MD_RDONLY) seq_printf(seq, " (read-only)"); if (mddev->ro == MD_AUTO_READ) seq_printf(seq, " (auto-read-only)"); seq_printf(seq, " %s", mddev->pers->name); + } else { + seq_printf(seq, "inactive"); } sectors = 0; @@ -8453,7 +8424,7 @@ static int md_seq_show(struct seq_file *seq, void *v) } else seq_printf(seq, "\n "); - md_bitmap_status(seq, mddev->bitmap); + md_bitmap_status(seq, mddev); seq_printf(seq, "\n"); } @@ -8668,7 +8639,6 @@ void md_write_start(struct mddev *mddev, struct bio *bi) BUG_ON(mddev->ro == MD_RDONLY); if (mddev->ro == MD_AUTO_READ) { /* need to switch to read/write */ - flush_work(&mddev->sync_work); mddev->ro = MD_RDWR; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); @@ -9506,7 +9476,7 @@ static void md_start_sync(struct work_struct *ws) * stored on all devices. So make sure all bitmap pages get written. */ if (spares) - md_bitmap_write_all(mddev->bitmap); + mddev->bitmap_ops->write_all(mddev); name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ? "reshape" : "resync"; @@ -9594,7 +9564,7 @@ static void unregister_sync_thread(struct mddev *mddev) void md_check_recovery(struct mddev *mddev) { if (mddev->bitmap) - md_bitmap_daemon_work(mddev); + mddev->bitmap_ops->daemon_work(mddev); if (signal_pending(current)) { if (mddev->pers->sync_request && !mddev->external) { @@ -9965,7 +9935,7 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) if (ret) pr_info("md-cluster: resize failed\n"); else - md_bitmap_update_sb(mddev->bitmap); + mddev->bitmap_ops->update_sb(mddev->bitmap); } /* Check for change of roles in the active devices */ diff --git a/drivers/md/md.h b/drivers/md/md.h index a0d6827dced9..5d2e6bd58e4d 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -535,7 +535,8 @@ struct mddev { struct percpu_ref writes_pending; int sync_checkers; /* # of threads checking writes_pending */ - struct bitmap *bitmap; /* the bitmap for the device */ + void *bitmap; /* the bitmap for the device */ + struct bitmap_operations *bitmap_ops; struct { struct file *file; /* the bitmap file */ loff_t offset; /* offset from superblock of @@ -571,16 +572,6 @@ struct mddev { */ struct bio_set io_clone_set; - /* Generic flush handling. - * The last to finish preflush schedules a worker to submit - * the rest of the request (without the REQ_PREFLUSH flag). - */ - struct bio *flush_bio; - atomic_t flush_pending; - ktime_t start_flush, prev_flush_start; /* prev_flush_start is when the previous completed - * flush was started. - */ - struct work_struct flush_work; struct work_struct event_work; /* used by dm to report failure event */ mempool_t *serial_info_pool; void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c index 2ea1710a3b70..4378d3250bd7 100644 --- a/drivers/md/raid1-10.c +++ b/drivers/md/raid1-10.c @@ -140,7 +140,7 @@ static inline bool raid1_add_bio_to_plug(struct mddev *mddev, struct bio *bio, * If bitmap is not enabled, it's safe to submit the io directly, and * this can get optimal performance. */ - if (!md_bitmap_enabled(mddev->bitmap)) { + if (!mddev->bitmap_ops->enabled(mddev)) { raid1_submit_write(bio); return true; } @@ -166,12 +166,9 @@ static inline bool raid1_add_bio_to_plug(struct mddev *mddev, struct bio *bio, * while current io submission must wait for bitmap io to be done. In order to * avoid such deadlock, submit bitmap io asynchronously. */ -static inline void raid1_prepare_flush_writes(struct bitmap *bitmap) +static inline void raid1_prepare_flush_writes(struct mddev *mddev) { - if (current->bio_list) - md_bitmap_unplug_async(bitmap); - else - md_bitmap_unplug(bitmap); + mddev->bitmap_ops->unplug(mddev, current->bio_list == NULL); } /* diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 761989d67906..6c9d24203f39 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -411,18 +411,20 @@ static void raid1_end_read_request(struct bio *bio) static void close_write(struct r1bio *r1_bio) { + struct mddev *mddev = r1_bio->mddev; + /* it really is the end of this request */ if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { bio_free_pages(r1_bio->behind_master_bio); bio_put(r1_bio->behind_master_bio); r1_bio->behind_master_bio = NULL; } + /* clear the bitmap if all writes complete successfully */ - md_bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, - r1_bio->sectors, - !test_bit(R1BIO_Degraded, &r1_bio->state), - test_bit(R1BIO_BehindIO, &r1_bio->state)); - md_write_end(r1_bio->mddev); + mddev->bitmap_ops->endwrite(mddev, r1_bio->sector, r1_bio->sectors, + !test_bit(R1BIO_Degraded, &r1_bio->state), + test_bit(R1BIO_BehindIO, &r1_bio->state)); + md_write_end(mddev); } static void r1_bio_write_done(struct r1bio *r1_bio) @@ -900,7 +902,7 @@ static void wake_up_barrier(struct r1conf *conf) static void flush_bio_list(struct r1conf *conf, struct bio *bio) { /* flush any pending bitmap writes to disk before proceeding w/ I/O */ - raid1_prepare_flush_writes(conf->mddev->bitmap); + raid1_prepare_flush_writes(conf->mddev); wake_up_barrier(conf); while (bio) { /* submit pending writes */ @@ -1317,13 +1319,11 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, struct r1conf *conf = mddev->private; struct raid1_info *mirror; struct bio *read_bio; - struct bitmap *bitmap = mddev->bitmap; const enum req_op op = bio_op(bio); const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC; int max_sectors; int rdisk; bool r1bio_existed = !!r1_bio; - char b[BDEVNAME_SIZE]; /* * If r1_bio is set, we are blocking the raid1d thread @@ -1332,16 +1332,6 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, */ gfp_t gfp = r1_bio ? (GFP_NOIO | __GFP_HIGH) : GFP_NOIO; - if (r1bio_existed) { - /* Need to get the block device name carefully */ - struct md_rdev *rdev = conf->mirrors[r1_bio->read_disk].rdev; - - if (rdev) - snprintf(b, sizeof(b), "%pg", rdev->bdev); - else - strcpy(b, "???"); - } - /* * Still need barrier for READ in case that whole * array is frozen. @@ -1363,15 +1353,13 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, * used and no empty request is available. */ rdisk = read_balance(conf, r1_bio, &max_sectors); - if (rdisk < 0) { /* couldn't find anywhere to read from */ - if (r1bio_existed) { - pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n", + if (r1bio_existed) + pr_crit_ratelimited("md/raid1:%s: %pg: unrecoverable I/O read error for block %llu\n", mdname(mddev), - b, - (unsigned long long)r1_bio->sector); - } + conf->mirrors[r1_bio->read_disk].rdev->bdev, + r1_bio->sector); raid_end_bio_io(r1_bio); return; } @@ -1383,15 +1371,13 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, (unsigned long long)r1_bio->sector, mirror->rdev->bdev); - if (test_bit(WriteMostly, &mirror->rdev->flags) && - bitmap) { + if (test_bit(WriteMostly, &mirror->rdev->flags)) { /* * Reading from a write-mostly device must take care not to * over-take any writes that are 'behind' */ mddev_add_trace_msg(mddev, "raid1 wait behind writes"); - wait_event(bitmap->behind_wait, - atomic_read(&bitmap->behind_writes) == 0); + mddev->bitmap_ops->wait_behind_writes(mddev); } if (max_sectors < bio_sectors(bio)) { @@ -1432,7 +1418,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, struct r1conf *conf = mddev->private; struct r1bio *r1_bio; int i, disks; - struct bitmap *bitmap = mddev->bitmap; unsigned long flags; struct md_rdev *blocked_rdev; int first_clone; @@ -1585,7 +1570,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, * at a time and thus needs a new bio that can fit the whole payload * this bio in page sized chunks. */ - if (write_behind && bitmap) + if (write_behind && mddev->bitmap) max_sectors = min_t(int, max_sectors, BIO_MAX_VECS * (PAGE_SIZE >> 9)); if (max_sectors < bio_sectors(bio)) { @@ -1612,19 +1597,23 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, continue; if (first_clone) { + unsigned long max_write_behind = + mddev->bitmap_info.max_write_behind; + struct md_bitmap_stats stats; + int err; + /* do behind I/O ? * Not if there are too many, or cannot * allocate memory, or a reader on WriteMostly * is waiting for behind writes to flush */ - if (bitmap && write_behind && - (atomic_read(&bitmap->behind_writes) - < mddev->bitmap_info.max_write_behind) && - !waitqueue_active(&bitmap->behind_wait)) { + err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); + if (!err && write_behind && !stats.behind_wait && + stats.behind_writes < max_write_behind) alloc_behind_master_bio(r1_bio, bio); - } - md_bitmap_startwrite(bitmap, r1_bio->sector, r1_bio->sectors, - test_bit(R1BIO_BehindIO, &r1_bio->state)); + mddev->bitmap_ops->startwrite( + mddev, r1_bio->sector, r1_bio->sectors, + test_bit(R1BIO_BehindIO, &r1_bio->state)); first_clone = 0; } @@ -2042,7 +2031,7 @@ static void abort_sync_write(struct mddev *mddev, struct r1bio *r1_bio) /* make sure these bits don't get cleared. */ do { - md_bitmap_end_sync(mddev->bitmap, s, &sync_blocks, 1); + mddev->bitmap_ops->end_sync(mddev, s, &sync_blocks); s += sync_blocks; sectors_to_go -= sync_blocks; } while (sectors_to_go > 0); @@ -2771,7 +2760,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, int wonly = -1; int write_targets = 0, read_targets = 0; sector_t sync_blocks; - int still_degraded = 0; + bool still_degraded = false; int good_sectors = RESYNC_SECTORS; int min_bad = 0; /* number of sectors that are bad in all devices */ int idx = sector_to_idx(sector_nr); @@ -2788,12 +2777,12 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, * We can find the current addess in mddev->curr_resync */ if (mddev->curr_resync < max_sector) /* aborted */ - md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync, - &sync_blocks, 1); + mddev->bitmap_ops->end_sync(mddev, mddev->curr_resync, + &sync_blocks); else /* completed sync */ conf->fullsync = 0; - md_bitmap_close_sync(mddev->bitmap); + mddev->bitmap_ops->close_sync(mddev); close_sync(conf); if (mddev_is_clustered(mddev)) { @@ -2813,7 +2802,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, /* before building a request, check if we can skip these blocks.. * This call the bitmap_start_sync doesn't actually record anything */ - if (!md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && + if (!mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks, true) && !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { /* We can skip this block, and probably several more */ *skipped = 1; @@ -2831,9 +2820,9 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, * sector_nr + two times RESYNC_SECTORS */ - md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, - mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high)); - + mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, + mddev_is_clustered(mddev) && + (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high)); if (raise_barrier(conf, sector_nr)) return 0; @@ -2864,7 +2853,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, if (rdev == NULL || test_bit(Faulty, &rdev->flags)) { if (i < conf->raid_disks) - still_degraded = 1; + still_degraded = true; } else if (!test_bit(In_sync, &rdev->flags)) { bio->bi_opf = REQ_OP_WRITE; bio->bi_end_io = end_sync_write; @@ -2988,8 +2977,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, if (len == 0) break; if (sync_blocks == 0) { - if (!md_bitmap_start_sync(mddev->bitmap, sector_nr, - &sync_blocks, still_degraded) && + if (!mddev->bitmap_ops->start_sync(mddev, sector_nr, + &sync_blocks, still_degraded) && !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) break; @@ -3313,14 +3302,16 @@ static int raid1_resize(struct mddev *mddev, sector_t sectors) * worth it. */ sector_t newsize = raid1_size(mddev, sectors, 0); + int ret; + if (mddev->external_size && mddev->array_sectors > newsize) return -EINVAL; - if (mddev->bitmap) { - int ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0); - if (ret) - return ret; - } + + ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false); + if (ret) + return ret; + md_set_array_sectors(mddev, newsize); if (sectors > mddev->dev_sectors && mddev->recovery_cp > mddev->dev_sectors) { diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 2a9c4ee982e0..f3bf1116794a 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -426,12 +426,13 @@ static void raid10_end_read_request(struct bio *bio) static void close_write(struct r10bio *r10_bio) { + struct mddev *mddev = r10_bio->mddev; + /* clear the bitmap if all writes complete successfully */ - md_bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector, - r10_bio->sectors, - !test_bit(R10BIO_Degraded, &r10_bio->state), - 0); - md_write_end(r10_bio->mddev); + mddev->bitmap_ops->endwrite(mddev, r10_bio->sector, r10_bio->sectors, + !test_bit(R10BIO_Degraded, &r10_bio->state), + false); + md_write_end(mddev); } static void one_write_done(struct r10bio *r10_bio) @@ -884,7 +885,7 @@ static void flush_pending_writes(struct r10conf *conf) __set_current_state(TASK_RUNNING); blk_start_plug(&plug); - raid1_prepare_flush_writes(conf->mddev->bitmap); + raid1_prepare_flush_writes(conf->mddev); wake_up(&conf->wait_barrier); while (bio) { /* submit pending writes */ @@ -1100,7 +1101,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) /* we aren't scheduling, so we can do the write-out directly. */ bio = bio_list_get(&plug->pending); - raid1_prepare_flush_writes(mddev->bitmap); + raid1_prepare_flush_writes(mddev); wake_up_barrier(conf); while (bio) { /* submit pending writes */ @@ -1492,7 +1493,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, md_account_bio(mddev, &bio); r10_bio->master_bio = bio; atomic_set(&r10_bio->remaining, 1); - md_bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0); + mddev->bitmap_ops->startwrite(mddev, r10_bio->sector, r10_bio->sectors, + false); for (i = 0; i < conf->copies; i++) { if (r10_bio->devs[i].bio) @@ -2465,7 +2467,7 @@ static void fix_recovery_read_error(struct r10bio *r10_bio) s = PAGE_SIZE >> 9; rdev = conf->mirrors[dr].rdev; - addr = r10_bio->devs[0].addr + sect, + addr = r10_bio->devs[0].addr + sect; ok = sync_page_io(rdev, addr, s << 9, @@ -3192,13 +3194,15 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (mddev->curr_resync < max_sector) { /* aborted */ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) - md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync, - &sync_blocks, 1); + mddev->bitmap_ops->end_sync(mddev, + mddev->curr_resync, + &sync_blocks); else for (i = 0; i < conf->geo.raid_disks; i++) { sector_t sect = raid10_find_virt(conf, mddev->curr_resync, i); - md_bitmap_end_sync(mddev->bitmap, sect, - &sync_blocks, 1); + + mddev->bitmap_ops->end_sync(mddev, sect, + &sync_blocks); } } else { /* completed sync */ @@ -3218,7 +3222,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, } conf->fullsync = 0; } - md_bitmap_close_sync(mddev->bitmap); + mddev->bitmap_ops->close_sync(mddev); close_sync(conf); *skipped = 1; return sectors_skipped; @@ -3287,10 +3291,10 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, r10_bio = NULL; for (i = 0 ; i < conf->geo.raid_disks; i++) { - int still_degraded; + bool still_degraded; struct r10bio *rb2; sector_t sect; - int must_sync; + bool must_sync; int any_working; struct raid10_info *mirror = &conf->mirrors[i]; struct md_rdev *mrdev, *mreplace; @@ -3307,7 +3311,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (!mrdev && !mreplace) continue; - still_degraded = 0; + still_degraded = false; /* want to reconstruct this device */ rb2 = r10_bio; sect = raid10_find_virt(conf, sector_nr, i); @@ -3320,8 +3324,9 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, * we only need to recover the block if it is set in * the bitmap */ - must_sync = md_bitmap_start_sync(mddev->bitmap, sect, - &sync_blocks, 1); + must_sync = mddev->bitmap_ops->start_sync(mddev, sect, + &sync_blocks, + true); if (sync_blocks < max_sync) max_sync = sync_blocks; if (!must_sync && @@ -3359,13 +3364,13 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, struct md_rdev *rdev = conf->mirrors[j].rdev; if (rdev == NULL || test_bit(Faulty, &rdev->flags)) { - still_degraded = 1; + still_degraded = false; break; } } - must_sync = md_bitmap_start_sync(mddev->bitmap, sect, - &sync_blocks, still_degraded); + must_sync = mddev->bitmap_ops->start_sync(mddev, sect, + &sync_blocks, still_degraded); any_working = 0; for (j=0; jcopies;j++) { @@ -3538,12 +3543,13 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, * safety reason, which ensures curr_resync_completed is * updated in bitmap_cond_end_sync. */ - md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, + mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high)); - if (!md_bitmap_start_sync(mddev->bitmap, sector_nr, - &sync_blocks, mddev->degraded) && + if (!mddev->bitmap_ops->start_sync(mddev, sector_nr, + &sync_blocks, + mddev->degraded) && !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { /* We can skip this block */ @@ -4190,6 +4196,7 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors) */ struct r10conf *conf = mddev->private; sector_t oldsize, size; + int ret; if (mddev->reshape_position != MaxSector) return -EBUSY; @@ -4202,11 +4209,11 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors) if (mddev->external_size && mddev->array_sectors > size) return -EINVAL; - if (mddev->bitmap) { - int ret = md_bitmap_resize(mddev->bitmap, size, 0, 0); - if (ret) - return ret; - } + + ret = mddev->bitmap_ops->resize(mddev, size, 0, false); + if (ret) + return ret; + md_set_array_sectors(mddev, size); if (sectors > mddev->dev_sectors && mddev->recovery_cp > oldsize) { @@ -4472,7 +4479,7 @@ static int raid10_start_reshape(struct mddev *mddev) newsize = raid10_size(mddev, 0, conf->geo.raid_disks); if (!mddev_is_clustered(mddev)) { - ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0); + ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false); if (ret) goto abort; else @@ -4487,20 +4494,20 @@ static int raid10_start_reshape(struct mddev *mddev) /* * some node is already performing reshape, and no need to - * call md_bitmap_resize again since it should be called when + * call bitmap_ops->resize again since it should be called when * receiving BITMAP_RESIZE msg */ if ((sb && (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) || (oldsize == newsize)) goto out; - ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0); + ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false); if (ret) goto abort; ret = md_cluster_ops->resize_bitmaps(mddev, newsize, oldsize); if (ret) { - md_bitmap_resize(mddev->bitmap, oldsize, 0, 0); + mddev->bitmap_ops->resize(mddev, oldsize, 0, false); goto abort; } } diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 874874fe4fa1..b4f7b79fd187 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -313,10 +313,10 @@ void r5c_handle_cached_data_endio(struct r5conf *conf, if (sh->dev[i].written) { set_bit(R5_UPTODATE, &sh->dev[i].flags); r5c_return_dev_pending_writes(conf, &sh->dev[i]); - md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, - RAID5_STRIPE_SECTORS(conf), - !test_bit(STRIPE_DEGRADED, &sh->state), - 0); + conf->mddev->bitmap_ops->endwrite(conf->mddev, + sh->sector, RAID5_STRIPE_SECTORS(conf), + !test_bit(STRIPE_DEGRADED, &sh->state), + false); } } } @@ -2798,7 +2798,6 @@ void r5c_finish_stripe_write_out(struct r5conf *conf, { struct r5l_log *log = READ_ONCE(conf->log); int i; - int do_wakeup = 0; sector_t tree_index; void __rcu **pslot; uintptr_t refcount; @@ -2815,7 +2814,7 @@ void r5c_finish_stripe_write_out(struct r5conf *conf, for (i = sh->disks; i--; ) { clear_bit(R5_InJournal, &sh->dev[i].flags); if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) - do_wakeup = 1; + wake_up_bit(&sh->dev[i].flags, R5_Overlap); } /* @@ -2828,9 +2827,6 @@ void r5c_finish_stripe_write_out(struct r5conf *conf, if (atomic_dec_and_test(&conf->pending_full_writes)) md_wakeup_thread(conf->mddev->thread); - if (do_wakeup) - wake_up(&conf->wait_for_overlap); - spin_lock_irq(&log->stripe_in_journal_lock); list_del_init(&sh->r5c); spin_unlock_irq(&log->stripe_in_journal_lock); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index c14cf2410365..dc2ea636d173 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2337,7 +2337,7 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if (test_and_clear_bit(R5_Overlap, &dev->flags)) - wake_up(&sh->raid_conf->wait_for_overlap); + wake_up_bit(&dev->flags, R5_Overlap); } } local_unlock(&conf->percpu->lock); @@ -3473,7 +3473,7 @@ static bool stripe_bio_overlaps(struct stripe_head *sh, struct bio *bi, * With PPL only writes to consecutive data chunks within a * stripe are allowed because for a single stripe_head we can * only have one PPL entry at a time, which describes one data - * range. Not really an overlap, but wait_for_overlap can be + * range. Not really an overlap, but R5_Overlap can be * used to handle this. */ sector_t sector; @@ -3563,8 +3563,8 @@ static void __add_stripe_bio(struct stripe_head *sh, struct bio *bi, */ set_bit(STRIPE_BITMAP_PENDING, &sh->state); spin_unlock_irq(&sh->stripe_lock); - md_bitmap_startwrite(conf->mddev->bitmap, sh->sector, - RAID5_STRIPE_SECTORS(conf), 0); + conf->mddev->bitmap_ops->startwrite(conf->mddev, sh->sector, + RAID5_STRIPE_SECTORS(conf), false); spin_lock_irq(&sh->stripe_lock); clear_bit(STRIPE_BITMAP_PENDING, &sh->state); if (!sh->batch_head) { @@ -3652,7 +3652,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, log_stripe_write_finished(sh); if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) - wake_up(&conf->wait_for_overlap); + wake_up_bit(&sh->dev[i].flags, R5_Overlap); while (bi && bi->bi_iter.bi_sector < sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) { @@ -3663,8 +3663,9 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, bi = nextbi; } if (bitmap_end) - md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, - RAID5_STRIPE_SECTORS(conf), 0, 0); + conf->mddev->bitmap_ops->endwrite(conf->mddev, + sh->sector, RAID5_STRIPE_SECTORS(conf), + false, false); bitmap_end = 0; /* and fail all 'written' */ bi = sh->dev[i].written; @@ -3696,7 +3697,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, sh->dev[i].toread = NULL; spin_unlock_irq(&sh->stripe_lock); if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) - wake_up(&conf->wait_for_overlap); + wake_up_bit(&sh->dev[i].flags, R5_Overlap); if (bi) s->to_read--; while (bi && bi->bi_iter.bi_sector < @@ -3709,8 +3710,9 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, } } if (bitmap_end) - md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, - RAID5_STRIPE_SECTORS(conf), 0, 0); + conf->mddev->bitmap_ops->endwrite(conf->mddev, + sh->sector, RAID5_STRIPE_SECTORS(conf), + false, false); /* If we were in the middle of a write the parity block might * still be locked - so just clear all R5_LOCKED flags */ @@ -3734,7 +3736,7 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, BUG_ON(sh->batch_head); clear_bit(STRIPE_SYNCING, &sh->state); if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) - wake_up(&conf->wait_for_overlap); + wake_up_bit(&sh->dev[sh->pd_idx].flags, R5_Overlap); s->syncing = 0; s->replacing = 0; /* There is nothing more to do for sync/check/repair. @@ -4059,10 +4061,10 @@ returnbi: bio_endio(wbi); wbi = wbi2; } - md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, - RAID5_STRIPE_SECTORS(conf), - !test_bit(STRIPE_DEGRADED, &sh->state), - 0); + conf->mddev->bitmap_ops->endwrite(conf->mddev, + sh->sector, RAID5_STRIPE_SECTORS(conf), + !test_bit(STRIPE_DEGRADED, &sh->state), + false); if (head_sh->batch_head) { sh = list_first_entry(&sh->batch_list, struct stripe_head, @@ -4875,7 +4877,6 @@ static void break_stripe_batch_list(struct stripe_head *head_sh, { struct stripe_head *sh, *next; int i; - int do_wakeup = 0; list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) { @@ -4911,7 +4912,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh, spin_unlock_irq(&sh->stripe_lock); for (i = 0; i < sh->disks; i++) { if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) - do_wakeup = 1; + wake_up_bit(&sh->dev[i].flags, R5_Overlap); sh->dev[i].flags = head_sh->dev[i].flags & (~((1 << R5_WriteError) | (1 << R5_Overlap))); } @@ -4925,12 +4926,9 @@ static void break_stripe_batch_list(struct stripe_head *head_sh, spin_unlock_irq(&head_sh->stripe_lock); for (i = 0; i < head_sh->disks; i++) if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags)) - do_wakeup = 1; + wake_up_bit(&head_sh->dev[i].flags, R5_Overlap); if (head_sh->state & handle_flags) set_bit(STRIPE_HANDLE, &head_sh->state); - - if (do_wakeup) - wake_up(&head_sh->raid_conf->wait_for_overlap); } static void handle_stripe(struct stripe_head *sh) @@ -5196,7 +5194,7 @@ static void handle_stripe(struct stripe_head *sh) md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1); clear_bit(STRIPE_SYNCING, &sh->state); if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) - wake_up(&conf->wait_for_overlap); + wake_up_bit(&sh->dev[sh->pd_idx].flags, R5_Overlap); } /* If the failed drives are just a ReadError, then we might need @@ -5259,7 +5257,7 @@ static void handle_stripe(struct stripe_head *sh) } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { clear_bit(STRIPE_EXPAND_READY, &sh->state); atomic_dec(&conf->reshape_stripes); - wake_up(&conf->wait_for_overlap); + wake_up(&conf->wait_for_reshape); md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1); } @@ -5753,12 +5751,11 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) int d; again: sh = raid5_get_active_stripe(conf, NULL, logical_sector, 0); - prepare_to_wait(&conf->wait_for_overlap, &w, - TASK_UNINTERRUPTIBLE); set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); if (test_bit(STRIPE_SYNCING, &sh->state)) { raid5_release_stripe(sh); - schedule(); + wait_on_bit(&sh->dev[sh->pd_idx].flags, R5_Overlap, + TASK_UNINTERRUPTIBLE); goto again; } clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); @@ -5770,12 +5767,12 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) set_bit(R5_Overlap, &sh->dev[d].flags); spin_unlock_irq(&sh->stripe_lock); raid5_release_stripe(sh); - schedule(); + wait_on_bit(&sh->dev[d].flags, R5_Overlap, + TASK_UNINTERRUPTIBLE); goto again; } } set_bit(STRIPE_DISCARD, &sh->state); - finish_wait(&conf->wait_for_overlap, &w); sh->overwrite_disks = 0; for (d = 0; d < conf->raid_disks; d++) { if (d == sh->pd_idx || d == sh->qd_idx) @@ -5788,13 +5785,10 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) } spin_unlock_irq(&sh->stripe_lock); if (conf->mddev->bitmap) { - for (d = 0; - d < conf->raid_disks - conf->max_degraded; + for (d = 0; d < conf->raid_disks - conf->max_degraded; d++) - md_bitmap_startwrite(mddev->bitmap, - sh->sector, - RAID5_STRIPE_SECTORS(conf), - 0); + mddev->bitmap_ops->startwrite(mddev, sh->sector, + RAID5_STRIPE_SECTORS(conf), false); sh->bm_seq = conf->seq_flush + 1; set_bit(STRIPE_BIT_DELAY, &sh->state); } @@ -5855,7 +5849,6 @@ static int add_all_stripe_bios(struct r5conf *conf, struct bio *bi, int forwrite, int previous) { int dd_idx; - int ret = 1; spin_lock_irq(&sh->stripe_lock); @@ -5871,14 +5864,19 @@ static int add_all_stripe_bios(struct r5conf *conf, if (stripe_bio_overlaps(sh, bi, dd_idx, forwrite)) { set_bit(R5_Overlap, &dev->flags); - ret = 0; - continue; + spin_unlock_irq(&sh->stripe_lock); + raid5_release_stripe(sh); + /* release batch_last before wait to avoid risk of deadlock */ + if (ctx->batch_last) { + raid5_release_stripe(ctx->batch_last); + ctx->batch_last = NULL; + } + md_wakeup_thread(conf->mddev->thread); + wait_on_bit(&dev->flags, R5_Overlap, TASK_UNINTERRUPTIBLE); + return 0; } } - if (!ret) - goto out; - for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) { struct r5dev *dev = &sh->dev[dd_idx]; @@ -5894,9 +5892,8 @@ static int add_all_stripe_bios(struct r5conf *conf, RAID5_STRIPE_SHIFT(conf), ctx->sectors_to_do); } -out: spin_unlock_irq(&sh->stripe_lock); - return ret; + return 1; } enum reshape_loc { @@ -5992,17 +5989,17 @@ static enum stripe_result make_stripe_request(struct mddev *mddev, goto out_release; } - if (test_bit(STRIPE_EXPANDING, &sh->state) || - !add_all_stripe_bios(conf, ctx, sh, bi, rw, previous)) { - /* - * Stripe is busy expanding or add failed due to - * overlap. Flush everything and wait a while. - */ + if (test_bit(STRIPE_EXPANDING, &sh->state)) { md_wakeup_thread(mddev->thread); ret = STRIPE_SCHEDULE_AND_RETRY; goto out_release; } + if (!add_all_stripe_bios(conf, ctx, sh, bi, rw, previous)) { + ret = STRIPE_RETRY; + goto out; + } + if (stripe_can_batch(sh)) { stripe_add_to_batch_list(conf, sh, ctx->batch_last); if (ctx->batch_last) @@ -6073,6 +6070,7 @@ static sector_t raid5_bio_lowest_chunk_sector(struct r5conf *conf, static bool raid5_make_request(struct mddev *mddev, struct bio * bi) { DEFINE_WAIT_FUNC(wait, woken_wake_function); + bool on_wq; struct r5conf *conf = mddev->private; sector_t logical_sector; struct stripe_request_ctx ctx = {}; @@ -6146,11 +6144,15 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) * sequential IO pattern. We don't bother with the optimization when * reshaping as the performance benefit is not worth the complexity. */ - if (likely(conf->reshape_progress == MaxSector)) + if (likely(conf->reshape_progress == MaxSector)) { logical_sector = raid5_bio_lowest_chunk_sector(conf, bi); + on_wq = false; + } else { + add_wait_queue(&conf->wait_for_reshape, &wait); + on_wq = true; + } s = (logical_sector - ctx.first_sector) >> RAID5_STRIPE_SHIFT(conf); - add_wait_queue(&conf->wait_for_overlap, &wait); while (1) { res = make_stripe_request(mddev, conf, &ctx, logical_sector, bi); @@ -6161,6 +6163,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) continue; if (res == STRIPE_SCHEDULE_AND_RETRY) { + WARN_ON_ONCE(!on_wq); /* * Must release the reference to batch_last before * scheduling and waiting for work to be done, @@ -6185,7 +6188,8 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) logical_sector = ctx.first_sector + (s << RAID5_STRIPE_SHIFT(conf)); } - remove_wait_queue(&conf->wait_for_overlap, &wait); + if (unlikely(on_wq)) + remove_wait_queue(&conf->wait_for_reshape, &wait); if (ctx.batch_last) raid5_release_stripe(ctx.batch_last); @@ -6338,7 +6342,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk : (safepos < writepos && readpos > writepos)) || time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { /* Cannot proceed until we've updated the superblock... */ - wait_event(conf->wait_for_overlap, + wait_event(conf->wait_for_reshape, atomic_read(&conf->reshape_stripes)==0 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); if (atomic_read(&conf->reshape_stripes) != 0) @@ -6364,7 +6368,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk spin_lock_irq(&conf->device_lock); conf->reshape_safe = mddev->reshape_position; spin_unlock_irq(&conf->device_lock); - wake_up(&conf->wait_for_overlap); + wake_up(&conf->wait_for_reshape); sysfs_notify_dirent_safe(mddev->sysfs_completed); } @@ -6447,7 +6451,7 @@ finish: (sector_nr - mddev->curr_resync_completed) * 2 >= mddev->resync_max - mddev->curr_resync_completed) { /* Cannot proceed until we've updated the superblock... */ - wait_event(conf->wait_for_overlap, + wait_event(conf->wait_for_reshape, atomic_read(&conf->reshape_stripes) == 0 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); if (atomic_read(&conf->reshape_stripes) != 0) @@ -6473,7 +6477,7 @@ finish: spin_lock_irq(&conf->device_lock); conf->reshape_safe = mddev->reshape_position; spin_unlock_irq(&conf->device_lock); - wake_up(&conf->wait_for_overlap); + wake_up(&conf->wait_for_reshape); sysfs_notify_dirent_safe(mddev->sysfs_completed); } ret: @@ -6486,7 +6490,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n struct r5conf *conf = mddev->private; struct stripe_head *sh; sector_t sync_blocks; - int still_degraded = 0; + bool still_degraded = false; int i; if (sector_nr >= max_sector) { @@ -6498,17 +6502,17 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n } if (mddev->curr_resync < max_sector) /* aborted */ - md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync, - &sync_blocks, 1); + mddev->bitmap_ops->end_sync(mddev, mddev->curr_resync, + &sync_blocks); else /* completed sync */ conf->fullsync = 0; - md_bitmap_close_sync(mddev->bitmap); + mddev->bitmap_ops->close_sync(mddev); return 0; } /* Allow raid5_quiesce to complete */ - wait_event(conf->wait_for_overlap, conf->quiesce != 2); + wait_event(conf->wait_for_reshape, conf->quiesce != 2); if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) return reshape_request(mddev, sector_nr, skipped); @@ -6531,7 +6535,8 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n } if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && !conf->fullsync && - !md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && + !mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks, + true) && sync_blocks >= RAID5_STRIPE_SECTORS(conf)) { /* we can skip this block, and probably more */ do_div(sync_blocks, RAID5_STRIPE_SECTORS(conf)); @@ -6540,7 +6545,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n return sync_blocks * RAID5_STRIPE_SECTORS(conf); } - md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, false); + mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, false); sh = raid5_get_active_stripe(conf, NULL, sector_nr, R5_GAS_NOBLOCK); @@ -6559,10 +6564,11 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n struct md_rdev *rdev = conf->disks[i].rdev; if (rdev == NULL || test_bit(Faulty, &rdev->flags)) - still_degraded = 1; + still_degraded = true; } - md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); + mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks, + still_degraded); set_bit(STRIPE_SYNC_REQUESTED, &sh->state); set_bit(STRIPE_HANDLE, &sh->state); @@ -6767,7 +6773,7 @@ static void raid5d(struct md_thread *thread) /* Now is a good time to flush some bitmap updates */ conf->seq_flush++; spin_unlock_irq(&conf->device_lock); - md_bitmap_unplug(mddev->bitmap); + mddev->bitmap_ops->unplug(mddev, true); spin_lock_irq(&conf->device_lock); conf->seq_write = conf->seq_flush; activate_bit_delay(conf, conf->temp_inactive_list); @@ -7492,7 +7498,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) init_waitqueue_head(&conf->wait_for_quiescent); init_waitqueue_head(&conf->wait_for_stripe); - init_waitqueue_head(&conf->wait_for_overlap); + init_waitqueue_head(&conf->wait_for_reshape); INIT_LIST_HEAD(&conf->handle_list); INIT_LIST_HEAD(&conf->loprio_list); INIT_LIST_HEAD(&conf->hold_list); @@ -8312,6 +8318,7 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors) */ sector_t newsize; struct r5conf *conf = mddev->private; + int ret; if (raid5_has_log(conf) || raid5_has_ppl(conf)) return -EINVAL; @@ -8320,11 +8327,11 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors) if (mddev->external_size && mddev->array_sectors > newsize) return -EINVAL; - if (mddev->bitmap) { - int ret = md_bitmap_resize(mddev->bitmap, sectors, 0, 0); - if (ret) - return ret; - } + + ret = mddev->bitmap_ops->resize(mddev, sectors, 0, false); + if (ret) + return ret; + md_set_array_sectors(mddev, newsize); if (sectors > mddev->dev_sectors && mddev->recovery_cp > mddev->dev_sectors) { @@ -8550,7 +8557,7 @@ static void end_reshape(struct r5conf *conf) !test_bit(In_sync, &rdev->flags)) rdev->recovery_offset = MaxSector; spin_unlock_irq(&conf->device_lock); - wake_up(&conf->wait_for_overlap); + wake_up(&conf->wait_for_reshape); mddev_update_io_opt(conf->mddev, conf->raid_disks - conf->max_degraded); @@ -8614,13 +8621,13 @@ static void raid5_quiesce(struct mddev *mddev, int quiesce) conf->quiesce = 1; unlock_all_device_hash_locks_irq(conf); /* allow reshape to continue */ - wake_up(&conf->wait_for_overlap); + wake_up(&conf->wait_for_reshape); } else { /* re-enable writes */ lock_all_device_hash_locks_irq(conf); conf->quiesce = 0; wake_up(&conf->wait_for_quiescent); - wake_up(&conf->wait_for_overlap); + wake_up(&conf->wait_for_reshape); unlock_all_device_hash_locks_irq(conf); } log_quiesce(conf, quiesce); @@ -8939,7 +8946,7 @@ static void raid5_prepare_suspend(struct mddev *mddev) { struct r5conf *conf = mddev->private; - wake_up(&conf->wait_for_overlap); + wake_up(&conf->wait_for_reshape); } static struct md_personality raid6_personality = diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 9b5a7dc3f2a0..896ecfc4afa6 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -668,7 +668,7 @@ struct r5conf { struct llist_head released_stripes; wait_queue_head_t wait_for_quiescent; wait_queue_head_t wait_for_stripe; - wait_queue_head_t wait_for_overlap; + wait_queue_head_t wait_for_reshape; unsigned long cache_state; struct shrinker *shrinker; int pool_size; /* number of disks in stripeheads in pool */ diff --git a/drivers/nvme/common/keyring.c b/drivers/nvme/common/keyring.c index 6f7e7a8fa5ae..ed5167f942d8 100644 --- a/drivers/nvme/common/keyring.c +++ b/drivers/nvme/common/keyring.c @@ -20,6 +20,28 @@ key_serial_t nvme_keyring_id(void) } EXPORT_SYMBOL_GPL(nvme_keyring_id); +static bool nvme_tls_psk_revoked(struct key *psk) +{ + return test_bit(KEY_FLAG_REVOKED, &psk->flags) || + test_bit(KEY_FLAG_INVALIDATED, &psk->flags); +} + +struct key *nvme_tls_key_lookup(key_serial_t key_id) +{ + struct key *key = key_lookup(key_id); + + if (IS_ERR(key)) { + pr_err("key id %08x not found\n", key_id); + return key; + } + if (nvme_tls_psk_revoked(key)) { + pr_err("key id %08x revoked\n", key_id); + return ERR_PTR(-EKEYREVOKED); + } + return key; +} +EXPORT_SYMBOL_GPL(nvme_tls_key_lookup); + static void nvme_tls_psk_describe(const struct key *key, struct seq_file *m) { seq_puts(m, key->description); @@ -36,14 +58,12 @@ static bool nvme_tls_psk_match(const struct key *key, pr_debug("%s: no key description\n", __func__); return false; } - match_len = strlen(key->description); - pr_debug("%s: id %s len %zd\n", __func__, key->description, match_len); - if (!match_data->raw_data) { pr_debug("%s: no match data\n", __func__); return false; } match_id = match_data->raw_data; + match_len = strlen(match_id); pr_debug("%s: match '%s' '%s' len %zd\n", __func__, match_id, key->description, match_len); return !memcmp(key->description, match_id, match_len); @@ -71,7 +91,7 @@ static struct key_type nvme_tls_psk_key_type = { static struct key *nvme_tls_psk_lookup(struct key *keyring, const char *hostnqn, const char *subnqn, - int hmac, bool generated) + u8 hmac, u8 psk_ver, bool generated) { char *identity; size_t identity_len = (NVMF_NQN_SIZE) * 2 + 11; @@ -82,8 +102,8 @@ static struct key *nvme_tls_psk_lookup(struct key *keyring, if (!identity) return ERR_PTR(-ENOMEM); - snprintf(identity, identity_len, "NVMe0%c%02d %s %s", - generated ? 'G' : 'R', hmac, hostnqn, subnqn); + snprintf(identity, identity_len, "NVMe%u%c%02u %s %s", + psk_ver, generated ? 'G' : 'R', hmac, hostnqn, subnqn); if (!keyring) keyring = nvme_keyring; @@ -107,21 +127,38 @@ static struct key *nvme_tls_psk_lookup(struct key *keyring, /* * NVMe PSK priority list * - * 'Retained' PSKs (ie 'generated == false') - * should be preferred to 'generated' PSKs, - * and SHA-384 should be preferred to SHA-256. + * 'Retained' PSKs (ie 'generated == false') should be preferred to 'generated' + * PSKs, PSKs with hash (psk_ver 1) should be preferred to PSKs without hash + * (psk_ver 0), and SHA-384 should be preferred to SHA-256. */ static struct nvme_tls_psk_priority_list { bool generated; + u8 psk_ver; enum nvme_tcp_tls_cipher cipher; } nvme_tls_psk_prio[] = { { .generated = false, + .psk_ver = 1, .cipher = NVME_TCP_TLS_CIPHER_SHA384, }, { .generated = false, + .psk_ver = 1, + .cipher = NVME_TCP_TLS_CIPHER_SHA256, }, + { .generated = false, + .psk_ver = 0, + .cipher = NVME_TCP_TLS_CIPHER_SHA384, }, + { .generated = false, + .psk_ver = 0, .cipher = NVME_TCP_TLS_CIPHER_SHA256, }, { .generated = true, + .psk_ver = 1, .cipher = NVME_TCP_TLS_CIPHER_SHA384, }, { .generated = true, + .psk_ver = 1, + .cipher = NVME_TCP_TLS_CIPHER_SHA256, }, + { .generated = true, + .psk_ver = 0, + .cipher = NVME_TCP_TLS_CIPHER_SHA384, }, + { .generated = true, + .psk_ver = 0, .cipher = NVME_TCP_TLS_CIPHER_SHA256, }, }; @@ -137,10 +174,11 @@ key_serial_t nvme_tls_psk_default(struct key *keyring, for (prio = 0; prio < ARRAY_SIZE(nvme_tls_psk_prio); prio++) { bool generated = nvme_tls_psk_prio[prio].generated; + u8 ver = nvme_tls_psk_prio[prio].psk_ver; enum nvme_tcp_tls_cipher cipher = nvme_tls_psk_prio[prio].cipher; tls_key = nvme_tls_psk_lookup(keyring, hostnqn, subnqn, - cipher, generated); + cipher, ver, generated); if (!IS_ERR(tls_key)) { tls_key_id = tls_key->serial; key_put(tls_key); diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index a3caef75aa0a..486afe598184 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -41,6 +41,7 @@ config NVME_HWMON config NVME_FABRICS select NVME_CORE + select NVME_KEYRING if NVME_TCP_TLS tristate config NVME_RDMA @@ -94,7 +95,6 @@ config NVME_TCP config NVME_TCP_TLS bool "NVMe over Fabrics TCP TLS encryption support" depends on NVME_TCP - select NVME_KEYRING select NET_HANDSHAKE select KEYS help @@ -109,6 +109,7 @@ config NVME_HOST_AUTH bool "NVMe over Fabrics In-Band Authentication in host side" depends on NVME_CORE select NVME_AUTH + select NVME_KEYRING if NVME_TCP_TLS help This provides support for NVMe over Fabrics In-Band Authentication in host side. diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 983909a600ad..ca9959a8fb9e 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4,6 +4,7 @@ * Copyright (c) 2011-2014, Intel Corporation. */ +#include #include #include #include @@ -987,8 +988,8 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->head->lba_shift) - 1); cmnd->rw.reftag = 0; - cmnd->rw.apptag = 0; - cmnd->rw.appmask = 0; + cmnd->rw.lbat = 0; + cmnd->rw.lbatm = 0; if (ns->head->ms) { /* @@ -4040,6 +4041,35 @@ static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid) } } +/** + * struct async_scan_info - keeps track of controller & NSIDs to scan + * @ctrl: Controller on which namespaces are being scanned + * @next_nsid: Index of next NSID to scan in ns_list + * @ns_list: Pointer to list of NSIDs to scan + * + * Note: There is a single async_scan_info structure shared by all instances + * of nvme_scan_ns_async() scanning a given controller, so the atomic + * operations on next_nsid are critical to ensure each instance scans a unique + * NSID. + */ +struct async_scan_info { + struct nvme_ctrl *ctrl; + atomic_t next_nsid; + __le32 *ns_list; +}; + +static void nvme_scan_ns_async(void *data, async_cookie_t cookie) +{ + struct async_scan_info *scan_info = data; + int idx; + u32 nsid; + + idx = (u32)atomic_fetch_inc(&scan_info->next_nsid); + nsid = le32_to_cpu(scan_info->ns_list[idx]); + + nvme_scan_ns(scan_info->ctrl, nsid); +} + static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, unsigned nsid) { @@ -4066,11 +4096,15 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl) __le32 *ns_list; u32 prev = 0; int ret = 0, i; + ASYNC_DOMAIN(domain); + struct async_scan_info scan_info; ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL); if (!ns_list) return -ENOMEM; + scan_info.ctrl = ctrl; + scan_info.ns_list = ns_list; for (;;) { struct nvme_command cmd = { .identify.opcode = nvme_admin_identify, @@ -4086,19 +4120,23 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl) goto free; } + atomic_set(&scan_info.next_nsid, 0); for (i = 0; i < nr_entries; i++) { u32 nsid = le32_to_cpu(ns_list[i]); if (!nsid) /* end of the list? */ goto out; - nvme_scan_ns(ctrl, nsid); + async_schedule_domain(nvme_scan_ns_async, &scan_info, + &domain); while (++prev < nsid) nvme_ns_remove_by_nsid(ctrl, prev); } + async_synchronize_full_domain(&domain); } out: nvme_remove_invalid_namespaces(ctrl, prev); free: + async_synchronize_full_domain(&domain); kfree(ns_list); return ret; } @@ -4568,7 +4606,7 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, set->flags = BLK_MQ_F_SHOULD_MERGE; if (ctrl->ops->flags & NVME_F_BLOCKING) set->flags |= BLK_MQ_F_BLOCKING; - set->cmd_size = cmd_size, + set->cmd_size = cmd_size; set->driver_data = ctrl; set->nr_hw_queues = ctrl->queue_count - 1; set->timeout = NVME_IO_TIMEOUT; @@ -4678,7 +4716,6 @@ static void nvme_free_ctrl(struct device *dev) if (!subsys || ctrl->instance != subsys->instance) ida_free(&nvme_instance_ida, ctrl->instance); - key_put(ctrl->tls_key); nvme_free_cels(ctrl); nvme_mpath_uninit(ctrl); cleanup_srcu_struct(&ctrl->srcu); diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index f5f545fa0103..432efcbf9e2f 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -665,7 +665,7 @@ static struct key *nvmf_parse_key(int key_id) return ERR_PTR(-EINVAL); } - key = key_lookup(key_id); + key = nvme_tls_key_lookup(key_id); if (IS_ERR(key)) pr_err("key id %08x not found\n", key_id); else diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index f1d58e70933f..1d769c842fbf 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -4,6 +4,7 @@ * Copyright (c) 2017-2021 Christoph Hellwig. */ #include +#include #include /* for force_successful_syscall_return */ #include #include @@ -119,9 +120,14 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer, struct request_queue *q = req->q; struct nvme_ns *ns = q->queuedata; struct block_device *bdev = ns ? ns->disk->part0 : NULL; + bool supports_metadata = bdev && blk_get_integrity(bdev->bd_disk); + bool has_metadata = meta_buffer && meta_len; struct bio *bio = NULL; int ret; + if (has_metadata && !supports_metadata) + return -EINVAL; + if (ioucmd && (ioucmd->flags & IORING_URING_CMD_FIXED)) { struct iov_iter iter; @@ -143,15 +149,15 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer, goto out; bio = req->bio; - if (bdev) { + if (bdev) bio_set_dev(bio, bdev); - if (meta_buffer && meta_len) { - ret = bio_integrity_map_user(bio, meta_buffer, meta_len, - meta_seed); - if (ret) - goto out_unmap; - req->cmd_flags |= REQ_INTEGRITY; - } + + if (has_metadata) { + ret = bio_integrity_map_user(bio, meta_buffer, meta_len, + meta_seed); + if (ret) + goto out_unmap; + req->cmd_flags |= REQ_INTEGRITY; } return ret; @@ -260,8 +266,8 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) c.rw.control = cpu_to_le16(io.control); c.rw.dsmgmt = cpu_to_le32(io.dsmgmt); c.rw.reftag = cpu_to_le32(io.reftag); - c.rw.apptag = cpu_to_le16(io.apptag); - c.rw.appmask = cpu_to_le16(io.appmask); + c.rw.lbat = cpu_to_le16(io.apptag); + c.rw.lbatm = cpu_to_le16(io.appmask); return nvme_submit_user_cmd(ns->queue, &c, io.addr, length, metadata, meta_len, lower_32_bits(io.slba), NULL, 0, 0); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index da57947130cc..313a4f978a2c 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -90,6 +90,11 @@ enum nvme_quirks { */ NVME_QUIRK_NO_DEEPEST_PS = (1 << 5), + /* + * Problems seen with concurrent commands + */ + NVME_QUIRK_QDEPTH_ONE = (1 << 6), + /* * Set MEDIUM priority on SQ creation */ @@ -372,7 +377,7 @@ struct nvme_ctrl { struct nvme_dhchap_key *ctrl_key; u16 transaction; #endif - struct key *tls_key; + key_serial_t tls_pskid; /* Power saving configuration */ u64 ps_max_latency_us; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index c0533f3f64cb..7990c3f22ecf 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2563,15 +2563,8 @@ static int nvme_pci_enable(struct nvme_dev *dev) else dev->io_sqes = NVME_NVM_IOSQES; - /* - * Temporary fix for the Apple controller found in the MacBook8,1 and - * some MacBook7,1 to avoid controller resets and data loss. - */ - if (pdev->vendor == PCI_VENDOR_ID_APPLE && pdev->device == 0x2001) { + if (dev->ctrl.quirks & NVME_QUIRK_QDEPTH_ONE) { dev->q_depth = 2; - dev_warn(dev->ctrl.device, "detected Apple NVMe controller, " - "set queue depth=%u to work around controller resets\n", - dev->q_depth); } else if (pdev->vendor == PCI_VENDOR_ID_SAMSUNG && (pdev->device == 0xa821 || pdev->device == 0xa822) && NVME_CAP_MQES(dev->ctrl.cap) == 0) { @@ -3442,6 +3435,8 @@ static const struct pci_device_id nvme_id_table[] = { NVME_QUIRK_BOGUS_NID, }, { PCI_VDEVICE(REDHAT, 0x0010), /* Qemu emulated controller */ .driver_data = NVME_QUIRK_BOGUS_NID, }, + { PCI_DEVICE(0x1217, 0x8760), /* O2 Micro 64GB Steam Deck */ + .driver_data = NVME_QUIRK_QDEPTH_ONE }, { PCI_DEVICE(0x126f, 0x2262), /* Silicon Motion generic */ .driver_data = NVME_QUIRK_NO_DEEPEST_PS | NVME_QUIRK_BOGUS_NID, }, @@ -3576,7 +3571,12 @@ static const struct pci_device_id nvme_id_table[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0xcd02), .driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001), - .driver_data = NVME_QUIRK_SINGLE_VECTOR }, + /* + * Fix for the Apple controller found in the MacBook8,1 and + * some MacBook7,1 to avoid controller resets and data loss. + */ + .driver_data = NVME_QUIRK_SINGLE_VECTOR | + NVME_QUIRK_QDEPTH_ONE }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2005), .driver_data = NVME_QUIRK_SINGLE_VECTOR | diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 2eb33842f971..15b5e06039a5 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1363,8 +1363,8 @@ static void nvme_rdma_set_sig_domain(struct blk_integrity *bi, if (control & NVME_RW_PRINFO_PRCHK_REF) domain->sig.dif.ref_remap = true; - domain->sig.dif.app_tag = le16_to_cpu(cmd->rw.apptag); - domain->sig.dif.apptag_check_mask = le16_to_cpu(cmd->rw.appmask); + domain->sig.dif.app_tag = le16_to_cpu(cmd->rw.lbat); + domain->sig.dif.apptag_check_mask = le16_to_cpu(cmd->rw.lbatm); domain->sig.dif.app_escape = true; if (pi_type == NVME_NS_DPS_PI_TYPE3) domain->sig.dif.ref_escape = true; @@ -1876,6 +1876,8 @@ static int nvme_rdma_route_resolved(struct nvme_rdma_queue *queue) */ priv.hrqsize = cpu_to_le16(queue->queue_size); priv.hsqsize = cpu_to_le16(queue->ctrl->ctrl.sqsize); + /* cntlid should only be set when creating an I/O queue */ + priv.cntlid = cpu_to_le16(ctrl->ctrl.cntlid); } ret = rdma_connect_locked(queue->cm_id, ¶m); diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c index ba05faaac562..eb345551d6fe 100644 --- a/drivers/nvme/host/sysfs.c +++ b/drivers/nvme/host/sysfs.c @@ -664,19 +664,6 @@ static DEVICE_ATTR(dhchap_ctrl_secret, S_IRUGO | S_IWUSR, nvme_ctrl_dhchap_ctrl_secret_show, nvme_ctrl_dhchap_ctrl_secret_store); #endif -#ifdef CONFIG_NVME_TCP_TLS -static ssize_t tls_key_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - - if (!ctrl->tls_key) - return 0; - return sysfs_emit(buf, "%08x", key_serial(ctrl->tls_key)); -} -static DEVICE_ATTR_RO(tls_key); -#endif - static struct attribute *nvme_dev_attrs[] = { &dev_attr_reset_controller.attr, &dev_attr_rescan_controller.attr, @@ -703,9 +690,6 @@ static struct attribute *nvme_dev_attrs[] = { #ifdef CONFIG_NVME_HOST_AUTH &dev_attr_dhchap_secret.attr, &dev_attr_dhchap_ctrl_secret.attr, -#endif -#ifdef CONFIG_NVME_TCP_TLS - &dev_attr_tls_key.attr, #endif &dev_attr_adm_passthru_err_log_enabled.attr, NULL @@ -737,11 +721,6 @@ static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj, if (a == &dev_attr_dhchap_ctrl_secret.attr && !ctrl->opts) return 0; #endif -#ifdef CONFIG_NVME_TCP_TLS - if (a == &dev_attr_tls_key.attr && - (!ctrl->opts || strcmp(ctrl->opts->transport, "tcp"))) - return 0; -#endif return a->mode; } @@ -752,8 +731,77 @@ const struct attribute_group nvme_dev_attrs_group = { }; EXPORT_SYMBOL_GPL(nvme_dev_attrs_group); +#ifdef CONFIG_NVME_TCP_TLS +static ssize_t tls_key_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + if (!ctrl->tls_pskid) + return 0; + return sysfs_emit(buf, "%08x\n", ctrl->tls_pskid); +} +static DEVICE_ATTR_RO(tls_key); + +static ssize_t tls_configured_key_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + struct key *key = ctrl->opts->tls_key; + + return sysfs_emit(buf, "%08x\n", key_serial(key)); +} +static DEVICE_ATTR_RO(tls_configured_key); + +static ssize_t tls_keyring_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + struct key *keyring = ctrl->opts->keyring; + + return sysfs_emit(buf, "%s\n", keyring->description); +} +static DEVICE_ATTR_RO(tls_keyring); + +static struct attribute *nvme_tls_attrs[] = { + &dev_attr_tls_key.attr, + &dev_attr_tls_configured_key.attr, + &dev_attr_tls_keyring.attr, +}; + +static umode_t nvme_tls_attrs_are_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + if (!ctrl->opts || strcmp(ctrl->opts->transport, "tcp")) + return 0; + + if (a == &dev_attr_tls_key.attr && + !ctrl->opts->tls) + return 0; + if (a == &dev_attr_tls_configured_key.attr && + !ctrl->opts->tls_key) + return 0; + if (a == &dev_attr_tls_keyring.attr && + !ctrl->opts->keyring) + return 0; + + return a->mode; +} + +const struct attribute_group nvme_tls_attrs_group = { + .attrs = nvme_tls_attrs, + .is_visible = nvme_tls_attrs_are_visible, +}; +#endif + const struct attribute_group *nvme_dev_attr_groups[] = { &nvme_dev_attrs_group, +#ifdef CONFIG_NVME_TCP_TLS + &nvme_tls_attrs_group, +#endif NULL, }; diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index a2a47d3ab99f..89c44413c593 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -165,6 +165,7 @@ struct nvme_tcp_queue { bool hdr_digest; bool data_digest; + bool tls_enabled; struct ahash_request *rcv_hash; struct ahash_request *snd_hash; __le32 exp_ddgst; @@ -213,7 +214,21 @@ static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue) return queue - queue->ctrl->queues; } -static inline bool nvme_tcp_tls(struct nvme_ctrl *ctrl) +/* + * Check if the queue is TLS encrypted + */ +static inline bool nvme_tcp_queue_tls(struct nvme_tcp_queue *queue) +{ + if (!IS_ENABLED(CONFIG_NVME_TCP_TLS)) + return 0; + + return queue->tls_enabled; +} + +/* + * Check if TLS is configured for the controller. + */ +static inline bool nvme_tcp_tls_configured(struct nvme_ctrl *ctrl) { if (!IS_ENABLED(CONFIG_NVME_TCP_TLS)) return 0; @@ -368,7 +383,7 @@ static inline bool nvme_tcp_queue_has_pending(struct nvme_tcp_queue *queue) static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue) { - return !nvme_tcp_tls(&queue->ctrl->ctrl) && + return !nvme_tcp_queue_tls(queue) && nvme_tcp_queue_has_pending(queue); } @@ -1051,7 +1066,7 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) else msg.msg_flags |= MSG_MORE; - if (!sendpage_ok(page)) + if (!sendpages_ok(page, len, offset)) msg.msg_flags &= ~MSG_SPLICE_PAGES; bvec_set_page(&bvec, page, len, offset); @@ -1427,7 +1442,7 @@ static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue) memset(&msg, 0, sizeof(msg)); iov.iov_base = icresp; iov.iov_len = sizeof(*icresp); - if (nvme_tcp_tls(&queue->ctrl->ctrl)) { + if (nvme_tcp_queue_tls(queue)) { msg.msg_control = cbuf; msg.msg_controllen = sizeof(cbuf); } @@ -1439,7 +1454,7 @@ static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue) goto free_icresp; } ret = -ENOTCONN; - if (nvme_tcp_tls(&queue->ctrl->ctrl)) { + if (nvme_tcp_queue_tls(queue)) { ctype = tls_get_record_type(queue->sock->sk, (struct cmsghdr *)cbuf); if (ctype != TLS_RECORD_TYPE_DATA) { @@ -1581,13 +1596,16 @@ static void nvme_tcp_tls_done(void *data, int status, key_serial_t pskid) goto out_complete; } - tls_key = key_lookup(pskid); + tls_key = nvme_tls_key_lookup(pskid); if (IS_ERR(tls_key)) { dev_warn(ctrl->ctrl.device, "queue %d: Invalid key %x\n", qid, pskid); queue->tls_err = -ENOKEY; } else { - ctrl->ctrl.tls_key = tls_key; + queue->tls_enabled = true; + if (qid == 0) + ctrl->ctrl.tls_pskid = key_serial(tls_key); + key_put(tls_key); queue->tls_err = 0; } @@ -1768,7 +1786,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid, } /* If PSKs are configured try to start TLS */ - if (IS_ENABLED(CONFIG_NVME_TCP_TLS) && pskid) { + if (nvme_tcp_tls_configured(nctrl) && pskid) { ret = nvme_tcp_start_tls(nctrl, queue, pskid); if (ret) goto err_init_connect; @@ -1829,6 +1847,8 @@ static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid) mutex_lock(&queue->queue_lock); if (test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags)) __nvme_tcp_stop_queue(queue); + /* Stopping the queue will disable TLS */ + queue->tls_enabled = false; mutex_unlock(&queue->queue_lock); } @@ -1925,16 +1945,17 @@ static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl) int ret; key_serial_t pskid = 0; - if (nvme_tcp_tls(ctrl)) { + if (nvme_tcp_tls_configured(ctrl)) { if (ctrl->opts->tls_key) pskid = key_serial(ctrl->opts->tls_key); - else + else { pskid = nvme_tls_psk_default(ctrl->opts->keyring, ctrl->opts->host->nqn, ctrl->opts->subsysnqn); - if (!pskid) { - dev_err(ctrl->device, "no valid PSK found\n"); - return -ENOKEY; + if (!pskid) { + dev_err(ctrl->device, "no valid PSK found\n"); + return -ENOKEY; + } } } @@ -1957,13 +1978,14 @@ static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) { int i, ret; - if (nvme_tcp_tls(ctrl) && !ctrl->tls_key) { + if (nvme_tcp_tls_configured(ctrl) && !ctrl->tls_pskid) { dev_err(ctrl->device, "no PSK negotiated\n"); return -ENOKEY; } + for (i = 1; i < ctrl->queue_count; i++) { ret = nvme_tcp_alloc_queue(ctrl, i, - key_serial(ctrl->tls_key)); + ctrl->tls_pskid); if (ret) goto out_free_queues; } @@ -2144,6 +2166,11 @@ static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl, if (remove) nvme_unquiesce_admin_queue(ctrl); nvme_tcp_destroy_admin_queue(ctrl, remove); + if (ctrl->tls_pskid) { + dev_dbg(ctrl->device, "Wipe negotiated TLS_PSK %08x\n", + ctrl->tls_pskid); + ctrl->tls_pskid = 0; + } } static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl, diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 85006b2df8ae..954d4c074770 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -1015,8 +1015,6 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req) if (nvme_is_fabrics(cmd)) return nvmet_parse_fabrics_admin_cmd(req); - if (unlikely(!nvmet_check_auth_status(req))) - return NVME_SC_AUTH_REQUIRED | NVME_STATUS_DNR; if (nvmet_is_disc_subsys(nvmet_req_subsys(req))) return nvmet_parse_discovery_cmd(req); diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c index 8bc3f431c77f..7897d02c681d 100644 --- a/drivers/nvme/target/auth.c +++ b/drivers/nvme/target/auth.c @@ -25,6 +25,18 @@ int nvmet_auth_set_key(struct nvmet_host *host, const char *secret, unsigned char key_hash; char *dhchap_secret; + if (!strlen(secret)) { + if (set_ctrl) { + kfree(host->dhchap_ctrl_secret); + host->dhchap_ctrl_secret = NULL; + host->dhchap_ctrl_key_hash = 0; + } else { + kfree(host->dhchap_secret); + host->dhchap_secret = NULL; + host->dhchap_key_hash = 0; + } + return 0; + } if (sscanf(secret, "DHHC-1:%hhd:%*s", &key_hash) != 1) return -EINVAL; if (key_hash > 3) { diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 1eff8ca6a5f1..1b6264fa5803 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -578,8 +578,8 @@ static void nvmet_rdma_set_sig_domain(struct blk_integrity *bi, if (control & NVME_RW_PRINFO_PRCHK_REF) domain->sig.dif.ref_remap = true; - domain->sig.dif.app_tag = le16_to_cpu(cmd->rw.apptag); - domain->sig.dif.apptag_check_mask = le16_to_cpu(cmd->rw.appmask); + domain->sig.dif.app_tag = le16_to_cpu(cmd->rw.lbat); + domain->sig.dif.apptag_check_mask = le16_to_cpu(cmd->rw.lbatm); domain->sig.dif.app_escape = true; if (pi_type == NVME_NS_DPS_PI_TYPE3) domain->sig.dif.ref_escape = true; diff --git a/fs/affs/affs.h b/fs/affs/affs.h index 2e612834329a..e8c2c4535cb3 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -14,8 +14,6 @@ /* Ugly macros make the code more pretty. */ -#define GET_END_PTR(st,p,sz) ((st *)((char *)(p)+((sz)-sizeof(st)))) -#define AFFS_GET_HASHENTRY(data,hashkey) be32_to_cpu(((struct dir_front *)data)->hashtable[hashkey]) #define AFFS_BLOCK(sb, bh, blk) (AFFS_HEAD(bh)->table[AFFS_SB(sb)->s_hashsize-1-(blk)]) #define AFFS_HEAD(bh) ((struct affs_head *)(bh)->b_data) diff --git a/fs/affs/amigaffs.h b/fs/affs/amigaffs.h index 1b973a669d23..da3217ab6adb 100644 --- a/fs/affs/amigaffs.h +++ b/fs/affs/amigaffs.h @@ -49,12 +49,13 @@ struct affs_short_date { struct affs_root_head { __be32 ptype; + /* The following fields are not used, but kept as documentation. */ __be32 spare1; __be32 spare2; __be32 hash_size; __be32 spare3; __be32 checksum; - __be32 hashtable[1]; + __be32 hashtable[]; }; struct affs_root_tail { diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index a2de5c05f97c..e2f478ecd7fd 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -219,8 +219,8 @@ static void free_pref(struct prelim_ref *ref) * A -1 return indicates ref1 is a 'lower' block than ref2, while 1 * indicates a 'higher' block. */ -static int prelim_ref_compare(struct prelim_ref *ref1, - struct prelim_ref *ref2) +static int prelim_ref_compare(const struct prelim_ref *ref1, + const struct prelim_ref *ref2) { if (ref1->level < ref2->level) return -1; @@ -251,7 +251,7 @@ static int prelim_ref_compare(struct prelim_ref *ref1, } static void update_share_count(struct share_check *sc, int oldcount, - int newcount, struct prelim_ref *newref) + int newcount, const struct prelim_ref *newref) { if ((!sc) || (oldcount == 0 && newcount < 1)) return; diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index b4e31ae17cd9..fec5c6cde0a7 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -53,7 +53,7 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info, /* * Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for - * btrfs, and is used for all I/O submitted through btrfs_submit_bio. + * btrfs, and is used for all I/O submitted through btrfs_submit_bbio(). * * Just like the underlying bio_alloc_bioset it will not fail as it is backed by * a mempool. @@ -73,20 +73,13 @@ struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, struct btrfs_bio *orig_bbio, - u64 map_length, bool use_append) + u64 map_length) { struct btrfs_bio *bbio; struct bio *bio; - if (use_append) { - unsigned int nr_segs; - - bio = bio_split_rw(&orig_bbio->bio, &fs_info->limits, &nr_segs, - &btrfs_clone_bioset, map_length); - } else { - bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT, - GFP_NOFS, &btrfs_clone_bioset); - } + bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT, GFP_NOFS, + &btrfs_clone_bioset); bbio = btrfs_bio(bio); btrfs_bio_init(bbio, fs_info, NULL, orig_bbio); bbio->inode = orig_bbio->inode; @@ -120,12 +113,6 @@ static void __btrfs_bio_end_io(struct btrfs_bio *bbio) } } -void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) -{ - bbio->bio.bi_status = status; - __btrfs_bio_end_io(bbio); -} - static void btrfs_orig_write_end_io(struct bio *bio); static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio, @@ -147,8 +134,9 @@ static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio, } } -static void btrfs_orig_bbio_end_io(struct btrfs_bio *bbio) +void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) { + bbio->bio.bi_status = status; if (bbio->bio.bi_pool == &btrfs_clone_bioset) { struct btrfs_bio *orig_bbio = bbio->private; @@ -179,7 +167,7 @@ static int prev_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) static void btrfs_repair_done(struct btrfs_failed_bio *fbio) { if (atomic_dec_and_test(&fbio->repair_count)) { - btrfs_orig_bbio_end_io(fbio->bbio); + btrfs_bio_end_io(fbio->bbio, fbio->bbio->bio.bi_status); mempool_free(fbio, &btrfs_failed_bio_pool); } } @@ -211,7 +199,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, goto done; } - btrfs_submit_bio(repair_bbio, mirror); + btrfs_submit_bbio(repair_bbio, mirror); return; } @@ -280,7 +268,7 @@ static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio, mirror = next_repair_mirror(fbio, failed_bbio->mirror_num); btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror); - btrfs_submit_bio(repair_bbio, mirror); + btrfs_submit_bbio(repair_bbio, mirror); return fbio; } @@ -326,7 +314,7 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de if (fbio) btrfs_repair_done(fbio); else - btrfs_orig_bbio_end_io(bbio); + btrfs_bio_end_io(bbio, bbio->bio.bi_status); } static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev) @@ -360,7 +348,7 @@ static void btrfs_end_bio_work(struct work_struct *work) if (is_data_bbio(bbio)) btrfs_check_read_bio(bbio, bbio->bio.bi_private); else - btrfs_orig_bbio_end_io(bbio); + btrfs_bio_end_io(bbio, bbio->bio.bi_status); } static void btrfs_simple_end_io(struct bio *bio) @@ -380,7 +368,7 @@ static void btrfs_simple_end_io(struct bio *bio) } else { if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status) btrfs_record_physical_zoned(bbio); - btrfs_orig_bbio_end_io(bbio); + btrfs_bio_end_io(bbio, bbio->bio.bi_status); } } @@ -394,7 +382,7 @@ static void btrfs_raid56_end_io(struct bio *bio) if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) btrfs_check_read_bio(bbio, NULL); else - btrfs_orig_bbio_end_io(bbio); + btrfs_bio_end_io(bbio, bbio->bio.bi_status); btrfs_put_bioc(bioc); } @@ -424,7 +412,7 @@ static void btrfs_orig_write_end_io(struct bio *bio) if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status) stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; - btrfs_orig_bbio_end_io(bbio); + btrfs_bio_end_io(bbio, bbio->bio.bi_status); btrfs_put_bioc(bioc); } @@ -502,8 +490,8 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio); } -static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc, - struct btrfs_io_stripe *smap, int mirror_num) +static void btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc, + struct btrfs_io_stripe *smap, int mirror_num) { if (!bioc) { /* Single mirror read/write fast path. */ @@ -593,7 +581,7 @@ static void run_one_async_done(struct btrfs_work *work, bool do_free) /* If an error occurred we just want to clean up the bio and move on. */ if (bio->bi_status) { - btrfs_orig_bbio_end_io(async->bbio); + btrfs_bio_end_io(async->bbio, async->bbio->bio.bi_status); return; } @@ -603,7 +591,7 @@ static void run_one_async_done(struct btrfs_work *work, bool do_free) * context. This changes nothing when cgroups aren't in use. */ bio->bi_opf |= REQ_BTRFS_CGROUP_PUNT; - __btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num); + btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num); } static bool should_async_write(struct btrfs_bio *bbio) @@ -664,6 +652,19 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio, return true; } +static u64 btrfs_append_map_length(struct btrfs_bio *bbio, u64 map_length) +{ + unsigned int nr_segs; + int sector_offset; + + map_length = min(map_length, bbio->fs_info->max_zone_append_size); + sector_offset = bio_split_rw_at(&bbio->bio, &bbio->fs_info->limits, + &nr_segs, map_length); + if (sector_offset) + return sector_offset << SECTOR_SHIFT; + return map_length; +} + static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) { struct btrfs_inode *inode = bbio->inode; @@ -678,7 +679,10 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) blk_status_t ret; int error; - smap.is_scrub = !bbio->inode; + if (!bbio->inode || btrfs_is_data_reloc_root(inode->root)) + smap.rst_search_commit_root = true; + else + smap.rst_search_commit_root = false; btrfs_bio_counter_inc_blocked(fs_info); error = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, @@ -690,10 +694,10 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) map_length = min(map_length, length); if (use_append) - map_length = min(map_length, fs_info->max_zone_append_size); + map_length = btrfs_append_map_length(bbio, map_length); if (map_length < length) { - bbio = btrfs_split_bio(fs_info, bbio, map_length, use_append); + bbio = btrfs_split_bio(fs_info, bbio, map_length); bio = &bbio->bio; } @@ -749,7 +753,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) } } - __btrfs_submit_bio(bio, bioc, &smap, mirror_num); + btrfs_submit_bio(bio, bioc, &smap, mirror_num); done: return map_length == length; @@ -765,16 +769,14 @@ fail: ASSERT(bbio->bio.bi_pool == &btrfs_clone_bioset); ASSERT(remaining); - remaining->bio.bi_status = ret; - btrfs_orig_bbio_end_io(remaining); + btrfs_bio_end_io(remaining, ret); } - bbio->bio.bi_status = ret; - btrfs_orig_bbio_end_io(bbio); + btrfs_bio_end_io(bbio, ret); /* Do not submit another chunk */ return true; } -void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num) +void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num) { /* If bbio->inode is not populated, its file_offset must be 0. */ ASSERT(bbio->inode || bbio->file_offset == 0); @@ -786,7 +788,7 @@ void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num) /* * Submit a repair write. * - * This bypasses btrfs_submit_bio deliberately, as that writes all copies in a + * This bypasses btrfs_submit_bbio() deliberately, as that writes all copies in a * RAID setup. Here we only want to write the one bad copy, so we do the * mapping ourselves and submit the bio directly. * @@ -875,7 +877,7 @@ void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_ ASSERT(smap.dev == fs_info->dev_replace.srcdev); smap.dev = fs_info->dev_replace.tgtdev; } - __btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num); + btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num); return; fail: diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index d9dd5276093d..e48612340745 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -29,7 +29,7 @@ typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio); /* * Highlevel btrfs I/O structure. It is allocated by btrfs_bio_alloc and - * passed to btrfs_submit_bio for mapping to the physical devices. + * passed to btrfs_submit_bbio() for mapping to the physical devices. */ struct btrfs_bio { /* @@ -42,7 +42,7 @@ struct btrfs_bio { union { /* * For data reads: checksumming and original I/O information. - * (for internal use in the btrfs_submit_bio machinery only) + * (for internal use in the btrfs_submit_bbio() machinery only) */ struct { u8 *csum; @@ -104,7 +104,7 @@ void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status); /* Submit using blkcg_punt_bio_submit. */ #define REQ_BTRFS_CGROUP_PUNT REQ_FS_PRIVATE -void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num); +void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num); void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace); int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, u64 length, u64 logical, struct folio *folio, diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 2e49d978f504..7980b2e33a92 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -23,7 +23,7 @@ #include "extent-tree.h" #ifdef CONFIG_BTRFS_DEBUG -int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group) +int btrfs_should_fragment_free_space(const struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; @@ -40,9 +40,9 @@ int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group) * * Should be called with balance_lock held */ -static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) +static u64 get_restripe_target(const struct btrfs_fs_info *fs_info, u64 flags) { - struct btrfs_balance_control *bctl = fs_info->balance_ctl; + const struct btrfs_balance_control *bctl = fs_info->balance_ctl; u64 target = 0; if (!bctl) @@ -1415,9 +1415,9 @@ out: } static bool clean_pinned_extents(struct btrfs_trans_handle *trans, - struct btrfs_block_group *bg) + const struct btrfs_block_group *bg) { - struct btrfs_fs_info *fs_info = bg->fs_info; + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_transaction *prev_trans = NULL; const u64 start = bg->start; const u64 end = start + bg->length - 1; @@ -1756,14 +1756,14 @@ static int reclaim_bgs_cmp(void *unused, const struct list_head *a, return bg1->used > bg2->used; } -static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info) +static inline bool btrfs_should_reclaim(const struct btrfs_fs_info *fs_info) { if (btrfs_is_zoned(fs_info)) return btrfs_zoned_should_reclaim(fs_info); return true; } -static bool should_reclaim_block_group(struct btrfs_block_group *bg, u64 bytes_freed) +static bool should_reclaim_block_group(const struct btrfs_block_group *bg, u64 bytes_freed) { const int thresh_pct = btrfs_calc_reclaim_threshold(bg->space_info); u64 thresh_bytes = mult_perc(bg->length, thresh_pct); @@ -2006,8 +2006,8 @@ void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg) spin_unlock(&fs_info->unused_bgs_lock); } -static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key, - struct btrfs_path *path) +static int read_bg_from_eb(struct btrfs_fs_info *fs_info, const struct btrfs_key *key, + const struct btrfs_path *path) { struct btrfs_chunk_map *map; struct btrfs_block_group_item bg; @@ -2055,7 +2055,7 @@ out_free_map: static int find_first_block_group(struct btrfs_fs_info *fs_info, struct btrfs_path *path, - struct btrfs_key *key) + const struct btrfs_key *key) { struct btrfs_root *root = btrfs_block_group_root(fs_info); int ret; @@ -2640,8 +2640,8 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans, } static int insert_dev_extent(struct btrfs_trans_handle *trans, - struct btrfs_device *device, u64 chunk_offset, - u64 start, u64 num_bytes) + const struct btrfs_device *device, u64 chunk_offset, + u64 start, u64 num_bytes) { struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_root *root = fs_info->dev_root; @@ -2817,7 +2817,7 @@ next: * For extent tree v2 we use the block_group_item->chunk_offset to point at our * global root id. For v1 it's always set to BTRFS_FIRST_CHUNK_TREE_OBJECTID. */ -static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset) +static u64 calculate_global_root_id(const struct btrfs_fs_info *fs_info, u64 offset) { u64 div = SZ_1G; u64 index; @@ -3842,8 +3842,8 @@ static void force_metadata_allocation(struct btrfs_fs_info *info) } } -static int should_alloc_chunk(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *sinfo, int force) +static int should_alloc_chunk(const struct btrfs_fs_info *fs_info, + const struct btrfs_space_info *sinfo, int force) { u64 bytes_used = btrfs_space_info_used(sinfo, false); u64 thresh; @@ -4218,7 +4218,7 @@ out: return ret; } -static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type) +static u64 get_profile_num_devs(const struct btrfs_fs_info *fs_info, u64 type) { u64 num_dev; @@ -4622,7 +4622,7 @@ int btrfs_use_block_group_size_class(struct btrfs_block_group *bg, return 0; } -bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg) +bool btrfs_block_group_should_use_size_class(const struct btrfs_block_group *bg) { if (btrfs_is_zoned(bg->fs_info)) return false; diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 915111338fc0..36937eeab9b8 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -266,7 +266,7 @@ struct btrfs_block_group { u64 reclaim_mark; }; -static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group) +static inline u64 btrfs_block_group_end(const struct btrfs_block_group *block_group) { return (block_group->start + block_group->length); } @@ -278,8 +278,7 @@ static inline bool btrfs_is_block_group_used(const struct btrfs_block_group *bg) return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0); } -static inline bool btrfs_is_block_group_data_only( - struct btrfs_block_group *block_group) +static inline bool btrfs_is_block_group_data_only(const struct btrfs_block_group *block_group) { /* * In mixed mode the fragmentation is expected to be high, lowering the @@ -290,7 +289,7 @@ static inline bool btrfs_is_block_group_data_only( } #ifdef CONFIG_BTRFS_DEBUG -int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group); +int btrfs_should_fragment_free_space(const struct btrfs_block_group *block_group); #endif struct btrfs_block_group *btrfs_lookup_first_block_group( @@ -370,7 +369,7 @@ static inline u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info) return btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); } -static inline int btrfs_block_group_done(struct btrfs_block_group *cache) +static inline int btrfs_block_group_done(const struct btrfs_block_group *cache) { smp_mb(); return cache->cached == BTRFS_CACHE_FINISHED || @@ -387,6 +386,6 @@ enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size); int btrfs_use_block_group_size_class(struct btrfs_block_group *bg, enum btrfs_block_group_size_class size_class, bool force_wrong_size_class); -bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg); +bool btrfs_block_group_should_use_size_class(const struct btrfs_block_group *bg); #endif /* BTRFS_BLOCK_GROUP_H */ diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index b299b82d676e..a07b9594dc70 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -553,7 +553,7 @@ try_reserve: return ERR_PTR(ret); } -int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, +int btrfs_check_trunc_cache_free_space(const struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv) { u64 needed_bytes; diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index 1f53b967d069..d12b1fac5c74 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -89,7 +89,7 @@ void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info); struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 blocksize); -int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, +int btrfs_check_trunc_cache_free_space(const struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); static inline void btrfs_unuse_block_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 3056c8aed8ef..9a4b7c119318 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -350,10 +350,12 @@ static inline void btrfs_set_first_dir_index_to_log(struct btrfs_inode *inode, WRITE_ONCE(inode->first_dir_index_to_log, index); } -static inline struct btrfs_inode *BTRFS_I(const struct inode *inode) -{ - return container_of(inode, struct btrfs_inode, vfs_inode); -} +/* Type checked and const-preserving VFS inode -> btrfs inode. */ +#define BTRFS_I(_inode) \ + _Generic(_inode, \ + struct inode *: container_of(_inode, struct btrfs_inode, vfs_inode), \ + const struct inode *: (const struct btrfs_inode *)container_of( \ + _inode, const struct btrfs_inode, vfs_inode)) static inline unsigned long btrfs_inode_hash(u64 objectid, const struct btrfs_root *root) @@ -505,6 +507,14 @@ static inline bool btrfs_inode_can_compress(const struct btrfs_inode *inode) return true; } +static inline void btrfs_assert_inode_locked(struct btrfs_inode *inode) +{ + /* Immediately trigger a crash if the inode is not locked. */ + ASSERT(inode_is_locked(&inode->vfs_inode)); + /* Trigger a splat in dmesg if this task is not holding the lock. */ + lockdep_assert_held(&inode->vfs_inode.i_rwsem); +} + /* Array of bytes with variable length, hexadecimal format 0x1234 */ #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes @@ -578,7 +588,7 @@ struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, struct btrfs_path *path); struct inode *btrfs_iget(u64 ino, struct btrfs_root *root); struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, - struct page *page, u64 start, u64 len); + struct folio *folio, u64 start, u64 len); int btrfs_update_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, @@ -596,9 +606,9 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, struct btrfs_trans_handle *trans, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint); -int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, +int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc); -int btrfs_writepage_cow_fixup(struct page *page); +int btrfs_writepage_cow_fixup(struct folio *folio); int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, int compress_type); int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index a8e2c461aff7..90aef2627ca2 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -138,15 +138,15 @@ static int compression_decompress_bio(struct list_head *ws, } static int compression_decompress(int type, struct list_head *ws, - const u8 *data_in, struct page *dest_page, + const u8 *data_in, struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen) { switch (type) { - case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_page, + case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_folio, dest_pgoff, srclen, destlen); - case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, dest_page, + case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, dest_folio, dest_pgoff, srclen, destlen); - case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_page, + case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_folio, dest_pgoff, srclen, destlen); case BTRFS_COMPRESS_NONE: default: @@ -395,7 +395,7 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, cb->bbio.ordered = ordered; btrfs_add_compressed_bio_folios(cb); - btrfs_submit_bio(&cb->bbio, 0); + btrfs_submit_bbio(&cb->bbio, 0); } /* @@ -420,7 +420,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, u64 cur = cb->orig_bbio->file_offset + orig_bio->bi_iter.bi_size; u64 isize = i_size_read(inode); int ret; - struct page *page; + struct folio *folio; struct extent_map *em; struct address_space *mapping = inode->i_mapping; struct extent_map_tree *em_tree; @@ -453,9 +453,13 @@ static noinline int add_ra_bio_pages(struct inode *inode, if (pg_index > end_index) break; - page = xa_load(&mapping->i_pages, pg_index); - if (page && !xa_is_value(page)) { - sectors_missed += (PAGE_SIZE - offset_in_page(cur)) >> + folio = __filemap_get_folio(mapping, pg_index, 0, 0); + if (!IS_ERR(folio)) { + u64 folio_sz = folio_size(folio); + u64 offset = offset_in_folio(folio, cur); + + folio_put(folio); + sectors_missed += (folio_sz - offset) >> fs_info->sectorsize_bits; /* Beyond threshold, no need to continue */ @@ -466,35 +470,35 @@ static noinline int add_ra_bio_pages(struct inode *inode, * Jump to next page start as we already have page for * current offset. */ - cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE; + cur += (folio_sz - offset); continue; } - page = __page_cache_alloc(mapping_gfp_constraint(mapping, - ~__GFP_FS)); - if (!page) + folio = filemap_alloc_folio(mapping_gfp_constraint(mapping, + ~__GFP_FS), 0); + if (!folio) break; - if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) { - put_page(page); + if (filemap_add_folio(mapping, folio, pg_index, GFP_NOFS)) { /* There is already a page, skip to page end */ - cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE; + cur += folio_size(folio); + folio_put(folio); continue; } - if (!*memstall && PageWorkingset(page)) { + if (!*memstall && folio_test_workingset(folio)) { psi_memstall_enter(pflags); *memstall = 1; } - ret = set_page_extent_mapped(page); + ret = set_folio_extent_mapped(folio); if (ret < 0) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); break; } - page_end = (pg_index << PAGE_SHIFT) + PAGE_SIZE - 1; + page_end = (pg_index << PAGE_SHIFT) + folio_size(folio) - 1; lock_extent(tree, cur, page_end, NULL); read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, cur, page_end + 1 - cur); @@ -511,28 +515,28 @@ static noinline int add_ra_bio_pages(struct inode *inode, orig_bio->bi_iter.bi_sector) { free_extent_map(em); unlock_extent(tree, cur, page_end, NULL); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); break; } add_size = min(em->start + em->len, page_end + 1) - cur; free_extent_map(em); + unlock_extent(tree, cur, page_end, NULL); - if (page->index == end_index) { - size_t zero_offset = offset_in_page(isize); + if (folio->index == end_index) { + size_t zero_offset = offset_in_folio(folio, isize); if (zero_offset) { int zeros; - zeros = PAGE_SIZE - zero_offset; - memzero_page(page, zero_offset, zeros); + zeros = folio_size(folio) - zero_offset; + folio_zero_range(folio, zero_offset, zeros); } } - ret = bio_add_page(orig_bio, page, add_size, offset_in_page(cur)); - if (ret != add_size) { - unlock_extent(tree, cur, page_end, NULL); - unlock_page(page); - put_page(page); + if (!bio_add_folio(orig_bio, folio, add_size, + offset_in_folio(folio, cur))) { + folio_unlock(folio); + folio_put(folio); break; } /* @@ -541,9 +545,9 @@ static noinline int add_ra_bio_pages(struct inode *inode, * subpage::readers and to unlock the page. */ if (fs_info->sectorsize < PAGE_SIZE) - btrfs_subpage_start_reader(fs_info, page_folio(page), - cur, add_size); - put_page(page); + btrfs_subpage_start_reader(fs_info, folio, cur, + add_size); + folio_put(folio); cur += add_size; } return 0; @@ -626,7 +630,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) if (memstall) psi_memstall_leave(&pflags); - btrfs_submit_bio(&cb->bbio, 0); + btrfs_submit_bbio(&cb->bbio, 0); return; out_free_compressed_pages: @@ -1057,10 +1061,10 @@ static int btrfs_decompress_bio(struct compressed_bio *cb) * single page, and we want to read a single page out of it. * start_byte tells us the offset into the compressed data we're interested in */ -int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, +int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen) { - struct btrfs_fs_info *fs_info = page_to_fs_info(dest_page); + struct btrfs_fs_info *fs_info = folio_to_fs_info(dest_folio); struct list_head *workspace; const u32 sectorsize = fs_info->sectorsize; int ret; @@ -1073,7 +1077,7 @@ int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, ASSERT(dest_pgoff + destlen <= PAGE_SIZE && destlen <= sectorsize); workspace = get_workspace(type, 0); - ret = compression_decompress(type, workspace, data_in, dest_page, + ret = compression_decompress(type, workspace, data_in, dest_folio, dest_pgoff, srclen, destlen); put_workspace(type, workspace); diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index cfdc64319186..b6563b6a333e 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -82,13 +82,21 @@ static inline unsigned int btrfs_compress_level(unsigned int type_level) return ((type_level & 0xF0) >> 4); } +/* @range_end must be exclusive. */ +static inline u32 btrfs_calc_input_length(u64 range_end, u64 cur) +{ + u64 page_end = round_down(cur, PAGE_SIZE) + PAGE_SIZE; + + return min(range_end, page_end) - cur; +} + int __init btrfs_init_compress(void); void __cold btrfs_exit_compress(void); int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out); -int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, +int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio, unsigned long start_byte, size_t srclen, size_t destlen); int btrfs_decompress_buf2page(const char *buf, u32 buf_len, struct compressed_bio *cb, u32 decompressed); @@ -154,7 +162,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, unsigned long *total_in, unsigned long *total_out); int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int zlib_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long dest_pgoff, size_t srclen, + struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen); struct list_head *zlib_alloc_workspace(unsigned int level); void zlib_free_workspace(struct list_head *ws); @@ -165,7 +173,7 @@ int lzo_compress_folios(struct list_head *ws, struct address_space *mapping, unsigned long *total_in, unsigned long *total_out); int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int lzo_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long dest_pgoff, size_t srclen, + struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen); struct list_head *lzo_alloc_workspace(unsigned int level); void lzo_free_workspace(struct list_head *ws); @@ -175,7 +183,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, unsigned long *total_in, unsigned long *total_out); int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int zstd_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long dest_pgoff, size_t srclen, + struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen); void zstd_init_workspace_manager(void); void zstd_cleanup_workspace_manager(void); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 451203055bbf..0cc919d15b14 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2564,8 +2564,8 @@ int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key, * */ static void fixup_low_keys(struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct btrfs_disk_key *key, int level) + const struct btrfs_path *path, + const struct btrfs_disk_key *key, int level) { int i; struct extent_buffer *t; @@ -2594,7 +2594,7 @@ static void fixup_low_keys(struct btrfs_trans_handle *trans, * that the new key won't break the order */ void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, - struct btrfs_path *path, + const struct btrfs_path *path, const struct btrfs_key *new_key) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -2660,8 +2660,8 @@ void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, * is correct, we only need to bother the last key of @left and the first * key of @right. */ -static bool check_sibling_keys(struct extent_buffer *left, - struct extent_buffer *right) +static bool check_sibling_keys(const struct extent_buffer *left, + const struct extent_buffer *right) { struct btrfs_key left_last; struct btrfs_key right_first; @@ -2928,8 +2928,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, * blocknr is the block the key points to. */ static int insert_ptr(struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct btrfs_disk_key *key, u64 bytenr, + const struct btrfs_path *path, + const struct btrfs_disk_key *key, u64 bytenr, int slot, int level) { struct extent_buffer *lower; @@ -4019,7 +4019,7 @@ int btrfs_split_item(struct btrfs_trans_handle *trans, * the front. */ void btrfs_truncate_item(struct btrfs_trans_handle *trans, - struct btrfs_path *path, u32 new_size, int from_end) + const struct btrfs_path *path, u32 new_size, int from_end) { int slot; struct extent_buffer *leaf; @@ -4111,7 +4111,7 @@ void btrfs_truncate_item(struct btrfs_trans_handle *trans, * make the item pointed to by the path bigger, data_size is the added size. */ void btrfs_extend_item(struct btrfs_trans_handle *trans, - struct btrfs_path *path, u32 data_size) + const struct btrfs_path *path, u32 data_size) { int slot; struct extent_buffer *leaf; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c8568b1a61c4..1a44fb9845e3 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -6,6 +6,7 @@ #ifndef BTRFS_CTREE_H #define BTRFS_CTREE_H +#include "linux/cleanup.h" #include #include #include @@ -84,6 +85,9 @@ struct btrfs_path { unsigned int nowait:1; }; +#define BTRFS_PATH_AUTO_FREE(path_name) \ + struct btrfs_path *path_name __free(btrfs_free_path) = NULL + /* * The state of btrfs root */ @@ -538,7 +542,7 @@ int btrfs_previous_item(struct btrfs_root *root, int btrfs_previous_extent_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid); void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, - struct btrfs_path *path, + const struct btrfs_path *path, const struct btrfs_key *new_key); struct extent_buffer *btrfs_root_node(struct btrfs_root *root); int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, @@ -572,9 +576,9 @@ bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans, int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level, int slot); void btrfs_extend_item(struct btrfs_trans_handle *trans, - struct btrfs_path *path, u32 data_size); + const struct btrfs_path *path, u32 data_size); void btrfs_truncate_item(struct btrfs_trans_handle *trans, - struct btrfs_path *path, u32 new_size, int from_end); + const struct btrfs_path *path, u32 new_size, int from_end); int btrfs_split_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, @@ -598,6 +602,7 @@ int btrfs_search_slot_for_read(struct btrfs_root *root, void btrfs_release_path(struct btrfs_path *p); struct btrfs_path *btrfs_alloc_path(void); void btrfs_free_path(struct btrfs_path *p); +DEFINE_FREE(btrfs_free_path, struct btrfs_path *, btrfs_free_path(_T)) int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int slot, int nr); diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index f6dbda37a361..acf1f39e45d0 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -45,8 +45,8 @@ struct inode_defrag { u32 extent_thresh; }; -static int __compare_inode_defrag(struct inode_defrag *defrag1, - struct inode_defrag *defrag2) +static int compare_inode_defrag(const struct inode_defrag *defrag1, + const struct inode_defrag *defrag2) { if (defrag1->root > defrag2->root) return 1; @@ -61,16 +61,14 @@ static int __compare_inode_defrag(struct inode_defrag *defrag1, } /* - * Pop a record for an inode into the defrag tree. The lock must be held + * Insert a record for an inode into the defrag tree. The lock must be held * already. * * If you're inserting a record for an older transid than an existing record, * the transid already in the tree is lowered. - * - * If an existing record is found the defrag item you pass in is freed. */ -static int __btrfs_add_inode_defrag(struct btrfs_inode *inode, - struct inode_defrag *defrag) +static int btrfs_insert_inode_defrag(struct btrfs_inode *inode, + struct inode_defrag *defrag) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct inode_defrag *entry; @@ -83,7 +81,7 @@ static int __btrfs_add_inode_defrag(struct btrfs_inode *inode, parent = *p; entry = rb_entry(parent, struct inode_defrag, rb_node); - ret = __compare_inode_defrag(defrag, entry); + ret = compare_inode_defrag(defrag, entry); if (ret < 0) p = &parent->rb_left; else if (ret > 0) @@ -107,7 +105,7 @@ static int __btrfs_add_inode_defrag(struct btrfs_inode *inode, return 0; } -static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info) +static inline int need_auto_defrag(struct btrfs_fs_info *fs_info) { if (!btrfs_test_opt(fs_info, AUTO_DEFRAG)) return 0; @@ -119,34 +117,28 @@ static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info) } /* - * Insert a defrag record for this inode if auto defrag is enabled. + * Insert a defrag record for this inode if auto defrag is enabled. No errors + * returned as they're not considered fatal. */ -int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, u32 extent_thresh) +void btrfs_add_inode_defrag(struct btrfs_inode *inode, u32 extent_thresh) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct inode_defrag *defrag; - u64 transid; int ret; - if (!__need_auto_defrag(fs_info)) - return 0; + if (!need_auto_defrag(fs_info)) + return; if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) - return 0; - - if (trans) - transid = trans->transid; - else - transid = btrfs_get_root_last_trans(root); + return; defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS); if (!defrag) - return -ENOMEM; + return; defrag->ino = btrfs_ino(inode); - defrag->transid = transid; + defrag->transid = btrfs_get_root_last_trans(root); defrag->root = btrfs_root_id(root); defrag->extent_thresh = extent_thresh; @@ -157,14 +149,13 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, * and then re-read this inode, this new inode doesn't have * IN_DEFRAG flag. At the case, we may find the existed defrag. */ - ret = __btrfs_add_inode_defrag(inode, defrag); + ret = btrfs_insert_inode_defrag(inode, defrag); if (ret) kmem_cache_free(btrfs_inode_defrag_cachep, defrag); } else { kmem_cache_free(btrfs_inode_defrag_cachep, defrag); } spin_unlock(&fs_info->defrag_inodes_lock); - return 0; } /* @@ -189,7 +180,7 @@ static struct inode_defrag *btrfs_pick_defrag_inode( parent = p; entry = rb_entry(parent, struct inode_defrag, rb_node); - ret = __compare_inode_defrag(&tmp, entry); + ret = compare_inode_defrag(&tmp, entry); if (ret < 0) p = parent->rb_left; else if (ret > 0) @@ -198,7 +189,7 @@ static struct inode_defrag *btrfs_pick_defrag_inode( goto out; } - if (parent && __compare_inode_defrag(&tmp, entry) > 0) { + if (parent && compare_inode_defrag(&tmp, entry) > 0) { parent = rb_next(parent); if (parent) entry = rb_entry(parent, struct inode_defrag, rb_node); @@ -214,27 +205,22 @@ out: void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info) { - struct inode_defrag *defrag; - struct rb_node *node; + struct inode_defrag *defrag, *next; spin_lock(&fs_info->defrag_inodes_lock); - node = rb_first(&fs_info->defrag_inodes); - while (node) { - rb_erase(node, &fs_info->defrag_inodes); - defrag = rb_entry(node, struct inode_defrag, rb_node); + + rbtree_postorder_for_each_entry_safe(defrag, next, + &fs_info->defrag_inodes, rb_node) kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - cond_resched_lock(&fs_info->defrag_inodes_lock); - - node = rb_first(&fs_info->defrag_inodes); - } spin_unlock(&fs_info->defrag_inodes_lock); } #define BTRFS_DEFRAG_BATCH 1024 -static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, - struct inode_defrag *defrag) +static int btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, + struct inode_defrag *defrag, + struct file_ra_state *ra) { struct btrfs_root *inode_root; struct inode *inode; @@ -245,7 +231,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, again: if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) goto cleanup; - if (!__need_auto_defrag(fs_info)) + if (!need_auto_defrag(fs_info)) goto cleanup; /* Get the inode */ @@ -273,9 +259,10 @@ again: range.len = (u64)-1; range.start = cur; range.extent_thresh = defrag->extent_thresh; + file_ra_state_init(ra, inode->i_mapping); sb_start_write(fs_info->sb); - ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid, + ret = btrfs_defrag_file(inode, ra, &range, defrag->transid, BTRFS_DEFRAG_BATCH); sb_end_write(fs_info->sb); iput(inode); @@ -302,11 +289,13 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) atomic_inc(&fs_info->defrag_running); while (1) { + struct file_ra_state ra = { 0 }; + /* Pause the auto defragger. */ if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) break; - if (!__need_auto_defrag(fs_info)) + if (!need_auto_defrag(fs_info)) break; /* find an inode to defrag */ @@ -324,7 +313,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) first_ino = defrag->ino + 1; root_objectid = defrag->root; - __btrfs_run_defrag_inode(fs_info, defrag); + btrfs_run_defrag_inode(fs_info, defrag, &ra); } atomic_dec(&fs_info->defrag_running); @@ -1317,8 +1306,7 @@ static int defrag_one_cluster(struct btrfs_inode *inode, if (entry->start + range_len <= *last_scanned_ret) continue; - if (ra) - page_cache_sync_readahead(inode->vfs_inode.i_mapping, + page_cache_sync_readahead(inode->vfs_inode.i_mapping, ra, NULL, entry->start >> PAGE_SHIFT, ((entry->start + range_len - 1) >> PAGE_SHIFT) - (entry->start >> PAGE_SHIFT) + 1); @@ -1350,7 +1338,7 @@ out: * Entry point to file defragmentation. * * @inode: inode to be defragged - * @ra: readahead state (can be NUL) + * @ra: readahead state * @range: defrag options including range and flags * @newer_than: minimum transid to defrag * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode @@ -1372,12 +1360,13 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, u64 cur; u64 last_byte; bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS); - bool ra_allocated = false; int compress_type = BTRFS_COMPRESS_ZLIB; int ret = 0; u32 extent_thresh = range->extent_thresh; pgoff_t start_index; + ASSERT(ra); + if (isize == 0) return 0; @@ -1406,18 +1395,6 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, cur = round_down(range->start, fs_info->sectorsize); last_byte = round_up(last_byte, fs_info->sectorsize) - 1; - /* - * If we were not given a ra, allocate a readahead context. As - * readahead is just an optimization, defrag will work without it so - * we don't error out. - */ - if (!ra) { - ra_allocated = true; - ra = kzalloc(sizeof(*ra), GFP_KERNEL); - if (ra) - file_ra_state_init(ra, inode->i_mapping); - } - /* * Make writeback start from the beginning of the range, so that the * defrag range can be written sequentially. @@ -1472,8 +1449,6 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, cond_resched(); } - if (ra_allocated) - kfree(ra); /* * Update range.start for autodefrag, this will indicate where to start * in next run. diff --git a/fs/btrfs/defrag.h b/fs/btrfs/defrag.h index 878528e086fb..6b7596c4f0dc 100644 --- a/fs/btrfs/defrag.h +++ b/fs/btrfs/defrag.h @@ -18,8 +18,7 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, u64 newer_than, unsigned long max_to_defrag); int __init btrfs_auto_defrag_init(void); void __cold btrfs_auto_defrag_exit(void); -int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, u32 extent_thresh); +void btrfs_add_inode_defrag(struct btrfs_inode *inode, u32 extent_thresh); int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info); int btrfs_defrag_root(struct btrfs_root *root); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 06a9e0542d70..ad9ef8312e41 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -855,11 +855,17 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, /* Record qgroup extent info if provided */ if (qrecord) { - if (btrfs_qgroup_trace_extent_nolock(trans->fs_info, - delayed_refs, qrecord)) + int ret; + + ret = btrfs_qgroup_trace_extent_nolock(trans->fs_info, + delayed_refs, qrecord); + if (ret) { + /* Clean up if insertion fails or item exists. */ + xa_release(&delayed_refs->dirty_extents, qrecord->bytenr); kfree(qrecord); - else + } else { qrecord_inserted = true; + } } trace_add_delayed_ref_head(trans->fs_info, head_ref, action); @@ -1005,18 +1011,16 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans, return -ENOMEM; head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); - if (!head_ref) { - kmem_cache_free(btrfs_delayed_ref_node_cachep, node); - return -ENOMEM; - } + if (!head_ref) + goto free_node; if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) { record = kzalloc(sizeof(*record), GFP_NOFS); - if (!record) { - kmem_cache_free(btrfs_delayed_ref_node_cachep, node); - kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); - return -ENOMEM; - } + if (!record) + goto free_head_ref; + if (xa_reserve(&trans->transaction->delayed_refs.dirty_extents, + generic_ref->bytenr, GFP_NOFS)) + goto free_record; } init_delayed_ref_common(fs_info, node, generic_ref); @@ -1052,6 +1056,14 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans, if (qrecord_inserted) return btrfs_qgroup_trace_extent_post(trans, record); return 0; + +free_record: + kfree(record); +free_head_ref: + kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); +free_node: + kmem_cache_free(btrfs_delayed_ref_node_cachep, node); + return -ENOMEM; } /* diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 05f634eb472d..085f30968aba 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -202,8 +202,8 @@ struct btrfs_delayed_ref_root { /* head ref rbtree */ struct rb_root_cached href_root; - /* dirty extent records */ - struct rb_root dirty_extent_root; + /* Track dirty extent records. */ + struct xarray dirty_extents; /* this spin lock protects the rbtree and the entries inside */ spinlock_t lock; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index f638c458d285..83d5cdd77f29 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -824,22 +824,45 @@ static void btrfs_dev_replace_update_device_in_mapping_tree( struct btrfs_device *srcdev, struct btrfs_device *tgtdev) { - u64 start = 0; - int i; + struct rb_node *node; + + /* + * The chunk mutex must be held so that no new chunks can be created + * while we are updating existing chunks. This guarantees we don't miss + * any new chunk that gets created for a range that falls before the + * range of the last chunk we processed. + */ + lockdep_assert_held(&fs_info->chunk_mutex); write_lock(&fs_info->mapping_tree_lock); - do { + node = rb_first_cached(&fs_info->mapping_tree); + while (node) { + struct rb_node *next = rb_next(node); struct btrfs_chunk_map *map; + u64 next_start; - map = btrfs_find_chunk_map_nolock(fs_info, start, U64_MAX); - if (!map) - break; - for (i = 0; i < map->num_stripes; i++) + map = rb_entry(node, struct btrfs_chunk_map, rb_node); + next_start = map->start + map->chunk_len; + + for (int i = 0; i < map->num_stripes; i++) if (srcdev == map->stripes[i].dev) map->stripes[i].dev = tgtdev; - start = map->start + map->chunk_len; - btrfs_free_chunk_map(map); - } while (start); + + if (cond_resched_rwlock_write(&fs_info->mapping_tree_lock)) { + map = btrfs_find_chunk_map_nolock(fs_info, next_start, U64_MAX); + if (!map) + break; + node = &map->rb_node; + /* + * Drop the lookup reference since we are holding the + * lock in write mode and no one can remove the chunk + * map from the tree and drop its tree reference. + */ + btrfs_free_chunk_map(map); + } else { + node = next; + } + } write_unlock(&fs_info->mapping_tree_lock); } diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c index 364bce34f034..bd38df5647e3 100644 --- a/fs/btrfs/direct-io.c +++ b/fs/btrfs/direct-io.c @@ -40,11 +40,21 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, struct btrfs_ordered_extent *ordered; int ret = 0; + /* Direct lock must be taken before the extent lock. */ + if (nowait) { + if (!try_lock_dio_extent(io_tree, lockstart, lockend, cached_state)) + return -EAGAIN; + } else { + lock_dio_extent(io_tree, lockstart, lockend, cached_state); + } + while (1) { if (nowait) { if (!try_lock_extent(io_tree, lockstart, lockend, - cached_state)) - return -EAGAIN; + cached_state)) { + ret = -EAGAIN; + break; + } } else { lock_extent(io_tree, lockstart, lockend, cached_state); } @@ -120,6 +130,8 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, cond_resched(); } + if (ret) + unlock_dio_extent(io_tree, lockstart, lockend, cached_state); return ret; } @@ -353,7 +365,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, int ret = 0; u64 len = length; const u64 data_alloc_len = length; - bool unlock_extents = false; + u32 unlock_bits = EXTENT_LOCKED; /* * We could potentially fault if we have a buffer > PAGE_SIZE, and if @@ -514,7 +526,6 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, start, &len, flags); if (ret < 0) goto unlock_err; - unlock_extents = true; /* Recalc len in case the new em is smaller than requested */ len = min(len, em->len - (start - em->start)); if (dio_data->data_space_reserved) { @@ -535,22 +546,8 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, release_offset, release_len); } - } else { - /* - * We need to unlock only the end area that we aren't using. - * The rest is going to be unlocked by the endio routine. - */ - lockstart = start + len; - if (lockstart < lockend) - unlock_extents = true; } - if (unlock_extents) - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, - &cached_state); - else - free_extent_state(cached_state); - /* * Translate extent map information to iomap. * We trim the extents (and move the addr) even though iomap code does @@ -569,11 +566,33 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, iomap->length = len; free_extent_map(em); + /* + * Reads will hold the EXTENT_DIO_LOCKED bit until the io is completed, + * writes only hold it for this part. We hold the extent lock until + * we're completely done with the extent map to make sure it remains + * valid. + */ + if (write) + unlock_bits |= EXTENT_DIO_LOCKED; + + clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, + unlock_bits, &cached_state); + + /* We didn't use everything, unlock the dio extent for the remainder. */ + if (!write && (start + len) < lockend) + unlock_dio_extent(&BTRFS_I(inode)->io_tree, start + len, + lockend, NULL); + return 0; unlock_err: - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, - &cached_state); + /* + * Don't use EXTENT_LOCK_BITS here in case we extend it later and forget + * to update this, be explicit that we expect EXTENT_LOCKED and + * EXTENT_DIO_LOCKED to be set here, and so that's what we're clearing. + */ + clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, + EXTENT_LOCKED | EXTENT_DIO_LOCKED, &cached_state); err: if (dio_data->data_space_reserved) { btrfs_free_reserved_data_space(BTRFS_I(inode), @@ -596,8 +615,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, if (!write && (iomap->type == IOMAP_HOLE)) { /* If reading from a hole, unlock and return */ - unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1, - NULL); + unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos, + pos + length - 1, NULL); return 0; } @@ -608,8 +627,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, btrfs_finish_ordered_extent(dio_data->ordered, NULL, pos, length, false); else - unlock_extent(&BTRFS_I(inode)->io_tree, pos, - pos + length - 1, NULL); + unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos, + pos + length - 1, NULL); ret = -ENOTBLK; } if (write) { @@ -641,8 +660,8 @@ static void btrfs_dio_end_io(struct btrfs_bio *bbio) dip->file_offset, dip->bytes, !bio->bi_status); } else { - unlock_extent(&inode->io_tree, dip->file_offset, - dip->file_offset + dip->bytes - 1, NULL); + unlock_dio_extent(&inode->io_tree, dip->file_offset, + dip->file_offset + dip->bytes - 1, NULL); } bbio->bio.bi_private = bbio->private; @@ -726,7 +745,7 @@ static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio, } } - btrfs_submit_bio(bbio, 0); + btrfs_submit_bbio(bbio, 0); } static const struct iomap_ops btrfs_dio_iomap_ops = { diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index 944a7340f6a4..e815d165cccc 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -68,7 +68,7 @@ static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = { }; static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl, - struct btrfs_block_group *block_group) + const struct btrfs_block_group *block_group) { return &discard_ctl->discard_list[block_group->discard_index]; } @@ -80,7 +80,7 @@ static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl, * * Check if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set. */ -static bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl) +static bool btrfs_run_discard_work(const struct btrfs_discard_ctl *discard_ctl) { struct btrfs_fs_info *fs_info = container_of(discard_ctl, struct btrfs_fs_info, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a6f5441e62d1..25d768e67e37 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -525,7 +525,7 @@ static bool btree_release_folio(struct folio *folio, gfp_t gfp_flags) if (folio_test_writeback(folio) || folio_test_dirty(folio)) return false; - return try_release_extent_buffer(&folio->page); + return try_release_extent_buffer(folio); } static void btree_invalidate_folio(struct folio *folio, size_t offset, @@ -1285,7 +1285,6 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) btrfs_extent_buffer_leak_debug_check(fs_info); kfree(fs_info->super_copy); kfree(fs_info->super_for_commit); - kfree(fs_info->subpage_info); kvfree(fs_info); } @@ -3322,6 +3321,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device fs_info->nodesize = nodesize; fs_info->sectorsize = sectorsize; fs_info->sectorsize_bits = ilog2(sectorsize); + fs_info->sectors_per_page = (PAGE_SIZE >> fs_info->sectorsize_bits); fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size; fs_info->stripesize = stripesize; @@ -3346,20 +3346,10 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device */ fs_info->max_inline = min_t(u64, fs_info->max_inline, fs_info->sectorsize); - if (sectorsize < PAGE_SIZE) { - struct btrfs_subpage_info *subpage_info; - + if (sectorsize < PAGE_SIZE) btrfs_warn(fs_info, "read-write for sector size %u with page size %lu is experimental", sectorsize, PAGE_SIZE); - subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL); - if (!subpage_info) { - ret = -ENOMEM; - goto fail_alloc; - } - btrfs_init_subpage_info(subpage_info, sectorsize); - fs_info->subpage_info = subpage_info; - } ret = btrfs_init_workqueues(fs_info); if (ret) diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index c54c5d7a5cd5..6d08c100b01d 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -126,7 +126,7 @@ void extent_io_tree_init(struct btrfs_fs_info *fs_info, * Empty an io tree, removing and freeing every extent state record from the * tree. This should be called once we are sure no other task can access the * tree anymore, so no tree updates happen after we empty the tree and there - * aren't any waiters on any extent state record (EXTENT_LOCKED bit is never + * aren't any waiters on any extent state record (EXTENT_LOCK_BITS are never * set on any extent state when calling this function). */ void extent_io_tree_release(struct extent_io_tree *tree) @@ -141,7 +141,7 @@ void extent_io_tree_release(struct extent_io_tree *tree) rbtree_postorder_for_each_entry_safe(state, tmp, &root, rb_node) { /* Clear node to keep free_extent_state() happy. */ RB_CLEAR_NODE(&state->rb_node); - ASSERT(!(state->state & EXTENT_LOCKED)); + ASSERT(!(state->state & EXTENT_LOCK_BITS)); /* * No need for a memory barrier here, as we are holding the tree * lock and we only change the waitqueue while holding that lock @@ -399,7 +399,7 @@ static void merge_next_state(struct extent_io_tree *tree, struct extent_state *s */ static void merge_state(struct extent_io_tree *tree, struct extent_state *state) { - if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY)) + if (state->state & (EXTENT_LOCK_BITS | EXTENT_BOUNDARY)) return; merge_prev_state(tree, state); @@ -445,7 +445,7 @@ static struct extent_state *insert_state(struct extent_io_tree *tree, struct rb_node *parent = NULL; const u64 start = state->start - 1; const u64 end = state->end + 1; - const bool try_merge = !(bits & (EXTENT_LOCKED | EXTENT_BOUNDARY)); + const bool try_merge = !(bits & (EXTENT_LOCK_BITS | EXTENT_BOUNDARY)); set_state_bits(tree, state, bits, changeset); @@ -616,9 +616,6 @@ static void set_gfp_mask_from_bits(u32 *bits, gfp_t *mask) * inserting elements in the tree, so the gfp mask is used to indicate which * allocations or sleeping are allowed. * - * Pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove the given - * range from the tree regardless of state (ie for truncate). - * * The range [start, end] is inclusive. * * This takes the tree lock, and returns 0 on success and < 0 on error. @@ -647,8 +644,8 @@ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, if (bits & EXTENT_DELALLOC) bits |= EXTENT_NORESERVE; - wake = (bits & EXTENT_LOCKED) ? 1 : 0; - if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY)) + wake = ((bits & EXTENT_LOCK_BITS) ? 1 : 0); + if (bits & (EXTENT_LOCK_BITS | EXTENT_BOUNDARY)) clear = 1; again: if (!prealloc) { @@ -861,8 +858,7 @@ static void cache_state_if_flags(struct extent_state *state, static void cache_state(struct extent_state *state, struct extent_state **cached_ptr) { - return cache_state_if_flags(state, cached_ptr, - EXTENT_LOCKED | EXTENT_BOUNDARY); + return cache_state_if_flags(state, cached_ptr, EXTENT_LOCK_BITS | EXTENT_BOUNDARY); } /* @@ -1063,7 +1059,7 @@ static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int ret = 0; u64 last_start; u64 last_end; - u32 exclusive_bits = (bits & EXTENT_LOCKED); + u32 exclusive_bits = (bits & EXTENT_LOCK_BITS); gfp_t mask; set_gfp_mask_from_bits(&bits, &mask); @@ -1812,12 +1808,11 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, struct extent_changeset *changeset) { /* - * We don't support EXTENT_LOCKED yet, as current changeset will - * record any bits changed, so for EXTENT_LOCKED case, it will - * either fail with -EEXIST or changeset will record the whole - * range. + * We don't support EXTENT_LOCK_BITS yet, as current changeset will + * record any bits changed, so for EXTENT_LOCK_BITS case, it will either + * fail with -EEXIST or changeset will record the whole range. */ - ASSERT(!(bits & EXTENT_LOCKED)); + ASSERT(!(bits & EXTENT_LOCK_BITS)); return __set_extent_bit(tree, start, end, bits, NULL, NULL, NULL, changeset); } @@ -1826,26 +1821,25 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, struct extent_changeset *changeset) { /* - * Don't support EXTENT_LOCKED case, same reason as + * Don't support EXTENT_LOCK_BITS case, same reason as * set_record_extent_bits(). */ - ASSERT(!(bits & EXTENT_LOCKED)); + ASSERT(!(bits & EXTENT_LOCK_BITS)); return __clear_extent_bit(tree, start, end, bits, NULL, changeset); } -int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached) +bool __try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, + struct extent_state **cached) { int err; u64 failed_start; - err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start, + err = __set_extent_bit(tree, start, end, bits, &failed_start, NULL, cached, NULL); if (err == -EEXIST) { if (failed_start > start) - clear_extent_bit(tree, start, failed_start - 1, - EXTENT_LOCKED, cached); + clear_extent_bit(tree, start, failed_start - 1, bits, cached); return 0; } return 1; @@ -1855,23 +1849,22 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, * Either insert or lock state struct between start and end use mask to tell * us if waiting is desired. */ -int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached_state) +int __lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, + struct extent_state **cached_state) { struct extent_state *failed_state = NULL; int err; u64 failed_start; - err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start, + err = __set_extent_bit(tree, start, end, bits, &failed_start, &failed_state, cached_state, NULL); while (err == -EEXIST) { if (failed_start != start) clear_extent_bit(tree, start, failed_start - 1, - EXTENT_LOCKED, cached_state); + bits, cached_state); - wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED, - &failed_state); - err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, + wait_extent_bit(tree, failed_start, end, bits, &failed_state); + err = __set_extent_bit(tree, start, end, bits, &failed_start, &failed_state, cached_state, NULL); } diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 9d3a52d8f59a..6ffef1cd37c1 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -19,6 +19,7 @@ enum { ENUM_BIT(EXTENT_DIRTY), ENUM_BIT(EXTENT_UPTODATE), ENUM_BIT(EXTENT_LOCKED), + ENUM_BIT(EXTENT_DIO_LOCKED), ENUM_BIT(EXTENT_NEW), ENUM_BIT(EXTENT_DELALLOC), ENUM_BIT(EXTENT_DEFRAG), @@ -67,6 +68,8 @@ enum { EXTENT_ADD_INODE_BYTES | \ EXTENT_CLEAR_ALL_BITS) +#define EXTENT_LOCK_BITS (EXTENT_LOCKED | EXTENT_DIO_LOCKED) + /* * Redefined bits above which are used only in the device allocation tree, * shouldn't be using EXTENT_LOCKED / EXTENT_BOUNDARY / EXTENT_CLEAR_META_RESV @@ -134,12 +137,22 @@ const struct btrfs_fs_info *extent_io_tree_to_fs_info(const struct extent_io_tre void extent_io_tree_init(struct btrfs_fs_info *fs_info, struct extent_io_tree *tree, unsigned int owner); void extent_io_tree_release(struct extent_io_tree *tree); +int __lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, + struct extent_state **cached); +bool __try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, + struct extent_state **cached); -int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached); +static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached) +{ + return __lock_extent(tree, start, end, EXTENT_LOCKED, cached); +} -int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached); +static inline bool try_lock_extent(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) +{ + return __try_lock_extent(tree, start, end, EXTENT_LOCKED, cached); +} int __init extent_state_init_cachep(void); void __cold extent_state_free_cachep(void); @@ -212,5 +225,22 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, u64 *end, u64 max_bytes, struct extent_state **cached_state); +static inline int lock_dio_extent(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) +{ + return __lock_extent(tree, start, end, EXTENT_DIO_LOCKED, cached); +} + +static inline bool try_lock_dio_extent(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) +{ + return __try_lock_extent(tree, start, end, EXTENT_DIO_LOCKED, cached); +} + +static inline int unlock_dio_extent(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) +{ + return __clear_extent_bit(tree, start, end, EXTENT_DIO_LOCKED, cached, NULL); +} #endif /* BTRFS_EXTENT_IO_TREE_H */ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index feec49e6f9c8..a5966324607d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6551,13 +6551,13 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) continue; ret = btrfs_trim_free_extents(device, &group_trimmed); + + trimmed += group_trimmed; if (ret) { dev_failed++; dev_ret = ret; break; } - - trimmed += group_trimmed; } mutex_unlock(&fs_devices->device_list_mutex); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c73cd4f89015..39c9677c47d5 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -101,6 +101,13 @@ struct btrfs_bio_ctrl { blk_opf_t opf; btrfs_bio_end_io_t end_io_func; struct writeback_control *wbc; + + /* + * The sectors of the page which are going to be submitted by + * extent_writepage_io(). + * This is to avoid touching ranges covered by compression/inline. + */ + unsigned long submit_bitmap; }; static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) @@ -117,7 +124,7 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) btrfs_submit_compressed_read(bbio); else - btrfs_submit_bio(bbio, 0); + btrfs_submit_bbio(bbio, 0); /* The bbio is owned by the end_io handler now */ bio_ctrl->bbio = NULL; @@ -164,11 +171,10 @@ void __cold extent_buffer_free_cachep(void) kmem_cache_destroy(extent_buffer_cache); } -static void process_one_page(struct btrfs_fs_info *fs_info, - struct page *page, const struct page *locked_page, - unsigned long page_ops, u64 start, u64 end) +static void process_one_folio(struct btrfs_fs_info *fs_info, + struct folio *folio, const struct folio *locked_folio, + unsigned long page_ops, u64 start, u64 end) { - struct folio *folio = page_folio(page); u32 len; ASSERT(end + 1 - start != 0 && end + 1 - start < U32_MAX); @@ -183,13 +189,13 @@ static void process_one_page(struct btrfs_fs_info *fs_info, if (page_ops & PAGE_END_WRITEBACK) btrfs_folio_clamp_clear_writeback(fs_info, folio, start, len); - if (page != locked_page && (page_ops & PAGE_UNLOCK)) + if (folio != locked_folio && (page_ops & PAGE_UNLOCK)) btrfs_folio_end_writer_lock(fs_info, folio, start, len); } -static void __process_pages_contig(struct address_space *mapping, - const struct page *locked_page, u64 start, u64 end, - unsigned long page_ops) +static void __process_folios_contig(struct address_space *mapping, + const struct folio *locked_folio, u64 start, + u64 end, unsigned long page_ops) { struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host); pgoff_t start_index = start >> PAGE_SHIFT; @@ -207,8 +213,8 @@ static void __process_pages_contig(struct address_space *mapping, for (i = 0; i < found_folios; i++) { struct folio *folio = fbatch.folios[i]; - process_one_page(fs_info, &folio->page, locked_page, - page_ops, start, end); + process_one_folio(fs_info, folio, locked_folio, + page_ops, start, end); } folio_batch_release(&fbatch); cond_resched(); @@ -216,24 +222,23 @@ static void __process_pages_contig(struct address_space *mapping, } static noinline void __unlock_for_delalloc(const struct inode *inode, - const struct page *locked_page, + const struct folio *locked_folio, u64 start, u64 end) { unsigned long index = start >> PAGE_SHIFT; unsigned long end_index = end >> PAGE_SHIFT; - ASSERT(locked_page); - if (index == locked_page->index && end_index == index) + ASSERT(locked_folio); + if (index == locked_folio->index && end_index == index) return; - __process_pages_contig(inode->i_mapping, locked_page, start, end, - PAGE_UNLOCK); + __process_folios_contig(inode->i_mapping, locked_folio, start, end, + PAGE_UNLOCK); } -static noinline int lock_delalloc_pages(struct inode *inode, - const struct page *locked_page, - u64 start, - u64 end) +static noinline int lock_delalloc_folios(struct inode *inode, + const struct folio *locked_folio, + u64 start, u64 end) { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct address_space *mapping = inode->i_mapping; @@ -243,7 +248,7 @@ static noinline int lock_delalloc_pages(struct inode *inode, u64 processed_end = start; struct folio_batch fbatch; - if (index == locked_page->index && index == end_index) + if (index == locked_folio->index && index == end_index) return 0; folio_batch_init(&fbatch); @@ -257,23 +262,22 @@ static noinline int lock_delalloc_pages(struct inode *inode, for (i = 0; i < found_folios; i++) { struct folio *folio = fbatch.folios[i]; - struct page *page = folio_page(folio, 0); u32 len = end + 1 - start; - if (page == locked_page) + if (folio == locked_folio) continue; if (btrfs_folio_start_writer_lock(fs_info, folio, start, len)) goto out; - if (!PageDirty(page) || page->mapping != mapping) { + if (!folio_test_dirty(folio) || folio->mapping != mapping) { btrfs_folio_end_writer_lock(fs_info, folio, start, len); goto out; } - processed_end = page_offset(page) + PAGE_SIZE - 1; + processed_end = folio_pos(folio) + folio_size(folio) - 1; } folio_batch_release(&fbatch); cond_resched(); @@ -283,7 +287,8 @@ static noinline int lock_delalloc_pages(struct inode *inode, out: folio_batch_release(&fbatch); if (processed_end > start) - __unlock_for_delalloc(inode, locked_page, start, processed_end); + __unlock_for_delalloc(inode, locked_folio, start, + processed_end); return -EAGAIN; } @@ -304,8 +309,8 @@ out: */ EXPORT_FOR_TESTS noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, - struct page *locked_page, u64 *start, - u64 *end) + struct folio *locked_folio, + u64 *start, u64 *end) { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; @@ -323,9 +328,9 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, /* Caller should pass a valid @end to indicate the search range end */ ASSERT(orig_end > orig_start); - /* The range should at least cover part of the page */ - ASSERT(!(orig_start >= page_offset(locked_page) + PAGE_SIZE || - orig_end <= page_offset(locked_page))); + /* The range should at least cover part of the folio */ + ASSERT(!(orig_start >= folio_pos(locked_folio) + folio_size(locked_folio) || + orig_end <= folio_pos(locked_folio))); again: /* step one, find a bunch of delalloc bytes starting at start */ delalloc_start = *start; @@ -342,25 +347,25 @@ again: } /* - * start comes from the offset of locked_page. We have to lock - * pages in order, so we can't process delalloc bytes before - * locked_page + * start comes from the offset of locked_folio. We have to lock + * folios in order, so we can't process delalloc bytes before + * locked_folio */ if (delalloc_start < *start) delalloc_start = *start; /* - * make sure to limit the number of pages we try to lock down + * make sure to limit the number of folios we try to lock down */ if (delalloc_end + 1 - delalloc_start > max_bytes) delalloc_end = delalloc_start + max_bytes - 1; - /* step two, lock all the pages after the page that has start */ - ret = lock_delalloc_pages(inode, locked_page, - delalloc_start, delalloc_end); + /* step two, lock all the folioss after the folios that has start */ + ret = lock_delalloc_folios(inode, locked_folio, delalloc_start, + delalloc_end); ASSERT(!ret || ret == -EAGAIN); if (ret == -EAGAIN) { - /* some of the pages are gone, lets avoid looping by + /* some of the folios are gone, lets avoid looping by * shortening the size of the delalloc range we're searching */ free_extent_state(cached_state); @@ -384,8 +389,8 @@ again: unlock_extent(tree, delalloc_start, delalloc_end, &cached_state); if (!ret) { - __unlock_for_delalloc(inode, locked_page, - delalloc_start, delalloc_end); + __unlock_for_delalloc(inode, locked_folio, delalloc_start, + delalloc_end); cond_resched(); goto again; } @@ -396,40 +401,41 @@ out_failed: } void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, - const struct page *locked_page, + const struct folio *locked_folio, struct extent_state **cached, u32 clear_bits, unsigned long page_ops) { clear_extent_bit(&inode->io_tree, start, end, clear_bits, cached); - __process_pages_contig(inode->vfs_inode.i_mapping, locked_page, - start, end, page_ops); + __process_folios_contig(inode->vfs_inode.i_mapping, locked_folio, start, + end, page_ops); } -static bool btrfs_verify_page(struct page *page, u64 start) +static bool btrfs_verify_folio(struct folio *folio, u64 start, u32 len) { - if (!fsverity_active(page->mapping->host) || - PageUptodate(page) || - start >= i_size_read(page->mapping->host)) + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); + + if (!fsverity_active(folio->mapping->host) || + btrfs_folio_test_uptodate(fs_info, folio, start, len) || + start >= i_size_read(folio->mapping->host)) return true; - return fsverity_verify_page(page); + return fsverity_verify_folio(folio); } -static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) +static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 len) { - struct btrfs_fs_info *fs_info = page_to_fs_info(page); - struct folio *folio = page_folio(page); + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); - ASSERT(page_offset(page) <= start && - start + len <= page_offset(page) + PAGE_SIZE); + ASSERT(folio_pos(folio) <= start && + start + len <= folio_pos(folio) + PAGE_SIZE); - if (uptodate && btrfs_verify_page(page, start)) + if (uptodate && btrfs_verify_folio(folio, start, len)) btrfs_folio_set_uptodate(fs_info, folio, start, len); else btrfs_folio_clear_uptodate(fs_info, folio, start, len); - if (!btrfs_is_subpage(fs_info, page->mapping)) - unlock_page(page); + if (!btrfs_is_subpage(fs_info, folio->mapping)) + folio_unlock(folio); else btrfs_subpage_end_reader(fs_info, folio, start, len); } @@ -471,8 +477,8 @@ static void end_bbio_data_write(struct btrfs_bio *bbio) "incomplete page write with offset %zu and length %zu", fi.offset, fi.length); - btrfs_finish_ordered_extent(bbio->ordered, - folio_page(folio, 0), start, len, !error); + btrfs_finish_ordered_extent(bbio->ordered, folio, start, len, + !error); if (error) mapping_set_error(folio->mapping, error); btrfs_folio_clear_writeback(fs_info, folio, start, len); @@ -481,85 +487,14 @@ static void end_bbio_data_write(struct btrfs_bio *bbio) bio_put(bio); } -/* - * Record previously processed extent range - * - * For endio_readpage_release_extent() to handle a full extent range, reducing - * the extent io operations. - */ -struct processed_extent { - struct btrfs_inode *inode; - /* Start of the range in @inode */ - u64 start; - /* End of the range in @inode */ - u64 end; - bool uptodate; -}; - -/* - * Try to release processed extent range - * - * May not release the extent range right now if the current range is - * contiguous to processed extent. - * - * Will release processed extent when any of @inode, @uptodate, the range is - * no longer contiguous to the processed range. - * - * Passing @inode == NULL will force processed extent to be released. - */ -static void endio_readpage_release_extent(struct processed_extent *processed, - struct btrfs_inode *inode, u64 start, u64 end, - bool uptodate) +static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio) { - struct extent_state *cached = NULL; - struct extent_io_tree *tree; - - /* The first extent, initialize @processed */ - if (!processed->inode) - goto update; - - /* - * Contiguous to processed extent, just uptodate the end. - * - * Several things to notice: - * - * - bio can be merged as long as on-disk bytenr is contiguous - * This means we can have page belonging to other inodes, thus need to - * check if the inode still matches. - * - bvec can contain range beyond current page for multi-page bvec - * Thus we need to do processed->end + 1 >= start check - */ - if (processed->inode == inode && processed->uptodate == uptodate && - processed->end + 1 >= start && end >= processed->end) { - processed->end = end; - return; - } - - tree = &processed->inode->io_tree; - /* - * Now we don't have range contiguous to the processed range, release - * the processed range now. - */ - unlock_extent(tree, processed->start, processed->end, &cached); - -update: - /* Update processed to current range */ - processed->inode = inode; - processed->start = start; - processed->end = end; - processed->uptodate = uptodate; -} - -static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page) -{ - struct folio *folio = page_folio(page); - ASSERT(folio_test_locked(folio)); if (!btrfs_is_subpage(fs_info, folio->mapping)) return; ASSERT(folio_test_private(folio)); - btrfs_subpage_start_reader(fs_info, folio, page_offset(page), PAGE_SIZE); + btrfs_subpage_start_reader(fs_info, folio, folio_pos(folio), PAGE_SIZE); } /* @@ -578,7 +513,6 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) { struct btrfs_fs_info *fs_info = bbio->fs_info; struct bio *bio = &bbio->bio; - struct processed_extent processed = { 0 }; struct folio_iter fi; const u32 sectorsize = fs_info->sectorsize; @@ -642,12 +576,8 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) } /* Update page status and unlock. */ - end_page_read(folio_page(folio, 0), uptodate, start, len); - endio_readpage_release_extent(&processed, BTRFS_I(inode), - start, end, uptodate); + end_folio_read(folio, uptodate, start, len); } - /* Release the last extent */ - endio_readpage_release_extent(&processed, NULL, 0, 0, false); bio_put(bio); } @@ -737,12 +667,13 @@ static int alloc_eb_folio_array(struct extent_buffer *eb, bool nofail) } static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl, - struct page *page, u64 disk_bytenr, + struct folio *folio, u64 disk_bytenr, unsigned int pg_offset) { struct bio *bio = &bio_ctrl->bbio->bio; struct bio_vec *bvec = bio_last_bvec_all(bio); const sector_t sector = disk_bytenr >> SECTOR_SHIFT; + struct folio *bv_folio = page_folio(bvec->bv_page); if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) { /* @@ -755,7 +686,7 @@ static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl, /* * The contig check requires the following conditions to be met: * - * 1) The pages are belonging to the same inode + * 1) The folios are belonging to the same inode * This is implied by the call chain. * * 2) The range has adjacent logical bytenr @@ -764,8 +695,8 @@ static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl, * This is required for the usage of btrfs_bio->file_offset. */ return bio_end_sector(bio) == sector && - page_offset(bvec->bv_page) + bvec->bv_offset + bvec->bv_len == - page_offset(page) + pg_offset; + folio_pos(bv_folio) + bvec->bv_offset + bvec->bv_len == + folio_pos(folio) + pg_offset; } static void alloc_new_bio(struct btrfs_inode *inode, @@ -818,17 +749,17 @@ static void alloc_new_bio(struct btrfs_inode *inode, * The mirror number for this IO should already be initizlied in * @bio_ctrl->mirror_num. */ -static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl, - u64 disk_bytenr, struct page *page, +static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl, + u64 disk_bytenr, struct folio *folio, size_t size, unsigned long pg_offset) { - struct btrfs_inode *inode = page_to_inode(page); + struct btrfs_inode *inode = folio_to_inode(folio); ASSERT(pg_offset + size <= PAGE_SIZE); ASSERT(bio_ctrl->end_io_func); if (bio_ctrl->bbio && - !btrfs_bio_is_contig(bio_ctrl, page, disk_bytenr, pg_offset)) + !btrfs_bio_is_contig(bio_ctrl, folio, disk_bytenr, pg_offset)) submit_one_bio(bio_ctrl); do { @@ -837,7 +768,7 @@ static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl, /* Allocate new bio if needed */ if (!bio_ctrl->bbio) { alloc_new_bio(inode, bio_ctrl, disk_bytenr, - page_offset(page) + pg_offset); + folio_pos(folio) + pg_offset); } /* Cap to the current ordered extent boundary if there is one. */ @@ -847,21 +778,22 @@ static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl, len = bio_ctrl->len_to_oe_boundary; } - if (bio_add_page(&bio_ctrl->bbio->bio, page, len, pg_offset) != len) { + if (!bio_add_folio(&bio_ctrl->bbio->bio, folio, len, pg_offset)) { /* bio full: move on to a new one */ submit_one_bio(bio_ctrl); continue; } if (bio_ctrl->wbc) - wbc_account_cgroup_owner(bio_ctrl->wbc, page, len); + wbc_account_cgroup_owner(bio_ctrl->wbc, &folio->page, + len); size -= len; pg_offset += len; disk_bytenr += len; /* - * len_to_oe_boundary defaults to U32_MAX, which isn't page or + * len_to_oe_boundary defaults to U32_MAX, which isn't folio or * sector aligned. alloc_new_bio() then sets it to the end of * our ordered extent for writes into zoned devices. * @@ -871,15 +803,15 @@ static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl, * boundary is correct. * * When len_to_oe_boundary is U32_MAX, the cap above would - * result in a 4095 byte IO for the last page right before - * we hit the bio limit of UINT_MAX. bio_add_page() has all + * result in a 4095 byte IO for the last folio right before + * we hit the bio limit of UINT_MAX. bio_add_folio() has all * the checks required to make sure we don't overflow the bio, * and we should just ignore len_to_oe_boundary completely * unless we're using it to track an ordered extent. * * It's pretty hard to make a bio sized U32_MAX, but it can * happen when the page cache is able to feed us contiguous - * pages for large extents. + * folios for large extents. */ if (bio_ctrl->len_to_oe_boundary != U32_MAX) bio_ctrl->len_to_oe_boundary -= len; @@ -952,27 +884,28 @@ int set_folio_extent_mapped(struct folio *folio) return 0; } -void clear_page_extent_mapped(struct page *page) +void clear_folio_extent_mapped(struct folio *folio) { - struct folio *folio = page_folio(page); struct btrfs_fs_info *fs_info; - ASSERT(page->mapping); + ASSERT(folio->mapping); if (!folio_test_private(folio)) return; - fs_info = page_to_fs_info(page); - if (btrfs_is_subpage(fs_info, page->mapping)) + fs_info = folio_to_fs_info(folio); + if (btrfs_is_subpage(fs_info, folio->mapping)) return btrfs_detach_subpage(fs_info, folio); folio_detach_private(folio); } -static struct extent_map *__get_extent_map(struct inode *inode, struct page *page, - u64 start, u64 len, struct extent_map **em_cached) +static struct extent_map *__get_extent_map(struct inode *inode, + struct folio *folio, u64 start, + u64 len, struct extent_map **em_cached) { struct extent_map *em; + struct extent_state *cached_state = NULL; ASSERT(em_cached); @@ -988,12 +921,15 @@ static struct extent_map *__get_extent_map(struct inode *inode, struct page *pag *em_cached = NULL; } - em = btrfs_get_extent(BTRFS_I(inode), page, start, len); + btrfs_lock_and_flush_ordered_range(BTRFS_I(inode), start, start + len - 1, &cached_state); + em = btrfs_get_extent(BTRFS_I(inode), folio, start, len); if (!IS_ERR(em)) { BUG_ON(*em_cached); refcount_inc(&em->refs); *em_cached = em; } + unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len - 1, &cached_state); + return em; } /* @@ -1003,12 +939,12 @@ static struct extent_map *__get_extent_map(struct inode *inode, struct page *pag * XXX JDM: This needs looking at to ensure proper page locking * return 0 on success, otherwise return error */ -static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, +static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, struct btrfs_bio_ctrl *bio_ctrl, u64 *prev_em_start) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - u64 start = page_offset(page); + u64 start = folio_pos(folio); const u64 end = start + PAGE_SIZE - 1; u64 cur = start; u64 extent_offset; @@ -1019,25 +955,23 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, size_t pg_offset = 0; size_t iosize; size_t blocksize = fs_info->sectorsize; - struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; - ret = set_page_extent_mapped(page); + ret = set_folio_extent_mapped(folio); if (ret < 0) { - unlock_extent(tree, start, end, NULL); - unlock_page(page); + folio_unlock(folio); return ret; } - if (page->index == last_byte >> PAGE_SHIFT) { - size_t zero_offset = offset_in_page(last_byte); + if (folio->index == last_byte >> folio_shift(folio)) { + size_t zero_offset = offset_in_folio(folio, last_byte); if (zero_offset) { - iosize = PAGE_SIZE - zero_offset; - memzero_page(page, zero_offset, iosize); + iosize = folio_size(folio) - zero_offset; + folio_zero_range(folio, zero_offset, iosize); } } bio_ctrl->end_io_func = end_bbio_data_read; - begin_page_read(fs_info, page); + begin_folio_read(fs_info, folio); while (cur <= end) { enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE; bool force_bio_submit = false; @@ -1045,16 +979,15 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, ASSERT(IS_ALIGNED(cur, fs_info->sectorsize)); if (cur >= last_byte) { - iosize = PAGE_SIZE - pg_offset; - memzero_page(page, pg_offset, iosize); - unlock_extent(tree, cur, cur + iosize - 1, NULL); - end_page_read(page, true, cur, iosize); + iosize = folio_size(folio) - pg_offset; + folio_zero_range(folio, pg_offset, iosize); + end_folio_read(folio, true, cur, iosize); break; } - em = __get_extent_map(inode, page, cur, end - cur + 1, em_cached); + em = __get_extent_map(inode, folio, cur, end - cur + 1, + em_cached); if (IS_ERR(em)) { - unlock_extent(tree, cur, end, NULL); - end_page_read(page, false, cur, end + 1 - cur); + end_folio_read(folio, false, cur, end + 1 - cur); return PTR_ERR(em); } extent_offset = cur - em->start; @@ -1079,8 +1012,8 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, * to the same compressed extent (possibly with a different * offset and/or length, so it either points to the whole extent * or only part of it), we must make sure we do not submit a - * single bio to populate the pages for the 2 ranges because - * this makes the compressed extent read zero out the pages + * single bio to populate the folios for the 2 ranges because + * this makes the compressed extent read zero out the folios * belonging to the 2nd range. Imagine the following scenario: * * File layout @@ -1093,13 +1026,13 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, * [extent X, compressed length = 4K uncompressed length = 16K] * * If the bio to read the compressed extent covers both ranges, - * it will decompress extent X into the pages belonging to the + * it will decompress extent X into the folios belonging to the * first range and then it will stop, zeroing out the remaining - * pages that belong to the other range that points to extent X. + * folios that belong to the other range that points to extent X. * So here we make sure we submit 2 bios, one for the first * range and another one for the third range. Both will target * the same physical extent from disk, but we can't currently - * make the compressed bio endio callback populate the pages + * make the compressed bio endio callback populate the folios * for both ranges because each compressed bio is tightly * coupled with a single extent map, and each range can have * an extent map with a different offset value relative to the @@ -1120,18 +1053,16 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, /* we've found a hole, just zero and go on */ if (block_start == EXTENT_MAP_HOLE) { - memzero_page(page, pg_offset, iosize); + folio_zero_range(folio, pg_offset, iosize); - unlock_extent(tree, cur, cur + iosize - 1, NULL); - end_page_read(page, true, cur, iosize); + end_folio_read(folio, true, cur, iosize); cur = cur + iosize; pg_offset += iosize; continue; } - /* the get_extent function already copied into the page */ + /* the get_extent function already copied into the folio */ if (block_start == EXTENT_MAP_INLINE) { - unlock_extent(tree, cur, cur + iosize - 1, NULL); - end_page_read(page, true, cur, iosize); + end_folio_read(folio, true, cur, iosize); cur = cur + iosize; pg_offset += iosize; continue; @@ -1144,8 +1075,8 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, if (force_bio_submit) submit_one_bio(bio_ctrl); - submit_extent_page(bio_ctrl, disk_bytenr, page, iosize, - pg_offset); + submit_extent_folio(bio_ctrl, disk_bytenr, folio, iosize, + pg_offset); cur = cur + iosize; pg_offset += iosize; } @@ -1155,17 +1086,11 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, int btrfs_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; - struct btrfs_inode *inode = page_to_inode(page); - u64 start = page_offset(page); - u64 end = start + PAGE_SIZE - 1; struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ }; struct extent_map *em_cached = NULL; int ret; - btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); - - ret = btrfs_do_readpage(page, &em_cached, &bio_ctrl, NULL); + ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL); free_extent_map(em_cached); /* @@ -1176,28 +1101,8 @@ int btrfs_read_folio(struct file *file, struct folio *folio) return ret; } -static inline void contiguous_readpages(struct page *pages[], int nr_pages, - u64 start, u64 end, - struct extent_map **em_cached, - struct btrfs_bio_ctrl *bio_ctrl, - u64 *prev_em_start) -{ - struct btrfs_inode *inode = page_to_inode(pages[0]); - int index; - - ASSERT(em_cached); - - btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); - - for (index = 0; index < nr_pages; index++) { - btrfs_do_readpage(pages[index], em_cached, bio_ctrl, - prev_em_start); - put_page(pages[index]); - } -} - /* - * helper for __extent_writepage, doing all of the delayed allocation setup. + * helper for extent_writepage(), doing all of the delayed allocation setup. * * This returns 1 if btrfs_run_delalloc_range function did all the work required * to write the page (copy into inline extent). In this case the IO has @@ -1207,13 +1112,14 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages, * This returns < 0 if there were errors (page still locked) */ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, - struct page *page, struct writeback_control *wbc) + struct folio *folio, + struct btrfs_bio_ctrl *bio_ctrl) { struct btrfs_fs_info *fs_info = inode_to_fs_info(&inode->vfs_inode); - struct folio *folio = page_folio(page); - const bool is_subpage = btrfs_is_subpage(fs_info, page->mapping); - const u64 page_start = page_offset(page); - const u64 page_end = page_start + PAGE_SIZE - 1; + struct writeback_control *wbc = bio_ctrl->wbc; + const bool is_subpage = btrfs_is_subpage(fs_info, folio->mapping); + const u64 page_start = folio_pos(folio); + const u64 page_end = page_start + folio_size(folio) - 1; /* * Save the last found delalloc end. As the delalloc end can go beyond * page boundary, thus we cannot rely on subpage bitmap to locate the @@ -1225,10 +1131,18 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, u64 delalloc_to_write = 0; int ret = 0; - /* Lock all (subpage) delalloc ranges inside the page first. */ + /* Save the dirty bitmap as our submission bitmap will be a subset of it. */ + if (btrfs_is_subpage(fs_info, inode->vfs_inode.i_mapping)) { + ASSERT(fs_info->sectors_per_page > 1); + btrfs_get_subpage_dirty_bitmap(fs_info, folio, &bio_ctrl->submit_bitmap); + } else { + bio_ctrl->submit_bitmap = 1; + } + + /* Lock all (subpage) delalloc ranges inside the folio first. */ while (delalloc_start < page_end) { delalloc_end = page_end; - if (!find_lock_delalloc_range(&inode->vfs_inode, page, + if (!find_lock_delalloc_range(&inode->vfs_inode, folio, &delalloc_start, &delalloc_end)) { delalloc_start = delalloc_end + 1; continue; @@ -1253,7 +1167,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, if (!is_subpage) { /* * For non-subpage case, the found delalloc range must - * cover this page and there must be only one locked + * cover this folio and there must be only one locked * delalloc range. */ found_start = page_start; @@ -1267,7 +1181,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, break; /* * The subpage range covers the last sector, the delalloc range may - * end beyond the page boundary, use the saved delalloc_end + * end beyond the folio boundary, use the saved delalloc_end * instead. */ if (found_start + found_len >= page_end) @@ -1275,7 +1189,8 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, if (ret >= 0) { /* No errors hit so far, run the current delalloc range. */ - ret = btrfs_run_delalloc_range(inode, page, found_start, + ret = btrfs_run_delalloc_range(inode, folio, + found_start, found_start + found_len - 1, wbc); } else { @@ -1285,30 +1200,27 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, */ unlock_extent(&inode->io_tree, found_start, found_start + found_len - 1, NULL); - __unlock_for_delalloc(&inode->vfs_inode, page, found_start, + __unlock_for_delalloc(&inode->vfs_inode, folio, + found_start, found_start + found_len - 1); } /* - * We can hit btrfs_run_delalloc_range() with >0 return value. - * - * This happens when either the IO is already done and page - * unlocked (inline) or the IO submission and page unlock would - * be handled as async (compression). - * - * Inline is only possible for regular sectorsize for now. - * - * Compression is possible for both subpage and regular cases, - * but even for subpage compression only happens for page aligned - * range, thus the found delalloc range must go beyond current - * page. + * We have some ranges that's going to be submitted asynchronously + * (compression or inline). These range have their own control + * on when to unlock the pages. We should not touch them + * anymore, so clear the range from the submission bitmap. */ - if (ret > 0) - ASSERT(!is_subpage || found_start + found_len >= page_end); - + if (ret > 0) { + unsigned int start_bit = (found_start - page_start) >> + fs_info->sectorsize_bits; + unsigned int end_bit = (min(page_end + 1, found_start + found_len) - + page_start) >> fs_info->sectorsize_bits; + bitmap_clear(&bio_ctrl->submit_bitmap, start_bit, end_bit - start_bit); + } /* - * Above btrfs_run_delalloc_range() may have unlocked the page, - * thus for the last range, we cannot touch the page anymore. + * Above btrfs_run_delalloc_range() may have unlocked the folio, + * thus for the last range, we cannot touch the folio anymore. */ if (found_start + found_len >= last_delalloc_end + 1) break; @@ -1330,10 +1242,10 @@ out: DIV_ROUND_UP(delalloc_end + 1 - page_start, PAGE_SIZE); /* - * If btrfs_run_dealloc_range() already started I/O and unlocked - * the pages, we just need to account for them here. + * If all ranges are submitted asynchronously, we just need to account + * for them here. */ - if (ret == 1) { + if (bitmap_empty(&bio_ctrl->submit_bitmap, fs_info->sectors_per_page)) { wbc->nr_to_write -= delalloc_to_write; return 1; } @@ -1351,182 +1263,148 @@ out: } /* - * Find the first byte we need to write. + * Return 0 if we have submitted or queued the sector for submission. + * Return <0 for critical errors. * - * For subpage, one page can contain several sectors, and - * __extent_writepage_io() will just grab all extent maps in the page - * range and try to submit all non-inline/non-compressed extents. - * - * This is a big problem for subpage, we shouldn't re-submit already written - * data at all. - * This function will lookup subpage dirty bit to find which range we really - * need to submit. - * - * Return the next dirty range in [@start, @end). - * If no dirty range is found, @start will be page_offset(page) + PAGE_SIZE. + * Caller should make sure filepos < i_size and handle filepos >= i_size case. */ -static void find_next_dirty_byte(const struct btrfs_fs_info *fs_info, - struct page *page, u64 *start, u64 *end) +static int submit_one_sector(struct btrfs_inode *inode, + struct folio *folio, + u64 filepos, struct btrfs_bio_ctrl *bio_ctrl, + loff_t i_size) { - struct folio *folio = page_folio(page); - struct btrfs_subpage *subpage = folio_get_private(folio); - struct btrfs_subpage_info *spi = fs_info->subpage_info; - u64 orig_start = *start; - /* Declare as unsigned long so we can use bitmap ops */ - unsigned long flags; - int range_start_bit; - int range_end_bit; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_map *em; + u64 block_start; + u64 disk_bytenr; + u64 extent_offset; + u64 em_end; + const u32 sectorsize = fs_info->sectorsize; + + ASSERT(IS_ALIGNED(filepos, sectorsize)); + + /* @filepos >= i_size case should be handled by the caller. */ + ASSERT(filepos < i_size); + + em = btrfs_get_extent(inode, NULL, filepos, sectorsize); + if (IS_ERR(em)) + return PTR_ERR_OR_ZERO(em); + + extent_offset = filepos - em->start; + em_end = extent_map_end(em); + ASSERT(filepos <= em_end); + ASSERT(IS_ALIGNED(em->start, sectorsize)); + ASSERT(IS_ALIGNED(em->len, sectorsize)); + + block_start = extent_map_block_start(em); + disk_bytenr = extent_map_block_start(em) + extent_offset; + + ASSERT(!extent_map_is_compressed(em)); + ASSERT(block_start != EXTENT_MAP_HOLE); + ASSERT(block_start != EXTENT_MAP_INLINE); + + free_extent_map(em); + em = NULL; + + btrfs_set_range_writeback(inode, filepos, filepos + sectorsize - 1); + /* + * Above call should set the whole folio with writeback flag, even + * just for a single subpage sector. + * As long as the folio is properly locked and the range is correct, + * we should always get the folio with writeback flag. + */ + ASSERT(folio_test_writeback(folio)); /* - * For regular sector size == page size case, since one page only - * contains one sector, we return the page offset directly. + * Although the PageDirty bit is cleared before entering this + * function, subpage dirty bit is not cleared. + * So clear subpage dirty bit here so next time we won't submit + * folio for range already written to disk. */ - if (!btrfs_is_subpage(fs_info, page->mapping)) { - *start = page_offset(page); - *end = page_offset(page) + PAGE_SIZE; - return; - } - - range_start_bit = spi->dirty_offset + - (offset_in_page(orig_start) >> fs_info->sectorsize_bits); - - /* We should have the page locked, but just in case */ - spin_lock_irqsave(&subpage->lock, flags); - bitmap_next_set_region(subpage->bitmaps, &range_start_bit, &range_end_bit, - spi->dirty_offset + spi->bitmap_nr_bits); - spin_unlock_irqrestore(&subpage->lock, flags); - - range_start_bit -= spi->dirty_offset; - range_end_bit -= spi->dirty_offset; - - *start = page_offset(page) + range_start_bit * fs_info->sectorsize; - *end = page_offset(page) + range_end_bit * fs_info->sectorsize; + btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize); + submit_extent_folio(bio_ctrl, disk_bytenr, folio, + sectorsize, filepos - folio_pos(folio)); + return 0; } /* - * helper for __extent_writepage. This calls the writepage start hooks, + * Helper for extent_writepage(). This calls the writepage start hooks, * and does the loop to map the page into extents and bios. * * We return 1 if the IO is started and the page is unlocked, * 0 if all went well (page still locked) * < 0 if there were errors (page still locked) */ -static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, - struct page *page, u64 start, u32 len, - struct btrfs_bio_ctrl *bio_ctrl, - loff_t i_size, - int *nr_ret) +static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, + struct folio *folio, + u64 start, u32 len, + struct btrfs_bio_ctrl *bio_ctrl, + loff_t i_size) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - u64 cur = start; - u64 end = start + len - 1; - u64 extent_offset; - u64 block_start; - struct extent_map *em; + unsigned long range_bitmap = 0; + bool submitted_io = false; + const u64 folio_start = folio_pos(folio); + u64 cur; + int bit; int ret = 0; - int nr = 0; - ASSERT(start >= page_offset(page) && - start + len <= page_offset(page) + PAGE_SIZE); + ASSERT(start >= folio_start && + start + len <= folio_start + folio_size(folio)); - ret = btrfs_writepage_cow_fixup(page); + ret = btrfs_writepage_cow_fixup(folio); if (ret) { /* Fixup worker will requeue */ - redirty_page_for_writepage(bio_ctrl->wbc, page); - unlock_page(page); + folio_redirty_for_writepage(bio_ctrl->wbc, folio); + folio_unlock(folio); return 1; } + for (cur = start; cur < start + len; cur += fs_info->sectorsize) + set_bit((cur - folio_start) >> fs_info->sectorsize_bits, &range_bitmap); + bitmap_and(&bio_ctrl->submit_bitmap, &bio_ctrl->submit_bitmap, &range_bitmap, + fs_info->sectors_per_page); + bio_ctrl->end_io_func = end_bbio_data_write; - while (cur <= end) { - u32 len = end - cur + 1; - u64 disk_bytenr; - u64 em_end; - u64 dirty_range_start = cur; - u64 dirty_range_end; - u32 iosize; + + for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) { + cur = folio_pos(folio) + (bit << fs_info->sectorsize_bits); if (cur >= i_size) { - btrfs_mark_ordered_io_finished(inode, page, cur, len, - true); + btrfs_mark_ordered_io_finished(inode, folio, cur, + start + len - cur, true); /* * This range is beyond i_size, thus we don't need to * bother writing back. * But we still need to clear the dirty subpage bit, or - * the next time the page gets dirtied, we will try to + * the next time the folio gets dirtied, we will try to * writeback the sectors with subpage dirty bits, * causing writeback without ordered extent. */ - btrfs_folio_clear_dirty(fs_info, page_folio(page), cur, len); + btrfs_folio_clear_dirty(fs_info, folio, cur, + start + len - cur); break; } - - find_next_dirty_byte(fs_info, page, &dirty_range_start, - &dirty_range_end); - if (cur < dirty_range_start) { - cur = dirty_range_start; - continue; - } - - em = btrfs_get_extent(inode, NULL, cur, len); - if (IS_ERR(em)) { - ret = PTR_ERR_OR_ZERO(em); - goto out_error; - } - - extent_offset = cur - em->start; - em_end = extent_map_end(em); - ASSERT(cur <= em_end); - ASSERT(cur < end); - ASSERT(IS_ALIGNED(em->start, fs_info->sectorsize)); - ASSERT(IS_ALIGNED(em->len, fs_info->sectorsize)); - - block_start = extent_map_block_start(em); - disk_bytenr = extent_map_block_start(em) + extent_offset; - - ASSERT(!extent_map_is_compressed(em)); - ASSERT(block_start != EXTENT_MAP_HOLE); - ASSERT(block_start != EXTENT_MAP_INLINE); - - /* - * Note that em_end from extent_map_end() and dirty_range_end from - * find_next_dirty_byte() are all exclusive - */ - iosize = min(min(em_end, end + 1), dirty_range_end) - cur; - free_extent_map(em); - em = NULL; - - /* - * Although the PageDirty bit might be cleared before entering - * this function, subpage dirty bit is not cleared. - * So clear subpage dirty bit here so next time we won't submit - * page for range already written to disk. - */ - btrfs_folio_clear_dirty(fs_info, page_folio(page), cur, iosize); - btrfs_set_range_writeback(inode, cur, cur + iosize - 1); - if (!PageWriteback(page)) { - btrfs_err(inode->root->fs_info, - "page %lu not writeback, cur %llu end %llu", - page->index, cur, end); - } - - - submit_extent_page(bio_ctrl, disk_bytenr, page, iosize, - cur - page_offset(page)); - cur += iosize; - nr++; + ret = submit_one_sector(inode, folio, cur, bio_ctrl, i_size); + if (ret < 0) + goto out; + submitted_io = true; } - btrfs_folio_assert_not_dirty(fs_info, page_folio(page), start, len); - *nr_ret = nr; - return 0; - -out_error: + btrfs_folio_assert_not_dirty(fs_info, folio, start, len); +out: /* - * If we finish without problem, we should not only clear page dirty, - * but also empty subpage dirty bits + * If we didn't submitted any sector (>= i_size), folio dirty get + * cleared but PAGECACHE_TAG_DIRTY is not cleared (only cleared + * by folio_start_writeback() if the folio is not dirty). + * + * Here we set writeback and clear for the range. If the full folio + * is no longer dirty then we clear the PAGECACHE_TAG_DIRTY tag. */ - *nr_ret = nr; + if (!submitted_io) { + btrfs_folio_set_writeback(fs_info, folio, start, len); + btrfs_folio_clear_writeback(fs_info, folio, start, len); + } return ret; } @@ -1539,62 +1417,65 @@ out_error: * Return 0 if everything goes well. * Return <0 for error. */ -static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl) +static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl) { - struct folio *folio = page_folio(page); - struct inode *inode = page->mapping->host; - const u64 page_start = page_offset(page); + struct inode *inode = folio->mapping->host; + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + const u64 page_start = folio_pos(folio); int ret; - int nr = 0; size_t pg_offset; loff_t i_size = i_size_read(inode); unsigned long end_index = i_size >> PAGE_SHIFT; - trace___extent_writepage(page, inode, bio_ctrl->wbc); + trace_extent_writepage(folio, inode, bio_ctrl->wbc); - WARN_ON(!PageLocked(page)); + WARN_ON(!folio_test_locked(folio)); - pg_offset = offset_in_page(i_size); - if (page->index > end_index || - (page->index == end_index && !pg_offset)) { + pg_offset = offset_in_folio(folio, i_size); + if (folio->index > end_index || + (folio->index == end_index && !pg_offset)) { folio_invalidate(folio, 0, folio_size(folio)); folio_unlock(folio); return 0; } - if (page->index == end_index) - memzero_page(page, pg_offset, PAGE_SIZE - pg_offset); + if (folio->index == end_index) + folio_zero_range(folio, pg_offset, folio_size(folio) - pg_offset); - ret = set_page_extent_mapped(page); + /* + * Default to unlock the whole folio. + * The proper bitmap can only be initialized until writepage_delalloc(). + */ + bio_ctrl->submit_bitmap = (unsigned long)-1; + ret = set_folio_extent_mapped(folio); if (ret < 0) goto done; - ret = writepage_delalloc(BTRFS_I(inode), page, bio_ctrl->wbc); + ret = writepage_delalloc(BTRFS_I(inode), folio, bio_ctrl); if (ret == 1) return 0; if (ret) goto done; - ret = __extent_writepage_io(BTRFS_I(inode), page, page_offset(page), - PAGE_SIZE, bio_ctrl, i_size, &nr); + ret = extent_writepage_io(BTRFS_I(inode), folio, folio_pos(folio), + PAGE_SIZE, bio_ctrl, i_size); if (ret == 1) return 0; bio_ctrl->wbc->nr_to_write--; done: - if (nr == 0) { - /* make sure the mapping tag for page dirty gets cleared */ - set_page_writeback(page); - end_page_writeback(page); - } if (ret) { - btrfs_mark_ordered_io_finished(BTRFS_I(inode), page, page_start, - PAGE_SIZE, !ret); - mapping_set_error(page->mapping, ret); + btrfs_mark_ordered_io_finished(BTRFS_I(inode), folio, + page_start, PAGE_SIZE, !ret); + mapping_set_error(folio->mapping, ret); } - btrfs_folio_end_all_writers(inode_to_fs_info(inode), folio); + /* + * Only unlock ranges that are submitted. As there can be some async + * submitted ranges inside the folio. + */ + btrfs_folio_end_writer_lock_bitmap(fs_info, folio, bio_ctrl->submit_bitmap); ASSERT(ret <= 0); return ret; } @@ -1846,7 +1727,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, folio_unlock(folio); } } - btrfs_submit_bio(bbio, 0); + btrfs_submit_bbio(bbio, 0); } /* @@ -1863,17 +1744,16 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, * Return >=0 for the number of submitted extent buffers. * Return <0 for fatal error. */ -static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) +static int submit_eb_subpage(struct folio *folio, struct writeback_control *wbc) { - struct btrfs_fs_info *fs_info = page_to_fs_info(page); - struct folio *folio = page_folio(page); + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); int submitted = 0; - u64 page_start = page_offset(page); + u64 folio_start = folio_pos(folio); int bit_start = 0; int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits; /* Lock and write each dirty extent buffers in the range */ - while (bit_start < fs_info->subpage_info->bitmap_nr_bits) { + while (bit_start < fs_info->sectors_per_page) { struct btrfs_subpage *subpage = folio_get_private(folio); struct extent_buffer *eb; unsigned long flags; @@ -1883,21 +1763,21 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) * Take private lock to ensure the subpage won't be detached * in the meantime. */ - spin_lock(&page->mapping->i_private_lock); + spin_lock(&folio->mapping->i_private_lock); if (!folio_test_private(folio)) { - spin_unlock(&page->mapping->i_private_lock); + spin_unlock(&folio->mapping->i_private_lock); break; } spin_lock_irqsave(&subpage->lock, flags); - if (!test_bit(bit_start + fs_info->subpage_info->dirty_offset, + if (!test_bit(bit_start + btrfs_bitmap_nr_dirty * fs_info->sectors_per_page, subpage->bitmaps)) { spin_unlock_irqrestore(&subpage->lock, flags); - spin_unlock(&page->mapping->i_private_lock); + spin_unlock(&folio->mapping->i_private_lock); bit_start++; continue; } - start = page_start + bit_start * fs_info->sectorsize; + start = folio_start + bit_start * fs_info->sectorsize; bit_start += sectors_per_node; /* @@ -1906,7 +1786,7 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) */ eb = find_extent_buffer_nolock(fs_info, start); spin_unlock_irqrestore(&subpage->lock, flags); - spin_unlock(&page->mapping->i_private_lock); + spin_unlock(&folio->mapping->i_private_lock); /* * The eb has already reached 0 refs thus find_extent_buffer() @@ -1945,19 +1825,18 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) * previous call. * Return <0 for fatal error. */ -static int submit_eb_page(struct page *page, struct btrfs_eb_write_context *ctx) +static int submit_eb_page(struct folio *folio, struct btrfs_eb_write_context *ctx) { struct writeback_control *wbc = ctx->wbc; - struct address_space *mapping = page->mapping; - struct folio *folio = page_folio(page); + struct address_space *mapping = folio->mapping; struct extent_buffer *eb; int ret; if (!folio_test_private(folio)) return 0; - if (page_to_fs_info(page)->nodesize < PAGE_SIZE) - return submit_eb_subpage(page, wbc); + if (folio_to_fs_info(folio)->nodesize < PAGE_SIZE) + return submit_eb_subpage(folio, wbc); spin_lock(&mapping->i_private_lock); if (!folio_test_private(folio)) { @@ -2055,7 +1934,7 @@ retry: for (i = 0; i < nr_folios; i++) { struct folio *folio = fbatch.folios[i]; - ret = submit_eb_page(&folio->page, &ctx); + ret = submit_eb_page(folio, &ctx); if (ret == 0) continue; if (ret < 0) { @@ -2109,7 +1988,7 @@ retry: * extent io tree. Thus we don't want to submit such wild eb * if the fs already has error. * - * We can get ret > 0 from submit_extent_page() indicating how many ebs + * We can get ret > 0 from submit_extent_folio() indicating how many ebs * were submitted. Reset it to 0 to avoid false alerts for the caller. */ if (ret > 0) @@ -2248,7 +2127,7 @@ retry: continue; } - ret = __extent_writepage(&folio->page, bio_ctrl); + ret = extent_writepage(folio, bio_ctrl); if (ret < 0) { done = 1; break; @@ -2295,7 +2174,7 @@ retry: * already been ran (aka, ordered extent inserted) and all pages are still * locked. */ -void extent_write_locked_range(struct inode *inode, const struct page *locked_page, +void extent_write_locked_range(struct inode *inode, const struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc, bool pages_dirty) { @@ -2319,37 +2198,46 @@ void extent_write_locked_range(struct inode *inode, const struct page *locked_pa while (cur <= end) { u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end); u32 cur_len = cur_end + 1 - cur; - struct page *page; - int nr = 0; + struct folio *folio; - page = find_get_page(mapping, cur >> PAGE_SHIFT); - ASSERT(PageLocked(page)); - if (pages_dirty && page != locked_page) - ASSERT(PageDirty(page)); + folio = __filemap_get_folio(mapping, cur >> PAGE_SHIFT, 0, 0); - ret = __extent_writepage_io(BTRFS_I(inode), page, cur, cur_len, - &bio_ctrl, i_size, &nr); + /* + * This shouldn't happen, the pages are pinned and locked, this + * code is just in case, but shouldn't actually be run. + */ + if (IS_ERR(folio)) { + btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL, + cur, cur_len, false); + mapping_set_error(mapping, PTR_ERR(folio)); + cur = cur_end + 1; + continue; + } + + ASSERT(folio_test_locked(folio)); + if (pages_dirty && folio != locked_folio) + ASSERT(folio_test_dirty(folio)); + + /* + * Set the submission bitmap to submit all sectors. + * extent_writepage_io() will do the truncation correctly. + */ + bio_ctrl.submit_bitmap = (unsigned long)-1; + ret = extent_writepage_io(BTRFS_I(inode), folio, cur, cur_len, + &bio_ctrl, i_size); if (ret == 1) goto next_page; - /* Make sure the mapping tag for page dirty gets cleared. */ - if (nr == 0) { - struct folio *folio; - - folio = page_folio(page); - btrfs_folio_set_writeback(fs_info, folio, cur, cur_len); - btrfs_folio_clear_writeback(fs_info, folio, cur, cur_len); - } if (ret) { - btrfs_mark_ordered_io_finished(BTRFS_I(inode), page, + btrfs_mark_ordered_io_finished(BTRFS_I(inode), folio, cur, cur_len, !ret); - mapping_set_error(page->mapping, ret); + mapping_set_error(mapping, ret); } - btrfs_folio_unlock_writer(fs_info, page_folio(page), cur, cur_len); + btrfs_folio_end_writer_lock(fs_info, folio, cur, cur_len); if (ret < 0) found_error = true; next_page: - put_page(page); + folio_put(folio); cur = cur_end + 1; } @@ -2379,18 +2267,12 @@ int btrfs_writepages(struct address_space *mapping, struct writeback_control *wb void btrfs_readahead(struct readahead_control *rac) { struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD }; - struct page *pagepool[16]; + struct folio *folio; struct extent_map *em_cached = NULL; u64 prev_em_start = (u64)-1; - int nr; - while ((nr = readahead_page_batch(rac, pagepool))) { - u64 contig_start = readahead_pos(rac); - u64 contig_end = contig_start + readahead_batch_length(rac) - 1; - - contiguous_readpages(pagepool, nr, contig_start, contig_end, - &em_cached, &bio_ctrl, &prev_em_start); - } + while ((folio = readahead_folio(rac)) != NULL) + btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start); if (em_cached) free_extent_map(em_cached); @@ -2435,9 +2317,9 @@ int extent_invalidate_folio(struct extent_io_tree *tree, * to drop the page. */ static bool try_release_extent_state(struct extent_io_tree *tree, - struct page *page, gfp_t mask) + struct folio *folio, gfp_t mask) { - u64 start = page_offset(page); + u64 start = folio_pos(folio); u64 end = start + PAGE_SIZE - 1; bool ret; @@ -2473,11 +2355,11 @@ static bool try_release_extent_state(struct extent_io_tree *tree, * in the range corresponding to the page, both state records and extent * map records are removed */ -bool try_release_extent_mapping(struct page *page, gfp_t mask) +bool try_release_extent_mapping(struct folio *folio, gfp_t mask) { - u64 start = page_offset(page); + u64 start = folio_pos(folio); u64 end = start + PAGE_SIZE - 1; - struct btrfs_inode *inode = page_to_inode(page); + struct btrfs_inode *inode = folio_to_inode(folio); struct extent_io_tree *io_tree = &inode->io_tree; while (start <= end) { @@ -2546,7 +2428,7 @@ next: cond_resched(); } } - return try_release_extent_state(io_tree, page, mask); + return try_release_extent_state(io_tree, folio, mask); } static void __free_extent_buffer(struct extent_buffer *eb) @@ -2572,7 +2454,7 @@ static bool folio_range_has_eb(struct btrfs_fs_info *fs_info, struct folio *foli return true; /* * Even there is no eb refs here, we may still have - * end_page_read() call relying on page::private. + * end_folio_read() call relying on page::private. */ if (atomic_read(&subpage->readers)) return true; @@ -3615,7 +3497,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, ASSERT(ret); } } - btrfs_submit_bio(bbio, mirror_num); + btrfs_submit_bbio(bbio, mirror_num); done: if (wait == WAIT_COMPLETE) { @@ -4171,17 +4053,17 @@ void memmove_extent_buffer(const struct extent_buffer *dst, #define GANG_LOOKUP_SIZE 16 static struct extent_buffer *get_next_extent_buffer( - const struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr) + const struct btrfs_fs_info *fs_info, struct folio *folio, u64 bytenr) { struct extent_buffer *gang[GANG_LOOKUP_SIZE]; struct extent_buffer *found = NULL; - u64 page_start = page_offset(page); - u64 cur = page_start; + u64 folio_start = folio_pos(folio); + u64 cur = folio_start; - ASSERT(in_range(bytenr, page_start, PAGE_SIZE)); + ASSERT(in_range(bytenr, folio_start, PAGE_SIZE)); lockdep_assert_held(&fs_info->buffer_lock); - while (cur < page_start + PAGE_SIZE) { + while (cur < folio_start + PAGE_SIZE) { int ret; int i; @@ -4193,7 +4075,7 @@ static struct extent_buffer *get_next_extent_buffer( goto out; for (i = 0; i < ret; i++) { /* Already beyond page end */ - if (gang[i]->start >= page_start + PAGE_SIZE) + if (gang[i]->start >= folio_start + PAGE_SIZE) goto out; /* Found one */ if (gang[i]->start >= bytenr) { @@ -4207,11 +4089,11 @@ out: return found; } -static int try_release_subpage_extent_buffer(struct page *page) +static int try_release_subpage_extent_buffer(struct folio *folio) { - struct btrfs_fs_info *fs_info = page_to_fs_info(page); - u64 cur = page_offset(page); - const u64 end = page_offset(page) + PAGE_SIZE; + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); + u64 cur = folio_pos(folio); + const u64 end = cur + PAGE_SIZE; int ret; while (cur < end) { @@ -4226,7 +4108,7 @@ static int try_release_subpage_extent_buffer(struct page *page) * with spinlock rather than RCU. */ spin_lock(&fs_info->buffer_lock); - eb = get_next_extent_buffer(fs_info, page, cur); + eb = get_next_extent_buffer(fs_info, folio, cur); if (!eb) { /* No more eb in the page range after or at cur */ spin_unlock(&fs_info->buffer_lock); @@ -4267,31 +4149,30 @@ static int try_release_subpage_extent_buffer(struct page *page) * Finally to check if we have cleared folio private, as if we have * released all ebs in the page, the folio private should be cleared now. */ - spin_lock(&page->mapping->i_private_lock); - if (!folio_test_private(page_folio(page))) + spin_lock(&folio->mapping->i_private_lock); + if (!folio_test_private(folio)) ret = 1; else ret = 0; - spin_unlock(&page->mapping->i_private_lock); + spin_unlock(&folio->mapping->i_private_lock); return ret; } -int try_release_extent_buffer(struct page *page) +int try_release_extent_buffer(struct folio *folio) { - struct folio *folio = page_folio(page); struct extent_buffer *eb; - if (page_to_fs_info(page)->nodesize < PAGE_SIZE) - return try_release_subpage_extent_buffer(page); + if (folio_to_fs_info(folio)->nodesize < PAGE_SIZE) + return try_release_subpage_extent_buffer(folio); /* * We need to make sure nobody is changing folio private, as we rely on * folio private as the pointer to extent buffer. */ - spin_lock(&page->mapping->i_private_lock); + spin_lock(&folio->mapping->i_private_lock); if (!folio_test_private(folio)) { - spin_unlock(&page->mapping->i_private_lock); + spin_unlock(&folio->mapping->i_private_lock); return 1; } @@ -4306,10 +4187,10 @@ int try_release_extent_buffer(struct page *page) spin_lock(&eb->refs_lock); if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { spin_unlock(&eb->refs_lock); - spin_unlock(&page->mapping->i_private_lock); + spin_unlock(&folio->mapping->i_private_lock); return 0; } - spin_unlock(&page->mapping->i_private_lock); + spin_unlock(&folio->mapping->i_private_lock); /* * If tree ref isn't set then we know the ref on this eb is a real ref, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index dceebd76c7d1..8a36117ed453 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -236,11 +236,11 @@ static inline void extent_changeset_free(struct extent_changeset *changeset) kfree(changeset); } -bool try_release_extent_mapping(struct page *page, gfp_t mask); -int try_release_extent_buffer(struct page *page); +bool try_release_extent_mapping(struct folio *folio, gfp_t mask); +int try_release_extent_buffer(struct folio *folio); int btrfs_read_folio(struct file *file, struct folio *folio); -void extent_write_locked_range(struct inode *inode, const struct page *locked_page, +void extent_write_locked_range(struct inode *inode, const struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc, bool pages_dirty); int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc); @@ -249,7 +249,7 @@ int btree_write_cache_pages(struct address_space *mapping, void btrfs_readahead(struct readahead_control *rac); int set_folio_extent_mapped(struct folio *folio); int set_page_extent_mapped(struct page *page); -void clear_page_extent_mapped(struct page *page); +void clear_folio_extent_mapped(struct folio *folio); struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, u64 owner_root, int level); @@ -354,7 +354,7 @@ void set_extent_buffer_dirty(struct extent_buffer *eb); void set_extent_buffer_uptodate(struct extent_buffer *eb); void clear_extent_buffer_uptodate(struct extent_buffer *eb); void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, - const struct page *locked_page, + const struct folio *locked_folio, struct extent_state **cached, u32 bits_to_clear, unsigned long page_ops); int extent_invalidate_folio(struct extent_io_tree *tree, @@ -368,7 +368,7 @@ int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS bool find_lock_delalloc_range(struct inode *inode, - struct page *locked_page, u64 *start, + struct folio *locked_folio, u64 *start, u64 *end); #endif struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 10ac5f657e38..25d191f1ac10 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -192,10 +192,13 @@ static inline u64 extent_map_block_len(const struct extent_map *em) static inline u64 extent_map_block_end(const struct extent_map *em) { - if (extent_map_block_start(em) + extent_map_block_len(em) < - extent_map_block_start(em)) + const u64 block_start = extent_map_block_start(em); + const u64 block_end = block_start + extent_map_block_len(em); + + if (block_end < block_start) return (u64)-1; - return extent_map_block_start(em) + extent_map_block_len(em); + + return block_end; } static bool can_merge_extent_map(const struct extent_map *em) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 5c342fe1af61..886749b39672 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -151,7 +151,7 @@ static inline u32 max_ordered_sum_bytes(const struct btrfs_fs_info *fs_info) * Calculate the total size needed to allocate for an ordered sum structure * spanning @bytes in the file. */ -static int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info, unsigned long bytes) +static int btrfs_ordered_sum_size(const struct btrfs_fs_info *fs_info, unsigned long bytes) { return sizeof(struct btrfs_ordered_sum) + bytes_to_csum_size(fs_info, bytes); } @@ -1272,7 +1272,7 @@ out: void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, const struct btrfs_path *path, - struct btrfs_file_extent_item *fi, + const struct btrfs_file_extent_item *fi, struct extent_map *em) { struct btrfs_fs_info *fs_info = inode->root->fs_info; diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h index 557dc43d7142..0e13661a71f3 100644 --- a/fs/btrfs/file-item.h +++ b/fs/btrfs/file-item.h @@ -74,7 +74,7 @@ int btrfs_lookup_csums_bitmap(struct btrfs_root *root, struct btrfs_path *path, unsigned long *csum_bitmap); void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, const struct btrfs_path *path, - struct btrfs_file_extent_item *fi, + const struct btrfs_file_extent_item *fi, struct extent_map *em); int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start, u64 len); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 2aeb8116549c..c5e36f58eb07 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1617,7 +1617,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (current->journal_info == BTRFS_TRANS_DIO_WRITE_STUB) { skip_ilock = true; current->journal_info = NULL; - lockdep_assert_held(&inode->vfs_inode.i_rwsem); + btrfs_assert_inode_locked(inode); } trace_btrfs_sync_file(file, datasync); @@ -1920,8 +1920,8 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) reserved_space = PAGE_SIZE; sb_start_pagefault(inode->i_sb); - page_start = page_offset(page); - page_end = page_start + PAGE_SIZE - 1; + page_start = folio_pos(folio); + page_end = page_start + folio_size(folio) - 1; end = page_end; /* @@ -1949,18 +1949,18 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) ret = VM_FAULT_NOPAGE; again: down_read(&BTRFS_I(inode)->i_mmap_lock); - lock_page(page); + folio_lock(folio); size = i_size_read(inode); - if ((page->mapping != inode->i_mapping) || + if ((folio->mapping != inode->i_mapping) || (page_start >= size)) { /* Page got truncated out from underneath us. */ goto out_unlock; } - wait_on_page_writeback(page); + folio_wait_writeback(folio); lock_extent(io_tree, page_start, page_end, &cached_state); - ret2 = set_page_extent_mapped(page); + ret2 = set_folio_extent_mapped(folio); if (ret2 < 0) { ret = vmf_error(ret2); unlock_extent(io_tree, page_start, page_end, &cached_state); @@ -1974,14 +1974,14 @@ again: ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, PAGE_SIZE); if (ordered) { unlock_extent(io_tree, page_start, page_end, &cached_state); - unlock_page(page); + folio_unlock(folio); up_read(&BTRFS_I(inode)->i_mmap_lock); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; } - if (page->index == ((size - 1) >> PAGE_SHIFT)) { + if (folio->index == ((size - 1) >> PAGE_SHIFT)) { reserved_space = round_up(size - page_start, fs_info->sectorsize); if (reserved_space < PAGE_SIZE) { end = page_start + reserved_space - 1; @@ -2011,13 +2011,13 @@ again: } /* Page is wholly or partially inside EOF. */ - if (page_start + PAGE_SIZE > size) - zero_start = offset_in_page(size); + if (page_start + folio_size(folio) > size) + zero_start = offset_in_folio(folio, size); else zero_start = PAGE_SIZE; if (zero_start != PAGE_SIZE) - memzero_page(page, zero_start, PAGE_SIZE - zero_start); + folio_zero_range(folio, zero_start, folio_size(folio) - zero_start); btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE); btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start); @@ -2034,7 +2034,7 @@ again: return VM_FAULT_LOCKED; out_unlock: - unlock_page(page); + folio_unlock(folio); up_read(&BTRFS_I(inode)->i_mmap_lock); out: btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 3d6d4b503220..79f64e383edd 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -703,8 +703,8 @@ struct btrfs_fs_info { * running. */ refcount_t scrub_workers_refcnt; + u32 sectors_per_page; struct workqueue_struct *scrub_workers; - struct btrfs_subpage_info *subpage_info; struct btrfs_discard_ctl discard_ctl; diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 316756ff08ac..29572dfaf878 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -14,7 +14,7 @@ #include "extent-tree.h" #include "file-item.h" -struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, +struct btrfs_inode_ref *btrfs_find_name_in_backref(const struct extent_buffer *leaf, int slot, const struct fscrypt_str *name) { @@ -42,7 +42,7 @@ struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, } struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( - struct extent_buffer *leaf, int slot, u64 ref_objectid, + const struct extent_buffer *leaf, int slot, u64 ref_objectid, const struct fscrypt_str *name) { struct btrfs_inode_extref *extref; @@ -423,9 +423,9 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root return ret; } -static inline void btrfs_trace_truncate(struct btrfs_inode *inode, - struct extent_buffer *leaf, - struct btrfs_file_extent_item *fi, +static inline void btrfs_trace_truncate(const struct btrfs_inode *inode, + const struct extent_buffer *leaf, + const struct btrfs_file_extent_item *fi, u64 offset, int extent_type, int slot) { if (!inode) diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h index c4aded82709b..c11b97fdccc4 100644 --- a/fs/btrfs/inode-item.h +++ b/fs/btrfs/inode-item.h @@ -109,11 +109,11 @@ struct btrfs_inode_extref *btrfs_lookup_inode_extref( u64 inode_objectid, u64 ref_objectid, int ins_len, int cow); -struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, +struct btrfs_inode_ref *btrfs_find_name_in_backref(const struct extent_buffer *leaf, int slot, const struct fscrypt_str *name); struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( - struct extent_buffer *leaf, int slot, u64 ref_objectid, + const struct extent_buffer *leaf, int slot, u64 ref_objectid, const struct fscrypt_str *name); #endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b1b6564ab68f..edac499fd83d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -116,7 +116,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr); static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback); static noinline int run_delalloc_cow(struct btrfs_inode *inode, - struct page *locked_page, u64 start, + struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc, bool pages_dirty); @@ -393,17 +393,17 @@ void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags) * extent (btrfs_finish_ordered_io()). */ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, - struct page *locked_page, + struct folio *locked_folio, u64 offset, u64 bytes) { unsigned long index = offset >> PAGE_SHIFT; unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; u64 page_start = 0, page_end = 0; - struct page *page; + struct folio *folio; - if (locked_page) { - page_start = page_offset(locked_page); - page_end = page_start + PAGE_SIZE - 1; + if (locked_folio) { + page_start = folio_pos(locked_folio); + page_end = page_start + folio_size(locked_folio) - 1; } while (index <= end_index) { @@ -417,13 +417,13 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, * btrfs_mark_ordered_io_finished() would skip the accounting * for the page range, and the ordered extent will never finish. */ - if (locked_page && index == (page_start >> PAGE_SHIFT)) { + if (locked_folio && index == (page_start >> PAGE_SHIFT)) { index++; continue; } - page = find_get_page(inode->vfs_inode.i_mapping, index); + folio = __filemap_get_folio(inode->vfs_inode.i_mapping, index, 0, 0); index++; - if (!page) + if (IS_ERR(folio)) continue; /* @@ -431,14 +431,14 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, * range, then btrfs_mark_ordered_io_finished() will handle * the ordered extent accounting for the range. */ - btrfs_folio_clamp_clear_ordered(inode->root->fs_info, - page_folio(page), offset, bytes); - put_page(page); + btrfs_folio_clamp_clear_ordered(inode->root->fs_info, folio, + offset, bytes); + folio_put(folio); } - if (locked_page) { + if (locked_folio) { /* The locked page covers the full range, nothing needs to be done */ - if (bytes + offset <= page_start + PAGE_SIZE) + if (bytes + offset <= page_start + folio_size(locked_folio)) return; /* * In case this page belongs to the delalloc range being @@ -447,8 +447,9 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, * run_delalloc_range */ if (page_start >= offset && page_end <= (offset + bytes - 1)) { - bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE; - offset = page_offset(locked_page) + PAGE_SIZE; + bytes = offset + bytes - folio_pos(locked_folio) - + folio_size(locked_folio); + offset = folio_pos(locked_folio) + folio_size(locked_folio); } } @@ -494,7 +495,6 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, { struct btrfs_root *root = inode->root; struct extent_buffer *leaf; - struct page *page = NULL; const u32 sectorsize = trans->fs_info->sectorsize; char *kaddr; unsigned long ptr; @@ -554,12 +554,16 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, btrfs_set_file_extent_compression(leaf, ei, compress_type); } else { - page = find_get_page(inode->vfs_inode.i_mapping, 0); + struct folio *folio; + + folio = __filemap_get_folio(inode->vfs_inode.i_mapping, + 0, 0, 0); + ASSERT(!IS_ERR(folio)); btrfs_set_file_extent_compression(leaf, ei, 0); - kaddr = kmap_local_page(page); + kaddr = kmap_local_folio(folio, 0); write_extent_buffer(leaf, kaddr, ptr, size); kunmap_local(kaddr); - put_page(page); + folio_put(folio); } btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); @@ -715,7 +719,7 @@ out: } static noinline int cow_file_range_inline(struct btrfs_inode *inode, - struct page *locked_page, + struct folio *locked_folio, u64 offset, u64 end, size_t compressed_size, int compress_type, @@ -740,13 +744,26 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, return ret; } + /* + * In the successful case (ret == 0 here), cow_file_range will return 1. + * + * Quite a bit further up the callstack in extent_writepage(), ret == 1 + * is treated as a short circuited success and does not unlock the folio, + * so we must do it here. + * + * In the failure case, the locked_folio does get unlocked by + * btrfs_folio_end_all_writers, which asserts that it is still locked + * at that point, so we must *not* unlock it here. + * + * The other two callsites in compress_file_range do not have a + * locked_folio, so they are not relevant to this logic. + */ if (ret == 0) - locked_page = NULL; + locked_folio = NULL; - extent_clear_unlock_delalloc(inode, offset, end, locked_page, &cached, - clear_flags, - PAGE_UNLOCK | PAGE_START_WRITEBACK | - PAGE_END_WRITEBACK); + extent_clear_unlock_delalloc(inode, offset, end, locked_folio, &cached, + clear_flags, PAGE_UNLOCK | + PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); return ret; } @@ -762,7 +779,7 @@ struct async_extent { struct async_chunk { struct btrfs_inode *inode; - struct page *locked_page; + struct folio *locked_folio; u64 start; u64 end; blk_opf_t write_flags; @@ -868,25 +885,25 @@ static inline void inode_should_defrag(struct btrfs_inode *inode, /* If this is a small write inside eof, kick off a defrag */ if (num_bytes < small_write && (start > 0 || end + 1 < inode->disk_i_size)) - btrfs_add_inode_defrag(NULL, inode, small_write); + btrfs_add_inode_defrag(inode, small_write); } static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) { unsigned long end_index = end >> PAGE_SHIFT; - struct page *page; + struct folio *folio; int ret = 0; for (unsigned long index = start >> PAGE_SHIFT; index <= end_index; index++) { - page = find_get_page(inode->i_mapping, index); - if (unlikely(!page)) { + folio = __filemap_get_folio(inode->i_mapping, index, 0, 0); + if (IS_ERR(folio)) { if (!ret) - ret = -ENOENT; + ret = PTR_ERR(folio); continue; } - clear_page_dirty_for_io(page); - put_page(page); + folio_clear_dirty_for_io(folio); + folio_put(folio); } return ret; } @@ -1122,7 +1139,7 @@ static void free_async_extent_pages(struct async_extent *async_extent) static void submit_uncompressed_range(struct btrfs_inode *inode, struct async_extent *async_extent, - struct page *locked_page) + struct folio *locked_folio) { u64 start = async_extent->start; u64 end = async_extent->start + async_extent->ram_size - 1; @@ -1135,20 +1152,22 @@ static void submit_uncompressed_range(struct btrfs_inode *inode, }; wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode); - ret = run_delalloc_cow(inode, locked_page, start, end, &wbc, false); + ret = run_delalloc_cow(inode, locked_folio, start, end, + &wbc, false); wbc_detach_inode(&wbc); if (ret < 0) { - btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1); - if (locked_page) { - const u64 page_start = page_offset(locked_page); + btrfs_cleanup_ordered_extents(inode, locked_folio, + start, end - start + 1); + if (locked_folio) { + const u64 page_start = folio_pos(locked_folio); - set_page_writeback(locked_page); - end_page_writeback(locked_page); - btrfs_mark_ordered_io_finished(inode, locked_page, + folio_start_writeback(locked_folio); + folio_end_writeback(locked_folio); + btrfs_mark_ordered_io_finished(inode, locked_folio, page_start, PAGE_SIZE, !ret); - mapping_set_error(locked_page->mapping, ret); - unlock_page(locked_page); + mapping_set_error(locked_folio->mapping, ret); + folio_unlock(locked_folio); } } } @@ -1164,7 +1183,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, struct btrfs_ordered_extent *ordered; struct btrfs_file_extent file_extent; struct btrfs_key ins; - struct page *locked_page = NULL; + struct folio *locked_folio = NULL; struct extent_state *cached = NULL; struct extent_map *em; int ret = 0; @@ -1175,19 +1194,20 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, kthread_associate_blkcg(async_chunk->blkcg_css); /* - * If async_chunk->locked_page is in the async_extent range, we need to + * If async_chunk->locked_folio is in the async_extent range, we need to * handle it. */ - if (async_chunk->locked_page) { - u64 locked_page_start = page_offset(async_chunk->locked_page); - u64 locked_page_end = locked_page_start + PAGE_SIZE - 1; + if (async_chunk->locked_folio) { + u64 locked_folio_start = folio_pos(async_chunk->locked_folio); + u64 locked_folio_end = locked_folio_start + + folio_size(async_chunk->locked_folio) - 1; - if (!(start >= locked_page_end || end <= locked_page_start)) - locked_page = async_chunk->locked_page; + if (!(start >= locked_folio_end || end <= locked_folio_start)) + locked_folio = async_chunk->locked_folio; } if (async_extent->compress_type == BTRFS_COMPRESS_NONE) { - submit_uncompressed_range(inode, async_extent, locked_page); + submit_uncompressed_range(inode, async_extent, locked_folio); goto done; } @@ -1202,7 +1222,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, * non-contiguous space for the uncompressed size instead. So * fall back to uncompressed. */ - submit_uncompressed_range(inode, async_extent, locked_page); + submit_uncompressed_range(inode, async_extent, locked_folio); goto done; } @@ -1306,21 +1326,21 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * allocate extents on disk for the range, and create ordered data structs * in ram to track those extents. * - * locked_page is the page that writepage had locked already. We use + * locked_folio is the folio that writepage had locked already. We use * it to make sure we don't do extra locks or unlocks. * - * When this function fails, it unlocks all pages except @locked_page. + * When this function fails, it unlocks all pages except @locked_folio. * * When this function successfully creates an inline extent, it returns 1 and - * unlocks all pages including locked_page and starts I/O on them. - * (In reality inline extents are limited to a single page, so locked_page is + * unlocks all pages including locked_folio and starts I/O on them. + * (In reality inline extents are limited to a single page, so locked_folio is * the only page handled anyway). * * When this function succeed and creates a normal extent, the page locking * status depends on the passed in flags: * * - If @keep_locked is set, all pages are kept locked. - * - Else all pages except for @locked_page are unlocked. + * - Else all pages except for @locked_folio are unlocked. * * When a failure happens in the second or later iteration of the * while-loop, the ordered extents created in previous iterations are kept @@ -1329,8 +1349,8 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * example. */ static noinline int cow_file_range(struct btrfs_inode *inode, - struct page *locked_page, u64 start, u64 end, - u64 *done_offset, + struct folio *locked_folio, u64 start, + u64 end, u64 *done_offset, bool keep_locked, bool no_inline) { struct btrfs_root *root = inode->root; @@ -1363,7 +1383,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, if (!no_inline) { /* lets try to make an inline extent */ - ret = cow_file_range_inline(inode, locked_page, start, end, 0, + ret = cow_file_range_inline(inode, locked_folio, start, end, 0, BTRFS_COMPRESS_NONE, NULL, false); if (ret <= 0) { /* @@ -1500,7 +1520,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, page_ops |= PAGE_SET_ORDERED; extent_clear_unlock_delalloc(inode, start, start + ram_size - 1, - locked_page, &cached, + locked_folio, &cached, EXTENT_LOCKED | EXTENT_DELALLOC, page_ops); if (num_bytes < cur_alloc_size) @@ -1553,13 +1573,13 @@ out_unlock: * function. * * However, in case of @keep_locked, we still need to unlock the pages - * (except @locked_page) to ensure all the pages are unlocked. + * (except @locked_folio) to ensure all the pages are unlocked. */ if (keep_locked && orig_start < start) { - if (!locked_page) + if (!locked_folio) mapping_set_error(inode->vfs_inode.i_mapping, ret); extent_clear_unlock_delalloc(inode, orig_start, start - 1, - locked_page, NULL, 0, page_ops); + locked_folio, NULL, 0, page_ops); } /* @@ -1582,8 +1602,7 @@ out_unlock: if (extent_reserved) { extent_clear_unlock_delalloc(inode, start, start + cur_alloc_size - 1, - locked_page, &cached, - clear_bits, + locked_folio, &cached, clear_bits, page_ops); btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL); start += cur_alloc_size; @@ -1597,7 +1616,7 @@ out_unlock: */ if (start < end) { clear_bits |= EXTENT_CLEAR_DATA_RESV; - extent_clear_unlock_delalloc(inode, start, end, locked_page, + extent_clear_unlock_delalloc(inode, start, end, locked_folio, &cached, clear_bits, page_ops); btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL); } @@ -1651,7 +1670,7 @@ static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_ } static bool run_delalloc_compressed(struct btrfs_inode *inode, - struct page *locked_page, u64 start, + struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc) { struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -1691,15 +1710,16 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, INIT_LIST_HEAD(&async_chunk[i].extents); /* - * The locked_page comes all the way from writepage and its - * the original page we were actually given. As we spread + * The locked_folio comes all the way from writepage and its + * the original folio we were actually given. As we spread * this large delalloc region across multiple async_chunk - * structs, only the first struct needs a pointer to locked_page + * structs, only the first struct needs a pointer to + * locked_folio. * * This way we don't need racey decisions about who is supposed * to unlock it. */ - if (locked_page) { + if (locked_folio) { /* * Depending on the compressibility, the pages might or * might not go through async. We want all of them to @@ -1709,12 +1729,12 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, * need full accuracy. Just account the whole thing * against the first page. */ - wbc_account_cgroup_owner(wbc, locked_page, + wbc_account_cgroup_owner(wbc, &locked_folio->page, cur_end - start); - async_chunk[i].locked_page = locked_page; - locked_page = NULL; + async_chunk[i].locked_folio = locked_folio; + locked_folio = NULL; } else { - async_chunk[i].locked_page = NULL; + async_chunk[i].locked_folio = NULL; } if (blkcg_css != blkcg_root_css) { @@ -1743,7 +1763,7 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, * covered by the range. */ static noinline int run_delalloc_cow(struct btrfs_inode *inode, - struct page *locked_page, u64 start, + struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc, bool pages_dirty) { @@ -1751,20 +1771,21 @@ static noinline int run_delalloc_cow(struct btrfs_inode *inode, int ret; while (start <= end) { - ret = cow_file_range(inode, locked_page, start, end, &done_offset, - true, false); + ret = cow_file_range(inode, locked_folio, start, end, + &done_offset, true, false); if (ret) return ret; - extent_write_locked_range(&inode->vfs_inode, locked_page, start, - done_offset, wbc, pages_dirty); + extent_write_locked_range(&inode->vfs_inode, locked_folio, + start, done_offset, wbc, pages_dirty); start = done_offset + 1; } return 1; } -static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, - const u64 start, const u64 end) +static int fallback_to_cow(struct btrfs_inode *inode, + struct folio *locked_folio, const u64 start, + const u64 end) { const bool is_space_ino = btrfs_is_free_space_inode(inode); const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root); @@ -1833,7 +1854,8 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, * is written out and unlocked directly and a normal NOCOW extent * doesn't work. */ - ret = cow_file_range(inode, locked_page, start, end, NULL, false, true); + ret = cow_file_range(inode, locked_folio, start, end, NULL, false, + true); ASSERT(ret != 1); return ret; } @@ -1987,7 +2009,7 @@ static int can_nocow_file_extent(struct btrfs_path *path, * blocks on disk */ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, - struct page *locked_page, + struct folio *locked_folio, const u64 start, const u64 end) { struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -2150,8 +2172,8 @@ must_cow: * NOCOW, following one which needs to be COW'ed */ if (cow_start != (u64)-1) { - ret = fallback_to_cow(inode, locked_page, - cow_start, found_key.offset - 1); + ret = fallback_to_cow(inode, locked_folio, cow_start, + found_key.offset - 1); cow_start = (u64)-1; if (ret) { btrfs_dec_nocow_writers(nocow_bg); @@ -2206,7 +2228,7 @@ must_cow: btrfs_put_ordered_extent(ordered); extent_clear_unlock_delalloc(inode, cur_offset, nocow_end, - locked_page, &cached_state, + locked_folio, &cached_state, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_CLEAR_DATA_RESV, PAGE_UNLOCK | PAGE_SET_ORDERED); @@ -2228,7 +2250,7 @@ must_cow: if (cow_start != (u64)-1) { cur_offset = end; - ret = fallback_to_cow(inode, locked_page, cow_start, end); + ret = fallback_to_cow(inode, locked_folio, cow_start, end); cow_start = (u64)-1; if (ret) goto error; @@ -2255,7 +2277,7 @@ error: lock_extent(&inode->io_tree, cur_offset, end, &cached); extent_clear_unlock_delalloc(inode, cur_offset, end, - locked_page, &cached, + locked_folio, &cached, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | @@ -2282,39 +2304,39 @@ static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end) * Function to process delayed allocation (create CoW) for ranges which are * being touched for the first time. */ -int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, +int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc) { const bool zoned = btrfs_is_zoned(inode->root->fs_info); int ret; /* - * The range must cover part of the @locked_page, or a return of 1 + * The range must cover part of the @locked_folio, or a return of 1 * can confuse the caller. */ - ASSERT(!(end <= page_offset(locked_page) || - start >= page_offset(locked_page) + PAGE_SIZE)); + ASSERT(!(end <= folio_pos(locked_folio) || + start >= folio_pos(locked_folio) + folio_size(locked_folio))); if (should_nocow(inode, start, end)) { - ret = run_delalloc_nocow(inode, locked_page, start, end); + ret = run_delalloc_nocow(inode, locked_folio, start, end); goto out; } if (btrfs_inode_can_compress(inode) && inode_need_compress(inode, start, end) && - run_delalloc_compressed(inode, locked_page, start, end, wbc)) + run_delalloc_compressed(inode, locked_folio, start, end, wbc)) return 1; if (zoned) - ret = run_delalloc_cow(inode, locked_page, start, end, wbc, + ret = run_delalloc_cow(inode, locked_folio, start, end, wbc, true); else - ret = cow_file_range(inode, locked_page, start, end, NULL, + ret = cow_file_range(inode, locked_folio, start, end, NULL, false, false); out: if (ret < 0) - btrfs_cleanup_ordered_extents(inode, locked_page, start, + btrfs_cleanup_ordered_extents(inode, locked_folio, start, end - start + 1); return ret; } @@ -2690,7 +2712,7 @@ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, /* see btrfs_writepage_start_hook for details on why this is required */ struct btrfs_writepage_fixup { - struct page *page; + struct folio *folio; struct btrfs_inode *inode; struct btrfs_work work; }; @@ -2702,50 +2724,51 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; - struct page *page = fixup->page; + struct folio *folio = fixup->folio; struct btrfs_inode *inode = fixup->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; - u64 page_start = page_offset(page); - u64 page_end = page_offset(page) + PAGE_SIZE - 1; + u64 page_start = folio_pos(folio); + u64 page_end = folio_pos(folio) + folio_size(folio) - 1; int ret = 0; bool free_delalloc_space = true; /* * This is similar to page_mkwrite, we need to reserve the space before - * we take the page lock. + * we take the folio lock. */ ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, - PAGE_SIZE); + folio_size(folio)); again: - lock_page(page); + folio_lock(folio); /* - * Before we queued this fixup, we took a reference on the page. - * page->mapping may go NULL, but it shouldn't be moved to a different + * Before we queued this fixup, we took a reference on the folio. + * folio->mapping may go NULL, but it shouldn't be moved to a different * address space. */ - if (!page->mapping || !PageDirty(page) || !PageChecked(page)) { + if (!folio->mapping || !folio_test_dirty(folio) || + !folio_test_checked(folio)) { /* * Unfortunately this is a little tricky, either * - * 1) We got here and our page had already been dealt with and + * 1) We got here and our folio had already been dealt with and * we reserved our space, thus ret == 0, so we need to just * drop our space reservation and bail. This can happen the * first time we come into the fixup worker, or could happen * while waiting for the ordered extent. - * 2) Our page was already dealt with, but we happened to get an + * 2) Our folio was already dealt with, but we happened to get an * ENOSPC above from the btrfs_delalloc_reserve_space. In * this case we obviously don't have anything to release, but - * because the page was already dealt with we don't want to - * mark the page with an error, so make sure we're resetting + * because the folio was already dealt with we don't want to + * mark the folio with an error, so make sure we're resetting * ret to 0. This is why we have this check _before_ the ret * check, because we do not want to have a surprise ENOSPC - * when the page was already properly dealt with. + * when the folio was already properly dealt with. */ if (!ret) { - btrfs_delalloc_release_extents(inode, PAGE_SIZE); + btrfs_delalloc_release_extents(inode, folio_size(folio)); btrfs_delalloc_release_space(inode, data_reserved, - page_start, PAGE_SIZE, + page_start, folio_size(folio), true); } ret = 0; @@ -2753,7 +2776,7 @@ again: } /* - * We can't mess with the page state unless it is locked, so now that + * We can't mess with the folio state unless it is locked, so now that * it is locked bail if we failed to make our space reservation. */ if (ret) @@ -2762,14 +2785,14 @@ again: lock_extent(&inode->io_tree, page_start, page_end, &cached_state); /* already ordered? We're done */ - if (PageOrdered(page)) + if (folio_test_ordered(folio)) goto out_reserved; ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); if (ordered) { unlock_extent(&inode->io_tree, page_start, page_end, &cached_state); - unlock_page(page); + folio_unlock(folio); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; @@ -2787,7 +2810,7 @@ again: * * The page was dirty when we started, nothing should have cleaned it. */ - BUG_ON(!PageDirty(page)); + BUG_ON(!folio_test_dirty(folio)); free_delalloc_space = false; out_reserved: btrfs_delalloc_release_extents(inode, PAGE_SIZE); @@ -2801,14 +2824,14 @@ out_page: * We hit ENOSPC or other errors. Update the mapping and page * to reflect the errors and clean the page. */ - mapping_set_error(page->mapping, ret); - btrfs_mark_ordered_io_finished(inode, page, page_start, - PAGE_SIZE, !ret); - clear_page_dirty_for_io(page); + mapping_set_error(folio->mapping, ret); + btrfs_mark_ordered_io_finished(inode, folio, page_start, + folio_size(folio), !ret); + folio_clear_dirty_for_io(folio); } - btrfs_folio_clear_checked(fs_info, page_folio(page), page_start, PAGE_SIZE); - unlock_page(page); - put_page(page); + btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE); + folio_unlock(folio); + folio_put(folio); kfree(fixup); extent_changeset_free(data_reserved); /* @@ -2821,33 +2844,34 @@ out_page: /* * There are a few paths in the higher layers of the kernel that directly - * set the page dirty bit without asking the filesystem if it is a + * set the folio dirty bit without asking the filesystem if it is a * good idea. This causes problems because we want to make sure COW * properly happens and the data=ordered rules are followed. * * In our case any range that doesn't have the ORDERED bit set * hasn't been properly setup for IO. We kick off an async process * to fix it up. The async helper will wait for ordered extents, set - * the delalloc bit and make it safe to write the page. + * the delalloc bit and make it safe to write the folio. */ -int btrfs_writepage_cow_fixup(struct page *page) +int btrfs_writepage_cow_fixup(struct folio *folio) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_writepage_fixup *fixup; - /* This page has ordered extent covering it already */ - if (PageOrdered(page)) + /* This folio has ordered extent covering it already */ + if (folio_test_ordered(folio)) return 0; /* - * PageChecked is set below when we create a fixup worker for this page, - * don't try to create another one if we're already PageChecked() + * folio_checked is set below when we create a fixup worker for this + * folio, don't try to create another one if we're already + * folio_test_checked. * - * The extent_io writepage code will redirty the page if we send back + * The extent_io writepage code will redirty the foio if we send back * EAGAIN. */ - if (PageChecked(page)) + if (folio_test_checked(folio)) return -EAGAIN; fixup = kzalloc(sizeof(*fixup), GFP_NOFS); @@ -2857,14 +2881,14 @@ int btrfs_writepage_cow_fixup(struct page *page) /* * We are already holding a reference to this inode from * write_cache_pages. We need to hold it because the space reservation - * takes place outside of the page lock, and we can't trust - * page->mapping outside of the page lock. + * takes place outside of the folio lock, and we can't trust + * page->mapping outside of the folio lock. */ ihold(inode); - btrfs_folio_set_checked(fs_info, page_folio(page), page_offset(page), PAGE_SIZE); - get_page(page); + btrfs_folio_set_checked(fs_info, folio, folio_pos(folio), folio_size(folio)); + folio_get(folio); btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL); - fixup->page = page; + fixup->folio = folio; fixup->inode = BTRFS_I(inode); btrfs_queue_work(fs_info->fixup_workers, &fixup->work); @@ -6700,7 +6724,7 @@ static int btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, } static noinline int uncompress_inline(struct btrfs_path *path, - struct page *page, + struct folio *folio, struct btrfs_file_extent_item *item) { int ret; @@ -6722,7 +6746,8 @@ static noinline int uncompress_inline(struct btrfs_path *path, read_extent_buffer(leaf, tmp, ptr, inline_size); max_size = min_t(unsigned long, PAGE_SIZE, max_size); - ret = btrfs_decompress(compress_type, tmp, page, 0, inline_size, max_size); + ret = btrfs_decompress(compress_type, tmp, folio, 0, inline_size, + max_size); /* * decompression code contains a memset to fill in any space between the end @@ -6733,36 +6758,36 @@ static noinline int uncompress_inline(struct btrfs_path *path, */ if (max_size < PAGE_SIZE) - memzero_page(page, max_size, PAGE_SIZE - max_size); + folio_zero_range(folio, max_size, PAGE_SIZE - max_size); kfree(tmp); return ret; } static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path, - struct page *page) + struct folio *folio) { struct btrfs_file_extent_item *fi; void *kaddr; size_t copy_size; - if (!page || PageUptodate(page)) + if (!folio || folio_test_uptodate(folio)) return 0; - ASSERT(page_offset(page) == 0); + ASSERT(folio_pos(folio) == 0); fi = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_file_extent_item); if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE) - return uncompress_inline(path, page, fi); + return uncompress_inline(path, folio, fi); copy_size = min_t(u64, PAGE_SIZE, btrfs_file_extent_ram_bytes(path->nodes[0], fi)); - kaddr = kmap_local_page(page); + kaddr = kmap_local_folio(folio, 0); read_extent_buffer(path->nodes[0], kaddr, btrfs_file_extent_inline_start(fi), copy_size); kunmap_local(kaddr); if (copy_size < PAGE_SIZE) - memzero_page(page, copy_size, PAGE_SIZE - copy_size); + folio_zero_range(folio, copy_size, PAGE_SIZE - copy_size); return 0; } @@ -6784,7 +6809,7 @@ static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path * Return: ERR_PTR on error, non-NULL extent_map on success. */ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, - struct page *page, u64 start, u64 len) + struct folio *folio, u64 start, u64 len) { struct btrfs_fs_info *fs_info = inode->root->fs_info; int ret = 0; @@ -6807,7 +6832,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, if (em) { if (em->start > start || em->start + em->len <= start) free_extent_map(em); - else if (em->disk_bytenr == EXTENT_MAP_INLINE && page) + else if (em->disk_bytenr == EXTENT_MAP_INLINE && folio) free_extent_map(em); else goto out; @@ -6937,7 +6962,7 @@ next: ASSERT(em->disk_bytenr == EXTENT_MAP_INLINE); ASSERT(em->len == fs_info->sectorsize); - ret = read_inline_extent(inode, path, page); + ret = read_inline_extent(inode, path, folio); if (ret < 0) goto out; goto insert; @@ -7179,13 +7204,12 @@ struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start, * for subpage spinlock. So this function is to spin and wait for subpage * spinlock. */ -static void wait_subpage_spinlock(struct page *page) +static void wait_subpage_spinlock(struct folio *folio) { - struct btrfs_fs_info *fs_info = page_to_fs_info(page); - struct folio *folio = page_folio(page); + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); struct btrfs_subpage *subpage; - if (!btrfs_is_subpage(fs_info, page->mapping)) + if (!btrfs_is_subpage(fs_info, folio->mapping)) return; ASSERT(folio_test_private(folio) && folio_get_private(folio)); @@ -7214,9 +7238,9 @@ static int btrfs_launder_folio(struct folio *folio) static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) { - if (try_release_extent_mapping(&folio->page, gfp_flags)) { - wait_subpage_spinlock(&folio->page); - clear_page_extent_mapped(&folio->page); + if (try_release_extent_mapping(folio, gfp_flags)) { + wait_subpage_spinlock(folio); + clear_folio_extent_mapped(folio); return true; } return false; @@ -7276,7 +7300,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, * do double ordered extent accounting on the same folio. */ folio_wait_writeback(folio); - wait_subpage_spinlock(&folio->page); + wait_subpage_spinlock(folio); /* * For subpage case, we have call sites like @@ -7414,7 +7438,7 @@ next: btrfs_folio_clear_checked(fs_info, folio, folio_pos(folio), folio_size(folio)); if (!inode_evicting) __btrfs_release_folio(folio, GFP_NOFS); - clear_page_extent_mapped(&folio->page); + clear_folio_extent_mapped(folio); } static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) @@ -8951,19 +8975,19 @@ void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end) struct btrfs_fs_info *fs_info = inode->root->fs_info; unsigned long index = start >> PAGE_SHIFT; unsigned long end_index = end >> PAGE_SHIFT; - struct page *page; + struct folio *folio; u32 len; ASSERT(end + 1 - start <= U32_MAX); len = end + 1 - start; while (index <= end_index) { - page = find_get_page(inode->vfs_inode.i_mapping, index); - ASSERT(page); /* Pages should be in the extent_io_tree */ + folio = __filemap_get_folio(inode->vfs_inode.i_mapping, index, 0, 0); + ASSERT(!IS_ERR(folio)); /* folios should be in the extent_io_tree */ /* This is for data, which doesn't yet support larger folio. */ - ASSERT(folio_order(page_folio(page)) == 0); - btrfs_folio_set_writeback(fs_info, page_folio(page), start, len); - put_page(page); + ASSERT(folio_order(folio) == 0); + btrfs_folio_set_writeback(fs_info, folio, start, len); + folio_put(folio); index++; } } @@ -9128,7 +9152,7 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) { atomic_inc(&priv.pending); - btrfs_submit_bio(bbio, 0); + btrfs_submit_bbio(bbio, 0); bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, btrfs_encoded_read_endio, &priv); @@ -9143,7 +9167,7 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, } while (disk_io_size); atomic_inc(&priv.pending); - btrfs_submit_bio(bbio, 0); + btrfs_submit_bbio(bbio, 0); if (atomic_dec_return(&priv.pending)) io_wait_event(priv.wait, !atomic_read(&priv.pending)); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e0a664b8a46a..8537eb9b5531 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -543,13 +543,11 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info, range.minlen = max(range.minlen, minlen); ret = btrfs_trim_fs(fs_info, &range); - if (ret < 0) - return ret; if (copy_to_user(arg, &range, sizeof(range))) return -EFAULT; - return 0; + return ret; } int __pure btrfs_is_empty_uuid(const u8 *uuid) @@ -4765,11 +4763,10 @@ long btrfs_ioctl(struct file *file, unsigned int return ret; ret = btrfs_sync_fs(inode->i_sb, 1); /* - * The transaction thread may want to do more work, - * namely it pokes the cleaner kthread that will start - * processing uncleaned subvols. + * There may be work for the cleaner kthread to do (subvolume + * deletion, delayed iputs, defrag inodes, etc), so wake it up. */ - wake_up_process(fs_info->transaction_kthread); + wake_up_process(fs_info->cleaner_kthread); return ret; } case BTRFS_IOC_START_SYNC: diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 1e2a68b8f62d..72856f6775f7 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -438,11 +438,11 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) } int lzo_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long dest_pgoff, size_t srclen, + struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); - struct btrfs_fs_info *fs_info = page_to_fs_info(dest_page); + struct btrfs_fs_info *fs_info = folio_to_fs_info(dest_folio); const u32 sectorsize = fs_info->sectorsize; size_t in_len; size_t out_len; @@ -467,22 +467,22 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in, out_len = sectorsize; ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len); if (unlikely(ret != LZO_E_OK)) { - struct btrfs_inode *inode = BTRFS_I(dest_page->mapping->host); + struct btrfs_inode *inode = folio_to_inode(dest_folio); btrfs_err(fs_info, "lzo decompression failed, error %d root %llu inode %llu offset %llu", ret, btrfs_root_id(inode->root), btrfs_ino(inode), - page_offset(dest_page)); + folio_pos(dest_folio)); ret = -EIO; goto out; } ASSERT(out_len <= sectorsize); - memcpy_to_page(dest_page, dest_pgoff, workspace->buf, out_len); + memcpy_to_folio(dest_folio, dest_pgoff, workspace->buf, out_len); /* Early end, considered as an error. */ if (unlikely(out_len < destlen)) { ret = -EIO; - memzero_page(dest_page, dest_pgoff + out_len, destlen - out_len); + folio_zero_range(dest_folio, dest_pgoff + out_len, destlen - out_len); } out: return ret; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 82a68394a89c..2104d60c2161 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -332,7 +332,7 @@ static void finish_ordered_fn(struct btrfs_work *work) } static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered, - struct page *page, u64 file_offset, + struct folio *folio, u64 file_offset, u64 len, bool uptodate) { struct btrfs_inode *inode = ordered->inode; @@ -340,10 +340,10 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered, lockdep_assert_held(&inode->ordered_tree_lock); - if (page) { - ASSERT(page->mapping); - ASSERT(page_offset(page) <= file_offset); - ASSERT(file_offset + len <= page_offset(page) + PAGE_SIZE); + if (folio) { + ASSERT(folio->mapping); + ASSERT(folio_pos(folio) <= file_offset); + ASSERT(file_offset + len <= folio_pos(folio) + folio_size(folio)); /* * Ordered (Private2) bit indicates whether we still have @@ -351,10 +351,9 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered, * * If there's no such bit, we need to skip to next range. */ - if (!btrfs_folio_test_ordered(fs_info, page_folio(page), - file_offset, len)) + if (!btrfs_folio_test_ordered(fs_info, folio, file_offset, len)) return false; - btrfs_folio_clear_ordered(fs_info, page_folio(page), file_offset, len); + btrfs_folio_clear_ordered(fs_info, folio, file_offset, len); } /* Now we're fine to update the accounting. */ @@ -398,7 +397,7 @@ static void btrfs_queue_ordered_fn(struct btrfs_ordered_extent *ordered) } void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, - struct page *page, u64 file_offset, u64 len, + struct folio *folio, u64 file_offset, u64 len, bool uptodate) { struct btrfs_inode *inode = ordered->inode; @@ -408,7 +407,8 @@ void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, trace_btrfs_finish_ordered_extent(inode, file_offset, len, uptodate); spin_lock_irqsave(&inode->ordered_tree_lock, flags); - ret = can_finish_ordered_extent(ordered, page, file_offset, len, uptodate); + ret = can_finish_ordered_extent(ordered, folio, file_offset, len, + uptodate); spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); /* @@ -449,8 +449,8 @@ void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, /* * Mark all ordered extents io inside the specified range finished. * - * @page: The involved page for the operation. - * For uncompressed buffered IO, the page status also needs to be + * @folio: The involved folio for the operation. + * For uncompressed buffered IO, the folio status also needs to be * updated to indicate whether the pending ordered io is finished. * Can be NULL for direct IO and compressed write. * For these cases, callers are ensured they won't execute the @@ -460,7 +460,7 @@ void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, * extent(s) covering it. */ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, - struct page *page, u64 file_offset, + struct folio *folio, u64 file_offset, u64 num_bytes, bool uptodate) { struct rb_node *node; @@ -524,7 +524,7 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, ASSERT(end + 1 - cur < U32_MAX); len = end + 1 - cur; - if (can_finish_ordered_extent(entry, page, cur, len, uptodate)) { + if (can_finish_ordered_extent(entry, folio, cur, len, uptodate)) { spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); btrfs_queue_ordered_fn(entry); spin_lock_irqsave(&inode->ordered_tree_lock, flags); @@ -1015,7 +1015,7 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, { struct rb_node *n; - ASSERT(inode_is_locked(&inode->vfs_inode)); + btrfs_assert_inode_locked(inode); spin_lock_irq(&inode->ordered_tree_lock); for (n = rb_first(&inode->ordered_tree); n; n = rb_next(n)) { diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 51b9e81726e2..4e152736d06c 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -163,11 +163,11 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, struct btrfs_ordered_extent *entry); void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, - struct page *page, u64 file_offset, u64 len, + struct folio *folio, u64 file_offset, u64 len, bool uptodate); void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, - struct page *page, u64 file_offset, - u64 num_bytes, bool uptodate); + struct folio *folio, u64 file_offset, + u64 num_bytes, bool uptodate); bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size); diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c index 6195a2215b8f..9f3ad124104f 100644 --- a/fs/btrfs/orphan.c +++ b/fs/btrfs/orphan.c @@ -9,9 +9,8 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; - int ret = 0; key.objectid = BTRFS_ORPHAN_OBJECTID; key.type = BTRFS_ORPHAN_ITEM_KEY; @@ -21,16 +20,13 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - ret = btrfs_insert_empty_item(trans, root, path, &key, 0); - - btrfs_free_path(path); - return ret; + return btrfs_insert_empty_item(trans, root, path, &key, 0); } int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; int ret = 0; @@ -44,15 +40,9 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) - goto out; - if (ret) { /* JDM: Really? */ - ret = -ENOENT; - goto out; - } + return ret; + if (ret) + return -ENOENT; - ret = btrfs_del_item(trans, root, path); - -out: - btrfs_free_path(path); - return ret; + return btrfs_del_item(trans, root, path); } diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index feb8f9f2f358..c297909f1506 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1998,16 +1998,14 @@ out: * * Return 0 for success insert * Return >0 for existing record, caller can free @record safely. - * Error is not possible + * Return <0 for insertion failure, caller can free @record safely. */ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_qgroup_extent_record *record) { - struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node; - struct rb_node *parent_node = NULL; - struct btrfs_qgroup_extent_record *entry; - u64 bytenr = record->bytenr; + struct btrfs_qgroup_extent_record *existing, *ret; + unsigned long bytenr = record->bytenr; if (!btrfs_qgroup_full_accounting(fs_info)) return 1; @@ -2015,26 +2013,24 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, lockdep_assert_held(&delayed_refs->lock); trace_btrfs_qgroup_trace_extent(fs_info, record); - while (*p) { - parent_node = *p; - entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record, - node); - if (bytenr < entry->bytenr) { - p = &(*p)->rb_left; - } else if (bytenr > entry->bytenr) { - p = &(*p)->rb_right; - } else { - if (record->data_rsv && !entry->data_rsv) { - entry->data_rsv = record->data_rsv; - entry->data_rsv_refroot = - record->data_rsv_refroot; - } - return 1; + xa_lock(&delayed_refs->dirty_extents); + existing = xa_load(&delayed_refs->dirty_extents, bytenr); + if (existing) { + if (record->data_rsv && !existing->data_rsv) { + existing->data_rsv = record->data_rsv; + existing->data_rsv_refroot = record->data_rsv_refroot; } + xa_unlock(&delayed_refs->dirty_extents); + return 1; + } + + ret = __xa_store(&delayed_refs->dirty_extents, record->bytenr, record, GFP_ATOMIC); + xa_unlock(&delayed_refs->dirty_extents); + if (xa_is_err(ret)) { + qgroup_mark_inconsistent(fs_info); + return xa_err(ret); } - rb_link_node(&record->node, parent_node, p); - rb_insert_color(&record->node, &delayed_refs->dirty_extent_root); return 0; } @@ -2141,6 +2137,11 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, if (!record) return -ENOMEM; + if (xa_reserve(&trans->transaction->delayed_refs.dirty_extents, bytenr, GFP_NOFS)) { + kfree(record); + return -ENOMEM; + } + delayed_refs = &trans->transaction->delayed_refs; record->bytenr = bytenr; record->num_bytes = num_bytes; @@ -2149,7 +2150,9 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, spin_lock(&delayed_refs->lock); ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record); spin_unlock(&delayed_refs->lock); - if (ret > 0) { + if (ret) { + /* Clean up if insertion fails or item exists. */ + xa_release(&delayed_refs->dirty_extents, record->bytenr); kfree(record); return 0; } @@ -3018,7 +3021,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) struct btrfs_qgroup_extent_record *record; struct btrfs_delayed_ref_root *delayed_refs; struct ulist *new_roots = NULL; - struct rb_node *node; + unsigned long index; u64 num_dirty_extents = 0; u64 qgroup_to_skip; int ret = 0; @@ -3028,10 +3031,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) delayed_refs = &trans->transaction->delayed_refs; qgroup_to_skip = delayed_refs->qgroup_to_skip; - while ((node = rb_first(&delayed_refs->dirty_extent_root))) { - record = rb_entry(node, struct btrfs_qgroup_extent_record, - node); - + xa_for_each(&delayed_refs->dirty_extents, index, record) { num_dirty_extents++; trace_btrfs_qgroup_account_extents(fs_info, record); @@ -3097,7 +3097,7 @@ cleanup: ulist_free(record->old_roots); ulist_free(new_roots); new_roots = NULL; - rb_erase(node, &delayed_refs->dirty_extent_root); + xa_erase(&delayed_refs->dirty_extents, index); kfree(record); } @@ -4874,15 +4874,13 @@ out: void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans) { struct btrfs_qgroup_extent_record *entry; - struct btrfs_qgroup_extent_record *next; - struct rb_root *root; + unsigned long index; - root = &trans->delayed_refs.dirty_extent_root; - rbtree_postorder_for_each_entry_safe(entry, next, root, node) { + xa_for_each(&trans->delayed_refs.dirty_extents, index, entry) { ulist_free(entry->old_roots); kfree(entry); } - *root = RB_ROOT; + xa_destroy(&trans->delayed_refs.dirty_extents); } void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes) diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index deb479d176a9..98adf4ec7b01 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -125,7 +125,6 @@ struct btrfs_inode; * Record a dirty extent, and info qgroup to update quota on it */ struct btrfs_qgroup_extent_record { - struct rb_node node; u64 bytenr; u64 num_bytes; diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index e6f7a234b8f6..4c859b550f6c 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -66,6 +66,11 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le if (ret) break; + start += key.offset; + length -= key.offset; + if (length == 0) + break; + btrfs_release_path(path); } @@ -73,6 +78,36 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le return ret; } +static int update_raid_extent_item(struct btrfs_trans_handle *trans, + struct btrfs_key *key, + struct btrfs_stripe_extent *stripe_extent, + const size_t item_size) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + int ret; + int slot; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(trans, trans->fs_info->stripe_root, key, path, + 0, 1); + if (ret) + return (ret == 1 ? ret : -EINVAL); + + leaf = path->nodes[0]; + slot = path->slots[0]; + + write_extent_buffer(leaf, stripe_extent, btrfs_item_ptr_offset(leaf, slot), + item_size); + btrfs_mark_buffer_dirty(trans, leaf); + btrfs_free_path(path); + + return ret; +} + static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, struct btrfs_io_context *bioc) { @@ -112,6 +147,9 @@ static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, ret = btrfs_insert_item(trans, stripe_root, &stripe_key, stripe_extent, item_size); + if (ret == -EEXIST) + ret = update_raid_extent_item(trans, &stripe_key, stripe_extent, + item_size); if (ret) btrfs_abort_transaction(trans, ret); @@ -172,7 +210,7 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, if (!path) return -ENOMEM; - if (stripe->is_scrub) { + if (stripe->rst_search_commit_root) { path->skip_locking = 1; path->search_commit_root = 1; } @@ -245,10 +283,8 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, out: if (ret > 0) ret = -ENOENT; - if (ret && ret != -EIO && !stripe->is_scrub) { - if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) - btrfs_print_tree(leaf, 1); - btrfs_err(fs_info, + if (ret && ret != -EIO && !stripe->rst_search_commit_root) { + btrfs_debug(fs_info, "cannot find raid-stripe for logical [%llu, %llu] devid %llu, profile %s", logical, logical + *length, stripe->dev->devid, btrfs_bg_type_to_raid_name(map_type)); diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index df6b93b927cd..f0824c948cb7 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -66,7 +66,7 @@ static int copy_inline_to_page(struct btrfs_inode *inode, const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0); char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0); struct extent_changeset *data_reserved = NULL; - struct page *page = NULL; + struct folio *folio = NULL; struct address_space *mapping = inode->vfs_inode.i_mapping; int ret; @@ -83,14 +83,15 @@ static int copy_inline_to_page(struct btrfs_inode *inode, if (ret) goto out; - page = find_or_create_page(mapping, file_offset >> PAGE_SHIFT, - btrfs_alloc_write_mask(mapping)); - if (!page) { + folio = __filemap_get_folio(mapping, file_offset >> PAGE_SHIFT, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + btrfs_alloc_write_mask(mapping)); + if (IS_ERR(folio)) { ret = -ENOMEM; goto out_unlock; } - ret = set_page_extent_mapped(page); + ret = set_folio_extent_mapped(folio); if (ret < 0) goto out_unlock; @@ -115,15 +116,15 @@ static int copy_inline_to_page(struct btrfs_inode *inode, set_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags); if (comp_type == BTRFS_COMPRESS_NONE) { - memcpy_to_page(page, offset_in_page(file_offset), data_start, - datal); + memcpy_to_folio(folio, offset_in_folio(folio, file_offset), data_start, + datal); } else { - ret = btrfs_decompress(comp_type, data_start, page, - offset_in_page(file_offset), + ret = btrfs_decompress(comp_type, data_start, folio, + offset_in_folio(folio, file_offset), inline_size, datal); if (ret) goto out_unlock; - flush_dcache_page(page); + flush_dcache_folio(folio); } /* @@ -139,15 +140,15 @@ static int copy_inline_to_page(struct btrfs_inode *inode, * So what's in the range [500, 4095] corresponds to zeroes. */ if (datal < block_size) - memzero_page(page, datal, block_size - datal); + folio_zero_range(folio, datal, block_size - datal); - btrfs_folio_set_uptodate(fs_info, page_folio(page), file_offset, block_size); - btrfs_folio_clear_checked(fs_info, page_folio(page), file_offset, block_size); - btrfs_folio_set_dirty(fs_info, page_folio(page), file_offset, block_size); + btrfs_folio_set_uptodate(fs_info, folio, file_offset, block_size); + btrfs_folio_clear_checked(fs_info, folio, file_offset, block_size); + btrfs_folio_set_dirty(fs_info, folio, file_offset, block_size); out_unlock: - if (page) { - unlock_page(page); - put_page(page); + if (!IS_ERR(folio)) { + folio_unlock(folio); + folio_put(folio); } if (ret) btrfs_delalloc_release_space(inode, data_reserved, file_offset, diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 0533d0f82dc9..ea4ed85919ec 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -36,6 +36,7 @@ #include "relocation.h" #include "super.h" #include "tree-checker.h" +#include "raid-stripe-tree.h" /* * Relocation overview @@ -2965,21 +2966,34 @@ static int relocate_one_folio(struct reloc_control *rc, u64 folio_end; u64 cur; int ret; + const bool use_rst = btrfs_need_stripe_tree_update(fs_info, rc->block_group->flags); ASSERT(index <= last_index); folio = filemap_lock_folio(inode->i_mapping, index); if (IS_ERR(folio)) { - page_cache_sync_readahead(inode->i_mapping, ra, NULL, - index, last_index + 1 - index); + + /* + * On relocation we're doing readahead on the relocation inode, + * but if the filesystem is backed by a RAID stripe tree we can + * get ENOENT (e.g. due to preallocated extents not being + * mapped in the RST) from the lookup. + * + * But readahead doesn't handle the error and submits invalid + * reads to the device, causing a assertion failures. + */ + if (!use_rst) + page_cache_sync_readahead(inode->i_mapping, ra, NULL, + index, last_index + 1 - index); folio = __filemap_get_folio(inode->i_mapping, index, - FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask); + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + mask); if (IS_ERR(folio)) return PTR_ERR(folio); } WARN_ON(folio_order(folio)); - if (folio_test_readahead(folio)) + if (folio_test_readahead(folio) && !use_rst) page_cache_async_readahead(inode->i_mapping, ra, NULL, folio, last_index + 1 - index); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 0de9162ff481..3a3427428074 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -838,7 +838,7 @@ static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe, bbio->bio.bi_iter.bi_size >= blocksize)) { ASSERT(bbio->bio.bi_iter.bi_size); atomic_inc(&stripe->pending_io); - btrfs_submit_bio(bbio, mirror); + btrfs_submit_bbio(bbio, mirror); if (wait) wait_scrub_stripe_io(stripe); bbio = NULL; @@ -857,7 +857,7 @@ static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe, if (bbio) { ASSERT(bbio->bio.bi_iter.bi_size); atomic_inc(&stripe->pending_io); - btrfs_submit_bio(bbio, mirror); + btrfs_submit_bbio(bbio, mirror); if (wait) wait_scrub_stripe_io(stripe); } @@ -1683,7 +1683,7 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, bbio->bio.bi_iter.bi_size >= stripe_len)) { ASSERT(bbio->bio.bi_iter.bi_size); atomic_inc(&stripe->pending_io); - btrfs_submit_bio(bbio, mirror); + btrfs_submit_bbio(bbio, mirror); bbio = NULL; } @@ -1694,7 +1694,7 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, (i << fs_info->sectorsize_bits); int err; - io_stripe.is_scrub = true; + io_stripe.rst_search_commit_root = true; stripe_len = (nr_sectors - i) << fs_info->sectorsize_bits; /* * For RST cases, we need to manually split the bbio to @@ -1720,7 +1720,7 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, if (bbio) { ASSERT(bbio->bio.bi_iter.bi_size); atomic_inc(&stripe->pending_io); - btrfs_submit_bio(bbio, mirror); + btrfs_submit_bbio(bbio, mirror); } if (atomic_dec_and_test(&stripe->pending_io)) { @@ -1776,7 +1776,7 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, mirror = calc_next_mirror(mirror, num_copies); } - btrfs_submit_bio(bbio, mirror); + btrfs_submit_bbio(bbio, mirror); } static bool stripe_has_metadata_error(struct scrub_stripe *stripe) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 619fa0b8b3f6..7f48ba6c1c77 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -62,7 +62,7 @@ struct fs_path { /* * Average path length does not exceed 200 bytes, we'll have * better packing in the slab and higher chance to satisfy - * a allocation later during send. + * an allocation later during send. */ char pad[256]; }; @@ -1136,7 +1136,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, /* * Start with a small buffer (1 page). If later we end up needing more * space, which can happen for xattrs on a fs with a leaf size greater - * then the page size, attempt to increase the buffer. Typically xattr + * than the page size, attempt to increase the buffer. Typically xattr * values are small. */ buf_len = PATH_MAX; diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index c691784b4660..d5a9cd8a4fd8 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -163,7 +163,7 @@ * thing with or without extra unallocated space. */ -u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info, +u64 __pure btrfs_space_info_used(const struct btrfs_space_info *s_info, bool may_use_included) { ASSERT(s_info); @@ -368,7 +368,7 @@ static u64 calc_effective_data_chunk_size(struct btrfs_fs_info *fs_info) } static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, + const struct btrfs_space_info *space_info, enum btrfs_reserve_flush_enum flush) { u64 profile; @@ -437,7 +437,7 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, } int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, u64 bytes, + const struct btrfs_space_info *space_info, u64 bytes, enum btrfs_reserve_flush_enum flush) { u64 avail; @@ -542,8 +542,8 @@ static void dump_global_block_rsv(struct btrfs_fs_info *fs_info) DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); } -static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *info) +static void __btrfs_dump_space_info(const struct btrfs_fs_info *fs_info, + const struct btrfs_space_info *info) { const char *flag_str = space_info_flag_to_str(info); lockdep_assert_held(&info->lock); @@ -844,9 +844,8 @@ static void flush_space(struct btrfs_fs_info *fs_info, return; } -static inline u64 -btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info) +static u64 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, + const struct btrfs_space_info *space_info) { u64 used; u64 avail; @@ -871,7 +870,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, } static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info) + const struct btrfs_space_info *space_info) { const u64 global_rsv_size = btrfs_block_rsv_reserved(&fs_info->global_block_rsv); u64 ordered, delalloc; @@ -1943,7 +1942,7 @@ static u64 calc_unalloc_target(struct btrfs_fs_info *fs_info) * Typically with 10 block groups as the target, the discrete values this comes * out to are 0, 10, 20, ... , 80, 90, and 99. */ -static int calc_dynamic_reclaim_threshold(struct btrfs_space_info *space_info) +static int calc_dynamic_reclaim_threshold(const struct btrfs_space_info *space_info) { struct btrfs_fs_info *fs_info = space_info->fs_info; u64 unalloc = atomic64_read(&fs_info->free_chunk_space); @@ -1962,7 +1961,7 @@ static int calc_dynamic_reclaim_threshold(struct btrfs_space_info *space_info) return calc_pct_ratio(want, target); } -int btrfs_calc_reclaim_threshold(struct btrfs_space_info *space_info) +int btrfs_calc_reclaim_threshold(const struct btrfs_space_info *space_info) { lockdep_assert_held(&space_info->lock); @@ -1985,7 +1984,7 @@ static bool is_reclaim_urgent(struct btrfs_space_info *space_info) return unalloc < data_chunk_size; } -static void do_reclaim_sweep(struct btrfs_fs_info *fs_info, +static void do_reclaim_sweep(const struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, int raid) { struct btrfs_block_group *bg; @@ -2073,7 +2072,7 @@ bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info) return ret; } -void btrfs_reclaim_sweep(struct btrfs_fs_info *fs_info) +void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info) { int raid; struct btrfs_space_info *space_info; diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 5602026c5e14..efbecc0c5258 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -217,7 +217,7 @@ struct reserve_ticket { wait_queue_head_t wait; }; -static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) +static inline bool btrfs_mixed_space_info(const struct btrfs_space_info *space_info) { return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) && (space_info->flags & BTRFS_BLOCK_GROUP_DATA)); @@ -258,7 +258,7 @@ void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info, u64 chunk_size); struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, u64 flags); -u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info, +u64 __pure btrfs_space_info_used(const struct btrfs_space_info *s_info, bool may_use_included); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, @@ -271,7 +271,7 @@ int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info); int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, u64 bytes, + const struct btrfs_space_info *space_info, u64 bytes, enum btrfs_reserve_flush_enum flush); static inline void btrfs_space_info_free_bytes_may_use( @@ -293,7 +293,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); void btrfs_space_info_update_reclaimable(struct btrfs_space_info *space_info, s64 bytes); void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool ready); bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info); -int btrfs_calc_reclaim_threshold(struct btrfs_space_info *space_info); -void btrfs_reclaim_sweep(struct btrfs_fs_info *fs_info); +int btrfs_calc_reclaim_threshold(const struct btrfs_space_info *space_info); +void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info); #endif /* BTRFS_SPACE_INFO_H */ diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 8ddd5fcbeb93..fe4d719d506b 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -64,6 +64,7 @@ * This means a slightly higher tree locking latency. */ +#if PAGE_SIZE > SZ_4K bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping) { if (fs_info->sectorsize >= PAGE_SIZE) @@ -85,37 +86,7 @@ bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space return true; return false; } - -void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize) -{ - unsigned int cur = 0; - unsigned int nr_bits; - - ASSERT(IS_ALIGNED(PAGE_SIZE, sectorsize)); - - nr_bits = PAGE_SIZE / sectorsize; - subpage_info->bitmap_nr_bits = nr_bits; - - subpage_info->uptodate_offset = cur; - cur += nr_bits; - - subpage_info->dirty_offset = cur; - cur += nr_bits; - - subpage_info->writeback_offset = cur; - cur += nr_bits; - - subpage_info->ordered_offset = cur; - cur += nr_bits; - - subpage_info->checked_offset = cur; - cur += nr_bits; - - subpage_info->locked_offset = cur; - cur += nr_bits; - - subpage_info->total_nr_bits = cur; -} +#endif int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio, enum btrfs_subpage_type type) @@ -163,7 +134,7 @@ struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, ASSERT(fs_info->sectorsize < PAGE_SIZE); real_size = struct_size(ret, bitmaps, - BITS_TO_LONGS(fs_info->subpage_info->total_nr_bits)); + BITS_TO_LONGS(btrfs_bitmap_nr_max * fs_info->sectors_per_page)); ret = kzalloc(real_size, GFP_NOFS); if (!ret) return ERR_PTR(-ENOMEM); @@ -246,7 +217,7 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, \ btrfs_subpage_assert(fs_info, folio, start, len); \ __start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \ - __start_bit += fs_info->subpage_info->name##_offset; \ + __start_bit += fs_info->sectors_per_page * btrfs_bitmap_nr_##name; \ __start_bit; \ }) @@ -351,6 +322,8 @@ static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_inf const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); const int nbits = (len >> fs_info->sectorsize_bits); unsigned long flags; + unsigned int cleared = 0; + int bit = start_bit; bool last; btrfs_subpage_assert(fs_info, folio, start, len); @@ -368,11 +341,12 @@ static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_inf return true; } - ASSERT(atomic_read(&subpage->writers) >= nbits); - /* The target range should have been locked. */ - ASSERT(bitmap_test_range_all_set(subpage->bitmaps, start_bit, nbits)); - bitmap_clear(subpage->bitmaps, start_bit, nbits); - last = atomic_sub_and_test(nbits, &subpage->writers); + for_each_set_bit_from(bit, subpage->bitmaps, start_bit + nbits) { + clear_bit(bit, subpage->bitmaps); + cleared++; + } + ASSERT(atomic_read(&subpage->writers) >= cleared); + last = atomic_sub_and_test(cleared, &subpage->writers); spin_unlock_irqrestore(&subpage->lock, flags); return last; } @@ -404,27 +378,94 @@ int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info, return 0; } +/* + * Handle different locked folios: + * + * - Non-subpage folio + * Just unlock it. + * + * - folio locked but without any subpage locked + * This happens either before writepage_delalloc() or the delalloc range is + * already handled by previous folio. + * We can simple unlock it. + * + * - folio locked with subpage range locked. + * We go through the locked sectors inside the range and clear their locked + * bitmap, reduce the writer lock number, and unlock the page if that's + * the last locked range. + */ void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { + struct btrfs_subpage *subpage = folio_get_private(folio); + + ASSERT(folio_test_locked(folio)); + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) { folio_unlock(folio); return; } + + /* + * For subpage case, there are two types of locked page. With or + * without writers number. + * + * Since we own the page lock, no one else could touch subpage::writers + * and we are safe to do several atomic operations without spinlock. + */ + if (atomic_read(&subpage->writers) == 0) { + /* No writers, locked by plain lock_page(). */ + folio_unlock(folio); + return; + } + btrfs_subpage_clamp_range(folio, &start, &len); if (btrfs_subpage_end_and_test_writer(fs_info, folio, start, len)) folio_unlock(folio); } +void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info, + struct folio *folio, unsigned long bitmap) +{ + struct btrfs_subpage *subpage = folio_get_private(folio); + const int start_bit = fs_info->sectors_per_page * btrfs_bitmap_nr_locked; + unsigned long flags; + bool last = false; + int cleared = 0; + int bit; + + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) { + folio_unlock(folio); + return; + } + + if (atomic_read(&subpage->writers) == 0) { + /* No writers, locked by plain lock_page(). */ + folio_unlock(folio); + return; + } + + spin_lock_irqsave(&subpage->lock, flags); + for_each_set_bit(bit, &bitmap, fs_info->sectors_per_page) { + if (test_and_clear_bit(bit + start_bit, subpage->bitmaps)) + cleared++; + } + ASSERT(atomic_read(&subpage->writers) >= cleared); + last = atomic_sub_and_test(cleared, &subpage->writers); + spin_unlock_irqrestore(&subpage->lock, flags); + if (last) + folio_unlock(folio); +} + #define subpage_test_bitmap_all_set(fs_info, subpage, name) \ bitmap_test_range_all_set(subpage->bitmaps, \ - fs_info->subpage_info->name##_offset, \ - fs_info->subpage_info->bitmap_nr_bits) + fs_info->sectors_per_page * btrfs_bitmap_nr_##name, \ + fs_info->sectors_per_page) #define subpage_test_bitmap_all_zero(fs_info, subpage, name) \ bitmap_test_range_all_zero(subpage->bitmaps, \ - fs_info->subpage_info->name##_offset, \ - fs_info->subpage_info->bitmap_nr_bits) + fs_info->sectors_per_page * btrfs_bitmap_nr_##name, \ + fs_info->sectors_per_page) void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) @@ -728,53 +769,6 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, spin_unlock_irqrestore(&subpage->lock, flags); } -/* - * Handle different locked pages with different page sizes: - * - * - Page locked by plain lock_page() - * It should not have any subpage::writers count. - * Can be unlocked by unlock_page(). - * This is the most common locked page for __extent_writepage() called - * inside extent_write_cache_pages(). - * Rarer cases include the @locked_page from extent_write_locked_range(). - * - * - Page locked by lock_delalloc_pages() - * There is only one caller, all pages except @locked_page for - * extent_write_locked_range(). - * In this case, we have to call subpage helper to handle the case. - */ -void btrfs_folio_unlock_writer(struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) -{ - struct btrfs_subpage *subpage; - - ASSERT(folio_test_locked(folio)); - /* For non-subpage case, we just unlock the page */ - if (!btrfs_is_subpage(fs_info, folio->mapping)) { - folio_unlock(folio); - return; - } - - ASSERT(folio_test_private(folio) && folio_get_private(folio)); - subpage = folio_get_private(folio); - - /* - * For subpage case, there are two types of locked page. With or - * without writers number. - * - * Since we own the page lock, no one else could touch subpage::writers - * and we are safe to do several atomic operations without spinlock. - */ - if (atomic_read(&subpage->writers) == 0) { - /* No writers, locked by plain lock_page() */ - folio_unlock(folio); - return; - } - - /* Have writers, use proper subpage helper to end it */ - btrfs_folio_end_writer_lock(fs_info, folio, start, len); -} - /* * This is for folio already locked by plain lock_page()/folio_lock(), which * doesn't have any subpage awareness. @@ -803,7 +797,7 @@ void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info, ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); bitmap_set(subpage->bitmaps, start_bit, nbits); ret = atomic_add_return(nbits, &subpage->writers); - ASSERT(ret <= fs_info->subpage_info->bitmap_nr_bits); + ASSERT(ret <= fs_info->sectors_per_page); spin_unlock_irqrestore(&subpage->lock, flags); } @@ -819,14 +813,13 @@ bool btrfs_subpage_find_writer_locked(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 search_start, u64 *found_start_ret, u32 *found_len_ret) { - struct btrfs_subpage_info *subpage_info = fs_info->subpage_info; struct btrfs_subpage *subpage = folio_get_private(folio); + const u32 sectors_per_page = fs_info->sectors_per_page; const unsigned int len = PAGE_SIZE - offset_in_page(search_start); const unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, locked, search_start, len); - const unsigned int locked_bitmap_start = subpage_info->locked_offset; - const unsigned int locked_bitmap_end = locked_bitmap_start + - subpage_info->bitmap_nr_bits; + const unsigned int locked_bitmap_start = sectors_per_page * btrfs_bitmap_nr_locked; + const unsigned int locked_bitmap_end = locked_bitmap_start + sectors_per_page; unsigned long flags; int first_zero; int first_set; @@ -855,59 +848,21 @@ out: return found; } -/* - * Unlike btrfs_folio_end_writer_lock() which unlocks a specified subpage range, - * this ends all writer locked ranges of a page. - * - * This is for the locked page of __extent_writepage(), as the locked page - * can contain several locked subpage ranges. - */ -void btrfs_folio_end_all_writers(const struct btrfs_fs_info *fs_info, struct folio *folio) -{ - struct btrfs_subpage *subpage = folio_get_private(folio); - u64 folio_start = folio_pos(folio); - u64 cur = folio_start; - - ASSERT(folio_test_locked(folio)); - if (!btrfs_is_subpage(fs_info, folio->mapping)) { - folio_unlock(folio); - return; - } - - /* The page has no new delalloc range locked on it. Just plain unlock. */ - if (atomic_read(&subpage->writers) == 0) { - folio_unlock(folio); - return; - } - while (cur < folio_start + PAGE_SIZE) { - u64 found_start; - u32 found_len; - bool found; - bool last; - - found = btrfs_subpage_find_writer_locked(fs_info, folio, cur, - &found_start, &found_len); - if (!found) - break; - last = btrfs_subpage_end_and_test_writer(fs_info, folio, - found_start, found_len); - if (last) { - folio_unlock(folio); - break; - } - cur = found_start + found_len; - } +#define GET_SUBPAGE_BITMAP(subpage, fs_info, name, dst) \ +{ \ + const int sectors_per_page = fs_info->sectors_per_page; \ + \ + ASSERT(sectors_per_page < BITS_PER_LONG); \ + *dst = bitmap_read(subpage->bitmaps, \ + sectors_per_page * btrfs_bitmap_nr_##name, \ + sectors_per_page); \ } -#define GET_SUBPAGE_BITMAP(subpage, subpage_info, name, dst) \ - bitmap_cut(dst, subpage->bitmaps, 0, \ - subpage_info->name##_offset, subpage_info->bitmap_nr_bits) - void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage_info *subpage_info = fs_info->subpage_info; struct btrfs_subpage *subpage; + const u32 sectors_per_page = fs_info->sectors_per_page; unsigned long uptodate_bitmap; unsigned long dirty_bitmap; unsigned long writeback_bitmap; @@ -916,25 +871,41 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, unsigned long flags; ASSERT(folio_test_private(folio) && folio_get_private(folio)); - ASSERT(subpage_info); + ASSERT(sectors_per_page > 1); subpage = folio_get_private(folio); spin_lock_irqsave(&subpage->lock, flags); - GET_SUBPAGE_BITMAP(subpage, subpage_info, uptodate, &uptodate_bitmap); - GET_SUBPAGE_BITMAP(subpage, subpage_info, dirty, &dirty_bitmap); - GET_SUBPAGE_BITMAP(subpage, subpage_info, writeback, &writeback_bitmap); - GET_SUBPAGE_BITMAP(subpage, subpage_info, ordered, &ordered_bitmap); - GET_SUBPAGE_BITMAP(subpage, subpage_info, checked, &checked_bitmap); - GET_SUBPAGE_BITMAP(subpage, subpage_info, locked, &checked_bitmap); + GET_SUBPAGE_BITMAP(subpage, fs_info, uptodate, &uptodate_bitmap); + GET_SUBPAGE_BITMAP(subpage, fs_info, dirty, &dirty_bitmap); + GET_SUBPAGE_BITMAP(subpage, fs_info, writeback, &writeback_bitmap); + GET_SUBPAGE_BITMAP(subpage, fs_info, ordered, &ordered_bitmap); + GET_SUBPAGE_BITMAP(subpage, fs_info, checked, &checked_bitmap); + GET_SUBPAGE_BITMAP(subpage, fs_info, locked, &checked_bitmap); spin_unlock_irqrestore(&subpage->lock, flags); dump_page(folio_page(folio, 0), "btrfs subpage dump"); btrfs_warn(fs_info, "start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl", start, len, folio_pos(folio), - subpage_info->bitmap_nr_bits, &uptodate_bitmap, - subpage_info->bitmap_nr_bits, &dirty_bitmap, - subpage_info->bitmap_nr_bits, &writeback_bitmap, - subpage_info->bitmap_nr_bits, &ordered_bitmap, - subpage_info->bitmap_nr_bits, &checked_bitmap); + sectors_per_page, &uptodate_bitmap, + sectors_per_page, &dirty_bitmap, + sectors_per_page, &writeback_bitmap, + sectors_per_page, &ordered_bitmap, + sectors_per_page, &checked_bitmap); +} + +void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info, + struct folio *folio, + unsigned long *ret_bitmap) +{ + struct btrfs_subpage *subpage; + unsigned long flags; + + ASSERT(folio_test_private(folio) && folio_get_private(folio)); + ASSERT(fs_info->sectors_per_page > 1); + subpage = folio_get_private(folio); + + spin_lock_irqsave(&subpage->lock, flags); + GET_SUBPAGE_BITMAP(subpage, fs_info, dirty, ret_bitmap); + spin_unlock_irqrestore(&subpage->lock, flags); } diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 249396e118d0..4b85d91d0e18 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -5,6 +5,7 @@ #include #include +#include struct address_space; struct folio; @@ -18,39 +19,23 @@ struct btrfs_fs_info; * * This structure records how they are organized in the bitmap: * - * /- uptodate_offset /- dirty_offset /- ordered_offset + * /- uptodate /- dirty /- ordered * | | | * v v v * |u|u|u|u|........|u|u|d|d|.......|d|d|o|o|.......|o|o| - * |<- bitmap_nr_bits ->| - * |<----------------- total_nr_bits ------------------>| + * |< sectors_per_page >| + * + * Unlike regular macro-like enums, here we do not go upper-case names, as + * these names will be utilized in various macros to define function names. */ -struct btrfs_subpage_info { - /* Number of bits for each bitmap */ - unsigned int bitmap_nr_bits; - - /* Total number of bits for the whole bitmap */ - unsigned int total_nr_bits; - - /* - * *_offset indicates where the bitmap starts, the length is always - * @bitmap_size, which is calculated from PAGE_SIZE / sectorsize. - */ - unsigned int uptodate_offset; - unsigned int dirty_offset; - unsigned int writeback_offset; - unsigned int ordered_offset; - unsigned int checked_offset; - - /* - * For locked bitmaps, normally it's subpage representation for folio - * Locked flag, but metadata is different: - * - * - Metadata doesn't really lock the folio - * It's just to prevent page::private get cleared before the last - * end_page_read(). - */ - unsigned int locked_offset; +enum { + btrfs_bitmap_nr_uptodate = 0, + btrfs_bitmap_nr_dirty, + btrfs_bitmap_nr_writeback, + btrfs_bitmap_nr_ordered, + btrfs_bitmap_nr_checked, + btrfs_bitmap_nr_locked, + btrfs_bitmap_nr_max }; /* @@ -88,9 +73,16 @@ enum btrfs_subpage_type { BTRFS_SUBPAGE_DATA, }; +#if PAGE_SIZE > SZ_4K bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping); +#else +static inline bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, + struct address_space *mapping) +{ + return false; +} +#endif -void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize); int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio, enum btrfs_subpage_type type); void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio); @@ -114,10 +106,11 @@ void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len); void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len); +void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info, + struct folio *folio, unsigned long bitmap); bool btrfs_subpage_find_writer_locked(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 search_start, u64 *found_start_ret, u32 *found_len_ret); -void btrfs_folio_end_all_writers(const struct btrfs_fs_info *fs_info, struct folio *folio); /* * Template for subpage related operations. @@ -164,8 +157,9 @@ bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len); -void btrfs_folio_unlock_writer(struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len); +void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info, + struct folio *folio, + unsigned long *ret_bitmap); void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len); diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 865d4af4b303..0a2dbfaaf49e 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -180,7 +180,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) set_extent_bit(tmp, 0, sectorsize - 1, EXTENT_DELALLOC, NULL); start = 0; end = start + PAGE_SIZE - 1; - found = find_lock_delalloc_range(inode, locked_page, &start, + found = find_lock_delalloc_range(inode, page_folio(locked_page), &start, &end); if (!found) { test_err("should have found at least one delalloc"); @@ -211,7 +211,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) set_extent_bit(tmp, sectorsize, max_bytes - 1, EXTENT_DELALLOC, NULL); start = test_start; end = start + PAGE_SIZE - 1; - found = find_lock_delalloc_range(inode, locked_page, &start, + found = find_lock_delalloc_range(inode, page_folio(locked_page), &start, &end); if (!found) { test_err("couldn't find delalloc in our range"); @@ -245,7 +245,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) } start = test_start; end = start + PAGE_SIZE - 1; - found = find_lock_delalloc_range(inode, locked_page, &start, + found = find_lock_delalloc_range(inode, page_folio(locked_page), &start, &end); if (found) { test_err("found range when we shouldn't have"); @@ -266,7 +266,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) set_extent_bit(tmp, max_bytes, total_dirty - 1, EXTENT_DELALLOC, NULL); start = test_start; end = start + PAGE_SIZE - 1; - found = find_lock_delalloc_range(inode, locked_page, &start, + found = find_lock_delalloc_range(inode, page_folio(locked_page), &start, &end); if (!found) { test_err("didn't find our range"); @@ -307,7 +307,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) * this changes at any point in the future we will need to fix this * tests expected behavior. */ - found = find_lock_delalloc_range(inode, locked_page, &start, + found = find_lock_delalloc_range(inode, page_folio(locked_page), &start, &end); if (!found) { test_err("didn't find our range"); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 5e6fff8e1003..0fc873af891f 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -143,8 +143,7 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) BUG_ON(!list_empty(&transaction->list)); WARN_ON(!RB_EMPTY_ROOT( &transaction->delayed_refs.href_root.rb_root)); - WARN_ON(!RB_EMPTY_ROOT( - &transaction->delayed_refs.dirty_extent_root)); + WARN_ON(!xa_empty(&transaction->delayed_refs.dirty_extents)); if (transaction->delayed_refs.pending_csums) btrfs_err(transaction->fs_info, "pending csums is %llu", @@ -351,7 +350,7 @@ loop: memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs)); cur_trans->delayed_refs.href_root = RB_ROOT_CACHED; - cur_trans->delayed_refs.dirty_extent_root = RB_ROOT; + xa_init(&cur_trans->delayed_refs.dirty_extents); atomic_set(&cur_trans->delayed_refs.num_entries, 0); /* diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f0cf8ce26f01..e2ed2a791f8f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2877,7 +2877,7 @@ void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx) struct btrfs_ordered_extent *ordered; struct btrfs_ordered_extent *tmp; - ASSERT(inode_is_locked(&ctx->inode->vfs_inode)); + btrfs_assert_inode_locked(ctx->inode); list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) { list_del_init(&ordered->log_list); diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index fa45b5fb9683..b382a4c443d4 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -170,7 +170,7 @@ static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info, * this until all tree mod log insertions are recorded in the rb tree and then * write unlock fs_info::tree_mod_log_lock. */ -static bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) +static bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, const struct extent_buffer *eb) { if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) return true; @@ -188,7 +188,7 @@ static bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, struct extent_buffe /* Similar to tree_mod_dont_log, but doesn't acquire any locks. */ static bool tree_mod_need_log(const struct btrfs_fs_info *fs_info, - struct extent_buffer *eb) + const struct extent_buffer *eb) { if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) return false; @@ -198,7 +198,7 @@ static bool tree_mod_need_log(const struct btrfs_fs_info *fs_info, return true; } -static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb, +static struct tree_mod_elem *alloc_tree_mod_elem(const struct extent_buffer *eb, int slot, enum btrfs_mod_log_op op) { @@ -221,7 +221,7 @@ static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb, return tm; } -int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot, +int btrfs_tree_mod_log_insert_key(const struct extent_buffer *eb, int slot, enum btrfs_mod_log_op op) { struct tree_mod_elem *tm; @@ -258,7 +258,7 @@ out_unlock: return ret; } -static struct tree_mod_elem *tree_mod_log_alloc_move(struct extent_buffer *eb, +static struct tree_mod_elem *tree_mod_log_alloc_move(const struct extent_buffer *eb, int dst_slot, int src_slot, int nr_items) { @@ -278,7 +278,7 @@ static struct tree_mod_elem *tree_mod_log_alloc_move(struct extent_buffer *eb, return tm; } -int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb, +int btrfs_tree_mod_log_insert_move(const struct extent_buffer *eb, int dst_slot, int src_slot, int nr_items) { @@ -535,7 +535,7 @@ static struct tree_mod_elem *tree_mod_log_search(struct btrfs_fs_info *fs_info, } int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, - struct extent_buffer *src, + const struct extent_buffer *src, unsigned long dst_offset, unsigned long src_offset, int nr_items) diff --git a/fs/btrfs/tree-mod-log.h b/fs/btrfs/tree-mod-log.h index ff00c8e8a393..6308c577a4a4 100644 --- a/fs/btrfs/tree-mod-log.h +++ b/fs/btrfs/tree-mod-log.h @@ -37,7 +37,7 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root, struct extent_buffer *new_root, bool log_removal); -int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot, +int btrfs_tree_mod_log_insert_key(const struct extent_buffer *eb, int slot, enum btrfs_mod_log_op op); int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb); struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info, @@ -47,11 +47,11 @@ struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq); int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq); int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, - struct extent_buffer *src, + const struct extent_buffer *src, unsigned long dst_offset, unsigned long src_offset, int nr_items); -int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb, +int btrfs_tree_mod_log_insert_move(const struct extent_buffer *eb, int dst_slot, int src_slot, int nr_items); u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index eae75bb572b9..c6399513c66f 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -3,6 +3,7 @@ * Copyright (C) STRATO AG 2013. All rights reserved. */ +#include #include #include #include "messages.h" @@ -12,6 +13,7 @@ #include "fs.h" #include "accessors.h" #include "uuid-tree.h" +#include "ioctl.h" static void btrfs_uuid_to_key(const u8 *uuid, u8 type, struct btrfs_key *key) { @@ -390,3 +392,180 @@ out: btrfs_free_path(path); return ret; } + +int btrfs_uuid_scan_kthread(void *data) +{ + struct btrfs_fs_info *fs_info = data; + struct btrfs_root *root = fs_info->tree_root; + struct btrfs_key key; + struct btrfs_path *path = NULL; + int ret = 0; + struct extent_buffer *eb; + int slot; + struct btrfs_root_item root_item; + u32 item_size; + struct btrfs_trans_handle *trans = NULL; + bool closing = false; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + key.objectid = 0; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = 0; + + while (1) { + if (btrfs_fs_closing(fs_info)) { + closing = true; + break; + } + ret = btrfs_search_forward(root, &key, path, + BTRFS_OLDEST_GENERATION); + if (ret) { + if (ret > 0) + ret = 0; + break; + } + + if (key.type != BTRFS_ROOT_ITEM_KEY || + (key.objectid < BTRFS_FIRST_FREE_OBJECTID && + key.objectid != BTRFS_FS_TREE_OBJECTID) || + key.objectid > BTRFS_LAST_FREE_OBJECTID) + goto skip; + + eb = path->nodes[0]; + slot = path->slots[0]; + item_size = btrfs_item_size(eb, slot); + if (item_size < sizeof(root_item)) + goto skip; + + read_extent_buffer(eb, &root_item, + btrfs_item_ptr_offset(eb, slot), + (int)sizeof(root_item)); + if (btrfs_root_refs(&root_item) == 0) + goto skip; + + if (!btrfs_is_empty_uuid(root_item.uuid) || + !btrfs_is_empty_uuid(root_item.received_uuid)) { + if (trans) + goto update_tree; + + btrfs_release_path(path); + /* + * 1 - subvol uuid item + * 1 - received_subvol uuid item + */ + trans = btrfs_start_transaction(fs_info->uuid_root, 2); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + continue; + } else { + goto skip; + } +update_tree: + btrfs_release_path(path); + if (!btrfs_is_empty_uuid(root_item.uuid)) { + ret = btrfs_uuid_tree_add(trans, root_item.uuid, + BTRFS_UUID_KEY_SUBVOL, + key.objectid); + if (ret < 0) { + btrfs_warn(fs_info, "uuid_tree_add failed %d", + ret); + break; + } + } + + if (!btrfs_is_empty_uuid(root_item.received_uuid)) { + ret = btrfs_uuid_tree_add(trans, + root_item.received_uuid, + BTRFS_UUID_KEY_RECEIVED_SUBVOL, + key.objectid); + if (ret < 0) { + btrfs_warn(fs_info, "uuid_tree_add failed %d", + ret); + break; + } + } + +skip: + btrfs_release_path(path); + if (trans) { + ret = btrfs_end_transaction(trans); + trans = NULL; + if (ret) + break; + } + + if (key.offset < (u64)-1) { + key.offset++; + } else if (key.type < BTRFS_ROOT_ITEM_KEY) { + key.offset = 0; + key.type = BTRFS_ROOT_ITEM_KEY; + } else if (key.objectid < (u64)-1) { + key.offset = 0; + key.type = BTRFS_ROOT_ITEM_KEY; + key.objectid++; + } else { + break; + } + cond_resched(); + } + +out: + btrfs_free_path(path); + if (trans && !IS_ERR(trans)) + btrfs_end_transaction(trans); + if (ret) + btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret); + else if (!closing) + set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); + up(&fs_info->uuid_tree_rescan_sem); + return 0; +} + +int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *uuid_root; + struct task_struct *task; + int ret; + + /* + * 1 - root node + * 1 - root item + */ + trans = btrfs_start_transaction(tree_root, 2); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID); + if (IS_ERR(uuid_root)) { + ret = PTR_ERR(uuid_root); + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; + } + + fs_info->uuid_root = uuid_root; + + ret = btrfs_commit_transaction(trans); + if (ret) + return ret; + + down(&fs_info->uuid_tree_rescan_sem); + task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); + if (IS_ERR(task)) { + /* fs_info->update_uuid_tree_gen remains 0 in all error case */ + btrfs_warn(fs_info, "failed to start uuid_scan task"); + up(&fs_info->uuid_tree_rescan_sem); + return PTR_ERR(task); + } + + return 0; +} diff --git a/fs/btrfs/uuid-tree.h b/fs/btrfs/uuid-tree.h index a3f5757cc7cf..c60ad20325cc 100644 --- a/fs/btrfs/uuid-tree.h +++ b/fs/btrfs/uuid-tree.h @@ -13,5 +13,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, const u8 *uuid, u8 typ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, const u8 *uuid, u8 type, u64 subid); int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info); +int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info); +int btrfs_uuid_scan_kthread(void *data); #endif diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index 4042dd6437ae..e97ad824ae16 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -284,7 +284,7 @@ static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, * page and ignore dest, but it must still be non-NULL to avoid the * counting-only behavior. * @len: length in bytes to read - * @dest_page: copy into this page instead of the dest buffer + * @dest_folio: copy into this folio instead of the dest buffer * * Helper function to read items from the btree. This returns the number of * bytes read or < 0 for errors. We can return short reads if the items don't @@ -294,7 +294,7 @@ static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, * Returns number of bytes read or a negative error code on failure. */ static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, - char *dest, u64 len, struct page *dest_page) + char *dest, u64 len, struct folio *dest_folio) { struct btrfs_path *path; struct btrfs_root *root = inode->root; @@ -314,7 +314,7 @@ static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, if (!path) return -ENOMEM; - if (dest_page) + if (dest_folio) path->reada = READA_FORWARD; key.objectid = btrfs_ino(inode); @@ -371,15 +371,15 @@ static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, copy_offset = offset - key.offset; if (dest) { - if (dest_page) - kaddr = kmap_local_page(dest_page); + if (dest_folio) + kaddr = kmap_local_folio(dest_folio, 0); data = btrfs_item_ptr(leaf, path->slots[0], void); read_extent_buffer(leaf, kaddr + dest_offset, (unsigned long)data + copy_offset, copy_bytes); - if (dest_page) + if (dest_folio) kunmap_local(kaddr); } @@ -460,7 +460,7 @@ static int rollback_verity(struct btrfs_inode *inode) struct btrfs_root *root = inode->root; int ret; - ASSERT(inode_is_locked(&inode->vfs_inode)); + btrfs_assert_inode_locked(inode); truncate_inode_pages(inode->vfs_inode.i_mapping, inode->vfs_inode.i_size); clear_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags); ret = btrfs_drop_verity_items(inode); @@ -585,7 +585,7 @@ static int btrfs_begin_enable_verity(struct file *filp) struct btrfs_trans_handle *trans; int ret; - ASSERT(inode_is_locked(file_inode(filp))); + btrfs_assert_inode_locked(inode); if (test_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags)) return -EBUSY; @@ -633,7 +633,7 @@ static int btrfs_end_enable_verity(struct file *filp, const void *desc, int ret = 0; int rollback_ret; - ASSERT(inode_is_locked(file_inode(filp))); + btrfs_assert_inode_locked(inode); if (desc == NULL) goto rollback; @@ -762,7 +762,7 @@ again: * [ inode objectid, BTRFS_MERKLE_ITEM_KEY, offset in bytes ] */ ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_MERKLE_ITEM_KEY, off, - folio_address(folio), PAGE_SIZE, &folio->page); + folio_address(folio), PAGE_SIZE, folio); if (ret < 0) { folio_put(folio); return ERR_PTR(ret); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index fcedc43ef291..8f340ad1d938 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -476,6 +476,8 @@ btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder, if (IS_ERR(*bdev_file)) { ret = PTR_ERR(*bdev_file); + btrfs_err(NULL, "failed to open device for path %s with flags 0x%x: %d", + device_path, flags, ret); goto error; } bdev = file_bdev(*bdev_file); @@ -4784,183 +4786,6 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) return 0; } -int btrfs_uuid_scan_kthread(void *data) -{ - struct btrfs_fs_info *fs_info = data; - struct btrfs_root *root = fs_info->tree_root; - struct btrfs_key key; - struct btrfs_path *path = NULL; - int ret = 0; - struct extent_buffer *eb; - int slot; - struct btrfs_root_item root_item; - u32 item_size; - struct btrfs_trans_handle *trans = NULL; - bool closing = false; - - path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } - - key.objectid = 0; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = 0; - - while (1) { - if (btrfs_fs_closing(fs_info)) { - closing = true; - break; - } - ret = btrfs_search_forward(root, &key, path, - BTRFS_OLDEST_GENERATION); - if (ret) { - if (ret > 0) - ret = 0; - break; - } - - if (key.type != BTRFS_ROOT_ITEM_KEY || - (key.objectid < BTRFS_FIRST_FREE_OBJECTID && - key.objectid != BTRFS_FS_TREE_OBJECTID) || - key.objectid > BTRFS_LAST_FREE_OBJECTID) - goto skip; - - eb = path->nodes[0]; - slot = path->slots[0]; - item_size = btrfs_item_size(eb, slot); - if (item_size < sizeof(root_item)) - goto skip; - - read_extent_buffer(eb, &root_item, - btrfs_item_ptr_offset(eb, slot), - (int)sizeof(root_item)); - if (btrfs_root_refs(&root_item) == 0) - goto skip; - - if (!btrfs_is_empty_uuid(root_item.uuid) || - !btrfs_is_empty_uuid(root_item.received_uuid)) { - if (trans) - goto update_tree; - - btrfs_release_path(path); - /* - * 1 - subvol uuid item - * 1 - received_subvol uuid item - */ - trans = btrfs_start_transaction(fs_info->uuid_root, 2); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - break; - } - continue; - } else { - goto skip; - } -update_tree: - btrfs_release_path(path); - if (!btrfs_is_empty_uuid(root_item.uuid)) { - ret = btrfs_uuid_tree_add(trans, root_item.uuid, - BTRFS_UUID_KEY_SUBVOL, - key.objectid); - if (ret < 0) { - btrfs_warn(fs_info, "uuid_tree_add failed %d", - ret); - break; - } - } - - if (!btrfs_is_empty_uuid(root_item.received_uuid)) { - ret = btrfs_uuid_tree_add(trans, - root_item.received_uuid, - BTRFS_UUID_KEY_RECEIVED_SUBVOL, - key.objectid); - if (ret < 0) { - btrfs_warn(fs_info, "uuid_tree_add failed %d", - ret); - break; - } - } - -skip: - btrfs_release_path(path); - if (trans) { - ret = btrfs_end_transaction(trans); - trans = NULL; - if (ret) - break; - } - - if (key.offset < (u64)-1) { - key.offset++; - } else if (key.type < BTRFS_ROOT_ITEM_KEY) { - key.offset = 0; - key.type = BTRFS_ROOT_ITEM_KEY; - } else if (key.objectid < (u64)-1) { - key.offset = 0; - key.type = BTRFS_ROOT_ITEM_KEY; - key.objectid++; - } else { - break; - } - cond_resched(); - } - -out: - btrfs_free_path(path); - if (trans && !IS_ERR(trans)) - btrfs_end_transaction(trans); - if (ret) - btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret); - else if (!closing) - set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); - up(&fs_info->uuid_tree_rescan_sem); - return 0; -} - -int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) -{ - struct btrfs_trans_handle *trans; - struct btrfs_root *tree_root = fs_info->tree_root; - struct btrfs_root *uuid_root; - struct task_struct *task; - int ret; - - /* - * 1 - root node - * 1 - root item - */ - trans = btrfs_start_transaction(tree_root, 2); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID); - if (IS_ERR(uuid_root)) { - ret = PTR_ERR(uuid_root); - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); - return ret; - } - - fs_info->uuid_root = uuid_root; - - ret = btrfs_commit_transaction(trans); - if (ret) - return ret; - - down(&fs_info->uuid_tree_rescan_sem); - task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); - if (IS_ERR(task)) { - /* fs_info->update_uuid_tree_gen remains 0 in all error case */ - btrfs_warn(fs_info, "failed to start uuid_scan task"); - up(&fs_info->uuid_tree_rescan_sem); - return PTR_ERR(task); - } - - return 0; -} - /* * shrinking a device means finding all of the device extents past * the new size, and then following the back refs to the chunks. @@ -5956,11 +5781,31 @@ void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info) write_unlock(&fs_info->mapping_tree_lock); } +static int btrfs_chunk_map_num_copies(const struct btrfs_chunk_map *map) +{ + enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(map->type); + + if (map->type & BTRFS_BLOCK_GROUP_RAID5) + return 2; + + /* + * There could be two corrupted data stripes, we need to loop retry in + * order to rebuild the correct data. + * + * Fail a stripe at a time on every retry except the stripe under + * reconstruction. + */ + if (map->type & BTRFS_BLOCK_GROUP_RAID6) + return map->num_stripes; + + /* Non-RAID56, use their ncopies from btrfs_raid_array. */ + return btrfs_raid_array[index].ncopies; +} + int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) { struct btrfs_chunk_map *map; - enum btrfs_raid_types index; - int ret = 1; + int ret; map = btrfs_get_chunk_map(fs_info, logical, len); if (IS_ERR(map)) @@ -5972,22 +5817,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) */ return 1; - index = btrfs_bg_flags_to_raid_index(map->type); - - /* Non-RAID56, use their ncopies from btrfs_raid_array. */ - if (!(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)) - ret = btrfs_raid_array[index].ncopies; - else if (map->type & BTRFS_BLOCK_GROUP_RAID5) - ret = 2; - else if (map->type & BTRFS_BLOCK_GROUP_RAID6) - /* - * There could be two corrupted data stripes, we need - * to loop retry in order to rebuild the correct data. - * - * Fail a stripe at a time on every retry except the - * stripe under reconstruction. - */ - ret = map->num_stripes; + ret = btrfs_chunk_map_num_copies(map); btrfs_free_chunk_map(map); return ret; } @@ -6637,14 +6467,14 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, io_geom.stripe_index = 0; io_geom.op = op; - num_copies = btrfs_num_copies(fs_info, logical, fs_info->sectorsize); - if (io_geom.mirror_num > num_copies) - return -EINVAL; - map = btrfs_get_chunk_map(fs_info, logical, *length); if (IS_ERR(map)) return PTR_ERR(map); + num_copies = btrfs_chunk_map_num_copies(map); + if (io_geom.mirror_num > num_copies) + return -EINVAL; + map_offset = logical - map->start; io_geom.raid56_full_stripe_start = (u64)-1; max_len = btrfs_max_io_len(map, map_offset, &io_geom); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 37a09ebb34dd..03d2d60afe0c 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -444,7 +444,7 @@ struct btrfs_io_stripe { /* Block mapping. */ u64 physical; u64 length; - bool is_scrub; + bool rst_search_commit_root; /* For the endio handler. */ struct btrfs_io_context *bioc; }; @@ -725,8 +725,6 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info); int btrfs_pause_balance(struct btrfs_fs_info *fs_info); int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset); int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); -int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info); -int btrfs_uuid_scan_kthread(void *data); bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset); void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 738c7bb8ea7c..ce464cd8e0ac 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -120,7 +120,7 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, * locks the inode's i_mutex before calling setxattr or removexattr. */ if (flags & XATTR_REPLACE) { - ASSERT(inode_is_locked(inode)); + btrfs_assert_inode_locked(BTRFS_I(inode)); di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(BTRFS_I(inode)), name, name_len, 0); if (!di) diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 30971dd741e2..100abc00b794 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -20,6 +20,8 @@ #include #include "btrfs_inode.h" #include "compression.h" +#include "fs.h" +#include "subpage.h" /* workspace buffer size for s390 zlib hardware support */ #define ZLIB_DFLTCC_BUF_SIZE (4 * PAGE_SIZE) @@ -108,6 +110,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, unsigned long len = *total_out; unsigned long nr_dest_folios = *out_folios; const unsigned long max_out = nr_dest_folios * PAGE_SIZE; + const u64 orig_end = start + len; *out_folios = 0; *total_out = 0; @@ -153,6 +156,10 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, if (in_buf_folios > 1) { int i; + /* S390 hardware acceleration path, not subpage. */ + ASSERT(!btrfs_is_subpage( + inode_to_fs_info(mapping->host), + mapping)); for (i = 0; i < in_buf_folios; i++) { if (data_in) { kunmap_local(data_in); @@ -167,9 +174,14 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, copy_page(workspace->buf + i * PAGE_SIZE, data_in); start += PAGE_SIZE; + workspace->strm.avail_in = + (in_buf_folios << PAGE_SHIFT); } workspace->strm.next_in = workspace->buf; } else { + unsigned int pg_off; + unsigned int cur_len; + if (data_in) { kunmap_local(data_in); folio_put(in_folio); @@ -179,12 +191,13 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, start, &in_folio); if (ret < 0) goto out; - data_in = kmap_local_folio(in_folio, 0); + pg_off = offset_in_page(start); + cur_len = btrfs_calc_input_length(orig_end, start); + data_in = kmap_local_folio(in_folio, pg_off); start += PAGE_SIZE; workspace->strm.next_in = data_in; + workspace->strm.avail_in = cur_len; } - workspace->strm.avail_in = min(bytes_left, - (unsigned long) workspace->buf_size); } ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH); @@ -380,7 +393,7 @@ done: } int zlib_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long dest_pgoff, size_t srclen, + struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); @@ -408,12 +421,12 @@ int zlib_decompress(struct list_head *ws, const u8 *data_in, ret = zlib_inflateInit2(&workspace->strm, wbits); if (unlikely(ret != Z_OK)) { - struct btrfs_inode *inode = BTRFS_I(dest_page->mapping->host); + struct btrfs_inode *inode = folio_to_inode(dest_folio); btrfs_err(inode->root->fs_info, "zlib decompression init failed, error %d root %llu inode %llu offset %llu", ret, btrfs_root_id(inode->root), btrfs_ino(inode), - page_offset(dest_page)); + folio_pos(dest_folio)); return -EIO; } @@ -426,16 +439,16 @@ int zlib_decompress(struct list_head *ws, const u8 *data_in, if (ret != Z_STREAM_END) goto out; - memcpy_to_page(dest_page, dest_pgoff, workspace->buf, to_copy); + memcpy_to_folio(dest_folio, dest_pgoff, workspace->buf, to_copy); out: if (unlikely(to_copy != destlen)) { - struct btrfs_inode *inode = BTRFS_I(dest_page->mapping->host); + struct btrfs_inode *inode = folio_to_inode(dest_folio); btrfs_err(inode->root->fs_info, "zlib decompression failed, error %d root %llu inode %llu offset %llu decompressed %lu expected %zu", ret, btrfs_root_id(inode->root), btrfs_ino(inode), - page_offset(dest_page), to_copy, destlen); + folio_pos(dest_folio), to_copy, destlen); ret = -EIO; } else { ret = 0; @@ -444,7 +457,7 @@ out: zlib_inflateEnd(&workspace->strm); if (unlikely(to_copy < destlen)) - memzero_page(dest_page, dest_pgoff + to_copy, destlen - to_copy); + folio_zero_range(dest_folio, dest_pgoff + to_copy, destlen - to_copy); return ret; } diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 047e3337852e..7fa2920632ba 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -287,7 +287,7 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, /* The emulated zone size is determined from the size of device extent */ static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_root *root = fs_info->dev_root; struct btrfs_key key; struct extent_buffer *leaf; @@ -304,28 +304,21 @@ static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info) ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) - goto out; + return ret; if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { ret = btrfs_next_leaf(root, path); if (ret < 0) - goto out; + return ret; /* No dev extents at all? Not good */ - if (ret > 0) { - ret = -EUCLEAN; - goto out; - } + if (ret > 0) + return -EUCLEAN; } leaf = path->nodes[0]; dext = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); fs_info->zone_size = btrfs_dev_extent_length(leaf, dext); - ret = 0; - -out: - btrfs_free_path(path); - - return ret; + return 0; } int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info) @@ -1211,7 +1204,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, { struct btrfs_fs_info *fs_info = cache->fs_info; struct btrfs_root *root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_key found_key; int ret; @@ -1246,7 +1239,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, if (!ret) ret = -EUCLEAN; if (ret < 0) - goto out; + return ret; ret = btrfs_previous_extent_item(root, path, cache->start); if (ret) { @@ -1254,7 +1247,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, ret = 0; *offset_ret = 0; } - goto out; + return ret; } btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); @@ -1266,15 +1259,10 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, if (!(found_key.objectid >= cache->start && found_key.objectid + length <= cache->start + cache->length)) { - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } *offset_ret = found_key.objectid + length - cache->start; - ret = 0; - -out: - btrfs_free_path(path); - return ret; + return 0; } struct zone_info { @@ -2459,7 +2447,7 @@ void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) mutex_unlock(&fs_devices->device_list_mutex); } -bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info) +bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info) { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 30b2e48a1cec..7612e6572605 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -89,7 +89,7 @@ void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, struct extent_buffer *eb); void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg); void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info); -bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info); +bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info); void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical, u64 length); int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info); @@ -242,7 +242,7 @@ static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { } static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { } -static inline bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info) +static inline bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info) { return false; } diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 2a079561b2b1..866607fd3e58 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -389,7 +389,10 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, unsigned long tot_out = 0; unsigned long len = *total_out; const unsigned long nr_dest_folios = *out_folios; + const u64 orig_end = start + len; unsigned long max_out = nr_dest_folios * PAGE_SIZE; + unsigned int pg_off; + unsigned int cur_len; zstd_parameters params = zstd_get_btrfs_parameters(workspace->req_level, len); @@ -415,9 +418,11 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio); if (ret < 0) goto out; - workspace->in_buf.src = kmap_local_folio(in_folio, 0); + pg_off = offset_in_page(start); + cur_len = btrfs_calc_input_length(orig_end, start); + workspace->in_buf.src = kmap_local_folio(in_folio, pg_off); workspace->in_buf.pos = 0; - workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); + workspace->in_buf.size = cur_len; /* Allocate and map in the output buffer */ out_folio = btrfs_alloc_compr_folio(); @@ -494,14 +499,16 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, kunmap_local(workspace->in_buf.src); workspace->in_buf.src = NULL; folio_put(in_folio); - start += PAGE_SIZE; - len -= PAGE_SIZE; + start += cur_len; + len -= cur_len; ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio); if (ret < 0) goto out; - workspace->in_buf.src = kmap_local_folio(in_folio, 0); + pg_off = offset_in_page(start); + cur_len = btrfs_calc_input_length(orig_end, start); + workspace->in_buf.src = kmap_local_folio(in_folio, pg_off); workspace->in_buf.pos = 0; - workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); + workspace->in_buf.size = cur_len; } } while (1) { @@ -649,11 +656,11 @@ done: } int zstd_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long dest_pgoff, size_t srclen, + struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); - struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb); + struct btrfs_fs_info *fs_info = btrfs_sb(folio_inode(dest_folio)->i_sb); const u32 sectorsize = fs_info->sectorsize; zstd_dstream *stream; int ret = 0; @@ -662,12 +669,12 @@ int zstd_decompress(struct list_head *ws, const u8 *data_in, stream = zstd_init_dstream( ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size); if (unlikely(!stream)) { - struct btrfs_inode *inode = BTRFS_I(dest_page->mapping->host); + struct btrfs_inode *inode = folio_to_inode(dest_folio); btrfs_err(inode->root->fs_info, "zstd decompression init failed, root %llu inode %llu offset %llu", btrfs_root_id(inode->root), btrfs_ino(inode), - page_offset(dest_page)); + folio_pos(dest_folio)); ret = -EIO; goto finish; } @@ -686,21 +693,21 @@ int zstd_decompress(struct list_head *ws, const u8 *data_in, */ ret = zstd_decompress_stream(stream, &workspace->out_buf, &workspace->in_buf); if (unlikely(zstd_is_error(ret))) { - struct btrfs_inode *inode = BTRFS_I(dest_page->mapping->host); + struct btrfs_inode *inode = folio_to_inode(dest_folio); btrfs_err(inode->root->fs_info, "zstd decompression failed, error %d root %llu inode %llu offset %llu", zstd_get_error_code(ret), btrfs_root_id(inode->root), - btrfs_ino(inode), page_offset(dest_page)); + btrfs_ino(inode), folio_pos(dest_folio)); goto finish; } to_copy = workspace->out_buf.pos; - memcpy_to_page(dest_page, dest_pgoff, workspace->out_buf.dst, to_copy); + memcpy_to_folio(dest_folio, dest_pgoff, workspace->out_buf.dst, to_copy); finish: /* Error or early end. */ if (unlikely(to_copy < destlen)) { ret = -EIO; - memzero_page(dest_page, dest_pgoff + to_copy, destlen - to_copy); + folio_zero_range(dest_folio, dest_pgoff + to_copy, destlen - to_copy); } return ret; } diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index 7dcdce660cac..6ea60661fa55 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -74,6 +74,23 @@ config EROFS_FS_SECURITY If you are not using a security module, say N. +config EROFS_FS_BACKED_BY_FILE + bool "File-backed EROFS filesystem support" + depends on EROFS_FS + default y + help + This allows EROFS to use filesystem image files directly, without + the intercession of loopback block devices or likewise. It is + particularly useful for container images with numerous blobs and + other sandboxes, where loop devices behave intricately. It can also + be used to simplify error-prone lifetime management of unnecessary + virtual block devices. + + Note that this feature, along with ongoing fanotify pre-content + hooks, will eventually replace "EROFS over fscache." + + If you don't want to enable this feature, say N. + config EROFS_FS_ZIP bool "EROFS Data Compression Support" depends on EROFS_FS @@ -128,7 +145,7 @@ config EROFS_FS_ZIP_ZSTD If unsure, say N. config EROFS_FS_ONDEMAND - bool "EROFS fscache-based on-demand read support" + bool "EROFS fscache-based on-demand read support (deprecated)" depends on EROFS_FS select NETFS_SUPPORT select FSCACHE @@ -138,6 +155,9 @@ config EROFS_FS_ONDEMAND This permits EROFS to use fscache-backed data blobs with on-demand read support. + It is now deprecated and scheduled to be removed from the kernel + after fanotify pre-content hooks are landed. + If unsure, say N. config EROFS_FS_PCPU_KTHREAD diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile index 097d672e6b14..4331d53c7109 100644 --- a/fs/erofs/Makefile +++ b/fs/erofs/Makefile @@ -7,4 +7,5 @@ erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o zutil.o erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o erofs-$(CONFIG_EROFS_FS_ZIP_DEFLATE) += decompressor_deflate.o erofs-$(CONFIG_EROFS_FS_ZIP_ZSTD) += decompressor_zstd.o +erofs-$(CONFIG_EROFS_FS_BACKED_BY_FILE) += fileio.o erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 1b7eba38ba1e..61debd799cf9 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -59,8 +59,12 @@ void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb) { - if (erofs_is_fscache_mode(sb)) - buf->mapping = EROFS_SB(sb)->s_fscache->inode->i_mapping; + struct erofs_sb_info *sbi = EROFS_SB(sb); + + if (erofs_is_fileio_mode(sbi)) + buf->mapping = file_inode(sbi->fdev)->i_mapping; + else if (erofs_is_fscache_mode(sb)) + buf->mapping = sbi->s_fscache->inode->i_mapping; else buf->mapping = sb->s_bdev->bd_mapping; } @@ -75,38 +79,28 @@ void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, static int erofs_map_blocks_flatmode(struct inode *inode, struct erofs_map_blocks *map) { - erofs_blk_t nblocks, lastblk; - u64 offset = map->m_la; struct erofs_inode *vi = EROFS_I(inode); struct super_block *sb = inode->i_sb; bool tailendpacking = (vi->datalayout == EROFS_INODE_FLAT_INLINE); + erofs_blk_t lastblk = erofs_iblks(inode) - tailendpacking; - nblocks = erofs_iblks(inode); - lastblk = nblocks - tailendpacking; - - /* there is no hole in flatmode */ - map->m_flags = EROFS_MAP_MAPPED; - if (offset < erofs_pos(sb, lastblk)) { + map->m_flags = EROFS_MAP_MAPPED; /* no hole in flat inodes */ + if (map->m_la < erofs_pos(sb, lastblk)) { map->m_pa = erofs_pos(sb, vi->raw_blkaddr) + map->m_la; - map->m_plen = erofs_pos(sb, lastblk) - offset; - } else if (tailendpacking) { + map->m_plen = erofs_pos(sb, lastblk) - map->m_la; + } else { + DBG_BUGON(!tailendpacking); map->m_pa = erofs_iloc(inode) + vi->inode_isize + - vi->xattr_isize + erofs_blkoff(sb, offset); - map->m_plen = inode->i_size - offset; + vi->xattr_isize + erofs_blkoff(sb, map->m_la); + map->m_plen = inode->i_size - map->m_la; /* inline data should be located in the same meta block */ if (erofs_blkoff(sb, map->m_pa) + map->m_plen > sb->s_blocksize) { - erofs_err(sb, "inline data cross block boundary @ nid %llu", - vi->nid); + erofs_err(sb, "inline data across blocks @ nid %llu", vi->nid); DBG_BUGON(1); return -EFSCORRUPTED; } map->m_flags |= EROFS_MAP_META; - } else { - erofs_err(sb, "internal error @ nid: %llu (size %llu), m_la 0x%llx", - vi->nid, inode->i_size, map->m_la); - DBG_BUGON(1); - return -EIO; } return 0; } @@ -128,7 +122,7 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map) if (map->m_la >= inode->i_size) { /* leave out-of-bound access unmapped */ map->m_flags = 0; - map->m_plen = 0; + map->m_plen = map->m_llen; goto out; } @@ -189,16 +183,34 @@ out: return err; } +static void erofs_fill_from_devinfo(struct erofs_map_dev *map, + struct erofs_device_info *dif) +{ + map->m_bdev = NULL; + map->m_fp = NULL; + if (dif->file) { + if (S_ISBLK(file_inode(dif->file)->i_mode)) + map->m_bdev = file_bdev(dif->file); + else + map->m_fp = dif->file; + } + map->m_daxdev = dif->dax_dev; + map->m_dax_part_off = dif->dax_part_off; + map->m_fscache = dif->fscache; +} + int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) { struct erofs_dev_context *devs = EROFS_SB(sb)->devs; struct erofs_device_info *dif; + erofs_off_t startoff, length; int id; map->m_bdev = sb->s_bdev; map->m_daxdev = EROFS_SB(sb)->dax_dev; map->m_dax_part_off = EROFS_SB(sb)->dax_part_off; map->m_fscache = EROFS_SB(sb)->s_fscache; + map->m_fp = EROFS_SB(sb)->fdev; if (map->m_deviceid) { down_read(&devs->rwsem); @@ -212,29 +224,20 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) up_read(&devs->rwsem); return 0; } - map->m_bdev = dif->bdev_file ? file_bdev(dif->bdev_file) : NULL; - map->m_daxdev = dif->dax_dev; - map->m_dax_part_off = dif->dax_part_off; - map->m_fscache = dif->fscache; + erofs_fill_from_devinfo(map, dif); up_read(&devs->rwsem); } else if (devs->extra_devices && !devs->flatdev) { down_read(&devs->rwsem); idr_for_each_entry(&devs->tree, dif, id) { - erofs_off_t startoff, length; - if (!dif->mapped_blkaddr) continue; + startoff = erofs_pos(sb, dif->mapped_blkaddr); length = erofs_pos(sb, dif->blocks); - if (map->m_pa >= startoff && map->m_pa < startoff + length) { map->m_pa -= startoff; - map->m_bdev = dif->bdev_file ? - file_bdev(dif->bdev_file) : NULL; - map->m_daxdev = dif->dax_dev; - map->m_dax_part_off = dif->dax_part_off; - map->m_fscache = dif->fscache; + erofs_fill_from_devinfo(map, dif); break; } } @@ -243,6 +246,42 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) return 0; } +/* + * bit 30: I/O error occurred on this folio + * bit 0 - 29: remaining parts to complete this folio + */ +#define EROFS_ONLINEFOLIO_EIO (1 << 30) + +void erofs_onlinefolio_init(struct folio *folio) +{ + union { + atomic_t o; + void *v; + } u = { .o = ATOMIC_INIT(1) }; + + folio->private = u.v; /* valid only if file-backed folio is locked */ +} + +void erofs_onlinefolio_split(struct folio *folio) +{ + atomic_inc((atomic_t *)&folio->private); +} + +void erofs_onlinefolio_end(struct folio *folio, int err) +{ + int orig, v; + + do { + orig = atomic_read((atomic_t *)&folio->private); + v = (orig - 1) | (err ? EROFS_ONLINEFOLIO_EIO : 0); + } while (atomic_cmpxchg((atomic_t *)&folio->private, orig, v) != orig); + + if (v & ~EROFS_ONLINEFOLIO_EIO) + return; + folio->private = 0; + folio_end_read(folio, !(v & EROFS_ONLINEFOLIO_EIO)); +} + static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { @@ -392,7 +431,7 @@ static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) } /* for uncompressed (aligned) files and raw access for other files */ -const struct address_space_operations erofs_raw_access_aops = { +const struct address_space_operations erofs_aops = { .read_folio = erofs_read_folio, .readahead = erofs_readahead, .bmap = erofs_bmap, diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index c2253b6a5416..eb318c7ddd80 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -539,7 +539,7 @@ int __init z_erofs_init_decompressor(void) for (i = 0; i < Z_EROFS_COMPRESSION_MAX; ++i) { err = z_erofs_decomp[i] ? z_erofs_decomp[i]->init() : 0; if (err) { - while (--i) + while (i--) if (z_erofs_decomp[i]) z_erofs_decomp[i]->exit(); return err; diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index 6c0c270c42e1..c8f2ae845bd2 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -288,9 +288,12 @@ struct erofs_dirent { #define EROFS_NAME_LEN 255 -/* maximum supported size of a physical compression cluster */ +/* maximum supported encoded size of a physical compressed cluster */ #define Z_EROFS_PCLUSTER_MAX_SIZE (1024 * 1024) +/* maximum supported decoded size of a physical compressed cluster */ +#define Z_EROFS_PCLUSTER_MAX_DSIZE (12 * 1024 * 1024) + /* available compression algorithm types (for h_algorithmtype) */ enum { Z_EROFS_COMPRESSION_LZ4 = 0, diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c new file mode 100644 index 000000000000..3af96b1e2c2a --- /dev/null +++ b/fs/erofs/fileio.c @@ -0,0 +1,192 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2024, Alibaba Cloud + */ +#include "internal.h" +#include + +struct erofs_fileio_rq { + struct bio_vec bvecs[BIO_MAX_VECS]; + struct bio bio; + struct kiocb iocb; +}; + +struct erofs_fileio { + struct erofs_map_blocks map; + struct erofs_map_dev dev; + struct erofs_fileio_rq *rq; +}; + +static void erofs_fileio_ki_complete(struct kiocb *iocb, long ret) +{ + struct erofs_fileio_rq *rq = + container_of(iocb, struct erofs_fileio_rq, iocb); + struct folio_iter fi; + + if (ret > 0) { + if (ret != rq->bio.bi_iter.bi_size) { + bio_advance(&rq->bio, ret); + zero_fill_bio(&rq->bio); + } + ret = 0; + } + if (rq->bio.bi_end_io) { + rq->bio.bi_end_io(&rq->bio); + } else { + bio_for_each_folio_all(fi, &rq->bio) { + DBG_BUGON(folio_test_uptodate(fi.folio)); + erofs_onlinefolio_end(fi.folio, ret); + } + } + bio_uninit(&rq->bio); + kfree(rq); +} + +static void erofs_fileio_rq_submit(struct erofs_fileio_rq *rq) +{ + struct iov_iter iter; + int ret; + + if (!rq) + return; + rq->iocb.ki_pos = rq->bio.bi_iter.bi_sector << SECTOR_SHIFT; + rq->iocb.ki_ioprio = get_current_ioprio(); + rq->iocb.ki_complete = erofs_fileio_ki_complete; + rq->iocb.ki_flags = (rq->iocb.ki_filp->f_mode & FMODE_CAN_ODIRECT) ? + IOCB_DIRECT : 0; + iov_iter_bvec(&iter, ITER_DEST, rq->bvecs, rq->bio.bi_vcnt, + rq->bio.bi_iter.bi_size); + ret = vfs_iocb_iter_read(rq->iocb.ki_filp, &rq->iocb, &iter); + if (ret != -EIOCBQUEUED) + erofs_fileio_ki_complete(&rq->iocb, ret); +} + +static struct erofs_fileio_rq *erofs_fileio_rq_alloc(struct erofs_map_dev *mdev) +{ + struct erofs_fileio_rq *rq = kzalloc(sizeof(*rq), + GFP_KERNEL | __GFP_NOFAIL); + + bio_init(&rq->bio, NULL, rq->bvecs, BIO_MAX_VECS, REQ_OP_READ); + rq->iocb.ki_filp = mdev->m_fp; + return rq; +} + +struct bio *erofs_fileio_bio_alloc(struct erofs_map_dev *mdev) +{ + return &erofs_fileio_rq_alloc(mdev)->bio; +} + +void erofs_fileio_submit_bio(struct bio *bio) +{ + return erofs_fileio_rq_submit(container_of(bio, struct erofs_fileio_rq, + bio)); +} + +static int erofs_fileio_scan_folio(struct erofs_fileio *io, struct folio *folio) +{ + struct inode *inode = folio_inode(folio); + struct erofs_map_blocks *map = &io->map; + unsigned int cur = 0, end = folio_size(folio), len, attached = 0; + loff_t pos = folio_pos(folio), ofs; + struct iov_iter iter; + struct bio_vec bv; + int err = 0; + + erofs_onlinefolio_init(folio); + while (cur < end) { + if (!in_range(pos + cur, map->m_la, map->m_llen)) { + map->m_la = pos + cur; + map->m_llen = end - cur; + err = erofs_map_blocks(inode, map); + if (err) + break; + } + + ofs = folio_pos(folio) + cur - map->m_la; + len = min_t(loff_t, map->m_llen - ofs, end - cur); + if (map->m_flags & EROFS_MAP_META) { + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + void *src; + + src = erofs_read_metabuf(&buf, inode->i_sb, + map->m_pa + ofs, EROFS_KMAP); + if (IS_ERR(src)) { + err = PTR_ERR(src); + break; + } + bvec_set_folio(&bv, folio, len, cur); + iov_iter_bvec(&iter, ITER_DEST, &bv, 1, len); + if (copy_to_iter(src, len, &iter) != len) { + erofs_put_metabuf(&buf); + err = -EIO; + break; + } + erofs_put_metabuf(&buf); + } else if (!(map->m_flags & EROFS_MAP_MAPPED)) { + folio_zero_segment(folio, cur, cur + len); + attached = 0; + } else { + if (io->rq && (map->m_pa + ofs != io->dev.m_pa || + map->m_deviceid != io->dev.m_deviceid)) { +io_retry: + erofs_fileio_rq_submit(io->rq); + io->rq = NULL; + } + + if (!io->rq) { + io->dev = (struct erofs_map_dev) { + .m_pa = io->map.m_pa + ofs, + .m_deviceid = io->map.m_deviceid, + }; + err = erofs_map_dev(inode->i_sb, &io->dev); + if (err) + break; + io->rq = erofs_fileio_rq_alloc(&io->dev); + io->rq->bio.bi_iter.bi_sector = io->dev.m_pa >> 9; + attached = 0; + } + if (!attached++) + erofs_onlinefolio_split(folio); + if (!bio_add_folio(&io->rq->bio, folio, len, cur)) + goto io_retry; + io->dev.m_pa += len; + } + cur += len; + } + erofs_onlinefolio_end(folio, err); + return err; +} + +static int erofs_fileio_read_folio(struct file *file, struct folio *folio) +{ + struct erofs_fileio io = {}; + int err; + + trace_erofs_read_folio(folio, true); + err = erofs_fileio_scan_folio(&io, folio); + erofs_fileio_rq_submit(io.rq); + return err; +} + +static void erofs_fileio_readahead(struct readahead_control *rac) +{ + struct inode *inode = rac->mapping->host; + struct erofs_fileio io = {}; + struct folio *folio; + int err; + + trace_erofs_readpages(inode, readahead_index(rac), + readahead_count(rac), true); + while ((folio = readahead_folio(rac))) { + err = erofs_fileio_scan_folio(&io, folio); + if (err && err != -EINTR) + erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu", + folio->index, EROFS_I(inode)->nid); + } + erofs_fileio_rq_submit(io.rq); +} + +const struct address_space_operations erofs_fileio_aops = { + .read_folio = erofs_fileio_read_folio, + .readahead = erofs_fileio_readahead, +}; diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 419432be3223..db29190656eb 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -5,11 +5,26 @@ * Copyright (C) 2021, Alibaba Cloud */ #include "xattr.h" - #include -static void *erofs_read_inode(struct erofs_buf *buf, - struct inode *inode, unsigned int *ofs) +static int erofs_fill_symlink(struct inode *inode, void *kaddr, + unsigned int m_pofs) +{ + struct erofs_inode *vi = EROFS_I(inode); + loff_t off; + + m_pofs += vi->xattr_isize; + /* check if it cannot be handled with fast symlink scheme */ + if (vi->datalayout != EROFS_INODE_FLAT_INLINE || + check_add_overflow(m_pofs, inode->i_size, &off) || + off > i_blocksize(inode)) + return 0; + + inode->i_link = kmemdup_nul(kaddr + m_pofs, inode->i_size, GFP_KERNEL); + return inode->i_link ? 0 : -ENOMEM; +} + +static int erofs_read_inode(struct inode *inode) { struct super_block *sb = inode->i_sb; struct erofs_sb_info *sbi = EROFS_SB(sb); @@ -20,20 +35,21 @@ static void *erofs_read_inode(struct erofs_buf *buf, struct erofs_inode_compact *dic; struct erofs_inode_extended *die, *copied = NULL; union erofs_inode_i_u iu; - unsigned int ifmt; - int err; + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + unsigned int ifmt, ofs; + int err = 0; blkaddr = erofs_blknr(sb, inode_loc); - *ofs = erofs_blkoff(sb, inode_loc); + ofs = erofs_blkoff(sb, inode_loc); - kaddr = erofs_read_metabuf(buf, sb, erofs_pos(sb, blkaddr), EROFS_KMAP); + kaddr = erofs_read_metabuf(&buf, sb, erofs_pos(sb, blkaddr), EROFS_KMAP); if (IS_ERR(kaddr)) { erofs_err(sb, "failed to get inode (nid: %llu) page, err %ld", vi->nid, PTR_ERR(kaddr)); - return kaddr; + return PTR_ERR(kaddr); } - dic = kaddr + *ofs; + dic = kaddr + ofs; ifmt = le16_to_cpu(dic->i_format); if (ifmt & ~EROFS_I_ALL) { erofs_err(sb, "unsupported i_format %u of nid %llu", @@ -54,11 +70,11 @@ static void *erofs_read_inode(struct erofs_buf *buf, case EROFS_INODE_LAYOUT_EXTENDED: vi->inode_isize = sizeof(struct erofs_inode_extended); /* check if the extended inode acrosses block boundary */ - if (*ofs + vi->inode_isize <= sb->s_blocksize) { - *ofs += vi->inode_isize; + if (ofs + vi->inode_isize <= sb->s_blocksize) { + ofs += vi->inode_isize; die = (struct erofs_inode_extended *)dic; } else { - const unsigned int gotten = sb->s_blocksize - *ofs; + const unsigned int gotten = sb->s_blocksize - ofs; copied = kmalloc(vi->inode_isize, GFP_KERNEL); if (!copied) { @@ -66,16 +82,16 @@ static void *erofs_read_inode(struct erofs_buf *buf, goto err_out; } memcpy(copied, dic, gotten); - kaddr = erofs_read_metabuf(buf, sb, erofs_pos(sb, blkaddr + 1), + kaddr = erofs_read_metabuf(&buf, sb, erofs_pos(sb, blkaddr + 1), EROFS_KMAP); if (IS_ERR(kaddr)) { erofs_err(sb, "failed to get inode payload block (nid: %llu), err %ld", vi->nid, PTR_ERR(kaddr)); kfree(copied); - return kaddr; + return PTR_ERR(kaddr); } - *ofs = vi->inode_isize - gotten; - memcpy((u8 *)copied + gotten, kaddr, *ofs); + ofs = vi->inode_isize - gotten; + memcpy((u8 *)copied + gotten, kaddr, ofs); die = copied; } vi->xattr_isize = erofs_xattr_ibody_size(die->i_xattr_icount); @@ -91,11 +107,10 @@ static void *erofs_read_inode(struct erofs_buf *buf, inode->i_size = le64_to_cpu(die->i_size); kfree(copied); - copied = NULL; break; case EROFS_INODE_LAYOUT_COMPACT: vi->inode_isize = sizeof(struct erofs_inode_compact); - *ofs += vi->inode_isize; + ofs += vi->inode_isize; vi->xattr_isize = erofs_xattr_ibody_size(dic->i_xattr_icount); inode->i_mode = le16_to_cpu(dic->i_mode); @@ -115,11 +130,21 @@ static void *erofs_read_inode(struct erofs_buf *buf, goto err_out; } + if (unlikely(inode->i_size < 0)) { + erofs_err(sb, "negative i_size @ nid %llu", vi->nid); + err = -EFSCORRUPTED; + goto err_out; + } switch (inode->i_mode & S_IFMT) { case S_IFREG: case S_IFDIR: case S_IFLNK: vi->raw_blkaddr = le32_to_cpu(iu.raw_blkaddr); + if(S_ISLNK(inode->i_mode)) { + err = erofs_fill_symlink(inode, kaddr, ofs); + if (err) + goto err_out; + } break; case S_IFCHR: case S_IFBLK: @@ -165,65 +190,23 @@ static void *erofs_read_inode(struct erofs_buf *buf, inode->i_blocks = round_up(inode->i_size, sb->s_blocksize) >> 9; else inode->i_blocks = nblks << (sb->s_blocksize_bits - 9); - return kaddr; - err_out: - DBG_BUGON(1); - kfree(copied); - erofs_put_metabuf(buf); - return ERR_PTR(err); -} - -static int erofs_fill_symlink(struct inode *inode, void *kaddr, - unsigned int m_pofs) -{ - struct erofs_inode *vi = EROFS_I(inode); - unsigned int bsz = i_blocksize(inode); - char *lnk; - - /* if it cannot be handled with fast symlink scheme */ - if (vi->datalayout != EROFS_INODE_FLAT_INLINE || - inode->i_size >= bsz || inode->i_size < 0) { - inode->i_op = &erofs_symlink_iops; - return 0; - } - - lnk = kmalloc(inode->i_size + 1, GFP_KERNEL); - if (!lnk) - return -ENOMEM; - - m_pofs += vi->xattr_isize; - /* inline symlink data shouldn't cross block boundary */ - if (m_pofs + inode->i_size > bsz) { - kfree(lnk); - erofs_err(inode->i_sb, - "inline data cross block boundary @ nid %llu", - vi->nid); - DBG_BUGON(1); - return -EFSCORRUPTED; - } - memcpy(lnk, kaddr + m_pofs, inode->i_size); - lnk[inode->i_size] = '\0'; - - inode->i_link = lnk; - inode->i_op = &erofs_fast_symlink_iops; - return 0; + DBG_BUGON(err); + erofs_put_metabuf(&buf); + return err; } static int erofs_fill_inode(struct inode *inode) { struct erofs_inode *vi = EROFS_I(inode); - struct erofs_buf buf = __EROFS_BUF_INITIALIZER; - void *kaddr; - unsigned int ofs; - int err = 0; + int err; trace_erofs_fill_inode(inode); /* read inode base data from disk */ - kaddr = erofs_read_inode(&buf, inode, &ofs); - if (IS_ERR(kaddr)) - return PTR_ERR(kaddr); + err = erofs_read_inode(inode); + if (err) + return err; /* setup the new inode */ switch (inode->i_mode & S_IFMT) { @@ -240,9 +223,10 @@ static int erofs_fill_inode(struct inode *inode) inode_nohighmem(inode); break; case S_IFLNK: - err = erofs_fill_symlink(inode, kaddr, ofs); - if (err) - goto out_unlock; + if (inode->i_link) + inode->i_op = &erofs_fast_symlink_iops; + else + inode->i_op = &erofs_symlink_iops; inode_nohighmem(inode); break; case S_IFCHR: @@ -251,10 +235,9 @@ static int erofs_fill_inode(struct inode *inode) case S_IFSOCK: inode->i_op = &erofs_generic_iops; init_special_inode(inode, inode->i_mode, inode->i_rdev); - goto out_unlock; + return 0; default: - err = -EFSCORRUPTED; - goto out_unlock; + return -EFSCORRUPTED; } mapping_set_large_folios(inode->i_mapping); @@ -268,14 +251,17 @@ static int erofs_fill_inode(struct inode *inode) err = -EOPNOTSUPP; #endif } else { - inode->i_mapping->a_ops = &erofs_raw_access_aops; + inode->i_mapping->a_ops = &erofs_aops; #ifdef CONFIG_EROFS_FS_ONDEMAND if (erofs_is_fscache_mode(inode->i_sb)) inode->i_mapping->a_ops = &erofs_fscache_access_aops; +#endif +#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE + if (erofs_is_fileio_mode(EROFS_SB(inode->i_sb))) + inode->i_mapping->a_ops = &erofs_fileio_aops; #endif } -out_unlock: - erofs_put_metabuf(&buf); + return err; } diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 45dc15ebd870..4efd578d7c62 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -49,7 +49,7 @@ typedef u32 erofs_blk_t; struct erofs_device_info { char *path; struct erofs_fscache *fscache; - struct file *bdev_file; + struct file *file; struct dax_device *dax_dev; u64 dax_part_off; @@ -130,6 +130,7 @@ struct erofs_sb_info { struct erofs_sb_lz4_info lz4; #endif /* CONFIG_EROFS_FS_ZIP */ + struct file *fdev; struct inode *packed_inode; struct erofs_dev_context *devs; struct dax_device *dax_dev; @@ -190,9 +191,15 @@ struct erofs_sb_info { #define set_opt(opt, option) ((opt)->mount_opt |= EROFS_MOUNT_##option) #define test_opt(opt, option) ((opt)->mount_opt & EROFS_MOUNT_##option) +static inline bool erofs_is_fileio_mode(struct erofs_sb_info *sbi) +{ + return IS_ENABLED(CONFIG_EROFS_FS_BACKED_BY_FILE) && sbi->fdev; +} + static inline bool erofs_is_fscache_mode(struct super_block *sb) { - return IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && !sb->s_bdev; + return IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && + !erofs_is_fileio_mode(EROFS_SB(sb)) && !sb->s_bdev; } enum { @@ -365,6 +372,7 @@ struct erofs_map_dev { struct erofs_fscache *m_fscache; struct block_device *m_bdev; struct dax_device *m_daxdev; + struct file *m_fp; u64 m_dax_part_off; erofs_off_t m_pa; @@ -373,7 +381,8 @@ struct erofs_map_dev { extern const struct super_operations erofs_sops; -extern const struct address_space_operations erofs_raw_access_aops; +extern const struct address_space_operations erofs_aops; +extern const struct address_space_operations erofs_fileio_aops; extern const struct address_space_operations z_erofs_aops; extern const struct address_space_operations erofs_fscache_access_aops; @@ -404,6 +413,9 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev); int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map); +void erofs_onlinefolio_init(struct folio *folio); +void erofs_onlinefolio_split(struct folio *folio); +void erofs_onlinefolio_end(struct folio *folio, int err); struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid); int erofs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, @@ -477,6 +489,14 @@ static inline void z_erofs_exit_subsystem(void) {} static inline int erofs_init_managed_cache(struct super_block *sb) { return 0; } #endif /* !CONFIG_EROFS_FS_ZIP */ +#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE +struct bio *erofs_fileio_bio_alloc(struct erofs_map_dev *mdev); +void erofs_fileio_submit_bio(struct bio *bio); +#else +static inline struct bio *erofs_fileio_bio_alloc(struct erofs_map_dev *mdev) { return NULL; } +static inline void erofs_fileio_submit_bio(struct bio *bio) {} +#endif + #ifdef CONFIG_EROFS_FS_ONDEMAND int erofs_fscache_register_fs(struct super_block *sb); void erofs_fscache_unregister_fs(struct super_block *sb); diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 6cb5c8916174..666873f745da 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "xattr.h" #define CREATE_TRACE_POINTS @@ -161,7 +162,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_fscache *fscache; struct erofs_deviceslot *dis; - struct file *bdev_file; + struct file *file; dis = erofs_read_metabuf(buf, sb, *pos, EROFS_KMAP); if (IS_ERR(dis)) @@ -183,13 +184,17 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, return PTR_ERR(fscache); dif->fscache = fscache; } else if (!sbi->devs->flatdev) { - bdev_file = bdev_file_open_by_path(dif->path, BLK_OPEN_READ, - sb->s_type, NULL); - if (IS_ERR(bdev_file)) - return PTR_ERR(bdev_file); - dif->bdev_file = bdev_file; - dif->dax_dev = fs_dax_get_by_bdev(file_bdev(bdev_file), - &dif->dax_part_off, NULL, NULL); + file = erofs_is_fileio_mode(sbi) ? + filp_open(dif->path, O_RDONLY | O_LARGEFILE, 0) : + bdev_file_open_by_path(dif->path, + BLK_OPEN_READ, sb->s_type, NULL); + if (IS_ERR(file)) + return PTR_ERR(file); + + dif->file = file; + if (!erofs_is_fileio_mode(sbi)) + dif->dax_dev = fs_dax_get_by_bdev(file_bdev(file), + &dif->dax_part_off, NULL, NULL); } dif->blocks = le32_to_cpu(dis->blocks); @@ -348,7 +353,7 @@ static int erofs_read_superblock(struct super_block *sb) ret = erofs_scan_devices(sb, dsb); if (erofs_is_fscache_mode(sb)) - erofs_info(sb, "EXPERIMENTAL fscache-based on-demand read feature in use. Use at your own risk!"); + erofs_info(sb, "[deprecated] fscache-based on-demand read feature in use. Use at your own risk!"); out: erofs_put_metabuf(&buf); return ret; @@ -566,15 +571,16 @@ static void erofs_set_sysfs_name(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); - if (erofs_is_fscache_mode(sb)) { - if (sbi->domain_id) - super_set_sysfs_name_generic(sb, "%s,%s",sbi->domain_id, - sbi->fsid); - else - super_set_sysfs_name_generic(sb, "%s", sbi->fsid); - return; - } - super_set_sysfs_name_id(sb); + if (sbi->domain_id) + super_set_sysfs_name_generic(sb, "%s,%s", sbi->domain_id, + sbi->fsid); + else if (sbi->fsid) + super_set_sysfs_name_generic(sb, "%s", sbi->fsid); + else if (erofs_is_fileio_mode(sbi)) + super_set_sysfs_name_generic(sb, "%s", + bdi_dev_name(sb->s_bdi)); + else + super_set_sysfs_name_id(sb); } static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) @@ -589,14 +595,15 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_op = &erofs_sops; sbi->blkszbits = PAGE_SHIFT; - if (erofs_is_fscache_mode(sb)) { + if (!sb->s_bdev) { sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; - err = erofs_fscache_register_fs(sb); - if (err) - return err; - + if (erofs_is_fscache_mode(sb)) { + err = erofs_fscache_register_fs(sb); + if (err) + return err; + } err = super_setup_bdi(sb); if (err) return err; @@ -644,7 +651,6 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_flags |= SB_POSIXACL; else sb->s_flags &= ~SB_POSIXACL; - erofs_set_sysfs_name(sb); #ifdef CONFIG_EROFS_FS_ZIP xa_init(&sbi->managed_pslots); @@ -682,6 +688,7 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) if (err) return err; + erofs_set_sysfs_name(sb); err = erofs_register_sysfs(sb); if (err) return err; @@ -693,11 +700,24 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) static int erofs_fc_get_tree(struct fs_context *fc) { struct erofs_sb_info *sbi = fc->s_fs_info; + int ret; if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid) return get_tree_nodev(fc, erofs_fc_fill_super); - return get_tree_bdev(fc, erofs_fc_fill_super); + ret = get_tree_bdev(fc, erofs_fc_fill_super); +#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE + if (ret == -ENOTBLK) { + if (!fc->source) + return invalf(fc, "No source specified"); + sbi->fdev = filp_open(fc->source, O_RDONLY | O_LARGEFILE, 0); + if (IS_ERR(sbi->fdev)) + return PTR_ERR(sbi->fdev); + + return get_tree_nodev(fc, erofs_fc_fill_super); + } +#endif + return ret; } static int erofs_fc_reconfigure(struct fs_context *fc) @@ -727,8 +747,8 @@ static int erofs_release_device_info(int id, void *ptr, void *data) struct erofs_device_info *dif = ptr; fs_put_dax(dif->dax_dev, NULL); - if (dif->bdev_file) - fput(dif->bdev_file); + if (dif->file) + fput(dif->file); erofs_fscache_unregister_cookie(dif->fscache); dif->fscache = NULL; kfree(dif->path); @@ -791,7 +811,7 @@ static void erofs_kill_sb(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); - if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid) + if ((IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid) || sbi->fdev) kill_anon_super(sb); else kill_block_super(sb); @@ -801,6 +821,8 @@ static void erofs_kill_sb(struct super_block *sb) erofs_fscache_unregister_fs(sb); kfree(sbi->fsid); kfree(sbi->domain_id); + if (sbi->fdev) + fput(sbi->fdev); kfree(sbi); sb->s_fs_info = NULL; } @@ -903,7 +925,7 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_namelen = EROFS_NAME_LEN; if (uuid_is_null(&sb->s_uuid)) - buf->f_fsid = u64_to_fsid(erofs_is_fscache_mode(sb) ? 0 : + buf->f_fsid = u64_to_fsid(!sb->s_bdev ? 0 : huge_encode_dev(sb->s_bdev->bd_dev)); else buf->f_fsid = uuid_to_fsid(sb->s_uuid.b); diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c index 435e515c0792..63cffd0fd261 100644 --- a/fs/erofs/sysfs.c +++ b/fs/erofs/sysfs.c @@ -205,34 +205,16 @@ static struct kobject erofs_feat = { int erofs_register_sysfs(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); - char *name; - char *str = NULL; int err; - if (erofs_is_fscache_mode(sb)) { - if (sbi->domain_id) { - str = kasprintf(GFP_KERNEL, "%s,%s", sbi->domain_id, - sbi->fsid); - if (!str) - return -ENOMEM; - name = str; - } else { - name = sbi->fsid; - } - } else { - name = sb->s_id; - } sbi->s_kobj.kset = &erofs_root; init_completion(&sbi->s_kobj_unregister); - err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL, "%s", name); - kfree(str); - if (err) - goto put_sb_kobj; - return 0; - -put_sb_kobj: - kobject_put(&sbi->s_kobj); - wait_for_completion(&sbi->s_kobj_unregister); + err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL, "%s", + sb->s_sysfs_name); + if (err) { + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); + } return err; } diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 424f656cd765..8936790618c6 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -122,42 +122,6 @@ static bool erofs_folio_is_managed(struct erofs_sb_info *sbi, struct folio *fo) return fo->mapping == MNGD_MAPPING(sbi); } -/* - * bit 30: I/O error occurred on this folio - * bit 0 - 29: remaining parts to complete this folio - */ -#define Z_EROFS_FOLIO_EIO (1 << 30) - -static void z_erofs_onlinefolio_init(struct folio *folio) -{ - union { - atomic_t o; - void *v; - } u = { .o = ATOMIC_INIT(1) }; - - folio->private = u.v; /* valid only if file-backed folio is locked */ -} - -static void z_erofs_onlinefolio_split(struct folio *folio) -{ - atomic_inc((atomic_t *)&folio->private); -} - -static void z_erofs_onlinefolio_end(struct folio *folio, int err) -{ - int orig, v; - - do { - orig = atomic_read((atomic_t *)&folio->private); - v = (orig - 1) | (err ? Z_EROFS_FOLIO_EIO : 0); - } while (atomic_cmpxchg((atomic_t *)&folio->private, orig, v) != orig); - - if (v & ~Z_EROFS_FOLIO_EIO) - return; - folio->private = 0; - folio_end_read(folio, !(v & Z_EROFS_FOLIO_EIO)); -} - #define Z_EROFS_ONSTACK_PAGES 32 /* @@ -232,7 +196,8 @@ static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter, struct page *nextpage = *candidate_bvpage; if (!nextpage) { - nextpage = erofs_allocpage(pagepool, GFP_KERNEL); + nextpage = __erofs_allocpage(pagepool, GFP_KERNEL, + true); if (!nextpage) return -ENOMEM; set_page_private(nextpage, Z_EROFS_SHORTLIVED_PAGE); @@ -965,7 +930,7 @@ static int z_erofs_scan_folio(struct z_erofs_decompress_frontend *f, int err = 0; tight = (bs == PAGE_SIZE); - z_erofs_onlinefolio_init(folio); + erofs_onlinefolio_init(folio); do { if (offset + end - 1 < map->m_la || offset + end - 1 >= map->m_la + map->m_llen) { @@ -1024,7 +989,7 @@ static int z_erofs_scan_folio(struct z_erofs_decompress_frontend *f, if (err) break; - z_erofs_onlinefolio_split(folio); + erofs_onlinefolio_split(folio); if (f->pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) f->pcl->multibases = true; if (f->pcl->length < offset + end - map->m_la) { @@ -1044,7 +1009,7 @@ static int z_erofs_scan_folio(struct z_erofs_decompress_frontend *f, tight = (bs == PAGE_SIZE); } } while ((end = cur) > 0); - z_erofs_onlinefolio_end(folio, err); + erofs_onlinefolio_end(folio, err); return err; } @@ -1147,7 +1112,7 @@ static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be, cur += len; } kunmap_local(dst); - z_erofs_onlinefolio_end(page_folio(bvi->bvec.page), err); + erofs_onlinefolio_end(page_folio(bvi->bvec.page), err); list_del(p); kfree(bvi); } @@ -1190,9 +1155,10 @@ static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be, struct z_erofs_bvec *bvec = &pcl->compressed_bvecs[i]; struct page *page = bvec->page; - /* compressed data ought to be valid before decompressing */ - if (!page) { - err = -EIO; + /* compressed data ought to be valid when decompressing */ + if (IS_ERR(page) || !page) { + bvec->page = NULL; /* clear the failure reason */ + err = page ? PTR_ERR(page) : -EIO; continue; } be->compressed_pages[i] = page; @@ -1268,8 +1234,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, .inplace_io = overlapped, .partial_decoding = pcl->partial, .fillgaps = pcl->multibases, - .gfp = pcl->besteffort ? - GFP_KERNEL | __GFP_NOFAIL : + .gfp = pcl->besteffort ? GFP_KERNEL : GFP_NOWAIT | __GFP_NORETRY }, be->pagepool); @@ -1302,7 +1267,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, DBG_BUGON(z_erofs_page_is_invalidated(page)); if (!z_erofs_is_shortlived_page(page)) { - z_erofs_onlinefolio_end(page_folio(page), err); + erofs_onlinefolio_end(page_folio(page), err); continue; } if (pcl->algorithmformat != Z_EROFS_COMPRESSION_LZ4) { @@ -1333,8 +1298,8 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, return err; } -static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, - struct page **pagepool) +static int z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, + struct page **pagepool) { struct z_erofs_decompress_backend be = { .sb = io->sb, @@ -1343,6 +1308,7 @@ static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, LIST_HEAD_INIT(be.decompressed_secondary_bvecs), }; z_erofs_next_pcluster_t owned = io->head; + int err = io->eio ? -EIO : 0; while (owned != Z_EROFS_PCLUSTER_TAIL) { DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL); @@ -1350,12 +1316,13 @@ static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, be.pcl = container_of(owned, struct z_erofs_pcluster, next); owned = READ_ONCE(be.pcl->next); - z_erofs_decompress_pcluster(&be, io->eio ? -EIO : 0); + err = z_erofs_decompress_pcluster(&be, err) ?: err; if (z_erofs_is_inline_pcluster(be.pcl)) z_erofs_free_pcluster(be.pcl); else erofs_workgroup_put(&be.pcl->obj); } + return err; } static void z_erofs_decompressqueue_work(struct work_struct *work) @@ -1428,6 +1395,7 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec, struct z_erofs_bvec zbv; struct address_space *mapping; struct folio *folio; + struct page *page; int bs = i_blocksize(f->inode); /* Except for inplace folios, the entire folio can be used for I/Os */ @@ -1450,7 +1418,6 @@ repeat: * file-backed folios will be used instead. */ if (folio->private == (void *)Z_EROFS_PREALLOCATED_PAGE) { - folio->private = 0; tocache = true; goto out_tocache; } @@ -1468,7 +1435,7 @@ repeat: } folio_lock(folio); - if (folio->mapping == mc) { + if (likely(folio->mapping == mc)) { /* * The cached folio is still in managed cache but without * a valid `->private` pcluster hint. Let's reconnect them. @@ -1478,41 +1445,48 @@ repeat: /* compressed_bvecs[] already takes a ref before */ folio_put(folio); } - - /* no need to submit if it is already up-to-date */ - if (folio_test_uptodate(folio)) { - folio_unlock(folio); - bvec->bv_page = NULL; + if (likely(folio->private == pcl)) { + /* don't submit cache I/Os again if already uptodate */ + if (folio_test_uptodate(folio)) { + folio_unlock(folio); + bvec->bv_page = NULL; + } + return; } - return; + /* + * Already linked with another pcluster, which only appears in + * crafted images by fuzzers for now. But handle this anyway. + */ + tocache = false; /* use temporary short-lived pages */ + } else { + DBG_BUGON(1); /* referenced managed folios can't be truncated */ + tocache = true; } - - /* - * It has been truncated, so it's unsafe to reuse this one. Let's - * allocate a new page for compressed data. - */ - DBG_BUGON(folio->mapping); - tocache = true; folio_unlock(folio); folio_put(folio); out_allocfolio: - zbv.page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL); + page = __erofs_allocpage(&f->pagepool, gfp, true); spin_lock(&pcl->obj.lockref.lock); - if (pcl->compressed_bvecs[nr].page) { - erofs_pagepool_add(&f->pagepool, zbv.page); + if (unlikely(pcl->compressed_bvecs[nr].page != zbv.page)) { + if (page) + erofs_pagepool_add(&f->pagepool, page); spin_unlock(&pcl->obj.lockref.lock); cond_resched(); goto repeat; } - bvec->bv_page = pcl->compressed_bvecs[nr].page = zbv.page; - folio = page_folio(zbv.page); - /* first mark it as a temporary shortlived folio (now 1 ref) */ - folio->private = (void *)Z_EROFS_SHORTLIVED_PAGE; + pcl->compressed_bvecs[nr].page = page ? page : ERR_PTR(-ENOMEM); spin_unlock(&pcl->obj.lockref.lock); + bvec->bv_page = page; + if (!page) + return; + folio = page_folio(page); out_tocache: if (!tocache || bs != PAGE_SIZE || - filemap_add_folio(mc, folio, pcl->obj.index + nr, gfp)) + filemap_add_folio(mc, folio, pcl->obj.index + nr, gfp)) { + /* turn into a temporary shortlived folio (1 ref) */ + folio->private = (void *)Z_EROFS_SHORTLIVED_PAGE; return; + } folio_attach_private(folio, pcl); /* drop a refcount added by allocpage (then 2 refs in total here) */ folio_put(folio); @@ -1647,17 +1621,16 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, cur = mdev.m_pa; end = cur + pcl->pclustersize; do { - z_erofs_fill_bio_vec(&bvec, f, pcl, i++, mc); - if (!bvec.bv_page) - continue; - + bvec.bv_page = NULL; if (bio && (cur != last_pa || bio->bi_bdev != mdev.m_bdev)) { -io_retry: - if (!erofs_is_fscache_mode(sb)) - submit_bio(bio); - else +drain_io: + if (erofs_is_fileio_mode(EROFS_SB(sb))) + erofs_fileio_submit_bio(bio); + else if (erofs_is_fscache_mode(sb)) erofs_fscache_submit_bio(bio); + else + submit_bio(bio); if (memstall) { psi_memstall_leave(&pflags); @@ -1666,6 +1639,15 @@ io_retry: bio = NULL; } + if (!bvec.bv_page) { + z_erofs_fill_bio_vec(&bvec, f, pcl, i++, mc); + if (!bvec.bv_page) + continue; + if (cur + bvec.bv_len > end) + bvec.bv_len = end - cur; + DBG_BUGON(bvec.bv_len < sb->s_blocksize); + } + if (unlikely(PageWorkingset(bvec.bv_page)) && !memstall) { psi_memstall_enter(&pflags); @@ -1673,10 +1655,13 @@ io_retry: } if (!bio) { - bio = erofs_is_fscache_mode(sb) ? - erofs_fscache_bio_alloc(&mdev) : - bio_alloc(mdev.m_bdev, BIO_MAX_VECS, - REQ_OP_READ, GFP_NOIO); + if (erofs_is_fileio_mode(EROFS_SB(sb))) + bio = erofs_fileio_bio_alloc(&mdev); + else if (erofs_is_fscache_mode(sb)) + bio = erofs_fscache_bio_alloc(&mdev); + else + bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS, + REQ_OP_READ, GFP_NOIO); bio->bi_end_io = z_erofs_endio; bio->bi_iter.bi_sector = cur >> 9; bio->bi_private = q[JQ_SUBMIT]; @@ -1685,13 +1670,9 @@ io_retry: ++nr_bios; } - if (cur + bvec.bv_len > end) - bvec.bv_len = end - cur; - DBG_BUGON(bvec.bv_len < sb->s_blocksize); if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len, bvec.bv_offset)) - goto io_retry; - + goto drain_io; last_pa = cur + bvec.bv_len; bypass = false; } while ((cur += bvec.bv_len) < end); @@ -1703,10 +1684,12 @@ io_retry: } while (owned_head != Z_EROFS_PCLUSTER_TAIL); if (bio) { - if (!erofs_is_fscache_mode(sb)) - submit_bio(bio); - else + if (erofs_is_fileio_mode(EROFS_SB(sb))) + erofs_fileio_submit_bio(bio); + else if (erofs_is_fscache_mode(sb)) erofs_fscache_submit_bio(bio); + else + submit_bio(bio); if (memstall) psi_memstall_leave(&pflags); } @@ -1722,26 +1705,28 @@ io_retry: z_erofs_decompress_kickoff(q[JQ_SUBMIT], nr_bios); } -static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, - bool force_fg, bool ra) +static int z_erofs_runqueue(struct z_erofs_decompress_frontend *f, + unsigned int ra_folios) { struct z_erofs_decompressqueue io[NR_JOBQUEUES]; + struct erofs_sb_info *sbi = EROFS_I_SB(f->inode); + bool force_fg = z_erofs_is_sync_decompress(sbi, ra_folios); + int err; if (f->owned_head == Z_EROFS_PCLUSTER_TAIL) - return; - z_erofs_submit_queue(f, io, &force_fg, ra); + return 0; + z_erofs_submit_queue(f, io, &force_fg, !!ra_folios); /* handle bypass queue (no i/o pclusters) immediately */ - z_erofs_decompress_queue(&io[JQ_BYPASS], &f->pagepool); - + err = z_erofs_decompress_queue(&io[JQ_BYPASS], &f->pagepool); if (!force_fg) - return; + return err; /* wait until all bios are completed */ wait_for_completion_io(&io[JQ_SUBMIT].u.done); /* handle synchronous decompress queue in the caller context */ - z_erofs_decompress_queue(&io[JQ_SUBMIT], &f->pagepool); + return z_erofs_decompress_queue(&io[JQ_SUBMIT], &f->pagepool) ?: err; } /* @@ -1803,7 +1788,6 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, static int z_erofs_read_folio(struct file *file, struct folio *folio) { struct inode *const inode = folio->mapping->host; - struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); int err; @@ -1815,9 +1799,8 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio) z_erofs_pcluster_readmore(&f, NULL, false); z_erofs_pcluster_end(&f); - /* if some compressed cluster ready, need submit them anyway */ - z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, 0), false); - + /* if some pclusters are ready, need submit them anyway */ + err = z_erofs_runqueue(&f, 0) ?: err; if (err && err != -EINTR) erofs_err(inode->i_sb, "read error %d @ %lu of nid %llu", err, folio->index, EROFS_I(inode)->nid); @@ -1830,7 +1813,6 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio) static void z_erofs_readahead(struct readahead_control *rac) { struct inode *const inode = rac->mapping->host; - struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); struct folio *head = NULL, *folio; unsigned int nr_folios; @@ -1860,7 +1842,7 @@ static void z_erofs_readahead(struct readahead_control *rac) z_erofs_pcluster_readmore(&f, rac, false); z_erofs_pcluster_end(&f); - z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, nr_folios), true); + (void)z_erofs_runqueue(&f, nr_folios); erofs_put_metabuf(&f.map.buf); erofs_release_pages(&f.pagepool); } diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 403af6e31d5b..e980e29873a5 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -687,32 +687,30 @@ int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, int err = 0; trace_erofs_map_blocks_enter(inode, map, flags); - - /* when trying to read beyond EOF, leave it unmapped */ - if (map->m_la >= inode->i_size) { + if (map->m_la >= inode->i_size) { /* post-EOF unmapped extent */ map->m_llen = map->m_la + 1 - inode->i_size; map->m_la = inode->i_size; map->m_flags = 0; - goto out; + } else { + err = z_erofs_fill_inode_lazy(inode); + if (!err) { + if ((vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER) && + !vi->z_tailextent_headlcn) { + map->m_la = 0; + map->m_llen = inode->i_size; + map->m_flags = EROFS_MAP_MAPPED | + EROFS_MAP_FULL_MAPPED | EROFS_MAP_FRAGMENT; + } else { + err = z_erofs_do_map_blocks(inode, map, flags); + } + } + if (!err && (map->m_flags & EROFS_MAP_ENCODED) && + unlikely(map->m_plen > Z_EROFS_PCLUSTER_MAX_SIZE || + map->m_llen > Z_EROFS_PCLUSTER_MAX_DSIZE)) + err = -EOPNOTSUPP; + if (err) + map->m_llen = 0; } - - err = z_erofs_fill_inode_lazy(inode); - if (err) - goto out; - - if ((vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER) && - !vi->z_tailextent_headlcn) { - map->m_la = 0; - map->m_llen = inode->i_size; - map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_FULL_MAPPED | - EROFS_MAP_FRAGMENT; - goto out; - } - - err = z_erofs_do_map_blocks(inode, map, flags); -out: - if (err) - map->m_llen = 0; trace_erofs_map_blocks_exit(inode, map, flags, err); return err; } diff --git a/include/linux/bio.h b/include/linux/bio.h index a46e2047bea4..faceadb040f9 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -324,8 +324,8 @@ static inline void bio_next_folio(struct folio_iter *fi, struct bio *bio) void bio_trim(struct bio *bio, sector_t offset, sector_t size); extern struct bio *bio_split(struct bio *bio, int sectors, gfp_t gfp, struct bio_set *bs); -struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, - unsigned *segs, struct bio_set *bs, unsigned max_bytes); +int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim, + unsigned *segs, unsigned max_bytes); /** * bio_next_split - get next @sectors from a bio, splitting if necessary diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b7664d593486..643c9020a35a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1187,7 +1187,8 @@ static inline unsigned int queue_max_segment_size(const struct request_queue *q) return q->limits.max_segment_size; } -static inline unsigned int queue_limits_max_zone_append_sectors(struct queue_limits *l) +static inline unsigned int +queue_limits_max_zone_append_sectors(const struct queue_limits *l) { unsigned int max_sectors = min(l->chunk_sectors, l->max_hw_sectors); diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 447fbfd32215..c189d36ad55e 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -23,6 +23,15 @@ static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) return sqe->cmd; } +static inline void io_uring_cmd_private_sz_check(size_t cmd_sz) +{ + BUILD_BUG_ON(cmd_sz > sizeof_field(struct io_uring_cmd, pdu)); +} +#define io_uring_cmd_to_pdu(cmd, pdu_type) ( \ + io_uring_cmd_private_sz_check(sizeof(pdu_type)), \ + ((pdu_type *)&(cmd)->pdu) \ +) + #if defined(CONFIG_IO_URING) int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, struct iov_iter *iter, void *ioucmd); @@ -48,6 +57,9 @@ void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, unsigned int issue_flags); +/* Execute the request from a blocking context */ +void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd); + #else static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, struct iov_iter *iter, void *ioucmd) @@ -67,6 +79,9 @@ static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, unsigned int issue_flags) { } +static inline void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd) +{ +} #endif /* diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 3315005df117..4b9ba523978d 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -239,6 +239,9 @@ struct io_ring_ctx { struct io_rings *rings; struct percpu_ref refs; + clockid_t clockid; + enum tk_offsets clock_offset; + enum task_work_notify_mode notify_method; unsigned sq_thread_idle; } ____cacheline_aligned_in_smp; diff --git a/include/linux/mm.h b/include/linux/mm.h index 3065972c94ba..3066af38e94e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1605,6 +1605,7 @@ void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, bool make_dirty); void unpin_user_pages(struct page **pages, unsigned long npages); +void unpin_user_folio(struct folio *folio, unsigned long npages); void unpin_folios(struct folio **folios, unsigned long nfolios); static inline bool is_cow_mapping(vm_flags_t flags) diff --git a/include/linux/net.h b/include/linux/net.h index 688320b79fcc..b75bc534c1b3 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -322,6 +322,25 @@ static inline bool sendpage_ok(struct page *page) return !PageSlab(page) && page_count(page) >= 1; } +/* + * Check sendpage_ok on contiguous pages. + */ +static inline bool sendpages_ok(struct page *page, size_t len, size_t offset) +{ + struct page *p = page + (offset >> PAGE_SHIFT); + size_t count = 0; + + while (count < len) { + if (!sendpage_ok(p)) + return false; + + p++; + count += PAGE_SIZE; + } + + return true; +} + int kernel_sendmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t len); int kernel_sendmsg_locked(struct sock *sk, struct msghdr *msg, diff --git a/include/linux/nvme-keyring.h b/include/linux/nvme-keyring.h index e10333d78dbb..19d2b256180f 100644 --- a/include/linux/nvme-keyring.h +++ b/include/linux/nvme-keyring.h @@ -12,7 +12,7 @@ key_serial_t nvme_tls_psk_default(struct key *keyring, const char *hostnqn, const char *subnqn); key_serial_t nvme_keyring_id(void); - +struct key *nvme_tls_key_lookup(key_serial_t key_id); #else static inline key_serial_t nvme_tls_psk_default(struct key *keyring, @@ -24,5 +24,9 @@ static inline key_serial_t nvme_keyring_id(void) { return 0; } +static inline struct key *nvme_tls_key_lookup(key_serial_t key_id) +{ + return ERR_PTR(-ENOTSUPP); +} #endif /* !CONFIG_NVME_KEYRING */ #endif /* _NVME_KEYRING_H */ diff --git a/include/linux/nvme-rdma.h b/include/linux/nvme-rdma.h index eb2f04d636c8..97c5f00b9aa3 100644 --- a/include/linux/nvme-rdma.h +++ b/include/linux/nvme-rdma.h @@ -25,6 +25,7 @@ enum nvme_rdma_cm_status { NVME_RDMA_CM_NO_RSC = 0x06, NVME_RDMA_CM_INVALID_IRD = 0x07, NVME_RDMA_CM_INVALID_ORD = 0x08, + NVME_RDMA_CM_INVALID_CNTLID = 0x09, }; static inline const char *nvme_rdma_cm_msg(enum nvme_rdma_cm_status status) @@ -46,6 +47,8 @@ static inline const char *nvme_rdma_cm_msg(enum nvme_rdma_cm_status status) return "invalid IRD"; case NVME_RDMA_CM_INVALID_ORD: return "Invalid ORD"; + case NVME_RDMA_CM_INVALID_CNTLID: + return "invalid controller ID"; default: return "unrecognized reason"; } @@ -64,7 +67,8 @@ struct nvme_rdma_cm_req { __le16 qid; __le16 hrqsize; __le16 hsqsize; - u8 rsvd[24]; + __le16 cntlid; + u8 rsvd[22]; }; /** diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 7b2ae2e43544..b58d9405d65e 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -987,8 +987,8 @@ struct nvme_rw_command { __le16 control; __le32 dsmgmt; __le32 reftag; - __le16 apptag; - __le16 appmask; + __le16 lbat; + __le16 lbatm; }; enum { @@ -1057,8 +1057,8 @@ struct nvme_write_zeroes_cmd { __le16 control; __le32 dsmgmt; __le32 reftag; - __le16 apptag; - __le16 appmask; + __le16 lbat; + __le16 lbatm; }; enum nvme_zone_mgmt_action { diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index d9c7edb6422b..e39c3a7ce33c 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -32,6 +32,8 @@ int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end); int kiocb_invalidate_pages(struct kiocb *iocb, size_t count); void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count); +int filemap_invalidate_pages(struct address_space *mapping, + loff_t pos, loff_t end, bool nowait); int write_inode_now(struct inode *, int sync); int filemap_fdatawrite(struct address_space *); diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 0a523023bdcc..bf60ad50011e 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -674,10 +674,10 @@ TRACE_EVENT(btrfs_finish_ordered_extent, DECLARE_EVENT_CLASS(btrfs__writepage, - TP_PROTO(const struct page *page, const struct inode *inode, + TP_PROTO(const struct folio *folio, const struct inode *inode, const struct writeback_control *wbc), - TP_ARGS(page, inode, wbc), + TP_ARGS(folio, inode, wbc), TP_STRUCT__entry_btrfs( __field( u64, ino ) @@ -695,7 +695,7 @@ DECLARE_EVENT_CLASS(btrfs__writepage, TP_fast_assign_btrfs(btrfs_sb(inode->i_sb), __entry->ino = btrfs_ino(BTRFS_I(inode)); - __entry->index = page->index; + __entry->index = folio->index; __entry->nr_to_write = wbc->nr_to_write; __entry->pages_skipped = wbc->pages_skipped; __entry->range_start = wbc->range_start; @@ -721,12 +721,12 @@ DECLARE_EVENT_CLASS(btrfs__writepage, __entry->writeback_index) ); -DEFINE_EVENT(btrfs__writepage, __extent_writepage, +DEFINE_EVENT(btrfs__writepage, extent_writepage, - TP_PROTO(const struct page *page, const struct inode *inode, + TP_PROTO(const struct folio *folio, const struct inode *inode, const struct writeback_control *wbc), - TP_ARGS(page, inode, wbc) + TP_ARGS(folio, inode, wbc) ); TRACE_EVENT(btrfs_writepage_end_io_hook, @@ -1825,7 +1825,7 @@ TRACE_EVENT(qgroup_update_counters, TRACE_EVENT(qgroup_update_reserve, - TP_PROTO(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup, + TP_PROTO(const struct btrfs_fs_info *fs_info, const struct btrfs_qgroup *qgroup, s64 diff, int type), TP_ARGS(fs_info, qgroup, diff, type), @@ -1851,7 +1851,7 @@ TRACE_EVENT(qgroup_update_reserve, TRACE_EVENT(qgroup_meta_reserve, - TP_PROTO(struct btrfs_root *root, s64 diff, int type), + TP_PROTO(const struct btrfs_root *root, s64 diff, int type), TP_ARGS(root, diff, type), @@ -1874,7 +1874,7 @@ TRACE_EVENT(qgroup_meta_reserve, TRACE_EVENT(qgroup_meta_convert, - TP_PROTO(struct btrfs_root *root, s64 diff), + TP_PROTO(const struct btrfs_root *root, s64 diff), TP_ARGS(root, diff), diff --git a/include/uapi/linux/blkdev.h b/include/uapi/linux/blkdev.h new file mode 100644 index 000000000000..66373cd1a83a --- /dev/null +++ b/include/uapi/linux/blkdev.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_BLKDEV_H +#define _UAPI_LINUX_BLKDEV_H + +#include +#include + +/* + * io_uring block file commands, see IORING_OP_URING_CMD. + * It's a different number space from ioctl(), reuse the block's code 0x12. + */ +#define BLOCK_URING_CMD_DISCARD _IO(0x12, 0) + +#endif diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index adc2524fd8e3..9dc5bb428c8a 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -440,11 +440,21 @@ struct io_uring_cqe { * IORING_CQE_F_SOCK_NONEMPTY If set, more data to read after socket recv * IORING_CQE_F_NOTIF Set for notification CQEs. Can be used to distinct * them from sends. + * IORING_CQE_F_BUF_MORE If set, the buffer ID set in the completion will get + * more completions. In other words, the buffer is being + * partially consumed, and will be used by the kernel for + * more completions. This is only set for buffers used via + * the incremental buffer consumption, as provided by + * a ring buffer setup with IOU_PBUF_RING_INC. For any + * other provided buffer type, all completions with a + * buffer passed back is automatically returned to the + * application. */ #define IORING_CQE_F_BUFFER (1U << 0) #define IORING_CQE_F_MORE (1U << 1) #define IORING_CQE_F_SOCK_NONEMPTY (1U << 2) #define IORING_CQE_F_NOTIF (1U << 3) +#define IORING_CQE_F_BUF_MORE (1U << 4) #define IORING_CQE_BUFFER_SHIFT 16 @@ -507,6 +517,7 @@ struct io_cqring_offsets { #define IORING_ENTER_SQ_WAIT (1U << 2) #define IORING_ENTER_EXT_ARG (1U << 3) #define IORING_ENTER_REGISTERED_RING (1U << 4) +#define IORING_ENTER_ABS_TIMER (1U << 5) /* * Passed in for io_uring_setup(2). Copied back with updated info on success @@ -542,6 +553,7 @@ struct io_uring_params { #define IORING_FEAT_LINKED_FILE (1U << 12) #define IORING_FEAT_REG_REG_RING (1U << 13) #define IORING_FEAT_RECVSEND_BUNDLE (1U << 14) +#define IORING_FEAT_MIN_TIMEOUT (1U << 15) /* * io_uring_register(2) opcodes and arguments @@ -595,6 +607,11 @@ enum io_uring_register_op { IORING_REGISTER_NAPI = 27, IORING_UNREGISTER_NAPI = 28, + IORING_REGISTER_CLOCK = 29, + + /* copy registered buffers from source ring to current ring */ + IORING_REGISTER_COPY_BUFFERS = 30, + /* this goes last */ IORING_REGISTER_LAST, @@ -675,6 +692,21 @@ struct io_uring_restriction { __u32 resv2[3]; }; +struct io_uring_clock_register { + __u32 clockid; + __u32 __resv[3]; +}; + +enum { + IORING_REGISTER_SRC_REGISTERED = 1, +}; + +struct io_uring_copy_buffers { + __u32 src_fd; + __u32 flags; + __u32 pad[6]; +}; + struct io_uring_buf { __u64 addr; __u32 len; @@ -707,9 +739,17 @@ struct io_uring_buf_ring { * mmap(2) with the offset set as: * IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT) * to get a virtual mapping for the ring. + * IOU_PBUF_RING_INC: If set, buffers consumed from this buffer ring can be + * consumed incrementally. Normally one (or more) buffers + * are fully consumed. With incremental consumptions, it's + * feasible to register big ranges of buffers, and each + * use of it will consume only as much as it needs. This + * requires that both the kernel and application keep + * track of where the current read/recv index is at. */ enum io_uring_register_pbuf_ring_flags { IOU_PBUF_RING_MMAP = 1, + IOU_PBUF_RING_INC = 2, }; /* argument for IORING_(UN)REGISTER_PBUF_RING */ @@ -758,7 +798,7 @@ enum io_uring_register_restriction_op { struct io_uring_getevents_arg { __u64 sigmask; __u32 sigmask_sz; - __u32 pad; + __u32 min_wait_usec; __u64 ts; }; diff --git a/include/uapi/linux/nbd.h b/include/uapi/linux/nbd.h index 80ce0ef43afd..f1d468acfb25 100644 --- a/include/uapi/linux/nbd.h +++ b/include/uapi/linux/nbd.h @@ -42,8 +42,9 @@ enum { NBD_CMD_WRITE = 1, NBD_CMD_DISC = 2, NBD_CMD_FLUSH = 3, - NBD_CMD_TRIM = 4 + NBD_CMD_TRIM = 4, /* userspace defines additional extension commands */ + NBD_CMD_WRITE_ZEROES = 6, }; /* values for flags field, these are server interaction specific. */ @@ -51,12 +52,15 @@ enum { #define NBD_FLAG_READ_ONLY (1 << 1) /* device is read-only */ #define NBD_FLAG_SEND_FLUSH (1 << 2) /* can flush writeback cache */ #define NBD_FLAG_SEND_FUA (1 << 3) /* send FUA (forced unit access) */ -/* there is a gap here to match userspace */ +#define NBD_FLAG_ROTATIONAL (1 << 4) /* device is rotational */ #define NBD_FLAG_SEND_TRIM (1 << 5) /* send trim/discard */ +#define NBD_FLAG_SEND_WRITE_ZEROES (1 << 6) /* supports WRITE_ZEROES */ +/* there is a gap here to match userspace */ #define NBD_FLAG_CAN_MULTI_CONN (1 << 8) /* Server supports multiple connections per export. */ /* values for cmd flags in the upper 16 bits of request type */ #define NBD_CMD_FLAG_FUA (1 << 16) /* FUA (forced unit access) op */ +#define NBD_CMD_FLAG_NO_HOLE (1 << 17) /* Do not punch a hole for WRITE_ZEROES */ /* These are client behavior specific flags. */ #define NBD_CFLAG_DESTROY_ON_DISCONNECT (1 << 0) /* delete the nbd device on diff --git a/init/Kconfig b/init/Kconfig index 80ec1fb61aef..17dad213dfff 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1687,6 +1687,19 @@ config IO_URING applications to submit and complete IO through submission and completion rings that are shared between the kernel and application. +config GCOV_PROFILE_URING + bool "Enable GCOV profiling on the io_uring subsystem" + depends on GCOV_KERNEL + help + Enable GCOV profiling on the io_uring subsystem, to facilitate + code coverage testing. + + If unsure, say N. + + Note that this will have a negative impact on the performance of + the io_uring subsystem, hence this should only be enabled for + specific test purposes. + config ADVISE_SYSCALLS bool "Enable madvise/fadvise syscalls" if EXPERT default y diff --git a/io_uring/Makefile b/io_uring/Makefile index 61923e11c767..53167bef37d7 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -2,6 +2,10 @@ # # Makefile for io_uring +ifdef CONFIG_GCOV_PROFILE_URING +GCOV_PROFILE := y +endif + obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ tctx.o filetable.o rw.o net.o poll.o \ eventfd.o uring_cmd.o openclose.o \ diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c index b9384503a2b7..e37fddd5d9ce 100644 --- a/io_uring/eventfd.c +++ b/io_uring/eventfd.c @@ -15,7 +15,7 @@ struct io_ev_fd { struct eventfd_ctx *cq_ev_fd; unsigned int eventfd_async: 1; struct rcu_head rcu; - atomic_t refs; + refcount_t refs; atomic_t ops; }; @@ -37,7 +37,7 @@ static void io_eventfd_do_signal(struct rcu_head *rcu) eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); - if (atomic_dec_and_test(&ev_fd->refs)) + if (refcount_dec_and_test(&ev_fd->refs)) io_eventfd_free(rcu); } @@ -63,7 +63,7 @@ void io_eventfd_signal(struct io_ring_ctx *ctx) */ if (unlikely(!ev_fd)) return; - if (!atomic_inc_not_zero(&ev_fd->refs)) + if (!refcount_inc_not_zero(&ev_fd->refs)) return; if (ev_fd->eventfd_async && !io_wq_current_is_worker()) goto out; @@ -77,7 +77,7 @@ void io_eventfd_signal(struct io_ring_ctx *ctx) } } out: - if (atomic_dec_and_test(&ev_fd->refs)) + if (refcount_dec_and_test(&ev_fd->refs)) call_rcu(&ev_fd->rcu, io_eventfd_free); } @@ -126,6 +126,7 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd); if (IS_ERR(ev_fd->cq_ev_fd)) { int ret = PTR_ERR(ev_fd->cq_ev_fd); + kfree(ev_fd); return ret; } @@ -136,7 +137,7 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, ev_fd->eventfd_async = eventfd_async; ctx->has_evfd = true; - atomic_set(&ev_fd->refs, 1); + refcount_set(&ev_fd->refs, 1); atomic_set(&ev_fd->ops, 0); rcu_assign_pointer(ctx->io_ev_fd, ev_fd); return 0; @@ -151,7 +152,7 @@ int io_eventfd_unregister(struct io_ring_ctx *ctx) if (ev_fd) { ctx->has_evfd = false; rcu_assign_pointer(ctx->io_ev_fd, NULL); - if (atomic_dec_and_test(&ev_fd->refs)) + if (refcount_dec_and_test(&ev_fd->refs)) call_rcu(&ev_fd->rcu, io_eventfd_free); return 0; } diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index b1e0e0d85349..d43e1b5fcb36 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -221,7 +221,19 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) cqe->user_data, cqe->res, cqe->flags); } - spin_unlock(&ctx->completion_lock); + +#ifdef CONFIG_NET_RX_BUSY_POLL + if (ctx->napi_enabled) { + seq_puts(m, "NAPI:\tenabled\n"); + seq_printf(m, "napi_busy_poll_dt:\t%llu\n", ctx->napi_busy_poll_dt); + if (ctx->napi_prefer_busy_poll) + seq_puts(m, "napi_prefer_busy_poll:\ttrue\n"); + else + seq_puts(m, "napi_prefer_busy_poll:\tfalse\n"); + } else { + seq_puts(m, "NAPI:\tdisabled\n"); + } +#endif } #endif diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index f1e7c670add8..a38f36b68060 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -1167,7 +1168,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) if (!alloc_cpumask_var(&wq->cpu_mask, GFP_KERNEL)) goto err; - cpumask_copy(wq->cpu_mask, cpu_possible_mask); + cpuset_cpus_allowed(data->task, wq->cpu_mask); wq->acct[IO_WQ_ACCT_BOUND].max_workers = bounded; wq->acct[IO_WQ_ACCT_UNBOUND].max_workers = task_rlimit(current, RLIMIT_NPROC); @@ -1322,17 +1323,29 @@ static int io_wq_cpu_offline(unsigned int cpu, struct hlist_node *node) int io_wq_cpu_affinity(struct io_uring_task *tctx, cpumask_var_t mask) { + cpumask_var_t allowed_mask; + int ret = 0; + if (!tctx || !tctx->io_wq) return -EINVAL; + if (!alloc_cpumask_var(&allowed_mask, GFP_KERNEL)) + return -ENOMEM; + rcu_read_lock(); - if (mask) - cpumask_copy(tctx->io_wq->cpu_mask, mask); - else - cpumask_copy(tctx->io_wq->cpu_mask, cpu_possible_mask); + cpuset_cpus_allowed(tctx->io_wq->task, allowed_mask); + if (mask) { + if (cpumask_subset(mask, allowed_mask)) + cpumask_copy(tctx->io_wq->cpu_mask, mask); + else + ret = -EINVAL; + } else { + cpumask_copy(tctx->io_wq->cpu_mask, allowed_mask); + } rcu_read_unlock(); - return 0; + free_cpumask_var(allowed_mask); + return ret; } /* diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 3942db160f18..86cf31902841 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -533,6 +533,17 @@ static void io_queue_iowq(struct io_kiocb *req) io_queue_linked_timeout(link); } +static void io_req_queue_iowq_tw(struct io_kiocb *req, struct io_tw_state *ts) +{ + io_queue_iowq(req); +} + +void io_req_queue_iowq(struct io_kiocb *req) +{ + req->io_task_work.func = io_req_queue_iowq_tw; + io_req_task_work_add(req); +} + static __cold void io_queue_deferred(struct io_ring_ctx *ctx) { while (!list_empty(&ctx->defer_list)) { @@ -904,7 +915,7 @@ void io_req_defer_failed(struct io_kiocb *req, s32 res) lockdep_assert_held(&req->ctx->uring_lock); req_set_fail(req); - io_req_set_res(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED)); + io_req_set_res(req, res, io_put_kbuf(req, res, IO_URING_F_UNLOCKED)); if (def->fail) def->fail(req); io_req_complete_defer(req); @@ -2350,12 +2361,113 @@ static bool current_pending_io(void) return percpu_counter_read_positive(&tctx->inflight); } -/* when returns >0, the caller should retry */ -static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, - struct io_wait_queue *iowq) +static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer) { - int ret; + struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t); + WRITE_ONCE(iowq->hit_timeout, 1); + iowq->min_timeout = 0; + wake_up_process(iowq->wq.private); + return HRTIMER_NORESTART; +} + +/* + * Doing min_timeout portion. If we saw any timeouts, events, or have work, + * wake up. If not, and we have a normal timeout, switch to that and keep + * sleeping. + */ +static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer) +{ + struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t); + struct io_ring_ctx *ctx = iowq->ctx; + + /* no general timeout, or shorter (or equal), we are done */ + if (iowq->timeout == KTIME_MAX || + ktime_compare(iowq->min_timeout, iowq->timeout) >= 0) + goto out_wake; + /* work we may need to run, wake function will see if we need to wake */ + if (io_has_work(ctx)) + goto out_wake; + /* got events since we started waiting, min timeout is done */ + if (iowq->cq_min_tail != READ_ONCE(ctx->rings->cq.tail)) + goto out_wake; + /* if we have any events and min timeout expired, we're done */ + if (io_cqring_events(ctx)) + goto out_wake; + + /* + * If using deferred task_work running and application is waiting on + * more than one request, ensure we reset it now where we are switching + * to normal sleeps. Any request completion post min_wait should wake + * the task and return. + */ + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { + atomic_set(&ctx->cq_wait_nr, 1); + smp_mb(); + if (!llist_empty(&ctx->work_llist)) + goto out_wake; + } + + iowq->t.function = io_cqring_timer_wakeup; + hrtimer_set_expires(timer, iowq->timeout); + return HRTIMER_RESTART; +out_wake: + return io_cqring_timer_wakeup(timer); +} + +static int io_cqring_schedule_timeout(struct io_wait_queue *iowq, + clockid_t clock_id, ktime_t start_time) +{ + ktime_t timeout; + + hrtimer_init_on_stack(&iowq->t, clock_id, HRTIMER_MODE_ABS); + if (iowq->min_timeout) { + timeout = ktime_add_ns(iowq->min_timeout, start_time); + iowq->t.function = io_cqring_min_timer_wakeup; + } else { + timeout = iowq->timeout; + iowq->t.function = io_cqring_timer_wakeup; + } + + hrtimer_set_expires_range_ns(&iowq->t, timeout, 0); + hrtimer_start_expires(&iowq->t, HRTIMER_MODE_ABS); + + if (!READ_ONCE(iowq->hit_timeout)) + schedule(); + + hrtimer_cancel(&iowq->t); + destroy_hrtimer_on_stack(&iowq->t); + __set_current_state(TASK_RUNNING); + + return READ_ONCE(iowq->hit_timeout) ? -ETIME : 0; +} + +static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx, + struct io_wait_queue *iowq, + ktime_t start_time) +{ + int ret = 0; + + /* + * Mark us as being in io_wait if we have pending requests, so cpufreq + * can take into account that the task is waiting for IO - turns out + * to be important for low QD IO. + */ + if (current_pending_io()) + current->in_iowait = 1; + if (iowq->timeout != KTIME_MAX || iowq->min_timeout) + ret = io_cqring_schedule_timeout(iowq, ctx->clockid, start_time); + else + schedule(); + current->in_iowait = 0; + return ret; +} + +/* If this returns > 0, the caller should retry */ +static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, + struct io_wait_queue *iowq, + ktime_t start_time) +{ if (unlikely(READ_ONCE(ctx->check_cq))) return 1; if (unlikely(!llist_empty(&ctx->work_llist))) @@ -2367,32 +2479,26 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, if (unlikely(io_should_wake(iowq))) return 0; - /* - * Mark us as being in io_wait if we have pending requests, so cpufreq - * can take into account that the task is waiting for IO - turns out - * to be important for low QD IO. - */ - if (current_pending_io()) - current->in_iowait = 1; - ret = 0; - if (iowq->timeout == KTIME_MAX) - schedule(); - else if (!schedule_hrtimeout(&iowq->timeout, HRTIMER_MODE_ABS)) - ret = -ETIME; - current->in_iowait = 0; - return ret; + return __io_cqring_wait_schedule(ctx, iowq, start_time); } +struct ext_arg { + size_t argsz; + struct __kernel_timespec __user *ts; + const sigset_t __user *sig; + ktime_t min_time; +}; + /* * Wait until events become available, if we don't already have some. The * application must reap them itself, as they reside on the shared cq ring. */ -static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, - const sigset_t __user *sig, size_t sigsz, - struct __kernel_timespec __user *uts) +static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, + struct ext_arg *ext_arg) { struct io_wait_queue iowq; struct io_rings *rings = ctx->rings; + ktime_t start_time; int ret; if (!io_allowed_run_tw(ctx)) @@ -2410,30 +2516,33 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, iowq.wq.private = current; INIT_LIST_HEAD(&iowq.wq.entry); iowq.ctx = ctx; - iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events; + iowq.cq_min_tail = READ_ONCE(ctx->rings->cq.tail); + iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); + iowq.hit_timeout = 0; + iowq.min_timeout = ext_arg->min_time; iowq.timeout = KTIME_MAX; + start_time = io_get_time(ctx); - if (uts) { + if (ext_arg->ts) { struct timespec64 ts; - ktime_t dt; - if (get_timespec64(&ts, uts)) + if (get_timespec64(&ts, ext_arg->ts)) return -EFAULT; - dt = timespec64_to_ktime(ts); - iowq.timeout = ktime_add(dt, ktime_get()); - io_napi_adjust_timeout(ctx, &iowq, dt); + iowq.timeout = timespec64_to_ktime(ts); + if (!(flags & IORING_ENTER_ABS_TIMER)) + iowq.timeout = ktime_add(iowq.timeout, start_time); } - if (sig) { + if (ext_arg->sig) { #ifdef CONFIG_COMPAT if (in_compat_syscall()) - ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig, - sigsz); + ret = set_compat_user_sigmask((const compat_sigset_t __user *)ext_arg->sig, + ext_arg->argsz); else #endif - ret = set_user_sigmask(sig, sigsz); + ret = set_user_sigmask(ext_arg->sig, ext_arg->argsz); if (ret) return ret; @@ -2443,8 +2552,15 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, trace_io_uring_cqring_wait(ctx, min_events); do { - int nr_wait = (int) iowq.cq_tail - READ_ONCE(ctx->rings->cq.tail); unsigned long check_cq; + int nr_wait; + + /* if min timeout has been hit, don't reset wait count */ + if (!iowq.hit_timeout) + nr_wait = (int) iowq.cq_tail - + READ_ONCE(ctx->rings->cq.tail); + else + nr_wait = 1; if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { atomic_set(&ctx->cq_wait_nr, nr_wait); @@ -2454,7 +2570,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, TASK_INTERRUPTIBLE); } - ret = io_cqring_wait_schedule(ctx, &iowq); + ret = io_cqring_wait_schedule(ctx, &iowq, start_time); __set_current_state(TASK_RUNNING); atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT); @@ -3112,9 +3228,8 @@ static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t a return 0; } -static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz, - struct __kernel_timespec __user **ts, - const sigset_t __user **sig) +static int io_get_ext_arg(unsigned flags, const void __user *argp, + struct ext_arg *ext_arg) { struct io_uring_getevents_arg arg; @@ -3123,8 +3238,8 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz * is just a pointer to the sigset_t. */ if (!(flags & IORING_ENTER_EXT_ARG)) { - *sig = (const sigset_t __user *) argp; - *ts = NULL; + ext_arg->sig = (const sigset_t __user *) argp; + ext_arg->ts = NULL; return 0; } @@ -3132,15 +3247,14 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz * EXT_ARG is set - ensure we agree on the size of it and copy in our * timespec and sigset_t pointers if good. */ - if (*argsz != sizeof(arg)) + if (ext_arg->argsz != sizeof(arg)) return -EINVAL; if (copy_from_user(&arg, argp, sizeof(arg))) return -EFAULT; - if (arg.pad) - return -EINVAL; - *sig = u64_to_user_ptr(arg.sigmask); - *argsz = arg.sigmask_sz; - *ts = u64_to_user_ptr(arg.ts); + ext_arg->min_time = arg.min_wait_usec * NSEC_PER_USEC; + ext_arg->sig = u64_to_user_ptr(arg.sigmask); + ext_arg->argsz = arg.sigmask_sz; + ext_arg->ts = u64_to_user_ptr(arg.ts); return 0; } @@ -3154,7 +3268,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG | - IORING_ENTER_REGISTERED_RING))) + IORING_ENTER_REGISTERED_RING | + IORING_ENTER_ABS_TIMER))) return -EINVAL; /* @@ -3245,15 +3360,14 @@ iopoll_locked: } mutex_unlock(&ctx->uring_lock); } else { - const sigset_t __user *sig; - struct __kernel_timespec __user *ts; + struct ext_arg ext_arg = { .argsz = argsz }; - ret2 = io_get_ext_arg(flags, argp, &argsz, &ts, &sig); + ret2 = io_get_ext_arg(flags, argp, &ext_arg); if (likely(!ret2)) { min_complete = min(min_complete, ctx->cq_entries); - ret2 = io_cqring_wait(ctx, min_complete, sig, - argsz, ts); + ret2 = io_cqring_wait(ctx, min_complete, flags, + &ext_arg); } } @@ -3424,6 +3538,9 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, if (!ctx) return -ENOMEM; + ctx->clockid = CLOCK_MONOTONIC; + ctx->clock_offset = 0; + if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && !(ctx->flags & IORING_SETUP_IOPOLL) && !(ctx->flags & IORING_SETUP_SQPOLL)) @@ -3535,7 +3652,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS | IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP | IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING | - IORING_FEAT_RECVSEND_BUNDLE; + IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT; if (copy_to_user(params, p, sizeof(*p))) { ret = -EFAULT; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index c2acf6180845..9d70b2cf7b1e 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -39,8 +39,12 @@ struct io_wait_queue { struct wait_queue_entry wq; struct io_ring_ctx *ctx; unsigned cq_tail; + unsigned cq_min_tail; unsigned nr_timeouts; + int hit_timeout; + ktime_t min_timeout; ktime_t timeout; + struct hrtimer t; #ifdef CONFIG_NET_RX_BUSY_POLL ktime_t napi_busy_poll_dt; @@ -90,6 +94,7 @@ int io_uring_alloc_task_context(struct task_struct *task, int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file, int start, int end); +void io_req_queue_iowq(struct io_kiocb *req); int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts); int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); @@ -437,6 +442,14 @@ static inline bool io_file_can_poll(struct io_kiocb *req) return false; } +static inline ktime_t io_get_time(struct io_ring_ctx *ctx) +{ + if (ctx->clockid == CLOCK_MONOTONIC) + return ktime_get(); + + return ktime_get_with_offset(ctx->clock_offset); +} + enum { IO_CHECK_CQ_OVERFLOW_BIT, IO_CHECK_CQ_DROPPED_BIT, diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index bdfa30b38321..d407576ddfb7 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -70,7 +70,7 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags) return true; } -void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags) +void __io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags) { /* * We can add this buffer back to two lists: @@ -88,12 +88,12 @@ void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags) struct io_ring_ctx *ctx = req->ctx; spin_lock(&ctx->completion_lock); - __io_put_kbuf_list(req, &ctx->io_buffers_comp); + __io_put_kbuf_list(req, len, &ctx->io_buffers_comp); spin_unlock(&ctx->completion_lock); } else { lockdep_assert_held(&req->ctx->uring_lock); - __io_put_kbuf_list(req, &req->ctx->io_buffers_cache); + __io_put_kbuf_list(req, len, &req->ctx->io_buffers_cache); } } @@ -132,12 +132,6 @@ static int io_provided_buffers_select(struct io_kiocb *req, size_t *len, return 1; } -static struct io_uring_buf *io_ring_head_to_buf(struct io_uring_buf_ring *br, - __u16 head, __u16 mask) -{ - return &br->bufs[head & mask]; -} - static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, struct io_buffer_list *bl, unsigned int issue_flags) @@ -171,9 +165,8 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, * the transfer completes (or if we get -EAGAIN and must poll of * retry). */ - req->flags &= ~REQ_F_BUFFERS_COMMIT; + io_kbuf_commit(req, bl, *len, 1); req->buf_list = NULL; - bl->head++; } return u64_to_user_ptr(buf->addr); } @@ -189,7 +182,7 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len, bl = io_buffer_get_list(ctx, req->buf_index); if (likely(bl)) { - if (bl->is_buf_ring) + if (bl->flags & IOBL_BUF_RING) ret = io_ring_buffer_select(req, len, bl, issue_flags); else ret = io_provided_buffer_select(req, len, bl); @@ -219,14 +212,25 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg, buf = io_ring_head_to_buf(br, head, bl->mask); if (arg->max_len) { u32 len = READ_ONCE(buf->len); - size_t needed; if (unlikely(!len)) return -ENOBUFS; - needed = (arg->max_len + len - 1) / len; - needed = min_not_zero(needed, (size_t) PEEK_MAX_IMPORT); - if (nr_avail > needed) - nr_avail = needed; + /* + * Limit incremental buffers to 1 segment. No point trying + * to peek ahead and map more than we need, when the buffers + * themselves should be large when setup with + * IOU_PBUF_RING_INC. + */ + if (bl->flags & IOBL_INC) { + nr_avail = 1; + } else { + size_t needed; + + needed = (arg->max_len + len - 1) / len; + needed = min_not_zero(needed, (size_t) PEEK_MAX_IMPORT); + if (nr_avail > needed) + nr_avail = needed; + } } /* @@ -251,16 +255,21 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg, req->buf_index = buf->bid; do { - /* truncate end piece, if needed */ - if (buf->len > arg->max_len) - buf->len = arg->max_len; + u32 len = buf->len; + + /* truncate end piece, if needed, for non partial buffers */ + if (len > arg->max_len) { + len = arg->max_len; + if (!(bl->flags & IOBL_INC)) + buf->len = len; + } iov->iov_base = u64_to_user_ptr(buf->addr); - iov->iov_len = buf->len; + iov->iov_len = len; iov++; - arg->out_len += buf->len; - arg->max_len -= buf->len; + arg->out_len += len; + arg->max_len -= len; if (!arg->max_len) break; @@ -287,7 +296,7 @@ int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg, if (unlikely(!bl)) goto out_unlock; - if (bl->is_buf_ring) { + if (bl->flags & IOBL_BUF_RING) { ret = io_ring_buffers_peek(req, arg, bl); /* * Don't recycle these buffers if we need to go through poll. @@ -297,8 +306,8 @@ int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg, * committed them, they cannot be put back in the queue. */ if (ret > 0) { - req->flags |= REQ_F_BL_NO_RECYCLE; - req->buf_list->head += ret; + req->flags |= REQ_F_BUFFERS_COMMIT | REQ_F_BL_NO_RECYCLE; + io_kbuf_commit(req, bl, arg->out_len, ret); } } else { ret = io_provided_buffers_select(req, &arg->out_len, bl, arg->iovs); @@ -320,7 +329,7 @@ int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg) if (unlikely(!bl)) return -ENOENT; - if (bl->is_buf_ring) { + if (bl->flags & IOBL_BUF_RING) { ret = io_ring_buffers_peek(req, arg, bl); if (ret > 0) req->flags |= REQ_F_BUFFERS_COMMIT; @@ -340,22 +349,22 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, if (!nbufs) return 0; - if (bl->is_buf_ring) { + if (bl->flags & IOBL_BUF_RING) { i = bl->buf_ring->tail - bl->head; if (bl->buf_nr_pages) { int j; - if (!bl->is_mmap) { + if (!(bl->flags & IOBL_MMAP)) { for (j = 0; j < bl->buf_nr_pages; j++) unpin_user_page(bl->buf_pages[j]); } io_pages_unmap(bl->buf_ring, &bl->buf_pages, - &bl->buf_nr_pages, bl->is_mmap); - bl->is_mmap = 0; + &bl->buf_nr_pages, bl->flags & IOBL_MMAP); + bl->flags &= ~IOBL_MMAP; } /* make sure it's seen as empty */ INIT_LIST_HEAD(&bl->buf_list); - bl->is_buf_ring = 0; + bl->flags &= ~IOBL_BUF_RING; return i; } @@ -442,7 +451,7 @@ int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) if (bl) { ret = -EINVAL; /* can't use provide/remove buffers command on mapped buffers */ - if (!bl->is_buf_ring) + if (!(bl->flags & IOBL_BUF_RING)) ret = __io_remove_buffers(ctx, bl, p->nbufs); } io_ring_submit_unlock(ctx, issue_flags); @@ -589,7 +598,7 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) } } /* can't add buffers via this command for a mapped buffer ring */ - if (bl->is_buf_ring) { + if (bl->flags & IOBL_BUF_RING) { ret = -EINVAL; goto err; } @@ -641,8 +650,8 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg, bl->buf_pages = pages; bl->buf_nr_pages = nr_pages; bl->buf_ring = br; - bl->is_buf_ring = 1; - bl->is_mmap = 0; + bl->flags |= IOBL_BUF_RING; + bl->flags &= ~IOBL_MMAP; return 0; error_unpin: unpin_user_pages(pages, nr_pages); @@ -665,8 +674,7 @@ static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx, return -ENOMEM; } - bl->is_buf_ring = 1; - bl->is_mmap = 1; + bl->flags |= (IOBL_BUF_RING | IOBL_MMAP); return 0; } @@ -683,7 +691,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (reg.resv[0] || reg.resv[1] || reg.resv[2]) return -EINVAL; - if (reg.flags & ~IOU_PBUF_RING_MMAP) + if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC)) return -EINVAL; if (!(reg.flags & IOU_PBUF_RING_MMAP)) { if (!reg.ring_addr) @@ -705,7 +713,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) bl = io_buffer_get_list(ctx, reg.bgid); if (bl) { /* if mapped buffer ring OR classic exists, don't allow */ - if (bl->is_buf_ring || !list_empty(&bl->buf_list)) + if (bl->flags & IOBL_BUF_RING || !list_empty(&bl->buf_list)) return -EEXIST; } else { free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL); @@ -721,6 +729,8 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (!ret) { bl->nr_entries = reg.ring_entries; bl->mask = reg.ring_entries - 1; + if (reg.flags & IOU_PBUF_RING_INC) + bl->flags |= IOBL_INC; io_buffer_add_list(ctx, bl, reg.bgid); return 0; @@ -747,7 +757,7 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) bl = io_buffer_get_list(ctx, reg.bgid); if (!bl) return -ENOENT; - if (!bl->is_buf_ring) + if (!(bl->flags & IOBL_BUF_RING)) return -EINVAL; xa_erase(&ctx->io_bl_xa, bl->bgid); @@ -771,7 +781,7 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg) bl = io_buffer_get_list(ctx, buf_status.buf_group); if (!bl) return -ENOENT; - if (!bl->is_buf_ring) + if (!(bl->flags & IOBL_BUF_RING)) return -EINVAL; buf_status.head = bl->head; @@ -802,7 +812,7 @@ struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx, bl = xa_load(&ctx->io_bl_xa, bgid); /* must be a mmap'able buffer ring and have pages */ ret = false; - if (bl && bl->is_mmap) + if (bl && bl->flags & IOBL_MMAP) ret = atomic_inc_not_zero(&bl->refs); rcu_read_unlock(); diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index b90aca3a57fa..36aadfe5ac00 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -4,6 +4,16 @@ #include +enum { + /* ring mapped provided buffers */ + IOBL_BUF_RING = 1, + /* ring mapped provided buffers, but mmap'ed by application */ + IOBL_MMAP = 2, + /* buffers are consumed incrementally rather than always fully */ + IOBL_INC = 4, + +}; + struct io_buffer_list { /* * If ->buf_nr_pages is set, then buf_pages/buf_ring are used. If not, @@ -25,12 +35,9 @@ struct io_buffer_list { __u16 head; __u16 mask; - atomic_t refs; + __u16 flags; - /* ring mapped provided buffers */ - __u8 is_buf_ring; - /* ring mapped provided buffers, but mmap'ed by application */ - __u8 is_mmap; + atomic_t refs; }; struct io_buffer { @@ -52,8 +59,8 @@ struct buf_sel_arg { struct iovec *iovs; size_t out_len; size_t max_len; - int nr_iovs; - int mode; + unsigned short nr_iovs; + unsigned short mode; }; void __user *io_buffer_select(struct io_kiocb *req, size_t *len, @@ -73,7 +80,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg); -void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags); +void __io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags); bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); @@ -117,25 +124,55 @@ static inline bool io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) return false; } -static inline void __io_put_kbuf_ring(struct io_kiocb *req, int nr) +/* Mapped buffer ring, return io_uring_buf from head */ +#define io_ring_head_to_buf(br, head, mask) &(br)->bufs[(head) & (mask)] + +static inline bool io_kbuf_commit(struct io_kiocb *req, + struct io_buffer_list *bl, int len, int nr) +{ + if (unlikely(!(req->flags & REQ_F_BUFFERS_COMMIT))) + return true; + + req->flags &= ~REQ_F_BUFFERS_COMMIT; + + if (unlikely(len < 0)) + return true; + + if (bl->flags & IOBL_INC) { + struct io_uring_buf *buf; + + buf = io_ring_head_to_buf(bl->buf_ring, bl->head, bl->mask); + if (WARN_ON_ONCE(len > buf->len)) + len = buf->len; + buf->len -= len; + if (buf->len) { + buf->addr += len; + return false; + } + } + + bl->head += nr; + return true; +} + +static inline bool __io_put_kbuf_ring(struct io_kiocb *req, int len, int nr) { struct io_buffer_list *bl = req->buf_list; + bool ret = true; if (bl) { - if (req->flags & REQ_F_BUFFERS_COMMIT) { - bl->head += nr; - req->flags &= ~REQ_F_BUFFERS_COMMIT; - } + ret = io_kbuf_commit(req, bl, len, nr); req->buf_index = bl->bgid; } req->flags &= ~REQ_F_BUFFER_RING; + return ret; } -static inline void __io_put_kbuf_list(struct io_kiocb *req, +static inline void __io_put_kbuf_list(struct io_kiocb *req, int len, struct list_head *list) { if (req->flags & REQ_F_BUFFER_RING) { - __io_put_kbuf_ring(req, 1); + __io_put_kbuf_ring(req, len, 1); } else { req->buf_index = req->kbuf->bgid; list_add(&req->kbuf->list, list); @@ -150,11 +187,12 @@ static inline void io_kbuf_drop(struct io_kiocb *req) if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) return; - __io_put_kbuf_list(req, &req->ctx->io_buffers_comp); + /* len == 0 is fine here, non-ring will always drop all of it */ + __io_put_kbuf_list(req, 0, &req->ctx->io_buffers_comp); } -static inline unsigned int __io_put_kbufs(struct io_kiocb *req, int nbufs, - unsigned issue_flags) +static inline unsigned int __io_put_kbufs(struct io_kiocb *req, int len, + int nbufs, unsigned issue_flags) { unsigned int ret; @@ -162,22 +200,24 @@ static inline unsigned int __io_put_kbufs(struct io_kiocb *req, int nbufs, return 0; ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); - if (req->flags & REQ_F_BUFFER_RING) - __io_put_kbuf_ring(req, nbufs); - else - __io_put_kbuf(req, issue_flags); + if (req->flags & REQ_F_BUFFER_RING) { + if (!__io_put_kbuf_ring(req, len, nbufs)) + ret |= IORING_CQE_F_BUF_MORE; + } else { + __io_put_kbuf(req, len, issue_flags); + } return ret; } -static inline unsigned int io_put_kbuf(struct io_kiocb *req, +static inline unsigned int io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags) { - return __io_put_kbufs(req, 1, issue_flags); + return __io_put_kbufs(req, len, 1, issue_flags); } -static inline unsigned int io_put_kbufs(struct io_kiocb *req, int nbufs, - unsigned issue_flags) +static inline unsigned int io_put_kbufs(struct io_kiocb *req, int len, + int nbufs, unsigned issue_flags) { - return __io_put_kbufs(req, nbufs, issue_flags); + return __io_put_kbufs(req, len, nbufs, issue_flags); } #endif diff --git a/io_uring/napi.c b/io_uring/napi.c index 1de1d4d62925..d0cf694d0172 100644 --- a/io_uring/napi.c +++ b/io_uring/napi.c @@ -269,27 +269,6 @@ int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg) return 0; } -/* - * __io_napi_adjust_timeout() - adjust busy loop timeout - * @ctx: pointer to io-uring context structure - * @iowq: pointer to io wait queue - * @ts: pointer to timespec or NULL - * - * Adjust the busy loop timeout according to timespec and busy poll timeout. - * If the specified NAPI timeout is bigger than the wait timeout, then adjust - * the NAPI timeout accordingly. - */ -void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iowq, - ktime_t to_wait) -{ - ktime_t poll_dt = READ_ONCE(ctx->napi_busy_poll_dt); - - if (to_wait) - poll_dt = min(poll_dt, to_wait); - - iowq->napi_busy_poll_dt = poll_dt; -} - /* * __io_napi_busy_loop() - execute busy poll loop * @ctx: pointer to io-uring context structure @@ -299,10 +278,18 @@ void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iow */ void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq) { - iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll); + if (ctx->flags & IORING_SETUP_SQPOLL) + return; - if (!(ctx->flags & IORING_SETUP_SQPOLL)) - io_napi_blocking_busy_loop(ctx, iowq); + iowq->napi_busy_poll_dt = READ_ONCE(ctx->napi_busy_poll_dt); + if (iowq->timeout != KTIME_MAX) { + ktime_t dt = ktime_sub(iowq->timeout, io_get_time(ctx)); + + iowq->napi_busy_poll_dt = min_t(u64, iowq->napi_busy_poll_dt, dt); + } + + iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll); + io_napi_blocking_busy_loop(ctx, iowq); } /* diff --git a/io_uring/napi.h b/io_uring/napi.h index 27b88c3eb428..fd275ef0456d 100644 --- a/io_uring/napi.h +++ b/io_uring/napi.h @@ -17,8 +17,6 @@ int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg); void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock); -void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, - struct io_wait_queue *iowq, ktime_t to_wait); void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq); int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx); @@ -27,15 +25,6 @@ static inline bool io_napi(struct io_ring_ctx *ctx) return !list_empty(&ctx->napi_list); } -static inline void io_napi_adjust_timeout(struct io_ring_ctx *ctx, - struct io_wait_queue *iowq, - ktime_t to_wait) -{ - if (!io_napi(ctx)) - return; - __io_napi_adjust_timeout(ctx, iowq, to_wait); -} - static inline void io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq) { @@ -86,11 +75,6 @@ static inline bool io_napi(struct io_ring_ctx *ctx) static inline void io_napi_add(struct io_kiocb *req) { } -static inline void io_napi_adjust_timeout(struct io_ring_ctx *ctx, - struct io_wait_queue *iowq, - ktime_t to_wait) -{ -} static inline void io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq) { diff --git a/io_uring/net.c b/io_uring/net.c index d08abcca89cc..f10f5a22d66a 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -434,8 +434,6 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sr->buf_group = req->buf_index; req->buf_list = NULL; } - if (req->flags & REQ_F_BUFFER_SELECT && sr->len) - return -EINVAL; #ifdef CONFIG_COMPAT if (req->ctx->compat) @@ -499,11 +497,11 @@ static inline bool io_send_finish(struct io_kiocb *req, int *ret, unsigned int cflags; if (!(sr->flags & IORING_RECVSEND_BUNDLE)) { - cflags = io_put_kbuf(req, issue_flags); + cflags = io_put_kbuf(req, *ret, issue_flags); goto finish; } - cflags = io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret), issue_flags); + cflags = io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, *ret), issue_flags); if (bundle_finished || req->flags & REQ_F_BL_EMPTY) goto finish; @@ -599,7 +597,7 @@ retry_bundle: if (io_do_buffer_select(req)) { struct buf_sel_arg arg = { .iovs = &kmsg->fast_iov, - .max_len = INT_MAX, + .max_len = min_not_zero(sr->len, INT_MAX), .nr_iovs = 1, }; @@ -618,14 +616,23 @@ retry_bundle: if (unlikely(ret < 0)) return ret; - sr->len = arg.out_len; - iov_iter_init(&kmsg->msg.msg_iter, ITER_SOURCE, arg.iovs, ret, - arg.out_len); if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) { kmsg->free_iov_nr = ret; kmsg->free_iov = arg.iovs; req->flags |= REQ_F_NEED_CLEANUP; } + sr->len = arg.out_len; + + if (ret == 1) { + sr->buf = arg.iovs[0].iov_base; + ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, + &kmsg->msg.msg_iter); + if (unlikely(ret)) + return ret; + } else { + iov_iter_init(&kmsg->msg.msg_iter, ITER_SOURCE, + arg.iovs, ret, arg.out_len); + } } /* @@ -835,13 +842,13 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, cflags |= IORING_CQE_F_SOCK_NONEMPTY; if (sr->flags & IORING_RECVSEND_BUNDLE) { - cflags |= io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret), + cflags |= io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, *ret), issue_flags); /* bundle with no more immediate buffers, we're done */ if (req->flags & REQ_F_BL_EMPTY) goto finish; } else { - cflags |= io_put_kbuf(req, issue_flags); + cflags |= io_put_kbuf(req, *ret, issue_flags); } /* diff --git a/io_uring/register.c b/io_uring/register.c index e3c20be5a198..dab0f8024ddf 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -335,6 +335,31 @@ err: return ret; } +static int io_register_clock(struct io_ring_ctx *ctx, + struct io_uring_clock_register __user *arg) +{ + struct io_uring_clock_register reg; + + if (copy_from_user(®, arg, sizeof(reg))) + return -EFAULT; + if (memchr_inv(®.__resv, 0, sizeof(reg.__resv))) + return -EINVAL; + + switch (reg.clockid) { + case CLOCK_MONOTONIC: + ctx->clock_offset = 0; + break; + case CLOCK_BOOTTIME: + ctx->clock_offset = TK_OFFS_BOOT; + break; + default: + return -EINVAL; + } + + ctx->clockid = reg.clockid; + return 0; +} + static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, void __user *arg, unsigned nr_args) __releases(ctx->uring_lock) @@ -511,6 +536,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_unregister_napi(ctx, arg); break; + case IORING_REGISTER_CLOCK: + ret = -EINVAL; + if (!arg || nr_args) + break; + ret = io_register_clock(ctx, arg); + break; + case IORING_REGISTER_COPY_BUFFERS: + ret = -EINVAL; + if (!arg || nr_args != 1) + break; + ret = io_register_copy_buffers(ctx, arg); + break; default: ret = -EINVAL; break; @@ -519,6 +556,38 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, return ret; } +/* + * Given an 'fd' value, return the ctx associated with if. If 'registered' is + * true, then the registered index is used. Otherwise, the normal fd table. + * Caller must call fput() on the returned file, unless it's an ERR_PTR. + */ +struct file *io_uring_register_get_file(int fd, bool registered) +{ + struct file *file; + + if (registered) { + /* + * Ring fd has been registered via IORING_REGISTER_RING_FDS, we + * need only dereference our task private array to find it. + */ + struct io_uring_task *tctx = current->io_uring; + + if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX)) + return ERR_PTR(-EINVAL); + fd = array_index_nospec(fd, IO_RINGFD_REG_MAX); + file = tctx->registered_rings[fd]; + } else { + file = fget(fd); + } + + if (unlikely(!file)) + return ERR_PTR(-EBADF); + if (io_is_uring_fops(file)) + return file; + fput(file); + return ERR_PTR(-EOPNOTSUPP); +} + SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, void __user *, arg, unsigned int, nr_args) { @@ -533,35 +602,15 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, if (opcode >= IORING_REGISTER_LAST) return -EINVAL; - if (use_registered_ring) { - /* - * Ring fd has been registered via IORING_REGISTER_RING_FDS, we - * need only dereference our task private array to find it. - */ - struct io_uring_task *tctx = current->io_uring; - - if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX)) - return -EINVAL; - fd = array_index_nospec(fd, IO_RINGFD_REG_MAX); - file = tctx->registered_rings[fd]; - if (unlikely(!file)) - return -EBADF; - } else { - file = fget(fd); - if (unlikely(!file)) - return -EBADF; - ret = -EOPNOTSUPP; - if (!io_is_uring_fops(file)) - goto out_fput; - } - + file = io_uring_register_get_file(fd, use_registered_ring); + if (IS_ERR(file)) + return PTR_ERR(file); ctx = file->private_data; mutex_lock(&ctx->uring_lock); ret = __io_uring_register(ctx, opcode, arg, nr_args); mutex_unlock(&ctx->uring_lock); trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret); -out_fput: if (!use_registered_ring) fput(file); return ret; diff --git a/io_uring/register.h b/io_uring/register.h index c9da997d503c..cc69b88338fe 100644 --- a/io_uring/register.h +++ b/io_uring/register.h @@ -4,5 +4,6 @@ int io_eventfd_unregister(struct io_ring_ctx *ctx); int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id); +struct file *io_uring_register_get_file(int fd, bool registered); #endif diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 453867add7ca..a7164aa7d13e 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -17,6 +17,7 @@ #include "openclose.h" #include "rsrc.h" #include "memmap.h" +#include "register.h" struct io_rsrc_update { struct file *file; @@ -114,14 +115,16 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo struct io_mapped_ubuf *imu = *slot; unsigned int i; + *slot = NULL; if (imu != &dummy_ubuf) { + if (!refcount_dec_and_test(&imu->refs)) + return; for (i = 0; i < imu->nr_bvecs; i++) unpin_user_page(imu->bvec[i].bv_page); if (imu->acct_pages) io_unaccount_mem(ctx, imu->acct_pages); kvfree(imu); } - *slot = NULL; } static void io_rsrc_put_work(struct io_rsrc_node *node) @@ -855,6 +858,98 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, return ret; } +static bool io_do_coalesce_buffer(struct page ***pages, int *nr_pages, + struct io_imu_folio_data *data, int nr_folios) +{ + struct page **page_array = *pages, **new_array = NULL; + int nr_pages_left = *nr_pages, i, j; + + /* Store head pages only*/ + new_array = kvmalloc_array(nr_folios, sizeof(struct page *), + GFP_KERNEL); + if (!new_array) + return false; + + new_array[0] = compound_head(page_array[0]); + /* + * The pages are bound to the folio, it doesn't + * actually unpin them but drops all but one reference, + * which is usually put down by io_buffer_unmap(). + * Note, needs a better helper. + */ + if (data->nr_pages_head > 1) + unpin_user_pages(&page_array[1], data->nr_pages_head - 1); + + j = data->nr_pages_head; + nr_pages_left -= data->nr_pages_head; + for (i = 1; i < nr_folios; i++) { + unsigned int nr_unpin; + + new_array[i] = page_array[j]; + nr_unpin = min_t(unsigned int, nr_pages_left - 1, + data->nr_pages_mid - 1); + if (nr_unpin) + unpin_user_pages(&page_array[j+1], nr_unpin); + j += data->nr_pages_mid; + nr_pages_left -= data->nr_pages_mid; + } + kvfree(page_array); + *pages = new_array; + *nr_pages = nr_folios; + return true; +} + +static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages, + struct io_imu_folio_data *data) +{ + struct page **page_array = *pages; + struct folio *folio = page_folio(page_array[0]); + unsigned int count = 1, nr_folios = 1; + int i; + + if (*nr_pages <= 1) + return false; + + data->nr_pages_mid = folio_nr_pages(folio); + if (data->nr_pages_mid == 1) + return false; + + data->folio_shift = folio_shift(folio); + /* + * Check if pages are contiguous inside a folio, and all folios have + * the same page count except for the head and tail. + */ + for (i = 1; i < *nr_pages; i++) { + if (page_folio(page_array[i]) == folio && + page_array[i] == page_array[i-1] + 1) { + count++; + continue; + } + + if (nr_folios == 1) { + if (folio_page_idx(folio, page_array[i-1]) != + data->nr_pages_mid - 1) + return false; + + data->nr_pages_head = count; + } else if (count != data->nr_pages_mid) { + return false; + } + + folio = page_folio(page_array[i]); + if (folio_size(folio) != (1UL << data->folio_shift) || + folio_page_idx(folio, page_array[i]) != 0) + return false; + + count = 1; + nr_folios++; + } + if (nr_folios == 1) + data->nr_pages_head = count; + + return io_do_coalesce_buffer(pages, nr_pages, data, nr_folios); +} + static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, struct io_mapped_ubuf **pimu, struct page **last_hpage) @@ -864,7 +959,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, unsigned long off; size_t size; int ret, nr_pages, i; - struct folio *folio = NULL; + struct io_imu_folio_data data; + bool coalesced; *pimu = (struct io_mapped_ubuf *)&dummy_ubuf; if (!iov->iov_base) @@ -879,31 +975,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, goto done; } - /* If it's a huge page, try to coalesce them into a single bvec entry */ - if (nr_pages > 1) { - folio = page_folio(pages[0]); - for (i = 1; i < nr_pages; i++) { - /* - * Pages must be consecutive and on the same folio for - * this to work - */ - if (page_folio(pages[i]) != folio || - pages[i] != pages[i - 1] + 1) { - folio = NULL; - break; - } - } - if (folio) { - /* - * The pages are bound to the folio, it doesn't - * actually unpin them but drops all but one reference, - * which is usually put down by io_buffer_unmap(). - * Note, needs a better helper. - */ - unpin_user_pages(&pages[1], nr_pages - 1); - nr_pages = 1; - } - } + /* If it's huge page(s), try to coalesce them into fewer bvec entries */ + coalesced = io_try_coalesce_buffer(&pages, &nr_pages, &data); imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); if (!imu) @@ -915,23 +988,26 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, goto done; } - off = (unsigned long) iov->iov_base & ~PAGE_MASK; size = iov->iov_len; /* store original address for later verification */ imu->ubuf = (unsigned long) iov->iov_base; imu->ubuf_end = imu->ubuf + iov->iov_len; imu->nr_bvecs = nr_pages; + imu->folio_shift = PAGE_SHIFT; + imu->folio_mask = PAGE_MASK; + if (coalesced) { + imu->folio_shift = data.folio_shift; + imu->folio_mask = ~((1UL << data.folio_shift) - 1); + } + refcount_set(&imu->refs, 1); + off = (unsigned long) iov->iov_base & ~imu->folio_mask; *pimu = imu; ret = 0; - if (folio) { - bvec_set_page(&imu->bvec[0], pages[0], size, off); - goto done; - } for (i = 0; i < nr_pages; i++) { size_t vec_len; - vec_len = min_t(size_t, size, PAGE_SIZE - off); + vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off); bvec_set_page(&imu->bvec[i], pages[i], vec_len, off); off = 0; size -= vec_len; @@ -1042,23 +1118,18 @@ int io_import_fixed(int ddir, struct iov_iter *iter, * we know that: * * 1) it's a BVEC iter, we set it up - * 2) all bvecs are PAGE_SIZE in size, except potentially the + * 2) all bvecs are the same in size, except potentially the * first and last bvec * * So just find our index, and adjust the iterator afterwards. * If the offset is within the first bvec (or the whole first * bvec, just use iov_iter_advance(). This makes it easier * since we can just skip the first segment, which may not - * be PAGE_SIZE aligned. + * be folio_size aligned. */ const struct bio_vec *bvec = imu->bvec; if (offset < bvec->bv_len) { - /* - * Note, huge pages buffers consists of one large - * bvec entry and should always go this way. The other - * branch doesn't expect non PAGE_SIZE'd chunks. - */ iter->bvec = bvec; iter->count -= offset; iter->iov_offset = offset; @@ -1067,14 +1138,104 @@ int io_import_fixed(int ddir, struct iov_iter *iter, /* skip first vec */ offset -= bvec->bv_len; - seg_skip = 1 + (offset >> PAGE_SHIFT); + seg_skip = 1 + (offset >> imu->folio_shift); iter->bvec = bvec + seg_skip; iter->nr_segs -= seg_skip; iter->count -= bvec->bv_len + offset; - iter->iov_offset = offset & ~PAGE_MASK; + iter->iov_offset = offset & ~imu->folio_mask; } } return 0; } + +static int io_copy_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx) +{ + struct io_mapped_ubuf **user_bufs; + struct io_rsrc_data *data; + int i, ret, nbufs; + + /* + * Drop our own lock here. We'll setup the data we need and reference + * the source buffers, then re-grab, check, and assign at the end. + */ + mutex_unlock(&ctx->uring_lock); + + mutex_lock(&src_ctx->uring_lock); + ret = -ENXIO; + nbufs = src_ctx->nr_user_bufs; + if (!nbufs) + goto out_unlock; + ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, NULL, nbufs, &data); + if (ret) + goto out_unlock; + + ret = -ENOMEM; + user_bufs = kcalloc(nbufs, sizeof(*ctx->user_bufs), GFP_KERNEL); + if (!user_bufs) + goto out_free_data; + + for (i = 0; i < nbufs; i++) { + struct io_mapped_ubuf *src = src_ctx->user_bufs[i]; + + refcount_inc(&src->refs); + user_bufs[i] = src; + } + + /* Have a ref on the bufs now, drop src lock and re-grab our own lock */ + mutex_unlock(&src_ctx->uring_lock); + mutex_lock(&ctx->uring_lock); + if (!ctx->user_bufs) { + ctx->user_bufs = user_bufs; + ctx->buf_data = data; + ctx->nr_user_bufs = nbufs; + return 0; + } + + /* someone raced setting up buffers, dump ours */ + for (i = 0; i < nbufs; i++) + io_buffer_unmap(ctx, &user_bufs[i]); + io_rsrc_data_free(data); + kfree(user_bufs); + return -EBUSY; +out_free_data: + io_rsrc_data_free(data); +out_unlock: + mutex_unlock(&src_ctx->uring_lock); + mutex_lock(&ctx->uring_lock); + return ret; +} + +/* + * Copy the registered buffers from the source ring whose file descriptor + * is given in the src_fd to the current ring. This is identical to registering + * the buffers with ctx, except faster as mappings already exist. + * + * Since the memory is already accounted once, don't account it again. + */ +int io_register_copy_buffers(struct io_ring_ctx *ctx, void __user *arg) +{ + struct io_uring_copy_buffers buf; + bool registered_src; + struct file *file; + int ret; + + if (ctx->user_bufs || ctx->nr_user_bufs) + return -EBUSY; + if (copy_from_user(&buf, arg, sizeof(buf))) + return -EFAULT; + if (buf.flags & ~IORING_REGISTER_SRC_REGISTERED) + return -EINVAL; + if (memchr_inv(buf.pad, 0, sizeof(buf.pad))) + return -EINVAL; + + registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0; + file = io_uring_register_get_file(buf.src_fd, registered_src); + if (IS_ERR(file)) + return PTR_ERR(file); + ret = io_copy_buffers(ctx, file->private_data); + if (!registered_src) + fput(file); + return ret; +} diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index c032ca3436ca..93546ab337a6 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -22,8 +22,6 @@ struct io_rsrc_put { }; }; -typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); - struct io_rsrc_data { struct io_ring_ctx *ctx; @@ -46,10 +44,21 @@ struct io_mapped_ubuf { u64 ubuf; u64 ubuf_end; unsigned int nr_bvecs; + unsigned int folio_shift; unsigned long acct_pages; + unsigned long folio_mask; + refcount_t refs; struct bio_vec bvec[] __counted_by(nr_bvecs); }; +struct io_imu_folio_data { + /* Head folio can be partially included in the fixed buf */ + unsigned int nr_pages_head; + /* For non-head/tail folios, has to be fully included */ + unsigned int nr_pages_mid; + unsigned int folio_shift; +}; + void io_rsrc_node_ref_zero(struct io_rsrc_node *node); void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node); struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx); @@ -59,6 +68,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter, struct io_mapped_ubuf *imu, u64 buf_addr, size_t len); +int io_register_copy_buffers(struct io_ring_ctx *ctx, void __user *arg); void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx); int io_sqe_buffers_unregister(struct io_ring_ctx *ctx); int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, diff --git a/io_uring/rw.c b/io_uring/rw.c index c004d21e2f12..f023ff49c688 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -467,8 +467,7 @@ static void io_req_io_end(struct io_kiocb *req) static bool __io_complete_rw_common(struct io_kiocb *req, long res) { if (unlikely(res != req->cqe.res)) { - if ((res == -EAGAIN || res == -EOPNOTSUPP) && - io_rw_should_reissue(req)) { + if (res == -EAGAIN && io_rw_should_reissue(req)) { /* * Reissue will start accounting again, finish the * current cycle. @@ -511,7 +510,7 @@ void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts) io_req_io_end(req); if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) - req->cqe.flags |= io_put_kbuf(req, 0); + req->cqe.flags |= io_put_kbuf(req, req->cqe.res, 0); io_req_rw_cleanup(req, 0); io_req_task_complete(req, ts); @@ -593,7 +592,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret, */ io_req_io_end(req); io_req_set_res(req, final_ret, - io_put_kbuf(req, issue_flags)); + io_put_kbuf(req, ret, issue_flags)); io_req_rw_cleanup(req, issue_flags); return IOU_OK; } @@ -855,6 +854,14 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) ret = io_iter_do_read(rw, &io->iter); + /* + * Some file systems like to return -EOPNOTSUPP for an IOCB_NOWAIT + * issue, even though they should be returning -EAGAIN. To be safe, + * retry from blocking context for either. + */ + if (ret == -EOPNOTSUPP && force_nonblock) + ret = -EAGAIN; + if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { req->flags &= ~REQ_F_REISSUE; /* If we can poll, just do that. */ @@ -975,7 +982,7 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) * Put our buffer and post a CQE. If we fail to post a CQE, then * jump to the termination path. This request is then done. */ - cflags = io_put_kbuf(req, issue_flags); + cflags = io_put_kbuf(req, ret, issue_flags); rw->len = 0; /* similarly to above, reset len to 0 */ if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) { @@ -1167,7 +1174,7 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) if (!smp_load_acquire(&req->iopoll_completed)) break; nr_events++; - req->cqe.flags = io_put_kbuf(req, 0); + req->cqe.flags = io_put_kbuf(req, req->cqe.res, 0); if (req->opcode != IORING_OP_URING_CMD) io_req_rw_cleanup(req, 0); } diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c index 3b50dc9586d1..272df9d00f45 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -176,7 +177,7 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE) to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE; - if (!wq_list_empty(&ctx->iopoll_list) || to_submit) { + if (to_submit || !wq_list_empty(&ctx->iopoll_list)) { const struct cred *creds = NULL; if (ctx->sq_creds != current_cred()) @@ -460,10 +461,12 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx, return 0; if (p->flags & IORING_SETUP_SQ_AFF) { + struct cpumask allowed_mask; int cpu = p->sq_thread_cpu; ret = -EINVAL; - if (cpu >= nr_cpu_ids || !cpu_online(cpu)) + cpuset_cpus_allowed(current, &allowed_mask); + if (!cpumask_test_cpu(cpu, &allowed_mask)) goto err_sqpoll; sqd->sq_cpu = cpu; } else { diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 8391c7c7c1ec..39c3c816ec78 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -277,6 +277,13 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, } EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed); +void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd) +{ + struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); + + io_req_queue_iowq(req); +} + static inline int io_uring_cmd_getsockopt(struct socket *sock, struct io_uring_cmd *cmd, unsigned int issue_flags) diff --git a/mm/filemap.c b/mm/filemap.c index 8b03c1c5e88c..60a9cc593e9b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2712,14 +2712,12 @@ int kiocb_write_and_wait(struct kiocb *iocb, size_t count) } EXPORT_SYMBOL_GPL(kiocb_write_and_wait); -int kiocb_invalidate_pages(struct kiocb *iocb, size_t count) +int filemap_invalidate_pages(struct address_space *mapping, + loff_t pos, loff_t end, bool nowait) { - struct address_space *mapping = iocb->ki_filp->f_mapping; - loff_t pos = iocb->ki_pos; - loff_t end = pos + count - 1; int ret; - if (iocb->ki_flags & IOCB_NOWAIT) { + if (nowait) { /* we could block if there are any pages in the range */ if (filemap_range_has_page(mapping, pos, end)) return -EAGAIN; @@ -2738,6 +2736,15 @@ int kiocb_invalidate_pages(struct kiocb *iocb, size_t count) return invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end >> PAGE_SHIFT); } + +int kiocb_invalidate_pages(struct kiocb *iocb, size_t count) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + + return filemap_invalidate_pages(mapping, iocb->ki_pos, + iocb->ki_pos + count - 1, + iocb->ki_flags & IOCB_NOWAIT); +} EXPORT_SYMBOL_GPL(kiocb_invalidate_pages); /** diff --git a/mm/gup.c b/mm/gup.c index 54d0dc3831fb..02c46ae33028 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -415,6 +415,19 @@ void unpin_user_pages(struct page **pages, unsigned long npages) } EXPORT_SYMBOL(unpin_user_pages); +/** + * unpin_user_folio() - release pages of a folio + * @folio: pointer to folio to be released + * @npages: number of pages of same folio + * + * Release npages of the folio + */ +void unpin_user_folio(struct folio *folio, unsigned long npages) +{ + gup_put_folio(folio, npages, FOLL_PIN); +} +EXPORT_SYMBOL(unpin_user_folio); + /** * unpin_folios() - release an array of gup-pinned folios. * @folios: array of folios to be marked dirty and released.