Merge adfc3ded5c ("Merge tag 'for-6.12/io_uring-discard-20240913' of git://git.kernel.dk/linux") into android-mainline

Steps on the way to 6.12-rc1

Bug: 367265496
Change-Id: Id6063dbaee05b0c74216fd6e1ff2370a38960d07
Signed-off-by: Matthias Maennich <maennich@google.com>
This commit is contained in:
Matthias Maennich
2024-09-16 22:20:40 +00:00
committed by Treehugger Robot
168 changed files with 4782 additions and 3494 deletions
+2 -3
View File
@@ -3813,10 +3813,9 @@ F: Documentation/filesystems/befs.rst
F: fs/befs/
BFQ I/O SCHEDULER
M: Paolo Valente <paolo.valente@unimore.it>
M: Jens Axboe <axboe@kernel.dk>
M: Yu Kuai <yukuai3@huawei.com>
L: linux-block@vger.kernel.org
S: Maintained
S: Odd Fixes
F: Documentation/block/bfq-iosched.rst
F: block/bfq-*
+1 -7
View File
@@ -679,12 +679,7 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
bfqg_and_blkg_put(old_parent);
if (entity->parent &&
entity->parent->last_bfqq_created == bfqq)
entity->parent->last_bfqq_created = NULL;
else if (bfqd->last_bfqq_created == bfqq)
bfqd->last_bfqq_created = NULL;
bfq_reassign_last_bfqq(bfqq, NULL);
entity->parent = bfqg->my_entity;
entity->sched_data = &bfqg->sched_data;
/* pin down bfqg and its associated blkg */
@@ -741,7 +736,6 @@ static void bfq_sync_bfqq_move(struct bfq_data *bfqd,
*/
bfq_put_cooperator(sync_bfqq);
bic_set_bfqq(bic, NULL, true, act_idx);
bfq_release_process_ref(bfqd, sync_bfqq);
}
}
+113 -93
View File
@@ -2911,8 +2911,12 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[a_idx];
/* if a merge has already been setup, then proceed with that first */
if (bfqq->new_bfqq)
return bfqq->new_bfqq;
new_bfqq = bfqq->new_bfqq;
if (new_bfqq) {
while (new_bfqq->new_bfqq)
new_bfqq = new_bfqq->new_bfqq;
return new_bfqq;
}
/*
* Check delayed stable merge for rotational or non-queueing
@@ -3093,8 +3097,8 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
}
static void
bfq_reassign_last_bfqq(struct bfq_queue *cur_bfqq, struct bfq_queue *new_bfqq)
void bfq_reassign_last_bfqq(struct bfq_queue *cur_bfqq,
struct bfq_queue *new_bfqq)
{
if (cur_bfqq->entity.parent &&
cur_bfqq->entity.parent->last_bfqq_created == cur_bfqq)
@@ -3125,10 +3129,12 @@ void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq)
bfq_put_queue(bfqq);
}
static void
bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
static struct bfq_queue *bfq_merge_bfqqs(struct bfq_data *bfqd,
struct bfq_io_cq *bic,
struct bfq_queue *bfqq)
{
struct bfq_queue *new_bfqq = bfqq->new_bfqq;
bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
(unsigned long)new_bfqq->pid);
/* Save weight raising and idle window of the merged queues */
@@ -3222,6 +3228,8 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
bfq_reassign_last_bfqq(bfqq, new_bfqq);
bfq_release_process_ref(bfqd, bfqq);
return new_bfqq;
}
static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
@@ -3257,14 +3265,8 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
* fulfilled, i.e., bic can be redirected to new_bfqq
* and bfqq can be put.
*/
bfq_merge_bfqqs(bfqd, bfqd->bio_bic, bfqq,
new_bfqq);
/*
* If we get here, bio will be queued into new_queue,
* so use new_bfqq to decide whether bio and rq can be
* merged.
*/
bfqq = new_bfqq;
while (bfqq != new_bfqq)
bfqq = bfq_merge_bfqqs(bfqd, bfqd->bio_bic, bfqq);
/*
* Change also bqfd->bio_bfqq, as
@@ -5432,6 +5434,8 @@ void bfq_put_cooperator(struct bfq_queue *bfqq)
bfq_put_queue(__bfqq);
__bfqq = next;
}
bfq_release_process_ref(bfqq->bfqd, bfqq);
}
static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
@@ -5444,8 +5448,6 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref);
bfq_put_cooperator(bfqq);
bfq_release_process_ref(bfqd, bfqq);
}
static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync,
@@ -5701,9 +5703,7 @@ bfq_do_early_stable_merge(struct bfq_data *bfqd, struct bfq_queue *bfqq,
* state before killing it.
*/
bfqq->bic = bic;
bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);
return new_bfqq;
return bfq_merge_bfqqs(bfqd, bic, bfqq);
}
/*
@@ -6158,6 +6158,7 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
bool waiting, idle_timer_disabled = false;
if (new_bfqq) {
struct bfq_queue *old_bfqq = bfqq;
/*
* Release the request's reference to the old bfqq
* and make sure one is taken to the shared queue.
@@ -6174,18 +6175,18 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
* new_bfqq.
*/
if (bic_to_bfqq(RQ_BIC(rq), true,
bfq_actuator_index(bfqd, rq->bio)) == bfqq)
bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
bfqq, new_bfqq);
bfq_actuator_index(bfqd, rq->bio)) == bfqq) {
while (bfqq != new_bfqq)
bfqq = bfq_merge_bfqqs(bfqd, RQ_BIC(rq), bfqq);
}
bfq_clear_bfqq_just_created(bfqq);
bfq_clear_bfqq_just_created(old_bfqq);
/*
* rq is about to be enqueued into new_bfqq,
* release rq reference on bfqq
*/
bfq_put_queue(bfqq);
bfq_put_queue(old_bfqq);
rq->elv.priv[1] = new_bfqq;
bfqq = new_bfqq;
}
bfq_update_io_thinktime(bfqd, bfqq);
@@ -6723,7 +6724,7 @@ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
{
bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
if (bfqq_process_refs(bfqq) == 1) {
if (bfqq_process_refs(bfqq) == 1 && !bfqq->new_bfqq) {
bfqq->pid = current->pid;
bfq_clear_bfqq_coop(bfqq);
bfq_clear_bfqq_split_coop(bfqq);
@@ -6733,16 +6734,13 @@ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
bic_set_bfqq(bic, NULL, true, bfqq->actuator_idx);
bfq_put_cooperator(bfqq);
bfq_release_process_ref(bfqq->bfqd, bfqq);
return NULL;
}
static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
struct bfq_io_cq *bic,
struct bio *bio,
bool split, bool is_sync,
bool *new_queue)
static struct bfq_queue *
__bfq_get_bfqq_handle_split(struct bfq_data *bfqd, struct bfq_io_cq *bic,
struct bio *bio, bool split, bool is_sync,
bool *new_queue)
{
unsigned int act_idx = bfq_actuator_index(bfqd, bio);
struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync, act_idx);
@@ -6821,6 +6819,84 @@ static void bfq_prepare_request(struct request *rq)
rq->elv.priv[0] = rq->elv.priv[1] = NULL;
}
static struct bfq_queue *bfq_waker_bfqq(struct bfq_queue *bfqq)
{
struct bfq_queue *new_bfqq = bfqq->new_bfqq;
struct bfq_queue *waker_bfqq = bfqq->waker_bfqq;
if (!waker_bfqq)
return NULL;
while (new_bfqq) {
if (new_bfqq == waker_bfqq) {
/*
* If waker_bfqq is in the merge chain, and current
* is the only procress.
*/
if (bfqq_process_refs(waker_bfqq) == 1)
return NULL;
break;
}
new_bfqq = new_bfqq->new_bfqq;
}
return waker_bfqq;
}
static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
struct bfq_io_cq *bic,
struct bio *bio,
unsigned int idx,
bool is_sync)
{
struct bfq_queue *waker_bfqq;
struct bfq_queue *bfqq;
bool new_queue = false;
bfqq = __bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync,
&new_queue);
if (unlikely(new_queue))
return bfqq;
/* If the queue was seeky for too long, break it apart. */
if (!bfq_bfqq_coop(bfqq) || !bfq_bfqq_split_coop(bfqq) ||
bic->bfqq_data[idx].stably_merged)
return bfqq;
waker_bfqq = bfq_waker_bfqq(bfqq);
/* Update bic before losing reference to bfqq */
if (bfq_bfqq_in_large_burst(bfqq))
bic->bfqq_data[idx].saved_in_large_burst = true;
bfqq = bfq_split_bfqq(bic, bfqq);
if (bfqq) {
bfq_bfqq_resume_state(bfqq, bfqd, bic, true);
return bfqq;
}
bfqq = __bfq_get_bfqq_handle_split(bfqd, bic, bio, true, is_sync, NULL);
if (unlikely(bfqq == &bfqd->oom_bfqq))
return bfqq;
bfq_bfqq_resume_state(bfqq, bfqd, bic, false);
bfqq->waker_bfqq = waker_bfqq;
bfqq->tentative_waker_bfqq = NULL;
/*
* If the waker queue disappears, then new_bfqq->waker_bfqq must be
* reset. So insert new_bfqq into the
* woken_list of the waker. See
* bfq_check_waker for details.
*/
if (waker_bfqq)
hlist_add_head(&bfqq->woken_list_node,
&bfqq->waker_bfqq->woken_list);
return bfqq;
}
/*
* If needed, init rq, allocate bfq data structures associated with
* rq, and increment reference counters in the destination bfq_queue
@@ -6852,8 +6928,6 @@ static struct bfq_queue *bfq_init_rq(struct request *rq)
struct bfq_io_cq *bic;
const int is_sync = rq_is_sync(rq);
struct bfq_queue *bfqq;
bool new_queue = false;
bool bfqq_already_existing = false, split = false;
unsigned int a_idx = bfq_actuator_index(bfqd, bio);
if (unlikely(!rq->elv.icq))
@@ -6870,54 +6944,9 @@ static struct bfq_queue *bfq_init_rq(struct request *rq)
return RQ_BFQQ(rq);
bic = icq_to_bic(rq->elv.icq);
bfq_check_ioprio_change(bic, bio);
bfq_bic_update_cgroup(bic, bio);
bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync,
&new_queue);
if (likely(!new_queue)) {
/* If the queue was seeky for too long, break it apart. */
if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq) &&
!bic->bfqq_data[a_idx].stably_merged) {
struct bfq_queue *old_bfqq = bfqq;
/* Update bic before losing reference to bfqq */
if (bfq_bfqq_in_large_burst(bfqq))
bic->bfqq_data[a_idx].saved_in_large_burst =
true;
bfqq = bfq_split_bfqq(bic, bfqq);
split = true;
if (!bfqq) {
bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio,
true, is_sync,
NULL);
if (unlikely(bfqq == &bfqd->oom_bfqq))
bfqq_already_existing = true;
} else
bfqq_already_existing = true;
if (!bfqq_already_existing) {
bfqq->waker_bfqq = old_bfqq->waker_bfqq;
bfqq->tentative_waker_bfqq = NULL;
/*
* If the waker queue disappears, then
* new_bfqq->waker_bfqq must be
* reset. So insert new_bfqq into the
* woken_list of the waker. See
* bfq_check_waker for details.
*/
if (bfqq->waker_bfqq)
hlist_add_head(&bfqq->woken_list_node,
&bfqq->waker_bfqq->woken_list);
}
}
}
bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, a_idx, is_sync);
bfqq_request_allocated(bfqq);
bfqq->ref++;
@@ -6934,18 +6963,9 @@ static struct bfq_queue *bfq_init_rq(struct request *rq)
* addition, if the queue has also just been split, we have to
* resume its state.
*/
if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) {
if (likely(bfqq != &bfqd->oom_bfqq) && !bfqq->new_bfqq &&
bfqq_process_refs(bfqq) == 1)
bfqq->bic = bic;
if (split) {
/*
* The queue has just been split from a shared
* queue: restore the idle window and the
* possible weight raising period.
*/
bfq_bfqq_resume_state(bfqq, bfqd, bic,
bfqq_already_existing);
}
}
/*
* Consider bfqq as possibly belonging to a burst of newly
+2 -6
View File
@@ -1156,6 +1156,8 @@ void bfq_del_bfqq_busy(struct bfq_queue *bfqq, bool expiration);
void bfq_add_bfqq_busy(struct bfq_queue *bfqq);
void bfq_add_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq);
void bfq_del_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq);
void bfq_reassign_last_bfqq(struct bfq_queue *cur_bfqq,
struct bfq_queue *new_bfqq);
/* --------------- end of interface of B-WF2Q+ ---------------- */
@@ -1183,11 +1185,6 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
"%s " fmt, pid_str, ##args); \
} while (0)
#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \
blk_add_cgroup_trace_msg((bfqd)->queue, \
&bfqg_to_blkg(bfqg)->blkcg->css, fmt, ##args); \
} while (0)
#else /* CONFIG_BFQ_GROUP_IOSCHED */
#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
@@ -1197,7 +1194,6 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
bfq_bfqq_name((bfqq), pid_str, MAX_BFQQ_NAME_LENGTH); \
blk_add_trace_msg((bfqd)->queue, "%s " fmt, pid_str, ##args); \
} while (0)
#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0)
#endif /* CONFIG_BFQ_GROUP_IOSCHED */
+87 -25
View File
@@ -934,7 +934,8 @@ static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page,
if (!zone_device_pages_have_same_pgmap(bv->bv_page, page))
return false;
*same_page = ((vec_end_addr & PAGE_MASK) == page_addr);
*same_page = ((vec_end_addr & PAGE_MASK) == ((page_addr + off) &
PAGE_MASK));
if (!*same_page) {
if (IS_ENABLED(CONFIG_KMSAN))
return false;
@@ -1019,6 +1020,29 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio,
return len;
}
/**
* bio_add_hw_folio - attempt to add a folio to a bio with hw constraints
* @q: the target queue
* @bio: destination bio
* @folio: folio to add
* @len: vec entry length
* @offset: vec entry offset in the folio
* @max_sectors: maximum number of sectors that can be added
* @same_page: return if the segment has been merged inside the same folio
*
* Add a folio to a bio while respecting the hardware max_sectors, max_segment
* and gap limitations.
*/
int bio_add_hw_folio(struct request_queue *q, struct bio *bio,
struct folio *folio, size_t len, size_t offset,
unsigned int max_sectors, bool *same_page)
{
if (len > UINT_MAX || offset > UINT_MAX)
return 0;
return bio_add_hw_page(q, bio, folio_page(folio, 0), len, offset,
max_sectors, same_page);
}
/**
* bio_add_pc_page - attempt to add page to passthrough bio
* @q: the target queue
@@ -1169,7 +1193,6 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty)
struct folio_iter fi;
bio_for_each_folio_all(fi, bio) {
struct page *page;
size_t nr_pages;
if (mark_dirty) {
@@ -1177,12 +1200,9 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty)
folio_mark_dirty(fi.folio);
folio_unlock(fi.folio);
}
page = folio_page(fi.folio, fi.offset / PAGE_SIZE);
nr_pages = (fi.offset + fi.length - 1) / PAGE_SIZE -
fi.offset / PAGE_SIZE + 1;
do {
bio_release_page(bio, page++);
} while (--nr_pages != 0);
unpin_user_folio(fi.folio, nr_pages);
}
}
EXPORT_SYMBOL_GPL(__bio_release_pages);
@@ -1207,8 +1227,8 @@ void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
bio_set_flag(bio, BIO_CLONED);
}
static int bio_iov_add_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int offset)
static int bio_iov_add_folio(struct bio *bio, struct folio *folio, size_t len,
size_t offset)
{
bool same_page = false;
@@ -1217,30 +1237,61 @@ static int bio_iov_add_page(struct bio *bio, struct page *page,
if (bio->bi_vcnt > 0 &&
bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1],
page, len, offset, &same_page)) {
folio_page(folio, 0), len, offset,
&same_page)) {
bio->bi_iter.bi_size += len;
if (same_page)
bio_release_page(bio, page);
if (same_page && bio_flagged(bio, BIO_PAGE_PINNED))
unpin_user_folio(folio, 1);
return 0;
}
__bio_add_page(bio, page, len, offset);
bio_add_folio_nofail(bio, folio, len, offset);
return 0;
}
static int bio_iov_add_zone_append_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int offset)
static int bio_iov_add_zone_append_folio(struct bio *bio, struct folio *folio,
size_t len, size_t offset)
{
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
bool same_page = false;
if (bio_add_hw_page(q, bio, page, len, offset,
if (bio_add_hw_folio(q, bio, folio, len, offset,
queue_max_zone_append_sectors(q), &same_page) != len)
return -EINVAL;
if (same_page)
bio_release_page(bio, page);
if (same_page && bio_flagged(bio, BIO_PAGE_PINNED))
unpin_user_folio(folio, 1);
return 0;
}
static unsigned int get_contig_folio_len(unsigned int *num_pages,
struct page **pages, unsigned int i,
struct folio *folio, size_t left,
size_t offset)
{
size_t bytes = left;
size_t contig_sz = min_t(size_t, PAGE_SIZE - offset, bytes);
unsigned int j;
/*
* We might COW a single page in the middle of
* a large folio, so we have to check that all
* pages belong to the same folio.
*/
bytes -= contig_sz;
for (j = i + 1; j < i + *num_pages; j++) {
size_t next = min_t(size_t, PAGE_SIZE, bytes);
if (page_folio(pages[j]) != folio ||
pages[j] != pages[j - 1] + 1) {
break;
}
contig_sz += next;
bytes -= next;
}
*num_pages = j - i;
return contig_sz;
}
#define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *))
/**
@@ -1260,9 +1311,9 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
struct page **pages = (struct page **)bv;
ssize_t size, left;
unsigned len, i = 0;
size_t offset;
ssize_t size;
unsigned int num_pages, i = 0;
size_t offset, folio_offset, left, len;
int ret = 0;
/*
@@ -1302,17 +1353,28 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
goto out;
}
for (left = size, i = 0; left > 0; left -= len, i++) {
for (left = size, i = 0; left > 0; left -= len, i += num_pages) {
struct page *page = pages[i];
struct folio *folio = page_folio(page);
folio_offset = ((size_t)folio_page_idx(folio, page) <<
PAGE_SHIFT) + offset;
len = min(folio_size(folio) - folio_offset, left);
num_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
if (num_pages > 1)
len = get_contig_folio_len(&num_pages, pages, i,
folio, left, offset);
len = min_t(size_t, PAGE_SIZE - offset, left);
if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
ret = bio_iov_add_zone_append_page(bio, page, len,
offset);
ret = bio_iov_add_zone_append_folio(bio, folio, len,
folio_offset);
if (ret)
break;
} else
bio_iov_add_page(bio, page, len, offset);
bio_iov_add_folio(bio, folio, len, folio_offset);
offset = 0;
}
+13 -10
View File
@@ -1458,7 +1458,6 @@ int blkcg_init_disk(struct gendisk *disk)
struct request_queue *q = disk->queue;
struct blkcg_gq *new_blkg, *blkg;
bool preloaded;
int ret;
new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL);
if (!new_blkg)
@@ -1478,15 +1477,8 @@ int blkcg_init_disk(struct gendisk *disk)
if (preloaded)
radix_tree_preload_end();
ret = blk_ioprio_init(disk);
if (ret)
goto err_destroy_all;
return 0;
err_destroy_all:
blkg_destroy_all(disk);
return ret;
err_unlock:
spin_unlock_irq(&q->queue_lock);
if (preloaded)
@@ -1554,6 +1546,14 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
if (blkcg_policy_enabled(q, pol))
return 0;
/*
* Policy is allowed to be registered without pd_alloc_fn/pd_free_fn,
* for example, ioprio. Such policy will work on blkcg level, not disk
* level, and don't need to be activated.
*/
if (WARN_ON_ONCE(!pol->pd_alloc_fn || !pol->pd_free_fn))
return -EINVAL;
if (queue_is_mq(q))
blk_mq_freeze_queue(q);
retry:
@@ -1733,9 +1733,12 @@ int blkcg_policy_register(struct blkcg_policy *pol)
goto err_unlock;
}
/* Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs */
/*
* Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs, and policy
* without pd_alloc_fn/pd_free_fn can't be activated.
*/
if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) ||
(!pol->pd_alloc_fn ^ !pol->pd_free_fn))
(!pol->pd_alloc_fn ^ !pol->pd_free_fn))
goto err_unlock;
/* register @pol */
-1
View File
@@ -485,7 +485,6 @@ static inline void blkcg_deactivate_policy(struct gendisk *disk,
static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
struct blkcg_policy *pol) { return NULL; }
static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; }
static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
static inline void blkg_get(struct blkcg_gq *blkg) { }
static inline void blkg_put(struct blkcg_gq *blkg) { }
static inline void blkcg_bio_issue_init(struct bio *bio) { }
+6 -4
View File
@@ -648,7 +648,7 @@ static const struct ioc_params autop[] = {
* vrate adjust percentages indexed by ioc->busy_level. We adjust up on
* vtime credit shortage and down on device saturation.
*/
static u32 vrate_adj_pct[] =
static const u32 vrate_adj_pct[] =
{ 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
@@ -2076,7 +2076,7 @@ static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors,
struct ioc_now *now)
{
struct ioc_gq *iocg;
u64 dur, usage_pct, nr_cycles;
u64 dur, usage_pct, nr_cycles, nr_cycles_shift;
/* if no debtor, reset the cycle */
if (!nr_debtors) {
@@ -2138,10 +2138,12 @@ static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors,
old_debt = iocg->abs_vdebt;
old_delay = iocg->delay;
nr_cycles_shift = min_t(u64, nr_cycles, BITS_PER_LONG - 1);
if (iocg->abs_vdebt)
iocg->abs_vdebt = iocg->abs_vdebt >> nr_cycles ?: 1;
iocg->abs_vdebt = iocg->abs_vdebt >> nr_cycles_shift ?: 1;
if (iocg->delay)
iocg->delay = iocg->delay >> nr_cycles ?: 1;
iocg->delay = iocg->delay >> nr_cycles_shift ?: 1;
iocg_kick_waitq(iocg, true, now);
+1 -56
View File
@@ -49,14 +49,6 @@ static const char *policy_name[] = {
static struct blkcg_policy ioprio_policy;
/**
* struct ioprio_blkg - Per (cgroup, request queue) data.
* @pd: blkg_policy_data structure.
*/
struct ioprio_blkg {
struct blkg_policy_data pd;
};
/**
* struct ioprio_blkcg - Per cgroup data.
* @cpd: blkcg_policy_data structure.
@@ -67,11 +59,6 @@ struct ioprio_blkcg {
enum prio_policy prio_policy;
};
static inline struct ioprio_blkg *pd_to_ioprio(struct blkg_policy_data *pd)
{
return pd ? container_of(pd, struct ioprio_blkg, pd) : NULL;
}
static struct ioprio_blkcg *blkcg_to_ioprio_blkcg(struct blkcg *blkcg)
{
return container_of(blkcg_to_cpd(blkcg, &ioprio_policy),
@@ -84,16 +71,6 @@ ioprio_blkcg_from_css(struct cgroup_subsys_state *css)
return blkcg_to_ioprio_blkcg(css_to_blkcg(css));
}
static struct ioprio_blkcg *ioprio_blkcg_from_bio(struct bio *bio)
{
struct blkg_policy_data *pd = blkg_to_pd(bio->bi_blkg, &ioprio_policy);
if (!pd)
return NULL;
return blkcg_to_ioprio_blkcg(pd->blkg->blkcg);
}
static int ioprio_show_prio_policy(struct seq_file *sf, void *v)
{
struct ioprio_blkcg *blkcg = ioprio_blkcg_from_css(seq_css(sf));
@@ -118,25 +95,6 @@ static ssize_t ioprio_set_prio_policy(struct kernfs_open_file *of, char *buf,
return nbytes;
}
static struct blkg_policy_data *
ioprio_alloc_pd(struct gendisk *disk, struct blkcg *blkcg, gfp_t gfp)
{
struct ioprio_blkg *ioprio_blkg;
ioprio_blkg = kzalloc(sizeof(*ioprio_blkg), gfp);
if (!ioprio_blkg)
return NULL;
return &ioprio_blkg->pd;
}
static void ioprio_free_pd(struct blkg_policy_data *pd)
{
struct ioprio_blkg *ioprio_blkg = pd_to_ioprio(pd);
kfree(ioprio_blkg);
}
static struct blkcg_policy_data *ioprio_alloc_cpd(gfp_t gfp)
{
struct ioprio_blkcg *blkcg;
@@ -179,14 +137,11 @@ static struct blkcg_policy ioprio_policy = {
.cpd_alloc_fn = ioprio_alloc_cpd,
.cpd_free_fn = ioprio_free_cpd,
.pd_alloc_fn = ioprio_alloc_pd,
.pd_free_fn = ioprio_free_pd,
};
void blkcg_set_ioprio(struct bio *bio)
{
struct ioprio_blkcg *blkcg = ioprio_blkcg_from_bio(bio);
struct ioprio_blkcg *blkcg = blkcg_to_ioprio_blkcg(bio->bi_blkg->blkcg);
u16 prio;
if (!blkcg || blkcg->prio_policy == POLICY_NO_CHANGE)
@@ -219,16 +174,6 @@ void blkcg_set_ioprio(struct bio *bio)
bio->bi_ioprio = prio;
}
void blk_ioprio_exit(struct gendisk *disk)
{
blkcg_deactivate_policy(disk, &ioprio_policy);
}
int blk_ioprio_init(struct gendisk *disk)
{
return blkcg_activate_policy(disk, &ioprio_policy);
}
static int __init ioprio_init(void)
{
return blkcg_policy_register(&ioprio_policy);
-9
View File
@@ -9,17 +9,8 @@ struct request_queue;
struct bio;
#ifdef CONFIG_BLK_CGROUP_IOPRIO
int blk_ioprio_init(struct gendisk *disk);
void blk_ioprio_exit(struct gendisk *disk);
void blkcg_set_ioprio(struct bio *bio);
#else
static inline int blk_ioprio_init(struct gendisk *disk)
{
return 0;
}
static inline void blk_ioprio_exit(struct gendisk *disk)
{
}
static inline void blkcg_set_ioprio(struct bio *bio)
{
}
+74 -88
View File
@@ -105,9 +105,33 @@ static unsigned int bio_allowed_max_sectors(const struct queue_limits *lim)
return round_down(UINT_MAX, lim->logical_block_size) >> SECTOR_SHIFT;
}
static struct bio *bio_split_discard(struct bio *bio,
const struct queue_limits *lim,
unsigned *nsegs, struct bio_set *bs)
static struct bio *bio_submit_split(struct bio *bio, int split_sectors)
{
if (unlikely(split_sectors < 0)) {
bio->bi_status = errno_to_blk_status(split_sectors);
bio_endio(bio);
return NULL;
}
if (split_sectors) {
struct bio *split;
split = bio_split(bio, split_sectors, GFP_NOIO,
&bio->bi_bdev->bd_disk->bio_split);
split->bi_opf |= REQ_NOMERGE;
blkcg_bio_issue_init(split);
bio_chain(split, bio);
trace_block_split(split, bio->bi_iter.bi_sector);
WARN_ON_ONCE(bio_zone_write_plugging(bio));
submit_bio_noacct(bio);
return split;
}
return bio;
}
struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim,
unsigned *nsegs)
{
unsigned int max_discard_sectors, granularity;
sector_t tmp;
@@ -121,10 +145,10 @@ static struct bio *bio_split_discard(struct bio *bio,
min(lim->max_discard_sectors, bio_allowed_max_sectors(lim));
max_discard_sectors -= max_discard_sectors % granularity;
if (unlikely(!max_discard_sectors))
return NULL;
return bio;
if (bio_sectors(bio) <= max_discard_sectors)
return NULL;
return bio;
split_sectors = max_discard_sectors;
@@ -139,19 +163,18 @@ static struct bio *bio_split_discard(struct bio *bio,
if (split_sectors > tmp)
split_sectors -= tmp;
return bio_split(bio, split_sectors, GFP_NOIO, bs);
return bio_submit_split(bio, split_sectors);
}
static struct bio *bio_split_write_zeroes(struct bio *bio,
const struct queue_limits *lim,
unsigned *nsegs, struct bio_set *bs)
struct bio *bio_split_write_zeroes(struct bio *bio,
const struct queue_limits *lim, unsigned *nsegs)
{
*nsegs = 0;
if (!lim->max_write_zeroes_sectors)
return NULL;
return bio;
if (bio_sectors(bio) <= lim->max_write_zeroes_sectors)
return NULL;
return bio_split(bio, lim->max_write_zeroes_sectors, GFP_NOIO, bs);
return bio;
return bio_submit_split(bio, lim->max_write_zeroes_sectors);
}
static inline unsigned int blk_boundary_sectors(const struct queue_limits *lim,
@@ -274,27 +297,19 @@ static bool bvec_split_segs(const struct queue_limits *lim,
}
/**
* bio_split_rw - split a bio in two bios
* bio_split_rw_at - check if and where to split a read/write bio
* @bio: [in] bio to be split
* @lim: [in] queue limits to split based on
* @segs: [out] number of segments in the bio with the first half of the sectors
* @bs: [in] bio set to allocate the clone from
* @max_bytes: [in] maximum number of bytes per bio
*
* Clone @bio, update the bi_iter of the clone to represent the first sectors
* of @bio and update @bio->bi_iter to represent the remaining sectors. The
* following is guaranteed for the cloned bio:
* - That it has at most @max_bytes worth of data
* - That it has at most queue_max_segments(@q) segments.
*
* Except for discard requests the cloned bio will point at the bi_io_vec of
* the original bio. It is the responsibility of the caller to ensure that the
* original bio is not freed before the cloned bio. The caller is also
* responsible for ensuring that @bs is only destroyed after processing of the
* split bio has finished.
* Find out if @bio needs to be split to fit the queue limits in @lim and a
* maximum size of @max_bytes. Returns a negative error number if @bio can't be
* split, 0 if the bio doesn't have to be split, or a positive sector offset if
* @bio needs to be split.
*/
struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
unsigned *segs, struct bio_set *bs, unsigned max_bytes)
int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim,
unsigned *segs, unsigned max_bytes)
{
struct bio_vec bv, bvprv, *bvprvp = NULL;
struct bvec_iter iter;
@@ -324,22 +339,17 @@ struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
}
*segs = nsegs;
return NULL;
return 0;
split:
if (bio->bi_opf & REQ_ATOMIC) {
bio->bi_status = BLK_STS_INVAL;
bio_endio(bio);
return ERR_PTR(-EINVAL);
}
if (bio->bi_opf & REQ_ATOMIC)
return -EINVAL;
/*
* We can't sanely support splitting for a REQ_NOWAIT bio. End it
* with EAGAIN if splitting is required and return an error pointer.
*/
if (bio->bi_opf & REQ_NOWAIT) {
bio->bi_status = BLK_STS_AGAIN;
bio_endio(bio);
return ERR_PTR(-EAGAIN);
}
if (bio->bi_opf & REQ_NOWAIT)
return -EAGAIN;
*segs = nsegs;
@@ -356,58 +366,36 @@ split:
* big IO can be trival, disable iopoll when split needed.
*/
bio_clear_polled(bio);
return bio_split(bio, bytes >> SECTOR_SHIFT, GFP_NOIO, bs);
return bytes >> SECTOR_SHIFT;
}
EXPORT_SYMBOL_GPL(bio_split_rw);
EXPORT_SYMBOL_GPL(bio_split_rw_at);
/**
* __bio_split_to_limits - split a bio to fit the queue limits
* @bio: bio to be split
* @lim: queue limits to split based on
* @nr_segs: returns the number of segments in the returned bio
*
* Check if @bio needs splitting based on the queue limits, and if so split off
* a bio fitting the limits from the beginning of @bio and return it. @bio is
* shortened to the remainder and re-submitted.
*
* The split bio is allocated from @q->bio_split, which is provided by the
* block layer.
*/
struct bio *__bio_split_to_limits(struct bio *bio,
const struct queue_limits *lim,
unsigned int *nr_segs)
struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
unsigned *nr_segs)
{
struct bio_set *bs = &bio->bi_bdev->bd_disk->bio_split;
struct bio *split;
return bio_submit_split(bio,
bio_split_rw_at(bio, lim, nr_segs,
get_max_io_size(bio, lim) << SECTOR_SHIFT));
}
switch (bio_op(bio)) {
case REQ_OP_DISCARD:
case REQ_OP_SECURE_ERASE:
split = bio_split_discard(bio, lim, nr_segs, bs);
break;
case REQ_OP_WRITE_ZEROES:
split = bio_split_write_zeroes(bio, lim, nr_segs, bs);
break;
default:
split = bio_split_rw(bio, lim, nr_segs, bs,
get_max_io_size(bio, lim) << SECTOR_SHIFT);
if (IS_ERR(split))
return NULL;
break;
}
/*
* REQ_OP_ZONE_APPEND bios must never be split by the block layer.
*
* But we want the nr_segs calculation provided by bio_split_rw_at, and having
* a good sanity check that the submitter built the bio correctly is nice to
* have as well.
*/
struct bio *bio_split_zone_append(struct bio *bio,
const struct queue_limits *lim, unsigned *nr_segs)
{
unsigned int max_sectors = queue_limits_max_zone_append_sectors(lim);
int split_sectors;
if (split) {
/* there isn't chance to merge the split bio */
split->bi_opf |= REQ_NOMERGE;
blkcg_bio_issue_init(split);
bio_chain(split, bio);
trace_block_split(split, bio->bi_iter.bi_sector);
WARN_ON_ONCE(bio_zone_write_plugging(bio));
submit_bio_noacct(bio);
return split;
}
return bio;
split_sectors = bio_split_rw_at(bio, lim, nr_segs,
max_sectors << SECTOR_SHIFT);
if (WARN_ON_ONCE(split_sectors > 0))
split_sectors = -EINVAL;
return bio_submit_split(bio, split_sectors);
}
/**
@@ -426,9 +414,7 @@ struct bio *bio_split_to_limits(struct bio *bio)
const struct queue_limits *lim = &bdev_get_queue(bio->bi_bdev)->limits;
unsigned int nr_segs;
if (bio_may_exceed_limits(bio, lim))
return __bio_split_to_limits(bio, lim, &nr_segs);
return bio;
return __bio_split_to_limits(bio, lim, &nr_segs);
}
EXPORT_SYMBOL(bio_split_to_limits);
+8 -6
View File
@@ -2753,6 +2753,7 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
{
struct request *rq;
unsigned int depth;
/*
* We may have been called recursively midway through handling
@@ -2763,6 +2764,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
*/
if (plug->rq_count == 0)
return;
depth = plug->rq_count;
plug->rq_count = 0;
if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) {
@@ -2770,6 +2772,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
rq = rq_list_peek(&plug->mq_list);
q = rq->q;
trace_block_unplug(q, depth, true);
/*
* Peek first request and see if we have a ->queue_rqs() hook.
@@ -2939,7 +2942,7 @@ void blk_mq_submit_bio(struct bio *bio)
struct blk_plug *plug = current->plug;
const int is_sync = op_is_sync(bio->bi_opf);
struct blk_mq_hw_ctx *hctx;
unsigned int nr_segs = 1;
unsigned int nr_segs;
struct request *rq;
blk_status_t ret;
@@ -2981,11 +2984,10 @@ void blk_mq_submit_bio(struct bio *bio)
goto queue_exit;
}
if (unlikely(bio_may_exceed_limits(bio, &q->limits))) {
bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
if (!bio)
goto queue_exit;
}
bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
if (!bio)
goto queue_exit;
if (!bio_integrity_prep(bio))
goto queue_exit;
+1 -1
View File
@@ -263,7 +263,7 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data,
has_sleeper = !prepare_to_wait_exclusive(&rqw->wait, &data.wq,
TASK_UNINTERRUPTIBLE);
do {
/* The memory barrier in set_task_state saves us here. */
/* The memory barrier in set_current_state saves us here. */
if (data.got_token)
break;
if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) {
+41 -26
View File
@@ -1584,6 +1584,22 @@ void blk_throtl_cancel_bios(struct gendisk *disk)
spin_unlock_irq(&q->queue_lock);
}
static bool tg_within_limit(struct throtl_grp *tg, struct bio *bio, bool rw)
{
/* throtl is FIFO - if bios are already queued, should queue */
if (tg->service_queue.nr_queued[rw])
return false;
return tg_may_dispatch(tg, bio, NULL);
}
static void tg_dispatch_in_debt(struct throtl_grp *tg, struct bio *bio, bool rw)
{
if (!bio_flagged(bio, BIO_BPS_THROTTLED))
tg->carryover_bytes[rw] -= throtl_bio_data_size(bio);
tg->carryover_ios[rw]--;
}
bool __blk_throtl_bio(struct bio *bio)
{
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
@@ -1600,34 +1616,35 @@ bool __blk_throtl_bio(struct bio *bio)
sq = &tg->service_queue;
while (true) {
if (tg->last_low_overflow_time[rw] == 0)
tg->last_low_overflow_time[rw] = jiffies;
/* throtl is FIFO - if bios are already queued, should queue */
if (sq->nr_queued[rw])
break;
if (tg_within_limit(tg, bio, rw)) {
/* within limits, let's charge and dispatch directly */
throtl_charge_bio(tg, bio);
/* if above limits, break to queue */
if (!tg_may_dispatch(tg, bio, NULL)) {
tg->last_low_overflow_time[rw] = jiffies;
/*
* We need to trim slice even when bios are not being
* queued otherwise it might happen that a bio is not
* queued for a long time and slice keeps on extending
* and trim is not called for a long time. Now if limits
* are reduced suddenly we take into account all the IO
* dispatched so far at new low rate and * newly queued
* IO gets a really long dispatch time.
*
* So keep on trimming slice even if bio is not queued.
*/
throtl_trim_slice(tg, rw);
} else if (bio_issue_as_root_blkg(bio)) {
/*
* IOs which may cause priority inversions are
* dispatched directly, even if they're over limit.
* Debts are handled by carryover_bytes/ios while
* calculating wait time.
*/
tg_dispatch_in_debt(tg, bio, rw);
} else {
/* if above limits, break to queue */
break;
}
/* within limits, let's charge and dispatch directly */
throtl_charge_bio(tg, bio);
/*
* We need to trim slice even when bios are not being queued
* otherwise it might happen that a bio is not queued for
* a long time and slice keeps on extending and trim is not
* called for a long time. Now if limits are reduced suddenly
* we take into account all the IO dispatched so far at new
* low rate and * newly queued IO gets a really long dispatch
* time.
*
* So keep on trimming slice even if bio is not queued.
*/
throtl_trim_slice(tg, rw);
/*
* @bio passed through this layer without being throttled.
* Climb up the ladder. If we're already at the top, it
@@ -1650,8 +1667,6 @@ bool __blk_throtl_bio(struct bio *bio)
tg->io_disp[rw], tg_iops_limit(tg, rw),
sq->nr_queued[READ], sq->nr_queued[WRITE]);
tg->last_low_overflow_time[rw] = jiffies;
td->nr_queued[rw]++;
throtl_add_bio_tg(bio, qn, tg);
throttled = true;
-2
View File
@@ -106,8 +106,6 @@ struct throtl_grp {
/* Number of bio's dispatched in current slice */
unsigned int io_disp[2];
unsigned long last_low_overflow_time[2];
uint64_t last_bytes_disp[2];
unsigned int last_io_disp[2];
+61 -22
View File
@@ -331,33 +331,67 @@ ssize_t part_timeout_show(struct device *, struct device_attribute *, char *);
ssize_t part_timeout_store(struct device *, struct device_attribute *,
const char *, size_t);
static inline bool bio_may_exceed_limits(struct bio *bio,
const struct queue_limits *lim)
{
switch (bio_op(bio)) {
case REQ_OP_DISCARD:
case REQ_OP_SECURE_ERASE:
case REQ_OP_WRITE_ZEROES:
return true; /* non-trivial splitting decisions */
default:
break;
}
struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim,
unsigned *nsegs);
struct bio *bio_split_write_zeroes(struct bio *bio,
const struct queue_limits *lim, unsigned *nsegs);
struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
unsigned *nr_segs);
struct bio *bio_split_zone_append(struct bio *bio,
const struct queue_limits *lim, unsigned *nr_segs);
/*
* All drivers must accept single-segments bios that are <= PAGE_SIZE.
* This is a quick and dirty check that relies on the fact that
* bi_io_vec[0] is always valid if a bio has data. The check might
* lead to occasional false negatives when bios are cloned, but compared
* to the performance impact of cloned bios themselves the loop below
* doesn't matter anyway.
*/
/*
* All drivers must accept single-segments bios that are smaller than PAGE_SIZE.
*
* This is a quick and dirty check that relies on the fact that bi_io_vec[0] is
* always valid if a bio has data. The check might lead to occasional false
* positives when bios are cloned, but compared to the performance impact of
* cloned bios themselves the loop below doesn't matter anyway.
*/
static inline bool bio_may_need_split(struct bio *bio,
const struct queue_limits *lim)
{
return lim->chunk_sectors || bio->bi_vcnt != 1 ||
bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > PAGE_SIZE;
}
struct bio *__bio_split_to_limits(struct bio *bio,
const struct queue_limits *lim,
unsigned int *nr_segs);
/**
* __bio_split_to_limits - split a bio to fit the queue limits
* @bio: bio to be split
* @lim: queue limits to split based on
* @nr_segs: returns the number of segments in the returned bio
*
* Check if @bio needs splitting based on the queue limits, and if so split off
* a bio fitting the limits from the beginning of @bio and return it. @bio is
* shortened to the remainder and re-submitted.
*
* The split bio is allocated from @q->bio_split, which is provided by the
* block layer.
*/
static inline struct bio *__bio_split_to_limits(struct bio *bio,
const struct queue_limits *lim, unsigned int *nr_segs)
{
switch (bio_op(bio)) {
case REQ_OP_READ:
case REQ_OP_WRITE:
if (bio_may_need_split(bio, lim))
return bio_split_rw(bio, lim, nr_segs);
*nr_segs = 1;
return bio;
case REQ_OP_ZONE_APPEND:
return bio_split_zone_append(bio, lim, nr_segs);
case REQ_OP_DISCARD:
case REQ_OP_SECURE_ERASE:
return bio_split_discard(bio, lim, nr_segs);
case REQ_OP_WRITE_ZEROES:
return bio_split_write_zeroes(bio, lim, nr_segs);
default:
/* other operations can't be split */
*nr_segs = 0;
return bio;
}
}
int ll_back_merge_fn(struct request *req, struct bio *bio,
unsigned int nr_segs);
bool blk_attempt_req_merge(struct request_queue *q, struct request *rq,
@@ -540,6 +574,10 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio,
struct page *page, unsigned int len, unsigned int offset,
unsigned int max_sectors, bool *same_page);
int bio_add_hw_folio(struct request_queue *q, struct bio *bio,
struct folio *folio, size_t len, size_t offset,
unsigned int max_sectors, bool *same_page);
/*
* Clean up a page appropriately, where the page may be pinned, may have a
* ref taken on it or neither.
@@ -571,6 +609,7 @@ blk_mode_t file_to_blk_mode(struct file *file);
int truncate_bdev_range(struct block_device *bdev, blk_mode_t mode,
loff_t lstart, loff_t lend);
long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg);
int blkdev_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags);
long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg);
extern const struct address_space_operations def_blk_aops;
+2
View File
@@ -17,6 +17,7 @@
#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/module.h>
#include <linux/io_uring/cmd.h>
#include "blk.h"
static inline struct inode *bdev_file_inode(struct file *file)
@@ -865,6 +866,7 @@ const struct file_operations def_blk_fops = {
.splice_read = filemap_splice_read,
.splice_write = iter_file_splice_write,
.fallocate = blkdev_fallocate,
.uring_cmd = blkdev_uring_cmd,
.fop_flags = FOP_BUFFER_RASYNC,
};
+147 -21
View File
@@ -11,6 +11,9 @@
#include <linux/blktrace_api.h>
#include <linux/pr.h>
#include <linux/uaccess.h>
#include <linux/pagemap.h>
#include <linux/io_uring/cmd.h>
#include <uapi/linux/blkdev.h>
#include "blk.h"
static int blkpg_do_ioctl(struct block_device *bdev,
@@ -92,38 +95,51 @@ static int compat_blkpg_ioctl(struct block_device *bdev,
}
#endif
/*
* Check that [start, start + len) is a valid range from the block device's
* perspective, including verifying that it can be correctly translated into
* logical block addresses.
*/
static int blk_validate_byte_range(struct block_device *bdev,
uint64_t start, uint64_t len)
{
unsigned int bs_mask = bdev_logical_block_size(bdev) - 1;
uint64_t end;
if ((start | len) & bs_mask)
return -EINVAL;
if (!len)
return -EINVAL;
if (check_add_overflow(start, len, &end) || end > bdev_nr_bytes(bdev))
return -EINVAL;
return 0;
}
static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
unsigned long arg)
{
unsigned int bs_mask = bdev_logical_block_size(bdev) - 1;
uint64_t range[2], start, len, end;
uint64_t range[2], start, len;
struct bio *prev = NULL, *bio;
sector_t sector, nr_sects;
struct blk_plug plug;
int err;
if (!(mode & BLK_OPEN_WRITE))
return -EBADF;
if (!bdev_max_discard_sectors(bdev))
return -EOPNOTSUPP;
if (bdev_read_only(bdev))
return -EPERM;
if (copy_from_user(range, (void __user *)arg, sizeof(range)))
return -EFAULT;
start = range[0];
len = range[1];
if (!len)
return -EINVAL;
if ((start | len) & bs_mask)
return -EINVAL;
if (!bdev_max_discard_sectors(bdev))
return -EOPNOTSUPP;
if (check_add_overflow(start, len, &end) ||
end > bdev_nr_bytes(bdev))
return -EINVAL;
if (!(mode & BLK_OPEN_WRITE))
return -EBADF;
if (bdev_read_only(bdev))
return -EPERM;
err = blk_validate_byte_range(bdev, start, len);
if (err)
return err;
filemap_invalidate_lock(bdev->bd_mapping);
err = truncate_bdev_range(bdev, mode, start, start + len - 1);
@@ -163,7 +179,7 @@ fail:
static int blk_ioctl_secure_erase(struct block_device *bdev, blk_mode_t mode,
void __user *argp)
{
uint64_t start, len;
uint64_t start, len, end;
uint64_t range[2];
int err;
@@ -178,11 +194,12 @@ static int blk_ioctl_secure_erase(struct block_device *bdev, blk_mode_t mode,
len = range[1];
if ((start & 511) || (len & 511))
return -EINVAL;
if (start + len > bdev_nr_bytes(bdev))
if (check_add_overflow(start, len, &end) ||
end > bdev_nr_bytes(bdev))
return -EINVAL;
filemap_invalidate_lock(bdev->bd_mapping);
err = truncate_bdev_range(bdev, mode, start, start + len - 1);
err = truncate_bdev_range(bdev, mode, start, end - 1);
if (!err)
err = blkdev_issue_secure_erase(bdev, start >> 9, len >> 9,
GFP_KERNEL);
@@ -734,3 +751,112 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
return ret;
}
#endif
struct blk_iou_cmd {
int res;
bool nowait;
};
static void blk_cmd_complete(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd);
if (bic->res == -EAGAIN && bic->nowait)
io_uring_cmd_issue_blocking(cmd);
else
io_uring_cmd_done(cmd, bic->res, 0, issue_flags);
}
static void bio_cmd_bio_end_io(struct bio *bio)
{
struct io_uring_cmd *cmd = bio->bi_private;
struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd);
if (unlikely(bio->bi_status) && !bic->res)
bic->res = blk_status_to_errno(bio->bi_status);
io_uring_cmd_do_in_task_lazy(cmd, blk_cmd_complete);
bio_put(bio);
}
static int blkdev_cmd_discard(struct io_uring_cmd *cmd,
struct block_device *bdev,
uint64_t start, uint64_t len, bool nowait)
{
struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd);
gfp_t gfp = nowait ? GFP_NOWAIT : GFP_KERNEL;
sector_t sector = start >> SECTOR_SHIFT;
sector_t nr_sects = len >> SECTOR_SHIFT;
struct bio *prev = NULL, *bio;
int err;
if (!bdev_max_discard_sectors(bdev))
return -EOPNOTSUPP;
if (!(file_to_blk_mode(cmd->file) & BLK_OPEN_WRITE))
return -EBADF;
if (bdev_read_only(bdev))
return -EPERM;
err = blk_validate_byte_range(bdev, start, len);
if (err)
return err;
err = filemap_invalidate_pages(bdev->bd_mapping, start,
start + len - 1, nowait);
if (err)
return err;
while (true) {
bio = blk_alloc_discard_bio(bdev, &sector, &nr_sects, gfp);
if (!bio)
break;
if (nowait) {
/*
* Don't allow multi-bio non-blocking submissions as
* subsequent bios may fail but we won't get a direct
* indication of that. Normally, the caller should
* retry from a blocking context.
*/
if (unlikely(nr_sects)) {
bio_put(bio);
return -EAGAIN;
}
bio->bi_opf |= REQ_NOWAIT;
}
prev = bio_chain_and_submit(prev, bio);
}
if (unlikely(!prev))
return -EAGAIN;
if (unlikely(nr_sects))
bic->res = -EAGAIN;
prev->bi_private = cmd;
prev->bi_end_io = bio_cmd_bio_end_io;
submit_bio(prev);
return -EIOCBQUEUED;
}
int blkdev_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
struct block_device *bdev = I_BDEV(cmd->file->f_mapping->host);
struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd);
const struct io_uring_sqe *sqe = cmd->sqe;
u32 cmd_op = cmd->cmd_op;
uint64_t start, len;
if (unlikely(sqe->ioprio || sqe->__pad1 || sqe->len ||
sqe->rw_flags || sqe->file_index))
return -EINVAL;
bic->res = 0;
bic->nowait = issue_flags & IO_URING_F_NONBLOCK;
start = READ_ONCE(sqe->addr);
len = READ_ONCE(sqe->addr3);
switch (cmd_op) {
case BLOCK_URING_CMD_DISCARD:
return blkdev_cmd_discard(cmd, bdev, start, len, bic->nowait);
}
return -EINVAL;
}
+5 -3
View File
@@ -555,9 +555,11 @@ static bool blk_add_partition(struct gendisk *disk,
part = add_partition(disk, p, from, size, state->parts[p].flags,
&state->parts[p].info);
if (IS_ERR(part) && PTR_ERR(part) != -ENXIO) {
printk(KERN_ERR " %s: p%d could not be added: %pe\n",
disk->disk_name, p, part);
if (IS_ERR(part)) {
if (PTR_ERR(part) != -ENXIO) {
printk(KERN_ERR " %s: p%d could not be added: %pe\n",
disk->disk_name, p, part);
}
return true;
}
+2 -6
View File
@@ -8,7 +8,6 @@
#include <linux/blk-integrity.h>
#include <linux/crc-t10dif.h>
#include <linux/crc64.h>
#include <linux/module.h>
#include <net/checksum.h>
#include <asm/unaligned.h>
#include "blk.h"
@@ -240,9 +239,9 @@ static void ext_pi_crc64_generate(struct blk_integrity_iter *iter,
}
}
static bool ext_pi_ref_escape(u8 *ref_tag)
static bool ext_pi_ref_escape(const u8 ref_tag[6])
{
static u8 ref_escape[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
static const u8 ref_escape[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
return memcmp(ref_tag, ref_escape, sizeof(ref_escape)) == 0;
}
@@ -472,6 +471,3 @@ void blk_integrity_complete(struct request *rq, unsigned int nr_bytes)
else
t10_pi_type1_complete(rq, nr_bytes);
}
MODULE_DESCRIPTION("T10 Protection Information module");
MODULE_LICENSE("GPL");
-11
View File
@@ -297,10 +297,6 @@ struct drbd_epoch {
unsigned long flags;
};
/* Prototype declaration of function defined in drbd_receiver.c */
int drbdd_init(struct drbd_thread *);
int drbd_asender(struct drbd_thread *);
/* drbd_epoch flag bits */
enum {
DE_HAVE_BARRIER_NUMBER,
@@ -864,7 +860,6 @@ struct drbd_device {
struct list_head read_ee; /* [RS]P_DATA_REQUEST being read */
struct list_head net_ee; /* zero-copy network send in progress */
int next_barrier_nr;
struct list_head resync_reads;
atomic_t pp_in_use; /* allocated from page pool */
atomic_t pp_in_use_by_net; /* sendpage()d, still referenced by tcp */
@@ -1390,9 +1385,6 @@ extern void conn_free_crypto(struct drbd_connection *connection);
extern void do_submit(struct work_struct *ws);
extern void __drbd_make_request(struct drbd_device *, struct bio *);
void drbd_submit_bio(struct bio *bio);
extern int drbd_read_remote(struct drbd_device *device, struct drbd_request *req);
extern int is_valid_ar_handle(struct drbd_request *, sector_t);
/* drbd_nl.c */
@@ -1474,7 +1466,6 @@ extern int w_resync_timer(struct drbd_work *, int);
extern int w_send_write_hint(struct drbd_work *, int);
extern int w_send_dblock(struct drbd_work *, int);
extern int w_send_read_req(struct drbd_work *, int);
extern int w_e_reissue(struct drbd_work *, int);
extern int w_restart_disk_io(struct drbd_work *, int);
extern int w_send_out_of_sync(struct drbd_work *, int);
@@ -1488,7 +1479,6 @@ extern int drbd_issue_discard_or_zero_out(struct drbd_device *device,
sector_t start, unsigned int nr_sectors, int flags);
extern int drbd_receiver(struct drbd_thread *thi);
extern int drbd_ack_receiver(struct drbd_thread *thi);
extern void drbd_send_ping_wf(struct work_struct *ws);
extern void drbd_send_acks_wf(struct work_struct *ws);
extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device);
extern bool drbd_rs_should_slow_down(struct drbd_peer_device *peer_device, sector_t sector,
@@ -1504,7 +1494,6 @@ extern void __drbd_free_peer_req(struct drbd_device *, struct drbd_peer_request
#define drbd_free_peer_req(m,e) __drbd_free_peer_req(m, e, 0)
#define drbd_free_net_peer_req(m,e) __drbd_free_peer_req(m, e, 1)
extern struct page *drbd_alloc_pages(struct drbd_peer_device *, unsigned int, bool);
extern void drbd_set_recv_tcq(struct drbd_device *device, int tcq_enabled);
extern void _drbd_clear_done_ee(struct drbd_device *device, struct list_head *to_be_freed);
extern int drbd_connected(struct drbd_peer_device *);
+1 -1
View File
@@ -1550,7 +1550,7 @@ static int _drbd_send_page(struct drbd_peer_device *peer_device, struct page *pa
* put_page(); and would cause either a VM_BUG directly, or
* __page_cache_release a page that would actually still be referenced
* by someone, leading to some obscure delayed Oops somewhere else. */
if (!drbd_disable_sendpage && sendpage_ok(page))
if (!drbd_disable_sendpage && sendpages_ok(page, len, offset))
msg.msg_flags |= MSG_NOSIGNAL | MSG_SPLICE_PAGES;
drbd_update_congested(peer_device->connection);
+1 -1
View File
@@ -876,7 +876,7 @@ is_valid_state(struct drbd_device *device, union drbd_state ns)
ns.disk == D_OUTDATED)
rv = SS_CONNECTED_OUTDATES;
else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
else if (nc && (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
(nc->verify_alg[0] == 0))
rv = SS_NO_VERIFY_ALG;
+1 -18
View File
@@ -2269,25 +2269,12 @@ static const struct file_operations mtip_flags_fops = {
.llseek = no_llseek,
};
static int mtip_hw_debugfs_init(struct driver_data *dd)
static void mtip_hw_debugfs_init(struct driver_data *dd)
{
if (!dfs_parent)
return -1;
dd->dfs_node = debugfs_create_dir(dd->disk->disk_name, dfs_parent);
if (IS_ERR_OR_NULL(dd->dfs_node)) {
dev_warn(&dd->pdev->dev,
"Error creating node %s under debugfs\n",
dd->disk->disk_name);
dd->dfs_node = NULL;
return -1;
}
debugfs_create_file("flags", 0444, dd->dfs_node, dd, &mtip_flags_fops);
debugfs_create_file("registers", 0444, dd->dfs_node, dd,
&mtip_regs_fops);
return 0;
}
static void mtip_hw_debugfs_exit(struct driver_data *dd)
@@ -4043,10 +4030,6 @@ static int __init mtip_init(void)
mtip_major = error;
dfs_parent = debugfs_create_dir("rssd", NULL);
if (IS_ERR_OR_NULL(dfs_parent)) {
pr_warn("Error creating debugfs parent\n");
dfs_parent = NULL;
}
/* Register our PCI operations. */
error = pci_register_driver(&mtip_pci_driver);
+26 -2
View File
@@ -181,6 +181,17 @@ static void nbd_requeue_cmd(struct nbd_cmd *cmd)
{
struct request *req = blk_mq_rq_from_pdu(cmd);
lockdep_assert_held(&cmd->lock);
/*
* Clear INFLIGHT flag so that this cmd won't be completed in
* normal completion path
*
* INFLIGHT flag will be set when the cmd is queued to nbd next
* time.
*/
__clear_bit(NBD_CMD_INFLIGHT, &cmd->flags);
if (!test_and_set_bit(NBD_CMD_REQUEUED, &cmd->flags))
blk_mq_requeue_request(req, true);
}
@@ -339,7 +350,7 @@ static int __nbd_set_size(struct nbd_device *nbd, loff_t bytesize,
lim = queue_limits_start_update(nbd->disk->queue);
if (nbd->config->flags & NBD_FLAG_SEND_TRIM)
lim.max_hw_discard_sectors = UINT_MAX;
lim.max_hw_discard_sectors = UINT_MAX >> SECTOR_SHIFT;
else
lim.max_hw_discard_sectors = 0;
if (!(nbd->config->flags & NBD_FLAG_SEND_FLUSH)) {
@@ -350,6 +361,11 @@ static int __nbd_set_size(struct nbd_device *nbd, loff_t bytesize,
lim.features |= BLK_FEAT_WRITE_CACHE;
lim.features &= ~BLK_FEAT_FUA;
}
if (nbd->config->flags & NBD_FLAG_ROTATIONAL)
lim.features |= BLK_FEAT_ROTATIONAL;
if (nbd->config->flags & NBD_FLAG_SEND_WRITE_ZEROES)
lim.max_write_zeroes_sectors = UINT_MAX >> SECTOR_SHIFT;
lim.logical_block_size = blksize;
lim.physical_block_size = blksize;
error = queue_limits_commit_update(nbd->disk->queue, &lim);
@@ -418,6 +434,8 @@ static u32 req_to_nbd_cmd_type(struct request *req)
return NBD_CMD_WRITE;
case REQ_OP_READ:
return NBD_CMD_READ;
case REQ_OP_WRITE_ZEROES:
return NBD_CMD_WRITE_ZEROES;
default:
return U32_MAX;
}
@@ -488,8 +506,8 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req)
nbd_mark_nsock_dead(nbd, nsock, 1);
mutex_unlock(&nsock->tx_lock);
}
mutex_unlock(&cmd->lock);
nbd_requeue_cmd(cmd);
mutex_unlock(&cmd->lock);
nbd_config_put(nbd);
return BLK_EH_DONE;
}
@@ -634,6 +652,8 @@ static blk_status_t nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd,
if (req->cmd_flags & REQ_FUA)
nbd_cmd_flags |= NBD_CMD_FLAG_FUA;
if ((req->cmd_flags & REQ_NOUNMAP) && (type == NBD_CMD_WRITE_ZEROES))
nbd_cmd_flags |= NBD_CMD_FLAG_NO_HOLE;
/* We did a partial send previously, and we at least sent the whole
* request struct, so just go and send the rest of the pages in the
@@ -1703,6 +1723,10 @@ static int nbd_dbg_flags_show(struct seq_file *s, void *unused)
seq_puts(s, "NBD_FLAG_SEND_FUA\n");
if (flags & NBD_FLAG_SEND_TRIM)
seq_puts(s, "NBD_FLAG_SEND_TRIM\n");
if (flags & NBD_FLAG_SEND_WRITE_ZEROES)
seq_puts(s, "NBD_FLAG_SEND_WRITE_ZEROES\n");
if (flags & NBD_FLAG_ROTATIONAL)
seq_puts(s, "NBD_FLAG_ROTATIONAL\n");
return 0;
}
-2
View File
@@ -498,8 +498,6 @@ static void pkt_debugfs_dev_new(struct pktcdvd_device *pd)
if (!pkt_debugfs_root)
return;
pd->dfs_d_root = debugfs_create_dir(pd->disk->disk_name, pkt_debugfs_root);
if (!pd->dfs_d_root)
return;
pd->dfs_f_info = debugfs_create_file("info", 0444, pd->dfs_d_root,
pd, &pkt_seq_fops);
+9 -2
View File
@@ -149,15 +149,22 @@ static int process_rdma(struct rnbd_srv_session *srv_sess,
rnbd_to_bio_flags(le32_to_cpu(msg->rw)), GFP_KERNEL);
if (bio_add_page(bio, virt_to_page(data), datalen,
offset_in_page(data)) != datalen) {
rnbd_srv_err(sess_dev, "Failed to map data to bio\n");
rnbd_srv_err_rl(sess_dev, "Failed to map data to bio\n");
err = -EINVAL;
goto bio_put;
}
bio->bi_opf = rnbd_to_bio_flags(le32_to_cpu(msg->rw));
if (bio_has_data(bio) &&
bio->bi_iter.bi_size != le32_to_cpu(msg->bi_size)) {
rnbd_srv_err_rl(sess_dev, "Datalen mismatch: bio bi_size (%u), bi_size (%u)\n",
bio->bi_iter.bi_size, msg->bi_size);
err = -EINVAL;
goto bio_put;
}
bio->bi_end_io = rnbd_dev_bi_end_io;
bio->bi_private = priv;
bio->bi_iter.bi_sector = le64_to_cpu(msg->sector);
bio->bi_iter.bi_size = le32_to_cpu(msg->bi_size);
prio = srv_sess->ver < RNBD_PROTO_VER_MAJOR ||
usrlen < sizeof(*msg) ? 0 : le16_to_cpu(msg->prio);
bio_set_prio(bio, prio);
+46 -16
View File
@@ -71,9 +71,6 @@ struct ublk_rq_data {
struct llist_node node;
struct kref ref;
__u64 sector;
__u32 operation;
__u32 nr_zones;
};
struct ublk_uring_cmd_pdu {
@@ -214,6 +211,33 @@ static inline bool ublk_queue_is_zoned(struct ublk_queue *ubq)
#ifdef CONFIG_BLK_DEV_ZONED
struct ublk_zoned_report_desc {
__u64 sector;
__u32 operation;
__u32 nr_zones;
};
static DEFINE_XARRAY(ublk_zoned_report_descs);
static int ublk_zoned_insert_report_desc(const struct request *req,
struct ublk_zoned_report_desc *desc)
{
return xa_insert(&ublk_zoned_report_descs, (unsigned long)req,
desc, GFP_KERNEL);
}
static struct ublk_zoned_report_desc *ublk_zoned_erase_report_desc(
const struct request *req)
{
return xa_erase(&ublk_zoned_report_descs, (unsigned long)req);
}
static struct ublk_zoned_report_desc *ublk_zoned_get_report_desc(
const struct request *req)
{
return xa_load(&ublk_zoned_report_descs, (unsigned long)req);
}
static int ublk_get_nr_zones(const struct ublk_device *ub)
{
const struct ublk_param_basic *p = &ub->params.basic;
@@ -308,7 +332,7 @@ static int ublk_report_zones(struct gendisk *disk, sector_t sector,
unsigned int zones_in_request =
min_t(unsigned int, remaining_zones, max_zones_per_request);
struct request *req;
struct ublk_rq_data *pdu;
struct ublk_zoned_report_desc desc;
blk_status_t status;
memset(buffer, 0, buffer_length);
@@ -319,20 +343,23 @@ static int ublk_report_zones(struct gendisk *disk, sector_t sector,
goto out;
}
pdu = blk_mq_rq_to_pdu(req);
pdu->operation = UBLK_IO_OP_REPORT_ZONES;
pdu->sector = sector;
pdu->nr_zones = zones_in_request;
desc.operation = UBLK_IO_OP_REPORT_ZONES;
desc.sector = sector;
desc.nr_zones = zones_in_request;
ret = ublk_zoned_insert_report_desc(req, &desc);
if (ret)
goto free_req;
ret = blk_rq_map_kern(disk->queue, req, buffer, buffer_length,
GFP_KERNEL);
if (ret) {
blk_mq_free_request(req);
goto out;
}
if (ret)
goto erase_desc;
status = blk_execute_rq(req, 0);
ret = blk_status_to_errno(status);
erase_desc:
ublk_zoned_erase_report_desc(req);
free_req:
blk_mq_free_request(req);
if (ret)
goto out;
@@ -366,7 +393,7 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
{
struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
struct ublk_io *io = &ubq->ios[req->tag];
struct ublk_rq_data *pdu = blk_mq_rq_to_pdu(req);
struct ublk_zoned_report_desc *desc;
u32 ublk_op;
switch (req_op(req)) {
@@ -389,12 +416,15 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
ublk_op = UBLK_IO_OP_ZONE_RESET_ALL;
break;
case REQ_OP_DRV_IN:
ublk_op = pdu->operation;
desc = ublk_zoned_get_report_desc(req);
if (!desc)
return BLK_STS_IOERR;
ublk_op = desc->operation;
switch (ublk_op) {
case UBLK_IO_OP_REPORT_ZONES:
iod->op_flags = ublk_op | ublk_req_build_flags(req);
iod->nr_zones = pdu->nr_zones;
iod->start_sector = pdu->sector;
iod->nr_zones = desc->nr_zones;
iod->start_sector = desc->sector;
return BLK_STS_OK;
default:
return BLK_STS_IOERR;
+10 -6
View File
@@ -59,17 +59,17 @@ static int zram_read_page(struct zram *zram, struct page *page, u32 index,
static int zram_slot_trylock(struct zram *zram, u32 index)
{
return bit_spin_trylock(ZRAM_LOCK, &zram->table[index].flags);
return spin_trylock(&zram->table[index].lock);
}
static void zram_slot_lock(struct zram *zram, u32 index)
{
bit_spin_lock(ZRAM_LOCK, &zram->table[index].flags);
spin_lock(&zram->table[index].lock);
}
static void zram_slot_unlock(struct zram *zram, u32 index)
{
bit_spin_unlock(ZRAM_LOCK, &zram->table[index].flags);
spin_unlock(&zram->table[index].lock);
}
static inline bool init_done(struct zram *zram)
@@ -1211,7 +1211,7 @@ static void zram_meta_free(struct zram *zram, u64 disksize)
static bool zram_meta_alloc(struct zram *zram, u64 disksize)
{
size_t num_pages;
size_t num_pages, index;
num_pages = disksize >> PAGE_SHIFT;
zram->table = vzalloc(array_size(num_pages, sizeof(*zram->table)));
@@ -1226,6 +1226,9 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize)
if (!huge_class_size)
huge_class_size = zs_huge_class_size(zram->mem_pool);
for (index = 0; index < num_pages; index++)
spin_lock_init(&zram->table[index].lock);
return true;
}
@@ -1283,7 +1286,7 @@ out:
zram_set_handle(zram, index, 0);
zram_set_obj_size(zram, index, 0);
WARN_ON_ONCE(zram->table[index].flags &
~(1UL << ZRAM_LOCK | 1UL << ZRAM_UNDER_WB));
~(1UL << ZRAM_UNDER_WB));
}
/*
@@ -2401,9 +2404,10 @@ static void destroy_devices(void)
static int __init zram_init(void)
{
struct zram_table_entry zram_te;
int ret;
BUILD_BUG_ON(__NR_ZRAM_PAGEFLAGS > BITS_PER_LONG);
BUILD_BUG_ON(__NR_ZRAM_PAGEFLAGS > sizeof(zram_te.flags) * 8);
ret = cpuhp_setup_state_multi(CPUHP_ZCOMP_PREPARE, "block/zram:prepare",
zcomp_cpu_up_prepare, zcomp_cpu_dead);
+3 -4
View File
@@ -45,9 +45,7 @@
/* Flags for zram pages (table[page_no].flags) */
enum zram_pageflags {
/* zram slot is locked */
ZRAM_LOCK = ZRAM_FLAG_SHIFT,
ZRAM_SAME, /* Page consists the same element */
ZRAM_SAME = ZRAM_FLAG_SHIFT, /* Page consists the same element */
ZRAM_WB, /* page is stored on backing_device */
ZRAM_UNDER_WB, /* page is under writeback */
ZRAM_HUGE, /* Incompressible page */
@@ -68,7 +66,8 @@ struct zram_table_entry {
unsigned long handle;
unsigned long element;
};
unsigned long flags;
unsigned int flags;
spinlock_t lock;
#ifdef CONFIG_ZRAM_TRACK_ENTRY_ACTIME
ktime_t ac_time;
#endif
+5 -2
View File
@@ -3949,7 +3949,9 @@ static int __load_dirty_region_bitmap(struct raid_set *rs)
/* Try loading the bitmap unless "raid0", which does not have one */
if (!rs_is_raid0(rs) &&
!test_and_set_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags)) {
r = md_bitmap_load(&rs->md);
struct mddev *mddev = &rs->md;
r = mddev->bitmap_ops->load(mddev);
if (r)
DMERR("Failed to load bitmap");
}
@@ -4066,7 +4068,8 @@ static int raid_preresume(struct dm_target *ti)
mddev->bitmap_info.chunksize != to_bytes(rs->requested_bitmap_chunk_sectors)))) {
int chunksize = to_bytes(rs->requested_bitmap_chunk_sectors) ?: mddev->bitmap_info.chunksize;
r = md_bitmap_resize(mddev->bitmap, mddev->dev_sectors, chunksize, 0);
r = mddev->bitmap_ops->resize(mddev, mddev->dev_sectors,
chunksize, false);
if (r)
DMERR("Failed to resize bitmap");
}
+439 -143
View File
File diff suppressed because it is too large Load Diff
+44 -216
View File
@@ -7,81 +7,7 @@
#ifndef BITMAP_H
#define BITMAP_H 1
#define BITMAP_MAJOR_LO 3
/* version 4 insists the bitmap is in little-endian order
* with version 3, it is host-endian which is non-portable
* Version 5 is currently set only for clustered devices
*/
#define BITMAP_MAJOR_HI 4
#define BITMAP_MAJOR_CLUSTERED 5
#define BITMAP_MAJOR_HOSTENDIAN 3
/*
* in-memory bitmap:
*
* Use 16 bit block counters to track pending writes to each "chunk".
* The 2 high order bits are special-purpose, the first is a flag indicating
* whether a resync is needed. The second is a flag indicating whether a
* resync is active.
* This means that the counter is actually 14 bits:
*
* +--------+--------+------------------------------------------------+
* | resync | resync | counter |
* | needed | active | |
* | (0-1) | (0-1) | (0-16383) |
* +--------+--------+------------------------------------------------+
*
* The "resync needed" bit is set when:
* a '1' bit is read from storage at startup.
* a write request fails on some drives
* a resync is aborted on a chunk with 'resync active' set
* It is cleared (and resync-active set) when a resync starts across all drives
* of the chunk.
*
*
* The "resync active" bit is set when:
* a resync is started on all drives, and resync_needed is set.
* resync_needed will be cleared (as long as resync_active wasn't already set).
* It is cleared when a resync completes.
*
* The counter counts pending write requests, plus the on-disk bit.
* When the counter is '1' and the resync bits are clear, the on-disk
* bit can be cleared as well, thus setting the counter to 0.
* When we set a bit, or in the counter (to start a write), if the fields is
* 0, we first set the disk bit and set the counter to 1.
*
* If the counter is 0, the on-disk bit is clear and the stripe is clean
* Anything that dirties the stripe pushes the counter to 2 (at least)
* and sets the on-disk bit (lazily).
* If a periodic sweep find the counter at 2, it is decremented to 1.
* If the sweep find the counter at 1, the on-disk bit is cleared and the
* counter goes to zero.
*
* Also, we'll hijack the "map" pointer itself and use it as two 16 bit block
* counters as a fallback when "page" memory cannot be allocated:
*
* Normal case (page memory allocated):
*
* page pointer (32-bit)
*
* [ ] ------+
* |
* +-------> [ ][ ]..[ ] (4096 byte page == 2048 counters)
* c1 c2 c2048
*
* Hijacked case (page memory allocation failed):
*
* hijacked page pointer (32-bit)
*
* [ ][ ] (no page memory allocated)
* counter #1 (16-bit) counter #2 (16-bit)
*
*/
#ifdef __KERNEL__
#define PAGE_BITS (PAGE_SIZE << 3)
#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3)
#define BITMAP_MAGIC 0x6d746962
typedef __u16 bitmap_counter_t;
#define COUNTER_BITS 16
@@ -91,26 +17,6 @@ typedef __u16 bitmap_counter_t;
#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1)))
#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2)))
#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1)
#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK)
#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK)
#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX)
/* how many counters per page? */
#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS)
/* same, except a shift value for more efficient bitops */
#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT)
/* same, except a mask value for more efficient bitops */
#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1)
#define BITMAP_BLOCK_SHIFT 9
#endif
/*
* bitmap structures:
*/
#define BITMAP_MAGIC 0x6d746962
/* use these for bitmap->flags and bitmap->sb->state bit-fields */
enum bitmap_state {
@@ -152,136 +58,58 @@ typedef struct bitmap_super_s {
* devices. For raid10 it is the size of the array.
*/
#ifdef __KERNEL__
struct md_bitmap_stats {
u64 events_cleared;
int behind_writes;
bool behind_wait;
/* the in-memory bitmap is represented by bitmap_pages */
struct bitmap_page {
/*
* map points to the actual memory page
*/
char *map;
/*
* in emergencies (when map cannot be alloced), hijack the map
* pointer and use it as two counters itself
*/
unsigned int hijacked:1;
/*
* If any counter in this page is '1' or '2' - and so could be
* cleared then that page is marked as 'pending'
*/
unsigned int pending:1;
/*
* count of dirty bits on the page
*/
unsigned int count:30;
unsigned long missing_pages;
unsigned long file_pages;
unsigned long sync_size;
unsigned long pages;
struct file *file;
};
/* the main bitmap structure - one per mddev */
struct bitmap {
struct bitmap_operations {
bool (*enabled)(struct mddev *mddev);
int (*create)(struct mddev *mddev, int slot);
int (*resize)(struct mddev *mddev, sector_t blocks, int chunksize,
bool init);
struct bitmap_counts {
spinlock_t lock;
struct bitmap_page *bp;
unsigned long pages; /* total number of pages
* in the bitmap */
unsigned long missing_pages; /* number of pages
* not yet allocated */
unsigned long chunkshift; /* chunksize = 2^chunkshift
* (for bitops) */
unsigned long chunks; /* Total number of data
* chunks for the array */
} counts;
int (*load)(struct mddev *mddev);
void (*destroy)(struct mddev *mddev);
void (*flush)(struct mddev *mddev);
void (*write_all)(struct mddev *mddev);
void (*dirty_bits)(struct mddev *mddev, unsigned long s,
unsigned long e);
void (*unplug)(struct mddev *mddev, bool sync);
void (*daemon_work)(struct mddev *mddev);
void (*wait_behind_writes)(struct mddev *mddev);
struct mddev *mddev; /* the md device that the bitmap is for */
int (*startwrite)(struct mddev *mddev, sector_t offset,
unsigned long sectors, bool behind);
void (*endwrite)(struct mddev *mddev, sector_t offset,
unsigned long sectors, bool success, bool behind);
bool (*start_sync)(struct mddev *mddev, sector_t offset,
sector_t *blocks, bool degraded);
void (*end_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks);
void (*cond_end_sync)(struct mddev *mddev, sector_t sector, bool force);
void (*close_sync)(struct mddev *mddev);
__u64 events_cleared;
int need_sync;
void (*update_sb)(void *data);
int (*get_stats)(void *data, struct md_bitmap_stats *stats);
struct bitmap_storage {
struct file *file; /* backing disk file */
struct page *sb_page; /* cached copy of the bitmap
* file superblock */
unsigned long sb_index;
struct page **filemap; /* list of cache pages for
* the file */
unsigned long *filemap_attr; /* attributes associated
* w/ filemap pages */
unsigned long file_pages; /* number of pages in the file*/
unsigned long bytes; /* total bytes in the bitmap */
} storage;
unsigned long flags;
int allclean;
atomic_t behind_writes;
unsigned long behind_writes_used; /* highest actual value at runtime */
/*
* the bitmap daemon - periodically wakes up and sweeps the bitmap
* file, cleaning up bits and flushing out pages to disk as necessary
*/
unsigned long daemon_lastrun; /* jiffies of last run */
unsigned long last_end_sync; /* when we lasted called end_sync to
* update bitmap with resync progress */
atomic_t pending_writes; /* pending writes to the bitmap file */
wait_queue_head_t write_wait;
wait_queue_head_t overflow_wait;
wait_queue_head_t behind_wait;
struct kernfs_node *sysfs_can_clear;
int cluster_slot; /* Slot offset for clustered env */
void (*sync_with_cluster)(struct mddev *mddev,
sector_t old_lo, sector_t old_hi,
sector_t new_lo, sector_t new_hi);
void *(*get_from_slot)(struct mddev *mddev, int slot);
int (*copy_from_slot)(struct mddev *mddev, int slot, sector_t *lo,
sector_t *hi, bool clear_bits);
void (*set_pages)(void *data, unsigned long pages);
void (*free)(void *data);
};
/* the bitmap API */
/* these are used only by md/bitmap */
struct bitmap *md_bitmap_create(struct mddev *mddev, int slot);
int md_bitmap_load(struct mddev *mddev);
void md_bitmap_flush(struct mddev *mddev);
void md_bitmap_destroy(struct mddev *mddev);
void md_bitmap_print_sb(struct bitmap *bitmap);
void md_bitmap_update_sb(struct bitmap *bitmap);
void md_bitmap_status(struct seq_file *seq, struct bitmap *bitmap);
int md_bitmap_setallbits(struct bitmap *bitmap);
void md_bitmap_write_all(struct bitmap *bitmap);
void md_bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e);
/* these are exported */
int md_bitmap_startwrite(struct bitmap *bitmap, sector_t offset,
unsigned long sectors, int behind);
void md_bitmap_endwrite(struct bitmap *bitmap, sector_t offset,
unsigned long sectors, int success, int behind);
int md_bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int degraded);
void md_bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted);
void md_bitmap_close_sync(struct bitmap *bitmap);
void md_bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force);
void md_bitmap_sync_with_cluster(struct mddev *mddev,
sector_t old_lo, sector_t old_hi,
sector_t new_lo, sector_t new_hi);
void md_bitmap_unplug(struct bitmap *bitmap);
void md_bitmap_unplug_async(struct bitmap *bitmap);
void md_bitmap_daemon_work(struct mddev *mddev);
int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks,
int chunksize, int init);
struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot);
int md_bitmap_copy_from_slot(struct mddev *mddev, int slot,
sector_t *lo, sector_t *hi, bool clear_bits);
void md_bitmap_free(struct bitmap *bitmap);
void md_bitmap_wait_behind_writes(struct mddev *mddev);
static inline bool md_bitmap_enabled(struct bitmap *bitmap)
{
return bitmap && bitmap->storage.filemap &&
!test_bit(BITMAP_STALE, &bitmap->flags);
}
#endif
void mddev_set_bitmap_ops(struct mddev *mddev);
#endif
+53 -38
View File
@@ -317,7 +317,7 @@ static void recover_bitmaps(struct md_thread *thread)
str, ret);
goto clear_bit;
}
ret = md_bitmap_copy_from_slot(mddev, slot, &lo, &hi, true);
ret = mddev->bitmap_ops->copy_from_slot(mddev, slot, &lo, &hi, true);
if (ret) {
pr_err("md-cluster: Could not copy data from bitmap %d\n", slot);
goto clear_bit;
@@ -497,8 +497,8 @@ static void process_suspend_info(struct mddev *mddev,
* we don't want to trigger lots of WARN.
*/
if (sb && !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE))
md_bitmap_sync_with_cluster(mddev, cinfo->sync_low,
cinfo->sync_hi, lo, hi);
mddev->bitmap_ops->sync_with_cluster(mddev, cinfo->sync_low,
cinfo->sync_hi, lo, hi);
cinfo->sync_low = lo;
cinfo->sync_hi = hi;
@@ -628,8 +628,9 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
break;
case BITMAP_RESIZE:
if (le64_to_cpu(msg->high) != mddev->pers->size(mddev, 0, 0))
ret = md_bitmap_resize(mddev->bitmap,
le64_to_cpu(msg->high), 0, 0);
ret = mddev->bitmap_ops->resize(mddev,
le64_to_cpu(msg->high),
0, false);
break;
default:
ret = -1;
@@ -856,7 +857,7 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots)
}
/* Read the disk bitmap sb and check if it needs recovery */
ret = md_bitmap_copy_from_slot(mddev, i, &lo, &hi, false);
ret = mddev->bitmap_ops->copy_from_slot(mddev, i, &lo, &hi, false);
if (ret) {
pr_warn("md-cluster: Could not gather bitmaps from slot %d", i);
lockres_free(bm_lockres);
@@ -1143,13 +1144,16 @@ static int update_bitmap_size(struct mddev *mddev, sector_t size)
static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsize)
{
struct bitmap_counts *counts;
char str[64];
struct dlm_lock_resource *bm_lockres;
struct bitmap *bitmap = mddev->bitmap;
unsigned long my_pages = bitmap->counts.pages;
void *bitmap = mddev->bitmap;
struct md_bitmap_stats stats;
unsigned long my_pages;
int i, rv;
rv = mddev->bitmap_ops->get_stats(bitmap, &stats);
if (rv)
return rv;
my_pages = stats.pages;
/*
* We need to ensure all the nodes can grow to a larger
* bitmap size before make the reshaping.
@@ -1159,17 +1163,22 @@ static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsiz
return rv;
for (i = 0; i < mddev->bitmap_info.nodes; i++) {
struct dlm_lock_resource *bm_lockres;
char str[64];
if (i == md_cluster_ops->slot_number(mddev))
continue;
bitmap = get_bitmap_from_slot(mddev, i);
bitmap = mddev->bitmap_ops->get_from_slot(mddev, i);
if (IS_ERR(bitmap)) {
pr_err("can't get bitmap from slot %d\n", i);
bitmap = NULL;
goto out;
}
counts = &bitmap->counts;
rv = mddev->bitmap_ops->get_stats(bitmap, &stats);
if (rv)
goto out;
/*
* If we can hold the bitmap lock of one node then
* the slot is not occupied, update the pages.
@@ -1183,21 +1192,21 @@ static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsiz
bm_lockres->flags |= DLM_LKF_NOQUEUE;
rv = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
if (!rv)
counts->pages = my_pages;
mddev->bitmap_ops->set_pages(bitmap, my_pages);
lockres_free(bm_lockres);
if (my_pages != counts->pages)
if (my_pages != stats.pages)
/*
* Let's revert the bitmap size if one node
* can't resize bitmap
*/
goto out;
md_bitmap_free(bitmap);
mddev->bitmap_ops->free(bitmap);
}
return 0;
out:
md_bitmap_free(bitmap);
mddev->bitmap_ops->free(bitmap);
update_bitmap_size(mddev, oldsize);
return -1;
}
@@ -1207,24 +1216,27 @@ out:
*/
static int cluster_check_sync_size(struct mddev *mddev)
{
int i, rv;
bitmap_super_t *sb;
unsigned long my_sync_size, sync_size = 0;
int node_num = mddev->bitmap_info.nodes;
int current_slot = md_cluster_ops->slot_number(mddev);
struct bitmap *bitmap = mddev->bitmap;
char str[64];
int node_num = mddev->bitmap_info.nodes;
struct dlm_lock_resource *bm_lockres;
struct md_bitmap_stats stats;
void *bitmap = mddev->bitmap;
unsigned long sync_size = 0;
unsigned long my_sync_size;
char str[64];
int i, rv;
sb = kmap_atomic(bitmap->storage.sb_page);
my_sync_size = sb->sync_size;
kunmap_atomic(sb);
rv = mddev->bitmap_ops->get_stats(bitmap, &stats);
if (rv)
return rv;
my_sync_size = stats.sync_size;
for (i = 0; i < node_num; i++) {
if (i == current_slot)
continue;
bitmap = get_bitmap_from_slot(mddev, i);
bitmap = mddev->bitmap_ops->get_from_slot(mddev, i);
if (IS_ERR(bitmap)) {
pr_err("can't get bitmap from slot %d\n", i);
return -1;
@@ -1238,25 +1250,28 @@ static int cluster_check_sync_size(struct mddev *mddev)
bm_lockres = lockres_init(mddev, str, NULL, 1);
if (!bm_lockres) {
pr_err("md-cluster: Cannot initialize %s\n", str);
md_bitmap_free(bitmap);
mddev->bitmap_ops->free(bitmap);
return -1;
}
bm_lockres->flags |= DLM_LKF_NOQUEUE;
rv = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
if (!rv)
md_bitmap_update_sb(bitmap);
mddev->bitmap_ops->update_sb(bitmap);
lockres_free(bm_lockres);
sb = kmap_atomic(bitmap->storage.sb_page);
if (sync_size == 0)
sync_size = sb->sync_size;
else if (sync_size != sb->sync_size) {
kunmap_atomic(sb);
md_bitmap_free(bitmap);
rv = mddev->bitmap_ops->get_stats(bitmap, &stats);
if (rv) {
mddev->bitmap_ops->free(bitmap);
return rv;
}
if (sync_size == 0) {
sync_size = stats.sync_size;
} else if (sync_size != stats.sync_size) {
mddev->bitmap_ops->free(bitmap);
return -1;
}
kunmap_atomic(sb);
md_bitmap_free(bitmap);
mddev->bitmap_ops->free(bitmap);
}
return (my_sync_size == sync_size) ? 0 : -1;
@@ -1585,7 +1600,7 @@ static int gather_bitmaps(struct md_rdev *rdev)
for (sn = 0; sn < mddev->bitmap_info.nodes; sn++) {
if (sn == (cinfo->slot_number - 1))
continue;
err = md_bitmap_copy_from_slot(mddev, sn, &lo, &hi, false);
err = mddev->bitmap_ops->copy_from_slot(mddev, sn, &lo, &hi, false);
if (err) {
pr_warn("md-cluster: Could not gather bitmaps from slot %d", sn);
goto out;
+155 -185
View File
@@ -546,137 +546,30 @@ static int mddev_set_closing_and_sync_blockdev(struct mddev *mddev, int opener_n
return 0;
}
/*
* Generic flush handling for md
*/
static void md_end_flush(struct bio *bio)
{
struct md_rdev *rdev = bio->bi_private;
struct mddev *mddev = rdev->mddev;
bio_put(bio);
rdev_dec_pending(rdev, mddev);
if (atomic_dec_and_test(&mddev->flush_pending))
/* The pre-request flush has finished */
queue_work(md_wq, &mddev->flush_work);
}
static void md_submit_flush_data(struct work_struct *ws);
static void submit_flushes(struct work_struct *ws)
{
struct mddev *mddev = container_of(ws, struct mddev, flush_work);
struct md_rdev *rdev;
mddev->start_flush = ktime_get_boottime();
INIT_WORK(&mddev->flush_work, md_submit_flush_data);
atomic_set(&mddev->flush_pending, 1);
rcu_read_lock();
rdev_for_each_rcu(rdev, mddev)
if (rdev->raid_disk >= 0 &&
!test_bit(Faulty, &rdev->flags)) {
struct bio *bi;
atomic_inc(&rdev->nr_pending);
rcu_read_unlock();
bi = bio_alloc_bioset(rdev->bdev, 0,
REQ_OP_WRITE | REQ_PREFLUSH,
GFP_NOIO, &mddev->bio_set);
bi->bi_end_io = md_end_flush;
bi->bi_private = rdev;
atomic_inc(&mddev->flush_pending);
submit_bio(bi);
rcu_read_lock();
}
rcu_read_unlock();
if (atomic_dec_and_test(&mddev->flush_pending))
queue_work(md_wq, &mddev->flush_work);
}
static void md_submit_flush_data(struct work_struct *ws)
{
struct mddev *mddev = container_of(ws, struct mddev, flush_work);
struct bio *bio = mddev->flush_bio;
/*
* must reset flush_bio before calling into md_handle_request to avoid a
* deadlock, because other bios passed md_handle_request suspend check
* could wait for this and below md_handle_request could wait for those
* bios because of suspend check
*/
spin_lock_irq(&mddev->lock);
mddev->prev_flush_start = mddev->start_flush;
mddev->flush_bio = NULL;
spin_unlock_irq(&mddev->lock);
wake_up(&mddev->sb_wait);
if (bio->bi_iter.bi_size == 0) {
/* an empty barrier - all done */
bio_endio(bio);
} else {
bio->bi_opf &= ~REQ_PREFLUSH;
/*
* make_requst() will never return error here, it only
* returns error in raid5_make_request() by dm-raid.
* Since dm always splits data and flush operation into
* two separate io, io size of flush submitted by dm
* always is 0, make_request() will not be called here.
*/
if (WARN_ON_ONCE(!mddev->pers->make_request(mddev, bio)))
bio_io_error(bio);
}
/* The pair is percpu_ref_get() from md_flush_request() */
percpu_ref_put(&mddev->active_io);
}
/*
* Manages consolidation of flushes and submitting any flushes needed for
* a bio with REQ_PREFLUSH. Returns true if the bio is finished or is
* being finished in another context. Returns false if the flushing is
* complete but still needs the I/O portion of the bio to be processed.
*/
bool md_flush_request(struct mddev *mddev, struct bio *bio)
{
ktime_t req_start = ktime_get_boottime();
spin_lock_irq(&mddev->lock);
/* flush requests wait until ongoing flush completes,
* hence coalescing all the pending requests.
struct md_rdev *rdev;
struct bio *new;
/*
* md_flush_reqeust() should be called under md_handle_request() and
* 'active_io' is already grabbed. Hence it's safe to get rdev directly
* without rcu protection.
*/
wait_event_lock_irq(mddev->sb_wait,
!mddev->flush_bio ||
ktime_before(req_start, mddev->prev_flush_start),
mddev->lock);
/* new request after previous flush is completed */
if (ktime_after(req_start, mddev->prev_flush_start)) {
WARN_ON(mddev->flush_bio);
/*
* Grab a reference to make sure mddev_suspend() will wait for
* this flush to be done.
*
* md_flush_reqeust() is called under md_handle_request() and
* 'active_io' is already grabbed, hence percpu_ref_is_zero()
* won't pass, percpu_ref_tryget_live() can't be used because
* percpu_ref_kill() can be called by mddev_suspend()
* concurrently.
*/
WARN_ON(percpu_ref_is_zero(&mddev->active_io));
percpu_ref_get(&mddev->active_io);
mddev->flush_bio = bio;
spin_unlock_irq(&mddev->lock);
INIT_WORK(&mddev->flush_work, submit_flushes);
queue_work(md_wq, &mddev->flush_work);
return true;
WARN_ON(percpu_ref_is_zero(&mddev->active_io));
rdev_for_each(rdev, mddev) {
if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
continue;
new = bio_alloc_bioset(rdev->bdev, 0,
REQ_OP_WRITE | REQ_PREFLUSH, GFP_NOIO,
&mddev->bio_set);
bio_chain(new, bio);
submit_bio(new);
}
/* flush was performed for some other bio while we waited. */
spin_unlock_irq(&mddev->lock);
if (bio->bi_iter.bi_size == 0) {
/* pure flush without data - all done */
if (bio_sectors(bio) == 0) {
bio_endio(bio);
return true;
}
@@ -763,7 +656,6 @@ int mddev_init(struct mddev *mddev)
atomic_set(&mddev->openers, 0);
atomic_set(&mddev->sync_seq, 0);
spin_lock_init(&mddev->lock);
atomic_set(&mddev->flush_pending, 0);
init_waitqueue_head(&mddev->sb_wait);
init_waitqueue_head(&mddev->recovery_wait);
mddev->reshape_position = MaxSector;
@@ -772,6 +664,7 @@ int mddev_init(struct mddev *mddev)
mddev->resync_min = 0;
mddev->resync_max = MaxSector;
mddev->level = LEVEL_NONE;
mddev_set_bitmap_ops(mddev);
INIT_WORK(&mddev->sync_work, md_start_sync);
INIT_WORK(&mddev->del_work, mddev_delayed_delete);
@@ -1372,6 +1265,18 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor
return ret;
}
static u64 md_bitmap_events_cleared(struct mddev *mddev)
{
struct md_bitmap_stats stats;
int err;
err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
if (err)
return 0;
return stats.events_cleared;
}
/*
* validate_super for 0.90.0
* note: we are not using "freshest" for 0.9 superblock
@@ -1464,7 +1369,7 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, stru
/* if adding to array with a bitmap, then we can accept an
* older device ... but not too old.
*/
if (ev1 < mddev->bitmap->events_cleared)
if (ev1 < md_bitmap_events_cleared(mddev))
return 0;
if (ev1 < mddev->events)
set_bit(Bitmap_sync, &rdev->flags);
@@ -1991,7 +1896,7 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struc
/* If adding to array with a bitmap, then we can accept an
* older device, but not too old.
*/
if (ev1 < mddev->bitmap->events_cleared)
if (ev1 < md_bitmap_events_cleared(mddev))
return 0;
if (ev1 < mddev->events)
set_bit(Bitmap_sync, &rdev->flags);
@@ -2323,7 +2228,6 @@ super_1_allow_new_offset(struct md_rdev *rdev,
unsigned long long new_offset)
{
/* All necessary checks on new >= old have been done */
struct bitmap *bitmap;
if (new_offset >= rdev->data_offset)
return 1;
@@ -2340,11 +2244,18 @@ super_1_allow_new_offset(struct md_rdev *rdev,
*/
if (rdev->sb_start + (32+4)*2 > new_offset)
return 0;
bitmap = rdev->mddev->bitmap;
if (bitmap && !rdev->mddev->bitmap_info.file &&
rdev->sb_start + rdev->mddev->bitmap_info.offset +
bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset)
return 0;
if (!rdev->mddev->bitmap_info.file) {
struct mddev *mddev = rdev->mddev;
struct md_bitmap_stats stats;
int err;
err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
if (!err && rdev->sb_start + mddev->bitmap_info.offset +
stats.file_pages * (PAGE_SIZE >> 9) > new_offset)
return 0;
}
if (rdev->badblocks.sector + rdev->badblocks.size > new_offset)
return 0;
@@ -2820,7 +2731,7 @@ repeat:
mddev_add_trace_msg(mddev, "md md_update_sb");
rewrite:
md_bitmap_update_sb(mddev->bitmap);
mddev->bitmap_ops->update_sb(mddev->bitmap);
rdev_for_each(rdev, mddev) {
if (rdev->sb_loaded != 1)
continue; /* no noise on spare devices */
@@ -4141,6 +4052,34 @@ out_unlock:
static struct md_sysfs_entry md_level =
__ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
static ssize_t
new_level_show(struct mddev *mddev, char *page)
{
return sprintf(page, "%d\n", mddev->new_level);
}
static ssize_t
new_level_store(struct mddev *mddev, const char *buf, size_t len)
{
unsigned int n;
int err;
err = kstrtouint(buf, 10, &n);
if (err < 0)
return err;
err = mddev_lock(mddev);
if (err)
return err;
mddev->new_level = n;
md_update_sb(mddev, 1);
mddev_unlock(mddev);
return len;
}
static struct md_sysfs_entry md_new_level =
__ATTR(new_level, 0664, new_level_show, new_level_store);
static ssize_t
layout_show(struct mddev *mddev, char *page)
{
@@ -4680,17 +4619,23 @@ bitmap_store(struct mddev *mddev, const char *buf, size_t len)
/* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
while (*buf) {
chunk = end_chunk = simple_strtoul(buf, &end, 0);
if (buf == end) break;
if (buf == end)
break;
if (*end == '-') { /* range */
buf = end + 1;
end_chunk = simple_strtoul(buf, &end, 0);
if (buf == end) break;
if (buf == end)
break;
}
if (*end && !isspace(*end)) break;
md_bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
if (*end && !isspace(*end))
break;
mddev->bitmap_ops->dirty_bits(mddev, chunk, end_chunk);
buf = skip_spaces(end);
}
md_bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
mddev->bitmap_ops->unplug(mddev, true); /* flush the bits to disk */
out:
mddev_unlock(mddev);
return len;
@@ -5666,6 +5611,7 @@ __ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show,
static struct attribute *md_default_attrs[] = {
&md_level.attr,
&md_new_level.attr,
&md_layout.attr,
&md_raid_disks.attr,
&md_uuid.attr,
@@ -6206,16 +6152,10 @@ int md_run(struct mddev *mddev)
}
if (err == 0 && pers->sync_request &&
(mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
struct bitmap *bitmap;
bitmap = md_bitmap_create(mddev, -1);
if (IS_ERR(bitmap)) {
err = PTR_ERR(bitmap);
err = mddev->bitmap_ops->create(mddev, -1);
if (err)
pr_warn("%s: failed to create bitmap (%d)\n",
mdname(mddev), err);
} else
mddev->bitmap = bitmap;
}
if (err)
goto bitmap_abort;
@@ -6285,7 +6225,7 @@ bitmap_abort:
pers->free(mddev, mddev->private);
mddev->private = NULL;
module_put(pers->owner);
md_bitmap_destroy(mddev);
mddev->bitmap_ops->destroy(mddev);
abort:
bioset_exit(&mddev->io_clone_set);
exit_sync_set:
@@ -6304,9 +6244,10 @@ int do_md_run(struct mddev *mddev)
err = md_run(mddev);
if (err)
goto out;
err = md_bitmap_load(mddev);
err = mddev->bitmap_ops->load(mddev);
if (err) {
md_bitmap_destroy(mddev);
mddev->bitmap_ops->destroy(mddev);
goto out;
}
@@ -6450,7 +6391,8 @@ static void __md_stop_writes(struct mddev *mddev)
mddev->pers->quiesce(mddev, 1);
mddev->pers->quiesce(mddev, 0);
}
md_bitmap_flush(mddev);
mddev->bitmap_ops->flush(mddev);
if (md_is_rdwr(mddev) &&
((!mddev->in_sync && !mddev_is_clustered(mddev)) ||
@@ -6477,7 +6419,7 @@ EXPORT_SYMBOL_GPL(md_stop_writes);
static void mddev_detach(struct mddev *mddev)
{
md_bitmap_wait_behind_writes(mddev);
mddev->bitmap_ops->wait_behind_writes(mddev);
if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) {
mddev->pers->quiesce(mddev, 1);
mddev->pers->quiesce(mddev, 0);
@@ -6492,7 +6434,8 @@ static void mddev_detach(struct mddev *mddev)
static void __md_stop(struct mddev *mddev)
{
struct md_personality *pers = mddev->pers;
md_bitmap_destroy(mddev);
mddev->bitmap_ops->destroy(mddev);
mddev_detach(mddev);
spin_lock(&mddev->lock);
mddev->pers = NULL;
@@ -7270,22 +7213,19 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
err = 0;
if (mddev->pers) {
if (fd >= 0) {
struct bitmap *bitmap;
err = mddev->bitmap_ops->create(mddev, -1);
if (!err)
err = mddev->bitmap_ops->load(mddev);
bitmap = md_bitmap_create(mddev, -1);
if (!IS_ERR(bitmap)) {
mddev->bitmap = bitmap;
err = md_bitmap_load(mddev);
} else
err = PTR_ERR(bitmap);
if (err) {
md_bitmap_destroy(mddev);
mddev->bitmap_ops->destroy(mddev);
fd = -1;
}
} else if (fd < 0) {
md_bitmap_destroy(mddev);
mddev->bitmap_ops->destroy(mddev);
}
}
if (fd < 0) {
struct file *f = mddev->bitmap_info.file;
if (f) {
@@ -7554,7 +7494,6 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
goto err;
}
if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
struct bitmap *bitmap;
/* add the bitmap */
if (mddev->bitmap) {
rv = -EEXIST;
@@ -7568,24 +7507,24 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
mddev->bitmap_info.default_offset;
mddev->bitmap_info.space =
mddev->bitmap_info.default_space;
bitmap = md_bitmap_create(mddev, -1);
if (!IS_ERR(bitmap)) {
mddev->bitmap = bitmap;
rv = md_bitmap_load(mddev);
} else
rv = PTR_ERR(bitmap);
rv = mddev->bitmap_ops->create(mddev, -1);
if (!rv)
rv = mddev->bitmap_ops->load(mddev);
if (rv)
md_bitmap_destroy(mddev);
mddev->bitmap_ops->destroy(mddev);
} else {
/* remove the bitmap */
if (!mddev->bitmap) {
rv = -ENOENT;
struct md_bitmap_stats stats;
rv = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
if (rv)
goto err;
}
if (mddev->bitmap->storage.file) {
if (stats.file) {
rv = -EINVAL;
goto err;
}
if (mddev->bitmap_info.nodes) {
/* hold PW on all the bitmap lock */
if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) {
@@ -7600,7 +7539,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
module_put(md_cluster_mod);
mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY;
}
md_bitmap_destroy(mddev);
mddev->bitmap_ops->destroy(mddev);
mddev->bitmap_info.offset = 0;
}
}
@@ -8370,6 +8309,33 @@ static void md_seq_stop(struct seq_file *seq, void *v)
spin_unlock(&all_mddevs_lock);
}
static void md_bitmap_status(struct seq_file *seq, struct mddev *mddev)
{
struct md_bitmap_stats stats;
unsigned long used_pages;
unsigned long chunk_kb;
int err;
err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
if (err)
return;
chunk_kb = mddev->bitmap_info.chunksize >> 10;
used_pages = stats.pages - stats.missing_pages;
seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], %lu%s chunk",
used_pages, stats.pages, used_pages << (PAGE_SHIFT - 10),
chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize,
chunk_kb ? "KB" : "B");
if (stats.file) {
seq_puts(seq, ", file: ");
seq_file_path(seq, stats.file, " \t\n");
}
seq_putc(seq, '\n');
}
static int md_seq_show(struct seq_file *seq, void *v)
{
struct mddev *mddev;
@@ -8390,14 +8356,19 @@ static int md_seq_show(struct seq_file *seq, void *v)
spin_unlock(&all_mddevs_lock);
spin_lock(&mddev->lock);
if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
seq_printf(seq, "%s : %sactive", mdname(mddev),
mddev->pers ? "" : "in");
seq_printf(seq, "%s : ", mdname(mddev));
if (mddev->pers) {
if (test_bit(MD_BROKEN, &mddev->flags))
seq_printf(seq, "broken");
else
seq_printf(seq, "active");
if (mddev->ro == MD_RDONLY)
seq_printf(seq, " (read-only)");
if (mddev->ro == MD_AUTO_READ)
seq_printf(seq, " (auto-read-only)");
seq_printf(seq, " %s", mddev->pers->name);
} else {
seq_printf(seq, "inactive");
}
sectors = 0;
@@ -8453,7 +8424,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
} else
seq_printf(seq, "\n ");
md_bitmap_status(seq, mddev->bitmap);
md_bitmap_status(seq, mddev);
seq_printf(seq, "\n");
}
@@ -8668,7 +8639,6 @@ void md_write_start(struct mddev *mddev, struct bio *bi)
BUG_ON(mddev->ro == MD_RDONLY);
if (mddev->ro == MD_AUTO_READ) {
/* need to switch to read/write */
flush_work(&mddev->sync_work);
mddev->ro = MD_RDWR;
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
@@ -9506,7 +9476,7 @@ static void md_start_sync(struct work_struct *ws)
* stored on all devices. So make sure all bitmap pages get written.
*/
if (spares)
md_bitmap_write_all(mddev->bitmap);
mddev->bitmap_ops->write_all(mddev);
name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ?
"reshape" : "resync";
@@ -9594,7 +9564,7 @@ static void unregister_sync_thread(struct mddev *mddev)
void md_check_recovery(struct mddev *mddev)
{
if (mddev->bitmap)
md_bitmap_daemon_work(mddev);
mddev->bitmap_ops->daemon_work(mddev);
if (signal_pending(current)) {
if (mddev->pers->sync_request && !mddev->external) {
@@ -9965,7 +9935,7 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
if (ret)
pr_info("md-cluster: resize failed\n");
else
md_bitmap_update_sb(mddev->bitmap);
mddev->bitmap_ops->update_sb(mddev->bitmap);
}
/* Check for change of roles in the active devices */
+2 -11
View File
@@ -535,7 +535,8 @@ struct mddev {
struct percpu_ref writes_pending;
int sync_checkers; /* # of threads checking writes_pending */
struct bitmap *bitmap; /* the bitmap for the device */
void *bitmap; /* the bitmap for the device */
struct bitmap_operations *bitmap_ops;
struct {
struct file *file; /* the bitmap file */
loff_t offset; /* offset from superblock of
@@ -571,16 +572,6 @@ struct mddev {
*/
struct bio_set io_clone_set;
/* Generic flush handling.
* The last to finish preflush schedules a worker to submit
* the rest of the request (without the REQ_PREFLUSH flag).
*/
struct bio *flush_bio;
atomic_t flush_pending;
ktime_t start_flush, prev_flush_start; /* prev_flush_start is when the previous completed
* flush was started.
*/
struct work_struct flush_work;
struct work_struct event_work; /* used by dm to report failure event */
mempool_t *serial_info_pool;
void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
+3 -6
View File
@@ -140,7 +140,7 @@ static inline bool raid1_add_bio_to_plug(struct mddev *mddev, struct bio *bio,
* If bitmap is not enabled, it's safe to submit the io directly, and
* this can get optimal performance.
*/
if (!md_bitmap_enabled(mddev->bitmap)) {
if (!mddev->bitmap_ops->enabled(mddev)) {
raid1_submit_write(bio);
return true;
}
@@ -166,12 +166,9 @@ static inline bool raid1_add_bio_to_plug(struct mddev *mddev, struct bio *bio,
* while current io submission must wait for bitmap io to be done. In order to
* avoid such deadlock, submit bitmap io asynchronously.
*/
static inline void raid1_prepare_flush_writes(struct bitmap *bitmap)
static inline void raid1_prepare_flush_writes(struct mddev *mddev)
{
if (current->bio_list)
md_bitmap_unplug_async(bitmap);
else
md_bitmap_unplug(bitmap);
mddev->bitmap_ops->unplug(mddev, current->bio_list == NULL);
}
/*
+45 -54
View File
@@ -411,18 +411,20 @@ static void raid1_end_read_request(struct bio *bio)
static void close_write(struct r1bio *r1_bio)
{
struct mddev *mddev = r1_bio->mddev;
/* it really is the end of this request */
if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
bio_free_pages(r1_bio->behind_master_bio);
bio_put(r1_bio->behind_master_bio);
r1_bio->behind_master_bio = NULL;
}
/* clear the bitmap if all writes complete successfully */
md_bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
r1_bio->sectors,
!test_bit(R1BIO_Degraded, &r1_bio->state),
test_bit(R1BIO_BehindIO, &r1_bio->state));
md_write_end(r1_bio->mddev);
mddev->bitmap_ops->endwrite(mddev, r1_bio->sector, r1_bio->sectors,
!test_bit(R1BIO_Degraded, &r1_bio->state),
test_bit(R1BIO_BehindIO, &r1_bio->state));
md_write_end(mddev);
}
static void r1_bio_write_done(struct r1bio *r1_bio)
@@ -900,7 +902,7 @@ static void wake_up_barrier(struct r1conf *conf)
static void flush_bio_list(struct r1conf *conf, struct bio *bio)
{
/* flush any pending bitmap writes to disk before proceeding w/ I/O */
raid1_prepare_flush_writes(conf->mddev->bitmap);
raid1_prepare_flush_writes(conf->mddev);
wake_up_barrier(conf);
while (bio) { /* submit pending writes */
@@ -1317,13 +1319,11 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
struct r1conf *conf = mddev->private;
struct raid1_info *mirror;
struct bio *read_bio;
struct bitmap *bitmap = mddev->bitmap;
const enum req_op op = bio_op(bio);
const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC;
int max_sectors;
int rdisk;
bool r1bio_existed = !!r1_bio;
char b[BDEVNAME_SIZE];
/*
* If r1_bio is set, we are blocking the raid1d thread
@@ -1332,16 +1332,6 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
*/
gfp_t gfp = r1_bio ? (GFP_NOIO | __GFP_HIGH) : GFP_NOIO;
if (r1bio_existed) {
/* Need to get the block device name carefully */
struct md_rdev *rdev = conf->mirrors[r1_bio->read_disk].rdev;
if (rdev)
snprintf(b, sizeof(b), "%pg", rdev->bdev);
else
strcpy(b, "???");
}
/*
* Still need barrier for READ in case that whole
* array is frozen.
@@ -1363,15 +1353,13 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
* used and no empty request is available.
*/
rdisk = read_balance(conf, r1_bio, &max_sectors);
if (rdisk < 0) {
/* couldn't find anywhere to read from */
if (r1bio_existed) {
pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n",
if (r1bio_existed)
pr_crit_ratelimited("md/raid1:%s: %pg: unrecoverable I/O read error for block %llu\n",
mdname(mddev),
b,
(unsigned long long)r1_bio->sector);
}
conf->mirrors[r1_bio->read_disk].rdev->bdev,
r1_bio->sector);
raid_end_bio_io(r1_bio);
return;
}
@@ -1383,15 +1371,13 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
(unsigned long long)r1_bio->sector,
mirror->rdev->bdev);
if (test_bit(WriteMostly, &mirror->rdev->flags) &&
bitmap) {
if (test_bit(WriteMostly, &mirror->rdev->flags)) {
/*
* Reading from a write-mostly device must take care not to
* over-take any writes that are 'behind'
*/
mddev_add_trace_msg(mddev, "raid1 wait behind writes");
wait_event(bitmap->behind_wait,
atomic_read(&bitmap->behind_writes) == 0);
mddev->bitmap_ops->wait_behind_writes(mddev);
}
if (max_sectors < bio_sectors(bio)) {
@@ -1432,7 +1418,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
struct r1conf *conf = mddev->private;
struct r1bio *r1_bio;
int i, disks;
struct bitmap *bitmap = mddev->bitmap;
unsigned long flags;
struct md_rdev *blocked_rdev;
int first_clone;
@@ -1585,7 +1570,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
* at a time and thus needs a new bio that can fit the whole payload
* this bio in page sized chunks.
*/
if (write_behind && bitmap)
if (write_behind && mddev->bitmap)
max_sectors = min_t(int, max_sectors,
BIO_MAX_VECS * (PAGE_SIZE >> 9));
if (max_sectors < bio_sectors(bio)) {
@@ -1612,19 +1597,23 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
continue;
if (first_clone) {
unsigned long max_write_behind =
mddev->bitmap_info.max_write_behind;
struct md_bitmap_stats stats;
int err;
/* do behind I/O ?
* Not if there are too many, or cannot
* allocate memory, or a reader on WriteMostly
* is waiting for behind writes to flush */
if (bitmap && write_behind &&
(atomic_read(&bitmap->behind_writes)
< mddev->bitmap_info.max_write_behind) &&
!waitqueue_active(&bitmap->behind_wait)) {
err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
if (!err && write_behind && !stats.behind_wait &&
stats.behind_writes < max_write_behind)
alloc_behind_master_bio(r1_bio, bio);
}
md_bitmap_startwrite(bitmap, r1_bio->sector, r1_bio->sectors,
test_bit(R1BIO_BehindIO, &r1_bio->state));
mddev->bitmap_ops->startwrite(
mddev, r1_bio->sector, r1_bio->sectors,
test_bit(R1BIO_BehindIO, &r1_bio->state));
first_clone = 0;
}
@@ -2042,7 +2031,7 @@ static void abort_sync_write(struct mddev *mddev, struct r1bio *r1_bio)
/* make sure these bits don't get cleared. */
do {
md_bitmap_end_sync(mddev->bitmap, s, &sync_blocks, 1);
mddev->bitmap_ops->end_sync(mddev, s, &sync_blocks);
s += sync_blocks;
sectors_to_go -= sync_blocks;
} while (sectors_to_go > 0);
@@ -2771,7 +2760,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
int wonly = -1;
int write_targets = 0, read_targets = 0;
sector_t sync_blocks;
int still_degraded = 0;
bool still_degraded = false;
int good_sectors = RESYNC_SECTORS;
int min_bad = 0; /* number of sectors that are bad in all devices */
int idx = sector_to_idx(sector_nr);
@@ -2788,12 +2777,12 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
* We can find the current addess in mddev->curr_resync
*/
if (mddev->curr_resync < max_sector) /* aborted */
md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
&sync_blocks, 1);
mddev->bitmap_ops->end_sync(mddev, mddev->curr_resync,
&sync_blocks);
else /* completed sync */
conf->fullsync = 0;
md_bitmap_close_sync(mddev->bitmap);
mddev->bitmap_ops->close_sync(mddev);
close_sync(conf);
if (mddev_is_clustered(mddev)) {
@@ -2813,7 +2802,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
/* before building a request, check if we can skip these blocks..
* This call the bitmap_start_sync doesn't actually record anything
*/
if (!md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
if (!mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks, true) &&
!conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
/* We can skip this block, and probably several more */
*skipped = 1;
@@ -2831,9 +2820,9 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
* sector_nr + two times RESYNC_SECTORS
*/
md_bitmap_cond_end_sync(mddev->bitmap, sector_nr,
mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));
mddev->bitmap_ops->cond_end_sync(mddev, sector_nr,
mddev_is_clustered(mddev) &&
(sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));
if (raise_barrier(conf, sector_nr))
return 0;
@@ -2864,7 +2853,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
if (rdev == NULL ||
test_bit(Faulty, &rdev->flags)) {
if (i < conf->raid_disks)
still_degraded = 1;
still_degraded = true;
} else if (!test_bit(In_sync, &rdev->flags)) {
bio->bi_opf = REQ_OP_WRITE;
bio->bi_end_io = end_sync_write;
@@ -2988,8 +2977,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
if (len == 0)
break;
if (sync_blocks == 0) {
if (!md_bitmap_start_sync(mddev->bitmap, sector_nr,
&sync_blocks, still_degraded) &&
if (!mddev->bitmap_ops->start_sync(mddev, sector_nr,
&sync_blocks, still_degraded) &&
!conf->fullsync &&
!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
break;
@@ -3313,14 +3302,16 @@ static int raid1_resize(struct mddev *mddev, sector_t sectors)
* worth it.
*/
sector_t newsize = raid1_size(mddev, sectors, 0);
int ret;
if (mddev->external_size &&
mddev->array_sectors > newsize)
return -EINVAL;
if (mddev->bitmap) {
int ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
if (ret)
return ret;
}
ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false);
if (ret)
return ret;
md_set_array_sectors(mddev, newsize);
if (sectors > mddev->dev_sectors &&
mddev->recovery_cp > mddev->dev_sectors) {
+41 -34
View File
@@ -426,12 +426,13 @@ static void raid10_end_read_request(struct bio *bio)
static void close_write(struct r10bio *r10_bio)
{
struct mddev *mddev = r10_bio->mddev;
/* clear the bitmap if all writes complete successfully */
md_bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
r10_bio->sectors,
!test_bit(R10BIO_Degraded, &r10_bio->state),
0);
md_write_end(r10_bio->mddev);
mddev->bitmap_ops->endwrite(mddev, r10_bio->sector, r10_bio->sectors,
!test_bit(R10BIO_Degraded, &r10_bio->state),
false);
md_write_end(mddev);
}
static void one_write_done(struct r10bio *r10_bio)
@@ -884,7 +885,7 @@ static void flush_pending_writes(struct r10conf *conf)
__set_current_state(TASK_RUNNING);
blk_start_plug(&plug);
raid1_prepare_flush_writes(conf->mddev->bitmap);
raid1_prepare_flush_writes(conf->mddev);
wake_up(&conf->wait_barrier);
while (bio) { /* submit pending writes */
@@ -1100,7 +1101,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
/* we aren't scheduling, so we can do the write-out directly. */
bio = bio_list_get(&plug->pending);
raid1_prepare_flush_writes(mddev->bitmap);
raid1_prepare_flush_writes(mddev);
wake_up_barrier(conf);
while (bio) { /* submit pending writes */
@@ -1492,7 +1493,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
md_account_bio(mddev, &bio);
r10_bio->master_bio = bio;
atomic_set(&r10_bio->remaining, 1);
md_bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
mddev->bitmap_ops->startwrite(mddev, r10_bio->sector, r10_bio->sectors,
false);
for (i = 0; i < conf->copies; i++) {
if (r10_bio->devs[i].bio)
@@ -2465,7 +2467,7 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
s = PAGE_SIZE >> 9;
rdev = conf->mirrors[dr].rdev;
addr = r10_bio->devs[0].addr + sect,
addr = r10_bio->devs[0].addr + sect;
ok = sync_page_io(rdev,
addr,
s << 9,
@@ -3192,13 +3194,15 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
if (mddev->curr_resync < max_sector) { /* aborted */
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
&sync_blocks, 1);
mddev->bitmap_ops->end_sync(mddev,
mddev->curr_resync,
&sync_blocks);
else for (i = 0; i < conf->geo.raid_disks; i++) {
sector_t sect =
raid10_find_virt(conf, mddev->curr_resync, i);
md_bitmap_end_sync(mddev->bitmap, sect,
&sync_blocks, 1);
mddev->bitmap_ops->end_sync(mddev, sect,
&sync_blocks);
}
} else {
/* completed sync */
@@ -3218,7 +3222,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
}
conf->fullsync = 0;
}
md_bitmap_close_sync(mddev->bitmap);
mddev->bitmap_ops->close_sync(mddev);
close_sync(conf);
*skipped = 1;
return sectors_skipped;
@@ -3287,10 +3291,10 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
r10_bio = NULL;
for (i = 0 ; i < conf->geo.raid_disks; i++) {
int still_degraded;
bool still_degraded;
struct r10bio *rb2;
sector_t sect;
int must_sync;
bool must_sync;
int any_working;
struct raid10_info *mirror = &conf->mirrors[i];
struct md_rdev *mrdev, *mreplace;
@@ -3307,7 +3311,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
if (!mrdev && !mreplace)
continue;
still_degraded = 0;
still_degraded = false;
/* want to reconstruct this device */
rb2 = r10_bio;
sect = raid10_find_virt(conf, sector_nr, i);
@@ -3320,8 +3324,9 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
* we only need to recover the block if it is set in
* the bitmap
*/
must_sync = md_bitmap_start_sync(mddev->bitmap, sect,
&sync_blocks, 1);
must_sync = mddev->bitmap_ops->start_sync(mddev, sect,
&sync_blocks,
true);
if (sync_blocks < max_sync)
max_sync = sync_blocks;
if (!must_sync &&
@@ -3359,13 +3364,13 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
struct md_rdev *rdev = conf->mirrors[j].rdev;
if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
still_degraded = 1;
still_degraded = false;
break;
}
}
must_sync = md_bitmap_start_sync(mddev->bitmap, sect,
&sync_blocks, still_degraded);
must_sync = mddev->bitmap_ops->start_sync(mddev, sect,
&sync_blocks, still_degraded);
any_working = 0;
for (j=0; j<conf->copies;j++) {
@@ -3538,12 +3543,13 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
* safety reason, which ensures curr_resync_completed is
* updated in bitmap_cond_end_sync.
*/
md_bitmap_cond_end_sync(mddev->bitmap, sector_nr,
mddev->bitmap_ops->cond_end_sync(mddev, sector_nr,
mddev_is_clustered(mddev) &&
(sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));
if (!md_bitmap_start_sync(mddev->bitmap, sector_nr,
&sync_blocks, mddev->degraded) &&
if (!mddev->bitmap_ops->start_sync(mddev, sector_nr,
&sync_blocks,
mddev->degraded) &&
!conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
&mddev->recovery)) {
/* We can skip this block */
@@ -4190,6 +4196,7 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors)
*/
struct r10conf *conf = mddev->private;
sector_t oldsize, size;
int ret;
if (mddev->reshape_position != MaxSector)
return -EBUSY;
@@ -4202,11 +4209,11 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors)
if (mddev->external_size &&
mddev->array_sectors > size)
return -EINVAL;
if (mddev->bitmap) {
int ret = md_bitmap_resize(mddev->bitmap, size, 0, 0);
if (ret)
return ret;
}
ret = mddev->bitmap_ops->resize(mddev, size, 0, false);
if (ret)
return ret;
md_set_array_sectors(mddev, size);
if (sectors > mddev->dev_sectors &&
mddev->recovery_cp > oldsize) {
@@ -4472,7 +4479,7 @@ static int raid10_start_reshape(struct mddev *mddev)
newsize = raid10_size(mddev, 0, conf->geo.raid_disks);
if (!mddev_is_clustered(mddev)) {
ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false);
if (ret)
goto abort;
else
@@ -4487,20 +4494,20 @@ static int raid10_start_reshape(struct mddev *mddev)
/*
* some node is already performing reshape, and no need to
* call md_bitmap_resize again since it should be called when
* call bitmap_ops->resize again since it should be called when
* receiving BITMAP_RESIZE msg
*/
if ((sb && (le32_to_cpu(sb->feature_map) &
MD_FEATURE_RESHAPE_ACTIVE)) || (oldsize == newsize))
goto out;
ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false);
if (ret)
goto abort;
ret = md_cluster_ops->resize_bitmaps(mddev, newsize, oldsize);
if (ret) {
md_bitmap_resize(mddev->bitmap, oldsize, 0, 0);
mddev->bitmap_ops->resize(mddev, oldsize, 0, false);
goto abort;
}
}
+5 -9
View File
@@ -313,10 +313,10 @@ void r5c_handle_cached_data_endio(struct r5conf *conf,
if (sh->dev[i].written) {
set_bit(R5_UPTODATE, &sh->dev[i].flags);
r5c_return_dev_pending_writes(conf, &sh->dev[i]);
md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
RAID5_STRIPE_SECTORS(conf),
!test_bit(STRIPE_DEGRADED, &sh->state),
0);
conf->mddev->bitmap_ops->endwrite(conf->mddev,
sh->sector, RAID5_STRIPE_SECTORS(conf),
!test_bit(STRIPE_DEGRADED, &sh->state),
false);
}
}
}
@@ -2798,7 +2798,6 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
{
struct r5l_log *log = READ_ONCE(conf->log);
int i;
int do_wakeup = 0;
sector_t tree_index;
void __rcu **pslot;
uintptr_t refcount;
@@ -2815,7 +2814,7 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
for (i = sh->disks; i--; ) {
clear_bit(R5_InJournal, &sh->dev[i].flags);
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
do_wakeup = 1;
wake_up_bit(&sh->dev[i].flags, R5_Overlap);
}
/*
@@ -2828,9 +2827,6 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
if (atomic_dec_and_test(&conf->pending_full_writes))
md_wakeup_thread(conf->mddev->thread);
if (do_wakeup)
wake_up(&conf->wait_for_overlap);
spin_lock_irq(&log->stripe_in_journal_lock);
list_del_init(&sh->r5c);
spin_unlock_irq(&log->stripe_in_journal_lock);
+82 -75
View File
@@ -2337,7 +2337,7 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
if (test_and_clear_bit(R5_Overlap, &dev->flags))
wake_up(&sh->raid_conf->wait_for_overlap);
wake_up_bit(&dev->flags, R5_Overlap);
}
}
local_unlock(&conf->percpu->lock);
@@ -3473,7 +3473,7 @@ static bool stripe_bio_overlaps(struct stripe_head *sh, struct bio *bi,
* With PPL only writes to consecutive data chunks within a
* stripe are allowed because for a single stripe_head we can
* only have one PPL entry at a time, which describes one data
* range. Not really an overlap, but wait_for_overlap can be
* range. Not really an overlap, but R5_Overlap can be
* used to handle this.
*/
sector_t sector;
@@ -3563,8 +3563,8 @@ static void __add_stripe_bio(struct stripe_head *sh, struct bio *bi,
*/
set_bit(STRIPE_BITMAP_PENDING, &sh->state);
spin_unlock_irq(&sh->stripe_lock);
md_bitmap_startwrite(conf->mddev->bitmap, sh->sector,
RAID5_STRIPE_SECTORS(conf), 0);
conf->mddev->bitmap_ops->startwrite(conf->mddev, sh->sector,
RAID5_STRIPE_SECTORS(conf), false);
spin_lock_irq(&sh->stripe_lock);
clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
if (!sh->batch_head) {
@@ -3652,7 +3652,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
log_stripe_write_finished(sh);
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
wake_up(&conf->wait_for_overlap);
wake_up_bit(&sh->dev[i].flags, R5_Overlap);
while (bi && bi->bi_iter.bi_sector <
sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
@@ -3663,8 +3663,9 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
bi = nextbi;
}
if (bitmap_end)
md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
RAID5_STRIPE_SECTORS(conf), 0, 0);
conf->mddev->bitmap_ops->endwrite(conf->mddev,
sh->sector, RAID5_STRIPE_SECTORS(conf),
false, false);
bitmap_end = 0;
/* and fail all 'written' */
bi = sh->dev[i].written;
@@ -3696,7 +3697,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
sh->dev[i].toread = NULL;
spin_unlock_irq(&sh->stripe_lock);
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
wake_up(&conf->wait_for_overlap);
wake_up_bit(&sh->dev[i].flags, R5_Overlap);
if (bi)
s->to_read--;
while (bi && bi->bi_iter.bi_sector <
@@ -3709,8 +3710,9 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
}
}
if (bitmap_end)
md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
RAID5_STRIPE_SECTORS(conf), 0, 0);
conf->mddev->bitmap_ops->endwrite(conf->mddev,
sh->sector, RAID5_STRIPE_SECTORS(conf),
false, false);
/* If we were in the middle of a write the parity block might
* still be locked - so just clear all R5_LOCKED flags
*/
@@ -3734,7 +3736,7 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
BUG_ON(sh->batch_head);
clear_bit(STRIPE_SYNCING, &sh->state);
if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
wake_up(&conf->wait_for_overlap);
wake_up_bit(&sh->dev[sh->pd_idx].flags, R5_Overlap);
s->syncing = 0;
s->replacing = 0;
/* There is nothing more to do for sync/check/repair.
@@ -4059,10 +4061,10 @@ returnbi:
bio_endio(wbi);
wbi = wbi2;
}
md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
RAID5_STRIPE_SECTORS(conf),
!test_bit(STRIPE_DEGRADED, &sh->state),
0);
conf->mddev->bitmap_ops->endwrite(conf->mddev,
sh->sector, RAID5_STRIPE_SECTORS(conf),
!test_bit(STRIPE_DEGRADED, &sh->state),
false);
if (head_sh->batch_head) {
sh = list_first_entry(&sh->batch_list,
struct stripe_head,
@@ -4875,7 +4877,6 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
{
struct stripe_head *sh, *next;
int i;
int do_wakeup = 0;
list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) {
@@ -4911,7 +4912,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
spin_unlock_irq(&sh->stripe_lock);
for (i = 0; i < sh->disks; i++) {
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
do_wakeup = 1;
wake_up_bit(&sh->dev[i].flags, R5_Overlap);
sh->dev[i].flags = head_sh->dev[i].flags &
(~((1 << R5_WriteError) | (1 << R5_Overlap)));
}
@@ -4925,12 +4926,9 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
spin_unlock_irq(&head_sh->stripe_lock);
for (i = 0; i < head_sh->disks; i++)
if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags))
do_wakeup = 1;
wake_up_bit(&head_sh->dev[i].flags, R5_Overlap);
if (head_sh->state & handle_flags)
set_bit(STRIPE_HANDLE, &head_sh->state);
if (do_wakeup)
wake_up(&head_sh->raid_conf->wait_for_overlap);
}
static void handle_stripe(struct stripe_head *sh)
@@ -5196,7 +5194,7 @@ static void handle_stripe(struct stripe_head *sh)
md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
clear_bit(STRIPE_SYNCING, &sh->state);
if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
wake_up(&conf->wait_for_overlap);
wake_up_bit(&sh->dev[sh->pd_idx].flags, R5_Overlap);
}
/* If the failed drives are just a ReadError, then we might need
@@ -5259,7 +5257,7 @@ static void handle_stripe(struct stripe_head *sh)
} else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
clear_bit(STRIPE_EXPAND_READY, &sh->state);
atomic_dec(&conf->reshape_stripes);
wake_up(&conf->wait_for_overlap);
wake_up(&conf->wait_for_reshape);
md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
}
@@ -5753,12 +5751,11 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
int d;
again:
sh = raid5_get_active_stripe(conf, NULL, logical_sector, 0);
prepare_to_wait(&conf->wait_for_overlap, &w,
TASK_UNINTERRUPTIBLE);
set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
if (test_bit(STRIPE_SYNCING, &sh->state)) {
raid5_release_stripe(sh);
schedule();
wait_on_bit(&sh->dev[sh->pd_idx].flags, R5_Overlap,
TASK_UNINTERRUPTIBLE);
goto again;
}
clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
@@ -5770,12 +5767,12 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
set_bit(R5_Overlap, &sh->dev[d].flags);
spin_unlock_irq(&sh->stripe_lock);
raid5_release_stripe(sh);
schedule();
wait_on_bit(&sh->dev[d].flags, R5_Overlap,
TASK_UNINTERRUPTIBLE);
goto again;
}
}
set_bit(STRIPE_DISCARD, &sh->state);
finish_wait(&conf->wait_for_overlap, &w);
sh->overwrite_disks = 0;
for (d = 0; d < conf->raid_disks; d++) {
if (d == sh->pd_idx || d == sh->qd_idx)
@@ -5788,13 +5785,10 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
}
spin_unlock_irq(&sh->stripe_lock);
if (conf->mddev->bitmap) {
for (d = 0;
d < conf->raid_disks - conf->max_degraded;
for (d = 0; d < conf->raid_disks - conf->max_degraded;
d++)
md_bitmap_startwrite(mddev->bitmap,
sh->sector,
RAID5_STRIPE_SECTORS(conf),
0);
mddev->bitmap_ops->startwrite(mddev, sh->sector,
RAID5_STRIPE_SECTORS(conf), false);
sh->bm_seq = conf->seq_flush + 1;
set_bit(STRIPE_BIT_DELAY, &sh->state);
}
@@ -5855,7 +5849,6 @@ static int add_all_stripe_bios(struct r5conf *conf,
struct bio *bi, int forwrite, int previous)
{
int dd_idx;
int ret = 1;
spin_lock_irq(&sh->stripe_lock);
@@ -5871,14 +5864,19 @@ static int add_all_stripe_bios(struct r5conf *conf,
if (stripe_bio_overlaps(sh, bi, dd_idx, forwrite)) {
set_bit(R5_Overlap, &dev->flags);
ret = 0;
continue;
spin_unlock_irq(&sh->stripe_lock);
raid5_release_stripe(sh);
/* release batch_last before wait to avoid risk of deadlock */
if (ctx->batch_last) {
raid5_release_stripe(ctx->batch_last);
ctx->batch_last = NULL;
}
md_wakeup_thread(conf->mddev->thread);
wait_on_bit(&dev->flags, R5_Overlap, TASK_UNINTERRUPTIBLE);
return 0;
}
}
if (!ret)
goto out;
for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) {
struct r5dev *dev = &sh->dev[dd_idx];
@@ -5894,9 +5892,8 @@ static int add_all_stripe_bios(struct r5conf *conf,
RAID5_STRIPE_SHIFT(conf), ctx->sectors_to_do);
}
out:
spin_unlock_irq(&sh->stripe_lock);
return ret;
return 1;
}
enum reshape_loc {
@@ -5992,17 +5989,17 @@ static enum stripe_result make_stripe_request(struct mddev *mddev,
goto out_release;
}
if (test_bit(STRIPE_EXPANDING, &sh->state) ||
!add_all_stripe_bios(conf, ctx, sh, bi, rw, previous)) {
/*
* Stripe is busy expanding or add failed due to
* overlap. Flush everything and wait a while.
*/
if (test_bit(STRIPE_EXPANDING, &sh->state)) {
md_wakeup_thread(mddev->thread);
ret = STRIPE_SCHEDULE_AND_RETRY;
goto out_release;
}
if (!add_all_stripe_bios(conf, ctx, sh, bi, rw, previous)) {
ret = STRIPE_RETRY;
goto out;
}
if (stripe_can_batch(sh)) {
stripe_add_to_batch_list(conf, sh, ctx->batch_last);
if (ctx->batch_last)
@@ -6073,6 +6070,7 @@ static sector_t raid5_bio_lowest_chunk_sector(struct r5conf *conf,
static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
{
DEFINE_WAIT_FUNC(wait, woken_wake_function);
bool on_wq;
struct r5conf *conf = mddev->private;
sector_t logical_sector;
struct stripe_request_ctx ctx = {};
@@ -6146,11 +6144,15 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
* sequential IO pattern. We don't bother with the optimization when
* reshaping as the performance benefit is not worth the complexity.
*/
if (likely(conf->reshape_progress == MaxSector))
if (likely(conf->reshape_progress == MaxSector)) {
logical_sector = raid5_bio_lowest_chunk_sector(conf, bi);
on_wq = false;
} else {
add_wait_queue(&conf->wait_for_reshape, &wait);
on_wq = true;
}
s = (logical_sector - ctx.first_sector) >> RAID5_STRIPE_SHIFT(conf);
add_wait_queue(&conf->wait_for_overlap, &wait);
while (1) {
res = make_stripe_request(mddev, conf, &ctx, logical_sector,
bi);
@@ -6161,6 +6163,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
continue;
if (res == STRIPE_SCHEDULE_AND_RETRY) {
WARN_ON_ONCE(!on_wq);
/*
* Must release the reference to batch_last before
* scheduling and waiting for work to be done,
@@ -6185,7 +6188,8 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
logical_sector = ctx.first_sector +
(s << RAID5_STRIPE_SHIFT(conf));
}
remove_wait_queue(&conf->wait_for_overlap, &wait);
if (unlikely(on_wq))
remove_wait_queue(&conf->wait_for_reshape, &wait);
if (ctx.batch_last)
raid5_release_stripe(ctx.batch_last);
@@ -6338,7 +6342,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
: (safepos < writepos && readpos > writepos)) ||
time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
/* Cannot proceed until we've updated the superblock... */
wait_event(conf->wait_for_overlap,
wait_event(conf->wait_for_reshape,
atomic_read(&conf->reshape_stripes)==0
|| test_bit(MD_RECOVERY_INTR, &mddev->recovery));
if (atomic_read(&conf->reshape_stripes) != 0)
@@ -6364,7 +6368,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
spin_lock_irq(&conf->device_lock);
conf->reshape_safe = mddev->reshape_position;
spin_unlock_irq(&conf->device_lock);
wake_up(&conf->wait_for_overlap);
wake_up(&conf->wait_for_reshape);
sysfs_notify_dirent_safe(mddev->sysfs_completed);
}
@@ -6447,7 +6451,7 @@ finish:
(sector_nr - mddev->curr_resync_completed) * 2
>= mddev->resync_max - mddev->curr_resync_completed) {
/* Cannot proceed until we've updated the superblock... */
wait_event(conf->wait_for_overlap,
wait_event(conf->wait_for_reshape,
atomic_read(&conf->reshape_stripes) == 0
|| test_bit(MD_RECOVERY_INTR, &mddev->recovery));
if (atomic_read(&conf->reshape_stripes) != 0)
@@ -6473,7 +6477,7 @@ finish:
spin_lock_irq(&conf->device_lock);
conf->reshape_safe = mddev->reshape_position;
spin_unlock_irq(&conf->device_lock);
wake_up(&conf->wait_for_overlap);
wake_up(&conf->wait_for_reshape);
sysfs_notify_dirent_safe(mddev->sysfs_completed);
}
ret:
@@ -6486,7 +6490,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
struct r5conf *conf = mddev->private;
struct stripe_head *sh;
sector_t sync_blocks;
int still_degraded = 0;
bool still_degraded = false;
int i;
if (sector_nr >= max_sector) {
@@ -6498,17 +6502,17 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
}
if (mddev->curr_resync < max_sector) /* aborted */
md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
&sync_blocks, 1);
mddev->bitmap_ops->end_sync(mddev, mddev->curr_resync,
&sync_blocks);
else /* completed sync */
conf->fullsync = 0;
md_bitmap_close_sync(mddev->bitmap);
mddev->bitmap_ops->close_sync(mddev);
return 0;
}
/* Allow raid5_quiesce to complete */
wait_event(conf->wait_for_overlap, conf->quiesce != 2);
wait_event(conf->wait_for_reshape, conf->quiesce != 2);
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
return reshape_request(mddev, sector_nr, skipped);
@@ -6531,7 +6535,8 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
}
if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
!conf->fullsync &&
!md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
!mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks,
true) &&
sync_blocks >= RAID5_STRIPE_SECTORS(conf)) {
/* we can skip this block, and probably more */
do_div(sync_blocks, RAID5_STRIPE_SECTORS(conf));
@@ -6540,7 +6545,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
return sync_blocks * RAID5_STRIPE_SECTORS(conf);
}
md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, false);
mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, false);
sh = raid5_get_active_stripe(conf, NULL, sector_nr,
R5_GAS_NOBLOCK);
@@ -6559,10 +6564,11 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
struct md_rdev *rdev = conf->disks[i].rdev;
if (rdev == NULL || test_bit(Faulty, &rdev->flags))
still_degraded = 1;
still_degraded = true;
}
md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks,
still_degraded);
set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
set_bit(STRIPE_HANDLE, &sh->state);
@@ -6767,7 +6773,7 @@ static void raid5d(struct md_thread *thread)
/* Now is a good time to flush some bitmap updates */
conf->seq_flush++;
spin_unlock_irq(&conf->device_lock);
md_bitmap_unplug(mddev->bitmap);
mddev->bitmap_ops->unplug(mddev, true);
spin_lock_irq(&conf->device_lock);
conf->seq_write = conf->seq_flush;
activate_bit_delay(conf, conf->temp_inactive_list);
@@ -7492,7 +7498,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
init_waitqueue_head(&conf->wait_for_quiescent);
init_waitqueue_head(&conf->wait_for_stripe);
init_waitqueue_head(&conf->wait_for_overlap);
init_waitqueue_head(&conf->wait_for_reshape);
INIT_LIST_HEAD(&conf->handle_list);
INIT_LIST_HEAD(&conf->loprio_list);
INIT_LIST_HEAD(&conf->hold_list);
@@ -8312,6 +8318,7 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
*/
sector_t newsize;
struct r5conf *conf = mddev->private;
int ret;
if (raid5_has_log(conf) || raid5_has_ppl(conf))
return -EINVAL;
@@ -8320,11 +8327,11 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
if (mddev->external_size &&
mddev->array_sectors > newsize)
return -EINVAL;
if (mddev->bitmap) {
int ret = md_bitmap_resize(mddev->bitmap, sectors, 0, 0);
if (ret)
return ret;
}
ret = mddev->bitmap_ops->resize(mddev, sectors, 0, false);
if (ret)
return ret;
md_set_array_sectors(mddev, newsize);
if (sectors > mddev->dev_sectors &&
mddev->recovery_cp > mddev->dev_sectors) {
@@ -8550,7 +8557,7 @@ static void end_reshape(struct r5conf *conf)
!test_bit(In_sync, &rdev->flags))
rdev->recovery_offset = MaxSector;
spin_unlock_irq(&conf->device_lock);
wake_up(&conf->wait_for_overlap);
wake_up(&conf->wait_for_reshape);
mddev_update_io_opt(conf->mddev,
conf->raid_disks - conf->max_degraded);
@@ -8614,13 +8621,13 @@ static void raid5_quiesce(struct mddev *mddev, int quiesce)
conf->quiesce = 1;
unlock_all_device_hash_locks_irq(conf);
/* allow reshape to continue */
wake_up(&conf->wait_for_overlap);
wake_up(&conf->wait_for_reshape);
} else {
/* re-enable writes */
lock_all_device_hash_locks_irq(conf);
conf->quiesce = 0;
wake_up(&conf->wait_for_quiescent);
wake_up(&conf->wait_for_overlap);
wake_up(&conf->wait_for_reshape);
unlock_all_device_hash_locks_irq(conf);
}
log_quiesce(conf, quiesce);
@@ -8939,7 +8946,7 @@ static void raid5_prepare_suspend(struct mddev *mddev)
{
struct r5conf *conf = mddev->private;
wake_up(&conf->wait_for_overlap);
wake_up(&conf->wait_for_reshape);
}
static struct md_personality raid6_personality =
+1 -1
View File
@@ -668,7 +668,7 @@ struct r5conf {
struct llist_head released_stripes;
wait_queue_head_t wait_for_quiescent;
wait_queue_head_t wait_for_stripe;
wait_queue_head_t wait_for_overlap;
wait_queue_head_t wait_for_reshape;
unsigned long cache_state;
struct shrinker *shrinker;
int pool_size; /* number of disks in stripeheads in pool */
+48 -10
View File
@@ -20,6 +20,28 @@ key_serial_t nvme_keyring_id(void)
}
EXPORT_SYMBOL_GPL(nvme_keyring_id);
static bool nvme_tls_psk_revoked(struct key *psk)
{
return test_bit(KEY_FLAG_REVOKED, &psk->flags) ||
test_bit(KEY_FLAG_INVALIDATED, &psk->flags);
}
struct key *nvme_tls_key_lookup(key_serial_t key_id)
{
struct key *key = key_lookup(key_id);
if (IS_ERR(key)) {
pr_err("key id %08x not found\n", key_id);
return key;
}
if (nvme_tls_psk_revoked(key)) {
pr_err("key id %08x revoked\n", key_id);
return ERR_PTR(-EKEYREVOKED);
}
return key;
}
EXPORT_SYMBOL_GPL(nvme_tls_key_lookup);
static void nvme_tls_psk_describe(const struct key *key, struct seq_file *m)
{
seq_puts(m, key->description);
@@ -36,14 +58,12 @@ static bool nvme_tls_psk_match(const struct key *key,
pr_debug("%s: no key description\n", __func__);
return false;
}
match_len = strlen(key->description);
pr_debug("%s: id %s len %zd\n", __func__, key->description, match_len);
if (!match_data->raw_data) {
pr_debug("%s: no match data\n", __func__);
return false;
}
match_id = match_data->raw_data;
match_len = strlen(match_id);
pr_debug("%s: match '%s' '%s' len %zd\n",
__func__, match_id, key->description, match_len);
return !memcmp(key->description, match_id, match_len);
@@ -71,7 +91,7 @@ static struct key_type nvme_tls_psk_key_type = {
static struct key *nvme_tls_psk_lookup(struct key *keyring,
const char *hostnqn, const char *subnqn,
int hmac, bool generated)
u8 hmac, u8 psk_ver, bool generated)
{
char *identity;
size_t identity_len = (NVMF_NQN_SIZE) * 2 + 11;
@@ -82,8 +102,8 @@ static struct key *nvme_tls_psk_lookup(struct key *keyring,
if (!identity)
return ERR_PTR(-ENOMEM);
snprintf(identity, identity_len, "NVMe0%c%02d %s %s",
generated ? 'G' : 'R', hmac, hostnqn, subnqn);
snprintf(identity, identity_len, "NVMe%u%c%02u %s %s",
psk_ver, generated ? 'G' : 'R', hmac, hostnqn, subnqn);
if (!keyring)
keyring = nvme_keyring;
@@ -107,21 +127,38 @@ static struct key *nvme_tls_psk_lookup(struct key *keyring,
/*
* NVMe PSK priority list
*
* 'Retained' PSKs (ie 'generated == false')
* should be preferred to 'generated' PSKs,
* and SHA-384 should be preferred to SHA-256.
* 'Retained' PSKs (ie 'generated == false') should be preferred to 'generated'
* PSKs, PSKs with hash (psk_ver 1) should be preferred to PSKs without hash
* (psk_ver 0), and SHA-384 should be preferred to SHA-256.
*/
static struct nvme_tls_psk_priority_list {
bool generated;
u8 psk_ver;
enum nvme_tcp_tls_cipher cipher;
} nvme_tls_psk_prio[] = {
{ .generated = false,
.psk_ver = 1,
.cipher = NVME_TCP_TLS_CIPHER_SHA384, },
{ .generated = false,
.psk_ver = 1,
.cipher = NVME_TCP_TLS_CIPHER_SHA256, },
{ .generated = false,
.psk_ver = 0,
.cipher = NVME_TCP_TLS_CIPHER_SHA384, },
{ .generated = false,
.psk_ver = 0,
.cipher = NVME_TCP_TLS_CIPHER_SHA256, },
{ .generated = true,
.psk_ver = 1,
.cipher = NVME_TCP_TLS_CIPHER_SHA384, },
{ .generated = true,
.psk_ver = 1,
.cipher = NVME_TCP_TLS_CIPHER_SHA256, },
{ .generated = true,
.psk_ver = 0,
.cipher = NVME_TCP_TLS_CIPHER_SHA384, },
{ .generated = true,
.psk_ver = 0,
.cipher = NVME_TCP_TLS_CIPHER_SHA256, },
};
@@ -137,10 +174,11 @@ key_serial_t nvme_tls_psk_default(struct key *keyring,
for (prio = 0; prio < ARRAY_SIZE(nvme_tls_psk_prio); prio++) {
bool generated = nvme_tls_psk_prio[prio].generated;
u8 ver = nvme_tls_psk_prio[prio].psk_ver;
enum nvme_tcp_tls_cipher cipher = nvme_tls_psk_prio[prio].cipher;
tls_key = nvme_tls_psk_lookup(keyring, hostnqn, subnqn,
cipher, generated);
cipher, ver, generated);
if (!IS_ERR(tls_key)) {
tls_key_id = tls_key->serial;
key_put(tls_key);
+2 -1
View File
@@ -41,6 +41,7 @@ config NVME_HWMON
config NVME_FABRICS
select NVME_CORE
select NVME_KEYRING if NVME_TCP_TLS
tristate
config NVME_RDMA
@@ -94,7 +95,6 @@ config NVME_TCP
config NVME_TCP_TLS
bool "NVMe over Fabrics TCP TLS encryption support"
depends on NVME_TCP
select NVME_KEYRING
select NET_HANDSHAKE
select KEYS
help
@@ -109,6 +109,7 @@ config NVME_HOST_AUTH
bool "NVMe over Fabrics In-Band Authentication in host side"
depends on NVME_CORE
select NVME_AUTH
select NVME_KEYRING if NVME_TCP_TLS
help
This provides support for NVMe over Fabrics In-Band Authentication in
host side.
+42 -5
View File
@@ -4,6 +4,7 @@
* Copyright (c) 2011-2014, Intel Corporation.
*/
#include <linux/async.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/blk-integrity.h>
@@ -987,8 +988,8 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
cmnd->rw.length =
cpu_to_le16((blk_rq_bytes(req) >> ns->head->lba_shift) - 1);
cmnd->rw.reftag = 0;
cmnd->rw.apptag = 0;
cmnd->rw.appmask = 0;
cmnd->rw.lbat = 0;
cmnd->rw.lbatm = 0;
if (ns->head->ms) {
/*
@@ -4040,6 +4041,35 @@ static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid)
}
}
/**
* struct async_scan_info - keeps track of controller & NSIDs to scan
* @ctrl: Controller on which namespaces are being scanned
* @next_nsid: Index of next NSID to scan in ns_list
* @ns_list: Pointer to list of NSIDs to scan
*
* Note: There is a single async_scan_info structure shared by all instances
* of nvme_scan_ns_async() scanning a given controller, so the atomic
* operations on next_nsid are critical to ensure each instance scans a unique
* NSID.
*/
struct async_scan_info {
struct nvme_ctrl *ctrl;
atomic_t next_nsid;
__le32 *ns_list;
};
static void nvme_scan_ns_async(void *data, async_cookie_t cookie)
{
struct async_scan_info *scan_info = data;
int idx;
u32 nsid;
idx = (u32)atomic_fetch_inc(&scan_info->next_nsid);
nsid = le32_to_cpu(scan_info->ns_list[idx]);
nvme_scan_ns(scan_info->ctrl, nsid);
}
static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
unsigned nsid)
{
@@ -4066,11 +4096,15 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
__le32 *ns_list;
u32 prev = 0;
int ret = 0, i;
ASYNC_DOMAIN(domain);
struct async_scan_info scan_info;
ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
if (!ns_list)
return -ENOMEM;
scan_info.ctrl = ctrl;
scan_info.ns_list = ns_list;
for (;;) {
struct nvme_command cmd = {
.identify.opcode = nvme_admin_identify,
@@ -4086,19 +4120,23 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
goto free;
}
atomic_set(&scan_info.next_nsid, 0);
for (i = 0; i < nr_entries; i++) {
u32 nsid = le32_to_cpu(ns_list[i]);
if (!nsid) /* end of the list? */
goto out;
nvme_scan_ns(ctrl, nsid);
async_schedule_domain(nvme_scan_ns_async, &scan_info,
&domain);
while (++prev < nsid)
nvme_ns_remove_by_nsid(ctrl, prev);
}
async_synchronize_full_domain(&domain);
}
out:
nvme_remove_invalid_namespaces(ctrl, prev);
free:
async_synchronize_full_domain(&domain);
kfree(ns_list);
return ret;
}
@@ -4568,7 +4606,7 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
set->flags = BLK_MQ_F_SHOULD_MERGE;
if (ctrl->ops->flags & NVME_F_BLOCKING)
set->flags |= BLK_MQ_F_BLOCKING;
set->cmd_size = cmd_size,
set->cmd_size = cmd_size;
set->driver_data = ctrl;
set->nr_hw_queues = ctrl->queue_count - 1;
set->timeout = NVME_IO_TIMEOUT;
@@ -4678,7 +4716,6 @@ static void nvme_free_ctrl(struct device *dev)
if (!subsys || ctrl->instance != subsys->instance)
ida_free(&nvme_instance_ida, ctrl->instance);
key_put(ctrl->tls_key);
nvme_free_cels(ctrl);
nvme_mpath_uninit(ctrl);
cleanup_srcu_struct(&ctrl->srcu);
+1 -1
View File
@@ -665,7 +665,7 @@ static struct key *nvmf_parse_key(int key_id)
return ERR_PTR(-EINVAL);
}
key = key_lookup(key_id);
key = nvme_tls_key_lookup(key_id);
if (IS_ERR(key))
pr_err("key id %08x not found\n", key_id);
else
+16 -10
View File
@@ -4,6 +4,7 @@
* Copyright (c) 2017-2021 Christoph Hellwig.
*/
#include <linux/bio-integrity.h>
#include <linux/blk-integrity.h>
#include <linux/ptrace.h> /* for force_successful_syscall_return */
#include <linux/nvme_ioctl.h>
#include <linux/io_uring/cmd.h>
@@ -119,9 +120,14 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer,
struct request_queue *q = req->q;
struct nvme_ns *ns = q->queuedata;
struct block_device *bdev = ns ? ns->disk->part0 : NULL;
bool supports_metadata = bdev && blk_get_integrity(bdev->bd_disk);
bool has_metadata = meta_buffer && meta_len;
struct bio *bio = NULL;
int ret;
if (has_metadata && !supports_metadata)
return -EINVAL;
if (ioucmd && (ioucmd->flags & IORING_URING_CMD_FIXED)) {
struct iov_iter iter;
@@ -143,15 +149,15 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer,
goto out;
bio = req->bio;
if (bdev) {
if (bdev)
bio_set_dev(bio, bdev);
if (meta_buffer && meta_len) {
ret = bio_integrity_map_user(bio, meta_buffer, meta_len,
meta_seed);
if (ret)
goto out_unmap;
req->cmd_flags |= REQ_INTEGRITY;
}
if (has_metadata) {
ret = bio_integrity_map_user(bio, meta_buffer, meta_len,
meta_seed);
if (ret)
goto out_unmap;
req->cmd_flags |= REQ_INTEGRITY;
}
return ret;
@@ -260,8 +266,8 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
c.rw.control = cpu_to_le16(io.control);
c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
c.rw.reftag = cpu_to_le32(io.reftag);
c.rw.apptag = cpu_to_le16(io.apptag);
c.rw.appmask = cpu_to_le16(io.appmask);
c.rw.lbat = cpu_to_le16(io.apptag);
c.rw.lbatm = cpu_to_le16(io.appmask);
return nvme_submit_user_cmd(ns->queue, &c, io.addr, length, metadata,
meta_len, lower_32_bits(io.slba), NULL, 0, 0);
+6 -1
View File
@@ -90,6 +90,11 @@ enum nvme_quirks {
*/
NVME_QUIRK_NO_DEEPEST_PS = (1 << 5),
/*
* Problems seen with concurrent commands
*/
NVME_QUIRK_QDEPTH_ONE = (1 << 6),
/*
* Set MEDIUM priority on SQ creation
*/
@@ -372,7 +377,7 @@ struct nvme_ctrl {
struct nvme_dhchap_key *ctrl_key;
u16 transaction;
#endif
struct key *tls_key;
key_serial_t tls_pskid;
/* Power saving configuration */
u64 ps_max_latency_us;
+9 -9
View File
@@ -2563,15 +2563,8 @@ static int nvme_pci_enable(struct nvme_dev *dev)
else
dev->io_sqes = NVME_NVM_IOSQES;
/*
* Temporary fix for the Apple controller found in the MacBook8,1 and
* some MacBook7,1 to avoid controller resets and data loss.
*/
if (pdev->vendor == PCI_VENDOR_ID_APPLE && pdev->device == 0x2001) {
if (dev->ctrl.quirks & NVME_QUIRK_QDEPTH_ONE) {
dev->q_depth = 2;
dev_warn(dev->ctrl.device, "detected Apple NVMe controller, "
"set queue depth=%u to work around controller resets\n",
dev->q_depth);
} else if (pdev->vendor == PCI_VENDOR_ID_SAMSUNG &&
(pdev->device == 0xa821 || pdev->device == 0xa822) &&
NVME_CAP_MQES(dev->ctrl.cap) == 0) {
@@ -3442,6 +3435,8 @@ static const struct pci_device_id nvme_id_table[] = {
NVME_QUIRK_BOGUS_NID, },
{ PCI_VDEVICE(REDHAT, 0x0010), /* Qemu emulated controller */
.driver_data = NVME_QUIRK_BOGUS_NID, },
{ PCI_DEVICE(0x1217, 0x8760), /* O2 Micro 64GB Steam Deck */
.driver_data = NVME_QUIRK_QDEPTH_ONE },
{ PCI_DEVICE(0x126f, 0x2262), /* Silicon Motion generic */
.driver_data = NVME_QUIRK_NO_DEEPEST_PS |
NVME_QUIRK_BOGUS_NID, },
@@ -3576,7 +3571,12 @@ static const struct pci_device_id nvme_id_table[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0xcd02),
.driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001),
.driver_data = NVME_QUIRK_SINGLE_VECTOR },
/*
* Fix for the Apple controller found in the MacBook8,1 and
* some MacBook7,1 to avoid controller resets and data loss.
*/
.driver_data = NVME_QUIRK_SINGLE_VECTOR |
NVME_QUIRK_QDEPTH_ONE },
{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) },
{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2005),
.driver_data = NVME_QUIRK_SINGLE_VECTOR |
+4 -2
View File
@@ -1363,8 +1363,8 @@ static void nvme_rdma_set_sig_domain(struct blk_integrity *bi,
if (control & NVME_RW_PRINFO_PRCHK_REF)
domain->sig.dif.ref_remap = true;
domain->sig.dif.app_tag = le16_to_cpu(cmd->rw.apptag);
domain->sig.dif.apptag_check_mask = le16_to_cpu(cmd->rw.appmask);
domain->sig.dif.app_tag = le16_to_cpu(cmd->rw.lbat);
domain->sig.dif.apptag_check_mask = le16_to_cpu(cmd->rw.lbatm);
domain->sig.dif.app_escape = true;
if (pi_type == NVME_NS_DPS_PI_TYPE3)
domain->sig.dif.ref_escape = true;
@@ -1876,6 +1876,8 @@ static int nvme_rdma_route_resolved(struct nvme_rdma_queue *queue)
*/
priv.hrqsize = cpu_to_le16(queue->queue_size);
priv.hsqsize = cpu_to_le16(queue->ctrl->ctrl.sqsize);
/* cntlid should only be set when creating an I/O queue */
priv.cntlid = cpu_to_le16(ctrl->ctrl.cntlid);
}
ret = rdma_connect_locked(queue->cm_id, &param);
+69 -21
View File
@@ -664,19 +664,6 @@ static DEVICE_ATTR(dhchap_ctrl_secret, S_IRUGO | S_IWUSR,
nvme_ctrl_dhchap_ctrl_secret_show, nvme_ctrl_dhchap_ctrl_secret_store);
#endif
#ifdef CONFIG_NVME_TCP_TLS
static ssize_t tls_key_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
if (!ctrl->tls_key)
return 0;
return sysfs_emit(buf, "%08x", key_serial(ctrl->tls_key));
}
static DEVICE_ATTR_RO(tls_key);
#endif
static struct attribute *nvme_dev_attrs[] = {
&dev_attr_reset_controller.attr,
&dev_attr_rescan_controller.attr,
@@ -703,9 +690,6 @@ static struct attribute *nvme_dev_attrs[] = {
#ifdef CONFIG_NVME_HOST_AUTH
&dev_attr_dhchap_secret.attr,
&dev_attr_dhchap_ctrl_secret.attr,
#endif
#ifdef CONFIG_NVME_TCP_TLS
&dev_attr_tls_key.attr,
#endif
&dev_attr_adm_passthru_err_log_enabled.attr,
NULL
@@ -737,11 +721,6 @@ static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,
if (a == &dev_attr_dhchap_ctrl_secret.attr && !ctrl->opts)
return 0;
#endif
#ifdef CONFIG_NVME_TCP_TLS
if (a == &dev_attr_tls_key.attr &&
(!ctrl->opts || strcmp(ctrl->opts->transport, "tcp")))
return 0;
#endif
return a->mode;
}
@@ -752,8 +731,77 @@ const struct attribute_group nvme_dev_attrs_group = {
};
EXPORT_SYMBOL_GPL(nvme_dev_attrs_group);
#ifdef CONFIG_NVME_TCP_TLS
static ssize_t tls_key_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
if (!ctrl->tls_pskid)
return 0;
return sysfs_emit(buf, "%08x\n", ctrl->tls_pskid);
}
static DEVICE_ATTR_RO(tls_key);
static ssize_t tls_configured_key_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
struct key *key = ctrl->opts->tls_key;
return sysfs_emit(buf, "%08x\n", key_serial(key));
}
static DEVICE_ATTR_RO(tls_configured_key);
static ssize_t tls_keyring_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
struct key *keyring = ctrl->opts->keyring;
return sysfs_emit(buf, "%s\n", keyring->description);
}
static DEVICE_ATTR_RO(tls_keyring);
static struct attribute *nvme_tls_attrs[] = {
&dev_attr_tls_key.attr,
&dev_attr_tls_configured_key.attr,
&dev_attr_tls_keyring.attr,
};
static umode_t nvme_tls_attrs_are_visible(struct kobject *kobj,
struct attribute *a, int n)
{
struct device *dev = container_of(kobj, struct device, kobj);
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
if (!ctrl->opts || strcmp(ctrl->opts->transport, "tcp"))
return 0;
if (a == &dev_attr_tls_key.attr &&
!ctrl->opts->tls)
return 0;
if (a == &dev_attr_tls_configured_key.attr &&
!ctrl->opts->tls_key)
return 0;
if (a == &dev_attr_tls_keyring.attr &&
!ctrl->opts->keyring)
return 0;
return a->mode;
}
const struct attribute_group nvme_tls_attrs_group = {
.attrs = nvme_tls_attrs,
.is_visible = nvme_tls_attrs_are_visible,
};
#endif
const struct attribute_group *nvme_dev_attr_groups[] = {
&nvme_dev_attrs_group,
#ifdef CONFIG_NVME_TCP_TLS
&nvme_tls_attrs_group,
#endif
NULL,
};
+42 -15
View File
@@ -165,6 +165,7 @@ struct nvme_tcp_queue {
bool hdr_digest;
bool data_digest;
bool tls_enabled;
struct ahash_request *rcv_hash;
struct ahash_request *snd_hash;
__le32 exp_ddgst;
@@ -213,7 +214,21 @@ static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue)
return queue - queue->ctrl->queues;
}
static inline bool nvme_tcp_tls(struct nvme_ctrl *ctrl)
/*
* Check if the queue is TLS encrypted
*/
static inline bool nvme_tcp_queue_tls(struct nvme_tcp_queue *queue)
{
if (!IS_ENABLED(CONFIG_NVME_TCP_TLS))
return 0;
return queue->tls_enabled;
}
/*
* Check if TLS is configured for the controller.
*/
static inline bool nvme_tcp_tls_configured(struct nvme_ctrl *ctrl)
{
if (!IS_ENABLED(CONFIG_NVME_TCP_TLS))
return 0;
@@ -368,7 +383,7 @@ static inline bool nvme_tcp_queue_has_pending(struct nvme_tcp_queue *queue)
static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue)
{
return !nvme_tcp_tls(&queue->ctrl->ctrl) &&
return !nvme_tcp_queue_tls(queue) &&
nvme_tcp_queue_has_pending(queue);
}
@@ -1051,7 +1066,7 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
else
msg.msg_flags |= MSG_MORE;
if (!sendpage_ok(page))
if (!sendpages_ok(page, len, offset))
msg.msg_flags &= ~MSG_SPLICE_PAGES;
bvec_set_page(&bvec, page, len, offset);
@@ -1427,7 +1442,7 @@ static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
memset(&msg, 0, sizeof(msg));
iov.iov_base = icresp;
iov.iov_len = sizeof(*icresp);
if (nvme_tcp_tls(&queue->ctrl->ctrl)) {
if (nvme_tcp_queue_tls(queue)) {
msg.msg_control = cbuf;
msg.msg_controllen = sizeof(cbuf);
}
@@ -1439,7 +1454,7 @@ static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
goto free_icresp;
}
ret = -ENOTCONN;
if (nvme_tcp_tls(&queue->ctrl->ctrl)) {
if (nvme_tcp_queue_tls(queue)) {
ctype = tls_get_record_type(queue->sock->sk,
(struct cmsghdr *)cbuf);
if (ctype != TLS_RECORD_TYPE_DATA) {
@@ -1581,13 +1596,16 @@ static void nvme_tcp_tls_done(void *data, int status, key_serial_t pskid)
goto out_complete;
}
tls_key = key_lookup(pskid);
tls_key = nvme_tls_key_lookup(pskid);
if (IS_ERR(tls_key)) {
dev_warn(ctrl->ctrl.device, "queue %d: Invalid key %x\n",
qid, pskid);
queue->tls_err = -ENOKEY;
} else {
ctrl->ctrl.tls_key = tls_key;
queue->tls_enabled = true;
if (qid == 0)
ctrl->ctrl.tls_pskid = key_serial(tls_key);
key_put(tls_key);
queue->tls_err = 0;
}
@@ -1768,7 +1786,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
}
/* If PSKs are configured try to start TLS */
if (IS_ENABLED(CONFIG_NVME_TCP_TLS) && pskid) {
if (nvme_tcp_tls_configured(nctrl) && pskid) {
ret = nvme_tcp_start_tls(nctrl, queue, pskid);
if (ret)
goto err_init_connect;
@@ -1829,6 +1847,8 @@ static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
mutex_lock(&queue->queue_lock);
if (test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags))
__nvme_tcp_stop_queue(queue);
/* Stopping the queue will disable TLS */
queue->tls_enabled = false;
mutex_unlock(&queue->queue_lock);
}
@@ -1925,16 +1945,17 @@ static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl)
int ret;
key_serial_t pskid = 0;
if (nvme_tcp_tls(ctrl)) {
if (nvme_tcp_tls_configured(ctrl)) {
if (ctrl->opts->tls_key)
pskid = key_serial(ctrl->opts->tls_key);
else
else {
pskid = nvme_tls_psk_default(ctrl->opts->keyring,
ctrl->opts->host->nqn,
ctrl->opts->subsysnqn);
if (!pskid) {
dev_err(ctrl->device, "no valid PSK found\n");
return -ENOKEY;
if (!pskid) {
dev_err(ctrl->device, "no valid PSK found\n");
return -ENOKEY;
}
}
}
@@ -1957,13 +1978,14 @@ static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
{
int i, ret;
if (nvme_tcp_tls(ctrl) && !ctrl->tls_key) {
if (nvme_tcp_tls_configured(ctrl) && !ctrl->tls_pskid) {
dev_err(ctrl->device, "no PSK negotiated\n");
return -ENOKEY;
}
for (i = 1; i < ctrl->queue_count; i++) {
ret = nvme_tcp_alloc_queue(ctrl, i,
key_serial(ctrl->tls_key));
ctrl->tls_pskid);
if (ret)
goto out_free_queues;
}
@@ -2144,6 +2166,11 @@ static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
if (remove)
nvme_unquiesce_admin_queue(ctrl);
nvme_tcp_destroy_admin_queue(ctrl, remove);
if (ctrl->tls_pskid) {
dev_dbg(ctrl->device, "Wipe negotiated TLS_PSK %08x\n",
ctrl->tls_pskid);
ctrl->tls_pskid = 0;
}
}
static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
-2
View File
@@ -1015,8 +1015,6 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req)
if (nvme_is_fabrics(cmd))
return nvmet_parse_fabrics_admin_cmd(req);
if (unlikely(!nvmet_check_auth_status(req)))
return NVME_SC_AUTH_REQUIRED | NVME_STATUS_DNR;
if (nvmet_is_disc_subsys(nvmet_req_subsys(req)))
return nvmet_parse_discovery_cmd(req);
+12
View File
@@ -25,6 +25,18 @@ int nvmet_auth_set_key(struct nvmet_host *host, const char *secret,
unsigned char key_hash;
char *dhchap_secret;
if (!strlen(secret)) {
if (set_ctrl) {
kfree(host->dhchap_ctrl_secret);
host->dhchap_ctrl_secret = NULL;
host->dhchap_ctrl_key_hash = 0;
} else {
kfree(host->dhchap_secret);
host->dhchap_secret = NULL;
host->dhchap_key_hash = 0;
}
return 0;
}
if (sscanf(secret, "DHHC-1:%hhd:%*s", &key_hash) != 1)
return -EINVAL;
if (key_hash > 3) {
+2 -2
View File
@@ -578,8 +578,8 @@ static void nvmet_rdma_set_sig_domain(struct blk_integrity *bi,
if (control & NVME_RW_PRINFO_PRCHK_REF)
domain->sig.dif.ref_remap = true;
domain->sig.dif.app_tag = le16_to_cpu(cmd->rw.apptag);
domain->sig.dif.apptag_check_mask = le16_to_cpu(cmd->rw.appmask);
domain->sig.dif.app_tag = le16_to_cpu(cmd->rw.lbat);
domain->sig.dif.apptag_check_mask = le16_to_cpu(cmd->rw.lbatm);
domain->sig.dif.app_escape = true;
if (pi_type == NVME_NS_DPS_PI_TYPE3)
domain->sig.dif.ref_escape = true;
-2
View File
@@ -14,8 +14,6 @@
/* Ugly macros make the code more pretty. */
#define GET_END_PTR(st,p,sz) ((st *)((char *)(p)+((sz)-sizeof(st))))
#define AFFS_GET_HASHENTRY(data,hashkey) be32_to_cpu(((struct dir_front *)data)->hashtable[hashkey])
#define AFFS_BLOCK(sb, bh, blk) (AFFS_HEAD(bh)->table[AFFS_SB(sb)->s_hashsize-1-(blk)])
#define AFFS_HEAD(bh) ((struct affs_head *)(bh)->b_data)
+2 -1
View File
@@ -49,12 +49,13 @@ struct affs_short_date {
struct affs_root_head {
__be32 ptype;
/* The following fields are not used, but kept as documentation. */
__be32 spare1;
__be32 spare2;
__be32 hash_size;
__be32 spare3;
__be32 checksum;
__be32 hashtable[1];
__be32 hashtable[];
};
struct affs_root_tail {
+3 -3
View File
@@ -219,8 +219,8 @@ static void free_pref(struct prelim_ref *ref)
* A -1 return indicates ref1 is a 'lower' block than ref2, while 1
* indicates a 'higher' block.
*/
static int prelim_ref_compare(struct prelim_ref *ref1,
struct prelim_ref *ref2)
static int prelim_ref_compare(const struct prelim_ref *ref1,
const struct prelim_ref *ref2)
{
if (ref1->level < ref2->level)
return -1;
@@ -251,7 +251,7 @@ static int prelim_ref_compare(struct prelim_ref *ref1,
}
static void update_share_count(struct share_check *sc, int oldcount,
int newcount, struct prelim_ref *newref)
int newcount, const struct prelim_ref *newref)
{
if ((!sc) || (oldcount == 0 && newcount < 1))
return;
+43 -41
View File
@@ -53,7 +53,7 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info,
/*
* Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for
* btrfs, and is used for all I/O submitted through btrfs_submit_bio.
* btrfs, and is used for all I/O submitted through btrfs_submit_bbio().
*
* Just like the underlying bio_alloc_bioset it will not fail as it is backed by
* a mempool.
@@ -73,20 +73,13 @@ struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
struct btrfs_bio *orig_bbio,
u64 map_length, bool use_append)
u64 map_length)
{
struct btrfs_bio *bbio;
struct bio *bio;
if (use_append) {
unsigned int nr_segs;
bio = bio_split_rw(&orig_bbio->bio, &fs_info->limits, &nr_segs,
&btrfs_clone_bioset, map_length);
} else {
bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT,
GFP_NOFS, &btrfs_clone_bioset);
}
bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT, GFP_NOFS,
&btrfs_clone_bioset);
bbio = btrfs_bio(bio);
btrfs_bio_init(bbio, fs_info, NULL, orig_bbio);
bbio->inode = orig_bbio->inode;
@@ -120,12 +113,6 @@ static void __btrfs_bio_end_io(struct btrfs_bio *bbio)
}
}
void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
{
bbio->bio.bi_status = status;
__btrfs_bio_end_io(bbio);
}
static void btrfs_orig_write_end_io(struct bio *bio);
static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio,
@@ -147,8 +134,9 @@ static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio,
}
}
static void btrfs_orig_bbio_end_io(struct btrfs_bio *bbio)
void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
{
bbio->bio.bi_status = status;
if (bbio->bio.bi_pool == &btrfs_clone_bioset) {
struct btrfs_bio *orig_bbio = bbio->private;
@@ -179,7 +167,7 @@ static int prev_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
static void btrfs_repair_done(struct btrfs_failed_bio *fbio)
{
if (atomic_dec_and_test(&fbio->repair_count)) {
btrfs_orig_bbio_end_io(fbio->bbio);
btrfs_bio_end_io(fbio->bbio, fbio->bbio->bio.bi_status);
mempool_free(fbio, &btrfs_failed_bio_pool);
}
}
@@ -211,7 +199,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
goto done;
}
btrfs_submit_bio(repair_bbio, mirror);
btrfs_submit_bbio(repair_bbio, mirror);
return;
}
@@ -280,7 +268,7 @@ static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio,
mirror = next_repair_mirror(fbio, failed_bbio->mirror_num);
btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror);
btrfs_submit_bio(repair_bbio, mirror);
btrfs_submit_bbio(repair_bbio, mirror);
return fbio;
}
@@ -326,7 +314,7 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de
if (fbio)
btrfs_repair_done(fbio);
else
btrfs_orig_bbio_end_io(bbio);
btrfs_bio_end_io(bbio, bbio->bio.bi_status);
}
static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev)
@@ -360,7 +348,7 @@ static void btrfs_end_bio_work(struct work_struct *work)
if (is_data_bbio(bbio))
btrfs_check_read_bio(bbio, bbio->bio.bi_private);
else
btrfs_orig_bbio_end_io(bbio);
btrfs_bio_end_io(bbio, bbio->bio.bi_status);
}
static void btrfs_simple_end_io(struct bio *bio)
@@ -380,7 +368,7 @@ static void btrfs_simple_end_io(struct bio *bio)
} else {
if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status)
btrfs_record_physical_zoned(bbio);
btrfs_orig_bbio_end_io(bbio);
btrfs_bio_end_io(bbio, bbio->bio.bi_status);
}
}
@@ -394,7 +382,7 @@ static void btrfs_raid56_end_io(struct bio *bio)
if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio))
btrfs_check_read_bio(bbio, NULL);
else
btrfs_orig_bbio_end_io(bbio);
btrfs_bio_end_io(bbio, bbio->bio.bi_status);
btrfs_put_bioc(bioc);
}
@@ -424,7 +412,7 @@ static void btrfs_orig_write_end_io(struct bio *bio)
if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status)
stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
btrfs_orig_bbio_end_io(bbio);
btrfs_bio_end_io(bbio, bbio->bio.bi_status);
btrfs_put_bioc(bioc);
}
@@ -502,8 +490,8 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio);
}
static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc,
struct btrfs_io_stripe *smap, int mirror_num)
static void btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc,
struct btrfs_io_stripe *smap, int mirror_num)
{
if (!bioc) {
/* Single mirror read/write fast path. */
@@ -593,7 +581,7 @@ static void run_one_async_done(struct btrfs_work *work, bool do_free)
/* If an error occurred we just want to clean up the bio and move on. */
if (bio->bi_status) {
btrfs_orig_bbio_end_io(async->bbio);
btrfs_bio_end_io(async->bbio, async->bbio->bio.bi_status);
return;
}
@@ -603,7 +591,7 @@ static void run_one_async_done(struct btrfs_work *work, bool do_free)
* context. This changes nothing when cgroups aren't in use.
*/
bio->bi_opf |= REQ_BTRFS_CGROUP_PUNT;
__btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num);
btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num);
}
static bool should_async_write(struct btrfs_bio *bbio)
@@ -664,6 +652,19 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
return true;
}
static u64 btrfs_append_map_length(struct btrfs_bio *bbio, u64 map_length)
{
unsigned int nr_segs;
int sector_offset;
map_length = min(map_length, bbio->fs_info->max_zone_append_size);
sector_offset = bio_split_rw_at(&bbio->bio, &bbio->fs_info->limits,
&nr_segs, map_length);
if (sector_offset)
return sector_offset << SECTOR_SHIFT;
return map_length;
}
static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
{
struct btrfs_inode *inode = bbio->inode;
@@ -678,7 +679,10 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
blk_status_t ret;
int error;
smap.is_scrub = !bbio->inode;
if (!bbio->inode || btrfs_is_data_reloc_root(inode->root))
smap.rst_search_commit_root = true;
else
smap.rst_search_commit_root = false;
btrfs_bio_counter_inc_blocked(fs_info);
error = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
@@ -690,10 +694,10 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
map_length = min(map_length, length);
if (use_append)
map_length = min(map_length, fs_info->max_zone_append_size);
map_length = btrfs_append_map_length(bbio, map_length);
if (map_length < length) {
bbio = btrfs_split_bio(fs_info, bbio, map_length, use_append);
bbio = btrfs_split_bio(fs_info, bbio, map_length);
bio = &bbio->bio;
}
@@ -749,7 +753,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
}
}
__btrfs_submit_bio(bio, bioc, &smap, mirror_num);
btrfs_submit_bio(bio, bioc, &smap, mirror_num);
done:
return map_length == length;
@@ -765,16 +769,14 @@ fail:
ASSERT(bbio->bio.bi_pool == &btrfs_clone_bioset);
ASSERT(remaining);
remaining->bio.bi_status = ret;
btrfs_orig_bbio_end_io(remaining);
btrfs_bio_end_io(remaining, ret);
}
bbio->bio.bi_status = ret;
btrfs_orig_bbio_end_io(bbio);
btrfs_bio_end_io(bbio, ret);
/* Do not submit another chunk */
return true;
}
void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num)
void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num)
{
/* If bbio->inode is not populated, its file_offset must be 0. */
ASSERT(bbio->inode || bbio->file_offset == 0);
@@ -786,7 +788,7 @@ void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num)
/*
* Submit a repair write.
*
* This bypasses btrfs_submit_bio deliberately, as that writes all copies in a
* This bypasses btrfs_submit_bbio() deliberately, as that writes all copies in a
* RAID setup. Here we only want to write the one bad copy, so we do the
* mapping ourselves and submit the bio directly.
*
@@ -875,7 +877,7 @@ void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_
ASSERT(smap.dev == fs_info->dev_replace.srcdev);
smap.dev = fs_info->dev_replace.tgtdev;
}
__btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num);
btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num);
return;
fail:
+3 -3
View File
@@ -29,7 +29,7 @@ typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio);
/*
* Highlevel btrfs I/O structure. It is allocated by btrfs_bio_alloc and
* passed to btrfs_submit_bio for mapping to the physical devices.
* passed to btrfs_submit_bbio() for mapping to the physical devices.
*/
struct btrfs_bio {
/*
@@ -42,7 +42,7 @@ struct btrfs_bio {
union {
/*
* For data reads: checksumming and original I/O information.
* (for internal use in the btrfs_submit_bio machinery only)
* (for internal use in the btrfs_submit_bbio() machinery only)
*/
struct {
u8 *csum;
@@ -104,7 +104,7 @@ void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status);
/* Submit using blkcg_punt_bio_submit. */
#define REQ_BTRFS_CGROUP_PUNT REQ_FS_PRIVATE
void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num);
void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num);
void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace);
int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
u64 length, u64 logical, struct folio *folio,
+17 -17
View File
@@ -23,7 +23,7 @@
#include "extent-tree.h"
#ifdef CONFIG_BTRFS_DEBUG
int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group)
int btrfs_should_fragment_free_space(const struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
@@ -40,9 +40,9 @@ int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group)
*
* Should be called with balance_lock held
*/
static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
static u64 get_restripe_target(const struct btrfs_fs_info *fs_info, u64 flags)
{
struct btrfs_balance_control *bctl = fs_info->balance_ctl;
const struct btrfs_balance_control *bctl = fs_info->balance_ctl;
u64 target = 0;
if (!bctl)
@@ -1415,9 +1415,9 @@ out:
}
static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
struct btrfs_block_group *bg)
const struct btrfs_block_group *bg)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_transaction *prev_trans = NULL;
const u64 start = bg->start;
const u64 end = start + bg->length - 1;
@@ -1756,14 +1756,14 @@ static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
return bg1->used > bg2->used;
}
static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info)
static inline bool btrfs_should_reclaim(const struct btrfs_fs_info *fs_info)
{
if (btrfs_is_zoned(fs_info))
return btrfs_zoned_should_reclaim(fs_info);
return true;
}
static bool should_reclaim_block_group(struct btrfs_block_group *bg, u64 bytes_freed)
static bool should_reclaim_block_group(const struct btrfs_block_group *bg, u64 bytes_freed)
{
const int thresh_pct = btrfs_calc_reclaim_threshold(bg->space_info);
u64 thresh_bytes = mult_perc(bg->length, thresh_pct);
@@ -2006,8 +2006,8 @@ void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
spin_unlock(&fs_info->unused_bgs_lock);
}
static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
struct btrfs_path *path)
static int read_bg_from_eb(struct btrfs_fs_info *fs_info, const struct btrfs_key *key,
const struct btrfs_path *path)
{
struct btrfs_chunk_map *map;
struct btrfs_block_group_item bg;
@@ -2055,7 +2055,7 @@ out_free_map:
static int find_first_block_group(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
struct btrfs_key *key)
const struct btrfs_key *key)
{
struct btrfs_root *root = btrfs_block_group_root(fs_info);
int ret;
@@ -2640,8 +2640,8 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans,
}
static int insert_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 chunk_offset,
u64 start, u64 num_bytes)
const struct btrfs_device *device, u64 chunk_offset,
u64 start, u64 num_bytes)
{
struct btrfs_fs_info *fs_info = device->fs_info;
struct btrfs_root *root = fs_info->dev_root;
@@ -2817,7 +2817,7 @@ next:
* For extent tree v2 we use the block_group_item->chunk_offset to point at our
* global root id. For v1 it's always set to BTRFS_FIRST_CHUNK_TREE_OBJECTID.
*/
static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset)
static u64 calculate_global_root_id(const struct btrfs_fs_info *fs_info, u64 offset)
{
u64 div = SZ_1G;
u64 index;
@@ -3842,8 +3842,8 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
}
}
static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *sinfo, int force)
static int should_alloc_chunk(const struct btrfs_fs_info *fs_info,
const struct btrfs_space_info *sinfo, int force)
{
u64 bytes_used = btrfs_space_info_used(sinfo, false);
u64 thresh;
@@ -4218,7 +4218,7 @@ out:
return ret;
}
static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
static u64 get_profile_num_devs(const struct btrfs_fs_info *fs_info, u64 type)
{
u64 num_dev;
@@ -4622,7 +4622,7 @@ int btrfs_use_block_group_size_class(struct btrfs_block_group *bg,
return 0;
}
bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg)
bool btrfs_block_group_should_use_size_class(const struct btrfs_block_group *bg)
{
if (btrfs_is_zoned(bg->fs_info))
return false;
+5 -6
View File
@@ -266,7 +266,7 @@ struct btrfs_block_group {
u64 reclaim_mark;
};
static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
static inline u64 btrfs_block_group_end(const struct btrfs_block_group *block_group)
{
return (block_group->start + block_group->length);
}
@@ -278,8 +278,7 @@ static inline bool btrfs_is_block_group_used(const struct btrfs_block_group *bg)
return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0);
}
static inline bool btrfs_is_block_group_data_only(
struct btrfs_block_group *block_group)
static inline bool btrfs_is_block_group_data_only(const struct btrfs_block_group *block_group)
{
/*
* In mixed mode the fragmentation is expected to be high, lowering the
@@ -290,7 +289,7 @@ static inline bool btrfs_is_block_group_data_only(
}
#ifdef CONFIG_BTRFS_DEBUG
int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group);
int btrfs_should_fragment_free_space(const struct btrfs_block_group *block_group);
#endif
struct btrfs_block_group *btrfs_lookup_first_block_group(
@@ -370,7 +369,7 @@ static inline u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info)
return btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
}
static inline int btrfs_block_group_done(struct btrfs_block_group *cache)
static inline int btrfs_block_group_done(const struct btrfs_block_group *cache)
{
smp_mb();
return cache->cached == BTRFS_CACHE_FINISHED ||
@@ -387,6 +386,6 @@ enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size);
int btrfs_use_block_group_size_class(struct btrfs_block_group *bg,
enum btrfs_block_group_size_class size_class,
bool force_wrong_size_class);
bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg);
bool btrfs_block_group_should_use_size_class(const struct btrfs_block_group *bg);
#endif /* BTRFS_BLOCK_GROUP_H */
+1 -1
View File
@@ -553,7 +553,7 @@ try_reserve:
return ERR_PTR(ret);
}
int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
int btrfs_check_trunc_cache_free_space(const struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv)
{
u64 needed_bytes;
+1 -1
View File
@@ -89,7 +89,7 @@ void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info);
struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u32 blocksize);
int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
int btrfs_check_trunc_cache_free_space(const struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv);
static inline void btrfs_unuse_block_rsv(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv,
+17 -7
View File
@@ -350,10 +350,12 @@ static inline void btrfs_set_first_dir_index_to_log(struct btrfs_inode *inode,
WRITE_ONCE(inode->first_dir_index_to_log, index);
}
static inline struct btrfs_inode *BTRFS_I(const struct inode *inode)
{
return container_of(inode, struct btrfs_inode, vfs_inode);
}
/* Type checked and const-preserving VFS inode -> btrfs inode. */
#define BTRFS_I(_inode) \
_Generic(_inode, \
struct inode *: container_of(_inode, struct btrfs_inode, vfs_inode), \
const struct inode *: (const struct btrfs_inode *)container_of( \
_inode, const struct btrfs_inode, vfs_inode))
static inline unsigned long btrfs_inode_hash(u64 objectid,
const struct btrfs_root *root)
@@ -505,6 +507,14 @@ static inline bool btrfs_inode_can_compress(const struct btrfs_inode *inode)
return true;
}
static inline void btrfs_assert_inode_locked(struct btrfs_inode *inode)
{
/* Immediately trigger a crash if the inode is not locked. */
ASSERT(inode_is_locked(&inode->vfs_inode));
/* Trigger a splat in dmesg if this task is not holding the lock. */
lockdep_assert_held(&inode->vfs_inode.i_rwsem);
}
/* Array of bytes with variable length, hexadecimal format 0x1234 */
#define CSUM_FMT "0x%*phN"
#define CSUM_FMT_VALUE(size, bytes) size, bytes
@@ -578,7 +588,7 @@ struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root,
struct btrfs_path *path);
struct inode *btrfs_iget(u64 ino, struct btrfs_root *root);
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
struct page *page, u64 start, u64 len);
struct folio *folio, u64 start, u64 len);
int btrfs_update_inode(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode);
int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
@@ -596,9 +606,9 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
struct btrfs_trans_handle *trans, int mode,
u64 start, u64 num_bytes, u64 min_size,
loff_t actual_len, u64 *alloc_hint);
int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_folio,
u64 start, u64 end, struct writeback_control *wbc);
int btrfs_writepage_cow_fixup(struct page *page);
int btrfs_writepage_cow_fixup(struct folio *folio);
int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
int compress_type);
int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
+43 -39
View File
@@ -138,15 +138,15 @@ static int compression_decompress_bio(struct list_head *ws,
}
static int compression_decompress(int type, struct list_head *ws,
const u8 *data_in, struct page *dest_page,
const u8 *data_in, struct folio *dest_folio,
unsigned long dest_pgoff, size_t srclen, size_t destlen)
{
switch (type) {
case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_page,
case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_folio,
dest_pgoff, srclen, destlen);
case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, dest_page,
case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, dest_folio,
dest_pgoff, srclen, destlen);
case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_page,
case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_folio,
dest_pgoff, srclen, destlen);
case BTRFS_COMPRESS_NONE:
default:
@@ -395,7 +395,7 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered,
cb->bbio.ordered = ordered;
btrfs_add_compressed_bio_folios(cb);
btrfs_submit_bio(&cb->bbio, 0);
btrfs_submit_bbio(&cb->bbio, 0);
}
/*
@@ -420,7 +420,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
u64 cur = cb->orig_bbio->file_offset + orig_bio->bi_iter.bi_size;
u64 isize = i_size_read(inode);
int ret;
struct page *page;
struct folio *folio;
struct extent_map *em;
struct address_space *mapping = inode->i_mapping;
struct extent_map_tree *em_tree;
@@ -453,9 +453,13 @@ static noinline int add_ra_bio_pages(struct inode *inode,
if (pg_index > end_index)
break;
page = xa_load(&mapping->i_pages, pg_index);
if (page && !xa_is_value(page)) {
sectors_missed += (PAGE_SIZE - offset_in_page(cur)) >>
folio = __filemap_get_folio(mapping, pg_index, 0, 0);
if (!IS_ERR(folio)) {
u64 folio_sz = folio_size(folio);
u64 offset = offset_in_folio(folio, cur);
folio_put(folio);
sectors_missed += (folio_sz - offset) >>
fs_info->sectorsize_bits;
/* Beyond threshold, no need to continue */
@@ -466,35 +470,35 @@ static noinline int add_ra_bio_pages(struct inode *inode,
* Jump to next page start as we already have page for
* current offset.
*/
cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE;
cur += (folio_sz - offset);
continue;
}
page = __page_cache_alloc(mapping_gfp_constraint(mapping,
~__GFP_FS));
if (!page)
folio = filemap_alloc_folio(mapping_gfp_constraint(mapping,
~__GFP_FS), 0);
if (!folio)
break;
if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) {
put_page(page);
if (filemap_add_folio(mapping, folio, pg_index, GFP_NOFS)) {
/* There is already a page, skip to page end */
cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE;
cur += folio_size(folio);
folio_put(folio);
continue;
}
if (!*memstall && PageWorkingset(page)) {
if (!*memstall && folio_test_workingset(folio)) {
psi_memstall_enter(pflags);
*memstall = 1;
}
ret = set_page_extent_mapped(page);
ret = set_folio_extent_mapped(folio);
if (ret < 0) {
unlock_page(page);
put_page(page);
folio_unlock(folio);
folio_put(folio);
break;
}
page_end = (pg_index << PAGE_SHIFT) + PAGE_SIZE - 1;
page_end = (pg_index << PAGE_SHIFT) + folio_size(folio) - 1;
lock_extent(tree, cur, page_end, NULL);
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, cur, page_end + 1 - cur);
@@ -511,28 +515,28 @@ static noinline int add_ra_bio_pages(struct inode *inode,
orig_bio->bi_iter.bi_sector) {
free_extent_map(em);
unlock_extent(tree, cur, page_end, NULL);
unlock_page(page);
put_page(page);
folio_unlock(folio);
folio_put(folio);
break;
}
add_size = min(em->start + em->len, page_end + 1) - cur;
free_extent_map(em);
unlock_extent(tree, cur, page_end, NULL);
if (page->index == end_index) {
size_t zero_offset = offset_in_page(isize);
if (folio->index == end_index) {
size_t zero_offset = offset_in_folio(folio, isize);
if (zero_offset) {
int zeros;
zeros = PAGE_SIZE - zero_offset;
memzero_page(page, zero_offset, zeros);
zeros = folio_size(folio) - zero_offset;
folio_zero_range(folio, zero_offset, zeros);
}
}
ret = bio_add_page(orig_bio, page, add_size, offset_in_page(cur));
if (ret != add_size) {
unlock_extent(tree, cur, page_end, NULL);
unlock_page(page);
put_page(page);
if (!bio_add_folio(orig_bio, folio, add_size,
offset_in_folio(folio, cur))) {
folio_unlock(folio);
folio_put(folio);
break;
}
/*
@@ -541,9 +545,9 @@ static noinline int add_ra_bio_pages(struct inode *inode,
* subpage::readers and to unlock the page.
*/
if (fs_info->sectorsize < PAGE_SIZE)
btrfs_subpage_start_reader(fs_info, page_folio(page),
cur, add_size);
put_page(page);
btrfs_subpage_start_reader(fs_info, folio, cur,
add_size);
folio_put(folio);
cur += add_size;
}
return 0;
@@ -626,7 +630,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
if (memstall)
psi_memstall_leave(&pflags);
btrfs_submit_bio(&cb->bbio, 0);
btrfs_submit_bbio(&cb->bbio, 0);
return;
out_free_compressed_pages:
@@ -1057,10 +1061,10 @@ static int btrfs_decompress_bio(struct compressed_bio *cb)
* single page, and we want to read a single page out of it.
* start_byte tells us the offset into the compressed data we're interested in
*/
int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page,
int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio,
unsigned long dest_pgoff, size_t srclen, size_t destlen)
{
struct btrfs_fs_info *fs_info = page_to_fs_info(dest_page);
struct btrfs_fs_info *fs_info = folio_to_fs_info(dest_folio);
struct list_head *workspace;
const u32 sectorsize = fs_info->sectorsize;
int ret;
@@ -1073,7 +1077,7 @@ int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page,
ASSERT(dest_pgoff + destlen <= PAGE_SIZE && destlen <= sectorsize);
workspace = get_workspace(type, 0);
ret = compression_decompress(type, workspace, data_in, dest_page,
ret = compression_decompress(type, workspace, data_in, dest_folio,
dest_pgoff, srclen, destlen);
put_workspace(type, workspace);
+12 -4
View File
@@ -82,13 +82,21 @@ static inline unsigned int btrfs_compress_level(unsigned int type_level)
return ((type_level & 0xF0) >> 4);
}
/* @range_end must be exclusive. */
static inline u32 btrfs_calc_input_length(u64 range_end, u64 cur)
{
u64 page_end = round_down(cur, PAGE_SIZE) + PAGE_SIZE;
return min(range_end, page_end) - cur;
}
int __init btrfs_init_compress(void);
void __cold btrfs_exit_compress(void);
int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping,
u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out);
int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page,
int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio,
unsigned long start_byte, size_t srclen, size_t destlen);
int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
struct compressed_bio *cb, u32 decompressed);
@@ -154,7 +162,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
unsigned long *total_in, unsigned long *total_out);
int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int zlib_decompress(struct list_head *ws, const u8 *data_in,
struct page *dest_page, unsigned long dest_pgoff, size_t srclen,
struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen,
size_t destlen);
struct list_head *zlib_alloc_workspace(unsigned int level);
void zlib_free_workspace(struct list_head *ws);
@@ -165,7 +173,7 @@ int lzo_compress_folios(struct list_head *ws, struct address_space *mapping,
unsigned long *total_in, unsigned long *total_out);
int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int lzo_decompress(struct list_head *ws, const u8 *data_in,
struct page *dest_page, unsigned long dest_pgoff, size_t srclen,
struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen,
size_t destlen);
struct list_head *lzo_alloc_workspace(unsigned int level);
void lzo_free_workspace(struct list_head *ws);
@@ -175,7 +183,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
unsigned long *total_in, unsigned long *total_out);
int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int zstd_decompress(struct list_head *ws, const u8 *data_in,
struct page *dest_page, unsigned long dest_pgoff, size_t srclen,
struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen,
size_t destlen);
void zstd_init_workspace_manager(void);
void zstd_cleanup_workspace_manager(void);
+9 -9
View File
@@ -2564,8 +2564,8 @@ int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
*
*/
static void fixup_low_keys(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_disk_key *key, int level)
const struct btrfs_path *path,
const struct btrfs_disk_key *key, int level)
{
int i;
struct extent_buffer *t;
@@ -2594,7 +2594,7 @@ static void fixup_low_keys(struct btrfs_trans_handle *trans,
* that the new key won't break the order
*/
void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
const struct btrfs_path *path,
const struct btrfs_key *new_key)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -2660,8 +2660,8 @@ void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
* is correct, we only need to bother the last key of @left and the first
* key of @right.
*/
static bool check_sibling_keys(struct extent_buffer *left,
struct extent_buffer *right)
static bool check_sibling_keys(const struct extent_buffer *left,
const struct extent_buffer *right)
{
struct btrfs_key left_last;
struct btrfs_key right_first;
@@ -2928,8 +2928,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
* blocknr is the block the key points to.
*/
static int insert_ptr(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_disk_key *key, u64 bytenr,
const struct btrfs_path *path,
const struct btrfs_disk_key *key, u64 bytenr,
int slot, int level)
{
struct extent_buffer *lower;
@@ -4019,7 +4019,7 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
* the front.
*/
void btrfs_truncate_item(struct btrfs_trans_handle *trans,
struct btrfs_path *path, u32 new_size, int from_end)
const struct btrfs_path *path, u32 new_size, int from_end)
{
int slot;
struct extent_buffer *leaf;
@@ -4111,7 +4111,7 @@ void btrfs_truncate_item(struct btrfs_trans_handle *trans,
* make the item pointed to by the path bigger, data_size is the added size.
*/
void btrfs_extend_item(struct btrfs_trans_handle *trans,
struct btrfs_path *path, u32 data_size)
const struct btrfs_path *path, u32 data_size)
{
int slot;
struct extent_buffer *leaf;
+8 -3
View File
@@ -6,6 +6,7 @@
#ifndef BTRFS_CTREE_H
#define BTRFS_CTREE_H
#include "linux/cleanup.h"
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/rbtree.h>
@@ -84,6 +85,9 @@ struct btrfs_path {
unsigned int nowait:1;
};
#define BTRFS_PATH_AUTO_FREE(path_name) \
struct btrfs_path *path_name __free(btrfs_free_path) = NULL
/*
* The state of btrfs root
*/
@@ -538,7 +542,7 @@ int btrfs_previous_item(struct btrfs_root *root,
int btrfs_previous_extent_item(struct btrfs_root *root,
struct btrfs_path *path, u64 min_objectid);
void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
const struct btrfs_path *path,
const struct btrfs_key *new_key);
struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
@@ -572,9 +576,9 @@ bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_path *path, int level, int slot);
void btrfs_extend_item(struct btrfs_trans_handle *trans,
struct btrfs_path *path, u32 data_size);
const struct btrfs_path *path, u32 data_size);
void btrfs_truncate_item(struct btrfs_trans_handle *trans,
struct btrfs_path *path, u32 new_size, int from_end);
const struct btrfs_path *path, u32 new_size, int from_end);
int btrfs_split_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
@@ -598,6 +602,7 @@ int btrfs_search_slot_for_read(struct btrfs_root *root,
void btrfs_release_path(struct btrfs_path *p);
struct btrfs_path *btrfs_alloc_path(void);
void btrfs_free_path(struct btrfs_path *p);
DEFINE_FREE(btrfs_free_path, struct btrfs_path *, btrfs_free_path(_T))
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_path *path, int slot, int nr);
+36 -61
View File
@@ -45,8 +45,8 @@ struct inode_defrag {
u32 extent_thresh;
};
static int __compare_inode_defrag(struct inode_defrag *defrag1,
struct inode_defrag *defrag2)
static int compare_inode_defrag(const struct inode_defrag *defrag1,
const struct inode_defrag *defrag2)
{
if (defrag1->root > defrag2->root)
return 1;
@@ -61,16 +61,14 @@ static int __compare_inode_defrag(struct inode_defrag *defrag1,
}
/*
* Pop a record for an inode into the defrag tree. The lock must be held
* Insert a record for an inode into the defrag tree. The lock must be held
* already.
*
* If you're inserting a record for an older transid than an existing record,
* the transid already in the tree is lowered.
*
* If an existing record is found the defrag item you pass in is freed.
*/
static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
struct inode_defrag *defrag)
static int btrfs_insert_inode_defrag(struct btrfs_inode *inode,
struct inode_defrag *defrag)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct inode_defrag *entry;
@@ -83,7 +81,7 @@ static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
parent = *p;
entry = rb_entry(parent, struct inode_defrag, rb_node);
ret = __compare_inode_defrag(defrag, entry);
ret = compare_inode_defrag(defrag, entry);
if (ret < 0)
p = &parent->rb_left;
else if (ret > 0)
@@ -107,7 +105,7 @@ static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
return 0;
}
static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
static inline int need_auto_defrag(struct btrfs_fs_info *fs_info)
{
if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
return 0;
@@ -119,34 +117,28 @@ static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
}
/*
* Insert a defrag record for this inode if auto defrag is enabled.
* Insert a defrag record for this inode if auto defrag is enabled. No errors
* returned as they're not considered fatal.
*/
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, u32 extent_thresh)
void btrfs_add_inode_defrag(struct btrfs_inode *inode, u32 extent_thresh)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct inode_defrag *defrag;
u64 transid;
int ret;
if (!__need_auto_defrag(fs_info))
return 0;
if (!need_auto_defrag(fs_info))
return;
if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
return 0;
if (trans)
transid = trans->transid;
else
transid = btrfs_get_root_last_trans(root);
return;
defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
if (!defrag)
return -ENOMEM;
return;
defrag->ino = btrfs_ino(inode);
defrag->transid = transid;
defrag->transid = btrfs_get_root_last_trans(root);
defrag->root = btrfs_root_id(root);
defrag->extent_thresh = extent_thresh;
@@ -157,14 +149,13 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
* and then re-read this inode, this new inode doesn't have
* IN_DEFRAG flag. At the case, we may find the existed defrag.
*/
ret = __btrfs_add_inode_defrag(inode, defrag);
ret = btrfs_insert_inode_defrag(inode, defrag);
if (ret)
kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
} else {
kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
}
spin_unlock(&fs_info->defrag_inodes_lock);
return 0;
}
/*
@@ -189,7 +180,7 @@ static struct inode_defrag *btrfs_pick_defrag_inode(
parent = p;
entry = rb_entry(parent, struct inode_defrag, rb_node);
ret = __compare_inode_defrag(&tmp, entry);
ret = compare_inode_defrag(&tmp, entry);
if (ret < 0)
p = parent->rb_left;
else if (ret > 0)
@@ -198,7 +189,7 @@ static struct inode_defrag *btrfs_pick_defrag_inode(
goto out;
}
if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
if (parent && compare_inode_defrag(&tmp, entry) > 0) {
parent = rb_next(parent);
if (parent)
entry = rb_entry(parent, struct inode_defrag, rb_node);
@@ -214,27 +205,22 @@ out:
void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
{
struct inode_defrag *defrag;
struct rb_node *node;
struct inode_defrag *defrag, *next;
spin_lock(&fs_info->defrag_inodes_lock);
node = rb_first(&fs_info->defrag_inodes);
while (node) {
rb_erase(node, &fs_info->defrag_inodes);
defrag = rb_entry(node, struct inode_defrag, rb_node);
rbtree_postorder_for_each_entry_safe(defrag, next,
&fs_info->defrag_inodes, rb_node)
kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
cond_resched_lock(&fs_info->defrag_inodes_lock);
node = rb_first(&fs_info->defrag_inodes);
}
spin_unlock(&fs_info->defrag_inodes_lock);
}
#define BTRFS_DEFRAG_BATCH 1024
static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
struct inode_defrag *defrag)
static int btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
struct inode_defrag *defrag,
struct file_ra_state *ra)
{
struct btrfs_root *inode_root;
struct inode *inode;
@@ -245,7 +231,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
again:
if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
goto cleanup;
if (!__need_auto_defrag(fs_info))
if (!need_auto_defrag(fs_info))
goto cleanup;
/* Get the inode */
@@ -273,9 +259,10 @@ again:
range.len = (u64)-1;
range.start = cur;
range.extent_thresh = defrag->extent_thresh;
file_ra_state_init(ra, inode->i_mapping);
sb_start_write(fs_info->sb);
ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
ret = btrfs_defrag_file(inode, ra, &range, defrag->transid,
BTRFS_DEFRAG_BATCH);
sb_end_write(fs_info->sb);
iput(inode);
@@ -302,11 +289,13 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
atomic_inc(&fs_info->defrag_running);
while (1) {
struct file_ra_state ra = { 0 };
/* Pause the auto defragger. */
if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
break;
if (!__need_auto_defrag(fs_info))
if (!need_auto_defrag(fs_info))
break;
/* find an inode to defrag */
@@ -324,7 +313,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
first_ino = defrag->ino + 1;
root_objectid = defrag->root;
__btrfs_run_defrag_inode(fs_info, defrag);
btrfs_run_defrag_inode(fs_info, defrag, &ra);
}
atomic_dec(&fs_info->defrag_running);
@@ -1317,8 +1306,7 @@ static int defrag_one_cluster(struct btrfs_inode *inode,
if (entry->start + range_len <= *last_scanned_ret)
continue;
if (ra)
page_cache_sync_readahead(inode->vfs_inode.i_mapping,
page_cache_sync_readahead(inode->vfs_inode.i_mapping,
ra, NULL, entry->start >> PAGE_SHIFT,
((entry->start + range_len - 1) >> PAGE_SHIFT) -
(entry->start >> PAGE_SHIFT) + 1);
@@ -1350,7 +1338,7 @@ out:
* Entry point to file defragmentation.
*
* @inode: inode to be defragged
* @ra: readahead state (can be NUL)
* @ra: readahead state
* @range: defrag options including range and flags
* @newer_than: minimum transid to defrag
* @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode
@@ -1372,12 +1360,13 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
u64 cur;
u64 last_byte;
bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS);
bool ra_allocated = false;
int compress_type = BTRFS_COMPRESS_ZLIB;
int ret = 0;
u32 extent_thresh = range->extent_thresh;
pgoff_t start_index;
ASSERT(ra);
if (isize == 0)
return 0;
@@ -1406,18 +1395,6 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
cur = round_down(range->start, fs_info->sectorsize);
last_byte = round_up(last_byte, fs_info->sectorsize) - 1;
/*
* If we were not given a ra, allocate a readahead context. As
* readahead is just an optimization, defrag will work without it so
* we don't error out.
*/
if (!ra) {
ra_allocated = true;
ra = kzalloc(sizeof(*ra), GFP_KERNEL);
if (ra)
file_ra_state_init(ra, inode->i_mapping);
}
/*
* Make writeback start from the beginning of the range, so that the
* defrag range can be written sequentially.
@@ -1472,8 +1449,6 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
cond_resched();
}
if (ra_allocated)
kfree(ra);
/*
* Update range.start for autodefrag, this will indicate where to start
* in next run.
+1 -2
View File
@@ -18,8 +18,7 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
u64 newer_than, unsigned long max_to_defrag);
int __init btrfs_auto_defrag_init(void);
void __cold btrfs_auto_defrag_exit(void);
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, u32 extent_thresh);
void btrfs_add_inode_defrag(struct btrfs_inode *inode, u32 extent_thresh);
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info);
int btrfs_defrag_root(struct btrfs_root *root);
+24 -12
View File
@@ -855,11 +855,17 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
/* Record qgroup extent info if provided */
if (qrecord) {
if (btrfs_qgroup_trace_extent_nolock(trans->fs_info,
delayed_refs, qrecord))
int ret;
ret = btrfs_qgroup_trace_extent_nolock(trans->fs_info,
delayed_refs, qrecord);
if (ret) {
/* Clean up if insertion fails or item exists. */
xa_release(&delayed_refs->dirty_extents, qrecord->bytenr);
kfree(qrecord);
else
} else {
qrecord_inserted = true;
}
}
trace_add_delayed_ref_head(trans->fs_info, head_ref, action);
@@ -1005,18 +1011,16 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans,
return -ENOMEM;
head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
if (!head_ref) {
kmem_cache_free(btrfs_delayed_ref_node_cachep, node);
return -ENOMEM;
}
if (!head_ref)
goto free_node;
if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) {
record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record) {
kmem_cache_free(btrfs_delayed_ref_node_cachep, node);
kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
return -ENOMEM;
}
if (!record)
goto free_head_ref;
if (xa_reserve(&trans->transaction->delayed_refs.dirty_extents,
generic_ref->bytenr, GFP_NOFS))
goto free_record;
}
init_delayed_ref_common(fs_info, node, generic_ref);
@@ -1052,6 +1056,14 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans,
if (qrecord_inserted)
return btrfs_qgroup_trace_extent_post(trans, record);
return 0;
free_record:
kfree(record);
free_head_ref:
kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
free_node:
kmem_cache_free(btrfs_delayed_ref_node_cachep, node);
return -ENOMEM;
}
/*
+2 -2
View File
@@ -202,8 +202,8 @@ struct btrfs_delayed_ref_root {
/* head ref rbtree */
struct rb_root_cached href_root;
/* dirty extent records */
struct rb_root dirty_extent_root;
/* Track dirty extent records. */
struct xarray dirty_extents;
/* this spin lock protects the rbtree and the entries inside */
spinlock_t lock;
+33 -10
View File
@@ -824,22 +824,45 @@ static void btrfs_dev_replace_update_device_in_mapping_tree(
struct btrfs_device *srcdev,
struct btrfs_device *tgtdev)
{
u64 start = 0;
int i;
struct rb_node *node;
/*
* The chunk mutex must be held so that no new chunks can be created
* while we are updating existing chunks. This guarantees we don't miss
* any new chunk that gets created for a range that falls before the
* range of the last chunk we processed.
*/
lockdep_assert_held(&fs_info->chunk_mutex);
write_lock(&fs_info->mapping_tree_lock);
do {
node = rb_first_cached(&fs_info->mapping_tree);
while (node) {
struct rb_node *next = rb_next(node);
struct btrfs_chunk_map *map;
u64 next_start;
map = btrfs_find_chunk_map_nolock(fs_info, start, U64_MAX);
if (!map)
break;
for (i = 0; i < map->num_stripes; i++)
map = rb_entry(node, struct btrfs_chunk_map, rb_node);
next_start = map->start + map->chunk_len;
for (int i = 0; i < map->num_stripes; i++)
if (srcdev == map->stripes[i].dev)
map->stripes[i].dev = tgtdev;
start = map->start + map->chunk_len;
btrfs_free_chunk_map(map);
} while (start);
if (cond_resched_rwlock_write(&fs_info->mapping_tree_lock)) {
map = btrfs_find_chunk_map_nolock(fs_info, next_start, U64_MAX);
if (!map)
break;
node = &map->rb_node;
/*
* Drop the lookup reference since we are holding the
* lock in write mode and no one can remove the chunk
* map from the tree and drop its tree reference.
*/
btrfs_free_chunk_map(map);
} else {
node = next;
}
}
write_unlock(&fs_info->mapping_tree_lock);
}
+46 -27
View File
@@ -40,11 +40,21 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
struct btrfs_ordered_extent *ordered;
int ret = 0;
/* Direct lock must be taken before the extent lock. */
if (nowait) {
if (!try_lock_dio_extent(io_tree, lockstart, lockend, cached_state))
return -EAGAIN;
} else {
lock_dio_extent(io_tree, lockstart, lockend, cached_state);
}
while (1) {
if (nowait) {
if (!try_lock_extent(io_tree, lockstart, lockend,
cached_state))
return -EAGAIN;
cached_state)) {
ret = -EAGAIN;
break;
}
} else {
lock_extent(io_tree, lockstart, lockend, cached_state);
}
@@ -120,6 +130,8 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
cond_resched();
}
if (ret)
unlock_dio_extent(io_tree, lockstart, lockend, cached_state);
return ret;
}
@@ -353,7 +365,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
int ret = 0;
u64 len = length;
const u64 data_alloc_len = length;
bool unlock_extents = false;
u32 unlock_bits = EXTENT_LOCKED;
/*
* We could potentially fault if we have a buffer > PAGE_SIZE, and if
@@ -514,7 +526,6 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
start, &len, flags);
if (ret < 0)
goto unlock_err;
unlock_extents = true;
/* Recalc len in case the new em is smaller than requested */
len = min(len, em->len - (start - em->start));
if (dio_data->data_space_reserved) {
@@ -535,22 +546,8 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
release_offset,
release_len);
}
} else {
/*
* We need to unlock only the end area that we aren't using.
* The rest is going to be unlocked by the endio routine.
*/
lockstart = start + len;
if (lockstart < lockend)
unlock_extents = true;
}
if (unlock_extents)
unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state);
else
free_extent_state(cached_state);
/*
* Translate extent map information to iomap.
* We trim the extents (and move the addr) even though iomap code does
@@ -569,11 +566,33 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
iomap->length = len;
free_extent_map(em);
/*
* Reads will hold the EXTENT_DIO_LOCKED bit until the io is completed,
* writes only hold it for this part. We hold the extent lock until
* we're completely done with the extent map to make sure it remains
* valid.
*/
if (write)
unlock_bits |= EXTENT_DIO_LOCKED;
clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
unlock_bits, &cached_state);
/* We didn't use everything, unlock the dio extent for the remainder. */
if (!write && (start + len) < lockend)
unlock_dio_extent(&BTRFS_I(inode)->io_tree, start + len,
lockend, NULL);
return 0;
unlock_err:
unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state);
/*
* Don't use EXTENT_LOCK_BITS here in case we extend it later and forget
* to update this, be explicit that we expect EXTENT_LOCKED and
* EXTENT_DIO_LOCKED to be set here, and so that's what we're clearing.
*/
clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
EXTENT_LOCKED | EXTENT_DIO_LOCKED, &cached_state);
err:
if (dio_data->data_space_reserved) {
btrfs_free_reserved_data_space(BTRFS_I(inode),
@@ -596,8 +615,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
if (!write && (iomap->type == IOMAP_HOLE)) {
/* If reading from a hole, unlock and return */
unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1,
NULL);
unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
pos + length - 1, NULL);
return 0;
}
@@ -608,8 +627,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
btrfs_finish_ordered_extent(dio_data->ordered, NULL,
pos, length, false);
else
unlock_extent(&BTRFS_I(inode)->io_tree, pos,
pos + length - 1, NULL);
unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
pos + length - 1, NULL);
ret = -ENOTBLK;
}
if (write) {
@@ -641,8 +660,8 @@ static void btrfs_dio_end_io(struct btrfs_bio *bbio)
dip->file_offset, dip->bytes,
!bio->bi_status);
} else {
unlock_extent(&inode->io_tree, dip->file_offset,
dip->file_offset + dip->bytes - 1, NULL);
unlock_dio_extent(&inode->io_tree, dip->file_offset,
dip->file_offset + dip->bytes - 1, NULL);
}
bbio->bio.bi_private = bbio->private;
@@ -726,7 +745,7 @@ static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
}
}
btrfs_submit_bio(bbio, 0);
btrfs_submit_bbio(bbio, 0);
}
static const struct iomap_ops btrfs_dio_iomap_ops = {
+2 -2
View File
@@ -68,7 +68,7 @@ static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = {
};
static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl,
struct btrfs_block_group *block_group)
const struct btrfs_block_group *block_group)
{
return &discard_ctl->discard_list[block_group->discard_index];
}
@@ -80,7 +80,7 @@ static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl,
*
* Check if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set.
*/
static bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl)
static bool btrfs_run_discard_work(const struct btrfs_discard_ctl *discard_ctl)
{
struct btrfs_fs_info *fs_info = container_of(discard_ctl,
struct btrfs_fs_info,
+3 -13
View File
@@ -525,7 +525,7 @@ static bool btree_release_folio(struct folio *folio, gfp_t gfp_flags)
if (folio_test_writeback(folio) || folio_test_dirty(folio))
return false;
return try_release_extent_buffer(&folio->page);
return try_release_extent_buffer(folio);
}
static void btree_invalidate_folio(struct folio *folio, size_t offset,
@@ -1285,7 +1285,6 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
btrfs_extent_buffer_leak_debug_check(fs_info);
kfree(fs_info->super_copy);
kfree(fs_info->super_for_commit);
kfree(fs_info->subpage_info);
kvfree(fs_info);
}
@@ -3322,6 +3321,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
fs_info->nodesize = nodesize;
fs_info->sectorsize = sectorsize;
fs_info->sectorsize_bits = ilog2(sectorsize);
fs_info->sectors_per_page = (PAGE_SIZE >> fs_info->sectorsize_bits);
fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
fs_info->stripesize = stripesize;
@@ -3346,20 +3346,10 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
*/
fs_info->max_inline = min_t(u64, fs_info->max_inline, fs_info->sectorsize);
if (sectorsize < PAGE_SIZE) {
struct btrfs_subpage_info *subpage_info;
if (sectorsize < PAGE_SIZE)
btrfs_warn(fs_info,
"read-write for sector size %u with page size %lu is experimental",
sectorsize, PAGE_SIZE);
subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL);
if (!subpage_info) {
ret = -ENOMEM;
goto fail_alloc;
}
btrfs_init_subpage_info(subpage_info, sectorsize);
fs_info->subpage_info = subpage_info;
}
ret = btrfs_init_workqueues(fs_info);
if (ret)
+24 -31
View File
@@ -126,7 +126,7 @@ void extent_io_tree_init(struct btrfs_fs_info *fs_info,
* Empty an io tree, removing and freeing every extent state record from the
* tree. This should be called once we are sure no other task can access the
* tree anymore, so no tree updates happen after we empty the tree and there
* aren't any waiters on any extent state record (EXTENT_LOCKED bit is never
* aren't any waiters on any extent state record (EXTENT_LOCK_BITS are never
* set on any extent state when calling this function).
*/
void extent_io_tree_release(struct extent_io_tree *tree)
@@ -141,7 +141,7 @@ void extent_io_tree_release(struct extent_io_tree *tree)
rbtree_postorder_for_each_entry_safe(state, tmp, &root, rb_node) {
/* Clear node to keep free_extent_state() happy. */
RB_CLEAR_NODE(&state->rb_node);
ASSERT(!(state->state & EXTENT_LOCKED));
ASSERT(!(state->state & EXTENT_LOCK_BITS));
/*
* No need for a memory barrier here, as we are holding the tree
* lock and we only change the waitqueue while holding that lock
@@ -399,7 +399,7 @@ static void merge_next_state(struct extent_io_tree *tree, struct extent_state *s
*/
static void merge_state(struct extent_io_tree *tree, struct extent_state *state)
{
if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
if (state->state & (EXTENT_LOCK_BITS | EXTENT_BOUNDARY))
return;
merge_prev_state(tree, state);
@@ -445,7 +445,7 @@ static struct extent_state *insert_state(struct extent_io_tree *tree,
struct rb_node *parent = NULL;
const u64 start = state->start - 1;
const u64 end = state->end + 1;
const bool try_merge = !(bits & (EXTENT_LOCKED | EXTENT_BOUNDARY));
const bool try_merge = !(bits & (EXTENT_LOCK_BITS | EXTENT_BOUNDARY));
set_state_bits(tree, state, bits, changeset);
@@ -616,9 +616,6 @@ static void set_gfp_mask_from_bits(u32 *bits, gfp_t *mask)
* inserting elements in the tree, so the gfp mask is used to indicate which
* allocations or sleeping are allowed.
*
* Pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove the given
* range from the tree regardless of state (ie for truncate).
*
* The range [start, end] is inclusive.
*
* This takes the tree lock, and returns 0 on success and < 0 on error.
@@ -647,8 +644,8 @@ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
if (bits & EXTENT_DELALLOC)
bits |= EXTENT_NORESERVE;
wake = (bits & EXTENT_LOCKED) ? 1 : 0;
if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY))
wake = ((bits & EXTENT_LOCK_BITS) ? 1 : 0);
if (bits & (EXTENT_LOCK_BITS | EXTENT_BOUNDARY))
clear = 1;
again:
if (!prealloc) {
@@ -861,8 +858,7 @@ static void cache_state_if_flags(struct extent_state *state,
static void cache_state(struct extent_state *state,
struct extent_state **cached_ptr)
{
return cache_state_if_flags(state, cached_ptr,
EXTENT_LOCKED | EXTENT_BOUNDARY);
return cache_state_if_flags(state, cached_ptr, EXTENT_LOCK_BITS | EXTENT_BOUNDARY);
}
/*
@@ -1063,7 +1059,7 @@ static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
int ret = 0;
u64 last_start;
u64 last_end;
u32 exclusive_bits = (bits & EXTENT_LOCKED);
u32 exclusive_bits = (bits & EXTENT_LOCK_BITS);
gfp_t mask;
set_gfp_mask_from_bits(&bits, &mask);
@@ -1812,12 +1808,11 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, struct extent_changeset *changeset)
{
/*
* We don't support EXTENT_LOCKED yet, as current changeset will
* record any bits changed, so for EXTENT_LOCKED case, it will
* either fail with -EEXIST or changeset will record the whole
* range.
* We don't support EXTENT_LOCK_BITS yet, as current changeset will
* record any bits changed, so for EXTENT_LOCK_BITS case, it will either
* fail with -EEXIST or changeset will record the whole range.
*/
ASSERT(!(bits & EXTENT_LOCKED));
ASSERT(!(bits & EXTENT_LOCK_BITS));
return __set_extent_bit(tree, start, end, bits, NULL, NULL, NULL, changeset);
}
@@ -1826,26 +1821,25 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, struct extent_changeset *changeset)
{
/*
* Don't support EXTENT_LOCKED case, same reason as
* Don't support EXTENT_LOCK_BITS case, same reason as
* set_record_extent_bits().
*/
ASSERT(!(bits & EXTENT_LOCKED));
ASSERT(!(bits & EXTENT_LOCK_BITS));
return __clear_extent_bit(tree, start, end, bits, NULL, changeset);
}
int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached)
bool __try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
struct extent_state **cached)
{
int err;
u64 failed_start;
err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start,
err = __set_extent_bit(tree, start, end, bits, &failed_start,
NULL, cached, NULL);
if (err == -EEXIST) {
if (failed_start > start)
clear_extent_bit(tree, start, failed_start - 1,
EXTENT_LOCKED, cached);
clear_extent_bit(tree, start, failed_start - 1, bits, cached);
return 0;
}
return 1;
@@ -1855,23 +1849,22 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
* Either insert or lock state struct between start and end use mask to tell
* us if waiting is desired.
*/
int lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state)
int __lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
struct extent_state **cached_state)
{
struct extent_state *failed_state = NULL;
int err;
u64 failed_start;
err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start,
err = __set_extent_bit(tree, start, end, bits, &failed_start,
&failed_state, cached_state, NULL);
while (err == -EEXIST) {
if (failed_start != start)
clear_extent_bit(tree, start, failed_start - 1,
EXTENT_LOCKED, cached_state);
bits, cached_state);
wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED,
&failed_state);
err = __set_extent_bit(tree, start, end, EXTENT_LOCKED,
wait_extent_bit(tree, failed_start, end, bits, &failed_state);
err = __set_extent_bit(tree, start, end, bits,
&failed_start, &failed_state,
cached_state, NULL);
}
+34 -4
View File
@@ -19,6 +19,7 @@ enum {
ENUM_BIT(EXTENT_DIRTY),
ENUM_BIT(EXTENT_UPTODATE),
ENUM_BIT(EXTENT_LOCKED),
ENUM_BIT(EXTENT_DIO_LOCKED),
ENUM_BIT(EXTENT_NEW),
ENUM_BIT(EXTENT_DELALLOC),
ENUM_BIT(EXTENT_DEFRAG),
@@ -67,6 +68,8 @@ enum {
EXTENT_ADD_INODE_BYTES | \
EXTENT_CLEAR_ALL_BITS)
#define EXTENT_LOCK_BITS (EXTENT_LOCKED | EXTENT_DIO_LOCKED)
/*
* Redefined bits above which are used only in the device allocation tree,
* shouldn't be using EXTENT_LOCKED / EXTENT_BOUNDARY / EXTENT_CLEAR_META_RESV
@@ -134,12 +137,22 @@ const struct btrfs_fs_info *extent_io_tree_to_fs_info(const struct extent_io_tre
void extent_io_tree_init(struct btrfs_fs_info *fs_info,
struct extent_io_tree *tree, unsigned int owner);
void extent_io_tree_release(struct extent_io_tree *tree);
int __lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
struct extent_state **cached);
bool __try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
struct extent_state **cached);
int lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached);
static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached)
{
return __lock_extent(tree, start, end, EXTENT_LOCKED, cached);
}
int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached);
static inline bool try_lock_extent(struct extent_io_tree *tree, u64 start,
u64 end, struct extent_state **cached)
{
return __try_lock_extent(tree, start, end, EXTENT_LOCKED, cached);
}
int __init extent_state_init_cachep(void);
void __cold extent_state_free_cachep(void);
@@ -212,5 +225,22 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
u64 *end, u64 max_bytes,
struct extent_state **cached_state);
static inline int lock_dio_extent(struct extent_io_tree *tree, u64 start,
u64 end, struct extent_state **cached)
{
return __lock_extent(tree, start, end, EXTENT_DIO_LOCKED, cached);
}
static inline bool try_lock_dio_extent(struct extent_io_tree *tree, u64 start,
u64 end, struct extent_state **cached)
{
return __try_lock_extent(tree, start, end, EXTENT_DIO_LOCKED, cached);
}
static inline int unlock_dio_extent(struct extent_io_tree *tree, u64 start,
u64 end, struct extent_state **cached)
{
return __clear_extent_bit(tree, start, end, EXTENT_DIO_LOCKED, cached, NULL);
}
#endif /* BTRFS_EXTENT_IO_TREE_H */
+2 -2
View File
@@ -6551,13 +6551,13 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
continue;
ret = btrfs_trim_free_extents(device, &group_trimmed);
trimmed += group_trimmed;
if (ret) {
dev_failed++;
dev_ret = ret;
break;
}
trimmed += group_trimmed;
}
mutex_unlock(&fs_devices->device_list_mutex);
+373 -492
View File
File diff suppressed because it is too large Load Diff
+6 -6
View File
@@ -236,11 +236,11 @@ static inline void extent_changeset_free(struct extent_changeset *changeset)
kfree(changeset);
}
bool try_release_extent_mapping(struct page *page, gfp_t mask);
int try_release_extent_buffer(struct page *page);
bool try_release_extent_mapping(struct folio *folio, gfp_t mask);
int try_release_extent_buffer(struct folio *folio);
int btrfs_read_folio(struct file *file, struct folio *folio);
void extent_write_locked_range(struct inode *inode, const struct page *locked_page,
void extent_write_locked_range(struct inode *inode, const struct folio *locked_folio,
u64 start, u64 end, struct writeback_control *wbc,
bool pages_dirty);
int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc);
@@ -249,7 +249,7 @@ int btree_write_cache_pages(struct address_space *mapping,
void btrfs_readahead(struct readahead_control *rac);
int set_folio_extent_mapped(struct folio *folio);
int set_page_extent_mapped(struct page *page);
void clear_page_extent_mapped(struct page *page);
void clear_folio_extent_mapped(struct folio *folio);
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start, u64 owner_root, int level);
@@ -354,7 +354,7 @@ void set_extent_buffer_dirty(struct extent_buffer *eb);
void set_extent_buffer_uptodate(struct extent_buffer *eb);
void clear_extent_buffer_uptodate(struct extent_buffer *eb);
void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
const struct page *locked_page,
const struct folio *locked_folio,
struct extent_state **cached,
u32 bits_to_clear, unsigned long page_ops);
int extent_invalidate_folio(struct extent_io_tree *tree,
@@ -368,7 +368,7 @@ int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
bool find_lock_delalloc_range(struct inode *inode,
struct page *locked_page, u64 *start,
struct folio *locked_folio, u64 *start,
u64 *end);
#endif
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
+6 -3
View File
@@ -192,10 +192,13 @@ static inline u64 extent_map_block_len(const struct extent_map *em)
static inline u64 extent_map_block_end(const struct extent_map *em)
{
if (extent_map_block_start(em) + extent_map_block_len(em) <
extent_map_block_start(em))
const u64 block_start = extent_map_block_start(em);
const u64 block_end = block_start + extent_map_block_len(em);
if (block_end < block_start)
return (u64)-1;
return extent_map_block_start(em) + extent_map_block_len(em);
return block_end;
}
static bool can_merge_extent_map(const struct extent_map *em)
+2 -2
View File
@@ -151,7 +151,7 @@ static inline u32 max_ordered_sum_bytes(const struct btrfs_fs_info *fs_info)
* Calculate the total size needed to allocate for an ordered sum structure
* spanning @bytes in the file.
*/
static int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info, unsigned long bytes)
static int btrfs_ordered_sum_size(const struct btrfs_fs_info *fs_info, unsigned long bytes)
{
return sizeof(struct btrfs_ordered_sum) + bytes_to_csum_size(fs_info, bytes);
}
@@ -1272,7 +1272,7 @@ out:
void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
const struct btrfs_path *path,
struct btrfs_file_extent_item *fi,
const struct btrfs_file_extent_item *fi,
struct extent_map *em)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
+1 -1
View File
@@ -74,7 +74,7 @@ int btrfs_lookup_csums_bitmap(struct btrfs_root *root, struct btrfs_path *path,
unsigned long *csum_bitmap);
void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
const struct btrfs_path *path,
struct btrfs_file_extent_item *fi,
const struct btrfs_file_extent_item *fi,
struct extent_map *em);
int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start,
u64 len);
+13 -13
View File
@@ -1617,7 +1617,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (current->journal_info == BTRFS_TRANS_DIO_WRITE_STUB) {
skip_ilock = true;
current->journal_info = NULL;
lockdep_assert_held(&inode->vfs_inode.i_rwsem);
btrfs_assert_inode_locked(inode);
}
trace_btrfs_sync_file(file, datasync);
@@ -1920,8 +1920,8 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
reserved_space = PAGE_SIZE;
sb_start_pagefault(inode->i_sb);
page_start = page_offset(page);
page_end = page_start + PAGE_SIZE - 1;
page_start = folio_pos(folio);
page_end = page_start + folio_size(folio) - 1;
end = page_end;
/*
@@ -1949,18 +1949,18 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
ret = VM_FAULT_NOPAGE;
again:
down_read(&BTRFS_I(inode)->i_mmap_lock);
lock_page(page);
folio_lock(folio);
size = i_size_read(inode);
if ((page->mapping != inode->i_mapping) ||
if ((folio->mapping != inode->i_mapping) ||
(page_start >= size)) {
/* Page got truncated out from underneath us. */
goto out_unlock;
}
wait_on_page_writeback(page);
folio_wait_writeback(folio);
lock_extent(io_tree, page_start, page_end, &cached_state);
ret2 = set_page_extent_mapped(page);
ret2 = set_folio_extent_mapped(folio);
if (ret2 < 0) {
ret = vmf_error(ret2);
unlock_extent(io_tree, page_start, page_end, &cached_state);
@@ -1974,14 +1974,14 @@ again:
ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, PAGE_SIZE);
if (ordered) {
unlock_extent(io_tree, page_start, page_end, &cached_state);
unlock_page(page);
folio_unlock(folio);
up_read(&BTRFS_I(inode)->i_mmap_lock);
btrfs_start_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
goto again;
}
if (page->index == ((size - 1) >> PAGE_SHIFT)) {
if (folio->index == ((size - 1) >> PAGE_SHIFT)) {
reserved_space = round_up(size - page_start, fs_info->sectorsize);
if (reserved_space < PAGE_SIZE) {
end = page_start + reserved_space - 1;
@@ -2011,13 +2011,13 @@ again:
}
/* Page is wholly or partially inside EOF. */
if (page_start + PAGE_SIZE > size)
zero_start = offset_in_page(size);
if (page_start + folio_size(folio) > size)
zero_start = offset_in_folio(folio, size);
else
zero_start = PAGE_SIZE;
if (zero_start != PAGE_SIZE)
memzero_page(page, zero_start, PAGE_SIZE - zero_start);
folio_zero_range(folio, zero_start, folio_size(folio) - zero_start);
btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE);
btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start);
@@ -2034,7 +2034,7 @@ again:
return VM_FAULT_LOCKED;
out_unlock:
unlock_page(page);
folio_unlock(folio);
up_read(&BTRFS_I(inode)->i_mmap_lock);
out:
btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
+1 -1
View File
@@ -703,8 +703,8 @@ struct btrfs_fs_info {
* running.
*/
refcount_t scrub_workers_refcnt;
u32 sectors_per_page;
struct workqueue_struct *scrub_workers;
struct btrfs_subpage_info *subpage_info;
struct btrfs_discard_ctl discard_ctl;
+5 -5
View File
@@ -14,7 +14,7 @@
#include "extent-tree.h"
#include "file-item.h"
struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf,
struct btrfs_inode_ref *btrfs_find_name_in_backref(const struct extent_buffer *leaf,
int slot,
const struct fscrypt_str *name)
{
@@ -42,7 +42,7 @@ struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf,
}
struct btrfs_inode_extref *btrfs_find_name_in_ext_backref(
struct extent_buffer *leaf, int slot, u64 ref_objectid,
const struct extent_buffer *leaf, int slot, u64 ref_objectid,
const struct fscrypt_str *name)
{
struct btrfs_inode_extref *extref;
@@ -423,9 +423,9 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
return ret;
}
static inline void btrfs_trace_truncate(struct btrfs_inode *inode,
struct extent_buffer *leaf,
struct btrfs_file_extent_item *fi,
static inline void btrfs_trace_truncate(const struct btrfs_inode *inode,
const struct extent_buffer *leaf,
const struct btrfs_file_extent_item *fi,
u64 offset, int extent_type, int slot)
{
if (!inode)
+2 -2
View File
@@ -109,11 +109,11 @@ struct btrfs_inode_extref *btrfs_lookup_inode_extref(
u64 inode_objectid, u64 ref_objectid, int ins_len,
int cow);
struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf,
struct btrfs_inode_ref *btrfs_find_name_in_backref(const struct extent_buffer *leaf,
int slot,
const struct fscrypt_str *name);
struct btrfs_inode_extref *btrfs_find_name_in_ext_backref(
struct extent_buffer *leaf, int slot, u64 ref_objectid,
const struct extent_buffer *leaf, int slot, u64 ref_objectid,
const struct fscrypt_str *name);
#endif
+196 -172
View File
@@ -116,7 +116,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback);
static noinline int run_delalloc_cow(struct btrfs_inode *inode,
struct page *locked_page, u64 start,
struct folio *locked_folio, u64 start,
u64 end, struct writeback_control *wbc,
bool pages_dirty);
@@ -393,17 +393,17 @@ void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags)
* extent (btrfs_finish_ordered_io()).
*/
static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
struct page *locked_page,
struct folio *locked_folio,
u64 offset, u64 bytes)
{
unsigned long index = offset >> PAGE_SHIFT;
unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
u64 page_start = 0, page_end = 0;
struct page *page;
struct folio *folio;
if (locked_page) {
page_start = page_offset(locked_page);
page_end = page_start + PAGE_SIZE - 1;
if (locked_folio) {
page_start = folio_pos(locked_folio);
page_end = page_start + folio_size(locked_folio) - 1;
}
while (index <= end_index) {
@@ -417,13 +417,13 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
* btrfs_mark_ordered_io_finished() would skip the accounting
* for the page range, and the ordered extent will never finish.
*/
if (locked_page && index == (page_start >> PAGE_SHIFT)) {
if (locked_folio && index == (page_start >> PAGE_SHIFT)) {
index++;
continue;
}
page = find_get_page(inode->vfs_inode.i_mapping, index);
folio = __filemap_get_folio(inode->vfs_inode.i_mapping, index, 0, 0);
index++;
if (!page)
if (IS_ERR(folio))
continue;
/*
@@ -431,14 +431,14 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
* range, then btrfs_mark_ordered_io_finished() will handle
* the ordered extent accounting for the range.
*/
btrfs_folio_clamp_clear_ordered(inode->root->fs_info,
page_folio(page), offset, bytes);
put_page(page);
btrfs_folio_clamp_clear_ordered(inode->root->fs_info, folio,
offset, bytes);
folio_put(folio);
}
if (locked_page) {
if (locked_folio) {
/* The locked page covers the full range, nothing needs to be done */
if (bytes + offset <= page_start + PAGE_SIZE)
if (bytes + offset <= page_start + folio_size(locked_folio))
return;
/*
* In case this page belongs to the delalloc range being
@@ -447,8 +447,9 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
* run_delalloc_range
*/
if (page_start >= offset && page_end <= (offset + bytes - 1)) {
bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE;
offset = page_offset(locked_page) + PAGE_SIZE;
bytes = offset + bytes - folio_pos(locked_folio) -
folio_size(locked_folio);
offset = folio_pos(locked_folio) + folio_size(locked_folio);
}
}
@@ -494,7 +495,6 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
{
struct btrfs_root *root = inode->root;
struct extent_buffer *leaf;
struct page *page = NULL;
const u32 sectorsize = trans->fs_info->sectorsize;
char *kaddr;
unsigned long ptr;
@@ -554,12 +554,16 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
btrfs_set_file_extent_compression(leaf, ei,
compress_type);
} else {
page = find_get_page(inode->vfs_inode.i_mapping, 0);
struct folio *folio;
folio = __filemap_get_folio(inode->vfs_inode.i_mapping,
0, 0, 0);
ASSERT(!IS_ERR(folio));
btrfs_set_file_extent_compression(leaf, ei, 0);
kaddr = kmap_local_page(page);
kaddr = kmap_local_folio(folio, 0);
write_extent_buffer(leaf, kaddr, ptr, size);
kunmap_local(kaddr);
put_page(page);
folio_put(folio);
}
btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
@@ -715,7 +719,7 @@ out:
}
static noinline int cow_file_range_inline(struct btrfs_inode *inode,
struct page *locked_page,
struct folio *locked_folio,
u64 offset, u64 end,
size_t compressed_size,
int compress_type,
@@ -740,13 +744,26 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode,
return ret;
}
/*
* In the successful case (ret == 0 here), cow_file_range will return 1.
*
* Quite a bit further up the callstack in extent_writepage(), ret == 1
* is treated as a short circuited success and does not unlock the folio,
* so we must do it here.
*
* In the failure case, the locked_folio does get unlocked by
* btrfs_folio_end_all_writers, which asserts that it is still locked
* at that point, so we must *not* unlock it here.
*
* The other two callsites in compress_file_range do not have a
* locked_folio, so they are not relevant to this logic.
*/
if (ret == 0)
locked_page = NULL;
locked_folio = NULL;
extent_clear_unlock_delalloc(inode, offset, end, locked_page, &cached,
clear_flags,
PAGE_UNLOCK | PAGE_START_WRITEBACK |
PAGE_END_WRITEBACK);
extent_clear_unlock_delalloc(inode, offset, end, locked_folio, &cached,
clear_flags, PAGE_UNLOCK |
PAGE_START_WRITEBACK | PAGE_END_WRITEBACK);
return ret;
}
@@ -762,7 +779,7 @@ struct async_extent {
struct async_chunk {
struct btrfs_inode *inode;
struct page *locked_page;
struct folio *locked_folio;
u64 start;
u64 end;
blk_opf_t write_flags;
@@ -868,25 +885,25 @@ static inline void inode_should_defrag(struct btrfs_inode *inode,
/* If this is a small write inside eof, kick off a defrag */
if (num_bytes < small_write &&
(start > 0 || end + 1 < inode->disk_i_size))
btrfs_add_inode_defrag(NULL, inode, small_write);
btrfs_add_inode_defrag(inode, small_write);
}
static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
{
unsigned long end_index = end >> PAGE_SHIFT;
struct page *page;
struct folio *folio;
int ret = 0;
for (unsigned long index = start >> PAGE_SHIFT;
index <= end_index; index++) {
page = find_get_page(inode->i_mapping, index);
if (unlikely(!page)) {
folio = __filemap_get_folio(inode->i_mapping, index, 0, 0);
if (IS_ERR(folio)) {
if (!ret)
ret = -ENOENT;
ret = PTR_ERR(folio);
continue;
}
clear_page_dirty_for_io(page);
put_page(page);
folio_clear_dirty_for_io(folio);
folio_put(folio);
}
return ret;
}
@@ -1122,7 +1139,7 @@ static void free_async_extent_pages(struct async_extent *async_extent)
static void submit_uncompressed_range(struct btrfs_inode *inode,
struct async_extent *async_extent,
struct page *locked_page)
struct folio *locked_folio)
{
u64 start = async_extent->start;
u64 end = async_extent->start + async_extent->ram_size - 1;
@@ -1135,20 +1152,22 @@ static void submit_uncompressed_range(struct btrfs_inode *inode,
};
wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode);
ret = run_delalloc_cow(inode, locked_page, start, end, &wbc, false);
ret = run_delalloc_cow(inode, locked_folio, start, end,
&wbc, false);
wbc_detach_inode(&wbc);
if (ret < 0) {
btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1);
if (locked_page) {
const u64 page_start = page_offset(locked_page);
btrfs_cleanup_ordered_extents(inode, locked_folio,
start, end - start + 1);
if (locked_folio) {
const u64 page_start = folio_pos(locked_folio);
set_page_writeback(locked_page);
end_page_writeback(locked_page);
btrfs_mark_ordered_io_finished(inode, locked_page,
folio_start_writeback(locked_folio);
folio_end_writeback(locked_folio);
btrfs_mark_ordered_io_finished(inode, locked_folio,
page_start, PAGE_SIZE,
!ret);
mapping_set_error(locked_page->mapping, ret);
unlock_page(locked_page);
mapping_set_error(locked_folio->mapping, ret);
folio_unlock(locked_folio);
}
}
}
@@ -1164,7 +1183,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
struct btrfs_ordered_extent *ordered;
struct btrfs_file_extent file_extent;
struct btrfs_key ins;
struct page *locked_page = NULL;
struct folio *locked_folio = NULL;
struct extent_state *cached = NULL;
struct extent_map *em;
int ret = 0;
@@ -1175,19 +1194,20 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
kthread_associate_blkcg(async_chunk->blkcg_css);
/*
* If async_chunk->locked_page is in the async_extent range, we need to
* If async_chunk->locked_folio is in the async_extent range, we need to
* handle it.
*/
if (async_chunk->locked_page) {
u64 locked_page_start = page_offset(async_chunk->locked_page);
u64 locked_page_end = locked_page_start + PAGE_SIZE - 1;
if (async_chunk->locked_folio) {
u64 locked_folio_start = folio_pos(async_chunk->locked_folio);
u64 locked_folio_end = locked_folio_start +
folio_size(async_chunk->locked_folio) - 1;
if (!(start >= locked_page_end || end <= locked_page_start))
locked_page = async_chunk->locked_page;
if (!(start >= locked_folio_end || end <= locked_folio_start))
locked_folio = async_chunk->locked_folio;
}
if (async_extent->compress_type == BTRFS_COMPRESS_NONE) {
submit_uncompressed_range(inode, async_extent, locked_page);
submit_uncompressed_range(inode, async_extent, locked_folio);
goto done;
}
@@ -1202,7 +1222,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
* non-contiguous space for the uncompressed size instead. So
* fall back to uncompressed.
*/
submit_uncompressed_range(inode, async_extent, locked_page);
submit_uncompressed_range(inode, async_extent, locked_folio);
goto done;
}
@@ -1306,21 +1326,21 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
* allocate extents on disk for the range, and create ordered data structs
* in ram to track those extents.
*
* locked_page is the page that writepage had locked already. We use
* locked_folio is the folio that writepage had locked already. We use
* it to make sure we don't do extra locks or unlocks.
*
* When this function fails, it unlocks all pages except @locked_page.
* When this function fails, it unlocks all pages except @locked_folio.
*
* When this function successfully creates an inline extent, it returns 1 and
* unlocks all pages including locked_page and starts I/O on them.
* (In reality inline extents are limited to a single page, so locked_page is
* unlocks all pages including locked_folio and starts I/O on them.
* (In reality inline extents are limited to a single page, so locked_folio is
* the only page handled anyway).
*
* When this function succeed and creates a normal extent, the page locking
* status depends on the passed in flags:
*
* - If @keep_locked is set, all pages are kept locked.
* - Else all pages except for @locked_page are unlocked.
* - Else all pages except for @locked_folio are unlocked.
*
* When a failure happens in the second or later iteration of the
* while-loop, the ordered extents created in previous iterations are kept
@@ -1329,8 +1349,8 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
* example.
*/
static noinline int cow_file_range(struct btrfs_inode *inode,
struct page *locked_page, u64 start, u64 end,
u64 *done_offset,
struct folio *locked_folio, u64 start,
u64 end, u64 *done_offset,
bool keep_locked, bool no_inline)
{
struct btrfs_root *root = inode->root;
@@ -1363,7 +1383,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
if (!no_inline) {
/* lets try to make an inline extent */
ret = cow_file_range_inline(inode, locked_page, start, end, 0,
ret = cow_file_range_inline(inode, locked_folio, start, end, 0,
BTRFS_COMPRESS_NONE, NULL, false);
if (ret <= 0) {
/*
@@ -1500,7 +1520,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
page_ops |= PAGE_SET_ORDERED;
extent_clear_unlock_delalloc(inode, start, start + ram_size - 1,
locked_page, &cached,
locked_folio, &cached,
EXTENT_LOCKED | EXTENT_DELALLOC,
page_ops);
if (num_bytes < cur_alloc_size)
@@ -1553,13 +1573,13 @@ out_unlock:
* function.
*
* However, in case of @keep_locked, we still need to unlock the pages
* (except @locked_page) to ensure all the pages are unlocked.
* (except @locked_folio) to ensure all the pages are unlocked.
*/
if (keep_locked && orig_start < start) {
if (!locked_page)
if (!locked_folio)
mapping_set_error(inode->vfs_inode.i_mapping, ret);
extent_clear_unlock_delalloc(inode, orig_start, start - 1,
locked_page, NULL, 0, page_ops);
locked_folio, NULL, 0, page_ops);
}
/*
@@ -1582,8 +1602,7 @@ out_unlock:
if (extent_reserved) {
extent_clear_unlock_delalloc(inode, start,
start + cur_alloc_size - 1,
locked_page, &cached,
clear_bits,
locked_folio, &cached, clear_bits,
page_ops);
btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL);
start += cur_alloc_size;
@@ -1597,7 +1616,7 @@ out_unlock:
*/
if (start < end) {
clear_bits |= EXTENT_CLEAR_DATA_RESV;
extent_clear_unlock_delalloc(inode, start, end, locked_page,
extent_clear_unlock_delalloc(inode, start, end, locked_folio,
&cached, clear_bits, page_ops);
btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL);
}
@@ -1651,7 +1670,7 @@ static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_
}
static bool run_delalloc_compressed(struct btrfs_inode *inode,
struct page *locked_page, u64 start,
struct folio *locked_folio, u64 start,
u64 end, struct writeback_control *wbc)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
@@ -1691,15 +1710,16 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode,
INIT_LIST_HEAD(&async_chunk[i].extents);
/*
* The locked_page comes all the way from writepage and its
* the original page we were actually given. As we spread
* The locked_folio comes all the way from writepage and its
* the original folio we were actually given. As we spread
* this large delalloc region across multiple async_chunk
* structs, only the first struct needs a pointer to locked_page
* structs, only the first struct needs a pointer to
* locked_folio.
*
* This way we don't need racey decisions about who is supposed
* to unlock it.
*/
if (locked_page) {
if (locked_folio) {
/*
* Depending on the compressibility, the pages might or
* might not go through async. We want all of them to
@@ -1709,12 +1729,12 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode,
* need full accuracy. Just account the whole thing
* against the first page.
*/
wbc_account_cgroup_owner(wbc, locked_page,
wbc_account_cgroup_owner(wbc, &locked_folio->page,
cur_end - start);
async_chunk[i].locked_page = locked_page;
locked_page = NULL;
async_chunk[i].locked_folio = locked_folio;
locked_folio = NULL;
} else {
async_chunk[i].locked_page = NULL;
async_chunk[i].locked_folio = NULL;
}
if (blkcg_css != blkcg_root_css) {
@@ -1743,7 +1763,7 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode,
* covered by the range.
*/
static noinline int run_delalloc_cow(struct btrfs_inode *inode,
struct page *locked_page, u64 start,
struct folio *locked_folio, u64 start,
u64 end, struct writeback_control *wbc,
bool pages_dirty)
{
@@ -1751,20 +1771,21 @@ static noinline int run_delalloc_cow(struct btrfs_inode *inode,
int ret;
while (start <= end) {
ret = cow_file_range(inode, locked_page, start, end, &done_offset,
true, false);
ret = cow_file_range(inode, locked_folio, start, end,
&done_offset, true, false);
if (ret)
return ret;
extent_write_locked_range(&inode->vfs_inode, locked_page, start,
done_offset, wbc, pages_dirty);
extent_write_locked_range(&inode->vfs_inode, locked_folio,
start, done_offset, wbc, pages_dirty);
start = done_offset + 1;
}
return 1;
}
static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
const u64 start, const u64 end)
static int fallback_to_cow(struct btrfs_inode *inode,
struct folio *locked_folio, const u64 start,
const u64 end)
{
const bool is_space_ino = btrfs_is_free_space_inode(inode);
const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root);
@@ -1833,7 +1854,8 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
* is written out and unlocked directly and a normal NOCOW extent
* doesn't work.
*/
ret = cow_file_range(inode, locked_page, start, end, NULL, false, true);
ret = cow_file_range(inode, locked_folio, start, end, NULL, false,
true);
ASSERT(ret != 1);
return ret;
}
@@ -1987,7 +2009,7 @@ static int can_nocow_file_extent(struct btrfs_path *path,
* blocks on disk
*/
static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
struct page *locked_page,
struct folio *locked_folio,
const u64 start, const u64 end)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
@@ -2150,8 +2172,8 @@ must_cow:
* NOCOW, following one which needs to be COW'ed
*/
if (cow_start != (u64)-1) {
ret = fallback_to_cow(inode, locked_page,
cow_start, found_key.offset - 1);
ret = fallback_to_cow(inode, locked_folio, cow_start,
found_key.offset - 1);
cow_start = (u64)-1;
if (ret) {
btrfs_dec_nocow_writers(nocow_bg);
@@ -2206,7 +2228,7 @@ must_cow:
btrfs_put_ordered_extent(ordered);
extent_clear_unlock_delalloc(inode, cur_offset, nocow_end,
locked_page, &cached_state,
locked_folio, &cached_state,
EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_CLEAR_DATA_RESV,
PAGE_UNLOCK | PAGE_SET_ORDERED);
@@ -2228,7 +2250,7 @@ must_cow:
if (cow_start != (u64)-1) {
cur_offset = end;
ret = fallback_to_cow(inode, locked_page, cow_start, end);
ret = fallback_to_cow(inode, locked_folio, cow_start, end);
cow_start = (u64)-1;
if (ret)
goto error;
@@ -2255,7 +2277,7 @@ error:
lock_extent(&inode->io_tree, cur_offset, end, &cached);
extent_clear_unlock_delalloc(inode, cur_offset, end,
locked_page, &cached,
locked_folio, &cached,
EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DEFRAG |
EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
@@ -2282,39 +2304,39 @@ static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end)
* Function to process delayed allocation (create CoW) for ranges which are
* being touched for the first time.
*/
int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_folio,
u64 start, u64 end, struct writeback_control *wbc)
{
const bool zoned = btrfs_is_zoned(inode->root->fs_info);
int ret;
/*
* The range must cover part of the @locked_page, or a return of 1
* The range must cover part of the @locked_folio, or a return of 1
* can confuse the caller.
*/
ASSERT(!(end <= page_offset(locked_page) ||
start >= page_offset(locked_page) + PAGE_SIZE));
ASSERT(!(end <= folio_pos(locked_folio) ||
start >= folio_pos(locked_folio) + folio_size(locked_folio)));
if (should_nocow(inode, start, end)) {
ret = run_delalloc_nocow(inode, locked_page, start, end);
ret = run_delalloc_nocow(inode, locked_folio, start, end);
goto out;
}
if (btrfs_inode_can_compress(inode) &&
inode_need_compress(inode, start, end) &&
run_delalloc_compressed(inode, locked_page, start, end, wbc))
run_delalloc_compressed(inode, locked_folio, start, end, wbc))
return 1;
if (zoned)
ret = run_delalloc_cow(inode, locked_page, start, end, wbc,
ret = run_delalloc_cow(inode, locked_folio, start, end, wbc,
true);
else
ret = cow_file_range(inode, locked_page, start, end, NULL,
ret = cow_file_range(inode, locked_folio, start, end, NULL,
false, false);
out:
if (ret < 0)
btrfs_cleanup_ordered_extents(inode, locked_page, start,
btrfs_cleanup_ordered_extents(inode, locked_folio, start,
end - start + 1);
return ret;
}
@@ -2690,7 +2712,7 @@ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
/* see btrfs_writepage_start_hook for details on why this is required */
struct btrfs_writepage_fixup {
struct page *page;
struct folio *folio;
struct btrfs_inode *inode;
struct btrfs_work work;
};
@@ -2702,50 +2724,51 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
struct extent_changeset *data_reserved = NULL;
struct page *page = fixup->page;
struct folio *folio = fixup->folio;
struct btrfs_inode *inode = fixup->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
u64 page_start = page_offset(page);
u64 page_end = page_offset(page) + PAGE_SIZE - 1;
u64 page_start = folio_pos(folio);
u64 page_end = folio_pos(folio) + folio_size(folio) - 1;
int ret = 0;
bool free_delalloc_space = true;
/*
* This is similar to page_mkwrite, we need to reserve the space before
* we take the page lock.
* we take the folio lock.
*/
ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
PAGE_SIZE);
folio_size(folio));
again:
lock_page(page);
folio_lock(folio);
/*
* Before we queued this fixup, we took a reference on the page.
* page->mapping may go NULL, but it shouldn't be moved to a different
* Before we queued this fixup, we took a reference on the folio.
* folio->mapping may go NULL, but it shouldn't be moved to a different
* address space.
*/
if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
if (!folio->mapping || !folio_test_dirty(folio) ||
!folio_test_checked(folio)) {
/*
* Unfortunately this is a little tricky, either
*
* 1) We got here and our page had already been dealt with and
* 1) We got here and our folio had already been dealt with and
* we reserved our space, thus ret == 0, so we need to just
* drop our space reservation and bail. This can happen the
* first time we come into the fixup worker, or could happen
* while waiting for the ordered extent.
* 2) Our page was already dealt with, but we happened to get an
* 2) Our folio was already dealt with, but we happened to get an
* ENOSPC above from the btrfs_delalloc_reserve_space. In
* this case we obviously don't have anything to release, but
* because the page was already dealt with we don't want to
* mark the page with an error, so make sure we're resetting
* because the folio was already dealt with we don't want to
* mark the folio with an error, so make sure we're resetting
* ret to 0. This is why we have this check _before_ the ret
* check, because we do not want to have a surprise ENOSPC
* when the page was already properly dealt with.
* when the folio was already properly dealt with.
*/
if (!ret) {
btrfs_delalloc_release_extents(inode, PAGE_SIZE);
btrfs_delalloc_release_extents(inode, folio_size(folio));
btrfs_delalloc_release_space(inode, data_reserved,
page_start, PAGE_SIZE,
page_start, folio_size(folio),
true);
}
ret = 0;
@@ -2753,7 +2776,7 @@ again:
}
/*
* We can't mess with the page state unless it is locked, so now that
* We can't mess with the folio state unless it is locked, so now that
* it is locked bail if we failed to make our space reservation.
*/
if (ret)
@@ -2762,14 +2785,14 @@ again:
lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
/* already ordered? We're done */
if (PageOrdered(page))
if (folio_test_ordered(folio))
goto out_reserved;
ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
if (ordered) {
unlock_extent(&inode->io_tree, page_start, page_end,
&cached_state);
unlock_page(page);
folio_unlock(folio);
btrfs_start_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
goto again;
@@ -2787,7 +2810,7 @@ again:
*
* The page was dirty when we started, nothing should have cleaned it.
*/
BUG_ON(!PageDirty(page));
BUG_ON(!folio_test_dirty(folio));
free_delalloc_space = false;
out_reserved:
btrfs_delalloc_release_extents(inode, PAGE_SIZE);
@@ -2801,14 +2824,14 @@ out_page:
* We hit ENOSPC or other errors. Update the mapping and page
* to reflect the errors and clean the page.
*/
mapping_set_error(page->mapping, ret);
btrfs_mark_ordered_io_finished(inode, page, page_start,
PAGE_SIZE, !ret);
clear_page_dirty_for_io(page);
mapping_set_error(folio->mapping, ret);
btrfs_mark_ordered_io_finished(inode, folio, page_start,
folio_size(folio), !ret);
folio_clear_dirty_for_io(folio);
}
btrfs_folio_clear_checked(fs_info, page_folio(page), page_start, PAGE_SIZE);
unlock_page(page);
put_page(page);
btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE);
folio_unlock(folio);
folio_put(folio);
kfree(fixup);
extent_changeset_free(data_reserved);
/*
@@ -2821,33 +2844,34 @@ out_page:
/*
* There are a few paths in the higher layers of the kernel that directly
* set the page dirty bit without asking the filesystem if it is a
* set the folio dirty bit without asking the filesystem if it is a
* good idea. This causes problems because we want to make sure COW
* properly happens and the data=ordered rules are followed.
*
* In our case any range that doesn't have the ORDERED bit set
* hasn't been properly setup for IO. We kick off an async process
* to fix it up. The async helper will wait for ordered extents, set
* the delalloc bit and make it safe to write the page.
* the delalloc bit and make it safe to write the folio.
*/
int btrfs_writepage_cow_fixup(struct page *page)
int btrfs_writepage_cow_fixup(struct folio *folio)
{
struct inode *inode = page->mapping->host;
struct inode *inode = folio->mapping->host;
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_writepage_fixup *fixup;
/* This page has ordered extent covering it already */
if (PageOrdered(page))
/* This folio has ordered extent covering it already */
if (folio_test_ordered(folio))
return 0;
/*
* PageChecked is set below when we create a fixup worker for this page,
* don't try to create another one if we're already PageChecked()
* folio_checked is set below when we create a fixup worker for this
* folio, don't try to create another one if we're already
* folio_test_checked.
*
* The extent_io writepage code will redirty the page if we send back
* The extent_io writepage code will redirty the foio if we send back
* EAGAIN.
*/
if (PageChecked(page))
if (folio_test_checked(folio))
return -EAGAIN;
fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
@@ -2857,14 +2881,14 @@ int btrfs_writepage_cow_fixup(struct page *page)
/*
* We are already holding a reference to this inode from
* write_cache_pages. We need to hold it because the space reservation
* takes place outside of the page lock, and we can't trust
* page->mapping outside of the page lock.
* takes place outside of the folio lock, and we can't trust
* page->mapping outside of the folio lock.
*/
ihold(inode);
btrfs_folio_set_checked(fs_info, page_folio(page), page_offset(page), PAGE_SIZE);
get_page(page);
btrfs_folio_set_checked(fs_info, folio, folio_pos(folio), folio_size(folio));
folio_get(folio);
btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL);
fixup->page = page;
fixup->folio = folio;
fixup->inode = BTRFS_I(inode);
btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
@@ -6700,7 +6724,7 @@ static int btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
}
static noinline int uncompress_inline(struct btrfs_path *path,
struct page *page,
struct folio *folio,
struct btrfs_file_extent_item *item)
{
int ret;
@@ -6722,7 +6746,8 @@ static noinline int uncompress_inline(struct btrfs_path *path,
read_extent_buffer(leaf, tmp, ptr, inline_size);
max_size = min_t(unsigned long, PAGE_SIZE, max_size);
ret = btrfs_decompress(compress_type, tmp, page, 0, inline_size, max_size);
ret = btrfs_decompress(compress_type, tmp, folio, 0, inline_size,
max_size);
/*
* decompression code contains a memset to fill in any space between the end
@@ -6733,36 +6758,36 @@ static noinline int uncompress_inline(struct btrfs_path *path,
*/
if (max_size < PAGE_SIZE)
memzero_page(page, max_size, PAGE_SIZE - max_size);
folio_zero_range(folio, max_size, PAGE_SIZE - max_size);
kfree(tmp);
return ret;
}
static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path,
struct page *page)
struct folio *folio)
{
struct btrfs_file_extent_item *fi;
void *kaddr;
size_t copy_size;
if (!page || PageUptodate(page))
if (!folio || folio_test_uptodate(folio))
return 0;
ASSERT(page_offset(page) == 0);
ASSERT(folio_pos(folio) == 0);
fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_file_extent_item);
if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE)
return uncompress_inline(path, page, fi);
return uncompress_inline(path, folio, fi);
copy_size = min_t(u64, PAGE_SIZE,
btrfs_file_extent_ram_bytes(path->nodes[0], fi));
kaddr = kmap_local_page(page);
kaddr = kmap_local_folio(folio, 0);
read_extent_buffer(path->nodes[0], kaddr,
btrfs_file_extent_inline_start(fi), copy_size);
kunmap_local(kaddr);
if (copy_size < PAGE_SIZE)
memzero_page(page, copy_size, PAGE_SIZE - copy_size);
folio_zero_range(folio, copy_size, PAGE_SIZE - copy_size);
return 0;
}
@@ -6784,7 +6809,7 @@ static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path
* Return: ERR_PTR on error, non-NULL extent_map on success.
*/
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
struct page *page, u64 start, u64 len)
struct folio *folio, u64 start, u64 len)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
int ret = 0;
@@ -6807,7 +6832,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
if (em) {
if (em->start > start || em->start + em->len <= start)
free_extent_map(em);
else if (em->disk_bytenr == EXTENT_MAP_INLINE && page)
else if (em->disk_bytenr == EXTENT_MAP_INLINE && folio)
free_extent_map(em);
else
goto out;
@@ -6937,7 +6962,7 @@ next:
ASSERT(em->disk_bytenr == EXTENT_MAP_INLINE);
ASSERT(em->len == fs_info->sectorsize);
ret = read_inline_extent(inode, path, page);
ret = read_inline_extent(inode, path, folio);
if (ret < 0)
goto out;
goto insert;
@@ -7179,13 +7204,12 @@ struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start,
* for subpage spinlock. So this function is to spin and wait for subpage
* spinlock.
*/
static void wait_subpage_spinlock(struct page *page)
static void wait_subpage_spinlock(struct folio *folio)
{
struct btrfs_fs_info *fs_info = page_to_fs_info(page);
struct folio *folio = page_folio(page);
struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
struct btrfs_subpage *subpage;
if (!btrfs_is_subpage(fs_info, page->mapping))
if (!btrfs_is_subpage(fs_info, folio->mapping))
return;
ASSERT(folio_test_private(folio) && folio_get_private(folio));
@@ -7214,9 +7238,9 @@ static int btrfs_launder_folio(struct folio *folio)
static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
{
if (try_release_extent_mapping(&folio->page, gfp_flags)) {
wait_subpage_spinlock(&folio->page);
clear_page_extent_mapped(&folio->page);
if (try_release_extent_mapping(folio, gfp_flags)) {
wait_subpage_spinlock(folio);
clear_folio_extent_mapped(folio);
return true;
}
return false;
@@ -7276,7 +7300,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
* do double ordered extent accounting on the same folio.
*/
folio_wait_writeback(folio);
wait_subpage_spinlock(&folio->page);
wait_subpage_spinlock(folio);
/*
* For subpage case, we have call sites like
@@ -7414,7 +7438,7 @@ next:
btrfs_folio_clear_checked(fs_info, folio, folio_pos(folio), folio_size(folio));
if (!inode_evicting)
__btrfs_release_folio(folio, GFP_NOFS);
clear_page_extent_mapped(&folio->page);
clear_folio_extent_mapped(folio);
}
static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
@@ -8951,19 +8975,19 @@ void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
struct btrfs_fs_info *fs_info = inode->root->fs_info;
unsigned long index = start >> PAGE_SHIFT;
unsigned long end_index = end >> PAGE_SHIFT;
struct page *page;
struct folio *folio;
u32 len;
ASSERT(end + 1 - start <= U32_MAX);
len = end + 1 - start;
while (index <= end_index) {
page = find_get_page(inode->vfs_inode.i_mapping, index);
ASSERT(page); /* Pages should be in the extent_io_tree */
folio = __filemap_get_folio(inode->vfs_inode.i_mapping, index, 0, 0);
ASSERT(!IS_ERR(folio)); /* folios should be in the extent_io_tree */
/* This is for data, which doesn't yet support larger folio. */
ASSERT(folio_order(page_folio(page)) == 0);
btrfs_folio_set_writeback(fs_info, page_folio(page), start, len);
put_page(page);
ASSERT(folio_order(folio) == 0);
btrfs_folio_set_writeback(fs_info, folio, start, len);
folio_put(folio);
index++;
}
}
@@ -9128,7 +9152,7 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) {
atomic_inc(&priv.pending);
btrfs_submit_bio(bbio, 0);
btrfs_submit_bbio(bbio, 0);
bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
btrfs_encoded_read_endio, &priv);
@@ -9143,7 +9167,7 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
} while (disk_io_size);
atomic_inc(&priv.pending);
btrfs_submit_bio(bbio, 0);
btrfs_submit_bbio(bbio, 0);
if (atomic_dec_return(&priv.pending))
io_wait_event(priv.wait, !atomic_read(&priv.pending));
+4 -7
View File
@@ -543,13 +543,11 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info,
range.minlen = max(range.minlen, minlen);
ret = btrfs_trim_fs(fs_info, &range);
if (ret < 0)
return ret;
if (copy_to_user(arg, &range, sizeof(range)))
return -EFAULT;
return 0;
return ret;
}
int __pure btrfs_is_empty_uuid(const u8 *uuid)
@@ -4765,11 +4763,10 @@ long btrfs_ioctl(struct file *file, unsigned int
return ret;
ret = btrfs_sync_fs(inode->i_sb, 1);
/*
* The transaction thread may want to do more work,
* namely it pokes the cleaner kthread that will start
* processing uncleaned subvols.
* There may be work for the cleaner kthread to do (subvolume
* deletion, delayed iputs, defrag inodes, etc), so wake it up.
*/
wake_up_process(fs_info->transaction_kthread);
wake_up_process(fs_info->cleaner_kthread);
return ret;
}
case BTRFS_IOC_START_SYNC:
+6 -6
View File
@@ -438,11 +438,11 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
}
int lzo_decompress(struct list_head *ws, const u8 *data_in,
struct page *dest_page, unsigned long dest_pgoff, size_t srclen,
struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen,
size_t destlen)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
struct btrfs_fs_info *fs_info = page_to_fs_info(dest_page);
struct btrfs_fs_info *fs_info = folio_to_fs_info(dest_folio);
const u32 sectorsize = fs_info->sectorsize;
size_t in_len;
size_t out_len;
@@ -467,22 +467,22 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in,
out_len = sectorsize;
ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len);
if (unlikely(ret != LZO_E_OK)) {
struct btrfs_inode *inode = BTRFS_I(dest_page->mapping->host);
struct btrfs_inode *inode = folio_to_inode(dest_folio);
btrfs_err(fs_info,
"lzo decompression failed, error %d root %llu inode %llu offset %llu",
ret, btrfs_root_id(inode->root), btrfs_ino(inode),
page_offset(dest_page));
folio_pos(dest_folio));
ret = -EIO;
goto out;
}
ASSERT(out_len <= sectorsize);
memcpy_to_page(dest_page, dest_pgoff, workspace->buf, out_len);
memcpy_to_folio(dest_folio, dest_pgoff, workspace->buf, out_len);
/* Early end, considered as an error. */
if (unlikely(out_len < destlen)) {
ret = -EIO;
memzero_page(dest_page, dest_pgoff + out_len, destlen - out_len);
folio_zero_range(dest_folio, dest_pgoff + out_len, destlen - out_len);
}
out:
return ret;
+15 -15
View File
@@ -332,7 +332,7 @@ static void finish_ordered_fn(struct btrfs_work *work)
}
static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
struct page *page, u64 file_offset,
struct folio *folio, u64 file_offset,
u64 len, bool uptodate)
{
struct btrfs_inode *inode = ordered->inode;
@@ -340,10 +340,10 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
lockdep_assert_held(&inode->ordered_tree_lock);
if (page) {
ASSERT(page->mapping);
ASSERT(page_offset(page) <= file_offset);
ASSERT(file_offset + len <= page_offset(page) + PAGE_SIZE);
if (folio) {
ASSERT(folio->mapping);
ASSERT(folio_pos(folio) <= file_offset);
ASSERT(file_offset + len <= folio_pos(folio) + folio_size(folio));
/*
* Ordered (Private2) bit indicates whether we still have
@@ -351,10 +351,9 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
*
* If there's no such bit, we need to skip to next range.
*/
if (!btrfs_folio_test_ordered(fs_info, page_folio(page),
file_offset, len))
if (!btrfs_folio_test_ordered(fs_info, folio, file_offset, len))
return false;
btrfs_folio_clear_ordered(fs_info, page_folio(page), file_offset, len);
btrfs_folio_clear_ordered(fs_info, folio, file_offset, len);
}
/* Now we're fine to update the accounting. */
@@ -398,7 +397,7 @@ static void btrfs_queue_ordered_fn(struct btrfs_ordered_extent *ordered)
}
void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
struct page *page, u64 file_offset, u64 len,
struct folio *folio, u64 file_offset, u64 len,
bool uptodate)
{
struct btrfs_inode *inode = ordered->inode;
@@ -408,7 +407,8 @@ void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
trace_btrfs_finish_ordered_extent(inode, file_offset, len, uptodate);
spin_lock_irqsave(&inode->ordered_tree_lock, flags);
ret = can_finish_ordered_extent(ordered, page, file_offset, len, uptodate);
ret = can_finish_ordered_extent(ordered, folio, file_offset, len,
uptodate);
spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
/*
@@ -449,8 +449,8 @@ void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
/*
* Mark all ordered extents io inside the specified range finished.
*
* @page: The involved page for the operation.
* For uncompressed buffered IO, the page status also needs to be
* @folio: The involved folio for the operation.
* For uncompressed buffered IO, the folio status also needs to be
* updated to indicate whether the pending ordered io is finished.
* Can be NULL for direct IO and compressed write.
* For these cases, callers are ensured they won't execute the
@@ -460,7 +460,7 @@ void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
* extent(s) covering it.
*/
void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
struct page *page, u64 file_offset,
struct folio *folio, u64 file_offset,
u64 num_bytes, bool uptodate)
{
struct rb_node *node;
@@ -524,7 +524,7 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
ASSERT(end + 1 - cur < U32_MAX);
len = end + 1 - cur;
if (can_finish_ordered_extent(entry, page, cur, len, uptodate)) {
if (can_finish_ordered_extent(entry, folio, cur, len, uptodate)) {
spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
btrfs_queue_ordered_fn(entry);
spin_lock_irqsave(&inode->ordered_tree_lock, flags);
@@ -1015,7 +1015,7 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
{
struct rb_node *n;
ASSERT(inode_is_locked(&inode->vfs_inode));
btrfs_assert_inode_locked(inode);
spin_lock_irq(&inode->ordered_tree_lock);
for (n = rb_first(&inode->ordered_tree); n; n = rb_next(n)) {
+3 -3
View File
@@ -163,11 +163,11 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
struct btrfs_ordered_extent *entry);
void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
struct page *page, u64 file_offset, u64 len,
struct folio *folio, u64 file_offset, u64 len,
bool uptodate);
void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
struct page *page, u64 file_offset,
u64 num_bytes, bool uptodate);
struct folio *folio, u64 file_offset,
u64 num_bytes, bool uptodate);
bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
struct btrfs_ordered_extent **cached,
u64 file_offset, u64 io_size);
+7 -17
View File
@@ -9,9 +9,8 @@
int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 offset)
{
struct btrfs_path *path;
BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
int ret = 0;
key.objectid = BTRFS_ORPHAN_OBJECTID;
key.type = BTRFS_ORPHAN_ITEM_KEY;
@@ -21,16 +20,13 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
btrfs_free_path(path);
return ret;
return btrfs_insert_empty_item(trans, root, path, &key, 0);
}
int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 offset)
{
struct btrfs_path *path;
BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
int ret = 0;
@@ -44,15 +40,9 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
goto out;
if (ret) { /* JDM: Really? */
ret = -ENOENT;
goto out;
}
return ret;
if (ret)
return -ENOENT;
ret = btrfs_del_item(trans, root, path);
out:
btrfs_free_path(path);
return ret;
return btrfs_del_item(trans, root, path);
}
+32 -34
View File
@@ -1998,16 +1998,14 @@ out:
*
* Return 0 for success insert
* Return >0 for existing record, caller can free @record safely.
* Error is not possible
* Return <0 for insertion failure, caller can free @record safely.
*/
int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_qgroup_extent_record *record)
{
struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node;
struct rb_node *parent_node = NULL;
struct btrfs_qgroup_extent_record *entry;
u64 bytenr = record->bytenr;
struct btrfs_qgroup_extent_record *existing, *ret;
unsigned long bytenr = record->bytenr;
if (!btrfs_qgroup_full_accounting(fs_info))
return 1;
@@ -2015,26 +2013,24 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
lockdep_assert_held(&delayed_refs->lock);
trace_btrfs_qgroup_trace_extent(fs_info, record);
while (*p) {
parent_node = *p;
entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
node);
if (bytenr < entry->bytenr) {
p = &(*p)->rb_left;
} else if (bytenr > entry->bytenr) {
p = &(*p)->rb_right;
} else {
if (record->data_rsv && !entry->data_rsv) {
entry->data_rsv = record->data_rsv;
entry->data_rsv_refroot =
record->data_rsv_refroot;
}
return 1;
xa_lock(&delayed_refs->dirty_extents);
existing = xa_load(&delayed_refs->dirty_extents, bytenr);
if (existing) {
if (record->data_rsv && !existing->data_rsv) {
existing->data_rsv = record->data_rsv;
existing->data_rsv_refroot = record->data_rsv_refroot;
}
xa_unlock(&delayed_refs->dirty_extents);
return 1;
}
ret = __xa_store(&delayed_refs->dirty_extents, record->bytenr, record, GFP_ATOMIC);
xa_unlock(&delayed_refs->dirty_extents);
if (xa_is_err(ret)) {
qgroup_mark_inconsistent(fs_info);
return xa_err(ret);
}
rb_link_node(&record->node, parent_node, p);
rb_insert_color(&record->node, &delayed_refs->dirty_extent_root);
return 0;
}
@@ -2141,6 +2137,11 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
if (!record)
return -ENOMEM;
if (xa_reserve(&trans->transaction->delayed_refs.dirty_extents, bytenr, GFP_NOFS)) {
kfree(record);
return -ENOMEM;
}
delayed_refs = &trans->transaction->delayed_refs;
record->bytenr = bytenr;
record->num_bytes = num_bytes;
@@ -2149,7 +2150,9 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
spin_lock(&delayed_refs->lock);
ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record);
spin_unlock(&delayed_refs->lock);
if (ret > 0) {
if (ret) {
/* Clean up if insertion fails or item exists. */
xa_release(&delayed_refs->dirty_extents, record->bytenr);
kfree(record);
return 0;
}
@@ -3018,7 +3021,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
struct btrfs_qgroup_extent_record *record;
struct btrfs_delayed_ref_root *delayed_refs;
struct ulist *new_roots = NULL;
struct rb_node *node;
unsigned long index;
u64 num_dirty_extents = 0;
u64 qgroup_to_skip;
int ret = 0;
@@ -3028,10 +3031,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
delayed_refs = &trans->transaction->delayed_refs;
qgroup_to_skip = delayed_refs->qgroup_to_skip;
while ((node = rb_first(&delayed_refs->dirty_extent_root))) {
record = rb_entry(node, struct btrfs_qgroup_extent_record,
node);
xa_for_each(&delayed_refs->dirty_extents, index, record) {
num_dirty_extents++;
trace_btrfs_qgroup_account_extents(fs_info, record);
@@ -3097,7 +3097,7 @@ cleanup:
ulist_free(record->old_roots);
ulist_free(new_roots);
new_roots = NULL;
rb_erase(node, &delayed_refs->dirty_extent_root);
xa_erase(&delayed_refs->dirty_extents, index);
kfree(record);
}
@@ -4874,15 +4874,13 @@ out:
void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans)
{
struct btrfs_qgroup_extent_record *entry;
struct btrfs_qgroup_extent_record *next;
struct rb_root *root;
unsigned long index;
root = &trans->delayed_refs.dirty_extent_root;
rbtree_postorder_for_each_entry_safe(entry, next, root, node) {
xa_for_each(&trans->delayed_refs.dirty_extents, index, entry) {
ulist_free(entry->old_roots);
kfree(entry);
}
*root = RB_ROOT;
xa_destroy(&trans->delayed_refs.dirty_extents);
}
void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes)
-1
View File
@@ -125,7 +125,6 @@ struct btrfs_inode;
* Record a dirty extent, and info qgroup to update quota on it
*/
struct btrfs_qgroup_extent_record {
struct rb_node node;
u64 bytenr;
u64 num_bytes;
+41 -5
View File
@@ -66,6 +66,11 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
if (ret)
break;
start += key.offset;
length -= key.offset;
if (length == 0)
break;
btrfs_release_path(path);
}
@@ -73,6 +78,36 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
return ret;
}
static int update_raid_extent_item(struct btrfs_trans_handle *trans,
struct btrfs_key *key,
struct btrfs_stripe_extent *stripe_extent,
const size_t item_size)
{
struct btrfs_path *path;
struct extent_buffer *leaf;
int ret;
int slot;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
ret = btrfs_search_slot(trans, trans->fs_info->stripe_root, key, path,
0, 1);
if (ret)
return (ret == 1 ? ret : -EINVAL);
leaf = path->nodes[0];
slot = path->slots[0];
write_extent_buffer(leaf, stripe_extent, btrfs_item_ptr_offset(leaf, slot),
item_size);
btrfs_mark_buffer_dirty(trans, leaf);
btrfs_free_path(path);
return ret;
}
static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
struct btrfs_io_context *bioc)
{
@@ -112,6 +147,9 @@ static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
ret = btrfs_insert_item(trans, stripe_root, &stripe_key, stripe_extent,
item_size);
if (ret == -EEXIST)
ret = update_raid_extent_item(trans, &stripe_key, stripe_extent,
item_size);
if (ret)
btrfs_abort_transaction(trans, ret);
@@ -172,7 +210,7 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
if (!path)
return -ENOMEM;
if (stripe->is_scrub) {
if (stripe->rst_search_commit_root) {
path->skip_locking = 1;
path->search_commit_root = 1;
}
@@ -245,10 +283,8 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
out:
if (ret > 0)
ret = -ENOENT;
if (ret && ret != -EIO && !stripe->is_scrub) {
if (IS_ENABLED(CONFIG_BTRFS_DEBUG))
btrfs_print_tree(leaf, 1);
btrfs_err(fs_info,
if (ret && ret != -EIO && !stripe->rst_search_commit_root) {
btrfs_debug(fs_info,
"cannot find raid-stripe for logical [%llu, %llu] devid %llu, profile %s",
logical, logical + *length, stripe->dev->devid,
btrfs_bg_type_to_raid_name(map_type));
+18 -17
View File
@@ -66,7 +66,7 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0);
char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0);
struct extent_changeset *data_reserved = NULL;
struct page *page = NULL;
struct folio *folio = NULL;
struct address_space *mapping = inode->vfs_inode.i_mapping;
int ret;
@@ -83,14 +83,15 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
if (ret)
goto out;
page = find_or_create_page(mapping, file_offset >> PAGE_SHIFT,
btrfs_alloc_write_mask(mapping));
if (!page) {
folio = __filemap_get_folio(mapping, file_offset >> PAGE_SHIFT,
FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
btrfs_alloc_write_mask(mapping));
if (IS_ERR(folio)) {
ret = -ENOMEM;
goto out_unlock;
}
ret = set_page_extent_mapped(page);
ret = set_folio_extent_mapped(folio);
if (ret < 0)
goto out_unlock;
@@ -115,15 +116,15 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
set_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags);
if (comp_type == BTRFS_COMPRESS_NONE) {
memcpy_to_page(page, offset_in_page(file_offset), data_start,
datal);
memcpy_to_folio(folio, offset_in_folio(folio, file_offset), data_start,
datal);
} else {
ret = btrfs_decompress(comp_type, data_start, page,
offset_in_page(file_offset),
ret = btrfs_decompress(comp_type, data_start, folio,
offset_in_folio(folio, file_offset),
inline_size, datal);
if (ret)
goto out_unlock;
flush_dcache_page(page);
flush_dcache_folio(folio);
}
/*
@@ -139,15 +140,15 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
* So what's in the range [500, 4095] corresponds to zeroes.
*/
if (datal < block_size)
memzero_page(page, datal, block_size - datal);
folio_zero_range(folio, datal, block_size - datal);
btrfs_folio_set_uptodate(fs_info, page_folio(page), file_offset, block_size);
btrfs_folio_clear_checked(fs_info, page_folio(page), file_offset, block_size);
btrfs_folio_set_dirty(fs_info, page_folio(page), file_offset, block_size);
btrfs_folio_set_uptodate(fs_info, folio, file_offset, block_size);
btrfs_folio_clear_checked(fs_info, folio, file_offset, block_size);
btrfs_folio_set_dirty(fs_info, folio, file_offset, block_size);
out_unlock:
if (page) {
unlock_page(page);
put_page(page);
if (!IS_ERR(folio)) {
folio_unlock(folio);
folio_put(folio);
}
if (ret)
btrfs_delalloc_release_space(inode, data_reserved, file_offset,
+18 -4
View File
@@ -36,6 +36,7 @@
#include "relocation.h"
#include "super.h"
#include "tree-checker.h"
#include "raid-stripe-tree.h"
/*
* Relocation overview
@@ -2965,21 +2966,34 @@ static int relocate_one_folio(struct reloc_control *rc,
u64 folio_end;
u64 cur;
int ret;
const bool use_rst = btrfs_need_stripe_tree_update(fs_info, rc->block_group->flags);
ASSERT(index <= last_index);
folio = filemap_lock_folio(inode->i_mapping, index);
if (IS_ERR(folio)) {
page_cache_sync_readahead(inode->i_mapping, ra, NULL,
index, last_index + 1 - index);
/*
* On relocation we're doing readahead on the relocation inode,
* but if the filesystem is backed by a RAID stripe tree we can
* get ENOENT (e.g. due to preallocated extents not being
* mapped in the RST) from the lookup.
*
* But readahead doesn't handle the error and submits invalid
* reads to the device, causing a assertion failures.
*/
if (!use_rst)
page_cache_sync_readahead(inode->i_mapping, ra, NULL,
index, last_index + 1 - index);
folio = __filemap_get_folio(inode->i_mapping, index,
FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask);
FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
mask);
if (IS_ERR(folio))
return PTR_ERR(folio);
}
WARN_ON(folio_order(folio));
if (folio_test_readahead(folio))
if (folio_test_readahead(folio) && !use_rst)
page_cache_async_readahead(inode->i_mapping, ra, NULL,
folio, last_index + 1 - index);

Some files were not shown because too many files have changed in this diff Show More