From 2ed45c0f1879079b30248568c515cf60fc668d8a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 8 Sep 2023 18:20:18 +0100 Subject: [PATCH 1/9] btrfs: fix race when refilling delayed refs block reserve If we have two (or more) tasks attempting to refill the delayed refs block reserve we can end up with the delayed block reserve being over reserved, that is, with a reserved space greater than its size. If this happens, we are holding to more reserved space than necessary for a while. The race happens like this: 1) The delayed refs block reserve has a size of 8M and a reserved space of 6M for example; 2) Task A calls btrfs_delayed_refs_rsv_refill(); 3) Task B also calls btrfs_delayed_refs_rsv_refill(); 4) Task A sees there's a 2M difference between the size and the reserved space of the delayed refs rsv, so it will reserve 2M of space by calling btrfs_reserve_metadata_bytes(); 5) Task B also sees that 2M difference, and like task A, it reserves another 2M of metadata space; 6) Both task A and task B increase the reserved space of block reserve by 2M, by calling btrfs_block_rsv_add_bytes(), so the block reserve ends up with a size of 8M and a reserved space of 10M; 7) The extra, over reserved space will eventually be freed by some task calling btrfs_delayed_refs_rsv_release() -> btrfs_block_rsv_release() -> block_rsv_release_bytes(), as there we will detect the over reserve and release that space. So fix this by checking if we still need to add space to the delayed refs block reserve after reserving the metadata space, and if we don't, just release that space immediately. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 37 ++++++++++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 6a13cf00218b..1043f66cc130 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -163,6 +163,8 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; u64 limit = btrfs_calc_delayed_ref_bytes(fs_info, 1); u64 num_bytes = 0; + u64 refilled_bytes; + u64 to_free; int ret = -ENOSPC; spin_lock(&block_rsv->lock); @@ -178,9 +180,38 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush); if (ret) return ret; - btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false); - trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", - 0, num_bytes, 1); + + /* + * We may have raced with someone else, so check again if we the block + * reserve is still not full and release any excess space. + */ + spin_lock(&block_rsv->lock); + if (block_rsv->reserved < block_rsv->size) { + u64 needed = block_rsv->size - block_rsv->reserved; + + if (num_bytes >= needed) { + block_rsv->reserved += needed; + block_rsv->full = true; + to_free = num_bytes - needed; + refilled_bytes = needed; + } else { + block_rsv->reserved += num_bytes; + to_free = 0; + refilled_bytes = num_bytes; + } + } else { + to_free = num_bytes; + refilled_bytes = 0; + } + spin_unlock(&block_rsv->lock); + + if (to_free > 0) + btrfs_space_info_free_bytes_may_use(fs_info, block_rsv->space_info, + to_free); + + if (refilled_bytes > 0) + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0, + refilled_bytes, 1); return 0; } From a7ddeeb079505961355cf0106154da0110f1fdff Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 8 Sep 2023 18:20:19 +0100 Subject: [PATCH 2/9] btrfs: prevent transaction block reserve underflow when starting transaction When starting a transaction, with a non-zero number of items, we reserve metadata space for that number of items and for delayed refs by doing a call to btrfs_block_rsv_add(), with the transaction block reserve passed as the block reserve argument. This reserves metadata space and adds it to the transaction block reserve. Later we migrate the space we reserved for delayed references from the transaction block reserve into the delayed refs block reserve, by calling btrfs_migrate_to_delayed_refs_rsv(). btrfs_migrate_to_delayed_refs_rsv() decrements the number of bytes to migrate from the source block reserve, and this however may result in an underflow in case the space added to the transaction block reserve ended up being used by another task that has not reserved enough space for its own use - examples are tasks doing reflinks or hole punching because they end up calling btrfs_replace_file_extents() -> btrfs_drop_extents() and may need to modify/COW a variable number of leaves/paths, so they keep trying to use space from the transaction block reserve when they need to COW an extent buffer, and may end up trying to use more space then they have reserved (1 unit/path only for removing file extent items). This can be avoided by simply reserving space first without adding it to the transaction block reserve, then add the space for delayed refs to the delayed refs block reserve and finally add the remaining reserved space to the transaction block reserve. This also makes the code a bit shorter and simpler. So just do that. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 9 +-------- fs/btrfs/delayed-ref.h | 1 - fs/btrfs/transaction.c | 6 +++--- 3 files changed, 4 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 1043f66cc130..9fe4ccca50a0 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -103,24 +103,17 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) * Transfer bytes to our delayed refs rsv. * * @fs_info: the filesystem - * @src: source block rsv to transfer from * @num_bytes: number of bytes to transfer * - * This transfers up to the num_bytes amount from the src rsv to the + * This transfers up to the num_bytes amount, previously reserved, to the * delayed_refs_rsv. Any extra bytes are returned to the space info. */ void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *src, u64 num_bytes) { struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; u64 to_free = 0; - spin_lock(&src->lock); - src->reserved -= num_bytes; - src->size -= num_bytes; - spin_unlock(&src->lock); - spin_lock(&delayed_refs_rsv->lock); if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) { u64 delta = delayed_refs_rsv->size - diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index b8e14b0ba5f1..fd9bf2b709c0 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -407,7 +407,6 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans); int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, enum btrfs_reserve_flush_enum flush); void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *src, u64 num_bytes); bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 341363beaf10..3b60e56e5e02 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -631,14 +631,14 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, reloc_reserved = true; } - ret = btrfs_block_rsv_add(fs_info, rsv, num_bytes, flush); + ret = btrfs_reserve_metadata_bytes(fs_info, rsv, num_bytes, flush); if (ret) goto reserve_fail; if (delayed_refs_bytes) { - btrfs_migrate_to_delayed_refs_rsv(fs_info, rsv, - delayed_refs_bytes); + btrfs_migrate_to_delayed_refs_rsv(fs_info, delayed_refs_bytes); num_bytes -= delayed_refs_bytes; } + btrfs_block_rsv_add_bytes(rsv, num_bytes, true); if (rsv->space_info->force_alloc) do_chunk_alloc = true; From 1bf76df3fee56d6637718e267f7c34ed70d0c7dc Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 8 Sep 2023 18:20:23 +0100 Subject: [PATCH 3/9] btrfs: return -EUCLEAN for delayed tree ref with a ref count not equals to 1 When running a delayed tree reference, if we find a ref count different from 1, we return -EIO. This isn't an IO error, as it indicates either a bug in the delayed refs code or a memory corruption, so change the error code from -EIO to -EUCLEAN. Also tag the branch as 'unlikely' as this is not expected to ever happen, and change the error message to print the tree block's bytenr without the parenthesis (and there was a missing space between the 'block' word and the opening parenthesis), for consistency as that's the style we used everywhere else. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f356f08b55cb..4282bdb5a9f1 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1699,12 +1699,12 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, parent = ref->parent; ref_root = ref->root; - if (node->ref_mod != 1) { + if (unlikely(node->ref_mod != 1)) { btrfs_err(trans->fs_info, - "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu", + "btree block %llu has %d references rather than 1: action %d ref_root %llu parent %llu", node->bytenr, node->ref_mod, node->action, ref_root, parent); - return -EIO; + return -EUCLEAN; } if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { BUG_ON(!extent_op || !extent_op->update_flags); From d2f79e6385b0fcb1a38368e17d4721b8cd72af9f Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 8 Sep 2023 18:20:24 +0100 Subject: [PATCH 4/9] btrfs: remove redundant BUG_ON() from __btrfs_inc_extent_ref() At __btrfs_inc_extent_ref() we are doing a BUG_ON() if we are dealing with a tree block reference that has a reference count that is different from 1, but we have already dealt with this case at run_delayed_tree_ref(), making it useless. So remove the BUG_ON(). Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 4282bdb5a9f1..fd80129acc3c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1514,15 +1514,14 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, btrfs_release_path(path); /* now insert the actual backref */ - if (owner < BTRFS_FIRST_FREE_OBJECTID) { - BUG_ON(refs_to_add != 1); + if (owner < BTRFS_FIRST_FREE_OBJECTID) ret = insert_tree_block_ref(trans, path, bytenr, parent, root_objectid); - } else { + else ret = insert_extent_data_ref(trans, path, bytenr, parent, root_objectid, owner, offset, refs_to_add); - } + if (ret) btrfs_abort_transaction(trans, ret); out: From 8ec0a4a5774ab3f91c356c71f24dfba615bee860 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 8 Sep 2023 18:20:29 +0100 Subject: [PATCH 5/9] btrfs: log message if extent item not found when running delayed extent op When running a delayed extent operation, if we don't find the extent item in the extent tree we just return -EIO without any logged message. This indicates some bug or possibly a memory or fs corruption, so the return value should not be -EIO but -EUCLEAN instead, and since it's not expected to ever happen, print an informative error message so that if it happens we have some idea of what went wrong, where to look at. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index fd80129acc3c..fc313fce5bbd 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1655,7 +1655,10 @@ again: goto again; } } else { - err = -EIO; + err = -EUCLEAN; + btrfs_err(fs_info, + "missing extent item for extent %llu num_bytes %llu level %d", + head->bytenr, head->num_bytes, extent_op->level); goto out; } } From 58bfe2ccec5f9f137b41dd38f335290dcc13cd5c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 18 Sep 2023 10:34:51 -0400 Subject: [PATCH 6/9] btrfs: properly report 0 avail for very full file systems A user reported some issues with smaller file systems that get very full. While investigating this issue I noticed that df wasn't showing 100% full, despite having 0 chunk space and having < 1MiB of available metadata space. This turns out to be an overflow issue, we're doing: total_available_metadata_space - SZ_4M < global_block_rsv_size to determine if there's not enough space to make metadata allocations, which overflows if total_available_metadata_space is < 4M. Fix this by checking to see if our available space is greater than the 4M threshold. This makes df properly report 100% usage on the file system. CC: stable@vger.kernel.org # 4.14+ Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index cffdd6f7f8e8..1a093ec0f7e3 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2117,7 +2117,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) * calculated f_bavail. */ if (!mixed && block_rsv->space_info->full && - total_free_meta - thresh < block_rsv->size) + (total_free_meta < thresh || total_free_meta - thresh < block_rsv->size)) buf->f_bavail = 0; buf->f_type = BTRFS_SUPER_MAGIC; From 74ee79142c0a344d4eae2eb7012ebc4e82254109 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 19 Sep 2023 11:44:42 +0930 Subject: [PATCH 7/9] btrfs: reset destination buffer when read_extent_buffer() gets invalid range Commit f98b6215d7d1 ("btrfs: extent_io: do extra check for extent buffer read write functions") changed how we handle invalid extent buffer range for read_extent_buffer(). Previously if the range is invalid we just set the destination to zero, but after the patch we do nothing and error out. This can lead to smatch static checker errors like: fs/btrfs/print-tree.c:186 print_uuid_item() error: uninitialized symbol 'subvol_id'. fs/btrfs/tests/extent-io-tests.c:338 check_eb_bitmap() error: uninitialized symbol 'has'. fs/btrfs/tests/extent-io-tests.c:353 check_eb_bitmap() error: uninitialized symbol 'has'. fs/btrfs/uuid-tree.c:203 btrfs_uuid_tree_remove() error: uninitialized symbol 'read_subid'. fs/btrfs/uuid-tree.c:353 btrfs_uuid_tree_iterate() error: uninitialized symbol 'subid_le'. fs/btrfs/uuid-tree.c:72 btrfs_uuid_tree_lookup() error: uninitialized symbol 'data'. fs/btrfs/volumes.c:7415 btrfs_dev_stats_value() error: uninitialized symbol 'val'. Fix those warnings by reverting back to the old memset() behavior. By this we keep the static checker happy and would still make a lot of noise when such invalid ranges are passed in. Reported-by: Dan Carpenter Fixes: f98b6215d7d1 ("btrfs: extent_io: do extra check for extent buffer read write functions") Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 6954ae763b86..caccd0376342 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3995,8 +3995,14 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv, char *dst = (char *)dstv; unsigned long i = get_eb_page_index(start); - if (check_eb_range(eb, start, len)) + if (check_eb_range(eb, start, len)) { + /* + * Invalid range hit, reset the memory, so callers won't get + * some random garbage for their uninitialzed memory. + */ + memset(dstv, 0, len); return; + } offset = get_eb_offset_in_page(eb, start); From 20218dfbaa31b8d3ef842fafcc7eb4c6aa03f80a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 5 Sep 2023 12:15:23 -0400 Subject: [PATCH 8/9] btrfs: make sure to initialize start and len in find_free_dev_extent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Jens reported a compiler error when using CONFIG_CC_OPTIMIZE_FOR_SIZE=y that looks like this In function ‘gather_device_info’, inlined from ‘btrfs_create_chunk’ at fs/btrfs/volumes.c:5507:8: fs/btrfs/volumes.c:5245:48: warning: ‘dev_offset’ may be used uninitialized [-Wmaybe-uninitialized] 5245 | devices_info[ndevs].dev_offset = dev_offset; | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~ fs/btrfs/volumes.c: In function ‘btrfs_create_chunk’: fs/btrfs/volumes.c:5196:13: note: ‘dev_offset’ was declared here 5196 | u64 dev_offset; This occurs because find_free_dev_extent is responsible for setting dev_offset, however if we get an -ENOMEM at the top of the function we'll return without setting the value. This isn't actually a problem because we will see the -ENOMEM in gather_device_info() and return and not use the uninitialized value, however we also just don't want the compiler warning so rework the code slightly in find_free_dev_extent() to make sure it's always setting *start and *len to avoid the compiler warning. Reported-by: Jens Axboe Tested-by: Jens Axboe Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 733842136163..e3a3769fd92e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1594,7 +1594,7 @@ static int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, u64 search_start; u64 hole_size; u64 max_hole_start; - u64 max_hole_size; + u64 max_hole_size = 0; u64 extent_end; u64 search_end = device->total_bytes; int ret; @@ -1602,17 +1602,16 @@ static int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, struct extent_buffer *l; search_start = dev_extent_search_start(device); + max_hole_start = search_start; WARN_ON(device->zone_info && !IS_ALIGNED(num_bytes, device->zone_info->zone_size)); path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - max_hole_start = search_start; - max_hole_size = 0; - + if (!path) { + ret = -ENOMEM; + goto out; + } again: if (search_start >= search_end || test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { From b4c639f699349880b7918b861e1bd360442ec450 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 5 Sep 2023 12:15:24 -0400 Subject: [PATCH 9/9] btrfs: initialize start_slot in btrfs_log_prealloc_extents MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Jens reported a compiler warning when using CONFIG_CC_OPTIMIZE_FOR_SIZE=y that looks like this fs/btrfs/tree-log.c: In function ‘btrfs_log_prealloc_extents’: fs/btrfs/tree-log.c:4828:23: warning: ‘start_slot’ may be used uninitialized [-Wmaybe-uninitialized] 4828 | ret = copy_items(trans, inode, dst_path, path, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4829 | start_slot, ins_nr, 1, 0); | ~~~~~~~~~~~~~~~~~~~~~~~~~ fs/btrfs/tree-log.c:4725:13: note: ‘start_slot’ was declared here 4725 | int start_slot; | ^~~~~~~~~~ The compiler is incorrect, as we only use this code when ins_len > 0, and when ins_len > 0 we have start_slot properly initialized. However we generally find the -Wmaybe-uninitialized warnings valuable, so initialize start_slot to get rid of the warning. Reported-by: Jens Axboe Tested-by: Jens Axboe Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index b9229c08164f..537eb3de8809 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4722,7 +4722,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; int slot; int ins_nr = 0; - int start_slot; + int start_slot = 0; int ret; if (!(inode->flags & BTRFS_INODE_PREALLOC))