diff --git a/crypto/algapi.c b/crypto/algapi.c index 74e2261c184c..004d27e41315 100644 --- a/crypto/algapi.c +++ b/crypto/algapi.c @@ -373,7 +373,7 @@ found: q->cra_flags |= CRYPTO_ALG_DEAD; alg = test->adult; - if (list_empty(&alg->cra_list)) + if (crypto_is_dead(alg)) goto complete; if (err == -ECANCELED) diff --git a/crypto/testmgr.c b/crypto/testmgr.c index ee8da628e9da..2f5f6b52b2d4 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -1940,7 +1940,7 @@ static int __alg_test_hash(const struct hash_testvec *vecs, atfm = crypto_alloc_ahash(driver, type, mask); if (IS_ERR(atfm)) { if (PTR_ERR(atfm) == -ENOENT) - return -ENOENT; + return 0; pr_err("alg: hash: failed to allocate transform for %s: %ld\n", driver, PTR_ERR(atfm)); return PTR_ERR(atfm); @@ -2706,7 +2706,7 @@ static int alg_test_aead(const struct alg_test_desc *desc, const char *driver, tfm = crypto_alloc_aead(driver, type, mask); if (IS_ERR(tfm)) { if (PTR_ERR(tfm) == -ENOENT) - return -ENOENT; + return 0; pr_err("alg: aead: failed to allocate transform for %s: %ld\n", driver, PTR_ERR(tfm)); return PTR_ERR(tfm); @@ -3285,7 +3285,7 @@ static int alg_test_skcipher(const struct alg_test_desc *desc, tfm = crypto_alloc_skcipher(driver, type, mask); if (IS_ERR(tfm)) { if (PTR_ERR(tfm) == -ENOENT) - return -ENOENT; + return 0; pr_err("alg: skcipher: failed to allocate transform for %s: %ld\n", driver, PTR_ERR(tfm)); return PTR_ERR(tfm); @@ -3700,7 +3700,7 @@ static int alg_test_cipher(const struct alg_test_desc *desc, tfm = crypto_alloc_cipher(driver, type, mask); if (IS_ERR(tfm)) { if (PTR_ERR(tfm) == -ENOENT) - return -ENOENT; + return 0; printk(KERN_ERR "alg: cipher: Failed to load transform for " "%s: %ld\n", driver, PTR_ERR(tfm)); return PTR_ERR(tfm); @@ -3726,7 +3726,7 @@ static int alg_test_comp(const struct alg_test_desc *desc, const char *driver, acomp = crypto_alloc_acomp(driver, type, mask); if (IS_ERR(acomp)) { if (PTR_ERR(acomp) == -ENOENT) - return -ENOENT; + return 0; pr_err("alg: acomp: Failed to load transform for %s: %ld\n", driver, PTR_ERR(acomp)); return PTR_ERR(acomp); @@ -3740,7 +3740,7 @@ static int alg_test_comp(const struct alg_test_desc *desc, const char *driver, comp = crypto_alloc_comp(driver, type, mask); if (IS_ERR(comp)) { if (PTR_ERR(comp) == -ENOENT) - return -ENOENT; + return 0; pr_err("alg: comp: Failed to load transform for %s: %ld\n", driver, PTR_ERR(comp)); return PTR_ERR(comp); @@ -3818,7 +3818,7 @@ static int alg_test_cprng(const struct alg_test_desc *desc, const char *driver, rng = crypto_alloc_rng(driver, type, mask); if (IS_ERR(rng)) { if (PTR_ERR(rng) == -ENOENT) - return -ENOENT; + return 0; printk(KERN_ERR "alg: cprng: Failed to load transform for %s: " "%ld\n", driver, PTR_ERR(rng)); return PTR_ERR(rng); @@ -3846,12 +3846,11 @@ static int drbg_cavs_test(const struct drbg_testvec *test, int pr, drng = crypto_alloc_rng(driver, type, mask); if (IS_ERR(drng)) { + kfree_sensitive(buf); if (PTR_ERR(drng) == -ENOENT) - goto out_no_rng; + return 0; printk(KERN_ERR "alg: drbg: could not allocate DRNG handle for " "%s\n", driver); -out_no_rng: - kfree_sensitive(buf); return PTR_ERR(drng); } @@ -4095,7 +4094,7 @@ static int alg_test_kpp(const struct alg_test_desc *desc, const char *driver, tfm = crypto_alloc_kpp(driver, type, mask); if (IS_ERR(tfm)) { if (PTR_ERR(tfm) == -ENOENT) - return -ENOENT; + return 0; pr_err("alg: kpp: Failed to load tfm for %s: %ld\n", driver, PTR_ERR(tfm)); return PTR_ERR(tfm); @@ -4325,7 +4324,7 @@ static int alg_test_akcipher(const struct alg_test_desc *desc, tfm = crypto_alloc_akcipher(driver, type, mask); if (IS_ERR(tfm)) { if (PTR_ERR(tfm) == -ENOENT) - return -ENOENT; + return 0; pr_err("alg: akcipher: Failed to load tfm for %s: %ld\n", driver, PTR_ERR(tfm)); return PTR_ERR(tfm); diff --git a/drivers/crypto/marvell/cesa/hash.c b/drivers/crypto/marvell/cesa/hash.c index 8d84ad45571c..f150861ceaf6 100644 --- a/drivers/crypto/marvell/cesa/hash.c +++ b/drivers/crypto/marvell/cesa/hash.c @@ -947,7 +947,7 @@ struct ahash_alg mv_md5_alg = { .base = { .cra_name = "md5", .cra_driver_name = "mv-md5", - .cra_priority = 300, + .cra_priority = 0, .cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_ALLOCATES_MEMORY | CRYPTO_ALG_KERN_DRIVER_ONLY, @@ -1018,7 +1018,7 @@ struct ahash_alg mv_sha1_alg = { .base = { .cra_name = "sha1", .cra_driver_name = "mv-sha1", - .cra_priority = 300, + .cra_priority = 0, .cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_ALLOCATES_MEMORY | CRYPTO_ALG_KERN_DRIVER_ONLY, @@ -1092,7 +1092,7 @@ struct ahash_alg mv_sha256_alg = { .base = { .cra_name = "sha256", .cra_driver_name = "mv-sha256", - .cra_priority = 300, + .cra_priority = 0, .cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_ALLOCATES_MEMORY | CRYPTO_ALG_KERN_DRIVER_ONLY, @@ -1302,7 +1302,7 @@ struct ahash_alg mv_ahmac_md5_alg = { .base = { .cra_name = "hmac(md5)", .cra_driver_name = "mv-hmac-md5", - .cra_priority = 300, + .cra_priority = 0, .cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_ALLOCATES_MEMORY | CRYPTO_ALG_KERN_DRIVER_ONLY, @@ -1373,7 +1373,7 @@ struct ahash_alg mv_ahmac_sha1_alg = { .base = { .cra_name = "hmac(sha1)", .cra_driver_name = "mv-hmac-sha1", - .cra_priority = 300, + .cra_priority = 0, .cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_ALLOCATES_MEMORY | CRYPTO_ALG_KERN_DRIVER_ONLY, @@ -1444,7 +1444,7 @@ struct ahash_alg mv_ahmac_sha256_alg = { .base = { .cra_name = "hmac(sha256)", .cra_driver_name = "mv-hmac-sha256", - .cra_priority = 300, + .cra_priority = 0, .cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_ALLOCATES_MEMORY | CRYPTO_ALG_KERN_DRIVER_ONLY, diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index be0743dac3ff..c4cf26f1d149 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -269,6 +269,8 @@ rdma_find_ndev_for_src_ip_rcu(struct net *net, const struct sockaddr *src_in) break; #endif } + if (!ret && dev && is_vlan_dev(dev)) + dev = vlan_dev_real_dev(dev); return ret ? ERR_PTR(ret) : dev; } diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 39f89a4b8649..7dc8e2ec62cc 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -2816,6 +2816,8 @@ int rdma_nl_notify_event(struct ib_device *device, u32 port_num, nlh = nlmsg_put(skb, 0, 0, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_MONITOR), 0, 0); + if (!nlh) + goto err_free; switch (type) { case RDMA_REGISTER_EVENT: diff --git a/drivers/infiniband/hw/bnxt_re/hw_counters.c b/drivers/infiniband/hw/bnxt_re/hw_counters.c index 128651c01595..1e63f8091748 100644 --- a/drivers/infiniband/hw/bnxt_re/hw_counters.c +++ b/drivers/infiniband/hw/bnxt_re/hw_counters.c @@ -366,7 +366,7 @@ int bnxt_re_ib_get_hw_stats(struct ib_device *ibdev, goto done; } } - if (rdev->pacing.dbr_pacing) + if (rdev->pacing.dbr_pacing && bnxt_qplib_is_chip_gen_p5_p7(rdev->chip_ctx)) bnxt_re_copy_db_pacing_stats(rdev, stats); } diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 460f33914825..e66ae9f22c71 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -1307,7 +1307,11 @@ static int bnxt_re_init_sq_attr(struct bnxt_re_qp *qp, 0 : BNXT_QPLIB_RESERVED_QP_WRS; entries = bnxt_re_init_depth(entries + diff + 1, uctx); sq->max_wqe = min_t(u32, entries, dev_attr->max_qp_wqes + diff + 1); - sq->max_sw_wqe = bnxt_qplib_get_depth(sq, qplqp->wqe_mode, true); + if (qplqp->wqe_mode == BNXT_QPLIB_WQE_MODE_VARIABLE) + sq->max_sw_wqe = bnxt_qplib_get_depth(sq, qplqp->wqe_mode, true); + else + sq->max_sw_wqe = sq->max_wqe; + } sq->q_full_delta = diff + 1; /* diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 777068de4bbc..6715c96a3eee 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -188,8 +188,11 @@ static int bnxt_re_setup_chip_ctx(struct bnxt_re_dev *rdev) bnxt_re_set_db_offset(rdev); rc = bnxt_qplib_map_db_bar(&rdev->qplib_res); - if (rc) + if (rc) { + kfree(rdev->chip_ctx); + rdev->chip_ctx = NULL; return rc; + } if (bnxt_qplib_determine_atomics(en_dev->pdev)) ibdev_info(&rdev->ibdev, @@ -531,6 +534,7 @@ static bool is_dbr_fifo_full(struct bnxt_re_dev *rdev) static void __wait_for_fifo_occupancy_below_th(struct bnxt_re_dev *rdev) { struct bnxt_qplib_db_pacing_data *pacing_data = rdev->qplib_res.pacing_data; + u32 retry_fifo_check = 1000; u32 fifo_occup; /* loop shouldn't run infintely as the occupancy usually goes @@ -544,6 +548,14 @@ static void __wait_for_fifo_occupancy_below_th(struct bnxt_re_dev *rdev) if (fifo_occup < pacing_data->pacing_th) break; + if (!retry_fifo_check--) { + dev_info_once(rdev_to_dev(rdev), + "%s: fifo_occup = 0x%xfifo_max_depth = 0x%x pacing_th = 0x%x\n", + __func__, fifo_occup, pacing_data->fifo_max_depth, + pacing_data->pacing_th); + break; + } + } } @@ -957,7 +969,7 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) return ib_register_device(ibdev, "bnxt_re%d", &rdev->en_dev->pdev->dev); } -static struct bnxt_re_dev *bnxt_re_dev_add(struct bnxt_aux_priv *aux_priv, +static struct bnxt_re_dev *bnxt_re_dev_add(struct auxiliary_device *adev, struct bnxt_en_dev *en_dev) { struct bnxt_re_dev *rdev; @@ -973,6 +985,7 @@ static struct bnxt_re_dev *bnxt_re_dev_add(struct bnxt_aux_priv *aux_priv, rdev->nb.notifier_call = NULL; rdev->netdev = en_dev->net; rdev->en_dev = en_dev; + rdev->adev = adev; rdev->id = rdev->en_dev->pdev->devfn; INIT_LIST_HEAD(&rdev->qp_list); mutex_init(&rdev->qp_lock); @@ -1025,12 +1038,15 @@ static int bnxt_re_handle_unaffi_async_event(struct creq_func_event static int bnxt_re_handle_qp_async_event(struct creq_qp_event *qp_event, struct bnxt_re_qp *qp) { - struct bnxt_re_srq *srq = container_of(qp->qplib_qp.srq, struct bnxt_re_srq, - qplib_srq); struct creq_qp_error_notification *err_event; + struct bnxt_re_srq *srq = NULL; struct ib_event event = {}; unsigned int flags; + if (qp->qplib_qp.srq) + srq = container_of(qp->qplib_qp.srq, struct bnxt_re_srq, + qplib_srq); + if (qp->qplib_qp.state == CMDQ_MODIFY_QP_NEW_STATE_ERR && rdma_is_kernel_res(&qp->ib_qp.res)) { flags = bnxt_re_lock_cqs(qp); @@ -1258,15 +1274,9 @@ static int bnxt_re_cqn_handler(struct bnxt_qplib_nq *nq, { struct bnxt_re_cq *cq = container_of(handle, struct bnxt_re_cq, qplib_cq); - u32 *cq_ptr; - if (cq->ib_cq.comp_handler) { - if (cq->uctx_cq_page) { - cq_ptr = (u32 *)cq->uctx_cq_page; - *cq_ptr = cq->qplib_cq.toggle; - } + if (cq->ib_cq.comp_handler) (*cq->ib_cq.comp_handler)(&cq->ib_cq, cq->ib_cq.cq_context); - } return 0; } @@ -1823,7 +1833,6 @@ static void bnxt_re_update_en_info_rdev(struct bnxt_re_dev *rdev, */ rtnl_lock(); en_info->rdev = rdev; - rdev->adev = adev; rtnl_unlock(); } @@ -1840,7 +1849,7 @@ static int bnxt_re_add_device(struct auxiliary_device *adev, u8 op_type) en_dev = en_info->en_dev; - rdev = bnxt_re_dev_add(aux_priv, en_dev); + rdev = bnxt_re_dev_add(adev, en_dev); if (!rdev || !rdev_to_dev(rdev)) { rc = -ENOMEM; goto exit; @@ -1865,12 +1874,14 @@ static int bnxt_re_add_device(struct auxiliary_device *adev, u8 op_type) rdev->nb.notifier_call = NULL; pr_err("%s: Cannot register to netdevice_notifier", ROCE_DRV_MODULE_NAME); - return rc; + goto re_dev_unreg; } bnxt_re_setup_cc(rdev, true); return 0; +re_dev_unreg: + ib_unregister_device(&rdev->ibdev); re_dev_uninit: bnxt_re_update_en_info_rdev(NULL, en_info, adev); bnxt_re_dev_uninit(rdev, BNXT_RE_COMPLETE_REMOVE); @@ -2014,15 +2025,7 @@ static int bnxt_re_probe(struct auxiliary_device *adev, auxiliary_set_drvdata(adev, en_info); rc = bnxt_re_add_device(adev, BNXT_RE_COMPLETE_INIT); - if (rc) - goto err; mutex_unlock(&bnxt_re_mutex); - return 0; - -err: - mutex_unlock(&bnxt_re_mutex); - bnxt_re_remove(adev); - return rc; } diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index 42e98e5f94cb..2ebcb2de962b 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -327,6 +327,7 @@ static void bnxt_qplib_service_nq(struct tasklet_struct *t) case NQ_BASE_TYPE_CQ_NOTIFICATION: { struct nq_cn *nqcne = (struct nq_cn *)nqe; + struct bnxt_re_cq *cq_p; q_handle = le32_to_cpu(nqcne->cq_handle_low); q_handle |= (u64)le32_to_cpu(nqcne->cq_handle_high) @@ -337,6 +338,10 @@ static void bnxt_qplib_service_nq(struct tasklet_struct *t) cq->toggle = (le16_to_cpu(nqe->info10_type) & NQ_CN_TOGGLE_MASK) >> NQ_CN_TOGGLE_SFT; cq->dbinfo.toggle = cq->toggle; + cq_p = container_of(cq, struct bnxt_re_cq, qplib_cq); + if (cq_p->uctx_cq_page) + *((u32 *)cq_p->uctx_cq_page) = cq->toggle; + bnxt_qplib_armen_db(&cq->dbinfo, DBC_DBC_TYPE_CQ_ARMENA); spin_lock_bh(&cq->compl_lock); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h index b62df8701950..820611a23943 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h @@ -170,7 +170,7 @@ struct bnxt_qplib_swqe { }; u32 q_key; u32 dst_qp; - u16 avid; + u32 avid; } send; /* Send Raw Ethernet and QP1 */ diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index 3ffaef0c2651..7294221b3316 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -525,7 +525,7 @@ static int __bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, /* failed with status */ dev_err(&rcfw->pdev->dev, "cmdq[%#x]=%#x status %#x\n", cookie, opcode, evnt->status); - rc = -EFAULT; + rc = -EIO; } return rc; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c index dfc943fab87b..96ceec1e8199 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c @@ -244,6 +244,8 @@ int bnxt_qplib_alloc_init_hwq(struct bnxt_qplib_hwq *hwq, sginfo.pgsize = npde * pg_size; sginfo.npages = 1; rc = __alloc_pbl(res, &hwq->pbl[PBL_LVL_0], &sginfo); + if (rc) + goto fail; /* Alloc PBL pages */ sginfo.npages = npbl; @@ -255,22 +257,9 @@ int bnxt_qplib_alloc_init_hwq(struct bnxt_qplib_hwq *hwq, dst_virt_ptr = (dma_addr_t **)hwq->pbl[PBL_LVL_0].pg_arr; src_phys_ptr = hwq->pbl[PBL_LVL_1].pg_map_arr; - if (hwq_attr->type == HWQ_TYPE_MR) { - /* For MR it is expected that we supply only 1 contigous - * page i.e only 1 entry in the PDL that will contain - * all the PBLs for the user supplied memory region - */ - for (i = 0; i < hwq->pbl[PBL_LVL_1].pg_count; - i++) - dst_virt_ptr[0][i] = src_phys_ptr[i] | - flag; - } else { - for (i = 0; i < hwq->pbl[PBL_LVL_1].pg_count; - i++) - dst_virt_ptr[PTR_PG(i)][PTR_IDX(i)] = - src_phys_ptr[i] | - PTU_PDE_VALID; - } + for (i = 0; i < hwq->pbl[PBL_LVL_1].pg_count; i++) + dst_virt_ptr[0][i] = src_phys_ptr[i] | flag; + /* Alloc or init PTEs */ rc = __alloc_pbl(res, &hwq->pbl[PBL_LVL_2], hwq_attr->sginfo); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c index 4f75e7e5bcf7..e29fbbdab9fd 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -140,6 +140,8 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw, min_t(u32, sb->max_sge_var_wqe, BNXT_VAR_MAX_SGE) : 6; attr->max_cq = le32_to_cpu(sb->max_cq); attr->max_cq_wqes = le32_to_cpu(sb->max_cqe); + if (!bnxt_qplib_is_chip_gen_p7(rcfw->res->cctx)) + attr->max_cq_wqes = min_t(u32, BNXT_QPLIB_MAX_CQ_WQES, attr->max_cq_wqes); attr->max_cq_sges = attr->max_qp_sges; attr->max_mr = le32_to_cpu(sb->max_mr); attr->max_mw = le32_to_cpu(sb->max_mw); @@ -157,7 +159,14 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw, if (!bnxt_qplib_is_chip_gen_p7(rcfw->res->cctx)) attr->l2_db_size = (sb->l2_db_space_size + 1) * (0x01 << RCFW_DBR_BASE_PAGE_SHIFT); - attr->max_sgid = BNXT_QPLIB_NUM_GIDS_SUPPORTED; + /* + * Read the max gid supported by HW. + * For each entry in HW GID in HW table, we consume 2 + * GID entries in the kernel GID table. So max_gid reported + * to stack can be up to twice the value reported by the HW, up to 256 gids. + */ + attr->max_sgid = le32_to_cpu(sb->max_gid); + attr->max_sgid = min_t(u32, BNXT_QPLIB_NUM_GIDS_SUPPORTED, 2 * attr->max_sgid); attr->dev_cap_flags = le16_to_cpu(sb->dev_cap_flags); attr->dev_cap_flags2 = le16_to_cpu(sb->dev_cap_ext_flags_2); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.h b/drivers/infiniband/hw/bnxt_re/qplib_sp.h index acd9c14a31c4..ecf3f45fea74 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.h @@ -56,6 +56,7 @@ struct bnxt_qplib_dev_attr { u32 max_qp_wqes; u32 max_qp_sges; u32 max_cq; +#define BNXT_QPLIB_MAX_CQ_WQES 0xfffff u32 max_cq_wqes; u32 max_cq_sges; u32 max_mr; diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index b3757c6a0457..8d753e6e0c71 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -2086,7 +2086,7 @@ static int import_ep(struct c4iw_ep *ep, int iptype, __u8 *peer_ip, err = -ENOMEM; if (n->dev->flags & IFF_LOOPBACK) { if (iptype == 4) - pdev = ip_dev_find(&init_net, *(__be32 *)peer_ip); + pdev = __ip_dev_find(&init_net, *(__be32 *)peer_ip, false); else if (IS_ENABLED(CONFIG_IPV6)) for_each_netdev(&init_net, pdev) { if (ipv6_chk_addr(&init_net, @@ -2101,12 +2101,12 @@ static int import_ep(struct c4iw_ep *ep, int iptype, __u8 *peer_ip, err = -ENODEV; goto out; } + if (is_vlan_dev(pdev)) + pdev = vlan_dev_real_dev(pdev); ep->l2t = cxgb4_l2t_get(cdev->rdev.lldi.l2t, n, pdev, rt_tos2priority(tos)); - if (!ep->l2t) { - dev_put(pdev); + if (!ep->l2t) goto out; - } ep->mtu = pdev->mtu; ep->tx_chan = cxgb4_port_chan(pdev); ep->smac_idx = ((struct port_info *)netdev_priv(pdev))->smt_idx; @@ -2119,7 +2119,6 @@ static int import_ep(struct c4iw_ep *ep, int iptype, __u8 *peer_ip, ep->rss_qid = cdev->rdev.lldi.rxq_ids[ cxgb4_port_idx(pdev) * step]; set_tcp_window(ep, (struct port_info *)netdev_priv(pdev)); - dev_put(pdev); } else { pdev = get_real_dev(n->dev); ep->l2t = cxgb4_l2t_get(cdev->rdev.lldi.l2t, diff --git a/drivers/infiniband/hw/irdma/cm.c b/drivers/infiniband/hw/irdma/cm.c index 36bb7e5ce638..ce8d821bdad8 100644 --- a/drivers/infiniband/hw/irdma/cm.c +++ b/drivers/infiniband/hw/irdma/cm.c @@ -3631,7 +3631,7 @@ void irdma_free_lsmm_rsrc(struct irdma_qp *iwqp) /** * irdma_accept - registered call for connection to be accepted * @cm_id: cm information for passive connection - * @conn_param: accpet parameters + * @conn_param: accept parameters */ int irdma_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) { diff --git a/drivers/infiniband/sw/siw/siw_qp_tx.c b/drivers/infiniband/sw/siw/siw_qp_tx.c index 64ad9e0895bd..a034264c5669 100644 --- a/drivers/infiniband/sw/siw/siw_qp_tx.c +++ b/drivers/infiniband/sw/siw/siw_qp_tx.c @@ -331,6 +331,8 @@ static int siw_tcp_sendpages(struct socket *s, struct page **page, int offset, msg.msg_flags &= ~MSG_MORE; tcp_rate_check_app_limited(sk); + if (!sendpage_ok(page[i])) + msg.msg_flags &= ~MSG_SPLICE_PAGES; bvec_set_page(&bvec, page[i], bytes, offset); iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size); diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 9632afbd727b..5dfb4644446b 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -68,6 +68,8 @@ MODULE_LICENSE("Dual BSD/GPL"); static u64 srpt_service_guid; static DEFINE_SPINLOCK(srpt_dev_lock); /* Protects srpt_dev_list. */ static LIST_HEAD(srpt_dev_list); /* List of srpt_device structures. */ +static DEFINE_MUTEX(srpt_mc_mutex); /* Protects srpt_memory_caches. */ +static DEFINE_XARRAY(srpt_memory_caches); /* See also srpt_memory_cache_entry */ static unsigned srp_max_req_size = DEFAULT_MAX_REQ_SIZE; module_param(srp_max_req_size, int, 0444); @@ -105,6 +107,63 @@ static void srpt_recv_done(struct ib_cq *cq, struct ib_wc *wc); static void srpt_send_done(struct ib_cq *cq, struct ib_wc *wc); static void srpt_process_wait_list(struct srpt_rdma_ch *ch); +/* Type of the entries in srpt_memory_caches. */ +struct srpt_memory_cache_entry { + refcount_t ref; + struct kmem_cache *c; +}; + +static struct kmem_cache *srpt_cache_get(unsigned int object_size) +{ + struct srpt_memory_cache_entry *e; + char name[32]; + void *res; + + guard(mutex)(&srpt_mc_mutex); + e = xa_load(&srpt_memory_caches, object_size); + if (e) { + refcount_inc(&e->ref); + return e->c; + } + snprintf(name, sizeof(name), "srpt-%u", object_size); + e = kmalloc(sizeof(*e), GFP_KERNEL); + if (!e) + return NULL; + refcount_set(&e->ref, 1); + e->c = kmem_cache_create(name, object_size, /*align=*/512, 0, NULL); + if (!e->c) + goto free_entry; + res = xa_store(&srpt_memory_caches, object_size, e, GFP_KERNEL); + if (xa_is_err(res)) + goto destroy_cache; + return e->c; + +destroy_cache: + kmem_cache_destroy(e->c); + +free_entry: + kfree(e); + return NULL; +} + +static void srpt_cache_put(struct kmem_cache *c) +{ + struct srpt_memory_cache_entry *e = NULL; + unsigned long object_size; + + guard(mutex)(&srpt_mc_mutex); + xa_for_each(&srpt_memory_caches, object_size, e) + if (e->c == c) + break; + if (WARN_ON_ONCE(!e)) + return; + if (!refcount_dec_and_test(&e->ref)) + return; + WARN_ON_ONCE(xa_erase(&srpt_memory_caches, object_size) != e); + kmem_cache_destroy(e->c); + kfree(e); +} + /* * The only allowed channel state changes are those that change the channel * state into a state with a higher numerical value. Hence the new > prev test. @@ -2119,13 +2178,13 @@ static void srpt_release_channel_work(struct work_struct *w) ch->sport->sdev, ch->rq_size, ch->rsp_buf_cache, DMA_TO_DEVICE); - kmem_cache_destroy(ch->rsp_buf_cache); + srpt_cache_put(ch->rsp_buf_cache); srpt_free_ioctx_ring((struct srpt_ioctx **)ch->ioctx_recv_ring, sdev, ch->rq_size, ch->req_buf_cache, DMA_FROM_DEVICE); - kmem_cache_destroy(ch->req_buf_cache); + srpt_cache_put(ch->req_buf_cache); kref_put(&ch->kref, srpt_free_ch); } @@ -2245,8 +2304,7 @@ static int srpt_cm_req_recv(struct srpt_device *const sdev, INIT_LIST_HEAD(&ch->cmd_wait_list); ch->max_rsp_size = ch->sport->port_attrib.srp_max_rsp_size; - ch->rsp_buf_cache = kmem_cache_create("srpt-rsp-buf", ch->max_rsp_size, - 512, 0, NULL); + ch->rsp_buf_cache = srpt_cache_get(ch->max_rsp_size); if (!ch->rsp_buf_cache) goto free_ch; @@ -2280,8 +2338,7 @@ static int srpt_cm_req_recv(struct srpt_device *const sdev, alignment_offset = round_up(imm_data_offset, 512) - imm_data_offset; req_sz = alignment_offset + imm_data_offset + srp_max_req_size; - ch->req_buf_cache = kmem_cache_create("srpt-req-buf", req_sz, - 512, 0, NULL); + ch->req_buf_cache = srpt_cache_get(req_sz); if (!ch->req_buf_cache) goto free_rsp_ring; @@ -2478,7 +2535,7 @@ free_recv_ring: ch->req_buf_cache, DMA_FROM_DEVICE); free_recv_cache: - kmem_cache_destroy(ch->req_buf_cache); + srpt_cache_put(ch->req_buf_cache); free_rsp_ring: srpt_free_ioctx_ring((struct srpt_ioctx **)ch->ioctx_ring, @@ -2486,7 +2543,7 @@ free_rsp_ring: ch->rsp_buf_cache, DMA_TO_DEVICE); free_rsp_cache: - kmem_cache_destroy(ch->rsp_buf_cache); + srpt_cache_put(ch->rsp_buf_cache); free_ch: if (rdma_cm_id) @@ -3055,7 +3112,7 @@ static void srpt_free_srq(struct srpt_device *sdev) srpt_free_ioctx_ring((struct srpt_ioctx **)sdev->ioctx_ring, sdev, sdev->srq_size, sdev->req_buf_cache, DMA_FROM_DEVICE); - kmem_cache_destroy(sdev->req_buf_cache); + srpt_cache_put(sdev->req_buf_cache); sdev->srq = NULL; } @@ -3082,8 +3139,7 @@ static int srpt_alloc_srq(struct srpt_device *sdev) pr_debug("create SRQ #wr= %d max_allow=%d dev= %s\n", sdev->srq_size, sdev->device->attrs.max_srq_wr, dev_name(&device->dev)); - sdev->req_buf_cache = kmem_cache_create("srpt-srq-req-buf", - srp_max_req_size, 0, 0, NULL); + sdev->req_buf_cache = srpt_cache_get(srp_max_req_size); if (!sdev->req_buf_cache) goto free_srq; @@ -3105,7 +3161,7 @@ static int srpt_alloc_srq(struct srpt_device *sdev) return 0; free_cache: - kmem_cache_destroy(sdev->req_buf_cache); + srpt_cache_put(sdev->req_buf_cache); free_srq: ib_destroy_srq(srq); diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 4e4a448f6931..6e161f8ffe8d 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -639,6 +639,16 @@ int bch2_alloc_read(struct bch_fs *c) continue; } + if (k.k->p.offset < ca->mi.first_bucket) { + bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode, ca->mi.first_bucket)); + continue; + } + + if (k.k->p.offset >= ca->mi.nbuckets) { + bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); + continue; + } + struct bch_alloc_v4 a; *bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen; 0; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 84832c2d4df9..5004f6ba997c 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -678,7 +678,8 @@ struct bch_sb_field_ext { x(disk_accounting_v2, BCH_VERSION(1, 9)) \ x(disk_accounting_v3, BCH_VERSION(1, 10)) \ x(disk_accounting_inum, BCH_VERSION(1, 11)) \ - x(rebalance_work_acct_fix, BCH_VERSION(1, 12)) + x(rebalance_work_acct_fix, BCH_VERSION(1, 12)) \ + x(inode_has_child_snapshots, BCH_VERSION(1, 13)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 771154e3a291..94bbd8505582 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1224,17 +1224,20 @@ int bch2_gc_gens(struct bch_fs *c) u64 b, start_time = local_clock(); int ret; - /* - * Ideally we would be using state_lock and not gc_gens_lock here, but that - * introduces a deadlock in the RO path - we currently take the state - * lock at the start of going RO, thus the gc thread may get stuck: - */ if (!mutex_trylock(&c->gc_gens_lock)) return 0; trace_and_count(c, gc_gens_start, c); - down_read(&c->state_lock); + /* + * We have to use trylock here. Otherwise, we would + * introduce a deadlock in the RO path - we take the + * state lock at the start of going RO. + */ + if (!down_read_trylock(&c->state_lock)) { + mutex_unlock(&c->gc_gens_lock); + return 0; + } for_each_member_device(c, ca) { struct bucket_gens *gens = bucket_gens(ca); diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 1c1448b52207..cf933409d385 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1838,10 +1838,11 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b) struct btree_trans *trans = bch2_trans_get(c); btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); + + /* we don't need transaction context anymore after we got the lock. */ + bch2_trans_put(trans); __btree_node_write_done(c, b); six_unlock_read(&b->c.lock); - - bch2_trans_put(trans); } static void btree_node_write_work(struct work_struct *work) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index bfe9f0c1e1be..0883cf6e1a3e 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2381,9 +2381,9 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e else iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k)); - if (unlikely(!(iter->flags & BTREE_ITER_is_extents) - ? bkey_gt(iter_pos, end) - : bkey_ge(iter_pos, end))) + if (unlikely(iter->flags & BTREE_ITER_all_snapshots ? bpos_gt(iter_pos, end) : + iter->flags & BTREE_ITER_is_extents ? bkey_ge(iter_pos, end) : + bkey_gt(iter_pos, end))) goto end; break; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 78e63ad7d380..31a58bf46fdb 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -857,6 +857,14 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *); for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, _start,\ SPOS_MAX, _flags, _k, _ret) +#define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id, \ + _start, _flags, _k, _ret) \ + for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + (_k) = bch2_btree_iter_peek_prev_type(&(_iter), _flags), \ + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_rewind(&(_iter))) + #define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \ for_each_btree_key_upto_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret) diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index 1e694fedc5da..a7aedb134e9f 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -171,6 +171,9 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH) return; + if (BTREE_NODE_ID(bn) >= BTREE_ID_NR_MAX) + return; + rcu_read_lock(); struct found_btree_node n = { .btree_id = BTREE_NODE_ID(bn), diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 462b1a2fe1ad..a6ee0beee6b0 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -80,6 +80,7 @@ static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struc if (ptr2 == ptr) break; + ca = bch2_dev_have_ref(c, ptr2->dev); bucket = PTR_BUCKET_POS(ca, ptr2); bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0); } diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 9f3133e3e7e5..e309fb78529b 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -242,6 +242,14 @@ void bch2_accounting_swab(struct bkey_s k) *p = swab64(*p); } +static inline void __accounting_to_replicas(struct bch_replicas_entry_v1 *r, + struct disk_accounting_pos acc) +{ + unsafe_memcpy(r, &acc.replicas, + replicas_entry_bytes(&acc.replicas), + "variable length struct"); +} + static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struct bpos p) { struct disk_accounting_pos acc_k; @@ -249,9 +257,7 @@ static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struc switch (acc_k.type) { case BCH_DISK_ACCOUNTING_replicas: - unsafe_memcpy(r, &acc_k.replicas, - replicas_entry_bytes(&acc_k.replicas), - "variable length struct"); + __accounting_to_replicas(r, acc_k); return true; default: return false; @@ -608,6 +614,81 @@ static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k) return ret; } +static int bch2_disk_accounting_validate_late(struct btree_trans *trans, + struct disk_accounting_pos acc, + u64 *v, unsigned nr) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + int ret = 0, invalid_dev = -1; + + switch (acc.type) { + case BCH_DISK_ACCOUNTING_replicas: { + struct bch_replicas_padded r; + __accounting_to_replicas(&r.e, acc); + + for (unsigned i = 0; i < r.e.nr_devs; i++) + if (r.e.devs[i] != BCH_SB_MEMBER_INVALID && + !bch2_dev_exists(c, r.e.devs[i])) { + invalid_dev = r.e.devs[i]; + goto invalid_device; + } + + /* + * All replicas entry checks except for invalid device are done + * in bch2_accounting_validate + */ + BUG_ON(bch2_replicas_entry_validate(&r.e, c, &buf)); + + if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e), + trans, accounting_replicas_not_marked, + "accounting not marked in superblock replicas\n %s", + (printbuf_reset(&buf), + bch2_accounting_key_to_text(&buf, &acc), + buf.buf))) { + /* + * We're not RW yet and still single threaded, dropping + * and retaking lock is ok: + */ + percpu_up_write(&c->mark_lock); + ret = bch2_mark_replicas(c, &r.e); + if (ret) + goto fsck_err; + percpu_down_write(&c->mark_lock); + } + break; + } + + case BCH_DISK_ACCOUNTING_dev_data_type: + if (!bch2_dev_exists(c, acc.dev_data_type.dev)) { + invalid_dev = acc.dev_data_type.dev; + goto invalid_device; + } + break; + } + +fsck_err: + printbuf_exit(&buf); + return ret; +invalid_device: + if (fsck_err(trans, accounting_to_invalid_device, + "accounting entry points to invalid device %i\n %s", + invalid_dev, + (printbuf_reset(&buf), + bch2_accounting_key_to_text(&buf, &acc), + buf.buf))) { + for (unsigned i = 0; i < nr; i++) + v[i] = -v[i]; + + ret = commit_do(trans, NULL, NULL, 0, + bch2_disk_accounting_mod(trans, &acc, v, nr, false)) ?: + -BCH_ERR_remove_disk_accounting_entry; + } else { + ret = -BCH_ERR_remove_disk_accounting_entry; + } + goto fsck_err; +} + /* * At startup time, initialize the in memory accounting from the btree (and * journal) @@ -666,44 +747,42 @@ int bch2_accounting_read(struct bch_fs *c) } keys->gap = keys->nr = dst - keys->data; - percpu_down_read(&c->mark_lock); - for (unsigned i = 0; i < acc->k.nr; i++) { + percpu_down_write(&c->mark_lock); + unsigned i = 0; + while (i < acc->k.nr) { + unsigned idx = inorder_to_eytzinger0(i, acc->k.nr); + + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, acc->k.data[idx].pos); + u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; - bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false); - - if (bch2_is_zero(v, sizeof(v[0]) * acc->k.data[i].nr_counters)) - continue; - - struct bch_replicas_padded r; - if (!accounting_to_replicas(&r.e, acc->k.data[i].pos)) - continue; + bch2_accounting_mem_read_counters(acc, idx, v, ARRAY_SIZE(v), false); /* - * If the replicas entry is invalid it'll get cleaned up by - * check_allocations: + * If the entry counters are zeroed, it should be treated as + * nonexistent - it might point to an invalid device. + * + * Remove it, so that if it's re-added it gets re-marked in the + * superblock: */ - if (bch2_replicas_entry_validate(&r.e, c, &buf)) + ret = bch2_is_zero(v, sizeof(v[0]) * acc->k.data[idx].nr_counters) + ? -BCH_ERR_remove_disk_accounting_entry + : bch2_disk_accounting_validate_late(trans, acc_k, + v, acc->k.data[idx].nr_counters); + + if (ret == -BCH_ERR_remove_disk_accounting_entry) { + free_percpu(acc->k.data[idx].v[0]); + free_percpu(acc->k.data[idx].v[1]); + darray_remove_item(&acc->k, &acc->k.data[idx]); + eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, NULL); + ret = 0; continue; - - struct disk_accounting_pos k; - bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos); - - if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e), - trans, accounting_replicas_not_marked, - "accounting not marked in superblock replicas\n %s", - (printbuf_reset(&buf), - bch2_accounting_key_to_text(&buf, &k), - buf.buf))) { - /* - * We're not RW yet and still single threaded, dropping - * and retaking lock is ok: - */ - percpu_up_read(&c->mark_lock); - ret = bch2_mark_replicas(c, &r.e); - if (ret) - goto fsck_err; - percpu_down_read(&c->mark_lock); } + + if (ret) + goto fsck_err; + i++; } preempt_disable(); @@ -742,7 +821,7 @@ int bch2_accounting_read(struct bch_fs *c) } preempt_enable(); fsck_err: - percpu_up_read(&c->mark_lock); + percpu_up_write(&c->mark_lock); err: printbuf_exit(&buf); bch2_trans_put(trans); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 1587c6e1866a..e410cfe37b1a 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -124,6 +124,11 @@ int bch2_stripe_validate(struct bch_fs *c, struct bkey_s_c k, "incorrect value size (%zu < %u)", bkey_val_u64s(k.k), stripe_val_u64s(s)); + bkey_fsck_err_on(s->csum_granularity_bits >= 64, + c, stripe_csum_granularity_bad, + "invalid csum granularity (%u >= 64)", + s->csum_granularity_bits); + ret = bch2_bkey_ptrs_validate(c, k, flags); fsck_err: return ret; @@ -145,7 +150,11 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, nr_data, s.nr_redundant); bch2_prt_csum_type(out, s.csum_type); - prt_printf(out, " gran %u", 1U << s.csum_granularity_bits); + prt_str(out, " gran "); + if (s.csum_granularity_bits < 64) + prt_printf(out, "%llu", 1ULL << s.csum_granularity_bits); + else + prt_printf(out, "(invalid shift %u)", s.csum_granularity_bits); if (s.disk_label) { prt_str(out, " label"); @@ -1197,47 +1206,62 @@ void bch2_do_stripe_deletes(struct bch_fs *c) /* stripe creation: */ static int ec_stripe_key_update(struct btree_trans *trans, - struct bkey_i_stripe *new, - bool create) + struct bkey_i_stripe *old, + struct bkey_i_stripe *new) { struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - int ret; + bool create = !old; - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, - new->k.p, BTREE_ITER_intent); - ret = bkey_err(k); + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, + new->k.p, BTREE_ITER_intent); + int ret = bkey_err(k); if (ret) goto err; - if (k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe)) { - bch2_fs_inconsistent(c, "error %s stripe: got existing key type %s", - create ? "creating" : "updating", - bch2_bkey_types[k.k->type]); + if (bch2_fs_inconsistent_on(k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe), + c, "error %s stripe: got existing key type %s", + create ? "creating" : "updating", + bch2_bkey_types[k.k->type])) { ret = -EINVAL; goto err; } if (k.k->type == KEY_TYPE_stripe) { - const struct bch_stripe *old = bkey_s_c_to_stripe(k).v; - unsigned i; + const struct bch_stripe *v = bkey_s_c_to_stripe(k).v; - if (old->nr_blocks != new->v.nr_blocks) { - bch_err(c, "error updating stripe: nr_blocks does not match"); - ret = -EINVAL; - goto err; - } + BUG_ON(old->v.nr_blocks != new->v.nr_blocks); + BUG_ON(old->v.nr_blocks != v->nr_blocks); - for (i = 0; i < new->v.nr_blocks; i++) { - unsigned v = stripe_blockcount_get(old, i); + for (unsigned i = 0; i < new->v.nr_blocks; i++) { + unsigned sectors = stripe_blockcount_get(v, i); - BUG_ON(v && - (old->ptrs[i].dev != new->v.ptrs[i].dev || - old->ptrs[i].gen != new->v.ptrs[i].gen || - old->ptrs[i].offset != new->v.ptrs[i].offset)); + if (!bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i]) && sectors) { + struct printbuf buf = PRINTBUF; - stripe_blockcount_set(&new->v, i, v); + prt_printf(&buf, "stripe changed nonempty block %u", i); + prt_str(&buf, "\nold: "); + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, "\nnew: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new->k_i)); + bch2_fs_inconsistent(c, "%s", buf.buf); + printbuf_exit(&buf); + ret = -EINVAL; + goto err; + } + + /* + * If the stripe ptr changed underneath us, it must have + * been dev_remove_stripes() -> * invalidate_stripe_to_dev() + */ + if (!bch2_extent_ptr_eq(old->v.ptrs[i], v->ptrs[i])) { + BUG_ON(v->ptrs[i].dev != BCH_SB_MEMBER_INVALID); + + if (bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i])) + new->v.ptrs[i].dev = BCH_SB_MEMBER_INVALID; + } + + stripe_blockcount_set(&new->v, i, sectors); } } @@ -1499,8 +1523,10 @@ static void ec_stripe_create(struct ec_stripe_new *s) BCH_TRANS_COMMIT_no_check_rw| BCH_TRANS_COMMIT_no_enospc, ec_stripe_key_update(trans, - bkey_i_to_stripe(&s->new_stripe.key), - !s->have_existing_stripe)); + s->have_existing_stripe + ? bkey_i_to_stripe(&s->existing_stripe.key) + : NULL, + bkey_i_to_stripe(&s->new_stripe.key))); bch_err_msg(c, ret, "creating stripe key"); if (ret) { goto err; @@ -1876,7 +1902,15 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX); for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) { - __clear_bit(v->ptrs[i].dev, devs.d); + /* + * Note: we don't yet repair invalid blocks (failed/removed + * devices) when reusing stripes - we still need a codepath to + * walk backpointers and update all extents that point to that + * block when updating the stripe + */ + if (v->ptrs[i].dev != BCH_SB_MEMBER_INVALID) + __clear_bit(v->ptrs[i].dev, devs.d); + if (i < h->s->nr_data) nr_have_data++; else diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 60b7875adada..649263516ab1 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -268,7 +268,8 @@ x(BCH_ERR_nopromote, nopromote_no_writes) \ x(BCH_ERR_nopromote, nopromote_enomem) \ x(0, invalid_snapshot_node) \ - x(0, option_needs_open_fs) + x(0, option_needs_open_fs) \ + x(0, remove_disk_accounting_entry) enum bch_errcode { BCH_ERR_START = 2048, diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index ed5001dd662e..923a5f1849a8 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -695,6 +695,16 @@ void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, int bch2_bkey_ptrs_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +static inline bool bch2_extent_ptr_eq(struct bch_extent_ptr ptr1, + struct bch_extent_ptr ptr2) +{ + return (ptr1.cached == ptr2.cached && + ptr1.unwritten == ptr2.unwritten && + ptr1.offset == ptr2.offset && + ptr1.dev == ptr2.dev && + ptr1.dev == ptr2.dev); +} + void bch2_ptr_swab(struct bkey_s); const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c); diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index ee1c0325f313..6d3a05ae5da8 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -369,6 +369,7 @@ static noinline void bch2_dio_write_flush(struct dio_write *dio) static __always_inline long bch2_dio_write_done(struct dio_write *dio) { + struct bch_fs *c = dio->op.c; struct kiocb *req = dio->req; struct bch_inode_info *inode = dio->inode; bool sync = dio->sync; @@ -387,7 +388,7 @@ static __always_inline long bch2_dio_write_done(struct dio_write *dio) ret = dio->op.error ?: ((long) dio->written << 9); bio_put(&dio->op.wbio.bio); - bch2_write_ref_put(dio->op.c, BCH_WRITE_REF_dio_write); + bch2_write_ref_put(c, BCH_WRITE_REF_dio_write); /* inode->i_dio_count is our ref on inode and thus bch_fs */ inode_dio_end(&inode->v); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 02969dff165d..d7bd28bdaffc 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -157,6 +157,20 @@ static bool subvol_inum_eq(subvol_inum a, subvol_inum b) return a.subvol == b.subvol && a.inum == b.inum; } +static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed) +{ + const subvol_inum *inum = data; + + return jhash(&inum->inum, sizeof(inum->inum), seed); +} + +static u32 bch2_vfs_inode_obj_hash_fn(const void *data, u32 len, u32 seed) +{ + const struct bch_inode_info *inode = data; + + return bch2_vfs_inode_hash_fn(&inode->ei_inum, sizeof(inode->ei_inum), seed); +} + static int bch2_vfs_inode_cmp_fn(struct rhashtable_compare_arg *arg, const void *obj) { @@ -170,11 +184,91 @@ static const struct rhashtable_params bch2_vfs_inodes_params = { .head_offset = offsetof(struct bch_inode_info, hash), .key_offset = offsetof(struct bch_inode_info, ei_inum), .key_len = sizeof(subvol_inum), + .hashfn = bch2_vfs_inode_hash_fn, + .obj_hashfn = bch2_vfs_inode_obj_hash_fn, .obj_cmpfn = bch2_vfs_inode_cmp_fn, .automatic_shrinking = true, }; -struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum) +int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) +{ + struct bch_fs *c = trans->c; + struct rhashtable *ht = &c->vfs_inodes_table; + subvol_inum inum = (subvol_inum) { .inum = p.offset }; + DARRAY(u32) subvols; + int ret = 0; + + if (!test_bit(BCH_FS_started, &c->flags)) + return false; + + darray_init(&subvols); +restart_from_top: + + /* + * Tweaked version of __rhashtable_lookup(); we need to get a list of + * subvolumes in which the given inode number is open. + * + * For this to work, we don't include the subvolume ID in the key that + * we hash - all inodes with the same inode number regardless of + * subvolume will hash to the same slot. + * + * This will be less than ideal if the same file is ever open + * simultaneously in many different snapshots: + */ + rcu_read_lock(); + struct rhash_lock_head __rcu *const *bkt; + struct rhash_head *he; + unsigned int hash; + struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); +restart: + hash = rht_key_hashfn(ht, tbl, &inum, bch2_vfs_inodes_params); + bkt = rht_bucket(tbl, hash); + do { + struct bch_inode_info *inode; + + rht_for_each_entry_rcu_from(inode, he, rht_ptr_rcu(bkt), tbl, hash, hash) { + if (inode->ei_inum.inum == inum.inum) { + ret = darray_push_gfp(&subvols, inode->ei_inum.subvol, + GFP_NOWAIT|__GFP_NOWARN); + if (ret) { + rcu_read_unlock(); + ret = darray_make_room(&subvols, 1); + if (ret) + goto err; + subvols.nr = 0; + goto restart_from_top; + } + } + } + /* An object might have been moved to a different hash chain, + * while we walk along it - better check and retry. + */ + } while (he != RHT_NULLS_MARKER(bkt)); + + /* Ensure we see any new tables. */ + smp_rmb(); + + tbl = rht_dereference_rcu(tbl->future_tbl, ht); + if (unlikely(tbl)) + goto restart; + rcu_read_unlock(); + + darray_for_each(subvols, i) { + u32 snap; + ret = bch2_subvolume_get_snapshot(trans, *i, &snap); + if (ret) + goto err; + + ret = bch2_snapshot_is_ancestor(c, snap, p.snapshot); + if (ret) + break; + } +err: + darray_exit(&subvols); + return ret; +} + +static struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum) { return rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params); } @@ -184,7 +278,8 @@ static void __wait_on_freeing_inode(struct bch_fs *c, subvol_inum inum) { wait_queue_head_t *wq; - DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW); + struct wait_bit_queue_entry wait; + wq = inode_bit_waitqueue(&wait, &inode->v, __I_NEW); prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); spin_unlock(&inode->v.i_lock); @@ -252,7 +347,8 @@ static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c, set_bit(EI_INODE_HASHED, &inode->ei_flags); retry: - if (unlikely(rhashtable_lookup_insert_fast(&c->vfs_inodes_table, + if (unlikely(rhashtable_lookup_insert_key(&c->vfs_inodes_table, + &inode->ei_inum, &inode->hash, bch2_vfs_inodes_params))) { old = bch2_inode_hash_find(c, trans, inode->ei_inum); diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index da74ecc236e7..59f9f7ae728d 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -54,8 +54,6 @@ static inline subvol_inum inode_inum(struct bch_inode_info *inode) return inode->ei_inum; } -struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *, subvol_inum); - /* * Set if we've gotten a btree error for this inode, and thus the vfs inode and * btree inode may be inconsistent: @@ -148,6 +146,8 @@ struct bch_inode_info * __bch2_create(struct mnt_idmap *, struct bch_inode_info *, struct dentry *, umode_t, dev_t, subvol_inum, unsigned); +int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p); + int bch2_fs_quota_transfer(struct bch_fs *, struct bch_inode_info *, struct bch_qid, @@ -198,10 +198,7 @@ int bch2_vfs_init(void); #define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields) ({ do {} while (0); }) -static inline struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum) -{ - return NULL; -} +static inline int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) { return 0; } static inline void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s) {} diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index b8a6ceb0cc7a..a1087fd292e4 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -326,17 +326,54 @@ err: return ret; } +static inline bool inode_should_reattach(struct bch_inode_unpacked *inode) +{ + if (inode->bi_inum == BCACHEFS_ROOT_INO && + inode->bi_subvol == BCACHEFS_ROOT_SUBVOL) + return false; + + return !inode->bi_dir && !(inode->bi_flags & BCH_INODE_unlinked); +} + +static int maybe_delete_dirent(struct btree_trans *trans, struct bpos d_pos, u32 snapshot) +{ + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_dirents, + SPOS(d_pos.inode, d_pos.offset, snapshot), + BTREE_ITER_intent| + BTREE_ITER_with_updates); + int ret = bkey_err(k); + if (ret) + return ret; + + if (bpos_eq(k.k->p, d_pos)) { + /* + * delet_at() doesn't work because the update path doesn't + * internally use BTREE_ITER_with_updates yet + */ + struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k)); + ret = PTR_ERR_OR_ZERO(k); + if (ret) + goto err; + + bkey_init(&k->k); + k->k.type = KEY_TYPE_whiteout; + k->k.p = iter.pos; + ret = bch2_trans_update(trans, &iter, k, BTREE_UPDATE_internal_snapshot_node); + } +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode) { struct bch_fs *c = trans->c; - struct bch_hash_info dir_hash; struct bch_inode_unpacked lostfound; char name_buf[20]; - struct qstr name; - u64 dir_offset = 0; - u32 dirent_snapshot = inode->bi_snapshot; int ret; + u32 dirent_snapshot = inode->bi_snapshot; if (inode->bi_subvol) { inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL; @@ -367,9 +404,10 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked * if (ret) return ret; - dir_hash = bch2_hash_info_init(c, &lostfound); + struct bch_hash_info dir_hash = bch2_hash_info_init(c, &lostfound); + struct qstr name = (struct qstr) QSTR(name_buf); - name = (struct qstr) QSTR(name_buf); + inode->bi_dir = lostfound.bi_inum; ret = bch2_dirent_create_snapshot(trans, inode->bi_parent_subvol, lostfound.bi_inum, @@ -378,17 +416,70 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked * inode_d_type(inode), &name, inode->bi_subvol ?: inode->bi_inum, - &dir_offset, + &inode->bi_dir_offset, STR_HASH_must_create); if (ret) { bch_err_msg(c, ret, "error creating dirent"); return ret; } - inode->bi_dir = lostfound.bi_inum; - inode->bi_dir_offset = dir_offset; + ret = __bch2_fsck_write_inode(trans, inode); + if (ret) + return ret; - return __bch2_fsck_write_inode(trans, inode); + /* + * Fix up inodes in child snapshots: if they should also be reattached + * update the backpointer field, if they should not be we need to emit + * whiteouts for the dirent we just created. + */ + if (!inode->bi_subvol && bch2_snapshot_is_leaf(c, inode->bi_snapshot) <= 0) { + snapshot_id_list whiteouts_done; + struct btree_iter iter; + struct bkey_s_c k; + + darray_init(&whiteouts_done); + + for_each_btree_key_reverse_norestart(trans, iter, + BTREE_ID_inodes, SPOS(0, inode->bi_inum, inode->bi_snapshot - 1), + BTREE_ITER_all_snapshots|BTREE_ITER_intent, k, ret) { + if (k.k->p.offset != inode->bi_inum) + break; + + if (!bkey_is_inode(k.k) || + !bch2_snapshot_is_ancestor(c, k.k->p.snapshot, inode->bi_snapshot) || + snapshot_list_has_ancestor(c, &whiteouts_done, k.k->p.snapshot)) + continue; + + struct bch_inode_unpacked child_inode; + bch2_inode_unpack(k, &child_inode); + + if (!inode_should_reattach(&child_inode)) { + ret = maybe_delete_dirent(trans, + SPOS(lostfound.bi_inum, inode->bi_dir_offset, + dirent_snapshot), + k.k->p.snapshot); + if (ret) + break; + + ret = snapshot_list_add(c, &whiteouts_done, k.k->p.snapshot); + if (ret) + break; + } else { + iter.snapshot = k.k->p.snapshot; + child_inode.bi_dir = inode->bi_dir; + child_inode.bi_dir_offset = inode->bi_dir_offset; + + ret = bch2_inode_write_flags(trans, &iter, &child_inode, + BTREE_UPDATE_internal_snapshot_node); + if (ret) + break; + } + } + darray_exit(&whiteouts_done); + bch2_trans_iter_exit(trans, &iter); + } + + return ret; } static int remove_backpointer(struct btree_trans *trans, @@ -994,7 +1085,6 @@ static int check_inode_dirent_inode(struct btree_trans *trans, */ inode->bi_dir = 0; inode->bi_dir_offset = 0; - inode->bi_flags &= ~BCH_INODE_backptr_untrusted; *write_inode = true; } @@ -1006,28 +1096,11 @@ fsck_err: return ret; } -static bool bch2_inode_is_open(struct bch_fs *c, struct bpos p) -{ - subvol_inum inum = { - .subvol = snapshot_t(c, p.snapshot)->subvol, - .inum = p.offset, - }; - - /* snapshot tree corruption, can't safely delete */ - if (!inum.subvol) { - bch_warn_ratelimited(c, "%s(): snapshot %u has no subvol, unlinked but can't safely delete", __func__, p.snapshot); - return true; - } - - return __bch2_inode_hash_find(c, inum) != NULL; -} - static int check_inode(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, struct bch_inode_unpacked *prev, - struct snapshots_seen *s, - bool full) + struct snapshots_seen *s) { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; @@ -1050,12 +1123,6 @@ static int check_inode(struct btree_trans *trans, BUG_ON(bch2_inode_unpack(k, &u)); - if (!full && - !(u.bi_flags & (BCH_INODE_i_size_dirty| - BCH_INODE_i_sectors_dirty| - BCH_INODE_unlinked))) - return 0; - if (prev->bi_inum != u.bi_inum) *prev = u; @@ -1101,28 +1168,27 @@ static int check_inode(struct btree_trans *trans, ret = 0; } - if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) && - bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) { - struct bpos new_min_pos; + ret = bch2_inode_has_child_snapshots(trans, k.k->p); + if (ret < 0) + goto err; - ret = bch2_propagate_key_to_snapshot_leaves(trans, iter->btree_id, k, &new_min_pos); + if (fsck_err_on(ret != !!(u.bi_flags & BCH_INODE_has_child_snapshot), + trans, inode_has_child_snapshots_wrong, + "inode has_child_snapshots flag wrong (should be %u)\n%s", + ret, + (printbuf_reset(&buf), + bch2_inode_unpacked_to_text(&buf, &u), + buf.buf))) { if (ret) - goto err; - - u.bi_flags &= ~BCH_INODE_i_size_dirty|BCH_INODE_unlinked; - - ret = __bch2_fsck_write_inode(trans, &u); - - bch_err_msg(c, ret, "in fsck updating inode"); - if (ret) - goto err_noprint; - - if (!bpos_eq(new_min_pos, POS_MIN)) - bch2_btree_iter_set_pos(iter, bpos_predecessor(new_min_pos)); - goto err_noprint; + u.bi_flags |= BCH_INODE_has_child_snapshot; + else + u.bi_flags &= ~BCH_INODE_has_child_snapshot; + do_update = true; } + ret = 0; - if (u.bi_flags & BCH_INODE_unlinked) { + if ((u.bi_flags & BCH_INODE_unlinked) && + !(u.bi_flags & BCH_INODE_has_child_snapshot)) { if (!test_bit(BCH_FS_started, &c->flags)) { /* * If we're not in online fsck, don't delete unlinked @@ -1147,7 +1213,11 @@ static int check_inode(struct btree_trans *trans, if (ret) goto err; } else { - if (fsck_err_on(!bch2_inode_is_open(c, k.k->p), + ret = bch2_inode_or_descendents_is_open(trans, k.k->p); + if (ret < 0) + goto err; + + if (fsck_err_on(!ret, trans, inode_unlinked_and_not_open, "inode %llu%u unlinked and not open", u.bi_inum, u.bi_snapshot)) { @@ -1155,69 +1225,10 @@ static int check_inode(struct btree_trans *trans, bch_err_msg(c, ret, "in fsck deleting inode"); goto err_noprint; } + ret = 0; } } - /* i_size_dirty is vestigal, since we now have logged ops for truncate * */ - if (u.bi_flags & BCH_INODE_i_size_dirty && - (!test_bit(BCH_FS_clean_recovery, &c->flags) || - fsck_err(trans, inode_i_size_dirty_but_clean, - "filesystem marked clean, but inode %llu has i_size dirty", - u.bi_inum))) { - bch_verbose(c, "truncating inode %llu", u.bi_inum); - - /* - * XXX: need to truncate partial blocks too here - or ideally - * just switch units to bytes and that issue goes away - */ - ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, - SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9, - iter->pos.snapshot), - POS(u.bi_inum, U64_MAX), - 0, NULL); - bch_err_msg(c, ret, "in fsck truncating inode"); - if (ret) - return ret; - - /* - * We truncated without our normal sector accounting hook, just - * make sure we recalculate it: - */ - u.bi_flags |= BCH_INODE_i_sectors_dirty; - - u.bi_flags &= ~BCH_INODE_i_size_dirty; - do_update = true; - } - - /* i_sectors_dirty is vestigal, i_sectors is always updated transactionally */ - if (u.bi_flags & BCH_INODE_i_sectors_dirty && - (!test_bit(BCH_FS_clean_recovery, &c->flags) || - fsck_err(trans, inode_i_sectors_dirty_but_clean, - "filesystem marked clean, but inode %llu has i_sectors dirty", - u.bi_inum))) { - s64 sectors; - - bch_verbose(c, "recounting sectors for inode %llu", - u.bi_inum); - - sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot); - if (sectors < 0) { - bch_err_msg(c, sectors, "in fsck recounting inode sectors"); - return sectors; - } - - u.bi_sectors = sectors; - u.bi_flags &= ~BCH_INODE_i_sectors_dirty; - do_update = true; - } - - if (u.bi_flags & BCH_INODE_backptr_untrusted) { - u.bi_dir = 0; - u.bi_dir_offset = 0; - u.bi_flags &= ~BCH_INODE_backptr_untrusted; - do_update = true; - } - if (fsck_err_on(u.bi_parent_subvol && (u.bi_subvol == 0 || u.bi_subvol == BCACHEFS_ROOT_SUBVOL), @@ -1274,7 +1285,6 @@ err_noprint: int bch2_check_inodes(struct bch_fs *c) { - bool full = c->opts.fsck; struct bch_inode_unpacked prev = { 0 }; struct snapshots_seen s; @@ -1285,13 +1295,104 @@ int bch2_check_inodes(struct bch_fs *c) POS_MIN, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_inode(trans, &iter, k, &prev, &s, full))); + check_inode(trans, &iter, k, &prev, &s))); snapshots_seen_exit(&s); bch_err_fn(c, ret); return ret; } +static int find_oldest_inode_needs_reattach(struct btree_trans *trans, + struct bch_inode_unpacked *inode) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + /* + * We look for inodes to reattach in natural key order, leaves first, + * but we should do the reattach at the oldest version that needs to be + * reattached: + */ + for_each_btree_key_norestart(trans, iter, + BTREE_ID_inodes, + SPOS(0, inode->bi_inum, inode->bi_snapshot + 1), + BTREE_ITER_all_snapshots, k, ret) { + if (k.k->p.offset != inode->bi_inum) + break; + + if (!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, k.k->p.snapshot)) + continue; + + if (!bkey_is_inode(k.k)) + break; + + struct bch_inode_unpacked parent_inode; + bch2_inode_unpack(k, &parent_inode); + + if (!inode_should_reattach(&parent_inode)) + break; + + *inode = parent_inode; + } + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + +static int check_unreachable_inode(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct printbuf buf = PRINTBUF; + int ret = 0; + + if (!bkey_is_inode(k.k)) + return 0; + + struct bch_inode_unpacked inode; + BUG_ON(bch2_inode_unpack(k, &inode)); + + if (!inode_should_reattach(&inode)) + return 0; + + ret = find_oldest_inode_needs_reattach(trans, &inode); + if (ret) + return ret; + + if (fsck_err(trans, inode_unreachable, + "unreachable inode:\n%s", + (bch2_inode_unpacked_to_text(&buf, &inode), + buf.buf))) + ret = reattach_inode(trans, &inode); +fsck_err: + printbuf_exit(&buf); + return ret; +} + +/* + * Reattach unreachable (but not unlinked) inodes + * + * Run after check_inodes() and check_dirents(), so we node that inode + * backpointer fields point to valid dirents, and every inode that has a dirent + * that points to it has its backpointer field set - so we're just looking for + * non-unlinked inodes without backpointers: + * + * XXX: this is racy w.r.t. hardlink removal in online fsck + */ +int bch2_check_unreachable_inodes(struct bch_fs *c) +{ + int ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, + POS_MIN, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + check_unreachable_inode(trans, &iter, k))); + bch_err_fn(c, ret); + return ret; +} + static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode) { switch (btree) { @@ -1694,8 +1795,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot)) continue; - if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) && - k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 && + if (fsck_err_on(k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 && !bkey_extent_is_reservation(k), trans, extent_past_end_of_inode, "extent type past end of inode %llu:%u, i_size %llu\n %s", @@ -2450,22 +2550,6 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, if (ret) break; - /* - * We've checked that inode backpointers point to valid dirents; - * here, it's sufficient to check that the subvolume root has a - * dirent: - */ - if (fsck_err_on(!subvol_root.bi_dir, - trans, subvol_unreachable, - "unreachable subvolume %s", - (bch2_bkey_val_to_text(&buf, c, s.s_c), - prt_newline(&buf), - bch2_inode_unpacked_to_text(&buf, &subvol_root), - buf.buf))) { - ret = reattach_subvol(trans, s); - break; - } - u32 parent = le32_to_cpu(s.v->fs_path_parent); if (darray_u32_has(&subvol_path, parent)) { @@ -2526,12 +2610,6 @@ static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) return false; } -/* - * Check that a given inode is reachable from its subvolume root - we already - * verified subvolume connectivity: - * - * XXX: we should also be verifying that inodes are in the right subvolumes - */ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c inode_k) { struct bch_fs *c = trans->c; @@ -2545,6 +2623,9 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino BUG_ON(bch2_inode_unpack(inode_k, &inode)); + if (!S_ISDIR(inode.bi_mode)) + return 0; + while (!inode.bi_subvol) { struct btree_iter dirent_iter; struct bkey_s_c_dirent d; @@ -2559,21 +2640,15 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino bch2_trans_iter_exit(trans, &dirent_iter); if (bch2_err_matches(ret, ENOENT)) { - ret = 0; - if (fsck_err(trans, inode_unreachable, - "unreachable inode\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, inode_k), - buf.buf))) - ret = reattach_inode(trans, &inode); + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, inode_k); + bch_err(c, "unreachable inode in check_directory_structure: %s\n%s", + bch2_err_str(ret), buf.buf); goto out; } bch2_trans_iter_exit(trans, &dirent_iter); - if (!S_ISDIR(inode.bi_mode)) - break; - ret = darray_push(p, ((struct pathbuf_entry) { .inum = inode.bi_inum, .snapshot = snapshot, @@ -2626,9 +2701,8 @@ fsck_err: } /* - * Check for unreachable inodes, as well as loops in the directory structure: - * After bch2_check_dirents(), if an inode backpointer doesn't exist that means it's - * unreachable: + * Check for loops in the directory structure: all other connectivity issues + * have been fixed by prior passes */ int bch2_check_directory_structure(struct bch_fs *c) { @@ -2756,6 +2830,10 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, if (S_ISDIR(u.bi_mode)) continue; + /* + * Previous passes ensured that bi_nlink is nonzero if + * it had multiple hardlinks: + */ if (!u.bi_nlink) continue; diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h index a4ef94271784..1cca31011530 100644 --- a/fs/bcachefs/fsck.h +++ b/fs/bcachefs/fsck.h @@ -9,6 +9,7 @@ int bch2_check_dirents(struct bch_fs *); int bch2_check_xattrs(struct bch_fs *); int bch2_check_root(struct bch_fs *); int bch2_check_subvolume_structure(struct bch_fs *); +int bch2_check_unreachable_inodes(struct bch_fs *); int bch2_check_directory_structure(struct bch_fs *); int bch2_check_nlinks(struct bch_fs *); int bch2_fix_reflink_p(struct bch_fs *); diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 3e5bc01961b8..344ccb7a824c 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -12,6 +12,7 @@ #include "error.h" #include "extents.h" #include "extent_update.h" +#include "fs.h" #include "inode.h" #include "str_hash.h" #include "snapshot.h" @@ -34,6 +35,8 @@ static const char * const bch2_inode_flag_strs[] = { }; #undef x +static int delete_ancestor_snapshot_inodes(struct btree_trans *, struct bpos); + static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 }; static int inode_decode_field(const u8 *in, const u8 *end, @@ -575,9 +578,137 @@ static inline u64 bkey_inode_flags(struct bkey_s_c k) } } -static inline bool bkey_is_deleted_inode(struct bkey_s_c k) +static inline void bkey_inode_flags_set(struct bkey_s k, u64 f) { - return bkey_inode_flags(k) & BCH_INODE_unlinked; + switch (k.k->type) { + case KEY_TYPE_inode: + bkey_s_to_inode(k).v->bi_flags = cpu_to_le32(f); + return; + case KEY_TYPE_inode_v2: + bkey_s_to_inode_v2(k).v->bi_flags = cpu_to_le64(f); + return; + case KEY_TYPE_inode_v3: + bkey_s_to_inode_v3(k).v->bi_flags = cpu_to_le64(f); + return; + default: + BUG(); + } +} + +static inline bool bkey_is_unlinked_inode(struct bkey_s_c k) +{ + unsigned f = bkey_inode_flags(k) & BCH_INODE_unlinked; + + return (f & BCH_INODE_unlinked) && !(f & BCH_INODE_has_child_snapshot); +} + +static struct bkey_s_c +bch2_bkey_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter, + enum btree_id btree, struct bpos pos, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c k; + int ret = 0; + + for_each_btree_key_upto_norestart(trans, *iter, btree, + bpos_successor(pos), + SPOS(pos.inode, pos.offset, U32_MAX), + flags|BTREE_ITER_all_snapshots, k, ret) + if (bch2_snapshot_is_ancestor(c, pos.snapshot, k.k->p.snapshot)) + return k; + + bch2_trans_iter_exit(trans, iter); + return ret ? bkey_s_c_err(ret) : bkey_s_c_null; +} + +static struct bkey_s_c +bch2_inode_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter, + struct bpos pos, unsigned flags) +{ + struct bkey_s_c k; +again: + k = bch2_bkey_get_iter_snapshot_parent(trans, iter, BTREE_ID_inodes, pos, flags); + if (!k.k || + bkey_err(k) || + bkey_is_inode(k.k)) + return k; + + bch2_trans_iter_exit(trans, iter); + pos = k.k->p; + goto again; +} + +int __bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + for_each_btree_key_upto_norestart(trans, iter, + BTREE_ID_inodes, POS(0, pos.offset), bpos_predecessor(pos), + BTREE_ITER_all_snapshots| + BTREE_ITER_with_updates, k, ret) + if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot) && + bkey_is_inode(k.k)) { + ret = 1; + break; + } + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int update_inode_has_children(struct btree_trans *trans, + struct bkey_s k, + bool have_child) +{ + if (!have_child) { + int ret = bch2_inode_has_child_snapshots(trans, k.k->p); + if (ret) + return ret < 0 ? ret : 0; + } + + u64 f = bkey_inode_flags(k.s_c); + if (have_child != !!(f & BCH_INODE_has_child_snapshot)) + bkey_inode_flags_set(k, f ^ BCH_INODE_has_child_snapshot); + + return 0; +} + +static int update_parent_inode_has_children(struct btree_trans *trans, struct bpos pos, + bool have_child) +{ + struct btree_iter iter; + struct bkey_s_c k = bch2_inode_get_iter_snapshot_parent(trans, + &iter, pos, BTREE_ITER_with_updates); + int ret = bkey_err(k); + if (ret) + return ret; + if (!k.k) + return 0; + + if (!have_child) { + ret = bch2_inode_has_child_snapshots(trans, k.k->p); + if (ret) { + ret = ret < 0 ? ret : 0; + goto err; + } + } + + u64 f = bkey_inode_flags(k); + if (have_child != !!(f & BCH_INODE_has_child_snapshot)) { + struct bkey_i *update = bch2_bkey_make_mut(trans, &iter, &k, + BTREE_UPDATE_internal_snapshot_node); + ret = PTR_ERR_OR_ZERO(update); + if (ret) + goto err; + + bkey_inode_flags_set(bkey_i_to_s(update), f ^ BCH_INODE_has_child_snapshot); + } +err: + bch2_trans_iter_exit(trans, &iter); + return ret; } int bch2_trigger_inode(struct btree_trans *trans, @@ -586,6 +717,8 @@ int bch2_trigger_inode(struct btree_trans *trans, struct bkey_s new, enum btree_iter_update_trigger_flags flags) { + struct bch_fs *c = trans->c; + if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) { BUG_ON(!trans->journal_res.seq); bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq); @@ -599,13 +732,41 @@ int bch2_trigger_inode(struct btree_trans *trans, return ret; } - int deleted_delta = (int) bkey_is_deleted_inode(new.s_c) - - (int) bkey_is_deleted_inode(old); - if ((flags & BTREE_TRIGGER_transactional) && deleted_delta) { - int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, - new.k->p, deleted_delta > 0); - if (ret) - return ret; + if (flags & BTREE_TRIGGER_transactional) { + int unlinked_delta = (int) bkey_is_unlinked_inode(new.s_c) - + (int) bkey_is_unlinked_inode(old); + if (unlinked_delta) { + int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, + new.k->p, unlinked_delta > 0); + if (ret) + return ret; + } + + /* + * If we're creating or deleting an inode at this snapshot ID, + * and there might be an inode in a parent snapshot ID, we might + * need to set or clear the has_child_snapshot flag on the + * parent. + */ + int deleted_delta = (int) bkey_is_inode(new.k) - + (int) bkey_is_inode(old.k); + if (deleted_delta && + bch2_snapshot_parent(c, new.k->p.snapshot)) { + int ret = update_parent_inode_has_children(trans, new.k->p, + deleted_delta > 0); + if (ret) + return ret; + } + + /* + * When an inode is first updated in a new snapshot, we may need + * to clear has_child_snapshot + */ + if (deleted_delta > 0) { + int ret = update_inode_has_children(trans, new, false); + if (ret) + return ret; + } } return 0; @@ -888,6 +1049,11 @@ err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; + if (ret) + goto err2; + + ret = delete_ancestor_snapshot_inodes(trans, SPOS(0, inum.inum, snapshot)); +err2: bch2_trans_put(trans); return ret; } @@ -992,7 +1158,7 @@ int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_i return 0; } -int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) +static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) { struct bch_fs *c = trans->c; struct btree_iter iter = { NULL }; @@ -1055,6 +1221,45 @@ err: return ret ?: -BCH_ERR_transaction_restart_nested; } +/* + * After deleting an inode, there may be versions in older snapshots that should + * also be deleted - if they're not referenced by sibling snapshots and not open + * in other subvolumes: + */ +static int delete_ancestor_snapshot_inodes(struct btree_trans *trans, struct bpos pos) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; +next_parent: + ret = lockrestart_do(trans, + bkey_err(k = bch2_inode_get_iter_snapshot_parent(trans, &iter, pos, 0))); + if (ret || !k.k) + return ret; + + bool unlinked = bkey_is_unlinked_inode(k); + pos = k.k->p; + bch2_trans_iter_exit(trans, &iter); + + if (!unlinked) + return 0; + + ret = lockrestart_do(trans, bch2_inode_or_descendents_is_open(trans, pos)); + if (ret) + return ret < 0 ? ret : 0; + + ret = __bch2_inode_rm_snapshot(trans, pos.offset, pos.snapshot); + if (ret) + return ret; + goto next_parent; +} + +int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) +{ + return __bch2_inode_rm_snapshot(trans, inum, snapshot) ?: + delete_ancestor_snapshot_inodes(trans, SPOS(0, inum, snapshot)); +} + static int may_delete_deleted_inode(struct btree_trans *trans, struct btree_iter *iter, struct bpos pos, @@ -1064,6 +1269,7 @@ static int may_delete_deleted_inode(struct btree_trans *trans, struct btree_iter inode_iter; struct bkey_s_c k; struct bch_inode_unpacked inode; + struct printbuf buf = PRINTBUF; int ret; k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_cached); @@ -1099,6 +1305,31 @@ static int may_delete_deleted_inode(struct btree_trans *trans, pos.offset, pos.snapshot)) goto delete; + if (fsck_err_on(inode.bi_flags & BCH_INODE_has_child_snapshot, + trans, deleted_inode_has_child_snapshots, + "inode with child snapshots %llu:%u in deleted_inodes btree", + pos.offset, pos.snapshot)) + goto delete; + + ret = bch2_inode_has_child_snapshots(trans, k.k->p); + if (ret < 0) + goto out; + + if (ret) { + if (fsck_err(trans, inode_has_child_snapshots_wrong, + "inode has_child_snapshots flag wrong (should be set)\n%s", + (printbuf_reset(&buf), + bch2_inode_unpacked_to_text(&buf, &inode), + buf.buf))) { + inode.bi_flags |= BCH_INODE_has_child_snapshot; + ret = __bch2_fsck_write_inode(trans, &inode); + if (ret) + goto out; + } + goto delete; + + } + if (test_bit(BCH_FS_clean_recovery, &c->flags) && !fsck_err(trans, deleted_inode_but_clean, "filesystem marked as clean but have deleted inode %llu:%u", @@ -1107,33 +1338,11 @@ static int may_delete_deleted_inode(struct btree_trans *trans, goto out; } - if (bch2_snapshot_is_internal_node(c, pos.snapshot)) { - struct bpos new_min_pos; - - ret = bch2_propagate_key_to_snapshot_leaves(trans, inode_iter.btree_id, k, &new_min_pos); - if (ret) - goto out; - - inode.bi_flags &= ~BCH_INODE_unlinked; - - ret = bch2_inode_write_flags(trans, &inode_iter, &inode, - BTREE_UPDATE_internal_snapshot_node); - bch_err_msg(c, ret, "clearing inode unlinked flag"); - if (ret) - goto out; - - /* - * We'll need another write buffer flush to pick up the new - * unlinked inodes in the snapshot leaves: - */ - *need_another_pass = true; - goto out; - } - ret = 1; out: fsck_err: bch2_trans_iter_exit(trans, &inode_iter); + printbuf_exit(&buf); return ret; delete: ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, pos, false); diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 9c1f67705684..c8e98443e2d4 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -5,6 +5,7 @@ #include "bkey.h" #include "bkey_methods.h" #include "opts.h" +#include "snapshot.h" enum bch_validate_flags; extern const char * const bch2_inode_opts[]; @@ -17,6 +18,15 @@ int bch2_inode_v3_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +int __bch2_inode_has_child_snapshots(struct btree_trans *, struct bpos); + +static inline int bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos) +{ + return bch2_snapshot_is_leaf(trans->c, pos.snapshot) <= 0 + ? __bch2_inode_has_child_snapshots(trans, pos) + : 0; +} + int bch2_trigger_inode(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, enum btree_iter_update_trigger_flags); diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h index 83d107331edf..a204e46b6b47 100644 --- a/fs/bcachefs/inode_format.h +++ b/fs/bcachefs/inode_format.h @@ -133,7 +133,8 @@ enum inode_opt_id { x(i_size_dirty, 5) \ x(i_sectors_dirty, 6) \ x(unlinked, 7) \ - x(backptr_untrusted, 8) + x(backptr_untrusted, 8) \ + x(has_child_snapshot, 9) /* bits 20+ reserved for packed fields below: */ diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index f5f7db50ca31..dc099f06341f 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -603,6 +603,19 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, { int ret; + if (closure_wait_event_timeout(&j->async_wait, + (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || + (flags & JOURNAL_RES_GET_NONBLOCK), + HZ * 10)) + return ret; + + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct printbuf buf = PRINTBUF; + bch2_journal_debug_to_text(&buf, j); + bch_err(c, "Journal stuck? Waited for 10 seconds...\n%s", + buf.buf); + printbuf_exit(&buf); + closure_wait_event(&j->async_wait, (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || (flags & JOURNAL_RES_GET_NONBLOCK)); diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 232be8a44051..84097235eea9 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -427,7 +427,9 @@ void bch2_opt_to_text(struct printbuf *out, prt_printf(out, "%lli", v); break; case BCH_OPT_STR: - if (flags & OPT_SHOW_FULL_LIST) + if (v < opt->min || v >= opt->max - 1) + prt_printf(out, "(invalid option %lli)", v); + else if (flags & OPT_SHOW_FULL_LIST) prt_string_option(out, opt->choices, v); else prt_str(out, opt->choices[v]); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 6db72d3bad7d..55e1504a8130 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -287,7 +287,8 @@ int bch2_journal_replay(struct bch_fs *c) BCH_TRANS_COMMIT_no_enospc| BCH_TRANS_COMMIT_journal_reclaim| BCH_TRANS_COMMIT_skip_accounting_apply| - BCH_TRANS_COMMIT_no_journal_res, + BCH_TRANS_COMMIT_no_journal_res| + BCH_WATERMARK_reclaim, bch2_journal_replay_accounting_key(trans, k)); if (bch2_fs_fatal_err_on(ret, c, "error replaying accounting; %s", bch2_err_str(ret))) goto err; diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h index 50406ce0e4ef..9d96c06e365c 100644 --- a/fs/bcachefs/recovery_passes_types.h +++ b/fs/bcachefs/recovery_passes_types.h @@ -46,6 +46,7 @@ x(check_dirents, 27, PASS_FSCK) \ x(check_xattrs, 28, PASS_FSCK) \ x(check_root, 29, PASS_ONLINE|PASS_FSCK) \ + x(check_unreachable_inodes, 40, PASS_ONLINE|PASS_FSCK) \ x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \ x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ x(check_nlinks, 31, PASS_FSCK) \ diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index bcb3276747e0..477ef0997949 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -66,9 +66,9 @@ void bch2_replicas_entry_to_text(struct printbuf *out, prt_printf(out, "]"); } -static int bch2_replicas_entry_validate_locked(struct bch_replicas_entry_v1 *r, - struct bch_sb *sb, - struct printbuf *err) +static int bch2_replicas_entry_sb_validate(struct bch_replicas_entry_v1 *r, + struct bch_sb *sb, + struct printbuf *err) { if (!r->nr_devs) { prt_printf(err, "no devices in entry "); @@ -98,10 +98,28 @@ int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r, struct bch_fs *c, struct printbuf *err) { - mutex_lock(&c->sb_lock); - int ret = bch2_replicas_entry_validate_locked(r, c->disk_sb.sb, err); - mutex_unlock(&c->sb_lock); - return ret; + if (!r->nr_devs) { + prt_printf(err, "no devices in entry "); + goto bad; + } + + if (r->nr_required > 1 && + r->nr_required >= r->nr_devs) { + prt_printf(err, "bad nr_required in entry "); + goto bad; + } + + for (unsigned i = 0; i < r->nr_devs; i++) + if (r->devs[i] != BCH_SB_MEMBER_INVALID && + !bch2_dev_exists(c, r->devs[i])) { + prt_printf(err, "invalid device %u in entry ", r->devs[i]); + goto bad; + } + + return 0; +bad: + bch2_replicas_entry_to_text(err, r); + return -BCH_ERR_invalid_replicas_entry; } void bch2_cpu_replicas_to_text(struct printbuf *out, @@ -686,7 +704,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, struct bch_replicas_entry_v1 *e = cpu_replicas_entry(cpu_r, i); - int ret = bch2_replicas_entry_validate_locked(e, sb, err); + int ret = bch2_replicas_entry_sb_validate(e, sb, err); if (ret) return ret; @@ -803,6 +821,11 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, rcu_read_lock(); for (unsigned i = 0; i < e->nr_devs; i++) { + if (e->devs[i] == BCH_SB_MEMBER_INVALID) { + nr_failed++; + continue; + } + nr_online += test_bit(e->devs[i], devs.d); struct bch_dev *ca = bch2_dev_rcu_noerror(c, e->devs[i]); diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 5102059a0f1d..ae715ff658e8 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -78,7 +78,10 @@ BCH_FSCK_ERR_accounting_mismatch) \ x(rebalance_work_acct_fix, \ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ - BCH_FSCK_ERR_accounting_mismatch) + BCH_FSCK_ERR_accounting_mismatch) \ + x(inode_has_child_snapshots, \ + BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \ + BCH_FSCK_ERR_inode_has_child_snapshots_wrong) #define DOWNGRADE_TABLE() \ x(bucket_stripe_sectors, \ diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 4135b1ea2fec..aab328ac6dfa 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -180,6 +180,7 @@ enum bch_fsck_flags { x(reflink_p_to_missing_reflink_v, 166, 0) \ x(stripe_pos_bad, 167, 0) \ x(stripe_val_size_bad, 168, 0) \ + x(stripe_csum_granularity_bad, 290, 0) \ x(stripe_sector_count_wrong, 169, 0) \ x(snapshot_tree_pos_bad, 170, 0) \ x(snapshot_tree_to_missing_snapshot, 171, 0) \ @@ -225,11 +226,13 @@ enum bch_fsck_flags { x(inode_multiple_links_but_nlink_0, 207, FSCK_AUTOFIX) \ x(inode_wrong_backpointer, 208, FSCK_AUTOFIX) \ x(inode_wrong_nlink, 209, FSCK_AUTOFIX) \ + x(inode_has_child_snapshots_wrong, 287, 0) \ x(inode_unreachable, 210, FSCK_AUTOFIX) \ x(deleted_inode_but_clean, 211, FSCK_AUTOFIX) \ x(deleted_inode_missing, 212, FSCK_AUTOFIX) \ x(deleted_inode_is_dir, 213, FSCK_AUTOFIX) \ x(deleted_inode_not_unlinked, 214, FSCK_AUTOFIX) \ + x(deleted_inode_has_child_snapshots, 288, FSCK_AUTOFIX) \ x(extent_overlapping, 215, 0) \ x(key_in_missing_inode, 216, 0) \ x(key_in_wrong_inode_type, 217, 0) \ @@ -289,6 +292,7 @@ enum bch_fsck_flags { x(alloc_key_stripe_sectors_wrong, 271, FSCK_AUTOFIX) \ x(accounting_mismatch, 272, FSCK_AUTOFIX) \ x(accounting_replicas_not_marked, 273, 0) \ + x(accounting_to_invalid_device, 289, 0) \ x(invalid_btree_id, 274, 0) \ x(alloc_key_io_time_bad, 275, 0) \ x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX) \ @@ -298,7 +302,7 @@ enum bch_fsck_flags { x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \ x(accounting_key_version_0, 282, FSCK_AUTOFIX) \ x(logged_op_but_clean, 283, FSCK_AUTOFIX) \ - x(MAX, 287, 0) + x(MAX, 291, 0) enum bch_sb_error_id { #define x(t, n, ...) BCH_FSCK_ERR_##t = n, diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index 02bcde3c1b02..fb08dd680dac 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -163,6 +163,11 @@ static int validate_member(struct printbuf *err, return -BCH_ERR_invalid_sb_members; } + if (m.btree_bitmap_shift >= 64) { + prt_printf(err, "device %u: invalid btree_bitmap_shift %u", i, m.btree_bitmap_shift); + return -BCH_ERR_invalid_sb_members; + } + return 0; } @@ -247,7 +252,10 @@ static void member_to_text(struct printbuf *out, prt_newline(out); prt_printf(out, "Btree allocated bitmap blocksize:\t"); - prt_units_u64(out, 1ULL << m.btree_bitmap_shift); + if (m.btree_bitmap_shift < 64) + prt_units_u64(out, 1ULL << m.btree_bitmap_shift); + else + prt_printf(out, "(invalid shift %u)", m.btree_bitmap_shift); prt_newline(out); prt_printf(out, "Btree allocated bitmap:\t"); diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 1809442b00ee..ae57638506c3 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -905,12 +905,30 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id) if (bch2_snapshot_equiv(c, id)) return 0; - /* 0 is an invalid tree ID */ + /* Do we need to reconstruct the snapshot_tree entry as well? */ + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; u32 tree_id = 0; - int ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id); + + for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshot_trees, POS_MIN, + 0, k, ret) { + if (le32_to_cpu(bkey_s_c_to_snapshot_tree(k).v->root_snapshot) == id) { + tree_id = k.k->p.offset; + break; + } + } + bch2_trans_iter_exit(trans, &iter); + if (ret) return ret; + if (!tree_id) { + ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id); + if (ret) + return ret; + } + struct bkey_i_snapshot *snapshot = bch2_trans_kmalloc(trans, sizeof(*snapshot)); ret = PTR_ERR_OR_ZERO(snapshot); if (ret) @@ -921,6 +939,16 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id) snapshot->v.tree = cpu_to_le32(tree_id); snapshot->v.btime.lo = cpu_to_le64(bch2_current_time(c)); + for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, + 0, k, ret) { + if (le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot) == id) { + snapshot->v.subvol = cpu_to_le32(k.k->p.offset); + SET_BCH_SNAPSHOT_SUBVOL(&snapshot->v, true); + break; + } + } + bch2_trans_iter_exit(trans, &iter); + return bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?: bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0) ?: @@ -1732,103 +1760,6 @@ int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans, return ret; } -static u32 bch2_snapshot_smallest_child(struct bch_fs *c, u32 id) -{ - const struct snapshot_t *s = snapshot_t(c, id); - - return s->children[1] ?: s->children[0]; -} - -static u32 bch2_snapshot_smallest_descendent(struct bch_fs *c, u32 id) -{ - u32 child; - - while ((child = bch2_snapshot_smallest_child(c, id))) - id = child; - return id; -} - -static int bch2_propagate_key_to_snapshot_leaf(struct btree_trans *trans, - enum btree_id btree, - struct bkey_s_c interior_k, - u32 leaf_id, struct bpos *new_min_pos) -{ - struct btree_iter iter; - struct bpos pos = interior_k.k->p; - struct bkey_s_c k; - struct bkey_i *new; - int ret; - - pos.snapshot = leaf_id; - - bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent); - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - goto out; - - /* key already overwritten in this snapshot? */ - if (k.k->p.snapshot != interior_k.k->p.snapshot) - goto out; - - if (bpos_eq(*new_min_pos, POS_MIN)) { - *new_min_pos = k.k->p; - new_min_pos->snapshot = leaf_id; - } - - new = bch2_bkey_make_mut_noupdate(trans, interior_k); - ret = PTR_ERR_OR_ZERO(new); - if (ret) - goto out; - - new->k.p.snapshot = leaf_id; - ret = bch2_trans_update(trans, &iter, new, 0); -out: - bch2_set_btree_iter_dontneed(&iter); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *trans, - enum btree_id btree, - struct bkey_s_c k, - struct bpos *new_min_pos) -{ - struct bch_fs *c = trans->c; - struct bkey_buf sk; - u32 restart_count = trans->restart_count; - int ret = 0; - - bch2_bkey_buf_init(&sk); - bch2_bkey_buf_reassemble(&sk, c, k); - k = bkey_i_to_s_c(sk.k); - - *new_min_pos = POS_MIN; - - for (u32 id = bch2_snapshot_smallest_descendent(c, k.k->p.snapshot); - id < k.k->p.snapshot; - id++) { - if (!bch2_snapshot_is_ancestor(c, id, k.k->p.snapshot) || - !bch2_snapshot_is_leaf(c, id)) - continue; -again: - ret = btree_trans_too_many_iters(trans) ?: - bch2_propagate_key_to_snapshot_leaf(trans, btree, k, id, new_min_pos) ?: - bch2_trans_commit(trans, NULL, NULL, 0); - if (ret && bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - bch2_trans_begin(trans); - goto again; - } - - if (ret) - break; - } - - bch2_bkey_buf_exit(&sk, c); - - return ret ?: trans_was_restarted(trans, restart_count); -} - static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k) { struct bch_fs *c = trans->c; diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index eb5ef64221d6..29c94716293e 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -259,9 +259,6 @@ static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans, return __bch2_key_has_snapshot_overwrites(trans, id, pos); } -int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *, enum btree_id, - struct bkey_s_c, struct bpos *); - int bch2_snapshots_read(struct bch_fs *); void bch2_fs_snapshots_exit(struct bch_fs *); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 873e4be7e1dc..77d811a539af 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -184,6 +184,7 @@ static DEFINE_MUTEX(bch_fs_list_lock); DECLARE_WAIT_QUEUE_HEAD(bch2_read_only_wait); +static void bch2_dev_unlink(struct bch_dev *); static void bch2_dev_free(struct bch_dev *); static int bch2_dev_alloc(struct bch_fs *, unsigned); static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *); @@ -620,9 +621,7 @@ void __bch2_fs_stop(struct bch_fs *c) up_write(&c->state_lock); for_each_member_device(c, ca) - if (ca->kobj.state_in_sysfs && - ca->disk_sb.bdev) - sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); + bch2_dev_unlink(ca); if (c->kobj.state_in_sysfs) kobject_del(&c->kobj); @@ -1187,9 +1186,7 @@ static void bch2_dev_free(struct bch_dev *ca) { cancel_work_sync(&ca->io_error_work); - if (ca->kobj.state_in_sysfs && - ca->disk_sb.bdev) - sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); + bch2_dev_unlink(ca); if (ca->kobj.state_in_sysfs) kobject_del(&ca->kobj); @@ -1226,10 +1223,7 @@ static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca) percpu_ref_kill(&ca->io_ref); wait_for_completion(&ca->io_ref_completion); - if (ca->kobj.state_in_sysfs) { - sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); - sysfs_remove_link(&ca->kobj, "block"); - } + bch2_dev_unlink(ca); bch2_free_super(&ca->disk_sb); bch2_dev_journal_exit(ca); @@ -1251,6 +1245,26 @@ static void bch2_dev_io_ref_complete(struct percpu_ref *ref) complete(&ca->io_ref_completion); } +static void bch2_dev_unlink(struct bch_dev *ca) +{ + struct kobject *b; + + /* + * This is racy w.r.t. the underlying block device being hot-removed, + * which removes it from sysfs. + * + * It'd be lovely if we had a way to handle this race, but the sysfs + * code doesn't appear to provide a good method and block/holder.c is + * susceptible as well: + */ + if (ca->kobj.state_in_sysfs && + ca->disk_sb.bdev && + (b = bdev_kobj(ca->disk_sb.bdev))->state_in_sysfs) { + sysfs_remove_link(b, "bcachefs"); + sysfs_remove_link(&ca->kobj, "block"); + } +} + static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca) { int ret; diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 32f719b9e661..115b90d29b1d 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -849,6 +849,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, struct btrfs_qgroup_extent_record *qrecord, int action, bool *qrecord_inserted_ret) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_root *delayed_refs; bool qrecord_inserted = false; @@ -859,11 +860,11 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, if (qrecord) { int ret; - ret = btrfs_qgroup_trace_extent_nolock(trans->fs_info, - delayed_refs, qrecord); + ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, qrecord); if (ret) { /* Clean up if insertion fails or item exists. */ - xa_release(&delayed_refs->dirty_extents, qrecord->bytenr); + xa_release(&delayed_refs->dirty_extents, + qrecord->bytenr >> fs_info->sectorsize_bits); /* Caller responsible for freeing qrecord on error. */ if (ret < 0) return ERR_PTR(ret); @@ -873,7 +874,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, } } - trace_add_delayed_ref_head(trans->fs_info, head_ref, action); + trace_add_delayed_ref_head(fs_info, head_ref, action); existing = htree_insert(&delayed_refs->href_root, &head_ref->href_node); @@ -895,8 +896,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, if (head_ref->is_data && head_ref->ref_mod < 0) { delayed_refs->pending_csums += head_ref->num_bytes; trans->delayed_ref_csum_deletions += - btrfs_csum_bytes_to_leaves(trans->fs_info, - head_ref->num_bytes); + btrfs_csum_bytes_to_leaves(fs_info, head_ref->num_bytes); } delayed_refs->num_heads++; delayed_refs->num_heads_ready++; @@ -1030,7 +1030,8 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans, goto free_head_ref; } if (xa_reserve(&trans->transaction->delayed_refs.dirty_extents, - generic_ref->bytenr, GFP_NOFS)) { + generic_ref->bytenr >> fs_info->sectorsize_bits, + GFP_NOFS)) { ret = -ENOMEM; goto free_record; } diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 085f30968aba..352921e76c74 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -202,7 +202,15 @@ struct btrfs_delayed_ref_root { /* head ref rbtree */ struct rb_root_cached href_root; - /* Track dirty extent records. */ + /* + * Track dirty extent records. + * The keys correspond to the logical address of the extent ("bytenr") + * right shifted by fs_info->sectorsize_bits. This is both to get a more + * dense index space (optimizes xarray structure) and because indexes in + * xarrays are of "unsigned long" type, meaning they are 32 bits wide on + * 32 bits platforms, limiting the extent range to 4G which is too low + * and makes it unusable (truncated index values) on 32 bits platforms. + */ struct xarray dirty_extents; /* this spin lock protects the rbtree and the entries inside */ diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index c297909f1506..1332ec59c539 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2005,16 +2005,26 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, struct btrfs_qgroup_extent_record *record) { struct btrfs_qgroup_extent_record *existing, *ret; - unsigned long bytenr = record->bytenr; + const unsigned long index = (record->bytenr >> fs_info->sectorsize_bits); if (!btrfs_qgroup_full_accounting(fs_info)) return 1; +#if BITS_PER_LONG == 32 + if (record->bytenr >= MAX_LFS_FILESIZE) { + btrfs_err_rl(fs_info, +"qgroup record for extent at %llu is beyond 32bit page cache and xarray index limit", + record->bytenr); + btrfs_err_32bit_limit(fs_info); + return -EOVERFLOW; + } +#endif + lockdep_assert_held(&delayed_refs->lock); trace_btrfs_qgroup_trace_extent(fs_info, record); xa_lock(&delayed_refs->dirty_extents); - existing = xa_load(&delayed_refs->dirty_extents, bytenr); + existing = xa_load(&delayed_refs->dirty_extents, index); if (existing) { if (record->data_rsv && !existing->data_rsv) { existing->data_rsv = record->data_rsv; @@ -2024,7 +2034,7 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, return 1; } - ret = __xa_store(&delayed_refs->dirty_extents, record->bytenr, record, GFP_ATOMIC); + ret = __xa_store(&delayed_refs->dirty_extents, index, record, GFP_ATOMIC); xa_unlock(&delayed_refs->dirty_extents); if (xa_is_err(ret)) { qgroup_mark_inconsistent(fs_info); @@ -2129,6 +2139,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_qgroup_extent_record *record; struct btrfs_delayed_ref_root *delayed_refs; + const unsigned long index = (bytenr >> fs_info->sectorsize_bits); int ret; if (!btrfs_qgroup_full_accounting(fs_info) || bytenr == 0 || num_bytes == 0) @@ -2137,7 +2148,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, if (!record) return -ENOMEM; - if (xa_reserve(&trans->transaction->delayed_refs.dirty_extents, bytenr, GFP_NOFS)) { + if (xa_reserve(&trans->transaction->delayed_refs.dirty_extents, index, GFP_NOFS)) { kfree(record); return -ENOMEM; } @@ -2152,7 +2163,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, spin_unlock(&delayed_refs->lock); if (ret) { /* Clean up if insertion fails or item exists. */ - xa_release(&delayed_refs->dirty_extents, record->bytenr); + xa_release(&delayed_refs->dirty_extents, index); kfree(record); return 0; } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 27306d98ec43..b068469871f8 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -7190,13 +7190,11 @@ static int changed_extent(struct send_ctx *sctx, static int changed_verity(struct send_ctx *sctx, enum btrfs_compare_tree_result result) { - int ret = 0; - if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) { if (result == BTRFS_COMPARE_TREE_NEW) sctx->cur_inode_needs_verity = true; } - return ret; + return 0; } static int dir_changed(struct send_ctx *sctx, u64 dir) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index e2ed2a791f8f..9637c7cdc0cf 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1374,7 +1374,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, struct inode *inode = NULL; unsigned long ref_ptr; unsigned long ref_end; - struct fscrypt_str name; + struct fscrypt_str name = { 0 }; int ret; int log_ref_ver = 0; u64 parent_objectid; @@ -1845,7 +1845,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, struct btrfs_dir_item *di, struct btrfs_key *key) { - struct fscrypt_str name; + struct fscrypt_str name = { 0 }; struct btrfs_dir_item *dir_dst_di; struct btrfs_dir_item *index_dst_di; bool dir_dst_matches = false; @@ -2125,7 +2125,7 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, struct extent_buffer *eb; int slot; struct btrfs_dir_item *di; - struct fscrypt_str name; + struct fscrypt_str name = { 0 }; struct inode *inode = NULL; struct btrfs_key location; diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 666873f745da..320d586c3896 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -191,10 +191,14 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, if (IS_ERR(file)) return PTR_ERR(file); - dif->file = file; - if (!erofs_is_fileio_mode(sbi)) + if (!erofs_is_fileio_mode(sbi)) { dif->dax_dev = fs_dax_get_by_bdev(file_bdev(file), &dif->dax_part_off, NULL, NULL); + } else if (!S_ISREG(file_inode(file)->i_mode)) { + fput(file); + return -EINVAL; + } + dif->file = file; } dif->blocks = le32_to_cpu(dis->blocks); @@ -714,7 +718,10 @@ static int erofs_fc_get_tree(struct fs_context *fc) if (IS_ERR(sbi->fdev)) return PTR_ERR(sbi->fdev); - return get_tree_nodev(fc, erofs_fc_fill_super); + if (S_ISREG(file_inode(sbi->fdev)->i_mode) && + sbi->fdev->f_mapping->a_ops->read_folio) + return get_tree_nodev(fc, erofs_fc_fill_super); + fput(sbi->fdev); } #endif return ret; diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 8936790618c6..a569ff9dfd04 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -710,24 +710,6 @@ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, return ret; } -static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) -{ - struct z_erofs_pcluster *pcl = f->pcl; - z_erofs_next_pcluster_t *owned_head = &f->owned_head; - - /* type 1, nil pcluster (this pcluster doesn't belong to any chain.) */ - if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL, - *owned_head) == Z_EROFS_PCLUSTER_NIL) { - *owned_head = &pcl->next; - /* so we can attach this pcluster to our submission chain. */ - f->mode = Z_EROFS_PCLUSTER_FOLLOWED; - return; - } - - /* type 2, it belongs to an ongoing chain */ - f->mode = Z_EROFS_PCLUSTER_INFLIGHT; -} - static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) { struct erofs_map_blocks *map = &fe->map; @@ -803,7 +785,6 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) int ret; DBG_BUGON(fe->pcl); - /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */ DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL); @@ -823,7 +804,15 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) if (ret == -EEXIST) { mutex_lock(&fe->pcl->lock); - z_erofs_try_to_claim_pcluster(fe); + /* check if this pcluster hasn't been linked into any chain. */ + if (cmpxchg(&fe->pcl->next, Z_EROFS_PCLUSTER_NIL, + fe->owned_head) == Z_EROFS_PCLUSTER_NIL) { + /* .. so it can be attached to our submission chain */ + fe->owned_head = &fe->pcl->next; + fe->mode = Z_EROFS_PCLUSTER_FOLLOWED; + } else { /* otherwise, it belongs to an inflight chain */ + fe->mode = Z_EROFS_PCLUSTER_INFLIGHT; + } } else if (ret) { return ret; } diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 1253a8456e59..a076cca1f547 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -10,8 +10,6 @@ struct z_erofs_maprecorder { struct inode *inode; struct erofs_map_blocks *map; - void *kaddr; - unsigned long lcn; /* compression extent information gathered */ u8 type, headtype; @@ -33,14 +31,11 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m, struct z_erofs_lcluster_index *di; unsigned int advise; - m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb, - pos, EROFS_KMAP); - if (IS_ERR(m->kaddr)) - return PTR_ERR(m->kaddr); - - m->nextpackoff = pos + sizeof(struct z_erofs_lcluster_index); + di = erofs_read_metabuf(&m->map->buf, inode->i_sb, pos, EROFS_KMAP); + if (IS_ERR(di)) + return PTR_ERR(di); m->lcn = lcn; - di = m->kaddr; + m->nextpackoff = pos + sizeof(struct z_erofs_lcluster_index); advise = le16_to_cpu(di->di_advise); m->type = advise & Z_EROFS_LI_LCLUSTER_TYPE_MASK; @@ -53,8 +48,7 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m, DBG_BUGON(1); return -EFSCORRUPTED; } - m->compressedblks = m->delta[0] & - ~Z_EROFS_LI_D0_CBLKCNT; + m->compressedblks = m->delta[0] & ~Z_EROFS_LI_D0_CBLKCNT; m->delta[0] = 1; } m->delta[1] = le16_to_cpu(di->di_u.delta[1]); @@ -110,9 +104,9 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, struct erofs_inode *const vi = EROFS_I(m->inode); const unsigned int lclusterbits = vi->z_logical_clusterbits; unsigned int vcnt, lo, lobits, encodebits, nblk, bytes; - int i; - u8 *in, type; bool big_pcluster; + u8 *in, type; + int i; if (1 << amortizedshift == 4 && lclusterbits <= 14) vcnt = 2; @@ -121,6 +115,10 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, else return -EOPNOTSUPP; + in = erofs_read_metabuf(&m->map->buf, m->inode->i_sb, pos, EROFS_KMAP); + if (IS_ERR(in)) + return PTR_ERR(in); + /* it doesn't equal to round_up(..) */ m->nextpackoff = round_down(pos, vcnt << amortizedshift) + (vcnt << amortizedshift); @@ -128,9 +126,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, lobits = max(lclusterbits, ilog2(Z_EROFS_LI_D0_CBLKCNT) + 1U); encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt; bytes = pos & ((vcnt << amortizedshift) - 1); - - in = m->kaddr - bytes; - + in -= bytes; i = bytes >> amortizedshift; lo = decode_compactedbits(lobits, in, encodebits * i, &type); @@ -255,10 +251,6 @@ static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m, amortizedshift = 2; out: pos += lcn * (1 << amortizedshift); - m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb, - pos, EROFS_KMAP); - if (IS_ERR(m->kaddr)) - return PTR_ERR(m->kaddr); return unpack_compacted_index(m, amortizedshift, pos, lookahead); } diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 9ae54c4c72fe..321d8ffbab6e 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4647,7 +4647,8 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) iov_iter_count(to), READ); /* In LFS mode, if there is inflight dio, wait for its completion */ - if (f2fs_lfs_mode(F2FS_I_SB(inode))) + if (f2fs_lfs_mode(F2FS_I_SB(inode)) && + get_pages(F2FS_I_SB(inode), F2FS_DIO_WRITE)) inode_dio_wait(inode); if (f2fs_should_use_dio(inode, iocb, to)) { diff --git a/fs/smb/server/auth.c b/fs/smb/server/auth.c index 09b20039636e..611716bc8f27 100644 --- a/fs/smb/server/auth.c +++ b/fs/smb/server/auth.c @@ -512,6 +512,7 @@ int ksmbd_krb5_authenticate(struct ksmbd_session *sess, char *in_blob, int in_len, char *out_blob, int *out_len) { struct ksmbd_spnego_authen_response *resp; + struct ksmbd_login_response_ext *resp_ext = NULL; struct ksmbd_user *user = NULL; int retval; @@ -540,7 +541,10 @@ int ksmbd_krb5_authenticate(struct ksmbd_session *sess, char *in_blob, goto out; } - user = ksmbd_alloc_user(&resp->login_response); + if (resp->login_response.status & KSMBD_USER_FLAG_EXTENSION) + resp_ext = ksmbd_ipc_login_request_ext(resp->login_response.account); + + user = ksmbd_alloc_user(&resp->login_response, resp_ext); if (!user) { ksmbd_debug(AUTH, "login failure\n"); retval = -ENOMEM; diff --git a/fs/smb/server/ksmbd_netlink.h b/fs/smb/server/ksmbd_netlink.h index 38e6fd2da3b8..3d01d9d15293 100644 --- a/fs/smb/server/ksmbd_netlink.h +++ b/fs/smb/server/ksmbd_netlink.h @@ -51,6 +51,9 @@ * - KSMBD_EVENT_SPNEGO_AUTHEN_REQUEST/RESPONSE(ksmbd_spnego_authen_request/response) * This event is to make kerberos authentication to be processed in * userspace. + * + * - KSMBD_EVENT_LOGIN_REQUEST_EXT/RESPONSE_EXT(ksmbd_login_request_ext/response_ext) + * This event is to get user account extension info to user IPC daemon. */ #define KSMBD_GENL_NAME "SMBD_GENL" @@ -145,6 +148,16 @@ struct ksmbd_login_response { __u32 reserved[16]; /* Reserved room */ }; +/* + * IPC user login response extension. + */ +struct ksmbd_login_response_ext { + __u32 handle; + __s32 ngroups; /* supplementary group count */ + __s8 reserved[128]; /* Reserved room */ + __s8 ____payload[]; +}; + /* * IPC request to fetch net share config. */ @@ -306,6 +319,9 @@ enum ksmbd_event { KSMBD_EVENT_SPNEGO_AUTHEN_REQUEST, KSMBD_EVENT_SPNEGO_AUTHEN_RESPONSE = 15, + KSMBD_EVENT_LOGIN_REQUEST_EXT, + KSMBD_EVENT_LOGIN_RESPONSE_EXT, + __KSMBD_EVENT_MAX, KSMBD_EVENT_MAX = __KSMBD_EVENT_MAX - 1 }; @@ -336,6 +352,7 @@ enum KSMBD_TREE_CONN_STATUS { #define KSMBD_USER_FLAG_BAD_USER BIT(3) #define KSMBD_USER_FLAG_GUEST_ACCOUNT BIT(4) #define KSMBD_USER_FLAG_DELAY_SESSION BIT(5) +#define KSMBD_USER_FLAG_EXTENSION BIT(6) /* * Share config flags. diff --git a/fs/smb/server/mgmt/user_config.c b/fs/smb/server/mgmt/user_config.c index 279d00feff21..421a4a95e216 100644 --- a/fs/smb/server/mgmt/user_config.c +++ b/fs/smb/server/mgmt/user_config.c @@ -12,6 +12,7 @@ struct ksmbd_user *ksmbd_login_user(const char *account) { struct ksmbd_login_response *resp; + struct ksmbd_login_response_ext *resp_ext = NULL; struct ksmbd_user *user = NULL; resp = ksmbd_ipc_login_request(account); @@ -21,15 +22,19 @@ struct ksmbd_user *ksmbd_login_user(const char *account) if (!(resp->status & KSMBD_USER_FLAG_OK)) goto out; - user = ksmbd_alloc_user(resp); + if (resp->status & KSMBD_USER_FLAG_EXTENSION) + resp_ext = ksmbd_ipc_login_request_ext(account); + + user = ksmbd_alloc_user(resp, resp_ext); out: kvfree(resp); return user; } -struct ksmbd_user *ksmbd_alloc_user(struct ksmbd_login_response *resp) +struct ksmbd_user *ksmbd_alloc_user(struct ksmbd_login_response *resp, + struct ksmbd_login_response_ext *resp_ext) { - struct ksmbd_user *user = NULL; + struct ksmbd_user *user; user = kmalloc(sizeof(struct ksmbd_user), GFP_KERNEL); if (!user) @@ -44,18 +49,42 @@ struct ksmbd_user *ksmbd_alloc_user(struct ksmbd_login_response *resp) if (user->passkey) memcpy(user->passkey, resp->hash, resp->hash_sz); - if (!user->name || !user->passkey) { - kfree(user->name); - kfree(user->passkey); - kfree(user); - user = NULL; + user->ngroups = 0; + user->sgid = NULL; + + if (!user->name || !user->passkey) + goto err_free; + + if (resp_ext) { + if (resp_ext->ngroups > NGROUPS_MAX) { + pr_err("ngroups(%u) from login response exceeds max groups(%d)\n", + resp_ext->ngroups, NGROUPS_MAX); + goto err_free; + } + + user->sgid = kmemdup(resp_ext->____payload, + resp_ext->ngroups * sizeof(gid_t), + GFP_KERNEL); + if (!user->sgid) + goto err_free; + + user->ngroups = resp_ext->ngroups; + ksmbd_debug(SMB, "supplementary groups : %d\n", user->ngroups); } + return user; + +err_free: + kfree(user->name); + kfree(user->passkey); + kfree(user); + return NULL; } void ksmbd_free_user(struct ksmbd_user *user) { ksmbd_ipc_logout_request(user->name, user->flags); + kfree(user->sgid); kfree(user->name); kfree(user->passkey); kfree(user); diff --git a/fs/smb/server/mgmt/user_config.h b/fs/smb/server/mgmt/user_config.h index e068a19fd904..8c227b8d4954 100644 --- a/fs/smb/server/mgmt/user_config.h +++ b/fs/smb/server/mgmt/user_config.h @@ -18,6 +18,8 @@ struct ksmbd_user { size_t passkey_sz; char *passkey; + int ngroups; + gid_t *sgid; }; static inline bool user_guest(struct ksmbd_user *user) @@ -60,7 +62,8 @@ static inline unsigned int user_gid(struct ksmbd_user *user) } struct ksmbd_user *ksmbd_login_user(const char *account); -struct ksmbd_user *ksmbd_alloc_user(struct ksmbd_login_response *resp); +struct ksmbd_user *ksmbd_alloc_user(struct ksmbd_login_response *resp, + struct ksmbd_login_response_ext *resp_ext); void ksmbd_free_user(struct ksmbd_user *user); int ksmbd_anonymous_user(struct ksmbd_user *user); bool ksmbd_compare_user(struct ksmbd_user *u1, struct ksmbd_user *u2); diff --git a/fs/smb/server/mgmt/user_session.c b/fs/smb/server/mgmt/user_session.c index 99416ce9f501..1e4624e9d434 100644 --- a/fs/smb/server/mgmt/user_session.c +++ b/fs/smb/server/mgmt/user_session.c @@ -177,9 +177,10 @@ static void ksmbd_expire_session(struct ksmbd_conn *conn) down_write(&conn->session_lock); xa_for_each(&conn->sessions, id, sess) { - if (sess->state != SMB2_SESSION_VALID || - time_after(jiffies, - sess->last_active + SMB2_SESSION_TIMEOUT)) { + if (atomic_read(&sess->refcnt) == 0 && + (sess->state != SMB2_SESSION_VALID || + time_after(jiffies, + sess->last_active + SMB2_SESSION_TIMEOUT))) { xa_erase(&conn->sessions, sess->id); hash_del(&sess->hlist); ksmbd_session_destroy(sess); @@ -269,8 +270,6 @@ struct ksmbd_session *ksmbd_session_lookup_slowpath(unsigned long long id) down_read(&sessions_table_lock); sess = __session_lookup(id); - if (sess) - sess->last_active = jiffies; up_read(&sessions_table_lock); return sess; @@ -289,6 +288,22 @@ struct ksmbd_session *ksmbd_session_lookup_all(struct ksmbd_conn *conn, return sess; } +void ksmbd_user_session_get(struct ksmbd_session *sess) +{ + atomic_inc(&sess->refcnt); +} + +void ksmbd_user_session_put(struct ksmbd_session *sess) +{ + if (!sess) + return; + + if (atomic_read(&sess->refcnt) <= 0) + WARN_ON(1); + else + atomic_dec(&sess->refcnt); +} + struct preauth_session *ksmbd_preauth_session_alloc(struct ksmbd_conn *conn, u64 sess_id) { @@ -393,6 +408,7 @@ static struct ksmbd_session *__session_create(int protocol) xa_init(&sess->rpc_handle_list); sess->sequence_number = 1; rwlock_init(&sess->tree_conns_lock); + atomic_set(&sess->refcnt, 1); ret = __init_smb2_session(sess); if (ret) diff --git a/fs/smb/server/mgmt/user_session.h b/fs/smb/server/mgmt/user_session.h index dc9fded2cd43..c1c4b20bd5c6 100644 --- a/fs/smb/server/mgmt/user_session.h +++ b/fs/smb/server/mgmt/user_session.h @@ -61,6 +61,8 @@ struct ksmbd_session { struct ksmbd_file_table file_table; unsigned long last_active; rwlock_t tree_conns_lock; + + atomic_t refcnt; }; static inline int test_session_flag(struct ksmbd_session *sess, int bit) @@ -104,4 +106,6 @@ void ksmbd_release_tree_conn_id(struct ksmbd_session *sess, int id); int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name); void ksmbd_session_rpc_close(struct ksmbd_session *sess, int id); int ksmbd_session_rpc_method(struct ksmbd_session *sess, int id); +void ksmbd_user_session_get(struct ksmbd_session *sess); +void ksmbd_user_session_put(struct ksmbd_session *sess); #endif /* __USER_SESSION_MANAGEMENT_H__ */ diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c index 231d2d224656..9670c97f14b3 100644 --- a/fs/smb/server/server.c +++ b/fs/smb/server/server.c @@ -238,6 +238,8 @@ static void __handle_ksmbd_work(struct ksmbd_work *work, } while (is_chained == true); send: + if (work->sess) + ksmbd_user_session_put(work->sess); if (work->tcon) ksmbd_tree_connect_put(work->tcon); smb3_preauth_hash_rsp(work); diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 797b0f24097b..599118aed205 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -605,8 +605,10 @@ int smb2_check_user_session(struct ksmbd_work *work) /* Check for validity of user session */ work->sess = ksmbd_session_lookup_all(conn, sess_id); - if (work->sess) + if (work->sess) { + ksmbd_user_session_get(work->sess); return 1; + } ksmbd_debug(SMB, "Invalid user session, Uid %llu\n", sess_id); return -ENOENT; } @@ -1740,6 +1742,7 @@ int smb2_sess_setup(struct ksmbd_work *work) } conn->binding = true; + ksmbd_user_session_get(sess); } else if ((conn->dialect < SMB30_PROT_ID || server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL) && (req->Flags & SMB2_SESSION_REQ_FLAG_BINDING)) { @@ -1766,6 +1769,7 @@ int smb2_sess_setup(struct ksmbd_work *work) } conn->binding = false; + ksmbd_user_session_get(sess); } work->sess = sess; @@ -2228,7 +2232,9 @@ int smb2_session_logoff(struct ksmbd_work *work) } ksmbd_destroy_file_table(&sess->file_table); + down_write(&conn->session_lock); sess->state = SMB2_SESSION_EXPIRED; + up_write(&conn->session_lock); ksmbd_free_user(sess->user); sess->user = NULL; diff --git a/fs/smb/server/smb_common.c b/fs/smb/server/smb_common.c index 5b8d75e78ffb..a2ebbe604c8c 100644 --- a/fs/smb/server/smb_common.c +++ b/fs/smb/server/smb_common.c @@ -736,13 +736,15 @@ int __ksmbd_override_fsids(struct ksmbd_work *work, struct ksmbd_share_config *share) { struct ksmbd_session *sess = work->sess; + struct ksmbd_user *user = sess->user; struct cred *cred; struct group_info *gi; unsigned int uid; unsigned int gid; + int i; - uid = user_uid(sess->user); - gid = user_gid(sess->user); + uid = user_uid(user); + gid = user_gid(user); if (share->force_uid != KSMBD_SHARE_INVALID_UID) uid = share->force_uid; if (share->force_gid != KSMBD_SHARE_INVALID_GID) @@ -755,11 +757,18 @@ int __ksmbd_override_fsids(struct ksmbd_work *work, cred->fsuid = make_kuid(&init_user_ns, uid); cred->fsgid = make_kgid(&init_user_ns, gid); - gi = groups_alloc(0); + gi = groups_alloc(user->ngroups); if (!gi) { abort_creds(cred); return -ENOMEM; } + + for (i = 0; i < user->ngroups; i++) + gi->gid[i] = make_kgid(&init_user_ns, user->sgid[i]); + + if (user->ngroups) + groups_sort(gi); + set_groups(cred, gi); put_group_info(gi); diff --git a/fs/smb/server/transport_ipc.c b/fs/smb/server/transport_ipc.c index 8752ac82c557..2f27afb695f6 100644 --- a/fs/smb/server/transport_ipc.c +++ b/fs/smb/server/transport_ipc.c @@ -120,6 +120,12 @@ static const struct nla_policy ksmbd_nl_policy[KSMBD_EVENT_MAX + 1] = { }, [KSMBD_EVENT_SPNEGO_AUTHEN_RESPONSE] = { }, + [KSMBD_EVENT_LOGIN_REQUEST_EXT] = { + .len = sizeof(struct ksmbd_login_request), + }, + [KSMBD_EVENT_LOGIN_RESPONSE_EXT] = { + .len = sizeof(struct ksmbd_login_response_ext), + }, }; static struct genl_ops ksmbd_genl_ops[] = { @@ -187,6 +193,14 @@ static struct genl_ops ksmbd_genl_ops[] = { .cmd = KSMBD_EVENT_SPNEGO_AUTHEN_RESPONSE, .doit = handle_generic_event, }, + { + .cmd = KSMBD_EVENT_LOGIN_REQUEST_EXT, + .doit = handle_unsupported_event, + }, + { + .cmd = KSMBD_EVENT_LOGIN_RESPONSE_EXT, + .doit = handle_generic_event, + }, }; static struct genl_family ksmbd_genl_family = { @@ -198,7 +212,7 @@ static struct genl_family ksmbd_genl_family = { .module = THIS_MODULE, .ops = ksmbd_genl_ops, .n_ops = ARRAY_SIZE(ksmbd_genl_ops), - .resv_start_op = KSMBD_EVENT_SPNEGO_AUTHEN_RESPONSE + 1, + .resv_start_op = KSMBD_EVENT_LOGIN_RESPONSE_EXT + 1, }; static void ksmbd_nl_init_fixup(void) @@ -459,16 +473,24 @@ static int ipc_validate_msg(struct ipc_msg_table_entry *entry) { unsigned int msg_sz = entry->msg_sz; - if (entry->type == KSMBD_EVENT_RPC_REQUEST) { + switch (entry->type) { + case KSMBD_EVENT_RPC_REQUEST: + { struct ksmbd_rpc_command *resp = entry->response; msg_sz = sizeof(struct ksmbd_rpc_command) + resp->payload_sz; - } else if (entry->type == KSMBD_EVENT_SPNEGO_AUTHEN_REQUEST) { + break; + } + case KSMBD_EVENT_SPNEGO_AUTHEN_REQUEST: + { struct ksmbd_spnego_authen_response *resp = entry->response; msg_sz = sizeof(struct ksmbd_spnego_authen_response) + resp->session_key_len + resp->spnego_blob_len; - } else if (entry->type == KSMBD_EVENT_SHARE_CONFIG_REQUEST) { + break; + } + case KSMBD_EVENT_SHARE_CONFIG_REQUEST: + { struct ksmbd_share_config_response *resp = entry->response; if (resp->payload_sz) { @@ -478,6 +500,17 @@ static int ipc_validate_msg(struct ipc_msg_table_entry *entry) msg_sz = sizeof(struct ksmbd_share_config_response) + resp->payload_sz; } + break; + } + case KSMBD_EVENT_LOGIN_REQUEST_EXT: + { + struct ksmbd_login_response_ext *resp = entry->response; + + if (resp->ngroups) { + msg_sz = sizeof(struct ksmbd_login_response_ext) + + resp->ngroups * sizeof(gid_t); + } + } } return entry->msg_sz != msg_sz ? -EINVAL : 0; @@ -560,6 +593,29 @@ struct ksmbd_login_response *ksmbd_ipc_login_request(const char *account) return resp; } +struct ksmbd_login_response_ext *ksmbd_ipc_login_request_ext(const char *account) +{ + struct ksmbd_ipc_msg *msg; + struct ksmbd_login_request *req; + struct ksmbd_login_response_ext *resp; + + if (strlen(account) >= KSMBD_REQ_MAX_ACCOUNT_NAME_SZ) + return NULL; + + msg = ipc_msg_alloc(sizeof(struct ksmbd_login_request)); + if (!msg) + return NULL; + + msg->type = KSMBD_EVENT_LOGIN_REQUEST_EXT; + req = (struct ksmbd_login_request *)msg->payload; + req->handle = ksmbd_acquire_id(&ipc_ida); + strscpy(req->account, account, KSMBD_REQ_MAX_ACCOUNT_NAME_SZ); + resp = ipc_msg_send_request(msg, req->handle); + ipc_msg_handle_free(req->handle); + ipc_msg_free(msg); + return resp; +} + struct ksmbd_spnego_authen_response * ksmbd_ipc_spnego_authen_request(const char *spnego_blob, int blob_len) { diff --git a/fs/smb/server/transport_ipc.h b/fs/smb/server/transport_ipc.h index 5e5b90a0c187..d9b6737f8cd0 100644 --- a/fs/smb/server/transport_ipc.h +++ b/fs/smb/server/transport_ipc.h @@ -12,6 +12,8 @@ struct ksmbd_login_response * ksmbd_ipc_login_request(const char *account); +struct ksmbd_login_response_ext * +ksmbd_ipc_login_request_ext(const char *account); struct ksmbd_session; struct ksmbd_share_config; diff --git a/include/linux/closure.h b/include/linux/closure.h index 2af44427107d..880fe85e35e9 100644 --- a/include/linux/closure.h +++ b/include/linux/closure.h @@ -454,4 +454,39 @@ do { \ __closure_wait_event(waitlist, _cond); \ } while (0) +#define __closure_wait_event_timeout(waitlist, _cond, _until) \ +({ \ + struct closure cl; \ + long _t; \ + \ + closure_init_stack(&cl); \ + \ + while (1) { \ + closure_wait(waitlist, &cl); \ + if (_cond) { \ + _t = max_t(long, 1L, _until - jiffies); \ + break; \ + } \ + _t = max_t(long, 0L, _until - jiffies); \ + if (!_t) \ + break; \ + closure_sync_timeout(&cl, _t); \ + } \ + closure_wake_up(waitlist); \ + closure_sync(&cl); \ + _t; \ +}) + +/* + * Returns 0 if timeout expired, remaining time in jiffies (at least 1) if + * condition became true + */ +#define closure_wait_event_timeout(waitlist, _cond, _timeout) \ +({ \ + unsigned long _until = jiffies + _timeout; \ + (_cond) \ + ? max_t(long, 1L, _until - jiffies) \ + : __closure_wait_event_timeout(waitlist, _cond, _until);\ +}) + #endif /* _LINUX_CLOSURE_H */ diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 410a4df8a121..6eae3b69bf6e 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -9,7 +9,6 @@ #define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) enum scx_consts { - SCX_SLICE_BYPASS = SCX_SLICE_DFL / 4, SCX_DSP_DFL_MAX_BATCH = 32, SCX_DSP_MAX_LOOPS = 32, SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, @@ -19,6 +18,12 @@ enum scx_consts { SCX_EXIT_DUMP_DFL_LEN = 32768, SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE, + + /* + * Iterating all tasks may take a while. Periodically drop + * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls. + */ + SCX_OPS_TASK_ITER_BATCH = 32, }; enum scx_exit_kind { @@ -1274,86 +1279,104 @@ struct scx_task_iter { struct task_struct *locked; struct rq *rq; struct rq_flags rf; + u32 cnt; }; /** - * scx_task_iter_init - Initialize a task iterator + * scx_task_iter_start - Lock scx_tasks_lock and start a task iteration * @iter: iterator to init * - * Initialize @iter. Must be called with scx_tasks_lock held. Once initialized, - * @iter must eventually be exited with scx_task_iter_exit(). + * Initialize @iter and return with scx_tasks_lock held. Once initialized, @iter + * must eventually be stopped with scx_task_iter_stop(). * - * scx_tasks_lock may be released between this and the first next() call or - * between any two next() calls. If scx_tasks_lock is released between two - * next() calls, the caller is responsible for ensuring that the task being - * iterated remains accessible either through RCU read lock or obtaining a - * reference count. + * scx_tasks_lock and the rq lock may be released using scx_task_iter_unlock() + * between this and the first next() call or between any two next() calls. If + * the locks are released between two next() calls, the caller is responsible + * for ensuring that the task being iterated remains accessible either through + * RCU read lock or obtaining a reference count. * * All tasks which existed when the iteration started are guaranteed to be * visited as long as they still exist. */ -static void scx_task_iter_init(struct scx_task_iter *iter) +static void scx_task_iter_start(struct scx_task_iter *iter) { - lockdep_assert_held(&scx_tasks_lock); - BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS & ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1)); + spin_lock_irq(&scx_tasks_lock); + iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; list_add(&iter->cursor.tasks_node, &scx_tasks); iter->locked = NULL; + iter->cnt = 0; } -/** - * scx_task_iter_rq_unlock - Unlock rq locked by a task iterator - * @iter: iterator to unlock rq for - * - * If @iter is in the middle of a locked iteration, it may be locking the rq of - * the task currently being visited. Unlock the rq if so. This function can be - * safely called anytime during an iteration. - * - * Returns %true if the rq @iter was locking is unlocked. %false if @iter was - * not locking an rq. - */ -static bool scx_task_iter_rq_unlock(struct scx_task_iter *iter) +static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter) { if (iter->locked) { task_rq_unlock(iter->rq, iter->locked, &iter->rf); iter->locked = NULL; - return true; - } else { - return false; } } /** - * scx_task_iter_exit - Exit a task iterator + * scx_task_iter_unlock - Unlock rq and scx_tasks_lock held by a task iterator + * @iter: iterator to unlock + * + * If @iter is in the middle of a locked iteration, it may be locking the rq of + * the task currently being visited in addition to scx_tasks_lock. Unlock both. + * This function can be safely called anytime during an iteration. + */ +static void scx_task_iter_unlock(struct scx_task_iter *iter) +{ + __scx_task_iter_rq_unlock(iter); + spin_unlock_irq(&scx_tasks_lock); +} + +/** + * scx_task_iter_relock - Lock scx_tasks_lock released by scx_task_iter_unlock() + * @iter: iterator to re-lock + * + * Re-lock scx_tasks_lock unlocked by scx_task_iter_unlock(). Note that it + * doesn't re-lock the rq lock. Must be called before other iterator operations. + */ +static void scx_task_iter_relock(struct scx_task_iter *iter) +{ + spin_lock_irq(&scx_tasks_lock); +} + +/** + * scx_task_iter_stop - Stop a task iteration and unlock scx_tasks_lock * @iter: iterator to exit * - * Exit a previously initialized @iter. Must be called with scx_tasks_lock held. - * If the iterator holds a task's rq lock, that rq lock is released. See - * scx_task_iter_init() for details. + * Exit a previously initialized @iter. Must be called with scx_tasks_lock held + * which is released on return. If the iterator holds a task's rq lock, that rq + * lock is also released. See scx_task_iter_start() for details. */ -static void scx_task_iter_exit(struct scx_task_iter *iter) +static void scx_task_iter_stop(struct scx_task_iter *iter) { - lockdep_assert_held(&scx_tasks_lock); - - scx_task_iter_rq_unlock(iter); list_del_init(&iter->cursor.tasks_node); + scx_task_iter_unlock(iter); } /** * scx_task_iter_next - Next task * @iter: iterator to walk * - * Visit the next task. See scx_task_iter_init() for details. + * Visit the next task. See scx_task_iter_start() for details. Locks are dropped + * and re-acquired every %SCX_OPS_TASK_ITER_BATCH iterations to avoid causing + * stalls by holding scx_tasks_lock for too long. */ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) { struct list_head *cursor = &iter->cursor.tasks_node; struct sched_ext_entity *pos; - lockdep_assert_held(&scx_tasks_lock); + if (!(++iter->cnt % SCX_OPS_TASK_ITER_BATCH)) { + scx_task_iter_unlock(iter); + cond_resched(); + scx_task_iter_relock(iter); + } list_for_each_entry(pos, cursor, tasks_node) { if (&pos->tasks_node == &scx_tasks) @@ -1374,14 +1397,14 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) * @include_dead: Whether we should include dead tasks in the iteration * * Visit the non-idle task with its rq lock held. Allows callers to specify - * whether they would like to filter out dead tasks. See scx_task_iter_init() + * whether they would like to filter out dead tasks. See scx_task_iter_start() * for details. */ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) { struct task_struct *p; - scx_task_iter_rq_unlock(iter); + __scx_task_iter_rq_unlock(iter); while ((p = scx_task_iter_next(iter))) { /* @@ -1949,7 +1972,6 @@ static bool scx_rq_online(struct rq *rq) static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, int sticky_cpu) { - bool bypassing = scx_rq_bypassing(rq); struct task_struct **ddsp_taskp; unsigned long qseq; @@ -1967,7 +1989,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, if (!scx_rq_online(rq)) goto local; - if (bypassing) + if (scx_rq_bypassing(rq)) goto global; if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) @@ -2022,7 +2044,7 @@ local_norefill: global: touch_core_sched(rq, p); /* see the comment in local: */ - p->scx.slice = bypassing ? SCX_SLICE_BYPASS : SCX_SLICE_DFL; + p->scx.slice = SCX_SLICE_DFL; dispatch_enqueue(find_global_dsq(p), p, enq_flags); } @@ -2958,8 +2980,8 @@ static struct task_struct *pick_task_scx(struct rq *rq) if (unlikely(!p->scx.slice)) { if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) { - printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n", - p->comm, p->pid); + printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n", + p->comm, p->pid, __func__); scx_warned_zero_slice = true; } p->scx.slice = SCX_SLICE_DFL; @@ -3064,11 +3086,6 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, *found = false; - if (!static_branch_likely(&scx_builtin_idle_enabled)) { - scx_ops_error("built-in idle tracking is disabled"); - return prev_cpu; - } - /* * If WAKE_SYNC, the waker's local DSQ is empty, and the system is * under utilized, wake up @p to the local DSQ of the waker. Checking @@ -3133,7 +3150,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag if (unlikely(wake_flags & WF_EXEC)) return prev_cpu; - if (SCX_HAS_OP(select_cpu)) { + if (SCX_HAS_OP(select_cpu) && !scx_rq_bypassing(task_rq(p))) { s32 cpu; struct task_struct **ddsp_taskp; @@ -3198,7 +3215,7 @@ void __scx_update_idle(struct rq *rq, bool idle) { int cpu = cpu_of(rq); - if (SCX_HAS_OP(update_idle)) { + if (SCX_HAS_OP(update_idle) && !scx_rq_bypassing(rq)) { SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); if (!static_branch_unlikely(&scx_builtin_idle_enabled)) return; @@ -4261,21 +4278,23 @@ bool task_should_scx(struct task_struct *p) * the DISABLING state and then cycling the queued tasks through dequeue/enqueue * to force global FIFO scheduling. * - * a. ops.enqueue() is ignored and tasks are queued in simple global FIFO order. - * %SCX_OPS_ENQ_LAST is also ignored. + * - ops.select_cpu() is ignored and the default select_cpu() is used. * - * b. ops.dispatch() is ignored. + * - ops.enqueue() is ignored and tasks are queued in simple global FIFO order. + * %SCX_OPS_ENQ_LAST is also ignored. * - * c. balance_scx() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice - * can't be trusted. Whenever a tick triggers, the running task is rotated to - * the tail of the queue with core_sched_at touched. + * - ops.dispatch() is ignored. * - * d. pick_next_task() suppresses zero slice warning. + * - balance_scx() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice + * can't be trusted. Whenever a tick triggers, the running task is rotated to + * the tail of the queue with core_sched_at touched. * - * e. scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM - * operations. + * - pick_next_task() suppresses zero slice warning. * - * f. scx_prio_less() reverts to the default core_sched_at order. + * - scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM + * operations. + * + * - scx_prio_less() reverts to the default core_sched_at order. */ static void scx_ops_bypass(bool bypass) { @@ -4345,7 +4364,7 @@ static void scx_ops_bypass(bool bypass) rq_unlock_irqrestore(rq, &rf); - /* kick to restore ticks */ + /* resched to restore ticks and idle state */ resched_cpu(cpu); } } @@ -4467,15 +4486,13 @@ static void scx_ops_disable_workfn(struct kthread_work *work) scx_ops_init_task_enabled = false; - spin_lock_irq(&scx_tasks_lock); - scx_task_iter_init(&sti); + scx_task_iter_start(&sti); while ((p = scx_task_iter_next_locked(&sti))) { const struct sched_class *old_class = p->sched_class; struct sched_enq_and_set_ctx ctx; sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); - p->scx.slice = min_t(u64, p->scx.slice, SCX_SLICE_DFL); __setscheduler_prio(p, p->prio); check_class_changing(task_rq(p), p, old_class); @@ -4484,8 +4501,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work) check_class_changed(task_rq(p), p, old_class, p->prio); scx_ops_exit_task(p); } - scx_task_iter_exit(&sti); - spin_unlock_irq(&scx_tasks_lock); + scx_task_iter_stop(&sti); percpu_up_write(&scx_fork_rwsem); /* no task is on scx, turn off all the switches and flush in-progress calls */ @@ -5136,8 +5152,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (ret) goto err_disable_unlock_all; - spin_lock_irq(&scx_tasks_lock); - scx_task_iter_init(&sti); + scx_task_iter_start(&sti); while ((p = scx_task_iter_next_locked(&sti))) { /* * @p may already be dead, have lost all its usages counts and @@ -5147,15 +5162,13 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (!tryget_task_struct(p)) continue; - scx_task_iter_rq_unlock(&sti); - spin_unlock_irq(&scx_tasks_lock); + scx_task_iter_unlock(&sti); ret = scx_ops_init_task(p, task_group(p), false); if (ret) { put_task_struct(p); - spin_lock_irq(&scx_tasks_lock); - scx_task_iter_exit(&sti); - spin_unlock_irq(&scx_tasks_lock); + scx_task_iter_relock(&sti); + scx_task_iter_stop(&sti); scx_ops_error("ops.init_task() failed (%d) for %s[%d]", ret, p->comm, p->pid); goto err_disable_unlock_all; @@ -5164,10 +5177,9 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) scx_set_task_state(p, SCX_TASK_READY); put_task_struct(p); - spin_lock_irq(&scx_tasks_lock); + scx_task_iter_relock(&sti); } - scx_task_iter_exit(&sti); - spin_unlock_irq(&scx_tasks_lock); + scx_task_iter_stop(&sti); scx_cgroup_unlock(); percpu_up_write(&scx_fork_rwsem); @@ -5184,14 +5196,14 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) * scx_tasks_lock. */ percpu_down_write(&scx_fork_rwsem); - spin_lock_irq(&scx_tasks_lock); - scx_task_iter_init(&sti); + scx_task_iter_start(&sti); while ((p = scx_task_iter_next_locked(&sti))) { const struct sched_class *old_class = p->sched_class; struct sched_enq_and_set_ctx ctx; sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); + p->scx.slice = SCX_SLICE_DFL; __setscheduler_prio(p, p->prio); check_class_changing(task_rq(p), p, old_class); @@ -5199,8 +5211,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) check_class_changed(task_rq(p), p, old_class, p->prio); } - scx_task_iter_exit(&sti); - spin_unlock_irq(&scx_tasks_lock); + scx_task_iter_stop(&sti); percpu_up_write(&scx_fork_rwsem); scx_ops_bypass(false); @@ -5872,16 +5883,21 @@ __bpf_kfunc_start_defs(); __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) { - if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) { - *is_idle = false; - return prev_cpu; + if (!static_branch_likely(&scx_builtin_idle_enabled)) { + scx_ops_error("built-in idle tracking is disabled"); + goto prev_cpu; } + + if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) + goto prev_cpu; + #ifdef CONFIG_SMP return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle); -#else +#endif + +prev_cpu: *is_idle = false; return prev_cpu; -#endif } __bpf_kfunc_end_defs(); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index fb04445f92c3..3ea4f7bb1837 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -6728,39 +6728,38 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) } for_each_buffer_cpu(buffer, cpu) { + struct buffer_data_page *old_free_data_page; + struct list_head old_pages; + unsigned long flags; if (!cpumask_test_cpu(cpu, buffer->cpumask)) continue; cpu_buffer = buffer->buffers[cpu]; + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + /* Clear the head bit to make the link list normal to read */ rb_head_page_deactivate(cpu_buffer); - /* Now walk the list and free all the old sub buffers */ - list_for_each_entry_safe(bpage, tmp, cpu_buffer->pages, list) { - list_del_init(&bpage->list); - free_buffer_page(bpage); - } - /* The above loop stopped an the last page needing to be freed */ - bpage = list_entry(cpu_buffer->pages, struct buffer_page, list); - free_buffer_page(bpage); - - /* Free the current reader page */ - free_buffer_page(cpu_buffer->reader_page); + /* + * Collect buffers from the cpu_buffer pages list and the + * reader_page on old_pages, so they can be freed later when not + * under a spinlock. The pages list is a linked list with no + * head, adding old_pages turns it into a regular list with + * old_pages being the head. + */ + list_add(&old_pages, cpu_buffer->pages); + list_add(&cpu_buffer->reader_page->list, &old_pages); /* One page was allocated for the reader page */ cpu_buffer->reader_page = list_entry(cpu_buffer->new_pages.next, struct buffer_page, list); list_del_init(&cpu_buffer->reader_page->list); - /* The cpu_buffer pages are a link list with no head */ + /* Install the new pages, remove the head from the list */ cpu_buffer->pages = cpu_buffer->new_pages.next; - cpu_buffer->new_pages.next->prev = cpu_buffer->new_pages.prev; - cpu_buffer->new_pages.prev->next = cpu_buffer->new_pages.next; - - /* Clear the new_pages list */ - INIT_LIST_HEAD(&cpu_buffer->new_pages); + list_del_init(&cpu_buffer->new_pages); cpu_buffer->head_page = list_entry(cpu_buffer->pages, struct buffer_page, list); @@ -6769,11 +6768,20 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) cpu_buffer->nr_pages = cpu_buffer->nr_pages_to_update; cpu_buffer->nr_pages_to_update = 0; - free_pages((unsigned long)cpu_buffer->free_page, old_order); + old_free_data_page = cpu_buffer->free_page; cpu_buffer->free_page = NULL; rb_head_page_activate(cpu_buffer); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + /* Free old sub buffers */ + list_for_each_entry_safe(bpage, tmp, &old_pages, list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); + } + free_pages((unsigned long)old_free_data_page, old_order); + rb_check_pages(cpu_buffer); } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3a4d0d990ad2..69a0e1816d3d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -10619,10 +10619,10 @@ __init static void enable_instances(void) * cannot be deleted by user space, so keep the reference * to it. */ - if (start) + if (start) { tr->flags |= TRACE_ARRAY_FL_BOOT; - else - trace_array_put(tr); + tr->ref++; + } while ((tok = strsep(&curr_str, ","))) { early_enable_events(tr, tok, true); diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index b38199965f99..363d031a16f7 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -88,6 +88,7 @@ TARGETS += rlimits TARGETS += rseq TARGETS += rtc TARGETS += rust +TARGETS += sched_ext TARGETS += seccomp TARGETS += sgx TARGETS += sigaltstack @@ -129,10 +130,10 @@ ifeq ($(filter net/lib,$(TARGETS)),) endif endif -# User can optionally provide a TARGETS skiplist. By default we skip -# BPF since it has cutting edge build time dependencies which require -# more effort to install. -SKIP_TARGETS ?= bpf +# User can optionally provide a TARGETS skiplist. By default we skip +# targets using BPF since it has cutting edge build time dependencies +# which require more effort to install. +SKIP_TARGETS ?= bpf sched_ext ifneq ($(SKIP_TARGETS),) TMP := $(filter-out $(SKIP_TARGETS), $(TARGETS)) override TARGETS := $(TMP) diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile index 0754a2c110a1..06ae9c107049 100644 --- a/tools/testing/selftests/sched_ext/Makefile +++ b/tools/testing/selftests/sched_ext/Makefile @@ -3,24 +3,13 @@ include ../../../build/Build.include include ../../../scripts/Makefile.arch include ../../../scripts/Makefile.include + +TEST_GEN_PROGS := runner + +# override lib.mk's default rules +OVERRIDE_TARGETS := 1 include ../lib.mk -ifneq ($(LLVM),) -ifneq ($(filter %/,$(LLVM)),) -LLVM_PREFIX := $(LLVM) -else ifneq ($(filter -%,$(LLVM)),) -LLVM_SUFFIX := $(LLVM) -endif - -CC := $(LLVM_PREFIX)clang$(LLVM_SUFFIX) $(CLANG_FLAGS) -fintegrated-as -else -CC := gcc -endif # LLVM - -ifneq ($(CROSS_COMPILE),) -$(error CROSS_COMPILE not supported for scx selftests) -endif # CROSS_COMPILE - CURDIR := $(abspath .) REPOROOT := $(abspath ../../../..) TOOLSDIR := $(REPOROOT)/tools @@ -34,18 +23,23 @@ GENHDR := $(GENDIR)/autoconf.h SCXTOOLSDIR := $(TOOLSDIR)/sched_ext SCXTOOLSINCDIR := $(TOOLSDIR)/sched_ext/include -OUTPUT_DIR := $(CURDIR)/build +OUTPUT_DIR := $(OUTPUT)/build OBJ_DIR := $(OUTPUT_DIR)/obj INCLUDE_DIR := $(OUTPUT_DIR)/include BPFOBJ_DIR := $(OBJ_DIR)/libbpf SCXOBJ_DIR := $(OBJ_DIR)/sched_ext BPFOBJ := $(BPFOBJ_DIR)/libbpf.a LIBBPF_OUTPUT := $(OBJ_DIR)/libbpf/libbpf.a -DEFAULT_BPFTOOL := $(OUTPUT_DIR)/sbin/bpftool -HOST_BUILD_DIR := $(OBJ_DIR) -HOST_OUTPUT_DIR := $(OUTPUT_DIR) -VMLINUX_BTF_PATHS ?= ../../../../vmlinux \ +DEFAULT_BPFTOOL := $(OUTPUT_DIR)/host/sbin/bpftool +HOST_OBJ_DIR := $(OBJ_DIR)/host/bpftool +HOST_LIBBPF_OUTPUT := $(OBJ_DIR)/host/libbpf/ +HOST_LIBBPF_DESTDIR := $(OUTPUT_DIR)/host/ +HOST_DESTDIR := $(OUTPUT_DIR)/host/ + +VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux) \ + $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \ + ../../../../vmlinux \ /sys/kernel/btf/vmlinux \ /boot/vmlinux-$(shell uname -r) VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) @@ -80,17 +74,23 @@ IS_LITTLE_ENDIAN = $(shell $(CC) -dM -E - &1 \ +$(shell $(1) $(2) -v -E - &1 \ | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \ -$(shell $(1) -dM -E -