From c829dba797360d9a266cabfaac16d1cd80abfc2b Mon Sep 17 00:00:00 2001 From: Shung-Hsi Yu Date: Wed, 31 Aug 2022 11:40:39 +0800 Subject: [PATCH 01/10] MAINTAINERS: Add include/linux/tnum.h to BPF CORE Maintainers of the kerne/bpf/tnum.c are also the maintainers of the corresponding header file include/linux/tnum.h. Add the file entry for include/linux/tnum.h to the appropriate section in MAINTAINERS. Signed-off-by: Shung-Hsi Yu Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220831034039.17998-1-shung-hsi.yu@suse.com --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index af4848466a08..1a9fe9736ddd 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3824,6 +3824,7 @@ F: kernel/bpf/dispatcher.c F: kernel/bpf/trampoline.c F: include/linux/bpf* F: include/linux/filter.h +F: include/linux/tnum.h BPF [BTF] M: Martin KaFai Lau From c00c4461689e15ac2cc3b9a595a54e4d8afd3d77 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Tue, 30 Aug 2022 14:17:05 +0200 Subject: [PATCH 02/10] xsk: Fix backpressure mechanism on Tx Commit d678cbd2f867 ("xsk: Fix handling of invalid descriptors in XSK TX batching API") fixed batch API usage against set of descriptors with invalid ones but introduced a problem when AF_XDP SW rings are smaller than HW ones. Mismatch of reported Tx'ed frames between HW generator and user space app was observed. It turned out that backpressure mechanism became a bottleneck when the amount of produced descriptors to CQ is lower than what we grabbed from XSK Tx ring. Say that 512 entries had been taken from XSK Tx ring but we had only 490 free entries in CQ. Then callsite (ZC driver) will produce only 490 entries onto HW Tx ring but 512 entries will be released from Tx ring and this is what will be seen by the user space. In order to fix this case, mix XSK Tx/CQ ring interractions by moving around internal functions and changing call order: * pull out xskq_prod_nb_free() from xskq_prod_reserve_addr_batch() up to xsk_tx_peek_release_desc_batch(); ** move xskq_cons_release_n() into xskq_cons_read_desc_batch() After doing so, algorithm can be described as follows: 1. lookup Tx entries 2. use value from 1. to reserve space in CQ (*) 3. Read from Tx ring as much descriptors as value from 2 3a. release descriptors from XSK Tx ring (**) 4. Finally produce addresses to CQ Fixes: d678cbd2f867 ("xsk: Fix handling of invalid descriptors in XSK TX batching API") Signed-off-by: Magnus Karlsson Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220830121705.8618-1-maciej.fijalkowski@intel.com --- net/xdp/xsk.c | 22 +++++++++++----------- net/xdp/xsk_queue.h | 22 ++++++++++------------ 2 files changed, 21 insertions(+), 23 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 5b4ce6ba1bc7..639b2c3beb69 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -355,16 +355,15 @@ static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, u32 max_entr return nb_pkts; } -u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max_entries) +u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 nb_pkts) { struct xdp_sock *xs; - u32 nb_pkts; rcu_read_lock(); if (!list_is_singular(&pool->xsk_tx_list)) { /* Fallback to the non-batched version */ rcu_read_unlock(); - return xsk_tx_peek_release_fallback(pool, max_entries); + return xsk_tx_peek_release_fallback(pool, nb_pkts); } xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list); @@ -373,12 +372,7 @@ u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max_entries) goto out; } - max_entries = xskq_cons_nb_entries(xs->tx, max_entries); - nb_pkts = xskq_cons_read_desc_batch(xs->tx, pool, max_entries); - if (!nb_pkts) { - xs->tx->queue_empty_descs++; - goto out; - } + nb_pkts = xskq_cons_nb_entries(xs->tx, nb_pkts); /* This is the backpressure mechanism for the Tx path. Try to * reserve space in the completion queue for all packets, but @@ -386,12 +380,18 @@ u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max_entries) * packets. This avoids having to implement any buffering in * the Tx path. */ - nb_pkts = xskq_prod_reserve_addr_batch(pool->cq, pool->tx_descs, nb_pkts); + nb_pkts = xskq_prod_nb_free(pool->cq, nb_pkts); if (!nb_pkts) goto out; - xskq_cons_release_n(xs->tx, max_entries); + nb_pkts = xskq_cons_read_desc_batch(xs->tx, pool, nb_pkts); + if (!nb_pkts) { + xs->tx->queue_empty_descs++; + goto out; + } + __xskq_cons_release(xs->tx); + xskq_prod_write_addr_batch(pool->cq, pool->tx_descs, nb_pkts); xs->sk.sk_write_space(&xs->sk); out: diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index fb20bf7207cf..c6fb6b763658 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -205,6 +205,11 @@ static inline bool xskq_cons_read_desc(struct xsk_queue *q, return false; } +static inline void xskq_cons_release_n(struct xsk_queue *q, u32 cnt) +{ + q->cached_cons += cnt; +} + static inline u32 xskq_cons_read_desc_batch(struct xsk_queue *q, struct xsk_buff_pool *pool, u32 max) { @@ -226,6 +231,8 @@ static inline u32 xskq_cons_read_desc_batch(struct xsk_queue *q, struct xsk_buff cached_cons++; } + /* Release valid plus any invalid entries */ + xskq_cons_release_n(q, cached_cons - q->cached_cons); return nb_entries; } @@ -291,11 +298,6 @@ static inline void xskq_cons_release(struct xsk_queue *q) q->cached_cons++; } -static inline void xskq_cons_release_n(struct xsk_queue *q, u32 cnt) -{ - q->cached_cons += cnt; -} - static inline u32 xskq_cons_present_entries(struct xsk_queue *q) { /* No barriers needed since data is not accessed */ @@ -350,21 +352,17 @@ static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr) return 0; } -static inline u32 xskq_prod_reserve_addr_batch(struct xsk_queue *q, struct xdp_desc *descs, - u32 max) +static inline void xskq_prod_write_addr_batch(struct xsk_queue *q, struct xdp_desc *descs, + u32 nb_entries) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - u32 nb_entries, i, cached_prod; - - nb_entries = xskq_prod_nb_free(q, max); + u32 i, cached_prod; /* A, matches D */ cached_prod = q->cached_prod; for (i = 0; i < nb_entries; i++) ring->desc[cached_prod++ & q->ring_mask] = descs[i].addr; q->cached_prod = cached_prod; - - return nb_entries; } static inline int xskq_prod_reserve_desc(struct xsk_queue *q, From 8a7d61bdc2fac2c460a2f32a062f5c6dbd21a764 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Tue, 30 Aug 2022 15:39:05 +0200 Subject: [PATCH 03/10] selftests/xsk: Add missing close() on netns fd Commit 1034b03e54ac ("selftests: xsk: Simplify cleanup of ifobjects") removed close on netns fd, which is not correct, so let us restore it. Fixes: 1034b03e54ac ("selftests: xsk: Simplify cleanup of ifobjects") Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20220830133905.9945-1-maciej.fijalkowski@intel.com --- tools/testing/selftests/bpf/xskxceiver.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c index 74d56d971baf..091402dc5390 100644 --- a/tools/testing/selftests/bpf/xskxceiver.c +++ b/tools/testing/selftests/bpf/xskxceiver.c @@ -1606,6 +1606,8 @@ static struct ifobject *ifobject_create(void) if (!ifobj->umem) goto out_umem; + ifobj->ns_fd = -1; + return ifobj; out_umem: @@ -1617,6 +1619,8 @@ out_xsk_arr: static void ifobject_delete(struct ifobject *ifobj) { + if (ifobj->ns_fd != -1) + close(ifobj->ns_fd); free(ifobj->umem); free(ifobj->xsk_arr); free(ifobj); From a37a32583e282d8d815e22add29bc1e91e19951a Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Sat, 10 Sep 2022 11:01:20 +0000 Subject: [PATCH 04/10] bpf: btf: fix truncated last_member_type_id in btf_struct_resolve When trying to finish resolving a struct member, btf_struct_resolve saves the member type id in a u16 temporary variable. This truncates the 32 bit type id value if it exceeds UINT16_MAX. As a result, structs that have members with type ids > UINT16_MAX and which need resolution will fail with a message like this: [67414] STRUCT ff_device size=120 vlen=12 effect_owners type_id=67434 bits_offset=960 Member exceeds struct_size Fix this by changing the type of last_member_type_id to u32. Fixes: a0791f0df7d2 ("bpf: fix BTF limits") Reviewed-by: Stanislav Fomichev Signed-off-by: Lorenz Bauer Link: https://lore.kernel.org/r/20220910110120.339242-1-oss@lmb.io Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 7e64447659f3..36fd4b509294 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3128,7 +3128,7 @@ static int btf_struct_resolve(struct btf_verifier_env *env, if (v->next_member) { const struct btf_type *last_member_type; const struct btf_member *last_member; - u16 last_member_type_id; + u32 last_member_type_id; last_member = btf_type_member(v->t) + v->next_member - 1; last_member_type_id = last_member->type; From 83c10cc362d91c0d8d25e60779ee52fdbbf3894d Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Mon, 12 Sep 2022 14:38:55 +0100 Subject: [PATCH 05/10] bpf: Ensure correct locking around vulnerable function find_vpid() The documentation for find_vpid() clearly states: "Must be called with the tasklist_lock or rcu_read_lock() held." Presently we do neither for find_vpid() instance in bpf_task_fd_query(). Add proper rcu_read_lock/unlock() to fix the issue. Fixes: 41bdc4b40ed6f ("bpf: introduce bpf subcommand BPF_TASK_FD_QUERY") Signed-off-by: Lee Jones Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220912133855.1218900-1-lee@kernel.org --- kernel/bpf/syscall.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 27760627370d..1bd18af8af83 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4395,7 +4395,9 @@ static int bpf_task_fd_query(const union bpf_attr *attr, if (attr->task_fd_query.flags != 0) return -EINVAL; + rcu_read_lock(); task = get_pid_task(find_vpid(pid), PIDTYPE_PID); + rcu_read_unlock(); if (!task) return -ENOENT; From 0e426a3ae030a9e891899370229e117158b35de6 Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Wed, 21 Sep 2022 10:46:02 +0000 Subject: [PATCH 06/10] bpf, cgroup: Reject prog_attach_flags array when effective query Attach flags is only valid for attached progs of this layer cgroup, but not for effective progs. For querying with EFFECTIVE flags, exporting attach flags does not make sense. So when effective query, we reject prog_attach_flags array and don't need to populate it. Also we limit attach_flags to output 0 during effective query. Fixes: b79c9fc9551b ("bpf: implement BPF_PROG_QUERY for BPF_LSM_CGROUP") Signed-off-by: Pu Lehui Link: https://lore.kernel.org/r/20220921104604.2340580-2-pulehui@huaweicloud.com Signed-off-by: Martin KaFai Lau --- include/uapi/linux/bpf.h | 7 +++++-- kernel/bpf/cgroup.c | 28 ++++++++++++++++++---------- tools/include/uapi/linux/bpf.h | 7 +++++-- 3 files changed, 28 insertions(+), 14 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 59a217ca2dfd..4eff7fc7ae58 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1233,7 +1233,7 @@ enum { /* Query effective (directly attached + inherited from ancestor cgroups) * programs that will be executed for events within a cgroup. - * attach_flags with this flag are returned only for directly attached programs. + * attach_flags with this flag are always returned 0. */ #define BPF_F_QUERY_EFFECTIVE (1U << 0) @@ -1432,7 +1432,10 @@ union bpf_attr { __u32 attach_flags; __aligned_u64 prog_ids; __u32 prog_cnt; - __aligned_u64 prog_attach_flags; /* output: per-program attach_flags */ + /* output: per-program attach_flags. + * not allowed to be set during effective query. + */ + __aligned_u64 prog_attach_flags; } query; struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */ diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 4a400cd63731..22888aaa68b6 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1020,6 +1020,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, union bpf_attr __user *uattr) { __u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags); + bool effective_query = attr->query.query_flags & BPF_F_QUERY_EFFECTIVE; __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); enum bpf_attach_type type = attr->query.attach_type; enum cgroup_bpf_attach_type from_atype, to_atype; @@ -1029,8 +1030,12 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, int total_cnt = 0; u32 flags; + if (effective_query && prog_attach_flags) + return -EINVAL; + if (type == BPF_LSM_CGROUP) { - if (attr->query.prog_cnt && prog_ids && !prog_attach_flags) + if (!effective_query && attr->query.prog_cnt && + prog_ids && !prog_attach_flags) return -EINVAL; from_atype = CGROUP_LSM_START; @@ -1045,7 +1050,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, } for (atype = from_atype; atype <= to_atype; atype++) { - if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { + if (effective_query) { effective = rcu_dereference_protected(cgrp->bpf.effective[atype], lockdep_is_held(&cgroup_mutex)); total_cnt += bpf_prog_array_length(effective); @@ -1054,6 +1059,8 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, } } + /* always output uattr->query.attach_flags as 0 during effective query */ + flags = effective_query ? 0 : flags; if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) return -EFAULT; if (copy_to_user(&uattr->query.prog_cnt, &total_cnt, sizeof(total_cnt))) @@ -1068,7 +1075,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, } for (atype = from_atype; atype <= to_atype && total_cnt; atype++) { - if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { + if (effective_query) { effective = rcu_dereference_protected(cgrp->bpf.effective[atype], lockdep_is_held(&cgroup_mutex)); cnt = min_t(int, bpf_prog_array_length(effective), total_cnt); @@ -1090,15 +1097,16 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, if (++i == cnt) break; } - } - if (prog_attach_flags) { - flags = cgrp->bpf.flags[atype]; + if (prog_attach_flags) { + flags = cgrp->bpf.flags[atype]; - for (i = 0; i < cnt; i++) - if (copy_to_user(prog_attach_flags + i, &flags, sizeof(flags))) - return -EFAULT; - prog_attach_flags += cnt; + for (i = 0; i < cnt; i++) + if (copy_to_user(prog_attach_flags + i, + &flags, sizeof(flags))) + return -EFAULT; + prog_attach_flags += cnt; + } } prog_ids += cnt; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 59a217ca2dfd..4eff7fc7ae58 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1233,7 +1233,7 @@ enum { /* Query effective (directly attached + inherited from ancestor cgroups) * programs that will be executed for events within a cgroup. - * attach_flags with this flag are returned only for directly attached programs. + * attach_flags with this flag are always returned 0. */ #define BPF_F_QUERY_EFFECTIVE (1U << 0) @@ -1432,7 +1432,10 @@ union bpf_attr { __u32 attach_flags; __aligned_u64 prog_ids; __u32 prog_cnt; - __aligned_u64 prog_attach_flags; /* output: per-program attach_flags */ + /* output: per-program attach_flags. + * not allowed to be set during effective query. + */ + __aligned_u64 prog_attach_flags; } query; struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */ From bdcee1b0b0834d031c76a12209840afe949b048a Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Wed, 21 Sep 2022 10:46:03 +0000 Subject: [PATCH 07/10] bpftool: Fix wrong cgroup attach flags being assigned to effective progs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When root-cgroup attach multi progs and sub-cgroup attach a override prog, bpftool will display incorrectly for the attach flags of the sub-cgroup’s effective progs: $ bpftool cgroup tree /sys/fs/cgroup effective CgroupPath ID AttachType AttachFlags Name /sys/fs/cgroup 6 cgroup_sysctl multi sysctl_tcp_mem 13 cgroup_sysctl multi sysctl_tcp_mem /sys/fs/cgroup/cg1 20 cgroup_sysctl override sysctl_tcp_mem 6 cgroup_sysctl override sysctl_tcp_mem <- wrong 13 cgroup_sysctl override sysctl_tcp_mem <- wrong /sys/fs/cgroup/cg1/cg2 20 cgroup_sysctl sysctl_tcp_mem 6 cgroup_sysctl sysctl_tcp_mem 13 cgroup_sysctl sysctl_tcp_mem Attach flags is only valid for attached progs of this layer cgroup, but not for effective progs. For querying with EFFECTIVE flags, exporting attach flags does not make sense. So let's remove the AttachFlags field and the associated logic. After this patch, the above effective cgroup tree will show as bellow: $ bpftool cgroup tree /sys/fs/cgroup effective CgroupPath ID AttachType Name /sys/fs/cgroup 6 cgroup_sysctl sysctl_tcp_mem 13 cgroup_sysctl sysctl_tcp_mem /sys/fs/cgroup/cg1 20 cgroup_sysctl sysctl_tcp_mem 6 cgroup_sysctl sysctl_tcp_mem 13 cgroup_sysctl sysctl_tcp_mem /sys/fs/cgroup/cg1/cg2 20 cgroup_sysctl sysctl_tcp_mem 6 cgroup_sysctl sysctl_tcp_mem 13 cgroup_sysctl sysctl_tcp_mem Fixes: b79c9fc9551b ("bpf: implement BPF_PROG_QUERY for BPF_LSM_CGROUP") Fixes: a98bf57391a2 ("tools: bpftool: add support for reporting the effective cgroup progs") Signed-off-by: Pu Lehui Link: https://lore.kernel.org/r/20220921104604.2340580-3-pulehui@huaweicloud.com Signed-off-by: Martin KaFai Lau --- tools/bpf/bpftool/cgroup.c | 54 ++++++++++++++++++++++++++++++++++---- 1 file changed, 49 insertions(+), 5 deletions(-) diff --git a/tools/bpf/bpftool/cgroup.c b/tools/bpf/bpftool/cgroup.c index cced668fb2a3..b46a998d8f8d 100644 --- a/tools/bpf/bpftool/cgroup.c +++ b/tools/bpf/bpftool/cgroup.c @@ -136,8 +136,8 @@ static int show_bpf_prog(int id, enum bpf_attach_type attach_type, jsonw_string_field(json_wtr, "attach_type", attach_type_str); else jsonw_uint_field(json_wtr, "attach_type", attach_type); - jsonw_string_field(json_wtr, "attach_flags", - attach_flags_str); + if (!(query_flags & BPF_F_QUERY_EFFECTIVE)) + jsonw_string_field(json_wtr, "attach_flags", attach_flags_str); jsonw_string_field(json_wtr, "name", prog_name); if (attach_btf_name) jsonw_string_field(json_wtr, "attach_btf_name", attach_btf_name); @@ -150,7 +150,10 @@ static int show_bpf_prog(int id, enum bpf_attach_type attach_type, printf("%-15s", attach_type_str); else printf("type %-10u", attach_type); - printf(" %-15s %-15s", attach_flags_str, prog_name); + if (query_flags & BPF_F_QUERY_EFFECTIVE) + printf(" %-15s", prog_name); + else + printf(" %-15s %-15s", attach_flags_str, prog_name); if (attach_btf_name) printf(" %-15s", attach_btf_name); else if (info.attach_btf_id) @@ -195,6 +198,32 @@ static int cgroup_has_attached_progs(int cgroup_fd) return no_prog ? 0 : 1; } + +static int show_effective_bpf_progs(int cgroup_fd, enum bpf_attach_type type, + int level) +{ + LIBBPF_OPTS(bpf_prog_query_opts, p); + __u32 prog_ids[1024] = {0}; + __u32 iter; + int ret; + + p.query_flags = query_flags; + p.prog_cnt = ARRAY_SIZE(prog_ids); + p.prog_ids = prog_ids; + + ret = bpf_prog_query_opts(cgroup_fd, type, &p); + if (ret) + return ret; + + if (p.prog_cnt == 0) + return 0; + + for (iter = 0; iter < p.prog_cnt; iter++) + show_bpf_prog(prog_ids[iter], type, NULL, level); + + return 0; +} + static int show_attached_bpf_progs(int cgroup_fd, enum bpf_attach_type type, int level) { @@ -245,6 +274,14 @@ static int show_attached_bpf_progs(int cgroup_fd, enum bpf_attach_type type, return 0; } +static int show_bpf_progs(int cgroup_fd, enum bpf_attach_type type, + int level) +{ + return query_flags & BPF_F_QUERY_EFFECTIVE ? + show_effective_bpf_progs(cgroup_fd, type, level) : + show_attached_bpf_progs(cgroup_fd, type, level); +} + static int do_show(int argc, char **argv) { enum bpf_attach_type type; @@ -292,6 +329,8 @@ static int do_show(int argc, char **argv) if (json_output) jsonw_start_array(json_wtr); + else if (query_flags & BPF_F_QUERY_EFFECTIVE) + printf("%-8s %-15s %-15s\n", "ID", "AttachType", "Name"); else printf("%-8s %-15s %-15s %-15s\n", "ID", "AttachType", "AttachFlags", "Name"); @@ -304,7 +343,7 @@ static int do_show(int argc, char **argv) * If we were able to get the show for at least one * attach type, let's return 0. */ - if (show_attached_bpf_progs(cgroup_fd, type, 0) == 0) + if (show_bpf_progs(cgroup_fd, type, 0) == 0) ret = 0; } @@ -362,7 +401,7 @@ static int do_show_tree_fn(const char *fpath, const struct stat *sb, btf_vmlinux = libbpf_find_kernel_btf(); for (type = 0; type < __MAX_BPF_ATTACH_TYPE; type++) - show_attached_bpf_progs(cgroup_fd, type, ftw->level); + show_bpf_progs(cgroup_fd, type, ftw->level); if (errno == EINVAL) /* Last attach type does not support query. @@ -436,6 +475,11 @@ static int do_show_tree(int argc, char **argv) if (json_output) jsonw_start_array(json_wtr); + else if (query_flags & BPF_F_QUERY_EFFECTIVE) + printf("%s\n" + "%-8s %-15s %-15s\n", + "CgroupPath", + "ID", "AttachType", "Name"); else printf("%s\n" "%-8s %-15s %-15s %-15s\n", From d2aa993b7d9de6deeb1df6c9a6b9b6193c337cc6 Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Wed, 21 Sep 2022 10:46:04 +0000 Subject: [PATCH 08/10] selftests/bpf: Adapt cgroup effective query uapi change The attach flags is meaningless for effective query and its value will always be set as 0 during effective query. Root cg's effective progs is always its attached progs, so we use non-effective query to get its progs count and attach flags. And we don't need the remain attach flags check. Fixes: b79c9fc9551b ("bpf: implement BPF_PROG_QUERY for BPF_LSM_CGROUP") Signed-off-by: Pu Lehui Link: https://lore.kernel.org/r/20220921104604.2340580-4-pulehui@huaweicloud.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/prog_tests/cgroup_link.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_link.c b/tools/testing/selftests/bpf/prog_tests/cgroup_link.c index 9e6e6aad347c..15093a69510e 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_link.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_link.c @@ -71,10 +71,9 @@ void serial_test_cgroup_link(void) ping_and_check(cg_nr, 0); - /* query the number of effective progs and attach flags in root cg */ + /* query the number of attached progs and attach flags in root cg */ err = bpf_prog_query(cgs[0].fd, BPF_CGROUP_INET_EGRESS, - BPF_F_QUERY_EFFECTIVE, &attach_flags, NULL, - &prog_cnt); + 0, &attach_flags, NULL, &prog_cnt); CHECK_FAIL(err); CHECK_FAIL(attach_flags != BPF_F_ALLOW_MULTI); if (CHECK(prog_cnt != 1, "effect_cnt", "exp %d, got %d\n", 1, prog_cnt)) @@ -85,17 +84,15 @@ void serial_test_cgroup_link(void) BPF_F_QUERY_EFFECTIVE, NULL, NULL, &prog_cnt); CHECK_FAIL(err); - CHECK_FAIL(attach_flags != BPF_F_ALLOW_MULTI); if (CHECK(prog_cnt != cg_nr, "effect_cnt", "exp %d, got %d\n", cg_nr, prog_cnt)) goto cleanup; /* query the effective prog IDs in last cg */ err = bpf_prog_query(cgs[last_cg].fd, BPF_CGROUP_INET_EGRESS, - BPF_F_QUERY_EFFECTIVE, &attach_flags, - prog_ids, &prog_cnt); + BPF_F_QUERY_EFFECTIVE, NULL, prog_ids, + &prog_cnt); CHECK_FAIL(err); - CHECK_FAIL(attach_flags != BPF_F_ALLOW_MULTI); if (CHECK(prog_cnt != cg_nr, "effect_cnt", "exp %d, got %d\n", cg_nr, prog_cnt)) goto cleanup; From 8addbfc7b308d591f8a5f2f6bb24d08d9d79dfbb Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 21 Sep 2022 16:35:50 +0200 Subject: [PATCH 09/10] bpf: Gate dynptr API behind CAP_BPF This has been enabled for unprivileged programs for only one kernel release, hence the expected annoyances due to this move are low. Users using ringbuf can stick to non-dynptr APIs. The actual use cases dynptr is meant to serve may not make sense in unprivileged BPF programs. Hence, gate these helpers behind CAP_BPF and limit use to privileged BPF programs. Fixes: 263ae152e962 ("bpf: Add bpf_dynptr_from_mem for local dynptrs") Fixes: bc34dee65a65 ("bpf: Dynptr support for ring buffers") Fixes: 13bbbfbea759 ("bpf: Add bpf_dynptr_read and bpf_dynptr_write") Fixes: 34d4ef5775f7 ("bpf: Add dynptr data slices") Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220921143550.30247-1-memxor@gmail.com Acked-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 1f961f9982d2..3814b0fd3a2c 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1627,26 +1627,12 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ringbuf_discard_proto; case BPF_FUNC_ringbuf_query: return &bpf_ringbuf_query_proto; - case BPF_FUNC_ringbuf_reserve_dynptr: - return &bpf_ringbuf_reserve_dynptr_proto; - case BPF_FUNC_ringbuf_submit_dynptr: - return &bpf_ringbuf_submit_dynptr_proto; - case BPF_FUNC_ringbuf_discard_dynptr: - return &bpf_ringbuf_discard_dynptr_proto; case BPF_FUNC_for_each_map_elem: return &bpf_for_each_map_elem_proto; case BPF_FUNC_loop: return &bpf_loop_proto; case BPF_FUNC_strncmp: return &bpf_strncmp_proto; - case BPF_FUNC_dynptr_from_mem: - return &bpf_dynptr_from_mem_proto; - case BPF_FUNC_dynptr_read: - return &bpf_dynptr_read_proto; - case BPF_FUNC_dynptr_write: - return &bpf_dynptr_write_proto; - case BPF_FUNC_dynptr_data: - return &bpf_dynptr_data_proto; default: break; } @@ -1675,6 +1661,20 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_timer_cancel_proto; case BPF_FUNC_kptr_xchg: return &bpf_kptr_xchg_proto; + case BPF_FUNC_ringbuf_reserve_dynptr: + return &bpf_ringbuf_reserve_dynptr_proto; + case BPF_FUNC_ringbuf_submit_dynptr: + return &bpf_ringbuf_submit_dynptr_proto; + case BPF_FUNC_ringbuf_discard_dynptr: + return &bpf_ringbuf_discard_dynptr_proto; + case BPF_FUNC_dynptr_from_mem: + return &bpf_dynptr_from_mem_proto; + case BPF_FUNC_dynptr_read: + return &bpf_dynptr_read_proto; + case BPF_FUNC_dynptr_write: + return &bpf_dynptr_write_proto; + case BPF_FUNC_dynptr_data: + return &bpf_dynptr_data_proto; default: break; } From 60240bc26114543fcbfcd8a28466e67e77b20388 Mon Sep 17 00:00:00 2001 From: Jalal Mostafa Date: Wed, 21 Sep 2022 13:57:01 +0000 Subject: [PATCH 10/10] xsk: Inherit need_wakeup flag for shared sockets The flag for need_wakeup is not set for xsks with `XDP_SHARED_UMEM` flag and of different queue ids and/or devices. They should inherit the flag from the first socket buffer pool since no flags can be specified once `XDP_SHARED_UMEM` is specified. Fixes: b5aea28dca134 ("xsk: Add shared umem support between queue ids") Signed-off-by: Jalal Mostafa Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20220921135701.10199-1-jalal.a.mostapha@gmail.com --- include/net/xsk_buff_pool.h | 2 +- net/xdp/xsk.c | 4 ++-- net/xdp/xsk_buff_pool.c | 5 +++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index 647722e847b4..f787c3f524b0 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -95,7 +95,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, struct xdp_umem *umem); int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *dev, u16 queue_id, u16 flags); -int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_umem *umem, +int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_sock *umem_xs, struct net_device *dev, u16 queue_id); int xp_alloc_tx_descs(struct xsk_buff_pool *pool, struct xdp_sock *xs); void xp_destroy(struct xsk_buff_pool *pool); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 639b2c3beb69..9f0561b67c12 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -954,8 +954,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) goto out_unlock; } - err = xp_assign_dev_shared(xs->pool, umem_xs->umem, - dev, qid); + err = xp_assign_dev_shared(xs->pool, umem_xs, dev, + qid); if (err) { xp_destroy(xs->pool); xs->pool = NULL; diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index a71a8c6edf55..ed6c71826d31 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -212,17 +212,18 @@ err_unreg_pool: return err; } -int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_umem *umem, +int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_sock *umem_xs, struct net_device *dev, u16 queue_id) { u16 flags; + struct xdp_umem *umem = umem_xs->umem; /* One fill and completion ring required for each queue id. */ if (!pool->fq || !pool->cq) return -EINVAL; flags = umem->zc ? XDP_ZEROCOPY : XDP_COPY; - if (pool->uses_need_wakeup) + if (umem_xs->pool->uses_need_wakeup) flags |= XDP_USE_NEED_WAKEUP; return xp_assign_dev(pool, dev, queue_id, flags);