From 394e6869f0185e89cb815db29bf819474df858ae Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 20 Oct 2023 21:31:56 +0800 Subject: [PATCH 1/7] mm/percpu.c: don't acquire pcpu_lock for pcpu_chunk_addr_search() There is no need to acquire pcpu_lock for pcpu_chunk_addr_search(): 1) both pcpu_first_chunk & pcpu_reserved_chunk must have been initialized before the invocation of free_percpu(). 2) The dynamically-created chunk must be valid before the per-cpu pointers allocated from it are freed. So acquire pcpu_lock() after the invocation of pcpu_chunk_addr_search(). Acked-by: Dennis Zhou Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20231020133202.4043247-2-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- mm/percpu.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index a7665de8485f..ea607078368d 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -2267,12 +2267,10 @@ void free_percpu(void __percpu *ptr) kmemleak_free_percpu(ptr); addr = __pcpu_ptr_to_addr(ptr); - - spin_lock_irqsave(&pcpu_lock, flags); - chunk = pcpu_chunk_addr_search(addr); off = addr - chunk->base_addr; + spin_lock_irqsave(&pcpu_lock, flags); size = pcpu_free_area(chunk, off); pcpu_memcg_free_hook(chunk, off, size); From b460bc8302f222d346f0c15bba980eb8c36d6278 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 20 Oct 2023 21:31:57 +0800 Subject: [PATCH 2/7] mm/percpu.c: introduce pcpu_alloc_size() Introduce pcpu_alloc_size() to get the size of the dynamic per-cpu area. It will be used by bpf memory allocator in the following patches. BPF memory allocator maintains per-cpu area caches for multiple area sizes and its free API only has the to-be-freed per-cpu pointer, so it needs the size of dynamic per-cpu area to select the corresponding cache when bpf program frees the dynamic per-cpu pointer. Acked-by: Dennis Zhou Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20231020133202.4043247-3-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/percpu.h | 1 + mm/percpu.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 68fac2e7cbe6..8c677f185901 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -132,6 +132,7 @@ extern void __init setup_per_cpu_areas(void); extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) __alloc_size(1); extern void __percpu *__alloc_percpu(size_t size, size_t align) __alloc_size(1); extern void free_percpu(void __percpu *__pdata); +extern size_t pcpu_alloc_size(void __percpu *__pdata); DEFINE_FREE(free_percpu, void __percpu *, free_percpu(_T)) diff --git a/mm/percpu.c b/mm/percpu.c index ea607078368d..60ed078e4cd0 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -2244,6 +2244,37 @@ static void pcpu_balance_workfn(struct work_struct *work) mutex_unlock(&pcpu_alloc_mutex); } +/** + * pcpu_alloc_size - the size of the dynamic percpu area + * @ptr: pointer to the dynamic percpu area + * + * Returns the size of the @ptr allocation. This is undefined for statically + * defined percpu variables as there is no corresponding chunk->bound_map. + * + * RETURNS: + * The size of the dynamic percpu area. + * + * CONTEXT: + * Can be called from atomic context. + */ +size_t pcpu_alloc_size(void __percpu *ptr) +{ + struct pcpu_chunk *chunk; + unsigned long bit_off, end; + void *addr; + + if (!ptr) + return 0; + + addr = __pcpu_ptr_to_addr(ptr); + /* No pcpu_lock here: ptr has not been freed, so chunk is still alive */ + chunk = pcpu_chunk_addr_search(addr); + bit_off = (addr - chunk->base_addr) / PCPU_MIN_ALLOC_SIZE; + end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk), + bit_off + 1); + return (end - bit_off) * PCPU_MIN_ALLOC_SIZE; +} + /** * free_percpu - free percpu area * @ptr: pointer to area to free From baa8fdecd87bb8751237b45e3bcb5a179e5a12ca Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 20 Oct 2023 21:31:58 +0800 Subject: [PATCH 3/7] bpf: Re-enable unit_size checking for global per-cpu allocator With pcpu_alloc_size() in place, check whether or not the size of the dynamic per-cpu area is matched with unit_size. Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20231020133202.4043247-4-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/memalloc.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 39ea316c55e7..776bdf5ffd80 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -491,21 +491,17 @@ static int check_obj_size(struct bpf_mem_cache *c, unsigned int idx) struct llist_node *first; unsigned int obj_size; - /* For per-cpu allocator, the size of free objects in free list doesn't - * match with unit_size and now there is no way to get the size of - * per-cpu pointer saved in free object, so just skip the checking. - */ - if (c->percpu_size) - return 0; - first = c->free_llist.first; if (!first) return 0; - obj_size = ksize(first); + if (c->percpu_size) + obj_size = pcpu_alloc_size(((void **)first)[1]); + else + obj_size = ksize(first); if (obj_size != c->unit_size) { - WARN_ONCE(1, "bpf_mem_cache[%u]: unexpected object size %u, expect %u\n", - idx, obj_size, c->unit_size); + WARN_ONCE(1, "bpf_mem_cache[%u]: percpu %d, unexpected object size %u, expect %u\n", + idx, c->percpu_size, obj_size, c->unit_size); return -EINVAL; } return 0; @@ -973,6 +969,12 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags) return !ret ? NULL : ret + LLIST_NODE_SZ; } +/* The alignment of dynamic per-cpu area is 8, so c->unit_size and the + * actual size of dynamic per-cpu area will always be matched and there is + * no need to adjust size_index for per-cpu allocation. However for the + * simplicity of the implementation, use an unified size_index for both + * kmalloc and per-cpu allocation. + */ static __init int bpf_mem_cache_adjust_size(void) { unsigned int size; From 3f2189e4f77b7a3e979d143dc4ff586488c7e8a5 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 20 Oct 2023 21:31:59 +0800 Subject: [PATCH 4/7] bpf: Use pcpu_alloc_size() in bpf_mem_free{_rcu}() For bpf_global_percpu_ma, the pointer passed to bpf_mem_free_rcu() is allocated by kmalloc() and its size is fixed (16-bytes on x86-64). So no matter which cache allocates the dynamic per-cpu area, on x86-64 cache[2] will always be used to free the per-cpu area. Fix the unbalance by checking whether the bpf memory allocator is per-cpu or not and use pcpu_alloc_size() instead of ksize() to find the correct cache for per-cpu free. Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20231020133202.4043247-5-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_mem_alloc.h | 1 + kernel/bpf/memalloc.c | 16 ++++++++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h index d644bbb298af..bb1223b21308 100644 --- a/include/linux/bpf_mem_alloc.h +++ b/include/linux/bpf_mem_alloc.h @@ -11,6 +11,7 @@ struct bpf_mem_caches; struct bpf_mem_alloc { struct bpf_mem_caches __percpu *caches; struct bpf_mem_cache __percpu *cache; + bool percpu; struct work_struct work; }; diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 776bdf5ffd80..5308e386380a 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -525,6 +525,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) /* room for llist_node and per-cpu pointer */ if (percpu) percpu_size = LLIST_NODE_SZ + sizeof(void *); + ma->percpu = percpu; if (size) { pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL); @@ -874,6 +875,17 @@ void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size) return !ret ? NULL : ret + LLIST_NODE_SZ; } +static notrace int bpf_mem_free_idx(void *ptr, bool percpu) +{ + size_t size; + + if (percpu) + size = pcpu_alloc_size(*((void **)ptr)); + else + size = ksize(ptr - LLIST_NODE_SZ); + return bpf_mem_cache_idx(size); +} + void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr) { int idx; @@ -881,7 +893,7 @@ void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr) if (!ptr) return; - idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ)); + idx = bpf_mem_free_idx(ptr, ma->percpu); if (idx < 0) return; @@ -895,7 +907,7 @@ void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr) if (!ptr) return; - idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ)); + idx = bpf_mem_free_idx(ptr, ma->percpu); if (idx < 0) return; From e581a3461de3f129cfe888a67d9f31086328271f Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 20 Oct 2023 21:32:00 +0800 Subject: [PATCH 5/7] bpf: Move the declaration of __bpf_obj_drop_impl() to bpf.h both syscall.c and helpers.c have the declaration of __bpf_obj_drop_impl(), so just move it to a common header file. Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20231020133202.4043247-6-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + kernel/bpf/helpers.c | 2 -- kernel/bpf/syscall.c | 2 -- 3 files changed, 1 insertion(+), 4 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b4b40b45962b..ebd412179771 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2058,6 +2058,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec); bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b); void bpf_obj_free_timer(const struct btf_record *rec, void *obj); void bpf_obj_free_fields(const struct btf_record *rec, void *obj); +void __bpf_obj_drop_impl(void *p, const struct btf_record *rec); struct bpf_map *bpf_map_get(u32 ufd); struct bpf_map *bpf_map_get_with_uref(u32 ufd); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index da058aead20c..c814bb44d2d1 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1811,8 +1811,6 @@ bpf_base_func_proto(enum bpf_func_id func_id) } } -void __bpf_obj_drop_impl(void *p, const struct btf_record *rec); - void bpf_list_head_free(const struct btf_field *field, void *list_head, struct bpf_spin_lock *spin_lock) { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 341f8cb4405c..69998f84f7c8 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -626,8 +626,6 @@ void bpf_obj_free_timer(const struct btf_record *rec, void *obj) bpf_timer_cancel_and_free(obj + rec->timer_off); } -extern void __bpf_obj_drop_impl(void *p, const struct btf_record *rec); - void bpf_obj_free_fields(const struct btf_record *rec, void *obj) { const struct btf_field *fields; From e383a45902337356d9ccad797094a27c6b2150f9 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 20 Oct 2023 21:32:01 +0800 Subject: [PATCH 6/7] bpf: Use bpf_global_percpu_ma for per-cpu kptr in __bpf_obj_drop_impl() The following warning was reported when running "./test_progs -t test_bpf_ma/percpu_free_through_map_free": ------------[ cut here ]------------ WARNING: CPU: 1 PID: 68 at kernel/bpf/memalloc.c:342 CPU: 1 PID: 68 Comm: kworker/u16:2 Not tainted 6.6.0-rc2+ #222 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) Workqueue: events_unbound bpf_map_free_deferred RIP: 0010:bpf_mem_refill+0x21c/0x2a0 ...... Call Trace: ? bpf_mem_refill+0x21c/0x2a0 irq_work_single+0x27/0x70 irq_work_run_list+0x2a/0x40 irq_work_run+0x18/0x40 __sysvec_irq_work+0x1c/0xc0 sysvec_irq_work+0x73/0x90 asm_sysvec_irq_work+0x1b/0x20 RIP: 0010:unit_free+0x50/0x80 ...... bpf_mem_free+0x46/0x60 __bpf_obj_drop_impl+0x40/0x90 bpf_obj_free_fields+0x17d/0x1a0 array_map_free+0x6b/0x170 bpf_map_free_deferred+0x54/0xa0 process_scheduled_works+0xba/0x370 worker_thread+0x16d/0x2e0 kthread+0x105/0x140 ret_from_fork+0x39/0x60 ret_from_fork_asm+0x1b/0x30 ---[ end trace 0000000000000000 ]--- The reason is simple: __bpf_obj_drop_impl() does not know the freeing field is a per-cpu pointer and it uses bpf_global_ma to free the pointer. Because bpf_global_ma is not a per-cpu allocator, so ksize() is used to select the corresponding cache. The bpf_mem_cache with 16-bytes unit_size will always be selected to do the unmatched free and it will trigger the warning in free_bulk() eventually. Because per-cpu kptr doesn't support list or rb-tree now, so fix the problem by only checking whether or not the type of kptr is per-cpu in bpf_obj_free_fields(), and using bpf_global_percpu_ma to these kptrs. Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20231020133202.4043247-7-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- kernel/bpf/helpers.c | 24 +++++++++++++++--------- kernel/bpf/syscall.c | 4 ++-- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ebd412179771..b4825d3cdb29 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2058,7 +2058,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec); bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b); void bpf_obj_free_timer(const struct btf_record *rec, void *obj); void bpf_obj_free_fields(const struct btf_record *rec, void *obj); -void __bpf_obj_drop_impl(void *p, const struct btf_record *rec); +void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu); struct bpf_map *bpf_map_get(u32 ufd); struct bpf_map *bpf_map_get_with_uref(u32 ufd); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index c814bb44d2d1..e46ac288a108 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1842,7 +1842,7 @@ unlock: * bpf_list_head which needs to be freed. */ migrate_disable(); - __bpf_obj_drop_impl(obj, field->graph_root.value_rec); + __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false); migrate_enable(); } } @@ -1881,7 +1881,7 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root, migrate_disable(); - __bpf_obj_drop_impl(obj, field->graph_root.value_rec); + __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false); migrate_enable(); } } @@ -1913,8 +1913,10 @@ __bpf_kfunc void *bpf_percpu_obj_new_impl(u64 local_type_id__k, void *meta__ign) } /* Must be called under migrate_disable(), as required by bpf_mem_free */ -void __bpf_obj_drop_impl(void *p, const struct btf_record *rec) +void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu) { + struct bpf_mem_alloc *ma; + if (rec && rec->refcount_off >= 0 && !refcount_dec_and_test((refcount_t *)(p + rec->refcount_off))) { /* Object is refcounted and refcount_dec didn't result in 0 @@ -1926,10 +1928,14 @@ void __bpf_obj_drop_impl(void *p, const struct btf_record *rec) if (rec) bpf_obj_free_fields(rec, p); - if (rec && rec->refcount_off >= 0) - bpf_mem_free_rcu(&bpf_global_ma, p); + if (percpu) + ma = &bpf_global_percpu_ma; else - bpf_mem_free(&bpf_global_ma, p); + ma = &bpf_global_ma; + if (rec && rec->refcount_off >= 0) + bpf_mem_free_rcu(ma, p); + else + bpf_mem_free(ma, p); } __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign) @@ -1937,7 +1943,7 @@ __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign) struct btf_struct_meta *meta = meta__ign; void *p = p__alloc; - __bpf_obj_drop_impl(p, meta ? meta->record : NULL); + __bpf_obj_drop_impl(p, meta ? meta->record : NULL, false); } __bpf_kfunc void bpf_percpu_obj_drop_impl(void *p__alloc, void *meta__ign) @@ -1981,7 +1987,7 @@ static int __bpf_list_add(struct bpf_list_node_kern *node, */ if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) { /* Only called from BPF prog, no need to migrate_disable */ - __bpf_obj_drop_impl((void *)n - off, rec); + __bpf_obj_drop_impl((void *)n - off, rec, false); return -EINVAL; } @@ -2080,7 +2086,7 @@ static int __bpf_rbtree_add(struct bpf_rb_root *root, */ if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) { /* Only called from BPF prog, no need to migrate_disable */ - __bpf_obj_drop_impl((void *)n - off, rec); + __bpf_obj_drop_impl((void *)n - off, rec, false); return -EINVAL; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 69998f84f7c8..a9b2cb500bf7 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -660,8 +660,8 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) field->kptr.btf_id); migrate_disable(); __bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ? - pointee_struct_meta->record : - NULL); + pointee_struct_meta->record : NULL, + fields[i].type == BPF_KPTR_PERCPU); migrate_enable(); } else { field->kptr.dtor(xchgd_field); From d440ba91ca4d3004709876779d40713a99e21f6a Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 20 Oct 2023 21:32:02 +0800 Subject: [PATCH 7/7] selftests/bpf: Add more test cases for bpf memory allocator Add the following 3 test cases for bpf memory allocator: 1) Do allocation in bpf program and free through map free 2) Do batch per-cpu allocation and per-cpu free in bpf program 3) Do per-cpu allocation in bpf program and free through map free For per-cpu allocation, because per-cpu allocation can not refill timely sometimes, so test 2) and test 3) consider it is OK for bpf_percpu_obj_new_impl() to return NULL. Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20231020133202.4043247-8-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/test_bpf_ma.c | 20 +- .../testing/selftests/bpf/progs/test_bpf_ma.c | 180 +++++++++++++++++- 2 files changed, 193 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/test_bpf_ma.c b/tools/testing/selftests/bpf/prog_tests/test_bpf_ma.c index 0cca4e8ae38e..d3491a84b3b9 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_bpf_ma.c +++ b/tools/testing/selftests/bpf/prog_tests/test_bpf_ma.c @@ -9,9 +9,10 @@ #include "test_bpf_ma.skel.h" -void test_test_bpf_ma(void) +static void do_bpf_ma_test(const char *name) { struct test_bpf_ma *skel; + struct bpf_program *prog; struct btf *btf; int i, err; @@ -34,6 +35,11 @@ void test_test_bpf_ma(void) skel->rodata->data_btf_ids[i] = id; } + prog = bpf_object__find_program_by_name(skel->obj, name); + if (!ASSERT_OK_PTR(prog, "invalid prog name")) + goto out; + bpf_program__set_autoload(prog, true); + err = test_bpf_ma__load(skel); if (!ASSERT_OK(err, "load")) goto out; @@ -48,3 +54,15 @@ void test_test_bpf_ma(void) out: test_bpf_ma__destroy(skel); } + +void test_test_bpf_ma(void) +{ + if (test__start_subtest("batch_alloc_free")) + do_bpf_ma_test("test_batch_alloc_free"); + if (test__start_subtest("free_through_map_free")) + do_bpf_ma_test("test_free_through_map_free"); + if (test__start_subtest("batch_percpu_alloc_free")) + do_bpf_ma_test("test_batch_percpu_alloc_free"); + if (test__start_subtest("percpu_free_through_map_free")) + do_bpf_ma_test("test_percpu_free_through_map_free"); +} diff --git a/tools/testing/selftests/bpf/progs/test_bpf_ma.c b/tools/testing/selftests/bpf/progs/test_bpf_ma.c index ecde41ae0fc8..b685a4aba6bd 100644 --- a/tools/testing/selftests/bpf/progs/test_bpf_ma.c +++ b/tools/testing/selftests/bpf/progs/test_bpf_ma.c @@ -37,10 +37,20 @@ int pid = 0; __type(key, int); \ __type(value, struct map_value_##_size); \ __uint(max_entries, 128); \ - } array_##_size SEC(".maps"); + } array_##_size SEC(".maps") -static __always_inline void batch_alloc_free(struct bpf_map *map, unsigned int batch, - unsigned int idx) +#define DEFINE_ARRAY_WITH_PERCPU_KPTR(_size) \ + struct map_value_percpu_##_size { \ + struct bin_data_##_size __percpu_kptr * data; \ + }; \ + struct { \ + __uint(type, BPF_MAP_TYPE_ARRAY); \ + __type(key, int); \ + __type(value, struct map_value_percpu_##_size); \ + __uint(max_entries, 128); \ + } array_percpu_##_size SEC(".maps") + +static __always_inline void batch_alloc(struct bpf_map *map, unsigned int batch, unsigned int idx) { struct generic_map_value *value; unsigned int i, key; @@ -65,6 +75,14 @@ static __always_inline void batch_alloc_free(struct bpf_map *map, unsigned int b return; } } +} + +static __always_inline void batch_free(struct bpf_map *map, unsigned int batch, unsigned int idx) +{ + struct generic_map_value *value; + unsigned int i, key; + void *old; + for (i = 0; i < batch; i++) { key = i; value = bpf_map_lookup_elem(map, &key); @@ -81,8 +99,72 @@ static __always_inline void batch_alloc_free(struct bpf_map *map, unsigned int b } } +static __always_inline void batch_percpu_alloc(struct bpf_map *map, unsigned int batch, + unsigned int idx) +{ + struct generic_map_value *value; + unsigned int i, key; + void *old, *new; + + for (i = 0; i < batch; i++) { + key = i; + value = bpf_map_lookup_elem(map, &key); + if (!value) { + err = 1; + return; + } + /* per-cpu allocator may not be able to refill in time */ + new = bpf_percpu_obj_new_impl(data_btf_ids[idx], NULL); + if (!new) + continue; + + old = bpf_kptr_xchg(&value->data, new); + if (old) { + bpf_percpu_obj_drop(old); + err = 2; + return; + } + } +} + +static __always_inline void batch_percpu_free(struct bpf_map *map, unsigned int batch, + unsigned int idx) +{ + struct generic_map_value *value; + unsigned int i, key; + void *old; + + for (i = 0; i < batch; i++) { + key = i; + value = bpf_map_lookup_elem(map, &key); + if (!value) { + err = 3; + return; + } + old = bpf_kptr_xchg(&value->data, NULL); + if (!old) + continue; + bpf_percpu_obj_drop(old); + } +} + +#define CALL_BATCH_ALLOC(size, batch, idx) \ + batch_alloc((struct bpf_map *)(&array_##size), batch, idx) + #define CALL_BATCH_ALLOC_FREE(size, batch, idx) \ - batch_alloc_free((struct bpf_map *)(&array_##size), batch, idx) + do { \ + batch_alloc((struct bpf_map *)(&array_##size), batch, idx); \ + batch_free((struct bpf_map *)(&array_##size), batch, idx); \ + } while (0) + +#define CALL_BATCH_PERCPU_ALLOC(size, batch, idx) \ + batch_percpu_alloc((struct bpf_map *)(&array_percpu_##size), batch, idx) + +#define CALL_BATCH_PERCPU_ALLOC_FREE(size, batch, idx) \ + do { \ + batch_percpu_alloc((struct bpf_map *)(&array_percpu_##size), batch, idx); \ + batch_percpu_free((struct bpf_map *)(&array_percpu_##size), batch, idx); \ + } while (0) DEFINE_ARRAY_WITH_KPTR(8); DEFINE_ARRAY_WITH_KPTR(16); @@ -97,8 +179,21 @@ DEFINE_ARRAY_WITH_KPTR(1024); DEFINE_ARRAY_WITH_KPTR(2048); DEFINE_ARRAY_WITH_KPTR(4096); -SEC("fentry/" SYS_PREFIX "sys_nanosleep") -int test_bpf_mem_alloc_free(void *ctx) +/* per-cpu kptr doesn't support bin_data_8 which is a zero-sized array */ +DEFINE_ARRAY_WITH_PERCPU_KPTR(16); +DEFINE_ARRAY_WITH_PERCPU_KPTR(32); +DEFINE_ARRAY_WITH_PERCPU_KPTR(64); +DEFINE_ARRAY_WITH_PERCPU_KPTR(96); +DEFINE_ARRAY_WITH_PERCPU_KPTR(128); +DEFINE_ARRAY_WITH_PERCPU_KPTR(192); +DEFINE_ARRAY_WITH_PERCPU_KPTR(256); +DEFINE_ARRAY_WITH_PERCPU_KPTR(512); +DEFINE_ARRAY_WITH_PERCPU_KPTR(1024); +DEFINE_ARRAY_WITH_PERCPU_KPTR(2048); +DEFINE_ARRAY_WITH_PERCPU_KPTR(4096); + +SEC("?fentry/" SYS_PREFIX "sys_nanosleep") +int test_batch_alloc_free(void *ctx) { if ((u32)bpf_get_current_pid_tgid() != pid) return 0; @@ -121,3 +216,76 @@ int test_bpf_mem_alloc_free(void *ctx) return 0; } + +SEC("?fentry/" SYS_PREFIX "sys_nanosleep") +int test_free_through_map_free(void *ctx) +{ + if ((u32)bpf_get_current_pid_tgid() != pid) + return 0; + + /* Alloc 128 8-bytes objects in batch to trigger refilling, + * then free these objects through map free. + */ + CALL_BATCH_ALLOC(8, 128, 0); + CALL_BATCH_ALLOC(16, 128, 1); + CALL_BATCH_ALLOC(32, 128, 2); + CALL_BATCH_ALLOC(64, 128, 3); + CALL_BATCH_ALLOC(96, 128, 4); + CALL_BATCH_ALLOC(128, 128, 5); + CALL_BATCH_ALLOC(192, 128, 6); + CALL_BATCH_ALLOC(256, 128, 7); + CALL_BATCH_ALLOC(512, 64, 8); + CALL_BATCH_ALLOC(1024, 32, 9); + CALL_BATCH_ALLOC(2048, 16, 10); + CALL_BATCH_ALLOC(4096, 8, 11); + + return 0; +} + +SEC("?fentry/" SYS_PREFIX "sys_nanosleep") +int test_batch_percpu_alloc_free(void *ctx) +{ + if ((u32)bpf_get_current_pid_tgid() != pid) + return 0; + + /* Alloc 128 16-bytes per-cpu objects in batch to trigger refilling, + * then free 128 16-bytes per-cpu objects in batch to trigger freeing. + */ + CALL_BATCH_PERCPU_ALLOC_FREE(16, 128, 1); + CALL_BATCH_PERCPU_ALLOC_FREE(32, 128, 2); + CALL_BATCH_PERCPU_ALLOC_FREE(64, 128, 3); + CALL_BATCH_PERCPU_ALLOC_FREE(96, 128, 4); + CALL_BATCH_PERCPU_ALLOC_FREE(128, 128, 5); + CALL_BATCH_PERCPU_ALLOC_FREE(192, 128, 6); + CALL_BATCH_PERCPU_ALLOC_FREE(256, 128, 7); + CALL_BATCH_PERCPU_ALLOC_FREE(512, 64, 8); + CALL_BATCH_PERCPU_ALLOC_FREE(1024, 32, 9); + CALL_BATCH_PERCPU_ALLOC_FREE(2048, 16, 10); + CALL_BATCH_PERCPU_ALLOC_FREE(4096, 8, 11); + + return 0; +} + +SEC("?fentry/" SYS_PREFIX "sys_nanosleep") +int test_percpu_free_through_map_free(void *ctx) +{ + if ((u32)bpf_get_current_pid_tgid() != pid) + return 0; + + /* Alloc 128 16-bytes per-cpu objects in batch to trigger refilling, + * then free these object through map free. + */ + CALL_BATCH_PERCPU_ALLOC(16, 128, 1); + CALL_BATCH_PERCPU_ALLOC(32, 128, 2); + CALL_BATCH_PERCPU_ALLOC(64, 128, 3); + CALL_BATCH_PERCPU_ALLOC(96, 128, 4); + CALL_BATCH_PERCPU_ALLOC(128, 128, 5); + CALL_BATCH_PERCPU_ALLOC(192, 128, 6); + CALL_BATCH_PERCPU_ALLOC(256, 128, 7); + CALL_BATCH_PERCPU_ALLOC(512, 64, 8); + CALL_BATCH_PERCPU_ALLOC(1024, 32, 9); + CALL_BATCH_PERCPU_ALLOC(2048, 16, 10); + CALL_BATCH_PERCPU_ALLOC(4096, 8, 11); + + return 0; +}