From 8ece5abd74902edd5711cab8d1cc25f635733eff Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 14 Mar 2025 16:13:45 +0100 Subject: [PATCH 1/6] Revert "sched/core: Reduce cost of sched_move_task when config autogroup" commit 76f970ce51c80f625eb6ddbb24e9cb51b977b598 upstream. This reverts commit eff6c8ce8d4d7faef75f66614dd20bb50595d261. Hazem reported a 30% drop in UnixBench spawn test with commit eff6c8ce8d4d ("sched/core: Reduce cost of sched_move_task when config autogroup") on a m6g.xlarge AWS EC2 instance with 4 vCPUs and 16 GiB RAM (aarch64) (single level MC sched domain): https://lkml.kernel.org/r/20250205151026.13061-1-hagarhem@amazon.com There is an early bail from sched_move_task() if p->sched_task_group is equal to p's 'cpu cgroup' (sched_get_task_group()). E.g. both are pointing to taskgroup '/user.slice/user-1000.slice/session-1.scope' (Ubuntu '22.04.5 LTS'). So in: do_exit() sched_autogroup_exit_task() sched_move_task() if sched_get_task_group(p) == p->sched_task_group return /* p is enqueued */ dequeue_task() \ sched_change_group() | task_change_group_fair() | detach_task_cfs_rq() | (1) set_task_rq() | attach_task_cfs_rq() | enqueue_task() / (1) isn't called for p anymore. Turns out that the regression is related to sgs->group_util in group_is_overloaded() and group_has_capacity(). If (1) isn't called for all the 'spawn' tasks then sgs->group_util is ~900 and sgs->group_capacity = 1024 (single CPU sched domain) and this leads to group_is_overloaded() returning true (2) and group_has_capacity() false (3) much more often compared to the case when (1) is called. I.e. there are much more cases of 'group_is_overloaded' and 'group_fully_busy' in WF_FORK wakeup sched_balance_find_dst_cpu() which then returns much more often a CPU != smp_processor_id() (5). This isn't good for these extremely short running tasks (FORK + EXIT) and also involves calling sched_balance_find_dst_group_cpu() unnecessary (single CPU sched domain). Instead if (1) is called for 'p->flags & PF_EXITING' then the path (4),(6) is taken much more often. select_task_rq_fair(..., wake_flags = WF_FORK) cpu = smp_processor_id() new_cpu = sched_balance_find_dst_cpu(..., cpu, ...) group = sched_balance_find_dst_group(..., cpu) do { update_sg_wakeup_stats() sgs->group_type = group_classify() if group_is_overloaded() (2) return group_overloaded if !group_has_capacity() (3) return group_fully_busy return group_has_spare (4) } while group if local_sgs.group_type > idlest_sgs.group_type return idlest (5) case group_has_spare: if local_sgs.idle_cpus >= idlest_sgs.idle_cpus return NULL (6) Unixbench Tests './Run -c 4 spawn' on: (a) VM AWS instance (m7gd.16xlarge) with v6.13 ('maxcpus=4 nr_cpus=4') and Ubuntu 22.04.5 LTS (aarch64). Shell & test run in '/user.slice/user-1000.slice/session-1.scope'. w/o patch w/ patch 21005 27120 (b) i7-13700K with tip/sched/core ('nosmt maxcpus=8 nr_cpus=8') and Ubuntu 22.04.5 LTS (x86_64). Shell & test run in '/A'. w/o patch w/ patch 67675 88806 CONFIG_SCHED_AUTOGROUP=y & /sys/proc/kernel/sched_autogroup_enabled equal 0 or 1. Reported-by: Hazem Mohamed Abuelfotoh Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Tested-by: Hagar Hemdan Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250314151345.275739-1-dietmar.eggemann@arm.com Signed-off-by: Greg Kroah-Hartman --- kernel/sched/core.c | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1f817d0c5d2d..e9bb1b4c5842 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8919,7 +8919,7 @@ void sched_release_group(struct task_group *tg) spin_unlock_irqrestore(&task_group_lock, flags); } -static struct task_group *sched_get_task_group(struct task_struct *tsk) +static void sched_change_group(struct task_struct *tsk) { struct task_group *tg; @@ -8931,13 +8931,7 @@ static struct task_group *sched_get_task_group(struct task_struct *tsk) tg = container_of(task_css_check(tsk, cpu_cgrp_id, true), struct task_group, css); tg = autogroup_task_group(tsk, tg); - - return tg; -} - -static void sched_change_group(struct task_struct *tsk, struct task_group *group) -{ - tsk->sched_task_group = group; + tsk->sched_task_group = tg; #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_change_group) @@ -8958,20 +8952,11 @@ void sched_move_task(struct task_struct *tsk, bool for_autogroup) { int queued, running, queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; - struct task_group *group; struct rq *rq; CLASS(task_rq_lock, rq_guard)(tsk); rq = rq_guard.rq; - /* - * Esp. with SCHED_AUTOGROUP enabled it is possible to get superfluous - * group changes. - */ - group = sched_get_task_group(tsk); - if (group == tsk->sched_task_group) - return; - update_rq_clock(rq); running = task_current(rq, tsk); @@ -8982,7 +8967,7 @@ void sched_move_task(struct task_struct *tsk, bool for_autogroup) if (running) put_prev_task(rq, tsk); - sched_change_group(tsk, group); + sched_change_group(tsk); if (!for_autogroup) scx_cgroup_move_task(tsk); From 9aaffd3718293523727e650b292963df3e1c1be0 Mon Sep 17 00:00:00 2001 From: Eder Zulian Date: Tue, 22 Oct 2024 19:23:29 +0200 Subject: [PATCH 2/6] libsubcmd: Silence compiler warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 7a4ffec9fd54ea27395e24dff726dbf58e2fe06b upstream. Initialize the pointer 'o' in options__order to NULL to prevent a compiler warning/error which is observed when compiling with the '-Og' option, but is not emitted by the compiler with the current default compilation options. For example, when compiling libsubcmd with $ make "EXTRA_CFLAGS=-Og" -C tools/lib/subcmd/ clean all Clang version 17.0.6 and GCC 13.3.1 fail to compile parse-options.c due to following error: parse-options.c: In function ‘options__order’: parse-options.c:832:9: error: ‘o’ may be used uninitialized [-Werror=maybe-uninitialized] 832 | memcpy(&ordered[nr_opts], o, sizeof(*o)); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ parse-options.c:810:30: note: ‘o’ was declared here 810 | const struct option *o, *p = opts; | ^ cc1: all warnings being treated as errors Signed-off-by: Eder Zulian Signed-off-by: Andrii Nakryiko Acked-by: Arnaldo Carvalho de Melo Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20241022172329.3871958-4-ezulian@redhat.com Signed-off-by: Bin Lan Signed-off-by: He Zhe Signed-off-by: Greg Kroah-Hartman --- tools/lib/subcmd/parse-options.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/subcmd/parse-options.c b/tools/lib/subcmd/parse-options.c index eb896d30545b..555d617c1f50 100644 --- a/tools/lib/subcmd/parse-options.c +++ b/tools/lib/subcmd/parse-options.c @@ -807,7 +807,7 @@ static int option__cmp(const void *va, const void *vb) static struct option *options__order(const struct option *opts) { int nr_opts = 0, nr_group = 0, nr_parent = 0, len; - const struct option *o, *p = opts; + const struct option *o = NULL, *p = opts; struct option *opt, *ordered = NULL, *group; /* flatten the options that have parents */ From e7940c5794c3e936b5517b00c1668c6e0cc1bc09 Mon Sep 17 00:00:00 2001 From: Justin Klaassen Date: Tue, 25 Feb 2025 17:03:58 +0000 Subject: [PATCH 3/6] arm64: dts: rockchip: fix u2phy1_host status for NanoPi R4S commit 38f4aa34a5f737ea8588dac320d884cc2e762c03 upstream. The u2phy1_host should always have the same status as usb_host1_ehci and usb_host1_ohci, otherwise the EHCI and OHCI drivers may be initialized for a disabled usb port. Per the NanoPi R4S schematic, the phy-supply for u2phy1_host is set to the vdd_5v regulator. Fixes: db792e9adbf8 ("rockchip: rk3399: Add support for FriendlyARM NanoPi R4S") Cc: stable@vger.kernel.org Signed-off-by: Justin Klaassen Reviewed-by: Dragan Simic Link: https://lore.kernel.org/r/20250225170420.3898-1-justin@tidylabs.net Signed-off-by: Heiko Stuebner Signed-off-by: Greg Kroah-Hartman --- arch/arm64/boot/dts/rockchip/rk3399-nanopi-r4s.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3399-nanopi-r4s.dts b/arch/arm64/boot/dts/rockchip/rk3399-nanopi-r4s.dts index fe5b52610010..6a6b36c36ce2 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-nanopi-r4s.dts +++ b/arch/arm64/boot/dts/rockchip/rk3399-nanopi-r4s.dts @@ -117,7 +117,7 @@ }; &u2phy1_host { - status = "disabled"; + phy-supply = <&vdd_5v>; }; &uart0 { From 86368616a9ce51f6b41efa251b6e066893851d67 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Mon, 10 Mar 2025 11:57:27 -0400 Subject: [PATCH 4/6] mm/huge_memory: drop beyond-EOF folios with the right number of refs commit 14efb4793519d73fb2902bb0ece319b886e4b4b9 upstream. When an after-split folio is large and needs to be dropped due to EOF, folio_put_refs(folio, folio_nr_pages(folio)) should be used to drop all page cache refs. Otherwise, the folio will not be freed, causing memory leak. This leak would happen on a filesystem with blocksize > page_size and a truncate is performed, where the blocksize makes folios split to >0 order ones, causing truncated folios not being freed. Link: https://lkml.kernel.org/r/20250310155727.472846-1-ziy@nvidia.com Fixes: c010d47f107f ("mm: thp: split huge page to any lower order pages") Signed-off-by: Zi Yan Reported-by: Hugh Dickins Closes: https://lore.kernel.org/all/fcbadb7f-dd3e-21df-f9a7-2853b53183c4@google.com/ Cc: Baolin Wang Cc: David Hildenbrand Cc: John Hubbard Cc: Kefeng Wang Cc: Kirill A. Shuemov Cc: Luis Chamberalin Cc: Matthew Wilcow (Oracle) Cc: Miaohe Lin Cc: Pankaj Raghav Cc: Ryan Roberts Cc: Yang Shi Cc: Yu Zhao Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f127b61f04a8..40ac11e29423 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3224,7 +3224,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, folio_account_cleaned(tail, inode_to_wb(folio->mapping->host)); __filemap_remove_folio(tail, NULL); - folio_put(tail); + folio_put_refs(tail, folio_nr_pages(tail)); } else if (!PageAnon(page)) { __xa_store(&folio->mapping->i_pages, head[i].index, head + i, 0); From 2fa52cd829c194c86f3c324d2a49018393bf70d1 Mon Sep 17 00:00:00 2001 From: Arthur Mongodin Date: Fri, 14 Mar 2025 21:11:31 +0100 Subject: [PATCH 5/6] mptcp: Fix data stream corruption in the address announcement commit 2c1f97a52cb827a5f2768e67a9dddffae1ed47ab upstream. Because of the size restriction in the TCP options space, the MPTCP ADD_ADDR option is exclusive and cannot be sent with other MPTCP ones. For this reason, in the linked mptcp_out_options structure, group of fields linked to different options are part of the same union. There is a case where the mptcp_pm_add_addr_signal() function can modify opts->addr, but not ended up sending an ADD_ADDR. Later on, back in mptcp_established_options, other options will be sent, but with unexpected data written in other fields due to the union, e.g. in opts->ext_copy. This could lead to a data stream corruption in the next packet. Using an intermediate variable, prevents from corrupting previously established DSS option. The assignment of the ADD_ADDR option parameters is now done once we are sure this ADD_ADDR option can be set in the packet, e.g. after having dropped other suboptions. Fixes: 1bff1e43a30e ("mptcp: optimize out option generation") Cc: stable@vger.kernel.org Suggested-by: Paolo Abeni Signed-off-by: Arthur Mongodin Reviewed-by: Matthieu Baerts (NGI0) [ Matt: the commit message has been updated: long lines splits and some clarifications. ] Signed-off-by: Matthieu Baerts (NGI0) Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250314-net-mptcp-fix-data-stream-corr-sockopt-v1-1-122dbb249db3@kernel.org Signed-off-by: Paolo Abeni Signed-off-by: Greg Kroah-Hartman --- net/mptcp/options.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index fd2de185bc93..23949ae2a3a8 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -651,6 +651,7 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * struct mptcp_sock *msk = mptcp_sk(subflow->conn); bool drop_other_suboptions = false; unsigned int opt_size = *size; + struct mptcp_addr_info addr; bool echo; int len; @@ -659,7 +660,7 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * */ if (!mptcp_pm_should_add_signal(msk) || (opts->suboptions & (OPTION_MPTCP_MPJ_ACK | OPTION_MPTCP_MPC_ACK)) || - !mptcp_pm_add_addr_signal(msk, skb, opt_size, remaining, &opts->addr, + !mptcp_pm_add_addr_signal(msk, skb, opt_size, remaining, &addr, &echo, &drop_other_suboptions)) return false; @@ -672,7 +673,7 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * else if (opts->suboptions & OPTION_MPTCP_DSS) return false; - len = mptcp_add_addr_len(opts->addr.family, echo, !!opts->addr.port); + len = mptcp_add_addr_len(addr.family, echo, !!addr.port); if (remaining < len) return false; @@ -689,6 +690,7 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * opts->ahmac = 0; *size -= opt_size; } + opts->addr = addr; opts->suboptions |= OPTION_MPTCP_ADD_ADDR; if (!echo) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDRTX); From 3423cae6907838f760aada1a72bb6e378ebaa16d Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 28 Mar 2025 22:03:33 +0100 Subject: [PATCH 6/6] Linux 6.12.21 Link: https://lore.kernel.org/r/20250325122149.207086105@linuxfoundation.org Tested-by: Florian Fainelli Tested-by: Miguel Ojeda Tested-by: Pavel Machek (CIP) Link: https://lore.kernel.org/r/20250326154546.724728617@linuxfoundation.org Tested-by: Florian Fainelli Tested-by: Hardik Garg Tested-by: Peter Schneider Tested-by: Ron Economos Tested-by: Mark Brown Tested-by: Jon Hunter Tested-by: Harshit Mogalapalli Tested-by: Harshit Mogalapalli <harshit.m.mogalapalli@oracle.com>
Tested-by: Harshit Mogalapalli Tested-by: Linux Kernel Functional Testing Tested-by: Pavel Machek (CIP) Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index ca000bd227be..a646151342b8 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 6 PATCHLEVEL = 12 -SUBLEVEL = 20 +SUBLEVEL = 21 EXTRAVERSION = NAME = Baby Opossum Posse