From b980b832318ca6686d1c077162d45153fae16875 Mon Sep 17 00:00:00 2001 From: Bard Liao Date: Wed, 5 Mar 2025 21:54:35 +0800 Subject: [PATCH 01/37] ASoC: Intel: adl: add 2xrt1316 audio configuration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 8b36447c9ae102539d82d6278971b23b20d87629 upstream. That is a speaker only configuration and 2 rt1316 are on link 0 and 2. Signed-off-by: Bard Liao Reviewed-by: Liam Girdwood Reviewed-by: Péter Ujfalusi Link: https://patch.msgid.link/20250305135443.201884-2-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown Signed-off-by: Greg Kroah-Hartman --- .../intel/common/soc-acpi-intel-adl-match.c | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/sound/soc/intel/common/soc-acpi-intel-adl-match.c b/sound/soc/intel/common/soc-acpi-intel-adl-match.c index bb1324fb588e..a68efbe98948 100644 --- a/sound/soc/intel/common/soc-acpi-intel-adl-match.c +++ b/sound/soc/intel/common/soc-acpi-intel-adl-match.c @@ -214,6 +214,15 @@ static const struct snd_soc_acpi_adr_device rt1316_1_group2_adr[] = { } }; +static const struct snd_soc_acpi_adr_device rt1316_2_group2_adr[] = { + { + .adr = 0x000232025D131601ull, + .num_endpoints = 1, + .endpoints = &spk_r_endpoint, + .name_prefix = "rt1316-2" + } +}; + static const struct snd_soc_acpi_adr_device rt1316_1_single_adr[] = { { .adr = 0x000130025D131601ull, @@ -547,6 +556,20 @@ static const struct snd_soc_acpi_link_adr adl_chromebook_base[] = { {} }; +static const struct snd_soc_acpi_link_adr adl_sdw_rt1316_link02[] = { + { + .mask = BIT(0), + .num_adr = ARRAY_SIZE(rt1316_0_group2_adr), + .adr_d = rt1316_0_group2_adr, + }, + { + .mask = BIT(2), + .num_adr = ARRAY_SIZE(rt1316_2_group2_adr), + .adr_d = rt1316_2_group2_adr, + }, + {} +}; + static const struct snd_soc_acpi_codecs adl_max98357a_amp = { .num_codecs = 1, .codecs = {"MX98357A"} @@ -749,6 +772,12 @@ struct snd_soc_acpi_mach snd_soc_acpi_intel_adl_sdw_machines[] = { .drv_name = "sof_sdw", .sof_tplg_filename = "sof-adl-sdw-max98373-rt5682.tplg", }, + { + .link_mask = BIT(0) | BIT(2), + .links = adl_sdw_rt1316_link02, + .drv_name = "sof_sdw", + .sof_tplg_filename = "sof-adl-rt1316-l02.tplg", + }, {}, }; EXPORT_SYMBOL_GPL(snd_soc_acpi_intel_adl_sdw_machines); From 40bc55e4fcbd740731e5f131f4fc9a94a2f86ac8 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Sun, 30 Mar 2025 17:52:40 -0400 Subject: [PATCH 02/37] cgroup/cpuset: Fix incorrect isolated_cpus update in update_parent_effective_cpumask() [ Upstream commit 668e041662e92ab3ebcb9eb606d3ec01884546ab ] Before commit f0af1bfc27b5 ("cgroup/cpuset: Relax constraints to partition & cpus changes"), a cpuset partition cannot be enabled if not all the requested CPUs can be granted from the parent cpuset. After that commit, a cpuset partition can be created even if the requested exclusive CPUs contain CPUs not allowed its parent. The delmask containing exclusive CPUs to be removed from its parent wasn't adjusted accordingly. That is not a problem until the introduction of a new isolated_cpus mask in commit 11e5f407b64a ("cgroup/cpuset: Keep track of CPUs in isolated partitions") as the CPUs in the delmask may be added directly into isolated_cpus. As a result, isolated_cpus may incorrectly contain CPUs that are not isolated leading to incorrect data reporting. Fix this by adjusting the delmask to reflect the actual exclusive CPUs for the creation of the partition. Fixes: 11e5f407b64a ("cgroup/cpuset: Keep track of CPUs in isolated partitions") Signed-off-by: Waiman Long Signed-off-by: Tejun Heo Signed-off-by: Sasha Levin --- kernel/cgroup/cpuset.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 24ece85fd3b1..f7ad5651c93d 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1656,9 +1656,9 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, if (nocpu) return PERR_NOCPUS; - cpumask_copy(tmp->delmask, xcpus); - deleting = true; - subparts_delta++; + deleting = cpumask_and(tmp->delmask, xcpus, parent->effective_xcpus); + if (deleting) + subparts_delta++; new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED; } else if (cmd == partcmd_disable) { /* From 9701dcbf5fce193def747ed03d6c788bcc49e2ca Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Sun, 30 Mar 2025 17:52:41 -0400 Subject: [PATCH 03/37] cgroup/cpuset: Fix error handling in remote_partition_disable() [ Upstream commit 8bf450f3aec3d1bbd725d179502c64b8992588e4 ] When remote_partition_disable() is called to disable a remote partition, it always sets the partition to an invalid partition state. It should only do so if an error code (prs_err) has been set. Correct that and add proper error code in places where remote_partition_disable() is called due to error. Fixes: 181c8e091aae ("cgroup/cpuset: Introduce remote partition") Signed-off-by: Waiman Long Signed-off-by: Tejun Heo Signed-off-by: Sasha Levin --- kernel/cgroup/cpuset.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index f7ad5651c93d..70fac05123c6 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1383,6 +1383,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs, list_add(&cs->remote_sibling, &remote_children); spin_unlock_irq(&callback_lock); update_unbound_workqueue_cpumask(isolcpus_updated); + cs->prs_err = 0; /* * Proprogate changes in top_cpuset's effective_cpus down the hierarchy. @@ -1413,9 +1414,11 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) list_del_init(&cs->remote_sibling); isolcpus_updated = partition_xcpus_del(cs->partition_root_state, NULL, tmp->new_cpus); - cs->partition_root_state = -cs->partition_root_state; - if (!cs->prs_err) - cs->prs_err = PERR_INVCPUS; + if (cs->prs_err) + cs->partition_root_state = -cs->partition_root_state; + else + cs->partition_root_state = PRS_MEMBER; + reset_partition_data(cs); spin_unlock_irq(&callback_lock); update_unbound_workqueue_cpumask(isolcpus_updated); @@ -1448,8 +1451,10 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask, WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus)); - if (cpumask_empty(newmask)) + if (cpumask_empty(newmask)) { + cs->prs_err = PERR_CPUSEMPTY; goto invalidate; + } adding = cpumask_andnot(tmp->addmask, newmask, cs->effective_xcpus); deleting = cpumask_andnot(tmp->delmask, cs->effective_xcpus, newmask); @@ -1459,10 +1464,15 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask, * not allocated to other partitions and there are effective_cpus * left in the top cpuset. */ - if (adding && (!capable(CAP_SYS_ADMIN) || - cpumask_intersects(tmp->addmask, subpartitions_cpus) || - cpumask_subset(top_cpuset.effective_cpus, tmp->addmask))) - goto invalidate; + if (adding) { + if (!capable(CAP_SYS_ADMIN)) + cs->prs_err = PERR_ACCESS; + else if (cpumask_intersects(tmp->addmask, subpartitions_cpus) || + cpumask_subset(top_cpuset.effective_cpus, tmp->addmask)) + cs->prs_err = PERR_NOCPUS; + if (cs->prs_err) + goto invalidate; + } spin_lock_irq(&callback_lock); if (adding) @@ -1578,7 +1588,7 @@ static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus) * The partcmd_update command is used by update_cpumasks_hier() with newmask * NULL and update_cpumask() with newmask set. The partcmd_invalidate is used * by update_cpumask() with NULL newmask. In both cases, the callers won't - * check for error and so partition_root_state and prs_error will be updated + * check for error and so partition_root_state and prs_err will be updated * directly. */ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, @@ -3726,6 +3736,7 @@ retry: if (remote && cpumask_empty(&new_cpus) && partition_is_populated(cs, NULL)) { + cs->prs_err = PERR_HOTPLUG; remote_partition_disable(cs, tmp); compute_effective_cpumask(&new_cpus, cs, parent); remote = false; From 2dbd1b166034ab4bff100130a1a49309c6b31e48 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Sat, 9 Nov 2024 21:50:21 -0500 Subject: [PATCH 04/37] cgroup/cpuset: Revert "Allow suppression of sched domain rebuild in update_cpumasks_hier()" [ Upstream commit bcd7012afd7bcd45fcd7a0e2f48e57b273702317 ] Revert commit 3ae0b773211e ("cgroup/cpuset: Allow suppression of sched domain rebuild in update_cpumasks_hier()") to allow for an alternative way to suppress unnecessary rebuild_sched_domains_locked() calls in update_cpumasks_hier() and elsewhere in a following commit. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo Stable-dep-of: a22b3d54de94 ("cgroup/cpuset: Fix race between newly created partition and dying one") Signed-off-by: Sasha Levin --- kernel/cgroup/cpuset.c | 39 ++++++++++++++------------------------- 1 file changed, 14 insertions(+), 25 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 70fac05123c6..0012c34bb860 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1940,12 +1940,6 @@ static void compute_partition_effective_cpumask(struct cpuset *cs, rcu_read_unlock(); } -/* - * update_cpumasks_hier() flags - */ -#define HIER_CHECKALL 0x01 /* Check all cpusets with no skipping */ -#define HIER_NO_SD_REBUILD 0x02 /* Don't rebuild sched domains */ - /* * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree * @cs: the cpuset to consider @@ -1960,7 +1954,7 @@ static void compute_partition_effective_cpumask(struct cpuset *cs, * Called with cpuset_mutex held */ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, - int flags) + bool force) { struct cpuset *cp; struct cgroup_subsys_state *pos_css; @@ -2025,10 +2019,10 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, * Skip the whole subtree if * 1) the cpumask remains the same, * 2) has no partition root state, - * 3) HIER_CHECKALL flag not set, and + * 3) force flag not set, and * 4) for v2 load balance state same as its parent. */ - if (!cp->partition_root_state && !(flags & HIER_CHECKALL) && + if (!cp->partition_root_state && !force && cpumask_equal(tmp->new_cpus, cp->effective_cpus) && (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || (is_sched_load_balance(parent) == is_sched_load_balance(cp)))) { @@ -2130,8 +2124,7 @@ get_css: } rcu_read_unlock(); - if (need_rebuild_sched_domains && !(flags & HIER_NO_SD_REBUILD) && - !force_sd_rebuild) + if (need_rebuild_sched_domains && !force_sd_rebuild) rebuild_sched_domains_locked(); } @@ -2159,9 +2152,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, * directly. * * The update_cpumasks_hier() function may sleep. So we have to - * release the RCU read lock before calling it. HIER_NO_SD_REBUILD - * flag is used to suppress rebuild of sched domains as the callers - * will take care of that. + * release the RCU read lock before calling it. */ rcu_read_lock(); cpuset_for_each_child(sibling, pos_css, parent) { @@ -2177,7 +2168,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, continue; rcu_read_unlock(); - update_cpumasks_hier(sibling, tmp, HIER_NO_SD_REBUILD); + update_cpumasks_hier(sibling, tmp, false); rcu_read_lock(); css_put(&sibling->css); } @@ -2197,7 +2188,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, struct tmpmasks tmp; struct cpuset *parent = parent_cs(cs); bool invalidate = false; - int hier_flags = 0; + bool force = false; int old_prs = cs->partition_root_state; /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */ @@ -2258,8 +2249,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * Check all the descendants in update_cpumasks_hier() if * effective_xcpus is to be changed. */ - if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus)) - hier_flags = HIER_CHECKALL; + force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus); retval = validate_change(cs, trialcs); @@ -2327,7 +2317,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, spin_unlock_irq(&callback_lock); /* effective_cpus/effective_xcpus will be updated here */ - update_cpumasks_hier(cs, &tmp, hier_flags); + update_cpumasks_hier(cs, &tmp, force); /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */ if (cs->partition_root_state) @@ -2352,7 +2342,7 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, struct tmpmasks tmp; struct cpuset *parent = parent_cs(cs); bool invalidate = false; - int hier_flags = 0; + bool force = false; int old_prs = cs->partition_root_state; if (!*buf) { @@ -2375,8 +2365,7 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, * Check all the descendants in update_cpumasks_hier() if * effective_xcpus is to be changed. */ - if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus)) - hier_flags = HIER_CHECKALL; + force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus); retval = validate_change(cs, trialcs); if (retval) @@ -2429,8 +2418,8 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, * of the subtree when it is a valid partition root or effective_xcpus * is updated. */ - if (is_partition_valid(cs) || hier_flags) - update_cpumasks_hier(cs, &tmp, hier_flags); + if (is_partition_valid(cs) || force) + update_cpumasks_hier(cs, &tmp, force); /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */ if (cs->partition_root_state) @@ -2871,7 +2860,7 @@ out: update_unbound_workqueue_cpumask(new_xcpus_state); /* Force update if switching back to member */ - update_cpumasks_hier(cs, &tmpmask, !new_prs ? HIER_CHECKALL : 0); + update_cpumasks_hier(cs, &tmpmask, !new_prs); /* Update sched domains and load balance flag */ update_partition_sd_lb(cs, old_prs); From 6b145f8b22011743b563965df0e090789b413bb8 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Sat, 9 Nov 2024 21:50:22 -0500 Subject: [PATCH 05/37] cgroup/cpuset: Enforce at most one rebuild_sched_domains_locked() call per operation [ Upstream commit a040c351283e3ac75422621ea205b1d8d687e108 ] Since commit ff0ce721ec21 ("cgroup/cpuset: Eliminate unncessary sched domains rebuilds in hotplug"), there is only one rebuild_sched_domains_locked() call per hotplug operation. However, writing to the various cpuset control files may still casue more than one rebuild_sched_domains_locked() call to happen in some cases. Juri had found that two rebuild_sched_domains_locked() calls in update_prstate(), one from update_cpumasks_hier() and another one from update_partition_sd_lb() could cause cpuset partition to be created with null total_bw for DL tasks. IOW, DL tasks may not be scheduled correctly in such a partition. A sample command sequence that can reproduce null total_bw is as follows. # echo Y >/sys/kernel/debug/sched/verbose # echo +cpuset >/sys/fs/cgroup/cgroup.subtree_control # mkdir /sys/fs/cgroup/test # echo 0-7 > /sys/fs/cgroup/test/cpuset.cpus # echo 6-7 > /sys/fs/cgroup/test/cpuset.cpus.exclusive # echo root >/sys/fs/cgroup/test/cpuset.cpus.partition Fix this double rebuild_sched_domains_locked() calls problem by replacing existing calls with cpuset_force_rebuild() except the rebuild_sched_domains_cpuslocked() call at the end of cpuset_handle_hotplug(). Checking of the force_sd_rebuild flag is now done at the end of cpuset_write_resmask() and update_prstate() to determine if rebuild_sched_domains_locked() should be called or not. The cpuset v1 code can still call rebuild_sched_domains_locked() directly as double rebuild_sched_domains_locked() calls is not possible. Reported-by: Juri Lelli Closes: https://lore.kernel.org/lkml/ZyuUcJDPBln1BK1Y@jlelli-thinkpadt14gen4.remote.csb/ Signed-off-by: Waiman Long Tested-by: Juri Lelli Signed-off-by: Tejun Heo Stable-dep-of: a22b3d54de94 ("cgroup/cpuset: Fix race between newly created partition and dying one") Signed-off-by: Sasha Levin --- kernel/cgroup/cpuset.c | 49 ++++++++++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 16 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 0012c34bb860..7ac2a634128b 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -84,9 +84,19 @@ static bool have_boot_isolcpus; static struct list_head remote_children; /* - * A flag to force sched domain rebuild at the end of an operation while - * inhibiting it in the intermediate stages when set. Currently it is only - * set in hotplug code. + * A flag to force sched domain rebuild at the end of an operation. + * It can be set in + * - update_partition_sd_lb() + * - remote_partition_check() + * - update_cpumasks_hier() + * - cpuset_update_flag() + * - cpuset_hotplug_update_tasks() + * - cpuset_handle_hotplug() + * + * Protected by cpuset_mutex (with cpus_read_lock held) or cpus_write_lock. + * + * Note that update_relax_domain_level() in cpuset-v1.c can still call + * rebuild_sched_domains_locked() directly without using this flag. */ static bool force_sd_rebuild; @@ -998,6 +1008,7 @@ void rebuild_sched_domains_locked(void) lockdep_assert_cpus_held(); lockdep_assert_held(&cpuset_mutex); + force_sd_rebuild = false; /* * If we have raced with CPU hotplug, return early to avoid @@ -1172,8 +1183,8 @@ static void update_partition_sd_lb(struct cpuset *cs, int old_prs) clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); } - if (rebuild_domains && !force_sd_rebuild) - rebuild_sched_domains_locked(); + if (rebuild_domains) + cpuset_force_rebuild(); } /* @@ -1530,8 +1541,8 @@ static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask, remote_partition_disable(child, tmp); disable_cnt++; } - if (disable_cnt && !force_sd_rebuild) - rebuild_sched_domains_locked(); + if (disable_cnt) + cpuset_force_rebuild(); } /* @@ -2124,8 +2135,8 @@ get_css: } rcu_read_unlock(); - if (need_rebuild_sched_domains && !force_sd_rebuild) - rebuild_sched_domains_locked(); + if (need_rebuild_sched_domains) + cpuset_force_rebuild(); } /** @@ -2744,9 +2755,13 @@ int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs, cs->flags = trialcs->flags; spin_unlock_irq(&callback_lock); - if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed && - !force_sd_rebuild) - rebuild_sched_domains_locked(); + if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) { + if (!IS_ENABLED(CONFIG_CPUSETS_V1) || + cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) + cpuset_force_rebuild(); + else + rebuild_sched_domains_locked(); + } if (spread_flag_changed) cpuset1_update_tasks_flags(cs); @@ -2866,6 +2881,8 @@ out: update_partition_sd_lb(cs, old_prs); notify_partition_change(cs, old_prs); + if (force_sd_rebuild) + rebuild_sched_domains_locked(); free_cpumasks(NULL, &tmpmask); return 0; } @@ -3136,6 +3153,8 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of, } free_cpuset(trialcs); + if (force_sd_rebuild) + rebuild_sched_domains_locked(); out_unlock: mutex_unlock(&cpuset_mutex); cpus_read_unlock(); @@ -3879,11 +3898,9 @@ static void cpuset_handle_hotplug(void) rcu_read_unlock(); } - /* rebuild sched domains if cpus_allowed has changed */ - if (force_sd_rebuild) { - force_sd_rebuild = false; + /* rebuild sched domains if necessary */ + if (force_sd_rebuild) rebuild_sched_domains_cpuslocked(); - } free_cpumasks(NULL, ptmp); } From 1b06f00edaaa08ee23dba7972b24ede337c3a3cf Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Sat, 9 Nov 2024 21:50:23 -0500 Subject: [PATCH 06/37] cgroup/cpuset: Further optimize code if CONFIG_CPUSETS_V1 not set [ Upstream commit c4c9cebe2fb9cdc73e55513de7af7a4f50260e88 ] Currently the cpuset code uses group_subsys_on_dfl() to check if we are running with cgroup v2. If CONFIG_CPUSETS_V1 isn't set, there is really no need to do this check and we can optimize out some of the unneeded v1 specific code paths. Introduce a new cpuset_v2() and use it to replace the cgroup_subsys_on_dfl() check to further optimize the code. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo Stable-dep-of: a22b3d54de94 ("cgroup/cpuset: Fix race between newly created partition and dying one") Signed-off-by: Sasha Levin --- kernel/cgroup/cpuset.c | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 7ac2a634128b..07ea3a563150 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -293,6 +293,12 @@ static inline void dec_attach_in_progress(struct cpuset *cs) mutex_unlock(&cpuset_mutex); } +static inline bool cpuset_v2(void) +{ + return !IS_ENABLED(CONFIG_CPUSETS_V1) || + cgroup_subsys_on_dfl(cpuset_cgrp_subsys); +} + /* * Cgroup v2 behavior is used on the "cpus" and "mems" control files when * on default hierarchy or when the cpuset_v2_mode flag is set by mounting @@ -303,7 +309,7 @@ static inline void dec_attach_in_progress(struct cpuset *cs) */ static inline bool is_in_v2_mode(void) { - return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || + return cpuset_v2() || (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE); } @@ -738,7 +744,7 @@ static int generate_sched_domains(cpumask_var_t **domains, int nslot; /* next empty doms[] struct cpumask slot */ struct cgroup_subsys_state *pos_css; bool root_load_balance = is_sched_load_balance(&top_cpuset); - bool cgrpv2 = cgroup_subsys_on_dfl(cpuset_cgrp_subsys); + bool cgrpv2 = cpuset_v2(); int nslot_update; doms = NULL; @@ -1206,7 +1212,7 @@ static void reset_partition_data(struct cpuset *cs) { struct cpuset *parent = parent_cs(cs); - if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) + if (!cpuset_v2()) return; lockdep_assert_held(&callback_lock); @@ -2035,7 +2041,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, */ if (!cp->partition_root_state && !force && cpumask_equal(tmp->new_cpus, cp->effective_cpus) && - (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || + (!cpuset_v2() || (is_sched_load_balance(parent) == is_sched_load_balance(cp)))) { pos_css = css_rightmost_descendant(pos_css); continue; @@ -2109,8 +2115,7 @@ get_css: * from parent if current cpuset isn't a valid partition root * and their load balance states differ. */ - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && - !is_partition_valid(cp) && + if (cpuset_v2() && !is_partition_valid(cp) && (is_sched_load_balance(parent) != is_sched_load_balance(cp))) { if (is_sched_load_balance(parent)) set_bit(CS_SCHED_LOAD_BALANCE, &cp->flags); @@ -2126,8 +2131,7 @@ get_css: */ if (!cpumask_empty(cp->cpus_allowed) && is_sched_load_balance(cp) && - (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || - is_partition_valid(cp))) + (!cpuset_v2() || is_partition_valid(cp))) need_rebuild_sched_domains = true; rcu_read_lock(); @@ -2264,7 +2268,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, retval = validate_change(cs, trialcs); - if ((retval == -EINVAL) && cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) { + if ((retval == -EINVAL) && cpuset_v2()) { struct cgroup_subsys_state *css; struct cpuset *cp; @@ -2756,8 +2760,7 @@ int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs, spin_unlock_irq(&callback_lock); if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) { - if (!IS_ENABLED(CONFIG_CPUSETS_V1) || - cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) + if (cpuset_v2()) cpuset_force_rebuild(); else rebuild_sched_domains_locked(); @@ -2943,8 +2946,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) * migration permission derives from hierarchy ownership in * cgroup_procs_write_permission()). */ - if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || - (cpus_updated || mems_updated)) { + if (!cpuset_v2() || (cpus_updated || mems_updated)) { ret = security_task_setscheduler(task); if (ret) goto out_unlock; @@ -3058,8 +3060,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) * in effective cpus and mems. In that case, we can optimize out * by skipping the task iteration and update. */ - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && - !cpus_updated && !mems_updated) { + if (cpuset_v2() && !cpus_updated && !mems_updated) { cpuset_attach_nodemask_to = cs->effective_mems; goto out; } @@ -3384,7 +3385,7 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) INIT_LIST_HEAD(&cs->remote_sibling); /* Set CS_MEMORY_MIGRATE for default hierarchy */ - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) + if (cpuset_v2()) __set_bit(CS_MEMORY_MIGRATE, &cs->flags); return &cs->css; @@ -3411,8 +3412,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) /* * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated */ - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && - !is_sched_load_balance(parent)) + if (cpuset_v2() && !is_sched_load_balance(parent)) clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); cpuset_inc(); @@ -3482,8 +3482,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) if (is_partition_valid(cs)) update_prstate(cs, 0); - if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && - is_sched_load_balance(cs)) + if (!cpuset_v2() && is_sched_load_balance(cs)) cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); cpuset_dec(); From cdb6e724e7c5713d13c5ad3340e9d71c3dd8c9fb Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Sun, 30 Mar 2025 17:52:39 -0400 Subject: [PATCH 07/37] cgroup/cpuset: Fix race between newly created partition and dying one [ Upstream commit a22b3d54de94f82ca057cc2ebf9496fa91ebf698 ] There is a possible race between removing a cgroup diectory that is a partition root and the creation of a new partition. The partition to be removed can be dying but still online, it doesn't not currently participate in checking for exclusive CPUs conflict, but the exclusive CPUs are still there in subpartitions_cpus and isolated_cpus. These two cpumasks are global states that affect the operation of cpuset partitions. The exclusive CPUs in dying cpusets will only be removed when cpuset_css_offline() function is called after an RCU delay. As a result, it is possible that a new partition can be created with exclusive CPUs that overlap with those of a dying one. When that dying partition is finally offlined, it removes those overlapping exclusive CPUs from subpartitions_cpus and maybe isolated_cpus resulting in an incorrect CPU configuration. This bug was found when a warning was triggered in remote_partition_disable() during testing because the subpartitions_cpus mask was empty. One possible way to fix this is to iterate the dying cpusets as well and avoid using the exclusive CPUs in those dying cpusets. However, this can still cause random partition creation failures or other anomalies due to racing. A better way to fix this race is to reset the partition state at the moment when a cpuset is being killed. Introduce a new css_killed() CSS function pointer and call it, if defined, before setting CSS_DYING flag in kill_css(). Also update the css_is_dying() helper to use the CSS_DYING flag introduced by commit 33c35aa48178 ("cgroup: Prevent kill_css() from being called more than once") for proper synchronization. Add a new cpuset_css_killed() function to reset the partition state of a valid partition root if it is being killed. Fixes: ee8dde0cd2ce ("cpuset: Add new v2 cpuset.sched.partition flag") Signed-off-by: Waiman Long Signed-off-by: Tejun Heo Signed-off-by: Sasha Levin --- include/linux/cgroup-defs.h | 1 + include/linux/cgroup.h | 2 +- kernel/cgroup/cgroup.c | 6 ++++++ kernel/cgroup/cpuset.c | 20 +++++++++++++++++--- 4 files changed, 25 insertions(+), 4 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 38b2af336e4a..252eed781a6e 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -711,6 +711,7 @@ struct cgroup_subsys { void (*css_released)(struct cgroup_subsys_state *css); void (*css_free)(struct cgroup_subsys_state *css); void (*css_reset)(struct cgroup_subsys_state *css); + void (*css_killed)(struct cgroup_subsys_state *css); void (*css_rstat_flush)(struct cgroup_subsys_state *css, int cpu); int (*css_extra_stat_show)(struct seq_file *seq, struct cgroup_subsys_state *css); diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index f8ef47f8a634..fc1324ed597d 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -343,7 +343,7 @@ static inline u64 cgroup_id(const struct cgroup *cgrp) */ static inline bool css_is_dying(struct cgroup_subsys_state *css) { - return !(css->flags & CSS_NO_REF) && percpu_ref_is_dying(&css->refcnt); + return css->flags & CSS_DYING; } static inline void cgroup_get(struct cgroup *cgrp) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 216535e055e1..4378f3eff25d 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5909,6 +5909,12 @@ static void kill_css(struct cgroup_subsys_state *css) if (css->flags & CSS_DYING) return; + /* + * Call css_killed(), if defined, before setting the CSS_DYING flag + */ + if (css->ss->css_killed) + css->ss->css_killed(css); + css->flags |= CSS_DYING; /* diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 07ea3a563150..839f88ba17f7 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -3479,9 +3479,6 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) cpus_read_lock(); mutex_lock(&cpuset_mutex); - if (is_partition_valid(cs)) - update_prstate(cs, 0); - if (!cpuset_v2() && is_sched_load_balance(cs)) cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); @@ -3492,6 +3489,22 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) cpus_read_unlock(); } +static void cpuset_css_killed(struct cgroup_subsys_state *css) +{ + struct cpuset *cs = css_cs(css); + + cpus_read_lock(); + mutex_lock(&cpuset_mutex); + + /* Reset valid partition back to member */ + if (is_partition_valid(cs)) + update_prstate(cs, PRS_MEMBER); + + mutex_unlock(&cpuset_mutex); + cpus_read_unlock(); + +} + static void cpuset_css_free(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); @@ -3613,6 +3626,7 @@ struct cgroup_subsys cpuset_cgrp_subsys = { .css_alloc = cpuset_css_alloc, .css_online = cpuset_css_online, .css_offline = cpuset_css_offline, + .css_killed = cpuset_css_killed, .css_free = cpuset_css_free, .can_attach = cpuset_can_attach, .cancel_attach = cpuset_cancel_attach, From 179ef2f8109e3a8cde6cf02afa730d54bee92cd7 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 2 Apr 2025 15:20:00 +0300 Subject: [PATCH 08/37] gpiolib: of: Fix the choice for Ingenic NAND quirk [ Upstream commit 2b9c536430126c233552cdcd6ec9d5077454ece4 ] The Ingenic NAND quirk has been added under CONFIG_LCD_HX8357 ifdeffery which sounds quite wrong. Fix the choice for Ingenic NAND quirk by wrapping it into own ifdeffery related to the respective driver. Fixes: 3a7fd473bd5d ("mtd: rawnand: ingenic: move the GPIO quirk to gpiolib-of.c") Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250402122058.1517393-2-andriy.shevchenko@linux.intel.com Signed-off-by: Bartosz Golaszewski Signed-off-by: Sasha Levin --- drivers/gpio/gpiolib-of.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c index 880f1efcaca5..e543129d3605 100644 --- a/drivers/gpio/gpiolib-of.c +++ b/drivers/gpio/gpiolib-of.c @@ -193,6 +193,8 @@ static void of_gpio_try_fixup_polarity(const struct device_node *np, */ { "himax,hx8357", "gpios-reset", false }, { "himax,hx8369", "gpios-reset", false }, +#endif +#if IS_ENABLED(CONFIG_MTD_NAND_JZ4780) /* * The rb-gpios semantics was undocumented and qi,lb60 (along with * the ingenic driver) got it wrong. The active state encodes the From 206d0df7b6a52daa6a9dcb326b243bbce389344c Mon Sep 17 00:00:00 2001 From: Edward Liaw Date: Fri, 4 Apr 2025 22:12:20 +0000 Subject: [PATCH 09/37] selftests/futex: futex_waitv wouldblock test should fail MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 7d50e00fef2832e98d7e06bbfc85c1d66ee110ca ] Testcase should fail if -EWOULDBLOCK is not returned when expected value differs from actual value from the waiter. Link: https://lore.kernel.org/r/20250404221225.1596324-1-edliaw@google.com Fixes: 9d57f7c79748920636f8293d2f01192d702fe390 ("selftests: futex: Test sys_futex_waitv() wouldblock") Signed-off-by: Edward Liaw Reviewed-by: Thomas Gleixner Reviewed-by: André Almeida Signed-off-by: Shuah Khan Signed-off-by: Sasha Levin --- .../testing/selftests/futex/functional/futex_wait_wouldblock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c index 7d7a6a06cdb7..2d8230da9064 100644 --- a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c +++ b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c @@ -98,7 +98,7 @@ int main(int argc, char *argv[]) info("Calling futex_waitv on f1: %u @ %p with val=%u\n", f1, &f1, f1+1); res = futex_waitv(&waitv, 1, 0, &to, CLOCK_MONOTONIC); if (!res || errno != EWOULDBLOCK) { - ksft_test_result_pass("futex_waitv returned: %d %s\n", + ksft_test_result_fail("futex_waitv returned: %d %s\n", res ? errno : res, res ? strerror(errno) : ""); ret = RET_FAIL; From cb8372e54fdb91f1123f01b566d0797cd585a3d1 Mon Sep 17 00:00:00 2001 From: Uday Shankar Date: Mon, 7 Oct 2024 12:24:15 -0600 Subject: [PATCH 10/37] ublk: refactor recovery configuration flag helpers [ Upstream commit 3b939b8f715e014adcc48f7827fe9417252f0833 ] ublk currently supports the following behaviors on ublk server exit: A: outstanding I/Os get errors, subsequently issued I/Os get errors B: outstanding I/Os get errors, subsequently issued I/Os queue C: outstanding I/Os get reissued, subsequently issued I/Os queue and the following behaviors for recovery of preexisting block devices by a future incarnation of the ublk server: 1: ublk devices stopped on ublk server exit (no recovery possible) 2: ublk devices are recoverable using start/end_recovery commands The userspace interface allows selection of combinations of these behaviors using flags specified at device creation time, namely: default behavior: A + 1 UBLK_F_USER_RECOVERY: B + 2 UBLK_F_USER_RECOVERY|UBLK_F_USER_RECOVERY_REISSUE: C + 2 We can't easily change the userspace interface to allow independent selection of one of {A, B, C} and one of {1, 2}, but we can refactor the internal helpers which test for the flags. Replace the existing helpers with the following set: ublk_nosrv_should_reissue_outstanding: tests for behavior C ublk_nosrv_[dev_]should_queue_io: tests for behavior B ublk_nosrv_should_stop_dev: tests for behavior 1 Signed-off-by: Uday Shankar Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20241007182419.3263186-3-ushankar@purestorage.com Signed-off-by: Jens Axboe Stable-dep-of: 6ee6bd5d4fce ("ublk: fix handling recovery & reissue in ublk_abort_queue()") Signed-off-by: Sasha Levin --- drivers/block/ublk_drv.c | 62 +++++++++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 20 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 79b7bd8bfd45..dd328d40c7de 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -681,22 +681,44 @@ static int ublk_max_cmd_buf_size(void) return __ublk_queue_cmd_buf_size(UBLK_MAX_QUEUE_DEPTH); } -static inline bool ublk_queue_can_use_recovery_reissue( - struct ublk_queue *ubq) +/* + * Should I/O outstanding to the ublk server when it exits be reissued? + * If not, outstanding I/O will get errors. + */ +static inline bool ublk_nosrv_should_reissue_outstanding(struct ublk_device *ub) { - return (ubq->flags & UBLK_F_USER_RECOVERY) && - (ubq->flags & UBLK_F_USER_RECOVERY_REISSUE); + return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) && + (ub->dev_info.flags & UBLK_F_USER_RECOVERY_REISSUE); } -static inline bool ublk_queue_can_use_recovery( - struct ublk_queue *ubq) +/* + * Should I/O issued while there is no ublk server queue? If not, I/O + * issued while there is no ublk server will get errors. + */ +static inline bool ublk_nosrv_dev_should_queue_io(struct ublk_device *ub) +{ + return ub->dev_info.flags & UBLK_F_USER_RECOVERY; +} + +/* + * Same as ublk_nosrv_dev_should_queue_io, but uses a queue-local copy + * of the device flags for smaller cache footprint - better for fast + * paths. + */ +static inline bool ublk_nosrv_should_queue_io(struct ublk_queue *ubq) { return ubq->flags & UBLK_F_USER_RECOVERY; } -static inline bool ublk_can_use_recovery(struct ublk_device *ub) +/* + * Should ublk devices be stopped (i.e. no recovery possible) when the + * ublk server exits? If not, devices can be used again by a future + * incarnation of a ublk server via the start_recovery/end_recovery + * commands. + */ +static inline bool ublk_nosrv_should_stop_dev(struct ublk_device *ub) { - return ub->dev_info.flags & UBLK_F_USER_RECOVERY; + return !(ub->dev_info.flags & UBLK_F_USER_RECOVERY); } static void ublk_free_disk(struct gendisk *disk) @@ -1072,7 +1094,7 @@ static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io, { WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE); - if (ublk_queue_can_use_recovery_reissue(ubq)) + if (ublk_nosrv_should_reissue_outstanding(ubq->dev)) blk_mq_requeue_request(req, false); else ublk_put_req_ref(ubq, req); @@ -1100,7 +1122,7 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq, struct request *rq) { /* We cannot process this rq so just requeue it. */ - if (ublk_queue_can_use_recovery(ubq)) + if (ublk_nosrv_dev_should_queue_io(ubq->dev)) blk_mq_requeue_request(rq, false); else blk_mq_end_request(rq, BLK_STS_IOERR); @@ -1245,10 +1267,10 @@ static enum blk_eh_timer_return ublk_timeout(struct request *rq) struct ublk_device *ub = ubq->dev; if (ublk_abort_requests(ub, ubq)) { - if (ublk_can_use_recovery(ub)) - schedule_work(&ub->quiesce_work); - else + if (ublk_nosrv_should_stop_dev(ub)) schedule_work(&ub->stop_work); + else + schedule_work(&ub->quiesce_work); } return BLK_EH_DONE; } @@ -1277,7 +1299,7 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, * Note: force_abort is guaranteed to be seen because it is set * before request queue is unqiuesced. */ - if (ublk_queue_can_use_recovery(ubq) && unlikely(ubq->force_abort)) + if (ublk_nosrv_should_queue_io(ubq) && unlikely(ubq->force_abort)) return BLK_STS_IOERR; if (unlikely(ubq->canceling)) { @@ -1517,10 +1539,10 @@ static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd, ublk_cancel_cmd(ubq, io, issue_flags); if (need_schedule) { - if (ublk_can_use_recovery(ub)) - schedule_work(&ub->quiesce_work); - else + if (ublk_nosrv_should_stop_dev(ub)) schedule_work(&ub->stop_work); + else + schedule_work(&ub->quiesce_work); } } @@ -1640,7 +1662,7 @@ static void ublk_stop_dev(struct ublk_device *ub) mutex_lock(&ub->mutex); if (ub->dev_info.state == UBLK_S_DEV_DEAD) goto unlock; - if (ublk_can_use_recovery(ub)) { + if (ublk_nosrv_dev_should_queue_io(ub)) { if (ub->dev_info.state == UBLK_S_DEV_LIVE) __ublk_quiesce_dev(ub); ublk_unquiesce_dev(ub); @@ -2738,7 +2760,7 @@ static int ublk_ctrl_start_recovery(struct ublk_device *ub, int i; mutex_lock(&ub->mutex); - if (!ublk_can_use_recovery(ub)) + if (ublk_nosrv_should_stop_dev(ub)) goto out_unlock; if (!ub->nr_queues_ready) goto out_unlock; @@ -2791,7 +2813,7 @@ static int ublk_ctrl_end_recovery(struct ublk_device *ub, __func__, ub->dev_info.nr_hw_queues, header->dev_id); mutex_lock(&ub->mutex); - if (!ublk_can_use_recovery(ub)) + if (ublk_nosrv_should_stop_dev(ub)) goto out_unlock; if (ub->dev_info.state != UBLK_S_DEV_QUIESCED) { From caa5c8a2358604f38bf0a4afaa5eacda13763067 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 9 Apr 2025 09:14:41 +0800 Subject: [PATCH 11/37] ublk: fix handling recovery & reissue in ublk_abort_queue() [ Upstream commit 6ee6bd5d4fce502a5b5a2ea805e9ff16e6aa890f ] Commit 8284066946e6 ("ublk: grab request reference when the request is handled by userspace") doesn't grab request reference in case of recovery reissue. Then the request can be requeued & re-dispatch & failed when canceling uring command. If it is one zc request, the request can be freed before io_uring returns the zc buffer back, then cause kernel panic: [ 126.773061] BUG: kernel NULL pointer dereference, address: 00000000000000c8 [ 126.773657] #PF: supervisor read access in kernel mode [ 126.774052] #PF: error_code(0x0000) - not-present page [ 126.774455] PGD 0 P4D 0 [ 126.774698] Oops: Oops: 0000 [#1] SMP NOPTI [ 126.775034] CPU: 13 UID: 0 PID: 1612 Comm: kworker/u64:55 Not tainted 6.14.0_blk+ #182 PREEMPT(full) [ 126.775676] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-1.fc39 04/01/2014 [ 126.776275] Workqueue: iou_exit io_ring_exit_work [ 126.776651] RIP: 0010:ublk_io_release+0x14/0x130 [ublk_drv] Fixes it by always grabbing request reference for aborting the request. Reported-by: Caleb Sander Mateos Closes: https://lore.kernel.org/linux-block/CADUfDZodKfOGUeWrnAxcZiLT+puaZX8jDHoj_sfHZCOZwhzz6A@mail.gmail.com/ Fixes: 8284066946e6 ("ublk: grab request reference when the request is handled by userspace") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250409011444.2142010-2-ming.lei@redhat.com Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- drivers/block/ublk_drv.c | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index dd328d40c7de..38b9e485e520 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1081,6 +1081,25 @@ static void ublk_complete_rq(struct kref *ref) __ublk_complete_rq(req); } +static void ublk_do_fail_rq(struct request *req) +{ + struct ublk_queue *ubq = req->mq_hctx->driver_data; + + if (ublk_nosrv_should_reissue_outstanding(ubq->dev)) + blk_mq_requeue_request(req, false); + else + __ublk_complete_rq(req); +} + +static void ublk_fail_rq_fn(struct kref *ref) +{ + struct ublk_rq_data *data = container_of(ref, struct ublk_rq_data, + ref); + struct request *req = blk_mq_rq_from_pdu(data); + + ublk_do_fail_rq(req); +} + /* * Since __ublk_rq_task_work always fails requests immediately during * exiting, __ublk_fail_req() is only called from abort context during @@ -1094,10 +1113,13 @@ static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io, { WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE); - if (ublk_nosrv_should_reissue_outstanding(ubq->dev)) - blk_mq_requeue_request(req, false); - else - ublk_put_req_ref(ubq, req); + if (ublk_need_req_ref(ubq)) { + struct ublk_rq_data *data = blk_mq_rq_to_pdu(req); + + kref_put(&data->ref, ublk_fail_rq_fn); + } else { + ublk_do_fail_rq(req); + } } static void ubq_complete_io_cmd(struct ublk_io *io, int res, From 9ddc7edc558acef8dd4f645af3683b226e2b590c Mon Sep 17 00:00:00 2001 From: Badal Nilawar Date: Mon, 10 Mar 2025 20:58:21 +0530 Subject: [PATCH 12/37] drm/i915: Disable RPG during live selftest [ Upstream commit 9d3d9776bd3bd9c32d460dfe6c3363134de578bc ] The Forcewake timeout issue has been observed on Gen 12.0 and above. To address this, disable Render Power-Gating (RPG) during live self-tests for these generations. The temporary workaround 'drm/i915/mtl: do not enable render power-gating on MTL' disables RPG globally, which is unnecessary since the issues were only seen during self-tests. v2: take runtime pm wakeref Closes: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/9413 Fixes: 25e7976db86b ("drm/i915/mtl: do not enable render power-gating on MTL") Cc: Rodrigo Vivi Cc: Andi Shyti Cc: Andrzej Hajda Signed-off-by: Badal Nilawar Signed-off-by: Sk Anirban Reviewed-by: Karthik Poosa Signed-off-by: Anshuman Gupta Link: https://patchwork.freedesktop.org/patch/msgid/20250310152821.2931678-1-sk.anirban@intel.com (cherry picked from commit 0a4ae87706c6d15d14648e428c3a76351f823e48) Signed-off-by: Jani Nikula Signed-off-by: Sasha Levin --- drivers/gpu/drm/i915/gt/intel_rc6.c | 19 ++++--------------- .../gpu/drm/i915/selftests/i915_selftest.c | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_rc6.c b/drivers/gpu/drm/i915/gt/intel_rc6.c index 9378d5901c49..9ca42589da4d 100644 --- a/drivers/gpu/drm/i915/gt/intel_rc6.c +++ b/drivers/gpu/drm/i915/gt/intel_rc6.c @@ -117,21 +117,10 @@ static void gen11_rc6_enable(struct intel_rc6 *rc6) GEN6_RC_CTL_RC6_ENABLE | GEN6_RC_CTL_EI_MODE(1); - /* - * BSpec 52698 - Render powergating must be off. - * FIXME BSpec is outdated, disabling powergating for MTL is just - * temporary wa and should be removed after fixing real cause - * of forcewake timeouts. - */ - if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 74))) - pg_enable = - GEN9_MEDIA_PG_ENABLE | - GEN11_MEDIA_SAMPLER_PG_ENABLE; - else - pg_enable = - GEN9_RENDER_PG_ENABLE | - GEN9_MEDIA_PG_ENABLE | - GEN11_MEDIA_SAMPLER_PG_ENABLE; + pg_enable = + GEN9_RENDER_PG_ENABLE | + GEN9_MEDIA_PG_ENABLE | + GEN11_MEDIA_SAMPLER_PG_ENABLE; if (GRAPHICS_VER(gt->i915) >= 12 && !IS_DG1(gt->i915)) { for (i = 0; i < I915_MAX_VCS; i++) diff --git a/drivers/gpu/drm/i915/selftests/i915_selftest.c b/drivers/gpu/drm/i915/selftests/i915_selftest.c index fee76c1d2f45..889281819c5b 100644 --- a/drivers/gpu/drm/i915/selftests/i915_selftest.c +++ b/drivers/gpu/drm/i915/selftests/i915_selftest.c @@ -23,7 +23,9 @@ #include +#include "gt/intel_gt.h" #include "gt/intel_gt_pm.h" +#include "gt/intel_gt_regs.h" #include "gt/uc/intel_gsc_fw.h" #include "i915_driver.h" @@ -253,11 +255,27 @@ int i915_mock_selftests(void) int i915_live_selftests(struct pci_dev *pdev) { struct drm_i915_private *i915 = pdev_to_i915(pdev); + struct intel_uncore *uncore = &i915->uncore; int err; + u32 pg_enable; + intel_wakeref_t wakeref; if (!i915_selftest.live) return 0; + /* + * FIXME Disable render powergating, this is temporary wa and should be removed + * after fixing real cause of forcewake timeouts. + */ + with_intel_runtime_pm(uncore->rpm, wakeref) { + if (IS_GFX_GT_IP_RANGE(to_gt(i915), IP_VER(12, 00), IP_VER(12, 74))) { + pg_enable = intel_uncore_read(uncore, GEN9_PG_ENABLE); + if (pg_enable & GEN9_RENDER_PG_ENABLE) + intel_uncore_write_fw(uncore, GEN9_PG_ENABLE, + pg_enable & ~GEN9_RENDER_PG_ENABLE); + } + } + __wait_gsc_proxy_completed(i915); __wait_gsc_huc_load_completed(i915); From 857e9432dab9daf4cd1a8278c6a8c4662e63dbc0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20Van=C4=9Bk?= Date: Mon, 7 Apr 2025 15:24:27 +0200 Subject: [PATCH 13/37] x86/acpi: Don't limit CPUs to 1 for Xen PV guests due to disabled ACPI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 8b37357a78d7fa13d88ea822b35b40137da1c85e ] Xen disables ACPI for PV guests in DomU, which causes acpi_mps_check() to return 1 when CONFIG_X86_MPPARSE is not set. As a result, the local APIC is disabled and the guest is later limited to a single vCPU, despite being configured with more. This regression was introduced in version 6.9 in commit 7c0edad3643f ("x86/cpu/topology: Rework possible CPU management"), which added an early check that limits CPUs to 1 if apic_is_disabled. Update the acpi_mps_check() logic to return 0 early when running as a Xen PV guest in DomU, preventing APIC from being disabled in this specific case and restoring correct multi-vCPU behaviour. Fixes: 7c0edad3643f ("x86/cpu/topology: Rework possible CPU management") Signed-off-by: Petr Vaněk Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250407132445.6732-2-arkamar@atlas.cz Signed-off-by: Sasha Levin --- arch/x86/kernel/acpi/boot.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index c70b86f1f295..63adda8a143f 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -23,6 +23,8 @@ #include #include +#include + #include #include #include @@ -1730,6 +1732,15 @@ int __init acpi_mps_check(void) { #if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_X86_MPPARSE) /* mptable code is not built-in*/ + + /* + * Xen disables ACPI in PV DomU guests but it still emulates APIC and + * supports SMP. Returning early here ensures that APIC is not disabled + * unnecessarily and the guest is not limited to a single vCPU. + */ + if (xen_pv_domain() && !xen_initial_domain()) + return 0; + if (acpi_disabled || acpi_noirq) { pr_warn("MPS support code is not built-in, using acpi=off or acpi=noirq or pci=noacpi may have problem\n"); return 1; From 9e0bdc15579e3ab619525583e1da699f50afb0d2 Mon Sep 17 00:00:00 2001 From: Tejas Upadhyay Date: Thu, 27 Mar 2025 17:56:47 +0530 Subject: [PATCH 14/37] drm/xe/hw_engine: define sysfs_ops on all directories [ Upstream commit a5c71fd5b69b9da77e5e0b268e69e256932ba49c ] Sysfs_ops needs to be defined on all directories which can have attr files with set/get method. Add sysfs_ops to even those directories which is currently empty but would have attr files with set/get method in future. Leave .default with default sysfs_ops as it will never have setter method. V2(Himal/Rodrigo): - use single sysfs_ops for all dir and attr with set/get - add default ops as ./default does not need runtime pm at all Fixes: 3f0e14651ab0 ("drm/xe: Runtime PM wake on every sysfs call") Reviewed-by: Himal Prasad Ghimiray Link: https://patchwork.freedesktop.org/patch/msgid/20250327122647.886637-1-tejas.upadhyay@intel.com Signed-off-by: Tejas Upadhyay (cherry picked from commit 40780b9760b561e093508d07b8b9b06c94ab201e) Signed-off-by: Lucas De Marchi Signed-off-by: Sasha Levin --- drivers/gpu/drm/xe/xe_hw_engine_class_sysfs.c | 108 +++++++++--------- 1 file changed, 52 insertions(+), 56 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_hw_engine_class_sysfs.c b/drivers/gpu/drm/xe/xe_hw_engine_class_sysfs.c index b53e8d2accdb..a440442b4d72 100644 --- a/drivers/gpu/drm/xe/xe_hw_engine_class_sysfs.c +++ b/drivers/gpu/drm/xe/xe_hw_engine_class_sysfs.c @@ -32,14 +32,61 @@ bool xe_hw_engine_timeout_in_range(u64 timeout, u64 min, u64 max) return timeout >= min && timeout <= max; } -static void kobj_xe_hw_engine_release(struct kobject *kobj) +static void xe_hw_engine_sysfs_kobj_release(struct kobject *kobj) { kfree(kobj); } +static ssize_t xe_hw_engine_class_sysfs_attr_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct xe_device *xe = kobj_to_xe(kobj); + struct kobj_attribute *kattr; + ssize_t ret = -EIO; + + kattr = container_of(attr, struct kobj_attribute, attr); + if (kattr->show) { + xe_pm_runtime_get(xe); + ret = kattr->show(kobj, kattr, buf); + xe_pm_runtime_put(xe); + } + + return ret; +} + +static ssize_t xe_hw_engine_class_sysfs_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, + size_t count) +{ + struct xe_device *xe = kobj_to_xe(kobj); + struct kobj_attribute *kattr; + ssize_t ret = -EIO; + + kattr = container_of(attr, struct kobj_attribute, attr); + if (kattr->store) { + xe_pm_runtime_get(xe); + ret = kattr->store(kobj, kattr, buf, count); + xe_pm_runtime_put(xe); + } + + return ret; +} + +static const struct sysfs_ops xe_hw_engine_class_sysfs_ops = { + .show = xe_hw_engine_class_sysfs_attr_show, + .store = xe_hw_engine_class_sysfs_attr_store, +}; + static const struct kobj_type kobj_xe_hw_engine_type = { - .release = kobj_xe_hw_engine_release, - .sysfs_ops = &kobj_sysfs_ops + .release = xe_hw_engine_sysfs_kobj_release, + .sysfs_ops = &xe_hw_engine_class_sysfs_ops, +}; + +static const struct kobj_type kobj_xe_hw_engine_type_def = { + .release = xe_hw_engine_sysfs_kobj_release, + .sysfs_ops = &kobj_sysfs_ops, }; static ssize_t job_timeout_max_store(struct kobject *kobj, @@ -543,7 +590,7 @@ static int xe_add_hw_engine_class_defaults(struct xe_device *xe, if (!kobj) return -ENOMEM; - kobject_init(kobj, &kobj_xe_hw_engine_type); + kobject_init(kobj, &kobj_xe_hw_engine_type_def); err = kobject_add(kobj, parent, "%s", ".defaults"); if (err) goto err_object; @@ -559,57 +606,6 @@ err_object: return err; } -static void xe_hw_engine_sysfs_kobj_release(struct kobject *kobj) -{ - kfree(kobj); -} - -static ssize_t xe_hw_engine_class_sysfs_attr_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct xe_device *xe = kobj_to_xe(kobj); - struct kobj_attribute *kattr; - ssize_t ret = -EIO; - - kattr = container_of(attr, struct kobj_attribute, attr); - if (kattr->show) { - xe_pm_runtime_get(xe); - ret = kattr->show(kobj, kattr, buf); - xe_pm_runtime_put(xe); - } - - return ret; -} - -static ssize_t xe_hw_engine_class_sysfs_attr_store(struct kobject *kobj, - struct attribute *attr, - const char *buf, - size_t count) -{ - struct xe_device *xe = kobj_to_xe(kobj); - struct kobj_attribute *kattr; - ssize_t ret = -EIO; - - kattr = container_of(attr, struct kobj_attribute, attr); - if (kattr->store) { - xe_pm_runtime_get(xe); - ret = kattr->store(kobj, kattr, buf, count); - xe_pm_runtime_put(xe); - } - - return ret; -} - -static const struct sysfs_ops xe_hw_engine_class_sysfs_ops = { - .show = xe_hw_engine_class_sysfs_attr_show, - .store = xe_hw_engine_class_sysfs_attr_store, -}; - -static const struct kobj_type xe_hw_engine_sysfs_kobj_type = { - .release = xe_hw_engine_sysfs_kobj_release, - .sysfs_ops = &xe_hw_engine_class_sysfs_ops, -}; static void hw_engine_class_sysfs_fini(void *arg) { @@ -640,7 +636,7 @@ int xe_hw_engine_class_sysfs_init(struct xe_gt *gt) if (!kobj) return -ENOMEM; - kobject_init(kobj, &xe_hw_engine_sysfs_kobj_type); + kobject_init(kobj, &kobj_xe_hw_engine_type); err = kobject_add(kobj, gt->sysfs, "engines"); if (err) From ee2b0301d6bfe16b35d57947687c664ecb815775 Mon Sep 17 00:00:00 2001 From: Henry Martin Date: Fri, 4 Apr 2025 14:14:38 +0800 Subject: [PATCH 15/37] ata: pata_pxa: Fix potential NULL pointer dereference in pxa_ata_probe() [ Upstream commit ad320e408a8c95a282ab9c05cdf0c9b95e317985 ] devm_ioremap() returns NULL on error. Currently, pxa_ata_probe() does not check for this case, which can result in a NULL pointer dereference. Add NULL check after devm_ioremap() to prevent this issue. Fixes: 2dc6c6f15da9 ("[ARM] pata_pxa: DMA-capable PATA driver") Signed-off-by: Henry Martin Signed-off-by: Damien Le Moal Signed-off-by: Sasha Levin --- drivers/ata/pata_pxa.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/ata/pata_pxa.c b/drivers/ata/pata_pxa.c index 538bd3423d85..1bdcd6ee741d 100644 --- a/drivers/ata/pata_pxa.c +++ b/drivers/ata/pata_pxa.c @@ -223,10 +223,16 @@ static int pxa_ata_probe(struct platform_device *pdev) ap->ioaddr.cmd_addr = devm_ioremap(&pdev->dev, cmd_res->start, resource_size(cmd_res)); + if (!ap->ioaddr.cmd_addr) + return -ENOMEM; ap->ioaddr.ctl_addr = devm_ioremap(&pdev->dev, ctl_res->start, resource_size(ctl_res)); + if (!ap->ioaddr.ctl_addr) + return -ENOMEM; ap->ioaddr.bmdma_addr = devm_ioremap(&pdev->dev, dma_res->start, resource_size(dma_res)); + if (!ap->ioaddr.bmdma_addr) + return -ENOMEM; /* * Adjust register offsets From 1b7685256db2faa07b75ecd137d172cc45b11c74 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 8 Apr 2025 00:02:13 -0700 Subject: [PATCH 16/37] objtool: Fix INSN_CONTEXT_SWITCH handling in validate_unret() [ Upstream commit a8df7d0ef92eca28c610206c6748daf537ac0586 ] The !CONFIG_IA32_EMULATION version of xen_entry_SYSCALL_compat() ends with a SYSCALL instruction which is classified by objtool as INSN_CONTEXT_SWITCH. Unlike validate_branch(), validate_unret() doesn't consider INSN_CONTEXT_SWITCH in a non-function to be a dead end, so it keeps going past the end of xen_entry_SYSCALL_compat(), resulting in the following warning: vmlinux.o: warning: objtool: xen_reschedule_interrupt+0x2a: RET before UNTRAIN Fix that by adding INSN_CONTEXT_SWITCH handling to validate_unret() to match what validate_branch() is already doing. Fixes: a09a6e2399ba ("objtool: Add entry UNRET validation") Reported-by: Andrew Cooper Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/f5eda46fd09f15b1f5cde3d9ae3b92b958342add.1744095216.git.jpoimboe@kernel.org Signed-off-by: Sasha Levin --- tools/objtool/check.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 286a2c0af02a..127862fa05c6 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -3990,6 +3990,11 @@ static int validate_unret(struct objtool_file *file, struct instruction *insn) WARN_INSN(insn, "RET before UNTRAIN"); return 1; + case INSN_CONTEXT_SWITCH: + if (insn_func(insn)) + break; + return 0; + case INSN_NOP: if (insn->retpoline_safe) return 0; From 7c5957f7905b4aede9d7a559d271438f3ca9e852 Mon Sep 17 00:00:00 2001 From: Tung Nguyen Date: Thu, 3 Apr 2025 09:24:31 +0000 Subject: [PATCH 17/37] tipc: fix memory leak in tipc_link_xmit [ Upstream commit 69ae94725f4fc9e75219d2d69022029c5b24bc9a ] In case the backlog transmit queue for system-importance messages is overloaded, tipc_link_xmit() returns -ENOBUFS but the skb list is not purged. This leads to memory leak and failure when a skb is allocated. This commit fixes this issue by purging the skb list before tipc_link_xmit() returns. Fixes: 365ad353c256 ("tipc: reduce risk of user starvation during link congestion") Signed-off-by: Tung Nguyen Link: https://patch.msgid.link/20250403092431.514063-1-tung.quang.nguyen@est.tech Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- net/tipc/link.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/tipc/link.c b/net/tipc/link.c index 5c2088a469ce..5689e1f48547 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1046,6 +1046,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, if (unlikely(l->backlog[imp].len >= l->backlog[imp].limit)) { if (imp == TIPC_SYSTEM_IMPORTANCE) { pr_warn("%s<%s>, link overflow", link_rst_msg, l->name); + __skb_queue_purge(list); return -ENOBUFS; } rc = link_schedule_user(l, hdr); From 4d55144b12e742404bb3f8fee6038bafbf45619d Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 3 Apr 2025 14:16:31 -0700 Subject: [PATCH 18/37] codel: remove sch->q.qlen check before qdisc_tree_reduce_backlog() [ Upstream commit 342debc12183b51773b3345ba267e9263bdfaaef ] After making all ->qlen_notify() callbacks idempotent, now it is safe to remove the check of qlen!=0 from both fq_codel_dequeue() and codel_qdisc_dequeue(). Reported-by: Gerrard Tai Fixes: 4b549a2ef4be ("fq_codel: Fair Queue Codel AQM") Fixes: 76e3cc126bb2 ("codel: Controlled Delay AQM") Signed-off-by: Cong Wang Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250403211636.166257-1-xiyou.wangcong@gmail.com Acked-by: Jamal Hadi Salim Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- net/sched/sch_codel.c | 5 +---- net/sched/sch_fq_codel.c | 6 ++---- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index 3e8d4fe4d91e..e1f6e7618deb 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -65,10 +65,7 @@ static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch) &q->stats, qdisc_pkt_len, codel_get_enqueue_time, drop_func, dequeue_func); - /* We cant call qdisc_tree_reduce_backlog() if our qlen is 0, - * or HTB crashes. Defer it for next round. - */ - if (q->stats.drop_count && sch->q.qlen) { + if (q->stats.drop_count) { qdisc_tree_reduce_backlog(sch, q->stats.drop_count, q->stats.drop_len); q->stats.drop_count = 0; q->stats.drop_len = 0; diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 4f908c11ba95..778f6e5966be 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -314,10 +314,8 @@ begin: } qdisc_bstats_update(sch, skb); flow->deficit -= qdisc_pkt_len(skb); - /* We cant call qdisc_tree_reduce_backlog() if our qlen is 0, - * or HTB crashes. Defer it for next round. - */ - if (q->cstats.drop_count && sch->q.qlen) { + + if (q->cstats.drop_count) { qdisc_tree_reduce_backlog(sch, q->cstats.drop_count, q->cstats.drop_len); q->cstats.drop_count = 0; From 9fcbca0f801580cbb583e9cb274e2c7fbe766ca6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Apr 2025 11:03:33 -0700 Subject: [PATCH 19/37] net: tls: explicitly disallow disconnect [ Upstream commit 5071a1e606b30c0c11278d3c6620cd6a24724cf6 ] syzbot discovered that it can disconnect a TLS socket and then run into all sort of unexpected corner cases. I have a vague recollection of Eric pointing this out to us a long time ago. Supporting disconnect is really hard, for one thing if offload is enabled we'd need to wait for all packets to be _acked_. Disconnect is not commonly used, disallow it. The immediate problem syzbot run into is the warning in the strp, but that's just the easiest bug to trigger: WARNING: CPU: 0 PID: 5834 at net/tls/tls_strp.c:486 tls_strp_msg_load+0x72e/0xa80 net/tls/tls_strp.c:486 RIP: 0010:tls_strp_msg_load+0x72e/0xa80 net/tls/tls_strp.c:486 Call Trace: tls_rx_rec_wait+0x280/0xa60 net/tls/tls_sw.c:1363 tls_sw_recvmsg+0x85c/0x1c30 net/tls/tls_sw.c:2043 inet6_recvmsg+0x2c9/0x730 net/ipv6/af_inet6.c:678 sock_recvmsg_nosec net/socket.c:1023 [inline] sock_recvmsg+0x109/0x280 net/socket.c:1045 __sys_recvfrom+0x202/0x380 net/socket.c:2237 Fixes: 3c4d7559159b ("tls: kernel TLS support") Reported-by: syzbot+b4cd76826045a1eb93c1@syzkaller.appspotmail.com Signed-off-by: Jakub Kicinski Reviewed-by: Eric Dumazet Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20250404180334.3224206-1-kuba@kernel.org Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- net/tls/tls_main.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 6b4b9f2749a6..0acf313deb01 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -809,6 +809,11 @@ static int tls_setsockopt(struct sock *sk, int level, int optname, return do_tls_setsockopt(sk, optname, optval, optlen); } +static int tls_disconnect(struct sock *sk, int flags) +{ + return -EOPNOTSUPP; +} + struct tls_context *tls_ctx_create(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -904,6 +909,7 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG], prot[TLS_BASE][TLS_BASE] = *base; prot[TLS_BASE][TLS_BASE].setsockopt = tls_setsockopt; prot[TLS_BASE][TLS_BASE].getsockopt = tls_getsockopt; + prot[TLS_BASE][TLS_BASE].disconnect = tls_disconnect; prot[TLS_BASE][TLS_BASE].close = tls_sk_proto_close; prot[TLS_SW][TLS_BASE] = prot[TLS_BASE][TLS_BASE]; From f0bb06b9f1d0bf6d02d957126cd25b52f089fb46 Mon Sep 17 00:00:00 2001 From: Hariprasad Kelam Date: Mon, 7 Apr 2025 12:33:41 +0530 Subject: [PATCH 20/37] octeontx2-pf: qos: fix VF root node parent queue index [ Upstream commit b7db94734e785e380b0db0f9295e07024f4d42a0 ] The current code configures the Physical Function (PF) root node at TL1 and the Virtual Function (VF) root node at TL2. This ensure at any given point of time PF traffic gets more priority. PF root node TL1 / \ TL2 TL2 VF root node / \ TL3 TL3 / \ TL4 TL4 / \ SMQ SMQ Due to a bug in the current code, the TL2 parent queue index on the VF interface is not being configured, leading to 'SMQ Flush' errors Fixes: 5e6808b4c68d ("octeontx2-pf: Add support for HTB offload") Signed-off-by: Hariprasad Kelam Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250407070341.2765426-1-hkelam@marvell.com Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- drivers/net/ethernet/marvell/octeontx2/nic/qos.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/qos.c b/drivers/net/ethernet/marvell/octeontx2/nic/qos.c index 0f844c14485a..35acc07bd964 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/qos.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/qos.c @@ -165,6 +165,11 @@ static void __otx2_qos_txschq_cfg(struct otx2_nic *pfvf, otx2_config_sched_shaping(pfvf, node, cfg, &num_regs); } else if (level == NIX_TXSCH_LVL_TL2) { + /* configure parent txschq */ + cfg->reg[num_regs] = NIX_AF_TL2X_PARENT(node->schq); + cfg->regval[num_regs] = (u64)hw->tx_link << 16; + num_regs++; + /* configure link cfg */ if (level == pfvf->qos.link_cfg_lvl) { cfg->reg[num_regs] = NIX_AF_TL3_TL2X_LINKX_CFG(node->schq, hw->tx_link); From b2f3c3d57a832666a9d15e905e7717ac13977ef4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Mon, 7 Apr 2025 12:55:34 +0200 Subject: [PATCH 21/37] tc: Ensure we have enough buffer space when sending filter netlink notifications MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 369609fc6272c2f6ad666ba4fd913f3baf32908f ] The tfilter_notify() and tfilter_del_notify() functions assume that NLMSG_GOODSIZE is always enough to dump the filter chain. This is not always the case, which can lead to silent notify failures (because the return code of tfilter_notify() is not always checked). In particular, this can lead to NLM_F_ECHO not being honoured even though an action succeeds, which forces userspace to create workarounds[0]. Fix this by increasing the message size if dumping the filter chain into the allocated skb fails. Use the size of the incoming skb as a size hint if set, so we can start at a larger value when appropriate. To trigger this, run the following commands: # ip link add type veth # tc qdisc replace dev veth0 root handle 1: fq_codel # tc -echo filter add dev veth0 parent 1: u32 match u32 0 0 $(for i in $(seq 32); do echo action pedit munge ip dport set 22; done) Before this fix, tc just returns: Not a filter(cmd 2) After the fix, we get the correct echo: added filter dev veth0 parent 1: protocol all pref 49152 u32 chain 0 fh 800::800 order 2048 key ht 800 bkt 0 terminal flowid not_in_hw match 00000000/00000000 at 0 action order 1: pedit action pass keys 1 index 1 ref 1 bind 1 key #0 at 20: val 00000016 mask ffff0000 [repeated 32 times] [0] https://github.com/openvswitch/ovs/commit/106ef21860c935e5e0017a88bf42b94025c4e511 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Frode Nordahl Closes: https://bugs.launchpad.net/ubuntu/+source/openvswitch/+bug/2018500 Signed-off-by: Toke Høiland-Jørgensen Reviewed-by: Jiri Pirko Link: https://patch.msgid.link/20250407105542.16601-1-toke@redhat.com Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- net/sched/cls_api.c | 66 ++++++++++++++++++++++++++++++--------------- 1 file changed, 45 insertions(+), 21 deletions(-) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 998ea3b5badf..a3bab5e27e71 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -2051,6 +2051,7 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb, struct tcmsg *tcm; struct nlmsghdr *nlh; unsigned char *b = skb_tail_pointer(skb); + int ret = -EMSGSIZE; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags); if (!nlh) @@ -2095,11 +2096,45 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb, return skb->len; +cls_op_not_supp: + ret = -EOPNOTSUPP; out_nlmsg_trim: nla_put_failure: -cls_op_not_supp: nlmsg_trim(skb, b); - return -1; + return ret; +} + +static struct sk_buff *tfilter_notify_prep(struct net *net, + struct sk_buff *oskb, + struct nlmsghdr *n, + struct tcf_proto *tp, + struct tcf_block *block, + struct Qdisc *q, u32 parent, + void *fh, int event, + u32 portid, bool rtnl_held, + struct netlink_ext_ack *extack) +{ + unsigned int size = oskb ? max(NLMSG_GOODSIZE, oskb->len) : NLMSG_GOODSIZE; + struct sk_buff *skb; + int ret; + +retry: + skb = alloc_skb(size, GFP_KERNEL); + if (!skb) + return ERR_PTR(-ENOBUFS); + + ret = tcf_fill_node(net, skb, tp, block, q, parent, fh, portid, + n->nlmsg_seq, n->nlmsg_flags, event, false, + rtnl_held, extack); + if (ret <= 0) { + kfree_skb(skb); + if (ret == -EMSGSIZE) { + size += NLMSG_GOODSIZE; + goto retry; + } + return ERR_PTR(-EINVAL); + } + return skb; } static int tfilter_notify(struct net *net, struct sk_buff *oskb, @@ -2115,16 +2150,10 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb, if (!unicast && !rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) return 0; - skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); - if (!skb) - return -ENOBUFS; - - if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid, - n->nlmsg_seq, n->nlmsg_flags, event, - false, rtnl_held, extack) <= 0) { - kfree_skb(skb); - return -EINVAL; - } + skb = tfilter_notify_prep(net, oskb, n, tp, block, q, parent, fh, event, + portid, rtnl_held, extack); + if (IS_ERR(skb)) + return PTR_ERR(skb); if (unicast) err = rtnl_unicast(skb, net, portid); @@ -2147,16 +2176,11 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) return tp->ops->delete(tp, fh, last, rtnl_held, extack); - skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); - if (!skb) - return -ENOBUFS; - - if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid, - n->nlmsg_seq, n->nlmsg_flags, RTM_DELTFILTER, - false, rtnl_held, extack) <= 0) { + skb = tfilter_notify_prep(net, oskb, n, tp, block, q, parent, fh, + RTM_DELTFILTER, portid, rtnl_held, extack); + if (IS_ERR(skb)) { NL_SET_ERR_MSG(extack, "Failed to build del event notification"); - kfree_skb(skb); - return -EINVAL; + return PTR_ERR(skb); } err = tp->ops->delete(tp, fh, last, rtnl_held, extack); From a065b996052656a65afc51ad82336dc55ae4c72f Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Mon, 7 Apr 2025 15:05:10 +0200 Subject: [PATCH 22/37] net: ethtool: Don't call .cleanup_data when prepare_data fails [ Upstream commit 4f038a6a02d20859a3479293cbf172b0f14cbdd6 ] There's a consistent pattern where the .cleanup_data() callback is called when .prepare_data() fails, when it should really be called to clean after a successful .prepare_data() as per the documentation. Rewrite the error-handling paths to make sure we don't cleanup un-prepared data. Fixes: c781ff12a2f3 ("ethtool: Allow network drivers to dump arbitrary EEPROM data") Reviewed-by: Kory Maincent Reviewed-by: Simon Horman Reviewed-by: Michal Kubecek Signed-off-by: Maxime Chevallier Link: https://patch.msgid.link/20250407130511.75621-1-maxime.chevallier@bootlin.com Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- net/ethtool/netlink.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index e233dfc8ca4b..a52be67139d0 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -490,7 +490,7 @@ static int ethnl_default_doit(struct sk_buff *skb, struct genl_info *info) ret = ops->prepare_data(req_info, reply_data, info); rtnl_unlock(); if (ret < 0) - goto err_cleanup; + goto err_dev; ret = ops->reply_size(req_info, reply_data); if (ret < 0) goto err_cleanup; @@ -548,7 +548,7 @@ static int ethnl_default_dump_one(struct sk_buff *skb, struct net_device *dev, ret = ctx->ops->prepare_data(ctx->req_info, ctx->reply_data, info); rtnl_unlock(); if (ret < 0) - goto out; + goto out_cancel; ret = ethnl_fill_reply_header(skb, dev, ctx->ops->hdr_attr); if (ret < 0) goto out; @@ -557,6 +557,7 @@ static int ethnl_default_dump_one(struct sk_buff *skb, struct net_device *dev, out: if (ctx->ops->cleanup_data) ctx->ops->cleanup_data(ctx->reply_data); +out_cancel: ctx->reply_data->dev = NULL; if (ret < 0) genlmsg_cancel(skb, ehdr); @@ -760,7 +761,7 @@ static void ethnl_default_notify(struct net_device *dev, unsigned int cmd, ethnl_init_reply_data(reply_data, ops, dev); ret = ops->prepare_data(req_info, reply_data, &info); if (ret < 0) - goto err_cleanup; + goto err_rep; ret = ops->reply_size(req_info, reply_data); if (ret < 0) goto err_cleanup; @@ -795,6 +796,7 @@ err_skb: err_cleanup: if (ops->cleanup_data) ops->cleanup_data(reply_data); +err_rep: kfree(reply_data); kfree(req_info); return; From 71dd750a08344e46d54023c9477036aaf26d79f9 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 8 Apr 2025 09:34:08 +0200 Subject: [PATCH 23/37] drm/tests: modeset: Fix drm_display_mode memory leak [ Upstream commit dacafdcc7789cfeb0f0552716db56f210238225d ] drm_mode_find_dmt() returns a drm_display_mode that needs to be destroyed later one. The drm_test_pick_cmdline_res_1920_1080_60() test never does however, which leads to a memory leak. Let's make sure it's freed. Reported-by: Philipp Stanner Closes: https://lore.kernel.org/dri-devel/a7655158a6367ac46194d57f4b7433ef0772a73e.camel@mailbox.org/ Fixes: 8fc0380f6ba7 ("drm/client: Add some tests for drm_connector_pick_cmdline_mode()") Reviewed-by: Thomas Zimmermann Link: https://lore.kernel.org/r/20250408-drm-kunit-drm-display-mode-memleak-v1-2-996305a2e75a@kernel.org Signed-off-by: Maxime Ripard Signed-off-by: Sasha Levin --- drivers/gpu/drm/tests/drm_client_modeset_test.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/tests/drm_client_modeset_test.c b/drivers/gpu/drm/tests/drm_client_modeset_test.c index 7516f6cb36e4..3e9518d7b8b7 100644 --- a/drivers/gpu/drm/tests/drm_client_modeset_test.c +++ b/drivers/gpu/drm/tests/drm_client_modeset_test.c @@ -95,6 +95,9 @@ static void drm_test_pick_cmdline_res_1920_1080_60(struct kunit *test) expected_mode = drm_mode_find_dmt(priv->drm, 1920, 1080, 60, false); KUNIT_ASSERT_NOT_NULL(test, expected_mode); + ret = drm_kunit_add_mode_destroy_action(test, expected_mode); + KUNIT_ASSERT_EQ(test, ret, 0); + KUNIT_ASSERT_TRUE(test, drm_mode_parse_command_line_for_connector(cmdline, connector, From f951d643bc2618c41162189f24491f7a21c55e6f Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 8 Apr 2025 09:34:07 +0200 Subject: [PATCH 24/37] drm/tests: helpers: Create kunit helper to destroy a drm_display_mode [ Upstream commit 13c1d5f3a7fa7b55a26e73bb9e95342374a489b2 ] A number of test suites call functions that expect the returned drm_display_mode to be destroyed eventually. However, none of the tests called drm_mode_destroy, which results in a memory leak. Since drm_mode_destroy takes two pointers as argument, we can't use a kunit wrapper. Let's just create a helper every test suite can use. Reviewed-by: Thomas Zimmermann Link: https://lore.kernel.org/r/20250408-drm-kunit-drm-display-mode-memleak-v1-1-996305a2e75a@kernel.org Signed-off-by: Maxime Ripard Stable-dep-of: 70f29ca3117a ("drm/tests: cmdline: Fix drm_display_mode memory leak") Signed-off-by: Sasha Levin --- drivers/gpu/drm/tests/drm_kunit_helpers.c | 22 ++++++++++++++++++++++ include/drm/drm_kunit_helpers.h | 3 +++ 2 files changed, 25 insertions(+) diff --git a/drivers/gpu/drm/tests/drm_kunit_helpers.c b/drivers/gpu/drm/tests/drm_kunit_helpers.c index 3c0b7824c0be..922c4b6ed1dc 100644 --- a/drivers/gpu/drm/tests/drm_kunit_helpers.c +++ b/drivers/gpu/drm/tests/drm_kunit_helpers.c @@ -319,6 +319,28 @@ static void kunit_action_drm_mode_destroy(void *ptr) drm_mode_destroy(NULL, mode); } +/** + * drm_kunit_add_mode_destroy_action() - Add a drm_destroy_mode kunit action + * @test: The test context object + * @mode: The drm_display_mode to destroy eventually + * + * Registers a kunit action that will destroy the drm_display_mode at + * the end of the test. + * + * If an error occurs, the drm_display_mode will be destroyed. + * + * Returns: + * 0 on success, an error code otherwise. + */ +int drm_kunit_add_mode_destroy_action(struct kunit *test, + struct drm_display_mode *mode) +{ + return kunit_add_action_or_reset(test, + kunit_action_drm_mode_destroy, + mode); +} +EXPORT_SYMBOL_GPL(drm_kunit_add_mode_destroy_action); + /** * drm_kunit_display_mode_from_cea_vic() - return a mode for CEA VIC for a KUnit test * @test: The test context object diff --git a/include/drm/drm_kunit_helpers.h b/include/drm/drm_kunit_helpers.h index afdd46ef04f7..c835f113055d 100644 --- a/include/drm/drm_kunit_helpers.h +++ b/include/drm/drm_kunit_helpers.h @@ -120,6 +120,9 @@ drm_kunit_helper_create_crtc(struct kunit *test, const struct drm_crtc_funcs *funcs, const struct drm_crtc_helper_funcs *helper_funcs); +int drm_kunit_add_mode_destroy_action(struct kunit *test, + struct drm_display_mode *mode); + struct drm_display_mode * drm_kunit_display_mode_from_cea_vic(struct kunit *test, struct drm_device *dev, u8 video_code); From c7a0a32e6cf74d739a1f83d31a9988e143d333c1 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 8 Apr 2025 09:34:10 +0200 Subject: [PATCH 25/37] drm/tests: cmdline: Fix drm_display_mode memory leak [ Upstream commit 70f29ca3117a8796cd6bde7612a3ded96d0f2dde ] drm_analog_tv_mode() and its variants return a drm_display_mode that needs to be destroyed later one. The drm_test_cmdline_tv_options() test never does however, which leads to a memory leak. Let's make sure it's freed. Reported-by: Philipp Stanner Closes: https://lore.kernel.org/dri-devel/a7655158a6367ac46194d57f4b7433ef0772a73e.camel@mailbox.org/ Fixes: e691c9992ae1 ("drm/modes: Introduce the tv_mode property as a command-line option") Reviewed-by: Thomas Zimmermann Link: https://lore.kernel.org/r/20250408-drm-kunit-drm-display-mode-memleak-v1-4-996305a2e75a@kernel.org Signed-off-by: Maxime Ripard Signed-off-by: Sasha Levin --- drivers/gpu/drm/tests/drm_cmdline_parser_test.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/tests/drm_cmdline_parser_test.c b/drivers/gpu/drm/tests/drm_cmdline_parser_test.c index 59c8408c453c..1cfcb597b088 100644 --- a/drivers/gpu/drm/tests/drm_cmdline_parser_test.c +++ b/drivers/gpu/drm/tests/drm_cmdline_parser_test.c @@ -7,6 +7,7 @@ #include #include +#include #include static const struct drm_connector no_connector = {}; @@ -955,8 +956,15 @@ struct drm_cmdline_tv_option_test { static void drm_test_cmdline_tv_options(struct kunit *test) { const struct drm_cmdline_tv_option_test *params = test->param_value; - const struct drm_display_mode *expected_mode = params->mode_fn(NULL); + struct drm_display_mode *expected_mode; struct drm_cmdline_mode mode = { }; + int ret; + + expected_mode = params->mode_fn(NULL); + KUNIT_ASSERT_NOT_NULL(test, expected_mode); + + ret = drm_kunit_add_mode_destroy_action(test, expected_mode); + KUNIT_ASSERT_EQ(test, ret, 0); KUNIT_EXPECT_TRUE(test, drm_mode_parse_command_line_for_connector(params->cmdline, &no_connector, &mode)); From 80f4dc6e1f5bfa53a6ea0d61e2620f2eb5ddc3db Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 8 Apr 2025 09:34:11 +0200 Subject: [PATCH 26/37] drm/tests: modes: Fix drm_display_mode memory leak [ Upstream commit d34146340f95cd9bf06d4ce71cca72127dc0b7cd ] drm_analog_tv_mode() and its variants return a drm_display_mode that needs to be destroyed later one. The drm_modes_analog_tv tests never do however, which leads to a memory leak. Let's make sure it's freed. Reported-by: Philipp Stanner Closes: https://lore.kernel.org/dri-devel/a7655158a6367ac46194d57f4b7433ef0772a73e.camel@mailbox.org/ Fixes: 4fcd238560ee ("drm/modes: Add a function to generate analog display modes") Reviewed-by: Thomas Zimmermann Link: https://lore.kernel.org/r/20250408-drm-kunit-drm-display-mode-memleak-v1-5-996305a2e75a@kernel.org Signed-off-by: Maxime Ripard Signed-off-by: Sasha Levin --- drivers/gpu/drm/tests/drm_modes_test.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/drivers/gpu/drm/tests/drm_modes_test.c b/drivers/gpu/drm/tests/drm_modes_test.c index 6ed51f99e133..7ba646d87856 100644 --- a/drivers/gpu/drm/tests/drm_modes_test.c +++ b/drivers/gpu/drm/tests/drm_modes_test.c @@ -40,6 +40,7 @@ static void drm_test_modes_analog_tv_ntsc_480i(struct kunit *test) { struct drm_test_modes_priv *priv = test->priv; struct drm_display_mode *mode; + int ret; mode = drm_analog_tv_mode(priv->drm, DRM_MODE_TV_MODE_NTSC, @@ -47,6 +48,9 @@ static void drm_test_modes_analog_tv_ntsc_480i(struct kunit *test) true); KUNIT_ASSERT_NOT_NULL(test, mode); + ret = drm_kunit_add_mode_destroy_action(test, mode); + KUNIT_ASSERT_EQ(test, ret, 0); + KUNIT_EXPECT_EQ(test, drm_mode_vrefresh(mode), 60); KUNIT_EXPECT_EQ(test, mode->hdisplay, 720); @@ -70,6 +74,7 @@ static void drm_test_modes_analog_tv_ntsc_480i_inlined(struct kunit *test) { struct drm_test_modes_priv *priv = test->priv; struct drm_display_mode *expected, *mode; + int ret; expected = drm_analog_tv_mode(priv->drm, DRM_MODE_TV_MODE_NTSC, @@ -77,9 +82,15 @@ static void drm_test_modes_analog_tv_ntsc_480i_inlined(struct kunit *test) true); KUNIT_ASSERT_NOT_NULL(test, expected); + ret = drm_kunit_add_mode_destroy_action(test, expected); + KUNIT_ASSERT_EQ(test, ret, 0); + mode = drm_mode_analog_ntsc_480i(priv->drm); KUNIT_ASSERT_NOT_NULL(test, mode); + ret = drm_kunit_add_mode_destroy_action(test, mode); + KUNIT_ASSERT_EQ(test, ret, 0); + KUNIT_EXPECT_TRUE(test, drm_mode_equal(expected, mode)); } @@ -87,6 +98,7 @@ static void drm_test_modes_analog_tv_pal_576i(struct kunit *test) { struct drm_test_modes_priv *priv = test->priv; struct drm_display_mode *mode; + int ret; mode = drm_analog_tv_mode(priv->drm, DRM_MODE_TV_MODE_PAL, @@ -94,6 +106,9 @@ static void drm_test_modes_analog_tv_pal_576i(struct kunit *test) true); KUNIT_ASSERT_NOT_NULL(test, mode); + ret = drm_kunit_add_mode_destroy_action(test, mode); + KUNIT_ASSERT_EQ(test, ret, 0); + KUNIT_EXPECT_EQ(test, drm_mode_vrefresh(mode), 50); KUNIT_EXPECT_EQ(test, mode->hdisplay, 720); @@ -117,6 +132,7 @@ static void drm_test_modes_analog_tv_pal_576i_inlined(struct kunit *test) { struct drm_test_modes_priv *priv = test->priv; struct drm_display_mode *expected, *mode; + int ret; expected = drm_analog_tv_mode(priv->drm, DRM_MODE_TV_MODE_PAL, @@ -124,9 +140,15 @@ static void drm_test_modes_analog_tv_pal_576i_inlined(struct kunit *test) true); KUNIT_ASSERT_NOT_NULL(test, expected); + ret = drm_kunit_add_mode_destroy_action(test, expected); + KUNIT_ASSERT_EQ(test, ret, 0); + mode = drm_mode_analog_pal_576i(priv->drm); KUNIT_ASSERT_NOT_NULL(test, mode); + ret = drm_kunit_add_mode_destroy_action(test, mode); + KUNIT_ASSERT_EQ(test, ret, 0); + KUNIT_EXPECT_TRUE(test, drm_mode_equal(expected, mode)); } From c81306c9d6d98cc8f5595abea09b55f577595c5f Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 8 Apr 2025 09:34:13 +0200 Subject: [PATCH 27/37] drm/tests: probe-helper: Fix drm_display_mode memory leak [ Upstream commit 8b6f2e28431b2f9f84073bff50353aeaf25559d0 ] drm_analog_tv_mode() and its variants return a drm_display_mode that needs to be destroyed later one. The drm_test_connector_helper_tv_get_modes_check() test never does however, which leads to a memory leak. Let's make sure it's freed. Reported-by: Philipp Stanner Closes: https://lore.kernel.org/dri-devel/a7655158a6367ac46194d57f4b7433ef0772a73e.camel@mailbox.org/ Fixes: 1e4a91db109f ("drm/probe-helper: Provide a TV get_modes helper") Reviewed-by: Thomas Zimmermann Link: https://lore.kernel.org/r/20250408-drm-kunit-drm-display-mode-memleak-v1-7-996305a2e75a@kernel.org Signed-off-by: Maxime Ripard Signed-off-by: Sasha Levin --- drivers/gpu/drm/tests/drm_probe_helper_test.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/tests/drm_probe_helper_test.c b/drivers/gpu/drm/tests/drm_probe_helper_test.c index bc09ff38aca1..db0e4f5df275 100644 --- a/drivers/gpu/drm/tests/drm_probe_helper_test.c +++ b/drivers/gpu/drm/tests/drm_probe_helper_test.c @@ -98,7 +98,7 @@ drm_test_connector_helper_tv_get_modes_check(struct kunit *test) struct drm_connector *connector = &priv->connector; struct drm_cmdline_mode *cmdline = &connector->cmdline_mode; struct drm_display_mode *mode; - const struct drm_display_mode *expected; + struct drm_display_mode *expected; size_t len; int ret; @@ -134,6 +134,9 @@ drm_test_connector_helper_tv_get_modes_check(struct kunit *test) KUNIT_EXPECT_TRUE(test, drm_mode_equal(mode, expected)); KUNIT_EXPECT_TRUE(test, mode->type & DRM_MODE_TYPE_PREFERRED); + + ret = drm_kunit_add_mode_destroy_action(test, expected); + KUNIT_ASSERT_EQ(test, ret, 0); } if (params->num_expected_modes >= 2) { @@ -145,6 +148,9 @@ drm_test_connector_helper_tv_get_modes_check(struct kunit *test) KUNIT_EXPECT_TRUE(test, drm_mode_equal(mode, expected)); KUNIT_EXPECT_FALSE(test, mode->type & DRM_MODE_TYPE_PREFERRED); + + ret = drm_kunit_add_mode_destroy_action(test, expected); + KUNIT_ASSERT_EQ(test, ret, 0); } mutex_unlock(&priv->drm->mode_config.mutex); From ad81d666e114ebf989fc9994d4c93d451dc60056 Mon Sep 17 00:00:00 2001 From: Chenyuan Yang Date: Mon, 7 Apr 2025 13:49:52 -0500 Subject: [PATCH 28/37] net: libwx: handle page_pool_dev_alloc_pages error [ Upstream commit 7f1ff1b38a7c8b872382b796023419d87d78c47e ] page_pool_dev_alloc_pages could return NULL. There was a WARN_ON(!page) but it would still proceed to use the NULL pointer and then crash. This is similar to commit 001ba0902046 ("net: fec: handle page_pool_dev_alloc_pages error"). This is found by our static analysis tool KNighter. Signed-off-by: Chenyuan Yang Fixes: 3c47e8ae113a ("net: libwx: Support to receive packets in NAPI") Reviewed-by: Joe Damato Link: https://patch.msgid.link/20250407184952.2111299-1-chenyuan0y@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/wangxun/libwx/wx_lib.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/wangxun/libwx/wx_lib.c b/drivers/net/ethernet/wangxun/libwx/wx_lib.c index 2b3d6586f44a..71c891d14fb6 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_lib.c +++ b/drivers/net/ethernet/wangxun/libwx/wx_lib.c @@ -309,7 +309,8 @@ static bool wx_alloc_mapped_page(struct wx_ring *rx_ring, return true; page = page_pool_dev_alloc_pages(rx_ring->page_pool); - WARN_ON(!page); + if (unlikely(!page)) + return false; dma = page_pool_get_dma_addr(page); bi->page_dma = dma; From 0ae84adbc9cc62d01e9846a767cc39c97a9e9694 Mon Sep 17 00:00:00 2001 From: Wentao Liang Date: Tue, 8 Apr 2025 15:30:01 +0800 Subject: [PATCH 29/37] ata: sata_sx4: Add error handling in pdc20621_i2c_read() [ Upstream commit 8d46a27085039158eb5e253ab8a35a0e33b5e864 ] The function pdc20621_prog_dimm0() calls the function pdc20621_i2c_read() but does not handle the error if the read fails. This could lead to process with invalid data. A proper implementation can be found in /source/drivers/ata/sata_sx4.c, pdc20621_prog_dimm_global(). As mentioned in its commit: bb44e154e25125bef31fa956785e90fccd24610b, the variable spd0 might be used uninitialized when pdc20621_i2c_read() fails. Add error handling to pdc20621_i2c_read(). If a read operation fails, an error message is logged via dev_err(), and return a negative error code. Add error handling to pdc20621_prog_dimm0() in pdc20621_dimm_init(), and return a negative error code if pdc20621_prog_dimm0() fails. Fixes: 4447d3515616 ("libata: convert the remaining SATA drivers to new init model") Signed-off-by: Wentao Liang Reviewed-by: Niklas Cassel Signed-off-by: Damien Le Moal Signed-off-by: Sasha Levin --- drivers/ata/sata_sx4.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/ata/sata_sx4.c b/drivers/ata/sata_sx4.c index a482741eb181..c3042eca6332 100644 --- a/drivers/ata/sata_sx4.c +++ b/drivers/ata/sata_sx4.c @@ -1117,9 +1117,14 @@ static int pdc20621_prog_dimm0(struct ata_host *host) mmio += PDC_CHIP0_OFS; for (i = 0; i < ARRAY_SIZE(pdc_i2c_read_data); i++) - pdc20621_i2c_read(host, PDC_DIMM0_SPD_DEV_ADDRESS, - pdc_i2c_read_data[i].reg, - &spd0[pdc_i2c_read_data[i].ofs]); + if (!pdc20621_i2c_read(host, PDC_DIMM0_SPD_DEV_ADDRESS, + pdc_i2c_read_data[i].reg, + &spd0[pdc_i2c_read_data[i].ofs])) { + dev_err(host->dev, + "Failed in i2c read at index %d: device=%#x, reg=%#x\n", + i, PDC_DIMM0_SPD_DEV_ADDRESS, pdc_i2c_read_data[i].reg); + return -EIO; + } data |= (spd0[4] - 8) | ((spd0[21] != 0) << 3) | ((spd0[3]-11) << 4); data |= ((spd0[17] / 4) << 6) | ((spd0[5] / 2) << 7) | @@ -1284,6 +1289,8 @@ static unsigned int pdc20621_dimm_init(struct ata_host *host) /* Programming DIMM0 Module Control Register (index_CID0:80h) */ size = pdc20621_prog_dimm0(host); + if (size < 0) + return size; dev_dbg(host->dev, "Local DIMM Size = %dMB\n", size); /* Programming DIMM Module Global Control Register (index_CID0:88h) */ From c5a906806162aea62dbe5d327760ce3b7117ca17 Mon Sep 17 00:00:00 2001 From: Janusz Krzysztofik Date: Wed, 2 Apr 2025 19:20:57 +0200 Subject: [PATCH 30/37] drm/i915/huc: Fix fence not released on early probe errors [ Upstream commit e3ea2eae70692a455e256787e4f54153fb739b90 ] HuC delayed loading fence, introduced with commit 27536e03271da ("drm/i915/huc: track delayed HuC load with a fence"), is registered with object tracker early on driver probe but unregistered only from driver remove, which is not called on early probe errors. Since its memory is allocated under devres, then released anyway, it may happen to be allocated again to the fence and reused on future driver probes, resulting in kernel warnings that taint the kernel: <4> [309.731371] ------------[ cut here ]------------ <3> [309.731373] ODEBUG: init destroyed (active state 0) object: ffff88813d7dd2e0 object type: i915_sw_fence hint: sw_fence_dummy_notify+0x0/0x20 [i915] <4> [309.731575] WARNING: CPU: 2 PID: 3161 at lib/debugobjects.c:612 debug_print_object+0x93/0xf0 ... <4> [309.731693] CPU: 2 UID: 0 PID: 3161 Comm: i915_module_loa Tainted: G U 6.14.0-CI_DRM_16362-gf0fd77956987+ #1 ... <4> [309.731700] RIP: 0010:debug_print_object+0x93/0xf0 ... <4> [309.731728] Call Trace: <4> [309.731730] ... <4> [309.731949] __debug_object_init+0x17b/0x1c0 <4> [309.731957] debug_object_init+0x34/0x50 <4> [309.732126] __i915_sw_fence_init+0x34/0x60 [i915] <4> [309.732256] intel_huc_init_early+0x4b/0x1d0 [i915] <4> [309.732468] intel_uc_init_early+0x61/0x680 [i915] <4> [309.732667] intel_gt_common_init_early+0x105/0x130 [i915] <4> [309.732804] intel_root_gt_init_early+0x63/0x80 [i915] <4> [309.732938] i915_driver_probe+0x1fa/0xeb0 [i915] <4> [309.733075] i915_pci_probe+0xe6/0x220 [i915] <4> [309.733198] local_pci_probe+0x44/0xb0 <4> [309.733203] pci_device_probe+0xf4/0x270 <4> [309.733209] really_probe+0xee/0x3c0 <4> [309.733215] __driver_probe_device+0x8c/0x180 <4> [309.733219] driver_probe_device+0x24/0xd0 <4> [309.733223] __driver_attach+0x10f/0x220 <4> [309.733230] bus_for_each_dev+0x7d/0xe0 <4> [309.733236] driver_attach+0x1e/0x30 <4> [309.733239] bus_add_driver+0x151/0x290 <4> [309.733244] driver_register+0x5e/0x130 <4> [309.733247] __pci_register_driver+0x7d/0x90 <4> [309.733251] i915_pci_register_driver+0x23/0x30 [i915] <4> [309.733413] i915_init+0x34/0x120 [i915] <4> [309.733655] do_one_initcall+0x62/0x3f0 <4> [309.733667] do_init_module+0x97/0x2a0 <4> [309.733671] load_module+0x25ff/0x2890 <4> [309.733688] init_module_from_file+0x97/0xe0 <4> [309.733701] idempotent_init_module+0x118/0x330 <4> [309.733711] __x64_sys_finit_module+0x77/0x100 <4> [309.733715] x64_sys_call+0x1f37/0x2650 <4> [309.733719] do_syscall_64+0x91/0x180 <4> [309.733763] entry_SYSCALL_64_after_hwframe+0x76/0x7e <4> [309.733792] ... <4> [309.733806] ---[ end trace 0000000000000000 ]--- That scenario is most easily reproducible with igt@i915_module_load@reload-with-fault-injection. Fix the issue by moving the cleanup step to driver release path. Fixes: 27536e03271da ("drm/i915/huc: track delayed HuC load with a fence") Closes: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/13592 Cc: Daniele Ceraolo Spurio Cc: Alan Previn Signed-off-by: Janusz Krzysztofik Reviewed-by: Daniele Ceraolo Spurio Reviewed-by: Krzysztof Karas Signed-off-by: Daniele Ceraolo Spurio Link: https://lore.kernel.org/r/20250402172057.209924-2-janusz.krzysztofik@linux.intel.com (cherry picked from commit 795dbde92fe5c6996a02a5b579481de73035e7bf) Signed-off-by: Jani Nikula Signed-off-by: Sasha Levin --- drivers/gpu/drm/i915/gt/uc/intel_huc.c | 11 +++++------ drivers/gpu/drm/i915/gt/uc/intel_huc.h | 1 + drivers/gpu/drm/i915/gt/uc/intel_uc.c | 1 + 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_huc.c b/drivers/gpu/drm/i915/gt/uc/intel_huc.c index 2d9152eb7282..24fdce844d9e 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_huc.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_huc.c @@ -317,6 +317,11 @@ void intel_huc_init_early(struct intel_huc *huc) } } +void intel_huc_fini_late(struct intel_huc *huc) +{ + delayed_huc_load_fini(huc); +} + #define HUC_LOAD_MODE_STRING(x) (x ? "GSC" : "legacy") static int check_huc_loading_mode(struct intel_huc *huc) { @@ -414,12 +419,6 @@ out: void intel_huc_fini(struct intel_huc *huc) { - /* - * the fence is initialized in init_early, so we need to clean it up - * even if HuC loading is off. - */ - delayed_huc_load_fini(huc); - if (huc->heci_pkt) i915_vma_unpin_and_release(&huc->heci_pkt, 0); diff --git a/drivers/gpu/drm/i915/gt/uc/intel_huc.h b/drivers/gpu/drm/i915/gt/uc/intel_huc.h index ba5cb08e9e7b..09aff3148f7d 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_huc.h +++ b/drivers/gpu/drm/i915/gt/uc/intel_huc.h @@ -55,6 +55,7 @@ struct intel_huc { int intel_huc_sanitize(struct intel_huc *huc); void intel_huc_init_early(struct intel_huc *huc); +void intel_huc_fini_late(struct intel_huc *huc); int intel_huc_init(struct intel_huc *huc); void intel_huc_fini(struct intel_huc *huc); void intel_huc_suspend(struct intel_huc *huc); diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c index 5b8080ec5315..4f751ce74214 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c @@ -136,6 +136,7 @@ void intel_uc_init_late(struct intel_uc *uc) void intel_uc_driver_late_release(struct intel_uc *uc) { + intel_huc_fini_late(&uc->huc); } /** From ec12da4bcc44d5dd21679b3a0eb5e501e1a831a3 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Tue, 8 Apr 2025 17:29:03 +0200 Subject: [PATCH 31/37] nvmet-fcloop: swap list_add_tail arguments [ Upstream commit 2b5f0c5bc819af2b0759a8fcddc1b39102735c0f ] The newly element to be added to the list is the first argument of list_add_tail. This fix is missing dcfad4ab4d67 ("nvmet-fcloop: swap the list_add_tail arguments"). Fixes: 437c0b824dbd ("nvme-fcloop: add target to host LS request support") Signed-off-by: Daniel Wagner Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig Signed-off-by: Sasha Levin --- drivers/nvme/target/fcloop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index e1abb27927ff..da195d61a966 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -478,7 +478,7 @@ fcloop_t2h_xmt_ls_rsp(struct nvme_fc_local_port *localport, if (targetport) { tport = targetport->private; spin_lock(&tport->lock); - list_add_tail(&tport->ls_list, &tls_req->ls_list); + list_add_tail(&tls_req->ls_list, &tport->ls_list); spin_unlock(&tport->lock); queue_work(nvmet_wq, &tport->ls_work); } From 70449ca40609ec77f58b93ed154d54e1fdb197b6 Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Mon, 7 Apr 2025 13:24:07 -0700 Subject: [PATCH 32/37] net_sched: sch_sfq: use a temporary work area for validating configuration [ Upstream commit 8c0cea59d40cf6dd13c2950437631dd614fbade6 ] Many configuration parameters have influence on others (e.g. divisor -> flows -> limit, depth -> limit) and so it is difficult to correctly do all of the validation before applying the configuration. And if a validation error is detected late it is difficult to roll back a partially applied configuration. To avoid these issues use a temporary work area to update and validate the configuration and only then apply the configuration to the internal state. Signed-off-by: Octavian Purdila Acked-by: Cong Wang Signed-off-by: David S. Miller Stable-dep-of: b3bf8f63e617 ("net_sched: sch_sfq: move the limit validation") Signed-off-by: Sasha Levin --- net/sched/sch_sfq.c | 56 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 44 insertions(+), 12 deletions(-) diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 65d5b59da583..7714ae94e052 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -631,6 +631,15 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt, struct red_parms *p = NULL; struct sk_buff *to_free = NULL; struct sk_buff *tail = NULL; + unsigned int maxflows; + unsigned int quantum; + unsigned int divisor; + int perturb_period; + u8 headdrop; + u8 maxdepth; + int limit; + u8 flags; + if (opt->nla_len < nla_attr_size(sizeof(*ctl))) return -EINVAL; @@ -656,36 +665,59 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt, NL_SET_ERR_MSG_MOD(extack, "invalid limit"); return -EINVAL; } + sch_tree_lock(sch); + + limit = q->limit; + divisor = q->divisor; + headdrop = q->headdrop; + maxdepth = q->maxdepth; + maxflows = q->maxflows; + perturb_period = q->perturb_period; + quantum = q->quantum; + flags = q->flags; + + /* update and validate configuration */ if (ctl->quantum) - q->quantum = ctl->quantum; - WRITE_ONCE(q->perturb_period, ctl->perturb_period * HZ); + quantum = ctl->quantum; + perturb_period = ctl->perturb_period * HZ; if (ctl->flows) - q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS); + maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS); if (ctl->divisor) { - q->divisor = ctl->divisor; - q->maxflows = min_t(u32, q->maxflows, q->divisor); + divisor = ctl->divisor; + maxflows = min_t(u32, maxflows, divisor); } if (ctl_v1) { if (ctl_v1->depth) - q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH); + maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH); if (p) { - swap(q->red_parms, p); - red_set_parms(q->red_parms, + red_set_parms(p, ctl_v1->qth_min, ctl_v1->qth_max, ctl_v1->Wlog, ctl_v1->Plog, ctl_v1->Scell_log, NULL, ctl_v1->max_P); } - q->flags = ctl_v1->flags; - q->headdrop = ctl_v1->headdrop; + flags = ctl_v1->flags; + headdrop = ctl_v1->headdrop; } if (ctl->limit) { - q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows); - q->maxflows = min_t(u32, q->maxflows, q->limit); + limit = min_t(u32, ctl->limit, maxdepth * maxflows); + maxflows = min_t(u32, maxflows, limit); } + /* commit configuration */ + q->limit = limit; + q->divisor = divisor; + q->headdrop = headdrop; + q->maxdepth = maxdepth; + q->maxflows = maxflows; + WRITE_ONCE(q->perturb_period, perturb_period); + q->quantum = quantum; + q->flags = flags; + if (p) + swap(q->red_parms, p); + qlen = sch->q.qlen; while (sch->q.qlen > q->limit) { dropped += sfq_drop(sch, &to_free); From f86293adce0c201cfabb283ef9d6f21292089bb8 Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Mon, 7 Apr 2025 13:24:08 -0700 Subject: [PATCH 33/37] net_sched: sch_sfq: move the limit validation [ Upstream commit b3bf8f63e6179076b57c9de660c9f80b5abefe70 ] It is not sufficient to directly validate the limit on the data that the user passes as it can be updated based on how the other parameters are changed. Move the check at the end of the configuration update process to also catch scenarios where the limit is indirectly updated, for example with the following configurations: tc qdisc add dev dummy0 handle 1: root sfq limit 2 flows 1 depth 1 tc qdisc add dev dummy0 handle 1: root sfq limit 2 flows 1 divisor 1 This fixes the following syzkaller reported crash: ------------[ cut here ]------------ UBSAN: array-index-out-of-bounds in net/sched/sch_sfq.c:203:6 index 65535 is out of range for type 'struct sfq_head[128]' CPU: 1 UID: 0 PID: 3037 Comm: syz.2.16 Not tainted 6.14.0-rc2-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 12/27/2024 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x201/0x300 lib/dump_stack.c:120 ubsan_epilogue lib/ubsan.c:231 [inline] __ubsan_handle_out_of_bounds+0xf5/0x120 lib/ubsan.c:429 sfq_link net/sched/sch_sfq.c:203 [inline] sfq_dec+0x53c/0x610 net/sched/sch_sfq.c:231 sfq_dequeue+0x34e/0x8c0 net/sched/sch_sfq.c:493 sfq_reset+0x17/0x60 net/sched/sch_sfq.c:518 qdisc_reset+0x12e/0x600 net/sched/sch_generic.c:1035 tbf_reset+0x41/0x110 net/sched/sch_tbf.c:339 qdisc_reset+0x12e/0x600 net/sched/sch_generic.c:1035 dev_reset_queue+0x100/0x1b0 net/sched/sch_generic.c:1311 netdev_for_each_tx_queue include/linux/netdevice.h:2590 [inline] dev_deactivate_many+0x7e5/0xe70 net/sched/sch_generic.c:1375 Reported-by: syzbot Fixes: 10685681bafc ("net_sched: sch_sfq: don't allow 1 packet limit") Signed-off-by: Octavian Purdila Acked-by: Cong Wang Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/sched/sch_sfq.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 7714ae94e052..58b42dcf8f20 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -661,10 +661,6 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt, if (!p) return -ENOMEM; } - if (ctl->limit == 1) { - NL_SET_ERR_MSG_MOD(extack, "invalid limit"); - return -EINVAL; - } sch_tree_lock(sch); @@ -705,6 +701,12 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt, limit = min_t(u32, ctl->limit, maxdepth * maxflows); maxflows = min_t(u32, maxflows, limit); } + if (limit == 1) { + sch_tree_unlock(sch); + kfree(p); + NL_SET_ERR_MSG_MOD(extack, "invalid limit"); + return -EINVAL; + } /* commit configuration */ q->limit = limit; From aa5a1e4b882964eb79d5b5d1d1e8a1a5efbb1d15 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Wed, 9 Apr 2025 11:14:21 -0300 Subject: [PATCH 34/37] smb: client: fix UAF in decryption with multichannel [ Upstream commit 9502dd5c7029902f4a425bf959917a5a9e7c0e50 ] After commit f7025d861694 ("smb: client: allocate crypto only for primary server") and commit b0abcd65ec54 ("smb: client: fix UAF in async decryption"), the channels started reusing AEAD TFM from primary channel to perform synchronous decryption, but that can't done as there could be multiple cifsd threads (one per channel) simultaneously accessing it to perform decryption. This fixes the following KASAN splat when running fstest generic/249 with 'vers=3.1.1,multichannel,max_channels=4,seal' against Windows Server 2022: BUG: KASAN: slab-use-after-free in gf128mul_4k_lle+0xba/0x110 Read of size 8 at addr ffff8881046c18a0 by task cifsd/986 CPU: 3 UID: 0 PID: 986 Comm: cifsd Not tainted 6.15.0-rc1 #1 PREEMPT(voluntary) Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-3.fc41 04/01/2014 Call Trace: dump_stack_lvl+0x5d/0x80 print_report+0x156/0x528 ? gf128mul_4k_lle+0xba/0x110 ? __virt_addr_valid+0x145/0x300 ? __phys_addr+0x46/0x90 ? gf128mul_4k_lle+0xba/0x110 kasan_report+0xdf/0x1a0 ? gf128mul_4k_lle+0xba/0x110 gf128mul_4k_lle+0xba/0x110 ghash_update+0x189/0x210 shash_ahash_update+0x295/0x370 ? __pfx_shash_ahash_update+0x10/0x10 ? __pfx_shash_ahash_update+0x10/0x10 ? __pfx_extract_iter_to_sg+0x10/0x10 ? ___kmalloc_large_node+0x10e/0x180 ? __asan_memset+0x23/0x50 crypto_ahash_update+0x3c/0xc0 gcm_hash_assoc_remain_continue+0x93/0xc0 crypt_message+0xe09/0xec0 [cifs] ? __pfx_crypt_message+0x10/0x10 [cifs] ? _raw_spin_unlock+0x23/0x40 ? __pfx_cifs_readv_from_socket+0x10/0x10 [cifs] decrypt_raw_data+0x229/0x380 [cifs] ? __pfx_decrypt_raw_data+0x10/0x10 [cifs] ? __pfx_cifs_read_iter_from_socket+0x10/0x10 [cifs] smb3_receive_transform+0x837/0xc80 [cifs] ? __pfx_smb3_receive_transform+0x10/0x10 [cifs] ? __pfx___might_resched+0x10/0x10 ? __pfx_smb3_is_transform_hdr+0x10/0x10 [cifs] cifs_demultiplex_thread+0x692/0x1570 [cifs] ? __pfx_cifs_demultiplex_thread+0x10/0x10 [cifs] ? rcu_is_watching+0x20/0x50 ? rcu_lockdep_current_cpu_online+0x62/0xb0 ? find_held_lock+0x32/0x90 ? kvm_sched_clock_read+0x11/0x20 ? local_clock_noinstr+0xd/0xd0 ? trace_irq_enable.constprop.0+0xa8/0xe0 ? __pfx_cifs_demultiplex_thread+0x10/0x10 [cifs] kthread+0x1fe/0x380 ? kthread+0x10f/0x380 ? __pfx_kthread+0x10/0x10 ? local_clock_noinstr+0xd/0xd0 ? ret_from_fork+0x1b/0x60 ? local_clock+0x15/0x30 ? lock_release+0x29b/0x390 ? rcu_is_watching+0x20/0x50 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x31/0x60 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1a/0x30 Tested-by: David Howells Reported-by: Steve French Closes: https://lore.kernel.org/r/CAH2r5mu6Yc0-RJXM3kFyBYUB09XmXBrNodOiCVR4EDrmxq5Szg@mail.gmail.com Fixes: f7025d861694 ("smb: client: allocate crypto only for primary server") Fixes: b0abcd65ec54 ("smb: client: fix UAF in async decryption") Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French Signed-off-by: Sasha Levin --- fs/smb/client/cifsencrypt.c | 16 +++++----------- fs/smb/client/smb2ops.c | 6 +++--- fs/smb/client/smb2pdu.c | 11 ++--------- 3 files changed, 10 insertions(+), 23 deletions(-) diff --git a/fs/smb/client/cifsencrypt.c b/fs/smb/client/cifsencrypt.c index 7a43daacc815..7c61c1e944c7 100644 --- a/fs/smb/client/cifsencrypt.c +++ b/fs/smb/client/cifsencrypt.c @@ -702,18 +702,12 @@ cifs_crypto_secmech_release(struct TCP_Server_Info *server) cifs_free_hash(&server->secmech.md5); cifs_free_hash(&server->secmech.sha512); - if (!SERVER_IS_CHAN(server)) { - if (server->secmech.enc) { - crypto_free_aead(server->secmech.enc); - server->secmech.enc = NULL; - } - - if (server->secmech.dec) { - crypto_free_aead(server->secmech.dec); - server->secmech.dec = NULL; - } - } else { + if (server->secmech.enc) { + crypto_free_aead(server->secmech.enc); server->secmech.enc = NULL; + } + if (server->secmech.dec) { + crypto_free_aead(server->secmech.dec); server->secmech.dec = NULL; } } diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 516be8c0b2a9..590b70d71694 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -4576,9 +4576,9 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf, return rc; } } else { - if (unlikely(!server->secmech.dec)) - return -EIO; - + rc = smb3_crypto_aead_allocate(server); + if (unlikely(rc)) + return rc; tfm = server->secmech.dec; } diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 75b13175a2e7..1a7b82664255 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -1269,15 +1269,8 @@ SMB2_negotiate(const unsigned int xid, cifs_server_dbg(VFS, "Missing expected negotiate contexts\n"); } - if (server->cipher_type && !rc) { - if (!SERVER_IS_CHAN(server)) { - rc = smb3_crypto_aead_allocate(server); - } else { - /* For channels, just reuse the primary server crypto secmech. */ - server->secmech.enc = server->primary_server->secmech.enc; - server->secmech.dec = server->primary_server->secmech.dec; - } - } + if (server->cipher_type && !rc) + rc = smb3_crypto_aead_allocate(server); neg_exit: free_rsp_buf(resp_buftype, rsp); return rc; From 6d98cd63426e35208a380ab43afeb9184bb29477 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 7 Apr 2025 12:38:59 +0300 Subject: [PATCH 35/37] net: phy: move phy_link_change() prior to mdio_bus_phy_may_suspend() [ Upstream commit f40a673d6b4a128fe95dd9b8c3ed02da50a6a862 ] In an upcoming change, mdio_bus_phy_may_suspend() will need to distinguish a phylib-based PHY client from a phylink PHY client. For that, it will need to compare the phydev->phy_link_change() function pointer with the eponymous phy_link_change() provided by phylib. To avoid forward function declarations, the default PHY link state change method should be moved upwards. There is no functional change associated with this patch, it is only to reduce the noise from a real bug fix. Signed-off-by: Vladimir Oltean Reviewed-by: Russell King (Oracle) Link: https://patch.msgid.link/20250407093900.2155112-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski Stable-dep-of: fc75ea20ffb4 ("net: phy: allow MDIO bus PM ops to start/stop state machine for phylink-controlled PHY") Signed-off-by: Sasha Levin --- drivers/net/phy/phy_device.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 119dfa2d6643..44aa67fd544d 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -289,6 +289,19 @@ static bool phy_drv_wol_enabled(struct phy_device *phydev) return wol.wolopts != 0; } +static void phy_link_change(struct phy_device *phydev, bool up) +{ + struct net_device *netdev = phydev->attached_dev; + + if (up) + netif_carrier_on(netdev); + else + netif_carrier_off(netdev); + phydev->adjust_link(netdev); + if (phydev->mii_ts && phydev->mii_ts->link_state) + phydev->mii_ts->link_state(phydev->mii_ts, phydev); +} + static bool mdio_bus_phy_may_suspend(struct phy_device *phydev) { struct device_driver *drv = phydev->mdio.dev.driver; @@ -1101,19 +1114,6 @@ struct phy_device *phy_find_first(struct mii_bus *bus) } EXPORT_SYMBOL(phy_find_first); -static void phy_link_change(struct phy_device *phydev, bool up) -{ - struct net_device *netdev = phydev->attached_dev; - - if (up) - netif_carrier_on(netdev); - else - netif_carrier_off(netdev); - phydev->adjust_link(netdev); - if (phydev->mii_ts && phydev->mii_ts->link_state) - phydev->mii_ts->link_state(phydev->mii_ts, phydev); -} - /** * phy_prepare_link - prepares the PHY layer to monitor link status * @phydev: target phy_device struct From a6ed6f8ec81b8ca7100dcd9e62bdbc0dff1b2259 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 7 Apr 2025 12:40:42 +0300 Subject: [PATCH 36/37] net: phy: allow MDIO bus PM ops to start/stop state machine for phylink-controlled PHY [ Upstream commit fc75ea20ffb452652f0d4033f38fe88d7cfdae35 ] DSA has 2 kinds of drivers: 1. Those who call dsa_switch_suspend() and dsa_switch_resume() from their device PM ops: qca8k-8xxx, bcm_sf2, microchip ksz 2. Those who don't: all others. The above methods should be optional. For type 1, dsa_switch_suspend() calls dsa_user_suspend() -> phylink_stop(), and dsa_switch_resume() calls dsa_user_resume() -> phylink_start(). These seem good candidates for setting mac_managed_pm = true because that is essentially its definition [1], but that does not seem to be the biggest problem for now, and is not what this change focuses on. Talking strictly about the 2nd category of DSA drivers here (which do not have MAC managed PM, meaning that for their attached PHYs, mdio_bus_phy_suspend() and mdio_bus_phy_resume() should run in full), I have noticed that the following warning from mdio_bus_phy_resume() is triggered: WARN_ON(phydev->state != PHY_HALTED && phydev->state != PHY_READY && phydev->state != PHY_UP); because the PHY state machine is running. It's running as a result of a previous dsa_user_open() -> ... -> phylink_start() -> phy_start() having been initiated by the user. The previous mdio_bus_phy_suspend() was supposed to have called phy_stop_machine(), but it didn't. So this is why the PHY is in state PHY_NOLINK by the time mdio_bus_phy_resume() runs. mdio_bus_phy_suspend() did not call phy_stop_machine() because for phylink, the phydev->adjust_link function pointer is NULL. This seems a technicality introduced by commit fddd91016d16 ("phylib: fix PAL state machine restart on resume"). That commit was written before phylink existed, and was intended to avoid crashing with consumer drivers which don't use the PHY state machine - phylink always does, when using a PHY. But phylink itself has historically not been developed with suspend/resume in mind, and apparently not tested too much in that scenario, allowing this bug to exist unnoticed for so long. Plus, prior to the WARN_ON(), it would have likely been invisible. This issue is not in fact restricted to type 2 DSA drivers (according to the above ad-hoc classification), but can be extrapolated to any MAC driver with phylink and MDIO-bus-managed PHY PM ops. DSA is just where the issue was reported. Assuming mac_managed_pm is set correctly, a quick search indicates the following other drivers might be affected: $ grep -Zlr PHYLINK_NETDEV drivers/ | xargs -0 grep -L mac_managed_pm drivers/net/ethernet/atheros/ag71xx.c drivers/net/ethernet/microchip/sparx5/sparx5_main.c drivers/net/ethernet/microchip/lan966x/lan966x_main.c drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c drivers/net/ethernet/freescale/dpaa/dpaa_eth.c drivers/net/ethernet/freescale/ucc_geth.c drivers/net/ethernet/freescale/enetc/enetc_pf_common.c drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c drivers/net/ethernet/marvell/mvneta.c drivers/net/ethernet/marvell/prestera/prestera_main.c drivers/net/ethernet/mediatek/mtk_eth_soc.c drivers/net/ethernet/altera/altera_tse_main.c drivers/net/ethernet/wangxun/txgbe/txgbe_phy.c drivers/net/ethernet/meta/fbnic/fbnic_phylink.c drivers/net/ethernet/tehuti/tn40_phy.c drivers/net/ethernet/mscc/ocelot_net.c Make the existing conditions dependent on the PHY device having a phydev->phy_link_change() implementation equal to the default phy_link_change() provided by phylib. Otherwise, we implicitly know that the phydev has the phylink-provided phylink_phy_change() callback, and when phylink is used, the PHY state machine always needs to be stopped/ started on the suspend/resume path. The code is structured as such that if phydev->phy_link_change() is absent, it is a matter of time until the kernel will crash - no need to further complicate the test. Thus, for the situation where the PM is not managed by the MAC, we will make the MDIO bus PM ops treat identically the phylink-controlled PHYs with the phylib-controlled PHYs where an adjust_link() callback is supplied. In both cases, the MDIO bus PM ops should stop and restart the PHY state machine. [1] https://lore.kernel.org/netdev/Z-1tiW9zjcoFkhwc@shell.armlinux.org.uk/ Fixes: 744d23c71af3 ("net: phy: Warn about incorrect mdio_bus_phy_resume() state") Reported-by: Wei Fang Tested-by: Wei Fang Signed-off-by: Vladimir Oltean Link: https://patch.msgid.link/20250407094042.2155633-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/phy/phy_device.c | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 44aa67fd544d..8af44224480f 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -302,6 +302,33 @@ static void phy_link_change(struct phy_device *phydev, bool up) phydev->mii_ts->link_state(phydev->mii_ts, phydev); } +/** + * phy_uses_state_machine - test whether consumer driver uses PAL state machine + * @phydev: the target PHY device structure + * + * Ultimately, this aims to indirectly determine whether the PHY is attached + * to a consumer which uses the state machine by calling phy_start() and + * phy_stop(). + * + * When the PHY driver consumer uses phylib, it must have previously called + * phy_connect_direct() or one of its derivatives, so that phy_prepare_link() + * has set up a hook for monitoring state changes. + * + * When the PHY driver is used by the MAC driver consumer through phylink (the + * only other provider of a phy_link_change() method), using the PHY state + * machine is not optional. + * + * Return: true if consumer calls phy_start() and phy_stop(), false otherwise. + */ +static bool phy_uses_state_machine(struct phy_device *phydev) +{ + if (phydev->phy_link_change == phy_link_change) + return phydev->attached_dev && phydev->adjust_link; + + /* phydev->phy_link_change is implicitly phylink_phy_change() */ + return true; +} + static bool mdio_bus_phy_may_suspend(struct phy_device *phydev) { struct device_driver *drv = phydev->mdio.dev.driver; @@ -368,7 +395,7 @@ static __maybe_unused int mdio_bus_phy_suspend(struct device *dev) * may call phy routines that try to grab the same lock, and that may * lead to a deadlock. */ - if (phydev->attached_dev && phydev->adjust_link) + if (phy_uses_state_machine(phydev)) phy_stop_machine(phydev); if (!mdio_bus_phy_may_suspend(phydev)) @@ -422,7 +449,7 @@ no_resume: } } - if (phydev->attached_dev && phydev->adjust_link) + if (phy_uses_state_machine(phydev)) phy_start_machine(phydev); return 0; From cc16f7402a9120446339d969d334b654da59787b Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 8 Apr 2025 11:43:16 +0300 Subject: [PATCH 37/37] ipv6: Align behavior across nexthops during path selection [ Upstream commit 6933cd4714861eea6848f18396a119d741f25fc3 ] A nexthop is only chosen when the calculated multipath hash falls in the nexthop's hash region (i.e., the hash is smaller than the nexthop's hash threshold) and when the nexthop is assigned a non-negative score by rt6_score_route(). Commit 4d0ab3a6885e ("ipv6: Start path selection from the first nexthop") introduced an unintentional difference between the first nexthop and the rest when the score is negative. When the first nexthop matches, but has a negative score, the code will currently evaluate subsequent nexthops until one is found with a non-negative score. On the other hand, when a different nexthop matches, but has a negative score, the code will fallback to the nexthop with which the selection started ('match'). Align the behavior across all nexthops and fallback to 'match' when the first nexthop matches, but has a negative score. Fixes: 3d709f69a3e7 ("ipv6: Use hash-threshold instead of modulo-N") Fixes: 4d0ab3a6885e ("ipv6: Start path selection from the first nexthop") Reported-by: Willem de Bruijn Closes: https://lore.kernel.org/netdev/67efef607bc41_1ddca82948c@willemb.c.googlers.com.notmuch/ Signed-off-by: Ido Schimmel Reviewed-by: Willem de Bruijn Reviewed-by: David Ahern Link: https://patch.msgid.link/20250408084316.243559-1-idosch@nvidia.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/ipv6/route.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 987492dcb07c..bae8ece3e881 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -470,10 +470,10 @@ void fib6_select_path(const struct net *net, struct fib6_result *res, goto out; hash = fl6->mp_hash; - if (hash <= atomic_read(&first->fib6_nh->fib_nh_upper_bound) && - rt6_score_route(first->fib6_nh, first->fib6_flags, oif, - strict) >= 0) { - match = first; + if (hash <= atomic_read(&first->fib6_nh->fib_nh_upper_bound)) { + if (rt6_score_route(first->fib6_nh, first->fib6_flags, oif, + strict) >= 0) + match = first; goto out; }