From 09681a0772f773dddffd3c2d1796c87bd0d903b9 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 3 Aug 2021 16:16:11 +0200 Subject: [PATCH 1/7] cpufreq: Replace deprecated CPU-hotplug functions The functions get_online_cpus() and put_online_cpus() have been deprecated during the CPU hotplug rework. They map directly to cpus_read_lock() and cpus_read_unlock(). Replace deprecated CPU-hotplug functions with the official version. The behavior remains unchanged. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/acpi-cpufreq.c | 4 ++-- drivers/cpufreq/cpufreq.c | 6 +++--- drivers/cpufreq/cpufreq_ondemand.c | 4 ++-- drivers/cpufreq/intel_pstate.c | 4 ++-- drivers/cpufreq/powernow-k8.c | 6 +++--- drivers/cpufreq/powernv-cpufreq.c | 4 ++-- 6 files changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c index 7e7450453714..b49612895c78 100644 --- a/drivers/cpufreq/acpi-cpufreq.c +++ b/drivers/cpufreq/acpi-cpufreq.c @@ -163,9 +163,9 @@ static ssize_t store_cpb(struct cpufreq_policy *policy, const char *buf, if (ret || val > 1) return -EINVAL; - get_online_cpus(); + cpus_read_lock(); set_boost(policy, val); - put_online_cpus(); + cpus_read_unlock(); return count; } diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 45f3416988f1..06c526d66dd3 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2654,18 +2654,18 @@ int cpufreq_boost_trigger_state(int state) cpufreq_driver->boost_enabled = state; write_unlock_irqrestore(&cpufreq_driver_lock, flags); - get_online_cpus(); + cpus_read_lock(); for_each_active_policy(policy) { ret = cpufreq_driver->set_boost(policy, state); if (ret) goto err_reset_state; } - put_online_cpus(); + cpus_read_unlock(); return 0; err_reset_state: - put_online_cpus(); + cpus_read_unlock(); write_lock_irqsave(&cpufreq_driver_lock, flags); cpufreq_driver->boost_enabled = !state; diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index ac361a8b1d3b..eb4320b619c9 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -418,7 +418,7 @@ static void od_set_powersave_bias(unsigned int powersave_bias) default_powersave_bias = powersave_bias; cpumask_clear(&done); - get_online_cpus(); + cpus_read_lock(); for_each_online_cpu(cpu) { struct cpufreq_policy *policy; struct policy_dbs_info *policy_dbs; @@ -442,7 +442,7 @@ static void od_set_powersave_bias(unsigned int powersave_bias) od_tuners = dbs_data->tuners; od_tuners->powersave_bias = default_powersave_bias; } - put_online_cpus(); + cpus_read_unlock(); } void od_register_powersave_bias_handler(unsigned int (*f) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index bb4549959b11..2d83a9f9651b 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -2969,7 +2969,7 @@ static void intel_pstate_driver_cleanup(void) { unsigned int cpu; - get_online_cpus(); + cpus_read_lock(); for_each_online_cpu(cpu) { if (all_cpu_data[cpu]) { if (intel_pstate_driver == &intel_pstate) @@ -2979,7 +2979,7 @@ static void intel_pstate_driver_cleanup(void) all_cpu_data[cpu] = NULL; } } - put_online_cpus(); + cpus_read_unlock(); intel_pstate_driver = NULL; } diff --git a/drivers/cpufreq/powernow-k8.c b/drivers/cpufreq/powernow-k8.c index b9ccb6a3dad9..12ab4014af71 100644 --- a/drivers/cpufreq/powernow-k8.c +++ b/drivers/cpufreq/powernow-k8.c @@ -1180,7 +1180,7 @@ static int powernowk8_init(void) if (!x86_match_cpu(powernow_k8_ids)) return -ENODEV; - get_online_cpus(); + cpus_read_lock(); for_each_online_cpu(i) { smp_call_function_single(i, check_supported_cpu, &ret, 1); if (!ret) @@ -1188,10 +1188,10 @@ static int powernowk8_init(void) } if (supported_cpus != num_online_cpus()) { - put_online_cpus(); + cpus_read_unlock(); return -ENODEV; } - put_online_cpus(); + cpus_read_unlock(); ret = cpufreq_register_driver(&cpufreq_amd64_driver); if (ret) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 005600cef273..23a06cba392c 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -918,7 +918,7 @@ static void powernv_cpufreq_work_fn(struct work_struct *work) unsigned int cpu; cpumask_t mask; - get_online_cpus(); + cpus_read_lock(); cpumask_and(&mask, &chip->mask, cpu_online_mask); smp_call_function_any(&mask, powernv_cpufreq_throttle_check, NULL, 0); @@ -939,7 +939,7 @@ static void powernv_cpufreq_work_fn(struct work_struct *work) cpufreq_cpu_put(policy); } out: - put_online_cpus(); + cpus_read_unlock(); } static int powernv_cpufreq_occ_msg(struct notifier_block *nb, From 7fcc17d0cb12938d2b3507973a6f93fc9ed2c7a1 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Tue, 3 Aug 2021 11:27:43 +0100 Subject: [PATCH 2/7] PM: EM: Increase energy calculation precision The Energy Model (EM) provides useful information about device power in each performance state to other subsystems like: Energy Aware Scheduler (EAS). The energy calculation in EAS does arithmetic operation based on the EM em_cpu_energy(). Current implementation of that function uses em_perf_state::cost as a pre-computed cost coefficient equal to: cost = power * max_frequency / frequency. The 'power' is expressed in milli-Watts (or in abstract scale). There are corner cases when the EAS energy calculation for two Performance Domains (PDs) return the same value. The EAS compares these values to choose smaller one. It might happen that this values are equal due to rounding error. In such scenario, we need better resolution, e.g. 1000 times better. To provide this possibility increase the resolution in the em_perf_state::cost for 64-bit architectures. The cost of increasing resolution on 32-bit is pretty high (64-bit division) and is not justified since there are no new 32bit big.LITTLE EAS systems expected which would benefit from this higher resolution. This patch allows to avoid the rounding to milli-Watt errors, which might occur in EAS energy estimation for each PD. The rounding error is common for small tasks which have small utilization value. There are two places in the code where it makes a difference: 1. In the find_energy_efficient_cpu() where we are searching for best_delta. We might suffer there when two PDs return the same result, like in the example below. Scenario: Low utilized system e.g. ~200 sum_util for PD0 and ~220 for PD1. There are quite a few small tasks ~10-15 util. These tasks would suffer for the rounding error. These utilization values are typical when running games on Android. One of our partners has reported 5..10mA less battery drain when running with increased resolution. Some details: We have two PDs: PD0 (big) and PD1 (little) Let's compare w/o patch set ('old') and w/ patch set ('new') We are comparing energy w/ task and w/o task placed in the PDs a) 'old' w/o patch set, PD0 task_util = 13 cost = 480 sum_util_w/o_task = 215 sum_util_w_task = 228 scale_cpu = 1024 energy_w/o_task = 480 * 215 / 1024 = 100.78 => 100 energy_w_task = 480 * 228 / 1024 = 106.87 => 106 energy_diff = 106 - 100 = 6 (this is equal to 'old' PD1's energy_diff in 'c)') b) 'new' w/ patch set, PD0 task_util = 13 cost = 480 * 1000 = 480000 sum_util_w/o_task = 215 sum_util_w_task = 228 energy_w/o_task = 480000 * 215 / 1024 = 100781 energy_w_task = 480000 * 228 / 1024 = 106875 energy_diff = 106875 - 100781 = 6094 (this is not equal to 'new' PD1's energy_diff in 'd)') c) 'old' w/o patch set, PD1 task_util = 13 cost = 160 sum_util_w/o_task = 283 sum_util_w_task = 293 scale_cpu = 355 energy_w/o_task = 160 * 283 / 355 = 127.55 => 127 energy_w_task = 160 * 296 / 355 = 133.41 => 133 energy_diff = 133 - 127 = 6 (this is equal to 'old' PD0's energy_diff in 'a)') d) 'new' w/ patch set, PD1 task_util = 13 cost = 160 * 1000 = 160000 sum_util_w/o_task = 283 sum_util_w_task = 293 scale_cpu = 355 energy_w/o_task = 160000 * 283 / 355 = 127549 energy_w_task = 160000 * 296 / 355 = 133408 energy_diff = 133408 - 127549 = 5859 (this is not equal to 'new' PD0's energy_diff in 'b)') 2. Difference in the 6% energy margin filter at the end of find_energy_efficient_cpu(). With this patch the margin comparison also has better resolution, so it's possible to have better task placement thanks to that. Fixes: 27871f7a8a341ef ("PM: Introduce an Energy Model management framework") Reported-by: CCJ Yeh Reviewed-by: Dietmar Eggemann Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- include/linux/energy_model.h | 16 ++++++++++++++++ kernel/power/energy_model.c | 4 +++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 3f221dbf5f95..1834752c5617 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -53,6 +53,22 @@ struct em_perf_domain { #ifdef CONFIG_ENERGY_MODEL #define EM_MAX_POWER 0xFFFF +/* + * Increase resolution of energy estimation calculations for 64-bit + * architectures. The extra resolution improves decision made by EAS for the + * task placement when two Performance Domains might provide similar energy + * estimation values (w/o better resolution the values could be equal). + * + * We increase resolution only if we have enough bits to allow this increased + * resolution (i.e. 64-bit). The costs for increasing resolution when 32-bit + * are pretty high and the returns do not justify the increased costs. + */ +#ifdef CONFIG_64BIT +#define em_scale_power(p) ((p) * 1000) +#else +#define em_scale_power(p) (p) +#endif + struct em_data_callback { /** * active_power() - Provide power at the next performance state of diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 0f4530b3a8cd..a332ccd829e2 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -170,7 +170,9 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, /* Compute the cost of each performance state. */ fmax = (u64) table[nr_states - 1].frequency; for (i = 0; i < nr_states; i++) { - table[i].cost = div64_u64(fmax * table[i].power, + unsigned long power_res = em_scale_power(table[i].power); + + table[i].cost = div64_u64(fmax * power_res, table[i].frequency); } From e5c6b312ce3cc97e90ea159446e6bfa06645364d Mon Sep 17 00:00:00 2001 From: Kevin Hao Date: Thu, 5 Aug 2021 15:29:17 +0800 Subject: [PATCH 3/7] cpufreq: schedutil: Use kobject release() method to free sugov_tunables The struct sugov_tunables is protected by the kobject, so we can't free it directly. Otherwise we would get a call trace like this: ODEBUG: free active (active state 0) object type: timer_list hint: delayed_work_timer_fn+0x0/0x30 WARNING: CPU: 3 PID: 720 at lib/debugobjects.c:505 debug_print_object+0xb8/0x100 Modules linked in: CPU: 3 PID: 720 Comm: a.sh Tainted: G W 5.14.0-rc1-next-20210715-yocto-standard+ #507 Hardware name: Marvell OcteonTX CN96XX board (DT) pstate: 40400009 (nZcv daif +PAN -UAO -TCO BTYPE=--) pc : debug_print_object+0xb8/0x100 lr : debug_print_object+0xb8/0x100 sp : ffff80001ecaf910 x29: ffff80001ecaf910 x28: ffff00011b10b8d0 x27: ffff800011043d80 x26: ffff00011a8f0000 x25: ffff800013cb3ff0 x24: 0000000000000000 x23: ffff80001142aa68 x22: ffff800011043d80 x21: ffff00010de46f20 x20: ffff800013c0c520 x19: ffff800011d8f5b0 x18: 0000000000000010 x17: 6e6968207473696c x16: 5f72656d6974203a x15: 6570797420746365 x14: 6a626f2029302065 x13: 303378302f307830 x12: 2b6e665f72656d69 x11: ffff8000124b1560 x10: ffff800012331520 x9 : ffff8000100ca6b0 x8 : 000000000017ffe8 x7 : c0000000fffeffff x6 : 0000000000000001 x5 : ffff800011d8c000 x4 : ffff800011d8c740 x3 : 0000000000000000 x2 : ffff0001108301c0 x1 : ab3c90eedf9c0f00 x0 : 0000000000000000 Call trace: debug_print_object+0xb8/0x100 __debug_check_no_obj_freed+0x1c0/0x230 debug_check_no_obj_freed+0x20/0x88 slab_free_freelist_hook+0x154/0x1c8 kfree+0x114/0x5d0 sugov_exit+0xbc/0xc0 cpufreq_exit_governor+0x44/0x90 cpufreq_set_policy+0x268/0x4a8 store_scaling_governor+0xe0/0x128 store+0xc0/0xf0 sysfs_kf_write+0x54/0x80 kernfs_fop_write_iter+0x128/0x1c0 new_sync_write+0xf0/0x190 vfs_write+0x2d4/0x478 ksys_write+0x74/0x100 __arm64_sys_write+0x24/0x30 invoke_syscall.constprop.0+0x54/0xe0 do_el0_svc+0x64/0x158 el0_svc+0x2c/0xb0 el0t_64_sync_handler+0xb0/0xb8 el0t_64_sync+0x198/0x19c irq event stamp: 5518 hardirqs last enabled at (5517): [] console_unlock+0x554/0x6c8 hardirqs last disabled at (5518): [] el1_dbg+0x28/0xa0 softirqs last enabled at (5504): [] __do_softirq+0x4d0/0x6c0 softirqs last disabled at (5483): [] irq_exit+0x1b0/0x1b8 So split the original sugov_tunables_free() into two functions, sugov_clear_global_tunables() is just used to clear the global_tunables and the new sugov_tunables_free() is used as kobj_type::release to release the sugov_tunables safely. Fixes: 9bdcb44e391d ("cpufreq: schedutil: New governor based on scheduler utilization data") Cc: 4.7+ # 4.7+ Signed-off-by: Kevin Hao Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 57124614363d..e7af18857371 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -537,9 +537,17 @@ static struct attribute *sugov_attrs[] = { }; ATTRIBUTE_GROUPS(sugov); +static void sugov_tunables_free(struct kobject *kobj) +{ + struct gov_attr_set *attr_set = container_of(kobj, struct gov_attr_set, kobj); + + kfree(to_sugov_tunables(attr_set)); +} + static struct kobj_type sugov_tunables_ktype = { .default_groups = sugov_groups, .sysfs_ops = &governor_sysfs_ops, + .release = &sugov_tunables_free, }; /********************** cpufreq governor interface *********************/ @@ -639,12 +647,10 @@ static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_polic return tunables; } -static void sugov_tunables_free(struct sugov_tunables *tunables) +static void sugov_clear_global_tunables(void) { if (!have_governor_per_policy()) global_tunables = NULL; - - kfree(tunables); } static int sugov_init(struct cpufreq_policy *policy) @@ -707,7 +713,7 @@ out: fail: kobject_put(&tunables->attr_set.kobj); policy->governor_data = NULL; - sugov_tunables_free(tunables); + sugov_clear_global_tunables(); stop_kthread: sugov_kthread_stop(sg_policy); @@ -734,7 +740,7 @@ static void sugov_exit(struct cpufreq_policy *policy) count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook); policy->governor_data = NULL; if (!count) - sugov_tunables_free(tunables); + sugov_clear_global_tunables(); mutex_unlock(&global_tunables_lock); From b2f6662ac08d0e7c25574ce53623c71bdae9dd78 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 11 Aug 2021 21:14:31 +0100 Subject: [PATCH 4/7] PM: cpu: Make notifier chain use a raw_spinlock_t Invoking atomic_notifier_chain_notify() requires acquiring a spinlock_t, which can block under CONFIG_PREEMPT_RT. Notifications for members of the cpu_pm notification chain will be issued by the idle task, which can never block. Making *all* atomic_notifiers use a raw_spinlock is too big of a hammer, as only notifications issued by the idle task are problematic. Special-case cpu_pm_notifier_chain by kludging a raw_notifier and raw_spinlock_t together, matching the atomic_notifier behavior with a raw_spinlock_t. Fixes: 70d932985757 ("notifier: Fix broken error handling pattern") Signed-off-by: Valentin Schneider Acked-by: Sebastian Andrzej Siewior Signed-off-by: Rafael J. Wysocki --- kernel/cpu_pm.c | 50 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 38 insertions(+), 12 deletions(-) diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c index f7e1d0eccdbc..246efc74e3f3 100644 --- a/kernel/cpu_pm.c +++ b/kernel/cpu_pm.c @@ -13,19 +13,32 @@ #include #include -static ATOMIC_NOTIFIER_HEAD(cpu_pm_notifier_chain); +/* + * atomic_notifiers use a spinlock_t, which can block under PREEMPT_RT. + * Notifications for cpu_pm will be issued by the idle task itself, which can + * never block, IOW it requires using a raw_spinlock_t. + */ +static struct { + struct raw_notifier_head chain; + raw_spinlock_t lock; +} cpu_pm_notifier = { + .chain = RAW_NOTIFIER_INIT(cpu_pm_notifier.chain), + .lock = __RAW_SPIN_LOCK_UNLOCKED(cpu_pm_notifier.lock), +}; static int cpu_pm_notify(enum cpu_pm_event event) { int ret; /* - * atomic_notifier_call_chain has a RCU read critical section, which - * could be disfunctional in cpu idle. Copy RCU_NONIDLE code to let - * RCU know this. + * This introduces a RCU read critical section, which could be + * disfunctional in cpu idle. Copy RCU_NONIDLE code to let RCU know + * this. */ rcu_irq_enter_irqson(); - ret = atomic_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL); + rcu_read_lock(); + ret = raw_notifier_call_chain(&cpu_pm_notifier.chain, event, NULL); + rcu_read_unlock(); rcu_irq_exit_irqson(); return notifier_to_errno(ret); @@ -33,10 +46,13 @@ static int cpu_pm_notify(enum cpu_pm_event event) static int cpu_pm_notify_robust(enum cpu_pm_event event_up, enum cpu_pm_event event_down) { + unsigned long flags; int ret; rcu_irq_enter_irqson(); - ret = atomic_notifier_call_chain_robust(&cpu_pm_notifier_chain, event_up, event_down, NULL); + raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags); + ret = raw_notifier_call_chain_robust(&cpu_pm_notifier.chain, event_up, event_down, NULL); + raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags); rcu_irq_exit_irqson(); return notifier_to_errno(ret); @@ -49,12 +65,17 @@ static int cpu_pm_notify_robust(enum cpu_pm_event event_up, enum cpu_pm_event ev * Add a driver to a list of drivers that are notified about * CPU and CPU cluster low power entry and exit. * - * This function may sleep, and has the same return conditions as - * raw_notifier_chain_register. + * This function has the same return conditions as raw_notifier_chain_register. */ int cpu_pm_register_notifier(struct notifier_block *nb) { - return atomic_notifier_chain_register(&cpu_pm_notifier_chain, nb); + unsigned long flags; + int ret; + + raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags); + ret = raw_notifier_chain_register(&cpu_pm_notifier.chain, nb); + raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags); + return ret; } EXPORT_SYMBOL_GPL(cpu_pm_register_notifier); @@ -64,12 +85,17 @@ EXPORT_SYMBOL_GPL(cpu_pm_register_notifier); * * Remove a driver from the CPU PM notifier list. * - * This function may sleep, and has the same return conditions as - * raw_notifier_chain_unregister. + * This function has the same return conditions as raw_notifier_chain_unregister. */ int cpu_pm_unregister_notifier(struct notifier_block *nb) { - return atomic_notifier_chain_unregister(&cpu_pm_notifier_chain, nb); + unsigned long flags; + int ret; + + raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags); + ret = raw_notifier_chain_unregister(&cpu_pm_notifier.chain, nb); + raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags); + return ret; } EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier); From 15538a20579fa4047912508b03b39bcdeec26b05 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 11 Aug 2021 21:14:32 +0100 Subject: [PATCH 5/7] notifier: Remove atomic_notifier_call_chain_robust() This now has no more users, remove it. Suggested-by: Sebastian Andrzej Siewior Signed-off-by: Valentin Schneider Acked-by: Sebastian Andrzej Siewior Signed-off-by: Rafael J. Wysocki --- include/linux/notifier.h | 2 -- kernel/notifier.c | 19 ------------------- 2 files changed, 21 deletions(-) diff --git a/include/linux/notifier.h b/include/linux/notifier.h index 2fb373a5c1ed..87069b8459af 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -168,8 +168,6 @@ extern int raw_notifier_call_chain(struct raw_notifier_head *nh, extern int srcu_notifier_call_chain(struct srcu_notifier_head *nh, unsigned long val, void *v); -extern int atomic_notifier_call_chain_robust(struct atomic_notifier_head *nh, - unsigned long val_up, unsigned long val_down, void *v); extern int blocking_notifier_call_chain_robust(struct blocking_notifier_head *nh, unsigned long val_up, unsigned long val_down, void *v); extern int raw_notifier_call_chain_robust(struct raw_notifier_head *nh, diff --git a/kernel/notifier.c b/kernel/notifier.c index 1b019cbca594..b8251dc0bc0f 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -172,25 +172,6 @@ int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh, } EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); -int atomic_notifier_call_chain_robust(struct atomic_notifier_head *nh, - unsigned long val_up, unsigned long val_down, void *v) -{ - unsigned long flags; - int ret; - - /* - * Musn't use RCU; because then the notifier list can - * change between the up and down traversal. - */ - spin_lock_irqsave(&nh->lock, flags); - ret = notifier_call_chain_robust(&nh->head, val_up, val_down, v); - spin_unlock_irqrestore(&nh->lock, flags); - - return ret; -} -EXPORT_SYMBOL_GPL(atomic_notifier_call_chain_robust); -NOKPROBE_SYMBOL(atomic_notifier_call_chain_robust); - /** * atomic_notifier_call_chain - Call functions in an atomic notifier chain * @nh: Pointer to head of the atomic notifier chain From 950809cd6ca2ff2e2bb9d826c4d9e35d134d7de0 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Thu, 19 Aug 2021 19:40:05 -0700 Subject: [PATCH 6/7] thermal: intel: Allow processing of HWP interrupt Add a weak function to process HWP (Hardware P-states) notifications and move updating HWP_STATUS MSR to this function. This allows HWP interrupts to be processed by the intel_pstate driver in HWP mode by overriding the implementation. Signed-off-by: Srinivas Pandruvada Acked-by: Zhang Rui Signed-off-by: Rafael J. Wysocki --- drivers/thermal/intel/therm_throt.c | 7 ++++++- drivers/thermal/intel/thermal_interrupt.h | 3 +++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/thermal/intel/therm_throt.c b/drivers/thermal/intel/therm_throt.c index 99abdc03c44c..dab7e8fb1059 100644 --- a/drivers/thermal/intel/therm_throt.c +++ b/drivers/thermal/intel/therm_throt.c @@ -569,13 +569,18 @@ static void notify_thresholds(__u64 msr_val) platform_thermal_notify(msr_val); } +void __weak notify_hwp_interrupt(void) +{ + wrmsrl_safe(MSR_HWP_STATUS, 0); +} + /* Thermal transition interrupt handler */ void intel_thermal_interrupt(void) { __u64 msr_val; if (static_cpu_has(X86_FEATURE_HWP)) - wrmsrl_safe(MSR_HWP_STATUS, 0); + notify_hwp_interrupt(); rdmsrl(MSR_IA32_THERM_STATUS, msr_val); diff --git a/drivers/thermal/intel/thermal_interrupt.h b/drivers/thermal/intel/thermal_interrupt.h index 53f427bb58dc..01e7bed2ffc7 100644 --- a/drivers/thermal/intel/thermal_interrupt.h +++ b/drivers/thermal/intel/thermal_interrupt.h @@ -12,4 +12,7 @@ extern int (*platform_thermal_notify)(__u64 msr_val); * callback has rate control */ extern bool (*platform_thermal_package_rate_control)(void); +/* Handle HWP interrupt */ +extern void notify_hwp_interrupt(void); + #endif /* _INTEL_THERMAL_INTERRUPT_H */ From d0e936adbd2250cb03f2e840c6651d18edc22ace Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Thu, 19 Aug 2021 19:40:06 -0700 Subject: [PATCH 7/7] cpufreq: intel_pstate: Process HWP Guaranteed change notification It is possible that HWP guaranteed ratio is changed in response to change in power and thermal limits. For example when Intel Speed Select performance profile is changed or there is change in TDP, hardware can send notifications. It is possible that the guaranteed ratio is increased. This creates an issue when turbo is disabled, as the old limits set in MSR_HWP_REQUEST are still lower and hardware will clip to older limits. This change enables HWP interrupt and process HWP interrupts. When guaranteed is changed, calls cpufreq_update_policy() so that driver callbacks are called to update to new HWP limits. This callback is called from a delayed workqueue of 10ms to avoid frequent updates. Signed-off-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 39 ++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 2d83a9f9651b..b4ffe6c8a0d0 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -32,6 +32,7 @@ #include #include #include +#include "../drivers/thermal/intel/thermal_interrupt.h" #define INTEL_PSTATE_SAMPLING_INTERVAL (10 * NSEC_PER_MSEC) @@ -219,6 +220,7 @@ struct global_params { * @sched_flags: Store scheduler flags for possible cross CPU update * @hwp_boost_min: Last HWP boosted min performance * @suspended: Whether or not the driver has been suspended. + * @hwp_notify_work: workqueue for HWP notifications. * * This structure stores per CPU instance data for all CPUs. */ @@ -257,6 +259,7 @@ struct cpudata { unsigned int sched_flags; u32 hwp_boost_min; bool suspended; + struct delayed_work hwp_notify_work; }; static struct cpudata **all_cpu_data; @@ -1625,6 +1628,40 @@ static void intel_pstate_sysfs_hide_hwp_dynamic_boost(void) /************************** sysfs end ************************/ +static void intel_pstate_notify_work(struct work_struct *work) +{ + mutex_lock(&intel_pstate_driver_lock); + cpufreq_update_policy(smp_processor_id()); + wrmsrl(MSR_HWP_STATUS, 0); + mutex_unlock(&intel_pstate_driver_lock); +} + +void notify_hwp_interrupt(void) +{ + unsigned int this_cpu = smp_processor_id(); + struct cpudata *cpudata; + u64 value; + + if (!hwp_active || !boot_cpu_has(X86_FEATURE_HWP_NOTIFY)) + return; + + rdmsrl(MSR_HWP_STATUS, value); + if (!(value & 0x01)) + return; + + cpudata = all_cpu_data[this_cpu]; + schedule_delayed_work_on(this_cpu, &cpudata->hwp_notify_work, msecs_to_jiffies(10)); +} + +static void intel_pstate_enable_hwp_interrupt(struct cpudata *cpudata) +{ + /* Enable HWP notification interrupt for guaranteed performance change */ + if (boot_cpu_has(X86_FEATURE_HWP_NOTIFY)) { + INIT_DELAYED_WORK(&cpudata->hwp_notify_work, intel_pstate_notify_work); + wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x01); + } +} + static void intel_pstate_hwp_enable(struct cpudata *cpudata) { /* First disable HWP notification interrupt as we don't process them */ @@ -1634,6 +1671,8 @@ static void intel_pstate_hwp_enable(struct cpudata *cpudata) wrmsrl_on_cpu(cpudata->cpu, MSR_PM_ENABLE, 0x1); if (cpudata->epp_default == -EINVAL) cpudata->epp_default = intel_pstate_get_epp(cpudata, 0); + + intel_pstate_enable_hwp_interrupt(cpudata); } static int atom_get_min_pstate(void)